Better parsing of "Generic" elements in pairUpElements
parent
fdc944ecbf
commit
66e586df6f
|
@ -390,10 +390,11 @@ function parseCheerioNode($: cheerio.Root, node: cheerio.Element): IPostElement
|
||||||
? functionMap[type]($(node))
|
? functionMap[type]($(node))
|
||||||
: createGenericElement();
|
: createGenericElement();
|
||||||
|
|
||||||
// Parse the children only if the node is a <b>/<i> element
|
// Parse the children only if the node is a <b>/<i> element, a list
|
||||||
// or a list element. For links it is unnecessary, while for
|
// or an unknown element. For links it is unnecessary, while for the
|
||||||
// the spoilers it is already done in parseCheerioSpoilerNode
|
// spoilers it is already done in parseCheerioSpoilerNode
|
||||||
if (type === "Formatted" || type === "List") {
|
const includeTypes: NodeTypeT[] = ["Formatted", "List", "Unknown"];
|
||||||
|
if (includeTypes.includes(type)) {
|
||||||
const childPosts = cheerioNode
|
const childPosts = cheerioNode
|
||||||
.contents() // @todo Change to children() after cheerio RC6
|
.contents() // @todo Change to children() after cheerio RC6
|
||||||
.toArray()
|
.toArray()
|
||||||
|
@ -411,39 +412,68 @@ function parseCheerioNode($: cheerio.Root, node: cheerio.Element): IPostElement
|
||||||
* the corresponding value to each characterizing element (i.e. author).
|
* the corresponding value to each characterizing element (i.e. author).
|
||||||
*/
|
*/
|
||||||
function pairUpElements(elements: IPostElement[]): IPostElement[] {
|
function pairUpElements(elements: IPostElement[]): IPostElement[] {
|
||||||
// First ignore the "Generic" type elements, because
|
// Local variables
|
||||||
// they usually are containers for other data, like
|
const shallow = [...elements];
|
||||||
// overview or download links.
|
|
||||||
const validElements = elements.filter((e) => e.type !== "Generic");
|
// Parse all the generic elements that
|
||||||
|
// act as "container" for other information
|
||||||
|
shallow
|
||||||
|
.filter((e) => e.type === "Generic")
|
||||||
|
.map((e) => ({
|
||||||
|
element: e,
|
||||||
|
pairs: pairUpElements(e.content)
|
||||||
|
}))
|
||||||
|
.forEach((e) => {
|
||||||
|
// Find the index of the elements
|
||||||
|
const index = shallow.indexOf(e.element);
|
||||||
|
|
||||||
|
// Remove that element
|
||||||
|
shallow.splice(index, 1);
|
||||||
|
|
||||||
|
// Add the pairs at the index of the deleted element
|
||||||
|
e.pairs.forEach((e, i) => shallow.splice(index + i, 0, e));
|
||||||
|
});
|
||||||
|
|
||||||
|
// Ignore the "Generic" elements that we have already parsed
|
||||||
|
//const validElements = shallow.filter((e) => e.type !== "Generic");
|
||||||
|
|
||||||
// Then we find all the IDs of "Text" elements where the
|
// Then we find all the IDs of "Text" elements where the
|
||||||
// text doesn't start with a colon (":"). This means
|
// text doesn't start with a colon (":"). This means
|
||||||
// that we find all the IDs of "title" elements.
|
// that we find all the IDs of "title" elements.
|
||||||
const indexes = validElements
|
const indexes = shallow
|
||||||
.filter(
|
.filter((e, i) => filterValidElements(e, i, shallow))
|
||||||
(e, i) =>
|
.map((e) => shallow.indexOf(e));
|
||||||
e.type === "Text" && // This element must be a text
|
|
||||||
((e.text.endsWith(":") && e.text !== ":") || // This element's text must ends with ":"
|
|
||||||
validElements[i + 1]?.text.startsWith(":")) // The next element's text must start with ":"
|
|
||||||
)
|
|
||||||
.map((e) => validElements.indexOf(e));
|
|
||||||
|
|
||||||
// Now we find all the elements between indexes and
|
// Now we find all the elements between indexes and
|
||||||
// associate them with the previous "title" element
|
// associate them with the previous "title" element
|
||||||
const data = indexes.map((i, j) => parseGroupData(i, j, indexes, validElements));
|
const data = indexes.map((i, j) => parseGroupData(i, j, indexes, shallow));
|
||||||
|
|
||||||
// Now parse all the "invalid" elements,
|
|
||||||
// so all the elements with "Generic" type
|
|
||||||
const genericElementsPairs = elements
|
|
||||||
.filter((e) => e.type === "Generic")
|
|
||||||
.map((e) => pairUpElements(e.content));
|
|
||||||
|
|
||||||
const flatten: IPostElement[] = [].concat(...genericElementsPairs);
|
|
||||||
data.push(...flatten);
|
|
||||||
|
|
||||||
return data;
|
return data;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function filterValidElements(element: IPostElement, index: number, array: IPostElement[]): boolean {
|
||||||
|
// Check if this element is a "title" checking also the next element
|
||||||
|
const isPostfixDoublePoints = element.text.endsWith(":") && element.text !== ":";
|
||||||
|
const nextElementIsValue = array[index + 1]?.text.startsWith(":");
|
||||||
|
const elementIsTextTitle =
|
||||||
|
element.type === "Text" && (isPostfixDoublePoints || nextElementIsValue);
|
||||||
|
|
||||||
|
// Special values that must be set as "title"
|
||||||
|
const specialValues = ["DOWNLOAD"];
|
||||||
|
const specialTypes = ["Image"];
|
||||||
|
|
||||||
|
// Used to ignore already merged elements with name (ignore spoilers)
|
||||||
|
// because they have as name the content of the spoiler button
|
||||||
|
const hasName = element.name !== "" && element.type !== "Spoiler";
|
||||||
|
|
||||||
|
return (
|
||||||
|
elementIsTextTitle ||
|
||||||
|
specialTypes.includes(element.type) ||
|
||||||
|
specialValues.includes(element.text.toUpperCase()) ||
|
||||||
|
hasName
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Associate the relative values to a title.
|
* Associate the relative values to a title.
|
||||||
* @param start Title index in the `elements` array
|
* @param start Title index in the `elements` array
|
||||||
|
@ -458,7 +488,7 @@ function parseGroupData(
|
||||||
elements: IPostElement[]
|
elements: IPostElement[]
|
||||||
): IPostElement {
|
): IPostElement {
|
||||||
// Local variables
|
// Local variables
|
||||||
const endsWithSpecialCharsRegex = /[-]$/;
|
const endsWithSpecialCharsRegex = /[-:]$/;
|
||||||
const startsWithDoublePointsRegex = /^[:]/;
|
const startsWithDoublePointsRegex = /^[:]/;
|
||||||
|
|
||||||
// Find all the elements (title + data) of the same data group
|
// Find all the elements (title + data) of the same data group
|
||||||
|
@ -468,6 +498,10 @@ function parseGroupData(
|
||||||
// Extract the title
|
// Extract the title
|
||||||
const title = group.shift();
|
const title = group.shift();
|
||||||
|
|
||||||
|
// If the title is already named (because it was
|
||||||
|
// previously processed) return it without changes
|
||||||
|
if (title.name !== "" && title.type !== "Spoiler") return title;
|
||||||
|
|
||||||
// Assign name and text of the title
|
// Assign name and text of the title
|
||||||
title.name = title.text.replace(endsWithSpecialCharsRegex, "").trim();
|
title.name = title.text.replace(endsWithSpecialCharsRegex, "").trim();
|
||||||
title.text = group
|
title.text = group
|
||||||
|
@ -481,15 +515,13 @@ function parseGroupData(
|
||||||
.join(" ") // Join with space
|
.join(" ") // Join with space
|
||||||
.trim();
|
.trim();
|
||||||
|
|
||||||
// Append all the content of non-text elements.
|
// Append all the content of the elements.
|
||||||
group
|
group.forEach(
|
||||||
.filter((e) => e.type !== "Text")
|
(e) =>
|
||||||
.forEach(
|
e.type === "Spoiler"
|
||||||
(e) =>
|
? title.content.push(...e.content) // Add all the content fo the spoiler
|
||||||
e.type === "Spoiler"
|
: title.content.push(e) // Add the element itself
|
||||||
? title.content.push(...e.content) // Add all the content fo the spoiler
|
);
|
||||||
: title.content.push(e) // Add the element itself
|
|
||||||
);
|
|
||||||
|
|
||||||
return title;
|
return title;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue