Better parsing of "Generic" elements in pairUpElements
							parent
							
								
									fdc944ecbf
								
							
						
					
					
						commit
						66e586df6f
					
				| 
						 | 
					@ -390,10 +390,11 @@ function parseCheerioNode($: cheerio.Root, node: cheerio.Element): IPostElement
 | 
				
			||||||
    ? functionMap[type]($(node))
 | 
					    ? functionMap[type]($(node))
 | 
				
			||||||
    : createGenericElement();
 | 
					    : createGenericElement();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Parse the childrens only if the node is a <b>/<i> element
 | 
					  // Parse the children only if the node is a <b>/<i> element, a list
 | 
				
			||||||
  // or a list element. For the link in unnecessary while for
 | 
					  // or an unknown element. For links it is unnecessary, while for the
 | 
				
			||||||
  // the spoilers is already done in parseCheerioSpoilerNode
 | 
					  // spoilers it is already done in parseCheerioSpoilerNode
 | 
				
			||||||
  if (type === "Formatted" || type === "List") {
 | 
					  const includeTypes: NodeTypeT[] = ["Formatted", "List", "Unknown"];
 | 
				
			||||||
 | 
					  if (includeTypes.includes(type)) {
 | 
				
			||||||
    const childPosts = cheerioNode
 | 
					    const childPosts = cheerioNode
 | 
				
			||||||
      .contents() // @todo Change to children() after cheerio RC6
 | 
					      .contents() // @todo Change to children() after cheerio RC6
 | 
				
			||||||
      .toArray()
 | 
					      .toArray()
 | 
				
			||||||
| 
						 | 
					@ -411,39 +412,68 @@ function parseCheerioNode($: cheerio.Root, node: cheerio.Element): IPostElement
 | 
				
			||||||
 * the corresponding value to each characterizing element (i.e. author).
 | 
					 * the corresponding value to each characterizing element (i.e. author).
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
function pairUpElements(elements: IPostElement[]): IPostElement[] {
 | 
					function pairUpElements(elements: IPostElement[]): IPostElement[] {
 | 
				
			||||||
  // First ignore the "Generic" type elements, because
 | 
					  // Local variables
 | 
				
			||||||
  // they usually are containers for other data, like
 | 
					  const shallow = [...elements];
 | 
				
			||||||
  // overview or download links.
 | 
					
 | 
				
			||||||
  const validElements = elements.filter((e) => e.type !== "Generic");
 | 
					  // Parse all the generic elements that
 | 
				
			||||||
 | 
					  // act as "container" for other information
 | 
				
			||||||
 | 
					  shallow
 | 
				
			||||||
 | 
					    .filter((e) => e.type === "Generic")
 | 
				
			||||||
 | 
					    .map((e) => ({
 | 
				
			||||||
 | 
					      element: e,
 | 
				
			||||||
 | 
					      pairs: pairUpElements(e.content)
 | 
				
			||||||
 | 
					    }))
 | 
				
			||||||
 | 
					    .forEach((e) => {
 | 
				
			||||||
 | 
					      // Find the index of the elements
 | 
				
			||||||
 | 
					      const index = shallow.indexOf(e.element);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      // Remove that element
 | 
				
			||||||
 | 
					      shallow.splice(index, 1);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      // Add the pairs at the index of the deleted element
 | 
				
			||||||
 | 
					      e.pairs.forEach((e, i) => shallow.splice(index + i, 0, e));
 | 
				
			||||||
 | 
					    });
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Ignore the "Generic" elements that we have already parsed
 | 
				
			||||||
 | 
					  //const validElements = shallow.filter((e) => e.type !== "Generic");
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Then we find all the IDs of "Text" elements where the
 | 
					  // Then we find all the IDs of "Text" elements where the
 | 
				
			||||||
  // text doesn't start with double points. This means
 | 
					  // text doesn't start with double points. This means
 | 
				
			||||||
  // that we find all the IDs of "title" elements.
 | 
					  // that we find all the IDs of "title" elements.
 | 
				
			||||||
  const indexes = validElements
 | 
					  const indexes = shallow
 | 
				
			||||||
    .filter(
 | 
					    .filter((e, i) => filterValidElements(e, i, shallow))
 | 
				
			||||||
      (e, i) =>
 | 
					    .map((e) => shallow.indexOf(e));
 | 
				
			||||||
        e.type === "Text" && // This element must be a text
 | 
					 | 
				
			||||||
        ((e.text.endsWith(":") && e.text !== ":") || // This element's text must ends with ":"
 | 
					 | 
				
			||||||
          validElements[i + 1]?.text.startsWith(":")) // The next element's text must start with ":"
 | 
					 | 
				
			||||||
    )
 | 
					 | 
				
			||||||
    .map((e) => validElements.indexOf(e));
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Now we find all the elements between indexes and
 | 
					  // Now we find all the elements between indexes and
 | 
				
			||||||
  // associate them with the previous "title" element
 | 
					  // associate them with the previous "title" element
 | 
				
			||||||
  const data = indexes.map((i, j) => parseGroupData(i, j, indexes, validElements));
 | 
					  const data = indexes.map((i, j) => parseGroupData(i, j, indexes, shallow));
 | 
				
			||||||
 | 
					 | 
				
			||||||
  // Now parse all the "invalid" elements,
 | 
					 | 
				
			||||||
  // so all the elements with "Generic" type
 | 
					 | 
				
			||||||
  const genericElementsPairs = elements
 | 
					 | 
				
			||||||
    .filter((e) => e.type === "Generic")
 | 
					 | 
				
			||||||
    .map((e) => pairUpElements(e.content));
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
  const flatten: IPostElement[] = [].concat(...genericElementsPairs);
 | 
					 | 
				
			||||||
  data.push(...flatten);
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  return data;
 | 
					  return data;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					function filterValidElements(element: IPostElement, index: number, array: IPostElement[]): boolean {
 | 
				
			||||||
 | 
					  // Check if this element is a "title" checking also the next element
 | 
				
			||||||
 | 
					  const isPostfixDoublePoints = element.text.endsWith(":") && element.text !== ":";
 | 
				
			||||||
 | 
					  const nextElementIsValue = array[index + 1]?.text.startsWith(":");
 | 
				
			||||||
 | 
					  const elementIsTextTitle =
 | 
				
			||||||
 | 
					    element.type === "Text" && (isPostfixDoublePoints || nextElementIsValue);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Special values that must be treated as "title"
 | 
				
			||||||
 | 
					  const specialValues = ["DOWNLOAD"];
 | 
				
			||||||
 | 
					  const specialTypes = ["Image"];
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // Used to ignore already merged elements with name (ignore spoilers)
 | 
				
			||||||
 | 
					  // because they have as name the content of the spoiler button
 | 
				
			||||||
 | 
					  const hasName = element.name !== "" && element.type !== "Spoiler";
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  return (
 | 
				
			||||||
 | 
					    elementIsTextTitle ||
 | 
				
			||||||
 | 
					    specialTypes.includes(element.type) ||
 | 
				
			||||||
 | 
					    specialValues.includes(element.text.toUpperCase()) ||
 | 
				
			||||||
 | 
					    hasName
 | 
				
			||||||
 | 
					  );
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/**
 | 
					/**
 | 
				
			||||||
 * Associate the relative values to a title.
 | 
					 * Associate the relative values to a title.
 | 
				
			||||||
 * @param start Title index in the `elements` array
 | 
					 * @param start Title index in the `elements` array
 | 
				
			||||||
| 
						 | 
					@ -458,7 +488,7 @@ function parseGroupData(
 | 
				
			||||||
  elements: IPostElement[]
 | 
					  elements: IPostElement[]
 | 
				
			||||||
): IPostElement {
 | 
					): IPostElement {
 | 
				
			||||||
  // Local variables
 | 
					  // Local variables
 | 
				
			||||||
  const endsWithSpecialCharsRegex = /[-]$/;
 | 
					  const endsWithSpecialCharsRegex = /[-:]$/;
 | 
				
			||||||
  const startsWithDoublePointsRegex = /^[:]/;
 | 
					  const startsWithDoublePointsRegex = /^[:]/;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Find all the elements (title + data) of the same data group
 | 
					  // Find all the elements (title + data) of the same data group
 | 
				
			||||||
| 
						 | 
					@ -468,6 +498,10 @@ function parseGroupData(
 | 
				
			||||||
  // Extract the title
 | 
					  // Extract the title
 | 
				
			||||||
  const title = group.shift();
 | 
					  const title = group.shift();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  // If the title is already named (because it was
 | 
				
			||||||
 | 
					  // previously elaborated) return it without changes
 | 
				
			||||||
 | 
					  if (title.name !== "" && title.type !== "Spoiler") return title;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Assign name and text of the title
 | 
					  // Assign name and text of the title
 | 
				
			||||||
  title.name = title.text.replace(endsWithSpecialCharsRegex, "").trim();
 | 
					  title.name = title.text.replace(endsWithSpecialCharsRegex, "").trim();
 | 
				
			||||||
  title.text = group
 | 
					  title.text = group
 | 
				
			||||||
| 
						 | 
					@ -481,15 +515,13 @@ function parseGroupData(
 | 
				
			||||||
    .join(" ") // Join with space
 | 
					    .join(" ") // Join with space
 | 
				
			||||||
    .trim();
 | 
					    .trim();
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  // Append all the content of non-text elements.
 | 
					  // Append all the content of the elements.
 | 
				
			||||||
  group
 | 
					  group.forEach(
 | 
				
			||||||
    .filter((e) => e.type !== "Text")
 | 
					    (e) =>
 | 
				
			||||||
    .forEach(
 | 
					      e.type === "Spoiler"
 | 
				
			||||||
      (e) =>
 | 
					        ? title.content.push(...e.content) // Add all the content fo the spoiler
 | 
				
			||||||
        e.type === "Spoiler"
 | 
					        : title.content.push(e) // Add the element itself
 | 
				
			||||||
          ? title.content.push(...e.content) // Add all the content fo the spoiler
 | 
					  );
 | 
				
			||||||
          : title.content.push(e) // Add the element itself
 | 
					 | 
				
			||||||
    );
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
  return title;
 | 
					  return title;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue