Better reduction and removal of empty elements
							parent
							
								
									8e118de909
								
							
						
					
					
						commit
						be246d50b1
					
				| 
						 | 
				
			
			@ -11,7 +11,7 @@ import { POST } from "../constants/css-selector";
 | 
			
		|||
//#region Interfaces
 | 
			
		||||
 | 
			
		||||
export interface IPostElement {
 | 
			
		||||
  type: "Empty" | "Text" | "Link" | "Image" | "Spoiler";
 | 
			
		||||
  type: "Generic" | "Text" | "Link" | "Image" | "Spoiler";
 | 
			
		||||
  name: string;
 | 
			
		||||
  text: string;
 | 
			
		||||
  content: IPostElement[];
 | 
			
		||||
| 
						 | 
				
			
			@ -41,12 +41,20 @@ export function parseF95ThreadPost($: cheerio.Root, post: cheerio.Cheerio): IPos
 | 
			
		|||
  const elements = post
 | 
			
		||||
    .contents()
 | 
			
		||||
    .toArray()
 | 
			
		||||
    .map((e) => parseCheerioNode($, e)) // Parse the nodes
 | 
			
		||||
    .filter((e) => !isPostElementEmpty(e)) // Ignore the empty nodes
 | 
			
		||||
    .map((e) => reducePostElement(e)); // Compress the nodes
 | 
			
		||||
    .map((e) => parseCheerioNode($, e)); // Parse the nodes
 | 
			
		||||
 | 
			
		||||
  // ... then parse the elements to create the pairs of title/data
 | 
			
		||||
  return associateElementsWithName(elements);
 | 
			
		||||
  // Create a supernode
 | 
			
		||||
  let supernode = createGenericElement();
 | 
			
		||||
  supernode.content = elements;
 | 
			
		||||
 | 
			
		||||
  // Reduce the nodes
 | 
			
		||||
  supernode = reducePostElement(supernode);
 | 
			
		||||
 | 
			
		||||
  // Remove the empty nodes
 | 
			
		||||
  supernode = removeEmptyContentFromElement(supernode);
 | 
			
		||||
 | 
			
		||||
  // Finally parse the elements to create the pairs of title/data
 | 
			
		||||
  return associateNameToElements(supernode.content);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
//#endregion Public methods
 | 
			
		||||
| 
						 | 
				
			
			@ -185,6 +193,8 @@ function parseCheerioTextNode(node: cheerio.Cheerio): IPostElement {
 | 
			
		|||
 * Check if the node has non empty `name` and `text`.
 | 
			
		||||
 */
 | 
			
		||||
function isPostElementUnknown(node: IPostElement): boolean {
 | 
			
		||||
  // @todo For some strange reason, if the node IS empty but
 | 
			
		||||
  // node.type === "Text" the 2nd statement return false.
 | 
			
		||||
  return node.name.trim() === "" && node.text.trim() === "";
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -199,9 +209,9 @@ function isPostElementEmpty(node: IPostElement): boolean {
 | 
			
		|||
/**
 | 
			
		||||
 * Create a `IPostElement` without name, text or content.
 | 
			
		||||
 */
 | 
			
		||||
function createEmptyElement(): IPostElement {
 | 
			
		||||
function createGenericElement(): IPostElement {
 | 
			
		||||
  return {
 | 
			
		||||
    type: "Empty",
 | 
			
		||||
    type: "Generic",
 | 
			
		||||
    name: "",
 | 
			
		||||
    text: "",
 | 
			
		||||
    content: []
 | 
			
		||||
| 
						 | 
				
			
			@ -272,65 +282,36 @@ function getCheerioNonChildrenText(node: cheerio.Cheerio): string {
 | 
			
		|||
 * Collapse an `IPostElement` element with a single subnode
 | 
			
		||||
 * in the `Content` field in case it has no information.
 | 
			
		||||
 */
 | 
			
		||||
function reducePostElement(element: IPostElement, recursive = true): IPostElement {
 | 
			
		||||
function reducePostElement(element: IPostElement): IPostElement {
 | 
			
		||||
  // Local variables
 | 
			
		||||
  const shallowCopy = Object.assign({}, element);
 | 
			
		||||
 | 
			
		||||
  // Find the posts without name and text
 | 
			
		||||
  const unknownChildrens = shallowCopy.content.filter((e) => isPostElementUnknown(e));
 | 
			
		||||
  if (recursive) {
 | 
			
		||||
    // Copy the array of children
 | 
			
		||||
    const copy = [...unknownChildrens];
 | 
			
		||||
 | 
			
		||||
    copy.map((e) => {
 | 
			
		||||
      // Reduce the element
 | 
			
		||||
      const reduced = reducePostElement(e);
 | 
			
		||||
 | 
			
		||||
      // Replace the element
 | 
			
		||||
      const index = unknownChildrens.indexOf(e);
 | 
			
		||||
      unknownChildrens[index] = reduced;
 | 
			
		||||
    });
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Eliminate non-useful child nodes
 | 
			
		||||
  if (isPostElementUnknown(shallowCopy) && unknownChildrens.length > 0) {
 | 
			
		||||
    // Find the valid elements to add to the node
 | 
			
		||||
    const childContents = unknownChildrens
 | 
			
		||||
      .filter((e) => !shallowCopy.content.includes(e))
 | 
			
		||||
      .map((e) => (e.content.length > 0 ? e.content : e));
 | 
			
		||||
 | 
			
		||||
    // Save the elements NOT IN unknownChildren
 | 
			
		||||
    shallowCopy.content = shallowCopy.content.filter((e) => !unknownChildrens.includes(e));
 | 
			
		||||
 | 
			
		||||
    // Merge the non-empty children of this node with
 | 
			
		||||
    // the content of the empty children of this node
 | 
			
		||||
    const newContent = [].concat(...childContents);
 | 
			
		||||
    shallowCopy.content.push(...newContent);
 | 
			
		||||
  }
 | 
			
		||||
  // If the node has only one child, return it
 | 
			
		||||
  else if (isPostElementUnknown(shallowCopy) && shallowCopy.content.length === 1) {
 | 
			
		||||
    return shallowCopy.content[0];
 | 
			
		||||
  if (isPostElementUnknown(shallowCopy) && shallowCopy.content.length === 1) {
 | 
			
		||||
    return reducePostElement(shallowCopy.content[0]);
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  // Reduce element's childs
 | 
			
		||||
  shallowCopy.content = shallowCopy.content.map((e) => reducePostElement(e));
 | 
			
		||||
 | 
			
		||||
  return shallowCopy;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
function removeEmptyElement(element: IPostElement): IPostElement {
 | 
			
		||||
  // Create a copy of the content
 | 
			
		||||
  const contentCopy = [...element.content];
 | 
			
		||||
/**
 | 
			
		||||
 * Remove all empty children elements of the elements for parameter.
 | 
			
		||||
 */
 | 
			
		||||
function removeEmptyContentFromElement(element: IPostElement, recursive = true): IPostElement {
 | 
			
		||||
  // Create a copy of the element
 | 
			
		||||
  const copy = Object.assign({}, element);
 | 
			
		||||
 | 
			
		||||
  contentCopy.map((e) => {
 | 
			
		||||
  // Find the non-empty nodes
 | 
			
		||||
    const validNodes = e.content.filter((e) => !isPostElementEmpty(e));
 | 
			
		||||
  const validNodes = copy.content.filter((e) => !isPostElementEmpty(e));
 | 
			
		||||
 | 
			
		||||
    // Clean this element children
 | 
			
		||||
    const cleanNodes = validNodes.map((e) => removeEmptyElement(e));
 | 
			
		||||
  // Reduce nested contents if recursive
 | 
			
		||||
  if (recursive) validNodes.forEach((e) => removeEmptyContentFromElement(e));
 | 
			
		||||
 | 
			
		||||
  // Assign the nodes
 | 
			
		||||
    e.content = cleanNodes;
 | 
			
		||||
  });
 | 
			
		||||
 | 
			
		||||
  const copy: IPostElement = Object.assign({}, element);
 | 
			
		||||
  copy.content = contentCopy;
 | 
			
		||||
  copy.content = validNodes;
 | 
			
		||||
 | 
			
		||||
  return copy;
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			@ -340,7 +321,7 @@ function removeEmptyElement(element: IPostElement): IPostElement {
 | 
			
		|||
 */
 | 
			
		||||
function parseCheerioNode($: cheerio.Root, node: cheerio.Element): IPostElement {
 | 
			
		||||
  // Local variables
 | 
			
		||||
  let post: IPostElement = createEmptyElement();
 | 
			
		||||
  let post: IPostElement = createGenericElement();
 | 
			
		||||
  const cheerioNode = $(node);
 | 
			
		||||
 | 
			
		||||
  // Parse the node
 | 
			
		||||
| 
						 | 
				
			
			@ -349,6 +330,8 @@ function parseCheerioNode($: cheerio.Root, node: cheerio.Element): IPostElement
 | 
			
		|||
    else if (isSpoilerNode(cheerioNode)) post = parseCheerioSpoilerNode($, cheerioNode);
 | 
			
		||||
    else if (isLinkNode(node)) post = parseCheerioLinkNode(cheerioNode);
 | 
			
		||||
 | 
			
		||||
    // Avoid duplication of link name
 | 
			
		||||
    if (!isLinkNode(node)) {
 | 
			
		||||
      // Parse the node's childrens
 | 
			
		||||
      const childPosts = cheerioNode
 | 
			
		||||
        .contents() // @todo Change to children() after cheerio RC6
 | 
			
		||||
| 
						 | 
				
			
			@ -358,6 +341,7 @@ function parseCheerioNode($: cheerio.Root, node: cheerio.Element): IPostElement
 | 
			
		|||
        .filter((el) => !isPostElementEmpty(el));
 | 
			
		||||
      post.content.push(...childPosts);
 | 
			
		||||
    }
 | 
			
		||||
  }
 | 
			
		||||
 | 
			
		||||
  return post;
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			@ -366,7 +350,7 @@ function parseCheerioNode($: cheerio.Root, node: cheerio.Element): IPostElement
 | 
			
		|||
 * It simplifies the `IPostElement` elements by associating
 | 
			
		||||
 * the corresponding value to each characterizing element (i.e. author).
 | 
			
		||||
 */
 | 
			
		||||
function associateElementsWithName(elements: IPostElement[]): IPostElement[] {
 | 
			
		||||
function associateNameToElements(elements: IPostElement[]): IPostElement[] {
 | 
			
		||||
  // Local variables
 | 
			
		||||
  const pairs: IPostElement[] = [];
 | 
			
		||||
  const specialCharsRegex = /^[-!$%^&*()_+|~=`{}[\]:";'<>?,./]/;
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue