Better reduction and removal of empty elements
parent
8e118de909
commit
be246d50b1
|
@ -11,7 +11,7 @@ import { POST } from "../constants/css-selector";
|
||||||
//#region Interfaces
|
//#region Interfaces
|
||||||
|
|
||||||
export interface IPostElement {
|
export interface IPostElement {
|
||||||
type: "Empty" | "Text" | "Link" | "Image" | "Spoiler";
|
type: "Generic" | "Text" | "Link" | "Image" | "Spoiler";
|
||||||
name: string;
|
name: string;
|
||||||
text: string;
|
text: string;
|
||||||
content: IPostElement[];
|
content: IPostElement[];
|
||||||
|
@ -41,12 +41,20 @@ export function parseF95ThreadPost($: cheerio.Root, post: cheerio.Cheerio): IPos
|
||||||
const elements = post
|
const elements = post
|
||||||
.contents()
|
.contents()
|
||||||
.toArray()
|
.toArray()
|
||||||
.map((e) => parseCheerioNode($, e)) // Parse the nodes
|
.map((e) => parseCheerioNode($, e)); // Parse the nodes
|
||||||
.filter((e) => !isPostElementEmpty(e)) // Ignore the empty nodes
|
|
||||||
.map((e) => reducePostElement(e)); // Compress the nodes
|
|
||||||
|
|
||||||
// ... then parse the elements to create the pairs of title/data
|
// Create a supernode
|
||||||
return associateElementsWithName(elements);
|
let supernode = createGenericElement();
|
||||||
|
supernode.content = elements;
|
||||||
|
|
||||||
|
// Reduce the nodes
|
||||||
|
supernode = reducePostElement(supernode);
|
||||||
|
|
||||||
|
// Remove the empty nodes
|
||||||
|
supernode = removeEmptyContentFromElement(supernode);
|
||||||
|
|
||||||
|
// Finally parse the elements to create the pairs of title/data
|
||||||
|
return associateNameToElements(supernode.content);
|
||||||
}
|
}
|
||||||
|
|
||||||
//#endregion Public methods
|
//#endregion Public methods
|
||||||
|
@ -185,6 +193,8 @@ function parseCheerioTextNode(node: cheerio.Cheerio): IPostElement {
|
||||||
* Check if the node has non empty `name` and `text`.
|
* Check if the node has non empty `name` and `text`.
|
||||||
*/
|
*/
|
||||||
function isPostElementUnknown(node: IPostElement): boolean {
|
function isPostElementUnknown(node: IPostElement): boolean {
|
||||||
|
// @todo For some strange reason, if the node IS empty but
|
||||||
|
// node.type === "Text" the 2nd statement return false.
|
||||||
return node.name.trim() === "" && node.text.trim() === "";
|
return node.name.trim() === "" && node.text.trim() === "";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -199,9 +209,9 @@ function isPostElementEmpty(node: IPostElement): boolean {
|
||||||
/**
|
/**
|
||||||
* Create a `IPostElement` without name, text or content.
|
* Create a `IPostElement` without name, text or content.
|
||||||
*/
|
*/
|
||||||
function createEmptyElement(): IPostElement {
|
function createGenericElement(): IPostElement {
|
||||||
return {
|
return {
|
||||||
type: "Empty",
|
type: "Generic",
|
||||||
name: "",
|
name: "",
|
||||||
text: "",
|
text: "",
|
||||||
content: []
|
content: []
|
||||||
|
@ -272,65 +282,36 @@ function getCheerioNonChildrenText(node: cheerio.Cheerio): string {
|
||||||
* Collapse an `IPostElement` element with a single subnode
|
* Collapse an `IPostElement` element with a single subnode
|
||||||
* in the `Content` field in case it has no information.
|
* in the `Content` field in case it has no information.
|
||||||
*/
|
*/
|
||||||
function reducePostElement(element: IPostElement, recursive = true): IPostElement {
|
function reducePostElement(element: IPostElement): IPostElement {
|
||||||
// Local variables
|
// Local variables
|
||||||
const shallowCopy = Object.assign({}, element);
|
const shallowCopy = Object.assign({}, element);
|
||||||
|
|
||||||
// Find the posts without name and text
|
|
||||||
const unknownChildrens = shallowCopy.content.filter((e) => isPostElementUnknown(e));
|
|
||||||
if (recursive) {
|
|
||||||
// Copy the array of children
|
|
||||||
const copy = [...unknownChildrens];
|
|
||||||
|
|
||||||
copy.map((e) => {
|
|
||||||
// Reduce the element
|
|
||||||
const reduced = reducePostElement(e);
|
|
||||||
|
|
||||||
// Replace the element
|
|
||||||
const index = unknownChildrens.indexOf(e);
|
|
||||||
unknownChildrens[index] = reduced;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
// Eliminate non-useful child nodes
|
|
||||||
if (isPostElementUnknown(shallowCopy) && unknownChildrens.length > 0) {
|
|
||||||
// Find the valid elements to add to the node
|
|
||||||
const childContents = unknownChildrens
|
|
||||||
.filter((e) => !shallowCopy.content.includes(e))
|
|
||||||
.map((e) => (e.content.length > 0 ? e.content : e));
|
|
||||||
|
|
||||||
// Save the elements NOT IN unknownChildren
|
|
||||||
shallowCopy.content = shallowCopy.content.filter((e) => !unknownChildrens.includes(e));
|
|
||||||
|
|
||||||
// Merge the non-empty children of this node with
|
|
||||||
// the content of the empty children of this node
|
|
||||||
const newContent = [].concat(...childContents);
|
|
||||||
shallowCopy.content.push(...newContent);
|
|
||||||
}
|
|
||||||
// If the node has only one child, return it
|
// If the node has only one child, return it
|
||||||
else if (isPostElementUnknown(shallowCopy) && shallowCopy.content.length === 1) {
|
if (isPostElementUnknown(shallowCopy) && shallowCopy.content.length === 1) {
|
||||||
return shallowCopy.content[0];
|
return reducePostElement(shallowCopy.content[0]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reduce element's childs
|
||||||
|
shallowCopy.content = shallowCopy.content.map((e) => reducePostElement(e));
|
||||||
|
|
||||||
return shallowCopy;
|
return shallowCopy;
|
||||||
}
|
}
|
||||||
|
|
||||||
function removeEmptyElement(element: IPostElement): IPostElement {
|
/**
|
||||||
// Create a copy of the content
|
* Remove all empty children elements of the elements for parameter.
|
||||||
const contentCopy = [...element.content];
|
*/
|
||||||
|
function removeEmptyContentFromElement(element: IPostElement, recursive = true): IPostElement {
|
||||||
|
// Create a copy of the element
|
||||||
|
const copy = Object.assign({}, element);
|
||||||
|
|
||||||
contentCopy.map((e) => {
|
// Find the non-empty nodes
|
||||||
// Find the non-empty nodes
|
const validNodes = copy.content.filter((e) => !isPostElementEmpty(e));
|
||||||
const validNodes = e.content.filter((e) => !isPostElementEmpty(e));
|
|
||||||
|
|
||||||
// Clean this element children
|
// Reduce nested contents if recursive
|
||||||
const cleanNodes = validNodes.map((e) => removeEmptyElement(e));
|
if (recursive) validNodes.forEach((e) => removeEmptyContentFromElement(e));
|
||||||
|
|
||||||
// Assign the nodes
|
// Assign the nodes
|
||||||
e.content = cleanNodes;
|
copy.content = validNodes;
|
||||||
});
|
|
||||||
|
|
||||||
const copy: IPostElement = Object.assign({}, element);
|
|
||||||
copy.content = contentCopy;
|
|
||||||
|
|
||||||
return copy;
|
return copy;
|
||||||
}
|
}
|
||||||
|
@ -340,7 +321,7 @@ function removeEmptyElement(element: IPostElement): IPostElement {
|
||||||
*/
|
*/
|
||||||
function parseCheerioNode($: cheerio.Root, node: cheerio.Element): IPostElement {
|
function parseCheerioNode($: cheerio.Root, node: cheerio.Element): IPostElement {
|
||||||
// Local variables
|
// Local variables
|
||||||
let post: IPostElement = createEmptyElement();
|
let post: IPostElement = createGenericElement();
|
||||||
const cheerioNode = $(node);
|
const cheerioNode = $(node);
|
||||||
|
|
||||||
// Parse the node
|
// Parse the node
|
||||||
|
@ -349,14 +330,17 @@ function parseCheerioNode($: cheerio.Root, node: cheerio.Element): IPostElement
|
||||||
else if (isSpoilerNode(cheerioNode)) post = parseCheerioSpoilerNode($, cheerioNode);
|
else if (isSpoilerNode(cheerioNode)) post = parseCheerioSpoilerNode($, cheerioNode);
|
||||||
else if (isLinkNode(node)) post = parseCheerioLinkNode(cheerioNode);
|
else if (isLinkNode(node)) post = parseCheerioLinkNode(cheerioNode);
|
||||||
|
|
||||||
// Parse the node's childrens
|
// Avoid duplication of link name
|
||||||
const childPosts = cheerioNode
|
if (!isLinkNode(node)) {
|
||||||
.contents() // @todo Change to children() after cheerio RC6
|
// Parse the node's childrens
|
||||||
.toArray()
|
const childPosts = cheerioNode
|
||||||
.filter((el) => el) // Ignore undefined elements
|
.contents() // @todo Change to children() after cheerio RC6
|
||||||
.map((el) => parseCheerioNode($, el))
|
.toArray()
|
||||||
.filter((el) => !isPostElementEmpty(el));
|
.filter((el) => el) // Ignore undefined elements
|
||||||
post.content.push(...childPosts);
|
.map((el) => parseCheerioNode($, el))
|
||||||
|
.filter((el) => !isPostElementEmpty(el));
|
||||||
|
post.content.push(...childPosts);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return post;
|
return post;
|
||||||
|
@ -366,7 +350,7 @@ function parseCheerioNode($: cheerio.Root, node: cheerio.Element): IPostElement
|
||||||
* It simplifies the `IPostElement` elements by associating
|
* It simplifies the `IPostElement` elements by associating
|
||||||
* the corresponding value to each characterizing element (i.e. author).
|
* the corresponding value to each characterizing element (i.e. author).
|
||||||
*/
|
*/
|
||||||
function associateElementsWithName(elements: IPostElement[]): IPostElement[] {
|
function associateNameToElements(elements: IPostElement[]): IPostElement[] {
|
||||||
// Local variables
|
// Local variables
|
||||||
const pairs: IPostElement[] = [];
|
const pairs: IPostElement[] = [];
|
||||||
const specialCharsRegex = /^[-!$%^&*()_+|~=`{}[\]:";'<>?,./]/;
|
const specialCharsRegex = /^[-!$%^&*()_+|~=`{}[\]:";'<>?,./]/;
|
||||||
|
|
Loading…
Reference in New Issue