From f7bad33c1f28709232afc95f718f99a49fb03606 Mon Sep 17 00:00:00 2001
From: MillenniumEarl <MillenniumEarl@users.noreply.github.com>
Date: Sun, 21 Feb 2021 12:50:46 +0100
Subject: [PATCH] Add post-parser.ts

---
 src/scripts/post-parser.ts | 291 +++++++++++++++++++++++++++++++++++++
 1 file changed, 291 insertions(+)
 create mode 100644 src/scripts/post-parser.ts
diff --git a/src/scripts/post-parser.ts b/src/scripts/post-parser.ts
new file mode 100644
index 0000000..52fbfea
--- /dev/null
+++ b/src/scripts/post-parser.ts
@@ -0,0 +1,291 @@
+
+//#region Interfaces
+
+export interface IPostElement {
+    Type: "Empty" | "Text" | "Link" | "Image" | "Spoiler", // Kind of data held by the element
+    Name: string, // Title of the element (empty string when it has none)
+    Text: string, // Text owned by the element
+    Content: IPostElement[] // Sub-elements nested in this element
+}
+
+export interface ILink extends IPostElement {
+    Type: "Image" | "Link", // Only images and links carry a URL
+    Href: string, // URL of the link target or of the image
+}
+
+//#endregion Interfaces
+
+//#region Public methods
+/**
+ * Given the main post of the page (#1) it extracts the information contained.
+ * @param $ Cheerio root used to wrap the raw nodes of the post.
+ * @param post Selection whose contents (the nodes of the post) are parsed.
+ * @returns The title/data pairs produced by `parsePostElements`.
+ */
+export function parseCheerioMainPost($: cheerio.Root, post: cheerio.Cheerio): IPostElement[] {
+    // The data is divided between "tag" and "text" elements. Simple data is a
+    // "tag" element followed by a "text" element, while more complex data
+    // (contained in spoilers) is a "tag" element, followed by a text containing
+    // only ":" and then by a "tag" element having "Spoiler" as its first term.
+
+    // First fetch all the elements in the post, dropping the empty ones
+    const elements = post
+        .contents()
+        .toArray()
+        .map((el) => parseCheerioNode($, el))
+        .filter((node) => Boolean(node.Name || node.Text) || node.Content.length !== 0);
+
+    // ... then parse the elements to create the pairs of title/data
+    return parsePostElements(elements);
+}
+//#endregion Public methods
+
+//#region Private methods
+
+/**
+ * Process a spoiler element by getting its text broken
+ * down by any other spoiler elements present.
+ * @param $ Cheerio root used to wrap the raw nodes.
+ * @param spoiler Selection of the spoiler block (`div.bbCodeSpoiler`).
+ * @returns A "Spoiler" element: `Name` is the button label, `Text` the
+ * cleaned text of the block and `Content` its nested spoilers.
+ */
+function parseCheerioSpoilerNode($: cheerio.Root, spoiler: cheerio.Cheerio): IPostElement {
+    // A spoiler block is composed of a div with class "bbCodeSpoiler",
+    // containing a div "bbCodeSpoiler-content" containing, in cascade,
+    // a div with class "bbCodeBlock--spoiler" and a div with class "bbCodeBlock-content".
+    // This last tag contains the required data.
+
+    // Local variables
+    const SPOILER_CLASS = "bbCodeSpoiler";
+    const BUTTON_CLASS = "button.bbCodeSpoiler-button";
+    const SPOILER_CONTENT_CLASS = "div.bbCodeSpoiler-content > div.bbCodeBlock--spoiler > div.bbCodeBlock-content";
+    const content: IPostElement = {
+        Type: "Spoiler",
+        Name: "",
+        Text: "",
+        Content: []
+    };
+
+    // Find the title of the spoiler (contained in the button)
+    content.Name = spoiler.find(BUTTON_CLASS).first().text().trim();
+
+    // `find` also matches the content of the nested spoilers: only the first
+    // match (the block of this spoiler) is parsed here, otherwise the nested
+    // text and spoilers would be added to this element a second time.
+    spoiler.find(SPOILER_CONTENT_CLASS).first().contents().each((_idx, el) => {
+        // Convert the element
+        const element = $(el);
+
+        if (element.hasClass(SPOILER_CLASS)) {
+            // Parse nested spoiler (detected even if it has other classes)
+            content.Content.push(parseCheerioSpoilerNode($, element));
+        } else if (el.type === "text") {
+            // Append text
+            content.Text += element.text();
+        }
+    });
+
+    // Clean text
+    content.Text = content.Text.replace(/\s\s+/g, ' ').trim();
+
+    return content;
+}
+
+/**
+ * Tells whether a node counts as text: either a plain text
+ * node or one of the inline formatting tags (`<b>`, `<i>`).
+ */
+function isTextNode(node: cheerio.Element): boolean {
+    if (node.type === "text") return true;
+    if (node.type !== "tag") return false;
+
+    // Formatting tags are treated as part of the surrounding text
+    return ["b", "i"].includes(node.name);
+}
+
+/**
+ * Returns the text directly owned by the node: the concatenation of its
+ * text children and formatted children (i.e. `<b>`), skipping every other
+ * child element. Whitespace runs are collapsed and the result is trimmed.
+ */
+function getCheerioNonChildrenText(node: cheerio.Cheerio): string {
+    // Only the first element of the selection is inspected
+    const ownNodes = node.first().contents().filter((_idx, child) => isTextNode(child));
+
+    // Collapse whitespace runs and strip the ends
+    const rawText = ownNodes.text();
+    return rawText.replace(/\s\s+/g, ' ').trim();
+}
+
+/**
+ * Process a node and see if it contains a link or image. Attributes that
+ * are missing are returned as "" so `Text` and `Href` are always strings.
+ * @param element Selection whose first node is inspected.
+ * @returns An "Image" (`<img>`) or "Link" (`<a>`) element; `null` for anything else.
+ */
+function parseCheerioLinkNode(element: cheerio.Cheerio): ILink | null {
+    const first = element[0];
+    const name = first && first.type === "tag" ? first.name : undefined; // only tags have a name
+
+    if (name === "img") {
+        return {
+            Name: "",
+            Type: "Image",
+            Text: element.attr("alt") ?? "",
+            Href: element.attr("data-src") ?? element.attr("src") ?? "", // `data-src` wins over `src`
+            Content: []
+        };
+    }
+    if (name === "a") {
+        return {
+            Name: "",
+            Type: "Link",
+            Text: element.text().replace(/\s\s+/g, ' ').trim(),
+            Href: element.attr("href") ?? "",
+            Content: []
+        };
+    }
+    return null;
+}
+
+/**
+ * Merges an element with its only subnode when the pair lacks a name and a
+ * text, or shares one of them. The element is modified in place and returned.
+ */
+function reducePostElement(element: IPostElement): IPostElement {
+    // Only elements wrapping exactly one subnode can be collapsed
+    if (element.Content.length !== 1) return element;
+
+    const child = element.Content[0] as IPostElement;
+    const nameMissing = !element.Name || !child.Name;
+    const textMissing = !element.Text || !child.Text;
+    const sharesValue = element.Name === child.Name || element.Text === child.Text;
+    if (!(nameMissing && textMissing) && !sharesValue) return element;
+
+    // Absorb the subnode: own name/text are kept when set, the rest is adopted
+    element.Name = element.Name || child.Name;
+    element.Text = element.Text || child.Text;
+    element.Content = child.Content;
+    element.Type = child.Type;
+
+    // If the subnode is a link, add its HREF to the element
+    const href = (child as ILink).Href;
+    if (href) (element as ILink).Href = href;
+    return element;
+}
+
+/**
+ * Transform a `cheerio.Cheerio` node into an `IPostElement` element with its subnodes.
+ * @param $ Cheerio root used to wrap the raw nodes.
+ * @param node Raw node to convert.
+ * @param reduce Compress subsequent subnodes if they contain no information. Default: `true`.
+ * @returns The converted element, collapsed by `reducePostElement` when `reduce` is set.
+ */
+function parseCheerioNode($: cheerio.Root, node: cheerio.Element, reduce = true): IPostElement {
+    // Local variables
+    const content: IPostElement = {
+        Type: "Empty",
+        Name: "",
+        Text: "",
+        Content: []
+    };
+    const cheerioNode = $(node);
+
+    if (isTextNode(node)) {
+        // Text and formatted (`<b>`, `<i>`) nodes only carry their cleaned text
+        content.Text = cheerioNode.text().replace(/\s\s+/g, ' ').trim();
+        content.Type = "Text";
+    } else {
+        // Child elements owned by the node (text nodes are not included)
+        const children = cheerioNode.children();
+
+        // Get the text of the element without its children
+        content.Text = getCheerioNonChildrenText(cheerioNode);
+
+        if (cheerioNode.hasClass("bbCodeSpoiler")) {
+            // Parse spoilers. `hasClass` is used instead of comparing the whole
+            // `class` attribute so spoilers with additional classes are detected.
+            content.Content.push(parseCheerioSpoilerNode($, cheerioNode));
+            content.Type = "Spoiler";
+        } else if (children.length === 0) {
+            // Parse links and images (leaf elements)
+            const link = parseCheerioLinkNode(cheerioNode);
+
+            // Add element if not null
+            if (link) {
+                content.Content.push(link);
+                content.Type = "Link";
+            }
+        } else {
+            children.each((_idx, el) => {
+                // Formatted children are already part of the text read above
+                if (isTextNode(el)) return;
+
+                // Parse the child and keep it only when it is not empty
+                const childElement = parseCheerioNode($, el);
+                if (childElement.Text || childElement.Content.length !== 0) {
+                    content.Content.push(childElement);
+                }
+            });
+        }
+    }
+
+    return reduce ? reducePostElement(content) : content;
+}
+
+/**
+ * It simplifies the `IPostElement` elements by associating
+ * the corresponding value to each characterizing element (i.e. author).
+ * @param elements Elements of the post, in document order (they are not modified).
+ * @returns The "title" elements, each one holding its data in `Text` and `Content`.
+ */
+function parsePostElements(elements: IPostElement[]): IPostElement[] {
+    // Local variables
+    const pairs: IPostElement[] = [];
+    const specialCharsRegex = /^[-!$%^&*()_+|~=`{}\[\]:";'<>?,.\/]/;
+    const OVERVIEW_PREFIX = "Overview:\n";
+
+    for (const element of elements) {
+        // Get the latest IPostElement in "pairs" (`undefined` if there is none)
+        const lastPair = pairs[pairs.length - 1];
+
+        // A text starting with a special char (i.e. ": value") is the
+        // "data" of the last title appended to 'pairs'
+        if (lastPair && element.Type === "Text" && specialCharsRegex.test(element.Text)) {
+            const cleanText = element.Text.replace(specialCharsRegex, "").trim();
+            lastPair.Text = lastPair.Text || cleanText;
+            lastPair.Content.push(...element.Content);
+        }
+        // This is a special case
+        else if (element.Text.startsWith(OVERVIEW_PREFIX)) {
+            // We add the overview to the pairs as a text element
+            pairs.push({
+                ...element,
+                Type: "Text",
+                Name: "Overview",
+                Text: element.Text.replace(OVERVIEW_PREFIX, ""),
+                Content: [...element.Content]
+            });
+        }
+        // We have an element referred to the previous "title"
+        else if (lastPair && element.Type !== "Text") {
+            // We append this element to the content of the last title
+            lastPair.Content.push(element);
+        }
+        // ... else we have a "title" (the text becomes the name because it is a title)
+        else {
+            // `Content` is copied: data merged later must not leak into the input
+            pairs.push({
+                ...element,
+                Name: element.Text,
+                Text: "",
+                Content: [...element.Content]
+            });
+        }
+    }
+
+    return pairs;
+}
+
+//#endregion Private methods
\ No newline at end of file