Refactoring code

2021-03-22 11:00:45 +01:00 · 2021-03-22 11:00:45 +01:00 · b7d27483cc
parent b74a212c80
commit b7d27483cc
1 changed files with 34 additions and 66 deletions
--- a/src/scripts/scrape-data/post-parse.ts
+++ b/src/scripts/scrape-data/post-parse.ts
@ -235,6 +235,31 @@ function parseCheerioTextNode(node: cheerio.Cheerio): IPostElement {
  return content;
 }

+/**
+ * Gets the text of the node only, excluding child nodes; whitespace runs are collapsed and the result trimmed.
+ * Also includes formatted text elements (e.g. `<b>`) — NOTE(review): depends on `isTextNode`, defined elsewhere; confirm.
+ */
+function getCheerioNonChildrenText(node: cheerio.Cheerio): string {
+  // Raw text collected from the node, before whitespace cleanup
+  let text = "";
+
+  // If the node has exactly one content node, take the node's whole text
+  if (node.contents().length === 1) {
+    // @todo Remove IF after cheerio RC6
+    text = node.text();
+  } else {
+    // Otherwise keep only the direct contents accepted by isTextNode()
+    text = node
+      .first()
+      .contents() // @todo Change to children() after cheerio RC6
+      .filter((idx, e) => isTextNode(e))
+      .text();
+  }
+
+  // Collapse each run of two or more whitespace characters into one space, then trim
+  return text.replace(/\s\s+/g, " ").trim();
+}
+
 //#endregion Parse Cheerio node

 //#region IPostElement utility
@ -268,66 +293,8 @@ function createGenericElement(): IPostElement {
  };
 }

-/**
- * Check if the element contains the overview of a thread (post #1).
- */
-function elementIsOverview(element: IPostElement): boolean {
-  // Search the text element that start with "overview"
-  const result = element.content
-    .filter((e) => e.type === "Text")
-    .find((e) => e.text.toUpperCase().startsWith("OVERVIEW"));
-  return result !== undefined;
-}
-
-/**
- * If the element contains the overview of a thread, parse it.
- */
-function getOverviewFromElement(element: IPostElement): string {
-  // Local variables
-  const alphanumericRegex = new RegExp("[a-zA-Z0-9]+");
-
-  // Get all the text values of the overview
-  const textes = element.content
-    .filter((e) => e.type === "Text")
-    .filter((e) => {
-      const cleanValue = e.text.toUpperCase().replace("OVERVIEW", "").trim();
-      const isAlphanumeric = alphanumericRegex.test(cleanValue);
-
-      return cleanValue !== "" && isAlphanumeric;
-    })
-    .map((e) => e.text);
-
-  // Joins the textes
-  return textes.join(" ");
-}
-
 //#endregion IPostElement utility

-/**
- * Gets the text of the node only, excluding child nodes.
- * Also includes formatted text elements (i.e. `<b>`).
- */
-function getCheerioNonChildrenText(node: cheerio.Cheerio): string {
-  // Local variable
-  let text = "";
-
-  // If the node has no children, return the node's text
-  if (node.contents().length === 1) {
-    // @todo Remove IF after cheerio RC6
-    text = node.text();
-  } else {
-    // Find all the text nodes in the node
-    text = node
-      .first()
-      .contents() // @todo Change to children() after cheerio RC6
-      .filter((idx, e) => isTextNode(e))
-      .text();
-  }
-
-  // Clean and return the text
-  return text.replace(/\s\s+/g, " ").trim();
-}
-
 /**
 * Collapse an `IPostElement` element with a single subnode
 * in the `Content` field in case it has no information.
@ -434,14 +401,9 @@ function pairUpElements(elements: IPostElement[]): IPostElement[] {
      e.pairs.forEach((e, i) => shallow.splice(index + i, 0, e));
    });

-  // Ignore the "Generic" elements that we have already parsed
-  //const validElements = shallow.filter((e) => e.type !== "Generic");
-
-  // Than we find all the IDs of "Text" elements where the
-  // text doesn't starts with double points. This means
-  // that we find all the IDs of "title" elements.
+  // Then we find the indexes of all the elements that are "titles".
  const indexes = shallow
-    .filter((e, i) => filterValidElements(e, i, shallow))
+    .filter((e, i) => isValidTitleElement(e, i, shallow))
    .map((e) => shallow.indexOf(e));

  // Now we find all the elements between indexes and
@ -449,7 +411,13 @@ function pairUpElements(elements: IPostElement[]): IPostElement[] {
  return indexes.map((i, j) => parseGroupData(i, j, indexes, shallow));
 }

-function filterValidElements(element: IPostElement, index: number, array: IPostElement[]): boolean {
+/**
+ * Verify if the `element` is a valid title.
+ * @param element Element to check
+ * @param index Index of the element in `array`
+ * @param array Array of elements to check
+ */
+function isValidTitleElement(element: IPostElement, index: number, array: IPostElement[]): boolean {
  // Check if this element is a "title" checking also the next element
  const isPostfixDoublePoints = element.text.endsWith(":") && element.text !== ":";
  const nextElementIsValue = array[index + 1]?.text.startsWith(":");