From 8cdc7c718af98c99259a264b44e5f11f7f7f4c12 Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Sun, 21 Mar 2021 11:36:13 +0100 Subject: [PATCH 01/22] Update JS-DOC --- src/scripts/scrape-data/post-parse.ts | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/scripts/scrape-data/post-parse.ts b/src/scripts/scrape-data/post-parse.ts index c7218e9..5a2adb8 100644 --- a/src/scripts/scrape-data/post-parse.ts +++ b/src/scripts/scrape-data/post-parse.ts @@ -10,15 +10,36 @@ import { POST } from "../constants/css-selector"; //#region Interfaces +/** + * Represents an element contained in the post. + */ export interface IPostElement { + /** + * Type of element. + */ type: "Generic" | "Text" | "Link" | "Image" | "Spoiler"; + /** + * Name associated with the element. + */ name: string; + /** + * Text of the content of the element excluding any children. + */ text: string; + /** + * Children elements contained in this element. + */ content: IPostElement[]; } +/** + * Represents a link type link in the post. + */ export interface ILink extends IPostElement { type: "Image" | "Link"; + /** + * Link to the resource. 
+ */ href: string; } From 5f9ad0056a210a3a5892bde4603ee51ac1340f51 Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Sun, 21 Mar 2021 11:40:57 +0100 Subject: [PATCH 02/22] Don't parse spoiler text --- src/scripts/scrape-data/post-parse.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/scripts/scrape-data/post-parse.ts b/src/scripts/scrape-data/post-parse.ts index 5a2adb8..73560fe 100644 --- a/src/scripts/scrape-data/post-parse.ts +++ b/src/scripts/scrape-data/post-parse.ts @@ -162,8 +162,8 @@ function parseCheerioSpoilerNode($: cheerio.Root, node: cheerio.Cheerio): IPostE .toArray() .map((el) => parseCheerioNode($, el)); - // Clean text - spoiler.text = spoiler.text.replace(/\s\s+/g, " ").trim(); + // Clean text (Spoiler has no text) @todo + // spoiler.text = spoiler.text.replace(/\s\s+/g, " ").trim(); return spoiler; } @@ -307,7 +307,7 @@ function reducePostElement(element: IPostElement): IPostElement { // Local variables const shallowCopy = Object.assign({}, element); - // If the node has only one child, return it + // If the node has only one child, reduce and return it if (isPostElementUnknown(shallowCopy) && shallowCopy.content.length === 1) { return reducePostElement(shallowCopy.content[0]); } From 701678b577ba5b4f09129fb181a3b0f4579c7639 Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Sun, 21 Mar 2021 16:05:17 +0100 Subject: [PATCH 03/22] Rework final pair up of elements --- src/scripts/scrape-data/post-parse.ts | 77 +++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/src/scripts/scrape-data/post-parse.ts b/src/scripts/scrape-data/post-parse.ts index 73560fe..716c19e 100644 --- a/src/scripts/scrape-data/post-parse.ts +++ b/src/scripts/scrape-data/post-parse.ts @@ -376,6 +376,7 @@ function associateNameToElements(elements: IPostElement[]): IPostElement[] { const pairs: IPostElement[] = []; const specialCharsRegex = /^[-!$%^&*()_+|~=`{}[\]:";'<>?,./]/; const specialRegex = new RegExp(specialCharsRegex); 
+ const x = pairUp(elements); for (let i = 0; i < elements.length; i++) { // If the text starts with a special char, clean it @@ -417,4 +418,80 @@ function associateNameToElements(elements: IPostElement[]): IPostElement[] { return pairs; } +function pairUp(elements: IPostElement[]): IPostElement[] { + // First ignore the "Generic" type elements, because + // they usually are containers for other data, like + // overview or download links. + const validElements = elements.filter((e) => e.type !== "Generic"); + + // Than we find all the IDs of "Text" elements where the + // text doesn't starts with double points. This means + // that we find all the IDs of "title" elements. + const indexes = validElements + .filter( + (e, i) => + e.type === "Text" && // This element must be a text + ((e.text.endsWith(":") && e.text !== ":") || // This element's text must ends with ":" + validElements[i + 1]?.text.startsWith(":")) // The next element's text must start with ":" + ) + .map((e) => validElements.indexOf(e)); + + // Now we find all the elements between indexes and + // associate them with the previous "title" element + const data = indexes.map((i, j) => parseGroupData(i, j, indexes, validElements)); + + // Now parse all the "invalid" elements, + // so all the elements with "Generic" type + const genericElementsPairs = elements + .filter((e) => e.type === "Generic") + .map((e) => pairUp(e.content)); + + const flatten: IPostElement[] = [].concat(...genericElementsPairs); + data.push(...flatten); + + return data; +} + +function parseGroupData( + start: number, + index: number, + indexes: number[], + elements: IPostElement[] +): IPostElement { + // Local variables + const endsWithSpecialCharsRegex = /[-!$%^&*()_+|~=`{}[\]:";'<>?,./]$/; + const startsWithDoublePointsRegex = /^[:]/; + + // Find all the elements (title + data) of the same data group + const nextIndex = indexes[index + 1] ?? 
elements.length; + const group = elements.slice(start, nextIndex); + + // Extract the title + const title = group.shift(); + + // Assign name and text of the title + title.name = title.text.replace(endsWithSpecialCharsRegex, "").trim(); + title.text = group + .filter((e) => e.type === "Text") + .map((e) => + e.text + .replace(startsWithDoublePointsRegex, "") // Remove the starting ":" from the element's text + .replace(endsWithSpecialCharsRegex, "") // Remove any special chars at the end + .trim() + ) + .join(" "); // Join with space + + // Append all the content of non-text elements. + group + .filter((e) => e.type !== "Text") + .forEach( + (e) => + e.type === "Spoiler" + ? title.content.push(...e.content) // Add all the content fo the spoiler + : title.content.push(e) // Add the element itself + ); + + return title; +} + //#endregion Private methods From 14a290468f9a7ebf9e97e52416a3c9dae1ae3aa4 Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Sun, 21 Mar 2021 16:55:15 +0100 Subject: [PATCH 04/22] Fix removeEmptyContent recursion and parse node children only for formmating nodes --- src/scripts/scrape-data/post-parse.ts | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/scripts/scrape-data/post-parse.ts b/src/scripts/scrape-data/post-parse.ts index 716c19e..cc5e24a 100644 --- a/src/scripts/scrape-data/post-parse.ts +++ b/src/scripts/scrape-data/post-parse.ts @@ -325,11 +325,13 @@ function removeEmptyContentFromElement(element: IPostElement, recursive = true): // Create a copy of the element const copy = Object.assign({}, element); - // Find the non-empty nodes - const validNodes = copy.content.filter((e) => !isPostElementEmpty(e)); - // Reduce nested contents if recursive - if (recursive) validNodes.forEach((e) => removeEmptyContentFromElement(e)); + const recursiveResult = recursive + ? 
element.content.map((e) => removeEmptyContentFromElement(e)) + : copy.content; + + // Find the non-empty nodes + const validNodes = recursiveResult.filter((e) => !isPostElementEmpty(e)); // Assign the nodes copy.content = validNodes; @@ -351,8 +353,10 @@ function parseCheerioNode($: cheerio.Root, node: cheerio.Element): IPostElement else if (isSpoilerNode(cheerioNode)) post = parseCheerioSpoilerNode($, cheerioNode); else if (isLinkNode(node)) post = parseCheerioLinkNode(cheerioNode); - // Avoid duplication of link name - if (!isLinkNode(node)) { + // Check for childrens only if the node is a / element. + // For the link in unnecessary while for the spoilers is + // already done in parseCheerioSpoilerNode + if (isFormattingNode(node)) { // Parse the node's childrens const childPosts = cheerioNode .contents() // @todo Change to children() after cheerio RC6 @@ -479,7 +483,8 @@ function parseGroupData( .replace(endsWithSpecialCharsRegex, "") // Remove any special chars at the end .trim() ) - .join(" "); // Join with space + .join(" ") // Join with space + .trim(); // Append all the content of non-text elements. 
group From d0e87e0ead8fef446e6b3827c2911d1c719b7a2e Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Sun, 21 Mar 2021 16:56:37 +0100 Subject: [PATCH 05/22] Remove only - from the element value --- src/scripts/scrape-data/post-parse.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/scripts/scrape-data/post-parse.ts b/src/scripts/scrape-data/post-parse.ts index cc5e24a..e1d2047 100644 --- a/src/scripts/scrape-data/post-parse.ts +++ b/src/scripts/scrape-data/post-parse.ts @@ -419,7 +419,7 @@ function associateNameToElements(elements: IPostElement[]): IPostElement[] { } } - return pairs; + return pairUp(elements); } function pairUp(elements: IPostElement[]): IPostElement[] { @@ -463,7 +463,7 @@ function parseGroupData( elements: IPostElement[] ): IPostElement { // Local variables - const endsWithSpecialCharsRegex = /[-!$%^&*()_+|~=`{}[\]:";'<>?,./]$/; + const endsWithSpecialCharsRegex = /[-]$/; const startsWithDoublePointsRegex = /^[:]/; // Find all the elements (title + data) of the same data group From dcc5ed973ffb929b4df6b8e6e4c124a7f640af60 Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Sun, 21 Mar 2021 17:10:49 +0100 Subject: [PATCH 06/22] Add list check --- src/scripts/scrape-data/post-parse.ts | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/scripts/scrape-data/post-parse.ts b/src/scripts/scrape-data/post-parse.ts index e1d2047..15b4340 100644 --- a/src/scripts/scrape-data/post-parse.ts +++ b/src/scripts/scrape-data/post-parse.ts @@ -129,6 +129,13 @@ function isNoScriptNode(node: cheerio.Element): boolean { return node.type === "tag" && node.name === "noscript"; } +/** + * Check if the node is a list element, i.e. `
<li>` or `<ul>
      ` tag. + */ +function isListNode(node: cheerio.Element): boolean { + return node.type === "tag" && (node.name === "ul" || node.name === "li"); +} + //#endregion Node Type //#region Parse Cheerio node @@ -353,10 +360,10 @@ function parseCheerioNode($: cheerio.Root, node: cheerio.Element): IPostElement else if (isSpoilerNode(cheerioNode)) post = parseCheerioSpoilerNode($, cheerioNode); else if (isLinkNode(node)) post = parseCheerioLinkNode(cheerioNode); - // Check for childrens only if the node is a / element. - // For the link in unnecessary while for the spoilers is - // already done in parseCheerioSpoilerNode - if (isFormattingNode(node)) { + // Check for childrens only if the node is a / element + // or a list element. For the link in unnecessary while for + // the spoilers is already done in parseCheerioSpoilerNode + if (isFormattingNode(node) || isListNode(node)) { // Parse the node's childrens const childPosts = cheerioNode .contents() // @todo Change to children() after cheerio RC6 @@ -380,7 +387,6 @@ function associateNameToElements(elements: IPostElement[]): IPostElement[] { const pairs: IPostElement[] = []; const specialCharsRegex = /^[-!$%^&*()_+|~=`{}[\]:";'<>?,./]/; const specialRegex = new RegExp(specialCharsRegex); - const x = pairUp(elements); for (let i = 0; i < elements.length; i++) { // If the text starts with a special char, clean it From 751036f0d38bcabbbe29d02c36f15097b08967e9 Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Sun, 21 Mar 2021 18:44:34 +0100 Subject: [PATCH 07/22] Reduce cyclomatic complexity in parseCheerioNode --- src/scripts/scrape-data/post-parse.ts | 70 +++++++++++++++++++-------- 1 file changed, 51 insertions(+), 19 deletions(-) diff --git a/src/scripts/scrape-data/post-parse.ts b/src/scripts/scrape-data/post-parse.ts index 15b4340..768869e 100644 --- a/src/scripts/scrape-data/post-parse.ts +++ b/src/scripts/scrape-data/post-parse.ts @@ -8,6 +8,9 @@ // Import from files import { POST } from 
"../constants/css-selector"; +// Types +type NodeTypeT = "Text" | "Formatted" | "Spoiler" | "Link" | "List" | "Noscript" | "Unknown"; + //#region Interfaces /** @@ -136,6 +139,25 @@ function isListNode(node: cheerio.Element): boolean { return node.type === "tag" && (node.name === "ul" || node.name === "li"); } +/** + * Idetnify the type of node passed by parameter. + */ +function nodeType($: cheerio.Root, node: cheerio.Element): NodeTypeT { + // Function map + const functionMap = { + Text: (node: cheerio.Element) => isTextNode(node) && !isFormattingNode(node), + Formatted: (node: cheerio.Element) => isFormattingNode(node), + Spoiler: (node: cheerio.Element) => isSpoilerNode($(node)), + Link: (node: cheerio.Element) => isLinkNode(node), + List: (node: cheerio.Element) => isListNode(node), + Noscript: (node: cheerio.Element) => isNoScriptNode(node) + }; + + // Parse and return the type of the node + const result = Object.keys(functionMap).find((e) => functionMap[e](node)); + return result ? 
(result as NodeTypeT) : "Unknown"; +} + //#endregion Node Type //#region Parse Cheerio node @@ -351,28 +373,34 @@ function removeEmptyContentFromElement(element: IPostElement, recursive = true): */ function parseCheerioNode($: cheerio.Root, node: cheerio.Element): IPostElement { // Local variables - let post: IPostElement = createGenericElement(); const cheerioNode = $(node); - // Parse the node - if (!isNoScriptNode(node)) { - if (isTextNode(node) && !isFormattingNode(node)) post = parseCheerioTextNode(cheerioNode); - else if (isSpoilerNode(cheerioNode)) post = parseCheerioSpoilerNode($, cheerioNode); - else if (isLinkNode(node)) post = parseCheerioLinkNode(cheerioNode); + // Function mapping + const functionMap = { + Text: (node: cheerio.Cheerio) => parseCheerioTextNode(node), + Spoiler: (node: cheerio.Cheerio) => parseCheerioSpoilerNode($, node), + Link: (node: cheerio.Cheerio) => parseCheerioLinkNode(node) + }; - // Check for childrens only if the node is a / element - // or a list element. For the link in unnecessary while for - // the spoilers is already done in parseCheerioSpoilerNode - if (isFormattingNode(node) || isListNode(node)) { - // Parse the node's childrens - const childPosts = cheerioNode - .contents() // @todo Change to children() after cheerio RC6 - .toArray() - .filter((el) => el) // Ignore undefined elements - .map((el) => parseCheerioNode($, el)) - .filter((el) => !isPostElementEmpty(el)); - post.content.push(...childPosts); - } + // Get the type of node + const type = nodeType($, node); + + // Get the post based on the type of node + const post = Object.keys(functionMap).includes(type) + ? functionMap[type]($(node)) + : createGenericElement(); + + // Parse the childrens only if the node is a / element + // or a list element. 
For the link in unnecessary while for + // the spoilers is already done in parseCheerioSpoilerNode + if (type === "Formatted" || type === "List") { + const childPosts = cheerioNode + .contents() // @todo Change to children() after cheerio RC6 + .toArray() + .filter((e) => e) // Ignore undefined elements + .map((e) => parseCheerioNode($, e)) + .filter((e) => !isPostElementEmpty(e)); + post.content.push(...childPosts); } return post; @@ -428,6 +456,10 @@ function associateNameToElements(elements: IPostElement[]): IPostElement[] { return pairUp(elements); } +/** + * It simplifies the `IPostElement` elements by associating + * the corresponding value to each characterizing element (i.e. author). + */ function pairUp(elements: IPostElement[]): IPostElement[] { // First ignore the "Generic" type elements, because // they usually are containers for other data, like From 6a6827e39a5e5b056f8fb76a68567c4e6939c051 Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Sun, 21 Mar 2021 18:54:58 +0100 Subject: [PATCH 08/22] Fix genre parsing --- src/scripts/scrape-data/handiwork-parse.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/scrape-data/handiwork-parse.ts b/src/scripts/scrape-data/handiwork-parse.ts index 764a904..1d34cf7 100644 --- a/src/scripts/scrape-data/handiwork-parse.ts +++ b/src/scripts/scrape-data/handiwork-parse.ts @@ -206,7 +206,7 @@ function fillWithPostData(hw: HandiWork, elements: IPostElement[]) { if (censored) hw.censored = stringToBoolean(censored.text); // Get the genres - const genre = getPostElementByName(elements, "genre")?.content.shift()?.text; + const genre = getPostElementByName(elements, "genre")?.text; hw.genre = genre ?.split(",") .map((s) => s.trim()) From e211bb30e4136addbadacb025504708d99482052 Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Sun, 21 Mar 2021 18:55:08 +0100 Subject: [PATCH 09/22] Update JS-DOC --- src/scripts/scrape-data/post-parse.ts | 7 +++++++ 1 file changed, 7 insertions(+) diff --git 
a/src/scripts/scrape-data/post-parse.ts b/src/scripts/scrape-data/post-parse.ts index 768869e..0393afc 100644 --- a/src/scripts/scrape-data/post-parse.ts +++ b/src/scripts/scrape-data/post-parse.ts @@ -494,6 +494,13 @@ function pairUp(elements: IPostElement[]): IPostElement[] { return data; } +/** + * Associate the relative values to a title. + * @param start Title index in the `elements` array + * @param index `start` index in `indexes` + * @param indexes List of titles indices in the `elements` array + * @param elements Array of elements to group + */ function parseGroupData( start: number, index: number, From 061008c5a5c8fef888160f825510dcc796d41da6 Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Sun, 21 Mar 2021 18:56:05 +0100 Subject: [PATCH 10/22] Replace associateNameToElements with pairUpElements --- src/scripts/scrape-data/post-parse.ts | 56 ++------------------------- 1 file changed, 3 insertions(+), 53 deletions(-) diff --git a/src/scripts/scrape-data/post-parse.ts b/src/scripts/scrape-data/post-parse.ts index 0393afc..32f344a 100644 --- a/src/scripts/scrape-data/post-parse.ts +++ b/src/scripts/scrape-data/post-parse.ts @@ -78,7 +78,7 @@ export function parseF95ThreadPost($: cheerio.Root, post: cheerio.Cheerio): IPos supernode = removeEmptyContentFromElement(supernode); // Finally parse the elements to create the pairs of title/data - return associateNameToElements(supernode.content); + return pairUpElements(supernode.content); } //#endregion Public methods @@ -410,57 +410,7 @@ function parseCheerioNode($: cheerio.Root, node: cheerio.Element): IPostElement * It simplifies the `IPostElement` elements by associating * the corresponding value to each characterizing element (i.e. author). 
*/ -function associateNameToElements(elements: IPostElement[]): IPostElement[] { - // Local variables - const pairs: IPostElement[] = []; - const specialCharsRegex = /^[-!$%^&*()_+|~=`{}[\]:";'<>?,./]/; - const specialRegex = new RegExp(specialCharsRegex); - - for (let i = 0; i < elements.length; i++) { - // If the text starts with a special char, clean it - const startWithSpecial = specialRegex.test(elements[i].text); - - // Get the latest IPostElement in "pairs" - const lastIndex = pairs.length - 1; - const lastPair = pairs[lastIndex]; - - // If this statement is valid, we have a "data" - if (elements[i].type === "Text" && startWithSpecial && pairs.length > 0) { - // We merge this element with the last element appended to 'pairs' - const cleanText = elements[i].text.replace(specialCharsRegex, "").trim(); - lastPair.text = lastPair.text || cleanText; - lastPair.content.push(...elements[i].content); - } - // This is a special case - else if (elementIsOverview(elements[i])) { - // We add the overview to the pairs as a text element - elements[i].type = "Text"; - elements[i].name = "Overview"; - elements[i].text = getOverviewFromElement(elements[i]); - pairs.push(elements[i]); - } - // We have an element referred to the previous "title" - else if (elements[i].type != "Text" && pairs.length > 0) { - // We append this element to the content of the last title - lastPair.content.push(elements[i]); - } - // ... else we have a "title" (we need to swap the text to the name because it is a title) - else { - const swap: IPostElement = Object.assign({}, elements[i]); - swap.name = elements[i].text; - swap.text = ""; - pairs.push(swap); - } - } - - return pairUp(elements); -} - -/** - * It simplifies the `IPostElement` elements by associating - * the corresponding value to each characterizing element (i.e. author). 
- */ -function pairUp(elements: IPostElement[]): IPostElement[] { +function pairUpElements(elements: IPostElement[]): IPostElement[] { // First ignore the "Generic" type elements, because // they usually are containers for other data, like // overview or download links. @@ -486,7 +436,7 @@ function pairUp(elements: IPostElement[]): IPostElement[] { // so all the elements with "Generic" type const genericElementsPairs = elements .filter((e) => e.type === "Generic") - .map((e) => pairUp(e.content)); + .map((e) => pairUpElements(e.content)); const flatten: IPostElement[] = [].concat(...genericElementsPairs); data.push(...flatten); From fdc944ecbff3fbf0d1994c4078ac58ecfaec0996 Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Sun, 21 Mar 2021 19:13:35 +0100 Subject: [PATCH 11/22] Replace "el" with "e" --- src/scripts/scrape-data/post-parse.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/scripts/scrape-data/post-parse.ts b/src/scripts/scrape-data/post-parse.ts index 32f344a..36ce5a1 100644 --- a/src/scripts/scrape-data/post-parse.ts +++ b/src/scripts/scrape-data/post-parse.ts @@ -118,8 +118,8 @@ function isLinkNode(node: cheerio.Element): boolean { // The node is a valid DOM element if (node.type === "tag") { - const el = node as cheerio.TagElement; - valid = el.name === "a" || el.name === "img"; + const e = node as cheerio.TagElement; + valid = e.name === "a" || e.name === "img"; } return valid; @@ -189,7 +189,7 @@ function parseCheerioSpoilerNode($: cheerio.Root, node: cheerio.Cheerio): IPostE .find(POST.SPOILER_CONTENT) .contents() .toArray() - .map((el) => parseCheerioNode($, el)); + .map((e) => parseCheerioNode($, e)); // Clean text (Spoiler has no text) @todo // spoiler.text = spoiler.text.replace(/\s\s+/g, " ").trim(); @@ -320,7 +320,7 @@ function getCheerioNonChildrenText(node: cheerio.Cheerio): string { text = node .first() .contents() // @todo Change to children() after cheerio RC6 - .filter((idx, el) => isTextNode(el)) + 
.filter((idx, e) => isTextNode(e)) .text(); } From 66e586df6f9ef3bd26abb90a574eb26421956777 Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Sun, 21 Mar 2021 21:59:48 +0100 Subject: [PATCH 12/22] Better parsing of "Generic" elements in pairUpElements --- src/scripts/scrape-data/post-parse.ts | 104 +++++++++++++++++--------- 1 file changed, 68 insertions(+), 36 deletions(-) diff --git a/src/scripts/scrape-data/post-parse.ts b/src/scripts/scrape-data/post-parse.ts index 36ce5a1..5b29291 100644 --- a/src/scripts/scrape-data/post-parse.ts +++ b/src/scripts/scrape-data/post-parse.ts @@ -390,10 +390,11 @@ function parseCheerioNode($: cheerio.Root, node: cheerio.Element): IPostElement ? functionMap[type]($(node)) : createGenericElement(); - // Parse the childrens only if the node is a / element - // or a list element. For the link in unnecessary while for - // the spoilers is already done in parseCheerioSpoilerNode - if (type === "Formatted" || type === "List") { + // Parse the childrens only if the node is a / element, a list + // or a unknown element. For the link in unnecessary while for the + // spoilers is already done in parseCheerioSpoilerNode + const includeTypes: NodeTypeT[] = ["Formatted", "List", "Unknown"]; + if (includeTypes.includes(type)) { const childPosts = cheerioNode .contents() // @todo Change to children() after cheerio RC6 .toArray() @@ -411,39 +412,68 @@ function parseCheerioNode($: cheerio.Root, node: cheerio.Element): IPostElement * the corresponding value to each characterizing element (i.e. author). */ function pairUpElements(elements: IPostElement[]): IPostElement[] { - // First ignore the "Generic" type elements, because - // they usually are containers for other data, like - // overview or download links. 
- const validElements = elements.filter((e) => e.type !== "Generic"); + // Local variables + const shallow = [...elements]; + + // Parse all the generic elements that + // act as "container" for other information + shallow + .filter((e) => e.type === "Generic") + .map((e) => ({ + element: e, + pairs: pairUpElements(e.content) + })) + .forEach((e) => { + // Find the index of the elements + const index = shallow.indexOf(e.element); + + // Remove that elements + shallow.splice(index, 1); + + // Add the pairs at the index of the deleted element + e.pairs.forEach((e, i) => shallow.splice(index + i, 0, e)); + }); + + // Ignore the "Generic" elements that we have already parsed + //const validElements = shallow.filter((e) => e.type !== "Generic"); // Than we find all the IDs of "Text" elements where the // text doesn't starts with double points. This means // that we find all the IDs of "title" elements. - const indexes = validElements - .filter( - (e, i) => - e.type === "Text" && // This element must be a text - ((e.text.endsWith(":") && e.text !== ":") || // This element's text must ends with ":" - validElements[i + 1]?.text.startsWith(":")) // The next element's text must start with ":" - ) - .map((e) => validElements.indexOf(e)); + const indexes = shallow + .filter((e, i) => filterValidElements(e, i, shallow)) + .map((e) => shallow.indexOf(e)); // Now we find all the elements between indexes and // associate them with the previous "title" element - const data = indexes.map((i, j) => parseGroupData(i, j, indexes, validElements)); - - // Now parse all the "invalid" elements, - // so all the elements with "Generic" type - const genericElementsPairs = elements - .filter((e) => e.type === "Generic") - .map((e) => pairUpElements(e.content)); - - const flatten: IPostElement[] = [].concat(...genericElementsPairs); - data.push(...flatten); + const data = indexes.map((i, j) => parseGroupData(i, j, indexes, shallow)); return data; } +function filterValidElements(element: 
IPostElement, index: number, array: IPostElement[]): boolean { + // Check if this element is a "title" checking also the next element + const isPostfixDoublePoints = element.text.endsWith(":") && element.text !== ":"; + const nextElementIsValue = array[index + 1]?.text.startsWith(":"); + const elementIsTextTitle = + element.type === "Text" && (isPostfixDoublePoints || nextElementIsValue); + + // Special values tha must be set has "title" + const specialValues = ["DOWNLOAD"]; + const specialTypes = ["Image"]; + + // Used to ignore already merged elements with name (ignore spoilers) + // because they have as name the content of the spoiler button + const hasName = element.name !== "" && element.type !== "Spoiler"; + + return ( + elementIsTextTitle || + specialTypes.includes(element.type) || + specialValues.includes(element.text.toUpperCase()) || + hasName + ); +} + /** * Associate the relative values to a title. * @param start Title index in the `elements` array @@ -458,7 +488,7 @@ function parseGroupData( elements: IPostElement[] ): IPostElement { // Local variables - const endsWithSpecialCharsRegex = /[-]$/; + const endsWithSpecialCharsRegex = /[-:]$/; const startsWithDoublePointsRegex = /^[:]/; // Find all the elements (title + data) of the same data group @@ -468,6 +498,10 @@ function parseGroupData( // Extract the title const title = group.shift(); + // If the title is already named (beacuse it was + // previously elaborated) return it witout + if (title.name !== "" && title.type !== "Spoiler") return title; + // Assign name and text of the title title.name = title.text.replace(endsWithSpecialCharsRegex, "").trim(); title.text = group @@ -481,15 +515,13 @@ function parseGroupData( .join(" ") // Join with space .trim(); - // Append all the content of non-text elements. - group - .filter((e) => e.type !== "Text") - .forEach( - (e) => - e.type === "Spoiler" - ? 
title.content.push(...e.content) // Add all the content fo the spoiler - : title.content.push(e) // Add the element itself - ); + // Append all the content of the elements. + group.forEach( + (e) => + e.type === "Spoiler" + ? title.content.push(...e.content) // Add all the content fo the spoiler + : title.content.push(e) // Add the element itself + ); return title; } From dcd9744809e3f9220fb6955a808f8744b70315f0 Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Sun, 21 Mar 2021 22:00:27 +0100 Subject: [PATCH 13/22] Fix cover assignment --- src/scripts/scrape-data/handiwork-parse.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/scripts/scrape-data/handiwork-parse.ts b/src/scripts/scrape-data/handiwork-parse.ts index 1d34cf7..b1c3391 100644 --- a/src/scripts/scrape-data/handiwork-parse.ts +++ b/src/scripts/scrape-data/handiwork-parse.ts @@ -193,7 +193,7 @@ function fillWithPostData(hw: HandiWork, elements: IPostElement[]) { ?.text?.split(",") .map((s) => s.trim()); hw.version = getPostElementByName(elements, "version")?.text; - hw.installation = getPostElementByName(elements, "installation")?.content.shift()?.text; + hw.installation = getPostElementByName(elements, "installation")?.text; hw.pages = getPostElementByName(elements, "pages")?.text; hw.resolution = getPostElementByName(elements, "resolution") ?.text?.split(",") From 04e94892395d42cdfbe4c27e3276ca397ebabee6 Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Sun, 21 Mar 2021 22:01:31 +0100 Subject: [PATCH 14/22] Fix cover assignment --- src/scripts/scrape-data/handiwork-parse.ts | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/scripts/scrape-data/handiwork-parse.ts b/src/scripts/scrape-data/handiwork-parse.ts index b1c3391..149e00a 100644 --- a/src/scripts/scrape-data/handiwork-parse.ts +++ b/src/scripts/scrape-data/handiwork-parse.ts @@ -213,9 +213,7 @@ function fillWithPostData(hw: HandiWork, elements: IPostElement[]) { .filter((s) => s !== ""); // Get 
the cover - const cover = getPostElementByName(elements, "overview")?.content.find( - (el) => el.type === "Image" - ) as ILink; + const cover = elements.find((e) => e.type === "Image") as ILink; hw.cover = cover?.href; // Fill the dates From b74a212c806ac4dd2d7d443b55c65b8c8a071a05 Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Sun, 21 Mar 2021 22:22:06 +0100 Subject: [PATCH 15/22] Simplify return --- src/scripts/scrape-data/post-parse.ts | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/scripts/scrape-data/post-parse.ts b/src/scripts/scrape-data/post-parse.ts index 5b29291..776e358 100644 --- a/src/scripts/scrape-data/post-parse.ts +++ b/src/scripts/scrape-data/post-parse.ts @@ -446,9 +446,7 @@ function pairUpElements(elements: IPostElement[]): IPostElement[] { // Now we find all the elements between indexes and // associate them with the previous "title" element - const data = indexes.map((i, j) => parseGroupData(i, j, indexes, shallow)); - - return data; + return indexes.map((i, j) => parseGroupData(i, j, indexes, shallow)); } function filterValidElements(element: IPostElement, index: number, array: IPostElement[]): boolean { From b7d27483cc0188cc3bccf9ae2e1e2285821f61df Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Mon, 22 Mar 2021 11:00:45 +0100 Subject: [PATCH 16/22] Refactoring code --- src/scripts/scrape-data/post-parse.ts | 100 +++++++++----------------- 1 file changed, 34 insertions(+), 66 deletions(-) diff --git a/src/scripts/scrape-data/post-parse.ts b/src/scripts/scrape-data/post-parse.ts index 776e358..dc7e8a2 100644 --- a/src/scripts/scrape-data/post-parse.ts +++ b/src/scripts/scrape-data/post-parse.ts @@ -235,6 +235,31 @@ function parseCheerioTextNode(node: cheerio.Cheerio): IPostElement { return content; } +/** + * Gets the text of the node only, excluding child nodes. + * Also includes formatted text elements (i.e. ``). 
+ */ +function getCheerioNonChildrenText(node: cheerio.Cheerio): string { + // Local variable + let text = ""; + + // If the node has no children, return the node's text + if (node.contents().length === 1) { + // @todo Remove IF after cheerio RC6 + text = node.text(); + } else { + // Find all the text nodes in the node + text = node + .first() + .contents() // @todo Change to children() after cheerio RC6 + .filter((idx, e) => isTextNode(e)) + .text(); + } + + // Clean and return the text + return text.replace(/\s\s+/g, " ").trim(); +} + //#endregion Parse Cheerio node //#region IPostElement utility @@ -268,66 +293,8 @@ function createGenericElement(): IPostElement { }; } -/** - * Check if the element contains the overview of a thread (post #1). - */ -function elementIsOverview(element: IPostElement): boolean { - // Search the text element that start with "overview" - const result = element.content - .filter((e) => e.type === "Text") - .find((e) => e.text.toUpperCase().startsWith("OVERVIEW")); - return result !== undefined; -} - -/** - * If the element contains the overview of a thread, parse it. - */ -function getOverviewFromElement(element: IPostElement): string { - // Local variables - const alphanumericRegex = new RegExp("[a-zA-Z0-9]+"); - - // Get all the text values of the overview - const textes = element.content - .filter((e) => e.type === "Text") - .filter((e) => { - const cleanValue = e.text.toUpperCase().replace("OVERVIEW", "").trim(); - const isAlphanumeric = alphanumericRegex.test(cleanValue); - - return cleanValue !== "" && isAlphanumeric; - }) - .map((e) => e.text); - - // Joins the textes - return textes.join(" "); -} - //#endregion IPostElement utility -/** - * Gets the text of the node only, excluding child nodes. - * Also includes formatted text elements (i.e. ``). 
- */ -function getCheerioNonChildrenText(node: cheerio.Cheerio): string { - // Local variable - let text = ""; - - // If the node has no children, return the node's text - if (node.contents().length === 1) { - // @todo Remove IF after cheerio RC6 - text = node.text(); - } else { - // Find all the text nodes in the node - text = node - .first() - .contents() // @todo Change to children() after cheerio RC6 - .filter((idx, e) => isTextNode(e)) - .text(); - } - - // Clean and return the text - return text.replace(/\s\s+/g, " ").trim(); -} - /** * Collapse an `IPostElement` element with a single subnode * in the `Content` field in case it has no information. @@ -434,14 +401,9 @@ function pairUpElements(elements: IPostElement[]): IPostElement[] { e.pairs.forEach((e, i) => shallow.splice(index + i, 0, e)); }); - // Ignore the "Generic" elements that we have already parsed - //const validElements = shallow.filter((e) => e.type !== "Generic"); - - // Than we find all the IDs of "Text" elements where the - // text doesn't starts with double points. This means - // that we find all the IDs of "title" elements. + // Than we find all the IDs of the elements that are "titles". const indexes = shallow - .filter((e, i) => filterValidElements(e, i, shallow)) + .filter((e, i) => isValidTitleElement(e, i, shallow)) .map((e) => shallow.indexOf(e)); // Now we find all the elements between indexes and @@ -449,7 +411,13 @@ function pairUpElements(elements: IPostElement[]): IPostElement[] { return indexes.map((i, j) => parseGroupData(i, j, indexes, shallow)); } -function filterValidElements(element: IPostElement, index: number, array: IPostElement[]): boolean { +/** + * Verify if the `element` is a valid title. 
+ * @param element Element to check + * @param index Index of the element in `array` + * @param array Array of elements to check + */ +function isValidTitleElement(element: IPostElement, index: number, array: IPostElement[]): boolean { // Check if this element is a "title" checking also the next element const isPostfixDoublePoints = element.text.endsWith(":") && element.text !== ":"; const nextElementIsValue = array[index + 1]?.text.startsWith(":"); From 5112143a0ebb1b6648804ffd5608e0998b54efc7 Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Mon, 22 Mar 2021 11:33:45 +0100 Subject: [PATCH 17/22] Rename NodeTypeT to NodeTypeT --- src/scripts/scrape-data/post-parse.ts | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/scripts/scrape-data/post-parse.ts b/src/scripts/scrape-data/post-parse.ts index dc7e8a2..5d5e9f4 100644 --- a/src/scripts/scrape-data/post-parse.ts +++ b/src/scripts/scrape-data/post-parse.ts @@ -9,7 +9,7 @@ import { POST } from "../constants/css-selector"; // Types -type NodeTypeT = "Text" | "Formatted" | "Spoiler" | "Link" | "List" | "Noscript" | "Unknown"; +type TNodeType = "Text" | "Formatted" | "Spoiler" | "Link" | "List" | "Noscript" | "Unknown"; //#region Interfaces @@ -142,7 +142,7 @@ function isListNode(node: cheerio.Element): boolean { /** * Idetnify the type of node passed by parameter. */ -function nodeType($: cheerio.Root, node: cheerio.Element): NodeTypeT { +function nodeType($: cheerio.Root, node: cheerio.Element): TNodeType { // Function map const functionMap = { Text: (node: cheerio.Element) => isTextNode(node) && !isFormattingNode(node), @@ -155,7 +155,7 @@ function nodeType($: cheerio.Root, node: cheerio.Element): NodeTypeT { // Parse and return the type of the node const result = Object.keys(functionMap).find((e) => functionMap[e](node)); - return result ? (result as NodeTypeT) : "Unknown"; + return result ? 
(result as TNodeType) : "Unknown"; } //#endregion Node Type @@ -360,7 +360,7 @@ function parseCheerioNode($: cheerio.Root, node: cheerio.Element): IPostElement // Parse the childrens only if the node is a / element, a list // or a unknown element. For the link in unnecessary while for the // spoilers is already done in parseCheerioSpoilerNode - const includeTypes: NodeTypeT[] = ["Formatted", "List", "Unknown"]; + const includeTypes: TNodeType[] = ["Formatted", "List", "Unknown"]; if (includeTypes.includes(type)) { const childPosts = cheerioNode .contents() // @todo Change to children() after cheerio RC6 @@ -402,10 +402,12 @@ function pairUpElements(elements: IPostElement[]): IPostElement[] { }); // Than we find all the IDs of the elements that are "titles". - const indexes = shallow + let indexes = shallow .filter((e, i) => isValidTitleElement(e, i, shallow)) .map((e) => shallow.indexOf(e)); + if (indexes.length === 0) indexes = shallow.map((e, i) => i); + // Now we find all the elements between indexes and // associate them with the previous "title" element return indexes.map((i, j) => parseGroupData(i, j, indexes, shallow)); From b4b83f36e13d70ac71f1f8e106ff01bc8c36187e Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Mon, 22 Mar 2021 12:38:23 +0100 Subject: [PATCH 18/22] Clean element before validation to remove useless elements --- src/scripts/scrape-data/post-parse.ts | 31 ++++++++++++++++++++++++--- 1 file changed, 28 insertions(+), 3 deletions(-) diff --git a/src/scripts/scrape-data/post-parse.ts b/src/scripts/scrape-data/post-parse.ts index 5d5e9f4..798f0f7 100644 --- a/src/scripts/scrape-data/post-parse.ts +++ b/src/scripts/scrape-data/post-parse.ts @@ -293,6 +293,29 @@ function createGenericElement(): IPostElement { }; } +/** + * Clean the element `name` and `text` removing initial and final special characters. 
+ */ +function cleanElement(element: IPostElement): IPostElement { + // Local variables + const shallow = Object.assign({}, element); + const specialCharSet = /[-!$%^&*()_+|~=`{}[\]:";'<>?,./]/; + const startsWithSpecialCharsRegex = new RegExp("^" + specialCharSet.source); + const endsWithSpecialCharsRegex = new RegExp(specialCharSet.source + "$"); + + shallow.name = shallow.name + .replace(startsWithSpecialCharsRegex, "") + .replace(endsWithSpecialCharsRegex, "") + .trim(); + + shallow.text = shallow.text + .replace(startsWithSpecialCharsRegex, "") + .replace(endsWithSpecialCharsRegex, "") + .trim(); + + return shallow; +} + //#endregion IPostElement utility /** @@ -327,7 +350,9 @@ function removeEmptyContentFromElement(element: IPostElement, recursive = true): : copy.content; // Find the non-empty nodes - const validNodes = recursiveResult.filter((e) => !isPostElementEmpty(e)); + const validNodes = recursiveResult + .filter((e) => !isPostElementEmpty(e)) // Remove the empty nodes + .filter((e) => !isPostElementEmpty(cleanElement(e))); // Remove the useless nodes // Assign the nodes copy.content = validNodes; @@ -402,11 +427,11 @@ function pairUpElements(elements: IPostElement[]): IPostElement[] { }); // Than we find all the IDs of the elements that are "titles". 
- let indexes = shallow + const indexes = shallow .filter((e, i) => isValidTitleElement(e, i, shallow)) .map((e) => shallow.indexOf(e)); - if (indexes.length === 0) indexes = shallow.map((e, i) => i); + //if (indexes.length === 0) indexes = shallow.map((e, i) => i); // Now we find all the elements between indexes and // associate them with the previous "title" element From a72462becbb85bed0d45be03612b33fa59f98c78 Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Mon, 22 Mar 2021 12:39:25 +0100 Subject: [PATCH 19/22] Coded parseAuthor --- src/scripts/scrape-data/handiwork-parse.ts | 63 ++++++++++++++-------- 1 file changed, 41 insertions(+), 22 deletions(-) diff --git a/src/scripts/scrape-data/handiwork-parse.ts b/src/scripts/scrape-data/handiwork-parse.ts index 149e00a..c99c55d 100644 --- a/src/scripts/scrape-data/handiwork-parse.ts +++ b/src/scripts/scrape-data/handiwork-parse.ts @@ -220,34 +220,15 @@ function fillWithPostData(hw: HandiWork, elements: IPostElement[]) { const releaseDate = getPostElementByName(elements, "release date")?.text; if (DateTime.fromISO(releaseDate).isValid) hw.lastRelease = new Date(releaseDate); - //#region Convert the author - const authorElement = - getPostElementByName(elements, "developer") || - getPostElementByName(elements, "developer/publisher") || - getPostElementByName(elements, "artist"); - const author: TAuthor = { - name: authorElement?.text, - platforms: [] - }; - - // Add the found platforms - authorElement?.content.forEach((el: ILink, idx) => { - const platform: TExternalPlatform = { - name: el.text, - link: el.href - }; - - author.platforms.push(platform); - }); - hw.authors = [author]; - //#endregion Convert the author + // Get the author + hw.authors = parseAuthor(elements); //#region Get the changelog hw.changelog = []; const changelogElement = getPostElementByName(elements, "changelog") || getPostElementByName(elements, "change-log"); - if (changelogElement?.content) { + if (false && changelogElement?.content) { 
const changelogSpoiler = changelogElement.content.find( (el) => el.type === "Spoiler" && el.content.length > 0 ); @@ -264,4 +245,42 @@ function fillWithPostData(hw: HandiWork, elements: IPostElement[]) { //#endregion Get the changelog } +/** + * Parse the author from the post's data. + */ +function parseAuthor(elements: IPostElement[]): TAuthor[] { + // Local variables + const author: TAuthor = { + name: "", + platforms: [] + }; + + // Fetch the authors from the post data + const authorElement = + getPostElementByName(elements, "developer") || + getPostElementByName(elements, "developer/publisher") || + getPostElementByName(elements, "artist"); + + if (authorElement) { + // Set the author name + author.name = authorElement.text; + + // Add the found platforms + authorElement.content.forEach((e: ILink) => { + // Ignore invalid links + if (e.href) { + // Create and push the new platform + const platform: TExternalPlatform = { + name: e.text, + link: e.href + }; + + author.platforms.push(platform); + } + }); + } + + return [author]; +} + //#endregion Private methods From 2acf7699700462a958c19358b8fe11e36c0f58a2 Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Mon, 22 Mar 2021 14:18:39 +0100 Subject: [PATCH 20/22] Add specific type for changelog --- src/scripts/classes/handiwork/animation.ts | 4 ++-- src/scripts/classes/handiwork/asset.ts | 4 ++-- src/scripts/classes/handiwork/comic.ts | 4 ++-- src/scripts/classes/handiwork/game.ts | 4 ++-- src/scripts/classes/handiwork/handiwork.ts | 12 ++++++++++-- src/scripts/interfaces.ts | 16 +++++++++++++++- 6 files changed, 33 insertions(+), 11 deletions(-) diff --git a/src/scripts/classes/handiwork/animation.ts b/src/scripts/classes/handiwork/animation.ts index d097cc8..65c8447 100644 --- a/src/scripts/classes/handiwork/animation.ts +++ b/src/scripts/classes/handiwork/animation.ts @@ -6,7 +6,7 @@ "use strict"; // Modules from files -import { TAuthor, IAnimation, TRating, TCategory } from "../../interfaces"; +import { 
TAuthor, IAnimation, TRating, TCategory, TChangelog } from "../../interfaces"; export default class Animation implements IAnimation { //#region Properties @@ -19,7 +19,7 @@ export default class Animation implements IAnimation { resolution: string[]; authors: TAuthor[]; category: TCategory; - changelog: string[]; + changelog: TChangelog[]; cover: string; id: number; lastThreadUpdate: Date; diff --git a/src/scripts/classes/handiwork/asset.ts b/src/scripts/classes/handiwork/asset.ts index b18a9e5..5a434be 100644 --- a/src/scripts/classes/handiwork/asset.ts +++ b/src/scripts/classes/handiwork/asset.ts @@ -6,7 +6,7 @@ "use strict"; // Modules from files -import { TAuthor, IAsset, TRating, TCategory } from "../../interfaces"; +import { TAuthor, IAsset, TRating, TCategory, TChangelog } from "../../interfaces"; export default class Asset implements IAsset { //#region Properties @@ -18,7 +18,7 @@ export default class Asset implements IAsset { sku: string; authors: TAuthor[]; category: TCategory; - changelog: string[]; + changelog: TChangelog[]; cover: string; id: number; lastThreadUpdate: Date; diff --git a/src/scripts/classes/handiwork/comic.ts b/src/scripts/classes/handiwork/comic.ts index 6bf713a..ff5b795 100644 --- a/src/scripts/classes/handiwork/comic.ts +++ b/src/scripts/classes/handiwork/comic.ts @@ -6,7 +6,7 @@ "use strict"; // Modules from files -import { TAuthor, IComic, TRating, TCategory } from "../../interfaces"; +import { TAuthor, IComic, TRating, TCategory, TChangelog } from "../../interfaces"; export default class Comic implements IComic { //#region Properties @@ -15,7 +15,7 @@ export default class Comic implements IComic { resolution: string[]; authors: TAuthor[]; category: TCategory; - changelog: string[]; + changelog: TChangelog[]; cover: string; id: number; lastThreadUpdate: Date; diff --git a/src/scripts/classes/handiwork/game.ts b/src/scripts/classes/handiwork/game.ts index d422668..80a71d5 100644 --- a/src/scripts/classes/handiwork/game.ts +++ 
b/src/scripts/classes/handiwork/game.ts @@ -6,7 +6,7 @@ "use strict"; // Modules from files -import { TAuthor, TEngine, IGame, TRating, TStatus, TCategory } from "../../interfaces"; +import { TAuthor, TEngine, IGame, TRating, TStatus, TCategory, TChangelog } from "../../interfaces"; export default class Game implements IGame { //#region Properties @@ -22,7 +22,7 @@ export default class Game implements IGame { version: string; authors: TAuthor[]; category: TCategory; - changelog: string[]; + changelog: TChangelog[]; cover: string; id: number; lastThreadUpdate: Date; diff --git a/src/scripts/classes/handiwork/handiwork.ts b/src/scripts/classes/handiwork/handiwork.ts index cb68d27..c558c1a 100644 --- a/src/scripts/classes/handiwork/handiwork.ts +++ b/src/scripts/classes/handiwork/handiwork.ts @@ -6,7 +6,15 @@ "use strict"; // Modules from files -import { TAuthor, TRating, IHandiwork, TEngine, TCategory, TStatus } from "../../interfaces"; +import { + TAuthor, + TRating, + IHandiwork, + TEngine, + TCategory, + TStatus, + TChangelog +} from "../../interfaces"; /** * It represents a generic work, be it a game, a comic, an animation or an asset. @@ -25,7 +33,7 @@ export default class HandiWork implements IHandiwork { version: string; authors: TAuthor[]; category: TCategory; - changelog: string[]; + changelog: TChangelog[]; cover: string; id: number; lastThreadUpdate: Date; diff --git a/src/scripts/interfaces.ts b/src/scripts/interfaces.ts index 33916e6..64b1bae 100644 --- a/src/scripts/interfaces.ts +++ b/src/scripts/interfaces.ts @@ -51,6 +51,20 @@ export type TRating = { count: number; }; +/** + * Information about a single version of the product. + */ +export type TChangelog = { + /** + * Product version. + */ + version: string; + /** + * Version information. + */ + information: string[]; +}; + /** * List of possible graphics engines used for game development. */ @@ -101,7 +115,7 @@ export interface IBasic { /** * List of changes of the work for each version. 
*/ - changelog: string[]; + changelog: TChangelog[]; /** * link to the cover image of the work. */ From bc683b2387d437783e7e3d9bbb247588a970a2a9 Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Mon, 22 Mar 2021 14:18:57 +0100 Subject: [PATCH 21/22] Add special values in isValidTitleElement --- src/scripts/scrape-data/post-parse.ts | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/scripts/scrape-data/post-parse.ts b/src/scripts/scrape-data/post-parse.ts index 798f0f7..3837005 100644 --- a/src/scripts/scrape-data/post-parse.ts +++ b/src/scripts/scrape-data/post-parse.ts @@ -431,8 +431,6 @@ function pairUpElements(elements: IPostElement[]): IPostElement[] { .filter((e, i) => isValidTitleElement(e, i, shallow)) .map((e) => shallow.indexOf(e)); - //if (indexes.length === 0) indexes = shallow.map((e, i) => i); - // Now we find all the elements between indexes and // associate them with the previous "title" element return indexes.map((i, j) => parseGroupData(i, j, indexes, shallow)); @@ -452,7 +450,7 @@ function isValidTitleElement(element: IPostElement, index: number, array: IPostE element.type === "Text" && (isPostfixDoublePoints || nextElementIsValue); // Special values tha must be set has "title" - const specialValues = ["DOWNLOAD"]; + const specialValues = ["DOWNLOAD", "CHANGELOG", "CHANGE-LOG", "GENRE"]; const specialTypes = ["Image"]; // Used to ignore already merged elements with name (ignore spoilers) From f4e9575930c9e4f8b47c8c45a819f28ce2f933dd Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Mon, 22 Mar 2021 14:50:12 +0100 Subject: [PATCH 22/22] Add method for changelog parse --- src/scripts/scrape-data/handiwork-parse.ts | 73 +++++++++++++++------- 1 file changed, 52 insertions(+), 21 deletions(-) diff --git a/src/scripts/scrape-data/handiwork-parse.ts b/src/scripts/scrape-data/handiwork-parse.ts index c99c55d..1699a19 100644 --- a/src/scripts/scrape-data/handiwork-parse.ts +++ b/src/scripts/scrape-data/handiwork-parse.ts @@ -11,7 
+11,7 @@ import { DateTime } from "luxon"; // Modules from files import HandiWork from "../classes/handiwork/handiwork"; import Thread from "../classes/mapping/thread"; -import { IBasic, TAuthor, TEngine, TExternalPlatform, TStatus } from "../interfaces"; +import { IBasic, TAuthor, TChangelog, TEngine, TExternalPlatform, TStatus } from "../interfaces"; import shared, { TPrefixDict } from "../shared"; import { ILink, IPostElement } from "./post-parse"; @@ -223,26 +223,8 @@ function fillWithPostData(hw: HandiWork, elements: IPostElement[]) { // Get the author hw.authors = parseAuthor(elements); - //#region Get the changelog - hw.changelog = []; - const changelogElement = - getPostElementByName(elements, "changelog") || getPostElementByName(elements, "change-log"); - - if (false && changelogElement?.content) { - const changelogSpoiler = changelogElement.content.find( - (el) => el.type === "Spoiler" && el.content.length > 0 - ); - - // Add to the changelog the single spoilers - const spoilers = changelogSpoiler.content - .filter((e) => e.text.trim() !== "") - .map((e) => e.text); - hw.changelog.push(...spoilers); - - // Add at the end also the text of the "changelog" element - hw.changelog.push(changelogSpoiler.text); - } - //#endregion Get the changelog + // Get the changelog + hw.changelog = parseChangelog(elements); } /** @@ -283,4 +265,53 @@ function parseAuthor(elements: IPostElement[]): TAuthor[] { return [author]; } +/** + * Parse the changelog from the post's data. 
+ */ +function parseChangelog(elements: IPostElement[]): TChangelog[] { + // Local variables + const changelog = []; + const changelogElement = + getPostElementByName(elements, "changelog") || getPostElementByName(elements, "change-log"); + + if (changelogElement) { + // regex used to match version tags + const versionRegex = /^v[0-9]+\.[0-9]+.*/; + + // Get the indexes of the version tags + const indexesVersion = changelogElement.content + .filter((e) => e.type === "Text" && versionRegex.test(e.text)) + .map((e) => changelogElement.content.indexOf(e)); + + const results = indexesVersion.map((i, j) => { + // In-loop variable + const versionChangelog: TChangelog = { + version: "", + information: [] + }; + + // Get the difference in indexes between this and the next version tag + const diff = indexesVersion[j + 1] ?? changelogElement.content.length; + + // fetch the group of data of this version tag + const group = changelogElement.content.slice(i, diff); + versionChangelog.version = group.shift().text.replace("v", "").trim(); + + // parse the data + group.forEach((e) => { + if (e.type === "Generic" || e.type === "Spoiler") { + const textes = e.content.map((c) => c.text); + versionChangelog.information.push(...textes); + } else versionChangelog.information.push(e.text); + }); + + return versionChangelog; + }); + + changelog.push(...results); + } + + return changelog; +} + //#endregion Private methods