From 71505cc653d5554349561db78cf38777cf2e79e8 Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Thu, 11 Mar 2021 16:16:21 +0100 Subject: [PATCH 1/9] Remove @types/lodash --- package-lock.json | 13 ------------- package.json | 1 - 2 files changed, 14 deletions(-) diff --git a/package-lock.json b/package-lock.json index 53f9349..d5dd753 100644 --- a/package-lock.json +++ b/package-lock.json @@ -22,7 +22,6 @@ "@types/chai": "^4.2.15", "@types/chai-as-promised": "^7.1.3", "@types/inquirer": "^7.3.1", - "@types/lodash": "^4.14.168", "@types/luxon": "^1.25.2", "@types/mocha": "^8.2.1", "@types/node": "^14.14.27", @@ -519,12 +518,6 @@ "integrity": "sha512-cxWFQVseBm6O9Gbw1IWb8r6OS4OhSt3hPZLkFApLjM8TEXROBuQGLAH2i2gZpcXdLBIrpXuTDhH7Vbm1iXmNGA==", "dev": true }, - "node_modules/@types/lodash": { - "version": "4.14.168", - "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.14.168.tgz", - "integrity": "sha512-oVfRvqHV/V6D1yifJbVRU3TMp8OT6o6BG+U9MkwuJ3U8/CsDHvalRpsxBqivn71ztOFZBTfJMvETbqHiaNSj7Q==", - "dev": true - }, "node_modules/@types/luxon": { "version": "1.25.2", "resolved": "https://registry.npmjs.org/@types/luxon/-/luxon-1.25.2.tgz", @@ -5482,12 +5475,6 @@ "integrity": "sha512-cxWFQVseBm6O9Gbw1IWb8r6OS4OhSt3hPZLkFApLjM8TEXROBuQGLAH2i2gZpcXdLBIrpXuTDhH7Vbm1iXmNGA==", "dev": true }, - "@types/lodash": { - "version": "4.14.168", - "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.14.168.tgz", - "integrity": "sha512-oVfRvqHV/V6D1yifJbVRU3TMp8OT6o6BG+U9MkwuJ3U8/CsDHvalRpsxBqivn71ztOFZBTfJMvETbqHiaNSj7Q==", - "dev": true - }, "@types/luxon": { "version": "1.25.2", "resolved": "https://registry.npmjs.org/@types/luxon/-/luxon-1.25.2.tgz", diff --git a/package.json b/package.json index 51733b3..94a299d 100644 --- a/package.json +++ b/package.json @@ -48,7 +48,6 @@ "@types/chai": "^4.2.15", "@types/chai-as-promised": "^7.1.3", "@types/inquirer": "^7.3.1", - "@types/lodash": "^4.14.168", "@types/luxon": "^1.25.2", "@types/mocha": "^8.2.1", "@types/node": "^14.14.27", From 75267aa1a3ffd45c1f66f938b3049049b6d1aa40 Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Thu, 11 Mar 2021 22:12:34 +0100 Subject: [PATCH 2/9] Ignore internal scripts in debug --- .vscode/launch.json | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 6031ce2..097529f 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -5,7 +5,18 @@ "name": "Test", "request": "launch", "command": "npm run test", - "cwd": "${workspaceFolder}" + "cwd": "${workspaceFolder}", + }, + { + "type": "node-terminal", + "name": "Example", + "request": "launch", + "command": "npm run run-example", + "cwd": "${workspaceFolder}", + "skipFiles": [ + "${workspaceFolder}/node_modules/**/*", + "/**/*" + ] }, ] } \ No newline at end of file From 0b6880d5deca91e6bc03834cae84f5971dfa1806 Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Thu, 11 Mar 2021 22:13:56 +0100 Subject: [PATCH 3/9] Add selectors for spoilers elements --- src/scripts/constants/css-selector.ts | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/scripts/constants/css-selector.ts b/src/scripts/constants/css-selector.ts index f9cdfcf..d1c528b 100644 --- a/src/scripts/constants/css-selector.ts +++ b/src/scripts/constants/css-selector.ts @@ -154,7 +154,16 @@ export const POST = { * * For use within a `THREAD.POSTS_IN_PAGE` selector. */ - BOOKMARKED: '* ul.message-attribution-opposite >li > a[title="Bookmark"].is-bookmarked' + BOOKMARKED: '* ul.message-attribution-opposite >li > a[title="Bookmark"].is-bookmarked', + /** + * Button used to hide/show a spoiler element of a post. + */ + SPOILER_BUTTON: "button.bbCodeSpoiler-button", + /** + * Contents of a spoiler element in a post. + */ + SPOILER_CONTENT: + "div.bbCodeSpoiler-content > div.bbCodeBlock--spoiler > div.bbCodeBlock-content" }; export const MEMBER = { From e06f0db041b3806990c8bdf1d36344574fe48922 Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Mon, 15 Mar 2021 15:38:19 +0100 Subject: [PATCH 4/9] Change printWidth to 100 --- .prettierrc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.prettierrc b/.prettierrc index b83c371..60d1775 100644 --- a/.prettierrc +++ b/.prettierrc @@ -2,5 +2,5 @@ "semi": true, "trailingComma": "none", "singleQuote": false, - "printWidth": 90 + "printWidth": 100 } \ No newline at end of file From 1d5836c0d06b6e5a3348b41af55ff17bf4f3167f Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Mon, 15 Mar 2021 15:39:26 +0100 Subject: [PATCH 5/9] Update prettier printWidth --- src/example.ts | 10 ++------- src/scripts/classes/handiwork/handiwork.ts | 9 +------- .../classes/query/handiwork-search-query.ts | 3 +-- .../classes/query/thread-search-query.ts | 3 +-- src/scripts/classes/session.ts | 4 ++-- src/scripts/constants/css-selector.ts | 6 ++---- src/scripts/interfaces.ts | 5 +---- src/scripts/network-helper.ts | 21 +++++-------------- src/scripts/scrape-data/handiwork-parse.ts | 14 ++++--------- src/scripts/scrape-data/json-ld.ts | 4 +--- test/classes/mapping/thread.ts | 4 +--- test/classes/prefix-parser.ts | 10 +-------- 12 files changed, 22 insertions(+), 71 deletions(-) diff --git a/src/example.ts b/src/example.ts index 89f97a5..b036a7f 100644 --- a/src/example.ts +++ b/src/example.ts @@ -60,11 +60,7 @@ async function main() { // Log in the platform console.log("Authenticating..."); - const result = await login( - process.env.F95_USERNAME, - process.env.F95_PASSWORD, - insert2faCode - ); + const result = await login(process.env.F95_USERNAME, process.env.F95_PASSWORD, insert2faCode); console.log(`Authentication result: ${result.message}\n`); // Manage failed login @@ -87,9 +83,7 @@ async function main() { latestQuery.includedTags = ["3d game"]; const latestUpdates = await getLatestUpdates(latestQuery, 1); - console.log( - `"${latestUpdates.shift().name}" was the last "3d game" tagged game to be updated\n` - ); + console.log(`"${latestUpdates.shift().name}" was the last "3d game" tagged game to be updated\n`); // Get game data for (const gamename of gameList) { diff --git a/src/scripts/classes/handiwork/handiwork.ts b/src/scripts/classes/handiwork/handiwork.ts index 6c5af21..cb68d27 100644 --- a/src/scripts/classes/handiwork/handiwork.ts +++ b/src/scripts/classes/handiwork/handiwork.ts @@ -6,14 +6,7 @@ "use strict"; // Modules from files -import { - TAuthor, - TRating, - IHandiwork, - TEngine, - TCategory, - TStatus -} from "../../interfaces"; +import { TAuthor, TRating, IHandiwork, TEngine, TCategory, TStatus } from "../../interfaces"; /** * It represents a generic work, be it a game, a comic, an animation or an asset. diff --git a/src/scripts/classes/query/handiwork-search-query.ts b/src/scripts/classes/query/handiwork-search-query.ts index e55c328..0ab3399 100644 --- a/src/scripts/classes/query/handiwork-search-query.ts +++ b/src/scripts/classes/query/handiwork-search-query.ts @@ -96,8 +96,7 @@ export default class HandiworkSearchQuery implements IQuery { // If the keywords are set or the number // of included tags is greather than 5, // we must perform a thread search - if (this.keywords || this.includedTags.length > MAX_TAGS_LATEST_SEARCH) - return "thread"; + if (this.keywords || this.includedTags.length > MAX_TAGS_LATEST_SEARCH) return "thread"; return DEFAULT_SEARCH_TYPE; } diff --git a/src/scripts/classes/query/thread-search-query.ts b/src/scripts/classes/query/thread-search-query.ts index cfa8308..10569c6 100644 --- a/src/scripts/classes/query/thread-search-query.ts +++ b/src/scripts/classes/query/thread-search-query.ts @@ -130,8 +130,7 @@ export default class ThreadSearchQuery implements IQuery { if (this.excludedTags) params["c[excludeTags]"] = this.excludedTags.join(","); // Set minimum reply number - if (this.minimumReplies > 0) - params["c[min_reply_count]"] = this.minimumReplies.toString(); + if (this.minimumReplies > 0) params["c[min_reply_count]"] = this.minimumReplies.toString(); // Add prefixes const parser = new PrefixParser(); diff --git a/src/scripts/classes/session.ts b/src/scripts/classes/session.ts index 79b328f..a3603b6 100644 --- a/src/scripts/classes/session.ts +++ b/src/scripts/classes/session.ts @@ -210,8 +210,8 @@ export default class Session { // Search for expired cookies const jarValid = - this._cookieJar.getCookiesSync("https://f95zone.to").filter((el) => el.TTL() === 0) - .length === 0; + this._cookieJar.getCookiesSync("https://f95zone.to").filter((el) => el.TTL() === 0).length === + 0; return dateValid && hashValid && jarValid; } diff --git a/src/scripts/constants/css-selector.ts b/src/scripts/constants/css-selector.ts index d1c528b..e1bc733 100644 --- a/src/scripts/constants/css-selector.ts +++ b/src/scripts/constants/css-selector.ts @@ -162,8 +162,7 @@ export const POST = { /** * Contents of a spoiler element in a post. */ - SPOILER_CONTENT: - "div.bbCodeSpoiler-content > div.bbCodeBlock--spoiler > div.bbCodeBlock-content" + SPOILER_CONTENT: "div.bbCodeSpoiler-content > div.bbCodeBlock--spoiler > div.bbCodeBlock-content" }; export const MEMBER = { @@ -214,8 +213,7 @@ export const MEMBER = { * If the text is `Unfollow` then the user is followed. * If the text is `Follow` then the user is not followed. */ - FOLLOWED: - "div.memberHeader-buttons > div.buttonGroup:first-child > a[data-sk-follow] > span", + FOLLOWED: "div.memberHeader-buttons > div.buttonGroup:first-child > a[data-sk-follow] > span", /** * Button used to ignore/unignore the user. * diff --git a/src/scripts/interfaces.ts b/src/scripts/interfaces.ts index ba544c6..33916e6 100644 --- a/src/scripts/interfaces.ts +++ b/src/scripts/interfaces.ts @@ -83,10 +83,7 @@ export type TCategory = "games" | "mods" | "comics" | "animations" | "assets"; /** * Valid names of classes that implement the IQuery interface. */ -export type TQueryInterface = - | "LatestSearchQuery" - | "ThreadSearchQuery" - | "HandiworkSearchQuery"; +export type TQueryInterface = "LatestSearchQuery" | "ThreadSearchQuery" | "HandiworkSearchQuery"; /** * Collection of values defined for each diff --git a/src/scripts/network-helper.ts b/src/scripts/network-helper.ts index 86293bf..d2d29d3 100644 --- a/src/scripts/network-helper.ts +++ b/src/scripts/network-helper.ts @@ -86,9 +86,7 @@ export async function fetchHTML( error: null }); - return isHTML - ? success(response.value.data as string) - : failure(unexpectedResponseError); + return isHTML ? success(response.value.data as string) : failure(unexpectedResponseError); } else return failure(response.value as GenericAxiosError); } @@ -105,8 +103,7 @@ export async function authenticate( force: boolean = false ): Promise { shared.logger.info(`Authenticating with user ${credentials.username}`); - if (!credentials.token) - throw new InvalidF95Token(`Invalid token for auth: ${credentials.token}`); + if (!credentials.token) throw new InvalidF95Token(`Invalid token for auth: ${credentials.token}`); // Secure the URL const secureURL = enforceHttpsUrl(urls.LOGIN); @@ -216,9 +213,7 @@ export async function fetchGETResponse( const response = await axios.get(secureURL, commonConfig); return success(response); } catch (e) { - shared.logger.error( - `(GET) Error ${e.message} occurred while trying to fetch ${secureURL}` - ); + shared.logger.error(`(GET) Error ${e.message} occurred while trying to fetch ${secureURL}`); const genericError = new GenericAxiosError({ id: 1, message: `(GET) Error ${e.message} occurred while trying to fetch ${secureURL}`, @@ -305,10 +300,7 @@ export function isStringAValidURL(url: string): boolean { * If `true`, the function will consider redirects a violation and return `false`. * Default: `false` */ -export async function urlExists( - url: string, - checkRedirect: boolean = false -): Promise { +export async function urlExists(url: string, checkRedirect: boolean = false): Promise { // Local variables let valid = false; @@ -376,10 +368,7 @@ function manageLoginPOSTResponse(response: AxiosResponse) { } // Get the error message (if any) and remove the new line chars - const errorMessage = $("body") - .find(GENERIC.LOGIN_MESSAGE_ERROR) - .text() - .replace(/\n/g, ""); + const errorMessage = $("body").find(GENERIC.LOGIN_MESSAGE_ERROR).text().replace(/\n/g, ""); // Return the result of the authentication const result = errorMessage.trim() === ""; diff --git a/src/scripts/scrape-data/handiwork-parse.ts b/src/scripts/scrape-data/handiwork-parse.ts index 892295b..98b4a25 100644 --- a/src/scripts/scrape-data/handiwork-parse.ts +++ b/src/scripts/scrape-data/handiwork-parse.ts @@ -122,10 +122,7 @@ function stringToBoolean(s: string): boolean { * * Case-insensitive. */ -function getPostElementByName( - elements: IPostElement[], - name: string -): IPostElement | undefined { +function getPostElementByName(elements: IPostElement[], name: string): IPostElement | undefined { return elements.find((el) => el.name.toUpperCase() === name.toUpperCase()); } @@ -162,8 +159,7 @@ function fillWithPrefixes(hw: HandiWork, prefixes: string[]) { // Check what the prefix indicates if (stringInDict(prefix, shared.prefixes["engines"])) engine = prefix as TEngine; - else if (stringInDict(prefix, shared.prefixes["statuses"])) - status = prefix as TStatus; + else if (stringInDict(prefix, shared.prefixes["statuses"])) status = prefix as TStatus; else if (stringInDict(prefix, fakeModDict)) mod = true; // Anyway add the prefix to list @@ -206,8 +202,7 @@ function fillWithPostData(hw: HandiWork, elements: IPostElement[]) { // Parse the censorship const censored = - getPostElementByName(elements, "censored") || - getPostElementByName(elements, "censorship"); + getPostElementByName(elements, "censored") || getPostElementByName(elements, "censorship"); if (censored) hw.censored = stringToBoolean(censored.text); // Get the genres @@ -249,8 +244,7 @@ function fillWithPostData(hw: HandiWork, elements: IPostElement[]) { //#region Get the changelog hw.changelog = []; const changelogElement = - getPostElementByName(elements, "changelog") || - getPostElementByName(elements, "change-log"); + getPostElementByName(elements, "changelog") || getPostElementByName(elements, "change-log"); if (changelogElement) { const changelogSpoiler = changelogElement?.content.find((el) => { return el.type === "Spoiler" && el.content.length > 0; diff --git a/src/scripts/scrape-data/json-ld.ts b/src/scripts/scrape-data/json-ld.ts index 360492a..446ee1c 100644 --- a/src/scripts/scrape-data/json-ld.ts +++ b/src/scripts/scrape-data/json-ld.ts @@ -59,9 +59,7 @@ function parseJSONLD(element: cheerio.Element): TJsonLD { const html = cheerio(element).html().trim(); // Obtain the JSON-LD - const data = html - .replace('", ""); + const data = html.replace('", ""); // Convert the string to an object return JSON.parse(data); diff --git a/test/classes/mapping/thread.ts b/test/classes/mapping/thread.ts index e9bac61..f932a40 100644 --- a/test/classes/mapping/thread.ts +++ b/test/classes/mapping/thread.ts @@ -34,8 +34,6 @@ export function suite(): void { it("Fetch post with invalid ID", async function fetchWithInvalidID() { Shared.setIsLogged(true); const thread = new Thread(-1); - await expect(thread.getPost(0)).to.be.rejectedWith( - "Index must be greater or equal than 1" - ); + await expect(thread.getPost(0)).to.be.rejectedWith("Index must be greater or equal than 1"); }); } diff --git a/test/classes/prefix-parser.ts b/test/classes/prefix-parser.ts index 6460179..f4a28cd 100644 --- a/test/classes/prefix-parser.ts +++ b/test/classes/prefix-parser.ts @@ -29,15 +29,7 @@ export function suite(): void { // Test values const testIDs = [103, 225, 44, 13, 2, 7, 22]; - const testPrefixes = [ - "corruption", - "pregnancy", - "slave", - "VN", - "RPGM", - "Ren'Py", - "Abandoned" - ]; + const testPrefixes = ["corruption", "pregnancy", "slave", "VN", "RPGM", "Ren'Py", "Abandoned"]; // Parse values const ids = parser.prefixesToIDs(testPrefixes); From 7b64528fd06d7c24a8a453947aae2923a3c10595 Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Mon, 15 Mar 2021 18:43:03 +0100 Subject: [PATCH 6/9] Reworked script --- src/scripts/scrape-data/post-parse.ts | 380 ++++++++++++++++---------- 1 file changed, 234 insertions(+), 146 deletions(-) diff --git a/src/scripts/scrape-data/post-parse.ts b/src/scripts/scrape-data/post-parse.ts index 020fee2..c4dbf6b 100644 --- a/src/scripts/scrape-data/post-parse.ts +++ b/src/scripts/scrape-data/post-parse.ts @@ -5,6 +5,9 @@ "use strict"; +// Import from files +import { POST } from "../constants/css-selector"; + //#region Interfaces export interface IPostElement { @@ -22,6 +25,7 @@ export interface ILink extends IPostElement { //#endregion Interfaces //#region Public methods + /** * Given a post of a thread page it extracts the information contained in the body. */ @@ -40,34 +44,81 @@ export function parseF95ThreadPost( const elements = post .contents() .toArray() - .map((el) => parseCheerioNode($, el)) - .filter((node) => node.name || node.text || node.content.length != 0); + .map((el) => parseCheerioNode($, el)) // Parse the nodes + .filter((el) => !isPostElementEmpty(el)) // Ignore the empty nodes + .map((el) => reducePostElement(el)); // Compress the nodes // ... then parse the elements to create the pairs of title/data - return parsePostElements(elements); + return associateElementsWithName(elements); } + //#endregion Public methods //#region Private methods +//#region Node type + +/** + * Check if the node passed as a parameter is a formatting one (i.e. ``). + */ +function isFormattingNode(node: cheerio.Element): boolean { + const formattedTags = ["b", "i"]; + return node.type === "tag" && formattedTags.includes(node.name); +} + +/** + * Check if the node passed as a parameter is of text type. + */ +function isTextNode(node: cheerio.Element): boolean { + return node.type === "text"; +} + +/** + * Check if the node is a spoiler. + */ +function isSpoilerNode(node: cheerio.Cheerio): boolean { + return node.attr("class") === "bbCodeSpoiler"; +} + +/** + * Check if the node is a link or a image. + */ +function isLinkNode(node: cheerio.Element): boolean { + // Local variables + let valid = false; + + // The node is a valid DOM element + if (node.type === "tag") { + const el = node as cheerio.TagElement; + valid = el.name === "a" || el.name === "img"; + } + + return valid; +} + +/** + * Check if the node is a `noscript` tag. + */ +function isNoScriptNode(node: cheerio.Element): boolean { + return node.type === "tag" && node.name === "noscript"; +} + +//#endregion Node Type + +//#region Parse Cheerio node + /** * Process a spoiler element by getting its text broken * down by any other spoiler elements present. */ -function parseCheerioSpoilerNode( - $: cheerio.Root, - spoiler: cheerio.Cheerio -): IPostElement { +function parseCheerioSpoilerNode($: cheerio.Root, node: cheerio.Cheerio): IPostElement { // A spoiler block is composed of a div with class "bbCodeSpoiler", // containing a div "bbCodeSpoiler-content" containing, in cascade, // a div with class "bbCodeBlock--spoiler" and a div with class "bbCodeBlock-content". // This last tag contains the required data. // Local variables - const BUTTON_CLASS = "button.bbCodeSpoiler-button"; - const SPOILER_CONTENT_CLASS = - "div.bbCodeSpoiler-content > div.bbCodeBlock--spoiler > div.bbCodeBlock-content"; - const content: IPostElement = { + const spoiler: IPostElement = { type: "Spoiler", name: "", text: "", @@ -75,185 +126,222 @@ function parseCheerioSpoilerNode( }; // Find the title of the spoiler (contained in the button) - const button = spoiler.find(BUTTON_CLASS).toArray().shift(); - content.name = $(button).text().trim(); + spoiler.name = node.find(POST.SPOILER_BUTTON).first().text().trim(); // Parse the content of the spoiler - spoiler - .find(SPOILER_CONTENT_CLASS) + spoiler.content = node + .find(POST.SPOILER_CONTENT) .contents() - .map((idx, el) => { - // Convert the element - const element = $(el); - - // Parse nested spoiler - if (element.attr("class") === "bbCodeSpoiler") { - const spoiler = parseCheerioSpoilerNode($, element); - content.content.push(spoiler); - } else if (el.type === "text") { - // Append text - content.text += element.text(); - } - }); + .toArray() + .map((el) => parseCheerioNode($, el)); // Clean text - content.text = content.text.replace(/\s\s+/g, " ").trim(); - return content; + spoiler.text = spoiler.text.replace(/\s\s+/g, " ").trim(); + return spoiler; } /** - * Check if the node passed as a parameter is of text type. - * This also includes formatted nodes (i.e. ``). + * Process a node that contains a link or image. */ -function isTextNode(node: cheerio.Element): boolean { - const formattedTags = ["b", "i"]; - const isText = node.type === "text"; - const isFormatted = node.type === "tag" && formattedTags.includes(node.name); +function parseCheerioLinkNode(element: cheerio.Cheerio): ILink { + // Local variable + const link: ILink = { + type: "Link", + name: "", + text: "", + href: "", + content: [] + }; - return isText || isFormatted; + if (element.is("img")) { + link.type = "Image"; + link.text = element.attr("alt"); + link.href = element.attr("data-src"); + } else if (element.is("a")) { + link.type = "Link"; + link.text = element.text().replace(/\s\s+/g, " ").trim(); + link.href = element.attr("href"); + } + + return link; } +/** + * Process a text only node. + */ +function parseCheerioTextNode(node: cheerio.Cheerio): IPostElement { + const content: IPostElement = { + type: "Text", + name: "", + text: getCheerioNonChildrenText(node), + content: [] + }; + return content; +} + +//#endregion Parse Cheerio node + +//#region IPostElement utility + +/** + * Check if the node has non empty `name` and `text`. + */ +function isPostElementUnknown(node: IPostElement): boolean { + return node.name.trim() === "" && node.text.trim() === ""; +} + +/** + * Check if the node has a non empty property + * between `name`, `text` and `content`. + */ +function isPostElementEmpty(node: IPostElement): boolean { + return node.content.length === 0 && isPostElementUnknown(node); +} + +/** + * Create a `IPostElement` without name, text or content. + */ +function createEmptyElement(): IPostElement { + return { + type: "Empty", + name: "", + text: "", + content: [] + }; +} + +/** + * Check if the element contains the overview of a thread (post #1). + */ +function elementIsOverview(element: IPostElement): boolean { + // Search the text element that start with "overview" + const result = element.content + .filter((e) => e.type === "Text") + .find((e) => e.text.toUpperCase().startsWith("OVERVIEW")); + return result !== undefined; +} + +/** + * If the element contains the overview of a thread, parse it. + */ +function getOverviewFromElement(element: IPostElement): string { + // Local variables + const alphanumericRegex = new RegExp("[a-zA-Z0-9]+"); + + // Get all the text values of the overview + const textes = element.content + .filter((e) => e.type === "Text") + .filter((e) => { + const cleanValue = e.text.toUpperCase().replace("OVERVIEW", "").trim(); + const isAlphanumeric = alphanumericRegex.test(cleanValue); + + return cleanValue !== "" && isAlphanumeric; + }) + .map((e) => e.text); + + // Joins the textes + return textes.join(" "); +} + +//#endregion IPostElement utility + /** * Gets the text of the node only, excluding child nodes. * Also includes formatted text elements (i.e. ``). */ function getCheerioNonChildrenText(node: cheerio.Cheerio): string { - // Find all the text nodes in the node - const text = node - .first() - .contents() - .filter((idx, el) => { - return isTextNode(el); - }) - .text(); + // Local variable + let text = ""; + + // If the node has no children, return the node's text + if (node.contents().length === 1) { + // @todo Remove IF after cheerio RC6 + text = node.text(); + } else { + // Find all the text nodes in the node + text = node + .first() + .contents() // @todo Change to children() after cheerio RC6 + .filter((idx, el) => isTextNode(el)) + .text(); + } // Clean and return the text return text.replace(/\s\s+/g, " ").trim(); } -/** - * Process a node and see if it contains a - * link or image. If not, it returns `null`. - */ -function parseCheerioLinkNode(element: cheerio.Cheerio): ILink | null { - //@ts-ignore - const name = element[0]?.name; - const link: ILink = { - name: "", - type: "Link", - text: "", - href: "", - content: [] - }; - - if (name === "img") { - link.type = "Image"; - link.text = element.attr("alt"); - link.href = element.attr("data-src"); - } else if (name === "a") { - link.type = "Link"; - link.text = element.text().replace(/\s\s+/g, " ").trim(); - link.href = element.attr("href"); - } - - return link.href ? link : null; -} - /** * Collapse an `IPostElement` element with a single subnode * in the `Content` field in case it has no information. */ -function reducePostElement(element: IPostElement): IPostElement { - if (element.content.length === 1) { - const content = element.content[0] as IPostElement; - const nullValues = - (!element.name || !content.name) && (!element.text || !content.text); - const sameValues = element.name === content.name || element.text === content.text; +function reducePostElement(element: IPostElement, recursive = true): IPostElement { + // Local variables + const shallowCopy = Object.assign({}, element); - if (nullValues || sameValues) { - element.name = element.name || content.name; - element.text = element.text || content.text; - element.content.push(...content.content); - element.type = content.type; - - // If the content is a link, add the HREF to the element - const contentILink = content as ILink; - const elementILink = element as ILink; - if (contentILink.href) elementILink.href = contentILink.href; - } + // Find the posts without name and text + const unknownChildrens = shallowCopy.content.filter((e) => isPostElementUnknown(e)); + if (recursive) { + const recursiveUnknownChildrens = unknownChildrens.map((e) => reducePostElement(e)); + unknownChildrens.push(...recursiveUnknownChildrens); } - return element; + // Eliminate non-useful child nodes + if (isPostElementUnknown(shallowCopy) && unknownChildrens.length > 0) { + // Find the valid elements to add to the node + const childContents = unknownChildrens + .filter((e) => !shallowCopy.content.includes(e)) + .map((e) => (e.content.length > 0 ? e.content : e)); + + // Remove the empty elements + shallowCopy.content = shallowCopy.content.filter( + (e) => !unknownChildrens.includes(e) + ); + + // Merge the non-empty children of this node with + // the content of the empty children of this node + const newContent = [].concat(...childContents); + shallowCopy.content.push(...newContent); + } + // If the node has only one child, return it + else if (isPostElementUnknown(shallowCopy) && shallowCopy.content.length === 1) { + return shallowCopy.content[0]; + } + return shallowCopy; } /** * Transform a `cheerio.Cheerio` node into an `IPostElement` element with its subnodes. - * @param reduce Compress subsequent subnodes if they contain no information. Default: `true`. */ -function parseCheerioNode( - $: cheerio.Root, - node: cheerio.Element, - reduce = true -): IPostElement { +function parseCheerioNode($: cheerio.Root, node: cheerio.Element): IPostElement { // Local variables - const content: IPostElement = { - type: "Empty", - name: "", - text: "", - content: [] - }; + let post: IPostElement = createEmptyElement(); const cheerioNode = $(node); - if (isTextNode(node)) { - content.text = cheerioNode.text().replace(/\s\s+/g, " ").trim(); - content.type = "Text"; - } else { - // Get the number of children that the element own - const nChildren = cheerioNode.children().length; + // Parse the node + if (!isNoScriptNode(node)) { + if (isTextNode(node) && !isFormattingNode(node)) + post = parseCheerioTextNode(cheerioNode); + else if (isSpoilerNode(cheerioNode)) post = parseCheerioSpoilerNode($, cheerioNode); + else if (isLinkNode(node)) post = parseCheerioLinkNode(cheerioNode); - // Get the text of the element without childrens - content.text = getCheerioNonChildrenText(cheerioNode); - - // Parse spoilers - if (cheerioNode.attr("class") === "bbCodeSpoiler") { - const spoiler = parseCheerioSpoilerNode($, cheerioNode); - - // Add element if not null - if (spoiler) { - content.content.push(spoiler); - content.type = "Spoiler"; - } - } - // Parse links - else if (nChildren === 0 && cheerioNode.length != 0) { - const link = parseCheerioLinkNode(cheerioNode); - - // Add element if not null - if (link) { - content.content.push(link); - content.type = "Link"; - } - } else { - cheerioNode.children().map((idx, el) => { - // Parse the children of the element passed as parameter - const childElement = parseCheerioNode($, el); - - // If the children is valid (not empty) push it - if ((childElement.text || childElement.content.length !== 0) && !isTextNode(el)) { - content.content.push(childElement); - } - }); - } + // Parse the node's childrens + const childPosts = cheerioNode + .contents() // @todo Change to children() after cheerio RC6 + .toArray() + .filter((el) => el) // Ignore undefined elements + .map((el) => parseCheerioNode($, el)) + .filter((el) => !isPostElementEmpty(el)); + post.content.push(...childPosts); } - return reduce ? reducePostElement(content) : content; + return post; } /** * It simplifies the `IPostElement` elements by associating * the corresponding value to each characterizing element (i.e. author). */ -function parsePostElements(elements: IPostElement[]): IPostElement[] { +function associateElementsWithName(elements: IPostElement[]): IPostElement[] { // Local variables const pairs: IPostElement[] = []; const specialCharsRegex = /^[-!$%^&*()_+|~=`{}[\]:";'<>?,./]/; @@ -275,11 +363,11 @@ function parsePostElements(elements: IPostElement[]): IPostElement[] { lastPair.content.push(...elements[i].content); } // This is a special case - else if (elements[i].text.startsWith("Overview:\n")) { + else if (elementIsOverview(elements[i])) { // We add the overview to the pairs as a text element elements[i].type = "Text"; elements[i].name = "Overview"; - elements[i].text = elements[i].text.replace("Overview:\n", ""); + elements[i].text = getOverviewFromElement(elements[i]); pairs.push(elements[i]); } // We have an element referred to the previous "title" From 62b5c8ccc871f67c91a8cd8f19cf4198abf4f3c9 Mon Sep 17 00:00:00 2001 From: codefactor-io Date: Mon, 15 Mar 2021 20:33:33 +0000 Subject: [PATCH 7/9] [CodeFactor] Apply fixes --- src/scripts/scrape-data/post-parse.ts | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/scripts/scrape-data/post-parse.ts b/src/scripts/scrape-data/post-parse.ts index c4dbf6b..55319d6 100644 --- a/src/scripts/scrape-data/post-parse.ts +++ b/src/scripts/scrape-data/post-parse.ts @@ -29,10 +29,7 @@ export interface ILink extends IPostElement { /** * Given a post of a thread page it extracts the information contained in the body. */ -export function parseF95ThreadPost( - $: cheerio.Root, - post: cheerio.Cheerio -): IPostElement[] { +export function parseF95ThreadPost($: cheerio.Root, post: cheerio.Cheerio): IPostElement[] { // The data is divided between "tag" and "text" elements. // Simple data is composed of a "tag" element followed // by a "text" element, while more complex data (contained @@ -293,9 +290,7 @@ function reducePostElement(element: IPostElement, recursive = true): IPostElemen .map((e) => (e.content.length > 0 ? e.content : e)); // Remove the empty elements - shallowCopy.content = shallowCopy.content.filter( - (e) => !unknownChildrens.includes(e) - ); + shallowCopy.content = shallowCopy.content.filter((e) => !unknownChildrens.includes(e)); // Merge the non-empty children of this node with // the content of the empty children of this node @@ -319,8 +314,7 @@ function parseCheerioNode($: cheerio.Root, node: cheerio.Element): IPostElement // Parse the node if (!isNoScriptNode(node)) { - if (isTextNode(node) && !isFormattingNode(node)) - post = parseCheerioTextNode(cheerioNode); + if (isTextNode(node) && !isFormattingNode(node)) post = parseCheerioTextNode(cheerioNode); else if (isSpoilerNode(cheerioNode)) post = parseCheerioSpoilerNode($, cheerioNode); else if (isLinkNode(node)) post = parseCheerioLinkNode(cheerioNode); From a1afbbde201661b8ef97ec581e8e4a74bce2996a Mon Sep 17 00:00:00 2001 From: MillenniumEarl Date: Mon, 15 Mar 2021 21:35:56 +0100 Subject: [PATCH 8/9] Prettify scripts --- src/example.ts | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/example.ts b/src/example.ts index b036a7f..89f97a5 100644 --- a/src/example.ts +++ b/src/example.ts @@ -60,7 +60,11 @@ async function main() { // Log in the platform console.log("Authenticating..."); - const result = await login(process.env.F95_USERNAME, process.env.F95_PASSWORD, insert2faCode); + const result = await login( + process.env.F95_USERNAME, + process.env.F95_PASSWORD, + insert2faCode + ); console.log(`Authentication result: ${result.message}\n`); // Manage failed login @@ -83,7 +87,9 @@ async function main() { latestQuery.includedTags = ["3d game"]; const latestUpdates = await getLatestUpdates(latestQuery, 1); - console.log(`"${latestUpdates.shift().name}" was the last "3d game" tagged game to be updated\n`); + console.log( + `"${latestUpdates.shift().name}" was the last "3d game" tagged game to be updated\n` + ); // Get game data for (const gamename of gameList) { From ae32a80d2ae4bca7b614c12230d09e71283965be Mon Sep 17 00:00:00 2001 From: codefactor-io Date: Mon, 15 Mar 2021 20:36:41 +0000 Subject: [PATCH 9/9] [CodeFactor] Apply fixes --- src/example.ts | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/example.ts b/src/example.ts index 89f97a5..b036a7f 100644 --- a/src/example.ts +++ b/src/example.ts @@ -60,11 +60,7 @@ async function main() { // Log in the platform console.log("Authenticating..."); - const result = await login( - process.env.F95_USERNAME, - process.env.F95_PASSWORD, - insert2faCode - ); + const result = await login(process.env.F95_USERNAME, process.env.F95_PASSWORD, insert2faCode); console.log(`Authentication result: ${result.message}\n`); // Manage failed login @@ -87,9 +83,7 @@ async function main() { latestQuery.includedTags = ["3d game"]; const latestUpdates = await getLatestUpdates(latestQuery, 1); - console.log( - `"${latestUpdates.shift().name}" was the last "3d game" tagged game to be updated\n` - ); + console.log(`"${latestUpdates.shift().name}" was the last "3d game" tagged game to be updated\n`); // Get game data for (const gamename of gameList) {