From 98b357d05f3fdc6915a72fcd931d92b6eb4afc32 Mon Sep 17 00:00:00 2001
From: MillenniumEarl <MillenniumEarl@users.noreply.github.com>
Date: Tue, 2 Mar 2021 12:42:03 +0100
Subject: [PATCH] Scrape handiwork expoliting the new thread class

---
 src/scripts/scrape-data/handiwork-parse.ts | 239 +++++++++++++++++++++
 1 file changed, 239 insertions(+)
 create mode 100644 src/scripts/scrape-data/handiwork-parse.ts
diff --git a/src/scripts/scrape-data/handiwork-parse.ts b/src/scripts/scrape-data/handiwork-parse.ts
new file mode 100644
index 0000000..097435d
--- /dev/null
+++ b/src/scripts/scrape-data/handiwork-parse.ts
@@ -0,0 +1,239 @@
+"use strict";
+
+// Public modules from npm
+import luxon from "luxon";
+
+// Modules from files
+import HandiWork from "../classes/handiwork/handiwork";
+import Thread from "../classes/thread";
+import { IBasic, TAuthor, TEngine, TExternalPlatform, TStatus } from "../interfaces";
+import shared, { TPrefixDict } from "../shared";
+import { ILink, IPostElement } from "./post-parse";
+
+/**
+ * Gets information of a particular handiwork from its thread.
+ * 
+ * If you don't want to specify the object type, use `HandiWork`.
+ * 
+ * @todo It does not currently support assets.
+ */
+export default async function getHandiworkInformation<T extends IBasic>(url: string): Promise<T> {
+    shared.logger.info(`Obtaining handiwork from ${url}`);
+
+    // Fetch thread data
+    const id = extractIDFromURL(url);
+    const thread: Thread = new Thread(id);
+    await thread.fetch();
+
+    // Convert the info from thread to handiwork
+    const hw: HandiWork = {} as HandiWork;
+    hw.id = thread.id;
+    hw.url = thread.url;
+    hw.name = thread.title;
+    hw.category = thread.category;
+    hw.threadPublishingDate = thread.publication;
+    hw.lastThreadUpdate = thread.modified;
+    hw.tags = thread.tags;
+    hw.rating = thread.rating;
+    fillWithPrefixes(hw, thread.prefixes);
+
+    // Fetch info from first post
+    const post = thread.posts.find(p => p.id === 1);
+    fillWithPostData(hw, post.body);
+    
+    return <T><unknown>hw;
+};
+
+//#region Private methods
+
+//#region Utilities
+
+/**
+ * Extracts the work's unique ID from its URL.
+ */
+function extractIDFromURL(url: string): number {
+    shared.logger.trace("Extracting ID from URL...");
+
+    // URL are in the format https://f95zone.to/threads/GAMENAME-VERSION-DEVELOPER.ID/
+    // or https://f95zone.to/threads/ID/
+    const match = url.match(/([0-9]+)(?=\/|\b)(?!-|\.)/);
+    if (!match) return -1;
+
+    // Parse and return number
+    return parseInt(match[0], 10);
+}
+
+/**
+ * Makes an array of strings uppercase.
+ */
+function toUpperCaseArray(a: string[]): string[] {
+    /**
+     * Makes a string uppercase.
+     */
+    function toUpper(s: string): string {
+        return s.toUpperCase();
+    }
+    return a.map(toUpper);
+}
+
+/**
+ * Check if the string `s` is in the dict `a`.
+ * 
+ * Case insensitive.
+ */
+function stringInDict(s: string, a: TPrefixDict): boolean {
+    // Make uppercase all the strings in the array
+    const values = toUpperCaseArray(Object.values(a));
+
+    return values.includes(s.toUpperCase());
+}
+
+/**
+ * Convert a string to a boolean.
+ * 
+ * Check also for `yes`/`no` and `1`/`0`.
+ */
+function stringToBoolean(s: string): boolean {
+    // Local variables
+    const positiveTerms = ["true", "yes", "1"];
+    const negativeTerms = ["false", "no", "0"];
+    const cleanString = s.toLowerCase().trim();
+    let result = Boolean(s);
+
+    if (positiveTerms.includes(cleanString)) result = true;
+    else if (negativeTerms.includes(cleanString)) result = false;
+    return result;
+}
+
+/**
+ * Gets the element with the given name or `undefined`.
+ * 
+ * Case-insensitive.
+ */
+function getPostElementByName(elements: IPostElement[], name: string): IPostElement | undefined {
+    return elements.find(el =>  el.name.toUpperCase() === name.toUpperCase());
+}
+
+//#endregion Utilities
+
+/**
+ * Parse the post prefixes.
+ * 
+ * In particular, it elaborates the following prefixes for games:
+ * `Engine`, `Status`, `Mod`.
+ */
+function fillWithPrefixes(hw: HandiWork, prefixes: string[]) {
+    shared.logger.trace("Parsing prefixes...");
+
+    // Local variables
+    let mod = false;
+    let engine: TEngine = null;
+    let status: TStatus = null;
+
+    /**
+     * Emulated dictionary of mod prefixes.
+     */
+    const fakeModDict: TPrefixDict = {
+        0: "MOD",
+        1: "CHEAT MOD",
+    }
+
+    // Initialize the array
+    hw.prefixes = [];
+
+    prefixes.map((item, idx) => {
+        // Remove the square brackets
+        const prefix = item.replace("[", "").replace("]", "");
+
+        // Check what the prefix indicates
+        if (stringInDict(prefix, shared.prefixes["engines"])) engine = prefix as TEngine;
+        else if (stringInDict(prefix, shared.prefixes["statuses"])) status = prefix as TStatus;
+        else if (stringInDict(prefix, fakeModDict)) mod = true;
+
+        // Anyway add the prefix to list
+        hw.prefixes.push(prefix);
+    });
+
+    // If the status is not set, then the game is in development (Ongoing)
+    status = (!status && hw.category === "games") ? status : "Ongoing";
+
+    hw.engine = engine;
+    hw.status = status;
+    hw.mod = mod;
+}
+
+/**
+ * Compiles a HandiWork object with the data extracted
+ * from the main post of the HandiWork page.
+ * 
+ * The values that will be added are:
+ * `Overview`, `OS`, `Language`, `Version`, `Installation`,
+ * `Pages`, `Resolution`, `Lenght`, `Genre`, `Censored`,
+ * `LastRelease`, `Authors`, `Changelog`, `Cover`.
+ */
+function fillWithPostData(hw: HandiWork, elements: IPostElement[]) {
+    // First fill the "simple" elements
+    hw.overview = getPostElementByName(elements, "overview")?.text;
+    hw.os = getPostElementByName(elements, "os")?.text?.split(",").map(s => s.trim());
+    hw.language = getPostElementByName(elements, "language")?.text?.split(",").map(s => s.trim());
+    hw.version = getPostElementByName(elements, "version")?.text;
+    hw.installation = getPostElementByName(elements, "installation")?.content.shift()?.text;
+    hw.pages = getPostElementByName(elements, "pages")?.text;
+    hw.resolution = getPostElementByName(elements, "resolution")?.text?.split(",").map(s => s.trim());
+    hw.lenght = getPostElementByName(elements, "lenght")?.text;
+
+    // Parse the censorship
+    const censored = getPostElementByName(elements, "censored") || getPostElementByName(elements, "censorship");
+    if (censored) hw.censored = stringToBoolean(censored.text);
+
+    // Get the genres
+    const genre = getPostElementByName(elements, "genre")?.content.shift()?.text;
+    hw.genre = genre?.split(",").map(s => s.trim());
+
+    // Get the cover
+    const cover = getPostElementByName(elements, "overview")?.content.find(el => el.type === "Image") as ILink;
+    hw.cover = cover?.href;
+
+    // Fill the dates
+    const releaseDate = getPostElementByName(elements, "release date")?.text;
+    if (luxon.DateTime.fromISO(releaseDate).isValid) hw.lastRelease = new Date(releaseDate);
+
+    //#region Convert the author
+    const authorElement = getPostElementByName(elements, "developer") ||
+        getPostElementByName(elements, "developer/publisher") ||
+        getPostElementByName(elements, "artist");
+    const author: TAuthor = {
+        name: authorElement.text,
+        platforms: []
+    };
+
+    // Add the found platforms
+    authorElement?.content.forEach((el: ILink, idx) => {
+        const platform: TExternalPlatform = {
+            name: el.text,
+            link: el.href,
+        };
+
+        author.platforms.push(platform);
+    });
+    hw.authors = [author];
+    //#endregion Convert the author
+
+    //#region Get the changelog
+    hw.changelog = [];
+    const changelogElement = getPostElementByName(elements, "changelog") || getPostElementByName(elements, "change-log");
+    const changelogSpoiler = changelogElement?.content.find(el => {
+        return el.type === "Spoiler" && el.content.length > 0;
+    });
+
+    // Add to the changelog the single spoilers
+    changelogSpoiler.content.forEach(el => {
+        if (el.text.trim()) hw.changelog.push(el.text);
+    });
+
+    // Add at the ened also the text of the "changelog" element
+    hw.changelog.push(changelogSpoiler.text);
+    //#endregion Get the changelog
+}
+
+//#endregion Private methods
\ No newline at end of file