diff --git a/app/scripts/game-scraper.js b/app/scripts/game-scraper.js index 2116a93..dad8bd3 100644 --- a/app/scripts/game-scraper.js +++ b/app/scripts/game-scraper.js @@ -1,17 +1,17 @@ -'use strict'; +"use strict"; // Public modules from npm -const HTMLParser = require('node-html-parser'); -const puppeteer = require('puppeteer'); -const urlExist = require('url-exist'); +const HTMLParser = require("node-html-parser"); +const puppeteer = require("puppeteer"); +const urlExist = require("url-exist"); // Modules from file -const shared = require('./shared.js'); -const selectors = require('./costants/css-selectors.js'); -const { preparePage } = require('./puppeteer-helper.js'); -const GameDownload = require('./classes/game-download.js'); -const GameInfo = require('./classes/game-info.js'); -const { isStringAValidURL, isF95URL } = require('./urls-helper.js'); +const shared = require("./shared.js"); +const selectors = require("./costants/css-selectors.js"); +const { preparePage } = require("./puppeteer-helper.js"); +const GameDownload = require("./classes/game-download.js"); +const GameInfo = require("./classes/game-info.js"); +const { isStringAValidURL, isF95URL } = require("./urls-helper.js"); /** * @protected @@ -20,48 +20,52 @@ const { isStringAValidURL, isF95URL } = require('./urls-helper.js'); * @param {URL} url URL of the game/mod to extract data from * @return {Promise} Complete information about the game you are looking for */ -module.exports.getGameInfo = async function(browser, url) { - if (shared.debug) console.log('Obtaining game info'); +module.exports.getGameInfo = async function (browser, url) { + if (shared.debug) console.log("Obtaining game info"); - // Verify the correctness of the URL - if (!isF95URL(url)) throw url + ' is not a valid F95Zone URL'; - let exists = await urlExist(url.toString()); - if (!exists) return new GameInfo(); + // Verify the correctness of the URL + if (!isF95URL(url)) throw url + " is not a valid F95Zone URL"; + let exists = await urlExist(url.toString()); + if (!exists) return new GameInfo(); - let page = await preparePage(browser); // Set new isolated page - await page.setCookie(...shared.cookies); // Set cookies to avoid login - await page.goto(url.toString(), { - waitUntil: shared.WAIT_STATEMENT - }); // Go to the game page and wait until it loads + let page = await preparePage(browser); // Set new isolated page + await page.setCookie(...shared.cookies); // Set cookies to avoid login + await page.goto(url.toString(), { + waitUntil: shared.WAIT_STATEMENT, + }); // Go to the game page and wait until it loads - // It asynchronously searches for the elements and - // then waits at the end to compile the object to be returned - let info = new GameInfo(); - let title = getGameTitle(page); - let author = getGameAuthor(page); - let tags = getGameTags(page); - let previewSource = getGamePreviewSource(page); - let downloadData = getGameDownloadLink(page); - info = await parsePrefixes(page, info); // Fill status/engines/isMod - let structuredText = await getMainPostStructuredText(page); - let overview = getOverview(structuredText, info.isMod); - let parsedInfos = parseConversationPage(structuredText); + // It asynchronously searches for the elements and + // then waits at the end to compile the object to be returned + let info = new GameInfo(); + let title = getGameTitle(page); + let author = getGameAuthor(page); + let tags = getGameTags(page); + let previewSource = getGamePreviewSource(page); + let downloadData = getGameDownloadLink(page); + info = await parsePrefixes(page, info); // Fill status/engines/isMod + let structuredText = await getMainPostStructuredText(page); + let overview = getOverview(structuredText, info.isMod); + let parsedInfos = parseConversationPage(structuredText); - // Fill in the GameInfo element with the information obtained - info.name = await title; - info.author = await author; - info.overview = overview; - info.tags = await tags; - info.f95url = url; - info.version = info.isMod ? parsedInfos['MOD VERSION'] : parsedInfos['VERSION']; - info.lastUpdate = info.isMod ? parsedInfos['UPDATED'] : parsedInfos['THREAD UPDATED']; - info.previewSource = await previewSource; - info.downloadInfo = await downloadData; + // Fill in the GameInfo element with the information obtained + info.name = await title; + info.author = await author; + info.overview = overview; + info.tags = await tags; + info.f95url = url; + info.version = info.isMod + ? parsedInfos["MOD VERSION"] + : parsedInfos["VERSION"]; + info.lastUpdate = info.isMod + ? parsedInfos["UPDATED"] + : parsedInfos["THREAD UPDATED"]; + info.previewSource = await previewSource; + info.downloadInfo = await downloadData; - await page.close(); // Close the page - if (shared.debug) console.log('Founded data for ' + info.name); - return info; -} + await page.close(); // Close the page + if (shared.debug) console.log("Founded data for " + info.name); + return info; +}; //#region Private methods /** @@ -73,27 +77,30 @@ module.exports.getGameInfo = async function(browser, url) { * @returns {Promise} Game description */ function getOverview(text, isMod) { - // Get overview (different parsing for game and mod) - let overviewEndIndex; - if (isMod) overviewEndIndex = text.indexOf('Updated'); - else overviewEndIndex = text.indexOf('Thread Updated'); - return text.substring(0, overviewEndIndex).replace('Overview:\n', '').trim(); + // Get overview (different parsing for game and mod) + let overviewEndIndex; + if (isMod) overviewEndIndex = text.indexOf("Updated"); + else overviewEndIndex = text.indexOf("Thread Updated"); + return text.substring(0, overviewEndIndex).replace("Overview:\n", "").trim(); } /** * @private - * Extrapolate the page structure by removing the element tags + * Extrapolate the page structure by removing the element tags * and leaving only the text and its spacing. * @param {puppeteer.Page} page Page containing the text * @returns {Promise} Structured text */ async function getMainPostStructuredText(page) { -// Gets the first post, where are listed all the game's informations -let post = (await page.$$(selectors.THREAD_POSTS))[0]; + // Gets the first post, where are listed all the game's informations + let post = (await page.$$(selectors.THREAD_POSTS))[0]; -// The info are plain text so we need to parse the HTML code -let bodyHTML = await page.evaluate( /* istanbul ignore next */ (mainPost) => mainPost.innerHTML, post); -return HTMLParser.parse(bodyHTML).structuredText; + // The info are plain text so we need to parse the HTML code + let bodyHTML = await page.evaluate( + /* istanbul ignore next */ (mainPost) => mainPost.innerHTML, + post + ); + return HTMLParser.parse(bodyHTML).structuredText; } /** @@ -103,46 +110,47 @@ return HTMLParser.parse(bodyHTML).structuredText; * @returns {Promise} Game author */ async function getGameAuthor(page) { - // Get the game/mod name (without square brackets) - let titleHTML = await page.evaluate( /* istanbul ignore next */ (selector) => - document.querySelector(selector).innerHTML, - selectors.GAME_TITLE); - let structuredTitle = HTMLParser.parse(titleHTML); + // Get the game/mod name (without square brackets) + let titleHTML = await page.evaluate( + /* istanbul ignore next */ (selector) => + document.querySelector(selector).innerHTML, + selectors.GAME_TITLE + ); + let structuredTitle = HTMLParser.parse(titleHTML); - // The last element **shoud be** the title without prefixes (engines, status, other...) - let gameTitle = structuredTitle.childNodes.pop().rawText; + // The last element **shoud be** the title without prefixes (engines, status, other...) + let gameTitle = structuredTitle.childNodes.pop().rawText; - // The last square brackets contain the author - let startTitleIndex = gameTitle.lastIndexOf('[') + 1; - return gameTitle.substring(startTitleIndex, gameTitle.length - 1).trim(); + // The last square brackets contain the author + let startTitleIndex = gameTitle.lastIndexOf("[") + 1; + return gameTitle.substring(startTitleIndex, gameTitle.length - 1).trim(); } /** * @private - * Process the post text to get all the useful + * Process the post text to get all the useful * information in the format *DESCRIPTOR : VALUE*. * @param {String} text Structured text of the post * @returns {Object} Dictionary of information */ function parseConversationPage(text) { - let dataPairs = {}; + let dataPairs = {}; - // The information searched in the game post are one per line - let splittedText = text.split('\n'); - for (let line of splittedText) { + // The information searched in the game post are one per line + let splittedText = text.split("\n"); + for (let line of splittedText) { + if (!line.includes(":")) continue; - if (!line.includes(':')) continue; + // Create pair key/value + let splitted = line.split(":"); + let key = splitted[0].trim().toUpperCase(); // Uppercase to avoid mismatch + let value = splitted[1].trim(); - // Create pair key/value - let splitted = line.split(':'); - let key = splitted[0].trim().toUpperCase(); // Uppercase to avoid mismatch - let value = splitted[1].trim(); + // Add pair to the dict if valid + if (value != "") dataPairs[key] = value; + } - // Add pair to the dict if valid - if (value != '') dataPairs[key] = value; - } - - return dataPairs; + return dataPairs; } /** @@ -152,16 +160,19 @@ function parseConversationPage(text) { * @returns {Promise} URL of the image or null if failed to get it */ async function getGamePreviewSource(page) { - let src = await page.evaluate( /* istanbul ignore next */ (selector) => { - // Get the firs image available - let img = document.querySelector(selector); + let src = await page.evaluate( + /* istanbul ignore next */ (selector) => { + // Get the firs image available + let img = document.querySelector(selector); - if (img) return img.getAttribute('src'); - else return null; - }, selectors.GAME_IMAGES); + if (img) return img.getAttribute("src"); + else return null; + }, + selectors.GAME_IMAGES + ); - // Check if the URL is valid - return isStringAValidURL(src) ? new URL(src) : null; + // Check if the URL is valid + return isStringAValidURL(src) ? new URL(src) : null; } /** @@ -171,16 +182,18 @@ async function getGamePreviewSource(page) { * @returns {Promise} Game title */ async function getGameTitle(page) { - // Get the game/mod name (without square brackets) - let titleHTML = await page.evaluate( /* istanbul ignore next */ (selector) => - document.querySelector(selector).innerHTML, - selectors.GAME_TITLE); - let structuredTitle = HTMLParser.parse(titleHTML); + // Get the game/mod name (without square brackets) + let titleHTML = await page.evaluate( + /* istanbul ignore next */ (selector) => + document.querySelector(selector).innerHTML, + selectors.GAME_TITLE + ); + let structuredTitle = HTMLParser.parse(titleHTML); - // The last element **shoud be** the title without prefixes (engines, status, other...) - let gameTitle = structuredTitle.childNodes.pop().rawText; - let endTitleIndex = gameTitle.indexOf('['); - return gameTitle.substring(0, endTitleIndex).trim(); + // The last element **shoud be** the title without prefixes (engines, status, other...) + let gameTitle = structuredTitle.childNodes.pop().rawText; + let endTitleIndex = gameTitle.indexOf("["); + return gameTitle.substring(0, endTitleIndex).trim(); } /** @@ -190,43 +203,48 @@ async function getGameTitle(page) { * @returns {Promise} List of uppercase tags */ async function getGameTags(page) { - let tags = []; + let tags = []; - // Get the game tags - for (let handle of await page.$$(selectors.GAME_TAGS)) { - let tag = await page.evaluate( /* istanbul ignore next */ (element) => element.innerText, handle); - tags.push(tag.toUpperCase()); - } - return tags.sort(); + // Get the game tags + for (let handle of await page.$$(selectors.GAME_TAGS)) { + let tag = await page.evaluate( + /* istanbul ignore next */ (element) => element.innerText, + handle + ); + tags.push(tag.toUpperCase()); + } + return tags.sort(); } /** * @private - * Process the game title prefixes to extract information such as game status, + * Process the game title prefixes to extract information such as game status, * graphics engine used, and whether it is a mod or original game. * @param {puppeteer.Page} page Page containing the prefixes to be extrapolated * @param {GameInfo} info Object to assign the identified information to * @returns {Promise} GameInfo object passed in to which the identified information has been added */ async function parsePrefixes(page, info) { - const MOD_PREFIX = 'MOD'; - - // The 'Ongoing' status is not specified, only 'Abandoned'/'OnHold'/'Complete' - info.status = 'Ongoing'; - for (let handle of await page.$$(selectors.GAME_TITLE_PREFIXES)) { - let value = await page.evaluate( /* istanbul ignore next */ (element) => element.innerText, handle); + const MOD_PREFIX = "MOD"; - // Clean the prefix - let prefix = value.toUpperCase().replace('[', '').replace(']', '').trim(); + // The 'Ongoing' status is not specified, only 'Abandoned'/'OnHold'/'Complete' + info.status = "Ongoing"; + for (let handle of await page.$$(selectors.GAME_TITLE_PREFIXES)) { + let value = await page.evaluate( + /* istanbul ignore next */ (element) => element.innerText, + handle + ); - // Getting infos... - if (shared.statuses.includes(prefix)) info.status = prefix; - else if (shared.engines.includes(prefix)) info.engine = prefix; + // Clean the prefix + let prefix = value.toUpperCase().replace("[", "").replace("]", "").trim(); - // This is not a game but a mod - else if (prefix === MOD_PREFIX) info.isMod = true; - } - return info; + // Getting infos... + if (shared.statuses.includes(prefix)) info.status = prefix; + else if (shared.engines.includes(prefix)) info.engine = prefix; + // This is not a game but a mod + else if (prefix === MOD_PREFIX) info.isMod = true; + } + return info; } /** @@ -236,44 +254,62 @@ async function parsePrefixes(page, info) { * @returns {Promise} List of objects used for game download */ async function getGameDownloadLink(page) { - // Most used hosting platforms - let hostingPlatforms = ['MEGA', 'NOPY', 'FILESUPLOAD', 'MIXDROP', 'UPLOADHAVEN', 'PIXELDRAIN', 'FILESFM']; - - // Supported OS platforms - let platformOS = ['WIN', 'LINUX', 'MAC', 'ALL'] + // Most used hosting platforms + let hostingPlatforms = [ + "MEGA", + "NOPY", + "FILESUPLOAD", + "MIXDROP", + "UPLOADHAVEN", + "PIXELDRAIN", + "FILESFM", + ]; - // Gets the which contains the download links - let temp = await page.$$(selectors.DOWNLOAD_LINKS_CONTAINER); - if(temp.length === 0) return []; + // Supported OS platforms + let platformOS = ["WIN", "LINUX", "MAC", "ALL"]; - // Look for the container that contains the links - // It is necessary because the same css selector - // also identifies other elements on the page - let container = null; - for(let candidate of temp) { - if (container !== null) break; - let upperText = (await page.evaluate( /* istanbul ignore next */ (e) => e.innerText, candidate)).toUpperCase(); + // Gets the which contains the download links + let temp = await page.$$(selectors.DOWNLOAD_LINKS_CONTAINER); + if (temp.length === 0) return []; - // Search if the container contains the name of a hosting platform - for (let p of hostingPlatforms) { - if(upperText.includes(p)) { - container = candidate; - break; - } - } + // Look for the container that contains the links + // It is necessary because the same css selector + // also identifies other elements on the page + let container = null; + for (let candidate of temp) { + if (container !== null) break; + let upperText = ( + await page.evaluate( + /* istanbul ignore next */ (e) => e.innerText, + candidate + ) + ).toUpperCase(); + + // Search if the container contains the name of a hosting platform + for (let p of hostingPlatforms) { + if (upperText.includes(p)) { + container = candidate; + break; + } } - if(container === null) return []; + } + if (container === null) return []; - // Extract the HTML text from the container - let searchText = (await page.evaluate( /* istanbul ignore next */ (e) => e.innerHTML, container)).toLowerCase(); + // Extract the HTML text from the container + let searchText = ( + await page.evaluate( + /* istanbul ignore next */ (e) => e.innerHTML, + container + ) + ).toLowerCase(); - // Parse the download links - let downloadData = []; - for(let platform of platformOS) { - let data = extractGameHostingData(platform, searchText); - downloadData.push(...data); - } - return downloadData; + // Parse the download links + let downloadData = []; + for (let platform of platformOS) { + let data = extractGameHostingData(platform, searchText); + downloadData.push(...data); + } + return downloadData; } /** @@ -285,56 +321,55 @@ async function getGameDownloadLink(page) { * @returns {GameDownload[]} List of game download links for the selected platform */ function extractGameHostingData(platform, text) { - const PLATFORM_BOLD_OPEN = ''; - const CONTAINER_SPAN_CLOSE = ''; - const LINK_OPEN = 'platform - let endIndex = text.indexOf( - PLATFORM_BOLD_OPEN, - startIndex) + PLATFORM_BOLD_OPEN.length; + // Find the platform + let endIndex = + text.indexOf(PLATFORM_BOLD_OPEN, startIndex) + PLATFORM_BOLD_OPEN.length; - // Find the end of the container - if (endIndex === -1) text.indexOf( - CONTAINER_SPAN_CLOSE, - startIndex) + CONTAINER_SPAN_CLOSE.length; + // Find the end of the container + if (endIndex === -1) + text.indexOf(CONTAINER_SPAN_CLOSE, startIndex) + + CONTAINER_SPAN_CLOSE.length; - text = text.substring(startIndex, endIndex); - - let downloadData = []; - let linkTags = text.split(LINK_OPEN); - for(let tag of linkTags) { - // Ignore non-link string - if (!tag.includes(HREF_START)) continue; + text = text.substring(startIndex, endIndex); - // Find the hosting platform name - startIndex = tag.indexOf(TAG_CLOSE) + TAG_CLOSE.length; - endIndex = tag.indexOf(LINK_CLOSE, startIndex); - let hosting = tag.substring(startIndex, endIndex); + let downloadData = []; + let linkTags = text.split(LINK_OPEN); + for (let tag of linkTags) { + // Ignore non-link string + if (!tag.includes(HREF_START)) continue; - // Find the 'href' attribute - startIndex = tag.indexOf(HREF_START) + HREF_START.length; - endIndex = tag.indexOf(HREF_END, startIndex); - let link = tag.substring(startIndex, endIndex); + // Find the hosting platform name + startIndex = tag.indexOf(TAG_CLOSE) + TAG_CLOSE.length; + endIndex = tag.indexOf(LINK_CLOSE, startIndex); + let hosting = tag.substring(startIndex, endIndex); - if (isStringAValidURL(link)) { - let gd = new GameDownload(); - gd.hosting = hosting.toUpperCase(); - gd.link = new URL(link); - gd.supportedOS = platform.toUpperCase(); + // Find the 'href' attribute + startIndex = tag.indexOf(HREF_START) + HREF_START.length; + endIndex = tag.indexOf(HREF_END, startIndex); + let link = tag.substring(startIndex, endIndex); - downloadData.push(gd); - } + if (isStringAValidURL(link)) { + let gd = new GameDownload(); + gd.hosting = hosting.toUpperCase(); + gd.link = new URL(link); + gd.supportedOS = platform.toUpperCase(); + + downloadData.push(gd); } - return downloadData; + } + return downloadData; } -//#endregion Private methods \ No newline at end of file +//#endregion Private methods diff --git a/test/test.js b/test/test.js index e43c5d6..8ff269b 100644 --- a/test/test.js +++ b/test/test.js @@ -1,18 +1,25 @@ -const { debug, login, getGameData, loadF95BaseData, getUserData, logout } = require("../app/index"); +const { + debug, + login, + getGameData, + loadF95BaseData, + getUserData, + logout, +} = require("../app/index"); //debug(true); main(); async function main() { - let loginResult = await login("MillenniumEarl", "f9vTcRNuvxj4YpK"); + let loginResult = await login("MillenniumEarl", "f9vTcRNuvxj4YpK"); - if (loginResult.success) { - await loadF95BaseData(); - let gameData = await getGameData("kingdom of deception", false); - console.log(gameData.pop()); + if (loginResult.success) { + await loadF95BaseData(); + let gameData = await getGameData("kingdom of deception", false); + console.log(gameData.pop()); - // let userData = await getUserData(); - // console.log(userData); - } - logout(); -} \ No newline at end of file + // let userData = await getUserData(); + // console.log(userData); + } + logout(); +}