F95API/app/scripts/game-scraper.js

439 lines
14 KiB
JavaScript
Raw Normal View History

2020-10-16 07:58:08 +00:00
'use strict';
// Public modules from npm
2020-10-16 07:58:08 +00:00
const HTMLParser = require('node-html-parser');
const puppeteer = require('puppeteer'); // skipcq: JS-0128
// Modules from file
2020-10-16 07:58:08 +00:00
const shared = require('./shared.js');
const selectors = require('./constants/css-selectors.js');
const { preparePage } = require('./puppeteer-helper.js');
const GameDownload = require('./classes/game-download.js');
const GameInfo = require('./classes/game-info.js');
const { isStringAValidURL, isF95URL, urlExists } = require('./urls-helper.js');
/**
 * @protected
 * Get information from the game's main page.
 *
 * The page opened for the scrape is always closed before this function
 * settles, whether the scrape succeeds or throws.
 * @param {puppeteer.Browser} browser Browser object used for navigation
 * @param {String} url URL (String) of the game/mod to extract data from
 * @return {Promise<GameInfo>} Complete information about the game you are
 * looking for or null if the URL doesn't exists
 * @throws {Error} If `url` is rejected by `isF95URL`
 */
module.exports.getGameInfo = async function (browser, url) {
  if (shared.debug) console.log('Obtaining game info');

  // Verify the correctness of the URL
  if (!isF95URL(url)) throw new Error(url + ' is not a valid F95Zone URL');
  const exists = await urlExists(url);
  if (!exists) return null;

  const page = await preparePage(browser); // Set new isolated page
  try {
    await page.setCookie(...shared.cookies); // Set cookies to avoid login
    await page.goto(url, {
      waitUntil: shared.WAIT_STATEMENT,
    }); // Go to the game page and wait until it loads

    const info = new GameInfo();

    // Run every independent lookup together. A single Promise.all means a
    // failure in any of them rejects here (and is handled) instead of being
    // left as an unhandled rejection on a promise nobody awaited yet.
    const [title, author, tags, previewSource, changelog, structuredText] =
      await Promise.all([
        getGameTitle(page),
        getGameAuthor(page),
        getGameTags(page),
        getGamePreviewSource(page),
        getLastChangelog(page),
        getMainPostStructuredText(page),
        parsePrefixes(page, info), // Fills status/engine/isMod on `info` in place
      ]);

    const parsedInfos = parseConversationPage(structuredText);

    // Fill in the GameInfo element with the information obtained
    info.name = title;
    info.author = author;
    info.overview = getOverview(structuredText, info.isMod);
    info.tags = tags;
    info.f95url = url;
    info.version = info.isMod ? parsedInfos.MOD_VERSION : parsedInfos.VERSION;
    info.lastUpdate = info.isMod
      ? parsedInfos.UPDATED
      : parsedInfos.THREAD_UPDATED;
    info.previewSource = previewSource;
    info.changelog = changelog || 'Unknown changelog';

    /* Downloading games without going directly to
     * the platform appears to be prohibited by
     * the guidelines. It is therefore useless to
     * keep the links for downloading the games. */

    if (shared.debug) console.log('Founded data for ' + info.name);
    return info;
  } finally {
    // Release the page even when one of the scraping steps throws
    await page.close();
  }
};
/**
 * Obtain the game version without parsing again all the data of the game.
 *
 * Opens the thread at `info.f95url`, reads the title element and returns the
 * content of the first pair of square brackets found in its last child node,
 * uppercased and without a leading `V`. The page opened for the lookup is
 * closed before returning, also when navigation or evaluation fails.
 * @param {puppeteer.Browser} browser Browser object used for navigation
 * @param {GameInfo} info Information about the game (only `f95url` is read)
 * @returns {Promise<String>} Online version of the game
 */
module.exports.getGameVersionFromTitle = async function (browser, info) {
  const page = await preparePage(browser); // Set new isolated page

  let titleHTML;
  try {
    await page.setCookie(...shared.cookies); // Set cookies to avoid login
    await page.goto(info.f95url, {
      waitUntil: shared.WAIT_STATEMENT,
    }); // Go to the game page and wait until it loads

    // Get the title
    titleHTML = await page.evaluate(
      /* istanbul ignore next */
      (selector) => document.querySelector(selector).innerHTML,
      selectors.GAME_TITLE
    );
  } finally {
    // The page is only needed to read the title: always release it
    await page.close();
  }

  const title = HTMLParser.parse(titleHTML).childNodes.pop().rawText;

  // The title is in the following format: [PREFIXES] NAME GAME [VERSION] [AUTHOR]
  const startIndex = title.indexOf('[') + 1;
  const endIndex = title.indexOf(']', startIndex);
  let version = title.substring(startIndex, endIndex).trim().toUpperCase();
  if (version.startsWith('V')) version = version.replace('V', ''); // Replace only the first occurrence
  return version;
};
//#region Private methods
/**
 * @private
 * Get the game description from its web page.
 * Different processing depending on whether the game is a mod or not:
 * the overview is everything that precedes the `Updated` marker for mods
 * and the `Thread Updated` marker for games. When the marker is missing
 * the whole text is treated as the overview.
 * @param {String} text Structured text extracted from the game's web page
 * @param {Boolean} isMod Specify if it is a game or a mod
 * @returns {String} Game description
 */
function getOverview(text, isMod) {
  // Get overview (different parsing for game and mod)
  const marker = isMod ? 'Updated' : 'Thread Updated';
  const markerIndex = text.indexOf(marker);

  // substring(0, -1) would yield '' and silently drop the description,
  // so keep the full text when the marker cannot be found
  const rawOverview =
    markerIndex === -1 ? text : text.substring(0, markerIndex);

  return rawOverview.replace('Overview:\n', '').trim();
}
/**
 * @private
 * Reads the inner HTML of the first element matched by the thread-posts
 * selector and converts it to structured text, i.e. the markup is dropped
 * while the text and its spacing are kept.
 * @param {puppeteer.Page} page Page containing the text
 * @returns {Promise<String>} Structured text
 */
async function getMainPostStructuredText(page) {
  // The first post is the one listing all the game's information
  const [firstPost] = await page.$$(selectors.THREAD_POSTS);

  // The information is plain text, so fetch the markup and parse it locally
  const markup = await page.evaluate(
    /* istanbul ignore next */
    (postElement) => postElement.innerHTML,
    firstPost
  );

  const root = HTMLParser.parse(markup);
  return root.structuredText;
}
/**
 * @private
 * Extracts the author from the title element of the given page: the text
 * that follows the last `[` of the title's last child node, without the
 * final character (expected to be the closing `]`).
 * @param {puppeteer.Page} page Page containing the author to be extrapolated
 * @returns {Promise<String>} Game author
 */
async function getGameAuthor(page) {
  // Fetch the markup of the title element
  const markup = await page.evaluate(
    /* istanbul ignore next */
    (titleSelector) => document.querySelector(titleSelector).innerHTML,
    selectors.GAME_TITLE
  );

  // The last node **should be** the title without prefixes (engines, status, other...)
  const titleNodes = HTMLParser.parse(markup).childNodes;
  const plainTitle = titleNodes.pop().rawText;

  // The last square brackets contain the author
  const authorStart = plainTitle.lastIndexOf('[') + 1;
  const authorEnd = plainTitle.length - 1;
  const author = plainTitle.substring(authorStart, authorEnd);
  return author.trim();
}
/**
 * @private
 * Process the post text to get all the useful
 * information in the format *DESCRIPTOR : VALUE*.
 *
 * Each line is split on its FIRST colon only, so values that themselves
 * contain colons (URLs, times, ...) are kept whole. Keys are trimmed,
 * uppercased and have their spaces replaced with underscores; pairs with an
 * empty value are skipped. If a key appears more than once the last
 * occurrence wins.
 * @param {String} text Structured text of the post
 * @returns {Object} Dictionary of information
 */
function parseConversationPage(text) {
  const dataPairs = {};

  // The information searched in the game post are one per line
  for (const line of text.split('\n')) {
    const separatorIndex = line.indexOf(':');
    if (separatorIndex === -1) continue;

    // Create pair key/value
    const key = line
      .substring(0, separatorIndex)
      .trim()
      .toUpperCase() // Uppercase to avoid mismatch
      .replaceAll(' ', '_');
    const value = line.substring(separatorIndex + 1).trim();

    // Add pair to the dict if valid
    if (value !== '') dataPairs[key] = value;
  }
  return dataPairs;
}
/**
 * @private
 * Reads the `src` attribute of the first element matched by the game-images
 * selector, i.e. the image used as a preview for the game in the conversation.
 * @param {puppeteer.Page} page Page containing the URL to be extrapolated
 * @returns {Promise<String>} URL (String) of the image, or null when no image
 * is found or its `src` is rejected by `isStringAValidURL`
 */
async function getGamePreviewSource(page) {
  const previewURL = await page.evaluate(
    /* istanbul ignore next */
    (imageSelector) => {
      // Only the first available image is considered
      const firstImage = document.querySelector(imageSelector);
      return firstImage ? firstImage.getAttribute('src') : null;
    },
    selectors.GAME_IMAGES
  );

  // Discard anything that is not a valid URL
  if (!isStringAValidURL(previewURL)) return null;
  return previewURL;
}
/**
 * @private
 * Extrapolates and cleans the title from the page passed by parameter.
 *
 * The name is the part of the title's last child node that precedes the
 * first `[`. When the title contains no square brackets at all, the whole
 * text of that node is returned instead of an empty string.
 * @param {puppeteer.Page} page Page containing the title to be extrapolated
 * @returns {Promise<String>} Game title
 */
async function getGameTitle(page) {
  // Get the game/mod name (without square brackets)
  const titleHTML = await page.evaluate(
    /* istanbul ignore next */
    (selector) => document.querySelector(selector).innerHTML,
    selectors.GAME_TITLE
  );
  const structuredTitle = HTMLParser.parse(titleHTML);

  // The last element **should be** the title without prefixes (engines, status, other...)
  const gameTitle = structuredTitle.childNodes.pop().rawText;

  // indexOf returns -1 when there is no bracket; substring(0, -1) would then
  // return '' and the name would be lost, so fall back to the full text
  const endTitleIndex = gameTitle.indexOf('[');
  const name =
    endTitleIndex === -1 ? gameTitle : gameTitle.substring(0, endTitleIndex);
  return name.trim();
}
/**
 * @private
 * Collects the inner text of every element matched by the game-tags
 * selector, uppercases it and returns the resulting list sorted.
 * @param {puppeteer.Page} page Page containing the tags to be extrapolated
 * @returns {Promise<String[]>} List of uppercase tags
 */
async function getGameTags(page) {
  const tagHandles = await page.$$(selectors.GAME_TAGS);
  const upperTags = [];

  // Read the tags one at a time, in document order
  for (let i = 0; i < tagHandles.length; i += 1) {
    const text = await page.evaluate(
      /* istanbul ignore next */
      (tagElement) => tagElement.innerText,
      tagHandles[i]
    );
    upperTags.push(text.toUpperCase());
  }

  upperTags.sort();
  return upperTags;
}
/**
 * @private
 * Reads the title prefixes of the page and stores on `info` what they reveal:
 * the game status, the engine used and whether the thread is about a mod.
 * `info.status` is always reset to `'Ongoing'` first; `info.engine` and
 * `info.isMod` are only written when a matching prefix is found.
 * @param {puppeteer.Page} page Page containing the prefixes to be extrapolated
 * @param {GameInfo} info Object to assign the identified information to
 * @returns {Promise<GameInfo>} The same `info` object, updated in place
 */
async function parsePrefixes(page, info) {
  const MOD_PREFIX = 'MOD';

  // 'Ongoing' is never written as a prefix (only 'Abandoned'/'OnHold'/'Complete'),
  // so it is the default until another status shows up
  info.status = 'Ongoing';

  const prefixHandles = await page.$$(selectors.GAME_TITLE_PREFIXES);
  for (const prefixHandle of prefixHandles) {
    const rawPrefix = await page.evaluate(
      /* istanbul ignore next */
      (prefixElement) => prefixElement.innerText,
      prefixHandle
    );

    // Normalise: uppercase, drop the first '[' and ']', strip whitespace
    const withoutOpen = rawPrefix.toUpperCase().replace('[', '');
    const prefix = withoutOpen.replace(']', '').trim();

    if (shared.statuses.includes(prefix)) {
      info.status = prefix;
      continue;
    }
    if (shared.engines.includes(prefix)) {
      info.engine = prefix;
      continue;
    }
    // This is not a game but a mod
    if (prefix === MOD_PREFIX) info.isMod = true;
  }

  return info;
}
2020-10-14 14:04:50 +00:00
/**
 * @private
 * Get the last changelog available for the game: the inner text of the
 * changelog element found inside the first post, with the first occurrence
 * of the word `Spoiler` removed.
 * @param {puppeteer.Page} page Page containing the changelog
 * @returns {Promise<String>} Changelog for the last version or null if no changelog is found
 */
async function getLastChangelog(page) {
  // The first post is the one listing all the game's information
  const [firstPost] = await page.$$(selectors.THREAD_POSTS);

  const changelogSpoiler = await firstPost.$(selectors.THREAD_LAST_CHANGELOG);
  if (!changelogSpoiler) return null;

  const spoilerText = await page.evaluate(
    /* istanbul ignore next */
    (spoilerElement) => spoilerElement.innerText,
    changelogSpoiler
  );

  const { structuredText } = HTMLParser.parse(spoilerText);
  return structuredText.replace('Spoiler', '').trim();
}
/**
 * @private
 * Get game download links for different platforms.
 *
 * Among the elements matched by the download-container selector, the first
 * one whose text mentions a known hosting platform is taken as the links
 * container; its markup is then scanned once per supported OS.
 * @param {puppeteer.Page} page Page containing the links to be extrapolated
 * @returns {Promise<GameDownload[]>} List of objects used for game download
 * (empty when no suitable container is found)
 */
// skipcq: JS-0128
async function getGameDownloadLink(page) {
  // Most used hosting platforms
  const hostingPlatforms = [
    'MEGA',
    'NOPY',
    'FILESUPLOAD',
    'MIXDROP',
    'UPLOADHAVEN',
    'PIXELDRAIN',
    'FILESFM',
  ];
  // Supported OS platforms
  const platformOS = ['WIN', 'LINUX', 'MAC', 'ALL'];

  // Gets the <span> elements which may contain the download links
  const candidates = await page.$$(selectors.DOWNLOAD_LINKS_CONTAINER);
  if (candidates.length === 0) return [];

  // The same css selector also identifies other elements on the page,
  // so keep the first candidate that names a hosting platform
  let linksContainer = null;
  for (const candidate of candidates) {
    const candidateText = await page.evaluate(
      /* istanbul ignore next */
      (e) => e.innerText,
      candidate
    );
    const upperText = candidateText.toUpperCase();
    if (hostingPlatforms.some((hosting) => upperText.includes(hosting))) {
      linksContainer = candidate;
      break;
    }
  }
  if (linksContainer === null) return [];

  // Extract the HTML text from the container
  // NOTE(review): lowercasing the markup also lowercases the hrefs that are
  // later extracted from it; confirm this is acceptable for case-sensitive links
  const containerHTML = await page.evaluate(
    /* istanbul ignore next */
    (e) => e.innerHTML,
    linksContainer
  );
  const searchText = containerHTML.toLowerCase();

  // Parse the download links, one OS after the other
  return platformOS.flatMap((platform) =>
    extractGameHostingData(platform, searchText)
  );
}
/**
 * @private
 * From the HTML text it extracts the game download links for the specified operating system.
 *
 * The section scanned starts right after the first occurrence of the
 * (lowercased) platform name and ends at the next `<b>` tag; when there is no
 * further `<b>` it ends at the closing `</span>` of the container, or at the
 * end of the text if that is missing too. Inside the section every `<a>` tag
 * with a quoted `href` (single or double quotes) that passes
 * `isStringAValidURL` produces one entry.
 * @param {String} platform Name of the operating system to look for a compatible link to.
 * It can only be *WIN/LINUX/MAC/ALL*
 * @param {String} text HTML string to extract links from (searched with
 * lowercase markers, so it is expected to be already lowercased)
 * @returns {GameDownload[]} List of game download links for the selected platform
 */
function extractGameHostingData(platform, text) {
  const PLATFORM_BOLD_OPEN = '<b>';
  const CONTAINER_SPAN_CLOSE = '</span>';
  const LINK_OPEN = '<a';
  const LINK_CLOSE = '</a>';
  const TAG_CLOSE = '>';
  // href value wrapped in either quote style; group 2 is the URL
  const HREF_PATTERN = /href=(['"])(.*?)\1/;

  // Identify the individual platforms
  const platformIndex = text.indexOf(platform.toLowerCase());
  if (platformIndex === -1) return [];
  const sectionStart = platformIndex + platform.length;

  // Find the next <b>platform</b>. The "not found" check has to happen
  // BEFORE adding the marker length, otherwise -1 is never observed.
  let sectionEnd;
  const nextBoldIndex = text.indexOf(PLATFORM_BOLD_OPEN, sectionStart);
  if (nextBoldIndex !== -1) {
    sectionEnd = nextBoldIndex + PLATFORM_BOLD_OPEN.length;
  } else {
    // Last platform of the list: stop at the end of the container
    const spanCloseIndex = text.indexOf(CONTAINER_SPAN_CLOSE, sectionStart);
    sectionEnd =
      spanCloseIndex === -1
        ? text.length
        : spanCloseIndex + CONTAINER_SPAN_CLOSE.length;
  }
  const section = text.substring(sectionStart, sectionEnd);

  const downloadData = [];
  for (const tag of section.split(LINK_OPEN)) {
    // Ignore non-link string
    const hrefMatch = HREF_PATTERN.exec(tag);
    if (hrefMatch === null) continue;
    const link = hrefMatch[2];

    // Find the hosting platform name (text between the tag and its closing)
    const nameStart = tag.indexOf(TAG_CLOSE) + TAG_CLOSE.length;
    const closeIndex = tag.indexOf(LINK_CLOSE, nameStart);
    const nameEnd = closeIndex === -1 ? tag.length : closeIndex;
    const hosting = tag.substring(nameStart, nameEnd);

    if (isStringAValidURL(link)) {
      const gd = new GameDownload();
      gd.hosting = hosting.toUpperCase();
      gd.link = link;
      gd.supportedOS = platform.toUpperCase();
      downloadData.push(gd);
    }
  }
  return downloadData;
}
//#endregion Private methods