pull/8/head
MillenniumEarl 2020-10-07 08:28:34 +02:00
commit 8a2fd49308
3 changed files with 255 additions and 210 deletions

View File

@ -1,4 +1,7 @@
# F95API
Unofficial Node JS module for scraping F95Zone platform
[![FOSSA Status](https://app.fossa.com/api/projects/git%2Bgithub.com%2FMillenniumEarl%2FF95API.svg?type=shield)](https://app.fossa.com/projects/git%2Bgithub.com%2FMillenniumEarl%2FF95API?ref=badge_shield)
[![DeepSource](https://deepsource.io/gh/MillenniumEarl/F95API.svg/?label=active+issues&show_trend=true)](https://deepsource.io/gh/MillenniumEarl/F95API/?ref=repository-badge)
[![codecov](https://codecov.io/gh/MillenniumEarl/F95API/branch/master/graph/badge.svg?token=KHN1TNIH7D)](https://codecov.io/gh/MillenniumEarl/F95API)

View File

@ -1,17 +1,17 @@
'use strict'; "use strict";
// Public modules from npm // Public modules from npm
const HTMLParser = require('node-html-parser'); const HTMLParser = require("node-html-parser");
const puppeteer = require('puppeteer'); const puppeteer = require("puppeteer");
const urlExist = require('url-exist'); const urlExist = require("url-exist");
// Modules from file // Modules from file
const shared = require('./shared.js'); const shared = require("./shared.js");
const selectors = require('./costants/css-selectors.js'); const selectors = require("./costants/css-selectors.js");
const { preparePage } = require('./puppeteer-helper.js'); const { preparePage } = require("./puppeteer-helper.js");
const GameDownload = require('./classes/game-download.js'); const GameDownload = require("./classes/game-download.js");
const GameInfo = require('./classes/game-info.js'); const GameInfo = require("./classes/game-info.js");
const { isStringAValidURL, isF95URL } = require('./urls-helper.js'); const { isStringAValidURL, isF95URL } = require("./urls-helper.js");
/** /**
* @protected * @protected
@ -20,48 +20,52 @@ const { isStringAValidURL, isF95URL } = require('./urls-helper.js');
* @param {URL} url URL of the game/mod to extract data from * @param {URL} url URL of the game/mod to extract data from
* @return {Promise<GameInfo>} Complete information about the game you are looking for * @return {Promise<GameInfo>} Complete information about the game you are looking for
*/ */
module.exports.getGameInfo = async function (browser, url) {
  // Scrapes a single F95Zone thread and returns a filled GameInfo.
  // An empty GameInfo is returned when the URL does not respond.
  if (shared.debug) console.log("Obtaining game info");

  // Verify the correctness of the URL
  // NOTE(review): a plain string is thrown here (not an Error); kept unchanged
  // because existing callers may compare against the thrown value.
  if (!isF95URL(url)) throw url + " is not a valid F95Zone URL";
  const exists = await urlExist(url.toString());
  if (!exists) return new GameInfo();

  const page = await preparePage(browser); // Set new isolated page
  let info;
  try {
    await page.setCookie(...shared.cookies); // Set cookies to avoid login
    await page.goto(url.toString(), {
      waitUntil: shared.WAIT_STATEMENT,
    }); // Go to the game page and wait until it loads

    // Run every scraper concurrently. A single Promise.all guarantees that
    // each promise has a rejection handler, so a failure in one scraper
    // cannot surface as an unhandled rejection of another.
    const [
      name,
      author,
      tags,
      previewSource,
      downloadInfo,
      filledInfo, // GameInfo with status/engine/isMod filled by parsePrefixes
      structuredText,
    ] = await Promise.all([
      getGameTitle(page),
      getGameAuthor(page),
      getGameTags(page),
      getGamePreviewSource(page),
      getGameDownloadLink(page),
      parsePrefixes(page, new GameInfo()),
      getMainPostStructuredText(page),
    ]);
    info = filledInfo;

    // The parsing of the post depends on whether the thread is a mod
    const overview = getOverview(structuredText, info.isMod);
    const parsedInfos = parseConversationPage(structuredText);

    // Fill in the GameInfo element with the information obtained
    info.name = name;
    info.author = author;
    info.overview = overview;
    info.tags = tags;
    info.f95url = url;
    info.version = info.isMod
      ? parsedInfos["MOD VERSION"]
      : parsedInfos["VERSION"];
    info.lastUpdate = info.isMod
      ? parsedInfos["UPDATED"]
      : parsedInfos["THREAD UPDATED"];
    info.previewSource = previewSource;
    info.downloadInfo = downloadInfo;
  } finally {
    // Always release the page, even when scraping fails
    await page.close();
  }

  if (shared.debug) console.log("Founded data for " + info.name);
  return info;
};
//#region Private methods //#region Private methods
/** /**
@ -73,27 +77,30 @@ module.exports.getGameInfo = async function(browser, url) {
* @returns {String} Game description
*/ */
function getOverview(text, isMod) {
  // Returns the description part of the main post: everything that precedes
  // the "Updated" line (mods) or the "Thread Updated" line (games), without
  // the leading "Overview:" label.
  const endMarker = isMod ? "Updated" : "Thread Updated";
  const markerIndex = text.indexOf(endMarker);

  // Without the marker keep the whole text: substring(0, -1) would silently
  // produce an empty overview.
  const overviewEndIndex = markerIndex === -1 ? text.length : markerIndex;
  return text.substring(0, overviewEndIndex).replace("Overview:\n", "").trim();
}
/**
 * @private
 * Extrapolate the page structure by removing the element tags
 * and leaving only the text and its spacing.
 * @param {puppeteer.Page} page Page containing the text
 * @returns {Promise<String>} Structured text of the first post of the thread
 * @throws {Error} If the page contains no post matching the thread-post selector
 */
async function getMainPostStructuredText(page) {
  // Gets the first post, where all the game's information is listed
  const posts = await page.$$(selectors.THREAD_POSTS);
  const post = posts[0];
  if (!post) throw new Error("Main post not found in the thread page");

  // The info is plain text so we need to parse the HTML code
  const bodyHTML = await page.evaluate(
    /* istanbul ignore next */ (mainPost) => mainPost.innerHTML,
    post
  );
  return HTMLParser.parse(bodyHTML).structuredText;
}
/** /**
@ -103,46 +110,47 @@ return HTMLParser.parse(bodyHTML).structuredText;
* @returns {Promise<String>} Game author * @returns {Promise<String>} Game author
*/ */
async function getGameAuthor(page) {
  // Reads the thread title and returns the content of its last pair of
  // square brackets, which holds the author. Returns an empty string when
  // the title contains no square bracket.
  const titleHTML = await page.evaluate(
    /* istanbul ignore next */ (selector) =>
      document.querySelector(selector).innerHTML,
    selectors.GAME_TITLE
  );
  const structuredTitle = HTMLParser.parse(titleHTML);

  // The last element **should be** the title without prefixes (engines, status, other...)
  const nodes = structuredTitle.childNodes;
  const gameTitle = nodes[nodes.length - 1].rawText;

  // The last square brackets contain the author
  const openIndex = gameTitle.lastIndexOf("[");
  if (openIndex === -1) return "";

  // Stop at the matching "]" instead of assuming it is the very last
  // character (the raw title may carry trailing whitespace)
  const closeIndex = gameTitle.indexOf("]", openIndex);
  const endIndex = closeIndex === -1 ? gameTitle.length : closeIndex;
  return gameTitle.substring(openIndex + 1, endIndex).trim();
}
/**
 * @private
 * Process the post text to get all the useful
 * information in the format *DESCRIPTOR : VALUE*.
 * Only the first colon of a line separates the descriptor from the value,
 * so values that contain colons (URLs, timestamps) are kept whole.
 * Lines without a colon or with an empty value are ignored.
 * @param {String} text Structured text of the post
 * @returns {Object} Dictionary of information, keyed by the uppercase descriptor
 */
function parseConversationPage(text) {
  const dataPairs = {};

  // The information searched in the game post are one per line
  for (const line of text.split("\n")) {
    const separatorIndex = line.indexOf(":");
    if (separatorIndex === -1) continue;

    // Create pair key/value
    const key = line.substring(0, separatorIndex).trim().toUpperCase(); // Uppercase to avoid mismatch
    const value = line.substring(separatorIndex + 1).trim();

    // Add pair to the dict if valid
    if (value !== "") dataPairs[key] = value;
  }

  return dataPairs;
}
/** /**
@ -152,16 +160,19 @@ function parseConversationPage(text) {
* @returns {Promise<URL>} URL of the image or null if failed to get it * @returns {Promise<URL>} URL of the image or null if failed to get it
*/ */
async function getGamePreviewSource(page) {
  // Returns the URL of the first image matched by the game-images selector,
  // or null when no image is found or its "src" is not a valid URL.
  const src = await page.evaluate(
    /* istanbul ignore next */ (selector) => {
      // Get the first image available
      const img = document.querySelector(selector);

      if (img) return img.getAttribute("src");
      else return null;
    },
    selectors.GAME_IMAGES
  );

  // Check if the URL is valid
  return isStringAValidURL(src) ? new URL(src) : null;
}
/** /**
@ -171,16 +182,18 @@ async function getGamePreviewSource(page) {
* @returns {Promise<String>} Game title * @returns {Promise<String>} Game title
*/ */
async function getGameTitle(page) {
  // Reads the thread title and returns the game/mod name, i.e. the text that
  // precedes the first square bracket (version/author are in brackets).
  const titleHTML = await page.evaluate(
    /* istanbul ignore next */ (selector) =>
      document.querySelector(selector).innerHTML,
    selectors.GAME_TITLE
  );
  const structuredTitle = HTMLParser.parse(titleHTML);

  // The last element **should be** the title without prefixes (engines, status, other...)
  const nodes = structuredTitle.childNodes;
  const gameTitle = nodes[nodes.length - 1].rawText;

  // Without any bracket the whole text is the title: substring(0, -1)
  // would otherwise return an empty string.
  const endTitleIndex = gameTitle.indexOf("[");
  const name =
    endTitleIndex === -1 ? gameTitle : gameTitle.substring(0, endTitleIndex);
  return name.trim();
}
/** /**
@ -190,43 +203,48 @@ async function getGameTitle(page) {
* @returns {Promise<String[]>} List of uppercase tags * @returns {Promise<String[]>} List of uppercase tags
*/ */
async function getGameTags(page) {
  // Collects the text of every element matched by the game-tags selector,
  // uppercases it and returns the list sorted alphabetically.
  const handles = await page.$$(selectors.GAME_TAGS);

  // The reads are independent from each other, so they are run in parallel
  const texts = await Promise.all(
    handles.map((handle) =>
      page.evaluate(
        /* istanbul ignore next */ (element) => element.innerText,
        handle
      )
    )
  );
  return texts.map((tag) => tag.toUpperCase()).sort();
}
/**
 * @private
 * Process the game title prefixes to extract information such as game status,
 * graphics engine used, and whether it is a mod or original game.
 * The status defaults to "Ongoing" when no status prefix is found.
 * @param {puppeteer.Page} page Page containing the prefixes to be extrapolated
 * @param {GameInfo} info Object to assign the identified information to
 * @returns {Promise<GameInfo>} GameInfo object passed in to which the identified information has been added
 */
async function parsePrefixes(page, info) {
  const MOD_PREFIX = "MOD";

  // The 'Ongoing' status is not specified, only 'Abandoned'/'OnHold'/'Complete'
  info.status = "Ongoing";
  for (const handle of await page.$$(selectors.GAME_TITLE_PREFIXES)) {
    const value = await page.evaluate(
      /* istanbul ignore next */ (element) => element.innerText,
      handle
    );

    // Clean the prefix: uppercase, without square brackets and outer spaces
    const prefix = value.toUpperCase().replace("[", "").replace("]", "").trim();

    // Getting infos...
    if (shared.statuses.includes(prefix)) info.status = prefix;
    else if (shared.engines.includes(prefix)) info.engine = prefix;
    // This is not a game but a mod
    else if (prefix === MOD_PREFIX) info.isMod = true;
  }
  return info;
}
/** /**
@ -236,44 +254,62 @@ async function parsePrefixes(page, info) {
* @returns {Promise<GameDownload[]>} List of objects used for game download * @returns {Promise<GameDownload[]>} List of objects used for game download
*/ */
async function getGameDownloadLink(page) {
  // Finds the element that lists the download links and extracts, for every
  // supported OS, the links found in it. Returns an empty list when no
  // candidate element mentions a known hosting platform.

  // Most used hosting platforms
  const hostingPlatforms = [
    "MEGA",
    "NOPY",
    "FILESUPLOAD",
    "MIXDROP",
    "UPLOADHAVEN",
    "PIXELDRAIN",
    "FILESFM",
  ];

  // Supported OS platforms
  const platformOS = ["WIN", "LINUX", "MAC", "ALL"];

  // Gets the <span> which contains the download links
  const candidates = await page.$$(selectors.DOWNLOAD_LINKS_CONTAINER);
  if (candidates.length === 0) return [];

  // Look for the container that contains the links
  // It is necessary because the same css selector
  // also identifies other elements on the page
  let container = null;
  for (const candidate of candidates) {
    const upperText = (
      await page.evaluate(
        /* istanbul ignore next */ (e) => e.innerText,
        candidate
      )
    ).toUpperCase();

    // The first candidate naming a hosting platform is the container
    if (hostingPlatforms.some((hosting) => upperText.includes(hosting))) {
      container = candidate;
      break;
    }
  }
  if (container === null) return [];

  // Extract the HTML text from the container
  // NOTE(review): lowercasing the HTML also lowercases the href values, which
  // may break links whose path is case-sensitive; confirm before relying on them.
  const searchText = (
    await page.evaluate(
      /* istanbul ignore next */ (e) => e.innerHTML,
      container
    )
  ).toLowerCase();

  // Parse the download links
  const downloadData = [];
  for (const platform of platformOS) {
    downloadData.push(...extractGameHostingData(platform, searchText));
  }
  return downloadData;
}
/** /**
@ -285,56 +321,55 @@ async function getGameDownloadLink(page) {
* @returns {GameDownload[]} List of game download links for the selected platform * @returns {GameDownload[]} List of game download links for the selected platform
*/ */
function extractGameHostingData(platform, text) {
  // Collects the download links listed for one OS platform.
  // The section of a platform starts right after the platform name and ends
  // at the next "<b>" (the next platform label), or at the closing "</span>"
  // of the container, or at the end of the text when neither is present.
  // Every marker is matched in lowercase, so `text` is expected to be
  // lowercased HTML.
  const PLATFORM_BOLD_OPEN = "<b>";
  const CONTAINER_SPAN_CLOSE = "</span>";
  const LINK_OPEN = "<a";
  const LINK_CLOSE = "</a>";
  const HREF_START = 'href="';
  const HREF_END = '"';
  const TAG_CLOSE = ">";

  // Identify the individual platforms
  const platformIndex = text.indexOf(platform.toLowerCase());
  if (platformIndex === -1) return [];
  const sectionStart = platformIndex + platform.length;

  // Find the end of the section. The -1 checks are done on the raw indexOf
  // result: adding the marker length first would hide a failed search.
  let sectionEnd = text.indexOf(PLATFORM_BOLD_OPEN, sectionStart);
  if (sectionEnd === -1) {
    sectionEnd = text.indexOf(CONTAINER_SPAN_CLOSE, sectionStart);
  }
  if (sectionEnd === -1) sectionEnd = text.length;
  const section = text.substring(sectionStart, sectionEnd);

  const downloadData = [];
  for (const tag of section.split(LINK_OPEN)) {
    // Ignore non-link string
    if (!tag.includes(HREF_START)) continue;

    // Find the hosting platform name (the text of the link)
    const nameStart = tag.indexOf(TAG_CLOSE) + TAG_CLOSE.length;
    const nameClose = tag.indexOf(LINK_CLOSE, nameStart);
    const nameEnd = nameClose === -1 ? tag.length : nameClose;
    const hosting = tag.substring(nameStart, nameEnd);

    // Find the 'href' attribute
    const hrefStart = tag.indexOf(HREF_START) + HREF_START.length;
    const hrefClose = tag.indexOf(HREF_END, hrefStart);
    const hrefEnd = hrefClose === -1 ? tag.length : hrefClose;
    const link = tag.substring(hrefStart, hrefEnd);

    if (isStringAValidURL(link)) {
      const gd = new GameDownload();
      gd.hosting = hosting.toUpperCase();
      gd.link = new URL(link);
      gd.supportedOS = platform.toUpperCase();

      downloadData.push(gd);
    }
  }
  return downloadData;
}
//#endregion Private methods //#endregion Private methods

View File

@ -1,18 +1,25 @@
const { debug, login, getGameData, loadF95BaseData, getUserData, logout } = require("../app/index"); const {
debug,
login,
getGameData,
loadF95BaseData,
getUserData,
logout,
} = require("../app/index");
//debug(true); //debug(true);
// Report failures instead of leaving the promise floating
main().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});

/**
 * Manual smoke test: logs in, loads the base data, searches a game
 * and prints the last result found.
 * The credentials are read from the F95_USERNAME and F95_PASSWORD
 * environment variables so that they are never committed to the repository.
 * @returns {Promise<void>}
 * @throws {Error} If the credentials are not set in the environment
 */
async function main() {
  const username = process.env.F95_USERNAME;
  const password = process.env.F95_PASSWORD;
  if (!username || !password) {
    throw new Error(
      "Set the F95_USERNAME and F95_PASSWORD environment variables"
    );
  }

  const loginResult = await login(username, password);
  try {
    if (loginResult.success) {
      await loadF95BaseData();
      const gameData = await getGameData("kingdom of deception", false);
      console.log(gameData.pop());

      // let userData = await getUserData();
      // console.log(userData);
    }
  } finally {
    // Always log out, even when the scraping above fails
    await logout();
  }
}