Add post-parser.ts
parent
1cad5c277f
commit
f7bad33c1f
|
@ -0,0 +1,291 @@
|
||||||
|
|
||||||
|
//#region Interfaces
|
||||||
|
|
||||||
|
/**
 * A node of the parsed post tree.
 */
export interface IPostElement {
    /** Kind of information carried by this node. */
    Type: "Empty" | "Text" | "Link" | "Image" | "Spoiler";
    /** Title/label of the node (empty string when it has none). */
    Name: string;
    /** Textual value of the node (empty string when it has none). */
    Text: string;
    /** Child nodes nested under this node. */
    Content: IPostElement[];
}
|
||||||
|
|
||||||
|
/**
 * A post node that points to an external resource (hyperlink or image).
 */
export interface ILink extends IPostElement {
    /** Narrowed to the two node kinds that carry a URL. */
    Type: "Image" | "Link";
    /** URL of the linked resource. */
    Href: string;
}
|
||||||
|
|
||||||
|
//#endregion Interfaces
|
||||||
|
|
||||||
|
//#region Public methods
|
||||||
|
/**
 * Given the main post of the page (#1) it extracts the information contained.
 *
 * @param $ Cheerio root used to wrap the raw nodes of the post.
 * @param post Cheerio selection of the post body.
 * @returns The title/data pairs produced by `parsePostElements` from the
 * non-empty top-level nodes of the post.
 */
export function parseCheerioMainPost($: cheerio.Root, post: cheerio.Cheerio): IPostElement[] {
    // The data is divided between "tag" and "text" elements.
    // Simple data is composed of a "tag" element followed
    // by a "text" element, while more complex data (contained
    // in spoilers) is composed of a "tag" element, followed
    // by a text containing only ":" and then by an additional
    // "tag" element having as the first term "Spoiler"

    // First fetch all the elements in the post, discarding the nodes
    // that carry no name, no text and no children
    const elements = post
        .contents()
        .toArray()
        .map(el => parseCheerioNode($, el))
        .filter(node => node.Name || node.Text || node.Content.length !== 0);

    // ... then parse the elements to create the pairs of title/data
    return parsePostElements(elements);
}
|
||||||
|
//#endregion Public methods
|
||||||
|
|
||||||
|
//#region Private methods
|
||||||
|
|
||||||
|
/**
 * Process a spoiler element by getting its text broken
 * down by any other spoiler elements present.
 *
 * @param $ Cheerio root used to wrap the raw child nodes.
 * @param spoiler Cheerio selection of the `bbCodeSpoiler` element.
 * @returns A "Spoiler" element whose `Name` is the text of the spoiler button,
 * whose `Text` is the whitespace-collapsed text found directly in the spoiler
 * content and whose `Content` holds the directly nested spoilers.
 */
function parseCheerioSpoilerNode($: cheerio.Root, spoiler: cheerio.Cheerio): IPostElement {
    // A spoiler block is composed of a div with class "bbCodeSpoiler",
    // containing a div "bbCodeSpoiler-content" containing, in cascade,
    // a div with class "bbCodeBlock--spoiler" and a div with class "bbCodeBlock-content".
    // This last tag contains the required data.
    const BUTTON_SELECTOR = "button.bbCodeSpoiler-button";
    const CONTENT_SELECTOR = "div.bbCodeSpoiler-content > div.bbCodeBlock--spoiler > div.bbCodeBlock-content";

    const content: IPostElement = {
        Type: "Spoiler",
        // The title of the spoiler is contained in its (first) button
        Name: spoiler.find(BUTTON_SELECTOR).first().text().trim(),
        Text: "",
        Content: []
    };

    // `find` also matches the content blocks of nested spoilers: keep only the
    // first match (document order, i.e. this spoiler's own content) so nested
    // data is handled exclusively by the recursive call below.
    spoiler.find(CONTENT_SELECTOR).first().contents().each((_idx, el) => {
        const element = $(el);

        if (element.hasClass("bbCodeSpoiler")) {
            // Parse nested spoiler
            content.Content.push(parseCheerioSpoilerNode($, element));
        } else if (el.type === "text") {
            // Append text
            content.Text += element.text();
        }
    });

    // Clean text
    content.Text = content.Text.replace(/\s\s+/g, ' ').trim();
    return content;
}
|
||||||
|
|
||||||
|
/**
 * Check if the node passed as a parameter is of text type.
 * This also includes formatted nodes (i.e. `<b>`).
 *
 * @param node Raw node to inspect.
 * @returns `true` for text nodes and for `<b>`/`<i>` tags, `false` otherwise.
 */
function isTextNode(node: cheerio.Element): boolean {
    // Tags that only add formatting and are therefore treated as plain text
    const formattedTags = ["b", "i"];

    if (node.type === "text") return true;
    return node.type === "tag" && formattedTags.includes(node.name);
}
|
||||||
|
|
||||||
|
/**
 * Gets the text of the node only, excluding child nodes.
 * Also includes formatted text elements (i.e. `<b>`).
 *
 * @param node Cheerio selection; only its first element is considered.
 * @returns The concatenated text of the direct text-like children, with runs
 * of two or more whitespace characters collapsed to one space and trimmed.
 */
function getCheerioNonChildrenText(node: cheerio.Cheerio): string {
    // Keep only the direct children that count as text (see `isTextNode`)
    const textNodes = node
        .first()
        .contents()
        .filter((_idx, el) => isTextNode(el));

    // Clean and return the text
    return textNodes.text().replace(/\s\s+/g, ' ').trim();
}
|
||||||
|
|
||||||
|
/**
 * Process a node and see if it contains a
 * link or image. If not, it returns `null`.
 *
 * @param element Cheerio selection; only its first element is inspected.
 * @returns An "Image" element for `<img>` (URL taken from `data-src`, falling
 * back to `src`), a "Link" element for `<a>`, `null` for anything else or for
 * an empty selection. Missing attributes are reported as empty strings.
 */
function parseCheerioLinkNode(element: cheerio.Cheerio): ILink | null {
    // Only tag nodes expose a name; text/comment nodes and empty selections do not
    const node = element[0];
    const name = node && "name" in node ? node.name : undefined;

    if (name === "img") {
        return {
            Name: "",
            Type: "Image",
            Text: element.attr("alt") ?? "",
            // Lazy-loaded images keep the real URL in "data-src"
            Href: element.attr("data-src") ?? element.attr("src") ?? "",
            Content: []
        };
    }

    if (name === "a") {
        return {
            Name: "",
            Type: "Link",
            Text: element.text().replace(/\s\s+/g, ' ').trim(),
            Href: element.attr("href") ?? "",
            Content: []
        };
    }

    return null;
}
|
||||||
|
|
||||||
|
/**
 * Collapse an `IPostElement` element with a single subnode
 * in the `Content` field in case it has no information.
 *
 * The element is modified in place and returned. The collapse happens when the
 * element has exactly one child and either the pair has no conflicting
 * name/text values or they share the same name or the same text.
 *
 * @param element Element to collapse.
 * @returns The same `element` instance, collapsed when possible.
 */
function reducePostElement(element: IPostElement): IPostElement {
    // Only elements wrapping exactly one child can be collapsed
    if (element.Content.length !== 1) return element;

    const child = element.Content[0];
    const nullValues = (!element.Name || !child.Name) && (!element.Text || !child.Text);
    const sameValues = element.Name === child.Name || element.Text === child.Text;
    if (!nullValues && !sameValues) return element;

    // Absorb the child: keep the parent's values where present
    element.Name = element.Name || child.Name;
    element.Text = element.Text || child.Text;
    element.Content = child.Content;
    element.Type = child.Type;

    // If the content is a link, add the HREF to the element
    const childLink = child as ILink;
    if (childLink.Href) (element as ILink).Href = childLink.Href;

    return element;
}
|
||||||
|
|
||||||
|
/**
 * Transform a `cheerio.Cheerio` node into an `IPostElement` element with its subnodes.
 *
 * @param $ Cheerio root used to wrap raw nodes.
 * @param node Raw node to convert.
 * @param reduce Compress subsequent subnodes if they contain no information. Default: `true`.
 * @returns The converted element (passed through `reducePostElement` when `reduce` is set).
 */
function parseCheerioNode($: cheerio.Root, node: cheerio.Element, reduce = true): IPostElement {
    const content: IPostElement = {
        Type: "Empty",
        Name: "",
        Text: "",
        Content: []
    };
    const cheerioNode = $(node);

    if (isTextNode(node)) {
        content.Text = cheerioNode.text().replace(/\s\s+/g, ' ').trim();
        content.Type = "Text";
    } else {
        // Get the number of children that the element owns
        const nChildren = cheerioNode.children().length;

        // Get the text of the element without its children
        content.Text = getCheerioNonChildrenText(cheerioNode);

        if (cheerioNode.hasClass("bbCodeSpoiler")) {
            // Parse spoilers (matched by class token, so extra classes are tolerated)
            content.Content.push(parseCheerioSpoilerNode($, cheerioNode));
            content.Type = "Spoiler";
        } else if (nChildren === 0 && cheerioNode.length !== 0) {
            // Parse links: a leaf element may be a link or an image
            const link = parseCheerioLinkNode(cheerioNode);

            // Add element if not null, keeping its own type ("Link" or "Image")
            if (link) {
                content.Content.push(link);
                content.Type = link.Type;
            }
        } else {
            cheerioNode.children().each((_idx, el) => {
                // Formatting tags are already part of the parent's text
                if (isTextNode(el)) return;

                // Parse the child of the element passed as parameter
                const childElement = parseCheerioNode($, el);

                // If the child is valid (not empty) push it
                if (childElement.Text || childElement.Content.length !== 0) {
                    content.Content.push(childElement);
                }
            });
        }
    }

    return reduce ? reducePostElement(content) : content;
}
|
||||||
|
|
||||||
|
/**
 * It simplifies the `IPostElement` elements by associating
 * the corresponding value to each characterizing element (i.e. author).
 *
 * The input array and its elements are left untouched: every element stored in
 * the result as a pair is a copy with its own `Content` array.
 *
 * @param elements Flat list of elements in document order.
 * @returns The list of "title" elements, each one holding its data in `Text`
 * and/or `Content`.
 */
function parsePostElements(elements: IPostElement[]): IPostElement[] {
    const pairs: IPostElement[] = [];
    // A text starting with one of these characters (typically ":") is the
    // value that belongs to the previously found title
    const specialCharsRegex = /^[-!$%^&*()_+|~=`{}\[\]:";'<>?,.\/]/;
    const overviewPrefix = "Overview:\n";

    for (const element of elements) {
        // Get the latest IPostElement in "pairs" (undefined while it is empty)
        const lastPair = pairs[pairs.length - 1];
        const startWithSpecial = specialCharsRegex.test(element.Text);

        if (element.Type === "Text" && startWithSpecial && lastPair) {
            // We have a "data": merge it with the last title, dropping the special char
            const cleanText = element.Text.replace(specialCharsRegex, "").trim();
            lastPair.Text = lastPair.Text || cleanText;
            lastPair.Content.push(...element.Content);
        } else if (element.Text.startsWith(overviewPrefix)) {
            // Special case: the overview carries title and data in the same text
            pairs.push({
                ...element,
                Type: "Text",
                Name: "Overview",
                Text: element.Text.replace(overviewPrefix, ""),
                Content: [...element.Content]
            });
        } else if (element.Type !== "Text" && lastPair) {
            // We have an element referred to the previous "title"
            lastPair.Content.push(element);
        } else {
            // ... else we have a "title" (the text becomes the name)
            pairs.push({
                ...element,
                Name: element.Text,
                Text: "",
                Content: [...element.Content]
            });
        }
    }

    return pairs;
}
|
||||||
|
|
||||||
|
//#endregion Private methods
|
Loading…
Reference in New Issue