import { load } from "cheerio";
import { type AnyNode } from "domhandler";
import config from "../config";
import Scraper from "./scraper";
import type { IScraperArticlesOptions } from "../interfaces/scaper-articles-options";
import type { IArticle } from "../interfaces/article";
import LogLevels from "../enums/log-levels";
import Props from "../enums/props";
import ScraperMethods from "../enums/scraper-methods";

/**
 * Downloads the page configured in the options and turns every DOM node
 * matched by `articlesSelector` into an {@link IArticle}, reading each field
 * through its own optional selector.
 */
export default class ScraperArticles {
  private readonly _name: string;
  private readonly _options: IScraperArticlesOptions;
  private readonly _scraper: Scraper;

  /**
   * @param name - Label used as the prefix of log lines and as the fallback
   * author when no author selector is configured.
   * @param options - Target URL, scraping method, selectors and prefixes.
   */
  constructor (name: string, options: IScraperArticlesOptions) {
    this._name = name;
    this._options = options;
    this._scraper = new Scraper();
  }

  /**
   * Writes an error report to the console, but only when the configured log
   * level is DEBUG.
   *
   * The thrown value is narrowed before use: anything can be thrown in
   * JavaScript, so reading `.message` blindly could log `undefined` or even
   * throw from inside the error handler.
   *
   * @param headline - Text printed after the scraper name on the first line.
   * @param err - The caught value.
   * @param detail - Optional extra context line printed before the message.
   */
  private logDebugError (headline: string, err: unknown, detail?: string): void {
    if (config.LOG_LEVEL !== LogLevels.DEBUG) {
      return;
    }

    console.debug(`${this._name} | ${headline}\n`);
    if (detail !== undefined) {
      console.debug(detail);
    }
    console.error(err instanceof Error ? err.message : String(err));
  }

  /**
   * Reads a value from the first-level document built around a DOM node.
   *
   * @param domNodeElement - Node to search in.
   * @param selector - Selector of the element holding the value.
   * @param prop - `Props.TEXT` reads the text content; any other value is
   * used as the attribute name to read.
   * @returns The value found, or `""` when the attribute is missing or the
   * lookup throws.
   */
  private getProperty (domNodeElement: AnyNode, selector: string, prop: Props = Props.TEXT): string {
    const $ = load(domNodeElement);

    try {
      const value = prop === Props.TEXT ? $(selector).text() : $(selector).attr(prop);
      return value ?? "";
    } catch (err: unknown) {
      this.logDebugError(
        "Error raised",
        err,
        `From ${domNodeElement.type} can't get value using selector '${selector}'`
      );
      return "";
    }
  }

  /** @returns The trimmed title text, or `""` when no title selector is configured. */
  private getTitle (article: AnyNode): string {
    const selector = this._options.titleSelector ?? "";
    return selector !== "" ? this.getProperty(article, selector).trim() : "";
  }

  /** @returns The trimmed content text, or `""` when no content selector is configured. */
  private getContent (article: AnyNode): string {
    const selector = this._options.contentSelector ?? "";
    return selector !== "" ? this.getProperty(article, selector).trim() : "";
  }

  /**
   * Resolves the article link.
   *
   * Without a link selector the scraped page URL itself is used. A value that
   * does not parse as an absolute URL gets `linkPrefix` prepended when that
   * option is defined.
   */
  private getLink (article: AnyNode): string {
    const selector = this._options.linkSelector ?? "";
    const url = selector !== ""
      ? this.getProperty(article, selector, Props.LINK)
      : this._options.url;
    const prefix = this._options.linkPrefix;

    if (prefix !== undefined && !this.isValidUrl(url)) {
      return (prefix + url.trim()).trim();
    }
    return url.trim();
  }

  /**
   * Downloads the article image through the scraper.
   *
   * @returns Whatever `scrapeFile` resolves to, or `undefined` when there is
   * no image selector, the selector yields no URL, or the download fails
   * (the failure is reported only at DEBUG level).
   */
  private async getImage (article: AnyNode): Promise<IArticle["image"]> {
    const selector = this._options.imageSelector ?? "";
    if (selector === "") {
      return undefined;
    }

    let imgUrl = this.getProperty(article, selector, Props.IMAGE);
    if (imgUrl === "") {
      return undefined;
    }

    if (!this.isValidUrl(imgUrl) && this._options.imagePrefix) {
      imgUrl = this._options.imagePrefix + imgUrl.trim();
    }

    try {
      return await this._scraper.scrapeFile(imgUrl);
    } catch (err: unknown) {
      // A broken image must not abort the article: report and carry on.
      this.logDebugError(
        "Error",
        err,
        `From ${article.type} can't get image using selector '${selector}'`
      );
      return undefined;
    }
  }

  /** @returns The trimmed author text, or the scraper name when no author selector is configured. */
  private getAuthor (article: AnyNode): string {
    const selector = this._options.authorSelector ?? "";
    return selector !== "" ? this.getProperty(article, selector).trim() : this._name;
  }

  /**
   * @returns The trimmed date text, or the current date formatted with the
   * `es-CL` locale when no date selector is configured.
   */
  private getDate (article: AnyNode): string {
    const selector = this._options.dateSelector ?? "";
    return selector !== ""
      ? this.getProperty(article, selector).trim()
      : new Date().toLocaleDateString("es-CL");
  }

  /**
   * @returns `true` only when `url` parses as an absolute URL that has both a
   * protocol and a host; relative paths and unparsable input yield `false`.
   */
  private isValidUrl (url: string): boolean {
    try {
      const { protocol, host } = new URL(url);
      return protocol !== "" && host !== "";
    } catch {
      return false;
    }
  }

  /**
   * Scrapes the configured URL and extracts one article per node matched by
   * `articlesSelector`.
   *
   * Failures are caught and reported only at DEBUG level; the articles
   * collected before the failure are still returned. The execution time is
   * always logged.
   *
   * @returns The extracted articles, possibly empty.
   */
  public async getArticles (): Promise<IArticle[]> {
    const articles: IArticle[] = [];
    const startTime = Date.now();

    try {
      console.info("Starting scraping", this._options);

      const response = await this._scraper.scrape({
        url: this._options.url,
        scraperMethod: this._options.scraperMethod
      });
      // The PUPPETEER method nests the markup one level deeper in the response.
      const html = this._options.scraperMethod === ScraperMethods.PUPPETEER
        ? response.data.data.html
        : response.data;
      const $ = load(html as string);
      const domElements = $(this._options.articlesSelector);

      if (config.LOG_LEVEL === LogLevels.DEBUG) {
        console.debug(`${this._name} | Articles obtained: ${domElements.length} `);
      }

      // Images are fetched one article at a time, in document order.
      for (const article of domElements.toArray()) {
        articles.push({
          title: this.getTitle(article),
          content: this.getContent(article),
          link: this.getLink(article),
          image: await this.getImage(article),
          author: this.getAuthor(article),
          date: this.getDate(article)
        });
      }
    } catch (err: unknown) {
      this.logDebugError("Error", err);
    } finally {
      const duration = (Date.now() - startTime) / 1000;
      console.info(`${this._name} | Execution time: ${duration}s`);
    }

    return articles;
  }
}