| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141 |
- import { load } from "cheerio";
- import { type AnyNode } from "domhandler";
- import config from "../config";
- import Scraper from "./scraper";
- import type { IScraperArticlesOptions } from "../interfaces/scaper-articles-options";
- import type { IArticle } from "../interfaces/article";
- import LogLevels from "../enums/log-levels";
- import Props from "../enums/props";
- import ScraperMethods from "../enums/scraper-methods";
/**
 * Extracts a list of articles from a single web page.
 *
 * The page is fetched through {@link Scraper}; every DOM element matched by
 * `options.articlesSelector` is turned into one {@link IArticle} by applying
 * the per-field selectors found in the options object.
 */
export default class ScraperArticles {
  private readonly _name: string;
  private readonly _options: IScraperArticlesOptions;
  private readonly _scraper: Scraper;

  /**
   * @param name    Label used as a prefix in log output and as the fallback
   *                author when no author selector is configured.
   * @param options Page URL, scraping method and the selectors/prefixes used
   *                to extract each article field.
   */
  constructor (name: string, options: IScraperArticlesOptions) {
    this._name = name;
    this._options = options;
    this._scraper = new Scraper();
  }

  /**
   * Emits diagnostic output for a handled error, but only when the configured
   * log level is DEBUG; otherwise it does nothing.
   *
   * @param headline Short text printed after the scraper name.
   * @param detail   Optional extra context line; skipped when `null`.
   * @param err      The caught value. It is narrowed here because a `catch`
   *                 variable is not guaranteed to be an `Error`.
   */
  private logDebugError (headline: string, detail: string | null, err: unknown): void {
    if (config.LOG_LEVEL !== LogLevels.DEBUG) {
      return;
    }
    console.debug(`${this._name} | ${headline}\n`);
    if (detail !== null) {
      console.debug(detail);
    }
    console.error(err instanceof Error ? err.message : String(err));
  }

  /**
   * Reads one value from inside `domNodeElement`.
   *
   * @param domNodeElement Node that is loaded into cheerio and searched.
   * @param selector       Selector evaluated against the loaded node.
   * @param prop           `Props.TEXT` (default) reads the text content; any
   *                       other value is used as an attribute name.
   * @returns The extracted value, or `""` when the attribute is missing or the
   *          lookup throws.
   */
  private getProperty (domNodeElement: AnyNode, selector: string, prop: Props = Props.TEXT): string {
    try {
      // load() sits inside the try so that a failure while wrapping the node is
      // handled the same way as a failing selector.
      const $ = load(domNodeElement);
      const value = prop === Props.TEXT ? $(selector).text() : $(selector).attr(prop);
      return value ?? "";
    } catch (err) {
      this.logDebugError(
        "Error raised",
        `From ${domNodeElement.type} can't get value using selector '${selector}'`,
        err
      );
      return "";
    }
  }

  /** @returns Trimmed title text, or `""` when no title selector is configured. */
  private getTitle (article: AnyNode): string {
    const selector = this._options.titleSelector ?? "";
    return selector !== "" ? this.getProperty(article, selector).trim() : "";
  }

  /** @returns Trimmed content text, or `""` when no content selector is configured. */
  private getContent (article: AnyNode): string {
    const selector = this._options.contentSelector ?? "";
    return selector !== "" ? this.getProperty(article, selector).trim() : "";
  }

  /**
   * Resolves the article link.
   *
   * Without a link selector the page URL from the options is used. A value
   * that is not an absolute URL gets `options.linkPrefix` prepended when that
   * prefix is defined.
   *
   * @returns The trimmed link.
   */
  private getLink (article: AnyNode): string {
    const selector = this._options.linkSelector ?? "";
    const rawUrl = selector !== ""
      ? this.getProperty(article, selector, Props.LINK)
      : this._options.url;
    // Trim before validating so surrounding whitespace cannot influence the
    // absolute-vs-relative decision.
    const url = rawUrl.trim();
    const prefix = this._options.linkPrefix;

    if (prefix !== undefined && !this.isValidUrl(url)) {
      return (prefix + url).trim();
    }
    return url;
  }

  /**
   * Downloads the article image through the scraper.
   *
   * @returns The downloaded file, or `null` when no image selector is
   *          configured, the element carries no image URL, or the download
   *          fails (the failure is reported via debug logging only).
   */
  private async getImage (article: AnyNode): Promise<File | null> {
    const selector = this._options.imageSelector ?? "";
    if (selector === "") {
      return null;
    }

    let imgUrl = this.getProperty(article, selector, Props.IMAGE).trim();
    if (imgUrl === "") {
      // Nothing to download; in particular do not fetch the bare prefix.
      return null;
    }
    if (!this.isValidUrl(imgUrl) && this._options.imagePrefix) {
      imgUrl = this._options.imagePrefix + imgUrl;
    }

    try {
      // `?? null` keeps the declared contract even if the scraper resolves
      // without a value.
      return (await this._scraper.scrapeFile(imgUrl)) ?? null;
    } catch (err) {
      this.logDebugError(
        "Error",
        `From ${article.type} can't get image using selector '${selector}'`,
        err
      );
      return null;
    }
  }

  /** @returns Trimmed author text, or the scraper name when no author selector is configured. */
  private getAuthor (article: AnyNode): string {
    const selector = this._options.authorSelector ?? "";
    return selector !== "" ? this.getProperty(article, selector).trim() : this._name;
  }

  /**
   * @returns Trimmed date text as found on the page, or today's date formatted
   *          for the `es-CL` locale when no date selector is configured.
   */
  private getDate (article: AnyNode): string {
    const selector = this._options.dateSelector ?? "";
    return selector !== ""
      ? this.getProperty(article, selector).trim()
      : new Date().toLocaleDateString("es-CL");
  }

  /**
   * Tells whether `url` parses as an absolute URL that has both a protocol and
   * a host. Relative paths and host-less schemes such as `mailto:` yield `false`.
   */
  private isValidUrl (url: string): boolean {
    try {
      const parsedUrl = new URL(url);
      return parsedUrl.protocol !== "" && parsedUrl.host !== "";
    } catch {
      return false;
    }
  }

  /**
   * Fetches the configured page and extracts every matching article.
   *
   * Failures are not propagated: they are reported through debug logging and
   * the articles collected up to that point are returned. The elapsed time is
   * always logged.
   *
   * @returns The extracted articles, possibly empty.
   */
  public async getArticles (): Promise<IArticle[]> {
    const articles: IArticle[] = [];
    const startTime = Date.now();

    try {
      console.info("Starting scraping", this._options);
      const response = await this._scraper.scrape({
        url: this._options.url,
        scraperMethod: this._options.scraperMethod
      });
      // NOTE(review): the Puppeteer path apparently nests the markup under
      // `data.data.html` while other methods return it as `data`; confirm
      // against Scraper.scrape.
      const html = this._options.scraperMethod === ScraperMethods.PUPPETEER
        ? response.data.data.html
        : response.data;
      const $ = load(html as string);
      const domElements = $(this._options.articlesSelector);

      if (config.LOG_LEVEL === LogLevels.DEBUG) {
        console.debug(`${this._name} | Articles obtained: ${domElements.length} `);
      }

      // Articles are processed one at a time, so image downloads stay
      // sequential exactly as before.
      for (const article of domElements.toArray()) {
        articles.push({
          title: this.getTitle(article),
          content: this.getContent(article),
          link: this.getLink(article),
          image: await this.getImage(article),
          author: this.getAuthor(article),
          date: this.getDate(article)
        });
      }
    } catch (err) {
      this.logDebugError("Error", null, err);
    } finally {
      const duration = (Date.now() - startTime) / 1000;
      console.info(`${this._name} | Execution time: ${duration}s`);
    }

    return articles;
  }
}
|