scraper-articles.ts 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
  1. import { load } from "cheerio";
  2. import { type AnyNode } from "domhandler";
  3. import config from "../config";
  4. import Scraper from "./scraper";
  5. import type { IScraperArticlesOptions } from "../interfaces/scaper-articles-options";
  6. import type { IArticle } from "../interfaces/article";
  7. import LogLevels from "../enums/log-levels";
  8. import Props from "../enums/props";
  9. import ScraperMethods from "../enums/scraper-methods";
  10. export default class ScraperArticles {
  11. private readonly _name: string;
  12. private readonly _options: IScraperArticlesOptions;
  13. private readonly _scraper: Scraper;
  14. constructor (name: string, options: IScraperArticlesOptions) {
  15. this._name = name;
  16. this._options = options;
  17. this._scraper = new Scraper();
  18. }
  19. private getProperty (domNodeElement: AnyNode, selector: string, prop: Props = Props.TEXT): string {
  20. const $ = load(domNodeElement);
  21. let value: string | undefined = "";
  22. try {
  23. value = prop === Props.TEXT ? $(selector).text() : $(selector).attr(prop);
  24. } catch (err) {
  25. if (config.LOG_LEVEL === LogLevels.DEBUG) {
  26. console.debug(`${this._name} | Error raised\n`);
  27. console.debug(`From ${domNodeElement.type} can't get value using selector '${selector}'`);
  28. console.error(err.message);
  29. }
  30. }
  31. return value ?? "";
  32. }
  33. private getTitle (article: AnyNode): string {
  34. const selector = this._options.titleSelector ?? "";
  35. return selector !== "" ? this.getProperty(article, selector).trim() : "";
  36. }
  37. private getContent (article: AnyNode): string {
  38. const selector = this._options.contentSelector ?? "";
  39. return selector !== "" ? this.getProperty(article, selector).trim() : "";
  40. }
  41. private getLink (article: AnyNode): string {
  42. const selector = this._options.linkSelector ?? "";
  43. const url = selector !== "" ? this.getProperty(article, selector, Props.LINK) : this._options.url
  44. return ! this.isValidUrl(url) && this._options.linkPrefix !== undefined
  45. ? (this._options.linkPrefix + url.trim()).trim()
  46. : url.trim();
  47. }
  48. private async getImage (article: AnyNode): Promise<File | null> {
  49. let imgFile;
  50. const selector = this._options.imageSelector ?? "";
  51. let imgUrl = selector !== "" ? this.getProperty(article, selector, Props.IMAGE) : "";
  52. if (imgUrl !== "") {
  53. if (! this.isValidUrl(imgUrl) && this._options.imagePrefix) {
  54. imgUrl = this._options.imagePrefix + imgUrl.trim();
  55. }
  56. imgFile = this._scraper.scrapeFile(imgUrl)
  57. .catch((err) => {
  58. if (config.LOG_LEVEL === LogLevels.DEBUG) {
  59. console.debug(`${this._name} | Error\n`);
  60. console.debug(`From ${article.type} can't get image using selector '${selector}'`);
  61. console.error(err.message);
  62. }
  63. });
  64. }
  65. return imgFile;
  66. }
  67. private getAuthor (article: AnyNode): string {
  68. const selector = this._options.authorSelector ?? "";
  69. return selector !== "" ? this.getProperty(article, selector).trim() : this._name;
  70. }
  71. private getDate (article: AnyNode): string {
  72. const selector = this._options.dateSelector ?? "";
  73. return selector !== "" ? this.getProperty(article, selector).trim() : new Date(Date.now()).toLocaleDateString("es-CL");
  74. }
  75. private isValidUrl (url: string): boolean {
  76. try {
  77. const parsedUrl = new URL(url);
  78. if (!parsedUrl.protocol || !parsedUrl.host) {
  79. return false;
  80. }
  81. return true;
  82. } catch (error) {
  83. return false;
  84. }
  85. };
  86. public async getArticles (): Promise<IArticle[]> {
  87. const articles: IArticle[] = [];
  88. const startTime = Date.now();
  89. try {
  90. console.info("Starting scraping", this._options);
  91. const response = await this._scraper.scrape({ url: this._options.url, scraperMethod: this._options.scraperMethod });
  92. const html = this._options.scraperMethod === ScraperMethods.PUPPETEER ? response.data.data.html : response.data;
  93. const $ = load(html as string);
  94. const domElements = $(this._options.articlesSelector);
  95. if (config.LOG_LEVEL === LogLevels.DEBUG) {
  96. console.debug(`${this._name} | Articles obtained: ${domElements.length} `);
  97. }
  98. for (let i = 0; i < domElements.length; i++) {
  99. const article = domElements[i];
  100. articles.push({
  101. title: this.getTitle(article),
  102. content: this.getContent(article),
  103. link: this.getLink(article),
  104. image: await this.getImage(article),
  105. author: this.getAuthor(article),
  106. date: this.getDate(article)
  107. });
  108. }
  109. } catch (err) {
  110. if (config.LOG_LEVEL === LogLevels.DEBUG) {
  111. console.debug(`${this._name} | Error\n`);
  112. console.error(err.message);
  113. }
  114. } finally {
  115. const endTime = Date.now();
  116. const duration = (endTime - startTime) / 1000;
  117. console.info(`${this._name} | Execution time: ${duration}s`);
  118. }
  119. return articles;
  120. }
  121. }