فهرست منبع

updated to post to individual accounts
added df portal
option to select method of scraping

Pablo Barrera Yaksic 2 ماه پیش
والد
کامیت
cdaa108f0b
3 فایلهای تغییر یافته به همراه 49 افزوده شده و 27 حذف شده
  1. 13 6
      src/portales/theclinic/handler.ts
  2. 5 5
      src/utils/scraper-articles.ts
  3. 31 16
      src/utils/scraper.ts

+ 13 - 6
src/portales/theclinic/handler.ts

@@ -2,12 +2,19 @@ import { type Handler } from "aws-lambda";
 
 import config from "../../config";
 import Portal from "../portal";
+import ScraperMethods from "../../enums/scraper-methods";
 
 const name = "The Clinic";
 
-export const handler: Handler = new Portal(name, {
-  url: config.THECLINIC,
-  articlesSelector: ".listado article",
-  titleSelector: ".titulares h2 a",
-  linkSelector: ".titulares h2 a"
-}).getHandler();
+export const handler: Handler = new Portal(
+  name, 
+  config.MASTODON_KEY_THECLINIC,
+  {
+    url: config.THECLINIC,
+    articlesSelector: "div.listado-3 article",
+    titleSelector: "div.titulares h2 a",
+    linkSelector: "div.titulares h2 a",
+    imageSelector: "div.imagen-post img",
+    scraperMethod: ScraperMethods.PUPPETEER
+  }
+).getHandler();

+ 5 - 5
src/utils/scraper-articles.ts

@@ -1,6 +1,5 @@
-import { type AnyNode, load } from "cheerio";
-import "dotenv/config";
-
+import { load } from "cheerio";
+import { type AnyNode } from "domhandler";
 import config from "../config";
 import Scraper from "./scraper";
 
@@ -9,6 +8,7 @@ import type { IArticle } from "../interfaces/article";
 
 import LogLevels from "../enums/log-levels";
 import Props from "../enums/props";
+import ScraperMethods from "../enums/scraper-methods";
 
 export default class ScraperArticles {
   private readonly _name: string;
@@ -86,8 +86,8 @@ export default class ScraperArticles {
 
     try {
       console.info("Starting scraping", this._options);
-      const response = await this._scraper.scrape({ url: this._options.url });
-      const html = response.data.data.html;
+      const response = await this._scraper.scrape({ url: this._options.url, scraperMethod: this._options.scraperMethod });
+      const html = this._options.scraperMethod === ScraperMethods.PUPPETEER ? response.data.data.html : response.data;
 
       const $ = load(html as string);
       const domElements = $(this._options.articlesSelector);

+ 31 - 16
src/utils/scraper.ts

@@ -5,31 +5,46 @@ import LogLevels from "../enums/log-levels";
 
 import type { IScraperOptions } from "../interfaces/scraper-options";
 
+import ScraperMethods from "../enums/scraper-methods";
+
 export default class Scraper {
   private _options: IScraperOptions;
 
+  private logScrape(response: any) {
+    console.debug(`From '${this._options.url}'\n`);
+    console.debug("HTML\n");
+
+    if (this._options.scraperMethod == ScraperMethods.PUPPETEER) {
+      console.debug(response.data.data.html);
+      console.debug("\n");
+      console.debug("Screenshot (Base64)\n")
+      console.debug(response.data.data.screenshot);
+    } else {
+      console.debug(response.data);
+    }
+  }
+
   public async scrape (options: IScraperOptions): Promise<any> {
     this._options = options;
     let response: any;
+    const userAgent = this._options.userAgent ?? "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36";
+    const headers = {
+      "User-Agent": userAgent
+    }
 
     try {
-      response = await axios.post(config.PUPPETEER_URL, {
-        url: this._options.url,
-        screenshot: config.LOG_LEVEL === LogLevels.DEBUG,
-        incognito: this._options.incognito ?? false
-      }, {
-        headers: {
-          "User-Agent": this._options.userAgent ?? "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
-        }
-      });
-
+      if (this._options.scraperMethod == ScraperMethods.PUPPETEER) {
+        response = await axios.post(config.PUPPETEER_URL, {
+          url: this._options.url,
+          screenshot: config.LOG_LEVEL === LogLevels.DEBUG,
+          incognito: this._options.incognito ?? false
+        }, { headers });
+      } else {
+        response = await axios.get(this._options.url, { headers });
+      }
+      
       if (config.LOG_LEVEL === LogLevels.DEBUG) {
-        console.debug(`From '${this._options.url}'\n`);
-        console.debug("HTML\n")
-        console.debug(response.data.data.html);
-        console.debug("\n");
-        console.debug("Screenshot (Base64)\n")
-        console.debug(response.data.data.screenshot);
+        this.logScrape(response);
       }
     } catch (err: any) {
       console.error(err.message);