فهرست منبع

updated to post to individual accounts
added df portal
option to select method of scraping

Pablo Barrera Yaksic 11 ماه پیش
والد
کامیت
cdaa108f0b
3 فایلهای تغییر یافته به همراه 49 افزوده شده و 27 حذف شده
  1. 13 6
      src/portales/theclinic/handler.ts
  2. 5 5
      src/utils/scraper-articles.ts
  3. 31 16
      src/utils/scraper.ts

+ 13 - 6
src/portales/theclinic/handler.ts

@@ -2,12 +2,19 @@ import { type Handler } from "aws-lambda";
 
 import config from "../../config";
 import Portal from "../portal";
+import ScraperMethods from "../../enums/scraper-methods";
 
 const name = "The Clinic";
 
-export const handler: Handler = new Portal(name, {
-  url: config.THECLINIC,
-  articlesSelector: ".listado article",
-  titleSelector: ".titulares h2 a",
-  linkSelector: ".titulares h2 a"
-}).getHandler();
+export const handler: Handler = new Portal(
+  name, 
+  config.MASTODON_KEY_THECLINIC,
+  {
+    url: config.THECLINIC,
+    articlesSelector: "div.listado-3 article",
+    titleSelector: "div.titulares h2 a",
+    linkSelector: "div.titulares h2 a",
+    imageSelector: "div.imagen-post img",
+    scraperMethod: ScraperMethods.PUPPETEER
+  }
+).getHandler();

+ 5 - 5
src/utils/scraper-articles.ts

@@ -1,6 +1,5 @@
-import { type AnyNode, load } from "cheerio";
-import "dotenv/config";
-
+import { load } from "cheerio";
+import { type AnyNode } from "domhandler";
 import config from "../config";
 import Scraper from "./scraper";
 
@@ -9,6 +8,7 @@ import type { IArticle } from "../interfaces/article";
 
 import LogLevels from "../enums/log-levels";
 import Props from "../enums/props";
+import ScraperMethods from "../enums/scraper-methods";
 
 export default class ScraperArticles {
   private readonly _name: string;
@@ -86,8 +86,8 @@ export default class ScraperArticles {
 
     try {
       console.info("Starting scraping", this._options);
-      const response = await this._scraper.scrape({ url: this._options.url });
-      const html = response.data.data.html;
+      const response = await this._scraper.scrape({ url: this._options.url, scraperMethod: this._options.scraperMethod });
+      const html = this._options.scraperMethod === ScraperMethods.PUPPETEER ? response.data.data.html : response.data;
 
       const $ = load(html as string);
       const domElements = $(this._options.articlesSelector);

+ 31 - 16
src/utils/scraper.ts

@@ -5,31 +5,46 @@ import LogLevels from "../enums/log-levels";
 
 import type { IScraperOptions } from "../interfaces/scraper-options";
 
+import ScraperMethods from "../enums/scraper-methods";
+
 export default class Scraper {
   private _options: IScraperOptions;
 
+  private logScrape(response: any) {
+    console.debug(`From '${this._options.url}'\n`);
+    console.debug("HTML\n");
+
+    if (this._options.scraperMethod == ScraperMethods.PUPPETEER) {
+      console.debug(response.data.data.html);
+      console.debug("\n");
+      console.debug("Screenshot (Base64)\n")
+      console.debug(response.data.data.screenshot);
+    } else {
+      console.debug(response.data);
+    }
+  }
+
   public async scrape (options: IScraperOptions): Promise<any> {
     this._options = options;
     let response: any;
+    const userAgent = this._options.userAgent ?? "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36";
+    const headers = {
+      "User-Agent": userAgent
+    }
 
     try {
-      response = await axios.post(config.PUPPETEER_URL, {
-        url: this._options.url,
-        screenshot: config.LOG_LEVEL === LogLevels.DEBUG,
-        incognito: this._options.incognito ?? false
-      }, {
-        headers: {
-          "User-Agent": this._options.userAgent ?? "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
-        }
-      });
-
+      if (this._options.scraperMethod == ScraperMethods.PUPPETEER) {
+        response = await axios.post(config.PUPPETEER_URL, {
+          url: this._options.url,
+          screenshot: config.LOG_LEVEL === LogLevels.DEBUG,
+          incognito: this._options.incognito ?? false
+        }, { headers });
+      } else {
+        response = await axios.get(this._options.url, { headers });
+      }
+      
       if (config.LOG_LEVEL === LogLevels.DEBUG) {
-        console.debug(`From '${this._options.url}'\n`);
-        console.debug("HTML\n")
-        console.debug(response.data.data.html);
-        console.debug("\n");
-        console.debug("Screenshot (Base64)\n")
-        console.debug(response.data.data.screenshot);
+        this.logScrape(response);
       }
     } catch (err: any) {
       console.error(err.message);