Parcourir la source

added biobiochile portal

Pablo Barrera Yaksic il y a 4 jours
Parent
commit
bf3bb2409c

+ 3 - 15
.env.example

@@ -13,19 +13,8 @@ MASTODON_DB_USER = "mastodon"
 MASTODON_DB_PASSWORD = "password"
 MASTODON_DB_DATABASE = "mastodon"
 
-CHILECULTURA = "https://chilecultura.gob.cl/events/"
-CIPER = "https://www.ciperchile.cl/actualidad/"
-CODEXVERDE = "https://codexverde.cl/"
-COOPERATIVA = "https://www.cooperativa.cl/"
-LOG_LEVEL = "debug"
-REDIS_CONN = "redis://localhost:6379/10"
-PUPPETEER_URL = "http://localhost:8000/api/v1/visit"
-MASTODON_URL = "https://<mastodon-domain>"
-MASTODON_API_URL = "https://<mastodon-domain>/api/v1/"
-MASTODON_STREAMING_URL = "wss://<mastodon-domain>/api/v1/streaming"
-MASTODON_ACCESS_TOKEN = "<access-token>"
-IMG_PLACEHOLDER = "https://placehold.co/600x400"
-
+# PORTALES
+BIOBIOCHILE = "https://www.biobiochile.cl/"
 CHILECULTURA = "https://chilecultura.gob.cl/events/"
 CIPER = "https://www.ciperchile.cl/actualidad/"
 CODEXVERDE = "https://codexverde.cl/"
@@ -40,12 +29,12 @@ GLACIARESCHILENOS = "https://www.glaciareschilenos.org/articulos/"
 INTERFERENCIA = "https://interferencia.cl/"
 LADERASUR = "https://laderasur.com/articulos-2/"
 LATERCERA = "https://www.latercera.com/canal/nacional/"
-METRODESANTIAGO = "https://xcancel.com/metrodesantiago"
 SISMOLOGIA = "https://www.sismologia.cl/index.html"
 TARREO = "https://www.tarreo.com/noticias/"
 THECLINIC = "https://www.theclinic.cl/lo-ultimo/"
 
 # KEYS
+MASTODON_KEY_BIOBIOCHILE = ""
 MASTODON_KEY_CHILECULTURA = ""
 MASTODON_KEY_CIPER = ""
 MASTODON_KEY_CODEXVERDE = ""
@@ -60,7 +49,6 @@ MASTODON_KEY_GLACIARESCHILENOS = ""
 MASTODON_KEY_INTERFERENCIA = ""
 MASTODON_KEY_LADERASUR = ""
 MASTODON_KEY_LATERCERA = ""
-MASTODON_KEY_METRODESANTIAGO = ""
 MASTODON_KEY_SISMOLOGIA = ""
 MASTODON_KEY_TARREO = ""
 MASTODON_KEY_THECLINIC = ""

+ 2 - 3
src/config.ts

@@ -15,12 +15,12 @@ const config = {
   MASTODON_DB_PASSWORD: process.env.MASTODON_DB_PASSWORD,
   MASTODON_DB_DATABASE: process.env.MASTODON_DB_DATABASE,
   // PORTALES
+  BIOBIOCHILE: process.env.BIOBIOCHILE ?? "https://www.biobiochile.cl/",
   CHILECULTURA: process.env.CHILECULTURA ?? "https://chilecultura.gob.cl/",
   CIPER: process.env.CIPER ?? "https://www.ciperchile.cl/actualidad/",
   CODEXVERDE: process.env.CODEXVERDE ?? "https://codexverde.cl/",
   CONTRAPODER: process.env.CONTRAPODER ?? "https://contrapoderchile.cl/category/portada/",
   COOPERATIVA: process.env.COOPERATIVA ?? "https://www.cooperativa.cl/",
-  // COOPERATIVA: process.env.COOPERATIVA ?? "https://cooperativa.cl/noticias/site/cache/nroedic/todas/",
   DF: process.env.DF ?? "https://www.df.cl/ultimasnoticias",
   ELCIUDADANO: process.env.ELCIUDADANO ?? "https://www.elciudadano.com/chile/",
   ELDESCONCIERTO: process.env.ELDESCONCIERTO ?? "https://eldesconcierto.cl",
@@ -31,11 +31,11 @@ const config = {
   INTERFERENCIA: process.env.INTERFERENCIA ?? "https://interferencia.cl/",
   LADERASUR: process.env.LADERASUR ?? "https://laderasur.com/",
   LATERCERA: process.env.LATERCERA ?? "https://www.latercera.com/canal/nacional/",
-  // METRODESANTIAGO: process.env.METRODESANTIAGO ?? "https://xcancel.com/metrodesantiago",
   SISMOLOGIA: process.env.SISMOLOGIA ?? "https://www.sismologia.cl/index.html",
   TARREO: process.env.TARREO ?? "https://www.tarreo.com/noticias/",
   THECLINIC: process.env.THECLINIC ?? "https://www.theclinic.cl/lo-ultimo/",
   // KEYS
+  MASTODON_KEY_BIOBIOCHILE: process.env.MASTODON_KEY_BIOBIOCHILE ?? "",
   MASTODON_KEY_CHILECULTURA: process.env.MASTODON_KEY_CHILECULTURA ?? "",
   MASTODON_KEY_CIPER: process.env.MASTODON_KEY_CIPER ?? "",
   MASTODON_KEY_CODEXVERDE: process.env.MASTODON_KEY_CODEXVERDE ?? "",
@@ -51,7 +51,6 @@ const config = {
   MASTODON_KEY_INTERFERENCIA: process.env.MASTODON_KEY_INTERFERENCIA ?? "",
   MASTODON_KEY_LADERASUR: process.env.MASTODON_KEY_LADERASUR ?? "",
   MASTODON_KEY_LATERCERA: process.env.MASTODON_KEY_LATERCERA ?? "",
-  // MASTODON_KEY_METRODESANTIAGO: process.env.MASTODON_KEY_METRODESANTIAGO ?? "",
   MASTODON_KEY_SISMOLOGIA: process.env.MASTODON_KEY_SISMOLOGIA ?? "",
   MASTODON_KEY_TARREO: process.env.MASTODON_KEY_TARREO ?? "",
   MASTODON_KEY_THECLINIC: process.env.MASTODON_KEY_THECLINIC ?? "",

+ 2 - 1
src/index.ts

@@ -2,6 +2,7 @@ import { type Context, type Handler } from "aws-lambda";
 
 import config from "./config";
 
+import { handler as biobiochile } from "./portales/biobiochile/handler";
 import { handler as chilecultura } from "./portales/chilecultura/handler";
 import { handler as ciper } from "./portales/ciper/handler";
 import { handler as codexverde } from "./portales/codexverde/handler";
@@ -37,6 +38,7 @@ const context: Context = {
 };
 
 const portalsHandlers = {
+  "biobiochile": biobiochile,
   "chilecultura": chilecultura,
   "ciper": ciper,
   "codexverde": codexverde,
@@ -52,7 +54,6 @@ const portalsHandlers = {
   "interferencia": interferencia,
   "laderasur": laderasur,
   // "latercera": latercera,
-  // "metrodesantiago": metrodesantiago,
   "tarreo": tarreo,
   "theclinic": theclinic
 };

+ 4 - 0
src/portales/biobiochile/definition.yml

@@ -0,0 +1,4 @@
+biobiochile:
+  handler: ./src/portales/biobiochile/handler.handler
+  events: 
+    - schedule: rate(1 hour)

+ 20 - 0
src/portales/biobiochile/handler.ts

@@ -0,0 +1,20 @@
+import { type Handler } from "aws-lambda";
+
+import config from "../../config";
+import Portal from "../portal";
+import ScraperMethods from "../../enums/scraper-methods";
+
+const name = "BioBio Chile";
+
+export const handler: Handler = new Portal(
+  name,
+  config.MASTODON_KEY_BIOBIOCHILE,
+  {
+    url: config.BIOBIOCHILE,
+    articlesSelector: "article.main-article, article.article-highlights, article.article-horizontal, article.vertical, article.article-highlights-aside",
+    titleSelector: "h2",
+    linkSelector: "a",
+    scraperMethod: ScraperMethods.AXIOS,
+    hashtags: ["BioBioChile", "Noticias"]
+  }
+).getHandler();

+ 19 - 14
src/utils/scraper-articles.ts

@@ -15,13 +15,13 @@ export default class ScraperArticles {
   private readonly _options: IScraperArticlesOptions;
   private readonly _scraper: Scraper;
 
-  constructor (name: string, options: IScraperArticlesOptions) {
+  constructor(name: string, options: IScraperArticlesOptions) {
     this._name = name;
     this._options = options;
     this._scraper = new Scraper();
   }
 
-  private getProperty (domNodeElement: AnyNode, selector: string, prop: Props = Props.TEXT): string {
+  private getProperty(domNodeElement: AnyNode, selector: string, prop: Props = Props.TEXT): string {
     const $ = load(domNodeElement);
     let value: string | undefined = "";
     try {
@@ -36,32 +36,32 @@ export default class ScraperArticles {
     return value ?? "";
   }
 
-  private getTitle (article: AnyNode): string {
+  private getTitle(article: AnyNode): string {
     const selector = this._options.titleSelector ?? "";
     return selector !== "" ? this.getProperty(article, selector).trim() : "";
   }
 
-  private getContent (article: AnyNode): string {
+  private getContent(article: AnyNode): string {
     const selector = this._options.contentSelector ?? "";
     return selector !== "" ? this.getProperty(article, selector).trim() : "";
   }
 
-  private getLink (article: AnyNode): string {
+  private getLink(article: AnyNode): string {
     const selector = this._options.linkSelector ?? "";
     const url = selector !== "" ? this.getProperty(article, selector, Props.LINK) : this._options.url
 
-    return ! this.isValidUrl(url) && this._options.linkPrefix !== undefined
-     ? (this._options.linkPrefix + url.trim()).trim() 
-     : url.trim();
+    return !this.isValidUrl(url) && this._options.linkPrefix !== undefined
+      ? (this._options.linkPrefix + url.trim()).trim()
+      : url.trim();
   }
 
-  private async getImage (article: AnyNode): Promise<File | null> {
+  private async getImage(article: AnyNode): Promise<File | null> {
     let imgFile;
     const selector = this._options.imageSelector ?? "";
     let imgUrl = selector !== "" ? this.getProperty(article, selector, Props.IMAGE) : "";
 
     if (imgUrl !== "") {
-      if (! this.isValidUrl(imgUrl) && this._options.imagePrefix) {
+      if (!this.isValidUrl(imgUrl) && this._options.imagePrefix) {
         imgUrl = this._options.imagePrefix + imgUrl.trim();
       }
       imgFile = this._scraper.scrapeFile(imgUrl)
@@ -76,17 +76,17 @@ export default class ScraperArticles {
     return imgFile;
   }
 
-  private getAuthor (article: AnyNode): string {
+  private getAuthor(article: AnyNode): string {
     const selector = this._options.authorSelector ?? "";
     return selector !== "" ? this.getProperty(article, selector).trim() : this._name;
   }
 
-  private getDate (article: AnyNode): string {
+  private getDate(article: AnyNode): string {
     const selector = this._options.dateSelector ?? "";
     return selector !== "" ? this.getProperty(article, selector).trim() : new Date(Date.now()).toLocaleDateString("es-CL");
   }
 
-  private isValidUrl (url: string): boolean {
+  private isValidUrl(url: string): boolean {
     try {
       const parsedUrl = new URL(url);
       if (!parsedUrl.protocol || !parsedUrl.host) {
@@ -98,7 +98,7 @@ export default class ScraperArticles {
     }
   };
 
-  public async getArticles (): Promise<IArticle[]> {
+  public async getArticles(): Promise<IArticle[]> {
     const articles: IArticle[] = [];
     const startTime = Date.now();
 
@@ -116,6 +116,11 @@ export default class ScraperArticles {
 
       for (let i = 0; i < domElements.length; i++) {
         const article = domElements[i];
+
+        if (!this.getLink(article)) {
+          continue;
+        }
+
         articles.push({
           title: this.getTitle(article),
           content: this.getContent(article),