Browse Source

added new sources

Pablo Barrera Yaksic 11 months ago
parent
commit
146cc9abda

+ 5 - 3
.env.example

@@ -1,10 +1,12 @@
 LOG_LEVEL = "debug"
 REDIS_CONN = "redis://localhost:6379/10"
 PUPPETEER_URL = "http://localhost:8000/api/v1/visit"
-MASTODON_URL = "https://mastodon.cl/api/v1/"
-MASTODON_ACCESS_TOKEN = "J9uN5pjRDg8d2tlL8RRrlD7lwu16Mquc0KaTetiPN_8"
+MASTODON_URL = "https://<mastodon-domain>/api/v1/"
+MASTODON_ACCESS_TOKEN = "<access-token>"
 IMG_PLACEHOLDER = "https://placehold.co/600x400"
 
 THECLINIC = "https://www.theclinic.cl/lo-ultimo/"
 LATERCERA = "https://www.latercera.com/canal/nacional/"
-EMOL = "https://www.emol.com/nacional/"
+EMOL = "https://www.emol.com/nacional/"
+ELCIUDADANO = "https://www.elciudadano.com/chile/"
+ELDESCONCIERTO = "https://www.eldesconcierto.cl/nacional/"

+ 3 - 2
.gitignore

@@ -1,5 +1,6 @@
 node_modules
+dist
 .env
+.env.production
 .build
-.serverless
-dist
+.serverless

+ 1 - 1
package.json

@@ -1,6 +1,6 @@
 {
   "name": "bot-noticias",
-  "version": "0.0.1",
+  "version": "0.0.2",
   "description": "Bot que busca noticias y las replica en mastodon.cl",
   "main": "dist/index.js",
   "scripts": {

+ 6 - 1
serverless.yml

@@ -1,16 +1,21 @@
-service: bot-news
+service: bot-noticias
 frameworkVersion: '3'
+useDotenv: true
 
 provider:
   name: aws
   runtime: nodejs20.x
   region: us-east-1
+  timeout: 60
 
 plugins:
   - serverless-offline
   - serverless-plugin-typescript
+  - serverless-dotenv-plugin
 
 functions:
   - ${file(./src/portales/theclinic/definition.yml)}
   - ${file(./src/portales/emol/definition.yml)}
   - ${file(./src/portales/latercera/definition.yml)}
+  - ${file(./src/portales/elciudadano/definition.yml)}
+  - ${file(./src/portales/eldesconcierto/definition.yml)}

+ 3 - 1
src/config.ts

@@ -8,7 +8,9 @@ const config = {
   // PORTALES
   THECLINIC: process.env.THECLINIC ?? "https://www.theclinic.cl/lo-ultimo/",
   LATERCERA: process.env.LATERCERA ?? "https://www.latercera.com/canal/nacional/",
-  EMOL: process.env.EMOL_URL ?? "https://www.emol.com/nacional/"
+  EMOL: process.env.EMOL ?? "https://www.emol.com/nacional/",
+  ELCIUDADANO: process.env.ELCIUDADANO ?? "https://www.elciudadano.com/chile/",
+  ELDESCONCIERTO: process.env.ELDESCONCIERTO ?? "https://www.eldesconcierto.cl/nacional/"
 };
 
 export default config;

+ 31 - 0
src/index.ts

@@ -0,0 +1,31 @@
+import { type Context } from "aws-lambda";
+
+import { handler as elciudadano } from "./portales/elciudadano/handler";
+import { handler as eldesconcierto } from "./portales/eldesconcierto/handler";
+import { handler as emol } from "./portales/emol/handler";
+import { handler as latercera } from "./portales/latercera/handler";
+import { handler as theclinic } from "./portales/theclinic/handler";
+
+// Minimal stand-in for the Lambda context so the handlers can be invoked
+// directly from this script. Every value is a placeholder.
+const context: Context = {
+  callbackWaitsForEmptyEventLoop: false,
+  functionName: "",
+  functionVersion: "$LATEST",
+  invokedFunctionArn: "",
+  memoryLimitInMB: "0",
+  awsRequestId: "",
+  logGroupName: "",
+  logStreamName: "",
+  done: () => {},
+  fail: () => {},
+  succeed: () => {},
+  getRemainingTimeInMillis: () => 1
+};
+
+// Start every portal handler concurrently.
+const theCalls = Promise.all([
+  elciudadano(null, context, () => {}),
+  eldesconcierto(null, context, () => {}),
+  emol(null, context, () => {}),
+  latercera(null, context, () => {}),
+  theclinic(null, context, () => {})
+]);
+
+// Report only once all handlers have resolved; logging `theCalls` directly
+// would print a pending Promise before any work has finished.
+theCalls
+  .then((results) => {
+    console.log("The end.", results);
+  })
+  .catch((err: unknown) => {
+    console.error(err instanceof Error ? err.message : err);
+    process.exitCode = 1;
+  });

+ 1 - 1
src/interfaces/article.ts

@@ -2,7 +2,7 @@ export interface IArticle {
   title: string
   content: string
   link: string
-  image: string
+  image: File | null
   author: string
   date: string
 }

+ 4 - 0
src/portales/elciudadano/definition.yml

@@ -0,0 +1,4 @@
+elciudadano: # function entry, pulled into serverless.yml through ${file(...)}
+  handler: ./src/portales/elciudadano/handler.handler # <module path>.<exported function name>
+  events: # what triggers the function
+    - schedule: rate(1 hour) # invoke once every hour

+ 78 - 0
src/portales/elciudadano/handler.ts

@@ -0,0 +1,78 @@
+import type { Handler } from "aws-lambda";
+import { createRestAPIClient } from "masto";
+import "dotenv/config";
+
+import ScraperArticles from "../../utils/scraper-articles";
+import RedisClient from "../../libs/redis-client";
+
+import config from "../../config";
+import LogLevels from "../../enums/log-levels";
+import Emojis from "../../enums/emojis";
+
+/**
+ * Lambda handler for the "El Ciudadano" portal.
+ *
+ * Scrapes the article list from `config.ELCIUDADANO`, skips every article for
+ * which `redisClient.retrieve(link)` already returns a value, posts the rest
+ * to Mastodon and stores each posted link in Redis with a 24 hour expiration.
+ *
+ * Errors are logged and swallowed, so the handler always resolves.
+ *
+ * @param event - Invocation event; only logged on error when LOG_LEVEL is debug.
+ * @param context - Lambda context; only logged on error when LOG_LEVEL is debug.
+ * @returns The literal string "The End.".
+ */
+export const handler: Handler = async (event, context) => {
+  const name = "El Ciudadano";
+  try {
+    const redisClient = new RedisClient();
+    const mastodon = createRestAPIClient({
+      url: config.MASTODON_URL,
+      accessToken: config.MASTODON_ACCESS_TOKEN
+    });
+    const scraperArticles = new ScraperArticles(name, {
+      url: config.ELCIUDADANO,
+      articlesSelector: "article",
+      titleSelector: "div div.col-md-7 a h3",
+      contentSelector: "div div.col-md-7 a p",
+      linkSelector: "div div.col-md-7 a",
+      imageSelector: "div div.col-md-5 img",
+      authorSelector: "",
+      dateSelector: ""
+    });
+
+    const articles = await scraperArticles.getArticles();
+    if (config.LOG_LEVEL === LogLevels.DEBUG) {
+      console.log("Articles", articles);
+    }
+
+    let totalPublished = 0;
+
+    // Walk the list from last to first so the posts appear in the correct
+    // order once published. The index must start at the last valid position
+    // (length - 1) and stop after index 0.
+    for (let i = articles.length - 1; i >= 0; i--) {
+      const article = articles[i];
+      const exists = await redisClient.retrieve(article.link);
+      if (exists !== null) {
+        continue;
+      }
+
+      const date = new Date(Date.now()).toLocaleDateString();
+      let message = `${Emojis.NEWS} ${article.title}.\n\n${article.content}\n${article.link}`;
+
+      if (message.trim().length === 0) {
+        continue;
+      }
+
+      // Keep the text part within 400 characters (397 + "...") and always
+      // append the full link afterwards so it is never cut.
+      if (message.length > 400) {
+        message = `${Emojis.NEWS} ${article.title}.\n\n${article.content}`.substring(0, 397) + "...";
+        message = `${message}\n${article.link}`;
+      }
+
+      console.log("Sending", message);
+
+      await mastodon.v1.statuses.create({ status: message });
+      await redisClient.store(article.link, date, { EX: 60 * 60 * 24 }); // EX: 24 hrs expiration
+      totalPublished++;
+    }
+    console.log(`Published ${totalPublished} new articles`);
+  } catch (err: unknown) {
+    console.log('An error has occurred\n')
+    // Anything can be thrown; only Error instances are guaranteed a message.
+    console.error(err instanceof Error ? err.message : err);
+    if (config.LOG_LEVEL === LogLevels.DEBUG) {
+      console.debug("\nEvent\n");
+      console.debug(event);
+      console.debug("\nContext\n");
+      console.debug(context);
+    }
+  }
+
+  return "The End.";
+};

+ 4 - 0
src/portales/eldesconcierto/definition.yml

@@ -0,0 +1,4 @@
+eldesconcierto: # function entry, pulled into serverless.yml through ${file(...)}
+  handler: ./src/portales/eldesconcierto/handler.handler # <module path>.<exported function name>
+  events: # what triggers the function
+    - schedule: rate(1 hour) # invoke once every hour

+ 73 - 0
src/portales/eldesconcierto/handler.ts

@@ -0,0 +1,73 @@
+import type { Handler } from "aws-lambda";
+import { createRestAPIClient } from "masto";
+import "dotenv/config";
+
+import ScraperArticles from "../../utils/scraper-articles";
+import RedisClient from "../../libs/redis-client";
+
+import config from "../../config";
+import LogLevels from "../../enums/log-levels";
+import Emojis from "../../enums/emojis";
+
+/**
+ * Lambda handler for the "El Desconcierto" portal.
+ *
+ * Scrapes the article list from `config.ELDESCONCIERTO`, skips every article
+ * for which `redisClient.retrieve(link)` already returns a value, posts the
+ * rest to Mastodon (title plus link) and stores each posted link in Redis with
+ * a 24 hour expiration.
+ *
+ * Errors are logged and swallowed, so the handler always resolves.
+ *
+ * @param event - Invocation event; only logged on error when LOG_LEVEL is debug.
+ * @param context - Lambda context; only logged on error when LOG_LEVEL is debug.
+ * @returns The literal string "The End.".
+ */
+export const handler: Handler = async (event, context) => {
+  const name = "El Desconcierto";
+  try {
+    const redisClient = new RedisClient();
+    const mastodon = createRestAPIClient({
+      url: config.MASTODON_URL,
+      accessToken: config.MASTODON_ACCESS_TOKEN
+    });
+    const scraperArticles = new ScraperArticles(name, {
+      url: config.ELDESCONCIERTO,
+      articlesSelector: "div.the-section__rows figure",
+      titleSelector: "figcaption h2",
+      contentSelector: "figcaption h2",
+      linkSelector: "a",
+      imageSelector: "a img",
+      authorSelector: "",
+      dateSelector: ""
+    });
+
+    const articles = await scraperArticles.getArticles();
+    if (config.LOG_LEVEL === LogLevels.DEBUG) {
+      console.log("Articles", articles);
+    }
+
+    let totalPublished = 0;
+
+    // Walk the list from last to first so the posts appear in the correct
+    // order once published. The index must start at the last valid position
+    // (length - 1) and stop after index 0.
+    for (let i = articles.length - 1; i >= 0; i--) {
+      const article = articles[i];
+      const exists = await redisClient.retrieve(article.link);
+      if (exists !== null) {
+        continue;
+      }
+
+      const date = new Date(Date.now()).toLocaleDateString();
+      const message = `${Emojis.NEWS} ${article.title} \n${article.link}`;
+
+      if (message.trim().length === 0) {
+        continue;
+      }
+
+      console.log("Sending", message);
+
+      await mastodon.v1.statuses.create({ status: message });
+      await redisClient.store(article.link, date, { EX: 60 * 60 * 24 }); // EX: 24 hrs expiration
+      totalPublished++;
+    }
+    console.log(`Published ${totalPublished} new articles`);
+  } catch (err: unknown) {
+    console.log('An error has occurred\n')
+    // Anything can be thrown; only Error instances are guaranteed a message.
+    console.error(err instanceof Error ? err.message : err);
+    if (config.LOG_LEVEL === LogLevels.DEBUG) {
+      console.debug("\nEvent\n");
+      console.debug(event);
+      console.debug("\nContext\n");
+      console.debug(context);
+    }
+  }
+
+  return "The End.";
+};

+ 15 - 8
src/portales/emol/handler.ts

@@ -2,7 +2,6 @@ import type { Handler } from "aws-lambda";
 import { createRestAPIClient } from "masto";
 import "dotenv/config";
 
-import Scraper from "../../utils/scraper";
 import ScraperArticles from "../../utils/scraper-articles";
 import RedisClient from "../../libs/redis-client";
 
@@ -18,7 +17,6 @@ export const handler: Handler = async (event, context) => {
       url: config.MASTODON_URL,
       accessToken: config.MASTODON_ACCESS_TOKEN
     });
-    const scraper = new Scraper();
     const scraperArticles = new ScraperArticles(name, {
       url: config.EMOL,
       articlesSelector: "div.cont_378_e_2015 div.col_center_noticias_item div.col_center_noticia4dest-360px",
@@ -35,8 +33,11 @@ export const handler: Handler = async (event, context) => {
       console.log("Articles", articles);
     }
 
+    let totalPublished = 0;
     const length = articles.length;
-    for (let i = 0; i < length; i++) {
+
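+    // Order has to be reversed to appear in the correct order when posting. NOTE(review): as written (i = length; i > length) the loop never runs; it needs i = length - 1; i >= 0.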
+    for (let i = length; i > length; i--) {
       const article = articles[i];
       const exists = await redisClient.retrieve(article.link);
       if (exists !== null) {
@@ -44,19 +45,25 @@ export const handler: Handler = async (event, context) => {
       }
 
       const date = new Date(Date.now()).toLocaleDateString();
-      const message = `${Emojis.NEWS} ${article.title} \nhttps:${article.link}`;
+      const message = `${Emojis.NEWS} ${article.title}\nhttps:${article.link}`;
       const mediaIds = [""];
 
-      const imgFile = await scraper.scrapeImage(article.image);
-      if (imgFile !== null) {
-        const media = await mastodon.v2.media.create({ file: imgFile, description: article.content });
+      if (message.trim().length === 0) {
+        continue;
+      }
+
+      if (article.image !== null) {
+        const media = await mastodon.v2.media.create({ file: article.image, description: article.title });
         mediaIds.push(media.id);
       }
 
+      console.log("Sending", message);
+
       await mastodon.v1.statuses.create({ status: message, mediaIds });
       await redisClient.store(article.link, date, { EX: 60 * 60 * 24 }); // EX: 24 hrs expiration
+      totalPublished++
     }
-    console.log(`Published ${length} new articles`);
+    console.log(`Published ${totalPublished} new articles`);
   } catch (err: any) {
     console.log('An error has occurred\n')
     console.error(err.message);

+ 13 - 12
src/portales/latercera/handler.ts

@@ -2,7 +2,6 @@ import type { Handler } from "aws-lambda";
 import { createRestAPIClient } from "masto";
 import "dotenv/config";
 
-// import Scraper from "../../utils/scraper";
 import ScraperArticles from "../../utils/scraper-articles";
 import RedisClient from "../../libs/redis-client";
 
@@ -36,8 +35,11 @@ export const handler: Handler = async (event, context) => {
       console.log("Articles", articles);
     }
 
+    let totalPublished = 0;
     const length = articles.length;
-    for (let i = 0; i < length; i++) {
+
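+    // Order has to be reversed to appear in the correct order when posting. NOTE(review): as written (i = length; i > length) the loop never runs; it needs i = length - 1; i >= 0.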
+    for (let i = length; i > length; i--) {
       const article = articles[i];
       const exists = await redisClient.retrieve(article.link);
       if (exists !== null) {
@@ -45,20 +47,19 @@ export const handler: Handler = async (event, context) => {
       }
 
       const date = new Date(Date.now()).toLocaleDateString();
-      const message = `${Emojis.NEWS} ${article.title} \n${baseDomain}${article.link}`;
-      const mediaIds = [""];
+      const message = `${Emojis.NEWS} ${article.title}\n${baseDomain}${article.link}`;
+
+      if (message.trim().length === 0) {
+        continue;
+      }
 
-      // Since Mastodon is able to read metas this is not necessary
-      // const imgFile = await scraper.scrapeImage(article.image);
-      // if (imgFile !== null) {
-      //   const media = await mastodon.v2.media.create({ file: imgFile, description: article.content });
-      //   mediaIds.push(media.id);
-      // }
+      console.log("Sending", message);
 
-      await mastodon.v1.statuses.create({ status: message, mediaIds });
+      await mastodon.v1.statuses.create({ status: message });
       await redisClient.store(article.link, date, { EX: 60 * 60 * 24 }); // EX: 24 hrs expiration
+      totalPublished++
     }
-    console.log(`Published ${length} new articles`);
+    console.log(`Published ${totalPublished} new articles`);
   } catch (err: any) {
     console.log('An error has occurred\n')
     console.error(err.message);

+ 13 - 13
src/portales/theclinic/handler.ts

@@ -2,7 +2,6 @@ import type { Handler } from "aws-lambda";
 import { createRestAPIClient } from "masto";
 import "dotenv/config";
 
-// import Scraper from "../../utils/scraper";
 import ScraperArticles from "../../utils/scraper-articles";
 import RedisClient from "../../libs/redis-client";
 
@@ -18,7 +17,6 @@ export const handler: Handler = async (event, context) => {
       url: config.MASTODON_URL,
       accessToken: config.MASTODON_ACCESS_TOKEN
     });
-    // const scraper = new Scraper();
     const scraperArticles = new ScraperArticles(name, {
       url: config.THECLINIC,
       articlesSelector: ".listado article",
@@ -35,8 +33,11 @@ export const handler: Handler = async (event, context) => {
       console.log("Articles", articles);
     }
 
+    let totalPublished = 0;
     const length = articles.length;
-    for (let i = 0; i < length; i++) {
+
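+    // Order has to be reversed to appear in the correct order when posting. NOTE(review): as written (i = length; i > length) the loop never runs; it needs i = length - 1; i >= 0.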
+    for (let i = length; i > length; i--) {
       const article = articles[i];
       const exists = await redisClient.retrieve(article.link);
       if (exists !== null) {
@@ -44,20 +45,19 @@ export const handler: Handler = async (event, context) => {
       }
 
       const date = new Date(Date.now()).toLocaleDateString();
-      const message = `${Emojis.NEWS} ${article.title} \n${article.link}`;
-      const mediaIds = [""];
+      const message = `${Emojis.NEWS} ${article.title}\n${article.link}`;
+
+      if (message.trim().length === 0) {
+        continue;
+      }
 
-      // Since Mastodon is able to read metas this is not necessary
-      // const imgFile = await scraper.scrapeImage(article.image);
-      // if (imgFile !== null) {
-      //   const media = await mastodon.v2.media.create({ file: imgFile, description: article.content });
-      //   mediaIds.push(media.id);
-      // }
+      console.log("Sending", message);
 
-      await mastodon.v1.statuses.create({ status: message, mediaIds });
+      await mastodon.v1.statuses.create({ status: message });
       await redisClient.store(article.link, date, { EX: 60 * 60 * 24 }); // EX: 24 hrs expiration
+      totalPublished++
     }
-    console.log(`Publicated ${length} new articles`);
+    console.log(`Publicated ${totalPublished} new articles`);
   } catch (err: any) {
     console.log('An error has occurred\n')
     console.error(err.message);

+ 21 - 6
src/utils/scraper-articles.ts

@@ -50,9 +50,22 @@ export default class ScraperArticles {
     return selector !== "" ? this.getProperty(article, selector, Props.LINK) : this._options.url;
   }
 
+  /**
+   * Downloads the image of an article node.
+   *
+   * The image URL is read with `getProperty` using the configured
+   * `imageSelector`; when no selector is configured or the URL is empty there
+   * is nothing to download.
+   *
+   * @param article - DOM node of a single article.
+   * @returns The downloaded file, or `null` when there is no image URL or the
+   *          download fails. Never `undefined`, so callers can rely on a
+   *          `!== null` check.
+   */
-  private getImage (article: AnyNode): string {
+  private async getImage (article: AnyNode): Promise<File | null> {
     const selector = this._options.imageSelector ?? "";
-    return selector !== "" ? this.getProperty(article, selector, Props.IMAGE) : config.IMG_PLACEHOLDER;
+    const imgUrl = selector !== "" ? this.getProperty(article, selector, Props.IMAGE) : "";
+
+    // No image URL available: report "no image" explicitly.
+    if (imgUrl === "") {
+      return null;
+    }
+
+    try {
+      return await this._scraper.scrapeFile(imgUrl);
+    } catch (err: unknown) {
+      if (config.LOG_LEVEL === LogLevels.DEBUG) {
+        console.debug(`${this._name} | Error raised\n`);
+        console.debug(`From ${article.type} can't get image using selector '${selector}'`);
+        console.error(err instanceof Error ? err.message : err);
+      }
+      return null;
+    }
   }
 
   private getAuthor (article: AnyNode): string {
@@ -62,7 +75,7 @@ export default class ScraperArticles {
 
+  /**
+   * Resolves the date of an article node.
+   *
+   * @param article - DOM node of a single article.
+   * @returns The trimmed value `getProperty` yields for the configured
+   *          `dateSelector`, or today's date formatted with the "es-CL"
+   *          locale when no selector is configured.
+   */
   private getDate (article: AnyNode): string {
     const selector = this._options.dateSelector ?? "";
-    return selector !== "" ? this.getProperty(article, selector).trim() : new Date(Date.now()).toLocaleDateString();
+    return selector !== "" ? this.getProperty(article, selector).trim() : new Date(Date.now()).toLocaleDateString("es-CL");
   }
 
   public async getArticles (): Promise<IArticle[]> {
@@ -70,6 +83,7 @@ export default class ScraperArticles {
     const startTime = Date.now();
 
     try {
+      console.info("Starting scraping", this._options);
       const response = await this._scraper.scrape({ url: this._options.url });
       const html = response.data.data.html;
 
@@ -80,16 +94,17 @@ export default class ScraperArticles {
         console.debug(`${this._name} | Articles obtained: ${domElements.length} `);
       }
 
-      domElements.each((i, article) => {
+      for (let i = 0; i < domElements.length; i++) {
+        const article = domElements[i];
         articles.push({
           title: this.getTitle(article),
           content: this.getContent(article),
           link: this.getLink(article),
-          image: this.getImage(article),
+          image: await this.getImage(article),
           author: this.getAuthor(article),
           date: this.getDate(article)
         });
-      });
+      }
     } catch (err) {
       if (config.LOG_LEVEL === LogLevels.DEBUG) {
         console.debug(`${this._name} | Error raised\n`);

+ 8 - 8
src/utils/scraper.ts

@@ -38,21 +38,21 @@ export default class Scraper {
     return response;
   }
 
+  /**
+   * Downloads the resource at `url` and wraps it in a `File`.
+   *
+   * The file name is the last "/"-separated segment of the URL and the file
+   * keeps the MIME type reported for the downloaded blob.
+   *
+   * @param url - Absolute URL of the resource to download.
+   * @returns The downloaded file, or `null` when the request fails or answers
+   *          with a non-successful HTTP status (the error message is logged).
+   */
-  public async scrapeImage (url: string): Promise<File | null> {
+  public async scrapeFile (url: string): Promise<File | null> {
     this._options = { url };
-    let image: File | null = null;
+    let file: File | null = null;
 
     try {
-      const file = await fetch(url);
-      const blob = await file.blob();
+      const response = await fetch(url);
+      // fetch() only rejects on network errors; treat HTTP error statuses as
+      // failures too, otherwise an error page would be returned as the file.
+      if (!response.ok) {
+        throw new Error(`Request to ${url} failed with status ${response.status}`);
+      }
+      const blob = await response.blob();
 
-      const imgUrlParts = url.split("/");
-      const imgName = imgUrlParts[imgUrlParts.length - 1];
-      image = new File([blob], imgName);
+      const fileUrlParts = url.split("/");
+      const fileName = fileUrlParts[fileUrlParts.length - 1];
+      // Carry the blob's MIME type over; File defaults to an empty type.
+      file = new File([blob], fileName, { type: blob.type });
     } catch (err: any) {
       console.error(err.message);
     }
 
-    return image;
+    return file;
   }
 }