Browse Source

Added isValidUrl check to decide when the link and image URL prefixes are applied

Pablo Barrera Yaksic 2 months ago
parent
commit
73b3460cb0
1 changed file with 15 additions and 2 deletions
  1. 15 2
      src/utils/scraper-articles.ts

+ 15 - 2
src/utils/scraper-articles.ts

@@ -49,7 +49,8 @@ export default class ScraperArticles {
   private getLink (article: AnyNode): string {
     const selector = this._options.linkSelector ?? "";
     const url = selector !== "" ? this.getProperty(article, selector, Props.LINK) : this._options.url
-    return this._options.linkPrefix !== undefined && ! url.includes(this._options.linkPrefix)
+
+    return ! this.isValidUrl(url) && this._options.linkPrefix !== undefined
      ? (this._options.linkPrefix + url.trim()).trim() 
      : url.trim();
   }
@@ -60,7 +61,7 @@ export default class ScraperArticles {
     let imgUrl = selector !== "" ? this.getProperty(article, selector, Props.IMAGE) : "";
 
     if (imgUrl !== "") {
-      if (this._options.imagePrefix && ! imgUrl.includes(this._options.imagePrefix)) {
+      if (! this.isValidUrl(imgUrl) && this._options.imagePrefix) {
         imgUrl = this._options.imagePrefix + imgUrl.trim();
       }
       imgFile = this._scraper.scrapeFile(imgUrl)
@@ -85,6 +86,18 @@ export default class ScraperArticles {
     return selector !== "" ? this.getProperty(article, selector).trim() : new Date(Date.now()).toLocaleDateString("es-CL");
   }
 
+  /**
+   * Reports whether `url` parses on its own (no base URL) with a protocol and a host; anything the `URL` constructor rejects counts as not valid.
+   */
+  private isValidUrl (url: string): boolean {
+    try {
+      const { protocol, host } = new URL(url);
+      return protocol !== "" && host !== "";
+    } catch {
+      return false;
+    }
+  }
+
   public async getArticles (): Promise<IArticle[]> {
     const articles: IArticle[] = [];
     const startTime = Date.now();