From 8406c09a948853dd37081fc3e2615f6fffcec521 Mon Sep 17 00:00:00 2001
From: Antonio Ciolino
Date: Fri, 16 Jun 2023 13:53:00 -0400
Subject: [PATCH 1/3] Enable web scraping based on a url and a simple filter.

---
 collector/main.py         |  6 +++++-
 collector/scripts/link.py | 26 +++++++++++++++++++++++++-
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/collector/main.py b/collector/main.py
index cd800eb9e30..bb0054b92f1 100644
--- a/collector/main.py
+++ b/collector/main.py
@@ -1,7 +1,7 @@
 import os
 from InquirerPy import inquirer
 from scripts.youtube import youtube
-from scripts.link import link, links
+from scripts.link import link, links, crawler
 from scripts.substack import substack
 from scripts.medium import medium
 from scripts.gitbook import gitbook
@@ -42,6 +42,7 @@ def main():
       choices=[
         {"name": "Single URL", "value": "Single URL"},
         {"name": "Multiple URLs", "value": "Multiple URLs"},
+        {"name": "URL Crawler", "value": "URL Crawler"},
         {"name": "Abort", "value": "Abort"},
       ],
     ).execute()
@@ -51,6 +52,9 @@ def main():
   if method == 'Multiple URLs':
     links()
     exit(0)
+  if method == 'URL Crawler':
+    crawler()
+    exit(0)
   if method == 'Abort':
     exit(0)
   if method == 'YouTube Channel':
diff --git a/collector/scripts/link.py b/collector/scripts/link.py
index 8bcc02e0e58..17a532cb0cc 100644
--- a/collector/scripts/link.py
+++ b/collector/scripts/link.py
@@ -4,6 +4,8 @@
 from langchain.document_loaders import UnstructuredHTMLLoader
 from .link_utils import append_meta
 from .utils import tokenize, ada_v2_cost
+import requests
+from bs4 import BeautifulSoup
 
 # Example Channel URL https://tim.blog/2022/08/09/nft-insider-trading-policy/
 def link():
@@ -64,6 +66,29 @@ def link():
     print(f"////////////////////////////")
     exit(0)
 
+def crawler():
+  prompt = "Paste in root URI of the pages of interest: "
+  new_link = input(prompt)
+  filter_value = input("Add a filter value for the url to ensure links don't wander too far: ")
+  #extract this from the uri provided
+  root_site = urlparse(new_link).scheme + "://" + urlparse(new_link).hostname
+  links = []
+  urls = new_link
+  links.append(new_link)
+  grab = requests.get(urls)
+  soup = BeautifulSoup(grab.text, 'html.parser')
+
+  # traverse paragraphs from soup
+  for link in soup.find_all("a"):
+    data = link.get('href').strip()
+    if filter_value in data:
+      print (data)
+      links.append(root_site + data)
+    else:
+      print (data + " does not apply for linking...")
+  #parse the links found
+  parse_links(links)
+
 def links():
   links = []
   prompt = "Paste in the URL of an online article or blog: "
@@ -86,7 +111,6 @@ def links():
 
   parse_links(links)
 
-
 # parse links from array
 def parse_links(links):
   totalTokens = 0

From b262198d5327de23db6d8396ae19d0dd75b337b3 Mon Sep 17 00:00:00 2001
From: Antonio Ciolino
Date: Mon, 19 Jun 2023 09:52:21 -0400
Subject: [PATCH 2/3] ignore yarn

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 0725f47c0da..a1d96b6e2b1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,4 @@ __pycache__
 v-env
 .DS_Store
 aws_cf_deploy_anything_llm.json
-
+yarn.lock
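A note on the fix that follows: BeautifulSoup's Tag.get('href') returns None when an anchor carries no href attribute, so the crawler's original data = link.get('href').strip() raises AttributeError ('NoneType' object has no attribute 'strip') on any page containing, say, <a name="top">. That is the NoneType error patch 3 below addresses. A minimal standalone illustration of the failure mode and the guard, assuming nothing beyond bs4 itself:

    from bs4 import BeautifulSoup

    html = '<a name="top">no href here</a><a href="/2022/08/09/post/">ok</a>'
    soup = BeautifulSoup(html, "html.parser")

    for link in soup.find_all("a"):
        data = link.get("href")   # None for the first anchor
        if data is not None:      # the guard patch 3 introduces
            print(data.strip())   # only reached for anchors with a real href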
From b3e35c8cd236a7635c1ee41899ed34167f3ffe8c Mon Sep 17 00:00:00 2001
From: Antonio Ciolino
Date: Mon, 19 Jun 2023 12:56:32 -0400
Subject: [PATCH 3/3] Updated Link scraper to avoid NoneType error.

---
 collector/scripts/link.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/collector/scripts/link.py b/collector/scripts/link.py
index 17a532cb0cc..2bc604e99c1 100644
--- a/collector/scripts/link.py
+++ b/collector/scripts/link.py
@@ -80,12 +80,14 @@ def crawler():
 
   # traverse paragraphs from soup
   for link in soup.find_all("a"):
-    data = link.get('href').strip()
-    if filter_value in data:
-      print (data)
-      links.append(root_site + data)
-    else:
-      print (data + " does not apply for linking...")
+    data = link.get('href')
+    if (data is not None):
+      if filter_value in data:
+        data = data.strip()
+        print (data)
+        links.append(root_site + data)
+      else:
+        print (data + " does not apply for linking...")
 
   #parse the links found
   parse_links(links)
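With that guard in place, the crawler's core logic distills to the sketch below. The name crawl, its parameters, and the returned list are illustrative only; the committed crawler() gathers its inputs via input() and hands the collected links straight to parse_links():

    from urllib.parse import urlparse

    import requests
    from bs4 import BeautifulSoup

    def crawl(root_url, filter_value):
        # Rebuild the scheme://host prefix used to absolutize kept hrefs.
        parsed = urlparse(root_url)
        root_site = parsed.scheme + "://" + parsed.hostname

        links = [root_url]
        page = requests.get(root_url)
        soup = BeautifulSoup(page.text, "html.parser")

        for anchor in soup.find_all("a"):
            href = anchor.get("href")
            # Skip href-less anchors (patch 3) and off-topic links (the filter).
            if href is not None and filter_value in href:
                links.append(root_site + href.strip())
        return links

For example, crawl("https://tim.blog/2022/08/09/nft-insider-trading-policy/", "/2022/") would collect same-host post links (the URL comes from the comment in link.py; the filter value is illustrative). One caveat in the committed version: every matching href is prefixed with root_site unconditionally, so absolute URLs that pass the filter come out with the host doubled; the filter value therefore works best against relative paths.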