From ffa7453975390e7fe8744478870fa8de25b1d060 Mon Sep 17 00:00:00 2001
From: Skid Vis
Date: Wed, 14 Jun 2023 10:04:18 -0500
Subject: [PATCH 1/4] Adds ability to import sitemaps to include a whole website

---
 collector/main.py            |   8 ++-
 collector/scripts/link.py    | 123 ++++++++++++++++++++---------------
 collector/scripts/sitemap.py |  26 ++++++++
 3 files changed, 102 insertions(+), 55 deletions(-)
 create mode 100644 collector/scripts/sitemap.py

diff --git a/collector/main.py b/collector/main.py
index efb05db12e3..cd800eb9e30 100644
--- a/collector/main.py
+++ b/collector/main.py
@@ -5,6 +5,7 @@ from scripts.substack import substack
 from scripts.medium import medium
 from scripts.gitbook import gitbook
+from scripts.sitemap import sitemap
 
 def main():
   if os.name == 'nt':
@@ -13,7 +14,8 @@ def main():
       '2': 'Article or Blog Link',
       '3': 'Substack',
       '4': 'Medium',
-      '5': 'Gitbook'
+      '5': 'Gitbook',
+      '6': 'Sitemap',
     }
     print("There are options for data collection to make this easier for you.\nType the number of the method you wish to execute.")
     print("1. YouTube Channel\n2. Article or Blog Link (Single)\n3. Substack\n4. Medium\n\n[In development]:\nTwitter\n\n")
@@ -29,6 +31,7 @@ def main():
       {"name": "Article or Blog Link(s)", "value": "Article or Blog Link(s)"},
       {"name": "Gitbook", "value": "Gitbook"},
       {"name": "Twitter", "value": "Twitter", "disabled": "Needs PR"},
+      {"name": "Sitemap", "value": "Sitemap"},
       {"name": "Abort", "value": "Abort"},
     ],
   ).execute()
@@ -62,6 +65,9 @@ def main():
   if method == 'Gitbook':
     gitbook()
     exit(0)
+  if method == 'Sitemap':
+    sitemap()
+    exit(0)
 
   print("Selection was not valid.")
   exit(1)
diff --git a/collector/scripts/link.py b/collector/scripts/link.py
index 1f03a8f33c7..0dad18c6818 100644
--- a/collector/scripts/link.py
+++ b/collector/scripts/link.py
@@ -4,6 +4,7 @@
 from langchain.document_loaders import UnstructuredHTMLLoader
 from .link_utils import append_meta
 from .utils import tokenize, ada_v2_cost
+from requests.exceptions import ReadTimeout
 
 # Example Channel URL https://tim.blog/2022/08/09/nft-insider-trading-policy/
 def link():
@@ -83,57 +84,71 @@ def links():
         print("No valid links provided!")
         exit(1)
 
-    totalTokens = 0
-    for link in links:
-        print(f"Working on {link}...")
-        session = HTMLSession()
-        req = session.get(link)
-        if(req.ok == False):
-            print(f"Could not reach {link} - skipping!")
-            continue
-
-        req.html.render()
-        full_text = None
-        with tempfile.NamedTemporaryFile(mode = "w") as tmp:
-            tmp.write(req.html.html)
-            tmp.seek(0)
-            loader = UnstructuredHTMLLoader(tmp.name)
-            data = loader.load()[0]
-            full_text = data.page_content
-            tmp.close()
-
-        link = append_meta(req, full_text, True)
-        if(len(full_text) > 0):
-            source = urlparse(req.url)
-            output_filename = f"website-{source.netloc}-{source.path.replace('/','_')}.json"
-            output_path = f"./outputs/website-logs"
-
-            transaction_output_filename = f"article-{source.path.replace('/','_')}.json"
-            transaction_output_dir = f"../server/storage/documents/website-{source.netloc}"
-
-            if os.path.isdir(output_path) == False:
-                os.makedirs(output_path)
-
-            if os.path.isdir(transaction_output_dir) == False:
-                os.makedirs(transaction_output_dir)
-
-            full_text = append_meta(req, full_text)
-            tokenCount = len(tokenize(full_text))
-            link['pageContent'] = full_text
-            link['token_count_estimate'] = tokenCount
-            totalTokens += tokenCount
-
-            with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:
-                json.dump(link, file, ensure_ascii=True, indent=4)
-
-            with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
-                json.dump(link, file, ensure_ascii=True, indent=4)
-        else:
-            print(f"Could not parse any meaningful data from {link}.")
-            continue
-
-    print(f"\n\n[Success]: {len(links)} article or link contents fetched!")
-    print(f"////////////////////////////")
-    print(f"Your estimated cost to embed this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens will cost {ada_v2_cost(totalTokens)} using {totalTokens} tokens.")
-    print(f"////////////////////////////")
-    exit(0)
\ No newline at end of file
+    parse_links(links)
+
+
+
+# parse links from array
+def parse_links(links):
+    totalTokens = 0
+    for link in links:
+        if link.endswith(".pdf"):
+            print(f"Skipping PDF file: {link}")
+            continue
+
+        print(f"Working on {link}...")
+        session = HTMLSession()
+
+        req = session.get(link, timeout=20)
+
+        if not req.ok:
+            print(f"Could not reach {link} - skipping!")
+            continue
+
+        req.html.render(timeout=10)
+
+        full_text = None
+        with tempfile.NamedTemporaryFile(mode="w") as tmp:
+            tmp.write(req.html.html)
+            tmp.seek(0)
+            loader = UnstructuredHTMLLoader(tmp.name)
+            data = loader.load()[0]
+            full_text = data.page_content
+            tmp.close()
+
+        link = append_meta(req, full_text, True)
+        if len(full_text) > 0:
+            source = urlparse(req.url)
+            output_filename = f"website-{source.netloc}-{source.path.replace('/','_')}.json"
+            output_path = f"./outputs/website-logs"
+
+            transaction_output_filename = f"article-{source.path.replace('/','_')}.json"
+            transaction_output_dir = f"../server/storage/documents/website-{source.netloc}"
+
+            if not os.path.isdir(output_path):
+                os.makedirs(output_path)
+
+            if not os.path.isdir(transaction_output_dir):
+                os.makedirs(transaction_output_dir)
+
+            full_text = append_meta(req, full_text)
+            tokenCount = len(tokenize(full_text))
+            link['pageContent'] = full_text
+            link['token_count_estimate'] = tokenCount
+            totalTokens += tokenCount
+
+            with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file:
+                json.dump(link, file, ensure_ascii=True, indent=4)
+
+            with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file:
+                json.dump(link, file, ensure_ascii=True, indent=4)
+
+            req.session.close()
+        else:
+            print(f"Could not parse any meaningful data from {link}.")
+            continue
+
+    print(f"\n\n[Success]: {len(links)} article or link contents fetched!")
+    print(f"////////////////////////////")
+    print(f"Your estimated cost to embed this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens will cost {ada_v2_cost(totalTokens)} using {totalTokens} tokens.")
+    print(f"////////////////////////////")
\ No newline at end of file
diff --git a/collector/scripts/sitemap.py b/collector/scripts/sitemap.py
new file mode 100644
index 00000000000..a3b4c23c410
--- /dev/null
+++ b/collector/scripts/sitemap.py
@@ -0,0 +1,26 @@
+import requests
+import xml.etree.ElementTree as ET
+from scripts.link import parse_links
+
+def parse_sitemap(url):
+    response = requests.get(url)
+    root = ET.fromstring(response.content)
+
+    urls = []
+    for element in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}url'):
+        for loc in element.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):
+            urls.append(loc.text)
+
+    return urls
+
+def sitemap():
+    sitemap_url = input("Enter the URL of the sitemap: ")
+
+    if(len(sitemap_url) == 0):
+        print("No valid sitemap provided!")
+        exit(1)
+
+    url_array = parse_sitemap(sitemap_url)
+
+    #parse links from array
+    parse_links(url_array)
From e43cac51b272a999d15672ef56fe20e3a0747ec2 Mon Sep 17 00:00:00 2001
From: Skid Vis
Date: Wed, 14 Jun 2023 12:30:22 -0500
Subject: [PATCH 2/4] Adds an example sitemap URL

---
 collector/scripts/sitemap.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/collector/scripts/sitemap.py b/collector/scripts/sitemap.py
index a3b4c23c410..3895bcefb81 100644
--- a/collector/scripts/sitemap.py
+++ b/collector/scripts/sitemap.py
@@ -13,6 +13,7 @@ def parse_sitemap(url):
 
     return urls
 
+# Example sitemap URL https://www.nerdwallet.com/blog/wp-sitemap-news-articles-1.xml
 def sitemap():
     sitemap_url = input("Enter the URL of the sitemap: ")
 

From ecb783d2943658b2b4dc0c4f5712b00768afe2e4 Mon Sep 17 00:00:00 2001
From: Skid Vis
Date: Wed, 14 Jun 2023 16:50:03 -0500
Subject: [PATCH 3/4] Adds filter to bypass common image and PDF formats

---
 collector/scripts/link.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/collector/scripts/link.py b/collector/scripts/link.py
index 0dad18c6818..742583bc987 100644
--- a/collector/scripts/link.py
+++ b/collector/scripts/link.py
@@ -4,7 +4,7 @@
 from langchain.document_loaders import UnstructuredHTMLLoader
 from .link_utils import append_meta
 from .utils import tokenize, ada_v2_cost
-from requests.exceptions import ReadTimeout
+import re
 
 # Example Channel URL https://tim.blog/2022/08/09/nft-insider-trading-policy/
 def link():
@@ -92,8 +92,8 @@ def links():
 def parse_links(links):
     totalTokens = 0
     for link in links:
-        if link.endswith(".pdf"):
-            print(f"Skipping PDF file: {link}")
+        if extensions_to_ignore(link):
+            print(f"Skipping PDF/Image file: {link}")
             continue
 
         print(f"Working on {link}...")
@@ -151,4 +151,12 @@ def parse_links(links):
     print(f"\n\n[Success]: {len(links)} article or link contents fetched!")
     print(f"////////////////////////////")
     print(f"Your estimated cost to embed this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens will cost {ada_v2_cost(totalTokens)} using {totalTokens} tokens.")
-    print(f"////////////////////////////")
\ No newline at end of file
+    print(f"////////////////////////////")
+
+def extensions_to_ignore(string):
+    image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.pdf']
+
+    pattern = r'\b(' + '|'.join(re.escape(ext) for ext in image_extensions) + r')\b'
+    match = re.search(pattern, string, re.IGNORECASE)
+
+    return match is not None
\ No newline at end of file
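[Editor's note, not part of the patch] The word-boundary regex in extensions_to_ignore() matches an extension anywhere in the URL, not only at the end: a page URL such as https://example.com/my.pdf-review/ would be skipped even though it serves HTML. A stricter sketch that tests only the path suffix — an alternative, not what this patch does:

from urllib.parse import urlparse

IGNORED_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.pdf')

def extensions_to_ignore(url):
    # Compare only the end of the URL path, so extension-like tokens
    # in the middle of a path do not trigger a skip.
    return urlparse(url).path.lower().endswith(IGNORED_EXTENSIONS)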
print(f"\n\n[Success]: {len(links)} article or link contents fetched!") print(f"////////////////////////////") print(f"Your estimated cost to embed this data using OpenAI's text-embedding-ada-002 model at $0.0004 / 1K tokens will cost {ada_v2_cost(totalTokens)} using {totalTokens} tokens.") - print(f"////////////////////////////") - -def extensions_to_ignore(string): - image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.pdf'] - - pattern = r'\b(' + '|'.join(re.escape(ext) for ext in image_extensions) + r')\b' - match = re.search(pattern, string, re.IGNORECASE) - - return match is not None \ No newline at end of file + print(f"////////////////////////////") \ No newline at end of file diff --git a/collector/scripts/sitemap.py b/collector/scripts/sitemap.py index 3895bcefb81..e780bd9cc9f 100644 --- a/collector/scripts/sitemap.py +++ b/collector/scripts/sitemap.py @@ -1,6 +1,7 @@ import requests import xml.etree.ElementTree as ET from scripts.link import parse_links +import re def parse_sitemap(url): response = requests.get(url) @@ -9,7 +10,10 @@ def parse_sitemap(url): urls = [] for element in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}url'): for loc in element.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'): - urls.append(loc.text) + if not has_extension_to_ignore(loc.text): + urls.append(loc.text) + else: + print(f"Skipping filetype: {loc.text}") return urls @@ -25,3 +29,11 @@ def sitemap(): #parse links from array parse_links(url_array) + +def has_extension_to_ignore(string): + image_extensions = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.pdf'] + + pattern = r'\b(' + '|'.join(re.escape(ext) for ext in image_extensions) + r')\b' + match = re.search(pattern, string, re.IGNORECASE) + + return match is not None \ No newline at end of file