From 8406c09a948853dd37081fc3e2615f6fffcec521 Mon Sep 17 00:00:00 2001
From: Antonio Ciolino
Date: Fri, 16 Jun 2023 13:53:00 -0400
Subject: [PATCH 1/3] Enable web scraping based on a url and a simple filter.

---
 collector/main.py         |  6 +++++-
 collector/scripts/link.py | 26 +++++++++++++++++++++++++-
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/collector/main.py b/collector/main.py
index cd800eb9e30..bb0054b92f1 100644
--- a/collector/main.py
+++ b/collector/main.py
@@ -1,7 +1,7 @@
 import os
 from InquirerPy import inquirer
 from scripts.youtube import youtube
-from scripts.link import link, links
+from scripts.link import link, links, crawler
 from scripts.substack import substack
 from scripts.medium import medium
 from scripts.gitbook import gitbook
@@ -42,6 +42,7 @@ def main():
       choices=[
         {"name": "Single URL", "value": "Single URL"},
         {"name": "Multiple URLs", "value": "Multiple URLs"},
+        {"name": "URL Crawler", "value": "URL Crawler"},
         {"name": "Abort", "value": "Abort"},
       ],
     ).execute()
@@ -51,6 +52,9 @@ def main():
   if method == 'Multiple URLs':
     links()
     exit(0)
+  if method == 'URL Crawler':
+    crawler()
+    exit(0)
   if method == 'Abort':
     exit(0)
   if method == 'YouTube Channel':
diff --git a/collector/scripts/link.py b/collector/scripts/link.py
index 8bcc02e0e58..17a532cb0cc 100644
--- a/collector/scripts/link.py
+++ b/collector/scripts/link.py
@@ -4,6 +4,8 @@
 from langchain.document_loaders import UnstructuredHTMLLoader
 from .link_utils import append_meta
 from .utils import tokenize, ada_v2_cost
+import requests
+from bs4 import BeautifulSoup
 
 # Example Channel URL https://tim.blog/2022/08/09/nft-insider-trading-policy/
 def link():
@@ -64,6 +66,29 @@ def link():
     print(f"////////////////////////////")
     exit(0)
 
+def crawler():
+  prompt = "Paste in root URI of the pages of interest: "
+  new_link = input(prompt)
+  filter_value = input("Add a filter value for the url to ensure links don't wander too far: ")
+  #extract this from the uri provided
+  root_site = urlparse(new_link).scheme + "://" + urlparse(new_link).hostname
+  links = []
+  urls = new_link
+  links.append(new_link)
+  grab = requests.get(urls)
+  soup = BeautifulSoup(grab.text, 'html.parser')
+
+  # traverse paragraphs from soup
+  for link in soup.find_all("a"):
+    data = link.get('href').strip()
+    if filter_value in data:
+      print (data)
+      links.append(root_site + data)
+    else:
+      print (data + " does not apply for linking...")
+  #parse the links found
+  parse_links(links)
+
 def links():
   links = []
   prompt = "Paste in the URL of an online article or blog: "
@@ -86,7 +111,6 @@ def links():
 
   parse_links(links)
 
-
 # parse links from array
 def parse_links(links):
   totalTokens = 0

From b262198d5327de23db6d8396ae19d0dd75b337b3 Mon Sep 17 00:00:00 2001
From: Antonio Ciolino
Date: Mon, 19 Jun 2023 09:52:21 -0400
Subject: [PATCH 2/3] ignore yarn

---
 .gitignore | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 0725f47c0da..a1d96b6e2b1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,4 +7,4 @@ __pycache__
 v-env
 .DS_Store
 aws_cf_deploy_anything_llm.json
-
+yarn.lock
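A note on the fix that follows: BeautifulSoup's Tag.get('href') returns None when an anchor carries no href attribute, so the crawler's original data = link.get('href').strip() raises AttributeError ('NoneType' object has no attribute 'strip') on any page containing, say, <a name="top">. That is the NoneType error patch 3 below addresses. A minimal standalone illustration of the failure mode and the guard, assuming nothing beyond bs4 itself:

    from bs4 import BeautifulSoup

    html = '<a name="top">no href here</a><a href="/2022/08/09/post/">ok</a>'
    soup = BeautifulSoup(html, "html.parser")

    for link in soup.find_all("a"):
        data = link.get("href")   # None for the first anchor
        if data is not None:      # the guard patch 3 introduces
            print(data.strip())   # only reached for anchors with a real href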
From b3e35c8cd236a7635c1ee41899ed34167f3ffe8c Mon Sep 17 00:00:00 2001
From: Antonio Ciolino
Date: Mon, 19 Jun 2023 12:56:32 -0400
Subject: [PATCH 3/3] Updated Link scraper to avoid NoneType error.

---
 collector/scripts/link.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/collector/scripts/link.py b/collector/scripts/link.py
index 17a532cb0cc..2bc604e99c1 100644
--- a/collector/scripts/link.py
+++ b/collector/scripts/link.py
@@ -80,12 +80,14 @@ def crawler():
 
   # traverse paragraphs from soup
   for link in soup.find_all("a"):
-    data = link.get('href').strip()
-    if filter_value in data:
-      print (data)
-      links.append(root_site + data)
-    else:
-      print (data + " does not apply for linking...")
+    data = link.get('href')
+    if (data is not None):
+      if filter_value in data:
+        data = data.strip()
+        print (data)
+        links.append(root_site + data)
+      else:
+        print (data + " does not apply for linking...")
 
   #parse the links found
   parse_links(links)
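With that guard in place, the crawler's core logic distills to the sketch below. The name crawl, its parameters, and the returned list are illustrative only; the committed crawler() gathers its inputs via input() and hands the collected links straight to parse_links():

    from urllib.parse import urlparse

    import requests
    from bs4 import BeautifulSoup

    def crawl(root_url, filter_value):
        # Rebuild the scheme://host prefix used to absolutize kept hrefs.
        parsed = urlparse(root_url)
        root_site = parsed.scheme + "://" + parsed.hostname

        links = [root_url]
        page = requests.get(root_url)
        soup = BeautifulSoup(page.text, "html.parser")

        for anchor in soup.find_all("a"):
            href = anchor.get("href")
            # Skip href-less anchors (patch 3) and off-topic links (the filter).
            if href is not None and filter_value in href:
                links.append(root_site + href.strip())
        return links

For example, crawl("https://tim.blog/2022/08/09/nft-insider-trading-policy/", "/2022/") would collect same-host post links (the URL comes from the comment in link.py; the filter value is illustrative). One caveat in the committed version: every matching href is prefixed with root_site unconditionally, so absolute URLs that pass the filter come out with the host doubled; the filter value therefore works best against relative paths.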