collector/main.py: 6 changes (5 additions & 1 deletion)
@@ -1,7 +1,7 @@
 import os
 from InquirerPy import inquirer
 from scripts.youtube import youtube
-from scripts.link import link, links
+from scripts.link import link, links, crawler
 from scripts.substack import substack
 from scripts.medium import medium
 from scripts.gitbook import gitbook
@@ -42,6 +42,7 @@ def main():
         choices=[
             {"name": "Single URL", "value": "Single URL"},
             {"name": "Multiple URLs", "value": "Multiple URLs"},
+            {"name": "URL Crawler", "value": "URL Crawler"},
             {"name": "Abort", "value": "Abort"},
         ],
     ).execute()
@@ -51,6 +52,9 @@ def main():
     if method == 'Multiple URLs':
         links()
         exit(0)
+    if method == 'URL Crawler':
+        crawler()
+        exit(0)

     if method == 'Abort': exit(0)
     if method == 'YouTube Channel':
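The hunks above show only the tail of the selection prompt and the new dispatch branch. For readers unfamiliar with InquirerPy, the full pattern is presumably an inquirer.select(...) call ending in .execute(); the sketch below illustrates it in isolation. The prompt message and the helper name are assumptions, not text taken from this file.

    from InquirerPy import inquirer

    def pick_scrape_method():
        # Sketch of the selection pattern used in main(); the message text is assumed.
        return inquirer.select(
            message="What would you like to collect?",
            choices=[
                {"name": "Single URL", "value": "Single URL"},
                {"name": "Multiple URLs", "value": "Multiple URLs"},
                {"name": "URL Crawler", "value": "URL Crawler"},
                {"name": "Abort", "value": "Abort"},
            ],
        ).execute()

main() then branches on the returned value, calling crawler() for "URL Crawler" exactly as the added lines show.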
collector/scripts/link.py: 26 changes (25 additions & 1 deletion)
@@ -4,6 +4,8 @@
 from langchain.document_loaders import UnstructuredHTMLLoader
 from .link_utils import append_meta
 from .utils import tokenize, ada_v2_cost
+import requests
+from bs4 import BeautifulSoup

 # Example Channel URL https://tim.blog/2022/08/09/nft-insider-trading-policy/
 def link():
@@ -64,6 +66,29 @@ def link():
     print(f"////////////////////////////")
     exit(0)

+def crawler():
+    prompt = "Paste in root URI of the pages of interest: "
+    new_link = input(prompt)
+    filter_value = input("Add a filter value for the url to ensure links don't wander too far: ")
+    #extract this from the uri provided
+    root_site = urlparse(new_link).scheme + "://" + urlparse(new_link).hostname
+    links = []
+    urls = new_link
+    links.append(new_link)
+    grab = requests.get(urls)
+    soup = BeautifulSoup(grab.text, 'html.parser')
+
+    # traverse anchor tags found in the page
+    for link in soup.find_all("a"):
+        data = link.get('href').strip()
+        if filter_value in data:
+            print (data)
+            links.append(root_site + data)
+        else:
+            print (data + " does not apply for linking...")
+    #parse the links found
+    parse_links(links)
+
 def links():
     links = []
     prompt = "Paste in the URL of an online article or blog: "
@@ -86,7 +111,6 @@ def links():
     parse_links(links)


-
 # parse links from array
 def parse_links(links):
     totalTokens = 0
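A note on the new crawler() above: it calls .strip() on every href and prepends root_site unconditionally, so an anchor with no href attribute raises AttributeError, and an absolute or mailto: link containing the filter value produces a malformed URL. The sketch below is an editorial illustration of a more defensive variant, not part of this PR; the function name, the timeout, and the set-based de-duplication are assumptions.

    import requests
    from bs4 import BeautifulSoup
    from urllib.parse import urljoin, urlparse

    def crawl_filtered_links(root_url, filter_value):
        # Collect same-site links on root_url whose href contains filter_value.
        found = {root_url}
        page = requests.get(root_url, timeout=30)
        soup = BeautifulSoup(page.text, "html.parser")
        for anchor in soup.find_all("a"):
            href = anchor.get("href")
            if not href:
                continue  # skip anchors without an href instead of raising
            href = href.strip()
            if filter_value not in href:
                continue
            # urljoin resolves both relative and absolute hrefs against the root,
            # unlike unconditional string concatenation with the root site.
            absolute = urljoin(root_url, href)
            if urlparse(absolute).hostname == urlparse(root_url).hostname:
                found.add(absolute)
        return sorted(found)

The resulting list could then be passed to parse_links() exactly as the merged crawler() does.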