这是indexloc提供的服务,不要输入任何密码
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 1 addition & 6 deletions collector/scripts/link.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from langchain.document_loaders import UnstructuredHTMLLoader
from .link_utils import append_meta
from .utils import tokenize, ada_v2_cost
from requests.exceptions import ReadTimeout

# Example Channel URL https://tim.blog/2022/08/09/nft-insider-trading-policy/
def link():
Expand Down Expand Up @@ -91,11 +90,7 @@ def links():
# parse links from array
def parse_links(links):
totalTokens = 0
for link in links:
if link.endswith(".pdf"):
print(f"Skipping PDF file: {link}")
continue

for link in links:
print(f"Working on {link}...")
session = HTMLSession()

Expand Down
14 changes: 13 additions & 1 deletion collector/scripts/sitemap.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import requests
import xml.etree.ElementTree as ET
from scripts.link import parse_links
import re

def parse_sitemap(url):
response = requests.get(url)
Expand All @@ -9,7 +10,10 @@ def parse_sitemap(url):
urls = []
for element in root.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}url'):
for loc in element.iter('{http://www.sitemaps.org/schemas/sitemap/0.9}loc'):
urls.append(loc.text)
if not has_extension_to_ignore(loc.text):
urls.append(loc.text)
else:
print(f"Skipping filetype: {loc.text}")

return urls

Expand All @@ -25,3 +29,11 @@ def sitemap():

#parse links from array
parse_links(url_array)

def has_extension_to_ignore(string):
    """Return True when a URL points at a file type that should be skipped.

    The check looks only at the path component of the URL and compares its
    suffix, case-insensitively, against a fixed list of image and PDF
    extensions. The query string and fragment are ignored, so
    ``https://example.com/a.pdf?download=1`` is still detected.

    Searching for the extension anywhere in the URL would produce false
    positives on host names (``www.png.com``) or on slugs that merely
    contain an extension-like token (``/how-to-read.pdf-files/``).

    Args:
        string: The URL (or path) to inspect.

    Returns:
        True if the URL path ends with one of the ignored extensions,
        False otherwise (including for an empty string).
    """
    # Imported locally so this function does not depend on any change to the
    # module-level import block.
    from urllib.parse import urlparse

    # Images plus PDF; a tuple so it can be passed straight to str.endswith.
    ignored_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.pdf')

    # Only the path decides the file type; host, query and fragment do not.
    path = urlparse(string).path
    return path.lower().endswith(ignored_extensions)