这是indexloc提供的服务,不要输入任何密码
Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
server/storage/documents/**
server/storage/vector-cache/**
server/storage/*.db
server/storage/lancedb
collector/hotdir/**
collector/v-env/**
collector/outputs/**
**/node_modules/
**/dist/
**/v-env/
**/__pycache__/
**/.env
**/.env.*
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,5 @@ v-env
node_modules
__pycache__
v-env
*.lock
.DS_Store

4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@ Next, you will need some content to embed. This could be a Youtube Channel, Medi

[Go set up and run collector scripts](./collector/README.md)

[Learn about documents](./server/documents/DOCUMENTS.md)
[Learn about documents](./server/storage/documents/DOCUMENTS.md)

[Learn about vector caching](./server/vector-cache/VECTOR_CACHE.md)
[Learn about vector caching](./server/storage/vector-cache/VECTOR_CACHE.md)

### Contributing
- create issue
Expand Down
71 changes: 30 additions & 41 deletions collector/main.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import os
from whaaaaat import prompt, Separator
from InquirerPy import inquirer
from scripts.youtube import youtube
from scripts.link import link, links
from scripts.substack import substack
Expand All @@ -20,57 +20,46 @@ def main():
selection = input("Your selection: ")
method = methods.get(str(selection))
else:
questions = [
{
"type": "list",
"name": "collector",
"message": "What kind of data would you like to add to convert into long-term memory?",
"choices": [
"YouTube Channel",
"Substack",
"Medium",
"Article or Blog Link(s)",
"Gitbook",
Separator(),
{"name": "Twitter", "disabled": "Needs PR"},
"Abort",
],
},
]
method = prompt(questions).get('collector')

if('Article or Blog Link' in method):
questions = [
{
"type": "list",
"name": "collector",
"message": "Do you want to scrape a single article/blog/url or many at once?",
"choices": [
'Single URL',
'Multiple URLs',
'Abort',
],
},
]
method = prompt(questions).get('collector')
if(method == 'Single URL'):
method = inquirer.select(
message="What kind of data would you like to add to convert into long-term memory?",
choices=[
{"name": "YouTube Channel", "value": "YouTube Channel"},
{"name": "Substack", "value": "Substack"},
{"name": "Medium", "value": "Medium"},
{"name": "Article or Blog Link(s)", "value": "Article or Blog Link(s)"},
{"name": "Gitbook", "value": "Gitbook"},
{"name": "Twitter", "value": "Twitter", "disabled": "Needs PR"},
{"name": "Abort", "value": "Abort"},
],
).execute()

if 'Article or Blog Link' in method:
method = inquirer.select(
message="Do you want to scrape a single article/blog/url or many at once?",
choices=[
{"name": "Single URL", "value": "Single URL"},
{"name": "Multiple URLs", "value": "Multiple URLs"},
{"name": "Abort", "value": "Abort"},
],
).execute()
if method == 'Single URL':
link()
exit(0)
if(method == 'Multiple URLs'):
if method == 'Multiple URLs':
links()
exit(0)

if(method == 'Abort'): exit(0)
if(method == 'YouTube Channel'):
if method == 'Abort': exit(0)
if method == 'YouTube Channel':
youtube()
exit(0)
if(method == 'Substack'):
if method == 'Substack':
substack()
exit(0)
if(method == 'Medium'):
if method == 'Medium':
medium()
exit(0)
if(method == 'Gitbook'):
if method == 'Gitbook':
gitbook()
exit(0)

Expand Down
129 changes: 6 additions & 123 deletions collector/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ cryptography==41.0.1
cssselect==1.2.0
dataclasses-json==0.5.7
Deprecated==1.2.14
docx2txt==0.8
et-xmlfile==1.1.0
exceptiongroup==1.1.1
fake-useragent==1.1.3
Expand All @@ -30,6 +31,7 @@ h11==0.14.0
httpcore==0.16.3
httpx==0.23.3
idna==3.4
InquirerPy==0.3.4
importlib-metadata==6.6.0
importlib-resources==5.12.0
install==1.3.5
Expand All @@ -54,132 +56,13 @@ pandas==1.5.3
parse==1.19.0
pdfminer.six==20221105
Pillow==9.5.0
prompt-toolkit==1.0.14
prompt-toolkit==3.0.38
pycparser==2.21
pydantic==1.10.8
pyee==8.2.2
Pygments==2.15.1
pyobjc==9.1.1
pyobjc-core==9.1.1
pyobjc-framework-Accounts==9.1.1
pyobjc-framework-AddressBook==9.1.1
pyobjc-framework-AdSupport==9.1.1
pyobjc-framework-AppleScriptKit==9.1.1
pyobjc-framework-AppleScriptObjC==9.1.1
pyobjc-framework-ApplicationServices==9.1.1
pyobjc-framework-AudioVideoBridging==9.1.1
pyobjc-framework-AuthenticationServices==9.1.1
pyobjc-framework-AutomaticAssessmentConfiguration==9.1.1
pyobjc-framework-Automator==9.1.1
pyobjc-framework-AVFoundation==9.1.1
pyobjc-framework-AVKit==9.1.1
pyobjc-framework-BusinessChat==9.1.1
pyobjc-framework-CalendarStore==9.1.1
pyobjc-framework-CFNetwork==9.1.1
pyobjc-framework-CloudKit==9.1.1
pyobjc-framework-Cocoa==9.1.1
pyobjc-framework-Collaboration==9.1.1
pyobjc-framework-ColorSync==9.1.1
pyobjc-framework-Contacts==9.1.1
pyobjc-framework-ContactsUI==9.1.1
pyobjc-framework-CoreAudio==9.1.1
pyobjc-framework-CoreAudioKit==9.1.1
pyobjc-framework-CoreBluetooth==9.1.1
pyobjc-framework-CoreData==9.1.1
pyobjc-framework-CoreHaptics==9.1.1
pyobjc-framework-CoreLocation==9.1.1
pyobjc-framework-CoreMedia==9.1.1
pyobjc-framework-CoreMediaIO==9.1.1
pyobjc-framework-CoreMIDI==9.1.1
pyobjc-framework-CoreML==9.1.1
pyobjc-framework-CoreMotion==9.1.1
pyobjc-framework-CoreServices==9.1.1
pyobjc-framework-CoreSpotlight==9.1.1
pyobjc-framework-CoreText==9.1.1
pyobjc-framework-CoreWLAN==9.1.1
pyobjc-framework-CryptoTokenKit==9.1.1
pyobjc-framework-DeviceCheck==9.1.1
pyobjc-framework-DictionaryServices==9.1.1
pyobjc-framework-DiscRecording==9.1.1
pyobjc-framework-DiscRecordingUI==9.1.1
pyobjc-framework-DiskArbitration==9.1.1
pyobjc-framework-DVDPlayback==9.1.1
pyobjc-framework-EventKit==9.1.1
pyobjc-framework-ExceptionHandling==9.1.1
pyobjc-framework-ExecutionPolicy==9.1.1
pyobjc-framework-ExternalAccessory==9.1.1
pyobjc-framework-FileProvider==9.1.1
pyobjc-framework-FileProviderUI==9.1.1
pyobjc-framework-FinderSync==9.1.1
pyobjc-framework-FSEvents==9.1.1
pyobjc-framework-GameCenter==9.1.1
pyobjc-framework-GameController==9.1.1
pyobjc-framework-GameKit==9.1.1
pyobjc-framework-GameplayKit==9.1.1
pyobjc-framework-ImageCaptureCore==9.1.1
pyobjc-framework-IMServicePlugIn==9.1.1
pyobjc-framework-InputMethodKit==9.1.1
pyobjc-framework-InstallerPlugins==9.1.1
pyobjc-framework-InstantMessage==9.1.1
pyobjc-framework-Intents==9.1.1
pyobjc-framework-IOBluetooth==9.1.1
pyobjc-framework-IOBluetoothUI==9.1.1
pyobjc-framework-IOSurface==9.1.1
pyobjc-framework-iTunesLibrary==9.1.1
pyobjc-framework-LatentSemanticMapping==9.1.1
pyobjc-framework-LaunchServices==9.1.1
pyobjc-framework-libdispatch==9.1.1
pyobjc-framework-libxpc==9.1.1
pyobjc-framework-LinkPresentation==9.1.1
pyobjc-framework-LocalAuthentication==9.1.1
pyobjc-framework-MapKit==9.1.1
pyobjc-framework-MediaAccessibility==9.1.1
pyobjc-framework-MediaLibrary==9.1.1
pyobjc-framework-MediaPlayer==9.1.1
pyobjc-framework-MediaToolbox==9.1.1
pyobjc-framework-Metal==9.1.1
pyobjc-framework-MetalKit==9.1.1
pyobjc-framework-MetalPerformanceShaders==9.1.1
pyobjc-framework-ModelIO==9.1.1
pyobjc-framework-MultipeerConnectivity==9.1.1
pyobjc-framework-NaturalLanguage==9.1.1
pyobjc-framework-NetFS==9.1.1
pyobjc-framework-Network==9.1.1
pyobjc-framework-NetworkExtension==9.1.1
pyobjc-framework-NotificationCenter==9.1.1
pyobjc-framework-OpenDirectory==9.1.1
pyobjc-framework-OSAKit==9.1.1
pyobjc-framework-OSLog==9.1.1
pyobjc-framework-PencilKit==9.1.1
pyobjc-framework-Photos==9.1.1
pyobjc-framework-PhotosUI==9.1.1
pyobjc-framework-PreferencePanes==9.1.1
pyobjc-framework-PushKit==9.1.1
pyobjc-framework-Quartz==9.1.1
pyobjc-framework-QuickLookThumbnailing==9.1.1
pyobjc-framework-SafariServices==9.1.1
pyobjc-framework-SceneKit==9.1.1
pyobjc-framework-ScreenSaver==9.1.1
pyobjc-framework-ScriptingBridge==9.1.1
pyobjc-framework-SearchKit==9.1.1
pyobjc-framework-Security==9.1.1
pyobjc-framework-SecurityFoundation==9.1.1
pyobjc-framework-SecurityInterface==9.1.1
pyobjc-framework-ServiceManagement==9.1.1
pyobjc-framework-Social==9.1.1
pyobjc-framework-SoundAnalysis==9.1.1
pyobjc-framework-Speech==9.1.1
pyobjc-framework-SpriteKit==9.1.1
pyobjc-framework-StoreKit==9.1.1
pyobjc-framework-SyncServices==9.1.1
pyobjc-framework-SystemConfiguration==9.1.1
pyobjc-framework-SystemExtensions==9.1.1
pyobjc-framework-UserNotifications==9.1.1
pyobjc-framework-VideoSubscriberAccount==9.1.1
pyobjc-framework-VideoToolbox==9.1.1
pyobjc-framework-Vision==9.1.1
pyobjc-framework-WebKit==9.1.1
pypandoc==1.4
pypdf==3.9.0
pyppeteer==1.0.2
pyquery==2.0.0
python-dateutil==2.8.2
Expand All @@ -199,6 +82,7 @@ six==1.16.0
sniffio==1.3.0
soupsieve==2.4.1
SQLAlchemy==2.0.15
tabulate==0.9.0
tenacity==8.2.2
text-unidecode==1.3
tiktoken==0.4.0
Expand All @@ -212,10 +96,9 @@ uuid==1.30
w3lib==2.1.1
wcwidth==0.2.6
websockets==10.4
whaaaaat==0.5.2
wrapt==1.14.1
xlrd==2.0.1
XlsxWriter==3.1.2
yarl==1.9.2
youtube-transcript-api==0.6.0
zipp==3.15.0
zipp==3.15.0
2 changes: 1 addition & 1 deletion collector/scripts/gitbook.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def gitbook():

primary_source = urlparse(url)
output_path = f"./outputs/gitbook-logs/{primary_source.netloc}"
transaction_output_dir = f"../server/documents/gitbook-{primary_source.netloc}"
transaction_output_dir = f"../server/storage/documents/gitbook-{primary_source.netloc}"

if os.path.exists(output_path) == False:os.makedirs(output_path)
if os.path.exists(transaction_output_dir) == False: os.makedirs(transaction_output_dir)
Expand Down
4 changes: 2 additions & 2 deletions collector/scripts/link.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def link():
output_path = f"./outputs/website-logs"

transaction_output_filename = f"article-{source.path.replace('/','_')}.json"
transaction_output_dir = f"../server/documents/website-{source.netloc}"
transaction_output_dir = f"../server/storage/documents/website-{source.netloc}"

if os.path.isdir(output_path) == False:
os.makedirs(output_path)
Expand Down Expand Up @@ -109,7 +109,7 @@ def links():
output_path = f"./outputs/website-logs"

transaction_output_filename = f"article-{source.path.replace('/','_')}.json"
transaction_output_dir = f"../server/documents/website-{source.netloc}"
transaction_output_dir = f"../server/storage/documents/website-{source.netloc}"

if os.path.isdir(output_path) == False:
os.makedirs(output_path)
Expand Down
2 changes: 1 addition & 1 deletion collector/scripts/medium.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def medium():
exit(1)

totalTokenCount = 0
transaction_output_dir = f"../server/documents/medium-{handle}"
transaction_output_dir = f"../server/storage/documents/medium-{handle}"
if os.path.isdir(transaction_output_dir) == False:
os.makedirs(transaction_output_dir)

Expand Down
2 changes: 1 addition & 1 deletion collector/scripts/substack.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def substack():
print(f"{len(valid_publications)} of {len(publications)} publications are readable publically text posts - collecting those.")

totalTokenCount = 0
transaction_output_dir = f"../server/documents/substack-{subdomain}"
transaction_output_dir = f"../server/storage/documents/substack-{subdomain}"
if os.path.isdir(transaction_output_dir) == False:
os.makedirs(transaction_output_dir)

Expand Down
2 changes: 1 addition & 1 deletion collector/scripts/watch/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def move_source(working_dir='hotdir', new_destination_filename= ''):
return

def write_to_server_documents(data, filename):
destination = f"../server/documents/custom-documents"
destination = f"../server/storage/documents/custom-documents"
if os.path.exists(destination) == False: os.makedirs(destination)
with open(f"{destination}/{filename}.json", 'w', encoding='utf-8') as file:
json.dump(data, file, ensure_ascii=True, indent=4)
2 changes: 1 addition & 1 deletion collector/scripts/youtube.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def youtube():
exit(1)

channel_data = fetch_channel_video_information(channel_id)
transaction_output_dir = f"../server/documents/youtube-{channel_data.get('channelTitle')}"
transaction_output_dir = f"../server/storage/documents/youtube-{channel_data.get('channelTitle')}"

if os.path.isdir(transaction_output_dir) == False:
os.makedirs(transaction_output_dir)
Expand Down
25 changes: 25 additions & 0 deletions docker/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
SERVER_PORT=3001
OPEN_AI_KEY=
OPEN_MODEL_PREF='gpt-3.5-turbo'
CACHE_VECTORS="true"

# Enable all below if you are using vector database: Chroma.
# VECTOR_DB="chroma"
# CHROMA_ENDPOINT='http://localhost:8000'

# Enable all below if you are using vector database: Pinecone.
VECTOR_DB="pinecone"
PINECONE_ENVIRONMENT=
PINECONE_API_KEY=
PINECONE_INDEX=

# Enable all below if you are using vector database: LanceDB.
# VECTOR_DB="lancedb"

# CLOUD DEPLOYMENT VARIABLES ONLY
# AUTH_TOKEN="hunter2" # This is the password to your application if remote hosting.
# JWT_SECRET="my-random-string-for-seeding" # Only needed if AUTH_TOKEN is set. Please generate a random string at least 12 chars long.
STORAGE_DIR="./server/storage"
GOOGLE_APIS_KEY=
UID='1000'
GID='1000'
Loading