From 68c2363cd60f4d64dba68d56b20a05ea218df1d7 Mon Sep 17 00:00:00 2001 From: Antonio Ciolino Date: Wed, 28 Jun 2023 20:07:25 -0400 Subject: [PATCH 1/3] . --- collector/README.md | 17 ++++++- collector/main.py | 4 ++ collector/requirements.txt | 7 ++- collector/scripts/twitter.py | 95 ++++++++++++++++++++++++++++++++++++ 4 files changed, 121 insertions(+), 2 deletions(-) create mode 100644 collector/scripts/twitter.py diff --git a/collector/README.md b/collector/README.md index 3dbb1357a67..dd293fccd30 100644 --- a/collector/README.md +++ b/collector/README.md @@ -48,4 +48,19 @@ Now uploads from the frontend will be processed as if you ran the `watch.py` scr - ![GCP Project Bar](../images/gcp-project-bar.png) - [Enable YouTube Data APIV3](https://console.cloud.google.com/apis/library/youtube.googleapis.com) - Once enabled generate a Credential key for this API -- Paste your key after `GOOGLE_APIS_KEY=` in your `collector/.env` file. \ No newline at end of file +<<<<<<< HEAD +- Paste your key after `GOOGLE_APIS_KEY=` in your `collector/.env` file. +======= +- Paste your key after `GOOGLE_APIS_KEY=` in your `collector/.env` file. + +### Using ther Twitter API +***required to get data form twitter with tweepy** +- Go to https://developer.twitter.com/en/portal/dashboard with your twitter account +- Create a new Project App + - Get your 4 keys and place them in your `collector.env` file + * TW_CONSUMER_KEY + * TW_CONSUMER_SECRET + * TW_ACCESS_TOKEN + * TW_ACCESS_TOKEN_SECRET + populate the .env with the values +>>>>>>> e477cbc (Twitter Feature) diff --git a/collector/main.py b/collector/main.py index bb0054b92f1..010242d6e9e 100644 --- a/collector/main.py +++ b/collector/main.py @@ -6,6 +6,10 @@ from scripts.medium import medium from scripts.gitbook import gitbook from scripts.sitemap import sitemap +<<<<<<< HEAD +======= +from scripts.twitter import twitter +>>>>>>> e477cbc (Twitter Feature) def main(): if os.name == 'nt': diff --git a/collector/requirements.txt b/collector/requirements.txt index 1ab1d70671e..604d22f2f6f 100644 --- a/collector/requirements.txt +++ b/collector/requirements.txt @@ -109,4 +109,9 @@ xlrd==2.0.1 XlsxWriter==3.1.2 yarl==1.9.2 youtube-transcript-api==0.6.0 -zipp==3.15.0 \ No newline at end of file +<<<<<<< HEAD +zipp==3.15.0 +======= +zipp==3.15.0 +tweepy==4.14.0 +>>>>>>> e477cbc (Twitter Feature) diff --git a/collector/scripts/twitter.py b/collector/scripts/twitter.py new file mode 100644 index 00000000000..31e3159fb06 --- /dev/null +++ b/collector/scripts/twitter.py @@ -0,0 +1,95 @@ +""" +Tweepy implementation of twitter reader. Requires the 4 twitter keys to operate. +""" + +import tweepy +import os, time +import pandas as pd +import json +from .utils import tokenize, ada_v2_cost + +def twitter(): + #get user and number of tweets to read + username = input("user timeline to read from (blank to ignore): ") + searchQuery = input("Search term, or leave blank to get user tweets (blank to ignore): ") + tweetCount = input("Gather the last number of tweets: ") + + #read your API kes to call the api. + consumer_key = os.environ["TW_CONSUMER_KEY"] + consumer_secret = os.environ["TW_CONSUMER_SECRET"] + access_token = os.environ["TW_ACCESS_TOKEN"] + access_token_secret = os.environ["TW_ACCESS_TOKEN_SECRET"] + + # Pass in our twitter API authentication key + auth = tweepy.OAuth1UserHandler( + consumer_key, consumer_secret, access_token, access_token_secret + ) + + # Instantiate the tweepy API + api = tweepy.API(auth, wait_on_rate_limit=True) + + try: + if (searchQuery == ''): + tweets = api.user_timeline(screen_name=username, tweet_mode = 'extended', count=tweetCount) + else: + tweets = api.search_tweets(q=searchQuery, tweet_mode = 'extended', count=tweetCount) + + # Pulling Some attributes from the tweet + attributes_container = [ + [tweet.id, tweet.user.screen_name, tweet.created_at, tweet.favorite_count, tweet.source, tweet.full_text] + for tweet in tweets + ] + + # Creation of column list to rename the columns in the dataframe + columns = ["id", "Screen Name", "Date Created", "Number of Likes", "Source of Tweet", "Tweet"] + + # Creation of Dataframe + tweets_df = pd.DataFrame(attributes_container, columns=columns) + + totalTokens = 0 + for index, row in tweets_df.iterrows(): + meta_link = twitter_meta(row, True) + output_filename = f"twitter-{username}-{row['Date Created']}.json" + output_path = f"./outputs/twitter-logs" + + transaction_output_filename = f"tweet-{username}-{row['id']}.json" + transaction_output_dir = f"../server/storage/documents/twitter-{username}" + + if not os.path.isdir(output_path): + os.makedirs(output_path) + + if not os.path.isdir(transaction_output_dir): + os.makedirs(transaction_output_dir) + + full_text = twitter_meta(row) + tokenCount = len(tokenize(full_text)) + meta_link['pageContent'] = full_text + meta_link['token_count_estimate'] = tokenCount + totalTokens += tokenCount + + with open(f"{output_path}/{output_filename}", 'w', encoding='utf-8') as file: + json.dump(meta_link, file, ensure_ascii=True, indent=4) + + with open(f"{transaction_output_dir}/{transaction_output_filename}", 'w', encoding='utf-8') as file: + json.dump(meta_link, file, ensure_ascii=True, indent=4) + + # print(f"{transaction_output_dir}/{transaction_output_filename}") + + print(f"{tokenCount} tokens written over {tweets_df.shape[0]} records.") + + except BaseException as e: + print("Status Failed: ", str(e)) + time.sleep(3) + + +def twitter_meta(row, metadata_only = False): + url = f"http://twitter.com/anyuser/status/{row['id']}" + title = f"Tweet {row['id']}" + meta = { + 'url': url, + 'title': title, + 'description': 'Tweet from ' + row["Screen Name"], + 'published': row["Date Created"].strftime('%Y-%m-%d %H:%M:%S'), + 'wordCount': len(row["Tweet"]), + } + return "Tweet JSON Metadata:\n"+json.dumps(meta)+"\n\n\nText Content:\n" + row["Tweet"] if metadata_only == False else meta From 7139d64a3775c056c34ca4271cb3c22bef47e99b Mon Sep 17 00:00:00 2001 From: Antonio Ciolino Date: Thu, 29 Jun 2023 09:56:53 -0400 Subject: [PATCH 2/3] twitter feature update --- collector/README.md | 4 ---- collector/main.py | 3 --- collector/requirements.txt | 4 ---- 3 files changed, 11 deletions(-) diff --git a/collector/README.md b/collector/README.md index dd293fccd30..ad72b8567a0 100644 --- a/collector/README.md +++ b/collector/README.md @@ -48,9 +48,6 @@ Now uploads from the frontend will be processed as if you ran the `watch.py` scr - ![GCP Project Bar](../images/gcp-project-bar.png) - [Enable YouTube Data APIV3](https://console.cloud.google.com/apis/library/youtube.googleapis.com) - Once enabled generate a Credential key for this API -<<<<<<< HEAD -- Paste your key after `GOOGLE_APIS_KEY=` in your `collector/.env` file. -======= - Paste your key after `GOOGLE_APIS_KEY=` in your `collector/.env` file. ### Using ther Twitter API @@ -63,4 +60,3 @@ Now uploads from the frontend will be processed as if you ran the `watch.py` scr * TW_ACCESS_TOKEN * TW_ACCESS_TOKEN_SECRET populate the .env with the values ->>>>>>> e477cbc (Twitter Feature) diff --git a/collector/main.py b/collector/main.py index 010242d6e9e..e6a9577ad60 100644 --- a/collector/main.py +++ b/collector/main.py @@ -6,10 +6,7 @@ from scripts.medium import medium from scripts.gitbook import gitbook from scripts.sitemap import sitemap -<<<<<<< HEAD -======= from scripts.twitter import twitter ->>>>>>> e477cbc (Twitter Feature) def main(): if os.name == 'nt': diff --git a/collector/requirements.txt b/collector/requirements.txt index 604d22f2f6f..7fc4f05d390 100644 --- a/collector/requirements.txt +++ b/collector/requirements.txt @@ -109,9 +109,5 @@ xlrd==2.0.1 XlsxWriter==3.1.2 yarl==1.9.2 youtube-transcript-api==0.6.0 -<<<<<<< HEAD -zipp==3.15.0 -======= zipp==3.15.0 tweepy==4.14.0 ->>>>>>> e477cbc (Twitter Feature) From 8b923959471b1075b62738e34091fb1badff2f10 Mon Sep 17 00:00:00 2001 From: Antonio Ciolino Date: Thu, 6 Jul 2023 15:25:36 -0400 Subject: [PATCH 3/3] Key validation and operation --- collector/main.py | 9 ++++++--- collector/scripts/twitter.py | 16 +++++++++++----- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/collector/main.py b/collector/main.py index e6a9577ad60..a7d3a7e755a 100644 --- a/collector/main.py +++ b/collector/main.py @@ -16,7 +16,8 @@ def main(): '3': 'Substack', '4': 'Medium', '5': 'Gitbook', - '6': 'Sitemap', + '6': 'Twitter', + '7': 'Sitemap', } print("There are options for data collection to make this easier for you.\nType the number of the method you wish to execute.") print("1. YouTube Channel\n2. Article or Blog Link (Single)\n3. Substack\n4. Medium\n\n[In development]:\nTwitter\n\n") @@ -31,7 +32,7 @@ def main(): {"name": "Medium", "value": "Medium"}, {"name": "Article or Blog Link(s)", "value": "Article or Blog Link(s)"}, {"name": "Gitbook", "value": "Gitbook"}, - {"name": "Twitter", "value": "Twitter", "disabled": "Needs PR"}, + {"name": "Twitter", "value": "Twitter"}, {"name": "Sitemap", "value": "Sitemap"}, {"name": "Abort", "value": "Abort"}, ], @@ -72,8 +73,10 @@ def main(): exit(0) if method == 'Sitemap': sitemap() + exit(0) + if method == 'Twitter': + twitter() exit(0) - print("Selection was not valid.") exit(1) diff --git a/collector/scripts/twitter.py b/collector/scripts/twitter.py index 31e3159fb06..a5c02948d6a 100644 --- a/collector/scripts/twitter.py +++ b/collector/scripts/twitter.py @@ -14,11 +14,15 @@ def twitter(): searchQuery = input("Search term, or leave blank to get user tweets (blank to ignore): ") tweetCount = input("Gather the last number of tweets: ") - #read your API kes to call the api. - consumer_key = os.environ["TW_CONSUMER_KEY"] - consumer_secret = os.environ["TW_CONSUMER_SECRET"] - access_token = os.environ["TW_ACCESS_TOKEN"] - access_token_secret = os.environ["TW_ACCESS_TOKEN_SECRET"] + # Read your API keys to call the API. + consumer_key = os.environ.get("TW_CONSUMER_KEY") + consumer_secret = os.environ.get("TW_CONSUMER_SECRET") + access_token = os.environ.get("TW_ACCESS_TOKEN") + access_token_secret = os.environ.get("TW_ACCESS_TOKEN_SECRET") + + # Check if any of the required environment variables is missing. + if not consumer_key or not consumer_secret or not access_token or not access_token_secret: + raise EnvironmentError("One of the twitter API environment variables are missing.") # Pass in our twitter API authentication key auth = tweepy.OAuth1UserHandler( @@ -83,6 +87,8 @@ def twitter(): def twitter_meta(row, metadata_only = False): + # Note that /anyuser is a known twitter hack for not knowing the user's handle + # https://stackoverflow.com/questions/897107/can-i-fetch-the-tweet-from-twitter-if-i-know-the-tweets-id url = f"http://twitter.com/anyuser/status/{row['id']}" title = f"Tweet {row['id']}" meta = {