From 4034b4dce8feb634675f4d8974e2415017a560e4 Mon Sep 17 00:00:00 2001 From: Andreas Baumgartner Date: Sun, 13 Nov 2022 23:57:56 +0100 Subject: [PATCH 1/4] expand shortened urls --- parser.py | 38 +++++++++++++++++++++++++++++++++++--- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/parser.py b/parser.py index 0e5a5aa..aba6a4a 100644 --- a/parser.py +++ b/parser.py @@ -23,7 +23,9 @@ import json import os import re +import requests import shutil +from urllib.parse import urlparse def read_json_from_js_file(filename): """Reads the contents of a Twitter-produced .js file into a dictionary.""" @@ -49,11 +51,15 @@ def tweet_json_to_markdown(tweet, username, archive_media_folder, output_media_f timestamp = int(round(datetime.datetime.strptime(timestamp_str, '%a %b %d %X %z %Y').timestamp())) # Example: Tue Mar 19 14:05:17 +0000 2019 body = tweet['full_text'] tweet_id_str = tweet['id_str'] - # replace t.co URLs with their original versions - if 'entities' in tweet and 'urls' in tweet['entities']: + # replace t.co URLs with their original versions and expand known url shorteners + if 'entities' in tweet and 'urls' in tweet['entities'] and len(tweet['entities']['urls']) > 0: for url in tweet['entities']['urls']: if 'url' in url and 'expanded_url' in url: - body = body.replace(url['url'], url['expanded_url']) + expanded_url = expand_short_url(url['expanded_url']) + if expanded_url != url['expanded_url']: + body = body.replace(url['url'], expanded_url) + else: + body = body.replace(url['url'], url['expanded_url']) # if the tweet is a reply, construct a header that links the names of the accounts being replied to the tweet being replied to header = '' if 'in_reply_to_status_id' in tweet: @@ -106,6 +112,32 @@ def tweet_json_to_markdown(tweet, username, archive_media_folder, output_media_f body = header + body + f'\n\n [{timestamp_str}](https://twitter.com/{username}/status/{tweet_id_str})' return timestamp, body +def is_short_url(url): + hostname = urlparse(url).hostname + shorteners = ['t.co', '7ax.de', 'bit.ly', 'buff.ly', 'cnn.it', 'ct.de', 'flic.kr', 'go.shr.lc', 'ift.tt', 'instagr.am', 'is.gd', 'j.mp', 'ku-rz.de', 'p.dw.com', 'pl0p.de', 'spon.de', 'sz.de', 'tiny.cc', 'tinyurl.com', 'trib.al', 'wp.me', 'www.sz.de', 'yfrog.com'] + if any(shortener in hostname for shortener in shorteners): + return True + return False + +def expand_short_url(url): + if is_short_url(url): + try: + request = requests.head(url, timeout=2) + except: + return url + if request.ok == False: + return url + try: + url_from_location_header = request.headers['location'] + except KeyError: + return url + if not url_from_location_header.startswith('http'): + return url + elif ':443' in url_from_location_header or is_short_url(url_from_location_header): + url_from_location_header = expand_short_url(url_from_location_header.replace('http:', 'https:')) + url = url_from_location_header + return url + def main(): input_folder = '.' From 9841a326285fd5b941b76541ee242212e5ad96be Mon Sep 17 00:00:00 2001 From: Andreas Baumgartner Date: Mon, 14 Nov 2022 22:52:15 +0100 Subject: [PATCH 2/4] Revert "expand shortened urls" This reverts commit 4034b4dce8feb634675f4d8974e2415017a560e4. --- parser.py | 38 +++----------------------------------- 1 file changed, 3 insertions(+), 35 deletions(-) diff --git a/parser.py b/parser.py index aba6a4a..0e5a5aa 100644 --- a/parser.py +++ b/parser.py @@ -23,9 +23,7 @@ import json import os import re -import requests import shutil -from urllib.parse import urlparse def read_json_from_js_file(filename): """Reads the contents of a Twitter-produced .js file into a dictionary.""" @@ -51,15 +49,11 @@ def tweet_json_to_markdown(tweet, username, archive_media_folder, output_media_f timestamp = int(round(datetime.datetime.strptime(timestamp_str, '%a %b %d %X %z %Y').timestamp())) # Example: Tue Mar 19 14:05:17 +0000 2019 body = tweet['full_text'] tweet_id_str = tweet['id_str'] - # replace t.co URLs with their original versions and expand known url shorteners - if 'entities' in tweet and 'urls' in tweet['entities'] and len(tweet['entities']['urls']) > 0: + # replace t.co URLs with their original versions + if 'entities' in tweet and 'urls' in tweet['entities']: for url in tweet['entities']['urls']: if 'url' in url and 'expanded_url' in url: - expanded_url = expand_short_url(url['expanded_url']) - if expanded_url != url['expanded_url']: - body = body.replace(url['url'], expanded_url) - else: - body = body.replace(url['url'], url['expanded_url']) + body = body.replace(url['url'], url['expanded_url']) # if the tweet is a reply, construct a header that links the names of the accounts being replied to the tweet being replied to header = '' if 'in_reply_to_status_id' in tweet: @@ -112,32 +106,6 @@ def tweet_json_to_markdown(tweet, username, archive_media_folder, output_media_f body = header + body + f'\n\n [{timestamp_str}](https://twitter.com/{username}/status/{tweet_id_str})' return timestamp, body -def is_short_url(url): - hostname = urlparse(url).hostname - shorteners = ['t.co', '7ax.de', 'bit.ly', 'buff.ly', 'cnn.it', 'ct.de', 'flic.kr', 'go.shr.lc', 'ift.tt', 'instagr.am', 'is.gd', 'j.mp', 'ku-rz.de', 'p.dw.com', 'pl0p.de', 'spon.de', 'sz.de', 'tiny.cc', 'tinyurl.com', 'trib.al', 'wp.me', 'www.sz.de', 'yfrog.com'] - if any(shortener in hostname for shortener in shorteners): - return True - return False - -def expand_short_url(url): - if is_short_url(url): - try: - request = requests.head(url, timeout=2) - except: - return url - if request.ok == False: - return url - try: - url_from_location_header = request.headers['location'] - except KeyError: - return url - if not url_from_location_header.startswith('http'): - return url - elif ':443' in url_from_location_header or is_short_url(url_from_location_header): - url_from_location_header = expand_short_url(url_from_location_header.replace('http:', 'https:')) - url = url_from_location_header - return url - def main(): input_folder = '.' From 9ddfe47573e3655a5cbf3d1e6fcde62ca1e81539 Mon Sep 17 00:00:00 2001 From: Andreas Baumgartner Date: Tue, 15 Nov 2022 00:32:53 +0100 Subject: [PATCH 3/4] move link expansion to separate file --- expand_urls.py | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 expand_urls.py diff --git a/expand_urls.py b/expand_urls.py new file mode 100644 index 0000000..c909bd1 --- /dev/null +++ b/expand_urls.py @@ -0,0 +1,95 @@ +import configparser +import glob +import os +import requests +import time +from urllib.parse import urlparse +from parser import read_json_from_js_file + +class URLExpander: + def __init__(self): + self.config = configparser.ConfigParser(allow_no_value=True, interpolation=None, strict=False) + self.config.optionxform = str + self.config.read('expand_urls.ini') + try: + self.shorteners = self.config.options('shorteners') + except configparser.Error: + print('No configuration found, using default configuration') + self.config['shorteners'] = {} + self.config['mappings'] = {} + self.shorteners = ['t.co', '7ax.de', 'bit.ly', 'buff.ly', 'cnn.it', 'ct.de', 'flic.kr', 'go.shr.lc', 'ift.tt', 'instagr.am', 'is.gd', 'j.mp', 'ku-rz.de', 'p.dw.com', 'pl0p.de', 'spon.de', 'sz.de', 'tiny.cc', 'tinyurl.com', 'trib.al', 'wp.me', 'www.sz.de', 'yfrog.com'] + [self.config.set('shorteners', x) for x in self.shorteners] + with open('expand_urls.ini', 'w') as inifile: + self.config.write(inifile) + + def get_input_filenames(self): + input_folder = '.' + + # Identify the file and folder names - they change slightly depending on the archive size it seems + data_folder = os.path.join(input_folder, 'data') + tweet_js_filename_templates = ['tweet.js', 'tweets.js', 'tweets-part*.js'] + input_filenames = [] + for tweet_js_filename_template in tweet_js_filename_templates: + input_filenames += glob.glob(os.path.join(data_folder, tweet_js_filename_template)) + if len(input_filenames)==0: + print(f'Error: no files matching {tweet_js_filename_templates} in {data_folder}') + exit() + return input_filenames + + def process_tweets(self): + for tweets_js_filename in self.get_input_filenames(): + print(f'Parsing {tweets_js_filename}...') + json = read_json_from_js_file(tweets_js_filename) + [self.parse_tweet(tweet) for tweet in json] + + def save_mapping(self, original_url, expanded_url): + self.config['mappings'][original_url] = expanded_url + with open('expand_urls.ini', 'w') as inifile: + self.config.write(inifile) + + def mapping_exists(self, original_url): + try: + tmp = self.config['mappings'][original_url] + except KeyError: # TODO: this fails always + return False + return True + + def parse_tweet(self, tweet): + tweet = tweet['tweet'] + if 'entities' in tweet and 'urls' in tweet['entities'] and len(tweet['entities']['urls']) > 0: + for url in tweet['entities']['urls']: + if 'url' in url and 'expanded_url' in url: + original_url = url['expanded_url'] + if not self.mapping_exists(original_url): + expanded_url = self.expand_short_url(original_url) + if expanded_url != original_url: + self.save_mapping(original_url, expanded_url) + + def is_short_url(self, url): + hostname = urlparse(url).hostname + if any(shortener == hostname for shortener in self.shorteners): + return True + return False + + def expand_short_url(self, url): + if self.is_short_url(url): + try: + request = requests.head(url) + time.sleep(0.75) + except: + pass + if request.ok == False: + return url + try: + url_from_location_header = request.headers['location'] + except KeyError: + return url + if not url_from_location_header.startswith('http'): + return url + elif ':443' in url_from_location_header or self.is_short_url(url_from_location_header): + url_from_location_header = self.expand_short_url(url_from_location_header.replace('http:', 'https:')) + url = url_from_location_header + return url + +if __name__ == '__main__': + URLExpander().process_tweets() From 2165d7c715e9d7082e814c905ac2e2cb53af0be3 Mon Sep 17 00:00:00 2001 From: Andreas Baumgartner Date: Tue, 15 Nov 2022 00:34:48 +0100 Subject: [PATCH 4/4] try to expand plaintext links in really old tweets --- expand_urls.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/expand_urls.py b/expand_urls.py index c909bd1..c3ae105 100644 --- a/expand_urls.py +++ b/expand_urls.py @@ -1,6 +1,7 @@ import configparser import glob import os +import re import requests import time from urllib.parse import urlparse @@ -64,6 +65,15 @@ def parse_tweet(self, tweet): expanded_url = self.expand_short_url(original_url) if expanded_url != original_url: self.save_mapping(original_url, expanded_url) + else: + # really old tweets may contain URLs as plain text in the body + possible_urls = re.finditer(r"https?://[a-z0-9\.]+/[a-z0-9?]{10}", tweet['full_text'], re.MULTILINE | re.IGNORECASE) + for (_, match) in enumerate(possible_urls): + matched_url = match.group(0) + if not self.mapping_exists(matched_url): + expanded_url = self.expand_short_url(matched_url) + if (expanded_url != matched_url): + self.save_mapping(matched_url, expanded_url) def is_short_url(self, url): hostname = urlparse(url).hostname