105 changes: 105 additions & 0 deletions expand_urls.py
@@ -0,0 +1,105 @@
import configparser
import glob
import os
import re
import sys
import requests
import time
from urllib.parse import urlparse
from parser import read_json_from_js_file
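
# This script walks the tweets in an unpacked Twitter archive, expands URLs that point
# at known shortener domains by following their redirects, and caches the results in
# expand_urls.ini so later runs do not have to hit the network again.
#
# The generated ini file roughly looks like this (a sketch with made-up example values):
#
#   [shorteners]
#   t.co
#   bit.ly
#
#   [mappings]
#   https://t.co/AbCd123456 = https://example.com/some/long/article-url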

class URLExpander:
    def __init__(self):
        # ':' is one of configparser's default key/value delimiters and appears in every
        # URL, so restrict the delimiters to '=' only; otherwise saved mappings cannot be
        # read back on the next run. (URLs containing '=' would still be a problem.)
        self.config = configparser.ConfigParser(allow_no_value=True, interpolation=None, strict=False, delimiters=('=',))
        self.config.optionxform = str  # keep option names (URLs, hostnames) case-sensitive
        self.config.read('expand_urls.ini')
        try:
            self.shorteners = self.config.options('shorteners')
        except configparser.Error:
            print('No configuration found, using default configuration')
            self.config['shorteners'] = {}
            self.shorteners = ['t.co', '7ax.de', 'bit.ly', 'buff.ly', 'cnn.it', 'ct.de', 'flic.kr', 'go.shr.lc', 'ift.tt', 'instagr.am', 'is.gd', 'j.mp', 'ku-rz.de', 'p.dw.com', 'pl0p.de', 'spon.de', 'sz.de', 'tiny.cc', 'tinyurl.com', 'trib.al', 'wp.me', 'www.sz.de', 'yfrog.com']
            for shortener in self.shorteners:
                self.config.set('shorteners', shortener)
        if not self.config.has_section('mappings'):
            self.config['mappings'] = {}
        with open('expand_urls.ini', 'w') as inifile:
            self.config.write(inifile)

    def get_input_filenames(self):
        input_folder = '.'

        # Identify the tweet file names - they vary slightly with the size of the archive
        data_folder = os.path.join(input_folder, 'data')
        tweet_js_filename_templates = ['tweet.js', 'tweets.js', 'tweets-part*.js']
        input_filenames = []
        for tweet_js_filename_template in tweet_js_filename_templates:
            input_filenames += glob.glob(os.path.join(data_folder, tweet_js_filename_template))
        if not input_filenames:
            print(f'Error: no files matching {tweet_js_filename_templates} in {data_folder}')
            sys.exit(1)
        return input_filenames

    def process_tweets(self):
        for tweets_js_filename in self.get_input_filenames():
            print(f'Parsing {tweets_js_filename}...')
            tweets = read_json_from_js_file(tweets_js_filename)
            for tweet in tweets:
                self.parse_tweet(tweet)

    def save_mapping(self, original_url, expanded_url):
        # write the updated mappings back to expand_urls.ini right away
        self.config['mappings'][original_url] = expanded_url
        with open('expand_urls.ini', 'w') as inifile:
            self.config.write(inifile)

    def mapping_exists(self, original_url):
        # has_option() returns False when either the section or the option is missing
        return self.config.has_option('mappings', original_url)

    def parse_tweet(self, tweet):
        tweet = tweet['tweet']
        if 'entities' in tweet and 'urls' in tweet['entities'] and len(tweet['entities']['urls']) > 0:
            for url in tweet['entities']['urls']:
                if 'url' in url and 'expanded_url' in url:
                    original_url = url['expanded_url']
                    if not self.mapping_exists(original_url):
                        expanded_url = self.expand_short_url(original_url)
                        if expanded_url != original_url:
                            self.save_mapping(original_url, expanded_url)
        else:
            # really old tweets may contain URLs as plain text in the body
            possible_urls = re.finditer(r"https?://[a-z0-9\.]+/[a-z0-9?]{10}", tweet['full_text'], re.MULTILINE | re.IGNORECASE)
            for match in possible_urls:
                matched_url = match.group(0)
                if not self.mapping_exists(matched_url):
                    expanded_url = self.expand_short_url(matched_url)
                    if expanded_url != matched_url:
                        self.save_mapping(matched_url, expanded_url)

    def is_short_url(self, url):
        hostname = urlparse(url).hostname
        return hostname in self.shorteners

    def expand_short_url(self, url):
        if self.is_short_url(url):
            try:
                # HEAD is enough to read the redirect target; the timeout keeps a dead
                # shortener from stalling the whole run
                request = requests.head(url, timeout=10)
                time.sleep(0.75)  # be gentle with the shortener services
            except requests.RequestException:
                return url
            if not request.ok:
                return url
            try:
                url_from_location_header = request.headers['location']
            except KeyError:
                return url
            if not url_from_location_header.startswith('http'):
                return url
            elif ':443' in url_from_location_header or self.is_short_url(url_from_location_header):
                # follow chained shorteners, preferring https for the next hop
                url_from_location_header = self.expand_short_url(url_from_location_header.replace('http:', 'https:'))
            url = url_from_location_header
        return url

if __name__ == '__main__':
    URLExpander().process_tweets()
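
# Usage sketch (assumption: the script is run from the root of an unpacked Twitter
# archive, next to the data/ folder that contains tweet.js / tweets*.js):
#
#   python expand_urls.py
#
# Expanded URLs accumulate in the [mappings] section of expand_urls.ini.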