From 4034b4dce8feb634675f4d8974e2415017a560e4 Mon Sep 17 00:00:00 2001
From: Andreas Baumgartner <mail@andreas.bz.it>
Date: Sun, 13 Nov 2022 23:57:56 +0100
Subject: [PATCH 1/4] expand shortened urls

---
 parser.py | 38 +++++++++++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)
diff --git a/parser.py b/parser.py
index 0e5a5aa..aba6a4a 100644
--- a/parser.py
+++ b/parser.py
@@ -23,7 +23,9 @@
 import json
 import os
 import re
+import requests
 import shutil
+from urllib.parse import urlparse
 
 def read_json_from_js_file(filename):
     """Reads the contents of a Twitter-produced .js file into a dictionary."""
@@ -49,11 +51,15 @@ def tweet_json_to_markdown(tweet, username, archive_media_folder, output_media_f
     timestamp = int(round(datetime.datetime.strptime(timestamp_str, '%a %b %d %X %z %Y').timestamp())) # Example: Tue Mar 19 14:05:17 +0000 2019
     body = tweet['full_text']
     tweet_id_str = tweet['id_str']
-    # replace t.co URLs with their original versions
-    if 'entities' in tweet and 'urls' in tweet['entities']:
+    # replace t.co URLs with their original versions and expand known url shorteners
+    if 'entities' in tweet and 'urls' in tweet['entities'] and len(tweet['entities']['urls']) > 0:
         for url in tweet['entities']['urls']:
             if 'url' in url and 'expanded_url' in url:
-                body = body.replace(url['url'], url['expanded_url'])
+                expanded_url = expand_short_url(url['expanded_url'])
+                if expanded_url != url['expanded_url']:
+                    body = body.replace(url['url'], expanded_url)
+                else:
+                    body = body.replace(url['url'], url['expanded_url'])
     # if the tweet is a reply, construct a header that links the names of the accounts being replied to the tweet being replied to
     header = ''
     if 'in_reply_to_status_id' in tweet:
@@ -106,6 +112,32 @@ def tweet_json_to_markdown(tweet, username, archive_media_folder, output_media_f
     body = header + body + f'\n\n<img src="media/tweet.ico" width="12" /> [{timestamp_str}](https://twitter.com/{username}/status/{tweet_id_str})'
     return timestamp, body
 
+def is_short_url(url):
+    hostname = urlparse(url).hostname
+    shorteners = ['t.co', '7ax.de', 'bit.ly', 'buff.ly', 'cnn.it', 'ct.de', 'flic.kr', 'go.shr.lc', 'ift.tt', 'instagr.am', 'is.gd', 'j.mp', 'ku-rz.de', 'p.dw.com', 'pl0p.de', 'spon.de', 'sz.de', 'tiny.cc', 'tinyurl.com', 'trib.al', 'wp.me', 'www.sz.de', 'yfrog.com']
+    if any(shortener in hostname for shortener in shorteners):
+        return True
+    return False
+
+def expand_short_url(url):
+    if is_short_url(url):
+        try:
+            request = requests.head(url, timeout=2)
+        except:
+            return url
+        if request.ok == False:
+            return url
+        try:
+            url_from_location_header = request.headers['location']
+        except KeyError:
+            return url
+        if not url_from_location_header.startswith('http'):
+            return url
+        elif ':443' in url_from_location_header or is_short_url(url_from_location_header):
+            url_from_location_header = expand_short_url(url_from_location_header.replace('http:', 'https:'))
+        url = url_from_location_header
+    return url
+
 def main():
 
     input_folder = '.'

From 9841a326285fd5b941b76541ee242212e5ad96be Mon Sep 17 00:00:00 2001
From: Andreas Baumgartner <mail@andreas.bz.it>
Date: Mon, 14 Nov 2022 22:52:15 +0100
Subject: [PATCH 2/4] Revert "expand shortened urls"

This reverts commit 4034b4dce8feb634675f4d8974e2415017a560e4.
---
 parser.py | 38 +++-----------------------------------
 1 file changed, 3 insertions(+), 35 deletions(-)

diff --git a/parser.py b/parser.py
index aba6a4a..0e5a5aa 100644
--- a/parser.py
+++ b/parser.py
@@ -23,9 +23,7 @@
 import json
 import os
 import re
-import requests
 import shutil
-from urllib.parse import urlparse
 
 def read_json_from_js_file(filename):
     """Reads the contents of a Twitter-produced .js file into a dictionary."""
@@ -51,15 +49,11 @@ def tweet_json_to_markdown(tweet, username, archive_media_folder, output_media_f
     timestamp = int(round(datetime.datetime.strptime(timestamp_str, '%a %b %d %X %z %Y').timestamp())) # Example: Tue Mar 19 14:05:17 +0000 2019
     body = tweet['full_text']
     tweet_id_str = tweet['id_str']
-    # replace t.co URLs with their original versions and expand known url shorteners
-    if 'entities' in tweet and 'urls' in tweet['entities'] and len(tweet['entities']['urls']) > 0:
+    # replace t.co URLs with their original versions
+    if 'entities' in tweet and 'urls' in tweet['entities']:
         for url in tweet['entities']['urls']:
             if 'url' in url and 'expanded_url' in url:
-                expanded_url = expand_short_url(url['expanded_url'])
-                if expanded_url != url['expanded_url']:
-                    body = body.replace(url['url'], expanded_url)
-                else:
-                    body = body.replace(url['url'], url['expanded_url'])
+                body = body.replace(url['url'], url['expanded_url'])
     # if the tweet is a reply, construct a header that links the names of the accounts being replied to the tweet being replied to
     header = ''
     if 'in_reply_to_status_id' in tweet:
@@ -112,32 +106,6 @@ def tweet_json_to_markdown(tweet, username, archive_media_folder, output_media_f
     body = header + body + f'\n\n<img src="media/tweet.ico" width="12" /> [{timestamp_str}](https://twitter.com/{username}/status/{tweet_id_str})'
     return timestamp, body
 
-def is_short_url(url):
-    hostname = urlparse(url).hostname
-    shorteners = ['t.co', '7ax.de', 'bit.ly', 'buff.ly', 'cnn.it', 'ct.de', 'flic.kr', 'go.shr.lc', 'ift.tt', 'instagr.am', 'is.gd', 'j.mp', 'ku-rz.de', 'p.dw.com', 'pl0p.de', 'spon.de', 'sz.de', 'tiny.cc', 'tinyurl.com', 'trib.al', 'wp.me', 'www.sz.de', 'yfrog.com']
-    if any(shortener in hostname for shortener in shorteners):
-        return True
-    return False
-
-def expand_short_url(url):
-    if is_short_url(url):
-        try:
-            request = requests.head(url, timeout=2)
-        except:
-            return url
-        if request.ok == False:
-            return url
-        try:
-            url_from_location_header = request.headers['location']
-        except KeyError:
-            return url
-        if not url_from_location_header.startswith('http'):
-            return url
-        elif ':443' in url_from_location_header or is_short_url(url_from_location_header):
-            url_from_location_header = expand_short_url(url_from_location_header.replace('http:', 'https:'))
-        url = url_from_location_header
-    return url
-
 def main():
 
     input_folder = '.'

From 9ddfe47573e3655a5cbf3d1e6fcde62ca1e81539 Mon Sep 17 00:00:00 2001
From: Andreas Baumgartner <mail@andreas.bz.it>
Date: Tue, 15 Nov 2022 00:32:53 +0100
Subject: [PATCH 3/4] move link expansion to separate file

---
 expand_urls.py | 129 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 129 insertions(+)
 create mode 100644 expand_urls.py

diff --git a/expand_urls.py b/expand_urls.py
new file mode 100644
index 0000000..c909bd1
--- /dev/null
+++ b/expand_urls.py
@@ -0,0 +1,129 @@
+import configparser
+import glob
+import os
+import requests
+import time
+from urllib.parse import urlparse
+from parser import read_json_from_js_file
+
+class URLExpander:
+    """Resolve shortened URLs found in a Twitter archive and cache the results.
+
+    The known shortener hostnames and the discovered mappings from short URL
+    to expanded URL are both stored in ``expand_urls.ini`` in the current
+    working directory.
+    """
+
+    def __init__(self):
+        """Read ``expand_urls.ini``; write a default one if it has no shorteners section."""
+        # Only '=' separates key and value. The mapping keys are URLs, and with
+        # configparser's default delimiters ('=' and ':') a stored key such as
+        # 'https://bit.ly/x' is cut at its first ':' when the file is read back,
+        # so a saved mapping could never be found again on the next run.
+        self.config = configparser.ConfigParser(allow_no_value=True, delimiters=('=',), interpolation=None, strict=False)
+        # keep option names case-sensitive instead of lower-casing them
+        self.config.optionxform = str
+        self.config.read('expand_urls.ini')
+        try:
+            self.shorteners = self.config.options('shorteners')
+        except configparser.Error:
+            print('No configuration found, using default configuration')
+            self.config['shorteners'] = {}
+            self.config['mappings'] = {}
+            self.shorteners = ['t.co', '7ax.de', 'bit.ly', 'buff.ly', 'cnn.it', 'ct.de', 'flic.kr', 'go.shr.lc', 'ift.tt', 'instagr.am', 'is.gd', 'j.mp', 'ku-rz.de', 'p.dw.com', 'pl0p.de', 'spon.de', 'sz.de', 'tiny.cc', 'tinyurl.com', 'trib.al', 'wp.me', 'www.sz.de', 'yfrog.com']
+            for shortener in self.shorteners:
+                self.config.set('shorteners', shortener)
+            with open('expand_urls.ini', 'w') as inifile:
+                self.config.write(inifile)
+        # save_mapping() needs this section even if the ini file did not contain it
+        if not self.config.has_section('mappings'):
+            self.config.add_section('mappings')
+
+    def get_input_filenames(self):
+        """Return the paths of the tweet .js files in ``./data``; exit if none exist."""
+        input_folder = '.'
+
+        # Identify the file and folder names - they change slightly depending on the archive size it seems
+        data_folder = os.path.join(input_folder, 'data')
+        tweet_js_filename_templates = ['tweet.js', 'tweets.js', 'tweets-part*.js']
+        input_filenames = []
+        for tweet_js_filename_template in tweet_js_filename_templates:
+            input_filenames += glob.glob(os.path.join(data_folder, tweet_js_filename_template))
+        if not input_filenames:
+            print(f'Error: no files matching {tweet_js_filename_templates} in {data_folder}')
+            # report the failure to the calling shell through a non-zero exit status
+            raise SystemExit(1)
+        return input_filenames
+
+    def process_tweets(self):
+        """Expand the short URLs of every tweet in every archive file."""
+        for tweets_js_filename in self.get_input_filenames():
+            print(f'Parsing {tweets_js_filename}...')
+            for tweet in read_json_from_js_file(tweets_js_filename):
+                self.parse_tweet(tweet)
+
+    def save_mapping(self, original_url, expanded_url):
+        """Record ``original_url -> expanded_url`` and rewrite the ini file."""
+        self.config['mappings'][original_url] = expanded_url
+        # the file is rewritten after every new mapping, so progress is on disk immediately
+        with open('expand_urls.ini', 'w') as inifile:
+            self.config.write(inifile)
+
+    def mapping_exists(self, original_url):
+        """Return True if an expansion of ``original_url`` is already stored."""
+        return self.config.has_option('mappings', original_url)
+
+    def parse_tweet(self, tweet):
+        """Expand and store the shortened URLs listed in one archive entry.
+
+        ``tweet`` is one element of the parsed archive file; the tweet data
+        itself is read from its ``'tweet'`` key.
+        """
+        tweet = tweet['tweet']
+        if 'entities' in tweet and 'urls' in tweet['entities'] and len(tweet['entities']['urls']) > 0:
+            for url in tweet['entities']['urls']:
+                if 'url' in url and 'expanded_url' in url:
+                    # 'expanded_url' may itself point to a shortener, so it is the lookup key
+                    original_url = url['expanded_url']
+                    if not self.mapping_exists(original_url):
+                        expanded_url = self.expand_short_url(original_url)
+                        if expanded_url != original_url:
+                            self.save_mapping(original_url, expanded_url)
+
+    def is_short_url(self, url):
+        hostname = urlparse(url).hostname
+        # hostname is None when the string has no network location; None is never listed
+        return hostname in self.shorteners
+
+    def expand_short_url(self, url, max_hops=10):
+        """Follow the redirect of a shortened URL and return its target.
+
+        ``url`` is returned unchanged when its host is not a known shortener,
+        when the request fails or is answered with an error status, or when
+        the response has no absolute ``Location`` header. ``max_hops`` limits
+        how many chained shorteners are followed, so a redirect loop between
+        shorteners cannot recurse without end.
+        """
+        if max_hops <= 0 or not self.is_short_url(url):
+            return url
+        try:
+            # requests.head() does not follow redirects by default, so the
+            # Location header of this hop stays visible in the response
+            response = requests.head(url, timeout=10)
+        except requests.RequestException:
+            # unreachable or misbehaving shortener: keep the short URL
+            return url
+        # throttle successive requests (presumably to avoid rate limiting by the services)
+        time.sleep(0.75)
+        if not response.ok:
+            return url
+        location = response.headers.get('location')
+        if location is None or not location.startswith('http'):
+            return url
+        # a target that names port 443 or is itself a short link is switched to https and expanded again
+        if ':443' in location or self.is_short_url(location):
+            location = self.expand_short_url(location.replace('http:', 'https:'), max_hops - 1)
+        return location
+
+if __name__ == '__main__':
+    URLExpander().process_tweets()

From 2165d7c715e9d7082e814c905ac2e2cb53af0be3 Mon Sep 17 00:00:00 2001
From: Andreas Baumgartner <mail@andreas.bz.it>
Date: Tue, 15 Nov 2022 00:34:48 +0100
Subject: [PATCH 4/4] try to expand plaintext links in really old tweets

---
 expand_urls.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/expand_urls.py b/expand_urls.py
index c909bd1..c3ae105 100644
--- a/expand_urls.py
+++ b/expand_urls.py
@@ -1,6 +1,7 @@
 import configparser
 import glob
 import os
+import re
 import requests
 import time
 from urllib.parse import urlparse
@@ -64,6 +65,15 @@ def parse_tweet(self, tweet):
                         expanded_url = self.expand_short_url(original_url)
                         if expanded_url != original_url:
                             self.save_mapping(original_url, expanded_url)
+        else:
+            # really old tweets may contain URLs as plain text in the body
+            possible_urls = re.finditer(r"https?://[a-z0-9\.]+/[a-z0-9?]{10}", tweet['full_text'], re.MULTILINE | re.IGNORECASE)
+            for (_, match) in enumerate(possible_urls):
+                matched_url = match.group(0)
+                if not self.mapping_exists(matched_url):
+                    expanded_url = self.expand_short_url(matched_url)
+                    if (expanded_url != matched_url):
+                        self.save_mapping(matched_url, expanded_url)
 
     def is_short_url(self, url):
         hostname = urlparse(url).hostname