105 changes: 105 additions & 0 deletions expand_urls.py
@@ -0,0 +1,105 @@
import configparser
import glob
import os
import re
import sys
import requests
import time
from urllib.parse import urlparse
from parser import read_json_from_js_file
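
# This script walks the tweets in an unpacked Twitter archive, expands URLs that point
# at known shortener domains by following their redirects, and caches the results in
# expand_urls.ini so later runs do not have to hit the network again.
#
# The generated ini file roughly looks like this (a sketch with made-up example values):
#
#   [shorteners]
#   t.co
#   bit.ly
#
#   [mappings]
#   https://t.co/AbCd123456 = https://example.com/some/long/article-url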

class URLExpander:
    def __init__(self):
        # ':' is one of configparser's default key/value delimiters and appears in every
        # URL, so restrict the delimiters to '=' only; otherwise saved mappings cannot be
        # read back on the next run. (URLs containing '=' would still be a problem.)
        self.config = configparser.ConfigParser(allow_no_value=True, interpolation=None, strict=False, delimiters=('=',))
        self.config.optionxform = str  # keep option names (URLs, hostnames) case-sensitive
        self.config.read('expand_urls.ini')
        try:
            self.shorteners = self.config.options('shorteners')
        except configparser.Error:
            print('No configuration found, using default configuration')
            self.config['shorteners'] = {}
            self.shorteners = ['t.co', '7ax.de', 'bit.ly', 'buff.ly', 'cnn.it', 'ct.de', 'flic.kr', 'go.shr.lc', 'ift.tt', 'instagr.am', 'is.gd', 'j.mp', 'ku-rz.de', 'p.dw.com', 'pl0p.de', 'spon.de', 'sz.de', 'tiny.cc', 'tinyurl.com', 'trib.al', 'wp.me', 'www.sz.de', 'yfrog.com']
            for shortener in self.shorteners:
                self.config.set('shorteners', shortener)
        if not self.config.has_section('mappings'):
            self.config['mappings'] = {}
        with open('expand_urls.ini', 'w') as inifile:
            self.config.write(inifile)

    def get_input_filenames(self):
        input_folder = '.'

        # Identify the tweet file names - they vary slightly with the size of the archive
        data_folder = os.path.join(input_folder, 'data')
        tweet_js_filename_templates = ['tweet.js', 'tweets.js', 'tweets-part*.js']
        input_filenames = []
        for tweet_js_filename_template in tweet_js_filename_templates:
            input_filenames += glob.glob(os.path.join(data_folder, tweet_js_filename_template))
        if not input_filenames:
            print(f'Error: no files matching {tweet_js_filename_templates} in {data_folder}')
            sys.exit(1)
        return input_filenames

    def process_tweets(self):
        for tweets_js_filename in self.get_input_filenames():
            print(f'Parsing {tweets_js_filename}...')
            tweets = read_json_from_js_file(tweets_js_filename)
            for tweet in tweets:
                self.parse_tweet(tweet)

    def save_mapping(self, original_url, expanded_url):
        # write the updated mappings back to expand_urls.ini right away
        self.config['mappings'][original_url] = expanded_url
        with open('expand_urls.ini', 'w') as inifile:
            self.config.write(inifile)

    def mapping_exists(self, original_url):
        # has_option() returns False when either the section or the option is missing
        return self.config.has_option('mappings', original_url)

    def parse_tweet(self, tweet):
        tweet = tweet['tweet']
        if 'entities' in tweet and 'urls' in tweet['entities'] and len(tweet['entities']['urls']) > 0:
            for url in tweet['entities']['urls']:
                if 'url' in url and 'expanded_url' in url:
                    original_url = url['expanded_url']
                    if not self.mapping_exists(original_url):
                        expanded_url = self.expand_short_url(original_url)
                        if expanded_url != original_url:
                            self.save_mapping(original_url, expanded_url)
        else:
            # really old tweets may contain URLs as plain text in the body
            possible_urls = re.finditer(r"https?://[a-z0-9\.]+/[a-z0-9?]{10}", tweet['full_text'], re.MULTILINE | re.IGNORECASE)
            for match in possible_urls:
                matched_url = match.group(0)
                if not self.mapping_exists(matched_url):
                    expanded_url = self.expand_short_url(matched_url)
                    if expanded_url != matched_url:
                        self.save_mapping(matched_url, expanded_url)

    def is_short_url(self, url):
        hostname = urlparse(url).hostname
        return hostname in self.shorteners

    def expand_short_url(self, url):
        if self.is_short_url(url):
            try:
                # HEAD is enough to read the redirect target; the timeout keeps a dead
                # shortener from stalling the whole run
                request = requests.head(url, timeout=10)
                time.sleep(0.75)  # be gentle with the shortener services
            except requests.RequestException:
                return url
            if not request.ok:
                return url
            try:
                url_from_location_header = request.headers['location']
            except KeyError:
                return url
            if not url_from_location_header.startswith('http'):
                return url
            elif ':443' in url_from_location_header or self.is_short_url(url_from_location_header):
                # follow chained shorteners, preferring https for the next hop
                url_from_location_header = self.expand_short_url(url_from_location_header.replace('http:', 'https:'))
            url = url_from_location_header
        return url

if __name__ == '__main__':
    URLExpander().process_tweets()
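
# Usage sketch (assumption: the script is run from the root of an unpacked Twitter
# archive, next to the data/ folder that contains tweet.js / tweets*.js):
#
#   python expand_urls.py
#
# Expanded URLs accumulate in the [mappings] section of expand_urls.ini.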