diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 0000000..04312ae
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,72 @@
+# Description of the process
+
+## Parsing of the tables
+
+### links.txt
+- `pl_from` -> Id of the "from" page of this link
+- (`pl_namespace`) -> We keep only if equals 0 (= namespace of the "from" page of this link)
+- `pl_target_id` -> Target of this link (foreign key to `linktarget`)
+
+### targets.txt
+- `lt_id` -> Id of this link (index)
+- (`lt_ns`) -> We keep only if equals 0 (= namespace of the targeted page)
+- `lt_title` -> Title of the targeted page
+
+### pages.txt
+- `page_id` -> Id of the page
+- (`page_namespace`) -> We keep only if equals 0 (= namespace of this page)
+- `page_title` -> Title of this page
+- `page_is_redirect` -> Boolean whether this page is a redirect
+- Ignore the eight following
+
+### redirects.txt
+- `rd_from` -> Id of the page from which we are redirected
+- (`rd_namespace`) -> We keep only if equals 0 (= namespace of the page we are redirected to)
+- `rd_title` -> Title of the page we are redirected to
+- Ignore the two following
+
+## Joining the tables
+
+### redirects.with_ids.txt (replace_titles_in_redirects_file.py)
+Replaces, for each redirection, `rd_title` with the targeted `page_id` by matching on `page_title`.
+The targeted `page_id` is then followed through further redirects recursively, until we reach a "final" page.
+- `rd_from` -> The id of the page we are redirected from
+- `page_id` -> The id of the page we get to by following redirections recursively
+
+### targets.with_ids.txt (replace_titles_and_redirects_in_targets_file.py)
+Replaces, for each linktarget, `lt_title` with the targeted `page_id` by matching on `page_title`.
+We then compute the "final" page obtained from this page by following redirections, using the file `redirects.with_ids.txt`.
+- `lt_id` -> Id of this link
+- `page_id` -> The id of the page this link is pointing to, after having followed all redirections
+
+### links.with_ids.txt (replace_titles_and_redirects_in_links_file.py)
+Replaces, for each pagelink, `lt_id` with the targeted `page_id` by joining with `targets.with_ids.txt`.
+- `pl_from` -> Id of the "from" page, after having followed all redirections
+- `page_id` -> Id of the "to" page, after having followed all redirections
+
+### pages.pruned.txt (prune_pages_file.py)
+Prunes the pages file by removing pages which are marked as redirects but have no corresponding redirect in the redirects file.
+
+## Sorting, grouping, and counting the links
+
+### links.sorted_by_XXX_id.txt
+We then sort `links.with_ids.txt` according to the first "source" id, into
+the file `links.sorted_by_source_id.txt`, and according to the second "target" id
+into the file `links.sorted_by_target_id.txt`.
+
+### links.grouped_by_XXX_id.txt
+Then, we use those two files to *GROUP BY* the links by source and by target.
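As a quick illustration of the join described in the README above, here is a minimal sketch (not part of the patch; all ids and titles are made up) of how one pagelink row is resolved through `linktarget`, `page` and `redirect` into a final (from, to) pair:

```python
# Toy data in the shapes documented above (made-up values).
link = (10, 7)                        # pagelinks row: (pl_from, pl_target_id)
linktarget_titles = {7: "Paris"}      # lt_id -> lt_title
page_ids_by_title = {"Paris": 42}     # page_title -> page_id
redirects = {42: 99}                  # page 42 is a redirect to page 99

source_id, lt_id = link
target_id = page_ids_by_title[linktarget_titles[lt_id]]  # 42
target_id = redirects.get(target_id, target_id)          # 99, the "final" page
print((source_id, target_id))                            # (10, 99)
```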
+The file `links.grouped_by_source_id.txt` looks like this:
+- `pl_from` -> Id of the "from" page
+- `targets` -> A `|`-separated string of the ids the "from" page targets
+
+The file `links.grouped_by_target_id.txt` looks like this:
+- `froms` -> A `|`-separated string of the ids of the pages targeting the "target" page
+- `pl_target` -> Id of the "target" page
+
+### links.with_counts.txt (combine_grouped_links_files.py)
+Combines the two grouped files into one line per page: `page_id`, outgoing link count, incoming link count, the outgoing ids and the incoming ids.
+
+## Making the database
+The files `redirects.with_ids.txt`, `pages.pruned.txt` and `links.with_counts.txt` are then loaded into the `sdow.sqlite` database using the SQL scripts in `sql/`.
diff --git a/scripts/buildDatabase.sh b/scripts/buildDatabase.sh
index 8103a58..5f622ed 100755
--- a/scripts/buildDatabase.sh
+++ b/scripts/buildDatabase.sh
@@ -1,15 +1,19 @@
 #!/bin/bash
-
 set -euo pipefail
 
 # Force default language for output sorting to be bytewise. Necessary to ensure uniformity amongst
 # UNIX commands.
 export LC_ALL=C
 
+# These variables can be set from the external environment
+WLANG="${WLANG:-en}"
+OUT_DIR="${OUT_DIR:-dump}"
+DELETE_PROGRESSIVELY=${DELETE_PROGRESSIVELY:-false}
+
 # By default, the latest Wikipedia dump will be downloaded. If a download date in the format
 # YYYYMMDD is provided as the first argument, it will be used instead.
 if [[ $# -eq 0 ]]; then
-  DOWNLOAD_DATE=$(wget -q -O- https://dumps.wikimedia.org/enwiki/ | grep -Po '\d{8}' | sort | tail -n1)
+  DOWNLOAD_DATE=$(wget -q -O- https://dumps.wikimedia.org/${WLANG}wiki/ | grep -Po '\d{8}' | sort | tail -n1)
 else
   if [ ${#1} -ne 8 ]; then
     echo "[ERROR] Invalid download date provided: $1"
@@ -19,17 +23,17 @@ else
   fi
 fi
 
-ROOT_DIR=`pwd`
-OUT_DIR="dump"
+# Root directory is that of this script
+ROOT_DIR=$(dirname "$0")
 
-DOWNLOAD_URL="https://dumps.wikimedia.org/enwiki/$DOWNLOAD_DATE"
-TORRENT_URL="https://dump-torrents.toolforge.org/enwiki/$DOWNLOAD_DATE"
-
-SHA1SUM_FILENAME="enwiki-$DOWNLOAD_DATE-sha1sums.txt"
-REDIRECTS_FILENAME="enwiki-$DOWNLOAD_DATE-redirect.sql.gz"
-PAGES_FILENAME="enwiki-$DOWNLOAD_DATE-page.sql.gz"
-LINKS_FILENAME="enwiki-$DOWNLOAD_DATE-pagelinks.sql.gz"
+DOWNLOAD_URL="https://dumps.wikimedia.org/${WLANG}wiki/$DOWNLOAD_DATE"
+TORRENT_URL="https://dump-torrents.toolforge.org/${WLANG}wiki/$DOWNLOAD_DATE"
+SHA1SUM_FILENAME="${WLANG}wiki-$DOWNLOAD_DATE-sha1sums.txt"
+REDIRECTS_FILENAME="${WLANG}wiki-$DOWNLOAD_DATE-redirect.sql.gz"
+PAGES_FILENAME="${WLANG}wiki-$DOWNLOAD_DATE-page.sql.gz"
+LINKS_FILENAME="${WLANG}wiki-$DOWNLOAD_DATE-pagelinks.sql.gz"
+TARGETS_FILENAME="${WLANG}wiki-$DOWNLOAD_DATE-linktarget.sql.gz"
 
 # Make the output directory if it doesn't already exist and move to it
 mkdir -p $OUT_DIR
@@ -79,6 +83,7 @@ download_file "sha1sums" $SHA1SUM_FILENAME
 download_file "redirects" $REDIRECTS_FILENAME
 download_file "pages" $PAGES_FILENAME
 download_file "links" $LINKS_FILENAME
+download_file "targets" $TARGETS_FILENAME
 ##########################
 # TRIM WIKIPEDIA DUMPS #
 ##########################
@@ -105,7 +110,7 @@ if [ ! -f redirects.txt.gz ]; then
 else
   echo "[WARN] Already trimmed redirects file"
 fi
-
+if $DELETE_PROGRESSIVELY; then rm $REDIRECTS_FILENAME; fi
 if [ ! -f pages.txt.gz ]; then
   echo
   echo "[INFO] Trimming pages file"
@@ -118,16 +123,16 @@ if [ ! -f pages.txt.gz ]; then
   # Splice out the page title and whether or not the page is a redirect
   # Zip into output file
   time pigz -dc $PAGES_FILENAME \
-    | sed -n 's/^INSERT INTO `page` VALUES (//p' \
-    | sed -e 's/),(/\'$'\n/g' \
-    | egrep "^[0-9]+,0," \
-    | sed -e $"s/,0,'/\t/" \
-    | sed -e $"s/',[^,]*,\([01]\).*/\t\1/" \
+    | sed -n 's/^INSERT INTO `page` VALUES //p' \
+    | egrep -o "\([0-9]+,0,'([^']*(\\\\')?)+',[01]," \
+    | sed -re $"s/^\(([0-9]+),0,'/\1\t/" \
+    | sed -re $"s/',([01]),/\t\1/" \
     | pigz --fast > pages.txt.gz.tmp
   mv pages.txt.gz.tmp pages.txt.gz
 else
   echo "[WARN] Already trimmed pages file"
 fi
+if $DELETE_PROGRESSIVELY; then rm $PAGES_FILENAME; fi
 
 if [ ! -f links.txt.gz ]; then
   echo
@@ -143,14 +148,38 @@ if [ ! -f links.txt.gz ]; then
   time pigz -dc $LINKS_FILENAME \
     | sed -n 's/^INSERT INTO `pagelinks` VALUES (//p' \
     | sed -e 's/),(/\'$'\n/g' \
-    | egrep "^[0-9]+,0,.*,0$" \
-    | sed -e $"s/,0,'/\t/g" \
-    | sed -e "s/',0//g" \
+    | egrep "^[0-9]+,0,[0-9]+$" \
+    | sed -e $"s/,0,/\t/g" \
     | pigz --fast > links.txt.gz.tmp
   mv links.txt.gz.tmp links.txt.gz
 else
   echo "[WARN] Already trimmed links file"
 fi
+if $DELETE_PROGRESSIVELY; then rm $LINKS_FILENAME; fi
+
+if [ ! -f targets.txt.gz ]; then
+  echo
+  echo "[INFO] Trimming targets file"
+
+  # Unzip
+  # Remove all lines that don't start with INSERT INTO...
+  # Split into individual records
+  # Only keep records in namespace 0
+  # Replace the namespace with a tab
+  # Remove everything starting at the target page name's closing apostrophe
+  # Zip into output file
+  time pigz -dc $TARGETS_FILENAME \
+    | sed -n 's/^INSERT INTO `linktarget` VALUES (//p' \
+    | sed -e 's/),(/\'$'\n/g' \
+    | egrep "^[0-9]+,0,.*$" \
+    | sed -e $"s/,0,'/\t/g" \
+    | sed -e "s/'$//g" \
+    | pigz --fast > targets.txt.gz.tmp
+  mv targets.txt.gz.tmp targets.txt.gz
+else
+  echo "[WARN] Already trimmed targets file"
+fi
+if $DELETE_PROGRESSIVELY; then rm $TARGETS_FILENAME; fi
 
 
 ###########################################
@@ -166,16 +195,29 @@ if [ ! -f redirects.with_ids.txt.gz ]; then
 else
   echo "[WARN] Already replaced titles in redirects file"
 fi
+if $DELETE_PROGRESSIVELY; then rm redirects.txt.gz; fi
+
+if [ ! -f targets.with_ids.txt.gz ]; then
+  echo
+  echo "[INFO] Replacing titles and redirects in targets file"
+  time python "$ROOT_DIR/replace_titles_and_redirects_in_targets_file.py" pages.txt.gz redirects.with_ids.txt.gz targets.txt.gz \
+    | pigz --fast > targets.with_ids.txt.gz.tmp
+  mv targets.with_ids.txt.gz.tmp targets.with_ids.txt.gz
+else
+  echo "[WARN] Already replaced titles and redirects in targets file"
+fi
+if $DELETE_PROGRESSIVELY; then rm targets.txt.gz; fi
 
 if [ ! -f links.with_ids.txt.gz ]; then
   echo
   echo "[INFO] Replacing titles and redirects in links file"
-  time python "$ROOT_DIR/replace_titles_and_redirects_in_links_file.py" pages.txt.gz redirects.with_ids.txt.gz links.txt.gz \
+  time python "$ROOT_DIR/replace_titles_and_redirects_in_links_file.py" pages.txt.gz redirects.with_ids.txt.gz targets.with_ids.txt.gz links.txt.gz \
    | pigz --fast > links.with_ids.txt.gz.tmp
   mv links.with_ids.txt.gz.tmp links.with_ids.txt.gz
 else
   echo "[WARN] Already replaced titles and redirects in links file"
 fi
+if $DELETE_PROGRESSIVELY; then rm links.txt.gz targets.with_ids.txt.gz; fi
 
 if [ ! -f pages.pruned.txt.gz ]; then
   echo
@@ -185,6 +227,7 @@ if [ ! -f pages.pruned.txt.gz ]; then
 else
   echo "[WARN] Already pruned pages which are marked as redirects but with no redirect"
 fi
+if $DELETE_PROGRESSIVELY; then rm pages.txt.gz; fi
 
 #####################
 # SORT LINKS FILE #
@@ -212,6 +255,7 @@ if [ ! -f links.sorted_by_target_id.txt.gz ]; then
 else
   echo "[WARN] Already sorted links file by target page ID"
 fi
+if $DELETE_PROGRESSIVELY; then rm links.with_ids.txt.gz; fi
 
 
 #############################
@@ -227,6 +271,7 @@ if [ ! -f links.grouped_by_source_id.txt.gz ]; then
 else
   echo "[WARN] Already grouped source links file by source page ID"
 fi
+if $DELETE_PROGRESSIVELY; then rm links.sorted_by_source_id.txt.gz; fi
 
 if [ ! -f links.grouped_by_target_id.txt.gz ]; then
   echo
@@ -237,6 +282,7 @@ if [ ! -f links.grouped_by_target_id.txt.gz ]; then
 else
   echo "[WARN] Already grouped target links file by target page ID"
 fi
+if $DELETE_PROGRESSIVELY; then rm links.sorted_by_target_id.txt.gz; fi
 
 
 ################################
@@ -251,6 +297,7 @@ if [ ! -f links.with_counts.txt.gz ]; then
 else
   echo "[WARN] Already combined grouped links files"
 fi
+if $DELETE_PROGRESSIVELY; then rm links.grouped_by_source_id.txt.gz links.grouped_by_target_id.txt.gz; fi
 
 
 ############################
@@ -260,14 +307,17 @@ if [ ! -f sdow.sqlite ]; then
   echo
   echo "[INFO] Creating redirects table"
   time pigz -dc redirects.with_ids.txt.gz | sqlite3 sdow.sqlite ".read $ROOT_DIR/../sql/createRedirectsTable.sql"
+  if $DELETE_PROGRESSIVELY; then rm redirects.with_ids.txt.gz; fi
 
   echo
   echo "[INFO] Creating pages table"
   time pigz -dc pages.pruned.txt.gz | sqlite3 sdow.sqlite ".read $ROOT_DIR/../sql/createPagesTable.sql"
+  if $DELETE_PROGRESSIVELY; then rm pages.pruned.txt.gz; fi
 
   echo
   echo "[INFO] Creating links table"
   time pigz -dc links.with_counts.txt.gz | sqlite3 sdow.sqlite ".read $ROOT_DIR/../sql/createLinksTable.sql"
+  if $DELETE_PROGRESSIVELY; then rm links.with_counts.txt.gz; fi
 
   echo
   echo "[INFO] Compressing SQLite file"
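For readers less familiar with the sed/egrep pipelines above, this is roughly what the pages trimming step extracts, sketched in Python. The sample INSERT line is invented and the regex is a simplification of the `egrep -o` pattern in the script, not a drop-in replacement:

```python
import re

# Invented, heavily truncated sample of a line from the page.sql dump.
sample = ("INSERT INTO `page` VALUES "
          "(10,0,'AccessibleComputing',1,0,0.33,'2024','2024',11,22,'wikitext',NULL),"
          "(12,0,'Anarchism',0,0,0.78,'2024','2024',33,44,'wikitext',NULL);")

# Keep namespace-0 records and pull out (page_id, page_title, page_is_redirect),
# tolerating backslash-escaped quotes inside titles.
record = re.compile(r"\((\d+),0,'((?:[^'\\]|\\.)*)',([01]),")

for page_id, title, is_redirect in record.findall(sample):
    print(f"{page_id}\t{title}\t{is_redirect}")  # e.g. 10<TAB>AccessibleComputing<TAB>1
```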
diff --git a/scripts/combine_grouped_links_files.py b/scripts/combine_grouped_links_files.py
index e8ce4fd..35a1c7b 100755
--- a/scripts/combine_grouped_links_files.py
+++ b/scripts/combine_grouped_links_files.py
@@ -28,26 +28,27 @@
 # Create a dictionary of page IDs to their incoming and outgoing links.
 LINKS = defaultdict(lambda: defaultdict(str))
-for line in io.BufferedReader(gzip.open(OUTGOING_LINKS_FILE, 'r')):
-  [source_page_id, target_page_ids] = line.rstrip('\n').split('\t')
-  LINKS[source_page_id]['outgoing'] = target_page_ids
+# Outgoing links are stored under key 0, incoming links under key 1.
+for line in io.BufferedReader(gzip.open(OUTGOING_LINKS_FILE, 'rb')):
+  [source_page_id, target_page_ids] = line.rstrip(b'\n').split(b'\t')
+  LINKS[int(source_page_id)][0] = target_page_ids
 
-for line in io.BufferedReader(gzip.open(INCOMING_LINKS_FILE, 'r')):
-  [target_page_id, source_page_ids] = line.rstrip('\n').split('\t')
-  LINKS[target_page_id]['incoming'] = source_page_ids
+for line in io.BufferedReader(gzip.open(INCOMING_LINKS_FILE, 'rb')):
+  [target_page_id, source_page_ids] = line.rstrip(b'\n').split(b'\t')
+  LINKS[int(target_page_id)][1] = source_page_ids
 
 # For each page in the links dictionary, print out its incoming and outgoing links as well as their
 # counts.
-for page_id, links in LINKS.iteritems():
-  outgoing_links = links.get('outgoing', '')
-  outgoing_links_count = 0 if outgoing_links is '' else len(
-      outgoing_links.split('|'))
+for page_id, links in LINKS.items():
+  outgoing_links = links.get(0, b'')
+  outgoing_links_count = 0 if outgoing_links == b'' else len(
+      outgoing_links.split(b'|'))
 
-  incoming_links = links.get('incoming', '')
-  incoming_links_count = 0 if incoming_links is '' else len(
-      incoming_links.split('|'))
+  incoming_links = links.get(1, b'')
+  incoming_links_count = 0 if incoming_links == b'' else len(
+      incoming_links.split(b'|'))
 
-  columns = [page_id, str(outgoing_links_count), str(
-      incoming_links_count), outgoing_links, incoming_links]
+  columns = [str(page_id).encode(), str(outgoing_links_count).encode(), str(
+      incoming_links_count).encode(), outgoing_links, incoming_links]
 
-  print('\t'.join(columns))
+  print(b'\t'.join(columns).decode())
diff --git a/scripts/prune_pages_file.py b/scripts/prune_pages_file.py
index 1459e88..a4bdf60 100644
--- a/scripts/prune_pages_file.py
+++ b/scripts/prune_pages_file.py
@@ -28,14 +28,14 @@
 # Create a dictionary of redirects.
 REDIRECTS = {}
-for line in io.BufferedReader(gzip.open(REDIRECTS_FILE, 'r')):
-  [source_page_id, _] = line.rstrip('\n').split('\t')
+for line in io.BufferedReader(gzip.open(REDIRECTS_FILE, 'rb')):
+  [source_page_id, _] = line.rstrip(b'\n').split(b'\t')
   REDIRECTS[source_page_id] = True
 
 # Loop through the pages file, ignoring pages which are marked as redirects but which do not have a
 # corresponding redirect in the redirects dictionary, printing the remaining pages to stdout.
-for line in io.BufferedReader(gzip.open(PAGES_FILE, 'r')):
-  [page_id, page_title, is_redirect] = line.rstrip('\n').split('\t')
+for line in io.BufferedReader(gzip.open(PAGES_FILE, 'rb')):
+  [page_id, page_title, is_redirect] = line.rstrip(b'\n').split(b'\t')
 
-  if is_redirect == '0' or page_id in REDIRECTS:
-    print('\t'.join([page_id, page_title, is_redirect]))
+  if is_redirect == b'0' or page_id in REDIRECTS:
+    print(b'\t'.join([page_id, page_title, is_redirect]).decode())
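The combined file written by `combine_grouped_links_files.py` above has one row per page. Here is a hypothetical helper showing that column layout; the function name is illustrative and not part of the scripts:

```python
# Columns: page id, outgoing count, incoming count, '|'-separated outgoing ids, incoming ids.
def with_counts_row(page_id, outgoing="", incoming=""):
    out_count = 0 if outgoing == "" else len(outgoing.split("|"))
    in_count = 0 if incoming == "" else len(incoming.split("|"))
    return "\t".join([str(page_id), str(out_count), str(in_count), outgoing, incoming])

print(with_counts_row(12, outgoing="34|56", incoming="78"))  # "12\t2\t1\t34|56\t78"
print(with_counts_row(99, incoming="12"))                    # "99\t0\t1\t\t12"
```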
diff --git a/scripts/replace_titles_and_redirects_in_links_file.py b/scripts/replace_titles_and_redirects_in_links_file.py
index b217017..5be8fb6 100755
--- a/scripts/replace_titles_and_redirects_in_links_file.py
+++ b/scripts/replace_titles_and_redirects_in_links_file.py
@@ -10,14 +10,15 @@ import gzip
 
 # Validate inputs
-if len(sys.argv) < 4:
+if len(sys.argv) < 5:
   print('[ERROR] Not enough arguments provided!')
-  print('[INFO] Usage: {0} <pages_file> <redirects_file> <links_file>'.format(sys.argv[0]))
+  print('[INFO] Usage: {0} <pages_file> <redirects_file> <targets_file> <links_file>'.format(sys.argv[0]))
   sys.exit()
 
 PAGES_FILE = sys.argv[1]
 REDIRECTS_FILE = sys.argv[2]
-LINKS_FILE = sys.argv[3]
+TARGETS_FILE = sys.argv[3]
+LINKS_FILE = sys.argv[4]
 
 if not PAGES_FILE.endswith('.gz'):
   print('[ERROR] Pages file must be gzipped.')
@@ -27,36 +28,43 @@
   print('[ERROR] Redirects file must be gzipped.')
   sys.exit()
 
+if not TARGETS_FILE.endswith('.gz'):
+  print('[ERROR] Targets file must be gzipped.')
+  sys.exit()
+
 if not LINKS_FILE.endswith('.gz'):
   print('[ERROR] Links file must be gzipped.')
   sys.exit()
 
 # Create a set of all page IDs and a dictionary of page titles to their corresponding IDs.
 ALL_PAGE_IDS = set()
-PAGE_TITLES_TO_IDS = {}
-for line in io.BufferedReader(gzip.open(PAGES_FILE, 'r')):
-  [page_id, page_title, _] = line.rstrip('\n').split('\t')
+for line in io.BufferedReader(gzip.open(PAGES_FILE, 'rb')):
+  [page_id, page_title, _] = line.rstrip(b'\n').split(b'\t')
   ALL_PAGE_IDS.add(page_id)
-  PAGE_TITLES_TO_IDS[page_title] = page_id
 
 # Create a dictionary of page IDs to the target page ID to which they redirect.
 REDIRECTS = {}
-for line in io.BufferedReader(gzip.open(REDIRECTS_FILE, 'r')):
-  [source_page_id, target_page_id] = line.rstrip('\n').split('\t')
+for line in io.BufferedReader(gzip.open(REDIRECTS_FILE, 'rb')):
+  [source_page_id, target_page_id] = line.rstrip(b'\n').split(b'\t')
   REDIRECTS[source_page_id] = target_page_id
 
+# Create a dictionary of linktarget IDs to the page ID they point to.
+TARGETS = {}
+for line in io.BufferedReader(gzip.open(TARGETS_FILE, 'rb')):
+  [target_id, target_page_id] = line.rstrip(b'\n').split(b'\t')
+  TARGETS[target_id] = target_page_id
+
 # Loop through each line in the links file, replacing titles with IDs, applying redirects, and
 # removing nonexistent pages, writing the result to stdout.
-for line in io.BufferedReader(gzip.open(LINKS_FILE, 'r')):
-  [source_page_id, target_page_title] = line.rstrip('\n').split('\t')
+for line in io.BufferedReader(gzip.open(LINKS_FILE, 'rb')):
+  [source_page_id, target_id] = line.rstrip(b'\n').split(b'\t')
 
   source_page_exists = source_page_id in ALL_PAGE_IDS
 
   if source_page_exists:
     source_page_id = REDIRECTS.get(source_page_id, source_page_id)
 
-    target_page_id = PAGE_TITLES_TO_IDS.get(target_page_title)
-
+    target_page_id = TARGETS.get(target_id)
     if target_page_id is not None and source_page_id != target_page_id:
       target_page_id = REDIRECTS.get(target_page_id, target_page_id)
-      print('\t'.join([source_page_id, target_page_id]))
+      print(b'\t'.join([source_page_id, target_page_id]).decode())
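A small design note on the I/O pattern in these ported scripts: they open the gzipped TSVs in binary mode and only `.decode()` when printing. An alternative sketch (not what the patch does) lets `gzip.open` handle decoding in text mode; `pages.txt.gz` is just a placeholder path and the three-column unpacking assumes the pages file format described in the README:

```python
import gzip

# Text-mode reading: gzip decompresses and decodes in one step.
with gzip.open("pages.txt.gz", "rt", encoding="utf-8", newline="\n") as fh:
    for line in fh:
        page_id, page_title, is_redirect = line.rstrip("\n").split("\t")
        # ... work with already-decoded str fields here ...
```

Keeping bytes end to end, as the patch does, avoids per-line decoding overhead and any encoding surprises in titles, which is presumably why these scripts stay in binary mode.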
diff --git a/scripts/replace_titles_and_redirects_in_targets_file.py b/scripts/replace_titles_and_redirects_in_targets_file.py
new file mode 100755
index 0000000..89fe86b
--- /dev/null
+++ b/scripts/replace_titles_and_redirects_in_targets_file.py
@@ -0,0 +1,57 @@
+"""
+Replaces page titles in the linktarget file with their corresponding page IDs, eliminates targets
+pointing to non-existing pages, and replaces redirects with the pages to which they redirect.
+
+Output is written to stdout.
+"""
+
+import io
+import sys
+import gzip
+
+# Validate inputs
+if len(sys.argv) < 4:
+  print('[ERROR] Not enough arguments provided!')
+  print('[INFO] Usage: {0} <pages_file> <redirects_file> <targets_file>'.format(sys.argv[0]))
+  sys.exit()
+
+PAGES_FILE = sys.argv[1]
+REDIRECTS_FILE = sys.argv[2]
+TARGETS_FILE = sys.argv[3]
+
+if not PAGES_FILE.endswith('.gz'):
+  print('[ERROR] Pages file must be gzipped.')
+  sys.exit()
+
+if not REDIRECTS_FILE.endswith('.gz'):
+  print('[ERROR] Redirects file must be gzipped.')
+  sys.exit()
+
+if not TARGETS_FILE.endswith('.gz'):
+  print('[ERROR] Targets file must be gzipped.')
+  sys.exit()
+
+# Create a set of all page IDs and a dictionary of page titles to their corresponding IDs.
+ALL_PAGE_IDS = set()
+PAGE_TITLES_TO_IDS = {}
+for line in io.BufferedReader(gzip.open(PAGES_FILE, 'rb')):
+  [page_id, page_title, _] = line.rstrip(b'\n').split(b'\t')
+  ALL_PAGE_IDS.add(page_id)
+  PAGE_TITLES_TO_IDS[page_title] = page_id
+
+# Create a dictionary of page IDs to the target page ID to which they redirect.
+REDIRECTS = {}
+for line in io.BufferedReader(gzip.open(REDIRECTS_FILE, 'rb')):
+  [source_page_id, target_page_id] = line.rstrip(b'\n').split(b'\t')
+  REDIRECTS[source_page_id] = target_page_id
+
+# Loop through each line in the targets file, replacing titles with IDs, applying redirects, and
+# removing nonexistent pages, writing the result to stdout.
+for line in io.BufferedReader(gzip.open(TARGETS_FILE, 'rb')):
+  [target_id, target_page_title] = line.rstrip(b'\n').split(b'\t')
+
+  target_page_id = PAGE_TITLES_TO_IDS.get(target_page_title)
+
+  if target_page_id is not None:
+    target_page_id = REDIRECTS.get(target_page_id, target_page_id)
+    print(b'\t'.join([target_id, target_page_id]).decode())
diff --git a/scripts/replace_titles_in_redirects_file.py b/scripts/replace_titles_in_redirects_file.py
index 946d190..ff7b5e6 100755
--- a/scripts/replace_titles_in_redirects_file.py
+++ b/scripts/replace_titles_in_redirects_file.py
@@ -28,16 +28,16 @@
 # Create a set of all page IDs and a dictionary of page titles to their corresponding IDs.
 ALL_PAGE_IDS = set()
 PAGE_TITLES_TO_IDS = {}
-for line in io.BufferedReader(gzip.open(PAGES_FILE, 'r')):
-  [page_id, page_title, _] = line.rstrip('\n').split('\t')
+for line in io.BufferedReader(gzip.open(PAGES_FILE, 'rb')):
+  [page_id, page_title, _] = line.rstrip(b'\n').split(b'\t')
   ALL_PAGE_IDS.add(page_id)
   PAGE_TITLES_TO_IDS[page_title] = page_id
 
 # Create a dictionary of redirects, replace page titles in the redirects file with their
 # corresponding IDs and ignoring pages which do not exist.
 REDIRECTS = {}
-for line in io.BufferedReader(gzip.open(REDIRECTS_FILE, 'r')):
-  [source_page_id, target_page_title] = line.rstrip('\n').split('\t')
+for line in io.BufferedReader(gzip.open(REDIRECTS_FILE, 'rb')):
+  [source_page_id, target_page_title] = line.rstrip(b'\n').split(b'\t')
 
   source_page_exists = source_page_id in ALL_PAGE_IDS
   target_page_id = PAGE_TITLES_TO_IDS.get(target_page_title)
@@ -47,7 +47,7 @@
 # Loop through the redirects dictionary and remove redirects which redirect to another redirect,
 # writing the remaining redirects to stdout.
-for source_page_id, target_page_id in REDIRECTS.iteritems():
+for source_page_id, target_page_id in REDIRECTS.items():
   start_target_page_id = target_page_id
 
   redirected_count = 0
@@ -62,4 +62,4 @@
       target_page_id = None
 
   if target_page_id is not None:
-    print('\t'.join([source_page_id, target_page_id]))
+    print(b'\t'.join([source_page_id, target_page_id]).decode())
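The loop above collapses redirect chains with a hop counter capped at 100 to guard against circular redirects. An equivalent formulation of the same idea, sketched with a visited set (illustrative only; the patch keeps the counter):

```python
def resolve_redirect(page_id, redirects):
    """Follow a redirect chain to its final page, or return None on a cycle."""
    seen = {page_id}
    while page_id in redirects:
        page_id = redirects[page_id]
        if page_id in seen:  # circular chain: there is no real article at the end
            return None
        seen.add(page_id)
    return page_id

redirects = {"A": "B", "B": "C", "X": "Y", "Y": "X"}
print(resolve_redirect("A", redirects))  # C
print(resolve_redirect("X", redirects))  # None
```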
diff --git a/scripts/replace_titles_in_redirects_file.py.dis b/scripts/replace_titles_in_redirects_file.py.dis
new file mode 100755
index 0000000..a423414
--- /dev/null
+++ b/scripts/replace_titles_in_redirects_file.py.dis
@@ -0,0 +1,65 @@
+"""
+Replaces page titles in the redirects file with their corresponding target IDs.
+
+Output is written to stdout.
+"""
+
+import io
+import sys
+import gzip
+
+# Validate input arguments.
+if len(sys.argv) < 3:
+  print('[ERROR] Not enough arguments provided!')
+  print('[INFO] Usage: {0} <targets_file> <redirects_file>'.format(sys.argv[0]))
+  sys.exit()
+
+TARGETS_FILE = sys.argv[1]
+REDIRECTS_FILE = sys.argv[2]
+
+if not TARGETS_FILE.endswith('.gz'):
+  print('[ERROR] Targets file must be gzipped.')
+  sys.exit()
+
+if not REDIRECTS_FILE.endswith('.gz'):
+  print('[ERROR] Redirects file must be gzipped.')
+  sys.exit()
+
+# Create a set of all target IDs and a dictionary of target titles to their corresponding IDs.
+ALL_TARGET_IDS = set()
+TARGET_TITLES_TO_IDS = {}
+for line in io.BufferedReader(gzip.open(TARGETS_FILE, 'rb')):
+  [page_id, page_title, _] = line.rstrip(b'\n').split(b'\t')
+  ALL_TARGET_IDS.add(page_id)
+  TARGET_TITLES_TO_IDS[page_title] = page_id
+
+# Create a dictionary of redirects, replace page titles in the redirects file with their
+# corresponding IDs and ignoring pages which do not exist.
+REDIRECTS = {}
+for line in io.BufferedReader(gzip.open(REDIRECTS_FILE, 'rb')):
+  [source_page_id, target_page_title] = line.rstrip(b'\n').split(b'\t')
+
+  source_page_exists = source_page_id in ALL_TARGET_IDS
+  target_page_id = TARGET_TITLES_TO_IDS.get(target_page_title)
+
+  if source_page_exists and target_page_id is not None:
+    REDIRECTS[source_page_id] = target_page_id
+
+# Loop through the redirects dictionary and remove redirects which redirect to another redirect,
+# writing the remaining redirects to stdout.
+for source_page_id, target_page_id in REDIRECTS.items():
+  start_target_page_id = target_page_id
+
+  redirected_count = 0
+  while target_page_id in REDIRECTS:
+    target_page_id = REDIRECTS[target_page_id]
+
+    redirected_count += 1
+
+    # Break out if there is a circular path, meaning the redirects only point to other redirects,
+    # not an actual page.
+    if target_page_id == start_target_page_id or redirected_count > 100:
+      target_page_id = None
+
+  if target_page_id is not None:
+    print(b'\t'.join([source_page_id, target_page_id]).decode())
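Finally, a hypothetical end-to-end invocation of the updated `buildDatabase.sh`, showing the new environment variables (`WLANG`, `OUT_DIR`, `DELETE_PROGRESSIVELY`) and the optional YYYYMMDD dump date. The language code, directory and date below are example values only, and the command assumes it is run from the `scripts/` directory:

```python
import os
import subprocess

# Build the French database into ./dump-fr and delete intermediate files as soon as
# they are no longer needed. Drop the date argument to use the latest available dump.
env = {**os.environ, "WLANG": "fr", "OUT_DIR": "dump-fr", "DELETE_PROGRESSIVELY": "true"}
subprocess.run(["./buildDatabase.sh", "20240601"], check=True, env=env)
```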