jwngr · MysaaJava · Nov 5, 2025 · coderabbitai · Nov 5, 2025 · coderabbitai
diff --git a/scripts/README.md b/scripts/README.md
@@ -0,0 +1,69 @@
+# Description of the process
+
+## Parsing of the tables
+
+### links.txt
+- `pl_from` -> Id of the "from" page of this link
+- (`pl_namespace`) -> We keep only if equals 0 (= namespace of the "from" page of this link)
+- `pl_target_id` -> Target of this link  (foreign key to `linktarget`)
+
+### targets.txt
+- `lt_id` -> Id of this link (index)
+- (`lt_ns`) -> We keep only if equals 0 (= namespace of the targeted page)
+- `lt_title` -> Title of the targeted page
+
+### pages.txt
+- `page_id` -> Id of the page
+- (`page_namespace`) -> We keep only if equals 0 (= namespace of this page)
+- `page_title` -> Title of this page
+- `page_is_redirect` -> Boolean indicating whether this page is a redirect
- `page_is_redirect` -> Boolean wether this page is a redirect
+- `page_is_redirect` -> Boolean whether this page is a redirect
- `page_is_redirect` -> Boolean wether this page is a redirect
+- `page_is_redirect` -> Boolean whether this page is a redirect
+- The eight following fields are ignored
+
+### redirects.txt
+- `rd_from` -> Id of the page from which we are redirected
+- (`rd_namespace`) -> We keep only if equals 0 (= namespace of the page we are redirected to)
+- `rd_title` -> Title of the page we are redirected to
+- The two following fields are ignored
+
+## Joining the tables
+
+### redirects.with_ids.txt (replace_titles_in_redirects_file.py)
+Replaces, for each redirection, `rd_title` with the targeted `page_id` by matching on `page_title`.
+The targeted `page_id` is then itself resolved as a redirect, recursively, until we reach a "final" page.
+- `rd_from` -> The id of the page we are redirected from
+- `page_id` -> The id of the page we get to following redirections recursively
+
+### targets.with_ids.txt (replace_titles_and_redirects_in_targets_file.py)
+Replaces, for each linktarget, `lt_title` with the targeted `page_id` by matching on `page_title`.
+We then compute the "final" page reached from this page by following redirections, using the file `redirects.with_ids.txt`.
+- `lt_id` -> Id of this link
+- `page_id` -> The id of the page this link is pointing to, after having followed all redirections
+
+### links.with_ids.txt (replace_titles_and_redirects_in_links_file.py)
+Replaces, for each pagelink, `pl_target_id` with the targeted `page_id` by joining with `targets.with_ids.txt` (on `lt_id`).
-Replaces for each redirection, `rd_title` with the targetted `page_id` by matching on `page_title`.
-The targetted page_id is then computed as a redirect recursively, until we get on a "final" page.
- `rd_from` -> The id of the page we are redirected from
- `page_id` -> The id of the page we get to following redirections recursively
-
-### targets.with_ids.txt (replace_titles_and_redirects_in_targets_file.py)
-Replaces, for each linktarget, `lt_title` with the targetted `page_id` by matching on `page_title`.
-We then compute the "final" page obtained from this page following redirection, with the file `redirects.with_ids.txt`.
- `lt_id` -> Id of this link
- `page_id` -> The id of the page this link is pointing to, after having followed all redirections
-
-### links.with_ids.txt (replace_titles_and_redirects_in_links_file.py)
-Replaces, for each pagelink, `lt_id` with the targetted `page_id` by joining with `links.with_ids.txt`.
+Replaces for each redirection, `rd_title` with the targeted `page_id` by matching on `page_title`.
+The targeted page_id is then computed as a redirect recursively, until we get on a "final" page.
+- `rd_from` -> The id of the page we are redirected from
+- `page_id` -> The id of the page we get to following redirections recursively
+
+### targets.with_ids.txt (replace_titles_and_redirects_in_targets_file.py)
+Replaces, for each linktarget, `lt_title` with the targeted `page_id` by matching on `page_title`.
+We then compute the "final" page obtained from this page following redirection, with the file `redirects.with_ids.txt`.
+- `lt_id` -> Id of this link
+- `page_id` -> The id of the page this link is pointing to, after having followed all redirections
+
+### links.with_ids.txt (replace_titles_and_redirects_in_links_file.py)
+Replaces, for each pagelink, `pl_target_id` with the targeted `page_id` by joining with `targets.with_ids.txt` (on `lt_id`).
-Replaces for each redirection, `rd_title` with the targetted `page_id` by matching on `page_title`.
-The targetted page_id is then computed as a redirect recursively, until we get on a "final" page.
- `rd_from` -> The id of the page we are redirected from
- `page_id` -> The id of the page we get to following redirections recursively
-
-### targets.with_ids.txt (replace_titles_and_redirects_in_targets_file.py)
-Replaces, for each linktarget, `lt_title` with the targetted `page_id` by matching on `page_title`.
-We then compute the "final" page obtained from this page following redirection, with the file `redirects.with_ids.txt`.
- `lt_id` -> Id of this link
- `page_id` -> The id of the page this link is pointing to, after having followed all redirections
-
-### links.with_ids.txt (replace_titles_and_redirects_in_links_file.py)
-Replaces, for each pagelink, `lt_id` with the targetted `page_id` by joining with `links.with_ids.txt`.
+Replaces for each redirection, `rd_title` with the targeted `page_id` by matching on `page_title`.
+The targeted page_id is then computed as a redirect recursively, until we get on a "final" page.
+- `rd_from` -> The id of the page we are redirected from
+- `page_id` -> The id of the page we get to following redirections recursively
+
+### targets.with_ids.txt (replace_titles_and_redirects_in_targets_file.py)
+Replaces, for each linktarget, `lt_title` with the targeted `page_id` by matching on `page_title`.
+We then compute the "final" page obtained from this page following redirection, with the file `redirects.with_ids.txt`.
+- `lt_id` -> Id of this link
+- `page_id` -> The id of the page this link is pointing to, after having followed all redirections
+
+### links.with_ids.txt (replace_titles_and_redirects_in_links_file.py)
+Replaces, for each pagelink, `pl_target_id` with the targeted `page_id` by joining with `targets.with_ids.txt` (on `lt_id`).
+- `pl_from` -> Id of the "from" page, after having followed all redirections
+- `page_id` -> Id of the "to" page, after having followed all redirections
+
+### pages.pruned.txt (prune_pages_file.py)
+Prunes the pages file by removing pages which are marked as redirects but have no corresponding redirect in the redirects file.
+
+## Sorting, grouping, and counting the links
+
+### links.sorted_by_XXX_id.txt
+We then sort `links.with_ids.txt` by its first column (the "source" id) into
+the file `links.sorted_by_source_id.txt`, and by its second column (the "target" id)
+into the file `links.sorted_by_target_id.txt`.
+
+### links.grouped_by_XXX_id.txt
+Then, we use those two files to *GROUP BY* the links by source and by target.
+The file `links.grouped_by_source_id.txt` is like this
+- `pl_from` -> Id of the "from" page
+- `targets` -> A `|`-separated string of the ids the "from" page targets
+
+The file `links.grouped_by_target_id.txt` is like this
+- `froms` -> A `|`-separated string of the ids of the pages targeting the "target" page
+- `pl_target` -> Id of the "target" page
+
+### links.with_counts.txt (combine_grouped_links_files.py)
+
+## Making the database
diff --git a/scripts/buildDatabase.sh b/scripts/buildDatabase.sh
@@ -1,15 +1,19 @@
 #!/bin/bash
-
 set -euo pipefail
 
 # Force default language for output sorting to be bytewise. Necessary to ensure uniformity amongst
 # UNIX commands.
 export LC_ALL=C
 
+# These variables can be overridden from the external environment
+WLANG=''${WLANG:-en}
+OUT_DIR="${OUT_DIR:-dump}"
+DELETE_PROGRESSIVELY=${DELETE_PROGRESSIVELY:-false}
+
 # By default, the latest Wikipedia dump will be downloaded. If a download date in the format
 # YYYYMMDD is provided as the first argument, it will be used instead.
 if [[ $# -eq 0 ]]; then
-  DOWNLOAD_DATE=$(wget -q -O- https://dumps.wikimedia.org/enwiki/ | grep -Po '\d{8}' | sort | tail -n1)
+  DOWNLOAD_DATE=$(wget -q -O- https://dumps.wikimedia.org/${WLANG}wiki/ | grep -Po '\d{8}' | sort | tail -n1)
 else
   if [ ${#1} -ne 8 ]; then
     echo "[ERROR] Invalid download date provided: $1"
@@ -19,17 +23,17 @@ else
   fi
 fi
 
-ROOT_DIR=`pwd`
-OUT_DIR="dump"
+# Root directory is that of this script
+ROOT_DIR=$(dirname "$0")
 
-DOWNLOAD_URL="https://dumps.wikimedia.org/enwiki/$DOWNLOAD_DATE"
-TORRENT_URL="https://dump-torrents.toolforge.org/enwiki/$DOWNLOAD_DATE"
-
-SHA1SUM_FILENAME="enwiki-$DOWNLOAD_DATE-sha1sums.txt"
-REDIRECTS_FILENAME="enwiki-$DOWNLOAD_DATE-redirect.sql.gz"
-PAGES_FILENAME="enwiki-$DOWNLOAD_DATE-page.sql.gz"
-LINKS_FILENAME="enwiki-$DOWNLOAD_DATE-pagelinks.sql.gz"
+DOWNLOAD_URL="https://dumps.wikimedia.org/${WLANG}wiki/$DOWNLOAD_DATE"
+TORRENT_URL="https://dump-torrents.toolforge.org/${WLANG}wiki/$DOWNLOAD_DATE"
 
+SHA1SUM_FILENAME="${WLANG}wiki-$DOWNLOAD_DATE-sha1sums.txt"
+REDIRECTS_FILENAME="${WLANG}wiki-$DOWNLOAD_DATE-redirect.sql.gz"
+PAGES_FILENAME="${WLANG}wiki-$DOWNLOAD_DATE-page.sql.gz"
+LINKS_FILENAME="${WLANG}wiki-$DOWNLOAD_DATE-pagelinks.sql.gz"
+TARGETS_FILENAME="${WLANG}wiki-$DOWNLOAD_DATE-linktarget.sql.gz"
 
 # Make the output directory if it doesn't already exist and move to it
 mkdir -p $OUT_DIR
@@ -79,6 +83,7 @@ download_file "sha1sums" $SHA1SUM_FILENAME
 download_file "redirects" $REDIRECTS_FILENAME
 download_file "pages" $PAGES_FILENAME
 download_file "links" $LINKS_FILENAME
+download_file "targets" $TARGETS_FILENAME
 
 ##########################
 #  TRIM WIKIPEDIA DUMPS  #
@@ -105,7 +110,7 @@ if [ ! -f redirects.txt.gz ]; then
 else
   echo "[WARN] Already trimmed redirects file"
 fi
-
+if $DELETE_PROGRESSIVELY; then rm $REDIRECTS_FILENAME; fi
 if [ ! -f pages.txt.gz ]; then
   echo
   echo "[INFO] Trimming pages file"
@@ -118,16 +123,16 @@ if [ ! -f pages.txt.gz ]; then
   # Splice out the page title and whether or not the page is a redirect
   # Zip into output file
   time pigz -dc $PAGES_FILENAME \
-    | sed -n 's/^INSERT INTO `page` VALUES (//p' \
-    | sed -e 's/),(/\'$'\n/g' \
-    | egrep "^[0-9]+,0," \
-    | sed -e $"s/,0,'/\t/" \
-    | sed -e $"s/',[^,]*,\([01]\).*/\t\1/" \
+    | sed -n 's/^INSERT INTO `page` VALUES //p' \
+    | egrep -o "\([0-9]+,0,'([^']*(\\\\')?)+',[01]," \
+    | sed -re $"s/^\(([0-9]+),0,'/\1\t/" \
+    | sed -re $"s/',([01]),/\t\1/" \
     | pigz --fast > pages.txt.gz.tmp
   mv pages.txt.gz.tmp pages.txt.gz
 else
   echo "[WARN] Already trimmed pages file"
 fi
+if $DELETE_PROGRESSIVELY; then rm $PAGES_FILENAME; fi
 
 if [ ! -f links.txt.gz ]; then
   echo
@@ -143,14 +148,38 @@ if [ ! -f links.txt.gz ]; then
   time pigz -dc $LINKS_FILENAME \
     | sed -n 's/^INSERT INTO `pagelinks` VALUES (//p' \
     | sed -e 's/),(/\'$'\n/g' \
-    | egrep "^[0-9]+,0,.*,0$" \
-    | sed -e $"s/,0,'/\t/g" \
-    | sed -e "s/',0//g" \
+    | egrep "^[0-9]+,0,[0-9]+$" \
+    | sed -e $"s/,0,/\t/g" \
     | pigz --fast > links.txt.gz.tmp
   mv links.txt.gz.tmp links.txt.gz
 else
   echo "[WARN] Already trimmed links file"
 fi
+if $DELETE_PROGRESSIVELY; then rm $LINKS_FILENAME; fi
+
+if [ ! -f targets.txt.gz ]; then
+  echo
+  echo "[INFO] Trimming targets file"
+
+  # Unzip
+  # Remove all lines that don't start with INSERT INTO...
+  # Split into individual records
+  # Only keep records in namespace 0
+  # Replace namespace with a tab
+  # Remove the target page title's closing apostrophe at the end of each record
+  # Zip into output file
+  time pigz -dc $TARGETS_FILENAME \
+    | sed -n 's/^INSERT INTO `linktarget` VALUES (//p' \
+    | sed -e 's/),(/\'$'\n/g' \
+    | egrep "^[0-9]+,0,.*$" \
+    | sed -e $"s/,0,'/\t/g" \
+    | sed -e "s/'$//g" \
+    | pigz --fast > targets.txt.gz.tmp
+  mv targets.txt.gz.tmp targets.txt.gz
+else
+  echo "[WARN] Already trimmed targets file"
+fi
+if $DELETE_PROGRESSIVELY; then rm $TARGETS_FILENAME; fi
 
 
 ###########################################
@@ -166,16 +195,29 @@ if [ ! -f redirects.with_ids.txt.gz ]; then
 else
   echo "[WARN] Already replaced titles in redirects file"
 fi
+if $DELETE_PROGRESSIVELY; then rm redirects.txt.gz; fi
+
+if [ ! -f targets.with_ids.txt.gz ]; then
+  echo
+  echo "[INFO] Replacing titles and redirects in targets file"
+  time python "$ROOT_DIR/replace_titles_and_redirects_in_targets_file.py" pages.txt.gz redirects.with_ids.txt.gz targets.txt.gz \
+    | pigz --fast > targets.with_ids.txt.gz.tmp
+  mv targets.with_ids.txt.gz.tmp targets.with_ids.txt.gz
+else
+  echo "[WARN] Already replaced titles and redirects in targets file"
+fi
+if $DELETE_PROGRESSIVELY; then rm targets.txt.gz; fi
 
 if [ ! -f links.with_ids.txt.gz ]; then
   echo
   echo "[INFO] Replacing titles and redirects in links file"
-  time python "$ROOT_DIR/replace_titles_and_redirects_in_links_file.py" pages.txt.gz redirects.with_ids.txt.gz links.txt.gz \
+  time python "$ROOT_DIR/replace_titles_and_redirects_in_links_file.py" pages.txt.gz redirects.with_ids.txt.gz targets.with_ids.txt.gz links.txt.gz \
     | pigz --fast > links.with_ids.txt.gz.tmp
   mv links.with_ids.txt.gz.tmp links.with_ids.txt.gz
 else
   echo "[WARN] Already replaced titles and redirects in links file"
 fi
+if $DELETE_PROGRESSIVELY; then rm links.txt.gz targets.with_ids.txt.gz; fi
 
 if [ ! -f pages.pruned.txt.gz ]; then
   echo
@@ -185,6 +227,7 @@ if [ ! -f pages.pruned.txt.gz ]; then
 else
   echo "[WARN] Already pruned pages which are marked as redirects but with no redirect"
 fi
+if $DELETE_PROGRESSIVELY; then rm pages.txt.gz; fi
 
 #####################
 #  SORT LINKS FILE  #
@@ -212,6 +255,7 @@ if [ ! -f links.sorted_by_target_id.txt.gz ]; then
 else
   echo "[WARN] Already sorted links file by target page ID"
 fi
+if $DELETE_PROGRESSIVELY; then rm links.with_ids.txt.gz; fi
 
 
 #############################
@@ -227,6 +271,7 @@ if [ ! -f links.grouped_by_source_id.txt.gz ]; then
 else
   echo "[WARN] Already grouped source links file by source page ID"
 fi
+if $DELETE_PROGRESSIVELY; then rm links.sorted_by_source_id.txt.gz; fi
 
 if [ ! -f links.grouped_by_target_id.txt.gz ]; then
   echo
@@ -237,6 +282,7 @@ if [ ! -f links.grouped_by_target_id.txt.gz ]; then
 else
   echo "[WARN] Already grouped target links file by target page ID"
 fi
+if $DELETE_PROGRESSIVELY; then rm links.sorted_by_target_id.txt.gz; fi
 
 
 ################################
@@ -251,6 +297,7 @@ if [ ! -f links.with_counts.txt.gz ]; then
 else
   echo "[WARN] Already combined grouped links files"
 fi
+if $DELETE_PROGRESSIVELY; then rm links.grouped_by_source_id.txt.gz links.grouped_by_target_id.txt.gz; fi
 
 
 ############################
@@ -260,14 +307,17 @@ if [ ! -f sdow.sqlite ]; then
   echo
   echo "[INFO] Creating redirects table"
   time pigz -dc redirects.with_ids.txt.gz | sqlite3 sdow.sqlite ".read $ROOT_DIR/../sql/createRedirectsTable.sql"
+  if $DELETE_PROGRESSIVELY; then rm redirects.with_ids.txt.gz; fi
 
   echo
   echo "[INFO] Creating pages table"
   time pigz -dc pages.pruned.txt.gz | sqlite3 sdow.sqlite ".read $ROOT_DIR/../sql/createPagesTable.sql"
+  if $DELETE_PROGRESSIVELY; then rm pages.pruned.txt.gz; fi
 
   echo
   echo "[INFO] Creating links table"
   time pigz -dc links.with_counts.txt.gz | sqlite3 sdow.sqlite ".read $ROOT_DIR/../sql/createLinksTable.sql"
+  if $DELETE_PROGRESSIVELY; then rm links.with_counts.txt.gz; fi
 
   echo
   echo "[INFO] Compressing SQLite file"

diff --git a/scripts/combine_grouped_links_files.py b/scripts/combine_grouped_links_files.py
@@ -28,26 +28,27 @@
 
 # Create a dictionary of page IDs to their incoming and outgoing links.
 LINKS = defaultdict(lambda: defaultdict(str))
-for line in io.BufferedReader(gzip.open(OUTGOING_LINKS_FILE, 'r')):
-  [source_page_id, target_page_ids] = line.rstrip('\n').split('\t')
-  LINKS[source_page_id]['outgoing'] = target_page_ids
+# outgoing is [0], incoming is [1]
+for line in io.BufferedReader(gzip.open(OUTGOING_LINKS_FILE, 'rb')):
+  [source_page_id, target_page_ids] = line.rstrip(b'\n').split(b'\t')
+  LINKS[int(source_page_id)][0] = target_page_ids
 
-for line in io.BufferedReader(gzip.open(INCOMING_LINKS_FILE, 'r')):
-  [target_page_id, source_page_ids] = line.rstrip('\n').split('\t')
-  LINKS[target_page_id]['incoming'] = source_page_ids
+for line in io.BufferedReader(gzip.open(INCOMING_LINKS_FILE, 'rb')):
+  [target_page_id, source_page_ids] = line.rstrip(b'\n').split(b'\t')
+  LINKS[int(target_page_id)][1] = source_page_ids
 
 # For each page in the links dictionary, print out its incoming and outgoing links as well as their
 # counts.
-for page_id, links in LINKS.iteritems():
-  outgoing_links = links.get('outgoing', '')
-  outgoing_links_count = 0 if outgoing_links is '' else len(
-      outgoing_links.split('|'))
+for page_id, links in LINKS.items():
+  outgoing_links = links.get(0, b'')
+  outgoing_links_count = 0 if outgoing_links==b'' else len(
+      outgoing_links.split(b'|'))
 
-  incoming_links = links.get('incoming', '')
-  incoming_links_count = 0 if incoming_links is '' else len(
-      incoming_links.split('|'))
+  incoming_links = links.get(1, b'')
+  incoming_links_count = 0 if incoming_links==b'' else len(
+      incoming_links.split(b'|'))
 
-  columns = [page_id, str(outgoing_links_count), str(
-      incoming_links_count), outgoing_links, incoming_links]
+  columns = [str(page_id).encode(), str(outgoing_links_count).encode(), str(
+      incoming_links_count).encode(), outgoing_links, incoming_links]
 
-  print('\t'.join(columns))
+  print(b'\t'.join(columns).decode())
diff --git a/scripts/prune_pages_file.py b/scripts/prune_pages_file.py
@@ -28,14 +28,14 @@
 
 # Create a dictionary of redirects.
 REDIRECTS = {}
-for line in io.BufferedReader(gzip.open(REDIRECTS_FILE, 'r')):
-  [source_page_id, _] = line.rstrip('\n').split('\t')
+for line in io.BufferedReader(gzip.open(REDIRECTS_FILE, 'rb')):
+  [source_page_id, _] = line.rstrip(b'\n').split(b'\t')
   REDIRECTS[source_page_id] = True
 
 # Loop through the pages file, ignoring pages which are marked as redirects but which do not have a
 # corresponding redirect in the redirects dictionary, printing the remaining pages to stdout.
-for line in io.BufferedReader(gzip.open(PAGES_FILE, 'r')):
-  [page_id, page_title, is_redirect] = line.rstrip('\n').split('\t')
+for line in io.BufferedReader(gzip.open(PAGES_FILE, 'rb')):
+  [page_id, page_title, is_redirect] = line.rstrip(b'\n').split(b'\t')
 
-  if is_redirect == '0' or page_id in REDIRECTS:
-    print('\t'.join([page_id, page_title, is_redirect]))
+  if True or is_redirect == '0' or page_id in REDIRECTS:
+    print(b'\t'.join([page_id, page_title, is_redirect]).decode())
-for line in io.BufferedReader(gzip.open(PAGES_FILE, 'rb')):
-  [page_id, page_title, is_redirect] = line.rstrip(b'\n').split(b'\t')
-
-  if is_redirect == '0' or page_id in REDIRECTS:
-    print('\t'.join([page_id, page_title, is_redirect]))
-  if True or is_redirect == '0' or page_id in REDIRECTS:
-    print(b'\t'.join([page_id, page_title, is_redirect]).decode())
+for line in io.BufferedReader(gzip.open(PAGES_FILE, 'rb')):
+  [page_id, page_title, is_redirect] = line.rstrip(b'\n').split(b'\t')
+
+  if is_redirect == b'0' or page_id in REDIRECTS:
+    print(b'\t'.join([page_id, page_title, is_redirect]).decode())
-for line in io.BufferedReader(gzip.open(PAGES_FILE, 'rb')):
-  [page_id, page_title, is_redirect] = line.rstrip(b'\n').split(b'\t')
-
-  if is_redirect == '0' or page_id in REDIRECTS:
-    print('\t'.join([page_id, page_title, is_redirect]))
-  if True or is_redirect == '0' or page_id in REDIRECTS:
-    print(b'\t'.join([page_id, page_title, is_redirect]).decode())
+for line in io.BufferedReader(gzip.open(PAGES_FILE, 'rb')):
+  [page_id, page_title, is_redirect] = line.rstrip(b'\n').split(b'\t')
+
+  if is_redirect == b'0' or page_id in REDIRECTS:
+    print(b'\t'.join([page_id, page_title, is_redirect]).decode())