19 changes: 15 additions & 4 deletions scripts/README.md
@@ -16,7 +16,7 @@
- `page_id` -> Id of the page
- (`page_namespace`) -> We keep the page only if this equals 0 (the main namespace)
- `page_title` -> Title of this page
- `page_is_redirect` -> Boolean wether this page is a redirect
- `page_is_redirect` -> Boolean whether this page is a redirect
- Ignore the eight fields that follow

### redirects.txt
@@ -28,19 +28,19 @@
## Joining the tables

### redirects.with_ids.txt (replace_titles_in_redirects_file.py)
Replaces for each redirection, `rd_title` with the targetted `page_id` by matching on `page_title`.
Replaces for each redirection, `rd_title` with the targeted `page_id` by matching on `page_title`.
The targeted `page_id` is then itself resolved through redirects recursively, until we reach a "final" page (see the sketch after this list).
- `rd_from` -> The id of the page we are redirected from
- `page_id` -> The id of the page we get to following redirections recursively
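
For illustration, here is a minimal sketch of that recursive resolution; the `redirects` dict and the cycle guard are assumptions for the example, not the script's actual names:

```python
# Sketch: follow a redirect chain to its final page ID.
# `redirects` maps a page ID to the page ID it redirects to (assumed structure).
def resolve_redirect(page_id, redirects):
    seen = set()
    while page_id in redirects:
        if page_id in seen:  # guard against redirect cycles
            return page_id
        seen.add(page_id)
        page_id = redirects[page_id]
    return page_id
```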

### targets.with_ids.txt (replace_titles_and_redirects_in_targets_file.py)
Replaces, for each linktarget, `lt_title` with the targetted `page_id` by matching on `page_title`.
Replaces, for each linktarget, `lt_title` with the targeted `page_id` by matching on `page_title`.
We then compute the "final" page reached from this one by following redirects, using the file `redirects.with_ids.txt`.
- `lt_id` -> Id of this link
- `page_id` -> The id of the page this link is pointing to, after having followed all redirections

### links.with_ids.txt (replace_titles_and_redirects_in_links_file.py)
Replaces, for each pagelink, `lt_id` with the targetted `page_id` by joining with `links.with_ids.txt`.
Replaces, for each pagelink, `lt_id` with the targeted `page_id` by joining with `targets.with_ids.txt`.
- `pl_from` -> Id of the "from" page, after having followed all redirections
- `page_id` -> Id of the "to" page, after having followed all redirections

@@ -65,5 +65,16 @@ The file `links.grouped_by_target_id.txt` is like this
- `pl_target` -> Id of the "target" page

### links.with_counts.txt (combine_grouped_links_files.py)
We *stream-merge* the two files `links.grouped_by_source_id.txt.gz` and `links.grouped_by_target_id.txt.gz` (i.e., we don't load them into memory all at once) into a single sorted links file, `links.with_counts.txt`.
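
For illustration, each output line carries five tab-separated columns: page ID, outgoing-link count, incoming-link count, then the pipe-separated lists of outgoing and incoming page IDs. A hypothetical row:

```
1234	2	3	77|88	11|22|33
```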

## Making the database
We create three tables, `redirects`, `pages`, and `links`, from the files `redirects.with_ids.txt.gz`, `pages.pruned.txt.gz`, and `links.with_counts.txt.gz`.
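
A minimal sketch of this step using Python's `sqlite3`; the column layouts below are assumptions that mirror the text files, not the project's actual schema:

```python
import sqlite3

conn = sqlite3.connect('sdow.sqlite')
# Hypothetical schemas mirroring the three text files; the real ones may differ.
conn.executescript("""
CREATE TABLE IF NOT EXISTS redirects (source_id INTEGER PRIMARY KEY, target_id INTEGER);
CREATE TABLE IF NOT EXISTS pages (id INTEGER PRIMARY KEY, title TEXT, is_redirect INTEGER);
CREATE TABLE IF NOT EXISTS links (
    id INTEGER PRIMARY KEY,
    outgoing_links_count INTEGER,
    incoming_links_count INTEGER,
    outgoing_links TEXT,
    incoming_links TEXT
);
""")
conn.commit()
conn.close()
```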


## Notes on building the database locally (on Debian-based machines)

Depending on your hardware, you might need to change some values in the `buildDatabase.sh` script, most notably how much memory the `sort` command may use; the value can be given as a percentage of RAM or an absolute size in gigabytes. On a DigitalOcean droplet with 16 GB of RAM, it was best to give `sort` 4 GB. The whole database generation took about two hours (14 GB final size for the `sdow.sqlite` file).

If the script keeps exiting with no error message, it's possible that the system is killing it for using too many resources. You can investigate by running: `dmesg | grep -i "killed process"`

After creating the database, you can inspect its contents using `sqlitebrowser`.
18 changes: 9 additions & 9 deletions scripts/buildDatabase.sh
@@ -24,7 +24,7 @@ else
fi

# Root directory is that of this script
ROOT_DIR=$(dirname "$0")
ROOT_DIR=$(cd "$(dirname "$0")" && pwd)

DOWNLOAD_URL="https://dumps.wikimedia.org/${WLANG}wiki/$DOWNLOAD_DATE"
TORRENT_URL="https://dump-torrents.toolforge.org/${WLANG}wiki/$DOWNLOAD_DATE"
@@ -188,8 +188,8 @@ if $DELETE_PROGRESSIVELY; then rm $TARGETS_FILENAME; fi
if [ ! -f redirects.with_ids.txt.gz ]; then
echo
echo "[INFO] Replacing titles in redirects file"
time python "$ROOT_DIR/replace_titles_in_redirects_file.py" pages.txt.gz redirects.txt.gz \
| sort -S 100% -t $'\t' -k 1n,1n \
time python3 "$ROOT_DIR/replace_titles_in_redirects_file.py" pages.txt.gz redirects.txt.gz \
| sort -S 80% -t $'\t' -k 1n,1n \
| pigz --fast > redirects.with_ids.txt.gz.tmp
mv redirects.with_ids.txt.gz.tmp redirects.with_ids.txt.gz
else
@@ -200,7 +200,7 @@ if $DELETE_PROGRESSIVELY; then rm redirects.txt.gz; fi
if [ ! -f targets.with_ids.txt.gz ]; then
echo
echo "[INFO] Replacing titles and redirects in targets file"
time python "$ROOT_DIR/replace_titles_and_redirects_in_targets_file.py" pages.txt.gz redirects.with_ids.txt.gz targets.txt.gz \
time python3 "$ROOT_DIR/replace_titles_and_redirects_in_targets_file.py" pages.txt.gz redirects.with_ids.txt.gz targets.txt.gz \
| pigz --fast > targets.with_ids.txt.gz.tmp
mv targets.with_ids.txt.gz.tmp targets.with_ids.txt.gz
else
@@ -211,7 +211,7 @@ if $DELETE_PROGRESSIVELY; then rm targets.txt.gz; fi
if [ ! -f links.with_ids.txt.gz ]; then
echo
echo "[INFO] Replacing titles and redirects in links file"
time python "$ROOT_DIR/replace_titles_and_redirects_in_links_file.py" pages.txt.gz redirects.with_ids.txt.gz targets.with_ids.txt.gz links.txt.gz \
time python3 "$ROOT_DIR/replace_titles_and_redirects_in_links_file.py" pages.txt.gz redirects.with_ids.txt.gz targets.with_ids.txt.gz links.txt.gz \
| pigz --fast > links.with_ids.txt.gz.tmp
mv links.with_ids.txt.gz.tmp links.with_ids.txt.gz
else
@@ -222,7 +222,7 @@ if $DELETE_PROGRESSIVELY; then rm links.txt.gz targets.with_ids.txt.gz; fi
if [ ! -f pages.pruned.txt.gz ]; then
echo
echo "[INFO] Pruning pages which are marked as redirects but with no redirect"
time python "$ROOT_DIR/prune_pages_file.py" pages.txt.gz redirects.with_ids.txt.gz \
time python3 "$ROOT_DIR/prune_pages_file.py" pages.txt.gz redirects.with_ids.txt.gz \
| pigz --fast > pages.pruned.txt.gz
else
echo "[WARN] Already pruned pages which are marked as redirects but with no redirect"
@@ -236,7 +236,7 @@ if [ ! -f links.sorted_by_source_id.txt.gz ]; then
echo
echo "[INFO] Sorting links file by source page ID"
time pigz -dc links.with_ids.txt.gz \
| sort -S 80% -t $'\t' -k 1n,1n \
| sort -T . -S 80% -t $'\t' -k 1n,1n \
| uniq \
| pigz --fast > links.sorted_by_source_id.txt.gz.tmp
mv links.sorted_by_source_id.txt.gz.tmp links.sorted_by_source_id.txt.gz
@@ -248,7 +248,7 @@ if [ ! -f links.sorted_by_target_id.txt.gz ]; then
echo
echo "[INFO] Sorting links file by target page ID"
time pigz -dc links.with_ids.txt.gz \
| sort -S 80% -t $'\t' -k 2n,2n \
| sort -T . -S 80% -t $'\t' -k 2n,2n \
| uniq \
| pigz --fast > links.sorted_by_target_id.txt.gz.tmp
mv links.sorted_by_target_id.txt.gz.tmp links.sorted_by_target_id.txt.gz
@@ -291,7 +291,7 @@ if $DELETE_PROGRESSIVELY; then rm links.sorted_by_target_id.txt.gz; fi
if [ ! -f links.with_counts.txt.gz ]; then
echo
echo "[INFO] Combining grouped links files"
time python "$ROOT_DIR/combine_grouped_links_files.py" links.grouped_by_source_id.txt.gz links.grouped_by_target_id.txt.gz \
time python3 "$ROOT_DIR/combine_grouped_links_files.py" links.grouped_by_source_id.txt.gz links.grouped_by_target_id.txt.gz \
| pigz --fast > links.with_counts.txt.gz.tmp
mv links.with_counts.txt.gz.tmp links.with_counts.txt.gz
else
75 changes: 48 additions & 27 deletions scripts/combine_grouped_links_files.py
@@ -4,51 +4,72 @@
Output is written to stdout.
"""

import io
import sys
import gzip
from collections import defaultdict

# Validate input arguments.
# validate input arguments.
if len(sys.argv) < 3:
    print('[ERROR] Not enough arguments provided!', file=sys.stderr)
    print('[INFO] Usage: {0} <outgoing_links_file> <incoming_links_file>'.format(sys.argv[0]))
    print('[INFO] Usage: {0} <outgoing_links_file> <incoming_links_file>'.format(sys.argv[0]), file=sys.stderr)
    sys.exit(1)

OUTGOING_LINKS_FILE = sys.argv[1]
INCOMING_LINKS_FILE = sys.argv[2]

if not OUTGOING_LINKS_FILE.endswith('.gz'):
    print('[ERROR] Outgoing links file must be gzipped.')
    print('[ERROR] Outgoing links file must be gzipped.', file=sys.stderr)
    sys.exit(1)

if not INCOMING_LINKS_FILE.endswith('.gz'):
    print('[ERROR] Incoming links file must be gzipped.')
    print('[ERROR] Incoming links file must be gzipped.', file=sys.stderr)
    sys.exit(1)

# Create a dictionary of page IDs to their incoming and outgoing links.
LINKS = defaultdict(lambda: defaultdict(str))
# outgoing is [0], incoming is [1]
for line in io.BufferedReader(gzip.open(OUTGOING_LINKS_FILE, 'rb')):
    [source_page_id, target_page_ids] = line.rstrip(b'\n').split(b'\t')
    LINKS[int(source_page_id)][0] = target_page_ids
def parse_line(line):
    parts = line.rstrip(b'\n').split(b'\t', 1)
    return (int(parts[0]), parts[1] if len(parts) > 1 else b'')

for line in io.BufferedReader(gzip.open(INCOMING_LINKS_FILE, 'rb')):
    [target_page_id, source_page_ids] = line.rstrip(b'\n').split(b'\t')
    LINKS[int(target_page_id)][1] = source_page_ids
def file_iterator(filename):
    with gzip.open(filename, 'rb') as f:
        for line in f:
            yield parse_line(line)

# For each page in the links dictionary, print out its incoming and outgoing links as well as their
# counts.
for page_id, links in LINKS.items():
    outgoing_links = links.get(0, b'')
    outgoing_links_count = 0 if outgoing_links == b'' else len(outgoing_links.split(b'|'))
# Merge the two sorted files. We use generators instead of dicts so that the content is
# streamed rather than loaded entirely into memory, which greatly reduces RAM consumption.

    incoming_links = links.get(1, b'')
    incoming_links_count = 0 if incoming_links == b'' else len(incoming_links.split(b'|'))
outgoing_iter = file_iterator(OUTGOING_LINKS_FILE)
incoming_iter = file_iterator(INCOMING_LINKS_FILE)

    columns = [str(page_id).encode(), str(outgoing_links_count).encode(),
               str(incoming_links_count).encode(), outgoing_links, incoming_links]
outgoing_current = next(outgoing_iter, None)
incoming_current = next(incoming_iter, None)

    print(b'\t'.join(columns).decode())
while outgoing_current is not None or incoming_current is not None:
    if outgoing_current is None:
        page_id, incoming_links = incoming_current
        outgoing_links = b''
        incoming_current = next(incoming_iter, None)
    elif incoming_current is None:
        page_id, outgoing_links = outgoing_current
        incoming_links = b''
        outgoing_current = next(outgoing_iter, None)
    elif outgoing_current[0] < incoming_current[0]:
        page_id, outgoing_links = outgoing_current
        incoming_links = b''
        outgoing_current = next(outgoing_iter, None)
    elif incoming_current[0] < outgoing_current[0]:
        page_id, incoming_links = incoming_current
        outgoing_links = b''
        incoming_current = next(incoming_iter, None)
    else:
        page_id = outgoing_current[0]
        outgoing_links = outgoing_current[1]
        incoming_links = incoming_current[1]
        outgoing_current = next(outgoing_iter, None)
        incoming_current = next(incoming_iter, None)

    outgoing_links_count = 0 if outgoing_links == b'' else len(outgoing_links.split(b'|'))
    incoming_links_count = 0 if incoming_links == b'' else len(incoming_links.split(b'|'))

    columns = [str(page_id).encode(), str(outgoing_links_count).encode(),
               str(incoming_links_count).encode(), outgoing_links, incoming_links]

    # Emit the merged row to stdout: page ID, counts, then the pipe-separated link lists.
    print(b'\t'.join(columns).decode())


2 changes: 1 addition & 1 deletion scripts/prune_pages_file.py
@@ -37,5 +37,5 @@
for line in io.BufferedReader(gzip.open(PAGES_FILE, 'rb')):
    [page_id, page_title, is_redirect] = line.rstrip(b'\n').split(b'\t')

    if True or is_redirect == '0' or page_id in REDIRECTS:
    if is_redirect == b'0' or page_id in REDIRECTS:
        print(b'\t'.join([page_id, page_title, is_redirect]).decode())
1 change: 0 additions & 1 deletion scripts/replace_titles_in_redirects_file.py.dis
@@ -29,7 +29,6 @@ if not REDIRECTS_FILE.endswith('.gz'):
ALL_TARGET_IDS = set()
TARGET_TITLES_TO_IDS = {}
for line in io.BufferedReader(gzip.open(TARGETS_FILE, 'rb')):
print("LALIGNE",line.rstrip(b'\n').split(b'\t'))
[page_id, page_title,_] = line.rstrip(b'\n').split(b'\t')
ALL_TARGET_IDS.add(page_id)
TARGET_TITLES_TO_IDS[page_title] = page_id