Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ The assembly jar file includes also the [WebGraph](https://webgraph.di.unimi.it/

### Javadocs

The Javadocs are created by `mvn javadoc:javadoc`. Then open the file `target/site/apidocs/index.html` in a browser.
The Javadocs are created by `mvn javadoc:javadoc`. Then open the file `target/reports/apidocs/index.html` in a browser.


### Source Code Formatting
Expand All @@ -41,7 +41,7 @@ The host-level web graph is built with help of PySpark, the corresponding code i

### Domain-Level Web Graph

The domain-level web graph is distilled from the host-level graph by mapping host names to domain names. The ID mapping is kept in memory as an int array or [FastUtil's big array](https://fastutil.di.unimi.it/docs/it/unimi/dsi/fastutil/BigArrays.html) if the host-level graph has more vertices than a Java array can hold (around 2³¹). The Java tool to fold the host graph is best run from the script [host2domaingraph.sh](src/script/host2domaingraph.sh).
The domain-level web graph is distilled from the host-level graph by mapping host names to domain names. The ID mapping is kept in memory as an int array or [FastUtil's big array](https://fastutil.di.unimi.it/docs/it/unimi/dsi/fastutil/BigArrays.html) if the host-level graph has more vertices than a Java array can hold (around 2³¹). The Java tool to fold the host graph is best run from the script [host2domaingraph.sh](src/script/host2domaingraph.sh). Please see the script and the Java class [HostToDomainGraph](src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java) for further details.

### Processing Graphs using the WebGraph Framework

Expand Down
30 changes: 24 additions & 6 deletions src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ public class HostToDomainGraph {
private long numInputLinesEdges = 0;
protected String lastRevHost = null;
protected Domain lastDomain = null;
protected String lastOutputDomain = null;
private TreeMap<String, Domain> domainQueue = new TreeMap<>();
private int maxQueueUsed = 0;

Expand Down Expand Up @@ -197,17 +198,26 @@ public static int compareRevDomainsSafe(String d1, String d2) {
char c1 = d1.charAt(i);
char c2 = d2.charAt(i);
if (c1 != c2) {
if (c1 == HYPHEN && c2 == DOT) {
/*
* Cannot finish "no.hedmark-folkemusikklag" unless "no.hedmark.os.www" is done
* because input which is mapped to a suffix (a prefix in reversed domain name
* notation) is still expected, e.g. "no.hedmark.www" which is mapped to
* "no.hedmark".
Comment on lines +205 to +206
Copy link
Copy Markdown
Contributor

@handecelikkanat handecelikkanat May 2, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sebastian-nagel is "no.hedmark.www" being still mapped to "no.hedmark"?

(I'm assuming this is about the previous domain folding mapping setting. Maybe this one relates to the sorting, but I couldn't imagine how.)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, "no.hedmark.www" is mapped to "no.hedmark". "no" is a public suffix, "hedmark.no" is not. See this comment and the included links in #33.

*/
return 0;
}
return c1 - c2;
} else if (c1 == HYPHEN) {
/*
* cannot finish "org.example-domain" unless "org.example" is done
* Cannot finish "org.example-domain" unless "org.example" is done.
*/
return 0;
} else if (c1 == DOT) {
dots++;
if (dots > 1) {
/*
* cannot finish "name.his.forgot.foobar" unless "name.his" is done
* Cannot finish "name.his.forgot.foobar" unless "name.his" is done.
*
* This is a special case of multi-part suffixes with more than two parts when
* the first part is also a public suffix, e.g. (in reversed domain name
Expand Down Expand Up @@ -401,7 +411,7 @@ private Domain queueDomain(StringBuilder sb, String domainName) {
String firstDomain = domainQueue.firstKey();
if (!Domain.isSafeToOutput(firstDomain, revDomainName)) {
/*
* queued domains are sorted lexicographically: if the first/current domain
* Queued domains are sorted lexicographically: if the first/current domain
* cannot be safely dequeued and written to output, this is also the case for
* the following ones.
*/
Expand Down Expand Up @@ -430,6 +440,7 @@ private String getNodeLine(Domain domain) {
}

private void getNodeLine(StringBuilder b, Domain domain) {
String domainName = null;
if (domain == null)
return;
if (domain.id >= 0 && domain.name != null) {
Expand All @@ -438,7 +449,8 @@ private void getNodeLine(StringBuilder b, Domain domain) {
}
b.append(domain.id);
b.append('\t');
b.append(reverseHost(domain.name));
domainName = reverseHost(domain.name);
b.append(domainName);
if (countHosts) {
b.append('\t');
b.append(domain.numberOfHosts);
Expand All @@ -447,6 +459,13 @@ private void getNodeLine(StringBuilder b, Domain domain) {
for (Long hostId : domain.ids) {
setValue(hostId.longValue(), domain.id);
}
if (lastOutputDomain != null && lastOutputDomain.compareTo(domainName) >= 0) {
String msg = "Output domains are not strictly monotonically sorted: " + lastOutputDomain + " <> "
+ domainName;
LOG.error(msg);
throw new RuntimeException(msg);
}
lastOutputDomain = domainName;
}

public String convertEdge(String line) {
Expand Down Expand Up @@ -540,8 +559,7 @@ private static void showHelp() {
System.err.println(" \tpublic suffix list, ");
System.err.println(
" \tsee https://github.com/publicsuffix/list/wiki/Format#divisions)");
System.err
.println(" \t- host-without-www: strip the www. prefix (keep the ");
System.err.println(" \t- host-without-www: strip the www. prefix (keep the ");
System.err.println(" \tfull host otherwise)");
System.err.println(" --multipart-suffixes-as-domains\toutput host names which are equal to multi-part");
System.err.println(" \tpublic suffixes (the suffix contains a dot) as domain");
Expand Down
121 changes: 84 additions & 37 deletions src/script/host2domaingraph.sh
Original file line number Diff line number Diff line change
Expand Up @@ -65,64 +65,111 @@ PARALLEL_SORT_THREADS=2
# 1 C locale is mandatory to keep reversed hosts of one domain or top-level domain
# together in a single block:
# echo -e "com.opus\ncom.opera\nco.mopus\nco.mopera" | shuf | LC_ALL=en_US.utf8 sort
# co.mopera
# com.opera
# com.opus
# co.mopus
# vs.
# echo -e "com.opus\ncom.opera\nco.mopus\nco.mopera" | shuf | LC_ALL=C sort
# co.mopera
# co.mopus
# com.opera
# com.opus
# This requirement is met by the output of the cc-pyspark job.
#
# 2 The second problem stems from the fact that a hyphen (valid in host and
# subdomain names) is sorted before the dot:
# In an older version, the input was re-sorted to try to group
# domains and their subdomains together:
#
# 2 Sorting with the C locale places a hyphen (valid in host and subdomain names)
# before a dot:
# ac.gov
# ac.gov.ascension
# ac.gov.ascension-island
# ac.gov.ascension.mail
# Unfortunately the output of the cc-pyspark job does not completely meet this
# sorting criterion.
# As a result, the domain "ac.gov.ascension" and its subdomain "ac.gov.ascension.mail"
# end up in two separate blocks of the input, even when sorting with the C locale.
#
# The initial solution to ensure that the subdomains of "ac.gov.ascension" are not split
# into two blocks, was to add an artificial dot temporarily to the end of each host
# name during sorting:
# zcat vertices.txt.gz | sed -e 's/$/./' \
# | sort $SORTOPTS -t$'\t' -k2,2 | sed -e 's/\.$//'
# The domain name "ac.gov.ascension" in the example above becomes temporarily
# "ac.gov.ascension." and is now sorted after "ac.gov.ascension-island."
#
# To avoid this step (re-sorting billions of lines is expensive), the HostToDomainGraph
# class now caches potentially "missorted" candidates and processes them later together
# with the related subdomains / host names.
#
# Note: The final sorting of the domain names is the same as if there were
# a trailing dot:
# ac.gov.ascension-island
# ac.gov.ascension
# A sort order that keeps hosts/domains of a common suffix in one block can be
# also achieved if dots are replaced by commas:
# zcat vertices.txt.gz | tr . , \
# | sort $SORTOPTS -t$'\t' -k2,2 | tr , .
# This approach is utilized by the "Sort-friendly URI Reordering Transform" (SURT),
# see <http://crawler.archive.org/articles/user_manual/glossary.html#surt>.
#
# However, the public suffix list adds a further issue, which makes it impossible
# to group domains and subdomains together, by simply sorting the input:
#
# 3 The public suffix list adds a further issue: there are multi-part suffixes,
# such as "co.uk" (or "uk.co" in reverse domain name notation). And the suffixes
# of a multi-part suffix can be public suffixes themselves: also "uk" is a public
# suffix. But they do not need to. For example: "no" and "os.hordaland.no" are
# in the public suffix list but "hordaland.no" is not. In this situation,
# adding a trailing dot does not even guarantee that all hosts of a domain under
# a public suffix is in a contiguous block:
#
# $> cat hordaland.txt
# no.hordaland
# no.hordaland-teater
# no.hordaland.os
# no.hordaland.os.bibliotek
# no.hordaland.oygarden
# no.hordalandfolkemusikklag
#
# $> cat hordaland.txt | sed 's/$/./' | LC_ALL=C sort
# no.hordaland-teater.
# no.hordaland.
# no.hordaland.os.
# no.hordaland.os.bibliotek.
# no.hordaland.oygarden.
# no.hordalandfolkemusikklag.
# 3 There are multi-part suffixes, such as "co.uk" (or "uk.co" in reverse domain name
# notation). And the suffixes of a multi-part suffix can be public suffixes themselves:
# also "uk" is a public suffix. But they do not need to. For example: "no" and
# "os.hordaland.no" are in the public suffix list but "hordaland.no" is not.
# In this situation, adding a trailing dot does not even guarantee that all hosts of
# a domain under a public suffix are in a contiguous block:
#
# $> cat hordaland.txt
# no.hordaland
# no.hordaland-teater
# no.hordaland.os
# no.hordaland.os.bibliotek
# no.hordaland.oygarden
# no.hordalandfolkemusikklag
#
# $> cat hordaland.txt | sed 's/$/./' | LC_ALL=C sort
# no.hordaland-teater.
# no.hordaland.
# no.hordaland.os.
# no.hordaland.os.bibliotek.
# no.hordaland.oygarden.
# no.hordalandfolkemusikklag.
#
# The host names "no.hordaland." and "no.hordaland.oygarden." both
# are under the domain "no.hordaland" (public suffix is "no").
#
# Please see https://github.com/commoncrawl/cc-webgraph/issues/3
# for further details.
# To address this issue (point 3), the HostToDomainGraph class now caches
# potentially "missorted" candidates and processes them later together
# with the related subdomains / host names.
#
# 4 This also avoids re-sorting billions of input lines, which is
# computationally expensive.
#
# Output sorting:
#
# 5 Ideally, the domain output should be lexicographically sorted
# as well. This is a requirement to store the map of node names and IDs
# in an "immutable external prefix map" (IEPM).
# If a trailing dot is added and then removed (and no cache is used), the
# output sorting would consequently be the same as if there were a trailing dot:
# ac.gov.ascension-island.
# ac.gov.ascension.
# respectively (after removing the trailing dot)
# ac.gov.ascension-island
# ac.gov.ascension
#
# The required ASCII sorting is:
# ac.gov.ascension
# ac.gov.ascension-island
#
# We cannot re-sort the output because this would also require changing
# the node IDs: the WebGraph framework expects the arc/edge input
# to be numerically sorted, and the vertices/nodes are enumerated as they
# are sorted, i.e. node IDs are line numbers starting with zero.
#
# Note: The approach to replace dots by commas ensures proper lexicographic
# sorting even if the replacement is inverted. However, it does not guarantee
# that all domains of one suffix are in a contiguous block, if that suffix
# is a suffix of another suffix. See point 3.
#
# Please see https://github.com/commoncrawl/cc-webgraph/issues/3
# and https://github.com/commoncrawl/cc-webgraph/issues/33
# for further details.
#

export LC_ALL=C
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,44 @@ class TestHostToDomainGraph {
"3\tno.hordalandfolkemusikklag\t1", //
};

/*
* Issue #33: domain output is not sorted if a domain name that is a string
* suffix of a public suffix appears only after the longer suffix.
*/
String[] hostGraphDomainInSuffixA = { //
"0\tno.hedland", //
"1\tno.hedmark-folkemusikklag", //
"2\tno.hedmark-trafikk", //
"3\tno.hedmark.m", //
"4\tno.hedmark.os.www", //
"5\tno.hedmark.www", //
"6\tno.hedmarktrafikk", //
};
String[] hostGraphDomainInSuffixB = { //
"0\tno.hedland", //
"1\tno.hedmark-folkemusikklag", //
"2\tno.hedmark-trafikk", //
"3\tno.hedmark.os.www", //
"4\tno.hedmark.www", //
"5\tno.hedmarktrafikk", //
};
String[] domainGraphDomainInSuffixA = { //
"0\tno.hedland\t1", //
"1\tno.hedmark\t2", //
"2\tno.hedmark-folkemusikklag\t1", //
"3\tno.hedmark-trafikk\t1", //
"4\tno.hedmark.os.www\t1", //
"5\tno.hedmarktrafikk\t1", //
};
String[] domainGraphDomainInSuffixB = { //
"0\tno.hedland\t1", //
"1\tno.hedmark\t1", //
"2\tno.hedmark-folkemusikklag\t1", //
"3\tno.hedmark-trafikk\t1", //
"4\tno.hedmark.os.www\t1", //
"5\tno.hedmarktrafikk\t1", //
};

/**
* <code>forgot.his.name</code> is in the "private section" of the public suffix
* list, while <code>name</code> is in the ICANN section, see
Expand Down Expand Up @@ -194,7 +232,7 @@ private long[] getNodeIDs(String[] graph) {
}

/**
* test whether node names are properly sorted and IDs are correctly assigned
* Test whether node names are properly sorted and IDs are correctly assigned
* (sequentially, strictly monotonically increasing, no gaps)
*/
void testSorted(String[] graph) {
Expand Down Expand Up @@ -276,6 +314,26 @@ void testConvertNodesHyphenatedDomainsIncludingMultiPartSuffixes() {
convert(converter, hostGraphHyphenatedDomains));
}

@Test
void testConvertNodesEnsureSortedOutputA() {
    // Sanity-check the fixtures first: both the host-level input and the
    // expected domain-level result must pass the sortedness / ID check.
    testSorted(hostGraphDomainInSuffixA);
    testSorted(domainGraphDomainInSuffixA);

    // The expected fixture carries a third column per node, so counting is
    // switched on before converting (presumably this emits the host count).
    converter.doCount(true);
    final String[] actualDomainGraph = convert(converter, hostGraphDomainInSuffixA);

    // The converted graph must be sorted and match the expectation exactly.
    testSorted(actualDomainGraph);
    assertArrayEquals(domainGraphDomainInSuffixA, actualDomainGraph);
}

@Test
void testConvertNodesEnsureSortedOutputB() {
    // Sanity-check the fixtures first: both the host-level input and the
    // expected domain-level result must pass the sortedness / ID check.
    testSorted(hostGraphDomainInSuffixB);
    testSorted(domainGraphDomainInSuffixB);

    // The expected fixture carries a third column per node, so counting is
    // switched on before converting (presumably this emits the host count).
    converter.doCount(true);
    final String[] actualDomainGraph = convert(converter, hostGraphDomainInSuffixB);

    // The converted graph must be sorted and match the expectation exactly.
    testSorted(actualDomainGraph);
    assertArrayEquals(domainGraphDomainInSuffixB, actualDomainGraph);
}

@Test
void testConvertPrivateDomain() {
// verify sorting of input and expected output
Expand Down
Loading