Skip to content

Commit e867fe2

Browse files
Extend HostToDomainGraph to fold host-level graphs stripping the www. prefix (#30)
In this PR we: - added a new CLI parameter (--aggregation-level) for folding the host graph into a domain graph by stripping the www. prefix - deprecated the old parameters --private and --private-domains - added unit tests --------- Co-authored-by: Sebastian Nagel <sebastian@commoncrawl.org>
1 parent 190d498 commit e867fe2

4 files changed

Lines changed: 148 additions & 7 deletions

File tree

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,3 +28,4 @@ hs_err_pid*
2828
.project
2929
.classpath
3030
.settings/
31+
.idea

src/main/java/org/commoncrawl/webgraph/HostToDomainGraph.java

Lines changed: 91 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ public class HostToDomainGraph {
6868

6969
protected boolean countHosts = false;
7070
protected boolean privateDomains = false;
71+
72+
protected boolean stripWww = false;
7173
protected boolean includeMultiPartSuffixes = false;
7274

7375
protected long maxSize;
@@ -84,6 +86,13 @@ public class HostToDomainGraph {
8486

8587
private static Pattern SPLIT_HOST_PATTERN = Pattern.compile("\\.");
8688

89+
public final static String AGGREGATION_HOST_WITHOUT_WWW = "host-without-www";
90+
public final static String AGGREGATION_PRIVATE_DOMAIN = "private-domain";
91+
public final static String AGGREGATION_REGISTERED_DOMAIN = "registered-domain";
92+
93+
private final static List<String> ALLOWED_AGGREGATION_PARAMS = java.util.Arrays
94+
.asList(AGGREGATION_REGISTERED_DOMAIN, AGGREGATION_PRIVATE_DOMAIN, AGGREGATION_HOST_WITHOUT_WWW);
95+
8796
private Consumer<? super String> reporterInputNodes = (String line) -> {
8897
if ((numInputLinesNodes % 500000) != 0 || numInputLinesNodes == 0) {
8998
return;
@@ -281,6 +290,13 @@ public void multiPartSuffixesAsDomains(boolean include) {
281290
this.includeMultiPartSuffixes = include;
282291
}
283292

293+
/**
294+
* @param stripWww if true the www. prefix is stripped
295+
*/
296+
public void setStripWww(boolean stripWww) {
297+
this.stripWww = stripWww;
298+
}
299+
284300
/**
285301
* Reverse host name, eg. <code>www.example.com</code> is reversed to
286302
* <code>com.example.www</code>. Can also be used to "unreverse" a reversed host
@@ -327,13 +343,23 @@ public String convertNode(String line) {
327343
}
328344
lastRevHost = revHost;
329345
String host = reverseHost(revHost);
330-
String domain = EffectiveTldFinder.getAssignedDomain(host, true, !privateDomains);
346+
String domain = null;
331347
StringBuilder sb = new StringBuilder();
332-
if (domain == null && includeMultiPartSuffixes) {
333-
if (EffectiveTldFinder.getEffectiveTLDs().containsKey(host) && host.indexOf('.') != -1) {
334-
LOG.info("Accepting public suffix (containing dot) as domain: {}", host);
348+
if (this.stripWww) {
349+
if (host.startsWith("www.") && host.indexOf('.', 4) != -1) {
350+
// strip leading 'www' to reduce number of "duplicate" hosts,
351+
// but leave at least 2 trailing parts (www.com is a valid domain)
352+
host = host.substring(4);
335353
}
336354
domain = host;
355+
} else {
356+
domain = EffectiveTldFinder.getAssignedDomain(host, true, !privateDomains);
357+
if (domain == null && includeMultiPartSuffixes) {
358+
if (EffectiveTldFinder.getEffectiveTLDs().containsKey(host) && host.indexOf('.') != -1) {
359+
LOG.info("Accepting public suffix (containing dot) as domain: {}", host);
360+
}
361+
domain = host;
362+
}
337363
}
338364
if (domain == null) {
339365
LOG.warn("No domain for host: {}", host);
@@ -499,9 +525,24 @@ private static void showHelp() {
499525
System.err.println("Options:");
500526
System.err.println(" -h\t(also -? or --help) show usage message and exit");
501527
System.err.println(" -c\tcount hosts per domain (additional column in <nodes_out>");
502-
System.err.println(" --private-domains\tconvert to private domains (include suffixes from the");
528+
System.err.println(" --private-domains\t(deprecated - use --aggregation-level)");
529+
System.err.println(" \tconvert to private domains (include suffixes from the");
503530
System.err.println(" \tPRIVATE domains subdivision of the public suffix list,");
504-
System.err.println(" \tsee https://github.com/publicsuffix/list/wiki/Format#divisions");
531+
System.err.println(" \tsee https://github.com/publicsuffix/list/wiki/Format#divisions)");
532+
System.err.println(" --aggregation-level <level>\tdefine the strategy on which hosts are folded to domains.");
533+
System.err
534+
.println(" \t<level> values: registered-domain (default), private-domain, ");
535+
System.err.println(" \thost-without-www. ");
536+
System.err.println(" \t- registered-domain: convert only the registered domains ");
537+
System.err.println(" \t- private-domain: convert to private domains ");
538+
System.err.println(
539+
" \t(include suffixes from the PRIVATE domains subdivision of the ");
540+
System.err.println(" \tpublic suffix list, ");
541+
System.err.println(
542+
" \tsee https://github.com/publicsuffix/list/wiki/Format#divisions)");
543+
System.err
544+
.println(" \t- host-without-www: strip the www. prefix (keep the ");
545+
System.err.println(" \tfull host otherwise)");
505546
System.err.println(" --multipart-suffixes-as-domains\toutput host names which are equal to multi-part");
506547
System.err.println(" \tpublic suffixes (the suffix contains a dot) as domain");
507548
System.err.println(" \tnames, eg. `gov.uk', `freight.aero' or `altoadige.it'.");
@@ -512,6 +553,8 @@ public static void main(String[] args) {
512553
boolean countHosts = false;
513554
boolean includeMultiPartSuffixes = false;
514555
boolean privateDomains = false;
556+
String aggregationLevel = null;
557+
boolean stripWww = false;
515558
int argpos = 0;
516559
while (argpos < args.length && args[argpos].startsWith("-")) {
517560
switch (args[argpos]) {
@@ -528,9 +571,28 @@ public static void main(String[] args) {
528571
includeMultiPartSuffixes = true;
529572
break;
530573
case "--private-domains":
531-
case "--private": // back-ward compatibility
574+
case "--private": // back-ward compatibility (but deprecated in favour of --aggregation-level)
575+
LOG.warn(
576+
"The parameter --private / --private-domains is deprecated, in favour of --aggregation-level with value private-domain");
532577
privateDomains = true;
533578
break;
579+
case "--aggregation-level":
580+
if ((argpos + 1) >= args.length) {
581+
LOG.error("Missing value for option " + args[argpos]);
582+
showHelp();
583+
System.exit(1);
584+
}
585+
String value = args[argpos + 1];
586+
587+
if (!ALLOWED_AGGREGATION_PARAMS.contains(value)) {
588+
LOG.error("Unknown value for option " + args[argpos] + ": " + value);
589+
showHelp();
590+
System.exit(1);
591+
} else {
592+
aggregationLevel = value;
593+
}
594+
argpos++;
595+
break;
534596
default:
535597
System.err.println("Unknown option " + args[argpos]);
536598
showHelp();
@@ -549,15 +611,37 @@ public static void main(String[] args) {
549611
LOG.error("Invalid number: " + args[argpos + 0]);
550612
System.exit(1);
551613
}
614+
if (aggregationLevel != null) {
615+
if (privateDomains) {
616+
LOG.error(
617+
"You cannot specify both --private or --private-domains, and --aggregation-level. "
618+
+ "Prefer --aggregation-level [level] because it will supersede the other option.");
619+
System.exit(1);
620+
} else {
621+
switch (aggregationLevel) {
622+
case AGGREGATION_REGISTERED_DOMAIN:
623+
break;
624+
case AGGREGATION_PRIVATE_DOMAIN:
625+
privateDomains = true;
626+
break;
627+
case AGGREGATION_HOST_WITHOUT_WWW:
628+
stripWww = true;
629+
break;
630+
}
631+
}
632+
}
633+
552634
HostToDomainGraph converter;
553635
if (maxSize <= Arrays.MAX_ARRAY_SIZE) {
554636
converter = new HostToDomainGraph((int) maxSize);
555637
} else {
556638
converter = new HostToDomainGraphBig(maxSize);
557639
}
640+
558641
converter.doCount(countHosts);
559642
converter.multiPartSuffixesAsDomains(includeMultiPartSuffixes);
560643
converter.doPrivateDomains(privateDomains);
644+
converter.setStripWww(stripWww);
561645
converter.reportConfig();
562646
String nodesIn = args[argpos + 1];
563647
String nodesOut = args[argpos + 2];

src/script/host2domaingraph.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,13 @@ while true; do
1111
PROPERTIES=("${PROPERTIES[@]}" "$1")
1212
shift
1313
;;
14+
"--aggregation-level" )
15+
FLAGS=("${FLAGS[@]}" "$1")
16+
shift
17+
# takes one argument
18+
FLAGS=("${FLAGS[@]}" "$1")
19+
shift
20+
;;
1421
"-"* )
1522
FLAGS=("${FLAGS[@]}" "$1")
1623
shift

src/test/java/org/commoncrawl/webgraph/TestHostToDomainGraph.java

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,26 @@ class TestHostToDomainGraph {
139139
"8\tname.hit\t1", //
140140
};
141141

142+
String[] hostGraphWithWwwDomains = { //
143+
"0\tname.hiro", //
144+
"1\tname.hiro.adam", //
145+
"2\tname.hiro.www", //
146+
"3\tname.his.forgot.ben.www", //
147+
"4\tname.his.forgot.never", //
148+
"5\tname.his.prz.www", //
149+
"6\tname.his.www", //
150+
"7\tname.hit.www", //
151+
};
152+
String[] domainGraphWithWwwDomains = { //
153+
"0\tname.hiro\t2", //
154+
"1\tname.hiro.adam\t1", //
155+
"2\tname.his\t1", //
156+
"3\tname.his.forgot.ben\t1", //
157+
"4\tname.his.forgot.never\t1", //
158+
"5\tname.his.prz\t1", //
159+
"6\tname.hit\t1", //
160+
};
161+
142162
@BeforeEach
143163
void init() {
144164
converter = new HostToDomainGraph(maxGraphNodes);
@@ -267,4 +287,33 @@ void testConvertPrivateDomain() {
267287
assertArrayEquals(domainGraphPrivateDomains, convert(converter, hostGraphPrivateDomains));
268288
}
269289

290+
@Test
291+
void testConvertStripWww() {
292+
// verify sorting of input and expected output
293+
testSorted(hostGraphWithWwwDomains);
294+
testSorted(domainGraphWithWwwDomains);
295+
converter.doCount(true);
296+
converter.setStripWww(true);
297+
converter.multiPartSuffixesAsDomains(true);
298+
String[] convert = convert(converter, hostGraphWithWwwDomains);
299+
assertArrayEquals(domainGraphWithWwwDomains, convert);
300+
}
301+
302+
/**
303+
* Test that www.com is not stripped (only one trailing part after www.)
304+
*/
305+
@Test
306+
void testConvertStripWwwEdgeCaseWwwDotCom() {
307+
String[] hostGraph = { //
308+
"0\tcom.www", //
309+
};
310+
String[] expectedDomainGraph = { //
311+
"0\tcom.www\t1", //
312+
};
313+
converter.doCount(true);
314+
converter.setStripWww(true);
315+
String[] convert = convert(converter, hostGraph);
316+
assertArrayEquals(expectedDomainGraph, convert);
317+
}
318+
270319
}

0 commit comments

Comments
 (0)