Add script to synthesize intermediate domains to work around an algorithm limitation...
authorjefftk <jefftk@google.com>
Tue, 17 Mar 2015 17:03:07 +0000 (17:03 +0000)
committerjefftk <jefftk@google.com>
Tue, 17 Mar 2015 17:03:07 +0000 (17:03 +0000)
src/scripts/synthesize_entries.py [new file with mode: 0755]

diff --git a/src/scripts/synthesize_entries.py b/src/scripts/synthesize_entries.py
new file mode 100755 (executable)
index 0000000..67ed2b1
--- /dev/null
@@ -0,0 +1,77 @@
+#!/usr/bin/python
+
+# Usage: scripts/synthesize_entries.py \
+#          third_party/effective_tld_names/effective_tld_names.dat
+#
+# Author: Jeff Kaufman (jefftk@google.com)
+#
+# There are two unusual cases that the C code for the domain registry provider
+# can't handle properly:
+#
+# 1. list contains both *.c and a.b.c
+# 2. list contains a.b.c.d and c.d but not b.c.d
+#
+# We can fix this by synthesizing entries for b.c in case 1 and b.c.d in case
+# 2.  The first fix is completely ok, just redundant, but the second case means
+# categorizing foo.b.c.d as being under public suffic b.c.d instead of the
+# technically correct c.d.  This is rare and doesn't seem to be a problem for
+# existing entries (like amazonaws.com), so for now we'll just work around the
+# broken code with slightly messy data.
+#
+# See https://code.google.com/p/domain-registry-provider/issues/detail?id=3 for
+# more discussion.
+
+import sys
+from collections import defaultdict
+
+def start(effective_tld_names_fname):
+    entries = set()
+    with open(effective_tld_names_fname) as inf:
+        for line in inf:
+            line = line.split("//")[0].strip()  # remove comments and whitespace
+            if not line:
+                continue
+            entries.add(line)
+
+    # Allow multiple comments per entry, in case we want the same entry multiple
+    # times for different reasons.
+    new_entries = defaultdict(list) # {new_entry: [comments], ...]
+
+    for entry in entries:
+        if entry.count('.') >= 2:
+            leaf, candidate_entry = entry.split(".", 1)
+            if leaf.startswith("!") or leaf == "*":
+                continue
+            
+            _, parent = candidate_entry.split(".", 1)
+            if candidate_entry in entries:
+                continue
+            star_parent = "*.%s" % parent
+
+            if star_parent in entries or parent in entries:
+                # If entry is 'a.b.c' and '*.c' is also an entry, we need to
+                # synthesize a new entry 'b.c'.
+                #
+                # If entry is a.b.c and c is also an entry but b.c is not, we
+                # need to synthesize a new entry b.c.  This one isn't
+                # technically legal, but it's better than getting it completely
+                # wrong which is what the C code would otherwise do with this
+                # case.
+                new_entries[candidate_entry].append("For %s" % entry)
+
+    if not new_entries:
+        return
+                
+    with open(effective_tld_names_fname, "a") as outf:
+        category = "DOMAIN REGISTRY PROVIDER SYNTHESIZED DOMAINS"
+        def add_comment(s):
+            outf.write("// %s\n" % s)
+        add_comment("===BEGIN %s===" % category)
+        add_comment("synthesized by scripts/synthesize_entries.py")
+        for new_entry in sorted(new_entries):
+            for comment in sorted(new_entries[new_entry]):
+                add_comment(comment)
+            outf.write("%s\n" % new_entry)
+        add_comment("===END %s===" % category)
+if __name__ == "__main__":
+    start(*sys.argv[1:])