|
1 # This Source Code Form is subject to the terms of the Mozilla Public |
|
2 # License, v. 2.0. If a copy of the MPL was not distributed with this |
|
3 # file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
4 |
|
5 import codecs |
|
6 import encodings.idna |
|
7 import re |
|
8 import sys |
|
9 |
|
10 """ |
|
11 Processes a file containing effective TLD data. See the following URL for a |
|
12 description of effective TLDs and of the file format that this script |
|
13 processes (although for the latter you're better off just reading this file's |
|
14 short source code). |
|
15 |
|
16 http://wiki.mozilla.org/Gecko:Effective_TLD_Service |
|
17 """ |
|
18 |
|
def getEffectiveTLDs(path):
    """
    Generates an EffectiveTLDEntry for each rule line in the effective-TLD
    file at path, which is decoded as UTF-8.

    Lines starting with "//" and lines containing no "." (comments, empty
    lines, and lines that exist only for explicitness) are skipped. For every
    other line, only the text before the first space, tab, or newline is used.

    Raises AssertionError if the same normalized domain occurs more than once.
    """
    domains = set()
    # The with-block closes the file once iteration finishes, fails, or the
    # generator is closed, instead of leaking the handle.
    with codecs.open(path, "r", "UTF-8") as tldFile:
        # Running off the end of the loop ends the generator normally.
        # Raising StopIteration by hand is turned into RuntimeError by
        # PEP 479 (Python 3.7+).
        for line in tldFile:
            line = line.rstrip()
            # comment, empty, or superfluous line for explicitness purposes
            if line.startswith("//") or "." not in line:
                continue
            # Keep only the rule itself; anything after whitespace is ignored.
            line = re.split(r"[ \t\n]", line, maxsplit=1)[0]
            entry = EffectiveTLDEntry(line)
            domain = entry.domain()
            assert domain not in domains, \
                "repeating domain %s makes no sense" % domain
            domains.add(domain)
            yield entry
|
38 |
|
def _normalizeHostname(domain):
    """
    Normalizes the given domain, component by component. ASCII components are
    lowercased, while non-ASCII components are processed using the ToASCII
    algorithm.

    Returns the converted components joined with "." as a text string.
    """
    def convertLabel(label):
        if _isASCII(label):
            return label.lower()
        encoded = encodings.idna.ToASCII(label)
        # ToASCII yields a byte string. Decode it so it can be joined with the
        # lowercased text labels; joining bytes with text is a TypeError on
        # Python 3.
        if isinstance(encoded, bytes):
            encoded = encoded.decode("ascii")
        return encoded
    return ".".join(convertLabel(label) for label in domain.split("."))
|
50 |
|
51 def _isASCII(s): |
|
52 "True if s consists entirely of ASCII characters, false otherwise." |
|
53 for c in s: |
|
54 if ord(c) > 127: |
|
55 return False |
|
56 return True |
|
57 |
|
class EffectiveTLDEntry:
    """
    A single rule read from an effective-TLD name file.
    """

    # Class-level defaults; __init__ overrides them on the instance only when
    # the rule carries the matching prefix.
    _exception = False
    _wild = False

    def __init__(self, line):
        """
        Builds the entry from one line of data, which must already have had
        its line ending removed. A leading "!" marks an exception rule and a
        leading "*." marks a wildcard rule; the prefix is dropped and the
        remainder is normalized and stored as the domain.
        """
        hostname = line
        if line[:1] == "!":
            hostname = line[1:]
            self._exception = True
        elif line[:2] == "*.":
            hostname = line[2:]
            self._wild = True
        self._domain = _normalizeHostname(hostname)

    def domain(self):
        "The normalized domain of this entry, without any rule prefix."
        return self._domain

    def exception(self):
        "True if this entry's domain does not denote an effective TLD."
        return self._exception

    def wild(self):
        "True if this entry stands for a whole class of effective TLDs."
        return self._wild
|
92 |
|
93 |
|
94 ################# |
|
95 # DO EVERYTHING # |
|
96 ################# |
|
97 |
|
def main():
    """
    argv[1] is the effective TLD file to parse.
    One ETLD_ENTRY("domain", exception, wild) line per entry of the eTLD
    file is then printed to stdout, with the two flags spelled as the
    lowercase literals true/false.
    """

    def boolStr(b):
        # Spell the flag as a lowercase true/false literal.
        return "true" if b else "false"

    for etld in getEffectiveTLDs(sys.argv[1]):
        exception = boolStr(etld.exception())
        wild = boolStr(etld.wild())
        # Parenthesised single-argument form: prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print('ETLD_ENTRY("%s", %s, %s)' % (etld.domain(), exception, wild))
|
114 |
|
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()