# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Originally added as netwerk/dns/prepare_tlds.py.

"""
Processes a file containing effective TLD data.  See the following URL for a
description of effective TLDs and of the file format that this script
processes (although for the latter you're better off just reading this file's
short source code).

http://wiki.mozilla.org/Gecko:Effective_TLD_Service
"""

import codecs
import encodings.idna
import re
import sys


def getEffectiveTLDs(path):
    """
    Yield an EffectiveTLDEntry for every rule line in the file at `path`.

    The file is decoded as UTF-8.  Lines that start with "//" and lines that
    contain no "." are skipped.  For every other line, only the text before
    the first space, tab or newline is used as the rule.

    Raises AssertionError if two rules normalize to the same domain.
    """
    domains = set()
    # The context manager closes the file even if the consumer abandons the
    # generator early or a rule fails to parse.
    with codecs.open(path, "r", "UTF-8") as tld_file:
        # Iterating ends cleanly at EOF.  Explicitly raising StopIteration
        # inside a generator is turned into RuntimeError by PEP 479.
        for line in tld_file:
            line = line.rstrip()
            # comment, empty, or superfluous line for explicitness purposes
            if line.startswith("//") or "." not in line:
                continue
            rule = re.split(r"[ \t\n]", line, maxsplit=1)[0]
            entry = EffectiveTLDEntry(rule)
            domain = entry.domain()
            assert domain not in domains, \
                "repeating domain %s makes no sense" % domain
            domains.add(domain)
            yield entry


def _normalizeHostname(domain):
    """
    Normalizes the given domain, component by component.  ASCII components are
    lowercased, while non-ASCII components are processed using the ToASCII
    algorithm.
    """
    def convertLabel(label):
        if _isASCII(label):
            return label.lower()
        # ToASCII returns a byte string; decode it so that every label is
        # text and the join below never mixes bytes with text.
        return encodings.idna.ToASCII(label).decode("ascii")

    return ".".join(convertLabel(label) for label in domain.split("."))


def _isASCII(s):
    "True if s consists entirely of ASCII characters, false otherwise."
    return all(ord(c) <= 127 for c in s)


class EffectiveTLDEntry(object):
    """
    Stores an entry in an effective-TLD name file.
    """

    # Class-level defaults; __init__ overrides them per instance as needed.
    _exception = False
    _wild = False

    def __init__(self, line):
        """
        Creates a TLD entry from a line of data, which must have been stripped
        of the line ending.

        A leading "!" marks an exception entry and a leading "*." marks a
        wildcard entry; the marker is removed before the remaining domain is
        normalized.
        """
        if line.startswith("!"):
            self._exception = True
            domain = line[1:]
        elif line.startswith("*."):
            self._wild = True
            domain = line[2:]
        else:
            domain = line
        self._domain = _normalizeHostname(domain)

    def domain(self):
        "The normalized domain this entry represents."
        return self._domain

    def exception(self):
        "True if this entry's domain does not denote an effective TLD."
        return self._exception

    def wild(self):
        "True if this entry represents a class of effective TLDs."
        return self._wild


#################
# DO EVERYTHING #
#################

def main():
    """
    argv[1] is the effective TLD file to parse.
    One ETLD_ENTRY("domain", exception, wild) line per entry of the eTLD file
    is then printed to stdout, with the two flags spelled "true" or "false".
    """

    def boolStr(b):
        if b:
            return "true"
        return "false"

    for etld in getEffectiveTLDs(sys.argv[1]):
        exception = boolStr(etld.exception())
        wild = boolStr(etld.wild())
        # Single parenthesized argument: valid as both the Python 2 print
        # statement and the Python 3 print function.
        print('ETLD_ENTRY("%s", %s, %s)' % (etld.domain(), exception, wild))


if __name__ == '__main__':
    main()