# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""
Processes a file containing effective TLD data.  See the following URL for a
description of effective TLDs and of the file format that this script
processes (although for the latter you're better off just reading this file's
short source code).

http://wiki.mozilla.org/Gecko:Effective_TLD_Service
"""

import codecs
import encodings.idna
import re
import sys


def getEffectiveTLDs(path):
    """
    Yields an EffectiveTLDEntry for every rule line in the UTF-8 encoded
    file at `path`.

    Lines starting with "//" and lines containing no "." are skipped.  Only
    the text before the first space, tab or newline of a line is used as the
    rule.  An AssertionError is raised if the same normalized domain occurs
    more than once.
    """
    domains = set()
    # The with-block closes the file once the generator is exhausted or
    # closed, instead of leaking the handle.
    with codecs.open(path, "r", "UTF-8") as tldFile:
        # Reaching the end of the file simply ends the loop, which ends the
        # generator.  Raising StopIteration by hand inside a generator is
        # turned into a RuntimeError by PEP 479 (Python 3.7+).
        for line in tldFile:
            line = line.rstrip()
            # comment, empty, or superfluous line for explicitness purposes
            if line.startswith("//") or "." not in line:
                continue
            # Anything after the first whitespace character is ignored.
            line = re.split(r"[ \t\n]", line, maxsplit=1)[0]
            entry = EffectiveTLDEntry(line)
            domain = entry.domain()
            assert domain not in domains, \
                "repeating domain %s makes no sense" % domain
            domains.add(domain)
            yield entry


def _normalizeHostname(domain):
    """
    Normalizes the given domain, component by component.  ASCII components are
    lowercased, while non-ASCII components are processed using the ToASCII
    algorithm.
    """
    def convertLabel(label):
        if _isASCII(label):
            return label.lower()
        # ToASCII returns a byte string; decode it so that every label is
        # text and the join below never mixes bytes with text.
        return encodings.idna.ToASCII(label).decode("ascii")
    return ".".join(convertLabel(label) for label in domain.split("."))


def _isASCII(s):
    "True if s consists entirely of ASCII characters, false otherwise."
    return all(ord(c) <= 127 for c in s)


class EffectiveTLDEntry(object):
    """
    Stores an entry in an effective-TLD name file.
    """

    # Class-level defaults; __init__ overrides them per instance when the
    # line carries the corresponding prefix.
    _exception = False
    _wild = False

    def __init__(self, line):
        """
        Creates a TLD entry from a line of data, which must have been stripped
        of the line ending.

        A leading "!" marks an exception entry and a leading "*." marks a
        wildcard entry; the prefix is removed before the rest of the line is
        normalized and stored as the domain.
        """
        if line.startswith("!"):
            self._exception = True
            domain = line[1:]
        elif line.startswith("*."):
            self._wild = True
            domain = line[2:]
        else:
            domain = line
        self._domain = _normalizeHostname(domain)

    def domain(self):
        "The domain this represents."
        return self._domain

    def exception(self):
        "True if this entry's domain does not denote an effective TLD."
        return self._exception

    def wild(self):
        "True if this entry represents a class of effective TLDs."
        return self._wild


#################
# DO EVERYTHING #
#################

def main():
    """
    argv[1] is the effective TLD file to parse.
    A C++ array of { domain, exception, wild } entries representing the
    eTLD file is then printed to stdout.
    """

    def boolStr(b):
        "The C++ boolean literal corresponding to the truth value of b."
        if b:
            return "true"
        return "false"

    for etld in getEffectiveTLDs(sys.argv[1]):
        exception = boolStr(etld.exception())
        wild = boolStr(etld.wild())
        # A single parenthesised argument prints identically whether print
        # is a statement (Python 2) or a function (Python 3).
        print('ETLD_ENTRY("%s", %s, %s)' % (etld.domain(), exception, wild))


if __name__ == '__main__':
    main()