netwerk/dns/prepare_tlds.py

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 # This Source Code Form is subject to the terms of the Mozilla Public
michael@0 2 # License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 3 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
michael@0 4
michael@0 5 import codecs
michael@0 6 import encodings.idna
michael@0 7 import re
michael@0 8 import sys
michael@0 9
michael@0 10 """
michael@0 11 Processes a file containing effective TLD data. See the following URL for a
michael@0 12 description of effective TLDs and of the file format that this script
michael@0 13 processes (although for the latter you're better off just reading this file's
michael@0 14 short source code).
michael@0 15
michael@0 16 http://wiki.mozilla.org/Gecko:Effective_TLD_Service
michael@0 17 """
michael@0 18
def getEffectiveTLDs(path):
    """
    Generates one EffectiveTLDEntry per rule line of the effective-TLD file
    at path.

    The file is decoded as UTF-8.  Lines that start with "//" or that contain
    no "." are skipped.  For every other line, only the text before the first
    space, tab, or newline is treated as the rule.

    Raises AssertionError (while assertions are enabled) if two rules
    normalize to the same domain.
    """
    domains = set()
    # The with-block closes the file once the generator is exhausted or
    # closed, instead of leaving the handle open.
    with codecs.open(path, "r", "UTF-8") as tldFile:
        # Iterating the reader stops at end of file, so the generator ends by
        # falling off the loop.  An explicit "raise StopIteration" inside a
        # generator is turned into RuntimeError by PEP 479.
        for line in tldFile:
            line = line.rstrip()
            # comment, empty, or superfluous line for explicitness purposes
            if line.startswith("//") or "." not in line:
                continue
            # Anything after the first whitespace character is not part of
            # the rule.
            rule = re.split(r"[ \t\n]", line, maxsplit=1)[0]
            entry = EffectiveTLDEntry(rule)
            domain = entry.domain()
            assert domain not in domains, \
                "repeating domain %s makes no sense" % domain
            domains.add(domain)
            yield entry
michael@0 38
def _normalizeHostname(domain):
    """
    Normalizes the given domain, component by component.  ASCII components are
    lowercased, while non-ASCII components are processed using the ToASCII
    algorithm.  The components are then rejoined with ".".
    """
    def convertLabel(label):
        "Normalizes a single dot-separated component of the domain."
        if _isASCII(label):
            return label.lower()
        encoded = encodings.idna.ToASCII(label)
        # ToASCII hands back a byte string.  Decode it so that it can be
        # joined with the text labels; on Python 3 mixing bytes and str in
        # str.join raises TypeError.
        if isinstance(encoded, bytes):
            encoded = encoded.decode("ascii")
        return encoded
    return ".".join(convertLabel(label) for label in domain.split("."))
michael@0 50
michael@0 51 def _isASCII(s):
michael@0 52 "True if s consists entirely of ASCII characters, false otherwise."
michael@0 53 for c in s:
michael@0 54 if ord(c) > 127:
michael@0 55 return False
michael@0 56 return True
michael@0 57
class EffectiveTLDEntry:
    """
    A single rule taken from an effective-TLD name file.
    """

    # Class-level defaults.  __init__ overrides at most one of them on the
    # instance, depending on the prefix of the rule.
    _exception = False
    _wild = False

    def __init__(self, line):
        """
        Builds an entry from one rule, which must already have had its line
        ending removed.  A leading "!" marks an exception rule and a leading
        "*." marks a wildcard rule; the prefix is dropped and the rest is
        normalized as a hostname.
        """
        prefixLength = 0
        if line.startswith("!"):
            self._exception = True
            prefixLength = 1
        elif line.startswith("*."):
            self._wild = True
            prefixLength = 2
        self._domain = _normalizeHostname(line[prefixLength:])

    def domain(self):
        "The normalized domain of this entry, without any rule prefix."
        return self._domain

    def exception(self):
        "True if this entry's domain does not denote an effective TLD."
        return self._exception

    def wild(self):
        "True if this entry stands for a whole class of effective TLDs."
        return self._wild
michael@0 92
michael@0 93
michael@0 94 #################
michael@0 95 # DO EVERYTHING #
michael@0 96 #################
michael@0 97
def main():
    """
    argv[1] is the effective TLD file to parse.
    One ETLD_ENTRY("domain", exception, wild) line per entry of the eTLD
    file is then printed to stdout, with the two flags spelled as the
    literals true / false.
    """

    def boolStr(b):
        "Spells a Python truth value as the literal true or false."
        return "true" if b else "false"

    for etld in getEffectiveTLDs(sys.argv[1]):
        exception = boolStr(etld.exception())
        wild = boolStr(etld.wild())
        # A single parenthesized argument prints the same text whether print
        # is a statement (Python 2) or a function (Python 3).
        print('ETLD_ENTRY("%s", %s, %s)' % (etld.domain(), exception, wild))
michael@0 114
# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()

mercurial