netwerk/dns/prepare_tlds.py

Wed, 31 Dec 2014 06:55:46 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:55:46 +0100
changeset 1
ca08bd8f51b2
permissions
-rw-r--r--

Added tag TORBROWSER_REPLICA for changeset 6474c204b198

     1 # This Source Code Form is subject to the terms of the Mozilla Public
     2 # License, v. 2.0. If a copy of the MPL was not distributed with this
     3 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
     5 import codecs
     6 import encodings.idna
     7 import re
     8 import sys
    10 """
    11 Processes a file containing effective TLD data.  See the following URL for a
    12 description of effective TLDs and of the file format that this script
    13 processes (although for the latter you're better off just reading this file's
    14 short source code).
    16 http://wiki.mozilla.org/Gecko:Effective_TLD_Service
    17 """
    19 def getEffectiveTLDs(path):
    20   file = codecs.open(path, "r", "UTF-8")
    21   domains = set()
    22   while True:
    23     line = file.readline()
    24     # line always contains a line terminator unless the file is empty
    25     if len(line) == 0:
    26       raise StopIteration
    27     line = line.rstrip()
    28     # comment, empty, or superfluous line for explicitness purposes
    29     if line.startswith("//") or "." not in line:
    30       continue
    31     line = re.split(r"[ \t\n]", line, 1)[0]
    32     entry = EffectiveTLDEntry(line)
    33     domain = entry.domain()
    34     assert domain not in domains, \
    35            "repeating domain %s makes no sense" % domain
    36     domains.add(domain)
    37     yield entry
    39 def _normalizeHostname(domain):
    40   """
    41   Normalizes the given domain, component by component.  ASCII components are
    42   lowercased, while non-ASCII components are processed using the ToASCII
    43   algorithm.
    44   """
    45   def convertLabel(label):
    46     if _isASCII(label):
    47       return label.lower()
    48     return encodings.idna.ToASCII(label)
    49   return ".".join(map(convertLabel, domain.split(".")))
    51 def _isASCII(s):
    52   "True if s consists entirely of ASCII characters, false otherwise."
    53   for c in s:
    54     if ord(c) > 127:
    55       return False
    56   return True
    58 class EffectiveTLDEntry:
    59   """
    60   Stores an entry in an effective-TLD name file.
    61   """
    63   _exception = False
    64   _wild = False
    66   def __init__(self, line):
    67     """
    68     Creates a TLD entry from a line of data, which must have been stripped of
    69     the line ending.
    70     """
    71     if line.startswith("!"):
    72       self._exception = True
    73       domain = line[1:]
    74     elif line.startswith("*."):
    75       self._wild = True
    76       domain = line[2:]
    77     else:
    78       domain = line
    79     self._domain = _normalizeHostname(domain)
    81   def domain(self):
    82     "The domain this represents."
    83     return self._domain
    85   def exception(self):
    86     "True if this entry's domain denotes does not denote an effective TLD."
    87     return self._exception
    89   def wild(self):
    90     "True if this entry represents a class of effective TLDs."
    91     return self._wild
    94 #################
    95 # DO EVERYTHING #
    96 #################
    98 def main():
    99   """
   100   argv[1] is the effective TLD file to parse.
   101   A C++ array of { domain, exception, wild } entries representing the
   102   eTLD file is then printed to stdout.
   103   """
   105   def boolStr(b):
   106     if b:
   107       return "true"
   108     return "false"
   110   for etld in getEffectiveTLDs(sys.argv[1]):
   111     exception = boolStr(etld.exception())
   112     wild = boolStr(etld.wild())
   113     print 'ETLD_ENTRY("%s", %s, %s)' % (etld.domain(), exception, wild)
   115 if __name__ == '__main__':
   116   main()

mercurial