netwerk/dns/prepare_tlds.py

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/netwerk/dns/prepare_tlds.py	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,116 @@
     1.4 +# This Source Code Form is subject to the terms of the Mozilla Public
     1.5 +# License, v. 2.0. If a copy of the MPL was not distributed with this
     1.6 +# file, You can obtain one at http://mozilla.org/MPL/2.0/.
     1.7 +
     1.8 +import codecs
     1.9 +import encodings.idna
    1.10 +import re
    1.11 +import sys
    1.12 +
    1.13 +"""
    1.14 +Processes a file containing effective TLD data.  See the following URL for a
    1.15 +description of effective TLDs and of the file format that this script
    1.16 +processes (although for the latter you're better off just reading this file's
    1.17 +short source code).
    1.18 +
    1.19 +http://wiki.mozilla.org/Gecko:Effective_TLD_Service
    1.20 +"""
    1.21 +
    1.22 +def getEffectiveTLDs(path):
    1.23 +  file = codecs.open(path, "r", "UTF-8")
    1.24 +  domains = set()
    1.25 +  while True:
    1.26 +    line = file.readline()
    1.27 +    # line always contains a line terminator unless the file is empty
    1.28 +    if len(line) == 0:
    1.29 +      raise StopIteration
    1.30 +    line = line.rstrip()
    1.31 +    # comment, empty, or superfluous line for explicitness purposes
    1.32 +    if line.startswith("//") or "." not in line:
    1.33 +      continue
    1.34 +    line = re.split(r"[ \t\n]", line, 1)[0]
    1.35 +    entry = EffectiveTLDEntry(line)
    1.36 +    domain = entry.domain()
    1.37 +    assert domain not in domains, \
    1.38 +           "repeating domain %s makes no sense" % domain
    1.39 +    domains.add(domain)
    1.40 +    yield entry
    1.41 +
    1.42 +def _normalizeHostname(domain):
    1.43 +  """
    1.44 +  Normalizes the given domain, component by component.  ASCII components are
    1.45 +  lowercased, while non-ASCII components are processed using the ToASCII
    1.46 +  algorithm.
    1.47 +  """
    1.48 +  def convertLabel(label):
    1.49 +    if _isASCII(label):
    1.50 +      return label.lower()
    1.51 +    return encodings.idna.ToASCII(label)
    1.52 +  return ".".join(map(convertLabel, domain.split(".")))
    1.53 +
    1.54 +def _isASCII(s):
    1.55 +  "True if s consists entirely of ASCII characters, false otherwise."
    1.56 +  for c in s:
    1.57 +    if ord(c) > 127:
    1.58 +      return False
    1.59 +  return True
    1.60 +
    1.61 +class EffectiveTLDEntry:
    1.62 +  """
    1.63 +  Stores an entry in an effective-TLD name file.
    1.64 +  """
    1.65 +
    1.66 +  _exception = False
    1.67 +  _wild = False
    1.68 +
    1.69 +  def __init__(self, line):
    1.70 +    """
    1.71 +    Creates a TLD entry from a line of data, which must have been stripped of
    1.72 +    the line ending.
    1.73 +    """
    1.74 +    if line.startswith("!"):
    1.75 +      self._exception = True
    1.76 +      domain = line[1:]
    1.77 +    elif line.startswith("*."):
    1.78 +      self._wild = True
    1.79 +      domain = line[2:]
    1.80 +    else:
    1.81 +      domain = line
    1.82 +    self._domain = _normalizeHostname(domain)
    1.83 +
    1.84 +  def domain(self):
    1.85 +    "The domain this represents."
    1.86 +    return self._domain
    1.87 +
    1.88 +  def exception(self):
    1.89 +    "True if this entry's domain denotes does not denote an effective TLD."
    1.90 +    return self._exception
    1.91 +
    1.92 +  def wild(self):
    1.93 +    "True if this entry represents a class of effective TLDs."
    1.94 +    return self._wild
    1.95 +
    1.96 +
    1.97 +#################
    1.98 +# DO EVERYTHING #
    1.99 +#################
   1.100 +
   1.101 +def main():
   1.102 +  """
   1.103 +  argv[1] is the effective TLD file to parse.
   1.104 +  A C++ array of { domain, exception, wild } entries representing the
   1.105 +  eTLD file is then printed to stdout.
   1.106 +  """
   1.107 +
   1.108 +  def boolStr(b):
   1.109 +    if b:
   1.110 +      return "true"
   1.111 +    return "false"
   1.112 +
   1.113 +  for etld in getEffectiveTLDs(sys.argv[1]):
   1.114 +    exception = boolStr(etld.exception())
   1.115 +    wild = boolStr(etld.wild())
   1.116 +    print 'ETLD_ENTRY("%s", %s, %s)' % (etld.domain(), exception, wild)
   1.117 +
# Run the conversion only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
  main()

mercurial