Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.
michael@0 | 1 | # This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 2 | # License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 3 | # file, You can obtain one at http://mozilla.org/MPL/2.0/. |
michael@0 | 4 | |
michael@0 | 5 | import codecs |
michael@0 | 6 | import encodings.idna |
michael@0 | 7 | import re |
michael@0 | 8 | import sys |
michael@0 | 9 | |
michael@0 | 10 | """ |
michael@0 | 11 | Processes a file containing effective TLD data. See the following URL for a |
michael@0 | 12 | description of effective TLDs and of the file format that this script |
michael@0 | 13 | processes (although for the latter you're better off just reading this file's |
michael@0 | 14 | short source code). |
michael@0 | 15 | |
michael@0 | 16 | http://wiki.mozilla.org/Gecko:Effective_TLD_Service |
michael@0 | 17 | """ |
michael@0 | 18 | |
def getEffectiveTLDs(path):
    """
    Generates the entries found in the effective-TLD file at the given path.

    The file is decoded as UTF-8 and read line by line. Lines starting with
    "//" and lines containing no "." are skipped. For every other line, only
    the text before the first space or tab is turned into an entry.

    Yields one EffectiveTLDEntry per rule, in file order. An AssertionError is
    raised if two rules normalize to the same domain.
    """
    domains = set()
    # The with-block closes the file as soon as the generator is exhausted or
    # closed, instead of leaving the handle open.
    with codecs.open(path, "r", "UTF-8") as tldFile:
        # Iterating stops at end of file; the generator then simply returns,
        # which is the supported way to finish (an explicit StopIteration
        # raised inside a generator is turned into RuntimeError by PEP 479).
        for line in tldFile:
            line = line.rstrip()
            # comment, empty, or superfluous line for explicitness purposes
            if line.startswith("//") or "." not in line:
                continue
            # Keep only the rule itself; anything after whitespace is ignored.
            line = re.split(r"[ \t\n]", line, maxsplit=1)[0]
            entry = EffectiveTLDEntry(line)
            domain = entry.domain()
            assert domain not in domains, \
                "repeating domain %s makes no sense" % domain
            domains.add(domain)
            yield entry
michael@0 | 38 | |
def _normalizeHostname(domain):
    """
    Normalizes the given domain, component by component. ASCII components are
    lowercased, while non-ASCII components are processed using the ToASCII
    algorithm.

    Returns the normalized components joined back together with ".".
    """
    def convertLabel(label):
        if _isASCII(label):
            return label.lower()
        # ToASCII hands back a byte string. Decode it so that every label is
        # text: joining text and bytes only works through Python 2's implicit
        # coercion and raises TypeError on Python 3.
        return encodings.idna.ToASCII(label).decode("ascii")
    return ".".join(map(convertLabel, domain.split(".")))
michael@0 | 50 | |
def _isASCII(s):
    "True if every character of s has a code point of at most 127."
    return all(ord(ch) <= 127 for ch in s)
michael@0 | 57 | |
class EffectiveTLDEntry:
    """
    A single rule read from an effective-TLD name file: a normalized domain
    plus flags saying whether the rule is an exception or a wildcard.
    """

    # Defaults shared by all instances; __init__ overrides them per entry.
    _exception = False
    _wild = False

    def __init__(self, line):
        """
        Builds the entry from one line of data. The line must already have
        its line ending removed.

        A leading "!" marks an exception rule and a leading "*." marks a
        wildcard rule; the marker is dropped before the remaining domain is
        normalized.
        """
        name = line
        if name.startswith("!"):
            self._exception = True
            name = name[1:]
        elif name.startswith("*."):
            self._wild = True
            name = name[2:]
        self._domain = _normalizeHostname(name)

    def domain(self):
        "The normalized domain of this entry, without any rule marker."
        return self._domain

    def exception(self):
        "True if this entry's domain does not denote an effective TLD."
        return self._exception

    def wild(self):
        "True if this entry stands for a whole class of effective TLDs."
        return self._wild
michael@0 | 92 | |
michael@0 | 93 | |
michael@0 | 94 | ################# |
michael@0 | 95 | # DO EVERYTHING # |
michael@0 | 96 | ################# |
michael@0 | 97 | |
def main():
    """
    argv[1] is the effective TLD file to parse.
    One ETLD_ENTRY("domain", exception, wild) line per entry of the eTLD
    file is printed to stdout, with the two flags spelled as the C++
    literals true/false.
    """

    def boolStr(b):
        # Spell a Python truth value as a C++ boolean literal.
        return "true" if b else "false"

    for etld in getEffectiveTLDs(sys.argv[1]):
        exception = boolStr(etld.exception())
        wild = boolStr(etld.wild())
        # Single-argument call form: prints the same text whether print is a
        # statement (Python 2) or a function (Python 3).
        print('ETLD_ENTRY("%s", %s, %s)' % (etld.domain(), exception, wild))
michael@0 | 114 | |
# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()