js/src/builtin/make_intl_data.py

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rwxr-xr-x

Conditionally enable double-key logic depending on private browsing mode
or the privacy.thirdparty.isolate preference, and implement it in
GetCookieStringCommon and FindCookie, where it counts.
Reservation: it is still unclear how to convince FindCookie users to test
the condition and pass a nullptr when double-key logic is disabled.

michael@0 1 #!/usr/bin/env python
michael@0 2 # -*- coding: utf-8 -*-
michael@0 3 #
michael@0 4 # This Source Code Form is subject to the terms of the Mozilla Public
michael@0 5 # License, v. 2.0. If a copy of the MPL was not distributed with this
michael@0 6 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
michael@0 7
michael@0 8 """ Usage: make_intl_data.py [language-subtag-registry.txt]
michael@0 9
michael@0 10 This script extracts information about mappings between deprecated and
michael@0 11 current BCP 47 language tags from the IANA Language Subtag Registry and
michael@0 12 converts it to JavaScript object definitions in IntlData.js. The definitions
michael@0 13 are used in Intl.js.
michael@0 14
michael@0 15 The IANA Language Subtag Registry is imported from
michael@0 16 http://www.iana.org/assignments/language-subtag-registry
michael@0 17 and uses the syntax specified in
michael@0 18 http://tools.ietf.org/html/rfc5646#section-3
michael@0 19 """
michael@0 20
def readRegistryRecord(registry):
    """ Yields the records of the IANA Language Subtag Registry as dictionaries.

    registry is an iterable of text lines. Records are separated by lines
    consisting of "%%"; blank lines are ignored. Each "Key: value" line adds
    a field to the current record (a repeated key overwrites the earlier
    value). A line that starts with whitespace while a field is open is a
    continuation of that field and is appended to its value, separated by a
    single space -- even if the continuation text itself contains a colon
    (for example a folded URL). An unindented line without a colon is also
    treated as a continuation.

    Raises ValueError if a continuation line appears before any field of the
    current record.
    """
    record = {}
    # Name of the most recently parsed field of the current record; None
    # until the first field has been seen.
    key = None
    for rawLine in registry:
        line = rawLine.strip()
        if line == "":
            continue
        if line == "%%":
            yield record
            record = {}
            key = None
            continue

        # Leading whitespace must be checked on the raw line, because the
        # stripped line no longer shows whether it was folded.
        indented = rawLine[:1].isspace()
        if ":" in line and not (indented and key is not None):
            key, value = line.split(":", 1)
            key, value = key.strip(), value.strip()
            record[key] = value
        elif key is not None:
            # continuation line
            record[key] += " " + line
        else:
            raise ValueError("Continuation line without preceding field: " + line)
    if record:
        yield record
michael@0 42
michael@0 43
def readRegistry(registry):
    """ Reads IANA Language Subtag Registry and extracts information for Intl.js.

    Information extracted:
    - langTagMappings: mappings from complete language tags to preferred
      complete language tags
    - langSubtagMappings: mappings from subtags to preferred subtags
    - extlangMappings: mappings from extlang subtags to preferred subtags,
      with prefix to be removed
    Returns these three mappings as dictionaries, along with the registry's
    file date, in a dictionary with the keys "fileDate", "langTagMappings",
    "langSubtagMappings", and "extlangMappings".

    We also check that mappings for language subtags don't affect extlang
    subtags and vice versa, so that CanonicalizeLanguageTag doesn't have
    to separate them for processing. Region codes are separated by case,
    and script codes by length, so they're unproblematic.

    Raises AssertionError for a record with an unrecognized Type, and
    Exception for conflicting lang/extlang mappings, for a prefixed subtag
    mapping other than heploc, or when the registry has no File-Date record.
    """
    langTagMappings = {}
    langSubtagMappings = {}
    extlangMappings = {}
    languageSubtags = set()
    extlangSubtags = set()
    # Stays None if the registry never supplies a File-Date record; checked
    # before returning so the failure is reported clearly.
    fileDate = None

    for record in readRegistryRecord(registry):
        if "File-Date" in record:
            fileDate = record["File-Date"]
            continue

        recordType = record["Type"]
        if recordType == "grandfathered":
            # Grandfathered tags don't use standard syntax, so
            # CanonicalizeLanguageTag expects the mapping table to provide
            # the final form for all.
            # For langTagMappings, keys must be in lower case; values in
            # the case used in the registry.
            tag = record["Tag"]
            if "Preferred-Value" in record:
                langTagMappings[tag.lower()] = record["Preferred-Value"]
            else:
                langTagMappings[tag.lower()] = tag
        elif recordType == "redundant":
            # For langTagMappings, keys must be in lower case; values in
            # the case used in the registry.
            if "Preferred-Value" in record:
                langTagMappings[record["Tag"].lower()] = record["Preferred-Value"]
        elif recordType in ("language", "script", "region", "variant"):
            # For langSubtagMappings, keys and values must be in the case used
            # in the registry.
            subtag = record["Subtag"]
            if recordType == "language":
                languageSubtags.add(subtag)
            if "Preferred-Value" in record:
                if subtag == "heploc":
                    # The entry for heploc is unique in its complexity; handle
                    # it as special case below.
                    continue
                if "Prefix" in record:
                    # This might indicate another heploc-like complex case.
                    raise Exception("Please evaluate: subtag mapping with prefix value.")
                langSubtagMappings[subtag] = record["Preferred-Value"]
        elif recordType == "extlang":
            # For extlangMappings, keys must be in the case used in the
            # registry; values are records with the preferred value and the
            # prefix to be removed.
            subtag = record["Subtag"]
            extlangSubtags.add(subtag)
            if "Preferred-Value" in record:
                preferred = record["Preferred-Value"]
                prefix = record["Prefix"]
                extlangMappings[subtag] = {"preferred": preferred, "prefix": prefix}
        else:
            # No other types are allowed by
            # http://tools.ietf.org/html/rfc5646#section-3.1.3
            # Raised explicitly (rather than via an assert statement) so the
            # check is not stripped when Python runs with -O.
            raise AssertionError("Unrecognized Type: {0}".format(recordType))

    # Check that mappings for language subtags and extlang subtags don't affect
    # each other.
    for lang in languageSubtags:
        if lang in extlangMappings and extlangMappings[lang]["preferred"] != lang:
            raise Exception("Conflict: lang with extlang mapping: " + lang)
    for extlang in extlangSubtags:
        if extlang in langSubtagMappings:
            raise Exception("Conflict: extlang with lang mapping: " + extlang)

    # Special case for heploc.
    langTagMappings["ja-latn-hepburn-heploc"] = "ja-Latn-alalc97"

    if fileDate is None:
        raise Exception("Registry does not contain a File-Date record.")

    return {"fileDate": fileDate,
            "langTagMappings": langTagMappings,
            "langSubtagMappings": langSubtagMappings,
            "extlangMappings": extlangMappings}
michael@0 134
michael@0 135
def writeMappingsVar(intlData, dict, name, description, fileDate, url):
    """ Writes a variable definition with a mapping table to file intlData.

    Writes the contents of dictionary dict to file intlData with the given
    variable name and a comment with description, fileDate, and URL.

    Entries are written sorted by key. A string value is emitted as a quoted
    string; any other value is indexed with "preferred" and "prefix" and
    emitted as an object literal with those two properties. Keys and values
    are written verbatim, without escaping.
    """
    # basestring only exists on Python 2 (where it covers str and unicode);
    # fall back to str so the function also works on Python 3.
    try:
        stringTypes = basestring
    except NameError:
        stringTypes = str

    intlData.write("\n")
    intlData.write("// {0}.\n".format(description))
    intlData.write("// Derived from IANA Language Subtag Registry, file date {0}.\n".format(fileDate))
    intlData.write("// {0}\n".format(url))
    intlData.write("var {0} = {{\n".format(name))
    for key in sorted(dict):
        entry = dict[key]
        if isinstance(entry, stringTypes):
            value = '"{0}"'.format(entry)
        else:
            value = '{{preferred: "{0}", prefix: "{1}"}}'.format(entry["preferred"], entry["prefix"])
        # NOTE(review): the leading whitespace inside this literal may have
        # been collapsed by the annotate view this file was recovered from --
        # confirm the intended indentation of the generated entries.
        intlData.write(' "{0}": {1},\n'.format(key, value))
    intlData.write("};\n")
michael@0 157
michael@0 158
def writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings):
    """ Writes the three language tag mapping tables to the Intl data file.

    Each table is emitted through writeMappingsVar, in the order
    langTagMappings, langSubtagMappings, extlangMappings, with the same
    fileDate and url passed along for every table.
    """
    # (mapping, JavaScript variable name, description) for each table.
    tables = (
        (langTagMappings, "langTagMappings",
         "Mappings from complete tags to preferred values"),
        (langSubtagMappings, "langSubtagMappings",
         "Mappings from non-extlang subtags to preferred values"),
        (extlangMappings, "extlangMappings",
         "Mappings from extlang subtags to preferred values"),
    )
    for mappings, varName, summary in tables:
        writeMappingsVar(intlData, mappings, varName, summary, fileDate, url)
michael@0 167
michael@0 168
if __name__ == '__main__':
    # Script entry point: read the registry (from the file named by the first
    # command line argument, or freshly downloaded), extract the mappings,
    # and write them to IntlData.js in the current directory.
    import codecs
    import sys
    try:
        from urllib2 import urlopen  # Python 2
    except ImportError:
        from urllib.request import urlopen  # Python 3

    url = "http://www.iana.org/assignments/language-subtag-registry"
    # Downloaded registry text; stays None when a local file is used.
    text = None
    if len(sys.argv) > 1:
        print("Always make sure you have the newest language-subtag-registry.txt!")
        registry = codecs.open(sys.argv[1], "r", encoding="utf-8")
    else:
        print("Downloading IANA Language Subtag Registry...")
        reader = urlopen(url)
        try:
            text = reader.read().decode("utf-8")
        finally:
            reader.close()
        # "w+" so the downloaded copy is saved and can be read back below.
        registry = codecs.open("language-subtag-registry.txt", "w+", encoding="utf-8")

    # Close the registry file even if writing or parsing fails.
    try:
        if text is not None:
            registry.write(text)
            registry.seek(0)

        print("Processing IANA Language Subtag Registry...")
        data = readRegistry(registry)
    finally:
        registry.close()

    fileDate = data["fileDate"]
    langTagMappings = data["langTagMappings"]
    langSubtagMappings = data["langSubtagMappings"]
    extlangMappings = data["extlangMappings"]

    print("Writing Intl data...")
    intlData = codecs.open("IntlData.js", "w", encoding="utf-8")
    # Close the output file even if writing one of the tables fails.
    try:
        intlData.write("// Generated by make_intl_data.py. DO NOT EDIT.\n")
        writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings)
    finally:
        intlData.close()

mercurial