#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

""" Usage: make_intl_data.py [language-subtag-registry.txt]

    This script extracts information about mappings between deprecated and
    current BCP 47 language tags from the IANA Language Subtag Registry and
    converts it to JavaScript object definitions in IntlData.js. The definitions
    are used in Intl.js.

    The IANA Language Subtag Registry is imported from
    http://www.iana.org/assignments/language-subtag-registry
    and uses the syntax specified in
    http://tools.ietf.org/html/rfc5646#section-3
"""

import codecs
import sys

try:
    # Python 2: basestring covers both str and unicode values.
    _STRING_TYPES = basestring
except NameError:
    # Python 3: there is only one text type.
    _STRING_TYPES = str


def readRegistryRecord(registry):
    """ Yields the records of the IANA Language Subtag Registry as dictionaries.

        registry is any iterable of text lines (an open file, a list, ...).
        Records are separated by "%%" lines; each record maps field names to
        field bodies. A line that starts with whitespace is a continuation
        (folded line) of the previous field and is appended to that field's
        value with a single space, even if it happens to contain a colon.
        For backward compatibility, an unindented line without a colon is
        also treated as a continuation.

        Raises Exception if a continuation line appears before any field of
        the current record.
    """
    record = {}
    key = None
    for rawLine in registry:
        line = rawLine.strip()
        if line == "":
            continue
        if line == "%%":
            yield record
            record = {}
            key = None
            continue

        # The decision must be made on the raw line: once it is stripped, a
        # folded line such as "  see: xyz" is indistinguishable from a field.
        isContinuation = rawLine[:1].isspace() or ":" not in line
        if isContinuation:
            if key is None:
                raise Exception("Continuation line without preceding field: " + line)
            record[key] += " " + line
        else:
            key, value = line.split(":", 1)
            key, value = key.strip(), value.strip()
            record[key] = value
    if record:
        yield record


def readRegistry(registry):
    """ Reads IANA Language Subtag Registry and extracts information for Intl.js.

        Information extracted:
        - langTagMappings: mappings from complete language tags to preferred
          complete language tags
        - langSubtagMappings: mappings from subtags to preferred subtags
        - extlangMappings: mappings from extlang subtags to preferred subtags,
          with prefix to be removed
        Returns these three mappings as dictionaries, along with the registry's
        file date, in a dictionary with the keys "fileDate", "langTagMappings",
        "langSubtagMappings", and "extlangMappings".

        We also check that mappings for language subtags don't affect extlang
        subtags and vice versa, so that CanonicalizeLanguageTag doesn't have
        to separate them for processing. Region codes are separated by case,
        and script codes by length, so they're unproblematic.

        Raises Exception if the registry has no File-Date record, contains a
        record of an unrecognized type, contains a non-heploc subtag mapping
        with a prefix, or contains conflicting language/extlang mappings.
    """
    langTagMappings = {}
    langSubtagMappings = {}
    extlangMappings = {}
    languageSubtags = set()
    extlangSubtags = set()
    fileDate = None

    for record in readRegistryRecord(registry):
        if "File-Date" in record:
            fileDate = record["File-Date"]
            continue

        recordType = record["Type"]
        if recordType == "grandfathered":
            # Grandfathered tags don't use standard syntax, so
            # CanonicalizeLanguageTag expects the mapping table to provide
            # the final form for all.
            # For langTagMappings, keys must be in lower case; values in
            # the case used in the registry.
            tag = record["Tag"]
            langTagMappings[tag.lower()] = record.get("Preferred-Value", tag)
        elif recordType == "redundant":
            # For langTagMappings, keys must be in lower case; values in
            # the case used in the registry.
            if "Preferred-Value" in record:
                langTagMappings[record["Tag"].lower()] = record["Preferred-Value"]
        elif recordType in ("language", "script", "region", "variant"):
            # For langSubtagMappings, keys and values must be in the case used
            # in the registry.
            subtag = record["Subtag"]
            if recordType == "language":
                languageSubtags.add(subtag)
            if "Preferred-Value" in record:
                if subtag == "heploc":
                    # The entry for heploc is unique in its complexity; handle
                    # it as special case below.
                    continue
                if "Prefix" in record:
                    # This might indicate another heploc-like complex case.
                    raise Exception("Please evaluate: subtag mapping with prefix value.")
                langSubtagMappings[subtag] = record["Preferred-Value"]
        elif recordType == "extlang":
            # For extlangMappings, keys must be in the case used in the
            # registry; values are records with the preferred value and the
            # prefix to be removed.
            subtag = record["Subtag"]
            extlangSubtags.add(subtag)
            if "Preferred-Value" in record:
                preferred = record["Preferred-Value"]
                prefix = record["Prefix"]
                extlangMappings[subtag] = {"preferred": preferred, "prefix": prefix}
        else:
            # No other types are allowed by
            # http://tools.ietf.org/html/rfc5646#section-3.1.3
            # Raised explicitly (not asserted) so the check survives python -O.
            raise Exception("Unrecognized Type: {0}".format(recordType))

    if fileDate is None:
        # Without this check the return statement below would fail with an
        # unhelpful UnboundLocalError-style message.
        raise Exception("Registry does not contain a File-Date record.")

    # Check that mappings for language subtags and extlang subtags don't affect
    # each other.
    for lang in languageSubtags:
        if lang in extlangMappings and extlangMappings[lang]["preferred"] != lang:
            raise Exception("Conflict: lang with extlang mapping: " + lang)
    for extlang in extlangSubtags:
        if extlang in langSubtagMappings:
            raise Exception("Conflict: extlang with lang mapping: " + extlang)

    # Special case for heploc.
    langTagMappings["ja-latn-hepburn-heploc"] = "ja-Latn-alalc97"

    return {"fileDate": fileDate,
            "langTagMappings": langTagMappings,
            "langSubtagMappings": langSubtagMappings,
            "extlangMappings": extlangMappings}


def writeMappingsVar(intlData, dict, name, description, fileDate, url):
    """ Writes a variable definition with a mapping table to file intlData.

        Writes the contents of dictionary dict to file intlData with the given
        variable name and a comment with description, fileDate, and URL.
        Entries are written sorted by key. String values are written as
        JavaScript string literals; any other value is expected to be a
        record with "preferred" and "prefix" entries and is written as an
        object literal.
    """
    # NOTE: the parameter name "dict" shadows the builtin inside this
    # function; it is kept because it is part of the function's interface.
    intlData.write("\n")
    intlData.write("// {0}.\n".format(description))
    intlData.write("// Derived from IANA Language Subtag Registry, file date {0}.\n".format(fileDate))
    intlData.write("// {0}\n".format(url))
    intlData.write("var {0} = {{\n".format(name))
    for key in sorted(dict):
        entry = dict[key]
        if isinstance(entry, _STRING_TYPES):
            value = '"{0}"'.format(entry)
        else:
            value = '{{preferred: "{0}", prefix: "{1}"}}'.format(entry["preferred"], entry["prefix"])
        intlData.write(' "{0}": {1},\n'.format(key, value))
    intlData.write("};\n")


def writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings):
    """ Writes the language tag data to the Intl data file.

        Emits the three mapping tables, in order, as the JavaScript variables
        langTagMappings, langSubtagMappings, and extlangMappings.
    """
    writeMappingsVar(intlData, langTagMappings, "langTagMappings",
                     "Mappings from complete tags to preferred values", fileDate, url)
    writeMappingsVar(intlData, langSubtagMappings, "langSubtagMappings",
                     "Mappings from non-extlang subtags to preferred values", fileDate, url)
    writeMappingsVar(intlData, extlangMappings, "extlangMappings",
                     "Mappings from extlang subtags to preferred values", fileDate, url)


def _downloadRegistry(url):
    """ Downloads the registry from url and returns its contents as text. """
    try:
        from urllib2 import urlopen  # Python 2
    except ImportError:
        from urllib.request import urlopen  # Python 3
    reader = urlopen(url)
    try:
        return reader.read().decode("utf-8")
    finally:
        reader.close()


def main(argv):
    """ Runs the script: reads the registry and writes IntlData.js.

        If argv contains a file name (argv[1]), the registry is read from that
        file. Otherwise it is downloaded from IANA and also saved as
        language-subtag-registry.txt in the current directory.
    """
    url = "http://www.iana.org/assignments/language-subtag-registry"
    downloaded = None
    if len(argv) > 1:
        print("Always make sure you have the newest language-subtag-registry.txt!")
        registry = codecs.open(argv[1], "r", encoding="utf-8")
    else:
        print("Downloading IANA Language Subtag Registry...")
        downloaded = _downloadRegistry(url)
        registry = codecs.open("language-subtag-registry.txt", "w+", encoding="utf-8")

    # Make sure the registry file is closed even if parsing fails.
    try:
        if downloaded is not None:
            registry.write(downloaded)
            registry.seek(0)
        print("Processing IANA Language Subtag Registry...")
        data = readRegistry(registry)
    finally:
        registry.close()

    fileDate = data["fileDate"]
    langTagMappings = data["langTagMappings"]
    langSubtagMappings = data["langSubtagMappings"]
    extlangMappings = data["extlangMappings"]

    print("Writing Intl data...")
    with codecs.open("IntlData.js", "w", encoding="utf-8") as intlData:
        intlData.write("// Generated by make_intl_data.py. DO NOT EDIT.\n")
        writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings)


if __name__ == '__main__':
    main(sys.argv)