#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

""" Usage: make_intl_data.py [language-subtag-registry.txt]

    This script extracts information about mappings between deprecated and
    current BCP 47 language tags from the IANA Language Subtag Registry and
    converts it to JavaScript object definitions in IntlData.js. The definitions
    are used in Intl.js.

    The IANA Language Subtag Registry is imported from
    http://www.iana.org/assignments/language-subtag-registry
    and uses the syntax specified in
    http://tools.ietf.org/html/rfc5646#section-3
"""

# String type used to tell plain mapping values apart from extlang records.
try:
    _STRING_TYPES = basestring  # Python 2: covers both str and unicode.
except NameError:
    _STRING_TYPES = str  # Python 3: basestring no longer exists.


def readRegistryRecord(registry):
    """ Yields the records of the IANA Language Subtag Registry as dictionaries.

        registry is any iterable of text lines (an open file, a list of
        strings). Records are separated by "%%" lines; each field line has the
        form "Name: value". A field value may be folded over several lines;
        the pieces are joined with a single space.

        A line is treated as a continuation of the previous field when it
        starts with whitespace (the folding syntax of RFC 5646) or when it
        contains no colon at all. Checking the leading whitespace keeps folded
        text that happens to contain a colon from being mistaken for a new
        field.

        Raises Exception if a continuation line appears before any field of
        the current record.
    """
    record = {}
    key = None  # Name of the most recent field in the current record.
    for rawLine in registry:
        line = rawLine.strip()
        if line == "":
            continue
        if line == "%%":
            yield record
            record = {}
            key = None
            continue

        isFolded = key is not None and rawLine[:1] in (" ", "\t")
        if ":" in line and not isFolded:
            key, value = line.split(":", 1)
            key, value = key.strip(), value.strip()
            record[key] = value
        else:
            if key is None:
                raise Exception("Continuation line without preceding field: " + line)
            record[key] += " " + line
    if record:
        yield record


def readRegistry(registry):
    """ Reads IANA Language Subtag Registry and extracts information for Intl.js.

        Information extracted:
        - langTagMappings: mappings from complete language tags to preferred
          complete language tags
        - langSubtagMappings: mappings from subtags to preferred subtags
        - extlangMappings: mappings from extlang subtags to preferred subtags,
          with prefix to be removed
        Returns these three mappings as dictionaries, along with the registry's
        file date, in a dictionary with the keys "fileDate", "langTagMappings",
        "langSubtagMappings", and "extlangMappings".

        We also check that mappings for language subtags don't affect extlang
        subtags and vice versa, so that CanonicalizeLanguageTag doesn't have
        to separate them for processing. Region codes are separated by case,
        and script codes by length, so they're unproblematic.

        Raises Exception if the registry has no File-Date record, if a
        non-extlang subtag mapping carries a Prefix, or if language and
        extlang mappings conflict; raises AssertionError for a record type
        not allowed by RFC 5646.
    """
    langTagMappings = {}
    langSubtagMappings = {}
    extlangMappings = {}
    languageSubtags = set()
    extlangSubtags = set()
    fileDate = None

    for record in readRegistryRecord(registry):
        if "File-Date" in record:
            fileDate = record["File-Date"]
            continue

        recordType = record["Type"]
        if recordType == "grandfathered":
            # Grandfathered tags don't use standard syntax, so
            # CanonicalizeLanguageTag expects the mapping table to provide
            # the final form for all.
            # For langTagMappings, keys must be in lower case; values in
            # the case used in the registry.
            tag = record["Tag"]
            langTagMappings[tag.lower()] = record.get("Preferred-Value", tag)
        elif recordType == "redundant":
            # For langTagMappings, keys must be in lower case; values in
            # the case used in the registry.
            if "Preferred-Value" in record:
                langTagMappings[record["Tag"].lower()] = record["Preferred-Value"]
        elif recordType in ("language", "script", "region", "variant"):
            # For langSubtagMappings, keys and values must be in the case used
            # in the registry.
            subtag = record["Subtag"]
            if recordType == "language":
                languageSubtags.add(subtag)
            if "Preferred-Value" in record:
                if subtag == "heploc":
                    # The entry for heploc is unique in its complexity; handle
                    # it as special case below.
                    continue
                if "Prefix" in record:
                    # This might indicate another heploc-like complex case.
                    raise Exception("Please evaluate: subtag mapping with prefix value.")
                langSubtagMappings[subtag] = record["Preferred-Value"]
        elif recordType == "extlang":
            # For extlangMappings, keys must be in the case used in the
            # registry; values are records with the preferred value and the
            # prefix to be removed.
            subtag = record["Subtag"]
            extlangSubtags.add(subtag)
            if "Preferred-Value" in record:
                preferred = record["Preferred-Value"]
                prefix = record["Prefix"]
                extlangMappings[subtag] = {"preferred": preferred, "prefix": prefix}
        else:
            # No other types are allowed by
            # http://tools.ietf.org/html/rfc5646#section-3.1.3
            # Raised explicitly (rather than via an assert statement) so the
            # check is not stripped when Python runs with -O.
            raise AssertionError("Unrecognized Type: {0}".format(recordType))

    if fileDate is None:
        raise Exception("Registry does not contain a File-Date record.")

    # Check that mappings for language subtags and extlang subtags don't affect
    # each other.
    for lang in languageSubtags:
        if lang in extlangMappings and extlangMappings[lang]["preferred"] != lang:
            raise Exception("Conflict: lang with extlang mapping: " + lang)
    for extlang in extlangSubtags:
        if extlang in langSubtagMappings:
            raise Exception("Conflict: extlang with lang mapping: " + extlang)

    # Special case for heploc.
    langTagMappings["ja-latn-hepburn-heploc"] = "ja-Latn-alalc97"

    return {"fileDate": fileDate,
            "langTagMappings": langTagMappings,
            "langSubtagMappings": langSubtagMappings,
            "extlangMappings": extlangMappings}


def writeMappingsVar(intlData, dict, name, description, fileDate, url):
    """ Writes a variable definition with a mapping table to file intlData.

        Writes the contents of dictionary dict to file intlData with the given
        variable name and a comment with description, fileDate, and URL.
        Entries are written in sorted key order. A string value is written as
        a quoted string; any other value is expected to be a record with the
        keys "preferred" and "prefix" and is written as an object literal.
    """
    # The parameter name shadows the builtin; it is kept for compatibility
    # with existing callers and aliased to a clearer local name.
    mappings = dict

    intlData.write("\n")
    intlData.write("// {0}.\n".format(description))
    intlData.write("// Derived from IANA Language Subtag Registry, file date {0}.\n".format(fileDate))
    intlData.write("// {0}\n".format(url))
    intlData.write("var {0} = {{\n".format(name))
    for key in sorted(mappings):
        entry = mappings[key]
        if isinstance(entry, _STRING_TYPES):
            value = '"{0}"'.format(entry)
        else:
            value = '{{preferred: "{0}", prefix: "{1}"}}'.format(entry["preferred"], entry["prefix"])
        intlData.write(' "{0}": {1},\n'.format(key, value))
    intlData.write("};\n")


def writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings):
    """ Writes the language tag data to the Intl data file.

        Emits the three mapping tables, in order, as the variables
        langTagMappings, langSubtagMappings, and extlangMappings.
    """
    tables = (
        (langTagMappings, "langTagMappings",
         "Mappings from complete tags to preferred values"),
        (langSubtagMappings, "langSubtagMappings",
         "Mappings from non-extlang subtags to preferred values"),
        (extlangMappings, "extlangMappings",
         "Mappings from extlang subtags to preferred values"),
    )
    for mappings, name, description in tables:
        writeMappingsVar(intlData, mappings, name, description, fileDate, url)


def main():
    """ Reads the registry (from the file named on the command line, or by
        downloading it and saving a local copy) and writes IntlData.js.
    """
    import codecs
    import contextlib
    import sys
    try:
        from urllib2 import urlopen  # Python 2
    except ImportError:
        from urllib.request import urlopen  # Python 3

    url = "http://www.iana.org/assignments/language-subtag-registry"
    text = None
    if len(sys.argv) > 1:
        print("Always make sure you have the newest language-subtag-registry.txt!")
        registry = codecs.open(sys.argv[1], "r", encoding="utf-8")
    else:
        print("Downloading IANA Language Subtag Registry...")
        # closing() is used because the Python 2 response object is not a
        # context manager.
        with contextlib.closing(urlopen(url)) as reader:
            text = reader.read().decode("utf-8")
        registry = codecs.open("language-subtag-registry.txt", "w+", encoding="utf-8")

    # The with statement closes the registry file even if processing fails.
    with registry:
        if text is not None:
            # Keep a local copy of the download, then rewind to read it back.
            registry.write(text)
            registry.seek(0)

        print("Processing IANA Language Subtag Registry...")
        data = readRegistry(registry)

    fileDate = data["fileDate"]
    langTagMappings = data["langTagMappings"]
    langSubtagMappings = data["langSubtagMappings"]
    extlangMappings = data["extlangMappings"]

    print("Writing Intl data...")
    with codecs.open("IntlData.js", "w", encoding="utf-8") as intlData:
        intlData.write("// Generated by make_intl_data.py. DO NOT EDIT.\n")
        writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings)


if __name__ == '__main__':
    main()