#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

""" Usage: make_intl_data.py [language-subtag-registry.txt]

    This script extracts information about mappings between deprecated and
    current BCP 47 language tags from the IANA Language Subtag Registry and
    converts it to JavaScript object definitions in IntlData.js. The definitions
    are used in Intl.js.

    The IANA Language Subtag Registry is imported from
    http://www.iana.org/assignments/language-subtag-registry
    and uses the syntax specified in
    http://tools.ietf.org/html/rfc5646#section-3
"""

import codecs
import sys
from contextlib import closing

try:
    from urllib2 import urlopen  # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3

# Type used to tell plain string values apart from structured mapping values.
try:
    _STRING_TYPES = basestring  # Python 2: covers both str and unicode
except NameError:
    _STRING_TYPES = str  # Python 3

REGISTRY_URL = "http://www.iana.org/assignments/language-subtag-registry"


def readRegistryRecord(registry):
    """ Yields the records of the IANA Language Subtag Registry as dictionaries.

        registry is an iterable of text lines. Records are separated by "%%"
        lines; each field is a "Name: value" line. A folded (continuation)
        line is appended to the value of the preceding field, separated by a
        single space. If a field name occurs more than once in a record, the
        last value wins.

        Raises ValueError if a continuation line appears before any field of
        the current record.
    """
    record = {}
    key = None
    for rawLine in registry:
        line = rawLine.strip()
        if line == "":
            continue
        if line == "%%":
            yield record
            record = {}
            # A continuation must never attach to a field of the previous
            # record.
            key = None
            continue

        # Folded lines start with whitespace and may themselves contain ":",
        # so the colon alone cannot be used to recognize a new field.
        isFolded = key is not None and rawLine[:1].isspace()
        if ":" in line and not isFolded:
            key, value = line.split(":", 1)
            key, value = key.strip(), value.strip()
            record[key] = value
        elif key is None:
            raise ValueError("Continuation line without a preceding field: " + line)
        else:
            # continuation line
            record[key] += " " + line
    if record:
        yield record


def readRegistry(registry):
    """ Reads IANA Language Subtag Registry and extracts information for Intl.js.

        Information extracted:
        - langTagMappings: mappings from complete language tags to preferred
          complete language tags
        - langSubtagMappings: mappings from subtags to preferred subtags
        - extlangMappings: mappings from extlang subtags to preferred subtags,
          with prefix to be removed
        Returns these three mappings as dictionaries, along with the registry's
        file date, in a dictionary with the keys "fileDate", "langTagMappings",
        "langSubtagMappings", and "extlangMappings".

        We also check that mappings for language subtags don't affect extlang
        subtags and vice versa, so that CanonicalizeLanguageTag doesn't have
        to separate them for processing. Region codes are separated by case,
        and script codes by length, so they're unproblematic.

        Raises AssertionError for a record with an unrecognized Type, and
        Exception if the registry has no File-Date record, if a subtag mapping
        other than heploc carries a Prefix, or if language and extlang
        mappings conflict.
    """
    langTagMappings = {}
    langSubtagMappings = {}
    extlangMappings = {}
    languageSubtags = set()
    extlangSubtags = set()
    fileDate = None

    for record in readRegistryRecord(registry):
        if "File-Date" in record:
            fileDate = record["File-Date"]
            continue

        recordType = record["Type"]
        if recordType == "grandfathered":
            # Grandfathered tags don't use standard syntax, so
            # CanonicalizeLanguageTag expects the mapping table to provide
            # the final form for all.
            # For langTagMappings, keys must be in lower case; values in
            # the case used in the registry.
            tag = record["Tag"]
            langTagMappings[tag.lower()] = record.get("Preferred-Value", tag)
        elif recordType == "redundant":
            # For langTagMappings, keys must be in lower case; values in
            # the case used in the registry.
            if "Preferred-Value" in record:
                langTagMappings[record["Tag"].lower()] = record["Preferred-Value"]
        elif recordType in ("language", "script", "region", "variant"):
            # For langSubtagMappings, keys and values must be in the case used
            # in the registry.
            subtag = record["Subtag"]
            if recordType == "language":
                languageSubtags.add(subtag)
            if "Preferred-Value" in record:
                if subtag == "heploc":
                    # The entry for heploc is unique in its complexity; handle
                    # it as special case below.
                    continue
                if "Prefix" in record:
                    # This might indicate another heploc-like complex case.
                    raise Exception("Please evaluate: subtag mapping with prefix value.")
                langSubtagMappings[subtag] = record["Preferred-Value"]
        elif recordType == "extlang":
            # For extlangMappings, keys must be in the case used in the
            # registry; values are records with the preferred value and the
            # prefix to be removed.
            subtag = record["Subtag"]
            extlangSubtags.add(subtag)
            if "Preferred-Value" in record:
                preferred = record["Preferred-Value"]
                prefix = record["Prefix"]
                extlangMappings[subtag] = {"preferred": preferred, "prefix": prefix}
        else:
            # No other types are allowed by
            # http://tools.ietf.org/html/rfc5646#section-3.1.3
            # Raised explicitly (rather than via an assert statement) so the
            # check is not stripped when running with -O.
            raise AssertionError("Unrecognized Type: {0}".format(recordType))

    if fileDate is None:
        raise Exception("Registry does not contain a File-Date record.")

    # Check that mappings for language subtags and extlang subtags don't affect
    # each other.
    for lang in languageSubtags:
        if lang in extlangMappings and extlangMappings[lang]["preferred"] != lang:
            raise Exception("Conflict: lang with extlang mapping: " + lang)
    for extlang in extlangSubtags:
        if extlang in langSubtagMappings:
            raise Exception("Conflict: extlang with lang mapping: " + extlang)

    # Special case for heploc.
    langTagMappings["ja-latn-hepburn-heploc"] = "ja-Latn-alalc97"

    return {"fileDate": fileDate,
            "langTagMappings": langTagMappings,
            "langSubtagMappings": langSubtagMappings,
            "extlangMappings": extlangMappings}


def writeMappingsVar(intlData, dict, name, description, fileDate, url):
    """ Writes a variable definition with a mapping table to file intlData.

        Writes the contents of dictionary dict to file intlData with the given
        variable name and a comment with description, fileDate, and URL.
        Entries are written sorted by key. A string value is written as a
        quoted string; any other value must provide "preferred" and "prefix"
        entries and is written as an object literal.

        The parameter name dict shadows the builtin; it is kept so that
        existing keyword callers continue to work.
    """
    intlData.write("\n")
    intlData.write("// {0}.\n".format(description))
    intlData.write("// Derived from IANA Language Subtag Registry, file date {0}.\n".format(fileDate))
    intlData.write("// {0}\n".format(url))
    intlData.write("var {0} = {{\n".format(name))
    for key in sorted(dict):
        entry = dict[key]
        if isinstance(entry, _STRING_TYPES):
            value = '"{0}"'.format(entry)
        else:
            value = '{{preferred: "{0}", prefix: "{1}"}}'.format(entry["preferred"], entry["prefix"])
        # NOTE(review): whitespace in the source under review was collapsed;
        # the entry indent below is reproduced as seen there -- confirm
        # against a checked-in IntlData.js.
        intlData.write(' "{0}": {1},\n'.format(key, value))
    intlData.write("};\n")


def writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings):
    """ Writes the language tag data to the Intl data file.

        Emits one variable definition per mapping table, in the order
        langTagMappings, langSubtagMappings, extlangMappings.
    """
    writeMappingsVar(intlData, langTagMappings, "langTagMappings",
                     "Mappings from complete tags to preferred values", fileDate, url)
    writeMappingsVar(intlData, langSubtagMappings, "langSubtagMappings",
                     "Mappings from non-extlang subtags to preferred values", fileDate, url)
    writeMappingsVar(intlData, extlangMappings, "extlangMappings",
                     "Mappings from extlang subtags to preferred values", fileDate, url)


def _openRegistry(path, url):
    """ Returns an open, readable registry file positioned at its start.

        If path is given, that file is opened. Otherwise the registry is
        downloaded from url, saved as language-subtag-registry.txt in the
        current directory, and that file is returned rewound to the start.
    """
    if path is not None:
        print("Always make sure you have the newest language-subtag-registry.txt!")
        return codecs.open(path, "r", encoding="utf-8")

    print("Downloading IANA Language Subtag Registry...")
    # closing() is used because the Python 2 response object is not a
    # context manager.
    with closing(urlopen(url)) as reader:
        text = reader.read().decode("utf-8")
    registry = codecs.open("language-subtag-registry.txt", "w+", encoding="utf-8")
    try:
        registry.write(text)
        registry.seek(0)
    except Exception:
        # Don't leak the handle if saving the download fails.
        registry.close()
        raise
    return registry


def main(argv):
    """ Reads the registry named in argv[1] (or downloads it) and writes IntlData.js. """
    url = REGISTRY_URL
    path = argv[1] if len(argv) > 1 else None

    with _openRegistry(path, url) as registry:
        print("Processing IANA Language Subtag Registry...")
        data = readRegistry(registry)

    print("Writing Intl data...")
    with codecs.open("IntlData.js", "w", encoding="utf-8") as intlData:
        intlData.write("// Generated by make_intl_data.py. DO NOT EDIT.\n")
        writeLanguageTagData(intlData, data["fileDate"], url,
                             data["langTagMappings"],
                             data["langSubtagMappings"],
                             data["extlangMappings"])


if __name__ == '__main__':
    main(sys.argv)