js/src/builtin/make_intl_data.py

changeset 0
6474c204b198
equal deleted inserted replaced
-1:000000000000 0:78c32c95df8f
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # This Source Code Form is subject to the terms of the Mozilla Public
5 # License, v. 2.0. If a copy of the MPL was not distributed with this
6 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
7
8 """ Usage: make_intl_data.py [language-subtag-registry.txt]
9
10 This script extracts information about mappings between deprecated and
11 current BCP 47 language tags from the IANA Language Subtag Registry and
12 converts it to JavaScript object definitions in IntlData.js. The definitions
13 are used in Intl.js.
14
15 The IANA Language Subtag Registry is imported from
16 http://www.iana.org/assignments/language-subtag-registry
17 and uses the syntax specified in
18 http://tools.ietf.org/html/rfc5646#section-3
19 """
20
def readRegistryRecord(registry):
    """ Yields the records of the IANA Language Subtag Registry as dictionaries.

    registry is an iterable of text lines. Records are separated by lines
    consisting of "%%". Each record is yielded as a dictionary mapping field
    names to field values, both stripped of surrounding whitespace. Blank
    lines are ignored. If a field name occurs more than once in a record,
    the last value wins.

    A line is treated as a continuation of the previous field if a field has
    already been seen in the current record and the line either starts with
    whitespace (the folding syntax of the registry format) or contains no
    colon. Continuations are appended to the previous value, separated by a
    single space.

    Raises ValueError for a line that has no colon and no preceding field
    in the current record to continue.
    """
    record = {}
    # Name of the most recent field in the current record; None while the
    # record has no field yet, so continuations cannot leak across records.
    key = None
    for rawLine in registry:
        line = rawLine.strip()
        if line == "":
            continue
        if line == "%%":
            yield record
            record = {}
            key = None
            continue

        # Decide on the unstripped line: a folded line may itself contain a
        # colon, so the absence of a colon alone is not a reliable signal.
        isFolded = rawLine[:1].isspace() or ":" not in line
        if key is not None and isFolded:
            # continuation line
            record[key] += " " + line
        elif ":" in line:
            key, value = line.split(":", 1)
            key, value = key.strip(), value.strip()
            record[key] = value
        else:
            raise ValueError(
                "Continuation line without a preceding field: {0!r}".format(line))
    if record:
        yield record
42
43
def readRegistry(registry):
    """ Reads IANA Language Subtag Registry and extracts information for Intl.js.

    Information extracted:
    - langTagMappings: mappings from complete language tags to preferred
      complete language tags
    - langSubtagMappings: mappings from subtags to preferred subtags
    - extlangMappings: mappings from extlang subtags to preferred subtags,
      with prefix to be removed
    Returns a dictionary with these three mappings under the keys
    "langTagMappings", "langSubtagMappings", and "extlangMappings", along
    with the registry's file date under the key "fileDate".

    We also check that mappings for language subtags don't affect extlang
    subtags and vice versa, so that CanonicalizeLanguageTag doesn't have
    to separate them for processing. Region codes are separated by case,
    and script codes by length, so they're unproblematic.

    Raises Exception if a record has a missing or unrecognized Type, if a
    non-extlang subtag other than heploc has both a Preferred-Value and a
    Prefix, if language and extlang mappings conflict, or if the registry
    contains no File-Date record.
    """
    langTagMappings = {}
    langSubtagMappings = {}
    extlangMappings = {}
    languageSubtags = set()
    extlangSubtags = set()
    fileDate = None

    for record in readRegistryRecord(registry):
        if "File-Date" in record:
            fileDate = record["File-Date"]
            continue

        # A record without a Type falls through to the error branch below.
        recordType = record.get("Type")
        if recordType == "grandfathered":
            # Grandfathered tags don't use standard syntax, so
            # CanonicalizeLanguageTag expects the mapping table to provide
            # the final form for all.
            # For langTagMappings, keys must be in lower case; values in
            # the case used in the registry.
            tag = record["Tag"]
            if "Preferred-Value" in record:
                langTagMappings[tag.lower()] = record["Preferred-Value"]
            else:
                langTagMappings[tag.lower()] = tag
        elif recordType == "redundant":
            # For langTagMappings, keys must be in lower case; values in
            # the case used in the registry.
            if "Preferred-Value" in record:
                langTagMappings[record["Tag"].lower()] = record["Preferred-Value"]
        elif recordType in ("language", "script", "region", "variant"):
            # For langSubtagMappings, keys and values must be in the case used
            # in the registry.
            subtag = record["Subtag"]
            if recordType == "language":
                languageSubtags.add(subtag)
            if "Preferred-Value" in record:
                if subtag == "heploc":
                    # The entry for heploc is unique in its complexity; handle
                    # it as special case below.
                    continue
                if "Prefix" in record:
                    # This might indicate another heploc-like complex case.
                    raise Exception("Please evaluate: subtag mapping with prefix value.")
                langSubtagMappings[subtag] = record["Preferred-Value"]
        elif recordType == "extlang":
            # For extlangMappings, keys must be in the case used in the
            # registry; values are records with the preferred value and the
            # prefix to be removed.
            subtag = record["Subtag"]
            extlangSubtags.add(subtag)
            if "Preferred-Value" in record:
                preferred = record["Preferred-Value"]
                prefix = record["Prefix"]
                extlangMappings[subtag] = {"preferred": preferred, "prefix": prefix}
        else:
            # No other types are allowed by
            # http://tools.ietf.org/html/rfc5646#section-3.1.3
            # Raised explicitly rather than asserted so that the check is
            # not removed when Python runs with -O.
            raise Exception("Unrecognized Type: {0}".format(recordType))

    # Check that mappings for language subtags and extlang subtags don't affect
    # each other.
    for lang in languageSubtags:
        if lang in extlangMappings and extlangMappings[lang]["preferred"] != lang:
            raise Exception("Conflict: lang with extlang mapping: " + lang)
    for extlang in extlangSubtags:
        if extlang in langSubtagMappings:
            raise Exception("Conflict: extlang with lang mapping: " + extlang)

    # Special case for heploc.
    langTagMappings["ja-latn-hepburn-heploc"] = "ja-Latn-alalc97"

    # The file date is written into the generated comments, so fail clearly
    # here instead of with an unbound local variable.
    if fileDate is None:
        raise Exception("Registry does not contain a File-Date record.")

    return {"fileDate": fileDate,
            "langTagMappings": langTagMappings,
            "langSubtagMappings": langSubtagMappings,
            "extlangMappings": extlangMappings}
134
135
def writeMappingsVar(intlData, dict, name, description, fileDate, url):
    """ Writes a variable definition with a mapping table to file intlData.

    Writes the contents of dictionary dict to file intlData with the given
    variable name and a comment with description, fileDate, and URL.

    Each value in dict is either a string, which is written as a quoted
    string, or a dictionary with "preferred" and "prefix" entries, which is
    written as an object literal with those two properties. Entries are
    written in sorted key order.
    """
    # basestring only exists in Python 2; fall back to str so that the
    # string check below also works under Python 3.
    try:
        stringTypes = basestring
    except NameError:
        stringTypes = str

    intlData.write("\n")
    intlData.write("// {0}.\n".format(description))
    intlData.write("// Derived from IANA Language Subtag Registry, file date {0}.\n".format(fileDate))
    intlData.write("// {0}\n".format(url))
    intlData.write("var {0} = {{\n".format(name))
    for key in sorted(dict):
        entry = dict[key]
        if isinstance(entry, stringTypes):
            value = '"{0}"'.format(entry)
        else:
            preferred = entry["preferred"]
            prefix = entry["prefix"]
            value = '{{preferred: "{0}", prefix: "{1}"}}'.format(preferred, prefix)
        intlData.write(' "{0}": {1},\n'.format(key, value))
    intlData.write("};\n")
157
158
def writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings):
    """ Writes the three language tag mapping tables to the Intl data file.

    The tables are written in this order: langTagMappings,
    langSubtagMappings, extlangMappings. Each is written by
    writeMappingsVar with the same fileDate and url.
    """
    tables = (
        (langTagMappings, "langTagMappings",
         "Mappings from complete tags to preferred values"),
        (langSubtagMappings, "langSubtagMappings",
         "Mappings from non-extlang subtags to preferred values"),
        (extlangMappings, "extlangMappings",
         "Mappings from extlang subtags to preferred values"),
    )
    for mappings, varName, summary in tables:
        writeMappingsVar(intlData, mappings, varName, summary, fileDate, url)
167
168
if __name__ == '__main__':
    import codecs
    import contextlib
    import sys
    # urllib2 only exists in Python 2; Python 3 provides urlopen in
    # urllib.request.
    try:
        from urllib2 import urlopen
    except ImportError:
        from urllib.request import urlopen

    url = "http://www.iana.org/assignments/language-subtag-registry"
    downloadedText = None
    if len(sys.argv) > 1:
        # Use the registry file named on the command line.
        print("Always make sure you have the newest language-subtag-registry.txt!")
        registry = codecs.open(sys.argv[1], "r", encoding="utf-8")
    else:
        print("Downloading IANA Language Subtag Registry...")
        # closing() releases the connection even if reading or decoding fails.
        with contextlib.closing(urlopen(url)) as reader:
            downloadedText = reader.read().decode("utf-8")
        registry = codecs.open("language-subtag-registry.txt", "w+", encoding="utf-8")

    # The with statement closes the registry file even if writing or
    # parsing raises.
    with registry:
        if downloadedText is not None:
            # Save the download to language-subtag-registry.txt, then rewind
            # so the same file object can be parsed.
            registry.write(downloadedText)
            registry.seek(0)

        print("Processing IANA Language Subtag Registry...")
        data = readRegistry(registry)
    fileDate = data["fileDate"]
    langTagMappings = data["langTagMappings"]
    langSubtagMappings = data["langSubtagMappings"]
    extlangMappings = data["extlangMappings"]

    print("Writing Intl data...")
    with codecs.open("IntlData.js", "w", encoding="utf-8") as intlData:
        intlData.write("// Generated by make_intl_data.py. DO NOT EDIT.\n")
        writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings)

mercurial