|
1 #!/usr/bin/env python |
|
2 # -*- coding: utf-8 -*- |
|
3 # |
|
4 # This Source Code Form is subject to the terms of the Mozilla Public |
|
5 # License, v. 2.0. If a copy of the MPL was not distributed with this |
|
6 # file, You can obtain one at http://mozilla.org/MPL/2.0/. |
|
7 |
|
8 """ Usage: make_intl_data.py [language-subtag-registry.txt] |
|
9 |
|
10 This script extracts information about mappings between deprecated and |
|
11 current BCP 47 language tags from the IANA Language Subtag Registry and |
|
12 converts it to JavaScript object definitions in IntlData.js. The definitions |
|
13 are used in Intl.js. |
|
14 |
|
15 The IANA Language Subtag Registry is imported from |
|
16 http://www.iana.org/assignments/language-subtag-registry |
|
17 and uses the syntax specified in |
|
18 http://tools.ietf.org/html/rfc5646#section-3 |
|
19 """ |
|
20 |
|
def readRegistryRecord(registry):
    """ Yields the records of the IANA Language Subtag Registry as dictionaries.

    registry is an iterable of text lines. Records are separated by lines
    consisting of "%%"; each record consists of "Field-Name: value" lines
    and is yielded as a dictionary mapping field names to values (both
    stripped of surrounding whitespace). Empty lines are ignored.

    A long field body may be folded onto following lines that begin with
    whitespace (see http://tools.ietf.org/html/rfc5646#section-3.1.1). Such
    continuation lines are appended to the value of the preceding field,
    separated by a single space, even if they contain a colon.

    If a field name occurs more than once in a record, only the last value
    is kept.

    Raises ValueError if a line that cannot be a field (it contains no
    colon) appears before any field of the current record.
    """
    record = {}
    # Name of the most recent field in the current record; None until the
    # record has its first field.
    key = None
    for rawLine in registry:
        line = rawLine.strip()
        if line == "":
            continue
        if line == "%%":
            yield record
            record = {}
            key = None
            continue

        # The decision must be made on the unstripped line: folded lines are
        # recognized by their leading whitespace. A line without any colon
        # cannot be a field, so it is treated as a continuation as well.
        folded = key is not None and rawLine[:1].isspace()
        if folded or ":" not in line:
            if key is None:
                raise ValueError("Continuation line without preceding field: " + line)
            record[key] += " " + line
        else:
            key, value = line.split(":", 1)
            key, value = key.strip(), value.strip()
            record[key] = value

    # The last record is not necessarily followed by a "%%" separator.
    if record:
        yield record
|
42 |
|
43 |
|
def readRegistry(registry):
    """ Reads IANA Language Subtag Registry and extracts information for Intl.js.

    Information extracted:
    - langTagMappings: mappings from complete language tags to preferred
      complete language tags
    - langSubtagMappings: mappings from subtags to preferred subtags
    - extlangMappings: mappings from extlang subtags to preferred subtags,
      with prefix to be removed
    Returns a dictionary with these three mappings under the keys
    "langTagMappings", "langSubtagMappings", and "extlangMappings", along
    with the registry's file date under the key "fileDate".

    We also check that mappings for language subtags don't affect extlang
    subtags and vice versa, so that CanonicalizeLanguageTag doesn't have
    to separate them for processing. Region codes are separated by case,
    and script codes by length, so they're unproblematic.

    Raises Exception if a record has an unrecognized type, if a
    non-extlang subtag other than heploc has both a preferred value and a
    prefix, if language and extlang mappings conflict, or if the registry
    contains no File-Date record.
    """
    langTagMappings = {}
    langSubtagMappings = {}
    extlangMappings = {}
    languageSubtags = set()
    extlangSubtags = set()
    fileDate = None

    for record in readRegistryRecord(registry):
        if "File-Date" in record:
            fileDate = record["File-Date"]
            continue

        recordType = record["Type"]
        if recordType == "grandfathered":
            # Grandfathered tags don't use standard syntax, so
            # CanonicalizeLanguageTag expects the mapping table to provide
            # the final form for all.
            # For langTagMappings, keys must be in lower case; values in
            # the case used in the registry.
            tag = record["Tag"]
            langTagMappings[tag.lower()] = record.get("Preferred-Value", tag)
        elif recordType == "redundant":
            # For langTagMappings, keys must be in lower case; values in
            # the case used in the registry.
            if "Preferred-Value" in record:
                langTagMappings[record["Tag"].lower()] = record["Preferred-Value"]
        elif recordType in ("language", "script", "region", "variant"):
            # For langSubtagMappings, keys and values must be in the case used
            # in the registry.
            subtag = record["Subtag"]
            if recordType == "language":
                languageSubtags.add(subtag)
            if "Preferred-Value" in record:
                if subtag == "heploc":
                    # The entry for heploc is unique in its complexity; handle
                    # it as special case below.
                    continue
                if "Prefix" in record:
                    # This might indicate another heploc-like complex case.
                    raise Exception("Please evaluate: subtag mapping with prefix value.")
                langSubtagMappings[subtag] = record["Preferred-Value"]
        elif recordType == "extlang":
            # For extlangMappings, keys must be in the case used in the
            # registry; values are records with the preferred value and the
            # prefix to be removed.
            subtag = record["Subtag"]
            extlangSubtags.add(subtag)
            if "Preferred-Value" in record:
                extlangMappings[subtag] = {"preferred": record["Preferred-Value"],
                                           "prefix": record["Prefix"]}
        else:
            # No other types are allowed by
            # http://tools.ietf.org/html/rfc5646#section-3.1.3
            # Raised explicitly rather than asserted so that the check is
            # not stripped when running with -O.
            raise Exception("Unrecognized Type: {0}".format(recordType))

    if fileDate is None:
        raise Exception("Registry does not contain a File-Date record.")

    # Check that mappings for language subtags and extlang subtags don't affect
    # each other.
    for lang in languageSubtags:
        if lang in extlangMappings and extlangMappings[lang]["preferred"] != lang:
            raise Exception("Conflict: lang with extlang mapping: " + lang)
    for extlang in extlangSubtags:
        if extlang in langSubtagMappings:
            raise Exception("Conflict: extlang with lang mapping: " + extlang)

    # Special case for heploc.
    langTagMappings["ja-latn-hepburn-heploc"] = "ja-Latn-alalc97"

    return {"fileDate": fileDate,
            "langTagMappings": langTagMappings,
            "langSubtagMappings": langSubtagMappings,
            "extlangMappings": extlangMappings}
|
134 |
|
135 |
|
def writeMappingsVar(intlData, dict, name, description, fileDate, url):
    """ Writes a variable definition with a mapping table to file intlData.

    Writes the contents of dictionary dict to file intlData with the given
    variable name and a comment with description, fileDate, and URL.
    Entries are written sorted by key. A value of dict is either a string,
    which is written as a JavaScript string literal, or a dictionary with
    the keys "preferred" and "prefix", which is written as an object
    literal with these two properties.

    Keys and values are written verbatim, without any escaping.
    """
    # basestring only exists in Python 2; on Python 3 all text is str.
    try:
        stringTypes = basestring
    except NameError:
        stringTypes = str

    intlData.write("\n")
    intlData.write("// {0}.\n".format(description))
    intlData.write("// Derived from IANA Language Subtag Registry, file date {0}.\n".format(fileDate))
    intlData.write("// {0}\n".format(url))
    intlData.write("var {0} = {{\n".format(name))
    for key in sorted(dict):
        entry = dict[key]
        if isinstance(entry, stringTypes):
            value = '"{0}"'.format(entry)
        else:
            value = '{{preferred: "{0}", prefix: "{1}"}}'.format(entry["preferred"], entry["prefix"])
        intlData.write(' "{0}": {1},\n'.format(key, value))
    intlData.write("};\n")
|
157 |
|
158 |
|
def writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings):
    """ Writes the language tag data to the Intl data file.

    Emits one variable definition per mapping table via writeMappingsVar,
    in the order langTagMappings, langSubtagMappings, extlangMappings.
    Each definition receives the same fileDate and url.
    """
    # (mapping table, JavaScript variable name, description for the comment)
    tables = (
        (langTagMappings, "langTagMappings",
         "Mappings from complete tags to preferred values"),
        (langSubtagMappings, "langSubtagMappings",
         "Mappings from non-extlang subtags to preferred values"),
        (extlangMappings, "extlangMappings",
         "Mappings from extlang subtags to preferred values"),
    )
    for mappings, variableName, description in tables:
        writeMappingsVar(intlData, mappings, variableName, description, fileDate, url)
|
167 |
|
168 |
|
if __name__ == '__main__':
    # Script entry point: obtain the registry (from the file named by the
    # first command line argument, or by downloading it), extract the
    # mappings, and write them to IntlData.js in the current directory.
    import codecs
    import sys
    # urllib2 only exists in Python 2; Python 3 provides the same urlopen
    # function in urllib.request.
    try:
        from urllib2 import urlopen
    except ImportError:
        from urllib.request import urlopen

    url = "http://www.iana.org/assignments/language-subtag-registry"
    if len(sys.argv) > 1:
        print("Always make sure you have the newest language-subtag-registry.txt!")
        text = None
        registry = codecs.open(sys.argv[1], "r", encoding="utf-8")
    else:
        print("Downloading IANA Language Subtag Registry...")
        reader = urlopen(url)
        try:
            text = reader.read().decode("utf-8")
        finally:
            # Release the connection even if reading or decoding fails.
            reader.close()
        registry = codecs.open("language-subtag-registry.txt", "w+", encoding="utf-8")

    # The with statement closes the registry file even if processing raises.
    with registry:
        if text is not None:
            # Keep a local copy of the downloaded registry, then rewind so
            # that it can be read back through the same file object.
            registry.write(text)
            registry.seek(0)

        print("Processing IANA Language Subtag Registry...")
        data = readRegistry(registry)

    fileDate = data["fileDate"]
    langTagMappings = data["langTagMappings"]
    langSubtagMappings = data["langSubtagMappings"]
    extlangMappings = data["extlangMappings"]

    print("Writing Intl data...")
    with codecs.open("IntlData.js", "w", encoding="utf-8") as intlData:
        intlData.write("// Generated by make_intl_data.py. DO NOT EDIT.\n")
        writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings)