js/src/builtin/make_intl_data.py

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/js/src/builtin/make_intl_data.py	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,199 @@
     1.4 +#!/usr/bin/env python
     1.5 +# -*- coding: utf-8 -*-
     1.6 +#
     1.7 +# This Source Code Form is subject to the terms of the Mozilla Public
     1.8 +# License, v. 2.0. If a copy of the MPL was not distributed with this
     1.9 +# file, You can obtain one at http://mozilla.org/MPL/2.0/.
    1.10 +
    1.11 +""" Usage: make_intl_data.py [language-subtag-registry.txt]
    1.12 +
    1.13 +    This script extracts information about mappings between deprecated and
    1.14 +    current BCP 47 language tags from the IANA Language Subtag Registry and
    1.15 +    converts it to JavaScript object definitions in IntlData.js. The definitions
    1.16 +    are used in Intl.js.
    1.17 +
    1.18 +    The IANA Language Subtag Registry is imported from
    1.19 +    http://www.iana.org/assignments/language-subtag-registry
    1.20 +    and uses the syntax specified in
    1.21 +    http://tools.ietf.org/html/rfc5646#section-3
    1.22 +"""
    1.23 +
    1.24 +def readRegistryRecord(registry):
    1.25 +    """ Yields the records of the IANA Language Subtag Registry as dictionaries. """
    1.26 +    record = {}
    1.27 +    for line in registry:
    1.28 +        line = line.strip()
    1.29 +        if line == "":
    1.30 +            continue
    1.31 +        if line == "%%":
    1.32 +            yield record
    1.33 +            record = {}
    1.34 +        else:
    1.35 +            if ":" in line:
    1.36 +                key, value = line.split(":", 1)
    1.37 +                key, value = key.strip(), value.strip()
    1.38 +                record[key] = value
    1.39 +            else:
    1.40 +                # continuation line
    1.41 +                record[key] += " " + line
    1.42 +    if record:
    1.43 +        yield record
    1.44 +    return
    1.45 +
    1.46 +
    1.47 +def readRegistry(registry):
    1.48 +    """ Reads IANA Language Subtag Registry and extracts information for Intl.js.
    1.49 +
    1.50 +        Information extracted:
    1.51 +        - langTagMappings: mappings from complete language tags to preferred
    1.52 +          complete language tags
    1.53 +        - langSubtagMappings: mappings from subtags to preferred subtags
    1.54 +        - extlangMappings: mappings from extlang subtags to preferred subtags,
    1.55 +          with prefix to be removed
    1.56 +        Returns these three mappings as dictionaries, along with the registry's
    1.57 +        file date.
    1.58 +
    1.59 +        We also check that mappings for language subtags don't affect extlang
    1.60 +        subtags and vice versa, so that CanonicalizeLanguageTag doesn't have
    1.61 +        to separate them for processing. Region codes are separated by case,
    1.62 +        and script codes by length, so they're unproblematic.
    1.63 +    """
    1.64 +    langTagMappings = {}
    1.65 +    langSubtagMappings = {}
    1.66 +    extlangMappings = {}
    1.67 +    languageSubtags = set()
    1.68 +    extlangSubtags = set()
    1.69 +
    1.70 +    for record in readRegistryRecord(registry):
    1.71 +        if "File-Date" in record:
    1.72 +            fileDate = record["File-Date"]
    1.73 +            continue
    1.74 +
    1.75 +        if record["Type"] == "grandfathered":
    1.76 +            # Grandfathered tags don't use standard syntax, so
    1.77 +            # CanonicalizeLanguageTag expects the mapping table to provide
    1.78 +            # the final form for all.
    1.79 +            # For langTagMappings, keys must be in lower case; values in
    1.80 +            # the case used in the registry.
    1.81 +            tag = record["Tag"]
    1.82 +            if "Preferred-Value" in record:
    1.83 +                langTagMappings[tag.lower()] = record["Preferred-Value"]
    1.84 +            else:
    1.85 +                langTagMappings[tag.lower()] = tag
    1.86 +        elif record["Type"] == "redundant":
    1.87 +            # For langTagMappings, keys must be in lower case; values in
    1.88 +            # the case used in the registry.
    1.89 +            if "Preferred-Value" in record:
    1.90 +                langTagMappings[record["Tag"].lower()] = record["Preferred-Value"]
    1.91 +        elif record["Type"] in ("language", "script", "region", "variant"):
    1.92 +            # For langSubtagMappings, keys and values must be in the case used
    1.93 +            # in the registry.
    1.94 +            subtag = record["Subtag"]
    1.95 +            if record["Type"] == "language":
    1.96 +                languageSubtags.add(subtag)
    1.97 +            if "Preferred-Value" in record:
    1.98 +                if subtag == "heploc":
    1.99 +                    # The entry for heploc is unique in its complexity; handle
   1.100 +                    # it as special case below.
   1.101 +                    continue
   1.102 +                if "Prefix" in record:
   1.103 +                    # This might indicate another heploc-like complex case.
   1.104 +                    raise Exception("Please evaluate: subtag mapping with prefix value.")
   1.105 +                langSubtagMappings[subtag] = record["Preferred-Value"]
   1.106 +        elif record["Type"] == "extlang":
   1.107 +            # For extlangMappings, keys must be in the case used in the
   1.108 +            # registry; values are records with the preferred value and the
   1.109 +            # prefix to be removed.
   1.110 +            subtag = record["Subtag"]
   1.111 +            extlangSubtags.add(subtag)
   1.112 +            if "Preferred-Value" in record:
   1.113 +                preferred = record["Preferred-Value"]
   1.114 +                prefix = record["Prefix"]
   1.115 +                extlangMappings[subtag] = {"preferred": preferred, "prefix": prefix}
   1.116 +        else:
   1.117 +            # No other types are allowed by
   1.118 +            # http://tools.ietf.org/html/rfc5646#section-3.1.3
   1.119 +            assert False, "Unrecognized Type: {0}".format(record["Type"])
   1.120 +
   1.121 +    # Check that mappings for language subtags and extlang subtags don't affect
   1.122 +    # each other.
   1.123 +    for lang in languageSubtags:
   1.124 +        if lang in extlangMappings and extlangMappings[lang]["preferred"] != lang:
   1.125 +            raise Exception("Conflict: lang with extlang mapping: " + lang)
   1.126 +    for extlang in extlangSubtags:
   1.127 +        if extlang in langSubtagMappings:
   1.128 +            raise Exception("Conflict: extlang with lang mapping: " + extlang)
   1.129 +
   1.130 +    # Special case for heploc.
   1.131 +    langTagMappings["ja-latn-hepburn-heploc"] = "ja-Latn-alalc97"
   1.132 +
   1.133 +    return {"fileDate": fileDate,
   1.134 +            "langTagMappings": langTagMappings,
   1.135 +            "langSubtagMappings": langSubtagMappings,
   1.136 +            "extlangMappings": extlangMappings}
   1.137 +
   1.138 +
   1.139 +def writeMappingsVar(intlData, dict, name, description, fileDate, url):
   1.140 +    """ Writes a variable definition with a mapping table to file intlData.
   1.141 +
   1.142 +        Writes the contents of dictionary dict to file intlData with the given
   1.143 +        variable name and a comment with description, fileDate, and URL.
   1.144 +    """
   1.145 +    intlData.write("\n")
   1.146 +    intlData.write("// {0}.\n".format(description))
   1.147 +    intlData.write("// Derived from IANA Language Subtag Registry, file date {0}.\n".format(fileDate))
   1.148 +    intlData.write("// {0}\n".format(url))
   1.149 +    intlData.write("var {0} = {{\n".format(name))
   1.150 +    keys = sorted(dict)
   1.151 +    for key in keys:
   1.152 +        if isinstance(dict[key], basestring):
   1.153 +            value = '"{0}"'.format(dict[key])
   1.154 +        else:
   1.155 +            preferred = dict[key]["preferred"]
   1.156 +            prefix = dict[key]["prefix"]
   1.157 +            value = '{{preferred: "{0}", prefix: "{1}"}}'.format(preferred, prefix)
   1.158 +        intlData.write('    "{0}": {1},\n'.format(key, value))
   1.159 +    intlData.write("};\n")
   1.160 +
   1.161 +
   1.162 +def writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings):
   1.163 +    """ Writes the language tag data to the Intl data file. """
   1.164 +    writeMappingsVar(intlData, langTagMappings, "langTagMappings",
   1.165 +                     "Mappings from complete tags to preferred values", fileDate, url)
   1.166 +    writeMappingsVar(intlData, langSubtagMappings, "langSubtagMappings",
   1.167 +                     "Mappings from non-extlang subtags to preferred values", fileDate, url)
   1.168 +    writeMappingsVar(intlData, extlangMappings, "extlangMappings",
   1.169 +                     "Mappings from extlang subtags to preferred values", fileDate, url)
   1.170 +
   1.171 +
   1.172 +if __name__ == '__main__':
   1.173 +    import codecs
   1.174 +    import sys
   1.175 +    import urllib2
   1.176 +
   1.177 +    url = "http://www.iana.org/assignments/language-subtag-registry"
   1.178 +    if len(sys.argv) > 1:
   1.179 +        print("Always make sure you have the newest language-subtag-registry.txt!")
   1.180 +        registry = codecs.open(sys.argv[1], "r", encoding="utf-8")
   1.181 +    else:
   1.182 +        print("Downloading IANA Language Subtag Registry...")
   1.183 +        reader = urllib2.urlopen(url)
   1.184 +        text = reader.read().decode("utf-8")
   1.185 +        reader.close()
   1.186 +        registry = codecs.open("language-subtag-registry.txt", "w+", encoding="utf-8")
   1.187 +        registry.write(text)
   1.188 +        registry.seek(0)
   1.189 +
   1.190 +    print("Processing IANA Language Subtag Registry...")
   1.191 +    data = readRegistry(registry)
   1.192 +    fileDate = data["fileDate"]
   1.193 +    langTagMappings = data["langTagMappings"]
   1.194 +    langSubtagMappings = data["langSubtagMappings"]
   1.195 +    extlangMappings = data["extlangMappings"]
   1.196 +    registry.close()
   1.197 +
   1.198 +    print("Writing Intl data...")
   1.199 +    intlData = codecs.open("IntlData.js", "w", encoding="utf-8")
   1.200 +    intlData.write("// Generated by make_intl_data.py. DO NOT EDIT.\n")
   1.201 +    writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings)
   1.202 +    intlData.close()

mercurial