DXR is a code search and navigation tool aimed at making sense of large projects. It supports full-text and regex searches as well as structural queries.

Mercurial (d8847129d134)

VCS Links

Line Code
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

""" Usage: make_intl_data.py [language-subtag-registry.txt]

    This script extracts information about mappings between deprecated and
    current BCP 47 language tags from the IANA Language Subtag Registry and
    converts it to JavaScript object definitions in IntlData.js. The definitions
    are used in Intl.js.

    The IANA Language Subtag Registry is imported from
    http://www.iana.org/assignments/language-subtag-registry
    and uses the syntax specified in
    http://tools.ietf.org/html/rfc5646#section-3
"""

def readRegistryRecord(registry):
    """ Yields the records of the IANA Language Subtag Registry as dictionaries.

        registry is any iterable of text lines (e.g. an open file). Records
        are separated by lines consisting of "%%". Each record is yielded as
        a dictionary mapping field names to field bodies; if a field name
        occurs more than once in a record, the last occurrence wins.

        Folded field bodies are joined with a single space. Following
        http://tools.ietf.org/html/rfc5646#section-3.1.1 a line is treated as
        a continuation of the previous field if it starts with whitespace
        (even if it contains a colon); for leniency, a line without any colon
        is treated as a continuation as well.

        Raises ValueError if a continuation line appears before any field of
        the current record.
    """
    record = {}
    # Name of the most recently parsed field of the current record; None
    # until the record has seen its first field.
    key = None
    for rawLine in registry:
        line = rawLine.strip()
        if line == "":
            continue
        if line == "%%":
            yield record
            record = {}
            key = None
            continue
        # Decide on the raw line: stripping first would make a folded line
        # such as "  see also: foo" look like a new "see also" field.
        # rawLine is non-empty here because its stripped form is non-empty.
        if rawLine[0].isspace() or ":" not in line:
            if key is None:
                raise ValueError("Continuation line without preceding field: " + line)
            record[key] += " " + line
        else:
            key, value = line.split(":", 1)
            key, value = key.strip(), value.strip()
            record[key] = value
    # The last record is not followed by a "%%" separator.
    if record:
        yield record


def readRegistry(registry):
    """ Reads IANA Language Subtag Registry and extracts information for Intl.js.

        Information extracted:
        - langTagMappings: mappings from complete language tags to preferred
          complete language tags
        - langSubtagMappings: mappings from subtags to preferred subtags
        - extlangMappings: mappings from extlang subtags to preferred subtags,
          with prefix to be removed
        Returns these three mappings as dictionaries, along with the registry's
        file date, in a dictionary with the keys "fileDate",
        "langTagMappings", "langSubtagMappings", and "extlangMappings".

        We also check that mappings for language subtags don't affect extlang
        subtags and vice versa, so that CanonicalizeLanguageTag doesn't have
        to separate them for processing. Region codes are separated by case,
        and script codes by length, so they're unproblematic.

        Raises AssertionError for a record with an unrecognized Type, and
        Exception if the registry has no File-Date record, if a non-heploc
        subtag mapping carries a Prefix, or if language and extlang mappings
        conflict.
    """
    langTagMappings = {}
    langSubtagMappings = {}
    extlangMappings = {}
    languageSubtags = set()
    extlangSubtags = set()
    # Stays None if the registry never provides a File-Date record.
    fileDate = None

    for record in readRegistryRecord(registry):
        if "File-Date" in record:
            fileDate = record["File-Date"]
            continue

        if record["Type"] == "grandfathered":
            # Grandfathered tags don't use standard syntax, so
            # CanonicalizeLanguageTag expects the mapping table to provide
            # the final form for all.
            # For langTagMappings, keys must be in lower case; values in
            # the case used in the registry.
            tag = record["Tag"]
            if "Preferred-Value" in record:
                langTagMappings[tag.lower()] = record["Preferred-Value"]
            else:
                langTagMappings[tag.lower()] = tag
        elif record["Type"] == "redundant":
            # For langTagMappings, keys must be in lower case; values in
            # the case used in the registry.
            if "Preferred-Value" in record:
                langTagMappings[record["Tag"].lower()] = record["Preferred-Value"]
        elif record["Type"] in ("language", "script", "region", "variant"):
            # For langSubtagMappings, keys and values must be in the case used
            # in the registry.
            subtag = record["Subtag"]
            if record["Type"] == "language":
                languageSubtags.add(subtag)
            if "Preferred-Value" in record:
                if subtag == "heploc":
                    # The entry for heploc is unique in its complexity; handle
                    # it as special case below.
                    continue
                if "Prefix" in record:
                    # This might indicate another heploc-like complex case.
                    raise Exception("Please evaluate: subtag mapping with prefix value.")
                langSubtagMappings[subtag] = record["Preferred-Value"]
        elif record["Type"] == "extlang":
            # For extlangMappings, keys must be in the case used in the
            # registry; values are records with the preferred value and the
            # prefix to be removed.
            subtag = record["Subtag"]
            extlangSubtags.add(subtag)
            if "Preferred-Value" in record:
                preferred = record["Preferred-Value"]
                prefix = record["Prefix"]
                extlangMappings[subtag] = {"preferred": preferred, "prefix": prefix}
        else:
            # No other types are allowed by
            # http://tools.ietf.org/html/rfc5646#section-3.1.3
            # Raised explicitly rather than via an assert statement so that
            # the check is not stripped when Python runs with -O.
            raise AssertionError("Unrecognized Type: {0}".format(record["Type"]))

    if fileDate is None:
        # Without this check the return statement below would fail with an
        # opaque error about an unbound local variable.
        raise Exception("Registry does not contain a File-Date record.")

    # Check that mappings for language subtags and extlang subtags don't affect
    # each other.
    for lang in languageSubtags:
        if lang in extlangMappings and extlangMappings[lang]["preferred"] != lang:
            raise Exception("Conflict: lang with extlang mapping: " + lang)
    for extlang in extlangSubtags:
        if extlang in langSubtagMappings:
            raise Exception("Conflict: extlang with lang mapping: " + extlang)

    # Special case for heploc.
    langTagMappings["ja-latn-hepburn-heploc"] = "ja-Latn-alalc97"

    return {"fileDate": fileDate,
            "langTagMappings": langTagMappings,
            "langSubtagMappings": langSubtagMappings,
            "extlangMappings": extlangMappings}


def writeMappingsVar(intlData, dict, name, description, fileDate, url):
    """ Writes a variable definition with a mapping table to file intlData.

        Writes the contents of dictionary dict to file intlData with the given
        variable name and a comment with description, fileDate, and URL.

        Entries are written sorted by key. A string value is written as a
        quoted string; any other value must be a dictionary with the keys
        "preferred" and "prefix" and is written as an object literal with
        these two properties.
    """
    # basestring only exists in Python 2; fall back to str so that the
    # function also works when run under Python 3.
    try:
        stringTypes = basestring
    except NameError:
        stringTypes = str

    intlData.write("\n")
    intlData.write("// {0}.\n".format(description))
    intlData.write("// Derived from IANA Language Subtag Registry, file date {0}.\n".format(fileDate))
    intlData.write("// {0}\n".format(url))
    intlData.write("var {0} = {{\n".format(name))
    for key in sorted(dict):
        entry = dict[key]
        if isinstance(entry, stringTypes):
            value = '"{0}"'.format(entry)
        else:
            value = '{{preferred: "{0}", prefix: "{1}"}}'.format(entry["preferred"], entry["prefix"])
        intlData.write('    "{0}": {1},\n'.format(key, value))
    intlData.write("};\n")


def writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings):
    """ Writes the language tag data to the Intl data file.

        Emits one variable definition per mapping table, in the order
        langTagMappings, langSubtagMappings, extlangMappings; each carries
        the same fileDate and url in its header comment.
    """
    # (mapping table, JavaScript variable name, description for the comment)
    tables = (
        (langTagMappings, "langTagMappings",
         "Mappings from complete tags to preferred values"),
        (langSubtagMappings, "langSubtagMappings",
         "Mappings from non-extlang subtags to preferred values"),
        (extlangMappings, "extlangMappings",
         "Mappings from extlang subtags to preferred values"),
    )
    for mapping, varName, summary in tables:
        writeMappingsVar(intlData, mapping, varName, summary, fileDate, url)


if __name__ == '__main__':
    # Script entry point: obtain the registry (from the file named on the
    # command line, or by downloading it), extract the mappings, and write
    # them to IntlData.js in the current directory.
    import codecs
    import sys

    url = "http://www.iana.org/assignments/language-subtag-registry"
    if len(sys.argv) > 1:
        print("Always make sure you have the newest language-subtag-registry.txt!")
        registry = codecs.open(sys.argv[1], "r", encoding="utf-8")
        text = None
    else:
        # Only the download path needs a URL library, so import it here;
        # urllib2 is the Python 2 name, urllib.request the Python 3 one.
        try:
            from urllib2 import urlopen
        except ImportError:
            from urllib.request import urlopen

        print("Downloading IANA Language Subtag Registry...")
        reader = urlopen(url)
        try:
            text = reader.read().decode("utf-8")
        finally:
            reader.close()
        # Keep a local copy of the downloaded registry; it is opened in "w+"
        # mode so that it can be read back for processing below.
        registry = codecs.open("language-subtag-registry.txt", "w+", encoding="utf-8")

    # try/finally guarantees the registry file is closed even if writing the
    # local copy or parsing the registry raises.
    try:
        if text is not None:
            registry.write(text)
            registry.seek(0)

        print("Processing IANA Language Subtag Registry...")
        data = readRegistry(registry)
    finally:
        registry.close()

    fileDate = data["fileDate"]
    langTagMappings = data["langTagMappings"]
    langSubtagMappings = data["langSubtagMappings"]
    extlangMappings = data["extlangMappings"]

    print("Writing Intl data...")
    intlData = codecs.open("IntlData.js", "w", encoding="utf-8")
    try:
        intlData.write("// Generated by make_intl_data.py. DO NOT EDIT.\n")
        writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings)
    finally:
        intlData.close()