summaryrefslogtreecommitdiff
path: root/js/src/builtin/intl/make_intl_data.py
diff options
context:
space:
mode:
Diffstat (limited to 'js/src/builtin/intl/make_intl_data.py')
-rw-r--r-- js/src/builtin/intl/make_intl_data.py | 1642
1 files changed, 1462 insertions, 180 deletions
diff --git a/js/src/builtin/intl/make_intl_data.py b/js/src/builtin/intl/make_intl_data.py
index a81001e0f3..59ff14d76c 100644
--- a/js/src/builtin/intl/make_intl_data.py
+++ b/js/src/builtin/intl/make_intl_data.py
@@ -6,19 +6,15 @@
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
""" Usage:
- make_intl_data.py langtags [language-subtag-registry.txt]
+ make_intl_data.py langtags [cldr_core.zip]
make_intl_data.py tzdata
+ make_intl_data.py unicode-ext
Target "langtags":
- This script extracts information about mappings between deprecated and
- current BCP 47 language tags from the IANA Language Subtag Registry and
- converts it to JavaScript object definitions in
- LangTagMappingsGenerated.js. The definitions are used in Intl.js.
-
- The IANA Language Subtag Registry is imported from
- https://www.iana.org/assignments/language-subtag-registry
- and uses the syntax specified in
- https://tools.ietf.org/html/rfc5646#section-3
+ This script extracts information about 1) mappings between deprecated and
+ current Unicode BCP 47 locale identifiers, and 2) deprecated and current
+ BCP 47 Unicode extension value from CLDR, and converts it to C++ mapping
+ code in LanguageTagGenerated.cpp. The code is used in LanguageTag.cpp.
Target "tzdata":
@@ -36,194 +32,1330 @@ import sys
import tarfile
import tempfile
import urllib2
-import urlparse
from contextlib import closing
from functools import partial
-from itertools import chain, ifilter, ifilterfalse, imap, tee
+from itertools import chain, ifilter, ifilterfalse, imap, izip_longest, groupby, tee
from operator import attrgetter, itemgetter
+from urlparse import urlsplit
+from zipfile import ZipFile
+
+# From https://docs.python.org/3/library/itertools.html
+def grouper(iterable, n, fillvalue=None):
+ "Collect data into fixed-length chunks or blocks"
+    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx
+ args = [iter(iterable)] * n
+ return izip_longest(*args, fillvalue=fillvalue)
+
+def writeMappingHeader(println, description, source, url):
+ if type(description) is not list:
+ description = [description]
+ for desc in description:
+ println(u"// {0}".format(desc))
+ println(u"// Derived from {0}.".format(source))
+ println(u"// {0}".format(url))
+
+def writeMappingsVar(println, mapping, name, description, source, url):
+ """ Writes a variable definition with a mapping table.
+
+ Writes the contents of dictionary |mapping| through the |println|
+ function with the given variable name and a comment with description,
+ source, and URL.
+ """
+ println(u"")
+ writeMappingHeader(println, description, source, url)
+ println(u"var {0} = {{".format(name))
+ for (key, value) in sorted(mapping.items(), key=itemgetter(0)):
+ println(u' "{0}": "{1}",'.format(key, value))
+ println(u"};")
+
+def writeMappingsBinarySearch(println, fn_name, type_name, name, validate_fn, validate_case_fn,
+ mappings, tag_maxlength, description, source, url):
+ """ Emit code to perform a binary search on language tag subtags.
+
+ Uses the contents of |mapping|, which can either be a dictionary or set,
+ to emit a mapping function to find subtag replacements.
+ """
+ println(u"")
+ writeMappingHeader(println, description, source, url)
+ println(u"""
+bool js::intl::LanguageTag::{0}({1} {2}) {{
+ MOZ_ASSERT({3}({2}.span()));
+ MOZ_ASSERT({4}({2}.span()));
+""".format(fn_name, type_name, name, validate_fn, validate_case_fn).strip())
+
+ def write_array(subtags, name, length, fixed):
+ if fixed:
+ println(u" static const char {}[{}][{}] = {{".format(name, len(subtags),
+ length + 1))
+ else:
+ println(u" static const char* {}[{}] = {{".format(name, len(subtags)))
+
+ # Group in pairs of ten to not exceed the 80 line column limit.
+ for entries in grouper(subtags, 10):
+ entries = (u"\"{}\"".format(tag).rjust(length + 2)
+ for tag in entries if tag is not None)
+ println(u" {},".format(u", ".join(entries)))
+
+ println(u" };")
+
+ trailing_return = True
+
+ # Sort the subtags by length. That enables using an optimized comparator
+ # for the binary search, which only performs a single |memcmp| for multiple
+ # of two subtag lengths.
+ mappings_keys = mappings.keys() if type(mappings) == dict else mappings
+ for (length, subtags) in groupby(sorted(mappings_keys, key=len), len):
+ # Omit the length check if the current length is the maximum length.
+ if length != tag_maxlength:
+ println(u"""
+ if ({}.length() == {}) {{
+""".format(name, length).rstrip("\n"))
+ else:
+ trailing_return = False
+ println(u"""
+ {
+""".rstrip("\n"))
+
+ # The subtags need to be sorted for binary search to work.
+ subtags = sorted(subtags)
+
+ def equals(subtag):
+ return u"""{}.equalTo("{}")""".format(name, subtag)
+
+ # Don't emit a binary search for short lists.
+ if len(subtags) == 1:
+ if type(mappings) == dict:
+ println(u"""
+ if ({}) {{
+ {}.set("{}");
+ return true;
+ }}
+ return false;
+""".format(equals(subtags[0]), name, mappings[subtags[0]]).strip("\n"))
+ else:
+ println(u"""
+ return {};
+""".format(equals(subtags[0])).strip("\n"))
+ elif len(subtags) <= 4:
+ if type(mappings) == dict:
+ for subtag in subtags:
+ println(u"""
+ if ({}) {{
+ {}.set("{}");
+ return true;
+ }}
+""".format(equals(subtag), name, mappings[subtag]).strip("\n"))
+
+ println(u"""
+ return false;
+""".strip("\n"))
+ else:
+ cond = (equals(subtag) for subtag in subtags)
+ cond = (u" ||\n" + u" " * (4 + len("return "))).join(cond)
+ println(u"""
+ return {};
+""".format(cond).strip("\n"))
+ else:
+ write_array(subtags, name + "s", length, True)
+
+ if type(mappings) == dict:
+ write_array([mappings[k] for k in subtags], u"aliases", length, False)
+
+ println(u"""
+ if (const char* replacement = SearchReplacement({0}s, aliases, {0})) {{
+ {0}.set(mozilla::MakeCStringSpan(replacement));
+ return true;
+ }}
+ return false;
+""".format(name).rstrip())
+ else:
+ println(u"""
+ return HasReplacement({0}s, {0});
+""".format(name).rstrip())
-def readRegistryRecord(registry):
- """ Yields the records of the IANA Language Subtag Registry as dictionaries. """
- record = {}
- for line in registry:
- line = line.strip()
- if line == "":
+ println(u"""
+ }
+""".strip("\n"))
+
+ if trailing_return:
+ println(u"""
+ return false;""")
+
+ println(u"""
+}""".lstrip("\n"))
+
+
+def writeComplexLanguageTagMappings(println, complex_language_mappings,
+ description, source, url):
+ println(u"")
+ writeMappingHeader(println, description, source, url)
+ println(u"""
+void js::intl::LanguageTag::performComplexLanguageMappings() {
+ MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span()));
+ MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span()));
+""".lstrip())
+
+ # Merge duplicate language entries.
+ language_aliases = {}
+ for (deprecated_language, (language, script, region)) in (
+ sorted(complex_language_mappings.items(), key=itemgetter(0))
+ ):
+ key = (language, script, region)
+ if key not in language_aliases:
+ language_aliases[key] = []
+ else:
+ language_aliases[key].append(deprecated_language)
+
+ first_language = True
+ for (deprecated_language, (language, script, region)) in (
+ sorted(complex_language_mappings.items(), key=itemgetter(0))
+ ):
+ key = (language, script, region)
+ if deprecated_language in language_aliases[key]:
continue
- if line == "%%":
- yield record
- record = {}
+
+ if_kind = u"if" if first_language else u"else if"
+ first_language = False
+
+ cond = (u"language().equalTo(\"{}\")".format(lang)
+ for lang in [deprecated_language] + language_aliases[key])
+ cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond)
+
+ println(u"""
+ {} ({}) {{""".format(if_kind, cond).strip("\n"))
+
+ println(u"""
+ setLanguage("{}");""".format(language).strip("\n"))
+
+ if script is not None:
+ println(u"""
+ if (script().missing()) {{
+ setScript("{}");
+ }}""".format(script).strip("\n"))
+ if region is not None:
+ println(u"""
+ if (region().missing()) {{
+ setRegion("{}");
+ }}""".format(region).strip("\n"))
+ println(u"""
+ }""".strip("\n"))
+
+ println(u"""
+}
+""".strip("\n"))
+
+
+def writeComplexRegionTagMappings(println, complex_region_mappings,
+ description, source, url):
+ println(u"")
+ writeMappingHeader(println, description, source, url)
+ println(u"""
+void js::intl::LanguageTag::performComplexRegionMappings() {
+ MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span()));
+ MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span()));
+ MOZ_ASSERT(IsStructurallyValidRegionTag(region().span()));
+ MOZ_ASSERT(IsCanonicallyCasedRegionTag(region().span()));
+""".lstrip())
+
+ # |non_default_replacements| is a list and hence not hashable. Convert it
+ # to a string to get a proper hashable value.
+ def hash_key(default, non_default_replacements):
+ return (default, str(sorted(str(v) for v in non_default_replacements)))
+
+ # Merge duplicate region entries.
+ region_aliases = {}
+ for (deprecated_region, (default, non_default_replacements)) in (
+ sorted(complex_region_mappings.items(), key=itemgetter(0))
+ ):
+ key = hash_key(default, non_default_replacements)
+ if key not in region_aliases:
+ region_aliases[key] = []
else:
- if ":" in line:
- key, value = line.split(":", 1)
- key, value = key.strip(), value.strip()
- record[key] = value
- else:
- # continuation line
- record[key] += " " + line
- if record:
- yield record
- return
+ region_aliases[key].append(deprecated_region)
+
+ first_region = True
+ for (deprecated_region, (default, non_default_replacements)) in (
+ sorted(complex_region_mappings.items(), key=itemgetter(0))
+ ):
+ key = hash_key(default, non_default_replacements)
+ if deprecated_region in region_aliases[key]:
+ continue
+
+ if_kind = u"if" if first_region else u"else if"
+ first_region = False
+
+ cond = (u"region().equalTo(\"{}\")".format(region)
+ for region in [deprecated_region] + region_aliases[key])
+ cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond)
+
+ println(u"""
+ {} ({}) {{""".format(if_kind, cond).strip("\n"))
+
+ replacement_regions = sorted({region for (_, _, region) in non_default_replacements})
+
+ first_case = True
+ for replacement_region in replacement_regions:
+ replacement_language_script = sorted(((language, script)
+ for (language, script, region) in (
+ non_default_replacements
+ )
+ if region == replacement_region),
+ key=itemgetter(0))
+
+ if_kind = u"if" if first_case else u"else if"
+ first_case = False
+
+ def compare_tags(language, script):
+ if script is None:
+ return u"language().equalTo(\"{}\")".format(language)
+ return u"(language().equalTo(\"{}\") && script().equalTo(\"{}\"))".format(
+ language, script)
+
+ cond = (compare_tags(language, script)
+ for (language, script) in replacement_language_script)
+ cond = (u" ||\n" + u" " * (4 + len(if_kind) + 2)).join(cond)
+
+ println(u"""
+ {} ({}) {{
+ setRegion("{}");
+ }}""".format(if_kind, cond, replacement_region).rstrip().strip("\n"))
+
+ println(u"""
+ else {{
+ setRegion("{}");
+ }}
+ }}""".format(default).rstrip().strip("\n"))
+
+ println(u"""
+}
+""".strip("\n"))
+
+
+def writeVariantTagMappings(println, variant_mappings, description, source,
+ url):
+ """ Writes a function definition that maps variant subtags. """
+ println(u"""
+static const char* ToCharPointer(const char* str) {
+ return str;
+}
+
+static const char* ToCharPointer(const js::UniqueChars& str) {
+ return str.get();
+}
+
+template <typename T, typename U = T>
+static bool IsLessThan(const T& a, const U& b) {
+ return strcmp(ToCharPointer(a), ToCharPointer(b)) < 0;
+}
+""")
+ writeMappingHeader(println, description, source, url)
+ println(u"""
+bool js::intl::LanguageTag::performVariantMappings(JSContext* cx) {
+ // The variant subtags need to be sorted for binary search.
+ MOZ_ASSERT(std::is_sorted(variants_.begin(), variants_.end(),
+ IsLessThan<decltype(variants_)::ElementType>));
+
+ auto insertVariantSortedIfNotPresent = [&](const char* variant) {
+ auto* p = std::lower_bound(variants_.begin(), variants_.end(), variant,
+ IsLessThan<decltype(variants_)::ElementType,
+ decltype(variant)>);
+
+ // Don't insert the replacement when already present.
+ if (p != variants_.end() && strcmp(p->get(), variant) == 0) {
+ return true;
+ }
+
+ // Insert the preferred variant in sort order.
+ auto preferred = DuplicateString(cx, variant);
+ if (!preferred) {
+ return false;
+ }
+ return !!variants_.insert(p, std::move(preferred));
+ };
+
+ for (size_t i = 0; i < variants_.length(); ) {
+ auto& variant = variants_[i];
+ MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeCStringSpan(variant.get())));
+""".lstrip())
+
+ first_variant = True
+
+ for (deprecated_variant, (type, replacement)) in (
+ sorted(variant_mappings.items(), key=itemgetter(0))
+ ):
+ if_kind = u"if" if first_variant else u"else if"
+ first_variant = False
+
+ println(u"""
+ {} (strcmp(variant.get(), "{}") == 0) {{
+ variants_.erase(variants_.begin() + i);
+""".format(if_kind, deprecated_variant).strip("\n"))
+
+ if type == "language":
+ println(u"""
+ setLanguage("{}");
+""".format(replacement).strip("\n"))
+ elif type == "region":
+ println(u"""
+ setRegion("{}");
+""".format(replacement).strip("\n"))
+ else:
+ assert type == "variant"
+ println(u"""
+ if (!insertVariantSortedIfNotPresent("{}")) {{
+ return false;
+ }}
+""".format(replacement).strip("\n"))
+
+ println(u"""
+ }
+""".strip("\n"))
+
+ println(u"""
+ else {
+ i++;
+ }
+ }
+ return true;
+}
+""".strip("\n"))
+
+
+def writeGrandfatheredMappingsFunction(println, grandfathered_mappings,
+ description, source, url):
+ """ Writes a function definition that maps grandfathered language tags. """
+ println(u"")
+ writeMappingHeader(println, description, source, url)
+ println(u"""\
+bool js::intl::LanguageTag::updateGrandfatheredMappings(JSContext* cx) {
+ // We're mapping regular grandfathered tags to non-grandfathered form here.
+ // Other tags remain unchanged.
+ //
+ // regular = "art-lojban"
+ // / "cel-gaulish"
+ // / "no-bok"
+ // / "no-nyn"
+ // / "zh-guoyu"
+ // / "zh-hakka"
+ // / "zh-min"
+ // / "zh-min-nan"
+ // / "zh-xiang"
+ //
+ // Therefore we can quickly exclude most tags by checking every
+ // |unicode_locale_id| subcomponent for characteristics not shared by any of
+ // the regular grandfathered (RG) tags:
+ //
+ // * Real-world |unicode_language_subtag|s are all two or three letters,
+ // so don't waste time running a useless |language.length > 3| fast-path.
+ // * No RG tag has a "script"-looking component.
+ // * No RG tag has a "region"-looking component.
+ // * The RG tags that match |unicode_locale_id| (art-lojban, cel-gaulish,
+ // zh-guoyu, zh-hakka, zh-xiang) have exactly one "variant". (no-bok,
+ // no-nyn, zh-min, and zh-min-nan require BCP47's extlang subtag
+ // that |unicode_locale_id| doesn't support.)
+ // * No RG tag contains |extensions| or |pu_extensions|.
+ if (script().present() ||
+ region().present() ||
+ variants().length() != 1 ||
+ extensions().length() != 0 ||
+ privateuse()) {
+ return true;
+ }
+
+ MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span()));
+ MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeCStringSpan(variants()[0].get())));
+
+ auto variantEqualTo = [this](const char* variant) {
+ return strcmp(variants()[0].get(), variant) == 0;
+ };""")
+
+ # From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>.
+ #
+ # Doesn't allow any 'extensions' subtags.
+ re_unicode_locale_id = re.compile(
+ r"""
+ ^
+ # unicode_language_id = unicode_language_subtag
+ # unicode_language_subtag = alpha{2,3} | alpha{5,8}
+ (?P<language>[a-z]{2,3}|[a-z]{5,8})
+
+ # (sep unicode_script_subtag)?
+ # unicode_script_subtag = alpha{4}
+ (?:-(?P<script>[a-z]{4}))?
+
+ # (sep unicode_region_subtag)?
+ # unicode_region_subtag = (alpha{2} | digit{3})
+ (?:-(?P<region>([a-z]{2}|[0-9]{3})))?
+
+ # (sep unicode_variant_subtag)*
+ # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3})
+ (?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)?
+
+ # pu_extensions?
+ # pu_extensions = sep [xX] (sep alphanum{1,8})+
+ (?:-(?P<privateuse>x(-[a-z0-9]{1,8})+))?
+ $
+ """, re.IGNORECASE | re.VERBOSE)
+
+ is_first = True
+
+ for (tag, modern) in sorted(grandfathered_mappings.items(), key=itemgetter(0)):
+ tag_match = re_unicode_locale_id.match(tag)
+ assert tag_match is not None
+
+ tag_language = tag_match.group("language")
+ assert tag_match.group("script") is None, (
+ "{} does not contain a script subtag".format(tag))
+ assert tag_match.group("region") is None, (
+ "{} does not contain a region subtag".format(tag))
+ tag_variants = tag_match.group("variants")
+ assert tag_variants is not None, (
+ "{} contains a variant subtag".format(tag))
+ assert tag_match.group("privateuse") is None, (
+ "{} does not contain a privateuse subtag".format(tag))
+
+ tag_variant = tag_variants[1:]
+ assert "-" not in tag_variant, (
+ "{} contains only a single variant".format(tag))
+
+ modern_match = re_unicode_locale_id.match(modern)
+ assert modern_match is not None
+
+ modern_language = modern_match.group("language")
+ modern_script = modern_match.group("script")
+ modern_region = modern_match.group("region")
+ modern_variants = modern_match.group("variants")
+ modern_privateuse = modern_match.group("privateuse")
+
+ println(u"""
+ // {} -> {}
+""".format(tag, modern).rstrip())
+
+ println(u"""
+ {}if (language().equalTo("{}") && variantEqualTo("{}")) {{
+ """.format("" if is_first else "else ",
+ tag_language,
+ tag_variant).rstrip().strip("\n"))
+
+ is_first = False
+
+ println(u"""
+ setLanguage("{}");
+ """.format(modern_language).rstrip().strip("\n"))
+
+ if modern_script is not None:
+ println(u"""
+ setScript("{}");
+ """.format(modern_script).rstrip().strip("\n"))
+
+ if modern_region is not None:
+ println(u"""
+ setRegion("{}");
+ """.format(modern_region).rstrip().strip("\n"))
+
+ assert modern_variants is None, (
+ "all regular grandfathered tags' modern forms do not contain variant subtags")
+
+ println(u"""
+ clearVariants();
+ """.rstrip().strip("\n"))
+
+ if modern_privateuse is not None:
+ println(u"""
+ auto privateuse = DuplicateString(cx, "{}");
+ if (!privateuse) {{
+ return false;
+ }}
+ setPrivateuse(std::move(privateuse));
+ """.format(modern_privateuse).rstrip().rstrip("\n"))
+
+ println(u"""
+ return true;
+ }""".rstrip().strip("\n"))
+ println(u"""
+ return true;
+}""")
-def readRegistry(registry):
- """ Reads IANA Language Subtag Registry and extracts information for Intl.js.
+
+def readSupplementalData(core_file):
+ """ Reads CLDR Supplemental Data and extracts information for Intl.js.
Information extracted:
- - langTagMappings: mappings from complete language tags to preferred
+ - grandfatheredMappings: mappings from grandfathered tags to preferred
complete language tags
- - langSubtagMappings: mappings from subtags to preferred subtags
- - extlangMappings: mappings from extlang subtags to preferred subtags,
- with prefix to be removed
- Returns these three mappings as dictionaries, along with the registry's
- file date.
-
- We also check that mappings for language subtags don't affect extlang
- subtags and vice versa, so that CanonicalizeLanguageTag doesn't have
- to separate them for processing. Region codes are separated by case,
- and script codes by length, so they're unproblematic.
+ - languageMappings: mappings from language subtags to preferred subtags
+ - complexLanguageMappings: mappings from language subtags with complex rules
+ - regionMappings: mappings from region subtags to preferred subtags
+ - complexRegionMappings: mappings from region subtags with complex rules
+ - variantMappings: mappings from variant subtags to preferred subtags
+ - likelySubtags: likely subtags used for generating test data only
+ Returns these mappings as dictionaries.
"""
- langTagMappings = {}
- langSubtagMappings = {}
- extlangMappings = {}
- languageSubtags = set()
- extlangSubtags = set()
-
- for record in readRegistryRecord(registry):
- if "File-Date" in record:
- fileDate = record["File-Date"]
+ import xml.etree.ElementTree as ET
+
+ # From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>.
+ re_unicode_language_id = re.compile(
+ r"""
+ ^
+ # unicode_language_id = unicode_language_subtag
+ # unicode_language_subtag = alpha{2,3} | alpha{5,8}
+ (?P<language>[a-z]{2,3}|[a-z]{5,8})
+
+ # (sep unicode_script_subtag)?
+ # unicode_script_subtag = alpha{4}
+ (?:-(?P<script>[a-z]{4}))?
+
+ # (sep unicode_region_subtag)?
+ # unicode_region_subtag = (alpha{2} | digit{3})
+ (?:-(?P<region>([a-z]{2}|[0-9]{3})))?
+
+ # (sep unicode_variant_subtag)*
+ # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3})
+ (?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)?
+ $
+ """, re.IGNORECASE | re.VERBOSE)
+
+ re_unicode_language_subtag = re.compile(
+ r"""
+ ^
+ # unicode_language_subtag = alpha{2,3} | alpha{5,8}
+ ([a-z]{2,3}|[a-z]{5,8})
+ $
+ """, re.IGNORECASE | re.VERBOSE)
+
+ re_unicode_region_subtag = re.compile(
+ r"""
+ ^
+ # unicode_region_subtag = (alpha{2} | digit{3})
+ ([a-z]{2}|[0-9]{3})
+ $
+ """, re.IGNORECASE | re.VERBOSE)
+
+ re_unicode_variant_subtag = re.compile(
+ r"""
+ ^
+ # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3})
+ ([a-z0-9]{5,8}|(?:[0-9][a-z0-9]{3}))
+ $
+ """, re.IGNORECASE | re.VERBOSE)
+
+ # The fixed list of BCP 47 grandfathered language tags.
+ grandfathered_tags = (
+ "art-lojban",
+ "cel-gaulish",
+ "en-GB-oed",
+ "i-ami",
+ "i-bnn",
+ "i-default",
+ "i-enochian",
+ "i-hak",
+ "i-klingon",
+ "i-lux",
+ "i-mingo",
+ "i-navajo",
+ "i-pwn",
+ "i-tao",
+ "i-tay",
+ "i-tsu",
+ "no-bok",
+ "no-nyn",
+ "sgn-BE-FR",
+ "sgn-BE-NL",
+ "sgn-CH-DE",
+ "zh-guoyu",
+ "zh-hakka",
+ "zh-min",
+ "zh-min-nan",
+ "zh-xiang",
+ )
+
+ # The list of grandfathered tags which are valid Unicode BCP 47 locale identifiers.
+ unicode_bcp47_grandfathered_tags = {tag for tag in grandfathered_tags
+ if re_unicode_language_id.match(tag)}
+
+ # Dictionary of simple language subtag mappings, e.g. "in" -> "id".
+ language_mappings = {}
+
+ # Dictionary of complex language subtag mappings, modifying more than one
+ # subtag, e.g. "sh" -> ("sr", "Latn", None) and "cnr" -> ("sr", None, "ME").
+ complex_language_mappings = {}
+
+ # Dictionary of simple region subtag mappings, e.g. "DD" -> "DE".
+ region_mappings = {}
+
+ # Dictionary of complex region subtag mappings, containing more than one
+    # replacement, e.g. "SU" -> ("RU", ["AM", "AZ", "BY", ...]).
+ complex_region_mappings = {}
+
+ # Dictionary of aliased variant subtags to a tuple of preferred replacement
+ # type and replacement, e.g. "arevela" -> ("language", "hy") or
+ # "aaland" -> ("region", "AX") or "heploc" -> ("variant", "alalc97").
+ variant_mappings = {}
+
+ # Dictionary of grandfathered mappings to preferred values.
+ grandfathered_mappings = {}
+
+ # CLDR uses "_" as the separator for some elements. Replace it with "-".
+ def bcp47_id(cldr_id):
+ return cldr_id.replace("_", "-")
+
+ # CLDR uses the canonical case for most entries, but there are some
+ # exceptions, like:
+ # <languageAlias type="drw" replacement="fa_af" reason="deprecated"/>
+ # Therefore canonicalize all tags to be on the safe side.
+ def bcp47_canonical(language, script, region):
+ # Canonical case for language subtags is lower case.
+ # Canonical case for script subtags is title case.
+ # Canonical case for region subtags is upper case.
+ return (language.lower() if language else None,
+ script.title() if script else None,
+ region.upper() if region else None)
+
+ tree = ET.parse(core_file.open("common/supplemental/supplementalMetadata.xml"))
+
+ for language_alias in tree.iterfind(".//languageAlias"):
+ type = bcp47_id(language_alias.get("type"))
+ replacement = bcp47_id(language_alias.get("replacement"))
+
+ # Handle grandfathered mappings first.
+ if type in unicode_bcp47_grandfathered_tags:
+ grandfathered_mappings[type] = replacement
continue
- if record["Type"] == "grandfathered":
- # Grandfathered tags don't use standard syntax, so
- # CanonicalizeLanguageTag expects the mapping table to provide
- # the final form for all.
- # For langTagMappings, keys must be in lower case; values in
- # the case used in the registry.
- tag = record["Tag"]
- if "Preferred-Value" in record:
- langTagMappings[tag.lower()] = record["Preferred-Value"]
- else:
- langTagMappings[tag.lower()] = tag
- elif record["Type"] == "redundant":
- # For langTagMappings, keys must be in lower case; values in
- # the case used in the registry.
- if "Preferred-Value" in record:
- langTagMappings[record["Tag"].lower()] = record["Preferred-Value"]
- elif record["Type"] in ("language", "script", "region", "variant"):
- # For langSubtagMappings, keys and values must be in the case used
- # in the registry.
- subtag = record["Subtag"]
- if record["Type"] == "language":
- languageSubtags.add(subtag)
- if "Preferred-Value" in record:
- if subtag == "heploc":
- # The entry for heploc is unique in its complexity; handle
- # it as special case below.
- continue
- if "Prefix" in record:
- # This might indicate another heploc-like complex case.
- raise Exception("Please evaluate: subtag mapping with prefix value.")
- langSubtagMappings[subtag] = record["Preferred-Value"]
- elif record["Type"] == "extlang":
- # For extlangMappings, keys must be in the case used in the
- # registry; values are records with the preferred value and the
- # prefix to be removed.
- subtag = record["Subtag"]
- extlangSubtags.add(subtag)
- if "Preferred-Value" in record:
- preferred = record["Preferred-Value"]
- prefix = record["Prefix"]
- extlangMappings[subtag] = {"preferred": preferred, "prefix": prefix}
+ # We're only interested in language subtag matches, so ignore any
+ # entries which have additional subtags.
+ if re_unicode_language_subtag.match(type) is None:
+ continue
+
+ assert type.islower()
+
+ if re_unicode_language_subtag.match(replacement) is not None:
+ # Canonical case for language subtags is lower-case.
+ language_mappings[type] = replacement.lower()
else:
- # No other types are allowed by
- # https://tools.ietf.org/html/rfc5646#section-3.1.3
- assert False, "Unrecognized Type: {0}".format(record["Type"])
+ replacement_match = re_unicode_language_id.match(replacement)
+ assert replacement_match is not None, (
+ "{} invalid Unicode BCP 47 locale identifier".format(replacement))
+ assert replacement_match.group("variants") is None, (
+ "{}: unexpected variant subtags in {}".format(type, replacement))
+
+ complex_language_mappings[type] = bcp47_canonical(replacement_match.group("language"),
+ replacement_match.group("script"),
+ replacement_match.group("region"))
+
+ for territory_alias in tree.iterfind(".//territoryAlias"):
+ type = territory_alias.get("type")
+ replacement = territory_alias.get("replacement")
+
+ # We're only interested in region subtag matches, so ignore any entries
+ # which contain legacy formats, e.g. three letter region codes.
+ if re_unicode_region_subtag.match(type) is None:
+ continue
- # Check that mappings for language subtags and extlang subtags don't affect
- # each other.
- for lang in languageSubtags:
- if lang in extlangMappings and extlangMappings[lang]["preferred"] != lang:
- raise Exception("Conflict: lang with extlang mapping: " + lang)
- for extlang in extlangSubtags:
- if extlang in langSubtagMappings:
- raise Exception("Conflict: extlang with lang mapping: " + extlang)
+ assert type.isupper() or type.isdigit()
- # Special case for heploc.
- langTagMappings["ja-latn-hepburn-heploc"] = "ja-Latn-alalc97"
+ if re_unicode_region_subtag.match(replacement) is not None:
+ # Canonical case for region subtags is upper-case.
+ region_mappings[type] = replacement.upper()
+ else:
+ # Canonical case for region subtags is upper-case.
+ replacements = [r.upper() for r in replacement.split(" ")]
+ assert all(
+ re_unicode_region_subtag.match(loc) is not None for loc in replacements
+ ), "{} invalid region subtags".format(replacement)
+ complex_region_mappings[type] = replacements
- return {"fileDate": fileDate,
- "langTagMappings": langTagMappings,
- "langSubtagMappings": langSubtagMappings,
- "extlangMappings": extlangMappings}
+ for variant_alias in tree.iterfind(".//variantAlias"):
+ type = variant_alias.get("type")
+ replacement = variant_alias.get("replacement")
+ assert re_unicode_variant_subtag.match(type) is not None, (
+ "{} invalid variant subtag".format(type))
-def writeMappingsVar(intlData, dict, name, description, fileDate, url):
- """ Writes a variable definition with a mapping table to file intlData.
+ # Normalize the case, because some variants are in upper case.
+ type = type.lower()
+
+ # The replacement can be a language, a region, or a variant subtag.
+ # Language and region subtags are case normalized, variant subtags can
+ # be in any case.
+
+ if re_unicode_language_subtag.match(replacement) is not None and replacement.islower():
+ variant_mappings[type] = ("language", replacement)
+
+ elif re_unicode_region_subtag.match(replacement) is not None:
+ assert replacement.isupper() or replacement.isdigit(), (
+ "{} invalid variant subtag replacement".format(replacement))
+ variant_mappings[type] = ("region", replacement)
- Writes the contents of dictionary dict to file intlData with the given
- variable name and a comment with description, fileDate, and URL.
- """
- intlData.write("\n")
- intlData.write("// {0}.\n".format(description))
- intlData.write("// Derived from IANA Language Subtag Registry, file date {0}.\n".format(fileDate))
- intlData.write("// {0}\n".format(url))
- intlData.write("var {0} = {{\n".format(name))
- keys = sorted(dict)
- for key in keys:
- if isinstance(dict[key], basestring):
- value = '"{0}"'.format(dict[key])
else:
- preferred = dict[key]["preferred"]
- prefix = dict[key]["prefix"]
- value = '{{preferred: "{0}", prefix: "{1}"}}'.format(preferred, prefix)
- intlData.write(' "{0}": {1},\n'.format(key, value))
- intlData.write("};\n")
+ assert re_unicode_variant_subtag.match(replacement) is not None, (
+ "{} invalid variant subtag replacement".format(replacement))
+ variant_mappings[type] = ("variant", replacement.lower())
+
+ tree = ET.parse(core_file.open("common/supplemental/likelySubtags.xml"))
+
+ likely_subtags = {}
+
+ for likely_subtag in tree.iterfind(".//likelySubtag"):
+ from_tag = bcp47_id(likely_subtag.get("from"))
+ from_match = re_unicode_language_id.match(from_tag)
+ assert from_match is not None, (
+ "{} invalid Unicode BCP 47 locale identifier".format(from_tag))
+ assert from_match.group("variants") is None, (
+ "unexpected variant subtags in {}".format(from_tag))
+
+ to_tag = bcp47_id(likely_subtag.get("to"))
+ to_match = re_unicode_language_id.match(to_tag)
+ assert to_match is not None, (
+ "{} invalid Unicode BCP 47 locale identifier".format(to_tag))
+ assert to_match.group("variants") is None, (
+ "unexpected variant subtags in {}".format(to_tag))
+
+ from_canonical = bcp47_canonical(from_match.group("language"),
+ from_match.group("script"),
+ from_match.group("region"))
+
+ to_canonical = bcp47_canonical(to_match.group("language"),
+ to_match.group("script"),
+ to_match.group("region"))
+
+ likely_subtags[from_canonical] = to_canonical
+
+ complex_region_mappings_final = {}
+
+ for (deprecated_region, replacements) in complex_region_mappings.items():
+ # Find all likely subtag entries which don't already contain a region
+ # subtag and whose target region is in the list of replacement regions.
+ region_likely_subtags = [(from_language, from_script, to_region)
+ for ((from_language, from_script, from_region),
+ (_, _, to_region)) in likely_subtags.items()
+ if from_region is None and to_region in replacements]
+
+ # The first replacement entry is the default region.
+ default = replacements[0]
+
+ # Find all likely subtag entries whose region matches the default region.
+ default_replacements = {(language, script)
+ for (language, script, region) in region_likely_subtags
+ if region == default}
+
+ # And finally find those entries which don't use the default region.
+ # These are the entries we're actually interested in, because those need
+ # to be handled specially when selecting the correct preferred region.
+ non_default_replacements = [(language, script, region)
+ for (language, script, region) in region_likely_subtags
+ if (language, script) not in default_replacements]
+
+ # If there are no non-default replacements, we can handle the region as
+ # part of the simple region mapping.
+ if non_default_replacements:
+ complex_region_mappings_final[deprecated_region] = (default, non_default_replacements)
+ else:
+ region_mappings[deprecated_region] = default
+
+ return {"grandfatheredMappings": grandfathered_mappings,
+ "languageMappings": language_mappings,
+ "complexLanguageMappings": complex_language_mappings,
+ "regionMappings": region_mappings,
+ "complexRegionMappings": complex_region_mappings_final,
+ "variantMappings": variant_mappings,
+ "likelySubtags": likely_subtags,
+ }
+
+def readUnicodeExtensions(core_file):
+ import xml.etree.ElementTree as ET
+
+ # Match all xml-files in the BCP 47 directory.
+ bcpFileRE = re.compile(r"^common/bcp47/.+\.xml$")
+
+ # https://www.unicode.org/reports/tr35/#Unicode_locale_identifier
+ #
+ # type = alphanum{3,8} (sep alphanum{3,8})* ;
+ typeRE = re.compile(r"^[a-z0-9]{3,8}(-[a-z0-9]{3,8})*$")
+
+ # Mapping from Unicode extension types to dict of deprecated to
+ # preferred values.
+ mapping = {
+ # Unicode BCP 47 U Extension
+ "u": {},
+
+ # Unicode BCP 47 T Extension
+ "t": {},
+ }
+ def readBCP47File(file):
+ tree = ET.parse(file)
+ for keyword in tree.iterfind(".//keyword/key"):
+ extension = keyword.get("extension", "u")
+ assert extension == "u" or extension == "t", (
+ "unknown extension type: {}".format(extension))
+
+ extension_name = keyword.get("name")
+
+ for type in keyword.iterfind("type"):
+ # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
+ #
+ # The key or type name used by Unicode locale extension with 'u' extension
+ # syntax or the 't' extensions syntax. When alias below is absent, this name
+ # can be also used with the old style "@key=type" syntax.
+ name = type.get("name")
+
+ # Ignore the special names:
+ # - <https://unicode.org/reports/tr35/#CODEPOINTS>
+ # - <https://unicode.org/reports/tr35/#REORDER_CODE>
+ # - <https://unicode.org/reports/tr35/#RG_KEY_VALUE>
+ # - <https://unicode.org/reports/tr35/#SUBDIVISION_CODE>
+ # - <https://unicode.org/reports/tr35/#PRIVATE_USE>
+ if name in ("CODEPOINTS", "REORDER_CODE", "RG_KEY_VALUE", "SUBDIVISION_CODE",
+ "PRIVATE_USE"):
+ continue
-def writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings):
+ # All other names should match the 'type' production.
+ assert typeRE.match(name) is not None, (
+ "{} matches the 'type' production".format(name))
+
+ # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
+ #
+ # The preferred value of the deprecated key, type or attribute element.
+ # When a key, type or attribute element is deprecated, this attribute is
+ # used for specifying a new canonical form if available.
+ preferred = type.get("preferred")
+
+ # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>:
+ #
+ # The BCP 47 form is the canonical form, and recommended. Other aliases are
+ # included only for backwards compatibility.
+ alias = type.get("alias")
+
+ # <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
+ #
+ # Use the bcp47 data to replace keys, types, tfields, and tvalues by their
+ # canonical forms. See Section 3.6.4 U Extension Data Files) and Section
+ # 3.7.1 T Extension Data Files. The aliases are in the alias attribute
+ # value, while the canonical is in the name attribute value.
+
+ # 'preferred' contains the new preferred name, 'alias' the compatibility
+ # name, but then there's this entry where 'preferred' and 'alias' are the
+ # same. So which one to choose? Assume 'preferred' is the actual canonical
+ # name.
+ #
+ # <type name="islamicc"
+ # description="Civil (algorithmic) Arabic calendar"
+ # deprecated="true"
+ # preferred="islamic-civil"
+ # alias="islamic-civil"/>
+
+ if preferred is not None:
+ assert typeRE.match(preferred), preferred
+ mapping[extension].setdefault(extension_name, {})[name] = preferred
+
+ if alias is not None:
+ for alias_name in alias.lower().split(" "):
+ # Ignore alias entries which don't match the 'type' production.
+ if typeRE.match(alias_name) is None:
+ continue
+
+ # See comment above when 'alias' and 'preferred' are both present.
+ if (preferred is not None and
+ name in mapping[extension][extension_name]):
+ continue
+
+ # Skip over entries where 'name' and 'alias' are equal.
+ #
+ # <type name="pst8pdt"
+ # description="POSIX style time zone for US Pacific Time"
+ # alias="PST8PDT"
+ # since="1.8"/>
+ if name == alias_name:
+ continue
+
+ mapping[extension].setdefault(extension_name, {})[alias_name] = name
+
+ def readSupplementalMetadata(file):
+ # Find subdivision and region replacements.
+ #
+ # <https://www.unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers>
+ #
+ # Replace aliases in special key values:
+ # - If there is an 'sd' or 'rg' key, replace any subdivision alias
+ # in its value in the same way, using subdivisionAlias data.
+ tree = ET.parse(file)
+ for alias in tree.iterfind(".//subdivisionAlias"):
+ type = alias.get("type")
+ assert typeRE.match(type) is not None, (
+ "{} matches the 'type' production".format(type))
+
+ # Take the first replacement when multiple ones are present.
+ replacement = alias.get("replacement").split(" ")[0].lower()
+
+ # Skip over invalid replacements.
+ #
+ # <subdivisionAlias type="fi01" replacement="AX" reason="overlong"/>
+ #
+ # It's not entirely clear to me whether CLDR actually wants to use
+ # "axzzzz" as the replacement for this case.
+ if typeRE.match(replacement) is None:
+ continue
+
+ # 'subdivisionAlias' applies to 'rg' and 'sd' keys.
+ mapping["u"].setdefault("rg", {})[type] = replacement
+ mapping["u"].setdefault("sd", {})[type] = replacement
+
+ for name in core_file.namelist():
+ if bcpFileRE.match(name):
+ readBCP47File(core_file.open(name))
+
+ readSupplementalMetadata(core_file.open("common/supplemental/supplementalMetadata.xml"))
+
+ return {
+ "unicodeMappings": mapping["u"],
+ "transformMappings": mapping["t"],
+ }
+
+def writeCLDRLanguageTagData(println, data, url):
""" Writes the language tag data to the Intl data file. """
- writeMappingsVar(intlData, langTagMappings, "langTagMappings",
- "Mappings from complete tags to preferred values", fileDate, url)
- writeMappingsVar(intlData, langSubtagMappings, "langSubtagMappings",
- "Mappings from non-extlang subtags to preferred values", fileDate, url)
- writeMappingsVar(intlData, extlangMappings, "extlangMappings",
- "Mappings from extlang subtags to preferred values", fileDate, url)
-
-def updateLangTags(args):
- """ Update the LangTagMappingsGenerated.js file. """
+
+ println(generatedFileWarning)
+ println(u"// Version: CLDR-{}".format(data["version"]))
+ println(u"// URL: {}".format(url))
+
+ println(u"""
+#include "mozilla/Assertions.h"
+#include "mozilla/Span.h"
+#include "mozilla/TextUtils.h"
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <string>
+#include <type_traits>
+
+#include "jscntxt.h"
+#include "jsstr.h"
+
+#include "builtin/intl/LanguageTag.h"
+
+using namespace js::intl::LanguageTagLimits;
+
+template <size_t Length, size_t TagLength, size_t SubtagLength>
+static inline bool HasReplacement(
+ const char (&subtags)[Length][TagLength],
+ const js::intl::LanguageTagSubtag<SubtagLength>& subtag) {
+ MOZ_ASSERT(subtag.length() == TagLength - 1,
+ "subtag must have the same length as the list of subtags");
+
+ const char* ptr = subtag.span().data();
+ return std::binary_search(std::begin(subtags), std::end(subtags), ptr,
+ [](const char* a, const char* b) {
+ return memcmp(a, b, TagLength - 1) < 0;
+ });
+}
+
+template <size_t Length, size_t TagLength, size_t SubtagLength>
+static inline const char* SearchReplacement(
+ const char (&subtags)[Length][TagLength],
+ const char* (&aliases)[Length],
+ const js::intl::LanguageTagSubtag<SubtagLength>& subtag) {
+ MOZ_ASSERT(subtag.length() == TagLength - 1,
+ "subtag must have the same length as the list of subtags");
+
+ const char* ptr = subtag.span().data();
+ auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr,
+ [](const char* a, const char* b) {
+ return memcmp(a, b, TagLength - 1) < 0;
+ });
+ if (p != std::end(subtags) && memcmp(*p, ptr, TagLength - 1) == 0) {
+ return aliases[std::distance(std::begin(subtags), p)];
+ }
+ return nullptr;
+}
+
+#ifdef DEBUG
+static bool IsAsciiLowercaseAlphanumeric(char c) {
+ return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c);
+}
+
+static bool IsAsciiLowercaseAlphanumericOrDash(char c) {
+ return IsAsciiLowercaseAlphanumeric(c) || c == '-';
+}
+
+static bool IsCanonicallyCasedLanguageTag(mozilla::Span<const char> span) {
+ // Tell the analysis the |std::all_of| function can't GC.
+ JS::AutoSuppressGCAnalysis nogc;
+
+ return std::all_of(span.begin(), span.end(), mozilla::IsAsciiLowercaseAlpha<char>);
+}
+
+static bool IsCanonicallyCasedRegionTag(mozilla::Span<const char> span) {
+ // Tell the analysis the |std::all_of| function can't GC.
+ JS::AutoSuppressGCAnalysis nogc;
+
+ return std::all_of(span.begin(), span.end(), mozilla::IsAsciiUppercaseAlpha<char>) ||
+ std::all_of(span.begin(), span.end(), mozilla::IsAsciiDigit<char>);
+}
+
+static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) {
+ // Tell the analysis the |std::all_of| function can't GC.
+ JS::AutoSuppressGCAnalysis nogc;
+
+ return std::all_of(span.begin(), span.end(), IsAsciiLowercaseAlphanumeric);
+}
+
+static bool IsCanonicallyCasedUnicodeKey(mozilla::Span<const char> key) {
+ return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric);
+}
+
+static bool IsCanonicallyCasedUnicodeType(mozilla::Span<const char> type) {
+ return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash);
+}
+
+static bool IsCanonicallyCasedTransformKey(mozilla::Span<const char> key) {
+ return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric);
+}
+
+static bool IsCanonicallyCasedTransformType(mozilla::Span<const char> type) {
+ return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash);
+}
+#endif
+""".rstrip())
+
+ source = u"CLDR Supplemental Data, version {}".format(data["version"])
+ grandfathered_mappings = data["grandfatheredMappings"]
+ language_mappings = data["languageMappings"]
+ complex_language_mappings = data["complexLanguageMappings"]
+ region_mappings = data["regionMappings"]
+ complex_region_mappings = data["complexRegionMappings"]
+ variant_mappings = data["variantMappings"]
+ unicode_mappings = data["unicodeMappings"]
+ transform_mappings = data["transformMappings"]
+
+ # unicode_language_subtag = alpha{2,3} | alpha{5,8} ;
+ language_maxlength = 8
+
+ # unicode_region_subtag = (alpha{2} | digit{3}) ;
+ region_maxlength = 3
+
+ writeMappingsBinarySearch(println, "languageMapping",
+ "LanguageSubtag&", "language",
+ "IsStructurallyValidLanguageTag",
+ "IsCanonicallyCasedLanguageTag",
+ language_mappings, language_maxlength,
+ "Mappings from language subtags to preferred values.", source, url)
+ writeMappingsBinarySearch(println, "complexLanguageMapping",
+ "const LanguageSubtag&", "language",
+ "IsStructurallyValidLanguageTag",
+ "IsCanonicallyCasedLanguageTag",
+ complex_language_mappings.keys(), language_maxlength,
+ "Language subtags with complex mappings.", source, url)
+ writeMappingsBinarySearch(println, "regionMapping",
+ "RegionSubtag&", "region",
+ "IsStructurallyValidRegionTag",
+ "IsCanonicallyCasedRegionTag",
+ region_mappings, region_maxlength,
+ "Mappings from region subtags to preferred values.", source, url)
+ writeMappingsBinarySearch(println, "complexRegionMapping",
+ "const RegionSubtag&", "region",
+ "IsStructurallyValidRegionTag",
+ "IsCanonicallyCasedRegionTag",
+ complex_region_mappings.keys(), region_maxlength,
+ "Region subtags with complex mappings.", source, url)
+
+ writeComplexLanguageTagMappings(println, complex_language_mappings,
+ "Language subtags with complex mappings.", source, url)
+ writeComplexRegionTagMappings(println, complex_region_mappings,
+ "Region subtags with complex mappings.", source, url)
+
+ writeVariantTagMappings(println, variant_mappings,
+ "Mappings from variant subtags to preferred values.", source, url)
+
+ writeGrandfatheredMappingsFunction(println, grandfathered_mappings,
+ "Canonicalize grandfathered locale identifiers.", source,
+ url)
+
+ writeUnicodeExtensionsMappings(println, unicode_mappings, "Unicode")
+ writeUnicodeExtensionsMappings(println, transform_mappings, "Transform")
+
+
+def writeCLDRLanguageTagLikelySubtagsTest(println, data, url):
+ """ Writes the likely-subtags test file. """
+
+ println(generatedFileWarning)
+
+ source = u"CLDR Supplemental Data, version {}".format(data["version"])
+ language_mappings = data["languageMappings"]
+ complex_language_mappings = data["complexLanguageMappings"]
+ region_mappings = data["regionMappings"]
+ complex_region_mappings = data["complexRegionMappings"]
+ likely_subtags = data["likelySubtags"]
+
+ def bcp47(tag):
+ (language, script, region) = tag
+ return "{}{}{}".format(language,
+ "-" + script if script else "",
+ "-" + region if region else "")
+
+ def canonical(tag):
+ (language, script, region) = tag
+
+ # Map deprecated language subtags.
+ if language in language_mappings:
+ language = language_mappings[language]
+ elif language in complex_language_mappings:
+ (language2, script2, region2) = complex_language_mappings[language]
+ (language, script, region) = (language2,
+ script if script else script2,
+ region if region else region2)
+
+ # Map deprecated region subtags.
+ if region in region_mappings:
+ region = region_mappings[region]
+ else:
+ # Assume no complex region mappings are needed for now.
+ assert region not in complex_region_mappings,\
+ "unexpected region with complex mappings: {}".format(region)
+
+ return (language, script, region)
+
+ # https://unicode.org/reports/tr35/#Likely_Subtags
+
+ def addLikelySubtags(tag):
+ # Step 1: Canonicalize.
+ (language, script, region) = canonical(tag)
+ if script == "Zzzz":
+ script = None
+ if region == "ZZ":
+ region = None
+
+ # Step 2: Lookup.
+ searches = ((language, script, region),
+ (language, None, region),
+ (language, script, None),
+ (language, None, None),
+ ("und", script, None))
+ search = next(search for search in searches if search in likely_subtags)
+
+ (language_s, script_s, region_s) = search
+ (language_m, script_m, region_m) = likely_subtags[search]
+
+ # Step 3: Return.
+ return (language if language != language_s else language_m,
+ script if script != script_s else script_m,
+ region if region != region_s else region_m)
+
+ # https://unicode.org/reports/tr35/#Likely_Subtags
+ def removeLikelySubtags(tag):
+ # Step 1: Add likely subtags.
+ max = addLikelySubtags(tag)
+
+ # Step 2: Remove variants (doesn't apply here).
+
+ # Step 3: Find a match.
+ (language, script, region) = max
+ for trial in ((language, None, None), (language, None, region), (language, script, None)):
+ if addLikelySubtags(trial) == max:
+ return trial
+
+ # Step 4: Return maximized if no match found.
+ return max
+
+ def likely_canonical(from_tag, to_tag):
+ # Canonicalize the input tag.
+ from_tag = canonical(from_tag)
+
+ # Update the expected result if necessary.
+ if from_tag in likely_subtags:
+ to_tag = likely_subtags[from_tag]
+
+ # Canonicalize the expected output.
+ to_canonical = canonical(to_tag)
+
+ # Sanity check: This should match the result of |addLikelySubtags|.
+ assert to_canonical == addLikelySubtags(from_tag)
+
+ return to_canonical
+
+ # |likely_subtags| contains non-canonicalized tags, so canonicalize it first.
+ likely_subtags_canonical = {k: likely_canonical(k, v) for (k, v) in likely_subtags.items()}
+
+ # Add test data for |Intl.Locale.prototype.maximize()|.
+ writeMappingsVar(println, {bcp47(k): bcp47(v) for (k, v) in likely_subtags_canonical.items()},
+ "maxLikelySubtags", "Extracted from likelySubtags.xml.", source, url)
+
+ # Use the maximized tags as the input for the remove likely-subtags test.
+ minimized = {tag: removeLikelySubtags(tag) for tag in likely_subtags_canonical.values()}
+
+ # Add test data for |Intl.Locale.prototype.minimize()|.
+ writeMappingsVar(println, {bcp47(k): bcp47(v) for (k, v) in minimized.items()},
+ "minLikelySubtags", "Extracted from likelySubtags.xml.", source, url)
+
+ println(u"""
+for (let [tag, maximal] of Object.entries(maxLikelySubtags)) {
+ assertEq(new Intl.Locale(tag).maximize().toString(), maximal);
+}""")
+
+ println(u"""
+for (let [tag, minimal] of Object.entries(minLikelySubtags)) {
+ assertEq(new Intl.Locale(tag).minimize().toString(), minimal);
+}""")
+
+ println(u"""
+if (typeof reportCompare === "function")
+ reportCompare(0, 0);""")
+
+
+def updateCLDRLangTags(args):
+ """ Update the LanguageTagGenerated.cpp file. """
+ version = args.version
url = args.url
out = args.out
filename = args.file
+ url = url.replace("<VERSION>", version)
+
print("Arguments:")
+ print("\tCLDR version: %s" % version)
print("\tDownload url: %s" % url)
- print("\tLocal registry: %s" % filename)
+ if filename is not None:
+ print("\tLocal CLDR core.zip file: %s" % filename)
print("\tOutput file: %s" % out)
print("")
+ data = {
+ "version": version,
+ }
+
+ def readFiles(cldr_file):
+ with ZipFile(cldr_file) as zip_file:
+ data.update(readSupplementalData(zip_file))
+ data.update(readUnicodeExtensions(zip_file))
+
+ print("Processing CLDR data...")
if filename is not None:
- print("Always make sure you have the newest language-subtag-registry.txt!")
- registry = codecs.open(filename, "r", encoding="utf-8")
+ print("Always make sure you have the newest CLDR core.zip!")
+ with open(filename, "rb") as cldr_file:
+ readFiles(cldr_file)
else:
- print("Downloading IANA Language Subtag Registry...")
- with closing(urllib2.urlopen(url)) as reader:
- text = reader.read().decode("utf-8")
- registry = codecs.open("language-subtag-registry.txt", "w+", encoding="utf-8")
- registry.write(text)
- registry.seek(0)
-
- print("Processing IANA Language Subtag Registry...")
- with closing(registry) as reg:
- data = readRegistry(reg)
- fileDate = data["fileDate"]
- langTagMappings = data["langTagMappings"]
- langSubtagMappings = data["langSubtagMappings"]
- extlangMappings = data["extlangMappings"]
+ print("Downloading CLDR core.zip...")
+ with closing(urllib2.urlopen(url)) as cldr_file:
+ cldr_data = io.BytesIO(cldr_file.read())
+ readFiles(cldr_data)
print("Writing Intl data...")
- with codecs.open(out, "w", encoding="utf-8") as intlData:
- intlData.write("// Generated by make_intl_data.py. DO NOT EDIT.\n")
- writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings)
+ with io.open(out, mode="w", encoding="utf-8", newline="") as f:
+ println = partial(print, file=f)
+ writeCLDRLanguageTagData(println, data, url)
+
+ print("Writing Intl test data...")
+ test_file = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+ "../../tests/non262/Intl/Locale/likely-subtags-generated.js")
+ with io.open(test_file, mode="w", encoding="utf-8", newline="") as f:
+ println = partial(print, file=f)
+
+ println(u"// |reftest| skip-if(!this.hasOwnProperty('Intl'))")
+ writeCLDRLanguageTagLikelySubtagsTest(println, data, url)
+
def flines(filepath, encoding="utf-8"):
""" Open filepath and iterate over its content. """
@@ -703,11 +1835,11 @@ def processTimeZones(tzdataDir, icuDir, icuTzDir, version, ignoreBackzone, ignor
println(u"// Format:")
println(u'// "LinkName", "Target" // ICU-Target [time zone file]')
- println(u"struct LinkAndTarget");
- println(u"{");
- println(u" const char* const link;");
- println(u" const char* const target;");
- println(u"};");
+ println(u"struct LinkAndTarget")
+ println(u"{")
+ println(u" const char* const link;")
+ println(u" const char* const target;")
+ println(u"};")
println(u"")
println(u"const LinkAndTarget ianaLinksCanonicalizedDifferentlyByICU[] = {")
for (zone, target, icuTarget) in incorrectLinks:
@@ -928,7 +2060,7 @@ def updateTzdata(topsrcdir, args):
if tzDir is None:
print("Downloading tzdata file...")
with closing(urllib2.urlopen(url)) as tzfile:
- fname = urlparse.urlsplit(tzfile.geturl()).path.split("/")[-1]
+ fname = urlsplit(tzfile.geturl()).path.split("/")[-1]
with tempfile.NamedTemporaryFile(suffix=fname) as tztmpfile:
print("File stored in %s" % tztmpfile.name)
tztmpfile.write(tzfile.read())
@@ -937,6 +2069,152 @@ def updateTzdata(topsrcdir, args):
else:
updateFrom(tzDir)
+def writeUnicodeExtensionsMappings(println, mapping, extension):
+ println(u"""
+template <size_t Length>
+static inline bool Is{0}Key(
+ mozilla::Span<const char> key, const char (&str)[Length]) {{
+ static_assert(Length == {0}KeyLength + 1,
+ "{0} extension key is two characters long");
+ return memcmp(key.data(), str, Length - 1) == 0;
+}}
+
+template <size_t Length>
+static inline bool Is{0}Type(
+ mozilla::Span<const char> type, const char (&str)[Length]) {{
+ static_assert(Length > {0}KeyLength + 1,
+ "{0} extension type contains more than two characters");
+ return type.size() == (Length - 1) &&
+ memcmp(type.data(), str, Length - 1) == 0;
+}}
+""".format(extension).rstrip("\n"))
+
+ linear_search_max_length = 4
+
+ needs_binary_search = any(len(replacements.items()) > linear_search_max_length
+ for replacements in mapping.values())
+
+ if needs_binary_search:
+ println(u"""
+static int32_t Compare{0}Type(const char* a, mozilla::Span<const char> b) {{
+ MOZ_ASSERT(!std::char_traits<char>::find(b.data(), b.size(), '\\0'),
+ "unexpected null-character in string");
+
+ using UnsignedChar = unsigned char;
+ for (size_t i = 0; i < b.size(); i++) {{
+ // |a| is zero-terminated and |b| doesn't contain a null-terminator. So if
+ // we've reached the end of |a|, the below if-statement will always be true.
+ // That ensures we don't read past the end of |a|.
+ if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) {{
+ return r;
+ }}
+ }}
+
+ // Return zero if both strings are equal or a negative number if |b| is a
+ // prefix of |a|.
+ return -int32_t(UnsignedChar(a[b.size()]));
+}}
+
+template <size_t Length>
+static inline const char* Search{0}Replacement(
+ const char* (&types)[Length], const char* (&aliases)[Length],
+ mozilla::Span<const char> type) {{
+
+ auto p = std::lower_bound(std::begin(types), std::end(types), type,
+ [](const auto& a, const auto& b) {{
+ return Compare{0}Type(a, b) < 0;
+ }});
+ if (p != std::end(types) && Compare{0}Type(*p, type) == 0) {{
+ return aliases[std::distance(std::begin(types), p)];
+ }}
+ return nullptr;
+}}
+""".format(extension).rstrip("\n"))
+
+ println(u"""
+/**
+ * Mapping from deprecated BCP 47 {0} extension types to their preferred
+ * values.
+ *
+ * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files
+ * Spec: https://www.unicode.org/reports/tr35/#t_Extension
+ */
+const char* js::intl::LanguageTag::replace{0}ExtensionType(
+ mozilla::Span<const char> key, mozilla::Span<const char> type) {{
+ MOZ_ASSERT(key.size() == {0}KeyLength);
+ MOZ_ASSERT(IsCanonicallyCased{0}Key(key));
+
+ MOZ_ASSERT(type.size() > {0}KeyLength);
+ MOZ_ASSERT(IsCanonicallyCased{0}Type(type));
+""".format(extension))
+
+ def to_hash_key(replacements):
+ return str(sorted(replacements.items()))
+
+ def write_array(subtags, name, length):
+ max_entries = (80 - len(" ")) // (length + len('"", '))
+
+ println(u" static const char* {}[{}] = {{".format(name, len(subtags)))
+
+ for entries in grouper(subtags, max_entries):
+ entries = (u"\"{}\"".format(tag).rjust(length + 2)
+ for tag in entries if tag is not None)
+ println(u" {},".format(u", ".join(entries)))
+
+ println(u" };")
+
+ # Merge duplicate keys.
+ key_aliases = {}
+ for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)):
+ hash_key = to_hash_key(replacements)
+ if hash_key not in key_aliases:
+ key_aliases[hash_key] = []
+ else:
+ key_aliases[hash_key].append(key)
+
+ first_key = True
+ for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)):
+ hash_key = to_hash_key(replacements)
+ if key in key_aliases[hash_key]:
+ continue
+
+ cond = (u"Is{}Key(key, \"{}\")".format(extension, k)
+ for k in [key] + key_aliases[hash_key])
+
+ if_kind = u"if" if first_key else u"else if"
+ cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond)
+ println(u"""
+ {} ({}) {{""".format(if_kind, cond).strip("\n"))
+ first_key = False
+
+ replacements = sorted(replacements.items(), key=itemgetter(0))
+
+ if len(replacements) > linear_search_max_length:
+ types = [t for (t, _) in replacements]
+ preferred = [r for (_, r) in replacements]
+ max_len = max(len(k) for k in types + preferred)
+
+ write_array(types, "types", max_len)
+ write_array(preferred, "aliases", max_len)
+ println(u"""
+ return Search{}Replacement(types, aliases, type);
+""".format(extension).strip("\n"))
+ else:
+ for (type, replacement) in replacements:
+ println(u"""
+ if (Is{}Type(type, "{}")) {{
+ return "{}";
+ }}""".format(extension, type, replacement).strip("\n"))
+
+ println(u"""
+ }""".lstrip("\n"))
+
+ println(u"""
+ return nullptr;
+}
+""".strip("\n"))
+
+
if __name__ == "__main__":
import argparse
@@ -955,20 +2233,24 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Update intl data.")
subparsers = parser.add_subparsers(help="Select update mode")
- parser_tags = subparsers.add_parser("langtags",
- help="Update language-subtag-registry")
- parser_tags.add_argument("--url",
- metavar="URL",
- default="https://www.iana.org/assignments/language-subtag-registry",
- type=EnsureHttps,
- help="Download url for language-subtag-registry.txt (default: %(default)s)")
- parser_tags.add_argument("--out",
- default="LangTagMappingsGenerated.js",
- help="Output file (default: %(default)s)")
- parser_tags.add_argument("file",
- nargs="?",
- help="Local language-subtag-registry.txt file, if omitted uses <URL>")
- parser_tags.set_defaults(func=updateLangTags)
+ parser_cldr_tags = subparsers.add_parser("langtags",
+ help="Update CLDR language tags data")
+ parser_cldr_tags.add_argument("--version",
+ metavar="VERSION",
+ required=True,
+ help="CLDR version number")
+ parser_cldr_tags.add_argument("--url",
+ metavar="URL",
+ default="https://unicode.org/Public/cldr/<VERSION>/core.zip",
+ type=EnsureHttps,
+ help="Download url CLDR data (default: %(default)s)")
+ parser_cldr_tags.add_argument("--out",
+ default="LanguageTagGenerated.cpp",
+ help="Output file (default: %(default)s)")
+ parser_cldr_tags.add_argument("file",
+ nargs="?",
+ help="Local cldr-core.zip file, if omitted uses <URL>")
+ parser_cldr_tags.set_defaults(func=updateCLDRLangTags)
parser_tz = subparsers.add_parser("tzdata", help="Update tzdata")
parser_tz.add_argument("--tz",