diff options
Diffstat (limited to 'js/src/builtin/intl/make_intl_data.py')
-rw-r--r-- | js/src/builtin/intl/make_intl_data.py | 1642 |
1 files changed, 1462 insertions, 180 deletions
diff --git a/js/src/builtin/intl/make_intl_data.py b/js/src/builtin/intl/make_intl_data.py index a81001e0f3..59ff14d76c 100644 --- a/js/src/builtin/intl/make_intl_data.py +++ b/js/src/builtin/intl/make_intl_data.py @@ -6,19 +6,15 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. """ Usage: - make_intl_data.py langtags [language-subtag-registry.txt] + make_intl_data.py langtags [cldr_core.zip] make_intl_data.py tzdata + make_intl_data.py unicode-ext Target "langtags": - This script extracts information about mappings between deprecated and - current BCP 47 language tags from the IANA Language Subtag Registry and - converts it to JavaScript object definitions in - LangTagMappingsGenerated.js. The definitions are used in Intl.js. - - The IANA Language Subtag Registry is imported from - https://www.iana.org/assignments/language-subtag-registry - and uses the syntax specified in - https://tools.ietf.org/html/rfc5646#section-3 + This script extracts information about 1) mappings between deprecated and + current Unicode BCP 47 locale identifiers, and 2) deprecated and current + BCP 47 Unicode extension value from CLDR, and converts it to C++ mapping + code in LanguageTagGenerated.cpp. The code is used in LanguageTag.cpp. 
Target "tzdata": @@ -36,194 +32,1330 @@ import sys import tarfile import tempfile import urllib2 -import urlparse from contextlib import closing from functools import partial -from itertools import chain, ifilter, ifilterfalse, imap, tee +from itertools import chain, ifilter, ifilterfalse, imap, izip_longest, groupby, tee from operator import attrgetter, itemgetter +from urlparse import urlsplit +from zipfile import ZipFile + +# From https://docs.python.org/3/library/itertools.html +def grouper(iterable, n, fillvalue=None): + "Collect data into fixed-length chunks or blocks" + # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx" + args = [iter(iterable)] * n + return izip_longest(*args, fillvalue=fillvalue) + +def writeMappingHeader(println, description, source, url): + if type(description) is not list: + description = [description] + for desc in description: + println(u"// {0}".format(desc)) + println(u"// Derived from {0}.".format(source)) + println(u"// {0}".format(url)) + +def writeMappingsVar(println, mapping, name, description, source, url): + """ Writes a variable definition with a mapping table. + + Writes the contents of dictionary |mapping| through the |println| + function with the given variable name and a comment with description, + source, and URL. + """ + println(u"") + writeMappingHeader(println, description, source, url) + println(u"var {0} = {{".format(name)) + for (key, value) in sorted(mapping.items(), key=itemgetter(0)): + println(u' "{0}": "{1}",'.format(key, value)) + println(u"};") + +def writeMappingsBinarySearch(println, fn_name, type_name, name, validate_fn, validate_case_fn, + mappings, tag_maxlength, description, source, url): + """ Emit code to perform a binary search on language tag subtags. + + Uses the contents of |mapping|, which can either be a dictionary or set, + to emit a mapping function to find subtag replacements. 
+ """ + println(u"") + writeMappingHeader(println, description, source, url) + println(u""" +bool js::intl::LanguageTag::{0}({1} {2}) {{ + MOZ_ASSERT({3}({2}.span())); + MOZ_ASSERT({4}({2}.span())); +""".format(fn_name, type_name, name, validate_fn, validate_case_fn).strip()) + + def write_array(subtags, name, length, fixed): + if fixed: + println(u" static const char {}[{}][{}] = {{".format(name, len(subtags), + length + 1)) + else: + println(u" static const char* {}[{}] = {{".format(name, len(subtags))) + + # Group in pairs of ten to not exceed the 80 line column limit. + for entries in grouper(subtags, 10): + entries = (u"\"{}\"".format(tag).rjust(length + 2) + for tag in entries if tag is not None) + println(u" {},".format(u", ".join(entries))) + + println(u" };") + + trailing_return = True + + # Sort the subtags by length. That enables using an optimized comparator + # for the binary search, which only performs a single |memcmp| for multiple + # of two subtag lengths. + mappings_keys = mappings.keys() if type(mappings) == dict else mappings + for (length, subtags) in groupby(sorted(mappings_keys, key=len), len): + # Omit the length check if the current length is the maximum length. + if length != tag_maxlength: + println(u""" + if ({}.length() == {}) {{ +""".format(name, length).rstrip("\n")) + else: + trailing_return = False + println(u""" + { +""".rstrip("\n")) + + # The subtags need to be sorted for binary search to work. + subtags = sorted(subtags) + + def equals(subtag): + return u"""{}.equalTo("{}")""".format(name, subtag) + + # Don't emit a binary search for short lists. 
+ if len(subtags) == 1: + if type(mappings) == dict: + println(u""" + if ({}) {{ + {}.set("{}"); + return true; + }} + return false; +""".format(equals(subtags[0]), name, mappings[subtags[0]]).strip("\n")) + else: + println(u""" + return {}; +""".format(equals(subtags[0])).strip("\n")) + elif len(subtags) <= 4: + if type(mappings) == dict: + for subtag in subtags: + println(u""" + if ({}) {{ + {}.set("{}"); + return true; + }} +""".format(equals(subtag), name, mappings[subtag]).strip("\n")) + + println(u""" + return false; +""".strip("\n")) + else: + cond = (equals(subtag) for subtag in subtags) + cond = (u" ||\n" + u" " * (4 + len("return "))).join(cond) + println(u""" + return {}; +""".format(cond).strip("\n")) + else: + write_array(subtags, name + "s", length, True) + + if type(mappings) == dict: + write_array([mappings[k] for k in subtags], u"aliases", length, False) + + println(u""" + if (const char* replacement = SearchReplacement({0}s, aliases, {0})) {{ + {0}.set(mozilla::MakeCStringSpan(replacement)); + return true; + }} + return false; +""".format(name).rstrip()) + else: + println(u""" + return HasReplacement({0}s, {0}); +""".format(name).rstrip()) -def readRegistryRecord(registry): - """ Yields the records of the IANA Language Subtag Registry as dictionaries. """ - record = {} - for line in registry: - line = line.strip() - if line == "": + println(u""" + } +""".strip("\n")) + + if trailing_return: + println(u""" + return false;""") + + println(u""" +}""".lstrip("\n")) + + +def writeComplexLanguageTagMappings(println, complex_language_mappings, + description, source, url): + println(u"") + writeMappingHeader(println, description, source, url) + println(u""" +void js::intl::LanguageTag::performComplexLanguageMappings() { + MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span())); + MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span())); +""".lstrip()) + + # Merge duplicate language entries. 
+ language_aliases = {} + for (deprecated_language, (language, script, region)) in ( + sorted(complex_language_mappings.items(), key=itemgetter(0)) + ): + key = (language, script, region) + if key not in language_aliases: + language_aliases[key] = [] + else: + language_aliases[key].append(deprecated_language) + + first_language = True + for (deprecated_language, (language, script, region)) in ( + sorted(complex_language_mappings.items(), key=itemgetter(0)) + ): + key = (language, script, region) + if deprecated_language in language_aliases[key]: continue - if line == "%%": - yield record - record = {} + + if_kind = u"if" if first_language else u"else if" + first_language = False + + cond = (u"language().equalTo(\"{}\")".format(lang) + for lang in [deprecated_language] + language_aliases[key]) + cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond) + + println(u""" + {} ({}) {{""".format(if_kind, cond).strip("\n")) + + println(u""" + setLanguage("{}");""".format(language).strip("\n")) + + if script is not None: + println(u""" + if (script().missing()) {{ + setScript("{}"); + }}""".format(script).strip("\n")) + if region is not None: + println(u""" + if (region().missing()) {{ + setRegion("{}"); + }}""".format(region).strip("\n")) + println(u""" + }""".strip("\n")) + + println(u""" +} +""".strip("\n")) + + +def writeComplexRegionTagMappings(println, complex_region_mappings, + description, source, url): + println(u"") + writeMappingHeader(println, description, source, url) + println(u""" +void js::intl::LanguageTag::performComplexRegionMappings() { + MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span())); + MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span())); + MOZ_ASSERT(IsStructurallyValidRegionTag(region().span())); + MOZ_ASSERT(IsCanonicallyCasedRegionTag(region().span())); +""".lstrip()) + + # |non_default_replacements| is a list and hence not hashable. Convert it + # to a string to get a proper hashable value. 
+ def hash_key(default, non_default_replacements): + return (default, str(sorted(str(v) for v in non_default_replacements))) + + # Merge duplicate region entries. + region_aliases = {} + for (deprecated_region, (default, non_default_replacements)) in ( + sorted(complex_region_mappings.items(), key=itemgetter(0)) + ): + key = hash_key(default, non_default_replacements) + if key not in region_aliases: + region_aliases[key] = [] else: - if ":" in line: - key, value = line.split(":", 1) - key, value = key.strip(), value.strip() - record[key] = value - else: - # continuation line - record[key] += " " + line - if record: - yield record - return + region_aliases[key].append(deprecated_region) + + first_region = True + for (deprecated_region, (default, non_default_replacements)) in ( + sorted(complex_region_mappings.items(), key=itemgetter(0)) + ): + key = hash_key(default, non_default_replacements) + if deprecated_region in region_aliases[key]: + continue + + if_kind = u"if" if first_region else u"else if" + first_region = False + + cond = (u"region().equalTo(\"{}\")".format(region) + for region in [deprecated_region] + region_aliases[key]) + cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond) + + println(u""" + {} ({}) {{""".format(if_kind, cond).strip("\n")) + + replacement_regions = sorted({region for (_, _, region) in non_default_replacements}) + + first_case = True + for replacement_region in replacement_regions: + replacement_language_script = sorted(((language, script) + for (language, script, region) in ( + non_default_replacements + ) + if region == replacement_region), + key=itemgetter(0)) + + if_kind = u"if" if first_case else u"else if" + first_case = False + + def compare_tags(language, script): + if script is None: + return u"language().equalTo(\"{}\")".format(language) + return u"(language().equalTo(\"{}\") && script().equalTo(\"{}\"))".format( + language, script) + + cond = (compare_tags(language, script) + for (language, script) in 
replacement_language_script) + cond = (u" ||\n" + u" " * (4 + len(if_kind) + 2)).join(cond) + + println(u""" + {} ({}) {{ + setRegion("{}"); + }}""".format(if_kind, cond, replacement_region).rstrip().strip("\n")) + + println(u""" + else {{ + setRegion("{}"); + }} + }}""".format(default).rstrip().strip("\n")) + + println(u""" +} +""".strip("\n")) + + +def writeVariantTagMappings(println, variant_mappings, description, source, + url): + """ Writes a function definition that maps variant subtags. """ + println(u""" +static const char* ToCharPointer(const char* str) { + return str; +} + +static const char* ToCharPointer(const js::UniqueChars& str) { + return str.get(); +} + +template <typename T, typename U = T> +static bool IsLessThan(const T& a, const U& b) { + return strcmp(ToCharPointer(a), ToCharPointer(b)) < 0; +} +""") + writeMappingHeader(println, description, source, url) + println(u""" +bool js::intl::LanguageTag::performVariantMappings(JSContext* cx) { + // The variant subtags need to be sorted for binary search. + MOZ_ASSERT(std::is_sorted(variants_.begin(), variants_.end(), + IsLessThan<decltype(variants_)::ElementType>)); + + auto insertVariantSortedIfNotPresent = [&](const char* variant) { + auto* p = std::lower_bound(variants_.begin(), variants_.end(), variant, + IsLessThan<decltype(variants_)::ElementType, + decltype(variant)>); + + // Don't insert the replacement when already present. + if (p != variants_.end() && strcmp(p->get(), variant) == 0) { + return true; + } + + // Insert the preferred variant in sort order. 
+ auto preferred = DuplicateString(cx, variant); + if (!preferred) { + return false; + } + return !!variants_.insert(p, std::move(preferred)); + }; + + for (size_t i = 0; i < variants_.length(); ) { + auto& variant = variants_[i]; + MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeCStringSpan(variant.get()))); +""".lstrip()) + + first_variant = True + + for (deprecated_variant, (type, replacement)) in ( + sorted(variant_mappings.items(), key=itemgetter(0)) + ): + if_kind = u"if" if first_variant else u"else if" + first_variant = False + + println(u""" + {} (strcmp(variant.get(), "{}") == 0) {{ + variants_.erase(variants_.begin() + i); +""".format(if_kind, deprecated_variant).strip("\n")) + + if type == "language": + println(u""" + setLanguage("{}"); +""".format(replacement).strip("\n")) + elif type == "region": + println(u""" + setRegion("{}"); +""".format(replacement).strip("\n")) + else: + assert type == "variant" + println(u""" + if (!insertVariantSortedIfNotPresent("{}")) {{ + return false; + }} +""".format(replacement).strip("\n")) + + println(u""" + } +""".strip("\n")) + + println(u""" + else { + i++; + } + } + return true; +} +""".strip("\n")) + + +def writeGrandfatheredMappingsFunction(println, grandfathered_mappings, + description, source, url): + """ Writes a function definition that maps grandfathered language tags. """ + println(u"") + writeMappingHeader(println, description, source, url) + println(u"""\ +bool js::intl::LanguageTag::updateGrandfatheredMappings(JSContext* cx) { + // We're mapping regular grandfathered tags to non-grandfathered form here. + // Other tags remain unchanged. 
+ // + // regular = "art-lojban" + // / "cel-gaulish" + // / "no-bok" + // / "no-nyn" + // / "zh-guoyu" + // / "zh-hakka" + // / "zh-min" + // / "zh-min-nan" + // / "zh-xiang" + // + // Therefore we can quickly exclude most tags by checking every + // |unicode_locale_id| subcomponent for characteristics not shared by any of + // the regular grandfathered (RG) tags: + // + // * Real-world |unicode_language_subtag|s are all two or three letters, + // so don't waste time running a useless |language.length > 3| fast-path. + // * No RG tag has a "script"-looking component. + // * No RG tag has a "region"-looking component. + // * The RG tags that match |unicode_locale_id| (art-lojban, cel-gaulish, + // zh-guoyu, zh-hakka, zh-xiang) have exactly one "variant". (no-bok, + // no-nyn, zh-min, and zh-min-nan require BCP47's extlang subtag + // that |unicode_locale_id| doesn't support.) + // * No RG tag contains |extensions| or |pu_extensions|. + if (script().present() || + region().present() || + variants().length() != 1 || + extensions().length() != 0 || + privateuse()) { + return true; + } + + MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span())); + MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeCStringSpan(variants()[0].get()))); + + auto variantEqualTo = [this](const char* variant) { + return strcmp(variants()[0].get(), variant) == 0; + };""") + + # From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>. + # + # Doesn't allow any 'extensions' subtags. + re_unicode_locale_id = re.compile( + r""" + ^ + # unicode_language_id = unicode_language_subtag + # unicode_language_subtag = alpha{2,3} | alpha{5,8} + (?P<language>[a-z]{2,3}|[a-z]{5,8}) + + # (sep unicode_script_subtag)? + # unicode_script_subtag = alpha{4} + (?:-(?P<script>[a-z]{4}))? + + # (sep unicode_region_subtag)? + # unicode_region_subtag = (alpha{2} | digit{3}) + (?:-(?P<region>([a-z]{2}|[0-9]{3})))? 
+ + # (sep unicode_variant_subtag)* + # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) + (?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)? + + # pu_extensions? + # pu_extensions = sep [xX] (sep alphanum{1,8})+ + (?:-(?P<privateuse>x(-[a-z0-9]{1,8})+))? + $ + """, re.IGNORECASE | re.VERBOSE) + + is_first = True + + for (tag, modern) in sorted(grandfathered_mappings.items(), key=itemgetter(0)): + tag_match = re_unicode_locale_id.match(tag) + assert tag_match is not None + + tag_language = tag_match.group("language") + assert tag_match.group("script") is None, ( + "{} does not contain a script subtag".format(tag)) + assert tag_match.group("region") is None, ( + "{} does not contain a region subtag".format(tag)) + tag_variants = tag_match.group("variants") + assert tag_variants is not None, ( + "{} contains a variant subtag".format(tag)) + assert tag_match.group("privateuse") is None, ( + "{} does not contain a privateuse subtag".format(tag)) + + tag_variant = tag_variants[1:] + assert "-" not in tag_variant, ( + "{} contains only a single variant".format(tag)) + + modern_match = re_unicode_locale_id.match(modern) + assert modern_match is not None + + modern_language = modern_match.group("language") + modern_script = modern_match.group("script") + modern_region = modern_match.group("region") + modern_variants = modern_match.group("variants") + modern_privateuse = modern_match.group("privateuse") + + println(u""" + // {} -> {} +""".format(tag, modern).rstrip()) + + println(u""" + {}if (language().equalTo("{}") && variantEqualTo("{}")) {{ + """.format("" if is_first else "else ", + tag_language, + tag_variant).rstrip().strip("\n")) + + is_first = False + + println(u""" + setLanguage("{}"); + """.format(modern_language).rstrip().strip("\n")) + + if modern_script is not None: + println(u""" + setScript("{}"); + """.format(modern_script).rstrip().strip("\n")) + + if modern_region is not None: + println(u""" + setRegion("{}"); + 
""".format(modern_region).rstrip().strip("\n")) + + assert modern_variants is None, ( + "all regular grandfathered tags' modern forms do not contain variant subtags") + + println(u""" + clearVariants(); + """.rstrip().strip("\n")) + + if modern_privateuse is not None: + println(u""" + auto privateuse = DuplicateString(cx, "{}"); + if (!privateuse) {{ + return false; + }} + setPrivateuse(std::move(privateuse)); + """.format(modern_privateuse).rstrip().rstrip("\n")) + + println(u""" + return true; + }""".rstrip().strip("\n")) + println(u""" + return true; +}""") -def readRegistry(registry): - """ Reads IANA Language Subtag Registry and extracts information for Intl.js. + +def readSupplementalData(core_file): + """ Reads CLDR Supplemental Data and extracts information for Intl.js. Information extracted: - - langTagMappings: mappings from complete language tags to preferred + - grandfatheredMappings: mappings from grandfathered tags to preferred complete language tags - - langSubtagMappings: mappings from subtags to preferred subtags - - extlangMappings: mappings from extlang subtags to preferred subtags, - with prefix to be removed - Returns these three mappings as dictionaries, along with the registry's - file date. - - We also check that mappings for language subtags don't affect extlang - subtags and vice versa, so that CanonicalizeLanguageTag doesn't have - to separate them for processing. Region codes are separated by case, - and script codes by length, so they're unproblematic. + - languageMappings: mappings from language subtags to preferred subtags + - complexLanguageMappings: mappings from language subtags with complex rules + - regionMappings: mappings from region subtags to preferred subtags + - complexRegionMappings: mappings from region subtags with complex rules + - variantMappings: mappings from variant subtags to preferred subtags + - likelySubtags: likely subtags used for generating test data only + Returns these mappings as dictionaries. 
""" - langTagMappings = {} - langSubtagMappings = {} - extlangMappings = {} - languageSubtags = set() - extlangSubtags = set() - - for record in readRegistryRecord(registry): - if "File-Date" in record: - fileDate = record["File-Date"] + import xml.etree.ElementTree as ET + + # From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>. + re_unicode_language_id = re.compile( + r""" + ^ + # unicode_language_id = unicode_language_subtag + # unicode_language_subtag = alpha{2,3} | alpha{5,8} + (?P<language>[a-z]{2,3}|[a-z]{5,8}) + + # (sep unicode_script_subtag)? + # unicode_script_subtag = alpha{4} + (?:-(?P<script>[a-z]{4}))? + + # (sep unicode_region_subtag)? + # unicode_region_subtag = (alpha{2} | digit{3}) + (?:-(?P<region>([a-z]{2}|[0-9]{3})))? + + # (sep unicode_variant_subtag)* + # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) + (?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)? + $ + """, re.IGNORECASE | re.VERBOSE) + + re_unicode_language_subtag = re.compile( + r""" + ^ + # unicode_language_subtag = alpha{2,3} | alpha{5,8} + ([a-z]{2,3}|[a-z]{5,8}) + $ + """, re.IGNORECASE | re.VERBOSE) + + re_unicode_region_subtag = re.compile( + r""" + ^ + # unicode_region_subtag = (alpha{2} | digit{3}) + ([a-z]{2}|[0-9]{3}) + $ + """, re.IGNORECASE | re.VERBOSE) + + re_unicode_variant_subtag = re.compile( + r""" + ^ + # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) + ([a-z0-9]{5,8}|(?:[0-9][a-z0-9]{3})) + $ + """, re.IGNORECASE | re.VERBOSE) + + # The fixed list of BCP 47 grandfathered language tags. 
+ grandfathered_tags = ( + "art-lojban", + "cel-gaulish", + "en-GB-oed", + "i-ami", + "i-bnn", + "i-default", + "i-enochian", + "i-hak", + "i-klingon", + "i-lux", + "i-mingo", + "i-navajo", + "i-pwn", + "i-tao", + "i-tay", + "i-tsu", + "no-bok", + "no-nyn", + "sgn-BE-FR", + "sgn-BE-NL", + "sgn-CH-DE", + "zh-guoyu", + "zh-hakka", + "zh-min", + "zh-min-nan", + "zh-xiang", + ) + + # The list of grandfathered tags which are valid Unicode BCP 47 locale identifiers. + unicode_bcp47_grandfathered_tags = {tag for tag in grandfathered_tags + if re_unicode_language_id.match(tag)} + + # Dictionary of simple language subtag mappings, e.g. "in" -> "id". + language_mappings = {} + + # Dictionary of complex language subtag mappings, modifying more than one + # subtag, e.g. "sh" -> ("sr", "Latn", None) and "cnr" -> ("sr", None, "ME"). + complex_language_mappings = {} + + # Dictionary of simple region subtag mappings, e.g. "DD" -> "DE". + region_mappings = {} + + # Dictionary of complex region subtag mappings, containing more than one + # replacement, e.g. "SU" -> ("RU", ["AM", "AZ", "BY", ...]). + complex_region_mappings = {} + + # Dictionary of aliased variant subtags to a tuple of preferred replacement + # type and replacement, e.g. "arevela" -> ("language", "hy") or + # "aaland" -> ("region", "AX") or "heploc" -> ("variant", "alalc97"). + variant_mappings = {} + + # Dictionary of grandfathered mappings to preferred values. + grandfathered_mappings = {} + + # CLDR uses "_" as the separator for some elements. Replace it with "-". + def bcp47_id(cldr_id): + return cldr_id.replace("_", "-") + + # CLDR uses the canonical case for most entries, but there are some + # exceptions, like: + # <languageAlias type="drw" replacement="fa_af" reason="deprecated"/> + # Therefore canonicalize all tags to be on the safe side. + def bcp47_canonical(language, script, region): + # Canonical case for language subtags is lower case. 
+ # Canonical case for script subtags is title case. + # Canonical case for region subtags is upper case. + return (language.lower() if language else None, + script.title() if script else None, + region.upper() if region else None) + + tree = ET.parse(core_file.open("common/supplemental/supplementalMetadata.xml")) + + for language_alias in tree.iterfind(".//languageAlias"): + type = bcp47_id(language_alias.get("type")) + replacement = bcp47_id(language_alias.get("replacement")) + + # Handle grandfathered mappings first. + if type in unicode_bcp47_grandfathered_tags: + grandfathered_mappings[type] = replacement continue - if record["Type"] == "grandfathered": - # Grandfathered tags don't use standard syntax, so - # CanonicalizeLanguageTag expects the mapping table to provide - # the final form for all. - # For langTagMappings, keys must be in lower case; values in - # the case used in the registry. - tag = record["Tag"] - if "Preferred-Value" in record: - langTagMappings[tag.lower()] = record["Preferred-Value"] - else: - langTagMappings[tag.lower()] = tag - elif record["Type"] == "redundant": - # For langTagMappings, keys must be in lower case; values in - # the case used in the registry. - if "Preferred-Value" in record: - langTagMappings[record["Tag"].lower()] = record["Preferred-Value"] - elif record["Type"] in ("language", "script", "region", "variant"): - # For langSubtagMappings, keys and values must be in the case used - # in the registry. - subtag = record["Subtag"] - if record["Type"] == "language": - languageSubtags.add(subtag) - if "Preferred-Value" in record: - if subtag == "heploc": - # The entry for heploc is unique in its complexity; handle - # it as special case below. - continue - if "Prefix" in record: - # This might indicate another heploc-like complex case. 
- raise Exception("Please evaluate: subtag mapping with prefix value.") - langSubtagMappings[subtag] = record["Preferred-Value"] - elif record["Type"] == "extlang": - # For extlangMappings, keys must be in the case used in the - # registry; values are records with the preferred value and the - # prefix to be removed. - subtag = record["Subtag"] - extlangSubtags.add(subtag) - if "Preferred-Value" in record: - preferred = record["Preferred-Value"] - prefix = record["Prefix"] - extlangMappings[subtag] = {"preferred": preferred, "prefix": prefix} + # We're only interested in language subtag matches, so ignore any + # entries which have additional subtags. + if re_unicode_language_subtag.match(type) is None: + continue + + assert type.islower() + + if re_unicode_language_subtag.match(replacement) is not None: + # Canonical case for language subtags is lower-case. + language_mappings[type] = replacement.lower() else: - # No other types are allowed by - # https://tools.ietf.org/html/rfc5646#section-3.1.3 - assert False, "Unrecognized Type: {0}".format(record["Type"]) + replacement_match = re_unicode_language_id.match(replacement) + assert replacement_match is not None, ( + "{} invalid Unicode BCP 47 locale identifier".format(replacement)) + assert replacement_match.group("variants") is None, ( + "{}: unexpected variant subtags in {}".format(type, replacement)) + + complex_language_mappings[type] = bcp47_canonical(replacement_match.group("language"), + replacement_match.group("script"), + replacement_match.group("region")) + + for territory_alias in tree.iterfind(".//territoryAlias"): + type = territory_alias.get("type") + replacement = territory_alias.get("replacement") + + # We're only interested in region subtag matches, so ignore any entries + # which contain legacy formats, e.g. three letter region codes. + if re_unicode_region_subtag.match(type) is None: + continue - # Check that mappings for language subtags and extlang subtags don't affect - # each other. 
- for lang in languageSubtags: - if lang in extlangMappings and extlangMappings[lang]["preferred"] != lang: - raise Exception("Conflict: lang with extlang mapping: " + lang) - for extlang in extlangSubtags: - if extlang in langSubtagMappings: - raise Exception("Conflict: extlang with lang mapping: " + extlang) + assert type.isupper() or type.isdigit() - # Special case for heploc. - langTagMappings["ja-latn-hepburn-heploc"] = "ja-Latn-alalc97" + if re_unicode_region_subtag.match(replacement) is not None: + # Canonical case for region subtags is upper-case. + region_mappings[type] = replacement.upper() + else: + # Canonical case for region subtags is upper-case. + replacements = [r.upper() for r in replacement.split(" ")] + assert all( + re_unicode_region_subtag.match(loc) is not None for loc in replacements + ), "{} invalid region subtags".format(replacement) + complex_region_mappings[type] = replacements - return {"fileDate": fileDate, - "langTagMappings": langTagMappings, - "langSubtagMappings": langSubtagMappings, - "extlangMappings": extlangMappings} + for variant_alias in tree.iterfind(".//variantAlias"): + type = variant_alias.get("type") + replacement = variant_alias.get("replacement") + assert re_unicode_variant_subtag.match(type) is not None, ( + "{} invalid variant subtag".format(type)) -def writeMappingsVar(intlData, dict, name, description, fileDate, url): - """ Writes a variable definition with a mapping table to file intlData. + # Normalize the case, because some variants are in upper case. + type = type.lower() + + # The replacement can be a language, a region, or a variant subtag. + # Language and region subtags are case normalized, variant subtags can + # be in any case. 
+ + if re_unicode_language_subtag.match(replacement) is not None and replacement.islower(): + variant_mappings[type] = ("language", replacement) + + elif re_unicode_region_subtag.match(replacement) is not None: + assert replacement.isupper() or replacement.isdigit(), ( + "{} invalid variant subtag replacement".format(replacement)) + variant_mappings[type] = ("region", replacement) - Writes the contents of dictionary dict to file intlData with the given - variable name and a comment with description, fileDate, and URL. - """ - intlData.write("\n") - intlData.write("// {0}.\n".format(description)) - intlData.write("// Derived from IANA Language Subtag Registry, file date {0}.\n".format(fileDate)) - intlData.write("// {0}\n".format(url)) - intlData.write("var {0} = {{\n".format(name)) - keys = sorted(dict) - for key in keys: - if isinstance(dict[key], basestring): - value = '"{0}"'.format(dict[key]) else: - preferred = dict[key]["preferred"] - prefix = dict[key]["prefix"] - value = '{{preferred: "{0}", prefix: "{1}"}}'.format(preferred, prefix) - intlData.write(' "{0}": {1},\n'.format(key, value)) - intlData.write("};\n") + assert re_unicode_variant_subtag.match(replacement) is not None, ( + "{} invalid variant subtag replacement".format(replacement)) + variant_mappings[type] = ("variant", replacement.lower()) + + tree = ET.parse(core_file.open("common/supplemental/likelySubtags.xml")) + + likely_subtags = {} + + for likely_subtag in tree.iterfind(".//likelySubtag"): + from_tag = bcp47_id(likely_subtag.get("from")) + from_match = re_unicode_language_id.match(from_tag) + assert from_match is not None, ( + "{} invalid Unicode BCP 47 locale identifier".format(from_tag)) + assert from_match.group("variants") is None, ( + "unexpected variant subtags in {}".format(from_tag)) + + to_tag = bcp47_id(likely_subtag.get("to")) + to_match = re_unicode_language_id.match(to_tag) + assert to_match is not None, ( + "{} invalid Unicode BCP 47 locale identifier".format(to_tag)) + assert 
to_match.group("variants") is None, ( + "unexpected variant subtags in {}".format(to_tag)) + + from_canonical = bcp47_canonical(from_match.group("language"), + from_match.group("script"), + from_match.group("region")) + + to_canonical = bcp47_canonical(to_match.group("language"), + to_match.group("script"), + to_match.group("region")) + + likely_subtags[from_canonical] = to_canonical + + complex_region_mappings_final = {} + + for (deprecated_region, replacements) in complex_region_mappings.items(): + # Find all likely subtag entries which don't already contain a region + # subtag and whose target region is in the list of replacement regions. + region_likely_subtags = [(from_language, from_script, to_region) + for ((from_language, from_script, from_region), + (_, _, to_region)) in likely_subtags.items() + if from_region is None and to_region in replacements] + + # The first replacement entry is the default region. + default = replacements[0] + + # Find all likely subtag entries whose region matches the default region. + default_replacements = {(language, script) + for (language, script, region) in region_likely_subtags + if region == default} + + # And finally find those entries which don't use the default region. + # These are the entries we're actually interested in, because those need + # to be handled specially when selecting the correct preferred region. + non_default_replacements = [(language, script, region) + for (language, script, region) in region_likely_subtags + if (language, script) not in default_replacements] + + # If there are no non-default replacements, we can handle the region as + # part of the simple region mapping. 
+ if non_default_replacements: + complex_region_mappings_final[deprecated_region] = (default, non_default_replacements) + else: + region_mappings[deprecated_region] = default + + return {"grandfatheredMappings": grandfathered_mappings, + "languageMappings": language_mappings, + "complexLanguageMappings": complex_language_mappings, + "regionMappings": region_mappings, + "complexRegionMappings": complex_region_mappings_final, + "variantMappings": variant_mappings, + "likelySubtags": likely_subtags, + } + +def readUnicodeExtensions(core_file): + import xml.etree.ElementTree as ET + + # Match all xml-files in the BCP 47 directory. + bcpFileRE = re.compile(r"^common/bcp47/.+\.xml$") + + # https://www.unicode.org/reports/tr35/#Unicode_locale_identifier + # + # type = alphanum{3,8} (sep alphanum{3,8})* ; + typeRE = re.compile(r"^[a-z0-9]{3,8}(-[a-z0-9]{3,8})*$") + + # Mapping from Unicode extension types to dict of deprecated to + # preferred values. + mapping = { + # Unicode BCP 47 U Extension + "u": {}, + + # Unicode BCP 47 T Extension + "t": {}, + } + def readBCP47File(file): + tree = ET.parse(file) + for keyword in tree.iterfind(".//keyword/key"): + extension = keyword.get("extension", "u") + assert extension == "u" or extension == "t", ( + "unknown extension type: {}".format(extension)) + + extension_name = keyword.get("name") + + for type in keyword.iterfind("type"): + # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>: + # + # The key or type name used by Unicode locale extension with 'u' extension + # syntax or the 't' extensions syntax. When alias below is absent, this name + # can be also used with the old style "@key=type" syntax. 
+ name = type.get("name") + + # Ignore the special name: + # - <https://unicode.org/reports/tr35/#CODEPOINTS> + # - <https://unicode.org/reports/tr35/#REORDER_CODE> + # - <https://unicode.org/reports/tr35/#RG_KEY_VALUE> + # - <https://unicode.org/reports/tr35/#SUBDIVISION_CODE> + # - <https://unicode.org/reports/tr35/#PRIVATE_USE> + if name in ("CODEPOINTS", "REORDER_CODE", "RG_KEY_VALUE", "SUBDIVISION_CODE", + "PRIVATE_USE"): + continue -def writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings): + # All other names should match the 'type' production. + assert typeRE.match(name) is not None, ( + "{} matches the 'type' production".format(name)) + + # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>: + # + # The preferred value of the deprecated key, type or attribute element. + # When a key, type or attribute element is deprecated, this attribute is + # used for specifying a new canonical form if available. + preferred = type.get("preferred") + + # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>: + # + # The BCP 47 form is the canonical form, and recommended. Other aliases are + # included only for backwards compatibility. + alias = type.get("alias") + + # <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers> + # + # Use the bcp47 data to replace keys, types, tfields, and tvalues by their + # canonical forms. See Section 3.6.4 U Extension Data Files) and Section + # 3.7.1 T Extension Data Files. The aliases are in the alias attribute + # value, while the canonical is in the name attribute value. + + # 'preferred' contains the new preferred name, 'alias' the compatibility + # name, but then there's this entry where 'preferred' and 'alias' are the + # same. So which one to choose? Assume 'preferred' is the actual canonical + # name. 
+ # + # <type name="islamicc" + # description="Civil (algorithmic) Arabic calendar" + # deprecated="true" + # preferred="islamic-civil" + # alias="islamic-civil"/> + + if preferred is not None: + assert typeRE.match(preferred), preferred + mapping[extension].setdefault(extension_name, {})[name] = preferred + + if alias is not None: + for alias_name in alias.lower().split(" "): + # Ignore alias entries which don't match the 'type' production. + if typeRE.match(alias_name) is None: + continue + + # See comment above when 'alias' and 'preferred' are both present. + if (preferred is not None and + name in mapping[extension][extension_name]): + continue + + # Skip over entries where 'name' and 'alias' are equal. + # + # <type name="pst8pdt" + # description="POSIX style time zone for US Pacific Time" + # alias="PST8PDT" + # since="1.8"/> + if name == alias_name: + continue + + mapping[extension].setdefault(extension_name, {})[alias_name] = name + + def readSupplementalMetadata(file): + # Find subdivision and region replacements. + # + # <https://www.unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers> + # + # Replace aliases in special key values: + # - If there is an 'sd' or 'rg' key, replace any subdivision alias + # in its value in the same way, using subdivisionAlias data. + tree = ET.parse(file) + for alias in tree.iterfind(".//subdivisionAlias"): + type = alias.get("type") + assert typeRE.match(type) is not None, ( + "{} matches the 'type' production".format(type)) + + # Take the first replacement when multiple ones are present. + replacement = alias.get("replacement").split(" ")[0].lower() + + # Skip over invalid replacements. + # + # <subdivisionAlias type="fi01" replacement="AX" reason="overlong"/> + # + # It's not entirely clear to me if CLDR actually wants to use + # "axzzzz" as the replacement for this case. + if typeRE.match(replacement) is None: + continue + + # 'subdivisionAlias' applies to 'rg' and 'sd' keys. 
+ mapping["u"].setdefault("rg", {})[type] = replacement + mapping["u"].setdefault("sd", {})[type] = replacement + + for name in core_file.namelist(): + if bcpFileRE.match(name): + readBCP47File(core_file.open(name)) + + readSupplementalMetadata(core_file.open("common/supplemental/supplementalMetadata.xml")) + + return { + "unicodeMappings": mapping["u"], + "transformMappings": mapping["t"], + } + +def writeCLDRLanguageTagData(println, data, url): """ Writes the language tag data to the Intl data file. """ - writeMappingsVar(intlData, langTagMappings, "langTagMappings", - "Mappings from complete tags to preferred values", fileDate, url) - writeMappingsVar(intlData, langSubtagMappings, "langSubtagMappings", - "Mappings from non-extlang subtags to preferred values", fileDate, url) - writeMappingsVar(intlData, extlangMappings, "extlangMappings", - "Mappings from extlang subtags to preferred values", fileDate, url) - -def updateLangTags(args): - """ Update the LangTagMappingsGenerated.js file. 
""" + + println(generatedFileWarning) + println(u"// Version: CLDR-{}".format(data["version"])) + println(u"// URL: {}".format(url)) + + println(u""" +#include "mozilla/Assertions.h" +#include "mozilla/Span.h" +#include "mozilla/TextUtils.h" + +#include <algorithm> +#include <cstdint> +#include <cstring> +#include <iterator> +#include <string> +#include <type_traits> + +#include "jscntxt.h" +#include "jsstr.h" + +#include "builtin/intl/LanguageTag.h" + +using namespace js::intl::LanguageTagLimits; + +template <size_t Length, size_t TagLength, size_t SubtagLength> +static inline bool HasReplacement( + const char (&subtags)[Length][TagLength], + const js::intl::LanguageTagSubtag<SubtagLength>& subtag) { + MOZ_ASSERT(subtag.length() == TagLength - 1, + "subtag must have the same length as the list of subtags"); + + const char* ptr = subtag.span().data(); + return std::binary_search(std::begin(subtags), std::end(subtags), ptr, + [](const char* a, const char* b) { + return memcmp(a, b, TagLength - 1) < 0; + }); +} + +template <size_t Length, size_t TagLength, size_t SubtagLength> +static inline const char* SearchReplacement( + const char (&subtags)[Length][TagLength], + const char* (&aliases)[Length], + const js::intl::LanguageTagSubtag<SubtagLength>& subtag) { + MOZ_ASSERT(subtag.length() == TagLength - 1, + "subtag must have the same length as the list of subtags"); + + const char* ptr = subtag.span().data(); + auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr, + [](const char* a, const char* b) { + return memcmp(a, b, TagLength - 1) < 0; + }); + if (p != std::end(subtags) && memcmp(*p, ptr, TagLength - 1) == 0) { + return aliases[std::distance(std::begin(subtags), p)]; + } + return nullptr; +} + +#ifdef DEBUG +static bool IsAsciiLowercaseAlphanumeric(char c) { + return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c); +} + +static bool IsAsciiLowercaseAlphanumericOrDash(char c) { + return IsAsciiLowercaseAlphanumeric(c) || c == '-'; 
+} + +static bool IsCanonicallyCasedLanguageTag(mozilla::Span<const char> span) { + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + return std::all_of(span.begin(), span.end(), mozilla::IsAsciiLowercaseAlpha<char>); +} + +static bool IsCanonicallyCasedRegionTag(mozilla::Span<const char> span) { + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + return std::all_of(span.begin(), span.end(), mozilla::IsAsciiUppercaseAlpha<char>) || + std::all_of(span.begin(), span.end(), mozilla::IsAsciiDigit<char>); +} + +static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) { + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + return std::all_of(span.begin(), span.end(), IsAsciiLowercaseAlphanumeric); +} + +static bool IsCanonicallyCasedUnicodeKey(mozilla::Span<const char> key) { + return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric); +} + +static bool IsCanonicallyCasedUnicodeType(mozilla::Span<const char> type) { + return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash); +} + +static bool IsCanonicallyCasedTransformKey(mozilla::Span<const char> key) { + return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric); +} + +static bool IsCanonicallyCasedTransformType(mozilla::Span<const char> type) { + return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash); +} +#endif +""".rstrip()) + + source = u"CLDR Supplemental Data, version {}".format(data["version"]) + grandfathered_mappings = data["grandfatheredMappings"] + language_mappings = data["languageMappings"] + complex_language_mappings = data["complexLanguageMappings"] + region_mappings = data["regionMappings"] + complex_region_mappings = data["complexRegionMappings"] + variant_mappings = data["variantMappings"] + unicode_mappings = data["unicodeMappings"] + transform_mappings = 
data["transformMappings"] + + # unicode_language_subtag = alpha{2,3} | alpha{5,8} ; + language_maxlength = 8 + + # unicode_region_subtag = (alpha{2} | digit{3}) ; + region_maxlength = 3 + + writeMappingsBinarySearch(println, "languageMapping", + "LanguageSubtag&", "language", + "IsStructurallyValidLanguageTag", + "IsCanonicallyCasedLanguageTag", + language_mappings, language_maxlength, + "Mappings from language subtags to preferred values.", source, url) + writeMappingsBinarySearch(println, "complexLanguageMapping", + "const LanguageSubtag&", "language", + "IsStructurallyValidLanguageTag", + "IsCanonicallyCasedLanguageTag", + complex_language_mappings.keys(), language_maxlength, + "Language subtags with complex mappings.", source, url) + writeMappingsBinarySearch(println, "regionMapping", + "RegionSubtag&", "region", + "IsStructurallyValidRegionTag", + "IsCanonicallyCasedRegionTag", + region_mappings, region_maxlength, + "Mappings from region subtags to preferred values.", source, url) + writeMappingsBinarySearch(println, "complexRegionMapping", + "const RegionSubtag&", "region", + "IsStructurallyValidRegionTag", + "IsCanonicallyCasedRegionTag", + complex_region_mappings.keys(), region_maxlength, + "Region subtags with complex mappings.", source, url) + + writeComplexLanguageTagMappings(println, complex_language_mappings, + "Language subtags with complex mappings.", source, url) + writeComplexRegionTagMappings(println, complex_region_mappings, + "Region subtags with complex mappings.", source, url) + + writeVariantTagMappings(println, variant_mappings, + "Mappings from variant subtags to preferred values.", source, url) + + writeGrandfatheredMappingsFunction(println, grandfathered_mappings, + "Canonicalize grandfathered locale identifiers.", source, + url) + + writeUnicodeExtensionsMappings(println, unicode_mappings, "Unicode") + writeUnicodeExtensionsMappings(println, transform_mappings, "Transform") + + +def writeCLDRLanguageTagLikelySubtagsTest(println, data, 
url): + """ Writes the likely-subtags test file. """ + + println(generatedFileWarning) + + source = u"CLDR Supplemental Data, version {}".format(data["version"]) + language_mappings = data["languageMappings"] + complex_language_mappings = data["complexLanguageMappings"] + region_mappings = data["regionMappings"] + complex_region_mappings = data["complexRegionMappings"] + likely_subtags = data["likelySubtags"] + + def bcp47(tag): + (language, script, region) = tag + return "{}{}{}".format(language, + "-" + script if script else "", + "-" + region if region else "") + + def canonical(tag): + (language, script, region) = tag + + # Map deprecated language subtags. + if language in language_mappings: + language = language_mappings[language] + elif language in complex_language_mappings: + (language2, script2, region2) = complex_language_mappings[language] + (language, script, region) = (language2, + script if script else script2, + region if region else region2) + + # Map deprecated region subtags. + if region in region_mappings: + region = region_mappings[region] + else: + # Assume no complex region mappings are needed for now. + assert region not in complex_region_mappings,\ + "unexpected region with complex mappings: {}".format(region) + + return (language, script, region) + + # https://unicode.org/reports/tr35/#Likely_Subtags + + def addLikelySubtags(tag): + # Step 1: Canonicalize. + (language, script, region) = canonical(tag) + if script == "Zzzz": + script = None + if region == "ZZ": + region = None + + # Step 2: Lookup. + searches = ((language, script, region), + (language, None, region), + (language, script, None), + (language, None, None), + ("und", script, None)) + search = next(search for search in searches if search in likely_subtags) + + (language_s, script_s, region_s) = search + (language_m, script_m, region_m) = likely_subtags[search] + + # Step 3: Return. 
+ return (language if language != language_s else language_m, + script if script != script_s else script_m, + region if region != region_s else region_m) + + # https://unicode.org/reports/tr35/#Likely_Subtags + def removeLikelySubtags(tag): + # Step 1: Add likely subtags. + max = addLikelySubtags(tag) + + # Step 2: Remove variants (doesn't apply here). + + # Step 3: Find a match. + (language, script, region) = max + for trial in ((language, None, None), (language, None, region), (language, script, None)): + if addLikelySubtags(trial) == max: + return trial + + # Step 4: Return maximized if no match found. + return max + + def likely_canonical(from_tag, to_tag): + # Canonicalize the input tag. + from_tag = canonical(from_tag) + + # Update the expected result if necessary. + if from_tag in likely_subtags: + to_tag = likely_subtags[from_tag] + + # Canonicalize the expected output. + to_canonical = canonical(to_tag) + + # Sanity check: This should match the result of |addLikelySubtags|. + assert to_canonical == addLikelySubtags(from_tag) + + return to_canonical + + # |likely_subtags| contains non-canonicalized tags, so canonicalize it first. + likely_subtags_canonical = {k: likely_canonical(k, v) for (k, v) in likely_subtags.items()} + + # Add test data for |Intl.Locale.prototype.maximize()|. + writeMappingsVar(println, {bcp47(k): bcp47(v) for (k, v) in likely_subtags_canonical.items()}, + "maxLikelySubtags", "Extracted from likelySubtags.xml.", source, url) + + # Use the maximalized tags as the input for the remove likely-subtags test. + minimized = {tag: removeLikelySubtags(tag) for tag in likely_subtags_canonical.values()} + + # Add test data for |Intl.Locale.prototype.minimize()|. 
+ writeMappingsVar(println, {bcp47(k): bcp47(v) for (k, v) in minimized.items()}, + "minLikelySubtags", "Extracted from likelySubtags.xml.", source, url) + + println(u""" +for (let [tag, maximal] of Object.entries(maxLikelySubtags)) { + assertEq(new Intl.Locale(tag).maximize().toString(), maximal); +}""") + + println(u""" +for (let [tag, minimal] of Object.entries(minLikelySubtags)) { + assertEq(new Intl.Locale(tag).minimize().toString(), minimal); +}""") + + println(u""" +if (typeof reportCompare === "function") + reportCompare(0, 0);""") + + +def updateCLDRLangTags(args): + """ Update the LanguageTagGenerated.cpp file. """ + version = args.version url = args.url out = args.out filename = args.file + url = url.replace("<VERSION>", version) + print("Arguments:") + print("\tCLDR version: %s" % version) print("\tDownload url: %s" % url) - print("\tLocal registry: %s" % filename) + if filename is not None: + print("\tLocal CLDR core.zip file: %s" % filename) print("\tOutput file: %s" % out) print("") + data = { + "version": version, + } + + def readFiles(cldr_file): + with ZipFile(cldr_file) as zip_file: + data.update(readSupplementalData(zip_file)) + data.update(readUnicodeExtensions(zip_file)) + + print("Processing CLDR data...") if filename is not None: - print("Always make sure you have the newest language-subtag-registry.txt!") - registry = codecs.open(filename, "r", encoding="utf-8") + print("Always make sure you have the newest CLDR core.zip!") + with open(filename, "rb") as cldr_file: + readFiles(cldr_file) else: - print("Downloading IANA Language Subtag Registry...") - with closing(urllib2.urlopen(url)) as reader: - text = reader.read().decode("utf-8") - registry = codecs.open("language-subtag-registry.txt", "w+", encoding="utf-8") - registry.write(text) - registry.seek(0) - - print("Processing IANA Language Subtag Registry...") - with closing(registry) as reg: - data = readRegistry(reg) - fileDate = data["fileDate"] - langTagMappings = 
data["langTagMappings"] - langSubtagMappings = data["langSubtagMappings"] - extlangMappings = data["extlangMappings"] + print("Downloading CLDR core.zip...") + with closing(urllib2.urlopen(url)) as cldr_file: + cldr_data = io.BytesIO(cldr_file.read()) + readFiles(cldr_data) print("Writing Intl data...") - with codecs.open(out, "w", encoding="utf-8") as intlData: - intlData.write("// Generated by make_intl_data.py. DO NOT EDIT.\n") - writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings) + with io.open(out, mode="w", encoding="utf-8", newline="") as f: + println = partial(print, file=f) + writeCLDRLanguageTagData(println, data, url) + + print("Writing Intl test data...") + test_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), + "../../tests/non262/Intl/Locale/likely-subtags-generated.js") + with io.open(test_file, mode="w", encoding="utf-8", newline="") as f: + println = partial(print, file=f) + + println(u"// |reftest| skip-if(!this.hasOwnProperty('Intl'))") + writeCLDRLanguageTagLikelySubtagsTest(println, data, url) + def flines(filepath, encoding="utf-8"): """ Open filepath and iterate over its content. 
""" @@ -703,11 +1835,11 @@ def processTimeZones(tzdataDir, icuDir, icuTzDir, version, ignoreBackzone, ignor println(u"// Format:") println(u'// "LinkName", "Target" // ICU-Target [time zone file]') - println(u"struct LinkAndTarget"); - println(u"{"); - println(u" const char* const link;"); - println(u" const char* const target;"); - println(u"};"); + println(u"struct LinkAndTarget") + println(u"{") + println(u" const char* const link;") + println(u" const char* const target;") + println(u"};") println(u"") println(u"const LinkAndTarget ianaLinksCanonicalizedDifferentlyByICU[] = {") for (zone, target, icuTarget) in incorrectLinks: @@ -928,7 +2060,7 @@ def updateTzdata(topsrcdir, args): if tzDir is None: print("Downloading tzdata file...") with closing(urllib2.urlopen(url)) as tzfile: - fname = urlparse.urlsplit(tzfile.geturl()).path.split("/")[-1] + fname = urlsplit(tzfile.geturl()).path.split("/")[-1] with tempfile.NamedTemporaryFile(suffix=fname) as tztmpfile: print("File stored in %s" % tztmpfile.name) tztmpfile.write(tzfile.read()) @@ -937,6 +2069,152 @@ def updateTzdata(topsrcdir, args): else: updateFrom(tzDir) +def writeUnicodeExtensionsMappings(println, mapping, extension): + println(u""" +template <size_t Length> +static inline bool Is{0}Key( + mozilla::Span<const char> key, const char (&str)[Length]) {{ + static_assert(Length == {0}KeyLength + 1, + "{0} extension key is two characters long"); + return memcmp(key.data(), str, Length - 1) == 0; +}} + +template <size_t Length> +static inline bool Is{0}Type( + mozilla::Span<const char> type, const char (&str)[Length]) {{ + static_assert(Length > {0}KeyLength + 1, + "{0} extension type contains more than two characters"); + return type.size() == (Length - 1) && + memcmp(type.data(), str, Length - 1) == 0; +}} +""".format(extension).rstrip("\n")) + + linear_search_max_length = 4 + + needs_binary_search = any(len(replacements.items()) > linear_search_max_length + for replacements in mapping.values()) + + if 
needs_binary_search: + println(u""" +static int32_t Compare{0}Type(const char* a, mozilla::Span<const char> b) {{ + MOZ_ASSERT(!std::char_traits<char>::find(b.data(), b.size(), '\\0'), + "unexpected null-character in string"); + + using UnsignedChar = unsigned char; + for (size_t i = 0; i < b.size(); i++) {{ + // |a| is zero-terminated and |b| doesn't contain a null-terminator. So if + // we've reached the end of |a|, the below if-statement will always be true. + // That ensures we don't read past the end of |a|. + if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) {{ + return r; + }} + }} + + // Return zero if both strings are equal or a negative number if |b| is a + // prefix of |a|. + return -int32_t(UnsignedChar(a[b.size()])); +}} + +template <size_t Length> +static inline const char* Search{0}Replacement( + const char* (&types)[Length], const char* (&aliases)[Length], + mozilla::Span<const char> type) {{ + + auto p = std::lower_bound(std::begin(types), std::end(types), type, + [](const auto& a, const auto& b) {{ + return Compare{0}Type(a, b) < 0; + }}); + if (p != std::end(types) && Compare{0}Type(*p, type) == 0) {{ + return aliases[std::distance(std::begin(types), p)]; + }} + return nullptr; +}} +""".format(extension).rstrip("\n")) + + println(u""" +/** + * Mapping from deprecated BCP 47 {0} extension types to their preferred + * values. 
+ * + * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files + * Spec: https://www.unicode.org/reports/tr35/#t_Extension + */ +const char* js::intl::LanguageTag::replace{0}ExtensionType( + mozilla::Span<const char> key, mozilla::Span<const char> type) {{ + MOZ_ASSERT(key.size() == {0}KeyLength); + MOZ_ASSERT(IsCanonicallyCased{0}Key(key)); + + MOZ_ASSERT(type.size() > {0}KeyLength); + MOZ_ASSERT(IsCanonicallyCased{0}Type(type)); +""".format(extension)) + + def to_hash_key(replacements): + return str(sorted(replacements.items())) + + def write_array(subtags, name, length): + max_entries = (80 - len(" ")) // (length + len('"", ')) + + println(u" static const char* {}[{}] = {{".format(name, len(subtags))) + + for entries in grouper(subtags, max_entries): + entries = (u"\"{}\"".format(tag).rjust(length + 2) + for tag in entries if tag is not None) + println(u" {},".format(u", ".join(entries))) + + println(u" };") + + # Merge duplicate keys. + key_aliases = {} + for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)): + hash_key = to_hash_key(replacements) + if hash_key not in key_aliases: + key_aliases[hash_key] = [] + else: + key_aliases[hash_key].append(key) + + first_key = True + for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)): + hash_key = to_hash_key(replacements) + if key in key_aliases[hash_key]: + continue + + cond = (u"Is{}Key(key, \"{}\")".format(extension, k) + for k in [key] + key_aliases[hash_key]) + + if_kind = u"if" if first_key else u"else if" + cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond) + println(u""" + {} ({}) {{""".format(if_kind, cond).strip("\n")) + first_key = False + + replacements = sorted(replacements.items(), key=itemgetter(0)) + + if len(replacements) > linear_search_max_length: + types = [t for (t, _) in replacements] + preferred = [r for (_, r) in replacements] + max_len = max(len(k) for k in types + preferred) + + write_array(types, "types", max_len) + 
write_array(preferred, "aliases", max_len) + println(u""" + return Search{}Replacement(types, aliases, type); +""".format(extension).strip("\n")) + else: + for (type, replacement) in replacements: + println(u""" + if (Is{}Type(type, "{}")) {{ + return "{}"; + }}""".format(extension, type, replacement).strip("\n")) + + println(u""" + }""".lstrip("\n")) + + println(u""" + return nullptr; +} +""".strip("\n")) + + if __name__ == "__main__": import argparse @@ -955,20 +2233,24 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(description="Update intl data.") subparsers = parser.add_subparsers(help="Select update mode") - parser_tags = subparsers.add_parser("langtags", - help="Update language-subtag-registry") - parser_tags.add_argument("--url", - metavar="URL", - default="https://www.iana.org/assignments/language-subtag-registry", - type=EnsureHttps, - help="Download url for language-subtag-registry.txt (default: %(default)s)") - parser_tags.add_argument("--out", - default="LangTagMappingsGenerated.js", - help="Output file (default: %(default)s)") - parser_tags.add_argument("file", - nargs="?", - help="Local language-subtag-registry.txt file, if omitted uses <URL>") - parser_tags.set_defaults(func=updateLangTags) + parser_cldr_tags = subparsers.add_parser("langtags", + help="Update CLDR language tags data") + parser_cldr_tags.add_argument("--version", + metavar="VERSION", + required=True, + help="CLDR version number") + parser_cldr_tags.add_argument("--url", + metavar="URL", + default="https://unicode.org/Public/cldr/<VERSION>/core.zip", + type=EnsureHttps, + help="Download url CLDR data (default: %(default)s)") + parser_cldr_tags.add_argument("--out", + default="LanguageTagGenerated.cpp", + help="Output file (default: %(default)s)") + parser_cldr_tags.add_argument("file", + nargs="?", + help="Local cldr-core.zip file, if omitted uses <URL>") + parser_cldr_tags.set_defaults(func=updateCLDRLangTags) parser_tz = subparsers.add_parser("tzdata", help="Update 
tzdata") parser_tz.add_argument("--tz", |