diff options
Diffstat (limited to 'js/src/builtin/intl/make_intl_data.py')
-rw-r--r-- | js/src/builtin/intl/make_intl_data.py | 1642 |
1 files changed, 1462 insertions, 180 deletions
diff --git a/js/src/builtin/intl/make_intl_data.py b/js/src/builtin/intl/make_intl_data.py index a81001e0f3..59ff14d76c 100644 --- a/js/src/builtin/intl/make_intl_data.py +++ b/js/src/builtin/intl/make_intl_data.py @@ -6,19 +6,15 @@ # file, You can obtain one at http://mozilla.org/MPL/2.0/. """ Usage: - make_intl_data.py langtags [language-subtag-registry.txt] + make_intl_data.py langtags [cldr_core.zip] make_intl_data.py tzdata + make_intl_data.py unicode-ext Target "langtags": - This script extracts information about mappings between deprecated and - current BCP 47 language tags from the IANA Language Subtag Registry and - converts it to JavaScript object definitions in - LangTagMappingsGenerated.js. The definitions are used in Intl.js. - - The IANA Language Subtag Registry is imported from - https://www.iana.org/assignments/language-subtag-registry - and uses the syntax specified in - https://tools.ietf.org/html/rfc5646#section-3 + This script extracts information about 1) mappings between deprecated and + current Unicode BCP 47 locale identifiers, and 2) deprecated and current + BCP 47 Unicode extension value from CLDR, and converts it to C++ mapping + code in LanguageTagGenerated.cpp. The code is used in LanguageTag.cpp. 
Target "tzdata": @@ -36,194 +32,1330 @@ import sys import tarfile import tempfile import urllib2 -import urlparse from contextlib import closing from functools import partial -from itertools import chain, ifilter, ifilterfalse, imap, tee +from itertools import chain, ifilter, ifilterfalse, imap, izip_longest, groupby, tee from operator import attrgetter, itemgetter +from urlparse import urlsplit +from zipfile import ZipFile + +# From https://docs.python.org/3/library/itertools.html +def grouper(iterable, n, fillvalue=None): + "Collect data into fixed-length chunks or blocks" + # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx" + args = [iter(iterable)] * n + return izip_longest(*args, fillvalue=fillvalue) + +def writeMappingHeader(println, description, source, url): + if type(description) is not list: + description = [description] + for desc in description: + println(u"// {0}".format(desc)) + println(u"// Derived from {0}.".format(source)) + println(u"// {0}".format(url)) + +def writeMappingsVar(println, mapping, name, description, source, url): + """ Writes a variable definition with a mapping table. + + Writes the contents of dictionary |mapping| through the |println| + function with the given variable name and a comment with description, + source, and URL. + """ + println(u"") + writeMappingHeader(println, description, source, url) + println(u"var {0} = {{".format(name)) + for (key, value) in sorted(mapping.items(), key=itemgetter(0)): + println(u' "{0}": "{1}",'.format(key, value)) + println(u"};") + +def writeMappingsBinarySearch(println, fn_name, type_name, name, validate_fn, validate_case_fn, + mappings, tag_maxlength, description, source, url): + """ Emit code to perform a binary search on language tag subtags. + + Uses the contents of |mapping|, which can either be a dictionary or set, + to emit a mapping function to find subtag replacements. 
+ """ + println(u"") + writeMappingHeader(println, description, source, url) + println(u""" +bool js::intl::LanguageTag::{0}({1} {2}) {{ + MOZ_ASSERT({3}({2}.span())); + MOZ_ASSERT({4}({2}.span())); +""".format(fn_name, type_name, name, validate_fn, validate_case_fn).strip()) + + def write_array(subtags, name, length, fixed): + if fixed: + println(u" static const char {}[{}][{}] = {{".format(name, len(subtags), + length + 1)) + else: + println(u" static const char* {}[{}] = {{".format(name, len(subtags))) + + # Group in pairs of ten to not exceed the 80 line column limit. + for entries in grouper(subtags, 10): + entries = (u"\"{}\"".format(tag).rjust(length + 2) + for tag in entries if tag is not None) + println(u" {},".format(u", ".join(entries))) + + println(u" };") + + trailing_return = True + + # Sort the subtags by length. That enables using an optimized comparator + # for the binary search, which only performs a single |memcmp| for multiple + # of two subtag lengths. + mappings_keys = mappings.keys() if type(mappings) == dict else mappings + for (length, subtags) in groupby(sorted(mappings_keys, key=len), len): + # Omit the length check if the current length is the maximum length. + if length != tag_maxlength: + println(u""" + if ({}.length() == {}) {{ +""".format(name, length).rstrip("\n")) + else: + trailing_return = False + println(u""" + { +""".rstrip("\n")) + + # The subtags need to be sorted for binary search to work. + subtags = sorted(subtags) + + def equals(subtag): + return u"""{}.equalTo("{}")""".format(name, subtag) + + # Don't emit a binary search for short lists. 
+ if len(subtags) == 1: + if type(mappings) == dict: + println(u""" + if ({}) {{ + {}.set("{}"); + return true; + }} + return false; +""".format(equals(subtags[0]), name, mappings[subtags[0]]).strip("\n")) + else: + println(u""" + return {}; +""".format(equals(subtags[0])).strip("\n")) + elif len(subtags) <= 4: + if type(mappings) == dict: + for subtag in subtags: + println(u""" + if ({}) {{ + {}.set("{}"); + return true; + }} +""".format(equals(subtag), name, mappings[subtag]).strip("\n")) + + println(u""" + return false; +""".strip("\n")) + else: + cond = (equals(subtag) for subtag in subtags) + cond = (u" ||\n" + u" " * (4 + len("return "))).join(cond) + println(u""" + return {}; +""".format(cond).strip("\n")) + else: + write_array(subtags, name + "s", length, True) + + if type(mappings) == dict: + write_array([mappings[k] for k in subtags], u"aliases", length, False) + + println(u""" + if (const char* replacement = SearchReplacement({0}s, aliases, {0})) {{ + {0}.set(mozilla::MakeCStringSpan(replacement)); + return true; + }} + return false; +""".format(name).rstrip()) + else: + println(u""" + return HasReplacement({0}s, {0}); +""".format(name).rstrip()) -def readRegistryRecord(registry): - """ Yields the records of the IANA Language Subtag Registry as dictionaries. """ - record = {} - for line in registry: - line = line.strip() - if line == "": + println(u""" + } +""".strip("\n")) + + if trailing_return: + println(u""" + return false;""") + + println(u""" +}""".lstrip("\n")) + + +def writeComplexLanguageTagMappings(println, complex_language_mappings, + description, source, url): + println(u"") + writeMappingHeader(println, description, source, url) + println(u""" +void js::intl::LanguageTag::performComplexLanguageMappings() { + MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span())); + MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span())); +""".lstrip()) + + # Merge duplicate language entries. 
+ language_aliases = {} + for (deprecated_language, (language, script, region)) in ( + sorted(complex_language_mappings.items(), key=itemgetter(0)) + ): + key = (language, script, region) + if key not in language_aliases: + language_aliases[key] = [] + else: + language_aliases[key].append(deprecated_language) + + first_language = True + for (deprecated_language, (language, script, region)) in ( + sorted(complex_language_mappings.items(), key=itemgetter(0)) + ): + key = (language, script, region) + if deprecated_language in language_aliases[key]: continue - if line == "%%": - yield record - record = {} + + if_kind = u"if" if first_language else u"else if" + first_language = False + + cond = (u"language().equalTo(\"{}\")".format(lang) + for lang in [deprecated_language] + language_aliases[key]) + cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond) + + println(u""" + {} ({}) {{""".format(if_kind, cond).strip("\n")) + + println(u""" + setLanguage("{}");""".format(language).strip("\n")) + + if script is not None: + println(u""" + if (script().missing()) {{ + setScript("{}"); + }}""".format(script).strip("\n")) + if region is not None: + println(u""" + if (region().missing()) {{ + setRegion("{}"); + }}""".format(region).strip("\n")) + println(u""" + }""".strip("\n")) + + println(u""" +} +""".strip("\n")) + + +def writeComplexRegionTagMappings(println, complex_region_mappings, + description, source, url): + println(u"") + writeMappingHeader(println, description, source, url) + println(u""" +void js::intl::LanguageTag::performComplexRegionMappings() { + MOZ_ASSERT(IsStructurallyValidLanguageTag(language().span())); + MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span())); + MOZ_ASSERT(IsStructurallyValidRegionTag(region().span())); + MOZ_ASSERT(IsCanonicallyCasedRegionTag(region().span())); +""".lstrip()) + + # |non_default_replacements| is a list and hence not hashable. Convert it + # to a string to get a proper hashable value. 
+ def hash_key(default, non_default_replacements): + return (default, str(sorted(str(v) for v in non_default_replacements))) + + # Merge duplicate region entries. + region_aliases = {} + for (deprecated_region, (default, non_default_replacements)) in ( + sorted(complex_region_mappings.items(), key=itemgetter(0)) + ): + key = hash_key(default, non_default_replacements) + if key not in region_aliases: + region_aliases[key] = [] else: - if ":" in line: - key, value = line.split(":", 1) - key, value = key.strip(), value.strip() - record[key] = value - else: - # continuation line - record[key] += " " + line - if record: - yield record - return + region_aliases[key].append(deprecated_region) + + first_region = True + for (deprecated_region, (default, non_default_replacements)) in ( + sorted(complex_region_mappings.items(), key=itemgetter(0)) + ): + key = hash_key(default, non_default_replacements) + if deprecated_region in region_aliases[key]: + continue + + if_kind = u"if" if first_region else u"else if" + first_region = False + + cond = (u"region().equalTo(\"{}\")".format(region) + for region in [deprecated_region] + region_aliases[key]) + cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond) + + println(u""" + {} ({}) {{""".format(if_kind, cond).strip("\n")) + + replacement_regions = sorted({region for (_, _, region) in non_default_replacements}) + + first_case = True + for replacement_region in replacement_regions: + replacement_language_script = sorted(((language, script) + for (language, script, region) in ( + non_default_replacements + ) + if region == replacement_region), + key=itemgetter(0)) + + if_kind = u"if" if first_case else u"else if" + first_case = False + + def compare_tags(language, script): + if script is None: + return u"language().equalTo(\"{}\")".format(language) + return u"(language().equalTo(\"{}\") && script().equalTo(\"{}\"))".format( + language, script) + + cond = (compare_tags(language, script) + for (language, script) in 
replacement_language_script) + cond = (u" ||\n" + u" " * (4 + len(if_kind) + 2)).join(cond) + + println(u""" + {} ({}) {{ + setRegion("{}"); + }}""".format(if_kind, cond, replacement_region).rstrip().strip("\n")) + + println(u""" + else {{ + setRegion("{}"); + }} + }}""".format(default).rstrip().strip("\n")) + + println(u""" +} +""".strip("\n")) + + +def writeVariantTagMappings(println, variant_mappings, description, source, + url): + """ Writes a function definition that maps variant subtags. """ + println(u""" +static const char* ToCharPointer(const char* str) { + return str; +} + +static const char* ToCharPointer(const js::UniqueChars& str) { + return str.get(); +} + +template <typename T, typename U = T> +static bool IsLessThan(const T& a, const U& b) { + return strcmp(ToCharPointer(a), ToCharPointer(b)) < 0; +} +""") + writeMappingHeader(println, description, source, url) + println(u""" +bool js::intl::LanguageTag::performVariantMappings(JSContext* cx) { + // The variant subtags need to be sorted for binary search. + MOZ_ASSERT(std::is_sorted(variants_.begin(), variants_.end(), + IsLessThan<decltype(variants_)::ElementType>)); + + auto insertVariantSortedIfNotPresent = [&](const char* variant) { + auto* p = std::lower_bound(variants_.begin(), variants_.end(), variant, + IsLessThan<decltype(variants_)::ElementType, + decltype(variant)>); + + // Don't insert the replacement when already present. + if (p != variants_.end() && strcmp(p->get(), variant) == 0) { + return true; + } + + // Insert the preferred variant in sort order. 
+ auto preferred = DuplicateString(cx, variant); + if (!preferred) { + return false; + } + return !!variants_.insert(p, std::move(preferred)); + }; + + for (size_t i = 0; i < variants_.length(); ) { + auto& variant = variants_[i]; + MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeCStringSpan(variant.get()))); +""".lstrip()) + + first_variant = True + + for (deprecated_variant, (type, replacement)) in ( + sorted(variant_mappings.items(), key=itemgetter(0)) + ): + if_kind = u"if" if first_variant else u"else if" + first_variant = False + + println(u""" + {} (strcmp(variant.get(), "{}") == 0) {{ + variants_.erase(variants_.begin() + i); +""".format(if_kind, deprecated_variant).strip("\n")) + + if type == "language": + println(u""" + setLanguage("{}"); +""".format(replacement).strip("\n")) + elif type == "region": + println(u""" + setRegion("{}"); +""".format(replacement).strip("\n")) + else: + assert type == "variant" + println(u""" + if (!insertVariantSortedIfNotPresent("{}")) {{ + return false; + }} +""".format(replacement).strip("\n")) + + println(u""" + } +""".strip("\n")) + + println(u""" + else { + i++; + } + } + return true; +} +""".strip("\n")) + + +def writeGrandfatheredMappingsFunction(println, grandfathered_mappings, + description, source, url): + """ Writes a function definition that maps grandfathered language tags. """ + println(u"") + writeMappingHeader(println, description, source, url) + println(u"""\ +bool js::intl::LanguageTag::updateGrandfatheredMappings(JSContext* cx) { + // We're mapping regular grandfathered tags to non-grandfathered form here. + // Other tags remain unchanged. 
+ // + // regular = "art-lojban" + // / "cel-gaulish" + // / "no-bok" + // / "no-nyn" + // / "zh-guoyu" + // / "zh-hakka" + // / "zh-min" + // / "zh-min-nan" + // / "zh-xiang" + // + // Therefore we can quickly exclude most tags by checking every + // |unicode_locale_id| subcomponent for characteristics not shared by any of + // the regular grandfathered (RG) tags: + // + // * Real-world |unicode_language_subtag|s are all two or three letters, + // so don't waste time running a useless |language.length > 3| fast-path. + // * No RG tag has a "script"-looking component. + // * No RG tag has a "region"-looking component. + // * The RG tags that match |unicode_locale_id| (art-lojban, cel-gaulish, + // zh-guoyu, zh-hakka, zh-xiang) have exactly one "variant". (no-bok, + // no-nyn, zh-min, and zh-min-nan require BCP47's extlang subtag + // that |unicode_locale_id| doesn't support.) + // * No RG tag contains |extensions| or |pu_extensions|. + if (script().present() || + region().present() || + variants().length() != 1 || + extensions().length() != 0 || + privateuse()) { + return true; + } + + MOZ_ASSERT(IsCanonicallyCasedLanguageTag(language().span())); + MOZ_ASSERT(IsCanonicallyCasedVariantTag(mozilla::MakeCStringSpan(variants()[0].get()))); + + auto variantEqualTo = [this](const char* variant) { + return strcmp(variants()[0].get(), variant) == 0; + };""") + + # From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>. + # + # Doesn't allow any 'extensions' subtags. + re_unicode_locale_id = re.compile( + r""" + ^ + # unicode_language_id = unicode_language_subtag + # unicode_language_subtag = alpha{2,3} | alpha{5,8} + (?P<language>[a-z]{2,3}|[a-z]{5,8}) + + # (sep unicode_script_subtag)? + # unicode_script_subtag = alpha{4} + (?:-(?P<script>[a-z]{4}))? + + # (sep unicode_region_subtag)? + # unicode_region_subtag = (alpha{2} | digit{3}) + (?:-(?P<region>([a-z]{2}|[0-9]{3})))? 
+ + # (sep unicode_variant_subtag)* + # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) + (?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)? + + # pu_extensions? + # pu_extensions = sep [xX] (sep alphanum{1,8})+ + (?:-(?P<privateuse>x(-[a-z0-9]{1,8})+))? + $ + """, re.IGNORECASE | re.VERBOSE) + + is_first = True + + for (tag, modern) in sorted(grandfathered_mappings.items(), key=itemgetter(0)): + tag_match = re_unicode_locale_id.match(tag) + assert tag_match is not None + + tag_language = tag_match.group("language") + assert tag_match.group("script") is None, ( + "{} does not contain a script subtag".format(tag)) + assert tag_match.group("region") is None, ( + "{} does not contain a region subtag".format(tag)) + tag_variants = tag_match.group("variants") + assert tag_variants is not None, ( + "{} contains a variant subtag".format(tag)) + assert tag_match.group("privateuse") is None, ( + "{} does not contain a privateuse subtag".format(tag)) + + tag_variant = tag_variants[1:] + assert "-" not in tag_variant, ( + "{} contains only a single variant".format(tag)) + + modern_match = re_unicode_locale_id.match(modern) + assert modern_match is not None + + modern_language = modern_match.group("language") + modern_script = modern_match.group("script") + modern_region = modern_match.group("region") + modern_variants = modern_match.group("variants") + modern_privateuse = modern_match.group("privateuse") + + println(u""" + // {} -> {} +""".format(tag, modern).rstrip()) + + println(u""" + {}if (language().equalTo("{}") && variantEqualTo("{}")) {{ + """.format("" if is_first else "else ", + tag_language, + tag_variant).rstrip().strip("\n")) + + is_first = False + + println(u""" + setLanguage("{}"); + """.format(modern_language).rstrip().strip("\n")) + + if modern_script is not None: + println(u""" + setScript("{}"); + """.format(modern_script).rstrip().strip("\n")) + + if modern_region is not None: + println(u""" + setRegion("{}"); + 
""".format(modern_region).rstrip().strip("\n")) + + assert modern_variants is None, ( + "all regular grandfathered tags' modern forms do not contain variant subtags") + + println(u""" + clearVariants(); + """.rstrip().strip("\n")) + + if modern_privateuse is not None: + println(u""" + auto privateuse = DuplicateString(cx, "{}"); + if (!privateuse) {{ + return false; + }} + setPrivateuse(std::move(privateuse)); + """.format(modern_privateuse).rstrip().rstrip("\n")) + + println(u""" + return true; + }""".rstrip().strip("\n")) + println(u""" + return true; +}""") -def readRegistry(registry): - """ Reads IANA Language Subtag Registry and extracts information for Intl.js. + +def readSupplementalData(core_file): + """ Reads CLDR Supplemental Data and extracts information for Intl.js. Information extracted: - - langTagMappings: mappings from complete language tags to preferred + - grandfatheredMappings: mappings from grandfathered tags to preferred complete language tags - - langSubtagMappings: mappings from subtags to preferred subtags - - extlangMappings: mappings from extlang subtags to preferred subtags, - with prefix to be removed - Returns these three mappings as dictionaries, along with the registry's - file date. - - We also check that mappings for language subtags don't affect extlang - subtags and vice versa, so that CanonicalizeLanguageTag doesn't have - to separate them for processing. Region codes are separated by case, - and script codes by length, so they're unproblematic. + - languageMappings: mappings from language subtags to preferred subtags + - complexLanguageMappings: mappings from language subtags with complex rules + - regionMappings: mappings from region subtags to preferred subtags + - complexRegionMappings: mappings from region subtags with complex rules + - variantMappings: mappings from variant subtags to preferred subtags + - likelySubtags: likely subtags used for generating test data only + Returns these mappings as dictionaries. 
""" - langTagMappings = {} - langSubtagMappings = {} - extlangMappings = {} - languageSubtags = set() - extlangSubtags = set() - - for record in readRegistryRecord(registry): - if "File-Date" in record: - fileDate = record["File-Date"] + import xml.etree.ElementTree as ET + + # From Unicode BCP 47 locale identifier <https://unicode.org/reports/tr35/>. + re_unicode_language_id = re.compile( + r""" + ^ + # unicode_language_id = unicode_language_subtag + # unicode_language_subtag = alpha{2,3} | alpha{5,8} + (?P<language>[a-z]{2,3}|[a-z]{5,8}) + + # (sep unicode_script_subtag)? + # unicode_script_subtag = alpha{4} + (?:-(?P<script>[a-z]{4}))? + + # (sep unicode_region_subtag)? + # unicode_region_subtag = (alpha{2} | digit{3}) + (?:-(?P<region>([a-z]{2}|[0-9]{3})))? + + # (sep unicode_variant_subtag)* + # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) + (?P<variants>(-([a-z0-9]{5,8}|[0-9][a-z0-9]{3}))+)? + $ + """, re.IGNORECASE | re.VERBOSE) + + re_unicode_language_subtag = re.compile( + r""" + ^ + # unicode_language_subtag = alpha{2,3} | alpha{5,8} + ([a-z]{2,3}|[a-z]{5,8}) + $ + """, re.IGNORECASE | re.VERBOSE) + + re_unicode_region_subtag = re.compile( + r""" + ^ + # unicode_region_subtag = (alpha{2} | digit{3}) + ([a-z]{2}|[0-9]{3}) + $ + """, re.IGNORECASE | re.VERBOSE) + + re_unicode_variant_subtag = re.compile( + r""" + ^ + # unicode_variant_subtag = (alphanum{5,8} | digit alphanum{3}) + ([a-z0-9]{5,8}|(?:[0-9][a-z0-9]{3})) + $ + """, re.IGNORECASE | re.VERBOSE) + + # The fixed list of BCP 47 grandfathered language tags. 
+ grandfathered_tags = ( + "art-lojban", + "cel-gaulish", + "en-GB-oed", + "i-ami", + "i-bnn", + "i-default", + "i-enochian", + "i-hak", + "i-klingon", + "i-lux", + "i-mingo", + "i-navajo", + "i-pwn", + "i-tao", + "i-tay", + "i-tsu", + "no-bok", + "no-nyn", + "sgn-BE-FR", + "sgn-BE-NL", + "sgn-CH-DE", + "zh-guoyu", + "zh-hakka", + "zh-min", + "zh-min-nan", + "zh-xiang", + ) + + # The list of grandfathered tags which are valid Unicode BCP 47 locale identifiers. + unicode_bcp47_grandfathered_tags = {tag for tag in grandfathered_tags + if re_unicode_language_id.match(tag)} + + # Dictionary of simple language subtag mappings, e.g. "in" -> "id". + language_mappings = {} + + # Dictionary of complex language subtag mappings, modifying more than one + # subtag, e.g. "sh" -> ("sr", "Latn", None) and "cnr" -> ("sr", None, "ME"). + complex_language_mappings = {} + + # Dictionary of simple region subtag mappings, e.g. "DD" -> "DE". + region_mappings = {} + + # Dictionary of complex region subtag mappings, containing more than one + # replacement, e.g. "SU" -> ("RU", ["AM", "AZ", "BY", ...]). + complex_region_mappings = {} + + # Dictionary of aliased variant subtags to a tuple of preferred replacement + # type and replacement, e.g. "arevela" -> ("language", "hy") or + # "aaland" -> ("region", "AX") or "heploc" -> ("variant", "alalc97"). + variant_mappings = {} + + # Dictionary of grandfathered mappings to preferred values. + grandfathered_mappings = {} + + # CLDR uses "_" as the separator for some elements. Replace it with "-". + def bcp47_id(cldr_id): + return cldr_id.replace("_", "-") + + # CLDR uses the canonical case for most entries, but there are some + # exceptions, like: + # <languageAlias type="drw" replacement="fa_af" reason="deprecated"/> + # Therefore canonicalize all tags to be on the safe side. + def bcp47_canonical(language, script, region): + # Canonical case for language subtags is lower case. 
+ # Canonical case for script subtags is title case. + # Canonical case for region subtags is upper case. + return (language.lower() if language else None, + script.title() if script else None, + region.upper() if region else None) + + tree = ET.parse(core_file.open("common/supplemental/supplementalMetadata.xml")) + + for language_alias in tree.iterfind(".//languageAlias"): + type = bcp47_id(language_alias.get("type")) + replacement = bcp47_id(language_alias.get("replacement")) + + # Handle grandfathered mappings first. + if type in unicode_bcp47_grandfathered_tags: + grandfathered_mappings[type] = replacement continue - if record["Type"] == "grandfathered": - # Grandfathered tags don't use standard syntax, so - # CanonicalizeLanguageTag expects the mapping table to provide - # the final form for all. - # For langTagMappings, keys must be in lower case; values in - # the case used in the registry. - tag = record["Tag"] - if "Preferred-Value" in record: - langTagMappings[tag.lower()] = record["Preferred-Value"] - else: - langTagMappings[tag.lower()] = tag - elif record["Type"] == "redundant": - # For langTagMappings, keys must be in lower case; values in - # the case used in the registry. - if "Preferred-Value" in record: - langTagMappings[record["Tag"].lower()] = record["Preferred-Value"] - elif record["Type"] in ("language", "script", "region", "variant"): - # For langSubtagMappings, keys and values must be in the case used - # in the registry. - subtag = record["Subtag"] - if record["Type"] == "language": - languageSubtags.add(subtag) - if "Preferred-Value" in record: - if subtag == "heploc": - # The entry for heploc is unique in its complexity; handle - # it as special case below. - continue - if "Prefix" in record: - # This might indicate another heploc-like complex case. 
- raise Exception("Please evaluate: subtag mapping with prefix value.") - langSubtagMappings[subtag] = record["Preferred-Value"] - elif record["Type"] == "extlang": - # For extlangMappings, keys must be in the case used in the - # registry; values are records with the preferred value and the - # prefix to be removed. - subtag = record["Subtag"] - extlangSubtags.add(subtag) - if "Preferred-Value" in record: - preferred = record["Preferred-Value"] - prefix = record["Prefix"] - extlangMappings[subtag] = {"preferred": preferred, "prefix": prefix} + # We're only interested in language subtag matches, so ignore any + # entries which have additional subtags. + if re_unicode_language_subtag.match(type) is None: + continue + + assert type.islower() + + if re_unicode_language_subtag.match(replacement) is not None: + # Canonical case for language subtags is lower-case. + language_mappings[type] = replacement.lower() else: - # No other types are allowed by - # https://tools.ietf.org/html/rfc5646#section-3.1.3 - assert False, "Unrecognized Type: {0}".format(record["Type"]) + replacement_match = re_unicode_language_id.match(replacement) + assert replacement_match is not None, ( + "{} invalid Unicode BCP 47 locale identifier".format(replacement)) + assert replacement_match.group("variants") is None, ( + "{}: unexpected variant subtags in {}".format(type, replacement)) + + complex_language_mappings[type] = bcp47_canonical(replacement_match.group("language"), + replacement_match.group("script"), + replacement_match.group("region")) + + for territory_alias in tree.iterfind(".//territoryAlias"): + type = territory_alias.get("type") + replacement = territory_alias.get("replacement") + + # We're only interested in region subtag matches, so ignore any entries + # which contain legacy formats, e.g. three letter region codes. + if re_unicode_region_subtag.match(type) is None: + continue - # Check that mappings for language subtags and extlang subtags don't affect - # each other. 
- for lang in languageSubtags: - if lang in extlangMappings and extlangMappings[lang]["preferred"] != lang: - raise Exception("Conflict: lang with extlang mapping: " + lang) - for extlang in extlangSubtags: - if extlang in langSubtagMappings: - raise Exception("Conflict: extlang with lang mapping: " + extlang) + assert type.isupper() or type.isdigit() - # Special case for heploc. - langTagMappings["ja-latn-hepburn-heploc"] = "ja-Latn-alalc97" + if re_unicode_region_subtag.match(replacement) is not None: + # Canonical case for region subtags is upper-case. + region_mappings[type] = replacement.upper() + else: + # Canonical case for region subtags is upper-case. + replacements = [r.upper() for r in replacement.split(" ")] + assert all( + re_unicode_region_subtag.match(loc) is not None for loc in replacements + ), "{} invalid region subtags".format(replacement) + complex_region_mappings[type] = replacements - return {"fileDate": fileDate, - "langTagMappings": langTagMappings, - "langSubtagMappings": langSubtagMappings, - "extlangMappings": extlangMappings} + for variant_alias in tree.iterfind(".//variantAlias"): + type = variant_alias.get("type") + replacement = variant_alias.get("replacement") + assert re_unicode_variant_subtag.match(type) is not None, ( + "{} invalid variant subtag".format(type)) -def writeMappingsVar(intlData, dict, name, description, fileDate, url): - """ Writes a variable definition with a mapping table to file intlData. + # Normalize the case, because some variants are in upper case. + type = type.lower() + + # The replacement can be a language, a region, or a variant subtag. + # Language and region subtags are case normalized, variant subtags can + # be in any case. 
+ + if re_unicode_language_subtag.match(replacement) is not None and replacement.islower(): + variant_mappings[type] = ("language", replacement) + + elif re_unicode_region_subtag.match(replacement) is not None: + assert replacement.isupper() or replacement.isdigit(), ( + "{} invalid variant subtag replacement".format(replacement)) + variant_mappings[type] = ("region", replacement) - Writes the contents of dictionary dict to file intlData with the given - variable name and a comment with description, fileDate, and URL. - """ - intlData.write("\n") - intlData.write("// {0}.\n".format(description)) - intlData.write("// Derived from IANA Language Subtag Registry, file date {0}.\n".format(fileDate)) - intlData.write("// {0}\n".format(url)) - intlData.write("var {0} = {{\n".format(name)) - keys = sorted(dict) - for key in keys: - if isinstance(dict[key], basestring): - value = '"{0}"'.format(dict[key]) else: - preferred = dict[key]["preferred"] - prefix = dict[key]["prefix"] - value = '{{preferred: "{0}", prefix: "{1}"}}'.format(preferred, prefix) - intlData.write(' "{0}": {1},\n'.format(key, value)) - intlData.write("};\n") + assert re_unicode_variant_subtag.match(replacement) is not None, ( + "{} invalid variant subtag replacement".format(replacement)) + variant_mappings[type] = ("variant", replacement.lower()) + + tree = ET.parse(core_file.open("common/supplemental/likelySubtags.xml")) + + likely_subtags = {} + + for likely_subtag in tree.iterfind(".//likelySubtag"): + from_tag = bcp47_id(likely_subtag.get("from")) + from_match = re_unicode_language_id.match(from_tag) + assert from_match is not None, ( + "{} invalid Unicode BCP 47 locale identifier".format(from_tag)) + assert from_match.group("variants") is None, ( + "unexpected variant subtags in {}".format(from_tag)) + + to_tag = bcp47_id(likely_subtag.get("to")) + to_match = re_unicode_language_id.match(to_tag) + assert to_match is not None, ( + "{} invalid Unicode BCP 47 locale identifier".format(to_tag)) + assert 
to_match.group("variants") is None, ( + "unexpected variant subtags in {}".format(to_tag)) + + from_canonical = bcp47_canonical(from_match.group("language"), + from_match.group("script"), + from_match.group("region")) + + to_canonical = bcp47_canonical(to_match.group("language"), + to_match.group("script"), + to_match.group("region")) + + likely_subtags[from_canonical] = to_canonical + + complex_region_mappings_final = {} + + for (deprecated_region, replacements) in complex_region_mappings.items(): + # Find all likely subtag entries which don't already contain a region + # subtag and whose target region is in the list of replacement regions. + region_likely_subtags = [(from_language, from_script, to_region) + for ((from_language, from_script, from_region), + (_, _, to_region)) in likely_subtags.items() + if from_region is None and to_region in replacements] + + # The first replacement entry is the default region. + default = replacements[0] + + # Find all likely subtag entries whose region matches the default region. + default_replacements = {(language, script) + for (language, script, region) in region_likely_subtags + if region == default} + + # And finally find those entries which don't use the default region. + # These are the entries we're actually interested in, because those need + # to be handled specially when selecting the correct preferred region. + non_default_replacements = [(language, script, region) + for (language, script, region) in region_likely_subtags + if (language, script) not in default_replacements] + + # If there are no non-default replacements, we can handle the region as + # part of the simple region mapping. 
+ if non_default_replacements: + complex_region_mappings_final[deprecated_region] = (default, non_default_replacements) + else: + region_mappings[deprecated_region] = default + + return {"grandfatheredMappings": grandfathered_mappings, + "languageMappings": language_mappings, + "complexLanguageMappings": complex_language_mappings, + "regionMappings": region_mappings, + "complexRegionMappings": complex_region_mappings_final, + "variantMappings": variant_mappings, + "likelySubtags": likely_subtags, + } + +def readUnicodeExtensions(core_file): + import xml.etree.ElementTree as ET + + # Match all xml-files in the BCP 47 directory. + bcpFileRE = re.compile(r"^common/bcp47/.+\.xml$") + + # https://www.unicode.org/reports/tr35/#Unicode_locale_identifier + # + # type = alphanum{3,8} (sep alphanum{3,8})* ; + typeRE = re.compile(r"^[a-z0-9]{3,8}(-[a-z0-9]{3,8})*$") + + # Mapping from Unicode extension types to dict of deprecated to + # preferred values. + mapping = { + # Unicode BCP 47 U Extension + "u": {}, + + # Unicode BCP 47 T Extension + "t": {}, + } + def readBCP47File(file): + tree = ET.parse(file) + for keyword in tree.iterfind(".//keyword/key"): + extension = keyword.get("extension", "u") + assert extension == "u" or extension == "t", ( + "unknown extension type: {}".format(extension)) + + extension_name = keyword.get("name") + + for type in keyword.iterfind("type"): + # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>: + # + # The key or type name used by Unicode locale extension with 'u' extension + # syntax or the 't' extensions syntax. When alias below is absent, this name + # can be also used with the old style "@key=type" syntax. 
+ name = type.get("name") + + # Ignore the special name: + # - <https://unicode.org/reports/tr35/#CODEPOINTS> + # - <https://unicode.org/reports/tr35/#REORDER_CODE> + # - <https://unicode.org/reports/tr35/#RG_KEY_VALUE> + # - <https://unicode.org/reports/tr35/#SUBDIVISION_CODE> + # - <https://unicode.org/reports/tr35/#PRIVATE_USE> + if name in ("CODEPOINTS", "REORDER_CODE", "RG_KEY_VALUE", "SUBDIVISION_CODE", + "PRIVATE_USE"): + continue -def writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings): + # All other names should match the 'type' production. + assert typeRE.match(name) is not None, ( + "{} matches the 'type' production".format(name)) + + # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>: + # + # The preferred value of the deprecated key, type or attribute element. + # When a key, type or attribute element is deprecated, this attribute is + # used for specifying a new canonical form if available. + preferred = type.get("preferred") + + # <https://unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files>: + # + # The BCP 47 form is the canonical form, and recommended. Other aliases are + # included only for backwards compatibility. + alias = type.get("alias") + + # <https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers> + # + # Use the bcp47 data to replace keys, types, tfields, and tvalues by their + # canonical forms. See Section 3.6.4 U Extension Data Files) and Section + # 3.7.1 T Extension Data Files. The aliases are in the alias attribute + # value, while the canonical is in the name attribute value. + + # 'preferred' contains the new preferred name, 'alias' the compatibility + # name, but then there's this entry where 'preferred' and 'alias' are the + # same. So which one to choose? Assume 'preferred' is the actual canonical + # name. 
+ # + # <type name="islamicc" + # description="Civil (algorithmic) Arabic calendar" + # deprecated="true" + # preferred="islamic-civil" + # alias="islamic-civil"/> + + if preferred is not None: + assert typeRE.match(preferred), preferred + mapping[extension].setdefault(extension_name, {})[name] = preferred + + if alias is not None: + for alias_name in alias.lower().split(" "): + # Ignore alias entries which don't match the 'type' production. + if typeRE.match(alias_name) is None: + continue + + # See comment above when 'alias' and 'preferred' are both present. + if (preferred is not None and + name in mapping[extension][extension_name]): + continue + + # Skip over entries where 'name' and 'alias' are equal. + # + # <type name="pst8pdt" + # description="POSIX style time zone for US Pacific Time" + # alias="PST8PDT" + # since="1.8"/> + if name == alias_name: + continue + + mapping[extension].setdefault(extension_name, {})[alias_name] = name + + def readSupplementalMetadata(file): + # Find subdivision and region replacements. + # + # <https://www.unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers> + # + # Replace aliases in special key values: + # - If there is an 'sd' or 'rg' key, replace any subdivision alias + # in its value in the same way, using subdivisionAlias data. + tree = ET.parse(file) + for alias in tree.iterfind(".//subdivisionAlias"): + type = alias.get("type") + assert typeRE.match(type) is not None, ( + "{} matches the 'type' production".format(type)) + + # Take the first replacement when multiple ones are present. + replacement = alias.get("replacement").split(" ")[0].lower() + + # Skip over invalid replacements. + # + # <subdivisionAlias type="fi01" replacement="AX" reason="overlong"/> + # + # It's not entirely clear to me if CLDR actually wants to use + # "axzzzz" as the replacement for this case. + if typeRE.match(replacement) is None: + continue + + # 'subdivisionAlias' applies to 'rg' and 'sd' keys. 
+ mapping["u"].setdefault("rg", {})[type] = replacement + mapping["u"].setdefault("sd", {})[type] = replacement + + for name in core_file.namelist(): + if bcpFileRE.match(name): + readBCP47File(core_file.open(name)) + + readSupplementalMetadata(core_file.open("common/supplemental/supplementalMetadata.xml")) + + return { + "unicodeMappings": mapping["u"], + "transformMappings": mapping["t"], + } + +def writeCLDRLanguageTagData(println, data, url): """ Writes the language tag data to the Intl data file. """ - writeMappingsVar(intlData, langTagMappings, "langTagMappings", - "Mappings from complete tags to preferred values", fileDate, url) - writeMappingsVar(intlData, langSubtagMappings, "langSubtagMappings", - "Mappings from non-extlang subtags to preferred values", fileDate, url) - writeMappingsVar(intlData, extlangMappings, "extlangMappings", - "Mappings from extlang subtags to preferred values", fileDate, url) - -def updateLangTags(args): - """ Update the LangTagMappingsGenerated.js file. 
""" + + println(generatedFileWarning) + println(u"// Version: CLDR-{}".format(data["version"])) + println(u"// URL: {}".format(url)) + + println(u""" +#include "mozilla/Assertions.h" +#include "mozilla/Span.h" +#include "mozilla/TextUtils.h" + +#include <algorithm> +#include <cstdint> +#include <cstring> +#include <iterator> +#include <string> +#include <type_traits> + +#include "jscntxt.h" +#include "jsstr.h" + +#include "builtin/intl/LanguageTag.h" + +using namespace js::intl::LanguageTagLimits; + +template <size_t Length, size_t TagLength, size_t SubtagLength> +static inline bool HasReplacement( + const char (&subtags)[Length][TagLength], + const js::intl::LanguageTagSubtag<SubtagLength>& subtag) { + MOZ_ASSERT(subtag.length() == TagLength - 1, + "subtag must have the same length as the list of subtags"); + + const char* ptr = subtag.span().data(); + return std::binary_search(std::begin(subtags), std::end(subtags), ptr, + [](const char* a, const char* b) { + return memcmp(a, b, TagLength - 1) < 0; + }); +} + +template <size_t Length, size_t TagLength, size_t SubtagLength> +static inline const char* SearchReplacement( + const char (&subtags)[Length][TagLength], + const char* (&aliases)[Length], + const js::intl::LanguageTagSubtag<SubtagLength>& subtag) { + MOZ_ASSERT(subtag.length() == TagLength - 1, + "subtag must have the same length as the list of subtags"); + + const char* ptr = subtag.span().data(); + auto p = std::lower_bound(std::begin(subtags), std::end(subtags), ptr, + [](const char* a, const char* b) { + return memcmp(a, b, TagLength - 1) < 0; + }); + if (p != std::end(subtags) && memcmp(*p, ptr, TagLength - 1) == 0) { + return aliases[std::distance(std::begin(subtags), p)]; + } + return nullptr; +} + +#ifdef DEBUG +static bool IsAsciiLowercaseAlphanumeric(char c) { + return mozilla::IsAsciiLowercaseAlpha(c) || mozilla::IsAsciiDigit(c); +} + +static bool IsAsciiLowercaseAlphanumericOrDash(char c) { + return IsAsciiLowercaseAlphanumeric(c) || c == '-'; 
+} + +static bool IsCanonicallyCasedLanguageTag(mozilla::Span<const char> span) { + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + return std::all_of(span.begin(), span.end(), mozilla::IsAsciiLowercaseAlpha<char>); +} + +static bool IsCanonicallyCasedRegionTag(mozilla::Span<const char> span) { + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + return std::all_of(span.begin(), span.end(), mozilla::IsAsciiUppercaseAlpha<char>) || + std::all_of(span.begin(), span.end(), mozilla::IsAsciiDigit<char>); +} + +static bool IsCanonicallyCasedVariantTag(mozilla::Span<const char> span) { + // Tell the analysis the |std::all_of| function can't GC. + JS::AutoSuppressGCAnalysis nogc; + + return std::all_of(span.begin(), span.end(), IsAsciiLowercaseAlphanumeric); +} + +static bool IsCanonicallyCasedUnicodeKey(mozilla::Span<const char> key) { + return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric); +} + +static bool IsCanonicallyCasedUnicodeType(mozilla::Span<const char> type) { + return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash); +} + +static bool IsCanonicallyCasedTransformKey(mozilla::Span<const char> key) { + return std::all_of(key.begin(), key.end(), IsAsciiLowercaseAlphanumeric); +} + +static bool IsCanonicallyCasedTransformType(mozilla::Span<const char> type) { + return std::all_of(type.begin(), type.end(), IsAsciiLowercaseAlphanumericOrDash); +} +#endif +""".rstrip()) + + source = u"CLDR Supplemental Data, version {}".format(data["version"]) + grandfathered_mappings = data["grandfatheredMappings"] + language_mappings = data["languageMappings"] + complex_language_mappings = data["complexLanguageMappings"] + region_mappings = data["regionMappings"] + complex_region_mappings = data["complexRegionMappings"] + variant_mappings = data["variantMappings"] + unicode_mappings = data["unicodeMappings"] + transform_mappings = 
data["transformMappings"] + + # unicode_language_subtag = alpha{2,3} | alpha{5,8} ; + language_maxlength = 8 + + # unicode_region_subtag = (alpha{2} | digit{3}) ; + region_maxlength = 3 + + writeMappingsBinarySearch(println, "languageMapping", + "LanguageSubtag&", "language", + "IsStructurallyValidLanguageTag", + "IsCanonicallyCasedLanguageTag", + language_mappings, language_maxlength, + "Mappings from language subtags to preferred values.", source, url) + writeMappingsBinarySearch(println, "complexLanguageMapping", + "const LanguageSubtag&", "language", + "IsStructurallyValidLanguageTag", + "IsCanonicallyCasedLanguageTag", + complex_language_mappings.keys(), language_maxlength, + "Language subtags with complex mappings.", source, url) + writeMappingsBinarySearch(println, "regionMapping", + "RegionSubtag&", "region", + "IsStructurallyValidRegionTag", + "IsCanonicallyCasedRegionTag", + region_mappings, region_maxlength, + "Mappings from region subtags to preferred values.", source, url) + writeMappingsBinarySearch(println, "complexRegionMapping", + "const RegionSubtag&", "region", + "IsStructurallyValidRegionTag", + "IsCanonicallyCasedRegionTag", + complex_region_mappings.keys(), region_maxlength, + "Region subtags with complex mappings.", source, url) + + writeComplexLanguageTagMappings(println, complex_language_mappings, + "Language subtags with complex mappings.", source, url) + writeComplexRegionTagMappings(println, complex_region_mappings, + "Region subtags with complex mappings.", source, url) + + writeVariantTagMappings(println, variant_mappings, + "Mappings from variant subtags to preferred values.", source, url) + + writeGrandfatheredMappingsFunction(println, grandfathered_mappings, + "Canonicalize grandfathered locale identifiers.", source, + url) + + writeUnicodeExtensionsMappings(println, unicode_mappings, "Unicode") + writeUnicodeExtensionsMappings(println, transform_mappings, "Transform") + + +def writeCLDRLanguageTagLikelySubtagsTest(println, data, 
url): + """ Writes the likely-subtags test file. """ + + println(generatedFileWarning) + + source = u"CLDR Supplemental Data, version {}".format(data["version"]) + language_mappings = data["languageMappings"] + complex_language_mappings = data["complexLanguageMappings"] + region_mappings = data["regionMappings"] + complex_region_mappings = data["complexRegionMappings"] + likely_subtags = data["likelySubtags"] + + def bcp47(tag): + (language, script, region) = tag + return "{}{}{}".format(language, + "-" + script if script else "", + "-" + region if region else "") + + def canonical(tag): + (language, script, region) = tag + + # Map deprecated language subtags. + if language in language_mappings: + language = language_mappings[language] + elif language in complex_language_mappings: + (language2, script2, region2) = complex_language_mappings[language] + (language, script, region) = (language2, + script if script else script2, + region if region else region2) + + # Map deprecated region subtags. + if region in region_mappings: + region = region_mappings[region] + else: + # Assume no complex region mappings are needed for now. + assert region not in complex_region_mappings,\ + "unexpected region with complex mappings: {}".format(region) + + return (language, script, region) + + # https://unicode.org/reports/tr35/#Likely_Subtags + + def addLikelySubtags(tag): + # Step 1: Canonicalize. + (language, script, region) = canonical(tag) + if script == "Zzzz": + script = None + if region == "ZZ": + region = None + + # Step 2: Lookup. + searches = ((language, script, region), + (language, None, region), + (language, script, None), + (language, None, None), + ("und", script, None)) + search = next(search for search in searches if search in likely_subtags) + + (language_s, script_s, region_s) = search + (language_m, script_m, region_m) = likely_subtags[search] + + # Step 3: Return. 
+ return (language if language != language_s else language_m, + script if script != script_s else script_m, + region if region != region_s else region_m) + + # https://unicode.org/reports/tr35/#Likely_Subtags + def removeLikelySubtags(tag): + # Step 1: Add likely subtags. + max = addLikelySubtags(tag) + + # Step 2: Remove variants (doesn't apply here). + + # Step 3: Find a match. + (language, script, region) = max + for trial in ((language, None, None), (language, None, region), (language, script, None)): + if addLikelySubtags(trial) == max: + return trial + + # Step 4: Return maximized if no match found. + return max + + def likely_canonical(from_tag, to_tag): + # Canonicalize the input tag. + from_tag = canonical(from_tag) + + # Update the expected result if necessary. + if from_tag in likely_subtags: + to_tag = likely_subtags[from_tag] + + # Canonicalize the expected output. + to_canonical = canonical(to_tag) + + # Sanity check: This should match the result of |addLikelySubtags|. + assert to_canonical == addLikelySubtags(from_tag) + + return to_canonical + + # |likely_subtags| contains non-canonicalized tags, so canonicalize it first. + likely_subtags_canonical = {k: likely_canonical(k, v) for (k, v) in likely_subtags.items()} + + # Add test data for |Intl.Locale.prototype.maximize()|. + writeMappingsVar(println, {bcp47(k): bcp47(v) for (k, v) in likely_subtags_canonical.items()}, + "maxLikelySubtags", "Extracted from likelySubtags.xml.", source, url) + + # Use the maximalized tags as the input for the remove likely-subtags test. + minimized = {tag: removeLikelySubtags(tag) for tag in likely_subtags_canonical.values()} + + # Add test data for |Intl.Locale.prototype.minimize()|. 
+ writeMappingsVar(println, {bcp47(k): bcp47(v) for (k, v) in minimized.items()}, + "minLikelySubtags", "Extracted from likelySubtags.xml.", source, url) + + println(u""" +for (let [tag, maximal] of Object.entries(maxLikelySubtags)) { + assertEq(new Intl.Locale(tag).maximize().toString(), maximal); +}""") + + println(u""" +for (let [tag, minimal] of Object.entries(minLikelySubtags)) { + assertEq(new Intl.Locale(tag).minimize().toString(), minimal); +}""") + + println(u""" +if (typeof reportCompare === "function") + reportCompare(0, 0);""") + + +def updateCLDRLangTags(args): + """ Update the LanguageTagGenerated.cpp file. """ + version = args.version url = args.url out = args.out filename = args.file + url = url.replace("<VERSION>", version) + print("Arguments:") + print("\tCLDR version: %s" % version) print("\tDownload url: %s" % url) - print("\tLocal registry: %s" % filename) + if filename is not None: + print("\tLocal CLDR core.zip file: %s" % filename) print("\tOutput file: %s" % out) print("") + data = { + "version": version, + } + + def readFiles(cldr_file): + with ZipFile(cldr_file) as zip_file: + data.update(readSupplementalData(zip_file)) + data.update(readUnicodeExtensions(zip_file)) + + print("Processing CLDR data...") if filename is not None: - print("Always make sure you have the newest language-subtag-registry.txt!") - registry = codecs.open(filename, "r", encoding="utf-8") + print("Always make sure you have the newest CLDR core.zip!") + with open(filename, "rb") as cldr_file: + readFiles(cldr_file) else: - print("Downloading IANA Language Subtag Registry...") - with closing(urllib2.urlopen(url)) as reader: - text = reader.read().decode("utf-8") - registry = codecs.open("language-subtag-registry.txt", "w+", encoding="utf-8") - registry.write(text) - registry.seek(0) - - print("Processing IANA Language Subtag Registry...") - with closing(registry) as reg: - data = readRegistry(reg) - fileDate = data["fileDate"] - langTagMappings = 
data["langTagMappings"] - langSubtagMappings = data["langSubtagMappings"] - extlangMappings = data["extlangMappings"] + print("Downloading CLDR core.zip...") + with closing(urllib2.urlopen(url)) as cldr_file: + cldr_data = io.BytesIO(cldr_file.read()) + readFiles(cldr_data) print("Writing Intl data...") - with codecs.open(out, "w", encoding="utf-8") as intlData: - intlData.write("// Generated by make_intl_data.py. DO NOT EDIT.\n") - writeLanguageTagData(intlData, fileDate, url, langTagMappings, langSubtagMappings, extlangMappings) + with io.open(out, mode="w", encoding="utf-8", newline="") as f: + println = partial(print, file=f) + writeCLDRLanguageTagData(println, data, url) + + print("Writing Intl test data...") + test_file = os.path.join(os.path.dirname(os.path.abspath(__file__)), + "../../tests/non262/Intl/Locale/likely-subtags-generated.js") + with io.open(test_file, mode="w", encoding="utf-8", newline="") as f: + println = partial(print, file=f) + + println(u"// |reftest| skip-if(!this.hasOwnProperty('Intl'))") + writeCLDRLanguageTagLikelySubtagsTest(println, data, url) + def flines(filepath, encoding="utf-8"): """ Open filepath and iterate over its content. 
""" @@ -703,11 +1835,11 @@ def processTimeZones(tzdataDir, icuDir, icuTzDir, version, ignoreBackzone, ignor println(u"// Format:") println(u'// "LinkName", "Target" // ICU-Target [time zone file]') - println(u"struct LinkAndTarget"); - println(u"{"); - println(u" const char* const link;"); - println(u" const char* const target;"); - println(u"};"); + println(u"struct LinkAndTarget") + println(u"{") + println(u" const char* const link;") + println(u" const char* const target;") + println(u"};") println(u"") println(u"const LinkAndTarget ianaLinksCanonicalizedDifferentlyByICU[] = {") for (zone, target, icuTarget) in incorrectLinks: @@ -928,7 +2060,7 @@ def updateTzdata(topsrcdir, args): if tzDir is None: print("Downloading tzdata file...") with closing(urllib2.urlopen(url)) as tzfile: - fname = urlparse.urlsplit(tzfile.geturl()).path.split("/")[-1] + fname = urlsplit(tzfile.geturl()).path.split("/")[-1] with tempfile.NamedTemporaryFile(suffix=fname) as tztmpfile: print("File stored in %s" % tztmpfile.name) tztmpfile.write(tzfile.read()) @@ -937,6 +2069,152 @@ def updateTzdata(topsrcdir, args): else: updateFrom(tzDir) +def writeUnicodeExtensionsMappings(println, mapping, extension): + println(u""" +template <size_t Length> +static inline bool Is{0}Key( + mozilla::Span<const char> key, const char (&str)[Length]) {{ + static_assert(Length == {0}KeyLength + 1, + "{0} extension key is two characters long"); + return memcmp(key.data(), str, Length - 1) == 0; +}} + +template <size_t Length> +static inline bool Is{0}Type( + mozilla::Span<const char> type, const char (&str)[Length]) {{ + static_assert(Length > {0}KeyLength + 1, + "{0} extension type contains more than two characters"); + return type.size() == (Length - 1) && + memcmp(type.data(), str, Length - 1) == 0; +}} +""".format(extension).rstrip("\n")) + + linear_search_max_length = 4 + + needs_binary_search = any(len(replacements.items()) > linear_search_max_length + for replacements in mapping.values()) + + if 
needs_binary_search: + println(u""" +static int32_t Compare{0}Type(const char* a, mozilla::Span<const char> b) {{ + MOZ_ASSERT(!std::char_traits<char>::find(b.data(), b.size(), '\\0'), + "unexpected null-character in string"); + + using UnsignedChar = unsigned char; + for (size_t i = 0; i < b.size(); i++) {{ + // |a| is zero-terminated and |b| doesn't contain a null-terminator. So if + // we've reached the end of |a|, the below if-statement will always be true. + // That ensures we don't read past the end of |a|. + if (int32_t r = UnsignedChar(a[i]) - UnsignedChar(b[i])) {{ + return r; + }} + }} + + // Return zero if both strings are equal or a negative number if |b| is a + // prefix of |a|. + return -int32_t(UnsignedChar(a[b.size()])); +}} + +template <size_t Length> +static inline const char* Search{0}Replacement( + const char* (&types)[Length], const char* (&aliases)[Length], + mozilla::Span<const char> type) {{ + + auto p = std::lower_bound(std::begin(types), std::end(types), type, + [](const auto& a, const auto& b) {{ + return Compare{0}Type(a, b) < 0; + }}); + if (p != std::end(types) && Compare{0}Type(*p, type) == 0) {{ + return aliases[std::distance(std::begin(types), p)]; + }} + return nullptr; +}} +""".format(extension).rstrip("\n")) + + println(u""" +/** + * Mapping from deprecated BCP 47 {0} extension types to their preferred + * values. 
+ * + * Spec: https://www.unicode.org/reports/tr35/#Unicode_Locale_Extension_Data_Files + * Spec: https://www.unicode.org/reports/tr35/#t_Extension + */ +const char* js::intl::LanguageTag::replace{0}ExtensionType( + mozilla::Span<const char> key, mozilla::Span<const char> type) {{ + MOZ_ASSERT(key.size() == {0}KeyLength); + MOZ_ASSERT(IsCanonicallyCased{0}Key(key)); + + MOZ_ASSERT(type.size() > {0}KeyLength); + MOZ_ASSERT(IsCanonicallyCased{0}Type(type)); +""".format(extension)) + + def to_hash_key(replacements): + return str(sorted(replacements.items())) + + def write_array(subtags, name, length): + max_entries = (80 - len(" ")) // (length + len('"", ')) + + println(u" static const char* {}[{}] = {{".format(name, len(subtags))) + + for entries in grouper(subtags, max_entries): + entries = (u"\"{}\"".format(tag).rjust(length + 2) + for tag in entries if tag is not None) + println(u" {},".format(u", ".join(entries))) + + println(u" };") + + # Merge duplicate keys. + key_aliases = {} + for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)): + hash_key = to_hash_key(replacements) + if hash_key not in key_aliases: + key_aliases[hash_key] = [] + else: + key_aliases[hash_key].append(key) + + first_key = True + for (key, replacements) in sorted(mapping.items(), key=itemgetter(0)): + hash_key = to_hash_key(replacements) + if key in key_aliases[hash_key]: + continue + + cond = (u"Is{}Key(key, \"{}\")".format(extension, k) + for k in [key] + key_aliases[hash_key]) + + if_kind = u"if" if first_key else u"else if" + cond = (u" ||\n" + u" " * (2 + len(if_kind) + 2)).join(cond) + println(u""" + {} ({}) {{""".format(if_kind, cond).strip("\n")) + first_key = False + + replacements = sorted(replacements.items(), key=itemgetter(0)) + + if len(replacements) > linear_search_max_length: + types = [t for (t, _) in replacements] + preferred = [r for (_, r) in replacements] + max_len = max(len(k) for k in types + preferred) + + write_array(types, "types", max_len) + 
write_array(preferred, "aliases", max_len) + println(u""" + return Search{}Replacement(types, aliases, type); +""".format(extension).strip("\n")) + else: + for (type, replacement) in replacements: + println(u""" + if (Is{}Type(type, "{}")) {{ + return "{}"; + }}""".format(extension, type, replacement).strip("\n")) + + println(u""" + }""".lstrip("\n")) + + println(u""" + return nullptr; +} +""".strip("\n")) + + if __name__ == "__main__": import argparse @@ -955,20 +2233,24 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(description="Update intl data.") subparsers = parser.add_subparsers(help="Select update mode") - parser_tags = subparsers.add_parser("langtags", - help="Update language-subtag-registry") - parser_tags.add_argument("--url", - metavar="URL", - default="https://www.iana.org/assignments/language-subtag-registry", - type=EnsureHttps, - help="Download url for language-subtag-registry.txt (default: %(default)s)") - parser_tags.add_argument("--out", - default="LangTagMappingsGenerated.js", - help="Output file (default: %(default)s)") - parser_tags.add_argument("file", - nargs="?", - help="Local language-subtag-registry.txt file, if omitted uses <URL>") - parser_tags.set_defaults(func=updateLangTags) + parser_cldr_tags = subparsers.add_parser("langtags", + help="Update CLDR language tags data") + parser_cldr_tags.add_argument("--version", + metavar="VERSION", + required=True, + help="CLDR version number") + parser_cldr_tags.add_argument("--url", + metavar="URL", + default="https://unicode.org/Public/cldr/<VERSION>/core.zip", + type=EnsureHttps, + help="Download url CLDR data (default: %(default)s)") + parser_cldr_tags.add_argument("--out", + default="LanguageTagGenerated.cpp", + help="Output file (default: %(default)s)") + parser_cldr_tags.add_argument("file", + nargs="?", + help="Local cldr-core.zip file, if omitted uses <URL>") + parser_cldr_tags.set_defaults(func=updateCLDRLangTags) parser_tz = subparsers.add_parser("tzdata", help="Update 
tzdata") parser_tz.add_argument("--tz", |