diff options
Diffstat (limited to 'js/src/new-regexp/gen-regexp-special-case.cc')
-rw-r--r-- | js/src/new-regexp/gen-regexp-special-case.cc | 165 |
1 files changed, 0 insertions, 165 deletions
diff --git a/js/src/new-regexp/gen-regexp-special-case.cc b/js/src/new-regexp/gen-regexp-special-case.cc deleted file mode 100644 index 5a82c5d277..0000000000 --- a/js/src/new-regexp/gen-regexp-special-case.cc +++ /dev/null @@ -1,165 +0,0 @@ -// Copyright 2020 the V8 project authors. All rights reserved. -// Use of this source code is governed by a BSD-style license that can be -// found in the LICENSE file. - -#include <fstream> -#include <iomanip> -#include <iostream> -#include <sstream> - -#include "new-regexp/special-case.h" - -namespace v8 { -namespace internal { - -static const uc32 kSurrogateStart = 0xd800; -static const uc32 kSurrogateEnd = 0xdfff; -static const uc32 kNonBmpStart = 0x10000; - -// The following code generates "src/regexp/special-case.cc". -void PrintSet(std::ofstream& out, const char* name, - const icu::UnicodeSet& set) { - out << "icu::UnicodeSet Build" << name << "() {\n" - << " icu::UnicodeSet set;\n"; - for (int32_t i = 0; i < set.getRangeCount(); i++) { - if (set.getRangeStart(i) == set.getRangeEnd(i)) { - out << " set.add(0x" << set.getRangeStart(i) << ");\n"; - } else { - out << " set.add(0x" << set.getRangeStart(i) << ", 0x" - << set.getRangeEnd(i) << ");\n"; - } - } - out << " set.freeze();\n" - << " return set;\n" - << "}\n\n"; - - out << "struct " << name << "Data {\n" - << " " << name << "Data() : set(Build" << name << "()) {}\n" - << " const icu::UnicodeSet set;\n" - << "};\n\n"; - - out << "//static\n" - << "const icu::UnicodeSet& RegExpCaseFolding::" << name << "() {\n" - << " static base::LazyInstance<" << name << "Data>::type set =\n" - << " LAZY_INSTANCE_INITIALIZER;\n" - << " return set.Pointer()->set;\n" - << "}\n\n"; -} - -void PrintSpecial(std::ofstream& out) { - icu::UnicodeSet current; - icu::UnicodeSet special_add; - icu::UnicodeSet ignore; - UErrorCode status = U_ZERO_ERROR; - icu::UnicodeSet upper("[\\p{Lu}]", status); - CHECK(U_SUCCESS(status)); - - // Iterate through all chars in BMP except surrogates. - for (UChar32 i = 0; i < kNonBmpStart; i++) { - if (i >= kSurrogateStart && i <= kSurrogateEnd) { - continue; // Ignore surrogate range - } - current.set(i, i); - current.closeOver(USET_CASE_INSENSITIVE); - - // Check to see if all characters in the case-folding equivalence - // class as defined by UnicodeSet::closeOver all map to the same - // canonical value. - UChar32 canonical = RegExpCaseFolding::Canonicalize(i); - bool class_has_matching_canonical_char = false; - bool class_has_non_matching_canonical_char = false; - for (int32_t j = 0; j < current.getRangeCount(); j++) { - for (UChar32 c = current.getRangeStart(j); c <= current.getRangeEnd(j); - c++) { - if (c == i) { - continue; - } - UChar32 other_canonical = RegExpCaseFolding::Canonicalize(c); - if (canonical == other_canonical) { - class_has_matching_canonical_char = true; - } else { - class_has_non_matching_canonical_char = true; - } - } - } - // If any other character in i's equivalence class has a - // different canonical value, then i needs special handling. If - // no other character shares a canonical value with i, we can - // ignore i when adding alternatives for case-independent - // comparison. If at least one other character shares a - // canonical value, then i needs special handling. - if (class_has_non_matching_canonical_char) { - if (class_has_matching_canonical_char) { - special_add.add(i); - } else { - ignore.add(i); - } - } - } - - // Verify that no Unicode equivalence class contains two non-trivial - // JS equivalence classes. Every character in SpecialAddSet has the - // same canonical value as every other non-IgnoreSet character in - // its Unicode equivalence class. Therefore, if we call closeOver on - // a set containing no IgnoreSet characters, the only characters - // that must be removed from the result are in IgnoreSet. This fact - // is used in CharacterRange::AddCaseEquivalents. - for (int32_t i = 0; i < special_add.getRangeCount(); i++) { - for (UChar32 c = special_add.getRangeStart(i); - c <= special_add.getRangeEnd(i); c++) { - UChar32 canonical = RegExpCaseFolding::Canonicalize(c); - current.set(c, c); - current.closeOver(USET_CASE_INSENSITIVE); - current.removeAll(ignore); - for (int32_t j = 0; j < current.getRangeCount(); j++) { - for (UChar32 c2 = current.getRangeStart(j); - c2 <= current.getRangeEnd(j); c2++) { - CHECK_EQ(canonical, RegExpCaseFolding::Canonicalize(c2)); - } - } - } - } - - PrintSet(out, "IgnoreSet", ignore); - PrintSet(out, "SpecialAddSet", special_add); -} - -void WriteHeader(const char* header_filename) { - std::ofstream out(header_filename); - out << std::hex << std::setfill('0') << std::setw(4); - out << "// Copyright 2020 the V8 project authors. All rights reserved.\n" - << "// Use of this source code is governed by a BSD-style license that\n" - << "// can be found in the LICENSE file.\n\n" - << "// Automatically generated by regexp/gen-regexp-special-case.cc\n\n" - << "// The following functions are used to build UnicodeSets\n" - << "// for special cases where the case-folding algorithm used by\n" - << "// UnicodeSet::closeOver(USET_CASE_INSENSITIVE) does not match\n" - << "// the algorithm defined in ECMAScript 2020 21.2.2.8.2 (Runtime\n" - << "// Semantics: Canonicalize) step 3.\n\n" - << "#ifdef V8_INTL_SUPPORT\n" - << "#include \"src/base/lazy-instance.h\"\n\n" - << "#include \"src/regexp/special-case.h\"\n\n" - << "#include \"unicode/uniset.h\"\n" - << "namespace v8 {\n" - << "namespace internal {\n\n"; - - PrintSpecial(out); - - out << "\n" - << "} // namespace internal\n" - << "} // namespace v8\n" - << "#endif // V8_INTL_SUPPORT\n"; -} - -} // namespace internal -} // namespace v8 - -int main(int argc, const char** argv) { - if (argc != 2) { - std::cerr << "Usage: " << argv[0] << " <output filename>\n"; - std::exit(1); - } - v8::internal::WriteHeader(argv[1]); - - return 0; -} |