diff options
Diffstat (limited to 'js/src/regexp/special-case.h')
-rw-r--r-- | js/src/regexp/special-case.h | 79 |
1 files changed, 79 insertions, 0 deletions
diff --git a/js/src/regexp/special-case.h b/js/src/regexp/special-case.h new file mode 100644 index 0000000000..1ccec5d31a --- /dev/null +++ b/js/src/regexp/special-case.h @@ -0,0 +1,79 @@ +// Copyright 2019 the V8 project authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef V8_REGEXP_SPECIAL_CASE_H_ +#define V8_REGEXP_SPECIAL_CASE_H_ + +#ifdef V8_INTL_SUPPORT +#include "unicode/uversion.h" +namespace U_ICU_NAMESPACE { +class UnicodeSet; +} // namespace U_ICU_NAMESPACE + +namespace v8 { +namespace internal { + +// Functions to build special sets of Unicode characters that need special +// handling under "i" mode that cannot use closeOver(USET_CASE_INSENSITIVE). +// +// For the characters in the "ignore set", the process should not treat other +// characters in the result of closeOver(USET_CASE_INSENSITIVE) as case +// equivalent under the ECMA262 RegExp "i" mode because these characters are +// themselves uppercase and no other character in the set uppercases to them. +// +// For the characters in the "special add set", the process should add only +// those characters in the result of closeOver(USET_CASE_INSENSITIVE) which are +// not uppercase characters as case equivalent under the ECMA262 RegExp "i" mode +// and also that ONE uppercase character that the other non-uppercase characters +// uppercase into, to the set. Other uppercase characters in the result of +// closeOver(USET_CASE_INSENSITIVE) should not be considered because ECMA262 +// RegExp "i" mode considers two characters as "case equivalent" if both +// characters uppercase to the same character. +// +// For example, consider the following case equivalent set defined by the Unicode +// standard. Notice there is more than one uppercase character in this set: +// U+212B Å Angstrom Sign - an uppercase character. +// U+00C5 Å Latin Capital Letter A with Ring Above - an uppercase character. 
+// U+00E5 å Latin Small Letter A with Ring Above - a lowercase character which +// uppercases to U+00C5. +// This case equivalent set is a special set and needs special handling while +// considering "case equivalent" under the ECMA262 RegExp "i" mode, which is +// different from the Unicode Standard: +// * U+212B should be included into the "ignore" set because no other +// characters, under the ECMA262 "i" mode, are considered as "case equivalent" +// to it: U+212B is itself an uppercase character, but neither U+00C5 nor U+00E5 +// uppercases to U+212B. +// * U+00C5 and U+00E5 will both be included into the "special add" set. While +// calculating the "equivalent set" under ECMA262 "i" mode, the process will +// add U+00E5, because it is not an uppercase character in the set. The +// process will also add U+00C5, because it is the uppercase character which +// the other non-uppercase character, U+00E5, uppercases into. +// +// For characters not included in "ignore set" and "special add set", the +// process will just use closeOver(USET_CASE_INSENSITIVE) to calculate, which is +// much faster. +// +// Under Unicode 12.0, there are only 7 characters in the "special add set" and +// 4 characters in "ignore set" so even though the special add process is slower, it is +// limited to a small set of cases only. +// +// The implementation of these two functions will be generated by calling ICU +// icu::UnicodeSet during the build time into gen/src/regexp/special-case.cc by +// the code in src/regexp/gen-regexp-special-case.cc. +// +// These two functions will be used with the LazyInstance<> template to generate +// globally sharable sets to reduce memory usage and speed up performance. + +// Function to build and return the Ignore set. +icu::UnicodeSet BuildIgnoreSet(); + +// Function to build and return the Special Add set. +icu::UnicodeSet BuildSpecialAddSet(); + +} // namespace internal +} // namespace v8 + +#endif // V8_INTL_SUPPORT + +#endif // V8_REGEXP_SPECIAL_CASE_H_ |