// Copyright 2019 the V8 project authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include #include #include #include #include "src/base/logging.h" #include "unicode/uchar.h" #include "unicode/uniset.h" namespace v8 { namespace internal { // The following code generates BuildSpecialAddSet() and BuildIgnoreSet() // functions into "src/regexp/special-case.cc". // See more details in http://shorturl.at/adfO5 void PrintSet(std::ofstream& out, const char* func_name, const icu::UnicodeSet& set) { out << "icu::UnicodeSet " << func_name << "() {\n" << " icu::UnicodeSet set;\n"; for (int32_t i = 0; i < set.getRangeCount(); i++) { if (set.getRangeStart(i) == set.getRangeEnd(i)) { out << " set.add(0x" << set.getRangeStart(i) << ");\n"; } else { out << " set.add(0x" << set.getRangeStart(i) << ", 0x" << set.getRangeEnd(i) << ");\n"; } } out << " set.freeze();\n" << " return set;\n" << "}\n"; } void PrintSpecial(std::ofstream& out) { icu::UnicodeSet current; icu::UnicodeSet processed(0xd800, 0xdbff); // Ignore surrogate range. icu::UnicodeSet special_add; icu::UnicodeSet ignore; UErrorCode status = U_ZERO_ERROR; icu::UnicodeSet upper("[\\p{Lu}]", status); CHECK(U_SUCCESS(status)); // Iterate through all chars in BMP except ASCII and Surrogate. for (UChar32 i = 0x80; i < 0x010000; i++) { // Ignore those characters which is already processed. if (!processed.contains(i)) { current.set(i, i); current.closeOver(USET_CASE_INSENSITIVE); // Remember we already processed current. processed.addAll(current); // All uppercase characters in current. icu::UnicodeSet keep_upper(current); keep_upper.retainAll(upper); // Check if we have more than one uppercase character in current. // If there are more than one uppercase character, then it is a special // set which need to be added into either "Special Add" set or "Ignore" // set. int32_t number_of_upper = 0; for (int32_t i = 0; i < keep_upper.getRangeCount() && i <= 1; i++) { number_of_upper += keep_upper.getRangeEnd(i) - keep_upper.getRangeStart(i) + 1; } if (number_of_upper > 1) { // Add all non uppercase characters (could be Ll or Mn) to special add // set. current.removeAll(upper); special_add.addAll(current); // Add the uppercase characters of non uppercase character to // special add set. CHECK_GT(current.getRangeCount(), 0); UChar32 main_upper = u_toupper(current.getRangeStart(0)); special_add.add(main_upper); // Add all uppercase except the main upper to ignore set. keep_upper.remove(main_upper); ignore.addAll(keep_upper); } } } // Remove any ASCII special_add.remove(0x0000, 0x007f); PrintSet(out, "BuildIgnoreSet", ignore); PrintSet(out, "BuildSpecialAddSet", special_add); } void WriteHeader(const char* header_filename) { std::ofstream out(header_filename); out << std::hex << std::setfill('0') << std::setw(4); out << "// Automatically generated by regexp/gen-regexp-special-case.cc\n" << "// The following functions are used to build icu::UnicodeSet\n" << "// for specical cases different between Unicode and ECMA262.\n" << "#ifdef V8_INTL_SUPPORT\n" << "#include \"src/regexp/special-case.h\"\n\n" << "#include \"unicode/uniset.h\"\n" << "namespace v8 {\n" << "namespace internal {\n\n"; PrintSpecial(out); out << "\n" << "} // namespace internal\n" << "} // namespace v8\n" << "#endif // V8_INTL_SUPPORT\n"; } } // namespace internal } // namespace v8 int main(int argc, const char** argv) { if (argc != 2) { std::cerr << "Usage: " << argv[0] << " \n"; std::exit(1); } v8::internal::WriteHeader(argv[1]); return 0; }