diff options
Diffstat (limited to 'deps/icu-small/source/i18n/regexcmp.cpp')
-rw-r--r-- | deps/icu-small/source/i18n/regexcmp.cpp | 361 |
1 files changed, 181 insertions, 180 deletions
diff --git a/deps/icu-small/source/i18n/regexcmp.cpp b/deps/icu-small/source/i18n/regexcmp.cpp index 410ff9513b..0c5fca6f67 100644 --- a/deps/icu-small/source/i18n/regexcmp.cpp +++ b/deps/icu-small/source/i18n/regexcmp.cpp @@ -28,6 +28,7 @@ #include "patternprops.h" #include "putilimp.h" #include "cmemory.h" +#include "cstr.h" #include "cstring.h" #include "uvectr32.h" #include "uvectr64.h" @@ -3892,7 +3893,7 @@ void RegexCompile::stripNOPs() { // //------------------------------------------------------------------------------ void RegexCompile::error(UErrorCode e) { - if (U_SUCCESS(*fStatus)) { + if (U_SUCCESS(*fStatus) || e == U_MEMORY_ALLOCATION_ERROR) { *fStatus = e; // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are @@ -4370,209 +4371,209 @@ static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) { // Includes trying the Java "properties" that aren't supported as // normal ICU UnicodeSet properties // -static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{" -static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{" UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) { - UnicodeString setExpr; - UnicodeSet *set; - uint32_t usetFlags = 0; if (U_FAILURE(*fStatus)) { - return NULL; + return nullptr; } + LocalPointer<UnicodeSet> set; + UErrorCode status = U_ZERO_ERROR; - // - // First try the property as we received it - // - if (negated) { - setExpr.append(negSetPrefix, -1); - } else { - setExpr.append(posSetPrefix, -1); - } - setExpr.append(propName); - setExpr.append(chRBrace); - setExpr.append(chRBracket); - if (fModeFlags & UREGEX_CASE_INSENSITIVE) { - usetFlags |= USET_CASE_INSENSITIVE; - } - set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); - if (U_SUCCESS(*fStatus)) { - return set; - } - delete set; - set = NULL; - - // - // The property as it was didn't work. - - // Do [:word:]. It is not recognized as a property by UnicodeSet. "word" not standard POSIX - // or standard Java, but many other regular expression packages do recognize it. - - if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) { - *fStatus = U_ZERO_ERROR; - set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])); - if (set == NULL) { - *fStatus = U_MEMORY_ALLOCATION_ERROR; - return set; + do { // non-loop, exists to allow breaks from the block. + // + // First try the property as we received it + // + UnicodeString setExpr; + uint32_t usetFlags = 0; + setExpr.append(u"[\\p{", -1); + setExpr.append(propName); + setExpr.append(u"}]", -1); + if (fModeFlags & UREGEX_CASE_INSENSITIVE) { + usetFlags |= USET_CASE_INSENSITIVE; } - if (negated) { - set->complement(); + set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, NULL, status), status); + if (U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) { + break; } - return set; - } + // + // The incoming property wasn't directly recognized by ICU. - // Do Java fixes - - // InGreek -> InGreek or Coptic, that being the official Unicode name for that block. - // InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols. - // - // Note on Spaces: either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols" - // is accepted by Java. The property part of the name is compared - // case-insenstively. The spaces must be exactly as shown, either - // all there, or all omitted, with exactly one at each position - // if they are present. From checking against JDK 1.6 - // - // This code should be removed when ICU properties support the Java compatibility names - // (ICU 4.0?) - // - UnicodeString mPropName = propName; - if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) { - mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic"); - } - if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 || - mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) { - mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols"); - } - else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) { - mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint"); - } + // Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet. + // Java accepts 'word' with mixed case. + // Java accepts 'all' only in all lower case. - // See if the property looks like a Java "InBlockName", which - // we will recast as "Block=BlockName" - // - if (mPropName.startsWith(u"In", 2) && propName.length()>=3) { - setExpr.truncate(4); // Leaves "[\p{", or "[\P{" - setExpr.append(u"Block=", -1); - setExpr.append(UnicodeString(mPropName, 2)); // Property with the leading "In" removed. - setExpr.append(chRBrace); - setExpr.append(chRBracket); - *fStatus = U_ZERO_ERROR; - set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus); - if (U_SUCCESS(*fStatus)) { - return set; + status = U_ZERO_ERROR; + if (propName.caseCompare(u"word", -1, 0) == 0) { + set.adoptInsteadAndCheckErrorCode(new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])), status); + break; + } + if (propName.compare(u"all", -1) == 0) { + set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status); + break; } - delete set; - set = NULL; - } - if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) || - propName.compare(UNICODE_STRING_SIMPLE("all")) == 0) - { - UErrorCode localStatus = U_ZERO_ERROR; - //setExpr.remove(); - set = new UnicodeSet(); - // - // Try the various Java specific properties. - // These all begin with "java" + + // Do Java InBlock expressions // - if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) { - addCategory(set, U_GC_CN_MASK, localStatus); - set->complement(); - } - else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) { - addCategory(set, U_GC_ND_MASK, localStatus); - } - else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) { - addIdentifierIgnorable(set, localStatus); - } - else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) { - set->add(0, 0x1F).add(0x7F, 0x9F); - } - else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) { - addCategory(set, U_GC_L_MASK, localStatus); - addCategory(set, U_GC_SC_MASK, localStatus); - addCategory(set, U_GC_PC_MASK, localStatus); - addCategory(set, U_GC_ND_MASK, localStatus); - addCategory(set, U_GC_NL_MASK, localStatus); - addCategory(set, U_GC_MC_MASK, localStatus); - addCategory(set, U_GC_MN_MASK, localStatus); - addIdentifierIgnorable(set, localStatus); - } - else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) { - addCategory(set, U_GC_L_MASK, localStatus); - addCategory(set, U_GC_NL_MASK, localStatus); - addCategory(set, U_GC_SC_MASK, localStatus); - addCategory(set, U_GC_PC_MASK, localStatus); - } - else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) { - addCategory(set, U_GC_L_MASK, localStatus); - } - else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) { - addCategory(set, U_GC_L_MASK, localStatus); - addCategory(set, U_GC_ND_MASK, localStatus); - } - else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) { - addCategory(set, U_GC_LL_MASK, localStatus); - } - else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) { - set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus); - } - else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) { - addCategory(set, U_GC_Z_MASK, localStatus); - } - else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) { - set->add(0x10000, UnicodeSet::MAX_VALUE); - } - else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) { - addCategory(set, U_GC_LT_MASK, localStatus); - } - else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) { - addCategory(set, U_GC_L_MASK, localStatus); - addCategory(set, U_GC_NL_MASK, localStatus); - } - else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) { - addCategory(set, U_GC_L_MASK, localStatus); - addCategory(set, U_GC_PC_MASK, localStatus); - addCategory(set, U_GC_ND_MASK, localStatus); - addCategory(set, U_GC_NL_MASK, localStatus); - addCategory(set, U_GC_MC_MASK, localStatus); - addCategory(set, U_GC_MN_MASK, localStatus); - addIdentifierIgnorable(set, localStatus); - } - else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) { - addCategory(set, U_GC_LU_MASK, localStatus); - } - else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) { - set->add(0, UnicodeSet::MAX_VALUE); - } - else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) { - addCategory(set, U_GC_Z_MASK, localStatus); - set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f)); - set->add(9, 0x0d).add(0x1c, 0x1f); - } - else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) { - set->add(0, UnicodeSet::MAX_VALUE); + UnicodeString mPropName = propName; + if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) { + status = U_ZERO_ERROR; + set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status); + if (U_FAILURE(status)) { + break; + } + UnicodeString blockName(mPropName, 2); // Property with the leading "In" removed. + set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status); + break; } - if (U_SUCCESS(localStatus) && !set->isEmpty()) { - *fStatus = U_ZERO_ERROR; - if (usetFlags & USET_CASE_INSENSITIVE) { + // Check for the Java form "IsBooleanPropertyValue", which we will recast + // as "BooleanPropertyValue". The property value can be either a + // a General Category or a Script Name. + + if (propName.startsWith(u"Is", 2) && propName.length()>=3) { + mPropName.remove(0, 2); // Strip the "Is" + if (mPropName.indexOf(u'=') >= 0) { + // Reject any "Is..." property expression containing an '=', that is, + // any non-binary property expression. + status = U_REGEX_PROPERTY_SYNTAX; + break; + } + + if (mPropName.caseCompare(u"assigned", -1, 0) == 0) { + mPropName.setTo(u"unassigned", -1); + negated = !negated; + } else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) { + mPropName.setTo(u"Titlecase_Letter", -1); + } + + mPropName.insert(0, u"[\\p{", -1); + mPropName.append(u"}]", -1); + set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status); + + if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) { set->closeOver(USET_CASE_INSENSITIVE); } - if (negated) { + break; + + } + + if (propName.startsWith(u"java", -1)) { + status = U_ZERO_ERROR; + set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status); + if (U_FAILURE(status)) { + break; + } + // + // Try the various Java specific properties. + // These all begin with "java" + // + if (propName.compare(u"javaDefined", -1) == 0) { + addCategory(set.getAlias(), U_GC_CN_MASK, status); set->complement(); } - return set; + else if (propName.compare(u"javaDigit", -1) == 0) { + addCategory(set.getAlias(), U_GC_ND_MASK, status); + } + else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) { + addIdentifierIgnorable(set.getAlias(), status); + } + else if (propName.compare(u"javaISOControl", -1) == 0) { + set->add(0, 0x1F).add(0x7F, 0x9F); + } + else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) { + addCategory(set.getAlias(), U_GC_L_MASK, status); + addCategory(set.getAlias(), U_GC_SC_MASK, status); + addCategory(set.getAlias(), U_GC_PC_MASK, status); + addCategory(set.getAlias(), U_GC_ND_MASK, status); + addCategory(set.getAlias(), U_GC_NL_MASK, status); + addCategory(set.getAlias(), U_GC_MC_MASK, status); + addCategory(set.getAlias(), U_GC_MN_MASK, status); + addIdentifierIgnorable(set.getAlias(), status); + } + else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) { + addCategory(set.getAlias(), U_GC_L_MASK, status); + addCategory(set.getAlias(), U_GC_NL_MASK, status); + addCategory(set.getAlias(), U_GC_SC_MASK, status); + addCategory(set.getAlias(), U_GC_PC_MASK, status); + } + else if (propName.compare(u"javaLetter", -1) == 0) { + addCategory(set.getAlias(), U_GC_L_MASK, status); + } + else if (propName.compare(u"javaLetterOrDigit", -1) == 0) { + addCategory(set.getAlias(), U_GC_L_MASK, status); + addCategory(set.getAlias(), U_GC_ND_MASK, status); + } + else if (propName.compare(u"javaLowerCase", -1) == 0) { + addCategory(set.getAlias(), U_GC_LL_MASK, status); + } + else if (propName.compare(u"javaMirrored", -1) == 0) { + set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status); + } + else if (propName.compare(u"javaSpaceChar", -1) == 0) { + addCategory(set.getAlias(), U_GC_Z_MASK, status); + } + else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) { + set->add(0x10000, UnicodeSet::MAX_VALUE); + } + else if (propName.compare(u"javaTitleCase", -1) == 0) { + addCategory(set.getAlias(), U_GC_LT_MASK, status); + } + else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) { + addCategory(set.getAlias(), U_GC_L_MASK, status); + addCategory(set.getAlias(), U_GC_NL_MASK, status); + } + else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) { + addCategory(set.getAlias(), U_GC_L_MASK, status); + addCategory(set.getAlias(), U_GC_PC_MASK, status); + addCategory(set.getAlias(), U_GC_ND_MASK, status); + addCategory(set.getAlias(), U_GC_NL_MASK, status); + addCategory(set.getAlias(), U_GC_MC_MASK, status); + addCategory(set.getAlias(), U_GC_MN_MASK, status); + addIdentifierIgnorable(set.getAlias(), status); + } + else if (propName.compare(u"javaUpperCase", -1) == 0) { + addCategory(set.getAlias(), U_GC_LU_MASK, status); + } + else if (propName.compare(u"javaValidCodePoint", -1) == 0) { + set->add(0, UnicodeSet::MAX_VALUE); + } + else if (propName.compare(u"javaWhitespace", -1) == 0) { + addCategory(set.getAlias(), U_GC_Z_MASK, status); + set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f)); + set->add(9, 0x0d).add(0x1c, 0x1f); + } else { + status = U_REGEX_PROPERTY_SYNTAX; + } + + if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) { + set->closeOver(USET_CASE_INSENSITIVE); + } + break; + } + + // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility + // extensions matched it. + status = U_REGEX_PROPERTY_SYNTAX; + } while (false); // End of do loop block. Code above breaks out of the block on success or hard failure. + + if (U_SUCCESS(status)) { + U_ASSERT(set.isValid()); + if (negated) { + set->complement(); } - delete set; - set = NULL; + return set.orphan(); + } else { + if (status == U_ILLEGAL_ARGUMENT_ERROR) { + status = U_REGEX_PROPERTY_SYNTAX; + } + error(status); + return nullptr; } - error(*fStatus); - return NULL; } - // // SetEval Part of the evaluation of [set expressions]. // Perform any pending (stacked) operations with precedence |