summaryrefslogtreecommitdiff
path: root/deps/icu-small/source/i18n/regexcmp.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'deps/icu-small/source/i18n/regexcmp.cpp')
-rw-r--r--deps/icu-small/source/i18n/regexcmp.cpp361
1 files changed, 181 insertions, 180 deletions
diff --git a/deps/icu-small/source/i18n/regexcmp.cpp b/deps/icu-small/source/i18n/regexcmp.cpp
index 410ff9513b..0c5fca6f67 100644
--- a/deps/icu-small/source/i18n/regexcmp.cpp
+++ b/deps/icu-small/source/i18n/regexcmp.cpp
@@ -28,6 +28,7 @@
#include "patternprops.h"
#include "putilimp.h"
#include "cmemory.h"
+#include "cstr.h"
#include "cstring.h"
#include "uvectr32.h"
#include "uvectr64.h"
@@ -3892,7 +3893,7 @@ void RegexCompile::stripNOPs() {
//
//------------------------------------------------------------------------------
void RegexCompile::error(UErrorCode e) {
- if (U_SUCCESS(*fStatus)) {
+ if (U_SUCCESS(*fStatus) || e == U_MEMORY_ALLOCATION_ERROR) {
*fStatus = e;
// Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
// API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
@@ -4370,209 +4371,209 @@ static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
// Includes trying the Java "properties" that aren't supported as
// normal ICU UnicodeSet properties
//
-static const UChar posSetPrefix[] = {0x5b, 0x5c, 0x70, 0x7b, 0}; // "[\p{"
-static const UChar negSetPrefix[] = {0x5b, 0x5c, 0x50, 0x7b, 0}; // "[\P{"
UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
- UnicodeString setExpr;
- UnicodeSet *set;
- uint32_t usetFlags = 0;
if (U_FAILURE(*fStatus)) {
- return NULL;
+ return nullptr;
}
+ LocalPointer<UnicodeSet> set;
+ UErrorCode status = U_ZERO_ERROR;
- //
- // First try the property as we received it
- //
- if (negated) {
- setExpr.append(negSetPrefix, -1);
- } else {
- setExpr.append(posSetPrefix, -1);
- }
- setExpr.append(propName);
- setExpr.append(chRBrace);
- setExpr.append(chRBracket);
- if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
- usetFlags |= USET_CASE_INSENSITIVE;
- }
- set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
- if (U_SUCCESS(*fStatus)) {
- return set;
- }
- delete set;
- set = NULL;
-
- //
- // The property as it was didn't work.
-
- // Do [:word:]. It is not recognized as a property by UnicodeSet. "word" not standard POSIX
- // or standard Java, but many other regular expression packages do recognize it.
-
- if (propName.caseCompare(UNICODE_STRING_SIMPLE("word"), 0) == 0) {
- *fStatus = U_ZERO_ERROR;
- set = new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET]));
- if (set == NULL) {
- *fStatus = U_MEMORY_ALLOCATION_ERROR;
- return set;
+ do { // non-loop, exists to allow breaks from the block.
+ //
+ // First try the property as we received it
+ //
+ UnicodeString setExpr;
+ uint32_t usetFlags = 0;
+ setExpr.append(u"[\\p{", -1);
+ setExpr.append(propName);
+ setExpr.append(u"}]", -1);
+ if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
+ usetFlags |= USET_CASE_INSENSITIVE;
}
- if (negated) {
- set->complement();
+ set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, NULL, status), status);
+ if (U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) {
+ break;
}
- return set;
- }
+ //
+ // The incoming property wasn't directly recognized by ICU.
- // Do Java fixes -
- // InGreek -> InGreek or Coptic, that being the official Unicode name for that block.
- // InCombiningMarksforSymbols -> InCombiningDiacriticalMarksforSymbols.
- //
- // Note on Spaces: either "InCombiningMarksForSymbols" or "InCombining Marks for Symbols"
- // is accepted by Java. The property part of the name is compared
- // case-insenstively. The spaces must be exactly as shown, either
- // all there, or all omitted, with exactly one at each position
- // if they are present. From checking against JDK 1.6
- //
- // This code should be removed when ICU properties support the Java compatibility names
- // (ICU 4.0?)
- //
- UnicodeString mPropName = propName;
- if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InGreek"), 0) == 0) {
- mPropName = UNICODE_STRING_SIMPLE("InGreek and Coptic");
- }
- if (mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombining Marks for Symbols"), 0) == 0 ||
- mPropName.caseCompare(UNICODE_STRING_SIMPLE("InCombiningMarksforSymbols"), 0) == 0) {
- mPropName = UNICODE_STRING_SIMPLE("InCombining Diacritical Marks for Symbols");
- }
- else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
- mPropName = UNICODE_STRING_SIMPLE("javaValidCodePoint");
- }
+ // Check [:word:] and [:all:]. These are not recognized as properties by ICU UnicodeSet.
+ // Java accepts 'word' with mixed case.
+ // Java accepts 'all' only in all lower case.
- // See if the property looks like a Java "InBlockName", which
- // we will recast as "Block=BlockName"
- //
- if (mPropName.startsWith(u"In", 2) && propName.length()>=3) {
- setExpr.truncate(4); // Leaves "[\p{", or "[\P{"
- setExpr.append(u"Block=", -1);
- setExpr.append(UnicodeString(mPropName, 2)); // Property with the leading "In" removed.
- setExpr.append(chRBrace);
- setExpr.append(chRBracket);
- *fStatus = U_ZERO_ERROR;
- set = new UnicodeSet(setExpr, usetFlags, NULL, *fStatus);
- if (U_SUCCESS(*fStatus)) {
- return set;
+ status = U_ZERO_ERROR;
+ if (propName.caseCompare(u"word", -1, 0) == 0) {
+ set.adoptInsteadAndCheckErrorCode(new UnicodeSet(*(fRXPat->fStaticSets[URX_ISWORD_SET])), status);
+ break;
+ }
+ if (propName.compare(u"all", -1) == 0) {
+ set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status);
+ break;
}
- delete set;
- set = NULL;
- }
- if (propName.startsWith(UNICODE_STRING_SIMPLE("java")) ||
- propName.compare(UNICODE_STRING_SIMPLE("all")) == 0)
- {
- UErrorCode localStatus = U_ZERO_ERROR;
- //setExpr.remove();
- set = new UnicodeSet();
- //
- // Try the various Java specific properties.
- // These all begin with "java"
+
+ // Do Java InBlock expressions
//
- if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDefined")) == 0) {
- addCategory(set, U_GC_CN_MASK, localStatus);
- set->complement();
- }
- else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaDigit")) == 0) {
- addCategory(set, U_GC_ND_MASK, localStatus);
- }
- else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaIdentifierIgnorable")) == 0) {
- addIdentifierIgnorable(set, localStatus);
- }
- else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaISOControl")) == 0) {
- set->add(0, 0x1F).add(0x7F, 0x9F);
- }
- else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierPart")) == 0) {
- addCategory(set, U_GC_L_MASK, localStatus);
- addCategory(set, U_GC_SC_MASK, localStatus);
- addCategory(set, U_GC_PC_MASK, localStatus);
- addCategory(set, U_GC_ND_MASK, localStatus);
- addCategory(set, U_GC_NL_MASK, localStatus);
- addCategory(set, U_GC_MC_MASK, localStatus);
- addCategory(set, U_GC_MN_MASK, localStatus);
- addIdentifierIgnorable(set, localStatus);
- }
- else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaJavaIdentifierStart")) == 0) {
- addCategory(set, U_GC_L_MASK, localStatus);
- addCategory(set, U_GC_NL_MASK, localStatus);
- addCategory(set, U_GC_SC_MASK, localStatus);
- addCategory(set, U_GC_PC_MASK, localStatus);
- }
- else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetter")) == 0) {
- addCategory(set, U_GC_L_MASK, localStatus);
- }
- else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLetterOrDigit")) == 0) {
- addCategory(set, U_GC_L_MASK, localStatus);
- addCategory(set, U_GC_ND_MASK, localStatus);
- }
- else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaLowerCase")) == 0) {
- addCategory(set, U_GC_LL_MASK, localStatus);
- }
- else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaMirrored")) == 0) {
- set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, localStatus);
- }
- else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSpaceChar")) == 0) {
- addCategory(set, U_GC_Z_MASK, localStatus);
- }
- else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaSupplementaryCodePoint")) == 0) {
- set->add(0x10000, UnicodeSet::MAX_VALUE);
- }
- else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaTitleCase")) == 0) {
- addCategory(set, U_GC_LT_MASK, localStatus);
- }
- else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierStart")) == 0) {
- addCategory(set, U_GC_L_MASK, localStatus);
- addCategory(set, U_GC_NL_MASK, localStatus);
- }
- else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUnicodeIdentifierPart")) == 0) {
- addCategory(set, U_GC_L_MASK, localStatus);
- addCategory(set, U_GC_PC_MASK, localStatus);
- addCategory(set, U_GC_ND_MASK, localStatus);
- addCategory(set, U_GC_NL_MASK, localStatus);
- addCategory(set, U_GC_MC_MASK, localStatus);
- addCategory(set, U_GC_MN_MASK, localStatus);
- addIdentifierIgnorable(set, localStatus);
- }
- else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaUpperCase")) == 0) {
- addCategory(set, U_GC_LU_MASK, localStatus);
- }
- else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaValidCodePoint")) == 0) {
- set->add(0, UnicodeSet::MAX_VALUE);
- }
- else if (mPropName.compare(UNICODE_STRING_SIMPLE("javaWhitespace")) == 0) {
- addCategory(set, U_GC_Z_MASK, localStatus);
- set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
- set->add(9, 0x0d).add(0x1c, 0x1f);
- }
- else if (mPropName.compare(UNICODE_STRING_SIMPLE("all")) == 0) {
- set->add(0, UnicodeSet::MAX_VALUE);
+ UnicodeString mPropName = propName;
+ if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) {
+ status = U_ZERO_ERROR;
+ set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
+ if (U_FAILURE(status)) {
+ break;
+ }
+ UnicodeString blockName(mPropName, 2); // Property with the leading "In" removed.
+ set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status);
+ break;
}
- if (U_SUCCESS(localStatus) && !set->isEmpty()) {
- *fStatus = U_ZERO_ERROR;
- if (usetFlags & USET_CASE_INSENSITIVE) {
+ // Check for the Java form "IsBooleanPropertyValue", which we will recast
+ // as "BooleanPropertyValue". The property value can be either a
+ // General Category or a Script Name.
+
+ if (propName.startsWith(u"Is", 2) && propName.length()>=3) {
+ mPropName.remove(0, 2); // Strip the "Is"
+ if (mPropName.indexOf(u'=') >= 0) {
+ // Reject any "Is..." property expression containing an '=', that is,
+ // any non-binary property expression.
+ status = U_REGEX_PROPERTY_SYNTAX;
+ break;
+ }
+
+ if (mPropName.caseCompare(u"assigned", -1, 0) == 0) {
+ mPropName.setTo(u"unassigned", -1);
+ negated = !negated;
+ } else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) {
+ mPropName.setTo(u"Titlecase_Letter", -1);
+ }
+
+ mPropName.insert(0, u"[\\p{", -1);
+ mPropName.append(u"}]", -1);
+ set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status);
+
+ if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
set->closeOver(USET_CASE_INSENSITIVE);
}
- if (negated) {
+ break;
+
+ }
+
+ if (propName.startsWith(u"java", -1)) {
+ status = U_ZERO_ERROR;
+ set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
+ if (U_FAILURE(status)) {
+ break;
+ }
+ //
+ // Try the various Java specific properties.
+ // These all begin with "java"
+ //
+ if (propName.compare(u"javaDefined", -1) == 0) {
+ addCategory(set.getAlias(), U_GC_CN_MASK, status);
set->complement();
}
- return set;
+ else if (propName.compare(u"javaDigit", -1) == 0) {
+ addCategory(set.getAlias(), U_GC_ND_MASK, status);
+ }
+ else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) {
+ addIdentifierIgnorable(set.getAlias(), status);
+ }
+ else if (propName.compare(u"javaISOControl", -1) == 0) {
+ set->add(0, 0x1F).add(0x7F, 0x9F);
+ }
+ else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) {
+ addCategory(set.getAlias(), U_GC_L_MASK, status);
+ addCategory(set.getAlias(), U_GC_SC_MASK, status);
+ addCategory(set.getAlias(), U_GC_PC_MASK, status);
+ addCategory(set.getAlias(), U_GC_ND_MASK, status);
+ addCategory(set.getAlias(), U_GC_NL_MASK, status);
+ addCategory(set.getAlias(), U_GC_MC_MASK, status);
+ addCategory(set.getAlias(), U_GC_MN_MASK, status);
+ addIdentifierIgnorable(set.getAlias(), status);
+ }
+ else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) {
+ addCategory(set.getAlias(), U_GC_L_MASK, status);
+ addCategory(set.getAlias(), U_GC_NL_MASK, status);
+ addCategory(set.getAlias(), U_GC_SC_MASK, status);
+ addCategory(set.getAlias(), U_GC_PC_MASK, status);
+ }
+ else if (propName.compare(u"javaLetter", -1) == 0) {
+ addCategory(set.getAlias(), U_GC_L_MASK, status);
+ }
+ else if (propName.compare(u"javaLetterOrDigit", -1) == 0) {
+ addCategory(set.getAlias(), U_GC_L_MASK, status);
+ addCategory(set.getAlias(), U_GC_ND_MASK, status);
+ }
+ else if (propName.compare(u"javaLowerCase", -1) == 0) {
+ addCategory(set.getAlias(), U_GC_LL_MASK, status);
+ }
+ else if (propName.compare(u"javaMirrored", -1) == 0) {
+ set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status);
+ }
+ else if (propName.compare(u"javaSpaceChar", -1) == 0) {
+ addCategory(set.getAlias(), U_GC_Z_MASK, status);
+ }
+ else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) {
+ set->add(0x10000, UnicodeSet::MAX_VALUE);
+ }
+ else if (propName.compare(u"javaTitleCase", -1) == 0) {
+ addCategory(set.getAlias(), U_GC_LT_MASK, status);
+ }
+ else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) {
+ addCategory(set.getAlias(), U_GC_L_MASK, status);
+ addCategory(set.getAlias(), U_GC_NL_MASK, status);
+ }
+ else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) {
+ addCategory(set.getAlias(), U_GC_L_MASK, status);
+ addCategory(set.getAlias(), U_GC_PC_MASK, status);
+ addCategory(set.getAlias(), U_GC_ND_MASK, status);
+ addCategory(set.getAlias(), U_GC_NL_MASK, status);
+ addCategory(set.getAlias(), U_GC_MC_MASK, status);
+ addCategory(set.getAlias(), U_GC_MN_MASK, status);
+ addIdentifierIgnorable(set.getAlias(), status);
+ }
+ else if (propName.compare(u"javaUpperCase", -1) == 0) {
+ addCategory(set.getAlias(), U_GC_LU_MASK, status);
+ }
+ else if (propName.compare(u"javaValidCodePoint", -1) == 0) {
+ set->add(0, UnicodeSet::MAX_VALUE);
+ }
+ else if (propName.compare(u"javaWhitespace", -1) == 0) {
+ addCategory(set.getAlias(), U_GC_Z_MASK, status);
+ set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
+ set->add(9, 0x0d).add(0x1c, 0x1f);
+ } else {
+ status = U_REGEX_PROPERTY_SYNTAX;
+ }
+
+ if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
+ set->closeOver(USET_CASE_INSENSITIVE);
+ }
+ break;
+ }
+
+ // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
+ // extensions matched it.
+ status = U_REGEX_PROPERTY_SYNTAX;
+ } while (false); // End of do loop block. Code above breaks out of the block on success or hard failure.
+
+ if (U_SUCCESS(status)) {
+ U_ASSERT(set.isValid());
+ if (negated) {
+ set->complement();
}
- delete set;
- set = NULL;
+ return set.orphan();
+ } else {
+ if (status == U_ILLEGAL_ARGUMENT_ERROR) {
+ status = U_REGEX_PROPERTY_SYNTAX;
+ }
+ error(status);
+ return nullptr;
}
- error(*fStatus);
- return NULL;
}
-
//
// SetEval Part of the evaluation of [set expressions].
// Perform any pending (stacked) operations with precedence