diff options
Diffstat (limited to 'deps/node/deps/icu-small/source/common/uniset_props.cpp')
-rw-r--r-- | deps/node/deps/icu-small/source/common/uniset_props.cpp | 1214 |
1 files changed, 0 insertions, 1214 deletions
diff --git a/deps/node/deps/icu-small/source/common/uniset_props.cpp b/deps/node/deps/icu-small/source/common/uniset_props.cpp deleted file mode 100644 index 1312de20..00000000 --- a/deps/node/deps/icu-small/source/common/uniset_props.cpp +++ /dev/null @@ -1,1214 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -******************************************************************************* -* -* Copyright (C) 1999-2014, International Business Machines -* Corporation and others. All Rights Reserved. -* -******************************************************************************* -* file name: uniset_props.cpp -* encoding: UTF-8 -* tab size: 8 (not used) -* indentation:4 -* -* created on: 2004aug25 -* created by: Markus W. Scherer -* -* Character property dependent functions moved here from uniset.cpp -*/ - -#include "unicode/utypes.h" -#include "unicode/uniset.h" -#include "unicode/parsepos.h" -#include "unicode/uchar.h" -#include "unicode/uscript.h" -#include "unicode/symtable.h" -#include "unicode/uset.h" -#include "unicode/locid.h" -#include "unicode/brkiter.h" -#include "uset_imp.h" -#include "ruleiter.h" -#include "cmemory.h" -#include "ucln_cmn.h" -#include "util.h" -#include "uvector.h" -#include "uprops.h" -#include "propname.h" -#include "normalizer2impl.h" -#include "uinvchar.h" -#include "uprops.h" -#include "charstr.h" -#include "cstring.h" -#include "mutex.h" -#include "umutex.h" -#include "uassert.h" -#include "hash.h" - -U_NAMESPACE_USE - -// initial storage. Must be >= 0 -// *** same as in uniset.cpp ! *** -#define START_EXTRA 16 - -// Define UChar constants using hex for EBCDIC compatibility -// Used #define to reduce private static exports and memory access time. -#define SET_OPEN ((UChar)0x005B) /*[*/ -#define SET_CLOSE ((UChar)0x005D) /*]*/ -#define HYPHEN ((UChar)0x002D) /*-*/ -#define COMPLEMENT ((UChar)0x005E) /*^*/ -#define COLON ((UChar)0x003A) /*:*/ -#define BACKSLASH ((UChar)0x005C) /*\*/ -#define INTERSECTION ((UChar)0x0026) /*&*/ -#define UPPER_U ((UChar)0x0055) /*U*/ -#define LOWER_U ((UChar)0x0075) /*u*/ -#define OPEN_BRACE ((UChar)123) /*{*/ -#define CLOSE_BRACE ((UChar)125) /*}*/ -#define UPPER_P ((UChar)0x0050) /*P*/ -#define LOWER_P ((UChar)0x0070) /*p*/ -#define UPPER_N ((UChar)78) /*N*/ -#define EQUALS ((UChar)0x003D) /*=*/ - -//static const UChar POSIX_OPEN[] = { SET_OPEN,COLON,0 }; // "[:" -static const UChar POSIX_CLOSE[] = { COLON,SET_CLOSE,0 }; // ":]" -//static const UChar PERL_OPEN[] = { BACKSLASH,LOWER_P,0 }; // "\\p" -//static const UChar PERL_CLOSE[] = { CLOSE_BRACE,0 }; // "}" -//static const UChar NAME_OPEN[] = { BACKSLASH,UPPER_N,0 }; // "\\N" -static const UChar HYPHEN_RIGHT_BRACE[] = {HYPHEN,SET_CLOSE,0}; /*-]*/ - -// Special property set IDs -static const char ANY[] = "ANY"; // [\u0000-\U0010FFFF] -static const char ASCII[] = "ASCII"; // [\u0000-\u007F] -static const char ASSIGNED[] = "Assigned"; // [:^Cn:] - -// Unicode name property alias -#define NAME_PROP "na" -#define NAME_PROP_LENGTH 2 - -/** - * Delimiter string used in patterns to close a category reference: - * ":]". Example: "[:Lu:]". - */ -//static const UChar CATEGORY_CLOSE[] = {COLON, SET_CLOSE, 0x0000}; /* ":]" */ - -// Cached sets ------------------------------------------------------------- *** - -U_CDECL_BEGIN -static UBool U_CALLCONV uset_cleanup(); - -static UnicodeSet *uni32Singleton; -static icu::UInitOnce uni32InitOnce = U_INITONCE_INITIALIZER; - -/** - * Cleanup function for UnicodeSet - */ -static UBool U_CALLCONV uset_cleanup(void) { - delete uni32Singleton; - uni32Singleton = NULL; - uni32InitOnce.reset(); - return TRUE; -} - -U_CDECL_END - -U_NAMESPACE_BEGIN - -namespace { - -// Cache some sets for other services -------------------------------------- *** -void U_CALLCONV createUni32Set(UErrorCode &errorCode) { - U_ASSERT(uni32Singleton == NULL); - uni32Singleton = new UnicodeSet(UNICODE_STRING_SIMPLE("[:age=3.2:]"), errorCode); - if(uni32Singleton==NULL) { - errorCode=U_MEMORY_ALLOCATION_ERROR; - } else { - uni32Singleton->freeze(); - } - ucln_common_registerCleanup(UCLN_COMMON_USET, uset_cleanup); -} - - -U_CFUNC UnicodeSet * -uniset_getUnicode32Instance(UErrorCode &errorCode) { - umtx_initOnce(uni32InitOnce, &createUni32Set, errorCode); - return uni32Singleton; -} - -// helper functions for matching of pattern syntax pieces ------------------ *** -// these functions are parallel to the PERL_OPEN etc. strings above - -// using these functions is not only faster than UnicodeString::compare() and -// caseCompare(), but they also make UnicodeSet work for simple patterns when -// no Unicode properties data is available - when caseCompare() fails - -static inline UBool -isPerlOpen(const UnicodeString &pattern, int32_t pos) { - UChar c; - return pattern.charAt(pos)==BACKSLASH && ((c=pattern.charAt(pos+1))==LOWER_P || c==UPPER_P); -} - -/*static inline UBool -isPerlClose(const UnicodeString &pattern, int32_t pos) { - return pattern.charAt(pos)==CLOSE_BRACE; -}*/ - -static inline UBool -isNameOpen(const UnicodeString &pattern, int32_t pos) { - return pattern.charAt(pos)==BACKSLASH && pattern.charAt(pos+1)==UPPER_N; -} - -static inline UBool -isPOSIXOpen(const UnicodeString &pattern, int32_t pos) { - return pattern.charAt(pos)==SET_OPEN && pattern.charAt(pos+1)==COLON; -} - -/*static inline UBool -isPOSIXClose(const UnicodeString &pattern, int32_t pos) { - return pattern.charAt(pos)==COLON && pattern.charAt(pos+1)==SET_CLOSE; -}*/ - -// TODO memory debugging provided inside uniset.cpp -// could be made available here but probably obsolete with use of modern -// memory leak checker tools -#define _dbgct(me) - -} // namespace - -//---------------------------------------------------------------- -// Constructors &c -//---------------------------------------------------------------- - -/** - * Constructs a set from the given pattern, optionally ignoring - * white space. See the class description for the syntax of the - * pattern language. - * @param pattern a string specifying what characters are in the set - */ -UnicodeSet::UnicodeSet(const UnicodeString& pattern, - UErrorCode& status) : - len(0), capacity(START_EXTRA), list(0), bmpSet(0), buffer(0), - bufferCapacity(0), patLen(0), pat(NULL), strings(NULL), stringSpan(NULL), - fFlags(0) -{ - if(U_SUCCESS(status)){ - list = (UChar32*) uprv_malloc(sizeof(UChar32) * capacity); - /* test for NULL */ - if(list == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - }else{ - allocateStrings(status); - applyPattern(pattern, status); - } - } - _dbgct(this); -} - -//---------------------------------------------------------------- -// Public API -//---------------------------------------------------------------- - -UnicodeSet& UnicodeSet::applyPattern(const UnicodeString& pattern, - UErrorCode& status) { - // Equivalent to - // return applyPattern(pattern, USET_IGNORE_SPACE, NULL, status); - // but without dependency on closeOver(). - ParsePosition pos(0); - applyPatternIgnoreSpace(pattern, pos, NULL, status); - if (U_FAILURE(status)) return *this; - - int32_t i = pos.getIndex(); - // Skip over trailing whitespace - ICU_Utility::skipWhitespace(pattern, i, TRUE); - if (i != pattern.length()) { - status = U_ILLEGAL_ARGUMENT_ERROR; - } - return *this; -} - -void -UnicodeSet::applyPatternIgnoreSpace(const UnicodeString& pattern, - ParsePosition& pos, - const SymbolTable* symbols, - UErrorCode& status) { - if (U_FAILURE(status)) { - return; - } - if (isFrozen()) { - status = U_NO_WRITE_PERMISSION; - return; - } - // Need to build the pattern in a temporary string because - // _applyPattern calls add() etc., which set pat to empty. - UnicodeString rebuiltPat; - RuleCharacterIterator chars(pattern, symbols, pos); - applyPattern(chars, symbols, rebuiltPat, USET_IGNORE_SPACE, NULL, 0, status); - if (U_FAILURE(status)) return; - if (chars.inVariable()) { - // syntaxError(chars, "Extra chars in variable value"); - status = U_MALFORMED_SET; - return; - } - setPattern(rebuiltPat); -} - -/** - * Return true if the given position, in the given pattern, appears - * to be the start of a UnicodeSet pattern. - */ -UBool UnicodeSet::resemblesPattern(const UnicodeString& pattern, int32_t pos) { - return ((pos+1) < pattern.length() && - pattern.charAt(pos) == (UChar)91/*[*/) || - resemblesPropertyPattern(pattern, pos); -} - -//---------------------------------------------------------------- -// Implementation: Pattern parsing -//---------------------------------------------------------------- - -namespace { - -/** - * A small all-inline class to manage a UnicodeSet pointer. Add - * operator->() etc. as needed. - */ -class UnicodeSetPointer { - UnicodeSet* p; -public: - inline UnicodeSetPointer() : p(0) {} - inline ~UnicodeSetPointer() { delete p; } - inline UnicodeSet* pointer() { return p; } - inline UBool allocate() { - if (p == 0) { - p = new UnicodeSet(); - } - return p != 0; - } -}; - -constexpr int32_t MAX_DEPTH = 100; - -} // namespace - -/** - * Parse the pattern from the given RuleCharacterIterator. The - * iterator is advanced over the parsed pattern. - * @param chars iterator over the pattern characters. Upon return - * it will be advanced to the first character after the parsed - * pattern, or the end of the iteration if all characters are - * parsed. - * @param symbols symbol table to use to parse and dereference - * variables, or null if none. - * @param rebuiltPat the pattern that was parsed, rebuilt or - * copied from the input pattern, as appropriate. - * @param options a bit mask of zero or more of the following: - * IGNORE_SPACE, CASE. - */ -void UnicodeSet::applyPattern(RuleCharacterIterator& chars, - const SymbolTable* symbols, - UnicodeString& rebuiltPat, - uint32_t options, - UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute), - int32_t depth, - UErrorCode& ec) { - if (U_FAILURE(ec)) return; - if (depth > MAX_DEPTH) { - ec = U_ILLEGAL_ARGUMENT_ERROR; - return; - } - - // Syntax characters: [ ] ^ - & { } - - // Recognized special forms for chars, sets: c-c s-s s&s - - int32_t opts = RuleCharacterIterator::PARSE_VARIABLES | - RuleCharacterIterator::PARSE_ESCAPES; - if ((options & USET_IGNORE_SPACE) != 0) { - opts |= RuleCharacterIterator::SKIP_WHITESPACE; - } - - UnicodeString patLocal, buf; - UBool usePat = FALSE; - UnicodeSetPointer scratch; - RuleCharacterIterator::Pos backup; - - // mode: 0=before [, 1=between [...], 2=after ] - // lastItem: 0=none, 1=char, 2=set - int8_t lastItem = 0, mode = 0; - UChar32 lastChar = 0; - UChar op = 0; - - UBool invert = FALSE; - - clear(); - - while (mode != 2 && !chars.atEnd()) { - U_ASSERT((lastItem == 0 && op == 0) || - (lastItem == 1 && (op == 0 || op == HYPHEN /*'-'*/)) || - (lastItem == 2 && (op == 0 || op == HYPHEN /*'-'*/ || - op == INTERSECTION /*'&'*/))); - - UChar32 c = 0; - UBool literal = FALSE; - UnicodeSet* nested = 0; // alias - do not delete - - // -------- Check for property pattern - - // setMode: 0=none, 1=unicodeset, 2=propertypat, 3=preparsed - int8_t setMode = 0; - if (resemblesPropertyPattern(chars, opts)) { - setMode = 2; - } - - // -------- Parse '[' of opening delimiter OR nested set. - // If there is a nested set, use `setMode' to define how - // the set should be parsed. If the '[' is part of the - // opening delimiter for this pattern, parse special - // strings "[", "[^", "[-", and "[^-". Check for stand-in - // characters representing a nested set in the symbol - // table. - - else { - // Prepare to backup if necessary - chars.getPos(backup); - c = chars.next(opts, literal, ec); - if (U_FAILURE(ec)) return; - - if (c == 0x5B /*'['*/ && !literal) { - if (mode == 1) { - chars.setPos(backup); // backup - setMode = 1; - } else { - // Handle opening '[' delimiter - mode = 1; - patLocal.append((UChar) 0x5B /*'['*/); - chars.getPos(backup); // prepare to backup - c = chars.next(opts, literal, ec); - if (U_FAILURE(ec)) return; - if (c == 0x5E /*'^'*/ && !literal) { - invert = TRUE; - patLocal.append((UChar) 0x5E /*'^'*/); - chars.getPos(backup); // prepare to backup - c = chars.next(opts, literal, ec); - if (U_FAILURE(ec)) return; - } - // Fall through to handle special leading '-'; - // otherwise restart loop for nested [], \p{}, etc. - if (c == HYPHEN /*'-'*/) { - literal = TRUE; - // Fall through to handle literal '-' below - } else { - chars.setPos(backup); // backup - continue; - } - } - } else if (symbols != 0) { - const UnicodeFunctor *m = symbols->lookupMatcher(c); - if (m != 0) { - const UnicodeSet *ms = dynamic_cast<const UnicodeSet *>(m); - if (ms == NULL) { - ec = U_MALFORMED_SET; - return; - } - // casting away const, but `nested' won't be modified - // (important not to modify stored set) - nested = const_cast<UnicodeSet*>(ms); - setMode = 3; - } - } - } - - // -------- Handle a nested set. This either is inline in - // the pattern or represented by a stand-in that has - // previously been parsed and was looked up in the symbol - // table. - - if (setMode != 0) { - if (lastItem == 1) { - if (op != 0) { - // syntaxError(chars, "Char expected after operator"); - ec = U_MALFORMED_SET; - return; - } - add(lastChar, lastChar); - _appendToPat(patLocal, lastChar, FALSE); - lastItem = 0; - op = 0; - } - - if (op == HYPHEN /*'-'*/ || op == INTERSECTION /*'&'*/) { - patLocal.append(op); - } - - if (nested == 0) { - // lazy allocation - if (!scratch.allocate()) { - ec = U_MEMORY_ALLOCATION_ERROR; - return; - } - nested = scratch.pointer(); - } - switch (setMode) { - case 1: - nested->applyPattern(chars, symbols, patLocal, options, caseClosure, depth + 1, ec); - break; - case 2: - chars.skipIgnored(opts); - nested->applyPropertyPattern(chars, patLocal, ec); - if (U_FAILURE(ec)) return; - break; - case 3: // `nested' already parsed - nested->_toPattern(patLocal, FALSE); - break; - } - - usePat = TRUE; - - if (mode == 0) { - // Entire pattern is a category; leave parse loop - *this = *nested; - mode = 2; - break; - } - - switch (op) { - case HYPHEN: /*'-'*/ - removeAll(*nested); - break; - case INTERSECTION: /*'&'*/ - retainAll(*nested); - break; - case 0: - addAll(*nested); - break; - } - - op = 0; - lastItem = 2; - - continue; - } - - if (mode == 0) { - // syntaxError(chars, "Missing '['"); - ec = U_MALFORMED_SET; - return; - } - - // -------- Parse special (syntax) characters. If the - // current character is not special, or if it is escaped, - // then fall through and handle it below. - - if (!literal) { - switch (c) { - case 0x5D /*']'*/: - if (lastItem == 1) { - add(lastChar, lastChar); - _appendToPat(patLocal, lastChar, FALSE); - } - // Treat final trailing '-' as a literal - if (op == HYPHEN /*'-'*/) { - add(op, op); - patLocal.append(op); - } else if (op == INTERSECTION /*'&'*/) { - // syntaxError(chars, "Trailing '&'"); - ec = U_MALFORMED_SET; - return; - } - patLocal.append((UChar) 0x5D /*']'*/); - mode = 2; - continue; - case HYPHEN /*'-'*/: - if (op == 0) { - if (lastItem != 0) { - op = (UChar) c; - continue; - } else { - // Treat final trailing '-' as a literal - add(c, c); - c = chars.next(opts, literal, ec); - if (U_FAILURE(ec)) return; - if (c == 0x5D /*']'*/ && !literal) { - patLocal.append(HYPHEN_RIGHT_BRACE, 2); - mode = 2; - continue; - } - } - } - // syntaxError(chars, "'-' not after char or set"); - ec = U_MALFORMED_SET; - return; - case INTERSECTION /*'&'*/: - if (lastItem == 2 && op == 0) { - op = (UChar) c; - continue; - } - // syntaxError(chars, "'&' not after set"); - ec = U_MALFORMED_SET; - return; - case 0x5E /*'^'*/: - // syntaxError(chars, "'^' not after '['"); - ec = U_MALFORMED_SET; - return; - case 0x7B /*'{'*/: - if (op != 0) { - // syntaxError(chars, "Missing operand after operator"); - ec = U_MALFORMED_SET; - return; - } - if (lastItem == 1) { - add(lastChar, lastChar); - _appendToPat(patLocal, lastChar, FALSE); - } - lastItem = 0; - buf.truncate(0); - { - UBool ok = FALSE; - while (!chars.atEnd()) { - c = chars.next(opts, literal, ec); - if (U_FAILURE(ec)) return; - if (c == 0x7D /*'}'*/ && !literal) { - ok = TRUE; - break; - } - buf.append(c); - } - if (buf.length() < 1 || !ok) { - // syntaxError(chars, "Invalid multicharacter string"); - ec = U_MALFORMED_SET; - return; - } - } - // We have new string. Add it to set and continue; - // we don't need to drop through to the further - // processing - add(buf); - patLocal.append((UChar) 0x7B /*'{'*/); - _appendToPat(patLocal, buf, FALSE); - patLocal.append((UChar) 0x7D /*'}'*/); - continue; - case SymbolTable::SYMBOL_REF: - // symbols nosymbols - // [a-$] error error (ambiguous) - // [a$] anchor anchor - // [a-$x] var "x"* literal '$' - // [a-$.] error literal '$' - // *We won't get here in the case of var "x" - { - chars.getPos(backup); - c = chars.next(opts, literal, ec); - if (U_FAILURE(ec)) return; - UBool anchor = (c == 0x5D /*']'*/ && !literal); - if (symbols == 0 && !anchor) { - c = SymbolTable::SYMBOL_REF; - chars.setPos(backup); - break; // literal '$' - } - if (anchor && op == 0) { - if (lastItem == 1) { - add(lastChar, lastChar); - _appendToPat(patLocal, lastChar, FALSE); - } - add(U_ETHER); - usePat = TRUE; - patLocal.append((UChar) SymbolTable::SYMBOL_REF); - patLocal.append((UChar) 0x5D /*']'*/); - mode = 2; - continue; - } - // syntaxError(chars, "Unquoted '$'"); - ec = U_MALFORMED_SET; - return; - } - default: - break; - } - } - - // -------- Parse literal characters. This includes both - // escaped chars ("\u4E01") and non-syntax characters - // ("a"). - - switch (lastItem) { - case 0: - lastItem = 1; - lastChar = c; - break; - case 1: - if (op == HYPHEN /*'-'*/) { - if (lastChar >= c) { - // Don't allow redundant (a-a) or empty (b-a) ranges; - // these are most likely typos. - // syntaxError(chars, "Invalid range"); - ec = U_MALFORMED_SET; - return; - } - add(lastChar, c); - _appendToPat(patLocal, lastChar, FALSE); - patLocal.append(op); - _appendToPat(patLocal, c, FALSE); - lastItem = 0; - op = 0; - } else { - add(lastChar, lastChar); - _appendToPat(patLocal, lastChar, FALSE); - lastChar = c; - } - break; - case 2: - if (op != 0) { - // syntaxError(chars, "Set expected after operator"); - ec = U_MALFORMED_SET; - return; - } - lastChar = c; - lastItem = 1; - break; - } - } - - if (mode != 2) { - // syntaxError(chars, "Missing ']'"); - ec = U_MALFORMED_SET; - return; - } - - chars.skipIgnored(opts); - - /** - * Handle global flags (invert, case insensitivity). If this - * pattern should be compiled case-insensitive, then we need - * to close over case BEFORE COMPLEMENTING. This makes - * patterns like /[^abc]/i work. - */ - if ((options & USET_CASE_INSENSITIVE) != 0) { - (this->*caseClosure)(USET_CASE_INSENSITIVE); - } - else if ((options & USET_ADD_CASE_MAPPINGS) != 0) { - (this->*caseClosure)(USET_ADD_CASE_MAPPINGS); - } - if (invert) { - complement(); - } - - // Use the rebuilt pattern (patLocal) only if necessary. Prefer the - // generated pattern. - if (usePat) { - rebuiltPat.append(patLocal); - } else { - _generatePattern(rebuiltPat, FALSE); - } - if (isBogus() && U_SUCCESS(ec)) { - // We likely ran out of memory. AHHH! - ec = U_MEMORY_ALLOCATION_ERROR; - } -} - -//---------------------------------------------------------------- -// Property set implementation -//---------------------------------------------------------------- - -namespace { - -static UBool numericValueFilter(UChar32 ch, void* context) { - return u_getNumericValue(ch) == *(double*)context; -} - -static UBool versionFilter(UChar32 ch, void* context) { - static const UVersionInfo none = { 0, 0, 0, 0 }; - UVersionInfo v; - u_charAge(ch, v); - UVersionInfo* version = (UVersionInfo*)context; - return uprv_memcmp(&v, &none, sizeof(v)) > 0 && uprv_memcmp(&v, version, sizeof(v)) <= 0; -} - -static UBool scriptExtensionsFilter(UChar32 ch, void* context) { - return uscript_hasScript(ch, *(UScriptCode*)context); -} - -} // namespace - -/** - * Generic filter-based scanning code for UCD property UnicodeSets. - */ -void UnicodeSet::applyFilter(UnicodeSet::Filter filter, - void* context, - const UnicodeSet* inclusions, - UErrorCode &status) { - if (U_FAILURE(status)) return; - - // Logically, walk through all Unicode characters, noting the start - // and end of each range for which filter.contain(c) is - // true. Add each range to a set. - // - // To improve performance, use an inclusions set which - // encodes information about character ranges that are known - // to have identical properties. - // inclusions contains the first characters of - // same-value ranges for the given property. - - clear(); - - UChar32 startHasProperty = -1; - int32_t limitRange = inclusions->getRangeCount(); - - for (int j=0; j<limitRange; ++j) { - // get current range - UChar32 start = inclusions->getRangeStart(j); - UChar32 end = inclusions->getRangeEnd(j); - - // for all the code points in the range, process - for (UChar32 ch = start; ch <= end; ++ch) { - // only add to this UnicodeSet on inflection points -- - // where the hasProperty value changes to false - if ((*filter)(ch, context)) { - if (startHasProperty < 0) { - startHasProperty = ch; - } - } else if (startHasProperty >= 0) { - add(startHasProperty, ch-1); - startHasProperty = -1; - } - } - } - if (startHasProperty >= 0) { - add((UChar32)startHasProperty, (UChar32)0x10FFFF); - } - if (isBogus() && U_SUCCESS(status)) { - // We likely ran out of memory. AHHH! - status = U_MEMORY_ALLOCATION_ERROR; - } -} - -namespace { - -/** Maps map values to 1 if the mask contains their value'th bit, all others to 0. */ -uint32_t U_CALLCONV generalCategoryMaskFilter(const void *context, uint32_t value) { - uint32_t mask = *(const uint32_t *)context; - value = U_MASK(value) & mask; - if (value != 0) { value = 1; } - return value; -} - -/** Maps one map value to 1, all others to 0. */ -uint32_t U_CALLCONV intValueFilter(const void *context, uint32_t value) { - uint32_t v = *(const uint32_t *)context; - return value == v ? 1 : 0; -} - -} // namespace - -void UnicodeSet::applyIntPropertyValue(const UCPMap *map, - UCPMapValueFilter *filter, const void *context, - UErrorCode &errorCode) { - if (U_FAILURE(errorCode)) { return; } - clear(); - UChar32 start = 0, end; - uint32_t value; - while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0, - filter, context, &value)) >= 0) { - if (value != 0) { - add(start, end); - } - start = end + 1; - } - if (isBogus()) { - errorCode = U_MEMORY_ALLOCATION_ERROR; - } -} - -namespace { - -static UBool mungeCharName(char* dst, const char* src, int32_t dstCapacity) { - /* Note: we use ' ' in compiler code page */ - int32_t j = 0; - char ch; - --dstCapacity; /* make room for term. zero */ - while ((ch = *src++) != 0) { - if (ch == ' ' && (j==0 || (j>0 && dst[j-1]==' '))) { - continue; - } - if (j >= dstCapacity) return FALSE; - dst[j++] = ch; - } - if (j > 0 && dst[j-1] == ' ') --j; - dst[j] = 0; - return TRUE; -} - -} // namespace - -//---------------------------------------------------------------- -// Property set API -//---------------------------------------------------------------- - -#define FAIL(ec) {ec=U_ILLEGAL_ARGUMENT_ERROR; return *this;} - -UnicodeSet& -UnicodeSet::applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode& ec) { - if (U_FAILURE(ec)) { return *this; } - // All of the following check isFrozen() before modifying this set. - if (prop == UCHAR_GENERAL_CATEGORY_MASK) { - const UCPMap *map = u_getIntPropertyMap(UCHAR_GENERAL_CATEGORY, &ec); - applyIntPropertyValue(map, generalCategoryMaskFilter, &value, ec); - } else if (prop == UCHAR_SCRIPT_EXTENSIONS) { - const UnicodeSet* inclusions = CharacterProperties::getInclusionsForProperty(prop, ec); - UScriptCode script = (UScriptCode)value; - applyFilter(scriptExtensionsFilter, &script, inclusions, ec); - } else if (0 <= prop && prop < UCHAR_BINARY_LIMIT) { - if (value == 0 || value == 1) { - const USet *set = u_getBinaryPropertySet(prop, &ec); - if (U_FAILURE(ec)) { return *this; } - copyFrom(*UnicodeSet::fromUSet(set), TRUE); - if (value == 0) { - complement(); - } - } else { - clear(); - } - } else if (UCHAR_INT_START <= prop && prop < UCHAR_INT_LIMIT) { - const UCPMap *map = u_getIntPropertyMap(prop, &ec); - applyIntPropertyValue(map, intValueFilter, &value, ec); - } else { - // This code used to always call getInclusions(property source) - // which sets an error for an unsupported property. - ec = U_ILLEGAL_ARGUMENT_ERROR; - // Otherwise we would just clear() this set because - // getIntPropertyValue(c, prop) returns 0 for all code points. - } - return *this; -} - -UnicodeSet& -UnicodeSet::applyPropertyAlias(const UnicodeString& prop, - const UnicodeString& value, - UErrorCode& ec) { - if (U_FAILURE(ec) || isFrozen()) return *this; - - // prop and value used to be converted to char * using the default - // converter instead of the invariant conversion. - // This should not be necessary because all Unicode property and value - // names use only invariant characters. - // If there are any variant characters, then we won't find them anyway. - // Checking first avoids assertion failures in the conversion. - if( !uprv_isInvariantUString(prop.getBuffer(), prop.length()) || - !uprv_isInvariantUString(value.getBuffer(), value.length()) - ) { - FAIL(ec); - } - CharString pname, vname; - pname.appendInvariantChars(prop, ec); - vname.appendInvariantChars(value, ec); - if (U_FAILURE(ec)) return *this; - - UProperty p; - int32_t v; - UBool invert = FALSE; - - if (value.length() > 0) { - p = u_getPropertyEnum(pname.data()); - if (p == UCHAR_INVALID_CODE) FAIL(ec); - - // Treat gc as gcm - if (p == UCHAR_GENERAL_CATEGORY) { - p = UCHAR_GENERAL_CATEGORY_MASK; - } - - if ((p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) || - (p >= UCHAR_INT_START && p < UCHAR_INT_LIMIT) || - (p >= UCHAR_MASK_START && p < UCHAR_MASK_LIMIT)) { - v = u_getPropertyValueEnum(p, vname.data()); - if (v == UCHAR_INVALID_CODE) { - // Handle numeric CCC - if (p == UCHAR_CANONICAL_COMBINING_CLASS || - p == UCHAR_TRAIL_CANONICAL_COMBINING_CLASS || - p == UCHAR_LEAD_CANONICAL_COMBINING_CLASS) { - char* end; - double val = uprv_strtod(vname.data(), &end); - // Anything between 0 and 255 is valid even if unused. - // Cast double->int only after range check. - // We catch NaN here because comparing it with both 0 and 255 will be false - // (as are all comparisons with NaN). - if (*end != 0 || !(0 <= val && val <= 255) || - (v = (int32_t)val) != val) { - // non-integral value or outside 0..255, or trailing junk - FAIL(ec); - } - } else { - FAIL(ec); - } - } - } - - else { - - switch (p) { - case UCHAR_NUMERIC_VALUE: - { - char* end; - double val = uprv_strtod(vname.data(), &end); - if (*end != 0) { - FAIL(ec); - } - applyFilter(numericValueFilter, &val, - CharacterProperties::getInclusionsForProperty(p, ec), ec); - return *this; - } - case UCHAR_NAME: - { - // Must munge name, since u_charFromName() does not do - // 'loose' matching. - char buf[128]; // it suffices that this be > uprv_getMaxCharNameLength - if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); - UChar32 ch = u_charFromName(U_EXTENDED_CHAR_NAME, buf, &ec); - if (U_SUCCESS(ec)) { - clear(); - add(ch); - return *this; - } else { - FAIL(ec); - } - } - case UCHAR_UNICODE_1_NAME: - // ICU 49 deprecates the Unicode_1_Name property APIs. - FAIL(ec); - case UCHAR_AGE: - { - // Must munge name, since u_versionFromString() does not do - // 'loose' matching. - char buf[128]; - if (!mungeCharName(buf, vname.data(), sizeof(buf))) FAIL(ec); - UVersionInfo version; - u_versionFromString(version, buf); - applyFilter(versionFilter, &version, - CharacterProperties::getInclusionsForProperty(p, ec), ec); - return *this; - } - case UCHAR_SCRIPT_EXTENSIONS: - v = u_getPropertyValueEnum(UCHAR_SCRIPT, vname.data()); - if (v == UCHAR_INVALID_CODE) { - FAIL(ec); - } - // fall through to calling applyIntPropertyValue() - break; - default: - // p is a non-binary, non-enumerated property that we - // don't support (yet). - FAIL(ec); - } - } - } - - else { - // value is empty. Interpret as General Category, Script, or - // Binary property. - p = UCHAR_GENERAL_CATEGORY_MASK; - v = u_getPropertyValueEnum(p, pname.data()); - if (v == UCHAR_INVALID_CODE) { - p = UCHAR_SCRIPT; - v = u_getPropertyValueEnum(p, pname.data()); - if (v == UCHAR_INVALID_CODE) { - p = u_getPropertyEnum(pname.data()); - if (p >= UCHAR_BINARY_START && p < UCHAR_BINARY_LIMIT) { - v = 1; - } else if (0 == uprv_comparePropertyNames(ANY, pname.data())) { - set(MIN_VALUE, MAX_VALUE); - return *this; - } else if (0 == uprv_comparePropertyNames(ASCII, pname.data())) { - set(0, 0x7F); - return *this; - } else if (0 == uprv_comparePropertyNames(ASSIGNED, pname.data())) { - // [:Assigned:]=[:^Cn:] - p = UCHAR_GENERAL_CATEGORY_MASK; - v = U_GC_CN_MASK; - invert = TRUE; - } else { - FAIL(ec); - } - } - } - } - - applyIntPropertyValue(p, v, ec); - if(invert) { - complement(); - } - - if (isBogus() && U_SUCCESS(ec)) { - // We likely ran out of memory. AHHH! - ec = U_MEMORY_ALLOCATION_ERROR; - } - return *this; -} - -//---------------------------------------------------------------- -// Property set patterns -//---------------------------------------------------------------- - -/** - * Return true if the given position, in the given pattern, appears - * to be the start of a property set pattern. - */ -UBool UnicodeSet::resemblesPropertyPattern(const UnicodeString& pattern, - int32_t pos) { - // Patterns are at least 5 characters long - if ((pos+5) > pattern.length()) { - return FALSE; - } - - // Look for an opening [:, [:^, \p, or \P - return isPOSIXOpen(pattern, pos) || isPerlOpen(pattern, pos) || isNameOpen(pattern, pos); -} - -/** - * Return true if the given iterator appears to point at a - * property pattern. Regardless of the result, return with the - * iterator unchanged. - * @param chars iterator over the pattern characters. Upon return - * it will be unchanged. - * @param iterOpts RuleCharacterIterator options - */ -UBool UnicodeSet::resemblesPropertyPattern(RuleCharacterIterator& chars, - int32_t iterOpts) { - // NOTE: literal will always be FALSE, because we don't parse escapes. - UBool result = FALSE, literal; - UErrorCode ec = U_ZERO_ERROR; - iterOpts &= ~RuleCharacterIterator::PARSE_ESCAPES; - RuleCharacterIterator::Pos pos; - chars.getPos(pos); - UChar32 c = chars.next(iterOpts, literal, ec); - if (c == 0x5B /*'['*/ || c == 0x5C /*'\\'*/) { - UChar32 d = chars.next(iterOpts & ~RuleCharacterIterator::SKIP_WHITESPACE, - literal, ec); - result = (c == 0x5B /*'['*/) ? (d == 0x3A /*':'*/) : - (d == 0x4E /*'N'*/ || d == 0x70 /*'p'*/ || d == 0x50 /*'P'*/); - } - chars.setPos(pos); - return result && U_SUCCESS(ec); -} - -/** - * Parse the given property pattern at the given parse position. - */ -UnicodeSet& UnicodeSet::applyPropertyPattern(const UnicodeString& pattern, - ParsePosition& ppos, - UErrorCode &ec) { - int32_t pos = ppos.getIndex(); - - UBool posix = FALSE; // true for [:pat:], false for \p{pat} \P{pat} \N{pat} - UBool isName = FALSE; // true for \N{pat}, o/w false - UBool invert = FALSE; - - if (U_FAILURE(ec)) return *this; - - // Minimum length is 5 characters, e.g. \p{L} - if ((pos+5) > pattern.length()) { - FAIL(ec); - } - - // On entry, ppos should point to one of the following locations: - // Look for an opening [:, [:^, \p, or \P - if (isPOSIXOpen(pattern, pos)) { - posix = TRUE; - pos += 2; - pos = ICU_Utility::skipWhitespace(pattern, pos); - if (pos < pattern.length() && pattern.charAt(pos) == COMPLEMENT) { - ++pos; - invert = TRUE; - } - } else if (isPerlOpen(pattern, pos) || isNameOpen(pattern, pos)) { - UChar c = pattern.charAt(pos+1); - invert = (c == UPPER_P); - isName = (c == UPPER_N); - pos += 2; - pos = ICU_Utility::skipWhitespace(pattern, pos); - if (pos == pattern.length() || pattern.charAt(pos++) != OPEN_BRACE) { - // Syntax error; "\p" or "\P" not followed by "{" - FAIL(ec); - } - } else { - // Open delimiter not seen - FAIL(ec); - } - - // Look for the matching close delimiter, either :] or } - int32_t close; - if (posix) { - close = pattern.indexOf(POSIX_CLOSE, 2, pos); - } else { - close = pattern.indexOf(CLOSE_BRACE, pos); - } - if (close < 0) { - // Syntax error; close delimiter missing - FAIL(ec); - } - - // Look for an '=' sign. If this is present, we will parse a - // medium \p{gc=Cf} or long \p{GeneralCategory=Format} - // pattern. - int32_t equals = pattern.indexOf(EQUALS, pos); - UnicodeString propName, valueName; - if (equals >= 0 && equals < close && !isName) { - // Equals seen; parse medium/long pattern - pattern.extractBetween(pos, equals, propName); - pattern.extractBetween(equals+1, close, valueName); - } - - else { - // Handle case where no '=' is seen, and \N{} - pattern.extractBetween(pos, close, propName); - - // Handle \N{name} - if (isName) { - // This is a little inefficient since it means we have to - // parse NAME_PROP back to UCHAR_NAME even though we already - // know it's UCHAR_NAME. If we refactor the API to - // support args of (UProperty, char*) then we can remove - // NAME_PROP and make this a little more efficient. - valueName = propName; - propName = UnicodeString(NAME_PROP, NAME_PROP_LENGTH, US_INV); - } - } - - applyPropertyAlias(propName, valueName, ec); - - if (U_SUCCESS(ec)) { - if (invert) { - complement(); - } - - // Move to the limit position after the close delimiter if the - // parse succeeded. - ppos.setIndex(close + (posix ? 2 : 1)); - } - - return *this; -} - -/** - * Parse a property pattern. - * @param chars iterator over the pattern characters. Upon return - * it will be advanced to the first character after the parsed - * pattern, or the end of the iteration if all characters are - * parsed. - * @param rebuiltPat the pattern that was parsed, rebuilt or - * copied from the input pattern, as appropriate. - */ -void UnicodeSet::applyPropertyPattern(RuleCharacterIterator& chars, - UnicodeString& rebuiltPat, - UErrorCode& ec) { - if (U_FAILURE(ec)) return; - UnicodeString pattern; - chars.lookahead(pattern); - ParsePosition pos(0); - applyPropertyPattern(pattern, pos, ec); - if (U_FAILURE(ec)) return; - if (pos.getIndex() == 0) { - // syntaxError(chars, "Invalid property pattern"); - ec = U_MALFORMED_SET; - return; - } - chars.jumpahead(pos.getIndex()); - rebuiltPat.append(pattern, 0, pos.getIndex()); -} - -U_NAMESPACE_END |