diff options
Diffstat (limited to 'deps/node/deps/icu-small/source/i18n/collationsets.cpp')
-rw-r--r-- | deps/node/deps/icu-small/source/i18n/collationsets.cpp | 612 |
1 files changed, 0 insertions, 612 deletions
diff --git a/deps/node/deps/icu-small/source/i18n/collationsets.cpp b/deps/node/deps/icu-small/source/i18n/collationsets.cpp deleted file mode 100644 index 09581416..00000000 --- a/deps/node/deps/icu-small/source/i18n/collationsets.cpp +++ /dev/null @@ -1,612 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -******************************************************************************* -* Copyright (C) 2013-2014, International Business Machines -* Corporation and others. All Rights Reserved. -******************************************************************************* -* collationsets.cpp -* -* created on: 2013feb09 -* created by: Markus W. Scherer -*/ - -#include "unicode/utypes.h" - -#if !UCONFIG_NO_COLLATION - -#include "unicode/ucharstrie.h" -#include "unicode/uniset.h" -#include "unicode/unistr.h" -#include "unicode/ustringtrie.h" -#include "collation.h" -#include "collationdata.h" -#include "collationsets.h" -#include "normalizer2impl.h" -#include "uassert.h" -#include "utf16collationiterator.h" -#include "utrie2.h" - -U_NAMESPACE_BEGIN - -U_CDECL_BEGIN - -static UBool U_CALLCONV -enumTailoredRange(const void *context, UChar32 start, UChar32 end, uint32_t ce32) { - if(ce32 == Collation::FALLBACK_CE32) { - return TRUE; // fallback to base, not tailored - } - TailoredSet *ts = (TailoredSet *)context; - return ts->handleCE32(start, end, ce32); -} - -U_CDECL_END - -void -TailoredSet::forData(const CollationData *d, UErrorCode &ec) { - if(U_FAILURE(ec)) { return; } - errorCode = ec; // Preserve info & warning codes. - data = d; - baseData = d->base; - U_ASSERT(baseData != NULL); - utrie2_enum(data->trie, NULL, enumTailoredRange, this); - ec = errorCode; -} - -UBool -TailoredSet::handleCE32(UChar32 start, UChar32 end, uint32_t ce32) { - U_ASSERT(ce32 != Collation::FALLBACK_CE32); - if(Collation::isSpecialCE32(ce32)) { - ce32 = data->getIndirectCE32(ce32); - if(ce32 == Collation::FALLBACK_CE32) { - return U_SUCCESS(errorCode); - } - } - do { - uint32_t baseCE32 = baseData->getFinalCE32(baseData->getCE32(start)); - // Do not just continue if ce32 == baseCE32 because - // contractions and expansions in different data objects - // normally differ even if they have the same data offsets. - if(Collation::isSelfContainedCE32(ce32) && Collation::isSelfContainedCE32(baseCE32)) { - // fastpath - if(ce32 != baseCE32) { - tailored->add(start); - } - } else { - compare(start, ce32, baseCE32); - } - } while(++start <= end); - return U_SUCCESS(errorCode); -} - -void -TailoredSet::compare(UChar32 c, uint32_t ce32, uint32_t baseCE32) { - if(Collation::isPrefixCE32(ce32)) { - const UChar *p = data->contexts + Collation::indexFromCE32(ce32); - ce32 = data->getFinalCE32(CollationData::readCE32(p)); - if(Collation::isPrefixCE32(baseCE32)) { - const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32); - baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q)); - comparePrefixes(c, p + 2, q + 2); - } else { - addPrefixes(data, c, p + 2); - } - } else if(Collation::isPrefixCE32(baseCE32)) { - const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32); - baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q)); - addPrefixes(baseData, c, q + 2); - } - - if(Collation::isContractionCE32(ce32)) { - const UChar *p = data->contexts + Collation::indexFromCE32(ce32); - if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) { - ce32 = Collation::NO_CE32; - } else { - ce32 = data->getFinalCE32(CollationData::readCE32(p)); - } - if(Collation::isContractionCE32(baseCE32)) { - const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32); - if((baseCE32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) { - baseCE32 = Collation::NO_CE32; - } else { - baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q)); - } - compareContractions(c, p + 2, q + 2); - } else { - addContractions(c, p + 2); - } - } else if(Collation::isContractionCE32(baseCE32)) { - const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32); - baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q)); - addContractions(c, q + 2); - } - - int32_t tag; - if(Collation::isSpecialCE32(ce32)) { - tag = Collation::tagFromCE32(ce32); - U_ASSERT(tag != Collation::PREFIX_TAG); - U_ASSERT(tag != Collation::CONTRACTION_TAG); - // Currently, the tailoring data builder does not write offset tags. - // They might be useful for saving space, - // but they would complicate the builder, - // and in tailorings we assume that performance of tailored characters is more important. - U_ASSERT(tag != Collation::OFFSET_TAG); - } else { - tag = -1; - } - int32_t baseTag; - if(Collation::isSpecialCE32(baseCE32)) { - baseTag = Collation::tagFromCE32(baseCE32); - U_ASSERT(baseTag != Collation::PREFIX_TAG); - U_ASSERT(baseTag != Collation::CONTRACTION_TAG); - } else { - baseTag = -1; - } - - // Non-contextual mappings, expansions, etc. - if(baseTag == Collation::OFFSET_TAG) { - // We might be comparing a tailoring CE which is a copy of - // a base offset-tag CE, via the [optimize [set]] syntax - // or when a single-character mapping was copied for tailored contractions. - // Offset tags always result in long-primary CEs, - // with common secondary/tertiary weights. - if(!Collation::isLongPrimaryCE32(ce32)) { - add(c); - return; - } - int64_t dataCE = baseData->ces[Collation::indexFromCE32(baseCE32)]; - uint32_t p = Collation::getThreeBytePrimaryForOffsetData(c, dataCE); - if(Collation::primaryFromLongPrimaryCE32(ce32) != p) { - add(c); - return; - } - } - - if(tag != baseTag) { - add(c); - return; - } - - if(tag == Collation::EXPANSION32_TAG) { - const uint32_t *ce32s = data->ce32s + Collation::indexFromCE32(ce32); - int32_t length = Collation::lengthFromCE32(ce32); - - const uint32_t *baseCE32s = baseData->ce32s + Collation::indexFromCE32(baseCE32); - int32_t baseLength = Collation::lengthFromCE32(baseCE32); - - if(length != baseLength) { - add(c); - return; - } - for(int32_t i = 0; i < length; ++i) { - if(ce32s[i] != baseCE32s[i]) { - add(c); - break; - } - } - } else if(tag == Collation::EXPANSION_TAG) { - const int64_t *ces = data->ces + Collation::indexFromCE32(ce32); - int32_t length = Collation::lengthFromCE32(ce32); - - const int64_t *baseCEs = baseData->ces + Collation::indexFromCE32(baseCE32); - int32_t baseLength = Collation::lengthFromCE32(baseCE32); - - if(length != baseLength) { - add(c); - return; - } - for(int32_t i = 0; i < length; ++i) { - if(ces[i] != baseCEs[i]) { - add(c); - break; - } - } - } else if(tag == Collation::HANGUL_TAG) { - UChar jamos[3]; - int32_t length = Hangul::decompose(c, jamos); - if(tailored->contains(jamos[0]) || tailored->contains(jamos[1]) || - (length == 3 && tailored->contains(jamos[2]))) { - add(c); - } - } else if(ce32 != baseCE32) { - add(c); - } -} - -void -TailoredSet::comparePrefixes(UChar32 c, const UChar *p, const UChar *q) { - // Parallel iteration over prefixes of both tables. - UCharsTrie::Iterator prefixes(p, 0, errorCode); - UCharsTrie::Iterator basePrefixes(q, 0, errorCode); - const UnicodeString *tp = NULL; // Tailoring prefix. - const UnicodeString *bp = NULL; // Base prefix. - // Use a string with a U+FFFF as the limit sentinel. - // U+FFFF is untailorable and will not occur in prefixes. - UnicodeString none((UChar)0xffff); - for(;;) { - if(tp == NULL) { - if(prefixes.next(errorCode)) { - tp = &prefixes.getString(); - } else { - tp = &none; - } - } - if(bp == NULL) { - if(basePrefixes.next(errorCode)) { - bp = &basePrefixes.getString(); - } else { - bp = &none; - } - } - if(tp == &none && bp == &none) { break; } - int32_t cmp = tp->compare(*bp); - if(cmp < 0) { - // tp occurs in the tailoring but not in the base. - addPrefix(data, *tp, c, (uint32_t)prefixes.getValue()); - tp = NULL; - } else if(cmp > 0) { - // bp occurs in the base but not in the tailoring. - addPrefix(baseData, *bp, c, (uint32_t)basePrefixes.getValue()); - bp = NULL; - } else { - setPrefix(*tp); - compare(c, (uint32_t)prefixes.getValue(), (uint32_t)basePrefixes.getValue()); - resetPrefix(); - tp = NULL; - bp = NULL; - } - } -} - -void -TailoredSet::compareContractions(UChar32 c, const UChar *p, const UChar *q) { - // Parallel iteration over suffixes of both tables. - UCharsTrie::Iterator suffixes(p, 0, errorCode); - UCharsTrie::Iterator baseSuffixes(q, 0, errorCode); - const UnicodeString *ts = NULL; // Tailoring suffix. - const UnicodeString *bs = NULL; // Base suffix. - // Use a string with two U+FFFF as the limit sentinel. - // U+FFFF is untailorable and will not occur in contractions except maybe - // as a single suffix character for a root-collator boundary contraction. - UnicodeString none((UChar)0xffff); - none.append((UChar)0xffff); - for(;;) { - if(ts == NULL) { - if(suffixes.next(errorCode)) { - ts = &suffixes.getString(); - } else { - ts = &none; - } - } - if(bs == NULL) { - if(baseSuffixes.next(errorCode)) { - bs = &baseSuffixes.getString(); - } else { - bs = &none; - } - } - if(ts == &none && bs == &none) { break; } - int32_t cmp = ts->compare(*bs); - if(cmp < 0) { - // ts occurs in the tailoring but not in the base. - addSuffix(c, *ts); - ts = NULL; - } else if(cmp > 0) { - // bs occurs in the base but not in the tailoring. - addSuffix(c, *bs); - bs = NULL; - } else { - suffix = ts; - compare(c, (uint32_t)suffixes.getValue(), (uint32_t)baseSuffixes.getValue()); - suffix = NULL; - ts = NULL; - bs = NULL; - } - } -} - -void -TailoredSet::addPrefixes(const CollationData *d, UChar32 c, const UChar *p) { - UCharsTrie::Iterator prefixes(p, 0, errorCode); - while(prefixes.next(errorCode)) { - addPrefix(d, prefixes.getString(), c, (uint32_t)prefixes.getValue()); - } -} - -void -TailoredSet::addPrefix(const CollationData *d, const UnicodeString &pfx, UChar32 c, uint32_t ce32) { - setPrefix(pfx); - ce32 = d->getFinalCE32(ce32); - if(Collation::isContractionCE32(ce32)) { - const UChar *p = d->contexts + Collation::indexFromCE32(ce32); - addContractions(c, p + 2); - } - tailored->add(UnicodeString(unreversedPrefix).append(c)); - resetPrefix(); -} - -void -TailoredSet::addContractions(UChar32 c, const UChar *p) { - UCharsTrie::Iterator suffixes(p, 0, errorCode); - while(suffixes.next(errorCode)) { - addSuffix(c, suffixes.getString()); - } -} - -void -TailoredSet::addSuffix(UChar32 c, const UnicodeString &sfx) { - tailored->add(UnicodeString(unreversedPrefix).append(c).append(sfx)); -} - -void -TailoredSet::add(UChar32 c) { - if(unreversedPrefix.isEmpty() && suffix == NULL) { - tailored->add(c); - } else { - UnicodeString s(unreversedPrefix); - s.append(c); - if(suffix != NULL) { - s.append(*suffix); - } - tailored->add(s); - } -} - -ContractionsAndExpansions::CESink::~CESink() {} - -U_CDECL_BEGIN - -static UBool U_CALLCONV -enumCnERange(const void *context, UChar32 start, UChar32 end, uint32_t ce32) { - ContractionsAndExpansions *cne = (ContractionsAndExpansions *)context; - if(cne->checkTailored == 0) { - // There is no tailoring. - // No need to collect nor check the tailored set. - } else if(cne->checkTailored < 0) { - // Collect the set of code points with mappings in the tailoring data. - if(ce32 == Collation::FALLBACK_CE32) { - return TRUE; // fallback to base, not tailored - } else { - cne->tailored.add(start, end); - } - // checkTailored > 0: Exclude tailored ranges from the base data enumeration. - } else if(start == end) { - if(cne->tailored.contains(start)) { - return TRUE; - } - } else if(cne->tailored.containsSome(start, end)) { - cne->ranges.set(start, end).removeAll(cne->tailored); - int32_t count = cne->ranges.getRangeCount(); - for(int32_t i = 0; i < count; ++i) { - cne->handleCE32(cne->ranges.getRangeStart(i), cne->ranges.getRangeEnd(i), ce32); - } - return U_SUCCESS(cne->errorCode); - } - cne->handleCE32(start, end, ce32); - return U_SUCCESS(cne->errorCode); -} - -U_CDECL_END - -void -ContractionsAndExpansions::forData(const CollationData *d, UErrorCode &ec) { - if(U_FAILURE(ec)) { return; } - errorCode = ec; // Preserve info & warning codes. - // Add all from the data, can be tailoring or base. - if(d->base != NULL) { - checkTailored = -1; - } - data = d; - utrie2_enum(data->trie, NULL, enumCnERange, this); - if(d->base == NULL || U_FAILURE(errorCode)) { - ec = errorCode; - return; - } - // Add all from the base data but only for un-tailored code points. - tailored.freeze(); - checkTailored = 1; - data = d->base; - utrie2_enum(data->trie, NULL, enumCnERange, this); - ec = errorCode; -} - -void -ContractionsAndExpansions::forCodePoint(const CollationData *d, UChar32 c, UErrorCode &ec) { - if(U_FAILURE(ec)) { return; } - errorCode = ec; // Preserve info & warning codes. - uint32_t ce32 = d->getCE32(c); - if(ce32 == Collation::FALLBACK_CE32) { - d = d->base; - ce32 = d->getCE32(c); - } - data = d; - handleCE32(c, c, ce32); - ec = errorCode; -} - -void -ContractionsAndExpansions::handleCE32(UChar32 start, UChar32 end, uint32_t ce32) { - for(;;) { - if((ce32 & 0xff) < Collation::SPECIAL_CE32_LOW_BYTE) { - // !isSpecialCE32() - if(sink != NULL) { - sink->handleCE(Collation::ceFromSimpleCE32(ce32)); - } - return; - } - switch(Collation::tagFromCE32(ce32)) { - case Collation::FALLBACK_TAG: - return; - case Collation::RESERVED_TAG_3: - case Collation::BUILDER_DATA_TAG: - case Collation::LEAD_SURROGATE_TAG: - if(U_SUCCESS(errorCode)) { errorCode = U_INTERNAL_PROGRAM_ERROR; } - return; - case Collation::LONG_PRIMARY_TAG: - if(sink != NULL) { - sink->handleCE(Collation::ceFromLongPrimaryCE32(ce32)); - } - return; - case Collation::LONG_SECONDARY_TAG: - if(sink != NULL) { - sink->handleCE(Collation::ceFromLongSecondaryCE32(ce32)); - } - return; - case Collation::LATIN_EXPANSION_TAG: - if(sink != NULL) { - ces[0] = Collation::latinCE0FromCE32(ce32); - ces[1] = Collation::latinCE1FromCE32(ce32); - sink->handleExpansion(ces, 2); - } - // Optimization: If we have a prefix, - // then the relevant strings have been added already. - if(unreversedPrefix.isEmpty()) { - addExpansions(start, end); - } - return; - case Collation::EXPANSION32_TAG: - if(sink != NULL) { - const uint32_t *ce32s = data->ce32s + Collation::indexFromCE32(ce32); - int32_t length = Collation::lengthFromCE32(ce32); - for(int32_t i = 0; i < length; ++i) { - ces[i] = Collation::ceFromCE32(*ce32s++); - } - sink->handleExpansion(ces, length); - } - // Optimization: If we have a prefix, - // then the relevant strings have been added already. - if(unreversedPrefix.isEmpty()) { - addExpansions(start, end); - } - return; - case Collation::EXPANSION_TAG: - if(sink != NULL) { - int32_t length = Collation::lengthFromCE32(ce32); - sink->handleExpansion(data->ces + Collation::indexFromCE32(ce32), length); - } - // Optimization: If we have a prefix, - // then the relevant strings have been added already. - if(unreversedPrefix.isEmpty()) { - addExpansions(start, end); - } - return; - case Collation::PREFIX_TAG: - handlePrefixes(start, end, ce32); - return; - case Collation::CONTRACTION_TAG: - handleContractions(start, end, ce32); - return; - case Collation::DIGIT_TAG: - // Fetch the non-numeric-collation CE32 and continue. - ce32 = data->ce32s[Collation::indexFromCE32(ce32)]; - break; - case Collation::U0000_TAG: - U_ASSERT(start == 0 && end == 0); - // Fetch the normal ce32 for U+0000 and continue. - ce32 = data->ce32s[0]; - break; - case Collation::HANGUL_TAG: - if(sink != NULL) { - // TODO: This should be optimized, - // especially if [start..end] is the complete Hangul range. (assert that) - UTF16CollationIterator iter(data, FALSE, NULL, NULL, NULL); - UChar hangul[1] = { 0 }; - for(UChar32 c = start; c <= end; ++c) { - hangul[0] = (UChar)c; - iter.setText(hangul, hangul + 1); - int32_t length = iter.fetchCEs(errorCode); - if(U_FAILURE(errorCode)) { return; } - // Ignore the terminating non-CE. - U_ASSERT(length >= 2 && iter.getCE(length - 1) == Collation::NO_CE); - sink->handleExpansion(iter.getCEs(), length - 1); - } - } - // Optimization: If we have a prefix, - // then the relevant strings have been added already. - if(unreversedPrefix.isEmpty()) { - addExpansions(start, end); - } - return; - case Collation::OFFSET_TAG: - // Currently no need to send offset CEs to the sink. - return; - case Collation::IMPLICIT_TAG: - // Currently no need to send implicit CEs to the sink. - return; - } - } -} - -void -ContractionsAndExpansions::handlePrefixes( - UChar32 start, UChar32 end, uint32_t ce32) { - const UChar *p = data->contexts + Collation::indexFromCE32(ce32); - ce32 = CollationData::readCE32(p); // Default if no prefix match. - handleCE32(start, end, ce32); - if(!addPrefixes) { return; } - UCharsTrie::Iterator prefixes(p + 2, 0, errorCode); - while(prefixes.next(errorCode)) { - setPrefix(prefixes.getString()); - // Prefix/pre-context mappings are special kinds of contractions - // that always yield expansions. - addStrings(start, end, contractions); - addStrings(start, end, expansions); - handleCE32(start, end, (uint32_t)prefixes.getValue()); - } - resetPrefix(); -} - -void -ContractionsAndExpansions::handleContractions( - UChar32 start, UChar32 end, uint32_t ce32) { - const UChar *p = data->contexts + Collation::indexFromCE32(ce32); - if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) { - // No match on the single code point. - // We are underneath a prefix, and the default mapping is just - // a fallback to the mappings for a shorter prefix. - U_ASSERT(!unreversedPrefix.isEmpty()); - } else { - ce32 = CollationData::readCE32(p); // Default if no suffix match. - U_ASSERT(!Collation::isContractionCE32(ce32)); - handleCE32(start, end, ce32); - } - UCharsTrie::Iterator suffixes(p + 2, 0, errorCode); - while(suffixes.next(errorCode)) { - suffix = &suffixes.getString(); - addStrings(start, end, contractions); - if(!unreversedPrefix.isEmpty()) { - addStrings(start, end, expansions); - } - handleCE32(start, end, (uint32_t)suffixes.getValue()); - } - suffix = NULL; -} - -void -ContractionsAndExpansions::addExpansions(UChar32 start, UChar32 end) { - if(unreversedPrefix.isEmpty() && suffix == NULL) { - if(expansions != NULL) { - expansions->add(start, end); - } - } else { - addStrings(start, end, expansions); - } -} - -void -ContractionsAndExpansions::addStrings(UChar32 start, UChar32 end, UnicodeSet *set) { - if(set == NULL) { return; } - UnicodeString s(unreversedPrefix); - do { - s.append(start); - if(suffix != NULL) { - s.append(*suffix); - } - set->add(s); - s.truncate(unreversedPrefix.length()); - } while(++start <= end); -} - -U_NAMESPACE_END - -#endif // !UCONFIG_NO_COLLATION |