diff options
Diffstat (limited to 'deps/node/deps/icu-small/source/i18n/collationfastlatin.cpp')
-rw-r--r-- | deps/node/deps/icu-small/source/i18n/collationfastlatin.cpp | 1099 |
1 files changed, 0 insertions, 1099 deletions
diff --git a/deps/node/deps/icu-small/source/i18n/collationfastlatin.cpp b/deps/node/deps/icu-small/source/i18n/collationfastlatin.cpp deleted file mode 100644 index b98b8457..00000000 --- a/deps/node/deps/icu-small/source/i18n/collationfastlatin.cpp +++ /dev/null @@ -1,1099 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -******************************************************************************* -* Copyright (C) 2013-2015, International Business Machines -* Corporation and others. All Rights Reserved. -******************************************************************************* -* collationfastlatin.cpp -* -* created on: 2013aug18 -* created by: Markus W. Scherer -*/ - -#include "unicode/utypes.h" - -#if !UCONFIG_NO_COLLATION - -#include "unicode/ucol.h" -#include "collationdata.h" -#include "collationfastlatin.h" -#include "collationsettings.h" -#include "uassert.h" - -U_NAMESPACE_BEGIN - -int32_t -CollationFastLatin::getOptions(const CollationData *data, const CollationSettings &settings, - uint16_t *primaries, int32_t capacity) { - const uint16_t *table = data->fastLatinTable; - if(table == NULL) { return -1; } - U_ASSERT(capacity == LATIN_LIMIT); - if(capacity != LATIN_LIMIT) { return -1; } - - uint32_t miniVarTop; - if((settings.options & CollationSettings::ALTERNATE_MASK) == 0) { - // No mini primaries are variable, set a variableTop just below the - // lowest long mini primary. - miniVarTop = MIN_LONG - 1; - } else { - int32_t headerLength = *table & 0xff; - int32_t i = 1 + settings.getMaxVariable(); - if(i >= headerLength) { - return -1; // variableTop >= digits, should not occur - } - miniVarTop = table[i]; - } - - UBool digitsAreReordered = FALSE; - if(settings.hasReordering()) { - uint32_t prevStart = 0; - uint32_t beforeDigitStart = 0; - uint32_t digitStart = 0; - uint32_t afterDigitStart = 0; - for(int32_t group = UCOL_REORDER_CODE_FIRST; - group < UCOL_REORDER_CODE_FIRST + CollationData::MAX_NUM_SPECIAL_REORDER_CODES; - ++group) { - uint32_t start = data->getFirstPrimaryForGroup(group); - start = settings.reorder(start); - if(group == UCOL_REORDER_CODE_DIGIT) { - beforeDigitStart = prevStart; - digitStart = start; - } else if(start != 0) { - if(start < prevStart) { - // The permutation affects the groups up to Latin. - return -1; - } - // In the future, there might be a special group between digits & Latin. - if(digitStart != 0 && afterDigitStart == 0 && prevStart == beforeDigitStart) { - afterDigitStart = start; - } - prevStart = start; - } - } - uint32_t latinStart = data->getFirstPrimaryForGroup(USCRIPT_LATIN); - latinStart = settings.reorder(latinStart); - if(latinStart < prevStart) { - return -1; - } - if(afterDigitStart == 0) { - afterDigitStart = latinStart; - } - if(!(beforeDigitStart < digitStart && digitStart < afterDigitStart)) { - digitsAreReordered = TRUE; - } - } - - table += (table[0] & 0xff); // skip the header - for(UChar32 c = 0; c < LATIN_LIMIT; ++c) { - uint32_t p = table[c]; - if(p >= MIN_SHORT) { - p &= SHORT_PRIMARY_MASK; - } else if(p > miniVarTop) { - p &= LONG_PRIMARY_MASK; - } else { - p = 0; - } - primaries[c] = (uint16_t)p; - } - if(digitsAreReordered || (settings.options & CollationSettings::NUMERIC) != 0) { - // Bail out for digits. - for(UChar32 c = 0x30; c <= 0x39; ++c) { primaries[c] = 0; } - } - - // Shift the miniVarTop above other options. - return ((int32_t)miniVarTop << 16) | settings.options; -} - -int32_t -CollationFastLatin::compareUTF16(const uint16_t *table, const uint16_t *primaries, int32_t options, - const UChar *left, int32_t leftLength, - const UChar *right, int32_t rightLength) { - // This is a modified copy of CollationCompare::compareUpToQuaternary(), - // optimized for common Latin text. - // Keep them in sync! - // Keep compareUTF16() and compareUTF8() in sync very closely! - - U_ASSERT((table[0] >> 8) == VERSION); - table += (table[0] & 0xff); // skip the header - uint32_t variableTop = (uint32_t)options >> 16; // see getOptions() - options &= 0xffff; // needed for CollationSettings::getStrength() to work - - // Check for supported characters, fetch mini CEs, and compare primaries. - int32_t leftIndex = 0, rightIndex = 0; - /** - * Single mini CE or a pair. - * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. - * If there is only one, then it is in the lower bits, and the upper bits are 0. - */ - uint32_t leftPair = 0, rightPair = 0; - for(;;) { - // We fetch CEs until we get a non-ignorable primary or reach the end. - while(leftPair == 0) { - if(leftIndex == leftLength) { - leftPair = EOS; - break; - } - UChar32 c = left[leftIndex++]; - if(c <= LATIN_MAX) { - leftPair = primaries[c]; - if(leftPair != 0) { break; } - if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { - return BAIL_OUT_RESULT; - } - leftPair = table[c]; - } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { - leftPair = table[c - PUNCT_START + LATIN_LIMIT]; - } else { - leftPair = lookup(table, c); - } - if(leftPair >= MIN_SHORT) { - leftPair &= SHORT_PRIMARY_MASK; - break; - } else if(leftPair > variableTop) { - leftPair &= LONG_PRIMARY_MASK; - break; - } else { - leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); - if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } - leftPair = getPrimaries(variableTop, leftPair); - } - } - - while(rightPair == 0) { - if(rightIndex == rightLength) { - rightPair = EOS; - break; - } - UChar32 c = right[rightIndex++]; - if(c <= LATIN_MAX) { - rightPair = primaries[c]; - if(rightPair != 0) { break; } - if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { - return BAIL_OUT_RESULT; - } - rightPair = table[c]; - } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { - rightPair = table[c - PUNCT_START + LATIN_LIMIT]; - } else { - rightPair = lookup(table, c); - } - if(rightPair >= MIN_SHORT) { - rightPair &= SHORT_PRIMARY_MASK; - break; - } else if(rightPair > variableTop) { - rightPair &= LONG_PRIMARY_MASK; - break; - } else { - rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); - if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } - rightPair = getPrimaries(variableTop, rightPair); - } - } - - if(leftPair == rightPair) { - if(leftPair == EOS) { break; } - leftPair = rightPair = 0; - continue; - } - uint32_t leftPrimary = leftPair & 0xffff; - uint32_t rightPrimary = rightPair & 0xffff; - if(leftPrimary != rightPrimary) { - // Return the primary difference. - return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; - } - if(leftPair == EOS) { break; } - leftPair >>= 16; - rightPair >>= 16; - } - // In the following, we need to re-fetch each character because we did not buffer the CEs, - // but we know that the string is well-formed and - // only contains supported characters and mappings. - - // We might skip the secondary level but continue with the case level - // which is turned on separately. - if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { - leftIndex = rightIndex = 0; - leftPair = rightPair = 0; - for(;;) { - while(leftPair == 0) { - if(leftIndex == leftLength) { - leftPair = EOS; - break; - } - UChar32 c = left[leftIndex++]; - if(c <= LATIN_MAX) { - leftPair = table[c]; - } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { - leftPair = table[c - PUNCT_START + LATIN_LIMIT]; - } else { - leftPair = lookup(table, c); - } - if(leftPair >= MIN_SHORT) { - leftPair = getSecondariesFromOneShortCE(leftPair); - break; - } else if(leftPair > variableTop) { - leftPair = COMMON_SEC_PLUS_OFFSET; - break; - } else { - leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); - leftPair = getSecondaries(variableTop, leftPair); - } - } - - while(rightPair == 0) { - if(rightIndex == rightLength) { - rightPair = EOS; - break; - } - UChar32 c = right[rightIndex++]; - if(c <= LATIN_MAX) { - rightPair = table[c]; - } else if(PUNCT_START <= c && c < PUNCT_LIMIT) { - rightPair = table[c - PUNCT_START + LATIN_LIMIT]; - } else { - rightPair = lookup(table, c); - } - if(rightPair >= MIN_SHORT) { - rightPair = getSecondariesFromOneShortCE(rightPair); - break; - } else if(rightPair > variableTop) { - rightPair = COMMON_SEC_PLUS_OFFSET; - break; - } else { - rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); - rightPair = getSecondaries(variableTop, rightPair); - } - } - - if(leftPair == rightPair) { - if(leftPair == EOS) { break; } - leftPair = rightPair = 0; - continue; - } - uint32_t leftSecondary = leftPair & 0xffff; - uint32_t rightSecondary = rightPair & 0xffff; - if(leftSecondary != rightSecondary) { - if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { - // Full support for backwards secondary requires backwards contraction matching - // and moving backwards between merge separators. - return BAIL_OUT_RESULT; - } - return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; - } - if(leftPair == EOS) { break; } - leftPair >>= 16; - rightPair >>= 16; - } - } - - if((options & CollationSettings::CASE_LEVEL) != 0) { - UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; - leftIndex = rightIndex = 0; - leftPair = rightPair = 0; - for(;;) { - while(leftPair == 0) { - if(leftIndex == leftLength) { - leftPair = EOS; - break; - } - UChar32 c = left[leftIndex++]; - leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); - if(leftPair < MIN_LONG) { - leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); - } - leftPair = getCases(variableTop, strengthIsPrimary, leftPair); - } - - while(rightPair == 0) { - if(rightIndex == rightLength) { - rightPair = EOS; - break; - } - UChar32 c = right[rightIndex++]; - rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); - if(rightPair < MIN_LONG) { - rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); - } - rightPair = getCases(variableTop, strengthIsPrimary, rightPair); - } - - if(leftPair == rightPair) { - if(leftPair == EOS) { break; } - leftPair = rightPair = 0; - continue; - } - uint32_t leftCase = leftPair & 0xffff; - uint32_t rightCase = rightPair & 0xffff; - if(leftCase != rightCase) { - if((options & CollationSettings::UPPER_FIRST) == 0) { - return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; - } else { - return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; - } - } - if(leftPair == EOS) { break; } - leftPair >>= 16; - rightPair >>= 16; - } - } - if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } - - // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. - UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); - - leftIndex = rightIndex = 0; - leftPair = rightPair = 0; - for(;;) { - while(leftPair == 0) { - if(leftIndex == leftLength) { - leftPair = EOS; - break; - } - UChar32 c = left[leftIndex++]; - leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); - if(leftPair < MIN_LONG) { - leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); - } - leftPair = getTertiaries(variableTop, withCaseBits, leftPair); - } - - while(rightPair == 0) { - if(rightIndex == rightLength) { - rightPair = EOS; - break; - } - UChar32 c = right[rightIndex++]; - rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); - if(rightPair < MIN_LONG) { - rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); - } - rightPair = getTertiaries(variableTop, withCaseBits, rightPair); - } - - if(leftPair == rightPair) { - if(leftPair == EOS) { break; } - leftPair = rightPair = 0; - continue; - } - uint32_t leftTertiary = leftPair & 0xffff; - uint32_t rightTertiary = rightPair & 0xffff; - if(leftTertiary != rightTertiary) { - if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { - // Pass through EOS and MERGE_WEIGHT - // and keep real tertiary weights larger than the MERGE_WEIGHT. - // Tertiary CEs (secondary ignorables) are not supported in fast Latin. - if(leftTertiary > MERGE_WEIGHT) { - leftTertiary ^= CASE_MASK; - } - if(rightTertiary > MERGE_WEIGHT) { - rightTertiary ^= CASE_MASK; - } - } - return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; - } - if(leftPair == EOS) { break; } - leftPair >>= 16; - rightPair >>= 16; - } - if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } - - leftIndex = rightIndex = 0; - leftPair = rightPair = 0; - for(;;) { - while(leftPair == 0) { - if(leftIndex == leftLength) { - leftPair = EOS; - break; - } - UChar32 c = left[leftIndex++]; - leftPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); - if(leftPair < MIN_LONG) { - leftPair = nextPair(table, c, leftPair, left, NULL, leftIndex, leftLength); - } - leftPair = getQuaternaries(variableTop, leftPair); - } - - while(rightPair == 0) { - if(rightIndex == rightLength) { - rightPair = EOS; - break; - } - UChar32 c = right[rightIndex++]; - rightPair = (c <= LATIN_MAX) ? table[c] : lookup(table, c); - if(rightPair < MIN_LONG) { - rightPair = nextPair(table, c, rightPair, right, NULL, rightIndex, rightLength); - } - rightPair = getQuaternaries(variableTop, rightPair); - } - - if(leftPair == rightPair) { - if(leftPair == EOS) { break; } - leftPair = rightPair = 0; - continue; - } - uint32_t leftQuaternary = leftPair & 0xffff; - uint32_t rightQuaternary = rightPair & 0xffff; - if(leftQuaternary != rightQuaternary) { - return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; - } - if(leftPair == EOS) { break; } - leftPair >>= 16; - rightPair >>= 16; - } - return UCOL_EQUAL; -} - -int32_t -CollationFastLatin::compareUTF8(const uint16_t *table, const uint16_t *primaries, int32_t options, - const uint8_t *left, int32_t leftLength, - const uint8_t *right, int32_t rightLength) { - // Keep compareUTF16() and compareUTF8() in sync very closely! - - U_ASSERT((table[0] >> 8) == VERSION); - table += (table[0] & 0xff); // skip the header - uint32_t variableTop = (uint32_t)options >> 16; // see RuleBasedCollator::getFastLatinOptions() - options &= 0xffff; // needed for CollationSettings::getStrength() to work - - // Check for supported characters, fetch mini CEs, and compare primaries. - int32_t leftIndex = 0, rightIndex = 0; - /** - * Single mini CE or a pair. - * The current mini CE is in the lower 16 bits, the next one is in the upper 16 bits. - * If there is only one, then it is in the lower bits, and the upper bits are 0. - */ - uint32_t leftPair = 0, rightPair = 0; - // Note: There is no need to assemble the code point. - // We only need to look up the table entry for the character, - // and nextPair() looks for whether c==0. - for(;;) { - // We fetch CEs until we get a non-ignorable primary or reach the end. - while(leftPair == 0) { - if(leftIndex == leftLength) { - leftPair = EOS; - break; - } - UChar32 c = left[leftIndex++]; - uint8_t t; - if(c <= 0x7f) { - leftPair = primaries[c]; - if(leftPair != 0) { break; } - if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { - return BAIL_OUT_RESULT; - } - leftPair = table[c]; - } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && leftIndex != leftLength && - 0x80 <= (t = left[leftIndex]) && t <= 0xbf) { - ++leftIndex; - c = ((c - 0xc2) << 6) + t; - leftPair = primaries[c]; - if(leftPair != 0) { break; } - leftPair = table[c]; - } else { - leftPair = lookupUTF8(table, c, left, leftIndex, leftLength); - } - if(leftPair >= MIN_SHORT) { - leftPair &= SHORT_PRIMARY_MASK; - break; - } else if(leftPair > variableTop) { - leftPair &= LONG_PRIMARY_MASK; - break; - } else { - leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); - if(leftPair == BAIL_OUT) { return BAIL_OUT_RESULT; } - leftPair = getPrimaries(variableTop, leftPair); - } - } - - while(rightPair == 0) { - if(rightIndex == rightLength) { - rightPair = EOS; - break; - } - UChar32 c = right[rightIndex++]; - uint8_t t; - if(c <= 0x7f) { - rightPair = primaries[c]; - if(rightPair != 0) { break; } - if(c <= 0x39 && c >= 0x30 && (options & CollationSettings::NUMERIC) != 0) { - return BAIL_OUT_RESULT; - } - rightPair = table[c]; - } else if(c <= LATIN_MAX_UTF8_LEAD && 0xc2 <= c && rightIndex != rightLength && - 0x80 <= (t = right[rightIndex]) && t <= 0xbf) { - ++rightIndex; - c = ((c - 0xc2) << 6) + t; - rightPair = primaries[c]; - if(rightPair != 0) { break; } - rightPair = table[c]; - } else { - rightPair = lookupUTF8(table, c, right, rightIndex, rightLength); - } - if(rightPair >= MIN_SHORT) { - rightPair &= SHORT_PRIMARY_MASK; - break; - } else if(rightPair > variableTop) { - rightPair &= LONG_PRIMARY_MASK; - break; - } else { - rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); - if(rightPair == BAIL_OUT) { return BAIL_OUT_RESULT; } - rightPair = getPrimaries(variableTop, rightPair); - } - } - - if(leftPair == rightPair) { - if(leftPair == EOS) { break; } - leftPair = rightPair = 0; - continue; - } - uint32_t leftPrimary = leftPair & 0xffff; - uint32_t rightPrimary = rightPair & 0xffff; - if(leftPrimary != rightPrimary) { - // Return the primary difference. - return (leftPrimary < rightPrimary) ? UCOL_LESS : UCOL_GREATER; - } - if(leftPair == EOS) { break; } - leftPair >>= 16; - rightPair >>= 16; - } - // In the following, we need to re-fetch each character because we did not buffer the CEs, - // but we know that the string is well-formed and - // only contains supported characters and mappings. - - // We might skip the secondary level but continue with the case level - // which is turned on separately. - if(CollationSettings::getStrength(options) >= UCOL_SECONDARY) { - leftIndex = rightIndex = 0; - leftPair = rightPair = 0; - for(;;) { - while(leftPair == 0) { - if(leftIndex == leftLength) { - leftPair = EOS; - break; - } - UChar32 c = left[leftIndex++]; - if(c <= 0x7f) { - leftPair = table[c]; - } else if(c <= LATIN_MAX_UTF8_LEAD) { - leftPair = table[((c - 0xc2) << 6) + left[leftIndex++]]; - } else { - leftPair = lookupUTF8Unsafe(table, c, left, leftIndex); - } - if(leftPair >= MIN_SHORT) { - leftPair = getSecondariesFromOneShortCE(leftPair); - break; - } else if(leftPair > variableTop) { - leftPair = COMMON_SEC_PLUS_OFFSET; - break; - } else { - leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); - leftPair = getSecondaries(variableTop, leftPair); - } - } - - while(rightPair == 0) { - if(rightIndex == rightLength) { - rightPair = EOS; - break; - } - UChar32 c = right[rightIndex++]; - if(c <= 0x7f) { - rightPair = table[c]; - } else if(c <= LATIN_MAX_UTF8_LEAD) { - rightPair = table[((c - 0xc2) << 6) + right[rightIndex++]]; - } else { - rightPair = lookupUTF8Unsafe(table, c, right, rightIndex); - } - if(rightPair >= MIN_SHORT) { - rightPair = getSecondariesFromOneShortCE(rightPair); - break; - } else if(rightPair > variableTop) { - rightPair = COMMON_SEC_PLUS_OFFSET; - break; - } else { - rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); - rightPair = getSecondaries(variableTop, rightPair); - } - } - - if(leftPair == rightPair) { - if(leftPair == EOS) { break; } - leftPair = rightPair = 0; - continue; - } - uint32_t leftSecondary = leftPair & 0xffff; - uint32_t rightSecondary = rightPair & 0xffff; - if(leftSecondary != rightSecondary) { - if((options & CollationSettings::BACKWARD_SECONDARY) != 0) { - // Full support for backwards secondary requires backwards contraction matching - // and moving backwards between merge separators. - return BAIL_OUT_RESULT; - } - return (leftSecondary < rightSecondary) ? UCOL_LESS : UCOL_GREATER; - } - if(leftPair == EOS) { break; } - leftPair >>= 16; - rightPair >>= 16; - } - } - - if((options & CollationSettings::CASE_LEVEL) != 0) { - UBool strengthIsPrimary = CollationSettings::getStrength(options) == UCOL_PRIMARY; - leftIndex = rightIndex = 0; - leftPair = rightPair = 0; - for(;;) { - while(leftPair == 0) { - if(leftIndex == leftLength) { - leftPair = EOS; - break; - } - UChar32 c = left[leftIndex++]; - leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); - if(leftPair < MIN_LONG) { - leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); - } - leftPair = getCases(variableTop, strengthIsPrimary, leftPair); - } - - while(rightPair == 0) { - if(rightIndex == rightLength) { - rightPair = EOS; - break; - } - UChar32 c = right[rightIndex++]; - rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); - if(rightPair < MIN_LONG) { - rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); - } - rightPair = getCases(variableTop, strengthIsPrimary, rightPair); - } - - if(leftPair == rightPair) { - if(leftPair == EOS) { break; } - leftPair = rightPair = 0; - continue; - } - uint32_t leftCase = leftPair & 0xffff; - uint32_t rightCase = rightPair & 0xffff; - if(leftCase != rightCase) { - if((options & CollationSettings::UPPER_FIRST) == 0) { - return (leftCase < rightCase) ? UCOL_LESS : UCOL_GREATER; - } else { - return (leftCase < rightCase) ? UCOL_GREATER : UCOL_LESS; - } - } - if(leftPair == EOS) { break; } - leftPair >>= 16; - rightPair >>= 16; - } - } - if(CollationSettings::getStrength(options) <= UCOL_SECONDARY) { return UCOL_EQUAL; } - - // Remove the case bits from the tertiary weight when caseLevel is on or caseFirst is off. - UBool withCaseBits = CollationSettings::isTertiaryWithCaseBits(options); - - leftIndex = rightIndex = 0; - leftPair = rightPair = 0; - for(;;) { - while(leftPair == 0) { - if(leftIndex == leftLength) { - leftPair = EOS; - break; - } - UChar32 c = left[leftIndex++]; - leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); - if(leftPair < MIN_LONG) { - leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); - } - leftPair = getTertiaries(variableTop, withCaseBits, leftPair); - } - - while(rightPair == 0) { - if(rightIndex == rightLength) { - rightPair = EOS; - break; - } - UChar32 c = right[rightIndex++]; - rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); - if(rightPair < MIN_LONG) { - rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); - } - rightPair = getTertiaries(variableTop, withCaseBits, rightPair); - } - - if(leftPair == rightPair) { - if(leftPair == EOS) { break; } - leftPair = rightPair = 0; - continue; - } - uint32_t leftTertiary = leftPair & 0xffff; - uint32_t rightTertiary = rightPair & 0xffff; - if(leftTertiary != rightTertiary) { - if(CollationSettings::sortsTertiaryUpperCaseFirst(options)) { - // Pass through EOS and MERGE_WEIGHT - // and keep real tertiary weights larger than the MERGE_WEIGHT. - // Tertiary CEs (secondary ignorables) are not supported in fast Latin. - if(leftTertiary > MERGE_WEIGHT) { - leftTertiary ^= CASE_MASK; - } - if(rightTertiary > MERGE_WEIGHT) { - rightTertiary ^= CASE_MASK; - } - } - return (leftTertiary < rightTertiary) ? UCOL_LESS : UCOL_GREATER; - } - if(leftPair == EOS) { break; } - leftPair >>= 16; - rightPair >>= 16; - } - if(CollationSettings::getStrength(options) <= UCOL_TERTIARY) { return UCOL_EQUAL; } - - leftIndex = rightIndex = 0; - leftPair = rightPair = 0; - for(;;) { - while(leftPair == 0) { - if(leftIndex == leftLength) { - leftPair = EOS; - break; - } - UChar32 c = left[leftIndex++]; - leftPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, left, leftIndex); - if(leftPair < MIN_LONG) { - leftPair = nextPair(table, c, leftPair, NULL, left, leftIndex, leftLength); - } - leftPair = getQuaternaries(variableTop, leftPair); - } - - while(rightPair == 0) { - if(rightIndex == rightLength) { - rightPair = EOS; - break; - } - UChar32 c = right[rightIndex++]; - rightPair = (c <= 0x7f) ? table[c] : lookupUTF8Unsafe(table, c, right, rightIndex); - if(rightPair < MIN_LONG) { - rightPair = nextPair(table, c, rightPair, NULL, right, rightIndex, rightLength); - } - rightPair = getQuaternaries(variableTop, rightPair); - } - - if(leftPair == rightPair) { - if(leftPair == EOS) { break; } - leftPair = rightPair = 0; - continue; - } - uint32_t leftQuaternary = leftPair & 0xffff; - uint32_t rightQuaternary = rightPair & 0xffff; - if(leftQuaternary != rightQuaternary) { - return (leftQuaternary < rightQuaternary) ? UCOL_LESS : UCOL_GREATER; - } - if(leftPair == EOS) { break; } - leftPair >>= 16; - rightPair >>= 16; - } - return UCOL_EQUAL; -} - -uint32_t -CollationFastLatin::lookup(const uint16_t *table, UChar32 c) { - U_ASSERT(c > LATIN_MAX); - if(PUNCT_START <= c && c < PUNCT_LIMIT) { - return table[c - PUNCT_START + LATIN_LIMIT]; - } else if(c == 0xfffe) { - return MERGE_WEIGHT; - } else if(c == 0xffff) { - return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; - } else { - return BAIL_OUT; - } -} - -uint32_t -CollationFastLatin::lookupUTF8(const uint16_t *table, UChar32 c, - const uint8_t *s8, int32_t &sIndex, int32_t sLength) { - // The caller handled ASCII and valid/supported Latin. - U_ASSERT(c > 0x7f); - int32_t i2 = sIndex + 1; - if(i2 < sLength || sLength < 0) { - uint8_t t1 = s8[sIndex]; - uint8_t t2 = s8[i2]; - sIndex += 2; - if(c == 0xe2 && t1 == 0x80 && 0x80 <= t2 && t2 <= 0xbf) { - return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF - } else if(c == 0xef && t1 == 0xbf) { - if(t2 == 0xbe) { - return MERGE_WEIGHT; // U+FFFE - } else if(t2 == 0xbf) { - return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF - } - } - } - return BAIL_OUT; -} - -uint32_t -CollationFastLatin::lookupUTF8Unsafe(const uint16_t *table, UChar32 c, - const uint8_t *s8, int32_t &sIndex) { - // The caller handled ASCII. - // The string is well-formed and contains only supported characters. - U_ASSERT(c > 0x7f); - if(c <= LATIN_MAX_UTF8_LEAD) { - return table[((c - 0xc2) << 6) + s8[sIndex++]]; // 0080..017F - } - uint8_t t2 = s8[sIndex + 1]; - sIndex += 2; - if(c == 0xe2) { - return table[(LATIN_LIMIT - 0x80) + t2]; // 2000..203F -> 0180..01BF - } else if(t2 == 0xbe) { - return MERGE_WEIGHT; // U+FFFE - } else { - return MAX_SHORT | COMMON_SEC | LOWER_CASE | COMMON_TER; // U+FFFF - } -} - -uint32_t -CollationFastLatin::nextPair(const uint16_t *table, UChar32 c, uint32_t ce, - const UChar *s16, const uint8_t *s8, int32_t &sIndex, int32_t &sLength) { - if(ce >= MIN_LONG || ce < CONTRACTION) { - return ce; // simple or special mini CE - } else if(ce >= EXPANSION) { - int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); - return ((uint32_t)table[index + 1] << 16) | table[index]; - } else /* ce >= CONTRACTION */ { - if(c == 0 && sLength < 0) { - sLength = sIndex - 1; - return EOS; - } - // Contraction list: Default mapping followed by - // 0 or more single-character contraction suffix mappings. - int32_t index = NUM_FAST_CHARS + (ce & INDEX_MASK); - if(sIndex != sLength) { - // Read the next character. - int32_t c2; - int32_t nextIndex = sIndex; - if(s16 != NULL) { - c2 = s16[nextIndex++]; - if(c2 > LATIN_MAX) { - if(PUNCT_START <= c2 && c2 < PUNCT_LIMIT) { - c2 = c2 - PUNCT_START + LATIN_LIMIT; // 2000..203F -> 0180..01BF - } else if(c2 == 0xfffe || c2 == 0xffff) { - c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. - } else { - return BAIL_OUT; - } - } - } else { - c2 = s8[nextIndex++]; - if(c2 > 0x7f) { - uint8_t t; - if(c2 <= 0xc5 && 0xc2 <= c2 && nextIndex != sLength && - 0x80 <= (t = s8[nextIndex]) && t <= 0xbf) { - c2 = ((c2 - 0xc2) << 6) + t; // 0080..017F - ++nextIndex; - } else { - int32_t i2 = nextIndex + 1; - if(i2 < sLength || sLength < 0) { - if(c2 == 0xe2 && s8[nextIndex] == 0x80 && - 0x80 <= (t = s8[i2]) && t <= 0xbf) { - c2 = (LATIN_LIMIT - 0x80) + t; // 2000..203F -> 0180..01BF - } else if(c2 == 0xef && s8[nextIndex] == 0xbf && - ((t = s8[i2]) == 0xbe || t == 0xbf)) { - c2 = -1; // U+FFFE & U+FFFF cannot occur in contractions. - } else { - return BAIL_OUT; - } - } else { - return BAIL_OUT; - } - nextIndex += 2; - } - } - } - if(c2 == 0 && sLength < 0) { - sLength = sIndex; - c2 = -1; - } - // Look for the next character in the contraction suffix list, - // which is in ascending order of single suffix characters. - int32_t i = index; - int32_t head = table[i]; // first skip the default mapping - int32_t x; - do { - i += head >> CONTR_LENGTH_SHIFT; - head = table[i]; - x = head & CONTR_CHAR_MASK; - } while(x < c2); - if(x == c2) { - index = i; - sIndex = nextIndex; - } - } - // Return the CE or CEs for the default or contraction mapping. - int32_t length = table[index] >> CONTR_LENGTH_SHIFT; - if(length == 1) { - return BAIL_OUT; - } - ce = table[index + 1]; - if(length == 2) { - return ce; - } else { - return ((uint32_t)table[index + 2] << 16) | ce; - } - } -} - -uint32_t -CollationFastLatin::getSecondaries(uint32_t variableTop, uint32_t pair) { - if(pair <= 0xffff) { - // one mini CE - if(pair >= MIN_SHORT) { - pair = getSecondariesFromOneShortCE(pair); - } else if(pair > variableTop) { - pair = COMMON_SEC_PLUS_OFFSET; - } else if(pair >= MIN_LONG) { - pair = 0; // variable - } - // else special mini CE - } else { - uint32_t ce = pair & 0xffff; - if(ce >= MIN_SHORT) { - pair = (pair & TWO_SECONDARIES_MASK) + TWO_SEC_OFFSETS; - } else if(ce > variableTop) { - pair = TWO_COMMON_SEC_PLUS_OFFSET; - } else { - U_ASSERT(ce >= MIN_LONG); - pair = 0; // variable - } - } - return pair; -} - -uint32_t -CollationFastLatin::getCases(uint32_t variableTop, UBool strengthIsPrimary, uint32_t pair) { - // Primary+caseLevel: Ignore case level weights of primary ignorables. - // Otherwise: Ignore case level weights of secondary ignorables. - // For details see the comments in the CollationCompare class. - // Tertiary CEs (secondary ignorables) are not supported in fast Latin. - if(pair <= 0xffff) { - // one mini CE - if(pair >= MIN_SHORT) { - // A high secondary weight means we really have two CEs, - // a primary CE and a secondary CE. - uint32_t ce = pair; - pair &= CASE_MASK; // explicit weight of primary CE - if(!strengthIsPrimary && (ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { - pair |= LOWER_CASE << 16; // implied weight of secondary CE - } - } else if(pair > variableTop) { - pair = LOWER_CASE; - } else if(pair >= MIN_LONG) { - pair = 0; // variable - } - // else special mini CE - } else { - // two mini CEs, same primary groups, neither expands like above - uint32_t ce = pair & 0xffff; - if(ce >= MIN_SHORT) { - if(strengthIsPrimary && (pair & (SHORT_PRIMARY_MASK << 16)) == 0) { - pair &= CASE_MASK; - } else { - pair &= TWO_CASES_MASK; - } - } else if(ce > variableTop) { - pair = TWO_LOWER_CASES; - } else { - U_ASSERT(ce >= MIN_LONG); - pair = 0; // variable - } - } - return pair; -} - -uint32_t -CollationFastLatin::getTertiaries(uint32_t variableTop, UBool withCaseBits, uint32_t pair) { - if(pair <= 0xffff) { - // one mini CE - if(pair >= MIN_SHORT) { - // A high secondary weight means we really have two CEs, - // a primary CE and a secondary CE. - uint32_t ce = pair; - if(withCaseBits) { - pair = (pair & CASE_AND_TERTIARY_MASK) + TER_OFFSET; - if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { - pair |= (LOWER_CASE | COMMON_TER_PLUS_OFFSET) << 16; - } - } else { - pair = (pair & TERTIARY_MASK) + TER_OFFSET; - if((ce & SECONDARY_MASK) >= MIN_SEC_HIGH) { - pair |= COMMON_TER_PLUS_OFFSET << 16; - } - } - } else if(pair > variableTop) { - pair = (pair & TERTIARY_MASK) + TER_OFFSET; - if(withCaseBits) { - pair |= LOWER_CASE; - } - } else if(pair >= MIN_LONG) { - pair = 0; // variable - } - // else special mini CE - } else { - // two mini CEs, same primary groups, neither expands like above - uint32_t ce = pair & 0xffff; - if(ce >= MIN_SHORT) { - if(withCaseBits) { - pair &= TWO_CASES_MASK | TWO_TERTIARIES_MASK; - } else { - pair &= TWO_TERTIARIES_MASK; - } - pair += TWO_TER_OFFSETS; - } else if(ce > variableTop) { - pair = (pair & TWO_TERTIARIES_MASK) + TWO_TER_OFFSETS; - if(withCaseBits) { - pair |= TWO_LOWER_CASES; - } - } else { - U_ASSERT(ce >= MIN_LONG); - pair = 0; // variable - } - } - return pair; -} - -uint32_t -CollationFastLatin::getQuaternaries(uint32_t variableTop, uint32_t pair) { - // Return the primary weight of a variable CE, - // or the maximum primary weight for a non-variable, not-completely-ignorable CE. - if(pair <= 0xffff) { - // one mini CE - if(pair >= MIN_SHORT) { - // A high secondary weight means we really have two CEs, - // a primary CE and a secondary CE. - if((pair & SECONDARY_MASK) >= MIN_SEC_HIGH) { - pair = TWO_SHORT_PRIMARIES_MASK; - } else { - pair = SHORT_PRIMARY_MASK; - } - } else if(pair > variableTop) { - pair = SHORT_PRIMARY_MASK; - } else if(pair >= MIN_LONG) { - pair &= LONG_PRIMARY_MASK; // variable - } - // else special mini CE - } else { - // two mini CEs, same primary groups, neither expands like above - uint32_t ce = pair & 0xffff; - if(ce > variableTop) { - pair = TWO_SHORT_PRIMARIES_MASK; - } else { - U_ASSERT(ce >= MIN_LONG); - pair &= TWO_LONG_PRIMARIES_MASK; // variable - } - } - return pair; -} - -U_NAMESPACE_END - -#endif // !UCONFIG_NO_COLLATION |