/*
*******************************************************************************
* Copyright (C) 2013-2014, International Business Machines
* Corporation and others.  All Rights Reserved.
*******************************************************************************
* collationsets.cpp
*
* created on: 2013feb09
* created by: Markus W. Scherer
*/

#include "unicode/utypes.h"

#if !UCONFIG_NO_COLLATION

#include "unicode/ucharstrie.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"
#include "unicode/ustringtrie.h"
#include "collation.h"
#include "collationdata.h"
#include "collationsets.h"
#include "normalizer2impl.h"
#include "uassert.h"
#include "utf16collationiterator.h"
#include "utrie2.h"

U_NAMESPACE_BEGIN

U_CDECL_BEGIN

/**
 * Range callback handed to utrie2_enum() by TailoredSet::forData().
 *
 * @param context the TailoredSet that started the enumeration
 * @param start   first code point of the range
 * @param end     last code point of the range (inclusive)
 * @param ce32    the trie value shared by the whole range
 * @return TRUE for ranges that fall back to the base data,
 *         otherwise the result of TailoredSet::handleCE32()
 */
static UBool U_CALLCONV
enumTailoredRange(const void *context, UChar32 start, UChar32 end, uint32_t ce32) {
    if(ce32 == Collation::FALLBACK_CE32) {
        return TRUE;  // fallback to base, not tailored
    }
    TailoredSet *ts = (TailoredSet *)context;
    return ts->handleCE32(start, end, ce32);
}

U_CDECL_END

/**
 * Enumerates the trie of the tailoring data d and compares each mapping
 * with the corresponding mapping in d->base, adding differences to the
 * tailored set.
 *
 * @param d  tailoring data; d->base must not be NULL (asserted)
 * @param ec in/out error code; nothing is done if it already indicates a failure.
 *           On return it holds the error code accumulated during the enumeration.
 */
void
TailoredSet::forData(const CollationData *d, UErrorCode &ec) {
    if(U_FAILURE(ec)) { return; }
    errorCode = ec;  // Preserve info & warning codes.
    data = d;
    baseData = d->base;
    U_ASSERT(baseData != NULL);
    utrie2_enum(data->trie, NULL, enumTailoredRange, this);
    ec = errorCode;
}

/**
 * Compares the tailoring CE32 of the range [start, end] with the base CE32
 * of each code point in the range.
 *
 * @param start first code point of the range
 * @param end   last code point of the range (inclusive)
 * @param ce32  tailoring trie value for the range; must not be FALLBACK_CE32
 * @return U_SUCCESS(errorCode)
 */
UBool
TailoredSet::handleCE32(UChar32 start, UChar32 end, uint32_t ce32) {
    U_ASSERT(ce32 != Collation::FALLBACK_CE32);
    if(Collation::isSpecialCE32(ce32)) {
        ce32 = data->getIndirectCE32(ce32);
        if(ce32 == Collation::FALLBACK_CE32) {
            // The indirection resolved to a fallback: nothing to compare.
            return U_SUCCESS(errorCode);
        }
    }
    do {
        uint32_t baseCE32 = baseData->getFinalCE32(baseData->getCE32(start));
        // Do not just continue if ce32 == baseCE32 because
        // contractions and expansions in different data objects
        // normally differ even if they have the same data offsets.
        if(Collation::isSelfContainedCE32(ce32) && Collation::isSelfContainedCE32(baseCE32)) {
            // fastpath
            if(ce32 != baseCE32) {
                tailored->add(start);
            }
        } else {
            compare(start, ce32, baseCE32);
        }
    } while(++start <= end);
    return U_SUCCESS(errorCode);
}

/**
 * Compares the tailoring mapping ce32 with the base mapping baseCE32 for c
 * (in the current prefix/suffix context, if any) and adds c, or strings
 * containing c, to the tailored set where the mappings differ.
 *
 * Prefix and contraction data are handled first; each is replaced by its
 * default CE32 before the remaining, non-contextual comparison.
 *
 * @param c        the code point being compared
 * @param ce32     CE32 from the tailoring data
 * @param baseCE32 CE32 from the base data
 */
void
TailoredSet::compare(UChar32 c, uint32_t ce32, uint32_t baseCE32) {
    // Prefixes: compare both prefix tables, or add all prefixes of the one
    // table that exists. Continue below with the default (no prefix) CE32s.
    if(Collation::isPrefixCE32(ce32)) {
        const UChar *p = data->contexts + Collation::indexFromCE32(ce32);
        ce32 = data->getFinalCE32(CollationData::readCE32(p));
        if(Collation::isPrefixCE32(baseCE32)) {
            const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
            baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
            comparePrefixes(c, p + 2, q + 2);
        } else {
            addPrefixes(data, c, p + 2);
        }
    } else if(Collation::isPrefixCE32(baseCE32)) {
        const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
        baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
        addPrefixes(baseData, c, q + 2);
    }

    // Contractions: same scheme as for prefixes.
    if(Collation::isContractionCE32(ce32)) {
        const UChar *p = data->contexts + Collation::indexFromCE32(ce32);
        if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
            ce32 = Collation::NO_CE32;
        } else {
            ce32 = data->getFinalCE32(CollationData::readCE32(p));
        }
        if(Collation::isContractionCE32(baseCE32)) {
            const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
            if((baseCE32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) {
                baseCE32 = Collation::NO_CE32;
            } else {
                baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
            }
            compareContractions(c, p + 2, q + 2);
        } else {
            addContractions(c, p + 2);
        }
    } else if(Collation::isContractionCE32(baseCE32)) {
        const UChar *q = baseData->contexts + Collation::indexFromCE32(baseCE32);
        baseCE32 = baseData->getFinalCE32(CollationData::readCE32(q));
        addContractions(c, q + 2);
    }

    // Tags of the remaining CE32s; -1 stands for "not a special CE32".
    int32_t tag;
    if(Collation::isSpecialCE32(ce32)) {
        tag = Collation::tagFromCE32(ce32);
        U_ASSERT(tag != Collation::PREFIX_TAG);
        U_ASSERT(tag != Collation::CONTRACTION_TAG);
        // Currently, the tailoring data builder does not write offset tags.
        // They might be useful for saving space,
        // but they would complicate the builder,
        // and in tailorings we assume that performance of tailored characters is more important.
        U_ASSERT(tag != Collation::OFFSET_TAG);
    } else {
        tag = -1;
    }
    int32_t baseTag;
    if(Collation::isSpecialCE32(baseCE32)) {
        baseTag = Collation::tagFromCE32(baseCE32);
        U_ASSERT(baseTag != Collation::PREFIX_TAG);
        U_ASSERT(baseTag != Collation::CONTRACTION_TAG);
    } else {
        baseTag = -1;
    }

    // Non-contextual mappings, expansions, etc.
    if(baseTag == Collation::OFFSET_TAG) {
        // We might be comparing a tailoring CE which is a copy of
        // a base offset-tag CE, via the [optimize [set]] syntax
        // or when a single-character mapping was copied for tailored contractions.
        // Offset tags always result in long-primary CEs,
        // with common secondary/tertiary weights.
        if(!Collation::isLongPrimaryCE32(ce32)) {
            add(c);
            return;
        }
        int64_t dataCE = baseData->ces[Collation::indexFromCE32(baseCE32)];
        uint32_t p = Collation::getThreeBytePrimaryForOffsetData(c, dataCE);
        if(Collation::primaryFromLongPrimaryCE32(ce32) != p) {
            add(c);
        }
        // Always return here: the tailoring tag is never OFFSET_TAG, so the
        // generic tag comparison below would report c as tailored even when
        // its long-primary CE matches the base offset mapping.
        return;
    }

    if(tag != baseTag) {
        add(c);
        return;
    }

    if(tag == Collation::EXPANSION32_TAG) {
        const uint32_t *ce32s = data->ce32s + Collation::indexFromCE32(ce32);
        int32_t length = Collation::lengthFromCE32(ce32);

        const uint32_t *baseCE32s = baseData->ce32s + Collation::indexFromCE32(baseCE32);
        int32_t baseLength = Collation::lengthFromCE32(baseCE32);

        if(length != baseLength) {
            add(c);
            return;
        }
        for(int32_t i = 0; i < length; ++i) {
            if(ce32s[i] != baseCE32s[i]) {
                add(c);
                break;
            }
        }
    } else if(tag == Collation::EXPANSION_TAG) {
        const int64_t *ces = data->ces + Collation::indexFromCE32(ce32);
        int32_t length = Collation::lengthFromCE32(ce32);

        const int64_t *baseCEs = baseData->ces + Collation::indexFromCE32(baseCE32);
        int32_t baseLength = Collation::lengthFromCE32(baseCE32);

        if(length != baseLength) {
            add(c);
            return;
        }
        for(int32_t i = 0; i < length; ++i) {
            if(ces[i] != baseCEs[i]) {
                add(c);
                break;
            }
        }
    } else if(tag == Collation::HANGUL_TAG) {
        // A Hangul syllable counts as tailored if any of the Jamo
        // it decomposes into is already in the tailored set.
        UChar jamos[3];
        int32_t length = Hangul::decompose(c, jamos);
        if(tailored->contains(jamos[0]) || tailored->contains(jamos[1]) ||
                (length == 3 && tailored->contains(jamos[2]))) {
            add(c);
        }
    } else if(ce32 != baseCE32) {
        add(c);
    }
}

/**
 * Walks the tailoring prefix trie at p and the base prefix trie at q in
 * parallel. Prefixes that occur in only one trie are added via addPrefix();
 * for prefixes that occur in both, the two values are compared via compare()
 * with the prefix set as context.
 *
 * @param c the code point that the prefixes belong to
 * @param p start of the tailoring prefix trie
 * @param q start of the base prefix trie
 */
void
TailoredSet::comparePrefixes(UChar32 c, const UChar *p, const UChar *q) {
    // Parallel iteration over prefixes of both tables.
    UCharsTrie::Iterator prefixes(p, 0, errorCode);
    UCharsTrie::Iterator basePrefixes(q, 0, errorCode);
    const UnicodeString *tp = NULL;  // Tailoring prefix.
    const UnicodeString *bp = NULL;  // Base prefix.
    // Use a string with a U+FFFF as the limit sentinel.
    // U+FFFF is untailorable and will not occur in prefixes.
    UnicodeString none((UChar)0xffff);
    for(;;) {
        // A NULL pointer means "fetch the next item from that iterator".
        if(tp == NULL) {
            if(prefixes.next(errorCode)) {
                tp = &prefixes.getString();
            } else {
                tp = &none;
            }
        }
        if(bp == NULL) {
            if(basePrefixes.next(errorCode)) {
                bp = &basePrefixes.getString();
            } else {
                bp = &none;
            }
        }
        if(tp == &none && bp == &none) { break; }
        int32_t cmp = tp->compare(*bp);
        if(cmp < 0) {
            // tp occurs in the tailoring but not in the base.
            addPrefix(data, *tp, c, (uint32_t)prefixes.getValue());
            tp = NULL;
        } else if(cmp > 0) {
            // bp occurs in the base but not in the tailoring.
            addPrefix(baseData, *bp, c, (uint32_t)basePrefixes.getValue());
            bp = NULL;
        } else {
            // Same prefix in both: compare the mappings in this context.
            setPrefix(*tp);
            compare(c, (uint32_t)prefixes.getValue(), (uint32_t)basePrefixes.getValue());
            resetPrefix();
            tp = NULL;
            bp = NULL;
        }
    }
}

/**
 * Walks the tailoring contraction-suffix trie at p and the base
 * contraction-suffix trie at q in parallel. Suffixes that occur in only one
 * trie are added via addSuffix(); for suffixes that occur in both, the two
 * values are compared via compare() with the suffix set as context.
 *
 * @param c the code point that starts the contractions
 * @param p start of the tailoring suffix trie
 * @param q start of the base suffix trie
 */
void
TailoredSet::compareContractions(UChar32 c, const UChar *p, const UChar *q) {
    // Parallel iteration over suffixes of both tables.
    UCharsTrie::Iterator suffixes(p, 0, errorCode);
    UCharsTrie::Iterator baseSuffixes(q, 0, errorCode);
    const UnicodeString *ts = NULL;  // Tailoring suffix.
    const UnicodeString *bs = NULL;  // Base suffix.
    // Use a string with two U+FFFF as the limit sentinel.
    // U+FFFF is untailorable and will not occur in contractions except maybe
    // as a single suffix character for a root-collator boundary contraction.
    UnicodeString none((UChar)0xffff);
    none.append((UChar)0xffff);
    for(;;) {
        // A NULL pointer means "fetch the next item from that iterator".
        if(ts == NULL) {
            if(suffixes.next(errorCode)) {
                ts = &suffixes.getString();
            } else {
                ts = &none;
            }
        }
        if(bs == NULL) {
            if(baseSuffixes.next(errorCode)) {
                bs = &baseSuffixes.getString();
            } else {
                bs = &none;
            }
        }
        if(ts == &none && bs == &none) { break; }
        int32_t cmp = ts->compare(*bs);
        if(cmp < 0) {
            // ts occurs in the tailoring but not in the base.
            addSuffix(c, *ts);
            ts = NULL;
        } else if(cmp > 0) {
            // bs occurs in the base but not in the tailoring.
            addSuffix(c, *bs);
            bs = NULL;
        } else {
            // Same suffix in both: compare the mappings in this context.
            suffix = ts;
            compare(c, (uint32_t)suffixes.getValue(), (uint32_t)baseSuffixes.getValue());
            suffix = NULL;
            ts = NULL;
            bs = NULL;
        }
    }
}

/**
 * Adds every prefix stored in the prefix trie at p, via addPrefix().
 *
 * @param d the data object that the prefix trie belongs to
 * @param c the code point that the prefixes belong to
 * @param p start of the prefix trie
 */
void
TailoredSet::addPrefixes(const CollationData *d, UChar32 c, const UChar *p) {
    UCharsTrie::Iterator prefixes(p, 0, errorCode);
    while(prefixes.next(errorCode)) {
        addPrefix(d, prefixes.getString(), c, (uint32_t)prefixes.getValue());
    }
}

/**
 * Adds the string for one prefix of c to the tailored set, together with
 * the contractions stored under that prefix, if any.
 * The prefix is set for the duration of the call and reset before returning.
 *
 * @param d    the data object that ce32 belongs to
 * @param pfx  the prefix, as returned by the prefix trie iterator
 * @param c    the code point that the prefix belongs to
 * @param ce32 the trie value for this prefix
 */
void
TailoredSet::addPrefix(const CollationData *d, const UnicodeString &pfx, UChar32 c, uint32_t ce32) {
    setPrefix(pfx);
    ce32 = d->getFinalCE32(ce32);
    if(Collation::isContractionCE32(ce32)) {
        const UChar *p = d->contexts + Collation::indexFromCE32(ce32);
        addContractions(c, p + 2);
    }
    tailored->add(UnicodeString(unreversedPrefix).append(c));
    resetPrefix();
}

/**
 * Adds every contraction suffix stored in the suffix trie at p, via addSuffix().
 *
 * @param c the code point that starts the contractions
 * @param p start of the suffix trie
 */
void
TailoredSet::addContractions(UChar32 c, const UChar *p) {
    UCharsTrie::Iterator suffixes(p, 0, errorCode);
    while(suffixes.next(errorCode)) {
        addSuffix(c, suffixes.getString());
    }
}

/**
 * Adds the string unreversedPrefix + c + sfx to the tailored set.
 *
 * @param c   the code point that starts the contraction
 * @param sfx the contraction suffix
 */
void
TailoredSet::addSuffix(UChar32 c, const UnicodeString &sfx) {
    tailored->add(UnicodeString(unreversedPrefix).append(c).append(sfx));
}

/**
 * Adds c to the tailored set: as a single code point if there is neither
 * a current prefix nor a current suffix, otherwise as the string
 * unreversedPrefix + c + *suffix.
 *
 * @param c the code point to add
 */
void
TailoredSet::add(UChar32 c) {
    if(unreversedPrefix.isEmpty() && suffix == NULL) {
        tailored->add(c);
    } else {
        UnicodeString s(unreversedPrefix);
        s.append(c);
        if(suffix != NULL) {
            s.append(*suffix);
        }
        tailored->add(s);
    }
}

ContractionsAndExpansions::CESink::~CESink() {} U_CDECL_BEGIN static UBool U_CALLCONV enumCnERange(const void *context, UChar32 start, UChar32 end, uint32_t ce32) { ContractionsAndExpansions *cne = (ContractionsAndExpansions *)context; if(cne->checkTailored == 0) { // There is no tailoring. // No need to collect nor check the tailored set. } else if(cne->checkTailored < 0) { // Collect the set of code points with mappings in the tailoring data. if(ce32 == Collation::FALLBACK_CE32) { return TRUE; // fallback to base, not tailored } else { cne->tailored.add(start, end); } // checkTailored > 0: Exclude tailored ranges from the base data enumeration. } else if(start == end) { if(cne->tailored.contains(start)) { return TRUE; } } else if(cne->tailored.containsSome(start, end)) { cne->ranges.set(start, end).removeAll(cne->tailored); int32_t count = cne->ranges.getRangeCount(); for(int32_t i = 0; i < count; ++i) { cne->handleCE32(cne->ranges.getRangeStart(i), cne->ranges.getRangeEnd(i), ce32); } return U_SUCCESS(cne->errorCode); } cne->handleCE32(start, end, ce32); return U_SUCCESS(cne->errorCode); } U_CDECL_END void ContractionsAndExpansions::forData(const CollationData *d, UErrorCode &ec) { if(U_FAILURE(ec)) { return; } errorCode = ec; // Preserve info & warning codes. // Add all from the data, can be tailoring or base. if(d->base != NULL) { checkTailored = -1; } data = d; utrie2_enum(data->trie, NULL, enumCnERange, this); if(d->base == NULL || U_FAILURE(errorCode)) { ec = errorCode; return; } // Add all from the base data but only for un-tailored code points. tailored.freeze(); checkTailored = 1; data = d->base; utrie2_enum(data->trie, NULL, enumCnERange, this); ec = errorCode; } void ContractionsAndExpansions::forCodePoint(const CollationData *d, UChar32 c, UErrorCode &ec) { if(U_FAILURE(ec)) { return; } errorCode = ec; // Preserve info & warning codes. 
uint32_t ce32 = d->getCE32(c); if(ce32 == Collation::FALLBACK_CE32) { d = d->base; ce32 = d->getCE32(c); } data = d; handleCE32(c, c, ce32); ec = errorCode; } void ContractionsAndExpansions::handleCE32(UChar32 start, UChar32 end, uint32_t ce32) { for(;;) { if((ce32 & 0xff) < Collation::SPECIAL_CE32_LOW_BYTE) { // !isSpecialCE32() if(sink != NULL) { sink->handleCE(Collation::ceFromSimpleCE32(ce32)); } return; } switch(Collation::tagFromCE32(ce32)) { case Collation::FALLBACK_TAG: return; case Collation::RESERVED_TAG_3: case Collation::BUILDER_DATA_TAG: case Collation::LEAD_SURROGATE_TAG: if(U_SUCCESS(errorCode)) { errorCode = U_INTERNAL_PROGRAM_ERROR; } return; case Collation::LONG_PRIMARY_TAG: if(sink != NULL) { sink->handleCE(Collation::ceFromLongPrimaryCE32(ce32)); } return; case Collation::LONG_SECONDARY_TAG: if(sink != NULL) { sink->handleCE(Collation::ceFromLongSecondaryCE32(ce32)); } return; case Collation::LATIN_EXPANSION_TAG: if(sink != NULL) { ces[0] = Collation::latinCE0FromCE32(ce32); ces[1] = Collation::latinCE1FromCE32(ce32); sink->handleExpansion(ces, 2); } // Optimization: If we have a prefix, // then the relevant strings have been added already. if(unreversedPrefix.isEmpty()) { addExpansions(start, end); } return; case Collation::EXPANSION32_TAG: if(sink != NULL) { const uint32_t *ce32s = data->ce32s + Collation::indexFromCE32(ce32); int32_t length = Collation::lengthFromCE32(ce32); for(int32_t i = 0; i < length; ++i) { ces[i] = Collation::ceFromCE32(*ce32s++); } sink->handleExpansion(ces, length); } // Optimization: If we have a prefix, // then the relevant strings have been added already. if(unreversedPrefix.isEmpty()) { addExpansions(start, end); } return; case Collation::EXPANSION_TAG: if(sink != NULL) { int32_t length = Collation::lengthFromCE32(ce32); sink->handleExpansion(data->ces + Collation::indexFromCE32(ce32), length); } // Optimization: If we have a prefix, // then the relevant strings have been added already. 
if(unreversedPrefix.isEmpty()) { addExpansions(start, end); } return; case Collation::PREFIX_TAG: handlePrefixes(start, end, ce32); return; case Collation::CONTRACTION_TAG: handleContractions(start, end, ce32); return; case Collation::DIGIT_TAG: // Fetch the non-numeric-collation CE32 and continue. ce32 = data->ce32s[Collation::indexFromCE32(ce32)]; break; case Collation::U0000_TAG: U_ASSERT(start == 0 && end == 0); // Fetch the normal ce32 for U+0000 and continue. ce32 = data->ce32s[0]; break; case Collation::HANGUL_TAG: if(sink != NULL) { // TODO: This should be optimized, // especially if [start..end] is the complete Hangul range. (assert that) UTF16CollationIterator iter(data, FALSE, NULL, NULL, NULL); UChar hangul[1] = { 0 }; for(UChar32 c = start; c <= end; ++c) { hangul[0] = (UChar)c; iter.setText(hangul, hangul + 1); int32_t length = iter.fetchCEs(errorCode); if(U_FAILURE(errorCode)) { return; } // Ignore the terminating non-CE. U_ASSERT(length >= 2 && iter.getCE(length - 1) == Collation::NO_CE); sink->handleExpansion(iter.getCEs(), length - 1); } } // Optimization: If we have a prefix, // then the relevant strings have been added already. if(unreversedPrefix.isEmpty()) { addExpansions(start, end); } return; case Collation::OFFSET_TAG: // Currently no need to send offset CEs to the sink. return; case Collation::IMPLICIT_TAG: // Currently no need to send implicit CEs to the sink. return; } } } void ContractionsAndExpansions::handlePrefixes( UChar32 start, UChar32 end, uint32_t ce32) { const UChar *p = data->contexts + Collation::indexFromCE32(ce32); ce32 = CollationData::readCE32(p); // Default if no prefix match. handleCE32(start, end, ce32); if(!addPrefixes) { return; } UCharsTrie::Iterator prefixes(p + 2, 0, errorCode); while(prefixes.next(errorCode)) { setPrefix(prefixes.getString()); // Prefix/pre-context mappings are special kinds of contractions // that always yield expansions. 
addStrings(start, end, contractions); addStrings(start, end, expansions); handleCE32(start, end, (uint32_t)prefixes.getValue()); } resetPrefix(); } void ContractionsAndExpansions::handleContractions( UChar32 start, UChar32 end, uint32_t ce32) { const UChar *p = data->contexts + Collation::indexFromCE32(ce32); if((ce32 & Collation::CONTRACT_SINGLE_CP_NO_MATCH) != 0) { // No match on the single code point. // We are underneath a prefix, and the default mapping is just // a fallback to the mappings for a shorter prefix. U_ASSERT(!unreversedPrefix.isEmpty()); } else { ce32 = CollationData::readCE32(p); // Default if no suffix match. U_ASSERT(!Collation::isContractionCE32(ce32)); handleCE32(start, end, ce32); } UCharsTrie::Iterator suffixes(p + 2, 0, errorCode); while(suffixes.next(errorCode)) { suffix = &suffixes.getString(); addStrings(start, end, contractions); if(!unreversedPrefix.isEmpty()) { addStrings(start, end, expansions); } handleCE32(start, end, (uint32_t)suffixes.getValue()); } suffix = NULL; } void ContractionsAndExpansions::addExpansions(UChar32 start, UChar32 end) { if(unreversedPrefix.isEmpty() && suffix == NULL) { if(expansions != NULL) { expansions->add(start, end); } } else { addStrings(start, end, expansions); } } void ContractionsAndExpansions::addStrings(UChar32 start, UChar32 end, UnicodeSet *set) { if(set == NULL) { return; } UnicodeString s(unreversedPrefix); do { s.append(start); if(suffix != NULL) { s.append(*suffix); } set->add(s); s.truncate(unreversedPrefix.length()); } while(++start <= end); } U_NAMESPACE_END #endif // !UCONFIG_NO_COLLATION