summary | refs | log | tree | commit | diff
path: root/deps/icu-small/source/common/normalizer2impl.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'deps/icu-small/source/common/normalizer2impl.cpp')
-rw-r--r-- deps/icu-small/source/common/normalizer2impl.cpp 434
1 files changed, 201 insertions, 233 deletions
diff --git a/deps/icu-small/source/common/normalizer2impl.cpp b/deps/icu-small/source/common/normalizer2impl.cpp
index 15b4a52893..e7ae646c41 100644
--- a/deps/icu-small/source/common/normalizer2impl.cpp
+++ b/deps/icu-small/source/common/normalizer2impl.cpp
@@ -16,6 +16,8 @@
* created by: Markus W. Scherer
*/
+// #define UCPTRIE_DEBUG
+
#include "unicode/utypes.h"
#if !UCONFIG_NO_NORMALIZATION
@@ -24,7 +26,9 @@
#include "unicode/edits.h"
#include "unicode/normalizer2.h"
#include "unicode/stringoptions.h"
+#include "unicode/ucptrie.h"
#include "unicode/udata.h"
+#include "unicode/umutablecptrie.h"
#include "unicode/ustring.h"
#include "unicode/utf16.h"
#include "unicode/utf8.h"
@@ -34,8 +38,8 @@
#include "normalizer2impl.h"
#include "putilimp.h"
#include "uassert.h"
+#include "ucptrie_impl.h"
#include "uset_imp.h"
-#include "utrie2.h"
#include "uvector.h"
U_NAMESPACE_BEGIN
@@ -62,7 +66,7 @@ inline uint8_t leadByteForCP(UChar32 c) {
* Returns the code point from one single well-formed UTF-8 byte sequence
* between cpStart and cpLimit.
*
- * UTrie2 UTF-8 macros do not assemble whole code points (for efficiency).
+ * Trie UTF-8 macros do not assemble whole code points (for efficiency).
* When we do need the code point, we call this function.
* We should not need it for normalization-inert data (norm16==0).
* Illegal sequences yield the error value norm16==0 just like real normalization-inert code points.
@@ -122,7 +126,7 @@ int32_t getJamoTMinusBase(const uint8_t *src, const uint8_t *limit) {
}
} else if (src[1] == 0x87) {
uint8_t t = src[2];
- if ((int8_t)t <= (int8_t)0x82) {
+ if ((int8_t)t <= (int8_t)0x82u) {
return t - (0xa7 - 0x40);
}
}
@@ -253,7 +257,7 @@ UBool ReorderingBuffer::appendSupplementary(UChar32 c, uint8_t cc, UErrorCode &e
return TRUE;
}
-UBool ReorderingBuffer::append(const UChar *s, int32_t length,
+UBool ReorderingBuffer::append(const UChar *s, int32_t length, UBool isNFD,
uint8_t leadCC, uint8_t trailCC,
UErrorCode &errorCode) {
if(length==0) {
@@ -280,8 +284,11 @@ UBool ReorderingBuffer::append(const UChar *s, int32_t length,
while(i<length) {
U16_NEXT(s, i, length, c);
if(i<length) {
- // s must be in NFD, otherwise we need to use getCC().
- leadCC=Normalizer2Impl::getCCFromYesOrMaybe(impl.getNorm16(c));
+ if (isNFD) {
+ leadCC = Normalizer2Impl::getCCFromYesOrMaybe(impl.getRawNorm16(c));
+ } else {
+ leadCC = impl.getCC(impl.getNorm16(c));
+ }
} else {
leadCC=trailCC;
}
@@ -411,7 +418,8 @@ struct CanonIterData : public UMemory {
CanonIterData(UErrorCode &errorCode);
~CanonIterData();
void addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode);
- UTrie2 *trie;
+ UMutableCPTrie *mutableTrie;
+ UCPTrie *trie;
UVector canonStartSets; // contains UnicodeSet *
};
@@ -420,22 +428,22 @@ Normalizer2Impl::~Normalizer2Impl() {
}
void
-Normalizer2Impl::init(const int32_t *inIndexes, const UTrie2 *inTrie,
+Normalizer2Impl::init(const int32_t *inIndexes, const UCPTrie *inTrie,
const uint16_t *inExtraData, const uint8_t *inSmallFCD) {
- minDecompNoCP=inIndexes[IX_MIN_DECOMP_NO_CP];
- minCompNoMaybeCP=inIndexes[IX_MIN_COMP_NO_MAYBE_CP];
- minLcccCP=inIndexes[IX_MIN_LCCC_CP];
-
- minYesNo=inIndexes[IX_MIN_YES_NO];
- minYesNoMappingsOnly=inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
- minNoNo=inIndexes[IX_MIN_NO_NO];
- minNoNoCompBoundaryBefore=inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
- minNoNoCompNoMaybeCC=inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
- minNoNoEmpty=inIndexes[IX_MIN_NO_NO_EMPTY];
- limitNoNo=inIndexes[IX_LIMIT_NO_NO];
- minMaybeYes=inIndexes[IX_MIN_MAYBE_YES];
- U_ASSERT((minMaybeYes&7)==0); // 8-aligned for noNoDelta bit fields
- centerNoNoDelta=(minMaybeYes>>DELTA_SHIFT)-MAX_DELTA-1;
+ minDecompNoCP = static_cast<UChar>(inIndexes[IX_MIN_DECOMP_NO_CP]);
+ minCompNoMaybeCP = static_cast<UChar>(inIndexes[IX_MIN_COMP_NO_MAYBE_CP]);
+ minLcccCP = static_cast<UChar>(inIndexes[IX_MIN_LCCC_CP]);
+
+ minYesNo = static_cast<uint16_t>(inIndexes[IX_MIN_YES_NO]);
+ minYesNoMappingsOnly = static_cast<uint16_t>(inIndexes[IX_MIN_YES_NO_MAPPINGS_ONLY]);
+ minNoNo = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO]);
+ minNoNoCompBoundaryBefore = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE]);
+ minNoNoCompNoMaybeCC = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]);
+ minNoNoEmpty = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_EMPTY]);
+ limitNoNo = static_cast<uint16_t>(inIndexes[IX_LIMIT_NO_NO]);
+ minMaybeYes = static_cast<uint16_t>(inIndexes[IX_MIN_MAYBE_YES]);
+ U_ASSERT((minMaybeYes & 7) == 0); // 8-aligned for noNoDelta bit fields
+ centerNoNoDelta = (minMaybeYes >> DELTA_SHIFT) - MAX_DELTA - 1;
normTrie=inTrie;
@@ -445,75 +453,8 @@ Normalizer2Impl::init(const int32_t *inIndexes, const UTrie2 *inTrie,
smallFCD=inSmallFCD;
}
-class LcccContext {
-public:
- LcccContext(const Normalizer2Impl &ni, UnicodeSet &s) : impl(ni), set(s) {}
-
- void handleRange(UChar32 start, UChar32 end, uint16_t norm16) {
- if (norm16 > Normalizer2Impl::MIN_NORMAL_MAYBE_YES &&
- norm16 != Normalizer2Impl::JAMO_VT) {
- set.add(start, end);
- } else if (impl.minNoNoCompNoMaybeCC <= norm16 && norm16 < impl.limitNoNo) {
- uint16_t fcd16=impl.getFCD16(start);
- if(fcd16>0xff) { set.add(start, end); }
- }
- }
-
-private:
- const Normalizer2Impl &impl;
- UnicodeSet &set;
-};
-
-namespace {
-
-struct PropertyStartsContext {
- PropertyStartsContext(const Normalizer2Impl &ni, const USetAdder *adder)
- : impl(ni), sa(adder) {}
-
- const Normalizer2Impl &impl;
- const USetAdder *sa;
-};
-
-} // namespace
-
U_CDECL_BEGIN
-static UBool U_CALLCONV
-enumLcccRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
- ((LcccContext *)context)->handleRange(start, end, (uint16_t)value);
- return TRUE;
-}
-
-static UBool U_CALLCONV
-enumNorm16PropertyStartsRange(const void *context, UChar32 start, UChar32 end, uint32_t value) {
- /* add the start code point to the USet */
- const PropertyStartsContext *ctx=(const PropertyStartsContext *)context;
- const USetAdder *sa=ctx->sa;
- sa->add(sa->set, start);
- if (start != end && ctx->impl.isAlgorithmicNoNo((uint16_t)value) &&
- (value & Normalizer2Impl::DELTA_TCCC_MASK) > Normalizer2Impl::DELTA_TCCC_1) {
- // Range of code points with same-norm16-value algorithmic decompositions.
- // They might have different non-zero FCD16 values.
- uint16_t prevFCD16=ctx->impl.getFCD16(start);
- while(++start<=end) {
- uint16_t fcd16=ctx->impl.getFCD16(start);
- if(fcd16!=prevFCD16) {
- sa->add(sa->set, start);
- prevFCD16=fcd16;
- }
- }
- }
- return TRUE;
-}
-
-static UBool U_CALLCONV
-enumPropertyStartsRange(const void *context, UChar32 start, UChar32 /*end*/, uint32_t /*value*/) {
- /* add the start code point to the USet */
- const USetAdder *sa=(const USetAdder *)context;
- sa->add(sa->set, start);
- return TRUE;
-}
-
static uint32_t U_CALLCONV
segmentStarterMapper(const void * /*context*/, uint32_t value) {
return value&CANON_NOT_SEGMENT_STARTER;
@@ -523,15 +464,44 @@ U_CDECL_END
// Adds to `set` every same-value range of normTrie that passes the norm16 tests below.
// The patch swaps the utrie2_enum() callback (LcccContext / enumLcccRange) for an inline
// ucptrie_getRange() loop; the per-range tests match the removed LcccContext::handleRange().
void
Normalizer2Impl::addLcccChars(UnicodeSet &set) const {
- LcccContext context(*this, set);
- utrie2_enum(normTrie, NULL, enumLcccRange, &context);
+ UChar32 start = 0, end;
+ uint32_t norm16;
// Each call reports in `end` the last code point of the range beginning at `start`;
// the loop stops once the returned end is negative.
+ while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT,
+ nullptr, nullptr, &norm16)) >= 0) {
+ if (norm16 > Normalizer2Impl::MIN_NORMAL_MAYBE_YES &&
+ norm16 != Normalizer2Impl::JAMO_VT) {
+ set.add(start, end);
+ } else if (minNoNoCompNoMaybeCC <= norm16 && norm16 < limitNoNo) {
// Only the first code point of the range is looked up; the whole range is added
// when its FCD16 exceeds 0xff.
// NOTE(review): presumably the upper FCD16 byte is the lead combining class — confirm.
+ uint16_t fcd16 = getFCD16(start);
+ if (fcd16 > 0xff) { set.add(start, end); }
+ }
+ start = end + 1;
+ }
}
void
Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode*/) const {
- /* add the start code point of each same-value range of each trie */
- PropertyStartsContext context(*this, sa);
- utrie2_enum(normTrie, NULL, enumNorm16PropertyStartsRange, &context);
+ // Add the start code point of each same-value range of the trie.
+ UChar32 start = 0, end;
+ uint32_t value;
+ while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT,
+ nullptr, nullptr, &value)) >= 0) {
+ sa->add(sa->set, start);
+ if (start != end && isAlgorithmicNoNo((uint16_t)value) &&
+ (value & Normalizer2Impl::DELTA_TCCC_MASK) > Normalizer2Impl::DELTA_TCCC_1) {
+ // Range of code points with same-norm16-value algorithmic decompositions.
+ // They might have different non-zero FCD16 values.
+ uint16_t prevFCD16 = getFCD16(start);
+ while (++start <= end) {
+ uint16_t fcd16 = getFCD16(start);
+ if (fcd16 != prevFCD16) {
+ sa->add(sa->set, start);
+ prevFCD16 = fcd16;
+ }
+ }
+ }
+ start = end + 1;
+ }
/* add Hangul LV syllables and LV+1 because of skippables */
for(UChar c=Hangul::HANGUL_BASE; c<Hangul::HANGUL_LIMIT; c+=Hangul::JAMO_T_COUNT) {
@@ -543,10 +513,15 @@ Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode
// Reports to `sa` the first code point of each range of the canonical-iterator trie,
// where ranges are formed after segmentStarterMapper reduces every value to
// value & CANON_NOT_SEGMENT_STARTER. Adds nothing when ensureCanonIterData() fails.
// The patch replaces the utrie2_enum() callback with an inline ucptrie_getRange() loop.
void
Normalizer2Impl::addCanonIterPropertyStarts(const USetAdder *sa, UErrorCode &errorCode) const {
- /* add the start code point of each same-value range of the canonical iterator data trie */
- if(ensureCanonIterData(errorCode)) {
- // currently only used for the SEGMENT_STARTER property
- utrie2_enum(fCanonIterData->trie, segmentStarterMapper, enumPropertyStartsRange, sa);
+ // Add the start code point of each same-value range of the canonical iterator data trie.
+ if (!ensureCanonIterData(errorCode)) { return; }
+ // Currently only used for the SEGMENT_STARTER property.
+ UChar32 start = 0, end;
+ uint32_t value;
// `value` is written by ucptrie_getRange() but not read here; only range starts matter.
+ while ((end = ucptrie_getRange(fCanonIterData->trie, start, UCPMAP_RANGE_NORMAL, 0,
+ segmentStarterMapper, nullptr, &value)) >= 0) {
+ sa->add(sa->set, start);
+ start = end + 1;
}
}
@@ -633,27 +608,23 @@ Normalizer2Impl::decompose(const UChar *src, const UChar *limit,
// count code units below the minimum or with irrelevant data for the quick check
for(prevSrc=src; src!=limit;) {
if( (c=*src)<minNoCP ||
- isMostDecompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
+ isMostDecompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))
) {
++src;
- } else if(!U16_IS_SURROGATE(c)) {
+ } else if(!U16_IS_LEAD(c)) {
break;
} else {
UChar c2;
- if(U16_IS_SURROGATE_LEAD(c)) {
- if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
- c=U16_GET_SUPPLEMENTARY(c, c2);
- }
- } else /* trail surrogate */ {
- if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
- --src;
- c=U16_GET_SUPPLEMENTARY(c2, c);
+ if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
+ c=U16_GET_SUPPLEMENTARY(c, c2);
+ norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);
+ if(isMostDecompYesAndZeroCC(norm16)) {
+ src+=2;
+ } else {
+ break;
}
- }
- if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
- src+=U16_LENGTH(c);
} else {
- break;
+ ++src; // unpaired lead surrogate: inert
}
}
}
@@ -713,7 +684,7 @@ Normalizer2Impl::decomposeShort(const UChar *src, const UChar *limit,
const UChar *prevSrc = src;
UChar32 c;
uint16_t norm16;
- UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
+ UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, src, limit, c, norm16);
if (stopAtCompBoundary && norm16HasCompBoundaryBefore(norm16)) {
return prevSrc;
}
@@ -737,7 +708,7 @@ UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
}
// Maps to an isCompYesAndZeroCC.
c=mapAlgorithmic(c, norm16);
- norm16=getNorm16(c);
+ norm16=getRawNorm16(c);
}
if (norm16 < minYesNo) {
// c does not decompose
@@ -758,7 +729,7 @@ UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16,
} else {
leadCC=0;
}
- return buffer.append((const UChar *)mapping+1, length, leadCC, trailCC, errorCode);
+ return buffer.append((const UChar *)mapping+1, length, TRUE, leadCC, trailCC, errorCode);
}
const uint8_t *
@@ -771,7 +742,7 @@ Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
while (src < limit) {
const uint8_t *prevSrc = src;
uint16_t norm16;
- UTRIE2_U8_NEXT16(normTrie, src, limit, norm16);
+ UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
// Get the decomposition and the lead and trail cc's.
UChar32 c = U_SENTINEL;
if (norm16 >= limitNoNo) {
@@ -789,7 +760,7 @@ Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
}
c = codePointFromValidUTF8(prevSrc, src);
c = mapAlgorithmic(c, norm16);
- norm16 = getNorm16(c);
+ norm16 = getRawNorm16(c);
} else if (stopAtCompBoundary && norm16 < minNoNoCompNoMaybeCC) {
return prevSrc;
}
@@ -828,7 +799,7 @@ Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit,
} else {
leadCC = 0;
}
- if (!buffer.append((const char16_t *)mapping+1, length, leadCC, trailCC, errorCode)) {
+ if (!buffer.append((const char16_t *)mapping+1, length, TRUE, leadCC, trailCC, errorCode)) {
return nullptr;
}
}
@@ -854,7 +825,7 @@ Normalizer2Impl::getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) c
length=0;
U16_APPEND_UNSAFE(buffer, length, c);
// The mapping might decompose further.
- norm16 = getNorm16(c);
+ norm16 = getRawNorm16(c);
}
if (norm16 < minYesNo) {
return decomp;
@@ -926,19 +897,30 @@ void Normalizer2Impl::decomposeAndAppend(const UChar *src, const UChar *limit,
return;
}
// Just merge the strings at the boundary.
- ForwardUTrie2StringIterator iter(normTrie, src, limit);
- uint8_t firstCC, prevCC, cc;
- firstCC=prevCC=cc=getCC(iter.next16());
- while(cc!=0) {
- prevCC=cc;
- cc=getCC(iter.next16());
- };
+ bool isFirst = true;
+ uint8_t firstCC = 0, prevCC = 0, cc;
+ const UChar *p = src;
+ while (p != limit) {
+ const UChar *codePointStart = p;
+ UChar32 c;
+ uint16_t norm16;
+ UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);
+ if ((cc = getCC(norm16)) == 0) {
+ p = codePointStart;
+ break;
+ }
+ if (isFirst) {
+ firstCC = cc;
+ isFirst = false;
+ }
+ prevCC = cc;
+ }
if(limit==NULL) { // appendZeroCC() needs limit!=NULL
- limit=u_strchr(iter.codePointStart, 0);
+ limit=u_strchr(p, 0);
}
- if (buffer.append(src, (int32_t)(iter.codePointStart-src), firstCC, prevCC, errorCode)) {
- buffer.appendZeroCC(iter.codePointStart, limit, errorCode);
+ if (buffer.append(src, (int32_t)(p - src), FALSE, firstCC, prevCC, errorCode)) {
+ buffer.appendZeroCC(p, limit, errorCode);
}
}
@@ -1085,7 +1067,7 @@ void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const
}
UChar32 composite=compositeAndFwd>>1;
if((compositeAndFwd&1)!=0) {
- addComposites(getCompositionsListForComposite(getNorm16(composite)), set);
+ addComposites(getCompositionsListForComposite(getRawNorm16(composite)), set);
}
set.add(composite);
} while((firstUnit&COMP_1_LAST_TUPLE)==0);
@@ -1124,7 +1106,7 @@ void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStart
prevCC=0;
for(;;) {
- UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
+ UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);
cc=getCCFromYesOrMaybe(norm16);
if( // this character combines backward and
isMaybe(norm16) &&
@@ -1229,7 +1211,7 @@ void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStart
// Is the composite a starter that combines forward?
if(compositeAndFwd&1) {
compositionsList=
- getCompositionsListForComposite(getNorm16(composite));
+ getCompositionsListForComposite(getRawNorm16(composite));
} else {
compositionsList=NULL;
}
@@ -1268,7 +1250,7 @@ void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStart
UChar32
Normalizer2Impl::composePair(UChar32 a, UChar32 b) const {
- uint16_t norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16=0
+ uint16_t norm16=getNorm16(a); // maps an out-of-range 'a' to inert norm16
const uint16_t *list;
if(isInert(norm16)) {
return U_SENTINEL;
@@ -1359,29 +1341,23 @@ Normalizer2Impl::compose(const UChar *src, const UChar *limit,
return TRUE;
}
if( (c=*src)<minNoMaybeCP ||
- isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
+ isCompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))
) {
++src;
} else {
prevSrc = src++;
- if(!U16_IS_SURROGATE(c)) {
+ if(!U16_IS_LEAD(c)) {
break;
} else {
UChar c2;
- if(U16_IS_SURROGATE_LEAD(c)) {
- if(src!=limit && U16_IS_TRAIL(c2=*src)) {
- ++src;
- c=U16_GET_SUPPLEMENTARY(c, c2);
- }
- } else /* trail surrogate */ {
- if(prevBoundary<prevSrc && U16_IS_LEAD(c2=*(prevSrc-1))) {
- --prevSrc;
- c=U16_GET_SUPPLEMENTARY(c2, c);
+ if(src!=limit && U16_IS_TRAIL(c2=*src)) {
+ ++src;
+ c=U16_GET_SUPPLEMENTARY(c, c2);
+ norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);
+ if(!isCompYesAndZeroCC(norm16)) {
+ break;
}
}
- if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
- break;
- }
}
}
}
@@ -1529,7 +1505,7 @@ Normalizer2Impl::compose(const UChar *src, const UChar *limit,
}
uint8_t prevCC = cc;
nextSrc = src;
- UTRIE2_U16_NEXT16(normTrie, nextSrc, limit, c, n16);
+ UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, n16);
if (n16 >= MIN_YES_YES_WITH_CC) {
cc = getCCFromNormalYesOrMaybe(n16);
if (prevCC > cc) {
@@ -1559,7 +1535,7 @@ Normalizer2Impl::compose(const UChar *src, const UChar *limit,
// decompose and recompose.
if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
const UChar *p = prevSrc;
- UTRIE2_U16_PREV16(normTrie, prevBoundary, p, c, norm16);
+ UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, prevBoundary, p, c, norm16);
if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
prevSrc = p;
}
@@ -1626,29 +1602,23 @@ Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
return src;
}
if( (c=*src)<minNoMaybeCP ||
- isCompYesAndZeroCC(norm16=UTRIE2_GET16_FROM_U16_SINGLE_LEAD(normTrie, c))
+ isCompYesAndZeroCC(norm16=UCPTRIE_FAST_BMP_GET(normTrie, UCPTRIE_16, c))
) {
++src;
} else {
prevSrc = src++;
- if(!U16_IS_SURROGATE(c)) {
+ if(!U16_IS_LEAD(c)) {
break;
} else {
UChar c2;
- if(U16_IS_SURROGATE_LEAD(c)) {
- if(src!=limit && U16_IS_TRAIL(c2=*src)) {
- ++src;
- c=U16_GET_SUPPLEMENTARY(c, c2);
- }
- } else /* trail surrogate */ {
- if(prevBoundary<prevSrc && U16_IS_LEAD(c2=*(prevSrc-1))) {
- --prevSrc;
- c=U16_GET_SUPPLEMENTARY(c2, c);
+ if(src!=limit && U16_IS_TRAIL(c2=*src)) {
+ ++src;
+ c=U16_GET_SUPPLEMENTARY(c, c2);
+ norm16=UCPTRIE_FAST_SUPP_GET(normTrie, UCPTRIE_16, c);
+ if(!isCompYesAndZeroCC(norm16)) {
+ break;
}
}
- if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
- break;
- }
}
}
}
@@ -1665,7 +1635,7 @@ Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
} else {
const UChar *p = prevSrc;
uint16_t n16;
- UTRIE2_U16_PREV16(normTrie, prevBoundary, p, c, n16);
+ UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, prevBoundary, p, c, n16);
if (norm16HasCompBoundaryAfter(n16, onlyContiguous)) {
prevBoundary = prevSrc;
} else {
@@ -1699,7 +1669,7 @@ Normalizer2Impl::composeQuickCheck(const UChar *src, const UChar *limit,
}
uint8_t prevCC = cc;
nextSrc = src;
- UTRIE2_U16_NEXT16(normTrie, nextSrc, limit, c, norm16);
+ UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, norm16);
if (isMaybeOrNonZeroCC(norm16)) {
cc = getCCFromYesOrMaybe(norm16);
if (!(prevCC <= cc || cc == 0)) {
@@ -1786,7 +1756,7 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
++src;
} else {
prevSrc = src;
- UTRIE2_U8_NEXT16(normTrie, src, limit, norm16);
+ UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
if (!isCompYesAndZeroCC(norm16)) {
break;
}
@@ -1945,7 +1915,7 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
}
uint8_t prevCC = cc;
nextSrc = src;
- UTRIE2_U8_NEXT16(normTrie, nextSrc, limit, n16);
+ UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, n16);
if (n16 >= MIN_YES_YES_WITH_CC) {
cc = getCCFromNormalYesOrMaybe(n16);
if (prevCC > cc) {
@@ -1975,7 +1945,7 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous,
// decompose and recompose.
if (prevBoundary != prevSrc && !norm16HasCompBoundaryBefore(norm16)) {
const uint8_t *p = prevSrc;
- UTRIE2_U8_PREV16(normTrie, prevBoundary, p, norm16);
+ UCPTRIE_FAST_U8_PREV(normTrie, UCPTRIE_16, prevBoundary, p, norm16);
if (!norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
prevSrc = p;
}
@@ -2023,7 +1993,7 @@ UBool Normalizer2Impl::hasCompBoundaryBefore(const UChar *src, const UChar *limi
}
UChar32 c;
uint16_t norm16;
- UTRIE2_U16_NEXT16(normTrie, src, limit, c, norm16);
+ UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, src, limit, c, norm16);
return norm16HasCompBoundaryBefore(norm16);
}
@@ -2032,7 +2002,7 @@ UBool Normalizer2Impl::hasCompBoundaryBefore(const uint8_t *src, const uint8_t *
return TRUE;
}
uint16_t norm16;
- UTRIE2_U8_NEXT16(normTrie, src, limit, norm16);
+ UCPTRIE_FAST_U8_NEXT(normTrie, UCPTRIE_16, src, limit, norm16);
return norm16HasCompBoundaryBefore(norm16);
}
@@ -2043,7 +2013,7 @@ UBool Normalizer2Impl::hasCompBoundaryAfter(const UChar *start, const UChar *p,
}
UChar32 c;
uint16_t norm16;
- UTRIE2_U16_PREV16(normTrie, start, p, c, norm16);
+ UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16);
return norm16HasCompBoundaryAfter(norm16, onlyContiguous);
}
@@ -2053,36 +2023,42 @@ UBool Normalizer2Impl::hasCompBoundaryAfter(const uint8_t *start, const uint8_t
return TRUE;
}
uint16_t norm16;
- UTRIE2_U8_PREV16(normTrie, start, p, norm16);
+ UCPTRIE_FAST_U8_PREV(normTrie, UCPTRIE_16, start, p, norm16);
return norm16HasCompBoundaryAfter(norm16, onlyContiguous);
}
// Scans backward from p toward start and returns the nearest composition boundary found.
// Returns the position after a code point that has a boundary after it, or the position
// of a code point that has a boundary before it; returns p (equal to start) when the
// scan reaches start without finding either.
// The patch replaces BackwardUTrie2StringIterator and its unconditional for(;;) loop
// with an explicit loop bounded by `p != start`.
const UChar *Normalizer2Impl::findPreviousCompBoundary(const UChar *start, const UChar *p,
UBool onlyContiguous) const {
- BackwardUTrie2StringIterator iter(normTrie, start, p);
- for(;;) {
- uint16_t norm16=iter.previous16();
+ while (p != start) {
// Remember where the current code point ends before the macro moves p.
+ const UChar *codePointLimit = p;
+ UChar32 c;
+ uint16_t norm16;
// NOTE(review): the macro is expected to step p back over one code point and set
// c and norm16 — confirm against ucptrie.h.
+ UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16);
if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
- return iter.codePointLimit;
+ return codePointLimit;
}
- if (hasCompBoundaryBefore(iter.codePoint, norm16)) {
- return iter.codePointStart;
+ if (hasCompBoundaryBefore(c, norm16)) {
+ return p;
}
}
+ return p;
}
// Scans forward from p toward limit and returns the nearest composition boundary found.
// Returns the position of a code point that has a boundary before it, or the position
// after a code point that has a boundary after it; returns p (equal to limit) when the
// scan reaches limit without finding either.
// The patch replaces ForwardUTrie2StringIterator and its unconditional for(;;) loop
// with an explicit loop bounded by `p != limit`.
const UChar *Normalizer2Impl::findNextCompBoundary(const UChar *p, const UChar *limit,
UBool onlyContiguous) const {
- ForwardUTrie2StringIterator iter(normTrie, p, limit);
- for(;;) {
- uint16_t norm16=iter.next16();
- if (hasCompBoundaryBefore(iter.codePoint, norm16)) {
- return iter.codePointStart;
+ while (p != limit) {
// Remember where the current code point begins before the macro moves p.
+ const UChar *codePointStart = p;
+ UChar32 c;
+ uint16_t norm16;
// NOTE(review): the macro is expected to advance p past one code point and set
// c and norm16 — confirm against ucptrie.h.
+ UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);
+ if (hasCompBoundaryBefore(c, norm16)) {
+ return codePointStart;
}
if (norm16HasCompBoundaryAfter(norm16, onlyContiguous)) {
- return iter.codePointLimit;
+ return p;
}
}
+ return p;
}
uint8_t Normalizer2Impl::getPreviousTrailCC(const UChar *start, const UChar *p) const {
@@ -2130,7 +2106,7 @@ uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const {
}
// Maps to an isCompYesAndZeroCC.
c=mapAlgorithmic(c, norm16);
- norm16=getNorm16(c);
+ norm16=getRawNorm16(c);
}
}
if(norm16<=minYesNo || isHangulLVT(norm16)) {
@@ -2195,17 +2171,10 @@ Normalizer2Impl::makeFCD(const UChar *src, const UChar *limit,
prevFCD16=0;
++src;
} else {
- if(U16_IS_SURROGATE(c)) {
+ if(U16_IS_LEAD(c)) {
UChar c2;
- if(U16_IS_SURROGATE_LEAD(c)) {
- if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
- c=U16_GET_SUPPLEMENTARY(c, c2);
- }
- } else /* trail surrogate */ {
- if(prevSrc<src && U16_IS_LEAD(c2=*(src-1))) {
- --src;
- c=U16_GET_SUPPLEMENTARY(c2, c);
- }
+ if((src+1)!=limit && U16_IS_TRAIL(c2=src[1])) {
+ c=U16_GET_SUPPLEMENTARY(c, c2);
}
}
if((fcd16=getFCD16FromNormData(c))<=0xff) {
@@ -2336,7 +2305,7 @@ const UChar *Normalizer2Impl::findPreviousFCDBoundary(const UChar *start, const
const UChar *codePointLimit = p;
UChar32 c;
uint16_t norm16;
- UTRIE2_U16_PREV16(normTrie, start, p, c, norm16);
+ UCPTRIE_FAST_U16_PREV(normTrie, UCPTRIE_16, start, p, c, norm16);
if (c < minDecompNoCP || norm16HasDecompBoundaryAfter(norm16)) {
return codePointLimit;
}
@@ -2352,7 +2321,7 @@ const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *l
const UChar *codePointStart=p;
UChar32 c;
uint16_t norm16;
- UTRIE2_U16_NEXT16(normTrie, p, limit, c, norm16);
+ UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16);
if (c < minLcccCP || norm16HasDecompBoundaryBefore(norm16)) {
return codePointStart;
}
@@ -2366,19 +2335,20 @@ const UChar *Normalizer2Impl::findNextFCDBoundary(const UChar *p, const UChar *l
// CanonicalIterator data -------------------------------------------------- ***
// Constructs the data holder: opens a mutable code point trie (both open arguments 0),
// leaves the immutable `trie` null, and creates canonStartSets with uprv_deleteUObject
// as its element deleter. Failures are reported through errorCode.
CanonIterData::CanonIterData(UErrorCode &errorCode) :
- trie(utrie2_open(0, 0, &errorCode)),
+ mutableTrie(umutablecptrie_open(0, 0, &errorCode)), trie(nullptr),
canonStartSets(uprv_deleteUObject, NULL, errorCode) {}
// Releases both tries. Either pointer may be null here: the constructor starts `trie`
// as nullptr, and InitCanonIterData::doInit() resets mutableTrie to nullptr after
// building the immutable trie.
// NOTE(review): this relies on both close functions accepting nullptr — confirm in the ICU headers.
CanonIterData::~CanonIterData() {
- utrie2_close(trie);
+ umutablecptrie_close(mutableTrie);
+ ucptrie_close(trie);
}
void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode &errorCode) {
- uint32_t canonValue=utrie2_get32(trie, decompLead);
+ uint32_t canonValue = umutablecptrie_get(mutableTrie, decompLead);
if((canonValue&(CANON_HAS_SET|CANON_VALUE_MASK))==0 && origin!=0) {
// origin is the first character whose decomposition starts with
// the character for which we are setting the value.
- utrie2_set32(trie, decompLead, canonValue|origin, &errorCode);
+ umutablecptrie_set(mutableTrie, decompLead, canonValue|origin, &errorCode);
} else {
// origin is not the first character, or it is U+0000.
UnicodeSet *set;
@@ -2390,7 +2360,7 @@ void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode
}
UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK);
canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size();
- utrie2_set32(trie, decompLead, canonValue, &errorCode);
+ umutablecptrie_set(mutableTrie, decompLead, canonValue, &errorCode);
canonStartSets.addElement(set, errorCode);
if(firstOrigin!=0) {
set->add(firstOrigin);
@@ -2406,7 +2376,6 @@ void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode
// Holder for the static entry point that builds a Normalizer2Impl's canonical-iterator
// data. The patch drops handleRange(); doInit() now calls
// Normalizer2Impl::makeCanonIterDataFromNorm16() directly from its range loop.
class InitCanonIterData {
public:
static void doInit(Normalizer2Impl *impl, UErrorCode &errorCode);
- static void handleRange(Normalizer2Impl *impl, UChar32 start, UChar32 end, uint16_t value, UErrorCode &errorCode);
};
U_CDECL_BEGIN
@@ -2417,18 +2386,6 @@ initCanonIterData(Normalizer2Impl *impl, UErrorCode &errorCode) {
InitCanonIterData::doInit(impl, errorCode);
}
-// Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
-// context: the Normalizer2Impl
-static UBool U_CALLCONV
-enumCIDRangeHandler(const void *context, UChar32 start, UChar32 end, uint32_t value) {
- UErrorCode errorCode = U_ZERO_ERROR;
- if (value != Normalizer2Impl::INERT) {
- Normalizer2Impl *impl = (Normalizer2Impl *)context;
- InitCanonIterData::handleRange(impl, start, end, (uint16_t)value, errorCode);
- }
- return U_SUCCESS(errorCode);
-}
-
U_CDECL_END
void InitCanonIterData::doInit(Normalizer2Impl *impl, UErrorCode &errorCode) {
@@ -2438,8 +2395,24 @@ void InitCanonIterData::doInit(Normalizer2Impl *impl, UErrorCode &errorCode) {
errorCode=U_MEMORY_ALLOCATION_ERROR;
}
if (U_SUCCESS(errorCode)) {
- utrie2_enum(impl->normTrie, NULL, enumCIDRangeHandler, impl);
- utrie2_freeze(impl->fCanonIterData->trie, UTRIE2_32_VALUE_BITS, &errorCode);
+ UChar32 start = 0, end;
+ uint32_t value;
+ while ((end = ucptrie_getRange(impl->normTrie, start,
+ UCPMAP_RANGE_FIXED_LEAD_SURROGATES, Normalizer2Impl::INERT,
+ nullptr, nullptr, &value)) >= 0) {
+ // Call Normalizer2Impl::makeCanonIterDataFromNorm16() for a range of same-norm16 characters.
+ if (value != Normalizer2Impl::INERT) {
+ impl->makeCanonIterDataFromNorm16(start, end, value, *impl->fCanonIterData, errorCode);
+ }
+ start = end + 1;
+ }
+#ifdef UCPTRIE_DEBUG
+ umutablecptrie_setName(impl->fCanonIterData->mutableTrie, "CanonIterData");
+#endif
+ impl->fCanonIterData->trie = umutablecptrie_buildImmutable(
+ impl->fCanonIterData->mutableTrie, UCPTRIE_TYPE_SMALL, UCPTRIE_VALUE_BITS_32, &errorCode);
+ umutablecptrie_close(impl->fCanonIterData->mutableTrie);
+ impl->fCanonIterData->mutableTrie = nullptr;
}
if (U_FAILURE(errorCode)) {
delete impl->fCanonIterData;
@@ -2447,11 +2420,6 @@ void InitCanonIterData::doInit(Normalizer2Impl *impl, UErrorCode &errorCode) {
}
}
-void InitCanonIterData::handleRange(
- Normalizer2Impl *impl, UChar32 start, UChar32 end, uint16_t value, UErrorCode &errorCode) {
- impl->makeCanonIterDataFromNorm16(start, end, value, *impl->fCanonIterData, errorCode);
-}
-
void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16,
CanonIterData &newData,
UErrorCode &errorCode) const {
@@ -2465,7 +2433,7 @@ void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, co
return;
}
for(UChar32 c=start; c<=end; ++c) {
- uint32_t oldValue=utrie2_get32(newData.trie, c);
+ uint32_t oldValue = umutablecptrie_get(newData.mutableTrie, c);
uint32_t newValue=oldValue;
if(isMaybeOrNonZeroCC(norm16)) {
// not a segment starter if it occurs in a decomposition or has cc!=0
@@ -2483,7 +2451,7 @@ void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, co
if (isDecompNoAlgorithmic(norm16_2)) {
// Maps to an isCompYesAndZeroCC.
c2 = mapAlgorithmic(c2, norm16_2);
- norm16_2 = getNorm16(c2);
+ norm16_2 = getRawNorm16(c2);
// No compatibility mappings for the CanonicalIterator.
U_ASSERT(!(isHangulLV(norm16_2) || isHangulLVT(norm16_2)));
}
@@ -2510,10 +2478,10 @@ void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, co
if(norm16_2>=minNoNo) {
while(i<length) {
U16_NEXT_UNSAFE(mapping, i, c2);
- uint32_t c2Value=utrie2_get32(newData.trie, c2);
+ uint32_t c2Value = umutablecptrie_get(newData.mutableTrie, c2);
if((c2Value&CANON_NOT_SEGMENT_STARTER)==0) {
- utrie2_set32(newData.trie, c2, c2Value|CANON_NOT_SEGMENT_STARTER,
- &errorCode);
+ umutablecptrie_set(newData.mutableTrie, c2,
+ c2Value|CANON_NOT_SEGMENT_STARTER, &errorCode);
}
}
}
@@ -2524,7 +2492,7 @@ void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, co
}
}
if(newValue!=oldValue) {
- utrie2_set32(newData.trie, c, newValue, &errorCode);
+ umutablecptrie_set(newData.mutableTrie, c, newValue, &errorCode);
}
}
}
@@ -2537,7 +2505,7 @@ UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const {
}
// Returns the value stored for c in the canonical-iterator trie, cast to int32_t.
// Dereferences fCanonIterData without a null check.
// NOTE(review): presumably callers run ensureCanonIterData() first — confirm at call sites.
int32_t Normalizer2Impl::getCanonValue(UChar32 c) const {
- return (int32_t)utrie2_get32(fCanonIterData->trie, c);
+ return (int32_t)ucptrie_get(fCanonIterData->trie, c);
}
const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const {
@@ -2561,7 +2529,7 @@ UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const {
set.add(value);
}
if((canonValue&CANON_HAS_COMPOSITIONS)!=0) {
- uint16_t norm16=getNorm16(c);
+ uint16_t norm16=getRawNorm16(c);
if(norm16==JAMO_L) {
UChar32 syllable=
(UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT);
@@ -2608,7 +2576,7 @@ unorm2_swap(const UDataSwapper *ds,
pInfo->dataFormat[1]==0x72 &&
pInfo->dataFormat[2]==0x6d &&
pInfo->dataFormat[3]==0x32 &&
- (1<=formatVersion0 && formatVersion0<=3)
+ (1<=formatVersion0 && formatVersion0<=4)
)) {
udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n",
pInfo->dataFormat[0], pInfo->dataFormat[1],
@@ -2669,9 +2637,9 @@ unorm2_swap(const UDataSwapper *ds,
ds->swapArray32(ds, inBytes, nextOffset-offset, outBytes, pErrorCode);
offset=nextOffset;
- /* swap the UTrie2 */
+ /* swap the trie */
nextOffset=indexes[Normalizer2Impl::IX_EXTRA_DATA_OFFSET];
- utrie2_swap(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
+ utrie_swapAnyVersion(ds, inBytes+offset, nextOffset-offset, outBytes+offset, pErrorCode);
offset=nextOffset;
/* swap the uint16_t extraData[] */