summaryrefslogtreecommitdiff
path: root/deps/icu-small/source/common/umutablecptrie.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'deps/icu-small/source/common/umutablecptrie.cpp')
-rw-r--r--deps/icu-small/source/common/umutablecptrie.cpp1678
1 files changed, 1678 insertions, 0 deletions
diff --git a/deps/icu-small/source/common/umutablecptrie.cpp b/deps/icu-small/source/common/umutablecptrie.cpp
new file mode 100644
index 0000000000..40af4b6c16
--- /dev/null
+++ b/deps/icu-small/source/common/umutablecptrie.cpp
@@ -0,0 +1,1678 @@
+// © 2017 and later: Unicode, Inc. and others.
+// License & terms of use: http://www.unicode.org/copyright.html
+
+// umutablecptrie.cpp (inspired by utrie2_builder.cpp)
+// created: 2017dec29 Markus W. Scherer
+
+// #define UCPTRIE_DEBUG
+#ifdef UCPTRIE_DEBUG
+# include <stdio.h>
+#endif
+
+#include "unicode/utypes.h"
+#include "unicode/ucptrie.h"
+#include "unicode/umutablecptrie.h"
+#include "unicode/uobject.h"
+#include "unicode/utf16.h"
+#include "cmemory.h"
+#include "uassert.h"
+#include "ucptrie_impl.h"
+
+U_NAMESPACE_BEGIN
+
+namespace {
+
+constexpr int32_t MAX_UNICODE = 0x10ffff;
+
+constexpr int32_t UNICODE_LIMIT = 0x110000;
+constexpr int32_t BMP_LIMIT = 0x10000;
+constexpr int32_t ASCII_LIMIT = 0x80;
+
+constexpr int32_t I_LIMIT = UNICODE_LIMIT >> UCPTRIE_SHIFT_3;
+constexpr int32_t BMP_I_LIMIT = BMP_LIMIT >> UCPTRIE_SHIFT_3;
+constexpr int32_t ASCII_I_LIMIT = ASCII_LIMIT >> UCPTRIE_SHIFT_3;
+
+constexpr int32_t SMALL_DATA_BLOCKS_PER_BMP_BLOCK = (1 << (UCPTRIE_FAST_SHIFT - UCPTRIE_SHIFT_3));
+
+// Flag values for data blocks.
+constexpr uint8_t ALL_SAME = 0;
+constexpr uint8_t MIXED = 1;
+constexpr uint8_t SAME_AS = 2;
+
+/** Start with allocation of 16k data entries. */
+constexpr int32_t INITIAL_DATA_LENGTH = ((int32_t)1 << 14);
+
+/** Grow about 8x each time. */
+constexpr int32_t MEDIUM_DATA_LENGTH = ((int32_t)1 << 17);
+
+/**
+ * Maximum length of the build-time data array.
+ * One entry per 0x110000 code points.
+ */
+constexpr int32_t MAX_DATA_LENGTH = UNICODE_LIMIT;
+
+// Flag values for index-3 blocks while compacting/building.
+constexpr uint8_t I3_NULL = 0;
+constexpr uint8_t I3_BMP = 1;
+constexpr uint8_t I3_16 = 2;
+constexpr uint8_t I3_18 = 3;
+
+constexpr int32_t INDEX_3_18BIT_BLOCK_LENGTH = UCPTRIE_INDEX_3_BLOCK_LENGTH + UCPTRIE_INDEX_3_BLOCK_LENGTH / 8;
+
+class AllSameBlocks;
+
+class MutableCodePointTrie : public UMemory {
+public:
+ MutableCodePointTrie(uint32_t initialValue, uint32_t errorValue, UErrorCode &errorCode);
+ MutableCodePointTrie(const MutableCodePointTrie &other, UErrorCode &errorCode);
+ MutableCodePointTrie(const MutableCodePointTrie &other) = delete;
+ ~MutableCodePointTrie();
+
+ MutableCodePointTrie &operator=(const MutableCodePointTrie &other) = delete;
+
+ static MutableCodePointTrie *fromUCPMap(const UCPMap *map, UErrorCode &errorCode);
+ static MutableCodePointTrie *fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode);
+
+ uint32_t get(UChar32 c) const;
+ int32_t getRange(UChar32 start, UCPMapValueFilter *filter, const void *context,
+ uint32_t *pValue) const;
+
+ void set(UChar32 c, uint32_t value, UErrorCode &errorCode);
+ void setRange(UChar32 start, UChar32 end, uint32_t value, UErrorCode &errorCode);
+
+ UCPTrie *build(UCPTrieType type, UCPTrieValueWidth valueWidth, UErrorCode &errorCode);
+
+private:
+ void clear();
+
+ bool ensureHighStart(UChar32 c);
+ int32_t allocDataBlock(int32_t blockLength);
+ int32_t getDataBlock(int32_t i);
+
+ void maskValues(uint32_t mask);
+ UChar32 findHighStart() const;
+ int32_t compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks);
+ int32_t compactData(int32_t fastILimit, uint32_t *newData, int32_t dataNullIndex);
+ int32_t compactIndex(int32_t fastILimit, UErrorCode &errorCode);
+ int32_t compactTrie(int32_t fastILimit, UErrorCode &errorCode);
+
+ uint32_t *index = nullptr;
+ int32_t indexCapacity = 0;
+ int32_t index3NullOffset = -1;
+ uint32_t *data = nullptr;
+ int32_t dataCapacity = 0;
+ int32_t dataLength = 0;
+ int32_t dataNullOffset = -1;
+
+ uint32_t origInitialValue;
+ uint32_t initialValue;
+ uint32_t errorValue;
+ UChar32 highStart;
+ uint32_t highValue;
+#ifdef UCPTRIE_DEBUG
+public:
+ const char *name;
+#endif
+private:
+ /** Temporary array while building the final data. */
+ uint16_t *index16 = nullptr;
+ uint8_t flags[UNICODE_LIMIT >> UCPTRIE_SHIFT_3];
+};
+
+MutableCodePointTrie::MutableCodePointTrie(uint32_t iniValue, uint32_t errValue, UErrorCode &errorCode) :
+ origInitialValue(iniValue), initialValue(iniValue), errorValue(errValue),
+ highStart(0), highValue(initialValue)
+#ifdef UCPTRIE_DEBUG
+ , name("open")
+#endif
+ {
+ if (U_FAILURE(errorCode)) { return; }
+ index = (uint32_t *)uprv_malloc(BMP_I_LIMIT * 4);
+ data = (uint32_t *)uprv_malloc(INITIAL_DATA_LENGTH * 4);
+ if (index == nullptr || data == nullptr) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+ indexCapacity = BMP_I_LIMIT;
+ dataCapacity = INITIAL_DATA_LENGTH;
+}
+
+MutableCodePointTrie::MutableCodePointTrie(const MutableCodePointTrie &other, UErrorCode &errorCode) :
+ index3NullOffset(other.index3NullOffset),
+ dataNullOffset(other.dataNullOffset),
+ origInitialValue(other.origInitialValue), initialValue(other.initialValue),
+ errorValue(other.errorValue),
+ highStart(other.highStart), highValue(other.highValue)
+#ifdef UCPTRIE_DEBUG
+ , name("mutable clone")
+#endif
+ {
+ if (U_FAILURE(errorCode)) { return; }
+ int32_t iCapacity = highStart <= BMP_LIMIT ? BMP_I_LIMIT : I_LIMIT;
+ index = (uint32_t *)uprv_malloc(iCapacity * 4);
+ data = (uint32_t *)uprv_malloc(other.dataCapacity * 4);
+ if (index == nullptr || data == nullptr) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+ indexCapacity = iCapacity;
+ dataCapacity = other.dataCapacity;
+
+ int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
+ uprv_memcpy(flags, other.flags, iLimit);
+ uprv_memcpy(index, other.index, iLimit * 4);
+ uprv_memcpy(data, other.data, (size_t)other.dataLength * 4);
+ dataLength = other.dataLength;
+ U_ASSERT(other.index16 == nullptr);
+}
+
+MutableCodePointTrie::~MutableCodePointTrie() {
+ uprv_free(index);
+ uprv_free(data);
+ uprv_free(index16);
+}
+
+MutableCodePointTrie *MutableCodePointTrie::fromUCPMap(const UCPMap *map, UErrorCode &errorCode) {
+ // Use the highValue as the initialValue to reduce the highStart.
+ uint32_t errorValue = ucpmap_get(map, -1);
+ uint32_t initialValue = ucpmap_get(map, 0x10ffff);
+ LocalPointer<MutableCodePointTrie> mutableTrie(
+ new MutableCodePointTrie(initialValue, errorValue, errorCode),
+ errorCode);
+ if (U_FAILURE(errorCode)) {
+ return nullptr;
+ }
+ UChar32 start = 0, end;
+ uint32_t value;
+ while ((end = ucpmap_getRange(map, start, UCPMAP_RANGE_NORMAL, 0,
+ nullptr, nullptr, &value)) >= 0) {
+ if (value != initialValue) {
+ if (start == end) {
+ mutableTrie->set(start, value, errorCode);
+ } else {
+ mutableTrie->setRange(start, end, value, errorCode);
+ }
+ }
+ start = end + 1;
+ }
+ if (U_SUCCESS(errorCode)) {
+ return mutableTrie.orphan();
+ } else {
+ return nullptr;
+ }
+}
+
+MutableCodePointTrie *MutableCodePointTrie::fromUCPTrie(const UCPTrie *trie, UErrorCode &errorCode) {
+ // Use the highValue as the initialValue to reduce the highStart.
+ uint32_t errorValue;
+ uint32_t initialValue;
+ switch (trie->valueWidth) {
+ case UCPTRIE_VALUE_BITS_16:
+ errorValue = trie->data.ptr16[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET];
+ initialValue = trie->data.ptr16[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET];
+ break;
+ case UCPTRIE_VALUE_BITS_32:
+ errorValue = trie->data.ptr32[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET];
+ initialValue = trie->data.ptr32[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET];
+ break;
+ case UCPTRIE_VALUE_BITS_8:
+ errorValue = trie->data.ptr8[trie->dataLength - UCPTRIE_ERROR_VALUE_NEG_DATA_OFFSET];
+ initialValue = trie->data.ptr8[trie->dataLength - UCPTRIE_HIGH_VALUE_NEG_DATA_OFFSET];
+ break;
+ default:
+ // Unreachable if the trie is properly initialized.
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return nullptr;
+ }
+ LocalPointer<MutableCodePointTrie> mutableTrie(
+ new MutableCodePointTrie(initialValue, errorValue, errorCode),
+ errorCode);
+ if (U_FAILURE(errorCode)) {
+ return nullptr;
+ }
+ UChar32 start = 0, end;
+ uint32_t value;
+ while ((end = ucptrie_getRange(trie, start, UCPMAP_RANGE_NORMAL, 0,
+ nullptr, nullptr, &value)) >= 0) {
+ if (value != initialValue) {
+ if (start == end) {
+ mutableTrie->set(start, value, errorCode);
+ } else {
+ mutableTrie->setRange(start, end, value, errorCode);
+ }
+ }
+ start = end + 1;
+ }
+ if (U_SUCCESS(errorCode)) {
+ return mutableTrie.orphan();
+ } else {
+ return nullptr;
+ }
+}
+
+void MutableCodePointTrie::clear() {
+ index3NullOffset = dataNullOffset = -1;
+ dataLength = 0;
+ highValue = initialValue = origInitialValue;
+ highStart = 0;
+ uprv_free(index16);
+ index16 = nullptr;
+}
+
+uint32_t MutableCodePointTrie::get(UChar32 c) const {
+ if ((uint32_t)c > MAX_UNICODE) {
+ return errorValue;
+ }
+ if (c >= highStart) {
+ return highValue;
+ }
+ int32_t i = c >> UCPTRIE_SHIFT_3;
+ if (flags[i] == ALL_SAME) {
+ return index[i];
+ } else {
+ return data[index[i] + (c & UCPTRIE_SMALL_DATA_MASK)];
+ }
+}
+
+inline uint32_t maybeFilterValue(uint32_t value, uint32_t initialValue, uint32_t nullValue,
+ UCPMapValueFilter *filter, const void *context) {
+ if (value == initialValue) {
+ value = nullValue;
+ } else if (filter != nullptr) {
+ value = filter(context, value);
+ }
+ return value;
+}
+
+UChar32 MutableCodePointTrie::getRange(
+ UChar32 start, UCPMapValueFilter *filter, const void *context,
+ uint32_t *pValue) const {
+ if ((uint32_t)start > MAX_UNICODE) {
+ return U_SENTINEL;
+ }
+ if (start >= highStart) {
+ if (pValue != nullptr) {
+ uint32_t value = highValue;
+ if (filter != nullptr) { value = filter(context, value); }
+ *pValue = value;
+ }
+ return MAX_UNICODE;
+ }
+ uint32_t nullValue = initialValue;
+ if (filter != nullptr) { nullValue = filter(context, nullValue); }
+ UChar32 c = start;
+ uint32_t value;
+ bool haveValue = false;
+ int32_t i = c >> UCPTRIE_SHIFT_3;
+ do {
+ if (flags[i] == ALL_SAME) {
+ uint32_t value2 = maybeFilterValue(index[i], initialValue, nullValue,
+ filter, context);
+ if (haveValue) {
+ if (value2 != value) {
+ return c - 1;
+ }
+ } else {
+ value = value2;
+ if (pValue != nullptr) { *pValue = value; }
+ haveValue = true;
+ }
+ c = (c + UCPTRIE_SMALL_DATA_BLOCK_LENGTH) & ~UCPTRIE_SMALL_DATA_MASK;
+ } else /* MIXED */ {
+ int32_t di = index[i] + (c & UCPTRIE_SMALL_DATA_MASK);
+ uint32_t value2 = maybeFilterValue(data[di], initialValue, nullValue,
+ filter, context);
+ if (haveValue) {
+ if (value2 != value) {
+ return c - 1;
+ }
+ } else {
+ value = value2;
+ if (pValue != nullptr) { *pValue = value; }
+ haveValue = true;
+ }
+ while ((++c & UCPTRIE_SMALL_DATA_MASK) != 0) {
+ if (maybeFilterValue(data[++di], initialValue, nullValue,
+ filter, context) != value) {
+ return c - 1;
+ }
+ }
+ }
+ ++i;
+ } while (c < highStart);
+ U_ASSERT(haveValue);
+ if (maybeFilterValue(highValue, initialValue, nullValue,
+ filter, context) != value) {
+ return c - 1;
+ } else {
+ return MAX_UNICODE;
+ }
+}
+
+void
+writeBlock(uint32_t *block, uint32_t value) {
+ uint32_t *limit = block + UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
+ while (block < limit) {
+ *block++ = value;
+ }
+}
+
+bool MutableCodePointTrie::ensureHighStart(UChar32 c) {
+ if (c >= highStart) {
+ // Round up to a UCPTRIE_CP_PER_INDEX_2_ENTRY boundary to simplify compaction.
+ c = (c + UCPTRIE_CP_PER_INDEX_2_ENTRY) & ~(UCPTRIE_CP_PER_INDEX_2_ENTRY - 1);
+ int32_t i = highStart >> UCPTRIE_SHIFT_3;
+ int32_t iLimit = c >> UCPTRIE_SHIFT_3;
+ if (iLimit > indexCapacity) {
+ uint32_t *newIndex = (uint32_t *)uprv_malloc(I_LIMIT * 4);
+ if (newIndex == nullptr) { return false; }
+ uprv_memcpy(newIndex, index, i * 4);
+ uprv_free(index);
+ index = newIndex;
+ indexCapacity = I_LIMIT;
+ }
+ do {
+ flags[i] = ALL_SAME;
+ index[i] = initialValue;
+ } while(++i < iLimit);
+ highStart = c;
+ }
+ return true;
+}
+
+int32_t MutableCodePointTrie::allocDataBlock(int32_t blockLength) {
+ int32_t newBlock = dataLength;
+ int32_t newTop = newBlock + blockLength;
+ if (newTop > dataCapacity) {
+ int32_t capacity;
+ if (dataCapacity < MEDIUM_DATA_LENGTH) {
+ capacity = MEDIUM_DATA_LENGTH;
+ } else if (dataCapacity < MAX_DATA_LENGTH) {
+ capacity = MAX_DATA_LENGTH;
+ } else {
+ // Should never occur.
+ // Either MAX_DATA_LENGTH is incorrect,
+ // or the code writes more values than should be possible.
+ return -1;
+ }
+ uint32_t *newData = (uint32_t *)uprv_malloc(capacity * 4);
+ if (newData == nullptr) {
+ return -1;
+ }
+ uprv_memcpy(newData, data, (size_t)dataLength * 4);
+ uprv_free(data);
+ data = newData;
+ dataCapacity = capacity;
+ }
+ dataLength = newTop;
+ return newBlock;
+}
+
+/**
+ * No error checking for illegal arguments.
+ *
+ * @return -1 if no new data block available (out of memory in data array)
+ * @internal
+ */
+int32_t MutableCodePointTrie::getDataBlock(int32_t i) {
+ if (flags[i] == MIXED) {
+ return index[i];
+ }
+ if (i < BMP_I_LIMIT) {
+ int32_t newBlock = allocDataBlock(UCPTRIE_FAST_DATA_BLOCK_LENGTH);
+ if (newBlock < 0) { return newBlock; }
+ int32_t iStart = i & ~(SMALL_DATA_BLOCKS_PER_BMP_BLOCK -1);
+ int32_t iLimit = iStart + SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
+ do {
+ U_ASSERT(flags[iStart] == ALL_SAME);
+ writeBlock(data + newBlock, index[iStart]);
+ flags[iStart] = MIXED;
+ index[iStart++] = newBlock;
+ newBlock += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
+ } while (iStart < iLimit);
+ return index[i];
+ } else {
+ int32_t newBlock = allocDataBlock(UCPTRIE_SMALL_DATA_BLOCK_LENGTH);
+ if (newBlock < 0) { return newBlock; }
+ writeBlock(data + newBlock, index[i]);
+ flags[i] = MIXED;
+ index[i] = newBlock;
+ return newBlock;
+ }
+}
+
+void MutableCodePointTrie::set(UChar32 c, uint32_t value, UErrorCode &errorCode) {
+ if (U_FAILURE(errorCode)) {
+ return;
+ }
+ if ((uint32_t)c > MAX_UNICODE) {
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return;
+ }
+
+ int32_t block;
+ if (!ensureHighStart(c) || (block = getDataBlock(c >> UCPTRIE_SHIFT_3)) < 0) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+
+ data[block + (c & UCPTRIE_SMALL_DATA_MASK)] = value;
+}
+
+void
+fillBlock(uint32_t *block, UChar32 start, UChar32 limit, uint32_t value) {
+ uint32_t *pLimit = block + limit;
+ block += start;
+ while (block < pLimit) {
+ *block++ = value;
+ }
+}
+
+void MutableCodePointTrie::setRange(UChar32 start, UChar32 end, uint32_t value, UErrorCode &errorCode) {
+ if (U_FAILURE(errorCode)) {
+ return;
+ }
+ if ((uint32_t)start > MAX_UNICODE || (uint32_t)end > MAX_UNICODE || start > end) {
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return;
+ }
+ if (!ensureHighStart(end)) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+
+ UChar32 limit = end + 1;
+ if (start & UCPTRIE_SMALL_DATA_MASK) {
+ // Set partial block at [start..following block boundary[.
+ int32_t block = getDataBlock(start >> UCPTRIE_SHIFT_3);
+ if (block < 0) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+
+ UChar32 nextStart = (start + UCPTRIE_SMALL_DATA_MASK) & ~UCPTRIE_SMALL_DATA_MASK;
+ if (nextStart <= limit) {
+ fillBlock(data + block, start & UCPTRIE_SMALL_DATA_MASK, UCPTRIE_SMALL_DATA_BLOCK_LENGTH,
+ value);
+ start = nextStart;
+ } else {
+ fillBlock(data + block, start & UCPTRIE_SMALL_DATA_MASK, limit & UCPTRIE_SMALL_DATA_MASK,
+ value);
+ return;
+ }
+ }
+
+ // Number of positions in the last, partial block.
+ int32_t rest = limit & UCPTRIE_SMALL_DATA_MASK;
+
+ // Round down limit to a block boundary.
+ limit &= ~UCPTRIE_SMALL_DATA_MASK;
+
+ // Iterate over all-value blocks.
+ while (start < limit) {
+ int32_t i = start >> UCPTRIE_SHIFT_3;
+ if (flags[i] == ALL_SAME) {
+ index[i] = value;
+ } else /* MIXED */ {
+ fillBlock(data + index[i], 0, UCPTRIE_SMALL_DATA_BLOCK_LENGTH, value);
+ }
+ start += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
+ }
+
+ if (rest > 0) {
+ // Set partial block at [last block boundary..limit[.
+ int32_t block = getDataBlock(start >> UCPTRIE_SHIFT_3);
+ if (block < 0) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return;
+ }
+
+ fillBlock(data + block, 0, rest, value);
+ }
+}
+
+/* compaction --------------------------------------------------------------- */
+
+void MutableCodePointTrie::maskValues(uint32_t mask) {
+ initialValue &= mask;
+ errorValue &= mask;
+ highValue &= mask;
+ int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
+ for (int32_t i = 0; i < iLimit; ++i) {
+ if (flags[i] == ALL_SAME) {
+ index[i] &= mask;
+ }
+ }
+ for (int32_t i = 0; i < dataLength; ++i) {
+ data[i] &= mask;
+ }
+}
+
+inline bool
+equalBlocks(const uint32_t *s, const uint32_t *t, int32_t length) {
+ while (length > 0 && *s == *t) {
+ ++s;
+ ++t;
+ --length;
+ }
+ return length == 0;
+}
+
+inline bool
+equalBlocks(const uint16_t *s, const uint32_t *t, int32_t length) {
+ while (length > 0 && *s == *t) {
+ ++s;
+ ++t;
+ --length;
+ }
+ return length == 0;
+}
+
+inline bool
+equalBlocks(const uint16_t *s, const uint16_t *t, int32_t length) {
+ while (length > 0 && *s == *t) {
+ ++s;
+ ++t;
+ --length;
+ }
+ return length == 0;
+}
+
+bool allValuesSameAs(const uint32_t *p, int32_t length, uint32_t value) {
+ const uint32_t *pLimit = p + length;
+ while (p < pLimit && *p == value) { ++p; }
+ return p == pLimit;
+}
+
+/** Search for an identical block. */
+int32_t findSameBlock(const uint32_t *p, int32_t pStart, int32_t length,
+ const uint32_t *q, int32_t qStart, int32_t blockLength) {
+ // Ensure that we do not even partially get past length.
+ length -= blockLength;
+
+ q += qStart;
+ while (pStart <= length) {
+ if (equalBlocks(p + pStart, q, blockLength)) {
+ return pStart;
+ }
+ ++pStart;
+ }
+ return -1;
+}
+
+int32_t findSameBlock(const uint16_t *p, int32_t pStart, int32_t length,
+ const uint32_t *q, int32_t qStart, int32_t blockLength) {
+ // Ensure that we do not even partially get past length.
+ length -= blockLength;
+
+ q += qStart;
+ while (pStart <= length) {
+ if (equalBlocks(p + pStart, q, blockLength)) {
+ return pStart;
+ }
+ ++pStart;
+ }
+ return -1;
+}
+
+int32_t findSameBlock(const uint16_t *p, int32_t pStart, int32_t length,
+ const uint16_t *q, int32_t qStart, int32_t blockLength) {
+ // Ensure that we do not even partially get past length.
+ length -= blockLength;
+
+ q += qStart;
+ while (pStart <= length) {
+ if (equalBlocks(p + pStart, q, blockLength)) {
+ return pStart;
+ }
+ ++pStart;
+ }
+ return -1;
+}
+
+int32_t findAllSameBlock(const uint32_t *p, int32_t start, int32_t limit,
+ uint32_t value, int32_t blockLength) {
+ // Ensure that we do not even partially get past limit.
+ limit -= blockLength;
+
+ for (int32_t block = start; block <= limit; ++block) {
+ if (p[block] == value) {
+ for (int32_t i = 1;; ++i) {
+ if (i == blockLength) {
+ return block;
+ }
+ if (p[block + i] != value) {
+ block += i;
+ break;
+ }
+ }
+ }
+ }
+ return -1;
+}
+
+/**
+ * Look for maximum overlap of the beginning of the other block
+ * with the previous, adjacent block.
+ */
+int32_t getOverlap(const uint32_t *p, int32_t length,
+ const uint32_t *q, int32_t qStart, int32_t blockLength) {
+ int32_t overlap = blockLength - 1;
+ U_ASSERT(overlap <= length);
+ q += qStart;
+ while (overlap > 0 && !equalBlocks(p + (length - overlap), q, overlap)) {
+ --overlap;
+ }
+ return overlap;
+}
+
+int32_t getOverlap(const uint16_t *p, int32_t length,
+ const uint32_t *q, int32_t qStart, int32_t blockLength) {
+ int32_t overlap = blockLength - 1;
+ U_ASSERT(overlap <= length);
+ q += qStart;
+ while (overlap > 0 && !equalBlocks(p + (length - overlap), q, overlap)) {
+ --overlap;
+ }
+ return overlap;
+}
+
+int32_t getOverlap(const uint16_t *p, int32_t length,
+ const uint16_t *q, int32_t qStart, int32_t blockLength) {
+ int32_t overlap = blockLength - 1;
+ U_ASSERT(overlap <= length);
+ q += qStart;
+ while (overlap > 0 && !equalBlocks(p + (length - overlap), q, overlap)) {
+ --overlap;
+ }
+ return overlap;
+}
+
+int32_t getAllSameOverlap(const uint32_t *p, int32_t length, uint32_t value,
+ int32_t blockLength) {
+ int32_t min = length - (blockLength - 1);
+ int32_t i = length;
+ while (min < i && p[i - 1] == value) { --i; }
+ return length - i;
+}
+
+bool isStartOfSomeFastBlock(uint32_t dataOffset, const uint32_t index[], int32_t fastILimit) {
+ for (int32_t i = 0; i < fastILimit; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) {
+ if (index[i] == dataOffset) {
+ return true;
+ }
+ }
+ return false;
+}
+
+/**
+ * Finds the start of the last range in the trie by enumerating backward.
+ * Indexes for code points higher than this will be omitted.
+ */
+UChar32 MutableCodePointTrie::findHighStart() const {
+ int32_t i = highStart >> UCPTRIE_SHIFT_3;
+ while (i > 0) {
+ bool match;
+ if (flags[--i] == ALL_SAME) {
+ match = index[i] == highValue;
+ } else /* MIXED */ {
+ const uint32_t *p = data + index[i];
+ for (int32_t j = 0;; ++j) {
+ if (j == UCPTRIE_SMALL_DATA_BLOCK_LENGTH) {
+ match = true;
+ break;
+ }
+ if (p[j] != highValue) {
+ match = false;
+ break;
+ }
+ }
+ }
+ if (!match) {
+ return (i + 1) << UCPTRIE_SHIFT_3;
+ }
+ }
+ return 0;
+}
+
+class AllSameBlocks {
+public:
+ static constexpr int32_t NEW_UNIQUE = -1;
+ static constexpr int32_t OVERFLOW = -2;
+
+ AllSameBlocks() : length(0), mostRecent(-1) {}
+
+ int32_t findOrAdd(int32_t index, int32_t count, uint32_t value) {
+ if (mostRecent >= 0 && values[mostRecent] == value) {
+ refCounts[mostRecent] += count;
+ return indexes[mostRecent];
+ }
+ for (int32_t i = 0; i < length; ++i) {
+ if (values[i] == value) {
+ mostRecent = i;
+ refCounts[i] += count;
+ return indexes[i];
+ }
+ }
+ if (length == CAPACITY) {
+ return OVERFLOW;
+ }
+ mostRecent = length;
+ indexes[length] = index;
+ values[length] = value;
+ refCounts[length++] = count;
+ return NEW_UNIQUE;
+ }
+
+ /** Replaces the block which has the lowest reference count. */
+ void add(int32_t index, int32_t count, uint32_t value) {
+ U_ASSERT(length == CAPACITY);
+ int32_t least = -1;
+ int32_t leastCount = I_LIMIT;
+ for (int32_t i = 0; i < length; ++i) {
+ U_ASSERT(values[i] != value);
+ if (refCounts[i] < leastCount) {
+ least = i;
+ leastCount = refCounts[i];
+ }
+ }
+ U_ASSERT(least >= 0);
+ mostRecent = least;
+ indexes[least] = index;
+ values[least] = value;
+ refCounts[least] = count;
+ }
+
+ int32_t findMostUsed() const {
+ if (length == 0) { return -1; }
+ int32_t max = -1;
+ int32_t maxCount = 0;
+ for (int32_t i = 0; i < length; ++i) {
+ if (refCounts[i] > maxCount) {
+ max = i;
+ maxCount = refCounts[i];
+ }
+ }
+ return indexes[max];
+ }
+
+private:
+ static constexpr int32_t CAPACITY = 32;
+
+ int32_t length;
+ int32_t mostRecent;
+
+ int32_t indexes[CAPACITY];
+ uint32_t values[CAPACITY];
+ int32_t refCounts[CAPACITY];
+};
+
+int32_t MutableCodePointTrie::compactWholeDataBlocks(int32_t fastILimit, AllSameBlocks &allSameBlocks) {
+#ifdef UCPTRIE_DEBUG
+ bool overflow = false;
+#endif
+
+ // ASCII data will be stored as a linear table, even if the following code
+ // does not yet count it that way.
+ int32_t newDataCapacity = ASCII_LIMIT;
+ // Add room for a small data null block in case it would match the start of
+ // a fast data block where dataNullOffset must not be set in that case.
+ newDataCapacity += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
+ // Add room for special values (errorValue, highValue) and padding.
+ newDataCapacity += 4;
+ int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
+ int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH;
+ int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
+ for (int32_t i = 0; i < iLimit; i += inc) {
+ if (i == fastILimit) {
+ blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
+ inc = 1;
+ }
+ uint32_t value = index[i];
+ if (flags[i] == MIXED) {
+ // Really mixed?
+ const uint32_t *p = data + value;
+ value = *p;
+ if (allValuesSameAs(p + 1, blockLength - 1, value)) {
+ flags[i] = ALL_SAME;
+ index[i] = value;
+ // Fall through to ALL_SAME handling.
+ } else {
+ newDataCapacity += blockLength;
+ continue;
+ }
+ } else {
+ U_ASSERT(flags[i] == ALL_SAME);
+ if (inc > 1) {
+ // Do all of the fast-range data block's ALL_SAME parts have the same value?
+ bool allSame = true;
+ int32_t next_i = i + inc;
+ for (int32_t j = i + 1; j < next_i; ++j) {
+ U_ASSERT(flags[j] == ALL_SAME);
+ if (index[j] != value) {
+ allSame = false;
+ break;
+ }
+ }
+ if (!allSame) {
+ // Turn it into a MIXED block.
+ if (getDataBlock(i) < 0) {
+ return -1;
+ }
+ newDataCapacity += blockLength;
+ continue;
+ }
+ }
+ }
+ // Is there another ALL_SAME block with the same value?
+ int32_t other = allSameBlocks.findOrAdd(i, inc, value);
+ if (other == AllSameBlocks::OVERFLOW) {
+ // The fixed-size array overflowed. Slow check for a duplicate block.
+#ifdef UCPTRIE_DEBUG
+ if (!overflow) {
+ puts("UCPTrie AllSameBlocks overflow");
+ overflow = true;
+ }
+#endif
+ int32_t jInc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
+ for (int32_t j = 0;; j += jInc) {
+ if (j == i) {
+ allSameBlocks.add(i, inc, value);
+ break;
+ }
+ if (j == fastILimit) {
+ jInc = 1;
+ }
+ if (flags[j] == ALL_SAME && index[j] == value) {
+ allSameBlocks.add(j, jInc + inc, value);
+ other = j;
+ break;
+ // We could keep counting blocks with the same value
+ // before we add the first one, which may improve compaction in rare cases,
+ // but it would make it slower.
+ }
+ }
+ }
+ if (other >= 0) {
+ flags[i] = SAME_AS;
+ index[i] = other;
+ } else {
+ // New unique same-value block.
+ newDataCapacity += blockLength;
+ }
+ }
+ return newDataCapacity;
+}
+
+#ifdef UCPTRIE_DEBUG
+# define DEBUG_DO(expr) expr
+#else
+# define DEBUG_DO(expr)
+#endif
+
+#ifdef UCPTRIE_DEBUG
+// Braille symbols: U+28xx = UTF-8 E2 A0 80..E2 A3 BF
+int32_t appendValue(char s[], int32_t length, uint32_t value) {
+ value ^= value >> 16;
+ value ^= value >> 8;
+ s[length] = 0xE2;
+ s[length + 1] = (char)(0xA0 + ((value >> 6) & 3));
+ s[length + 2] = (char)(0x80 + (value & 0x3F));
+ return length + 3;
+}
+
+void printBlock(const uint32_t *block, int32_t blockLength, uint32_t value,
+ UChar32 start, int32_t overlap, uint32_t initialValue) {
+ char s[UCPTRIE_FAST_DATA_BLOCK_LENGTH * 3 + 3];
+ int32_t length = 0;
+ int32_t i;
+ for (i = 0; i < overlap; ++i) {
+ length = appendValue(s, length, 0); // Braille blank
+ }
+ s[length++] = '|';
+ for (; i < blockLength; ++i) {
+ if (block != nullptr) {
+ value = block[i];
+ }
+ if (value == initialValue) {
+ value = 0x40; // Braille lower left dot
+ }
+ length = appendValue(s, length, value);
+ }
+ s[length] = 0;
+ start += overlap;
+ if (start <= 0xffff) {
+ printf(" %04lX %s|\n", (long)start, s);
+ } else if (start <= 0xfffff) {
+ printf(" %5lX %s|\n", (long)start, s);
+ } else {
+ printf(" %6lX %s|\n", (long)start, s);
+ }
+}
+#endif
+
+/**
+ * Compacts a build-time trie.
+ *
+ * The compaction
+ * - removes blocks that are identical with earlier ones
+ * - overlaps each new non-duplicate block as much as possible with the previously-written one
+ * - works with fast-range data blocks whose length is a multiple of that of
+ * higher-code-point data blocks
+ *
+ * It does not try to find an optimal order of writing, deduplicating, and overlapping blocks.
+ */
+int32_t MutableCodePointTrie::compactData(int32_t fastILimit,
+ uint32_t *newData, int32_t dataNullIndex) {
+#ifdef UCPTRIE_DEBUG
+ int32_t countSame=0, sumOverlaps=0;
+ bool printData = dataLength == 29088 /* line.brk */ ||
+ // dataLength == 30048 /* CanonIterData */ ||
+ dataLength == 50400 /* zh.txt~stroke */;
+#endif
+
+ // The linear ASCII data has been copied into newData already.
+ int32_t newDataLength = 0;
+ for (int32_t i = 0; newDataLength < ASCII_LIMIT;
+ newDataLength += UCPTRIE_FAST_DATA_BLOCK_LENGTH, i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK) {
+ index[i] = newDataLength;
+#ifdef UCPTRIE_DEBUG
+ if (printData) {
+ printBlock(newData + newDataLength, UCPTRIE_FAST_DATA_BLOCK_LENGTH, 0, newDataLength, 0, initialValue);
+ }
+#endif
+ }
+
+ int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
+ int32_t blockLength = UCPTRIE_FAST_DATA_BLOCK_LENGTH;
+ int32_t inc = SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
+ int32_t fastLength = 0;
+ for (int32_t i = ASCII_I_LIMIT; i < iLimit; i += inc) {
+ if (i == fastILimit) {
+ blockLength = UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
+ inc = 1;
+ fastLength = newDataLength;
+ }
+ if (flags[i] == ALL_SAME) {
+ uint32_t value = index[i];
+ int32_t n;
+ // Find an earlier part of the data array of length blockLength
+ // that is filled with this value.
+ // If we find a match, and the current block is the data null block,
+ // and it is not a fast block but matches the start of a fast block,
+ // then we need to continue looking.
+ // This is because this small block is shorter than the fast block,
+ // and not all of the rest of the fast block is filled with this value.
+ // Otherwise trie.getRange() would detect that the fast block starts at
+ // dataNullOffset and assume incorrectly that it is filled with the null value.
+ for (int32_t start = 0;
+ (n = findAllSameBlock(newData, start, newDataLength,
+ value, blockLength)) >= 0 &&
+ i == dataNullIndex && i >= fastILimit && n < fastLength &&
+ isStartOfSomeFastBlock(n, index, fastILimit);
+ start = n + 1) {}
+ if (n >= 0) {
+ DEBUG_DO(++countSame);
+ index[i] = n;
+ } else {
+ n = getAllSameOverlap(newData, newDataLength, value, blockLength);
+ DEBUG_DO(sumOverlaps += n);
+#ifdef UCPTRIE_DEBUG
+ if (printData) {
+ printBlock(nullptr, blockLength, value, i << UCPTRIE_SHIFT_3, n, initialValue);
+ }
+#endif
+ index[i] = newDataLength - n;
+ while (n < blockLength) {
+ newData[newDataLength++] = value;
+ ++n;
+ }
+ }
+ } else if (flags[i] == MIXED) {
+ const uint32_t *block = data + index[i];
+ int32_t n = findSameBlock(newData, 0, newDataLength, block, 0, blockLength);
+ if (n >= 0) {
+ DEBUG_DO(++countSame);
+ index[i] = n;
+ } else {
+ n = getOverlap(newData, newDataLength, block, 0, blockLength);
+ DEBUG_DO(sumOverlaps += n);
+#ifdef UCPTRIE_DEBUG
+ if (printData) {
+ printBlock(block, blockLength, 0, i << UCPTRIE_SHIFT_3, n, initialValue);
+ }
+#endif
+ index[i] = newDataLength - n;
+ while (n < blockLength) {
+ newData[newDataLength++] = block[n++];
+ }
+ }
+ } else /* SAME_AS */ {
+ uint32_t j = index[i];
+ index[i] = index[j];
+ }
+ }
+
+#ifdef UCPTRIE_DEBUG
+ /* we saved some space */
+ printf("compacting UCPTrie: count of 32-bit data words %lu->%lu countSame=%ld sumOverlaps=%ld\n",
+ (long)dataLength, (long)newDataLength, (long)countSame, (long)sumOverlaps);
+#endif
+ return newDataLength;
+}
+
+int32_t MutableCodePointTrie::compactIndex(int32_t fastILimit, UErrorCode &errorCode) {
+ int32_t fastIndexLength = fastILimit >> (UCPTRIE_FAST_SHIFT - UCPTRIE_SHIFT_3);
+ if ((highStart >> UCPTRIE_FAST_SHIFT) <= fastIndexLength) {
+ // Only the linear fast index, no multi-stage index tables.
+ index3NullOffset = UCPTRIE_NO_INDEX3_NULL_OFFSET;
+ return fastIndexLength;
+ }
+
+ // Condense the fast index table.
+ // Also, does it contain an index-3 block with all dataNullOffset?
+ uint16_t fastIndex[UCPTRIE_BMP_INDEX_LENGTH]; // fastIndexLength
+ int32_t i3FirstNull = -1;
+ for (int32_t i = 0, j = 0; i < fastILimit; ++j) {
+ uint32_t i3 = index[i];
+ fastIndex[j] = (uint16_t)i3;
+ if (i3 == (uint32_t)dataNullOffset) {
+ if (i3FirstNull < 0) {
+ i3FirstNull = j;
+ } else if (index3NullOffset < 0 &&
+ (j - i3FirstNull + 1) == UCPTRIE_INDEX_3_BLOCK_LENGTH) {
+ index3NullOffset = i3FirstNull;
+ }
+ } else {
+ i3FirstNull = -1;
+ }
+ // Set the index entries that compactData() skipped.
+ // Needed when the multi-stage index covers the fast index range as well.
+ int32_t iNext = i + SMALL_DATA_BLOCKS_PER_BMP_BLOCK;
+ while (++i < iNext) {
+ i3 += UCPTRIE_SMALL_DATA_BLOCK_LENGTH;
+ index[i] = i3;
+ }
+ }
+
+ // Examine index-3 blocks. For each determine one of:
+ // - same as the index-3 null block
+ // - same as a fast-index block
+ // - 16-bit indexes
+ // - 18-bit indexes
+ // We store this in the first flags entry for the index-3 block.
+ //
+ // Also determine an upper limit for the index-3 table length.
+ int32_t index3Capacity = 0;
+ i3FirstNull = index3NullOffset;
+ // If the fast index covers the whole BMP, then
+ // the multi-stage index is only for supplementary code points.
+ // Otherwise, the multi-stage index covers all of Unicode.
+ int32_t iStart = fastILimit < BMP_I_LIMIT ? 0 : BMP_I_LIMIT;
+ int32_t iLimit = highStart >> UCPTRIE_SHIFT_3;
+ for (int32_t i = iStart; i < iLimit;) {
+ int32_t j = i;
+ int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH;
+ uint32_t oredI3 = 0;
+ bool isNull = true;
+ do {
+ uint32_t i3 = index[j];
+ oredI3 |= i3;
+ if (i3 != (uint32_t)dataNullOffset) {
+ isNull = false;
+ }
+ } while (++j < jLimit);
+ if (isNull) {
+ flags[i] = I3_NULL;
+ if (i3FirstNull < 0) {
+ if (oredI3 <= 0xffff) {
+ index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH;
+ } else {
+ index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH;
+ }
+ i3FirstNull = 0;
+ }
+ } else {
+ if (oredI3 <= 0xffff) {
+ int32_t n = findSameBlock(fastIndex, 0, fastIndexLength,
+ index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH);
+ if (n >= 0) {
+ flags[i] = I3_BMP;
+ index[i] = n;
+ } else {
+ flags[i] = I3_16;
+ index3Capacity += UCPTRIE_INDEX_3_BLOCK_LENGTH;
+ }
+ } else {
+ flags[i] = I3_18;
+ index3Capacity += INDEX_3_18BIT_BLOCK_LENGTH;
+ }
+ }
+ i = j;
+ }
+
+ int32_t index2Capacity = (iLimit - iStart) >> UCPTRIE_SHIFT_2_3;
+
+ // Length of the index-1 table, rounded up.
+ int32_t index1Length = (index2Capacity + UCPTRIE_INDEX_2_MASK) >> UCPTRIE_SHIFT_1_2;
+
+ // Index table: Fast index, index-1, index-3, index-2.
+ // +1 for possible index table padding.
+ int32_t index16Capacity = fastIndexLength + index1Length + index3Capacity + index2Capacity + 1;
+ index16 = (uint16_t *)uprv_malloc(index16Capacity * 2);
+ if (index16 == nullptr) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return 0;
+ }
+ uprv_memcpy(index16, fastIndex, fastIndexLength * 2);
+
+ // Compact the index-3 table and write an uncompacted version of the index-2 table.
+ uint16_t index2[UNICODE_LIMIT >> UCPTRIE_SHIFT_2]; // index2Capacity
+ int32_t i2Length = 0;
+ i3FirstNull = index3NullOffset;
+ int32_t index3Start = fastIndexLength + index1Length;
+ int32_t indexLength = index3Start;
+ for (int32_t i = iStart; i < iLimit; i += UCPTRIE_INDEX_3_BLOCK_LENGTH) {
+ int32_t i3;
+ uint8_t f = flags[i];
+ if (f == I3_NULL && i3FirstNull < 0) {
+ // First index-3 null block. Write & overlap it like a normal block, then remember it.
+ f = dataNullOffset <= 0xffff ? I3_16 : I3_18;
+ i3FirstNull = 0;
+ }
+ if (f == I3_NULL) {
+ i3 = index3NullOffset;
+ } else if (f == I3_BMP) {
+ i3 = index[i];
+ } else if (f == I3_16) {
+ int32_t n = findSameBlock(index16, index3Start, indexLength,
+ index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH);
+ if (n >= 0) {
+ i3 = n;
+ } else {
+ if (indexLength == index3Start) {
+ // No overlap at the boundary between the index-1 and index-3 tables.
+ n = 0;
+ } else {
+ n = getOverlap(index16, indexLength,
+ index, i, UCPTRIE_INDEX_3_BLOCK_LENGTH);
+ }
+ i3 = indexLength - n;
+ while (n < UCPTRIE_INDEX_3_BLOCK_LENGTH) {
+ index16[indexLength++] = index[i + n++];
+ }
+ }
+ } else {
+ U_ASSERT(f == I3_18);
+ // Encode an index-3 block that contains one or more data indexes exceeding 16 bits.
+ int32_t j = i;
+ int32_t jLimit = i + UCPTRIE_INDEX_3_BLOCK_LENGTH;
+ int32_t k = indexLength;
+ do {
+ ++k;
+ uint32_t v = index[j++];
+ uint32_t upperBits = (v & 0x30000) >> 2;
+ index16[k++] = v;
+ v = index[j++];
+ upperBits |= (v & 0x30000) >> 4;
+ index16[k++] = v;
+ v = index[j++];
+ upperBits |= (v & 0x30000) >> 6;
+ index16[k++] = v;
+ v = index[j++];
+ upperBits |= (v & 0x30000) >> 8;
+ index16[k++] = v;
+ v = index[j++];
+ upperBits |= (v & 0x30000) >> 10;
+ index16[k++] = v;
+ v = index[j++];
+ upperBits |= (v & 0x30000) >> 12;
+ index16[k++] = v;
+ v = index[j++];
+ upperBits |= (v & 0x30000) >> 14;
+ index16[k++] = v;
+ v = index[j++];
+ upperBits |= (v & 0x30000) >> 16;
+ index16[k++] = v;
+ index16[k - 9] = upperBits;
+ } while (j < jLimit);
+ int32_t n = findSameBlock(index16, index3Start, indexLength,
+ index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH);
+ if (n >= 0) {
+ i3 = n | 0x8000;
+ } else {
+ if (indexLength == index3Start) {
+ // No overlap at the boundary between the index-1 and index-3 tables.
+ n = 0;
+ } else {
+ n = getOverlap(index16, indexLength,
+ index16, indexLength, INDEX_3_18BIT_BLOCK_LENGTH);
+ }
+ i3 = (indexLength - n) | 0x8000;
+ if (n > 0) {
+ int32_t start = indexLength;
+ while (n < INDEX_3_18BIT_BLOCK_LENGTH) {
+ index16[indexLength++] = index16[start + n++];
+ }
+ } else {
+ indexLength += INDEX_3_18BIT_BLOCK_LENGTH;
+ }
+ }
+ }
+ if (index3NullOffset < 0 && i3FirstNull >= 0) {
+ index3NullOffset = i3;
+ }
+ // Set the index-2 table entry.
+ index2[i2Length++] = i3;
+ }
+ U_ASSERT(i2Length == index2Capacity);
+ U_ASSERT(indexLength <= index3Start + index3Capacity);
+
+ if (index3NullOffset < 0) {
+ index3NullOffset = UCPTRIE_NO_INDEX3_NULL_OFFSET;
+ }
+ if (indexLength >= (UCPTRIE_NO_INDEX3_NULL_OFFSET + UCPTRIE_INDEX_3_BLOCK_LENGTH)) {
+ // The index-3 offsets exceed 15 bits, or
+ // the last one cannot be distinguished from the no-null-block value.
+ errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+
+ // Compact the index-2 table and write the index-1 table.
+ int32_t blockLength = UCPTRIE_INDEX_2_BLOCK_LENGTH;
+ int32_t i1 = fastIndexLength;
+ for (int32_t i = 0; i < i2Length; i += blockLength) {
+ if ((i2Length - i) < blockLength) {
+ // highStart is inside the last index-2 block. Shorten it.
+ blockLength = i2Length - i;
+ }
+ int32_t i2;
+ int32_t n = findSameBlock(index16, index3Start, indexLength,
+ index2, i, blockLength);
+ if (n >= 0) {
+ i2 = n;
+ } else {
+ if (indexLength == index3Start) {
+ // No overlap at the boundary between the index-1 and index-3/2 tables.
+ n = 0;
+ } else {
+ n = getOverlap(index16, indexLength, index2, i, blockLength);
+ }
+ i2 = indexLength - n;
+ while (n < blockLength) {
+ index16[indexLength++] = index2[i + n++];
+ }
+ }
+ // Set the index-1 table entry.
+ index16[i1++] = i2;
+ }
+ U_ASSERT(i1 == index3Start);
+ U_ASSERT(indexLength <= index16Capacity);
+
+#ifdef UCPTRIE_DEBUG
+ /* we saved some space */
+ printf("compacting UCPTrie: count of 16-bit index words %lu->%lu\n",
+ (long)iLimit, (long)indexLength);
+#endif
+
+ return indexLength;
+}
+
+int32_t MutableCodePointTrie::compactTrie(int32_t fastILimit, UErrorCode &errorCode) {
+ // Find the real highStart and round it up.
+ U_ASSERT((highStart & (UCPTRIE_CP_PER_INDEX_2_ENTRY - 1)) == 0);
+ highValue = get(MAX_UNICODE);
+ int32_t realHighStart = findHighStart();
+ realHighStart = (realHighStart + (UCPTRIE_CP_PER_INDEX_2_ENTRY - 1)) &
+ ~(UCPTRIE_CP_PER_INDEX_2_ENTRY - 1);
+ if (realHighStart == UNICODE_LIMIT) {
+ highValue = initialValue;
+ }
+
+#ifdef UCPTRIE_DEBUG
+ printf("UCPTrie: highStart U+%06lx highValue 0x%lx initialValue 0x%lx\n",
+ (long)realHighStart, (long)highValue, (long)initialValue);
+#endif
+
+ // We always store indexes and data values for the fast range.
+ // Pin highStart to the top of that range while building.
+ UChar32 fastLimit = fastILimit << UCPTRIE_SHIFT_3;
+ if (realHighStart < fastLimit) {
+ for (int32_t i = (realHighStart >> UCPTRIE_SHIFT_3); i < fastILimit; ++i) {
+ flags[i] = ALL_SAME;
+ index[i] = highValue;
+ }
+ highStart = fastLimit;
+ } else {
+ highStart = realHighStart;
+ }
+
+ uint32_t asciiData[ASCII_LIMIT];
+ for (int32_t i = 0; i < ASCII_LIMIT; ++i) {
+ asciiData[i] = get(i);
+ }
+
+ // First we look for which data blocks have the same value repeated over the whole block,
+ // deduplicate such blocks, find a good null data block (for faster enumeration),
+ // and get an upper bound for the necessary data array length.
+ AllSameBlocks allSameBlocks;
+ int32_t newDataCapacity = compactWholeDataBlocks(fastILimit, allSameBlocks);
+ if (newDataCapacity < 0) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return 0;
+ }
+ uint32_t *newData = (uint32_t *)uprv_malloc(newDataCapacity * 4);
+ if (newData == nullptr) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ return 0;
+ }
+ uprv_memcpy(newData, asciiData, sizeof(asciiData));
+
+ int32_t dataNullIndex = allSameBlocks.findMostUsed();
+ int32_t newDataLength = compactData(fastILimit, newData, dataNullIndex);
+ U_ASSERT(newDataLength <= newDataCapacity);
+ uprv_free(data);
+ data = newData;
+ dataCapacity = newDataCapacity;
+ dataLength = newDataLength;
+ if (dataLength > (0x3ffff + UCPTRIE_SMALL_DATA_BLOCK_LENGTH)) {
+ // The offset of the last data block is too high to be stored in the index table.
+ errorCode = U_INDEX_OUTOFBOUNDS_ERROR;
+ return 0;
+ }
+
+ if (dataNullIndex >= 0) {
+ dataNullOffset = index[dataNullIndex];
+#ifdef UCPTRIE_DEBUG
+ if (data[dataNullOffset] != initialValue) {
+ printf("UCPTrie initialValue %lx -> more common nullValue %lx\n",
+ (long)initialValue, (long)data[dataNullOffset]);
+ }
+#endif
+ initialValue = data[dataNullOffset];
+ } else {
+ dataNullOffset = UCPTRIE_NO_DATA_NULL_OFFSET;
+ }
+
+ int32_t indexLength = compactIndex(fastILimit, errorCode);
+ highStart = realHighStart;
+ return indexLength;
+}
+
+UCPTrie *MutableCodePointTrie::build(UCPTrieType type, UCPTrieValueWidth valueWidth, UErrorCode &errorCode) {
+ if (U_FAILURE(errorCode)) {
+ return nullptr;
+ }
+ if (type < UCPTRIE_TYPE_FAST || UCPTRIE_TYPE_SMALL < type ||
+ valueWidth < UCPTRIE_VALUE_BITS_16 || UCPTRIE_VALUE_BITS_8 < valueWidth) {
+ errorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return nullptr;
+ }
+
+ // The mutable trie always stores 32-bit values.
+ // When we build a UCPTrie for a smaller value width, we first mask off unused bits
+ // before compacting the data.
+ switch (valueWidth) {
+ case UCPTRIE_VALUE_BITS_32:
+ break;
+ case UCPTRIE_VALUE_BITS_16:
+ maskValues(0xffff);
+ break;
+ case UCPTRIE_VALUE_BITS_8:
+ maskValues(0xff);
+ break;
+ default:
+ break;
+ }
+
+ UChar32 fastLimit = type == UCPTRIE_TYPE_FAST ? BMP_LIMIT : UCPTRIE_SMALL_LIMIT;
+ int32_t indexLength = compactTrie(fastLimit >> UCPTRIE_SHIFT_3, errorCode);
+ if (U_FAILURE(errorCode)) {
+ clear();
+ return nullptr;
+ }
+
+ // Ensure data table alignment: The index length must be even for uint32_t data.
+ if (valueWidth == UCPTRIE_VALUE_BITS_32 && (indexLength & 1) != 0) {
+ index16[indexLength++] = 0xffee; // arbitrary value
+ }
+
+ // Make the total trie structure length a multiple of 4 bytes by padding the data table,
+ // and store special values as the last two data values.
+ int32_t length = indexLength * 2;
+ if (valueWidth == UCPTRIE_VALUE_BITS_16) {
+ if (((indexLength ^ dataLength) & 1) != 0) {
+ // padding
+ data[dataLength++] = errorValue;
+ }
+ if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) {
+ data[dataLength++] = highValue;
+ data[dataLength++] = errorValue;
+ }
+ length += dataLength * 2;
+ } else if (valueWidth == UCPTRIE_VALUE_BITS_32) {
+ // 32-bit data words never need padding to a multiple of 4 bytes.
+ if (data[dataLength - 1] != errorValue || data[dataLength - 2] != highValue) {
+ if (data[dataLength - 1] != highValue) {
+ data[dataLength++] = highValue;
+ }
+ data[dataLength++] = errorValue;
+ }
+ length += dataLength * 4;
+ } else {
+ int32_t and3 = (length + dataLength) & 3;
+ if (and3 == 0 && data[dataLength - 1] == errorValue && data[dataLength - 2] == highValue) {
+ // all set
+ } else if(and3 == 3 && data[dataLength - 1] == highValue) {
+ data[dataLength++] = errorValue;
+ } else {
+ while (and3 != 2) {
+ data[dataLength++] = highValue;
+ and3 = (and3 + 1) & 3;
+ }
+ data[dataLength++] = highValue;
+ data[dataLength++] = errorValue;
+ }
+ length += dataLength;
+ }
+
+ // Calculate the total length of the UCPTrie as a single memory block.
+ length += sizeof(UCPTrie);
+ U_ASSERT((length & 3) == 0);
+
+ uint8_t *bytes = (uint8_t *)uprv_malloc(length);
+ if (bytes == nullptr) {
+ errorCode = U_MEMORY_ALLOCATION_ERROR;
+ clear();
+ return nullptr;
+ }
+ UCPTrie *trie = reinterpret_cast<UCPTrie *>(bytes);
+ uprv_memset(trie, 0, sizeof(UCPTrie));
+ trie->indexLength = indexLength;
+ trie->dataLength = dataLength;
+
+ trie->highStart = highStart;
+ // Round up shifted12HighStart to a multiple of 0x1000 for easy testing from UTF-8 lead bytes.
+ // Runtime code needs to then test for the real highStart as well.
+ trie->shifted12HighStart = (highStart + 0xfff) >> 12;
+ trie->type = type;
+ trie->valueWidth = valueWidth;
+
+ trie->index3NullOffset = index3NullOffset;
+ trie->dataNullOffset = dataNullOffset;
+ trie->nullValue = initialValue;
+
+ bytes += sizeof(UCPTrie);
+
+ // Fill the index and data arrays.
+ uint16_t *dest16 = (uint16_t *)bytes;
+ trie->index = dest16;
+
+ if (highStart <= fastLimit) {
+ // Condense only the fast index from the mutable-trie index.
+ for (int32_t i = 0, j = 0; j < indexLength; i += SMALL_DATA_BLOCKS_PER_BMP_BLOCK, ++j) {
+ *dest16++ = (uint16_t)index[i]; // dest16[j]
+ }
+ } else {
+ uprv_memcpy(dest16, index16, indexLength * 2);
+ dest16 += indexLength;
+ }
+ bytes += indexLength * 2;
+
+ // Write the data array.
+ const uint32_t *p = data;
+ switch (valueWidth) {
+ case UCPTRIE_VALUE_BITS_16:
+ // Write 16-bit data values.
+ trie->data.ptr16 = dest16;
+ for (int32_t i = dataLength; i > 0; --i) {
+ *dest16++ = (uint16_t)*p++;
+ }
+ break;
+ case UCPTRIE_VALUE_BITS_32:
+ // Write 32-bit data values.
+ trie->data.ptr32 = (uint32_t *)bytes;
+ uprv_memcpy(bytes, p, (size_t)dataLength * 4);
+ break;
+ case UCPTRIE_VALUE_BITS_8:
+ // Write 8-bit data values.
+ trie->data.ptr8 = bytes;
+ for (int32_t i = dataLength; i > 0; --i) {
+ *bytes++ = (uint8_t)*p++;
+ }
+ break;
+ default:
+ // Will not occur, valueWidth checked at the beginning.
+ break;
+ }
+
+#ifdef UCPTRIE_DEBUG
+ trie->name = name;
+
+ ucptrie_printLengths(trie, "");
+#endif
+
+ clear();
+ return trie;
+}
+
+} // namespace
+
+U_NAMESPACE_END
+
+U_NAMESPACE_USE
+
+U_CAPI UMutableCPTrie * U_EXPORT2
+umutablecptrie_open(uint32_t initialValue, uint32_t errorValue, UErrorCode *pErrorCode) {
+ if (U_FAILURE(*pErrorCode)) {
+ return nullptr;
+ }
+ LocalPointer<MutableCodePointTrie> trie(
+ new MutableCodePointTrie(initialValue, errorValue, *pErrorCode), *pErrorCode);
+ if (U_FAILURE(*pErrorCode)) {
+ return nullptr;
+ }
+ return reinterpret_cast<UMutableCPTrie *>(trie.orphan());
+}
+
+U_CAPI UMutableCPTrie * U_EXPORT2
+umutablecptrie_clone(const UMutableCPTrie *other, UErrorCode *pErrorCode) {
+ if (U_FAILURE(*pErrorCode)) {
+ return nullptr;
+ }
+ if (other == nullptr) {
+ return nullptr;
+ }
+ LocalPointer<MutableCodePointTrie> clone(
+ new MutableCodePointTrie(*reinterpret_cast<const MutableCodePointTrie *>(other), *pErrorCode), *pErrorCode);
+ if (U_FAILURE(*pErrorCode)) {
+ return nullptr;
+ }
+ return reinterpret_cast<UMutableCPTrie *>(clone.orphan());
+}
+
+U_CAPI void U_EXPORT2
+umutablecptrie_close(UMutableCPTrie *trie) {
+ delete reinterpret_cast<MutableCodePointTrie *>(trie);
+}
+
+U_CAPI UMutableCPTrie * U_EXPORT2
+umutablecptrie_fromUCPMap(const UCPMap *map, UErrorCode *pErrorCode) {
+ if (U_FAILURE(*pErrorCode)) {
+ return nullptr;
+ }
+ if (map == nullptr) {
+ *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return nullptr;
+ }
+ return reinterpret_cast<UMutableCPTrie *>(MutableCodePointTrie::fromUCPMap(map, *pErrorCode));
+}
+
+U_CAPI UMutableCPTrie * U_EXPORT2
+umutablecptrie_fromUCPTrie(const UCPTrie *trie, UErrorCode *pErrorCode) {
+ if (U_FAILURE(*pErrorCode)) {
+ return nullptr;
+ }
+ if (trie == nullptr) {
+ *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR;
+ return nullptr;
+ }
+ return reinterpret_cast<UMutableCPTrie *>(MutableCodePointTrie::fromUCPTrie(trie, *pErrorCode));
+}
+
+U_CAPI uint32_t U_EXPORT2
+umutablecptrie_get(const UMutableCPTrie *trie, UChar32 c) {
+ return reinterpret_cast<const MutableCodePointTrie *>(trie)->get(c);
+}
+
+namespace {
+
+UChar32 getRange(const void *trie, UChar32 start,
+ UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
+ return reinterpret_cast<const MutableCodePointTrie *>(trie)->
+ getRange(start, filter, context, pValue);
+}
+
+} // namespace
+
+U_CAPI UChar32 U_EXPORT2
+umutablecptrie_getRange(const UMutableCPTrie *trie, UChar32 start,
+ UCPMapRangeOption option, uint32_t surrogateValue,
+ UCPMapValueFilter *filter, const void *context, uint32_t *pValue) {
+ return ucptrie_internalGetRange(getRange, trie, start,
+ option, surrogateValue,
+ filter, context, pValue);
+}
+
+U_CAPI void U_EXPORT2
+umutablecptrie_set(UMutableCPTrie *trie, UChar32 c, uint32_t value, UErrorCode *pErrorCode) {
+ if (U_FAILURE(*pErrorCode)) {
+ return;
+ }
+ reinterpret_cast<MutableCodePointTrie *>(trie)->set(c, value, *pErrorCode);
+}
+
+U_CAPI void U_EXPORT2
+umutablecptrie_setRange(UMutableCPTrie *trie, UChar32 start, UChar32 end,
+ uint32_t value, UErrorCode *pErrorCode) {
+ if (U_FAILURE(*pErrorCode)) {
+ return;
+ }
+ reinterpret_cast<MutableCodePointTrie *>(trie)->setRange(start, end, value, *pErrorCode);
+}
+
+/* Compact and internally serialize the trie. */
+U_CAPI UCPTrie * U_EXPORT2
+umutablecptrie_buildImmutable(UMutableCPTrie *trie, UCPTrieType type, UCPTrieValueWidth valueWidth,
+ UErrorCode *pErrorCode) {
+ if (U_FAILURE(*pErrorCode)) {
+ return nullptr;
+ }
+ return reinterpret_cast<MutableCodePointTrie *>(trie)->build(type, valueWidth, *pErrorCode);
+}
+
+#ifdef UCPTRIE_DEBUG
+U_CFUNC void umutablecptrie_setName(UMutableCPTrie *trie, const char *name) {
+ reinterpret_cast<MutableCodePointTrie *>(trie)->name = name;
+}
+#endif