diff options
Diffstat (limited to 'deps/icu-small/source/common/normalizer2impl.h')
-rw-r--r-- | deps/icu-small/source/common/normalizer2impl.h | 44 |
1 files changed, 35 insertions, 9 deletions
diff --git a/deps/icu-small/source/common/normalizer2impl.h b/deps/icu-small/source/common/normalizer2impl.h index 9dd4d1e5ab..2e6aff3088 100644 --- a/deps/icu-small/source/common/normalizer2impl.h +++ b/deps/icu-small/source/common/normalizer2impl.h @@ -24,12 +24,20 @@ #if !UCONFIG_NO_NORMALIZATION #include "unicode/normalizer2.h" +#include "unicode/ucptrie.h" #include "unicode/unistr.h" #include "unicode/unorm.h" +#include "unicode/utf.h" #include "unicode/utf16.h" #include "mutex.h" +#include "udataswp.h" #include "uset_imp.h" -#include "utrie2.h" + +// When the nfc.nrm data is *not* hardcoded into the common library +// (with this constant set to 0), +// then it needs to be built into the data package: +// Add nfc.nrm to icu4c/source/data/Makefile.in DAT_FILES_SHORT +#define NORM2_HARDCODE_NFC_DATA 1 U_NAMESPACE_BEGIN @@ -118,7 +126,7 @@ public: buffer[0]=(UChar)(JAMO_L_BASE+c/JAMO_V_COUNT); buffer[1]=(UChar)(JAMO_V_BASE+c%JAMO_V_COUNT); } else { - buffer[0]=orig-c2; // LV syllable + buffer[0]=(UChar)(orig-c2); // LV syllable buffer[1]=(UChar)(JAMO_T_BASE+c2); } } @@ -158,8 +166,7 @@ public: appendBMP((UChar)c, cc, errorCode) : appendSupplementary(c, cc, errorCode); } - // s must be in NFD, otherwise change the implementation. - UBool append(const UChar *s, int32_t length, + UBool append(const UChar *s, int32_t length, UBool isNFD, uint8_t leadCC, uint8_t trailCC, UErrorCode &errorCode); UBool appendBMP(UChar c, uint8_t cc, UErrorCode &errorCode) { @@ -243,7 +250,7 @@ public: } virtual ~Normalizer2Impl(); - void init(const int32_t *inIndexes, const UTrie2 *inTrie, + void init(const int32_t *inIndexes, const UCPTrie *inTrie, const uint16_t *inExtraData, const uint8_t *inSmallFCD); void addLcccChars(UnicodeSet &set) const; @@ -254,7 +261,12 @@ public: UBool ensureCanonIterData(UErrorCode &errorCode) const; - uint16_t getNorm16(UChar32 c) const { return UTRIE2_GET16(normTrie, c); } + // The trie stores values for lead surrogate code *units*. + // Surrogate code *points* are inert. + uint16_t getNorm16(UChar32 c) const { + return U_IS_LEAD(c) ? INERT : UCPTRIE_FAST_GET(normTrie, UCPTRIE_16, c); + } + uint16_t getRawNorm16(UChar32 c) const { return UCPTRIE_FAST_GET(normTrie, UCPTRIE_16, c); } UNormalizationCheckResult getCompQuickCheck(uint16_t norm16) const { if(norm16<minNoNo || MIN_YES_YES_WITH_CC<=norm16) { @@ -704,7 +716,7 @@ private: uint16_t centerNoNoDelta; uint16_t minMaybeYes; - const UTrie2 *normTrie; + const UCPTrie *normTrie; const uint16_t *maybeYesCompositions; const uint16_t *extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters const uint8_t *smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0 @@ -764,7 +776,7 @@ unorm_getFCD16(UChar32 c); /** * Format of Normalizer2 .nrm data files. - * Format version 3.0. + * Format version 4.0. * * Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms. * ICU ships with data files for standard Unicode Normalization Forms @@ -818,7 +830,7 @@ unorm_getFCD16(UChar32 c); * minMaybeYes=indexes[IX_MIN_MAYBE_YES]; * See the normTrie description below and the design doc for details. * - * UTrie2 normTrie; -- see utrie2_impl.h and utrie2.h + * UCPTrie normTrie; -- see ucptrie_impl.h and ucptrie.h, same as Java CodePointTrie * * The trie holds the main normalization data. Each code point is mapped to a 16-bit value. * Rather than using independent bits in the value (which would require more than 16 bits), @@ -946,6 +958,20 @@ unorm_getFCD16(UChar32 c); * which is artificially assigned "worst case" values lccc=1 and tccc=255. * * - A mapping to an empty string has explicit lccc=1 and tccc=255 values. + * + * Changes from format version 3 to format version 4 (ICU 63) ------------------ + * + * Switched from UTrie2 to UCPTrie/CodePointTrie. + * + * The new trie no longer stores different values for surrogate code *units* vs. + * surrogate code *points*. + * Lead surrogates still have values for optimized UTF-16 string processing. + * When looking up code point properties, the code now checks for lead surrogates and + * treats them as inert. + * + * gennorm2 now has to reject mappings for surrogate code points. + * UTS #46 maps unpaired surrogates to U+FFFD in code rather than via its + * custom normalization data file. */ #endif /* !UCONFIG_NO_NORMALIZATION */ |