summaryrefslogtreecommitdiff
path: root/deps/icu-small/source/common/normalizer2impl.h
diff options
context:
space:
mode:
authorSteven R. Loomis <srloomis@us.ibm.com>2017-09-21 15:31:38 -0700
committerSteven R. Loomis <srloomis@us.ibm.com>2017-11-09 18:25:58 -0800
commit44d3e17985befbd45457d5ad7f0a0387849e1b2f (patch)
treef75f2eddb868f13254b7f514875534dee616c0d6 /deps/icu-small/source/common/normalizer2impl.h
parent3b3ceafaf922e1d79950595eaa501aa412913820 (diff)
downloadandroid-node-v8-44d3e17985befbd45457d5ad7f0a0387849e1b2f.tar.gz
android-node-v8-44d3e17985befbd45457d5ad7f0a0387849e1b2f.tar.bz2
android-node-v8-44d3e17985befbd45457d5ad7f0a0387849e1b2f.zip
deps: ICU 60 bump
- Update to released ICU 60.1, including: - CLDR 32 (many new languages and data improvements) - Unicode 10 (8,518 new characters, including four new scripts, 7,494 new Han characters, and 56 new emoji characters) - UTF-8 malformed bytes now handled according to W3C/WHATWG spec Fixes: https://github.com/nodejs/node/issues/15540 PR-URL: https://github.com/nodejs/node/pull/16876 Reviewed-By: James M Snell <jasnell@gmail.com> Reviewed-By: Michael Dawson <michael_dawson@ca.ibm.com>
Diffstat (limited to 'deps/icu-small/source/common/normalizer2impl.h')
-rw-r--r--deps/icu-small/source/common/normalizer2impl.h304
1 files changed, 233 insertions, 71 deletions
diff --git a/deps/icu-small/source/common/normalizer2impl.h b/deps/icu-small/source/common/normalizer2impl.h
index 946abee98f..9dd4d1e5ab 100644
--- a/deps/icu-small/source/common/normalizer2impl.h
+++ b/deps/icu-small/source/common/normalizer2impl.h
@@ -35,6 +35,11 @@ U_NAMESPACE_BEGIN
struct CanonIterData;
+class ByteSink;
+class Edits;
+class InitCanonIterData;
+class LcccContext;
+
class U_COMMON_API Hangul {
public:
/* Korean Hangul and Jamo constants */
@@ -63,9 +68,9 @@ public:
return HANGUL_BASE<=c && c<HANGUL_LIMIT;
}
static inline UBool
- isHangulWithoutJamoT(UChar c) {
+ isHangulLV(UChar32 c) {
c-=HANGUL_BASE;
- return c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
+ return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
}
static inline UBool isJamoL(UChar32 c) {
return (uint32_t)(c-JAMO_L_BASE)<JAMO_L_COUNT;
@@ -73,6 +78,14 @@ public:
static inline UBool isJamoV(UChar32 c) {
return (uint32_t)(c-JAMO_V_BASE)<JAMO_V_COUNT;
}
+ static inline UBool isJamoT(UChar32 c) {
+ int32_t t=c-JAMO_T_BASE;
+ return 0<t && t<JAMO_T_COUNT; // not JAMO_T_BASE itself
+ }
+ static UBool isJamo(UChar32 c) {
+ return JAMO_L_BASE<=c && c<=JAMO_T_END &&
+ (c<=JAMO_L_END || (JAMO_V_BASE<=c && c<=JAMO_V_END) || JAMO_T_BASE<c);
+ }
/**
* Decomposes c, which must be a Hangul syllable, into buffer
@@ -117,10 +130,13 @@ class Normalizer2Impl;
class U_COMMON_API ReorderingBuffer : public UMemory {
public:
+ /** Constructs only; init() should be called. */
ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest) :
impl(ni), str(dest),
start(NULL), reorderStart(NULL), limit(NULL),
remainingCapacity(0), lastCC(0) {}
+ /** Constructs, removes the string contents, and initializes for a small initial capacity. */
+ ReorderingBuffer(const Normalizer2Impl &ni, UnicodeString &dest, UErrorCode &errorCode);
~ReorderingBuffer() {
if(start!=NULL) {
str.releaseBuffer((int32_t)(limit-start));
@@ -135,11 +151,7 @@ public:
uint8_t getLastCC() const { return lastCC; }
UBool equals(const UChar *start, const UChar *limit) const;
-
- // For Hangul composition, replacing the Leading consonant Jamo with the syllable.
- void setLastChar(UChar c) {
- *(limit-1)=c;
- }
+ UBool equals(const uint8_t *otherStart, const uint8_t *otherLimit) const;
UBool append(UChar32 c, uint8_t cc, UErrorCode &errorCode) {
return (c<=0xffff) ?
@@ -218,6 +230,12 @@ private:
UChar *codePointStart, *codePointLimit;
};
+/**
+ * Low-level implementation of the Unicode Normalization Algorithm.
+ * For the data structure and details see the documentation at the end of
+ * this normalizer2impl.h and in the design doc at
+ * http://site.icu-project.org/design/normalization/custom
+ */
class U_COMMON_API Normalizer2Impl : public UObject {
public:
Normalizer2Impl() : normTrie(NULL), fCanonIterData(NULL) {
@@ -234,8 +252,6 @@ public:
// low-level properties ------------------------------------------------ ***
- const UTrie2 *getNormTrie() const { return normTrie; }
-
UBool ensureCanonIterData(UErrorCode &errorCode) const;
uint16_t getNorm16(UChar32 c) const { return UTRIE2_GET16(normTrie, c); }
@@ -255,15 +271,22 @@ public:
uint8_t getCC(uint16_t norm16) const {
if(norm16>=MIN_NORMAL_MAYBE_YES) {
- return (uint8_t)norm16;
+ return getCCFromNormalYesOrMaybe(norm16);
}
if(norm16<minNoNo || limitNoNo<=norm16) {
return 0;
}
return getCCFromNoNo(norm16);
}
+ static uint8_t getCCFromNormalYesOrMaybe(uint16_t norm16) {
+ return (uint8_t)(norm16 >> OFFSET_SHIFT);
+ }
static uint8_t getCCFromYesOrMaybe(uint16_t norm16) {
- return norm16>=MIN_NORMAL_MAYBE_YES ? (uint8_t)norm16 : 0;
+ return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0;
+ }
+ uint8_t getCCFromYesOrMaybeCP(UChar32 c) const {
+ if (c < minCompNoMaybeCP) { return 0; }
+ return getCCFromYesOrMaybe(getNorm16(c));
}
/**
@@ -272,10 +295,8 @@ public:
* @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0.
*/
uint16_t getFCD16(UChar32 c) const {
- if(c<0) {
+ if(c<minDecompNoCP) {
return 0;
- } else if(c<0x180) {
- return tccc180[c];
} else if(c<=0xffff) {
if(!singleLeadMightHaveNonZeroFCD16(c)) { return 0; }
}
@@ -291,9 +312,7 @@ public:
*/
uint16_t nextFCD16(const UChar *&s, const UChar *limit) const {
UChar32 c=*s++;
- if(c<0x180) {
- return tccc180[c];
- } else if(!singleLeadMightHaveNonZeroFCD16(c)) {
+ if(c<minDecompNoCP || !singleLeadMightHaveNonZeroFCD16(c)) {
return 0;
}
UChar c2;
@@ -311,8 +330,8 @@ public:
*/
uint16_t previousFCD16(const UChar *start, const UChar *&s) const {
UChar32 c=*--s;
- if(c<0x180) {
- return tccc180[c];
+ if(c<minDecompNoCP) {
+ return 0;
}
if(!U16_IS_TRAIL(c)) {
if(!singleLeadMightHaveNonZeroFCD16(c)) {
@@ -328,8 +347,6 @@ public:
return getFCD16FromNormData(c);
}
- /** Returns the FCD data for U+0000<=c<U+0180. */
- uint16_t getFCD16FromBelow180(UChar32 c) const { return tccc180[c]; }
/** Returns TRUE if the single-or-lead code unit c might have non-zero FCD data. */
UBool singleLeadMightHaveNonZeroFCD16(UChar32 lead) const {
// 0<=lead<=0xffff
@@ -340,9 +357,6 @@ public:
/** Returns the FCD value from the regular normalization data. */
uint16_t getFCD16FromNormData(UChar32 c) const;
- void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, uint16_t norm16,
- CanonIterData &newData, UErrorCode &errorCode) const;
-
/**
* Gets the decomposition for one code point.
* @param c code point
@@ -367,14 +381,25 @@ public:
UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const;
enum {
- MIN_CCC_LCCC_CP=0x300
- };
+ // Fixed norm16 values.
+ MIN_YES_YES_WITH_CC=0xfe02,
+ JAMO_VT=0xfe00,
+ MIN_NORMAL_MAYBE_YES=0xfc00,
+ JAMO_L=2, // offset=1 hasCompBoundaryAfter=FALSE
+ INERT=1, // offset=0 hasCompBoundaryAfter=TRUE
+
+ // norm16 bit 0 is comp-boundary-after.
+ HAS_COMP_BOUNDARY_AFTER=1,
+ OFFSET_SHIFT=1,
+
+ // For algorithmic one-way mappings, norm16 bits 2..1 indicate the
+ // tccc (0, 1, >1) for quick FCC boundary-after tests.
+ DELTA_TCCC_0=0,
+ DELTA_TCCC_1=2,
+ DELTA_TCCC_GT_1=4,
+ DELTA_TCCC_MASK=6,
+ DELTA_SHIFT=3,
- enum {
- MIN_YES_YES_WITH_CC=0xff01,
- JAMO_VT=0xff00,
- MIN_NORMAL_MAYBE_YES=0xfe00,
- JAMO_L=1,
MAX_DELTA=0x40
};
@@ -394,21 +419,32 @@ public:
IX_MIN_COMP_NO_MAYBE_CP,
// Norm16 value thresholds for quick check combinations and types of extra data.
- IX_MIN_YES_NO, // Mappings & compositions in [minYesNo..minYesNoMappingsOnly[.
+
+ /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[. */
+ IX_MIN_YES_NO,
+ /** Mappings are comp-normalized. */
IX_MIN_NO_NO,
IX_LIMIT_NO_NO,
IX_MIN_MAYBE_YES,
- IX_MIN_YES_NO_MAPPINGS_ONLY, // Mappings only in [minYesNoMappingsOnly..minNoNo[.
-
- IX_RESERVED15,
+ /** Mappings only in [minYesNoMappingsOnly..minNoNo[. */
+ IX_MIN_YES_NO_MAPPINGS_ONLY,
+ /** Mappings are not comp-normalized but have a comp boundary before. */
+ IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE,
+ /** Mappings do not have a comp boundary before. */
+ IX_MIN_NO_NO_COMP_NO_MAYBE_CC,
+ /** Mappings to the empty string. */
+ IX_MIN_NO_NO_EMPTY,
+
+ IX_MIN_LCCC_CP,
+ IX_RESERVED19,
IX_COUNT
};
enum {
MAPPING_HAS_CCC_LCCC_WORD=0x80,
MAPPING_HAS_RAW_MAPPING=0x40,
- MAPPING_NO_COMP_BOUNDARY_AFTER=0x20,
+ // unused bit 0x20,
MAPPING_LENGTH_MASK=0x1f
};
@@ -457,6 +493,12 @@ public:
UnicodeString &safeMiddle,
ReorderingBuffer &buffer,
UErrorCode &errorCode) const;
+
+ /** sink==nullptr: isNormalized() */
+ UBool composeUTF8(uint32_t options, UBool onlyContiguous,
+ const uint8_t *src, const uint8_t *limit,
+ ByteSink *sink, icu::Edits *edits, UErrorCode &errorCode) const;
+
const UChar *makeFCD(const UChar *src, const UChar *limit,
ReorderingBuffer *buffer, UErrorCode &errorCode) const;
void makeFCDAndAppend(const UChar *src, const UChar *limit,
@@ -465,27 +507,42 @@ public:
ReorderingBuffer &buffer,
UErrorCode &errorCode) const;
- UBool hasDecompBoundary(UChar32 c, UBool before) const;
+ UBool hasDecompBoundaryBefore(UChar32 c) const;
+ UBool norm16HasDecompBoundaryBefore(uint16_t norm16) const;
+ UBool hasDecompBoundaryAfter(UChar32 c) const;
+ UBool norm16HasDecompBoundaryAfter(uint16_t norm16) const;
UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); }
UBool hasCompBoundaryBefore(UChar32 c) const {
- return c<minCompNoMaybeCP || hasCompBoundaryBefore(c, getNorm16(c));
+ return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(getNorm16(c));
}
- UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous, UBool testInert) const;
-
- UBool hasFCDBoundaryBefore(UChar32 c) const { return c<MIN_CCC_LCCC_CP || getFCD16(c)<=0xff; }
- UBool hasFCDBoundaryAfter(UChar32 c) const {
- uint16_t fcd16=getFCD16(c);
- return fcd16<=1 || (fcd16&0xff)==0;
+ UBool hasCompBoundaryAfter(UChar32 c, UBool onlyContiguous) const {
+ return norm16HasCompBoundaryAfter(getNorm16(c), onlyContiguous);
+ }
+ UBool isCompInert(UChar32 c, UBool onlyContiguous) const {
+ uint16_t norm16=getNorm16(c);
+ return isCompYesAndZeroCC(norm16) &&
+ (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
+ (!onlyContiguous || isInert(norm16) || *getMapping(norm16) <= 0x1ff);
}
+
+ UBool hasFCDBoundaryBefore(UChar32 c) const { return hasDecompBoundaryBefore(c); }
+ UBool hasFCDBoundaryAfter(UChar32 c) const { return hasDecompBoundaryAfter(c); }
UBool isFCDInert(UChar32 c) const { return getFCD16(c)<=1; }
private:
+ friend class InitCanonIterData;
+ friend class LcccContext;
+
UBool isMaybe(uint16_t norm16) const { return minMaybeYes<=norm16 && norm16<=JAMO_VT; }
UBool isMaybeOrNonZeroCC(uint16_t norm16) const { return norm16>=minMaybeYes; }
- static UBool isInert(uint16_t norm16) { return norm16==0; }
- static UBool isJamoL(uint16_t norm16) { return norm16==1; }
+ static UBool isInert(uint16_t norm16) { return norm16==INERT; }
+ static UBool isJamoL(uint16_t norm16) { return norm16==JAMO_L; }
static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; }
- UBool isHangul(uint16_t norm16) const { return norm16==minYesNo; }
+ uint16_t hangulLVT() const { return minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; }
+ UBool isHangulLV(uint16_t norm16) const { return norm16==minYesNo; }
+ UBool isHangulLVT(uint16_t norm16) const {
+ return norm16==hangulLVT();
+ }
UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16<minNoNo; }
// UBool isCompYes(uint16_t norm16) const {
// return norm16>=MIN_YES_YES_WITH_CC || norm16<minNoNo;
@@ -504,7 +561,7 @@ private:
/**
* A little faster and simpler than isDecompYesAndZeroCC() but does not include
* the MaybeYes which combine-forward and have ccc=0.
- * (Standard Unicode 5.2 normalization does not have such characters.)
+ * (Standard Unicode 10 normalization does not have such characters.)
*/
UBool isMostDecompYesAndZeroCC(uint16_t norm16) const {
return norm16<minYesNo || norm16==MIN_NORMAL_MAYBE_YES || norm16==JAMO_VT;
@@ -514,7 +571,7 @@ private:
// For use with isCompYes().
// Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC.
// static uint8_t getCCFromYes(uint16_t norm16) {
- // return norm16>=MIN_YES_YES_WITH_CC ? (uint8_t)norm16 : 0;
+ // return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0;
// }
uint8_t getCCFromNoNo(uint16_t norm16) const {
const uint16_t *mapping=getMapping(norm16);
@@ -525,30 +582,47 @@ private:
}
}
// requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC()
- uint8_t getTrailCCFromCompYesAndZeroCC(const UChar *cpStart, const UChar *cpLimit) const;
+ uint8_t getTrailCCFromCompYesAndZeroCC(uint16_t norm16) const {
+ if(norm16<=minYesNo) {
+ return 0; // yesYes and Hangul LV have ccc=tccc=0
+ } else {
+ // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here.
+ return (uint8_t)(*getMapping(norm16)>>8); // tccc from yesNo
+ }
+ }
+ uint8_t getPreviousTrailCC(const UChar *start, const UChar *p) const;
+ uint8_t getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const;
// Requires algorithmic-NoNo.
UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const {
- return c+norm16-(minMaybeYes-MAX_DELTA-1);
+ return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta;
+ }
+ UChar32 getAlgorithmicDelta(uint16_t norm16) const {
+ return (norm16>>DELTA_SHIFT)-centerNoNoDelta;
}
// Requires minYesNo<norm16<limitNoNo.
- const uint16_t *getMapping(uint16_t norm16) const { return extraData+norm16; }
+ const uint16_t *getMapping(uint16_t norm16) const { return extraData+(norm16>>OFFSET_SHIFT); }
const uint16_t *getCompositionsListForDecompYes(uint16_t norm16) const {
- if(norm16==0 || MIN_NORMAL_MAYBE_YES<=norm16) {
+ if(norm16<JAMO_L || MIN_NORMAL_MAYBE_YES<=norm16) {
return NULL;
} else if(norm16<minMaybeYes) {
- return extraData+norm16; // for yesYes; if Jamo L: harmless empty list
+ return getMapping(norm16); // for yesYes; if Jamo L: harmless empty list
} else {
return maybeYesCompositions+norm16-minMaybeYes;
}
}
const uint16_t *getCompositionsListForComposite(uint16_t norm16) const {
- const uint16_t *list=extraData+norm16; // composite has both mapping & compositions list
+ // A composite has both mapping & compositions list.
+ const uint16_t *list=getMapping(norm16);
return list+ // mapping pointer
- 1+ // +1 to skip the first unit with the mapping lenth
+ 1+ // +1 to skip the first unit with the mapping length
(*list&MAPPING_LENGTH_MASK); // + mapping length
}
+ const uint16_t *getCompositionsListForMaybe(uint16_t norm16) const {
+ // minMaybeYes<=norm16<MIN_NORMAL_MAYBE_YES
+ return maybeYesCompositions+((norm16-minMaybeYes)>>OFFSET_SHIFT);
+ }
/**
* @param c code point must have compositions
* @return compositions list pointer
@@ -563,46 +637,78 @@ private:
UChar32 minNeedDataCP,
ReorderingBuffer *buffer,
UErrorCode &errorCode) const;
- UBool decomposeShort(const UChar *src, const UChar *limit,
- ReorderingBuffer &buffer, UErrorCode &errorCode) const;
+ const UChar *decomposeShort(const UChar *src, const UChar *limit,
+ UBool stopAtCompBoundary, UBool onlyContiguous,
+ ReorderingBuffer &buffer, UErrorCode &errorCode) const;
UBool decompose(UChar32 c, uint16_t norm16,
ReorderingBuffer &buffer, UErrorCode &errorCode) const;
+ const uint8_t *decomposeShort(const uint8_t *src, const uint8_t *limit,
+ UBool stopAtCompBoundary, UBool onlyContiguous,
+ ReorderingBuffer &buffer, UErrorCode &errorCode) const;
+
static int32_t combine(const uint16_t *list, UChar32 trail);
void addComposites(const uint16_t *list, UnicodeSet &set) const;
void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex,
UBool onlyContiguous) const;
- UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const;
- const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p) const;
- const UChar *findNextCompBoundary(const UChar *p, const UChar *limit) const;
+ UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const {
+ return c<minCompNoMaybeCP || norm16HasCompBoundaryBefore(norm16);
+ }
+ UBool norm16HasCompBoundaryBefore(uint16_t norm16) const {
+ return norm16 < minNoNoCompNoMaybeCC || isAlgorithmicNoNo(norm16);
+ }
+ UBool hasCompBoundaryBefore(const UChar *src, const UChar *limit) const;
+ UBool hasCompBoundaryBefore(const uint8_t *src, const uint8_t *limit) const;
+ UBool hasCompBoundaryAfter(const UChar *start, const UChar *p,
+ UBool onlyContiguous) const;
+ UBool hasCompBoundaryAfter(const uint8_t *start, const uint8_t *p,
+ UBool onlyContiguous) const;
+ UBool norm16HasCompBoundaryAfter(uint16_t norm16, UBool onlyContiguous) const {
+ return (norm16 & HAS_COMP_BOUNDARY_AFTER) != 0 &&
+ (!onlyContiguous || isTrailCC01ForCompBoundaryAfter(norm16));
+ }
+ /** For FCC: Given norm16 HAS_COMP_BOUNDARY_AFTER, does it have tccc<=1? */
+ UBool isTrailCC01ForCompBoundaryAfter(uint16_t norm16) const {
+ return isInert(norm16) || (isDecompNoAlgorithmic(norm16) ?
+ (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1 : *getMapping(norm16) <= 0x1ff);
+ }
+
+ const UChar *findPreviousCompBoundary(const UChar *start, const UChar *p, UBool onlyContiguous) const;
+ const UChar *findNextCompBoundary(const UChar *p, const UChar *limit, UBool onlyContiguous) const;
const UChar *findPreviousFCDBoundary(const UChar *start, const UChar *p) const;
const UChar *findNextFCDBoundary(const UChar *p, const UChar *limit) const;
+ void makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16,
+ CanonIterData &newData, UErrorCode &errorCode) const;
+
int32_t getCanonValue(UChar32 c) const;
const UnicodeSet &getCanonStartSet(int32_t n) const;
// UVersionInfo dataVersion;
- // Code point thresholds for quick check codes.
- UChar32 minDecompNoCP;
- UChar32 minCompNoMaybeCP;
+ // BMP code point thresholds for quick check loops looking at single UTF-16 code units.
+ UChar minDecompNoCP;
+ UChar minCompNoMaybeCP;
+ UChar minLcccCP;
// Norm16 value thresholds for quick check combinations and types of extra data.
uint16_t minYesNo;
uint16_t minYesNoMappingsOnly;
uint16_t minNoNo;
+ uint16_t minNoNoCompBoundaryBefore;
+ uint16_t minNoNoCompNoMaybeCC;
+ uint16_t minNoNoEmpty;
uint16_t limitNoNo;
+ uint16_t centerNoNoDelta;
uint16_t minMaybeYes;
const UTrie2 *normTrie;
const uint16_t *maybeYesCompositions;
const uint16_t *extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
const uint8_t *smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0
- uint8_t tccc180[0x180]; // tccc values for U+0000..U+017F
-public: // CanonIterData is public to allow access from C callback functions.
UInitOnce fCanonIterDataInitOnce;
CanonIterData *fCanonIterData;
};
@@ -658,13 +764,14 @@ unorm_getFCD16(UChar32 c);
/**
* Format of Normalizer2 .nrm data files.
- * Format version 2.0.
+ * Format version 3.0.
*
* Normalizer2 .nrm data files provide data for the Unicode Normalization algorithms.
* ICU ships with data files for standard Unicode Normalization Forms
* NFC and NFD (nfc.nrm), NFKC and NFKD (nfkc.nrm) and NFKC_Casefold (nfkc_cf.nrm).
* Custom (application-specific) data can be built into additional .nrm files
* with the gennorm2 build tool.
+ * ICU ships with one such file, uts46.nrm, for the implementation of UTS #46.
*
* Normalizer2.getInstance() causes a .nrm file to be loaded, unless it has been
* cached already. Internally, Normalizer2Impl.load() reads the .nrm file.
@@ -695,14 +802,20 @@ unorm_getFCD16(UChar32 c);
* with a decomposition mapping, that is, with NF*D_QC=No.
* minCompNoMaybeCP=indexes[IX_MIN_COMP_NO_MAYBE_CP] is the lowest code point
* with NF*C_QC=No (has a one-way mapping) or Maybe (combines backward).
+ * minLcccCP=indexes[IX_MIN_LCCC_CP] (index 18, new in formatVersion 3)
+ * is the lowest code point with lccc!=0.
*
- * The next five indexes are thresholds of 16-bit trie values for ranges of
+ * The next eight indexes are thresholds of 16-bit trie values for ranges of
* values indicating multiple normalization properties.
+ * They are listed here in threshold order, not in the order they are stored in the indexes.
* minYesNo=indexes[IX_MIN_YES_NO];
+ * minYesNoMappingsOnly=indexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
* minNoNo=indexes[IX_MIN_NO_NO];
+ * minNoNoCompBoundaryBefore=indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE];
+ * minNoNoCompNoMaybeCC=indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC];
+ * minNoNoEmpty=indexes[IX_MIN_NO_NO_EMPTY];
* limitNoNo=indexes[IX_LIMIT_NO_NO];
* minMaybeYes=indexes[IX_MIN_MAYBE_YES];
- * minYesNoMappingsOnly=indexes[IX_MIN_YES_NO_MAPPINGS_ONLY];
* See the normTrie description below and the design doc for details.
*
* UTrie2 normTrie; -- see utrie2_impl.h and utrie2.h
@@ -710,12 +823,14 @@ unorm_getFCD16(UChar32 c);
* The trie holds the main normalization data. Each code point is mapped to a 16-bit value.
* Rather than using independent bits in the value (which would require more than 16 bits),
* information is extracted primarily via range checks.
+ * Except, format version 3 uses bit 0 for hasCompBoundaryAfter().
* For example, a 16-bit value norm16 in the range minYesNo<=norm16<minNoNo
* means that the character has NF*C_QC=Yes and NF*D_QC=No properties,
* which means it has a two-way (round-trip) decomposition mapping.
* Values in the range 2<=norm16<limitNoNo are also directly indexes into the extraData
* pointing to mappings, compositions lists, or both.
- * Value norm16==0 means that the character is normalization-inert, that is,
+ * Value norm16==INERT (0 in versions 1 & 2, 1 in version 3)
+ * means that the character is normalization-inert, that is,
* it does not have a mapping, does not participate in composition, has a zero
* canonical combining class, and forms a boundary where text before it and after it
* can be normalized independently.
@@ -729,7 +844,7 @@ unorm_getFCD16(UChar32 c);
* The trie has a value for each lead surrogate code unit representing the "worst case"
* properties of the 1024 supplementary characters whose UTF-16 form starts with
* the lead surrogate. If all of the 1024 supplementary characters are normalization-inert,
- * then their lead surrogate code unit has the trie value 0.
+ * then their lead surrogate code unit has the trie value INERT.
* When the lead surrogate unit's value exceeds the quick check minimum during processing,
* the properties for the full supplementary code point need to be looked up.
*
@@ -738,6 +853,7 @@ unorm_getFCD16(UChar32 c);
*
* There is only one byte offset for the end of these two arrays.
* The split between them is given by the constant and variable mentioned above.
+ * In version 3, the difference must be shifted right by OFFSET_SHIFT.
*
* The maybeYesCompositions array contains compositions lists for characters that
* combine both forward (as starters in composition pairs)
@@ -754,6 +870,8 @@ unorm_getFCD16(UChar32 c);
* followed by only mappings for "NoNo" characters.
* (Referring to pairs of NFC/NFD quick check values.)
* The norm16 values of those characters are directly indexes into the extraData array.
+ * In version 3, the norm16 values must be shifted right by OFFSET_SHIFT
+ * for accessing extraData.
*
* The data structures for compositions lists and mappings are described in the design doc.
*
@@ -784,6 +902,50 @@ unorm_getFCD16(UChar32 c);
* This is fully equivalent with formatVersion 1's MAPPING_PLUS_COMPOSITION_LIST flag.
* It is needed for the new (in ICU 49) composePair(), not for other normalization.
* - Addition of the smallFCD[] bit set.
+ *
+ * Changes from format version 2 to format version 3 (ICU 60) ------------------
+ *
+ * - norm16 bit 0 indicates hasCompBoundaryAfter(),
+ * except that for contiguous composition (FCC) the tccc must be checked as well.
+ * Data indexes and ccc values are shifted left by one (OFFSET_SHIFT).
+ * Thresholds like minNoNo are tested before shifting.
+ *
+ * - Algorithmic mapping deltas are shifted left by two more bits (total DELTA_SHIFT),
+ * to make room for two bits (three values) indicating whether the tccc is 0, 1, or greater.
+ * See DELTA_TCCC_MASK etc.
+ * This helps with fetching tccc/FCD values and FCC hasCompBoundaryAfter().
+ * minMaybeYes is 8-aligned so that the DELTA_TCCC_MASK bits can be tested directly.
+ *
+ * - Algorithmic mappings are only used for mapping to "comp yes and ccc=0" characters,
+ * and ASCII characters are mapped algorithmically only to other ASCII characters.
+ * This helps with hasCompBoundaryBefore() and compose() fast paths.
+ * It is never necessary any more to loop for algorithmic mappings.
+ *
+ * - Addition of indexes[IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE],
+ * indexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC], and indexes[IX_MIN_NO_NO_EMPTY],
+ * and separation of the noNo extraData into distinct ranges.
+ * With this, the noNo norm16 value indicates whether the mapping is
+ * compose-normalized, not normalized but hasCompBoundaryBefore(),
+ * not even that, or maps to an empty string.
+ * hasCompBoundaryBefore() can be determined solely from the norm16 value.
+ *
+ * - The norm16 value for Hangul LVT is now different from that for Hangul LV,
+ * so that hasCompBoundaryAfter() need not check for the syllable type.
+ * For Hangul LV, minYesNo continues to be used (no comp-boundary-after).
+ * For Hangul LVT, minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER is used.
+ * The extraData units at these indexes are set to firstUnit=2 and firstUnit=3, respectively,
+ * to simplify some code.
+ *
+ * - The extraData firstUnit bit 5 is no longer necessary
+ * (norm16 bit 0 used instead of firstUnit MAPPING_NO_COMP_BOUNDARY_AFTER),
+ * is reserved again, and always set to 0.
+ *
+ * - Addition of indexes[IX_MIN_LCCC_CP], the first code point where lccc!=0.
+ * This used to be hardcoded to U+0300, but in data like NFKC_Casefold it is lower:
+ * U+00AD Soft Hyphen maps to an empty string,
+ * which is artificially assigned "worst case" values lccc=1 and tccc=255.
+ *
+ * - A mapping to an empty string has explicit lccc=1 and tccc=255 values.
*/
#endif /* !UCONFIG_NO_NORMALIZATION */