// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 2009-2014, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: normalizer2impl.h * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2009nov22 * created by: Markus W. Scherer */ #ifndef __NORMALIZER2IMPL_H__ #define __NORMALIZER2IMPL_H__ #include "unicode/utypes.h" #if !UCONFIG_NO_NORMALIZATION #include "unicode/normalizer2.h" #include "unicode/unistr.h" #include "unicode/unorm.h" #include "unicode/utf16.h" #include "mutex.h" #include "uset_imp.h" #include "utrie2.h" U_NAMESPACE_BEGIN struct CanonIterData; class ByteSink; class Edits; class InitCanonIterData; class LcccContext; class U_COMMON_API Hangul { public: /* Korean Hangul and Jamo constants */ enum { JAMO_L_BASE=0x1100, /* "lead" jamo */ JAMO_L_END=0x1112, JAMO_V_BASE=0x1161, /* "vowel" jamo */ JAMO_V_END=0x1175, JAMO_T_BASE=0x11a7, /* "trail" jamo */ JAMO_T_END=0x11c2, HANGUL_BASE=0xac00, HANGUL_END=0xd7a3, JAMO_L_COUNT=19, JAMO_V_COUNT=21, JAMO_T_COUNT=28, JAMO_VT_COUNT=JAMO_V_COUNT*JAMO_T_COUNT, HANGUL_COUNT=JAMO_L_COUNT*JAMO_V_COUNT*JAMO_T_COUNT, HANGUL_LIMIT=HANGUL_BASE+HANGUL_COUNT }; static inline UBool isHangul(UChar32 c) { return HANGUL_BASE<=c && c=MIN_NORMAL_MAYBE_YES) { return getCCFromNormalYesOrMaybe(norm16); } if(norm16> OFFSET_SHIFT); } static uint8_t getCCFromYesOrMaybe(uint16_t norm16) { return norm16>=MIN_NORMAL_MAYBE_YES ? getCCFromNormalYesOrMaybe(norm16) : 0; } uint8_t getCCFromYesOrMaybeCP(UChar32 c) const { if (c < minCompNoMaybeCP) { return 0; } return getCCFromYesOrMaybe(getNorm16(c)); } /** * Returns the FCD data for code point c. * @param c A Unicode code point. * @return The lccc(c) in bits 15..8 and tccc(c) in bits 7..0. 
*/ uint16_t getFCD16(UChar32 c) const { if(c>8]; if(bits==0) { return false; } return (UBool)((bits>>((lead>>5)&7))&1); } /* NOTE(review): the getFCD16() body above is not intact in this copy. It has an unmatched `]`, uses `bits` and `lead` without any visible declaration, and ends with a UBool-style return, which looks like the tail of a different function. Text appears to be missing after `if(c`. TODO: restore from the upstream header before building. */ /** Returns the FCD value from the regular normalization data. */ uint16_t getFCD16FromNormData(UChar32 c) const; /** * Gets the decomposition for one code point. * @param c code point * @param buffer out-only buffer for algorithmic decompositions (declared as UChar[4]) * @param length out-only, takes the length of the decomposition, if any * @return pointer to the decomposition, or NULL if none */ const UChar *getDecomposition(UChar32 c, UChar buffer[4], int32_t &length) const; /** * Gets the raw decomposition for one code point. * @param c code point * @param buffer out-only buffer for algorithmic decompositions (declared as UChar[30]) * @param length out-only, takes the length of the decomposition, if any * @return pointer to the decomposition, or NULL if none */ const UChar *getRawDecomposition(UChar32 c, UChar buffer[30], int32_t &length) const; UChar32 composePair(UChar32 a, UChar32 b) const; UBool isCanonSegmentStarter(UChar32 c) const; UBool getCanonStartSet(UChar32 c, UnicodeSet &set) const; enum { // Fixed norm16 values. MIN_YES_YES_WITH_CC=0xfe02, JAMO_VT=0xfe00, MIN_NORMAL_MAYBE_YES=0xfc00, JAMO_L=2, // offset=1 hasCompBoundaryAfter=FALSE INERT=1, // offset=0 hasCompBoundaryAfter=TRUE // norm16 bit 0 is comp-boundary-after. HAS_COMP_BOUNDARY_AFTER=1, OFFSET_SHIFT=1, // For algorithmic one-way mappings, norm16 bits 2..1 indicate the // tccc (0, 1, >1) for quick FCC boundary-after tests. DELTA_TCCC_0=0, DELTA_TCCC_1=2, DELTA_TCCC_GT_1=4, DELTA_TCCC_MASK=6, DELTA_SHIFT=3, MAX_DELTA=0x40 }; enum { // Byte offsets from the start of the data, after the generic header. IX_NORM_TRIE_OFFSET, IX_EXTRA_DATA_OFFSET, IX_SMALL_FCD_OFFSET, IX_RESERVED3_OFFSET, IX_RESERVED4_OFFSET, IX_RESERVED5_OFFSET, IX_RESERVED6_OFFSET, IX_TOTAL_SIZE, // Code point thresholds for quick check codes. 
IX_MIN_DECOMP_NO_CP, IX_MIN_COMP_NO_MAYBE_CP, // Norm16 value thresholds for quick check combinations and types of extra data. /** Mappings & compositions in [minYesNo..minYesNoMappingsOnly[ (start inclusive, limit exclusive). */ IX_MIN_YES_NO, /** Mappings are comp-normalized. */ IX_MIN_NO_NO, IX_LIMIT_NO_NO, IX_MIN_MAYBE_YES, /** Mappings only in [minYesNoMappingsOnly..minNoNo[ (start inclusive, limit exclusive). */ IX_MIN_YES_NO_MAPPINGS_ONLY, /** Mappings are not comp-normalized but have a comp boundary before. */ IX_MIN_NO_NO_COMP_BOUNDARY_BEFORE, /** Mappings do not have a comp boundary before. */ IX_MIN_NO_NO_COMP_NO_MAYBE_CC, /** Mappings to the empty string. */ IX_MIN_NO_NO_EMPTY, IX_MIN_LCCC_CP, IX_RESERVED19, IX_COUNT }; /* Bits of a mapping's first unit: getCCFromNoNo() tests MAPPING_HAS_CCC_LCCC_WORD against *mapping. NOTE(review): presumably the other constants here apply to that same unit; confirm against the data format description. */ enum { MAPPING_HAS_CCC_LCCC_WORD=0x80, MAPPING_HAS_RAW_MAPPING=0x40, // unused bit 0x20, MAPPING_LENGTH_MASK=0x1f }; enum { COMP_1_LAST_TUPLE=0x8000, COMP_1_TRIPLE=1, COMP_1_TRAIL_LIMIT=0x3400, COMP_1_TRAIL_MASK=0x7ffe, COMP_1_TRAIL_SHIFT=9, // 10-1 for the "triple" bit COMP_2_TRAIL_SHIFT=6, COMP_2_TRAIL_MASK=0xffc0 }; // higher-level functionality ------------------------------------------ *** // NFD without an NFD Normalizer2 instance. UnicodeString &decompose(const UnicodeString &src, UnicodeString &dest, UErrorCode &errorCode) const; /** * Decomposes [src, limit[ and writes the result to dest. * limit can be NULL if src is NUL-terminated. * destLengthEstimate is the initial dest buffer capacity and can be -1. 
*/ void decompose(const UChar *src, const UChar *limit, UnicodeString &dest, int32_t destLengthEstimate, UErrorCode &errorCode) const; const UChar *decompose(const UChar *src, const UChar *limit, ReorderingBuffer *buffer, UErrorCode &errorCode) const; void decomposeAndAppend(const UChar *src, const UChar *limit, UBool doDecompose, UnicodeString &safeMiddle, ReorderingBuffer &buffer, UErrorCode &errorCode) const; UBool compose(const UChar *src, const UChar *limit, UBool onlyContiguous, UBool doCompose, ReorderingBuffer &buffer, UErrorCode &errorCode) const; const UChar *composeQuickCheck(const UChar *src, const UChar *limit, UBool onlyContiguous, UNormalizationCheckResult *pQCResult) const; void composeAndAppend(const UChar *src, const UChar *limit, UBool doCompose, UBool onlyContiguous, UnicodeString &safeMiddle, ReorderingBuffer &buffer, UErrorCode &errorCode) const; /** * sink==nullptr: isNormalized() * NOTE(review): presumably a null sink means no output is written and only the normalized/not-normalized answer is returned; confirm against the implementation. */ UBool composeUTF8(uint32_t options, UBool onlyContiguous, const uint8_t *src, const uint8_t *limit, ByteSink *sink, icu::Edits *edits, UErrorCode &errorCode) const; const UChar *makeFCD(const UChar *src, const UChar *limit, ReorderingBuffer *buffer, UErrorCode &errorCode) const; void makeFCDAndAppend(const UChar *src, const UChar *limit, UBool doMakeFCD, UnicodeString &safeMiddle, ReorderingBuffer &buffer, UErrorCode &errorCode) const; UBool hasDecompBoundaryBefore(UChar32 c) const; UBool norm16HasDecompBoundaryBefore(uint16_t norm16) const; UBool hasDecompBoundaryAfter(UChar32 c) const; UBool norm16HasDecompBoundaryAfter(uint16_t norm16) const; /** Returns whether the norm16 value looked up for c via getNorm16() passes isDecompYesAndZeroCC(). */ UBool isDecompInert(UChar32 c) const { return isDecompYesAndZeroCC(getNorm16(c)); } /* NOTE(review): the hasCompBoundaryBefore() body below is not intact in this copy. As written it assigns minMaybeYes to the parameter c instead of testing anything, so text appears to be missing after `return c`. TODO: restore from the upstream header before building. */ UBool hasCompBoundaryBefore(UChar32 c) const { return c=minMaybeYes; } /** Returns whether norm16 equals the fixed value INERT. */ static UBool isInert(uint16_t norm16) { return norm16==INERT; } /** Returns whether norm16 equals the fixed value JAMO_L. */ static UBool isJamoL(uint16_t norm16) { return norm16==JAMO_L; } /** Returns whether norm16 equals the fixed value JAMO_VT. */ static UBool isJamoVT(uint16_t norm16) { return norm16==JAMO_VT; } /** Returns minYesNoMappingsOnly with the HAS_COMP_BOUNDARY_AFTER bit set. */ uint16_t hangulLVT() const { return 
minYesNoMappingsOnly|HAS_COMP_BOUNDARY_AFTER; } /** Returns whether norm16 equals minYesNo. */ UBool isHangulLV(uint16_t norm16) const { return norm16==minYesNo; } /** Returns whether norm16 equals hangulLVT(). */ UBool isHangulLVT(uint16_t norm16) const { return norm16==hangulLVT(); } /* NOTE(review): the isCompYesAndZeroCC() body below is not intact in this copy. It uses `=` where comparisons are expected, so text appears to be missing after `return norm16`. TODO: restore from the upstream header before building. */ UBool isCompYesAndZeroCC(uint16_t norm16) const { return norm16=MIN_YES_YES_WITH_CC || norm16=limitNoNo; } // For use with isCompYes(). // Perhaps the compiler can combine the two tests for MIN_YES_YES_WITH_CC. // static uint8_t getCCFromYes(uint16_t norm16) { // return norm16>=MIN_YES_YES_WITH_CC ? getCCFromNormalYesOrMaybe(norm16) : 0; // } /** Returns the low byte of the unit stored just before the mapping for norm16 when the mapping's first unit has MAPPING_HAS_CCC_LCCC_WORD set; otherwise returns 0. */ uint8_t getCCFromNoNo(uint16_t norm16) const { const uint16_t *mapping=getMapping(norm16); if(*mapping&MAPPING_HAS_CCC_LCCC_WORD) { return (uint8_t)*(mapping-1); } else { return 0; } } // requires that the [cpStart..cpLimit[ character passes isCompYesAndZeroCC() uint8_t getTrailCCFromCompYesAndZeroCC(uint16_t norm16) const { if(norm16<=minYesNo) { return 0; // yesYes and Hangul LV have ccc=tccc=0 } else { // For Hangul LVT we harmlessly fetch a firstUnit with tccc=0 here. return (uint8_t)(*getMapping(norm16)>>8); // tccc from yesNo } } uint8_t getPreviousTrailCC(const UChar *start, const UChar *p) const; uint8_t getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const; // Requires algorithmic-NoNo. /** Returns c shifted by the delta (norm16>>DELTA_SHIFT)-centerNoNoDelta. */ UChar32 mapAlgorithmic(UChar32 c, uint16_t norm16) const { return c+(norm16>>DELTA_SHIFT)-centerNoNoDelta; } /** Returns the delta (norm16>>DELTA_SHIFT)-centerNoNoDelta without applying it. */ UChar32 getAlgorithmicDelta(uint16_t norm16) const { return (norm16>>DELTA_SHIFT)-centerNoNoDelta; } // Requires minYesNo>OFFSET_SHIFT); } /* NOTE(review): the text around this point is not intact in this copy. The "Requires minYesNo" comment runs straight into the tail of a function body, and the getCompositionsListForDecompYes() body below is cut after `if(norm16`. TODO: restore from the upstream header before building. */ const uint16_t *getCompositionsListForDecompYes(uint16_t norm16) const { if(norm16>OFFSET_SHIFT); } /** * @param norm16 norm16 value of a code point that must have compositions * @return compositions list pointer, taken from getCompositionsListForDecompYes() when isDecompYes(norm16), otherwise from getCompositionsListForComposite() */ const uint16_t *getCompositionsList(uint16_t norm16) const { return isDecompYes(norm16) ? 
getCompositionsListForDecompYes(norm16) : getCompositionsListForComposite(norm16); } const UChar *copyLowPrefixFromNulTerminated(const UChar *src, UChar32 minNeedDataCP, ReorderingBuffer *buffer, UErrorCode &errorCode) const; const UChar *decomposeShort(const UChar *src, const UChar *limit, UBool stopAtCompBoundary, UBool onlyContiguous, ReorderingBuffer &buffer, UErrorCode &errorCode) const; UBool decompose(UChar32 c, uint16_t norm16, ReorderingBuffer &buffer, UErrorCode &errorCode) const; const uint8_t *decomposeShort(const uint8_t *src, const uint8_t *limit, UBool stopAtCompBoundary, UBool onlyContiguous, ReorderingBuffer &buffer, UErrorCode &errorCode) const; static int32_t combine(const uint16_t *list, UChar32 trail); void addComposites(const uint16_t *list, UnicodeSet &set) const; void recompose(ReorderingBuffer &buffer, int32_t recomposeStartIndex, UBool onlyContiguous) const; UBool hasCompBoundaryBefore(UChar32 c, uint16_t norm16) const { return c