diff options
Diffstat (limited to 'deps/icu-small/source/common/filterednormalizer2.cpp')
-rw-r--r-- | deps/icu-small/source/common/filterednormalizer2.cpp | 288 |
1 files changed, 288 insertions, 0 deletions
diff --git a/deps/icu-small/source/common/filterednormalizer2.cpp b/deps/icu-small/source/common/filterednormalizer2.cpp new file mode 100644 index 0000000000..44ed9c13a0 --- /dev/null +++ b/deps/icu-small/source/common/filterednormalizer2.cpp @@ -0,0 +1,288 @@ +/* +******************************************************************************* +* +* Copyright (C) 2009-2012, International Business Machines +* Corporation and others. All Rights Reserved. +* +******************************************************************************* +* file name: filterednormalizer2.cpp +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2009dec10 +* created by: Markus W. Scherer +*/ + +#include "unicode/utypes.h" + +#if !UCONFIG_NO_NORMALIZATION + +#include "unicode/normalizer2.h" +#include "unicode/uniset.h" +#include "unicode/unistr.h" +#include "unicode/unorm.h" +#include "cpputils.h" + +U_NAMESPACE_BEGIN + +FilteredNormalizer2::~FilteredNormalizer2() {} + +UnicodeString & +FilteredNormalizer2::normalize(const UnicodeString &src, + UnicodeString &dest, + UErrorCode &errorCode) const { + uprv_checkCanGetBuffer(src, errorCode); + if(U_FAILURE(errorCode)) { + dest.setToBogus(); + return dest; + } + if(&dest==&src) { + errorCode=U_ILLEGAL_ARGUMENT_ERROR; + return dest; + } + dest.remove(); + return normalize(src, dest, USET_SPAN_SIMPLE, errorCode); +} + +// Internal: No argument checking, and appends to dest. +// Pass as input spanCondition the one that is likely to yield a non-zero +// span length at the start of src. +// For set=[:age=3.2:], since almost all common characters were in Unicode 3.2, +// USET_SPAN_SIMPLE should be passed in for the start of src +// and USET_SPAN_NOT_CONTAINED should be passed in if we continue after +// an in-filter prefix. +UnicodeString & +FilteredNormalizer2::normalize(const UnicodeString &src, + UnicodeString &dest, + USetSpanCondition spanCondition, + UErrorCode &errorCode) const { + UnicodeString tempDest; // Don't throw away destination buffer between iterations. + for(int32_t prevSpanLimit=0; prevSpanLimit<src.length();) { + int32_t spanLimit=set.span(src, prevSpanLimit, spanCondition); + int32_t spanLength=spanLimit-prevSpanLimit; + if(spanCondition==USET_SPAN_NOT_CONTAINED) { + if(spanLength!=0) { + dest.append(src, prevSpanLimit, spanLength); + } + spanCondition=USET_SPAN_SIMPLE; + } else { + if(spanLength!=0) { + // Not norm2.normalizeSecondAndAppend() because we do not want + // to modify the non-filter part of dest. + dest.append(norm2.normalize(src.tempSubStringBetween(prevSpanLimit, spanLimit), + tempDest, errorCode)); + if(U_FAILURE(errorCode)) { + break; + } + } + spanCondition=USET_SPAN_NOT_CONTAINED; + } + prevSpanLimit=spanLimit; + } + return dest; +} + +UnicodeString & +FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first, + const UnicodeString &second, + UErrorCode &errorCode) const { + return normalizeSecondAndAppend(first, second, TRUE, errorCode); +} + +UnicodeString & +FilteredNormalizer2::append(UnicodeString &first, + const UnicodeString &second, + UErrorCode &errorCode) const { + return normalizeSecondAndAppend(first, second, FALSE, errorCode); +} + +UnicodeString & +FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first, + const UnicodeString &second, + UBool doNormalize, + UErrorCode &errorCode) const { + uprv_checkCanGetBuffer(first, errorCode); + uprv_checkCanGetBuffer(second, errorCode); + if(U_FAILURE(errorCode)) { + return first; + } + if(&first==&second) { + errorCode=U_ILLEGAL_ARGUMENT_ERROR; + return first; + } + if(first.isEmpty()) { + if(doNormalize) { + return normalize(second, first, errorCode); + } else { + return first=second; + } + } + // merge the in-filter suffix of the first string with the in-filter prefix of the second + int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE); + if(prefixLimit!=0) { + UnicodeString prefix(second.tempSubString(0, prefixLimit)); + int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE); + if(suffixStart==0) { + if(doNormalize) { + norm2.normalizeSecondAndAppend(first, prefix, errorCode); + } else { + norm2.append(first, prefix, errorCode); + } + } else { + UnicodeString middle(first, suffixStart, INT32_MAX); + if(doNormalize) { + norm2.normalizeSecondAndAppend(middle, prefix, errorCode); + } else { + norm2.append(middle, prefix, errorCode); + } + first.replace(suffixStart, INT32_MAX, middle); + } + } + if(prefixLimit<second.length()) { + UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX)); + if(doNormalize) { + normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode); + } else { + first.append(rest); + } + } + return first; +} + +UBool +FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const { + return set.contains(c) && norm2.getDecomposition(c, decomposition); +} + +UBool +FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const { + return set.contains(c) && norm2.getRawDecomposition(c, decomposition); +} + +UChar32 +FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const { + return (set.contains(a) && set.contains(b)) ? norm2.composePair(a, b) : U_SENTINEL; +} + +uint8_t +FilteredNormalizer2::getCombiningClass(UChar32 c) const { + return set.contains(c) ? norm2.getCombiningClass(c) : 0; +} + +UBool +FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const { + uprv_checkCanGetBuffer(s, errorCode); + if(U_FAILURE(errorCode)) { + return FALSE; + } + USetSpanCondition spanCondition=USET_SPAN_SIMPLE; + for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { + int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); + if(spanCondition==USET_SPAN_NOT_CONTAINED) { + spanCondition=USET_SPAN_SIMPLE; + } else { + if( !norm2.isNormalized(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode) || + U_FAILURE(errorCode) + ) { + return FALSE; + } + spanCondition=USET_SPAN_NOT_CONTAINED; + } + prevSpanLimit=spanLimit; + } + return TRUE; +} + +UNormalizationCheckResult +FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const { + uprv_checkCanGetBuffer(s, errorCode); + if(U_FAILURE(errorCode)) { + return UNORM_MAYBE; + } + UNormalizationCheckResult result=UNORM_YES; + USetSpanCondition spanCondition=USET_SPAN_SIMPLE; + for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { + int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); + if(spanCondition==USET_SPAN_NOT_CONTAINED) { + spanCondition=USET_SPAN_SIMPLE; + } else { + UNormalizationCheckResult qcResult= + norm2.quickCheck(s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode); + if(U_FAILURE(errorCode) || qcResult==UNORM_NO) { + return qcResult; + } else if(qcResult==UNORM_MAYBE) { + result=qcResult; + } + spanCondition=USET_SPAN_NOT_CONTAINED; + } + prevSpanLimit=spanLimit; + } + return result; +} + +int32_t +FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const { + uprv_checkCanGetBuffer(s, errorCode); + if(U_FAILURE(errorCode)) { + return 0; + } + USetSpanCondition spanCondition=USET_SPAN_SIMPLE; + for(int32_t prevSpanLimit=0; prevSpanLimit<s.length();) { + int32_t spanLimit=set.span(s, prevSpanLimit, spanCondition); + if(spanCondition==USET_SPAN_NOT_CONTAINED) { + spanCondition=USET_SPAN_SIMPLE; + } else { + int32_t yesLimit= + prevSpanLimit+ + norm2.spanQuickCheckYes( + s.tempSubStringBetween(prevSpanLimit, spanLimit), errorCode); + if(U_FAILURE(errorCode) || yesLimit<spanLimit) { + return yesLimit; + } + spanCondition=USET_SPAN_NOT_CONTAINED; + } + prevSpanLimit=spanLimit; + } + return s.length(); +} + +UBool +FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const { + return !set.contains(c) || norm2.hasBoundaryBefore(c); +} + +UBool +FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const { + return !set.contains(c) || norm2.hasBoundaryAfter(c); +} + +UBool +FilteredNormalizer2::isInert(UChar32 c) const { + return !set.contains(c) || norm2.isInert(c); +} + +U_NAMESPACE_END + +// C API ------------------------------------------------------------------- *** + +U_NAMESPACE_USE + +U_CAPI UNormalizer2 * U_EXPORT2 +unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) { + if(U_FAILURE(*pErrorCode)) { + return NULL; + } + if(filterSet==NULL) { + *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; + return NULL; + } + Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2, + *UnicodeSet::fromUSet(filterSet)); + if(fn2==NULL) { + *pErrorCode=U_MEMORY_ALLOCATION_ERROR; + } + return (UNormalizer2 *)fn2; +} + +#endif // !UCONFIG_NO_NORMALIZATION |