summary | refs | log | tree | commit | diff
path: root/deps/icu-small/source/common/filterednormalizer2.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'deps/icu-small/source/common/filterednormalizer2.cpp')
-rw-r--r-- deps/icu-small/source/common/filterednormalizer2.cpp 288
1 files changed, 288 insertions, 0 deletions
diff --git a/deps/icu-small/source/common/filterednormalizer2.cpp b/deps/icu-small/source/common/filterednormalizer2.cpp
new file mode 100644
index 0000000000..44ed9c13a0
--- /dev/null
+++ b/deps/icu-small/source/common/filterednormalizer2.cpp
@@ -0,0 +1,288 @@
+/*
+*******************************************************************************
+*
+* Copyright (C) 2009-2012, International Business Machines
+* Corporation and others. All Rights Reserved.
+*
+*******************************************************************************
+* file name: filterednormalizer2.cpp
+* encoding: US-ASCII
+* tab size: 8 (not used)
+* indentation:4
+*
+* created on: 2009dec10
+* created by: Markus W. Scherer
+*/
+
+#include "unicode/utypes.h"
+
+#if !UCONFIG_NO_NORMALIZATION
+
+#include "unicode/normalizer2.h"
+#include "unicode/uniset.h"
+#include "unicode/unistr.h"
+#include "unicode/unorm.h"
+#include "cpputils.h"
+
+U_NAMESPACE_BEGIN
+
+// Out-of-line destructor definition; the body is intentionally empty.
+FilteredNormalizer2::~FilteredNormalizer2() {
+}
+
+// Writes the filtered normalization of src into dest and returns dest.
+// - If errorCode indicates a failure after the buffer check on src,
+//   dest is set to bogus.
+// - src and dest must be distinct objects; otherwise errorCode is set to
+//   U_ILLEGAL_ARGUMENT_ERROR and dest is returned untouched.
+// - Otherwise dest is emptied and filled by the internal worker, which is
+//   told to expect an in-filter span first (USET_SPAN_SIMPLE).
+UnicodeString &
+FilteredNormalizer2::normalize(const UnicodeString &src,
+                               UnicodeString &dest,
+                               UErrorCode &errorCode) const {
+    uprv_checkCanGetBuffer(src, errorCode);
+    if(U_FAILURE(errorCode)) {
+        dest.setToBogus();
+    } else if(&src==&dest) {
+        errorCode=U_ILLEGAL_ARGUMENT_ERROR;
+    } else {
+        dest.remove();
+        normalize(src, dest, USET_SPAN_SIMPLE, errorCode);
+    }
+    return dest;
+}
+
+// Internal worker: performs no argument checking and appends its output to dest.
+// src is processed as alternating spans; spanCondition selects the kind of span
+// expected first and should be the one likely to give a non-empty span at the
+// start of src.
+// Example: with set=[:age=3.2:] almost all common characters are in the set, so
+// pass USET_SPAN_SIMPLE for the start of a string, and USET_SPAN_NOT_CONTAINED
+// when continuing after an in-filter prefix.
+UnicodeString &
+FilteredNormalizer2::normalize(const UnicodeString &src,
+                               UnicodeString &dest,
+                               USetSpanCondition spanCondition,
+                               UErrorCode &errorCode) const {
+    // Declared outside the loop so its buffer is not thrown away between iterations.
+    UnicodeString normalized;
+    int32_t start=0;
+    while(start<src.length()) {
+        const int32_t limit=set.span(src, start, spanCondition);
+        if(spanCondition!=USET_SPAN_NOT_CONTAINED) {
+            // In-filter span: normalize it on its own and append the result.
+            // norm2.normalizeSecondAndAppend() is deliberately not used because
+            // the non-filter part already in dest must not be modified.
+            if(limit!=start) {
+                dest.append(norm2.normalize(src.tempSubStringBetween(start, limit),
+                                            normalized, errorCode));
+                if(U_FAILURE(errorCode)) {
+                    return dest;
+                }
+            }
+            spanCondition=USET_SPAN_NOT_CONTAINED;
+        } else {
+            // Out-of-filter span: copy it through unchanged.
+            if(limit!=start) {
+                dest.append(src, start, limit-start);
+            }
+            spanCondition=USET_SPAN_SIMPLE;
+        }
+        start=limit;
+    }
+    return dest;
+}
+
+// Appends second to first with normalization enabled; returns first.
+// Thin wrapper around the four-argument overload.
+UnicodeString &
+FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
+                                              const UnicodeString &second,
+                                              UErrorCode &errorCode) const {
+    UnicodeString &result=
+        normalizeSecondAndAppend(first, second, /* doNormalize */ TRUE, errorCode);
+    return result;
+}
+
+// Appends second to first with normalization of second disabled; returns first.
+// Thin wrapper around the four-argument normalizeSecondAndAppend() overload.
+UnicodeString &
+FilteredNormalizer2::append(UnicodeString &first,
+                            const UnicodeString &second,
+                            UErrorCode &errorCode) const {
+    UnicodeString &result=
+        normalizeSecondAndAppend(first, second, /* doNormalize */ FALSE, errorCode);
+    return result;
+}
+
+// Shared implementation behind normalizeSecondAndAppend() (doNormalize is TRUE)
+// and append() (doNormalize is FALSE).
+// Appends second to first and returns first.
+// - Returns first without appending if errorCode indicates a failure after
+//   the buffer checks on both strings.
+// - first and second must be distinct objects; otherwise errorCode is set to
+//   U_ILLEGAL_ARGUMENT_ERROR.
+UnicodeString &
+FilteredNormalizer2::normalizeSecondAndAppend(UnicodeString &first,
+ const UnicodeString &second,
+ UBool doNormalize,
+ UErrorCode &errorCode) const {
+ uprv_checkCanGetBuffer(first, errorCode);
+ uprv_checkCanGetBuffer(second, errorCode);
+ if(U_FAILURE(errorCode)) {
+ return first;
+ }
+ if(&first==&second) {
+ errorCode=U_ILLEGAL_ARGUMENT_ERROR;
+ return first;
+ }
+ // Nothing in first to merge with: either run the plain filtered
+ // normalization of second into first, or just copy second.
+ if(first.isEmpty()) {
+ if(doNormalize) {
+ return normalize(second, first, errorCode);
+ } else {
+ return first=second;
+ }
+ }
+ // merge the in-filter suffix of the first string with the in-filter prefix of the second
+ int32_t prefixLimit=set.span(second, 0, USET_SPAN_SIMPLE);
+ if(prefixLimit!=0) {
+ // second starts with in-filter text; prefix is a temporary substring of it.
+ UnicodeString prefix(second.tempSubString(0, prefixLimit));
+ int32_t suffixStart=set.spanBack(first, INT32_MAX, USET_SPAN_SIMPLE);
+ if(suffixStart==0) {
+ // The in-filter suffix starts at index 0, so norm2 may operate on
+ // first directly.
+ if(doNormalize) {
+ norm2.normalizeSecondAndAppend(first, prefix, errorCode);
+ } else {
+ norm2.append(first, prefix, errorCode);
+ }
+ } else {
+ // Only the tail of first from suffixStart on is handed to norm2:
+ // copy it out, merge the prefix into the copy, then write it back.
+ UnicodeString middle(first, suffixStart, INT32_MAX);
+ if(doNormalize) {
+ norm2.normalizeSecondAndAppend(middle, prefix, errorCode);
+ } else {
+ norm2.append(middle, prefix, errorCode);
+ }
+ first.replace(suffixStart, INT32_MAX, middle);
+ }
+ }
+ // Handle whatever remains of second after its in-filter prefix.
+ if(prefixLimit<second.length()) {
+ UnicodeString rest(second.tempSubString(prefixLimit, INT32_MAX));
+ if(doNormalize) {
+ // Continue after an in-filter prefix, so the worker is told to
+ // expect an out-of-filter span first.
+ normalize(rest, first, USET_SPAN_NOT_CONTAINED, errorCode);
+ } else {
+ first.append(rest);
+ }
+ }
+ return first;
+}
+
+// Forwards to norm2.getDecomposition() only when c is in the filter set.
+// For a code point outside the set, norm2 is not consulted and the result is false.
+UBool
+FilteredNormalizer2::getDecomposition(UChar32 c, UnicodeString &decomposition) const {
+    const UBool inFilter=set.contains(c);
+    return inFilter && norm2.getDecomposition(c, decomposition);
+}
+
+// Forwards to norm2.getRawDecomposition() only when c is in the filter set.
+// For a code point outside the set, norm2 is not consulted and the result is false.
+UBool
+FilteredNormalizer2::getRawDecomposition(UChar32 c, UnicodeString &decomposition) const {
+    const UBool inFilter=set.contains(c);
+    return inFilter && norm2.getRawDecomposition(c, decomposition);
+}
+
+// Composes a and b via norm2, but only if both code points are in the
+// filter set; otherwise returns U_SENTINEL without consulting norm2.
+UChar32
+FilteredNormalizer2::composePair(UChar32 a, UChar32 b) const {
+    if(!set.contains(a) || !set.contains(b)) {
+        return U_SENTINEL;
+    }
+    return norm2.composePair(a, b);
+}
+
+// Returns norm2's combining class for c if c is in the filter set,
+// and 0 for any code point outside the set.
+uint8_t
+FilteredNormalizer2::getCombiningClass(UChar32 c) const {
+    if(!set.contains(c)) {
+        return 0;
+    }
+    return norm2.getCombiningClass(c);
+}
+
+// Returns TRUE if every in-filter span of s is normalized according to norm2.
+// Out-of-filter spans are skipped without being examined.
+// Returns FALSE if errorCode indicates a failure, either on entry / after the
+// buffer check or as reported by norm2.
+UBool
+FilteredNormalizer2::isNormalized(const UnicodeString &s, UErrorCode &errorCode) const {
+    uprv_checkCanGetBuffer(s, errorCode);
+    if(U_FAILURE(errorCode)) {
+        return FALSE;
+    }
+    // Spans alternate, starting with an in-filter span.
+    USetSpanCondition condition=USET_SPAN_SIMPLE;
+    int32_t start=0;
+    while(start<s.length()) {
+        const int32_t limit=set.span(s, start, condition);
+        if(condition!=USET_SPAN_NOT_CONTAINED) {
+            const UBool spanIsNormalized=
+                norm2.isNormalized(s.tempSubStringBetween(start, limit), errorCode);
+            if(U_FAILURE(errorCode) || !spanIsNormalized) {
+                return FALSE;
+            }
+            condition=USET_SPAN_NOT_CONTAINED;
+        } else {
+            condition=USET_SPAN_SIMPLE;
+        }
+        start=limit;
+    }
+    return TRUE;
+}
+
+// Quick-checks the in-filter spans of s with norm2 and combines the answers:
+// - the first span that yields UNORM_NO (or any failure) ends the scan and
+//   that span's result is returned;
+// - otherwise UNORM_MAYBE if any span yielded UNORM_MAYBE, else UNORM_YES.
+// Returns UNORM_MAYBE if errorCode indicates a failure after the buffer check.
+UNormalizationCheckResult
+FilteredNormalizer2::quickCheck(const UnicodeString &s, UErrorCode &errorCode) const {
+    uprv_checkCanGetBuffer(s, errorCode);
+    if(U_FAILURE(errorCode)) {
+        return UNORM_MAYBE;
+    }
+    UNormalizationCheckResult overall=UNORM_YES;
+    // Spans alternate, starting with an in-filter span.
+    USetSpanCondition condition=USET_SPAN_SIMPLE;
+    int32_t start=0;
+    while(start<s.length()) {
+        const int32_t limit=set.span(s, start, condition);
+        if(condition!=USET_SPAN_NOT_CONTAINED) {
+            const UNormalizationCheckResult spanResult=
+                norm2.quickCheck(s.tempSubStringBetween(start, limit), errorCode);
+            if(U_FAILURE(errorCode) || spanResult==UNORM_NO) {
+                return spanResult;
+            }
+            if(spanResult==UNORM_MAYBE) {
+                overall=UNORM_MAYBE;
+            }
+            condition=USET_SPAN_NOT_CONTAINED;
+        } else {
+            condition=USET_SPAN_SIMPLE;
+        }
+        start=limit;
+    }
+    return overall;
+}
+
+// Returns the end index of the initial part of s for which the in-filter
+// spans pass norm2.spanQuickCheckYes() in full; out-of-filter spans are
+// skipped over. Returns s.length() if no in-filter span stops early.
+// Returns 0 if errorCode indicates a failure after the buffer check.
+int32_t
+FilteredNormalizer2::spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const {
+    uprv_checkCanGetBuffer(s, errorCode);
+    if(U_FAILURE(errorCode)) {
+        return 0;
+    }
+    // Spans alternate, starting with an in-filter span.
+    USetSpanCondition condition=USET_SPAN_SIMPLE;
+    int32_t start=0;
+    while(start<s.length()) {
+        const int32_t limit=set.span(s, start, condition);
+        if(condition!=USET_SPAN_NOT_CONTAINED) {
+            // norm2 reports a length relative to the substring; rebase it onto s.
+            const int32_t yesLength=
+                norm2.spanQuickCheckYes(s.tempSubStringBetween(start, limit), errorCode);
+            const int32_t yesLimit=start+yesLength;
+            if(U_FAILURE(errorCode) || yesLimit<limit) {
+                return yesLimit;
+            }
+            condition=USET_SPAN_NOT_CONTAINED;
+        } else {
+            condition=USET_SPAN_SIMPLE;
+        }
+        start=limit;
+    }
+    return s.length();
+}
+
+// A code point outside the filter set always has a boundary before it;
+// for code points inside the set the answer comes from norm2.
+UBool
+FilteredNormalizer2::hasBoundaryBefore(UChar32 c) const {
+    const UBool outsideFilter=!set.contains(c);
+    return outsideFilter || norm2.hasBoundaryBefore(c);
+}
+
+// A code point outside the filter set always has a boundary after it;
+// for code points inside the set the answer comes from norm2.
+UBool
+FilteredNormalizer2::hasBoundaryAfter(UChar32 c) const {
+    const UBool outsideFilter=!set.contains(c);
+    return outsideFilter || norm2.hasBoundaryAfter(c);
+}
+
+// A code point outside the filter set is always reported as inert;
+// for code points inside the set the answer comes from norm2.
+UBool
+FilteredNormalizer2::isInert(UChar32 c) const {
+    const UBool outsideFilter=!set.contains(c);
+    return outsideFilter || norm2.isInert(c);
+}
+
+U_NAMESPACE_END
+
+// C API ------------------------------------------------------------------- ***
+
+U_NAMESPACE_USE
+
+// C API: creates a FilteredNormalizer2 from norm2 and filterSet and returns it
+// as an opaque UNormalizer2 pointer.
+// - Returns NULL without doing anything if *pErrorCode already indicates a failure.
+// - Returns NULL and sets U_ILLEGAL_ARGUMENT_ERROR if norm2 or filterSet is NULL.
+// - Sets U_MEMORY_ALLOCATION_ERROR (and returns NULL) if the allocation yields NULL.
+U_CAPI UNormalizer2 * U_EXPORT2
+unorm2_openFiltered(const UNormalizer2 *norm2, const USet *filterSet, UErrorCode *pErrorCode) {
+    if(U_FAILURE(*pErrorCode)) {
+        return NULL;
+    }
+    // Both pointers are dereferenced below to form the constructor arguments,
+    // so a NULL for either one must be rejected here.
+    if(norm2==NULL || filterSet==NULL) {
+        *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR;
+        return NULL;
+    }
+    Normalizer2 *fn2=new FilteredNormalizer2(*(Normalizer2 *)norm2,
+                                             *UnicodeSet::fromUSet(filterSet));
+    if(fn2==NULL) {
+        *pErrorCode=U_MEMORY_ALLOCATION_ERROR;
+    }
+    return (UNormalizer2 *)fn2;
+}
+
+#endif // !UCONFIG_NO_NORMALIZATION