diff options
Diffstat (limited to 'deps/node/deps/icu-small/source/common/ustrtrns.cpp')
-rw-r--r-- | deps/node/deps/icu-small/source/common/ustrtrns.cpp | 1451 |
1 files changed, 0 insertions, 1451 deletions
diff --git a/deps/node/deps/icu-small/source/common/ustrtrns.cpp b/deps/node/deps/icu-small/source/common/ustrtrns.cpp deleted file mode 100644 index 583ec63c..00000000 --- a/deps/node/deps/icu-small/source/common/ustrtrns.cpp +++ /dev/null @@ -1,1451 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -****************************************************************************** -* -* Copyright (C) 2001-2016, International Business Machines -* Corporation and others. All Rights Reserved. -* -****************************************************************************** -* -* File ustrtrns.cpp -* -* Modification History: -* -* Date Name Description -* 9/10/2001 Ram Creation. -****************************************************************************** -*/ - -/******************************************************************************* - * - * u_strTo* and u_strFrom* APIs - * WCS functions moved to ustr_wcs.c for better modularization - * - ******************************************************************************* - */ - - -#include "unicode/putil.h" -#include "unicode/ustring.h" -#include "unicode/utf.h" -#include "unicode/utf8.h" -#include "unicode/utf16.h" -#include "cstring.h" -#include "cmemory.h" -#include "ustr_imp.h" -#include "uassert.h" - -U_CAPI UChar* U_EXPORT2 -u_strFromUTF32WithSub(UChar *dest, - int32_t destCapacity, - int32_t *pDestLength, - const UChar32 *src, - int32_t srcLength, - UChar32 subchar, int32_t *pNumSubstitutions, - UErrorCode *pErrorCode) { - const UChar32 *srcLimit; - UChar32 ch; - UChar *destLimit; - UChar *pDest; - int32_t reqLength; - int32_t numSubstitutions; - - /* args check */ - if(U_FAILURE(*pErrorCode)){ - return NULL; - } - if( (src==NULL && srcLength!=0) || srcLength < -1 || - (destCapacity<0) || (dest == NULL && destCapacity > 0) || - subchar > 0x10ffff || U_IS_SURROGATE(subchar) - ) { - *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; - return NULL; - } - - if(pNumSubstitutions != NULL) { - *pNumSubstitutions = 0; - } - - pDest = dest; - destLimit = (dest!=NULL)?(dest + destCapacity):NULL; - reqLength = 0; - numSubstitutions = 0; - - if(srcLength < 0) { - /* simple loop for conversion of a NUL-terminated BMP string */ - while((ch=*src) != 0 && - ((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff))) { - ++src; - if(pDest < destLimit) { - *pDest++ = (UChar)ch; - } else { - ++reqLength; - } - } - srcLimit = src; - if(ch != 0) { - /* "complicated" case, find the end of the remaining string */ - while(*++srcLimit != 0) {} - } - } else { - srcLimit = (src!=NULL)?(src + srcLength):NULL; - } - - /* convert with length */ - while(src < srcLimit) { - ch = *src++; - do { - /* usually "loops" once; twice only for writing subchar */ - if((uint32_t)ch < 0xd800 || (0xe000 <= ch && ch <= 0xffff)) { - if(pDest < destLimit) { - *pDest++ = (UChar)ch; - } else { - ++reqLength; - } - break; - } else if(0x10000 <= ch && ch <= 0x10ffff) { - if(pDest!=NULL && ((pDest + 2) <= destLimit)) { - *pDest++ = U16_LEAD(ch); - *pDest++ = U16_TRAIL(ch); - } else { - reqLength += 2; - } - break; - } else if((ch = subchar) < 0) { - /* surrogate code point, or not a Unicode code point at all */ - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - } else { - ++numSubstitutions; - } - } while(TRUE); - } - - reqLength += (int32_t)(pDest - dest); - if(pDestLength) { - *pDestLength = reqLength; - } - if(pNumSubstitutions != NULL) { - *pNumSubstitutions = numSubstitutions; - } - - /* Terminate the buffer */ - u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); - - return dest; -} - -U_CAPI UChar* U_EXPORT2 -u_strFromUTF32(UChar *dest, - int32_t destCapacity, - int32_t *pDestLength, - const UChar32 *src, - int32_t srcLength, - UErrorCode *pErrorCode) { - return u_strFromUTF32WithSub( - dest, destCapacity, pDestLength, - src, srcLength, - U_SENTINEL, NULL, - pErrorCode); -} - -U_CAPI UChar32* U_EXPORT2 -u_strToUTF32WithSub(UChar32 *dest, - int32_t destCapacity, - int32_t *pDestLength, - const UChar *src, - int32_t srcLength, - UChar32 subchar, int32_t *pNumSubstitutions, - UErrorCode *pErrorCode) { - const UChar *srcLimit; - UChar32 ch; - UChar ch2; - UChar32 *destLimit; - UChar32 *pDest; - int32_t reqLength; - int32_t numSubstitutions; - - /* args check */ - if(U_FAILURE(*pErrorCode)){ - return NULL; - } - if( (src==NULL && srcLength!=0) || srcLength < -1 || - (destCapacity<0) || (dest == NULL && destCapacity > 0) || - subchar > 0x10ffff || U_IS_SURROGATE(subchar) - ) { - *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; - return NULL; - } - - if(pNumSubstitutions != NULL) { - *pNumSubstitutions = 0; - } - - pDest = dest; - destLimit = (dest!=NULL)?(dest + destCapacity):NULL; - reqLength = 0; - numSubstitutions = 0; - - if(srcLength < 0) { - /* simple loop for conversion of a NUL-terminated BMP string */ - while((ch=*src) != 0 && !U16_IS_SURROGATE(ch)) { - ++src; - if(pDest < destLimit) { - *pDest++ = ch; - } else { - ++reqLength; - } - } - srcLimit = src; - if(ch != 0) { - /* "complicated" case, find the end of the remaining string */ - while(*++srcLimit != 0) {} - } - } else { - srcLimit = (src!=NULL)?(src + srcLength):NULL; - } - - /* convert with length */ - while(src < srcLimit) { - ch = *src++; - if(!U16_IS_SURROGATE(ch)) { - /* write or count ch below */ - } else if(U16_IS_SURROGATE_LEAD(ch) && src < srcLimit && U16_IS_TRAIL(ch2 = *src)) { - ++src; - ch = U16_GET_SUPPLEMENTARY(ch, ch2); - } else if((ch = subchar) < 0) { - /* unpaired surrogate */ - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - } else { - ++numSubstitutions; - } - if(pDest < destLimit) { - *pDest++ = ch; - } else { - ++reqLength; - } - } - - reqLength += (int32_t)(pDest - dest); - if(pDestLength) { - *pDestLength = reqLength; - } - if(pNumSubstitutions != NULL) { - *pNumSubstitutions = numSubstitutions; - } - - /* Terminate the buffer */ - u_terminateUChar32s(dest, destCapacity, reqLength, pErrorCode); - - return dest; -} - -U_CAPI UChar32* U_EXPORT2 -u_strToUTF32(UChar32 *dest, - int32_t destCapacity, - int32_t *pDestLength, - const UChar *src, - int32_t srcLength, - UErrorCode *pErrorCode) { - return u_strToUTF32WithSub( - dest, destCapacity, pDestLength, - src, srcLength, - U_SENTINEL, NULL, - pErrorCode); -} - -U_CAPI UChar* U_EXPORT2 -u_strFromUTF8WithSub(UChar *dest, - int32_t destCapacity, - int32_t *pDestLength, - const char* src, - int32_t srcLength, - UChar32 subchar, int32_t *pNumSubstitutions, - UErrorCode *pErrorCode){ - /* args check */ - if(U_FAILURE(*pErrorCode)) { - return NULL; - } - if( (src==NULL && srcLength!=0) || srcLength < -1 || - (destCapacity<0) || (dest == NULL && destCapacity > 0) || - subchar > 0x10ffff || U_IS_SURROGATE(subchar) - ) { - *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; - return NULL; - } - - if(pNumSubstitutions!=NULL) { - *pNumSubstitutions=0; - } - UChar *pDest = dest; - UChar *pDestLimit = dest+destCapacity; - int32_t reqLength = 0; - int32_t numSubstitutions=0; - - /* - * Inline processing of UTF-8 byte sequences: - * - * Byte sequences for the most common characters are handled inline in - * the conversion loops. In order to reduce the path lengths for those - * characters, the tests are arranged in a kind of binary search. - * ASCII (<=0x7f) is checked first, followed by the dividing point - * between 2- and 3-byte sequences (0xe0). - * The 3-byte branch is tested first to speed up CJK text. - * The compiler should combine the subtractions for the two tests for 0xe0. - * Each branch then tests for the other end of its range. - */ - - if(srcLength < 0){ - /* - * Transform a NUL-terminated string. - * The code explicitly checks for NULs only in the lead byte position. - * A NUL byte in the trail byte position fails the trail byte range check anyway. - */ - int32_t i; - UChar32 c; - for(i = 0; (c = (uint8_t)src[i]) != 0 && (pDest < pDestLimit);) { - // modified copy of U8_NEXT() - ++i; - if(U8_IS_SINGLE(c)) { - *pDest++=(UChar)c; - } else { - uint8_t __t1, __t2; - if( /* handle U+0800..U+FFFF inline */ - (0xe0<=(c) && (c)<0xf0) && - U8_IS_VALID_LEAD3_AND_T1((c), src[i]) && - (__t2=src[(i)+1]-0x80)<=0x3f) { - *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2; - i+=2; - } else if( /* handle U+0080..U+07FF inline */ - ((c)<0xe0 && (c)>=0xc2) && - (__t1=src[i]-0x80)<=0x3f) { - *pDest++ = (((c)&0x1f)<<6)|__t1; - ++(i); - } else { - /* function call for "complicated" and error cases */ - (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1); - if(c<0 && (++numSubstitutions, c = subchar) < 0) { - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - } else if(c<=0xFFFF) { - *(pDest++)=(UChar)c; - } else { - *(pDest++)=U16_LEAD(c); - if(pDest<pDestLimit) { - *(pDest++)=U16_TRAIL(c); - } else { - reqLength++; - break; - } - } - } - } - } - - /* Pre-flight the rest of the string. */ - while((c = (uint8_t)src[i]) != 0) { - // modified copy of U8_NEXT() - ++i; - if(U8_IS_SINGLE(c)) { - ++reqLength; - } else { - uint8_t __t1, __t2; - if( /* handle U+0800..U+FFFF inline */ - (0xe0<=(c) && (c)<0xf0) && - U8_IS_VALID_LEAD3_AND_T1((c), src[i]) && - (__t2=src[(i)+1]-0x80)<=0x3f) { - ++reqLength; - i+=2; - } else if( /* handle U+0080..U+07FF inline */ - ((c)<0xe0 && (c)>=0xc2) && - (__t1=src[i]-0x80)<=0x3f) { - ++reqLength; - ++(i); - } else { - /* function call for "complicated" and error cases */ - (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), -1, c, -1); - if(c<0 && (++numSubstitutions, c = subchar) < 0) { - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - } - reqLength += U16_LENGTH(c); - } - } - } - } else /* srcLength >= 0 */ { - /* Faster loop without ongoing checking for srcLength and pDestLimit. */ - int32_t i = 0; - UChar32 c; - for(;;) { - /* - * Each iteration of the inner loop progresses by at most 3 UTF-8 - * bytes and one UChar, for most characters. - * For supplementary code points (4 & 2), which are rare, - * there is an additional adjustment. - */ - int32_t count = (int32_t)(pDestLimit - pDest); - int32_t count2 = (srcLength - i) / 3; - if(count > count2) { - count = count2; /* min(remaining dest, remaining src/3) */ - } - if(count < 3) { - /* - * Too much overhead if we get near the end of the string, - * continue with the next loop. - */ - break; - } - - do { - // modified copy of U8_NEXT() - c = (uint8_t)src[i++]; - if(U8_IS_SINGLE(c)) { - *pDest++=(UChar)c; - } else { - uint8_t __t1, __t2; - if( /* handle U+0800..U+FFFF inline */ - (0xe0<=(c) && (c)<0xf0) && - ((i)+1)<srcLength && - U8_IS_VALID_LEAD3_AND_T1((c), src[i]) && - (__t2=src[(i)+1]-0x80)<=0x3f) { - *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2; - i+=2; - } else if( /* handle U+0080..U+07FF inline */ - ((c)<0xe0 && (c)>=0xc2) && - ((i)!=srcLength) && - (__t1=src[i]-0x80)<=0x3f) { - *pDest++ = (((c)&0x1f)<<6)|__t1; - ++(i); - } else { - if(c >= 0xf0 || subchar > 0xffff) { - // We may read up to four bytes and write up to two UChars, - // which we didn't account for with computing count, - // so we adjust it here. - if(--count == 0) { - --i; // back out byte c - break; - } - } - - /* function call for "complicated" and error cases */ - (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1); - if(c<0 && (++numSubstitutions, c = subchar) < 0) { - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - } else if(c<=0xFFFF) { - *(pDest++)=(UChar)c; - } else { - *(pDest++)=U16_LEAD(c); - *(pDest++)=U16_TRAIL(c); - } - } - } - } while(--count > 0); - } - - while(i < srcLength && (pDest < pDestLimit)) { - // modified copy of U8_NEXT() - c = (uint8_t)src[i++]; - if(U8_IS_SINGLE(c)) { - *pDest++=(UChar)c; - } else { - uint8_t __t1, __t2; - if( /* handle U+0800..U+FFFF inline */ - (0xe0<=(c) && (c)<0xf0) && - ((i)+1)<srcLength && - U8_IS_VALID_LEAD3_AND_T1((c), src[i]) && - (__t2=src[(i)+1]-0x80)<=0x3f) { - *pDest++ = (((c)&0xf)<<12)|((src[i]&0x3f)<<6)|__t2; - i+=2; - } else if( /* handle U+0080..U+07FF inline */ - ((c)<0xe0 && (c)>=0xc2) && - ((i)!=srcLength) && - (__t1=src[i]-0x80)<=0x3f) { - *pDest++ = (((c)&0x1f)<<6)|__t1; - ++(i); - } else { - /* function call for "complicated" and error cases */ - (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1); - if(c<0 && (++numSubstitutions, c = subchar) < 0) { - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - } else if(c<=0xFFFF) { - *(pDest++)=(UChar)c; - } else { - *(pDest++)=U16_LEAD(c); - if(pDest<pDestLimit) { - *(pDest++)=U16_TRAIL(c); - } else { - reqLength++; - break; - } - } - } - } - } - - /* Pre-flight the rest of the string. */ - while(i < srcLength) { - // modified copy of U8_NEXT() - c = (uint8_t)src[i++]; - if(U8_IS_SINGLE(c)) { - ++reqLength; - } else { - uint8_t __t1, __t2; - if( /* handle U+0800..U+FFFF inline */ - (0xe0<=(c) && (c)<0xf0) && - ((i)+1)<srcLength && - U8_IS_VALID_LEAD3_AND_T1((c), src[i]) && - (__t2=src[(i)+1]-0x80)<=0x3f) { - ++reqLength; - i+=2; - } else if( /* handle U+0080..U+07FF inline */ - ((c)<0xe0 && (c)>=0xc2) && - ((i)!=srcLength) && - (__t1=src[i]-0x80)<=0x3f) { - ++reqLength; - ++(i); - } else { - /* function call for "complicated" and error cases */ - (c)=utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, c, -1); - if(c<0 && (++numSubstitutions, c = subchar) < 0) { - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - } - reqLength += U16_LENGTH(c); - } - } - } - } - - reqLength+=(int32_t)(pDest - dest); - - if(pNumSubstitutions!=NULL) { - *pNumSubstitutions=numSubstitutions; - } - - if(pDestLength){ - *pDestLength = reqLength; - } - - /* Terminate the buffer */ - u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); - - return dest; -} - -U_CAPI UChar* U_EXPORT2 -u_strFromUTF8(UChar *dest, - int32_t destCapacity, - int32_t *pDestLength, - const char* src, - int32_t srcLength, - UErrorCode *pErrorCode){ - return u_strFromUTF8WithSub( - dest, destCapacity, pDestLength, - src, srcLength, - U_SENTINEL, NULL, - pErrorCode); -} - -U_CAPI UChar * U_EXPORT2 -u_strFromUTF8Lenient(UChar *dest, - int32_t destCapacity, - int32_t *pDestLength, - const char *src, - int32_t srcLength, - UErrorCode *pErrorCode) { - UChar *pDest = dest; - UChar32 ch; - int32_t reqLength = 0; - uint8_t* pSrc = (uint8_t*) src; - - /* args check */ - if(U_FAILURE(*pErrorCode)){ - return NULL; - } - - if( (src==NULL && srcLength!=0) || srcLength < -1 || - (destCapacity<0) || (dest == NULL && destCapacity > 0) - ) { - *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; - return NULL; - } - - if(srcLength < 0) { - /* Transform a NUL-terminated string. */ - UChar *pDestLimit = (dest!=NULL)?(dest+destCapacity):NULL; - uint8_t t1, t2, t3; /* trail bytes */ - - while(((ch = *pSrc) != 0) && (pDest < pDestLimit)) { - if(ch < 0xc0) { - /* - * ASCII, or a trail byte in lead position which is treated like - * a single-byte sequence for better character boundary - * resynchronization after illegal sequences. - */ - *pDest++=(UChar)ch; - ++pSrc; - continue; - } else if(ch < 0xe0) { /* U+0080..U+07FF */ - if((t1 = pSrc[1]) != 0) { - /* 0x3080 = (0xc0 << 6) + 0x80 */ - *pDest++ = (UChar)((ch << 6) + t1 - 0x3080); - pSrc += 2; - continue; - } - } else if(ch < 0xf0) { /* U+0800..U+FFFF */ - if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0) { - /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ - /* 0x2080 = (0x80 << 6) + 0x80 */ - *pDest++ = (UChar)((ch << 12) + (t1 << 6) + t2 - 0x2080); - pSrc += 3; - continue; - } - } else /* f0..f4 */ { /* U+10000..U+10FFFF */ - if((t1 = pSrc[1]) != 0 && (t2 = pSrc[2]) != 0 && (t3 = pSrc[3]) != 0) { - pSrc += 4; - /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ - ch = (ch << 18) + (t1 << 12) + (t2 << 6) + t3 - 0x3c82080; - *(pDest++) = U16_LEAD(ch); - if(pDest < pDestLimit) { - *(pDest++) = U16_TRAIL(ch); - } else { - reqLength = 1; - break; - } - continue; - } - } - - /* truncated character at the end */ - *pDest++ = 0xfffd; - while(*++pSrc != 0) {} - break; - } - - /* Pre-flight the rest of the string. */ - while((ch = *pSrc) != 0) { - if(ch < 0xc0) { - /* - * ASCII, or a trail byte in lead position which is treated like - * a single-byte sequence for better character boundary - * resynchronization after illegal sequences. - */ - ++reqLength; - ++pSrc; - continue; - } else if(ch < 0xe0) { /* U+0080..U+07FF */ - if(pSrc[1] != 0) { - ++reqLength; - pSrc += 2; - continue; - } - } else if(ch < 0xf0) { /* U+0800..U+FFFF */ - if(pSrc[1] != 0 && pSrc[2] != 0) { - ++reqLength; - pSrc += 3; - continue; - } - } else /* f0..f4 */ { /* U+10000..U+10FFFF */ - if(pSrc[1] != 0 && pSrc[2] != 0 && pSrc[3] != 0) { - reqLength += 2; - pSrc += 4; - continue; - } - } - - /* truncated character at the end */ - ++reqLength; - break; - } - } else /* srcLength >= 0 */ { - const uint8_t *pSrcLimit = (pSrc!=NULL)?(pSrc + srcLength):NULL; - - /* - * This function requires that if srcLength is given, then it must be - * destCapatity >= srcLength so that we need not check for - * destination buffer overflow in the loop. - */ - if(destCapacity < srcLength) { - if(pDestLength != NULL) { - *pDestLength = srcLength; /* this likely overestimates the true destLength! */ - } - *pErrorCode = U_BUFFER_OVERFLOW_ERROR; - return NULL; - } - - if((pSrcLimit - pSrc) >= 4) { - pSrcLimit -= 3; /* temporarily reduce pSrcLimit */ - - /* in this loop, we can always access at least 4 bytes, up to pSrc+3 */ - do { - ch = *pSrc++; - if(ch < 0xc0) { - /* - * ASCII, or a trail byte in lead position which is treated like - * a single-byte sequence for better character boundary - * resynchronization after illegal sequences. - */ - *pDest++=(UChar)ch; - } else if(ch < 0xe0) { /* U+0080..U+07FF */ - /* 0x3080 = (0xc0 << 6) + 0x80 */ - *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); - } else if(ch < 0xf0) { /* U+0800..U+FFFF */ - /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ - /* 0x2080 = (0x80 << 6) + 0x80 */ - ch = (ch << 12) + (*pSrc++ << 6); - *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); - } else /* f0..f4 */ { /* U+10000..U+10FFFF */ - /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ - ch = (ch << 18) + (*pSrc++ << 12); - ch += *pSrc++ << 6; - ch += *pSrc++ - 0x3c82080; - *(pDest++) = U16_LEAD(ch); - *(pDest++) = U16_TRAIL(ch); - } - } while(pSrc < pSrcLimit); - - pSrcLimit += 3; /* restore original pSrcLimit */ - } - - while(pSrc < pSrcLimit) { - ch = *pSrc++; - if(ch < 0xc0) { - /* - * ASCII, or a trail byte in lead position which is treated like - * a single-byte sequence for better character boundary - * resynchronization after illegal sequences. - */ - *pDest++=(UChar)ch; - continue; - } else if(ch < 0xe0) { /* U+0080..U+07FF */ - if(pSrc < pSrcLimit) { - /* 0x3080 = (0xc0 << 6) + 0x80 */ - *pDest++ = (UChar)((ch << 6) + *pSrc++ - 0x3080); - continue; - } - } else if(ch < 0xf0) { /* U+0800..U+FFFF */ - if((pSrcLimit - pSrc) >= 2) { - /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ - /* 0x2080 = (0x80 << 6) + 0x80 */ - ch = (ch << 12) + (*pSrc++ << 6); - *pDest++ = (UChar)(ch + *pSrc++ - 0x2080); - pSrc += 3; - continue; - } - } else /* f0..f4 */ { /* U+10000..U+10FFFF */ - if((pSrcLimit - pSrc) >= 3) { - /* 0x3c82080 = (0xf0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */ - ch = (ch << 18) + (*pSrc++ << 12); - ch += *pSrc++ << 6; - ch += *pSrc++ - 0x3c82080; - *(pDest++) = U16_LEAD(ch); - *(pDest++) = U16_TRAIL(ch); - pSrc += 4; - continue; - } - } - - /* truncated character at the end */ - *pDest++ = 0xfffd; - break; - } - } - - reqLength+=(int32_t)(pDest - dest); - - if(pDestLength){ - *pDestLength = reqLength; - } - - /* Terminate the buffer */ - u_terminateUChars(dest,destCapacity,reqLength,pErrorCode); - - return dest; -} - -static inline uint8_t * -_appendUTF8(uint8_t *pDest, UChar32 c) { - /* it is 0<=c<=0x10ffff and not a surrogate if called by a validating function */ - if((c)<=0x7f) { - *pDest++=(uint8_t)c; - } else if(c<=0x7ff) { - *pDest++=(uint8_t)((c>>6)|0xc0); - *pDest++=(uint8_t)((c&0x3f)|0x80); - } else if(c<=0xffff) { - *pDest++=(uint8_t)((c>>12)|0xe0); - *pDest++=(uint8_t)(((c>>6)&0x3f)|0x80); - *pDest++=(uint8_t)(((c)&0x3f)|0x80); - } else /* if((uint32_t)(c)<=0x10ffff) */ { - *pDest++=(uint8_t)(((c)>>18)|0xf0); - *pDest++=(uint8_t)((((c)>>12)&0x3f)|0x80); - *pDest++=(uint8_t)((((c)>>6)&0x3f)|0x80); - *pDest++=(uint8_t)(((c)&0x3f)|0x80); - } - return pDest; -} - - -U_CAPI char* U_EXPORT2 -u_strToUTF8WithSub(char *dest, - int32_t destCapacity, - int32_t *pDestLength, - const UChar *pSrc, - int32_t srcLength, - UChar32 subchar, int32_t *pNumSubstitutions, - UErrorCode *pErrorCode){ - int32_t reqLength=0; - uint32_t ch=0,ch2=0; - uint8_t *pDest = (uint8_t *)dest; - uint8_t *pDestLimit = (pDest!=NULL)?(pDest + destCapacity):NULL; - int32_t numSubstitutions; - - /* args check */ - if(U_FAILURE(*pErrorCode)){ - return NULL; - } - - if( (pSrc==NULL && srcLength!=0) || srcLength < -1 || - (destCapacity<0) || (dest == NULL && destCapacity > 0) || - subchar > 0x10ffff || U_IS_SURROGATE(subchar) - ) { - *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; - return NULL; - } - - if(pNumSubstitutions!=NULL) { - *pNumSubstitutions=0; - } - numSubstitutions=0; - - if(srcLength==-1) { - while((ch=*pSrc)!=0) { - ++pSrc; - if(ch <= 0x7f) { - if(pDest<pDestLimit) { - *pDest++ = (uint8_t)ch; - } else { - reqLength = 1; - break; - } - } else if(ch <= 0x7ff) { - if((pDestLimit - pDest) >= 2) { - *pDest++=(uint8_t)((ch>>6)|0xc0); - *pDest++=(uint8_t)((ch&0x3f)|0x80); - } else { - reqLength = 2; - break; - } - } else if(ch <= 0xd7ff || ch >= 0xe000) { - if((pDestLimit - pDest) >= 3) { - *pDest++=(uint8_t)((ch>>12)|0xe0); - *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); - *pDest++=(uint8_t)((ch&0x3f)|0x80); - } else { - reqLength = 3; - break; - } - } else /* ch is a surrogate */ { - int32_t length; - - /*need not check for NUL because NUL fails U16_IS_TRAIL() anyway*/ - if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { - ++pSrc; - ch=U16_GET_SUPPLEMENTARY(ch, ch2); - } else if(subchar>=0) { - ch=subchar; - ++numSubstitutions; - } else { - /* Unicode 3.2 forbids surrogate code points in UTF-8 */ - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - } - - length = U8_LENGTH(ch); - if((pDestLimit - pDest) >= length) { - /* convert and append*/ - pDest=_appendUTF8(pDest, ch); - } else { - reqLength = length; - break; - } - } - } - while((ch=*pSrc++)!=0) { - if(ch<=0x7f) { - ++reqLength; - } else if(ch<=0x7ff) { - reqLength+=2; - } else if(!U16_IS_SURROGATE(ch)) { - reqLength+=3; - } else if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { - ++pSrc; - reqLength+=4; - } else if(subchar>=0) { - reqLength+=U8_LENGTH(subchar); - ++numSubstitutions; - } else { - /* Unicode 3.2 forbids surrogate code points in UTF-8 */ - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - } - } - } else { - const UChar *pSrcLimit = (pSrc!=NULL)?(pSrc+srcLength):NULL; - int32_t count; - - /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ - for(;;) { - /* - * Each iteration of the inner loop progresses by at most 3 UTF-8 - * bytes and one UChar, for most characters. - * For supplementary code points (4 & 2), which are rare, - * there is an additional adjustment. - */ - count = (int32_t)((pDestLimit - pDest) / 3); - srcLength = (int32_t)(pSrcLimit - pSrc); - if(count > srcLength) { - count = srcLength; /* min(remaining dest/3, remaining src) */ - } - if(count < 3) { - /* - * Too much overhead if we get near the end of the string, - * continue with the next loop. - */ - break; - } - do { - ch=*pSrc++; - if(ch <= 0x7f) { - *pDest++ = (uint8_t)ch; - } else if(ch <= 0x7ff) { - *pDest++=(uint8_t)((ch>>6)|0xc0); - *pDest++=(uint8_t)((ch&0x3f)|0x80); - } else if(ch <= 0xd7ff || ch >= 0xe000) { - *pDest++=(uint8_t)((ch>>12)|0xe0); - *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); - *pDest++=(uint8_t)((ch&0x3f)|0x80); - } else /* ch is a surrogate */ { - /* - * We will read two UChars and probably output four bytes, - * which we didn't account for with computing count, - * so we adjust it here. - */ - if(--count == 0) { - --pSrc; /* undo ch=*pSrc++ for the lead surrogate */ - break; /* recompute count */ - } - - if(U16_IS_SURROGATE_LEAD(ch) && U16_IS_TRAIL(ch2=*pSrc)) { - ++pSrc; - ch=U16_GET_SUPPLEMENTARY(ch, ch2); - - /* writing 4 bytes per 2 UChars is ok */ - *pDest++=(uint8_t)((ch>>18)|0xf0); - *pDest++=(uint8_t)(((ch>>12)&0x3f)|0x80); - *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); - *pDest++=(uint8_t)((ch&0x3f)|0x80); - } else { - /* Unicode 3.2 forbids surrogate code points in UTF-8 */ - if(subchar>=0) { - ch=subchar; - ++numSubstitutions; - } else { - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - } - - /* convert and append*/ - pDest=_appendUTF8(pDest, ch); - } - } - } while(--count > 0); - } - - while(pSrc<pSrcLimit) { - ch=*pSrc++; - if(ch <= 0x7f) { - if(pDest<pDestLimit) { - *pDest++ = (uint8_t)ch; - } else { - reqLength = 1; - break; - } - } else if(ch <= 0x7ff) { - if((pDestLimit - pDest) >= 2) { - *pDest++=(uint8_t)((ch>>6)|0xc0); - *pDest++=(uint8_t)((ch&0x3f)|0x80); - } else { - reqLength = 2; - break; - } - } else if(ch <= 0xd7ff || ch >= 0xe000) { - if((pDestLimit - pDest) >= 3) { - *pDest++=(uint8_t)((ch>>12)|0xe0); - *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); - *pDest++=(uint8_t)((ch&0x3f)|0x80); - } else { - reqLength = 3; - break; - } - } else /* ch is a surrogate */ { - int32_t length; - - if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) { - ++pSrc; - ch=U16_GET_SUPPLEMENTARY(ch, ch2); - } else if(subchar>=0) { - ch=subchar; - ++numSubstitutions; - } else { - /* Unicode 3.2 forbids surrogate code points in UTF-8 */ - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - } - - length = U8_LENGTH(ch); - if((pDestLimit - pDest) >= length) { - /* convert and append*/ - pDest=_appendUTF8(pDest, ch); - } else { - reqLength = length; - break; - } - } - } - while(pSrc<pSrcLimit) { - ch=*pSrc++; - if(ch<=0x7f) { - ++reqLength; - } else if(ch<=0x7ff) { - reqLength+=2; - } else if(!U16_IS_SURROGATE(ch)) { - reqLength+=3; - } else if(U16_IS_SURROGATE_LEAD(ch) && pSrc<pSrcLimit && U16_IS_TRAIL(ch2=*pSrc)) { - ++pSrc; - reqLength+=4; - } else if(subchar>=0) { - reqLength+=U8_LENGTH(subchar); - ++numSubstitutions; - } else { - /* Unicode 3.2 forbids surrogate code points in UTF-8 */ - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - } - } - } - - reqLength+=(int32_t)(pDest - (uint8_t *)dest); - - if(pNumSubstitutions!=NULL) { - *pNumSubstitutions=numSubstitutions; - } - - if(pDestLength){ - *pDestLength = reqLength; - } - - /* Terminate the buffer */ - u_terminateChars(dest, destCapacity, reqLength, pErrorCode); - return dest; -} - -U_CAPI char* U_EXPORT2 -u_strToUTF8(char *dest, - int32_t destCapacity, - int32_t *pDestLength, - const UChar *pSrc, - int32_t srcLength, - UErrorCode *pErrorCode){ - return u_strToUTF8WithSub( - dest, destCapacity, pDestLength, - pSrc, srcLength, - U_SENTINEL, NULL, - pErrorCode); -} - -U_CAPI UChar* U_EXPORT2 -u_strFromJavaModifiedUTF8WithSub( - UChar *dest, - int32_t destCapacity, - int32_t *pDestLength, - const char *src, - int32_t srcLength, - UChar32 subchar, int32_t *pNumSubstitutions, - UErrorCode *pErrorCode) { - /* args check */ - if(U_FAILURE(*pErrorCode)) { - return NULL; - } - if( (src==NULL && srcLength!=0) || srcLength < -1 || - (dest==NULL && destCapacity!=0) || destCapacity<0 || - subchar > 0x10ffff || U_IS_SURROGATE(subchar) - ) { - *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; - return NULL; - } - - if(pNumSubstitutions!=NULL) { - *pNumSubstitutions=0; - } - UChar *pDest = dest; - UChar *pDestLimit = dest+destCapacity; - int32_t reqLength = 0; - int32_t numSubstitutions=0; - - if(srcLength < 0) { - /* - * Transform a NUL-terminated ASCII string. - * Handle non-ASCII strings with slower code. - */ - UChar32 c; - while(((c = (uint8_t)*src) != 0) && c <= 0x7f && (pDest < pDestLimit)) { - *pDest++=(UChar)c; - ++src; - } - if(c == 0) { - reqLength=(int32_t)(pDest - dest); - if(pDestLength) { - *pDestLength = reqLength; - } - - /* Terminate the buffer */ - u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); - return dest; - } - srcLength = static_cast<int32_t>(uprv_strlen(src)); - } - - /* Faster loop without ongoing checking for srcLength and pDestLimit. */ - UChar32 ch; - uint8_t t1, t2; - int32_t i = 0; - for(;;) { - int32_t count = (int32_t)(pDestLimit - pDest); - int32_t count2 = srcLength - i; - if(count >= count2 && srcLength > 0 && U8_IS_SINGLE(*src)) { - /* fast ASCII loop */ - int32_t start = i; - uint8_t b; - while(i < srcLength && U8_IS_SINGLE(b = src[i])) { - *pDest++=b; - ++i; - } - int32_t delta = i - start; - count -= delta; - count2 -= delta; - } - /* - * Each iteration of the inner loop progresses by at most 3 UTF-8 - * bytes and one UChar. - */ - if(subchar > 0xFFFF) { - break; - } - count2 /= 3; - if(count > count2) { - count = count2; /* min(remaining dest, remaining src/3) */ - } - if(count < 3) { - /* - * Too much overhead if we get near the end of the string, - * continue with the next loop. - */ - break; - } - do { - ch = (uint8_t)src[i++]; - if(U8_IS_SINGLE(ch)) { - *pDest++=(UChar)ch; - } else { - if(ch >= 0xe0) { - if( /* handle U+0000..U+FFFF inline */ - ch <= 0xef && - (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f && - (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f - ) { - /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ - *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); - i += 2; - continue; - } - } else { - if( /* handle U+0000..U+07FF inline */ - ch >= 0xc0 && - (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f - ) { - *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); - ++i; - continue; - } - } - - if(subchar < 0) { - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - } else if(subchar > 0xffff && --count == 0) { - /* - * We need to write two UChars, adjusted count for that, - * and ran out of space. - */ - --i; // back out byte ch - break; - } else { - /* function call for error cases */ - utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1); - ++numSubstitutions; - *(pDest++)=(UChar)subchar; - } - } - } while(--count > 0); - } - - while(i < srcLength && (pDest < pDestLimit)) { - ch = (uint8_t)src[i++]; - if(U8_IS_SINGLE(ch)){ - *pDest++=(UChar)ch; - } else { - if(ch >= 0xe0) { - if( /* handle U+0000..U+FFFF inline */ - ch <= 0xef && - (i+1) < srcLength && - (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f && - (t2 = (uint8_t)(src[i+1] - 0x80)) <= 0x3f - ) { - /* no need for (ch & 0xf) because the upper bits are truncated after <<12 in the cast to (UChar) */ - *pDest++ = (UChar)((ch << 12) | (t1 << 6) | t2); - i += 2; - continue; - } - } else { - if( /* handle U+0000..U+07FF inline */ - ch >= 0xc0 && - i < srcLength && - (t1 = (uint8_t)(src[i] - 0x80)) <= 0x3f - ) { - *pDest++ = (UChar)(((ch & 0x1f) << 6) | t1); - ++i; - continue; - } - } - - if(subchar < 0) { - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - } else { - /* function call for error cases */ - utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1); - ++numSubstitutions; - if(subchar<=0xFFFF) { - *(pDest++)=(UChar)subchar; - } else { - *(pDest++)=U16_LEAD(subchar); - if(pDest<pDestLimit) { - *(pDest++)=U16_TRAIL(subchar); - } else { - reqLength++; - break; - } - } - } - } - } - - /* Pre-flight the rest of the string. */ - while(i < srcLength) { - ch = (uint8_t)src[i++]; - if(U8_IS_SINGLE(ch)) { - reqLength++; - } else { - if(ch >= 0xe0) { - if( /* handle U+0000..U+FFFF inline */ - ch <= 0xef && - (i+1) < srcLength && - (uint8_t)(src[i] - 0x80) <= 0x3f && - (uint8_t)(src[i+1] - 0x80) <= 0x3f - ) { - reqLength++; - i += 2; - continue; - } - } else { - if( /* handle U+0000..U+07FF inline */ - ch >= 0xc0 && - i < srcLength && - (uint8_t)(src[i] - 0x80) <= 0x3f - ) { - reqLength++; - ++i; - continue; - } - } - - if(subchar < 0) { - *pErrorCode = U_INVALID_CHAR_FOUND; - return NULL; - } else { - /* function call for error cases */ - utf8_nextCharSafeBody((const uint8_t *)src, &(i), srcLength, ch, -1); - ++numSubstitutions; - reqLength+=U16_LENGTH(ch); - } - } - } - - if(pNumSubstitutions!=NULL) { - *pNumSubstitutions=numSubstitutions; - } - - reqLength+=(int32_t)(pDest - dest); - if(pDestLength) { - *pDestLength = reqLength; - } - - /* Terminate the buffer */ - u_terminateUChars(dest, destCapacity, reqLength, pErrorCode); - return dest; -} - -U_CAPI char* U_EXPORT2 -u_strToJavaModifiedUTF8( - char *dest, - int32_t destCapacity, - int32_t *pDestLength, - const UChar *src, - int32_t srcLength, - UErrorCode *pErrorCode) { - int32_t reqLength=0; - uint32_t ch=0; - uint8_t *pDest = (uint8_t *)dest; - uint8_t *pDestLimit = pDest + destCapacity; - const UChar *pSrcLimit; - int32_t count; - - /* args check */ - if(U_FAILURE(*pErrorCode)){ - return NULL; - } - if( (src==NULL && srcLength!=0) || srcLength < -1 || - (dest==NULL && destCapacity!=0) || destCapacity<0 - ) { - *pErrorCode = U_ILLEGAL_ARGUMENT_ERROR; - return NULL; - } - - if(srcLength==-1) { - /* Convert NUL-terminated ASCII, then find the string length. */ - while((ch=*src)<=0x7f && ch != 0 && pDest<pDestLimit) { - *pDest++ = (uint8_t)ch; - ++src; - } - if(ch == 0) { - reqLength=(int32_t)(pDest - (uint8_t *)dest); - if(pDestLength) { - *pDestLength = reqLength; - } - - /* Terminate the buffer */ - u_terminateChars(dest, destCapacity, reqLength, pErrorCode); - return dest; - } - srcLength = u_strlen(src); - } - - /* Faster loop without ongoing checking for pSrcLimit and pDestLimit. */ - pSrcLimit = (src!=NULL)?(src+srcLength):NULL; - for(;;) { - count = (int32_t)(pDestLimit - pDest); - srcLength = (int32_t)(pSrcLimit - src); - if(count >= srcLength && srcLength > 0 && *src <= 0x7f) { - /* fast ASCII loop */ - const UChar *prevSrc = src; - int32_t delta; - while(src < pSrcLimit && (ch = *src) <= 0x7f && ch != 0) { - *pDest++=(uint8_t)ch; - ++src; - } - delta = (int32_t)(src - prevSrc); - count -= delta; - srcLength -= delta; - } - /* - * Each iteration of the inner loop progresses by at most 3 UTF-8 - * bytes and one UChar. - */ - count /= 3; - if(count > srcLength) { - count = srcLength; /* min(remaining dest/3, remaining src) */ - } - if(count < 3) { - /* - * Too much overhead if we get near the end of the string, - * continue with the next loop. - */ - break; - } - do { - ch=*src++; - if(ch <= 0x7f && ch != 0) { - *pDest++ = (uint8_t)ch; - } else if(ch <= 0x7ff) { - *pDest++=(uint8_t)((ch>>6)|0xc0); - *pDest++=(uint8_t)((ch&0x3f)|0x80); - } else { - *pDest++=(uint8_t)((ch>>12)|0xe0); - *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); - *pDest++=(uint8_t)((ch&0x3f)|0x80); - } - } while(--count > 0); - } - - while(src<pSrcLimit) { - ch=*src++; - if(ch <= 0x7f && ch != 0) { - if(pDest<pDestLimit) { - *pDest++ = (uint8_t)ch; - } else { - reqLength = 1; - break; - } - } else if(ch <= 0x7ff) { - if((pDestLimit - pDest) >= 2) { - *pDest++=(uint8_t)((ch>>6)|0xc0); - *pDest++=(uint8_t)((ch&0x3f)|0x80); - } else { - reqLength = 2; - break; - } - } else { - if((pDestLimit - pDest) >= 3) { - *pDest++=(uint8_t)((ch>>12)|0xe0); - *pDest++=(uint8_t)(((ch>>6)&0x3f)|0x80); - *pDest++=(uint8_t)((ch&0x3f)|0x80); - } else { - reqLength = 3; - break; - } - } - } - while(src<pSrcLimit) { - ch=*src++; - if(ch <= 0x7f && ch != 0) { - ++reqLength; - } else if(ch<=0x7ff) { - reqLength+=2; - } else { - reqLength+=3; - } - } - - reqLength+=(int32_t)(pDest - (uint8_t *)dest); - if(pDestLength){ - *pDestLength = reqLength; - } - - /* Terminate the buffer */ - u_terminateChars(dest, destCapacity, reqLength, pErrorCode); - return dest; -} |