diff options
Diffstat (limited to 'deps/node/deps/icu-small/source/common/ucnvbocu.cpp')
-rw-r--r-- | deps/node/deps/icu-small/source/common/ucnvbocu.cpp | 1413 |
1 files changed, 0 insertions, 1413 deletions
diff --git a/deps/node/deps/icu-small/source/common/ucnvbocu.cpp b/deps/node/deps/icu-small/source/common/ucnvbocu.cpp deleted file mode 100644 index 5b66c505..00000000 --- a/deps/node/deps/icu-small/source/common/ucnvbocu.cpp +++ /dev/null @@ -1,1413 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -****************************************************************************** -* -* Copyright (C) 2002-2016, International Business Machines -* Corporation and others. All Rights Reserved. -* -****************************************************************************** -* file name: ucnvbocu.cpp -* encoding: UTF-8 -* tab size: 8 (not used) -* indentation:4 -* -* created on: 2002mar27 -* created by: Markus W. Scherer -* -* This is an implementation of the Binary Ordered Compression for Unicode, -* in its MIME-friendly form as defined in http://www.unicode.org/notes/tn6/ -*/ - -#include "unicode/utypes.h" - -#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION - -#include "unicode/ucnv.h" -#include "unicode/ucnv_cb.h" -#include "unicode/utf16.h" -#include "putilimp.h" -#include "ucnv_bld.h" -#include "ucnv_cnv.h" -#include "uassert.h" - -/* BOCU-1 constants and macros ---------------------------------------------- */ - -/* - * BOCU-1 encodes the code points of a Unicode string as - * a sequence of byte-encoded differences (slope detection), - * preserving lexical order. - * - * Optimize the difference-taking for runs of Unicode text within - * small scripts: - * - * Most small scripts are allocated within aligned 128-blocks of Unicode - * code points. Lexical order is preserved if the "previous code point" state - * is always moved into the middle of such a block. - * - * Additionally, "prev" is moved from anywhere in the Unihan and Hangul - * areas into the middle of those areas. - * - * C0 control codes and space are encoded with their US-ASCII bytes. - * "prev" is reset for C0 controls but not for space. - */ - -/* initial value for "prev": middle of the ASCII range */ -#define BOCU1_ASCII_PREV 0x40 - -/* bounding byte values for differences */ -#define BOCU1_MIN 0x21 -#define BOCU1_MIDDLE 0x90 -#define BOCU1_MAX_LEAD 0xfe -#define BOCU1_MAX_TRAIL 0xff -#define BOCU1_RESET 0xff - -/* number of lead bytes */ -#define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1) - -/* adjust trail byte counts for the use of some C0 control byte values */ -#define BOCU1_TRAIL_CONTROLS_COUNT 20 -#define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT) - -/* number of trail bytes */ -#define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT) - -/* - * number of positive and negative single-byte codes - * (counting 0==BOCU1_MIDDLE among the positive ones) - */ -#define BOCU1_SINGLE 64 - -/* number of lead bytes for positive and negative 2/3/4-byte sequences */ -#define BOCU1_LEAD_2 43 -#define BOCU1_LEAD_3 3 -#define BOCU1_LEAD_4 1 - -/* The difference value range for single-byters. */ -#define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1) -#define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE) - -/* The difference value range for double-byters. */ -#define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT) -#define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT) - -/* The difference value range for 3-byters. */ -#define BOCU1_REACH_POS_3 \ - (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) - -#define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) - -/* The lead byte start values. */ -#define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1) -#define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2) -#define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3) - /* ==BOCU1_MAX_LEAD */ - -#define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1) -#define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2) -#define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3) - /* ==BOCU1_MIN+1 */ - -/* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */ -#define BOCU1_LENGTH_FROM_LEAD(lead) \ - ((BOCU1_START_NEG_2<=(lead) && (lead)<BOCU1_START_POS_2) ? 1 : \ - (BOCU1_START_NEG_3<=(lead) && (lead)<BOCU1_START_POS_3) ? 2 : \ - (BOCU1_START_NEG_4<=(lead) && (lead)<BOCU1_START_POS_4) ? 3 : 4) - -/* The length of a byte sequence, according to its packed form. */ -#define BOCU1_LENGTH_FROM_PACKED(packed) \ - ((uint32_t)(packed)<0x04000000 ? (packed)>>24 : 4) - -/* - * 12 commonly used C0 control codes (and space) are only used to encode - * themselves directly, - * which makes BOCU-1 MIME-usable and reasonably safe for - * ASCII-oriented software. - * - * These controls are - * 0 NUL - * - * 7 BEL - * 8 BS - * - * 9 TAB - * a LF - * b VT - * c FF - * d CR - * - * e SO - * f SI - * - * 1a SUB - * 1b ESC - * - * The other 20 C0 controls are also encoded directly (to preserve order) - * but are also used as trail bytes in difference encoding - * (for better compression). - */ -#define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t]) - -/* - * Byte value map for control codes, - * from external byte values 0x00..0x20 - * to trail byte values 0..19 (0..0x13) as used in the difference calculation. - * External byte values that are illegal as trail bytes are mapped to -1. - */ -static const int8_t -bocu1ByteToTrail[BOCU1_MIN]={ -/* 0 1 2 3 4 5 6 7 */ - -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1, - -/* 8 9 a b c d e f */ - -1, -1, -1, -1, -1, -1, -1, -1, - -/* 10 11 12 13 14 15 16 17 */ - 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, - -/* 18 19 1a 1b 1c 1d 1e 1f */ - 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13, - -/* 20 */ - -1 -}; - -/* - * Byte value map for control codes, - * from trail byte values 0..19 (0..0x13) as used in the difference calculation - * to external byte values 0x00..0x20. - */ -static const int8_t -bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={ -/* 0 1 2 3 4 5 6 7 */ - 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11, - -/* 8 9 a b c d e f */ - 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, - -/* 10 11 12 13 */ - 0x1c, 0x1d, 0x1e, 0x1f -}; - -/** - * Integer division and modulo with negative numerators - * yields negative modulo results and quotients that are one more than - * what we need here. - * This macro adjust the results so that the modulo-value m is always >=0. - * - * For positive n, the if() condition is always FALSE. - * - * @param n Number to be split into quotient and rest. - * Will be modified to contain the quotient. - * @param d Divisor. - * @param m Output variable for the rest (modulo result). - */ -#define NEGDIVMOD(n, d, m) { \ - (m)=(n)%(d); \ - (n)/=(d); \ - if((m)<0) { \ - --(n); \ - (m)+=(d); \ - } \ -} - -/* Faster versions of packDiff() for single-byte-encoded diff values. */ - -/** Is a diff value encodable in a single byte? */ -#define DIFF_IS_SINGLE(diff) (BOCU1_REACH_NEG_1<=(diff) && (diff)<=BOCU1_REACH_POS_1) - -/** Encode a diff value in a single byte. */ -#define PACK_SINGLE_DIFF(diff) (BOCU1_MIDDLE+(diff)) - -/** Is a diff value encodable in two bytes? */ -#define DIFF_IS_DOUBLE(diff) (BOCU1_REACH_NEG_2<=(diff) && (diff)<=BOCU1_REACH_POS_2) - -/* BOCU-1 implementation functions ------------------------------------------ */ - -#define BOCU1_SIMPLE_PREV(c) (((c)&~0x7f)+BOCU1_ASCII_PREV) - -/** - * Compute the next "previous" value for differencing - * from the current code point. - * - * @param c current code point, 0x3040..0xd7a3 (rest handled by macro below) - * @return "previous code point" state value - */ -static inline int32_t -bocu1Prev(int32_t c) { - /* compute new prev */ - if(/* 0x3040<=c && */ c<=0x309f) { - /* Hiragana is not 128-aligned */ - return 0x3070; - } else if(0x4e00<=c && c<=0x9fa5) { - /* CJK Unihan */ - return 0x4e00-BOCU1_REACH_NEG_2; - } else if(0xac00<=c /* && c<=0xd7a3 */) { - /* Korean Hangul */ - return (0xd7a3+0xac00)/2; - } else { - /* mostly small scripts */ - return BOCU1_SIMPLE_PREV(c); - } -} - -/** Fast version of bocu1Prev() for most scripts. */ -#define BOCU1_PREV(c) ((c)<0x3040 || (c)>0xd7a3 ? BOCU1_SIMPLE_PREV(c) : bocu1Prev(c)) - -/* - * The BOCU-1 converter uses the standard setup code in ucnv.c/ucnv_bld.c. - * The UConverter fields are used as follows: - * - * fromUnicodeStatus encoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) - * - * toUnicodeStatus decoder's prev (0 will be interpreted as BOCU1_ASCII_PREV) - * mode decoder's incomplete (diff<<2)|count (ignored when toULength==0) - */ - -/* BOCU-1-from-Unicode conversion functions --------------------------------- */ - -/** - * Encode a difference -0x10ffff..0x10ffff in 1..4 bytes - * and return a packed integer with them. - * - * The encoding favors small absolute differences with short encodings - * to compress runs of same-script characters. - * - * Optimized version with unrolled loops and fewer floating-point operations - * than the standard packDiff(). - * - * @param diff difference value -0x10ffff..0x10ffff - * @return - * 0x010000zz for 1-byte sequence zz - * 0x0200yyzz for 2-byte sequence yy zz - * 0x03xxyyzz for 3-byte sequence xx yy zz - * 0xwwxxyyzz for 4-byte sequence ww xx yy zz (ww>0x03) - */ -static int32_t -packDiff(int32_t diff) { - int32_t result, m; - - U_ASSERT(!DIFF_IS_SINGLE(diff)); /* assume we won't be called where diff==BOCU1_REACH_NEG_1=-64 */ - if(diff>=BOCU1_REACH_NEG_1) { - /* mostly positive differences, and single-byte negative ones */ -#if 0 /* single-byte case handled in macros, see below */ - if(diff<=BOCU1_REACH_POS_1) { - /* single byte */ - return 0x01000000|(BOCU1_MIDDLE+diff); - } else -#endif - if(diff<=BOCU1_REACH_POS_2) { - /* two bytes */ - diff-=BOCU1_REACH_POS_1+1; - result=0x02000000; - - m=diff%BOCU1_TRAIL_COUNT; - diff/=BOCU1_TRAIL_COUNT; - result|=BOCU1_TRAIL_TO_BYTE(m); - - result|=(BOCU1_START_POS_2+diff)<<8; - } else if(diff<=BOCU1_REACH_POS_3) { - /* three bytes */ - diff-=BOCU1_REACH_POS_2+1; - result=0x03000000; - - m=diff%BOCU1_TRAIL_COUNT; - diff/=BOCU1_TRAIL_COUNT; - result|=BOCU1_TRAIL_TO_BYTE(m); - - m=diff%BOCU1_TRAIL_COUNT; - diff/=BOCU1_TRAIL_COUNT; - result|=BOCU1_TRAIL_TO_BYTE(m)<<8; - - result|=(BOCU1_START_POS_3+diff)<<16; - } else { - /* four bytes */ - diff-=BOCU1_REACH_POS_3+1; - - m=diff%BOCU1_TRAIL_COUNT; - diff/=BOCU1_TRAIL_COUNT; - result=BOCU1_TRAIL_TO_BYTE(m); - - m=diff%BOCU1_TRAIL_COUNT; - diff/=BOCU1_TRAIL_COUNT; - result|=BOCU1_TRAIL_TO_BYTE(m)<<8; - - /* - * We know that / and % would deliver quotient 0 and rest=diff. - * Avoid division and modulo for performance. - */ - result|=BOCU1_TRAIL_TO_BYTE(diff)<<16; - - result|=((uint32_t)BOCU1_START_POS_4)<<24; - } - } else { - /* two- to four-byte negative differences */ - if(diff>=BOCU1_REACH_NEG_2) { - /* two bytes */ - diff-=BOCU1_REACH_NEG_1; - result=0x02000000; - - NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); - result|=BOCU1_TRAIL_TO_BYTE(m); - - result|=(BOCU1_START_NEG_2+diff)<<8; - } else if(diff>=BOCU1_REACH_NEG_3) { - /* three bytes */ - diff-=BOCU1_REACH_NEG_2; - result=0x03000000; - - NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); - result|=BOCU1_TRAIL_TO_BYTE(m); - - NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); - result|=BOCU1_TRAIL_TO_BYTE(m)<<8; - - result|=(BOCU1_START_NEG_3+diff)<<16; - } else { - /* four bytes */ - diff-=BOCU1_REACH_NEG_3; - - NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); - result=BOCU1_TRAIL_TO_BYTE(m); - - NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); - result|=BOCU1_TRAIL_TO_BYTE(m)<<8; - - /* - * We know that NEGDIVMOD would deliver - * quotient -1 and rest=diff+BOCU1_TRAIL_COUNT. - * Avoid division and modulo for performance. - */ - m=diff+BOCU1_TRAIL_COUNT; - result|=BOCU1_TRAIL_TO_BYTE(m)<<16; - - result|=BOCU1_MIN<<24; - } - } - return result; -} - - -static void U_CALLCONV -_Bocu1FromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, - UErrorCode *pErrorCode) { - UConverter *cnv; - const UChar *source, *sourceLimit; - uint8_t *target; - int32_t targetCapacity; - int32_t *offsets; - - int32_t prev, c, diff; - - int32_t sourceIndex, nextSourceIndex; - - /* set up the local pointers */ - cnv=pArgs->converter; - source=pArgs->source; - sourceLimit=pArgs->sourceLimit; - target=(uint8_t *)pArgs->target; - targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); - offsets=pArgs->offsets; - - /* get the converter state from UConverter */ - c=cnv->fromUChar32; - prev=(int32_t)cnv->fromUnicodeStatus; - if(prev==0) { - prev=BOCU1_ASCII_PREV; - } - - /* sourceIndex=-1 if the current character began in the previous buffer */ - sourceIndex= c==0 ? 0 : -1; - nextSourceIndex=0; - - /* conversion loop */ - if(c!=0 && targetCapacity>0) { - goto getTrail; - } - -fastSingle: - /* fast loop for single-byte differences */ - /* use only one loop counter variable, targetCapacity, not also source */ - diff=(int32_t)(sourceLimit-source); - if(targetCapacity>diff) { - targetCapacity=diff; - } - while(targetCapacity>0 && (c=*source)<0x3000) { - if(c<=0x20) { - if(c!=0x20) { - prev=BOCU1_ASCII_PREV; - } - *target++=(uint8_t)c; - *offsets++=nextSourceIndex++; - ++source; - --targetCapacity; - } else { - diff=c-prev; - if(DIFF_IS_SINGLE(diff)) { - prev=BOCU1_SIMPLE_PREV(c); - *target++=(uint8_t)PACK_SINGLE_DIFF(diff); - *offsets++=nextSourceIndex++; - ++source; - --targetCapacity; - } else { - break; - } - } - } - /* restore real values */ - targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target); - sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ - - /* regular loop for all cases */ - while(source<sourceLimit) { - if(targetCapacity>0) { - c=*source++; - ++nextSourceIndex; - - if(c<=0x20) { - /* - * ISO C0 control & space: - * Encode directly for MIME compatibility, - * and reset state except for space, to not disrupt compression. - */ - if(c!=0x20) { - prev=BOCU1_ASCII_PREV; - } - *target++=(uint8_t)c; - *offsets++=sourceIndex; - --targetCapacity; - - sourceIndex=nextSourceIndex; - continue; - } - - if(U16_IS_LEAD(c)) { -getTrail: - if(source<sourceLimit) { - /* test the following code unit */ - UChar trail=*source; - if(U16_IS_TRAIL(trail)) { - ++source; - ++nextSourceIndex; - c=U16_GET_SUPPLEMENTARY(c, trail); - } - } else { - /* no more input */ - c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ - break; - } - } - - /* - * all other Unicode code points c==U+0021..U+10ffff - * are encoded with the difference c-prev - * - * a new prev is computed from c, - * placed in the middle of a 0x80-block (for most small scripts) or - * in the middle of the Unihan and Hangul blocks - * to statistically minimize the following difference - */ - diff=c-prev; - prev=BOCU1_PREV(c); - if(DIFF_IS_SINGLE(diff)) { - *target++=(uint8_t)PACK_SINGLE_DIFF(diff); - *offsets++=sourceIndex; - --targetCapacity; - sourceIndex=nextSourceIndex; - if(c<0x3000) { - goto fastSingle; - } - } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { - /* optimize 2-byte case */ - int32_t m; - - if(diff>=0) { - diff-=BOCU1_REACH_POS_1+1; - m=diff%BOCU1_TRAIL_COUNT; - diff/=BOCU1_TRAIL_COUNT; - diff+=BOCU1_START_POS_2; - } else { - diff-=BOCU1_REACH_NEG_1; - NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); - diff+=BOCU1_START_NEG_2; - } - *target++=(uint8_t)diff; - *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); - *offsets++=sourceIndex; - *offsets++=sourceIndex; - targetCapacity-=2; - sourceIndex=nextSourceIndex; - } else { - int32_t length; /* will be 2..4 */ - - diff=packDiff(diff); - length=BOCU1_LENGTH_FROM_PACKED(diff); - - /* write the output character bytes from diff and length */ - /* from the first if in the loop we know that targetCapacity>0 */ - if(length<=targetCapacity) { - switch(length) { - /* each branch falls through to the next one */ - case 4: - *target++=(uint8_t)(diff>>24); - *offsets++=sourceIndex; - U_FALLTHROUGH; - case 3: - *target++=(uint8_t)(diff>>16); - *offsets++=sourceIndex; - U_FALLTHROUGH; - case 2: - *target++=(uint8_t)(diff>>8); - *offsets++=sourceIndex; - /* case 1: handled above */ - *target++=(uint8_t)diff; - *offsets++=sourceIndex; - U_FALLTHROUGH; - default: - /* will never occur */ - break; - } - targetCapacity-=length; - sourceIndex=nextSourceIndex; - } else { - uint8_t *charErrorBuffer; - - /* - * We actually do this backwards here: - * In order to save an intermediate variable, we output - * first to the overflow buffer what does not fit into the - * regular target. - */ - /* we know that 1<=targetCapacity<length<=4 */ - length-=targetCapacity; - charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; - switch(length) { - /* each branch falls through to the next one */ - case 3: - *charErrorBuffer++=(uint8_t)(diff>>16); - U_FALLTHROUGH; - case 2: - *charErrorBuffer++=(uint8_t)(diff>>8); - U_FALLTHROUGH; - case 1: - *charErrorBuffer=(uint8_t)diff; - U_FALLTHROUGH; - default: - /* will never occur */ - break; - } - cnv->charErrorBufferLength=(int8_t)length; - - /* now output what fits into the regular target */ - diff>>=8*length; /* length was reduced by targetCapacity */ - switch(targetCapacity) { - /* each branch falls through to the next one */ - case 3: - *target++=(uint8_t)(diff>>16); - *offsets++=sourceIndex; - U_FALLTHROUGH; - case 2: - *target++=(uint8_t)(diff>>8); - *offsets++=sourceIndex; - U_FALLTHROUGH; - case 1: - *target++=(uint8_t)diff; - *offsets++=sourceIndex; - U_FALLTHROUGH; - default: - /* will never occur */ - break; - } - - /* target overflow */ - targetCapacity=0; - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - } - } else { - /* target is full */ - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - } - - /* set the converter state back into UConverter */ - cnv->fromUChar32= c<0 ? -c : 0; - cnv->fromUnicodeStatus=(uint32_t)prev; - - /* write back the updated pointers */ - pArgs->source=source; - pArgs->target=(char *)target; - pArgs->offsets=offsets; -} - -/* - * Identical to _Bocu1FromUnicodeWithOffsets but without offset handling. - * If a change is made in the original function, then either - * change this function the same way or - * re-copy the original function and remove the variables - * offsets, sourceIndex, and nextSourceIndex. - */ -static void U_CALLCONV -_Bocu1FromUnicode(UConverterFromUnicodeArgs *pArgs, - UErrorCode *pErrorCode) { - UConverter *cnv; - const UChar *source, *sourceLimit; - uint8_t *target; - int32_t targetCapacity; - - int32_t prev, c, diff; - - /* set up the local pointers */ - cnv=pArgs->converter; - source=pArgs->source; - sourceLimit=pArgs->sourceLimit; - target=(uint8_t *)pArgs->target; - targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); - - /* get the converter state from UConverter */ - c=cnv->fromUChar32; - prev=(int32_t)cnv->fromUnicodeStatus; - if(prev==0) { - prev=BOCU1_ASCII_PREV; - } - - /* conversion loop */ - if(c!=0 && targetCapacity>0) { - goto getTrail; - } - -fastSingle: - /* fast loop for single-byte differences */ - /* use only one loop counter variable, targetCapacity, not also source */ - diff=(int32_t)(sourceLimit-source); - if(targetCapacity>diff) { - targetCapacity=diff; - } - while(targetCapacity>0 && (c=*source)<0x3000) { - if(c<=0x20) { - if(c!=0x20) { - prev=BOCU1_ASCII_PREV; - } - *target++=(uint8_t)c; - } else { - diff=c-prev; - if(DIFF_IS_SINGLE(diff)) { - prev=BOCU1_SIMPLE_PREV(c); - *target++=(uint8_t)PACK_SINGLE_DIFF(diff); - } else { - break; - } - } - ++source; - --targetCapacity; - } - /* restore real values */ - targetCapacity=(int32_t)((const uint8_t *)pArgs->targetLimit-target); - - /* regular loop for all cases */ - while(source<sourceLimit) { - if(targetCapacity>0) { - c=*source++; - - if(c<=0x20) { - /* - * ISO C0 control & space: - * Encode directly for MIME compatibility, - * and reset state except for space, to not disrupt compression. - */ - if(c!=0x20) { - prev=BOCU1_ASCII_PREV; - } - *target++=(uint8_t)c; - --targetCapacity; - continue; - } - - if(U16_IS_LEAD(c)) { -getTrail: - if(source<sourceLimit) { - /* test the following code unit */ - UChar trail=*source; - if(U16_IS_TRAIL(trail)) { - ++source; - c=U16_GET_SUPPLEMENTARY(c, trail); - } - } else { - /* no more input */ - c=-c; /* negative lead surrogate as "incomplete" indicator to avoid c=0 everywhere else */ - break; - } - } - - /* - * all other Unicode code points c==U+0021..U+10ffff - * are encoded with the difference c-prev - * - * a new prev is computed from c, - * placed in the middle of a 0x80-block (for most small scripts) or - * in the middle of the Unihan and Hangul blocks - * to statistically minimize the following difference - */ - diff=c-prev; - prev=BOCU1_PREV(c); - if(DIFF_IS_SINGLE(diff)) { - *target++=(uint8_t)PACK_SINGLE_DIFF(diff); - --targetCapacity; - if(c<0x3000) { - goto fastSingle; - } - } else if(DIFF_IS_DOUBLE(diff) && 2<=targetCapacity) { - /* optimize 2-byte case */ - int32_t m; - - if(diff>=0) { - diff-=BOCU1_REACH_POS_1+1; - m=diff%BOCU1_TRAIL_COUNT; - diff/=BOCU1_TRAIL_COUNT; - diff+=BOCU1_START_POS_2; - } else { - diff-=BOCU1_REACH_NEG_1; - NEGDIVMOD(diff, BOCU1_TRAIL_COUNT, m); - diff+=BOCU1_START_NEG_2; - } - *target++=(uint8_t)diff; - *target++=(uint8_t)BOCU1_TRAIL_TO_BYTE(m); - targetCapacity-=2; - } else { - int32_t length; /* will be 2..4 */ - - diff=packDiff(diff); - length=BOCU1_LENGTH_FROM_PACKED(diff); - - /* write the output character bytes from diff and length */ - /* from the first if in the loop we know that targetCapacity>0 */ - if(length<=targetCapacity) { - switch(length) { - /* each branch falls through to the next one */ - case 4: - *target++=(uint8_t)(diff>>24); - U_FALLTHROUGH; - case 3: - *target++=(uint8_t)(diff>>16); - /* case 2: handled above */ - *target++=(uint8_t)(diff>>8); - /* case 1: handled above */ - *target++=(uint8_t)diff; - U_FALLTHROUGH; - default: - /* will never occur */ - break; - } - targetCapacity-=length; - } else { - uint8_t *charErrorBuffer; - - /* - * We actually do this backwards here: - * In order to save an intermediate variable, we output - * first to the overflow buffer what does not fit into the - * regular target. - */ - /* we know that 1<=targetCapacity<length<=4 */ - length-=targetCapacity; - charErrorBuffer=(uint8_t *)cnv->charErrorBuffer; - switch(length) { - /* each branch falls through to the next one */ - case 3: - *charErrorBuffer++=(uint8_t)(diff>>16); - U_FALLTHROUGH; - case 2: - *charErrorBuffer++=(uint8_t)(diff>>8); - U_FALLTHROUGH; - case 1: - *charErrorBuffer=(uint8_t)diff; - U_FALLTHROUGH; - default: - /* will never occur */ - break; - } - cnv->charErrorBufferLength=(int8_t)length; - - /* now output what fits into the regular target */ - diff>>=8*length; /* length was reduced by targetCapacity */ - switch(targetCapacity) { - /* each branch falls through to the next one */ - case 3: - *target++=(uint8_t)(diff>>16); - U_FALLTHROUGH; - case 2: - *target++=(uint8_t)(diff>>8); - U_FALLTHROUGH; - case 1: - *target++=(uint8_t)diff; - U_FALLTHROUGH; - default: - /* will never occur */ - break; - } - - /* target overflow */ - targetCapacity=0; - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - } - } else { - /* target is full */ - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - } - - /* set the converter state back into UConverter */ - cnv->fromUChar32= c<0 ? -c : 0; - cnv->fromUnicodeStatus=(uint32_t)prev; - - /* write back the updated pointers */ - pArgs->source=source; - pArgs->target=(char *)target; -} - -/* BOCU-1-to-Unicode conversion functions ----------------------------------- */ - -/** - * Function for BOCU-1 decoder; handles multi-byte lead bytes. - * - * @param b lead byte; - * BOCU1_MIN<=b<BOCU1_START_NEG_2 or BOCU1_START_POS_2<=b<BOCU1_MAX_LEAD - * @return (diff<<2)|count - */ -static inline int32_t -decodeBocu1LeadByte(int32_t b) { - int32_t diff, count; - - if(b>=BOCU1_START_NEG_2) { - /* positive difference */ - if(b<BOCU1_START_POS_3) { - /* two bytes */ - diff=((int32_t)b-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; - count=1; - } else if(b<BOCU1_START_POS_4) { - /* three bytes */ - diff=((int32_t)b-BOCU1_START_POS_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_2+1; - count=2; - } else { - /* four bytes */ - diff=BOCU1_REACH_POS_3+1; - count=3; - } - } else { - /* negative difference */ - if(b>=BOCU1_START_NEG_3) { - /* two bytes */ - diff=((int32_t)b-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; - count=1; - } else if(b>BOCU1_MIN) { - /* three bytes */ - diff=((int32_t)b-BOCU1_START_NEG_3)*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_2; - count=2; - } else { - /* four bytes */ - diff=-BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_3; - count=3; - } - } - - /* return the state for decoding the trail byte(s) */ - return (diff<<2)|count; -} - -/** - * Function for BOCU-1 decoder; handles multi-byte trail bytes. - * - * @param count number of remaining trail bytes including this one - * @param b trail byte - * @return new delta for diff including b - <0 indicates an error - * - * @see decodeBocu1 - */ -static inline int32_t -decodeBocu1TrailByte(int32_t count, int32_t b) { - if(b<=0x20) { - /* skip some C0 controls and make the trail byte range contiguous */ - b=bocu1ByteToTrail[b]; - /* b<0 for an illegal trail byte value will result in return<0 below */ -#if BOCU1_MAX_TRAIL<0xff - } else if(b>BOCU1_MAX_TRAIL) { - return -99; -#endif - } else { - b-=BOCU1_TRAIL_BYTE_OFFSET; - } - - /* add trail byte into difference and decrement count */ - if(count==1) { - return b; - } else if(count==2) { - return b*BOCU1_TRAIL_COUNT; - } else /* count==3 */ { - return b*(BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT); - } -} - -static void U_CALLCONV -_Bocu1ToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, - UErrorCode *pErrorCode) { - UConverter *cnv; - const uint8_t *source, *sourceLimit; - UChar *target; - const UChar *targetLimit; - int32_t *offsets; - - int32_t prev, count, diff, c; - - int8_t byteIndex; - uint8_t *bytes; - - int32_t sourceIndex, nextSourceIndex; - - /* set up the local pointers */ - cnv=pArgs->converter; - source=(const uint8_t *)pArgs->source; - sourceLimit=(const uint8_t *)pArgs->sourceLimit; - target=pArgs->target; - targetLimit=pArgs->targetLimit; - offsets=pArgs->offsets; - - /* get the converter state from UConverter */ - prev=(int32_t)cnv->toUnicodeStatus; - if(prev==0) { - prev=BOCU1_ASCII_PREV; - } - diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */ - count=diff&3; - diff>>=2; - - byteIndex=cnv->toULength; - bytes=cnv->toUBytes; - - /* sourceIndex=-1 if the current character began in the previous buffer */ - sourceIndex=byteIndex==0 ? 0 : -1; - nextSourceIndex=0; - - /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ - if(count>0 && byteIndex>0 && target<targetLimit) { - goto getTrail; - } - -fastSingle: - /* fast loop for single-byte differences */ - /* use count as the only loop counter variable */ - diff=(int32_t)(sourceLimit-source); - count=(int32_t)(pArgs->targetLimit-target); - if(count>diff) { - count=diff; - } - while(count>0) { - if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) { - c=prev+(c-BOCU1_MIDDLE); - if(c<0x3000) { - *target++=(UChar)c; - *offsets++=nextSourceIndex++; - prev=BOCU1_SIMPLE_PREV(c); - } else { - break; - } - } else if(c<=0x20) { - if(c!=0x20) { - prev=BOCU1_ASCII_PREV; - } - *target++=(UChar)c; - *offsets++=nextSourceIndex++; - } else { - break; - } - ++source; - --count; - } - sourceIndex=nextSourceIndex; /* wrong if offsets==NULL but does not matter */ - - /* decode a sequence of single and lead bytes */ - while(source<sourceLimit) { - if(target>=targetLimit) { - /* target is full */ - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - - ++nextSourceIndex; - c=*source++; - if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) { - /* Write a code point directly from a single-byte difference. */ - c=prev+(c-BOCU1_MIDDLE); - if(c<0x3000) { - *target++=(UChar)c; - *offsets++=sourceIndex; - prev=BOCU1_SIMPLE_PREV(c); - sourceIndex=nextSourceIndex; - goto fastSingle; - } - } else if(c<=0x20) { - /* - * Direct-encoded C0 control code or space. - * Reset prev for C0 control codes but not for space. - */ - if(c!=0x20) { - prev=BOCU1_ASCII_PREV; - } - *target++=(UChar)c; - *offsets++=sourceIndex; - sourceIndex=nextSourceIndex; - continue; - } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { - /* Optimize two-byte case. */ - if(c>=BOCU1_MIDDLE) { - diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; - } else { - diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; - } - - /* trail byte */ - ++nextSourceIndex; - c=decodeBocu1TrailByte(1, *source++); - if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) { - bytes[0]=source[-2]; - bytes[1]=source[-1]; - byteIndex=2; - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - break; - } - } else if(c==BOCU1_RESET) { - /* only reset the state, no code point */ - prev=BOCU1_ASCII_PREV; - sourceIndex=nextSourceIndex; - continue; - } else { - /* - * For multi-byte difference lead bytes, set the decoder state - * with the partial difference value from the lead byte and - * with the number of trail bytes. - */ - bytes[0]=(uint8_t)c; - byteIndex=1; - - diff=decodeBocu1LeadByte(c); - count=diff&3; - diff>>=2; -getTrail: - for(;;) { - if(source>=sourceLimit) { - goto endloop; - } - ++nextSourceIndex; - c=bytes[byteIndex++]=*source++; - - /* trail byte in any position */ - c=decodeBocu1TrailByte(count, c); - if(c<0) { - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - goto endloop; - } - - diff+=c; - if(--count==0) { - /* final trail byte, deliver a code point */ - byteIndex=0; - c=prev+diff; - if((uint32_t)c>0x10ffff) { - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - goto endloop; - } - break; - } - } - } - - /* calculate the next prev and output c */ - prev=BOCU1_PREV(c); - if(c<=0xffff) { - *target++=(UChar)c; - *offsets++=sourceIndex; - } else { - /* output surrogate pair */ - *target++=U16_LEAD(c); - if(target<targetLimit) { - *target++=U16_TRAIL(c); - *offsets++=sourceIndex; - *offsets++=sourceIndex; - } else { - /* target overflow */ - *offsets++=sourceIndex; - cnv->UCharErrorBuffer[0]=U16_TRAIL(c); - cnv->UCharErrorBufferLength=1; - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - } - sourceIndex=nextSourceIndex; - } -endloop: - - if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) { - /* set the converter state in UConverter to deal with the next character */ - cnv->toUnicodeStatus=BOCU1_ASCII_PREV; - cnv->mode=0; - } else { - /* set the converter state back into UConverter */ - cnv->toUnicodeStatus=(uint32_t)prev; - cnv->mode=(diff<<2)|count; - } - cnv->toULength=byteIndex; - - /* write back the updated pointers */ - pArgs->source=(const char *)source; - pArgs->target=target; - pArgs->offsets=offsets; - return; -} - -/* - * Identical to _Bocu1ToUnicodeWithOffsets but without offset handling. - * If a change is made in the original function, then either - * change this function the same way or - * re-copy the original function and remove the variables - * offsets, sourceIndex, and nextSourceIndex. - */ -static void U_CALLCONV -_Bocu1ToUnicode(UConverterToUnicodeArgs *pArgs, - UErrorCode *pErrorCode) { - UConverter *cnv; - const uint8_t *source, *sourceLimit; - UChar *target; - const UChar *targetLimit; - - int32_t prev, count, diff, c; - - int8_t byteIndex; - uint8_t *bytes; - - /* set up the local pointers */ - cnv=pArgs->converter; - source=(const uint8_t *)pArgs->source; - sourceLimit=(const uint8_t *)pArgs->sourceLimit; - target=pArgs->target; - targetLimit=pArgs->targetLimit; - - /* get the converter state from UConverter */ - prev=(int32_t)cnv->toUnicodeStatus; - if(prev==0) { - prev=BOCU1_ASCII_PREV; - } - diff=cnv->mode; /* mode may be set to UCNV_SI by ucnv_bld.c but then toULength==0 */ - count=diff&3; - diff>>=2; - - byteIndex=cnv->toULength; - bytes=cnv->toUBytes; - - /* conversion "loop" similar to _SCSUToUnicodeWithOffsets() */ - if(count>0 && byteIndex>0 && target<targetLimit) { - goto getTrail; - } - -fastSingle: - /* fast loop for single-byte differences */ - /* use count as the only loop counter variable */ - diff=(int32_t)(sourceLimit-source); - count=(int32_t)(pArgs->targetLimit-target); - if(count>diff) { - count=diff; - } - while(count>0) { - if(BOCU1_START_NEG_2<=(c=*source) && c<BOCU1_START_POS_2) { - c=prev+(c-BOCU1_MIDDLE); - if(c<0x3000) { - *target++=(UChar)c; - prev=BOCU1_SIMPLE_PREV(c); - } else { - break; - } - } else if(c<=0x20) { - if(c!=0x20) { - prev=BOCU1_ASCII_PREV; - } - *target++=(UChar)c; - } else { - break; - } - ++source; - --count; - } - - /* decode a sequence of single and lead bytes */ - while(source<sourceLimit) { - if(target>=targetLimit) { - /* target is full */ - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - - c=*source++; - if(BOCU1_START_NEG_2<=c && c<BOCU1_START_POS_2) { - /* Write a code point directly from a single-byte difference. */ - c=prev+(c-BOCU1_MIDDLE); - if(c<0x3000) { - *target++=(UChar)c; - prev=BOCU1_SIMPLE_PREV(c); - goto fastSingle; - } - } else if(c<=0x20) { - /* - * Direct-encoded C0 control code or space. - * Reset prev for C0 control codes but not for space. - */ - if(c!=0x20) { - prev=BOCU1_ASCII_PREV; - } - *target++=(UChar)c; - continue; - } else if(BOCU1_START_NEG_3<=c && c<BOCU1_START_POS_3 && source<sourceLimit) { - /* Optimize two-byte case. */ - if(c>=BOCU1_MIDDLE) { - diff=((int32_t)c-BOCU1_START_POS_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_POS_1+1; - } else { - diff=((int32_t)c-BOCU1_START_NEG_2)*BOCU1_TRAIL_COUNT+BOCU1_REACH_NEG_1; - } - - /* trail byte */ - c=decodeBocu1TrailByte(1, *source++); - if(c<0 || (uint32_t)(c=prev+diff+c)>0x10ffff) { - bytes[0]=source[-2]; - bytes[1]=source[-1]; - byteIndex=2; - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - break; - } - } else if(c==BOCU1_RESET) { - /* only reset the state, no code point */ - prev=BOCU1_ASCII_PREV; - continue; - } else { - /* - * For multi-byte difference lead bytes, set the decoder state - * with the partial difference value from the lead byte and - * with the number of trail bytes. - */ - bytes[0]=(uint8_t)c; - byteIndex=1; - - diff=decodeBocu1LeadByte(c); - count=diff&3; - diff>>=2; -getTrail: - for(;;) { - if(source>=sourceLimit) { - goto endloop; - } - c=bytes[byteIndex++]=*source++; - - /* trail byte in any position */ - c=decodeBocu1TrailByte(count, c); - if(c<0) { - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - goto endloop; - } - - diff+=c; - if(--count==0) { - /* final trail byte, deliver a code point */ - byteIndex=0; - c=prev+diff; - if((uint32_t)c>0x10ffff) { - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - goto endloop; - } - break; - } - } - } - - /* calculate the next prev and output c */ - prev=BOCU1_PREV(c); - if(c<=0xffff) { - *target++=(UChar)c; - } else { - /* output surrogate pair */ - *target++=U16_LEAD(c); - if(target<targetLimit) { - *target++=U16_TRAIL(c); - } else { - /* target overflow */ - cnv->UCharErrorBuffer[0]=U16_TRAIL(c); - cnv->UCharErrorBufferLength=1; - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - } - } -endloop: - - if(*pErrorCode==U_ILLEGAL_CHAR_FOUND) { - /* set the converter state in UConverter to deal with the next character */ - cnv->toUnicodeStatus=BOCU1_ASCII_PREV; - cnv->mode=0; - } else { - /* set the converter state back into UConverter */ - cnv->toUnicodeStatus=(uint32_t)prev; - cnv->mode=(diff<<2)|count; - } - cnv->toULength=byteIndex; - - /* write back the updated pointers */ - pArgs->source=(const char *)source; - pArgs->target=target; - return; -} - -/* miscellaneous ------------------------------------------------------------ */ - -static const UConverterImpl _Bocu1Impl={ - UCNV_BOCU1, - - NULL, - NULL, - - NULL, - NULL, - NULL, - - _Bocu1ToUnicode, - _Bocu1ToUnicodeWithOffsets, - _Bocu1FromUnicode, - _Bocu1FromUnicodeWithOffsets, - NULL, - - NULL, - NULL, - NULL, - NULL, - ucnv_getCompleteUnicodeSet, - - NULL, - NULL -}; - -static const UConverterStaticData _Bocu1StaticData={ - sizeof(UConverterStaticData), - "BOCU-1", - 1214, /* CCSID for BOCU-1 */ - UCNV_IBM, UCNV_BOCU1, - 1, 4, /* one UChar generates at least 1 byte and at most 4 bytes */ - { 0x1a, 0, 0, 0 }, 1, /* BOCU-1 never needs to write a subchar */ - FALSE, FALSE, - 0, - 0, - { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ -}; - -const UConverterSharedData _Bocu1Data= - UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_Bocu1StaticData, &_Bocu1Impl); - -#endif |