diff options
Diffstat (limited to 'deps/node/deps/icu-small/source/common/ucnvscsu.cpp')
-rw-r--r-- | deps/node/deps/icu-small/source/common/ucnvscsu.cpp | 2045 |
1 files changed, 0 insertions, 2045 deletions
diff --git a/deps/node/deps/icu-small/source/common/ucnvscsu.cpp b/deps/node/deps/icu-small/source/common/ucnvscsu.cpp deleted file mode 100644 index eb7b7ad5..00000000 --- a/deps/node/deps/icu-small/source/common/ucnvscsu.cpp +++ /dev/null @@ -1,2045 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -****************************************************************************** -* -* Copyright (C) 2000-2016, International Business Machines -* Corporation and others. All Rights Reserved. -* -****************************************************************************** -* file name: ucnvscsu.c -* encoding: UTF-8 -* tab size: 8 (not used) -* indentation:4 -* -* created on: 2000nov18 -* created by: Markus W. Scherer -* -* This is an implementation of the Standard Compression Scheme for Unicode -* as defined in http://www.unicode.org/unicode/reports/tr6/ . -* Reserved commands and window settings are treated as illegal sequences and -* will result in callback calls. -*/ - -#include "unicode/utypes.h" - -#if !UCONFIG_NO_CONVERSION && !UCONFIG_ONLY_HTML_CONVERSION - -#include "unicode/ucnv.h" -#include "unicode/ucnv_cb.h" -#include "unicode/utf16.h" -#include "ucnv_bld.h" -#include "ucnv_cnv.h" -#include "cmemory.h" - -/* SCSU definitions --------------------------------------------------------- */ - -/* SCSU command byte values */ -enum { - SQ0=0x01, /* Quote from window pair 0 */ - SQ7=0x08, /* Quote from window pair 7 */ - SDX=0x0B, /* Define a window as extended */ - Srs=0x0C, /* reserved */ - SQU=0x0E, /* Quote a single Unicode character */ - SCU=0x0F, /* Change to Unicode mode */ - SC0=0x10, /* Select window 0 */ - SC7=0x17, /* Select window 7 */ - SD0=0x18, /* Define and select window 0 */ - SD7=0x1F, /* Define and select window 7 */ - - UC0=0xE0, /* Select window 0 */ - UC7=0xE7, /* Select window 7 */ - UD0=0xE8, /* Define and select window 0 */ - UD7=0xEF, /* Define and select window 7 */ - UQU=0xF0, /* Quote a single Unicode character */ - UDX=0xF1, /* Define a Window as extended */ - Urs=0xF2 /* reserved */ -}; - -enum { - /* - * Unicode code points from 3400 to E000 are not adressible by - * dynamic window, since in these areas no short run alphabets are - * found. Therefore add gapOffset to all values from gapThreshold. - */ - gapThreshold=0x68, - gapOffset=0xAC00, - - /* values between reservedStart and fixedThreshold are reserved */ - reservedStart=0xA8, - - /* use table of predefined fixed offsets for values from fixedThreshold */ - fixedThreshold=0xF9 -}; - -/* constant offsets for the 8 static windows */ -static const uint32_t staticOffsets[8]={ - 0x0000, /* ASCII for quoted tags */ - 0x0080, /* Latin - 1 Supplement (for access to punctuation) */ - 0x0100, /* Latin Extended-A */ - 0x0300, /* Combining Diacritical Marks */ - 0x2000, /* General Punctuation */ - 0x2080, /* Currency Symbols */ - 0x2100, /* Letterlike Symbols and Number Forms */ - 0x3000 /* CJK Symbols and punctuation */ -}; - -/* initial offsets for the 8 dynamic (sliding) windows */ -static const uint32_t initialDynamicOffsets[8]={ - 0x0080, /* Latin-1 */ - 0x00C0, /* Latin Extended A */ - 0x0400, /* Cyrillic */ - 0x0600, /* Arabic */ - 0x0900, /* Devanagari */ - 0x3040, /* Hiragana */ - 0x30A0, /* Katakana */ - 0xFF00 /* Fullwidth ASCII */ -}; - -/* Table of fixed predefined Offsets */ -static const uint32_t fixedOffsets[]={ - /* 0xF9 */ 0x00C0, /* Latin-1 Letters + half of Latin Extended A */ - /* 0xFA */ 0x0250, /* IPA extensions */ - /* 0xFB */ 0x0370, /* Greek */ - /* 0xFC */ 0x0530, /* Armenian */ - /* 0xFD */ 0x3040, /* Hiragana */ - /* 0xFE */ 0x30A0, /* Katakana */ - /* 0xFF */ 0xFF60 /* Halfwidth Katakana */ -}; - -/* state values */ -enum { - readCommand, - quotePairOne, - quotePairTwo, - quoteOne, - definePairOne, - definePairTwo, - defineOne -}; - -typedef struct SCSUData { - /* dynamic window offsets, intitialize to default values from initialDynamicOffsets */ - uint32_t toUDynamicOffsets[8]; - uint32_t fromUDynamicOffsets[8]; - - /* state machine state - toUnicode */ - UBool toUIsSingleByteMode; - uint8_t toUState; - int8_t toUQuoteWindow, toUDynamicWindow; - uint8_t toUByteOne; - uint8_t toUPadding[3]; - - /* state machine state - fromUnicode */ - UBool fromUIsSingleByteMode; - int8_t fromUDynamicWindow; - - /* - * windowUse[] keeps track of the use of the dynamic windows: - * At nextWindowUseIndex there is the least recently used window, - * and the following windows (in a wrapping manner) are more and more - * recently used. - * At nextWindowUseIndex-1 there is the most recently used window. - */ - uint8_t locale; - int8_t nextWindowUseIndex; - int8_t windowUse[8]; -} SCSUData; - -static const int8_t initialWindowUse[8]={ 7, 0, 3, 2, 4, 5, 6, 1 }; -static const int8_t initialWindowUse_ja[8]={ 3, 2, 4, 1, 0, 7, 5, 6 }; - -enum { - lGeneric, l_ja -}; - -/* SCSU setup functions ----------------------------------------------------- */ -U_CDECL_BEGIN -static void U_CALLCONV -_SCSUReset(UConverter *cnv, UConverterResetChoice choice) { - SCSUData *scsu=(SCSUData *)cnv->extraInfo; - - if(choice<=UCNV_RESET_TO_UNICODE) { - /* reset toUnicode */ - uprv_memcpy(scsu->toUDynamicOffsets, initialDynamicOffsets, 32); - - scsu->toUIsSingleByteMode=TRUE; - scsu->toUState=readCommand; - scsu->toUQuoteWindow=scsu->toUDynamicWindow=0; - scsu->toUByteOne=0; - - cnv->toULength=0; - } - if(choice!=UCNV_RESET_TO_UNICODE) { - /* reset fromUnicode */ - uprv_memcpy(scsu->fromUDynamicOffsets, initialDynamicOffsets, 32); - - scsu->fromUIsSingleByteMode=TRUE; - scsu->fromUDynamicWindow=0; - - scsu->nextWindowUseIndex=0; - switch(scsu->locale) { - case l_ja: - uprv_memcpy(scsu->windowUse, initialWindowUse_ja, 8); - break; - default: - uprv_memcpy(scsu->windowUse, initialWindowUse, 8); - break; - } - - cnv->fromUChar32=0; - } -} - -static void U_CALLCONV -_SCSUOpen(UConverter *cnv, - UConverterLoadArgs *pArgs, - UErrorCode *pErrorCode) { - const char *locale=pArgs->locale; - if(pArgs->onlyTestIsLoadable) { - return; - } - cnv->extraInfo=uprv_malloc(sizeof(SCSUData)); - if(cnv->extraInfo!=NULL) { - if(locale!=NULL && locale[0]=='j' && locale[1]=='a' && (locale[2]==0 || locale[2]=='_')) { - ((SCSUData *)cnv->extraInfo)->locale=l_ja; - } else { - ((SCSUData *)cnv->extraInfo)->locale=lGeneric; - } - _SCSUReset(cnv, UCNV_RESET_BOTH); - } else { - *pErrorCode=U_MEMORY_ALLOCATION_ERROR; - } - - /* Set the substitution character U+fffd as a Unicode string. */ - cnv->subUChars[0]=0xfffd; - cnv->subCharLen=-1; -} - -static void U_CALLCONV -_SCSUClose(UConverter *cnv) { - if(cnv->extraInfo!=NULL) { - if(!cnv->isExtraLocal) { - uprv_free(cnv->extraInfo); - } - cnv->extraInfo=NULL; - } -} - -/* SCSU-to-Unicode conversion functions ------------------------------------- */ - -static void U_CALLCONV -_SCSUToUnicodeWithOffsets(UConverterToUnicodeArgs *pArgs, - UErrorCode *pErrorCode) { - UConverter *cnv; - SCSUData *scsu; - const uint8_t *source, *sourceLimit; - UChar *target; - const UChar *targetLimit; - int32_t *offsets; - UBool isSingleByteMode; - uint8_t state, byteOne; - int8_t quoteWindow, dynamicWindow; - - int32_t sourceIndex, nextSourceIndex; - - uint8_t b; - - /* set up the local pointers */ - cnv=pArgs->converter; - scsu=(SCSUData *)cnv->extraInfo; - - source=(const uint8_t *)pArgs->source; - sourceLimit=(const uint8_t *)pArgs->sourceLimit; - target=pArgs->target; - targetLimit=pArgs->targetLimit; - offsets=pArgs->offsets; - - /* get the state machine state */ - isSingleByteMode=scsu->toUIsSingleByteMode; - state=scsu->toUState; - quoteWindow=scsu->toUQuoteWindow; - dynamicWindow=scsu->toUDynamicWindow; - byteOne=scsu->toUByteOne; - - /* sourceIndex=-1 if the current character began in the previous buffer */ - sourceIndex=state==readCommand ? 0 : -1; - nextSourceIndex=0; - - /* - * conversion "loop" - * - * For performance, this is not a normal C loop. - * Instead, there are two code blocks for the two SCSU modes. - * The function branches to either one, and a change of the mode is done with a goto to - * the other branch. - * - * Each branch has two conventional loops: - * - a fast-path loop for the most common codes in the mode - * - a loop for all other codes in the mode - * When the fast-path runs into a code that it cannot handle, its loop ends and it - * runs into the following loop to handle the other codes. - * The end of the input or output buffer is also handled by the slower loop. - * The slow loop jumps (goto) to the fast-path loop again as soon as possible. - * - * The callback handling is done by returning with an error code. - * The conversion framework actually calls the callback function. - */ - if(isSingleByteMode) { - /* fast path for single-byte mode */ - if(state==readCommand) { -fastSingle: - while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) { - ++source; - ++nextSourceIndex; - if(b<=0x7f) { - /* write US-ASCII graphic character or DEL */ - *target++=(UChar)b; - if(offsets!=NULL) { - *offsets++=sourceIndex; - } - } else { - /* write from dynamic window */ - uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); - if(c<=0xffff) { - *target++=(UChar)c; - if(offsets!=NULL) { - *offsets++=sourceIndex; - } - } else { - /* output surrogate pair */ - *target++=(UChar)(0xd7c0+(c>>10)); - if(target<targetLimit) { - *target++=(UChar)(0xdc00|(c&0x3ff)); - if(offsets!=NULL) { - *offsets++=sourceIndex; - *offsets++=sourceIndex; - } - } else { - /* target overflow */ - if(offsets!=NULL) { - *offsets++=sourceIndex; - } - cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); - cnv->UCharErrorBufferLength=1; - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - goto endloop; - } - } - } - sourceIndex=nextSourceIndex; - } - } - - /* normal state machine for single-byte mode, minus handling for what fastSingle covers */ -singleByteMode: - while(source<sourceLimit) { - if(target>=targetLimit) { - /* target is full */ - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - b=*source++; - ++nextSourceIndex; - switch(state) { - case readCommand: - /* redundant conditions are commented out */ - /* here: b<0x20 because otherwise we would be in fastSingle */ - if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { - /* CR/LF/TAB/NUL */ - *target++=(UChar)b; - if(offsets!=NULL) { - *offsets++=sourceIndex; - } - sourceIndex=nextSourceIndex; - goto fastSingle; - } else if(SC0<=b) { - if(b<=SC7) { - dynamicWindow=(int8_t)(b-SC0); - sourceIndex=nextSourceIndex; - goto fastSingle; - } else /* if(SD0<=b && b<=SD7) */ { - dynamicWindow=(int8_t)(b-SD0); - state=defineOne; - } - } else if(/* SQ0<=b && */ b<=SQ7) { - quoteWindow=(int8_t)(b-SQ0); - state=quoteOne; - } else if(b==SDX) { - state=definePairOne; - } else if(b==SQU) { - state=quotePairOne; - } else if(b==SCU) { - sourceIndex=nextSourceIndex; - isSingleByteMode=FALSE; - goto fastUnicode; - } else /* Srs */ { - /* callback(illegal) */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - cnv->toUBytes[0]=b; - cnv->toULength=1; - goto endloop; - } - - /* store the first byte of a multibyte sequence in toUBytes[] */ - cnv->toUBytes[0]=b; - cnv->toULength=1; - break; - case quotePairOne: - byteOne=b; - cnv->toUBytes[1]=b; - cnv->toULength=2; - state=quotePairTwo; - break; - case quotePairTwo: - *target++=(UChar)((byteOne<<8)|b); - if(offsets!=NULL) { - *offsets++=sourceIndex; - } - sourceIndex=nextSourceIndex; - state=readCommand; - goto fastSingle; - case quoteOne: - if(b<0x80) { - /* all static offsets are in the BMP */ - *target++=(UChar)(staticOffsets[quoteWindow]+b); - if(offsets!=NULL) { - *offsets++=sourceIndex; - } - } else { - /* write from dynamic window */ - uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); - if(c<=0xffff) { - *target++=(UChar)c; - if(offsets!=NULL) { - *offsets++=sourceIndex; - } - } else { - /* output surrogate pair */ - *target++=(UChar)(0xd7c0+(c>>10)); - if(target<targetLimit) { - *target++=(UChar)(0xdc00|(c&0x3ff)); - if(offsets!=NULL) { - *offsets++=sourceIndex; - *offsets++=sourceIndex; - } - } else { - /* target overflow */ - if(offsets!=NULL) { - *offsets++=sourceIndex; - } - cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); - cnv->UCharErrorBufferLength=1; - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - goto endloop; - } - } - } - sourceIndex=nextSourceIndex; - state=readCommand; - goto fastSingle; - case definePairOne: - dynamicWindow=(int8_t)((b>>5)&7); - byteOne=(uint8_t)(b&0x1f); - cnv->toUBytes[1]=b; - cnv->toULength=2; - state=definePairTwo; - break; - case definePairTwo: - scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL); - sourceIndex=nextSourceIndex; - state=readCommand; - goto fastSingle; - case defineOne: - if(b==0) { - /* callback(illegal): Reserved window offset value 0 */ - cnv->toUBytes[1]=b; - cnv->toULength=2; - goto endloop; - } else if(b<gapThreshold) { - scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL; - } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) { - scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; - } else if(b>=fixedThreshold) { - scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold]; - } else { - /* callback(illegal): Reserved window offset value 0xa8..0xf8 */ - cnv->toUBytes[1]=b; - cnv->toULength=2; - goto endloop; - } - sourceIndex=nextSourceIndex; - state=readCommand; - goto fastSingle; - } - } - } else { - /* fast path for Unicode mode */ - if(state==readCommand) { -fastUnicode: - while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) { - *target++=(UChar)((b<<8)|source[1]); - if(offsets!=NULL) { - *offsets++=sourceIndex; - } - sourceIndex=nextSourceIndex; - nextSourceIndex+=2; - source+=2; - } - } - - /* normal state machine for Unicode mode */ -/* unicodeByteMode: */ - while(source<sourceLimit) { - if(target>=targetLimit) { - /* target is full */ - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - b=*source++; - ++nextSourceIndex; - switch(state) { - case readCommand: - if((uint8_t)(b-UC0)>(Urs-UC0)) { - byteOne=b; - cnv->toUBytes[0]=b; - cnv->toULength=1; - state=quotePairTwo; - } else if(/* UC0<=b && */ b<=UC7) { - dynamicWindow=(int8_t)(b-UC0); - sourceIndex=nextSourceIndex; - isSingleByteMode=TRUE; - goto fastSingle; - } else if(/* UD0<=b && */ b<=UD7) { - dynamicWindow=(int8_t)(b-UD0); - isSingleByteMode=TRUE; - cnv->toUBytes[0]=b; - cnv->toULength=1; - state=defineOne; - goto singleByteMode; - } else if(b==UDX) { - isSingleByteMode=TRUE; - cnv->toUBytes[0]=b; - cnv->toULength=1; - state=definePairOne; - goto singleByteMode; - } else if(b==UQU) { - cnv->toUBytes[0]=b; - cnv->toULength=1; - state=quotePairOne; - } else /* Urs */ { - /* callback(illegal) */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - cnv->toUBytes[0]=b; - cnv->toULength=1; - goto endloop; - } - break; - case quotePairOne: - byteOne=b; - cnv->toUBytes[1]=b; - cnv->toULength=2; - state=quotePairTwo; - break; - case quotePairTwo: - *target++=(UChar)((byteOne<<8)|b); - if(offsets!=NULL) { - *offsets++=sourceIndex; - } - sourceIndex=nextSourceIndex; - state=readCommand; - goto fastUnicode; - } - } - } -endloop: - - /* set the converter state back into UConverter */ - if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { - /* reset to deal with the next character */ - state=readCommand; - } else if(state==readCommand) { - /* not in a multi-byte sequence, reset toULength */ - cnv->toULength=0; - } - scsu->toUIsSingleByteMode=isSingleByteMode; - scsu->toUState=state; - scsu->toUQuoteWindow=quoteWindow; - scsu->toUDynamicWindow=dynamicWindow; - scsu->toUByteOne=byteOne; - - /* write back the updated pointers */ - pArgs->source=(const char *)source; - pArgs->target=target; - pArgs->offsets=offsets; - return; -} - -/* - * Identical to _SCSUToUnicodeWithOffsets but without offset handling. - * If a change is made in the original function, then either - * change this function the same way or - * re-copy the original function and remove the variables - * offsets, sourceIndex, and nextSourceIndex. - */ -static void U_CALLCONV -_SCSUToUnicode(UConverterToUnicodeArgs *pArgs, - UErrorCode *pErrorCode) { - UConverter *cnv; - SCSUData *scsu; - const uint8_t *source, *sourceLimit; - UChar *target; - const UChar *targetLimit; - UBool isSingleByteMode; - uint8_t state, byteOne; - int8_t quoteWindow, dynamicWindow; - - uint8_t b; - - /* set up the local pointers */ - cnv=pArgs->converter; - scsu=(SCSUData *)cnv->extraInfo; - - source=(const uint8_t *)pArgs->source; - sourceLimit=(const uint8_t *)pArgs->sourceLimit; - target=pArgs->target; - targetLimit=pArgs->targetLimit; - - /* get the state machine state */ - isSingleByteMode=scsu->toUIsSingleByteMode; - state=scsu->toUState; - quoteWindow=scsu->toUQuoteWindow; - dynamicWindow=scsu->toUDynamicWindow; - byteOne=scsu->toUByteOne; - - /* - * conversion "loop" - * - * For performance, this is not a normal C loop. - * Instead, there are two code blocks for the two SCSU modes. - * The function branches to either one, and a change of the mode is done with a goto to - * the other branch. - * - * Each branch has two conventional loops: - * - a fast-path loop for the most common codes in the mode - * - a loop for all other codes in the mode - * When the fast-path runs into a code that it cannot handle, its loop ends and it - * runs into the following loop to handle the other codes. - * The end of the input or output buffer is also handled by the slower loop. - * The slow loop jumps (goto) to the fast-path loop again as soon as possible. - * - * The callback handling is done by returning with an error code. - * The conversion framework actually calls the callback function. - */ - if(isSingleByteMode) { - /* fast path for single-byte mode */ - if(state==readCommand) { -fastSingle: - while(source<sourceLimit && target<targetLimit && (b=*source)>=0x20) { - ++source; - if(b<=0x7f) { - /* write US-ASCII graphic character or DEL */ - *target++=(UChar)b; - } else { - /* write from dynamic window */ - uint32_t c=scsu->toUDynamicOffsets[dynamicWindow]+(b&0x7f); - if(c<=0xffff) { - *target++=(UChar)c; - } else { - /* output surrogate pair */ - *target++=(UChar)(0xd7c0+(c>>10)); - if(target<targetLimit) { - *target++=(UChar)(0xdc00|(c&0x3ff)); - } else { - /* target overflow */ - cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); - cnv->UCharErrorBufferLength=1; - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - goto endloop; - } - } - } - } - } - - /* normal state machine for single-byte mode, minus handling for what fastSingle covers */ -singleByteMode: - while(source<sourceLimit) { - if(target>=targetLimit) { - /* target is full */ - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - b=*source++; - switch(state) { - case readCommand: - /* redundant conditions are commented out */ - /* here: b<0x20 because otherwise we would be in fastSingle */ - if((1UL<<b)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { - /* CR/LF/TAB/NUL */ - *target++=(UChar)b; - goto fastSingle; - } else if(SC0<=b) { - if(b<=SC7) { - dynamicWindow=(int8_t)(b-SC0); - goto fastSingle; - } else /* if(SD0<=b && b<=SD7) */ { - dynamicWindow=(int8_t)(b-SD0); - state=defineOne; - } - } else if(/* SQ0<=b && */ b<=SQ7) { - quoteWindow=(int8_t)(b-SQ0); - state=quoteOne; - } else if(b==SDX) { - state=definePairOne; - } else if(b==SQU) { - state=quotePairOne; - } else if(b==SCU) { - isSingleByteMode=FALSE; - goto fastUnicode; - } else /* Srs */ { - /* callback(illegal) */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - cnv->toUBytes[0]=b; - cnv->toULength=1; - goto endloop; - } - - /* store the first byte of a multibyte sequence in toUBytes[] */ - cnv->toUBytes[0]=b; - cnv->toULength=1; - break; - case quotePairOne: - byteOne=b; - cnv->toUBytes[1]=b; - cnv->toULength=2; - state=quotePairTwo; - break; - case quotePairTwo: - *target++=(UChar)((byteOne<<8)|b); - state=readCommand; - goto fastSingle; - case quoteOne: - if(b<0x80) { - /* all static offsets are in the BMP */ - *target++=(UChar)(staticOffsets[quoteWindow]+b); - } else { - /* write from dynamic window */ - uint32_t c=scsu->toUDynamicOffsets[quoteWindow]+(b&0x7f); - if(c<=0xffff) { - *target++=(UChar)c; - } else { - /* output surrogate pair */ - *target++=(UChar)(0xd7c0+(c>>10)); - if(target<targetLimit) { - *target++=(UChar)(0xdc00|(c&0x3ff)); - } else { - /* target overflow */ - cnv->UCharErrorBuffer[0]=(UChar)(0xdc00|(c&0x3ff)); - cnv->UCharErrorBufferLength=1; - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - goto endloop; - } - } - } - state=readCommand; - goto fastSingle; - case definePairOne: - dynamicWindow=(int8_t)((b>>5)&7); - byteOne=(uint8_t)(b&0x1f); - cnv->toUBytes[1]=b; - cnv->toULength=2; - state=definePairTwo; - break; - case definePairTwo: - scsu->toUDynamicOffsets[dynamicWindow]=0x10000+(byteOne<<15UL | b<<7UL); - state=readCommand; - goto fastSingle; - case defineOne: - if(b==0) { - /* callback(illegal): Reserved window offset value 0 */ - cnv->toUBytes[1]=b; - cnv->toULength=2; - goto endloop; - } else if(b<gapThreshold) { - scsu->toUDynamicOffsets[dynamicWindow]=b<<7UL; - } else if((uint8_t)(b-gapThreshold)<(reservedStart-gapThreshold)) { - scsu->toUDynamicOffsets[dynamicWindow]=(b<<7UL)+gapOffset; - } else if(b>=fixedThreshold) { - scsu->toUDynamicOffsets[dynamicWindow]=fixedOffsets[b-fixedThreshold]; - } else { - /* callback(illegal): Reserved window offset value 0xa8..0xf8 */ - cnv->toUBytes[1]=b; - cnv->toULength=2; - goto endloop; - } - state=readCommand; - goto fastSingle; - } - } - } else { - /* fast path for Unicode mode */ - if(state==readCommand) { -fastUnicode: - while(source+1<sourceLimit && target<targetLimit && (uint8_t)((b=*source)-UC0)>(Urs-UC0)) { - *target++=(UChar)((b<<8)|source[1]); - source+=2; - } - } - - /* normal state machine for Unicode mode */ -/* unicodeByteMode: */ - while(source<sourceLimit) { - if(target>=targetLimit) { - /* target is full */ - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - b=*source++; - switch(state) { - case readCommand: - if((uint8_t)(b-UC0)>(Urs-UC0)) { - byteOne=b; - cnv->toUBytes[0]=b; - cnv->toULength=1; - state=quotePairTwo; - } else if(/* UC0<=b && */ b<=UC7) { - dynamicWindow=(int8_t)(b-UC0); - isSingleByteMode=TRUE; - goto fastSingle; - } else if(/* UD0<=b && */ b<=UD7) { - dynamicWindow=(int8_t)(b-UD0); - isSingleByteMode=TRUE; - cnv->toUBytes[0]=b; - cnv->toULength=1; - state=defineOne; - goto singleByteMode; - } else if(b==UDX) { - isSingleByteMode=TRUE; - cnv->toUBytes[0]=b; - cnv->toULength=1; - state=definePairOne; - goto singleByteMode; - } else if(b==UQU) { - cnv->toUBytes[0]=b; - cnv->toULength=1; - state=quotePairOne; - } else /* Urs */ { - /* callback(illegal) */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - cnv->toUBytes[0]=b; - cnv->toULength=1; - goto endloop; - } - break; - case quotePairOne: - byteOne=b; - cnv->toUBytes[1]=b; - cnv->toULength=2; - state=quotePairTwo; - break; - case quotePairTwo: - *target++=(UChar)((byteOne<<8)|b); - state=readCommand; - goto fastUnicode; - } - } - } -endloop: - - /* set the converter state back into UConverter */ - if(U_FAILURE(*pErrorCode) && *pErrorCode!=U_BUFFER_OVERFLOW_ERROR) { - /* reset to deal with the next character */ - state=readCommand; - } else if(state==readCommand) { - /* not in a multi-byte sequence, reset toULength */ - cnv->toULength=0; - } - scsu->toUIsSingleByteMode=isSingleByteMode; - scsu->toUState=state; - scsu->toUQuoteWindow=quoteWindow; - scsu->toUDynamicWindow=dynamicWindow; - scsu->toUByteOne=byteOne; - - /* write back the updated pointers */ - pArgs->source=(const char *)source; - pArgs->target=target; - return; -} -U_CDECL_END -/* SCSU-from-Unicode conversion functions ----------------------------------- */ - -/* - * This SCSU Encoder is fairly simple but uses all SCSU commands to achieve - * reasonable results. The lookahead is minimal. - * Many cases are simple: - * A character fits directly into the current mode, a dynamic or static window, - * or is not compressible. These cases are tested first. - * Real compression heuristics are applied to the rest, in code branches for - * single/Unicode mode and BMP/supplementary code points. - * The heuristics used here are extremely simple. - */ - -/* get the number of the window that this character is in, or -1 */ -static int8_t -getWindow(const uint32_t offsets[8], uint32_t c) { - int i; - for(i=0; i<8; ++i) { - if((uint32_t)(c-offsets[i])<=0x7f) { - return (int8_t)(i); - } - } - return -1; -} - -/* is the character in the dynamic window starting at the offset, or in the direct-encoded range? */ -static UBool -isInOffsetWindowOrDirect(uint32_t offset, uint32_t c) { - return (UBool)(c<=offset+0x7f && - (c>=offset || (c<=0x7f && - (c>=0x20 || (1UL<<c)&0x2601)))); - /* binary 0010 0110 0000 0001, - check for b==0xd || b==0xa || b==9 || b==0 */ -} - -/* - * getNextDynamicWindow returns the next dynamic window to be redefined - */ -static int8_t -getNextDynamicWindow(SCSUData *scsu) { - int8_t window=scsu->windowUse[scsu->nextWindowUseIndex]; - if(++scsu->nextWindowUseIndex==8) { - scsu->nextWindowUseIndex=0; - } - return window; -} - -/* - * useDynamicWindow() adjusts - * windowUse[] and nextWindowUseIndex for the algorithm to choose - * the next dynamic window to be defined; - * a subclass may override it and provide its own algorithm. - */ -static void -useDynamicWindow(SCSUData *scsu, int8_t window) { - /* - * move the existing window, which just became the most recently used one, - * up in windowUse[] to nextWindowUseIndex-1 - */ - - /* first, find the index of the window - backwards to favor the more recently used windows */ - int i, j; - - i=scsu->nextWindowUseIndex; - do { - if(--i<0) { - i=7; - } - } while(scsu->windowUse[i]!=window); - - /* now copy each windowUse[i+1] to [i] */ - j=i+1; - if(j==8) { - j=0; - } - while(j!=scsu->nextWindowUseIndex) { - scsu->windowUse[i]=scsu->windowUse[j]; - i=j; - if(++j==8) { j=0; } - } - - /* finally, set the window into the most recently used index */ - scsu->windowUse[i]=window; -} - -/* - * calculate the offset and the code for a dynamic window that contains the character - * takes fixed offsets into account - * the offset of the window is stored in the offset variable, - * the code is returned - * - * return offset code: -1 none <=0xff code for SDn/UDn else code for SDX/UDX, subtract 0x200 to get the true code - */ -static int -getDynamicOffset(uint32_t c, uint32_t *pOffset) { - int i; - - for(i=0; i<7; ++i) { - if((uint32_t)(c-fixedOffsets[i])<=0x7f) { - *pOffset=fixedOffsets[i]; - return 0xf9+i; - } - } - - if(c<0x80) { - /* No dynamic window for US-ASCII. */ - return -1; - } else if(c<0x3400 || - (uint32_t)(c-0x10000)<(0x14000-0x10000) || - (uint32_t)(c-0x1d000)<=(0x1ffff-0x1d000) - ) { - /* This character is in a code range for a "small", i.e., reasonably windowable, script. */ - *pOffset=c&0x7fffff80; - return (int)(c>>7); - } else if(0xe000<=c && c!=0xfeff && c<0xfff0) { - /* For these characters we need to take the gapOffset into account. */ - *pOffset=c&0x7fffff80; - return (int)((c-gapOffset)>>7); - } else { - return -1; - } -} -U_CDECL_BEGIN -/* - * Idea for compression: - * - save SCSUData and other state before really starting work - * - at endloop, see if compression could be better with just unicode mode - * - don't do this if a callback has been called - * - if unicode mode would be smaller, then override the results with it - may need SCU at the beginning - * - different buffer handling! - * - * Drawback or need for corrective handling: - * it is desirable to encode U+feff as SQU fe ff for the SCSU signature, and - * it is desirable to start a document in US-ASCII/Latin-1 for as long as possible - * not only for compression but also for HTML/XML documents with following charset/encoding announcers. - * - * How to achieve both? - * - Only replace the result after an SDX or SCU? - */ - -static void U_CALLCONV -_SCSUFromUnicodeWithOffsets(UConverterFromUnicodeArgs *pArgs, - UErrorCode *pErrorCode) { - UConverter *cnv; - SCSUData *scsu; - const UChar *source, *sourceLimit; - uint8_t *target; - int32_t targetCapacity; - int32_t *offsets; - - UBool isSingleByteMode; - uint8_t dynamicWindow; - uint32_t currentOffset; - - uint32_t c, delta; - - int32_t sourceIndex, nextSourceIndex; - - int32_t length; - - /* variables for compression heuristics */ - uint32_t offset; - UChar lead, trail; - int code; - int8_t window; - - /* set up the local pointers */ - cnv=pArgs->converter; - scsu=(SCSUData *)cnv->extraInfo; - - /* set up the local pointers */ - source=pArgs->source; - sourceLimit=pArgs->sourceLimit; - target=(uint8_t *)pArgs->target; - targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); - offsets=pArgs->offsets; - - /* get the state machine state */ - isSingleByteMode=scsu->fromUIsSingleByteMode; - dynamicWindow=scsu->fromUDynamicWindow; - currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; - - c=cnv->fromUChar32; - - /* sourceIndex=-1 if the current character began in the previous buffer */ - sourceIndex= c==0 ? 0 : -1; - nextSourceIndex=0; - - /* similar conversion "loop" as in toUnicode */ -loop: - if(isSingleByteMode) { - if(c!=0 && targetCapacity>0) { - goto getTrailSingle; - } - - /* state machine for single-byte mode */ -/* singleByteMode: */ - while(source<sourceLimit) { - if(targetCapacity<=0) { - /* target is full */ - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - c=*source++; - ++nextSourceIndex; - - if((c-0x20)<=0x5f) { - /* pass US-ASCII graphic character through */ - *target++=(uint8_t)c; - if(offsets!=NULL) { - *offsets++=sourceIndex; - } - --targetCapacity; - } else if(c<0x20) { - if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { - /* CR/LF/TAB/NUL */ - *target++=(uint8_t)c; - if(offsets!=NULL) { - *offsets++=sourceIndex; - } - --targetCapacity; - } else { - /* quote C0 control character */ - c|=SQ0<<8; - length=2; - goto outputBytes; - } - } else if((delta=c-currentOffset)<=0x7f) { - /* use the current dynamic window */ - *target++=(uint8_t)(delta|0x80); - if(offsets!=NULL) { - *offsets++=sourceIndex; - } - --targetCapacity; - } else if(U16_IS_SURROGATE(c)) { - if(U16_IS_SURROGATE_LEAD(c)) { -getTrailSingle: - lead=(UChar)c; - if(source<sourceLimit) { - /* test the following code unit */ - trail=*source; - if(U16_IS_TRAIL(trail)) { - ++source; - ++nextSourceIndex; - c=U16_GET_SUPPLEMENTARY(c, trail); - /* convert this surrogate code point */ - /* exit this condition tree */ - } else { - /* this is an unmatched lead code unit (1st surrogate) */ - /* callback(illegal) */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - goto endloop; - } - } else { - /* no more input */ - break; - } - } else { - /* this is an unmatched trail code unit (2nd surrogate) */ - /* callback(illegal) */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - goto endloop; - } - - /* compress supplementary character U+10000..U+10ffff */ - if((delta=c-currentOffset)<=0x7f) { - /* use the current dynamic window */ - *target++=(uint8_t)(delta|0x80); - if(offsets!=NULL) { - *offsets++=sourceIndex; - } - --targetCapacity; - } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { - /* there is a dynamic window that contains this character, change to it */ - dynamicWindow=window; - currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; - useDynamicWindow(scsu, dynamicWindow); - c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; - length=2; - goto outputBytes; - } else if((code=getDynamicOffset(c, &offset))>=0) { - /* might check if there are more characters in this window to come */ - /* define an extended window with this character */ - code-=0x200; - dynamicWindow=getNextDynamicWindow(scsu); - currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; - useDynamicWindow(scsu, dynamicWindow); - c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; - length=4; - goto outputBytes; - } else { - /* change to Unicode mode and output this (lead, trail) pair */ - isSingleByteMode=FALSE; - *target++=(uint8_t)SCU; - if(offsets!=NULL) { - *offsets++=sourceIndex; - } - --targetCapacity; - c=((uint32_t)lead<<16)|trail; - length=4; - goto outputBytes; - } - } else if(c<0xa0) { - /* quote C1 control character */ - c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */ - length=2; - goto outputBytes; - } else if(c==0xfeff || c>=0xfff0) { - /* quote signature character=byte order mark and specials */ - c|=SQU<<16; - length=3; - goto outputBytes; - } else { - /* compress all other BMP characters */ - if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { - /* there is a window defined that contains this character - switch to it or quote from it? */ - if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) { - /* change to dynamic window */ - dynamicWindow=window; - currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; - useDynamicWindow(scsu, dynamicWindow); - c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; - length=2; - goto outputBytes; - } else { - /* quote from dynamic window */ - c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80; - length=2; - goto outputBytes; - } - } else if((window=getWindow(staticOffsets, c))>=0) { - /* quote from static window */ - c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]); - length=2; - goto outputBytes; - } else if((code=getDynamicOffset(c, &offset))>=0) { - /* define a dynamic window with this character */ - dynamicWindow=getNextDynamicWindow(scsu); - currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; - useDynamicWindow(scsu, dynamicWindow); - c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; - length=3; - goto outputBytes; - } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) && - (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400)) - ) { - /* - * this character is not compressible (a BMP ideograph or similar); - * switch to Unicode mode if this is the last character in the block - * or there is at least one more ideograph following immediately - */ - isSingleByteMode=FALSE; - c|=SCU<<16; - length=3; - goto outputBytes; - } else { - /* quote Unicode */ - c|=SQU<<16; - length=3; - goto outputBytes; - } - } - - /* normal end of conversion: prepare for a new character */ - c=0; - sourceIndex=nextSourceIndex; - } - } else { - if(c!=0 && targetCapacity>0) { - goto getTrailUnicode; - } - - /* state machine for Unicode mode */ -/* unicodeByteMode: */ - while(source<sourceLimit) { - if(targetCapacity<=0) { - /* target is full */ - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - c=*source++; - ++nextSourceIndex; - - if((uint32_t)(c-0x3400)<(0xd800-0x3400)) { - /* not compressible, write character directly */ - if(targetCapacity>=2) { - *target++=(uint8_t)(c>>8); - *target++=(uint8_t)c; - if(offsets!=NULL) { - *offsets++=sourceIndex; - *offsets++=sourceIndex; - } - targetCapacity-=2; - } else { - length=2; - goto outputBytes; - } - } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) { - /* compress BMP character if the following one is not an uncompressible ideograph */ - if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) { - if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) { - /* ASCII digit or letter */ - isSingleByteMode=TRUE; - c|=((uint32_t)(UC0+dynamicWindow)<<8)|c; - length=2; - goto outputBytes; - } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { - /* there is a dynamic window that contains this character, change to it */ - isSingleByteMode=TRUE; - dynamicWindow=window; - currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; - useDynamicWindow(scsu, dynamicWindow); - c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; - length=2; - goto outputBytes; - } else if((code=getDynamicOffset(c, &offset))>=0) { - /* define a dynamic window with this character */ - isSingleByteMode=TRUE; - dynamicWindow=getNextDynamicWindow(scsu); - currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; - useDynamicWindow(scsu, dynamicWindow); - c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; - length=3; - goto outputBytes; - } - } - - /* don't know how to compress this character, just write it directly */ - length=2; - goto outputBytes; - } else if(c<0xe000) { - /* c is a surrogate */ - if(U16_IS_SURROGATE_LEAD(c)) { -getTrailUnicode: - lead=(UChar)c; - if(source<sourceLimit) { - /* test the following code unit */ - trail=*source; - if(U16_IS_TRAIL(trail)) { - ++source; - ++nextSourceIndex; - c=U16_GET_SUPPLEMENTARY(c, trail); - /* convert this surrogate code point */ - /* exit this condition tree */ - } else { - /* this is an unmatched lead code unit (1st surrogate) */ - /* callback(illegal) */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - goto endloop; - } - } else { - /* no more input */ - break; - } - } else { - /* this is an unmatched trail code unit (2nd surrogate) */ - /* callback(illegal) */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - goto endloop; - } - - /* compress supplementary character */ - if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 && - !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400)) - ) { - /* - * there is a dynamic window that contains this character and - * the following character is not uncompressible, - * change to the window - */ - isSingleByteMode=TRUE; - dynamicWindow=window; - currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; - useDynamicWindow(scsu, dynamicWindow); - c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; - length=2; - goto outputBytes; - } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */ - (code=getDynamicOffset(c, &offset))>=0 - ) { - /* two supplementary characters in (probably) the same window - define an extended one */ - isSingleByteMode=TRUE; - code-=0x200; - dynamicWindow=getNextDynamicWindow(scsu); - currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; - useDynamicWindow(scsu, dynamicWindow); - c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; - length=4; - goto outputBytes; - } else { - /* don't know how to compress this character, just write it directly */ - c=((uint32_t)lead<<16)|trail; - length=4; - goto outputBytes; - } - } else /* 0xe000<=c<0xf300 */ { - /* quote to avoid SCSU tags */ - c|=UQU<<16; - length=3; - goto outputBytes; - } - - /* normal end of conversion: prepare for a new character */ - c=0; - sourceIndex=nextSourceIndex; - } - } -endloop: - - /* set the converter state back into UConverter */ - scsu->fromUIsSingleByteMode=isSingleByteMode; - scsu->fromUDynamicWindow=dynamicWindow; - - cnv->fromUChar32=c; - - /* write back the updated pointers */ - pArgs->source=source; - pArgs->target=(char *)target; - pArgs->offsets=offsets; - return; - -outputBytes: - /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */ - /* from the first if in the loop we know that targetCapacity>0 */ - if(length<=targetCapacity) { - if(offsets==NULL) { - switch(length) { - /* each branch falls through to the next one */ - case 4: - *target++=(uint8_t)(c>>24); - U_FALLTHROUGH; - case 3: - *target++=(uint8_t)(c>>16); - U_FALLTHROUGH; - case 2: - *target++=(uint8_t)(c>>8); - U_FALLTHROUGH; - case 1: - *target++=(uint8_t)c; - U_FALLTHROUGH; - default: - /* will never occur */ - break; - } - } else { - switch(length) { - /* each branch falls through to the next one */ - case 4: - *target++=(uint8_t)(c>>24); - *offsets++=sourceIndex; - U_FALLTHROUGH; - case 3: - *target++=(uint8_t)(c>>16); - *offsets++=sourceIndex; - U_FALLTHROUGH; - case 2: - *target++=(uint8_t)(c>>8); - *offsets++=sourceIndex; - U_FALLTHROUGH; - case 1: - *target++=(uint8_t)c; - *offsets++=sourceIndex; - U_FALLTHROUGH; - default: - /* will never occur */ - break; - } - } - targetCapacity-=length; - - /* normal end of conversion: prepare for a new character */ - c=0; - sourceIndex=nextSourceIndex; - goto loop; - } else { - uint8_t *p; - - /* - * We actually do this backwards here: - * In order to save an intermediate variable, we output - * first to the overflow buffer what does not fit into the - * regular target. - */ - /* we know that 0<=targetCapacity<length<=4 */ - /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */ - length-=targetCapacity; - p=(uint8_t *)cnv->charErrorBuffer; - switch(length) { - /* each branch falls through to the next one */ - case 4: - *p++=(uint8_t)(c>>24); - U_FALLTHROUGH; - case 3: - *p++=(uint8_t)(c>>16); - U_FALLTHROUGH; - case 2: - *p++=(uint8_t)(c>>8); - U_FALLTHROUGH; - case 1: - *p=(uint8_t)c; - U_FALLTHROUGH; - default: - /* will never occur */ - break; - } - cnv->charErrorBufferLength=(int8_t)length; - - /* now output what fits into the regular target */ - c>>=8*length; /* length was reduced by targetCapacity */ - switch(targetCapacity) { - /* each branch falls through to the next one */ - case 3: - *target++=(uint8_t)(c>>16); - if(offsets!=NULL) { - *offsets++=sourceIndex; - } - U_FALLTHROUGH; - case 2: - *target++=(uint8_t)(c>>8); - if(offsets!=NULL) { - *offsets++=sourceIndex; - } - U_FALLTHROUGH; - case 1: - *target++=(uint8_t)c; - if(offsets!=NULL) { - *offsets++=sourceIndex; - } - U_FALLTHROUGH; - default: - break; - } - - /* target overflow */ - targetCapacity=0; - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - c=0; - goto endloop; - } -} - -/* - * Identical to _SCSUFromUnicodeWithOffsets but without offset handling. - * If a change is made in the original function, then either - * change this function the same way or - * re-copy the original function and remove the variables - * offsets, sourceIndex, and nextSourceIndex. - */ -static void U_CALLCONV -_SCSUFromUnicode(UConverterFromUnicodeArgs *pArgs, - UErrorCode *pErrorCode) { - UConverter *cnv; - SCSUData *scsu; - const UChar *source, *sourceLimit; - uint8_t *target; - int32_t targetCapacity; - - UBool isSingleByteMode; - uint8_t dynamicWindow; - uint32_t currentOffset; - - uint32_t c, delta; - - int32_t length; - - /* variables for compression heuristics */ - uint32_t offset; - UChar lead, trail; - int code; - int8_t window; - - /* set up the local pointers */ - cnv=pArgs->converter; - scsu=(SCSUData *)cnv->extraInfo; - - /* set up the local pointers */ - source=pArgs->source; - sourceLimit=pArgs->sourceLimit; - target=(uint8_t *)pArgs->target; - targetCapacity=(int32_t)(pArgs->targetLimit-pArgs->target); - - /* get the state machine state */ - isSingleByteMode=scsu->fromUIsSingleByteMode; - dynamicWindow=scsu->fromUDynamicWindow; - currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; - - c=cnv->fromUChar32; - - /* similar conversion "loop" as in toUnicode */ -loop: - if(isSingleByteMode) { - if(c!=0 && targetCapacity>0) { - goto getTrailSingle; - } - - /* state machine for single-byte mode */ -/* singleByteMode: */ - while(source<sourceLimit) { - if(targetCapacity<=0) { - /* target is full */ - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - c=*source++; - - if((c-0x20)<=0x5f) { - /* pass US-ASCII graphic character through */ - *target++=(uint8_t)c; - --targetCapacity; - } else if(c<0x20) { - if((1UL<<c)&0x2601 /* binary 0010 0110 0000 0001, check for b==0xd || b==0xa || b==9 || b==0 */) { - /* CR/LF/TAB/NUL */ - *target++=(uint8_t)c; - --targetCapacity; - } else { - /* quote C0 control character */ - c|=SQ0<<8; - length=2; - goto outputBytes; - } - } else if((delta=c-currentOffset)<=0x7f) { - /* use the current dynamic window */ - *target++=(uint8_t)(delta|0x80); - --targetCapacity; - } else if(U16_IS_SURROGATE(c)) { - if(U16_IS_SURROGATE_LEAD(c)) { -getTrailSingle: - lead=(UChar)c; - if(source<sourceLimit) { - /* test the following code unit */ - trail=*source; - if(U16_IS_TRAIL(trail)) { - ++source; - c=U16_GET_SUPPLEMENTARY(c, trail); - /* convert this surrogate code point */ - /* exit this condition tree */ - } else { - /* this is an unmatched lead code unit (1st surrogate) */ - /* callback(illegal) */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - goto endloop; - } - } else { - /* no more input */ - break; - } - } else { - /* this is an unmatched trail code unit (2nd surrogate) */ - /* callback(illegal) */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - goto endloop; - } - - /* compress supplementary character U+10000..U+10ffff */ - if((delta=c-currentOffset)<=0x7f) { - /* use the current dynamic window */ - *target++=(uint8_t)(delta|0x80); - --targetCapacity; - } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { - /* there is a dynamic window that contains this character, change to it */ - dynamicWindow=window; - currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; - useDynamicWindow(scsu, dynamicWindow); - c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; - length=2; - goto outputBytes; - } else if((code=getDynamicOffset(c, &offset))>=0) { - /* might check if there are more characters in this window to come */ - /* define an extended window with this character */ - code-=0x200; - dynamicWindow=getNextDynamicWindow(scsu); - currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; - useDynamicWindow(scsu, dynamicWindow); - c=((uint32_t)SDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; - length=4; - goto outputBytes; - } else { - /* change to Unicode mode and output this (lead, trail) pair */ - isSingleByteMode=FALSE; - *target++=(uint8_t)SCU; - --targetCapacity; - c=((uint32_t)lead<<16)|trail; - length=4; - goto outputBytes; - } - } else if(c<0xa0) { - /* quote C1 control character */ - c=(c&0x7f)|(SQ0+1)<<8; /* SQ0+1==SQ1 */ - length=2; - goto outputBytes; - } else if(c==0xfeff || c>=0xfff0) { - /* quote signature character=byte order mark and specials */ - c|=SQU<<16; - length=3; - goto outputBytes; - } else { - /* compress all other BMP characters */ - if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { - /* there is a window defined that contains this character - switch to it or quote from it? */ - if(source>=sourceLimit || isInOffsetWindowOrDirect(scsu->fromUDynamicOffsets[window], *source)) { - /* change to dynamic window */ - dynamicWindow=window; - currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; - useDynamicWindow(scsu, dynamicWindow); - c=((uint32_t)(SC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; - length=2; - goto outputBytes; - } else { - /* quote from dynamic window */ - c=((uint32_t)(SQ0+window)<<8)|(c-scsu->fromUDynamicOffsets[window])|0x80; - length=2; - goto outputBytes; - } - } else if((window=getWindow(staticOffsets, c))>=0) { - /* quote from static window */ - c=((uint32_t)(SQ0+window)<<8)|(c-staticOffsets[window]); - length=2; - goto outputBytes; - } else if((code=getDynamicOffset(c, &offset))>=0) { - /* define a dynamic window with this character */ - dynamicWindow=getNextDynamicWindow(scsu); - currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; - useDynamicWindow(scsu, dynamicWindow); - c=((uint32_t)(SD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; - length=3; - goto outputBytes; - } else if((uint32_t)(c-0x3400)<(0xd800-0x3400) && - (source>=sourceLimit || (uint32_t)(*source-0x3400)<(0xd800-0x3400)) - ) { - /* - * this character is not compressible (a BMP ideograph or similar); - * switch to Unicode mode if this is the last character in the block - * or there is at least one more ideograph following immediately - */ - isSingleByteMode=FALSE; - c|=SCU<<16; - length=3; - goto outputBytes; - } else { - /* quote Unicode */ - c|=SQU<<16; - length=3; - goto outputBytes; - } - } - - /* normal end of conversion: prepare for a new character */ - c=0; - } - } else { - if(c!=0 && targetCapacity>0) { - goto getTrailUnicode; - } - - /* state machine for Unicode mode */ -/* unicodeByteMode: */ - while(source<sourceLimit) { - if(targetCapacity<=0) { - /* target is full */ - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - break; - } - c=*source++; - - if((uint32_t)(c-0x3400)<(0xd800-0x3400)) { - /* not compressible, write character directly */ - if(targetCapacity>=2) { - *target++=(uint8_t)(c>>8); - *target++=(uint8_t)c; - targetCapacity-=2; - } else { - length=2; - goto outputBytes; - } - } else if((uint32_t)(c-0x3400)>=(0xf300-0x3400) /* c<0x3400 || c>=0xf300 */) { - /* compress BMP character if the following one is not an uncompressible ideograph */ - if(!(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400))) { - if(((uint32_t)(c-0x30)<10 || (uint32_t)(c-0x61)<26 || (uint32_t)(c-0x41)<26)) { - /* ASCII digit or letter */ - isSingleByteMode=TRUE; - c|=((uint32_t)(UC0+dynamicWindow)<<8)|c; - length=2; - goto outputBytes; - } else if((window=getWindow(scsu->fromUDynamicOffsets, c))>=0) { - /* there is a dynamic window that contains this character, change to it */ - isSingleByteMode=TRUE; - dynamicWindow=window; - currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; - useDynamicWindow(scsu, dynamicWindow); - c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; - length=2; - goto outputBytes; - } else if((code=getDynamicOffset(c, &offset))>=0) { - /* define a dynamic window with this character */ - isSingleByteMode=TRUE; - dynamicWindow=getNextDynamicWindow(scsu); - currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; - useDynamicWindow(scsu, dynamicWindow); - c=((uint32_t)(UD0+dynamicWindow)<<16)|((uint32_t)code<<8)|(c-currentOffset)|0x80; - length=3; - goto outputBytes; - } - } - - /* don't know how to compress this character, just write it directly */ - length=2; - goto outputBytes; - } else if(c<0xe000) { - /* c is a surrogate */ - if(U16_IS_SURROGATE_LEAD(c)) { -getTrailUnicode: - lead=(UChar)c; - if(source<sourceLimit) { - /* test the following code unit */ - trail=*source; - if(U16_IS_TRAIL(trail)) { - ++source; - c=U16_GET_SUPPLEMENTARY(c, trail); - /* convert this surrogate code point */ - /* exit this condition tree */ - } else { - /* this is an unmatched lead code unit (1st surrogate) */ - /* callback(illegal) */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - goto endloop; - } - } else { - /* no more input */ - break; - } - } else { - /* this is an unmatched trail code unit (2nd surrogate) */ - /* callback(illegal) */ - *pErrorCode=U_ILLEGAL_CHAR_FOUND; - goto endloop; - } - - /* compress supplementary character */ - if( (window=getWindow(scsu->fromUDynamicOffsets, c))>=0 && - !(source<sourceLimit && (uint32_t)(*source-0x3400)<(0xd800-0x3400)) - ) { - /* - * there is a dynamic window that contains this character and - * the following character is not uncompressible, - * change to the window - */ - isSingleByteMode=TRUE; - dynamicWindow=window; - currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]; - useDynamicWindow(scsu, dynamicWindow); - c=((uint32_t)(UC0+dynamicWindow)<<8)|(c-currentOffset)|0x80; - length=2; - goto outputBytes; - } else if(source<sourceLimit && lead==*source && /* too lazy to check trail in same window as source[1] */ - (code=getDynamicOffset(c, &offset))>=0 - ) { - /* two supplementary characters in (probably) the same window - define an extended one */ - isSingleByteMode=TRUE; - code-=0x200; - dynamicWindow=getNextDynamicWindow(scsu); - currentOffset=scsu->fromUDynamicOffsets[dynamicWindow]=offset; - useDynamicWindow(scsu, dynamicWindow); - c=((uint32_t)UDX<<24)|((uint32_t)dynamicWindow<<21)|((uint32_t)code<<8)|(c-currentOffset)|0x80; - length=4; - goto outputBytes; - } else { - /* don't know how to compress this character, just write it directly */ - c=((uint32_t)lead<<16)|trail; - length=4; - goto outputBytes; - } - } else /* 0xe000<=c<0xf300 */ { - /* quote to avoid SCSU tags */ - c|=UQU<<16; - length=3; - goto outputBytes; - } - - /* normal end of conversion: prepare for a new character */ - c=0; - } - } -endloop: - - /* set the converter state back into UConverter */ - scsu->fromUIsSingleByteMode=isSingleByteMode; - scsu->fromUDynamicWindow=dynamicWindow; - - cnv->fromUChar32=c; - - /* write back the updated pointers */ - pArgs->source=source; - pArgs->target=(char *)target; - return; - -outputBytes: - /* write the output character bytes from c and length [code copied from ucnvmbcs.c] */ - /* from the first if in the loop we know that targetCapacity>0 */ - if(length<=targetCapacity) { - switch(length) { - /* each branch falls through to the next one */ - case 4: - *target++=(uint8_t)(c>>24); - U_FALLTHROUGH; - case 3: - *target++=(uint8_t)(c>>16); - U_FALLTHROUGH; - case 2: - *target++=(uint8_t)(c>>8); - U_FALLTHROUGH; - case 1: - *target++=(uint8_t)c; - U_FALLTHROUGH; - default: - /* will never occur */ - break; - } - targetCapacity-=length; - - /* normal end of conversion: prepare for a new character */ - c=0; - goto loop; - } else { - uint8_t *p; - - /* - * We actually do this backwards here: - * In order to save an intermediate variable, we output - * first to the overflow buffer what does not fit into the - * regular target. - */ - /* we know that 0<=targetCapacity<length<=4 */ - /* targetCapacity==0 when SCU+supplementary where SCU used up targetCapacity==1 */ - length-=targetCapacity; - p=(uint8_t *)cnv->charErrorBuffer; - switch(length) { - /* each branch falls through to the next one */ - case 4: - *p++=(uint8_t)(c>>24); - U_FALLTHROUGH; - case 3: - *p++=(uint8_t)(c>>16); - U_FALLTHROUGH; - case 2: - *p++=(uint8_t)(c>>8); - U_FALLTHROUGH; - case 1: - *p=(uint8_t)c; - U_FALLTHROUGH; - default: - /* will never occur */ - break; - } - cnv->charErrorBufferLength=(int8_t)length; - - /* now output what fits into the regular target */ - c>>=8*length; /* length was reduced by targetCapacity */ - switch(targetCapacity) { - /* each branch falls through to the next one */ - case 3: - *target++=(uint8_t)(c>>16); - U_FALLTHROUGH; - case 2: - *target++=(uint8_t)(c>>8); - U_FALLTHROUGH; - case 1: - *target++=(uint8_t)c; - U_FALLTHROUGH; - default: - break; - } - - /* target overflow */ - targetCapacity=0; - *pErrorCode=U_BUFFER_OVERFLOW_ERROR; - c=0; - goto endloop; - } -} - -/* miscellaneous ------------------------------------------------------------ */ - -static const char * U_CALLCONV -_SCSUGetName(const UConverter *cnv) { - SCSUData *scsu=(SCSUData *)cnv->extraInfo; - - switch(scsu->locale) { - case l_ja: - return "SCSU,locale=ja"; - default: - return "SCSU"; - } -} - -/* structure for SafeClone calculations */ -struct cloneSCSUStruct -{ - UConverter cnv; - SCSUData mydata; -}; - -static UConverter * U_CALLCONV -_SCSUSafeClone(const UConverter *cnv, - void *stackBuffer, - int32_t *pBufferSize, - UErrorCode *status) -{ - struct cloneSCSUStruct * localClone; - int32_t bufferSizeNeeded = sizeof(struct cloneSCSUStruct); - - if (U_FAILURE(*status)){ - return 0; - } - - if (*pBufferSize == 0){ /* 'preflighting' request - set needed size into *pBufferSize */ - *pBufferSize = bufferSizeNeeded; - return 0; - } - - localClone = (struct cloneSCSUStruct *)stackBuffer; - /* ucnv.c/ucnv_safeClone() copied the main UConverter already */ - - uprv_memcpy(&localClone->mydata, cnv->extraInfo, sizeof(SCSUData)); - localClone->cnv.extraInfo = &localClone->mydata; - localClone->cnv.isExtraLocal = TRUE; - - return &localClone->cnv; -} -U_CDECL_END - -static const UConverterImpl _SCSUImpl={ - UCNV_SCSU, - - NULL, - NULL, - - _SCSUOpen, - _SCSUClose, - _SCSUReset, - - _SCSUToUnicode, - _SCSUToUnicodeWithOffsets, - _SCSUFromUnicode, - _SCSUFromUnicodeWithOffsets, - NULL, - - NULL, - _SCSUGetName, - NULL, - _SCSUSafeClone, - ucnv_getCompleteUnicodeSet, - NULL, - NULL -}; - -static const UConverterStaticData _SCSUStaticData={ - sizeof(UConverterStaticData), - "SCSU", - 1212, /* CCSID for SCSU */ - UCNV_IBM, UCNV_SCSU, - 1, 3, /* one UChar generates at least 1 byte and at most 3 bytes */ - /* - * The subchar here is ignored because _SCSUOpen() sets U+fffd as a Unicode - * substitution string. - */ - { 0x0e, 0xff, 0xfd, 0 }, 3, - FALSE, FALSE, - 0, - 0, - { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 } /* reserved */ -}; - -const UConverterSharedData _SCSUData= - UCNV_IMMUTABLE_SHARED_DATA_INITIALIZER(&_SCSUStaticData, &_SCSUImpl); - -#endif |