diff options
Diffstat (limited to 'deps/node/deps/icu-small/source/common/unames.cpp')
-rw-r--r-- | deps/node/deps/icu-small/source/common/unames.cpp | 2101 |
1 files changed, 0 insertions, 2101 deletions
diff --git a/deps/node/deps/icu-small/source/common/unames.cpp b/deps/node/deps/icu-small/source/common/unames.cpp deleted file mode 100644 index 5f752b0d..00000000 --- a/deps/node/deps/icu-small/source/common/unames.cpp +++ /dev/null @@ -1,2101 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -****************************************************************************** -* -* Copyright (C) 1999-2014, International Business Machines -* Corporation and others. All Rights Reserved. -* -****************************************************************************** -* file name: unames.c -* encoding: UTF-8 -* tab size: 8 (not used) -* indentation:4 -* -* created on: 1999oct04 -* created by: Markus W. Scherer -*/ - -#include "unicode/utypes.h" -#include "unicode/putil.h" -#include "unicode/uchar.h" -#include "unicode/udata.h" -#include "unicode/utf.h" -#include "unicode/utf16.h" -#include "uassert.h" -#include "ustr_imp.h" -#include "umutex.h" -#include "cmemory.h" -#include "cstring.h" -#include "ucln_cmn.h" -#include "udataswp.h" -#include "uprops.h" - -U_NAMESPACE_BEGIN - -/* prototypes ------------------------------------------------------------- */ - -static const char DATA_NAME[] = "unames"; -static const char DATA_TYPE[] = "icu"; - -#define GROUP_SHIFT 5 -#define LINES_PER_GROUP (1L<<GROUP_SHIFT) -#define GROUP_MASK (LINES_PER_GROUP-1) - -/* - * This struct was replaced by explicitly accessing equivalent - * fields from triples of uint16_t. - * The Group struct was padded to 8 bytes on compilers for early ARM CPUs, - * which broke the assumption that sizeof(Group)==6 and that the ++ operator - * would advance by 6 bytes (3 uint16_t). - * - * We can't just change the data structure because it's loaded from a data file, - * and we don't want to make it less compact, so we changed the access code. - * - * For details see ICU tickets 6331 and 6008. -typedef struct { - uint16_t groupMSB, - offsetHigh, offsetLow; / * avoid padding * / -} Group; - */ -enum { - GROUP_MSB, - GROUP_OFFSET_HIGH, - GROUP_OFFSET_LOW, - GROUP_LENGTH -}; - -/* - * Get the 32-bit group offset. - * @param group (const uint16_t *) pointer to a Group triple of uint16_t - * @return group offset (int32_t) - */ -#define GET_GROUP_OFFSET(group) ((int32_t)(group)[GROUP_OFFSET_HIGH]<<16|(group)[GROUP_OFFSET_LOW]) - -#define NEXT_GROUP(group) ((group)+GROUP_LENGTH) -#define PREV_GROUP(group) ((group)-GROUP_LENGTH) - -typedef struct { - uint32_t start, end; - uint8_t type, variant; - uint16_t size; -} AlgorithmicRange; - -typedef struct { - uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset; -} UCharNames; - -/* - * Get the groups table from a UCharNames struct. - * The groups table consists of one uint16_t groupCount followed by - * groupCount groups. Each group is a triple of uint16_t, see GROUP_LENGTH - * and the comment for the old struct Group above. - * - * @param names (const UCharNames *) pointer to the UCharNames indexes - * @return (const uint16_t *) pointer to the groups table - */ -#define GET_GROUPS(names) (const uint16_t *)((const char *)names+names->groupsOffset) - -typedef struct { - const char *otherName; - UChar32 code; -} FindName; - -#define DO_FIND_NAME NULL - -static UDataMemory *uCharNamesData=NULL; -static UCharNames *uCharNames=NULL; -static icu::UInitOnce gCharNamesInitOnce = U_INITONCE_INITIALIZER; - -/* - * Maximum length of character names (regular & 1.0). - */ -static int32_t gMaxNameLength=0; - -/* - * Set of chars used in character names (regular & 1.0). - * Chars are platform-dependent (can be EBCDIC). - */ -static uint32_t gNameSet[8]={ 0 }; - -#define U_NONCHARACTER_CODE_POINT U_CHAR_CATEGORY_COUNT -#define U_LEAD_SURROGATE U_CHAR_CATEGORY_COUNT + 1 -#define U_TRAIL_SURROGATE U_CHAR_CATEGORY_COUNT + 2 - -#define U_CHAR_EXTENDED_CATEGORY_COUNT (U_CHAR_CATEGORY_COUNT + 3) - -static const char * const charCatNames[U_CHAR_EXTENDED_CATEGORY_COUNT] = { - "unassigned", - "uppercase letter", - "lowercase letter", - "titlecase letter", - "modifier letter", - "other letter", - "non spacing mark", - "enclosing mark", - "combining spacing mark", - "decimal digit number", - "letter number", - "other number", - "space separator", - "line separator", - "paragraph separator", - "control", - "format", - "private use area", - "surrogate", - "dash punctuation", - "start punctuation", - "end punctuation", - "connector punctuation", - "other punctuation", - "math symbol", - "currency symbol", - "modifier symbol", - "other symbol", - "initial punctuation", - "final punctuation", - "noncharacter", - "lead surrogate", - "trail surrogate" -}; - -/* implementation ----------------------------------------------------------- */ - -static UBool U_CALLCONV unames_cleanup(void) -{ - if(uCharNamesData) { - udata_close(uCharNamesData); - uCharNamesData = NULL; - } - if(uCharNames) { - uCharNames = NULL; - } - gCharNamesInitOnce.reset(); - gMaxNameLength=0; - return TRUE; -} - -static UBool U_CALLCONV -isAcceptable(void * /*context*/, - const char * /*type*/, const char * /*name*/, - const UDataInfo *pInfo) { - return (UBool)( - pInfo->size>=20 && - pInfo->isBigEndian==U_IS_BIG_ENDIAN && - pInfo->charsetFamily==U_CHARSET_FAMILY && - pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */ - pInfo->dataFormat[1]==0x6e && - pInfo->dataFormat[2]==0x61 && - pInfo->dataFormat[3]==0x6d && - pInfo->formatVersion[0]==1); -} - -static void U_CALLCONV -loadCharNames(UErrorCode &status) { - U_ASSERT(uCharNamesData == NULL); - U_ASSERT(uCharNames == NULL); - - uCharNamesData = udata_openChoice(NULL, DATA_TYPE, DATA_NAME, isAcceptable, NULL, &status); - if(U_FAILURE(status)) { - uCharNamesData = NULL; - } else { - uCharNames = (UCharNames *)udata_getMemory(uCharNamesData); - } - ucln_common_registerCleanup(UCLN_COMMON_UNAMES, unames_cleanup); -} - - -static UBool -isDataLoaded(UErrorCode *pErrorCode) { - umtx_initOnce(gCharNamesInitOnce, &loadCharNames, *pErrorCode); - return U_SUCCESS(*pErrorCode); -} - -#define WRITE_CHAR(buffer, bufferLength, bufferPos, c) { \ - if((bufferLength)>0) { \ - *(buffer)++=c; \ - --(bufferLength); \ - } \ - ++(bufferPos); \ -} - -#define U_ISO_COMMENT U_CHAR_NAME_CHOICE_COUNT - -/* - * Important: expandName() and compareName() are almost the same - - * apply fixes to both. - * - * UnicodeData.txt uses ';' as a field separator, so no - * field can contain ';' as part of its contents. - * In unames.dat, it is marked as token[';']==-1 only if the - * semicolon is used in the data file - which is iff we - * have Unicode 1.0 names or ISO comments or aliases. - * So, it will be token[';']==-1 if we store U1.0 names/ISO comments/aliases - * although we know that it will never be part of a name. - */ -static uint16_t -expandName(UCharNames *names, - const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice, - char *buffer, uint16_t bufferLength) { - uint16_t *tokens=(uint16_t *)names+8; - uint16_t token, tokenCount=*tokens++, bufferPos=0; - uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset; - uint8_t c; - - if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { - /* - * skip the modern name if it is not requested _and_ - * if the semicolon byte value is a character, not a token number - */ - if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { - int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice; - do { - while(nameLength>0) { - --nameLength; - if(*name++==';') { - break; - } - } - } while(--fieldIndex>0); - } else { - /* - * the semicolon byte value is a token number, therefore - * only modern names are stored in unames.dat and there is no - * such requested alternate name here - */ - nameLength=0; - } - } - - /* write each letter directly, and write a token word per token */ - while(nameLength>0) { - --nameLength; - c=*name++; - - if(c>=tokenCount) { - if(c!=';') { - /* implicit letter */ - WRITE_CHAR(buffer, bufferLength, bufferPos, c); - } else { - /* finished */ - break; - } - } else { - token=tokens[c]; - if(token==(uint16_t)(-2)) { - /* this is a lead byte for a double-byte token */ - token=tokens[c<<8|*name++]; - --nameLength; - } - if(token==(uint16_t)(-1)) { - if(c!=';') { - /* explicit letter */ - WRITE_CHAR(buffer, bufferLength, bufferPos, c); - } else { - /* stop, but skip the semicolon if we are seeking - extended names and there was no 2.0 name but there - is a 1.0 name. */ - if(!bufferPos && nameChoice == U_EXTENDED_CHAR_NAME) { - if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { - continue; - } - } - /* finished */ - break; - } - } else { - /* write token word */ - uint8_t *tokenString=tokenStrings+token; - while((c=*tokenString++)!=0) { - WRITE_CHAR(buffer, bufferLength, bufferPos, c); - } - } - } - } - - /* zero-terminate */ - if(bufferLength>0) { - *buffer=0; - } - - return bufferPos; -} - -/* - * compareName() is almost the same as expandName() except that it compares - * the currently expanded name to an input name. - * It returns the match/no match result as soon as possible. - */ -static UBool -compareName(UCharNames *names, - const uint8_t *name, uint16_t nameLength, UCharNameChoice nameChoice, - const char *otherName) { - uint16_t *tokens=(uint16_t *)names+8; - uint16_t token, tokenCount=*tokens++; - uint8_t *tokenStrings=(uint8_t *)names+names->tokenStringOffset; - uint8_t c; - const char *origOtherName = otherName; - - if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { - /* - * skip the modern name if it is not requested _and_ - * if the semicolon byte value is a character, not a token number - */ - if((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { - int fieldIndex= nameChoice==U_ISO_COMMENT ? 2 : nameChoice; - do { - while(nameLength>0) { - --nameLength; - if(*name++==';') { - break; - } - } - } while(--fieldIndex>0); - } else { - /* - * the semicolon byte value is a token number, therefore - * only modern names are stored in unames.dat and there is no - * such requested alternate name here - */ - nameLength=0; - } - } - - /* compare each letter directly, and compare a token word per token */ - while(nameLength>0) { - --nameLength; - c=*name++; - - if(c>=tokenCount) { - if(c!=';') { - /* implicit letter */ - if((char)c!=*otherName++) { - return FALSE; - } - } else { - /* finished */ - break; - } - } else { - token=tokens[c]; - if(token==(uint16_t)(-2)) { - /* this is a lead byte for a double-byte token */ - token=tokens[c<<8|*name++]; - --nameLength; - } - if(token==(uint16_t)(-1)) { - if(c!=';') { - /* explicit letter */ - if((char)c!=*otherName++) { - return FALSE; - } - } else { - /* stop, but skip the semicolon if we are seeking - extended names and there was no 2.0 name but there - is a 1.0 name. */ - if(otherName == origOtherName && nameChoice == U_EXTENDED_CHAR_NAME) { - if ((uint8_t)';'>=tokenCount || tokens[(uint8_t)';']==(uint16_t)(-1)) { - continue; - } - } - /* finished */ - break; - } - } else { - /* write token word */ - uint8_t *tokenString=tokenStrings+token; - while((c=*tokenString++)!=0) { - if((char)c!=*otherName++) { - return FALSE; - } - } - } - } - } - - /* complete match? */ - return (UBool)(*otherName==0); -} - -static uint8_t getCharCat(UChar32 cp) { - uint8_t cat; - - if (U_IS_UNICODE_NONCHAR(cp)) { - return U_NONCHARACTER_CODE_POINT; - } - - if ((cat = u_charType(cp)) == U_SURROGATE) { - cat = U_IS_LEAD(cp) ? U_LEAD_SURROGATE : U_TRAIL_SURROGATE; - } - - return cat; -} - -static const char *getCharCatName(UChar32 cp) { - uint8_t cat = getCharCat(cp); - - /* Return unknown if the table of names above is not up to - date. */ - - if (cat >= UPRV_LENGTHOF(charCatNames)) { - return "unknown"; - } else { - return charCatNames[cat]; - } -} - -static uint16_t getExtName(uint32_t code, char *buffer, uint16_t bufferLength) { - const char *catname = getCharCatName(code); - uint16_t length = 0; - - UChar32 cp; - int ndigits, i; - - WRITE_CHAR(buffer, bufferLength, length, '<'); - while (catname[length - 1]) { - WRITE_CHAR(buffer, bufferLength, length, catname[length - 1]); - } - WRITE_CHAR(buffer, bufferLength, length, '-'); - for (cp = code, ndigits = 0; cp; ++ndigits, cp >>= 4) - ; - if (ndigits < 4) - ndigits = 4; - for (cp = code, i = ndigits; (cp || i > 0) && bufferLength; cp >>= 4, bufferLength--) { - uint8_t v = (uint8_t)(cp & 0xf); - buffer[--i] = (v < 10 ? '0' + v : 'A' + v - 10); - } - buffer += ndigits; - length += static_cast<uint16_t>(ndigits); - WRITE_CHAR(buffer, bufferLength, length, '>'); - - return length; -} - -/* - * getGroup() does a binary search for the group that contains the - * Unicode code point "code". - * The return value is always a valid Group* that may contain "code" - * or else is the highest group before "code". - * If the lowest group is after "code", then that one is returned. - */ -static const uint16_t * -getGroup(UCharNames *names, uint32_t code) { - const uint16_t *groups=GET_GROUPS(names); - uint16_t groupMSB=(uint16_t)(code>>GROUP_SHIFT), - start=0, - limit=*groups++, - number; - - /* binary search for the group of names that contains the one for code */ - while(start<limit-1) { - number=(uint16_t)((start+limit)/2); - if(groupMSB<groups[number*GROUP_LENGTH+GROUP_MSB]) { - limit=number; - } else { - start=number; - } - } - - /* return this regardless of whether it is an exact match */ - return groups+start*GROUP_LENGTH; -} - -/* - * expandGroupLengths() reads a block of compressed lengths of 32 strings and - * expands them into offsets and lengths for each string. - * Lengths are stored with a variable-width encoding in consecutive nibbles: - * If a nibble<0xc, then it is the length itself (0=empty string). - * If a nibble>=0xc, then it forms a length value with the following nibble. - * Calculation see below. - * The offsets and lengths arrays must be at least 33 (one more) long because - * there is no check here at the end if the last nibble is still used. - */ -static const uint8_t * -expandGroupLengths(const uint8_t *s, - uint16_t offsets[LINES_PER_GROUP+1], uint16_t lengths[LINES_PER_GROUP+1]) { - /* read the lengths of the 32 strings in this group and get each string's offset */ - uint16_t i=0, offset=0, length=0; - uint8_t lengthByte; - - /* all 32 lengths must be read to get the offset of the first group string */ - while(i<LINES_PER_GROUP) { - lengthByte=*s++; - - /* read even nibble - MSBs of lengthByte */ - if(length>=12) { - /* double-nibble length spread across two bytes */ - length=(uint16_t)(((length&0x3)<<4|lengthByte>>4)+12); - lengthByte&=0xf; - } else if((lengthByte /* &0xf0 */)>=0xc0) { - /* double-nibble length spread across this one byte */ - length=(uint16_t)((lengthByte&0x3f)+12); - } else { - /* single-nibble length in MSBs */ - length=(uint16_t)(lengthByte>>4); - lengthByte&=0xf; - } - - *offsets++=offset; - *lengths++=length; - - offset+=length; - ++i; - - /* read odd nibble - LSBs of lengthByte */ - if((lengthByte&0xf0)==0) { - /* this nibble was not consumed for a double-nibble length above */ - length=lengthByte; - if(length<12) { - /* single-nibble length in LSBs */ - *offsets++=offset; - *lengths++=length; - - offset+=length; - ++i; - } - } else { - length=0; /* prevent double-nibble detection in the next iteration */ - } - } - - /* now, s is at the first group string */ - return s; -} - -static uint16_t -expandGroupName(UCharNames *names, const uint16_t *group, - uint16_t lineNumber, UCharNameChoice nameChoice, - char *buffer, uint16_t bufferLength) { - uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; - const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group); - s=expandGroupLengths(s, offsets, lengths); - return expandName(names, s+offsets[lineNumber], lengths[lineNumber], nameChoice, - buffer, bufferLength); -} - -static uint16_t -getName(UCharNames *names, uint32_t code, UCharNameChoice nameChoice, - char *buffer, uint16_t bufferLength) { - const uint16_t *group=getGroup(names, code); - if((uint16_t)(code>>GROUP_SHIFT)==group[GROUP_MSB]) { - return expandGroupName(names, group, (uint16_t)(code&GROUP_MASK), nameChoice, - buffer, bufferLength); - } else { - /* group not found */ - /* zero-terminate */ - if(bufferLength>0) { - *buffer=0; - } - return 0; - } -} - -/* - * enumGroupNames() enumerates all the names in a 32-group - * and either calls the enumerator function or finds a given input name. - */ -static UBool -enumGroupNames(UCharNames *names, const uint16_t *group, - UChar32 start, UChar32 end, - UEnumCharNamesFn *fn, void *context, - UCharNameChoice nameChoice) { - uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; - const uint8_t *s=(uint8_t *)names+names->groupStringOffset+GET_GROUP_OFFSET(group); - - s=expandGroupLengths(s, offsets, lengths); - if(fn!=DO_FIND_NAME) { - char buffer[200]; - uint16_t length; - - while(start<=end) { - length=expandName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, buffer, sizeof(buffer)); - if (!length && nameChoice == U_EXTENDED_CHAR_NAME) { - buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0; - } - /* here, we assume that the buffer is large enough */ - if(length>0) { - if(!fn(context, start, nameChoice, buffer, length)) { - return FALSE; - } - } - ++start; - } - } else { - const char *otherName=((FindName *)context)->otherName; - while(start<=end) { - if(compareName(names, s+offsets[start&GROUP_MASK], lengths[start&GROUP_MASK], nameChoice, otherName)) { - ((FindName *)context)->code=start; - return FALSE; - } - ++start; - } - } - return TRUE; -} - -/* - * enumExtNames enumerate extended names. - * It only needs to do it if it is called with a real function and not - * with the dummy DO_FIND_NAME, because u_charFromName() does a check - * for extended names by itself. - */ -static UBool -enumExtNames(UChar32 start, UChar32 end, - UEnumCharNamesFn *fn, void *context) -{ - if(fn!=DO_FIND_NAME) { - char buffer[200]; - uint16_t length; - - while(start<=end) { - buffer[length = getExtName(start, buffer, sizeof(buffer))] = 0; - /* here, we assume that the buffer is large enough */ - if(length>0) { - if(!fn(context, start, U_EXTENDED_CHAR_NAME, buffer, length)) { - return FALSE; - } - } - ++start; - } - } - - return TRUE; -} - -static UBool -enumNames(UCharNames *names, - UChar32 start, UChar32 limit, - UEnumCharNamesFn *fn, void *context, - UCharNameChoice nameChoice) { - uint16_t startGroupMSB, endGroupMSB, groupCount; - const uint16_t *group, *groupLimit; - - startGroupMSB=(uint16_t)(start>>GROUP_SHIFT); - endGroupMSB=(uint16_t)((limit-1)>>GROUP_SHIFT); - - /* find the group that contains start, or the highest before it */ - group=getGroup(names, start); - - if(startGroupMSB<group[GROUP_MSB] && nameChoice==U_EXTENDED_CHAR_NAME) { - /* enumerate synthetic names between start and the group start */ - UChar32 extLimit=((UChar32)group[GROUP_MSB]<<GROUP_SHIFT); - if(extLimit>limit) { - extLimit=limit; - } - if(!enumExtNames(start, extLimit-1, fn, context)) { - return FALSE; - } - start=extLimit; - } - - if(startGroupMSB==endGroupMSB) { - if(startGroupMSB==group[GROUP_MSB]) { - /* if start and limit-1 are in the same group, then enumerate only in that one */ - return enumGroupNames(names, group, start, limit-1, fn, context, nameChoice); - } - } else { - const uint16_t *groups=GET_GROUPS(names); - groupCount=*groups++; - groupLimit=groups+groupCount*GROUP_LENGTH; - - if(startGroupMSB==group[GROUP_MSB]) { - /* enumerate characters in the partial start group */ - if((start&GROUP_MASK)!=0) { - if(!enumGroupNames(names, group, - start, ((UChar32)startGroupMSB<<GROUP_SHIFT)+LINES_PER_GROUP-1, - fn, context, nameChoice)) { - return FALSE; - } - group=NEXT_GROUP(group); /* continue with the next group */ - } - } else if(startGroupMSB>group[GROUP_MSB]) { - /* make sure that we start enumerating with the first group after start */ - const uint16_t *nextGroup=NEXT_GROUP(group); - if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > startGroupMSB && nameChoice == U_EXTENDED_CHAR_NAME) { - UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT; - if (end > limit) { - end = limit; - } - if (!enumExtNames(start, end - 1, fn, context)) { - return FALSE; - } - } - group=nextGroup; - } - - /* enumerate entire groups between the start- and end-groups */ - while(group<groupLimit && group[GROUP_MSB]<endGroupMSB) { - const uint16_t *nextGroup; - start=(UChar32)group[GROUP_MSB]<<GROUP_SHIFT; - if(!enumGroupNames(names, group, start, start+LINES_PER_GROUP-1, fn, context, nameChoice)) { - return FALSE; - } - nextGroup=NEXT_GROUP(group); - if (nextGroup < groupLimit && nextGroup[GROUP_MSB] > group[GROUP_MSB] + 1 && nameChoice == U_EXTENDED_CHAR_NAME) { - UChar32 end = nextGroup[GROUP_MSB] << GROUP_SHIFT; - if (end > limit) { - end = limit; - } - if (!enumExtNames((group[GROUP_MSB] + 1) << GROUP_SHIFT, end - 1, fn, context)) { - return FALSE; - } - } - group=nextGroup; - } - - /* enumerate within the end group (group[GROUP_MSB]==endGroupMSB) */ - if(group<groupLimit && group[GROUP_MSB]==endGroupMSB) { - return enumGroupNames(names, group, (limit-1)&~GROUP_MASK, limit-1, fn, context, nameChoice); - } else if (nameChoice == U_EXTENDED_CHAR_NAME && group == groupLimit) { - UChar32 next = (PREV_GROUP(group)[GROUP_MSB] + 1) << GROUP_SHIFT; - if (next > start) { - start = next; - } - } else { - return TRUE; - } - } - - /* we have not found a group, which means everything is made of - extended names. */ - if (nameChoice == U_EXTENDED_CHAR_NAME) { - if (limit > UCHAR_MAX_VALUE + 1) { - limit = UCHAR_MAX_VALUE + 1; - } - return enumExtNames(start, limit - 1, fn, context); - } - - return TRUE; -} - -static uint16_t -writeFactorSuffix(const uint16_t *factors, uint16_t count, - const char *s, /* suffix elements */ - uint32_t code, - uint16_t indexes[8], /* output fields from here */ - const char *elementBases[8], const char *elements[8], - char *buffer, uint16_t bufferLength) { - uint16_t i, factor, bufferPos=0; - char c; - - /* write elements according to the factors */ - - /* - * the factorized elements are determined by modulo arithmetic - * with the factors of this algorithm - * - * note that for fewer operations, count is decremented here - */ - --count; - for(i=count; i>0; --i) { - factor=factors[i]; - indexes[i]=(uint16_t)(code%factor); - code/=factor; - } - /* - * we don't need to calculate the last modulus because start<=code<=end - * guarantees here that code<=factors[0] - */ - indexes[0]=(uint16_t)code; - - /* write each element */ - for(;;) { - if(elementBases!=NULL) { - *elementBases++=s; - } - - /* skip indexes[i] strings */ - factor=indexes[i]; - while(factor>0) { - while(*s++!=0) {} - --factor; - } - if(elements!=NULL) { - *elements++=s; - } - - /* write element */ - while((c=*s++)!=0) { - WRITE_CHAR(buffer, bufferLength, bufferPos, c); - } - - /* we do not need to perform the rest of this loop for i==count - break here */ - if(i>=count) { - break; - } - - /* skip the rest of the strings for this factors[i] */ - factor=(uint16_t)(factors[i]-indexes[i]-1); - while(factor>0) { - while(*s++!=0) {} - --factor; - } - - ++i; - } - - /* zero-terminate */ - if(bufferLength>0) { - *buffer=0; - } - - return bufferPos; -} - -/* - * Important: - * Parts of findAlgName() are almost the same as some of getAlgName(). - * Fixes must be applied to both. - */ -static uint16_t -getAlgName(AlgorithmicRange *range, uint32_t code, UCharNameChoice nameChoice, - char *buffer, uint16_t bufferLength) { - uint16_t bufferPos=0; - - /* Only the normative character name can be algorithmic. */ - if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { - /* zero-terminate */ - if(bufferLength>0) { - *buffer=0; - } - return 0; - } - - switch(range->type) { - case 0: { - /* name = prefix hex-digits */ - const char *s=(const char *)(range+1); - char c; - - uint16_t i, count; - - /* copy prefix */ - while((c=*s++)!=0) { - WRITE_CHAR(buffer, bufferLength, bufferPos, c); - } - - /* write hexadecimal code point value */ - count=range->variant; - - /* zero-terminate */ - if(count<bufferLength) { - buffer[count]=0; - } - - for(i=count; i>0;) { - if(--i<bufferLength) { - c=(char)(code&0xf); - if(c<10) { - c+='0'; - } else { - c+='A'-10; - } - buffer[i]=c; - } - code>>=4; - } - - bufferPos+=count; - break; - } - case 1: { - /* name = prefix factorized-elements */ - uint16_t indexes[8]; - const uint16_t *factors=(const uint16_t *)(range+1); - uint16_t count=range->variant; - const char *s=(const char *)(factors+count); - char c; - - /* copy prefix */ - while((c=*s++)!=0) { - WRITE_CHAR(buffer, bufferLength, bufferPos, c); - } - - bufferPos+=writeFactorSuffix(factors, count, - s, code-range->start, indexes, NULL, NULL, buffer, bufferLength); - break; - } - default: - /* undefined type */ - /* zero-terminate */ - if(bufferLength>0) { - *buffer=0; - } - break; - } - - return bufferPos; -} - -/* - * Important: enumAlgNames() and findAlgName() are almost the same. - * Any fix must be applied to both. - */ -static UBool -enumAlgNames(AlgorithmicRange *range, - UChar32 start, UChar32 limit, - UEnumCharNamesFn *fn, void *context, - UCharNameChoice nameChoice) { - char buffer[200]; - uint16_t length; - - if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { - return TRUE; - } - - switch(range->type) { - case 0: { - char *s, *end; - char c; - - /* get the full name of the start character */ - length=getAlgName(range, (uint32_t)start, nameChoice, buffer, sizeof(buffer)); - if(length<=0) { - return TRUE; - } - - /* call the enumerator function with this first character */ - if(!fn(context, start, nameChoice, buffer, length)) { - return FALSE; - } - - /* go to the end of the name; all these names have the same length */ - end=buffer; - while(*end!=0) { - ++end; - } - - /* enumerate the rest of the names */ - while(++start<limit) { - /* increment the hexadecimal number on a character-basis */ - s=end; - for (;;) { - c=*--s; - if(('0'<=c && c<'9') || ('A'<=c && c<'F')) { - *s=(char)(c+1); - break; - } else if(c=='9') { - *s='A'; - break; - } else if(c=='F') { - *s='0'; - } - } - - if(!fn(context, start, nameChoice, buffer, length)) { - return FALSE; - } - } - break; - } - case 1: { - uint16_t indexes[8]; - const char *elementBases[8], *elements[8]; - const uint16_t *factors=(const uint16_t *)(range+1); - uint16_t count=range->variant; - const char *s=(const char *)(factors+count); - char *suffix, *t; - uint16_t prefixLength, i, idx; - - char c; - - /* name = prefix factorized-elements */ - - /* copy prefix */ - suffix=buffer; - prefixLength=0; - while((c=*s++)!=0) { - *suffix++=c; - ++prefixLength; - } - - /* append the suffix of the start character */ - length=(uint16_t)(prefixLength+writeFactorSuffix(factors, count, - s, (uint32_t)start-range->start, - indexes, elementBases, elements, - suffix, (uint16_t)(sizeof(buffer)-prefixLength))); - - /* call the enumerator function with this first character */ - if(!fn(context, start, nameChoice, buffer, length)) { - return FALSE; - } - - /* enumerate the rest of the names */ - while(++start<limit) { - /* increment the indexes in lexical order bound by the factors */ - i=count; - for (;;) { - idx=(uint16_t)(indexes[--i]+1); - if(idx<factors[i]) { - /* skip one index and its element string */ - indexes[i]=idx; - s=elements[i]; - while(*s++!=0) { - } - elements[i]=s; - break; - } else { - /* reset this index to 0 and its element string to the first one */ - indexes[i]=0; - elements[i]=elementBases[i]; - } - } - - /* to make matters a little easier, just append all elements to the suffix */ - t=suffix; - length=prefixLength; - for(i=0; i<count; ++i) { - s=elements[i]; - while((c=*s++)!=0) { - *t++=c; - ++length; - } - } - /* zero-terminate */ - *t=0; - - if(!fn(context, start, nameChoice, buffer, length)) { - return FALSE; - } - } - break; - } - default: - /* undefined type */ - break; - } - - return TRUE; -} - -/* - * findAlgName() is almost the same as enumAlgNames() except that it - * returns the code point for a name if it fits into the range. - * It returns 0xffff otherwise. - */ -static UChar32 -findAlgName(AlgorithmicRange *range, UCharNameChoice nameChoice, const char *otherName) { - UChar32 code; - - if(nameChoice!=U_UNICODE_CHAR_NAME && nameChoice!=U_EXTENDED_CHAR_NAME) { - return 0xffff; - } - - switch(range->type) { - case 0: { - /* name = prefix hex-digits */ - const char *s=(const char *)(range+1); - char c; - - uint16_t i, count; - - /* compare prefix */ - while((c=*s++)!=0) { - if((char)c!=*otherName++) { - return 0xffff; - } - } - - /* read hexadecimal code point value */ - count=range->variant; - code=0; - for(i=0; i<count; ++i) { - c=*otherName++; - if('0'<=c && c<='9') { - code=(code<<4)|(c-'0'); - } else if('A'<=c && c<='F') { - code=(code<<4)|(c-'A'+10); - } else { - return 0xffff; - } - } - - /* does it fit into the range? */ - if(*otherName==0 && range->start<=(uint32_t)code && (uint32_t)code<=range->end) { - return code; - } - break; - } - case 1: { - char buffer[64]; - uint16_t indexes[8]; - const char *elementBases[8], *elements[8]; - const uint16_t *factors=(const uint16_t *)(range+1); - uint16_t count=range->variant; - const char *s=(const char *)(factors+count), *t; - UChar32 start, limit; - uint16_t i, idx; - - char c; - - /* name = prefix factorized-elements */ - - /* compare prefix */ - while((c=*s++)!=0) { - if((char)c!=*otherName++) { - return 0xffff; - } - } - - start=(UChar32)range->start; - limit=(UChar32)(range->end+1); - - /* initialize the suffix elements for enumeration; indexes should all be set to 0 */ - writeFactorSuffix(factors, count, s, 0, - indexes, elementBases, elements, buffer, sizeof(buffer)); - - /* compare the first suffix */ - if(0==uprv_strcmp(otherName, buffer)) { - return start; - } - - /* enumerate and compare the rest of the suffixes */ - while(++start<limit) { - /* increment the indexes in lexical order bound by the factors */ - i=count; - for (;;) { - idx=(uint16_t)(indexes[--i]+1); - if(idx<factors[i]) { - /* skip one index and its element string */ - indexes[i]=idx; - s=elements[i]; - while(*s++!=0) {} - elements[i]=s; - break; - } else { - /* reset this index to 0 and its element string to the first one */ - indexes[i]=0; - elements[i]=elementBases[i]; - } - } - - /* to make matters a little easier, just compare all elements of the suffix */ - t=otherName; - for(i=0; i<count; ++i) { - s=elements[i]; - while((c=*s++)!=0) { - if(c!=*t++) { - s=""; /* does not match */ - i=99; - } - } - } - if(i<99 && *t==0) { - return start; - } - } - break; - } - default: - /* undefined type */ - break; - } - - return 0xffff; -} - -/* sets of name characters, maximum name lengths ---------------------------- */ - -#define SET_ADD(set, c) ((set)[(uint8_t)c>>5]|=((uint32_t)1<<((uint8_t)c&0x1f))) -#define SET_CONTAINS(set, c) (((set)[(uint8_t)c>>5]&((uint32_t)1<<((uint8_t)c&0x1f)))!=0) - -static int32_t -calcStringSetLength(uint32_t set[8], const char *s) { - int32_t length=0; - char c; - - while((c=*s++)!=0) { - SET_ADD(set, c); - ++length; - } - return length; -} - -static int32_t -calcAlgNameSetsLengths(int32_t maxNameLength) { - AlgorithmicRange *range; - uint32_t *p; - uint32_t rangeCount; - int32_t length; - - /* enumerate algorithmic ranges */ - p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); - rangeCount=*p; - range=(AlgorithmicRange *)(p+1); - while(rangeCount>0) { - switch(range->type) { - case 0: - /* name = prefix + (range->variant times) hex-digits */ - /* prefix */ - length=calcStringSetLength(gNameSet, (const char *)(range+1))+range->variant; - if(length>maxNameLength) { - maxNameLength=length; - } - break; - case 1: { - /* name = prefix factorized-elements */ - const uint16_t *factors=(const uint16_t *)(range+1); - const char *s; - int32_t i, count=range->variant, factor, factorLength, maxFactorLength; - - /* prefix length */ - s=(const char *)(factors+count); - length=calcStringSetLength(gNameSet, s); - s+=length+1; /* start of factor suffixes */ - - /* get the set and maximum factor suffix length for each factor */ - for(i=0; i<count; ++i) { - maxFactorLength=0; - for(factor=factors[i]; factor>0; --factor) { - factorLength=calcStringSetLength(gNameSet, s); - s+=factorLength+1; - if(factorLength>maxFactorLength) { - maxFactorLength=factorLength; - } - } - length+=maxFactorLength; - } - - if(length>maxNameLength) { - maxNameLength=length; - } - break; - } - default: - /* unknown type */ - break; - } - - range=(AlgorithmicRange *)((uint8_t *)range+range->size); - --rangeCount; - } - return maxNameLength; -} - -static int32_t -calcExtNameSetsLengths(int32_t maxNameLength) { - int32_t i, length; - - for(i=0; i<UPRV_LENGTHOF(charCatNames); ++i) { - /* - * for each category, count the length of the category name - * plus 9= - * 2 for <> - * 1 for - - * 6 for most hex digits per code point - */ - length=9+calcStringSetLength(gNameSet, charCatNames[i]); - if(length>maxNameLength) { - maxNameLength=length; - } - } - return maxNameLength; -} - -static int32_t -calcNameSetLength(const uint16_t *tokens, uint16_t tokenCount, const uint8_t *tokenStrings, int8_t *tokenLengths, - uint32_t set[8], - const uint8_t **pLine, const uint8_t *lineLimit) { - const uint8_t *line=*pLine; - int32_t length=0, tokenLength; - uint16_t c, token; - - while(line!=lineLimit && (c=*line++)!=(uint8_t)';') { - if(c>=tokenCount) { - /* implicit letter */ - SET_ADD(set, c); - ++length; - } else { - token=tokens[c]; - if(token==(uint16_t)(-2)) { - /* this is a lead byte for a double-byte token */ - c=c<<8|*line++; - token=tokens[c]; - } - if(token==(uint16_t)(-1)) { - /* explicit letter */ - SET_ADD(set, c); - ++length; - } else { - /* count token word */ - if(tokenLengths!=NULL) { - /* use cached token length */ - tokenLength=tokenLengths[c]; - if(tokenLength==0) { - tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token); - tokenLengths[c]=(int8_t)tokenLength; - } - } else { - tokenLength=calcStringSetLength(set, (const char *)tokenStrings+token); - } - length+=tokenLength; - } - } - } - - *pLine=line; - return length; -} - -static void -calcGroupNameSetsLengths(int32_t maxNameLength) { - uint16_t offsets[LINES_PER_GROUP+2], lengths[LINES_PER_GROUP+2]; - - uint16_t *tokens=(uint16_t *)uCharNames+8; - uint16_t tokenCount=*tokens++; - uint8_t *tokenStrings=(uint8_t *)uCharNames+uCharNames->tokenStringOffset; - - int8_t *tokenLengths; - - const uint16_t *group; - const uint8_t *s, *line, *lineLimit; - - int32_t groupCount, lineNumber, length; - - tokenLengths=(int8_t *)uprv_malloc(tokenCount); - if(tokenLengths!=NULL) { - uprv_memset(tokenLengths, 0, tokenCount); - } - - group=GET_GROUPS(uCharNames); - groupCount=*group++; - - /* enumerate all groups */ - while(groupCount>0) { - s=(uint8_t *)uCharNames+uCharNames->groupStringOffset+GET_GROUP_OFFSET(group); - s=expandGroupLengths(s, offsets, lengths); - - /* enumerate all lines in each group */ - for(lineNumber=0; lineNumber<LINES_PER_GROUP; ++lineNumber) { - line=s+offsets[lineNumber]; - length=lengths[lineNumber]; - if(length==0) { - continue; - } - - lineLimit=line+length; - - /* read regular name */ - length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit); - if(length>maxNameLength) { - maxNameLength=length; - } - if(line==lineLimit) { - continue; - } - - /* read Unicode 1.0 name */ - length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gNameSet, &line, lineLimit); - if(length>maxNameLength) { - maxNameLength=length; - } - if(line==lineLimit) { - continue; - } - - /* read ISO comment */ - /*length=calcNameSetLength(tokens, tokenCount, tokenStrings, tokenLengths, gISOCommentSet, &line, lineLimit);*/ - } - - group=NEXT_GROUP(group); - --groupCount; - } - - if(tokenLengths!=NULL) { - uprv_free(tokenLengths); - } - - /* set gMax... - name length last for threading */ - gMaxNameLength=maxNameLength; -} - -static UBool -calcNameSetsLengths(UErrorCode *pErrorCode) { - static const char extChars[]="0123456789ABCDEF<>-"; - int32_t i, maxNameLength; - - if(gMaxNameLength!=0) { - return TRUE; - } - - if(!isDataLoaded(pErrorCode)) { - return FALSE; - } - - /* set hex digits, used in various names, and <>-, used in extended names */ - for(i=0; i<(int32_t)sizeof(extChars)-1; ++i) { - SET_ADD(gNameSet, extChars[i]); - } - - /* set sets and lengths from algorithmic names */ - maxNameLength=calcAlgNameSetsLengths(0); - - /* set sets and lengths from extended names */ - maxNameLength=calcExtNameSetsLengths(maxNameLength); - - /* set sets and lengths from group names, set global maximum values */ - calcGroupNameSetsLengths(maxNameLength); - - return TRUE; -} - -U_NAMESPACE_END - -/* public API --------------------------------------------------------------- */ - -U_NAMESPACE_USE - -U_CAPI int32_t U_EXPORT2 -u_charName(UChar32 code, UCharNameChoice nameChoice, - char *buffer, int32_t bufferLength, - UErrorCode *pErrorCode) { - AlgorithmicRange *algRange; - uint32_t *p; - uint32_t i; - int32_t length; - - /* check the argument values */ - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return 0; - } else if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || - bufferLength<0 || (bufferLength>0 && buffer==NULL) - ) { - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - - if((uint32_t)code>UCHAR_MAX_VALUE || !isDataLoaded(pErrorCode)) { - return u_terminateChars(buffer, bufferLength, 0, pErrorCode); - } - - length=0; - - /* try algorithmic names first */ - p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); - i=*p; - algRange=(AlgorithmicRange *)(p+1); - while(i>0) { - if(algRange->start<=(uint32_t)code && (uint32_t)code<=algRange->end) { - length=getAlgName(algRange, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength); - break; - } - algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); - --i; - } - - if(i==0) { - if (nameChoice == U_EXTENDED_CHAR_NAME) { - length = getName(uCharNames, (uint32_t )code, U_EXTENDED_CHAR_NAME, buffer, (uint16_t) bufferLength); - if (!length) { - /* extended character name */ - length = getExtName((uint32_t) code, buffer, (uint16_t) bufferLength); - } - } else { - /* normal character name */ - length=getName(uCharNames, (uint32_t)code, nameChoice, buffer, (uint16_t)bufferLength); - } - } - - return u_terminateChars(buffer, bufferLength, length, pErrorCode); -} - -U_CAPI int32_t U_EXPORT2 -u_getISOComment(UChar32 /*c*/, - char *dest, int32_t destCapacity, - UErrorCode *pErrorCode) { - /* check the argument values */ - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return 0; - } else if(destCapacity<0 || (destCapacity>0 && dest==NULL)) { - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - - return u_terminateChars(dest, destCapacity, 0, pErrorCode); -} - -U_CAPI UChar32 U_EXPORT2 -u_charFromName(UCharNameChoice nameChoice, - const char *name, - UErrorCode *pErrorCode) { - char upper[120], lower[120]; - FindName findName; - AlgorithmicRange *algRange; - uint32_t *p; - uint32_t i; - UChar32 cp = 0; - char c0; - UChar32 error = 0xffff; /* Undefined, but use this for backwards compatibility. */ - - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return error; - } - - if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || name==NULL || *name==0) { - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; - return error; - } - - if(!isDataLoaded(pErrorCode)) { - return error; - } - - /* construct the uppercase and lowercase of the name first */ - for(i=0; i<sizeof(upper); ++i) { - if((c0=*name++)!=0) { - upper[i]=uprv_toupper(c0); - lower[i]=uprv_tolower(c0); - } else { - upper[i]=lower[i]=0; - break; - } - } - if(i==sizeof(upper)) { - /* name too long, there is no such character */ - *pErrorCode = U_ILLEGAL_CHAR_FOUND; - return error; - } - // i==strlen(name)==strlen(lower)==strlen(upper) - - /* try extended names first */ - if (lower[0] == '<') { - if (nameChoice == U_EXTENDED_CHAR_NAME) { - // Parse a string like "<category-HHHH>" where HHHH is a hex code point. - if (lower[--i] == '>' && i >= 3 && lower[--i] != '-') { - while (i >= 3 && lower[--i] != '-') {} - - if (i >= 2 && lower[i] == '-') { - uint32_t cIdx; - - lower[i] = 0; - - for (++i; lower[i] != '>'; ++i) { - if (lower[i] >= '0' && lower[i] <= '9') { - cp = (cp << 4) + lower[i] - '0'; - } else if (lower[i] >= 'a' && lower[i] <= 'f') { - cp = (cp << 4) + lower[i] - 'a' + 10; - } else { - *pErrorCode = U_ILLEGAL_CHAR_FOUND; - return error; - } - } - - /* Now validate the category name. - We could use a binary search, or a trie, if - we really wanted to. */ - - for (lower[i] = 0, cIdx = 0; cIdx < UPRV_LENGTHOF(charCatNames); ++cIdx) { - - if (!uprv_strcmp(lower + 1, charCatNames[cIdx])) { - if (getCharCat(cp) == cIdx) { - return cp; - } - break; - } - } - } - } - } - - *pErrorCode = U_ILLEGAL_CHAR_FOUND; - return error; - } - - /* try algorithmic names now */ - p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); - i=*p; - algRange=(AlgorithmicRange *)(p+1); - while(i>0) { - if((cp=findAlgName(algRange, nameChoice, upper))!=0xffff) { - return cp; - } - algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); - --i; - } - - /* normal character name */ - findName.otherName=upper; - findName.code=error; - enumNames(uCharNames, 0, UCHAR_MAX_VALUE + 1, DO_FIND_NAME, &findName, nameChoice); - if (findName.code == error) { - *pErrorCode = U_ILLEGAL_CHAR_FOUND; - } - return findName.code; -} - -U_CAPI void U_EXPORT2 -u_enumCharNames(UChar32 start, UChar32 limit, - UEnumCharNamesFn *fn, - void *context, - UCharNameChoice nameChoice, - UErrorCode *pErrorCode) { - AlgorithmicRange *algRange; - uint32_t *p; - uint32_t i; - - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return; - } - - if(nameChoice>=U_CHAR_NAME_CHOICE_COUNT || fn==NULL) { - *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; - return; - } - - if((uint32_t) limit > UCHAR_MAX_VALUE + 1) { - limit = UCHAR_MAX_VALUE + 1; - } - if((uint32_t)start>=(uint32_t)limit) { - return; - } - - if(!isDataLoaded(pErrorCode)) { - return; - } - - /* interleave the data-driven ones with the algorithmic ones */ - /* iterate over all algorithmic ranges; assume that they are in ascending order */ - p=(uint32_t *)((uint8_t *)uCharNames+uCharNames->algNamesOffset); - i=*p; - algRange=(AlgorithmicRange *)(p+1); - while(i>0) { - /* enumerate the character names before the current algorithmic range */ - /* here: start<limit */ - if((uint32_t)start<algRange->start) { - if((uint32_t)limit<=algRange->start) { - enumNames(uCharNames, start, limit, fn, context, nameChoice); - return; - } - if(!enumNames(uCharNames, start, (UChar32)algRange->start, fn, context, nameChoice)) { - return; - } - start=(UChar32)algRange->start; - } - /* enumerate the character names in the current algorithmic range */ - /* here: algRange->start<=start<limit */ - if((uint32_t)start<=algRange->end) { - if((uint32_t)limit<=(algRange->end+1)) { - enumAlgNames(algRange, start, limit, fn, context, nameChoice); - return; - } - if(!enumAlgNames(algRange, start, (UChar32)algRange->end+1, fn, context, nameChoice)) { - return; - } - start=(UChar32)algRange->end+1; - } - /* continue to the next algorithmic range (here: start<limit) */ - algRange=(AlgorithmicRange *)((uint8_t *)algRange+algRange->size); - --i; - } - /* enumerate the character names after the last algorithmic range */ - enumNames(uCharNames, start, limit, fn, context, nameChoice); -} - -U_CAPI int32_t U_EXPORT2 -uprv_getMaxCharNameLength() { - UErrorCode errorCode=U_ZERO_ERROR; - if(calcNameSetsLengths(&errorCode)) { - return gMaxNameLength; - } else { - return 0; - } -} - -/** - * Converts the char set cset into a Unicode set uset. - * @param cset Set of 256 bit flags corresponding to a set of chars. - * @param uset USet to receive characters. Existing contents are deleted. - */ -static void -charSetToUSet(uint32_t cset[8], const USetAdder *sa) { - UChar us[256]; - char cs[256]; - - int32_t i, length; - UErrorCode errorCode; - - errorCode=U_ZERO_ERROR; - - if(!calcNameSetsLengths(&errorCode)) { - return; - } - - /* build a char string with all chars that are used in character names */ - length=0; - for(i=0; i<256; ++i) { - if(SET_CONTAINS(cset, i)) { - cs[length++]=(char)i; - } - } - - /* convert the char string to a UChar string */ - u_charsToUChars(cs, us, length); - - /* add each UChar to the USet */ - for(i=0; i<length; ++i) { - if(us[i]!=0 || cs[i]==0) { /* non-invariant chars become (UChar)0 */ - sa->add(sa->set, us[i]); - } - } -} - -/** - * Fills set with characters that are used in Unicode character names. - * @param set USet to receive characters. - */ -U_CAPI void U_EXPORT2 -uprv_getCharNameCharacters(const USetAdder *sa) { - charSetToUSet(gNameSet, sa); -} - -/* data swapping ------------------------------------------------------------ */ - -/* - * The token table contains non-negative entries for token bytes, - * and -1 for bytes that represent themselves in the data file's charset. - * -2 entries are used for lead bytes. - * - * Direct bytes (-1 entries) must be translated from the input charset family - * to the output charset family. - * makeTokenMap() writes a permutation mapping for this. - * Use it once for single-/lead-byte tokens and once more for all trail byte - * tokens. (';' is an unused trail byte marked with -1.) - */ -static void -makeTokenMap(const UDataSwapper *ds, - int16_t tokens[], uint16_t tokenCount, - uint8_t map[256], - UErrorCode *pErrorCode) { - UBool usedOutChar[256]; - uint16_t i, j; - uint8_t c1, c2; - - if(U_FAILURE(*pErrorCode)) { - return; - } - - if(ds->inCharset==ds->outCharset) { - /* Same charset family: identity permutation */ - for(i=0; i<256; ++i) { - map[i]=(uint8_t)i; - } - } else { - uprv_memset(map, 0, 256); - uprv_memset(usedOutChar, 0, 256); - - if(tokenCount>256) { - tokenCount=256; - } - - /* set the direct bytes (byte 0 always maps to itself) */ - for(i=1; i<tokenCount; ++i) { - if(tokens[i]==-1) { - /* convert the direct byte character */ - c1=(uint8_t)i; - ds->swapInvChars(ds, &c1, 1, &c2, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - udata_printError(ds, "unames/makeTokenMap() finds variant character 0x%02x used (input charset family %d)\n", - i, ds->inCharset); - return; - } - - /* enter the converted character into the map and mark it used */ - map[c1]=c2; - usedOutChar[c2]=TRUE; - } - } - - /* set the mappings for the rest of the permutation */ - for(i=j=1; i<tokenCount; ++i) { - /* set mappings that were not set for direct bytes */ - if(map[i]==0) { - /* set an output byte value that was not used as an output byte above */ - while(usedOutChar[j]) { - ++j; - } - map[i]=(uint8_t)j++; - } - } - - /* - * leave mappings at tokenCount and above unset if tokenCount<256 - * because they won't be used - */ - } -} - -U_CAPI int32_t U_EXPORT2 -uchar_swapNames(const UDataSwapper *ds, - const void *inData, int32_t length, void *outData, - UErrorCode *pErrorCode) { - const UDataInfo *pInfo; - int32_t headerSize; - - const uint8_t *inBytes; - uint8_t *outBytes; - - uint32_t tokenStringOffset, groupsOffset, groupStringOffset, algNamesOffset, - offset, i, count, stringsCount; - - const AlgorithmicRange *inRange; - AlgorithmicRange *outRange; - - /* udata_swapDataHeader checks the arguments */ - headerSize=udata_swapDataHeader(ds, inData, length, outData, pErrorCode); - if(pErrorCode==NULL || U_FAILURE(*pErrorCode)) { - return 0; - } - - /* check data format and format version */ - pInfo=(const UDataInfo *)((const char *)inData+4); - if(!( - pInfo->dataFormat[0]==0x75 && /* dataFormat="unam" */ - pInfo->dataFormat[1]==0x6e && - pInfo->dataFormat[2]==0x61 && - pInfo->dataFormat[3]==0x6d && - pInfo->formatVersion[0]==1 - )) { - udata_printError(ds, "uchar_swapNames(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as unames.icu\n", - pInfo->dataFormat[0], pInfo->dataFormat[1], - pInfo->dataFormat[2], pInfo->dataFormat[3], - pInfo->formatVersion[0]); - *pErrorCode=U_UNSUPPORTED_ERROR; - return 0; - } - - inBytes=(const uint8_t *)inData+headerSize; - outBytes=(uint8_t *)outData+headerSize; - if(length<0) { - algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3]); - } else { - length-=headerSize; - if( length<20 || - (uint32_t)length<(algNamesOffset=ds->readUInt32(((const uint32_t *)inBytes)[3])) - ) { - udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu\n", - length); - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0; - } - } - - if(length<0) { - /* preflighting: iterate through algorithmic ranges */ - offset=algNamesOffset; - count=ds->readUInt32(*((const uint32_t *)(inBytes+offset))); - offset+=4; - - for(i=0; i<count; ++i) { - inRange=(const AlgorithmicRange *)(inBytes+offset); - offset+=ds->readUInt16(inRange->size); - } - } else { - /* swap data */ - const uint16_t *p; - uint16_t *q, *temp; - - int16_t tokens[512]; - uint16_t tokenCount; - - uint8_t map[256], trailMap[256]; - - /* copy the data for inaccessible bytes */ - if(inBytes!=outBytes) { - uprv_memcpy(outBytes, inBytes, length); - } - - /* the initial 4 offsets first */ - tokenStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[0]); - groupsOffset=ds->readUInt32(((const uint32_t *)inBytes)[1]); - groupStringOffset=ds->readUInt32(((const uint32_t *)inBytes)[2]); - ds->swapArray32(ds, inBytes, 16, outBytes, pErrorCode); - - /* - * now the tokens table - * it needs to be permutated along with the compressed name strings - */ - p=(const uint16_t *)(inBytes+16); - q=(uint16_t *)(outBytes+16); - - /* read and swap the tokenCount */ - tokenCount=ds->readUInt16(*p); - ds->swapArray16(ds, p, 2, q, pErrorCode); - ++p; - ++q; - - /* read the first 512 tokens and make the token maps */ - if(tokenCount<=512) { - count=tokenCount; - } else { - count=512; - } - for(i=0; i<count; ++i) { - tokens[i]=udata_readInt16(ds, p[i]); - } - for(; i<512; ++i) { - tokens[i]=0; /* fill the rest of the tokens array if tokenCount<512 */ - } - makeTokenMap(ds, tokens, tokenCount, map, pErrorCode); - makeTokenMap(ds, tokens+256, (uint16_t)(tokenCount>256 ? tokenCount-256 : 0), trailMap, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - return 0; - } - - /* - * swap and permutate the tokens - * go through a temporary array to support in-place swapping - */ - temp=(uint16_t *)uprv_malloc(tokenCount*2); - if(temp==NULL) { - udata_printError(ds, "out of memory swapping %u unames.icu tokens\n", - tokenCount); - *pErrorCode=U_MEMORY_ALLOCATION_ERROR; - return 0; - } - - /* swap and permutate single-/lead-byte tokens */ - for(i=0; i<tokenCount && i<256; ++i) { - ds->swapArray16(ds, p+i, 2, temp+map[i], pErrorCode); - } - - /* swap and permutate trail-byte tokens */ - for(; i<tokenCount; ++i) { - ds->swapArray16(ds, p+i, 2, temp+(i&0xffffff00)+trailMap[i&0xff], pErrorCode); - } - - /* copy the result into the output and free the temporary array */ - uprv_memcpy(q, temp, tokenCount*2); - uprv_free(temp); - - /* - * swap the token strings but not a possible padding byte after - * the terminating NUL of the last string - */ - udata_swapInvStringBlock(ds, inBytes+tokenStringOffset, (int32_t)(groupsOffset-tokenStringOffset), - outBytes+tokenStringOffset, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - udata_printError(ds, "uchar_swapNames(token strings) failed\n"); - return 0; - } - - /* swap the group table */ - count=ds->readUInt16(*((const uint16_t *)(inBytes+groupsOffset))); - ds->swapArray16(ds, inBytes+groupsOffset, (int32_t)((1+count*3)*2), - outBytes+groupsOffset, pErrorCode); - - /* - * swap the group strings - * swap the string bytes but not the nibble-encoded string lengths - */ - if(ds->inCharset!=ds->outCharset) { - uint16_t offsets[LINES_PER_GROUP+1], lengths[LINES_PER_GROUP+1]; - - const uint8_t *inStrings, *nextInStrings; - uint8_t *outStrings; - - uint8_t c; - - inStrings=inBytes+groupStringOffset; - outStrings=outBytes+groupStringOffset; - - stringsCount=algNamesOffset-groupStringOffset; - - /* iterate through string groups until only a few padding bytes are left */ - while(stringsCount>32) { - nextInStrings=expandGroupLengths(inStrings, offsets, lengths); - - /* move past the length bytes */ - stringsCount-=(uint32_t)(nextInStrings-inStrings); - outStrings+=nextInStrings-inStrings; - inStrings=nextInStrings; - - count=offsets[31]+lengths[31]; /* total number of string bytes in this group */ - stringsCount-=count; - - /* swap the string bytes using map[] and trailMap[] */ - while(count>0) { - c=*inStrings++; - *outStrings++=map[c]; - if(tokens[c]!=-2) { - --count; - } else { - /* token lead byte: swap the trail byte, too */ - *outStrings++=trailMap[*inStrings++]; - count-=2; - } - } - } - } - - /* swap the algorithmic ranges */ - offset=algNamesOffset; - count=ds->readUInt32(*((const uint32_t *)(inBytes+offset))); - ds->swapArray32(ds, inBytes+offset, 4, outBytes+offset, pErrorCode); - offset+=4; - - for(i=0; i<count; ++i) { - if(offset>(uint32_t)length) { - udata_printError(ds, "uchar_swapNames(): too few bytes (%d after header) for unames.icu algorithmic range %u\n", - length, i); - *pErrorCode=U_INDEX_OUTOFBOUNDS_ERROR; - return 0; - } - - inRange=(const AlgorithmicRange *)(inBytes+offset); - outRange=(AlgorithmicRange *)(outBytes+offset); - offset+=ds->readUInt16(inRange->size); - - ds->swapArray32(ds, inRange, 8, outRange, pErrorCode); - ds->swapArray16(ds, &inRange->size, 2, &outRange->size, pErrorCode); - switch(inRange->type) { - case 0: - /* swap prefix string */ - ds->swapInvChars(ds, inRange+1, (int32_t)uprv_strlen((const char *)(inRange+1)), - outRange+1, pErrorCode); - if(U_FAILURE(*pErrorCode)) { - udata_printError(ds, "uchar_swapNames(prefix string of algorithmic range %u) failed\n", - i); - return 0; - } - break; - case 1: - { - /* swap factors and the prefix and factor strings */ - uint32_t factorsCount; - - factorsCount=inRange->variant; - p=(const uint16_t *)(inRange+1); - q=(uint16_t *)(outRange+1); - ds->swapArray16(ds, p, (int32_t)(factorsCount*2), q, pErrorCode); - - /* swap the strings, up to the last terminating NUL */ - p+=factorsCount; - q+=factorsCount; - stringsCount=(uint32_t)((inBytes+offset)-(const uint8_t *)p); - while(stringsCount>0 && ((const uint8_t *)p)[stringsCount-1]!=0) { - --stringsCount; - } - ds->swapInvChars(ds, p, (int32_t)stringsCount, q, pErrorCode); - } - break; - default: - udata_printError(ds, "uchar_swapNames(): unknown type %u of algorithmic range %u\n", - inRange->type, i); - *pErrorCode=U_UNSUPPORTED_ERROR; - return 0; - } - } - } - - return headerSize+(int32_t)offset; -} - -/* - * Hey, Emacs, please set the following: - * - * Local Variables: - * indent-tabs-mode: nil - * End: - * - */ |