diff options
Diffstat (limited to 'deps/node/deps/icu-small/source/common/bmpset.cpp')
-rw-r--r-- | deps/node/deps/icu-small/source/common/bmpset.cpp | 741 |
1 files changed, 0 insertions, 741 deletions
diff --git a/deps/node/deps/icu-small/source/common/bmpset.cpp b/deps/node/deps/icu-small/source/common/bmpset.cpp deleted file mode 100644 index bc79f5e5..00000000 --- a/deps/node/deps/icu-small/source/common/bmpset.cpp +++ /dev/null @@ -1,741 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -****************************************************************************** -* -* Copyright (C) 2007-2012, International Business Machines -* Corporation and others. All Rights Reserved. -* -****************************************************************************** -* file name: bmpset.cpp -* encoding: UTF-8 -* tab size: 8 (not used) -* indentation:4 -* -* created on: 2007jan29 -* created by: Markus W. Scherer -*/ - -#include "unicode/utypes.h" -#include "unicode/uniset.h" -#include "unicode/utf8.h" -#include "unicode/utf16.h" -#include "cmemory.h" -#include "bmpset.h" -#include "uassert.h" - -U_NAMESPACE_BEGIN - -BMPSet::BMPSet(const int32_t *parentList, int32_t parentListLength) : - list(parentList), listLength(parentListLength) { - uprv_memset(latin1Contains, 0, sizeof(latin1Contains)); - uprv_memset(table7FF, 0, sizeof(table7FF)); - uprv_memset(bmpBlockBits, 0, sizeof(bmpBlockBits)); - - /* - * Set the list indexes for binary searches for - * U+0800, U+1000, U+2000, .., U+F000, U+10000. - * U+0800 is the first 3-byte-UTF-8 code point. Lower code points are - * looked up in the bit tables. - * The last pair of indexes is for finding supplementary code points. - */ - list4kStarts[0]=findCodePoint(0x800, 0, listLength-1); - int32_t i; - for(i=1; i<=0x10; ++i) { - list4kStarts[i]=findCodePoint(i<<12, list4kStarts[i-1], listLength-1); - } - list4kStarts[0x11]=listLength-1; - containsFFFD=containsSlow(0xfffd, list4kStarts[0xf], list4kStarts[0x10]); - - initBits(); - overrideIllegal(); -} - -BMPSet::BMPSet(const BMPSet &otherBMPSet, const int32_t *newParentList, int32_t newParentListLength) : - containsFFFD(otherBMPSet.containsFFFD), - list(newParentList), listLength(newParentListLength) { - uprv_memcpy(latin1Contains, otherBMPSet.latin1Contains, sizeof(latin1Contains)); - uprv_memcpy(table7FF, otherBMPSet.table7FF, sizeof(table7FF)); - uprv_memcpy(bmpBlockBits, otherBMPSet.bmpBlockBits, sizeof(bmpBlockBits)); - uprv_memcpy(list4kStarts, otherBMPSet.list4kStarts, sizeof(list4kStarts)); -} - -BMPSet::~BMPSet() { -} - -/* - * Set bits in a bit rectangle in "vertical" bit organization. - * start<limit<=0x800 - */ -static void set32x64Bits(uint32_t table[64], int32_t start, int32_t limit) { - U_ASSERT(start<limit); - U_ASSERT(limit<=0x800); - - int32_t lead=start>>6; // Named for UTF-8 2-byte lead byte with upper 5 bits. - int32_t trail=start&0x3f; // Named for UTF-8 2-byte trail byte with lower 6 bits. - - // Set one bit indicating an all-one block. - uint32_t bits=(uint32_t)1<<lead; - if((start+1)==limit) { // Single-character shortcut. - table[trail]|=bits; - return; - } - - int32_t limitLead=limit>>6; - int32_t limitTrail=limit&0x3f; - - if(lead==limitLead) { - // Partial vertical bit column. - while(trail<limitTrail) { - table[trail++]|=bits; - } - } else { - // Partial vertical bit column, - // followed by a bit rectangle, - // followed by another partial vertical bit column. - if(trail>0) { - do { - table[trail++]|=bits; - } while(trail<64); - ++lead; - } - if(lead<limitLead) { - bits=~(((unsigned)1<<lead)-1); - if(limitLead<0x20) { - bits&=((unsigned)1<<limitLead)-1; - } - for(trail=0; trail<64; ++trail) { - table[trail]|=bits; - } - } - // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0. - // In that case, bits=1<<limitLead is undefined but the bits value - // is not used because trail<limitTrail is already false. - bits=(uint32_t)1<<((limitLead == 0x20) ? (limitLead - 1) : limitLead); - for(trail=0; trail<limitTrail; ++trail) { - table[trail]|=bits; - } - } -} - -void BMPSet::initBits() { - UChar32 start, limit; - int32_t listIndex=0; - - // Set latin1Contains[]. - do { - start=list[listIndex++]; - if(listIndex<listLength) { - limit=list[listIndex++]; - } else { - limit=0x110000; - } - if(start>=0x100) { - break; - } - do { - latin1Contains[start++]=1; - } while(start<limit && start<0x100); - } while(limit<=0x100); - - // Find the first range overlapping with (or after) 80..FF again, - // to include them in table7FF as well. - for(listIndex=0;;) { - start=list[listIndex++]; - if(listIndex<listLength) { - limit=list[listIndex++]; - } else { - limit=0x110000; - } - if(limit>0x80) { - if(start<0x80) { - start=0x80; - } - break; - } - } - - // Set table7FF[]. - while(start<0x800) { - set32x64Bits(table7FF, start, limit<=0x800 ? limit : 0x800); - if(limit>0x800) { - start=0x800; - break; - } - - start=list[listIndex++]; - if(listIndex<listLength) { - limit=list[listIndex++]; - } else { - limit=0x110000; - } - } - - // Set bmpBlockBits[]. - int32_t minStart=0x800; - while(start<0x10000) { - if(limit>0x10000) { - limit=0x10000; - } - - if(start<minStart) { - start=minStart; - } - if(start<limit) { // Else: Another range entirely in a known mixed-value block. - if(start&0x3f) { - // Mixed-value block of 64 code points. - start>>=6; - bmpBlockBits[start&0x3f]|=0x10001<<(start>>6); - start=(start+1)<<6; // Round up to the next block boundary. - minStart=start; // Ignore further ranges in this block. - } - if(start<limit) { - if(start<(limit&~0x3f)) { - // Multiple all-ones blocks of 64 code points each. - set32x64Bits(bmpBlockBits, start>>6, limit>>6); - } - - if(limit&0x3f) { - // Mixed-value block of 64 code points. - limit>>=6; - bmpBlockBits[limit&0x3f]|=0x10001<<(limit>>6); - limit=(limit+1)<<6; // Round up to the next block boundary. - minStart=limit; // Ignore further ranges in this block. - } - } - } - - if(limit==0x10000) { - break; - } - - start=list[listIndex++]; - if(listIndex<listLength) { - limit=list[listIndex++]; - } else { - limit=0x110000; - } - } -} - -/* - * Override some bits and bytes to the result of contains(FFFD) - * for faster validity checking at runtime. - * No need to set 0 values where they were reset to 0 in the constructor - * and not modified by initBits(). - * (table7FF[] 0..7F, bmpBlockBits[] 0..7FF) - * Need to set 0 values for surrogates D800..DFFF. - */ -void BMPSet::overrideIllegal() { - uint32_t bits, mask; - int32_t i; - - if(containsFFFD) { - bits=3; // Lead bytes 0xC0 and 0xC1. - for(i=0; i<64; ++i) { - table7FF[i]|=bits; - } - - bits=1; // Lead byte 0xE0. - for(i=0; i<32; ++i) { // First half of 4k block. - bmpBlockBits[i]|=bits; - } - - mask= static_cast<uint32_t>(~(0x10001<<0xd)); // Lead byte 0xED. - bits=1<<0xd; - for(i=32; i<64; ++i) { // Second half of 4k block. - bmpBlockBits[i]=(bmpBlockBits[i]&mask)|bits; - } - } else { - mask= static_cast<uint32_t>(~(0x10001<<0xd)); // Lead byte 0xED. - for(i=32; i<64; ++i) { // Second half of 4k block. - bmpBlockBits[i]&=mask; - } - } -} - -int32_t BMPSet::findCodePoint(UChar32 c, int32_t lo, int32_t hi) const { - /* Examples: - findCodePoint(c) - set list[] c=0 1 3 4 7 8 - === ============== =========== - [] [110000] 0 0 0 0 0 0 - [\u0000-\u0003] [0, 4, 110000] 1 1 1 2 2 2 - [\u0004-\u0007] [4, 8, 110000] 0 0 0 1 1 2 - [:Any:] [0, 110000] 1 1 1 1 1 1 - */ - - // Return the smallest i such that c < list[i]. Assume - // list[len - 1] == HIGH and that c is legal (0..HIGH-1). - if (c < list[lo]) - return lo; - // High runner test. c is often after the last range, so an - // initial check for this condition pays off. - if (lo >= hi || c >= list[hi-1]) - return hi; - // invariant: c >= list[lo] - // invariant: c < list[hi] - for (;;) { - int32_t i = (lo + hi) >> 1; - if (i == lo) { - break; // Found! - } else if (c < list[i]) { - hi = i; - } else { - lo = i; - } - } - return hi; -} - -UBool -BMPSet::contains(UChar32 c) const { - if((uint32_t)c<=0xff) { - return (UBool)latin1Contains[c]; - } else if((uint32_t)c<=0x7ff) { - return (UBool)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0); - } else if((uint32_t)c<0xd800 || (c>=0xe000 && c<=0xffff)) { - int lead=c>>12; - uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; - if(twoBits<=1) { - // All 64 code points with the same bits 15..6 - // are either in the set or not. - return (UBool)twoBits; - } else { - // Look up the code point in its 4k block of code points. - return containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]); - } - } else if((uint32_t)c<=0x10ffff) { - // surrogate or supplementary code point - return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]); - } else { - // Out-of-range code points get FALSE, consistent with long-standing - // behavior of UnicodeSet::contains(c). - return FALSE; - } -} - -/* - * Check for sufficient length for trail unit for each surrogate pair. - * Handle single surrogates as surrogate code points as usual in ICU. - */ -const UChar * -BMPSet::span(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const { - UChar c, c2; - - if(spanCondition) { - // span - do { - c=*s; - if(c<=0xff) { - if(!latin1Contains[c]) { - break; - } - } else if(c<=0x7ff) { - if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) { - break; - } - } else if(c<0xd800 || c>=0xe000) { - int lead=c>>12; - uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; - if(twoBits<=1) { - // All 64 code points with the same bits 15..6 - // are either in the set or not. - if(twoBits==0) { - break; - } - } else { - // Look up the code point in its 4k block of code points. - if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { - break; - } - } - } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) { - // surrogate code point - if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { - break; - } - } else { - // surrogate pair - if(!containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) { - break; - } - ++s; - } - } while(++s<limit); - } else { - // span not - do { - c=*s; - if(c<=0xff) { - if(latin1Contains[c]) { - break; - } - } else if(c<=0x7ff) { - if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) { - break; - } - } else if(c<0xd800 || c>=0xe000) { - int lead=c>>12; - uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; - if(twoBits<=1) { - // All 64 code points with the same bits 15..6 - // are either in the set or not. - if(twoBits!=0) { - break; - } - } else { - // Look up the code point in its 4k block of code points. - if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { - break; - } - } - } else if(c>=0xdc00 || (s+1)==limit || (c2=s[1])<0xdc00 || c2>=0xe000) { - // surrogate code point - if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { - break; - } - } else { - // surrogate pair - if(containsSlow(U16_GET_SUPPLEMENTARY(c, c2), list4kStarts[0x10], list4kStarts[0x11])) { - break; - } - ++s; - } - } while(++s<limit); - } - return s; -} - -/* Symmetrical with span(). */ -const UChar * -BMPSet::spanBack(const UChar *s, const UChar *limit, USetSpanCondition spanCondition) const { - UChar c, c2; - - if(spanCondition) { - // span - for(;;) { - c=*(--limit); - if(c<=0xff) { - if(!latin1Contains[c]) { - break; - } - } else if(c<=0x7ff) { - if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))==0) { - break; - } - } else if(c<0xd800 || c>=0xe000) { - int lead=c>>12; - uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; - if(twoBits<=1) { - // All 64 code points with the same bits 15..6 - // are either in the set or not. - if(twoBits==0) { - break; - } - } else { - // Look up the code point in its 4k block of code points. - if(!containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { - break; - } - } - } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) { - // surrogate code point - if(!containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { - break; - } - } else { - // surrogate pair - if(!containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) { - break; - } - --limit; - } - if(s==limit) { - return s; - } - } - } else { - // span not - for(;;) { - c=*(--limit); - if(c<=0xff) { - if(latin1Contains[c]) { - break; - } - } else if(c<=0x7ff) { - if((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) { - break; - } - } else if(c<0xd800 || c>=0xe000) { - int lead=c>>12; - uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; - if(twoBits<=1) { - // All 64 code points with the same bits 15..6 - // are either in the set or not. - if(twoBits!=0) { - break; - } - } else { - // Look up the code point in its 4k block of code points. - if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1])) { - break; - } - } - } else if(c<0xdc00 || s==limit || (c2=*(limit-1))<0xd800 || c2>=0xdc00) { - // surrogate code point - if(containsSlow(c, list4kStarts[0xd], list4kStarts[0xe])) { - break; - } - } else { - // surrogate pair - if(containsSlow(U16_GET_SUPPLEMENTARY(c2, c), list4kStarts[0x10], list4kStarts[0x11])) { - break; - } - --limit; - } - if(s==limit) { - return s; - } - } - } - return limit+1; -} - -/* - * Precheck for sufficient trail bytes at end of string only once per span. - * Check validity. - */ -const uint8_t * -BMPSet::spanUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { - const uint8_t *limit=s+length; - uint8_t b=*s; - if(U8_IS_SINGLE(b)) { - // Initial all-ASCII span. - if(spanCondition) { - do { - if(!latin1Contains[b] || ++s==limit) { - return s; - } - b=*s; - } while(U8_IS_SINGLE(b)); - } else { - do { - if(latin1Contains[b] || ++s==limit) { - return s; - } - b=*s; - } while(U8_IS_SINGLE(b)); - } - length=(int32_t)(limit-s); - } - - if(spanCondition!=USET_SPAN_NOT_CONTAINED) { - spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. - } - - const uint8_t *limit0=limit; - - /* - * Make sure that the last 1/2/3/4-byte sequence before limit is complete - * or runs into a lead byte. - * In the span loop compare s with limit only once - * per multi-byte character. - * - * Give a trailing illegal sequence the same value as the result of contains(FFFD), - * including it if that is part of the span, otherwise set limit0 to before - * the truncated sequence. - */ - b=*(limit-1); - if((int8_t)b<0) { - // b>=0x80: lead or trail byte - if(b<0xc0) { - // single trail byte, check for preceding 3- or 4-byte lead byte - if(length>=2 && (b=*(limit-2))>=0xe0) { - limit-=2; - if(containsFFFD!=spanCondition) { - limit0=limit; - } - } else if(b<0xc0 && b>=0x80 && length>=3 && (b=*(limit-3))>=0xf0) { - // 4-byte lead byte with only two trail bytes - limit-=3; - if(containsFFFD!=spanCondition) { - limit0=limit; - } - } - } else { - // lead byte with no trail bytes - --limit; - if(containsFFFD!=spanCondition) { - limit0=limit; - } - } - } - - uint8_t t1, t2, t3; - - while(s<limit) { - b=*s; - if(U8_IS_SINGLE(b)) { - // ASCII - if(spanCondition) { - do { - if(!latin1Contains[b]) { - return s; - } else if(++s==limit) { - return limit0; - } - b=*s; - } while(U8_IS_SINGLE(b)); - } else { - do { - if(latin1Contains[b]) { - return s; - } else if(++s==limit) { - return limit0; - } - b=*s; - } while(U8_IS_SINGLE(b)); - } - } - ++s; // Advance past the lead byte. - if(b>=0xe0) { - if(b<0xf0) { - if( /* handle U+0000..U+FFFF inline */ - (t1=(uint8_t)(s[0]-0x80)) <= 0x3f && - (t2=(uint8_t)(s[1]-0x80)) <= 0x3f - ) { - b&=0xf; - uint32_t twoBits=(bmpBlockBits[t1]>>b)&0x10001; - if(twoBits<=1) { - // All 64 code points with this lead byte and middle trail byte - // are either in the set or not. - if(twoBits!=(uint32_t)spanCondition) { - return s-1; - } - } else { - // Look up the code point in its 4k block of code points. - UChar32 c=(b<<12)|(t1<<6)|t2; - if(containsSlow(c, list4kStarts[b], list4kStarts[b+1]) != spanCondition) { - return s-1; - } - } - s+=2; - continue; - } - } else if( /* handle U+10000..U+10FFFF inline */ - (t1=(uint8_t)(s[0]-0x80)) <= 0x3f && - (t2=(uint8_t)(s[1]-0x80)) <= 0x3f && - (t3=(uint8_t)(s[2]-0x80)) <= 0x3f - ) { - // Give an illegal sequence the same value as the result of contains(FFFD). - UChar32 c=((UChar32)(b-0xf0)<<18)|((UChar32)t1<<12)|(t2<<6)|t3; - if( ( (0x10000<=c && c<=0x10ffff) ? - containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) : - containsFFFD - ) != spanCondition - ) { - return s-1; - } - s+=3; - continue; - } - } else { - if( /* handle U+0000..U+07FF inline */ - b>=0xc0 && - (t1=(uint8_t)(*s-0x80)) <= 0x3f - ) { - if((USetSpanCondition)((table7FF[t1]&((uint32_t)1<<(b&0x1f)))!=0) != spanCondition) { - return s-1; - } - ++s; - continue; - } - } - - // Give an illegal sequence the same value as the result of contains(FFFD). - // Handle each byte of an illegal sequence separately to simplify the code; - // no need to optimize error handling. - if(containsFFFD!=spanCondition) { - return s-1; - } - } - - return limit0; -} - -/* - * While going backwards through UTF-8 optimize only for ASCII. - * Unlike UTF-16, UTF-8 is not forward-backward symmetrical, that is, it is not - * possible to tell from the last byte in a multi-byte sequence how many - * preceding bytes there should be. Therefore, going backwards through UTF-8 - * is much harder than going forward. - */ -int32_t -BMPSet::spanBackUTF8(const uint8_t *s, int32_t length, USetSpanCondition spanCondition) const { - if(spanCondition!=USET_SPAN_NOT_CONTAINED) { - spanCondition=USET_SPAN_CONTAINED; // Pin to 0/1 values. - } - - uint8_t b; - - do { - b=s[--length]; - if(U8_IS_SINGLE(b)) { - // ASCII sub-span - if(spanCondition) { - do { - if(!latin1Contains[b]) { - return length+1; - } else if(length==0) { - return 0; - } - b=s[--length]; - } while(U8_IS_SINGLE(b)); - } else { - do { - if(latin1Contains[b]) { - return length+1; - } else if(length==0) { - return 0; - } - b=s[--length]; - } while(U8_IS_SINGLE(b)); - } - } - - int32_t prev=length; - UChar32 c; - // trail byte: collect a multi-byte character - // (or lead byte in last-trail position) - c=utf8_prevCharSafeBody(s, 0, &length, b, -3); - // c is a valid code point, not ASCII, not a surrogate - if(c<=0x7ff) { - if((USetSpanCondition)((table7FF[c&0x3f]&((uint32_t)1<<(c>>6)))!=0) != spanCondition) { - return prev+1; - } - } else if(c<=0xffff) { - int lead=c>>12; - uint32_t twoBits=(bmpBlockBits[(c>>6)&0x3f]>>lead)&0x10001; - if(twoBits<=1) { - // All 64 code points with the same bits 15..6 - // are either in the set or not. - if(twoBits!=(uint32_t)spanCondition) { - return prev+1; - } - } else { - // Look up the code point in its 4k block of code points. - if(containsSlow(c, list4kStarts[lead], list4kStarts[lead+1]) != spanCondition) { - return prev+1; - } - } - } else { - if(containsSlow(c, list4kStarts[0x10], list4kStarts[0x11]) != spanCondition) { - return prev+1; - } - } - } while(length>0); - return 0; -} - -U_NAMESPACE_END |