diff options
Diffstat (limited to 'deps/node/deps/icu-small/source/i18n/uspoof_impl.cpp')
-rw-r--r-- | deps/node/deps/icu-small/source/i18n/uspoof_impl.cpp | 982 |
1 files changed, 0 insertions, 982 deletions
diff --git a/deps/node/deps/icu-small/source/i18n/uspoof_impl.cpp b/deps/node/deps/icu-small/source/i18n/uspoof_impl.cpp deleted file mode 100644 index c1034c2e..00000000 --- a/deps/node/deps/icu-small/source/i18n/uspoof_impl.cpp +++ /dev/null @@ -1,982 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -********************************************************************** -* Copyright (C) 2008-2016, International Business Machines -* Corporation and others. All Rights Reserved. -********************************************************************** -*/ - -#include "unicode/utypes.h" -#include "unicode/uspoof.h" -#include "unicode/uchar.h" -#include "unicode/uniset.h" -#include "unicode/utf16.h" -#include "utrie2.h" -#include "cmemory.h" -#include "cstring.h" -#include "scriptset.h" -#include "umutex.h" -#include "udataswp.h" -#include "uassert.h" -#include "ucln_in.h" -#include "uspoof_impl.h" - -#if !UCONFIG_NO_NORMALIZATION - - -U_NAMESPACE_BEGIN - -UOBJECT_DEFINE_RTTI_IMPLEMENTATION(SpoofImpl) - -SpoofImpl::SpoofImpl(SpoofData *data, UErrorCode& status) { - construct(status); - fSpoofData = data; -} - -SpoofImpl::SpoofImpl(UErrorCode& status) { - construct(status); - - // TODO: Call this method where it is actually needed, instead of in the - // constructor, to allow for lazy data loading. See #12696. - fSpoofData = SpoofData::getDefault(status); -} - -SpoofImpl::SpoofImpl() { - UErrorCode status = U_ZERO_ERROR; - construct(status); - - // TODO: Call this method where it is actually needed, instead of in the - // constructor, to allow for lazy data loading. See #12696. - fSpoofData = SpoofData::getDefault(status); -} - -void SpoofImpl::construct(UErrorCode& status) { - fMagic = USPOOF_MAGIC; - fChecks = USPOOF_ALL_CHECKS; - fSpoofData = NULL; - fAllowedCharsSet = NULL; - fAllowedLocales = NULL; - fRestrictionLevel = USPOOF_HIGHLY_RESTRICTIVE; - - if (U_FAILURE(status)) { return; } - - UnicodeSet *allowedCharsSet = new UnicodeSet(0, 0x10ffff); - fAllowedCharsSet = allowedCharsSet; - fAllowedLocales = uprv_strdup(""); - if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - return; - } - allowedCharsSet->freeze(); -} - - -// Copy Constructor, used by the user level clone() function. -SpoofImpl::SpoofImpl(const SpoofImpl &src, UErrorCode &status) : - fMagic(0), fChecks(USPOOF_ALL_CHECKS), fSpoofData(NULL), fAllowedCharsSet(NULL) , - fAllowedLocales(NULL) { - if (U_FAILURE(status)) { - return; - } - fMagic = src.fMagic; - fChecks = src.fChecks; - if (src.fSpoofData != NULL) { - fSpoofData = src.fSpoofData->addReference(); - } - fAllowedCharsSet = static_cast<const UnicodeSet *>(src.fAllowedCharsSet->clone()); - fAllowedLocales = uprv_strdup(src.fAllowedLocales); - if (fAllowedCharsSet == NULL || fAllowedLocales == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - } - fRestrictionLevel = src.fRestrictionLevel; -} - -SpoofImpl::~SpoofImpl() { - fMagic = 0; // head off application errors by preventing use of - // of deleted objects. - if (fSpoofData != NULL) { - fSpoofData->removeReference(); // Will delete if refCount goes to zero. - } - delete fAllowedCharsSet; - uprv_free((void *)fAllowedLocales); -} - -// Cast this instance as a USpoofChecker for the C API. -USpoofChecker *SpoofImpl::asUSpoofChecker() { - return reinterpret_cast<USpoofChecker*>(this); -} - -// -// Incoming parameter check on Status and the SpoofChecker object -// received from the C API. -// -const SpoofImpl *SpoofImpl::validateThis(const USpoofChecker *sc, UErrorCode &status) { - if (U_FAILURE(status)) { - return NULL; - } - if (sc == NULL) { - status = U_ILLEGAL_ARGUMENT_ERROR; - return NULL; - } - SpoofImpl *This = (SpoofImpl *)sc; - if (This->fMagic != USPOOF_MAGIC) { - status = U_INVALID_FORMAT_ERROR; - return NULL; - } - if (This->fSpoofData != NULL && !This->fSpoofData->validateDataVersion(status)) { - return NULL; - } - return This; -} - -SpoofImpl *SpoofImpl::validateThis(USpoofChecker *sc, UErrorCode &status) { - return const_cast<SpoofImpl *> - (SpoofImpl::validateThis(const_cast<const USpoofChecker *>(sc), status)); -} - - -void SpoofImpl::setAllowedLocales(const char *localesList, UErrorCode &status) { - UnicodeSet allowedChars; - UnicodeSet *tmpSet = NULL; - const char *locStart = localesList; - const char *locEnd = NULL; - const char *localesListEnd = localesList + uprv_strlen(localesList); - int32_t localeListCount = 0; // Number of locales provided by caller. - - // Loop runs once per locale from the localesList, a comma separated list of locales. - do { - locEnd = uprv_strchr(locStart, ','); - if (locEnd == NULL) { - locEnd = localesListEnd; - } - while (*locStart == ' ') { - locStart++; - } - const char *trimmedEnd = locEnd-1; - while (trimmedEnd > locStart && *trimmedEnd == ' ') { - trimmedEnd--; - } - if (trimmedEnd <= locStart) { - break; - } - const char *locale = uprv_strndup(locStart, (int32_t)(trimmedEnd + 1 - locStart)); - localeListCount++; - - // We have one locale from the locales list. - // Add the script chars for this locale to the accumulating set of allowed chars. - // If the locale is no good, we will be notified back via status. - addScriptChars(locale, &allowedChars, status); - uprv_free((void *)locale); - if (U_FAILURE(status)) { - break; - } - locStart = locEnd + 1; - } while (locStart < localesListEnd); - - // If our caller provided an empty list of locales, we disable the allowed characters checking - if (localeListCount == 0) { - uprv_free((void *)fAllowedLocales); - fAllowedLocales = uprv_strdup(""); - tmpSet = new UnicodeSet(0, 0x10ffff); - if (fAllowedLocales == NULL || tmpSet == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - return; - } - tmpSet->freeze(); - delete fAllowedCharsSet; - fAllowedCharsSet = tmpSet; - fChecks &= ~USPOOF_CHAR_LIMIT; - return; - } - - - // Add all common and inherited characters to the set of allowed chars. - UnicodeSet tempSet; - tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); - allowedChars.addAll(tempSet); - tempSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); - allowedChars.addAll(tempSet); - - // If anything went wrong, we bail out without changing - // the state of the spoof checker. - if (U_FAILURE(status)) { - return; - } - - // Store the updated spoof checker state. - tmpSet = static_cast<UnicodeSet *>(allowedChars.clone()); - const char *tmpLocalesList = uprv_strdup(localesList); - if (tmpSet == NULL || tmpLocalesList == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - return; - } - uprv_free((void *)fAllowedLocales); - fAllowedLocales = tmpLocalesList; - tmpSet->freeze(); - delete fAllowedCharsSet; - fAllowedCharsSet = tmpSet; - fChecks |= USPOOF_CHAR_LIMIT; -} - - -const char * SpoofImpl::getAllowedLocales(UErrorCode &/*status*/) { - return fAllowedLocales; -} - - -// Given a locale (a language), add all the characters from all of the scripts used with that language -// to the allowedChars UnicodeSet - -void SpoofImpl::addScriptChars(const char *locale, UnicodeSet *allowedChars, UErrorCode &status) { - UScriptCode scripts[30]; - - int32_t numScripts = uscript_getCode(locale, scripts, UPRV_LENGTHOF(scripts), &status); - if (U_FAILURE(status)) { - return; - } - if (status == U_USING_DEFAULT_WARNING) { - status = U_ILLEGAL_ARGUMENT_ERROR; - return; - } - UnicodeSet tmpSet; - int32_t i; - for (i=0; i<numScripts; i++) { - tmpSet.applyIntPropertyValue(UCHAR_SCRIPT, scripts[i], status); - allowedChars->addAll(tmpSet); - } -} - -// Computes the augmented script set for a code point, according to UTS 39 section 5.1. -void SpoofImpl::getAugmentedScriptSet(UChar32 codePoint, ScriptSet& result, UErrorCode& status) { - result.resetAll(); - result.setScriptExtensions(codePoint, status); - if (U_FAILURE(status)) { return; } - - // Section 5.1 step 1 - if (result.test(USCRIPT_HAN, status)) { - result.set(USCRIPT_HAN_WITH_BOPOMOFO, status); - result.set(USCRIPT_JAPANESE, status); - result.set(USCRIPT_KOREAN, status); - } - if (result.test(USCRIPT_HIRAGANA, status)) { - result.set(USCRIPT_JAPANESE, status); - } - if (result.test(USCRIPT_KATAKANA, status)) { - result.set(USCRIPT_JAPANESE, status); - } - if (result.test(USCRIPT_HANGUL, status)) { - result.set(USCRIPT_KOREAN, status); - } - if (result.test(USCRIPT_BOPOMOFO, status)) { - result.set(USCRIPT_HAN_WITH_BOPOMOFO, status); - } - - // Section 5.1 step 2 - if (result.test(USCRIPT_COMMON, status) || result.test(USCRIPT_INHERITED, status)) { - result.setAll(); - } -} - -// Computes the resolved script set for a string, according to UTS 39 section 5.1. -void SpoofImpl::getResolvedScriptSet(const UnicodeString& input, ScriptSet& result, UErrorCode& status) const { - getResolvedScriptSetWithout(input, USCRIPT_CODE_LIMIT, result, status); -} - -// Computes the resolved script set for a string, omitting characters having the specified script. -// If USCRIPT_CODE_LIMIT is passed as the second argument, all characters are included. -void SpoofImpl::getResolvedScriptSetWithout(const UnicodeString& input, UScriptCode script, ScriptSet& result, UErrorCode& status) const { - result.setAll(); - - ScriptSet temp; - UChar32 codePoint; - for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) { - codePoint = input.char32At(i); - - // Compute the augmented script set for the character - getAugmentedScriptSet(codePoint, temp, status); - if (U_FAILURE(status)) { return; } - - // Intersect the augmented script set with the resolved script set, but only if the character doesn't - // have the script specified in the function call - if (script == USCRIPT_CODE_LIMIT || !temp.test(script, status)) { - result.intersect(temp); - } - } -} - -// Computes the set of numerics for a string, according to UTS 39 section 5.3. -void SpoofImpl::getNumerics(const UnicodeString& input, UnicodeSet& result, UErrorCode& /*status*/) const { - result.clear(); - - UChar32 codePoint; - for (int32_t i = 0; i < input.length(); i += U16_LENGTH(codePoint)) { - codePoint = input.char32At(i); - - // Store a representative character for each kind of decimal digit - if (u_charType(codePoint) == U_DECIMAL_DIGIT_NUMBER) { - // Store the zero character as a representative for comparison. - // Unicode guarantees it is codePoint - value - result.add(codePoint - (UChar32)u_getNumericValue(codePoint)); - } - } -} - -// Computes the restriction level of a string, according to UTS 39 section 5.2. -URestrictionLevel SpoofImpl::getRestrictionLevel(const UnicodeString& input, UErrorCode& status) const { - // Section 5.2 step 1: - if (!fAllowedCharsSet->containsAll(input)) { - return USPOOF_UNRESTRICTIVE; - } - - // Section 5.2 step 2 - // Java use a static UnicodeSet for this test. In C++, avoid the static variable - // and just do a simple for loop. - UBool allASCII = TRUE; - for (int32_t i=0, length=input.length(); i<length; i++) { - if (input.charAt(i) > 0x7f) { - allASCII = FALSE; - break; - } - } - if (allASCII) { - return USPOOF_ASCII; - } - - // Section 5.2 steps 3: - ScriptSet resolvedScriptSet; - getResolvedScriptSet(input, resolvedScriptSet, status); - if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; } - - // Section 5.2 step 4: - if (!resolvedScriptSet.isEmpty()) { - return USPOOF_SINGLE_SCRIPT_RESTRICTIVE; - } - - // Section 5.2 step 5: - ScriptSet resolvedNoLatn; - getResolvedScriptSetWithout(input, USCRIPT_LATIN, resolvedNoLatn, status); - if (U_FAILURE(status)) { return USPOOF_UNRESTRICTIVE; } - - // Section 5.2 step 6: - if (resolvedNoLatn.test(USCRIPT_HAN_WITH_BOPOMOFO, status) - || resolvedNoLatn.test(USCRIPT_JAPANESE, status) - || resolvedNoLatn.test(USCRIPT_KOREAN, status)) { - return USPOOF_HIGHLY_RESTRICTIVE; - } - - // Section 5.2 step 7: - if (!resolvedNoLatn.isEmpty() - && !resolvedNoLatn.test(USCRIPT_CYRILLIC, status) - && !resolvedNoLatn.test(USCRIPT_GREEK, status) - && !resolvedNoLatn.test(USCRIPT_CHEROKEE, status)) { - return USPOOF_MODERATELY_RESTRICTIVE; - } - - // Section 5.2 step 8: - return USPOOF_MINIMALLY_RESTRICTIVE; -} - -int32_t SpoofImpl::findHiddenOverlay(const UnicodeString& input, UErrorCode&) const { - bool sawLeadCharacter = false; - for (int32_t i=0; i<input.length();) { - UChar32 cp = input.char32At(i); - if (sawLeadCharacter && cp == 0x0307) { - return i; - } - uint8_t combiningClass = u_getCombiningClass(cp); - // Skip over characters except for those with combining class 0 (non-combining characters) or with - // combining class 230 (same class as U+0307) - U_ASSERT(u_getCombiningClass(0x0307) == 230); - if (combiningClass == 0 || combiningClass == 230) { - sawLeadCharacter = isIllegalCombiningDotLeadCharacter(cp); - } - i += U16_LENGTH(cp); - } - return -1; -} - -static inline bool isIllegalCombiningDotLeadCharacterNoLookup(UChar32 cp) { - return cp == u'i' || cp == u'j' || cp == u'ı' || cp == u'ȷ' || cp == u'l' || - u_hasBinaryProperty(cp, UCHAR_SOFT_DOTTED); -} - -bool SpoofImpl::isIllegalCombiningDotLeadCharacter(UChar32 cp) const { - if (isIllegalCombiningDotLeadCharacterNoLookup(cp)) { - return true; - } - UnicodeString skelStr; - fSpoofData->confusableLookup(cp, skelStr); - UChar32 finalCp = skelStr.char32At(skelStr.moveIndex32(skelStr.length(), -1)); - if (finalCp != cp && isIllegalCombiningDotLeadCharacterNoLookup(finalCp)) { - return true; - } - return false; -} - - - -// Convert a text format hex number. Utility function used by builder code. Static. -// Input: UChar *string text. Output: a UChar32 -// Input has been pre-checked, and will have no non-hex chars. -// The number must fall in the code point range of 0..0x10ffff -// Static Function. -UChar32 SpoofImpl::ScanHex(const UChar *s, int32_t start, int32_t limit, UErrorCode &status) { - if (U_FAILURE(status)) { - return 0; - } - U_ASSERT(limit-start > 0); - uint32_t val = 0; - int i; - for (i=start; i<limit; i++) { - int digitVal = s[i] - 0x30; - if (digitVal>9) { - digitVal = 0xa + (s[i] - 0x41); // Upper Case 'A' - } - if (digitVal>15) { - digitVal = 0xa + (s[i] - 0x61); // Lower Case 'a' - } - U_ASSERT(digitVal <= 0xf); - val <<= 4; - val += digitVal; - } - if (val > 0x10ffff) { - status = U_PARSE_ERROR; - val = 0; - } - return (UChar32)val; -} - - -//----------------------------------------- -// -// class CheckResult Implementation -// -//----------------------------------------- - -CheckResult::CheckResult() : fMagic(USPOOF_CHECK_MAGIC) { - clear(); -} - -USpoofCheckResult* CheckResult::asUSpoofCheckResult() { - return reinterpret_cast<USpoofCheckResult*>(this); -} - -// -// Incoming parameter check on Status and the CheckResult object -// received from the C API. -// -const CheckResult* CheckResult::validateThis(const USpoofCheckResult *ptr, UErrorCode &status) { - if (U_FAILURE(status)) { return NULL; } - if (ptr == NULL) { - status = U_ILLEGAL_ARGUMENT_ERROR; - return NULL; - } - CheckResult *This = (CheckResult*) ptr; - if (This->fMagic != USPOOF_CHECK_MAGIC) { - status = U_INVALID_FORMAT_ERROR; - return NULL; - } - return This; -} - -CheckResult* CheckResult::validateThis(USpoofCheckResult *ptr, UErrorCode &status) { - return const_cast<CheckResult *> - (CheckResult::validateThis(const_cast<const USpoofCheckResult*>(ptr), status)); -} - -void CheckResult::clear() { - fChecks = 0; - fNumerics.clear(); - fRestrictionLevel = USPOOF_UNDEFINED_RESTRICTIVE; -} - -int32_t CheckResult::toCombinedBitmask(int32_t enabledChecks) { - if ((enabledChecks & USPOOF_AUX_INFO) != 0 && fRestrictionLevel != USPOOF_UNDEFINED_RESTRICTIVE) { - return fChecks | fRestrictionLevel; - } else { - return fChecks; - } -} - -CheckResult::~CheckResult() { -} - -//---------------------------------------------------------------------------------------------- -// -// class SpoofData Implementation -// -//---------------------------------------------------------------------------------------------- - - -UBool SpoofData::validateDataVersion(UErrorCode &status) const { - if (U_FAILURE(status) || - fRawData == NULL || - fRawData->fMagic != USPOOF_MAGIC || - fRawData->fFormatVersion[0] != USPOOF_CONFUSABLE_DATA_FORMAT_VERSION || - fRawData->fFormatVersion[1] != 0 || - fRawData->fFormatVersion[2] != 0 || - fRawData->fFormatVersion[3] != 0) { - status = U_INVALID_FORMAT_ERROR; - return FALSE; - } - return TRUE; -} - -static UBool U_CALLCONV -spoofDataIsAcceptable(void *context, - const char * /* type */, const char * /*name*/, - const UDataInfo *pInfo) { - if( - pInfo->size >= 20 && - pInfo->isBigEndian == U_IS_BIG_ENDIAN && - pInfo->charsetFamily == U_CHARSET_FAMILY && - pInfo->dataFormat[0] == 0x43 && // dataFormat="Cfu " - pInfo->dataFormat[1] == 0x66 && - pInfo->dataFormat[2] == 0x75 && - pInfo->dataFormat[3] == 0x20 && - pInfo->formatVersion[0] == USPOOF_CONFUSABLE_DATA_FORMAT_VERSION - ) { - UVersionInfo *version = static_cast<UVersionInfo *>(context); - if(version != NULL) { - uprv_memcpy(version, pInfo->dataVersion, 4); - } - return TRUE; - } else { - return FALSE; - } -} - -// Methods for the loading of the default confusables data file. The confusable -// data is loaded only when it is needed. -// -// SpoofData::getDefault() - Return the default confusables data, and call the -// initOnce() if it is not available. Adds a reference -// to the SpoofData that the caller is responsible for -// decrementing when they are done with the data. -// -// uspoof_loadDefaultData - Called once, from initOnce(). The resulting SpoofData -// is shared by all spoof checkers using the default data. -// -// uspoof_cleanupDefaultData - Called during cleanup. -// - -static UInitOnce gSpoofInitDefaultOnce = U_INITONCE_INITIALIZER; -static SpoofData* gDefaultSpoofData; - -static UBool U_CALLCONV -uspoof_cleanupDefaultData(void) { - if (gDefaultSpoofData) { - // Will delete, assuming all user-level spoof checkers were closed. - gDefaultSpoofData->removeReference(); - gDefaultSpoofData = nullptr; - gSpoofInitDefaultOnce.reset(); - } - return TRUE; -} - -static void U_CALLCONV uspoof_loadDefaultData(UErrorCode& status) { - UDataMemory *udm = udata_openChoice(nullptr, "cfu", "confusables", - spoofDataIsAcceptable, - nullptr, // context, would receive dataVersion if supplied. - &status); - if (U_FAILURE(status)) { return; } - gDefaultSpoofData = new SpoofData(udm, status); - if (U_FAILURE(status)) { - delete gDefaultSpoofData; - gDefaultSpoofData = nullptr; - return; - } - if (gDefaultSpoofData == nullptr) { - status = U_MEMORY_ALLOCATION_ERROR; - return; - } - ucln_i18n_registerCleanup(UCLN_I18N_SPOOFDATA, uspoof_cleanupDefaultData); -} - -SpoofData* SpoofData::getDefault(UErrorCode& status) { - umtx_initOnce(gSpoofInitDefaultOnce, &uspoof_loadDefaultData, status); - if (U_FAILURE(status)) { return NULL; } - gDefaultSpoofData->addReference(); - return gDefaultSpoofData; -} - - - -SpoofData::SpoofData(UDataMemory *udm, UErrorCode &status) -{ - reset(); - if (U_FAILURE(status)) { - return; - } - fUDM = udm; - // fRawData is non-const because it may be constructed by the data builder. - fRawData = reinterpret_cast<SpoofDataHeader *>( - const_cast<void *>(udata_getMemory(udm))); - validateDataVersion(status); - initPtrs(status); -} - - -SpoofData::SpoofData(const void *data, int32_t length, UErrorCode &status) -{ - reset(); - if (U_FAILURE(status)) { - return; - } - if ((size_t)length < sizeof(SpoofDataHeader)) { - status = U_INVALID_FORMAT_ERROR; - return; - } - if (data == NULL) { - status = U_ILLEGAL_ARGUMENT_ERROR; - return; - } - void *ncData = const_cast<void *>(data); - fRawData = static_cast<SpoofDataHeader *>(ncData); - if (length < fRawData->fLength) { - status = U_INVALID_FORMAT_ERROR; - return; - } - validateDataVersion(status); - initPtrs(status); -} - - -// Spoof Data constructor for use from data builder. -// Initializes a new, empty data area that will be populated later. -SpoofData::SpoofData(UErrorCode &status) { - reset(); - if (U_FAILURE(status)) { - return; - } - fDataOwned = true; - - // The spoof header should already be sized to be a multiple of 16 bytes. - // Just in case it's not, round it up. - uint32_t initialSize = (sizeof(SpoofDataHeader) + 15) & ~15; - U_ASSERT(initialSize == sizeof(SpoofDataHeader)); - - fRawData = static_cast<SpoofDataHeader *>(uprv_malloc(initialSize)); - fMemLimit = initialSize; - if (fRawData == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - return; - } - uprv_memset(fRawData, 0, initialSize); - - fRawData->fMagic = USPOOF_MAGIC; - fRawData->fFormatVersion[0] = USPOOF_CONFUSABLE_DATA_FORMAT_VERSION; - fRawData->fFormatVersion[1] = 0; - fRawData->fFormatVersion[2] = 0; - fRawData->fFormatVersion[3] = 0; - initPtrs(status); -} - -// reset() - initialize all fields. -// Should be updated if any new fields are added. -// Called by constructors to put things in a known initial state. -void SpoofData::reset() { - fRawData = NULL; - fDataOwned = FALSE; - fUDM = NULL; - fMemLimit = 0; - fRefCount = 1; - fCFUKeys = NULL; - fCFUValues = NULL; - fCFUStrings = NULL; -} - - -// SpoofData::initPtrs() -// Initialize the pointers to the various sections of the raw data. -// -// This function is used both during the Trie building process (multiple -// times, as the individual data sections are added), and -// during the opening of a Spoof Checker from prebuilt data. -// -// The pointers for non-existent data sections (identified by an offset of 0) -// are set to NULL. -// -// Note: During building the data, adding each new data section -// reallocs the raw data area, which likely relocates it, which -// in turn requires reinitializing all of the pointers into it, hence -// multiple calls to this function during building. -// -void SpoofData::initPtrs(UErrorCode &status) { - fCFUKeys = NULL; - fCFUValues = NULL; - fCFUStrings = NULL; - if (U_FAILURE(status)) { - return; - } - if (fRawData->fCFUKeys != 0) { - fCFUKeys = (int32_t *)((char *)fRawData + fRawData->fCFUKeys); - } - if (fRawData->fCFUStringIndex != 0) { - fCFUValues = (uint16_t *)((char *)fRawData + fRawData->fCFUStringIndex); - } - if (fRawData->fCFUStringTable != 0) { - fCFUStrings = (UChar *)((char *)fRawData + fRawData->fCFUStringTable); - } -} - - -SpoofData::~SpoofData() { - if (fDataOwned) { - uprv_free(fRawData); - } - fRawData = NULL; - if (fUDM != NULL) { - udata_close(fUDM); - } - fUDM = NULL; -} - - -void SpoofData::removeReference() { - if (umtx_atomic_dec(&fRefCount) == 0) { - delete this; - } -} - - -SpoofData *SpoofData::addReference() { - umtx_atomic_inc(&fRefCount); - return this; -} - - -void *SpoofData::reserveSpace(int32_t numBytes, UErrorCode &status) { - if (U_FAILURE(status)) { - return NULL; - } - if (!fDataOwned) { - U_ASSERT(FALSE); - status = U_INTERNAL_PROGRAM_ERROR; - return NULL; - } - - numBytes = (numBytes + 15) & ~15; // Round up to a multiple of 16 - uint32_t returnOffset = fMemLimit; - fMemLimit += numBytes; - fRawData = static_cast<SpoofDataHeader *>(uprv_realloc(fRawData, fMemLimit)); - fRawData->fLength = fMemLimit; - uprv_memset((char *)fRawData + returnOffset, 0, numBytes); - initPtrs(status); - return (char *)fRawData + returnOffset; -} - -int32_t SpoofData::serialize(void *buf, int32_t capacity, UErrorCode &status) const { - int32_t dataSize = fRawData->fLength; - if (capacity < dataSize) { - status = U_BUFFER_OVERFLOW_ERROR; - return dataSize; - } - uprv_memcpy(buf, fRawData, dataSize); - return dataSize; -} - -int32_t SpoofData::size() const { - return fRawData->fLength; -} - -//------------------------------- -// -// Front-end APIs for SpoofData -// -//------------------------------- - -int32_t SpoofData::confusableLookup(UChar32 inChar, UnicodeString &dest) const { - // Perform a binary search. - // [lo, hi), i.e lo is inclusive, hi is exclusive. - // The result after the loop will be in lo. - int32_t lo = 0; - int32_t hi = length(); - do { - int32_t mid = (lo + hi) / 2; - if (codePointAt(mid) > inChar) { - hi = mid; - } else if (codePointAt(mid) < inChar) { - lo = mid; - } else { - // Found result. Break early. - lo = mid; - break; - } - } while (hi - lo > 1); - - // Did we find an entry? If not, the char maps to itself. - if (codePointAt(lo) != inChar) { - dest.append(inChar); - return 1; - } - - // Add the element to the string builder and return. - return appendValueTo(lo, dest); -} - -int32_t SpoofData::length() const { - return fRawData->fCFUKeysSize; -} - -UChar32 SpoofData::codePointAt(int32_t index) const { - return ConfusableDataUtils::keyToCodePoint(fCFUKeys[index]); -} - -int32_t SpoofData::appendValueTo(int32_t index, UnicodeString& dest) const { - int32_t stringLength = ConfusableDataUtils::keyToLength(fCFUKeys[index]); - - // Value is either a char (for strings of length 1) or - // an index into the string table (for longer strings) - uint16_t value = fCFUValues[index]; - if (stringLength == 1) { - dest.append((UChar)value); - } else { - dest.append(fCFUStrings + value, stringLength); - } - - return stringLength; -} - - -U_NAMESPACE_END - -U_NAMESPACE_USE - -//----------------------------------------------------------------------------- -// -// uspoof_swap - byte swap and char encoding swap of spoof data -// -//----------------------------------------------------------------------------- -U_CAPI int32_t U_EXPORT2 -uspoof_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, - UErrorCode *status) { - - if (status == NULL || U_FAILURE(*status)) { - return 0; - } - if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) { - *status=U_ILLEGAL_ARGUMENT_ERROR; - return 0; - } - - // - // Check that the data header is for spoof data. - // (Header contents are defined in gencfu.cpp) - // - const UDataInfo *pInfo = (const UDataInfo *)((const char *)inData+4); - if(!( pInfo->dataFormat[0]==0x43 && /* dataFormat="Cfu " */ - pInfo->dataFormat[1]==0x66 && - pInfo->dataFormat[2]==0x75 && - pInfo->dataFormat[3]==0x20 && - pInfo->formatVersion[0]==USPOOF_CONFUSABLE_DATA_FORMAT_VERSION && - pInfo->formatVersion[1]==0 && - pInfo->formatVersion[2]==0 && - pInfo->formatVersion[3]==0 )) { - udata_printError(ds, "uspoof_swap(): data format %02x.%02x.%02x.%02x " - "(format version %02x %02x %02x %02x) is not recognized\n", - pInfo->dataFormat[0], pInfo->dataFormat[1], - pInfo->dataFormat[2], pInfo->dataFormat[3], - pInfo->formatVersion[0], pInfo->formatVersion[1], - pInfo->formatVersion[2], pInfo->formatVersion[3]); - *status=U_UNSUPPORTED_ERROR; - return 0; - } - - // - // Swap the data header. (This is the generic ICU Data Header, not the uspoof Specific - // header). This swap also conveniently gets us - // the size of the ICU d.h., which lets us locate the start - // of the uspoof specific data. - // - int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status); - - - // - // Get the Spoof Data Header, and check that it appears to be OK. - // - // - const uint8_t *inBytes =(const uint8_t *)inData+headerSize; - SpoofDataHeader *spoofDH = (SpoofDataHeader *)inBytes; - if (ds->readUInt32(spoofDH->fMagic) != USPOOF_MAGIC || - ds->readUInt32(spoofDH->fLength) < sizeof(SpoofDataHeader)) - { - udata_printError(ds, "uspoof_swap(): Spoof Data header is invalid.\n"); - *status=U_UNSUPPORTED_ERROR; - return 0; - } - - // - // Prefight operation? Just return the size - // - int32_t spoofDataLength = ds->readUInt32(spoofDH->fLength); - int32_t totalSize = headerSize + spoofDataLength; - if (length < 0) { - return totalSize; - } - - // - // Check that length passed in is consistent with length from Spoof data header. - // - if (length < totalSize) { - udata_printError(ds, "uspoof_swap(): too few bytes (%d after ICU Data header) for spoof data.\n", - spoofDataLength); - *status=U_INDEX_OUTOFBOUNDS_ERROR; - return 0; - } - - - // - // Swap the Data. Do the data itself first, then the Spoof Data Header, because - // we need to reference the header to locate the data, and an - // inplace swap of the header leaves it unusable. - // - uint8_t *outBytes = (uint8_t *)outData + headerSize; - SpoofDataHeader *outputDH = (SpoofDataHeader *)outBytes; - - int32_t sectionStart; - int32_t sectionLength; - - // - // If not swapping in place, zero out the output buffer before starting. - // Gaps may exist between the individual sections, and these must be zeroed in - // the output buffer. The simplest way to do that is to just zero the whole thing. - // - if (inBytes != outBytes) { - uprv_memset(outBytes, 0, spoofDataLength); - } - - // Confusables Keys Section (fCFUKeys) - sectionStart = ds->readUInt32(spoofDH->fCFUKeys); - sectionLength = ds->readUInt32(spoofDH->fCFUKeysSize) * 4; - ds->swapArray32(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); - - // String Index Section - sectionStart = ds->readUInt32(spoofDH->fCFUStringIndex); - sectionLength = ds->readUInt32(spoofDH->fCFUStringIndexSize) * 2; - ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); - - // String Table Section - sectionStart = ds->readUInt32(spoofDH->fCFUStringTable); - sectionLength = ds->readUInt32(spoofDH->fCFUStringTableLen) * 2; - ds->swapArray16(ds, inBytes+sectionStart, sectionLength, outBytes+sectionStart, status); - - // And, last, swap the header itself. - // int32_t fMagic // swap this - // uint8_t fFormatVersion[4] // Do not swap this, just copy - // int32_t fLength and all the rest // Swap the rest, all is 32 bit stuff. - // - uint32_t magic = ds->readUInt32(spoofDH->fMagic); - ds->writeUInt32((uint32_t *)&outputDH->fMagic, magic); - - if (outputDH->fFormatVersion != spoofDH->fFormatVersion) { - uprv_memcpy(outputDH->fFormatVersion, spoofDH->fFormatVersion, sizeof(spoofDH->fFormatVersion)); - } - // swap starting at fLength - ds->swapArray32(ds, &spoofDH->fLength, sizeof(SpoofDataHeader)-8 /* minus magic and fFormatVersion[4] */, &outputDH->fLength, status); - - return totalSize; -} - -#endif |