diff options
Diffstat (limited to 'deps/icu-small/source/i18n/identifier_info.cpp')
-rw-r--r-- | deps/icu-small/source/i18n/identifier_info.cpp | 310 |
1 files changed, 310 insertions, 0 deletions
diff --git a/deps/icu-small/source/i18n/identifier_info.cpp b/deps/icu-small/source/i18n/identifier_info.cpp new file mode 100644 index 0000000000..05882830a5 --- /dev/null +++ b/deps/icu-small/source/i18n/identifier_info.cpp @@ -0,0 +1,310 @@ +/* +********************************************************************** +* Copyright (C) 2012-2014, International Business Machines +* Corporation and others. All Rights Reserved. +********************************************************************** +*/ + +#include "unicode/utypes.h" + +#include "unicode/uchar.h" +#include "unicode/utf16.h" + +#include "identifier_info.h" +#include "mutex.h" +#include "scriptset.h" +#include "ucln_in.h" +#include "uvector.h" + +U_NAMESPACE_BEGIN + +static UnicodeSet *ASCII; +static ScriptSet *JAPANESE; +static ScriptSet *CHINESE; +static ScriptSet *KOREAN; +static ScriptSet *CONFUSABLE_WITH_LATIN; +static UInitOnce gIdentifierInfoInitOnce = U_INITONCE_INITIALIZER; + + +U_CDECL_BEGIN +static UBool U_CALLCONV +IdentifierInfo_cleanup(void) { + delete ASCII; + ASCII = NULL; + delete JAPANESE; + JAPANESE = NULL; + delete CHINESE; + CHINESE = NULL; + delete KOREAN; + KOREAN = NULL; + delete CONFUSABLE_WITH_LATIN; + CONFUSABLE_WITH_LATIN = NULL; + gIdentifierInfoInitOnce.reset(); + return TRUE; +} + +static void U_CALLCONV +IdentifierInfo_init(UErrorCode &status) { + ASCII = new UnicodeSet(0, 0x7f); + JAPANESE = new ScriptSet(); + CHINESE = new ScriptSet(); + KOREAN = new ScriptSet(); + CONFUSABLE_WITH_LATIN = new ScriptSet(); + if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL + || CONFUSABLE_WITH_LATIN == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + return; + } + ASCII->freeze(); + JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status) + .set(USCRIPT_KATAKANA, status); + CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status); + KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status); + CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status) + .set(USCRIPT_CHEROKEE, status); + ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup); +} +U_CDECL_END + + +IdentifierInfo::IdentifierInfo(UErrorCode &status): + fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL), + fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) { + umtx_initOnce(gIdentifierInfoInitOnce, &IdentifierInfo_init, status); + if (U_FAILURE(status)) { + return; + } + + fIdentifier = new UnicodeString(); + fRequiredScripts = new ScriptSet(); + fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status); + uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet); + fCommonAmongAlternates = new ScriptSet(); + fNumerics = new UnicodeSet(); + fIdentifierProfile = new UnicodeSet(0, 0x10FFFF); + + if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL || + fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) { + status = U_MEMORY_ALLOCATION_ERROR; + } +} + +IdentifierInfo::~IdentifierInfo() { + delete fIdentifier; + delete fRequiredScripts; + uhash_close(fScriptSetSet); + delete fCommonAmongAlternates; + delete fNumerics; + delete fIdentifierProfile; +} + + +IdentifierInfo &IdentifierInfo::clear() { + fRequiredScripts->resetAll(); + uhash_removeAll(fScriptSetSet); + fNumerics->clear(); + fCommonAmongAlternates->resetAll(); + return *this; +} + + +IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) { + *fIdentifierProfile = identifierProfile; + return *this; +} + + +const UnicodeSet &IdentifierInfo::getIdentifierProfile() const { + return *fIdentifierProfile; +} + + +IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) { + if (U_FAILURE(status)) { + return *this; + } + *fIdentifier = identifier; + clear(); + ScriptSet scriptsForCP; + UChar32 cp; + for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) { + cp = identifier.char32At(i); + // Store a representative character for each kind of decimal digit + if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) { + // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value + fNumerics->add(cp - (UChar32)u_getNumericValue(cp)); + } + UScriptCode extensions[500]; + int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, UPRV_LENGTHOF(extensions), &status); + if (U_FAILURE(status)) { + return *this; + } + scriptsForCP.resetAll(); + for (int32_t j=0; j<extensionsCount; j++) { + scriptsForCP.set(extensions[j], status); + } + scriptsForCP.reset(USCRIPT_COMMON, status); + scriptsForCP.reset(USCRIPT_INHERITED, status); + switch (scriptsForCP.countMembers()) { + case 0: break; + case 1: + // Single script, record it. + fRequiredScripts->Union(scriptsForCP); + break; + default: + if (!fRequiredScripts->intersects(scriptsForCP) + && !uhash_geti(fScriptSetSet, &scriptsForCP)) { + // If the set hasn't been added already, add it + // (Add a copy, fScriptSetSet takes ownership of the copy.) + uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status); + } + break; + } + } + // Now make a final pass through ScriptSetSet to remove alternates that came before singles. + // [Kana], [Kana Hira] => [Kana] + // This is relatively infrequent, so doesn't have to be optimized. + // We also compute any commonalities among the alternates. + if (uhash_count(fScriptSetSet) > 0) { + fCommonAmongAlternates->setAll(); + for (int32_t it = UHASH_FIRST;;) { + const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it); + if (nextHashEl == NULL) { + break; + } + ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer); + // [Kana], [Kana Hira] => [Kana] + if (fRequiredScripts->intersects(*next)) { + uhash_removeElement(fScriptSetSet, nextHashEl); + } else { + fCommonAmongAlternates->intersect(*next); + // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]] + for (int32_t otherIt = UHASH_FIRST;;) { + const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt); + if (otherHashEl == NULL) { + break; + } + ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer); + if (next != other && next->contains(*other)) { + uhash_removeElement(fScriptSetSet, nextHashEl); + break; + } + } + } + } + } + if (uhash_count(fScriptSetSet) == 0) { + fCommonAmongAlternates->resetAll(); + } + return *this; +} + + +const UnicodeString *IdentifierInfo::getIdentifier() const { + return fIdentifier; +} + +const ScriptSet *IdentifierInfo::getScripts() const { + return fRequiredScripts; +} + +const UHashtable *IdentifierInfo::getAlternates() const { + return fScriptSetSet; +} + + +const UnicodeSet *IdentifierInfo::getNumerics() const { + return fNumerics; +} + +const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const { + return fCommonAmongAlternates; +} + +#if !UCONFIG_NO_NORMALIZATION + +URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const { + if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) { + return USPOOF_UNRESTRICTIVE; + } + if (ASCII->containsAll(*fIdentifier)) { + return USPOOF_ASCII; + } + // This is a bit tricky. We look at a number of factors. + // The number of scripts in the text. + // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc]) + // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.) + + // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the + // time it is created, in setIdentifier(). + int32_t cardinalityPlus = fRequiredScripts->countMembers() + + (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); + if (cardinalityPlus < 2) { + return USPOOF_SINGLE_SCRIPT_RESTRICTIVE; + } + if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts) + || containsWithAlternates(*KOREAN, *fRequiredScripts)) { + return USPOOF_HIGHLY_RESTRICTIVE; + } + if (cardinalityPlus == 2 && + fRequiredScripts->test(USCRIPT_LATIN, status) && + !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) { + return USPOOF_MODERATELY_RESTRICTIVE; + } + return USPOOF_MINIMALLY_RESTRICTIVE; +} + +#endif /* !UCONFIG_NO_NORMALIZATION */ + +int32_t IdentifierInfo::getScriptCount() const { + // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts. + int32_t count = fRequiredScripts->countMembers() + + (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); + return count; +} + + + +UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const { + if (!container.contains(containee)) { + return FALSE; + } + for (int32_t iter = UHASH_FIRST; ;) { + const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter); + if (hashEl == NULL) { + break; + } + ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer); + if (!container.intersects(*alternatives)) { + return false; + } + } + return true; +} + +UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) { + UVector sorted(status); + if (U_FAILURE(status)) { + return dest; + } + for (int32_t pos = UHASH_FIRST; ;) { + const UHashElement *el = uhash_nextElement(alternates, &pos); + if (el == NULL) { + break; + } + ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer); + sorted.addElement(ss, status); + } + sorted.sort(uhash_compareScriptSet, status); + UnicodeString separator = UNICODE_STRING_SIMPLE("; "); + for (int32_t i=0; i<sorted.size(); i++) { + if (i>0) { + dest.append(separator); + } + ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i)); + ss->displayScripts(dest); + } + return dest; +} + +U_NAMESPACE_END |