diff options
Diffstat (limited to 'deps/icu-small/source/i18n/uspoof_wsconf.cpp')
-rw-r--r-- | deps/icu-small/source/i18n/uspoof_wsconf.cpp | 435 |
1 files changed, 435 insertions, 0 deletions
diff --git a/deps/icu-small/source/i18n/uspoof_wsconf.cpp b/deps/icu-small/source/i18n/uspoof_wsconf.cpp new file mode 100644 index 0000000000..ad73ed690c --- /dev/null +++ b/deps/icu-small/source/i18n/uspoof_wsconf.cpp @@ -0,0 +1,435 @@ +/* +****************************************************************************** +* +* Copyright (C) 2008-2013, International Business Machines +* Corporation and others. All Rights Reserved. +* +****************************************************************************** +* file name: uspoof_wsconf.cpp +* encoding: US-ASCII +* tab size: 8 (not used) +* indentation:4 +* +* created on: 2009Jan05 (refactoring earlier files) +* created by: Andy Heninger +* +* Internal functions for compililing Whole Script confusable source data +* into its binary (runtime) form. The binary data format is described +* in uspoof_impl.h +*/ + +#include "unicode/utypes.h" +#include "unicode/uspoof.h" + +#if !UCONFIG_NO_NORMALIZATION + +#if !UCONFIG_NO_REGULAR_EXPRESSIONS + +#include "unicode/unorm.h" +#include "unicode/uregex.h" +#include "unicode/ustring.h" +#include "cmemory.h" +#include "scriptset.h" +#include "uspoof_impl.h" +#include "uhash.h" +#include "uvector.h" +#include "uassert.h" +#include "uspoof_wsconf.h" + +U_NAMESPACE_USE + + +// Regular expression for parsing a line from the Unicode file confusablesWholeScript.txt +// Example Lines: +// 006F ; Latn; Deva; A # (o) LATIN SMALL LETTER O +// 0048..0049 ; Latn; Grek; A # [2] (H..I) LATIN CAPITAL LETTER H..LATIN CAPITAL LETTER I +// | | | | +// | | | |---- Which table, Any Case or Lower Case (A or L) +// | | |----------Target script. We need this. +// | |----------------Src script. Should match the script of the source +// | code points. Beyond checking that, we don't keep it. +// |--------------------------------Source code points or range. +// +// The expression will match _all_ lines, including erroneous lines. +// The result of the parse is returned via the contents of the (match) groups. +static const char *parseExp = + "(?m)" // Multi-line mode + "^([ \\t]*(?:#.*?)?)$" // A blank or comment line. Matches Group 1. + "|^(?:" // OR + "\\s*([0-9A-F]{4,})(?:..([0-9A-F]{4,}))?\\s*;" // Code point range. Groups 2 and 3. + "\\s*([A-Za-z]+)\\s*;" // The source script. Group 4. + "\\s*([A-Za-z]+)\\s*;" // The target script. Group 5. + "\\s*(?:(A)|(L))" // The table A or L. Group 6 or 7 + "[ \\t]*(?:#.*?)?" // Trailing commment + ")$|" // OR + "^(.*?)$"; // An error line. Group 8. + // Any line not matching the preceding + // parts of the expression.will match + // this, and thus be flagged as an error + + +// Extract a regular expression match group into a char * string. +// The group must contain only invariant characters. +// Used for script names +// +static void extractGroup( + URegularExpression *e, int32_t group, char *destBuf, int32_t destCapacity, UErrorCode &status) { + + UChar ubuf[50]; + ubuf[0] = 0; + destBuf[0] = 0; + int32_t len = uregex_group(e, group, ubuf, 50, &status); + if (U_FAILURE(status) || len == -1 || len >= destCapacity) { + return; + } + UnicodeString s(FALSE, ubuf, len); // Aliasing constructor + s.extract(0, len, destBuf, destCapacity, US_INV); +} + + + +U_NAMESPACE_BEGIN + +// Build the Whole Script Confusable data +// +// TODO: Reorganize. Either get rid of the WSConfusableDataBuilder class, +// because everything is local to this one build function anyhow, +// OR +// break this function into more reasonably sized pieces, with +// state in WSConfusableDataBuilder. +// +void buildWSConfusableData(SpoofImpl *spImpl, const char * confusablesWS, + int32_t confusablesWSLen, UParseError *pe, UErrorCode &status) +{ + if (U_FAILURE(status)) { + return; + } + URegularExpression *parseRegexp = NULL; + int32_t inputLen = 0; + UChar *input = NULL; + int32_t lineNum = 0; + + UVector *scriptSets = NULL; + uint32_t rtScriptSetsCount = 2; + + UTrie2 *anyCaseTrie = NULL; + UTrie2 *lowerCaseTrie = NULL; + + anyCaseTrie = utrie2_open(0, 0, &status); + lowerCaseTrie = utrie2_open(0, 0, &status); + + UnicodeString pattern(parseExp, -1, US_INV); + + // The scriptSets vector provides a mapping from TRIE values to the set of scripts. + // + // Reserved TRIE values: + // 0: Code point has no whole script confusables. + // 1: Code point is of script Common or Inherited. + // These code points do not participate in whole script confusable detection. + // (This is logically equivalent to saying that they contain confusables in + // all scripts) + // + // Because Trie values are indexes into the ScriptSets vector, pre-fill + // vector positions 0 and 1 to avoid conflicts with the reserved values. + + scriptSets = new UVector(status); + if (scriptSets == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + goto cleanup; + } + scriptSets->addElement((void *)NULL, status); + scriptSets->addElement((void *)NULL, status); + + // Convert the user input data from UTF-8 to UChar (UTF-16) + u_strFromUTF8(NULL, 0, &inputLen, confusablesWS, confusablesWSLen, &status); + if (status != U_BUFFER_OVERFLOW_ERROR) { + goto cleanup; + } + status = U_ZERO_ERROR; + input = static_cast<UChar *>(uprv_malloc((inputLen+1) * sizeof(UChar))); + if (input == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + goto cleanup; + } + u_strFromUTF8(input, inputLen+1, NULL, confusablesWS, confusablesWSLen, &status); + + parseRegexp = uregex_open(pattern.getBuffer(), pattern.length(), 0, NULL, &status); + + // Zap any Byte Order Mark at the start of input. Changing it to a space is benign + // given the syntax of the input. + if (*input == 0xfeff) { + *input = 0x20; + } + + // Parse the input, one line per iteration of this loop. + uregex_setText(parseRegexp, input, inputLen, &status); + while (uregex_findNext(parseRegexp, &status)) { + lineNum++; + if (uregex_start(parseRegexp, 1, &status) >= 0) { + // this was a blank or comment line. + continue; + } + if (uregex_start(parseRegexp, 8, &status) >= 0) { + // input file syntax error. + status = U_PARSE_ERROR; + goto cleanup; + } + if (U_FAILURE(status)) { + goto cleanup; + } + + // Pick up the start and optional range end code points from the parsed line. + UChar32 startCodePoint = SpoofImpl::ScanHex( + input, uregex_start(parseRegexp, 2, &status), uregex_end(parseRegexp, 2, &status), status); + UChar32 endCodePoint = startCodePoint; + if (uregex_start(parseRegexp, 3, &status) >=0) { + endCodePoint = SpoofImpl::ScanHex( + input, uregex_start(parseRegexp, 3, &status), uregex_end(parseRegexp, 3, &status), status); + } + + // Extract the two script names from the source line. We need these in an 8 bit + // default encoding (will be EBCDIC on IBM mainframes) in order to pass them on + // to the ICU u_getPropertyValueEnum() function. Ugh. + char srcScriptName[20]; + char targScriptName[20]; + extractGroup(parseRegexp, 4, srcScriptName, sizeof(srcScriptName), status); + extractGroup(parseRegexp, 5, targScriptName, sizeof(targScriptName), status); + UScriptCode srcScript = + static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, srcScriptName)); + UScriptCode targScript = + static_cast<UScriptCode>(u_getPropertyValueEnum(UCHAR_SCRIPT, targScriptName)); + if (U_FAILURE(status)) { + goto cleanup; + } + if (srcScript == USCRIPT_INVALID_CODE || targScript == USCRIPT_INVALID_CODE) { + status = U_INVALID_FORMAT_ERROR; + goto cleanup; + } + + // select the table - (A) any case or (L) lower case only + UTrie2 *table = anyCaseTrie; + if (uregex_start(parseRegexp, 7, &status) >= 0) { + table = lowerCaseTrie; + } + + // Build the set of scripts containing confusable characters for + // the code point(s) specified in this input line. + // Sanity check that the script of the source code point is the same + // as the source script indicated in the input file. Failure of this check is + // an error in the input file. + // Include the source script in the set (needed for Mixed Script Confusable detection). + // + UChar32 cp; + for (cp=startCodePoint; cp<=endCodePoint; cp++) { + int32_t setIndex = utrie2_get32(table, cp); + BuilderScriptSet *bsset = NULL; + if (setIndex > 0) { + U_ASSERT(setIndex < scriptSets->size()); + bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(setIndex)); + } else { + bsset = new BuilderScriptSet(); + if (bsset == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + goto cleanup; + } + bsset->codePoint = cp; + bsset->trie = table; + bsset->sset = new ScriptSet(); + setIndex = scriptSets->size(); + bsset->index = setIndex; + bsset->rindex = 0; + if (bsset->sset == NULL) { + status = U_MEMORY_ALLOCATION_ERROR; + goto cleanup; + } + scriptSets->addElement(bsset, status); + utrie2_set32(table, cp, setIndex, &status); + } + bsset->sset->set(targScript, status); + bsset->sset->set(srcScript, status); + + if (U_FAILURE(status)) { + goto cleanup; + } + UScriptCode cpScript = uscript_getScript(cp, &status); + if (cpScript != srcScript) { + status = U_INVALID_FORMAT_ERROR; + goto cleanup; + } + } + } + + // Eliminate duplicate script sets. At this point we have a separate + // script set for every code point that had data in the input file. + // + // We eliminate underlying ScriptSet objects, not the BuildScriptSets that wrap them + // + // printf("Number of scriptSets: %d\n", scriptSets->size()); + { + int32_t duplicateCount = 0; + rtScriptSetsCount = 2; + for (int32_t outeri=2; outeri<scriptSets->size(); outeri++) { + BuilderScriptSet *outerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(outeri)); + if (outerSet->index != static_cast<uint32_t>(outeri)) { + // This set was already identified as a duplicate. + // It will not be allocated a position in the runtime array of ScriptSets. + continue; + } + outerSet->rindex = rtScriptSetsCount++; + for (int32_t inneri=outeri+1; inneri<scriptSets->size(); inneri++) { + BuilderScriptSet *innerSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(inneri)); + if (*(outerSet->sset) == *(innerSet->sset) && outerSet->sset != innerSet->sset) { + delete innerSet->sset; + innerSet->scriptSetOwned = FALSE; + innerSet->sset = outerSet->sset; + innerSet->index = outeri; + innerSet->rindex = outerSet->rindex; + duplicateCount++; + } + // But this doesn't get all. We need to fix the TRIE. + } + } + // printf("Number of distinct script sets: %d\n", rtScriptSetsCount); + } + + + + // Update the Trie values to be reflect the run time script indexes (after duplicate merging). + // (Trie Values 0 and 1 are reserved, and the corresponding slots in scriptSets + // are unused, which is why the loop index starts at 2.) + { + for (int32_t i=2; i<scriptSets->size(); i++) { + BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); + if (bSet->rindex != (uint32_t)i) { + utrie2_set32(bSet->trie, bSet->codePoint, bSet->rindex, &status); + } + } + } + + // For code points with script==Common or script==Inherited, + // Set the reserved value of 1 into both Tries. These characters do not participate + // in Whole Script Confusable detection; this reserved value is the means + // by which they are detected. + { + UnicodeSet ignoreSet; + ignoreSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_COMMON, status); + UnicodeSet inheritedSet; + inheritedSet.applyIntPropertyValue(UCHAR_SCRIPT, USCRIPT_INHERITED, status); + ignoreSet.addAll(inheritedSet); + for (int32_t rn=0; rn<ignoreSet.getRangeCount(); rn++) { + UChar32 rangeStart = ignoreSet.getRangeStart(rn); + UChar32 rangeEnd = ignoreSet.getRangeEnd(rn); + utrie2_setRange32(anyCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); + utrie2_setRange32(lowerCaseTrie, rangeStart, rangeEnd, 1, TRUE, &status); + } + } + + // Serialize the data to the Spoof Detector + { + utrie2_freeze(anyCaseTrie, UTRIE2_16_VALUE_BITS, &status); + int32_t size = utrie2_serialize(anyCaseTrie, NULL, 0, &status); + // printf("Any case Trie size: %d\n", size); + if (status != U_BUFFER_OVERFLOW_ERROR) { + goto cleanup; + } + status = U_ZERO_ERROR; + spImpl->fSpoofData->fRawData->fAnyCaseTrie = spImpl->fSpoofData->fMemLimit; + spImpl->fSpoofData->fRawData->fAnyCaseTrieLength = size; + spImpl->fSpoofData->fAnyCaseTrie = anyCaseTrie; + void *where = spImpl->fSpoofData->reserveSpace(size, status); + utrie2_serialize(anyCaseTrie, where, size, &status); + + utrie2_freeze(lowerCaseTrie, UTRIE2_16_VALUE_BITS, &status); + size = utrie2_serialize(lowerCaseTrie, NULL, 0, &status); + // printf("Lower case Trie size: %d\n", size); + if (status != U_BUFFER_OVERFLOW_ERROR) { + goto cleanup; + } + status = U_ZERO_ERROR; + spImpl->fSpoofData->fRawData->fLowerCaseTrie = spImpl->fSpoofData->fMemLimit; + spImpl->fSpoofData->fRawData->fLowerCaseTrieLength = size; + spImpl->fSpoofData->fLowerCaseTrie = lowerCaseTrie; + where = spImpl->fSpoofData->reserveSpace(size, status); + utrie2_serialize(lowerCaseTrie, where, size, &status); + + spImpl->fSpoofData->fRawData->fScriptSets = spImpl->fSpoofData->fMemLimit; + spImpl->fSpoofData->fRawData->fScriptSetsLength = rtScriptSetsCount; + ScriptSet *rtScriptSets = static_cast<ScriptSet *> + (spImpl->fSpoofData->reserveSpace(rtScriptSetsCount * sizeof(ScriptSet), status)); + uint32_t rindex = 2; + for (int32_t i=2; i<scriptSets->size(); i++) { + BuilderScriptSet *bSet = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); + if (bSet->rindex < rindex) { + // We have already copied this script set to the serialized data. + continue; + } + U_ASSERT(rindex == bSet->rindex); + rtScriptSets[rindex] = *bSet->sset; // Assignment of a ScriptSet just copies the bits. + rindex++; + } + } + + // Open new utrie2s from the serialized data. We don't want to keep the ones + // we just built because we would then have two copies of the data, one internal to + // the utries that we have already constructed, and one in the serialized data area. + // An alternative would be to not pre-serialize the Trie data, but that makes the + // spoof detector data different, depending on how the detector was constructed. + // It's simpler to keep the data always the same. + + spImpl->fSpoofData->fAnyCaseTrie = utrie2_openFromSerialized( + UTRIE2_16_VALUE_BITS, + (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fAnyCaseTrie, + spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, + NULL, + &status); + + spImpl->fSpoofData->fLowerCaseTrie = utrie2_openFromSerialized( + UTRIE2_16_VALUE_BITS, + (const char *)spImpl->fSpoofData->fRawData + spImpl->fSpoofData->fRawData->fLowerCaseTrie, + spImpl->fSpoofData->fRawData->fAnyCaseTrieLength, + NULL, + &status); + + + +cleanup: + if (U_FAILURE(status)) { + pe->line = lineNum; + } + uregex_close(parseRegexp); + uprv_free(input); + + int32_t i; + if (scriptSets != NULL) { + for (i=0; i<scriptSets->size(); i++) { + BuilderScriptSet *bsset = static_cast<BuilderScriptSet *>(scriptSets->elementAt(i)); + delete bsset; + } + delete scriptSets; + } + utrie2_close(anyCaseTrie); + utrie2_close(lowerCaseTrie); + return; +} + +U_NAMESPACE_END + + + +BuilderScriptSet::BuilderScriptSet() { + codePoint = -1; + trie = NULL; + sset = NULL; + index = 0; + rindex = 0; + scriptSetOwned = TRUE; +} + +BuilderScriptSet::~BuilderScriptSet() { + if (scriptSetOwned) { + delete sset; + } +} + +#endif +#endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |