diff options
Diffstat (limited to 'deps/node/deps/icu-small/source/common/rbbi.cpp')
-rw-r--r-- | deps/node/deps/icu-small/source/common/rbbi.cpp | 1274 |
1 files changed, 0 insertions, 1274 deletions
diff --git a/deps/node/deps/icu-small/source/common/rbbi.cpp b/deps/node/deps/icu-small/source/common/rbbi.cpp deleted file mode 100644 index cb376650..00000000 --- a/deps/node/deps/icu-small/source/common/rbbi.cpp +++ /dev/null @@ -1,1274 +0,0 @@ -// © 2016 and later: Unicode, Inc. and others. -// License & terms of use: http://www.unicode.org/copyright.html -/* -*************************************************************************** -* Copyright (C) 1999-2016 International Business Machines Corporation -* and others. All rights reserved. -*************************************************************************** -*/ -// -// file: rbbi.cpp Contains the implementation of the rule based break iterator -// runtime engine and the API implementation for -// class RuleBasedBreakIterator -// - -#include "utypeinfo.h" // for 'typeid' to work - -#include "unicode/utypes.h" - -#if !UCONFIG_NO_BREAK_ITERATION - -#include <cinttypes> - -#include "unicode/rbbi.h" -#include "unicode/schriter.h" -#include "unicode/uchriter.h" -#include "unicode/uclean.h" -#include "unicode/udata.h" - -#include "brkeng.h" -#include "ucln_cmn.h" -#include "cmemory.h" -#include "cstring.h" -#include "rbbidata.h" -#include "rbbi_cache.h" -#include "rbbirb.h" -#include "uassert.h" -#include "umutex.h" -#include "uvectr32.h" - -// if U_LOCAL_SERVICE_HOOK is defined, then localsvc.cpp is expected to be included. -#if U_LOCAL_SERVICE_HOOK -#include "localsvc.h" -#endif - -#ifdef RBBI_DEBUG -static UBool gTrace = FALSE; -#endif - -U_NAMESPACE_BEGIN - -// The state number of the starting state -constexpr int32_t START_STATE = 1; - -// The state-transition value indicating "stop" -constexpr int32_t STOP_STATE = 0; - - -UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RuleBasedBreakIterator) - - -//======================================================================= -// constructors -//======================================================================= - -/** - * Constructs a RuleBasedBreakIterator that uses the already-created - * tables object that is passed in as a parameter. - */ -RuleBasedBreakIterator::RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status) - : fSCharIter(UnicodeString()) -{ - init(status); - fData = new RBBIDataWrapper(data, status); // status checked in constructor - if (U_FAILURE(status)) {return;} - if(fData == 0) { - status = U_MEMORY_ALLOCATION_ERROR; - return; - } -} - -// -// Construct from precompiled binary rules (tables). This constructor is public API, -// taking the rules as a (const uint8_t *) to match the type produced by getBinaryRules(). -// -RuleBasedBreakIterator::RuleBasedBreakIterator(const uint8_t *compiledRules, - uint32_t ruleLength, - UErrorCode &status) - : fSCharIter(UnicodeString()) -{ - init(status); - if (U_FAILURE(status)) { - return; - } - if (compiledRules == NULL || ruleLength < sizeof(RBBIDataHeader)) { - status = U_ILLEGAL_ARGUMENT_ERROR; - return; - } - const RBBIDataHeader *data = (const RBBIDataHeader *)compiledRules; - if (data->fLength > ruleLength) { - status = U_ILLEGAL_ARGUMENT_ERROR; - return; - } - fData = new RBBIDataWrapper(data, RBBIDataWrapper::kDontAdopt, status); - if (U_FAILURE(status)) {return;} - if(fData == 0) { - status = U_MEMORY_ALLOCATION_ERROR; - return; - } -} - - -//------------------------------------------------------------------------------- -// -// Constructor from a UDataMemory handle to precompiled break rules -// stored in an ICU data file. -// -//------------------------------------------------------------------------------- -RuleBasedBreakIterator::RuleBasedBreakIterator(UDataMemory* udm, UErrorCode &status) - : fSCharIter(UnicodeString()) -{ - init(status); - fData = new RBBIDataWrapper(udm, status); // status checked in constructor - if (U_FAILURE(status)) {return;} - if(fData == 0) { - status = U_MEMORY_ALLOCATION_ERROR; - return; - } -} - - - -//------------------------------------------------------------------------------- -// -// Constructor from a set of rules supplied as a string. -// -//------------------------------------------------------------------------------- -RuleBasedBreakIterator::RuleBasedBreakIterator( const UnicodeString &rules, - UParseError &parseError, - UErrorCode &status) - : fSCharIter(UnicodeString()) -{ - init(status); - if (U_FAILURE(status)) {return;} - RuleBasedBreakIterator *bi = (RuleBasedBreakIterator *) - RBBIRuleBuilder::createRuleBasedBreakIterator(rules, &parseError, status); - // Note: This is a bit awkward. The RBBI ruleBuilder has a factory method that - // creates and returns a complete RBBI. From here, in a constructor, we - // can't just return the object created by the builder factory, hence - // the assignment of the factory created object to "this". - if (U_SUCCESS(status)) { - *this = *bi; - delete bi; - } -} - - -//------------------------------------------------------------------------------- -// -// Default Constructor. Create an empty shell that can be set up later. -// Used when creating a RuleBasedBreakIterator from a set -// of rules. -//------------------------------------------------------------------------------- -RuleBasedBreakIterator::RuleBasedBreakIterator() - : fSCharIter(UnicodeString()) -{ - UErrorCode status = U_ZERO_ERROR; - init(status); -} - - -//------------------------------------------------------------------------------- -// -// Copy constructor. Will produce a break iterator with the same behavior, -// and which iterates over the same text, as the one passed in. -// -//------------------------------------------------------------------------------- -RuleBasedBreakIterator::RuleBasedBreakIterator(const RuleBasedBreakIterator& other) -: BreakIterator(other), - fSCharIter(UnicodeString()) -{ - UErrorCode status = U_ZERO_ERROR; - this->init(status); - *this = other; -} - - -/** - * Destructor - */ -RuleBasedBreakIterator::~RuleBasedBreakIterator() { - if (fCharIter != &fSCharIter) { - // fCharIter was adopted from the outside. - delete fCharIter; - } - fCharIter = NULL; - - utext_close(&fText); - - if (fData != NULL) { - fData->removeReference(); - fData = NULL; - } - delete fBreakCache; - fBreakCache = NULL; - - delete fDictionaryCache; - fDictionaryCache = NULL; - - delete fLanguageBreakEngines; - fLanguageBreakEngines = NULL; - - delete fUnhandledBreakEngine; - fUnhandledBreakEngine = NULL; -} - -/** - * Assignment operator. Sets this iterator to have the same behavior, - * and iterate over the same text, as the one passed in. - */ -RuleBasedBreakIterator& -RuleBasedBreakIterator::operator=(const RuleBasedBreakIterator& that) { - if (this == &that) { - return *this; - } - BreakIterator::operator=(that); - - if (fLanguageBreakEngines != NULL) { - delete fLanguageBreakEngines; - fLanguageBreakEngines = NULL; // Just rebuild for now - } - // TODO: clone fLanguageBreakEngines from "that" - UErrorCode status = U_ZERO_ERROR; - utext_clone(&fText, &that.fText, FALSE, TRUE, &status); - - if (fCharIter != &fSCharIter) { - delete fCharIter; - } - fCharIter = &fSCharIter; - - if (that.fCharIter != NULL && that.fCharIter != &that.fSCharIter) { - // This is a little bit tricky - it will intially appear that - // this->fCharIter is adopted, even if that->fCharIter was - // not adopted. That's ok. - fCharIter = that.fCharIter->clone(); - } - fSCharIter = that.fSCharIter; - if (fCharIter == NULL) { - fCharIter = &fSCharIter; - } - - if (fData != NULL) { - fData->removeReference(); - fData = NULL; - } - if (that.fData != NULL) { - fData = that.fData->addReference(); - } - - fPosition = that.fPosition; - fRuleStatusIndex = that.fRuleStatusIndex; - fDone = that.fDone; - - // TODO: both the dictionary and the main cache need to be copied. - // Current position could be within a dictionary range. Trying to continue - // the iteration without the caches present would go to the rules, with - // the assumption that the current position is on a rule boundary. - fBreakCache->reset(fPosition, fRuleStatusIndex); - fDictionaryCache->reset(); - - return *this; -} - - - -//----------------------------------------------------------------------------- -// -// init() Shared initialization routine. Used by all the constructors. -// Initializes all fields, leaving the object in a consistent state. -// -//----------------------------------------------------------------------------- -void RuleBasedBreakIterator::init(UErrorCode &status) { - fCharIter = NULL; - fData = NULL; - fPosition = 0; - fRuleStatusIndex = 0; - fDone = false; - fDictionaryCharCount = 0; - fLanguageBreakEngines = NULL; - fUnhandledBreakEngine = NULL; - fBreakCache = NULL; - fDictionaryCache = NULL; - - // Note: IBM xlC is unable to assign or initialize member fText from UTEXT_INITIALIZER. - // fText = UTEXT_INITIALIZER; - static const UText initializedUText = UTEXT_INITIALIZER; - uprv_memcpy(&fText, &initializedUText, sizeof(UText)); - - if (U_FAILURE(status)) { - return; - } - - utext_openUChars(&fText, NULL, 0, &status); - fDictionaryCache = new DictionaryCache(this, status); - fBreakCache = new BreakCache(this, status); - if (U_SUCCESS(status) && (fDictionaryCache == NULL || fBreakCache == NULL)) { - status = U_MEMORY_ALLOCATION_ERROR; - } - -#ifdef RBBI_DEBUG - static UBool debugInitDone = FALSE; - if (debugInitDone == FALSE) { - char *debugEnv = getenv("U_RBBIDEBUG"); - if (debugEnv && uprv_strstr(debugEnv, "trace")) { - gTrace = TRUE; - } - debugInitDone = TRUE; - } -#endif -} - - - -//----------------------------------------------------------------------------- -// -// clone - Returns a newly-constructed RuleBasedBreakIterator with the same -// behavior, and iterating over the same text, as this one. -// Virtual function: does the right thing with subclasses. -// -//----------------------------------------------------------------------------- -BreakIterator* -RuleBasedBreakIterator::clone(void) const { - return new RuleBasedBreakIterator(*this); -} - -/** - * Equality operator. Returns TRUE if both BreakIterators are of the - * same class, have the same behavior, and iterate over the same text. - */ -UBool -RuleBasedBreakIterator::operator==(const BreakIterator& that) const { - if (typeid(*this) != typeid(that)) { - return FALSE; - } - if (this == &that) { - return TRUE; - } - - // The base class BreakIterator carries no state that participates in equality, - // and does not implement an equality function that would otherwise be - // checked at this point. - - const RuleBasedBreakIterator& that2 = (const RuleBasedBreakIterator&) that; - - if (!utext_equals(&fText, &that2.fText)) { - // The two break iterators are operating on different text, - // or have a different iteration position. - // Note that fText's position is always the same as the break iterator's position. - return FALSE; - }; - - if (!(fPosition == that2.fPosition && - fRuleStatusIndex == that2.fRuleStatusIndex && - fDone == that2.fDone)) { - return FALSE; - } - - if (that2.fData == fData || - (fData != NULL && that2.fData != NULL && *that2.fData == *fData)) { - // The two break iterators are using the same rules. - return TRUE; - } - return FALSE; -} - -/** - * Compute a hash code for this BreakIterator - * @return A hash code - */ -int32_t -RuleBasedBreakIterator::hashCode(void) const { - int32_t hash = 0; - if (fData != NULL) { - hash = fData->hashCode(); - } - return hash; -} - - -void RuleBasedBreakIterator::setText(UText *ut, UErrorCode &status) { - if (U_FAILURE(status)) { - return; - } - fBreakCache->reset(); - fDictionaryCache->reset(); - utext_clone(&fText, ut, FALSE, TRUE, &status); - - // Set up a dummy CharacterIterator to be returned if anyone - // calls getText(). With input from UText, there is no reasonable - // way to return a characterIterator over the actual input text. - // Return one over an empty string instead - this is the closest - // we can come to signaling a failure. - // (GetText() is obsolete, this failure is sort of OK) - fSCharIter.setText(UnicodeString()); - - if (fCharIter != &fSCharIter) { - // existing fCharIter was adopted from the outside. Delete it now. - delete fCharIter; - } - fCharIter = &fSCharIter; - - this->first(); -} - - -UText *RuleBasedBreakIterator::getUText(UText *fillIn, UErrorCode &status) const { - UText *result = utext_clone(fillIn, &fText, FALSE, TRUE, &status); - return result; -} - - -//======================================================================= -// BreakIterator overrides -//======================================================================= - -/** - * Return a CharacterIterator over the text being analyzed. - */ -CharacterIterator& -RuleBasedBreakIterator::getText() const { - return *fCharIter; -} - -/** - * Set the iterator to analyze a new piece of text. This function resets - * the current iteration position to the beginning of the text. - * @param newText An iterator over the text to analyze. - */ -void -RuleBasedBreakIterator::adoptText(CharacterIterator* newText) { - // If we are holding a CharacterIterator adopted from a - // previous call to this function, delete it now. - if (fCharIter != &fSCharIter) { - delete fCharIter; - } - - fCharIter = newText; - UErrorCode status = U_ZERO_ERROR; - fBreakCache->reset(); - fDictionaryCache->reset(); - if (newText==NULL || newText->startIndex() != 0) { - // startIndex !=0 wants to be an error, but there's no way to report it. - // Make the iterator text be an empty string. - utext_openUChars(&fText, NULL, 0, &status); - } else { - utext_openCharacterIterator(&fText, newText, &status); - } - this->first(); -} - -/** - * Set the iterator to analyze a new piece of text. This function resets - * the current iteration position to the beginning of the text. - * @param newText An iterator over the text to analyze. - */ -void -RuleBasedBreakIterator::setText(const UnicodeString& newText) { - UErrorCode status = U_ZERO_ERROR; - fBreakCache->reset(); - fDictionaryCache->reset(); - utext_openConstUnicodeString(&fText, &newText, &status); - - // Set up a character iterator on the string. - // Needed in case someone calls getText(). - // Can not, unfortunately, do this lazily on the (probably never) - // call to getText(), because getText is const. - fSCharIter.setText(newText); - - if (fCharIter != &fSCharIter) { - // old fCharIter was adopted from the outside. Delete it. - delete fCharIter; - } - fCharIter = &fSCharIter; - - this->first(); -} - - -/** - * Provide a new UText for the input text. Must reference text with contents identical - * to the original. - * Intended for use with text data originating in Java (garbage collected) environments - * where the data may be moved in memory at arbitrary times. - */ -RuleBasedBreakIterator &RuleBasedBreakIterator::refreshInputText(UText *input, UErrorCode &status) { - if (U_FAILURE(status)) { - return *this; - } - if (input == NULL) { - status = U_ILLEGAL_ARGUMENT_ERROR; - return *this; - } - int64_t pos = utext_getNativeIndex(&fText); - // Shallow read-only clone of the new UText into the existing input UText - utext_clone(&fText, input, FALSE, TRUE, &status); - if (U_FAILURE(status)) { - return *this; - } - utext_setNativeIndex(&fText, pos); - if (utext_getNativeIndex(&fText) != pos) { - // Sanity check. The new input utext is supposed to have the exact same - // contents as the old. If we can't set to the same position, it doesn't. - // The contents underlying the old utext might be invalid at this point, - // so it's not safe to check directly. - status = U_ILLEGAL_ARGUMENT_ERROR; - } - return *this; -} - - -/** - * Sets the current iteration position to the beginning of the text, position zero. - * @return The new iterator position, which is zero. - */ -int32_t RuleBasedBreakIterator::first(void) { - UErrorCode status = U_ZERO_ERROR; - if (!fBreakCache->seek(0)) { - fBreakCache->populateNear(0, status); - } - fBreakCache->current(); - U_ASSERT(fPosition == 0); - return 0; -} - -/** - * Sets the current iteration position to the end of the text. - * @return The text's past-the-end offset. - */ -int32_t RuleBasedBreakIterator::last(void) { - int32_t endPos = (int32_t)utext_nativeLength(&fText); - UBool endShouldBeBoundary = isBoundary(endPos); // Has side effect of setting iterator position. - (void)endShouldBeBoundary; - U_ASSERT(endShouldBeBoundary); - U_ASSERT(fPosition == endPos); - return endPos; -} - -/** - * Advances the iterator either forward or backward the specified number of steps. - * Negative values move backward, and positive values move forward. This is - * equivalent to repeatedly calling next() or previous(). - * @param n The number of steps to move. The sign indicates the direction - * (negative is backwards, and positive is forwards). - * @return The character offset of the boundary position n boundaries away from - * the current one. - */ -int32_t RuleBasedBreakIterator::next(int32_t n) { - int32_t result = 0; - if (n > 0) { - for (; n > 0 && result != UBRK_DONE; --n) { - result = next(); - } - } else if (n < 0) { - for (; n < 0 && result != UBRK_DONE; ++n) { - result = previous(); - } - } else { - result = current(); - } - return result; -} - -/** - * Advances the iterator to the next boundary position. - * @return The position of the first boundary after this one. - */ -int32_t RuleBasedBreakIterator::next(void) { - fBreakCache->next(); - return fDone ? UBRK_DONE : fPosition; -} - -/** - * Move the iterator backwards, to the boundary preceding the current one. - * - * Starts from the current position within fText. - * Starting position need not be on a boundary. - * - * @return The position of the boundary position immediately preceding the starting position. - */ -int32_t RuleBasedBreakIterator::previous(void) { - UErrorCode status = U_ZERO_ERROR; - fBreakCache->previous(status); - return fDone ? UBRK_DONE : fPosition; -} - -/** - * Sets the iterator to refer to the first boundary position following - * the specified position. - * @param startPos The position from which to begin searching for a break position. - * @return The position of the first break after the current position. - */ -int32_t RuleBasedBreakIterator::following(int32_t startPos) { - // if the supplied position is before the beginning, return the - // text's starting offset - if (startPos < 0) { - return first(); - } - - // Move requested offset to a code point start. It might be on a trail surrogate, - // or on a trail byte if the input is UTF-8. Or it may be beyond the end of the text. - utext_setNativeIndex(&fText, startPos); - startPos = (int32_t)utext_getNativeIndex(&fText); - - UErrorCode status = U_ZERO_ERROR; - fBreakCache->following(startPos, status); - return fDone ? UBRK_DONE : fPosition; -} - -/** - * Sets the iterator to refer to the last boundary position before the - * specified position. - * @param offset The position to begin searching for a break from. - * @return The position of the last boundary before the starting position. - */ -int32_t RuleBasedBreakIterator::preceding(int32_t offset) { - if (offset > utext_nativeLength(&fText)) { - return last(); - } - - // Move requested offset to a code point start. It might be on a trail surrogate, - // or on a trail byte if the input is UTF-8. - - utext_setNativeIndex(&fText, offset); - int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndex(&fText)); - - UErrorCode status = U_ZERO_ERROR; - fBreakCache->preceding(adjustedOffset, status); - return fDone ? UBRK_DONE : fPosition; -} - -/** - * Returns true if the specfied position is a boundary position. As a side - * effect, leaves the iterator pointing to the first boundary position at - * or after "offset". - * - * @param offset the offset to check. - * @return True if "offset" is a boundary position. - */ -UBool RuleBasedBreakIterator::isBoundary(int32_t offset) { - // out-of-range indexes are never boundary positions - if (offset < 0) { - first(); // For side effects on current position, tag values. - return FALSE; - } - - // Adjust offset to be on a code point boundary and not beyond the end of the text. - // Note that isBoundary() is always false for offsets that are not on code point boundaries. - // But we still need the side effect of leaving iteration at the following boundary. - - utext_setNativeIndex(&fText, offset); - int32_t adjustedOffset = static_cast<int32_t>(utext_getNativeIndex(&fText)); - - bool result = false; - UErrorCode status = U_ZERO_ERROR; - if (fBreakCache->seek(adjustedOffset) || fBreakCache->populateNear(adjustedOffset, status)) { - result = (fBreakCache->current() == offset); - } - - if (result && adjustedOffset < offset && utext_char32At(&fText, offset) == U_SENTINEL) { - // Original offset is beyond the end of the text. Return FALSE, it's not a boundary, - // but the iteration position remains set to the end of the text, which is a boundary. - return FALSE; - } - if (!result) { - // Not on a boundary. isBoundary() must leave iterator on the following boundary. - // Cache->seek(), above, left us on the preceding boundary, so advance one. - next(); - } - return result; -} - - -/** - * Returns the current iteration position. - * @return The current iteration position. - */ -int32_t RuleBasedBreakIterator::current(void) const { - return fPosition; -} - - -//======================================================================= -// implementation -//======================================================================= - -// -// RBBIRunMode - the state machine runs an extra iteration at the beginning and end -// of user text. A variable with this enum type keeps track of where we -// are. The state machine only fetches user input while in the RUN mode. -// -enum RBBIRunMode { - RBBI_START, // state machine processing is before first char of input - RBBI_RUN, // state machine processing is in the user text - RBBI_END // state machine processing is after end of user text. -}; - - -// Map from look-ahead break states (corresponds to rules) to boundary positions. -// Allows multiple lookahead break rules to be in flight at the same time. -// -// This is a temporary approach for ICU 57. A better fix is to make the look-ahead numbers -// in the state table be sequential, then we can just index an array. And the -// table could also tell us in advance how big that array needs to be. -// -// Before ICU 57 there was just a single simple variable for a look-ahead match that -// was in progress. Two rules at once did not work. - -static const int32_t kMaxLookaheads = 8; -struct LookAheadResults { - int32_t fUsedSlotLimit; - int32_t fPositions[8]; - int16_t fKeys[8]; - - LookAheadResults() : fUsedSlotLimit(0), fPositions(), fKeys() {}; - - int32_t getPosition(int16_t key) { - for (int32_t i=0; i<fUsedSlotLimit; ++i) { - if (fKeys[i] == key) { - return fPositions[i]; - } - } - U_ASSERT(FALSE); - return -1; - } - - void setPosition(int16_t key, int32_t position) { - int32_t i; - for (i=0; i<fUsedSlotLimit; ++i) { - if (fKeys[i] == key) { - fPositions[i] = position; - return; - } - } - if (i >= kMaxLookaheads) { - U_ASSERT(FALSE); - i = kMaxLookaheads - 1; - } - fKeys[i] = key; - fPositions[i] = position; - U_ASSERT(fUsedSlotLimit == i); - fUsedSlotLimit = i + 1; - } -}; - - -//----------------------------------------------------------------------------------- -// -// handleNext() -// Run the state machine to find a boundary -// -//----------------------------------------------------------------------------------- -int32_t RuleBasedBreakIterator::handleNext() { - int32_t state; - uint16_t category = 0; - RBBIRunMode mode; - - RBBIStateTableRow *row; - UChar32 c; - LookAheadResults lookAheadMatches; - int32_t result = 0; - int32_t initialPosition = 0; - const RBBIStateTable *statetable = fData->fForwardTable; - const char *tableData = statetable->fTableData; - uint32_t tableRowLen = statetable->fRowLen; - #ifdef RBBI_DEBUG - if (gTrace) { - RBBIDebugPuts("Handle Next pos char state category"); - } - #endif - - // handleNext alway sets the break tag value. - // Set the default for it. - fRuleStatusIndex = 0; - - fDictionaryCharCount = 0; - - // if we're already at the end of the text, return DONE. - initialPosition = fPosition; - UTEXT_SETNATIVEINDEX(&fText, initialPosition); - result = initialPosition; - c = UTEXT_NEXT32(&fText); - if (c==U_SENTINEL) { - fDone = TRUE; - return UBRK_DONE; - } - - // Set the initial state for the state machine - state = START_STATE; - row = (RBBIStateTableRow *) - //(statetable->fTableData + (statetable->fRowLen * state)); - (tableData + tableRowLen * state); - - - mode = RBBI_RUN; - if (statetable->fFlags & RBBI_BOF_REQUIRED) { - category = 2; - mode = RBBI_START; - } - - - // loop until we reach the end of the text or transition to state 0 - // - for (;;) { - if (c == U_SENTINEL) { - // Reached end of input string. - if (mode == RBBI_END) { - // We have already run the loop one last time with the - // character set to the psueudo {eof} value. Now it is time - // to unconditionally bail out. - break; - } - // Run the loop one last time with the fake end-of-input character category. - mode = RBBI_END; - category = 1; - } - - // - // Get the char category. An incoming category of 1 or 2 means that - // we are preset for doing the beginning or end of input, and - // that we shouldn't get a category from an actual text input character. - // - if (mode == RBBI_RUN) { - // look up the current character's character category, which tells us - // which column in the state table to look at. - // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, - // not the size of the character going in, which is a UChar32. - // - category = UTRIE2_GET16(fData->fTrie, c); - - // Check the dictionary bit in the character's category. - // Counter is only used by dictionary based iteration. - // Chars that need to be handled by a dictionary have a flag bit set - // in their category values. - // - if ((category & 0x4000) != 0) { - fDictionaryCharCount++; - // And off the dictionary flag bit. - category &= ~0x4000; - } - } - - #ifdef RBBI_DEBUG - if (gTrace) { - RBBIDebugPrintf(" %4" PRId64 " ", utext_getNativeIndex(&fText)); - if (0x20<=c && c<0x7f) { - RBBIDebugPrintf("\"%c\" ", c); - } else { - RBBIDebugPrintf("%5x ", c); - } - RBBIDebugPrintf("%3d %3d\n", state, category); - } - #endif - - // State Transition - move machine to its next state - // - - // fNextState is a variable-length array. - U_ASSERT(category<fData->fHeader->fCatCount); - state = row->fNextState[category]; /*Not accessing beyond memory*/ - row = (RBBIStateTableRow *) - // (statetable->fTableData + (statetable->fRowLen * state)); - (tableData + tableRowLen * state); - - - if (row->fAccepting == -1) { - // Match found, common case. - if (mode != RBBI_START) { - result = (int32_t)UTEXT_GETNATIVEINDEX(&fText); - } - fRuleStatusIndex = row->fTagIdx; // Remember the break status (tag) values. - } - - int16_t completedRule = row->fAccepting; - if (completedRule > 0) { - // Lookahead match is completed. - int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule); - if (lookaheadResult >= 0) { - fRuleStatusIndex = row->fTagIdx; - fPosition = lookaheadResult; - return lookaheadResult; - } - } - int16_t rule = row->fLookAhead; - if (rule != 0) { - // At the position of a '/' in a look-ahead match. Record it. - int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText); - lookAheadMatches.setPosition(rule, pos); - } - - if (state == STOP_STATE) { - // This is the normal exit from the lookup state machine. - // We have advanced through the string until it is certain that no - // longer match is possible, no matter what characters follow. - break; - } - - // Advance to the next character. - // If this is a beginning-of-input loop iteration, don't advance - // the input position. The next iteration will be processing the - // first real input character. - if (mode == RBBI_RUN) { - c = UTEXT_NEXT32(&fText); - } else { - if (mode == RBBI_START) { - mode = RBBI_RUN; - } - } - } - - // The state machine is done. Check whether it found a match... - - // If the iterator failed to advance in the match engine, force it ahead by one. - // (This really indicates a defect in the break rules. They should always match - // at least one character.) - if (result == initialPosition) { - utext_setNativeIndex(&fText, initialPosition); - utext_next32(&fText); - result = (int32_t)utext_getNativeIndex(&fText); - fRuleStatusIndex = 0; - } - - // Leave the iterator at our result position. - fPosition = result; - #ifdef RBBI_DEBUG - if (gTrace) { - RBBIDebugPrintf("result = %d\n\n", result); - } - #endif - return result; -} - - -//----------------------------------------------------------------------------------- -// -// handleSafePrevious() -// -// Iterate backwards using the safe reverse rules. -// The logic of this function is similar to handleNext(), but simpler -// because the safe table does not require as many options. -// -//----------------------------------------------------------------------------------- -int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) { - int32_t state; - uint16_t category = 0; - RBBIStateTableRow *row; - UChar32 c; - int32_t result = 0; - - const RBBIStateTable *stateTable = fData->fReverseTable; - UTEXT_SETNATIVEINDEX(&fText, fromPosition); - #ifdef RBBI_DEBUG - if (gTrace) { - RBBIDebugPuts("Handle Previous pos char state category"); - } - #endif - - // if we're already at the start of the text, return DONE. - if (fData == NULL || UTEXT_GETNATIVEINDEX(&fText)==0) { - return BreakIterator::DONE; - } - - // Set the initial state for the state machine - c = UTEXT_PREVIOUS32(&fText); - state = START_STATE; - row = (RBBIStateTableRow *) - (stateTable->fTableData + (stateTable->fRowLen * state)); - - // loop until we reach the start of the text or transition to state 0 - // - for (; c != U_SENTINEL; c = UTEXT_PREVIOUS32(&fText)) { - - // look up the current character's character category, which tells us - // which column in the state table to look at. - // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned, - // not the size of the character going in, which is a UChar32. - // - // And off the dictionary flag bit. For reverse iteration it is not used. - category = UTRIE2_GET16(fData->fTrie, c); - category &= ~0x4000; - - #ifdef RBBI_DEBUG - if (gTrace) { - RBBIDebugPrintf(" %4d ", (int32_t)utext_getNativeIndex(&fText)); - if (0x20<=c && c<0x7f) { - RBBIDebugPrintf("\"%c\" ", c); - } else { - RBBIDebugPrintf("%5x ", c); - } - RBBIDebugPrintf("%3d %3d\n", state, category); - } - #endif - - // State Transition - move machine to its next state - // - // fNextState is a variable-length array. - U_ASSERT(category<fData->fHeader->fCatCount); - state = row->fNextState[category]; /*Not accessing beyond memory*/ - row = (RBBIStateTableRow *) - (stateTable->fTableData + (stateTable->fRowLen * state)); - - if (state == STOP_STATE) { - // This is the normal exit from the lookup state machine. - // Transistion to state zero means we have found a safe point. - break; - } - } - - // The state machine is done. Check whether it found a match... - result = (int32_t)UTEXT_GETNATIVEINDEX(&fText); - #ifdef RBBI_DEBUG - if (gTrace) { - RBBIDebugPrintf("result = %d\n\n", result); - } - #endif - return result; -} - -//------------------------------------------------------------------------------- -// -// getRuleStatus() Return the break rule tag associated with the current -// iterator position. If the iterator arrived at its current -// position by iterating forwards, the value will have been -// cached by the handleNext() function. -// -//------------------------------------------------------------------------------- - -int32_t RuleBasedBreakIterator::getRuleStatus() const { - - // fLastRuleStatusIndex indexes to the start of the appropriate status record - // (the number of status values.) - // This function returns the last (largest) of the array of status values. - int32_t idx = fRuleStatusIndex + fData->fRuleStatusTable[fRuleStatusIndex]; - int32_t tagVal = fData->fRuleStatusTable[idx]; - - return tagVal; -} - - -int32_t RuleBasedBreakIterator::getRuleStatusVec( - int32_t *fillInVec, int32_t capacity, UErrorCode &status) { - if (U_FAILURE(status)) { - return 0; - } - - int32_t numVals = fData->fRuleStatusTable[fRuleStatusIndex]; - int32_t numValsToCopy = numVals; - if (numVals > capacity) { - status = U_BUFFER_OVERFLOW_ERROR; - numValsToCopy = capacity; - } - int i; - for (i=0; i<numValsToCopy; i++) { - fillInVec[i] = fData->fRuleStatusTable[fRuleStatusIndex + i + 1]; - } - return numVals; -} - - - -//------------------------------------------------------------------------------- -// -// getBinaryRules Access to the compiled form of the rules, -// for use by build system tools that save the data -// for standard iterator types. -// -//------------------------------------------------------------------------------- -const uint8_t *RuleBasedBreakIterator::getBinaryRules(uint32_t &length) { - const uint8_t *retPtr = NULL; - length = 0; - - if (fData != NULL) { - retPtr = (const uint8_t *)fData->fHeader; - length = fData->fHeader->fLength; - } - return retPtr; -} - - -BreakIterator * RuleBasedBreakIterator::createBufferClone(void * /*stackBuffer*/, - int32_t &bufferSize, - UErrorCode &status) -{ - if (U_FAILURE(status)){ - return NULL; - } - - if (bufferSize == 0) { - bufferSize = 1; // preflighting for deprecated functionality - return NULL; - } - - BreakIterator *clonedBI = clone(); - if (clonedBI == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - } else { - status = U_SAFECLONE_ALLOCATED_WARNING; - } - return (RuleBasedBreakIterator *)clonedBI; -} - -U_NAMESPACE_END - - -static icu::UStack *gLanguageBreakFactories = nullptr; -static const icu::UnicodeString *gEmptyString = nullptr; -static icu::UInitOnce gLanguageBreakFactoriesInitOnce = U_INITONCE_INITIALIZER; -static icu::UInitOnce gRBBIInitOnce = U_INITONCE_INITIALIZER; - -/** - * Release all static memory held by breakiterator. - */ -U_CDECL_BEGIN -static UBool U_CALLCONV rbbi_cleanup(void) { - delete gLanguageBreakFactories; - gLanguageBreakFactories = nullptr; - delete gEmptyString; - gEmptyString = nullptr; - gLanguageBreakFactoriesInitOnce.reset(); - gRBBIInitOnce.reset(); - return TRUE; -} -U_CDECL_END - -U_CDECL_BEGIN -static void U_CALLCONV _deleteFactory(void *obj) { - delete (icu::LanguageBreakFactory *) obj; -} -U_CDECL_END -U_NAMESPACE_BEGIN - -static void U_CALLCONV rbbiInit() { - gEmptyString = new UnicodeString(); - ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup); -} - -static void U_CALLCONV initLanguageFactories() { - UErrorCode status = U_ZERO_ERROR; - U_ASSERT(gLanguageBreakFactories == NULL); - gLanguageBreakFactories = new UStack(_deleteFactory, NULL, status); - if (gLanguageBreakFactories != NULL && U_SUCCESS(status)) { - ICULanguageBreakFactory *builtIn = new ICULanguageBreakFactory(status); - gLanguageBreakFactories->push(builtIn, status); -#ifdef U_LOCAL_SERVICE_HOOK - LanguageBreakFactory *extra = (LanguageBreakFactory *)uprv_svc_hook("languageBreakFactory", &status); - if (extra != NULL) { - gLanguageBreakFactories->push(extra, status); - } -#endif - } - ucln_common_registerCleanup(UCLN_COMMON_RBBI, rbbi_cleanup); -} - - -static const LanguageBreakEngine* -getLanguageBreakEngineFromFactory(UChar32 c) -{ - umtx_initOnce(gLanguageBreakFactoriesInitOnce, &initLanguageFactories); - if (gLanguageBreakFactories == NULL) { - return NULL; - } - - int32_t i = gLanguageBreakFactories->size(); - const LanguageBreakEngine *lbe = NULL; - while (--i >= 0) { - LanguageBreakFactory *factory = (LanguageBreakFactory *)(gLanguageBreakFactories->elementAt(i)); - lbe = factory->getEngineFor(c); - if (lbe != NULL) { - break; - } - } - return lbe; -} - - -//------------------------------------------------------------------------------- -// -// getLanguageBreakEngine Find an appropriate LanguageBreakEngine for the -// the character c. -// -//------------------------------------------------------------------------------- -const LanguageBreakEngine * -RuleBasedBreakIterator::getLanguageBreakEngine(UChar32 c) { - const LanguageBreakEngine *lbe = NULL; - UErrorCode status = U_ZERO_ERROR; - - if (fLanguageBreakEngines == NULL) { - fLanguageBreakEngines = new UStack(status); - if (fLanguageBreakEngines == NULL || U_FAILURE(status)) { - delete fLanguageBreakEngines; - fLanguageBreakEngines = 0; - return NULL; - } - } - - int32_t i = fLanguageBreakEngines->size(); - while (--i >= 0) { - lbe = (const LanguageBreakEngine *)(fLanguageBreakEngines->elementAt(i)); - if (lbe->handles(c)) { - return lbe; - } - } - - // No existing dictionary took the character. See if a factory wants to - // give us a new LanguageBreakEngine for this character. - lbe = getLanguageBreakEngineFromFactory(c); - - // If we got one, use it and push it on our stack. - if (lbe != NULL) { - fLanguageBreakEngines->push((void *)lbe, status); - // Even if we can't remember it, we can keep looking it up, so - // return it even if the push fails. - return lbe; - } - - // No engine is forthcoming for this character. Add it to the - // reject set. Create the reject break engine if needed. - if (fUnhandledBreakEngine == NULL) { - fUnhandledBreakEngine = new UnhandledEngine(status); - if (U_SUCCESS(status) && fUnhandledBreakEngine == NULL) { - status = U_MEMORY_ALLOCATION_ERROR; - return nullptr; - } - // Put it last so that scripts for which we have an engine get tried - // first. - fLanguageBreakEngines->insertElementAt(fUnhandledBreakEngine, 0, status); - // If we can't insert it, or creation failed, get rid of it - if (U_FAILURE(status)) { - delete fUnhandledBreakEngine; - fUnhandledBreakEngine = 0; - return NULL; - } - } - - // Tell the reject engine about the character; at its discretion, it may - // add more than just the one character. - fUnhandledBreakEngine->handleCharacter(c); - - return fUnhandledBreakEngine; -} - -void RuleBasedBreakIterator::dumpCache() { - fBreakCache->dumpCache(); -} - -void RuleBasedBreakIterator::dumpTables() { - fData->printData(); -} - -/** - * Returns the description used to create this iterator - */ - -const UnicodeString& -RuleBasedBreakIterator::getRules() const { - if (fData != NULL) { - return fData->getRuleSourceString(); - } else { - umtx_initOnce(gRBBIInitOnce, &rbbiInit); - return *gEmptyString; - } -} - -U_NAMESPACE_END - -#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |