summaryrefslogtreecommitdiff
path: root/deps/icu-small/source/common/rbbi.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'deps/icu-small/source/common/rbbi.cpp')
-rw-r--r--deps/icu-small/source/common/rbbi.cpp112
1 files changed, 17 insertions, 95 deletions
diff --git a/deps/icu-small/source/common/rbbi.cpp b/deps/icu-small/source/common/rbbi.cpp
index 69f92d94c6..c5ea2770ba 100644
--- a/deps/icu-small/source/common/rbbi.cpp
+++ b/deps/icu-small/source/common/rbbi.cpp
@@ -651,7 +651,7 @@ UBool RuleBasedBreakIterator::isBoundary(int32_t offset) {
}
// Adjust offset to be on a code point boundary and not beyond the end of the text.
- // Note that isBoundary() is always be false for offsets that are not on code point boundaries.
+ // Note that isBoundary() is always false for offsets that are not on code point boundaries.
// But we still need the side effect of leaving iteration at the following boundary.
utext_setNativeIndex(&fText, offset);
@@ -937,26 +937,23 @@ int32_t RuleBasedBreakIterator::handleNext() {
}
-
//-----------------------------------------------------------------------------------
//
-// handlePrevious()
+// handleSafePrevious()
//
// Iterate backwards using the safe reverse rules.
-// The logic of this function is very similar to handleNext(), above.
+// The logic of this function is similar to handleNext(), but simpler
+// because the safe table does not require as many options.
//
//-----------------------------------------------------------------------------------
-int32_t RuleBasedBreakIterator::handlePrevious(int32_t fromPosition) {
+int32_t RuleBasedBreakIterator::handleSafePrevious(int32_t fromPosition) {
int32_t state;
uint16_t category = 0;
- RBBIRunMode mode;
RBBIStateTableRow *row;
UChar32 c;
- LookAheadResults lookAheadMatches;
int32_t result = 0;
- int32_t initialPosition = 0;
- const RBBIStateTable *stateTable = fData->fSafeRevTable;
+ const RBBIStateTable *stateTable = fData->fReverseTable;
UTEXT_SETNATIVEINDEX(&fText, fromPosition);
#ifdef RBBI_DEBUG
if (gTrace) {
@@ -969,54 +966,24 @@ int32_t RuleBasedBreakIterator::handlePrevious(int32_t fromPosition) {
return BreakIterator::DONE;
}
- // Set up the starting char.
- initialPosition = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
- result = initialPosition;
- c = UTEXT_PREVIOUS32(&fText);
-
// Set the initial state for the state machine
+ c = UTEXT_PREVIOUS32(&fText);
state = START_STATE;
row = (RBBIStateTableRow *)
(stateTable->fTableData + (stateTable->fRowLen * state));
- category = 3;
- mode = RBBI_RUN;
- if (stateTable->fFlags & RBBI_BOF_REQUIRED) {
- category = 2;
- mode = RBBI_START;
- }
-
// loop until we reach the start of the text or transition to state 0
//
- for (;;) {
- if (c == U_SENTINEL) {
- // Reached end of input string.
- if (mode == RBBI_END) {
- // We have already run the loop one last time with the
- // character set to the psueudo {eof} value. Now it is time
- // to unconditionally bail out.
- break;
- }
- // Run the loop one last time with the fake end-of-input character category.
- mode = RBBI_END;
- category = 1;
- }
+ for (; c != U_SENTINEL; c = UTEXT_PREVIOUS32(&fText)) {
+ // look up the current character's character category, which tells us
+ // which column in the state table to look at.
+ // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
+ // not the size of the character going in, which is a UChar32.
//
- // Get the char category. An incoming category of 1 or 2 means that
- // we are preset for doing the beginning or end of input, and
- // that we shouldn't get a category from an actual text input character.
- //
- if (mode == RBBI_RUN) {
- // look up the current character's character category, which tells us
- // which column in the state table to look at.
- // Note: the 16 in UTRIE_GET16 refers to the size of the data being returned,
- // not the size of the character going in, which is a UChar32.
- //
- // And off the dictionary flag bit. For reverse iteration it is not used.
- category = UTRIE2_GET16(fData->fTrie, c);
- category &= ~0x4000;
- }
+ // And off the dictionary flag bit. For reverse iteration it is not used.
+ category = UTRIE2_GET16(fData->fTrie, c);
+ category &= ~0x4000;
#ifdef RBBI_DEBUG
if (gTrace) {
@@ -1032,65 +999,21 @@ int32_t RuleBasedBreakIterator::handlePrevious(int32_t fromPosition) {
// State Transition - move machine to its next state
//
-
// fNextState is a variable-length array.
U_ASSERT(category<fData->fHeader->fCatCount);
state = row->fNextState[category]; /*Not accessing beyond memory*/
row = (RBBIStateTableRow *)
(stateTable->fTableData + (stateTable->fRowLen * state));
- if (row->fAccepting == -1) {
- // Match found, common case.
- result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
- }
-
- int16_t completedRule = row->fAccepting;
- if (completedRule > 0) {
- // Lookahead match is completed.
- int32_t lookaheadResult = lookAheadMatches.getPosition(completedRule);
- if (lookaheadResult >= 0) {
- UTEXT_SETNATIVEINDEX(&fText, lookaheadResult);
- return lookaheadResult;
- }
- }
- int16_t rule = row->fLookAhead;
- if (rule != 0) {
- // At the position of a '/' in a look-ahead match. Record it.
- int32_t pos = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
- lookAheadMatches.setPosition(rule, pos);
- }
-
if (state == STOP_STATE) {
// This is the normal exit from the lookup state machine.
- // We have advanced through the string until it is certain that no
- // longer match is possible, no matter what characters follow.
+ // Transistion to state zero means we have found a safe point.
break;
}
-
- // Move (backwards) to the next character to process.
- // If this is a beginning-of-input loop iteration, don't advance
- // the input position. The next iteration will be processing the
- // first real input character.
- if (mode == RBBI_RUN) {
- c = UTEXT_PREVIOUS32(&fText);
- } else {
- if (mode == RBBI_START) {
- mode = RBBI_RUN;
- }
- }
}
// The state machine is done. Check whether it found a match...
-
- // If the iterator failed to advance in the match engine, force it ahead by one.
- // (This really indicates a defect in the break rules. They should always match
- // at least one character.)
- if (result == initialPosition) {
- UTEXT_SETNATIVEINDEX(&fText, initialPosition);
- UTEXT_PREVIOUS32(&fText);
- result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
- }
-
+ result = (int32_t)UTEXT_GETNATIVEINDEX(&fText);
#ifdef RBBI_DEBUG
if (gTrace) {
RBBIDebugPrintf("result = %d\n\n", result);
@@ -1099,7 +1022,6 @@ int32_t RuleBasedBreakIterator::handlePrevious(int32_t fromPosition) {
return result;
}
-
//-------------------------------------------------------------------------------
//
// getRuleStatus() Return the break rule tag associated with the current