summaryrefslogtreecommitdiff
path: root/deps/icu-small/source/common/rbbitblb.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'deps/icu-small/source/common/rbbitblb.cpp')
-rw-r--r--deps/icu-small/source/common/rbbitblb.cpp345
1 files changed, 309 insertions, 36 deletions
diff --git a/deps/icu-small/source/common/rbbitblb.cpp b/deps/icu-small/source/common/rbbitblb.cpp
index 61661a5442..42116b0f95 100644
--- a/deps/icu-small/source/common/rbbitblb.cpp
+++ b/deps/icu-small/source/common/rbbitblb.cpp
@@ -27,21 +27,19 @@
U_NAMESPACE_BEGIN
-RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
- fTree(*rootNode) {
- fRB = rb;
- fStatus = fRB->fStatus;
- UErrorCode status = U_ZERO_ERROR;
- fDStates = new UVector(status);
- if (U_FAILURE(*fStatus)) {
- return;
- }
+RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
+ fRB(rb),
+ fTree(*rootNode),
+ fStatus(&status),
+ fDStates(nullptr),
+ fSafeTable(nullptr) {
if (U_FAILURE(status)) {
- *fStatus = status;
return;
}
- if (fDStates == NULL) {
- *fStatus = U_MEMORY_ALLOCATION_ERROR;;
+ // fDStates is UVector<RBBIStateDescriptor *>
+ fDStates = new UVector(status);
+ if (U_SUCCESS(status) && fDStates == nullptr ) {
+ status = U_MEMORY_ALLOCATION_ERROR;
}
}
@@ -52,17 +50,18 @@ RBBITableBuilder::~RBBITableBuilder() {
for (i=0; i<fDStates->size(); i++) {
delete (RBBIStateDescriptor *)fDStates->elementAt(i);
}
- delete fDStates;
+ delete fDStates;
+ delete fSafeTable;
}
//-----------------------------------------------------------------------------
//
-// RBBITableBuilder::build - This is the main function for building the DFA state transtion
-// table from the RBBI rules parse tree.
+// RBBITableBuilder::buildForwardTable - This is the main function for building
+// the DFA state transition table from the RBBI rules parse tree.
//
//-----------------------------------------------------------------------------
-void RBBITableBuilder::build() {
+void RBBITableBuilder::buildForwardTable() {
if (U_FAILURE(*fStatus)) {
return;
@@ -189,8 +188,6 @@ void RBBITableBuilder::build() {
// for all tables. Merge the ones from this table into the global set.
//
mergeRuleStatusVals();
-
- if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
}
@@ -1081,18 +1078,18 @@ void RBBITableBuilder::printPosSets(RBBINode *n) {
//
// findDuplCharClassFrom()
//
-bool RBBITableBuilder::findDuplCharClassFrom(int32_t &baseCategory, int32_t &duplCategory) {
+bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
int32_t numStates = fDStates->size();
int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
uint16_t table_base;
uint16_t table_dupl;
- for (; baseCategory < numCols-1; ++baseCategory) {
- for (duplCategory=baseCategory+1; duplCategory < numCols; ++duplCategory) {
+ for (; categories->first < numCols-1; categories->first++) {
+ for (categories->second=categories->first+1; categories->second < numCols; categories->second++) {
for (int32_t state=0; state<numStates; state++) {
RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
- table_base = (uint16_t)sd->fDtran->elementAti(baseCategory);
- table_dupl = (uint16_t)sd->fDtran->elementAti(duplCategory);
+ table_base = (uint16_t)sd->fDtran->elementAti(categories->first);
+ table_dupl = (uint16_t)sd->fDtran->elementAti(categories->second);
if (table_base != table_dupl) {
break;
}
@@ -1121,14 +1118,14 @@ void RBBITableBuilder::removeColumn(int32_t column) {
/*
* findDuplicateState
*/
-bool RBBITableBuilder::findDuplicateState(int32_t &firstState, int32_t &duplState) {
+bool RBBITableBuilder::findDuplicateState(IntPair *states) {
int32_t numStates = fDStates->size();
int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
- for (; firstState<numStates-1; ++firstState) {
- RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(firstState);
- for (duplState=firstState+1; duplState<numStates; ++duplState) {
- RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(duplState);
+ for (; states->first<numStates-1; states->first++) {
+ RBBIStateDescriptor *firstSD = (RBBIStateDescriptor *)fDStates->elementAt(states->first);
+ for (states->second=states->first+1; states->second<numStates; states->second++) {
+ RBBIStateDescriptor *duplSD = (RBBIStateDescriptor *)fDStates->elementAt(states->second);
if (firstSD->fAccepting != duplSD->fAccepting ||
firstSD->fLookAhead != duplSD->fLookAhead ||
firstSD->fTagsIdx != duplSD->fTagsIdx) {
@@ -1139,8 +1136,36 @@ bool RBBITableBuilder::findDuplicateState(int32_t &firstState, int32_t &duplStat
int32_t firstVal = firstSD->fDtran->elementAti(col);
int32_t duplVal = duplSD->fDtran->elementAti(col);
if (!((firstVal == duplVal) ||
- ((firstVal == firstState || firstVal == duplState) &&
- (duplVal == firstState || duplVal == duplState)))) {
+ ((firstVal == states->first || firstVal == states->second) &&
+ (duplVal == states->first || duplVal == states->second)))) {
+ rowsMatch = false;
+ break;
+ }
+ }
+ if (rowsMatch) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+
+bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) {
+ int32_t numStates = fSafeTable->size();
+
+ for (; states->first<numStates-1; states->first++) {
+ UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first));
+ for (states->second=states->first+1; states->second<numStates; states->second++) {
+ UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second));
+ bool rowsMatch = true;
+ int32_t numCols = firstRow->length();
+ for (int32_t col=0; col < numCols; ++col) {
+ int32_t firstVal = firstRow->charAt(col);
+ int32_t duplVal = duplRow->charAt(col);
+ if (!((firstVal == duplVal) ||
+ ((firstVal == states->first || firstVal == states->second) &&
+ (duplVal == states->first || duplVal == states->second)))) {
rowsMatch = false;
break;
}
@@ -1153,7 +1178,10 @@ bool RBBITableBuilder::findDuplicateState(int32_t &firstState, int32_t &duplStat
return false;
}
-void RBBITableBuilder::removeState(int32_t keepState, int32_t duplState) {
+
+void RBBITableBuilder::removeState(IntPair duplStates) {
+ const int32_t keepState = duplStates.first;
+ const int32_t duplState = duplStates.second;
U_ASSERT(keepState < duplState);
U_ASSERT(duplState < fDStates->size());
@@ -1188,19 +1216,44 @@ void RBBITableBuilder::removeState(int32_t keepState, int32_t duplState) {
}
}
+void RBBITableBuilder::removeSafeState(IntPair duplStates) {
+ const int32_t keepState = duplStates.first;
+ const int32_t duplState = duplStates.second;
+ U_ASSERT(keepState < duplState);
+ U_ASSERT(duplState < fSafeTable->size());
+
+ fSafeTable->removeElementAt(duplState); // Note that fSafeTable has a deleter function
+ // and will auto-delete the removed element.
+ int32_t numStates = fSafeTable->size();
+ for (int32_t state=0; state<numStates; ++state) {
+ UnicodeString *sd = (UnicodeString *)fSafeTable->elementAt(state);
+ int32_t numCols = sd->length();
+ for (int32_t col=0; col<numCols; col++) {
+ int32_t existingVal = sd->charAt(col);
+ int32_t newVal = existingVal;
+ if (existingVal == duplState) {
+ newVal = keepState;
+ } else if (existingVal > duplState) {
+ newVal = existingVal - 1;
+ }
+ sd->setCharAt(col, newVal);
+ }
+ }
+}
+
/*
* RemoveDuplicateStates
*/
void RBBITableBuilder::removeDuplicateStates() {
- int32_t firstState = 3;
- int32_t duplicateState = 0;
- while (findDuplicateState(firstState, duplicateState)) {
- // printf("Removing duplicate states (%d, %d)\n", firstState, duplicateState);
- removeState(firstState, duplicateState);
+ IntPair dupls = {3, 0};
+ while (findDuplicateState(&dupls)) {
+ // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
+ removeState(dupls);
}
}
+
//-----------------------------------------------------------------------------
//
// getTableSize() Calculate the size of the runtime form of this
@@ -1277,6 +1330,185 @@ void RBBITableBuilder::exportTable(void *where) {
}
+/**
+ * Synthesize a safe state table from the main state table.
+ */
+void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
+ // The safe table creation has three steps:
+
+ // 1. Identifiy pairs of character classes that are "safe." Safe means that boundaries
+ // following the pair do not depend on context or state before the pair. To test
+ // whether a pair is safe, run it through the main forward state table, starting
+ // from each state. If the the final state is the same, no matter what the starting state,
+ // the pair is safe.
+ //
+ // 2. Build a state table that recognizes the safe pairs. It's similar to their
+ // forward table, with a column for each input character [class], and a row for
+ // each state. Row 1 is the start state, and row 0 is the stop state. Initially
+ // create an additional state for each input character category; being in
+ // one of these states means that the character has been seen, and is potentially
+ // the first of a pair. In each of these rows, the entry for the second character
+ // of a safe pair is set to the stop state (0), indicating that a match was found.
+ // All other table entries are set to the state corresponding the current input
+ // character, allowing that charcter to be the of a start following pair.
+ //
+ // Because the safe rules are to be run in reverse, moving backwards in the text,
+ // the first and second pair categories are swapped when building the table.
+ //
+ // 3. Compress the table. There are typically many rows (states) that are
+ // equivalent - that have zeroes (match completed) in the same columns -
+ // and can be folded together.
+
+ // Each safe pair is stored as two UChars in the safePair string.
+ UnicodeString safePairs;
+
+ int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
+ int32_t numStates = fDStates->size();
+
+ for (int32_t c1=0; c1<numCharClasses; ++c1) {
+ for (int32_t c2=0; c2 < numCharClasses; ++c2) {
+ int32_t wantedEndState = -1;
+ int32_t endState = 0;
+ for (int32_t startState = 1; startState < numStates; ++startState) {
+ RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
+ int32_t s2 = startStateD->fDtran->elementAti(c1);
+ RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
+ endState = s2StateD->fDtran->elementAti(c2);
+ if (wantedEndState < 0) {
+ wantedEndState = endState;
+ } else {
+ if (wantedEndState != endState) {
+ break;
+ }
+ }
+ }
+ if (wantedEndState == endState) {
+ safePairs.append((char16_t)c1);
+ safePairs.append((char16_t)c2);
+ // printf("(%d, %d) ", c1, c2);
+ }
+ }
+ // printf("\n");
+ }
+
+ // Populate the initial safe table.
+ // The table as a whole is UVector<UnicodeString>
+ // Each row is represented by a UnicodeString, being used as a Vector<int16>.
+ // Row 0 is the stop state.
+ // Row 1 is the start sate.
+ // Row 2 and beyond are other states, initially one per char class, but
+ // after initial construction, many of the states will be combined, compacting the table.
+ // The String holds the nextState data only. The four leading fields of a row, fAccepting,
+ // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
+
+ U_ASSERT(fSafeTable == nullptr);
+ fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
+ for (int32_t row=0; row<numCharClasses + 2; ++row) {
+ fSafeTable->addElement(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
+ }
+
+ // From the start state, each input char class transitions to the state for that input.
+ UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
+ for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
+ // Note: +2 for the start & stop state.
+ startState.setCharAt(charClass, charClass+2);
+ }
+
+ // Initially make every other state table row look like the start state row,
+ for (int32_t row=2; row<numCharClasses+2; ++row) {
+ UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
+ rowState = startState; // UnicodeString assignment, copies contents.
+ }
+
+ // Run through the safe pairs, set the next state to zero when pair has been seen.
+ // Zero being the stop state, meaning we found a safe point.
+ for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
+ int32_t c1 = safePairs.charAt(pairIdx);
+ int32_t c2 = safePairs.charAt(pairIdx + 1);
+
+ UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
+ rowState.setCharAt(c1, 0);
+ }
+
+ // Remove duplicate or redundant rows from the table.
+ IntPair states = {1, 0};
+ while (findDuplicateSafeState(&states)) {
+ // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
+ removeSafeState(states);
+ }
+}
+
+
+//-----------------------------------------------------------------------------
+//
+// getSafeTableSize() Calculate the size of the runtime form of this
+// safe state table.
+//
+//-----------------------------------------------------------------------------
+int32_t RBBITableBuilder::getSafeTableSize() const {
+ int32_t size = 0;
+ int32_t numRows;
+ int32_t numCols;
+ int32_t rowSize;
+
+ if (fSafeTable == nullptr) {
+ return 0;
+ }
+
+ size = offsetof(RBBIStateTable, fTableData); // The header, with no rows to the table.
+
+ numRows = fSafeTable->size();
+ numCols = fRB->fSetBuilder->getNumCharCategories();
+
+ rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
+ size += numRows * rowSize;
+ return size;
+}
+
+
+//-----------------------------------------------------------------------------
+//
+// exportSafeTable() export the state transition table in the format required
+// by the runtime engine. getTableSize() bytes of memory
+// must be available at the output address "where".
+//
+//-----------------------------------------------------------------------------
+void RBBITableBuilder::exportSafeTable(void *where) {
+ RBBIStateTable *table = (RBBIStateTable *)where;
+ uint32_t state;
+ int col;
+
+ if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
+ return;
+ }
+
+ int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
+ if (catCount > 0x7fff ||
+ fSafeTable->size() > 0x7fff) {
+ *fStatus = U_BRK_INTERNAL_ERROR;
+ return;
+ }
+
+ table->fRowLen = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
+ table->fNumStates = fSafeTable->size();
+ table->fFlags = 0;
+ table->fReserved = 0;
+
+ for (state=0; state<table->fNumStates; state++) {
+ UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
+ RBBIStateTableRow *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
+ row->fAccepting = 0;
+ row->fLookAhead = 0;
+ row->fTagIdx = 0;
+ row->fReserved = 0;
+ for (col=0; col<catCount; col++) {
+ row->fNextState[col] = rowString->charAt(col);
+ }
+ }
+}
+
+
+
//-----------------------------------------------------------------------------
//
@@ -1331,6 +1563,47 @@ void RBBITableBuilder::printStates() {
#endif
+//-----------------------------------------------------------------------------
+//
+// printSafeTable Debug Function. Dump the fully constructed safe table.
+//
+//-----------------------------------------------------------------------------
+#ifdef RBBI_DEBUG
+void RBBITableBuilder::printReverseTable() {
+ int c; // input "character"
+ int n; // state number
+
+ RBBIDebugPrintf(" Safe Reverse Table \n");
+ if (fSafeTable == nullptr) {
+ RBBIDebugPrintf(" --- nullptr ---\n");
+ return;
+ }
+ RBBIDebugPrintf("state | i n p u t s y m b o l s \n");
+ RBBIDebugPrintf(" | Acc LA Tag");
+ for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
+ RBBIDebugPrintf(" %2d", c);
+ }
+ RBBIDebugPrintf("\n");
+ RBBIDebugPrintf(" |---------------");
+ for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
+ RBBIDebugPrintf("---");
+ }
+ RBBIDebugPrintf("\n");
+
+ for (n=0; n<fSafeTable->size(); n++) {
+ UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
+ RBBIDebugPrintf(" %3d | " , n);
+ RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0); // Accepting, LookAhead, Tags
+ for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
+ RBBIDebugPrintf(" %2d", rowString->charAt(c));
+ }
+ RBBIDebugPrintf("\n");
+ }
+ RBBIDebugPrintf("\n\n");
+}
+#endif
+
+
//-----------------------------------------------------------------------------
//