1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
|
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
//
// rbbitblb.h
//
/*
**********************************************************************
* Copyright (c) 2002-2016, International Business Machines
* Corporation and others. All Rights Reserved.
**********************************************************************
*/
#ifndef RBBITBLB_H
#define RBBITBLB_H
#include "unicode/utypes.h"
#include "unicode/uobject.h"
#include "unicode/rbbi.h"
#include "rbbinode.h"
U_NAMESPACE_BEGIN
class RBBIRuleScanner;
class RBBIRuleBuilder;
class UVector32;
//
// class RBBITableBuilder is part of the RBBI rule compiler.
// It builds the state transition table used by the RBBI runtime
// from the expression syntax tree generated by the rule scanner.
//
// This class is part of the RBBI implementation only.
// There is no user-visible public API here.
//
class RBBITableBuilder : public UMemory {
public:
RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode);
~RBBITableBuilder();
void build();
int32_t getTableSize() const; // Return the runtime size in bytes of
// the built state table
/** Fill in the runtime state table. Sufficient memory must exist at the specified location.
*/
void exportTable(void *where);
/** Find duplicate (redundant) character classes, beginning after the specifed
* pair, within this state table. This is an iterator-like function, used to
* identify char classes (state table columns) that can be eliminated.
*/
bool findDuplCharClassFrom(int &baseClass, int &duplClass);
/** Remove a column from the state table. Used when two character categories
* have been found equivalent, and merged together, to eliminate the uneeded table column.
*/
void removeColumn(int32_t column);
/** Check for, and remove dupicate states (table rows). */
void removeDuplicateStates();
private:
void calcNullable(RBBINode *n);
void calcFirstPos(RBBINode *n);
void calcLastPos(RBBINode *n);
void calcFollowPos(RBBINode *n);
void calcChainedFollowPos(RBBINode *n);
void bofFixup();
void buildStateTable();
void flagAcceptingStates();
void flagLookAheadStates();
void flagTaggedStates();
void mergeRuleStatusVals();
/**
* Merge redundant state table columns, eliminating character classes with identical behavior.
* Done after the state tables are generated, just before converting to their run-time format.
*/
int32_t mergeColumns();
void addRuleRootNodes(UVector *dest, RBBINode *node);
/** Find the next duplicate state. An iterator function.
* @param firstState (in/out) begin looking at this state, return the first of the
* pair of duplicates.
* @param duplicateState returns the duplicate state of fistState
* @return true if a duplicate pair of states was found.
*/
bool findDuplicateState(int32_t &firstState, int32_t &duplicateState);
/** Remove a duplicate state/
* @param keepState First of the duplicate pair. Keep it.
* @param duplState Duplicate state. Remove it. Redirect all references to the duplicate state
* to refer to keepState instead.
*/
void removeState(int32_t keepState, int32_t duplState);
// Set functions for UVector.
// TODO: make a USet subclass of UVector
void setAdd(UVector *dest, UVector *source);
UBool setEquals(UVector *a, UVector *b);
void sortedAdd(UVector **dest, int32_t val);
public:
#ifdef RBBI_DEBUG
void printSet(UVector *s);
void printPosSets(RBBINode *n /* = NULL*/);
void printStates();
void printRuleStatusTable();
#else
#define printSet(s)
#define printPosSets(n)
#define printStates()
#define printRuleStatusTable()
#endif
private:
RBBIRuleBuilder *fRB;
RBBINode *&fTree; // The root node of the parse tree to build a
// table for.
UErrorCode *fStatus;
UVector *fDStates; // D states (Aho's terminology)
// Index is state number
// Contents are RBBIStateDescriptor pointers.
RBBITableBuilder(const RBBITableBuilder &other); // forbid copying of this class
RBBITableBuilder &operator=(const RBBITableBuilder &other); // forbid copying of this class
};
//
// RBBIStateDescriptor - The DFA is constructed as a set of these descriptors,
// one for each state.
class RBBIStateDescriptor : public UMemory {
public:
UBool fMarked;
int32_t fAccepting;
int32_t fLookAhead;
UVector *fTagVals;
int32_t fTagsIdx;
UVector *fPositions; // Set of parse tree positions associated
// with this state. Unordered (it's a set).
// UVector contents are RBBINode *
UVector32 *fDtran; // Transitions out of this state.
// indexed by input character
// contents is int index of dest state
// in RBBITableBuilder.fDStates
RBBIStateDescriptor(int maxInputSymbol, UErrorCode *fStatus);
~RBBIStateDescriptor();
private:
RBBIStateDescriptor(const RBBIStateDescriptor &other); // forbid copying of this class
RBBIStateDescriptor &operator=(const RBBIStateDescriptor &other); // forbid copying of this class
};
U_NAMESPACE_END
#endif
|