1*0Sstevel@tonic-gate // Copyright (c) 1994 James Clark
2*0Sstevel@tonic-gate // See the file COPYING for copying permission.
3*0Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI"
4*0Sstevel@tonic-gate
5*0Sstevel@tonic-gate #ifdef __GNUG__
6*0Sstevel@tonic-gate #pragma implementation
7*0Sstevel@tonic-gate #endif
8*0Sstevel@tonic-gate #include "splib.h"
9*0Sstevel@tonic-gate #include "types.h"
10*0Sstevel@tonic-gate #include "macros.h"
11*0Sstevel@tonic-gate #include "StringOf.h"
12*0Sstevel@tonic-gate #include "Trie.h"
13*0Sstevel@tonic-gate #include "TrieBuilder.h"
14*0Sstevel@tonic-gate #include "Priority.h"
15*0Sstevel@tonic-gate #include <stdlib.h>
16*0Sstevel@tonic-gate
17*0Sstevel@tonic-gate #ifdef SP_NAMESPACE
18*0Sstevel@tonic-gate namespace SP_NAMESPACE {
19*0Sstevel@tonic-gate #endif
20*0Sstevel@tonic-gate
~Trie()21*0Sstevel@tonic-gate Trie::~Trie()
22*0Sstevel@tonic-gate {
23*0Sstevel@tonic-gate if (next_)
24*0Sstevel@tonic-gate delete [] next_;
25*0Sstevel@tonic-gate }
26*0Sstevel@tonic-gate
Trie(const Trie & t)27*0Sstevel@tonic-gate Trie::Trie(const Trie &t)
28*0Sstevel@tonic-gate : nCodes_(t.nCodes_),
29*0Sstevel@tonic-gate token_(t.token_),
30*0Sstevel@tonic-gate tokenLength_(t.tokenLength_),
31*0Sstevel@tonic-gate priority_(t.priority_),
32*0Sstevel@tonic-gate blank_(t.blank_)
33*0Sstevel@tonic-gate {
34*0Sstevel@tonic-gate if (t.next_) {
35*0Sstevel@tonic-gate next_ = new Trie[nCodes_];
36*0Sstevel@tonic-gate for (int i = 0; i < nCodes_; i++)
37*0Sstevel@tonic-gate next_[i] = t.next_[i];
38*0Sstevel@tonic-gate }
39*0Sstevel@tonic-gate else
40*0Sstevel@tonic-gate next_ = 0;
41*0Sstevel@tonic-gate }
42*0Sstevel@tonic-gate
operator =(const Trie & t)43*0Sstevel@tonic-gate Trie &Trie::operator=(const Trie &t)
44*0Sstevel@tonic-gate {
45*0Sstevel@tonic-gate if (next_)
46*0Sstevel@tonic-gate delete [] next_;
47*0Sstevel@tonic-gate nCodes_ = t.nCodes_;
48*0Sstevel@tonic-gate token_ = t.token_;
49*0Sstevel@tonic-gate tokenLength_ = t.tokenLength_;
50*0Sstevel@tonic-gate priority_ = t.priority_;
51*0Sstevel@tonic-gate blank_ = t.blank_;
52*0Sstevel@tonic-gate if (t.next_) {
53*0Sstevel@tonic-gate next_ = new Trie[nCodes_];
54*0Sstevel@tonic-gate for (int i = 0; i < nCodes_; i++)
55*0Sstevel@tonic-gate next_[i] = t.next_[i];
56*0Sstevel@tonic-gate }
57*0Sstevel@tonic-gate else
58*0Sstevel@tonic-gate next_ = 0;
59*0Sstevel@tonic-gate return *this;
60*0Sstevel@tonic-gate }
61*0Sstevel@tonic-gate
TrieBuilder(int nCodes)62*0Sstevel@tonic-gate TrieBuilder::TrieBuilder(int nCodes)
63*0Sstevel@tonic-gate : nCodes_(nCodes), root_(new Trie)
64*0Sstevel@tonic-gate {
65*0Sstevel@tonic-gate root_->token_ = 0;
66*0Sstevel@tonic-gate root_->tokenLength_ = 0;
67*0Sstevel@tonic-gate root_->priority_ = Priority::data;
68*0Sstevel@tonic-gate root_->nCodes_ = nCodes;
69*0Sstevel@tonic-gate }
70*0Sstevel@tonic-gate
recognize(const String<EquivCode> & chars,Token t,Priority::Type priority,TokenVector & ambiguities)71*0Sstevel@tonic-gate void TrieBuilder::recognize(const String<EquivCode> &chars,
72*0Sstevel@tonic-gate Token t,
73*0Sstevel@tonic-gate Priority::Type priority,
74*0Sstevel@tonic-gate TokenVector &ambiguities)
75*0Sstevel@tonic-gate {
76*0Sstevel@tonic-gate setToken(extendTrie(root_.pointer(), chars), chars.size(), t, priority,
77*0Sstevel@tonic-gate ambiguities);
78*0Sstevel@tonic-gate }
79*0Sstevel@tonic-gate
recognize(const String<EquivCode> & chars,const String<EquivCode> & set,Token t,Priority::Type priority,TokenVector & ambiguities)80*0Sstevel@tonic-gate void TrieBuilder::recognize(const String<EquivCode> &chars,
81*0Sstevel@tonic-gate const String<EquivCode> &set,
82*0Sstevel@tonic-gate Token t,
83*0Sstevel@tonic-gate Priority::Type priority,
84*0Sstevel@tonic-gate TokenVector &ambiguities)
85*0Sstevel@tonic-gate {
86*0Sstevel@tonic-gate Trie *trie = extendTrie(root_.pointer(), chars);
87*0Sstevel@tonic-gate
88*0Sstevel@tonic-gate for (size_t i = 0; i < set.size(); i++)
89*0Sstevel@tonic-gate setToken(forceNext(trie, set[i]), chars.size() + 1, t, priority,
90*0Sstevel@tonic-gate ambiguities);
91*0Sstevel@tonic-gate }
92*0Sstevel@tonic-gate
recognizeB(const String<EquivCode> & chars,int bSequenceLength,size_t maxBlankSequence,const String<EquivCode> & blankCodes,const String<EquivCode> & chars2,Token token,TokenVector & ambiguities)93*0Sstevel@tonic-gate void TrieBuilder::recognizeB(const String<EquivCode> &chars,
94*0Sstevel@tonic-gate int bSequenceLength,
95*0Sstevel@tonic-gate size_t maxBlankSequence,
96*0Sstevel@tonic-gate const String<EquivCode> &blankCodes,
97*0Sstevel@tonic-gate const String<EquivCode> &chars2,
98*0Sstevel@tonic-gate Token token,
99*0Sstevel@tonic-gate TokenVector &ambiguities)
100*0Sstevel@tonic-gate {
101*0Sstevel@tonic-gate doB(extendTrie(root_.pointer(), chars),
102*0Sstevel@tonic-gate chars.size(),
103*0Sstevel@tonic-gate bSequenceLength,
104*0Sstevel@tonic-gate maxBlankSequence,
105*0Sstevel@tonic-gate blankCodes,
106*0Sstevel@tonic-gate chars2,
107*0Sstevel@tonic-gate token,
108*0Sstevel@tonic-gate Priority::blank(bSequenceLength),
109*0Sstevel@tonic-gate ambiguities);
110*0Sstevel@tonic-gate }
111*0Sstevel@tonic-gate
recognizeEE(EquivCode code,Token t)112*0Sstevel@tonic-gate void TrieBuilder::recognizeEE(EquivCode code, Token t)
113*0Sstevel@tonic-gate {
114*0Sstevel@tonic-gate Trie *trie = forceNext(root_.pointer(), code);
115*0Sstevel@tonic-gate trie->tokenLength_ = 0; // it has length 0 in the buffer
116*0Sstevel@tonic-gate trie->token_ = t;
117*0Sstevel@tonic-gate trie->priority_ = Priority::data;
118*0Sstevel@tonic-gate }
119*0Sstevel@tonic-gate
doB(Trie * trie,int tokenLength,int minBLength,size_t maxLength,const String<EquivCode> & blankCodes,const String<EquivCode> & chars2,Token token,Priority::Type pri,TokenVector & ambiguities)120*0Sstevel@tonic-gate void TrieBuilder::doB(Trie *trie,
121*0Sstevel@tonic-gate int tokenLength,
122*0Sstevel@tonic-gate int minBLength,
123*0Sstevel@tonic-gate size_t maxLength,
124*0Sstevel@tonic-gate const String<EquivCode> &blankCodes,
125*0Sstevel@tonic-gate const String<EquivCode> &chars2,
126*0Sstevel@tonic-gate Token token,
127*0Sstevel@tonic-gate Priority::Type pri,
128*0Sstevel@tonic-gate TokenVector &ambiguities)
129*0Sstevel@tonic-gate {
130*0Sstevel@tonic-gate if (minBLength == 0 && trie->next_ == 0) {
131*0Sstevel@tonic-gate if (!trie->blank_) {
132*0Sstevel@tonic-gate BlankTrie *b = new BlankTrie;
133*0Sstevel@tonic-gate trie->blank_ = b;
134*0Sstevel@tonic-gate b->maxBlanksToScan_ = maxLength;
135*0Sstevel@tonic-gate b->additionalLength_ = tokenLength;
136*0Sstevel@tonic-gate b->codeIsBlank_.assign(nCodes_, 0);
137*0Sstevel@tonic-gate for (size_t i = 0; i < blankCodes.size(); i++)
138*0Sstevel@tonic-gate b->codeIsBlank_[blankCodes[i]] = 1;
139*0Sstevel@tonic-gate b->tokenLength_ = 0;
140*0Sstevel@tonic-gate b->token_ = 0;
141*0Sstevel@tonic-gate b->priority_ = Priority::data;
142*0Sstevel@tonic-gate b->nCodes_ = nCodes_;
143*0Sstevel@tonic-gate }
144*0Sstevel@tonic-gate else {
145*0Sstevel@tonic-gate // A B sequence is not allowed to be adjacent to a character
146*0Sstevel@tonic-gate // that can occur in a blank sequence, so maxLength will be
147*0Sstevel@tonic-gate // the same at a node, no matter how we got there.
148*0Sstevel@tonic-gate ASSERT(trie->blank_->maxBlanksToScan_ == maxLength);
149*0Sstevel@tonic-gate ASSERT(trie->blank_->additionalLength_ == tokenLength);
150*0Sstevel@tonic-gate }
151*0Sstevel@tonic-gate if (chars2.size() == 0)
152*0Sstevel@tonic-gate setToken(trie, tokenLength, token, pri, ambiguities);
153*0Sstevel@tonic-gate else
154*0Sstevel@tonic-gate setToken(extendTrie(trie->blank_.pointer(), chars2),
155*0Sstevel@tonic-gate chars2.size(),
156*0Sstevel@tonic-gate token,
157*0Sstevel@tonic-gate pri,
158*0Sstevel@tonic-gate ambiguities);
159*0Sstevel@tonic-gate }
160*0Sstevel@tonic-gate else {
161*0Sstevel@tonic-gate if (minBLength == 0)
162*0Sstevel@tonic-gate setToken(extendTrie(trie, chars2), tokenLength + chars2.size(),
163*0Sstevel@tonic-gate token, pri, ambiguities);
164*0Sstevel@tonic-gate for (size_t i = 0; i < blankCodes.size(); i++)
165*0Sstevel@tonic-gate doB(forceNext(trie, blankCodes[i]),
166*0Sstevel@tonic-gate tokenLength + 1,
167*0Sstevel@tonic-gate minBLength == 0 ? 0 : minBLength - 1,
168*0Sstevel@tonic-gate maxLength - 1,
169*0Sstevel@tonic-gate blankCodes,
170*0Sstevel@tonic-gate chars2,
171*0Sstevel@tonic-gate token,
172*0Sstevel@tonic-gate pri,
173*0Sstevel@tonic-gate ambiguities);
174*0Sstevel@tonic-gate }
175*0Sstevel@tonic-gate }
176*0Sstevel@tonic-gate
extendTrie(Trie * trie,const String<EquivCode> & s)177*0Sstevel@tonic-gate Trie *TrieBuilder::extendTrie(Trie *trie, const String<EquivCode> &s)
178*0Sstevel@tonic-gate {
179*0Sstevel@tonic-gate for (size_t i = 0; i < s.size(); i++)
180*0Sstevel@tonic-gate trie = forceNext(trie, s[i]);
181*0Sstevel@tonic-gate return trie;
182*0Sstevel@tonic-gate }
183*0Sstevel@tonic-gate
setToken(Trie * trie,int tokenLength,Token token,Priority::Type pri,TokenVector & ambiguities)184*0Sstevel@tonic-gate void TrieBuilder::setToken(Trie *trie,
185*0Sstevel@tonic-gate int tokenLength,
186*0Sstevel@tonic-gate Token token,
187*0Sstevel@tonic-gate Priority::Type pri,
188*0Sstevel@tonic-gate TokenVector &ambiguities)
189*0Sstevel@tonic-gate {
190*0Sstevel@tonic-gate if (tokenLength > trie->tokenLength_
191*0Sstevel@tonic-gate || (tokenLength == trie->tokenLength_
192*0Sstevel@tonic-gate && pri > trie->priority_)) {
193*0Sstevel@tonic-gate trie->tokenLength_ = tokenLength;
194*0Sstevel@tonic-gate trie->token_ = token;
195*0Sstevel@tonic-gate trie->priority_ = pri;
196*0Sstevel@tonic-gate }
197*0Sstevel@tonic-gate else if (trie->tokenLength_ == tokenLength
198*0Sstevel@tonic-gate && trie->priority_ == pri
199*0Sstevel@tonic-gate && trie->token_ != token
200*0Sstevel@tonic-gate && trie->token_ != 0) {
201*0Sstevel@tonic-gate ambiguities.push_back(Token(trie->token_));
202*0Sstevel@tonic-gate ambiguities.push_back(token);
203*0Sstevel@tonic-gate }
204*0Sstevel@tonic-gate if (trie->hasNext()) {
205*0Sstevel@tonic-gate for (int i = 0; i < nCodes_; i++)
206*0Sstevel@tonic-gate setToken(&trie->next_[i], tokenLength, token, pri, ambiguities);
207*0Sstevel@tonic-gate }
208*0Sstevel@tonic-gate }
209*0Sstevel@tonic-gate
copyInto(Trie * into,const Trie * from,int additionalLength)210*0Sstevel@tonic-gate void TrieBuilder::copyInto(Trie *into, const Trie *from, int additionalLength)
211*0Sstevel@tonic-gate {
212*0Sstevel@tonic-gate if (from->token_ != 0) {
213*0Sstevel@tonic-gate TokenVector ambiguities;
214*0Sstevel@tonic-gate setToken(into, from->tokenLength_ + additionalLength, from->token_,
215*0Sstevel@tonic-gate from->priority_, ambiguities);
216*0Sstevel@tonic-gate ASSERT(ambiguities.size() == 0);
217*0Sstevel@tonic-gate }
218*0Sstevel@tonic-gate if (from->hasNext())
219*0Sstevel@tonic-gate for (int i = 0; i < nCodes_; i++)
220*0Sstevel@tonic-gate copyInto(forceNext(into, i), &from->next_[i], additionalLength);
221*0Sstevel@tonic-gate }
222*0Sstevel@tonic-gate
forceNext(Trie * trie,EquivCode c)223*0Sstevel@tonic-gate Trie *TrieBuilder::forceNext(Trie *trie, EquivCode c)
224*0Sstevel@tonic-gate {
225*0Sstevel@tonic-gate if (!trie->hasNext()) {
226*0Sstevel@tonic-gate trie->next_ = new Trie[nCodes_];
227*0Sstevel@tonic-gate if (trie->blank_) {
228*0Sstevel@tonic-gate trie->blank_->additionalLength_ += 1;
229*0Sstevel@tonic-gate trie->blank_->maxBlanksToScan_ -= 1;
230*0Sstevel@tonic-gate }
231*0Sstevel@tonic-gate Owner<BlankTrie> blankOwner(trie->blank_.extract());
232*0Sstevel@tonic-gate const BlankTrie *b = blankOwner.pointer();
233*0Sstevel@tonic-gate for (int i = 0; i < nCodes_; i++) {
234*0Sstevel@tonic-gate Trie *p = &trie->next_[i];
235*0Sstevel@tonic-gate if (b && b->codeIsBlank(i))
236*0Sstevel@tonic-gate trie->next_[i].blank_ = (blankOwner
237*0Sstevel@tonic-gate ? blankOwner.extract()
238*0Sstevel@tonic-gate : new BlankTrie(*b));
239*0Sstevel@tonic-gate p->token_ = trie->token_;
240*0Sstevel@tonic-gate p->tokenLength_ = trie->tokenLength_;
241*0Sstevel@tonic-gate p->priority_ = trie->priority_;
242*0Sstevel@tonic-gate p->nCodes_ = nCodes_;
243*0Sstevel@tonic-gate }
244*0Sstevel@tonic-gate if (b)
245*0Sstevel@tonic-gate // -1 because 1 was added above
246*0Sstevel@tonic-gate copyInto(trie, b, b->additionalLength_ - 1);
247*0Sstevel@tonic-gate }
248*0Sstevel@tonic-gate return &trie->next_[c];
249*0Sstevel@tonic-gate }
250*0Sstevel@tonic-gate
251*0Sstevel@tonic-gate #ifdef SP_NAMESPACE
252*0Sstevel@tonic-gate }
253*0Sstevel@tonic-gate #endif
254