xref: /onnv-gate/usr/src/cmd/man/src/util/nsgmls.src/lib/TrieBuilder.cxx (revision 0:68f95e015346)
1*0Sstevel@tonic-gate // Copyright (c) 1994 James Clark
2*0Sstevel@tonic-gate // See the file COPYING for copying permission.
3*0Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
4*0Sstevel@tonic-gate 
5*0Sstevel@tonic-gate #ifdef __GNUG__
6*0Sstevel@tonic-gate #pragma implementation
7*0Sstevel@tonic-gate #endif
8*0Sstevel@tonic-gate #include "splib.h"
9*0Sstevel@tonic-gate #include "types.h"
10*0Sstevel@tonic-gate #include "macros.h"
11*0Sstevel@tonic-gate #include "StringOf.h"
12*0Sstevel@tonic-gate #include "Trie.h"
13*0Sstevel@tonic-gate #include "TrieBuilder.h"
14*0Sstevel@tonic-gate #include "Priority.h"
15*0Sstevel@tonic-gate #include <stdlib.h>
16*0Sstevel@tonic-gate 
17*0Sstevel@tonic-gate #ifdef SP_NAMESPACE
18*0Sstevel@tonic-gate namespace SP_NAMESPACE {
19*0Sstevel@tonic-gate #endif
20*0Sstevel@tonic-gate 
~Trie()21*0Sstevel@tonic-gate Trie::~Trie()
22*0Sstevel@tonic-gate {
23*0Sstevel@tonic-gate   if (next_)
24*0Sstevel@tonic-gate     delete [] next_;
25*0Sstevel@tonic-gate }
26*0Sstevel@tonic-gate 
Trie(const Trie & t)27*0Sstevel@tonic-gate Trie::Trie(const Trie &t)
28*0Sstevel@tonic-gate : nCodes_(t.nCodes_),
29*0Sstevel@tonic-gate   token_(t.token_),
30*0Sstevel@tonic-gate   tokenLength_(t.tokenLength_),
31*0Sstevel@tonic-gate   priority_(t.priority_),
32*0Sstevel@tonic-gate   blank_(t.blank_)
33*0Sstevel@tonic-gate {
34*0Sstevel@tonic-gate   if (t.next_) {
35*0Sstevel@tonic-gate     next_ = new Trie[nCodes_];
36*0Sstevel@tonic-gate     for (int i = 0; i < nCodes_; i++)
37*0Sstevel@tonic-gate       next_[i] = t.next_[i];
38*0Sstevel@tonic-gate   }
39*0Sstevel@tonic-gate   else
40*0Sstevel@tonic-gate     next_ = 0;
41*0Sstevel@tonic-gate }
42*0Sstevel@tonic-gate 
operator =(const Trie & t)43*0Sstevel@tonic-gate Trie &Trie::operator=(const Trie &t)
44*0Sstevel@tonic-gate {
45*0Sstevel@tonic-gate   if (next_)
46*0Sstevel@tonic-gate     delete [] next_;
47*0Sstevel@tonic-gate   nCodes_ = t.nCodes_;
48*0Sstevel@tonic-gate   token_ = t.token_;
49*0Sstevel@tonic-gate   tokenLength_ = t.tokenLength_;
50*0Sstevel@tonic-gate   priority_ = t.priority_;
51*0Sstevel@tonic-gate   blank_ = t.blank_;
52*0Sstevel@tonic-gate   if (t.next_) {
53*0Sstevel@tonic-gate     next_ = new Trie[nCodes_];
54*0Sstevel@tonic-gate     for (int i = 0; i < nCodes_; i++)
55*0Sstevel@tonic-gate       next_[i] = t.next_[i];
56*0Sstevel@tonic-gate   }
57*0Sstevel@tonic-gate   else
58*0Sstevel@tonic-gate     next_ = 0;
59*0Sstevel@tonic-gate   return *this;
60*0Sstevel@tonic-gate }
61*0Sstevel@tonic-gate 
TrieBuilder(int nCodes)62*0Sstevel@tonic-gate TrieBuilder::TrieBuilder(int nCodes)
63*0Sstevel@tonic-gate : nCodes_(nCodes), root_(new Trie)
64*0Sstevel@tonic-gate {
65*0Sstevel@tonic-gate   root_->token_ = 0;
66*0Sstevel@tonic-gate   root_->tokenLength_ = 0;
67*0Sstevel@tonic-gate   root_->priority_ = Priority::data;
68*0Sstevel@tonic-gate   root_->nCodes_ = nCodes;
69*0Sstevel@tonic-gate }
70*0Sstevel@tonic-gate 
recognize(const String<EquivCode> & chars,Token t,Priority::Type priority,TokenVector & ambiguities)71*0Sstevel@tonic-gate void TrieBuilder::recognize(const String<EquivCode> &chars,
72*0Sstevel@tonic-gate 			    Token t,
73*0Sstevel@tonic-gate 			    Priority::Type priority,
74*0Sstevel@tonic-gate 			    TokenVector &ambiguities)
75*0Sstevel@tonic-gate {
76*0Sstevel@tonic-gate   setToken(extendTrie(root_.pointer(), chars), chars.size(), t, priority,
77*0Sstevel@tonic-gate 	   ambiguities);
78*0Sstevel@tonic-gate }
79*0Sstevel@tonic-gate 
recognize(const String<EquivCode> & chars,const String<EquivCode> & set,Token t,Priority::Type priority,TokenVector & ambiguities)80*0Sstevel@tonic-gate void TrieBuilder::recognize(const String<EquivCode> &chars,
81*0Sstevel@tonic-gate 			    const String<EquivCode> &set,
82*0Sstevel@tonic-gate 			    Token t,
83*0Sstevel@tonic-gate 			    Priority::Type priority,
84*0Sstevel@tonic-gate 			    TokenVector &ambiguities)
85*0Sstevel@tonic-gate {
86*0Sstevel@tonic-gate   Trie *trie = extendTrie(root_.pointer(), chars);
87*0Sstevel@tonic-gate 
88*0Sstevel@tonic-gate   for (size_t i = 0; i < set.size(); i++)
89*0Sstevel@tonic-gate     setToken(forceNext(trie, set[i]), chars.size() + 1, t, priority,
90*0Sstevel@tonic-gate 	     ambiguities);
91*0Sstevel@tonic-gate }
92*0Sstevel@tonic-gate 
recognizeB(const String<EquivCode> & chars,int bSequenceLength,size_t maxBlankSequence,const String<EquivCode> & blankCodes,const String<EquivCode> & chars2,Token token,TokenVector & ambiguities)93*0Sstevel@tonic-gate void TrieBuilder::recognizeB(const String<EquivCode> &chars,
94*0Sstevel@tonic-gate 			     int bSequenceLength,
95*0Sstevel@tonic-gate 			     size_t maxBlankSequence,
96*0Sstevel@tonic-gate 			     const String<EquivCode> &blankCodes,
97*0Sstevel@tonic-gate 			     const String<EquivCode> &chars2,
98*0Sstevel@tonic-gate 			     Token token,
99*0Sstevel@tonic-gate 			     TokenVector &ambiguities)
100*0Sstevel@tonic-gate {
101*0Sstevel@tonic-gate   doB(extendTrie(root_.pointer(), chars),
102*0Sstevel@tonic-gate       chars.size(),
103*0Sstevel@tonic-gate       bSequenceLength,
104*0Sstevel@tonic-gate       maxBlankSequence,
105*0Sstevel@tonic-gate       blankCodes,
106*0Sstevel@tonic-gate       chars2,
107*0Sstevel@tonic-gate       token,
108*0Sstevel@tonic-gate       Priority::blank(bSequenceLength),
109*0Sstevel@tonic-gate       ambiguities);
110*0Sstevel@tonic-gate }
111*0Sstevel@tonic-gate 
recognizeEE(EquivCode code,Token t)112*0Sstevel@tonic-gate void TrieBuilder::recognizeEE(EquivCode code, Token t)
113*0Sstevel@tonic-gate {
114*0Sstevel@tonic-gate   Trie *trie = forceNext(root_.pointer(), code);
115*0Sstevel@tonic-gate   trie->tokenLength_ = 0;	// it has length 0 in the buffer
116*0Sstevel@tonic-gate   trie->token_ = t;
117*0Sstevel@tonic-gate   trie->priority_ = Priority::data;
118*0Sstevel@tonic-gate }
119*0Sstevel@tonic-gate 
doB(Trie * trie,int tokenLength,int minBLength,size_t maxLength,const String<EquivCode> & blankCodes,const String<EquivCode> & chars2,Token token,Priority::Type pri,TokenVector & ambiguities)120*0Sstevel@tonic-gate void TrieBuilder::doB(Trie *trie,
121*0Sstevel@tonic-gate 		      int tokenLength,
122*0Sstevel@tonic-gate 		      int minBLength,
123*0Sstevel@tonic-gate 		      size_t maxLength,
124*0Sstevel@tonic-gate 		      const String<EquivCode> &blankCodes,
125*0Sstevel@tonic-gate 		      const String<EquivCode> &chars2,
126*0Sstevel@tonic-gate 		      Token token,
127*0Sstevel@tonic-gate 		      Priority::Type pri,
128*0Sstevel@tonic-gate 		      TokenVector &ambiguities)
129*0Sstevel@tonic-gate {
130*0Sstevel@tonic-gate   if (minBLength == 0 && trie->next_ == 0) {
131*0Sstevel@tonic-gate     if (!trie->blank_) {
132*0Sstevel@tonic-gate       BlankTrie *b = new BlankTrie;
133*0Sstevel@tonic-gate       trie->blank_ = b;
134*0Sstevel@tonic-gate       b->maxBlanksToScan_ = maxLength;
135*0Sstevel@tonic-gate       b->additionalLength_ = tokenLength;
136*0Sstevel@tonic-gate       b->codeIsBlank_.assign(nCodes_, 0);
137*0Sstevel@tonic-gate       for (size_t i = 0; i < blankCodes.size(); i++)
138*0Sstevel@tonic-gate 	b->codeIsBlank_[blankCodes[i]] = 1;
139*0Sstevel@tonic-gate       b->tokenLength_ = 0;
140*0Sstevel@tonic-gate       b->token_ = 0;
141*0Sstevel@tonic-gate       b->priority_ = Priority::data;
142*0Sstevel@tonic-gate       b->nCodes_ = nCodes_;
143*0Sstevel@tonic-gate     }
144*0Sstevel@tonic-gate     else {
145*0Sstevel@tonic-gate       // A B sequence is not allowed to be adjacent to a character
146*0Sstevel@tonic-gate       // that can occur in a blank sequence, so maxLength will be
147*0Sstevel@tonic-gate       // the same at a node, no matter how we got there.
148*0Sstevel@tonic-gate       ASSERT(trie->blank_->maxBlanksToScan_ == maxLength);
149*0Sstevel@tonic-gate       ASSERT(trie->blank_->additionalLength_ == tokenLength);
150*0Sstevel@tonic-gate     }
151*0Sstevel@tonic-gate     if (chars2.size() == 0)
152*0Sstevel@tonic-gate       setToken(trie, tokenLength, token, pri, ambiguities);
153*0Sstevel@tonic-gate     else
154*0Sstevel@tonic-gate       setToken(extendTrie(trie->blank_.pointer(), chars2),
155*0Sstevel@tonic-gate 	       chars2.size(),
156*0Sstevel@tonic-gate 	       token,
157*0Sstevel@tonic-gate 	       pri,
158*0Sstevel@tonic-gate 	       ambiguities);
159*0Sstevel@tonic-gate   }
160*0Sstevel@tonic-gate   else {
161*0Sstevel@tonic-gate     if (minBLength == 0)
162*0Sstevel@tonic-gate       setToken(extendTrie(trie, chars2), tokenLength + chars2.size(),
163*0Sstevel@tonic-gate 	       token, pri, ambiguities);
164*0Sstevel@tonic-gate     for (size_t i = 0; i < blankCodes.size(); i++)
165*0Sstevel@tonic-gate       doB(forceNext(trie, blankCodes[i]),
166*0Sstevel@tonic-gate 	  tokenLength + 1,
167*0Sstevel@tonic-gate 	  minBLength == 0 ? 0 : minBLength - 1,
168*0Sstevel@tonic-gate 	  maxLength - 1,
169*0Sstevel@tonic-gate 	  blankCodes,
170*0Sstevel@tonic-gate 	  chars2,
171*0Sstevel@tonic-gate 	  token,
172*0Sstevel@tonic-gate 	  pri,
173*0Sstevel@tonic-gate 	  ambiguities);
174*0Sstevel@tonic-gate   }
175*0Sstevel@tonic-gate }
176*0Sstevel@tonic-gate 
extendTrie(Trie * trie,const String<EquivCode> & s)177*0Sstevel@tonic-gate Trie *TrieBuilder::extendTrie(Trie *trie, const String<EquivCode> &s)
178*0Sstevel@tonic-gate {
179*0Sstevel@tonic-gate   for (size_t i = 0; i < s.size(); i++)
180*0Sstevel@tonic-gate     trie = forceNext(trie, s[i]);
181*0Sstevel@tonic-gate   return trie;
182*0Sstevel@tonic-gate }
183*0Sstevel@tonic-gate 
setToken(Trie * trie,int tokenLength,Token token,Priority::Type pri,TokenVector & ambiguities)184*0Sstevel@tonic-gate void TrieBuilder::setToken(Trie *trie,
185*0Sstevel@tonic-gate 			   int tokenLength,
186*0Sstevel@tonic-gate 			   Token token,
187*0Sstevel@tonic-gate 			   Priority::Type pri,
188*0Sstevel@tonic-gate 			   TokenVector &ambiguities)
189*0Sstevel@tonic-gate {
190*0Sstevel@tonic-gate   if (tokenLength > trie->tokenLength_
191*0Sstevel@tonic-gate       || (tokenLength == trie->tokenLength_
192*0Sstevel@tonic-gate 	  && pri > trie->priority_)) {
193*0Sstevel@tonic-gate     trie->tokenLength_ = tokenLength;
194*0Sstevel@tonic-gate     trie->token_ = token;
195*0Sstevel@tonic-gate     trie->priority_ = pri;
196*0Sstevel@tonic-gate   }
197*0Sstevel@tonic-gate   else if (trie->tokenLength_ == tokenLength
198*0Sstevel@tonic-gate 	   && trie->priority_ == pri
199*0Sstevel@tonic-gate 	   && trie->token_ != token
200*0Sstevel@tonic-gate 	   && trie->token_ != 0) {
201*0Sstevel@tonic-gate     ambiguities.push_back(Token(trie->token_));
202*0Sstevel@tonic-gate     ambiguities.push_back(token);
203*0Sstevel@tonic-gate   }
204*0Sstevel@tonic-gate   if (trie->hasNext()) {
205*0Sstevel@tonic-gate     for (int i = 0; i < nCodes_; i++)
206*0Sstevel@tonic-gate       setToken(&trie->next_[i], tokenLength, token, pri, ambiguities);
207*0Sstevel@tonic-gate   }
208*0Sstevel@tonic-gate }
209*0Sstevel@tonic-gate 
copyInto(Trie * into,const Trie * from,int additionalLength)210*0Sstevel@tonic-gate void TrieBuilder::copyInto(Trie *into, const Trie *from, int additionalLength)
211*0Sstevel@tonic-gate {
212*0Sstevel@tonic-gate   if (from->token_ != 0) {
213*0Sstevel@tonic-gate     TokenVector ambiguities;
214*0Sstevel@tonic-gate     setToken(into, from->tokenLength_ + additionalLength, from->token_,
215*0Sstevel@tonic-gate 	     from->priority_, ambiguities);
216*0Sstevel@tonic-gate     ASSERT(ambiguities.size() == 0);
217*0Sstevel@tonic-gate   }
218*0Sstevel@tonic-gate   if (from->hasNext())
219*0Sstevel@tonic-gate     for (int i = 0; i < nCodes_; i++)
220*0Sstevel@tonic-gate       copyInto(forceNext(into, i), &from->next_[i], additionalLength);
221*0Sstevel@tonic-gate }
222*0Sstevel@tonic-gate 
forceNext(Trie * trie,EquivCode c)223*0Sstevel@tonic-gate Trie *TrieBuilder::forceNext(Trie *trie, EquivCode c)
224*0Sstevel@tonic-gate {
225*0Sstevel@tonic-gate   if (!trie->hasNext()) {
226*0Sstevel@tonic-gate     trie->next_ = new Trie[nCodes_];
227*0Sstevel@tonic-gate     if (trie->blank_) {
228*0Sstevel@tonic-gate       trie->blank_->additionalLength_ += 1;
229*0Sstevel@tonic-gate       trie->blank_->maxBlanksToScan_ -= 1;
230*0Sstevel@tonic-gate     }
231*0Sstevel@tonic-gate     Owner<BlankTrie> blankOwner(trie->blank_.extract());
232*0Sstevel@tonic-gate     const BlankTrie *b = blankOwner.pointer();
233*0Sstevel@tonic-gate     for (int i = 0; i < nCodes_; i++) {
234*0Sstevel@tonic-gate       Trie *p = &trie->next_[i];
235*0Sstevel@tonic-gate       if (b && b->codeIsBlank(i))
236*0Sstevel@tonic-gate 	trie->next_[i].blank_ = (blankOwner
237*0Sstevel@tonic-gate 				 ? blankOwner.extract()
238*0Sstevel@tonic-gate 				 : new BlankTrie(*b));
239*0Sstevel@tonic-gate       p->token_ = trie->token_;
240*0Sstevel@tonic-gate       p->tokenLength_ = trie->tokenLength_;
241*0Sstevel@tonic-gate       p->priority_ = trie->priority_;
242*0Sstevel@tonic-gate       p->nCodes_ = nCodes_;
243*0Sstevel@tonic-gate     }
244*0Sstevel@tonic-gate     if (b)
245*0Sstevel@tonic-gate       // -1 because 1 was added above
246*0Sstevel@tonic-gate       copyInto(trie, b, b->additionalLength_ - 1);
247*0Sstevel@tonic-gate   }
248*0Sstevel@tonic-gate   return &trie->next_[c];
249*0Sstevel@tonic-gate }
250*0Sstevel@tonic-gate 
251*0Sstevel@tonic-gate #ifdef SP_NAMESPACE
252*0Sstevel@tonic-gate }
253*0Sstevel@tonic-gate #endif
254