xref: /onnv-gate/usr/src/cmd/man/src/util/nsgmls.src/lib/TrieBuilder.cxx (revision 0:68f95e015346)
1 // Copyright (c) 1994 James Clark
2 // See the file COPYING for copying permission.
3 #pragma ident	"%Z%%M%	%I%	%E% SMI"
4 
5 #ifdef __GNUG__
6 #pragma implementation
7 #endif
8 #include "splib.h"
9 #include "types.h"
10 #include "macros.h"
11 #include "StringOf.h"
12 #include "Trie.h"
13 #include "TrieBuilder.h"
14 #include "Priority.h"
15 #include <stdlib.h>
16 
17 #ifdef SP_NAMESPACE
18 namespace SP_NAMESPACE {
19 #endif
20 
~Trie()21 Trie::~Trie()
22 {
23   if (next_)
24     delete [] next_;
25 }
26 
Trie(const Trie & t)27 Trie::Trie(const Trie &t)
28 : nCodes_(t.nCodes_),
29   token_(t.token_),
30   tokenLength_(t.tokenLength_),
31   priority_(t.priority_),
32   blank_(t.blank_)
33 {
34   if (t.next_) {
35     next_ = new Trie[nCodes_];
36     for (int i = 0; i < nCodes_; i++)
37       next_[i] = t.next_[i];
38   }
39   else
40     next_ = 0;
41 }
42 
operator =(const Trie & t)43 Trie &Trie::operator=(const Trie &t)
44 {
45   if (next_)
46     delete [] next_;
47   nCodes_ = t.nCodes_;
48   token_ = t.token_;
49   tokenLength_ = t.tokenLength_;
50   priority_ = t.priority_;
51   blank_ = t.blank_;
52   if (t.next_) {
53     next_ = new Trie[nCodes_];
54     for (int i = 0; i < nCodes_; i++)
55       next_[i] = t.next_[i];
56   }
57   else
58     next_ = 0;
59   return *this;
60 }
61 
TrieBuilder(int nCodes)62 TrieBuilder::TrieBuilder(int nCodes)
63 : nCodes_(nCodes), root_(new Trie)
64 {
65   root_->token_ = 0;
66   root_->tokenLength_ = 0;
67   root_->priority_ = Priority::data;
68   root_->nCodes_ = nCodes;
69 }
70 
recognize(const String<EquivCode> & chars,Token t,Priority::Type priority,TokenVector & ambiguities)71 void TrieBuilder::recognize(const String<EquivCode> &chars,
72 			    Token t,
73 			    Priority::Type priority,
74 			    TokenVector &ambiguities)
75 {
76   setToken(extendTrie(root_.pointer(), chars), chars.size(), t, priority,
77 	   ambiguities);
78 }
79 
recognize(const String<EquivCode> & chars,const String<EquivCode> & set,Token t,Priority::Type priority,TokenVector & ambiguities)80 void TrieBuilder::recognize(const String<EquivCode> &chars,
81 			    const String<EquivCode> &set,
82 			    Token t,
83 			    Priority::Type priority,
84 			    TokenVector &ambiguities)
85 {
86   Trie *trie = extendTrie(root_.pointer(), chars);
87 
88   for (size_t i = 0; i < set.size(); i++)
89     setToken(forceNext(trie, set[i]), chars.size() + 1, t, priority,
90 	     ambiguities);
91 }
92 
recognizeB(const String<EquivCode> & chars,int bSequenceLength,size_t maxBlankSequence,const String<EquivCode> & blankCodes,const String<EquivCode> & chars2,Token token,TokenVector & ambiguities)93 void TrieBuilder::recognizeB(const String<EquivCode> &chars,
94 			     int bSequenceLength,
95 			     size_t maxBlankSequence,
96 			     const String<EquivCode> &blankCodes,
97 			     const String<EquivCode> &chars2,
98 			     Token token,
99 			     TokenVector &ambiguities)
100 {
101   doB(extendTrie(root_.pointer(), chars),
102       chars.size(),
103       bSequenceLength,
104       maxBlankSequence,
105       blankCodes,
106       chars2,
107       token,
108       Priority::blank(bSequenceLength),
109       ambiguities);
110 }
111 
recognizeEE(EquivCode code,Token t)112 void TrieBuilder::recognizeEE(EquivCode code, Token t)
113 {
114   Trie *trie = forceNext(root_.pointer(), code);
115   trie->tokenLength_ = 0;	// it has length 0 in the buffer
116   trie->token_ = t;
117   trie->priority_ = Priority::data;
118 }
119 
doB(Trie * trie,int tokenLength,int minBLength,size_t maxLength,const String<EquivCode> & blankCodes,const String<EquivCode> & chars2,Token token,Priority::Type pri,TokenVector & ambiguities)120 void TrieBuilder::doB(Trie *trie,
121 		      int tokenLength,
122 		      int minBLength,
123 		      size_t maxLength,
124 		      const String<EquivCode> &blankCodes,
125 		      const String<EquivCode> &chars2,
126 		      Token token,
127 		      Priority::Type pri,
128 		      TokenVector &ambiguities)
129 {
130   if (minBLength == 0 && trie->next_ == 0) {
131     if (!trie->blank_) {
132       BlankTrie *b = new BlankTrie;
133       trie->blank_ = b;
134       b->maxBlanksToScan_ = maxLength;
135       b->additionalLength_ = tokenLength;
136       b->codeIsBlank_.assign(nCodes_, 0);
137       for (size_t i = 0; i < blankCodes.size(); i++)
138 	b->codeIsBlank_[blankCodes[i]] = 1;
139       b->tokenLength_ = 0;
140       b->token_ = 0;
141       b->priority_ = Priority::data;
142       b->nCodes_ = nCodes_;
143     }
144     else {
145       // A B sequence is not allowed to be adjacent to a character
146       // that can occur in a blank sequence, so maxLength will be
147       // the same at a node, no matter how we got there.
148       ASSERT(trie->blank_->maxBlanksToScan_ == maxLength);
149       ASSERT(trie->blank_->additionalLength_ == tokenLength);
150     }
151     if (chars2.size() == 0)
152       setToken(trie, tokenLength, token, pri, ambiguities);
153     else
154       setToken(extendTrie(trie->blank_.pointer(), chars2),
155 	       chars2.size(),
156 	       token,
157 	       pri,
158 	       ambiguities);
159   }
160   else {
161     if (minBLength == 0)
162       setToken(extendTrie(trie, chars2), tokenLength + chars2.size(),
163 	       token, pri, ambiguities);
164     for (size_t i = 0; i < blankCodes.size(); i++)
165       doB(forceNext(trie, blankCodes[i]),
166 	  tokenLength + 1,
167 	  minBLength == 0 ? 0 : minBLength - 1,
168 	  maxLength - 1,
169 	  blankCodes,
170 	  chars2,
171 	  token,
172 	  pri,
173 	  ambiguities);
174   }
175 }
176 
extendTrie(Trie * trie,const String<EquivCode> & s)177 Trie *TrieBuilder::extendTrie(Trie *trie, const String<EquivCode> &s)
178 {
179   for (size_t i = 0; i < s.size(); i++)
180     trie = forceNext(trie, s[i]);
181   return trie;
182 }
183 
setToken(Trie * trie,int tokenLength,Token token,Priority::Type pri,TokenVector & ambiguities)184 void TrieBuilder::setToken(Trie *trie,
185 			   int tokenLength,
186 			   Token token,
187 			   Priority::Type pri,
188 			   TokenVector &ambiguities)
189 {
190   if (tokenLength > trie->tokenLength_
191       || (tokenLength == trie->tokenLength_
192 	  && pri > trie->priority_)) {
193     trie->tokenLength_ = tokenLength;
194     trie->token_ = token;
195     trie->priority_ = pri;
196   }
197   else if (trie->tokenLength_ == tokenLength
198 	   && trie->priority_ == pri
199 	   && trie->token_ != token
200 	   && trie->token_ != 0) {
201     ambiguities.push_back(Token(trie->token_));
202     ambiguities.push_back(token);
203   }
204   if (trie->hasNext()) {
205     for (int i = 0; i < nCodes_; i++)
206       setToken(&trie->next_[i], tokenLength, token, pri, ambiguities);
207   }
208 }
209 
copyInto(Trie * into,const Trie * from,int additionalLength)210 void TrieBuilder::copyInto(Trie *into, const Trie *from, int additionalLength)
211 {
212   if (from->token_ != 0) {
213     TokenVector ambiguities;
214     setToken(into, from->tokenLength_ + additionalLength, from->token_,
215 	     from->priority_, ambiguities);
216     ASSERT(ambiguities.size() == 0);
217   }
218   if (from->hasNext())
219     for (int i = 0; i < nCodes_; i++)
220       copyInto(forceNext(into, i), &from->next_[i], additionalLength);
221 }
222 
forceNext(Trie * trie,EquivCode c)223 Trie *TrieBuilder::forceNext(Trie *trie, EquivCode c)
224 {
225   if (!trie->hasNext()) {
226     trie->next_ = new Trie[nCodes_];
227     if (trie->blank_) {
228       trie->blank_->additionalLength_ += 1;
229       trie->blank_->maxBlanksToScan_ -= 1;
230     }
231     Owner<BlankTrie> blankOwner(trie->blank_.extract());
232     const BlankTrie *b = blankOwner.pointer();
233     for (int i = 0; i < nCodes_; i++) {
234       Trie *p = &trie->next_[i];
235       if (b && b->codeIsBlank(i))
236 	trie->next_[i].blank_ = (blankOwner
237 				 ? blankOwner.extract()
238 				 : new BlankTrie(*b));
239       p->token_ = trie->token_;
240       p->tokenLength_ = trie->tokenLength_;
241       p->priority_ = trie->priority_;
242       p->nCodes_ = nCodes_;
243     }
244     if (b)
245       // -1 because 1 was added above
246       copyInto(trie, b, b->additionalLength_ - 1);
247   }
248   return &trie->next_[c];
249 }
250 
251 #ifdef SP_NAMESPACE
252 }
253 #endif
254