xref: /onnv-gate/usr/src/cmd/man/src/util/nsgmls.src/lib/Syntax.cxx (revision 0:68f95e015346)
1 // Copyright (c) 1994 James Clark
2 // See the file COPYING for copying permission.
3 #pragma ident	"%Z%%M%	%I%	%E% SMI"
4 
5 #ifdef __GNUG__
6 #pragma implementation
7 #endif
8 #include "splib.h"
9 #include "Syntax.h"
10 #include "Sd.h"
11 #include "CharsetInfo.h"
12 #include "ISetIter.h"
13 #include "macros.h"
14 #include "MarkupScan.h"
15 #include "constant.h"
16 
17 #ifdef SP_NAMESPACE
18 namespace SP_NAMESPACE {
19 #endif
20 
21 const int Syntax::referenceQuantity_[] = {
22   40,
23   960,
24   960,
25   16,
26   16,
27   16,
28   32,
29   96,
30   16,
31   240,
32   8,
33   2,
34   240,
35   960,
36   24
37 };
38 
Syntax(const Sd & sd)39 Syntax::Syntax(const Sd &sd)
40 : generalSubst_(0),
41   entitySubst_(0),
42   categoryTable_(otherCategory),
43   shuncharControls_(0),
44   multicode_(0),
45   markupScanTable_(MarkupScan::normal)
46 {
47   static const char lcletter[] = "abcdefghijklmnopqrstuvwxyz";
48   static const char ucletter[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
49   int i;
50   for (i = 0; i < 26; i++) {
51     Char lc = sd.execToInternal(lcletter[i]);
52     Char uc = sd.execToInternal(ucletter[i]);
53     set_[nameStart] += lc;
54     set_[nameStart] += uc;
55     set_[minimumData] += lc;
56     set_[minimumData] += uc;
57     set_[significant] += lc;
58     set_[significant] += uc;
59     if (i < 6) {
60       set_[hexDigit] += lc;
61       set_[hexDigit] += uc;
62     }
63     categoryTable_.setChar(lc, nameStartCategory);
64     categoryTable_.setChar(uc, nameStartCategory);
65     subst(lc, uc);
66   }
67   static const char digits[] = "0123456789";
68   for (i = 0; i < 10; i++) {
69     Char c = sd.execToInternal(digits[i]);
70     set_[digit] += c;
71     set_[hexDigit] += c;
72     set_[minimumData] += c;
73     set_[significant] += c;
74     categoryTable_.setChar(c, digitCategory);
75   }
76   static const char special[] = "'()+,-./:=?";
77   for (i = 0; special[i] != '\0'; i++) {
78     Char c = sd.execToInternal(special[i]);
79     set_[minimumData] += c;
80     set_[significant] += c;
81   }
82 
83   if (sd.www()) {
84     static const char wwwSpecial[] = { 33, 35, 36, 37, 42, 59, 64, 95, 0 };
85     for (i = 0; wwwSpecial[i] != '\0'; i++) {
86       const CharsetInfo &charset = sd.internalCharset();
87       WideChar c;
88       ISet<WideChar> set;
89       if (charset.univToDesc(wwwSpecial[i], c, set) > 0 && c <= Char(-1)) {
90 	set_[minimumData] += Char(c);
91 	set_[significant] += c;
92       }
93     }
94   }
95 
96   for (i = 0; i < nQuantity; i++)
97     quantity_[i] = referenceQuantity_[i];
98   for (i = 0; i < 3; i++)
99     standardFunctionValid_[i] = 0;
100 }
101 
addNameCharacters(const ISet<Char> & set)102 void Syntax::addNameCharacters(const ISet<Char> &set)
103 {
104   ISetIter<Char> iter(set);
105   Char min, max;
106   while (iter.next(min, max)) {
107     set_[nmchar].addRange(min, max);
108     set_[significant].addRange(min, max);
109     categoryTable_.setRange(min, max, otherNameCategory);
110   }
111 }
112 
addNameStartCharacters(const ISet<Char> & set)113 void Syntax::addNameStartCharacters(const ISet<Char> &set)
114 {
115   ISetIter<Char> iter(set);
116   Char min, max;
117   while (iter.next(min, max)) {
118     set_[nameStart].addRange(min, max);
119     set_[significant].addRange(min, max);
120     categoryTable_.setRange(min, max, nameStartCategory);
121   }
122 }
123 
addSubst(Char lc,Char uc)124 void Syntax::addSubst(Char lc, Char uc)
125 {
126   subst(lc, uc);
127 }
128 
setStandardFunction(StandardFunction f,Char c)129 void Syntax::setStandardFunction(StandardFunction f, Char c)
130 {
131   standardFunction_[f] = c;
132   standardFunctionValid_[f] = 1;
133   set_[minimumData] += c;
134   set_[s] += c;
135   categoryTable_.setChar(c, sCategory);
136   set_[functionChar] += c;
137   set_[significant] += c;
138   switch (f) {
139   case fSPACE:
140     set_[blank] += c;
141     break;
142   case fRE:
143   case fRS:
144     break;
145   }
146 }
147 
enterStandardFunctionNames()148 void Syntax::enterStandardFunctionNames()
149 {
150   static Syntax::ReservedName name[3] = {
151     rRE, rRS, rSPACE
152   };
153   for (int i = 0; i < 3; i++)
154     if (standardFunctionValid_[i])
155       functionTable_.insert(reservedName(name[i]), standardFunction_[i]);
156 }
157 
setDelimGeneral(int i,const StringC & str)158 void Syntax::setDelimGeneral(int i, const StringC &str)
159 {
160   delimGeneral_[i] = str;
161   for (size_t j = 0; j < str.size(); j++)
162     set_[significant] += str[j];
163 }
164 
addDelimShortref(const StringC & str,const CharsetInfo & charset)165 void Syntax::addDelimShortref(const StringC &str, const CharsetInfo &charset)
166 {
167   if (str.size() == 1 && str[0] != charset.execToDesc('B') && !isB(str[0]))
168     delimShortrefSimple_.add(str[0]);
169   else
170     delimShortrefComplex_.push_back(str);
171   for (size_t i = 0; i < str.size(); i++)
172     set_[significant] += str[i];
173 }
174 
addDelimShortrefs(const ISet<Char> & shortrefChars,const CharsetInfo & charset)175 void Syntax::addDelimShortrefs(const ISet<Char> &shortrefChars,
176 			       const CharsetInfo &charset)
177 {
178   ISetIter<Char> blankIter(set_[blank]);
179   Char min, max;
180   StringC specialChars;
181   while (blankIter.next(min, max)) {
182     do {
183       specialChars += min;
184     } while (min++ != max);
185   }
186   specialChars += charset.execToDesc('B');
187   const ISet<Char> *simpleCharsPtr = &shortrefChars;
188   ISet<Char> simpleChars;
189   for (size_t i = 0; i < specialChars.size(); i++)
190     if (shortrefChars.contains(specialChars[i])) {
191       if (simpleCharsPtr != &simpleChars) {
192 	simpleChars = shortrefChars;
193 	simpleCharsPtr = &simpleChars;
194       }
195       simpleChars.remove(specialChars[i]);
196     }
197   ISetIter<Char> iter(*simpleCharsPtr);
198   while (iter.next(min, max)) {
199     delimShortrefSimple_.addRange(min, max);
200     set_[significant].addRange(min, max);
201   }
202 }
203 
addFunctionChar(const StringC & str,FunctionClass fun,Char c)204 void Syntax::addFunctionChar(const StringC &str, FunctionClass fun, Char c)
205 {
206   switch (fun) {
207   case cFUNCHAR:
208     break;
209   case cSEPCHAR:
210     set_[s] += c;
211     categoryTable_.setChar(c, sCategory);
212     set_[blank] += c;
213     set_[sepchar] += c;
214     break;
215   case cMSOCHAR:
216     multicode_ = 1;
217     markupScanTable_.setChar(c, MarkupScan::out);
218     break;
219   case cMSICHAR:
220     // don't need to do anything special if we just have MSICHARs
221     markupScanTable_.setChar(c, MarkupScan::in);
222     break;
223   case cMSSCHAR:
224     multicode_ = 1;
225     markupScanTable_.setChar(c, MarkupScan::suppress);
226     break;
227   }
228   set_[functionChar] += c;
229   set_[significant] += c;
230   functionTable_.insert(str, c);
231 }
232 
setName(int i,const StringC & str)233 void Syntax::setName(int i, const StringC &str)
234 {
235   names_[i] = str;
236   nameTable_.insert(str, i);
237 }
238 
setNamecaseGeneral(Boolean b)239 void Syntax::setNamecaseGeneral(Boolean b)
240 {
241   namecaseGeneral_ = b;
242   generalSubst_ = b ? &upperSubst_ : &identitySubst_;
243 }
244 
setNamecaseEntity(Boolean b)245 void Syntax::setNamecaseEntity(Boolean b)
246 {
247   namecaseEntity_ = b;
248   entitySubst_ = b ? &upperSubst_ : &identitySubst_;
249 }
250 
subst(Char from,Char to)251 void Syntax::subst(Char from, Char to)
252 {
253   upperSubst_.addSubst(from, to);
254 }
255 
addShunchar(Char c)256 void Syntax::addShunchar(Char c)
257 {
258   shunchar_.add(c);
259 }
260 
lookupReservedName(const StringC & str,ReservedName * result) const261 Boolean Syntax::lookupReservedName(const StringC &str,
262 				   ReservedName *result) const
263 {
264   const int *tem = nameTable_.lookup(str);
265   if (tem) {
266     *result = ReservedName(*tem);
267     return 1;
268   }
269   else
270     return 0;
271 }
272 
lookupFunctionChar(const StringC & name,Char * result) const273 Boolean Syntax::lookupFunctionChar(const StringC &name, Char *result) const
274 {
275   const Char *p = functionTable_.lookup(name);
276   if (p) {
277     *result = *p;
278     return 1;
279   }
280   else
281     return 0;
282 }
283 
284 #ifdef __GNUG__
285 typedef HashTableIter<StringC,Char> Dummy_HashTableIter_StringC_Char;
286 #endif
287 
charFunctionName(Char c,const StringC * & name) const288 Boolean Syntax::charFunctionName(Char c, const StringC *&name) const
289 {
290   HashTableIter<StringC,Char> iter(functionTable_);
291   const Char *cp;
292   while (iter.next(name, cp))
293     if (*cp == c)
294       return 1;
295   return 0;
296 }
297 
isValidShortref(const StringC & str) const298 Boolean Syntax::isValidShortref(const StringC &str) const
299 {
300   if (str.size() == 1 && delimShortrefSimple_.contains(str[0]))
301     return 1;
302   for (size_t i = 0; i < delimShortrefComplex_.size(); i++)
303     if (str == delimShortrefComplex_[i])
304       return 1;
305   return 0;
306 }
307 
implySgmlChar(const Sd & sd)308 void Syntax::implySgmlChar(const Sd &sd)
309 
310 {
311   const CharsetInfo &internalCharset = sd.internalCharset();
312   internalCharset.getDescSet(set_[sgmlChar]);
313   ISet<WideChar> invalid;
314   checkSgmlChar(sd, 0, 0, invalid);
315   ISetIter<WideChar> iter(invalid);
316   WideChar min, max;
317   while (iter.next(min, max)) {
318     do {
319       if (min <= charMax)
320 	set_[sgmlChar].remove(Char(min));
321     } while (min++ != max);
322   }
323 }
324 
checkSgmlChar(const Sd & sd,const::SP_NAMESPACE_SCOPE Syntax * otherSyntax,Boolean invalidUseDocumentCharset,ISet<WideChar> & invalid) const325 void Syntax::checkSgmlChar(const Sd &sd,
326 			   const ::SP_NAMESPACE_SCOPE Syntax *otherSyntax,
327 			   Boolean invalidUseDocumentCharset,
328 			   ISet<WideChar> &invalid) const
329 {
330   ISetIter<Char> iter(shunchar_);
331   Char min, max;
332   while (iter.next(min, max)) {
333     if (min <= max) {
334       do {
335 	Char c;
336 	if (!sd.internalCharsetIsDocCharset()) {
337 	  UnivChar univ;
338 	  WideChar tem;
339 	  ISet<WideChar> set;
340 	  if (sd.docCharset().descToUniv(min, univ)
341 	      && sd.internalCharset().univToDesc(univ, tem, set)
342 	      && tem <= charMax)
343 	    c = Char(tem);
344 	  else {
345 	    const PublicId *base;
346 	    StringC lit;
347 	    Number n;
348 	    CharsetDeclRange::Type type;
349 	    // If it's a declared but unknown character,
350 	    // then it can't be significant,
351 	    if (invalidUseDocumentCharset
352 	        && sd.docCharsetDecl().getCharInfo(min,
353 		                                   base,
354 						   type,
355 						   n,
356 						   lit)
357 		&& type != CharsetDeclRange::unused)
358 	      invalid += min;
359 	    continue;
360 	  }
361 	}
362 	else
363 	  c = min;
364 	if (!set_[significant].contains(c)
365 	    && (!otherSyntax || !otherSyntax->set_[significant].contains(c))
366 	    && set_[sgmlChar].contains(c))
367 	 invalid += invalidUseDocumentCharset ? min : c;
368       } while (min++ != max);
369     }
370   }
371   if (shuncharControls_) {
372     UnivChar i;
373     const CharsetInfo &charset = invalidUseDocumentCharset ? sd.docCharset() : sd.internalCharset();
374     for (i = 0; i < 32; i++)
375       checkUnivControlChar(i, charset, otherSyntax, invalid);
376     for (i = 127; i < 160; i++)
377       checkUnivControlChar(i, charset, otherSyntax, invalid);
378   }
379 }
380 
checkUnivControlChar(UnivChar univChar,const CharsetInfo & internalCharset,const::SP_NAMESPACE_SCOPE Syntax * otherSyntax,ISet<WideChar> & invalid) const381 void Syntax::checkUnivControlChar(UnivChar univChar,
382 				  const CharsetInfo &internalCharset,
383 				  const ::SP_NAMESPACE_SCOPE Syntax *otherSyntax,
384 				  ISet<WideChar> &invalid) const
385 {
386   WideChar c;
387   ISet<WideChar> set;
388   switch (internalCharset.univToDesc(univChar, c, set)) {
389   case 0:
390     break;
391   case 1:
392     set += c;
393     // fall through
394   default:
395     {
396       ISetIter<WideChar> iter(set);
397       WideChar min, max;
398       while (iter.next(min, max)) {
399 	do {
400 	  if (min > charMax)
401 	    break;
402 	  Char ch = Char(min);
403 	  if (!set_[significant].contains(ch)
404 	      && (!otherSyntax
405 		  || !otherSyntax->set_[significant].contains(ch))
406 	      && set_[sgmlChar].contains(ch))
407 	    invalid += ch;
408 	} while (min++ != max);
409       }
410     }
411   }
412 }
413 
rniReservedName(ReservedName i) const414 StringC Syntax::rniReservedName(ReservedName i) const
415 {
416   StringC result = delimGeneral(dRNI);
417   result += reservedName(i);
418   return result;
419 }
420 
upperSubstTable() const421 const SubstTable<Char> &Syntax::upperSubstTable() const
422 {
423   return upperSubst_;
424 }
425 
peroDelim() const426 const StringC &Syntax::peroDelim() const
427 {
428   return delimGeneral(dPERO);
429 }
430 
isHexDigit(Xchar c) const431 Boolean Syntax::isHexDigit(Xchar c) const
432 {
433   switch (categoryTable_[c]) {
434   case digitCategory:
435     return 1;
436   case nameStartCategory:
437     break;
438   default:
439     return 0;
440   }
441   return set_[hexDigit].contains(Char(c));
442 }
443 
addEntity(const StringC & name,Char c)444 void Syntax::addEntity(const StringC &name, Char c)
445 {
446   entityNames_.push_back(name);
447   entityChars_ += c;
448 }
449 
450 #ifdef SP_NAMESPACE
451 }
452 #endif
453