1 // Copyright (c) 1994 James Clark
2 // See the file COPYING for copying permission.
3 #pragma ident "%Z%%M% %I% %E% SMI"
4
5 #ifdef __GNUG__
6 #pragma implementation
7 #endif
8 #include "splib.h"
9 #include "Syntax.h"
10 #include "Sd.h"
11 #include "CharsetInfo.h"
12 #include "ISetIter.h"
13 #include "macros.h"
14 #include "MarkupScan.h"
15 #include "constant.h"
16
17 #ifdef SP_NAMESPACE
18 namespace SP_NAMESPACE {
19 #endif
20
21 const int Syntax::referenceQuantity_[] = {
22 40,
23 960,
24 960,
25 16,
26 16,
27 16,
28 32,
29 96,
30 16,
31 240,
32 8,
33 2,
34 240,
35 960,
36 24
37 };
38
Syntax(const Sd & sd)39 Syntax::Syntax(const Sd &sd)
40 : generalSubst_(0),
41 entitySubst_(0),
42 categoryTable_(otherCategory),
43 shuncharControls_(0),
44 multicode_(0),
45 markupScanTable_(MarkupScan::normal)
46 {
47 static const char lcletter[] = "abcdefghijklmnopqrstuvwxyz";
48 static const char ucletter[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
49 int i;
50 for (i = 0; i < 26; i++) {
51 Char lc = sd.execToInternal(lcletter[i]);
52 Char uc = sd.execToInternal(ucletter[i]);
53 set_[nameStart] += lc;
54 set_[nameStart] += uc;
55 set_[minimumData] += lc;
56 set_[minimumData] += uc;
57 set_[significant] += lc;
58 set_[significant] += uc;
59 if (i < 6) {
60 set_[hexDigit] += lc;
61 set_[hexDigit] += uc;
62 }
63 categoryTable_.setChar(lc, nameStartCategory);
64 categoryTable_.setChar(uc, nameStartCategory);
65 subst(lc, uc);
66 }
67 static const char digits[] = "0123456789";
68 for (i = 0; i < 10; i++) {
69 Char c = sd.execToInternal(digits[i]);
70 set_[digit] += c;
71 set_[hexDigit] += c;
72 set_[minimumData] += c;
73 set_[significant] += c;
74 categoryTable_.setChar(c, digitCategory);
75 }
76 static const char special[] = "'()+,-./:=?";
77 for (i = 0; special[i] != '\0'; i++) {
78 Char c = sd.execToInternal(special[i]);
79 set_[minimumData] += c;
80 set_[significant] += c;
81 }
82
83 if (sd.www()) {
84 static const char wwwSpecial[] = { 33, 35, 36, 37, 42, 59, 64, 95, 0 };
85 for (i = 0; wwwSpecial[i] != '\0'; i++) {
86 const CharsetInfo &charset = sd.internalCharset();
87 WideChar c;
88 ISet<WideChar> set;
89 if (charset.univToDesc(wwwSpecial[i], c, set) > 0 && c <= Char(-1)) {
90 set_[minimumData] += Char(c);
91 set_[significant] += c;
92 }
93 }
94 }
95
96 for (i = 0; i < nQuantity; i++)
97 quantity_[i] = referenceQuantity_[i];
98 for (i = 0; i < 3; i++)
99 standardFunctionValid_[i] = 0;
100 }
101
addNameCharacters(const ISet<Char> & set)102 void Syntax::addNameCharacters(const ISet<Char> &set)
103 {
104 ISetIter<Char> iter(set);
105 Char min, max;
106 while (iter.next(min, max)) {
107 set_[nmchar].addRange(min, max);
108 set_[significant].addRange(min, max);
109 categoryTable_.setRange(min, max, otherNameCategory);
110 }
111 }
112
addNameStartCharacters(const ISet<Char> & set)113 void Syntax::addNameStartCharacters(const ISet<Char> &set)
114 {
115 ISetIter<Char> iter(set);
116 Char min, max;
117 while (iter.next(min, max)) {
118 set_[nameStart].addRange(min, max);
119 set_[significant].addRange(min, max);
120 categoryTable_.setRange(min, max, nameStartCategory);
121 }
122 }
123
addSubst(Char lc,Char uc)124 void Syntax::addSubst(Char lc, Char uc)
125 {
126 subst(lc, uc);
127 }
128
setStandardFunction(StandardFunction f,Char c)129 void Syntax::setStandardFunction(StandardFunction f, Char c)
130 {
131 standardFunction_[f] = c;
132 standardFunctionValid_[f] = 1;
133 set_[minimumData] += c;
134 set_[s] += c;
135 categoryTable_.setChar(c, sCategory);
136 set_[functionChar] += c;
137 set_[significant] += c;
138 switch (f) {
139 case fSPACE:
140 set_[blank] += c;
141 break;
142 case fRE:
143 case fRS:
144 break;
145 }
146 }
147
enterStandardFunctionNames()148 void Syntax::enterStandardFunctionNames()
149 {
150 static Syntax::ReservedName name[3] = {
151 rRE, rRS, rSPACE
152 };
153 for (int i = 0; i < 3; i++)
154 if (standardFunctionValid_[i])
155 functionTable_.insert(reservedName(name[i]), standardFunction_[i]);
156 }
157
setDelimGeneral(int i,const StringC & str)158 void Syntax::setDelimGeneral(int i, const StringC &str)
159 {
160 delimGeneral_[i] = str;
161 for (size_t j = 0; j < str.size(); j++)
162 set_[significant] += str[j];
163 }
164
addDelimShortref(const StringC & str,const CharsetInfo & charset)165 void Syntax::addDelimShortref(const StringC &str, const CharsetInfo &charset)
166 {
167 if (str.size() == 1 && str[0] != charset.execToDesc('B') && !isB(str[0]))
168 delimShortrefSimple_.add(str[0]);
169 else
170 delimShortrefComplex_.push_back(str);
171 for (size_t i = 0; i < str.size(); i++)
172 set_[significant] += str[i];
173 }
174
addDelimShortrefs(const ISet<Char> & shortrefChars,const CharsetInfo & charset)175 void Syntax::addDelimShortrefs(const ISet<Char> &shortrefChars,
176 const CharsetInfo &charset)
177 {
178 ISetIter<Char> blankIter(set_[blank]);
179 Char min, max;
180 StringC specialChars;
181 while (blankIter.next(min, max)) {
182 do {
183 specialChars += min;
184 } while (min++ != max);
185 }
186 specialChars += charset.execToDesc('B');
187 const ISet<Char> *simpleCharsPtr = &shortrefChars;
188 ISet<Char> simpleChars;
189 for (size_t i = 0; i < specialChars.size(); i++)
190 if (shortrefChars.contains(specialChars[i])) {
191 if (simpleCharsPtr != &simpleChars) {
192 simpleChars = shortrefChars;
193 simpleCharsPtr = &simpleChars;
194 }
195 simpleChars.remove(specialChars[i]);
196 }
197 ISetIter<Char> iter(*simpleCharsPtr);
198 while (iter.next(min, max)) {
199 delimShortrefSimple_.addRange(min, max);
200 set_[significant].addRange(min, max);
201 }
202 }
203
addFunctionChar(const StringC & str,FunctionClass fun,Char c)204 void Syntax::addFunctionChar(const StringC &str, FunctionClass fun, Char c)
205 {
206 switch (fun) {
207 case cFUNCHAR:
208 break;
209 case cSEPCHAR:
210 set_[s] += c;
211 categoryTable_.setChar(c, sCategory);
212 set_[blank] += c;
213 set_[sepchar] += c;
214 break;
215 case cMSOCHAR:
216 multicode_ = 1;
217 markupScanTable_.setChar(c, MarkupScan::out);
218 break;
219 case cMSICHAR:
220 // don't need to do anything special if we just have MSICHARs
221 markupScanTable_.setChar(c, MarkupScan::in);
222 break;
223 case cMSSCHAR:
224 multicode_ = 1;
225 markupScanTable_.setChar(c, MarkupScan::suppress);
226 break;
227 }
228 set_[functionChar] += c;
229 set_[significant] += c;
230 functionTable_.insert(str, c);
231 }
232
setName(int i,const StringC & str)233 void Syntax::setName(int i, const StringC &str)
234 {
235 names_[i] = str;
236 nameTable_.insert(str, i);
237 }
238
setNamecaseGeneral(Boolean b)239 void Syntax::setNamecaseGeneral(Boolean b)
240 {
241 namecaseGeneral_ = b;
242 generalSubst_ = b ? &upperSubst_ : &identitySubst_;
243 }
244
setNamecaseEntity(Boolean b)245 void Syntax::setNamecaseEntity(Boolean b)
246 {
247 namecaseEntity_ = b;
248 entitySubst_ = b ? &upperSubst_ : &identitySubst_;
249 }
250
subst(Char from,Char to)251 void Syntax::subst(Char from, Char to)
252 {
253 upperSubst_.addSubst(from, to);
254 }
255
addShunchar(Char c)256 void Syntax::addShunchar(Char c)
257 {
258 shunchar_.add(c);
259 }
260
lookupReservedName(const StringC & str,ReservedName * result) const261 Boolean Syntax::lookupReservedName(const StringC &str,
262 ReservedName *result) const
263 {
264 const int *tem = nameTable_.lookup(str);
265 if (tem) {
266 *result = ReservedName(*tem);
267 return 1;
268 }
269 else
270 return 0;
271 }
272
lookupFunctionChar(const StringC & name,Char * result) const273 Boolean Syntax::lookupFunctionChar(const StringC &name, Char *result) const
274 {
275 const Char *p = functionTable_.lookup(name);
276 if (p) {
277 *result = *p;
278 return 1;
279 }
280 else
281 return 0;
282 }
283
284 #ifdef __GNUG__
285 typedef HashTableIter<StringC,Char> Dummy_HashTableIter_StringC_Char;
286 #endif
287
charFunctionName(Char c,const StringC * & name) const288 Boolean Syntax::charFunctionName(Char c, const StringC *&name) const
289 {
290 HashTableIter<StringC,Char> iter(functionTable_);
291 const Char *cp;
292 while (iter.next(name, cp))
293 if (*cp == c)
294 return 1;
295 return 0;
296 }
297
isValidShortref(const StringC & str) const298 Boolean Syntax::isValidShortref(const StringC &str) const
299 {
300 if (str.size() == 1 && delimShortrefSimple_.contains(str[0]))
301 return 1;
302 for (size_t i = 0; i < delimShortrefComplex_.size(); i++)
303 if (str == delimShortrefComplex_[i])
304 return 1;
305 return 0;
306 }
307
implySgmlChar(const Sd & sd)308 void Syntax::implySgmlChar(const Sd &sd)
309
310 {
311 const CharsetInfo &internalCharset = sd.internalCharset();
312 internalCharset.getDescSet(set_[sgmlChar]);
313 ISet<WideChar> invalid;
314 checkSgmlChar(sd, 0, 0, invalid);
315 ISetIter<WideChar> iter(invalid);
316 WideChar min, max;
317 while (iter.next(min, max)) {
318 do {
319 if (min <= charMax)
320 set_[sgmlChar].remove(Char(min));
321 } while (min++ != max);
322 }
323 }
324
checkSgmlChar(const Sd & sd,const::SP_NAMESPACE_SCOPE Syntax * otherSyntax,Boolean invalidUseDocumentCharset,ISet<WideChar> & invalid) const325 void Syntax::checkSgmlChar(const Sd &sd,
326 const ::SP_NAMESPACE_SCOPE Syntax *otherSyntax,
327 Boolean invalidUseDocumentCharset,
328 ISet<WideChar> &invalid) const
329 {
330 ISetIter<Char> iter(shunchar_);
331 Char min, max;
332 while (iter.next(min, max)) {
333 if (min <= max) {
334 do {
335 Char c;
336 if (!sd.internalCharsetIsDocCharset()) {
337 UnivChar univ;
338 WideChar tem;
339 ISet<WideChar> set;
340 if (sd.docCharset().descToUniv(min, univ)
341 && sd.internalCharset().univToDesc(univ, tem, set)
342 && tem <= charMax)
343 c = Char(tem);
344 else {
345 const PublicId *base;
346 StringC lit;
347 Number n;
348 CharsetDeclRange::Type type;
349 // If it's a declared but unknown character,
350 // then it can't be significant,
351 if (invalidUseDocumentCharset
352 && sd.docCharsetDecl().getCharInfo(min,
353 base,
354 type,
355 n,
356 lit)
357 && type != CharsetDeclRange::unused)
358 invalid += min;
359 continue;
360 }
361 }
362 else
363 c = min;
364 if (!set_[significant].contains(c)
365 && (!otherSyntax || !otherSyntax->set_[significant].contains(c))
366 && set_[sgmlChar].contains(c))
367 invalid += invalidUseDocumentCharset ? min : c;
368 } while (min++ != max);
369 }
370 }
371 if (shuncharControls_) {
372 UnivChar i;
373 const CharsetInfo &charset = invalidUseDocumentCharset ? sd.docCharset() : sd.internalCharset();
374 for (i = 0; i < 32; i++)
375 checkUnivControlChar(i, charset, otherSyntax, invalid);
376 for (i = 127; i < 160; i++)
377 checkUnivControlChar(i, charset, otherSyntax, invalid);
378 }
379 }
380
checkUnivControlChar(UnivChar univChar,const CharsetInfo & internalCharset,const::SP_NAMESPACE_SCOPE Syntax * otherSyntax,ISet<WideChar> & invalid) const381 void Syntax::checkUnivControlChar(UnivChar univChar,
382 const CharsetInfo &internalCharset,
383 const ::SP_NAMESPACE_SCOPE Syntax *otherSyntax,
384 ISet<WideChar> &invalid) const
385 {
386 WideChar c;
387 ISet<WideChar> set;
388 switch (internalCharset.univToDesc(univChar, c, set)) {
389 case 0:
390 break;
391 case 1:
392 set += c;
393 // fall through
394 default:
395 {
396 ISetIter<WideChar> iter(set);
397 WideChar min, max;
398 while (iter.next(min, max)) {
399 do {
400 if (min > charMax)
401 break;
402 Char ch = Char(min);
403 if (!set_[significant].contains(ch)
404 && (!otherSyntax
405 || !otherSyntax->set_[significant].contains(ch))
406 && set_[sgmlChar].contains(ch))
407 invalid += ch;
408 } while (min++ != max);
409 }
410 }
411 }
412 }
413
rniReservedName(ReservedName i) const414 StringC Syntax::rniReservedName(ReservedName i) const
415 {
416 StringC result = delimGeneral(dRNI);
417 result += reservedName(i);
418 return result;
419 }
420
upperSubstTable() const421 const SubstTable<Char> &Syntax::upperSubstTable() const
422 {
423 return upperSubst_;
424 }
425
peroDelim() const426 const StringC &Syntax::peroDelim() const
427 {
428 return delimGeneral(dPERO);
429 }
430
isHexDigit(Xchar c) const431 Boolean Syntax::isHexDigit(Xchar c) const
432 {
433 switch (categoryTable_[c]) {
434 case digitCategory:
435 return 1;
436 case nameStartCategory:
437 break;
438 default:
439 return 0;
440 }
441 return set_[hexDigit].contains(Char(c));
442 }
443
addEntity(const StringC & name,Char c)444 void Syntax::addEntity(const StringC &name, Char c)
445 {
446 entityNames_.push_back(name);
447 entityChars_ += c;
448 }
449
450 #ifdef SP_NAMESPACE
451 }
452 #endif
453