1 // Copyright (c) 1994, 1997 James Clark
2 // See the file COPYING for copying permission.
3 #pragma ident	"%Z%%M%	%I%	%E% SMI"
4 
5 #ifdef __GNUG__
6 #pragma implementation
7 #endif
8 #include "splib.h"
9 
10 #ifdef SP_MULTI_BYTE
11 
12 #include "XMLCodingSystem.h"
13 #include "UTF8CodingSystem.h"
14 #include "CodingSystemKit.h"
15 #include "Boolean.h"
16 #include "Owner.h"
17 #include "macros.h"
18 #include <stddef.h>
19 #include <string.h>
20 
21 #ifdef SP_DECLARE_MEMMOVE
22 extern "C" {
23   void *memmove(void *, const void *, size_t);
24 }
25 #endif
26 
27 #ifdef SP_NAMESPACE
28 namespace SP_NAMESPACE {
29 #endif
30 
31 const Char ISO646_TAB = 0x9;
32 const Char ISO646_LF = 0xA;
33 const Char ISO646_CR = 0xD;
34 const Char ISO646_SPACE = 0x20;
35 const Char ISO646_QUOT = 0x22;
36 const Char ISO646_APOS = 0x27;
37 const Char ISO646_LT = 0x3C;
38 const Char ISO646_EQUAL = 0x3D;
39 const Char ISO646_GT = 0x3E;
40 const Char ISO646_QUEST = 0x3F;
41 const Char ISO646_LETTER_a = 0x61;
42 const Char ISO646_LETTER_c = 0x63;
43 const Char ISO646_LETTER_d = 0x64;
44 const Char ISO646_LETTER_e = 0x65;
45 const Char ISO646_LETTER_g = 0x67;
46 const Char ISO646_LETTER_i = 0x69;
47 const Char ISO646_LETTER_l = 0x6C;
48 const Char ISO646_LETTER_m = 0x6D;
49 const Char ISO646_LETTER_n = 0x6E;
50 const Char ISO646_LETTER_o = 0x6F;
51 const Char ISO646_LETTER_x = 0x78;
52 
53 class XMLDecoder : public Decoder {
54 public:
55   XMLDecoder(const InputCodingSystemKit *);
56   size_t decode(Char *to, const char *from, size_t fromLen,
57 		const char **rest);
58   Boolean convertOffset(unsigned long &offset) const;
59 private:
60 
61   class UCS2 : public Decoder {
62   public:
63     UCS2(Boolean swapBytes);
64     size_t decode(Char *to, const char *from, size_t fromLen,
65 		  const char **rest);
66     Boolean convertOffset(unsigned long &offset) const;
67   private:
68     Boolean swapBytes_;
69   };
70   // Don't keep parsing a PI longer than this.
71   // We want to avoid reading some enormous file into memory just because
72   // some quote was left off.
73   enum { piMaxSize = 1024*32 };
74 
75   void initDecoderDefault();
76   void initDecoderPI();
77   Boolean extractEncoding(StringC &name);
78   static Boolean isWS(Char);
79 
80   enum DetectPhase {
81     phaseInit,
82     phasePI,
83     phaseFinish
84   };
85   DetectPhase phase_;
86   Boolean byteOrderMark_;
87   Boolean lsbFirst_;
88   int guessBytesPerChar_;
89   Owner<Decoder> subDecoder_;
90   // Contains all the characters passed to caller that were
91   // not produced by subDecoder_.
92   StringC pi_;
93   Char piLiteral_;
94   const InputCodingSystemKit *kit_;
95 };
96 
XMLCodingSystem(const InputCodingSystemKit * kit)97 XMLCodingSystem::XMLCodingSystem(const InputCodingSystemKit *kit)
98 : kit_(kit)
99 {
100 }
101 
makeDecoder() const102 Decoder *XMLCodingSystem::makeDecoder() const
103 {
104   return new XMLDecoder(kit_);
105 }
106 
makeEncoder() const107 Encoder *XMLCodingSystem::makeEncoder() const
108 {
109   UTF8CodingSystem utf8;
110   return utf8.makeEncoder();
111 }
112 
XMLDecoder(const InputCodingSystemKit * kit)113 XMLDecoder::XMLDecoder(const InputCodingSystemKit *kit)
114 : Decoder(1),
115   kit_(kit),
116   phase_(phaseInit),
117   byteOrderMark_(0),
118   lsbFirst_(0),
119   guessBytesPerChar_(1),
120   piLiteral_(0)
121 {
122 }
123 
decode(Char * to,const char * from,size_t fromLen,const char ** rest)124 size_t XMLDecoder::decode(Char *to, const char *from, size_t fromLen,
125 			  const char **rest)
126 {
127   if (phase_ == phaseFinish)
128     return subDecoder_->decode(to, from, fromLen, rest);
129   if (phase_ == phaseInit) {
130     if (fromLen == 0) {
131       *rest = from;
132       return 0;
133     }
134     switch ((unsigned char)*from) {
135     case 0x00:
136     case 0x3C:
137     case 0xFF:
138     case 0xFE:
139       if (fromLen < 2) {
140 	*rest = from;
141 	return 0;
142       }
143       switch (((unsigned char)from[0] << 8) | (unsigned char)from[1]) {
144       case 0xFEFF:
145 	phase_ = phasePI;
146 	byteOrderMark_ = 1;
147 	guessBytesPerChar_ = 2;
148 	from += 2;
149 	fromLen -= 2;
150 	break;
151       case 0xFFFE:
152 	lsbFirst_ = 1;
153 	phase_ = phasePI;
154 	byteOrderMark_ = 1;
155 	guessBytesPerChar_ = 2;
156 	from += 2;
157 	fromLen -= 2;
158 	break;
159       case 0x3C3F:
160 	phase_ = phasePI;
161 	break;
162       case 0x3C00:
163 	lsbFirst_ = 1;
164 	phase_ = phasePI;
165 	guessBytesPerChar_ = 2;
166 	break;
167       case 0x003C:
168 	phase_ = phasePI;
169 	guessBytesPerChar_ = 2;
170 	break;
171       default:
172 	break;
173       }
174       if (phase_ == phasePI)
175 	break;
176       // fall through
177     default:
178       phase_ = phaseFinish;
179       guessBytesPerChar_ = 1;
180       initDecoderDefault();
181       return subDecoder_->decode(to, from, fromLen, rest);
182     }
183   }
184   ASSERT(phase_ == phasePI);
185   Char *p = to;
186   for (; fromLen > guessBytesPerChar_;
187        fromLen -= guessBytesPerChar_, from += guessBytesPerChar_) {
188     if (!piLiteral_ && pi_.size() > 0 && pi_[pi_.size() - 1] == ISO646_GT) {
189       initDecoderPI();
190       phase_ = phaseFinish;
191       return (p - to) + subDecoder_->decode(p, from, fromLen, rest);
192     }
193     Char c = (unsigned char)from[0];
194     if (guessBytesPerChar_ > 1) {
195       if (lsbFirst_)
196 	c |= (unsigned char)from[1] << 8;
197       else {
198 	c <<= 8;
199 	c |= (unsigned char)from[1];
200       }
201     }
202     static const Char startBytes[] = {
203       ISO646_LT, ISO646_QUEST, ISO646_LETTER_x, ISO646_LETTER_m, ISO646_LETTER_l
204     };
205     // Stop accumulating the PI if we get characters that are illegal in the PI.
206     if (c == 0
207         || c >= 0x7F
208 	|| (pi_.size() > 0 && c == ISO646_LT)
209 	|| pi_.size() > piMaxSize
210 	|| (pi_.size() < 5 && c != startBytes[pi_.size()])
211 	|| (pi_.size() == 5 && !isWS(c))) {
212       initDecoderDefault();
213       phase_ = phaseFinish;
214       break;
215     }
216     *p++ = c;
217     pi_ += c;
218     if (piLiteral_) {
219       if (c == piLiteral_)
220 	piLiteral_ = 0;
221     }
222     else if (c == ISO646_QUOT || c == ISO646_APOS)
223       piLiteral_ = c;
224   }
225   size_t n = p - to;
226   if (phase_ == phaseFinish && fromLen > 0)
227     n += subDecoder_->decode(p, from, fromLen, rest);
228   return n;
229 }
230 
convertOffset(unsigned long & n) const231 Boolean XMLDecoder::convertOffset(unsigned long &n) const
232 {
233   if (n <= pi_.size())
234     n *= guessBytesPerChar_;
235   else {
236     if (!subDecoder_)
237       return 0;
238     unsigned long tem = n - pi_.size();
239     if (!subDecoder_->convertOffset(tem))
240       return 0;
241     n = tem + pi_.size() * guessBytesPerChar_;
242   }
243   if (byteOrderMark_)
244     n += 2;
245   return 1;
246 }
247 
initDecoderDefault()248 void XMLDecoder::initDecoderDefault()
249 {
250   if (guessBytesPerChar_ == 1) {
251     UTF8CodingSystem utf8;
252     subDecoder_ = utf8.makeDecoder();
253   }
254   else {
255     unsigned short n = 0x1;
256     minBytesPerChar_ = 2;
257     subDecoder_ = new UCS2((*(char *)&n == 0x1) != lsbFirst_);
258   }
259 }
260 
initDecoderPI()261 void XMLDecoder::initDecoderPI()
262 {
263   StringC name;
264   if (!extractEncoding(name))
265     initDecoderDefault();
266   const char *dummy;
267   static const UnivCharsetDesc::Range range = { 0, 128, 0 };
268   CharsetInfo piCharset(UnivCharsetDesc(&range, 1));
269   const InputCodingSystem *ics
270     = kit_->makeInputCodingSystem(name,
271 				  piCharset,
272 				  0,
273 				  dummy);
274   if (ics) {
275     subDecoder_ = ics->makeDecoder();
276     minBytesPerChar_ = subDecoder_->minBytesPerChar();
277   }
278   if (!subDecoder_)
279     initDecoderDefault();
280 }
281 
isWS(Char c)282 Boolean XMLDecoder::isWS(Char c)
283 {
284   switch (c) {
285   case ISO646_CR:
286   case ISO646_LF:
287   case ISO646_SPACE:
288   case ISO646_TAB:
289     return 1;
290   }
291   return 0;
292 }
293 
extractEncoding(StringC & name)294 Boolean XMLDecoder::extractEncoding(StringC &name)
295 {
296   Char lit = 0;
297   for (size_t i = 5; i < pi_.size(); i++) {
298     if (!lit) {
299       if (pi_[i] == ISO646_APOS || pi_[i] == ISO646_QUOT)
300 	lit = pi_[i];
301       else if (pi_[i] == ISO646_EQUAL) {
302 	size_t j = i;
303 	for (; j > 0; j--) {
304 	  if (!isWS(pi_[j - 1]))
305 	    break;
306 	}
307 	size_t nameEnd = j;
308 	for (; j > 0; j--) {
309 	  if (isWS(pi_[j - 1]) || pi_[j - 1] == ISO646_QUOT || pi_[j - 1] == ISO646_APOS)
310 	    break;
311 	}
312 	static const Char encodingName[] = {
313 	  ISO646_LETTER_e, ISO646_LETTER_n, ISO646_LETTER_c, ISO646_LETTER_o,
314 	  ISO646_LETTER_d, ISO646_LETTER_i, ISO646_LETTER_n, ISO646_LETTER_g,
315 	  0
316 	};
317 	const Char *s = encodingName;
318 	for (; *s && j < nameEnd; j++, s++)
319 	  if (pi_[j] != *s)
320 	    break;
321 	if (j == nameEnd && *s == 0) {
322 	  size_t j = i + 1;
323 	  for (; j < pi_.size(); j++) {
324 	    if (!isWS(pi_[j]))
325 	      break;
326 	  }
327 	  if (pi_[j] == ISO646_QUOT || pi_[j] == ISO646_APOS) {
328 	    Char lit = pi_[j];
329 	    size_t nameStart = j + 1;
330 	    for (++j; j < pi_.size(); j++) {
331 	      if (pi_[j] == lit) {
332 		if (j > nameStart) {
333 		  name.assign(&pi_[nameStart], j - nameStart);
334 		  return 1;
335 		}
336 		break;
337 	      }
338 	    }
339 	  }
340 	  return 0;
341 	}
342       }
343     }
344     else if (pi_[i] == lit)
345       lit = 0;
346   }
347   return 0;
348 }
349 
UCS2(Boolean swapBytes)350 XMLDecoder::UCS2::UCS2(Boolean swapBytes)
351 : swapBytes_(swapBytes)
352 {
353 }
354 
decode(Char * to,const char * from,size_t fromLen,const char ** rest)355 size_t XMLDecoder::UCS2::decode(Char *to, const char *from, size_t fromLen,
356 				const char **rest)
357 {
358   union U {
359     unsigned short word;
360     char bytes[2];
361   };
362   fromLen &= ~1;
363   *rest = from + fromLen;
364   if (sizeof(Char) == 2) {
365     if (!swapBytes_) {
366       if (from != (char *)to)
367 	memmove(to, from, fromLen);
368       return fromLen/2;
369     }
370   }
371   if (swapBytes_) {
372     for (size_t n = fromLen; n > 0; n -= 2) {
373       U u;
374       u.bytes[1] = *from++;
375       u.bytes[0] = *from++;
376       *to++ = u.word;
377     }
378   }
379   else  {
380     for (size_t n = fromLen; n > 0; n -= 2) {
381       U u;
382       u.bytes[0] = *from++;
383       u.bytes[1] = *from++;
384       *to++ = u.word;
385     }
386   }
387   return fromLen/2;
388 }
389 
convertOffset(unsigned long & n) const390 Boolean XMLDecoder::UCS2::convertOffset(unsigned long &n) const
391 {
392   n *= 2;
393   return 1;
394 }
395 
396 #ifdef SP_NAMESPACE
397 }
398 #endif
399 
400 #else /* not SP_MULTI_BYTE */
401 
402 #ifndef __GNUG__
403 static char non_empty_translation_unit;	// sigh
404 #endif
405 
406 #endif /* not SP_MULTI_BYTE */
407