1 // Copyright (c) 1994, 1997 James Clark
2 // See the file COPYING for copying permission.
3 #pragma ident "%Z%%M% %I% %E% SMI"
4
5 #ifdef __GNUG__
6 #pragma implementation
7 #endif
8 #include "splib.h"
9
10 #ifdef SP_MULTI_BYTE
11
12 #include "XMLCodingSystem.h"
13 #include "UTF8CodingSystem.h"
14 #include "CodingSystemKit.h"
15 #include "Boolean.h"
16 #include "Owner.h"
17 #include "macros.h"
18 #include <stddef.h>
19 #include <string.h>
20
21 #ifdef SP_DECLARE_MEMMOVE
22 extern "C" {
23 void *memmove(void *, const void *, size_t);
24 }
25 #endif
26
27 #ifdef SP_NAMESPACE
28 namespace SP_NAMESPACE {
29 #endif
30
31 const Char ISO646_TAB = 0x9;
32 const Char ISO646_LF = 0xA;
33 const Char ISO646_CR = 0xD;
34 const Char ISO646_SPACE = 0x20;
35 const Char ISO646_QUOT = 0x22;
36 const Char ISO646_APOS = 0x27;
37 const Char ISO646_LT = 0x3C;
38 const Char ISO646_EQUAL = 0x3D;
39 const Char ISO646_GT = 0x3E;
40 const Char ISO646_QUEST = 0x3F;
41 const Char ISO646_LETTER_a = 0x61;
42 const Char ISO646_LETTER_c = 0x63;
43 const Char ISO646_LETTER_d = 0x64;
44 const Char ISO646_LETTER_e = 0x65;
45 const Char ISO646_LETTER_g = 0x67;
46 const Char ISO646_LETTER_i = 0x69;
47 const Char ISO646_LETTER_l = 0x6C;
48 const Char ISO646_LETTER_m = 0x6D;
49 const Char ISO646_LETTER_n = 0x6E;
50 const Char ISO646_LETTER_o = 0x6F;
51 const Char ISO646_LETTER_x = 0x78;
52
53 class XMLDecoder : public Decoder {
54 public:
55 XMLDecoder(const InputCodingSystemKit *);
56 size_t decode(Char *to, const char *from, size_t fromLen,
57 const char **rest);
58 Boolean convertOffset(unsigned long &offset) const;
59 private:
60
61 class UCS2 : public Decoder {
62 public:
63 UCS2(Boolean swapBytes);
64 size_t decode(Char *to, const char *from, size_t fromLen,
65 const char **rest);
66 Boolean convertOffset(unsigned long &offset) const;
67 private:
68 Boolean swapBytes_;
69 };
70 // Don't keep parsing a PI longer than this.
71 // We want to avoid reading some enormous file into memory just because
72 // some quote was left off.
73 enum { piMaxSize = 1024*32 };
74
75 void initDecoderDefault();
76 void initDecoderPI();
77 Boolean extractEncoding(StringC &name);
78 static Boolean isWS(Char);
79
80 enum DetectPhase {
81 phaseInit,
82 phasePI,
83 phaseFinish
84 };
85 DetectPhase phase_;
86 Boolean byteOrderMark_;
87 Boolean lsbFirst_;
88 int guessBytesPerChar_;
89 Owner<Decoder> subDecoder_;
90 // Contains all the characters passed to caller that were
91 // not produced by subDecoder_.
92 StringC pi_;
93 Char piLiteral_;
94 const InputCodingSystemKit *kit_;
95 };
96
XMLCodingSystem(const InputCodingSystemKit * kit)97 XMLCodingSystem::XMLCodingSystem(const InputCodingSystemKit *kit)
98 : kit_(kit)
99 {
100 }
101
makeDecoder() const102 Decoder *XMLCodingSystem::makeDecoder() const
103 {
104 return new XMLDecoder(kit_);
105 }
106
makeEncoder() const107 Encoder *XMLCodingSystem::makeEncoder() const
108 {
109 UTF8CodingSystem utf8;
110 return utf8.makeEncoder();
111 }
112
XMLDecoder(const InputCodingSystemKit * kit)113 XMLDecoder::XMLDecoder(const InputCodingSystemKit *kit)
114 : Decoder(1),
115 kit_(kit),
116 phase_(phaseInit),
117 byteOrderMark_(0),
118 lsbFirst_(0),
119 guessBytesPerChar_(1),
120 piLiteral_(0)
121 {
122 }
123
decode(Char * to,const char * from,size_t fromLen,const char ** rest)124 size_t XMLDecoder::decode(Char *to, const char *from, size_t fromLen,
125 const char **rest)
126 {
127 if (phase_ == phaseFinish)
128 return subDecoder_->decode(to, from, fromLen, rest);
129 if (phase_ == phaseInit) {
130 if (fromLen == 0) {
131 *rest = from;
132 return 0;
133 }
134 switch ((unsigned char)*from) {
135 case 0x00:
136 case 0x3C:
137 case 0xFF:
138 case 0xFE:
139 if (fromLen < 2) {
140 *rest = from;
141 return 0;
142 }
143 switch (((unsigned char)from[0] << 8) | (unsigned char)from[1]) {
144 case 0xFEFF:
145 phase_ = phasePI;
146 byteOrderMark_ = 1;
147 guessBytesPerChar_ = 2;
148 from += 2;
149 fromLen -= 2;
150 break;
151 case 0xFFFE:
152 lsbFirst_ = 1;
153 phase_ = phasePI;
154 byteOrderMark_ = 1;
155 guessBytesPerChar_ = 2;
156 from += 2;
157 fromLen -= 2;
158 break;
159 case 0x3C3F:
160 phase_ = phasePI;
161 break;
162 case 0x3C00:
163 lsbFirst_ = 1;
164 phase_ = phasePI;
165 guessBytesPerChar_ = 2;
166 break;
167 case 0x003C:
168 phase_ = phasePI;
169 guessBytesPerChar_ = 2;
170 break;
171 default:
172 break;
173 }
174 if (phase_ == phasePI)
175 break;
176 // fall through
177 default:
178 phase_ = phaseFinish;
179 guessBytesPerChar_ = 1;
180 initDecoderDefault();
181 return subDecoder_->decode(to, from, fromLen, rest);
182 }
183 }
184 ASSERT(phase_ == phasePI);
185 Char *p = to;
186 for (; fromLen > guessBytesPerChar_;
187 fromLen -= guessBytesPerChar_, from += guessBytesPerChar_) {
188 if (!piLiteral_ && pi_.size() > 0 && pi_[pi_.size() - 1] == ISO646_GT) {
189 initDecoderPI();
190 phase_ = phaseFinish;
191 return (p - to) + subDecoder_->decode(p, from, fromLen, rest);
192 }
193 Char c = (unsigned char)from[0];
194 if (guessBytesPerChar_ > 1) {
195 if (lsbFirst_)
196 c |= (unsigned char)from[1] << 8;
197 else {
198 c <<= 8;
199 c |= (unsigned char)from[1];
200 }
201 }
202 static const Char startBytes[] = {
203 ISO646_LT, ISO646_QUEST, ISO646_LETTER_x, ISO646_LETTER_m, ISO646_LETTER_l
204 };
205 // Stop accumulating the PI if we get characters that are illegal in the PI.
206 if (c == 0
207 || c >= 0x7F
208 || (pi_.size() > 0 && c == ISO646_LT)
209 || pi_.size() > piMaxSize
210 || (pi_.size() < 5 && c != startBytes[pi_.size()])
211 || (pi_.size() == 5 && !isWS(c))) {
212 initDecoderDefault();
213 phase_ = phaseFinish;
214 break;
215 }
216 *p++ = c;
217 pi_ += c;
218 if (piLiteral_) {
219 if (c == piLiteral_)
220 piLiteral_ = 0;
221 }
222 else if (c == ISO646_QUOT || c == ISO646_APOS)
223 piLiteral_ = c;
224 }
225 size_t n = p - to;
226 if (phase_ == phaseFinish && fromLen > 0)
227 n += subDecoder_->decode(p, from, fromLen, rest);
228 return n;
229 }
230
convertOffset(unsigned long & n) const231 Boolean XMLDecoder::convertOffset(unsigned long &n) const
232 {
233 if (n <= pi_.size())
234 n *= guessBytesPerChar_;
235 else {
236 if (!subDecoder_)
237 return 0;
238 unsigned long tem = n - pi_.size();
239 if (!subDecoder_->convertOffset(tem))
240 return 0;
241 n = tem + pi_.size() * guessBytesPerChar_;
242 }
243 if (byteOrderMark_)
244 n += 2;
245 return 1;
246 }
247
initDecoderDefault()248 void XMLDecoder::initDecoderDefault()
249 {
250 if (guessBytesPerChar_ == 1) {
251 UTF8CodingSystem utf8;
252 subDecoder_ = utf8.makeDecoder();
253 }
254 else {
255 unsigned short n = 0x1;
256 minBytesPerChar_ = 2;
257 subDecoder_ = new UCS2((*(char *)&n == 0x1) != lsbFirst_);
258 }
259 }
260
initDecoderPI()261 void XMLDecoder::initDecoderPI()
262 {
263 StringC name;
264 if (!extractEncoding(name))
265 initDecoderDefault();
266 const char *dummy;
267 static const UnivCharsetDesc::Range range = { 0, 128, 0 };
268 CharsetInfo piCharset(UnivCharsetDesc(&range, 1));
269 const InputCodingSystem *ics
270 = kit_->makeInputCodingSystem(name,
271 piCharset,
272 0,
273 dummy);
274 if (ics) {
275 subDecoder_ = ics->makeDecoder();
276 minBytesPerChar_ = subDecoder_->minBytesPerChar();
277 }
278 if (!subDecoder_)
279 initDecoderDefault();
280 }
281
isWS(Char c)282 Boolean XMLDecoder::isWS(Char c)
283 {
284 switch (c) {
285 case ISO646_CR:
286 case ISO646_LF:
287 case ISO646_SPACE:
288 case ISO646_TAB:
289 return 1;
290 }
291 return 0;
292 }
293
extractEncoding(StringC & name)294 Boolean XMLDecoder::extractEncoding(StringC &name)
295 {
296 Char lit = 0;
297 for (size_t i = 5; i < pi_.size(); i++) {
298 if (!lit) {
299 if (pi_[i] == ISO646_APOS || pi_[i] == ISO646_QUOT)
300 lit = pi_[i];
301 else if (pi_[i] == ISO646_EQUAL) {
302 size_t j = i;
303 for (; j > 0; j--) {
304 if (!isWS(pi_[j - 1]))
305 break;
306 }
307 size_t nameEnd = j;
308 for (; j > 0; j--) {
309 if (isWS(pi_[j - 1]) || pi_[j - 1] == ISO646_QUOT || pi_[j - 1] == ISO646_APOS)
310 break;
311 }
312 static const Char encodingName[] = {
313 ISO646_LETTER_e, ISO646_LETTER_n, ISO646_LETTER_c, ISO646_LETTER_o,
314 ISO646_LETTER_d, ISO646_LETTER_i, ISO646_LETTER_n, ISO646_LETTER_g,
315 0
316 };
317 const Char *s = encodingName;
318 for (; *s && j < nameEnd; j++, s++)
319 if (pi_[j] != *s)
320 break;
321 if (j == nameEnd && *s == 0) {
322 size_t j = i + 1;
323 for (; j < pi_.size(); j++) {
324 if (!isWS(pi_[j]))
325 break;
326 }
327 if (pi_[j] == ISO646_QUOT || pi_[j] == ISO646_APOS) {
328 Char lit = pi_[j];
329 size_t nameStart = j + 1;
330 for (++j; j < pi_.size(); j++) {
331 if (pi_[j] == lit) {
332 if (j > nameStart) {
333 name.assign(&pi_[nameStart], j - nameStart);
334 return 1;
335 }
336 break;
337 }
338 }
339 }
340 return 0;
341 }
342 }
343 }
344 else if (pi_[i] == lit)
345 lit = 0;
346 }
347 return 0;
348 }
349
UCS2(Boolean swapBytes)350 XMLDecoder::UCS2::UCS2(Boolean swapBytes)
351 : swapBytes_(swapBytes)
352 {
353 }
354
decode(Char * to,const char * from,size_t fromLen,const char ** rest)355 size_t XMLDecoder::UCS2::decode(Char *to, const char *from, size_t fromLen,
356 const char **rest)
357 {
358 union U {
359 unsigned short word;
360 char bytes[2];
361 };
362 fromLen &= ~1;
363 *rest = from + fromLen;
364 if (sizeof(Char) == 2) {
365 if (!swapBytes_) {
366 if (from != (char *)to)
367 memmove(to, from, fromLen);
368 return fromLen/2;
369 }
370 }
371 if (swapBytes_) {
372 for (size_t n = fromLen; n > 0; n -= 2) {
373 U u;
374 u.bytes[1] = *from++;
375 u.bytes[0] = *from++;
376 *to++ = u.word;
377 }
378 }
379 else {
380 for (size_t n = fromLen; n > 0; n -= 2) {
381 U u;
382 u.bytes[0] = *from++;
383 u.bytes[1] = *from++;
384 *to++ = u.word;
385 }
386 }
387 return fromLen/2;
388 }
389
convertOffset(unsigned long & n) const390 Boolean XMLDecoder::UCS2::convertOffset(unsigned long &n) const
391 {
392 n *= 2;
393 return 1;
394 }
395
396 #ifdef SP_NAMESPACE
397 }
398 #endif
399
400 #else /* not SP_MULTI_BYTE */
401
402 #ifndef __GNUG__
403 static char non_empty_translation_unit; // sigh
404 #endif
405
406 #endif /* not SP_MULTI_BYTE */
407