1 // Copyright (c) 1997 James Clark
2 // See the file COPYING for copying permission.
3 #pragma ident "%Z%%M% %I% %E% SMI"
4
5 #ifdef __GNUG__
6 #pragma implementation
7 #endif
8
9 #include "splib.h"
10 #include "CodingSystemKit.h"
11 #include "TranslateCodingSystem.h"
12 #ifdef SP_MULTI_BYTE
13 #include "UTF8CodingSystem.h"
14 #include "Fixed2CodingSystem.h"
15 #include "UnicodeCodingSystem.h"
16 #include "XMLCodingSystem.h"
17 #include "EUCJPCodingSystem.h"
18 #include "SJISCodingSystem.h"
19 #include "Big5CodingSystem.h"
20 #ifdef WIN32
21 #include "Win32CodingSystem.h"
22 #endif
23 #endif /* SP_MULTI_BYTE */
24 #include "IdentityCodingSystem.h"
25 #include "Owner.h"
26
27 #include <ctype.h>
28
29 #ifdef SP_NAMESPACE
30 namespace SP_NAMESPACE {
31 #endif
32
#ifdef SP_MULTI_BYTE
// 0xfffd is the Unicode REPLACEMENT CHARACTER code point.  It is handed to
// every translating coding system built in the constructor below and is
// what replacementChar() reports in multi-byte builds.
const Char unicodeReplaceChar = 0xfffd;
#endif
36
37 class CodingSystemKitImpl : public CodingSystemKit {
38 public:
39 CodingSystemKitImpl(const TranslateCodingSystem::Desc *);
40 CodingSystemKit *copy() const;
41 Char replacementChar() const;
42 const CodingSystem *
43 identityCodingSystem() const;
44 const InputCodingSystem *
45 identityInputCodingSystem() const;
46 const InputCodingSystem *
47 makeInputCodingSystem(const StringC &,
48 const CharsetInfo &,
49 Boolean isBctf,
50 const char *&) const;
51 const CodingSystem *
52 makeCodingSystem(const char *, Boolean isBctf) const;
53 enum CodingSystemId {
54 identity,
55 fixed2,
56 utf8,
57 unicode,
58 eucjp,
59 euccn,
60 euckr,
61 sjisBctf,
62 eucBctf,
63 sjis,
64 big5,
65 big5Bctf,
66 ansi,
67 oem,
68 maybeUnicode,
69 xml,
70 iso8859_1,
71 iso8859_2,
72 iso8859_3,
73 iso8859_4,
74 iso8859_5,
75 iso8859_6,
76 iso8859_7,
77 iso8859_8,
78 iso8859_9
79 };
80 struct Entry {
81 const char *name;
82 CodingSystemId id;
83 };
84 static Boolean match(const StringC &s,
85 const CharsetInfo &charset,
86 const char *key);
87 static Boolean match(const char *s,
88 const char *key);
89 private:
90 const CodingSystem *
91 makeCodingSystem(CodingSystemId) const;
92 const Entry *firstEntry(Boolean isBctf) const;
93 #ifdef SP_MULTI_BYTE
94 UTF8CodingSystem utf8CodingSystem_;
95 Fixed2CodingSystem fixed2CodingSystem_;
96 UnicodeCodingSystem unicodeCodingSystem_;
97 XMLCodingSystem xmlCodingSystem_;
98 EUCJPCodingSystem eucBctf_;
99 SJISCodingSystem sjisBctf_;
100 Big5CodingSystem big5Bctf_;
101 TranslateCodingSystem eucjpCodingSystem_;
102 TranslateCodingSystem euccnCodingSystem_;
103 TranslateCodingSystem euckrCodingSystem_;
104 TranslateCodingSystem sjisCodingSystem_;
105 TranslateCodingSystem big5CodingSystem_;
106 TranslateCodingSystem iso8859_1CodingSystem_;
107 TranslateCodingSystem iso8859_2CodingSystem_;
108 TranslateCodingSystem iso8859_3CodingSystem_;
109 TranslateCodingSystem iso8859_4CodingSystem_;
110 TranslateCodingSystem iso8859_5CodingSystem_;
111 TranslateCodingSystem iso8859_6CodingSystem_;
112 TranslateCodingSystem iso8859_7CodingSystem_;
113 TranslateCodingSystem iso8859_8CodingSystem_;
114 TranslateCodingSystem iso8859_9CodingSystem_;
115 #ifdef WIN32
116 Win32CodingSystem ansiCodingSystem_;
117 Win32CodingSystem oemCodingSystem_;
118 UnicodeCodingSystem maybeUnicodeCodingSystem_;
119 #endif
120 #endif /* SP_MULTI_BYTE */
121 IdentityCodingSystem identityCodingSystem_;
122 const TranslateCodingSystem::Desc *systemCharsetDesc_;
123 static const Entry bctfTable_[];
124 enum { nEncodingsRequireUnicode = 8 };
125 static const Entry encodingTable_[];
126 };
127
128
129 static const TranslateCodingSystem::Desc iso10646Desc[] = {
130 { CharsetRegistry::ISO10646_UCS2, 0x0 },
131 { CharsetRegistry::UNREGISTERED, 0x0 },
132 };
133
#ifdef SP_MULTI_BYTE

// Character set descriptions used by the translating coding systems.
// Every row names a CharsetRegistry set together with an `add` value, and
// every table ends with a CharsetRegistry::UNREGISTERED row.  When a table
// is used as the system charset, the constructor adds `add` to the code
// values of the named set.

// Used by the SJIS coding system.
static const TranslateCodingSystem::Desc jisDesc[] = {
  { CharsetRegistry::ISO646_C0,     0x0 },
  { CharsetRegistry::ISO646_JIS_G0, 0x0 },
  { CharsetRegistry::ISO6429,       0x80 },
  { CharsetRegistry::JIS0201,       0x80 },
  { CharsetRegistry::JIS0208,       0x8080 },
  { CharsetRegistry::UNREGISTERED,  0x0 }
};

// Same as jisDesc plus JIS0212.  Used by the EUC-JP coding system and as
// the system charset description when "JIS" is requested.
static const TranslateCodingSystem::Desc jis2Desc[] = {
  { CharsetRegistry::ISO646_C0,     0x0 },
  { CharsetRegistry::ISO646_JIS_G0, 0x0 },
  { CharsetRegistry::ISO6429,       0x80 },
  { CharsetRegistry::JIS0201,       0x80 },
  { CharsetRegistry::JIS0208,       0x8080 },
  { CharsetRegistry::JIS0212,       0x8000 },
  { CharsetRegistry::UNREGISTERED,  0x0 }
};

// Used by the EUC-CN coding system.
static const TranslateCodingSystem::Desc gbDesc[] = {
  { CharsetRegistry::ISO646_C0,       0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::ISO6429,         0x80 },
  { CharsetRegistry::GB2312,          0x8080 },
  { CharsetRegistry::UNREGISTERED,    0x0 }
};

// Used by the Big5 coding system.
static const TranslateCodingSystem::Desc big5Desc[] = {
  { CharsetRegistry::ISO646_C0,       0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::BIG5,            0x0 },
  { CharsetRegistry::UNREGISTERED,    0x0 }
};

// Used by the EUC-KR coding system.
static const TranslateCodingSystem::Desc kscDesc[] = {
  { CharsetRegistry::ISO646_C0,       0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::ISO6429,         0x80 },
  { CharsetRegistry::KSC5601,         0x8080 },
  { CharsetRegistry::UNREGISTERED,    0x0 }
};

// ISO 8859 parts 1 through 9.  The tables differ only in the fourth row,
// which names the part-specific set.

static const TranslateCodingSystem::Desc iso8859_1Desc[] = {
  { CharsetRegistry::ISO646_C0,       0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::ISO6429,         0x80 },
  { CharsetRegistry::ISO8859_1,       0x80 },
  { CharsetRegistry::UNREGISTERED,    0x0 }
};

static const TranslateCodingSystem::Desc iso8859_2Desc[] = {
  { CharsetRegistry::ISO646_C0,       0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::ISO6429,         0x80 },
  { CharsetRegistry::ISO8859_2,       0x80 },
  { CharsetRegistry::UNREGISTERED,    0x0 }
};

static const TranslateCodingSystem::Desc iso8859_3Desc[] = {
  { CharsetRegistry::ISO646_C0,       0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::ISO6429,         0x80 },
  { CharsetRegistry::ISO8859_3,       0x80 },
  { CharsetRegistry::UNREGISTERED,    0x0 }
};

static const TranslateCodingSystem::Desc iso8859_4Desc[] = {
  { CharsetRegistry::ISO646_C0,       0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::ISO6429,         0x80 },
  { CharsetRegistry::ISO8859_4,       0x80 },
  { CharsetRegistry::UNREGISTERED,    0x0 }
};

static const TranslateCodingSystem::Desc iso8859_5Desc[] = {
  { CharsetRegistry::ISO646_C0,       0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::ISO6429,         0x80 },
  { CharsetRegistry::ISO8859_5,       0x80 },
  { CharsetRegistry::UNREGISTERED,    0x0 }
};

static const TranslateCodingSystem::Desc iso8859_6Desc[] = {
  { CharsetRegistry::ISO646_C0,       0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::ISO6429,         0x80 },
  { CharsetRegistry::ISO8859_6,       0x80 },
  { CharsetRegistry::UNREGISTERED,    0x0 }
};

static const TranslateCodingSystem::Desc iso8859_7Desc[] = {
  { CharsetRegistry::ISO646_C0,       0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::ISO6429,         0x80 },
  { CharsetRegistry::ISO8859_7,       0x80 },
  { CharsetRegistry::UNREGISTERED,    0x0 }
};

static const TranslateCodingSystem::Desc iso8859_8Desc[] = {
  { CharsetRegistry::ISO646_C0,       0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::ISO6429,         0x80 },
  { CharsetRegistry::ISO8859_8,       0x80 },
  { CharsetRegistry::UNREGISTERED,    0x0 }
};

static const TranslateCodingSystem::Desc iso8859_9Desc[] = {
  { CharsetRegistry::ISO646_C0,       0x0 },
  { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
  { CharsetRegistry::ISO6429,         0x80 },
  { CharsetRegistry::ISO8859_9,       0x80 },
  { CharsetRegistry::UNREGISTERED,    0x0 }
};

#endif /* SP_MULTI_BYTE */
251
252 const CodingSystemKitImpl::Entry CodingSystemKitImpl::bctfTable_[] = {
253 { "IDENTITY", identity },
254 #ifdef SP_MULTI_BYTE
255 { "FIXED-2", fixed2 },
256 { "UTF-8", utf8 },
257 { "EUC", eucBctf },
258 { "SJIS", sjisBctf },
259 { "BIG5", big5Bctf },
260 #endif /* SP_MULTI_BYTE */
261 { 0, identity },
262 };
263
264 const CodingSystemKitImpl::Entry CodingSystemKitImpl::encodingTable_[] = {
265 #ifdef SP_MULTI_BYTE
266 { "UTF-8", utf8 },
267 { "UCS-2", fixed2 },
268 { "ISO-10646-UCS-2", fixed2 },
269 { "UNICODE", unicode },
270 // We don't really support UTF-16, but treating it
271 // as Unicode should work for the most part.
272 { "UTF-16", unicode },
273 { "WINDOWS", ansi },
274 { "MS-DOS", oem },
275 { "WUNICODE", maybeUnicode },
276 { "XML", xml },
277 // nEncodingsRequireUnicode = 8
278 { "IS8859-1", iso8859_1 },
279 { "ISO-8859-1", iso8859_1 },
280 { "IS8859-2", iso8859_2 },
281 { "ISO-8859-2", iso8859_2 },
282 { "IS8859-3", iso8859_3 },
283 { "ISO-8859-3", iso8859_3 },
284 { "IS8859-4", iso8859_4 },
285 { "ISO-8859-4", iso8859_4 },
286 { "IS8859-5", iso8859_5 },
287 { "ISO-8859-5", iso8859_5 },
288 { "IS8859-6", iso8859_6 },
289 { "ISO-8859-6", iso8859_6 },
290 { "IS8859-7", iso8859_7 },
291 { "ISO-8859-7", iso8859_7 },
292 { "IS8859-8", iso8859_8 },
293 { "ISO-8859-8", iso8859_8 },
294 { "IS8859-9", iso8859_9 },
295 { "ISO-8859-9", iso8859_9 },
296 { "EUC-JP", eucjp },
297 { "EUC-CN", euccn },
298 { "GB2312", euccn },
299 { "CN-GB", euccn }, // RFC 1922
300 { "EUC-KR", euckr },
301 { "SJIS", sjis },
302 { "SHIFT_JIS", sjis },
303 { "BIG5", big5 },
304 { "CN-BIG5", big5 }, // RFC 1922
305 #endif /* SP_MULTI_BYTE */
306 { 0, identity },
307 };
308
CodingSystemKitImpl(const TranslateCodingSystem::Desc * systemCharsetDesc)309 CodingSystemKitImpl::CodingSystemKitImpl(const TranslateCodingSystem::Desc *systemCharsetDesc)
310 : systemCharsetDesc_(systemCharsetDesc)
311 #ifdef SP_MULTI_BYTE
312 ,
313 #ifdef WIN32
314 ansiCodingSystem_(Win32CodingSystem::codePageAnsi),
315 oemCodingSystem_(Win32CodingSystem::codePageOEM),
316 maybeUnicodeCodingSystem_(&ansiCodingSystem_),
317 #endif
318 xmlCodingSystem_(this),
319 iso8859_1CodingSystem_(&identityCodingSystem_, iso8859_1Desc, &systemCharset_, 0x100, unicodeReplaceChar),
320 iso8859_2CodingSystem_(&identityCodingSystem_, iso8859_2Desc, &systemCharset_, 0x100, unicodeReplaceChar),
321 iso8859_3CodingSystem_(&identityCodingSystem_, iso8859_3Desc, &systemCharset_, 0x100, unicodeReplaceChar),
322 iso8859_4CodingSystem_(&identityCodingSystem_, iso8859_4Desc, &systemCharset_, 0x100, unicodeReplaceChar),
323 iso8859_5CodingSystem_(&identityCodingSystem_, iso8859_5Desc, &systemCharset_, 0x100, unicodeReplaceChar),
324 iso8859_6CodingSystem_(&identityCodingSystem_, iso8859_6Desc, &systemCharset_, 0x100, unicodeReplaceChar),
325 iso8859_7CodingSystem_(&identityCodingSystem_, iso8859_7Desc, &systemCharset_, 0x100, unicodeReplaceChar),
326 iso8859_8CodingSystem_(&identityCodingSystem_, iso8859_8Desc, &systemCharset_, 0x100, unicodeReplaceChar),
327 iso8859_9CodingSystem_(&identityCodingSystem_, iso8859_9Desc, &systemCharset_, 0x100, unicodeReplaceChar),
328 eucjpCodingSystem_(&eucBctf_, jis2Desc, &systemCharset_, 0x8000, unicodeReplaceChar),
329 euccnCodingSystem_(&eucBctf_, gbDesc, &systemCharset_, 0x8000, unicodeReplaceChar),
330 euckrCodingSystem_(&eucBctf_, kscDesc, &systemCharset_, 0x8000, unicodeReplaceChar),
331 sjisCodingSystem_(&sjisBctf_, jisDesc, &systemCharset_, 0x8000, unicodeReplaceChar),
332 big5CodingSystem_(&big5Bctf_, big5Desc, &systemCharset_, 0x0080, unicodeReplaceChar)
333 #endif /* SP_MULTI_BYTE */
334 {
335 UnivCharsetDesc desc;
336 for (const TranslateCodingSystem::Desc *p = systemCharsetDesc_;
337 p->number != CharsetRegistry::UNREGISTERED;
338 p++) {
339 Owner<CharsetRegistry::Iter> iter(CharsetRegistry::makeIter(p->number));
340 if (iter) {
341 WideChar min;
342 WideChar max;
343 UnivChar univ;
344 while (iter->next(min, max, univ)) {
345 min += p->add;
346 max += p->add;
347 if (min <= charMax) {
348 if (max > charMax)
349 max = charMax;
350 desc.addRange(min, max, univ);
351 }
352 }
353 }
354 }
355 systemCharset_.set(desc);
356 }
357
copy() const358 CodingSystemKit *CodingSystemKitImpl::copy() const
359 {
360 return new CodingSystemKitImpl(systemCharsetDesc_);
361 }
362
firstEntry(Boolean isBctf) const363 const CodingSystemKitImpl::Entry *CodingSystemKitImpl::firstEntry(Boolean isBctf) const
364 {
365 if (isBctf)
366 return bctfTable_;
367 #ifdef SP_MULTI_BYTE
368 else if (systemCharsetDesc_ != iso10646Desc)
369 return encodingTable_ + nEncodingsRequireUnicode;
370 #endif
371 else
372 return encodingTable_;
373 }
374
375 const InputCodingSystem *
makeInputCodingSystem(const StringC & s,const CharsetInfo & charset,Boolean isBctf,const char * & key) const376 CodingSystemKitImpl::makeInputCodingSystem(const StringC &s,
377 const CharsetInfo &charset,
378 Boolean isBctf,
379 const char *&key) const
380 {
381 for (const Entry *p = firstEntry(isBctf); p->name; p++)
382 if (match(s, charset, p->name)) {
383 key = p->name;
384 return makeCodingSystem(p->id);
385 }
386 return 0;
387 }
388
389 Boolean
match(const StringC & s,const CharsetInfo & charset,const char * key)390 CodingSystemKitImpl::match(const StringC &s,
391 const CharsetInfo &charset,
392 const char *key)
393 {
394 for (size_t i = 0; i < s.size(); i++) {
395 if (key[i] == '\0')
396 return 0;
397 if (charset.execToDesc(toupper(key[i])) != s[i]
398 && charset.execToDesc(tolower(key[i])) != s[i])
399 return 0;
400 }
401 return key[s.size()] == '\0';
402 }
403
404 const CodingSystem *
makeCodingSystem(const char * s,Boolean isBctf) const405 CodingSystemKitImpl::makeCodingSystem(const char *s,
406 Boolean isBctf)
407 const
408 {
409 for (const Entry *p = firstEntry(isBctf); p->name; p++)
410 if (match(s, p->name))
411 return makeCodingSystem(p->id);
412 return 0;
413 }
414
415 Boolean
match(const char * s,const char * key)416 CodingSystemKitImpl::match(const char *s,
417 const char *key)
418 {
419 for (; toupper(*key) == *s || tolower(*key) == *s; s++, key++) {
420 if (*s == '\0')
421 return 1;
422 }
423 return 0;
424 }
425
426 const CodingSystem *
makeCodingSystem(CodingSystemId id) const427 CodingSystemKitImpl::makeCodingSystem(CodingSystemId id) const
428 {
429 switch (id) {
430 case identity:
431 return &identityCodingSystem_;
432 #ifdef SP_MULTI_BYTE
433 case fixed2:
434 return &fixed2CodingSystem_;
435 case utf8:
436 return &utf8CodingSystem_;
437 case unicode:
438 return &unicodeCodingSystem_;
439 case eucBctf:
440 return &eucBctf_;
441 case sjisBctf:
442 return &sjisBctf_;
443 case big5Bctf:
444 return &big5Bctf_;
445 case eucjp:
446 return &eucjpCodingSystem_;
447 case euccn:
448 return &euccnCodingSystem_;
449 case euckr:
450 return &euckrCodingSystem_;
451 case sjis:
452 return &sjisCodingSystem_;
453 case big5:
454 return &big5CodingSystem_;
455 case iso8859_1:
456 if (systemCharsetDesc_ == iso10646Desc)
457 return &identityCodingSystem_;
458 else
459 return &iso8859_1CodingSystem_;
460 case iso8859_2:
461 return &iso8859_2CodingSystem_;
462 case iso8859_3:
463 return &iso8859_3CodingSystem_;
464 case iso8859_4:
465 return &iso8859_4CodingSystem_;
466 case iso8859_5:
467 return &iso8859_5CodingSystem_;
468 case iso8859_6:
469 return &iso8859_6CodingSystem_;
470 case iso8859_7:
471 return &iso8859_7CodingSystem_;
472 case iso8859_8:
473 return &iso8859_8CodingSystem_;
474 case iso8859_9:
475 return &iso8859_9CodingSystem_;
476 case xml:
477 return &xmlCodingSystem_;
478 #ifdef WIN32
479 case ansi:
480 return &ansiCodingSystem_;
481 case oem:
482 return &oemCodingSystem_;
483 case maybeUnicode:
484 return &maybeUnicodeCodingSystem_;
485 #endif /* WIN32 */
486 #endif /* SP_MULTI_BYTE */
487 default:
488 break;
489 }
490 return 0;
491 }
492
493 const InputCodingSystem *
identityInputCodingSystem() const494 CodingSystemKitImpl::identityInputCodingSystem() const
495 {
496 return &identityCodingSystem_;
497 }
498
499 const CodingSystem *
identityCodingSystem() const500 CodingSystemKitImpl::identityCodingSystem() const
501 {
502 return &identityCodingSystem_;
503 }
504
replacementChar() const505 Char CodingSystemKitImpl::replacementChar() const
506 {
507 // FIXME should vary with systemCharset
508 #ifdef SP_MULTI_BYTE
509 return unicodeReplaceChar;
510 #else
511 return 0;
512 #endif
513 }
514
515 CodingSystemKit *
make(const char * systemCharsetName)516 CodingSystemKit::make(const char *systemCharsetName)
517 {
518 #ifdef SP_MULTI_BYTE
519 if (systemCharsetName && CodingSystemKitImpl::match(systemCharsetName, "JIS"))
520 return new CodingSystemKitImpl(jis2Desc);
521 #endif
522 return new CodingSystemKitImpl(iso10646Desc);
523 }
524
~InputCodingSystemKit()525 InputCodingSystemKit::~InputCodingSystemKit()
526 {
527 }
528
529 #ifdef SP_NAMESPACE
530 }
531 #endif
532