1 // Copyright (c) 1997 James Clark
2 // See the file COPYING for copying permission.
3 #pragma ident	"%Z%%M%	%I%	%E% SMI"
4 
5 #ifdef __GNUG__
6 #pragma implementation
7 #endif
8 
9 #include "splib.h"
10 #include "CodingSystemKit.h"
11 #include "TranslateCodingSystem.h"
12 #ifdef SP_MULTI_BYTE
13 #include "UTF8CodingSystem.h"
14 #include "Fixed2CodingSystem.h"
15 #include "UnicodeCodingSystem.h"
16 #include "XMLCodingSystem.h"
17 #include "EUCJPCodingSystem.h"
18 #include "SJISCodingSystem.h"
19 #include "Big5CodingSystem.h"
20 #ifdef WIN32
21 #include "Win32CodingSystem.h"
22 #endif
23 #endif /* SP_MULTI_BYTE */
24 #include "IdentityCodingSystem.h"
25 #include "Owner.h"
26 
27 #include <ctype.h>
28 
29 #ifdef SP_NAMESPACE
30 namespace SP_NAMESPACE {
31 #endif
32 
#ifdef SP_MULTI_BYTE
// Code handed to the translating coding systems as their replacement
// character, and returned by CodingSystemKitImpl::replacementChar().
static const Char unicodeReplaceChar = 0xfffd;
#endif
36 
37 class CodingSystemKitImpl : public CodingSystemKit {
38 public:
39   CodingSystemKitImpl(const TranslateCodingSystem::Desc *);
40   CodingSystemKit *copy() const;
41   Char replacementChar() const;
42   const CodingSystem *
43     identityCodingSystem() const;
44   const InputCodingSystem *
45     identityInputCodingSystem() const;
46   const InputCodingSystem *
47     makeInputCodingSystem(const StringC &,
48 			  const CharsetInfo &,
49 			  Boolean isBctf,
50 			  const char *&) const;
51   const CodingSystem *
52     makeCodingSystem(const char *, Boolean isBctf) const;
53   enum CodingSystemId {
54     identity,
55     fixed2,
56     utf8,
57     unicode,
58     eucjp,
59     euccn,
60     euckr,
61     sjisBctf,
62     eucBctf,
63     sjis,
64     big5,
65     big5Bctf,
66     ansi,
67     oem,
68     maybeUnicode,
69     xml,
70     iso8859_1,
71     iso8859_2,
72     iso8859_3,
73     iso8859_4,
74     iso8859_5,
75     iso8859_6,
76     iso8859_7,
77     iso8859_8,
78     iso8859_9
79   };
80   struct Entry {
81     const char *name;
82     CodingSystemId id;
83   };
84   static Boolean match(const StringC &s,
85 		       const CharsetInfo &charset,
86 		       const char *key);
87   static Boolean match(const char *s,
88 		       const char *key);
89 private:
90   const CodingSystem *
91     makeCodingSystem(CodingSystemId) const;
92   const Entry *firstEntry(Boolean isBctf) const;
93 #ifdef SP_MULTI_BYTE
94   UTF8CodingSystem utf8CodingSystem_;
95   Fixed2CodingSystem fixed2CodingSystem_;
96   UnicodeCodingSystem unicodeCodingSystem_;
97   XMLCodingSystem xmlCodingSystem_;
98   EUCJPCodingSystem eucBctf_;
99   SJISCodingSystem sjisBctf_;
100   Big5CodingSystem big5Bctf_;
101   TranslateCodingSystem eucjpCodingSystem_;
102   TranslateCodingSystem euccnCodingSystem_;
103   TranslateCodingSystem euckrCodingSystem_;
104   TranslateCodingSystem sjisCodingSystem_;
105   TranslateCodingSystem big5CodingSystem_;
106   TranslateCodingSystem iso8859_1CodingSystem_;
107   TranslateCodingSystem iso8859_2CodingSystem_;
108   TranslateCodingSystem iso8859_3CodingSystem_;
109   TranslateCodingSystem iso8859_4CodingSystem_;
110   TranslateCodingSystem iso8859_5CodingSystem_;
111   TranslateCodingSystem iso8859_6CodingSystem_;
112   TranslateCodingSystem iso8859_7CodingSystem_;
113   TranslateCodingSystem iso8859_8CodingSystem_;
114   TranslateCodingSystem iso8859_9CodingSystem_;
115 #ifdef WIN32
116   Win32CodingSystem ansiCodingSystem_;
117   Win32CodingSystem oemCodingSystem_;
118   UnicodeCodingSystem maybeUnicodeCodingSystem_;
119 #endif
120 #endif /* SP_MULTI_BYTE */
121   IdentityCodingSystem identityCodingSystem_;
122   const TranslateCodingSystem::Desc *systemCharsetDesc_;
123   static const Entry bctfTable_[];
124   enum { nEncodingsRequireUnicode = 8 };
125   static const Entry encodingTable_[];
126 };
127 
128 
129 static const TranslateCodingSystem::Desc iso10646Desc[] = {
130   { CharsetRegistry::ISO10646_UCS2, 0x0 },
131   { CharsetRegistry::UNREGISTERED, 0x0 },
132 };
133 
134 #ifdef SP_MULTI_BYTE
135 
136 static const TranslateCodingSystem::Desc jisDesc[] = {
137   { CharsetRegistry::ISO646_C0, 0x0 },
138   { CharsetRegistry::ISO646_JIS_G0, 0x0 },
139   { CharsetRegistry::ISO6429, 0x80 },
140   { CharsetRegistry::JIS0201, 0x80 },
141   { CharsetRegistry::JIS0208, 0x8080 },
142   { CharsetRegistry::UNREGISTERED, 0x0 }
143 };
144 
145 static const TranslateCodingSystem::Desc jis2Desc[] = {
146   { CharsetRegistry::ISO646_C0, 0x0 },
147   { CharsetRegistry::ISO646_JIS_G0, 0x0 },
148   { CharsetRegistry::ISO6429, 0x80 },
149   { CharsetRegistry::JIS0201, 0x80 },
150   { CharsetRegistry::JIS0208, 0x8080 },
151   { CharsetRegistry::JIS0212, 0x8000 },
152   { CharsetRegistry::UNREGISTERED, 0x0 }
153 };
154 
155 static const TranslateCodingSystem::Desc gbDesc[] = {
156   { CharsetRegistry::ISO646_C0, 0x0 },
157   { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
158   { CharsetRegistry::ISO6429, 0x80 },
159   { CharsetRegistry::GB2312, 0x8080 },
160   { CharsetRegistry::UNREGISTERED, 0x0 }
161 };
162 
163 static const TranslateCodingSystem::Desc big5Desc[] = {
164   { CharsetRegistry::ISO646_C0, 0x0 },
165   { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
166   { CharsetRegistry::BIG5, 0x0 },
167   { CharsetRegistry::UNREGISTERED, 0x0 }
168 };
169 
170 static const TranslateCodingSystem::Desc kscDesc[] = {
171   { CharsetRegistry::ISO646_C0, 0x0 },
172   { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
173   { CharsetRegistry::ISO6429, 0x80 },
174   { CharsetRegistry::KSC5601, 0x8080 },
175   { CharsetRegistry::UNREGISTERED, 0x0 }
176 };
177 
178 static const TranslateCodingSystem::Desc iso8859_1Desc[] = {
179   { CharsetRegistry::ISO646_C0, 0x0 },
180   { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
181   { CharsetRegistry::ISO6429, 0x80 },
182   { CharsetRegistry::ISO8859_1, 0x80 },
183   { CharsetRegistry::UNREGISTERED, 0x0 }
184 };
185 
186 static const TranslateCodingSystem::Desc iso8859_2Desc[] = {
187   { CharsetRegistry::ISO646_C0, 0x0 },
188   { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
189   { CharsetRegistry::ISO6429, 0x80 },
190   { CharsetRegistry::ISO8859_2, 0x80 },
191   { CharsetRegistry::UNREGISTERED, 0x0 }
192 };
193 
194 static const TranslateCodingSystem::Desc iso8859_3Desc[] = {
195   { CharsetRegistry::ISO646_C0, 0x0 },
196   { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
197   { CharsetRegistry::ISO6429, 0x80 },
198   { CharsetRegistry::ISO8859_3, 0x80 },
199   { CharsetRegistry::UNREGISTERED, 0x0 }
200 };
201 
202 static const TranslateCodingSystem::Desc iso8859_4Desc[] = {
203   { CharsetRegistry::ISO646_C0, 0x0 },
204   { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
205   { CharsetRegistry::ISO6429, 0x80 },
206   { CharsetRegistry::ISO8859_4, 0x80 },
207   { CharsetRegistry::UNREGISTERED, 0x0 }
208 };
209 
210 static const TranslateCodingSystem::Desc iso8859_5Desc[] = {
211   { CharsetRegistry::ISO646_C0, 0x0 },
212   { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
213   { CharsetRegistry::ISO6429, 0x80 },
214   { CharsetRegistry::ISO8859_5, 0x80 },
215   { CharsetRegistry::UNREGISTERED, 0x0 }
216 };
217 
218 static const TranslateCodingSystem::Desc iso8859_6Desc[] = {
219   { CharsetRegistry::ISO646_C0, 0x0 },
220   { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
221   { CharsetRegistry::ISO6429, 0x80 },
222   { CharsetRegistry::ISO8859_6, 0x80 },
223   { CharsetRegistry::UNREGISTERED, 0x0 }
224 };
225 
226 static const TranslateCodingSystem::Desc iso8859_7Desc[] = {
227   { CharsetRegistry::ISO646_C0, 0x0 },
228   { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
229   { CharsetRegistry::ISO6429, 0x80 },
230   { CharsetRegistry::ISO8859_7, 0x80 },
231   { CharsetRegistry::UNREGISTERED, 0x0 }
232 };
233 
234 static const TranslateCodingSystem::Desc iso8859_8Desc[] = {
235   { CharsetRegistry::ISO646_C0, 0x0 },
236   { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
237   { CharsetRegistry::ISO6429, 0x80 },
238   { CharsetRegistry::ISO8859_8, 0x80 },
239   { CharsetRegistry::UNREGISTERED, 0x0 }
240 };
241 
242 static const TranslateCodingSystem::Desc iso8859_9Desc[] = {
243   { CharsetRegistry::ISO646_C0, 0x0 },
244   { CharsetRegistry::ISO646_ASCII_G0, 0x0 },
245   { CharsetRegistry::ISO6429, 0x80 },
246   { CharsetRegistry::ISO8859_9, 0x80 },
247   { CharsetRegistry::UNREGISTERED, 0x0 }
248 };
249 
250 #endif /* SP_MULTI_BYTE */
251 
252 const CodingSystemKitImpl::Entry CodingSystemKitImpl::bctfTable_[] = {
253   { "IDENTITY", identity },
254 #ifdef SP_MULTI_BYTE
255   { "FIXED-2", fixed2 },
256   { "UTF-8", utf8 },
257   { "EUC", eucBctf },
258   { "SJIS", sjisBctf },
259   { "BIG5", big5Bctf },
260 #endif /* SP_MULTI_BYTE */
261   { 0, identity },
262 };
263 
264 const CodingSystemKitImpl::Entry CodingSystemKitImpl::encodingTable_[] = {
265 #ifdef SP_MULTI_BYTE
266   { "UTF-8", utf8 },
267   { "UCS-2", fixed2 },
268   { "ISO-10646-UCS-2", fixed2 },
269   { "UNICODE", unicode },
270   // We don't really support UTF-16, but treating it
271   // as Unicode should work for the most part.
272   { "UTF-16", unicode },
273   { "WINDOWS", ansi },
274   { "MS-DOS", oem },
275   { "WUNICODE", maybeUnicode },
276   { "XML", xml },
277   // nEncodingsRequireUnicode = 8
278   { "IS8859-1", iso8859_1 },
279   { "ISO-8859-1", iso8859_1 },
280   { "IS8859-2", iso8859_2 },
281   { "ISO-8859-2", iso8859_2 },
282   { "IS8859-3", iso8859_3 },
283   { "ISO-8859-3", iso8859_3 },
284   { "IS8859-4", iso8859_4 },
285   { "ISO-8859-4", iso8859_4 },
286   { "IS8859-5", iso8859_5 },
287   { "ISO-8859-5", iso8859_5 },
288   { "IS8859-6", iso8859_6 },
289   { "ISO-8859-6", iso8859_6 },
290   { "IS8859-7", iso8859_7 },
291   { "ISO-8859-7", iso8859_7 },
292   { "IS8859-8", iso8859_8 },
293   { "ISO-8859-8", iso8859_8 },
294   { "IS8859-9", iso8859_9 },
295   { "ISO-8859-9", iso8859_9 },
296   { "EUC-JP", eucjp },
297   { "EUC-CN", euccn },
298   { "GB2312", euccn },
299   { "CN-GB", euccn },  // RFC 1922
300   { "EUC-KR", euckr },
301   { "SJIS", sjis },
302   { "SHIFT_JIS", sjis },
303   { "BIG5", big5 },
304   { "CN-BIG5", big5 }, // RFC 1922
305 #endif /* SP_MULTI_BYTE */
306   { 0, identity },
307 };
308 
CodingSystemKitImpl(const TranslateCodingSystem::Desc * systemCharsetDesc)309 CodingSystemKitImpl::CodingSystemKitImpl(const TranslateCodingSystem::Desc *systemCharsetDesc)
310 : systemCharsetDesc_(systemCharsetDesc)
311 #ifdef SP_MULTI_BYTE
312   ,
313 #ifdef WIN32
314   ansiCodingSystem_(Win32CodingSystem::codePageAnsi),
315   oemCodingSystem_(Win32CodingSystem::codePageOEM),
316   maybeUnicodeCodingSystem_(&ansiCodingSystem_),
317 #endif
318   xmlCodingSystem_(this),
319   iso8859_1CodingSystem_(&identityCodingSystem_, iso8859_1Desc, &systemCharset_, 0x100, unicodeReplaceChar),
320   iso8859_2CodingSystem_(&identityCodingSystem_, iso8859_2Desc, &systemCharset_, 0x100, unicodeReplaceChar),
321   iso8859_3CodingSystem_(&identityCodingSystem_, iso8859_3Desc, &systemCharset_, 0x100, unicodeReplaceChar),
322   iso8859_4CodingSystem_(&identityCodingSystem_, iso8859_4Desc, &systemCharset_, 0x100, unicodeReplaceChar),
323   iso8859_5CodingSystem_(&identityCodingSystem_, iso8859_5Desc, &systemCharset_, 0x100, unicodeReplaceChar),
324   iso8859_6CodingSystem_(&identityCodingSystem_, iso8859_6Desc, &systemCharset_, 0x100, unicodeReplaceChar),
325   iso8859_7CodingSystem_(&identityCodingSystem_, iso8859_7Desc, &systemCharset_, 0x100, unicodeReplaceChar),
326   iso8859_8CodingSystem_(&identityCodingSystem_, iso8859_8Desc, &systemCharset_, 0x100, unicodeReplaceChar),
327   iso8859_9CodingSystem_(&identityCodingSystem_, iso8859_9Desc, &systemCharset_, 0x100, unicodeReplaceChar),
328   eucjpCodingSystem_(&eucBctf_, jis2Desc, &systemCharset_, 0x8000, unicodeReplaceChar),
329   euccnCodingSystem_(&eucBctf_, gbDesc, &systemCharset_, 0x8000, unicodeReplaceChar),
330   euckrCodingSystem_(&eucBctf_, kscDesc, &systemCharset_, 0x8000, unicodeReplaceChar),
331   sjisCodingSystem_(&sjisBctf_, jisDesc, &systemCharset_, 0x8000, unicodeReplaceChar),
332   big5CodingSystem_(&big5Bctf_, big5Desc, &systemCharset_, 0x0080, unicodeReplaceChar)
333 #endif /* SP_MULTI_BYTE */
334 {
335   UnivCharsetDesc desc;
336   for (const TranslateCodingSystem::Desc *p = systemCharsetDesc_;
337        p->number != CharsetRegistry::UNREGISTERED;
338        p++) {
339     Owner<CharsetRegistry::Iter> iter(CharsetRegistry::makeIter(p->number));
340     if (iter) {
341       WideChar min;
342       WideChar max;
343       UnivChar univ;
344       while (iter->next(min, max, univ)) {
345 	min += p->add;
346 	max += p->add;
347 	if (min <= charMax) {
348 	  if (max > charMax)
349 	    max = charMax;
350 	  desc.addRange(min, max, univ);
351 	}
352       }
353     }
354   }
355   systemCharset_.set(desc);
356 }
357 
copy() const358 CodingSystemKit *CodingSystemKitImpl::copy() const
359 {
360   return new CodingSystemKitImpl(systemCharsetDesc_);
361 }
362 
firstEntry(Boolean isBctf) const363 const CodingSystemKitImpl::Entry *CodingSystemKitImpl::firstEntry(Boolean isBctf) const
364 {
365   if (isBctf)
366     return bctfTable_;
367 #ifdef SP_MULTI_BYTE
368   else if (systemCharsetDesc_ != iso10646Desc)
369     return encodingTable_ + nEncodingsRequireUnicode;
370 #endif
371   else
372     return encodingTable_;
373 }
374 
375 const InputCodingSystem *
makeInputCodingSystem(const StringC & s,const CharsetInfo & charset,Boolean isBctf,const char * & key) const376 CodingSystemKitImpl::makeInputCodingSystem(const StringC &s,
377 					   const CharsetInfo &charset,
378 					   Boolean isBctf,
379 					   const char *&key) const
380 {
381   for (const Entry *p = firstEntry(isBctf); p->name; p++)
382     if (match(s, charset, p->name)) {
383       key = p->name;
384       return makeCodingSystem(p->id);
385     }
386   return 0;
387 }
388 
389 Boolean
match(const StringC & s,const CharsetInfo & charset,const char * key)390 CodingSystemKitImpl::match(const StringC &s,
391 			   const CharsetInfo &charset,
392 			   const char *key)
393 {
394   for (size_t i = 0; i < s.size(); i++) {
395     if (key[i] == '\0')
396       return 0;
397     if (charset.execToDesc(toupper(key[i])) != s[i]
398         && charset.execToDesc(tolower(key[i])) != s[i])
399       return 0;
400   }
401   return key[s.size()] == '\0';
402 }
403 
404 const CodingSystem *
makeCodingSystem(const char * s,Boolean isBctf) const405 CodingSystemKitImpl::makeCodingSystem(const char *s,
406 				      Boolean isBctf)
407   const
408 {
409  for (const Entry *p = firstEntry(isBctf); p->name; p++)
410    if (match(s, p->name))
411       return makeCodingSystem(p->id);
412   return 0;
413 }
414 
415 Boolean
match(const char * s,const char * key)416 CodingSystemKitImpl::match(const char *s,
417 			   const char *key)
418 {
419   for (; toupper(*key) == *s || tolower(*key) == *s; s++, key++) {
420     if (*s == '\0')
421       return 1;
422   }
423   return 0;
424 }
425 
426 const CodingSystem *
makeCodingSystem(CodingSystemId id) const427 CodingSystemKitImpl::makeCodingSystem(CodingSystemId id) const
428 {
429   switch (id) {
430   case identity:
431     return &identityCodingSystem_;
432 #ifdef SP_MULTI_BYTE
433   case fixed2:
434     return &fixed2CodingSystem_;
435   case utf8:
436     return &utf8CodingSystem_;
437   case unicode:
438     return &unicodeCodingSystem_;
439   case eucBctf:
440     return &eucBctf_;
441   case sjisBctf:
442     return &sjisBctf_;
443   case big5Bctf:
444     return &big5Bctf_;
445   case eucjp:
446     return &eucjpCodingSystem_;
447   case euccn:
448     return &euccnCodingSystem_;
449   case euckr:
450     return &euckrCodingSystem_;
451   case sjis:
452     return &sjisCodingSystem_;
453   case big5:
454     return &big5CodingSystem_;
455   case iso8859_1:
456     if (systemCharsetDesc_ == iso10646Desc)
457       return &identityCodingSystem_;
458     else
459       return &iso8859_1CodingSystem_;
460   case iso8859_2:
461     return &iso8859_2CodingSystem_;
462   case iso8859_3:
463     return &iso8859_3CodingSystem_;
464   case iso8859_4:
465     return &iso8859_4CodingSystem_;
466   case iso8859_5:
467     return &iso8859_5CodingSystem_;
468   case iso8859_6:
469     return &iso8859_6CodingSystem_;
470   case iso8859_7:
471     return &iso8859_7CodingSystem_;
472   case iso8859_8:
473     return &iso8859_8CodingSystem_;
474   case iso8859_9:
475     return &iso8859_9CodingSystem_;
476   case xml:
477     return &xmlCodingSystem_;
478 #ifdef WIN32
479   case ansi:
480     return &ansiCodingSystem_;
481   case oem:
482     return &oemCodingSystem_;
483   case maybeUnicode:
484     return &maybeUnicodeCodingSystem_;
485 #endif /* WIN32 */
486 #endif /* SP_MULTI_BYTE */
487   default:
488     break;
489   }
490   return 0;
491 }
492 
493 const InputCodingSystem *
identityInputCodingSystem() const494 CodingSystemKitImpl::identityInputCodingSystem() const
495 {
496   return &identityCodingSystem_;
497 }
498 
499 const CodingSystem *
identityCodingSystem() const500 CodingSystemKitImpl::identityCodingSystem() const
501 {
502   return &identityCodingSystem_;
503 }
504 
replacementChar() const505 Char CodingSystemKitImpl::replacementChar() const
506 {
507   // FIXME should vary with systemCharset
508 #ifdef SP_MULTI_BYTE
509   return unicodeReplaceChar;
510 #else
511   return 0;
512 #endif
513 }
514 
515 CodingSystemKit *
make(const char * systemCharsetName)516 CodingSystemKit::make(const char *systemCharsetName)
517 {
518 #ifdef SP_MULTI_BYTE
519   if (systemCharsetName && CodingSystemKitImpl::match(systemCharsetName, "JIS"))
520     return new CodingSystemKitImpl(jis2Desc);
521 #endif
522   return new CodingSystemKitImpl(iso10646Desc);
523 }
524 
~InputCodingSystemKit()525 InputCodingSystemKit::~InputCodingSystemKit()
526 {
527 }
528 
529 #ifdef SP_NAMESPACE
530 }
531 #endif
532