1 /* Charset handling while reading PO files.
2 Copyright (C) 2001-2006 Free Software Foundation, Inc.
3 Written by Bruno Haible <haible@clisp.cons.org>, 2001.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
18
19
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23 #include <alloca.h>
24
25 /* Specification. */
26 #include "po-charset.h"
27
28 #include <stdlib.h>
29 #include <string.h>
30
31 #include "xallocsa.h"
32 #include "xvasprintf.h"
33 #include "po-xerror.h"
34 #include "basename.h"
35 #include "progname.h"
36 #include "c-strstr.h"
37 #include "c-strcase.h"
38 #include "gettext.h"
39
/* Shorthand for looking up the translation of a user-visible string.  */
#define _(str) gettext (str)

/* Number of elements of the array A.  Only meaningful for true arrays,
   not for pointers.  The argument is parenthesized so that any array-valued
   expression can be passed.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
43
/* Storage for the canonical ASCII encoding name.  */
static const char ascii[] = "ASCII";

/* The canonicalized encoding name for ASCII.  */
const char *po_charset_ascii = ascii;

/* Storage for the canonical UTF-8 encoding name.  Code in this file
   recognizes UTF-8 by comparing a pointer against this array, so the
   canonical name must always be handed out as this very object.  */
static const char utf8[] = "UTF-8";

/* The canonicalized encoding name for UTF-8.  */
const char *po_charset_utf8 = utf8;
53
54 /* Canonicalize an encoding name. */
55 const char *
po_charset_canonicalize(const char * charset)56 po_charset_canonicalize (const char *charset)
57 {
58 /* The list of charsets supported by glibc's iconv() and by the portable
59 iconv() across platforms. Taken from intl/config.charset. */
60 static const char *standard_charsets[] =
61 {
62 ascii, "ANSI_X3.4-1968", "US-ASCII", /* i = 0..2 */
63 "ISO-8859-1", "ISO_8859-1", /* i = 3, 4 */
64 "ISO-8859-2", "ISO_8859-2",
65 "ISO-8859-3", "ISO_8859-3",
66 "ISO-8859-4", "ISO_8859-4",
67 "ISO-8859-5", "ISO_8859-5",
68 "ISO-8859-6", "ISO_8859-6",
69 "ISO-8859-7", "ISO_8859-7",
70 "ISO-8859-8", "ISO_8859-8",
71 "ISO-8859-9", "ISO_8859-9",
72 "ISO-8859-13", "ISO_8859-13",
73 "ISO-8859-14", "ISO_8859-14",
74 "ISO-8859-15", "ISO_8859-15", /* i = 25, 26 */
75 "KOI8-R",
76 "KOI8-U",
77 "KOI8-T",
78 "CP850",
79 "CP866",
80 "CP874",
81 "CP932",
82 "CP949",
83 "CP950",
84 "CP1250",
85 "CP1251",
86 "CP1252",
87 "CP1253",
88 "CP1254",
89 "CP1255",
90 "CP1256",
91 "CP1257",
92 "GB2312",
93 "EUC-JP",
94 "EUC-KR",
95 "EUC-TW",
96 "BIG5",
97 "BIG5-HKSCS",
98 "GBK",
99 "GB18030",
100 "SHIFT_JIS",
101 "JOHAB",
102 "TIS-620",
103 "VISCII",
104 "GEORGIAN-PS",
105 utf8
106 };
107 size_t i;
108
109 for (i = 0; i < SIZEOF (standard_charsets); i++)
110 if (c_strcasecmp (charset, standard_charsets[i]) == 0)
111 return standard_charsets[i < 3 ? 0 : i < 27 ? ((i - 3) & ~1) + 3 : i];
112 return NULL;
113 }
114
/* Test for ASCII compatibility.
   CANON_CHARSET must be a canonicalized encoding name; it is compared
   case-sensitively.  Returns false only for the few encodings known not to
   be ASCII compatible, and true for every other name.  */
bool
po_charset_ascii_compatible (const char *canon_charset)
{
  /* There are only a few exceptions to ASCII compatibility.  */
  return !(strcmp (canon_charset, "SHIFT_JIS") == 0
           || strcmp (canon_charset, "JOHAB") == 0
           || strcmp (canon_charset, "VISCII") == 0);
}
127
/* Test for a weird encoding, i.e. an encoding which has double-byte
   characters ending in 0x5C.
   CANON_CHARSET must be a canonicalized encoding name; it is compared
   case-sensitively against the list below.  */
bool
po_is_charset_weird (const char *canon_charset)
{
  static const char *weird_charsets[] =
  {
    "BIG5",
    "BIG5-HKSCS",
    "GBK",
    "GB18030",
    "SHIFT_JIS",
    "JOHAB"
  };
  const size_t count = sizeof weird_charsets / sizeof weird_charsets[0];
  size_t i;

  for (i = 0; i < count; i++)
    if (strcmp (canon_charset, weird_charsets[i]) == 0)
      return true;
  return false;
}
148
/* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure.
   An encoding has CJK structure if every valid character stream is composed
   of single bytes in the range 0x{00..7F} and of byte pairs in the range
   0x{80..FF}{30..FF}.
   CANON_CHARSET must be a canonicalized encoding name; it is compared
   case-sensitively against the list below.  */
bool
po_is_charset_weird_cjk (const char *canon_charset)
{
  static const char *weird_cjk_charsets[] =
  {                     /* single bytes   double bytes       */
    "BIG5",             /* 0x{00..7F},    0x{A1..F9}{40..FE} */
    "BIG5-HKSCS",       /* 0x{00..7F},    0x{88..FE}{40..FE} */
    "GBK",              /* 0x{00..7F},    0x{81..FE}{40..FE} */
    "GB18030",          /* 0x{00..7F},    0x{81..FE}{30..FE} */
    "SHIFT_JIS",        /* 0x{00..7F},    0x{81..F9}{40..FC} */
    "JOHAB"             /* 0x{00..7F},    0x{84..F9}{31..FE} */
  };
  const size_t count =
    sizeof weird_cjk_charsets / sizeof weird_cjk_charsets[0];
  size_t i;

  for (i = 0; i < count; i++)
    if (strcmp (canon_charset, weird_cjk_charsets[i]) == 0)
      return true;
  return false;
}
171
/* Hardcoded iterator functions for all kinds of encodings.
   We could also implement a general iterator function with iconv(),
   but we need a fast one.  */
175
/* Character iterator for 8-bit encodings.
   Every character occupies exactly one byte, so S is not examined.  */
static size_t
char_iterator (const char *s)
{
  (void) s;
  return 1;
}
182
/* Character iterator for GB2312.  See libiconv/lib/euc_cn.h.  */
/* Character iterator for EUC-KR.  See libiconv/lib/euc_kr.h.  */
/* Returns 2 if S starts with two bytes that are both in 0xA1..0xFE,
   otherwise 1.  The second byte is only examined after the first one has
   matched, so the terminating NUL of S is never passed.  */
static size_t
euc_character_iterator (const char *s)
{
  unsigned char c = *s;
  if (c >= 0xa1 && c < 0xff)
    {
      unsigned char c2 = s[1];
      if (c2 >= 0xa1 && c2 < 0xff)
        return 2;
    }
  return 1;
}
197
/* Character iterator for EUC-JP.  See libiconv/lib/euc_jp.h.
   Recognized multibyte forms:
     0x{A1..FE}{A1..FE}        2 bytes
     0x8E{A1..DF}              2 bytes
     0x8F{A1..FE}{A1..FE}      3 bytes
   Anything else counts as 1 byte.  Each following byte is only examined
   after the previous one has matched, so the terminating NUL of S is never
   passed.  */
static size_t
euc_jp_character_iterator (const char *s)
{
  unsigned char c = *s;
  if (c >= 0xa1 && c < 0xff)
    {
      unsigned char c2 = s[1];
      if (c2 >= 0xa1 && c2 < 0xff)
        return 2;
    }
  else if (c == 0x8e)
    {
      unsigned char c2 = s[1];
      if (c2 >= 0xa1 && c2 < 0xe0)
        return 2;
    }
  else if (c == 0x8f)
    {
      unsigned char c2 = s[1];
      if (c2 >= 0xa1 && c2 < 0xff)
        {
          unsigned char c3 = s[2];
          if (c3 >= 0xa1 && c3 < 0xff)
            return 3;
        }
    }
  return 1;
}
227
/* Character iterator for EUC-TW.  See libiconv/lib/euc_tw.h.
   Recognized multibyte forms:
     0x{A1..FE}{A1..FE}                2 bytes
     0x8E{A1..B0}{A1..FE}{A1..FE}      4 bytes
   Anything else counts as 1 byte.  Each following byte is only examined
   after the previous one has matched, so the terminating NUL of S is never
   passed.  */
static size_t
euc_tw_character_iterator (const char *s)
{
  unsigned char c = *s;
  if (c >= 0xa1 && c < 0xff)
    {
      unsigned char c2 = s[1];
      if (c2 >= 0xa1 && c2 < 0xff)
        return 2;
    }
  else if (c == 0x8e)
    {
      unsigned char c2 = s[1];
      if (c2 >= 0xa1 && c2 <= 0xb0)
        {
          unsigned char c3 = s[2];
          if (c3 >= 0xa1 && c3 < 0xff)
            {
              unsigned char c4 = s[3];
              if (c4 >= 0xa1 && c4 < 0xff)
                return 4;
            }
        }
    }
  return 1;
}
255
/* Character iterator for BIG5.  See libiconv/lib/ces_big5.h.
   Returns 2 if S starts with a byte in 0xA1..0xFE followed by a byte in
   0x40..0x7E or 0xA1..0xFE, otherwise 1.  The second byte is only examined
   after the first one has matched.  */
static size_t
big5_character_iterator (const char *s)
{
  unsigned char c = *s;
  if (c >= 0xa1 && c < 0xff)
    {
      unsigned char c2 = s[1];
      if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0xa1 && c2 < 0xff))
        return 2;
    }
  return 1;
}
269
/* Character iterator for BIG5-HKSCS.  See libiconv/lib/big5hkscs.h.
   Returns 2 if S starts with a byte in 0x88..0xFE followed by a byte in
   0x40..0x7E or 0xA1..0xFE, otherwise 1.  The second byte is only examined
   after the first one has matched.  */
static size_t
big5hkscs_character_iterator (const char *s)
{
  unsigned char c = *s;
  if (c >= 0x88 && c < 0xff)
    {
      unsigned char c2 = s[1];
      if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0xa1 && c2 < 0xff))
        return 2;
    }
  return 1;
}
283
/* Character iterator for GBK.  See libiconv/lib/ces_gbk.h and
   libiconv/lib/gbk.h.
   Returns 2 if S starts with a byte in 0x81..0xFE followed by a byte in
   0x40..0x7E or 0x80..0xFE, otherwise 1.  The second byte is only examined
   after the first one has matched.  */
static size_t
gbk_character_iterator (const char *s)
{
  unsigned char c = *s;
  if (c >= 0x81 && c < 0xff)
    {
      unsigned char c2 = s[1];
      if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0x80 && c2 < 0xff))
        return 2;
    }
  return 1;
}
298
/* Character iterator for GB18030.  See libiconv/lib/gb18030.h.
   Recognized multibyte forms:
     0x{81..FE}{40..7E,80..FE}                 2 bytes
     0x{81..FE}{30..39}{81..FE}{30..39}        4 bytes
   Anything else counts as 1 byte.
   The four-byte form is accepted for every lead byte in 0x81..0xFE, not
   only 0x81..0x84: characters outside the BMP are encoded with lead bytes
   0x90..0xE3, and splitting them into single bytes would expose their
   0x30..0x39 bytes as ASCII digits.  The two forms cannot be confused,
   because their second bytes come from disjoint ranges.
   Each following byte is only examined after the previous one has matched,
   so the terminating NUL of S is never passed.  */
static size_t
gb18030_character_iterator (const char *s)
{
  unsigned char c = *s;
  if (c >= 0x81 && c < 0xff)
    {
      unsigned char c2 = s[1];
      if ((c2 >= 0x40 && c2 < 0x7f) || (c2 >= 0x80 && c2 < 0xff))
        return 2;
      if (c2 >= 0x30 && c2 <= 0x39)
        {
          unsigned char c3 = s[2];
          if (c3 >= 0x81 && c3 < 0xff)
            {
              unsigned char c4 = s[3];
              if (c4 >= 0x30 && c4 <= 0x39)
                return 4;
            }
        }
    }
  return 1;
}
326
/* Character iterator for SHIFT_JIS.  See libiconv/lib/sjis.h.
   Returns 2 if S starts with a byte in 0x81..0x9F or 0xE0..0xF9 followed by
   a byte in 0x40..0x7E or 0x80..0xFC, otherwise 1.  The second byte is only
   examined after the first one has matched.  */
static size_t
shift_jis_character_iterator (const char *s)
{
  unsigned char c = *s;
  if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xf9))
    {
      unsigned char c2 = s[1];
      if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfc))
        return 2;
    }
  return 1;
}
340
/* Character iterator for JOHAB.  See libiconv/lib/johab.h and
   libiconv/lib/johab_hangul.h.
   Recognized double-byte forms:
     0x{84..D3}{41..7E,81..FE}
     0x{D9..F9}{31..7E,91..FE}
   Anything else counts as 1 byte.  The second byte is only examined after
   the first one has matched.  */
static size_t
johab_character_iterator (const char *s)
{
  unsigned char c = *s;
  if (c >= 0x84 && c <= 0xd3)
    {
      unsigned char c2 = s[1];
      if ((c2 >= 0x41 && c2 < 0x7f) || (c2 >= 0x81 && c2 < 0xff))
        return 2;
    }
  else if (c >= 0xd9 && c <= 0xf9)
    {
      unsigned char c2 = s[1];
      if ((c2 >= 0x31 && c2 <= 0x7e) || (c2 >= 0x91 && c2 <= 0xfe))
        return 2;
    }
  return 1;
}
361
/* Character iterator for UTF-8.  See libiconv/lib/utf8.h.
   The lead byte selects the expected length (0xC2..0xDF: 2, 0xE0..0xEF: 3,
   0xF0..0xF7: 4) and every following byte must be in 0x80..0xBF; when a
   following byte does not match, or for any other lead byte, the result
   is 1.  Each following byte is only examined after the previous one has
   matched, so the terminating NUL of S is never passed.  */
static size_t
utf8_character_iterator (const char *s)
{
  unsigned char c = *s;
  if (c >= 0xc2)
    {
      if (c < 0xe0)
        {
          unsigned char c2 = s[1];
          if (c2 >= 0x80 && c2 < 0xc0)
            return 2;
        }
      else if (c < 0xf0)
        {
          unsigned char c2 = s[1];
          if (c2 >= 0x80 && c2 < 0xc0)
            {
              unsigned char c3 = s[2];
              if (c3 >= 0x80 && c3 < 0xc0)
                return 3;
            }
        }
      else if (c < 0xf8)
        {
          unsigned char c2 = s[1];
          if (c2 >= 0x80 && c2 < 0xc0)
            {
              unsigned char c3 = s[2];
              if (c3 >= 0x80 && c3 < 0xc0)
                {
                  unsigned char c4 = s[3];
                  if (c4 >= 0x80 && c4 < 0xc0)
                    return 4;
                }
            }
        }
    }
  return 1;
}
402
403 /* Returns a character iterator for a given encoding.
404 Given a pointer into a string, it returns the number occupied by the next
405 single character. If the piece of string is not valid or if the *s == '\0',
406 it returns 1. */
407 character_iterator_t
po_charset_character_iterator(const char * canon_charset)408 po_charset_character_iterator (const char *canon_charset)
409 {
410 if (canon_charset == utf8)
411 return utf8_character_iterator;
412 if (strcmp (canon_charset, "GB2312") == 0
413 || strcmp (canon_charset, "EUC-KR") == 0)
414 return euc_character_iterator;
415 if (strcmp (canon_charset, "EUC-JP") == 0)
416 return euc_jp_character_iterator;
417 if (strcmp (canon_charset, "EUC-TW") == 0)
418 return euc_tw_character_iterator;
419 if (strcmp (canon_charset, "BIG5") == 0)
420 return big5_character_iterator;
421 if (strcmp (canon_charset, "BIG5-HKSCS") == 0)
422 return big5hkscs_character_iterator;
423 if (strcmp (canon_charset, "GBK") == 0)
424 return gbk_character_iterator;
425 if (strcmp (canon_charset, "GB18030") == 0)
426 return gb18030_character_iterator;
427 if (strcmp (canon_charset, "SHIFT_JIS") == 0)
428 return shift_jis_character_iterator;
429 if (strcmp (canon_charset, "JOHAB") == 0)
430 return johab_character_iterator;
431 return char_iterator;
432 }
433
434
/* The PO file's encoding, as specified in the header entry.
   NULL as long as no supported charset has been set.  */
const char *po_lex_charset;

#if HAVE_ICONV
/* Converter from the PO file's encoding to UTF-8.
   (iconv_t)(-1) when no converter is open.  */
iconv_t po_lex_iconv;
#endif
/* If no converter is available, some information about the structure of the
   PO file's encoding.  */
bool po_lex_weird_cjk;
445
446 void
po_lex_charset_init()447 po_lex_charset_init ()
448 {
449 po_lex_charset = NULL;
450 #if HAVE_ICONV
451 po_lex_iconv = (iconv_t)(-1);
452 #endif
453 po_lex_weird_cjk = false;
454 }
455
456 void
po_lex_charset_set(const char * header_entry,const char * filename)457 po_lex_charset_set (const char *header_entry, const char *filename)
458 {
459 /* Verify the validity of CHARSET. It is necessary
460 1. for the correct treatment of multibyte characters containing
461 0x5C bytes in the PO lexer,
462 2. so that at run time, gettext() can call iconv() to convert
463 msgstr. */
464 const char *charsetstr = c_strstr (header_entry, "charset=");
465
466 if (charsetstr != NULL)
467 {
468 size_t len;
469 char *charset;
470 const char *canon_charset;
471
472 charsetstr += strlen ("charset=");
473 len = strcspn (charsetstr, " \t\n");
474 charset = (char *) xallocsa (len + 1);
475 memcpy (charset, charsetstr, len);
476 charset[len] = '\0';
477
478 canon_charset = po_charset_canonicalize (charset);
479 if (canon_charset == NULL)
480 {
481 /* Don't warn for POT files, because POT files usually contain
482 only ASCII msgids. */
483 size_t filenamelen = strlen (filename);
484
485 if (!(filenamelen >= 4
486 && memcmp (filename + filenamelen - 4, ".pot", 4) == 0
487 && strcmp (charset, "CHARSET") == 0))
488 {
489 char *warning_message =
490 xasprintf (_("\
491 Charset \"%s\" is not a portable encoding name.\n\
492 Message conversion to user's charset might not work.\n"),
493 charset);
494 po_xerror (PO_SEVERITY_WARNING, NULL,
495 filename, (size_t)(-1), (size_t)(-1), true,
496 warning_message);
497 free (warning_message);
498 }
499 }
500 else
501 {
502 const char *envval;
503
504 po_lex_charset = canon_charset;
505 #if HAVE_ICONV
506 if (po_lex_iconv != (iconv_t)(-1))
507 iconv_close (po_lex_iconv);
508 #endif
509
510 /* The old Solaris/openwin msgfmt and GNU msgfmt <= 0.10.35
511 don't know about multibyte encodings, and require a spurious
512 backslash after every multibyte character whose last byte is
513 0x5C. Some programs, like vim, distribute PO files in this
514 broken format. GNU msgfmt must continue to support this old
515 PO file format when the Makefile requests it. */
516 envval = getenv ("OLD_PO_FILE_INPUT");
517 if (envval != NULL && *envval != '\0')
518 {
519 /* Assume the PO file is in old format, with extraneous
520 backslashes. */
521 #if HAVE_ICONV
522 po_lex_iconv = (iconv_t)(-1);
523 #endif
524 po_lex_weird_cjk = false;
525 }
526 else
527 {
528 /* Use iconv() to parse multibyte characters. */
529 #if HAVE_ICONV
530 /* Avoid glibc-2.1 bug with EUC-KR. */
531 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
532 if (strcmp (po_lex_charset, "EUC-KR") == 0)
533 po_lex_iconv = (iconv_t)(-1);
534 else
535 # endif
536 /* Avoid Solaris 2.9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS,
537 GBK, GB18030. */
538 # if defined __sun && !defined _LIBICONV_VERSION
539 if ( strcmp (po_lex_charset, "GB2312") == 0
540 || strcmp (po_lex_charset, "EUC-TW") == 0
541 || strcmp (po_lex_charset, "BIG5") == 0
542 || strcmp (po_lex_charset, "BIG5-HKSCS") == 0
543 || strcmp (po_lex_charset, "GBK") == 0
544 || strcmp (po_lex_charset, "GB18030") == 0)
545 po_lex_iconv = (iconv_t)(-1);
546 else
547 # endif
548 po_lex_iconv = iconv_open ("UTF-8", po_lex_charset);
549 if (po_lex_iconv == (iconv_t)(-1))
550 {
551 char *warning_message;
552 const char *recommendation;
553 const char *note;
554 char *whole_message;
555
556 warning_message =
557 xasprintf (_("\
558 Charset \"%s\" is not supported. %s relies on iconv(),\n\
559 and iconv() does not support \"%s\".\n"),
560 po_lex_charset, basename (program_name),
561 po_lex_charset);
562
563 # if !defined _LIBICONV_VERSION
564 recommendation = _("\
565 Installing GNU libiconv and then reinstalling GNU gettext\n\
566 would fix this problem.\n");
567 # else
568 recommendation = "";
569 # endif
570
571 /* Test for a charset which has double-byte characters
572 ending in 0x5C. For these encodings, the string parser
573 is likely to be confused if it can't see the character
574 boundaries. */
575 po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
576 if (po_is_charset_weird (po_lex_charset)
577 && !po_lex_weird_cjk)
578 note = _("Continuing anyway, expect parse errors.");
579 else
580 note = _("Continuing anyway.");
581
582 whole_message =
583 xasprintf ("%s%s%s\n",
584 warning_message, recommendation, note);
585
586 po_xerror (PO_SEVERITY_WARNING, NULL,
587 filename, (size_t)(-1), (size_t)(-1), true,
588 whole_message);
589
590 free (whole_message);
591 free (warning_message);
592 }
593 #else
594 /* Test for a charset which has double-byte characters
595 ending in 0x5C. For these encodings, the string parser
596 is likely to be confused if it can't see the character
597 boundaries. */
598 po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
599 if (po_is_charset_weird (po_lex_charset) && !po_lex_weird_cjk)
600 {
601 char *warning_message;
602 const char *recommendation;
603 const char *note;
604 char *whole_message;
605
606 warning_message =
607 xasprintf (_("\
608 Charset \"%s\" is not supported. %s relies on iconv().\n\
609 This version was built without iconv().\n"),
610 po_lex_charset, basename (program_name));
611
612 recommendation = _("\
613 Installing GNU libiconv and then reinstalling GNU gettext\n\
614 would fix this problem.\n");
615
616 note = _("Continuing anyway, expect parse errors.");
617
618 whole_message =
619 xasprintf ("%s%s%s\n",
620 warning_message, recommendation, note);
621
622 po_xerror (PO_SEVERITY_WARNING, NULL,
623 filename, (size_t)(-1), (size_t)(-1), true,
624 whole_message);
625
626 free (whole_message);
627 free (warning_message);
628 }
629 #endif
630 }
631 }
632 freesa (charset);
633 }
634 else
635 {
636 /* Don't warn for POT files, because POT files usually contain
637 only ASCII msgids. */
638 size_t filenamelen = strlen (filename);
639
640 if (!(filenamelen >= 4
641 && memcmp (filename + filenamelen - 4, ".pot", 4) == 0))
642 po_xerror (PO_SEVERITY_WARNING,
643 NULL, filename, (size_t)(-1), (size_t)(-1), true,
644 _("\
645 Charset missing in header.\n\
646 Message conversion to user's charset will not work.\n"));
647 }
648 }
649
650 void
po_lex_charset_close()651 po_lex_charset_close ()
652 {
653 po_lex_charset = NULL;
654 #if HAVE_ICONV
655 if (po_lex_iconv != (iconv_t)(-1))
656 {
657 iconv_close (po_lex_iconv);
658 po_lex_iconv = (iconv_t)(-1);
659 }
660 #endif
661 po_lex_weird_cjk = false;
662 }
663