/* Charset handling while reading PO files.
   Copyright (C) 2001-2006 Free Software Foundation, Inc.
   Written by Bruno Haible <haible@clisp.cons.org>, 2001.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
18 
19 
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23 #include <alloca.h>
24 
25 /* Specification.  */
26 #include "po-charset.h"
27 
28 #include <stdlib.h>
29 #include <string.h>
30 
31 #include "xallocsa.h"
32 #include "xvasprintf.h"
33 #include "po-xerror.h"
34 #include "basename.h"
35 #include "progname.h"
36 #include "c-strstr.h"
37 #include "c-strcase.h"
38 #include "gettext.h"
39 
/* Shorthand for looking up the translation of a message.  */
#define _(str) gettext (str)

/* Number of elements of the array A.  A must be a real array, not a
   pointer.  The argument is parenthesized so that any array-valued
   expression may be passed.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))

static const char ascii[] = "ASCII";

/* The canonicalized encoding name for ASCII.  */
const char *po_charset_ascii = ascii;

static const char utf8[] = "UTF-8";

/* The canonicalized encoding name for UTF-8.  */
const char *po_charset_utf8 = utf8;
53 
54 /* Canonicalize an encoding name.  */
55 const char *
po_charset_canonicalize(const char * charset)56 po_charset_canonicalize (const char *charset)
57 {
58   /* The list of charsets supported by glibc's iconv() and by the portable
59      iconv() across platforms.  Taken from intl/config.charset.  */
60   static const char *standard_charsets[] =
61   {
62     ascii, "ANSI_X3.4-1968", "US-ASCII",	/* i = 0..2 */
63     "ISO-8859-1", "ISO_8859-1",			/* i = 3, 4 */
64     "ISO-8859-2", "ISO_8859-2",
65     "ISO-8859-3", "ISO_8859-3",
66     "ISO-8859-4", "ISO_8859-4",
67     "ISO-8859-5", "ISO_8859-5",
68     "ISO-8859-6", "ISO_8859-6",
69     "ISO-8859-7", "ISO_8859-7",
70     "ISO-8859-8", "ISO_8859-8",
71     "ISO-8859-9", "ISO_8859-9",
72     "ISO-8859-13", "ISO_8859-13",
73     "ISO-8859-14", "ISO_8859-14",
74     "ISO-8859-15", "ISO_8859-15",		/* i = 25, 26 */
75     "KOI8-R",
76     "KOI8-U",
77     "KOI8-T",
78     "CP850",
79     "CP866",
80     "CP874",
81     "CP932",
82     "CP949",
83     "CP950",
84     "CP1250",
85     "CP1251",
86     "CP1252",
87     "CP1253",
88     "CP1254",
89     "CP1255",
90     "CP1256",
91     "CP1257",
92     "GB2312",
93     "EUC-JP",
94     "EUC-KR",
95     "EUC-TW",
96     "BIG5",
97     "BIG5-HKSCS",
98     "GBK",
99     "GB18030",
100     "SHIFT_JIS",
101     "JOHAB",
102     "TIS-620",
103     "VISCII",
104     "GEORGIAN-PS",
105     utf8
106   };
107   size_t i;
108 
109   for (i = 0; i < SIZEOF (standard_charsets); i++)
110     if (c_strcasecmp (charset, standard_charsets[i]) == 0)
111       return standard_charsets[i < 3 ? 0 : i < 27 ? ((i - 3) & ~1) + 3 : i];
112   return NULL;
113 }
114 
/* Test for ASCII compatibility.
   CANON_CHARSET is compared exactly (case-sensitively) against the known
   exceptions, so it is expected to be in canonical spelling.
   Returns false for SHIFT_JIS, JOHAB and VISCII, true for any other
   name.  */
bool
po_charset_ascii_compatible (const char *canon_charset)
{
  /* There are only a few exceptions to ASCII compatibility.  */
  static const char *const exceptions[] =
  {
    "SHIFT_JIS",
    "JOHAB",
    "VISCII"
  };
  size_t i;

  for (i = 0; i < sizeof (exceptions) / sizeof (exceptions[0]); i++)
    if (strcmp (canon_charset, exceptions[i]) == 0)
      return false;
  return true;
}
127 
/* Test for a weird encoding, i.e. an encoding which has double-byte
   characters ending in 0x5C.
   CANON_CHARSET is compared exactly (case-sensitively) against the list
   below, so it is expected to be in canonical spelling.  */
bool
po_is_charset_weird (const char *canon_charset)
{
  static const char *const weird_charsets[] =
  {
    "BIG5",
    "BIG5-HKSCS",
    "GBK",
    "GB18030",
    "SHIFT_JIS",
    "JOHAB"
  };
  const size_t count = sizeof (weird_charsets) / sizeof (weird_charsets[0]);
  size_t i;

  for (i = 0; i < count; i++)
    if (strcmp (canon_charset, weird_charsets[i]) == 0)
      return true;
  return false;
}
148 
/* Test for a weird CJK encoding, i.e. a weird encoding with CJK structure.
   An encoding has CJK structure if every valid character stream is composed
   of single bytes in the range 0x{00..7F} and of byte pairs in the range
   0x{80..FF}{30..FF}.
   CANON_CHARSET is compared exactly (case-sensitively) against the list
   below, so it is expected to be in canonical spelling.  */
bool
po_is_charset_weird_cjk (const char *canon_charset)
{
  static const char *const weird_cjk_charsets[] =
  {                     /* single bytes   double bytes       */
    "BIG5",             /* 0x{00..7F},    0x{A1..F9}{40..FE} */
    "BIG5-HKSCS",       /* 0x{00..7F},    0x{88..FE}{40..FE} */
    "GBK",              /* 0x{00..7F},    0x{81..FE}{40..FE} */
    "GB18030",          /* 0x{00..7F},    0x{81..FE}{30..FE} */
    "SHIFT_JIS",        /* 0x{00..7F},    0x{81..F9}{40..FC} */
    "JOHAB"             /* 0x{00..7F},    0x{84..F9}{31..FE} */
  };
  const size_t count =
    sizeof (weird_cjk_charsets) / sizeof (weird_cjk_charsets[0]);
  size_t i;

  for (i = 0; i < count; i++)
    if (strcmp (canon_charset, weird_cjk_charsets[i]) == 0)
      return true;
  return false;
}
171 
/* Hardcoded iterator functions for all kinds of encodings.
   We could also implement a general iterator function with iconv(),
   but we need a fast one.  */
175 
/* Character iterator for 8-bit encodings.
   Every character occupies exactly one byte, so S is not inspected and
   the result is always 1.  */
static size_t
char_iterator (const char *s)
{
  (void) s;     /* Deliberately unused.  */
  return 1;
}
182 
/* Character iterator for GB2312.  See libiconv/lib/euc_cn.h.  */
/* Character iterator for EUC-KR.  See libiconv/lib/euc_kr.h.  */
/* Returns 2 if S starts with two bytes that are both in 0xA1..0xFE,
   otherwise 1.  */
static size_t
euc_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;

  /* p[1] is examined only after p[0] was found to be a lead byte, i.e.
     non-NUL, so the terminating NUL of S is never passed.  */
  if (p[0] >= 0xa1 && p[0] <= 0xfe
      && p[1] >= 0xa1 && p[1] <= 0xfe)
    return 2;
  return 1;
}
197 
/* Character iterator for EUC-JP.  See libiconv/lib/euc_jp.h.
   Recognized multibyte forms (anything else counts as 1 byte):
     0x{A1..FE}{A1..FE}          2 bytes
     0x8E{A1..DF}                2 bytes
     0x8F{A1..FE}{A1..FE}        3 bytes  */
static size_t
euc_jp_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  const unsigned char lead = p[0];

  /* Trailing bytes are examined left to right and only while the previous
     one was valid (hence non-NUL), so the end of S is never passed.  */
  if (lead >= 0xa1 && lead <= 0xfe)
    {
      if (p[1] >= 0xa1 && p[1] <= 0xfe)
        return 2;
    }
  else if (lead == 0x8e)
    {
      if (p[1] >= 0xa1 && p[1] <= 0xdf)
        return 2;
    }
  else if (lead == 0x8f)
    {
      if (p[1] >= 0xa1 && p[1] <= 0xfe
          && p[2] >= 0xa1 && p[2] <= 0xfe)
        return 3;
    }
  return 1;
}
227 
/* Character iterator for EUC-TW.  See libiconv/lib/euc_tw.h.
   Recognized multibyte forms (anything else counts as 1 byte):
     0x{A1..FE}{A1..FE}                  2 bytes
     0x8E{A1..B0}{A1..FE}{A1..FE}        4 bytes  */
static size_t
euc_tw_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  const unsigned char lead = p[0];

  /* Trailing bytes are examined left to right and only while the previous
     one was valid (hence non-NUL), so the end of S is never passed.  */
  if (lead >= 0xa1 && lead <= 0xfe)
    {
      if (p[1] >= 0xa1 && p[1] <= 0xfe)
        return 2;
    }
  else if (lead == 0x8e)
    {
      if (p[1] >= 0xa1 && p[1] <= 0xb0
          && p[2] >= 0xa1 && p[2] <= 0xfe
          && p[3] >= 0xa1 && p[3] <= 0xfe)
        return 4;
    }
  return 1;
}
255 
/* Character iterator for BIG5.  See libiconv/lib/ces_big5.h.
   Returns 2 for a lead byte in 0xA1..0xFE followed by a byte in
   0x40..0x7E or 0xA1..0xFE, otherwise 1.  */
static size_t
big5_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;

  if (p[0] >= 0xa1 && p[0] <= 0xfe)
    {
      /* Read only after a (non-NUL) lead byte was seen.  */
      const unsigned char trail = p[1];

      if ((trail >= 0x40 && trail <= 0x7e)
          || (trail >= 0xa1 && trail <= 0xfe))
        return 2;
    }
  return 1;
}
269 
/* Character iterator for BIG5-HKSCS.  See libiconv/lib/big5hkscs.h.
   Returns 2 for a lead byte in 0x88..0xFE followed by a byte in
   0x40..0x7E or 0xA1..0xFE, otherwise 1.  */
static size_t
big5hkscs_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;

  if (p[0] >= 0x88 && p[0] <= 0xfe)
    {
      /* Read only after a (non-NUL) lead byte was seen.  */
      const unsigned char trail = p[1];

      if ((trail >= 0x40 && trail <= 0x7e)
          || (trail >= 0xa1 && trail <= 0xfe))
        return 2;
    }
  return 1;
}
283 
/* Character iterator for GBK.  See libiconv/lib/ces_gbk.h and
   libiconv/lib/gbk.h.
   Returns 2 for a lead byte in 0x81..0xFE followed by a byte in
   0x40..0x7E or 0x80..0xFE, otherwise 1.  */
static size_t
gbk_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;

  if (p[0] >= 0x81 && p[0] <= 0xfe)
    {
      /* Read only after a (non-NUL) lead byte was seen.  */
      const unsigned char trail = p[1];

      if ((trail >= 0x40 && trail <= 0x7e)
          || (trail >= 0x80 && trail <= 0xfe))
        return 2;
    }
  return 1;
}
298 
/* Character iterator for GB18030.  See libiconv/lib/gb18030.h.
   Recognized multibyte forms (anything else counts as 1 byte):
     0x{81..FE}{40..7E,80..FE}                   2 bytes
     0x{81..84}{30..39}{81..FE}{30..39}          4 bytes  */
static size_t
gb18030_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  const unsigned char lead = p[0];

  if (lead >= 0x81 && lead <= 0xfe)
    {
      /* Read only after a (non-NUL) lead byte was seen.  */
      const unsigned char second = p[1];

      if ((second >= 0x40 && second <= 0x7e)
          || (second >= 0x80 && second <= 0xfe))
        return 2;

      /* The second byte of a four-byte character is a digit, so it can
         never be mistaken for the trail byte of a two-byte character.
         Each further byte is examined only if the previous one was valid
         (hence non-NUL).  */
      if (lead <= 0x84
          && second >= 0x30 && second <= 0x39
          && p[2] >= 0x81 && p[2] <= 0xfe
          && p[3] >= 0x30 && p[3] <= 0x39)
        return 4;
    }
  return 1;
}
326 
/* Character iterator for SHIFT_JIS.  See libiconv/lib/sjis.h.
   Returns 2 for a lead byte in 0x81..0x9F or 0xE0..0xF9 followed by a byte
   in 0x40..0x7E or 0x80..0xFC, otherwise 1.  */
static size_t
shift_jis_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  const unsigned char lead = p[0];

  if ((lead >= 0x81 && lead <= 0x9f) || (lead >= 0xe0 && lead <= 0xf9))
    {
      /* Read only after a (non-NUL) lead byte was seen.  */
      const unsigned char trail = p[1];

      if ((trail >= 0x40 && trail <= 0x7e)
          || (trail >= 0x80 && trail <= 0xfc))
        return 2;
    }
  return 1;
}
340 
/* Character iterator for JOHAB.  See libiconv/lib/johab.h and
   libiconv/lib/johab_hangul.h.
   Recognized two-byte forms (anything else counts as 1 byte):
     0x{84..D3}{41..7E,81..FE}
     0x{D9..F9}{31..7E,91..FE}  */
static size_t
johab_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  const unsigned char lead = p[0];

  /* In both branches p[1] is read only after a (non-NUL) lead byte was
     seen.  */
  if (lead >= 0x84 && lead <= 0xd3)
    {
      const unsigned char trail = p[1];

      if ((trail >= 0x41 && trail <= 0x7e)
          || (trail >= 0x81 && trail <= 0xfe))
        return 2;
    }
  else if (lead >= 0xd9 && lead <= 0xf9)
    {
      const unsigned char trail = p[1];

      if ((trail >= 0x31 && trail <= 0x7e)
          || (trail >= 0x91 && trail <= 0xfe))
        return 2;
    }
  return 1;
}
361 
/* Character iterator for UTF-8.  See libiconv/lib/utf8.h.
   The expected length is derived from the lead byte (0xC2..0xDF: 2,
   0xE0..0xEF: 3, 0xF0..0xF7: 4); it is returned only if all following
   bytes of the sequence are continuation bytes 0x80..0xBF.  In every
   other case the result is 1.  Beyond the lead-byte lower bound 0xC2 no
   check for overlong or out-of-range encodings is made.  */
static size_t
utf8_character_iterator (const char *s)
{
  const unsigned char *p = (const unsigned char *) s;
  const unsigned char lead = p[0];
  size_t expected;
  size_t i;

  if (lead < 0xc2 || lead >= 0xf8)
    return 1;

  if (lead < 0xe0)
    expected = 2;
  else if (lead < 0xf0)
    expected = 3;
  else
    expected = 4;

  /* Stop at the first byte that is not a continuation byte.  This also
     stops at the terminating NUL, so the end of S is never passed.  */
  for (i = 1; i < expected; i++)
    if (!(p[i] >= 0x80 && p[i] < 0xc0))
      return 1;
  return expected;
}
402 
403 /* Returns a character iterator for a given encoding.
404    Given a pointer into a string, it returns the number occupied by the next
405    single character.  If the piece of string is not valid or if the *s == '\0',
406    it returns 1.  */
407 character_iterator_t
po_charset_character_iterator(const char * canon_charset)408 po_charset_character_iterator (const char *canon_charset)
409 {
410   if (canon_charset == utf8)
411     return utf8_character_iterator;
412   if (strcmp (canon_charset, "GB2312") == 0
413       || strcmp (canon_charset, "EUC-KR") == 0)
414     return euc_character_iterator;
415   if (strcmp (canon_charset, "EUC-JP") == 0)
416     return euc_jp_character_iterator;
417   if (strcmp (canon_charset, "EUC-TW") == 0)
418     return euc_tw_character_iterator;
419   if (strcmp (canon_charset, "BIG5") == 0)
420     return big5_character_iterator;
421   if (strcmp (canon_charset, "BIG5-HKSCS") == 0)
422     return big5hkscs_character_iterator;
423   if (strcmp (canon_charset, "GBK") == 0)
424     return gbk_character_iterator;
425   if (strcmp (canon_charset, "GB18030") == 0)
426     return gb18030_character_iterator;
427   if (strcmp (canon_charset, "SHIFT_JIS") == 0)
428     return shift_jis_character_iterator;
429   if (strcmp (canon_charset, "JOHAB") == 0)
430     return johab_character_iterator;
431   return char_iterator;
432 }
433 
434 
/* The PO file's encoding, as specified in the header entry.  */
const char *po_lex_charset;

#if HAVE_ICONV
/* Converter from the PO file's encoding to UTF-8.  */
iconv_t po_lex_iconv;
#endif
/* If no converter is available, some information about the structure of the
   PO file's encoding.  */
bool po_lex_weird_cjk;
445 
446 void
po_lex_charset_init()447 po_lex_charset_init ()
448 {
449   po_lex_charset = NULL;
450 #if HAVE_ICONV
451   po_lex_iconv = (iconv_t)(-1);
452 #endif
453   po_lex_weird_cjk = false;
454 }
455 
456 void
po_lex_charset_set(const char * header_entry,const char * filename)457 po_lex_charset_set (const char *header_entry, const char *filename)
458 {
459   /* Verify the validity of CHARSET.  It is necessary
460      1. for the correct treatment of multibyte characters containing
461 	0x5C bytes in the PO lexer,
462      2. so that at run time, gettext() can call iconv() to convert
463 	msgstr.  */
464   const char *charsetstr = c_strstr (header_entry, "charset=");
465 
466   if (charsetstr != NULL)
467     {
468       size_t len;
469       char *charset;
470       const char *canon_charset;
471 
472       charsetstr += strlen ("charset=");
473       len = strcspn (charsetstr, " \t\n");
474       charset = (char *) xallocsa (len + 1);
475       memcpy (charset, charsetstr, len);
476       charset[len] = '\0';
477 
478       canon_charset = po_charset_canonicalize (charset);
479       if (canon_charset == NULL)
480 	{
481 	  /* Don't warn for POT files, because POT files usually contain
482 	     only ASCII msgids.  */
483 	  size_t filenamelen = strlen (filename);
484 
485 	  if (!(filenamelen >= 4
486 		&& memcmp (filename + filenamelen - 4, ".pot", 4) == 0
487 		&& strcmp (charset, "CHARSET") == 0))
488 	    {
489 	      char *warning_message =
490 		xasprintf (_("\
491 Charset \"%s\" is not a portable encoding name.\n\
492 Message conversion to user's charset might not work.\n"),
493 			   charset);
494 	      po_xerror (PO_SEVERITY_WARNING, NULL,
495 			 filename, (size_t)(-1), (size_t)(-1), true,
496 			 warning_message);
497 	      free (warning_message);
498 	    }
499 	}
500       else
501 	{
502 	  const char *envval;
503 
504 	  po_lex_charset = canon_charset;
505 #if HAVE_ICONV
506 	  if (po_lex_iconv != (iconv_t)(-1))
507 	    iconv_close (po_lex_iconv);
508 #endif
509 
510 	  /* The old Solaris/openwin msgfmt and GNU msgfmt <= 0.10.35
511 	     don't know about multibyte encodings, and require a spurious
512 	     backslash after every multibyte character whose last byte is
513 	     0x5C.  Some programs, like vim, distribute PO files in this
514 	     broken format.  GNU msgfmt must continue to support this old
515 	     PO file format when the Makefile requests it.  */
516 	  envval = getenv ("OLD_PO_FILE_INPUT");
517 	  if (envval != NULL && *envval != '\0')
518 	    {
519 	      /* Assume the PO file is in old format, with extraneous
520 		 backslashes.  */
521 #if HAVE_ICONV
522 	      po_lex_iconv = (iconv_t)(-1);
523 #endif
524 	      po_lex_weird_cjk = false;
525 	    }
526 	  else
527 	    {
528 	      /* Use iconv() to parse multibyte characters.  */
529 #if HAVE_ICONV
530 	      /* Avoid glibc-2.1 bug with EUC-KR.  */
531 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
532 	      if (strcmp (po_lex_charset, "EUC-KR") == 0)
533 		po_lex_iconv = (iconv_t)(-1);
534 	      else
535 # endif
536 	      /* Avoid Solaris 2.9 bug with GB2312, EUC-TW, BIG5, BIG5-HKSCS,
537 		 GBK, GB18030.  */
538 # if defined __sun && !defined _LIBICONV_VERSION
539 	      if (   strcmp (po_lex_charset, "GB2312") == 0
540 		  || strcmp (po_lex_charset, "EUC-TW") == 0
541 		  || strcmp (po_lex_charset, "BIG5") == 0
542 		  || strcmp (po_lex_charset, "BIG5-HKSCS") == 0
543 		  || strcmp (po_lex_charset, "GBK") == 0
544 		  || strcmp (po_lex_charset, "GB18030") == 0)
545 		po_lex_iconv = (iconv_t)(-1);
546 	      else
547 # endif
548 	      po_lex_iconv = iconv_open ("UTF-8", po_lex_charset);
549 	      if (po_lex_iconv == (iconv_t)(-1))
550 		{
551 		  char *warning_message;
552 		  const char *recommendation;
553 		  const char *note;
554 		  char *whole_message;
555 
556 		  warning_message =
557 		    xasprintf (_("\
558 Charset \"%s\" is not supported. %s relies on iconv(),\n\
559 and iconv() does not support \"%s\".\n"),
560 			       po_lex_charset, basename (program_name),
561 			       po_lex_charset);
562 
563 # if !defined _LIBICONV_VERSION
564 		  recommendation = _("\
565 Installing GNU libiconv and then reinstalling GNU gettext\n\
566 would fix this problem.\n");
567 # else
568 		  recommendation = "";
569 # endif
570 
571 		  /* Test for a charset which has double-byte characters
572 		     ending in 0x5C.  For these encodings, the string parser
573 		     is likely to be confused if it can't see the character
574 		     boundaries.  */
575 		  po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
576 		  if (po_is_charset_weird (po_lex_charset)
577 		      && !po_lex_weird_cjk)
578 		    note = _("Continuing anyway, expect parse errors.");
579 		  else
580 		    note = _("Continuing anyway.");
581 
582 		  whole_message =
583 		    xasprintf ("%s%s%s\n",
584 			       warning_message, recommendation, note);
585 
586 		  po_xerror (PO_SEVERITY_WARNING, NULL,
587 			     filename, (size_t)(-1), (size_t)(-1), true,
588 			     whole_message);
589 
590 		  free (whole_message);
591 		  free (warning_message);
592 		}
593 #else
594 	      /* Test for a charset which has double-byte characters
595 		 ending in 0x5C.  For these encodings, the string parser
596 		 is likely to be confused if it can't see the character
597 		 boundaries.  */
598 	      po_lex_weird_cjk = po_is_charset_weird_cjk (po_lex_charset);
599 	      if (po_is_charset_weird (po_lex_charset) && !po_lex_weird_cjk)
600 		{
601 		  char *warning_message;
602 		  const char *recommendation;
603 		  const char *note;
604 		  char *whole_message;
605 
606 		  warning_message =
607 		    xasprintf (_("\
608 Charset \"%s\" is not supported. %s relies on iconv().\n\
609 This version was built without iconv().\n"),
610 			       po_lex_charset, basename (program_name));
611 
612 		  recommendation = _("\
613 Installing GNU libiconv and then reinstalling GNU gettext\n\
614 would fix this problem.\n");
615 
616 		  note = _("Continuing anyway, expect parse errors.");
617 
618 		  whole_message =
619 		    xasprintf ("%s%s%s\n",
620 			       warning_message, recommendation, note);
621 
622 		  po_xerror (PO_SEVERITY_WARNING, NULL,
623 			     filename, (size_t)(-1), (size_t)(-1), true,
624 			     whole_message);
625 
626 		  free (whole_message);
627 		  free (warning_message);
628 		}
629 #endif
630 	    }
631 	}
632       freesa (charset);
633     }
634   else
635     {
636       /* Don't warn for POT files, because POT files usually contain
637 	 only ASCII msgids.  */
638       size_t filenamelen = strlen (filename);
639 
640       if (!(filenamelen >= 4
641 	    && memcmp (filename + filenamelen - 4, ".pot", 4) == 0))
642 	po_xerror (PO_SEVERITY_WARNING,
643 		   NULL, filename, (size_t)(-1), (size_t)(-1), true,
644 		   _("\
645 Charset missing in header.\n\
646 Message conversion to user's charset will not work.\n"));
647     }
648 }
649 
650 void
po_lex_charset_close()651 po_lex_charset_close ()
652 {
653   po_lex_charset = NULL;
654 #if HAVE_ICONV
655   if (po_lex_iconv != (iconv_t)(-1))
656     {
657       iconv_close (po_lex_iconv);
658       po_lex_iconv = (iconv_t)(-1);
659     }
660 #endif
661   po_lex_weird_cjk = false;
662 }
663