xref: /netbsd-src/external/gpl2/gettext/dist/gettext-tools/src/msgl-iconv.c (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1 /* Message list charset and locale charset handling.
2    Copyright (C) 2001-2003, 2005-2006 Free Software Foundation, Inc.
3    Written by Bruno Haible <haible@clisp.cons.org>, 2001.
4 
5    This program is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 2, or (at your option)
8    any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program; if not, write to the Free Software Foundation,
17    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
18 
19 
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23 #include <alloca.h>
24 
25 /* Specification.  */
26 #include "msgl-iconv.h"
27 
28 #include <stdbool.h>
29 #include <stdlib.h>
30 #include <string.h>
31 
32 #if HAVE_ICONV
33 # include <iconv.h>
34 #endif
35 
36 #include "progname.h"
37 #include "basename.h"
38 #include "message.h"
39 #include "po-charset.h"
40 #include "xstriconv.h"
41 #include "msgl-ascii.h"
42 #include "xalloc.h"
43 #include "xallocsa.h"
44 #include "c-strstr.h"
45 #include "xvasprintf.h"
46 #include "po-xerror.h"
47 #include "gettext.h"
48 
49 #define _(str) gettext (str)
50 
51 
52 #if HAVE_ICONV
53 
54 static void conversion_error (const struct conversion_context* context)
55 #if defined __GNUC__ && ((__GNUC__ == 2 && __GNUC_MINOR__ >= 5) || __GNUC__ > 2)
56      __attribute__ ((noreturn))
57 #endif
58 ;
59 static void
conversion_error(const struct conversion_context * context)60 conversion_error (const struct conversion_context* context)
61 {
62   if (context->to_code == po_charset_utf8)
63     /* If a conversion to UTF-8 fails, the problem lies in the input.  */
64     po_xerror (PO_SEVERITY_FATAL_ERROR, context->message, NULL, 0, 0, false,
65 	       xasprintf (_("%s: input is not valid in \"%s\" encoding"),
66 			  context->from_filename, context->from_code));
67   else
68     po_xerror (PO_SEVERITY_FATAL_ERROR, context->message, NULL, 0, 0, false,
69 	       xasprintf (_("\
70 %s: error while converting from \"%s\" encoding to \"%s\" encoding"),
71 			  context->from_filename, context->from_code,
72 			  context->to_code));
73   /* NOTREACHED */
74   abort ();
75 }
76 
/* Convert the NUL-terminated STRING through the conversion descriptor CD
   and return the buffer produced by xmem_cd_iconv.  The terminating NUL is
   passed through the conversion together with the text; the result is
   accepted only if it contains exactly one NUL byte, located at its end.
   On any failure conversion_error (CONTEXT) is called.  */
char *
convert_string (iconv_t cd, const char *string,
                const struct conversion_context* context)
{
  char *converted = NULL;
  size_t converted_len;
  bool ok;

  ok = (xmem_cd_iconv (string, strlen (string) + 1, cd,
                       &converted, &converted_len) == 0
        /* Verify the result has exactly one NUL byte, at the end.  */
        && converted_len > 0
        && converted[converted_len - 1] == '\0'
        && strlen (converted) == converted_len - 1);

  if (!ok)
    {
      conversion_error (context);
      /* NOTREACHED */
      return NULL;
    }

  return converted;
}
95 
96 static void
convert_string_list(iconv_t cd,string_list_ty * slp,const struct conversion_context * context)97 convert_string_list (iconv_t cd, string_list_ty *slp,
98 		     const struct conversion_context* context)
99 {
100   size_t i;
101 
102   if (slp != NULL)
103     for (i = 0; i < slp->nitems; i++)
104       slp->item[i] = convert_string (cd, slp->item[i], context);
105 }
106 
107 static void
convert_prev_msgid(iconv_t cd,message_ty * mp,const struct conversion_context * context)108 convert_prev_msgid (iconv_t cd, message_ty *mp,
109 		    const struct conversion_context* context)
110 {
111   if (mp->prev_msgctxt != NULL)
112     mp->prev_msgctxt = convert_string (cd, mp->prev_msgctxt, context);
113   if (mp->prev_msgid != NULL)
114     mp->prev_msgid = convert_string (cd, mp->prev_msgid, context);
115   if (mp->prev_msgid_plural != NULL)
116     mp->prev_msgid_plural = convert_string (cd, mp->prev_msgid_plural, context);
117 }
118 
119 static void
convert_msgid(iconv_t cd,message_ty * mp,const struct conversion_context * context)120 convert_msgid (iconv_t cd, message_ty *mp,
121 	       const struct conversion_context* context)
122 {
123   if (mp->msgctxt != NULL)
124     mp->msgctxt = convert_string (cd, mp->msgctxt, context);
125   mp->msgid = convert_string (cd, mp->msgid, context);
126   if (mp->msgid_plural != NULL)
127     mp->msgid_plural = convert_string (cd, mp->msgid_plural, context);
128 }
129 
130 static void
convert_msgstr(iconv_t cd,message_ty * mp,const struct conversion_context * context)131 convert_msgstr (iconv_t cd, message_ty *mp,
132 		const struct conversion_context* context)
133 {
134   char *result = NULL;
135   size_t resultlen;
136 
137   if (!(mp->msgstr_len > 0 && mp->msgstr[mp->msgstr_len - 1] == '\0'))
138     abort ();
139 
140   if (xmem_cd_iconv (mp->msgstr, mp->msgstr_len, cd, &result, &resultlen) == 0)
141     /* Verify the result has a NUL byte at the end.  */
142     if (resultlen > 0 && result[resultlen - 1] == '\0')
143       /* Verify the result has the same number of NUL bytes.  */
144       {
145 	const char *p;
146 	const char *pend;
147 	int nulcount1;
148 	int nulcount2;
149 
150 	for (p = mp->msgstr, pend = p + mp->msgstr_len, nulcount1 = 0;
151 	     p < pend;
152 	     p += strlen (p) + 1, nulcount1++);
153 	for (p = result, pend = p + resultlen, nulcount2 = 0;
154 	     p < pend;
155 	     p += strlen (p) + 1, nulcount2++);
156 
157 	if (nulcount1 == nulcount2)
158 	  {
159 	    mp->msgstr = result;
160 	    mp->msgstr_len = resultlen;
161 	    return;
162 	  }
163       }
164 
165   conversion_error (context);
166 }
167 
168 #endif
169 
170 
171 bool
iconv_message_list(message_list_ty * mlp,const char * canon_from_code,const char * canon_to_code,const char * from_filename)172 iconv_message_list (message_list_ty *mlp,
173 		    const char *canon_from_code, const char *canon_to_code,
174 		    const char *from_filename)
175 {
176   bool canon_from_code_overridden = (canon_from_code != NULL);
177   bool msgids_changed;
178   size_t j;
179 
180   /* If the list is empty, nothing to do.  */
181   if (mlp->nitems == 0)
182     return false;
183 
184   /* Search the header entry, and extract and replace the charset name.  */
185   for (j = 0; j < mlp->nitems; j++)
186     if (is_header (mlp->item[j]) && !mlp->item[j]->obsolete)
187       {
188 	const char *header = mlp->item[j]->msgstr;
189 
190 	if (header != NULL)
191 	  {
192 	    const char *charsetstr = c_strstr (header, "charset=");
193 
194 	    if (charsetstr != NULL)
195 	      {
196 		size_t len;
197 		char *charset;
198 		const char *canon_charset;
199 		size_t len1, len2, len3;
200 		char *new_header;
201 
202 		charsetstr += strlen ("charset=");
203 		len = strcspn (charsetstr, " \t\n");
204 		charset = (char *) xallocsa (len + 1);
205 		memcpy (charset, charsetstr, len);
206 		charset[len] = '\0';
207 
208 		canon_charset = po_charset_canonicalize (charset);
209 		if (canon_charset == NULL)
210 		  {
211 		    if (!canon_from_code_overridden)
212 		      {
213 			/* Don't give an error for POT files, because POT
214 			   files usually contain only ASCII msgids.  */
215 			const char *filename = from_filename;
216 			size_t filenamelen;
217 
218 			if (filename != NULL
219 			    && (filenamelen = strlen (filename)) >= 4
220 			    && memcmp (filename + filenamelen - 4, ".pot", 4)
221 			       == 0
222 			    && strcmp (charset, "CHARSET") == 0)
223 			  canon_charset = po_charset_ascii;
224 			else
225 			  po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0,
226 				     false, xasprintf (_("\
227 present charset \"%s\" is not a portable encoding name"),
228 						charset));
229 		      }
230 		  }
231 		else
232 		  {
233 		    if (canon_from_code == NULL)
234 		      canon_from_code = canon_charset;
235 		    else if (canon_from_code != canon_charset)
236 		      po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0,  0,
237 				 false,
238 				 xasprintf (_("\
239 two different charsets \"%s\" and \"%s\" in input file"),
240 					    canon_from_code, canon_charset));
241 		  }
242 		freesa (charset);
243 
244 		len1 = charsetstr - header;
245 		len2 = strlen (canon_to_code);
246 		len3 = (header + strlen (header)) - (charsetstr + len);
247 		new_header = (char *) xmalloc (len1 + len2 + len3 + 1);
248 		memcpy (new_header, header, len1);
249 		memcpy (new_header + len1, canon_to_code, len2);
250 		memcpy (new_header + len1 + len2, charsetstr + len, len3 + 1);
251 		mlp->item[j]->msgstr = new_header;
252 		mlp->item[j]->msgstr_len = len1 + len2 + len3 + 1;
253 	      }
254 	  }
255       }
256   if (canon_from_code == NULL)
257     {
258       if (is_ascii_message_list (mlp))
259 	canon_from_code = po_charset_ascii;
260       else
261 	po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
262 		   _("\
263 input file doesn't contain a header entry with a charset specification"));
264     }
265 
266   msgids_changed = false;
267 
268   /* If the two encodings are the same, nothing to do.  */
269   if (canon_from_code != canon_to_code)
270     {
271 #if HAVE_ICONV
272       iconv_t cd;
273       struct conversion_context context;
274 
275       /* Avoid glibc-2.1 bug with EUC-KR.  */
276 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
277       if (strcmp (canon_from_code, "EUC-KR") == 0)
278 	cd = (iconv_t)(-1);
279       else
280 # endif
281       cd = iconv_open (canon_to_code, canon_from_code);
282       if (cd == (iconv_t)(-1))
283 	po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
284 		   xasprintf (_("\
285 Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), \
286 and iconv() does not support this conversion."),
287 			      canon_from_code, canon_to_code,
288 			      basename (program_name)));
289 
290       context.from_code = canon_from_code;
291       context.to_code = canon_to_code;
292       context.from_filename = from_filename;
293 
294       for (j = 0; j < mlp->nitems; j++)
295 	{
296 	  message_ty *mp = mlp->item[j];
297 
298 	  if ((mp->msgctxt != NULL && !is_ascii_string (mp->msgctxt))
299 	      || !is_ascii_string (mp->msgid))
300 	    msgids_changed = true;
301 	  context.message = mp;
302 	  convert_string_list (cd, mp->comment, &context);
303 	  convert_string_list (cd, mp->comment_dot, &context);
304 	  convert_prev_msgid (cd, mp, &context);
305 	  convert_msgid (cd, mp, &context);
306 	  convert_msgstr (cd, mp, &context);
307 	}
308 
309       iconv_close (cd);
310 
311       if (msgids_changed)
312 	if (message_list_msgids_changed (mlp))
313 	  po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
314 		     xasprintf (_("\
315 Conversion from \"%s\" to \"%s\" introduces duplicates: \
316 some different msgids become equal."),
317 				canon_from_code, canon_to_code));
318 #else
319 	  po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
320 		     xasprintf (_("\
321 Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). \
322 This version was built without iconv()."),
323 				canon_from_code, canon_to_code,
324 				basename (program_name)));
325 #endif
326     }
327 
328   return msgids_changed;
329 }
330 
331 msgdomain_list_ty *
iconv_msgdomain_list(msgdomain_list_ty * mdlp,const char * to_code,const char * from_filename)332 iconv_msgdomain_list (msgdomain_list_ty *mdlp,
333 		      const char *to_code,
334 		      const char *from_filename)
335 {
336   const char *canon_to_code;
337   size_t k;
338 
339   /* Canonicalize target encoding.  */
340   canon_to_code = po_charset_canonicalize (to_code);
341   if (canon_to_code == NULL)
342     po_xerror (PO_SEVERITY_FATAL_ERROR, NULL, NULL, 0, 0, false,
343 	       xasprintf (_("\
344 target charset \"%s\" is not a portable encoding name."),
345 			  to_code));
346 
347   for (k = 0; k < mdlp->nitems; k++)
348     iconv_message_list (mdlp->item[k]->messages, mdlp->encoding, canon_to_code,
349 			from_filename);
350 
351   mdlp->encoding = canon_to_code;
352   return mdlp;
353 }
354 
355 #if HAVE_ICONV
356 
/* Test whether the NUL-terminated STRING can be converted through CD.
   The terminating NUL is converted along with the text.  Returns true
   only if xmem_cd_iconv succeeds and the result contains exactly one NUL
   byte, at its end.  The converted buffer is freed before returning.  */
static bool
iconvable_string (iconv_t cd, const char *string)
{
  char *converted = NULL;
  size_t converted_len;
  bool ok;

  if (xmem_cd_iconv (string, strlen (string) + 1, cd,
                     &converted, &converted_len) != 0)
    return false;

  /* Test if the result has exactly one NUL byte, at the end.  */
  ok = (converted_len > 0
        && converted[converted_len - 1] == '\0'
        && strlen (converted) == converted_len - 1);

  free (converted);
  return ok;
}
374 
375 static bool
iconvable_string_list(iconv_t cd,string_list_ty * slp)376 iconvable_string_list (iconv_t cd, string_list_ty *slp)
377 {
378   size_t i;
379 
380   if (slp != NULL)
381     for (i = 0; i < slp->nitems; i++)
382       if (!iconvable_string (cd, slp->item[i]))
383 	return false;
384   return true;
385 }
386 
387 static bool
iconvable_prev_msgid(iconv_t cd,message_ty * mp)388 iconvable_prev_msgid (iconv_t cd, message_ty *mp)
389 {
390   if (mp->prev_msgctxt != NULL)
391     if (!iconvable_string (cd, mp->prev_msgctxt))
392       return false;
393   if (mp->prev_msgid != NULL)
394     if (!iconvable_string (cd, mp->prev_msgid))
395       return false;
396   if (mp->msgid_plural != NULL)
397     if (!iconvable_string (cd, mp->prev_msgid_plural))
398       return false;
399   return true;
400 }
401 
402 static bool
iconvable_msgid(iconv_t cd,message_ty * mp)403 iconvable_msgid (iconv_t cd, message_ty *mp)
404 {
405   if (mp->msgctxt != NULL)
406     if (!iconvable_string (cd, mp->msgctxt))
407       return false;
408   if (!iconvable_string (cd, mp->msgid))
409     return false;
410   if (mp->msgid_plural != NULL)
411     if (!iconvable_string (cd, mp->msgid_plural))
412       return false;
413   return true;
414 }
415 
416 static bool
iconvable_msgstr(iconv_t cd,message_ty * mp)417 iconvable_msgstr (iconv_t cd, message_ty *mp)
418 {
419   char *result = NULL;
420   size_t resultlen;
421 
422   if (!(mp->msgstr_len > 0 && mp->msgstr[mp->msgstr_len - 1] == '\0'))
423     abort ();
424 
425   if (xmem_cd_iconv (mp->msgstr, mp->msgstr_len, cd, &result, &resultlen) == 0)
426     {
427       bool ok = false;
428 
429       /* Test if the result has a NUL byte at the end.  */
430       if (resultlen > 0 && result[resultlen - 1] == '\0')
431 	/* Test if the result has the same number of NUL bytes.  */
432 	{
433 	  const char *p;
434 	  const char *pend;
435 	  int nulcount1;
436 	  int nulcount2;
437 
438 	  for (p = mp->msgstr, pend = p + mp->msgstr_len, nulcount1 = 0;
439 	       p < pend;
440 	       p += strlen (p) + 1, nulcount1++);
441 	  for (p = result, pend = p + resultlen, nulcount2 = 0;
442 	       p < pend;
443 	       p += strlen (p) + 1, nulcount2++);
444 
445 	  if (nulcount1 == nulcount2)
446 	    ok = true;
447 	}
448 
449       free (result);
450       return ok;
451     }
452   return false;
453 }
454 
455 #endif
456 
457 bool
is_message_list_iconvable(message_list_ty * mlp,const char * canon_from_code,const char * canon_to_code)458 is_message_list_iconvable (message_list_ty *mlp,
459 			   const char *canon_from_code,
460 			   const char *canon_to_code)
461 {
462   bool canon_from_code_overridden = (canon_from_code != NULL);
463   size_t j;
464 
465   /* If the list is empty, nothing to check.  */
466   if (mlp->nitems == 0)
467     return true;
468 
469   /* Search the header entry, and extract the charset name.  */
470   for (j = 0; j < mlp->nitems; j++)
471     if (is_header (mlp->item[j]) && !mlp->item[j]->obsolete)
472       {
473 	const char *header = mlp->item[j]->msgstr;
474 
475 	if (header != NULL)
476 	  {
477 	    const char *charsetstr = c_strstr (header, "charset=");
478 
479 	    if (charsetstr != NULL)
480 	      {
481 		size_t len;
482 		char *charset;
483 		const char *canon_charset;
484 
485 		charsetstr += strlen ("charset=");
486 		len = strcspn (charsetstr, " \t\n");
487 		charset = (char *) xallocsa (len + 1);
488 		memcpy (charset, charsetstr, len);
489 		charset[len] = '\0';
490 
491 		canon_charset = po_charset_canonicalize (charset);
492 		if (canon_charset == NULL)
493 		  {
494 		    if (!canon_from_code_overridden)
495 		      {
496 			/* Don't give an error for POT files, because POT
497 			   files usually contain only ASCII msgids.  */
498 			if (strcmp (charset, "CHARSET") == 0)
499 			  canon_charset = po_charset_ascii;
500 			else
501 			  {
502 			    /* charset is not a portable encoding name.  */
503 			    freesa (charset);
504 			    return false;
505 			  }
506 		      }
507 		  }
508 		else
509 		  {
510 		    if (canon_from_code == NULL)
511 		      canon_from_code = canon_charset;
512 		    else if (canon_from_code != canon_charset)
513 		      {
514 			/* Two different charsets in input file.  */
515 			freesa (charset);
516 			return false;
517 		      }
518 		  }
519 		freesa (charset);
520 	      }
521 	  }
522       }
523   if (canon_from_code == NULL)
524     {
525       if (is_ascii_message_list (mlp))
526 	canon_from_code = po_charset_ascii;
527       else
528 	/* Input file lacks a header entry with a charset specification.  */
529 	return false;
530     }
531 
532   /* If the two encodings are the same, nothing to check.  */
533   if (canon_from_code != canon_to_code)
534     {
535 #if HAVE_ICONV
536       iconv_t cd;
537 
538       /* Avoid glibc-2.1 bug with EUC-KR.  */
539 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
540       if (strcmp (canon_from_code, "EUC-KR") == 0)
541 	cd = (iconv_t)(-1);
542       else
543 # endif
544       cd = iconv_open (canon_to_code, canon_from_code);
545       if (cd == (iconv_t)(-1))
546 	/* iconv() doesn't support this conversion.  */
547 	return false;
548 
549       for (j = 0; j < mlp->nitems; j++)
550 	{
551 	  message_ty *mp = mlp->item[j];
552 
553 	  if (!(iconvable_string_list (cd, mp->comment)
554 		&& iconvable_string_list (cd, mp->comment_dot)
555 		&& iconvable_prev_msgid (cd, mp)
556 		&& iconvable_msgid (cd, mp)
557 		&& iconvable_msgstr (cd, mp)))
558 	    return false;
559 	}
560 
561       iconv_close (cd);
562 #else
563       /* This version was built without iconv().  */
564       return false;
565 #endif
566     }
567 
568   return true;
569 }
570