/* Charset conversion.
   Copyright (C) 2001-2006 Free Software Foundation, Inc.
   Written by Bruno Haible and Simon Josefsson.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software Foundation,
   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
18*946379e7Schristos
19*946379e7Schristos #include <config.h>
20*946379e7Schristos
21*946379e7Schristos /* Specification. */
22*946379e7Schristos #include "striconv.h"
23*946379e7Schristos
24*946379e7Schristos #include <errno.h>
25*946379e7Schristos #include <stdlib.h>
26*946379e7Schristos #include <string.h>
27*946379e7Schristos
28*946379e7Schristos #if HAVE_ICONV
29*946379e7Schristos # include <iconv.h>
30*946379e7Schristos /* Get MB_LEN_MAX, CHAR_BIT. */
31*946379e7Schristos # include <limits.h>
32*946379e7Schristos #endif
33*946379e7Schristos
34*946379e7Schristos #include "strdup.h"
35*946379e7Schristos #include "c-strcase.h"
36*946379e7Schristos
37*946379e7Schristos #ifndef SIZE_MAX
38*946379e7Schristos # define SIZE_MAX ((size_t) -1)
39*946379e7Schristos #endif
40*946379e7Schristos
41*946379e7Schristos
42*946379e7Schristos #if HAVE_ICONV
43*946379e7Schristos
/* Convert the SRCLEN bytes starting at SRC through the conversion
   descriptor CD.

   The conversion runs in two passes: the first pass converts into a
   scratch buffer only to measure the size of the output, the second pass
   converts into a buffer of exactly that size.

   *RESULTP must be NULL or point to malloc()ed memory on entry.  If the
   converted text is non-empty, a buffer is obtained with malloc or realloc
   and its address and length are stored in *RESULTP and *LENGTHP.  If the
   converted text is empty, *LENGTHP is set to 0 and *RESULTP is left
   untouched.

   An incomplete multibyte sequence at the very end of SRC (iconv failing
   with EINVAL) is dropped; everything in front of it is converted.

   Returns 0 on success, or -1 with errno set on failure.  On failure
   *RESULTP may already have been replaced by a reallocated buffer, so the
   caller stays responsible for freeing *RESULTP.  */
int
mem_cd_iconv (const char *src, size_t srclen, iconv_t cd,
              char **resultp, size_t *lengthp)
{
# define tmpbufsize 4096
  size_t length;
  /* Number of leading bytes of SRC that iconv is able to consume, i.e.
     SRCLEN minus an incomplete multibyte sequence at the end, if any.  */
  size_t convertible = srclen;
  char *result;

  /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
# if defined _LIBICONV_VERSION \
    || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
  /* Set to the initial state.  */
  iconv (cd, NULL, NULL, NULL, NULL);
# endif

  /* Determine the length we need.  */
  {
    size_t count = 0;
    char tmpbuf[tmpbufsize];
    const char *inptr = src;
    size_t insize = srclen;

    while (insize > 0)
      {
        char *outptr = tmpbuf;
        size_t outsize = tmpbufsize;
        size_t res = iconv (cd,
                            (ICONV_CONST char **) &inptr, &insize,
                            &outptr, &outsize);

        if (res == (size_t)(-1))
          {
            if (errno == E2BIG)
              /* The scratch buffer is full: count it below and go on.  */
              ;
            else if (errno == EINVAL)
              {
                /* SRC ends in an incomplete multibyte sequence.  The bytes
                   produced by this call still belong to the result, so
                   they must be counted before leaving the loop; otherwise
                   the buffer allocated below would be too small.  Also
                   remember where the convertible part of SRC ends, so that
                   the second pass never sees the incomplete tail.  */
                count += outptr - tmpbuf;
                convertible = srclen - insize;
                break;
              }
            else
              return -1;
          }
# if !defined _LIBICONV_VERSION && !defined __GLIBC__
        /* Irix iconv() inserts a NUL byte if it cannot convert.
           NetBSD iconv() inserts a question mark if it cannot convert.
           Only GNU libiconv and GNU libc are known to prefer to fail rather
           than doing a lossy conversion.  */
        else if (res > 0)
          {
            errno = EILSEQ;
            return -1;
          }
# endif
        count += outptr - tmpbuf;
      }
    /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  */
# if defined _LIBICONV_VERSION \
    || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
    {
      /* Account for the bytes needed to return to the initial shift
         state.  */
      char *outptr = tmpbuf;
      size_t outsize = tmpbufsize;
      size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);

      if (res == (size_t)(-1))
        return -1;
      count += outptr - tmpbuf;
    }
# endif
    length = count;
  }

  if (length == 0)
    {
      /* Nothing to store; *RESULTP is deliberately left as it was.  */
      *lengthp = 0;
      return 0;
    }
  result = (*resultp != NULL ? realloc (*resultp, length) : malloc (length));
  if (result == NULL)
    {
      errno = ENOMEM;
      return -1;
    }
  /* Publish the buffer right away, so that the caller can free it even if
     the second pass fails.  */
  *resultp = result;
  *lengthp = length;

  /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug.  */
# if defined _LIBICONV_VERSION \
    || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
  /* Return to the initial state.  */
  iconv (cd, NULL, NULL, NULL, NULL);
# endif

  /* Do the conversion for real.  */
  {
    const char *inptr = src;
    size_t insize = convertible;
    char *outptr = result;
    size_t outsize = length;

    while (insize > 0)
      {
        size_t res = iconv (cd,
                            (ICONV_CONST char **) &inptr, &insize,
                            &outptr, &outsize);

        if (res == (size_t)(-1))
          {
            if (errno == EINVAL)
              break;
            else
              return -1;
          }
# if !defined _LIBICONV_VERSION && !defined __GLIBC__
        /* Irix iconv() inserts a NUL byte if it cannot convert.
           NetBSD iconv() inserts a question mark if it cannot convert.
           Only GNU libiconv and GNU libc are known to prefer to fail rather
           than doing a lossy conversion.  */
        else if (res > 0)
          {
            errno = EILSEQ;
            return -1;
          }
# endif
      }
    /* Avoid glibc-2.1 bug and Solaris 2.7 bug.  */
# if defined _LIBICONV_VERSION \
    || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
    {
      size_t res = iconv (cd, NULL, NULL, &outptr, &outsize);

      if (res == (size_t)(-1))
        return -1;
    }
# endif
    /* Both passes must have produced exactly the same number of bytes.  */
    if (outsize != 0)
      abort ();
  }

  return 0;
# undef tmpbufsize
}
182*946379e7Schristos
183*946379e7Schristos char *
str_cd_iconv(const char * src,iconv_t cd)184*946379e7Schristos str_cd_iconv (const char *src, iconv_t cd)
185*946379e7Schristos {
186*946379e7Schristos /* For most encodings, a trailing NUL byte in the input will be converted
187*946379e7Schristos to a trailing NUL byte in the output. But not for UTF-7. So that this
188*946379e7Schristos function is usable for UTF-7, we have to exclude the NUL byte from the
189*946379e7Schristos conversion and add it by hand afterwards. */
190*946379e7Schristos # if PROBABLY_SLOWER
191*946379e7Schristos
192*946379e7Schristos char *result = NULL;
193*946379e7Schristos size_t length;
194*946379e7Schristos int retval = mem_cd_iconv (src, strlen (src), cd, &result, &length);
195*946379e7Schristos char *final_result;
196*946379e7Schristos
197*946379e7Schristos if (retval < 0)
198*946379e7Schristos {
199*946379e7Schristos if (result != NULL)
200*946379e7Schristos {
201*946379e7Schristos int saved_errno = errno;
202*946379e7Schristos free (result);
203*946379e7Schristos errno = saved_errno;
204*946379e7Schristos }
205*946379e7Schristos return NULL;
206*946379e7Schristos }
207*946379e7Schristos
208*946379e7Schristos /* Add the terminating NUL byte. */
209*946379e7Schristos final_result =
210*946379e7Schristos (result != NULL ? realloc (result, length + 1) : malloc (length + 1));
211*946379e7Schristos if (final_result == NULL)
212*946379e7Schristos {
213*946379e7Schristos if (result != NULL)
214*946379e7Schristos free (result);
215*946379e7Schristos errno = ENOMEM;
216*946379e7Schristos return NULL;
217*946379e7Schristos }
218*946379e7Schristos final_result[length] = '\0';
219*946379e7Schristos
220*946379e7Schristos return final_result;
221*946379e7Schristos
222*946379e7Schristos # else
223*946379e7Schristos
224*946379e7Schristos char *result;
225*946379e7Schristos size_t result_size;
226*946379e7Schristos size_t length;
227*946379e7Schristos const char *inptr = src;
228*946379e7Schristos size_t inbytes_remaining = strlen (src);
229*946379e7Schristos
230*946379e7Schristos /* Make a guess for the worst-case output size, in order to avoid a
231*946379e7Schristos realloc. It's OK if the guess is wrong as long as it is not zero and
232*946379e7Schristos doesn't lead to an integer overflow. */
233*946379e7Schristos result_size = inbytes_remaining;
234*946379e7Schristos {
235*946379e7Schristos size_t approx_sqrt_SIZE_MAX = SIZE_MAX >> (sizeof (size_t) * CHAR_BIT / 2);
236*946379e7Schristos if (result_size <= approx_sqrt_SIZE_MAX / MB_LEN_MAX)
237*946379e7Schristos result_size *= MB_LEN_MAX;
238*946379e7Schristos }
239*946379e7Schristos result_size += 1; /* for the terminating NUL */
240*946379e7Schristos
241*946379e7Schristos result = (char *) malloc (result_size);
242*946379e7Schristos if (result == NULL)
243*946379e7Schristos {
244*946379e7Schristos errno = ENOMEM;
245*946379e7Schristos return NULL;
246*946379e7Schristos }
247*946379e7Schristos
248*946379e7Schristos /* Avoid glibc-2.1 bug and Solaris 2.7-2.9 bug. */
249*946379e7Schristos # if defined _LIBICONV_VERSION \
250*946379e7Schristos || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
251*946379e7Schristos /* Set to the initial state. */
252*946379e7Schristos iconv (cd, NULL, NULL, NULL, NULL);
253*946379e7Schristos # endif
254*946379e7Schristos
255*946379e7Schristos /* Do the conversion. */
256*946379e7Schristos {
257*946379e7Schristos char *outptr = result;
258*946379e7Schristos size_t outbytes_remaining = result_size - 1;
259*946379e7Schristos
260*946379e7Schristos for (;;)
261*946379e7Schristos {
262*946379e7Schristos /* Here inptr + inbytes_remaining = src + strlen (src),
263*946379e7Schristos outptr + outbytes_remaining = result + result_size - 1. */
264*946379e7Schristos size_t res = iconv (cd,
265*946379e7Schristos (ICONV_CONST char **) &inptr, &inbytes_remaining,
266*946379e7Schristos &outptr, &outbytes_remaining);
267*946379e7Schristos
268*946379e7Schristos if (res == (size_t)(-1))
269*946379e7Schristos {
270*946379e7Schristos if (errno == EINVAL)
271*946379e7Schristos break;
272*946379e7Schristos else if (errno == E2BIG)
273*946379e7Schristos {
274*946379e7Schristos size_t used = outptr - result;
275*946379e7Schristos size_t newsize = result_size * 2;
276*946379e7Schristos char *newresult;
277*946379e7Schristos
278*946379e7Schristos if (!(newsize > result_size))
279*946379e7Schristos {
280*946379e7Schristos errno = ENOMEM;
281*946379e7Schristos goto failed;
282*946379e7Schristos }
283*946379e7Schristos newresult = (char *) realloc (result, newsize);
284*946379e7Schristos if (newresult == NULL)
285*946379e7Schristos {
286*946379e7Schristos errno = ENOMEM;
287*946379e7Schristos goto failed;
288*946379e7Schristos }
289*946379e7Schristos result = newresult;
290*946379e7Schristos result_size = newsize;
291*946379e7Schristos outptr = result + used;
292*946379e7Schristos outbytes_remaining = result_size - 1 - used;
293*946379e7Schristos }
294*946379e7Schristos else
295*946379e7Schristos goto failed;
296*946379e7Schristos }
297*946379e7Schristos # if !defined _LIBICONV_VERSION && !defined __GLIBC__
298*946379e7Schristos /* Irix iconv() inserts a NUL byte if it cannot convert.
299*946379e7Schristos NetBSD iconv() inserts a question mark if it cannot convert.
300*946379e7Schristos Only GNU libiconv and GNU libc are known to prefer to fail rather
301*946379e7Schristos than doing a lossy conversion. */
302*946379e7Schristos else if (res > 0)
303*946379e7Schristos {
304*946379e7Schristos errno = EILSEQ;
305*946379e7Schristos goto failed;
306*946379e7Schristos }
307*946379e7Schristos # endif
308*946379e7Schristos else
309*946379e7Schristos break;
310*946379e7Schristos }
311*946379e7Schristos /* Avoid glibc-2.1 bug and Solaris 2.7 bug. */
312*946379e7Schristos # if defined _LIBICONV_VERSION \
313*946379e7Schristos || !((__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) || defined __sun)
314*946379e7Schristos for (;;)
315*946379e7Schristos {
316*946379e7Schristos /* Here outptr + outbytes_remaining = result + result_size - 1. */
317*946379e7Schristos size_t res = iconv (cd, NULL, NULL, &outptr, &outbytes_remaining);
318*946379e7Schristos
319*946379e7Schristos if (res == (size_t)(-1))
320*946379e7Schristos {
321*946379e7Schristos if (errno == E2BIG)
322*946379e7Schristos {
323*946379e7Schristos size_t used = outptr - result;
324*946379e7Schristos size_t newsize = result_size * 2;
325*946379e7Schristos char *newresult;
326*946379e7Schristos
327*946379e7Schristos if (!(newsize > result_size))
328*946379e7Schristos {
329*946379e7Schristos errno = ENOMEM;
330*946379e7Schristos goto failed;
331*946379e7Schristos }
332*946379e7Schristos newresult = (char *) realloc (result, newsize);
333*946379e7Schristos if (newresult == NULL)
334*946379e7Schristos {
335*946379e7Schristos errno = ENOMEM;
336*946379e7Schristos goto failed;
337*946379e7Schristos }
338*946379e7Schristos result = newresult;
339*946379e7Schristos result_size = newsize;
340*946379e7Schristos outptr = result + used;
341*946379e7Schristos outbytes_remaining = result_size - 1 - used;
342*946379e7Schristos }
343*946379e7Schristos else
344*946379e7Schristos goto failed;
345*946379e7Schristos }
346*946379e7Schristos else
347*946379e7Schristos break;
348*946379e7Schristos }
349*946379e7Schristos # endif
350*946379e7Schristos
351*946379e7Schristos /* Add the terminating NUL byte. */
352*946379e7Schristos *outptr++ = '\0';
353*946379e7Schristos
354*946379e7Schristos length = outptr - result;
355*946379e7Schristos }
356*946379e7Schristos
357*946379e7Schristos /* Give away unused memory. */
358*946379e7Schristos if (length < result_size)
359*946379e7Schristos {
360*946379e7Schristos char *smaller_result = (char *) realloc (result, length);
361*946379e7Schristos
362*946379e7Schristos if (smaller_result != NULL)
363*946379e7Schristos result = smaller_result;
364*946379e7Schristos }
365*946379e7Schristos
366*946379e7Schristos return result;
367*946379e7Schristos
368*946379e7Schristos failed:
369*946379e7Schristos {
370*946379e7Schristos int saved_errno = errno;
371*946379e7Schristos free (result);
372*946379e7Schristos errno = saved_errno;
373*946379e7Schristos return NULL;
374*946379e7Schristos }
375*946379e7Schristos
376*946379e7Schristos # endif
377*946379e7Schristos }
378*946379e7Schristos
379*946379e7Schristos #endif
380*946379e7Schristos
/* Convert the NUL-terminated string SRC from the encoding FROM_CODESET to
   the encoding TO_CODESET.  Returns a freshly allocated string, or NULL
   with errno set on failure.  */
char *
str_iconv (const char *src, const char *from_codeset, const char *to_codeset)
{
  /* Same encoding on both sides: nothing to convert, hand back a copy.  */
  if (c_strcasecmp (from_codeset, to_codeset) == 0)
    return strdup (src);

#if HAVE_ICONV
  {
    iconv_t cd;
    char *converted;
    int saved_errno;

    /* Avoid glibc-2.1 bug with EUC-KR.  */
# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
    if (c_strcasecmp (from_codeset, "EUC-KR") == 0
        || c_strcasecmp (to_codeset, "EUC-KR") == 0)
      {
        errno = EINVAL;
        return NULL;
      }
# endif

    cd = iconv_open (to_codeset, from_codeset);
    if (cd == (iconv_t) -1)
      return NULL;

    converted = str_cd_iconv (src, cd);
    if (converted == NULL)
      {
        /* The conversion failed: close the descriptor, but report the
           errno that str_cd_iconv left behind.  */
        saved_errno = errno;
        iconv_close (cd);
        errno = saved_errno;
        return NULL;
      }

    if (iconv_close (cd) < 0)
      {
        /* Closing failed: discard the converted string and report the
           errno from iconv_close, not one possibly set by free.  */
        saved_errno = errno;
        free (converted);
        errno = saved_errno;
        return NULL;
      }

    return converted;
  }
#else
  /* This is a different error code than if iconv_open existed but didn't
     support from_codeset and to_codeset, so that the caller can emit
     an error message such as
       "iconv() is not supported. Installing GNU libiconv and
        then reinstalling this package would fix this."  */
  errno = ENOSYS;
  return NULL;
#endif
}
438