xref: /netbsd-src/external/bsd/nvi/dist/common/conv.c (revision 6a493d6bc668897c91594964a732d38505b70cbb)
1 /*	$NetBSD: conv.c,v 1.2 2013/11/22 15:52:05 christos Exp $ */
2 /*-
3  * Copyright (c) 1993, 1994
4  *	The Regents of the University of California.  All rights reserved.
5  * Copyright (c) 1993, 1994, 1995, 1996
6  *	Keith Bostic.  All rights reserved.
7  *
8  * See the LICENSE file for redistribution information.
9  */
10 
11 #include "config.h"
12 
13 #ifndef lint
14 static const char sccsid[] = "Id: conv.c,v 1.27 2001/08/18 21:41:41 skimo Exp  (Berkeley) Date: 2001/08/18 21:41:41 ";
15 #endif /* not lint */
16 
17 #include <sys/types.h>
18 #include <sys/queue.h>
19 #include <sys/time.h>
20 
21 #include <bitstring.h>
22 #include <errno.h>
23 #include <limits.h>
24 #include <stdio.h>
25 #include <stdlib.h>
26 #include <string.h>
27 #include <unistd.h>
28 
29 #include "common.h"
30 
31 #ifdef USE_ICONV
32 #include <langinfo.h>
33 #include <iconv.h>
34 
35 #define LANGCODESET	nl_langinfo(CODESET)
36 #else
37 typedef int	iconv_t;
38 
39 #define LANGCODESET	""
40 #endif
41 
42 #include <locale.h>
43 
44 #ifdef USE_WIDECHAR
45 static int
46 raw2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw, size_t *tolen,
47 	const CHAR_T **dst)
48 {
49     int i;
50     CHAR_T **tostr = (CHAR_T **)(void *)&cw->bp1;
51     size_t  *blen = &cw->blen1;
52 
53     BINC_RETW(NULL, *tostr, *blen, len);
54 
55     *tolen = len;
56     for (i = 0; i < len; ++i)
57 	(*tostr)[i] = (u_char) str[i];
58 
59     *dst = cw->bp1;
60 
61     return 0;
62 }
63 
/*
 * Recovery hooks invoked when iconv(3), mbrtowc(3) or wcrtomb(3) reports
 * a conversion failure.
 *
 * Default build:
 *	HANDLE_ICONV_ERROR(o, i, ol, il) copies one unit from *i to *o,
 *	advances both pointers and decrements both remaining counts.
 *	HANDLE_MBR_ERROR(n, mbs, d, s) stores s into d, zeroes the
 *	conversion state mbs and sets n to 1.
 *
 * ERROR_ON_CONVERT build:
 *	both hooks jump to the `err' label of the enclosing function.
 *	They must still be function-like macros: the call sites always
 *	pass an argument list, and an object-like definition would expand
 *	to `goto err(...)', which does not compile.
 */
#ifndef ERROR_ON_CONVERT
#define HANDLE_ICONV_ERROR(o, i, ol, il) do {				\
		*(o)++ = *(i)++;					\
		(ol)--; (il)--;						\
	} while (/*CONSTCOND*/0)
#define HANDLE_MBR_ERROR(n, mbs, d, s) do {				\
		(d) = (s);						\
		MEMSET(&(mbs), 0, 1); 					\
		(n) = 1; 						\
	} while (/*CONSTCOND*/0)
#else
#define HANDLE_ICONV_ERROR(o, i, ol, il)	goto err
#define	HANDLE_MBR_ERROR(n, mbs, d, s)		goto err
#endif
78 
#define CONV_BUFFER_SIZE    512
/*
 * CONVERT --
 *	Run the next piece of the string pointed to by str through iconv(3)
 *	into the enclosing function's `buffer' (CONV_BUFFER_SIZE bytes).
 *
 *	str	input pointer; advanced past the bytes consumed
 *	left	number of input bytes left in str; adjusted
 *	src	set to buffer on success
 *	len	set to the number of bytes put in buffer
 *
 *	Besides its arguments the macro uses the enclosing function's
 *	`id' (open iconv descriptor), `buffer', `error' and `err' label.
 *	If nothing at all could be converted, error is set to -left and
 *	control jumps to err.
 *
 *	E2BIG only means the output buffer filled up before the input ran
 *	out; the caller drains buffer and invokes CONVERT again, so it is
 *	not a conversion error.  Routing it to HANDLE_ICONV_ERROR would
 *	store one byte past the end of buffer when outleft is 0 and would
 *	inject an unconverted input byte into the output.
 */
#ifdef USE_ICONV
#define CONVERT(str, left, src, len)				    	\
    do {								\
	size_t outleft;							\
	char *bp = buffer;						\
	outleft = CONV_BUFFER_SIZE;					\
	errno = 0;							\
	if (iconv(id, (const char **)&str, &left, &bp, &outleft) 	\
	    == (size_t)-1 && errno != E2BIG)				\
		HANDLE_ICONV_ERROR(bp, str, outleft, left);		\
	if ((len = CONV_BUFFER_SIZE - outleft) == 0) {			\
	    error = -left;						\
	    goto err;							\
	}				    				\
	src = buffer;							\
    } while (0)
#else
#define CONVERT(str, left, src, len)
#endif
103 
104 static int
105 default_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw,
106 		size_t *tolen, const CHAR_T **dst, const char *enc)
107 {
108     int j;
109     size_t i = 0;
110     CHAR_T **tostr = (CHAR_T **)(void *)&cw->bp1;
111     size_t  *blen = &cw->blen1;
112     mbstate_t mbs;
113     size_t   n;
114     ssize_t  nlen = len;
115     const char *src = (const char *)str;
116     iconv_t	id = (iconv_t)-1;
117     char	buffer[CONV_BUFFER_SIZE];
118     size_t	left = len;
119     int		error = 1;
120 
121     MEMSET(&mbs, 0, 1);
122     BINC_RETW(NULL, *tostr, *blen, nlen);
123 
124 #ifdef USE_ICONV
125     if (strcmp(nl_langinfo(CODESET), enc)) {
126 	id = iconv_open(nl_langinfo(CODESET), enc);
127 	if (id == (iconv_t)-1)
128 	    goto err;
129 	CONVERT(str, left, src, len);
130     }
131 #endif
132 
133     for (i = 0, j = 0; j < len; ) {
134 	n = mbrtowc((*tostr)+i, src+j, len-j, &mbs);
135 	/* NULL character converted */
136 	if (n == (size_t)-2) error = -(len-j);
137 	if (n == (size_t)-1 || n == (size_t)-2)
138 	    HANDLE_MBR_ERROR(n, mbs, (*tostr)[i], src[j]);
139 	if (n == 0) n = 1;
140 	j += n;
141 	if (++i >= *blen) {
142 	    nlen += 256;
143 	    BINC_RETW(NULL, *tostr, *blen, nlen);
144 	}
145 	if (id != (iconv_t)-1 && j == len && left) {
146 	    CONVERT(str, left, src, len);
147 	    j = 0;
148 	}
149     }
150     *tolen = i;
151 
152     if (id != (iconv_t)-1)
153 	iconv_close(id);
154 
155     *dst = cw->bp1;
156 
157     return 0;
158 err:
159     *tolen = i;
160     if (id != (iconv_t)-1)
161 	iconv_close(id);
162     *dst = cw->bp1;
163 
164     return error;
165 }
166 
167 static int
168 fe_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw,
169 	    size_t *tolen, const CHAR_T **dst)
170 {
171     return default_char2int(sp, str, len, cw, tolen, dst, O_STR(sp, O_FILEENCODING));
172 }
173 
174 static int
175 ie_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw,
176 	    size_t *tolen, const CHAR_T **dst)
177 {
178     return default_char2int(sp, str, len, cw, tolen, dst, O_STR(sp, O_INPUTENCODING));
179 }
180 
181 static int
182 cs_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw,
183 	    size_t *tolen, const CHAR_T **dst)
184 {
185     return default_char2int(sp, str, len, cw, tolen, dst, LANGCODESET);
186 }
187 
188 static int
189 CHAR_T_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw,
190 	size_t *tolen, const char **dst)
191 {
192     *tolen = len * sizeof(CHAR_T);
193     *dst = (const char *)(const void *)str;
194 
195     return 0;
196 }
197 
198 static int
199 CHAR_T_char2int(SCR *sp, const char * str, ssize_t len, CONVWIN *cw,
200 	size_t *tolen, const CHAR_T **dst)
201 {
202     *tolen = len / sizeof(CHAR_T);
203     *dst = (const CHAR_T*) str;
204 
205     return 0;
206 }
207 
208 static int
209 int2raw(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw, size_t *tolen,
210 	const char **dst)
211 {
212     int i;
213     char **tostr = (char **)(void *)&cw->bp1;
214     size_t  *blen = &cw->blen1;
215 
216     BINC_RETC(NULL, *tostr, *blen, len);
217 
218     *tolen = len;
219     for (i = 0; i < len; ++i)
220 	(*tostr)[i] = str[i];
221 
222     *dst = cw->bp1;
223 
224     return 0;
225 }
226 
227 static int
228 default_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw,
229 		size_t *tolen, const char **pdst, const char *enc)
230 {
231     size_t i, j;
232     int offset = 0;
233     char **tostr = (char **)(void *)&cw->bp1;
234     size_t  *blen = &cw->blen1;
235     mbstate_t mbs;
236     size_t n;
237     ssize_t  nlen = len + MB_CUR_MAX;
238     char *dst;
239     size_t buflen;
240     char	buffer[CONV_BUFFER_SIZE];
241     iconv_t	id = (iconv_t)-1;
242 
243 /* convert first len bytes of buffer and append it to cw->bp
244  * len is adjusted => 0
245  * offset contains the offset in cw->bp and is adjusted
246  * cw->bp is grown as required
247  */
248 #ifdef USE_ICONV
249 #define CONVERT2(len, cw, offset)					\
250     do {								\
251 	const char *bp = buffer;					\
252 	while (len != 0) {						\
253 	    size_t outleft = cw->blen1 - offset;			\
254 	    char *obp = (char *)cw->bp1 + offset;		    	\
255 	    if (cw->blen1 < offset + MB_CUR_MAX) {		    	\
256 		nlen += 256;						\
257 		BINC_RETC(NULL, cw->bp1, cw->blen1, nlen);		\
258 	    }						    		\
259 	    errno = 0;						    	\
260 	    if (iconv(id, &bp, &len, &obp, &outleft) == (size_t)-1 &&	\
261 		errno != E2BIG) 					\
262 		    HANDLE_ICONV_ERROR(obp, bp, outleft, len);		\
263 	    offset = cw->blen1 - outleft;			        \
264 	}							        \
265     } while (0)
266 #else
267 #define CONVERT2(len, cw, offset)
268 #endif
269 
270 
271     MEMSET(&mbs, 0, 1);
272     BINC_RETC(NULL, *tostr, *blen, nlen);
273     dst = *tostr; buflen = *blen;
274 
275 #ifdef USE_ICONV
276     if (strcmp(nl_langinfo(CODESET), enc)) {
277 	id = iconv_open(enc, nl_langinfo(CODESET));
278 	if (id == (iconv_t)-1)
279 	    goto err;
280 	dst = buffer; buflen = CONV_BUFFER_SIZE;
281     }
282 #endif
283 
284     for (i = 0, j = 0; i < (size_t)len; ++i) {
285 	n = wcrtomb(dst+j, str[i], &mbs);
286 	if (n == (size_t)-1)
287 	   HANDLE_MBR_ERROR(n, mbs, dst[j], str[i]);
288 	j += n;
289 	if (buflen < j + MB_CUR_MAX) {
290 	    if (id != (iconv_t)-1) {
291 		CONVERT2(j, cw, offset);
292 	    } else {
293 		nlen += 256;
294 		BINC_RETC(NULL, *tostr, *blen, nlen);
295 		dst = *tostr; buflen = *blen;
296 	    }
297 	}
298     }
299 
300     n = wcrtomb(dst+j, L'\0', &mbs);
301     j += n - 1;				/* don't count NUL at the end */
302     *tolen = j;
303 
304     if (id != (iconv_t)-1) {
305 	CONVERT2(j, cw, offset);
306 	*tolen = offset;
307     }
308 
309     *pdst = cw->bp1;
310 
311     return 0;
312 err:
313     *tolen = j;
314 
315     *pdst = cw->bp1;
316 
317     return 1;
318 }
319 
320 static int
321 fe_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw,
322 	    size_t *tolen, const char **dst)
323 {
324     return default_int2char(sp, str, len, cw, tolen, dst, O_STR(sp, O_FILEENCODING));
325 }
326 
327 static int
328 cs_int2char(SCR *sp, const CHAR_T * str, ssize_t len, CONVWIN *cw,
329 	    size_t *tolen, const char **dst)
330 {
331     return default_int2char(sp, str, len, cw, tolen, dst, LANGCODESET);
332 }
333 
334 #endif
335 
336 
337 void
338 conv_init (SCR *orig, SCR *sp)
339 {
340     if (orig != NULL)
341 	MEMCPY(&sp->conv, &orig->conv, 1);
342     else {
343 	setlocale(LC_ALL, "");
344 #ifdef USE_WIDECHAR
345 	sp->conv.sys2int = cs_char2int;
346 	sp->conv.int2sys = cs_int2char;
347 	sp->conv.file2int = fe_char2int;
348 	sp->conv.int2file = fe_int2char;
349 	sp->conv.input2int = ie_char2int;
350 #endif
351 #ifdef USE_ICONV
352 	o_set(sp, O_FILEENCODING, OS_STRDUP, nl_langinfo(CODESET), 0);
353 	o_set(sp, O_INPUTENCODING, OS_STRDUP, nl_langinfo(CODESET), 0);
354 #endif
355     }
356 }
357 
358 int
359 conv_enc (SCR *sp, int option, const char *enc)
360 {
361 #if defined(USE_WIDECHAR) && defined(USE_ICONV)
362     iconv_t id;
363     char2wchar_t    *c2w;
364     wchar2char_t    *w2c;
365 
366     switch (option) {
367     case O_FILEENCODING:
368 	c2w = &sp->conv.file2int;
369 	w2c = &sp->conv.int2file;
370 	break;
371     case O_INPUTENCODING:
372 	c2w = &sp->conv.input2int;
373 	w2c = NULL;
374 	break;
375     default:
376 	c2w = NULL;
377 	w2c = NULL;
378 	break;
379     }
380 
381     if (!*enc) {
382 	if (c2w) *c2w = raw2int;
383 	if (w2c) *w2c = int2raw;
384 	return 0;
385     }
386 
387     if (!strcmp(enc, "WCHAR_T")) {
388 	if (c2w) *c2w = CHAR_T_char2int;
389 	if (w2c) *w2c = CHAR_T_int2char;
390 	return 0;
391     }
392 
393     id = iconv_open(enc, nl_langinfo(CODESET));
394     if (id == (iconv_t)-1)
395 	goto err;
396     iconv_close(id);
397     id = iconv_open(nl_langinfo(CODESET), enc);
398     if (id == (iconv_t)-1)
399 	goto err;
400     iconv_close(id);
401 
402     switch (option) {
403     case O_FILEENCODING:
404 	*c2w = fe_char2int;
405 	*w2c = fe_int2char;
406 	break;
407     case O_INPUTENCODING:
408 	*c2w = ie_char2int;
409 	break;
410     }
411 
412     F_CLR(sp, SC_CONV_ERROR);
413     F_SET(sp, SC_SCR_REFORMAT);
414 
415     return 0;
416 err:
417     switch (option) {
418     case O_FILEENCODING:
419 	msgq(sp, M_ERR,
420 	    "321|File encoding conversion not supported");
421 	break;
422     case O_INPUTENCODING:
423 	msgq(sp, M_ERR,
424 	    "322|Input encoding conversion not supported");
425 	break;
426     }
427 #endif
428     return 1;
429 }
430 
431