xref: /netbsd-src/lib/libc/citrus/modules/citrus_utf8.c (revision d710132b4b8ce7f7cccaaf660cb16aa16b4077a0)
1 /*	$NetBSD: citrus_utf8.c,v 1.8 2003/06/25 09:51:49 tshiozak Exp $	*/
2 
3 /*-
4  * Copyright (c)2002 Citrus Project,
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 /*-
30  * Copyright (c) 1993
31  *	The Regents of the University of California.  All rights reserved.
32  *
33  * This code is derived from software contributed to Berkeley by
34  * Paul Borman at Krystal Technologies.
35  *
36  * Redistribution and use in source and binary forms, with or without
37  * modification, are permitted provided that the following conditions
38  * are met:
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  * 3. All advertising materials mentioning features or use of this software
45  *    must display the following acknowledgement:
46  *	This product includes software developed by the University of
47  *	California, Berkeley and its contributors.
48  * 4. Neither the name of the University nor the names of its contributors
49  *    may be used to endorse or promote products derived from this software
50  *    without specific prior written permission.
51  *
52  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62  * SUCH DAMAGE.
63  */
64 
65 #include <sys/cdefs.h>
66 #if defined(LIBC_SCCS) && !defined(lint)
67 __RCSID("$NetBSD: citrus_utf8.c,v 1.8 2003/06/25 09:51:49 tshiozak Exp $");
68 #endif /* LIBC_SCCS and not lint */
69 
70 #include <assert.h>
71 #include <errno.h>
72 #include <string.h>
73 #include <stdio.h>
74 #include <stdlib.h>
75 #include <stddef.h>
76 #include <locale.h>
77 #include <wchar.h>
78 #include <sys/types.h>
79 #include <limits.h>
80 
81 #include "citrus_namespace.h"
82 #include "citrus_types.h"
83 #include "citrus_module.h"
84 #include "citrus_ctype.h"
85 #include "citrus_stdenc.h"
86 #include "citrus_utf8.h"
87 
88 
/* ----------------------------------------------------------------------
 * private definitions used by the templates
 */
92 
93 static int _UTF8_count_array[256];
94 static int const *_UTF8_count = NULL;
95 
96 static u_int32_t _UTF8_range[] = {
97 	0,	/*dummy*/
98 	0x00000000, 0x00000080, 0x00000800, 0x00010000,
99 	0x00200000, 0x04000000, 0x80000000,
100 };
101 
102 typedef struct {
103 	char ch[6];
104 	int chlen;
105 } _UTF8State;
106 
107 typedef struct {
108 } _UTF8EncodingInfo;
109 
110 typedef struct {
111 	_UTF8EncodingInfo	ei;
112 	struct {
113 		/* for future multi-locale facility */
114 		_UTF8State	s_mblen;
115 		_UTF8State	s_mbrlen;
116 		_UTF8State	s_mbrtowc;
117 		_UTF8State	s_mbtowc;
118 		_UTF8State	s_mbsrtowcs;
119 		_UTF8State	s_wcrtomb;
120 		_UTF8State	s_wcsrtombs;
121 		_UTF8State	s_wctomb;
122 	} states;
123 } _UTF8CTypeInfo;
124 
125 #define _CEI_TO_EI(_cei_)		(&(_cei_)->ei)
126 #define _CEI_TO_STATE(_ei_, _func_)	(_ei_)->states.s_##_func_
127 
128 #define _FUNCNAME(m)			_citrus_UTF8_##m
129 #define _ENCODING_INFO			_UTF8EncodingInfo
130 #define _CTYPE_INFO			_UTF8CTypeInfo
131 #define _ENCODING_STATE			_UTF8State
132 #define _ENCODING_MB_CUR_MAX(_ei_)	6
133 #define _ENCODING_IS_STATE_DEPENDENT	0
134 #define _STATE_NEEDS_EXPLICIT_INIT(_ps_)	0
135 
136 
137 static __inline void
138 _UTF8_init_count(void)
139 {
140 	int i;
141 	if (!_UTF8_count) {
142 		memset(_UTF8_count_array, 0, sizeof(_UTF8_count_array));
143 		for (i = 0; i <= 0x7f; i++)
144 			_UTF8_count_array[i] = 1;
145 		for (i = 0xc0; i <= 0xdf; i++)
146 			_UTF8_count_array[i] = 2;
147 		for (i = 0xe0; i <= 0xef; i++)
148 			_UTF8_count_array[i] = 3;
149 		for (i = 0xf0; i <= 0xf7; i++)
150 			_UTF8_count_array[i] = 4;
151 		for (i = 0xf8; i <= 0xfb; i++)
152 			_UTF8_count_array[i] = 5;
153 		for (i = 0xfc; i <= 0xfd; i++)
154 			_UTF8_count_array[i] = 6;
155 		_UTF8_count = _UTF8_count_array;
156 	}
157 }
158 
159 static int
160 _UTF8_findlen(wchar_t v)
161 {
162 	int i;
163 	u_int32_t c;
164 
165 	c = (u_int32_t)v;	/*XXX*/
166 	for (i = 1; i < sizeof(_UTF8_range) / sizeof(_UTF8_range[0]); i++)
167 		if (c >= _UTF8_range[i] && c < _UTF8_range[i + 1])
168 			return i;
169 
170 	return -1;	/*out of range*/
171 }
172 
173 static __inline void
174 /*ARGSUSED*/
175 _citrus_UTF8_init_state(_UTF8EncodingInfo *ei, _UTF8State *s)
176 {
177 	memset(s, 0, sizeof(*s));
178 }
179 
180 static __inline void
181 /*ARGSUSED*/
182 _citrus_UTF8_pack_state(_UTF8EncodingInfo *ei, void *pspriv,
183 			const _UTF8State *s)
184 {
185 	memcpy(pspriv, (const void *)s, sizeof(*s));
186 }
187 
188 static __inline void
189 /*ARGSUSED*/
190 _citrus_UTF8_unpack_state(_UTF8EncodingInfo *ei, _UTF8State *s,
191 			  const void *pspriv)
192 {
193 	memcpy((void *)s, pspriv, sizeof(*s));
194 }
195 
196 static int
197 _citrus_UTF8_mbrtowc_priv(_UTF8EncodingInfo *ei, wchar_t *pwc, const char **s,
198 			  size_t n, _UTF8State *psenc, size_t *nresult)
199 {
200 	wchar_t wchar;
201 	const char *s0;
202 	int c;
203 	int i;
204 	int chlenbak;
205 
206 	_DIAGASSERT(nresult != 0);
207 	_DIAGASSERT(ei != NULL);
208 	_DIAGASSERT(s != NULL);
209 	_DIAGASSERT(psenc != NULL);
210 
211 	s0 = *s;
212 
213 	if (s0 == NULL) {
214 		_citrus_UTF8_init_state(ei, psenc);
215 		*nresult = 0; /* state independent */
216 		return (0);
217 	}
218 
219 	chlenbak = psenc->chlen;
220 
221 	/* make sure we have the first byte in the buffer */
222 	switch (psenc->chlen) {
223 	case 0:
224 		if (n < 1) {
225 			goto restart;
226 		}
227 		psenc->ch[0] = *s0++;
228 		psenc->chlen = 1;
229 		n--;
230 		break;
231 	case 1: case 2: case 3: case 4: case 5:
232 		break;
233 	default:
234 		/* illegal state */
235 		goto ilseq;
236 	}
237 
238 	c = _UTF8_count[psenc->ch[0] & 0xff];
239 	if (c == 0)
240 		goto ilseq;
241 	while (psenc->chlen < c) {
242 		if (n < 1) {
243 			goto restart;
244 		}
245 		psenc->ch[psenc->chlen] = *s0++;
246 		psenc->chlen++;
247 		n--;
248 	}
249 
250 	switch (c) {
251 	case 1:
252 		wchar = psenc->ch[0] & 0xff;
253 		break;
254 	case 2: case 3: case 4: case 5: case 6:
255 		wchar = psenc->ch[0] & (0x7f >> c);
256 		for (i = 1; i < c; i++) {
257 			if ((psenc->ch[i] & 0xc0) != 0x80)
258 				goto ilseq;
259 			wchar <<= 6;
260 			wchar |= (psenc->ch[i] & 0x3f);
261 		}
262 
263 		_DIAGASSERT(findlen(wchar) == c);
264 
265 		break;
266 	}
267 
268 	*s = s0;
269 
270 	psenc->chlen = 0;
271 
272 	if (pwc)
273 		*pwc = wchar;
274 
275 	if (!wchar)
276 		*nresult = 0;
277 	else
278 		*nresult = c - chlenbak;
279 
280 	return (0);
281 
282 ilseq:
283 	psenc->chlen = 0;
284 	*nresult = (size_t)-1;
285 	return (EILSEQ);
286 
287 restart:
288 	*s = s0;
289 	*nresult = (size_t)-2;
290 	return (0);
291 }
292 
293 static int
294 _citrus_UTF8_wcrtomb_priv(_UTF8EncodingInfo *ei, char *s, size_t n, wchar_t wc,
295 			  _UTF8State *psenc, size_t *nresult)
296 {
297 	int cnt, i, ret;
298 	wchar_t c;
299 
300 	_DIAGASSERT(ei != NULL);
301 	_DIAGASSERT(nresult != 0);
302 	_DIAGASSERT(s != NULL);
303 
304 	/* reset state */
305 	if (wc == 0) {
306 		*nresult = 0; /* stateless */
307 		return 0;
308 	}
309 
310 	cnt = _UTF8_findlen(wc);
311 	if (cnt <= 0 || cnt > 6) {
312 		/* invalid UCS4 value */
313 		ret = EILSEQ;
314 		goto err;
315 	}
316 	if (n < cnt) {
317 		/* bound check failure */
318 		ret = E2BIG;
319 		goto err;
320 	}
321 
322 	c = wc;
323 	if (s) {
324 		for (i = cnt - 1; i > 0; i--) {
325 			s[i] = 0x80 | (c & 0x3f);
326 			c >>= 6;
327 		}
328 		s[0] = c;
329 		if (cnt == 1)
330 			s[0] &= 0x7f;
331 		else {
332 			s[0] &= (0x7f >> cnt);
333 			s[0] |= ((0xff00 >> cnt) & 0xff);
334 		}
335 	}
336 
337 	*nresult = (size_t)cnt;
338 	return 0;
339 
340 err:
341 	*nresult = (size_t)-1;
342 	return ret;
343 }
344 
345 static __inline int
346 /*ARGSUSED*/
347 _citrus_UTF8_stdenc_wctocs(_UTF8EncodingInfo * __restrict ei,
348 			   _csid_t * __restrict csid,
349 			   _index_t * __restrict idx,
350 			   wchar_t wc)
351 {
352 
353 	_DIAGASSERT(csid != NULL && idx != NULL);
354 
355 	*csid = 0;
356 	*idx = (_citrus_index_t)wc;
357 
358 	return (0);
359 }
360 
361 static __inline int
362 /*ARGSUSED*/
363 _citrus_UTF8_stdenc_cstowc(_UTF8EncodingInfo * __restrict ei,
364 			   wchar_t * __restrict wc,
365 			   _csid_t csid, _index_t idx)
366 {
367 
368 	_DIAGASSERT(wc != NULL);
369 
370 	if (csid != 0)
371 		return (EILSEQ);
372 
373 	*wc = (wchar_t)idx;
374 
375 	return (0);
376 }
377 
378 static int
379 /*ARGSUSED*/
380 _citrus_UTF8_encoding_module_init(_UTF8EncodingInfo * __restrict ei,
381 				  const void * __restrict var, size_t lenvar)
382 {
383 	_DIAGASSERT(ei != NULL);
384 
385 	_UTF8_init_count();
386 	memset((void *)ei, 0, sizeof(*ei));
387 
388 	return 0;
389 }
390 
391 static void
392 /*ARGSUSED*/
393 _citrus_UTF8_encoding_module_uninit(_UTF8EncodingInfo *ei)
394 {
395 }
396 
397 
398 /* ----------------------------------------------------------------------
399  * public interface for ctype
400  */
401 
402 _CITRUS_CTYPE_DECLS(UTF8);
403 _CITRUS_CTYPE_DEF_OPS(UTF8);
404 
405 #include "citrus_ctype_template.h"
406 
407 /* ----------------------------------------------------------------------
408  * public interface for stdenc
409  */
410 
411 _CITRUS_STDENC_DECLS(UTF8);
412 _CITRUS_STDENC_DEF_OPS(UTF8);
413 
414 #include "citrus_stdenc_template.h"
415