xref: /openbsd-src/lib/libc/citrus/citrus_utf8.c (revision de48c816001f4089ba44c6a1d1dfd9219af4061d)
/*	$OpenBSD: citrus_utf8.c,v 1.18 2016/09/07 17:15:06 schwarze Exp $ */

/*-
 * Copyright (c) 2002-2004 Tim J. Robbins
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
28c9b8e388Sstsp 
29c9b8e388Sstsp #include <sys/types.h>
30c9b8e388Sstsp 
31c9b8e388Sstsp #include <errno.h>
32c9b8e388Sstsp #include <string.h>
33c9b8e388Sstsp #include <wchar.h>
34c9b8e388Sstsp 
35c9b8e388Sstsp #include "citrus_ctype.h"
36c9b8e388Sstsp 
/*
 * Conversion state kept between calls, overlaid on the caller's mbstate_t.
 * A state with want == 0 is the initial state (no character in progress).
 */
struct _utf8_state {
	wchar_t	ch;	/* bits of the partial character decoded so far */
	int	want;	/* continuation bytes still needed; 0 = initial */
	wchar_t	lbound;	/* smallest code point legal for this length */
};
42c9b8e388Sstsp 
43c9b8e388Sstsp size_t
_citrus_utf8_ctype_mbrtowc(wchar_t * __restrict pwc,const char * __restrict s,size_t n,mbstate_t * __restrict ps)44c9b8e388Sstsp _citrus_utf8_ctype_mbrtowc(wchar_t * __restrict pwc,
45754110aeSschwarze     const char * __restrict s, size_t n, mbstate_t * __restrict ps)
46c9b8e388Sstsp {
47c9b8e388Sstsp 	struct _utf8_state *us;
48c9b8e388Sstsp 	int ch, i, mask, want;
49c9b8e388Sstsp 	wchar_t lbound, wch;
50c9b8e388Sstsp 
51754110aeSschwarze 	us = (struct _utf8_state *)ps;
52c9b8e388Sstsp 
53c9b8e388Sstsp 	if (us->want < 0 || us->want > _CITRUS_UTF8_MB_CUR_MAX) {
54c9b8e388Sstsp 		errno = EINVAL;
558558486aSschwarze 		return -1;
56c9b8e388Sstsp 	}
57c9b8e388Sstsp 
58c9b8e388Sstsp 	if (s == NULL) {
59c9b8e388Sstsp 		s = "";
60c9b8e388Sstsp 		n = 1;
61c9b8e388Sstsp 		pwc = NULL;
62c9b8e388Sstsp 	}
63c9b8e388Sstsp 
648558486aSschwarze 	if (n == 0)
658558486aSschwarze 		return -2;
66c9b8e388Sstsp 
67c9b8e388Sstsp 	if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) {
68c9b8e388Sstsp 		/* Fast path for plain ASCII characters. */
69c9b8e388Sstsp 		if (pwc != NULL)
70c9b8e388Sstsp 			*pwc = ch;
718558486aSschwarze 		return ch != '\0' ? 1 : 0;
72c9b8e388Sstsp 	}
73c9b8e388Sstsp 
74c9b8e388Sstsp 	if (us->want == 0) {
75c9b8e388Sstsp 		/*
768558486aSschwarze 		 * Determine the number of bytes that make up this character
778558486aSschwarze 		 * from the first byte, and a mask that extracts the
788558486aSschwarze 		 * interesting bits of the first byte.  We already know
79c9b8e388Sstsp 		 * the character is at least two bytes long.
80c9b8e388Sstsp 		 *
81c9b8e388Sstsp 		 * We also specify a lower bound for the character code to
82c9b8e388Sstsp 		 * detect redundant, non-"shortest form" encodings. For
83c9b8e388Sstsp 		 * example, the sequence C0 80 is _not_ a legal representation
84c9b8e388Sstsp 		 * of the null character. This enforces a 1-to-1 mapping
85c9b8e388Sstsp 		 * between character codes and their multibyte representations.
86c9b8e388Sstsp 		 */
87c9b8e388Sstsp 		ch = (unsigned char)*s;
88c9b8e388Sstsp 		if ((ch & 0x80) == 0) {
89c9b8e388Sstsp 			mask = 0x7f;
90c9b8e388Sstsp 			want = 1;
91c9b8e388Sstsp 			lbound = 0;
92c9b8e388Sstsp 		} else if ((ch & 0xe0) == 0xc0) {
93c9b8e388Sstsp 			mask = 0x1f;
94c9b8e388Sstsp 			want = 2;
95c9b8e388Sstsp 			lbound = 0x80;
96c9b8e388Sstsp 		} else if ((ch & 0xf0) == 0xe0) {
97c9b8e388Sstsp 			mask = 0x0f;
98c9b8e388Sstsp 			want = 3;
99c9b8e388Sstsp 			lbound = 0x800;
100c9b8e388Sstsp 		} else if ((ch & 0xf8) == 0xf0) {
101c9b8e388Sstsp 			mask = 0x07;
102c9b8e388Sstsp 			want = 4;
103c9b8e388Sstsp 			lbound = 0x10000;
104c9b8e388Sstsp 		} else {
105c9b8e388Sstsp 			/*
106c9b8e388Sstsp 			 * Malformed input; input is not UTF-8.
107c9b8e388Sstsp 			 * See RFC 3629.
108c9b8e388Sstsp 			 */
109c9b8e388Sstsp 			errno = EILSEQ;
1108558486aSschwarze 			return -1;
111c9b8e388Sstsp 		}
112c9b8e388Sstsp 	} else {
113c9b8e388Sstsp 		want = us->want;
114c9b8e388Sstsp 		lbound = us->lbound;
115c9b8e388Sstsp 	}
116c9b8e388Sstsp 
117c9b8e388Sstsp 	/*
1188558486aSschwarze 	 * Decode the byte sequence representing the character in chunks
119c9b8e388Sstsp 	 * of 6 bits, most significant first.
120c9b8e388Sstsp 	 */
121c9b8e388Sstsp 	if (us->want == 0)
122c9b8e388Sstsp 		wch = (unsigned char)*s++ & mask;
123c9b8e388Sstsp 	else
124c9b8e388Sstsp 		wch = us->ch;
125*de48c816Sschwarze 	for (i = (us->want == 0) ? 1 : 0; i < want && (size_t)i < n; i++) {
126c9b8e388Sstsp 		if ((*s & 0xc0) != 0x80) {
127c9b8e388Sstsp 			/*
1288558486aSschwarze 			 * Malformed input; bad byte in the middle
129c9b8e388Sstsp 			 * of a character.
130c9b8e388Sstsp 			 */
131c9b8e388Sstsp 			errno = EILSEQ;
1328558486aSschwarze 			return -1;
133c9b8e388Sstsp 		}
134c9b8e388Sstsp 		wch <<= 6;
135c9b8e388Sstsp 		wch |= *s++ & 0x3f;
136c9b8e388Sstsp 	}
137c9b8e388Sstsp 	if (i < want) {
138c9b8e388Sstsp 		/* Incomplete multibyte sequence. */
139c9b8e388Sstsp 		us->want = want - i;
140c9b8e388Sstsp 		us->lbound = lbound;
141c9b8e388Sstsp 		us->ch = wch;
1428558486aSschwarze 		return -2;
143c9b8e388Sstsp 	}
144c9b8e388Sstsp 	if (wch < lbound) {
145c9b8e388Sstsp 		/*
146c9b8e388Sstsp 		 * Malformed input; redundant encoding.
147c9b8e388Sstsp 		 */
148c9b8e388Sstsp 		errno = EILSEQ;
1498558486aSschwarze 		return -1;
150c9b8e388Sstsp 	}
1512d1d6f4dSstsp 	if (wch >= 0xd800 && wch <= 0xdfff) {
152249006c8Sstsp 		/*
153249006c8Sstsp 		 * Malformed input; invalid code points.
154249006c8Sstsp 		 */
155249006c8Sstsp 		errno = EILSEQ;
1568558486aSschwarze 		return -1;
157249006c8Sstsp 	}
1588c886ad5Ssemarie 	if (wch > 0x10ffff) {
159f94991fbSsemarie 		/*
160f94991fbSsemarie 		 * Malformed input; invalid code points.
161f94991fbSsemarie 		 */
162f94991fbSsemarie 		errno = EILSEQ;
1638558486aSschwarze 		return -1;
164f94991fbSsemarie 	}
165c9b8e388Sstsp 	if (pwc != NULL)
166c9b8e388Sstsp 		*pwc = wch;
167c9b8e388Sstsp 	us->want = 0;
1688558486aSschwarze 	return wch == L'\0' ? 0 : want;
169c9b8e388Sstsp }
170c9b8e388Sstsp 
171c9b8e388Sstsp int
_citrus_utf8_ctype_mbsinit(const mbstate_t * __restrict ps)172754110aeSschwarze _citrus_utf8_ctype_mbsinit(const mbstate_t * __restrict ps)
173c9b8e388Sstsp {
1748558486aSschwarze 	return ((const struct _utf8_state *)ps)->want == 0;
175c9b8e388Sstsp }
176c9b8e388Sstsp 
177c9b8e388Sstsp size_t
_citrus_utf8_ctype_mbsnrtowcs(wchar_t * __restrict dst,const char ** __restrict src,size_t nmc,size_t len,mbstate_t * __restrict ps)1785a08728eSmatthew _citrus_utf8_ctype_mbsnrtowcs(wchar_t * __restrict dst,
179754110aeSschwarze     const char ** __restrict src, size_t nmc, size_t len,
180754110aeSschwarze     mbstate_t * __restrict ps)
181c9b8e388Sstsp {
182c9b8e388Sstsp 	struct _utf8_state *us;
1835a08728eSmatthew 	size_t i, o, r;
184c9b8e388Sstsp 
185754110aeSschwarze 	us = (struct _utf8_state *)ps;
186c9b8e388Sstsp 
1875a08728eSmatthew 	if (dst == NULL) {
188c9b8e388Sstsp 		/*
189c9b8e388Sstsp 		 * The fast path in the loop below is not safe if an ASCII
190c9b8e388Sstsp 		 * character appears as anything but the first byte of a
191c9b8e388Sstsp 		 * multibyte sequence. Check now to avoid doing it in the loop.
192c9b8e388Sstsp 		 */
1935a08728eSmatthew 		if (nmc > 0 && us->want > 0 && (unsigned char)(*src)[0] < 0x80) {
194c9b8e388Sstsp 			errno = EILSEQ;
1958558486aSschwarze 			return -1;
196c9b8e388Sstsp 		}
1975a08728eSmatthew 		for (i = o = 0; i < nmc; i += r, o++) {
1985a08728eSmatthew 			if ((unsigned char)(*src)[i] < 0x80) {
1995a08728eSmatthew 				/* Fast path for plain ASCII characters. */
2005a08728eSmatthew 				if ((*src)[i] == '\0')
2018558486aSschwarze 					return o;
2025a08728eSmatthew 				r = 1;
203c9b8e388Sstsp 			} else {
2045a08728eSmatthew 				r = _citrus_utf8_ctype_mbrtowc(NULL, *src + i,
205*de48c816Sschwarze 				    nmc - i, ps);
2065a08728eSmatthew 				if (r == (size_t)-1)
2078558486aSschwarze 					return r;
2085a08728eSmatthew 				if (r == (size_t)-2)
2098558486aSschwarze 					return o;
2105a08728eSmatthew 				if (r == 0)
2118558486aSschwarze 					return o;
212c9b8e388Sstsp 			}
213c9b8e388Sstsp 		}
2148558486aSschwarze 		return o;
215c9b8e388Sstsp 	}
216c9b8e388Sstsp 
217c9b8e388Sstsp 	/*
218c9b8e388Sstsp 	 * The fast path in the loop below is not safe if an ASCII
219c9b8e388Sstsp 	 * character appears as anything but the first byte of a
220c9b8e388Sstsp 	 * multibyte sequence. Check now to avoid doing it in the loop.
221c9b8e388Sstsp 	 */
2228558486aSschwarze 	if (len > 0 && nmc > 0 && us->want > 0 &&
2238558486aSschwarze 	    (unsigned char)(*src)[0] < 0x80) {
224c9b8e388Sstsp 		errno = EILSEQ;
2258558486aSschwarze 		return -1;
226c9b8e388Sstsp 	}
2275a08728eSmatthew 	for (i = o = 0; i < nmc && o < len; i += r, o++) {
2285a08728eSmatthew 		if ((unsigned char)(*src)[i] < 0x80) {
2295a08728eSmatthew 			/* Fast path for plain ASCII characters. */
2305a08728eSmatthew 			dst[o] = (wchar_t)(unsigned char)(*src)[i];
2315a08728eSmatthew 			if ((*src)[i] == '\0') {
2325a08728eSmatthew 				*src = NULL;
2338558486aSschwarze 				return o;
2345a08728eSmatthew 			}
2355a08728eSmatthew 			r = 1;
236c9b8e388Sstsp 		} else {
2375a08728eSmatthew 			r = _citrus_utf8_ctype_mbrtowc(dst + o, *src + i,
238*de48c816Sschwarze 			    nmc - i, ps);
2395a08728eSmatthew 			if (r == (size_t)-1) {
2405a08728eSmatthew 				*src += i;
2418558486aSschwarze 				return r;
242c9b8e388Sstsp 			}
2435a08728eSmatthew 			if (r == (size_t)-2) {
2445a08728eSmatthew 				*src += nmc;
2458558486aSschwarze 				return o;
246c9b8e388Sstsp 			}
2475a08728eSmatthew 			if (r == 0) {
2485a08728eSmatthew 				*src = NULL;
2498558486aSschwarze 				return o;
250c9b8e388Sstsp 			}
251c9b8e388Sstsp 		}
252c9b8e388Sstsp 	}
2535a08728eSmatthew 	*src += i;
2548558486aSschwarze 	return o;
255c9b8e388Sstsp }
256c9b8e388Sstsp 
257c9b8e388Sstsp size_t
_citrus_utf8_ctype_wcrtomb(char * __restrict s,wchar_t wc,mbstate_t * __restrict ps)258754110aeSschwarze _citrus_utf8_ctype_wcrtomb(char * __restrict s, wchar_t wc,
259754110aeSschwarze     mbstate_t * __restrict ps)
260c9b8e388Sstsp {
261c9b8e388Sstsp 	struct _utf8_state *us;
262c9b8e388Sstsp 	unsigned char lead;
263c9b8e388Sstsp 	int i, len;
264c9b8e388Sstsp 
265754110aeSschwarze 	us = (struct _utf8_state *)ps;
266c9b8e388Sstsp 
267c9b8e388Sstsp 	if (us->want != 0) {
268c9b8e388Sstsp 		errno = EINVAL;
2698558486aSschwarze 		return -1;
270c9b8e388Sstsp 	}
271c9b8e388Sstsp 
2728558486aSschwarze 	if (s == NULL)
2738558486aSschwarze 		return 1;
274c9b8e388Sstsp 
275fdb7d9f4Sbentley 	if (wc < 0 || (wc > 0xd7ff && wc < 0xe000) || wc > 0x10ffff) {
276962a275cSschwarze 		errno = EILSEQ;
2778558486aSschwarze 		return -1;
278c9b8e388Sstsp 	}
279c9b8e388Sstsp 
280c9b8e388Sstsp 	/*
2818558486aSschwarze 	 * Determine the number of bytes needed to represent this character.
282c9b8e388Sstsp 	 * We always output the shortest sequence possible. Also specify the
2838558486aSschwarze 	 * first few bits of the first byte, which contains the information
284c9b8e388Sstsp 	 * about the sequence length.
285c9b8e388Sstsp 	 */
286962a275cSschwarze 	if (wc <= 0x7f) {
287962a275cSschwarze 		/* Fast path for plain ASCII characters. */
288962a275cSschwarze 		*s = (char)wc;
2898558486aSschwarze 		return 1;
290962a275cSschwarze 	} else if (wc <= 0x7ff) {
291c9b8e388Sstsp 		lead = 0xc0;
292c9b8e388Sstsp 		len = 2;
293962a275cSschwarze 	} else if (wc <= 0xffff) {
294c9b8e388Sstsp 		lead = 0xe0;
295c9b8e388Sstsp 		len = 3;
296962a275cSschwarze 	} else {
297c9b8e388Sstsp 		lead = 0xf0;
298c9b8e388Sstsp 		len = 4;
299c9b8e388Sstsp 	}
300c9b8e388Sstsp 
301c9b8e388Sstsp 	/*
3028558486aSschwarze 	 * Output the bytes representing the character in chunks
3038558486aSschwarze 	 * of 6 bits, least significant last. The first byte is
304c9b8e388Sstsp 	 * a special case because it contains the sequence length
305c9b8e388Sstsp 	 * information.
306c9b8e388Sstsp 	 */
307c9b8e388Sstsp 	for (i = len - 1; i > 0; i--) {
308c9b8e388Sstsp 		s[i] = (wc & 0x3f) | 0x80;
309c9b8e388Sstsp 		wc >>= 6;
310c9b8e388Sstsp 	}
311c9b8e388Sstsp 	*s = (wc & 0xff) | lead;
312c9b8e388Sstsp 
3138558486aSschwarze 	return len;
314c9b8e388Sstsp }
315c9b8e388Sstsp 
316c9b8e388Sstsp size_t
_citrus_utf8_ctype_wcsnrtombs(char * __restrict dst,const wchar_t ** __restrict src,size_t nwc,size_t len,mbstate_t * __restrict ps)3175a08728eSmatthew _citrus_utf8_ctype_wcsnrtombs(char * __restrict dst,
318754110aeSschwarze     const wchar_t ** __restrict src, size_t nwc, size_t len,
319754110aeSschwarze     mbstate_t * __restrict ps)
320c9b8e388Sstsp {
321c9b8e388Sstsp 	struct _utf8_state *us;
322c9b8e388Sstsp 	char buf[_CITRUS_UTF8_MB_CUR_MAX];
3235a08728eSmatthew 	size_t i, o, r;
324c9b8e388Sstsp 
325754110aeSschwarze 	us = (struct _utf8_state *)ps;
326c9b8e388Sstsp 
327c9b8e388Sstsp 	if (us->want != 0) {
328c9b8e388Sstsp 		errno = EINVAL;
3298558486aSschwarze 		return -1;
330c9b8e388Sstsp 	}
331c9b8e388Sstsp 
3325a08728eSmatthew 	if (dst == NULL) {
3335a08728eSmatthew 		for (i = o = 0; i < nwc; i++, o += r) {
3345a08728eSmatthew 			wchar_t wc = (*src)[i];
3355a08728eSmatthew 			if (wc >= 0 && wc < 0x80) {
336c9b8e388Sstsp 				/* Fast path for plain ASCII characters. */
3375a08728eSmatthew 				if (wc == 0)
3388558486aSschwarze 					return o;
3395a08728eSmatthew 				r = 1;
3405a08728eSmatthew 			} else {
341*de48c816Sschwarze 				r = _citrus_utf8_ctype_wcrtomb(buf, wc, ps);
3425a08728eSmatthew 				if (r == (size_t)-1)
3438558486aSschwarze 					return r;
344c9b8e388Sstsp 			}
345c9b8e388Sstsp 		}
3468558486aSschwarze 		return o;
347c9b8e388Sstsp 	}
348c9b8e388Sstsp 
3495a08728eSmatthew 	for (i = o = 0; i < nwc && o < len; i++, o += r) {
3505a08728eSmatthew 		wchar_t wc = (*src)[i];
3515a08728eSmatthew 		if (wc >= 0 && wc < 0x80) {
352c9b8e388Sstsp 			/* Fast path for plain ASCII characters. */
3535a08728eSmatthew 			dst[o] = (wchar_t)wc;
3545a08728eSmatthew 			if (wc == 0) {
3555a08728eSmatthew 				*src = NULL;
3568558486aSschwarze 				return o;
3575a08728eSmatthew 			}
3585a08728eSmatthew 			r = 1;
3595a08728eSmatthew 		} else if (len - o >= _CITRUS_UTF8_MB_CUR_MAX) {
360c9b8e388Sstsp 			/* Enough space to translate in-place. */
361*de48c816Sschwarze 			r = _citrus_utf8_ctype_wcrtomb(dst + o, wc, ps);
3625a08728eSmatthew 			if (r == (size_t)-1) {
3635a08728eSmatthew 				*src += i;
3648558486aSschwarze 				return r;
365c9b8e388Sstsp 			}
366c9b8e388Sstsp 		} else {
3675a08728eSmatthew 			/* May not be enough space; use temp buffer. */
368*de48c816Sschwarze 			r = _citrus_utf8_ctype_wcrtomb(buf, wc, ps);
3695a08728eSmatthew 			if (r == (size_t)-1) {
3705a08728eSmatthew 				*src += i;
3718558486aSschwarze 				return r;
372c9b8e388Sstsp 			}
3735a08728eSmatthew 			if (r > len - o)
374c9b8e388Sstsp 				break;
3755a08728eSmatthew 			memcpy(dst + o, buf, r);
376c9b8e388Sstsp 		}
377c9b8e388Sstsp 	}
3785a08728eSmatthew 	*src += i;
3798558486aSschwarze 	return o;
380c9b8e388Sstsp }
381