/*	$OpenBSD: citrus_utf8.c,v 1.18 2016/09/07 17:15:06 schwarze Exp $ */

/*-
 * Copyright (c) 2002-2004 Tim J. Robbins
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/types.h>

#include <errno.h>
#include <string.h>
#include <wchar.h>

#include "citrus_ctype.h"

/*
 * Decoder state that the functions in this file overlay on the
 * caller-supplied mbstate_t object.
 *
 *   ch      code point bits accumulated from the bytes seen so far
 *   want    number of continuation bytes still missing; zero means
 *           no character is in progress (initial conversion state)
 *   lbound  smallest code point the sequence in progress may encode;
 *           used to reject non-shortest-form encodings
 */
struct _utf8_state {
	wchar_t	ch;
	int	want;
	wchar_t	lbound;
};

43c9b8e388Sstsp size_t
_citrus_utf8_ctype_mbrtowc(wchar_t * __restrict pwc,const char * __restrict s,size_t n,mbstate_t * __restrict ps)44c9b8e388Sstsp _citrus_utf8_ctype_mbrtowc(wchar_t * __restrict pwc,
45754110aeSschwarze const char * __restrict s, size_t n, mbstate_t * __restrict ps)
46c9b8e388Sstsp {
47c9b8e388Sstsp struct _utf8_state *us;
48c9b8e388Sstsp int ch, i, mask, want;
49c9b8e388Sstsp wchar_t lbound, wch;
50c9b8e388Sstsp
51754110aeSschwarze us = (struct _utf8_state *)ps;
52c9b8e388Sstsp
53c9b8e388Sstsp if (us->want < 0 || us->want > _CITRUS_UTF8_MB_CUR_MAX) {
54c9b8e388Sstsp errno = EINVAL;
558558486aSschwarze return -1;
56c9b8e388Sstsp }
57c9b8e388Sstsp
58c9b8e388Sstsp if (s == NULL) {
59c9b8e388Sstsp s = "";
60c9b8e388Sstsp n = 1;
61c9b8e388Sstsp pwc = NULL;
62c9b8e388Sstsp }
63c9b8e388Sstsp
648558486aSschwarze if (n == 0)
658558486aSschwarze return -2;
66c9b8e388Sstsp
67c9b8e388Sstsp if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) {
68c9b8e388Sstsp /* Fast path for plain ASCII characters. */
69c9b8e388Sstsp if (pwc != NULL)
70c9b8e388Sstsp *pwc = ch;
718558486aSschwarze return ch != '\0' ? 1 : 0;
72c9b8e388Sstsp }
73c9b8e388Sstsp
74c9b8e388Sstsp if (us->want == 0) {
75c9b8e388Sstsp /*
768558486aSschwarze * Determine the number of bytes that make up this character
778558486aSschwarze * from the first byte, and a mask that extracts the
788558486aSschwarze * interesting bits of the first byte. We already know
79c9b8e388Sstsp * the character is at least two bytes long.
80c9b8e388Sstsp *
81c9b8e388Sstsp * We also specify a lower bound for the character code to
82c9b8e388Sstsp * detect redundant, non-"shortest form" encodings. For
83c9b8e388Sstsp * example, the sequence C0 80 is _not_ a legal representation
84c9b8e388Sstsp * of the null character. This enforces a 1-to-1 mapping
85c9b8e388Sstsp * between character codes and their multibyte representations.
86c9b8e388Sstsp */
87c9b8e388Sstsp ch = (unsigned char)*s;
88c9b8e388Sstsp if ((ch & 0x80) == 0) {
89c9b8e388Sstsp mask = 0x7f;
90c9b8e388Sstsp want = 1;
91c9b8e388Sstsp lbound = 0;
92c9b8e388Sstsp } else if ((ch & 0xe0) == 0xc0) {
93c9b8e388Sstsp mask = 0x1f;
94c9b8e388Sstsp want = 2;
95c9b8e388Sstsp lbound = 0x80;
96c9b8e388Sstsp } else if ((ch & 0xf0) == 0xe0) {
97c9b8e388Sstsp mask = 0x0f;
98c9b8e388Sstsp want = 3;
99c9b8e388Sstsp lbound = 0x800;
100c9b8e388Sstsp } else if ((ch & 0xf8) == 0xf0) {
101c9b8e388Sstsp mask = 0x07;
102c9b8e388Sstsp want = 4;
103c9b8e388Sstsp lbound = 0x10000;
104c9b8e388Sstsp } else {
105c9b8e388Sstsp /*
106c9b8e388Sstsp * Malformed input; input is not UTF-8.
107c9b8e388Sstsp * See RFC 3629.
108c9b8e388Sstsp */
109c9b8e388Sstsp errno = EILSEQ;
1108558486aSschwarze return -1;
111c9b8e388Sstsp }
112c9b8e388Sstsp } else {
113c9b8e388Sstsp want = us->want;
114c9b8e388Sstsp lbound = us->lbound;
115c9b8e388Sstsp }
116c9b8e388Sstsp
117c9b8e388Sstsp /*
1188558486aSschwarze * Decode the byte sequence representing the character in chunks
119c9b8e388Sstsp * of 6 bits, most significant first.
120c9b8e388Sstsp */
121c9b8e388Sstsp if (us->want == 0)
122c9b8e388Sstsp wch = (unsigned char)*s++ & mask;
123c9b8e388Sstsp else
124c9b8e388Sstsp wch = us->ch;
125*de48c816Sschwarze for (i = (us->want == 0) ? 1 : 0; i < want && (size_t)i < n; i++) {
126c9b8e388Sstsp if ((*s & 0xc0) != 0x80) {
127c9b8e388Sstsp /*
1288558486aSschwarze * Malformed input; bad byte in the middle
129c9b8e388Sstsp * of a character.
130c9b8e388Sstsp */
131c9b8e388Sstsp errno = EILSEQ;
1328558486aSschwarze return -1;
133c9b8e388Sstsp }
134c9b8e388Sstsp wch <<= 6;
135c9b8e388Sstsp wch |= *s++ & 0x3f;
136c9b8e388Sstsp }
137c9b8e388Sstsp if (i < want) {
138c9b8e388Sstsp /* Incomplete multibyte sequence. */
139c9b8e388Sstsp us->want = want - i;
140c9b8e388Sstsp us->lbound = lbound;
141c9b8e388Sstsp us->ch = wch;
1428558486aSschwarze return -2;
143c9b8e388Sstsp }
144c9b8e388Sstsp if (wch < lbound) {
145c9b8e388Sstsp /*
146c9b8e388Sstsp * Malformed input; redundant encoding.
147c9b8e388Sstsp */
148c9b8e388Sstsp errno = EILSEQ;
1498558486aSschwarze return -1;
150c9b8e388Sstsp }
1512d1d6f4dSstsp if (wch >= 0xd800 && wch <= 0xdfff) {
152249006c8Sstsp /*
153249006c8Sstsp * Malformed input; invalid code points.
154249006c8Sstsp */
155249006c8Sstsp errno = EILSEQ;
1568558486aSschwarze return -1;
157249006c8Sstsp }
1588c886ad5Ssemarie if (wch > 0x10ffff) {
159f94991fbSsemarie /*
160f94991fbSsemarie * Malformed input; invalid code points.
161f94991fbSsemarie */
162f94991fbSsemarie errno = EILSEQ;
1638558486aSschwarze return -1;
164f94991fbSsemarie }
165c9b8e388Sstsp if (pwc != NULL)
166c9b8e388Sstsp *pwc = wch;
167c9b8e388Sstsp us->want = 0;
1688558486aSschwarze return wch == L'\0' ? 0 : want;
169c9b8e388Sstsp }
170c9b8e388Sstsp
171c9b8e388Sstsp int
_citrus_utf8_ctype_mbsinit(const mbstate_t * __restrict ps)172754110aeSschwarze _citrus_utf8_ctype_mbsinit(const mbstate_t * __restrict ps)
173c9b8e388Sstsp {
1748558486aSschwarze return ((const struct _utf8_state *)ps)->want == 0;
175c9b8e388Sstsp }
176c9b8e388Sstsp
177c9b8e388Sstsp size_t
_citrus_utf8_ctype_mbsnrtowcs(wchar_t * __restrict dst,const char ** __restrict src,size_t nmc,size_t len,mbstate_t * __restrict ps)1785a08728eSmatthew _citrus_utf8_ctype_mbsnrtowcs(wchar_t * __restrict dst,
179754110aeSschwarze const char ** __restrict src, size_t nmc, size_t len,
180754110aeSschwarze mbstate_t * __restrict ps)
181c9b8e388Sstsp {
182c9b8e388Sstsp struct _utf8_state *us;
1835a08728eSmatthew size_t i, o, r;
184c9b8e388Sstsp
185754110aeSschwarze us = (struct _utf8_state *)ps;
186c9b8e388Sstsp
1875a08728eSmatthew if (dst == NULL) {
188c9b8e388Sstsp /*
189c9b8e388Sstsp * The fast path in the loop below is not safe if an ASCII
190c9b8e388Sstsp * character appears as anything but the first byte of a
191c9b8e388Sstsp * multibyte sequence. Check now to avoid doing it in the loop.
192c9b8e388Sstsp */
1935a08728eSmatthew if (nmc > 0 && us->want > 0 && (unsigned char)(*src)[0] < 0x80) {
194c9b8e388Sstsp errno = EILSEQ;
1958558486aSschwarze return -1;
196c9b8e388Sstsp }
1975a08728eSmatthew for (i = o = 0; i < nmc; i += r, o++) {
1985a08728eSmatthew if ((unsigned char)(*src)[i] < 0x80) {
1995a08728eSmatthew /* Fast path for plain ASCII characters. */
2005a08728eSmatthew if ((*src)[i] == '\0')
2018558486aSschwarze return o;
2025a08728eSmatthew r = 1;
203c9b8e388Sstsp } else {
2045a08728eSmatthew r = _citrus_utf8_ctype_mbrtowc(NULL, *src + i,
205*de48c816Sschwarze nmc - i, ps);
2065a08728eSmatthew if (r == (size_t)-1)
2078558486aSschwarze return r;
2085a08728eSmatthew if (r == (size_t)-2)
2098558486aSschwarze return o;
2105a08728eSmatthew if (r == 0)
2118558486aSschwarze return o;
212c9b8e388Sstsp }
213c9b8e388Sstsp }
2148558486aSschwarze return o;
215c9b8e388Sstsp }
216c9b8e388Sstsp
217c9b8e388Sstsp /*
218c9b8e388Sstsp * The fast path in the loop below is not safe if an ASCII
219c9b8e388Sstsp * character appears as anything but the first byte of a
220c9b8e388Sstsp * multibyte sequence. Check now to avoid doing it in the loop.
221c9b8e388Sstsp */
2228558486aSschwarze if (len > 0 && nmc > 0 && us->want > 0 &&
2238558486aSschwarze (unsigned char)(*src)[0] < 0x80) {
224c9b8e388Sstsp errno = EILSEQ;
2258558486aSschwarze return -1;
226c9b8e388Sstsp }
2275a08728eSmatthew for (i = o = 0; i < nmc && o < len; i += r, o++) {
2285a08728eSmatthew if ((unsigned char)(*src)[i] < 0x80) {
2295a08728eSmatthew /* Fast path for plain ASCII characters. */
2305a08728eSmatthew dst[o] = (wchar_t)(unsigned char)(*src)[i];
2315a08728eSmatthew if ((*src)[i] == '\0') {
2325a08728eSmatthew *src = NULL;
2338558486aSschwarze return o;
2345a08728eSmatthew }
2355a08728eSmatthew r = 1;
236c9b8e388Sstsp } else {
2375a08728eSmatthew r = _citrus_utf8_ctype_mbrtowc(dst + o, *src + i,
238*de48c816Sschwarze nmc - i, ps);
2395a08728eSmatthew if (r == (size_t)-1) {
2405a08728eSmatthew *src += i;
2418558486aSschwarze return r;
242c9b8e388Sstsp }
2435a08728eSmatthew if (r == (size_t)-2) {
2445a08728eSmatthew *src += nmc;
2458558486aSschwarze return o;
246c9b8e388Sstsp }
2475a08728eSmatthew if (r == 0) {
2485a08728eSmatthew *src = NULL;
2498558486aSschwarze return o;
250c9b8e388Sstsp }
251c9b8e388Sstsp }
252c9b8e388Sstsp }
2535a08728eSmatthew *src += i;
2548558486aSschwarze return o;
255c9b8e388Sstsp }
256c9b8e388Sstsp
257c9b8e388Sstsp size_t
_citrus_utf8_ctype_wcrtomb(char * __restrict s,wchar_t wc,mbstate_t * __restrict ps)258754110aeSschwarze _citrus_utf8_ctype_wcrtomb(char * __restrict s, wchar_t wc,
259754110aeSschwarze mbstate_t * __restrict ps)
260c9b8e388Sstsp {
261c9b8e388Sstsp struct _utf8_state *us;
262c9b8e388Sstsp unsigned char lead;
263c9b8e388Sstsp int i, len;
264c9b8e388Sstsp
265754110aeSschwarze us = (struct _utf8_state *)ps;
266c9b8e388Sstsp
267c9b8e388Sstsp if (us->want != 0) {
268c9b8e388Sstsp errno = EINVAL;
2698558486aSschwarze return -1;
270c9b8e388Sstsp }
271c9b8e388Sstsp
2728558486aSschwarze if (s == NULL)
2738558486aSschwarze return 1;
274c9b8e388Sstsp
275fdb7d9f4Sbentley if (wc < 0 || (wc > 0xd7ff && wc < 0xe000) || wc > 0x10ffff) {
276962a275cSschwarze errno = EILSEQ;
2778558486aSschwarze return -1;
278c9b8e388Sstsp }
279c9b8e388Sstsp
280c9b8e388Sstsp /*
2818558486aSschwarze * Determine the number of bytes needed to represent this character.
282c9b8e388Sstsp * We always output the shortest sequence possible. Also specify the
2838558486aSschwarze * first few bits of the first byte, which contains the information
284c9b8e388Sstsp * about the sequence length.
285c9b8e388Sstsp */
286962a275cSschwarze if (wc <= 0x7f) {
287962a275cSschwarze /* Fast path for plain ASCII characters. */
288962a275cSschwarze *s = (char)wc;
2898558486aSschwarze return 1;
290962a275cSschwarze } else if (wc <= 0x7ff) {
291c9b8e388Sstsp lead = 0xc0;
292c9b8e388Sstsp len = 2;
293962a275cSschwarze } else if (wc <= 0xffff) {
294c9b8e388Sstsp lead = 0xe0;
295c9b8e388Sstsp len = 3;
296962a275cSschwarze } else {
297c9b8e388Sstsp lead = 0xf0;
298c9b8e388Sstsp len = 4;
299c9b8e388Sstsp }
300c9b8e388Sstsp
301c9b8e388Sstsp /*
3028558486aSschwarze * Output the bytes representing the character in chunks
3038558486aSschwarze * of 6 bits, least significant last. The first byte is
304c9b8e388Sstsp * a special case because it contains the sequence length
305c9b8e388Sstsp * information.
306c9b8e388Sstsp */
307c9b8e388Sstsp for (i = len - 1; i > 0; i--) {
308c9b8e388Sstsp s[i] = (wc & 0x3f) | 0x80;
309c9b8e388Sstsp wc >>= 6;
310c9b8e388Sstsp }
311c9b8e388Sstsp *s = (wc & 0xff) | lead;
312c9b8e388Sstsp
3138558486aSschwarze return len;
314c9b8e388Sstsp }
315c9b8e388Sstsp
316c9b8e388Sstsp size_t
_citrus_utf8_ctype_wcsnrtombs(char * __restrict dst,const wchar_t ** __restrict src,size_t nwc,size_t len,mbstate_t * __restrict ps)3175a08728eSmatthew _citrus_utf8_ctype_wcsnrtombs(char * __restrict dst,
318754110aeSschwarze const wchar_t ** __restrict src, size_t nwc, size_t len,
319754110aeSschwarze mbstate_t * __restrict ps)
320c9b8e388Sstsp {
321c9b8e388Sstsp struct _utf8_state *us;
322c9b8e388Sstsp char buf[_CITRUS_UTF8_MB_CUR_MAX];
3235a08728eSmatthew size_t i, o, r;
324c9b8e388Sstsp
325754110aeSschwarze us = (struct _utf8_state *)ps;
326c9b8e388Sstsp
327c9b8e388Sstsp if (us->want != 0) {
328c9b8e388Sstsp errno = EINVAL;
3298558486aSschwarze return -1;
330c9b8e388Sstsp }
331c9b8e388Sstsp
3325a08728eSmatthew if (dst == NULL) {
3335a08728eSmatthew for (i = o = 0; i < nwc; i++, o += r) {
3345a08728eSmatthew wchar_t wc = (*src)[i];
3355a08728eSmatthew if (wc >= 0 && wc < 0x80) {
336c9b8e388Sstsp /* Fast path for plain ASCII characters. */
3375a08728eSmatthew if (wc == 0)
3388558486aSschwarze return o;
3395a08728eSmatthew r = 1;
3405a08728eSmatthew } else {
341*de48c816Sschwarze r = _citrus_utf8_ctype_wcrtomb(buf, wc, ps);
3425a08728eSmatthew if (r == (size_t)-1)
3438558486aSschwarze return r;
344c9b8e388Sstsp }
345c9b8e388Sstsp }
3468558486aSschwarze return o;
347c9b8e388Sstsp }
348c9b8e388Sstsp
3495a08728eSmatthew for (i = o = 0; i < nwc && o < len; i++, o += r) {
3505a08728eSmatthew wchar_t wc = (*src)[i];
3515a08728eSmatthew if (wc >= 0 && wc < 0x80) {
352c9b8e388Sstsp /* Fast path for plain ASCII characters. */
3535a08728eSmatthew dst[o] = (wchar_t)wc;
3545a08728eSmatthew if (wc == 0) {
3555a08728eSmatthew *src = NULL;
3568558486aSschwarze return o;
3575a08728eSmatthew }
3585a08728eSmatthew r = 1;
3595a08728eSmatthew } else if (len - o >= _CITRUS_UTF8_MB_CUR_MAX) {
360c9b8e388Sstsp /* Enough space to translate in-place. */
361*de48c816Sschwarze r = _citrus_utf8_ctype_wcrtomb(dst + o, wc, ps);
3625a08728eSmatthew if (r == (size_t)-1) {
3635a08728eSmatthew *src += i;
3648558486aSschwarze return r;
365c9b8e388Sstsp }
366c9b8e388Sstsp } else {
3675a08728eSmatthew /* May not be enough space; use temp buffer. */
368*de48c816Sschwarze r = _citrus_utf8_ctype_wcrtomb(buf, wc, ps);
3695a08728eSmatthew if (r == (size_t)-1) {
3705a08728eSmatthew *src += i;
3718558486aSschwarze return r;
372c9b8e388Sstsp }
3735a08728eSmatthew if (r > len - o)
374c9b8e388Sstsp break;
3755a08728eSmatthew memcpy(dst + o, buf, r);
376c9b8e388Sstsp }
377c9b8e388Sstsp }
3785a08728eSmatthew *src += i;
3798558486aSschwarze return o;
380c9b8e388Sstsp }
381