1 /* $OpenBSD: citrus_utf8.c,v 1.18 2016/09/07 17:15:06 schwarze Exp $ */
2
3 /*-
4 * Copyright (c) 2002-2004 Tim J. Robbins
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26 * SUCH DAMAGE.
27 */
28
29 #include <sys/types.h>
30
31 #include <errno.h>
32 #include <string.h>
33 #include <wchar.h>
34
35 #include "citrus_ctype.h"
36
/*
 * Conversion state for the UTF-8 decoder.  The functions in this file
 * access it by casting the caller's mbstate_t pointer to this type.
 */
struct _utf8_state {
	wchar_t	ch;	/* bits of the code point accumulated so far */
	int	want;	/* bytes still needed to finish; 0 = initial state */
	wchar_t	lbound;	/* smallest code point legal for this length */
};
42
43 size_t
_citrus_utf8_ctype_mbrtowc(wchar_t * __restrict pwc,const char * __restrict s,size_t n,mbstate_t * __restrict ps)44 _citrus_utf8_ctype_mbrtowc(wchar_t * __restrict pwc,
45 const char * __restrict s, size_t n, mbstate_t * __restrict ps)
46 {
47 struct _utf8_state *us;
48 int ch, i, mask, want;
49 wchar_t lbound, wch;
50
51 us = (struct _utf8_state *)ps;
52
53 if (us->want < 0 || us->want > _CITRUS_UTF8_MB_CUR_MAX) {
54 errno = EINVAL;
55 return -1;
56 }
57
58 if (s == NULL) {
59 s = "";
60 n = 1;
61 pwc = NULL;
62 }
63
64 if (n == 0)
65 return -2;
66
67 if (us->want == 0 && ((ch = (unsigned char)*s) & ~0x7f) == 0) {
68 /* Fast path for plain ASCII characters. */
69 if (pwc != NULL)
70 *pwc = ch;
71 return ch != '\0' ? 1 : 0;
72 }
73
74 if (us->want == 0) {
75 /*
76 * Determine the number of bytes that make up this character
77 * from the first byte, and a mask that extracts the
78 * interesting bits of the first byte. We already know
79 * the character is at least two bytes long.
80 *
81 * We also specify a lower bound for the character code to
82 * detect redundant, non-"shortest form" encodings. For
83 * example, the sequence C0 80 is _not_ a legal representation
84 * of the null character. This enforces a 1-to-1 mapping
85 * between character codes and their multibyte representations.
86 */
87 ch = (unsigned char)*s;
88 if ((ch & 0x80) == 0) {
89 mask = 0x7f;
90 want = 1;
91 lbound = 0;
92 } else if ((ch & 0xe0) == 0xc0) {
93 mask = 0x1f;
94 want = 2;
95 lbound = 0x80;
96 } else if ((ch & 0xf0) == 0xe0) {
97 mask = 0x0f;
98 want = 3;
99 lbound = 0x800;
100 } else if ((ch & 0xf8) == 0xf0) {
101 mask = 0x07;
102 want = 4;
103 lbound = 0x10000;
104 } else {
105 /*
106 * Malformed input; input is not UTF-8.
107 * See RFC 3629.
108 */
109 errno = EILSEQ;
110 return -1;
111 }
112 } else {
113 want = us->want;
114 lbound = us->lbound;
115 }
116
117 /*
118 * Decode the byte sequence representing the character in chunks
119 * of 6 bits, most significant first.
120 */
121 if (us->want == 0)
122 wch = (unsigned char)*s++ & mask;
123 else
124 wch = us->ch;
125 for (i = (us->want == 0) ? 1 : 0; i < want && (size_t)i < n; i++) {
126 if ((*s & 0xc0) != 0x80) {
127 /*
128 * Malformed input; bad byte in the middle
129 * of a character.
130 */
131 errno = EILSEQ;
132 return -1;
133 }
134 wch <<= 6;
135 wch |= *s++ & 0x3f;
136 }
137 if (i < want) {
138 /* Incomplete multibyte sequence. */
139 us->want = want - i;
140 us->lbound = lbound;
141 us->ch = wch;
142 return -2;
143 }
144 if (wch < lbound) {
145 /*
146 * Malformed input; redundant encoding.
147 */
148 errno = EILSEQ;
149 return -1;
150 }
151 if (wch >= 0xd800 && wch <= 0xdfff) {
152 /*
153 * Malformed input; invalid code points.
154 */
155 errno = EILSEQ;
156 return -1;
157 }
158 if (wch > 0x10ffff) {
159 /*
160 * Malformed input; invalid code points.
161 */
162 errno = EILSEQ;
163 return -1;
164 }
165 if (pwc != NULL)
166 *pwc = wch;
167 us->want = 0;
168 return wch == L'\0' ? 0 : want;
169 }
170
171 int
_citrus_utf8_ctype_mbsinit(const mbstate_t * __restrict ps)172 _citrus_utf8_ctype_mbsinit(const mbstate_t * __restrict ps)
173 {
174 return ((const struct _utf8_state *)ps)->want == 0;
175 }
176
177 size_t
_citrus_utf8_ctype_mbsnrtowcs(wchar_t * __restrict dst,const char ** __restrict src,size_t nmc,size_t len,mbstate_t * __restrict ps)178 _citrus_utf8_ctype_mbsnrtowcs(wchar_t * __restrict dst,
179 const char ** __restrict src, size_t nmc, size_t len,
180 mbstate_t * __restrict ps)
181 {
182 struct _utf8_state *us;
183 size_t i, o, r;
184
185 us = (struct _utf8_state *)ps;
186
187 if (dst == NULL) {
188 /*
189 * The fast path in the loop below is not safe if an ASCII
190 * character appears as anything but the first byte of a
191 * multibyte sequence. Check now to avoid doing it in the loop.
192 */
193 if (nmc > 0 && us->want > 0 && (unsigned char)(*src)[0] < 0x80) {
194 errno = EILSEQ;
195 return -1;
196 }
197 for (i = o = 0; i < nmc; i += r, o++) {
198 if ((unsigned char)(*src)[i] < 0x80) {
199 /* Fast path for plain ASCII characters. */
200 if ((*src)[i] == '\0')
201 return o;
202 r = 1;
203 } else {
204 r = _citrus_utf8_ctype_mbrtowc(NULL, *src + i,
205 nmc - i, ps);
206 if (r == (size_t)-1)
207 return r;
208 if (r == (size_t)-2)
209 return o;
210 if (r == 0)
211 return o;
212 }
213 }
214 return o;
215 }
216
217 /*
218 * The fast path in the loop below is not safe if an ASCII
219 * character appears as anything but the first byte of a
220 * multibyte sequence. Check now to avoid doing it in the loop.
221 */
222 if (len > 0 && nmc > 0 && us->want > 0 &&
223 (unsigned char)(*src)[0] < 0x80) {
224 errno = EILSEQ;
225 return -1;
226 }
227 for (i = o = 0; i < nmc && o < len; i += r, o++) {
228 if ((unsigned char)(*src)[i] < 0x80) {
229 /* Fast path for plain ASCII characters. */
230 dst[o] = (wchar_t)(unsigned char)(*src)[i];
231 if ((*src)[i] == '\0') {
232 *src = NULL;
233 return o;
234 }
235 r = 1;
236 } else {
237 r = _citrus_utf8_ctype_mbrtowc(dst + o, *src + i,
238 nmc - i, ps);
239 if (r == (size_t)-1) {
240 *src += i;
241 return r;
242 }
243 if (r == (size_t)-2) {
244 *src += nmc;
245 return o;
246 }
247 if (r == 0) {
248 *src = NULL;
249 return o;
250 }
251 }
252 }
253 *src += i;
254 return o;
255 }
256
257 size_t
_citrus_utf8_ctype_wcrtomb(char * __restrict s,wchar_t wc,mbstate_t * __restrict ps)258 _citrus_utf8_ctype_wcrtomb(char * __restrict s, wchar_t wc,
259 mbstate_t * __restrict ps)
260 {
261 struct _utf8_state *us;
262 unsigned char lead;
263 int i, len;
264
265 us = (struct _utf8_state *)ps;
266
267 if (us->want != 0) {
268 errno = EINVAL;
269 return -1;
270 }
271
272 if (s == NULL)
273 return 1;
274
275 if (wc < 0 || (wc > 0xd7ff && wc < 0xe000) || wc > 0x10ffff) {
276 errno = EILSEQ;
277 return -1;
278 }
279
280 /*
281 * Determine the number of bytes needed to represent this character.
282 * We always output the shortest sequence possible. Also specify the
283 * first few bits of the first byte, which contains the information
284 * about the sequence length.
285 */
286 if (wc <= 0x7f) {
287 /* Fast path for plain ASCII characters. */
288 *s = (char)wc;
289 return 1;
290 } else if (wc <= 0x7ff) {
291 lead = 0xc0;
292 len = 2;
293 } else if (wc <= 0xffff) {
294 lead = 0xe0;
295 len = 3;
296 } else {
297 lead = 0xf0;
298 len = 4;
299 }
300
301 /*
302 * Output the bytes representing the character in chunks
303 * of 6 bits, least significant last. The first byte is
304 * a special case because it contains the sequence length
305 * information.
306 */
307 for (i = len - 1; i > 0; i--) {
308 s[i] = (wc & 0x3f) | 0x80;
309 wc >>= 6;
310 }
311 *s = (wc & 0xff) | lead;
312
313 return len;
314 }
315
316 size_t
_citrus_utf8_ctype_wcsnrtombs(char * __restrict dst,const wchar_t ** __restrict src,size_t nwc,size_t len,mbstate_t * __restrict ps)317 _citrus_utf8_ctype_wcsnrtombs(char * __restrict dst,
318 const wchar_t ** __restrict src, size_t nwc, size_t len,
319 mbstate_t * __restrict ps)
320 {
321 struct _utf8_state *us;
322 char buf[_CITRUS_UTF8_MB_CUR_MAX];
323 size_t i, o, r;
324
325 us = (struct _utf8_state *)ps;
326
327 if (us->want != 0) {
328 errno = EINVAL;
329 return -1;
330 }
331
332 if (dst == NULL) {
333 for (i = o = 0; i < nwc; i++, o += r) {
334 wchar_t wc = (*src)[i];
335 if (wc >= 0 && wc < 0x80) {
336 /* Fast path for plain ASCII characters. */
337 if (wc == 0)
338 return o;
339 r = 1;
340 } else {
341 r = _citrus_utf8_ctype_wcrtomb(buf, wc, ps);
342 if (r == (size_t)-1)
343 return r;
344 }
345 }
346 return o;
347 }
348
349 for (i = o = 0; i < nwc && o < len; i++, o += r) {
350 wchar_t wc = (*src)[i];
351 if (wc >= 0 && wc < 0x80) {
352 /* Fast path for plain ASCII characters. */
353 dst[o] = (wchar_t)wc;
354 if (wc == 0) {
355 *src = NULL;
356 return o;
357 }
358 r = 1;
359 } else if (len - o >= _CITRUS_UTF8_MB_CUR_MAX) {
360 /* Enough space to translate in-place. */
361 r = _citrus_utf8_ctype_wcrtomb(dst + o, wc, ps);
362 if (r == (size_t)-1) {
363 *src += i;
364 return r;
365 }
366 } else {
367 /* May not be enough space; use temp buffer. */
368 r = _citrus_utf8_ctype_wcrtomb(buf, wc, ps);
369 if (r == (size_t)-1) {
370 *src += i;
371 return r;
372 }
373 if (r > len - o)
374 break;
375 memcpy(dst + o, buf, r);
376 }
377 }
378 *src += i;
379 return o;
380 }
381