14776d4e8SJohn Marino /*
28a84c799SMatthew Dillon * Copyright 2015 Matthew Dillon <dillon@backplane.com> (mbintowcr, wcrtombin)
34776d4e8SJohn Marino * Copyright 2013 Garrett D'Amore <garrett@damore.org>
44776d4e8SJohn Marino * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
50d5acd74SJohn Marino * Copyright (c) 2002-2004 Tim J. Robbins
60d5acd74SJohn Marino * All rights reserved.
70d5acd74SJohn Marino *
80d5acd74SJohn Marino * Copyright (c) 2011 The FreeBSD Foundation
90d5acd74SJohn Marino * All rights reserved.
100d5acd74SJohn Marino * Portions of this software were developed by David Chisnall
110d5acd74SJohn Marino * under sponsorship from the FreeBSD Foundation.
120d5acd74SJohn Marino *
130d5acd74SJohn Marino * Redistribution and use in source and binary forms, with or without
140d5acd74SJohn Marino * modification, are permitted provided that the following conditions
150d5acd74SJohn Marino * are met:
160d5acd74SJohn Marino * 1. Redistributions of source code must retain the above copyright
170d5acd74SJohn Marino * notice, this list of conditions and the following disclaimer.
180d5acd74SJohn Marino * 2. Redistributions in binary form must reproduce the above copyright
190d5acd74SJohn Marino * notice, this list of conditions and the following disclaimer in the
200d5acd74SJohn Marino * documentation and/or other materials provided with the distribution.
210d5acd74SJohn Marino *
220d5acd74SJohn Marino * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
230d5acd74SJohn Marino * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
240d5acd74SJohn Marino * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
250d5acd74SJohn Marino * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
260d5acd74SJohn Marino * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
270d5acd74SJohn Marino * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
280d5acd74SJohn Marino * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
290d5acd74SJohn Marino * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
300d5acd74SJohn Marino * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
310d5acd74SJohn Marino * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
320d5acd74SJohn Marino * SUCH DAMAGE.
330d5acd74SJohn Marino */
340d5acd74SJohn Marino
/*
 * Flags accepted by the mbintowcr / wcrtombin conversions:
 *
 * WCSBIN_EOF -		Indicate EOF on input buffer.
 *
 * WCSBIN_SURRO -	Pass-through surrogate space (typically if the UTF-8
 *			has already been escaped), on bytes-to-wchars and
 *			wchars-to-bytes.  Escaping of other illegal codes will
 *			still occur on input but de-escaping will not occur
 *			on output (they will remain in the surrogate space).
 *
 * WCSBIN_LONGCODES -	Allow 4-byte codes > 0x10FFFF, 5-byte and 6-byte
 *			sequences (normally illegal), otherwise escape them
 *			on input and fail on output.
 *
 * WCSBIN_STRICT -	Allow byte-to-wide conversions to fail.
 */
508a84c799SMatthew Dillon
510d5acd74SJohn Marino #include <sys/param.h>
520d5acd74SJohn Marino
530d5acd74SJohn Marino #include <errno.h>
540d5acd74SJohn Marino #include <limits.h>
550d5acd74SJohn Marino #include <runetype.h>
560d5acd74SJohn Marino #include <stdlib.h>
570d5acd74SJohn Marino #include <string.h>
580d5acd74SJohn Marino #include <wchar.h>
590d5acd74SJohn Marino #include "mblocal.h"
600d5acd74SJohn Marino
/*
 * Forward declarations of the conversion methods that _UTF8_init()
 * installs into the locale.
 */
static size_t	_UTF8_mbrtowc(wchar_t * __restrict pwc,
		    const char * __restrict s, size_t n,
		    mbstate_t * __restrict ps);
static int	_UTF8_mbsinit(const mbstate_t *ps);
static size_t	_UTF8_mbsnrtowcs(wchar_t * __restrict dst,
		    const char ** __restrict src, size_t nms, size_t len,
		    mbstate_t * __restrict ps);
static size_t	_UTF8_wcrtomb(char * __restrict s, wchar_t wc,
		    mbstate_t * __restrict ps);
static size_t	_UTF8_wcsnrtombs(char * __restrict dst,
		    const wchar_t ** __restrict src, size_t nwc, size_t len,
		    mbstate_t * __restrict ps);
static size_t	_UTF8_mbintowcr(wchar_t * __restrict dst,
		    const char * __restrict src,
		    size_t dlen, size_t *slen, int flags);
static size_t	_UTF8_wcrtombin(char * __restrict dst,
		    const wchar_t * __restrict src,
		    size_t dlen, size_t *slen, int flags);
770d5acd74SJohn Marino
/*
 * Decoder state carried between calls; the conversion routines in this
 * file access it by casting the caller's mbstate_t pointer.
 */
typedef struct {
	wchar_t	ch;		/* bits accumulated from the octets seen so far */
	int	want;		/* continuation octets still expected, 0 if none */
	wchar_t	lbound;		/* smallest code the pending sequence may encode */
} _UTF8State;
830d5acd74SJohn Marino
840d5acd74SJohn Marino int
_UTF8_init(struct xlocale_ctype * l,_RuneLocale * rl)850d5acd74SJohn Marino _UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl)
860d5acd74SJohn Marino {
870d5acd74SJohn Marino
880d5acd74SJohn Marino l->__mbrtowc = _UTF8_mbrtowc;
890d5acd74SJohn Marino l->__wcrtomb = _UTF8_wcrtomb;
900d5acd74SJohn Marino l->__mbsinit = _UTF8_mbsinit;
910d5acd74SJohn Marino l->__mbsnrtowcs = _UTF8_mbsnrtowcs;
920d5acd74SJohn Marino l->__wcsnrtombs = _UTF8_wcsnrtombs;
938a84c799SMatthew Dillon l->__mbintowcr = _UTF8_mbintowcr;
948a84c799SMatthew Dillon l->__wcrtombin = _UTF8_wcrtombin;
950d5acd74SJohn Marino l->runes = rl;
964776d4e8SJohn Marino l->__mb_cur_max = 4;
970d5acd74SJohn Marino /*
980d5acd74SJohn Marino * UCS-4 encoding used as the internal representation, so
990d5acd74SJohn Marino * slots 0x0080-0x00FF are occuped and must be excluded
1000d5acd74SJohn Marino * from the single byte ctype by setting the limit.
1010d5acd74SJohn Marino */
1020d5acd74SJohn Marino l->__mb_sb_limit = 128;
1030d5acd74SJohn Marino
1040d5acd74SJohn Marino return (0);
1050d5acd74SJohn Marino }
1060d5acd74SJohn Marino
1070d5acd74SJohn Marino static int
_UTF8_mbsinit(const mbstate_t * ps)1080d5acd74SJohn Marino _UTF8_mbsinit(const mbstate_t *ps)
1090d5acd74SJohn Marino {
1100d5acd74SJohn Marino
1110d5acd74SJohn Marino return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
1120d5acd74SJohn Marino }
1130d5acd74SJohn Marino
1140d5acd74SJohn Marino static size_t
_UTF8_mbrtowc(wchar_t * __restrict pwc,const char * __restrict s,size_t n,mbstate_t * __restrict ps)1150d5acd74SJohn Marino _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
1160d5acd74SJohn Marino mbstate_t * __restrict ps)
1170d5acd74SJohn Marino {
1180d5acd74SJohn Marino _UTF8State *us;
1190d5acd74SJohn Marino int ch, i, mask, want;
1200d5acd74SJohn Marino wchar_t lbound, wch;
1210d5acd74SJohn Marino
1220d5acd74SJohn Marino us = (_UTF8State *)ps;
1230d5acd74SJohn Marino
124594d13a0SJohn Marino if (us->want < 0 || us->want > 4) {
1250d5acd74SJohn Marino errno = EINVAL;
1260d5acd74SJohn Marino return ((size_t)-1);
1270d5acd74SJohn Marino }
1280d5acd74SJohn Marino
1290d5acd74SJohn Marino if (s == NULL) {
1300d5acd74SJohn Marino s = "";
1310d5acd74SJohn Marino n = 1;
1320d5acd74SJohn Marino pwc = NULL;
1330d5acd74SJohn Marino }
1340d5acd74SJohn Marino
1350d5acd74SJohn Marino if (n == 0)
1360d5acd74SJohn Marino /* Incomplete multibyte sequence */
1370d5acd74SJohn Marino return ((size_t)-2);
1380d5acd74SJohn Marino
1390d5acd74SJohn Marino if (us->want == 0) {
1400d5acd74SJohn Marino /*
1410d5acd74SJohn Marino * Determine the number of octets that make up this character
1420d5acd74SJohn Marino * from the first octet, and a mask that extracts the
1430d5acd74SJohn Marino * interesting bits of the first octet. We already know
1440d5acd74SJohn Marino * the character is at least two bytes long.
1450d5acd74SJohn Marino *
1460d5acd74SJohn Marino * We also specify a lower bound for the character code to
1470d5acd74SJohn Marino * detect redundant, non-"shortest form" encodings. For
1480d5acd74SJohn Marino * example, the sequence C0 80 is _not_ a legal representation
1490d5acd74SJohn Marino * of the null character. This enforces a 1-to-1 mapping
1500d5acd74SJohn Marino * between character codes and their multibyte representations.
1510d5acd74SJohn Marino */
1520d5acd74SJohn Marino ch = (unsigned char)*s;
1530d5acd74SJohn Marino if ((ch & 0x80) == 0) {
1544776d4e8SJohn Marino /* Fast path for plain ASCII characters. */
1554776d4e8SJohn Marino if (pwc != NULL)
1564776d4e8SJohn Marino *pwc = ch;
1574776d4e8SJohn Marino return (ch != '\0' ? 1 : 0);
1584776d4e8SJohn Marino }
1594776d4e8SJohn Marino if ((ch & 0xe0) == 0xc0) {
1600d5acd74SJohn Marino mask = 0x1f;
1610d5acd74SJohn Marino want = 2;
1620d5acd74SJohn Marino lbound = 0x80;
1630d5acd74SJohn Marino } else if ((ch & 0xf0) == 0xe0) {
1640d5acd74SJohn Marino mask = 0x0f;
1650d5acd74SJohn Marino want = 3;
1660d5acd74SJohn Marino lbound = 0x800;
1670d5acd74SJohn Marino } else if ((ch & 0xf8) == 0xf0) {
1680d5acd74SJohn Marino mask = 0x07;
1690d5acd74SJohn Marino want = 4;
1700d5acd74SJohn Marino lbound = 0x10000;
1710d5acd74SJohn Marino } else {
1720d5acd74SJohn Marino /*
1730d5acd74SJohn Marino * Malformed input; input is not UTF-8.
1740d5acd74SJohn Marino */
1750d5acd74SJohn Marino errno = EILSEQ;
1760d5acd74SJohn Marino return ((size_t)-1);
1770d5acd74SJohn Marino }
1780d5acd74SJohn Marino } else {
1790d5acd74SJohn Marino want = us->want;
1800d5acd74SJohn Marino lbound = us->lbound;
1810d5acd74SJohn Marino }
1820d5acd74SJohn Marino
1830d5acd74SJohn Marino /*
1840d5acd74SJohn Marino * Decode the octet sequence representing the character in chunks
1850d5acd74SJohn Marino * of 6 bits, most significant first.
1860d5acd74SJohn Marino */
1870d5acd74SJohn Marino if (us->want == 0)
1880d5acd74SJohn Marino wch = (unsigned char)*s++ & mask;
1890d5acd74SJohn Marino else
1900d5acd74SJohn Marino wch = us->ch;
1914776d4e8SJohn Marino
1920d5acd74SJohn Marino for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
1930d5acd74SJohn Marino if ((*s & 0xc0) != 0x80) {
1940d5acd74SJohn Marino /*
1950d5acd74SJohn Marino * Malformed input; bad characters in the middle
1960d5acd74SJohn Marino * of a character.
1970d5acd74SJohn Marino */
1980d5acd74SJohn Marino errno = EILSEQ;
1990d5acd74SJohn Marino return ((size_t)-1);
2000d5acd74SJohn Marino }
2010d5acd74SJohn Marino wch <<= 6;
2020d5acd74SJohn Marino wch |= *s++ & 0x3f;
2030d5acd74SJohn Marino }
2040d5acd74SJohn Marino if (i < want) {
2050d5acd74SJohn Marino /* Incomplete multibyte sequence. */
2060d5acd74SJohn Marino us->want = want - i;
2070d5acd74SJohn Marino us->lbound = lbound;
2080d5acd74SJohn Marino us->ch = wch;
2090d5acd74SJohn Marino return ((size_t)-2);
2100d5acd74SJohn Marino }
211*3d4b9338SJohn Marino if (wch < lbound || wch > 0x10ffff) {
2120d5acd74SJohn Marino /*
2138a84c799SMatthew Dillon * Malformed input; redundant encoding or illegal
2148a84c799SMatthew Dillon * code sequence.
2150d5acd74SJohn Marino */
2160d5acd74SJohn Marino errno = EILSEQ;
2170d5acd74SJohn Marino return ((size_t)-1);
2180d5acd74SJohn Marino }
2190d5acd74SJohn Marino if (pwc != NULL)
2200d5acd74SJohn Marino *pwc = wch;
2210d5acd74SJohn Marino us->want = 0;
2220d5acd74SJohn Marino return (wch == L'\0' ? 0 : want);
2230d5acd74SJohn Marino }
2240d5acd74SJohn Marino
2250d5acd74SJohn Marino static size_t
_UTF8_mbsnrtowcs(wchar_t * __restrict dst,const char ** __restrict src,size_t nms,size_t len,mbstate_t * __restrict ps)2260d5acd74SJohn Marino _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,
2270d5acd74SJohn Marino size_t nms, size_t len, mbstate_t * __restrict ps)
2280d5acd74SJohn Marino {
2290d5acd74SJohn Marino _UTF8State *us;
2300d5acd74SJohn Marino const char *s;
2310d5acd74SJohn Marino size_t nchr;
2320d5acd74SJohn Marino wchar_t wc;
2330d5acd74SJohn Marino size_t nb;
2340d5acd74SJohn Marino
2350d5acd74SJohn Marino us = (_UTF8State *)ps;
2360d5acd74SJohn Marino
2370d5acd74SJohn Marino s = *src;
2380d5acd74SJohn Marino nchr = 0;
2390d5acd74SJohn Marino
2400d5acd74SJohn Marino if (dst == NULL) {
2410d5acd74SJohn Marino /*
2420d5acd74SJohn Marino * The fast path in the loop below is not safe if an ASCII
2430d5acd74SJohn Marino * character appears as anything but the first byte of a
2440d5acd74SJohn Marino * multibyte sequence. Check now to avoid doing it in the loop.
2450d5acd74SJohn Marino */
2460d5acd74SJohn Marino if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
2470d5acd74SJohn Marino errno = EILSEQ;
2480d5acd74SJohn Marino return ((size_t)-1);
2490d5acd74SJohn Marino }
2500d5acd74SJohn Marino for (;;) {
2510d5acd74SJohn Marino if (nms > 0 && (signed char)*s > 0)
2520d5acd74SJohn Marino /*
2530d5acd74SJohn Marino * Fast path for plain ASCII characters
2540d5acd74SJohn Marino * excluding NUL.
2550d5acd74SJohn Marino */
2560d5acd74SJohn Marino nb = 1;
2570d5acd74SJohn Marino else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
2580d5acd74SJohn Marino (size_t)-1)
2590d5acd74SJohn Marino /* Invalid sequence - mbrtowc() sets errno. */
2600d5acd74SJohn Marino return ((size_t)-1);
2610d5acd74SJohn Marino else if (nb == 0 || nb == (size_t)-2)
2620d5acd74SJohn Marino return (nchr);
2630d5acd74SJohn Marino s += nb;
2640d5acd74SJohn Marino nms -= nb;
2650d5acd74SJohn Marino nchr++;
2660d5acd74SJohn Marino }
2670d5acd74SJohn Marino /*NOTREACHED*/
2680d5acd74SJohn Marino }
2690d5acd74SJohn Marino
2700d5acd74SJohn Marino /*
2710d5acd74SJohn Marino * The fast path in the loop below is not safe if an ASCII
2720d5acd74SJohn Marino * character appears as anything but the first byte of a
2730d5acd74SJohn Marino * multibyte sequence. Check now to avoid doing it in the loop.
2740d5acd74SJohn Marino */
2750d5acd74SJohn Marino if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
2760d5acd74SJohn Marino errno = EILSEQ;
2770d5acd74SJohn Marino return ((size_t)-1);
2780d5acd74SJohn Marino }
2790d5acd74SJohn Marino while (len-- > 0) {
2800d5acd74SJohn Marino if (nms > 0 && (signed char)*s > 0) {
2810d5acd74SJohn Marino /*
2820d5acd74SJohn Marino * Fast path for plain ASCII characters
2830d5acd74SJohn Marino * excluding NUL.
2840d5acd74SJohn Marino */
2850d5acd74SJohn Marino *dst = (wchar_t)*s;
2860d5acd74SJohn Marino nb = 1;
2870d5acd74SJohn Marino } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
2880d5acd74SJohn Marino (size_t)-1) {
2890d5acd74SJohn Marino *src = s;
2900d5acd74SJohn Marino return ((size_t)-1);
2910d5acd74SJohn Marino } else if (nb == (size_t)-2) {
2920d5acd74SJohn Marino *src = s + nms;
2930d5acd74SJohn Marino return (nchr);
2940d5acd74SJohn Marino } else if (nb == 0) {
2950d5acd74SJohn Marino *src = NULL;
2960d5acd74SJohn Marino return (nchr);
2970d5acd74SJohn Marino }
2980d5acd74SJohn Marino s += nb;
2990d5acd74SJohn Marino nms -= nb;
3000d5acd74SJohn Marino nchr++;
3010d5acd74SJohn Marino dst++;
3020d5acd74SJohn Marino }
3030d5acd74SJohn Marino *src = s;
3040d5acd74SJohn Marino return (nchr);
3050d5acd74SJohn Marino }
3060d5acd74SJohn Marino
3070d5acd74SJohn Marino static size_t
_UTF8_wcrtomb(char * __restrict s,wchar_t wc,mbstate_t * __restrict ps)3080d5acd74SJohn Marino _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
3090d5acd74SJohn Marino {
3100d5acd74SJohn Marino _UTF8State *us;
3110d5acd74SJohn Marino unsigned char lead;
3120d5acd74SJohn Marino int i, len;
3130d5acd74SJohn Marino
3140d5acd74SJohn Marino us = (_UTF8State *)ps;
3150d5acd74SJohn Marino
3160d5acd74SJohn Marino if (us->want != 0) {
3170d5acd74SJohn Marino errno = EINVAL;
3180d5acd74SJohn Marino return ((size_t)-1);
3190d5acd74SJohn Marino }
3200d5acd74SJohn Marino
3210d5acd74SJohn Marino if (s == NULL)
3220d5acd74SJohn Marino /* Reset to initial shift state (no-op) */
3230d5acd74SJohn Marino return (1);
3240d5acd74SJohn Marino
3250d5acd74SJohn Marino /*
3260d5acd74SJohn Marino * Determine the number of octets needed to represent this character.
3270d5acd74SJohn Marino * We always output the shortest sequence possible. Also specify the
3280d5acd74SJohn Marino * first few bits of the first octet, which contains the information
3290d5acd74SJohn Marino * about the sequence length.
3300d5acd74SJohn Marino */
3310d5acd74SJohn Marino if ((wc & ~0x7f) == 0) {
3324776d4e8SJohn Marino /* Fast path for plain ASCII characters. */
3334776d4e8SJohn Marino *s = (char)wc;
3344776d4e8SJohn Marino return (1);
3350d5acd74SJohn Marino } else if ((wc & ~0x7ff) == 0) {
3360d5acd74SJohn Marino lead = 0xc0;
3370d5acd74SJohn Marino len = 2;
3380d5acd74SJohn Marino } else if ((wc & ~0xffff) == 0) {
3390d5acd74SJohn Marino lead = 0xe0;
3400d5acd74SJohn Marino len = 3;
341*3d4b9338SJohn Marino } else if (wc <= 0x10ffff) {
3420d5acd74SJohn Marino lead = 0xf0;
3430d5acd74SJohn Marino len = 4;
3440d5acd74SJohn Marino } else {
3450d5acd74SJohn Marino errno = EILSEQ;
3460d5acd74SJohn Marino return ((size_t)-1);
3470d5acd74SJohn Marino }
3480d5acd74SJohn Marino
3490d5acd74SJohn Marino /*
3500d5acd74SJohn Marino * Output the octets representing the character in chunks
3510d5acd74SJohn Marino * of 6 bits, least significant last. The first octet is
3520d5acd74SJohn Marino * a special case because it contains the sequence length
3530d5acd74SJohn Marino * information.
3540d5acd74SJohn Marino */
3550d5acd74SJohn Marino for (i = len - 1; i > 0; i--) {
3560d5acd74SJohn Marino s[i] = (wc & 0x3f) | 0x80;
3570d5acd74SJohn Marino wc >>= 6;
3580d5acd74SJohn Marino }
3590d5acd74SJohn Marino *s = (wc & 0xff) | lead;
3600d5acd74SJohn Marino
3610d5acd74SJohn Marino return (len);
3620d5acd74SJohn Marino }
3630d5acd74SJohn Marino
3640d5acd74SJohn Marino static size_t
_UTF8_wcsnrtombs(char * __restrict dst,const wchar_t ** __restrict src,size_t nwc,size_t len,mbstate_t * __restrict ps)3650d5acd74SJohn Marino _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
3660d5acd74SJohn Marino size_t nwc, size_t len, mbstate_t * __restrict ps)
3670d5acd74SJohn Marino {
3680d5acd74SJohn Marino _UTF8State *us;
3690d5acd74SJohn Marino char buf[MB_LEN_MAX];
3700d5acd74SJohn Marino const wchar_t *s;
3710d5acd74SJohn Marino size_t nbytes;
3720d5acd74SJohn Marino size_t nb;
3730d5acd74SJohn Marino
3740d5acd74SJohn Marino us = (_UTF8State *)ps;
3750d5acd74SJohn Marino
3760d5acd74SJohn Marino if (us->want != 0) {
3770d5acd74SJohn Marino errno = EINVAL;
3780d5acd74SJohn Marino return ((size_t)-1);
3790d5acd74SJohn Marino }
3800d5acd74SJohn Marino
3810d5acd74SJohn Marino s = *src;
3820d5acd74SJohn Marino nbytes = 0;
3830d5acd74SJohn Marino
3840d5acd74SJohn Marino if (dst == NULL) {
3850d5acd74SJohn Marino while (nwc-- > 0) {
3860d5acd74SJohn Marino if (0 <= *s && *s < 0x80)
3870d5acd74SJohn Marino /* Fast path for plain ASCII characters. */
3880d5acd74SJohn Marino nb = 1;
3890d5acd74SJohn Marino else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
3900d5acd74SJohn Marino (size_t)-1)
3910d5acd74SJohn Marino /* Invalid character - wcrtomb() sets errno. */
3920d5acd74SJohn Marino return ((size_t)-1);
3930d5acd74SJohn Marino if (*s == L'\0')
3940d5acd74SJohn Marino return (nbytes + nb - 1);
3950d5acd74SJohn Marino s++;
3960d5acd74SJohn Marino nbytes += nb;
3970d5acd74SJohn Marino }
3980d5acd74SJohn Marino return (nbytes);
3990d5acd74SJohn Marino }
4000d5acd74SJohn Marino
4010d5acd74SJohn Marino while (len > 0 && nwc-- > 0) {
4020d5acd74SJohn Marino if (0 <= *s && *s < 0x80) {
4030d5acd74SJohn Marino /* Fast path for plain ASCII characters. */
4040d5acd74SJohn Marino nb = 1;
4050d5acd74SJohn Marino *dst = *s;
4060d5acd74SJohn Marino } else if (len > (size_t)MB_CUR_MAX) {
4070d5acd74SJohn Marino /* Enough space to translate in-place. */
4080d5acd74SJohn Marino if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
4090d5acd74SJohn Marino *src = s;
4100d5acd74SJohn Marino return ((size_t)-1);
4110d5acd74SJohn Marino }
4120d5acd74SJohn Marino } else {
4130d5acd74SJohn Marino /*
4140d5acd74SJohn Marino * May not be enough space; use temp. buffer.
4150d5acd74SJohn Marino */
4160d5acd74SJohn Marino if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
4170d5acd74SJohn Marino *src = s;
4180d5acd74SJohn Marino return ((size_t)-1);
4190d5acd74SJohn Marino }
4200d5acd74SJohn Marino if (nb > (int)len)
4210d5acd74SJohn Marino /* MB sequence for character won't fit. */
4220d5acd74SJohn Marino break;
4234776d4e8SJohn Marino (void) memcpy(dst, buf, nb);
4240d5acd74SJohn Marino }
4250d5acd74SJohn Marino if (*s == L'\0') {
4260d5acd74SJohn Marino *src = NULL;
4270d5acd74SJohn Marino return (nbytes + nb - 1);
4280d5acd74SJohn Marino }
4290d5acd74SJohn Marino s++;
4300d5acd74SJohn Marino dst += nb;
4310d5acd74SJohn Marino len -= nb;
4320d5acd74SJohn Marino nbytes += nb;
4330d5acd74SJohn Marino }
4340d5acd74SJohn Marino *src = s;
4350d5acd74SJohn Marino return (nbytes);
4360d5acd74SJohn Marino }
4378a84c799SMatthew Dillon
4388a84c799SMatthew Dillon /*
4398a84c799SMatthew Dillon * Clean binary to wchar buffer conversions. This is basically like a normal
4408a84c799SMatthew Dillon * buffer conversion but with a sane argument API and escaping. See none.c
4418a84c799SMatthew Dillon * for a more complete description.
4428a84c799SMatthew Dillon */
4438a84c799SMatthew Dillon static size_t
_UTF8_mbintowcr(wchar_t * __restrict dst,const char * __restrict src,size_t dlen,size_t * slen,int flags)4448a84c799SMatthew Dillon _UTF8_mbintowcr(wchar_t * __restrict dst, const char * __restrict src,
4458a84c799SMatthew Dillon size_t dlen, size_t *slen, int flags)
4468a84c799SMatthew Dillon {
4478a84c799SMatthew Dillon size_t i;
4488a84c799SMatthew Dillon size_t j;
4498a84c799SMatthew Dillon size_t k;
4508a84c799SMatthew Dillon size_t n = *slen;
4518a84c799SMatthew Dillon int ch, mask, want;
4528a84c799SMatthew Dillon wchar_t lbound, wch;
4538a84c799SMatthew Dillon
4548a84c799SMatthew Dillon for (i = j = 0; i < n; ++i) {
4558a84c799SMatthew Dillon if (j == dlen)
4568a84c799SMatthew Dillon break;
4578a84c799SMatthew Dillon ch = (unsigned char)src[i];
4588a84c799SMatthew Dillon
4598a84c799SMatthew Dillon if ((ch & 0x80) == 0) {
4608a84c799SMatthew Dillon /* Fast path for plain ASCII characters. */
4618a84c799SMatthew Dillon if (dst)
4628a84c799SMatthew Dillon dst[j] = ch;
4638a84c799SMatthew Dillon ++j;
4648a84c799SMatthew Dillon continue;
4658a84c799SMatthew Dillon }
4668a84c799SMatthew Dillon if ((ch & 0xe0) == 0xc0) {
4678a84c799SMatthew Dillon mask = 0x1f;
4688a84c799SMatthew Dillon want = 2;
4698a84c799SMatthew Dillon lbound = 0x80;
4708a84c799SMatthew Dillon } else if ((ch & 0xf0) == 0xe0) {
4718a84c799SMatthew Dillon mask = 0x0f;
4728a84c799SMatthew Dillon want = 3;
4738a84c799SMatthew Dillon lbound = 0x800;
4748a84c799SMatthew Dillon } else if ((ch & 0xf8) == 0xf0) {
4758a84c799SMatthew Dillon mask = 0x07;
4768a84c799SMatthew Dillon want = 4;
4778a84c799SMatthew Dillon lbound = 0x10000;
4788a84c799SMatthew Dillon } else if ((ch & 0xfc) == 0xf8) {
4798a84c799SMatthew Dillon /* normally illegal, handled down below */
4808a84c799SMatthew Dillon mask = 0x03;
4818a84c799SMatthew Dillon want = 5;
4828a84c799SMatthew Dillon lbound = 0x200000;
4838a84c799SMatthew Dillon } else if ((ch & 0xfe) == 0xfc) {
4848a84c799SMatthew Dillon /* normally illegal, handled down below */
4858a84c799SMatthew Dillon mask = 0x01;
4868a84c799SMatthew Dillon want = 6;
4878a84c799SMatthew Dillon lbound = 0x4000000;
4888a84c799SMatthew Dillon } else {
4898a84c799SMatthew Dillon /*
4908a84c799SMatthew Dillon * Malformed input; input is not UTF-8, escape
4918a84c799SMatthew Dillon * with UTF-8B.
4928a84c799SMatthew Dillon */
4938a84c799SMatthew Dillon if (flags & WCSBIN_STRICT) {
4948a84c799SMatthew Dillon if (i == 0) {
4958a84c799SMatthew Dillon errno = EILSEQ;
4968a84c799SMatthew Dillon return ((size_t)-1);
4978a84c799SMatthew Dillon }
4988a84c799SMatthew Dillon break;
4998a84c799SMatthew Dillon }
5008a84c799SMatthew Dillon if (dst)
5018a84c799SMatthew Dillon dst[j] = 0xDC00 | ch;
5028a84c799SMatthew Dillon ++j;
5038a84c799SMatthew Dillon continue;
5048a84c799SMatthew Dillon }
5058a84c799SMatthew Dillon
5068a84c799SMatthew Dillon /*
5078a84c799SMatthew Dillon * Construct wchar_t from multibyte sequence.
5088a84c799SMatthew Dillon */
5098a84c799SMatthew Dillon wch = ch & mask;
5108a84c799SMatthew Dillon for (k = 1; k < want; ++k) {
5118a84c799SMatthew Dillon /*
5128a84c799SMatthew Dillon * Stop if not enough input (don't do this early
5138a84c799SMatthew Dillon * so we can detect illegal characters as they occur
5148a84c799SMatthew Dillon * in the stream).
5158a84c799SMatthew Dillon *
5168a84c799SMatthew Dillon * If termination is requested force-escape all chars.
5178a84c799SMatthew Dillon */
5188a84c799SMatthew Dillon if (i + k >= n) {
5198a84c799SMatthew Dillon if (flags & WCSBIN_EOF) {
5208a84c799SMatthew Dillon want = n - i;
5218a84c799SMatthew Dillon goto forceesc;
5228a84c799SMatthew Dillon }
5238a84c799SMatthew Dillon goto breakout;
5248a84c799SMatthew Dillon }
5258a84c799SMatthew Dillon
5268a84c799SMatthew Dillon ch = src[i+k];
5278a84c799SMatthew Dillon if ((ch & 0xc0) != 0x80) {
5288a84c799SMatthew Dillon /*
5298a84c799SMatthew Dillon * Malformed input, bad characters in the
5308a84c799SMatthew Dillon * middle of a multibyte sequence. Escape
5318a84c799SMatthew Dillon * with UTF-8B.
5328a84c799SMatthew Dillon */
5338a84c799SMatthew Dillon if (flags & WCSBIN_STRICT) {
5348a84c799SMatthew Dillon if (i == 0) {
5358a84c799SMatthew Dillon errno = EILSEQ;
5368a84c799SMatthew Dillon return ((size_t)-1);
5378a84c799SMatthew Dillon }
5388a84c799SMatthew Dillon goto breakout;
5398a84c799SMatthew Dillon }
5408a84c799SMatthew Dillon if (dst)
5418a84c799SMatthew Dillon dst[j] = 0xDC00 | (unsigned char)src[i];
5428a84c799SMatthew Dillon ++j;
5438a84c799SMatthew Dillon goto loopup;
5448a84c799SMatthew Dillon }
5458a84c799SMatthew Dillon wch <<= 6;
5468a84c799SMatthew Dillon wch |= ch & 0x3f;
5478a84c799SMatthew Dillon }
5488a84c799SMatthew Dillon
5498a84c799SMatthew Dillon /*
5508a84c799SMatthew Dillon * Check validity of the wchar. If invalid we could escape
5518a84c799SMatthew Dillon * just the first character and loop up, but it ought to be
5528a84c799SMatthew Dillon * more readable if we escape all the chars in the sequence
5538a84c799SMatthew Dillon * (since they are all >= 0x80 and might represent a legacy
5548a84c799SMatthew Dillon * 5-byte or 6-byte code).
5558a84c799SMatthew Dillon */
5568a84c799SMatthew Dillon if (wch < lbound ||
557*3d4b9338SJohn Marino ((flags & WCSBIN_LONGCODES) == 0 && wch > 0x10ffff)) {
5588a84c799SMatthew Dillon goto forceesc;
5598a84c799SMatthew Dillon }
5608a84c799SMatthew Dillon
5618a84c799SMatthew Dillon /*
5628a84c799SMatthew Dillon * Check if wch is a surrogate code (which also encloses our
5638a84c799SMatthew Dillon * UTF-8B escaping range). This is normally illegal in UTF8.
5648a84c799SMatthew Dillon * If it is, we need to escape each characer in the sequence.
5658a84c799SMatthew Dillon * Breakout if there isn't enough output buffer space.
5668a84c799SMatthew Dillon *
5678a84c799SMatthew Dillon * If (flags & WCSBIN_SURRO) the caller wishes to accept
5688a84c799SMatthew Dillon * surrogate codes, i.e. the input might potentially already
5698a84c799SMatthew Dillon * be escaped UTF8-B or unchecked UTF-16 that was converted
5708a84c799SMatthew Dillon * into UTF-8.
5718a84c799SMatthew Dillon */
5728a84c799SMatthew Dillon if ((flags & WCSBIN_SURRO) == 0 &&
5738a84c799SMatthew Dillon wch >= 0xD800 && wch <= 0xDFFF) {
5748a84c799SMatthew Dillon forceesc:
5758a84c799SMatthew Dillon if (j + want > dlen)
5768a84c799SMatthew Dillon break;
5778a84c799SMatthew Dillon if (flags & WCSBIN_STRICT) {
5788a84c799SMatthew Dillon if (i == 0) {
5798a84c799SMatthew Dillon errno = EILSEQ;
5808a84c799SMatthew Dillon return ((size_t)-1);
5818a84c799SMatthew Dillon }
5828a84c799SMatthew Dillon break;
5838a84c799SMatthew Dillon }
5848a84c799SMatthew Dillon for (k = 0; k < want; ++k) {
5858a84c799SMatthew Dillon if (dst) {
5868a84c799SMatthew Dillon dst[j] = 0xDC00 |
5878a84c799SMatthew Dillon (unsigned char)src[i+k];
5888a84c799SMatthew Dillon }
5898a84c799SMatthew Dillon ++j;
5908a84c799SMatthew Dillon }
5918a84c799SMatthew Dillon i += k - 1;
5928a84c799SMatthew Dillon } else {
5938a84c799SMatthew Dillon i += k - 1;
5948a84c799SMatthew Dillon if (dst)
5958a84c799SMatthew Dillon dst[j] = wch;
5968a84c799SMatthew Dillon ++j;
5978a84c799SMatthew Dillon }
5988a84c799SMatthew Dillon loopup:
5998a84c799SMatthew Dillon ;
6008a84c799SMatthew Dillon }
6018a84c799SMatthew Dillon breakout:
6028a84c799SMatthew Dillon *slen = i;
6038a84c799SMatthew Dillon
6048a84c799SMatthew Dillon return j;
6058a84c799SMatthew Dillon }
6068a84c799SMatthew Dillon
6078a84c799SMatthew Dillon static size_t
_UTF8_wcrtombin(char * __restrict dst,const wchar_t * __restrict src,size_t dlen,size_t * slen,int flags)6088a84c799SMatthew Dillon _UTF8_wcrtombin(char * __restrict dst, const wchar_t * __restrict src,
6098a84c799SMatthew Dillon size_t dlen, size_t *slen, int flags)
6108a84c799SMatthew Dillon {
6118a84c799SMatthew Dillon size_t i;
6128a84c799SMatthew Dillon size_t j;
6138a84c799SMatthew Dillon size_t k;
6148a84c799SMatthew Dillon size_t n = *slen;
6158a84c799SMatthew Dillon size_t len;
6168a84c799SMatthew Dillon unsigned char lead;
6178a84c799SMatthew Dillon wchar_t wc;
6188a84c799SMatthew Dillon
6198a84c799SMatthew Dillon for (i = j = 0; i < n; ++i) {
6208a84c799SMatthew Dillon if (j == dlen)
6218a84c799SMatthew Dillon break;
6228a84c799SMatthew Dillon wc = src[i];
6238a84c799SMatthew Dillon
6248a84c799SMatthew Dillon if ((wc & ~0x7f) == 0) {
6258a84c799SMatthew Dillon /* Fast path for plain ASCII characters. */
6268a84c799SMatthew Dillon if (dst)
6278a84c799SMatthew Dillon dst[j] = (unsigned char)wc;
6288a84c799SMatthew Dillon ++j;
6298a84c799SMatthew Dillon continue;
6308a84c799SMatthew Dillon }
6318a84c799SMatthew Dillon if ((wc & ~0x7ff) == 0) {
6328a84c799SMatthew Dillon lead = 0xc0;
6338a84c799SMatthew Dillon len = 2;
6348a84c799SMatthew Dillon } else if (wc >= 0xDC80 && wc <= 0xDCFF &&
6358a84c799SMatthew Dillon (flags & WCSBIN_SURRO) == 0) {
6368a84c799SMatthew Dillon if (flags & WCSBIN_STRICT) {
6378a84c799SMatthew Dillon /*
6388a84c799SMatthew Dillon * STRICT without SURRO is an error for
6398a84c799SMatthew Dillon * surrogates.
6408a84c799SMatthew Dillon */
6418a84c799SMatthew Dillon if (i == 0) {
6428a84c799SMatthew Dillon errno = EILSEQ;
6438a84c799SMatthew Dillon return ((size_t)-1);
6448a84c799SMatthew Dillon }
6458a84c799SMatthew Dillon break;
6468a84c799SMatthew Dillon }
6478a84c799SMatthew Dillon if (dst)
6488a84c799SMatthew Dillon dst[j] = (unsigned char)wc;
6498a84c799SMatthew Dillon ++j;
6508a84c799SMatthew Dillon continue;
6518a84c799SMatthew Dillon } else if ((wc & ~0xffff) == 0) {
6528a84c799SMatthew Dillon if (wc >= 0xD800 && wc <= 0xDFFF &&
6538a84c799SMatthew Dillon (flags & (WCSBIN_SURRO | WCSBIN_STRICT)) ==
6548a84c799SMatthew Dillon WCSBIN_STRICT) {
6558a84c799SMatthew Dillon /*
6568a84c799SMatthew Dillon * Surrogates in general are an error
6578a84c799SMatthew Dillon * if STRICT is specified and SURRO is not
6588a84c799SMatthew Dillon * specified.
6598a84c799SMatthew Dillon */
6608a84c799SMatthew Dillon if (i == 0) {
6618a84c799SMatthew Dillon errno = EILSEQ;
6628a84c799SMatthew Dillon return ((size_t)-1);
6638a84c799SMatthew Dillon }
6648a84c799SMatthew Dillon break;
6658a84c799SMatthew Dillon }
6668a84c799SMatthew Dillon lead = 0xe0;
6678a84c799SMatthew Dillon len = 3;
668*3d4b9338SJohn Marino } else if (wc <= 0x10ffff) {
6698a84c799SMatthew Dillon lead = 0xf0;
6708a84c799SMatthew Dillon len = 4;
6718a84c799SMatthew Dillon } else if ((flags & WCSBIN_LONGCODES) && wc < 0x200000) {
6728a84c799SMatthew Dillon /* normally illegal */
6738a84c799SMatthew Dillon lead = 0xf0;
6748a84c799SMatthew Dillon len = 4;
6758a84c799SMatthew Dillon } else if ((flags & WCSBIN_LONGCODES) && wc < 0x4000000) {
6768a84c799SMatthew Dillon /* normally illegal */
6778a84c799SMatthew Dillon lead = 0xf8;
6788a84c799SMatthew Dillon len = 5;
6798a84c799SMatthew Dillon } else if ((flags & WCSBIN_LONGCODES) &&
6808a84c799SMatthew Dillon (uint32_t)wc < 0x80000000U) {
6818a84c799SMatthew Dillon /* normally illegal */
6828a84c799SMatthew Dillon lead = 0xfc;
6838a84c799SMatthew Dillon len = 6;
6848a84c799SMatthew Dillon } else {
6858a84c799SMatthew Dillon if (i == 0) {
6868a84c799SMatthew Dillon errno = EILSEQ;
6878a84c799SMatthew Dillon return ((size_t)-1);
6888a84c799SMatthew Dillon }
6898a84c799SMatthew Dillon /* stop here, process error on next loop */
6908a84c799SMatthew Dillon break;
6918a84c799SMatthew Dillon }
6928a84c799SMatthew Dillon
6938a84c799SMatthew Dillon /*
6948a84c799SMatthew Dillon * Output the octets representing the character in chunks
6958a84c799SMatthew Dillon * of 6 bits, least significant last. The first octet is
6968a84c799SMatthew Dillon * a special case because it contains the sequence length
6978a84c799SMatthew Dillon * information.
6988a84c799SMatthew Dillon */
6998a84c799SMatthew Dillon if (j + len > dlen)
7008a84c799SMatthew Dillon break;
7018a84c799SMatthew Dillon k = j;
7028a84c799SMatthew Dillon j += len;
7038a84c799SMatthew Dillon if (dst) {
7048a84c799SMatthew Dillon while (--len > 0) {
7058a84c799SMatthew Dillon dst[k + len] = (wc & 0x3f) | 0x80;
7068a84c799SMatthew Dillon wc >>= 6;
7078a84c799SMatthew Dillon }
7088a84c799SMatthew Dillon dst[k] = (wc & 0xff) | lead;
7098a84c799SMatthew Dillon }
7108a84c799SMatthew Dillon }
7118a84c799SMatthew Dillon *slen = i;
7128a84c799SMatthew Dillon
7138a84c799SMatthew Dillon return j;
7148a84c799SMatthew Dillon }
7158a84c799SMatthew Dillon
/*
 * Exported UTF-8 octets-to-wide conversion; forwards all arguments
 * unchanged to _UTF8_mbintowcr() and returns its result.
 */
size_t
utf8towcr(wchar_t * __restrict dst, const char * __restrict src,
    size_t dlen, size_t *slen, int flags)
{
	size_t nwide;

	nwide = _UTF8_mbintowcr(dst, src, dlen, slen, flags);
	return (nwide);
}
7228a84c799SMatthew Dillon
/*
 * Exported wide-to-UTF-8 octets conversion; forwards all arguments
 * unchanged to _UTF8_wcrtombin() and returns its result.
 */
size_t
wcrtoutf8(char * __restrict dst, const wchar_t * __restrict src,
    size_t dlen, size_t *slen, int flags)
{
	size_t noctets;

	noctets = _UTF8_wcrtombin(dst, src, dlen, slen, flags);
	return (noctets);
}
729