1 /* $NetBSD: mbrtoc32.c,v 1.9 2024/08/20 17:43:24 riastradh Exp $ */ 2 3 /*- 4 * Copyright (c) 2024 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 * POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /* 30 * mbrtoc32(&c32, s, n, ps) 31 * 32 * Decode a Unicode UTF-32 code unit from up to n bytes out of the 33 * multibyte string s, and store it at c32, using multibyte 34 * encoding state ps. A UTF-32 code unit is also a Unicode scalar 35 * value, which is any Unicode code point except a surrogate. 36 * 37 * Return the number of bytes consumed on success, or 0 if the 38 * code unit is NUL, or (size_t)-2 if the input is incomplete, or 39 * (size_t)-1 on error with errno set to EILSEQ. 40 * 41 * In the case of incomplete input, the decoding state so far 42 * after processing s[0], s[1], ..., s[n - 1] is saved in ps, so 43 * subsequent calls to mbrtoc32 will pick up n bytes later into 44 * the input stream. 45 * 46 * References: 47 * 48 * The Unicode Standard, Version 15.0 -- Core Specification, The 49 * Unicode Consortium, Sec. 3.8 `Surrogates', p. 118. 50 * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144 51 * https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144 52 */ 53 54 #include <sys/cdefs.h> 55 __RCSID("$NetBSD: mbrtoc32.c,v 1.9 2024/08/20 17:43:24 riastradh Exp $"); 56 57 #include "namespace.h" 58 59 #include <sys/param.h> /* MIN */ 60 #include <sys/types.h> /* broken citrus_*.h */ 61 #include <sys/queue.h> /* broken citrus_*.h */ 62 63 #include <assert.h> 64 #include <errno.h> 65 #include <langinfo.h> 66 #include <limits.h> 67 #include <locale.h> 68 #include <paths.h> 69 #include <stdalign.h> 70 #include <stddef.h> 71 #include <stdlib.h> 72 #include <string.h> 73 #include <uchar.h> 74 #include <wchar.h> 75 76 #include "citrus_types.h" /* broken citrus_iconv.h */ 77 #include "citrus_module.h" /* broken citrus_iconv.h */ 78 #include "citrus_hash.h" /* broken citrus_iconv.h */ 79 #include "citrus_iconv.h" 80 #include "setlocale_local.h" 81 82 #include "mbrtoc32.h" 83 84 __CTASSERT(sizeof(struct mbrtoc32state) <= sizeof(mbstate_t)); 85 __CTASSERT(alignof(struct mbrtoc32state) <= alignof(mbstate_t)); 86 87 #ifdef __weak_alias 88 __weak_alias(mbrtoc32,_mbrtoc32) 89 __weak_alias(mbrtoc32_l,_mbrtoc32_l) 90 #endif 91 92 size_t 93 mbrtoc32(char32_t *restrict pc32, const char *restrict s, size_t n, 94 mbstate_t *restrict ps) 95 { 96 97 return mbrtoc32_l(pc32, s, n, ps, _current_locale()); 98 } 99 100 size_t 101 mbrtoc32_l(char32_t *restrict pc32, const char *restrict s, size_t n, 102 mbstate_t *restrict ps, locale_t restrict loc) 103 { 104 static mbstate_t psbuf; 105 struct _citrus_iconv *iconv = NULL; 106 wchar_t wc; 107 mbstate_t wcrtombstate = {0}; 108 char mb[MB_LEN_MAX]; 109 size_t mb_len; 110 char utf32le[MB_LEN_MAX]; 111 const char *src; 112 char *dst; 113 size_t srcleft, dstleft, inval; 114 char32_t c32; 115 size_t len; 116 int error, errno_save; 117 118 /* 119 * Save errno in case _citrus_iconv_* clobbers it. 120 */ 121 errno_save = errno; 122 123 /* 124 * `If ps is a null pointer, each function uses its own 125 * internal mbstate_t object instead, which is initialized at 126 * program startup to the initial conversion state; the 127 * functions are not required to avoid data races with other 128 * calls to the same function in this case. The 129 * implementation behaves as if no library function calls 130 * these functions with a null pointer for ps.' 131 */ 132 if (ps == NULL) 133 ps = &psbuf; 134 135 /* 136 * `If s is a null pointer, the mbrtoc32 function is equivalent 137 * to the call: 138 * 139 * mbrtoc32(NULL, "", 1, ps) 140 * 141 * In this case, the values of the parameters pc32 and n are 142 * ignored.' 143 */ 144 if (s == NULL) { 145 pc32 = NULL; 146 s = ""; 147 n = 1; 148 } 149 150 /* 151 * If input length is zero, the result is always incomplete by 152 * definition. Don't bother with iconv -- we'd have to 153 * disentangle truncated outputs. 154 */ 155 if (n == 0) { 156 len = (size_t)-2; 157 goto out; 158 } 159 160 /* 161 * Open an iconv handle to convert locale-dependent multibyte 162 * input to UTF-32LE. 163 */ 164 if ((error = _citrus_iconv_open(&iconv, _PATH_ICONV, 165 nl_langinfo_l(CODESET, loc), "utf-32le")) != 0) { 166 errno = EIO; /* XXX? */ 167 len = (size_t)-1; 168 goto out; 169 } 170 171 /* 172 * Consume the next locale-dependent wide character. If no 173 * wide character can be obtained, stop here. 174 */ 175 len = mbrtowc_l(&wc, s, n, ps, loc); 176 switch (len) { 177 case 0: /* NUL */ 178 if (pc32) 179 *pc32 = 0; 180 goto out; 181 case (size_t)-2: /* still incomplete after n bytes */ 182 case (size_t)-1: /* error */ 183 goto out; 184 default: /* consumed len bytes of input */ 185 break; 186 } 187 188 /* 189 * We consumed a wide character from the input. Convert it to 190 * a multibyte sequence _in the initial conversion state_, so 191 * we can pass that through iconv to get a Unicode scalar 192 * value. 193 */ 194 if ((mb_len = wcrtomb_l(mb, wc, &wcrtombstate, loc)) == (size_t)-1) { 195 len = (size_t)-1; 196 goto out; 197 } 198 199 /* 200 * Convert the multibyte sequence to UTF-16LE. 201 */ 202 src = mb; 203 srcleft = mb_len; 204 dst = utf32le; 205 dstleft = sizeof(utf32le); 206 error = _citrus_iconv_convert(iconv, &src, &srcleft, &dst, &dstleft, 207 _CITRUS_ICONV_F_HIDE_INVALID, &inval); 208 if (error) { 209 errno = error; 210 len = (size_t)-1; 211 goto out; 212 } 213 214 /* 215 * Successfully converted the multibyte sequence to UTF-16LE, 216 * which should produce exactly one UTF-32 code unit, encoded 217 * in little-endian, representing a code point. Get the code 218 * point. 219 */ 220 c32 = le32dec(utf32le); 221 222 /* 223 * Reject surrogate code points. We only deal in scalar 224 * values. 225 * 226 * XXX Is this necessary? Won't iconv take care of it for us? 227 */ 228 if (c32 >= 0xd800 && c32 <= 0xdfff) { 229 errno = EILSEQ; 230 len = (size_t)-1; 231 goto out; 232 } 233 234 /* 235 * Non-surrogate code point -- scalar value. Yield it. 236 */ 237 if (pc32) 238 *pc32 = c32; 239 240 /* 241 * If we got the null scalar value, return zero length, as the 242 * contract requires. 243 */ 244 if (c32 == 0) 245 len = 0; 246 247 /* 248 * Make sure we preserve errno on success. 249 */ 250 errno = errno_save; 251 252 out: errno_save = errno; 253 _citrus_iconv_close(iconv); 254 errno = errno_save; 255 return len; 256 } 257