1 /* $NetBSD: c32rtomb.c,v 1.6 2024/08/21 18:36:11 rillig Exp $ */ 2 3 /*- 4 * Copyright (c) 2024 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26 * POSSIBILITY OF SUCH DAMAGE. 27 */ 28 29 /* 30 * c32rtomb(s, c32, ps) 31 * 32 * Encode the Unicode UTF-32 code unit c32, which must not be a 33 * surrogate code point, into the multibyte buffer s under the 34 * current locale, using multibyte encoding state ps. A UTF-32 35 * code unit is also a Unicode scalar value, which is any Unicode 36 * code point except a surrogate. 37 * 38 * Return the number of bytes stored on success, or (size_t)-1 on 39 * error with errno set to EILSEQ. 40 * 41 * At most MB_CUR_MAX bytes will be stored. 42 * 43 * References: 44 * 45 * The Unicode Standard, Version 15.0 -- Core Specification, The 46 * Unicode Consortium, Sec. 3.8 `Surrogates', p. 118. 47 * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144 48 * https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144 49 */ 50 51 #include <sys/cdefs.h> 52 __RCSID("$NetBSD: c32rtomb.c,v 1.6 2024/08/21 18:36:11 rillig Exp $"); 53 54 #include "namespace.h" 55 56 #include <sys/types.h> /* broken citrus_*.h */ 57 #include <sys/queue.h> /* broken citrus_*.h */ 58 59 #include <assert.h> 60 #include <errno.h> 61 #include <langinfo.h> 62 #include <limits.h> 63 #include <locale.h> 64 #include <paths.h> 65 #include <stddef.h> 66 #include <stdlib.h> 67 #include <uchar.h> 68 #include <wchar.h> 69 70 #include "citrus_types.h" /* broken citrus_iconv.h */ 71 #include "citrus_module.h" /* broken citrus_iconv.h */ 72 #include "citrus_hash.h" /* broken citrus_iconv.h */ 73 #include "citrus_iconv.h" 74 #include "setlocale_local.h" 75 76 #ifdef __weak_alias 77 __weak_alias(c32rtomb,_c32rtomb) 78 __weak_alias(c32rtomb_l,_c32rtomb_l) 79 #endif 80 81 size_t 82 c32rtomb(char *restrict s, char32_t c32, mbstate_t *restrict ps) 83 { 84 85 return c32rtomb_l(s, c32, ps, _current_locale()); 86 } 87 88 size_t 89 c32rtomb_l(char *restrict s, char32_t c32, mbstate_t *restrict ps, 90 locale_t loc) 91 { 92 static mbstate_t psbuf; 93 struct _citrus_iconv *iconv = NULL; 94 char buf[2*MB_LEN_MAX]; /* [shift from init, wc] [shift to init] */ 95 char utf32le[4]; 96 const char *src; 97 char *dst; 98 size_t srcleft, dstleft, inval; 99 mbstate_t mbrtowcstate = {0}; 100 wchar_t wc; 101 size_t wc_len; 102 size_t len; 103 int error, errno_save; 104 105 /* 106 * Save errno in case _citrus_iconv_* clobbers it. 107 */ 108 errno_save = errno; 109 110 /* 111 * `If ps is a null pointer, each function uses its own 112 * internal mbstate_t object instead, which is initialized at 113 * program startup to the initial conversion state; the 114 * functions are not required to avoid data races with other 115 * calls to the same function in this case. The 116 * implementation behaves as if no library function calls 117 * these functions with a null pointer for ps.' 118 */ 119 if (ps == NULL) 120 ps = &psbuf; 121 122 /* 123 * `If s is a null pointer, the c32rtomb function is equivalent 124 * to the call 125 * 126 * c32rtomb(buf, L'\0', ps) 127 * 128 * where buf is an internal buffer.' 129 */ 130 if (s == NULL) { 131 s = buf; 132 c32 = L'\0'; 133 } 134 135 /* 136 * Reject surrogate code points. We only deal in scalar 137 * values. 138 * 139 * XXX Is this necessary? Won't iconv take care of it for us? 140 */ 141 if (c32 >= 0xd800 && c32 <= 0xdfff) { 142 errno = EILSEQ; 143 len = (size_t)-1; 144 goto out; 145 } 146 147 /* 148 * Open an iconv handle to convert UTF-32LE to locale-dependent 149 * multibyte output. 150 */ 151 if ((error = _citrus_iconv_open(&iconv, _PATH_ICONV, "utf-32le", 152 nl_langinfo_l(CODESET, loc))) != 0) { 153 errno = EIO; /* XXX? */ 154 len = (size_t)-1; 155 goto out; 156 } 157 158 /* 159 * Convert from UTF-32LE to a multibyte sequence. 160 */ 161 le32enc(utf32le, c32); 162 src = utf32le; 163 srcleft = sizeof(utf32le); 164 dst = buf; 165 dstleft = MB_CUR_MAX; 166 error = _citrus_iconv_convert(iconv, &src, &srcleft, &dst, &dstleft, 167 _CITRUS_ICONV_F_HIDE_INVALID, &inval); 168 if (error) { /* can't be incomplete, must be error */ 169 errno = error; 170 len = (size_t)-1; 171 goto out; 172 } 173 _DIAGASSERT(srcleft == 0); 174 _DIAGASSERT(dstleft <= MB_CUR_MAX); 175 176 /* 177 * If we didn't produce any output, that means the scalar value 178 * c32 can't be encoded in the current locale, so treat it as 179 * EILSEQ. 180 */ 181 len = MB_CUR_MAX - dstleft; 182 if (len == 0) { 183 errno = EILSEQ; 184 len = (size_t)-1; 185 goto out; 186 } 187 188 /* 189 * Now get a wide character out of the buffer. We don't care 190 * how much it consumes other than for a diagnostic assertion. 191 * It had better return exactly one wide character, because we 192 * are only allowed to encode one wide character's worth of 193 * multibyte output (possibly including a shift sequence). 194 * 195 * XXX What about combining characters? 196 */ 197 wc_len = mbrtowc_l(&wc, buf, len, &mbrtowcstate, loc); 198 switch (wc_len) { 199 case (size_t)-1: /* error, with errno set */ 200 len = (size_t)-1; 201 goto out; 202 case 0: /* decoded NUL */ 203 wc = 0; /* paranoia */ 204 len = wc_len; 205 break; 206 default: /* decoded wc */ 207 _DIAGASSERT(wc_len <= len); 208 } 209 210 /* 211 * Now put the wide character out, using the caller's 212 * conversion state so that we don't output unnecessary shift 213 * sequences. 214 */ 215 len = wcrtomb_l(s, wc, ps, loc); 216 if (len == (size_t)-1) /* error, with errno set */ 217 goto out; 218 219 /* 220 * Make sure we preserve errno on success. 221 */ 222 errno = errno_save; 223 224 out: errno_save = errno; 225 _citrus_iconv_close(iconv); 226 errno = errno_save; 227 return len; 228 } 229