1*445c67fbSrillig /* $NetBSD: c32rtomb.c,v 1.6 2024/08/21 18:36:11 rillig Exp $ */ 22cbd152aSriastradh 32cbd152aSriastradh /*- 42cbd152aSriastradh * Copyright (c) 2024 The NetBSD Foundation, Inc. 52cbd152aSriastradh * All rights reserved. 62cbd152aSriastradh * 72cbd152aSriastradh * Redistribution and use in source and binary forms, with or without 82cbd152aSriastradh * modification, are permitted provided that the following conditions 92cbd152aSriastradh * are met: 102cbd152aSriastradh * 1. Redistributions of source code must retain the above copyright 112cbd152aSriastradh * notice, this list of conditions and the following disclaimer. 122cbd152aSriastradh * 2. Redistributions in binary form must reproduce the above copyright 132cbd152aSriastradh * notice, this list of conditions and the following disclaimer in the 142cbd152aSriastradh * documentation and/or other materials provided with the distribution. 152cbd152aSriastradh * 162cbd152aSriastradh * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 172cbd152aSriastradh * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 182cbd152aSriastradh * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 192cbd152aSriastradh * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 202cbd152aSriastradh * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 212cbd152aSriastradh * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 222cbd152aSriastradh * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 232cbd152aSriastradh * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 242cbd152aSriastradh * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 252cbd152aSriastradh * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 262cbd152aSriastradh * POSSIBILITY OF SUCH DAMAGE. 272cbd152aSriastradh */ 282cbd152aSriastradh 292cbd152aSriastradh /* 302cbd152aSriastradh * c32rtomb(s, c32, ps) 312cbd152aSriastradh * 322cbd152aSriastradh * Encode the Unicode UTF-32 code unit c32, which must not be a 332cbd152aSriastradh * surrogate code point, into the multibyte buffer s under the 342cbd152aSriastradh * current locale, using multibyte encoding state ps. A UTF-32 352cbd152aSriastradh * code unit is also a Unicode scalar value, which is any Unicode 362cbd152aSriastradh * code point except a surrogate. 372cbd152aSriastradh * 382cbd152aSriastradh * Return the number of bytes stored on success, or (size_t)-1 on 392cbd152aSriastradh * error with errno set to EILSEQ. 402cbd152aSriastradh * 412cbd152aSriastradh * At most MB_CUR_MAX bytes will be stored. 422cbd152aSriastradh * 432cbd152aSriastradh * References: 442cbd152aSriastradh * 452cbd152aSriastradh * The Unicode Standard, Version 15.0 -- Core Specification, The 46*445c67fbSrillig * Unicode Consortium, Sec. 3.8 `Surrogates', p. 118. 47*445c67fbSrillig * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144 48*445c67fbSrillig * https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144 492cbd152aSriastradh */ 502cbd152aSriastradh 512cbd152aSriastradh #include <sys/cdefs.h> 52*445c67fbSrillig __RCSID("$NetBSD: c32rtomb.c,v 1.6 2024/08/21 18:36:11 rillig Exp $"); 53cd14a503Sriastradh 54cd14a503Sriastradh #include "namespace.h" 552cbd152aSriastradh 562cbd152aSriastradh #include <sys/types.h> /* broken citrus_*.h */ 572cbd152aSriastradh #include <sys/queue.h> /* broken citrus_*.h */ 582cbd152aSriastradh 592cbd152aSriastradh #include <assert.h> 602cbd152aSriastradh #include <errno.h> 612cbd152aSriastradh #include <langinfo.h> 622cbd152aSriastradh #include <limits.h> 634651a634Sriastradh #include <locale.h> 642cbd152aSriastradh #include <paths.h> 652cbd152aSriastradh #include <stddef.h> 662cbd152aSriastradh #include <stdlib.h> 672cbd152aSriastradh #include <uchar.h> 682cbd152aSriastradh #include <wchar.h> 692cbd152aSriastradh 702cbd152aSriastradh #include "citrus_types.h" /* broken citrus_iconv.h */ 712cbd152aSriastradh #include "citrus_module.h" /* broken citrus_iconv.h */ 722cbd152aSriastradh #include "citrus_hash.h" /* broken citrus_iconv.h */ 732cbd152aSriastradh #include "citrus_iconv.h" 744651a634Sriastradh #include "setlocale_local.h" 752cbd152aSriastradh 76cd14a503Sriastradh #ifdef __weak_alias 77cd14a503Sriastradh __weak_alias(c32rtomb,_c32rtomb) 784651a634Sriastradh __weak_alias(c32rtomb_l,_c32rtomb_l) 79cd14a503Sriastradh #endif 80cd14a503Sriastradh 812cbd152aSriastradh size_t 822cbd152aSriastradh c32rtomb(char *restrict s, char32_t c32, mbstate_t *restrict ps) 832cbd152aSriastradh { 844651a634Sriastradh 854651a634Sriastradh return c32rtomb_l(s, c32, ps, _current_locale()); 864651a634Sriastradh } 874651a634Sriastradh 884651a634Sriastradh size_t 894651a634Sriastradh c32rtomb_l(char *restrict s, char32_t c32, mbstate_t *restrict ps, 904651a634Sriastradh locale_t loc) 914651a634Sriastradh { 92a35ceff4Sriastradh static mbstate_t psbuf; 932cbd152aSriastradh struct _citrus_iconv *iconv = NULL; 94a35ceff4Sriastradh char buf[2*MB_LEN_MAX]; /* [shift from init, wc] [shift to init] */ 95a35ceff4Sriastradh char utf32le[4]; 962cbd152aSriastradh const char *src; 972cbd152aSriastradh char *dst; 98a35ceff4Sriastradh size_t srcleft, dstleft, inval; 99a35ceff4Sriastradh mbstate_t mbrtowcstate = {0}; 100a35ceff4Sriastradh wchar_t wc; 101c3a96d3fSriastradh size_t wc_len; 102a35ceff4Sriastradh size_t len; 1032cbd152aSriastradh int error, errno_save; 1042cbd152aSriastradh 1052cbd152aSriastradh /* 1062cbd152aSriastradh * Save errno in case _citrus_iconv_* clobbers it. 1072cbd152aSriastradh */ 1082cbd152aSriastradh errno_save = errno; 1092cbd152aSriastradh 1102cbd152aSriastradh /* 111a35ceff4Sriastradh * `If ps is a null pointer, each function uses its own 112a35ceff4Sriastradh * internal mbstate_t object instead, which is initialized at 113a35ceff4Sriastradh * program startup to the initial conversion state; the 114a35ceff4Sriastradh * functions are not required to avoid data races with other 115a35ceff4Sriastradh * calls to the same function in this case. The 116a35ceff4Sriastradh * implementation behaves as if no library function calls 117a35ceff4Sriastradh * these functions with a null pointer for ps.' 118a35ceff4Sriastradh */ 119a35ceff4Sriastradh if (ps == NULL) 120a35ceff4Sriastradh ps = &psbuf; 121a35ceff4Sriastradh 122a35ceff4Sriastradh /* 1232cbd152aSriastradh * `If s is a null pointer, the c32rtomb function is equivalent 1242cbd152aSriastradh * to the call 1252cbd152aSriastradh * 1262cbd152aSriastradh * c32rtomb(buf, L'\0', ps) 1272cbd152aSriastradh * 1282cbd152aSriastradh * where buf is an internal buffer.' 1292cbd152aSriastradh */ 1302cbd152aSriastradh if (s == NULL) { 1312cbd152aSriastradh s = buf; 1322cbd152aSriastradh c32 = L'\0'; 1332cbd152aSriastradh } 1342cbd152aSriastradh 1352cbd152aSriastradh /* 136a35ceff4Sriastradh * Reject surrogate code points. We only deal in scalar 137a35ceff4Sriastradh * values. 138a35ceff4Sriastradh * 139a35ceff4Sriastradh * XXX Is this necessary? Won't iconv take care of it for us? 1402cbd152aSriastradh */ 1412cbd152aSriastradh if (c32 >= 0xd800 && c32 <= 0xdfff) { 1422cbd152aSriastradh errno = EILSEQ; 1432cbd152aSriastradh len = (size_t)-1; 1442cbd152aSriastradh goto out; 1452cbd152aSriastradh } 1462cbd152aSriastradh 1472cbd152aSriastradh /* 1482cbd152aSriastradh * Open an iconv handle to convert UTF-32LE to locale-dependent 1492cbd152aSriastradh * multibyte output. 1502cbd152aSriastradh */ 1512cbd152aSriastradh if ((error = _citrus_iconv_open(&iconv, _PATH_ICONV, "utf-32le", 1524651a634Sriastradh nl_langinfo_l(CODESET, loc))) != 0) { 1532cbd152aSriastradh errno = EIO; /* XXX? */ 1542cbd152aSriastradh len = (size_t)-1; 1552cbd152aSriastradh goto out; 1562cbd152aSriastradh } 1572cbd152aSriastradh 1582cbd152aSriastradh /* 159a35ceff4Sriastradh * Convert from UTF-32LE to a multibyte sequence. 1602cbd152aSriastradh */ 161a35ceff4Sriastradh le32enc(utf32le, c32); 162a35ceff4Sriastradh src = utf32le; 163a35ceff4Sriastradh srcleft = sizeof(utf32le); 164a35ceff4Sriastradh dst = buf; 1652cbd152aSriastradh dstleft = MB_CUR_MAX; 166a35ceff4Sriastradh error = _citrus_iconv_convert(iconv, &src, &srcleft, &dst, &dstleft, 1672cbd152aSriastradh _CITRUS_ICONV_F_HIDE_INVALID, &inval); 1682cbd152aSriastradh if (error) { /* can't be incomplete, must be error */ 1692cbd152aSriastradh errno = error; 1702cbd152aSriastradh len = (size_t)-1; 1712cbd152aSriastradh goto out; 1722cbd152aSriastradh } 1732cbd152aSriastradh _DIAGASSERT(srcleft == 0); 1742cbd152aSriastradh _DIAGASSERT(dstleft <= MB_CUR_MAX); 1752cbd152aSriastradh 1762cbd152aSriastradh /* 1772cbd152aSriastradh * If we didn't produce any output, that means the scalar value 1782cbd152aSriastradh * c32 can't be encoded in the current locale, so treat it as 1792cbd152aSriastradh * EILSEQ. 1802cbd152aSriastradh */ 1812cbd152aSriastradh len = MB_CUR_MAX - dstleft; 1822cbd152aSriastradh if (len == 0) { 1832cbd152aSriastradh errno = EILSEQ; 1842cbd152aSriastradh len = (size_t)-1; 1852cbd152aSriastradh goto out; 1862cbd152aSriastradh } 1872cbd152aSriastradh 1882cbd152aSriastradh /* 189a35ceff4Sriastradh * Now get a wide character out of the buffer. We don't care 190a35ceff4Sriastradh * how much it consumes other than for a diagnostic assertion. 191a35ceff4Sriastradh * It had better return exactly one wide character, because we 192a35ceff4Sriastradh * are only allowed to encode one wide character's worth of 193a35ceff4Sriastradh * multibyte output (possibly including a shift sequence). 194a35ceff4Sriastradh * 195a35ceff4Sriastradh * XXX What about combining characters? 196a35ceff4Sriastradh */ 197c3a96d3fSriastradh wc_len = mbrtowc_l(&wc, buf, len, &mbrtowcstate, loc); 198c3a96d3fSriastradh switch (wc_len) { 199c3a96d3fSriastradh case (size_t)-1: /* error, with errno set */ 200c3a96d3fSriastradh len = (size_t)-1; 201a35ceff4Sriastradh goto out; 202a35ceff4Sriastradh case 0: /* decoded NUL */ 203a35ceff4Sriastradh wc = 0; /* paranoia */ 204c3a96d3fSriastradh len = wc_len; 205a35ceff4Sriastradh break; 206a35ceff4Sriastradh default: /* decoded wc */ 207c3a96d3fSriastradh _DIAGASSERT(wc_len <= len); 208a35ceff4Sriastradh } 209a35ceff4Sriastradh 210a35ceff4Sriastradh /* 211a35ceff4Sriastradh * Now put the wide character out, using the caller's 212a35ceff4Sriastradh * conversion state so that we don't output unnecessary shift 213a35ceff4Sriastradh * sequences. 214a35ceff4Sriastradh */ 215a35ceff4Sriastradh len = wcrtomb_l(s, wc, ps, loc); 216a35ceff4Sriastradh if (len == (size_t)-1) /* error, with errno set */ 217a35ceff4Sriastradh goto out; 218a35ceff4Sriastradh 219a35ceff4Sriastradh /* 2202cbd152aSriastradh * Make sure we preserve errno on success. 2212cbd152aSriastradh */ 2222cbd152aSriastradh errno = errno_save; 2232cbd152aSriastradh 2242cbd152aSriastradh out: errno_save = errno; 2252cbd152aSriastradh _citrus_iconv_close(iconv); 2262cbd152aSriastradh errno = errno_save; 2272cbd152aSriastradh return len; 2282cbd152aSriastradh } 229