xref: /netbsd-src/lib/libc/locale/c32rtomb.c (revision 445c67fbb42592ba7a3130514543f8a4a4ff204d)
1*445c67fbSrillig /*	$NetBSD: c32rtomb.c,v 1.6 2024/08/21 18:36:11 rillig Exp $	*/
22cbd152aSriastradh 
32cbd152aSriastradh /*-
42cbd152aSriastradh  * Copyright (c) 2024 The NetBSD Foundation, Inc.
52cbd152aSriastradh  * All rights reserved.
62cbd152aSriastradh  *
72cbd152aSriastradh  * Redistribution and use in source and binary forms, with or without
82cbd152aSriastradh  * modification, are permitted provided that the following conditions
92cbd152aSriastradh  * are met:
102cbd152aSriastradh  * 1. Redistributions of source code must retain the above copyright
112cbd152aSriastradh  *    notice, this list of conditions and the following disclaimer.
122cbd152aSriastradh  * 2. Redistributions in binary form must reproduce the above copyright
132cbd152aSriastradh  *    notice, this list of conditions and the following disclaimer in the
142cbd152aSriastradh  *    documentation and/or other materials provided with the distribution.
152cbd152aSriastradh  *
162cbd152aSriastradh  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
172cbd152aSriastradh  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
182cbd152aSriastradh  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
192cbd152aSriastradh  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
202cbd152aSriastradh  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
212cbd152aSriastradh  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
222cbd152aSriastradh  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
232cbd152aSriastradh  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
242cbd152aSriastradh  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
252cbd152aSriastradh  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
262cbd152aSriastradh  * POSSIBILITY OF SUCH DAMAGE.
272cbd152aSriastradh  */
282cbd152aSriastradh 
292cbd152aSriastradh /*
302cbd152aSriastradh  * c32rtomb(s, c32, ps)
312cbd152aSriastradh  *
322cbd152aSriastradh  *	Encode the Unicode UTF-32 code unit c32, which must not be a
332cbd152aSriastradh  *	surrogate code point, into the multibyte buffer s under the
342cbd152aSriastradh  *	current locale, using multibyte encoding state ps.  A UTF-32
352cbd152aSriastradh  *	code unit is also a Unicode scalar value, which is any Unicode
362cbd152aSriastradh  *	code point except a surrogate.
372cbd152aSriastradh  *
382cbd152aSriastradh  *	Return the number of bytes stored on success, or (size_t)-1 on
392cbd152aSriastradh  *	error with errno set to EILSEQ.
402cbd152aSriastradh  *
412cbd152aSriastradh  *	At most MB_CUR_MAX bytes will be stored.
422cbd152aSriastradh  *
432cbd152aSriastradh  * References:
442cbd152aSriastradh  *
452cbd152aSriastradh  *	The Unicode Standard, Version 15.0 -- Core Specification, The
46*445c67fbSrillig  *	Unicode Consortium, Sec. 3.8 `Surrogates', p. 118.
47*445c67fbSrillig  *	https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144
48*445c67fbSrillig  *	https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144
492cbd152aSriastradh  */
502cbd152aSriastradh 
512cbd152aSriastradh #include <sys/cdefs.h>
52*445c67fbSrillig __RCSID("$NetBSD: c32rtomb.c,v 1.6 2024/08/21 18:36:11 rillig Exp $");
53cd14a503Sriastradh 
54cd14a503Sriastradh #include "namespace.h"
552cbd152aSriastradh 
562cbd152aSriastradh #include <sys/types.h>		/* broken citrus_*.h */
572cbd152aSriastradh #include <sys/queue.h>		/* broken citrus_*.h */
582cbd152aSriastradh 
592cbd152aSriastradh #include <assert.h>
602cbd152aSriastradh #include <errno.h>
612cbd152aSriastradh #include <langinfo.h>
622cbd152aSriastradh #include <limits.h>
634651a634Sriastradh #include <locale.h>
642cbd152aSriastradh #include <paths.h>
652cbd152aSriastradh #include <stddef.h>
662cbd152aSriastradh #include <stdlib.h>
672cbd152aSriastradh #include <uchar.h>
682cbd152aSriastradh #include <wchar.h>
692cbd152aSriastradh 
702cbd152aSriastradh #include "citrus_types.h"	/* broken citrus_iconv.h */
712cbd152aSriastradh #include "citrus_module.h"	/* broken citrus_iconv.h */
722cbd152aSriastradh #include "citrus_hash.h"	/* broken citrus_iconv.h */
732cbd152aSriastradh #include "citrus_iconv.h"
744651a634Sriastradh #include "setlocale_local.h"
752cbd152aSriastradh 
76cd14a503Sriastradh #ifdef __weak_alias
77cd14a503Sriastradh __weak_alias(c32rtomb,_c32rtomb)
784651a634Sriastradh __weak_alias(c32rtomb_l,_c32rtomb_l)
79cd14a503Sriastradh #endif
80cd14a503Sriastradh 
812cbd152aSriastradh size_t
822cbd152aSriastradh c32rtomb(char *restrict s, char32_t c32, mbstate_t *restrict ps)
832cbd152aSriastradh {
844651a634Sriastradh 
854651a634Sriastradh 	return c32rtomb_l(s, c32, ps, _current_locale());
864651a634Sriastradh }
874651a634Sriastradh 
884651a634Sriastradh size_t
894651a634Sriastradh c32rtomb_l(char *restrict s, char32_t c32, mbstate_t *restrict ps,
904651a634Sriastradh     locale_t loc)
914651a634Sriastradh {
92a35ceff4Sriastradh 	static mbstate_t psbuf;
932cbd152aSriastradh 	struct _citrus_iconv *iconv = NULL;
94a35ceff4Sriastradh 	char buf[2*MB_LEN_MAX];	/* [shift from init, wc] [shift to init] */
95a35ceff4Sriastradh 	char utf32le[4];
962cbd152aSriastradh 	const char *src;
972cbd152aSriastradh 	char *dst;
98a35ceff4Sriastradh 	size_t srcleft, dstleft, inval;
99a35ceff4Sriastradh 	mbstate_t mbrtowcstate = {0};
100a35ceff4Sriastradh 	wchar_t wc;
101c3a96d3fSriastradh 	size_t wc_len;
102a35ceff4Sriastradh 	size_t len;
1032cbd152aSriastradh 	int error, errno_save;
1042cbd152aSriastradh 
1052cbd152aSriastradh 	/*
1062cbd152aSriastradh 	 * Save errno in case _citrus_iconv_* clobbers it.
1072cbd152aSriastradh 	 */
1082cbd152aSriastradh 	errno_save = errno;
1092cbd152aSriastradh 
1102cbd152aSriastradh 	/*
111a35ceff4Sriastradh 	 * `If ps is a null pointer, each function uses its own
112a35ceff4Sriastradh 	 *  internal mbstate_t object instead, which is initialized at
113a35ceff4Sriastradh 	 *  program startup to the initial conversion state; the
114a35ceff4Sriastradh 	 *  functions are not required to avoid data races with other
115a35ceff4Sriastradh 	 *  calls to the same function in this case.  The
116a35ceff4Sriastradh 	 *  implementation behaves as if no library function calls
117a35ceff4Sriastradh 	 *  these functions with a null pointer for ps.'
118a35ceff4Sriastradh 	 */
119a35ceff4Sriastradh 	if (ps == NULL)
120a35ceff4Sriastradh 		ps = &psbuf;
121a35ceff4Sriastradh 
122a35ceff4Sriastradh 	/*
1232cbd152aSriastradh 	 * `If s is a null pointer, the c32rtomb function is equivalent
1242cbd152aSriastradh 	 *  to the call
1252cbd152aSriastradh 	 *
1262cbd152aSriastradh 	 *	c32rtomb(buf, L'\0', ps)
1272cbd152aSriastradh 	 *
1282cbd152aSriastradh 	 *  where buf is an internal buffer.'
1292cbd152aSriastradh 	 */
1302cbd152aSriastradh 	if (s == NULL) {
1312cbd152aSriastradh 		s = buf;
1322cbd152aSriastradh 		c32 = L'\0';
1332cbd152aSriastradh 	}
1342cbd152aSriastradh 
1352cbd152aSriastradh 	/*
136a35ceff4Sriastradh 	 * Reject surrogate code points.  We only deal in scalar
137a35ceff4Sriastradh 	 * values.
138a35ceff4Sriastradh 	 *
139a35ceff4Sriastradh 	 * XXX Is this necessary?  Won't iconv take care of it for us?
1402cbd152aSriastradh 	 */
1412cbd152aSriastradh 	if (c32 >= 0xd800 && c32 <= 0xdfff) {
1422cbd152aSriastradh 		errno = EILSEQ;
1432cbd152aSriastradh 		len = (size_t)-1;
1442cbd152aSriastradh 		goto out;
1452cbd152aSriastradh 	}
1462cbd152aSriastradh 
1472cbd152aSriastradh 	/*
1482cbd152aSriastradh 	 * Open an iconv handle to convert UTF-32LE to locale-dependent
1492cbd152aSriastradh 	 * multibyte output.
1502cbd152aSriastradh 	 */
1512cbd152aSriastradh 	if ((error = _citrus_iconv_open(&iconv, _PATH_ICONV, "utf-32le",
1524651a634Sriastradh 		    nl_langinfo_l(CODESET, loc))) != 0) {
1532cbd152aSriastradh 		errno = EIO; /* XXX? */
1542cbd152aSriastradh 		len = (size_t)-1;
1552cbd152aSriastradh 		goto out;
1562cbd152aSriastradh 	}
1572cbd152aSriastradh 
1582cbd152aSriastradh 	/*
159a35ceff4Sriastradh 	 * Convert from UTF-32LE to a multibyte sequence.
1602cbd152aSriastradh 	 */
161a35ceff4Sriastradh 	le32enc(utf32le, c32);
162a35ceff4Sriastradh 	src = utf32le;
163a35ceff4Sriastradh 	srcleft = sizeof(utf32le);
164a35ceff4Sriastradh 	dst = buf;
1652cbd152aSriastradh 	dstleft = MB_CUR_MAX;
166a35ceff4Sriastradh 	error = _citrus_iconv_convert(iconv, &src, &srcleft, &dst, &dstleft,
1672cbd152aSriastradh 	    _CITRUS_ICONV_F_HIDE_INVALID, &inval);
1682cbd152aSriastradh 	if (error) {		/* can't be incomplete, must be error */
1692cbd152aSriastradh 		errno = error;
1702cbd152aSriastradh 		len = (size_t)-1;
1712cbd152aSriastradh 		goto out;
1722cbd152aSriastradh 	}
1732cbd152aSriastradh 	_DIAGASSERT(srcleft == 0);
1742cbd152aSriastradh 	_DIAGASSERT(dstleft <= MB_CUR_MAX);
1752cbd152aSriastradh 
1762cbd152aSriastradh 	/*
1772cbd152aSriastradh 	 * If we didn't produce any output, that means the scalar value
1782cbd152aSriastradh 	 * c32 can't be encoded in the current locale, so treat it as
1792cbd152aSriastradh 	 * EILSEQ.
1802cbd152aSriastradh 	 */
1812cbd152aSriastradh 	len = MB_CUR_MAX - dstleft;
1822cbd152aSriastradh 	if (len == 0) {
1832cbd152aSriastradh 		errno = EILSEQ;
1842cbd152aSriastradh 		len = (size_t)-1;
1852cbd152aSriastradh 		goto out;
1862cbd152aSriastradh 	}
1872cbd152aSriastradh 
1882cbd152aSriastradh 	/*
189a35ceff4Sriastradh 	 * Now get a wide character out of the buffer.  We don't care
190a35ceff4Sriastradh 	 * how much it consumes other than for a diagnostic assertion.
191a35ceff4Sriastradh 	 * It had better return exactly one wide character, because we
192a35ceff4Sriastradh 	 * are only allowed to encode one wide character's worth of
193a35ceff4Sriastradh 	 * multibyte output (possibly including a shift sequence).
194a35ceff4Sriastradh 	 *
195a35ceff4Sriastradh 	 * XXX What about combining characters?
196a35ceff4Sriastradh 	 */
197c3a96d3fSriastradh 	wc_len = mbrtowc_l(&wc, buf, len, &mbrtowcstate, loc);
198c3a96d3fSriastradh 	switch (wc_len) {
199c3a96d3fSriastradh 	case (size_t)-1:	/* error, with errno set */
200c3a96d3fSriastradh 		len = (size_t)-1;
201a35ceff4Sriastradh 		goto out;
202a35ceff4Sriastradh 	case 0:			/* decoded NUL */
203a35ceff4Sriastradh 		wc = 0;		/* paranoia */
204c3a96d3fSriastradh 		len = wc_len;
205a35ceff4Sriastradh 		break;
206a35ceff4Sriastradh 	default:		/* decoded wc */
207c3a96d3fSriastradh 		_DIAGASSERT(wc_len <= len);
208a35ceff4Sriastradh 	}
209a35ceff4Sriastradh 
210a35ceff4Sriastradh 	/*
211a35ceff4Sriastradh 	 * Now put the wide character out, using the caller's
212a35ceff4Sriastradh 	 * conversion state so that we don't output unnecessary shift
213a35ceff4Sriastradh 	 * sequences.
214a35ceff4Sriastradh 	 */
215a35ceff4Sriastradh 	len = wcrtomb_l(s, wc, ps, loc);
216a35ceff4Sriastradh 	if (len == (size_t)-1)	/* error, with errno set */
217a35ceff4Sriastradh 		goto out;
218a35ceff4Sriastradh 
219a35ceff4Sriastradh 	/*
2202cbd152aSriastradh 	 * Make sure we preserve errno on success.
2212cbd152aSriastradh 	 */
2222cbd152aSriastradh 	errno = errno_save;
2232cbd152aSriastradh 
2242cbd152aSriastradh out:	errno_save = errno;
2252cbd152aSriastradh 	_citrus_iconv_close(iconv);
2262cbd152aSriastradh 	errno = errno_save;
2272cbd152aSriastradh 	return len;
2282cbd152aSriastradh }
229