xref: /netbsd-src/lib/libc/locale/c16rtomb.c (revision 7255e47a1ac0d6623287cacc12e09e5ae686ea56)
1*7255e47aSriastradh /*	$NetBSD: c16rtomb.c,v 1.9 2024/10/09 14:28:56 riastradh Exp $	*/
22cbd152aSriastradh 
32cbd152aSriastradh /*-
42cbd152aSriastradh  * Copyright (c) 2024 The NetBSD Foundation, Inc.
52cbd152aSriastradh  * All rights reserved.
62cbd152aSriastradh  *
72cbd152aSriastradh  * Redistribution and use in source and binary forms, with or without
82cbd152aSriastradh  * modification, are permitted provided that the following conditions
92cbd152aSriastradh  * are met:
102cbd152aSriastradh  * 1. Redistributions of source code must retain the above copyright
112cbd152aSriastradh  *    notice, this list of conditions and the following disclaimer.
122cbd152aSriastradh  * 2. Redistributions in binary form must reproduce the above copyright
132cbd152aSriastradh  *    notice, this list of conditions and the following disclaimer in the
142cbd152aSriastradh  *    documentation and/or other materials provided with the distribution.
152cbd152aSriastradh  *
162cbd152aSriastradh  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
172cbd152aSriastradh  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
182cbd152aSriastradh  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
192cbd152aSriastradh  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
202cbd152aSriastradh  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
212cbd152aSriastradh  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
222cbd152aSriastradh  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
232cbd152aSriastradh  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
242cbd152aSriastradh  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
252cbd152aSriastradh  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
262cbd152aSriastradh  * POSSIBILITY OF SUCH DAMAGE.
272cbd152aSriastradh  */
282cbd152aSriastradh 
292cbd152aSriastradh /*
302cbd152aSriastradh  * c16rtomb(s, c16, ps)
312cbd152aSriastradh  *
 *	Encode the Unicode UTF-16 code unit c16, which may be a surrogate
332cbd152aSriastradh  *	code point, into the multibyte buffer s under the current
342cbd152aSriastradh  *	locale, using multibyte encoding state ps.
352cbd152aSriastradh  *
362cbd152aSriastradh  *	If c16 is a high surrogate, no output will be produced, but c16
372cbd152aSriastradh  *	will be remembered; this must be followed by another call
382cbd152aSriastradh  *	passing the trailing low surrogate.
392cbd152aSriastradh  *
402cbd152aSriastradh  *	If c16 is a low surrogate, it must have been preceded by a call
412cbd152aSriastradh  *	with the leading high surrogate; at this point the combined
422cbd152aSriastradh  *	scalar value will be produced as output.
432cbd152aSriastradh  *
442cbd152aSriastradh  *	Return the number of bytes stored on success, or (size_t)-1 on
452cbd152aSriastradh  *	error with errno set to EILSEQ.
462cbd152aSriastradh  *
472cbd152aSriastradh  *	At most MB_CUR_MAX bytes will be stored.
482cbd152aSriastradh  *
492cbd152aSriastradh  * References:
502cbd152aSriastradh  *
512cbd152aSriastradh  *	The Unicode Standard, Version 15.0 -- Core Specification, The
52445c67fbSrillig  *	Unicode Consortium, Sec. 3.8 `Surrogates', p. 118.
532cbd152aSriastradh  *	https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144
542cbd152aSriastradh  *	https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144
552cbd152aSriastradh  *
562cbd152aSriastradh  *	The Unicode Standard, Version 15.0 -- Core Specification, The
572cbd152aSriastradh  *	Unicode Consortium, Sec. 3.9 `Unicode Encoding Forms': UTF-16,
582cbd152aSriastradh  *	p. 124.
592cbd152aSriastradh  *	https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150
602cbd152aSriastradh  *	https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150
612cbd152aSriastradh  *
622cbd152aSriastradh  *	P. Hoffman and F. Yergeau, `UTF-16, an encoding of ISO 10646',
632cbd152aSriastradh  *	RFC 2781, Internet Engineering Task Force, February 2000,
642cbd152aSriastradh  *	Sec. 2.2: `Decoding UTF-16'.
652cbd152aSriastradh  *	https://datatracker.ietf.org/doc/html/rfc2781#section-2.2
662cbd152aSriastradh  */
672cbd152aSriastradh 
682cbd152aSriastradh #include <sys/cdefs.h>
69*7255e47aSriastradh __RCSID("$NetBSD: c16rtomb.c,v 1.9 2024/10/09 14:28:56 riastradh Exp $");
70cd14a503Sriastradh 
71cd14a503Sriastradh #include "namespace.h"
722cbd152aSriastradh 
732cbd152aSriastradh #include <assert.h>
742cbd152aSriastradh #include <errno.h>
752cbd152aSriastradh #include <limits.h>
764651a634Sriastradh #include <locale.h>
77b36f256cSriastradh #include <stdalign.h>
782cbd152aSriastradh #include <stddef.h>
792cbd152aSriastradh #include <uchar.h>
802cbd152aSriastradh 
812cbd152aSriastradh #include "c32rtomb.h"
824651a634Sriastradh #include "setlocale_local.h"
832cbd152aSriastradh 
842cbd152aSriastradh struct c16rtombstate {
852cbd152aSriastradh 	char16_t	surrogate;
862cbd152aSriastradh 	mbstate_t	mbs;
872cbd152aSriastradh };
882cbd152aSriastradh __CTASSERT(offsetof(struct c16rtombstate, mbs) <= sizeof(mbstate_t));
892cbd152aSriastradh __CTASSERT(sizeof(struct c32rtombstate) <= sizeof(mbstate_t) -
902cbd152aSriastradh     offsetof(struct c16rtombstate, mbs));
91b36f256cSriastradh __CTASSERT(alignof(struct c16rtombstate) <= alignof(mbstate_t));
922cbd152aSriastradh 
/*
 * Where the toolchain supports weak aliases, export c16rtomb_l as a
 * weak alias for _c16rtomb_l.
 */
#if defined(__weak_alias)
__weak_alias(c16rtomb_l,_c16rtomb_l)
#endif
964651a634Sriastradh 
972cbd152aSriastradh size_t
982cbd152aSriastradh c16rtomb(char *restrict s, char16_t c16, mbstate_t *restrict ps)
992cbd152aSriastradh {
1004651a634Sriastradh 
1014651a634Sriastradh 	return c16rtomb_l(s, c16, ps, _current_locale());
1024651a634Sriastradh }
1034651a634Sriastradh 
1044651a634Sriastradh size_t
1054651a634Sriastradh c16rtomb_l(char *restrict s, char16_t c16, mbstate_t *restrict ps,
1064651a634Sriastradh     locale_t loc)
1074651a634Sriastradh {
1082cbd152aSriastradh 	static mbstate_t psbuf;
1092cbd152aSriastradh 	char buf[MB_LEN_MAX];
1102cbd152aSriastradh 	struct c16rtombstate *S;
1112cbd152aSriastradh 	char32_t c32;
1122cbd152aSriastradh 
1132cbd152aSriastradh 	/*
1142cbd152aSriastradh 	 * `If ps is a null pointer, each function uses its own
1152cbd152aSriastradh 	 *  internal mbstate_t object instead, which is initialized at
1162cbd152aSriastradh 	 *  program startup to the initial conversion state; the
1172cbd152aSriastradh 	 *  functions are not required to avoid data races with other
1182cbd152aSriastradh 	 *  calls to the same function in this case.  The
1192cbd152aSriastradh 	 *  implementation behaves as if no library function calls
1202cbd152aSriastradh 	 *  these functions with a null pointer for ps.'
1212cbd152aSriastradh 	 */
1222cbd152aSriastradh 	if (ps == NULL)
1232cbd152aSriastradh 		ps = &psbuf;
1242cbd152aSriastradh 
1252cbd152aSriastradh 	/*
1262cbd152aSriastradh 	 * `If s is a null pointer, the c16rtomb function is equivalent
1272cbd152aSriastradh 	 *  to the call
1282cbd152aSriastradh 	 *
1292cbd152aSriastradh 	 *	c16rtomb(buf, L'\0', ps)
1302cbd152aSriastradh 	 *
1312cbd152aSriastradh 	 *  where buf is an internal buffer.
1322cbd152aSriastradh 	 */
1332cbd152aSriastradh 	if (s == NULL) {
1342cbd152aSriastradh 		s = buf;
1352cbd152aSriastradh 		c16 = L'\0';
1362cbd152aSriastradh 	}
1372cbd152aSriastradh 
1382cbd152aSriastradh 	/*
1392cbd152aSriastradh 	 * Open the private UTF-16 decoding state.
1402cbd152aSriastradh 	 */
1410d3267e8Schristos 	S = (struct c16rtombstate *)(void *)ps;
1422cbd152aSriastradh 
1432cbd152aSriastradh 	/*
1444a53dcdcSriastradh 	 * Handle several cases:
1452cbd152aSriastradh 	 *
1464a53dcdcSriastradh 	 * 1. c16 is null.
1474a53dcdcSriastradh 	 * 2. Pending high surrogate.
1484a53dcdcSriastradh 	 * 3. No pending high surrogate and c16 is a high surrogate.
1494a53dcdcSriastradh 	 * 4. No pending high surrogate and c16 is a low surrogate.
1504a53dcdcSriastradh 	 * 5. No pending high surrogate and c16 is a BMP scalar value.
1512cbd152aSriastradh 	 */
1524a53dcdcSriastradh 	if (c16 == L'\0') {	/* 1. null */
1534a53dcdcSriastradh 		/*
1544a53dcdcSriastradh 		 * `If c16 is a null wide character, a null byte is
1554a53dcdcSriastradh 		 *  stored, preceded by any shift sequence needed to
1564a53dcdcSriastradh 		 *  restore the initial shift state; the resulting
1574a53dcdcSriastradh 		 *  state described is the initial conversion state.'
1584a53dcdcSriastradh 		 *
1594a53dcdcSriastradh 		 * So if c16 is null, discard any pending high
1604a53dcdcSriastradh 		 * surrogate -- there's nothing we can legitimately do
1614a53dcdcSriastradh 		 * with it -- and convert a null scalar value, which by
1624a53dcdcSriastradh 		 * definition of c32rtomb writes out any shift sequence
1634a53dcdcSriastradh 		 * reset followed by a null byte.
1644a53dcdcSriastradh 		 */
1652cbd152aSriastradh 		S->surrogate = 0;
1664a53dcdcSriastradh 		c32 = 0;
1674a53dcdcSriastradh 	} else if (S->surrogate != 0) { /* 2. pending high surrogate */
1682cbd152aSriastradh 		/*
1694a53dcdcSriastradh 		 * If the previous code unit was a high surrogate, the
1704a53dcdcSriastradh 		 * next code unit must be a low surrogate.  Reject it
1714a53dcdcSriastradh 		 * if not; otherwise clear the high surrogate for next
1724a53dcdcSriastradh 		 * time and combine them to output a scalar value.
1732cbd152aSriastradh 		 */
1742cbd152aSriastradh 		if (c16 < 0xdc00 || c16 > 0xdfff) {
1752cbd152aSriastradh 			errno = EILSEQ;
1762cbd152aSriastradh 			return (size_t)-1;
1772cbd152aSriastradh 		}
1782cbd152aSriastradh 		const char16_t w1 = S->surrogate;
1792cbd152aSriastradh 		const char16_t w2 = c16;
180*7255e47aSriastradh 		c32 = __SHIFTIN(__SHIFTOUT(w1, __BITS(9,0)), __BITS(19,10)) |
181*7255e47aSriastradh 		    __SHIFTIN(__SHIFTOUT(w2, __BITS(9,0)), __BITS(9,0));
1822cbd152aSriastradh 		c32 += 0x10000;
1832cbd152aSriastradh 		S->surrogate = 0;
1844a53dcdcSriastradh 	} else if (c16 >= 0xd800 && c16 <= 0xdbff) { /* 3. high surrogate */
1854a53dcdcSriastradh 		/*
1864a53dcdcSriastradh 		 * No pending high surrogate and this code unit is a
1874a53dcdcSriastradh 		 * high surrogate.  Save it for next time, and output
1884a53dcdcSriastradh 		 * nothing -- we don't yet know what the next scalar
1894a53dcdcSriastradh 		 * value will be until we receive the low surrogate.
1904a53dcdcSriastradh 		 */
1912cbd152aSriastradh 		S->surrogate = c16;
1922cbd152aSriastradh 		return 0;	/* produced nothing */
1934a53dcdcSriastradh 	} else if (c16 >= 0xdc00 && c16 <= 0xdfff) { /* 4. low surrogate */
1944a53dcdcSriastradh 		/*
1954a53dcdcSriastradh 		 * No pending high surrogate and this code unit is a
1964a53dcdcSriastradh 		 * low surrogate.  That's invalid; fail with EILSEQ.
1974a53dcdcSriastradh 		 */
1982cbd152aSriastradh 		errno = EILSEQ;
1992cbd152aSriastradh 		return (size_t)-1;
2004a53dcdcSriastradh 	} else {		/* 5. not a surrogate */
2014a53dcdcSriastradh 		/*
2024a53dcdcSriastradh 		 * Code unit is a scalar value in the BMP.  Just output
2034a53dcdcSriastradh 		 * it as is.
2044a53dcdcSriastradh 		 */
2052cbd152aSriastradh 		c32 = c16;
2062cbd152aSriastradh 	}
2072cbd152aSriastradh 
2082cbd152aSriastradh 	/*
2092cbd152aSriastradh 	 * We have a scalar value.  Output it.
2102cbd152aSriastradh 	 */
2114651a634Sriastradh 	return c32rtomb_l(s, c32, &S->mbs, loc);
2122cbd152aSriastradh }
213