1*7255e47aSriastradh /* $NetBSD: c16rtomb.c,v 1.9 2024/10/09 14:28:56 riastradh Exp $ */ 22cbd152aSriastradh 32cbd152aSriastradh /*- 42cbd152aSriastradh * Copyright (c) 2024 The NetBSD Foundation, Inc. 52cbd152aSriastradh * All rights reserved. 62cbd152aSriastradh * 72cbd152aSriastradh * Redistribution and use in source and binary forms, with or without 82cbd152aSriastradh * modification, are permitted provided that the following conditions 92cbd152aSriastradh * are met: 102cbd152aSriastradh * 1. Redistributions of source code must retain the above copyright 112cbd152aSriastradh * notice, this list of conditions and the following disclaimer. 122cbd152aSriastradh * 2. Redistributions in binary form must reproduce the above copyright 132cbd152aSriastradh * notice, this list of conditions and the following disclaimer in the 142cbd152aSriastradh * documentation and/or other materials provided with the distribution. 152cbd152aSriastradh * 162cbd152aSriastradh * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 172cbd152aSriastradh * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 182cbd152aSriastradh * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 192cbd152aSriastradh * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 202cbd152aSriastradh * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 212cbd152aSriastradh * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 222cbd152aSriastradh * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 232cbd152aSriastradh * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 242cbd152aSriastradh * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 252cbd152aSriastradh * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 262cbd152aSriastradh * POSSIBILITY OF SUCH DAMAGE. 272cbd152aSriastradh */ 282cbd152aSriastradh 292cbd152aSriastradh /* 302cbd152aSriastradh * c16rtomb(s, c16, ps) 312cbd152aSriastradh * 322cbd152aSriastradh * Encode the Unicode UTF-16 code unit c16, which may be surrogate 332cbd152aSriastradh * code point, into the multibyte buffer s under the current 342cbd152aSriastradh * locale, using multibyte encoding state ps. 352cbd152aSriastradh * 362cbd152aSriastradh * If c16 is a high surrogate, no output will be produced, but c16 372cbd152aSriastradh * will be remembered; this must be followed by another call 382cbd152aSriastradh * passing the trailing low surrogate. 392cbd152aSriastradh * 402cbd152aSriastradh * If c16 is a low surrogate, it must have been preceded by a call 412cbd152aSriastradh * with the leading high surrogate; at this point the combined 422cbd152aSriastradh * scalar value will be produced as output. 432cbd152aSriastradh * 442cbd152aSriastradh * Return the number of bytes stored on success, or (size_t)-1 on 452cbd152aSriastradh * error with errno set to EILSEQ. 462cbd152aSriastradh * 472cbd152aSriastradh * At most MB_CUR_MAX bytes will be stored. 482cbd152aSriastradh * 492cbd152aSriastradh * References: 502cbd152aSriastradh * 512cbd152aSriastradh * The Unicode Standard, Version 15.0 -- Core Specification, The 52445c67fbSrillig * Unicode Consortium, Sec. 3.8 `Surrogates', p. 118. 532cbd152aSriastradh * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144 542cbd152aSriastradh * https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144 552cbd152aSriastradh * 562cbd152aSriastradh * The Unicode Standard, Version 15.0 -- Core Specification, The 572cbd152aSriastradh * Unicode Consortium, Sec. 3.9 `Unicode Encoding Forms': UTF-16, 582cbd152aSriastradh * p. 124. 592cbd152aSriastradh * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150 602cbd152aSriastradh * https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150 612cbd152aSriastradh * 622cbd152aSriastradh * P. Hoffman and F. Yergeau, `UTF-16, an encoding of ISO 10646', 632cbd152aSriastradh * RFC 2781, Internet Engineering Task Force, February 2000, 642cbd152aSriastradh * Sec. 2.2: `Decoding UTF-16'. 652cbd152aSriastradh * https://datatracker.ietf.org/doc/html/rfc2781#section-2.2 662cbd152aSriastradh */ 672cbd152aSriastradh 682cbd152aSriastradh #include <sys/cdefs.h> 69*7255e47aSriastradh __RCSID("$NetBSD: c16rtomb.c,v 1.9 2024/10/09 14:28:56 riastradh Exp $"); 70cd14a503Sriastradh 71cd14a503Sriastradh #include "namespace.h" 722cbd152aSriastradh 732cbd152aSriastradh #include <assert.h> 742cbd152aSriastradh #include <errno.h> 752cbd152aSriastradh #include <limits.h> 764651a634Sriastradh #include <locale.h> 77b36f256cSriastradh #include <stdalign.h> 782cbd152aSriastradh #include <stddef.h> 792cbd152aSriastradh #include <uchar.h> 802cbd152aSriastradh 812cbd152aSriastradh #include "c32rtomb.h" 824651a634Sriastradh #include "setlocale_local.h" 832cbd152aSriastradh 842cbd152aSriastradh struct c16rtombstate { 852cbd152aSriastradh char16_t surrogate; 862cbd152aSriastradh mbstate_t mbs; 872cbd152aSriastradh }; 882cbd152aSriastradh __CTASSERT(offsetof(struct c16rtombstate, mbs) <= sizeof(mbstate_t)); 892cbd152aSriastradh __CTASSERT(sizeof(struct c32rtombstate) <= sizeof(mbstate_t) - 902cbd152aSriastradh offsetof(struct c16rtombstate, mbs)); 91b36f256cSriastradh __CTASSERT(alignof(struct c16rtombstate) <= alignof(mbstate_t)); 922cbd152aSriastradh 934651a634Sriastradh #ifdef __weak_alias 944651a634Sriastradh __weak_alias(c16rtomb_l,_c16rtomb_l) 954651a634Sriastradh #endif 964651a634Sriastradh 972cbd152aSriastradh size_t 982cbd152aSriastradh c16rtomb(char *restrict s, char16_t c16, mbstate_t *restrict ps) 992cbd152aSriastradh { 1004651a634Sriastradh 1014651a634Sriastradh return c16rtomb_l(s, c16, ps, _current_locale()); 1024651a634Sriastradh } 1034651a634Sriastradh 1044651a634Sriastradh size_t 1054651a634Sriastradh c16rtomb_l(char *restrict s, char16_t c16, mbstate_t *restrict ps, 1064651a634Sriastradh locale_t loc) 1074651a634Sriastradh { 1082cbd152aSriastradh static mbstate_t psbuf; 1092cbd152aSriastradh char buf[MB_LEN_MAX]; 1102cbd152aSriastradh struct c16rtombstate *S; 1112cbd152aSriastradh char32_t c32; 1122cbd152aSriastradh 1132cbd152aSriastradh /* 1142cbd152aSriastradh * `If ps is a null pointer, each function uses its own 1152cbd152aSriastradh * internal mbstate_t object instead, which is initialized at 1162cbd152aSriastradh * program startup to the initial conversion state; the 1172cbd152aSriastradh * functions are not required to avoid data races with other 1182cbd152aSriastradh * calls to the same function in this case. The 1192cbd152aSriastradh * implementation behaves as if no library function calls 1202cbd152aSriastradh * these functions with a null pointer for ps.' 1212cbd152aSriastradh */ 1222cbd152aSriastradh if (ps == NULL) 1232cbd152aSriastradh ps = &psbuf; 1242cbd152aSriastradh 1252cbd152aSriastradh /* 1262cbd152aSriastradh * `If s is a null pointer, the c16rtomb function is equivalent 1272cbd152aSriastradh * to the call 1282cbd152aSriastradh * 1292cbd152aSriastradh * c16rtomb(buf, L'\0', ps) 1302cbd152aSriastradh * 1312cbd152aSriastradh * where buf is an internal buffer. 1322cbd152aSriastradh */ 1332cbd152aSriastradh if (s == NULL) { 1342cbd152aSriastradh s = buf; 1352cbd152aSriastradh c16 = L'\0'; 1362cbd152aSriastradh } 1372cbd152aSriastradh 1382cbd152aSriastradh /* 1392cbd152aSriastradh * Open the private UTF-16 decoding state. 1402cbd152aSriastradh */ 1410d3267e8Schristos S = (struct c16rtombstate *)(void *)ps; 1422cbd152aSriastradh 1432cbd152aSriastradh /* 1444a53dcdcSriastradh * Handle several cases: 1452cbd152aSriastradh * 1464a53dcdcSriastradh * 1. c16 is null. 1474a53dcdcSriastradh * 2. Pending high surrogate. 1484a53dcdcSriastradh * 3. No pending high surrogate and c16 is a high surrogate. 1494a53dcdcSriastradh * 4. No pending high surrogate and c16 is a low surrogate. 1504a53dcdcSriastradh * 5. No pending high surrogate and c16 is a BMP scalar value. 1512cbd152aSriastradh */ 1524a53dcdcSriastradh if (c16 == L'\0') { /* 1. null */ 1534a53dcdcSriastradh /* 1544a53dcdcSriastradh * `If c16 is a null wide character, a null byte is 1554a53dcdcSriastradh * stored, preceded by any shift sequence needed to 1564a53dcdcSriastradh * restore the initial shift state; the resulting 1574a53dcdcSriastradh * state described is the initial conversion state.' 1584a53dcdcSriastradh * 1594a53dcdcSriastradh * So if c16 is null, discard any pending high 1604a53dcdcSriastradh * surrogate -- there's nothing we can legitimately do 1614a53dcdcSriastradh * with it -- and convert a null scalar value, which by 1624a53dcdcSriastradh * definition of c32rtomb writes out any shift sequence 1634a53dcdcSriastradh * reset followed by a null byte. 1644a53dcdcSriastradh */ 1652cbd152aSriastradh S->surrogate = 0; 1664a53dcdcSriastradh c32 = 0; 1674a53dcdcSriastradh } else if (S->surrogate != 0) { /* 2. pending high surrogate */ 1682cbd152aSriastradh /* 1694a53dcdcSriastradh * If the previous code unit was a high surrogate, the 1704a53dcdcSriastradh * next code unit must be a low surrogate. Reject it 1714a53dcdcSriastradh * if not; otherwise clear the high surrogate for next 1724a53dcdcSriastradh * time and combine them to output a scalar value. 1732cbd152aSriastradh */ 1742cbd152aSriastradh if (c16 < 0xdc00 || c16 > 0xdfff) { 1752cbd152aSriastradh errno = EILSEQ; 1762cbd152aSriastradh return (size_t)-1; 1772cbd152aSriastradh } 1782cbd152aSriastradh const char16_t w1 = S->surrogate; 1792cbd152aSriastradh const char16_t w2 = c16; 180*7255e47aSriastradh c32 = __SHIFTIN(__SHIFTOUT(w1, __BITS(9,0)), __BITS(19,10)) | 181*7255e47aSriastradh __SHIFTIN(__SHIFTOUT(w2, __BITS(9,0)), __BITS(9,0)); 1822cbd152aSriastradh c32 += 0x10000; 1832cbd152aSriastradh S->surrogate = 0; 1844a53dcdcSriastradh } else if (c16 >= 0xd800 && c16 <= 0xdbff) { /* 3. high surrogate */ 1854a53dcdcSriastradh /* 1864a53dcdcSriastradh * No pending high surrogate and this code unit is a 1874a53dcdcSriastradh * high surrogate. Save it for next time, and output 1884a53dcdcSriastradh * nothing -- we don't yet know what the next scalar 1894a53dcdcSriastradh * value will be until we receive the low surrogate. 1904a53dcdcSriastradh */ 1912cbd152aSriastradh S->surrogate = c16; 1922cbd152aSriastradh return 0; /* produced nothing */ 1934a53dcdcSriastradh } else if (c16 >= 0xdc00 && c16 <= 0xdfff) { /* 4. low surrogate */ 1944a53dcdcSriastradh /* 1954a53dcdcSriastradh * No pending high surrogate and this code unit is a 1964a53dcdcSriastradh * low surrogate. That's invalid; fail with EILSEQ. 1974a53dcdcSriastradh */ 1982cbd152aSriastradh errno = EILSEQ; 1992cbd152aSriastradh return (size_t)-1; 2004a53dcdcSriastradh } else { /* 5. not a surrogate */ 2014a53dcdcSriastradh /* 2024a53dcdcSriastradh * Code unit is a scalar value in the BMP. Just output 2034a53dcdcSriastradh * it as is. 2044a53dcdcSriastradh */ 2052cbd152aSriastradh c32 = c16; 2062cbd152aSriastradh } 2072cbd152aSriastradh 2082cbd152aSriastradh /* 2092cbd152aSriastradh * We have a scalar value. Output it. 2102cbd152aSriastradh */ 2114651a634Sriastradh return c32rtomb_l(s, c32, &S->mbs, loc); 2122cbd152aSriastradh } 213