1*445c67fbSrillig /* $NetBSD: mbrtoc8.c,v 1.8 2024/08/21 18:36:11 rillig Exp $ */ 2c4e44ee2Sriastradh 3c4e44ee2Sriastradh /*- 4c4e44ee2Sriastradh * Copyright (c) 2024 The NetBSD Foundation, Inc. 5c4e44ee2Sriastradh * All rights reserved. 6c4e44ee2Sriastradh * 7c4e44ee2Sriastradh * Redistribution and use in source and binary forms, with or without 8c4e44ee2Sriastradh * modification, are permitted provided that the following conditions 9c4e44ee2Sriastradh * are met: 10c4e44ee2Sriastradh * 1. Redistributions of source code must retain the above copyright 11c4e44ee2Sriastradh * notice, this list of conditions and the following disclaimer. 12c4e44ee2Sriastradh * 2. Redistributions in binary form must reproduce the above copyright 13c4e44ee2Sriastradh * notice, this list of conditions and the following disclaimer in the 14c4e44ee2Sriastradh * documentation and/or other materials provided with the distribution. 15c4e44ee2Sriastradh * 16c4e44ee2Sriastradh * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 17c4e44ee2Sriastradh * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 18c4e44ee2Sriastradh * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19c4e44ee2Sriastradh * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 20c4e44ee2Sriastradh * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21c4e44ee2Sriastradh * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22c4e44ee2Sriastradh * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23c4e44ee2Sriastradh * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24c4e44ee2Sriastradh * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25c4e44ee2Sriastradh * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26c4e44ee2Sriastradh * POSSIBILITY OF SUCH DAMAGE. 27c4e44ee2Sriastradh */ 28c4e44ee2Sriastradh 29c4e44ee2Sriastradh /* 3028eba817Sriastradh * mbrtoc8(&c8, s, n, ps) 31c4e44ee2Sriastradh * 32c4e44ee2Sriastradh * Decode a Unicode scalar value from up to n bytes out of the 33c4e44ee2Sriastradh * multibyte string s, using multibyte encoding state ps, and 34c4e44ee2Sriastradh * store the next code unit in the UTF-8 representation of that 35c4e44ee2Sriastradh * scalar value at c8. 36c4e44ee2Sriastradh * 37c4e44ee2Sriastradh * If the UTF-8 representation of that scalar value is multiple 38839ad3f6Srillig * bytes long, mbrtoc8 will yield the leading byte in one call that 39c4e44ee2Sriastradh * consumes input, and will yield the trailing bytes in subsequent 40c4e44ee2Sriastradh * calls without consuming any input and returning (size_t)-3 41c4e44ee2Sriastradh * instead. 42c4e44ee2Sriastradh * 43c4e44ee2Sriastradh * Return the number of bytes consumed on success, or: 44c4e44ee2Sriastradh * 45c4e44ee2Sriastradh * - 0 if the code unit is NUL, or 46c4e44ee2Sriastradh * - (size_t)-3 if a trailing byte was returned without consuming 47c4e44ee2Sriastradh * any additional input, or 48c4e44ee2Sriastradh * - (size_t)-2 if the input is incomplete, or 49c4e44ee2Sriastradh * - (size_t)-1 on error with errno set to EILSEQ. 50c4e44ee2Sriastradh * 51c4e44ee2Sriastradh * In the case of incomplete input, the decoding state so far 52c4e44ee2Sriastradh * after processing s[0], s[1], ..., s[n - 1] is saved in ps, so 53c4e44ee2Sriastradh * subsequent calls to mbrtoc8 will pick up n bytes later into 54c4e44ee2Sriastradh * the input stream. 55c4e44ee2Sriastradh * 56c4e44ee2Sriastradh * References: 57c4e44ee2Sriastradh * 58c4e44ee2Sriastradh * The Unicode Standard, Version 15.0 -- Core Specification, The 59*445c67fbSrillig * Unicode Consortium, Sec. 3.8 `Surrogates', p. 118. 60c4e44ee2Sriastradh * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144 61c4e44ee2Sriastradh * https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144 62c4e44ee2Sriastradh * 63c4e44ee2Sriastradh * The Unicode Standard, Version 15.0 -- Core Specification, The 64839ad3f6Srillig * Unicode Consortium, Sec. 3.9 `Unicode Encoding Forms': UTF-8, 65c4e44ee2Sriastradh * p. 124. 66c4e44ee2Sriastradh * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150 67c4e44ee2Sriastradh * https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150 68c4e44ee2Sriastradh * 69c4e44ee2Sriastradh * F. Yergeau, `UTF-8, a transformation format of ISO 10646', 70c4e44ee2Sriastradh * RFC 3629, Internet Engineering Task Force, November 2003. 71c4e44ee2Sriastradh * https://datatracker.ietf.org/doc/html/rfc3629 72c4e44ee2Sriastradh */ 73c4e44ee2Sriastradh 74c4e44ee2Sriastradh #include <sys/cdefs.h> 75*445c67fbSrillig __RCSID("$NetBSD: mbrtoc8.c,v 1.8 2024/08/21 18:36:11 rillig Exp $"); 76565f5b16Sriastradh 77565f5b16Sriastradh #include "namespace.h" 78c4e44ee2Sriastradh 79c4e44ee2Sriastradh #include <assert.h> 80c4e44ee2Sriastradh #include <errno.h> 814651a634Sriastradh #include <locale.h> 82c4e44ee2Sriastradh #include <stdalign.h> 83c4e44ee2Sriastradh #include <stddef.h> 84c4e44ee2Sriastradh #include <uchar.h> 85c4e44ee2Sriastradh 86c4e44ee2Sriastradh #include "mbrtoc32.h" 874651a634Sriastradh #include "setlocale_local.h" 88c4e44ee2Sriastradh 89c4e44ee2Sriastradh struct mbrtoc8state { 90c4e44ee2Sriastradh char8_t nleft; 91c4e44ee2Sriastradh char8_t buf[3]; 92c4e44ee2Sriastradh mbstate_t mbs; 93c4e44ee2Sriastradh }; 94c4e44ee2Sriastradh __CTASSERT(offsetof(struct mbrtoc8state, mbs) <= sizeof(mbstate_t)); 95c4e44ee2Sriastradh __CTASSERT(sizeof(struct mbrtoc32state) <= sizeof(mbstate_t) - 96c4e44ee2Sriastradh offsetof(struct mbrtoc8state, mbs)); 97c4e44ee2Sriastradh __CTASSERT(alignof(struct mbrtoc8state) <= alignof(mbstate_t)); 98c4e44ee2Sriastradh 994651a634Sriastradh #ifdef __weak_alias 1004651a634Sriastradh __weak_alias(mbrtoc8_l,_mbrtoc8_l) 1014651a634Sriastradh #endif 1024651a634Sriastradh 103c4e44ee2Sriastradh size_t 104c4e44ee2Sriastradh mbrtoc8(char8_t *restrict pc8, const char *restrict s, size_t n, 105c4e44ee2Sriastradh mbstate_t *restrict ps) 106c4e44ee2Sriastradh { 1074651a634Sriastradh 1084651a634Sriastradh return mbrtoc8_l(pc8, s, n, ps, _current_locale()); 1094651a634Sriastradh } 1104651a634Sriastradh 1114651a634Sriastradh size_t 1124651a634Sriastradh mbrtoc8_l(char8_t *restrict pc8, const char *restrict s, size_t n, 1134651a634Sriastradh mbstate_t *restrict ps, locale_t restrict loc) 1144651a634Sriastradh { 115c4e44ee2Sriastradh static mbstate_t psbuf; 116c4e44ee2Sriastradh struct mbrtoc8state *S; 117c4e44ee2Sriastradh char32_t c32; 118c4e44ee2Sriastradh size_t len; 119c4e44ee2Sriastradh 120c4e44ee2Sriastradh /* 121c4e44ee2Sriastradh * `If ps is a null pointer, each function uses its own 122c4e44ee2Sriastradh * internal mbstate_t object instead, which is initialized at 123c4e44ee2Sriastradh * program startup to the initial conversion state; the 124c4e44ee2Sriastradh * functions are not required to avoid data races with other 125c4e44ee2Sriastradh * calls to the same function in this case. The 126c4e44ee2Sriastradh * implementation behaves as if no library function calls 127c4e44ee2Sriastradh * these functions with a null pointer for ps.' 128c4e44ee2Sriastradh */ 129c4e44ee2Sriastradh if (ps == NULL) 130c4e44ee2Sriastradh ps = &psbuf; 131c4e44ee2Sriastradh 132c4e44ee2Sriastradh /* 133c4e44ee2Sriastradh * `If s is a null pointer, the mbrtoc8 function is equivalent 134c4e44ee2Sriastradh * to the call: 135c4e44ee2Sriastradh * 136c4e44ee2Sriastradh * mbrtoc8(NULL, "", 1, ps) 137c4e44ee2Sriastradh * 138c4e44ee2Sriastradh * In this case, the values of the parameters pc8 and n are 139c4e44ee2Sriastradh * ignored.' 140c4e44ee2Sriastradh */ 141c4e44ee2Sriastradh if (s == NULL) { 142c4e44ee2Sriastradh pc8 = NULL; 143c4e44ee2Sriastradh s = ""; 144c4e44ee2Sriastradh n = 1; 145c4e44ee2Sriastradh } 146c4e44ee2Sriastradh 147c4e44ee2Sriastradh /* 148c4e44ee2Sriastradh * Get the private conversion state. 149c4e44ee2Sriastradh */ 1500d3267e8Schristos S = (struct mbrtoc8state *)(void *)ps; 151c4e44ee2Sriastradh 152c4e44ee2Sriastradh /* 153c4e44ee2Sriastradh * If there are pending trailing bytes, yield them and return 154c4e44ee2Sriastradh * (size_t)-3 to indicate that no bytes of input were consumed. 155c4e44ee2Sriastradh */ 156c4e44ee2Sriastradh if (S->nleft) { 157c4e44ee2Sriastradh if (pc8) 158c4e44ee2Sriastradh *pc8 = S->buf[sizeof(S->buf) - S->nleft]; 159c4e44ee2Sriastradh S->buf[sizeof(S->buf) - S->nleft] = 0; /* paranoia */ 160c4e44ee2Sriastradh S->nleft--; 161c4e44ee2Sriastradh return (size_t)-3; 162c4e44ee2Sriastradh } 163c4e44ee2Sriastradh 164c4e44ee2Sriastradh /* 165c4e44ee2Sriastradh * Consume the next scalar value. If no full scalar value can 166c4e44ee2Sriastradh * be obtained, stop here. 167c4e44ee2Sriastradh */ 1684651a634Sriastradh len = mbrtoc32_l(&c32, s, n, &S->mbs, loc); 169c4e44ee2Sriastradh switch (len) { 170c4e44ee2Sriastradh case 0: /* NUL */ 171c4e44ee2Sriastradh if (pc8) 172c4e44ee2Sriastradh *pc8 = 0; 173c4e44ee2Sriastradh return 0; 174c4e44ee2Sriastradh case (size_t)-2: /* still incomplete after n bytes */ 175c4e44ee2Sriastradh case (size_t)-1: /* error */ 176c4e44ee2Sriastradh return len; 177c4e44ee2Sriastradh default: /* consumed len bytes of input */ 178c4e44ee2Sriastradh break; 179c4e44ee2Sriastradh } 180c4e44ee2Sriastradh 181c4e44ee2Sriastradh /* 182c4e44ee2Sriastradh * We consumed a scalar value from the input. 183c4e44ee2Sriastradh * 184c4e44ee2Sriastradh * Encode it as UTF-8, yield the leading byte, and buffer the 185c4e44ee2Sriastradh * trailing bytes to yield later. 186c4e44ee2Sriastradh * 187c4e44ee2Sriastradh * Table 3-6: UTF-8 Bit Distribution 188c4e44ee2Sriastradh * Table 3-7: Well-Formed UTF-8 Byte Sequences 189c4e44ee2Sriastradh */ 190c4e44ee2Sriastradh switch (c32) { 191c4e44ee2Sriastradh case 0x00 ... 0x7f: 192c4e44ee2Sriastradh if (pc8) 193c4e44ee2Sriastradh *pc8 = c32; 194c4e44ee2Sriastradh _DIAGASSERT(S->nleft == 0); 195c4e44ee2Sriastradh break; 196c4e44ee2Sriastradh case 0x0080 ... 0x07ff: 197c4e44ee2Sriastradh if (pc8) 19825c050a8Srillig *pc8 = 0xc0 | __SHIFTOUT(c32, __BITS(10,6)); 19925c050a8Srillig S->buf[2] = 0x80 | __SHIFTOUT(c32, __BITS(5,0)); 200c4e44ee2Sriastradh S->nleft = 1; 201c4e44ee2Sriastradh break; 202c4e44ee2Sriastradh case 0x0800 ... 0xffff: 203c4e44ee2Sriastradh if (pc8) 20425c050a8Srillig *pc8 = 0xe0 | __SHIFTOUT(c32, __BITS(15,12)); 20525c050a8Srillig S->buf[1] = 0x80 | __SHIFTOUT(c32, __BITS(11,6)); 20625c050a8Srillig S->buf[2] = 0x80 | __SHIFTOUT(c32, __BITS(5,0)); 207c4e44ee2Sriastradh S->nleft = 2; 208c4e44ee2Sriastradh break; 209c4e44ee2Sriastradh case 0x10000 ... 0x10ffff: 210c4e44ee2Sriastradh if (pc8) 21125c050a8Srillig *pc8 = 0xf0 | __SHIFTOUT(c32, __BITS(20,18)); 21225c050a8Srillig S->buf[0] = 0x80 | __SHIFTOUT(c32, __BITS(17,12)); 21325c050a8Srillig S->buf[1] = 0x80 | __SHIFTOUT(c32, __BITS(11,6)); 21425c050a8Srillig S->buf[2] = 0x80 | __SHIFTOUT(c32, __BITS(5,0)); 215c4e44ee2Sriastradh S->nleft = 3; 216c4e44ee2Sriastradh break; 217c4e44ee2Sriastradh default: 218c4e44ee2Sriastradh errno = EILSEQ; 219c4e44ee2Sriastradh return (size_t)-1; 220c4e44ee2Sriastradh } 221c4e44ee2Sriastradh 222c4e44ee2Sriastradh /* 223c4e44ee2Sriastradh * Return the number of bytes consumed from the input. 224c4e44ee2Sriastradh */ 225c4e44ee2Sriastradh return len; 226c4e44ee2Sriastradh } 227