1*49c2256cSrillig /* $NetBSD: c8rtomb.c,v 1.9 2024/10/12 16:44:44 rillig Exp $ */ 2c4e44ee2Sriastradh 3c4e44ee2Sriastradh /*- 4c4e44ee2Sriastradh * Copyright (c) 2024 The NetBSD Foundation, Inc. 5c4e44ee2Sriastradh * All rights reserved. 6c4e44ee2Sriastradh * 7c4e44ee2Sriastradh * Redistribution and use in source and binary forms, with or without 8c4e44ee2Sriastradh * modification, are permitted provided that the following conditions 9c4e44ee2Sriastradh * are met: 10c4e44ee2Sriastradh * 1. Redistributions of source code must retain the above copyright 11c4e44ee2Sriastradh * notice, this list of conditions and the following disclaimer. 12c4e44ee2Sriastradh * 2. Redistributions in binary form must reproduce the above copyright 13c4e44ee2Sriastradh * notice, this list of conditions and the following disclaimer in the 14c4e44ee2Sriastradh * documentation and/or other materials provided with the distribution. 15c4e44ee2Sriastradh * 16c4e44ee2Sriastradh * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 17c4e44ee2Sriastradh * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 18c4e44ee2Sriastradh * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19c4e44ee2Sriastradh * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 20c4e44ee2Sriastradh * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 21c4e44ee2Sriastradh * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 22c4e44ee2Sriastradh * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 23c4e44ee2Sriastradh * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 24c4e44ee2Sriastradh * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 25c4e44ee2Sriastradh * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 26c4e44ee2Sriastradh * POSSIBILITY OF SUCH DAMAGE. 27c4e44ee2Sriastradh */ 28c4e44ee2Sriastradh 29c4e44ee2Sriastradh /* 30c4e44ee2Sriastradh * c8rtomb(s, c8, ps) 31c4e44ee2Sriastradh * 32c4e44ee2Sriastradh * Encode the Unicode UTF-8 code unit c8 into the multibyte buffer 33c4e44ee2Sriastradh * s under the current locale, using multibyte encoding state ps. 34c4e44ee2Sriastradh * 35c4e44ee2Sriastradh * If c8 is not the last byte of a UTF-8 scalar value sequence, no 36c4e44ee2Sriastradh * output will be produced, but c8 will be remembered; this must 37c4e44ee2Sriastradh * be followed by another call passing the following bytes. 38c4e44ee2Sriastradh * 39c4e44ee2Sriastradh * Return the number of bytes stored on success, or (size_t)-1 on 40c4e44ee2Sriastradh * error with errno set to EILSEQ. 41c4e44ee2Sriastradh * 42c4e44ee2Sriastradh * At most MB_CUR_MAX bytes will be stored. 43c4e44ee2Sriastradh * 44c4e44ee2Sriastradh * References: 45c4e44ee2Sriastradh * 46c4e44ee2Sriastradh * The Unicode Standard, Version 15.0 -- Core Specification, The 47c4e44ee2Sriastradh * Unicode Consortium, Sec. 3.9 `Unicode Encoding Forms': UTF-8, 48c4e44ee2Sriastradh * p. 124. 49c4e44ee2Sriastradh * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150 50c4e44ee2Sriastradh * https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150 51c4e44ee2Sriastradh * 52c4e44ee2Sriastradh * F. Yergeau, `UTF-8, a transformation format of ISO 10646', 53c4e44ee2Sriastradh * RFC 3629, Internet Engineering Task Force, November 2003. 54c4e44ee2Sriastradh * https://datatracker.ietf.org/doc/html/rfc3629 55c4e44ee2Sriastradh */ 56c4e44ee2Sriastradh 57c4e44ee2Sriastradh #include <sys/cdefs.h> 58*49c2256cSrillig __RCSID("$NetBSD: c8rtomb.c,v 1.9 2024/10/12 16:44:44 rillig Exp $"); 59565f5b16Sriastradh 60565f5b16Sriastradh #include "namespace.h" 61c4e44ee2Sriastradh 62c4e44ee2Sriastradh #include <assert.h> 63c4e44ee2Sriastradh #include <errno.h> 64c4e44ee2Sriastradh #include <limits.h> 654651a634Sriastradh #include <locale.h> 66c4e44ee2Sriastradh #include <stdalign.h> 67c4e44ee2Sriastradh #include <stddef.h> 68c4e44ee2Sriastradh #include <stdint.h> 69c4e44ee2Sriastradh #include <uchar.h> 70c4e44ee2Sriastradh 71c4e44ee2Sriastradh #include "c32rtomb.h" 724651a634Sriastradh #include "setlocale_local.h" 73c4e44ee2Sriastradh 74c4e44ee2Sriastradh struct c8rtombstate { 75c4e44ee2Sriastradh char32_t state_c32; /* 8-bit state and 24-bit buffer */ 76c4e44ee2Sriastradh mbstate_t mbs; 77c4e44ee2Sriastradh }; 78c4e44ee2Sriastradh __CTASSERT(offsetof(struct c8rtombstate, mbs) <= sizeof(mbstate_t)); 79c4e44ee2Sriastradh __CTASSERT(sizeof(struct c32rtombstate) <= sizeof(mbstate_t) - 80c4e44ee2Sriastradh offsetof(struct c8rtombstate, mbs)); 81c4e44ee2Sriastradh __CTASSERT(alignof(struct c8rtombstate) <= alignof(mbstate_t)); 82c4e44ee2Sriastradh 83c4e44ee2Sriastradh /* 84c4e44ee2Sriastradh * UTF-8 validation, inspired by Bjoern Hoermann's UTF-8 decoder at 85c4e44ee2Sriastradh * <http://bjoern.hoehrmann.de/utf-8/decoder/dfa/>, but reimplemented 86c4e44ee2Sriastradh * from scratch. 87c4e44ee2Sriastradh */ 88c4e44ee2Sriastradh 89c4e44ee2Sriastradh #define UTF8_ACCEPT 0 90c4e44ee2Sriastradh #define UTF8_REJECT 96 91c4e44ee2Sriastradh 92*49c2256cSrillig typedef uint8_t utf8_class_t; 93*49c2256cSrillig typedef uint8_t utf8_state_t; 94c4e44ee2Sriastradh 9551dce3f5Srillig static const uint8_t utf8_classtab[] = { 96c4e44ee2Sriastradh 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 97c4e44ee2Sriastradh 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 98c4e44ee2Sriastradh 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 99c4e44ee2Sriastradh 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 100c4e44ee2Sriastradh 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 101c4e44ee2Sriastradh 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 102c4e44ee2Sriastradh 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 103c4e44ee2Sriastradh 11,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 7,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, 104c4e44ee2Sriastradh }; 105c4e44ee2Sriastradh 10651dce3f5Srillig static const uint8_t utf8_statetab[] = { 107c4e44ee2Sriastradh 0,96,12,36,48,84,72,60,96,96,96,24, 96, 0,96,96,96,96,96,96, 0, 0,96,96, 108c4e44ee2Sriastradh 96,12,96,96,96,96,96,96,96,96,96,96, 96,12,96,96,96,96,96,96,12,12,96,96, 109c4e44ee2Sriastradh 96,96,96,96,96,96,96,96,12,12,96,96, 96,36,96,96,96,96,96,96,96,36,96,96, 110c4e44ee2Sriastradh 96,36,96,96,96,96,96,96,36,36,96,96, 96,96,96,96,96,96,96,96,36,96,96,96, 111c4e44ee2Sriastradh 96,96,96,96,96,96,96,96,96,96,96,96, 112c4e44ee2Sriastradh }; 113c4e44ee2Sriastradh 114c4e44ee2Sriastradh static utf8_state_t 115c4e44ee2Sriastradh utf8_decode_step(utf8_state_t state, char8_t c8, char32_t *pc32) 116c4e44ee2Sriastradh { 117c4e44ee2Sriastradh const utf8_class_t class = utf8_classtab[c8]; 118c4e44ee2Sriastradh 119c4e44ee2Sriastradh *pc32 = (state == UTF8_ACCEPT 120c4e44ee2Sriastradh ? (c8 & (0xff >> class)) 121c4e44ee2Sriastradh : ((c8 & 0x3f) | (*pc32 << 6))); 122c4e44ee2Sriastradh 123c4e44ee2Sriastradh return utf8_statetab[state + class]; 124c4e44ee2Sriastradh } 125c4e44ee2Sriastradh 1264651a634Sriastradh #ifdef __weak_alias 1274651a634Sriastradh __weak_alias(c8rtomb_l,_c8rtomb_l) 1284651a634Sriastradh #endif 1294651a634Sriastradh 130c4e44ee2Sriastradh size_t 131c4e44ee2Sriastradh c8rtomb(char *restrict s, char8_t c8, mbstate_t *restrict ps) 132c4e44ee2Sriastradh { 1334651a634Sriastradh 1344651a634Sriastradh return c8rtomb_l(s, c8, ps, _current_locale()); 1354651a634Sriastradh } 1364651a634Sriastradh 1374651a634Sriastradh size_t 1384651a634Sriastradh c8rtomb_l(char *restrict s, char8_t c8, mbstate_t *restrict ps, locale_t loc) 1394651a634Sriastradh { 140c4e44ee2Sriastradh static mbstate_t psbuf; 141c4e44ee2Sriastradh char buf[MB_LEN_MAX]; 142c4e44ee2Sriastradh struct c8rtombstate *S; 143c4e44ee2Sriastradh utf8_state_t state; 144c4e44ee2Sriastradh char32_t c32; 145c4e44ee2Sriastradh 146c4e44ee2Sriastradh /* 147c4e44ee2Sriastradh * `If ps is a null pointer, each function uses its own 148c4e44ee2Sriastradh * internal mbstate_t object instead, which is initialized at 149c4e44ee2Sriastradh * program startup to the initial conversion state; the 150c4e44ee2Sriastradh * functions are not required to avoid data races with other 151c4e44ee2Sriastradh * calls to the same function in this case. The 152c4e44ee2Sriastradh * implementation behaves as if no library function calls 153c4e44ee2Sriastradh * these functions with a null pointer for ps.' 154c4e44ee2Sriastradh */ 155c4e44ee2Sriastradh if (ps == NULL) 156c4e44ee2Sriastradh ps = &psbuf; 157c4e44ee2Sriastradh 158c4e44ee2Sriastradh /* 159c4e44ee2Sriastradh * `If s is a null pointer, the c8rtomb function is equivalent 160c4e44ee2Sriastradh * to the call 161c4e44ee2Sriastradh * 162c4e44ee2Sriastradh * c8rtomb(buf, u8'\0', ps) 163c4e44ee2Sriastradh * 164c4e44ee2Sriastradh * where buf is an internal buffer. 165c4e44ee2Sriastradh */ 166c4e44ee2Sriastradh if (s == NULL) { 167c4e44ee2Sriastradh s = buf; 168c4e44ee2Sriastradh c8 = 0; /* XXX u8'\0' */ 169c4e44ee2Sriastradh } 170c4e44ee2Sriastradh 171c4e44ee2Sriastradh /* 172c4e44ee2Sriastradh * Open the private UTF-8 decoding state. 173c4e44ee2Sriastradh */ 1740d3267e8Schristos S = (struct c8rtombstate *)(void *)ps; 175c4e44ee2Sriastradh 176c4e44ee2Sriastradh /* 177c4e44ee2Sriastradh * `If c8 is a null character, a null byte is stored, preceded 178c4e44ee2Sriastradh * by any shift sequence needed to restore the initial shift 179c4e44ee2Sriastradh * state; the resulting state described is the initial 180c4e44ee2Sriastradh * conversion state.' 181c4e44ee2Sriastradh * 1824a53dcdcSriastradh * So if c8 is null, discard any buffered input -- there's 1834a53dcdcSriastradh * nothing we can legitimately do with it -- and convert a null 1844a53dcdcSriastradh * scalar value, which by definition of c32rtomb writes out any 1854a53dcdcSriastradh * shift sequence reset followed by a null byte. 186c4e44ee2Sriastradh */ 1874a53dcdcSriastradh if (c8 == '\0') { 1884a53dcdcSriastradh c32 = 0; 1894a53dcdcSriastradh goto accept; 190c4e44ee2Sriastradh } 191c4e44ee2Sriastradh 192c4e44ee2Sriastradh /* 193c4e44ee2Sriastradh * Get the current state and buffer. 194c4e44ee2Sriastradh */ 195c4e44ee2Sriastradh __CTASSERT(UTF8_ACCEPT == 0); /* initial conversion state */ 196bb01338bSriastradh state = __SHIFTOUT(S->state_c32, __BITS(31,24)); 197bb01338bSriastradh c32 = __SHIFTOUT(S->state_c32, __BITS(23,0)); 198c4e44ee2Sriastradh 199c4e44ee2Sriastradh /* 200c4e44ee2Sriastradh * Feed the byte into the state machine to update the state. 201c4e44ee2Sriastradh */ 202c4e44ee2Sriastradh state = utf8_decode_step(state, c8, &c32); 203c4e44ee2Sriastradh switch (state) { 204c4e44ee2Sriastradh case UTF8_REJECT: 205c4e44ee2Sriastradh /* 206c4e44ee2Sriastradh * Invalid UTF-8. Fail with EILSEQ. 207c4e44ee2Sriastradh */ 208c4e44ee2Sriastradh errno = EILSEQ; 209c4e44ee2Sriastradh return (size_t)-1; 210c4e44ee2Sriastradh default: 211c4e44ee2Sriastradh /* 212c4e44ee2Sriastradh * Valid UTF-8 so far but incomplete. Update state and 213c4e44ee2Sriastradh * output nothing. 214c4e44ee2Sriastradh */ 215*49c2256cSrillig S->state_c32 = 2160d3267e8Schristos __SHIFTIN(state, __BITS(31,24)) | 217*49c2256cSrillig __SHIFTIN(c32, __BITS(23,0)); 218c4e44ee2Sriastradh return 0; 219c4e44ee2Sriastradh case UTF8_ACCEPT: 2204a53dcdcSriastradh accept: 221c4e44ee2Sriastradh /* 222c4e44ee2Sriastradh * We have a scalar value. Clear the state and output 223c4e44ee2Sriastradh * the scalar value. 224c4e44ee2Sriastradh */ 225c4e44ee2Sriastradh S->state_c32 = 0; 2264651a634Sriastradh return c32rtomb_l(s, c32, &S->mbs, loc); 227c4e44ee2Sriastradh } 228c4e44ee2Sriastradh } 229