xref: /netbsd-src/lib/libc/locale/mbrtoc16.3 (revision fdd9db8a91c767e1b3e0b7be194f588935269cca)
1*fdd9db8aSriastradh.\"	$NetBSD: mbrtoc16.3,v 1.10 2024/08/23 12:59:49 riastradh Exp $
22cbd152aSriastradh.\"
32cbd152aSriastradh.\" Copyright (c) 2024 The NetBSD Foundation, Inc.
42cbd152aSriastradh.\" All rights reserved.
52cbd152aSriastradh.\"
62cbd152aSriastradh.\" Redistribution and use in source and binary forms, with or without
72cbd152aSriastradh.\" modification, are permitted provided that the following conditions
82cbd152aSriastradh.\" are met:
92cbd152aSriastradh.\" 1. Redistributions of source code must retain the above copyright
102cbd152aSriastradh.\"    notice, this list of conditions and the following disclaimer.
112cbd152aSriastradh.\" 2. Redistributions in binary form must reproduce the above copyright
122cbd152aSriastradh.\"    notice, this list of conditions and the following disclaimer in the
132cbd152aSriastradh.\"    documentation and/or other materials provided with the distribution.
142cbd152aSriastradh.\"
152cbd152aSriastradh.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
162cbd152aSriastradh.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
172cbd152aSriastradh.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
182cbd152aSriastradh.\" PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
192cbd152aSriastradh.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
202cbd152aSriastradh.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
212cbd152aSriastradh.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
222cbd152aSriastradh.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
232cbd152aSriastradh.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
242cbd152aSriastradh.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
252cbd152aSriastradh.\" POSSIBILITY OF SUCH DAMAGE.
262cbd152aSriastradh.\"
272cbd152aSriastradh.Dd August 14, 2024
282cbd152aSriastradh.Dt MBRTOC16 3
292cbd152aSriastradh.Os
302cbd152aSriastradh.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
312cbd152aSriastradh.Sh NAME
322cbd152aSriastradh.Nm mbrtoc16
332514fdacSriastradh.Nd Restartable multibyte to UTF-16 conversion
342cbd152aSriastradh.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
352cbd152aSriastradh.Sh LIBRARY
362cbd152aSriastradh.Lb libc
372cbd152aSriastradh.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
382cbd152aSriastradh.Sh SYNOPSIS
39e5f039efSuwe.
402cbd152aSriastradh.In uchar.h
41e5f039efSuwe.
422cbd152aSriastradh.Ft size_t
43e5f039efSuwe.Fo mbrtoc16
44e5f039efSuwe.Fa "char16_t * restrict pc16"
45e5f039efSuwe.Fa "const char * restrict s"
46e5f039efSuwe.Fa "size_t n"
47e5f039efSuwe.Fa "mbstate_t * restrict ps"
48e5f039efSuwe.Fc
492cbd152aSriastradh.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
502cbd152aSriastradh.Sh DESCRIPTION
512cbd152aSriastradhThe
522cbd152aSriastradh.Nm
5314f09d28Sriastradhfunction decodes multibyte characters in the current locale and
5414f09d28Sriastradhconverts them to UTF-16, keeping state so it can restart after
5514f09d28Sriastradhincremental progress.
562514fdacSriastradh.Pp
572514fdacSriastradhEach call to
582514fdacSriastradh.Nm :
592514fdacSriastradh.Bl -enum -compact
602514fdacSriastradh.It
612514fdacSriastradhexamines up to
622cbd152aSriastradh.Fa n
632514fdacSriastradhbytes starting at
642514fdacSriastradh.Fa s ,
652514fdacSriastradh.It
662514fdacSriastradhyields a UTF-16 code unit if available by storing it at
672514fdacSriastradh.Li * Ns Fa pc16 ,
682514fdacSriastradh.It
692514fdacSriastradhsaves state at
702514fdacSriastradh.Fa ps ,
712514fdacSriastradhand
722514fdacSriastradh.It
732514fdacSriastradhreturns either the number of bytes consumed if any or a special return
742514fdacSriastradhvalue.
752514fdacSriastradh.El
762514fdacSriastradh.Pp
772514fdacSriastradhSpecifically:
782cbd152aSriastradh.Bl -bullet
792cbd152aSriastradh.It
802cbd152aSriastradhIf the multibyte sequence at
812cbd152aSriastradh.Fa s
822514fdacSriastradhis invalid after any previous input saved at
832514fdacSriastradh.Fa ps ,
842514fdacSriastradhor if an error occurs in decoding,
852cbd152aSriastradh.Nm
862cbd152aSriastradhreturns
872cbd152aSriastradh.Li (size_t)-1
882cbd152aSriastradhand sets
892cbd152aSriastradh.Xr errno 2
902cbd152aSriastradhto indicate the error.
912cbd152aSriastradh.It
922cbd152aSriastradhIf the multibyte sequence at
932cbd152aSriastradh.Fa s
942cbd152aSriastradhis still incomplete after
952cbd152aSriastradh.Fa n
962514fdacSriastradhbytes, including any previous input saved in
972cbd152aSriastradh.Fa ps ,
982cbd152aSriastradh.Nm
992cbd152aSriastradhsaves its state in
1002cbd152aSriastradh.Fa ps
1012cbd152aSriastradhafter all the input so far and returns
1022cbd152aSriastradh.Li "(size_t)-2" .
1037a641cfeSriastradh.Sy All
1047a641cfeSriastradh.Fa n
1057a641cfeSriastradhbytes of input are consumed in this case.
1062cbd152aSriastradh.It
1072cbd152aSriastradhIf
1082cbd152aSriastradh.Nm
1092514fdacSriastradhhad previously decoded a multibyte character but has not yet yielded
1102514fdacSriastradhall the code units of its UTF-16 encoding, it stores the next UTF-16
1112514fdacSriastradhcode unit at
1122514fdacSriastradh.Li * Ns Fa pc16
1132514fdacSriastradhand returns
1142514fdacSriastradh.Li "(size_t)-3" .
1157a641cfeSriastradh.Sy \&No
1167a641cfeSriastradhbytes of input are consumed in this case.
1172514fdacSriastradh.It
1182514fdacSriastradhIf
1192514fdacSriastradh.Nm
1202514fdacSriastradhdecodes the null multibyte character, then it stores zero at
1212cbd152aSriastradh.Li * Ns Fa pc16
1222cbd152aSriastradhand returns zero.
1232cbd152aSriastradh.It
1242514fdacSriastradhOtherwise,
1252cbd152aSriastradh.Nm
1262514fdacSriastradhdecodes a single multibyte character, stores the first (and possibly
1272514fdacSriastradhonly) code unit in its UTF-16 encoding at
1282cbd152aSriastradh.Li * Ns Fa pc16 ,
1292514fdacSriastradhand returns the number of bytes consumed to decode the first multibyte
1302514fdacSriastradhcharacter.
1312514fdacSriastradh.El
1322514fdacSriastradh.Pp
1332cbd152aSriastradhIf
1342514fdacSriastradh.Fa pc16
1352514fdacSriastradhis a null pointer, nothing is stored, but the effects on
1362cbd152aSriastradh.Fa ps
1372514fdacSriastradhand the return value are unchanged.
1382cbd152aSriastradh.Pp
1392cbd152aSriastradhIf
1402cbd152aSriastradh.Fa s
1412cbd152aSriastradhis a null pointer, the
1422cbd152aSriastradh.Nm
1432cbd152aSriastradhcall is equivalent to:
1442cbd152aSriastradh.Bd -ragged -offset indent
1452cbd152aSriastradh.Fo mbrtoc16
1462cbd152aSriastradh.Li NULL ,
1472cbd152aSriastradh.Li \*q\*q ,
1482cbd152aSriastradh.Li 1 ,
1492cbd152aSriastradh.Fa ps
1502cbd152aSriastradh.Fc
1512cbd152aSriastradh.Ed
1522cbd152aSriastradh.Pp
1532cbd152aSriastradhThis always returns zero, and has the effect of resetting
1542cbd152aSriastradh.Fa ps
1552cbd152aSriastradhto the initial conversion state, without writing to
1562cbd152aSriastradh.Fa pc16 ,
1572cbd152aSriastradheven if it is nonnull.
1582cbd152aSriastradh.Pp
1592cbd152aSriastradhIf
1602cbd152aSriastradh.Fa ps
1612cbd152aSriastradhis a null pointer,
1622cbd152aSriastradh.Nm
1632cbd152aSriastradhuses an internal
1642cbd152aSriastradh.Vt mbstate_t
1652cbd152aSriastradhobject with static storage duration, distinct from all other
1662cbd152aSriastradh.Vt mbstate_t
167e5f039efSuweobjects
168e5f039efSuwe.Po
169e5f039efSuweincluding those used by
170676a922fSriastradh.Xr mbrtoc8 3 ,
1712cbd152aSriastradh.Xr mbrtoc32 3 ,
172676a922fSriastradh.Xr c8rtomb 3 ,
1732cbd152aSriastradh.Xr c16rtomb 3 ,
1742cbd152aSriastradhand
175e5f039efSuwe.Xr c32rtomb 3
176e5f039efSuwe.Pc ,
1772cbd152aSriastradhwhich is initialized at program startup to the initial conversion
1782cbd152aSriastradhstate.
1792cbd152aSriastradh.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
1802514fdacSriastradh.Sh IMPLEMENTATION NOTES
1812514fdacSriastradhOn well-formed input, the
1822514fdacSriastradh.Nm
1832514fdacSriastradhfunction yields either a Unicode scalar value in the Basic Multilingual
1842514fdacSriastradhPlane (BMP), i.e., a 16-bit Unicode code point that is not a surrogate
1852514fdacSriastradhcode point, or, over two successive calls, yields the high and low
1862514fdacSriastradhsurrogate code points (in that order) of a Unicode scalar value outside
1872514fdacSriastradhthe BMP.
1882514fdacSriastradh.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
1892cbd152aSriastradh.Sh RETURN VALUES
1902cbd152aSriastradhThe
1912cbd152aSriastradh.Nm
1922cbd152aSriastradhfunction returns:
193e5f039efSuwe.Bl -tag -width Li
1942cbd152aSriastradh.It Li 0
195e5f039efSuwe.Bq null
196f51bb893Sriastradhif
197f51bb893Sriastradh.Nm
198f51bb893Sriastradhdecoded a null multibyte character.
199e5f039efSuwe.It Ar i
200e5f039efSuwe.Bq code unit
2012cbd152aSriastradhwhere
2027a641cfeSriastradh.Li 1
2032cbd152aSriastradh\*(Le
204e5f039efSuwe.Ar i
2052cbd152aSriastradh\*(Le
2062cbd152aSriastradh.Fa n ,
207f51bb893Sriastradhif
2082cbd152aSriastradh.Nm
209f51bb893Sriastradhconsumed
210f51bb893Sriastradh.Ar i
211f51bb893Sriastradhbytes of input to decode the next multibyte character, yielding a
2122514fdacSriastradhUTF-16 code unit.
2132cbd152aSriastradh.It Li (size_t)-3
214e5f039efSuwe.Bq continuation
215f51bb893Sriastradhif
216f51bb893Sriastradh.Nm
2172514fdacSriastradhconsumed no new bytes of input but yielded a UTF-16 code unit that was
2182514fdacSriastradhpending from previous input.
2192cbd152aSriastradh.It Li (size_t)-2
220e5f039efSuwe.Bq incomplete
221f51bb893Sriastradhif
2222cbd152aSriastradh.Nm
2232514fdacSriastradhfound only an incomplete multibyte sequence after all
2242cbd152aSriastradh.Fa n
2252514fdacSriastradhbytes of input and any previous input, and saved its state to restart
2262514fdacSriastradhin the next call with
227f51bb893Sriastradh.Fa ps .
2282cbd152aSriastradh.It Li (size_t)-1
229e5f039efSuwe.Bq error
2302cbd152aSriastradhif any encoding error was detected;
2312cbd152aSriastradh.Xr errno 2
2322cbd152aSriastradhis set to reflect the error.
2332cbd152aSriastradh.El
2342cbd152aSriastradh.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
2352cbd152aSriastradh.Sh EXAMPLES
2362cbd152aSriastradhPrint the UTF-16 code units of a multibyte string in hexadecimal text:
2372cbd152aSriastradh.Bd -literal -offset indent
2382cbd152aSriastradhchar *s = ...;
2392cbd152aSriastradhsize_t n = ...;
2402cbd152aSriastradhmbstate_t mbs = {0};    /* initial conversion state */
2412cbd152aSriastradh
2422cbd152aSriastradhwhile (n) {
2432cbd152aSriastradh        char16_t c16;
2442cbd152aSriastradh        size_t len;
2452cbd152aSriastradh
2462cbd152aSriastradh        len = mbrtoc16(&c16, s, n, &mbs);
2472cbd152aSriastradh        switch (len) {
248*fdd9db8aSriastradh        case 0:         /* NUL terminator */
249*fdd9db8aSriastradh                assert(c16 == 0);
2502cbd152aSriastradh                goto out;
2512cbd152aSriastradh        default:        /* scalar value or high surrogate */
25207c776ecSriastradh                printf("U+%04"PRIx16"\en", (uint16_t)c16);
2532cbd152aSriastradh                break;
2542cbd152aSriastradh        case (size_t)-3: /* low surrogate */
25507c776ecSriastradh                printf("continue U+%04"PRIx16"\en", (uint16_t)c16);
2562cbd152aSriastradh                continue;       /* no input consumed: skip s/n update */
2572cbd152aSriastradh        case (size_t)-2: /* incomplete */
2582cbd152aSriastradh                printf("incomplete\en");
2592cbd152aSriastradh                goto readmore;
2602cbd152aSriastradh        case (size_t)-1: /* error */
26107c776ecSriastradh                printf("error: %d\en", errno);
2622cbd152aSriastradh                goto out;
2632cbd152aSriastradh        }
2642cbd152aSriastradh        s += len;
2652cbd152aSriastradh        n -= len;
2662cbd152aSriastradh}
2672cbd152aSriastradh.Ed
2682cbd152aSriastradh.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
2692cbd152aSriastradh.Sh ERRORS
270e5f039efSuwe.Bl -tag -width Bq
2712cbd152aSriastradh.It Bq Er EILSEQ
2722514fdacSriastradhThe multibyte sequence cannot be decoded in the current locale as a
2732514fdacSriastradhUnicode scalar value.
2742cbd152aSriastradh.It Bq Er EIO
2752cbd152aSriastradhAn error occurred in loading the locale's character conversions.
2762cbd152aSriastradh.El
2772cbd152aSriastradh.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
2782cbd152aSriastradh.Sh SEE ALSO
2792cbd152aSriastradh.Xr c16rtomb 3 ,
2802cbd152aSriastradh.Xr c32rtomb 3 ,
281685764b6Sriastradh.Xr c8rtomb 3 ,
2822cbd152aSriastradh.Xr mbrtoc32 3 ,
283685764b6Sriastradh.Xr mbrtoc8 3 ,
2842cbd152aSriastradh.Xr uchar 3
2852cbd152aSriastradh.Rs
2862cbd152aSriastradh.%B The Unicode Standard
2872cbd152aSriastradh.%O Version 15.0 \(em Core Specification
2882cbd152aSriastradh.%Q The Unicode Consortium
2892cbd152aSriastradh.%D September 2022
2902cbd152aSriastradh.%U https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf
2912cbd152aSriastradh.Re
2922cbd152aSriastradh.Rs
2932cbd152aSriastradh.%A P. Hoffman
2942cbd152aSriastradh.%A F. Yergeau
2952cbd152aSriastradh.%T UTF-16, an encoding of ISO 10646
2962cbd152aSriastradh.%R RFC 2781
2972cbd152aSriastradh.%D February 2000
2982cbd152aSriastradh.%I Internet Engineering Task Force
2992cbd152aSriastradh.%U https://datatracker.ietf.org/doc/html/rfc2781
3002cbd152aSriastradh.Re
3012cbd152aSriastradh.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
3022cbd152aSriastradh.Sh STANDARDS
3032cbd152aSriastradhThe
3042cbd152aSriastradh.Nm
3052cbd152aSriastradhfunction conforms to
3062cbd152aSriastradh.St -isoC-2011 .
3072cbd152aSriastradh.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
3082cbd152aSriastradh.Sh HISTORY
3092cbd152aSriastradhThe
3102cbd152aSriastradh.Nm
3112cbd152aSriastradhfunction first appeared in
3122cbd152aSriastradh.Nx 11.0 .
313