xref: /netbsd-src/lib/libc/locale/mbrtoc8.3 (revision fdd9db8a91c767e1b3e0b7be194f588935269cca)
1*fdd9db8aSriastradh.\"	$NetBSD: mbrtoc8.3,v 1.7 2024/08/23 12:59:49 riastradh Exp $
2c4e44ee2Sriastradh.\"
3c4e44ee2Sriastradh.\" Copyright (c) 2024 The NetBSD Foundation, Inc.
4c4e44ee2Sriastradh.\" All rights reserved.
5c4e44ee2Sriastradh.\"
6c4e44ee2Sriastradh.\" Redistribution and use in source and binary forms, with or without
7c4e44ee2Sriastradh.\" modification, are permitted provided that the following conditions
8c4e44ee2Sriastradh.\" are met:
9c4e44ee2Sriastradh.\" 1. Redistributions of source code must retain the above copyright
10c4e44ee2Sriastradh.\"    notice, this list of conditions and the following disclaimer.
11c4e44ee2Sriastradh.\" 2. Redistributions in binary form must reproduce the above copyright
12c4e44ee2Sriastradh.\"    notice, this list of conditions and the following disclaimer in the
13c4e44ee2Sriastradh.\"    documentation and/or other materials provided with the distribution.
14c4e44ee2Sriastradh.\"
15c4e44ee2Sriastradh.\" THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
16c4e44ee2Sriastradh.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
17c4e44ee2Sriastradh.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18c4e44ee2Sriastradh.\" PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
19c4e44ee2Sriastradh.\" BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20c4e44ee2Sriastradh.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21c4e44ee2Sriastradh.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
22c4e44ee2Sriastradh.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
23c4e44ee2Sriastradh.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
24c4e44ee2Sriastradh.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25c4e44ee2Sriastradh.\" POSSIBILITY OF SUCH DAMAGE.
26c4e44ee2Sriastradh.\"
27c4e44ee2Sriastradh.Dd August 15, 2024
28c4e44ee2Sriastradh.Dt MBRTOC8 3
29c4e44ee2Sriastradh.Os
30c4e44ee2Sriastradh.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
31c4e44ee2Sriastradh.Sh NAME
32c4e44ee2Sriastradh.Nm mbrtoc8
3361573944Sriastradh.Nd Restartable multibyte to UTF-8 conversion
34c4e44ee2Sriastradh.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
35c4e44ee2Sriastradh.Sh LIBRARY
36c4e44ee2Sriastradh.Lb libc
37c4e44ee2Sriastradh.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
38c4e44ee2Sriastradh.Sh SYNOPSIS
39d9a547eaSuwe.
40c4e44ee2Sriastradh.In uchar.h
41d9a547eaSuwe.
42c4e44ee2Sriastradh.Ft size_t
43d9a547eaSuwe.Fo mbrtoc8
44d9a547eaSuwe.Fa "char8_t * restrict pc8"
45d9a547eaSuwe.Fa "const char * restrict s"
46d9a547eaSuwe.Fa "size_t n"
47d9a547eaSuwe.Fa "mbstate_t * restrict ps"
48d9a547eaSuwe.Fc
49c4e44ee2Sriastradh.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
50c4e44ee2Sriastradh.Sh DESCRIPTION
51c4e44ee2SriastradhThe
52c4e44ee2Sriastradh.Nm
534c9fb026Sriastradhfunction decodes multibyte characters in the current locale and
544c9fb026Sriastradhconverts them to UTF-8, keeping state so it can restart after
554c9fb026Sriastradhincremental progress.
5661573944Sriastradh.Pp
5761573944SriastradhEach call to
5861573944Sriastradh.Nm :
5961573944Sriastradh.Bl -enum -compact
6061573944Sriastradh.It
6161573944Sriastradhexamines up to
62c4e44ee2Sriastradh.Fa n
6361573944Sriastradhbytes starting at
6461573944Sriastradh.Fa s ,
6561573944Sriastradh.It
6661573944Sriastradhyields a UTF-8 code unit if available by storing it at
6761573944Sriastradh.Li * Ns Fa pc8 ,
6861573944Sriastradh.It
6961573944Sriastradhsaves state at
7061573944Sriastradh.Fa ps ,
7161573944Sriastradhand
7261573944Sriastradh.It
7361573944Sriastradhreturns either the number of bytes consumed if any or a special return
7461573944Sriastradhvalue.
7561573944Sriastradh.El
7661573944Sriastradh.Pp
7761573944SriastradhSpecifically:
78c4e44ee2Sriastradh.Bl -bullet
79c4e44ee2Sriastradh.It
80c4e44ee2SriastradhIf the multibyte sequence at
81c4e44ee2Sriastradh.Fa s
8261573944Sriastradhis invalid after any previous input saved at
8361573944Sriastradh.Fa ps ,
8461573944Sriastradhor if an error occurs in decoding,
85c4e44ee2Sriastradh.Nm
86c4e44ee2Sriastradhreturns
87c4e44ee2Sriastradh.Li (size_t)-1
88c4e44ee2Sriastradhand sets
89c4e44ee2Sriastradh.Xr errno 2
90c4e44ee2Sriastradhto indicate the error.
91c4e44ee2Sriastradh.It
92c4e44ee2SriastradhIf the multibyte sequence at
93c4e44ee2Sriastradh.Fa s
94c4e44ee2Sriastradhis still incomplete after
95c4e44ee2Sriastradh.Fa n
9661573944Sriastradhbytes, including any previous input saved in
97c4e44ee2Sriastradh.Fa ps ,
98c4e44ee2Sriastradh.Nm
99c4e44ee2Sriastradhsaves its state in
100c4e44ee2Sriastradh.Fa ps
101c4e44ee2Sriastradhafter all the input so far and returns
102c4e44ee2Sriastradh.Li "(size_t)-2" .
10384580e03Sriastradh.Sy All
10484580e03Sriastradh.Fa n
10584580e03Sriastradhbytes of input are consumed in this case.
106c4e44ee2Sriastradh.It
107c4e44ee2SriastradhIf
108c4e44ee2Sriastradh.Nm
10961573944Sriastradhhad previously decoded a multibyte character but has not yet yielded
11061573944Sriastradhall the code units of its UTF-8 encoding, it stores the next UTF-8 code
11161573944Sriastradhunit at
11261573944Sriastradh.Li * Ns Fa pc8
11361573944Sriastradhand returns
11461573944Sriastradh.Li "(size_t)-3" .
11584580e03Sriastradh.Sy \&No
11684580e03Sriastradhinput is consumed in this case.
11761573944Sriastradh.It
11861573944SriastradhIf
11961573944Sriastradh.Nm
12061573944Sriastradhdecodes the null multibyte character, then it stores zero at
121c4e44ee2Sriastradh.Li * Ns Fa pc8
122c4e44ee2Sriastradhand returns zero.
123c4e44ee2Sriastradh.It
12461573944SriastradhOtherwise,
125c4e44ee2Sriastradh.Nm
12661573944Sriastradhdecodes a single multibyte character, stores the first (and possibly
12761573944Sriastradhonly) code unit in its UTF-8 encoding at
128c4e44ee2Sriastradh.Li * Ns Fa pc8 ,
12961573944Sriastradhand returns the number of bytes consumed to decode the first multibyte
13061573944Sriastradhcharacter.
13161573944Sriastradh.El
13261573944Sriastradh.Pp
133c4e44ee2SriastradhIf
13461573944Sriastradh.Fa pc8
13561573944Sriastradhis a null pointer, nothing is stored, but the effects on
136c4e44ee2Sriastradh.Fa ps
13761573944Sriastradhand the return value are unchanged.
138c4e44ee2Sriastradh.Pp
139c4e44ee2SriastradhIf
140c4e44ee2Sriastradh.Fa s
141c4e44ee2Sriastradhis a null pointer, the
142c4e44ee2Sriastradh.Nm
143c4e44ee2Sriastradhcall is equivalent to:
144c4e44ee2Sriastradh.Bd -ragged -offset indent
145c4e44ee2Sriastradh.Fo mbrtoc8
146c4e44ee2Sriastradh.Li NULL ,
147c4e44ee2Sriastradh.Li \*q\*q ,
148c4e44ee2Sriastradh.Li 1 ,
149c4e44ee2Sriastradh.Fa ps
150c4e44ee2Sriastradh.Fc
151c4e44ee2Sriastradh.Ed
152c4e44ee2Sriastradh.Pp
153c4e44ee2SriastradhThis always returns zero, and has the effect of resetting
154c4e44ee2Sriastradh.Fa ps
155c4e44ee2Sriastradhto the initial conversion state, without writing to
156c4e44ee2Sriastradh.Fa pc8 ,
157c4e44ee2Sriastradheven if it is nonnull.
158c4e44ee2Sriastradh.Pp
159c4e44ee2SriastradhIf
160c4e44ee2Sriastradh.Fa ps
161c4e44ee2Sriastradhis a null pointer,
162c4e44ee2Sriastradh.Nm
163c4e44ee2Sriastradhuses an internal
164c4e44ee2Sriastradh.Vt mbstate_t
165c4e44ee2Sriastradhobject with static storage duration, distinct from all other
166c4e44ee2Sriastradh.Vt mbstate_t
167d9a547eaSuweobjects
168d9a547eaSuwe.Po
169d9a547eaSuweincluding those used by
170c4e44ee2Sriastradh.Xr mbrtoc16 3 ,
171c4e44ee2Sriastradh.Xr mbrtoc32 3 ,
172c4e44ee2Sriastradh.Xr c8rtomb 3 ,
173c4e44ee2Sriastradh.Xr c16rtomb 3 ,
174c4e44ee2Sriastradhand
175d9a547eaSuwe.Xr c32rtomb 3
176d9a547eaSuwe.Pc ,
177c4e44ee2Sriastradhwhich is initialized at program startup to the initial conversion
178c4e44ee2Sriastradhstate.
179c4e44ee2Sriastradh.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
18061573944Sriastradh.Sh IMPLEMENTATION NOTES
18161573944SriastradhOn well-formed input, the
18261573944Sriastradh.Nm
18361573944Sriastradhfunction yields either a Unicode scalar value in the US-ASCII range, i.e.,
18461573944Sriastradha 7-bit Unicode code point, or, over two to four successive calls, the
18561573944Sriastradhleading and trailing code units in order of the UTF-8 encoding of a
18661573944SriastradhUnicode scalar value outside the US-ASCII range.
18761573944Sriastradh.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
188c4e44ee2Sriastradh.Sh RETURN VALUES
189c4e44ee2SriastradhThe
190c4e44ee2Sriastradh.Nm
191c4e44ee2Sriastradhfunction returns:
192d9a547eaSuwe.Bl -tag -width Li
193c4e44ee2Sriastradh.It Li 0
194d9a547eaSuwe.Bq null
195b97524e9Sriastradhif
196b97524e9Sriastradh.Nm
197b97524e9Sriastradhdecoded a null multibyte character.
198d9a547eaSuwe.It Ar i
199d9a547eaSuwe.Bq code unit
200c4e44ee2Sriastradhwhere
20184580e03Sriastradh.Li 1
202c4e44ee2Sriastradh\*(Le
203d9a547eaSuwe.Ar i
204c4e44ee2Sriastradh\*(Le
205c4e44ee2Sriastradh.Fa n ,
206b97524e9Sriastradhif
207c4e44ee2Sriastradh.Nm
208b97524e9Sriastradhconsumed
209d9a547eaSuwe.Ar i
210b97524e9Sriastradhbytes of input to decode the next multibyte character, yielding a
21161573944SriastradhUTF-8 code unit.
212c4e44ee2Sriastradh.It Li (size_t)-3
213d9a547eaSuwe.Bq continuation
214b97524e9Sriastradhif
215b97524e9Sriastradh.Nm
21661573944Sriastradhconsumed no new bytes of input but yielded a UTF-8 code unit that was
21761573944Sriastradhpending from previous input.
218c4e44ee2Sriastradh.It Li (size_t)-2
219d9a547eaSuwe.Bq incomplete
220b97524e9Sriastradhif
221c4e44ee2Sriastradh.Nm
22261573944Sriastradhfound only an incomplete multibyte sequence after all
223c4e44ee2Sriastradh.Fa n
22461573944Sriastradhbytes of input and any previous input, and saved its state to restart
22561573944Sriastradhin the next call with
226b97524e9Sriastradh.Fa ps .
227c4e44ee2Sriastradh.It Li (size_t)-1
228d9a547eaSuwe.Bq error
229c4e44ee2Sriastradhif any encoding error was detected;
230c4e44ee2Sriastradh.Xr errno 2
231c4e44ee2Sriastradhis set to reflect the error.
232c4e44ee2Sriastradh.El
233c4e44ee2Sriastradh.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
234c4e44ee2Sriastradh.Sh EXAMPLES
235c4e44ee2SriastradhPrint the UTF-8 code units of a multibyte string in hexadecimal text:
236c4e44ee2Sriastradh.Bd -literal -offset indent
237c4e44ee2Sriastradhchar *s = ...;
238c4e44ee2Sriastradhsize_t n = ...;
239c4e44ee2Sriastradhmbstate_t mbs = {0};    /* initial conversion state */
240c4e44ee2Sriastradh
241c4e44ee2Sriastradhwhile (n) {
242c4e44ee2Sriastradh        char8_t c8;
243c4e44ee2Sriastradh        size_t len;
244c4e44ee2Sriastradh
245c4e44ee2Sriastradh        len = mbrtoc8(&c8, s, n, &mbs);
246c4e44ee2Sriastradh        switch (len) {
247*fdd9db8aSriastradh        case 0:         /* NUL terminator */
248*fdd9db8aSriastradh                assert(c8 == 0);
249c4e44ee2Sriastradh                goto out;
250c4e44ee2Sriastradh        default:        /* consumed input and yielded a byte c8 */
251c4e44ee2Sriastradh                printf("0x%02hhx\en", c8);
252c4e44ee2Sriastradh                break;
253c4e44ee2Sriastradh        case (size_t)-3: /* yielded a pending byte c8 */
254c4e44ee2Sriastradh                printf("continue 0x%02hhx\en", c8);
255c4e44ee2Sriastradh                continue;       /* no input consumed */
256c4e44ee2Sriastradh        case (size_t)-2: /* incomplete */
257c4e44ee2Sriastradh                printf("incomplete\en");
258c4e44ee2Sriastradh                goto readmore;
259c4e44ee2Sriastradh        case (size_t)-1: /* error */
260c4e44ee2Sriastradh                printf("error: %d\en", errno);
261c4e44ee2Sriastradh                goto out;
262c4e44ee2Sriastradh        }
263c4e44ee2Sriastradh        s += len;
264c4e44ee2Sriastradh        n -= len;
265c4e44ee2Sriastradh}
266c4e44ee2Sriastradh.Ed
267c4e44ee2Sriastradh.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
268c4e44ee2Sriastradh.Sh ERRORS
269d9a547eaSuwe.Bl -tag -width Bq
270c4e44ee2Sriastradh.It Bq Er EILSEQ
27161573944SriastradhThe multibyte sequence cannot be decoded in the current locale as a
27261573944SriastradhUnicode scalar value.
273c4e44ee2Sriastradh.It Bq Er EIO
274c4e44ee2SriastradhAn error occurred in loading the locale's character conversions.
275c4e44ee2Sriastradh.El
276c4e44ee2Sriastradh.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
277c4e44ee2Sriastradh.Sh SEE ALSO
278c4e44ee2Sriastradh.Xr c8rtomb 3 ,
279c4e44ee2Sriastradh.Xr c16rtomb 3 ,
280c4e44ee2Sriastradh.Xr c32rtomb 3 ,
281c4e44ee2Sriastradh.Xr mbrtoc16 3 ,
282c4e44ee2Sriastradh.Xr mbrtoc32 3 ,
283c4e44ee2Sriastradh.Xr uchar 3
284c4e44ee2Sriastradh.Rs
285c4e44ee2Sriastradh.%B The Unicode Standard
286c4e44ee2Sriastradh.%O Version 15.0 \(em Core Specification
287c4e44ee2Sriastradh.%Q The Unicode Consortium
288c4e44ee2Sriastradh.%D September 2022
289c4e44ee2Sriastradh.%U https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf
290c4e44ee2Sriastradh.Re
291c4e44ee2Sriastradh.Rs
292c4e44ee2Sriastradh.%A F. Yergeau
293c4e44ee2Sriastradh.%T UTF-8, a transformation format of ISO 10646
294c4e44ee2Sriastradh.%R RFC 3629
295c4e44ee2Sriastradh.%D November 2003
296c4e44ee2Sriastradh.%I Internet Engineering Task Force
297c4e44ee2Sriastradh.%U https://datatracker.ietf.org/doc/html/rfc3629
298c4e44ee2Sriastradh.Re
299c4e44ee2Sriastradh.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
300c4e44ee2Sriastradh.\" .Sh STANDARDS
301c4e44ee2Sriastradh.\" The
302c4e44ee2Sriastradh.\" .Nm
303c4e44ee2Sriastradh.\" function conforms to
304c4e44ee2Sriastradh.\" .St -isoC-2023 .
305c4e44ee2Sriastradh.\" .\" XXX PR misc/58600: man pages lack C17, C23, C++98, C++03, C++11, C++17, C++20, C++23 citation syntax
306c4e44ee2Sriastradh.\"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
307c4e44ee2Sriastradh.Sh HISTORY
308c4e44ee2SriastradhThe
309c4e44ee2Sriastradh.Nm
310c4e44ee2Sriastradhfunction first appeared in
311c4e44ee2Sriastradh.Nx 11.0 .
312