xref: /freebsd-src/sys/libkern/iconv_ucs.c (revision fdafd315ad0d0f28a11b9fb4476a9ab059c62b92)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2003, 2005 Ryuichiro Imura
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
2841f1dcccSKevin Lo 
2941f1dcccSKevin Lo #include <sys/param.h>
3041f1dcccSKevin Lo #include <sys/kernel.h>
3141f1dcccSKevin Lo #include <sys/systm.h>
3241f1dcccSKevin Lo #include <sys/malloc.h>
3341f1dcccSKevin Lo #include <sys/iconv.h>
3441f1dcccSKevin Lo 
3541f1dcccSKevin Lo #include "iconv_converter_if.h"
3641f1dcccSKevin Lo 
/*
 * "UCS" converter
 */

/*
 * Bits stored in iconv_ucs.convtype.  They are computed once in
 * iconv_ucs_open() and steer both halves of iconv_ucs_conv().
 */
#define	KICONV_UCS_COMBINE	0x1	/* a separate "from" cspair was supplied */
#define	KICONV_UCS_FROM_UTF8	0x2	/* source encoding is UTF-8 */
#define	KICONV_UCS_TO_UTF8	0x4	/* target encoding is UTF-8 */
#define	KICONV_UCS_FROM_LE	0x8	/* source 16-bit units are little-endian */
#define	KICONV_UCS_TO_LE	0x10	/* target 16-bit units are little-endian */
#define	KICONV_UCS_FROM_UTF16	0x20	/* source may contain surrogate pairs */
#define	KICONV_UCS_TO_UTF16	0x40	/* target may contain surrogate pairs */
#define	KICONV_UCS_UCS4		0x80	/* ENCODING_UNICODE equals ENCODING_UTF16 */

/* Encoding names used when opening helper converters and registering pairs. */
#define	ENCODING_UTF16	"UTF-16BE"
#define	ENCODING_UTF8	"UTF-8"
5241f1dcccSKevin Lo 
5341f1dcccSKevin Lo static struct {
5441f1dcccSKevin Lo 	const char *name;
5541f1dcccSKevin Lo 	int from_flag, to_flag;
5641f1dcccSKevin Lo } unicode_family[] = {
5741f1dcccSKevin Lo 	{ "UTF-8",	KICONV_UCS_FROM_UTF8,	KICONV_UCS_TO_UTF8 },
5841f1dcccSKevin Lo 	{ "UCS-2LE",	KICONV_UCS_FROM_LE,	KICONV_UCS_TO_LE },
5941f1dcccSKevin Lo 	{ "UTF-16BE",	KICONV_UCS_FROM_UTF16,	KICONV_UCS_TO_UTF16 },
6041f1dcccSKevin Lo 	{ "UTF-16LE",	KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
6141f1dcccSKevin Lo 	    KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
6241f1dcccSKevin Lo 	{ NULL,		0,	0 }
6341f1dcccSKevin Lo };
6441f1dcccSKevin Lo 
/* Helpers defined at the end of this file. */
static uint32_t utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen);
static u_char *ucs4_to_utf8(uint32_t ucs4, char * dst, size_t *utf8width, size_t dstlen);
static uint32_t encode_surrogate(uint32_t code);
static uint32_t decode_surrogate(const u_char *ucs);
6941f1dcccSKevin Lo 
/* Declare the dependency on libiconv when the module framework is available. */
#ifdef MODULE_DEPEND
MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
#endif
7341f1dcccSKevin Lo 
7441f1dcccSKevin Lo /*
7541f1dcccSKevin Lo  * UCS converter instance
7641f1dcccSKevin Lo  */
7741f1dcccSKevin Lo struct iconv_ucs {
7841f1dcccSKevin Lo 	KOBJ_FIELDS;
7941f1dcccSKevin Lo 	int			convtype;
8041f1dcccSKevin Lo 	struct iconv_cspair *	d_csp;
8141f1dcccSKevin Lo 	struct iconv_cspair *	d_cspf;
8241f1dcccSKevin Lo 	void *			f_ctp;
8341f1dcccSKevin Lo 	void *			t_ctp;
8441f1dcccSKevin Lo 	void *			ctype;
8541f1dcccSKevin Lo };
8641f1dcccSKevin Lo 
8741f1dcccSKevin Lo static int
iconv_ucs_open(struct iconv_converter_class * dcp,struct iconv_cspair * csp,struct iconv_cspair * cspf,void ** dpp)8841f1dcccSKevin Lo iconv_ucs_open(struct iconv_converter_class *dcp,
8941f1dcccSKevin Lo 	struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
9041f1dcccSKevin Lo {
9141f1dcccSKevin Lo 	struct iconv_ucs *dp;
9241f1dcccSKevin Lo 	int i;
9341f1dcccSKevin Lo 	const char *from, *to;
9441f1dcccSKevin Lo 
9541f1dcccSKevin Lo 	dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
9641f1dcccSKevin Lo 	to = csp->cp_to;
9741f1dcccSKevin Lo 	from = cspf ? cspf->cp_from : csp->cp_from;
9841f1dcccSKevin Lo 
9941f1dcccSKevin Lo 	dp->convtype = 0;
10041f1dcccSKevin Lo 
10141f1dcccSKevin Lo 	if (cspf)
10241f1dcccSKevin Lo 		dp->convtype |= KICONV_UCS_COMBINE;
10341f1dcccSKevin Lo 	for (i = 0; unicode_family[i].name; i++) {
1040cbce067SJohn Baldwin 		if (strcasecmp(from, unicode_family[i].name) == 0)
10541f1dcccSKevin Lo 			dp->convtype |= unicode_family[i].from_flag;
1060cbce067SJohn Baldwin 		if (strcasecmp(to, unicode_family[i].name) == 0)
10741f1dcccSKevin Lo 			dp->convtype |= unicode_family[i].to_flag;
10841f1dcccSKevin Lo 	}
109fa27760eSKevin Lo 	if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
11041f1dcccSKevin Lo 		dp->convtype |= KICONV_UCS_UCS4;
11141f1dcccSKevin Lo 	else
11241f1dcccSKevin Lo 		dp->convtype &= ~KICONV_UCS_UCS4;
11341f1dcccSKevin Lo 
11441f1dcccSKevin Lo 	dp->f_ctp = dp->t_ctp = NULL;
11541f1dcccSKevin Lo 	if (dp->convtype & KICONV_UCS_COMBINE) {
11641f1dcccSKevin Lo 		if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
11741f1dcccSKevin Lo 		    (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
11841f1dcccSKevin Lo 			iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
11941f1dcccSKevin Lo 		}
12041f1dcccSKevin Lo 		if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
12141f1dcccSKevin Lo 		    (dp->convtype & KICONV_UCS_TO_LE) == 0) {
12241f1dcccSKevin Lo 			iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
12341f1dcccSKevin Lo 		}
12441f1dcccSKevin Lo 	}
12541f1dcccSKevin Lo 
12641f1dcccSKevin Lo 	dp->ctype = NULL;
12741f1dcccSKevin Lo 	if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
12841f1dcccSKevin Lo 		iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
12941f1dcccSKevin Lo 
13041f1dcccSKevin Lo 	dp->d_csp = csp;
13141f1dcccSKevin Lo 	if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
13241f1dcccSKevin Lo 		if (cspf) {
13341f1dcccSKevin Lo 			dp->d_cspf = cspf;
13441f1dcccSKevin Lo 			cspf->cp_refcount++;
13541f1dcccSKevin Lo 		} else
13641f1dcccSKevin Lo 			csp->cp_refcount++;
13741f1dcccSKevin Lo 	}
13841f1dcccSKevin Lo 	if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
13941f1dcccSKevin Lo 		csp->cp_refcount++;
14041f1dcccSKevin Lo 	*dpp = (void*)dp;
14141f1dcccSKevin Lo 	return 0;
14241f1dcccSKevin Lo }
14341f1dcccSKevin Lo 
14441f1dcccSKevin Lo static int
iconv_ucs_close(void * data)14541f1dcccSKevin Lo iconv_ucs_close(void *data)
14641f1dcccSKevin Lo {
14741f1dcccSKevin Lo 	struct iconv_ucs *dp = data;
14841f1dcccSKevin Lo 
14941f1dcccSKevin Lo 	if (dp->f_ctp)
15041f1dcccSKevin Lo 		iconv_close(dp->f_ctp);
15141f1dcccSKevin Lo 	if (dp->t_ctp)
15241f1dcccSKevin Lo 		iconv_close(dp->t_ctp);
15341f1dcccSKevin Lo 	if (dp->ctype)
15441f1dcccSKevin Lo 		iconv_close(dp->ctype);
15541f1dcccSKevin Lo 	if (dp->d_cspf)
15641f1dcccSKevin Lo 		dp->d_cspf->cp_refcount--;
15741f1dcccSKevin Lo 	else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
15841f1dcccSKevin Lo 		dp->d_csp->cp_refcount--;
15941f1dcccSKevin Lo 	if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
16041f1dcccSKevin Lo 		dp->d_csp->cp_refcount--;
16141f1dcccSKevin Lo 	kobj_delete((struct kobj*)data, M_ICONV);
16241f1dcccSKevin Lo 	return 0;
16341f1dcccSKevin Lo }
16441f1dcccSKevin Lo 
16541f1dcccSKevin Lo static int
iconv_ucs_conv(void * d2p,const char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int convchar,int casetype)16641f1dcccSKevin Lo iconv_ucs_conv(void *d2p, const char **inbuf,
16741f1dcccSKevin Lo 	size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
16841f1dcccSKevin Lo 	int convchar, int casetype)
16941f1dcccSKevin Lo {
17041f1dcccSKevin Lo 	struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
17141f1dcccSKevin Lo 	int ret = 0, i;
17241f1dcccSKevin Lo 	size_t in, on, ir, or, inlen, outlen, ucslen;
17341f1dcccSKevin Lo 	const char *src, *p;
17441f1dcccSKevin Lo 	char *dst;
17541f1dcccSKevin Lo 	u_char ucs[4], *q;
17641f1dcccSKevin Lo 	uint32_t code;
17741f1dcccSKevin Lo 
17841f1dcccSKevin Lo 	if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
17941f1dcccSKevin Lo 		return 0;
18041f1dcccSKevin Lo 	ir = in = *inbytesleft;
18141f1dcccSKevin Lo 	or = on = *outbytesleft;
18241f1dcccSKevin Lo 	src = *inbuf;
18341f1dcccSKevin Lo 	dst = *outbuf;
18441f1dcccSKevin Lo 
18541f1dcccSKevin Lo 	while (ir > 0 && or > 0) {
18641f1dcccSKevin Lo 		/*
18741f1dcccSKevin Lo 		 * The first half of conversion.
18841f1dcccSKevin Lo 		 * (convert any code into ENCODING_UNICODE)
18941f1dcccSKevin Lo 		 */
19041f1dcccSKevin Lo 		code = 0;
19141f1dcccSKevin Lo 		p = src;
19241f1dcccSKevin Lo 		if (dp->convtype & KICONV_UCS_FROM_UTF8) {
19341f1dcccSKevin Lo 			/* convert UTF-8 to ENCODING_UNICODE */
19441f1dcccSKevin Lo 			inlen = 0;
19541f1dcccSKevin Lo 			code = utf8_to_ucs4(p, &inlen, ir);
19641f1dcccSKevin Lo 			if (code == 0) {
19741f1dcccSKevin Lo 				ret = -1;
19841f1dcccSKevin Lo 				break;
19941f1dcccSKevin Lo 			}
20041f1dcccSKevin Lo 
20141f1dcccSKevin Lo 			if (casetype == KICONV_FROM_LOWER && dp->ctype) {
20241f1dcccSKevin Lo 				code = towlower(code, dp->ctype);
20341f1dcccSKevin Lo 			} else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
20441f1dcccSKevin Lo 				code = towupper(code, dp->ctype);
20541f1dcccSKevin Lo 			}
20641f1dcccSKevin Lo 
20741f1dcccSKevin Lo 			if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
20841f1dcccSKevin Lo 				/* reserved for utf-16 surrogate pair */
20941f1dcccSKevin Lo 				/* invalid unicode */
21041f1dcccSKevin Lo 				ret = -1;
21141f1dcccSKevin Lo 				break;
21241f1dcccSKevin Lo 			}
21341f1dcccSKevin Lo 
21441f1dcccSKevin Lo 			if (inlen == 4) {
21541f1dcccSKevin Lo 				if (dp->convtype & KICONV_UCS_UCS4) {
21641f1dcccSKevin Lo 					ucslen = 4;
21741f1dcccSKevin Lo 					code = encode_surrogate(code);
21841f1dcccSKevin Lo 				} else {
21941f1dcccSKevin Lo 					/* can't handle with ucs-2 */
22041f1dcccSKevin Lo 					ret = -1;
22141f1dcccSKevin Lo 					break;
22241f1dcccSKevin Lo 				}
22341f1dcccSKevin Lo 			} else {
22441f1dcccSKevin Lo 				ucslen = 2;
22541f1dcccSKevin Lo 			}
22641f1dcccSKevin Lo 
22741f1dcccSKevin Lo 			/* save UCS-4 into ucs[] */
22841f1dcccSKevin Lo 			for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
22941f1dcccSKevin Lo 				*q++ = (code >> (i << 3)) & 0xff;
23041f1dcccSKevin Lo 
23141f1dcccSKevin Lo 		} else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
23241f1dcccSKevin Lo 			/* convert local code to ENCODING_UNICODE */
23341f1dcccSKevin Lo 			ucslen = 4;
23441f1dcccSKevin Lo 			inlen = ir;
23541f1dcccSKevin Lo 			q = ucs;
23641f1dcccSKevin Lo 			ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
23741f1dcccSKevin Lo 			    &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
23841f1dcccSKevin Lo 			if (ret)
23941f1dcccSKevin Lo 				break;
24041f1dcccSKevin Lo 			inlen = ir - inlen;
24141f1dcccSKevin Lo 			ucslen = 4 - ucslen;
24241f1dcccSKevin Lo 
24341f1dcccSKevin Lo 		} else {
24441f1dcccSKevin Lo 			/* src code is a proper subset of ENCODING_UNICODE */
24541f1dcccSKevin Lo 			q = ucs;
24641f1dcccSKevin Lo 			if (dp->convtype & KICONV_UCS_FROM_LE) {
24741f1dcccSKevin Lo 				*q = *(p + 1);
24841f1dcccSKevin Lo 				*(q + 1) = *p;
24941f1dcccSKevin Lo 				p += 2;
25041f1dcccSKevin Lo 			} else {
25141f1dcccSKevin Lo 				*q = *p++;
25241f1dcccSKevin Lo 				*(q + 1) = *p++;
25341f1dcccSKevin Lo 			}
25441f1dcccSKevin Lo 			if ((*q & 0xfc) == 0xd8) {
25541f1dcccSKevin Lo 				if (dp->convtype & KICONV_UCS_UCS4 &&
25641f1dcccSKevin Lo 				    dp->convtype & KICONV_UCS_FROM_UTF16) {
25741f1dcccSKevin Lo 					inlen = ucslen = 4;
25841f1dcccSKevin Lo 				} else {
25941f1dcccSKevin Lo 					/* invalid unicode */
26041f1dcccSKevin Lo 					ret = -1;
26141f1dcccSKevin Lo 					break;
26241f1dcccSKevin Lo 				}
26341f1dcccSKevin Lo 			} else {
26441f1dcccSKevin Lo 				inlen = ucslen = 2;
26541f1dcccSKevin Lo 			}
26641f1dcccSKevin Lo 			if (ir < inlen) {
26741f1dcccSKevin Lo 				ret = -1;
26841f1dcccSKevin Lo 				break;
26941f1dcccSKevin Lo 			}
27041f1dcccSKevin Lo 			if (ucslen == 4) {
27141f1dcccSKevin Lo 				q += 2;
27241f1dcccSKevin Lo 				if (dp->convtype & KICONV_UCS_FROM_LE) {
27341f1dcccSKevin Lo 					*q = *(p + 1);
27441f1dcccSKevin Lo 					*(q + 1) = *p;
27541f1dcccSKevin Lo 				} else {
27641f1dcccSKevin Lo 					*q = *p++;
27741f1dcccSKevin Lo 					*(q + 1) = *p;
27841f1dcccSKevin Lo 				}
27941f1dcccSKevin Lo 				if ((*q & 0xfc) != 0xdc) {
28041f1dcccSKevin Lo 					/* invalid unicode */
28141f1dcccSKevin Lo 					ret = -1;
28241f1dcccSKevin Lo 					break;
28341f1dcccSKevin Lo 				}
28441f1dcccSKevin Lo 			}
28541f1dcccSKevin Lo 		}
28641f1dcccSKevin Lo 
28741f1dcccSKevin Lo 		/*
28841f1dcccSKevin Lo 		 * The second half of conversion.
28941f1dcccSKevin Lo 		 * (convert ENCODING_UNICODE into any code)
29041f1dcccSKevin Lo 		 */
29141f1dcccSKevin Lo 		p = ucs;
29241f1dcccSKevin Lo 		if (dp->convtype & KICONV_UCS_TO_UTF8) {
29341f1dcccSKevin Lo 			q = (u_char *)dst;
29441f1dcccSKevin Lo 			if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
29541f1dcccSKevin Lo 				/* decode surrogate pair */
29641f1dcccSKevin Lo 				code = decode_surrogate(p);
29741f1dcccSKevin Lo 			} else {
29841f1dcccSKevin Lo 				code = (ucs[0] << 8) | ucs[1];
29941f1dcccSKevin Lo 			}
30041f1dcccSKevin Lo 
30141f1dcccSKevin Lo 			if (casetype == KICONV_LOWER && dp->ctype) {
30241f1dcccSKevin Lo 				code = towlower(code, dp->ctype);
30341f1dcccSKevin Lo 			} else if (casetype == KICONV_UPPER && dp->ctype) {
30441f1dcccSKevin Lo 				code = towupper(code, dp->ctype);
30541f1dcccSKevin Lo 			}
30641f1dcccSKevin Lo 
30741f1dcccSKevin Lo 			outlen = 0;
30841f1dcccSKevin Lo 			if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
30941f1dcccSKevin Lo 				ret = -1;
31041f1dcccSKevin Lo 				break;
31141f1dcccSKevin Lo 			}
31241f1dcccSKevin Lo 
31341f1dcccSKevin Lo 			src += inlen;
31441f1dcccSKevin Lo 			ir -= inlen;
31541f1dcccSKevin Lo 			dst += outlen;
31641f1dcccSKevin Lo 			or -= outlen;
31741f1dcccSKevin Lo 
31841f1dcccSKevin Lo 		} else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
31941f1dcccSKevin Lo 			ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
32041f1dcccSKevin Lo 			    &or, casetype & (KICONV_LOWER | KICONV_UPPER));
32141f1dcccSKevin Lo 			if (ret)
32241f1dcccSKevin Lo 				break;
32341f1dcccSKevin Lo 
32441f1dcccSKevin Lo 			src += inlen;
32541f1dcccSKevin Lo 			ir -= inlen;
32641f1dcccSKevin Lo 
32741f1dcccSKevin Lo 		} else {
32841f1dcccSKevin Lo 			/* dst code is a proper subset of ENCODING_UNICODE */
32941f1dcccSKevin Lo 			if (or < ucslen) {
33041f1dcccSKevin Lo 				ret = -1;
33141f1dcccSKevin Lo 				break;
33241f1dcccSKevin Lo 			}
33341f1dcccSKevin Lo 			src += inlen;
33441f1dcccSKevin Lo 			ir -= inlen;
33541f1dcccSKevin Lo 			or -= ucslen;
33641f1dcccSKevin Lo 			if (dp->convtype & KICONV_UCS_TO_LE) {
33741f1dcccSKevin Lo 				*dst++ = *(p + 1);
33841f1dcccSKevin Lo 				*dst++ = *p;
33941f1dcccSKevin Lo 				p += 2;
34041f1dcccSKevin Lo 			} else {
34141f1dcccSKevin Lo 				*dst++ = *p++;
34241f1dcccSKevin Lo 				*dst++ = *p++;
34341f1dcccSKevin Lo 			}
34441f1dcccSKevin Lo 			if (ucslen == 4) {
34541f1dcccSKevin Lo 				if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
34641f1dcccSKevin Lo 				    (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
34741f1dcccSKevin Lo 					ret = -1;
34841f1dcccSKevin Lo 					break;
34941f1dcccSKevin Lo 				}
35041f1dcccSKevin Lo 				if (dp->convtype & KICONV_UCS_TO_LE) {
35141f1dcccSKevin Lo 					*dst++ = *(p + 1);
35241f1dcccSKevin Lo 					*dst++ = *p;
35341f1dcccSKevin Lo 				} else {
35441f1dcccSKevin Lo 					*dst++ = *p++;
35541f1dcccSKevin Lo 					*dst++ = *p;
35641f1dcccSKevin Lo 				}
35741f1dcccSKevin Lo 			}
35841f1dcccSKevin Lo 		}
35941f1dcccSKevin Lo 
36041f1dcccSKevin Lo 		if (convchar == 1)
36141f1dcccSKevin Lo 			break;
36241f1dcccSKevin Lo 	}
36341f1dcccSKevin Lo 
36441f1dcccSKevin Lo 	*inbuf += in - ir;
36541f1dcccSKevin Lo 	*outbuf += on - or;
36641f1dcccSKevin Lo 	*inbytesleft -= in - ir;
36741f1dcccSKevin Lo 	*outbytesleft -= on - or;
36841f1dcccSKevin Lo 	return (ret);
36941f1dcccSKevin Lo }
37041f1dcccSKevin Lo 
37141f1dcccSKevin Lo static int
iconv_ucs_init(struct iconv_converter_class * dcp)37241f1dcccSKevin Lo iconv_ucs_init(struct iconv_converter_class *dcp)
37341f1dcccSKevin Lo {
37441f1dcccSKevin Lo 	int error;
37541f1dcccSKevin Lo 
37641f1dcccSKevin Lo 	error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
37741f1dcccSKevin Lo 	if (error)
37841f1dcccSKevin Lo 		return (error);
37941f1dcccSKevin Lo 	error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
38041f1dcccSKevin Lo 	if (error)
38141f1dcccSKevin Lo 		return (error);
38241f1dcccSKevin Lo 	return (0);
38341f1dcccSKevin Lo }
38441f1dcccSKevin Lo 
/*
 * Class teardown hook.  Nothing is released here; always returns 0.
 */
static int
iconv_ucs_done(struct iconv_converter_class *dcp)
{
	return (0);
}
39041f1dcccSKevin Lo 
39141f1dcccSKevin Lo static const char *
iconv_ucs_name(struct iconv_converter_class * dcp)39241f1dcccSKevin Lo iconv_ucs_name(struct iconv_converter_class *dcp)
39341f1dcccSKevin Lo {
39441f1dcccSKevin Lo 	return (ENCODING_UNICODE);
39541f1dcccSKevin Lo }
39641f1dcccSKevin Lo 
39741f1dcccSKevin Lo static kobj_method_t iconv_ucs_methods[] = {
39841f1dcccSKevin Lo 	KOBJMETHOD(iconv_converter_open,	iconv_ucs_open),
39941f1dcccSKevin Lo 	KOBJMETHOD(iconv_converter_close,	iconv_ucs_close),
40041f1dcccSKevin Lo 	KOBJMETHOD(iconv_converter_conv,	iconv_ucs_conv),
40141f1dcccSKevin Lo 	KOBJMETHOD(iconv_converter_init,	iconv_ucs_init),
40241f1dcccSKevin Lo 	KOBJMETHOD(iconv_converter_done,	iconv_ucs_done),
40341f1dcccSKevin Lo 	KOBJMETHOD(iconv_converter_name,	iconv_ucs_name),
40441f1dcccSKevin Lo 	{0, 0}
40541f1dcccSKevin Lo };
40641f1dcccSKevin Lo 
40741f1dcccSKevin Lo KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
40841f1dcccSKevin Lo 
/*
 * Decode one UTF-8 sequence of up to four bytes.
 *
 * src       - first byte of the sequence.
 * utf8width - set to the number of bytes consumed on success; left
 *             untouched on failure.
 * srclen    - number of readable bytes at src.
 *
 * Returns the decoded code point, or 0 when the input is empty, truncated,
 * starts with an invalid lead byte, has a bad continuation byte, or is an
 * overlong encoding.  A NUL byte also decodes to 0, so callers cannot tell
 * it apart from an error.  Surrogates and values above U+10FFFF are not
 * rejected here.
 */
static uint32_t
utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
{
	/* smallest code point that legitimately needs a sequence of w bytes */
	static const uint32_t min_code[] = { 0, 0, 0x80, 0x800, 0x10000 };
	size_t i, w;
	uint32_t ucs4;
	unsigned char c;

	/* nothing to read: do not touch *src */
	if (srclen == 0)
		return (0);

	/*
	 * get leading 1 byte from utf-8
	 */
	c = (unsigned char)*src;
	if ((c & 0x80) == 0) {
		/*
		 *  utf-8: 0xxxxxxx
		 *  ucs-4: 00000000 00000000 00000000 0xxxxxxx
		 */
		w = 1;
		ucs4 = c & 0x7f;
	} else if ((c & 0xe0) == 0xc0) {
		/*
		 *  utf-8: 110xxxxx 10yyyyyy
		 *  ucs-4: 00000000 00000000 00000xxx xxyyyyyy
		 */
		w = 2;
		ucs4 = c & 0x1f;
	} else if ((c & 0xf0) == 0xe0) {
		/*
		 *  utf-8: 1110xxxx 10yyyyyy 10zzzzzz
		 *  ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
		 */
		w = 3;
		ucs4 = c & 0x0f;
	} else if ((c & 0xf8) == 0xf0) {
		/*
		 *  utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
		 *  ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
		 */
		w = 4;
		ucs4 = c & 0x07;
	} else {
		/* out of utf-16 range or having illegal bits */
		return (0);
	}

	if (srclen < w)
		return (0);

	/*
	 * get left parts from utf-8
	 */
	for (i = 1; i < w; i++) {
		c = (unsigned char)src[i];
		if ((c & 0xc0) != 0x80) {
			/* invalid: leading 2 bits are not "10" */
			return (0);
		}
		/* concatenate trailing 6 bits into ucs4 */
		ucs4 = (ucs4 << 6) | (c & 0x3f);
	}

	/* overlong: the value would have fit in a shorter sequence */
	if (ucs4 < min_code[w])
		return (0);

	*utf8width = w;
	return (ucs4);
}
47841f1dcccSKevin Lo 
/*
 * Encode one code point as UTF-8.
 *
 * ucs4      - code point to encode; values above U+10FFFF are rejected.
 * dst       - output buffer.
 * utf8width - set to the number of bytes written on success; left
 *             untouched on failure.
 * dstlen    - space available at dst.
 *
 * Returns dst (as u_char *) on success, or NULL when the value is out of
 * range or the buffer is too small; nothing is written in that case.
 * Surrogate values (0xd800-0xdfff) are not filtered here and are encoded
 * as three bytes.
 */
static u_char *
ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
{
	u_char lead, *p;
	size_t i, w;

	/*
	 * determine utf-8 width and leading bits
	 */
	if (ucs4 < 0x80) {
		w = 1;
		lead = 0;	/* "0" */
	} else if (ucs4 < 0x800) {
		w = 2;
		lead = 0xc0;	/* "11" */
	} else if (ucs4 < 0x10000) {
		w = 3;
		lead = 0xe0;	/* "111" */
	} else if (ucs4 < 0x110000) {
		w = 4;
		lead = 0xf0;	/* "1111" */
	} else {
		/* beyond the last Unicode code point */
		return (NULL);
	}

	if (dstlen < w)
		return (NULL);

	/*
	 * construct utf-8: fill continuation bytes from the end, each
	 * carrying 6 payload bits under a "10" prefix, then the lead byte
	 */
	p = (u_char *)dst;
	for (i = w - 1; i >= 1; i--) {
		p[i] = (ucs4 & 0x3f) | 0x80;
		ucs4 >>= 6;
	}
	p[0] = ucs4 | lead;

	*utf8width = w;

	return (p);
}
52241f1dcccSKevin Lo 
/*
 * Pack a supplementary-plane code point into a UTF-16 surrogate pair.
 * The high surrogate occupies the upper 16 bits of the result and the low
 * surrogate the lower 16 bits.  The caller is expected to pass a value of
 * at least 0x10000; smaller values wrap around in the subtraction.
 */
static uint32_t
encode_surrogate(uint32_t code)
{
	uint32_t offset = code - 0x10000;
	uint32_t high = 0xd800 | ((offset >> 10) & 0x3ff);
	uint32_t low = 0xdc00 | (offset & 0x3ff);

	return ((high << 16) | low);
}
52941f1dcccSKevin Lo 
/*
 * Recover a code point from a surrogate pair stored as four big-endian
 * bytes: ucs[0..1] hold the high surrogate and ucs[2..3] the low one.
 * Only the 10 payload bits of each half are used; the surrogate marker
 * bits are not verified here.
 */
static uint32_t
decode_surrogate(const u_char *ucs)
{
	uint32_t high = ((uint32_t)(ucs[0] & 0x3) << 8) | ucs[1];
	uint32_t low = ((uint32_t)(ucs[2] & 0x3) << 8) | ucs[3];

	return (((high << 10) | low) + 0x10000);
}
536