/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2003, 2005 Ryuichiro Imura
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
2841f1dcccSKevin Lo
2941f1dcccSKevin Lo #include <sys/param.h>
3041f1dcccSKevin Lo #include <sys/kernel.h>
3141f1dcccSKevin Lo #include <sys/systm.h>
3241f1dcccSKevin Lo #include <sys/malloc.h>
3341f1dcccSKevin Lo #include <sys/iconv.h>
3441f1dcccSKevin Lo
3541f1dcccSKevin Lo #include "iconv_converter_if.h"
3641f1dcccSKevin Lo
/*
 * "UCS" converter
 *
 * Bit flags stored in iconv_ucs.convtype.  iconv_ucs_open() computes them
 * from the requested character-set names; iconv_ucs_conv() consults them
 * for every character it converts.
 */
#define KICONV_UCS_COMBINE	(1 << 0)	/* a second cspair (cspf) was given */
#define KICONV_UCS_FROM_UTF8	(1 << 1)	/* input is UTF-8 */
#define KICONV_UCS_TO_UTF8	(1 << 2)	/* output is UTF-8 */
#define KICONV_UCS_FROM_LE	(1 << 3)	/* input 16-bit units are byte-swapped */
#define KICONV_UCS_TO_LE	(1 << 4)	/* output 16-bit units are byte-swapped */
#define KICONV_UCS_FROM_UTF16	(1 << 5)	/* surrogate pairs allowed on input */
#define KICONV_UCS_TO_UTF16	(1 << 6)	/* surrogate pairs allowed on output */
#define KICONV_UCS_UCS4		(1 << 7)	/* 4-byte (surrogate pair) forms enabled */

/* Character-set names this file compares against and registers. */
#define ENCODING_UTF16	"UTF-16BE"
#define ENCODING_UTF8	"UTF-8"
5241f1dcccSKevin Lo
5341f1dcccSKevin Lo static struct {
5441f1dcccSKevin Lo const char *name;
5541f1dcccSKevin Lo int from_flag, to_flag;
5641f1dcccSKevin Lo } unicode_family[] = {
5741f1dcccSKevin Lo { "UTF-8", KICONV_UCS_FROM_UTF8, KICONV_UCS_TO_UTF8 },
5841f1dcccSKevin Lo { "UCS-2LE", KICONV_UCS_FROM_LE, KICONV_UCS_TO_LE },
5941f1dcccSKevin Lo { "UTF-16BE", KICONV_UCS_FROM_UTF16, KICONV_UCS_TO_UTF16 },
6041f1dcccSKevin Lo { "UTF-16LE", KICONV_UCS_FROM_UTF16|KICONV_UCS_FROM_LE,
6141f1dcccSKevin Lo KICONV_UCS_TO_UTF16|KICONV_UCS_TO_LE },
6241f1dcccSKevin Lo { NULL, 0, 0 }
6341f1dcccSKevin Lo };
6441f1dcccSKevin Lo
/* UTF-8 <-> code point helpers, defined at the end of this file. */
static uint32_t	 utf8_to_ucs4(const char *src, size_t *utf8width,
		    size_t srclen);
static u_char	*ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width,
		    size_t dstlen);

/* UTF-16 surrogate pair helpers, defined at the end of this file. */
static uint32_t	 encode_surrogate(uint32_t code);
static uint32_t	 decode_surrogate(const u_char *ucs);

/* Only emitted when the build environment provides MODULE_DEPEND. */
#ifdef MODULE_DEPEND
MODULE_DEPEND(iconv_ucs, libiconv, 2, 2, 2);
#endif
7341f1dcccSKevin Lo
7441f1dcccSKevin Lo /*
7541f1dcccSKevin Lo * UCS converter instance
7641f1dcccSKevin Lo */
7741f1dcccSKevin Lo struct iconv_ucs {
7841f1dcccSKevin Lo KOBJ_FIELDS;
7941f1dcccSKevin Lo int convtype;
8041f1dcccSKevin Lo struct iconv_cspair * d_csp;
8141f1dcccSKevin Lo struct iconv_cspair * d_cspf;
8241f1dcccSKevin Lo void * f_ctp;
8341f1dcccSKevin Lo void * t_ctp;
8441f1dcccSKevin Lo void * ctype;
8541f1dcccSKevin Lo };
8641f1dcccSKevin Lo
8741f1dcccSKevin Lo static int
iconv_ucs_open(struct iconv_converter_class * dcp,struct iconv_cspair * csp,struct iconv_cspair * cspf,void ** dpp)8841f1dcccSKevin Lo iconv_ucs_open(struct iconv_converter_class *dcp,
8941f1dcccSKevin Lo struct iconv_cspair *csp, struct iconv_cspair *cspf, void **dpp)
9041f1dcccSKevin Lo {
9141f1dcccSKevin Lo struct iconv_ucs *dp;
9241f1dcccSKevin Lo int i;
9341f1dcccSKevin Lo const char *from, *to;
9441f1dcccSKevin Lo
9541f1dcccSKevin Lo dp = (struct iconv_ucs *)kobj_create((struct kobj_class*)dcp, M_ICONV, M_WAITOK);
9641f1dcccSKevin Lo to = csp->cp_to;
9741f1dcccSKevin Lo from = cspf ? cspf->cp_from : csp->cp_from;
9841f1dcccSKevin Lo
9941f1dcccSKevin Lo dp->convtype = 0;
10041f1dcccSKevin Lo
10141f1dcccSKevin Lo if (cspf)
10241f1dcccSKevin Lo dp->convtype |= KICONV_UCS_COMBINE;
10341f1dcccSKevin Lo for (i = 0; unicode_family[i].name; i++) {
1040cbce067SJohn Baldwin if (strcasecmp(from, unicode_family[i].name) == 0)
10541f1dcccSKevin Lo dp->convtype |= unicode_family[i].from_flag;
1060cbce067SJohn Baldwin if (strcasecmp(to, unicode_family[i].name) == 0)
10741f1dcccSKevin Lo dp->convtype |= unicode_family[i].to_flag;
10841f1dcccSKevin Lo }
109fa27760eSKevin Lo if (strcmp(ENCODING_UNICODE, ENCODING_UTF16) == 0)
11041f1dcccSKevin Lo dp->convtype |= KICONV_UCS_UCS4;
11141f1dcccSKevin Lo else
11241f1dcccSKevin Lo dp->convtype &= ~KICONV_UCS_UCS4;
11341f1dcccSKevin Lo
11441f1dcccSKevin Lo dp->f_ctp = dp->t_ctp = NULL;
11541f1dcccSKevin Lo if (dp->convtype & KICONV_UCS_COMBINE) {
11641f1dcccSKevin Lo if ((dp->convtype & KICONV_UCS_FROM_UTF8) == 0 &&
11741f1dcccSKevin Lo (dp->convtype & KICONV_UCS_FROM_LE) == 0) {
11841f1dcccSKevin Lo iconv_open(ENCODING_UNICODE, from, &dp->f_ctp);
11941f1dcccSKevin Lo }
12041f1dcccSKevin Lo if ((dp->convtype & KICONV_UCS_TO_UTF8) == 0 &&
12141f1dcccSKevin Lo (dp->convtype & KICONV_UCS_TO_LE) == 0) {
12241f1dcccSKevin Lo iconv_open(to, ENCODING_UNICODE, &dp->t_ctp);
12341f1dcccSKevin Lo }
12441f1dcccSKevin Lo }
12541f1dcccSKevin Lo
12641f1dcccSKevin Lo dp->ctype = NULL;
12741f1dcccSKevin Lo if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_TO_UTF8))
12841f1dcccSKevin Lo iconv_open(KICONV_WCTYPE_NAME, ENCODING_UTF8, &dp->ctype);
12941f1dcccSKevin Lo
13041f1dcccSKevin Lo dp->d_csp = csp;
13141f1dcccSKevin Lo if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE)) {
13241f1dcccSKevin Lo if (cspf) {
13341f1dcccSKevin Lo dp->d_cspf = cspf;
13441f1dcccSKevin Lo cspf->cp_refcount++;
13541f1dcccSKevin Lo } else
13641f1dcccSKevin Lo csp->cp_refcount++;
13741f1dcccSKevin Lo }
13841f1dcccSKevin Lo if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
13941f1dcccSKevin Lo csp->cp_refcount++;
14041f1dcccSKevin Lo *dpp = (void*)dp;
14141f1dcccSKevin Lo return 0;
14241f1dcccSKevin Lo }
14341f1dcccSKevin Lo
14441f1dcccSKevin Lo static int
iconv_ucs_close(void * data)14541f1dcccSKevin Lo iconv_ucs_close(void *data)
14641f1dcccSKevin Lo {
14741f1dcccSKevin Lo struct iconv_ucs *dp = data;
14841f1dcccSKevin Lo
14941f1dcccSKevin Lo if (dp->f_ctp)
15041f1dcccSKevin Lo iconv_close(dp->f_ctp);
15141f1dcccSKevin Lo if (dp->t_ctp)
15241f1dcccSKevin Lo iconv_close(dp->t_ctp);
15341f1dcccSKevin Lo if (dp->ctype)
15441f1dcccSKevin Lo iconv_close(dp->ctype);
15541f1dcccSKevin Lo if (dp->d_cspf)
15641f1dcccSKevin Lo dp->d_cspf->cp_refcount--;
15741f1dcccSKevin Lo else if (dp->convtype & (KICONV_UCS_FROM_UTF8 | KICONV_UCS_FROM_LE))
15841f1dcccSKevin Lo dp->d_csp->cp_refcount--;
15941f1dcccSKevin Lo if (dp->convtype & (KICONV_UCS_TO_UTF8 | KICONV_UCS_TO_LE))
16041f1dcccSKevin Lo dp->d_csp->cp_refcount--;
16141f1dcccSKevin Lo kobj_delete((struct kobj*)data, M_ICONV);
16241f1dcccSKevin Lo return 0;
16341f1dcccSKevin Lo }
16441f1dcccSKevin Lo
16541f1dcccSKevin Lo static int
iconv_ucs_conv(void * d2p,const char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int convchar,int casetype)16641f1dcccSKevin Lo iconv_ucs_conv(void *d2p, const char **inbuf,
16741f1dcccSKevin Lo size_t *inbytesleft, char **outbuf, size_t *outbytesleft,
16841f1dcccSKevin Lo int convchar, int casetype)
16941f1dcccSKevin Lo {
17041f1dcccSKevin Lo struct iconv_ucs *dp = (struct iconv_ucs*)d2p;
17141f1dcccSKevin Lo int ret = 0, i;
17241f1dcccSKevin Lo size_t in, on, ir, or, inlen, outlen, ucslen;
17341f1dcccSKevin Lo const char *src, *p;
17441f1dcccSKevin Lo char *dst;
17541f1dcccSKevin Lo u_char ucs[4], *q;
17641f1dcccSKevin Lo uint32_t code;
17741f1dcccSKevin Lo
17841f1dcccSKevin Lo if (inbuf == NULL || *inbuf == NULL || outbuf == NULL || *outbuf == NULL)
17941f1dcccSKevin Lo return 0;
18041f1dcccSKevin Lo ir = in = *inbytesleft;
18141f1dcccSKevin Lo or = on = *outbytesleft;
18241f1dcccSKevin Lo src = *inbuf;
18341f1dcccSKevin Lo dst = *outbuf;
18441f1dcccSKevin Lo
18541f1dcccSKevin Lo while (ir > 0 && or > 0) {
18641f1dcccSKevin Lo /*
18741f1dcccSKevin Lo * The first half of conversion.
18841f1dcccSKevin Lo * (convert any code into ENCODING_UNICODE)
18941f1dcccSKevin Lo */
19041f1dcccSKevin Lo code = 0;
19141f1dcccSKevin Lo p = src;
19241f1dcccSKevin Lo if (dp->convtype & KICONV_UCS_FROM_UTF8) {
19341f1dcccSKevin Lo /* convert UTF-8 to ENCODING_UNICODE */
19441f1dcccSKevin Lo inlen = 0;
19541f1dcccSKevin Lo code = utf8_to_ucs4(p, &inlen, ir);
19641f1dcccSKevin Lo if (code == 0) {
19741f1dcccSKevin Lo ret = -1;
19841f1dcccSKevin Lo break;
19941f1dcccSKevin Lo }
20041f1dcccSKevin Lo
20141f1dcccSKevin Lo if (casetype == KICONV_FROM_LOWER && dp->ctype) {
20241f1dcccSKevin Lo code = towlower(code, dp->ctype);
20341f1dcccSKevin Lo } else if (casetype == KICONV_FROM_UPPER && dp->ctype) {
20441f1dcccSKevin Lo code = towupper(code, dp->ctype);
20541f1dcccSKevin Lo }
20641f1dcccSKevin Lo
20741f1dcccSKevin Lo if ((code >= 0xd800 && code < 0xe000) || code >= 0x110000 ) {
20841f1dcccSKevin Lo /* reserved for utf-16 surrogate pair */
20941f1dcccSKevin Lo /* invalid unicode */
21041f1dcccSKevin Lo ret = -1;
21141f1dcccSKevin Lo break;
21241f1dcccSKevin Lo }
21341f1dcccSKevin Lo
21441f1dcccSKevin Lo if (inlen == 4) {
21541f1dcccSKevin Lo if (dp->convtype & KICONV_UCS_UCS4) {
21641f1dcccSKevin Lo ucslen = 4;
21741f1dcccSKevin Lo code = encode_surrogate(code);
21841f1dcccSKevin Lo } else {
21941f1dcccSKevin Lo /* can't handle with ucs-2 */
22041f1dcccSKevin Lo ret = -1;
22141f1dcccSKevin Lo break;
22241f1dcccSKevin Lo }
22341f1dcccSKevin Lo } else {
22441f1dcccSKevin Lo ucslen = 2;
22541f1dcccSKevin Lo }
22641f1dcccSKevin Lo
22741f1dcccSKevin Lo /* save UCS-4 into ucs[] */
22841f1dcccSKevin Lo for (q = ucs, i = ucslen - 1 ; i >= 0 ; i--)
22941f1dcccSKevin Lo *q++ = (code >> (i << 3)) & 0xff;
23041f1dcccSKevin Lo
23141f1dcccSKevin Lo } else if (dp->convtype & KICONV_UCS_COMBINE && dp->f_ctp) {
23241f1dcccSKevin Lo /* convert local code to ENCODING_UNICODE */
23341f1dcccSKevin Lo ucslen = 4;
23441f1dcccSKevin Lo inlen = ir;
23541f1dcccSKevin Lo q = ucs;
23641f1dcccSKevin Lo ret = iconv_convchr_case(dp->f_ctp, &p, &inlen, (char **)&q,
23741f1dcccSKevin Lo &ucslen, casetype & (KICONV_FROM_LOWER | KICONV_FROM_UPPER));
23841f1dcccSKevin Lo if (ret)
23941f1dcccSKevin Lo break;
24041f1dcccSKevin Lo inlen = ir - inlen;
24141f1dcccSKevin Lo ucslen = 4 - ucslen;
24241f1dcccSKevin Lo
24341f1dcccSKevin Lo } else {
24441f1dcccSKevin Lo /* src code is a proper subset of ENCODING_UNICODE */
24541f1dcccSKevin Lo q = ucs;
24641f1dcccSKevin Lo if (dp->convtype & KICONV_UCS_FROM_LE) {
24741f1dcccSKevin Lo *q = *(p + 1);
24841f1dcccSKevin Lo *(q + 1) = *p;
24941f1dcccSKevin Lo p += 2;
25041f1dcccSKevin Lo } else {
25141f1dcccSKevin Lo *q = *p++;
25241f1dcccSKevin Lo *(q + 1) = *p++;
25341f1dcccSKevin Lo }
25441f1dcccSKevin Lo if ((*q & 0xfc) == 0xd8) {
25541f1dcccSKevin Lo if (dp->convtype & KICONV_UCS_UCS4 &&
25641f1dcccSKevin Lo dp->convtype & KICONV_UCS_FROM_UTF16) {
25741f1dcccSKevin Lo inlen = ucslen = 4;
25841f1dcccSKevin Lo } else {
25941f1dcccSKevin Lo /* invalid unicode */
26041f1dcccSKevin Lo ret = -1;
26141f1dcccSKevin Lo break;
26241f1dcccSKevin Lo }
26341f1dcccSKevin Lo } else {
26441f1dcccSKevin Lo inlen = ucslen = 2;
26541f1dcccSKevin Lo }
26641f1dcccSKevin Lo if (ir < inlen) {
26741f1dcccSKevin Lo ret = -1;
26841f1dcccSKevin Lo break;
26941f1dcccSKevin Lo }
27041f1dcccSKevin Lo if (ucslen == 4) {
27141f1dcccSKevin Lo q += 2;
27241f1dcccSKevin Lo if (dp->convtype & KICONV_UCS_FROM_LE) {
27341f1dcccSKevin Lo *q = *(p + 1);
27441f1dcccSKevin Lo *(q + 1) = *p;
27541f1dcccSKevin Lo } else {
27641f1dcccSKevin Lo *q = *p++;
27741f1dcccSKevin Lo *(q + 1) = *p;
27841f1dcccSKevin Lo }
27941f1dcccSKevin Lo if ((*q & 0xfc) != 0xdc) {
28041f1dcccSKevin Lo /* invalid unicode */
28141f1dcccSKevin Lo ret = -1;
28241f1dcccSKevin Lo break;
28341f1dcccSKevin Lo }
28441f1dcccSKevin Lo }
28541f1dcccSKevin Lo }
28641f1dcccSKevin Lo
28741f1dcccSKevin Lo /*
28841f1dcccSKevin Lo * The second half of conversion.
28941f1dcccSKevin Lo * (convert ENCODING_UNICODE into any code)
29041f1dcccSKevin Lo */
29141f1dcccSKevin Lo p = ucs;
29241f1dcccSKevin Lo if (dp->convtype & KICONV_UCS_TO_UTF8) {
29341f1dcccSKevin Lo q = (u_char *)dst;
29441f1dcccSKevin Lo if (ucslen == 4 && dp->convtype & KICONV_UCS_UCS4) {
29541f1dcccSKevin Lo /* decode surrogate pair */
29641f1dcccSKevin Lo code = decode_surrogate(p);
29741f1dcccSKevin Lo } else {
29841f1dcccSKevin Lo code = (ucs[0] << 8) | ucs[1];
29941f1dcccSKevin Lo }
30041f1dcccSKevin Lo
30141f1dcccSKevin Lo if (casetype == KICONV_LOWER && dp->ctype) {
30241f1dcccSKevin Lo code = towlower(code, dp->ctype);
30341f1dcccSKevin Lo } else if (casetype == KICONV_UPPER && dp->ctype) {
30441f1dcccSKevin Lo code = towupper(code, dp->ctype);
30541f1dcccSKevin Lo }
30641f1dcccSKevin Lo
30741f1dcccSKevin Lo outlen = 0;
30841f1dcccSKevin Lo if (ucs4_to_utf8(code, q, &outlen, or) == NULL) {
30941f1dcccSKevin Lo ret = -1;
31041f1dcccSKevin Lo break;
31141f1dcccSKevin Lo }
31241f1dcccSKevin Lo
31341f1dcccSKevin Lo src += inlen;
31441f1dcccSKevin Lo ir -= inlen;
31541f1dcccSKevin Lo dst += outlen;
31641f1dcccSKevin Lo or -= outlen;
31741f1dcccSKevin Lo
31841f1dcccSKevin Lo } else if (dp->convtype & KICONV_UCS_COMBINE && dp->t_ctp) {
31941f1dcccSKevin Lo ret = iconv_convchr_case(dp->t_ctp, &p, &ucslen, &dst,
32041f1dcccSKevin Lo &or, casetype & (KICONV_LOWER | KICONV_UPPER));
32141f1dcccSKevin Lo if (ret)
32241f1dcccSKevin Lo break;
32341f1dcccSKevin Lo
32441f1dcccSKevin Lo src += inlen;
32541f1dcccSKevin Lo ir -= inlen;
32641f1dcccSKevin Lo
32741f1dcccSKevin Lo } else {
32841f1dcccSKevin Lo /* dst code is a proper subset of ENCODING_UNICODE */
32941f1dcccSKevin Lo if (or < ucslen) {
33041f1dcccSKevin Lo ret = -1;
33141f1dcccSKevin Lo break;
33241f1dcccSKevin Lo }
33341f1dcccSKevin Lo src += inlen;
33441f1dcccSKevin Lo ir -= inlen;
33541f1dcccSKevin Lo or -= ucslen;
33641f1dcccSKevin Lo if (dp->convtype & KICONV_UCS_TO_LE) {
33741f1dcccSKevin Lo *dst++ = *(p + 1);
33841f1dcccSKevin Lo *dst++ = *p;
33941f1dcccSKevin Lo p += 2;
34041f1dcccSKevin Lo } else {
34141f1dcccSKevin Lo *dst++ = *p++;
34241f1dcccSKevin Lo *dst++ = *p++;
34341f1dcccSKevin Lo }
34441f1dcccSKevin Lo if (ucslen == 4) {
34541f1dcccSKevin Lo if ((dp->convtype & KICONV_UCS_UCS4) == 0 ||
34641f1dcccSKevin Lo (dp->convtype & KICONV_UCS_TO_UTF16) == 0) {
34741f1dcccSKevin Lo ret = -1;
34841f1dcccSKevin Lo break;
34941f1dcccSKevin Lo }
35041f1dcccSKevin Lo if (dp->convtype & KICONV_UCS_TO_LE) {
35141f1dcccSKevin Lo *dst++ = *(p + 1);
35241f1dcccSKevin Lo *dst++ = *p;
35341f1dcccSKevin Lo } else {
35441f1dcccSKevin Lo *dst++ = *p++;
35541f1dcccSKevin Lo *dst++ = *p;
35641f1dcccSKevin Lo }
35741f1dcccSKevin Lo }
35841f1dcccSKevin Lo }
35941f1dcccSKevin Lo
36041f1dcccSKevin Lo if (convchar == 1)
36141f1dcccSKevin Lo break;
36241f1dcccSKevin Lo }
36341f1dcccSKevin Lo
36441f1dcccSKevin Lo *inbuf += in - ir;
36541f1dcccSKevin Lo *outbuf += on - or;
36641f1dcccSKevin Lo *inbytesleft -= in - ir;
36741f1dcccSKevin Lo *outbytesleft -= on - or;
36841f1dcccSKevin Lo return (ret);
36941f1dcccSKevin Lo }
37041f1dcccSKevin Lo
37141f1dcccSKevin Lo static int
iconv_ucs_init(struct iconv_converter_class * dcp)37241f1dcccSKevin Lo iconv_ucs_init(struct iconv_converter_class *dcp)
37341f1dcccSKevin Lo {
37441f1dcccSKevin Lo int error;
37541f1dcccSKevin Lo
37641f1dcccSKevin Lo error = iconv_add(ENCODING_UNICODE, ENCODING_UNICODE, ENCODING_UTF8);
37741f1dcccSKevin Lo if (error)
37841f1dcccSKevin Lo return (error);
37941f1dcccSKevin Lo error = iconv_add(ENCODING_UNICODE, ENCODING_UTF8, ENCODING_UNICODE);
38041f1dcccSKevin Lo if (error)
38141f1dcccSKevin Lo return (error);
38241f1dcccSKevin Lo return (0);
38341f1dcccSKevin Lo }
38441f1dcccSKevin Lo
/*
 * Class teardown hook.  Does nothing and reports success.
 */
static int
iconv_ucs_done(struct iconv_converter_class *dcp)
{
	(void)dcp;	/* unused */

	return (0);
}
39041f1dcccSKevin Lo
39141f1dcccSKevin Lo static const char *
iconv_ucs_name(struct iconv_converter_class * dcp)39241f1dcccSKevin Lo iconv_ucs_name(struct iconv_converter_class *dcp)
39341f1dcccSKevin Lo {
39441f1dcccSKevin Lo return (ENCODING_UNICODE);
39541f1dcccSKevin Lo }
39641f1dcccSKevin Lo
39741f1dcccSKevin Lo static kobj_method_t iconv_ucs_methods[] = {
39841f1dcccSKevin Lo KOBJMETHOD(iconv_converter_open, iconv_ucs_open),
39941f1dcccSKevin Lo KOBJMETHOD(iconv_converter_close, iconv_ucs_close),
40041f1dcccSKevin Lo KOBJMETHOD(iconv_converter_conv, iconv_ucs_conv),
40141f1dcccSKevin Lo KOBJMETHOD(iconv_converter_init, iconv_ucs_init),
40241f1dcccSKevin Lo KOBJMETHOD(iconv_converter_done, iconv_ucs_done),
40341f1dcccSKevin Lo KOBJMETHOD(iconv_converter_name, iconv_ucs_name),
40441f1dcccSKevin Lo {0, 0}
40541f1dcccSKevin Lo };
40641f1dcccSKevin Lo
40741f1dcccSKevin Lo KICONV_CONVERTER(ucs, sizeof(struct iconv_ucs));
40841f1dcccSKevin Lo
/*
 * Decode one UTF-8 sequence of 1 to 4 bytes into a code point.
 *
 * src        start of the sequence
 * utf8width  receives the number of bytes decoded; written on success only
 * srclen     number of bytes available at src
 *
 * Returns the decoded code point.  Returns 0 when the sequence is invalid:
 * an illegal lead byte, fewer than the required bytes available, a
 * continuation byte that is not of the form 10xxxxxx, or an overlong
 * (non-shortest-form) encoding.  A NUL byte also decodes to 0, with
 * *utf8width set to 1, so callers cannot tell it apart from an error by
 * the return value alone.
 */
static uint32_t
utf8_to_ucs4(const char *src, size_t *utf8width, size_t srclen)
{
	const unsigned char *s = (const unsigned char *)src;
	size_t i, w;
	uint32_t ucs4, min;

	if (srclen == 0)
		return (0);

	/*
	 * The lead byte determines the width, contributes the top payload
	 * bits, and fixes the smallest code point that may legally use a
	 * sequence of that width.
	 */
	if ((s[0] & 0x80) == 0) {
		/*
		 * utf-8: 0xxxxxxx
		 * ucs-4: 00000000 00000000 00000000 0xxxxxxx
		 */
		w = 1;
		ucs4 = s[0] & 0x7f;
		min = 0;
	} else if ((s[0] & 0xe0) == 0xc0) {
		/*
		 * utf-8: 110xxxxx 10yyyyyy
		 * ucs-4: 00000000 00000000 00000xxx xxyyyyyy
		 */
		w = 2;
		ucs4 = s[0] & 0x1f;
		min = 0x80;
	} else if ((s[0] & 0xf0) == 0xe0) {
		/*
		 * utf-8: 1110xxxx 10yyyyyy 10zzzzzz
		 * ucs-4: 00000000 00000000 xxxxyyyy yyzzzzzz
		 */
		w = 3;
		ucs4 = s[0] & 0x0f;
		min = 0x800;
	} else if ((s[0] & 0xf8) == 0xf0) {
		/*
		 * utf-8: 11110www 10xxxxxx 10yyyyyy 10zzzzzz
		 * ucs-4: 00000000 000wwwxx xxxxyyyy yyzzzzzz
		 */
		w = 4;
		ucs4 = s[0] & 0x07;
		min = 0x10000;
	} else {
		/* out of utf-16 range or having illegal bits */
		return (0);
	}

	if (srclen < w)
		return (0);

	/* fold in the 6 payload bits of every continuation byte */
	for (i = 1; i < w; i++) {
		if ((s[i] & 0xc0) != 0x80) {
			/* invalid: leading 2 bits are not "10" */
			return (0);
		}
		ucs4 = (ucs4 << 6) | (s[i] & 0x3f);
	}

	/*
	 * Overlong forms (e.g. C1 81 for 'A', E0 80 AF for '/') would let a
	 * character slip past byte-level checks done on the UTF-8 text.
	 */
	if (ucs4 < min)
		return (0);

	*utf8width = w;
	return (ucs4);
}
47841f1dcccSKevin Lo
/*
 * Encode one code point as UTF-8.
 *
 * ucs4       value to encode; must be below 0x200000
 * dst        output buffer
 * utf8width  receives the number of bytes written; untouched on failure
 * dstlen     space available at dst
 *
 * Returns dst (as u_char *) on success, or NULL when the value is too
 * large for a 4-byte sequence or dst is too small for the encoding.
 */
static u_char *
ucs4_to_utf8(uint32_t ucs4, char *dst, size_t *utf8width, size_t dstlen)
{
	u_char *out = (u_char *)dst;
	u_char prefix;
	size_t width, pos;

	/* anything wider than 21 bits does not fit in four bytes */
	if (ucs4 >= 0x200000)
		return (NULL);

	/* pick the sequence length and the marker bits of the lead byte */
	if (ucs4 < 0x80) {
		width = 1;
		prefix = 0x00;		/* 0xxxxxxx */
	} else if (ucs4 < 0x800) {
		width = 2;
		prefix = 0xc0;		/* 110xxxxx */
	} else if (ucs4 < 0x10000) {
		width = 3;
		prefix = 0xe0;		/* 1110xxxx */
	} else {
		width = 4;
		prefix = 0xf0;		/* 11110xxx */
	}

	if (dstlen < width)
		return (NULL);

	/* fill continuation bytes (10xxxxxx) from the last one backwards */
	for (pos = width; pos > 1; pos--) {
		out[pos - 1] = (ucs4 & 0x3f) | 0x80;
		ucs4 >>= 6;
	}
	/* whatever bits remain go into the lead byte */
	out[0] = ucs4 | prefix;

	*utf8width = width;
	return (out);
}
52241f1dcccSKevin Lo
/*
 * Pack a code point into a UTF-16 surrogate pair, returned as one 32-bit
 * value with the high surrogate in the upper 16 bits and the low surrogate
 * in the lower 16 bits.  The arithmetic is only meaningful for code points
 * of 0x10000 and above; smaller values wrap around in the subtraction.
 */
static uint32_t
encode_surrogate(uint32_t code)
{
	uint32_t offset = code - 0x10000;
	/* upper 10 bits of the offset, moved into bits 16..25 */
	uint32_t high = (offset << 6) & 0x3ff0000;
	/* lower 10 bits of the offset stay in bits 0..9 */
	uint32_t low = offset & 0x3ff;

	return (0xd800dc00 | high | low);
}
52941f1dcccSKevin Lo
/*
 * Rebuild a code point from a surrogate pair stored as four bytes:
 * ucs[0..1] hold the high surrogate and ucs[2..3] the low surrogate, each
 * most significant byte first.  Only the payload bits are used; the
 * surrogate marker bits are masked off rather than validated.
 */
static uint32_t
decode_surrogate(const u_char *ucs)
{
	/* 10 payload bits of the high surrogate -> bits 10..19 */
	uint32_t upper = ((uint32_t)(ucs[0] & 0x3) << 18) |
	    ((uint32_t)ucs[1] << 10);
	/* 10 payload bits of the low surrogate -> bits 0..9 */
	uint32_t lower = ((uint32_t)(ucs[2] & 0x3) << 8) | (uint32_t)ucs[3];

	return ((upper | lower) + 0x10000);
}
536