1*6008Syy154373 /*
2*6008Syy154373 * CDDL HEADER START
3*6008Syy154373 *
4*6008Syy154373 * The contents of this file are subject to the terms of the
5*6008Syy154373 * Common Development and Distribution License (the "License").
6*6008Syy154373 * You may not use this file except in compliance with the License.
7*6008Syy154373 *
8*6008Syy154373 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*6008Syy154373 * or http://www.opensolaris.org/os/licensing.
10*6008Syy154373 * See the License for the specific language governing permissions
11*6008Syy154373 * and limitations under the License.
12*6008Syy154373 *
13*6008Syy154373 * When distributing Covered Code, include this CDDL HEADER in each
14*6008Syy154373 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*6008Syy154373 * If applicable, add the following below this CDDL HEADER, with the
16*6008Syy154373 * fields enclosed by brackets "[]" replaced with your own identifying
17*6008Syy154373 * information: Portions Copyright [yyyy] [name of copyright owner]
18*6008Syy154373 *
19*6008Syy154373 * CDDL HEADER END
20*6008Syy154373 */
21*6008Syy154373 /*
22*6008Syy154373 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23*6008Syy154373 * Use is subject to license terms.
24*6008Syy154373 */
25*6008Syy154373
26*6008Syy154373 #pragma ident "%Z%%M% %I% %E% SMI"
27*6008Syy154373
28*6008Syy154373 #include <sys/types.h>
29*6008Syy154373 #include <sys/param.h>
30*6008Syy154373 #include <sys/sysmacros.h>
31*6008Syy154373 #include <sys/systm.h>
32*6008Syy154373 #include <sys/debug.h>
33*6008Syy154373 #include <sys/kmem.h>
34*6008Syy154373 #include <sys/sunddi.h>
35*6008Syy154373 #include <sys/byteorder.h>
36*6008Syy154373 #include <sys/errno.h>
37*6008Syy154373 #include <sys/u8_textprep.h>
38*6008Syy154373 #include <sys/kiconv.h>
39*6008Syy154373 #include <sys/kiconv_cck_common.h>
40*6008Syy154373
41*6008Syy154373 /*LINTLIBRARY*/
42*6008Syy154373
43*6008Syy154373 /*
44*6008Syy154373 * Common kiconv_open method for UTF-8 -> CCK conversion.
45*6008Syy154373 */
46*6008Syy154373 void *
kiconv_open_to_cck()47*6008Syy154373 kiconv_open_to_cck()
48*6008Syy154373 {
49*6008Syy154373 kiconv_state_t st;
50*6008Syy154373
51*6008Syy154373 st = (kiconv_state_t)kmem_alloc(sizeof (kiconv_state_data_t), KM_SLEEP);
52*6008Syy154373
53*6008Syy154373 st->bom_processed = 0;
54*6008Syy154373
55*6008Syy154373 return ((void *)st);
56*6008Syy154373 }
57*6008Syy154373
58*6008Syy154373 /*
59*6008Syy154373 * Common kiconv_close method for UTF-8 -> CCK conversion.
60*6008Syy154373 */
61*6008Syy154373 int
kiconv_close_to_cck(void * kcd)62*6008Syy154373 kiconv_close_to_cck(void *kcd)
63*6008Syy154373 {
64*6008Syy154373 if (! kcd || kcd == (void *)-1)
65*6008Syy154373 return (EBADF);
66*6008Syy154373
67*6008Syy154373 kmem_free(kcd, sizeof (kiconv_state_data_t));
68*6008Syy154373
69*6008Syy154373 return (0);
70*6008Syy154373 }
71*6008Syy154373
72*6008Syy154373 /*
73*6008Syy154373 * Common routine to convert UTF-8 sequence to CCK legal character sequence.
 *
 * kcd is the conversion descriptor; NULL or (void *)-1 is rejected with
 * *errno = EBADF and a return value of (size_t)-1.
 *
 * A NULL inbuf (or NULL *inbuf) is a state reset request: bom_processed
 * is cleared and 0 is returned without touching any other argument.
 *
 * Otherwise the bytes in [*inbuf, *inbuf + *inbytesleft) are validated one
 * UTF-8 character at a time.  Single-byte characters are copied as is;
 * multi-byte characters are packed into a 32-bit value and handed to
 * ptr_utf8tocck, which writes the converted bytes at the output position.
 * On leaving the conversion loop, *inbuf, *inbytesleft, *outbuf and
 * *outbytesleft are updated to the position reached.
 *
 * Returns ret_val, which starts at 0, is passed by address to
 * ptr_utf8tocck (presumably to count non-identical conversions -- confirm
 * against the callback implementations), and is (size_t)-1 when an error
 * was stored in *errno.
 *
 * NOTE(review): KICONV_CHECK_UTF8_BOM, KICONV_SET_ERRNO_AND_BREAK and
 * KICONV_IS_INVALID_UTF8_SECOND_BYTE are macros defined outside this file.
 * They appear to rely on the local names used here (kcd, errno, ret_val)
 * and on being expanded directly inside the while loop -- verify before
 * renaming locals or restructuring the loop.
74*6008Syy154373 */
75*6008Syy154373 size_t
kiconv_utf8_to_cck(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno,kiconv_utf8tocck_t ptr_utf8tocck)76*6008Syy154373 kiconv_utf8_to_cck(void *kcd, char **inbuf, size_t *inbytesleft,
77*6008Syy154373 char **outbuf, size_t *outbytesleft, int *errno,
78*6008Syy154373 kiconv_utf8tocck_t ptr_utf8tocck)
79*6008Syy154373 {
80*6008Syy154373 uchar_t *ib;
81*6008Syy154373 uchar_t *ob;
82*6008Syy154373 uchar_t *ibtail;
83*6008Syy154373 uchar_t *obtail;
84*6008Syy154373 uchar_t *oldib;
85*6008Syy154373 size_t ret_val;
86*6008Syy154373 size_t i; /* temp variable in for loop */
87*6008Syy154373 uint32_t u8;
88*6008Syy154373 int8_t sz;
89*6008Syy154373
90*6008Syy154373 /* Check on the kiconv code conversion descriptor. */
91*6008Syy154373 if (! kcd || kcd == (void *)-1) {
92*6008Syy154373 *errno = EBADF;
93*6008Syy154373 return ((size_t)-1);
94*6008Syy154373 }
95*6008Syy154373
96*6008Syy154373 /* If this is a state reset request, process and return. */
97*6008Syy154373 if (! inbuf || !(*inbuf)) {
98*6008Syy154373 ((kiconv_state_t)kcd)->bom_processed = 0;
99*6008Syy154373 return (0);
100*6008Syy154373 }
101*6008Syy154373
/* ibtail and obtail point one past the end of the input and output. */
102*6008Syy154373 ret_val = 0;
103*6008Syy154373 ib = (uchar_t *)*inbuf;
104*6008Syy154373 ob = (uchar_t *)*outbuf;
105*6008Syy154373 ibtail = ib + *inbytesleft;
106*6008Syy154373 obtail = ob + *outbytesleft;
107*6008Syy154373
/* NOTE(review): presumably skips a leading UTF-8 BOM once per descriptor. */
108*6008Syy154373 KICONV_CHECK_UTF8_BOM(ib, ibtail);
109*6008Syy154373
110*6008Syy154373 while (ib < ibtail) {
/* Expected byte length of the character, looked up by its first byte. */
111*6008Syy154373 sz = u8_number_of_bytes[*ib];
112*6008Syy154373
113*6008Syy154373 /*
114*6008Syy154373 * If it is a 7-bit ASCII character, we don't need to
115*6008Syy154373 * process further and we just copy the character over.
116*6008Syy154373 *
117*6008Syy154373 * If not, we connect the character bytes up to four bytes,
118*6008Syy154373 * validate the bytes, and binary search for the corresponding
119*6008Syy154373 * table. If we find it from the mapping table, we put that
120*6008Syy154373 * into the output buffer; otherwise, we put a replacement
121*6008Syy154373 * character instead as a non-identical conversion.
122*6008Syy154373 */
123*6008Syy154373 if (sz == 1) {
124*6008Syy154373 if (ob >= obtail) {
125*6008Syy154373 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
126*6008Syy154373 }
127*6008Syy154373
128*6008Syy154373 *ob++ = *ib++;
129*6008Syy154373 continue;
130*6008Syy154373 }
131*6008Syy154373
132*6008Syy154373 /*
133*6008Syy154373 * Issue EILSEQ error if the first byte is an
134*6008Syy154373 * invalid UTF-8 character leading byte.
135*6008Syy154373 */
136*6008Syy154373 if (sz <= 0) {
137*6008Syy154373 KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
138*6008Syy154373 }
139*6008Syy154373
140*6008Syy154373 /*
141*6008Syy154373 * Issue EINVAL error if input buffer has an incomplete
142*6008Syy154373 * character at the end of the buffer.
143*6008Syy154373 */
144*6008Syy154373 if (ibtail - ib < sz) {
145*6008Syy154373 KICONV_SET_ERRNO_AND_BREAK(EINVAL);
146*6008Syy154373 }
147*6008Syy154373
148*6008Syy154373 /*
149*6008Syy154373 * We collect UTF-8 character bytes and also check if this
150*6008Syy154373 * is a valid UTF-8 character without any bogus bytes based
151*6008Syy154373 * on the latest UTF-8 binary representation.
152*6008Syy154373 */
/* oldib remembers the character start so error paths can rewind ib. */
153*6008Syy154373 oldib = ib;
154*6008Syy154373 u8 = *ib++;
155*6008Syy154373
/* The second byte is checked against the first byte held in u8. */
156*6008Syy154373 if (KICONV_IS_INVALID_UTF8_SECOND_BYTE(*ib, u8))
157*6008Syy154373 goto ILLEGAL_CHAR_PROCESS;
/* Each further byte is shifted in, so u8 holds the raw bytes in order. */
158*6008Syy154373 u8 = (u8 << 8) | *ib++;
159*6008Syy154373
160*6008Syy154373 for (i = 2; i < sz; i++) {
161*6008Syy154373 if (*ib < 0x80 || *ib > 0xbf) {
/*
 * The label sits inside the loop body so that the second-byte check
 * above shares this error path: report EILSEQ, rewind ib to the start
 * of the offending character, and skip to the common exit code.
 */
162*6008Syy154373 ILLEGAL_CHAR_PROCESS:
163*6008Syy154373 *errno = EILSEQ;
164*6008Syy154373 ret_val = (size_t)-1;
165*6008Syy154373 ib = oldib;
166*6008Syy154373 goto ILLEGAL_CHAR_ERR;
167*6008Syy154373 }
168*6008Syy154373
169*6008Syy154373 u8 = (u8 << 8) | *ib++;
170*6008Syy154373 }
171*6008Syy154373
172*6008Syy154373 /* Now we have a valid UTF-8 character. */
/*
 * The callback's return value is added to ob below; a negative value
 * is reported as E2BIG with ib rewound so the character is not
 * counted as consumed.
 */
173*6008Syy154373 sz = ptr_utf8tocck(u8, &ib, ibtail, ob, obtail, &ret_val);
174*6008Syy154373 if (sz < 0) {
175*6008Syy154373 ib = oldib;
176*6008Syy154373 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
177*6008Syy154373 }
178*6008Syy154373
179*6008Syy154373 ob += sz;
180*6008Syy154373 }
181*6008Syy154373
/* Common exit: publish how far the input and output positions advanced. */
182*6008Syy154373 ILLEGAL_CHAR_ERR:
183*6008Syy154373 *inbuf = (char *)ib;
184*6008Syy154373 *inbytesleft = ibtail - ib;
185*6008Syy154373 *outbuf = (char *)ob;
186*6008Syy154373 *outbytesleft = obtail - ob;
187*6008Syy154373
188*6008Syy154373 return (ret_val);
189*6008Syy154373 }
190*6008Syy154373
/*
 * Common routine to convert a UTF-8 string of *inlen bytes at ib into the
 * output area of *outlen bytes at ob, using ptr_utf8tocck for every
 * multi-byte character.
 *
 * flag is tested for two bits:
 *   KICONV_IGNORE_NULL     - when clear, conversion stops at the first
 *                            NUL byte in the input.
 *   KICONV_REPLACE_INVALID - when set, an invalid or truncated sequence
 *                            is replaced by one
 *                            KICONV_ASCII_REPLACEMENT_CHAR and counted in
 *                            the return value instead of ending the
 *                            conversion with an error.
 *
 * ib and ob are passed by value, so only the lengths are reported back:
 * on return *inlen and *outlen hold the number of unconsumed input bytes
 * and unused output bytes.
 *
 * Returns ret_val, which starts at 0, is incremented for every replacement
 * character written here, is passed by address to ptr_utf8tocck, and is
 * (size_t)-1 when an error was stored in *errno.
 *
 * NOTE(review): KICONV_CHECK_UTF8_BOM_WITHOUT_STATE,
 * KICONV_SET_ERRNO_AND_BREAK, KICONV_SET_ERRNO_WITH_FLAG and
 * KICONV_IS_INVALID_UTF8_SECOND_BYTE are macros defined outside this file.
 * They appear to rely on the local names used here (flag, ib, errno,
 * ret_val) and on the REPLACE_INVALID label -- verify before renaming
 * locals or restructuring the loop.
 */
191*6008Syy154373 size_t
kiconvstr_utf8_to_cck(uchar_t * ib,size_t * inlen,uchar_t * ob,size_t * outlen,int flag,int * errno,kiconv_utf8tocck_t ptr_utf8tocck)192*6008Syy154373 kiconvstr_utf8_to_cck(uchar_t *ib, size_t *inlen, uchar_t *ob, size_t *outlen,
193*6008Syy154373 int flag, int *errno, kiconv_utf8tocck_t ptr_utf8tocck)
194*6008Syy154373 {
195*6008Syy154373 uchar_t *ibtail;
196*6008Syy154373 uchar_t *obtail;
197*6008Syy154373 uchar_t *oldib;
198*6008Syy154373 size_t ret_val;
199*6008Syy154373 size_t i; /* temp variable in for loop */
200*6008Syy154373 uint32_t u8;
201*6008Syy154373 int8_t sz;
202*6008Syy154373 boolean_t do_not_ignore_null;
203*6008Syy154373
/* ibtail and obtail point one past the end of the input and output. */
204*6008Syy154373 ret_val = 0;
205*6008Syy154373 ibtail = ib + *inlen;
206*6008Syy154373 obtail = ob + *outlen;
207*6008Syy154373 do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);
208*6008Syy154373
/* NOTE(review): presumably skips a leading UTF-8 BOM; no state is kept. */
209*6008Syy154373 KICONV_CHECK_UTF8_BOM_WITHOUT_STATE(ib, ibtail);
210*6008Syy154373
211*6008Syy154373 while (ib < ibtail) {
/* A NUL byte ends the conversion unless KICONV_IGNORE_NULL was given. */
212*6008Syy154373 if (*ib == '\0' && do_not_ignore_null)
213*6008Syy154373 break;
214*6008Syy154373
/* Expected byte length of the character, looked up by its first byte. */
215*6008Syy154373 sz = u8_number_of_bytes[*ib];
216*6008Syy154373
/* Single-byte characters are copied through unchanged. */
217*6008Syy154373 if (sz == 1) {
218*6008Syy154373 if (ob >= obtail) {
219*6008Syy154373 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
220*6008Syy154373 }
221*6008Syy154373
222*6008Syy154373 *ob++ = *ib++;
223*6008Syy154373 continue;
224*6008Syy154373 }
225*6008Syy154373
/*
 * Record the character start before any check that can reach
 * REPLACE_INVALID, because that path rewinds ib to oldib when the
 * output area is full.
 */
226*6008Syy154373 oldib = ib;
227*6008Syy154373
/*
 * Invalid leading byte.  NOTE(review): the macro presumably advances ib
 * by one byte and jumps to REPLACE_INVALID when KICONV_REPLACE_INVALID
 * is set, and otherwise reports EILSEQ and leaves the loop -- confirm.
 */
228*6008Syy154373 if (sz <= 0) {
229*6008Syy154373 KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
230*6008Syy154373 }
231*6008Syy154373
/*
 * Incomplete character at the end of the input: with
 * KICONV_REPLACE_INVALID the remaining bytes are consumed and replaced
 * by a single replacement character, otherwise EINVAL is reported.
 */
232*6008Syy154373 if (ibtail - ib < sz) {
233*6008Syy154373 if (flag & KICONV_REPLACE_INVALID) {
234*6008Syy154373 ib = ibtail;
235*6008Syy154373 goto REPLACE_INVALID;
236*6008Syy154373 }
237*6008Syy154373
238*6008Syy154373 KICONV_SET_ERRNO_AND_BREAK(EINVAL);
239*6008Syy154373 }
240*6008Syy154373
241*6008Syy154373 u8 = *ib++;
242*6008Syy154373
/* The second byte is checked against the first byte held in u8. */
243*6008Syy154373 if (KICONV_IS_INVALID_UTF8_SECOND_BYTE(*ib, u8))
244*6008Syy154373 goto ILLEGAL_CHAR_PROCESS;
/* Each further byte is shifted in, so u8 holds the raw bytes in order. */
245*6008Syy154373 u8 = (u8 << 8) | *ib++;
246*6008Syy154373
247*6008Syy154373 for (i = 2; i < sz; i++) {
248*6008Syy154373 if (*ib < 0x80 || *ib > 0xbf) {
/*
 * The label sits inside the loop body so that the second-byte check
 * above shares this path.  With KICONV_REPLACE_INVALID all sz bytes of
 * the character are skipped; otherwise EILSEQ is reported with ib
 * rewound to the start of the character.
 */
249*6008Syy154373 ILLEGAL_CHAR_PROCESS:
250*6008Syy154373 if (flag & KICONV_REPLACE_INVALID) {
251*6008Syy154373 ib = oldib + sz;
252*6008Syy154373 goto REPLACE_INVALID;
253*6008Syy154373 }
254*6008Syy154373
255*6008Syy154373 *errno = EILSEQ;
256*6008Syy154373 ret_val = (size_t)-1;
257*6008Syy154373 ib = oldib;
258*6008Syy154373 goto ILLEGAL_CHAR_ERR;
259*6008Syy154373 }
260*6008Syy154373
261*6008Syy154373 u8 = (u8 << 8) | *ib++;
262*6008Syy154373 }
263*6008Syy154373
264*6008Syy154373 /* Now we get a valid character encoded in UTF-8. */
/*
 * The callback's return value is added to ob below; a negative value
 * is reported as E2BIG with ib rewound so the character is not
 * counted as consumed.
 */
265*6008Syy154373 sz = ptr_utf8tocck(u8, &ib, ibtail, ob, obtail, &ret_val);
266*6008Syy154373 if (sz < 0) {
267*6008Syy154373 ib = oldib;
268*6008Syy154373 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
269*6008Syy154373 }
270*6008Syy154373
271*6008Syy154373 ob += sz;
/* Skip the replacement code below; it is reached only through goto. */
272*6008Syy154373 continue;
273*6008Syy154373
/*
 * Replacement path: emit one replacement character and count it.  If
 * there is no room left, rewind ib so the invalid bytes stay unconsumed.
 */
274*6008Syy154373 REPLACE_INVALID:
275*6008Syy154373 if (ob >= obtail) {
276*6008Syy154373 ib = oldib;
277*6008Syy154373 KICONV_SET_ERRNO_AND_BREAK(E2BIG);
278*6008Syy154373 }
279*6008Syy154373
280*6008Syy154373 *ob++ = KICONV_ASCII_REPLACEMENT_CHAR;
281*6008Syy154373 ret_val++;
282*6008Syy154373 }
283*6008Syy154373
/* Common exit: report the unconsumed input and unused output lengths. */
284*6008Syy154373 ILLEGAL_CHAR_ERR:
285*6008Syy154373 *inlen = ibtail - ib;
286*6008Syy154373 *outlen = obtail - ob;
287*6008Syy154373
288*6008Syy154373 return (ret_val);
289*6008Syy154373 }
290*6008Syy154373
291*6008Syy154373 /*
292*6008Syy154373 * Search key in tbl[0] <= tbl[1] <= ... <= tbl[n-1]. Return 0 if not found.
293*6008Syy154373 * tbl[0] is a special element for non-identical conversion.
294*6008Syy154373 */
295*6008Syy154373 size_t
kiconv_binsearch(uint32_t key,void * tbl,size_t nitems)296*6008Syy154373 kiconv_binsearch(uint32_t key, void *tbl, size_t nitems)
297*6008Syy154373 {
298*6008Syy154373 size_t low, high, mid;
299*6008Syy154373 kiconv_table_t *table;
300*6008Syy154373
301*6008Syy154373 low = 1;
302*6008Syy154373 high = nitems - 1;
303*6008Syy154373 table = (kiconv_table_t *)tbl;
304*6008Syy154373
305*6008Syy154373 while (low <= high) {
306*6008Syy154373 mid = (low + high) / 2;
307*6008Syy154373
308*6008Syy154373 if (key < table[mid].key)
309*6008Syy154373 high = mid - 1;
310*6008Syy154373 else if (key > table[mid].key)
311*6008Syy154373 low = mid + 1;
312*6008Syy154373 else
313*6008Syy154373 return (mid);
314*6008Syy154373 }
315*6008Syy154373
316*6008Syy154373 return (0);
317*6008Syy154373 }
318