1*6008Syy154373 /*
2*6008Syy154373  * CDDL HEADER START
3*6008Syy154373  *
4*6008Syy154373  * The contents of this file are subject to the terms of the
5*6008Syy154373  * Common Development and Distribution License (the "License").
6*6008Syy154373  * You may not use this file except in compliance with the License.
7*6008Syy154373  *
8*6008Syy154373  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*6008Syy154373  * or http://www.opensolaris.org/os/licensing.
10*6008Syy154373  * See the License for the specific language governing permissions
11*6008Syy154373  * and limitations under the License.
12*6008Syy154373  *
13*6008Syy154373  * When distributing Covered Code, include this CDDL HEADER in each
14*6008Syy154373  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*6008Syy154373  * If applicable, add the following below this CDDL HEADER, with the
16*6008Syy154373  * fields enclosed by brackets "[]" replaced with your own identifying
17*6008Syy154373  * information: Portions Copyright [yyyy] [name of copyright owner]
18*6008Syy154373  *
19*6008Syy154373  * CDDL HEADER END
20*6008Syy154373  */
21*6008Syy154373 /*
22*6008Syy154373  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23*6008Syy154373  * Use is subject to license terms.
24*6008Syy154373  */
25*6008Syy154373 
26*6008Syy154373 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27*6008Syy154373 
28*6008Syy154373 #include <sys/types.h>
29*6008Syy154373 #include <sys/param.h>
30*6008Syy154373 #include <sys/sysmacros.h>
31*6008Syy154373 #include <sys/systm.h>
32*6008Syy154373 #include <sys/debug.h>
33*6008Syy154373 #include <sys/kmem.h>
34*6008Syy154373 #include <sys/sunddi.h>
35*6008Syy154373 #include <sys/byteorder.h>
36*6008Syy154373 #include <sys/errno.h>
37*6008Syy154373 #include <sys/u8_textprep.h>
38*6008Syy154373 #include <sys/kiconv.h>
39*6008Syy154373 #include <sys/kiconv_cck_common.h>
40*6008Syy154373 
41*6008Syy154373 /*LINTLIBRARY*/
42*6008Syy154373 
43*6008Syy154373 /*
44*6008Syy154373  * Common kiconv_open method for UTF-8 -> CCK conversion.
45*6008Syy154373  */
46*6008Syy154373 void *
kiconv_open_to_cck()47*6008Syy154373 kiconv_open_to_cck()
48*6008Syy154373 {
49*6008Syy154373 	kiconv_state_t st;
50*6008Syy154373 
51*6008Syy154373 	st = (kiconv_state_t)kmem_alloc(sizeof (kiconv_state_data_t), KM_SLEEP);
52*6008Syy154373 
53*6008Syy154373 	st->bom_processed = 0;
54*6008Syy154373 
55*6008Syy154373 	return ((void *)st);
56*6008Syy154373 }
57*6008Syy154373 
58*6008Syy154373 /*
59*6008Syy154373  * Common kiconv_close method for UTF-8 -> CCK conversion.
60*6008Syy154373  */
61*6008Syy154373 int
kiconv_close_to_cck(void * kcd)62*6008Syy154373 kiconv_close_to_cck(void *kcd)
63*6008Syy154373 {
64*6008Syy154373 	if (! kcd || kcd == (void *)-1)
65*6008Syy154373 		return (EBADF);
66*6008Syy154373 
67*6008Syy154373 	kmem_free(kcd, sizeof (kiconv_state_data_t));
68*6008Syy154373 
69*6008Syy154373 	return (0);
70*6008Syy154373 }
71*6008Syy154373 
72*6008Syy154373 /*
 * Common routine to convert UTF-8 sequence to CCK legal character sequence.
 *
 * kcd            conversion descriptor; NULL or (void *)-1 is rejected with
 *                *errno = EBADF and a return value of (size_t)-1.
 * inbuf          in/out cursor into the UTF-8 input.  If inbuf or *inbuf is
 *                NULL the call is a state reset: bom_processed is cleared
 *                and 0 is returned without touching the other arguments.
 * inbytesleft    in/out count of input bytes not yet consumed.
 * outbuf         in/out cursor into the output buffer.
 * outbytesleft   in/out count of output bytes still free.
 * errno          receives the error code on failure.
 * ptr_utf8tocck  per-codeset callback.  It is handed the bytes of one
 *                validated multi-byte character packed into a uint32_t
 *                (lead byte in the most significant position used) and
 *                returns the number of output bytes it produced, or a
 *                negative value, which is reported here as E2BIG.
 *
 * Errors raised in this routine: E2BIG (no output room), EILSEQ (bad lead
 * byte or bad following byte) and EINVAL (character cut off by the end of
 * the input).  On an error the input cursor is left at the first byte of
 * the character that could not be converted.
 *
 * The return value is ret_val: it starts at 0 and is passed by address to
 * the callback (presumably so non-identical conversions can be counted --
 * confirm against the callbacks).
 *
 * NOTE(review): KICONV_SET_ERRNO_AND_BREAK and KICONV_CHECK_UTF8_BOM are
 * defined outside this file.  They appear to refer to the locals errno,
 * ret_val and kcd by name and to contain a "break", so neither the names
 * nor the surrounding loop structure should be changed without checking
 * the header.
 */
75*6008Syy154373 size_t
kiconv_utf8_to_cck(void * kcd,char ** inbuf,size_t * inbytesleft,char ** outbuf,size_t * outbytesleft,int * errno,kiconv_utf8tocck_t ptr_utf8tocck)76*6008Syy154373 kiconv_utf8_to_cck(void *kcd, char **inbuf, size_t *inbytesleft,
77*6008Syy154373 	char **outbuf, size_t *outbytesleft, int *errno,
78*6008Syy154373 	kiconv_utf8tocck_t ptr_utf8tocck)
79*6008Syy154373 {
80*6008Syy154373 	uchar_t		*ib;		/* input read cursor */
81*6008Syy154373 	uchar_t		*ob;		/* output write cursor */
82*6008Syy154373 	uchar_t		*ibtail;	/* one past the last input byte */
83*6008Syy154373 	uchar_t		*obtail;	/* one past the last output byte */
84*6008Syy154373 	uchar_t		*oldib;		/* start of the current character */
85*6008Syy154373 	size_t		ret_val;	/* value returned to the caller */
86*6008Syy154373 	size_t		i;		/* temp variable in for loop */
87*6008Syy154373 	uint32_t	u8;		/* packed bytes of the character */
88*6008Syy154373 	int8_t		sz;		/* byte count; later callback result */
89*6008Syy154373 
90*6008Syy154373 	/* Check on the kiconv code conversion descriptor. */
91*6008Syy154373 	if (! kcd || kcd == (void *)-1) {
92*6008Syy154373 		*errno = EBADF;
93*6008Syy154373 		return ((size_t)-1);
94*6008Syy154373 	}
95*6008Syy154373 
96*6008Syy154373 	/* If this is a state reset request, process and return. */
97*6008Syy154373 	if (! inbuf || !(*inbuf)) {
98*6008Syy154373 		((kiconv_state_t)kcd)->bom_processed = 0;
99*6008Syy154373 		return (0);
100*6008Syy154373 	}
101*6008Syy154373 
102*6008Syy154373 	ret_val = 0;
103*6008Syy154373 	ib = (uchar_t *)*inbuf;
104*6008Syy154373 	ob = (uchar_t *)*outbuf;
105*6008Syy154373 	ibtail = ib + *inbytesleft;
106*6008Syy154373 	obtail = ob + *outbytesleft;
107*6008Syy154373 
	/*
	 * NOTE(review): macro defined outside this file; presumably it skips
	 * a leading UTF-8 BOM once per descriptor and records that in
	 * bom_processed -- confirm.
	 */
108*6008Syy154373 	KICONV_CHECK_UTF8_BOM(ib, ibtail);
109*6008Syy154373 
110*6008Syy154373 	while (ib < ibtail) {
111*6008Syy154373 		sz = u8_number_of_bytes[*ib];
112*6008Syy154373 
113*6008Syy154373 		/*
114*6008Syy154373 		 * If it is a 7-bit ASCII character, we don't need to
115*6008Syy154373 		 * process further and we just copy the character over.
116*6008Syy154373 		 *
117*6008Syy154373 		 * If not, we connect the character bytes up to four bytes,
118*6008Syy154373 		 * validate the bytes, and binary search for the corresponding
119*6008Syy154373 		 * table. If we find it from the mapping table, we put that
120*6008Syy154373 		 * into the output buffer; otherwise, we put a replacement
121*6008Syy154373 		 * character instead as a non-identical conversion.
122*6008Syy154373 		 */
123*6008Syy154373 		if (sz == 1) {
124*6008Syy154373 			if (ob >= obtail) {
125*6008Syy154373 				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
126*6008Syy154373 			}
127*6008Syy154373 
128*6008Syy154373 			*ob++ = *ib++;
129*6008Syy154373 			continue;
130*6008Syy154373 		}
131*6008Syy154373 
132*6008Syy154373 		/*
133*6008Syy154373 		 * Issue EILSEQ error if the first byte is a
134*6008Syy154373 		 * invalid UTF-8 character leading byte.
135*6008Syy154373 		 */
136*6008Syy154373 		if (sz <= 0) {
137*6008Syy154373 			KICONV_SET_ERRNO_AND_BREAK(EILSEQ);
138*6008Syy154373 		}
139*6008Syy154373 
140*6008Syy154373 		/*
141*6008Syy154373 		 * Issue EINVAL error if input buffer has an incomplete
142*6008Syy154373 		 * character at the end of the buffer.
143*6008Syy154373 		 */
144*6008Syy154373 		if (ibtail - ib < sz) {
145*6008Syy154373 			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
146*6008Syy154373 		}
147*6008Syy154373 
148*6008Syy154373 		/*
149*6008Syy154373 		 * We collect UTF-8 character bytes and also check if this
150*6008Syy154373 		 * is a valid UTF-8 character without any bogus bytes based
151*6008Syy154373 		 * on the latest UTF-8 binary representation.
152*6008Syy154373 		 */
153*6008Syy154373 		oldib = ib;
154*6008Syy154373 		u8 = *ib++;
155*6008Syy154373 
		/*
		 * The second byte is validated against the lead byte by a
		 * macro defined outside this file; the bytes after it only
		 * get the 0x80..0xbf range check in the loop below.
		 */
156*6008Syy154373 		if (KICONV_IS_INVALID_UTF8_SECOND_BYTE(*ib, u8))
157*6008Syy154373 			goto ILLEGAL_CHAR_PROCESS;
158*6008Syy154373 		u8 = (u8 << 8) | *ib++;
159*6008Syy154373 
160*6008Syy154373 		for (i = 2; i < sz; i++) {
161*6008Syy154373 			if (*ib < 0x80 || *ib > 0xbf) {
				/*
				 * Also entered by goto from the second-byte
				 * check above.  Rewind to the start of the
				 * character and leave through the common
				 * exit with EILSEQ.
				 */
162*6008Syy154373 ILLEGAL_CHAR_PROCESS:
163*6008Syy154373 				*errno = EILSEQ;
164*6008Syy154373 				ret_val = (size_t)-1;
165*6008Syy154373 				ib = oldib;
166*6008Syy154373 				goto ILLEGAL_CHAR_ERR;
167*6008Syy154373 			}
168*6008Syy154373 
169*6008Syy154373 			u8 = (u8 << 8) | *ib++;
170*6008Syy154373 		}
171*6008Syy154373 
172*6008Syy154373 		/* Now we have a valid UTF-8 character. */
		/*
		 * sz is reused for the callback result.  The callback is
		 * given &ib and &ret_val, so it may move the input cursor
		 * and adjust the return value; a negative result rewinds to
		 * the start of the character and is reported as E2BIG.
		 */
173*6008Syy154373 		sz = ptr_utf8tocck(u8, &ib, ibtail, ob, obtail, &ret_val);
174*6008Syy154373 		if (sz < 0) {
175*6008Syy154373 			ib = oldib;
176*6008Syy154373 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
177*6008Syy154373 		}
178*6008Syy154373 
179*6008Syy154373 		ob += sz;
180*6008Syy154373 	}
181*6008Syy154373 
	/*
	 * Common exit, also reached when the loop finishes normally:
	 * publish the cursor positions and the remaining byte counts.
	 */
182*6008Syy154373 ILLEGAL_CHAR_ERR:
183*6008Syy154373 	*inbuf = (char *)ib;
184*6008Syy154373 	*inbytesleft = ibtail - ib;
185*6008Syy154373 	*outbuf = (char *)ob;
186*6008Syy154373 	*outbytesleft = obtail - ob;
187*6008Syy154373 
188*6008Syy154373 	return (ret_val);
189*6008Syy154373 }
190*6008Syy154373 
/*
 * Common routine for string based (one call, no descriptor) UTF-8 -> CCK
 * conversion.
 *
 * ib / inlen     input buffer and, on entry, its length in bytes; on return
 *                *inlen is the number of input bytes not consumed.
 * ob / outlen    output buffer and, on entry, its size in bytes; on return
 *                *outlen is the number of output bytes still free.
 * flag           KICONV_IGNORE_NULL: when clear, conversion stops at the
 *                first '\0' input byte; when set, a '\0' does not end the
 *                conversion.
 *                KICONV_REPLACE_INVALID: when set, an illegal or truncated
 *                sequence produces one KICONV_ASCII_REPLACEMENT_CHAR and
 *                adds one to the return value instead of failing.
 * errno          receives the error code on failure.
 * ptr_utf8tocck  per-codeset callback.  It is handed the bytes of one
 *                validated multi-byte character packed into a uint32_t and
 *                returns the number of output bytes it produced, or a
 *                negative value, which is reported here as E2BIG.
 *
 * Returns ret_val: 0 plus one per replacement made here (the callback is
 * also given &ret_val and may adjust it), or (size_t)-1 on the error paths.
 *
 * NOTE(review): KICONV_SET_ERRNO_AND_BREAK, KICONV_SET_ERRNO_WITH_FLAG and
 * KICONV_CHECK_UTF8_BOM_WITHOUT_STATE are defined outside this file.  They
 * appear to refer to the locals errno, ret_val, flag and ib and to the
 * REPLACE_INVALID label by name, so those names and the loop structure
 * should not be changed without checking the header.
 */
191*6008Syy154373 size_t
kiconvstr_utf8_to_cck(uchar_t * ib,size_t * inlen,uchar_t * ob,size_t * outlen,int flag,int * errno,kiconv_utf8tocck_t ptr_utf8tocck)192*6008Syy154373 kiconvstr_utf8_to_cck(uchar_t *ib, size_t *inlen, uchar_t *ob, size_t *outlen,
193*6008Syy154373 	int flag, int *errno, kiconv_utf8tocck_t ptr_utf8tocck)
194*6008Syy154373 {
195*6008Syy154373 	uchar_t		*ibtail;	/* one past the last input byte */
196*6008Syy154373 	uchar_t		*obtail;	/* one past the last output byte */
197*6008Syy154373 	uchar_t		*oldib;		/* start of the current character */
198*6008Syy154373 	size_t		ret_val;	/* value returned to the caller */
199*6008Syy154373 	size_t		i;		/* temp variable in for loop */
200*6008Syy154373 	uint32_t	u8;		/* packed bytes of the character */
201*6008Syy154373 	int8_t		sz;		/* byte count; later callback result */
202*6008Syy154373 	boolean_t	do_not_ignore_null;	/* stop at the first '\0' */
203*6008Syy154373 
204*6008Syy154373 	ret_val = 0;
205*6008Syy154373 	ibtail = ib + *inlen;
206*6008Syy154373 	obtail = ob + *outlen;
207*6008Syy154373 	do_not_ignore_null = ((flag & KICONV_IGNORE_NULL) == 0);
208*6008Syy154373 
	/*
	 * NOTE(review): macro defined outside this file; presumably it skips
	 * a leading UTF-8 BOM -- confirm.
	 */
209*6008Syy154373 	KICONV_CHECK_UTF8_BOM_WITHOUT_STATE(ib, ibtail);
210*6008Syy154373 
211*6008Syy154373 	while (ib < ibtail) {
		/* The terminating '\0' itself is neither copied nor consumed. */
212*6008Syy154373 		if (*ib == '\0' && do_not_ignore_null)
213*6008Syy154373 			break;
214*6008Syy154373 
215*6008Syy154373 		sz = u8_number_of_bytes[*ib];
216*6008Syy154373 
		/* Single-byte characters are copied through unchanged. */
217*6008Syy154373 		if (sz == 1) {
218*6008Syy154373 			if (ob >= obtail) {
219*6008Syy154373 				KICONV_SET_ERRNO_AND_BREAK(E2BIG);
220*6008Syy154373 			}
221*6008Syy154373 
222*6008Syy154373 			*ob++ = *ib++;
223*6008Syy154373 			continue;
224*6008Syy154373 		}
225*6008Syy154373 
		/*
		 * Remember the start of the character before any path that
		 * can reach REPLACE_INVALID, which rewinds to oldib when the
		 * output buffer is full.
		 */
226*6008Syy154373 		oldib = ib;
227*6008Syy154373 
		/*
		 * Invalid lead byte.  NOTE(review): the macro presumably
		 * skips 1 byte and jumps to REPLACE_INVALID when
		 * KICONV_REPLACE_INVALID is set, and otherwise sets EILSEQ
		 * and breaks -- confirm against the header.
		 */
228*6008Syy154373 		if (sz <= 0) {
229*6008Syy154373 			KICONV_SET_ERRNO_WITH_FLAG(1, EILSEQ);
230*6008Syy154373 		}
231*6008Syy154373 
		/*
		 * Character cut off by the end of the input: in replace mode
		 * the whole remainder is consumed and yields one replacement
		 * character; otherwise this is EINVAL.
		 */
232*6008Syy154373 		if (ibtail - ib < sz) {
233*6008Syy154373 			if (flag & KICONV_REPLACE_INVALID) {
234*6008Syy154373 				ib = ibtail;
235*6008Syy154373 				goto REPLACE_INVALID;
236*6008Syy154373 			}
237*6008Syy154373 
238*6008Syy154373 			KICONV_SET_ERRNO_AND_BREAK(EINVAL);
239*6008Syy154373 		}
240*6008Syy154373 
241*6008Syy154373 		u8 = *ib++;
242*6008Syy154373 
		/*
		 * The second byte is validated against the lead byte by a
		 * macro defined outside this file; the bytes after it only
		 * get the 0x80..0xbf range check in the loop below.
		 */
243*6008Syy154373 		if (KICONV_IS_INVALID_UTF8_SECOND_BYTE(*ib, u8))
244*6008Syy154373 			goto ILLEGAL_CHAR_PROCESS;
245*6008Syy154373 		u8 = (u8 << 8) | *ib++;
246*6008Syy154373 
247*6008Syy154373 		for (i = 2; i < sz; i++) {
248*6008Syy154373 			if (*ib < 0x80 || *ib > 0xbf) {
				/*
				 * Also entered by goto from the second-byte
				 * check above.  In replace mode all sz bytes
				 * announced by the lead byte are skipped, no
				 * matter which of them was the bad one; the
				 * length check above guarantees that this
				 * stays within the input.  Otherwise rewind
				 * and fail with EILSEQ.
				 */
249*6008Syy154373 ILLEGAL_CHAR_PROCESS:
250*6008Syy154373 				if (flag & KICONV_REPLACE_INVALID) {
251*6008Syy154373 					ib = oldib + sz;
252*6008Syy154373 					goto REPLACE_INVALID;
253*6008Syy154373 				}
254*6008Syy154373 
255*6008Syy154373 				*errno = EILSEQ;
256*6008Syy154373 				ret_val = (size_t)-1;
257*6008Syy154373 				ib = oldib;
258*6008Syy154373 				goto ILLEGAL_CHAR_ERR;
259*6008Syy154373 			}
260*6008Syy154373 
261*6008Syy154373 			u8 = (u8 << 8) | *ib++;
262*6008Syy154373 		}
263*6008Syy154373 
264*6008Syy154373 		/* Now we get a valid character encoded in UTF-8. */
		/*
		 * sz is reused for the callback result.  The callback is
		 * given &ib and &ret_val, so it may move the input cursor
		 * and adjust the return value; a negative result rewinds to
		 * the start of the character and is reported as E2BIG.
		 */
265*6008Syy154373 		sz = ptr_utf8tocck(u8, &ib, ibtail, ob, obtail, &ret_val);
266*6008Syy154373 		if (sz < 0) {
267*6008Syy154373 			ib = oldib;
268*6008Syy154373 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
269*6008Syy154373 		}
270*6008Syy154373 
271*6008Syy154373 		ob += sz;
272*6008Syy154373 		continue;
273*6008Syy154373 
		/*
		 * Only reached by goto (the "continue" above skips it).  ib
		 * already points past the bad sequence; emit one replacement
		 * character and count it, or rewind and fail with E2BIG if
		 * there is no room for it.
		 */
274*6008Syy154373 REPLACE_INVALID:
275*6008Syy154373 		if (ob >= obtail) {
276*6008Syy154373 			ib = oldib;
277*6008Syy154373 			KICONV_SET_ERRNO_AND_BREAK(E2BIG);
278*6008Syy154373 		}
279*6008Syy154373 
280*6008Syy154373 		*ob++ = KICONV_ASCII_REPLACEMENT_CHAR;
281*6008Syy154373 		ret_val++;
282*6008Syy154373 	}
283*6008Syy154373 
	/*
	 * Common exit, also reached when the loop finishes normally:
	 * report how much input is left and how much output room remains.
	 */
284*6008Syy154373 ILLEGAL_CHAR_ERR:
285*6008Syy154373 	*inlen = ibtail - ib;
286*6008Syy154373 	*outlen = obtail - ob;
287*6008Syy154373 
288*6008Syy154373 	return (ret_val);
289*6008Syy154373 }
290*6008Syy154373 
291*6008Syy154373 /*
292*6008Syy154373  * Search key in tbl[0] <= tbl[1] <= ... <= tbl[n-1].  Return 0 if not found.
293*6008Syy154373  * tbl[0] is a special element for non-identical conversion.
294*6008Syy154373  */
295*6008Syy154373 size_t
kiconv_binsearch(uint32_t key,void * tbl,size_t nitems)296*6008Syy154373 kiconv_binsearch(uint32_t key, void *tbl, size_t nitems)
297*6008Syy154373 {
298*6008Syy154373 	size_t low, high, mid;
299*6008Syy154373 	kiconv_table_t *table;
300*6008Syy154373 
301*6008Syy154373 	low = 1;
302*6008Syy154373 	high = nitems - 1;
303*6008Syy154373 	table = (kiconv_table_t *)tbl;
304*6008Syy154373 
305*6008Syy154373 	while (low <= high) {
306*6008Syy154373 		mid = (low + high) / 2;
307*6008Syy154373 
308*6008Syy154373 		if (key < table[mid].key)
309*6008Syy154373 			high = mid - 1;
310*6008Syy154373 		else if (key > table[mid].key)
311*6008Syy154373 			low = mid + 1;
312*6008Syy154373 		else
313*6008Syy154373 			return (mid);
314*6008Syy154373 	}
315*6008Syy154373 
316*6008Syy154373 	return (0);
317*6008Syy154373 }
318