xref: /netbsd-src/external/bsd/openldap/dist/libraries/libldap/utf-8-conv.c (revision b1c86f5f087524e68db12794ee9c3e3da1ab17a0)
1 /*	$NetBSD: utf-8-conv.c,v 1.1.1.2 2010/03/08 02:14:20 lukem Exp $	*/
2 
3 /* OpenLDAP: pkg/ldap/libraries/libldap/utf-8-conv.c,v 1.16.2.5 2009/08/25 22:58:08 quanah Exp */
4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5  *
6  * Copyright 1998-2009 The OpenLDAP Foundation.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted only as authorized by the OpenLDAP
11  * Public License.
12  *
13  * A copy of this license is available in the file LICENSE in the
14  * top-level directory of the distribution or, alternatively, at
15  * <http://www.OpenLDAP.org/license.html>.
16  */
17 /* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
18  *
19  * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
20  * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
21  * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
22  * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
23  * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
24  * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
25  * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
26  * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
27  *---
28  * Note: A verbatim copy of version 2.0.1 of the OpenLDAP Public License
29  * can be found in the file "build/LICENSE-2.0.1" in this distribution
30  * of OpenLDAP Software.
31  */
32 
33 /*
34  * UTF-8 Conversion Routines
35  *
36  * These routines convert between Wide Character and UTF-8,
37  * or between MultiByte and UTF-8 encodings.
38  *
39  * Both single character and string versions of the functions are provided.
40  * All functions return -1 if the character or string cannot be converted.
41  */
42 
43 #include "portable.h"
44 
45 #if SIZEOF_WCHAR_T >= 4
46 /* These routines assume ( sizeof(wchar_t) >= 4 ) */
47 
#include <limits.h>			/* For MB_LEN_MAX */
#include <stdio.h>
#include <ac/stdlib.h>		/* For wctomb, wcstombs, mbtowc, mbstowcs */
#include <ac/string.h>
#include <ac/time.h>		/* for time_t */
52 
53 #include "ldap-int.h"
54 
55 #include <ldap_utf8.h>
56 
/* Payload-bit masks applied to the first byte of a UTF-8 sequence, indexed
 * by the sequence length in bytes (1..6).  Slot 0 is a placeholder so the
 * length can be used directly as the index.  The table is never modified.
 */
static const unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
58 
59 
60 /*-----------------------------------------------------------------------------
61 					UTF-8 Format Summary
62 
63 ASCII chars 						7 bits
64     0xxxxxxx
65 
66 2-character UTF-8 sequence:        11 bits
67     110xxxxx  10xxxxxx
68 
69 3-character UTF-8                  16 bits
70     1110xxxx  10xxxxxx  10xxxxxx
71 
72 4-char UTF-8                       21 bits
73     11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
74 
75 5-char UTF-8                       26 bits
76     111110xx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
77 
78 6-char UTF-8                       31 bits
79     1111110x  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
80 
81 Unicode address space   (0 - 0x10FFFF)    21 bits
82 ISO-10646 address space (0 - 0x7FFFFFFF)  31 bits
83 
84 Note: This code does not prevent UTF-8 sequences which are longer than
85       necessary from being decoded.
86 */
87 
88 /*-----------------------------------------------------------------------------
89    Convert a UTF-8 character to a wide char.
90    Return the length of the UTF-8 input character in bytes.
91 */
92 int
93 ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
94 {
95 	int utflen, i;
96 	wchar_t ch;
97 
98 	if (utf8char == NULL) return -1;
99 
100 	/* Get UTF-8 sequence length from 1st byte */
101 	utflen = LDAP_UTF8_CHARLEN2(utf8char, utflen);
102 
103 	if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
104 
105 	/* First byte minus length tag */
106 	ch = (wchar_t)(utf8char[0] & mask[utflen]);
107 
108 	for(i=1; i < utflen; i++) {
109 		/* Subsequent bytes must start with 10 */
110 		if ((utf8char[i] & 0xc0) != 0x80) return -1;
111 
112 		ch <<= 6;			/* 6 bits of data in each subsequent byte */
113 		ch |= (wchar_t)(utf8char[i] & 0x3f);
114 	}
115 
116 	if (wchar) *wchar = ch;
117 
118 	return utflen;
119 }
120 
121 /*-----------------------------------------------------------------------------
122    Convert a UTF-8 string to a wide char string.
123    No more than 'count' wide chars will be written to the output buffer.
124    Return the size of the converted string in wide chars, excl null terminator.
125 */
126 int
127 ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
128 {
129 	size_t wclen = 0;
130 	int utflen, i;
131 	wchar_t ch;
132 
133 
134 	/* If input ptr is NULL or empty... */
135 	if (utf8str == NULL || !*utf8str) {
136 		if ( wcstr )
137 			*wcstr = 0;
138 		return 0;
139 	}
140 
141 	/* Examine next UTF-8 character.  If output buffer is NULL, ignore count */
142 	while ( *utf8str && (wcstr==NULL || wclen<count) ) {
143 		/* Get UTF-8 sequence length from 1st byte */
144 		utflen = LDAP_UTF8_CHARLEN2(utf8str, utflen);
145 
146 		if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
147 
148 		/* First byte minus length tag */
149 		ch = (wchar_t)(utf8str[0] & mask[utflen]);
150 
151 		for(i=1; i < utflen; i++) {
152 			/* Subsequent bytes must start with 10 */
153 			if ((utf8str[i] & 0xc0) != 0x80) return -1;
154 
155 			ch <<= 6;			/* 6 bits of data in each subsequent byte */
156 			ch |= (wchar_t)(utf8str[i] & 0x3f);
157 		}
158 
159 		if (wcstr) wcstr[wclen] = ch;
160 
161 		utf8str += utflen;	/* Move to next UTF-8 character */
162 		wclen++;			/* Count number of wide chars stored/required */
163 	}
164 
165 	/* Add null terminator if there's room in the buffer. */
166 	if (wcstr && wclen < count) wcstr[wclen] = 0;
167 
168 	return wclen;
169 }
170 
171 
/*-----------------------------------------------------------------------------
   Convert one wide char to a UTF-8 character.

   utf8char  Output buffer, or NULL to only compute the encoded length
             (in which case 'count' is ignored).
   wchar     Value to encode; must lie in the range 0 .. 0x7FFFFFFF.
   count     Capacity of 'utf8char' in bytes.

   Returns the length of the UTF-8 sequence in bytes (1..6), 0 if the
   sequence does not fit in 'count' bytes (nothing is written), or -1 if
   'wchar' is outside the encodable 31-bit range.  No NUL terminator is
   written.
*/
int
ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
{
	/* Length tag OR-ed into the first byte, indexed by sequence length */
	static const unsigned char lead_tag[] =
		{ 0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
	unsigned long code;
	int need, shift, i;

	/* Reject values that cannot be expressed in 31 bits.  The upper bound
	   is checked explicitly, independent of the width and signedness of
	   wchar_t: with a 32-bit unsigned wchar_t a value >= 0x80000000 is
	   not negative and would otherwise be encoded with an invalid
	   0xFE/0xFF first byte. */
	if ( wchar < 0 || wchar > (wchar_t)0x7fffffffL )
		return -1;

	code = (unsigned long)wchar;

	/* Number of bytes needed for this value */
	if ( code < 0x80UL )
		need = 1;
	else if ( code < 0x800UL )
		need = 2;
	else if ( code < 0x10000UL )
		need = 3;
	else if ( code < 0x200000UL )
		need = 4;
	else if ( code < 0x4000000UL )
		need = 5;
	else
		need = 6;

	if ( utf8char == NULL )		/* Caller only wants the length */
		return need;

	if ( count < (size_t)need )	/* Won't fit: write nothing */
		return 0;

	if ( need == 1 ) {
		utf8char[0] = (char)code;
		return 1;
	}

	/* First byte: length tag plus the most significant payload bits */
	shift = 6 * ( need - 1 );
	utf8char[0] = (char)( lead_tag[need] | ( code >> shift ) );

	/* Remaining bytes: 10xxxxxx, six payload bits each, high to low */
	for ( i = 1; i < need; i++ ) {
		shift -= 6;
		utf8char[i] = (char)( 0x80 | ( ( code >> shift ) & 0x3f ) );
	}

	return need;
}
267 
268 
269 /*-----------------------------------------------------------------------------
270    Convert a wide char string to a UTF-8 string.
271    No more than 'count' bytes will be written to the output buffer.
272    Return the # of bytes written to the output buffer, excl null terminator.
273 */
274 int
275 ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
276 {
277 	int len = 0;
278 	int n;
279 	char *p = utf8str;
280 	wchar_t empty = 0;		/* To avoid use of L"" construct */
281 
282 	if (wcstr == NULL)		/* Treat input ptr NULL as an empty string */
283 		wcstr = &empty;
284 
285 	if (utf8str == NULL)	/* Just compute size of output, excl null */
286 	{
287 		while (*wcstr)
288 		{
289 			/* Get UTF-8 size of next wide char */
290 			n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN);
291 			if (n == -1)
292 				return -1;
293 			len += n;
294 		}
295 
296 		return len;
297 	}
298 
299 
300 	/* Do the actual conversion. */
301 
302 	n = 1;					/* In case of empty wcstr */
303 	while (*wcstr)
304 	{
305 		n = ldap_x_wc_to_utf8( p, *wcstr++, count);
306 
307 		if (n <= 0)  		/* If encoding error (-1) or won't fit (0), quit */
308 			break;
309 
310 		p += n;
311 		count -= n;			/* Space left in output buffer */
312 	}
313 
314 	/* If not enough room for last character, pad remainder with null
315 	   so that return value = original count, indicating buffer full. */
316 	if (n == 0)
317 	{
318 		while (count--)
319 			*p++ = 0;
320 	}
321 
322 	/* Add a null terminator if there's room. */
323 	else if (count)
324 		*p = 0;
325 
326 	if (n == -1)			/* Conversion encountered invalid wide char. */
327 		return -1;
328 
329 	/* Return the number of bytes written to output buffer, excl null. */
330 	return (p - utf8str);
331 }
332 
333 
/*-----------------------------------------------------------------------------
   Convert a UTF-8 character to a MultiByte character.

   mbchar    Output buffer, or NULL to convert into a scratch buffer and
             only report the size.
   utf8char  Points at the first byte of the UTF-8 sequence.
   f_wctomb  Wide-to-multibyte converter; NULL selects the C library's
             wctomb().

   Returns whatever the converter returns (for wctomb(), the size of the
   multibyte character in bytes, or -1 if it cannot be represented), or -1
   if the UTF-8 input is invalid.
*/
int
ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
		int (*f_wctomb)(char *mbchar, wchar_t wchar) )
{
	wchar_t wchar;
	int n;
	/* Scratch output for the size-only case.  wctomb() may store up to
	   MB_CUR_MAX bytes, which is bounded by MB_LEN_MAX but can exceed 6
	   on some systems, so size the buffer for the worst case. */
	char tmp[MB_LEN_MAX];

	if (f_wctomb == NULL)		/* If no conversion function was given... */
		f_wctomb = wctomb;		/*    use the local ANSI C function */

	/* First convert UTF-8 char to a wide char */
	n = ldap_x_utf8_to_wc( &wchar, utf8char);

	if (n == -1)
		return -1;		/* Invalid UTF-8 character */

	/* Then convert the wide char, into the scratch buffer if the caller
	   did not supply an output buffer. */
	if (mbchar == NULL)
		n = f_wctomb( tmp, wchar );
	else
		n = f_wctomb( mbchar, wchar);

	return n;
}
362 
/*-----------------------------------------------------------------------------
   Convert a UTF-8 string to a MultiByte string, going through a temporary
   wide char string.

   mbstr       Output buffer; passed unchanged to the converter.
   utf8str     Input string; NULL or empty yields an empty result.
   count       Passed unchanged to the converter as its byte limit.
   f_wcstombs  Wide-to-multibyte string converter; NULL selects the C
               library's wcstombs().

   Returns the converter's result narrowed to int (for wcstombs(), the
   number of bytes produced excluding the terminator), 0 for NULL/empty
   input, or -1 on allocation failure or invalid UTF-8 input.
*/
int
ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
		size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
{
	wchar_t *widebuf;
	size_t widecap;
	int result;

	/* NULL or empty input: produce an empty string if we have a buffer */
	if ( utf8str == NULL || *utf8str == '\0' ) {
		if ( mbstr != NULL ) {
			mbstr[0] = '\0';
		}
		return 0;
	}

	if ( f_wcstombs == NULL ) {
		f_wcstombs = wcstombs;	/* Fall back to the ANSI C function */
	}

	/* A UTF-8 string never decodes to more wide chars than it has bytes,
	   so one slot per byte plus the terminator is always enough. */
	widecap = strlen( utf8str ) + 1;
	widebuf = (wchar_t *)LDAP_MALLOC( widecap * sizeof(wchar_t) );
	if ( widebuf == NULL ) {
		return -1;				/* Out of memory */
	}

	/* Step 1: UTF-8 -> wide chars */
	result = ldap_x_utf8s_to_wcs( widebuf, utf8str, widecap );
	if ( result == -1 ) {
		LDAP_FREE( widebuf );
		return -1;
	}

	/* Step 2: wide chars -> multibyte */
	result = f_wcstombs( mbstr, widebuf, count );

	LDAP_FREE( widebuf );
	return result;
}
405 
406 /*-----------------------------------------------------------------------------
407    Convert a MultiByte character to a UTF-8 character.
408    'mbsize' indicates the number of bytes of 'mbchar' to check.
409    Returns the number of bytes written to the output character.
410 */
411 int
412 ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
413 		int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
414 {
415     wchar_t wchar;
416     int n;
417 
418 	if (f_mbtowc == NULL)		/* If no conversion function was given... */
419 		f_mbtowc = mbtowc;		/*    use the local ANSI C function */
420 
421     if (mbsize == 0)				/* 0 is not valid. */
422         return -1;
423 
424     if (mbchar == NULL || *mbchar == 0)
425     {
426         if (utf8char)
427             *utf8char = 0;
428         return 1;
429     }
430 
431 	/* First convert the MB char to a Wide Char */
432 	n = f_mbtowc( &wchar, mbchar, mbsize);
433 
434 	if (n == -1)
435 		return -1;
436 
437 	/* Convert the Wide Char to a UTF-8 character. */
438 	n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN);
439 
440 	return n;
441 }
442 
443 
/*-----------------------------------------------------------------------------
   Convert a MultiByte string to a UTF-8 string, going through a temporary
   wide char string.

   utf8str     Output buffer; passed unchanged to ldap_x_wcs_to_utf8s().
   mbstr       Input string; NULL is handled like an empty string.
   count       Capacity of 'utf8str' in bytes.
   f_mbstowcs  Multibyte-to-wide string converter; NULL selects the C
               library's mbstowcs().

   Returns the result of ldap_x_wcs_to_utf8s() on the intermediate wide
   string, or -1 on allocation failure or if the converter reports failure.
*/
int
ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
		size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
{
	const char *src;
	wchar_t *widebuf;
	size_t widecap;
	int result;

	src = ( mbstr != NULL ) ? mbstr : "";

	if ( f_mbstowcs == NULL ) {
		f_mbstowcs = mbstowcs;	/* Fall back to the ANSI C function */
	}

	/* One wide char per input byte plus a terminator is the most the
	   conversion can produce. */
	widecap = strlen( src ) + 1;
	widebuf = (wchar_t *)LDAP_MALLOC( widecap * sizeof(wchar_t) );
	if ( widebuf == NULL ) {
		return -1;
	}

	/* Step 1: multibyte -> wide chars */
	result = f_mbstowcs( widebuf, src, widecap );
	if ( result == -1 ) {
		LDAP_FREE( widebuf );
		return -1;
	}

	/* Step 2: wide chars -> UTF-8 */
	result = ldap_x_wcs_to_utf8s( utf8str, widebuf, count );

	LDAP_FREE( widebuf );
	return result;
}
482 
483 #endif /* SIZEOF_WCHAR_T >= 4 */
484