xref: /netbsd-src/external/bsd/openldap/dist/libraries/libldap/utf-8-conv.c (revision 404fbe5fb94ca1e054339640cabb2801ce52dd30)
1 /* $OpenLDAP: pkg/ldap/libraries/libldap/utf-8-conv.c,v 1.16.2.3 2008/02/11 23:26:41 kurt Exp $ */
2 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
3  *
4  * Copyright 1998-2008 The OpenLDAP Foundation.
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted only as authorized by the OpenLDAP
9  * Public License.
10  *
11  * A copy of this license is available in the file LICENSE in the
12  * top-level directory of the distribution or, alternatively, at
13  * <http://www.OpenLDAP.org/license.html>.
14  */
15 /* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
16  *
17  * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
18  * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
19  * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
20  * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
21  * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
22  * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
23  * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
24  * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
25  *---
26  * Note: A verbatim copy of version 2.0.1 of the OpenLDAP Public License
27  * can be found in the file "build/LICENSE-2.0.1" in this distribution
28  * of OpenLDAP Software.
29  */
30 
31 /*
32  * UTF-8 Conversion Routines
33  *
34  * These routines convert between Wide Character and UTF-8,
35  * or between MultiByte and UTF-8 encodings.
36  *
37  * Both single character and string versions of the functions are provided.
38  * All functions return -1 if the character or string cannot be converted.
39  */
40 
41 #include "portable.h"
42 
43 #if SIZEOF_WCHAR_T >= 4
44 /* These routines assume ( sizeof(wchar_t) >= 4 ) */
45 
#include <stdio.h>
#include <limits.h>			/* For MB_LEN_MAX */
#include <ac/stdlib.h>		/* For wctomb, wcstombs, mbtowc, mbstowcs */
#include <ac/string.h>
#include <ac/time.h>		/* for time_t */

#include "ldap-int.h"

#include <ldap_utf8.h>
54 
/* Data-bit masks for the first byte of a UTF-8 sequence, indexed by the
   sequence length in bytes (1 through 6).  Index 0 is only a placeholder:
   the decoders reject a length of 0 before they index this table.
   The table is never written, so it is declared const. */
static const unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
56 
57 
58 /*-----------------------------------------------------------------------------
59 					UTF-8 Format Summary
60 
61 ASCII chars 						7 bits
62     0xxxxxxx
63 
64 2-character UTF-8 sequence:        11 bits
65     110xxxxx  10xxxxxx
66 
67 3-character UTF-8                  16 bits
68     1110xxxx  10xxxxxx  10xxxxxx
69 
70 4-char UTF-8                       21 bits
71     11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
72 
73 5-char UTF-8                       26 bits
74     111110xx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
75 
76 6-char UTF-8                       31 bits
77     1111110x  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
78 
79 Unicode address space   (0 - 0x10FFFF)    21 bits
80 ISO-10646 address space (0 - 0x7FFFFFFF)  31 bits
81 
82 Note: This code does not prevent UTF-8 sequences which are longer than
83       necessary from being decoded.
84 */
85 
86 /*-----------------------------------------------------------------------------
87    Convert a UTF-8 character to a wide char.
88    Return the length of the UTF-8 input character in bytes.
89 */
90 int
91 ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
92 {
93 	int utflen, i;
94 	wchar_t ch;
95 
96 	if (utf8char == NULL) return -1;
97 
98 	/* Get UTF-8 sequence length from 1st byte */
99 	utflen = LDAP_UTF8_CHARLEN2(utf8char, utflen);
100 
101 	if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
102 
103 	/* First byte minus length tag */
104 	ch = (wchar_t)(utf8char[0] & mask[utflen]);
105 
106 	for(i=1; i < utflen; i++) {
107 		/* Subsequent bytes must start with 10 */
108 		if ((utf8char[i] & 0xc0) != 0x80) return -1;
109 
110 		ch <<= 6;			/* 6 bits of data in each subsequent byte */
111 		ch |= (wchar_t)(utf8char[i] & 0x3f);
112 	}
113 
114 	if (wchar) *wchar = ch;
115 
116 	return utflen;
117 }
118 
/*-----------------------------------------------------------------------------
   Convert a UTF-8 string to a wide char string.

   wcstr    Output buffer, or NULL to only count the wide chars required
            (in which case 'count' is ignored).
   utf8str  NUL-terminated UTF-8 input; NULL is treated as an empty string.
   count    Capacity of 'wcstr' in wide chars.  No more than 'count' wide
            chars are ever written, terminator included.

   Returns the size of the converted string in wide chars, excluding the
   null terminator, or -1 if the input contains an invalid UTF-8 sequence.
   The output is null terminated only if there is room left in the buffer.
*/
int
ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
{
	size_t wclen = 0;
	int utflen;
	wchar_t ch;

	/* If input ptr is NULL or empty, the result is an empty string.
	   Only store the terminator when the buffer can hold it, so that a
	   zero-sized buffer is never written to. */
	if ( utf8str == NULL || !*utf8str ) {
		if ( wcstr && count > 0 )
			*wcstr = 0;
		return 0;
	}

	/* Examine next UTF-8 character.  If output buffer is NULL, ignore count */
	while ( *utf8str && ( wcstr == NULL || wclen < count ) ) {
		/* Decode one character with the single-character routine so the
		   validation rules live in exactly one place. */
		utflen = ldap_x_utf8_to_wc( &ch, utf8str );
		if ( utflen == -1 )
			return -1;

		if ( wcstr )
			wcstr[wclen] = ch;

		utf8str += utflen;	/* Move to next UTF-8 character */
		wclen++;			/* Count number of wide chars stored/required */
	}

	/* Add null terminator if there's room in the buffer. */
	if ( wcstr && wclen < count )
		wcstr[wclen] = 0;

	return (int)wclen;
}
168 
169 
/*-----------------------------------------------------------------------------
   Encode one wide char as a UTF-8 character.

   utf8char  Output buffer, or NULL to only compute the encoded length
             (in which case 'count' is ignored).
   wchar     Wide character to encode.
   count     Capacity of 'utf8char' in bytes.

   Returns the length of the UTF-8 character in bytes (1 to 6), 0 if the
   encoded character does not fit in 'count' bytes (nothing is written),
   or -1 if 'wchar' is negative or does not fit in 31 bits.
   No null terminator is written.
*/
int
ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
{
	/* Length tag for the first byte, indexed by sequence length. */
	static const unsigned char lead_tag[] =
		{ 0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
	int need;
	int pos;

	/* Work out how many bytes this value occupies in UTF-8. */
	if ( wchar < 0 )
		need = -1;				/* Invalid wide character */
	else if ( wchar < 0x80 )
		need = 1;
	else if ( wchar < 0x800 )
		need = 2;
	else if ( wchar < 0x10000 )
		need = 3;
	else if ( wchar < 0x200000 )
		need = 4;
	else if ( wchar < 0x4000000 )
		need = 5;
	else if ( wchar < 0x80000000 )
		need = 6;
	else
		need = -1;

	/* Length-only query, or nothing that could be encoded. */
	if ( utf8char == NULL || need == -1 )
		return need;

	/* Character will not fit: leave the buffer untouched. */
	if ( count < (size_t)need )
		return 0;

	/* Fill the continuation bytes from the last one backwards, peeling
	   six bits off the value each time. */
	for ( pos = need - 1; pos > 0; pos-- ) {
		utf8char[pos] = 0x80 | ( wchar & 0x3f );
		wchar >>= 6;
	}

	/* Whatever is left belongs in the first byte, under the length tag. */
	utf8char[0] = lead_tag[need] | wchar;

	return need;
}
254 
255 
256 /*-----------------------------------------------------------------------------
257    Convert a wide char string to a UTF-8 string.
258    No more than 'count' bytes will be written to the output buffer.
259    Return the # of bytes written to the output buffer, excl null terminator.
260 */
261 int
262 ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
263 {
264 	int len = 0;
265 	int n;
266 	char *p = utf8str;
267 	wchar_t empty = 0;		/* To avoid use of L"" construct */
268 
269 	if (wcstr == NULL)		/* Treat input ptr NULL as an empty string */
270 		wcstr = &empty;
271 
272 	if (utf8str == NULL)	/* Just compute size of output, excl null */
273 	{
274 		while (*wcstr)
275 		{
276 			/* Get UTF-8 size of next wide char */
277 			n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN);
278 			if (n == -1)
279 				return -1;
280 			len += n;
281 		}
282 
283 		return len;
284 	}
285 
286 
287 	/* Do the actual conversion. */
288 
289 	n = 1;					/* In case of empty wcstr */
290 	while (*wcstr)
291 	{
292 		n = ldap_x_wc_to_utf8( p, *wcstr++, count);
293 
294 		if (n <= 0)  		/* If encoding error (-1) or won't fit (0), quit */
295 			break;
296 
297 		p += n;
298 		count -= n;			/* Space left in output buffer */
299 	}
300 
301 	/* If not enough room for last character, pad remainder with null
302 	   so that return value = original count, indicating buffer full. */
303 	if (n == 0)
304 	{
305 		while (count--)
306 			*p++ = 0;
307 	}
308 
309 	/* Add a null terminator if there's room. */
310 	else if (count)
311 		*p = 0;
312 
313 	if (n == -1)			/* Conversion encountered invalid wide char. */
314 		return -1;
315 
316 	/* Return the number of bytes written to output buffer, excl null. */
317 	return (p - utf8str);
318 }
319 
320 
/*-----------------------------------------------------------------------------
   Convert a UTF-8 character to a MultiByte character.

   mbchar    Output buffer for the multibyte character, or NULL to only
             obtain its size.
   utf8char  First byte of the UTF-8 character to convert.
   f_wctomb  Wide-char to multibyte converter; NULL selects the C library
             wctomb().

   Returns whatever the converter returns (for wctomb(), the size of the
   multibyte character in bytes, or -1 if it cannot be represented), or -1
   if 'utf8char' is not a valid UTF-8 character.
*/
int
ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
		int (*f_wctomb)(char *mbchar, wchar_t wchar) )
{
	wchar_t wchar;
	int n;
	/* Scratch output used when the caller passes no buffer.  wctomb() may
	   store up to MB_CUR_MAX bytes, which is bounded by MB_LEN_MAX, so a
	   fixed 6-byte buffer can be overrun.  Never go below the historical
	   6 bytes in case a caller-supplied converter relies on that much. */
	char tmp[ MB_LEN_MAX > 6 ? MB_LEN_MAX : 6 ];

	if (f_wctomb == NULL)		/* If no conversion function was given... */
		f_wctomb = wctomb;		/*    use the local ANSI C function */

	/* First convert UTF-8 char to a wide char */
	n = ldap_x_utf8_to_wc( &wchar, utf8char);

	if (n == -1)
		return -1;		/* Invalid UTF-8 character */

	/* Never hand a NULL buffer to the converter: for wctomb() that is a
	   query about state-dependent encodings, not a size request. */
	return f_wctomb( mbchar != NULL ? mbchar : tmp, wchar );
}
349 
/*-----------------------------------------------------------------------------
   Convert a UTF-8 string to a MultiByte string.

   mbstr       Output buffer; passed straight through to the converter.
   utf8str     NUL-terminated UTF-8 input; NULL is treated as empty.
   count       Capacity of 'mbstr' in bytes.  No more than 'count' bytes
               will be written to the output buffer.
   f_wcstombs  Wide-string to multibyte converter; NULL selects the C
               library wcstombs().

   Returns the size of the converted string in bytes, excluding the null
   terminator, or -1 on invalid UTF-8 input, on memory allocation failure,
   or if the converter reports an error.
*/
int
ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
		size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
{
	wchar_t *wcs;
	size_t wcsize;
	int n;

	if (f_wcstombs == NULL)		/* If no conversion function was given... */
		f_wcstombs = wcstombs;	/*    use the local ANSI C function */

	if (utf8str == NULL || *utf8str == 0)	/* NULL or empty input string */
	{
		/* Store the terminator only if the buffer has room for it, so a
		   zero-sized buffer is never written to. */
		if (mbstr && count > 0)
			*mbstr = 0;
		return 0;
	}

	/* Allocate memory for the maximum size wchar string that we could get:
	   one wide char per input byte, plus the terminator. */
	wcsize = strlen(utf8str) + 1;
	if (wcsize > (size_t)-1 / sizeof(wchar_t))
		return -1;				/* Byte count would overflow size_t. */
	wcs = (wchar_t *)LDAP_MALLOC(wcsize * sizeof(wchar_t));
	if (wcs == NULL)
		return -1;				/* Memory allocation failure. */

	/* First convert the UTF-8 string to a wide char string */
	n = ldap_x_utf8s_to_wcs( wcs, utf8str, wcsize);

	/* Then convert wide char string to multi-byte string */
	if (n != -1)
	{
		n = (int)f_wcstombs(mbstr, wcs, count);
	}

	LDAP_FREE(wcs);

	return n;
}
392 
393 /*-----------------------------------------------------------------------------
394    Convert a MultiByte character to a UTF-8 character.
395    'mbsize' indicates the number of bytes of 'mbchar' to check.
396    Returns the number of bytes written to the output character.
397 */
398 int
399 ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
400 		int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
401 {
402     wchar_t wchar;
403     int n;
404 
405 	if (f_mbtowc == NULL)		/* If no conversion function was given... */
406 		f_mbtowc = mbtowc;		/*    use the local ANSI C function */
407 
408     if (mbsize == 0)				/* 0 is not valid. */
409         return -1;
410 
411     if (mbchar == NULL || *mbchar == 0)
412     {
413         if (utf8char)
414             *utf8char = 0;
415         return 1;
416     }
417 
418 	/* First convert the MB char to a Wide Char */
419 	n = f_mbtowc( &wchar, mbchar, mbsize);
420 
421 	if (n == -1)
422 		return -1;
423 
424 	/* Convert the Wide Char to a UTF-8 character. */
425 	n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN);
426 
427 	return n;
428 }
429 
430 
/*-----------------------------------------------------------------------------
   Convert a MultiByte string to a UTF-8 string.

   utf8str     Output buffer, or NULL to only compute the required size.
   mbstr       NUL-terminated multibyte input; NULL is treated as empty.
   count       Capacity of 'utf8str' in bytes.  No more than 'count' bytes
               will be written to the output buffer.
   f_mbstowcs  Multibyte to wide-string converter; NULL selects the C
               library mbstowcs().

   Returns the size of the converted string in bytes, excluding the null
   terminator, or -1 on memory allocation failure, if the converter reports
   an error, or if a wide char cannot be encoded as UTF-8.
*/
int
ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
		size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
{
	wchar_t *wcs;
	int n;
	size_t wcsize;

	if (mbstr == NULL)		   /* Treat NULL input string as an empty string */
		mbstr = "";

	if (f_mbstowcs == NULL)		/* If no conversion function was given... */
		f_mbstowcs = mbstowcs;	/*    use the local ANSI C function */

	/* Allocate memory for the maximum size wchar string that we could get:
	   one wide char per input byte, plus the terminator. */
	wcsize = strlen(mbstr) + 1;
	if (wcsize > (size_t)-1 / sizeof(wchar_t))
		return -1;				/* Byte count would overflow size_t. */
	wcs = (wchar_t *)LDAP_MALLOC( wcsize * sizeof(wchar_t) );
	if (wcs == NULL)
		return -1;

	/* First convert multi-byte string to a wide char string */
	n = (int)f_mbstowcs(wcs, mbstr, wcsize);

	/* Convert wide char string to UTF-8 string */
	if (n != -1)
	{
		/* A converter that fills the whole buffer stores no terminator.
		   Force one into the last slot so the scan below cannot run past
		   the allocation; this is harmless when the string is shorter. */
		wcs[wcsize - 1] = 0;

		n = ldap_x_wcs_to_utf8s( utf8str, wcs, count);
	}

	LDAP_FREE(wcs);

	return n;
}
469 
470 #endif
471