xref: /netbsd-src/external/bsd/openldap/dist/libraries/libldap/utf-8-conv.c (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1 /*	$NetBSD: utf-8-conv.c,v 1.1.1.4 2014/05/28 09:58:42 tron Exp $	*/
2 
3 /* $OpenLDAP$ */
4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5  *
6  * Copyright 1998-2014 The OpenLDAP Foundation.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted only as authorized by the OpenLDAP
11  * Public License.
12  *
13  * A copy of this license is available in the file LICENSE in the
14  * top-level directory of the distribution or, alternatively, at
15  * <http://www.OpenLDAP.org/license.html>.
16  */
17 /* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
18  *
19  * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
20  * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
21  * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
22  * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
23  * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
24  * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
25  * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
26  * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
27  *---
28  * Note: A verbatim copy of version 2.0.1 of the OpenLDAP Public License
29  * can be found in the file "build/LICENSE-2.0.1" in this distribution
30  * of OpenLDAP Software.
31  */
32 
33 /*
34  * UTF-8 Conversion Routines
35  *
36  * These routines convert between Wide Character and UTF-8,
37  * or between MultiByte and UTF-8 encodings.
38  *
39  * Both single character and string versions of the functions are provided.
40  * All functions return -1 if the character or string cannot be converted.
41  */
42 
43 #include "portable.h"
44 
45 #if SIZEOF_WCHAR_T >= 4
46 /* These routines assume ( sizeof(wchar_t) >= 4 ) */
47 
#include <stdio.h>
#include <limits.h>		/* For MB_LEN_MAX */
#include <ac/stdlib.h>		/* For wctomb, wcstombs, mbtowc, mbstowcs */
#include <ac/string.h>
#include <ac/time.h>		/* for time_t */

#include "ldap-int.h"

#include <ldap_utf8.h>
56 
/* Mask selecting the payload bits of the first byte of a UTF-8 sequence,
 * indexed by the sequence length in bytes (1..6).  Entry 0 is a placeholder
 * so that the length can be used directly as the index.  Read-only. */
static const unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
58 
59 
60 /*-----------------------------------------------------------------------------
61 					UTF-8 Format Summary
62 
ASCII chars                        7 bits
    0xxxxxxx

2-byte UTF-8 sequence              11 bits
    110xxxxx  10xxxxxx

3-byte UTF-8 sequence              16 bits
    1110xxxx  10xxxxxx  10xxxxxx

4-byte UTF-8 sequence              21 bits
    11110xxx  10xxxxxx  10xxxxxx  10xxxxxx

5-byte UTF-8 sequence              26 bits
    111110xx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx

6-byte UTF-8 sequence              31 bits
    1111110x  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
80 
81 Unicode address space   (0 - 0x10FFFF)    21 bits
82 ISO-10646 address space (0 - 0x7FFFFFFF)  31 bits
83 
84 Note: This code does not prevent UTF-8 sequences which are longer than
85       necessary from being decoded.
86 */
87 
88 /*-----------------------------------------------------------------------------
89    Convert a UTF-8 character to a wide char.
90    Return the length of the UTF-8 input character in bytes.
91 */
92 int
93 ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
94 {
95 	int utflen, i;
96 	wchar_t ch;
97 
98 	if (utf8char == NULL) return -1;
99 
100 	/* Get UTF-8 sequence length from 1st byte */
101 	utflen = LDAP_UTF8_CHARLEN2(utf8char, utflen);
102 
103 	if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
104 
105 	/* First byte minus length tag */
106 	ch = (wchar_t)(utf8char[0] & mask[utflen]);
107 
108 	for(i=1; i < utflen; i++) {
109 		/* Subsequent bytes must start with 10 */
110 		if ((utf8char[i] & 0xc0) != 0x80) return -1;
111 
112 		ch <<= 6;			/* 6 bits of data in each subsequent byte */
113 		ch |= (wchar_t)(utf8char[i] & 0x3f);
114 	}
115 
116 	if (wchar) *wchar = ch;
117 
118 	return utflen;
119 }
120 
/*-----------------------------------------------------------------------------
   Convert a NUL-terminated UTF-8 string to a wide char string.

   wcstr    Output buffer, or NULL to only count the wide chars required
            (in which case 'count' is ignored).
   utf8str  Input string; NULL is treated like an empty string.
   count    Capacity of 'wcstr' in wide chars.  No more than 'count' wide
            chars are ever stored, so nothing is written when it is 0.

   Returns the number of wide chars converted, excluding the terminator,
   or -1 if the input contains an invalid UTF-8 sequence.  The output is
   null-terminated only if there is room left for the terminator.
*/
int
ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
{
	size_t wclen = 0;
	int utflen;
	wchar_t ch;

	/* NULL or empty input converts to an empty wide string. */
	if ( utf8str == NULL || *utf8str == '\0' ) {
		/* Store the terminator only if the buffer can hold it; writing
		   with count == 0 would overrun a zero-length buffer. */
		if ( wcstr != NULL && count > 0 )
			*wcstr = 0;
		return 0;
	}

	/* Decode one character at a time.  Without an output buffer the
	   whole input is scanned; otherwise stop once the buffer is full. */
	while ( *utf8str && ( wcstr == NULL || wclen < count ) ) {
		/* Single-character decoder: yields the sequence length (>= 1)
		   or -1 for a malformed sequence. */
		utflen = ldap_x_utf8_to_wc( &ch, utf8str );
		if ( utflen == -1 )
			return -1;

		if ( wcstr != NULL )
			wcstr[wclen] = ch;

		utf8str += utflen;	/* Move to next UTF-8 character */
		wclen++;			/* Count wide chars stored/required */
	}

	/* Add null terminator if there's room in the buffer. */
	if ( wcstr != NULL && wclen < count )
		wcstr[wclen] = 0;

	return (int)wclen;
}
170 
171 
/*-----------------------------------------------------------------------------
   Convert one wide char to a UTF-8 character.

   utf8char  Output buffer, or NULL to only compute the encoded length
             (in which case 'count' is ignored).
   wchar     Value to encode; must lie in 0 .. 0x7FFFFFFF.
   count     Capacity of 'utf8char' in bytes.

   Returns the length of the UTF-8 sequence in bytes (1..6), 0 if the
   sequence does not fit in 'count' bytes (nothing is written), or -1 if
   'wchar' is outside the encodable range.  No terminator is written.
*/
int
ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
{
	/* Length tag carried in the lead byte, indexed by sequence length. */
	static const unsigned char lead_tag[] =
		{ 0, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
	int len, i;

	/* Only 31 bits can be encoded.  The upper bound is checked for every
	   wchar_t width: when wchar_t is an unsigned 32-bit type the test
	   against 0 alone cannot reject values >= 0x80000000, which would
	   otherwise produce an invalid lead byte. */
	if ( wchar < 0 || wchar > (wchar_t)0x7fffffffL )
		return -1;

	if ( wchar < 0x80 )
		len = 1;
	else if ( wchar < 0x800 )
		len = 2;
	else if ( wchar < 0x10000 )
		len = 3;
	else if ( wchar < 0x200000 )
		len = 4;
	else if ( wchar < 0x4000000 )
		len = 5;
	else
		len = 6;

	if ( utf8char == NULL )		/* Caller only wants the length */
		return len;

	if ( count < (size_t)len )	/* Won't fit: write nothing */
		return 0;

	/* Emit trailing bytes last-to-first, six payload bits each ... */
	for ( i = len - 1; i > 0; i-- ) {
		utf8char[i] = (char)( 0x80 | ( wchar & 0x3f ) );
		wchar >>= 6;
	}

	/* ... then the remaining high bits go into the tagged lead byte. */
	utf8char[0] = (char)( lead_tag[len] | wchar );

	return len;
}
267 
268 
269 /*-----------------------------------------------------------------------------
270    Convert a wide char string to a UTF-8 string.
271    No more than 'count' bytes will be written to the output buffer.
272    Return the # of bytes written to the output buffer, excl null terminator.
273 */
274 int
275 ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
276 {
277 	int len = 0;
278 	int n;
279 	char *p = utf8str;
280 	wchar_t empty = 0;		/* To avoid use of L"" construct */
281 
282 	if (wcstr == NULL)		/* Treat input ptr NULL as an empty string */
283 		wcstr = &empty;
284 
285 	if (utf8str == NULL)	/* Just compute size of output, excl null */
286 	{
287 		while (*wcstr)
288 		{
289 			/* Get UTF-8 size of next wide char */
290 			n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN);
291 			if (n == -1)
292 				return -1;
293 			len += n;
294 		}
295 
296 		return len;
297 	}
298 
299 
300 	/* Do the actual conversion. */
301 
302 	n = 1;					/* In case of empty wcstr */
303 	while (*wcstr)
304 	{
305 		n = ldap_x_wc_to_utf8( p, *wcstr++, count);
306 
307 		if (n <= 0)  		/* If encoding error (-1) or won't fit (0), quit */
308 			break;
309 
310 		p += n;
311 		count -= n;			/* Space left in output buffer */
312 	}
313 
314 	/* If not enough room for last character, pad remainder with null
315 	   so that return value = original count, indicating buffer full. */
316 	if (n == 0)
317 	{
318 		while (count--)
319 			*p++ = 0;
320 	}
321 
322 	/* Add a null terminator if there's room. */
323 	else if (count)
324 		*p = 0;
325 
326 	if (n == -1)			/* Conversion encountered invalid wide char. */
327 		return -1;
328 
329 	/* Return the number of bytes written to output buffer, excl null. */
330 	return (p - utf8str);
331 }
332 
#ifdef ANDROID
/* When building for Android, supply wctomb() and mbtowc() on top of the
 * restartable wcrtomb()/mbrtowc(), passing a NULL conversion state.
 * NOTE(review): presumably needed because the Android C library lacked
 * these two functions -- confirm against the targeted NDK. */
int
wctomb ( char *s, wchar_t wc )
{
	return wcrtomb( s, wc, NULL );
}

int
mbtowc ( wchar_t *pwc, const char *s, size_t n )
{
	return mbrtowc( pwc, s, n, NULL );
}
#endif /* ANDROID */
337 
/*-----------------------------------------------------------------------------
   Convert a UTF-8 character to a MultiByte character.

   mbchar    Output buffer, or NULL to only obtain the size of the
             converted character.
   utf8char  UTF-8 character to convert.
   f_wctomb  Wide char to multibyte converter; NULL selects wctomb().

   Returns whatever the converter returns for the decoded wide char
   (for wctomb(), the size of the multibyte character in bytes), or -1
   if 'utf8char' is not a valid UTF-8 character.
*/
int
ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
		int (*f_wctomb)(char *mbchar, wchar_t wchar) )
{
	wchar_t wchar;
	/* Scratch output for the size-only case.  wctomb() may store up to
	   MB_CUR_MAX bytes, which is bounded by MB_LEN_MAX and can exceed 6;
	   never go below the 6 bytes this buffer has always offered. */
	char tmp[MB_LEN_MAX > 6 ? MB_LEN_MAX : 6];

	if ( f_wctomb == NULL )		/* If no conversion function was given... */
		f_wctomb = wctomb;		/*    use the local ANSI C function */

	/* First convert UTF-8 char to a wide char */
	if ( ldap_x_utf8_to_wc( &wchar, utf8char ) == -1 )
		return -1;				/* Invalid UTF-8 character */

	/* A NULL destination must not reach the converter: wctomb(NULL, ...)
	   reports shift-state information instead of converting. */
	return f_wctomb( mbchar != NULL ? mbchar : tmp, wchar );
}
366 
/*-----------------------------------------------------------------------------
   Convert a UTF-8 string to a MultiByte string.

   mbstr       Output buffer, handed unchanged to the converter.
   utf8str     Input string; NULL or empty yields an empty result.
   count       Capacity of 'mbstr' in bytes.  No more than 'count' bytes
               are written, so nothing is stored when it is 0.
   f_wcstombs  Wide string to multibyte converter; NULL selects wcstombs().

   Returns the converter's result narrowed to int (for wcstombs(), the
   size of the converted string in bytes, excl null terminator), or -1 on
   invalid UTF-8 input or memory allocation failure.
*/
int
ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
		size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
{
	wchar_t *wcs;
	size_t wcsize;
	int n;

	if ( f_wcstombs == NULL )	/* If no conversion function was given... */
		f_wcstombs = wcstombs;	/*    use the local ANSI C function */

	if ( utf8str == NULL || *utf8str == '\0' ) {	/* NULL or empty input */
		/* Terminate the output only if the buffer can hold the NUL;
		   with count == 0 even one byte would be an overrun. */
		if ( mbstr != NULL && count > 0 )
			*mbstr = 0;
		return 0;
	}

	/* A UTF-8 string never decodes to more wide chars than it has bytes,
	   so strlen + 1 elements hold the result plus its terminator. */
	wcsize = strlen( utf8str ) + 1;
	wcs = (wchar_t *)LDAP_MALLOC( wcsize * sizeof(wchar_t) );
	if ( wcs == NULL )
		return -1;				/* Memory allocation failure. */

	/* First convert the UTF-8 string to a wide char string */
	n = ldap_x_utf8s_to_wcs( wcs, utf8str, wcsize );

	/* Then convert wide char string to multi-byte string */
	if ( n != -1 )
		n = f_wcstombs( mbstr, wcs, count );

	LDAP_FREE( wcs );

	return n;
}
409 
410 /*-----------------------------------------------------------------------------
411    Convert a MultiByte character to a UTF-8 character.
412    'mbsize' indicates the number of bytes of 'mbchar' to check.
413    Returns the number of bytes written to the output character.
414 */
415 int
416 ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
417 		int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
418 {
419     wchar_t wchar;
420     int n;
421 
422 	if (f_mbtowc == NULL)		/* If no conversion function was given... */
423 		f_mbtowc = mbtowc;		/*    use the local ANSI C function */
424 
425     if (mbsize == 0)				/* 0 is not valid. */
426         return -1;
427 
428     if (mbchar == NULL || *mbchar == 0)
429     {
430         if (utf8char)
431             *utf8char = 0;
432         return 1;
433     }
434 
435 	/* First convert the MB char to a Wide Char */
436 	n = f_mbtowc( &wchar, mbchar, mbsize);
437 
438 	if (n == -1)
439 		return -1;
440 
441 	/* Convert the Wide Char to a UTF-8 character. */
442 	n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN);
443 
444 	return n;
445 }
446 
447 
/*-----------------------------------------------------------------------------
   Convert a MultiByte string to a UTF-8 string.

   utf8str     Output buffer, handed unchanged to the wide-to-UTF-8 step.
   mbstr       Input string; NULL is treated like an empty string.
   count       Capacity of 'utf8str' in bytes; never exceeded.
   f_mbstowcs  Multibyte to wide string converter; NULL selects mbstowcs().

   Returns the result of the wide-to-UTF-8 step (the size of the converted
   string in bytes, excl null terminator), or -1 if either conversion step
   or the memory allocation fails.
*/
int
ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
		size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
{
	wchar_t *wide;
	size_t capacity;
	int rc;

	if ( !mbstr )			/* NULL input counts as "" */
		mbstr = "";

	if ( !f_mbstowcs )		/* Default to the ANSI C converter */
		f_mbstowcs = mbstowcs;

	/* One wide char per input byte plus a terminator is the most the
	   intermediate string can need. */
	capacity = strlen( mbstr ) + 1;
	wide = (wchar_t *)LDAP_MALLOC( capacity * sizeof(wchar_t) );
	if ( !wide )
		return -1;

	/* Multibyte -> wide chars, then (if that worked) wide chars -> UTF-8. */
	rc = f_mbstowcs( wide, mbstr, capacity );
	if ( rc != -1 )
		rc = ldap_x_wcs_to_utf8s( utf8str, wide, count );

	LDAP_FREE( wide );

	return rc;
}
486 
487 #endif /* SIZEOF_WCHAR_T >= 4 */
488