xref: /netbsd-src/external/bsd/openldap/dist/libraries/libldap/utf-8-conv.c (revision 549b59ed3ccf0d36d3097190a0db27b770f3a839)
1 /*	$NetBSD: utf-8-conv.c,v 1.3 2021/08/14 16:14:56 christos Exp $	*/
2 
3 /* $OpenLDAP$ */
4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5  *
6  * Copyright 1998-2021 The OpenLDAP Foundation.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted only as authorized by the OpenLDAP
11  * Public License.
12  *
13  * A copy of this license is available in the file LICENSE in the
14  * top-level directory of the distribution or, alternatively, at
15  * <http://www.OpenLDAP.org/license.html>.
16  */
17 /* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
18  *
19  * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
20  * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
21  * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
22  * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
23  * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
24  * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
25  * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
26  * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
27  *---
28  * Note: A verbatim copy of version 2.0.1 of the OpenLDAP Public License
29  * can be found in the file "build/LICENSE-2.0.1" in this distribution
30  * of OpenLDAP Software.
31  */
32 
33 /*
34  * UTF-8 Conversion Routines
35  *
36  * These routines convert between Wide Character and UTF-8,
37  * or between MultiByte and UTF-8 encodings.
38  *
39  * Both single character and string versions of the functions are provided.
40  * All functions return -1 if the character or string cannot be converted.
41  */
42 
43 #include <sys/cdefs.h>
44 __RCSID("$NetBSD: utf-8-conv.c,v 1.3 2021/08/14 16:14:56 christos Exp $");
45 
46 #include "portable.h"
47 
48 #if SIZEOF_WCHAR_T >= 4
49 /* These routines assume ( sizeof(wchar_t) >= 4 ) */
50 
#include <stdio.h>
#include <limits.h>			/* For MB_LEN_MAX */
#include <ac/stdlib.h>		/* For wctomb, wcstombs, mbtowc, mbstowcs */
#include <ac/string.h>
#include <ac/time.h>		/* for time_t */
55 
56 #include "ldap-int.h"
57 
58 #include <ldap_utf8.h>
59 
/*
 * Payload-bit mask for the first byte of a UTF-8 sequence, indexed by the
 * sequence length in bytes (1..6; index 0 is unused).  The table is only
 * ever read, so it is const-qualified.
 */
static const unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
61 
62 
63 /*-----------------------------------------------------------------------------
64 					UTF-8 Format Summary
65 
66 ASCII chars 						7 bits
67     0xxxxxxx
68 
69 2-character UTF-8 sequence:        11 bits
70     110xxxxx  10xxxxxx
71 
72 3-character UTF-8                  16 bits
73     1110xxxx  10xxxxxx  10xxxxxx
74 
75 4-char UTF-8                       21 bits
76     11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
77 
78 5-char UTF-8                       26 bits
79     111110xx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
80 
81 6-char UTF-8                       31 bits
82     1111110x  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx
83 
84 Unicode address space   (0 - 0x10FFFF)    21 bits
85 ISO-10646 address space (0 - 0x7FFFFFFF)  31 bits
86 
87 Note: This code does not prevent UTF-8 sequences which are longer than
88       necessary from being decoded.
89 */
90 
91 /*-----------------------------------------------------------------------------
92    Convert a UTF-8 character to a wide char.
93    Return the length of the UTF-8 input character in bytes.
94 */
95 int
ldap_x_utf8_to_wc(wchar_t * wchar,const char * utf8char)96 ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
97 {
98 	int utflen, i;
99 	wchar_t ch;
100 
101 	if (utf8char == NULL) return -1;
102 
103 	/* Get UTF-8 sequence length from 1st byte */
104 	utflen = LDAP_UTF8_CHARLEN2(utf8char, utflen);
105 
106 	if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
107 
108 	/* First byte minus length tag */
109 	ch = (wchar_t)(utf8char[0] & mask[utflen]);
110 
111 	for(i=1; i < utflen; i++) {
112 		/* Subsequent bytes must start with 10 */
113 		if ((utf8char[i] & 0xc0) != 0x80) return -1;
114 
115 		ch <<= 6;			/* 6 bits of data in each subsequent byte */
116 		ch |= (wchar_t)(utf8char[i] & 0x3f);
117 	}
118 
119 	if (wchar) *wchar = ch;
120 
121 	return utflen;
122 }
123 
124 /*-----------------------------------------------------------------------------
125    Convert a UTF-8 string to a wide char string.
126    No more than 'count' wide chars will be written to the output buffer.
127    Return the size of the converted string in wide chars, excl null terminator.
128 */
129 int
ldap_x_utf8s_to_wcs(wchar_t * wcstr,const char * utf8str,size_t count)130 ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
131 {
132 	size_t wclen = 0;
133 	int utflen, i;
134 	wchar_t ch;
135 
136 
137 	/* If input ptr is NULL or empty... */
138 	if (utf8str == NULL || !*utf8str) {
139 		if ( wcstr )
140 			*wcstr = 0;
141 		return 0;
142 	}
143 
144 	/* Examine next UTF-8 character.  If output buffer is NULL, ignore count */
145 	while ( *utf8str && (wcstr==NULL || wclen<count) ) {
146 		/* Get UTF-8 sequence length from 1st byte */
147 		utflen = LDAP_UTF8_CHARLEN2(utf8str, utflen);
148 
149 		if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
150 
151 		/* First byte minus length tag */
152 		ch = (wchar_t)(utf8str[0] & mask[utflen]);
153 
154 		for(i=1; i < utflen; i++) {
155 			/* Subsequent bytes must start with 10 */
156 			if ((utf8str[i] & 0xc0) != 0x80) return -1;
157 
158 			ch <<= 6;			/* 6 bits of data in each subsequent byte */
159 			ch |= (wchar_t)(utf8str[i] & 0x3f);
160 		}
161 
162 		if (wcstr) wcstr[wclen] = ch;
163 
164 		utf8str += utflen;	/* Move to next UTF-8 character */
165 		wclen++;			/* Count number of wide chars stored/required */
166 	}
167 
168 	/* Add null terminator if there's room in the buffer. */
169 	if (wcstr && wclen < count) wcstr[wclen] = 0;
170 
171 	return wclen;
172 }
173 
174 
/*-----------------------------------------------------------------------------
   Convert one wide char to a UTF-8 character.

   utf8char  output buffer; when NULL nothing is written, 'count' is ignored
             and the return value is the number of bytes that would be needed.
   wchar     value to encode; must lie in 0 .. 0x7FFFFFFF.
   count     capacity of 'utf8char' in bytes.

   Returns the length of the UTF-8 character in bytes (1..6),
   0 if the character does not fit in 'count' bytes (nothing is written),
   or -1 if 'wchar' is outside the encodable range.
   No null terminator is written.
*/
int
ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
{
	/* Length tag OR-ed into the first byte, indexed by sequence length. */
	static const unsigned char lead[] =
		{ 0, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };

	/*
	 * Work on an unsigned copy.  A negative value of a signed wchar_t
	 * converts to something above 0x7fffffff, so the single range check
	 * below rejects negative values as well as values that are too large,
	 * and it also works on platforms where wchar_t is an unsigned type
	 * (where a "wchar < 0" test could never fire).
	 */
	unsigned long c = (unsigned long)wchar;
	int len, i;

	if ( c > 0x7fffffffUL )
		return -1;				/* Invalid wide character */

	if ( c < 0x80UL )
		len = 1;
	else if ( c < 0x800UL )
		len = 2;
	else if ( c < 0x10000UL )
		len = 3;
	else if ( c < 0x200000UL )
		len = 4;
	else if ( c < 0x4000000UL )
		len = 5;
	else
		len = 6;

	if ( utf8char == NULL )		/* Caller only wants the required length */
		return len;

	if ( count < (size_t)len )	/* Won't fit: write nothing */
		return 0;

	/* Emit continuation bytes last-to-first, 6 data bits each... */
	for ( i = len - 1; i > 0; i-- ) {
		utf8char[i] = (char)( 0x80 | ( c & 0x3f ) );
		c >>= 6;
	}

	/* ...whatever is left belongs in the first byte, under the length tag. */
	utf8char[0] = (char)( lead[len] | c );

	return len;
}
270 
271 
272 /*-----------------------------------------------------------------------------
273    Convert a wide char string to a UTF-8 string.
274    No more than 'count' bytes will be written to the output buffer.
275    Return the # of bytes written to the output buffer, excl null terminator.
276 */
277 int
ldap_x_wcs_to_utf8s(char * utf8str,const wchar_t * wcstr,size_t count)278 ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
279 {
280 	int len = 0;
281 	int n;
282 	char *p = utf8str;
283 	wchar_t empty = 0;		/* To avoid use of L"" construct */
284 
285 	if (wcstr == NULL)		/* Treat input ptr NULL as an empty string */
286 		wcstr = &empty;
287 
288 	if (utf8str == NULL)	/* Just compute size of output, excl null */
289 	{
290 		while (*wcstr)
291 		{
292 			/* Get UTF-8 size of next wide char */
293 			n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN);
294 			if (n == -1)
295 				return -1;
296 			len += n;
297 		}
298 
299 		return len;
300 	}
301 
302 
303 	/* Do the actual conversion. */
304 
305 	n = 1;					/* In case of empty wcstr */
306 	while (*wcstr)
307 	{
308 		n = ldap_x_wc_to_utf8( p, *wcstr++, count);
309 
310 		if (n <= 0)  		/* If encoding error (-1) or won't fit (0), quit */
311 			break;
312 
313 		p += n;
314 		count -= n;			/* Space left in output buffer */
315 	}
316 
317 	/* If not enough room for last character, pad remainder with null
318 	   so that return value = original count, indicating buffer full. */
319 	if (n == 0)
320 	{
321 		while (count--)
322 			*p++ = 0;
323 	}
324 
325 	/* Add a null terminator if there's room. */
326 	else if (count)
327 		*p = 0;
328 
329 	if (n == -1)			/* Conversion encountered invalid wide char. */
330 		return -1;
331 
332 	/* Return the number of bytes written to output buffer, excl null. */
333 	return (p - utf8str);
334 }
335 
#ifdef ANDROID
/*
 * Local wctomb()/mbtowc() definitions for ANDROID builds, implemented on top
 * of the restartable variants with a NULL conversion-state argument.
 * NOTE(review): presumably needed because the Android C library does not
 * supply these two functions -- confirm against the targeted NDK.
 */
int
wctomb( char *s, wchar_t wc )
{
	return wcrtomb( s, wc, NULL );
}

int
mbtowc( wchar_t *pwc, const char *s, size_t n )
{
	return mbrtowc( pwc, s, n, NULL );
}
#endif
340 
/*-----------------------------------------------------------------------------
   Convert a UTF-8 character to a MultiByte character.

   mbchar    output buffer; when NULL the conversion is done into a scratch
             buffer so that only the resulting size is reported.
   utf8char  points at the first byte of the UTF-8 sequence.
   f_wctomb  wide-char-to-multibyte converter; NULL selects the C library's
             wctomb().

   Returns the size of the converted character in bytes as reported by the
   converter, or -1 if the UTF-8 sequence is invalid.
*/
int
ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
		int (*f_wctomb)(char *mbchar, wchar_t wchar) )
{
	wchar_t wchar;
	int n;
	/* wctomb() may store up to MB_CUR_MAX bytes, which can exceed 6 in
	   some locales; MB_LEN_MAX is the compile-time upper bound. */
	char tmp[MB_LEN_MAX];

	if (f_wctomb == NULL)		/* If no conversion function was given... */
		f_wctomb = wctomb;		/*    use the local ANSI C function */

	/* First convert UTF-8 char to a wide char */
	n = ldap_x_utf8_to_wc( &wchar, utf8char);

	if (n == -1)
		return -1;		/* Invalid UTF-8 character */

	/* Then convert the wide char, into the scratch buffer if the caller
	   did not supply an output buffer. */
	n = f_wctomb( mbchar != NULL ? mbchar : tmp, wchar );

	return n;
}
369 
/*-----------------------------------------------------------------------------
   Convert a UTF-8 string to a MultiByte string, going through a temporary
   wide char string.

   mbstr       output buffer, passed straight to the converter.
   utf8str     input string; NULL or empty produces an empty result.
   count       passed to the converter as the output size limit in bytes.
   f_wcstombs  wide-string-to-multibyte converter; NULL selects the C
               library's wcstombs().

   Returns the converter's result (size of the converted string in bytes,
   excl null terminator), or -1 on invalid UTF-8 input or allocation failure.
*/
int
ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
		size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
{
	wchar_t *wide;
	size_t capacity;
	int rc;

	/* NULL or empty input: produce an empty output string. */
	if ( utf8str == NULL || *utf8str == 0 ) {
		if ( mbstr != NULL )
			*mbstr = 0;
		return 0;
	}

	/* Fall back to the local ANSI C function if none was supplied. */
	if ( f_wcstombs == NULL )
		f_wcstombs = wcstombs;

	/* Size the scratch buffer for the largest wide string we could get:
	   one wide char per input byte, plus the terminator. */
	capacity = strlen( utf8str ) + 1;
	wide = (wchar_t *)LDAP_MALLOC( capacity * sizeof *wide );
	if ( wide == NULL )
		return -1;				/* Memory allocation failure. */

	/* Step 1: UTF-8 -> wide chars.  Step 2: wide chars -> multibyte. */
	rc = ldap_x_utf8s_to_wcs( wide, utf8str, capacity );
	if ( rc != -1 )
		rc = f_wcstombs( mbstr, wide, count );

	LDAP_FREE( wide );

	return rc;
}
412 
413 /*-----------------------------------------------------------------------------
414    Convert a MultiByte character to a UTF-8 character.
415    'mbsize' indicates the number of bytes of 'mbchar' to check.
416    Returns the number of bytes written to the output character.
417 */
418 int
ldap_x_mb_to_utf8(char * utf8char,const char * mbchar,size_t mbsize,int (* f_mbtowc)(wchar_t * wchar,const char * mbchar,size_t count))419 ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
420 		int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
421 {
422     wchar_t wchar;
423     int n;
424 
425 	if (f_mbtowc == NULL)		/* If no conversion function was given... */
426 		f_mbtowc = mbtowc;		/*    use the local ANSI C function */
427 
428     if (mbsize == 0)				/* 0 is not valid. */
429         return -1;
430 
431     if (mbchar == NULL || *mbchar == 0)
432     {
433         if (utf8char)
434             *utf8char = 0;
435         return 1;
436     }
437 
438 	/* First convert the MB char to a Wide Char */
439 	n = f_mbtowc( &wchar, mbchar, mbsize);
440 
441 	if (n == -1)
442 		return -1;
443 
444 	/* Convert the Wide Char to a UTF-8 character. */
445 	n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN);
446 
447 	return n;
448 }
449 
450 
451 /*-----------------------------------------------------------------------------
452    Convert a MultiByte string to a UTF-8 string.
453    No more than 'count' bytes will be written to the output buffer.
454    Return the size of the converted string in bytes, excl null terminator.
455 */
456 int
ldap_x_mbs_to_utf8s(char * utf8str,const char * mbstr,size_t count,size_t (* f_mbstowcs)(wchar_t * wcstr,const char * mbstr,size_t count))457 ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
458 		size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
459 {
460 	wchar_t *wcs;
461 	int n;
462 	size_t wcsize;
463 
464 	if (mbstr == NULL)		   /* Treat NULL input string as an empty string */
465 		mbstr = "";
466 
467 	if (f_mbstowcs == NULL)		/* If no conversion function was given... */
468 		f_mbstowcs = mbstowcs;	/*    use the local ANSI C function */
469 
470 	/* Allocate memory for the maximum size wchar string that we could get. */
471 	wcsize = strlen(mbstr) + 1;
472 	wcs = (wchar_t *)LDAP_MALLOC( wcsize * sizeof(wchar_t) );
473 	if (wcs == NULL)
474 		return -1;
475 
476 	/* First convert multi-byte string to a wide char string */
477 	n = f_mbstowcs(wcs, mbstr, wcsize);
478 
479 	/* Convert wide char string to UTF-8 string */
480 	if (n != -1)
481 	{
482 		n = ldap_x_wcs_to_utf8s( utf8str, wcs, count);
483 	}
484 
485 	LDAP_FREE(wcs);
486 
487 	return n;
488 }
489 
490 #endif /* SIZEOF_WCHAR_T >= 4 */
491