1 /* $NetBSD: utf-8-conv.c,v 1.3 2021/08/14 16:14:56 christos Exp $ */
2
3 /* $OpenLDAP$ */
4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5 *
6 * Copyright 1998-2021 The OpenLDAP Foundation.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted only as authorized by the OpenLDAP
11 * Public License.
12 *
13 * A copy of this license is available in the file LICENSE in the
14 * top-level directory of the distribution or, alternatively, at
15 * <http://www.OpenLDAP.org/license.html>.
16 */
17 /* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
18 *
19 * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
20 * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
21 * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
22 * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
23 * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
24 * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
25 * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
26 * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
27 *---
28 * Note: A verbatim copy of version 2.0.1 of the OpenLDAP Public License
29 * can be found in the file "build/LICENSE-2.0.1" in this distribution
30 * of OpenLDAP Software.
31 */
32
33 /*
34 * UTF-8 Conversion Routines
35 *
36 * These routines convert between Wide Character and UTF-8,
37 * or between MultiByte and UTF-8 encodings.
38 *
39 * Both single character and string versions of the functions are provided.
40 * All functions return -1 if the character or string cannot be converted.
41 */
42
43 #include <sys/cdefs.h>
44 __RCSID("$NetBSD: utf-8-conv.c,v 1.3 2021/08/14 16:14:56 christos Exp $");
45
46 #include "portable.h"
47
48 #if SIZEOF_WCHAR_T >= 4
49 /* These routines assume ( sizeof(wchar_t) >= 4 ) */
50
#include <limits.h>			/* For MB_LEN_MAX */
#include <stdio.h>
#include <ac/stdlib.h>		/* For wctomb, wcstombs, mbtowc, mbstowcs */
#include <ac/string.h>
#include <ac/time.h>		/* for time_t */

#include "ldap-int.h"

#include <ldap_utf8.h>
59
/*
 * Payload-bit masks for the first byte of a UTF-8 sequence, indexed by the
 * sequence length in bytes (1..6).  Index 0 is a placeholder so the length
 * can be used directly as the index.  The table is only ever read, so it is
 * declared const.
 */
static const unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
61
62
63 /*-----------------------------------------------------------------------------
64 UTF-8 Format Summary
65
66 ASCII chars 7 bits
67 0xxxxxxx
68
69 2-character UTF-8 sequence: 11 bits
70 110xxxxx 10xxxxxx
71
72 3-character UTF-8 16 bits
73 1110xxxx 10xxxxxx 10xxxxxx
74
75 4-char UTF-8 21 bits
76 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
77
78 5-char UTF-8 26 bits
79 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
80
81 6-char UTF-8 31 bits
82 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
83
84 Unicode address space (0 - 0x10FFFF) 21 bits
85 ISO-10646 address space (0 - 0x7FFFFFFF) 31 bits
86
87 Note: This code does not prevent UTF-8 sequences which are longer than
88 necessary from being decoded.
89 */
90
91 /*-----------------------------------------------------------------------------
92 Convert a UTF-8 character to a wide char.
93 Return the length of the UTF-8 input character in bytes.
94 */
95 int
ldap_x_utf8_to_wc(wchar_t * wchar,const char * utf8char)96 ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
97 {
98 int utflen, i;
99 wchar_t ch;
100
101 if (utf8char == NULL) return -1;
102
103 /* Get UTF-8 sequence length from 1st byte */
104 utflen = LDAP_UTF8_CHARLEN2(utf8char, utflen);
105
106 if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
107
108 /* First byte minus length tag */
109 ch = (wchar_t)(utf8char[0] & mask[utflen]);
110
111 for(i=1; i < utflen; i++) {
112 /* Subsequent bytes must start with 10 */
113 if ((utf8char[i] & 0xc0) != 0x80) return -1;
114
115 ch <<= 6; /* 6 bits of data in each subsequent byte */
116 ch |= (wchar_t)(utf8char[i] & 0x3f);
117 }
118
119 if (wchar) *wchar = ch;
120
121 return utflen;
122 }
123
124 /*-----------------------------------------------------------------------------
125 Convert a UTF-8 string to a wide char string.
126 No more than 'count' wide chars will be written to the output buffer.
127 Return the size of the converted string in wide chars, excl null terminator.
128 */
129 int
ldap_x_utf8s_to_wcs(wchar_t * wcstr,const char * utf8str,size_t count)130 ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
131 {
132 size_t wclen = 0;
133 int utflen, i;
134 wchar_t ch;
135
136
137 /* If input ptr is NULL or empty... */
138 if (utf8str == NULL || !*utf8str) {
139 if ( wcstr )
140 *wcstr = 0;
141 return 0;
142 }
143
144 /* Examine next UTF-8 character. If output buffer is NULL, ignore count */
145 while ( *utf8str && (wcstr==NULL || wclen<count) ) {
146 /* Get UTF-8 sequence length from 1st byte */
147 utflen = LDAP_UTF8_CHARLEN2(utf8str, utflen);
148
149 if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;
150
151 /* First byte minus length tag */
152 ch = (wchar_t)(utf8str[0] & mask[utflen]);
153
154 for(i=1; i < utflen; i++) {
155 /* Subsequent bytes must start with 10 */
156 if ((utf8str[i] & 0xc0) != 0x80) return -1;
157
158 ch <<= 6; /* 6 bits of data in each subsequent byte */
159 ch |= (wchar_t)(utf8str[i] & 0x3f);
160 }
161
162 if (wcstr) wcstr[wclen] = ch;
163
164 utf8str += utflen; /* Move to next UTF-8 character */
165 wclen++; /* Count number of wide chars stored/required */
166 }
167
168 /* Add null terminator if there's room in the buffer. */
169 if (wcstr && wclen < count) wcstr[wclen] = 0;
170
171 return wclen;
172 }
173
174
/*-----------------------------------------------------------------------------
   Convert one wide char to a UTF-8 character.

   utf8char  Output buffer; may be NULL, in which case only the required
             length is computed and 'count' is ignored.
   wchar     Wide character to encode; must lie in 0 .. 0x7FFFFFFF.
   count     Capacity of 'utf8char' in bytes.

   Returns the length of the UTF-8 sequence in bytes (1..6), 0 if the
   sequence does not fit in 'count' bytes (nothing is written in that case),
   or -1 if 'wchar' is outside the encodable range.  No terminator is
   written.
*/
int
ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
{
	/* Length tag placed in the first byte, indexed by sequence length */
	static const unsigned char lead[] =
		{ 0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc };
	unsigned long code;
	int len, i;

	/* Reject anything outside the 31-bit range in both directions.  The
	   upper bound is checked unconditionally so that the result does not
	   depend on wchar_t being signed: with an unsigned 4-byte wchar_t a
	   value >= 0x80000000 is not negative and would otherwise be encoded
	   as a 6-byte sequence with an invalid lead byte. */
	if ( wchar < 0 || wchar > (wchar_t)0x7fffffffL ) {
		return -1;
	}
	code = (unsigned long)wchar;

	/* Number of bytes needed for this code point */
	if ( code < 0x80UL ) {
		len = 1;
	} else if ( code < 0x800UL ) {
		len = 2;
	} else if ( code < 0x10000UL ) {
		len = 3;
	} else if ( code < 0x200000UL ) {
		len = 4;
	} else if ( code < 0x4000000UL ) {
		len = 5;
	} else {
		len = 6;
	}

	if ( utf8char == NULL ) {	/* Caller only wants the required length */
		return len;
	}

	if ( count < (size_t)len ) {	/* Won't fit: write nothing */
		return 0;
	}

	/* Fill trailing bytes back to front, 6 payload bits each (10xxxxxx) */
	for ( i = len - 1; i > 0; i-- ) {
		utf8char[i] = (char)( 0x80 | ( code & 0x3f ) );
		code >>= 6;
	}

	/* Whatever is left belongs in the first byte, below the length tag */
	utf8char[0] = (char)( lead[len] | code );

	return len;
}
270
271
272 /*-----------------------------------------------------------------------------
273 Convert a wide char string to a UTF-8 string.
274 No more than 'count' bytes will be written to the output buffer.
275 Return the # of bytes written to the output buffer, excl null terminator.
276 */
277 int
ldap_x_wcs_to_utf8s(char * utf8str,const wchar_t * wcstr,size_t count)278 ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
279 {
280 int len = 0;
281 int n;
282 char *p = utf8str;
283 wchar_t empty = 0; /* To avoid use of L"" construct */
284
285 if (wcstr == NULL) /* Treat input ptr NULL as an empty string */
286 wcstr = ∅
287
288 if (utf8str == NULL) /* Just compute size of output, excl null */
289 {
290 while (*wcstr)
291 {
292 /* Get UTF-8 size of next wide char */
293 n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN);
294 if (n == -1)
295 return -1;
296 len += n;
297 }
298
299 return len;
300 }
301
302
303 /* Do the actual conversion. */
304
305 n = 1; /* In case of empty wcstr */
306 while (*wcstr)
307 {
308 n = ldap_x_wc_to_utf8( p, *wcstr++, count);
309
310 if (n <= 0) /* If encoding error (-1) or won't fit (0), quit */
311 break;
312
313 p += n;
314 count -= n; /* Space left in output buffer */
315 }
316
317 /* If not enough room for last character, pad remainder with null
318 so that return value = original count, indicating buffer full. */
319 if (n == 0)
320 {
321 while (count--)
322 *p++ = 0;
323 }
324
325 /* Add a null terminator if there's room. */
326 else if (count)
327 *p = 0;
328
329 if (n == -1) /* Conversion encountered invalid wide char. */
330 return -1;
331
332 /* Return the number of bytes written to output buffer, excl null. */
333 return (p - utf8str);
334 }
335
#ifdef ANDROID
/*
 * Local definitions of wctomb() and mbtowc() for ANDROID builds
 * (presumably because the C library there does not supply them -- confirm
 * against the target platform).  Each one forwards to the restartable
 * variant, passing NULL as the conversion-state pointer.
 */
int
wctomb( char *s, wchar_t wc )
{
	return wcrtomb( s, wc, NULL );
}

int
mbtowc( wchar_t *pwc, const char *s, size_t n )
{
	return mbrtowc( pwc, s, n, NULL );
}
#endif /* ANDROID */
340
/*-----------------------------------------------------------------------------
   Convert a UTF-8 character to a MultiByte character.

   mbchar    Output buffer; may be NULL, in which case the character is
             converted into a scratch buffer and only its size is reported.
   utf8char  First byte of the UTF-8 sequence to convert.
   f_wctomb  Wide-char to multibyte converter; NULL selects the C library's
             wctomb().

   Returns the value returned by the converter (the size of the converted
   character in bytes), or -1 if the UTF-8 sequence is invalid.
*/
int
ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
		int (*f_wctomb)(char *mbchar, wchar_t wchar) )
{
	wchar_t wchar;
	int n;
	/* Scratch space for the mbchar == NULL case.  wctomb() may store up to
	   MB_CUR_MAX bytes, which is bounded by MB_LEN_MAX; a fixed 6-byte
	   buffer is too small on platforms where MB_LEN_MAX is larger.  Never
	   go below the historical 6 bytes. */
	char tmp[MB_LEN_MAX > 6 ? MB_LEN_MAX : 6];

	if ( f_wctomb == NULL ) {	/* If no conversion function was given... */
		f_wctomb = wctomb;		/* use the local ANSI C function */
	}

	/* First convert the UTF-8 char to a wide char */
	n = ldap_x_utf8_to_wc( &wchar, utf8char );
	if ( n == -1 ) {
		return -1;				/* Invalid UTF-8 character */
	}

	/* Then convert the wide char, into the caller's buffer if there is one */
	return f_wctomb( mbchar != NULL ? mbchar : tmp, wchar );
}
369
/*-----------------------------------------------------------------------------
   Convert a UTF-8 string to a MultiByte string by way of a temporary wide
   char string.

   mbstr       Output buffer, handed unchanged to the converter.
   utf8str     Input string; NULL or empty yields an empty result.
   count       Byte limit handed to the converter.
   f_wcstombs  Wide-string to multibyte converter; NULL selects the C
               library's wcstombs().

   Returns the converter's result (size of the converted string in bytes,
   excl null terminator), 0 for NULL/empty input, or -1 on allocation
   failure or invalid UTF-8.
*/
int
ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
		size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
{
	wchar_t *wide;
	size_t capacity;
	int result;

	/* NULL or empty input string */
	if ( utf8str == NULL || *utf8str == '\0' ) {
		if ( mbstr != NULL ) {
			*mbstr = 0;
		}
		return 0;
	}

	/* One wide char per input byte plus a terminator is the most the
	   decoded string can ever need. */
	capacity = strlen( utf8str ) + 1;
	wide = (wchar_t *)LDAP_MALLOC( capacity * sizeof(wchar_t) );
	if ( wide == NULL ) {
		return -1;				/* Memory allocation failure. */
	}

	/* Step 1: UTF-8 -> wide chars */
	result = ldap_x_utf8s_to_wcs( wide, utf8str, capacity );

	/* Step 2: wide chars -> multibyte, skipped if step 1 failed */
	if ( result != -1 ) {
		if ( f_wcstombs == NULL ) {
			f_wcstombs = wcstombs;	/* fall back to the ANSI C function */
		}
		result = f_wcstombs( mbstr, wide, count );
	}

	LDAP_FREE( wide );

	return result;
}
412
413 /*-----------------------------------------------------------------------------
414 Convert a MultiByte character to a UTF-8 character.
415 'mbsize' indicates the number of bytes of 'mbchar' to check.
416 Returns the number of bytes written to the output character.
417 */
418 int
ldap_x_mb_to_utf8(char * utf8char,const char * mbchar,size_t mbsize,int (* f_mbtowc)(wchar_t * wchar,const char * mbchar,size_t count))419 ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
420 int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
421 {
422 wchar_t wchar;
423 int n;
424
425 if (f_mbtowc == NULL) /* If no conversion function was given... */
426 f_mbtowc = mbtowc; /* use the local ANSI C function */
427
428 if (mbsize == 0) /* 0 is not valid. */
429 return -1;
430
431 if (mbchar == NULL || *mbchar == 0)
432 {
433 if (utf8char)
434 *utf8char = 0;
435 return 1;
436 }
437
438 /* First convert the MB char to a Wide Char */
439 n = f_mbtowc( &wchar, mbchar, mbsize);
440
441 if (n == -1)
442 return -1;
443
444 /* Convert the Wide Char to a UTF-8 character. */
445 n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN);
446
447 return n;
448 }
449
450
/*-----------------------------------------------------------------------------
   Convert a MultiByte string to a UTF-8 string by way of a temporary wide
   char string.

   utf8str     Output buffer, passed on to ldap_x_wcs_to_utf8s().
   mbstr       Input string; NULL is treated as an empty string.
   count       Capacity of 'utf8str' in bytes.
   f_mbstowcs  Multibyte to wide-string converter; NULL selects the C
               library's mbstowcs().

   Returns the result of ldap_x_wcs_to_utf8s() (size of the converted string
   in bytes, excl null terminator), or -1 on allocation failure or when the
   converter reports an error.
*/
int
ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
		size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
{
	wchar_t *wide;
	size_t capacity;
	int result;

	if ( f_mbstowcs == NULL ) {	/* If no conversion function was given... */
		f_mbstowcs = mbstowcs;	/* use the local ANSI C function */
	}

	if ( mbstr == NULL ) {		/* NULL input counts as an empty string */
		mbstr = "";
	}

	/* One wide char per input byte plus a terminator is the most the
	   converted string can ever need. */
	capacity = strlen( mbstr ) + 1;
	wide = (wchar_t *)LDAP_MALLOC( capacity * sizeof(wchar_t) );
	if ( wide == NULL ) {
		return -1;
	}

	/* Step 1: multibyte -> wide chars */
	result = f_mbstowcs( wide, mbstr, capacity );

	/* Step 2: wide chars -> UTF-8, skipped if step 1 failed */
	if ( result != -1 ) {
		result = ldap_x_wcs_to_utf8s( utf8str, wide, count );
	}

	LDAP_FREE( wide );

	return result;
}
489
490 #endif /* SIZEOF_WCHAR_T >= 4 */
491