1 /* $NetBSD: utf-8-conv.c,v 1.1.1.4 2014/05/28 09:58:42 tron Exp $ */ 2 3 /* $OpenLDAP$ */ 4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>. 5 * 6 * Copyright 1998-2014 The OpenLDAP Foundation. 7 * All rights reserved. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted only as authorized by the OpenLDAP 11 * Public License. 12 * 13 * A copy of this license is available in the file LICENSE in the 14 * top-level directory of the distribution or, alternatively, at 15 * <http://www.OpenLDAP.org/license.html>. 16 */ 17 /* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved. 18 * 19 * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND 20 * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT 21 * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS 22 * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE" 23 * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION 24 * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP 25 * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT 26 * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY. 27 *--- 28 * Note: A verbatim copy of version 2.0.1 of the OpenLDAP Public License 29 * can be found in the file "build/LICENSE-2.0.1" in this distribution 30 * of OpenLDAP Software. 31 */ 32 33 /* 34 * UTF-8 Conversion Routines 35 * 36 * These routines convert between Wide Character and UTF-8, 37 * or between MultiByte and UTF-8 encodings. 38 * 39 * Both single character and string versions of the functions are provided. 40 * All functions return -1 if the character or string cannot be converted. 41 */ 42 43 #include "portable.h" 44 45 #if SIZEOF_WCHAR_T >= 4 46 /* These routines assume ( sizeof(wchar_t) >= 4 ) */ 47 48 #include <stdio.h> 49 #include <ac/stdlib.h> /* For wctomb, wcstombs, mbtowc, mbstowcs */ 50 #include <ac/string.h> 51 #include <ac/time.h> /* for time_t */ 52 53 #include "ldap-int.h" 54 55 #include <ldap_utf8.h> 56 57 static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; 58 59 60 /*----------------------------------------------------------------------------- 61 UTF-8 Format Summary 62 63 ASCII chars 7 bits 64 0xxxxxxx 65 66 2-character UTF-8 sequence: 11 bits 67 110xxxxx 10xxxxxx 68 69 3-character UTF-8 16 bits 70 1110xxxx 10xxxxxx 10xxxxxx 71 72 4-char UTF-8 21 bits 73 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 74 75 5-char UTF-8 26 bits 76 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 77 78 6-char UTF-8 31 bits 79 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 80 81 Unicode address space (0 - 0x10FFFF) 21 bits 82 ISO-10646 address space (0 - 0x7FFFFFFF) 31 bits 83 84 Note: This code does not prevent UTF-8 sequences which are longer than 85 necessary from being decoded. 86 */ 87 88 /*----------------------------------------------------------------------------- 89 Convert a UTF-8 character to a wide char. 90 Return the length of the UTF-8 input character in bytes. 91 */ 92 int 93 ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char ) 94 { 95 int utflen, i; 96 wchar_t ch; 97 98 if (utf8char == NULL) return -1; 99 100 /* Get UTF-8 sequence length from 1st byte */ 101 utflen = LDAP_UTF8_CHARLEN2(utf8char, utflen); 102 103 if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1; 104 105 /* First byte minus length tag */ 106 ch = (wchar_t)(utf8char[0] & mask[utflen]); 107 108 for(i=1; i < utflen; i++) { 109 /* Subsequent bytes must start with 10 */ 110 if ((utf8char[i] & 0xc0) != 0x80) return -1; 111 112 ch <<= 6; /* 6 bits of data in each subsequent byte */ 113 ch |= (wchar_t)(utf8char[i] & 0x3f); 114 } 115 116 if (wchar) *wchar = ch; 117 118 return utflen; 119 } 120 121 /*----------------------------------------------------------------------------- 122 Convert a UTF-8 string to a wide char string. 123 No more than 'count' wide chars will be written to the output buffer. 124 Return the size of the converted string in wide chars, excl null terminator. 125 */ 126 int 127 ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count ) 128 { 129 size_t wclen = 0; 130 int utflen, i; 131 wchar_t ch; 132 133 134 /* If input ptr is NULL or empty... */ 135 if (utf8str == NULL || !*utf8str) { 136 if ( wcstr ) 137 *wcstr = 0; 138 return 0; 139 } 140 141 /* Examine next UTF-8 character. If output buffer is NULL, ignore count */ 142 while ( *utf8str && (wcstr==NULL || wclen<count) ) { 143 /* Get UTF-8 sequence length from 1st byte */ 144 utflen = LDAP_UTF8_CHARLEN2(utf8str, utflen); 145 146 if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1; 147 148 /* First byte minus length tag */ 149 ch = (wchar_t)(utf8str[0] & mask[utflen]); 150 151 for(i=1; i < utflen; i++) { 152 /* Subsequent bytes must start with 10 */ 153 if ((utf8str[i] & 0xc0) != 0x80) return -1; 154 155 ch <<= 6; /* 6 bits of data in each subsequent byte */ 156 ch |= (wchar_t)(utf8str[i] & 0x3f); 157 } 158 159 if (wcstr) wcstr[wclen] = ch; 160 161 utf8str += utflen; /* Move to next UTF-8 character */ 162 wclen++; /* Count number of wide chars stored/required */ 163 } 164 165 /* Add null terminator if there's room in the buffer. */ 166 if (wcstr && wclen < count) wcstr[wclen] = 0; 167 168 return wclen; 169 } 170 171 172 /*----------------------------------------------------------------------------- 173 Convert one wide char to a UTF-8 character. 174 Return the length of the converted UTF-8 character in bytes. 175 No more than 'count' bytes will be written to the output buffer. 176 */ 177 int 178 ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count ) 179 { 180 int len=0; 181 182 if (utf8char == NULL) /* Just determine the required UTF-8 char length. */ 183 { /* Ignore count */ 184 if( wchar < 0 ) 185 return -1; 186 if( wchar < 0x80 ) 187 return 1; 188 if( wchar < 0x800 ) 189 return 2; 190 if( wchar < 0x10000 ) 191 return 3; 192 if( wchar < 0x200000 ) 193 return 4; 194 if( wchar < 0x4000000 ) 195 return 5; 196 #if SIZEOF_WCHAR_T > 4 197 /* UL is not strictly needed by ANSI C */ 198 if( wchar < (wchar_t)0x80000000UL ) 199 #endif /* SIZEOF_WCHAR_T > 4 */ 200 return 6; 201 return -1; 202 } 203 204 205 if ( wchar < 0 ) { /* Invalid wide character */ 206 len = -1; 207 208 } else if( wchar < 0x80 ) { 209 if (count >= 1) { 210 utf8char[len++] = (char)wchar; 211 } 212 213 } else if( wchar < 0x800 ) { 214 if (count >=2) { 215 utf8char[len++] = 0xc0 | ( wchar >> 6 ); 216 utf8char[len++] = 0x80 | ( wchar & 0x3f ); 217 } 218 219 } else if( wchar < 0x10000 ) { 220 if (count >= 3) { 221 utf8char[len++] = 0xe0 | ( wchar >> 12 ); 222 utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f ); 223 utf8char[len++] = 0x80 | ( wchar & 0x3f ); 224 } 225 226 } else if( wchar < 0x200000 ) { 227 if (count >= 4) { 228 utf8char[len++] = 0xf0 | ( wchar >> 18 ); 229 utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f ); 230 utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f ); 231 utf8char[len++] = 0x80 | ( wchar & 0x3f ); 232 } 233 234 } else if( wchar < 0x4000000 ) { 235 if (count >= 5) { 236 utf8char[len++] = 0xf8 | ( wchar >> 24 ); 237 utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f ); 238 utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f ); 239 utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f ); 240 utf8char[len++] = 0x80 | ( wchar & 0x3f ); 241 } 242 243 } else 244 #if SIZEOF_WCHAR_T > 4 245 /* UL is not strictly needed by ANSI C */ 246 if( wchar < (wchar_t)0x80000000UL ) 247 #endif /* SIZEOF_WCHAR_T > 4 */ 248 { 249 if (count >= 6) { 250 utf8char[len++] = 0xfc | ( wchar >> 30 ); 251 utf8char[len++] = 0x80 | ( (wchar >> 24) & 0x3f ); 252 utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f ); 253 utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f ); 254 utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f ); 255 utf8char[len++] = 0x80 | ( wchar & 0x3f ); 256 } 257 258 #if SIZEOF_WCHAR_T > 4 259 } else { 260 len = -1; 261 #endif /* SIZEOF_WCHAR_T > 4 */ 262 } 263 264 return len; 265 266 } 267 268 269 /*----------------------------------------------------------------------------- 270 Convert a wide char string to a UTF-8 string. 271 No more than 'count' bytes will be written to the output buffer. 272 Return the # of bytes written to the output buffer, excl null terminator. 273 */ 274 int 275 ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count ) 276 { 277 int len = 0; 278 int n; 279 char *p = utf8str; 280 wchar_t empty = 0; /* To avoid use of L"" construct */ 281 282 if (wcstr == NULL) /* Treat input ptr NULL as an empty string */ 283 wcstr = ∅ 284 285 if (utf8str == NULL) /* Just compute size of output, excl null */ 286 { 287 while (*wcstr) 288 { 289 /* Get UTF-8 size of next wide char */ 290 n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN); 291 if (n == -1) 292 return -1; 293 len += n; 294 } 295 296 return len; 297 } 298 299 300 /* Do the actual conversion. */ 301 302 n = 1; /* In case of empty wcstr */ 303 while (*wcstr) 304 { 305 n = ldap_x_wc_to_utf8( p, *wcstr++, count); 306 307 if (n <= 0) /* If encoding error (-1) or won't fit (0), quit */ 308 break; 309 310 p += n; 311 count -= n; /* Space left in output buffer */ 312 } 313 314 /* If not enough room for last character, pad remainder with null 315 so that return value = original count, indicating buffer full. */ 316 if (n == 0) 317 { 318 while (count--) 319 *p++ = 0; 320 } 321 322 /* Add a null terminator if there's room. */ 323 else if (count) 324 *p = 0; 325 326 if (n == -1) /* Conversion encountered invalid wide char. */ 327 return -1; 328 329 /* Return the number of bytes written to output buffer, excl null. */ 330 return (p - utf8str); 331 } 332 333 #ifdef ANDROID 334 int wctomb(char *s, wchar_t wc) { return wcrtomb(s,wc,NULL); } 335 int mbtowc(wchar_t *pwc, const char *s, size_t n) { return mbrtowc(pwc, s, n, NULL); } 336 #endif 337 338 /*----------------------------------------------------------------------------- 339 Convert a UTF-8 character to a MultiByte character. 340 Return the size of the converted character in bytes. 341 */ 342 int 343 ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char, 344 int (*f_wctomb)(char *mbchar, wchar_t wchar) ) 345 { 346 wchar_t wchar; 347 int n; 348 char tmp[6]; /* Large enough for biggest multibyte char */ 349 350 if (f_wctomb == NULL) /* If no conversion function was given... */ 351 f_wctomb = wctomb; /* use the local ANSI C function */ 352 353 /* First convert UTF-8 char to a wide char */ 354 n = ldap_x_utf8_to_wc( &wchar, utf8char); 355 356 if (n == -1) 357 return -1; /* Invalid UTF-8 character */ 358 359 if (mbchar == NULL) 360 n = f_wctomb( tmp, wchar ); 361 else 362 n = f_wctomb( mbchar, wchar); 363 364 return n; 365 } 366 367 /*----------------------------------------------------------------------------- 368 Convert a UTF-8 string to a MultiByte string. 369 No more than 'count' bytes will be written to the output buffer. 370 Return the size of the converted string in bytes, excl null terminator. 371 */ 372 int 373 ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count, 374 size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) ) 375 { 376 wchar_t *wcs; 377 size_t wcsize; 378 int n; 379 380 if (f_wcstombs == NULL) /* If no conversion function was given... */ 381 f_wcstombs = wcstombs; /* use the local ANSI C function */ 382 383 if (utf8str == NULL || *utf8str == 0) /* NULL or empty input string */ 384 { 385 if (mbstr) 386 *mbstr = 0; 387 return 0; 388 } 389 390 /* Allocate memory for the maximum size wchar string that we could get. */ 391 wcsize = strlen(utf8str) + 1; 392 wcs = (wchar_t *)LDAP_MALLOC(wcsize * sizeof(wchar_t)); 393 if (wcs == NULL) 394 return -1; /* Memory allocation failure. */ 395 396 /* First convert the UTF-8 string to a wide char string */ 397 n = ldap_x_utf8s_to_wcs( wcs, utf8str, wcsize); 398 399 /* Then convert wide char string to multi-byte string */ 400 if (n != -1) 401 { 402 n = f_wcstombs(mbstr, wcs, count); 403 } 404 405 LDAP_FREE(wcs); 406 407 return n; 408 } 409 410 /*----------------------------------------------------------------------------- 411 Convert a MultiByte character to a UTF-8 character. 412 'mbsize' indicates the number of bytes of 'mbchar' to check. 413 Returns the number of bytes written to the output character. 414 */ 415 int 416 ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize, 417 int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) ) 418 { 419 wchar_t wchar; 420 int n; 421 422 if (f_mbtowc == NULL) /* If no conversion function was given... */ 423 f_mbtowc = mbtowc; /* use the local ANSI C function */ 424 425 if (mbsize == 0) /* 0 is not valid. */ 426 return -1; 427 428 if (mbchar == NULL || *mbchar == 0) 429 { 430 if (utf8char) 431 *utf8char = 0; 432 return 1; 433 } 434 435 /* First convert the MB char to a Wide Char */ 436 n = f_mbtowc( &wchar, mbchar, mbsize); 437 438 if (n == -1) 439 return -1; 440 441 /* Convert the Wide Char to a UTF-8 character. */ 442 n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN); 443 444 return n; 445 } 446 447 448 /*----------------------------------------------------------------------------- 449 Convert a MultiByte string to a UTF-8 string. 450 No more than 'count' bytes will be written to the output buffer. 451 Return the size of the converted string in bytes, excl null terminator. 452 */ 453 int 454 ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count, 455 size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) ) 456 { 457 wchar_t *wcs; 458 int n; 459 size_t wcsize; 460 461 if (mbstr == NULL) /* Treat NULL input string as an empty string */ 462 mbstr = ""; 463 464 if (f_mbstowcs == NULL) /* If no conversion function was given... */ 465 f_mbstowcs = mbstowcs; /* use the local ANSI C function */ 466 467 /* Allocate memory for the maximum size wchar string that we could get. */ 468 wcsize = strlen(mbstr) + 1; 469 wcs = (wchar_t *)LDAP_MALLOC( wcsize * sizeof(wchar_t) ); 470 if (wcs == NULL) 471 return -1; 472 473 /* First convert multi-byte string to a wide char string */ 474 n = f_mbstowcs(wcs, mbstr, wcsize); 475 476 /* Convert wide char string to UTF-8 string */ 477 if (n != -1) 478 { 479 n = ldap_x_wcs_to_utf8s( utf8str, wcs, count); 480 } 481 482 LDAP_FREE(wcs); 483 484 return n; 485 } 486 487 #endif /* SIZEOF_WCHAR_T >= 4 */ 488