/* $OpenLDAP: pkg/ldap/libraries/libldap/utf-8-conv.c,v 1.16.2.3 2008/02/11 23:26:41 kurt Exp $ */
/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
 *
 * Copyright 1998-2008 The OpenLDAP Foundation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP
 * Public License.
 *
 * A copy of this license is available in the file LICENSE in the
 * top-level directory of the distribution or, alternatively, at
 * <http://www.OpenLDAP.org/license.html>.
 */
/* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
 *
 * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
 * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
 * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
 * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
 * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
 * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
 * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
 * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
 *---
 * Note: A verbatim copy of version 2.0.1 of the OpenLDAP Public License
 * can be found in the file "build/LICENSE-2.0.1" in this distribution
 * of OpenLDAP Software.
 */

/*
 * UTF-8 Conversion Routines
 *
 * These routines convert between Wide Character and UTF-8,
 * or between MultiByte and UTF-8 encodings.
 *
 * Both single character and string versions of the functions are provided.
 * All functions return -1 if the character or string cannot be converted.
39 */ 40 41 #include "portable.h" 42 43 #if SIZEOF_WCHAR_T >= 4 44 /* These routines assume ( sizeof(wchar_t) >= 4 ) */ 45 46 #include <stdio.h> 47 #include <ac/stdlib.h> /* For wctomb, wcstombs, mbtowc, mbstowcs */ 48 #include <ac/string.h> 49 #include <ac/time.h> /* for time_t */ 50 51 #include "ldap-int.h" 52 53 #include <ldap_utf8.h> 54 55 static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; 56 57 58 /*----------------------------------------------------------------------------- 59 UTF-8 Format Summary 60 61 ASCII chars 7 bits 62 0xxxxxxx 63 64 2-character UTF-8 sequence: 11 bits 65 110xxxxx 10xxxxxx 66 67 3-character UTF-8 16 bits 68 1110xxxx 10xxxxxx 10xxxxxx 69 70 4-char UTF-8 21 bits 71 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 72 73 5-char UTF-8 26 bits 74 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 75 76 6-char UTF-8 31 bits 77 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 78 79 Unicode address space (0 - 0x10FFFF) 21 bits 80 ISO-10646 address space (0 - 0x7FFFFFFF) 31 bits 81 82 Note: This code does not prevent UTF-8 sequences which are longer than 83 necessary from being decoded. 84 */ 85 86 /*----------------------------------------------------------------------------- 87 Convert a UTF-8 character to a wide char. 88 Return the length of the UTF-8 input character in bytes. 
89 */ 90 int 91 ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char ) 92 { 93 int utflen, i; 94 wchar_t ch; 95 96 if (utf8char == NULL) return -1; 97 98 /* Get UTF-8 sequence length from 1st byte */ 99 utflen = LDAP_UTF8_CHARLEN2(utf8char, utflen); 100 101 if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1; 102 103 /* First byte minus length tag */ 104 ch = (wchar_t)(utf8char[0] & mask[utflen]); 105 106 for(i=1; i < utflen; i++) { 107 /* Subsequent bytes must start with 10 */ 108 if ((utf8char[i] & 0xc0) != 0x80) return -1; 109 110 ch <<= 6; /* 6 bits of data in each subsequent byte */ 111 ch |= (wchar_t)(utf8char[i] & 0x3f); 112 } 113 114 if (wchar) *wchar = ch; 115 116 return utflen; 117 } 118 119 /*----------------------------------------------------------------------------- 120 Convert a UTF-8 string to a wide char string. 121 No more than 'count' wide chars will be written to the output buffer. 122 Return the size of the converted string in wide chars, excl null terminator. 123 */ 124 int 125 ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count ) 126 { 127 size_t wclen = 0; 128 int utflen, i; 129 wchar_t ch; 130 131 132 /* If input ptr is NULL or empty... */ 133 if (utf8str == NULL || !*utf8str) { 134 if ( wcstr ) 135 *wcstr = 0; 136 return 0; 137 } 138 139 /* Examine next UTF-8 character. 
If output buffer is NULL, ignore count */ 140 while ( *utf8str && (wcstr==NULL || wclen<count) ) { 141 /* Get UTF-8 sequence length from 1st byte */ 142 utflen = LDAP_UTF8_CHARLEN2(utf8str, utflen); 143 144 if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1; 145 146 /* First byte minus length tag */ 147 ch = (wchar_t)(utf8str[0] & mask[utflen]); 148 149 for(i=1; i < utflen; i++) { 150 /* Subsequent bytes must start with 10 */ 151 if ((utf8str[i] & 0xc0) != 0x80) return -1; 152 153 ch <<= 6; /* 6 bits of data in each subsequent byte */ 154 ch |= (wchar_t)(utf8str[i] & 0x3f); 155 } 156 157 if (wcstr) wcstr[wclen] = ch; 158 159 utf8str += utflen; /* Move to next UTF-8 character */ 160 wclen++; /* Count number of wide chars stored/required */ 161 } 162 163 /* Add null terminator if there's room in the buffer. */ 164 if (wcstr && wclen < count) wcstr[wclen] = 0; 165 166 return wclen; 167 } 168 169 170 /*----------------------------------------------------------------------------- 171 Convert one wide char to a UTF-8 character. 172 Return the length of the converted UTF-8 character in bytes. 173 No more than 'count' bytes will be written to the output buffer. 174 */ 175 int 176 ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count ) 177 { 178 int len=0; 179 180 if (utf8char == NULL) /* Just determine the required UTF-8 char length. 
*/ 181 { /* Ignore count */ 182 if( wchar < 0 ) 183 return -1; 184 if( wchar < 0x80 ) 185 return 1; 186 if( wchar < 0x800 ) 187 return 2; 188 if( wchar < 0x10000 ) 189 return 3; 190 if( wchar < 0x200000 ) 191 return 4; 192 if( wchar < 0x4000000 ) 193 return 5; 194 if( wchar < 0x80000000 ) 195 return 6; 196 return -1; 197 } 198 199 200 if ( wchar < 0 ) { /* Invalid wide character */ 201 len = -1; 202 203 } else if( wchar < 0x80 ) { 204 if (count >= 1) { 205 utf8char[len++] = (char)wchar; 206 } 207 208 } else if( wchar < 0x800 ) { 209 if (count >=2) { 210 utf8char[len++] = 0xc0 | ( wchar >> 6 ); 211 utf8char[len++] = 0x80 | ( wchar & 0x3f ); 212 } 213 214 } else if( wchar < 0x10000 ) { 215 if (count >= 3) { 216 utf8char[len++] = 0xe0 | ( wchar >> 12 ); 217 utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f ); 218 utf8char[len++] = 0x80 | ( wchar & 0x3f ); 219 } 220 221 } else if( wchar < 0x200000 ) { 222 if (count >= 4) { 223 utf8char[len++] = 0xf0 | ( wchar >> 18 ); 224 utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f ); 225 utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f ); 226 utf8char[len++] = 0x80 | ( wchar & 0x3f ); 227 } 228 229 } else if( wchar < 0x4000000 ) { 230 if (count >= 5) { 231 utf8char[len++] = 0xf8 | ( wchar >> 24 ); 232 utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f ); 233 utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f ); 234 utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f ); 235 utf8char[len++] = 0x80 | ( wchar & 0x3f ); 236 } 237 238 } else if( wchar < 0x80000000 ) { 239 if (count >= 6) { 240 utf8char[len++] = 0xfc | ( wchar >> 30 ); 241 utf8char[len++] = 0x80 | ( (wchar >> 24) & 0x3f ); 242 utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f ); 243 utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f ); 244 utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f ); 245 utf8char[len++] = 0x80 | ( wchar & 0x3f ); 246 } 247 248 } else 249 len = -1; 250 251 return len; 252 253 } 254 255 256 /*----------------------------------------------------------------------------- 
257 Convert a wide char string to a UTF-8 string. 258 No more than 'count' bytes will be written to the output buffer. 259 Return the # of bytes written to the output buffer, excl null terminator. 260 */ 261 int 262 ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count ) 263 { 264 int len = 0; 265 int n; 266 char *p = utf8str; 267 wchar_t empty = 0; /* To avoid use of L"" construct */ 268 269 if (wcstr == NULL) /* Treat input ptr NULL as an empty string */ 270 wcstr = ∅ 271 272 if (utf8str == NULL) /* Just compute size of output, excl null */ 273 { 274 while (*wcstr) 275 { 276 /* Get UTF-8 size of next wide char */ 277 n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN); 278 if (n == -1) 279 return -1; 280 len += n; 281 } 282 283 return len; 284 } 285 286 287 /* Do the actual conversion. */ 288 289 n = 1; /* In case of empty wcstr */ 290 while (*wcstr) 291 { 292 n = ldap_x_wc_to_utf8( p, *wcstr++, count); 293 294 if (n <= 0) /* If encoding error (-1) or won't fit (0), quit */ 295 break; 296 297 p += n; 298 count -= n; /* Space left in output buffer */ 299 } 300 301 /* If not enough room for last character, pad remainder with null 302 so that return value = original count, indicating buffer full. */ 303 if (n == 0) 304 { 305 while (count--) 306 *p++ = 0; 307 } 308 309 /* Add a null terminator if there's room. */ 310 else if (count) 311 *p = 0; 312 313 if (n == -1) /* Conversion encountered invalid wide char. */ 314 return -1; 315 316 /* Return the number of bytes written to output buffer, excl null. */ 317 return (p - utf8str); 318 } 319 320 321 /*----------------------------------------------------------------------------- 322 Convert a UTF-8 character to a MultiByte character. 323 Return the size of the converted character in bytes. 
324 */ 325 int 326 ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char, 327 int (*f_wctomb)(char *mbchar, wchar_t wchar) ) 328 { 329 wchar_t wchar; 330 int n; 331 char tmp[6]; /* Large enough for biggest multibyte char */ 332 333 if (f_wctomb == NULL) /* If no conversion function was given... */ 334 f_wctomb = wctomb; /* use the local ANSI C function */ 335 336 /* First convert UTF-8 char to a wide char */ 337 n = ldap_x_utf8_to_wc( &wchar, utf8char); 338 339 if (n == -1) 340 return -1; /* Invalid UTF-8 character */ 341 342 if (mbchar == NULL) 343 n = f_wctomb( tmp, wchar ); 344 else 345 n = f_wctomb( mbchar, wchar); 346 347 return n; 348 } 349 350 /*----------------------------------------------------------------------------- 351 Convert a UTF-8 string to a MultiByte string. 352 No more than 'count' bytes will be written to the output buffer. 353 Return the size of the converted string in bytes, excl null terminator. 354 */ 355 int 356 ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count, 357 size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) ) 358 { 359 wchar_t *wcs; 360 size_t wcsize; 361 int n; 362 363 if (f_wcstombs == NULL) /* If no conversion function was given... */ 364 f_wcstombs = wcstombs; /* use the local ANSI C function */ 365 366 if (utf8str == NULL || *utf8str == 0) /* NULL or empty input string */ 367 { 368 if (mbstr) 369 *mbstr = 0; 370 return 0; 371 } 372 373 /* Allocate memory for the maximum size wchar string that we could get. */ 374 wcsize = strlen(utf8str) + 1; 375 wcs = (wchar_t *)LDAP_MALLOC(wcsize * sizeof(wchar_t)); 376 if (wcs == NULL) 377 return -1; /* Memory allocation failure. 
*/ 378 379 /* First convert the UTF-8 string to a wide char string */ 380 n = ldap_x_utf8s_to_wcs( wcs, utf8str, wcsize); 381 382 /* Then convert wide char string to multi-byte string */ 383 if (n != -1) 384 { 385 n = f_wcstombs(mbstr, wcs, count); 386 } 387 388 LDAP_FREE(wcs); 389 390 return n; 391 } 392 393 /*----------------------------------------------------------------------------- 394 Convert a MultiByte character to a UTF-8 character. 395 'mbsize' indicates the number of bytes of 'mbchar' to check. 396 Returns the number of bytes written to the output character. 397 */ 398 int 399 ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize, 400 int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) ) 401 { 402 wchar_t wchar; 403 int n; 404 405 if (f_mbtowc == NULL) /* If no conversion function was given... */ 406 f_mbtowc = mbtowc; /* use the local ANSI C function */ 407 408 if (mbsize == 0) /* 0 is not valid. */ 409 return -1; 410 411 if (mbchar == NULL || *mbchar == 0) 412 { 413 if (utf8char) 414 *utf8char = 0; 415 return 1; 416 } 417 418 /* First convert the MB char to a Wide Char */ 419 n = f_mbtowc( &wchar, mbchar, mbsize); 420 421 if (n == -1) 422 return -1; 423 424 /* Convert the Wide Char to a UTF-8 character. */ 425 n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN); 426 427 return n; 428 } 429 430 431 /*----------------------------------------------------------------------------- 432 Convert a MultiByte string to a UTF-8 string. 433 No more than 'count' bytes will be written to the output buffer. 434 Return the size of the converted string in bytes, excl null terminator. 
435 */ 436 int 437 ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count, 438 size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) ) 439 { 440 wchar_t *wcs; 441 int n; 442 size_t wcsize; 443 444 if (mbstr == NULL) /* Treat NULL input string as an empty string */ 445 mbstr = ""; 446 447 if (f_mbstowcs == NULL) /* If no conversion function was given... */ 448 f_mbstowcs = mbstowcs; /* use the local ANSI C function */ 449 450 /* Allocate memory for the maximum size wchar string that we could get. */ 451 wcsize = strlen(mbstr) + 1; 452 wcs = (wchar_t *)LDAP_MALLOC( wcsize * sizeof(wchar_t) ); 453 if (wcs == NULL) 454 return -1; 455 456 /* First convert multi-byte string to a wide char string */ 457 n = f_mbstowcs(wcs, mbstr, wcsize); 458 459 /* Convert wide char string to UTF-8 string */ 460 if (n != -1) 461 { 462 n = ldap_x_wcs_to_utf8s( utf8str, wcs, count); 463 } 464 465 LDAP_FREE(wcs); 466 467 return n; 468 } 469 470 #endif 471