1 /* $NetBSD: utf-8.c,v 1.1.1.2 2010/03/08 02:14:20 lukem Exp $ */ 2 3 /* utf-8.c -- Basic UTF-8 routines */ 4 /* OpenLDAP: pkg/ldap/libraries/libldap/utf-8.c,v 1.36.2.4 2009/01/22 00:00:56 kurt Exp */ 5 /* This work is part of OpenLDAP Software <http://www.openldap.org/>. 6 * 7 * Copyright 1998-2009 The OpenLDAP Foundation. 8 * All rights reserved. 9 * 10 * Redistribution and use in source and binary forms, with or without 11 * modification, are permitted only as authorized by the OpenLDAP 12 * Public License. 13 * 14 * A copy of this license is available in the file LICENSE in the 15 * top-level directory of the distribution or, alternatively, at 16 * <http://www.OpenLDAP.org/license.html>. 17 */ 18 /* Basic UTF-8 routines 19 * 20 * These routines are "dumb". Though they understand UTF-8, 21 * they don't grok Unicode. That is, they can push bits, 22 * but don't have a clue what the bits represent. That's 23 * good enough for use with the LDAP Client SDK. 24 * 25 * These routines are not optimized. 26 */ 27 28 #include "portable.h" 29 30 #include <stdio.h> 31 32 #include <ac/stdlib.h> 33 34 #include <ac/socket.h> 35 #include <ac/string.h> 36 #include <ac/time.h> 37 38 #include "ldap_utf8.h" 39 40 #include "ldap-int.h" 41 #include "ldap_defaults.h" 42 43 /* 44 * return the number of bytes required to hold the 45 * NULL-terminated UTF-8 string NOT INCLUDING the 46 * termination. 47 */ 48 ber_len_t ldap_utf8_bytes( const char * p ) 49 { 50 ber_len_t bytes; 51 52 for( bytes=0; p[bytes]; bytes++ ) { 53 /* EMPTY */ ; 54 } 55 56 return bytes; 57 } 58 59 ber_len_t ldap_utf8_chars( const char * p ) 60 { 61 /* could be optimized and could check for invalid sequences */ 62 ber_len_t chars=0; 63 64 for( ; *p ; LDAP_UTF8_INCR(p) ) { 65 chars++; 66 } 67 68 return chars; 69 } 70 71 /* return offset to next character */ 72 int ldap_utf8_offset( const char * p ) 73 { 74 return LDAP_UTF8_NEXT(p) - p; 75 } 76 77 /* 78 * Returns length indicated by first byte. 79 */ 80 const char ldap_utf8_lentab[] = { 81 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 82 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 83 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 84 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 85 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 86 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 87 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 88 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 }; 89 90 int ldap_utf8_charlen( const char * p ) 91 { 92 if (!(*p & 0x80)) 93 return 1; 94 95 return ldap_utf8_lentab[*(const unsigned char *)p ^ 0x80]; 96 } 97 98 /* 99 * Make sure the UTF-8 char used the shortest possible encoding 100 * returns charlen if valid, 0 if not. 101 * 102 * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4. 103 * The table is slightly modified from that of the RFC. 104 * 105 * UCS-4 range (hex) UTF-8 sequence (binary) 106 * 0000 0000-0000 007F 0....... 107 * 0000 0080-0000 07FF 110++++. 10...... 108 * 0000 0800-0000 FFFF 1110++++ 10+..... 10...... 109 * 0001 0000-001F FFFF 11110+++ 10++.... 10...... 10...... 110 * 0020 0000-03FF FFFF 111110++ 10+++... 10...... 10...... 10...... 111 * 0400 0000-7FFF FFFF 1111110+ 10++++.. 10...... 10...... 10...... 10...... 112 * 113 * The '.' bits are "don't cares". When validating a UTF-8 sequence, 114 * at least one of the '+' bits must be set, otherwise the character 115 * should have been encoded in fewer octets. Note that in the two-octet 116 * case, only the first octet needs to be validated, and this is done 117 * in the ldap_utf8_lentab[] above. 118 */ 119 120 /* mask of required bits in second octet */ 121 #undef c 122 #define c const char 123 c ldap_utf8_mintab[] = { 124 (c)0x20, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, 125 (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, 126 (c)0x30, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, 127 (c)0x38, (c)0x80, (c)0x80, (c)0x80, (c)0x3c, (c)0x80, (c)0x00, (c)0x00 }; 128 #undef c 129 130 int ldap_utf8_charlen2( const char * p ) 131 { 132 int i = LDAP_UTF8_CHARLEN( p ); 133 134 if ( i > 2 ) { 135 if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) ) 136 i = 0; 137 } 138 return i; 139 } 140 141 /* conv UTF-8 to UCS-4, useful for comparisons */ 142 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p ) 143 { 144 const unsigned char *c = (const unsigned char *) p; 145 ldap_ucs4_t ch; 146 int len, i; 147 static unsigned char mask[] = { 148 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; 149 150 len = LDAP_UTF8_CHARLEN2(p, len); 151 152 if( len == 0 ) return LDAP_UCS4_INVALID; 153 154 ch = c[0] & mask[len]; 155 156 for(i=1; i < len; i++) { 157 if ((c[i] & 0xc0) != 0x80) { 158 return LDAP_UCS4_INVALID; 159 } 160 161 ch <<= 6; 162 ch |= c[i] & 0x3f; 163 } 164 165 return ch; 166 } 167 168 /* conv UCS-4 to UTF-8, not used */ 169 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf ) 170 { 171 int len=0; 172 unsigned char* p = (unsigned char *) buf; 173 174 /* not a valid Unicode character */ 175 if ( c < 0 ) return 0; 176 177 /* Just return length, don't convert */ 178 if(buf == NULL) { 179 if( c < 0x80 ) return 1; 180 else if( c < 0x800 ) return 2; 181 else if( c < 0x10000 ) return 3; 182 else if( c < 0x200000 ) return 4; 183 else if( c < 0x4000000 ) return 5; 184 else return 6; 185 } 186 187 if( c < 0x80 ) { 188 p[len++] = c; 189 190 } else if( c < 0x800 ) { 191 p[len++] = 0xc0 | ( c >> 6 ); 192 p[len++] = 0x80 | ( c & 0x3f ); 193 194 } else if( c < 0x10000 ) { 195 p[len++] = 0xe0 | ( c >> 12 ); 196 p[len++] = 0x80 | ( (c >> 6) & 0x3f ); 197 p[len++] = 0x80 | ( c & 0x3f ); 198 199 } else if( c < 0x200000 ) { 200 p[len++] = 0xf0 | ( c >> 18 ); 201 p[len++] = 0x80 | ( (c >> 12) & 0x3f ); 202 p[len++] = 0x80 | ( (c >> 6) & 0x3f ); 203 p[len++] = 0x80 | ( c & 0x3f ); 204 205 } else if( c < 0x4000000 ) { 206 p[len++] = 0xf8 | ( c >> 24 ); 207 p[len++] = 0x80 | ( (c >> 18) & 0x3f ); 208 p[len++] = 0x80 | ( (c >> 12) & 0x3f ); 209 p[len++] = 0x80 | ( (c >> 6) & 0x3f ); 210 p[len++] = 0x80 | ( c & 0x3f ); 211 212 } else /* if( c < 0x80000000 ) */ { 213 p[len++] = 0xfc | ( c >> 30 ); 214 p[len++] = 0x80 | ( (c >> 24) & 0x3f ); 215 p[len++] = 0x80 | ( (c >> 18) & 0x3f ); 216 p[len++] = 0x80 | ( (c >> 12) & 0x3f ); 217 p[len++] = 0x80 | ( (c >> 6) & 0x3f ); 218 p[len++] = 0x80 | ( c & 0x3f ); 219 } 220 221 return len; 222 } 223 224 #define LDAP_UCS_UTF8LEN(c) \ 225 c < 0 ? 0 : (c < 0x80 ? 1 : (c < 0x800 ? 2 : (c < 0x10000 ? 3 : \ 226 (c < 0x200000 ? 4 : (c < 0x4000000 ? 5 : 6))))) 227 228 /* Convert a string to UTF-8 format. The input string is expected to 229 * have characters of 1, 2, or 4 octets (in network byte order) 230 * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING 231 * types respectively. (Here T61STRING just means that there is one 232 * octet per character and characters may use the high bit of the octet. 233 * The characters are assumed to use ISO mappings, no provision is made 234 * for converting from T.61 coding rules to Unicode.) 235 */ 236 237 int 238 ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s ) 239 { 240 unsigned char *in, *end; 241 char *ptr; 242 ldap_ucs4_t u; 243 int i, l = 0; 244 245 utf8s->bv_val = NULL; 246 utf8s->bv_len = 0; 247 248 in = (unsigned char *)ucs->bv_val; 249 250 /* Make sure we stop at an even multiple of csize */ 251 end = in + ( ucs->bv_len & ~(csize-1) ); 252 253 for (; in < end; ) { 254 u = *in++; 255 if (csize > 1) { 256 u <<= 8; 257 u |= *in++; 258 } 259 if (csize > 2) { 260 u <<= 8; 261 u |= *in++; 262 u <<= 8; 263 u |= *in++; 264 } 265 i = LDAP_UCS_UTF8LEN(u); 266 if (i == 0) 267 return LDAP_INVALID_SYNTAX; 268 l += i; 269 } 270 271 utf8s->bv_val = LDAP_MALLOC( l+1 ); 272 if (utf8s->bv_val == NULL) 273 return LDAP_NO_MEMORY; 274 utf8s->bv_len = l; 275 276 ptr = utf8s->bv_val; 277 for (in = (unsigned char *)ucs->bv_val; in < end; ) { 278 u = *in++; 279 if (csize > 1) { 280 u <<= 8; 281 u |= *in++; 282 } 283 if (csize > 2) { 284 u <<= 8; 285 u |= *in++; 286 u <<= 8; 287 u |= *in++; 288 } 289 ptr += ldap_x_ucs4_to_utf8(u, ptr); 290 } 291 *ptr = '\0'; 292 return LDAP_SUCCESS; 293 } 294 295 /* 296 * Advance to the next UTF-8 character 297 * 298 * Ignores length of multibyte character, instead rely on 299 * continuation markers to find start of next character. 300 * This allows for "resyncing" of when invalid characters 301 * are provided provided the start of the next character 302 * is appears within the 6 bytes examined. 303 */ 304 char* ldap_utf8_next( const char * p ) 305 { 306 int i; 307 const unsigned char *u = (const unsigned char *) p; 308 309 if( LDAP_UTF8_ISASCII(u) ) { 310 return (char *) &p[1]; 311 } 312 313 for( i=1; i<6; i++ ) { 314 if ( ( u[i] & 0xc0 ) != 0x80 ) { 315 return (char *) &p[i]; 316 } 317 } 318 319 return (char *) &p[i]; 320 } 321 322 /* 323 * Advance to the previous UTF-8 character 324 * 325 * Ignores length of multibyte character, instead rely on 326 * continuation markers to find start of next character. 327 * This allows for "resyncing" of when invalid characters 328 * are provided provided the start of the next character 329 * is appears within the 6 bytes examined. 330 */ 331 char* ldap_utf8_prev( const char * p ) 332 { 333 int i; 334 const unsigned char *u = (const unsigned char *) p; 335 336 for( i=-1; i>-6 ; i-- ) { 337 if ( ( u[i] & 0xc0 ) != 0x80 ) { 338 return (char *) &p[i]; 339 } 340 } 341 342 return (char *) &p[i]; 343 } 344 345 /* 346 * Copy one UTF-8 character from src to dst returning 347 * number of bytes copied. 348 * 349 * Ignores length of multibyte character, instead rely on 350 * continuation markers to find start of next character. 351 * This allows for "resyncing" of when invalid characters 352 * are provided provided the start of the next character 353 * is appears within the 6 bytes examined. 354 */ 355 int ldap_utf8_copy( char* dst, const char *src ) 356 { 357 int i; 358 const unsigned char *u = (const unsigned char *) src; 359 360 dst[0] = src[0]; 361 362 if( LDAP_UTF8_ISASCII(u) ) { 363 return 1; 364 } 365 366 for( i=1; i<6; i++ ) { 367 if ( ( u[i] & 0xc0 ) != 0x80 ) { 368 return i; 369 } 370 dst[i] = src[i]; 371 } 372 373 return i; 374 } 375 376 #ifndef UTF8_ALPHA_CTYPE 377 /* 378 * UTF-8 ctype routines 379 * Only deals with characters < 0x80 (ie: US-ASCII) 380 */ 381 382 int ldap_utf8_isascii( const char * p ) 383 { 384 unsigned c = * (const unsigned char *) p; 385 return LDAP_ASCII(c); 386 } 387 388 int ldap_utf8_isdigit( const char * p ) 389 { 390 unsigned c = * (const unsigned char *) p; 391 392 if(!LDAP_ASCII(c)) return 0; 393 394 return LDAP_DIGIT( c ); 395 } 396 397 int ldap_utf8_isxdigit( const char * p ) 398 { 399 unsigned c = * (const unsigned char *) p; 400 401 if(!LDAP_ASCII(c)) return 0; 402 403 return LDAP_HEX(c); 404 } 405 406 int ldap_utf8_isspace( const char * p ) 407 { 408 unsigned c = * (const unsigned char *) p; 409 410 if(!LDAP_ASCII(c)) return 0; 411 412 switch(c) { 413 case ' ': 414 case '\t': 415 case '\n': 416 case '\r': 417 case '\v': 418 case '\f': 419 return 1; 420 } 421 422 return 0; 423 } 424 425 /* 426 * These are not needed by the C SDK and are 427 * not "good enough" for general use. 428 */ 429 int ldap_utf8_isalpha( const char * p ) 430 { 431 unsigned c = * (const unsigned char *) p; 432 433 if(!LDAP_ASCII(c)) return 0; 434 435 return LDAP_ALPHA(c); 436 } 437 438 int ldap_utf8_isalnum( const char * p ) 439 { 440 unsigned c = * (const unsigned char *) p; 441 442 if(!LDAP_ASCII(c)) return 0; 443 444 return LDAP_ALNUM(c); 445 } 446 447 int ldap_utf8_islower( const char * p ) 448 { 449 unsigned c = * (const unsigned char *) p; 450 451 if(!LDAP_ASCII(c)) return 0; 452 453 return LDAP_LOWER(c); 454 } 455 456 int ldap_utf8_isupper( const char * p ) 457 { 458 unsigned c = * (const unsigned char *) p; 459 460 if(!LDAP_ASCII(c)) return 0; 461 462 return LDAP_UPPER(c); 463 } 464 #endif 465 466 467 /* 468 * UTF-8 string routines 469 */ 470 471 /* like strchr() */ 472 char * (ldap_utf8_strchr)( const char *str, const char *chr ) 473 { 474 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) { 475 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) { 476 return (char *) str; 477 } 478 } 479 480 return NULL; 481 } 482 483 /* like strcspn() but returns number of bytes, not characters */ 484 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set ) 485 { 486 const char *cstr; 487 const char *cset; 488 489 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) { 490 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) { 491 if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) { 492 return cstr - str; 493 } 494 } 495 } 496 497 return cstr - str; 498 } 499 500 /* like strspn() but returns number of bytes, not characters */ 501 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set ) 502 { 503 const char *cstr; 504 const char *cset; 505 506 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) { 507 for( cset = set; ; LDAP_UTF8_INCR(cset) ) { 508 if( *cset == '\0' ) { 509 return cstr - str; 510 } 511 512 if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) { 513 break; 514 } 515 } 516 } 517 518 return cstr - str; 519 } 520 521 /* like strpbrk(), replaces strchr() as well */ 522 char *(ldap_utf8_strpbrk)( const char *str, const char *set ) 523 { 524 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) { 525 const char *cset; 526 527 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) { 528 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) { 529 return (char *) str; 530 } 531 } 532 } 533 534 return NULL; 535 } 536 537 /* like strtok_r(), not strtok() */ 538 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last) 539 { 540 char *begin; 541 char *end; 542 543 if( last == NULL ) return NULL; 544 545 begin = str ? str : *last; 546 547 begin += ldap_utf8_strspn( begin, sep ); 548 549 if( *begin == '\0' ) { 550 *last = NULL; 551 return NULL; 552 } 553 554 end = &begin[ ldap_utf8_strcspn( begin, sep ) ]; 555 556 if( *end != '\0' ) { 557 char *next = LDAP_UTF8_NEXT( end ); 558 *end = '\0'; 559 end = next; 560 } 561 562 *last = end; 563 return begin; 564 } 565