1 /* $OpenLDAP: pkg/ldap/libraries/liblunicode/ucstr.c,v 1.37.2.4 2008/04/14 19:12:11 quanah Exp $ */ 2 /* This work is part of OpenLDAP Software <http://www.openldap.org/>. 3 * 4 * Copyright 1998-2008 The OpenLDAP Foundation. 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted only as authorized by the OpenLDAP 9 * Public License. 10 * 11 * A copy of this license is available in file LICENSE in the 12 * top-level directory of the distribution or, alternatively, at 13 * <http://www.OpenLDAP.org/license.html>. 14 */ 15 16 #include "portable.h" 17 18 #include <ac/bytes.h> 19 #include <ac/ctype.h> 20 #include <ac/string.h> 21 #include <ac/stdlib.h> 22 23 #include <lber_pvt.h> 24 25 #include <ldap_utf8.h> 26 #include <ldap_pvt_uc.h> 27 28 #define malloc(x) ber_memalloc_x(x,ctx) 29 #define realloc(x,y) ber_memrealloc_x(x,y,ctx) 30 #define free(x) ber_memfree_x(x,ctx) 31 32 int ucstrncmp( 33 const ldap_unicode_t *u1, 34 const ldap_unicode_t *u2, 35 ber_len_t n ) 36 { 37 for(; 0 < n; ++u1, ++u2, --n ) { 38 if( *u1 != *u2 ) { 39 return *u1 < *u2 ? -1 : +1; 40 } 41 if ( *u1 == 0 ) { 42 return 0; 43 } 44 } 45 return 0; 46 } 47 48 int ucstrncasecmp( 49 const ldap_unicode_t *u1, 50 const ldap_unicode_t *u2, 51 ber_len_t n ) 52 { 53 for(; 0 < n; ++u1, ++u2, --n ) { 54 ldap_unicode_t uu1 = uctolower( *u1 ); 55 ldap_unicode_t uu2 = uctolower( *u2 ); 56 57 if( uu1 != uu2 ) { 58 return uu1 < uu2 ? -1 : +1; 59 } 60 if ( uu1 == 0 ) { 61 return 0; 62 } 63 } 64 return 0; 65 } 66 67 ldap_unicode_t * ucstrnchr( 68 const ldap_unicode_t *u, 69 ber_len_t n, 70 ldap_unicode_t c ) 71 { 72 for(; 0 < n; ++u, --n ) { 73 if( *u == c ) { 74 return (ldap_unicode_t *) u; 75 } 76 } 77 78 return NULL; 79 } 80 81 ldap_unicode_t * ucstrncasechr( 82 const ldap_unicode_t *u, 83 ber_len_t n, 84 ldap_unicode_t c ) 85 { 86 c = uctolower( c ); 87 for(; 0 < n; ++u, --n ) { 88 if( uctolower( *u ) == c ) { 89 return (ldap_unicode_t *) u; 90 } 91 } 92 93 return NULL; 94 } 95 96 void ucstr2upper( 97 ldap_unicode_t *u, 98 ber_len_t n ) 99 { 100 for(; 0 < n; ++u, --n ) { 101 *u = uctoupper( *u ); 102 } 103 } 104 105 struct berval * UTF8bvnormalize( 106 struct berval *bv, 107 struct berval *newbv, 108 unsigned flags, 109 void *ctx ) 110 { 111 int i, j, len, clen, outpos, ucsoutlen, outsize, last; 112 char *out, *outtmp, *s; 113 ac_uint4 *ucs, *p, *ucsout; 114 115 static unsigned char mask[] = { 116 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; 117 118 unsigned casefold = flags & LDAP_UTF8_CASEFOLD; 119 unsigned approx = flags & LDAP_UTF8_APPROX; 120 121 if ( bv == NULL ) { 122 return NULL; 123 } 124 125 s = bv->bv_val; 126 len = bv->bv_len; 127 128 if ( len == 0 ) { 129 return ber_dupbv_x( newbv, bv, ctx ); 130 } 131 132 if ( !newbv ) { 133 newbv = ber_memalloc_x( sizeof(struct berval), ctx ); 134 if ( !newbv ) return NULL; 135 } 136 137 /* Should first check to see if string is already in proper 138 * normalized form. This is almost as time consuming as 139 * the normalization though. 140 */ 141 142 /* finish off everything up to character before first non-ascii */ 143 if ( LDAP_UTF8_ISASCII( s ) ) { 144 if ( casefold ) { 145 outsize = len + 7; 146 out = (char *) ber_memalloc_x( outsize, ctx ); 147 if ( out == NULL ) { 148 return NULL; 149 } 150 outpos = 0; 151 152 for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) { 153 out[outpos++] = TOLOWER( s[i-1] ); 154 } 155 if ( i == len ) { 156 out[outpos++] = TOLOWER( s[len-1] ); 157 out[outpos] = '\0'; 158 newbv->bv_val = out; 159 newbv->bv_len = outpos; 160 return newbv; 161 } 162 } else { 163 for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) { 164 /* empty */ 165 } 166 167 if ( i == len ) { 168 return ber_str2bv_x( s, len, 1, newbv, ctx ); 169 } 170 171 outsize = len + 7; 172 out = (char *) ber_memalloc_x( outsize, ctx ); 173 if ( out == NULL ) { 174 return NULL; 175 } 176 outpos = i - 1; 177 memcpy(out, s, outpos); 178 } 179 } else { 180 outsize = len + 7; 181 out = (char *) ber_memalloc_x( outsize, ctx ); 182 if ( out == NULL ) { 183 return NULL; 184 } 185 outpos = 0; 186 i = 0; 187 } 188 189 p = ucs = ber_memalloc_x( len * sizeof(*ucs), ctx ); 190 if ( ucs == NULL ) { 191 ber_memfree_x(out, ctx); 192 return NULL; 193 } 194 195 /* convert character before first non-ascii to ucs-4 */ 196 if ( i > 0 ) { 197 *p = casefold ? TOLOWER( s[i-1] ) : s[i-1]; 198 p++; 199 } 200 201 /* s[i] is now first non-ascii character */ 202 for (;;) { 203 /* s[i] is non-ascii */ 204 /* convert everything up to next ascii to ucs-4 */ 205 while ( i < len ) { 206 clen = LDAP_UTF8_CHARLEN2( s + i, clen ); 207 if ( clen == 0 ) { 208 ber_memfree_x( ucs, ctx ); 209 ber_memfree_x( out, ctx ); 210 return NULL; 211 } 212 if ( clen == 1 ) { 213 /* ascii */ 214 break; 215 } 216 *p = s[i] & mask[clen]; 217 i++; 218 for( j = 1; j < clen; j++ ) { 219 if ( (s[i] & 0xc0) != 0x80 ) { 220 ber_memfree_x( ucs, ctx ); 221 ber_memfree_x( out, ctx ); 222 return NULL; 223 } 224 *p <<= 6; 225 *p |= s[i] & 0x3f; 226 i++; 227 } 228 if ( casefold ) { 229 *p = uctolower( *p ); 230 } 231 p++; 232 } 233 /* normalize ucs of length p - ucs */ 234 uccompatdecomp( ucs, p - ucs, &ucsout, &ucsoutlen, ctx ); 235 if ( approx ) { 236 for ( j = 0; j < ucsoutlen; j++ ) { 237 if ( ucsout[j] < 0x80 ) { 238 out[outpos++] = ucsout[j]; 239 } 240 } 241 } else { 242 ucsoutlen = uccanoncomp( ucsout, ucsoutlen ); 243 /* convert ucs to utf-8 and store in out */ 244 for ( j = 0; j < ucsoutlen; j++ ) { 245 /* allocate more space if not enough room for 246 6 bytes and terminator */ 247 if ( outsize - outpos < 7 ) { 248 outsize = ucsoutlen - j + outpos + 6; 249 outtmp = (char *) ber_memrealloc_x( out, outsize, ctx ); 250 if ( outtmp == NULL ) { 251 ber_memfree_x( ucsout, ctx ); 252 ber_memfree_x( ucs, ctx ); 253 ber_memfree_x( out, ctx ); 254 return NULL; 255 } 256 out = outtmp; 257 } 258 outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] ); 259 } 260 } 261 262 ber_memfree_x( ucsout, ctx ); 263 ucsout = NULL; 264 265 if ( i == len ) { 266 break; 267 } 268 269 last = i; 270 271 /* Allocate more space in out if necessary */ 272 if (len - i >= outsize - outpos) { 273 outsize += 1 + ((len - i) - (outsize - outpos)); 274 outtmp = (char *) ber_memrealloc_x(out, outsize, ctx); 275 if (outtmp == NULL) { 276 ber_memfree_x( ucs, ctx ); 277 ber_memfree_x( out, ctx ); 278 return NULL; 279 } 280 out = outtmp; 281 } 282 283 /* s[i] is ascii */ 284 /* finish off everything up to char before next non-ascii */ 285 for ( i++; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) { 286 out[outpos++] = casefold ? TOLOWER( s[i-1] ) : s[i-1]; 287 } 288 if ( i == len ) { 289 out[outpos++] = casefold ? TOLOWER( s[len-1] ) : s[len-1]; 290 break; 291 } 292 293 /* convert character before next non-ascii to ucs-4 */ 294 *ucs = casefold ? TOLOWER( s[i-1] ) : s[i-1]; 295 p = ucs + 1; 296 } 297 298 ber_memfree_x( ucs, ctx ); 299 out[outpos] = '\0'; 300 newbv->bv_val = out; 301 newbv->bv_len = outpos; 302 return newbv; 303 } 304 305 /* compare UTF8-strings, optionally ignore casing */ 306 /* slow, should be optimized */ 307 int UTF8bvnormcmp( 308 struct berval *bv1, 309 struct berval *bv2, 310 unsigned flags, 311 void *ctx ) 312 { 313 int i, l1, l2, len, ulen, res = 0; 314 char *s1, *s2, *done; 315 ac_uint4 *ucs, *ucsout1, *ucsout2; 316 317 unsigned casefold = flags & LDAP_UTF8_CASEFOLD; 318 unsigned norm1 = flags & LDAP_UTF8_ARG1NFC; 319 unsigned norm2 = flags & LDAP_UTF8_ARG2NFC; 320 321 if (bv1 == NULL) { 322 return bv2 == NULL ? 0 : -1; 323 324 } else if (bv2 == NULL) { 325 return 1; 326 } 327 328 l1 = bv1->bv_len; 329 l2 = bv2->bv_len; 330 331 len = (l1 < l2) ? l1 : l2; 332 if (len == 0) { 333 return l1 == 0 ? (l2 == 0 ? 0 : -1) : 1; 334 } 335 336 s1 = bv1->bv_val; 337 s2 = bv2->bv_val; 338 done = s1 + len; 339 340 while ( (s1 < done) && LDAP_UTF8_ISASCII(s1) && LDAP_UTF8_ISASCII(s2) ) { 341 if (casefold) { 342 char c1 = TOLOWER(*s1); 343 char c2 = TOLOWER(*s2); 344 res = c1 - c2; 345 } else { 346 res = *s1 - *s2; 347 } 348 s1++; 349 s2++; 350 if (res) { 351 /* done unless next character in s1 or s2 is non-ascii */ 352 if (s1 < done) { 353 if (!LDAP_UTF8_ISASCII(s1) || !LDAP_UTF8_ISASCII(s2)) { 354 break; 355 } 356 } else if (((len < l1) && !LDAP_UTF8_ISASCII(s1)) || 357 ((len < l2) && !LDAP_UTF8_ISASCII(s2))) 358 { 359 break; 360 } 361 return res; 362 } 363 } 364 365 /* We have encountered non-ascii or strings equal up to len */ 366 367 /* set i to number of iterations */ 368 i = s1 - done + len; 369 /* passed through loop at least once? */ 370 if (i > 0) { 371 if (!res && (s1 == done) && 372 ((len == l1) || LDAP_UTF8_ISASCII(s1)) && 373 ((len == l2) || LDAP_UTF8_ISASCII(s2))) { 374 /* all ascii and equal up to len */ 375 return l1 - l2; 376 } 377 378 /* rewind one char, and do normalized compare from there */ 379 s1--; 380 s2--; 381 l1 -= i - 1; 382 l2 -= i - 1; 383 } 384 385 /* Should first check to see if strings are already in 386 * proper normalized form. 387 */ 388 ucs = malloc( ( ( norm1 || l1 > l2 ) ? l1 : l2 ) * sizeof(*ucs) ); 389 if ( ucs == NULL ) { 390 return l1 > l2 ? 1 : -1; /* what to do??? */ 391 } 392 393 /* 394 * XXYYZ: we convert to ucs4 even though -llunicode 395 * expects ucs2 in an ac_uint4 396 */ 397 398 /* convert and normalize 1st string */ 399 for ( i = 0, ulen = 0; i < l1; i += len, ulen++ ) { 400 ucs[ulen] = ldap_x_utf8_to_ucs4( s1 + i ); 401 if ( ucs[ulen] == LDAP_UCS4_INVALID ) { 402 free( ucs ); 403 return -1; /* what to do??? */ 404 } 405 len = LDAP_UTF8_CHARLEN( s1 + i ); 406 } 407 408 if ( norm1 ) { 409 ucsout1 = ucs; 410 l1 = ulen; 411 ucs = malloc( l2 * sizeof(*ucs) ); 412 if ( ucs == NULL ) { 413 free( ucsout1 ); 414 return l1 > l2 ? 1 : -1; /* what to do??? */ 415 } 416 } else { 417 uccompatdecomp( ucs, ulen, &ucsout1, &l1, ctx ); 418 l1 = uccanoncomp( ucsout1, l1 ); 419 } 420 421 /* convert and normalize 2nd string */ 422 for ( i = 0, ulen = 0; i < l2; i += len, ulen++ ) { 423 ucs[ulen] = ldap_x_utf8_to_ucs4( s2 + i ); 424 if ( ucs[ulen] == LDAP_UCS4_INVALID ) { 425 free( ucsout1 ); 426 free( ucs ); 427 return 1; /* what to do??? */ 428 } 429 len = LDAP_UTF8_CHARLEN( s2 + i ); 430 } 431 432 if ( norm2 ) { 433 ucsout2 = ucs; 434 l2 = ulen; 435 } else { 436 uccompatdecomp( ucs, ulen, &ucsout2, &l2, ctx ); 437 l2 = uccanoncomp( ucsout2, l2 ); 438 free( ucs ); 439 } 440 441 res = casefold 442 ? ucstrncasecmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 ) 443 : ucstrncmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 ); 444 free( ucsout1 ); 445 free( ucsout2 ); 446 447 if ( res != 0 ) { 448 return res; 449 } 450 if ( l1 == l2 ) { 451 return 0; 452 } 453 return l1 > l2 ? 1 : -1; 454 } 455