1 /* $NetBSD: ucstr.c,v 1.3 2021/08/14 16:14:57 christos Exp $ */
2
3 /* $OpenLDAP$ */
4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5 *
6 * Copyright 1998-2021 The OpenLDAP Foundation.
7 * All rights reserved.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted only as authorized by the OpenLDAP
11 * Public License.
12 *
13 * A copy of this license is available in file LICENSE in the
14 * top-level directory of the distribution or, alternatively, at
15 * <http://www.OpenLDAP.org/license.html>.
16 */
17
18 #include <sys/cdefs.h>
19 __RCSID("$NetBSD: ucstr.c,v 1.3 2021/08/14 16:14:57 christos Exp $");
20
21 #include "portable.h"
22
23 #include <ac/bytes.h>
24 #include <ac/ctype.h>
25 #include <ac/string.h>
26 #include <ac/stdlib.h>
27
28 #include <lber_pvt.h>
29
30 #include <ldap_utf8.h>
31 #include <ldap_pvt_uc.h>
32
33 #define malloc(x) ber_memalloc_x(x,ctx)
34 #define realloc(x,y) ber_memrealloc_x(x,y,ctx)
35 #define free(x) ber_memfree_x(x,ctx)
36
/*
 * Compare at most n elements of two Unicode strings.
 *
 * The comparison stops at the first differing element, at a
 * terminating zero element present in both strings, or after n
 * elements, whichever comes first.
 *
 * Returns -1 if u1 sorts before u2, +1 if it sorts after, and 0 if
 * the compared prefixes are equal.
 */
int ucstrncmp(
	const ldap_unicode_t *u1,
	const ldap_unicode_t *u2,
	ber_len_t n )
{
	ber_len_t pos;

	for ( pos = 0; pos < n; pos++ ) {
		const ldap_unicode_t left = u1[pos];
		const ldap_unicode_t right = u2[pos];

		if ( left < right ) {
			return -1;
		}
		if ( left > right ) {
			return +1;
		}
		/* equal elements: a zero here terminates both strings */
		if ( left == 0 ) {
			break;
		}
	}

	return 0;
}
52
/*
 * Case-insensitive variant of ucstrncmp().
 *
 * Each element of both strings is passed through uctolower() before
 * being compared.  At most n elements are examined; the comparison
 * also stops at a (folded) zero element present in both strings.
 *
 * Returns -1, 0 or +1 with the same meaning as ucstrncmp().
 */
int ucstrncasecmp(
	const ldap_unicode_t *u1,
	const ldap_unicode_t *u2,
	ber_len_t n )
{
	ber_len_t pos;

	for ( pos = 0; pos < n; pos++ ) {
		const ldap_unicode_t left = uctolower( u1[pos] );
		const ldap_unicode_t right = uctolower( u2[pos] );

		if ( left < right ) {
			return -1;
		}
		if ( left > right ) {
			return +1;
		}
		/* equal after folding: a zero here terminates both strings */
		if ( left == 0 ) {
			break;
		}
	}

	return 0;
}
71
/*
 * Locate the first occurrence of c within the first n elements of u.
 *
 * The scan is bounded only by n; it does not stop at a zero element.
 *
 * Returns a pointer to the matching element (with the const qualifier
 * cast away), or NULL if c does not occur in the first n elements.
 */
ldap_unicode_t * ucstrnchr(
	const ldap_unicode_t *u,
	ber_len_t n,
	ldap_unicode_t c )
{
	ber_len_t idx;

	for ( idx = 0; idx < n; idx++ ) {
		if ( u[idx] == c ) {
			return (ldap_unicode_t *) &u[idx];
		}
	}

	return NULL;
}
85
/*
 * Case-insensitive variant of ucstrnchr().
 *
 * Both c and each examined element are passed through uctolower()
 * before being compared.  The scan is bounded only by n; it does not
 * stop at a zero element.
 *
 * Returns a pointer to the first matching element (with the const
 * qualifier cast away), or NULL if there is no match in the first n
 * elements.
 */
ldap_unicode_t * ucstrncasechr(
	const ldap_unicode_t *u,
	ber_len_t n,
	ldap_unicode_t c )
{
	const ldap_unicode_t wanted = uctolower( c );
	ber_len_t idx;

	for ( idx = 0; idx < n; idx++ ) {
		if ( uctolower( u[idx] ) == wanted ) {
			return (ldap_unicode_t *) &u[idx];
		}
	}

	return NULL;
}
100
/*
 * Convert the first n elements of u to upper case, in place, by
 * replacing each element with the result of uctoupper().
 */
void ucstr2upper(
	ldap_unicode_t *u,
	ber_len_t n )
{
	ber_len_t idx;

	for ( idx = 0; idx < n; idx++ ) {
		u[idx] = uctoupper( u[idx] );
	}
}
109
UTF8bvnormalize(struct berval * bv,struct berval * newbv,unsigned flags,void * ctx)110 struct berval * UTF8bvnormalize(
111 struct berval *bv,
112 struct berval *newbv,
113 unsigned flags,
114 void *ctx )
115 {
116 int i, j, len, clen, outpos, ucsoutlen, outsize, last;
117 int didnewbv = 0;
118 char *out, *outtmp, *s;
119 ac_uint4 *ucs, *p, *ucsout;
120
121 static unsigned char mask[] = {
122 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
123
124 unsigned casefold = flags & LDAP_UTF8_CASEFOLD;
125 unsigned approx = flags & LDAP_UTF8_APPROX;
126
127 if ( bv == NULL ) {
128 return NULL;
129 }
130
131 s = bv->bv_val;
132 len = bv->bv_len;
133
134 if ( len == 0 ) {
135 return ber_dupbv_x( newbv, bv, ctx );
136 }
137
138 if ( !newbv ) {
139 newbv = ber_memalloc_x( sizeof(struct berval), ctx );
140 if ( !newbv ) return NULL;
141 didnewbv = 1;
142 }
143
144 /* Should first check to see if string is already in proper
145 * normalized form. This is almost as time consuming as
146 * the normalization though.
147 */
148
149 /* finish off everything up to character before first non-ascii */
150 if ( LDAP_UTF8_ISASCII( s ) ) {
151 if ( casefold ) {
152 outsize = len + 7;
153 out = (char *) ber_memalloc_x( outsize, ctx );
154 if ( out == NULL ) {
155 fail:
156 if ( didnewbv )
157 ber_memfree_x( newbv, ctx );
158 return NULL;
159 }
160 outpos = 0;
161
162 for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
163 out[outpos++] = TOLOWER( s[i-1] );
164 }
165 if ( i == len ) {
166 out[outpos++] = TOLOWER( s[len-1] );
167 out[outpos] = '\0';
168 newbv->bv_val = out;
169 newbv->bv_len = outpos;
170 return newbv;
171 }
172 } else {
173 for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
174 /* empty */
175 }
176
177 if ( i == len ) {
178 return ber_str2bv_x( s, len, 1, newbv, ctx );
179 }
180
181 outsize = len + 7;
182 out = (char *) ber_memalloc_x( outsize, ctx );
183 if ( out == NULL ) {
184 goto fail;
185 }
186 outpos = i - 1;
187 memcpy(out, s, outpos);
188 }
189 } else {
190 outsize = len + 7;
191 out = (char *) ber_memalloc_x( outsize, ctx );
192 if ( out == NULL ) {
193 goto fail;
194 }
195 outpos = 0;
196 i = 0;
197 }
198
199 p = ucs = ber_memalloc_x( len * sizeof(*ucs), ctx );
200 if ( ucs == NULL ) {
201 ber_memfree_x(out, ctx);
202 goto fail;
203 }
204
205 /* convert character before first non-ascii to ucs-4 */
206 if ( i > 0 ) {
207 *p = casefold ? TOLOWER( s[i-1] ) : s[i-1];
208 p++;
209 }
210
211 /* s[i] is now first non-ascii character */
212 for (;;) {
213 /* s[i] is non-ascii */
214 /* convert everything up to next ascii to ucs-4 */
215 while ( i < len ) {
216 clen = LDAP_UTF8_CHARLEN2( s + i, clen );
217 if ( clen == 0 ) {
218 ber_memfree_x( ucs, ctx );
219 ber_memfree_x( out, ctx );
220 goto fail;
221 }
222 if ( clen == 1 ) {
223 /* ascii */
224 break;
225 }
226 *p = s[i] & mask[clen];
227 i++;
228 for( j = 1; j < clen; j++ ) {
229 if ( (s[i] & 0xc0) != 0x80 ) {
230 ber_memfree_x( ucs, ctx );
231 ber_memfree_x( out, ctx );
232 goto fail;
233 }
234 *p <<= 6;
235 *p |= s[i] & 0x3f;
236 i++;
237 }
238 if ( casefold ) {
239 *p = uctolower( *p );
240 }
241 p++;
242 }
243 /* normalize ucs of length p - ucs */
244 uccompatdecomp( ucs, p - ucs, &ucsout, &ucsoutlen, ctx );
245 if ( approx ) {
246 for ( j = 0; j < ucsoutlen; j++ ) {
247 if ( ucsout[j] < 0x80 ) {
248 out[outpos++] = ucsout[j];
249 }
250 }
251 } else {
252 ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
253 /* convert ucs to utf-8 and store in out */
254 for ( j = 0; j < ucsoutlen; j++ ) {
255 /* allocate more space if not enough room for
256 6 bytes and terminator */
257 if ( outsize - outpos < 7 ) {
258 outsize = ucsoutlen - j + outpos + 6;
259 outtmp = (char *) ber_memrealloc_x( out, outsize, ctx );
260 if ( outtmp == NULL ) {
261 ber_memfree_x( ucsout, ctx );
262 ber_memfree_x( ucs, ctx );
263 ber_memfree_x( out, ctx );
264 goto fail;
265 }
266 out = outtmp;
267 }
268 outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] );
269 }
270 }
271
272 ber_memfree_x( ucsout, ctx );
273 ucsout = NULL;
274
275 if ( i == len ) {
276 break;
277 }
278
279 last = i;
280
281 /* Allocate more space in out if necessary */
282 if (len - i >= outsize - outpos) {
283 outsize += 1 + ((len - i) - (outsize - outpos));
284 outtmp = (char *) ber_memrealloc_x(out, outsize, ctx);
285 if (outtmp == NULL) {
286 ber_memfree_x( ucs, ctx );
287 ber_memfree_x( out, ctx );
288 goto fail;
289 }
290 out = outtmp;
291 }
292
293 /* s[i] is ascii */
294 /* finish off everything up to char before next non-ascii */
295 for ( i++; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
296 out[outpos++] = casefold ? TOLOWER( s[i-1] ) : s[i-1];
297 }
298 if ( i == len ) {
299 out[outpos++] = casefold ? TOLOWER( s[len-1] ) : s[len-1];
300 break;
301 }
302
303 /* convert character before next non-ascii to ucs-4 */
304 *ucs = casefold ? TOLOWER( s[i-1] ) : s[i-1];
305 p = ucs + 1;
306 }
307
308 ber_memfree_x( ucs, ctx );
309 out[outpos] = '\0';
310 newbv->bv_val = out;
311 newbv->bv_len = outpos;
312 return newbv;
313 }
314
315 /* compare UTF8-strings, optionally ignore casing */
316 /* slow, should be optimized */
UTF8bvnormcmp(struct berval * bv1,struct berval * bv2,unsigned flags,void * ctx)317 int UTF8bvnormcmp(
318 struct berval *bv1,
319 struct berval *bv2,
320 unsigned flags,
321 void *ctx )
322 {
323 int i, l1, l2, len, ulen, res = 0;
324 char *s1, *s2, *done;
325 ac_uint4 *ucs, *ucsout1, *ucsout2;
326
327 unsigned casefold = flags & LDAP_UTF8_CASEFOLD;
328 unsigned norm1 = flags & LDAP_UTF8_ARG1NFC;
329 unsigned norm2 = flags & LDAP_UTF8_ARG2NFC;
330
331 if (bv1 == NULL) {
332 return bv2 == NULL ? 0 : -1;
333
334 } else if (bv2 == NULL) {
335 return 1;
336 }
337
338 l1 = bv1->bv_len;
339 l2 = bv2->bv_len;
340
341 len = (l1 < l2) ? l1 : l2;
342 if (len == 0) {
343 return l1 == 0 ? (l2 == 0 ? 0 : -1) : 1;
344 }
345
346 s1 = bv1->bv_val;
347 s2 = bv2->bv_val;
348 done = s1 + len;
349
350 while ( (s1 < done) && LDAP_UTF8_ISASCII(s1) && LDAP_UTF8_ISASCII(s2) ) {
351 if (casefold) {
352 char c1 = TOLOWER(*s1);
353 char c2 = TOLOWER(*s2);
354 res = c1 - c2;
355 } else {
356 res = *s1 - *s2;
357 }
358 s1++;
359 s2++;
360 if (res) {
361 /* done unless next character in s1 or s2 is non-ascii */
362 if (s1 < done) {
363 if (!LDAP_UTF8_ISASCII(s1) || !LDAP_UTF8_ISASCII(s2)) {
364 break;
365 }
366 } else if (((len < l1) && !LDAP_UTF8_ISASCII(s1)) ||
367 ((len < l2) && !LDAP_UTF8_ISASCII(s2)))
368 {
369 break;
370 }
371 return res;
372 }
373 }
374
375 /* We have encountered non-ascii or strings equal up to len */
376
377 /* set i to number of iterations */
378 i = s1 - done + len;
379 /* passed through loop at least once? */
380 if (i > 0) {
381 if (!res && (s1 == done) &&
382 ((len == l1) || LDAP_UTF8_ISASCII(s1)) &&
383 ((len == l2) || LDAP_UTF8_ISASCII(s2))) {
384 /* all ascii and equal up to len */
385 return l1 - l2;
386 }
387
388 /* rewind one char, and do normalized compare from there */
389 s1--;
390 s2--;
391 l1 -= i - 1;
392 l2 -= i - 1;
393 }
394
395 /* Should first check to see if strings are already in
396 * proper normalized form.
397 */
398 ucs = malloc( ( ( norm1 || l1 > l2 ) ? l1 : l2 ) * sizeof(*ucs) );
399 if ( ucs == NULL ) {
400 return l1 > l2 ? 1 : -1; /* what to do??? */
401 }
402
403 /*
404 * XXYYZ: we convert to ucs4 even though -llunicode
405 * expects ucs2 in an ac_uint4
406 */
407
408 /* convert and normalize 1st string */
409 for ( i = 0, ulen = 0; i < l1; i += len, ulen++ ) {
410 ucs[ulen] = ldap_x_utf8_to_ucs4( s1 + i );
411 if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
412 free( ucs );
413 return -1; /* what to do??? */
414 }
415 len = LDAP_UTF8_CHARLEN( s1 + i );
416 }
417
418 if ( norm1 ) {
419 ucsout1 = ucs;
420 l1 = ulen;
421 ucs = malloc( l2 * sizeof(*ucs) );
422 if ( ucs == NULL ) {
423 free( ucsout1 );
424 return l1 > l2 ? 1 : -1; /* what to do??? */
425 }
426 } else {
427 uccompatdecomp( ucs, ulen, &ucsout1, &l1, ctx );
428 l1 = uccanoncomp( ucsout1, l1 );
429 }
430
431 /* convert and normalize 2nd string */
432 for ( i = 0, ulen = 0; i < l2; i += len, ulen++ ) {
433 ucs[ulen] = ldap_x_utf8_to_ucs4( s2 + i );
434 if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
435 free( ucsout1 );
436 free( ucs );
437 return 1; /* what to do??? */
438 }
439 len = LDAP_UTF8_CHARLEN( s2 + i );
440 }
441
442 if ( norm2 ) {
443 ucsout2 = ucs;
444 l2 = ulen;
445 } else {
446 uccompatdecomp( ucs, ulen, &ucsout2, &l2, ctx );
447 l2 = uccanoncomp( ucsout2, l2 );
448 free( ucs );
449 }
450
451 res = casefold
452 ? ucstrncasecmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 )
453 : ucstrncmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 );
454 free( ucsout1 );
455 free( ucsout2 );
456
457 if ( res != 0 ) {
458 return res;
459 }
460 if ( l1 == l2 ) {
461 return 0;
462 }
463 return l1 > l2 ? 1 : -1;
464 }
465