xref: /netbsd-src/external/bsd/openldap/dist/libraries/liblunicode/ucstr.c (revision 549b59ed3ccf0d36d3097190a0db27b770f3a839)
1 /*	$NetBSD: ucstr.c,v 1.3 2021/08/14 16:14:57 christos Exp $	*/
2 
3 /* $OpenLDAP$ */
4 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
5  *
6  * Copyright 1998-2021 The OpenLDAP Foundation.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted only as authorized by the OpenLDAP
11  * Public License.
12  *
13  * A copy of this license is available in file LICENSE in the
14  * top-level directory of the distribution or, alternatively, at
15  * <http://www.OpenLDAP.org/license.html>.
16  */
17 
18 #include <sys/cdefs.h>
19 __RCSID("$NetBSD: ucstr.c,v 1.3 2021/08/14 16:14:57 christos Exp $");
20 
21 #include "portable.h"
22 
23 #include <ac/bytes.h>
24 #include <ac/ctype.h>
25 #include <ac/string.h>
26 #include <ac/stdlib.h>
27 
28 #include <lber_pvt.h>
29 
30 #include <ldap_utf8.h>
31 #include <ldap_pvt_uc.h>
32 
33 #define	malloc(x)	ber_memalloc_x(x,ctx)
34 #define	realloc(x,y)	ber_memrealloc_x(x,y,ctx)
35 #define	free(x)		ber_memfree_x(x,ctx)
36 
/*
 * Compare at most n code points of two Unicode strings.
 *
 * Comparison stops at the first differing code point, at a shared
 * terminating 0, or after n code points, whichever comes first.
 * Returns -1, 0 or +1.
 */
int ucstrncmp(
	const ldap_unicode_t *u1,
	const ldap_unicode_t *u2,
	ber_len_t n )
{
	ber_len_t idx;

	for ( idx = 0; idx < n; idx++ ) {
		const ldap_unicode_t lhs = u1[idx];
		const ldap_unicode_t rhs = u2[idx];

		if ( lhs < rhs ) {
			return -1;
		}
		if ( lhs > rhs ) {
			return +1;
		}
		/* equal here; a 0 ends both strings */
		if ( lhs == 0 ) {
			break;
		}
	}
	return 0;
}
52 
/*
 * Compare at most n code points of two Unicode strings after mapping
 * each code point through uctolower().
 *
 * Comparison stops at the first differing (lowered) code point, at a
 * shared terminating 0, or after n code points.  Returns -1, 0 or +1.
 */
int ucstrncasecmp(
	const ldap_unicode_t *u1,
	const ldap_unicode_t *u2,
	ber_len_t n )
{
	ber_len_t idx;

	for ( idx = 0; idx < n; idx++ ) {
		const ldap_unicode_t lhs = uctolower( u1[idx] );
		const ldap_unicode_t rhs = uctolower( u2[idx] );

		if ( lhs < rhs ) {
			return -1;
		}
		if ( lhs > rhs ) {
			return +1;
		}
		/* equal here; a 0 ends both strings */
		if ( lhs == 0 ) {
			break;
		}
	}
	return 0;
}
71 
/*
 * Find the first occurrence of code point c within the first n
 * elements of u.  The scan does not stop at a 0 element.
 *
 * Returns a pointer to the matching element, or NULL if none matches.
 */
ldap_unicode_t * ucstrnchr(
	const ldap_unicode_t *u,
	ber_len_t n,
	ldap_unicode_t c )
{
	ber_len_t idx = 0;

	while ( idx < n ) {
		if ( u[idx] == c ) {
			return (ldap_unicode_t *) &u[idx];
		}
		idx++;
	}

	return NULL;
}
85 
/*
 * Case-insensitive variant of ucstrnchr(): both c and every examined
 * element are mapped through uctolower() before being compared.
 *
 * Returns a pointer to the first matching element among the first n
 * elements of u, or NULL if none matches.
 */
ldap_unicode_t * ucstrncasechr(
	const ldap_unicode_t *u,
	ber_len_t n,
	ldap_unicode_t c )
{
	const ldap_unicode_t wanted = uctolower( c );
	ber_len_t idx = 0;

	while ( idx < n ) {
		if ( uctolower( u[idx] ) == wanted ) {
			return (ldap_unicode_t *) &u[idx];
		}
		idx++;
	}

	return NULL;
}
100 
/*
 * Replace, in place, each of the first n elements of u with its
 * uctoupper() mapping.
 */
void ucstr2upper(
	ldap_unicode_t *u,
	ber_len_t n )
{
	ber_len_t idx;

	for ( idx = 0; idx < n; idx++ ) {
		u[idx] = uctoupper( u[idx] );
	}
}
109 
UTF8bvnormalize(struct berval * bv,struct berval * newbv,unsigned flags,void * ctx)110 struct berval * UTF8bvnormalize(
111 	struct berval *bv,
112 	struct berval *newbv,
113 	unsigned flags,
114 	void *ctx )
115 {
116 	int i, j, len, clen, outpos, ucsoutlen, outsize, last;
117 	int didnewbv = 0;
118 	char *out, *outtmp, *s;
119 	ac_uint4 *ucs, *p, *ucsout;
120 
121 	static unsigned char mask[] = {
122 		0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
123 
124 	unsigned casefold = flags & LDAP_UTF8_CASEFOLD;
125 	unsigned approx = flags & LDAP_UTF8_APPROX;
126 
127 	if ( bv == NULL ) {
128 		return NULL;
129 	}
130 
131 	s = bv->bv_val;
132 	len = bv->bv_len;
133 
134 	if ( len == 0 ) {
135 		return ber_dupbv_x( newbv, bv, ctx );
136 	}
137 
138 	if ( !newbv ) {
139 		newbv = ber_memalloc_x( sizeof(struct berval), ctx );
140 		if ( !newbv ) return NULL;
141 		didnewbv = 1;
142 	}
143 
144 	/* Should first check to see if string is already in proper
145 	 * normalized form. This is almost as time consuming as
146 	 * the normalization though.
147 	 */
148 
149 	/* finish off everything up to character before first non-ascii */
150 	if ( LDAP_UTF8_ISASCII( s ) ) {
151 		if ( casefold ) {
152 			outsize = len + 7;
153 			out = (char *) ber_memalloc_x( outsize, ctx );
154 			if ( out == NULL ) {
155 fail:
156 				if ( didnewbv )
157 					ber_memfree_x( newbv, ctx );
158 				return NULL;
159 			}
160 			outpos = 0;
161 
162 			for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
163 				out[outpos++] = TOLOWER( s[i-1] );
164 			}
165 			if ( i == len ) {
166 				out[outpos++] = TOLOWER( s[len-1] );
167 				out[outpos] = '\0';
168 				newbv->bv_val = out;
169 				newbv->bv_len = outpos;
170 				return newbv;
171 			}
172 		} else {
173 			for ( i = 1; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
174 				/* empty */
175 			}
176 
177 			if ( i == len ) {
178 				return ber_str2bv_x( s, len, 1, newbv, ctx );
179 			}
180 
181 			outsize = len + 7;
182 			out = (char *) ber_memalloc_x( outsize, ctx );
183 			if ( out == NULL ) {
184 				goto fail;
185 			}
186 			outpos = i - 1;
187 			memcpy(out, s, outpos);
188 		}
189 	} else {
190 		outsize = len + 7;
191 		out = (char *) ber_memalloc_x( outsize, ctx );
192 		if ( out == NULL ) {
193 			goto fail;
194 		}
195 		outpos = 0;
196 		i = 0;
197 	}
198 
199 	p = ucs = ber_memalloc_x( len * sizeof(*ucs), ctx );
200 	if ( ucs == NULL ) {
201 		ber_memfree_x(out, ctx);
202 		goto fail;
203 	}
204 
205 	/* convert character before first non-ascii to ucs-4 */
206 	if ( i > 0 ) {
207 		*p = casefold ? TOLOWER( s[i-1] ) : s[i-1];
208 		p++;
209 	}
210 
211 	/* s[i] is now first non-ascii character */
212 	for (;;) {
213 		/* s[i] is non-ascii */
214 		/* convert everything up to next ascii to ucs-4 */
215 		while ( i < len ) {
216 			clen = LDAP_UTF8_CHARLEN2( s + i, clen );
217 			if ( clen == 0 ) {
218 				ber_memfree_x( ucs, ctx );
219 				ber_memfree_x( out, ctx );
220 				goto fail;
221 			}
222 			if ( clen == 1 ) {
223 				/* ascii */
224 				break;
225 			}
226 			*p = s[i] & mask[clen];
227 			i++;
228 			for( j = 1; j < clen; j++ ) {
229 				if ( (s[i] & 0xc0) != 0x80 ) {
230 					ber_memfree_x( ucs, ctx );
231 					ber_memfree_x( out, ctx );
232 					goto fail;
233 				}
234 				*p <<= 6;
235 				*p |= s[i] & 0x3f;
236 				i++;
237 			}
238 			if ( casefold ) {
239 				*p = uctolower( *p );
240 			}
241 			p++;
242 		}
243 		/* normalize ucs of length p - ucs */
244 		uccompatdecomp( ucs, p - ucs, &ucsout, &ucsoutlen, ctx );
245 		if ( approx ) {
246 			for ( j = 0; j < ucsoutlen; j++ ) {
247 				if ( ucsout[j] < 0x80 ) {
248 					out[outpos++] = ucsout[j];
249 				}
250 			}
251 		} else {
252 			ucsoutlen = uccanoncomp( ucsout, ucsoutlen );
253 			/* convert ucs to utf-8 and store in out */
254 			for ( j = 0; j < ucsoutlen; j++ ) {
255 				/* allocate more space if not enough room for
256 				   6 bytes and terminator */
257 				if ( outsize - outpos < 7 ) {
258 					outsize = ucsoutlen - j + outpos + 6;
259 					outtmp = (char *) ber_memrealloc_x( out, outsize, ctx );
260 					if ( outtmp == NULL ) {
261 						ber_memfree_x( ucsout, ctx );
262 						ber_memfree_x( ucs, ctx );
263 						ber_memfree_x( out, ctx );
264 						goto fail;
265 					}
266 					out = outtmp;
267 				}
268 				outpos += ldap_x_ucs4_to_utf8( ucsout[j], &out[outpos] );
269 			}
270 		}
271 
272 		ber_memfree_x( ucsout, ctx );
273 		ucsout = NULL;
274 
275 		if ( i == len ) {
276 			break;
277 		}
278 
279 		last = i;
280 
281 		/* Allocate more space in out if necessary */
282 		if (len - i >= outsize - outpos) {
283 			outsize += 1 + ((len - i) - (outsize - outpos));
284 			outtmp = (char *) ber_memrealloc_x(out, outsize, ctx);
285 			if (outtmp == NULL) {
286 				ber_memfree_x( ucs, ctx );
287 				ber_memfree_x( out, ctx );
288 				goto fail;
289 			}
290 			out = outtmp;
291 		}
292 
293 		/* s[i] is ascii */
294 		/* finish off everything up to char before next non-ascii */
295 		for ( i++; (i < len) && LDAP_UTF8_ISASCII(s + i); i++ ) {
296 			out[outpos++] = casefold ? TOLOWER( s[i-1] ) : s[i-1];
297 		}
298 		if ( i == len ) {
299 			out[outpos++] = casefold ? TOLOWER( s[len-1] ) : s[len-1];
300 			break;
301 		}
302 
303 		/* convert character before next non-ascii to ucs-4 */
304 		*ucs = casefold ? TOLOWER( s[i-1] ) : s[i-1];
305 		p = ucs + 1;
306 	}
307 
308 	ber_memfree_x( ucs, ctx );
309 	out[outpos] = '\0';
310 	newbv->bv_val = out;
311 	newbv->bv_len = outpos;
312 	return newbv;
313 }
314 
315 /* compare UTF8-strings, optionally ignore casing */
316 /* slow, should be optimized */
UTF8bvnormcmp(struct berval * bv1,struct berval * bv2,unsigned flags,void * ctx)317 int UTF8bvnormcmp(
318 	struct berval *bv1,
319 	struct berval *bv2,
320 	unsigned flags,
321 	void *ctx )
322 {
323 	int i, l1, l2, len, ulen, res = 0;
324 	char *s1, *s2, *done;
325 	ac_uint4 *ucs, *ucsout1, *ucsout2;
326 
327 	unsigned casefold = flags & LDAP_UTF8_CASEFOLD;
328 	unsigned norm1 = flags & LDAP_UTF8_ARG1NFC;
329 	unsigned norm2 = flags & LDAP_UTF8_ARG2NFC;
330 
331 	if (bv1 == NULL) {
332 		return bv2 == NULL ? 0 : -1;
333 
334 	} else if (bv2 == NULL) {
335 		return 1;
336 	}
337 
338 	l1 = bv1->bv_len;
339 	l2 = bv2->bv_len;
340 
341 	len = (l1 < l2) ? l1 : l2;
342 	if (len == 0) {
343 		return l1 == 0 ? (l2 == 0 ? 0 : -1) : 1;
344 	}
345 
346 	s1 = bv1->bv_val;
347 	s2 = bv2->bv_val;
348 	done = s1 + len;
349 
350 	while ( (s1 < done) && LDAP_UTF8_ISASCII(s1) && LDAP_UTF8_ISASCII(s2) ) {
351 		if (casefold) {
352 			char c1 = TOLOWER(*s1);
353 			char c2 = TOLOWER(*s2);
354 			res = c1 - c2;
355 		} else {
356 			res = *s1 - *s2;
357 		}
358 		s1++;
359 		s2++;
360 		if (res) {
361 			/* done unless next character in s1 or s2 is non-ascii */
362 			if (s1 < done) {
363 				if (!LDAP_UTF8_ISASCII(s1) || !LDAP_UTF8_ISASCII(s2)) {
364 					break;
365 				}
366 			} else if (((len < l1) && !LDAP_UTF8_ISASCII(s1)) ||
367 				((len < l2) && !LDAP_UTF8_ISASCII(s2)))
368 			{
369 				break;
370 			}
371 			return res;
372 		}
373 	}
374 
375 	/* We have encountered non-ascii or strings equal up to len */
376 
377 	/* set i to number of iterations */
378 	i = s1 - done + len;
379 	/* passed through loop at least once? */
380 	if (i > 0) {
381 		if (!res && (s1 == done) &&
382 		    ((len == l1) || LDAP_UTF8_ISASCII(s1)) &&
383 		    ((len == l2) || LDAP_UTF8_ISASCII(s2))) {
384 			/* all ascii and equal up to len */
385 			return l1 - l2;
386 		}
387 
388 		/* rewind one char, and do normalized compare from there */
389 		s1--;
390 		s2--;
391 		l1 -= i - 1;
392 		l2 -= i - 1;
393 	}
394 
395 	/* Should first check to see if strings are already in
396 	 * proper normalized form.
397 	 */
398 	ucs = malloc( ( ( norm1 || l1 > l2 ) ? l1 : l2 ) * sizeof(*ucs) );
399 	if ( ucs == NULL ) {
400 		return l1 > l2 ? 1 : -1; /* what to do??? */
401 	}
402 
403 	/*
404 	 * XXYYZ: we convert to ucs4 even though -llunicode
405 	 * expects ucs2 in an ac_uint4
406 	 */
407 
408 	/* convert and normalize 1st string */
409 	for ( i = 0, ulen = 0; i < l1; i += len, ulen++ ) {
410 		ucs[ulen] = ldap_x_utf8_to_ucs4( s1 + i );
411 		if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
412 			free( ucs );
413 			return -1; /* what to do??? */
414 		}
415 		len = LDAP_UTF8_CHARLEN( s1 + i );
416 	}
417 
418 	if ( norm1 ) {
419 		ucsout1 = ucs;
420 		l1 = ulen;
421 		ucs = malloc( l2 * sizeof(*ucs) );
422 		if ( ucs == NULL ) {
423 			free( ucsout1 );
424 			return l1 > l2 ? 1 : -1; /* what to do??? */
425 		}
426 	} else {
427 		uccompatdecomp( ucs, ulen, &ucsout1, &l1, ctx );
428 		l1 = uccanoncomp( ucsout1, l1 );
429 	}
430 
431 	/* convert and normalize 2nd string */
432 	for ( i = 0, ulen = 0; i < l2; i += len, ulen++ ) {
433 		ucs[ulen] = ldap_x_utf8_to_ucs4( s2 + i );
434 		if ( ucs[ulen] == LDAP_UCS4_INVALID ) {
435 			free( ucsout1 );
436 			free( ucs );
437 			return 1; /* what to do??? */
438 		}
439 		len = LDAP_UTF8_CHARLEN( s2 + i );
440 	}
441 
442 	if ( norm2 ) {
443 		ucsout2 = ucs;
444 		l2 = ulen;
445 	} else {
446 		uccompatdecomp( ucs, ulen, &ucsout2, &l2, ctx );
447 		l2 = uccanoncomp( ucsout2, l2 );
448 		free( ucs );
449 	}
450 
451 	res = casefold
452 		? ucstrncasecmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 )
453 		: ucstrncmp( ucsout1, ucsout2, l1 < l2 ? l1 : l2 );
454 	free( ucsout1 );
455 	free( ucsout2 );
456 
457 	if ( res != 0 ) {
458 		return res;
459 	}
460 	if ( l1 == l2 ) {
461 		return 0;
462 	}
463 	return l1 > l2 ? 1 : -1;
464 }
465