xref: /netbsd-src/external/bsd/openldap/dist/libraries/libldap/utf-8.c (revision 549b59ed3ccf0d36d3097190a0db27b770f3a839)
1 /*	$NetBSD: utf-8.c,v 1.3 2021/08/14 16:14:56 christos Exp $	*/
2 
3 /* utf-8.c -- Basic UTF-8 routines */
4 /* $OpenLDAP$ */
5 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
6  *
7  * Copyright 1998-2021 The OpenLDAP Foundation.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted only as authorized by the OpenLDAP
12  * Public License.
13  *
14  * A copy of this license is available in the file LICENSE in the
15  * top-level directory of the distribution or, alternatively, at
16  * <http://www.OpenLDAP.org/license.html>.
17  */
18 /* Basic UTF-8 routines
19  *
20  * These routines are "dumb".  Though they understand UTF-8,
21  * they don't grok Unicode.  That is, they can push bits,
22  * but don't have a clue what the bits represent.  That's
23  * good enough for use with the LDAP Client SDK.
24  *
25  * These routines are not optimized.
26  */
27 
28 #include <sys/cdefs.h>
29 __RCSID("$NetBSD: utf-8.c,v 1.3 2021/08/14 16:14:56 christos Exp $");
30 
31 #include "portable.h"
32 
33 #include <stdio.h>
34 
35 #include <ac/stdlib.h>
36 
37 #include <ac/socket.h>
38 #include <ac/string.h>
39 #include <ac/time.h>
40 
41 #include "ldap_utf8.h"
42 
43 #include "ldap-int.h"
44 #include "ldap_defaults.h"
45 
46 /*
47  * return the number of bytes required to hold the
48  * NULL-terminated UTF-8 string NOT INCLUDING the
49  * termination.
50  */
ldap_utf8_bytes(const char * p)51 ber_len_t ldap_utf8_bytes( const char * p )
52 {
53 	ber_len_t bytes;
54 
55 	for( bytes=0; p[bytes]; bytes++ ) {
56 		/* EMPTY */ ;
57 	}
58 
59 	return bytes;
60 }
61 
ldap_utf8_chars(const char * p)62 ber_len_t ldap_utf8_chars( const char * p )
63 {
64 	/* could be optimized and could check for invalid sequences */
65 	ber_len_t chars=0;
66 
67 	for( ; *p ; LDAP_UTF8_INCR(p) ) {
68 		chars++;
69 	}
70 
71 	return chars;
72 }
73 
/* Return the distance, in bytes, from p to the character after it. */
int ldap_utf8_offset( const char * p )
{
	const char *following = LDAP_UTF8_NEXT(p);

	return following - p;
}
79 
/*
 * Sequence length announced by a non-ASCII first byte.
 *
 * The table is indexed by the first byte with its top bit flipped
 * (byte ^ 0x80), so it covers bytes 0x80 through 0xff.  An entry of
 * zero means the byte cannot begin a sequence.
 */
const char ldap_utf8_lentab[] = {
	/* 0x80 - 0xbf */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xc0 - 0xdf */
	0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xe0 - 0xef */
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xf0 - 0xff */
	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};

/*
 * Return the length in bytes of the character starting at p as
 * indicated by its first byte alone: 1 for an ASCII byte, otherwise
 * the ldap_utf8_lentab[] entry (0 when the byte cannot start a
 * sequence).  Bytes after the first are not examined.
 */
int ldap_utf8_charlen( const char * p )
{
	const unsigned char lead = *(const unsigned char *)p;

	if ( lead < 0x80 ) {
		return 1;
	}

	return ldap_utf8_lentab[lead ^ 0x80];
}
100 
101 /*
102  * Make sure the UTF-8 char used the shortest possible encoding
103  * returns charlen if valid, 0 if not.
104  *
105  * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
106  * The table is slightly modified from that of the RFC.
107  *
108  * UCS-4 range (hex)      UTF-8 sequence (binary)
109  * 0000 0000-0000 007F   0.......
110  * 0000 0080-0000 07FF   110++++. 10......
111  * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
112  * 0001 0000-001F FFFF   11110+++ 10++.... 10...... 10......
113  * 0020 0000-03FF FFFF   111110++ 10+++... 10...... 10...... 10......
114  * 0400 0000-7FFF FFFF   1111110+ 10++++.. 10...... 10...... 10...... 10......
115  *
116  * The '.' bits are "don't cares". When validating a UTF-8 sequence,
117  * at least one of the '+' bits must be set, otherwise the character
118  * should have been encoded in fewer octets. Note that in the two-octet
119  * case, only the first octet needs to be validated, and this is done
120  * in the ldap_utf8_lentab[] above.
121  */
122 
/*
 * Mask of required bits in the second octet, indexed by the low five
 * bits of the first octet.  A sequence of three or more octets is in
 * shortest form only if its second octet has at least one bit in
 * common with the entry selected by its first octet.
 */
#undef c	/* as before, no macro named c is left defined past this table */
const char ldap_utf8_mintab[] = {
	/* first octet 0xe0 - 0xef (three octets) */
	(const char)0x20, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
	/* first octet 0xf0 - 0xf7 (four octets) */
	(const char)0x30, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
	/* first octet 0xf8 - 0xfb (five octets) */
	(const char)0x38, (const char)0x80, (const char)0x80, (const char)0x80,
	/* first octet 0xfc - 0xff */
	(const char)0x3c, (const char)0x80, (const char)0x00, (const char)0x00
};
132 
ldap_utf8_charlen2(const char * p)133 int ldap_utf8_charlen2( const char * p )
134 {
135 	int i = LDAP_UTF8_CHARLEN( p );
136 
137 	if ( i > 2 ) {
138 		if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
139 			i = 0;
140 	}
141 	return i;
142 }
143 
144 /* conv UTF-8 to UCS-4, useful for comparisons */
ldap_x_utf8_to_ucs4(const char * p)145 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
146 {
147     const unsigned char *c = (const unsigned char *) p;
148     ldap_ucs4_t ch;
149 	int len, i;
150 	static unsigned char mask[] = {
151 		0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
152 
153 	len = LDAP_UTF8_CHARLEN2(p, len);
154 
155 	if( len == 0 ) return LDAP_UCS4_INVALID;
156 
157 	ch = c[0] & mask[len];
158 
159 	for(i=1; i < len; i++) {
160 		if ((c[i] & 0xc0) != 0x80) {
161 			return LDAP_UCS4_INVALID;
162 		}
163 
164 		ch <<= 6;
165 		ch |= c[i] & 0x3f;
166 	}
167 
168 	return ch;
169 }
170 
171 /* conv UCS-4 to UTF-8, not used */
ldap_x_ucs4_to_utf8(ldap_ucs4_t c,char * buf)172 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
173 {
174 	int len=0;
175 	unsigned char* p = (unsigned char *) buf;
176 
177 	/* not a valid Unicode character */
178 	if ( c < 0 ) return 0;
179 
180 	/* Just return length, don't convert */
181 	if(buf == NULL) {
182 		if( c < 0x80 ) return 1;
183 		else if( c < 0x800 ) return 2;
184 		else if( c < 0x10000 ) return 3;
185 		else if( c < 0x200000 ) return 4;
186 		else if( c < 0x4000000 ) return 5;
187 		else return 6;
188 	}
189 
190 	if( c < 0x80 ) {
191 		p[len++] = c;
192 
193 	} else if( c < 0x800 ) {
194 		p[len++] = 0xc0 | ( c >> 6 );
195 		p[len++] = 0x80 | ( c & 0x3f );
196 
197 	} else if( c < 0x10000 ) {
198 		p[len++] = 0xe0 | ( c >> 12 );
199 		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
200 		p[len++] = 0x80 | ( c & 0x3f );
201 
202 	} else if( c < 0x200000 ) {
203 		p[len++] = 0xf0 | ( c >> 18 );
204 		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
205 		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
206 		p[len++] = 0x80 | ( c & 0x3f );
207 
208 	} else if( c < 0x4000000 ) {
209 		p[len++] = 0xf8 | ( c >> 24 );
210 		p[len++] = 0x80 | ( (c >> 18) & 0x3f );
211 		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
212 		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
213 		p[len++] = 0x80 | ( c & 0x3f );
214 
215 	} else /* if( c < 0x80000000 ) */ {
216 		p[len++] = 0xfc | ( c >> 30 );
217 		p[len++] = 0x80 | ( (c >> 24) & 0x3f );
218 		p[len++] = 0x80 | ( (c >> 18) & 0x3f );
219 		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
220 		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
221 		p[len++] = 0x80 | ( c & 0x3f );
222 	}
223 
224 	return len;
225 }
226 
/*
 * Number of octets needed to encode the UCS-4 value c as UTF-8:
 * 0 for a negative value, otherwise 1 to 6.
 *
 * The argument and the whole expansion are parenthesized so the macro
 * can be used inside larger expressions and with compound arguments.
 * Note that c is evaluated more than once; do not pass an expression
 * with side effects.
 */
#define LDAP_UCS_UTF8LEN(c)	\
	((c) < 0 ? 0 : ((c) < 0x80 ? 1 : ((c) < 0x800 ? 2 : ((c) < 0x10000 ? 3 : \
	((c) < 0x200000 ? 4 : ((c) < 0x4000000 ? 5 : 6))))))
230 
231 /* Convert a string to UTF-8 format. The input string is expected to
232  * have characters of 1, 2, or 4 octets (in network byte order)
233  * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING
234  * types respectively. (Here T61STRING just means that there is one
235  * octet per character and characters may use the high bit of the octet.
236  * The characters are assumed to use ISO mappings, no provision is made
237  * for converting from T.61 coding rules to Unicode.)
238  */
239 
240 int
ldap_ucs_to_utf8s(struct berval * ucs,int csize,struct berval * utf8s)241 ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s )
242 {
243 	unsigned char *in, *end;
244 	char *ptr;
245 	ldap_ucs4_t u;
246 	int i, l = 0;
247 
248 	utf8s->bv_val = NULL;
249 	utf8s->bv_len = 0;
250 
251 	in = (unsigned char *)ucs->bv_val;
252 
253 	/* Make sure we stop at an even multiple of csize */
254 	end = in + ( ucs->bv_len & ~(csize-1) );
255 
256 	for (; in < end; ) {
257 		u = *in++;
258 		if (csize > 1) {
259 			u <<= 8;
260 			u |= *in++;
261 		}
262 		if (csize > 2) {
263 			u <<= 8;
264 			u |= *in++;
265 			u <<= 8;
266 			u |= *in++;
267 		}
268 		i = LDAP_UCS_UTF8LEN(u);
269 		if (i == 0)
270 			return LDAP_INVALID_SYNTAX;
271 		l += i;
272 	}
273 
274 	utf8s->bv_val = LDAP_MALLOC( l+1 );
275 	if (utf8s->bv_val == NULL)
276 		return LDAP_NO_MEMORY;
277 	utf8s->bv_len = l;
278 
279 	ptr = utf8s->bv_val;
280 	for (in = (unsigned char *)ucs->bv_val; in < end; ) {
281 		u = *in++;
282 		if (csize > 1) {
283 			u <<= 8;
284 			u |= *in++;
285 		}
286 		if (csize > 2) {
287 			u <<= 8;
288 			u |= *in++;
289 			u <<= 8;
290 			u |= *in++;
291 		}
292 		ptr += ldap_x_ucs4_to_utf8(u, ptr);
293 	}
294 	*ptr = '\0';
295 	return LDAP_SUCCESS;
296 }
297 
/*
 * Advance to the next UTF-8 character
 *
 * The length announced by the first byte is ignored; the start of
 * the next character is located by skipping continuation bytes
 * (10xxxxxx) instead.  This allows resynchronizing after an invalid
 * character, provided the next character starts within the 6 bytes
 * examined.  If it does not, a pointer 6 bytes past p is returned.
 */
char* ldap_utf8_next( const char * p )
{
	const unsigned char *u = (const unsigned char *) p;
	int i = 1;

	/* an ASCII byte is always a complete character of its own */
	if( !LDAP_UTF8_ISASCII(u) ) {
		while( i < 6 && ( u[i] & 0xc0 ) == 0x80 ) {
			i++;
		}
	}

	return (char *) &p[i];
}
324 
/*
 * Step back to the previous UTF-8 character
 *
 * Sequence lengths are ignored; the start of the previous character
 * is located by moving backwards over continuation bytes (10xxxxxx).
 * This allows resynchronizing after an invalid character, provided
 * the previous character starts within the bytes examined.  At most
 * the 5 bytes before p are inspected; if all of them are continuation
 * bytes a pointer 6 bytes before p is returned.  The bytes before p
 * must be readable.
 */
char* ldap_utf8_prev( const char * p )
{
	const unsigned char *u = (const unsigned char *) p;
	int back = -1;

	while( back > -6 && ( u[back] & 0xc0 ) == 0x80 ) {
		back--;
	}

	return (char *) &p[back];
}
347 
/*
 * Copy one UTF-8 character from src to dst and return the number
 * of bytes copied.  No terminating NUL is written to dst.
 *
 * The length announced by the first byte is ignored; the first byte
 * is always copied and, unless it is ASCII, so are the continuation
 * bytes (10xxxxxx) that follow it, up to 6 bytes in total.  This
 * allows resynchronizing after an invalid character, provided the
 * next character starts within the 6 bytes examined.
 */
int ldap_utf8_copy( char* dst, const char *src )
{
	const unsigned char *u = (const unsigned char *) src;
	int n = 1;

	dst[0] = src[0];

	if( !LDAP_UTF8_ISASCII(u) ) {
		while( n < 6 && ( u[n] & 0xc0 ) == 0x80 ) {
			dst[n] = src[n];
			n++;
		}
	}

	return n;
}
378 
379 #ifndef UTF8_ALPHA_CTYPE
380 /*
381  * UTF-8 ctype routines
382  * Only deals with characters < 0x80 (ie: US-ASCII)
383  */
384 
/* Apply LDAP_ASCII() to the first byte at p and return its result. */
int ldap_utf8_isascii( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet);
}
390 
/* Result of LDAP_DIGIT() for the first byte at p; 0 if it is not ASCII. */
int ldap_utf8_isdigit( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_DIGIT( octet );
	}

	return 0;
}
399 
/* Result of LDAP_HEX() for the first byte at p; 0 if it is not ASCII. */
int ldap_utf8_isxdigit( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_HEX(octet);
	}

	return 0;
}
408 
/*
 * Return 1 if the first byte at p is an ASCII space, tab, newline,
 * carriage return, vertical tab or form feed; 0 otherwise.
 */
int ldap_utf8_isspace( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if( !LDAP_ASCII(octet) ) {
		return 0;
	}

	return octet == ' '  || octet == '\t' || octet == '\n'
		|| octet == '\r' || octet == '\v' || octet == '\f';
}
427 
428 /*
429  * These are not needed by the C SDK and are
430  * not "good enough" for general use.
431  */
/* Result of LDAP_ALPHA() for the first byte at p; 0 if it is not ASCII. */
int ldap_utf8_isalpha( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_ALPHA(octet);
	}

	return 0;
}
440 
/* Result of LDAP_ALNUM() for the first byte at p; 0 if it is not ASCII. */
int ldap_utf8_isalnum( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_ALNUM(octet);
	}

	return 0;
}
449 
/* Result of LDAP_LOWER() for the first byte at p; 0 if it is not ASCII. */
int ldap_utf8_islower( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_LOWER(octet);
	}

	return 0;
}
458 
/* Result of LDAP_UPPER() for the first byte at p; 0 if it is not ASCII. */
int ldap_utf8_isupper( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_UPPER(octet);
	}

	return 0;
}
467 #endif
468 
469 
470 /*
471  * UTF-8 string routines
472  */
473 
474 /* like strchr() */
475 char * (ldap_utf8_strchr)( const char *str, const char *chr )
476 {
477 	for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
478 		if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
479 			return (char *) str;
480 		}
481 	}
482 
483 	return NULL;
484 }
485 
486 /* like strcspn() but returns number of bytes, not characters */
ber_len_t(ldap_utf8_strcspn)487 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
488 {
489 	const char *cstr;
490 	const char *cset;
491 
492 	for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
493 		for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
494 			if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
495 				return cstr - str;
496 			}
497 		}
498 	}
499 
500 	return cstr - str;
501 }
502 
503 /* like strspn() but returns number of bytes, not characters */
ber_len_t(ldap_utf8_strspn)504 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
505 {
506 	const char *cstr;
507 	const char *cset;
508 
509 	for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
510 		for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
511 			if( *cset == '\0' ) {
512 				return cstr - str;
513 			}
514 
515 			if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
516 				break;
517 			}
518 		}
519 	}
520 
521 	return cstr - str;
522 }
523 
524 /* like strpbrk(), replaces strchr() as well */
525 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
526 {
527 	for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
528 		const char *cset;
529 
530 		for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
531 			if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
532 				return (char *) str;
533 			}
534 		}
535 	}
536 
537 	return NULL;
538 }
539 
/*
 * like strtok_r(), not strtok()
 *
 * str   string to tokenize on the first call, NULL to continue with
 *       the position saved in *last
 * sep   UTF-8 string holding the separator characters
 * last  caller-owned slot holding the scan position between calls
 *
 * Returns the next token, NUL-terminated in place (the scanned string
 * is modified), or NULL when last is NULL or no token remains.  Once
 * the tokens are exhausted *last is set to NULL, and further calls
 * with str == NULL keep returning NULL.
 */
char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
{
	char *begin;
	char *end;

	if( last == NULL ) return NULL;

	begin = str ? str : *last;

	/* continuing after the end was reached: nothing left to scan */
	if( begin == NULL ) return NULL;

	/* skip separators in front of the token */
	begin += ldap_utf8_strspn( begin, sep );

	if( *begin == '\0' ) {
		*last = NULL;
		return NULL;
	}

	end = &begin[ ldap_utf8_strcspn( begin, sep ) ];

	if( *end != '\0' ) {
		/* find the resume point before the separator is overwritten */
		char *next = LDAP_UTF8_NEXT( end );
		*end = '\0';
		end = next;
	}

	*last = end;
	return begin;
}
568