xref: /netbsd-src/external/bsd/openldap/dist/libraries/libldap/utf-8.c (revision b1c86f5f087524e68db12794ee9c3e3da1ab17a0)
1 /*	$NetBSD: utf-8.c,v 1.1.1.2 2010/03/08 02:14:20 lukem Exp $	*/
2 
3 /* utf-8.c -- Basic UTF-8 routines */
4 /* OpenLDAP: pkg/ldap/libraries/libldap/utf-8.c,v 1.36.2.4 2009/01/22 00:00:56 kurt Exp */
5 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
6  *
7  * Copyright 1998-2009 The OpenLDAP Foundation.
8  * All rights reserved.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted only as authorized by the OpenLDAP
12  * Public License.
13  *
14  * A copy of this license is available in the file LICENSE in the
15  * top-level directory of the distribution or, alternatively, at
16  * <http://www.OpenLDAP.org/license.html>.
17  */
18 /* Basic UTF-8 routines
19  *
20  * These routines are "dumb".  Though they understand UTF-8,
21  * they don't grok Unicode.  That is, they can push bits,
22  * but don't have a clue what the bits represent.  That's
23  * good enough for use with the LDAP Client SDK.
24  *
25  * These routines are not optimized.
26  */
27 
28 #include "portable.h"
29 
30 #include <stdio.h>
31 
32 #include <ac/stdlib.h>
33 
34 #include <ac/socket.h>
35 #include <ac/string.h>
36 #include <ac/time.h>
37 
38 #include "ldap_utf8.h"
39 
40 #include "ldap-int.h"
41 #include "ldap_defaults.h"
42 
43 /*
44  * return the number of bytes required to hold the
45  * NULL-terminated UTF-8 string NOT INCLUDING the
46  * termination.
47  */
48 ber_len_t ldap_utf8_bytes( const char * p )
49 {
50 	ber_len_t bytes;
51 
52 	for( bytes=0; p[bytes]; bytes++ ) {
53 		/* EMPTY */ ;
54 	}
55 
56 	return bytes;
57 }
58 
59 ber_len_t ldap_utf8_chars( const char * p )
60 {
61 	/* could be optimized and could check for invalid sequences */
62 	ber_len_t chars=0;
63 
64 	for( ; *p ; LDAP_UTF8_INCR(p) ) {
65 		chars++;
66 	}
67 
68 	return chars;
69 }
70 
/* return the distance in octets from p to the start of the next character */
int ldap_utf8_offset( const char * p )
{
	const char *following = LDAP_UTF8_NEXT(p);

	return following - p;
}
76 
/*
 * Sequence length announced by a first octet that has its high bit set.
 *
 * The table is indexed by (octet ^ 0x80), i.e. by the low seven bits
 * of the octet.  A zero entry marks an octet that cannot start a
 * character: 0x80-0xBF, 0xC0, 0xC1, 0xFE and 0xFF.
 */
const char ldap_utf8_lentab[] = {
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };

/*
 * Return the length in octets of the character starting at p, judged
 * from its first octet only: 1 when the high bit is clear, otherwise
 * the ldap_utf8_lentab[] entry (0 if the octet cannot start a
 * character).
 */
int ldap_utf8_charlen( const char * p )
{
	const unsigned char lead = *(const unsigned char *)p;

	if ( lead < 0x80 )
		return 1;

	return ldap_utf8_lentab[lead ^ 0x80];
}
97 
98 /*
99  * Make sure the UTF-8 char used the shortest possible encoding
100  * returns charlen if valid, 0 if not.
101  *
102  * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
103  * The table is slightly modified from that of the RFC.
104  *
105  * UCS-4 range (hex)      UTF-8 sequence (binary)
106  * 0000 0000-0000 007F   0.......
107  * 0000 0080-0000 07FF   110++++. 10......
108  * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
109  * 0001 0000-001F FFFF   11110+++ 10++.... 10...... 10......
110  * 0020 0000-03FF FFFF   111110++ 10+++... 10...... 10...... 10......
111  * 0400 0000-7FFF FFFF   1111110+ 10++++.. 10...... 10...... 10...... 10......
112  *
113  * The '.' bits are "don't cares". When validating a UTF-8 sequence,
114  * at least one of the '+' bits must be set, otherwise the character
115  * should have been encoded in fewer octets. Note that in the two-octet
116  * case, only the first octet needs to be validated, and this is done
117  * in the ldap_utf8_lentab[] above.
118  */
119 
/*
 * Mask of required bits in the second octet.
 *
 * The table is indexed by the low five bits of the first octet (see
 * the lookup in ldap_utf8_charlen2()).  A sequence of three or more
 * octets is rejected there when (mask & second octet) is zero.
 *
 *   index 0x00 (first octet 0xE0): 0x20
 *   index 0x10 (first octet 0xF0): 0x30
 *   index 0x18 (first octet 0xF8): 0x38
 *   index 0x1c (first octet 0xFC): 0x3c
 *   index 0x1e, 0x1f             : 0x00
 *   every other index            : 0x80
 */
/* 'c' is a temporary shorthand for "const char", used both as the
 * element type and as the cast applied to each initializer. */
#undef c
#define c const char
c ldap_utf8_mintab[] = {
	(c)0x20, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
	(c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
	(c)0x30, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
	(c)0x38, (c)0x80, (c)0x80, (c)0x80, (c)0x3c, (c)0x80, (c)0x00, (c)0x00 };
/* drop the shorthand again so it cannot leak into later code */
#undef c
129 
130 int ldap_utf8_charlen2( const char * p )
131 {
132 	int i = LDAP_UTF8_CHARLEN( p );
133 
134 	if ( i > 2 ) {
135 		if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
136 			i = 0;
137 	}
138 	return i;
139 }
140 
141 /* conv UTF-8 to UCS-4, useful for comparisons */
142 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
143 {
144     const unsigned char *c = (const unsigned char *) p;
145     ldap_ucs4_t ch;
146 	int len, i;
147 	static unsigned char mask[] = {
148 		0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
149 
150 	len = LDAP_UTF8_CHARLEN2(p, len);
151 
152 	if( len == 0 ) return LDAP_UCS4_INVALID;
153 
154 	ch = c[0] & mask[len];
155 
156 	for(i=1; i < len; i++) {
157 		if ((c[i] & 0xc0) != 0x80) {
158 			return LDAP_UCS4_INVALID;
159 		}
160 
161 		ch <<= 6;
162 		ch |= c[i] & 0x3f;
163 	}
164 
165 	return ch;
166 }
167 
168 /* conv UCS-4 to UTF-8, not used */
169 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
170 {
171 	int len=0;
172 	unsigned char* p = (unsigned char *) buf;
173 
174 	/* not a valid Unicode character */
175 	if ( c < 0 ) return 0;
176 
177 	/* Just return length, don't convert */
178 	if(buf == NULL) {
179 		if( c < 0x80 ) return 1;
180 		else if( c < 0x800 ) return 2;
181 		else if( c < 0x10000 ) return 3;
182 		else if( c < 0x200000 ) return 4;
183 		else if( c < 0x4000000 ) return 5;
184 		else return 6;
185 	}
186 
187 	if( c < 0x80 ) {
188 		p[len++] = c;
189 
190 	} else if( c < 0x800 ) {
191 		p[len++] = 0xc0 | ( c >> 6 );
192 		p[len++] = 0x80 | ( c & 0x3f );
193 
194 	} else if( c < 0x10000 ) {
195 		p[len++] = 0xe0 | ( c >> 12 );
196 		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
197 		p[len++] = 0x80 | ( c & 0x3f );
198 
199 	} else if( c < 0x200000 ) {
200 		p[len++] = 0xf0 | ( c >> 18 );
201 		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
202 		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
203 		p[len++] = 0x80 | ( c & 0x3f );
204 
205 	} else if( c < 0x4000000 ) {
206 		p[len++] = 0xf8 | ( c >> 24 );
207 		p[len++] = 0x80 | ( (c >> 18) & 0x3f );
208 		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
209 		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
210 		p[len++] = 0x80 | ( c & 0x3f );
211 
212 	} else /* if( c < 0x80000000 ) */ {
213 		p[len++] = 0xfc | ( c >> 30 );
214 		p[len++] = 0x80 | ( (c >> 24) & 0x3f );
215 		p[len++] = 0x80 | ( (c >> 18) & 0x3f );
216 		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
217 		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
218 		p[len++] = 0x80 | ( c & 0x3f );
219 	}
220 
221 	return len;
222 }
223 
/*
 * Number of octets needed to encode the UCS-4 value c in UTF-8,
 * or 0 if c is negative.
 *
 * The argument and the whole expansion are parenthesized so that the
 * macro can be given an arbitrary expression and can itself be used
 * inside a larger expression (e.g. LDAP_UCS_UTF8LEN(x) == 0).
 * Note that c is evaluated several times; pass no side effects.
 */
#define LDAP_UCS_UTF8LEN(c)	\
	((c) < 0 ? 0 : ((c) < 0x80 ? 1 : ((c) < 0x800 ? 2 : ((c) < 0x10000 ? 3 : \
	((c) < 0x200000 ? 4 : ((c) < 0x4000000 ? 5 : 6))))))
227 
228 /* Convert a string to UTF-8 format. The input string is expected to
229  * have characters of 1, 2, or 4 octets (in network byte order)
230  * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING
231  * types respectively. (Here T61STRING just means that there is one
232  * octet per character and characters may use the high bit of the octet.
233  * The characters are assumed to use ISO mappings, no provision is made
234  * for converting from T.61 coding rules to Unicode.)
235  */
236 
237 int
238 ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s )
239 {
240 	unsigned char *in, *end;
241 	char *ptr;
242 	ldap_ucs4_t u;
243 	int i, l = 0;
244 
245 	utf8s->bv_val = NULL;
246 	utf8s->bv_len = 0;
247 
248 	in = (unsigned char *)ucs->bv_val;
249 
250 	/* Make sure we stop at an even multiple of csize */
251 	end = in + ( ucs->bv_len & ~(csize-1) );
252 
253 	for (; in < end; ) {
254 		u = *in++;
255 		if (csize > 1) {
256 			u <<= 8;
257 			u |= *in++;
258 		}
259 		if (csize > 2) {
260 			u <<= 8;
261 			u |= *in++;
262 			u <<= 8;
263 			u |= *in++;
264 		}
265 		i = LDAP_UCS_UTF8LEN(u);
266 		if (i == 0)
267 			return LDAP_INVALID_SYNTAX;
268 		l += i;
269 	}
270 
271 	utf8s->bv_val = LDAP_MALLOC( l+1 );
272 	if (utf8s->bv_val == NULL)
273 		return LDAP_NO_MEMORY;
274 	utf8s->bv_len = l;
275 
276 	ptr = utf8s->bv_val;
277 	for (in = (unsigned char *)ucs->bv_val; in < end; ) {
278 		u = *in++;
279 		if (csize > 1) {
280 			u <<= 8;
281 			u |= *in++;
282 		}
283 		if (csize > 2) {
284 			u <<= 8;
285 			u |= *in++;
286 			u <<= 8;
287 			u |= *in++;
288 		}
289 		ptr += ldap_x_ucs4_to_utf8(u, ptr);
290 	}
291 	*ptr = '\0';
292 	return LDAP_SUCCESS;
293 }
294 
/*
 * Advance to the next UTF-8 character
 *
 * The length announced by the first octet is ignored; instead the
 * continuation markers (10xxxxxx) are used to find the start of the
 * next character.  This allows resynchronizing after an invalid
 * sequence.  At most the five octets following p are inspected; if
 * all of them are continuation octets, p + 6 is returned.
 */
char* ldap_utf8_next( const char * p )
{
	const unsigned char *bytes = (const unsigned char *) p;
	int off = 1;

	if( !LDAP_UTF8_ISASCII(bytes) ) {
		/* skip over the continuation octets that follow */
		while( off < 6 && ( bytes[off] & 0xc0 ) == 0x80 ) {
			off++;
		}
	}

	return (char *) &p[off];
}
321 
/*
 * Step back to the previous UTF-8 character
 *
 * Continuation markers (10xxxxxx) are used to find the start of the
 * previous character, which allows resynchronizing after an invalid
 * sequence.  At most the five octets before p are inspected; if all
 * of them are continuation octets, p - 6 is returned.
 */
char* ldap_utf8_prev( const char * p )
{
	const unsigned char *bytes = (const unsigned char *) p;
	int back = 1;

	/* walk backwards while still inside a run of continuation octets */
	while( back < 6 && ( bytes[-back] & 0xc0 ) == 0x80 ) {
		back++;
	}

	return (char *) ( p - back );
}
344 
/*
 * Copy one UTF-8 character from src to dst returning
 * number of bytes copied.
 *
 * The length announced by the first octet is ignored; the character
 * is taken to be the first octet plus the continuation octets
 * (10xxxxxx) that follow it, up to six octets in total.  No NUL is
 * appended to dst.
 */
int ldap_utf8_copy( char* dst, const char *src )
{
	const unsigned char *bytes = (const unsigned char *) src;
	int n = 1;

	dst[0] = src[0];

	if( LDAP_UTF8_ISASCII(bytes) ) {
		return 1;
	}

	/* carry over the continuation octets that follow */
	while( n < 6 && ( bytes[n] & 0xc0 ) == 0x80 ) {
		dst[n] = src[n];
		n++;
	}

	return n;
}
375 
376 #ifndef UTF8_ALPHA_CTYPE
377 /*
378  * UTF-8 ctype routines
379  * Only deals with characters < 0x80 (ie: US-ASCII)
380  */
381 
/* result of LDAP_ASCII() for the first octet at p */
int ldap_utf8_isascii( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet);
}
387 
/* LDAP_DIGIT() of the first octet at p; 0 if that octet is not ASCII */
int ldap_utf8_isdigit( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_DIGIT( octet );
	}

	return 0;
}
396 
/* LDAP_HEX() of the first octet at p; 0 if that octet is not ASCII */
int ldap_utf8_isxdigit( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_HEX(octet);
	}

	return 0;
}
405 
/*
 * Returns 1 if the first octet at p is an ASCII space, tab, newline,
 * carriage return, vertical tab or form feed; 0 otherwise.
 */
int ldap_utf8_isspace( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if(!LDAP_ASCII(octet)) return 0;

	if( octet == ' ' || octet == '\t' || octet == '\n' ||
		octet == '\r' || octet == '\v' || octet == '\f' )
	{
		return 1;
	}

	return 0;
}
424 
425 /*
426  * These are not needed by the C SDK and are
427  * not "good enough" for general use.
428  */
/* LDAP_ALPHA() of the first octet at p; 0 if that octet is not ASCII */
int ldap_utf8_isalpha( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_ALPHA(octet);
	}

	return 0;
}
437 
/* LDAP_ALNUM() of the first octet at p; 0 if that octet is not ASCII */
int ldap_utf8_isalnum( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_ALNUM(octet);
	}

	return 0;
}
446 
/* LDAP_LOWER() of the first octet at p; 0 if that octet is not ASCII */
int ldap_utf8_islower( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_LOWER(octet);
	}

	return 0;
}
455 
/* LDAP_UPPER() of the first octet at p; 0 if that octet is not ASCII */
int ldap_utf8_isupper( const char * p )
{
	const unsigned octet = * (const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_UPPER(octet);
	}

	return 0;
}
464 #endif
465 
466 
467 /*
468  * UTF-8 string routines
469  */
470 
471 /* like strchr() */
472 char * (ldap_utf8_strchr)( const char *str, const char *chr )
473 {
474 	for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
475 		if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
476 			return (char *) str;
477 		}
478 	}
479 
480 	return NULL;
481 }
482 
483 /* like strcspn() but returns number of bytes, not characters */
484 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
485 {
486 	const char *cstr;
487 	const char *cset;
488 
489 	for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
490 		for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
491 			if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
492 				return cstr - str;
493 			}
494 		}
495 	}
496 
497 	return cstr - str;
498 }
499 
500 /* like strspn() but returns number of bytes, not characters */
501 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
502 {
503 	const char *cstr;
504 	const char *cset;
505 
506 	for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
507 		for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
508 			if( *cset == '\0' ) {
509 				return cstr - str;
510 			}
511 
512 			if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
513 				break;
514 			}
515 		}
516 	}
517 
518 	return cstr - str;
519 }
520 
521 /* like strpbrk(), replaces strchr() as well */
522 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
523 {
524 	for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
525 		const char *cset;
526 
527 		for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
528 			if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
529 				return (char *) str;
530 			}
531 		}
532 	}
533 
534 	return NULL;
535 }
536 
537 /* like strtok_r(), not strtok() */
538 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
539 {
540 	char *begin;
541 	char *end;
542 
543 	if( last == NULL ) return NULL;
544 
545 	begin = str ? str : *last;
546 
547 	begin += ldap_utf8_strspn( begin, sep );
548 
549 	if( *begin == '\0' ) {
550 		*last = NULL;
551 		return NULL;
552 	}
553 
554 	end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
555 
556 	if( *end != '\0' ) {
557 		char *next = LDAP_UTF8_NEXT( end );
558 		*end = '\0';
559 		end = next;
560 	}
561 
562 	*last = end;
563 	return begin;
564 }
565