xref: /netbsd-src/external/bsd/openldap/dist/libraries/libldap/utf-8.c (revision 549b59ed3ccf0d36d3097190a0db27b770f3a839)
1*549b59edSchristos /*	$NetBSD: utf-8.c,v 1.3 2021/08/14 16:14:56 christos Exp $	*/
24e6df137Slukem 
32de962bdSlukem /* utf-8.c -- Basic UTF-8 routines */
4d11b170bStron /* $OpenLDAP$ */
52de962bdSlukem /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
62de962bdSlukem  *
7*549b59edSchristos  * Copyright 1998-2021 The OpenLDAP Foundation.
82de962bdSlukem  * All rights reserved.
92de962bdSlukem  *
102de962bdSlukem  * Redistribution and use in source and binary forms, with or without
112de962bdSlukem  * modification, are permitted only as authorized by the OpenLDAP
122de962bdSlukem  * Public License.
132de962bdSlukem  *
142de962bdSlukem  * A copy of this license is available in the file LICENSE in the
152de962bdSlukem  * top-level directory of the distribution or, alternatively, at
162de962bdSlukem  * <http://www.OpenLDAP.org/license.html>.
172de962bdSlukem  */
182de962bdSlukem /* Basic UTF-8 routines
192de962bdSlukem  *
202de962bdSlukem  * These routines are "dumb".  Though they understand UTF-8,
212de962bdSlukem  * they don't grok Unicode.  That is, they can push bits,
222de962bdSlukem  * but don't have a clue what the bits represent.  That's
232de962bdSlukem  * good enough for use with the LDAP Client SDK.
242de962bdSlukem  *
252de962bdSlukem  * These routines are not optimized.
262de962bdSlukem  */
272de962bdSlukem 
28376af7d7Schristos #include <sys/cdefs.h>
29*549b59edSchristos __RCSID("$NetBSD: utf-8.c,v 1.3 2021/08/14 16:14:56 christos Exp $");
30376af7d7Schristos 
312de962bdSlukem #include "portable.h"
322de962bdSlukem 
332de962bdSlukem #include <stdio.h>
342de962bdSlukem 
352de962bdSlukem #include <ac/stdlib.h>
362de962bdSlukem 
372de962bdSlukem #include <ac/socket.h>
382de962bdSlukem #include <ac/string.h>
392de962bdSlukem #include <ac/time.h>
402de962bdSlukem 
412de962bdSlukem #include "ldap_utf8.h"
422de962bdSlukem 
432de962bdSlukem #include "ldap-int.h"
442de962bdSlukem #include "ldap_defaults.h"
452de962bdSlukem 
462de962bdSlukem /*
472de962bdSlukem  * return the number of bytes required to hold the
482de962bdSlukem  * NULL-terminated UTF-8 string NOT INCLUDING the
492de962bdSlukem  * termination.
502de962bdSlukem  */
ldap_utf8_bytes(const char * p)512de962bdSlukem ber_len_t ldap_utf8_bytes( const char * p )
522de962bdSlukem {
532de962bdSlukem 	ber_len_t bytes;
542de962bdSlukem 
552de962bdSlukem 	for( bytes=0; p[bytes]; bytes++ ) {
562de962bdSlukem 		/* EMPTY */ ;
572de962bdSlukem 	}
582de962bdSlukem 
592de962bdSlukem 	return bytes;
602de962bdSlukem }
612de962bdSlukem 
ldap_utf8_chars(const char * p)622de962bdSlukem ber_len_t ldap_utf8_chars( const char * p )
632de962bdSlukem {
642de962bdSlukem 	/* could be optimized and could check for invalid sequences */
652de962bdSlukem 	ber_len_t chars=0;
662de962bdSlukem 
672de962bdSlukem 	for( ; *p ; LDAP_UTF8_INCR(p) ) {
682de962bdSlukem 		chars++;
692de962bdSlukem 	}
702de962bdSlukem 
712de962bdSlukem 	return chars;
722de962bdSlukem }
732de962bdSlukem 
742de962bdSlukem /* return offset to next character */
ldap_utf8_offset(const char * p)752de962bdSlukem int ldap_utf8_offset( const char * p )
762de962bdSlukem {
772de962bdSlukem 	return LDAP_UTF8_NEXT(p) - p;
782de962bdSlukem }
792de962bdSlukem 
/*
 * Sequence length indicated by a first octet that has its high bit set.
 * The table is indexed by (octet ^ 0x80), i.e. entry 0 is octet 0x80 and
 * entry 127 is octet 0xff.  An entry of 0 marks an octet that cannot
 * start a sequence.
 */
const char ldap_utf8_lentab[] = {
	/* 0x80 - 0xbf: continuation octets */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xc0 - 0xdf: two octets (0xc0 and 0xc1 are rejected) */
	0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xe0 - 0xef: three octets */
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xf0 - 0xff: four, five, six octets; 0xfe and 0xff are rejected */
	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };

/*
 * Return the length, in octets, of the character starting at p as
 * indicated by its first octet alone: 1 for an octet without the high
 * bit, otherwise the ldap_utf8_lentab[] entry (0 if the octet cannot
 * start a character).  Following octets are not examined.
 */
int ldap_utf8_charlen( const char * p )
{
	if( !( *p & 0x80 ) ) {
		return 1;
	}

	return ldap_utf8_lentab[*(const unsigned char *)p ^ 0x80];
}
1002de962bdSlukem 
/*
 * Make sure the UTF-8 char used the shortest possible encoding;
 * returns charlen if valid, 0 if not.
 *
 * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
 * The table is slightly modified from that of the RFC.
 *
 * UCS-4 range (hex)      UTF-8 sequence (binary)
 * 0000 0000-0000 007F   0.......
 * 0000 0080-0000 07FF   110++++. 10......
 * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
 * 0001 0000-001F FFFF   11110+++ 10++.... 10...... 10......
 * 0020 0000-03FF FFFF   111110++ 10+++... 10...... 10...... 10......
 * 0400 0000-7FFF FFFF   1111110+ 10++++.. 10...... 10...... 10...... 10......
 *
 * The '.' bits are "don't cares". When validating a UTF-8 sequence,
 * at least one of the '+' bits must be set, otherwise the character
 * should have been encoded in fewer octets. Note that in the two-octet
 * case, only the first octet needs to be validated, and this is done
 * in the ldap_utf8_lentab[] above.
 */

/*
 * Mask of required bits in the second octet, indexed by the low five
 * bits of the first octet.  Only the smallest lead octet of each length
 * (0xe0, 0xf0, 0xf8, 0xfc) has all its own '+' bits clear and therefore
 * needs a '+' bit in the second octet; every other lead octet gets 0x80,
 * a bit that any octet of the form 10...... has set.
 */
const char ldap_utf8_mintab[] = {
	/* 0xe0 - 0xef */
	(char)0x20, (char)0x80, (char)0x80, (char)0x80,
	(char)0x80, (char)0x80, (char)0x80, (char)0x80,
	(char)0x80, (char)0x80, (char)0x80, (char)0x80,
	(char)0x80, (char)0x80, (char)0x80, (char)0x80,
	/* 0xf0 - 0xf7 */
	(char)0x30, (char)0x80, (char)0x80, (char)0x80,
	(char)0x80, (char)0x80, (char)0x80, (char)0x80,
	/* 0xf8 - 0xfb, 0xfc - 0xfd, then 0xfe and 0xff */
	(char)0x38, (char)0x80, (char)0x80, (char)0x80,
	(char)0x3c, (char)0x80, (char)0x00, (char)0x00 };
1322de962bdSlukem 
ldap_utf8_charlen2(const char * p)1332de962bdSlukem int ldap_utf8_charlen2( const char * p )
1342de962bdSlukem {
1352de962bdSlukem 	int i = LDAP_UTF8_CHARLEN( p );
1362de962bdSlukem 
1372de962bdSlukem 	if ( i > 2 ) {
1382de962bdSlukem 		if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
1392de962bdSlukem 			i = 0;
1402de962bdSlukem 	}
1412de962bdSlukem 	return i;
1422de962bdSlukem }
1432de962bdSlukem 
1442de962bdSlukem /* conv UTF-8 to UCS-4, useful for comparisons */
ldap_x_utf8_to_ucs4(const char * p)1452de962bdSlukem ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
1462de962bdSlukem {
1472de962bdSlukem     const unsigned char *c = (const unsigned char *) p;
1482de962bdSlukem     ldap_ucs4_t ch;
1492de962bdSlukem 	int len, i;
1502de962bdSlukem 	static unsigned char mask[] = {
1512de962bdSlukem 		0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
1522de962bdSlukem 
1532de962bdSlukem 	len = LDAP_UTF8_CHARLEN2(p, len);
1542de962bdSlukem 
1552de962bdSlukem 	if( len == 0 ) return LDAP_UCS4_INVALID;
1562de962bdSlukem 
1572de962bdSlukem 	ch = c[0] & mask[len];
1582de962bdSlukem 
1592de962bdSlukem 	for(i=1; i < len; i++) {
1602de962bdSlukem 		if ((c[i] & 0xc0) != 0x80) {
1612de962bdSlukem 			return LDAP_UCS4_INVALID;
1622de962bdSlukem 		}
1632de962bdSlukem 
1642de962bdSlukem 		ch <<= 6;
1652de962bdSlukem 		ch |= c[i] & 0x3f;
1662de962bdSlukem 	}
1672de962bdSlukem 
1682de962bdSlukem 	return ch;
1692de962bdSlukem }
1702de962bdSlukem 
1712de962bdSlukem /* conv UCS-4 to UTF-8, not used */
ldap_x_ucs4_to_utf8(ldap_ucs4_t c,char * buf)1722de962bdSlukem int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
1732de962bdSlukem {
1742de962bdSlukem 	int len=0;
1752de962bdSlukem 	unsigned char* p = (unsigned char *) buf;
1762de962bdSlukem 
1772de962bdSlukem 	/* not a valid Unicode character */
1782de962bdSlukem 	if ( c < 0 ) return 0;
1792de962bdSlukem 
1802de962bdSlukem 	/* Just return length, don't convert */
1812de962bdSlukem 	if(buf == NULL) {
1822de962bdSlukem 		if( c < 0x80 ) return 1;
1832de962bdSlukem 		else if( c < 0x800 ) return 2;
1842de962bdSlukem 		else if( c < 0x10000 ) return 3;
1852de962bdSlukem 		else if( c < 0x200000 ) return 4;
1862de962bdSlukem 		else if( c < 0x4000000 ) return 5;
1872de962bdSlukem 		else return 6;
1882de962bdSlukem 	}
1892de962bdSlukem 
1902de962bdSlukem 	if( c < 0x80 ) {
1912de962bdSlukem 		p[len++] = c;
1922de962bdSlukem 
1932de962bdSlukem 	} else if( c < 0x800 ) {
1942de962bdSlukem 		p[len++] = 0xc0 | ( c >> 6 );
1952de962bdSlukem 		p[len++] = 0x80 | ( c & 0x3f );
1962de962bdSlukem 
1972de962bdSlukem 	} else if( c < 0x10000 ) {
1982de962bdSlukem 		p[len++] = 0xe0 | ( c >> 12 );
1992de962bdSlukem 		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
2002de962bdSlukem 		p[len++] = 0x80 | ( c & 0x3f );
2012de962bdSlukem 
2022de962bdSlukem 	} else if( c < 0x200000 ) {
2032de962bdSlukem 		p[len++] = 0xf0 | ( c >> 18 );
2042de962bdSlukem 		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
2052de962bdSlukem 		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
2062de962bdSlukem 		p[len++] = 0x80 | ( c & 0x3f );
2072de962bdSlukem 
2082de962bdSlukem 	} else if( c < 0x4000000 ) {
2092de962bdSlukem 		p[len++] = 0xf8 | ( c >> 24 );
2102de962bdSlukem 		p[len++] = 0x80 | ( (c >> 18) & 0x3f );
2112de962bdSlukem 		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
2122de962bdSlukem 		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
2132de962bdSlukem 		p[len++] = 0x80 | ( c & 0x3f );
2142de962bdSlukem 
2152de962bdSlukem 	} else /* if( c < 0x80000000 ) */ {
2162de962bdSlukem 		p[len++] = 0xfc | ( c >> 30 );
2172de962bdSlukem 		p[len++] = 0x80 | ( (c >> 24) & 0x3f );
2182de962bdSlukem 		p[len++] = 0x80 | ( (c >> 18) & 0x3f );
2192de962bdSlukem 		p[len++] = 0x80 | ( (c >> 12) & 0x3f );
2202de962bdSlukem 		p[len++] = 0x80 | ( (c >> 6) & 0x3f );
2212de962bdSlukem 		p[len++] = 0x80 | ( c & 0x3f );
2222de962bdSlukem 	}
2232de962bdSlukem 
2242de962bdSlukem 	return len;
2252de962bdSlukem }
2262de962bdSlukem 
/*
 * Number of octets needed to encode UCS-4 value c in UTF-8 (1 to 6),
 * or 0 if c is negative.  The argument and the whole expansion are
 * parenthesized so the macro can be used inside larger expressions.
 * The argument is evaluated several times and must not have side effects.
 */
#define LDAP_UCS_UTF8LEN(c)	\
	( (c) < 0 ? 0 : ( (c) < 0x80 ? 1 : ( (c) < 0x800 ? 2 : \
	( (c) < 0x10000 ? 3 : ( (c) < 0x200000 ? 4 : \
	( (c) < 0x4000000 ? 5 : 6 ) ) ) ) ) )
2302de962bdSlukem 
2312de962bdSlukem /* Convert a string to UTF-8 format. The input string is expected to
2322de962bdSlukem  * have characters of 1, 2, or 4 octets (in network byte order)
2332de962bdSlukem  * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING
2342de962bdSlukem  * types respectively. (Here T61STRING just means that there is one
2352de962bdSlukem  * octet per character and characters may use the high bit of the octet.
2362de962bdSlukem  * The characters are assumed to use ISO mappings, no provision is made
2372de962bdSlukem  * for converting from T.61 coding rules to Unicode.)
2382de962bdSlukem  */
2392de962bdSlukem 
2402de962bdSlukem int
ldap_ucs_to_utf8s(struct berval * ucs,int csize,struct berval * utf8s)2412de962bdSlukem ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s )
2422de962bdSlukem {
2432de962bdSlukem 	unsigned char *in, *end;
2442de962bdSlukem 	char *ptr;
2452de962bdSlukem 	ldap_ucs4_t u;
2462de962bdSlukem 	int i, l = 0;
2472de962bdSlukem 
2482de962bdSlukem 	utf8s->bv_val = NULL;
2492de962bdSlukem 	utf8s->bv_len = 0;
2502de962bdSlukem 
2512de962bdSlukem 	in = (unsigned char *)ucs->bv_val;
2522de962bdSlukem 
2532de962bdSlukem 	/* Make sure we stop at an even multiple of csize */
2542de962bdSlukem 	end = in + ( ucs->bv_len & ~(csize-1) );
2552de962bdSlukem 
2562de962bdSlukem 	for (; in < end; ) {
2572de962bdSlukem 		u = *in++;
2582de962bdSlukem 		if (csize > 1) {
2592de962bdSlukem 			u <<= 8;
2602de962bdSlukem 			u |= *in++;
2612de962bdSlukem 		}
2622de962bdSlukem 		if (csize > 2) {
2632de962bdSlukem 			u <<= 8;
2642de962bdSlukem 			u |= *in++;
2652de962bdSlukem 			u <<= 8;
2662de962bdSlukem 			u |= *in++;
2672de962bdSlukem 		}
2682de962bdSlukem 		i = LDAP_UCS_UTF8LEN(u);
2692de962bdSlukem 		if (i == 0)
2702de962bdSlukem 			return LDAP_INVALID_SYNTAX;
2712de962bdSlukem 		l += i;
2722de962bdSlukem 	}
2732de962bdSlukem 
2742de962bdSlukem 	utf8s->bv_val = LDAP_MALLOC( l+1 );
2752de962bdSlukem 	if (utf8s->bv_val == NULL)
2762de962bdSlukem 		return LDAP_NO_MEMORY;
2772de962bdSlukem 	utf8s->bv_len = l;
2782de962bdSlukem 
2792de962bdSlukem 	ptr = utf8s->bv_val;
2802de962bdSlukem 	for (in = (unsigned char *)ucs->bv_val; in < end; ) {
2812de962bdSlukem 		u = *in++;
2822de962bdSlukem 		if (csize > 1) {
2832de962bdSlukem 			u <<= 8;
2842de962bdSlukem 			u |= *in++;
2852de962bdSlukem 		}
2862de962bdSlukem 		if (csize > 2) {
2872de962bdSlukem 			u <<= 8;
2882de962bdSlukem 			u |= *in++;
2892de962bdSlukem 			u <<= 8;
2902de962bdSlukem 			u |= *in++;
2912de962bdSlukem 		}
2922de962bdSlukem 		ptr += ldap_x_ucs4_to_utf8(u, ptr);
2932de962bdSlukem 	}
2942de962bdSlukem 	*ptr = '\0';
2952de962bdSlukem 	return LDAP_SUCCESS;
2962de962bdSlukem }
2972de962bdSlukem 
/*
 * Advance to the next UTF-8 character
 *
 * Ignores the length of the multibyte character and instead relies on
 * continuation markers to find the start of the next character.
 * This allows for "resyncing" when invalid characters are supplied,
 * provided the start of the next character appears within the
 * 6 bytes examined.
 */
char* ldap_utf8_next( const char * p )
{
	int i;
	const unsigned char *u = (const unsigned char *) p;

	if( LDAP_UTF8_ISASCII(u) ) {
		return (char *) &p[1];
	}

	/* step over at most five continuation octets (10xxxxxx) */
	for( i = 1; i < 6 && ( u[i] & 0xc0 ) == 0x80; i++ ) {
		/* EMPTY */ ;
	}

	return (char *) &p[i];
}
3242de962bdSlukem 
/*
 * Step back to the previous UTF-8 character
 *
 * Ignores the length of the multibyte character and instead relies on
 * continuation markers to find the start of the previous character.
 * At most 5 bytes before p are examined; if all of them are
 * continuation octets, p - 6 is returned without being examined.
 *
 * The bytes before p are read, so p must not point at the start of
 * its buffer.
 */
char* ldap_utf8_prev( const char * p )
{
	int i;
	const unsigned char *u = (const unsigned char *) p;

	/* walk backwards over continuation octets (10xxxxxx) */
	for( i = -1; i > -6 && ( u[i] & 0xc0 ) == 0x80; i-- ) {
		/* EMPTY */ ;
	}

	return (char *) &p[i];
}
3472de962bdSlukem 
/*
 * Copy one UTF-8 character from src to dst returning
 * number of bytes copied.
 *
 * Ignores the length of the multibyte character and instead relies on
 * continuation markers to find the start of the next character.
 * This allows for "resyncing" when invalid characters are supplied,
 * provided the start of the next character appears within the
 * 6 bytes examined.
 *
 * Up to 6 bytes are written to dst; no terminating NUL is added.
 */
int ldap_utf8_copy( char* dst, const char *src )
{
	int i;
	const unsigned char *u = (const unsigned char *) src;

	dst[0] = src[0];

	if( LDAP_UTF8_ISASCII(u) ) {
		return 1;
	}

	/* copy the continuation octets (10xxxxxx) that follow, five at most */
	for( i = 1; i < 6 && ( u[i] & 0xc0 ) == 0x80; i++ ) {
		dst[i] = src[i];
	}

	return i;
}
3782de962bdSlukem 
3792de962bdSlukem #ifndef UTF8_ALPHA_CTYPE
/*
 * UTF-8 ctype routines
 * Only deal with characters < 0x80 (i.e. US-ASCII)
 */
3842de962bdSlukem 
/* Return the result of LDAP_ASCII() for the first octet at p. */
int ldap_utf8_isascii( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	return LDAP_ASCII(c);
}
3902de962bdSlukem 
/*
 * Return 0 if the first octet at p fails LDAP_ASCII(), otherwise the
 * result of LDAP_DIGIT() for that octet.
 */
int ldap_utf8_isdigit( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	if( !LDAP_ASCII(c) ) return 0;

	return LDAP_DIGIT( c );
}
3992de962bdSlukem 
/*
 * Return 0 if the first octet at p fails LDAP_ASCII(), otherwise the
 * result of LDAP_HEX() for that octet.
 */
int ldap_utf8_isxdigit( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	if( !LDAP_ASCII(c) ) return 0;

	return LDAP_HEX(c);
}
4082de962bdSlukem 
/*
 * Return 1 if the first octet at p is one of the US-ASCII white space
 * characters space, \t, \n, \r, \v or \f, and 0 otherwise.
 *
 * No separate ASCII test is needed: the octet is read as unsigned and
 * every case label is below 0x80, so octets with the high bit set fall
 * through to the final return.
 */
int ldap_utf8_isspace( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	switch(c) {
	case ' ':
	case '\t':
	case '\n':
	case '\r':
	case '\v':
	case '\f':
		return 1;
	}

	return 0;
}
4272de962bdSlukem 
/*
 * These are not needed by the C SDK and are
 * not "good enough" for general use.
 */
/*
 * Return 0 if the first octet at p fails LDAP_ASCII(), otherwise the
 * result of LDAP_ALPHA() for that octet.
 */
int ldap_utf8_isalpha( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	if( !LDAP_ASCII(c) ) return 0;

	return LDAP_ALPHA(c);
}
4402de962bdSlukem 
/*
 * Return 0 if the first octet at p fails LDAP_ASCII(), otherwise the
 * result of LDAP_ALNUM() for that octet.
 */
int ldap_utf8_isalnum( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	if( !LDAP_ASCII(c) ) return 0;

	return LDAP_ALNUM(c);
}
4492de962bdSlukem 
/*
 * Return 0 if the first octet at p fails LDAP_ASCII(), otherwise the
 * result of LDAP_LOWER() for that octet.
 */
int ldap_utf8_islower( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	if( !LDAP_ASCII(c) ) return 0;

	return LDAP_LOWER(c);
}
4582de962bdSlukem 
/*
 * Return 0 if the first octet at p fails LDAP_ASCII(), otherwise the
 * result of LDAP_UPPER() for that octet.
 */
int ldap_utf8_isupper( const char * p )
{
	unsigned c = * (const unsigned char *) p;

	if( !LDAP_ASCII(c) ) return 0;

	return LDAP_UPPER(c);
}
4672de962bdSlukem #endif
4682de962bdSlukem 
4692de962bdSlukem 
/*
 * UTF-8 string routines
 */
4732de962bdSlukem 
4742de962bdSlukem /* like strchr() */
4752de962bdSlukem char * (ldap_utf8_strchr)( const char *str, const char *chr )
4762de962bdSlukem {
4772de962bdSlukem 	for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
4782de962bdSlukem 		if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
4792de962bdSlukem 			return (char *) str;
4802de962bdSlukem 		}
4812de962bdSlukem 	}
4822de962bdSlukem 
4832de962bdSlukem 	return NULL;
4842de962bdSlukem }
4852de962bdSlukem 
4862de962bdSlukem /* like strcspn() but returns number of bytes, not characters */
ber_len_t(ldap_utf8_strcspn)4872de962bdSlukem ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
4882de962bdSlukem {
4892de962bdSlukem 	const char *cstr;
4902de962bdSlukem 	const char *cset;
4912de962bdSlukem 
4922de962bdSlukem 	for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
4932de962bdSlukem 		for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
4942de962bdSlukem 			if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
4952de962bdSlukem 				return cstr - str;
4962de962bdSlukem 			}
4972de962bdSlukem 		}
4982de962bdSlukem 	}
4992de962bdSlukem 
5002de962bdSlukem 	return cstr - str;
5012de962bdSlukem }
5022de962bdSlukem 
5032de962bdSlukem /* like strspn() but returns number of bytes, not characters */
ber_len_t(ldap_utf8_strspn)5042de962bdSlukem ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
5052de962bdSlukem {
5062de962bdSlukem 	const char *cstr;
5072de962bdSlukem 	const char *cset;
5082de962bdSlukem 
5092de962bdSlukem 	for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
5102de962bdSlukem 		for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
5112de962bdSlukem 			if( *cset == '\0' ) {
5122de962bdSlukem 				return cstr - str;
5132de962bdSlukem 			}
5142de962bdSlukem 
5152de962bdSlukem 			if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
5162de962bdSlukem 				break;
5172de962bdSlukem 			}
5182de962bdSlukem 		}
5192de962bdSlukem 	}
5202de962bdSlukem 
5212de962bdSlukem 	return cstr - str;
5222de962bdSlukem }
5232de962bdSlukem 
5242de962bdSlukem /* like strpbrk(), replaces strchr() as well */
5252de962bdSlukem char *(ldap_utf8_strpbrk)( const char *str, const char *set )
5262de962bdSlukem {
5272de962bdSlukem 	for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
5282de962bdSlukem 		const char *cset;
5292de962bdSlukem 
5302de962bdSlukem 		for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
5312de962bdSlukem 			if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
5322de962bdSlukem 				return (char *) str;
5332de962bdSlukem 			}
5342de962bdSlukem 		}
5352de962bdSlukem 	}
5362de962bdSlukem 
5372de962bdSlukem 	return NULL;
5382de962bdSlukem }
5392de962bdSlukem 
/*
 * like strtok_r(), not strtok()
 *
 * str   string to tokenize on the first call, NULL to continue with
 *       the position saved in *last
 * sep   set of separator characters
 * last  caller-provided storage for the scan position
 *
 * Returns the next token, NUL-terminated in place (str is modified),
 * or NULL when last is NULL or no token is left.  *last is set to NULL
 * once the tokens are exhausted; further calls with str == NULL then
 * keep returning NULL.
 */
char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
{
	char *begin;
	char *end;

	if( last == NULL ) return NULL;

	begin = str ? str : *last;

	/* nothing to resume from: an earlier call already ran out of tokens */
	if( begin == NULL ) return NULL;

	/* skip leading separators */
	begin += ldap_utf8_strspn( begin, sep );

	if( *begin == '\0' ) {
		*last = NULL;
		return NULL;
	}

	end = &begin[ ldap_utf8_strcspn( begin, sep ) ];

	if( *end != '\0' ) {
		/* find the character after the separator before overwriting
		 * the separator's first byte with the terminator */
		char *next = LDAP_UTF8_NEXT( end );
		*end = '\0';
		end = next;
	}

	*last = end;
	return begin;
}
568