1*549b59edSchristos /* $NetBSD: utf-8.c,v 1.3 2021/08/14 16:14:56 christos Exp $ */
24e6df137Slukem
32de962bdSlukem /* utf-8.c -- Basic UTF-8 routines */
4d11b170bStron /* $OpenLDAP$ */
52de962bdSlukem /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
62de962bdSlukem *
7*549b59edSchristos * Copyright 1998-2021 The OpenLDAP Foundation.
82de962bdSlukem * All rights reserved.
92de962bdSlukem *
102de962bdSlukem * Redistribution and use in source and binary forms, with or without
112de962bdSlukem * modification, are permitted only as authorized by the OpenLDAP
122de962bdSlukem * Public License.
132de962bdSlukem *
142de962bdSlukem * A copy of this license is available in the file LICENSE in the
152de962bdSlukem * top-level directory of the distribution or, alternatively, at
162de962bdSlukem * <http://www.OpenLDAP.org/license.html>.
172de962bdSlukem */
182de962bdSlukem /* Basic UTF-8 routines
192de962bdSlukem *
202de962bdSlukem * These routines are "dumb". Though they understand UTF-8,
212de962bdSlukem * they don't grok Unicode. That is, they can push bits,
222de962bdSlukem * but don't have a clue what the bits represent. That's
232de962bdSlukem * good enough for use with the LDAP Client SDK.
242de962bdSlukem *
252de962bdSlukem * These routines are not optimized.
262de962bdSlukem */
272de962bdSlukem
28376af7d7Schristos #include <sys/cdefs.h>
29*549b59edSchristos __RCSID("$NetBSD: utf-8.c,v 1.3 2021/08/14 16:14:56 christos Exp $");
30376af7d7Schristos
312de962bdSlukem #include "portable.h"
322de962bdSlukem
332de962bdSlukem #include <stdio.h>
342de962bdSlukem
352de962bdSlukem #include <ac/stdlib.h>
362de962bdSlukem
372de962bdSlukem #include <ac/socket.h>
382de962bdSlukem #include <ac/string.h>
392de962bdSlukem #include <ac/time.h>
402de962bdSlukem
412de962bdSlukem #include "ldap_utf8.h"
422de962bdSlukem
432de962bdSlukem #include "ldap-int.h"
442de962bdSlukem #include "ldap_defaults.h"
452de962bdSlukem
462de962bdSlukem /*
472de962bdSlukem * return the number of bytes required to hold the
482de962bdSlukem * NULL-terminated UTF-8 string NOT INCLUDING the
492de962bdSlukem * termination.
502de962bdSlukem */
ldap_utf8_bytes(const char * p)512de962bdSlukem ber_len_t ldap_utf8_bytes( const char * p )
522de962bdSlukem {
532de962bdSlukem ber_len_t bytes;
542de962bdSlukem
552de962bdSlukem for( bytes=0; p[bytes]; bytes++ ) {
562de962bdSlukem /* EMPTY */ ;
572de962bdSlukem }
582de962bdSlukem
592de962bdSlukem return bytes;
602de962bdSlukem }
612de962bdSlukem
ldap_utf8_chars(const char * p)622de962bdSlukem ber_len_t ldap_utf8_chars( const char * p )
632de962bdSlukem {
642de962bdSlukem /* could be optimized and could check for invalid sequences */
652de962bdSlukem ber_len_t chars=0;
662de962bdSlukem
672de962bdSlukem for( ; *p ; LDAP_UTF8_INCR(p) ) {
682de962bdSlukem chars++;
692de962bdSlukem }
702de962bdSlukem
712de962bdSlukem return chars;
722de962bdSlukem }
732de962bdSlukem
/* Return the byte offset from p to the start of the next character. */
int ldap_utf8_offset( const char * p )
{
	const char *next = LDAP_UTF8_NEXT(p);

	return (int)( next - p );
}
792de962bdSlukem
/*
 * Sequence length indicated by a first octet that has its high bit set.
 * The table is indexed by (octet ^ 0x80), so entry k describes octet
 * 0x80 + k.  A 0 entry means the octet yields no valid length.
 */
const char ldap_utf8_lentab[] = {
	/* 0x80 - 0x8f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0x90 - 0x9f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xa0 - 0xaf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xb0 - 0xbf */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xc0 - 0xcf */ 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xd0 - 0xdf */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xe0 - 0xef */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xf0 - 0xff */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
922de962bdSlukem
ldap_utf8_charlen(const char * p)932de962bdSlukem int ldap_utf8_charlen( const char * p )
942de962bdSlukem {
952de962bdSlukem if (!(*p & 0x80))
962de962bdSlukem return 1;
972de962bdSlukem
982de962bdSlukem return ldap_utf8_lentab[*(const unsigned char *)p ^ 0x80];
992de962bdSlukem }
1002de962bdSlukem
1012de962bdSlukem /*
1022de962bdSlukem * Make sure the UTF-8 char used the shortest possible encoding
1032de962bdSlukem * returns charlen if valid, 0 if not.
1042de962bdSlukem *
1052de962bdSlukem * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
1062de962bdSlukem * The table is slightly modified from that of the RFC.
1072de962bdSlukem *
1082de962bdSlukem * UCS-4 range (hex) UTF-8 sequence (binary)
1092de962bdSlukem * 0000 0000-0000 007F 0.......
1102de962bdSlukem * 0000 0080-0000 07FF 110++++. 10......
1112de962bdSlukem * 0000 0800-0000 FFFF 1110++++ 10+..... 10......
1122de962bdSlukem * 0001 0000-001F FFFF 11110+++ 10++.... 10...... 10......
1132de962bdSlukem * 0020 0000-03FF FFFF 111110++ 10+++... 10...... 10...... 10......
1142de962bdSlukem * 0400 0000-7FFF FFFF 1111110+ 10++++.. 10...... 10...... 10...... 10......
1152de962bdSlukem *
1162de962bdSlukem * The '.' bits are "don't cares". When validating a UTF-8 sequence,
1172de962bdSlukem * at least one of the '+' bits must be set, otherwise the character
1182de962bdSlukem * should have been encoded in fewer octets. Note that in the two-octet
1192de962bdSlukem * case, only the first octet needs to be validated, and this is done
1202de962bdSlukem * in the ldap_utf8_lentab[] above.
1212de962bdSlukem */
1222de962bdSlukem
/*
 * Mask of required bits in the second octet, indexed by
 * (first octet & 0x1f).  At least one masked bit must be set in the
 * second octet for the sequence to be in its shortest form.
 */
/* Kept from the original, which left no macro named c defined here. */
#undef c
const char ldap_utf8_mintab[] = {
	(const char)0x20, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,

	(const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,

	(const char)0x30, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,

	(const char)0x38, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x3c, (const char)0x80, (const char)0x00, (const char)0x00
};
1322de962bdSlukem
ldap_utf8_charlen2(const char * p)1332de962bdSlukem int ldap_utf8_charlen2( const char * p )
1342de962bdSlukem {
1352de962bdSlukem int i = LDAP_UTF8_CHARLEN( p );
1362de962bdSlukem
1372de962bdSlukem if ( i > 2 ) {
1382de962bdSlukem if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
1392de962bdSlukem i = 0;
1402de962bdSlukem }
1412de962bdSlukem return i;
1422de962bdSlukem }
1432de962bdSlukem
1442de962bdSlukem /* conv UTF-8 to UCS-4, useful for comparisons */
ldap_x_utf8_to_ucs4(const char * p)1452de962bdSlukem ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
1462de962bdSlukem {
1472de962bdSlukem const unsigned char *c = (const unsigned char *) p;
1482de962bdSlukem ldap_ucs4_t ch;
1492de962bdSlukem int len, i;
1502de962bdSlukem static unsigned char mask[] = {
1512de962bdSlukem 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
1522de962bdSlukem
1532de962bdSlukem len = LDAP_UTF8_CHARLEN2(p, len);
1542de962bdSlukem
1552de962bdSlukem if( len == 0 ) return LDAP_UCS4_INVALID;
1562de962bdSlukem
1572de962bdSlukem ch = c[0] & mask[len];
1582de962bdSlukem
1592de962bdSlukem for(i=1; i < len; i++) {
1602de962bdSlukem if ((c[i] & 0xc0) != 0x80) {
1612de962bdSlukem return LDAP_UCS4_INVALID;
1622de962bdSlukem }
1632de962bdSlukem
1642de962bdSlukem ch <<= 6;
1652de962bdSlukem ch |= c[i] & 0x3f;
1662de962bdSlukem }
1672de962bdSlukem
1682de962bdSlukem return ch;
1692de962bdSlukem }
1702de962bdSlukem
1712de962bdSlukem /* conv UCS-4 to UTF-8, not used */
ldap_x_ucs4_to_utf8(ldap_ucs4_t c,char * buf)1722de962bdSlukem int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
1732de962bdSlukem {
1742de962bdSlukem int len=0;
1752de962bdSlukem unsigned char* p = (unsigned char *) buf;
1762de962bdSlukem
1772de962bdSlukem /* not a valid Unicode character */
1782de962bdSlukem if ( c < 0 ) return 0;
1792de962bdSlukem
1802de962bdSlukem /* Just return length, don't convert */
1812de962bdSlukem if(buf == NULL) {
1822de962bdSlukem if( c < 0x80 ) return 1;
1832de962bdSlukem else if( c < 0x800 ) return 2;
1842de962bdSlukem else if( c < 0x10000 ) return 3;
1852de962bdSlukem else if( c < 0x200000 ) return 4;
1862de962bdSlukem else if( c < 0x4000000 ) return 5;
1872de962bdSlukem else return 6;
1882de962bdSlukem }
1892de962bdSlukem
1902de962bdSlukem if( c < 0x80 ) {
1912de962bdSlukem p[len++] = c;
1922de962bdSlukem
1932de962bdSlukem } else if( c < 0x800 ) {
1942de962bdSlukem p[len++] = 0xc0 | ( c >> 6 );
1952de962bdSlukem p[len++] = 0x80 | ( c & 0x3f );
1962de962bdSlukem
1972de962bdSlukem } else if( c < 0x10000 ) {
1982de962bdSlukem p[len++] = 0xe0 | ( c >> 12 );
1992de962bdSlukem p[len++] = 0x80 | ( (c >> 6) & 0x3f );
2002de962bdSlukem p[len++] = 0x80 | ( c & 0x3f );
2012de962bdSlukem
2022de962bdSlukem } else if( c < 0x200000 ) {
2032de962bdSlukem p[len++] = 0xf0 | ( c >> 18 );
2042de962bdSlukem p[len++] = 0x80 | ( (c >> 12) & 0x3f );
2052de962bdSlukem p[len++] = 0x80 | ( (c >> 6) & 0x3f );
2062de962bdSlukem p[len++] = 0x80 | ( c & 0x3f );
2072de962bdSlukem
2082de962bdSlukem } else if( c < 0x4000000 ) {
2092de962bdSlukem p[len++] = 0xf8 | ( c >> 24 );
2102de962bdSlukem p[len++] = 0x80 | ( (c >> 18) & 0x3f );
2112de962bdSlukem p[len++] = 0x80 | ( (c >> 12) & 0x3f );
2122de962bdSlukem p[len++] = 0x80 | ( (c >> 6) & 0x3f );
2132de962bdSlukem p[len++] = 0x80 | ( c & 0x3f );
2142de962bdSlukem
2152de962bdSlukem } else /* if( c < 0x80000000 ) */ {
2162de962bdSlukem p[len++] = 0xfc | ( c >> 30 );
2172de962bdSlukem p[len++] = 0x80 | ( (c >> 24) & 0x3f );
2182de962bdSlukem p[len++] = 0x80 | ( (c >> 18) & 0x3f );
2192de962bdSlukem p[len++] = 0x80 | ( (c >> 12) & 0x3f );
2202de962bdSlukem p[len++] = 0x80 | ( (c >> 6) & 0x3f );
2212de962bdSlukem p[len++] = 0x80 | ( c & 0x3f );
2222de962bdSlukem }
2232de962bdSlukem
2242de962bdSlukem return len;
2252de962bdSlukem }
2262de962bdSlukem
/*
 * Number of octets needed to encode UCS-4 value c in UTF-8 (1..6),
 * or 0 if c is negative.
 *
 * The argument and the whole expansion are parenthesized so the macro
 * can be used inside larger expressions and with expression arguments.
 * The argument is evaluated more than once, so it must have no side
 * effects.
 */
#define LDAP_UCS_UTF8LEN(c)	\
	( (c) < 0 ? 0 : ( (c) < 0x80 ? 1 : ( (c) < 0x800 ? 2 : \
	( (c) < 0x10000 ? 3 : ( (c) < 0x200000 ? 4 : \
	( (c) < 0x4000000 ? 5 : 6 ) ) ) ) ) )
2302de962bdSlukem
2312de962bdSlukem /* Convert a string to UTF-8 format. The input string is expected to
2322de962bdSlukem * have characters of 1, 2, or 4 octets (in network byte order)
2332de962bdSlukem * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING
2342de962bdSlukem * types respectively. (Here T61STRING just means that there is one
2352de962bdSlukem * octet per character and characters may use the high bit of the octet.
2362de962bdSlukem * The characters are assumed to use ISO mappings, no provision is made
2372de962bdSlukem * for converting from T.61 coding rules to Unicode.)
2382de962bdSlukem */
2392de962bdSlukem
2402de962bdSlukem int
ldap_ucs_to_utf8s(struct berval * ucs,int csize,struct berval * utf8s)2412de962bdSlukem ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s )
2422de962bdSlukem {
2432de962bdSlukem unsigned char *in, *end;
2442de962bdSlukem char *ptr;
2452de962bdSlukem ldap_ucs4_t u;
2462de962bdSlukem int i, l = 0;
2472de962bdSlukem
2482de962bdSlukem utf8s->bv_val = NULL;
2492de962bdSlukem utf8s->bv_len = 0;
2502de962bdSlukem
2512de962bdSlukem in = (unsigned char *)ucs->bv_val;
2522de962bdSlukem
2532de962bdSlukem /* Make sure we stop at an even multiple of csize */
2542de962bdSlukem end = in + ( ucs->bv_len & ~(csize-1) );
2552de962bdSlukem
2562de962bdSlukem for (; in < end; ) {
2572de962bdSlukem u = *in++;
2582de962bdSlukem if (csize > 1) {
2592de962bdSlukem u <<= 8;
2602de962bdSlukem u |= *in++;
2612de962bdSlukem }
2622de962bdSlukem if (csize > 2) {
2632de962bdSlukem u <<= 8;
2642de962bdSlukem u |= *in++;
2652de962bdSlukem u <<= 8;
2662de962bdSlukem u |= *in++;
2672de962bdSlukem }
2682de962bdSlukem i = LDAP_UCS_UTF8LEN(u);
2692de962bdSlukem if (i == 0)
2702de962bdSlukem return LDAP_INVALID_SYNTAX;
2712de962bdSlukem l += i;
2722de962bdSlukem }
2732de962bdSlukem
2742de962bdSlukem utf8s->bv_val = LDAP_MALLOC( l+1 );
2752de962bdSlukem if (utf8s->bv_val == NULL)
2762de962bdSlukem return LDAP_NO_MEMORY;
2772de962bdSlukem utf8s->bv_len = l;
2782de962bdSlukem
2792de962bdSlukem ptr = utf8s->bv_val;
2802de962bdSlukem for (in = (unsigned char *)ucs->bv_val; in < end; ) {
2812de962bdSlukem u = *in++;
2822de962bdSlukem if (csize > 1) {
2832de962bdSlukem u <<= 8;
2842de962bdSlukem u |= *in++;
2852de962bdSlukem }
2862de962bdSlukem if (csize > 2) {
2872de962bdSlukem u <<= 8;
2882de962bdSlukem u |= *in++;
2892de962bdSlukem u <<= 8;
2902de962bdSlukem u |= *in++;
2912de962bdSlukem }
2922de962bdSlukem ptr += ldap_x_ucs4_to_utf8(u, ptr);
2932de962bdSlukem }
2942de962bdSlukem *ptr = '\0';
2952de962bdSlukem return LDAP_SUCCESS;
2962de962bdSlukem }
2972de962bdSlukem
/*
 * Advance to the next UTF-8 character
 *
 * Ignores the length implied by the first octet and instead relies on
 * continuation markers to find the start of the next character.
 * This allows for "resyncing" when invalid characters are provided,
 * provided the start of the next character appears within the
 * 6 bytes examined.
 */
char* ldap_utf8_next( const char * p )
{
	const unsigned char *octet = (const unsigned char *) p;
	int n = 1;

	if( !LDAP_UTF8_ISASCII(octet) ) {
		/* step over at most five continuation octets (10xxxxxx) */
		while( n < 6 && ( octet[n] & 0xc0 ) == 0x80 ) {
			n++;
		}
	}

	return (char *) &p[n];
}
3242de962bdSlukem
/*
 * Step back to the previous UTF-8 character
 *
 * Ignores the length of the multibyte character and instead relies on
 * continuation markers to find the start of the previous character.
 * This allows for "resyncing" when invalid characters are provided,
 * provided the start of that character appears within the 6 bytes
 * examined.
 *
 * p[-1] is read unconditionally, so p must not point at the start of
 * its buffer.
 */
char* ldap_utf8_prev( const char * p )
{
	const unsigned char *octet = (const unsigned char *) p;
	int back = -1;

	/* skip backwards over at most five continuation octets (10xxxxxx) */
	while( back > -6 && ( octet[back] & 0xc0 ) == 0x80 ) {
		back--;
	}

	return (char *) &p[back];
}
3472de962bdSlukem
/*
 * Copy one UTF-8 character from src to dst, returning the
 * number of bytes copied.
 *
 * Ignores the length implied by the first octet and instead relies on
 * continuation markers to find the start of the next character.
 * This allows for "resyncing" when invalid characters are provided,
 * provided the start of the next character appears within the
 * 6 bytes examined.  No terminator is written to dst.
 */
int ldap_utf8_copy( char* dst, const char *src )
{
	const unsigned char *octet = (const unsigned char *) src;
	int n = 1;

	/* the first octet is always copied */
	dst[0] = src[0];

	if( !LDAP_UTF8_ISASCII(octet) ) {
		/* then at most five continuation octets (10xxxxxx) */
		while( n < 6 && ( octet[n] & 0xc0 ) == 0x80 ) {
			dst[n] = src[n];
			n++;
		}
	}

	return n;
}
3782de962bdSlukem
3792de962bdSlukem #ifndef UTF8_ALPHA_CTYPE
3802de962bdSlukem /*
3812de962bdSlukem * UTF-8 ctype routines
3822de962bdSlukem * Only deals with characters < 0x80 (ie: US-ASCII)
3832de962bdSlukem */
3842de962bdSlukem
/* Return the result of LDAP_ASCII for the first octet at p. */
int ldap_utf8_isascii( const char * p )
{
	const unsigned octet = *(const unsigned char *) p;

	return LDAP_ASCII(octet);
}
3902de962bdSlukem
/*
 * Return the result of LDAP_DIGIT for the first octet at p,
 * or 0 if that octet is not ASCII.
 */
int ldap_utf8_isdigit( const char * p )
{
	const unsigned octet = *(const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_DIGIT( octet );
	}

	return 0;
}
3992de962bdSlukem
/*
 * Return the result of LDAP_HEX for the first octet at p,
 * or 0 if that octet is not ASCII.
 */
int ldap_utf8_isxdigit( const char * p )
{
	const unsigned octet = *(const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_HEX(octet);
	}

	return 0;
}
4082de962bdSlukem
/*
 * Return 1 if the first octet at p is one of the ASCII white-space
 * characters space, \t, \n, \r, \v or \f; otherwise return 0.
 */
int ldap_utf8_isspace( const char * p )
{
	const unsigned octet = *(const unsigned char *) p;

	if( !LDAP_ASCII(octet) ) {
		return 0;
	}

	return octet == ' '  || octet == '\t' || octet == '\n' ||
	       octet == '\r' || octet == '\v' || octet == '\f';
}
4272de962bdSlukem
4282de962bdSlukem /*
4292de962bdSlukem * These are not needed by the C SDK and are
4302de962bdSlukem * not "good enough" for general use.
4312de962bdSlukem */
/*
 * Return the result of LDAP_ALPHA for the first octet at p,
 * or 0 if that octet is not ASCII.
 */
int ldap_utf8_isalpha( const char * p )
{
	const unsigned octet = *(const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_ALPHA(octet);
	}

	return 0;
}
4402de962bdSlukem
/*
 * Return the result of LDAP_ALNUM for the first octet at p,
 * or 0 if that octet is not ASCII.
 */
int ldap_utf8_isalnum( const char * p )
{
	const unsigned octet = *(const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_ALNUM(octet);
	}

	return 0;
}
4492de962bdSlukem
/*
 * Return the result of LDAP_LOWER for the first octet at p,
 * or 0 if that octet is not ASCII.
 */
int ldap_utf8_islower( const char * p )
{
	const unsigned octet = *(const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_LOWER(octet);
	}

	return 0;
}
4582de962bdSlukem
/*
 * Return the result of LDAP_UPPER for the first octet at p,
 * or 0 if that octet is not ASCII.
 */
int ldap_utf8_isupper( const char * p )
{
	const unsigned octet = *(const unsigned char *) p;

	if( LDAP_ASCII(octet) ) {
		return LDAP_UPPER(octet);
	}

	return 0;
}
4672de962bdSlukem #endif
4682de962bdSlukem
4692de962bdSlukem
4702de962bdSlukem /*
4712de962bdSlukem * UTF-8 string routines
4722de962bdSlukem */
4732de962bdSlukem
4742de962bdSlukem /* like strchr() */
4752de962bdSlukem char * (ldap_utf8_strchr)( const char *str, const char *chr )
4762de962bdSlukem {
4772de962bdSlukem for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
4782de962bdSlukem if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
4792de962bdSlukem return (char *) str;
4802de962bdSlukem }
4812de962bdSlukem }
4822de962bdSlukem
4832de962bdSlukem return NULL;
4842de962bdSlukem }
4852de962bdSlukem
4862de962bdSlukem /* like strcspn() but returns number of bytes, not characters */
ber_len_t(ldap_utf8_strcspn)4872de962bdSlukem ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
4882de962bdSlukem {
4892de962bdSlukem const char *cstr;
4902de962bdSlukem const char *cset;
4912de962bdSlukem
4922de962bdSlukem for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
4932de962bdSlukem for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
4942de962bdSlukem if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
4952de962bdSlukem return cstr - str;
4962de962bdSlukem }
4972de962bdSlukem }
4982de962bdSlukem }
4992de962bdSlukem
5002de962bdSlukem return cstr - str;
5012de962bdSlukem }
5022de962bdSlukem
5032de962bdSlukem /* like strspn() but returns number of bytes, not characters */
ber_len_t(ldap_utf8_strspn)5042de962bdSlukem ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
5052de962bdSlukem {
5062de962bdSlukem const char *cstr;
5072de962bdSlukem const char *cset;
5082de962bdSlukem
5092de962bdSlukem for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
5102de962bdSlukem for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
5112de962bdSlukem if( *cset == '\0' ) {
5122de962bdSlukem return cstr - str;
5132de962bdSlukem }
5142de962bdSlukem
5152de962bdSlukem if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
5162de962bdSlukem break;
5172de962bdSlukem }
5182de962bdSlukem }
5192de962bdSlukem }
5202de962bdSlukem
5212de962bdSlukem return cstr - str;
5222de962bdSlukem }
5232de962bdSlukem
5242de962bdSlukem /* like strpbrk(), replaces strchr() as well */
5252de962bdSlukem char *(ldap_utf8_strpbrk)( const char *str, const char *set )
5262de962bdSlukem {
5272de962bdSlukem for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
5282de962bdSlukem const char *cset;
5292de962bdSlukem
5302de962bdSlukem for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
5312de962bdSlukem if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
5322de962bdSlukem return (char *) str;
5332de962bdSlukem }
5342de962bdSlukem }
5352de962bdSlukem }
5362de962bdSlukem
5372de962bdSlukem return NULL;
5382de962bdSlukem }
5392de962bdSlukem
5402de962bdSlukem /* like strtok_r(), not strtok() */
5412de962bdSlukem char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
5422de962bdSlukem {
5432de962bdSlukem char *begin;
5442de962bdSlukem char *end;
5452de962bdSlukem
5462de962bdSlukem if( last == NULL ) return NULL;
5472de962bdSlukem
5482de962bdSlukem begin = str ? str : *last;
5492de962bdSlukem
5502de962bdSlukem begin += ldap_utf8_strspn( begin, sep );
5512de962bdSlukem
5522de962bdSlukem if( *begin == '\0' ) {
5532de962bdSlukem *last = NULL;
5542de962bdSlukem return NULL;
5552de962bdSlukem }
5562de962bdSlukem
5572de962bdSlukem end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
5582de962bdSlukem
5592de962bdSlukem if( *end != '\0' ) {
5602de962bdSlukem char *next = LDAP_UTF8_NEXT( end );
5612de962bdSlukem *end = '\0';
5622de962bdSlukem end = next;
5632de962bdSlukem }
5642de962bdSlukem
5652de962bdSlukem *last = end;
5662de962bdSlukem return begin;
5672de962bdSlukem }
568