xref: /netbsd-src/external/bsd/openldap/dist/include/ldap_pvt_uc.h (revision 549b59ed3ccf0d36d3097190a0db27b770f3a839)
1*549b59edSchristos /*	$NetBSD: ldap_pvt_uc.h,v 1.3 2021/08/14 16:14:55 christos Exp $	*/
24e6df137Slukem 
3d11b170bStron /* $OpenLDAP$ */
42de962bdSlukem /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
52de962bdSlukem  *
6*549b59edSchristos  * Copyright 1998-2021 The OpenLDAP Foundation.
72de962bdSlukem  * All rights reserved.
82de962bdSlukem  *
92de962bdSlukem  * Redistribution and use in source and binary forms, with or without
102de962bdSlukem  * modification, are permitted only as authorized by the OpenLDAP
112de962bdSlukem  * Public License.
122de962bdSlukem  *
132de962bdSlukem  * A copy of this license is available in file LICENSE in the
142de962bdSlukem  * top-level directory of the distribution or, alternatively, at
152de962bdSlukem  * <http://www.OpenLDAP.org/license.html>.
162de962bdSlukem  */
172de962bdSlukem 
182de962bdSlukem /*
192de962bdSlukem  * ldap_pvt_uc.h - Header for Unicode functions.
202de962bdSlukem  * These are meant to be used by the OpenLDAP distribution only.
212de962bdSlukem  * These should be named ldap_pvt_....()
222de962bdSlukem  */
232de962bdSlukem 
242de962bdSlukem #ifndef _LDAP_PVT_UC_H
252de962bdSlukem #define _LDAP_PVT_UC_H 1
262de962bdSlukem 
272de962bdSlukem #include <lber.h>				/* get ber_slen_t */
282de962bdSlukem 
292de962bdSlukem #include <ac/bytes.h>
302de962bdSlukem #include "../libraries/liblunicode/ucdata/ucdata.h"
312de962bdSlukem 
322de962bdSlukem LDAP_BEGIN_DECL
332de962bdSlukem 
342de962bdSlukem /*
352de962bdSlukem  * UTF-8 (in utf-8.c)
362de962bdSlukem  */
372de962bdSlukem 
/* Character type taken by the ucstr*() routines declared later in this
 * header.
 * NOTE(review): ac_uint4 is presumably supplied by <ac/bytes.h> -- confirm.
 */
382de962bdSlukem /* UCDATA uses UCS-2 passed in a 4 byte unsigned int */
392de962bdSlukem typedef ac_uint4 ldap_unicode_t;
402de962bdSlukem 
412de962bdSlukem /* Convert a string with csize octets per character to UTF-8 */
/* NOTE(review): by the parameter names, ucs is the input value and utf8s
 * receives the result; who owns the storage behind utf8s and what the int
 * return value means are not visible in this header -- confirm in the
 * implementation before relying on either.
 */
422de962bdSlukem LDAP_F( int ) ldap_ucs_to_utf8s LDAP_P((
432de962bdSlukem 	struct berval *ucs, int csize, struct berval *utf8s ));
442de962bdSlukem 
452de962bdSlukem 
/*
 * Length queries on UTF-8 data.
 * ldap_utf8_offset() and ldap_utf8_charlen() have macro forms in this
 * header, LDAP_UTF8_OFFSET() and LDAP_UTF8_CHARLEN(), which answer 1 for
 * an ASCII byte without calling a function.
 * NOTE(review): NULL handling and the value returned for a malformed
 * sequence are not visible from these prototypes -- confirm in utf-8.c.
 */
462de962bdSlukem /* returns the number of bytes in the UTF-8 string */
472de962bdSlukem LDAP_F (ber_len_t) ldap_utf8_bytes( const char * );
482de962bdSlukem /* returns the number of UTF-8 characters in the string */
492de962bdSlukem LDAP_F (ber_len_t) ldap_utf8_chars( const char * );
502de962bdSlukem /* returns the length (in bytes) of the UTF-8 character */
512de962bdSlukem LDAP_F (int) ldap_utf8_offset( const char * );
522de962bdSlukem /* returns the length (in bytes) indicated by the UTF-8 character */
532de962bdSlukem LDAP_F (int) ldap_utf8_charlen( const char * );
542de962bdSlukem 
/* Function counterpart of the LDAP_UTF8_CHARLEN2() macro in this header.
 * NOTE(review): the macro yields 0 when its shortest-encoding check fails;
 * presumably this function reports failure the same way -- confirm.
 */
552de962bdSlukem /* returns the length (in bytes) indicated by the UTF-8 character
562de962bdSlukem  * also checks that shortest possible encoding was used
572de962bdSlukem  */
582de962bdSlukem LDAP_F (int) ldap_utf8_charlen2( const char * );
592de962bdSlukem 
602de962bdSlukem /* copies a UTF-8 character, returning the number of bytes copied */
/* The first argument is the writable destination and the second the
 * source, matching how LDAP_UTF8_COPY(d,s) passes them through.
 * NOTE(review): the destination size is not passed, so the caller
 * presumably must provide room for a whole character -- confirm.
 */
612de962bdSlukem LDAP_F (int) ldap_utf8_copy( char *, const char *);
622de962bdSlukem 
/* Both routines take a const pointer but return a non-const char *, so
 * the const qualifier of the argument is dropped in the result.
 * NOTE(review): ldap_utf8_prev() is given no start-of-buffer bound; the
 * caller presumably must guarantee a preceding character exists -- confirm.
 */
632de962bdSlukem /* returns pointer to next UTF-8 character in string */
642de962bdSlukem LDAP_F (char*) ldap_utf8_next( const char * );
652de962bdSlukem /* returns pointer to previous UTF-8 character in string */
662de962bdSlukem LDAP_F (char*) ldap_utf8_prev( const char * );
672de962bdSlukem 
682de962bdSlukem /* primitive ctype routines -- not aware of non-ascii characters */
/* Each routine is handed a pointer to the character to classify rather
 * than a character value.
 * NOTE(review): presumably they return nonzero for "true" as the <ctype.h>
 * functions do -- confirm in utf-8.c.
 */
692de962bdSlukem LDAP_F (int) ldap_utf8_isascii( const char * );
702de962bdSlukem LDAP_F (int) ldap_utf8_isalpha( const char * );
712de962bdSlukem LDAP_F (int) ldap_utf8_isalnum( const char * );
722de962bdSlukem LDAP_F (int) ldap_utf8_isdigit( const char * );
732de962bdSlukem LDAP_F (int) ldap_utf8_isxdigit( const char * );
742de962bdSlukem LDAP_F (int) ldap_utf8_isspace( const char * );
752de962bdSlukem 
/*
 * UTF-8 counterparts of the <string.h> span/search/tokenize routines.
 * The "character" searched for by ldap_utf8_strchr() is passed as a
 * pointer (chr), not as an integer code point.
 * NOTE(review): the value returned when nothing is found is presumably
 * NULL for the pointer-returning routines -- confirm in utf-8.c.
 */
762de962bdSlukem /* span characters not in set, return bytes spanned */
772de962bdSlukem LDAP_F (ber_len_t) ldap_utf8_strcspn( const char* str, const char *set);
782de962bdSlukem /* span characters in set, return bytes spanned */
792de962bdSlukem LDAP_F (ber_len_t) ldap_utf8_strspn( const char* str, const char *set);
80*549b59edSchristos /* return first occurrence of character in string */
812de962bdSlukem LDAP_F (char *) ldap_utf8_strchr( const char* str, const char *chr);
822de962bdSlukem /* return first character of set in string */
832de962bdSlukem LDAP_F (char *) ldap_utf8_strpbrk( const char* str, const char *set);
842de962bdSlukem /* reentrant tokenizer */
/* NOTE(review): sp is non-const, so the input is presumably modified in
 * place, with *last carrying the position between calls as in strtok_r()
 * -- confirm.
 */
852de962bdSlukem LDAP_F (char*) ldap_utf8_strtok( char* sp, const char* sep, char **last);
862de962bdSlukem 
872de962bdSlukem /* Optimizations */
/* Lookup tables behind the macros in this header:
 *  - ldap_utf8_lentab is indexed by a non-ASCII byte with its high bit
 *    flipped (byte ^ 0x80) in LDAP_UTF8_CHARLEN(), hence 128 entries.
 *  - ldap_utf8_mintab is indexed by the low five bits of the first byte
 *    (byte & 0x1f) in LDAP_UTF8_CHARLEN2(), hence 32 entries; the entry
 *    is ANDed with the second byte of the sequence.
 * Only declared here; the table contents are defined elsewhere.
 */
882de962bdSlukem LDAP_V (const char) ldap_utf8_lentab[128];
892de962bdSlukem LDAP_V (const char) ldap_utf8_mintab[32];
902de962bdSlukem 
/* Nonzero when the byte at p has its high bit (0x80) clear.  The byte is
 * read through a const unsigned char pointer, so the signedness of plain
 * char does not affect the test.
 */
912de962bdSlukem #define LDAP_UTF8_ISASCII(p) ( !(*(const unsigned char *)(p) & 0x80 ) )
/* Yields 1 for an ASCII byte, otherwise the ldap_utf8_lentab entry for
 * that byte (index: byte ^ 0x80).  p appears twice in the expansion, so
 * do not pass an expression with side effects.
 */
922de962bdSlukem #define LDAP_UTF8_CHARLEN(p) ( LDAP_UTF8_ISASCII(p) \
932de962bdSlukem 	? 1 : ldap_utf8_lentab[*(const unsigned char *)(p) ^ 0x80] )
942de962bdSlukem 
952de962bdSlukem /* This is like CHARLEN but additionally validates to make sure
962de962bdSlukem  * the char used the shortest possible encoding.
972de962bdSlukem  * 'l' is used to temporarily hold the result of CHARLEN.
982de962bdSlukem  */
/* Usage notes:
 *  - l must be a modifiable lvalue; it is assigned as a side effect.
 *  - When l is below 3 the result is l with no further check.  Otherwise
 *    the result is l only if ldap_utf8_mintab[first byte & 0x1f] ANDed
 *    with the second byte is nonzero, and 0 if it is not.
 *  - (p)[1] is read only when l is 3 or more, because the || operator
 *    short-circuits.
 *  - p is expanded several times; avoid arguments with side effects.
 */
992de962bdSlukem #define LDAP_UTF8_CHARLEN2(p, l) ( ( ( l = LDAP_UTF8_CHARLEN( p )) < 3 || \
1002de962bdSlukem 	( ldap_utf8_mintab[*(const unsigned char *)(p) & 0x1f] & (p)[1] ) ) ? \
1012de962bdSlukem 	l : 0 )
1022de962bdSlukem 
/* Yields 1 for an ASCII byte without a function call; any other byte is
 * handed to ldap_utf8_offset().  p is expanded twice.
 */
1032de962bdSlukem #define LDAP_UTF8_OFFSET(p) ( LDAP_UTF8_ISASCII(p) \
1042de962bdSlukem 	? 1 : ldap_utf8_offset((p)) )
1052de962bdSlukem 
/* Copies one character from s to d.  An ASCII byte is stored directly and
 * the expression yields 1; anything else is delegated to ldap_utf8_copy(),
 * whose return value becomes the result.  s is expanded twice on either
 * path, so avoid arguments with side effects.
 */
1062de962bdSlukem #define LDAP_UTF8_COPY(d,s) ( LDAP_UTF8_ISASCII(s) \
1072de962bdSlukem 	? (*(d) = *(s), 1) : ldap_utf8_copy((d),(s)) )
1082de962bdSlukem 
/* Yields a char * to the character after p: p+1 for an ASCII byte,
 * otherwise whatever ldap_utf8_next() returns.  The ASCII branch casts p
 * to char *, so a const qualifier on p is dropped in the result.
 * p is expanded twice.
 */
1092de962bdSlukem #define LDAP_UTF8_NEXT(p) (	LDAP_UTF8_ISASCII(p) \
1102de962bdSlukem 	? (char *)(p)+1 : ldap_utf8_next((p)) )
1112de962bdSlukem 
/* Advances p in place to the next character; p must be an assignable
 * pointer variable and is expanded more than once.
 */
1122de962bdSlukem #define LDAP_UTF8_INCR(p) ((p) = LDAP_UTF8_NEXT(p))
1132de962bdSlukem 
1142de962bdSlukem /* For symmetry */
/* Unlike LDAP_UTF8_NEXT() there is no ASCII shortcut here: PREV always
 * calls ldap_utf8_prev().  DECR assigns the result back, so p must be an
 * assignable pointer variable; it is expanded twice.
 */
1152de962bdSlukem #define LDAP_UTF8_PREV(p) (ldap_utf8_prev((p)))
1162de962bdSlukem #define LDAP_UTF8_DECR(p) ((p)=LDAP_UTF8_PREV((p)))
1172de962bdSlukem 
1182de962bdSlukem 
1192de962bdSlukem /* these probably should be renamed */
/* Comparison of two ldap_unicode_t strings, with and without regard to
 * case (going by the names).
 * NOTE(review): presumably these follow the strncmp()/strncasecmp()
 * conventions, with the ber_len_t argument as a maximum character count
 * and a negative/zero/positive result -- confirm in liblunicode.
 */
1202de962bdSlukem LDAP_LUNICODE_F(int) ucstrncmp(
1212de962bdSlukem 	const ldap_unicode_t *,
1222de962bdSlukem 	const ldap_unicode_t *,
1232de962bdSlukem 	ber_len_t );
1242de962bdSlukem 
1252de962bdSlukem LDAP_LUNICODE_F(int) ucstrncasecmp(
1262de962bdSlukem 	const ldap_unicode_t *,
1272de962bdSlukem 	const ldap_unicode_t *,
1282de962bdSlukem 	ber_len_t );
1292de962bdSlukem 
/* Search an ldap_unicode_t string for a single character value.
 * The string is taken as const but the result is a non-const pointer.
 * NOTE(review): presumably the ber_len_t argument bounds the number of
 * characters examined, the "case" variant ignores case, and NULL is
 * returned when there is no match -- confirm in liblunicode.
 */
1302de962bdSlukem LDAP_LUNICODE_F(ldap_unicode_t *) ucstrnchr(
1312de962bdSlukem 	const ldap_unicode_t *,
1322de962bdSlukem 	ber_len_t,
1332de962bdSlukem 	ldap_unicode_t );
1342de962bdSlukem 
1352de962bdSlukem LDAP_LUNICODE_F(ldap_unicode_t *) ucstrncasechr(
1362de962bdSlukem 	const ldap_unicode_t *,
1372de962bdSlukem 	ber_len_t,
1382de962bdSlukem 	ldap_unicode_t );
1392de962bdSlukem 
/* NOTE(review): the string pointer is non-const and nothing is returned,
 * so this presumably upper-cases the given number of characters in place
 * -- confirm in liblunicode.
 */
1402de962bdSlukem LDAP_LUNICODE_F(void) ucstr2upper(
1412de962bdSlukem 	ldap_unicode_t *,
1422de962bdSlukem 	ber_len_t );
1432de962bdSlukem 
/* Unsigned flag values.  Apart from NOCASEFOLD (zero, i.e. no flag set)
 * each occupies its own bit, so they can be combined with bitwise OR.
 * NOTE(review): presumably these are the values accepted by the unsigned
 * argument of UTF8bvnormalize() and UTF8bvnormcmp(), with ARG1NFC/ARG2NFC
 * marking an argument as already being in NFC form -- confirm.
 */
1442de962bdSlukem #define LDAP_UTF8_NOCASEFOLD	0x0U
1452de962bdSlukem #define LDAP_UTF8_CASEFOLD	0x1U
1462de962bdSlukem #define LDAP_UTF8_ARG1NFC	0x2U
1472de962bdSlukem #define LDAP_UTF8_ARG2NFC	0x4U
1482de962bdSlukem #define LDAP_UTF8_APPROX	0x8U
1492de962bdSlukem 
/* Normalizes a UTF-8 berval (going by the name).
 * NOTE(review): the roles of the two berval arguments (presumably input
 * and output), the meaning of the unsigned argument (presumably an OR of
 * the LDAP_UTF8_* flags in this header), the use of memctx for allocation
 * and the ownership of the returned berval are not visible here --
 * confirm in liblunicode before relying on them.
 */
1502de962bdSlukem LDAP_LUNICODE_F(struct berval *) UTF8bvnormalize(
1512de962bdSlukem 	struct berval *,
1522de962bdSlukem 	struct berval *,
1532de962bdSlukem 	unsigned,
1542de962bdSlukem 	void *memctx );
1552de962bdSlukem 
/* Compares two UTF-8 bervals under normalization (going by the name).
 * NOTE(review): presumably the unsigned argument is an OR of the
 * LDAP_UTF8_* flags in this header and the result is negative/zero/
 * positive like strcmp(); neither is visible here -- confirm in
 * liblunicode.
 */
1562de962bdSlukem LDAP_LUNICODE_F(int) UTF8bvnormcmp(
1572de962bdSlukem 	struct berval *,
1582de962bdSlukem 	struct berval *,
1592de962bdSlukem 	unsigned,
1602de962bdSlukem 	void *memctx );
1612de962bdSlukem 
1622de962bdSlukem LDAP_END_DECL
1632de962bdSlukem 
1642de962bdSlukem #endif
1652de962bdSlukem 
166