xref: /onnv-gate/usr/src/lib/libldap5/sources/ldap/common/ldaputf8.c (revision 0:68f95e015346)
1*0Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
2*0Sstevel@tonic-gate 
3*0Sstevel@tonic-gate /*
4*0Sstevel@tonic-gate  * The contents of this file are subject to the Netscape Public
5*0Sstevel@tonic-gate  * License Version 1.1 (the "License"); you may not use this file
6*0Sstevel@tonic-gate  * except in compliance with the License. You may obtain a copy of
7*0Sstevel@tonic-gate  * the License at http://www.mozilla.org/NPL/
8*0Sstevel@tonic-gate  *
9*0Sstevel@tonic-gate  * Software distributed under the License is distributed on an "AS
10*0Sstevel@tonic-gate  * IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
11*0Sstevel@tonic-gate  * implied. See the License for the specific language governing
12*0Sstevel@tonic-gate  * rights and limitations under the License.
13*0Sstevel@tonic-gate  *
14*0Sstevel@tonic-gate  * The Original Code is Mozilla Communicator client code, released
15*0Sstevel@tonic-gate  * March 31, 1998.
16*0Sstevel@tonic-gate  *
17*0Sstevel@tonic-gate  * The Initial Developer of the Original Code is Netscape
18*0Sstevel@tonic-gate  * Communications Corporation. Portions created by Netscape are
19*0Sstevel@tonic-gate  * Copyright (C) 1998-1999 Netscape Communications Corporation. All
20*0Sstevel@tonic-gate  * Rights Reserved.
21*0Sstevel@tonic-gate  *
22*0Sstevel@tonic-gate  * Contributor(s):
23*0Sstevel@tonic-gate  */
24*0Sstevel@tonic-gate 
/* utf8.c - misc. UTF-8 "string" functions. */
26*0Sstevel@tonic-gate #include "ldap-int.h"
27*0Sstevel@tonic-gate 
/*
 * Sequence length implied by a byte, indexed by the byte's top six bits
 * (byte >> 2).  A value of 0 marks a continuation byte (10xxxxxx), i.e. a
 * byte that cannot start a character.  The table is read-only, so it is
 * declared const.
 */
static const char UTF8len[64] = {
    /* 0x00-0x3F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40-0x7F */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF */ 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF */ 3, 3, 3, 3,
    /* 0xF0-0xF7 */ 4, 4,
    /* 0xF8-0xFB */ 5,
    /* 0xFC-0xFF */ 6
};
33*0Sstevel@tonic-gate 
34*0Sstevel@tonic-gate int
35*0Sstevel@tonic-gate LDAP_CALL
ldap_utf8len(const char * s)36*0Sstevel@tonic-gate ldap_utf8len (const char* s)
37*0Sstevel@tonic-gate      /* Return the number of char's in the character at *s. */
38*0Sstevel@tonic-gate {
39*0Sstevel@tonic-gate     return ldap_utf8next((char*)s) - s;
40*0Sstevel@tonic-gate }
41*0Sstevel@tonic-gate 
42*0Sstevel@tonic-gate char*
43*0Sstevel@tonic-gate LDAP_CALL
ldap_utf8next(char * s)44*0Sstevel@tonic-gate ldap_utf8next (char* s)
45*0Sstevel@tonic-gate      /* Return a pointer to the character immediately following *s.
46*0Sstevel@tonic-gate 	Handle any valid UTF-8 character, including '\0' and ASCII.
47*0Sstevel@tonic-gate 	Try to handle a misaligned pointer or a malformed character.
48*0Sstevel@tonic-gate      */
49*0Sstevel@tonic-gate {
50*0Sstevel@tonic-gate     register unsigned char* next = (unsigned char*)s;
51*0Sstevel@tonic-gate     switch (UTF8len [(*next >> 2) & 0x3F]) {
52*0Sstevel@tonic-gate       case 0: /* erroneous: s points to the middle of a character. */
53*0Sstevel@tonic-gate       case 6: if ((*++next & 0xC0) != 0x80) break;
54*0Sstevel@tonic-gate       case 5: if ((*++next & 0xC0) != 0x80) break;
55*0Sstevel@tonic-gate       case 4: if ((*++next & 0xC0) != 0x80) break;
56*0Sstevel@tonic-gate       case 3: if ((*++next & 0xC0) != 0x80) break;
57*0Sstevel@tonic-gate       case 2: if ((*++next & 0xC0) != 0x80) break;
58*0Sstevel@tonic-gate       case 1: ++next;
59*0Sstevel@tonic-gate     }
60*0Sstevel@tonic-gate     return (char*) next;
61*0Sstevel@tonic-gate }
62*0Sstevel@tonic-gate 
/*
 * Return a pointer to the character immediately preceding *s.
 * Handles any valid UTF-8 character, including '\0' and ASCII, and makes a
 * best effort on a misaligned pointer or a malformed character: the scan
 * walks backwards over continuation bytes (10xxxxxx) and gives up after
 * six steps, returning s - 6 in that case.
 *
 * The byte at s[-1] is always read, so s must not point at the first byte
 * of its buffer.
 *
 * The step limit is tracked with a counter rather than by precomputing
 * the pointer s - 6: forming a pointer that lies before the start of the
 * underlying array is undefined behavior even if it is never dereferenced,
 * and the counter also avoids reading the sixth byte needlessly.
 */
char *
LDAP_CALL
ldap_utf8prev(char *s)
{
    unsigned char *prev = (unsigned char *)s;
    int steps_left = 6;

    do {
        --prev;
    } while (--steps_left > 0 && (*prev & 0xC0) == 0x80);

    return (char *)prev;
}
78*0Sstevel@tonic-gate 
79*0Sstevel@tonic-gate int
80*0Sstevel@tonic-gate LDAP_CALL
ldap_utf8copy(char * dst,const char * src)81*0Sstevel@tonic-gate ldap_utf8copy (char* dst, const char* src)
82*0Sstevel@tonic-gate      /* Copy a character from src to dst; return the number of char's copied.
83*0Sstevel@tonic-gate 	Handle any valid UTF-8 character, including '\0' and ASCII.
84*0Sstevel@tonic-gate 	Try to handle a misaligned pointer or a malformed character.
85*0Sstevel@tonic-gate      */
86*0Sstevel@tonic-gate {
87*0Sstevel@tonic-gate     register const unsigned char* s = (const unsigned char*)src;
88*0Sstevel@tonic-gate     switch (UTF8len [(*s >> 2) & 0x3F]) {
89*0Sstevel@tonic-gate       case 0: /* erroneous: s points to the middle of a character. */
90*0Sstevel@tonic-gate       case 6: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
91*0Sstevel@tonic-gate       case 5: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
92*0Sstevel@tonic-gate       case 4: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
93*0Sstevel@tonic-gate       case 3: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
94*0Sstevel@tonic-gate       case 2: *dst++ = *s++; if ((*s & 0xC0) != 0x80) break;
95*0Sstevel@tonic-gate       case 1: *dst   = *s++;
96*0Sstevel@tonic-gate     }
97*0Sstevel@tonic-gate     return s - (const unsigned char*)src;
98*0Sstevel@tonic-gate }
99*0Sstevel@tonic-gate 
100*0Sstevel@tonic-gate size_t
101*0Sstevel@tonic-gate LDAP_CALL
ldap_utf8characters(const char * src)102*0Sstevel@tonic-gate ldap_utf8characters (const char* src)
103*0Sstevel@tonic-gate      /* Return the number of UTF-8 characters in the 0-terminated array s. */
104*0Sstevel@tonic-gate {
105*0Sstevel@tonic-gate     register char* s = (char*)src;
106*0Sstevel@tonic-gate     size_t n;
107*0Sstevel@tonic-gate     for (n = 0; *s; LDAP_UTF8INC(s)) ++n;
108*0Sstevel@tonic-gate     return n;
109*0Sstevel@tonic-gate }
110*0Sstevel@tonic-gate 
111*0Sstevel@tonic-gate unsigned long LDAP_CALL
ldap_utf8getcc(const char ** src)112*0Sstevel@tonic-gate ldap_utf8getcc( const char** src )
113*0Sstevel@tonic-gate {
114*0Sstevel@tonic-gate     register unsigned long c;
115*0Sstevel@tonic-gate     register const unsigned char* s = (const unsigned char*)*src;
116*0Sstevel@tonic-gate     switch (UTF8len [(*s >> 2) & 0x3F]) {
117*0Sstevel@tonic-gate       case 0: /* erroneous: s points to the middle of a character. */
118*0Sstevel@tonic-gate 	      c = (*s++) & 0x3F; goto more5;
119*0Sstevel@tonic-gate       case 1: c = (*s++); break;
120*0Sstevel@tonic-gate       case 2: c = (*s++) & 0x1F; goto more1;
121*0Sstevel@tonic-gate       case 3: c = (*s++) & 0x0F; goto more2;
122*0Sstevel@tonic-gate       case 4: c = (*s++) & 0x07; goto more3;
123*0Sstevel@tonic-gate       case 5: c = (*s++) & 0x03; goto more4;
124*0Sstevel@tonic-gate       case 6: c = (*s++) & 0x01; goto more5;
125*0Sstevel@tonic-gate       more5: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
126*0Sstevel@tonic-gate       more4: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
127*0Sstevel@tonic-gate       more3: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
128*0Sstevel@tonic-gate       more2: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
129*0Sstevel@tonic-gate       more1: if ((*s & 0xC0) != 0x80) break; c = (c << 6) | ((*s++) & 0x3F);
130*0Sstevel@tonic-gate 	break;
131*0Sstevel@tonic-gate     }
132*0Sstevel@tonic-gate     *src = (const char*)s;
133*0Sstevel@tonic-gate     return c;
134*0Sstevel@tonic-gate }
135*0Sstevel@tonic-gate 
136*0Sstevel@tonic-gate char*
137*0Sstevel@tonic-gate LDAP_CALL
ldap_utf8strtok_r(char * sp,const char * brk,char ** next)138*0Sstevel@tonic-gate ldap_utf8strtok_r( char* sp, const char* brk, char** next)
139*0Sstevel@tonic-gate {
140*0Sstevel@tonic-gate     const char *bp;
141*0Sstevel@tonic-gate     unsigned long sc, bc;
142*0Sstevel@tonic-gate     char *tok;
143*0Sstevel@tonic-gate 
144*0Sstevel@tonic-gate     if (sp == NULL && (sp = *next) == NULL)
145*0Sstevel@tonic-gate       return NULL;
146*0Sstevel@tonic-gate 
147*0Sstevel@tonic-gate     /* Skip leading delimiters; roughly, sp += strspn(sp, brk) */
148*0Sstevel@tonic-gate   cont:
149*0Sstevel@tonic-gate     sc = LDAP_UTF8GETC(sp);
150*0Sstevel@tonic-gate     for (bp = brk; (bc = LDAP_UTF8GETCC(bp)) != 0;) {
151*0Sstevel@tonic-gate 	if (sc == bc)
152*0Sstevel@tonic-gate 	  goto cont;
153*0Sstevel@tonic-gate     }
154*0Sstevel@tonic-gate 
155*0Sstevel@tonic-gate     if (sc == 0) { /* no non-delimiter characters */
156*0Sstevel@tonic-gate 	*next = NULL;
157*0Sstevel@tonic-gate 	return NULL;
158*0Sstevel@tonic-gate     }
159*0Sstevel@tonic-gate     tok = LDAP_UTF8PREV(sp);
160*0Sstevel@tonic-gate 
161*0Sstevel@tonic-gate     /* Scan token; roughly, sp += strcspn(sp, brk)
162*0Sstevel@tonic-gate      * Note that brk must be 0-terminated; we stop if we see that, too.
163*0Sstevel@tonic-gate      */
164*0Sstevel@tonic-gate     while (1) {
165*0Sstevel@tonic-gate 	sc = LDAP_UTF8GETC(sp);
166*0Sstevel@tonic-gate 	bp = brk;
167*0Sstevel@tonic-gate 	do {
168*0Sstevel@tonic-gate 	    if ((bc = LDAP_UTF8GETCC(bp)) == sc) {
169*0Sstevel@tonic-gate 		if (sc == 0) {
170*0Sstevel@tonic-gate 		    *next = NULL;
171*0Sstevel@tonic-gate 		} else {
172*0Sstevel@tonic-gate 		    *next = sp;
173*0Sstevel@tonic-gate 		    *(LDAP_UTF8PREV(sp)) = 0;
174*0Sstevel@tonic-gate 		}
175*0Sstevel@tonic-gate 		return tok;
176*0Sstevel@tonic-gate 	    }
177*0Sstevel@tonic-gate 	} while (bc != 0);
178*0Sstevel@tonic-gate     }
179*0Sstevel@tonic-gate     /* NOTREACHED */
180*0Sstevel@tonic-gate }
181*0Sstevel@tonic-gate 
/*
 * Return 1 if the byte at *s is an ASCII letter or decimal digit, else 0.
 * Only the first byte is examined; any byte with the high bit set (part
 * of a multi-byte character) yields 0.
 */
int
LDAP_CALL
ldap_utf8isalnum(char *s)
{
    const unsigned char ch = *(unsigned char *)s;

    if (ch & 0x80) {
        return 0;
    }
    return (ch >= '0' && ch <= '9') ||
           (ch >= 'A' && ch <= 'Z') ||
           (ch >= 'a' && ch <= 'z');
}
193*0Sstevel@tonic-gate 
/*
 * Return 1 if the byte at *s is an ASCII letter (A-Z or a-z), else 0.
 * Only the first byte is examined; any byte with the high bit set (part
 * of a multi-byte character) yields 0.
 */
int
LDAP_CALL
ldap_utf8isalpha(char *s)
{
    const unsigned char ch = *(unsigned char *)s;
    const int is_ascii = (ch & 0x80) == 0;
    const int is_upper = (ch >= 'A' && ch <= 'Z');
    const int is_lower = (ch >= 'a' && ch <= 'z');

    return is_ascii && (is_upper || is_lower);
}
204*0Sstevel@tonic-gate 
/*
 * Return 1 if the byte at *s is an ASCII decimal digit (0-9), else 0.
 * Only the first byte is examined; any byte with the high bit set (part
 * of a multi-byte character) yields 0.
 */
int
LDAP_CALL
ldap_utf8isdigit(char *s)
{
    const unsigned char ch = *(unsigned char *)s;

    /* '0'..'9' all lie below 0x80, so the range test alone rejects
     * bytes with the high bit set. */
    return (ch >= '0' && ch <= '9') ? 1 : 0;
}
214*0Sstevel@tonic-gate 
/*
 * Return 1 if the byte at *s is an ASCII hexadecimal digit (0-9, A-F or
 * a-f), else 0.  Only the first byte is examined; any byte with the high
 * bit set (part of a multi-byte character) yields 0.
 */
int
LDAP_CALL
ldap_utf8isxdigit(char *s)
{
    const unsigned char ch = *(unsigned char *)s;

    if ((ch & 0x80) != 0) {
        return 0;
    }
    if (ch >= '0' && ch <= '9') {
        return 1;
    }
    return (ch >= 'a' && ch <= 'f') || (ch >= 'A' && ch <= 'F');
}
226*0Sstevel@tonic-gate 
227*0Sstevel@tonic-gate int
228*0Sstevel@tonic-gate LDAP_CALL
ldap_utf8isspace(char * s)229*0Sstevel@tonic-gate ldap_utf8isspace( char* s )
230*0Sstevel@tonic-gate {
231*0Sstevel@tonic-gate     register unsigned char *c = (unsigned char*)s;
232*0Sstevel@tonic-gate     int len = ldap_utf8len(s);
233*0Sstevel@tonic-gate 
234*0Sstevel@tonic-gate     if (len == 0) {
235*0Sstevel@tonic-gate 	return 0;
236*0Sstevel@tonic-gate     } else if (len == 1) {
237*0Sstevel@tonic-gate 	switch (*c) {
238*0Sstevel@tonic-gate 	    case 0x09:
239*0Sstevel@tonic-gate 	    case 0x0A:
240*0Sstevel@tonic-gate 	    case 0x0B:
241*0Sstevel@tonic-gate 	    case 0x0C:
242*0Sstevel@tonic-gate 	    case 0x0D:
243*0Sstevel@tonic-gate 	    case 0x20:
244*0Sstevel@tonic-gate 		return 1;
245*0Sstevel@tonic-gate 	    default:
246*0Sstevel@tonic-gate 		return 0;
247*0Sstevel@tonic-gate 	}
248*0Sstevel@tonic-gate     } else if (len == 2) {
249*0Sstevel@tonic-gate 	if (*c == 0xc2) {
250*0Sstevel@tonic-gate 		return *(c+1) == 0x80;
251*0Sstevel@tonic-gate 	}
252*0Sstevel@tonic-gate     } else if (len == 3) {
253*0Sstevel@tonic-gate 	if (*c == 0xE2) {
254*0Sstevel@tonic-gate 	    c++;
255*0Sstevel@tonic-gate 	    if (*c == 0x80) {
256*0Sstevel@tonic-gate 		c++;
257*0Sstevel@tonic-gate 		return (*c>=0x80 && *c<=0x8a);
258*0Sstevel@tonic-gate 	    }
259*0Sstevel@tonic-gate 	} else if (*c == 0xE3) {
260*0Sstevel@tonic-gate 	    return (*(c+1)==0x80) && (*(c+2)==0x80);
261*0Sstevel@tonic-gate 	} else if (*c==0xEF) {
262*0Sstevel@tonic-gate 	    return (*(c+1)==0xBB) && (*(c+2)==0xBF);
263*0Sstevel@tonic-gate 	}
264*0Sstevel@tonic-gate 	return 0;
265*0Sstevel@tonic-gate     }
266*0Sstevel@tonic-gate 
267*0Sstevel@tonic-gate     /* should never reach here */
268*0Sstevel@tonic-gate     return 0;
269*0Sstevel@tonic-gate }
270