1 /* $NetBSD: utf-8.c,v 1.3 2021/08/14 16:14:56 christos Exp $ */
2
3 /* utf-8.c -- Basic UTF-8 routines */
4 /* $OpenLDAP$ */
5 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
6 *
7 * Copyright 1998-2021 The OpenLDAP Foundation.
8 * All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted only as authorized by the OpenLDAP
12 * Public License.
13 *
14 * A copy of this license is available in the file LICENSE in the
15 * top-level directory of the distribution or, alternatively, at
16 * <http://www.OpenLDAP.org/license.html>.
17 */
18 /* Basic UTF-8 routines
19 *
20 * These routines are "dumb". Though they understand UTF-8,
21 * they don't grok Unicode. That is, they can push bits,
22 * but don't have a clue what the bits represent. That's
23 * good enough for use with the LDAP Client SDK.
24 *
25 * These routines are not optimized.
26 */
27
28 #include <sys/cdefs.h>
29 __RCSID("$NetBSD: utf-8.c,v 1.3 2021/08/14 16:14:56 christos Exp $");
30
31 #include "portable.h"
32
33 #include <stdio.h>
34
35 #include <ac/stdlib.h>
36
37 #include <ac/socket.h>
38 #include <ac/string.h>
39 #include <ac/time.h>
40
41 #include "ldap_utf8.h"
42
43 #include "ldap-int.h"
44 #include "ldap_defaults.h"
45
46 /*
47 * return the number of bytes required to hold the
48 * NULL-terminated UTF-8 string NOT INCLUDING the
49 * termination.
50 */
ldap_utf8_bytes(const char * p)51 ber_len_t ldap_utf8_bytes( const char * p )
52 {
53 ber_len_t bytes;
54
55 for( bytes=0; p[bytes]; bytes++ ) {
56 /* EMPTY */ ;
57 }
58
59 return bytes;
60 }
61
ldap_utf8_chars(const char * p)62 ber_len_t ldap_utf8_chars( const char * p )
63 {
64 /* could be optimized and could check for invalid sequences */
65 ber_len_t chars=0;
66
67 for( ; *p ; LDAP_UTF8_INCR(p) ) {
68 chars++;
69 }
70
71 return chars;
72 }
73
/*
 * Return the offset, in octets, from p to the position that
 * LDAP_UTF8_NEXT() reports as the start of the next character.
 */
int ldap_utf8_offset( const char * p )
{
	const char *following = LDAP_UTF8_NEXT(p);

	return following - p;
}
79
/*
 * Length lookup for octets with the high bit set.
 *
 * The table is indexed by (octet ^ 0x80), so entry 0 belongs to
 * octet 0x80 and entry 0x7f to octet 0xff.  An entry of 0 marks an
 * octet that cannot start a character.
 */
const char ldap_utf8_lentab[] = {
	/* 0x80 - 0xbf */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 0xc0 - 0xdf */
	0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* 0xe0 - 0xef */
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
	/* 0xf0 - 0xff */
	4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };

/*
 * Returns length indicated by first byte:
 * 1 when the high bit of *p is clear, otherwise the table entry
 * for that octet (0 when the octet cannot start a character).
 */
int ldap_utf8_charlen( const char * p )
{
	const unsigned char lead = *(const unsigned char *)p;

	if ( ( lead & 0x80 ) == 0 ) {
		return 1;
	}

	return ldap_utf8_lentab[lead ^ 0x80];
}
100
101 /*
102 * Make sure the UTF-8 char used the shortest possible encoding
103 * returns charlen if valid, 0 if not.
104 *
105 * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
106 * The table is slightly modified from that of the RFC.
107 *
108 * UCS-4 range (hex) UTF-8 sequence (binary)
109 * 0000 0000-0000 007F 0.......
110 * 0000 0080-0000 07FF 110++++. 10......
111 * 0000 0800-0000 FFFF 1110++++ 10+..... 10......
112 * 0001 0000-001F FFFF 11110+++ 10++.... 10...... 10......
113 * 0020 0000-03FF FFFF 111110++ 10+++... 10...... 10...... 10......
114 * 0400 0000-7FFF FFFF 1111110+ 10++++.. 10...... 10...... 10...... 10......
115 *
116 * The '.' bits are "don't cares". When validating a UTF-8 sequence,
117 * at least one of the '+' bits must be set, otherwise the character
118 * should have been encoded in fewer octets. Note that in the two-octet
119 * case, only the first octet needs to be validated, and this is done
120 * in the ldap_utf8_lentab[] above.
121 */
122
/*
 * mask of required bits in second octet
 *
 * Indexed by the low five bits of the first octet.  At least one of
 * the masked bits must be set in the second octet.
 *
 * Code further down uses `c' as an ordinary identifier, so make sure
 * no macro of that name is left defined.
 */
#undef c
const char ldap_utf8_mintab[] = {
	/* 0x00 - 0x07 */
	(const char)0x20, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
	/* 0x08 - 0x0f */
	(const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
	/* 0x10 - 0x17 */
	(const char)0x30, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x80, (const char)0x80, (const char)0x80, (const char)0x80,
	/* 0x18 - 0x1f */
	(const char)0x38, (const char)0x80, (const char)0x80, (const char)0x80,
	(const char)0x3c, (const char)0x80, (const char)0x00, (const char)0x00 };
132
ldap_utf8_charlen2(const char * p)133 int ldap_utf8_charlen2( const char * p )
134 {
135 int i = LDAP_UTF8_CHARLEN( p );
136
137 if ( i > 2 ) {
138 if ( !( ldap_utf8_mintab[*p & 0x1f] & p[1] ) )
139 i = 0;
140 }
141 return i;
142 }
143
144 /* conv UTF-8 to UCS-4, useful for comparisons */
ldap_x_utf8_to_ucs4(const char * p)145 ldap_ucs4_t ldap_x_utf8_to_ucs4( const char * p )
146 {
147 const unsigned char *c = (const unsigned char *) p;
148 ldap_ucs4_t ch;
149 int len, i;
150 static unsigned char mask[] = {
151 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
152
153 len = LDAP_UTF8_CHARLEN2(p, len);
154
155 if( len == 0 ) return LDAP_UCS4_INVALID;
156
157 ch = c[0] & mask[len];
158
159 for(i=1; i < len; i++) {
160 if ((c[i] & 0xc0) != 0x80) {
161 return LDAP_UCS4_INVALID;
162 }
163
164 ch <<= 6;
165 ch |= c[i] & 0x3f;
166 }
167
168 return ch;
169 }
170
171 /* conv UCS-4 to UTF-8, not used */
ldap_x_ucs4_to_utf8(ldap_ucs4_t c,char * buf)172 int ldap_x_ucs4_to_utf8( ldap_ucs4_t c, char *buf )
173 {
174 int len=0;
175 unsigned char* p = (unsigned char *) buf;
176
177 /* not a valid Unicode character */
178 if ( c < 0 ) return 0;
179
180 /* Just return length, don't convert */
181 if(buf == NULL) {
182 if( c < 0x80 ) return 1;
183 else if( c < 0x800 ) return 2;
184 else if( c < 0x10000 ) return 3;
185 else if( c < 0x200000 ) return 4;
186 else if( c < 0x4000000 ) return 5;
187 else return 6;
188 }
189
190 if( c < 0x80 ) {
191 p[len++] = c;
192
193 } else if( c < 0x800 ) {
194 p[len++] = 0xc0 | ( c >> 6 );
195 p[len++] = 0x80 | ( c & 0x3f );
196
197 } else if( c < 0x10000 ) {
198 p[len++] = 0xe0 | ( c >> 12 );
199 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
200 p[len++] = 0x80 | ( c & 0x3f );
201
202 } else if( c < 0x200000 ) {
203 p[len++] = 0xf0 | ( c >> 18 );
204 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
205 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
206 p[len++] = 0x80 | ( c & 0x3f );
207
208 } else if( c < 0x4000000 ) {
209 p[len++] = 0xf8 | ( c >> 24 );
210 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
211 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
212 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
213 p[len++] = 0x80 | ( c & 0x3f );
214
215 } else /* if( c < 0x80000000 ) */ {
216 p[len++] = 0xfc | ( c >> 30 );
217 p[len++] = 0x80 | ( (c >> 24) & 0x3f );
218 p[len++] = 0x80 | ( (c >> 18) & 0x3f );
219 p[len++] = 0x80 | ( (c >> 12) & 0x3f );
220 p[len++] = 0x80 | ( (c >> 6) & 0x3f );
221 p[len++] = 0x80 | ( c & 0x3f );
222 }
223
224 return len;
225 }
226
/*
 * Number of octets needed to encode c in UTF-8: 1 to 6, or 0 when
 * c is negative.
 *
 * The argument and the whole expansion are parenthesized so the macro
 * can be used inside larger expressions and with compound arguments.
 * NOTE: c is evaluated several times; pass an expression without
 * side effects.
 */
#define LDAP_UCS_UTF8LEN(c)	\
	( (c) < 0 ? 0 : ( (c) < 0x80 ? 1 : ( (c) < 0x800 ? 2 : \
	( (c) < 0x10000 ? 3 : ( (c) < 0x200000 ? 4 : \
	( (c) < 0x4000000 ? 5 : 6 ) ) ) ) ) )
230
231 /* Convert a string to UTF-8 format. The input string is expected to
232 * have characters of 1, 2, or 4 octets (in network byte order)
233 * corresponding to the ASN.1 T61STRING, BMPSTRING, and UNIVERSALSTRING
234 * types respectively. (Here T61STRING just means that there is one
235 * octet per character and characters may use the high bit of the octet.
236 * The characters are assumed to use ISO mappings, no provision is made
237 * for converting from T.61 coding rules to Unicode.)
238 */
239
240 int
ldap_ucs_to_utf8s(struct berval * ucs,int csize,struct berval * utf8s)241 ldap_ucs_to_utf8s( struct berval *ucs, int csize, struct berval *utf8s )
242 {
243 unsigned char *in, *end;
244 char *ptr;
245 ldap_ucs4_t u;
246 int i, l = 0;
247
248 utf8s->bv_val = NULL;
249 utf8s->bv_len = 0;
250
251 in = (unsigned char *)ucs->bv_val;
252
253 /* Make sure we stop at an even multiple of csize */
254 end = in + ( ucs->bv_len & ~(csize-1) );
255
256 for (; in < end; ) {
257 u = *in++;
258 if (csize > 1) {
259 u <<= 8;
260 u |= *in++;
261 }
262 if (csize > 2) {
263 u <<= 8;
264 u |= *in++;
265 u <<= 8;
266 u |= *in++;
267 }
268 i = LDAP_UCS_UTF8LEN(u);
269 if (i == 0)
270 return LDAP_INVALID_SYNTAX;
271 l += i;
272 }
273
274 utf8s->bv_val = LDAP_MALLOC( l+1 );
275 if (utf8s->bv_val == NULL)
276 return LDAP_NO_MEMORY;
277 utf8s->bv_len = l;
278
279 ptr = utf8s->bv_val;
280 for (in = (unsigned char *)ucs->bv_val; in < end; ) {
281 u = *in++;
282 if (csize > 1) {
283 u <<= 8;
284 u |= *in++;
285 }
286 if (csize > 2) {
287 u <<= 8;
288 u |= *in++;
289 u <<= 8;
290 u |= *in++;
291 }
292 ptr += ldap_x_ucs4_to_utf8(u, ptr);
293 }
294 *ptr = '\0';
295 return LDAP_SUCCESS;
296 }
297
/*
 * Advance to the next UTF-8 character
 *
 * Ignores the length announced by the first octet and instead relies
 * on continuation markers to find the start of the next character.
 * This allows for "resyncing" when invalid characters are provided,
 * provided the start of the next character appears within the
 * 6 bytes examined.
 */
char* ldap_utf8_next( const char * p )
{
	const unsigned char *octets = (const unsigned char *) p;
	int skip;

	if( LDAP_UTF8_ISASCII(octets) ) {
		return (char *) &p[1];
	}

	/* step over continuation octets (10xxxxxx), at most five of them */
	skip = 1;
	while ( skip < 6 && ( octets[skip] & 0xc0 ) == 0x80 ) {
		skip++;
	}

	return (char *) &p[skip];
}
324
/*
 * Step back to the previous UTF-8 character
 *
 * Relies on continuation markers alone: walks backwards from p over
 * octets of the form 10xxxxxx and returns the first octet that is not
 * one.  At most five octets are examined; when all of them are
 * continuation octets, p - 6 is returned.
 *
 * The caller must make sure the octets before p are readable.
 */
char* ldap_utf8_prev( const char * p )
{
	const unsigned char *octets = (const unsigned char *) p;
	int back = 1;

	while ( back < 6 && ( *( octets - back ) & 0xc0 ) == 0x80 ) {
		back++;
	}

	return (char *) ( p - back );
}
347
/*
 * Copy one UTF-8 character from src to dst returning
 * number of bytes copied.
 *
 * Ignores the length announced by the first octet and instead relies
 * on continuation markers to find the end of the character: the first
 * octet is always copied, followed by up to five continuation octets.
 * No terminating NUL is written to dst.
 */
int ldap_utf8_copy( char* dst, const char *src )
{
	const unsigned char *octets = (const unsigned char *) src;
	int n;

	dst[0] = src[0];

	if( LDAP_UTF8_ISASCII(octets) ) {
		return 1;
	}

	/* copy continuation octets (10xxxxxx) until one is missing */
	n = 1;
	while ( n < 6 && ( octets[n] & 0xc0 ) == 0x80 ) {
		dst[n] = src[n];
		n++;
	}

	return n;
}
378
379 #ifndef UTF8_ALPHA_CTYPE
380 /*
381 * UTF-8 ctype routines
382 * Only deals with characters < 0x80 (ie: US-ASCII)
383 */
384
/* Result of LDAP_ASCII() applied to the octet at p. */
int ldap_utf8_isascii( const char * p )
{
	const unsigned char *octet = (const unsigned char *) p;
	unsigned value = *octet;

	return LDAP_ASCII(value);
}
390
/*
 * Result of LDAP_DIGIT() for the octet at p, or 0 when that octet
 * does not pass LDAP_ASCII().
 */
int ldap_utf8_isdigit( const char * p )
{
	unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet) ? LDAP_DIGIT( octet ) : 0;
}
399
/*
 * Result of LDAP_HEX() for the octet at p, or 0 when that octet
 * does not pass LDAP_ASCII().
 */
int ldap_utf8_isxdigit( const char * p )
{
	unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet) ? LDAP_HEX(octet) : 0;
}
408
/*
 * Return 1 when the octet at p is one of space, tab, newline,
 * carriage return, vertical tab or form feed; 0 otherwise.
 */
int ldap_utf8_isspace( const char * p )
{
	unsigned octet = * (const unsigned char *) p;

	if ( !LDAP_ASCII(octet) ) {
		return 0;
	}

	return ( octet == ' '  || octet == '\t' || octet == '\n' ||
		octet == '\r' || octet == '\v' || octet == '\f' ) ? 1 : 0;
}
427
428 /*
429 * These are not needed by the C SDK and are
430 * not "good enough" for general use.
431 */
/*
 * Result of LDAP_ALPHA() for the octet at p, or 0 when that octet
 * does not pass LDAP_ASCII().
 */
int ldap_utf8_isalpha( const char * p )
{
	unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet) ? LDAP_ALPHA(octet) : 0;
}
440
/*
 * Result of LDAP_ALNUM() for the octet at p, or 0 when that octet
 * does not pass LDAP_ASCII().
 */
int ldap_utf8_isalnum( const char * p )
{
	unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet) ? LDAP_ALNUM(octet) : 0;
}
449
/*
 * Result of LDAP_LOWER() for the octet at p, or 0 when that octet
 * does not pass LDAP_ASCII().
 */
int ldap_utf8_islower( const char * p )
{
	unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet) ? LDAP_LOWER(octet) : 0;
}
458
/*
 * Result of LDAP_UPPER() for the octet at p, or 0 when that octet
 * does not pass LDAP_ASCII().
 */
int ldap_utf8_isupper( const char * p )
{
	unsigned octet = * (const unsigned char *) p;

	return LDAP_ASCII(octet) ? LDAP_UPPER(octet) : 0;
}
467 #endif
468
469
470 /*
471 * UTF-8 string routines
472 */
473
474 /* like strchr() */
475 char * (ldap_utf8_strchr)( const char *str, const char *chr )
476 {
477 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
478 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( chr ) ) {
479 return (char *) str;
480 }
481 }
482
483 return NULL;
484 }
485
486 /* like strcspn() but returns number of bytes, not characters */
ber_len_t(ldap_utf8_strcspn)487 ber_len_t (ldap_utf8_strcspn)( const char *str, const char *set )
488 {
489 const char *cstr;
490 const char *cset;
491
492 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
493 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
494 if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
495 return cstr - str;
496 }
497 }
498 }
499
500 return cstr - str;
501 }
502
503 /* like strspn() but returns number of bytes, not characters */
ber_len_t(ldap_utf8_strspn)504 ber_len_t (ldap_utf8_strspn)( const char *str, const char *set )
505 {
506 const char *cstr;
507 const char *cset;
508
509 for( cstr = str; *cstr != '\0'; LDAP_UTF8_INCR(cstr) ) {
510 for( cset = set; ; LDAP_UTF8_INCR(cset) ) {
511 if( *cset == '\0' ) {
512 return cstr - str;
513 }
514
515 if( ldap_x_utf8_to_ucs4( cstr ) == ldap_x_utf8_to_ucs4( cset ) ) {
516 break;
517 }
518 }
519 }
520
521 return cstr - str;
522 }
523
524 /* like strpbrk(), replaces strchr() as well */
525 char *(ldap_utf8_strpbrk)( const char *str, const char *set )
526 {
527 for( ; *str != '\0'; LDAP_UTF8_INCR(str) ) {
528 const char *cset;
529
530 for( cset = set; *cset != '\0'; LDAP_UTF8_INCR(cset) ) {
531 if( ldap_x_utf8_to_ucs4( str ) == ldap_x_utf8_to_ucs4( cset ) ) {
532 return (char *) str;
533 }
534 }
535 }
536
537 return NULL;
538 }
539
540 /* like strtok_r(), not strtok() */
541 char *(ldap_utf8_strtok)(char *str, const char *sep, char **last)
542 {
543 char *begin;
544 char *end;
545
546 if( last == NULL ) return NULL;
547
548 begin = str ? str : *last;
549
550 begin += ldap_utf8_strspn( begin, sep );
551
552 if( *begin == '\0' ) {
553 *last = NULL;
554 return NULL;
555 }
556
557 end = &begin[ ldap_utf8_strcspn( begin, sep ) ];
558
559 if( *end != '\0' ) {
560 char *next = LDAP_UTF8_NEXT( end );
561 *end = '\0';
562 end = next;
563 }
564
565 *last = end;
566 return begin;
567 }
568