xref: /netbsd-src/external/ibm-public/postfix/dist/src/util/casefold.c (revision bdc22b2e01993381dcefeff2bc9b56ca75a4235c)
1 /*	$NetBSD: casefold.c,v 1.2 2017/02/14 01:16:49 christos Exp $	*/
2 
3 /*++
4 /* NAME
5 /*	casefold 3
6 /* SUMMARY
7 /*	casefold text for caseless comparison
8 /* SYNOPSIS
9 /*	#include <stringops.h>
10 /*
11 /*	char	*casefold(
12 /*	VSTRING *dst,
13 /*	const char *src)
14 /*
15 /*	char	*casefold_append(
16 /*	VSTRING *dst,
17 /*	const char *src)
18 /*
19 /*	char	*casefold_len(
20 /*	VSTRING *dst,
21 /*	const char *src,
22 /*	ssize_t	src_len)
23 /* AUXILIARY FUNCTIONS
24 /*	char	*casefoldx(
25 /*	int	flags,
26 /*	VSTRING *dst,
27 /*	const char *src,
28 /*	ssize_t	src_len)
29 /* DESCRIPTION
30 /*	casefold() converts text to a form that is suitable for
31 /*	caseless comparison, rather than presentation to humans.
32 /*
33 /*	When compiled without EAI support or util_utf8_enable is
34 /*	zero, casefold() implements ASCII case folding, leaving
35 /*	non-ASCII byte values unchanged.
36 /*
37 /*	When compiled with EAI support and util_utf8_enable is
38 /*	non-zero, casefold() implements UTF-8 case folding using
39 /*	the en_US locale, as recommended when the conversion result
40 /*	is not meant to be presented to humans.
41 /*
42 /*	casefold_len() implements casefold() with a source length
43 /*	argument.
44 /*
45 /*	casefold_append() implements casefold() without overwriting
46 /*	the result.
47 /*
48 /*	casefoldx() implements a more complex API that implements
49 /*	all of the above and more.
50 /*
51 /*	Arguments:
52 /* .IP src
53 /*	Null-terminated input string.
54 /* .IP dest
55 /*	Output buffer, null-terminated. Specify a null pointer to
56 /*	use an internal buffer that is overwritten upon each call.
57 /* .IP src_len
58 /*	The string length, -1 to determine the length dynamically.
59 /* .IP flags
60 /*	Bitwise OR of zero or more of the following:
61 /* .RS
62 /* .IP CASEF_FLAG_UTF8
63 /*	Enable UTF-8 support. This flag has no effect when compiled
64 /*	without EAI support.
65 /* .IP CASEF_FLAG_APPEND
66 /*	Append the result to the buffer, instead of overwriting it.
67 /* DIAGNOSTICS
68 /*	All errors are fatal. There appear to be no input-dependent
69 /*	errors.
70 /*
71 /*	With the ICU 4.8 library, there is no casefold error for
72 /*	UTF-8 code points U+0000..U+10FFFF (including surrogate
73 /*	range), not even when running inside an empty chroot jail.
74 /*	Nor does malformed UTF-8 trigger errors; non-UTF-8 bytes
75 /*	are copied verbatim. Based on ICU 4.8 source-code review
76 /*	and experimentation(!) we conclude that UTF-8 casefolding
77 /*	has no data-dependent error cases, and that it is safe to
78 /*	treat all casefolding errors as fatal runtime errors.
79 /* LICENSE
80 /* .ad
81 /* .fi
82 /*	The Secure Mailer license must be distributed with this software.
83 /* AUTHOR(S)
84 /*	Wietse Venema
85 /*	IBM T.J. Watson Research
86 /*	P.O. Box 704
87 /*	Yorktown Heights, NY 10598, USA
88 /*--*/
89 
90 /* System library. */
91 
92 #include <sys_defs.h>
93 #include <string.h>
94 #include <ctype.h>
95 #ifndef NO_EAI
96 #include <unicode/ucasemap.h>
97 #include <unicode/ustring.h>
98 #include <unicode/uchar.h>
99 #endif
100 
101 /* Utility library. */
102 
103 #include <msg.h>
104 #include <stringops.h>
105 
106 #define STR(x) vstring_str(x)
107 #define LEN(x) VSTRING_LEN(x)
108 
109 /* casefoldx - casefold an UTF-8 string */
110 
111 char   *casefoldx(int flags, VSTRING *dest, const char *src, ssize_t len)
112 {
113     size_t  old_len;
114 
115 #ifdef NO_EAI
116 
117     /*
118      * ASCII mode only.
119      */
120     if (len < 0)
121 	len = strlen(src);
122     if ((flags & CASEF_FLAG_APPEND) == 0)
123 	VSTRING_RESET(dest);
124     old_len = VSTRING_LEN(dest);
125     vstring_strncat(dest, src, len);
126     lowercase(STR(dest) + old_len);
127     return (STR(dest));
128 #else
129 
130     /*
131      * Unicode mode.
132      */
133     const char myname[] = "casefold";
134     static VSTRING *fold_buf = 0;
135     static UCaseMap *csm = 0;
136     UErrorCode error;
137     ssize_t space_needed;
138     int     n;
139 
140     /*
141      * Handle special cases.
142      */
143     if (len < 0)
144 	len = strlen(src);
145     if (dest == 0)
146 	dest = (fold_buf != 0 ? fold_buf : (fold_buf = vstring_alloc(100)));
147     if ((flags & CASEF_FLAG_APPEND) == 0)
148 	VSTRING_RESET(dest);
149     old_len = VSTRING_LEN(dest);
150 
151     /*
152      * All-ASCII input, or ASCII mode only.
153      */
154     if ((flags & CASEF_FLAG_UTF8) == 0 || allascii(src)) {
155 	vstring_strncat(dest, src, len);
156 	lowercase(STR(dest) + old_len);
157 	return (STR(dest));
158     }
159 
160     /*
161      * ICU 4.8 ucasemap_utf8FoldCase() does not complain about UTF-8 syntax
162      * errors. XXX Based on source-code review we conclude that non-UTF-8
163      * bytes are copied verbatim, and experiments confirm this. Given that
164      * this behavior is intentional, we assume that it will stay that way.
165      */
166 #if 0
167     if (valid_utf8_string(src, len) == 0) {
168 	if (err)
169 	    *err = "malformed UTF-8 or invalid codepoint";
170 	return (0);
171     }
172 #endif
173 
174     /*
175      * One-time initialization. With ICU 4.8 this works while chrooted.
176      */
177     if (csm == 0) {
178 	error = U_ZERO_ERROR;
179 	csm = ucasemap_open("en_US", U_FOLD_CASE_DEFAULT, &error);
180 	if (U_SUCCESS(error) == 0)
181 	    msg_fatal("ucasemap_open error: %s", u_errorName(error));
182     }
183 
184     /*
185      * Fold the input, adjusting the buffer size if needed. Safety: don't
186      * loop forever.
187      *
188      * Note: the requested amount of space for casemapped output (as reported
189      * with space_needed below) does not include storage for the null
190      * terminator. The terminator is written only when the output buffer is
191      * large enough. This is why we overallocate space when the output does
192      * not fit. But if the output fits exactly, then the ouput will be
193      * unterminated, and we have to terminate the output ourselves.
194      */
195     for (n = 0; n < 3; n++) {
196 	error = U_ZERO_ERROR;
197 	space_needed = ucasemap_utf8FoldCase(csm, STR(dest) + old_len,
198 				     vstring_avail(dest), src, len, &error);
199 	if (U_SUCCESS(error)) {
200 	    VSTRING_AT_OFFSET(dest, old_len + space_needed);
201 	    if (vstring_avail(dest) == 0)	/* exact fit, no terminator */
202 		VSTRING_TERMINATE(dest);	/* add terminator */
203 	    break;
204 	} else if (error == U_BUFFER_OVERFLOW_ERROR) {
205 	    VSTRING_SPACE(dest, space_needed + 1);	/* for terminator */
206 	} else {
207 	    msg_fatal("%s: conversion error for \"%s\": %s",
208 		      myname, src, u_errorName(error));
209 	}
210     }
211     return (STR(dest));
212 #endif						/* NO_EAI */
213 }
214 
215 #ifdef TEST
216 
217 static void encode_utf8(VSTRING *buffer, int codepoint)
218 {
219     const char myname[] = "encode_utf8";
220 
221     VSTRING_RESET(buffer);
222     if (codepoint < 0x80) {
223 	VSTRING_ADDCH(buffer, codepoint);
224     } else if (codepoint < 0x800) {
225 	VSTRING_ADDCH(buffer, 0xc0 | (codepoint >> 6));
226 	VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f));
227     } else if (codepoint < 0x10000) {
228 	VSTRING_ADDCH(buffer, 0xe0 | (codepoint >> 12));
229 	VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 6) & 0x3f));
230 	VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f));
231     } else if (codepoint <= 0x10FFFF) {
232 	VSTRING_ADDCH(buffer, 0xf0 | (codepoint >> 18));
233 	VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 12) & 0x3f));
234 	VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 6) & 0x3f));
235 	VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f));
236     } else {
237 	msg_panic("%s: out-of-range codepoint U+%X", myname, codepoint);
238     }
239     VSTRING_TERMINATE(buffer);
240 }
241 
242 #include <stdlib.h>
243 #include <stdio.h>
244 #include <locale.h>
245 
246 #include <vstream.h>
247 #include <vstring_vstream.h>
248 #include <msg_vstream.h>
249 
250 int     main(int argc, char **argv)
251 {
252     VSTRING *buffer = vstring_alloc(1);
253     VSTRING *dest = vstring_alloc(1);
254     char   *bp;
255     char   *conv_res;
256     char   *cmd;
257     int     codepoint, first, last;
258     VSTREAM *fp;
259 
260     if (setlocale(LC_ALL, "C") == 0)
261 	msg_fatal("setlocale(LC_ALL, C) failed: %m");
262 
263     msg_vstream_init(argv[0], VSTREAM_ERR);
264 
265     util_utf8_enable = 1;
266 
267     VSTRING_SPACE(buffer, 256);			/* chroot/file pathname */
268 
269     while (vstring_fgets_nonl(buffer, VSTREAM_IN)) {
270 	bp = STR(buffer);
271 	vstream_printf("> %s\n", bp);
272 	cmd = mystrtok(&bp, CHARS_SPACE);
273 	if (cmd == 0 || *cmd == '#')
274 	    continue;
275 	while (ISSPACE(*bp))
276 	    bp++;
277 
278 	/*
279 	 * Null-terminated string.
280 	 */
281 	if (strcmp(cmd, "fold") == 0) {
282 	    conv_res = casefold(dest, bp);
283 	    vstream_printf("\"%s\" ->fold \"%s\"\n", bp, conv_res);
284 	}
285 
286 	/*
287 	 * Codepoint range.
288 	 */
289 	else if (strcmp(cmd, "range") == 0
290 		 && sscanf(bp, "%i %i", &first, &last) == 2
291 		 && first <= last) {
292 	    for (codepoint = first; codepoint <= last; codepoint++) {
293 		if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
294 		    vstream_printf("skipping surrogate range\n");
295 		    codepoint = 0xDFFF;
296 		} else {
297 		    encode_utf8(buffer, codepoint);
298 		    if (msg_verbose)
299 			vstream_printf("U+%X -> %s\n", codepoint, STR(buffer));
300 		    if (valid_utf8_string(STR(buffer), LEN(buffer)) == 0)
301 			msg_fatal("bad utf-8 encoding for U+%X\n", codepoint);
302 		    casefold(dest, STR(buffer));
303 		}
304 	    }
305 	    vstream_printf("range completed: 0x%x..0x%x\n", first, last);
306 	}
307 
308 	/*
309 	 * Chroot directory.
310 	 */
311 	else if (strcmp(cmd, "chroot") == 0
312 		 && sscanf(bp, "%255s", STR(buffer)) == 1) {
313 	    if (geteuid() == 0) {
314 		if (chdir(STR(buffer)) < 0)
315 		    msg_fatal("chdir(%s): %m", STR(buffer));
316 		if (chroot(STR(buffer)) < 0)
317 		    msg_fatal("chroot(%s): %m", STR(buffer));
318 		vstream_printf("chroot %s completed\n", STR(buffer));
319 	    }
320 	}
321 
322 	/*
323 	 * File.
324 	 */
325 	else if (strcmp(cmd, "file") == 0
326 		 && sscanf(bp, "%255s", STR(buffer)) == 1) {
327 	    if ((fp = vstream_fopen(STR(buffer), O_RDONLY, 0)) == 0)
328 		msg_fatal("open(%s): %m", STR(buffer));
329 	    while (vstring_fgets_nonl(buffer, fp))
330 		vstream_printf("%s\n", casefold(dest, STR(buffer)));
331 	    vstream_fclose(fp);
332 	}
333 
334 	/*
335 	 * Verbose.
336 	 */
337 	else if (strcmp(cmd, "verbose") == 0
338 		 && sscanf(bp, "%i", &msg_verbose) == 1) {
339 	     /* void */ ;
340 	}
341 
342 	/*
343 	 * Usage
344 	 */
345 	else {
346 	    vstream_printf("Usage: %s chroot <path> | file <path> | fold <text> | range <first> <last> | verbose <int>\n",
347 			   argv[0]);
348 	}
349 	vstream_fflush(VSTREAM_OUT);
350     }
351     vstring_free(buffer);
352     vstring_free(dest);
353     exit(0);
354 }
355 
356 #endif					/* TEST */
357