xref: /netbsd-src/external/ibm-public/postfix/dist/src/util/casefold.c (revision 33881f779a77dce6440bdc44610d94de75bebefe)
1 /*	$NetBSD: casefold.c,v 1.3 2020/03/18 19:05:21 christos Exp $	*/
2 
3 /*++
4 /* NAME
5 /*	casefold 3
6 /* SUMMARY
7 /*	casefold text for caseless comparison
8 /* SYNOPSIS
9 /*	#include <stringops.h>
10 /*
11 /*	char	*casefold(
12 /*	VSTRING *dst,
13 /*	const char *src)
14 /*
15 /*	char	*casefold_append(
16 /*	VSTRING *dst,
17 /*	const char *src)
18 /*
19 /*	char	*casefold_len(
20 /*	VSTRING *dst,
21 /*	const char *src,
22 /*	ssize_t	src_len)
23 /* AUXILIARY FUNCTIONS
24 /*	char	*casefoldx(
25 /*	int	flags,
26 /*	VSTRING *dst,
27 /*	const char *src,
28 /*	ssize_t	src_len)
29 /* DESCRIPTION
30 /*	casefold() converts text to a form that is suitable for
31 /*	caseless comparison, rather than presentation to humans.
32 /*
33 /*	When compiled without EAI support or util_utf8_enable is
34 /*	zero, casefold() implements ASCII case folding, leaving
35 /*	non-ASCII byte values unchanged.
36 /*
37 /*	When compiled with EAI support and util_utf8_enable is
38 /*	non-zero, casefold() implements UTF-8 case folding using
39 /*	the en_US locale, as recommended when the conversion result
40 /*	is not meant to be presented to humans.
41 /*
42 /*	casefold_len() implements casefold() with a source length
43 /*	argument.
44 /*
45 /*	casefold_append() implements casefold() without overwriting
46 /*	the result.
47 /*
48 /*	casefoldx() implements a more complex API that implements
49 /*	all of the above and more.
50 /*
51 /*	Arguments:
52 /* .IP src
53 /*	Null-terminated input string.
54 /* .IP dest
55 /*	Output buffer, null-terminated. Specify a null pointer to
56 /*	use an internal buffer that is overwritten upon each call.
57 /* .IP src_len
58 /*	The string length, -1 to determine the length dynamically.
59 /* .IP flags
60 /*	Bitwise OR of zero or more of the following:
61 /* .RS
62 /* .IP CASEF_FLAG_UTF8
63 /*	Enable UTF-8 support. This flag has no effect when compiled
64 /*	without EAI support.
65 /* .IP CASEF_FLAG_APPEND
66 /*	Append the result to the buffer, instead of overwriting it.
67 /* DIAGNOSTICS
68 /*	All errors are fatal. There appear to be no input-dependent
69 /*	errors.
70 /*
71 /*	With the ICU 4.8 library, there is no casefold error for
72 /*	UTF-8 code points U+0000..U+10FFFF (including surrogate
73 /*	range), not even when running inside an empty chroot jail.
74 /*	Nor does malformed UTF-8 trigger errors; non-UTF-8 bytes
75 /*	are copied verbatim. Based on ICU 4.8 source-code review
76 /*	and experimentation(!) we conclude that UTF-8 casefolding
77 /*	has no data-dependent error cases, and that it is safe to
78 /*	treat all casefolding errors as fatal runtime errors.
79 /* LICENSE
80 /* .ad
81 /* .fi
82 /*	The Secure Mailer license must be distributed with this software.
83 /* AUTHOR(S)
84 /*	Wietse Venema
85 /*	IBM T.J. Watson Research
86 /*	P.O. Box 704
87 /*	Yorktown Heights, NY 10598, USA
88 /*
89 /*	Wietse Venema
90 /*	Google, Inc.
91 /*	111 8th Avenue
92 /*	New York, NY 10011, USA
93 /*--*/
94 
95 /* System library. */
96 
97 #include <sys_defs.h>
98 #include <string.h>
99 #include <ctype.h>
100 #ifndef NO_EAI
101 #include <unicode/ucasemap.h>
102 #include <unicode/ustring.h>
103 #include <unicode/uchar.h>
104 #endif
105 
106 /* Utility library. */
107 
108 #include <msg.h>
109 #include <stringops.h>
110 
111 #define STR(x) vstring_str(x)
112 #define LEN(x) VSTRING_LEN(x)
113 
114 /* casefoldx - casefold an UTF-8 string */
115 
casefoldx(int flags,VSTRING * dest,const char * src,ssize_t len)116 char   *casefoldx(int flags, VSTRING *dest, const char *src, ssize_t len)
117 {
118     size_t  old_len;
119 
120 #ifdef NO_EAI
121 
122     /*
123      * ASCII mode only.
124      */
125     if (len < 0)
126 	len = strlen(src);
127     if ((flags & CASEF_FLAG_APPEND) == 0)
128 	VSTRING_RESET(dest);
129     old_len = VSTRING_LEN(dest);
130     vstring_strncat(dest, src, len);
131     lowercase(STR(dest) + old_len);
132     return (STR(dest));
133 #else
134 
135     /*
136      * Unicode mode.
137      */
138     const char myname[] = "casefold";
139     static VSTRING *fold_buf = 0;
140     static UCaseMap *csm = 0;
141     UErrorCode error;
142     ssize_t space_needed;
143     int     n;
144 
145     /*
146      * Handle special cases.
147      */
148     if (len < 0)
149 	len = strlen(src);
150     if (dest == 0)
151 	dest = (fold_buf != 0 ? fold_buf : (fold_buf = vstring_alloc(100)));
152     if ((flags & CASEF_FLAG_APPEND) == 0)
153 	VSTRING_RESET(dest);
154     old_len = VSTRING_LEN(dest);
155 
156     /*
157      * All-ASCII input, or ASCII mode only.
158      */
159     if ((flags & CASEF_FLAG_UTF8) == 0 || allascii(src)) {
160 	vstring_strncat(dest, src, len);
161 	lowercase(STR(dest) + old_len);
162 	return (STR(dest));
163     }
164 
165     /*
166      * ICU 4.8 ucasemap_utf8FoldCase() does not complain about UTF-8 syntax
167      * errors. XXX Based on source-code review we conclude that non-UTF-8
168      * bytes are copied verbatim, and experiments confirm this. Given that
169      * this behavior is intentional, we assume that it will stay that way.
170      */
171 #if 0
172     if (valid_utf8_string(src, len) == 0) {
173 	if (err)
174 	    *err = "malformed UTF-8 or invalid codepoint";
175 	return (0);
176     }
177 #endif
178 
179     /*
180      * One-time initialization. With ICU 4.8 this works while chrooted.
181      */
182     if (csm == 0) {
183 	error = U_ZERO_ERROR;
184 	csm = ucasemap_open("en_US", U_FOLD_CASE_DEFAULT, &error);
185 	if (U_SUCCESS(error) == 0)
186 	    msg_fatal("ucasemap_open error: %s", u_errorName(error));
187     }
188 
189     /*
190      * Fold the input, adjusting the buffer size if needed. Safety: don't
191      * loop forever.
192      *
193      * Note: the requested amount of space for casemapped output (as reported
194      * with space_needed below) does not include storage for the null
195      * terminator. The terminator is written only when the output buffer is
196      * large enough. This is why we overallocate space when the output does
197      * not fit. But if the output fits exactly, then the output will be
198      * unterminated, and we have to terminate the output ourselves.
199      */
200     for (n = 0; n < 3; n++) {
201 	error = U_ZERO_ERROR;
202 	space_needed = ucasemap_utf8FoldCase(csm, STR(dest) + old_len,
203 				     vstring_avail(dest), src, len, &error);
204 	if (U_SUCCESS(error)) {
205 	    vstring_set_payload_size(dest, old_len + space_needed);
206 	    if (vstring_avail(dest) == 0)	/* exact fit, no terminator */
207 		VSTRING_TERMINATE(dest);	/* add terminator */
208 	    break;
209 	} else if (error == U_BUFFER_OVERFLOW_ERROR) {
210 	    VSTRING_SPACE(dest, space_needed + 1);	/* for terminator */
211 	} else {
212 	    msg_fatal("%s: conversion error for \"%s\": %s",
213 		      myname, src, u_errorName(error));
214 	}
215     }
216     return (STR(dest));
217 #endif						/* NO_EAI */
218 }
219 
220 #ifdef TEST
221 
/* encode_utf8 - replace buffer content with the UTF-8 form of a codepoint */

static void encode_utf8(VSTRING *buffer, int codepoint)
{
    const char myname[] = "encode_utf8";
    unsigned char seq[4];		/* encoded bytes, lead byte first */
    int     seq_len = 0;		/* number of bytes used in seq[] */
    int     i;

    VSTRING_RESET(buffer);

    /*
     * Pick the sequence length from the codepoint magnitude, then fill in
     * the lead byte and the 6-bit continuation bytes. Values above U+10FFFF
     * cannot be encoded and are a caller error.
     */
    if (codepoint < 0x80) {
	seq[0] = codepoint;
	seq_len = 1;
    } else if (codepoint < 0x800) {
	seq[0] = 0xc0 | (codepoint >> 6);
	seq[1] = 0x80 | (codepoint & 0x3f);
	seq_len = 2;
    } else if (codepoint < 0x10000) {
	seq[0] = 0xe0 | (codepoint >> 12);
	seq[1] = 0x80 | ((codepoint >> 6) & 0x3f);
	seq[2] = 0x80 | (codepoint & 0x3f);
	seq_len = 3;
    } else if (codepoint <= 0x10FFFF) {
	seq[0] = 0xf0 | (codepoint >> 18);
	seq[1] = 0x80 | ((codepoint >> 12) & 0x3f);
	seq[2] = 0x80 | ((codepoint >> 6) & 0x3f);
	seq[3] = 0x80 | (codepoint & 0x3f);
	seq_len = 4;
    } else {
	msg_panic("%s: out-of-range codepoint U+%X", myname, codepoint);
    }

    /*
     * Copy the encoded bytes to the output and null-terminate the result.
     */
    for (i = 0; i < seq_len; i++)
	VSTRING_ADDCH(buffer, seq[i]);
    VSTRING_TERMINATE(buffer);
}
246 
247 #include <stdlib.h>
248 #include <stdio.h>
249 #include <locale.h>
250 
251 #include <vstream.h>
252 #include <vstring_vstream.h>
253 #include <msg_vstream.h>
254 
main(int argc,char ** argv)255 int     main(int argc, char **argv)
256 {
257     VSTRING *buffer = vstring_alloc(1);
258     VSTRING *dest = vstring_alloc(1);
259     char   *bp;
260     char   *conv_res;
261     char   *cmd;
262     int     codepoint, first, last;
263     VSTREAM *fp;
264 
265     if (setlocale(LC_ALL, "C") == 0)
266 	msg_fatal("setlocale(LC_ALL, C) failed: %m");
267 
268     msg_vstream_init(argv[0], VSTREAM_ERR);
269 
270     util_utf8_enable = 1;
271 
272     VSTRING_SPACE(buffer, 256);			/* chroot/file pathname */
273 
274     while (vstring_fgets_nonl(buffer, VSTREAM_IN)) {
275 	bp = STR(buffer);
276 	vstream_printf("> %s\n", bp);
277 	cmd = mystrtok(&bp, CHARS_SPACE);
278 	if (cmd == 0 || *cmd == '#')
279 	    continue;
280 	while (ISSPACE(*bp))
281 	    bp++;
282 
283 	/*
284 	 * Null-terminated string.
285 	 */
286 	if (strcmp(cmd, "fold") == 0) {
287 	    conv_res = casefold(dest, bp);
288 	    vstream_printf("\"%s\" ->fold \"%s\"\n", bp, conv_res);
289 	}
290 
291 	/*
292 	 * Codepoint range.
293 	 */
294 	else if (strcmp(cmd, "range") == 0
295 		 && sscanf(bp, "%i %i", &first, &last) == 2
296 		 && first <= last) {
297 	    for (codepoint = first; codepoint <= last; codepoint++) {
298 		if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
299 		    vstream_printf("skipping surrogate range\n");
300 		    codepoint = 0xDFFF;
301 		} else {
302 		    encode_utf8(buffer, codepoint);
303 		    if (msg_verbose)
304 			vstream_printf("U+%X -> %s\n", codepoint, STR(buffer));
305 		    if (valid_utf8_string(STR(buffer), LEN(buffer)) == 0)
306 			msg_fatal("bad utf-8 encoding for U+%X\n", codepoint);
307 		    casefold(dest, STR(buffer));
308 		}
309 	    }
310 	    vstream_printf("range completed: 0x%x..0x%x\n", first, last);
311 	}
312 
313 	/*
314 	 * Chroot directory.
315 	 */
316 	else if (strcmp(cmd, "chroot") == 0
317 		 && sscanf(bp, "%255s", STR(buffer)) == 1) {
318 	    if (geteuid() == 0) {
319 		if (chdir(STR(buffer)) < 0)
320 		    msg_fatal("chdir(%s): %m", STR(buffer));
321 		if (chroot(STR(buffer)) < 0)
322 		    msg_fatal("chroot(%s): %m", STR(buffer));
323 		vstream_printf("chroot %s completed\n", STR(buffer));
324 	    }
325 	}
326 
327 	/*
328 	 * File.
329 	 */
330 	else if (strcmp(cmd, "file") == 0
331 		 && sscanf(bp, "%255s", STR(buffer)) == 1) {
332 	    if ((fp = vstream_fopen(STR(buffer), O_RDONLY, 0)) == 0)
333 		msg_fatal("open(%s): %m", STR(buffer));
334 	    while (vstring_fgets_nonl(buffer, fp))
335 		vstream_printf("%s\n", casefold(dest, STR(buffer)));
336 	    vstream_fclose(fp);
337 	}
338 
339 	/*
340 	 * Verbose.
341 	 */
342 	else if (strcmp(cmd, "verbose") == 0
343 		 && sscanf(bp, "%i", &msg_verbose) == 1) {
344 	     /* void */ ;
345 	}
346 
347 	/*
348 	 * Usage
349 	 */
350 	else {
351 	    vstream_printf("Usage: %s chroot <path> | file <path> | fold <text> | range <first> <last> | verbose <int>\n",
352 			   argv[0]);
353 	}
354 	vstream_fflush(VSTREAM_OUT);
355     }
356     vstring_free(buffer);
357     vstring_free(dest);
358     exit(0);
359 }
360 
361 #endif					/* TEST */
362