1 /* $NetBSD: casefold.c,v 1.3 2020/03/18 19:05:21 christos Exp $ */
2
3 /*++
4 /* NAME
5 /* casefold 3
6 /* SUMMARY
7 /* casefold text for caseless comparison
8 /* SYNOPSIS
9 /* #include <stringops.h>
10 /*
11 /* char *casefold(
12 /* VSTRING *dst,
13 /* const char *src)
14 /*
15 /* char *casefold_append(
16 /* VSTRING *dst,
17 /* const char *src)
18 /*
19 /* char *casefold_len(
20 /* VSTRING *dst,
21 /* const char *src,
22 /* ssize_t src_len)
23 /* AUXILIARY FUNCTIONS
24 /* char *casefoldx(
25 /* int flags,
26 /* VSTRING *dst,
27 /* const char *src,
28 /* ssize_t src_len)
29 /* DESCRIPTION
30 /* casefold() converts text to a form that is suitable for
31 /* caseless comparison, rather than presentation to humans.
32 /*
33 /* When compiled without EAI support or util_utf8_enable is
34 /* zero, casefold() implements ASCII case folding, leaving
35 /* non-ASCII byte values unchanged.
36 /*
37 /* When compiled with EAI support and util_utf8_enable is
38 /* non-zero, casefold() implements UTF-8 case folding using
39 /* the en_US locale, as recommended when the conversion result
40 /* is not meant to be presented to humans.
41 /*
42 /* casefold_len() implements casefold() with a source length
43 /* argument.
44 /*
45 /* casefold_append() implements casefold() without overwriting
46 /* the result.
47 /*
48 /* casefoldx() implements a more complex API that implements
49 /* all of the above and more.
50 /*
51 /* Arguments:
52 /* .IP src
53 /* Null-terminated input string.
54 /* .IP dest
55 /* Output buffer, null-terminated. Specify a null pointer to
56 /* use an internal buffer that is overwritten upon each call.
57 /* .IP src_len
58 /* The string length, -1 to determine the length dynamically.
59 /* .IP flags
60 /* Bitwise OR of zero or more of the following:
61 /* .RS
62 /* .IP CASEF_FLAG_UTF8
63 /* Enable UTF-8 support. This flag has no effect when compiled
64 /* without EAI support.
65 /* .IP CASEF_FLAG_APPEND
66 /* Append the result to the buffer, instead of overwriting it.
67 /* DIAGNOSTICS
68 /* All errors are fatal. There appear to be no input-dependent
69 /* errors.
70 /*
71 /* With the ICU 4.8 library, there is no casefold error for
72 /* UTF-8 code points U+0000..U+10FFFF (including surrogate
73 /* range), not even when running inside an empty chroot jail.
74 /* Nor does malformed UTF-8 trigger errors; non-UTF-8 bytes
75 /* are copied verbatim. Based on ICU 4.8 source-code review
76 /* and experimentation(!) we conclude that UTF-8 casefolding
77 /* has no data-dependent error cases, and that it is safe to
78 /* treat all casefolding errors as fatal runtime errors.
79 /* LICENSE
80 /* .ad
81 /* .fi
82 /* The Secure Mailer license must be distributed with this software.
83 /* AUTHOR(S)
84 /* Wietse Venema
85 /* IBM T.J. Watson Research
86 /* P.O. Box 704
87 /* Yorktown Heights, NY 10598, USA
88 /*
89 /* Wietse Venema
90 /* Google, Inc.
91 /* 111 8th Avenue
92 /* New York, NY 10011, USA
93 /*--*/
94
95 /* System library. */
96
97 #include <sys_defs.h>
98 #include <string.h>
99 #include <ctype.h>
100 #ifndef NO_EAI
101 #include <unicode/ucasemap.h>
102 #include <unicode/ustring.h>
103 #include <unicode/uchar.h>
104 #endif
105
106 /* Utility library. */
107
108 #include <msg.h>
109 #include <stringops.h>
110
111 #define STR(x) vstring_str(x)
112 #define LEN(x) VSTRING_LEN(x)
113
114 /* casefoldx - casefold an UTF-8 string */
115
casefoldx(int flags,VSTRING * dest,const char * src,ssize_t len)116 char *casefoldx(int flags, VSTRING *dest, const char *src, ssize_t len)
117 {
118 size_t old_len;
119
120 #ifdef NO_EAI
121
122 /*
123 * ASCII mode only.
124 */
125 if (len < 0)
126 len = strlen(src);
127 if ((flags & CASEF_FLAG_APPEND) == 0)
128 VSTRING_RESET(dest);
129 old_len = VSTRING_LEN(dest);
130 vstring_strncat(dest, src, len);
131 lowercase(STR(dest) + old_len);
132 return (STR(dest));
133 #else
134
135 /*
136 * Unicode mode.
137 */
138 const char myname[] = "casefold";
139 static VSTRING *fold_buf = 0;
140 static UCaseMap *csm = 0;
141 UErrorCode error;
142 ssize_t space_needed;
143 int n;
144
145 /*
146 * Handle special cases.
147 */
148 if (len < 0)
149 len = strlen(src);
150 if (dest == 0)
151 dest = (fold_buf != 0 ? fold_buf : (fold_buf = vstring_alloc(100)));
152 if ((flags & CASEF_FLAG_APPEND) == 0)
153 VSTRING_RESET(dest);
154 old_len = VSTRING_LEN(dest);
155
156 /*
157 * All-ASCII input, or ASCII mode only.
158 */
159 if ((flags & CASEF_FLAG_UTF8) == 0 || allascii(src)) {
160 vstring_strncat(dest, src, len);
161 lowercase(STR(dest) + old_len);
162 return (STR(dest));
163 }
164
165 /*
166 * ICU 4.8 ucasemap_utf8FoldCase() does not complain about UTF-8 syntax
167 * errors. XXX Based on source-code review we conclude that non-UTF-8
168 * bytes are copied verbatim, and experiments confirm this. Given that
169 * this behavior is intentional, we assume that it will stay that way.
170 */
171 #if 0
172 if (valid_utf8_string(src, len) == 0) {
173 if (err)
174 *err = "malformed UTF-8 or invalid codepoint";
175 return (0);
176 }
177 #endif
178
179 /*
180 * One-time initialization. With ICU 4.8 this works while chrooted.
181 */
182 if (csm == 0) {
183 error = U_ZERO_ERROR;
184 csm = ucasemap_open("en_US", U_FOLD_CASE_DEFAULT, &error);
185 if (U_SUCCESS(error) == 0)
186 msg_fatal("ucasemap_open error: %s", u_errorName(error));
187 }
188
189 /*
190 * Fold the input, adjusting the buffer size if needed. Safety: don't
191 * loop forever.
192 *
193 * Note: the requested amount of space for casemapped output (as reported
194 * with space_needed below) does not include storage for the null
195 * terminator. The terminator is written only when the output buffer is
196 * large enough. This is why we overallocate space when the output does
197 * not fit. But if the output fits exactly, then the output will be
198 * unterminated, and we have to terminate the output ourselves.
199 */
200 for (n = 0; n < 3; n++) {
201 error = U_ZERO_ERROR;
202 space_needed = ucasemap_utf8FoldCase(csm, STR(dest) + old_len,
203 vstring_avail(dest), src, len, &error);
204 if (U_SUCCESS(error)) {
205 vstring_set_payload_size(dest, old_len + space_needed);
206 if (vstring_avail(dest) == 0) /* exact fit, no terminator */
207 VSTRING_TERMINATE(dest); /* add terminator */
208 break;
209 } else if (error == U_BUFFER_OVERFLOW_ERROR) {
210 VSTRING_SPACE(dest, space_needed + 1); /* for terminator */
211 } else {
212 msg_fatal("%s: conversion error for \"%s\": %s",
213 myname, src, u_errorName(error));
214 }
215 }
216 return (STR(dest));
217 #endif /* NO_EAI */
218 }
219
220 #ifdef TEST
221
/* encode_utf8 - overwrite buffer with the UTF-8 encoding of one code point */

/*
 * The buffer is reset first, then receives one to four bytes plus a null
 * terminator. Values outside 0..0x10FFFF are a caller error and result in a
 * panic. Surrogate code points are not rejected here; the caller in this
 * file skips that range itself.
 */
static void encode_utf8(VSTRING *buffer, int codepoint)
{
    const char myname[] = "encode_utf8";

    /*
     * Reject out-of-range values up front. A negative value would otherwise
     * satisfy the one-byte test below and be stored as a single byte.
     */
    if (codepoint < 0 || codepoint > 0x10FFFF)
        msg_panic("%s: out-of-range codepoint U+%X",
                  myname, (unsigned) codepoint);

    VSTRING_RESET(buffer);
    if (codepoint < 0x80) {
        /* One byte: 0xxxxxxx. */
        VSTRING_ADDCH(buffer, codepoint);
    } else if (codepoint < 0x800) {
        /* Two bytes: 110xxxxx 10xxxxxx. */
        VSTRING_ADDCH(buffer, 0xc0 | (codepoint >> 6));
        VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f));
    } else if (codepoint < 0x10000) {
        /* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx. */
        VSTRING_ADDCH(buffer, 0xe0 | (codepoint >> 12));
        VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 6) & 0x3f));
        VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f));
    } else {
        /* Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx. */
        VSTRING_ADDCH(buffer, 0xf0 | (codepoint >> 18));
        VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 12) & 0x3f));
        VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 6) & 0x3f));
        VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f));
    }
    VSTRING_TERMINATE(buffer);
}
246
247 #include <stdlib.h>
248 #include <stdio.h>
249 #include <locale.h>
250
251 #include <vstream.h>
252 #include <vstring_vstream.h>
253 #include <msg_vstream.h>
254
main(int argc,char ** argv)255 int main(int argc, char **argv)
256 {
257 VSTRING *buffer = vstring_alloc(1);
258 VSTRING *dest = vstring_alloc(1);
259 char *bp;
260 char *conv_res;
261 char *cmd;
262 int codepoint, first, last;
263 VSTREAM *fp;
264
265 if (setlocale(LC_ALL, "C") == 0)
266 msg_fatal("setlocale(LC_ALL, C) failed: %m");
267
268 msg_vstream_init(argv[0], VSTREAM_ERR);
269
270 util_utf8_enable = 1;
271
272 VSTRING_SPACE(buffer, 256); /* chroot/file pathname */
273
274 while (vstring_fgets_nonl(buffer, VSTREAM_IN)) {
275 bp = STR(buffer);
276 vstream_printf("> %s\n", bp);
277 cmd = mystrtok(&bp, CHARS_SPACE);
278 if (cmd == 0 || *cmd == '#')
279 continue;
280 while (ISSPACE(*bp))
281 bp++;
282
283 /*
284 * Null-terminated string.
285 */
286 if (strcmp(cmd, "fold") == 0) {
287 conv_res = casefold(dest, bp);
288 vstream_printf("\"%s\" ->fold \"%s\"\n", bp, conv_res);
289 }
290
291 /*
292 * Codepoint range.
293 */
294 else if (strcmp(cmd, "range") == 0
295 && sscanf(bp, "%i %i", &first, &last) == 2
296 && first <= last) {
297 for (codepoint = first; codepoint <= last; codepoint++) {
298 if (codepoint >= 0xD800 && codepoint <= 0xDFFF) {
299 vstream_printf("skipping surrogate range\n");
300 codepoint = 0xDFFF;
301 } else {
302 encode_utf8(buffer, codepoint);
303 if (msg_verbose)
304 vstream_printf("U+%X -> %s\n", codepoint, STR(buffer));
305 if (valid_utf8_string(STR(buffer), LEN(buffer)) == 0)
306 msg_fatal("bad utf-8 encoding for U+%X\n", codepoint);
307 casefold(dest, STR(buffer));
308 }
309 }
310 vstream_printf("range completed: 0x%x..0x%x\n", first, last);
311 }
312
313 /*
314 * Chroot directory.
315 */
316 else if (strcmp(cmd, "chroot") == 0
317 && sscanf(bp, "%255s", STR(buffer)) == 1) {
318 if (geteuid() == 0) {
319 if (chdir(STR(buffer)) < 0)
320 msg_fatal("chdir(%s): %m", STR(buffer));
321 if (chroot(STR(buffer)) < 0)
322 msg_fatal("chroot(%s): %m", STR(buffer));
323 vstream_printf("chroot %s completed\n", STR(buffer));
324 }
325 }
326
327 /*
328 * File.
329 */
330 else if (strcmp(cmd, "file") == 0
331 && sscanf(bp, "%255s", STR(buffer)) == 1) {
332 if ((fp = vstream_fopen(STR(buffer), O_RDONLY, 0)) == 0)
333 msg_fatal("open(%s): %m", STR(buffer));
334 while (vstring_fgets_nonl(buffer, fp))
335 vstream_printf("%s\n", casefold(dest, STR(buffer)));
336 vstream_fclose(fp);
337 }
338
339 /*
340 * Verbose.
341 */
342 else if (strcmp(cmd, "verbose") == 0
343 && sscanf(bp, "%i", &msg_verbose) == 1) {
344 /* void */ ;
345 }
346
347 /*
348 * Usage
349 */
350 else {
351 vstream_printf("Usage: %s chroot <path> | file <path> | fold <text> | range <first> <last> | verbose <int>\n",
352 argv[0]);
353 }
354 vstream_fflush(VSTREAM_OUT);
355 }
356 vstring_free(buffer);
357 vstring_free(dest);
358 exit(0);
359 }
360
361 #endif /* TEST */
362