1 /* $NetBSD: casefold.c,v 1.3 2020/03/18 19:05:21 christos Exp $ */ 2 3 /*++ 4 /* NAME 5 /* casefold 3 6 /* SUMMARY 7 /* casefold text for caseless comparison 8 /* SYNOPSIS 9 /* #include <stringops.h> 10 /* 11 /* char *casefold( 12 /* VSTRING *dst, 13 /* const char *src) 14 /* 15 /* char *casefold_append( 16 /* VSTRING *dst, 17 /* const char *src) 18 /* 19 /* char *casefold_len( 20 /* VSTRING *dst, 21 /* const char *src, 22 /* ssize_t src_len) 23 /* AUXILIARY FUNCTIONS 24 /* char *casefoldx( 25 /* int flags, 26 /* VSTRING *dst, 27 /* const char *src, 28 /* ssize_t src_len) 29 /* DESCRIPTION 30 /* casefold() converts text to a form that is suitable for 31 /* caseless comparison, rather than presentation to humans. 32 /* 33 /* When compiled without EAI support or util_utf8_enable is 34 /* zero, casefold() implements ASCII case folding, leaving 35 /* non-ASCII byte values unchanged. 36 /* 37 /* When compiled with EAI support and util_utf8_enable is 38 /* non-zero, casefold() implements UTF-8 case folding using 39 /* the en_US locale, as recommended when the conversion result 40 /* is not meant to be presented to humans. 41 /* 42 /* casefold_len() implements casefold() with a source length 43 /* argument. 44 /* 45 /* casefold_append() implements casefold() without overwriting 46 /* the result. 47 /* 48 /* casefoldx() implements a more complex API that implements 49 /* all of the above and more. 50 /* 51 /* Arguments: 52 /* .IP src 53 /* Null-terminated input string. 54 /* .IP dest 55 /* Output buffer, null-terminated. Specify a null pointer to 56 /* use an internal buffer that is overwritten upon each call. 57 /* .IP src_len 58 /* The string length, -1 to determine the length dynamically. 59 /* .IP flags 60 /* Bitwise OR of zero or more of the following: 61 /* .RS 62 /* .IP CASEF_FLAG_UTF8 63 /* Enable UTF-8 support. This flag has no effect when compiled 64 /* without EAI support. 65 /* .IP CASEF_FLAG_APPEND 66 /* Append the result to the buffer, instead of overwriting it. 67 /* DIAGNOSTICS 68 /* All errors are fatal. There appear to be no input-dependent 69 /* errors. 70 /* 71 /* With the ICU 4.8 library, there is no casefold error for 72 /* UTF-8 code points U+0000..U+10FFFF (including surrogate 73 /* range), not even when running inside an empty chroot jail. 74 /* Nor does malformed UTF-8 trigger errors; non-UTF-8 bytes 75 /* are copied verbatim. Based on ICU 4.8 source-code review 76 /* and experimentation(!) we conclude that UTF-8 casefolding 77 /* has no data-dependent error cases, and that it is safe to 78 /* treat all casefolding errors as fatal runtime errors. 79 /* LICENSE 80 /* .ad 81 /* .fi 82 /* The Secure Mailer license must be distributed with this software. 83 /* AUTHOR(S) 84 /* Wietse Venema 85 /* IBM T.J. Watson Research 86 /* P.O. Box 704 87 /* Yorktown Heights, NY 10598, USA 88 /* 89 /* Wietse Venema 90 /* Google, Inc. 91 /* 111 8th Avenue 92 /* New York, NY 10011, USA 93 /*--*/ 94 95 /* System library. */ 96 97 #include <sys_defs.h> 98 #include <string.h> 99 #include <ctype.h> 100 #ifndef NO_EAI 101 #include <unicode/ucasemap.h> 102 #include <unicode/ustring.h> 103 #include <unicode/uchar.h> 104 #endif 105 106 /* Utility library. */ 107 108 #include <msg.h> 109 #include <stringops.h> 110 111 #define STR(x) vstring_str(x) 112 #define LEN(x) VSTRING_LEN(x) 113 114 /* casefoldx - casefold an UTF-8 string */ 115 116 char *casefoldx(int flags, VSTRING *dest, const char *src, ssize_t len) 117 { 118 size_t old_len; 119 120 #ifdef NO_EAI 121 122 /* 123 * ASCII mode only. 124 */ 125 if (len < 0) 126 len = strlen(src); 127 if ((flags & CASEF_FLAG_APPEND) == 0) 128 VSTRING_RESET(dest); 129 old_len = VSTRING_LEN(dest); 130 vstring_strncat(dest, src, len); 131 lowercase(STR(dest) + old_len); 132 return (STR(dest)); 133 #else 134 135 /* 136 * Unicode mode. 137 */ 138 const char myname[] = "casefold"; 139 static VSTRING *fold_buf = 0; 140 static UCaseMap *csm = 0; 141 UErrorCode error; 142 ssize_t space_needed; 143 int n; 144 145 /* 146 * Handle special cases. 147 */ 148 if (len < 0) 149 len = strlen(src); 150 if (dest == 0) 151 dest = (fold_buf != 0 ? fold_buf : (fold_buf = vstring_alloc(100))); 152 if ((flags & CASEF_FLAG_APPEND) == 0) 153 VSTRING_RESET(dest); 154 old_len = VSTRING_LEN(dest); 155 156 /* 157 * All-ASCII input, or ASCII mode only. 158 */ 159 if ((flags & CASEF_FLAG_UTF8) == 0 || allascii(src)) { 160 vstring_strncat(dest, src, len); 161 lowercase(STR(dest) + old_len); 162 return (STR(dest)); 163 } 164 165 /* 166 * ICU 4.8 ucasemap_utf8FoldCase() does not complain about UTF-8 syntax 167 * errors. XXX Based on source-code review we conclude that non-UTF-8 168 * bytes are copied verbatim, and experiments confirm this. Given that 169 * this behavior is intentional, we assume that it will stay that way. 170 */ 171 #if 0 172 if (valid_utf8_string(src, len) == 0) { 173 if (err) 174 *err = "malformed UTF-8 or invalid codepoint"; 175 return (0); 176 } 177 #endif 178 179 /* 180 * One-time initialization. With ICU 4.8 this works while chrooted. 181 */ 182 if (csm == 0) { 183 error = U_ZERO_ERROR; 184 csm = ucasemap_open("en_US", U_FOLD_CASE_DEFAULT, &error); 185 if (U_SUCCESS(error) == 0) 186 msg_fatal("ucasemap_open error: %s", u_errorName(error)); 187 } 188 189 /* 190 * Fold the input, adjusting the buffer size if needed. Safety: don't 191 * loop forever. 192 * 193 * Note: the requested amount of space for casemapped output (as reported 194 * with space_needed below) does not include storage for the null 195 * terminator. The terminator is written only when the output buffer is 196 * large enough. This is why we overallocate space when the output does 197 * not fit. But if the output fits exactly, then the output will be 198 * unterminated, and we have to terminate the output ourselves. 199 */ 200 for (n = 0; n < 3; n++) { 201 error = U_ZERO_ERROR; 202 space_needed = ucasemap_utf8FoldCase(csm, STR(dest) + old_len, 203 vstring_avail(dest), src, len, &error); 204 if (U_SUCCESS(error)) { 205 vstring_set_payload_size(dest, old_len + space_needed); 206 if (vstring_avail(dest) == 0) /* exact fit, no terminator */ 207 VSTRING_TERMINATE(dest); /* add terminator */ 208 break; 209 } else if (error == U_BUFFER_OVERFLOW_ERROR) { 210 VSTRING_SPACE(dest, space_needed + 1); /* for terminator */ 211 } else { 212 msg_fatal("%s: conversion error for \"%s\": %s", 213 myname, src, u_errorName(error)); 214 } 215 } 216 return (STR(dest)); 217 #endif /* NO_EAI */ 218 } 219 220 #ifdef TEST 221 222 static void encode_utf8(VSTRING *buffer, int codepoint) 223 { 224 const char myname[] = "encode_utf8"; 225 226 VSTRING_RESET(buffer); 227 if (codepoint < 0x80) { 228 VSTRING_ADDCH(buffer, codepoint); 229 } else if (codepoint < 0x800) { 230 VSTRING_ADDCH(buffer, 0xc0 | (codepoint >> 6)); 231 VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f)); 232 } else if (codepoint < 0x10000) { 233 VSTRING_ADDCH(buffer, 0xe0 | (codepoint >> 12)); 234 VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 6) & 0x3f)); 235 VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f)); 236 } else if (codepoint <= 0x10FFFF) { 237 VSTRING_ADDCH(buffer, 0xf0 | (codepoint >> 18)); 238 VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 12) & 0x3f)); 239 VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 6) & 0x3f)); 240 VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f)); 241 } else { 242 msg_panic("%s: out-of-range codepoint U+%X", myname, codepoint); 243 } 244 VSTRING_TERMINATE(buffer); 245 } 246 247 #include <stdlib.h> 248 #include <stdio.h> 249 #include <locale.h> 250 251 #include <vstream.h> 252 #include <vstring_vstream.h> 253 #include <msg_vstream.h> 254 255 int main(int argc, char **argv) 256 { 257 VSTRING *buffer = vstring_alloc(1); 258 VSTRING *dest = vstring_alloc(1); 259 char *bp; 260 char *conv_res; 261 char *cmd; 262 int codepoint, first, last; 263 VSTREAM *fp; 264 265 if (setlocale(LC_ALL, "C") == 0) 266 msg_fatal("setlocale(LC_ALL, C) failed: %m"); 267 268 msg_vstream_init(argv[0], VSTREAM_ERR); 269 270 util_utf8_enable = 1; 271 272 VSTRING_SPACE(buffer, 256); /* chroot/file pathname */ 273 274 while (vstring_fgets_nonl(buffer, VSTREAM_IN)) { 275 bp = STR(buffer); 276 vstream_printf("> %s\n", bp); 277 cmd = mystrtok(&bp, CHARS_SPACE); 278 if (cmd == 0 || *cmd == '#') 279 continue; 280 while (ISSPACE(*bp)) 281 bp++; 282 283 /* 284 * Null-terminated string. 285 */ 286 if (strcmp(cmd, "fold") == 0) { 287 conv_res = casefold(dest, bp); 288 vstream_printf("\"%s\" ->fold \"%s\"\n", bp, conv_res); 289 } 290 291 /* 292 * Codepoint range. 293 */ 294 else if (strcmp(cmd, "range") == 0 295 && sscanf(bp, "%i %i", &first, &last) == 2 296 && first <= last) { 297 for (codepoint = first; codepoint <= last; codepoint++) { 298 if (codepoint >= 0xD800 && codepoint <= 0xDFFF) { 299 vstream_printf("skipping surrogate range\n"); 300 codepoint = 0xDFFF; 301 } else { 302 encode_utf8(buffer, codepoint); 303 if (msg_verbose) 304 vstream_printf("U+%X -> %s\n", codepoint, STR(buffer)); 305 if (valid_utf8_string(STR(buffer), LEN(buffer)) == 0) 306 msg_fatal("bad utf-8 encoding for U+%X\n", codepoint); 307 casefold(dest, STR(buffer)); 308 } 309 } 310 vstream_printf("range completed: 0x%x..0x%x\n", first, last); 311 } 312 313 /* 314 * Chroot directory. 315 */ 316 else if (strcmp(cmd, "chroot") == 0 317 && sscanf(bp, "%255s", STR(buffer)) == 1) { 318 if (geteuid() == 0) { 319 if (chdir(STR(buffer)) < 0) 320 msg_fatal("chdir(%s): %m", STR(buffer)); 321 if (chroot(STR(buffer)) < 0) 322 msg_fatal("chroot(%s): %m", STR(buffer)); 323 vstream_printf("chroot %s completed\n", STR(buffer)); 324 } 325 } 326 327 /* 328 * File. 329 */ 330 else if (strcmp(cmd, "file") == 0 331 && sscanf(bp, "%255s", STR(buffer)) == 1) { 332 if ((fp = vstream_fopen(STR(buffer), O_RDONLY, 0)) == 0) 333 msg_fatal("open(%s): %m", STR(buffer)); 334 while (vstring_fgets_nonl(buffer, fp)) 335 vstream_printf("%s\n", casefold(dest, STR(buffer))); 336 vstream_fclose(fp); 337 } 338 339 /* 340 * Verbose. 341 */ 342 else if (strcmp(cmd, "verbose") == 0 343 && sscanf(bp, "%i", &msg_verbose) == 1) { 344 /* void */ ; 345 } 346 347 /* 348 * Usage 349 */ 350 else { 351 vstream_printf("Usage: %s chroot <path> | file <path> | fold <text> | range <first> <last> | verbose <int>\n", 352 argv[0]); 353 } 354 vstream_fflush(VSTREAM_OUT); 355 } 356 vstring_free(buffer); 357 vstring_free(dest); 358 exit(0); 359 } 360 361 #endif /* TEST */ 362