1 /* $NetBSD: casefold.c,v 1.2 2017/02/14 01:16:49 christos Exp $ */ 2 3 /*++ 4 /* NAME 5 /* casefold 3 6 /* SUMMARY 7 /* casefold text for caseless comparison 8 /* SYNOPSIS 9 /* #include <stringops.h> 10 /* 11 /* char *casefold( 12 /* VSTRING *dst, 13 /* const char *src) 14 /* 15 /* char *casefold_append( 16 /* VSTRING *dst, 17 /* const char *src) 18 /* 19 /* char *casefold_len( 20 /* VSTRING *dst, 21 /* const char *src, 22 /* ssize_t src_len) 23 /* AUXILIARY FUNCTIONS 24 /* char *casefoldx( 25 /* int flags, 26 /* VSTRING *dst, 27 /* const char *src, 28 /* ssize_t src_len) 29 /* DESCRIPTION 30 /* casefold() converts text to a form that is suitable for 31 /* caseless comparison, rather than presentation to humans. 32 /* 33 /* When compiled without EAI support or util_utf8_enable is 34 /* zero, casefold() implements ASCII case folding, leaving 35 /* non-ASCII byte values unchanged. 36 /* 37 /* When compiled with EAI support and util_utf8_enable is 38 /* non-zero, casefold() implements UTF-8 case folding using 39 /* the en_US locale, as recommended when the conversion result 40 /* is not meant to be presented to humans. 41 /* 42 /* casefold_len() implements casefold() with a source length 43 /* argument. 44 /* 45 /* casefold_append() implements casefold() without overwriting 46 /* the result. 47 /* 48 /* casefoldx() implements a more complex API that implements 49 /* all of the above and more. 50 /* 51 /* Arguments: 52 /* .IP src 53 /* Null-terminated input string. 54 /* .IP dest 55 /* Output buffer, null-terminated. Specify a null pointer to 56 /* use an internal buffer that is overwritten upon each call. 57 /* .IP src_len 58 /* The string length, -1 to determine the length dynamically. 59 /* .IP flags 60 /* Bitwise OR of zero or more of the following: 61 /* .RS 62 /* .IP CASEF_FLAG_UTF8 63 /* Enable UTF-8 support. This flag has no effect when compiled 64 /* without EAI support. 65 /* .IP CASEF_FLAG_APPEND 66 /* Append the result to the buffer, instead of overwriting it. 67 /* DIAGNOSTICS 68 /* All errors are fatal. There appear to be no input-dependent 69 /* errors. 70 /* 71 /* With the ICU 4.8 library, there is no casefold error for 72 /* UTF-8 code points U+0000..U+10FFFF (including surrogate 73 /* range), not even when running inside an empty chroot jail. 74 /* Nor does malformed UTF-8 trigger errors; non-UTF-8 bytes 75 /* are copied verbatim. Based on ICU 4.8 source-code review 76 /* and experimentation(!) we conclude that UTF-8 casefolding 77 /* has no data-dependent error cases, and that it is safe to 78 /* treat all casefolding errors as fatal runtime errors. 79 /* LICENSE 80 /* .ad 81 /* .fi 82 /* The Secure Mailer license must be distributed with this software. 83 /* AUTHOR(S) 84 /* Wietse Venema 85 /* IBM T.J. Watson Research 86 /* P.O. Box 704 87 /* Yorktown Heights, NY 10598, USA 88 /*--*/ 89 90 /* System library. */ 91 92 #include <sys_defs.h> 93 #include <string.h> 94 #include <ctype.h> 95 #ifndef NO_EAI 96 #include <unicode/ucasemap.h> 97 #include <unicode/ustring.h> 98 #include <unicode/uchar.h> 99 #endif 100 101 /* Utility library. */ 102 103 #include <msg.h> 104 #include <stringops.h> 105 106 #define STR(x) vstring_str(x) 107 #define LEN(x) VSTRING_LEN(x) 108 109 /* casefoldx - casefold an UTF-8 string */ 110 111 char *casefoldx(int flags, VSTRING *dest, const char *src, ssize_t len) 112 { 113 size_t old_len; 114 115 #ifdef NO_EAI 116 117 /* 118 * ASCII mode only. 119 */ 120 if (len < 0) 121 len = strlen(src); 122 if ((flags & CASEF_FLAG_APPEND) == 0) 123 VSTRING_RESET(dest); 124 old_len = VSTRING_LEN(dest); 125 vstring_strncat(dest, src, len); 126 lowercase(STR(dest) + old_len); 127 return (STR(dest)); 128 #else 129 130 /* 131 * Unicode mode. 132 */ 133 const char myname[] = "casefold"; 134 static VSTRING *fold_buf = 0; 135 static UCaseMap *csm = 0; 136 UErrorCode error; 137 ssize_t space_needed; 138 int n; 139 140 /* 141 * Handle special cases. 142 */ 143 if (len < 0) 144 len = strlen(src); 145 if (dest == 0) 146 dest = (fold_buf != 0 ? fold_buf : (fold_buf = vstring_alloc(100))); 147 if ((flags & CASEF_FLAG_APPEND) == 0) 148 VSTRING_RESET(dest); 149 old_len = VSTRING_LEN(dest); 150 151 /* 152 * All-ASCII input, or ASCII mode only. 153 */ 154 if ((flags & CASEF_FLAG_UTF8) == 0 || allascii(src)) { 155 vstring_strncat(dest, src, len); 156 lowercase(STR(dest) + old_len); 157 return (STR(dest)); 158 } 159 160 /* 161 * ICU 4.8 ucasemap_utf8FoldCase() does not complain about UTF-8 syntax 162 * errors. XXX Based on source-code review we conclude that non-UTF-8 163 * bytes are copied verbatim, and experiments confirm this. Given that 164 * this behavior is intentional, we assume that it will stay that way. 165 */ 166 #if 0 167 if (valid_utf8_string(src, len) == 0) { 168 if (err) 169 *err = "malformed UTF-8 or invalid codepoint"; 170 return (0); 171 } 172 #endif 173 174 /* 175 * One-time initialization. With ICU 4.8 this works while chrooted. 176 */ 177 if (csm == 0) { 178 error = U_ZERO_ERROR; 179 csm = ucasemap_open("en_US", U_FOLD_CASE_DEFAULT, &error); 180 if (U_SUCCESS(error) == 0) 181 msg_fatal("ucasemap_open error: %s", u_errorName(error)); 182 } 183 184 /* 185 * Fold the input, adjusting the buffer size if needed. Safety: don't 186 * loop forever. 187 * 188 * Note: the requested amount of space for casemapped output (as reported 189 * with space_needed below) does not include storage for the null 190 * terminator. The terminator is written only when the output buffer is 191 * large enough. This is why we overallocate space when the output does 192 * not fit. But if the output fits exactly, then the ouput will be 193 * unterminated, and we have to terminate the output ourselves. 194 */ 195 for (n = 0; n < 3; n++) { 196 error = U_ZERO_ERROR; 197 space_needed = ucasemap_utf8FoldCase(csm, STR(dest) + old_len, 198 vstring_avail(dest), src, len, &error); 199 if (U_SUCCESS(error)) { 200 VSTRING_AT_OFFSET(dest, old_len + space_needed); 201 if (vstring_avail(dest) == 0) /* exact fit, no terminator */ 202 VSTRING_TERMINATE(dest); /* add terminator */ 203 break; 204 } else if (error == U_BUFFER_OVERFLOW_ERROR) { 205 VSTRING_SPACE(dest, space_needed + 1); /* for terminator */ 206 } else { 207 msg_fatal("%s: conversion error for \"%s\": %s", 208 myname, src, u_errorName(error)); 209 } 210 } 211 return (STR(dest)); 212 #endif /* NO_EAI */ 213 } 214 215 #ifdef TEST 216 217 static void encode_utf8(VSTRING *buffer, int codepoint) 218 { 219 const char myname[] = "encode_utf8"; 220 221 VSTRING_RESET(buffer); 222 if (codepoint < 0x80) { 223 VSTRING_ADDCH(buffer, codepoint); 224 } else if (codepoint < 0x800) { 225 VSTRING_ADDCH(buffer, 0xc0 | (codepoint >> 6)); 226 VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f)); 227 } else if (codepoint < 0x10000) { 228 VSTRING_ADDCH(buffer, 0xe0 | (codepoint >> 12)); 229 VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 6) & 0x3f)); 230 VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f)); 231 } else if (codepoint <= 0x10FFFF) { 232 VSTRING_ADDCH(buffer, 0xf0 | (codepoint >> 18)); 233 VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 12) & 0x3f)); 234 VSTRING_ADDCH(buffer, 0x80 | ((codepoint >> 6) & 0x3f)); 235 VSTRING_ADDCH(buffer, 0x80 | (codepoint & 0x3f)); 236 } else { 237 msg_panic("%s: out-of-range codepoint U+%X", myname, codepoint); 238 } 239 VSTRING_TERMINATE(buffer); 240 } 241 242 #include <stdlib.h> 243 #include <stdio.h> 244 #include <locale.h> 245 246 #include <vstream.h> 247 #include <vstring_vstream.h> 248 #include <msg_vstream.h> 249 250 int main(int argc, char **argv) 251 { 252 VSTRING *buffer = vstring_alloc(1); 253 VSTRING *dest = vstring_alloc(1); 254 char *bp; 255 char *conv_res; 256 char *cmd; 257 int codepoint, first, last; 258 VSTREAM *fp; 259 260 if (setlocale(LC_ALL, "C") == 0) 261 msg_fatal("setlocale(LC_ALL, C) failed: %m"); 262 263 msg_vstream_init(argv[0], VSTREAM_ERR); 264 265 util_utf8_enable = 1; 266 267 VSTRING_SPACE(buffer, 256); /* chroot/file pathname */ 268 269 while (vstring_fgets_nonl(buffer, VSTREAM_IN)) { 270 bp = STR(buffer); 271 vstream_printf("> %s\n", bp); 272 cmd = mystrtok(&bp, CHARS_SPACE); 273 if (cmd == 0 || *cmd == '#') 274 continue; 275 while (ISSPACE(*bp)) 276 bp++; 277 278 /* 279 * Null-terminated string. 280 */ 281 if (strcmp(cmd, "fold") == 0) { 282 conv_res = casefold(dest, bp); 283 vstream_printf("\"%s\" ->fold \"%s\"\n", bp, conv_res); 284 } 285 286 /* 287 * Codepoint range. 288 */ 289 else if (strcmp(cmd, "range") == 0 290 && sscanf(bp, "%i %i", &first, &last) == 2 291 && first <= last) { 292 for (codepoint = first; codepoint <= last; codepoint++) { 293 if (codepoint >= 0xD800 && codepoint <= 0xDFFF) { 294 vstream_printf("skipping surrogate range\n"); 295 codepoint = 0xDFFF; 296 } else { 297 encode_utf8(buffer, codepoint); 298 if (msg_verbose) 299 vstream_printf("U+%X -> %s\n", codepoint, STR(buffer)); 300 if (valid_utf8_string(STR(buffer), LEN(buffer)) == 0) 301 msg_fatal("bad utf-8 encoding for U+%X\n", codepoint); 302 casefold(dest, STR(buffer)); 303 } 304 } 305 vstream_printf("range completed: 0x%x..0x%x\n", first, last); 306 } 307 308 /* 309 * Chroot directory. 310 */ 311 else if (strcmp(cmd, "chroot") == 0 312 && sscanf(bp, "%255s", STR(buffer)) == 1) { 313 if (geteuid() == 0) { 314 if (chdir(STR(buffer)) < 0) 315 msg_fatal("chdir(%s): %m", STR(buffer)); 316 if (chroot(STR(buffer)) < 0) 317 msg_fatal("chroot(%s): %m", STR(buffer)); 318 vstream_printf("chroot %s completed\n", STR(buffer)); 319 } 320 } 321 322 /* 323 * File. 324 */ 325 else if (strcmp(cmd, "file") == 0 326 && sscanf(bp, "%255s", STR(buffer)) == 1) { 327 if ((fp = vstream_fopen(STR(buffer), O_RDONLY, 0)) == 0) 328 msg_fatal("open(%s): %m", STR(buffer)); 329 while (vstring_fgets_nonl(buffer, fp)) 330 vstream_printf("%s\n", casefold(dest, STR(buffer))); 331 vstream_fclose(fp); 332 } 333 334 /* 335 * Verbose. 336 */ 337 else if (strcmp(cmd, "verbose") == 0 338 && sscanf(bp, "%i", &msg_verbose) == 1) { 339 /* void */ ; 340 } 341 342 /* 343 * Usage 344 */ 345 else { 346 vstream_printf("Usage: %s chroot <path> | file <path> | fold <text> | range <first> <last> | verbose <int>\n", 347 argv[0]); 348 } 349 vstream_fflush(VSTREAM_OUT); 350 } 351 vstring_free(buffer); 352 vstring_free(dest); 353 exit(0); 354 } 355 356 #endif /* TEST */ 357