1 /* $OpenBSD: utf8.c,v 1.3 2016/05/30 12:57:21 schwarze Exp $ */ 2 /* 3 * Copyright (c) 2016 Ingo Schwarze <schwarze@openbsd.org> 4 * 5 * Permission to use, copy, modify, and distribute this software for any 6 * purpose with or without fee is hereby granted, provided that the above 7 * copyright notice and this permission notice appear in all copies. 8 * 9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 16 */ 17 18 /* 19 * Utility functions for multibyte-character handling, 20 * in particular to sanitize untrusted strings for terminal output. 21 */ 22 23 #include <sys/types.h> 24 #include <langinfo.h> 25 #include <limits.h> 26 #include <stdarg.h> 27 #include <stdio.h> 28 #include <stdlib.h> 29 #include <string.h> 30 #include <vis.h> 31 #include <wchar.h> 32 33 #include "utf8.h" 34 35 static int dangerous_locale(void); 36 static int grow_dst(char **, size_t *, size_t, char **, size_t); 37 static int vasnmprintf(char **, size_t, int *, const char *, va_list); 38 39 40 /* 41 * For US-ASCII and UTF-8 encodings, we can safely recover from 42 * encoding errors and from non-printable characters. For any 43 * other encodings, err to the side of caution and abort parsing: 44 * For state-dependent encodings, recovery is impossible. 45 * For arbitrary encodings, replacement of non-printable 46 * characters would be non-trivial and too fragile. 47 */ 48 49 static int 50 dangerous_locale(void) { 51 char *loc; 52 53 loc = nl_langinfo(CODESET); 54 return strcmp(loc, "US-ASCII") && strcmp(loc, "UTF-8"); 55 } 56 57 static int 58 grow_dst(char **dst, size_t *sz, size_t maxsz, char **dp, size_t need) 59 { 60 char *tp; 61 size_t tsz; 62 63 if (*dp + need < *dst + *sz) 64 return 0; 65 tsz = *sz + 128; 66 if (tsz > maxsz) 67 tsz = maxsz; 68 if ((tp = realloc(*dst, tsz)) == NULL) 69 return -1; 70 *dp = tp + (*dp - *dst); 71 *dst = tp; 72 *sz = tsz; 73 return 0; 74 } 75 76 /* 77 * The following two functions limit the number of bytes written, 78 * including the terminating '\0', to sz. Unless wp is NULL, 79 * they limit the number of display columns occupied to *wp. 80 * Whichever is reached first terminates the output string. 81 * To stay close to the standard interfaces, they return the number of 82 * non-NUL bytes that would have been written if both were unlimited. 83 * If wp is NULL, newline, carriage return, and tab are allowed; 84 * otherwise, the actual number of columns occupied by what was 85 * written is returned in *wp. 86 */ 87 88 static int 89 vasnmprintf(char **str, size_t maxsz, int *wp, const char *fmt, va_list ap) 90 { 91 char *src; /* Source string returned from vasprintf. */ 92 char *sp; /* Pointer into src. */ 93 char *dst; /* Destination string to be returned. */ 94 char *dp; /* Pointer into dst. */ 95 char *tp; /* Temporary pointer for dst. */ 96 size_t sz; /* Number of bytes allocated for dst. */ 97 wchar_t wc; /* Wide character at sp. */ 98 int len; /* Number of bytes in the character at sp. */ 99 int ret; /* Number of bytes needed to format src. */ 100 int width; /* Display width of the character wc. */ 101 int total_width, max_width, print; 102 103 src = NULL; 104 if ((ret = vasprintf(&src, fmt, ap)) <= 0) 105 goto fail; 106 107 sz = strlen(src) + 1; 108 if ((dst = malloc(sz)) == NULL) { 109 free(src); 110 goto fail; 111 } 112 113 if (maxsz > INT_MAX) 114 maxsz = INT_MAX; 115 116 sp = src; 117 dp = dst; 118 ret = 0; 119 print = 1; 120 total_width = 0; 121 max_width = wp == NULL ? INT_MAX : *wp; 122 while (*sp != '\0') { 123 if ((len = mbtowc(&wc, sp, MB_CUR_MAX)) == -1) { 124 (void)mbtowc(NULL, NULL, MB_CUR_MAX); 125 if (dangerous_locale()) { 126 ret = -1; 127 break; 128 } 129 len = 1; 130 width = -1; 131 } else if (wp == NULL && 132 (wc == L'\n' || wc == L'\r' || wc == L'\t')) { 133 /* 134 * Don't use width uninitialized; the actual 135 * value doesn't matter because total_width 136 * is only returned for wp != NULL. 137 */ 138 width = 0; 139 } else if ((width = wcwidth(wc)) == -1 && 140 dangerous_locale()) { 141 ret = -1; 142 break; 143 } 144 145 /* Valid, printable character. */ 146 147 if (width >= 0) { 148 if (print && (dp - dst >= (int)maxsz - len || 149 total_width > max_width - width)) 150 print = 0; 151 if (print) { 152 if (grow_dst(&dst, &sz, maxsz, 153 &dp, len) == -1) { 154 ret = -1; 155 break; 156 } 157 total_width += width; 158 memcpy(dp, sp, len); 159 dp += len; 160 } 161 sp += len; 162 if (ret >= 0) 163 ret += len; 164 continue; 165 } 166 167 /* Escaping required. */ 168 169 while (len > 0) { 170 if (print && (dp - dst >= (int)maxsz - 4 || 171 total_width > max_width - 4)) 172 print = 0; 173 if (print) { 174 if (grow_dst(&dst, &sz, maxsz, 175 &dp, 4) == -1) { 176 ret = -1; 177 break; 178 } 179 tp = vis(dp, *sp, VIS_OCTAL | VIS_ALL, 0); 180 width = tp - dp; 181 total_width += width; 182 dp = tp; 183 } else 184 width = 4; 185 len--; 186 sp++; 187 if (ret >= 0) 188 ret += width; 189 } 190 if (len > 0) 191 break; 192 } 193 free(src); 194 *dp = '\0'; 195 *str = dst; 196 if (wp != NULL) 197 *wp = total_width; 198 199 /* 200 * If the string was truncated by the width limit but 201 * would have fit into the size limit, the only sane way 202 * to report the problem is using the return value, such 203 * that the usual idiom "if (ret < 0 || ret >= sz) error" 204 * works as expected. 205 */ 206 207 if (ret < (int)maxsz && !print) 208 ret = -1; 209 return ret; 210 211 fail: 212 if (wp != NULL) 213 *wp = 0; 214 if (ret == 0) { 215 *str = src; 216 return 0; 217 } else { 218 *str = NULL; 219 return -1; 220 } 221 } 222 223 int 224 snmprintf(char *str, size_t sz, int *wp, const char *fmt, ...) 225 { 226 va_list ap; 227 char *cp; 228 int ret; 229 230 va_start(ap, fmt); 231 ret = vasnmprintf(&cp, sz, wp, fmt, ap); 232 va_end(ap); 233 if (cp != NULL) { 234 (void)strlcpy(str, cp, sz); 235 free(cp); 236 } else 237 *str = '\0'; 238 return ret; 239 } 240 241 /* 242 * To stay close to the standard interfaces, the following functions 243 * return the number of non-NUL bytes written. 244 */ 245 246 int 247 vfmprintf(FILE *stream, const char *fmt, va_list ap) 248 { 249 char *str; 250 int ret; 251 252 if ((ret = vasnmprintf(&str, INT_MAX, NULL, fmt, ap)) < 0) 253 return -1; 254 if (fputs(str, stream) == EOF) 255 ret = -1; 256 free(str); 257 return ret; 258 } 259 260 int 261 fmprintf(FILE *stream, const char *fmt, ...) 262 { 263 va_list ap; 264 int ret; 265 266 va_start(ap, fmt); 267 ret = vfmprintf(stream, fmt, ap); 268 va_end(ap); 269 return ret; 270 } 271 272 int 273 mprintf(const char *fmt, ...) 274 { 275 va_list ap; 276 int ret; 277 278 va_start(ap, fmt); 279 ret = vfmprintf(stdout, fmt, ap); 280 va_end(ap); 281 return ret; 282 } 283