1 /* $OpenBSD: utf8.c,v 1.44 2019/11/25 15:04:15 nicm Exp $ */ 2 3 /* 4 * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER 15 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING 16 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/types.h> 20 21 #include <ctype.h> 22 #include <errno.h> 23 #include <stdlib.h> 24 #include <string.h> 25 #include <vis.h> 26 #include <wchar.h> 27 28 #include "tmux.h" 29 30 static int utf8_width(wchar_t); 31 32 /* Set a single character. */ 33 void 34 utf8_set(struct utf8_data *ud, u_char ch) 35 { 36 static const struct utf8_data empty = { { 0 }, 1, 1, 1 }; 37 38 memcpy(ud, &empty, sizeof *ud); 39 *ud->data = ch; 40 } 41 42 /* Copy UTF-8 character. */ 43 void 44 utf8_copy(struct utf8_data *to, const struct utf8_data *from) 45 { 46 u_int i; 47 48 memcpy(to, from, sizeof *to); 49 50 for (i = to->size; i < sizeof to->data; i++) 51 to->data[i] = '\0'; 52 } 53 54 /* 55 * Open UTF-8 sequence. 56 * 57 * 11000010-11011111 C2-DF start of 2-byte sequence 58 * 11100000-11101111 E0-EF start of 3-byte sequence 59 * 11110000-11110100 F0-F4 start of 4-byte sequence 60 */ 61 enum utf8_state 62 utf8_open(struct utf8_data *ud, u_char ch) 63 { 64 memset(ud, 0, sizeof *ud); 65 if (ch >= 0xc2 && ch <= 0xdf) 66 ud->size = 2; 67 else if (ch >= 0xe0 && ch <= 0xef) 68 ud->size = 3; 69 else if (ch >= 0xf0 && ch <= 0xf4) 70 ud->size = 4; 71 else 72 return (UTF8_ERROR); 73 utf8_append(ud, ch); 74 return (UTF8_MORE); 75 } 76 77 /* Append character to UTF-8, closing if finished. */ 78 enum utf8_state 79 utf8_append(struct utf8_data *ud, u_char ch) 80 { 81 wchar_t wc; 82 int width; 83 84 if (ud->have >= ud->size) 85 fatalx("UTF-8 character overflow"); 86 if (ud->size > sizeof ud->data) 87 fatalx("UTF-8 character size too large"); 88 89 if (ud->have != 0 && (ch & 0xc0) != 0x80) 90 ud->width = 0xff; 91 92 ud->data[ud->have++] = ch; 93 if (ud->have != ud->size) 94 return (UTF8_MORE); 95 96 if (ud->width == 0xff) 97 return (UTF8_ERROR); 98 99 if (utf8_combine(ud, &wc) != UTF8_DONE) 100 return (UTF8_ERROR); 101 if ((width = utf8_width(wc)) < 0) 102 return (UTF8_ERROR); 103 ud->width = width; 104 105 return (UTF8_DONE); 106 } 107 108 /* Get width of Unicode character. */ 109 static int 110 utf8_width(wchar_t wc) 111 { 112 int width; 113 114 width = wcwidth(wc); 115 if (width < 0 || width > 0xff) { 116 log_debug("Unicode %04lx, wcwidth() %d", (long)wc, width); 117 return (-1); 118 } 119 return (width); 120 } 121 122 /* Combine UTF-8 into Unicode. */ 123 enum utf8_state 124 utf8_combine(const struct utf8_data *ud, wchar_t *wc) 125 { 126 switch (mbtowc(wc, ud->data, ud->size)) { 127 case -1: 128 log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data, 129 errno); 130 mbtowc(NULL, NULL, MB_CUR_MAX); 131 return (UTF8_ERROR); 132 case 0: 133 return (UTF8_ERROR); 134 default: 135 return (UTF8_DONE); 136 } 137 } 138 139 /* Split Unicode into UTF-8. */ 140 enum utf8_state 141 utf8_split(wchar_t wc, struct utf8_data *ud) 142 { 143 char s[MB_LEN_MAX]; 144 int slen; 145 146 slen = wctomb(s, wc); 147 if (slen <= 0 || slen > (int)sizeof ud->data) 148 return (UTF8_ERROR); 149 150 memcpy(ud->data, s, slen); 151 ud->size = slen; 152 153 ud->width = utf8_width(wc); 154 return (UTF8_DONE); 155 } 156 157 /* 158 * Encode len characters from src into dst, which is guaranteed to have four 159 * bytes available for each character from src (for \abc or UTF-8) plus space 160 * for \0. 161 */ 162 int 163 utf8_strvis(char *dst, const char *src, size_t len, int flag) 164 { 165 struct utf8_data ud; 166 const char *start, *end; 167 enum utf8_state more; 168 size_t i; 169 170 start = dst; 171 end = src + len; 172 173 while (src < end) { 174 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { 175 while (++src < end && more == UTF8_MORE) 176 more = utf8_append(&ud, *src); 177 if (more == UTF8_DONE) { 178 /* UTF-8 character finished. */ 179 for (i = 0; i < ud.size; i++) 180 *dst++ = ud.data[i]; 181 continue; 182 } 183 /* Not a complete, valid UTF-8 character. */ 184 src -= ud.have; 185 } 186 if (src[0] == '$' && src < end - 1) { 187 if (isalpha((u_char)src[1]) || 188 src[1] == '_' || 189 src[1] == '{') 190 *dst++ = '\\'; 191 *dst++ = '$'; 192 } else if (src < end - 1) 193 dst = vis(dst, src[0], flag, src[1]); 194 else if (src < end) 195 dst = vis(dst, src[0], flag, '\0'); 196 src++; 197 } 198 199 *dst = '\0'; 200 return (dst - start); 201 } 202 203 /* Same as utf8_strvis but allocate the buffer. */ 204 int 205 utf8_stravis(char **dst, const char *src, int flag) 206 { 207 char *buf; 208 int len; 209 210 buf = xreallocarray(NULL, 4, strlen(src) + 1); 211 len = utf8_strvis(buf, src, strlen(src), flag); 212 213 *dst = xrealloc(buf, len + 1); 214 return (len); 215 } 216 217 /* Does this string contain anything that isn't valid UTF-8? */ 218 int 219 utf8_isvalid(const char *s) 220 { 221 struct utf8_data ud; 222 const char *end; 223 enum utf8_state more; 224 225 end = s + strlen(s); 226 while (s < end) { 227 if ((more = utf8_open(&ud, *s)) == UTF8_MORE) { 228 while (++s < end && more == UTF8_MORE) 229 more = utf8_append(&ud, *s); 230 if (more == UTF8_DONE) 231 continue; 232 return (0); 233 } 234 if (*s < 0x20 || *s > 0x7e) 235 return (0); 236 s++; 237 } 238 return (1); 239 } 240 241 /* 242 * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free 243 * the returned string. Anything not valid printable ASCII or UTF-8 is 244 * stripped. 245 */ 246 char * 247 utf8_sanitize(const char *src) 248 { 249 char *dst; 250 size_t n; 251 enum utf8_state more; 252 struct utf8_data ud; 253 u_int i; 254 255 dst = NULL; 256 257 n = 0; 258 while (*src != '\0') { 259 dst = xreallocarray(dst, n + 1, sizeof *dst); 260 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { 261 while (*++src != '\0' && more == UTF8_MORE) 262 more = utf8_append(&ud, *src); 263 if (more == UTF8_DONE) { 264 dst = xreallocarray(dst, n + ud.width, 265 sizeof *dst); 266 for (i = 0; i < ud.width; i++) 267 dst[n++] = '_'; 268 continue; 269 } 270 src -= ud.have; 271 } 272 if (*src > 0x1f && *src < 0x7f) 273 dst[n++] = *src; 274 else 275 dst[n++] = '_'; 276 src++; 277 } 278 279 dst = xreallocarray(dst, n + 1, sizeof *dst); 280 dst[n] = '\0'; 281 return (dst); 282 } 283 284 /* Get UTF-8 buffer length. */ 285 size_t 286 utf8_strlen(const struct utf8_data *s) 287 { 288 size_t i; 289 290 for (i = 0; s[i].size != 0; i++) 291 /* nothing */; 292 return (i); 293 } 294 295 /* Get UTF-8 string width. */ 296 u_int 297 utf8_strwidth(const struct utf8_data *s, ssize_t n) 298 { 299 ssize_t i; 300 u_int width; 301 302 width = 0; 303 for (i = 0; s[i].size != 0; i++) { 304 if (n != -1 && n == i) 305 break; 306 width += s[i].width; 307 } 308 return (width); 309 } 310 311 /* 312 * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0. 313 * Caller frees. 314 */ 315 struct utf8_data * 316 utf8_fromcstr(const char *src) 317 { 318 struct utf8_data *dst; 319 size_t n; 320 enum utf8_state more; 321 322 dst = NULL; 323 324 n = 0; 325 while (*src != '\0') { 326 dst = xreallocarray(dst, n + 1, sizeof *dst); 327 if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) { 328 while (*++src != '\0' && more == UTF8_MORE) 329 more = utf8_append(&dst[n], *src); 330 if (more == UTF8_DONE) { 331 n++; 332 continue; 333 } 334 src -= dst[n].have; 335 } 336 utf8_set(&dst[n], *src); 337 n++; 338 src++; 339 } 340 341 dst = xreallocarray(dst, n + 1, sizeof *dst); 342 dst[n].size = 0; 343 return (dst); 344 } 345 346 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */ 347 char * 348 utf8_tocstr(struct utf8_data *src) 349 { 350 char *dst; 351 size_t n; 352 353 dst = NULL; 354 355 n = 0; 356 for(; src->size != 0; src++) { 357 dst = xreallocarray(dst, n + src->size, 1); 358 memcpy(dst + n, src->data, src->size); 359 n += src->size; 360 } 361 362 dst = xreallocarray(dst, n + 1, 1); 363 dst[n] = '\0'; 364 return (dst); 365 } 366 367 /* Get width of UTF-8 string. */ 368 u_int 369 utf8_cstrwidth(const char *s) 370 { 371 struct utf8_data tmp; 372 u_int width; 373 enum utf8_state more; 374 375 width = 0; 376 while (*s != '\0') { 377 if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) { 378 while (*++s != '\0' && more == UTF8_MORE) 379 more = utf8_append(&tmp, *s); 380 if (more == UTF8_DONE) { 381 width += tmp.width; 382 continue; 383 } 384 s -= tmp.have; 385 } 386 if (*s > 0x1f && *s != 0x7f) 387 width++; 388 s++; 389 } 390 return (width); 391 } 392 393 /* Pad UTF-8 string to width on the left. Caller frees. */ 394 char * 395 utf8_padcstr(const char *s, u_int width) 396 { 397 size_t slen; 398 char *out; 399 u_int n, i; 400 401 n = utf8_cstrwidth(s); 402 if (n >= width) 403 return (xstrdup(s)); 404 405 slen = strlen(s); 406 out = xmalloc(slen + 1 + (width - n)); 407 memcpy(out, s, slen); 408 for (i = n; i < width; i++) 409 out[slen++] = ' '; 410 out[slen] = '\0'; 411 return (out); 412 } 413 414 /* Pad UTF-8 string to width on the right. Caller frees. */ 415 char * 416 utf8_rpadcstr(const char *s, u_int width) 417 { 418 size_t slen; 419 char *out; 420 u_int n, i; 421 422 n = utf8_cstrwidth(s); 423 if (n >= width) 424 return (xstrdup(s)); 425 426 slen = strlen(s); 427 out = xmalloc(slen + 1 + (width - n)); 428 for (i = 0; i < width - n; i++) 429 out[i] = ' '; 430 memcpy(out + i, s, slen); 431 out[i + slen] = '\0'; 432 return (out); 433 } 434 435 int 436 utf8_cstrhas(const char *s, const struct utf8_data *ud) 437 { 438 struct utf8_data *copy, *loop; 439 int found = 0; 440 441 copy = utf8_fromcstr(s); 442 for (loop = copy; loop->size != 0; loop++) { 443 if (loop->size != ud->size) 444 continue; 445 if (memcmp(loop->data, ud->data, loop->size) == 0) { 446 found = 1; 447 break; 448 } 449 } 450 free(copy); 451 452 return (found); 453 } 454