1 /* $OpenBSD$ */ 2 3 /* 4 * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER 15 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING 16 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/types.h> 20 21 #include <ctype.h> 22 #include <errno.h> 23 #include <stdlib.h> 24 #include <string.h> 25 #include <wchar.h> 26 27 #include "tmux.h" 28 29 struct utf8_item { 30 RB_ENTRY(utf8_item) index_entry; 31 u_int index; 32 33 RB_ENTRY(utf8_item) data_entry; 34 char data[UTF8_SIZE]; 35 u_char size; 36 }; 37 38 static int 39 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2) 40 { 41 if (ui1->size < ui2->size) 42 return (-1); 43 if (ui1->size > ui2->size) 44 return (1); 45 return (memcmp(ui1->data, ui2->data, ui1->size)); 46 } 47 RB_HEAD(utf8_data_tree, utf8_item); 48 RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp); 49 static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree); 50 51 static int 52 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2) 53 { 54 if (ui1->index < ui2->index) 55 return (-1); 56 if (ui1->index > ui2->index) 57 return (1); 58 return (0); 59 } 60 RB_HEAD(utf8_index_tree, utf8_item); 61 RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp); 62 static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree); 63 64 static u_int utf8_next_index; 65 66 #define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f) 67 #define UTF8_GET_WIDTH(flags) (((uc) >> 29) - 1) 68 69 #define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24) 70 #define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29) 71 72 /* Get a UTF-8 item from data. */ 73 static struct utf8_item * 74 utf8_item_by_data(const char *data, size_t size) 75 { 76 struct utf8_item ui; 77 78 memcpy(ui.data, data, size); 79 ui.size = size; 80 81 return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui)); 82 } 83 84 /* Get a UTF-8 item from data. */ 85 static struct utf8_item * 86 utf8_item_by_index(u_int index) 87 { 88 struct utf8_item ui; 89 90 ui.index = index; 91 92 return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui)); 93 } 94 95 /* Add a UTF-8 item. */ 96 static int 97 utf8_put_item(const u_char *data, size_t size, u_int *index) 98 { 99 struct utf8_item *ui; 100 101 ui = utf8_item_by_data((const char *)data, size); 102 if (ui != NULL) { 103 *index = ui->index; 104 log_debug("%s: found %.*s = %u", __func__, (int)size, data, 105 *index); 106 return (0); 107 } 108 109 if (utf8_next_index == 0xffffff + 1) 110 return (-1); 111 112 ui = xcalloc(1, sizeof *ui); 113 ui->index = utf8_next_index++; 114 RB_INSERT(utf8_index_tree, &utf8_index_tree, ui); 115 116 memcpy(ui->data, data, size); 117 ui->size = size; 118 RB_INSERT(utf8_data_tree, &utf8_data_tree, ui); 119 120 *index = ui->index; 121 log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index); 122 return (0); 123 } 124 125 /* Get UTF-8 character from data. */ 126 enum utf8_state 127 utf8_from_data(const struct utf8_data *ud, utf8_char *uc) 128 { 129 u_int index; 130 131 if (ud->width > 2) 132 fatalx("invalid UTF-8 width: %u", ud->width); 133 134 if (ud->size > UTF8_SIZE) 135 goto fail; 136 if (ud->size <= 3) { 137 index = (((utf8_char)ud->data[2] << 16)| 138 ((utf8_char)ud->data[1] << 8)| 139 ((utf8_char)ud->data[0])); 140 } else if (utf8_put_item(ud->data, ud->size, &index) != 0) 141 goto fail; 142 *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index; 143 log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size, 144 (int)ud->size, ud->data, *uc); 145 return (UTF8_DONE); 146 147 fail: 148 if (ud->width == 0) 149 *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0); 150 else if (ud->width == 1) 151 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20; 152 else 153 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020; 154 return (UTF8_ERROR); 155 } 156 157 /* Get UTF-8 data from character. */ 158 void 159 utf8_to_data(utf8_char uc, struct utf8_data *ud) 160 { 161 struct utf8_item *ui; 162 u_int index; 163 164 memset(ud, 0, sizeof *ud); 165 ud->size = ud->have = UTF8_GET_SIZE(uc); 166 ud->width = UTF8_GET_WIDTH(uc); 167 168 if (ud->size <= 3) { 169 ud->data[2] = (uc >> 16); 170 ud->data[1] = ((uc >> 8) & 0xff); 171 ud->data[0] = (uc & 0xff); 172 } else { 173 index = (uc & 0xffffff); 174 if ((ui = utf8_item_by_index(index)) == NULL) 175 memset(ud->data, ' ', ud->size); 176 else 177 memcpy(ud->data, ui->data, ud->size); 178 } 179 180 log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size, 181 (int)ud->size, ud->data); 182 } 183 184 /* Get UTF-8 character from a single ASCII character. */ 185 u_int 186 utf8_build_one(u_char ch) 187 { 188 return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch); 189 } 190 191 /* Set a single character. */ 192 void 193 utf8_set(struct utf8_data *ud, u_char ch) 194 { 195 static const struct utf8_data empty = { { 0 }, 1, 1, 1 }; 196 197 memcpy(ud, &empty, sizeof *ud); 198 *ud->data = ch; 199 } 200 201 /* Copy UTF-8 character. */ 202 void 203 utf8_copy(struct utf8_data *to, const struct utf8_data *from) 204 { 205 u_int i; 206 207 memcpy(to, from, sizeof *to); 208 209 for (i = to->size; i < sizeof to->data; i++) 210 to->data[i] = '\0'; 211 } 212 213 /* Get width of Unicode character. */ 214 static enum utf8_state 215 utf8_width(struct utf8_data *ud, int *width) 216 { 217 wchar_t wc; 218 219 #ifdef HAVE_UTF8PROC 220 switch (utf8proc_mbtowc(&wc, ud->data, ud->size)) { 221 #else 222 switch (mbtowc(&wc, (char *)ud->data, ud->size)) { 223 #endif 224 case -1: 225 log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data, 226 errno); 227 mbtowc(NULL, NULL, MB_CUR_MAX); 228 return (UTF8_ERROR); 229 case 0: 230 return (UTF8_ERROR); 231 } 232 #ifdef HAVE_UTF8PROC 233 *width = utf8proc_wcwidth(wc); 234 #else 235 *width = wcwidth(wc); 236 #endif 237 if (*width >= 0 && *width <= 0xff) 238 return (UTF8_DONE); 239 log_debug("UTF-8 %.*s, wcwidth() %d", (int)ud->size, ud->data, *width); 240 241 #ifndef __OpenBSD__ 242 /* 243 * Many platforms (particularly and inevitably OS X) have no width for 244 * relatively common characters (wcwidth() returns -1); assume width 1 245 * in this case. This will be wrong for genuinely nonprintable 246 * characters, but they should be rare. We may pass through stuff that 247 * ideally we would block, but this is no worse than sending the same 248 * to the terminal without tmux. 249 */ 250 if (*width < 0) { 251 *width = 1; 252 return (UTF8_DONE); 253 } 254 #endif 255 return (UTF8_ERROR); 256 } 257 258 /* 259 * Open UTF-8 sequence. 260 * 261 * 11000010-11011111 C2-DF start of 2-byte sequence 262 * 11100000-11101111 E0-EF start of 3-byte sequence 263 * 11110000-11110100 F0-F4 start of 4-byte sequence 264 */ 265 enum utf8_state 266 utf8_open(struct utf8_data *ud, u_char ch) 267 { 268 memset(ud, 0, sizeof *ud); 269 if (ch >= 0xc2 && ch <= 0xdf) 270 ud->size = 2; 271 else if (ch >= 0xe0 && ch <= 0xef) 272 ud->size = 3; 273 else if (ch >= 0xf0 && ch <= 0xf4) 274 ud->size = 4; 275 else 276 return (UTF8_ERROR); 277 utf8_append(ud, ch); 278 return (UTF8_MORE); 279 } 280 281 /* Append character to UTF-8, closing if finished. */ 282 enum utf8_state 283 utf8_append(struct utf8_data *ud, u_char ch) 284 { 285 int width; 286 287 if (ud->have >= ud->size) 288 fatalx("UTF-8 character overflow"); 289 if (ud->size > sizeof ud->data) 290 fatalx("UTF-8 character size too large"); 291 292 if (ud->have != 0 && (ch & 0xc0) != 0x80) 293 ud->width = 0xff; 294 295 ud->data[ud->have++] = ch; 296 if (ud->have != ud->size) 297 return (UTF8_MORE); 298 299 if (ud->width == 0xff) 300 return (UTF8_ERROR); 301 if (utf8_width(ud, &width) != UTF8_DONE) 302 return (UTF8_ERROR); 303 ud->width = width; 304 305 return (UTF8_DONE); 306 } 307 308 /* 309 * Encode len characters from src into dst, which is guaranteed to have four 310 * bytes available for each character from src (for \abc or UTF-8) plus space 311 * for \0. 312 */ 313 int 314 utf8_strvis(char *dst, const char *src, size_t len, int flag) 315 { 316 struct utf8_data ud; 317 const char *start = dst, *end = src + len; 318 enum utf8_state more; 319 size_t i; 320 321 while (src < end) { 322 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { 323 while (++src < end && more == UTF8_MORE) 324 more = utf8_append(&ud, *src); 325 if (more == UTF8_DONE) { 326 /* UTF-8 character finished. */ 327 for (i = 0; i < ud.size; i++) 328 *dst++ = ud.data[i]; 329 continue; 330 } 331 /* Not a complete, valid UTF-8 character. */ 332 src -= ud.have; 333 } 334 if (src[0] == '$' && src < end - 1) { 335 if (isalpha((u_char)src[1]) || 336 src[1] == '_' || 337 src[1] == '{') 338 *dst++ = '\\'; 339 *dst++ = '$'; 340 } else if (src < end - 1) 341 dst = vis(dst, src[0], flag, src[1]); 342 else if (src < end) 343 dst = vis(dst, src[0], flag, '\0'); 344 src++; 345 } 346 *dst = '\0'; 347 return (dst - start); 348 } 349 350 /* Same as utf8_strvis but allocate the buffer. */ 351 int 352 utf8_stravis(char **dst, const char *src, int flag) 353 { 354 char *buf; 355 int len; 356 357 buf = xreallocarray(NULL, 4, strlen(src) + 1); 358 len = utf8_strvis(buf, src, strlen(src), flag); 359 360 *dst = xrealloc(buf, len + 1); 361 return (len); 362 } 363 364 /* Same as utf8_strvis but allocate the buffer. */ 365 int 366 utf8_stravisx(char **dst, const char *src, size_t srclen, int flag) 367 { 368 char *buf; 369 int len; 370 371 buf = xreallocarray(NULL, 4, srclen + 1); 372 len = utf8_strvis(buf, src, srclen, flag); 373 374 *dst = xrealloc(buf, len + 1); 375 return (len); 376 } 377 378 /* Does this string contain anything that isn't valid UTF-8? */ 379 int 380 utf8_isvalid(const char *s) 381 { 382 struct utf8_data ud; 383 const char *end; 384 enum utf8_state more; 385 386 end = s + strlen(s); 387 while (s < end) { 388 if ((more = utf8_open(&ud, *s)) == UTF8_MORE) { 389 while (++s < end && more == UTF8_MORE) 390 more = utf8_append(&ud, *s); 391 if (more == UTF8_DONE) 392 continue; 393 return (0); 394 } 395 if (*s < 0x20 || *s > 0x7e) 396 return (0); 397 s++; 398 } 399 return (1); 400 } 401 402 /* 403 * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free 404 * the returned string. Anything not valid printable ASCII or UTF-8 is 405 * stripped. 406 */ 407 char * 408 utf8_sanitize(const char *src) 409 { 410 char *dst = NULL; 411 size_t n = 0; 412 enum utf8_state more; 413 struct utf8_data ud; 414 u_int i; 415 416 while (*src != '\0') { 417 dst = xreallocarray(dst, n + 1, sizeof *dst); 418 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { 419 while (*++src != '\0' && more == UTF8_MORE) 420 more = utf8_append(&ud, *src); 421 if (more == UTF8_DONE) { 422 dst = xreallocarray(dst, n + ud.width, 423 sizeof *dst); 424 for (i = 0; i < ud.width; i++) 425 dst[n++] = '_'; 426 continue; 427 } 428 src -= ud.have; 429 } 430 if (*src > 0x1f && *src < 0x7f) 431 dst[n++] = *src; 432 else 433 dst[n++] = '_'; 434 src++; 435 } 436 dst = xreallocarray(dst, n + 1, sizeof *dst); 437 dst[n] = '\0'; 438 return (dst); 439 } 440 441 /* Get UTF-8 buffer length. */ 442 size_t 443 utf8_strlen(const struct utf8_data *s) 444 { 445 size_t i; 446 447 for (i = 0; s[i].size != 0; i++) 448 /* nothing */; 449 return (i); 450 } 451 452 /* Get UTF-8 string width. */ 453 u_int 454 utf8_strwidth(const struct utf8_data *s, ssize_t n) 455 { 456 ssize_t i; 457 u_int width = 0; 458 459 for (i = 0; s[i].size != 0; i++) { 460 if (n != -1 && n == i) 461 break; 462 width += s[i].width; 463 } 464 return (width); 465 } 466 467 /* 468 * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0. 469 * Caller frees. 470 */ 471 struct utf8_data * 472 utf8_fromcstr(const char *src) 473 { 474 struct utf8_data *dst = NULL; 475 size_t n = 0; 476 enum utf8_state more; 477 478 while (*src != '\0') { 479 dst = xreallocarray(dst, n + 1, sizeof *dst); 480 if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) { 481 while (*++src != '\0' && more == UTF8_MORE) 482 more = utf8_append(&dst[n], *src); 483 if (more == UTF8_DONE) { 484 n++; 485 continue; 486 } 487 src -= dst[n].have; 488 } 489 utf8_set(&dst[n], *src); 490 n++; 491 src++; 492 } 493 dst = xreallocarray(dst, n + 1, sizeof *dst); 494 dst[n].size = 0; 495 return (dst); 496 } 497 498 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */ 499 char * 500 utf8_tocstr(struct utf8_data *src) 501 { 502 char *dst = NULL; 503 size_t n = 0; 504 505 for(; src->size != 0; src++) { 506 dst = xreallocarray(dst, n + src->size, 1); 507 memcpy(dst + n, src->data, src->size); 508 n += src->size; 509 } 510 dst = xreallocarray(dst, n + 1, 1); 511 dst[n] = '\0'; 512 return (dst); 513 } 514 515 /* Get width of UTF-8 string. */ 516 u_int 517 utf8_cstrwidth(const char *s) 518 { 519 struct utf8_data tmp; 520 u_int width; 521 enum utf8_state more; 522 523 width = 0; 524 while (*s != '\0') { 525 if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) { 526 while (*++s != '\0' && more == UTF8_MORE) 527 more = utf8_append(&tmp, *s); 528 if (more == UTF8_DONE) { 529 width += tmp.width; 530 continue; 531 } 532 s -= tmp.have; 533 } 534 if (*s > 0x1f && *s != 0x7f) 535 width++; 536 s++; 537 } 538 return (width); 539 } 540 541 /* Pad UTF-8 string to width on the left. Caller frees. */ 542 char * 543 utf8_padcstr(const char *s, u_int width) 544 { 545 size_t slen; 546 char *out; 547 u_int n, i; 548 549 n = utf8_cstrwidth(s); 550 if (n >= width) 551 return (xstrdup(s)); 552 553 slen = strlen(s); 554 out = xmalloc(slen + 1 + (width - n)); 555 memcpy(out, s, slen); 556 for (i = n; i < width; i++) 557 out[slen++] = ' '; 558 out[slen] = '\0'; 559 return (out); 560 } 561 562 /* Pad UTF-8 string to width on the right. Caller frees. */ 563 char * 564 utf8_rpadcstr(const char *s, u_int width) 565 { 566 size_t slen; 567 char *out; 568 u_int n, i; 569 570 n = utf8_cstrwidth(s); 571 if (n >= width) 572 return (xstrdup(s)); 573 574 slen = strlen(s); 575 out = xmalloc(slen + 1 + (width - n)); 576 for (i = 0; i < width - n; i++) 577 out[i] = ' '; 578 memcpy(out + i, s, slen); 579 out[i + slen] = '\0'; 580 return (out); 581 } 582 583 int 584 utf8_cstrhas(const char *s, const struct utf8_data *ud) 585 { 586 struct utf8_data *copy, *loop; 587 int found = 0; 588 589 copy = utf8_fromcstr(s); 590 for (loop = copy; loop->size != 0; loop++) { 591 if (loop->size != ud->size) 592 continue; 593 if (memcmp(loop->data, ud->data, loop->size) == 0) { 594 found = 1; 595 break; 596 } 597 } 598 free(copy); 599 600 return (found); 601 } 602