1 /* $OpenBSD$ */ 2 3 /* 4 * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER 15 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING 16 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/types.h> 20 21 #include <ctype.h> 22 #include <errno.h> 23 #include <stdlib.h> 24 #include <string.h> 25 #include <wchar.h> 26 27 #include "tmux.h" 28 29 struct utf8_item { 30 RB_ENTRY(utf8_item) index_entry; 31 u_int index; 32 33 RB_ENTRY(utf8_item) data_entry; 34 char data[UTF8_SIZE]; 35 u_char size; 36 }; 37 38 static int 39 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2) 40 { 41 if (ui1->size < ui2->size) 42 return (-1); 43 if (ui1->size > ui2->size) 44 return (1); 45 return (memcmp(ui1->data, ui2->data, ui1->size)); 46 } 47 RB_HEAD(utf8_data_tree, utf8_item); 48 RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp); 49 static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree); 50 51 static int 52 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2) 53 { 54 if (ui1->index < ui2->index) 55 return (-1); 56 if (ui1->index > ui2->index) 57 return (1); 58 return (0); 59 } 60 RB_HEAD(utf8_index_tree, utf8_item); 61 RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp); 62 static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree); 63 64 static u_int utf8_next_index; 65 66 #define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f) 67 #define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1) 68 69 #define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24) 70 #define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29) 71 72 /* Get a UTF-8 item from data. */ 73 static struct utf8_item * 74 utf8_item_by_data(const char *data, size_t size) 75 { 76 struct utf8_item ui; 77 78 memcpy(ui.data, data, size); 79 ui.size = size; 80 81 return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui)); 82 } 83 84 /* Get a UTF-8 item from data. */ 85 static struct utf8_item * 86 utf8_item_by_index(u_int index) 87 { 88 struct utf8_item ui; 89 90 ui.index = index; 91 92 return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui)); 93 } 94 95 /* Add a UTF-8 item. */ 96 static int 97 utf8_put_item(const u_char *data, size_t size, u_int *index) 98 { 99 struct utf8_item *ui; 100 101 ui = utf8_item_by_data((const char *)data, size); 102 if (ui != NULL) { 103 *index = ui->index; 104 log_debug("%s: found %.*s = %u", __func__, (int)size, data, 105 *index); 106 return (0); 107 } 108 109 if (utf8_next_index == 0xffffff + 1) 110 return (-1); 111 112 ui = xcalloc(1, sizeof *ui); 113 ui->index = utf8_next_index++; 114 RB_INSERT(utf8_index_tree, &utf8_index_tree, ui); 115 116 memcpy(ui->data, data, size); 117 ui->size = size; 118 RB_INSERT(utf8_data_tree, &utf8_data_tree, ui); 119 120 *index = ui->index; 121 log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index); 122 return (0); 123 } 124 125 /* Get UTF-8 character from data. */ 126 enum utf8_state 127 utf8_from_data(const struct utf8_data *ud, utf8_char *uc) 128 { 129 u_int index; 130 131 if (ud->width > 2) 132 fatalx("invalid UTF-8 width: %u", ud->width); 133 134 if (ud->size > UTF8_SIZE) 135 goto fail; 136 if (ud->size <= 3) { 137 index = (((utf8_char)ud->data[2] << 16)| 138 ((utf8_char)ud->data[1] << 8)| 139 ((utf8_char)ud->data[0])); 140 } else if (utf8_put_item(ud->data, ud->size, &index) != 0) 141 goto fail; 142 *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index; 143 log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size, 144 (int)ud->size, ud->data, *uc); 145 return (UTF8_DONE); 146 147 fail: 148 if (ud->width == 0) 149 *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0); 150 else if (ud->width == 1) 151 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20; 152 else 153 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020; 154 return (UTF8_ERROR); 155 } 156 157 /* Get UTF-8 data from character. */ 158 void 159 utf8_to_data(utf8_char uc, struct utf8_data *ud) 160 { 161 struct utf8_item *ui; 162 u_int index; 163 164 memset(ud, 0, sizeof *ud); 165 ud->size = ud->have = UTF8_GET_SIZE(uc); 166 ud->width = UTF8_GET_WIDTH(uc); 167 168 if (ud->size <= 3) { 169 ud->data[2] = (uc >> 16); 170 ud->data[1] = ((uc >> 8) & 0xff); 171 ud->data[0] = (uc & 0xff); 172 } else { 173 index = (uc & 0xffffff); 174 if ((ui = utf8_item_by_index(index)) == NULL) 175 memset(ud->data, ' ', ud->size); 176 else 177 memcpy(ud->data, ui->data, ud->size); 178 } 179 180 log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size, 181 (int)ud->size, ud->data); 182 } 183 184 /* Get UTF-8 character from a single ASCII character. */ 185 u_int 186 utf8_build_one(u_char ch) 187 { 188 return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch); 189 } 190 191 /* Set a single character. */ 192 void 193 utf8_set(struct utf8_data *ud, u_char ch) 194 { 195 static const struct utf8_data empty = { { 0 }, 1, 1, 1 }; 196 197 memcpy(ud, &empty, sizeof *ud); 198 *ud->data = ch; 199 } 200 201 /* Copy UTF-8 character. */ 202 void 203 utf8_copy(struct utf8_data *to, const struct utf8_data *from) 204 { 205 u_int i; 206 207 memcpy(to, from, sizeof *to); 208 209 for (i = to->size; i < sizeof to->data; i++) 210 to->data[i] = '\0'; 211 } 212 213 /* Get width of Unicode character. */ 214 static enum utf8_state 215 utf8_width(struct utf8_data *ud, int *width) 216 { 217 wchar_t wc; 218 219 #ifdef HAVE_UTF8PROC 220 switch (utf8proc_mbtowc(&wc, ud->data, ud->size)) { 221 #else 222 switch (mbtowc(&wc, (char *)ud->data, ud->size)) { 223 #endif 224 case -1: 225 log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data, 226 errno); 227 mbtowc(NULL, NULL, MB_CUR_MAX); 228 return (UTF8_ERROR); 229 case 0: 230 return (UTF8_ERROR); 231 } 232 #ifdef HAVE_UTF8PROC 233 *width = utf8proc_wcwidth(wc); 234 #else 235 *width = wcwidth(wc); 236 #endif 237 if (*width >= 0 && *width <= 0xff) 238 return (UTF8_DONE); 239 log_debug("UTF-8 %.*s, wcwidth() %d", (int)ud->size, ud->data, *width); 240 return (UTF8_ERROR); 241 } 242 243 /* 244 * Open UTF-8 sequence. 245 * 246 * 11000010-11011111 C2-DF start of 2-byte sequence 247 * 11100000-11101111 E0-EF start of 3-byte sequence 248 * 11110000-11110100 F0-F4 start of 4-byte sequence 249 */ 250 enum utf8_state 251 utf8_open(struct utf8_data *ud, u_char ch) 252 { 253 memset(ud, 0, sizeof *ud); 254 if (ch >= 0xc2 && ch <= 0xdf) 255 ud->size = 2; 256 else if (ch >= 0xe0 && ch <= 0xef) 257 ud->size = 3; 258 else if (ch >= 0xf0 && ch <= 0xf4) 259 ud->size = 4; 260 else 261 return (UTF8_ERROR); 262 utf8_append(ud, ch); 263 return (UTF8_MORE); 264 } 265 266 /* Append character to UTF-8, closing if finished. */ 267 enum utf8_state 268 utf8_append(struct utf8_data *ud, u_char ch) 269 { 270 int width; 271 272 if (ud->have >= ud->size) 273 fatalx("UTF-8 character overflow"); 274 if (ud->size > sizeof ud->data) 275 fatalx("UTF-8 character size too large"); 276 277 if (ud->have != 0 && (ch & 0xc0) != 0x80) 278 ud->width = 0xff; 279 280 ud->data[ud->have++] = ch; 281 if (ud->have != ud->size) 282 return (UTF8_MORE); 283 284 if (ud->width == 0xff) 285 return (UTF8_ERROR); 286 if (utf8_width(ud, &width) != UTF8_DONE) 287 return (UTF8_ERROR); 288 ud->width = width; 289 290 return (UTF8_DONE); 291 } 292 293 /* 294 * Encode len characters from src into dst, which is guaranteed to have four 295 * bytes available for each character from src (for \abc or UTF-8) plus space 296 * for \0. 297 */ 298 int 299 utf8_strvis(char *dst, const char *src, size_t len, int flag) 300 { 301 struct utf8_data ud; 302 const char *start = dst, *end = src + len; 303 enum utf8_state more; 304 size_t i; 305 306 while (src < end) { 307 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { 308 while (++src < end && more == UTF8_MORE) 309 more = utf8_append(&ud, *src); 310 if (more == UTF8_DONE) { 311 /* UTF-8 character finished. */ 312 for (i = 0; i < ud.size; i++) 313 *dst++ = ud.data[i]; 314 continue; 315 } 316 /* Not a complete, valid UTF-8 character. */ 317 src -= ud.have; 318 } 319 if (src[0] == '$' && src < end - 1) { 320 if (isalpha((u_char)src[1]) || 321 src[1] == '_' || 322 src[1] == '{') 323 *dst++ = '\\'; 324 *dst++ = '$'; 325 } else if (src < end - 1) 326 dst = vis(dst, src[0], flag, src[1]); 327 else if (src < end) 328 dst = vis(dst, src[0], flag, '\0'); 329 src++; 330 } 331 *dst = '\0'; 332 return (dst - start); 333 } 334 335 /* Same as utf8_strvis but allocate the buffer. */ 336 int 337 utf8_stravis(char **dst, const char *src, int flag) 338 { 339 char *buf; 340 int len; 341 342 buf = xreallocarray(NULL, 4, strlen(src) + 1); 343 len = utf8_strvis(buf, src, strlen(src), flag); 344 345 *dst = xrealloc(buf, len + 1); 346 return (len); 347 } 348 349 /* Same as utf8_strvis but allocate the buffer. */ 350 int 351 utf8_stravisx(char **dst, const char *src, size_t srclen, int flag) 352 { 353 char *buf; 354 int len; 355 356 buf = xreallocarray(NULL, 4, srclen + 1); 357 len = utf8_strvis(buf, src, srclen, flag); 358 359 *dst = xrealloc(buf, len + 1); 360 return (len); 361 } 362 363 /* Does this string contain anything that isn't valid UTF-8? */ 364 int 365 utf8_isvalid(const char *s) 366 { 367 struct utf8_data ud; 368 const char *end; 369 enum utf8_state more; 370 371 end = s + strlen(s); 372 while (s < end) { 373 if ((more = utf8_open(&ud, *s)) == UTF8_MORE) { 374 while (++s < end && more == UTF8_MORE) 375 more = utf8_append(&ud, *s); 376 if (more == UTF8_DONE) 377 continue; 378 return (0); 379 } 380 if (*s < 0x20 || *s > 0x7e) 381 return (0); 382 s++; 383 } 384 return (1); 385 } 386 387 /* 388 * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free 389 * the returned string. Anything not valid printable ASCII or UTF-8 is 390 * stripped. 391 */ 392 char * 393 utf8_sanitize(const char *src) 394 { 395 char *dst = NULL; 396 size_t n = 0; 397 enum utf8_state more; 398 struct utf8_data ud; 399 u_int i; 400 401 while (*src != '\0') { 402 dst = xreallocarray(dst, n + 1, sizeof *dst); 403 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { 404 while (*++src != '\0' && more == UTF8_MORE) 405 more = utf8_append(&ud, *src); 406 if (more == UTF8_DONE) { 407 dst = xreallocarray(dst, n + ud.width, 408 sizeof *dst); 409 for (i = 0; i < ud.width; i++) 410 dst[n++] = '_'; 411 continue; 412 } 413 src -= ud.have; 414 } 415 if (*src > 0x1f && *src < 0x7f) 416 dst[n++] = *src; 417 else 418 dst[n++] = '_'; 419 src++; 420 } 421 dst = xreallocarray(dst, n + 1, sizeof *dst); 422 dst[n] = '\0'; 423 return (dst); 424 } 425 426 /* Get UTF-8 buffer length. */ 427 size_t 428 utf8_strlen(const struct utf8_data *s) 429 { 430 size_t i; 431 432 for (i = 0; s[i].size != 0; i++) 433 /* nothing */; 434 return (i); 435 } 436 437 /* Get UTF-8 string width. */ 438 u_int 439 utf8_strwidth(const struct utf8_data *s, ssize_t n) 440 { 441 ssize_t i; 442 u_int width = 0; 443 444 for (i = 0; s[i].size != 0; i++) { 445 if (n != -1 && n == i) 446 break; 447 width += s[i].width; 448 } 449 return (width); 450 } 451 452 /* 453 * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0. 454 * Caller frees. 455 */ 456 struct utf8_data * 457 utf8_fromcstr(const char *src) 458 { 459 struct utf8_data *dst = NULL; 460 size_t n = 0; 461 enum utf8_state more; 462 463 while (*src != '\0') { 464 dst = xreallocarray(dst, n + 1, sizeof *dst); 465 if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) { 466 while (*++src != '\0' && more == UTF8_MORE) 467 more = utf8_append(&dst[n], *src); 468 if (more == UTF8_DONE) { 469 n++; 470 continue; 471 } 472 src -= dst[n].have; 473 } 474 utf8_set(&dst[n], *src); 475 n++; 476 src++; 477 } 478 dst = xreallocarray(dst, n + 1, sizeof *dst); 479 dst[n].size = 0; 480 return (dst); 481 } 482 483 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */ 484 char * 485 utf8_tocstr(struct utf8_data *src) 486 { 487 char *dst = NULL; 488 size_t n = 0; 489 490 for(; src->size != 0; src++) { 491 dst = xreallocarray(dst, n + src->size, 1); 492 memcpy(dst + n, src->data, src->size); 493 n += src->size; 494 } 495 dst = xreallocarray(dst, n + 1, 1); 496 dst[n] = '\0'; 497 return (dst); 498 } 499 500 /* Get width of UTF-8 string. */ 501 u_int 502 utf8_cstrwidth(const char *s) 503 { 504 struct utf8_data tmp; 505 u_int width; 506 enum utf8_state more; 507 508 width = 0; 509 while (*s != '\0') { 510 if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) { 511 while (*++s != '\0' && more == UTF8_MORE) 512 more = utf8_append(&tmp, *s); 513 if (more == UTF8_DONE) { 514 width += tmp.width; 515 continue; 516 } 517 s -= tmp.have; 518 } 519 if (*s > 0x1f && *s != 0x7f) 520 width++; 521 s++; 522 } 523 return (width); 524 } 525 526 /* Pad UTF-8 string to width on the left. Caller frees. */ 527 char * 528 utf8_padcstr(const char *s, u_int width) 529 { 530 size_t slen; 531 char *out; 532 u_int n, i; 533 534 n = utf8_cstrwidth(s); 535 if (n >= width) 536 return (xstrdup(s)); 537 538 slen = strlen(s); 539 out = xmalloc(slen + 1 + (width - n)); 540 memcpy(out, s, slen); 541 for (i = n; i < width; i++) 542 out[slen++] = ' '; 543 out[slen] = '\0'; 544 return (out); 545 } 546 547 /* Pad UTF-8 string to width on the right. Caller frees. */ 548 char * 549 utf8_rpadcstr(const char *s, u_int width) 550 { 551 size_t slen; 552 char *out; 553 u_int n, i; 554 555 n = utf8_cstrwidth(s); 556 if (n >= width) 557 return (xstrdup(s)); 558 559 slen = strlen(s); 560 out = xmalloc(slen + 1 + (width - n)); 561 for (i = 0; i < width - n; i++) 562 out[i] = ' '; 563 memcpy(out + i, s, slen); 564 out[i + slen] = '\0'; 565 return (out); 566 } 567 568 int 569 utf8_cstrhas(const char *s, const struct utf8_data *ud) 570 { 571 struct utf8_data *copy, *loop; 572 int found = 0; 573 574 copy = utf8_fromcstr(s); 575 for (loop = copy; loop->size != 0; loop++) { 576 if (loop->size != ud->size) 577 continue; 578 if (memcmp(loop->data, ud->data, loop->size) == 0) { 579 found = 1; 580 break; 581 } 582 } 583 free(copy); 584 585 return (found); 586 } 587