1 /* $OpenBSD: utf8.c,v 1.60 2023/01/08 22:15:30 nicm Exp $ */ 2 3 /* 4 * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER 15 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING 16 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/types.h> 20 21 #include <ctype.h> 22 #include <errno.h> 23 #include <stdlib.h> 24 #include <string.h> 25 #include <vis.h> 26 #include <wchar.h> 27 28 #include "tmux.h" 29 30 struct utf8_item { 31 RB_ENTRY(utf8_item) index_entry; 32 u_int index; 33 34 RB_ENTRY(utf8_item) data_entry; 35 char data[UTF8_SIZE]; 36 u_char size; 37 }; 38 39 static int 40 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2) 41 { 42 if (ui1->size < ui2->size) 43 return (-1); 44 if (ui1->size > ui2->size) 45 return (1); 46 return (memcmp(ui1->data, ui2->data, ui1->size)); 47 } 48 RB_HEAD(utf8_data_tree, utf8_item); 49 RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp); 50 static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree); 51 52 static int 53 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2) 54 { 55 if (ui1->index < ui2->index) 56 return (-1); 57 if (ui1->index > ui2->index) 58 return (1); 59 return (0); 60 } 61 RB_HEAD(utf8_index_tree, utf8_item); 62 RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp); 63 static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree); 64 65 static u_int utf8_next_index; 66 67 #define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f) 68 #define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1) 69 70 #define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24) 71 #define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29) 72 73 /* Get a UTF-8 item from data. */ 74 static struct utf8_item * 75 utf8_item_by_data(const char *data, size_t size) 76 { 77 struct utf8_item ui; 78 79 memcpy(ui.data, data, size); 80 ui.size = size; 81 82 return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui)); 83 } 84 85 /* Get a UTF-8 item from data. */ 86 static struct utf8_item * 87 utf8_item_by_index(u_int index) 88 { 89 struct utf8_item ui; 90 91 ui.index = index; 92 93 return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui)); 94 } 95 96 /* Add a UTF-8 item. */ 97 static int 98 utf8_put_item(const char *data, size_t size, u_int *index) 99 { 100 struct utf8_item *ui; 101 102 ui = utf8_item_by_data(data, size); 103 if (ui != NULL) { 104 *index = ui->index; 105 log_debug("%s: found %.*s = %u", __func__, (int)size, data, 106 *index); 107 return (0); 108 } 109 110 if (utf8_next_index == 0xffffff + 1) 111 return (-1); 112 113 ui = xcalloc(1, sizeof *ui); 114 ui->index = utf8_next_index++; 115 RB_INSERT(utf8_index_tree, &utf8_index_tree, ui); 116 117 memcpy(ui->data, data, size); 118 ui->size = size; 119 RB_INSERT(utf8_data_tree, &utf8_data_tree, ui); 120 121 *index = ui->index; 122 log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index); 123 return (0); 124 } 125 126 /* Get UTF-8 character from data. */ 127 enum utf8_state 128 utf8_from_data(const struct utf8_data *ud, utf8_char *uc) 129 { 130 u_int index; 131 132 if (ud->width > 2) 133 fatalx("invalid UTF-8 width: %u", ud->width); 134 135 if (ud->size > UTF8_SIZE) 136 goto fail; 137 if (ud->size <= 3) { 138 index = (((utf8_char)ud->data[2] << 16)| 139 ((utf8_char)ud->data[1] << 8)| 140 ((utf8_char)ud->data[0])); 141 } else if (utf8_put_item(ud->data, ud->size, &index) != 0) 142 goto fail; 143 *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index; 144 log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size, 145 (int)ud->size, ud->data, *uc); 146 return (UTF8_DONE); 147 148 fail: 149 if (ud->width == 0) 150 *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0); 151 else if (ud->width == 1) 152 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20; 153 else 154 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020; 155 return (UTF8_ERROR); 156 } 157 158 /* Get UTF-8 data from character. */ 159 void 160 utf8_to_data(utf8_char uc, struct utf8_data *ud) 161 { 162 struct utf8_item *ui; 163 u_int index; 164 165 memset(ud, 0, sizeof *ud); 166 ud->size = ud->have = UTF8_GET_SIZE(uc); 167 ud->width = UTF8_GET_WIDTH(uc); 168 169 if (ud->size <= 3) { 170 ud->data[2] = (uc >> 16); 171 ud->data[1] = ((uc >> 8) & 0xff); 172 ud->data[0] = (uc & 0xff); 173 } else { 174 index = (uc & 0xffffff); 175 if ((ui = utf8_item_by_index(index)) == NULL) 176 memset(ud->data, ' ', ud->size); 177 else 178 memcpy(ud->data, ui->data, ud->size); 179 } 180 181 log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size, 182 (int)ud->size, ud->data); 183 } 184 185 /* Get UTF-8 character from a single ASCII character. */ 186 u_int 187 utf8_build_one(u_char ch) 188 { 189 return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch); 190 } 191 192 /* Set a single character. */ 193 void 194 utf8_set(struct utf8_data *ud, u_char ch) 195 { 196 static const struct utf8_data empty = { { 0 }, 1, 1, 1 }; 197 198 memcpy(ud, &empty, sizeof *ud); 199 *ud->data = ch; 200 } 201 202 /* Copy UTF-8 character. */ 203 void 204 utf8_copy(struct utf8_data *to, const struct utf8_data *from) 205 { 206 u_int i; 207 208 memcpy(to, from, sizeof *to); 209 210 for (i = to->size; i < sizeof to->data; i++) 211 to->data[i] = '\0'; 212 } 213 214 /* Get width of Unicode character. */ 215 static enum utf8_state 216 utf8_width(struct utf8_data *ud, int *width) 217 { 218 wchar_t wc; 219 220 switch (mbtowc(&wc, ud->data, ud->size)) { 221 case -1: 222 log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data, 223 errno); 224 mbtowc(NULL, NULL, MB_CUR_MAX); 225 return (UTF8_ERROR); 226 case 0: 227 return (UTF8_ERROR); 228 } 229 log_debug("UTF-8 %.*s is %08X", (int)ud->size, ud->data, (u_int)wc); 230 *width = wcwidth(wc); 231 log_debug("wcwidth(%08X) returned %d", (u_int)wc, *width); 232 if (*width < 0) { 233 /* 234 * C1 control characters are nonprintable, so they are always 235 * zero width. 236 */ 237 *width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1; 238 } 239 if (*width >= 0 && *width <= 0xff) 240 return (UTF8_DONE); 241 return (UTF8_ERROR); 242 } 243 244 /* 245 * Open UTF-8 sequence. 246 * 247 * 11000010-11011111 C2-DF start of 2-byte sequence 248 * 11100000-11101111 E0-EF start of 3-byte sequence 249 * 11110000-11110100 F0-F4 start of 4-byte sequence 250 */ 251 enum utf8_state 252 utf8_open(struct utf8_data *ud, u_char ch) 253 { 254 memset(ud, 0, sizeof *ud); 255 if (ch >= 0xc2 && ch <= 0xdf) 256 ud->size = 2; 257 else if (ch >= 0xe0 && ch <= 0xef) 258 ud->size = 3; 259 else if (ch >= 0xf0 && ch <= 0xf4) 260 ud->size = 4; 261 else 262 return (UTF8_ERROR); 263 utf8_append(ud, ch); 264 return (UTF8_MORE); 265 } 266 267 /* Append character to UTF-8, closing if finished. */ 268 enum utf8_state 269 utf8_append(struct utf8_data *ud, u_char ch) 270 { 271 int width; 272 273 if (ud->have >= ud->size) 274 fatalx("UTF-8 character overflow"); 275 if (ud->size > sizeof ud->data) 276 fatalx("UTF-8 character size too large"); 277 278 if (ud->have != 0 && (ch & 0xc0) != 0x80) 279 ud->width = 0xff; 280 281 ud->data[ud->have++] = ch; 282 if (ud->have != ud->size) 283 return (UTF8_MORE); 284 285 if (ud->width == 0xff) 286 return (UTF8_ERROR); 287 if (utf8_width(ud, &width) != UTF8_DONE) 288 return (UTF8_ERROR); 289 ud->width = width; 290 291 return (UTF8_DONE); 292 } 293 294 /* 295 * Encode len characters from src into dst, which is guaranteed to have four 296 * bytes available for each character from src (for \abc or UTF-8) plus space 297 * for \0. 298 */ 299 int 300 utf8_strvis(char *dst, const char *src, size_t len, int flag) 301 { 302 struct utf8_data ud; 303 const char *start = dst, *end = src + len; 304 enum utf8_state more; 305 size_t i; 306 307 while (src < end) { 308 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { 309 while (++src < end && more == UTF8_MORE) 310 more = utf8_append(&ud, *src); 311 if (more == UTF8_DONE) { 312 /* UTF-8 character finished. */ 313 for (i = 0; i < ud.size; i++) 314 *dst++ = ud.data[i]; 315 continue; 316 } 317 /* Not a complete, valid UTF-8 character. */ 318 src -= ud.have; 319 } 320 if (src[0] == '$' && src < end - 1) { 321 if (isalpha((u_char)src[1]) || 322 src[1] == '_' || 323 src[1] == '{') 324 *dst++ = '\\'; 325 *dst++ = '$'; 326 } else if (src < end - 1) 327 dst = vis(dst, src[0], flag, src[1]); 328 else if (src < end) 329 dst = vis(dst, src[0], flag, '\0'); 330 src++; 331 } 332 *dst = '\0'; 333 return (dst - start); 334 } 335 336 /* Same as utf8_strvis but allocate the buffer. */ 337 int 338 utf8_stravis(char **dst, const char *src, int flag) 339 { 340 char *buf; 341 int len; 342 343 buf = xreallocarray(NULL, 4, strlen(src) + 1); 344 len = utf8_strvis(buf, src, strlen(src), flag); 345 346 *dst = xrealloc(buf, len + 1); 347 return (len); 348 } 349 350 /* Same as utf8_strvis but allocate the buffer. */ 351 int 352 utf8_stravisx(char **dst, const char *src, size_t srclen, int flag) 353 { 354 char *buf; 355 int len; 356 357 buf = xreallocarray(NULL, 4, srclen + 1); 358 len = utf8_strvis(buf, src, srclen, flag); 359 360 *dst = xrealloc(buf, len + 1); 361 return (len); 362 } 363 364 /* Does this string contain anything that isn't valid UTF-8? */ 365 int 366 utf8_isvalid(const char *s) 367 { 368 struct utf8_data ud; 369 const char *end; 370 enum utf8_state more; 371 372 end = s + strlen(s); 373 while (s < end) { 374 if ((more = utf8_open(&ud, *s)) == UTF8_MORE) { 375 while (++s < end && more == UTF8_MORE) 376 more = utf8_append(&ud, *s); 377 if (more == UTF8_DONE) 378 continue; 379 return (0); 380 } 381 if (*s < 0x20 || *s > 0x7e) 382 return (0); 383 s++; 384 } 385 return (1); 386 } 387 388 /* 389 * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free 390 * the returned string. Anything not valid printable ASCII or UTF-8 is 391 * stripped. 392 */ 393 char * 394 utf8_sanitize(const char *src) 395 { 396 char *dst = NULL; 397 size_t n = 0; 398 enum utf8_state more; 399 struct utf8_data ud; 400 u_int i; 401 402 while (*src != '\0') { 403 dst = xreallocarray(dst, n + 1, sizeof *dst); 404 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { 405 while (*++src != '\0' && more == UTF8_MORE) 406 more = utf8_append(&ud, *src); 407 if (more == UTF8_DONE) { 408 dst = xreallocarray(dst, n + ud.width, 409 sizeof *dst); 410 for (i = 0; i < ud.width; i++) 411 dst[n++] = '_'; 412 continue; 413 } 414 src -= ud.have; 415 } 416 if (*src > 0x1f && *src < 0x7f) 417 dst[n++] = *src; 418 else 419 dst[n++] = '_'; 420 src++; 421 } 422 dst = xreallocarray(dst, n + 1, sizeof *dst); 423 dst[n] = '\0'; 424 return (dst); 425 } 426 427 /* Get UTF-8 buffer length. */ 428 size_t 429 utf8_strlen(const struct utf8_data *s) 430 { 431 size_t i; 432 433 for (i = 0; s[i].size != 0; i++) 434 /* nothing */; 435 return (i); 436 } 437 438 /* Get UTF-8 string width. */ 439 u_int 440 utf8_strwidth(const struct utf8_data *s, ssize_t n) 441 { 442 ssize_t i; 443 u_int width = 0; 444 445 for (i = 0; s[i].size != 0; i++) { 446 if (n != -1 && n == i) 447 break; 448 width += s[i].width; 449 } 450 return (width); 451 } 452 453 /* 454 * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0. 455 * Caller frees. 456 */ 457 struct utf8_data * 458 utf8_fromcstr(const char *src) 459 { 460 struct utf8_data *dst = NULL; 461 size_t n = 0; 462 enum utf8_state more; 463 464 while (*src != '\0') { 465 dst = xreallocarray(dst, n + 1, sizeof *dst); 466 if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) { 467 while (*++src != '\0' && more == UTF8_MORE) 468 more = utf8_append(&dst[n], *src); 469 if (more == UTF8_DONE) { 470 n++; 471 continue; 472 } 473 src -= dst[n].have; 474 } 475 utf8_set(&dst[n], *src); 476 n++; 477 src++; 478 } 479 dst = xreallocarray(dst, n + 1, sizeof *dst); 480 dst[n].size = 0; 481 return (dst); 482 } 483 484 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */ 485 char * 486 utf8_tocstr(struct utf8_data *src) 487 { 488 char *dst = NULL; 489 size_t n = 0; 490 491 for(; src->size != 0; src++) { 492 dst = xreallocarray(dst, n + src->size, 1); 493 memcpy(dst + n, src->data, src->size); 494 n += src->size; 495 } 496 dst = xreallocarray(dst, n + 1, 1); 497 dst[n] = '\0'; 498 return (dst); 499 } 500 501 /* Get width of UTF-8 string. */ 502 u_int 503 utf8_cstrwidth(const char *s) 504 { 505 struct utf8_data tmp; 506 u_int width; 507 enum utf8_state more; 508 509 width = 0; 510 while (*s != '\0') { 511 if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) { 512 while (*++s != '\0' && more == UTF8_MORE) 513 more = utf8_append(&tmp, *s); 514 if (more == UTF8_DONE) { 515 width += tmp.width; 516 continue; 517 } 518 s -= tmp.have; 519 } 520 if (*s > 0x1f && *s != 0x7f) 521 width++; 522 s++; 523 } 524 return (width); 525 } 526 527 /* Pad UTF-8 string to width on the left. Caller frees. */ 528 char * 529 utf8_padcstr(const char *s, u_int width) 530 { 531 size_t slen; 532 char *out; 533 u_int n, i; 534 535 n = utf8_cstrwidth(s); 536 if (n >= width) 537 return (xstrdup(s)); 538 539 slen = strlen(s); 540 out = xmalloc(slen + 1 + (width - n)); 541 memcpy(out, s, slen); 542 for (i = n; i < width; i++) 543 out[slen++] = ' '; 544 out[slen] = '\0'; 545 return (out); 546 } 547 548 /* Pad UTF-8 string to width on the right. Caller frees. */ 549 char * 550 utf8_rpadcstr(const char *s, u_int width) 551 { 552 size_t slen; 553 char *out; 554 u_int n, i; 555 556 n = utf8_cstrwidth(s); 557 if (n >= width) 558 return (xstrdup(s)); 559 560 slen = strlen(s); 561 out = xmalloc(slen + 1 + (width - n)); 562 for (i = 0; i < width - n; i++) 563 out[i] = ' '; 564 memcpy(out + i, s, slen); 565 out[i + slen] = '\0'; 566 return (out); 567 } 568 569 int 570 utf8_cstrhas(const char *s, const struct utf8_data *ud) 571 { 572 struct utf8_data *copy, *loop; 573 int found = 0; 574 575 copy = utf8_fromcstr(s); 576 for (loop = copy; loop->size != 0; loop++) { 577 if (loop->size != ud->size) 578 continue; 579 if (memcmp(loop->data, ud->data, loop->size) == 0) { 580 found = 1; 581 break; 582 } 583 } 584 free(copy); 585 586 return (found); 587 } 588