1 /* $OpenBSD: utf8.c,v 1.59 2022/12/16 08:19:58 nicm Exp $ */ 2 3 /* 4 * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER 15 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING 16 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/types.h> 20 21 #include <ctype.h> 22 #include <errno.h> 23 #include <stdlib.h> 24 #include <string.h> 25 #include <vis.h> 26 #include <wchar.h> 27 28 #include "tmux.h" 29 30 struct utf8_item { 31 RB_ENTRY(utf8_item) index_entry; 32 u_int index; 33 34 RB_ENTRY(utf8_item) data_entry; 35 char data[UTF8_SIZE]; 36 u_char size; 37 }; 38 39 static int 40 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2) 41 { 42 if (ui1->size < ui2->size) 43 return (-1); 44 if (ui1->size > ui2->size) 45 return (1); 46 return (memcmp(ui1->data, ui2->data, ui1->size)); 47 } 48 RB_HEAD(utf8_data_tree, utf8_item); 49 RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp); 50 static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree); 51 52 static int 53 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2) 54 { 55 if (ui1->index < ui2->index) 56 return (-1); 57 if (ui1->index > ui2->index) 58 return (1); 59 return (0); 60 } 61 RB_HEAD(utf8_index_tree, utf8_item); 62 RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp); 63 static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree); 64 65 static u_int utf8_next_index; 66 67 #define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f) 68 #define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1) 69 70 #define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24) 71 #define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29) 72 73 /* Get a UTF-8 item from data. */ 74 static struct utf8_item * 75 utf8_item_by_data(const char *data, size_t size) 76 { 77 struct utf8_item ui; 78 79 memcpy(ui.data, data, size); 80 ui.size = size; 81 82 return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui)); 83 } 84 85 /* Get a UTF-8 item from data. */ 86 static struct utf8_item * 87 utf8_item_by_index(u_int index) 88 { 89 struct utf8_item ui; 90 91 ui.index = index; 92 93 return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui)); 94 } 95 96 /* Add a UTF-8 item. */ 97 static int 98 utf8_put_item(const char *data, size_t size, u_int *index) 99 { 100 struct utf8_item *ui; 101 102 ui = utf8_item_by_data(data, size); 103 if (ui != NULL) { 104 *index = ui->index; 105 log_debug("%s: found %.*s = %u", __func__, (int)size, data, 106 *index); 107 return (0); 108 } 109 110 if (utf8_next_index == 0xffffff + 1) 111 return (-1); 112 113 ui = xcalloc(1, sizeof *ui); 114 ui->index = utf8_next_index++; 115 RB_INSERT(utf8_index_tree, &utf8_index_tree, ui); 116 117 memcpy(ui->data, data, size); 118 ui->size = size; 119 RB_INSERT(utf8_data_tree, &utf8_data_tree, ui); 120 121 *index = ui->index; 122 log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index); 123 return (0); 124 } 125 126 /* Get UTF-8 character from data. */ 127 enum utf8_state 128 utf8_from_data(const struct utf8_data *ud, utf8_char *uc) 129 { 130 u_int index; 131 132 if (ud->width > 2) 133 fatalx("invalid UTF-8 width: %u", ud->width); 134 135 if (ud->size > UTF8_SIZE) 136 goto fail; 137 if (ud->size <= 3) { 138 index = (((utf8_char)ud->data[2] << 16)| 139 ((utf8_char)ud->data[1] << 8)| 140 ((utf8_char)ud->data[0])); 141 } else if (utf8_put_item(ud->data, ud->size, &index) != 0) 142 goto fail; 143 *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index; 144 log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size, 145 (int)ud->size, ud->data, *uc); 146 return (UTF8_DONE); 147 148 fail: 149 if (ud->width == 0) 150 *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0); 151 else if (ud->width == 1) 152 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20; 153 else 154 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020; 155 return (UTF8_ERROR); 156 } 157 158 /* Get UTF-8 data from character. */ 159 void 160 utf8_to_data(utf8_char uc, struct utf8_data *ud) 161 { 162 struct utf8_item *ui; 163 u_int index; 164 165 memset(ud, 0, sizeof *ud); 166 ud->size = ud->have = UTF8_GET_SIZE(uc); 167 ud->width = UTF8_GET_WIDTH(uc); 168 169 if (ud->size <= 3) { 170 ud->data[2] = (uc >> 16); 171 ud->data[1] = ((uc >> 8) & 0xff); 172 ud->data[0] = (uc & 0xff); 173 } else { 174 index = (uc & 0xffffff); 175 if ((ui = utf8_item_by_index(index)) == NULL) 176 memset(ud->data, ' ', ud->size); 177 else 178 memcpy(ud->data, ui->data, ud->size); 179 } 180 181 log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size, 182 (int)ud->size, ud->data); 183 } 184 185 /* Get UTF-8 character from a single ASCII character. */ 186 u_int 187 utf8_build_one(u_char ch) 188 { 189 return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch); 190 } 191 192 /* Set a single character. */ 193 void 194 utf8_set(struct utf8_data *ud, u_char ch) 195 { 196 static const struct utf8_data empty = { { 0 }, 1, 1, 1 }; 197 198 memcpy(ud, &empty, sizeof *ud); 199 *ud->data = ch; 200 } 201 202 /* Copy UTF-8 character. */ 203 void 204 utf8_copy(struct utf8_data *to, const struct utf8_data *from) 205 { 206 u_int i; 207 208 memcpy(to, from, sizeof *to); 209 210 for (i = to->size; i < sizeof to->data; i++) 211 to->data[i] = '\0'; 212 } 213 214 /* Get width of Unicode character. */ 215 static enum utf8_state 216 utf8_width(struct utf8_data *ud, int *width) 217 { 218 wchar_t wc; 219 220 switch (mbtowc(&wc, ud->data, ud->size)) { 221 case -1: 222 log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data, 223 errno); 224 mbtowc(NULL, NULL, MB_CUR_MAX); 225 return (UTF8_ERROR); 226 case 0: 227 return (UTF8_ERROR); 228 } 229 *width = wcwidth(wc); 230 log_debug("UTF-8 %.*s %#x, wcwidth() %d", (int)ud->size, ud->data, 231 (u_int)wc, *width); 232 if (*width >= 0 && *width <= 0xff) 233 return (UTF8_DONE); 234 return (UTF8_ERROR); 235 } 236 237 /* 238 * Open UTF-8 sequence. 239 * 240 * 11000010-11011111 C2-DF start of 2-byte sequence 241 * 11100000-11101111 E0-EF start of 3-byte sequence 242 * 11110000-11110100 F0-F4 start of 4-byte sequence 243 */ 244 enum utf8_state 245 utf8_open(struct utf8_data *ud, u_char ch) 246 { 247 memset(ud, 0, sizeof *ud); 248 if (ch >= 0xc2 && ch <= 0xdf) 249 ud->size = 2; 250 else if (ch >= 0xe0 && ch <= 0xef) 251 ud->size = 3; 252 else if (ch >= 0xf0 && ch <= 0xf4) 253 ud->size = 4; 254 else 255 return (UTF8_ERROR); 256 utf8_append(ud, ch); 257 return (UTF8_MORE); 258 } 259 260 /* Append character to UTF-8, closing if finished. */ 261 enum utf8_state 262 utf8_append(struct utf8_data *ud, u_char ch) 263 { 264 int width; 265 266 if (ud->have >= ud->size) 267 fatalx("UTF-8 character overflow"); 268 if (ud->size > sizeof ud->data) 269 fatalx("UTF-8 character size too large"); 270 271 if (ud->have != 0 && (ch & 0xc0) != 0x80) 272 ud->width = 0xff; 273 274 ud->data[ud->have++] = ch; 275 if (ud->have != ud->size) 276 return (UTF8_MORE); 277 278 if (ud->width == 0xff) 279 return (UTF8_ERROR); 280 if (utf8_width(ud, &width) != UTF8_DONE) 281 return (UTF8_ERROR); 282 ud->width = width; 283 284 return (UTF8_DONE); 285 } 286 287 /* 288 * Encode len characters from src into dst, which is guaranteed to have four 289 * bytes available for each character from src (for \abc or UTF-8) plus space 290 * for \0. 291 */ 292 int 293 utf8_strvis(char *dst, const char *src, size_t len, int flag) 294 { 295 struct utf8_data ud; 296 const char *start = dst, *end = src + len; 297 enum utf8_state more; 298 size_t i; 299 300 while (src < end) { 301 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { 302 while (++src < end && more == UTF8_MORE) 303 more = utf8_append(&ud, *src); 304 if (more == UTF8_DONE) { 305 /* UTF-8 character finished. */ 306 for (i = 0; i < ud.size; i++) 307 *dst++ = ud.data[i]; 308 continue; 309 } 310 /* Not a complete, valid UTF-8 character. */ 311 src -= ud.have; 312 } 313 if (src[0] == '$' && src < end - 1) { 314 if (isalpha((u_char)src[1]) || 315 src[1] == '_' || 316 src[1] == '{') 317 *dst++ = '\\'; 318 *dst++ = '$'; 319 } else if (src < end - 1) 320 dst = vis(dst, src[0], flag, src[1]); 321 else if (src < end) 322 dst = vis(dst, src[0], flag, '\0'); 323 src++; 324 } 325 *dst = '\0'; 326 return (dst - start); 327 } 328 329 /* Same as utf8_strvis but allocate the buffer. */ 330 int 331 utf8_stravis(char **dst, const char *src, int flag) 332 { 333 char *buf; 334 int len; 335 336 buf = xreallocarray(NULL, 4, strlen(src) + 1); 337 len = utf8_strvis(buf, src, strlen(src), flag); 338 339 *dst = xrealloc(buf, len + 1); 340 return (len); 341 } 342 343 /* Same as utf8_strvis but allocate the buffer. */ 344 int 345 utf8_stravisx(char **dst, const char *src, size_t srclen, int flag) 346 { 347 char *buf; 348 int len; 349 350 buf = xreallocarray(NULL, 4, srclen + 1); 351 len = utf8_strvis(buf, src, srclen, flag); 352 353 *dst = xrealloc(buf, len + 1); 354 return (len); 355 } 356 357 /* Does this string contain anything that isn't valid UTF-8? */ 358 int 359 utf8_isvalid(const char *s) 360 { 361 struct utf8_data ud; 362 const char *end; 363 enum utf8_state more; 364 365 end = s + strlen(s); 366 while (s < end) { 367 if ((more = utf8_open(&ud, *s)) == UTF8_MORE) { 368 while (++s < end && more == UTF8_MORE) 369 more = utf8_append(&ud, *s); 370 if (more == UTF8_DONE) 371 continue; 372 return (0); 373 } 374 if (*s < 0x20 || *s > 0x7e) 375 return (0); 376 s++; 377 } 378 return (1); 379 } 380 381 /* 382 * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free 383 * the returned string. Anything not valid printable ASCII or UTF-8 is 384 * stripped. 385 */ 386 char * 387 utf8_sanitize(const char *src) 388 { 389 char *dst = NULL; 390 size_t n = 0; 391 enum utf8_state more; 392 struct utf8_data ud; 393 u_int i; 394 395 while (*src != '\0') { 396 dst = xreallocarray(dst, n + 1, sizeof *dst); 397 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { 398 while (*++src != '\0' && more == UTF8_MORE) 399 more = utf8_append(&ud, *src); 400 if (more == UTF8_DONE) { 401 dst = xreallocarray(dst, n + ud.width, 402 sizeof *dst); 403 for (i = 0; i < ud.width; i++) 404 dst[n++] = '_'; 405 continue; 406 } 407 src -= ud.have; 408 } 409 if (*src > 0x1f && *src < 0x7f) 410 dst[n++] = *src; 411 else 412 dst[n++] = '_'; 413 src++; 414 } 415 dst = xreallocarray(dst, n + 1, sizeof *dst); 416 dst[n] = '\0'; 417 return (dst); 418 } 419 420 /* Get UTF-8 buffer length. */ 421 size_t 422 utf8_strlen(const struct utf8_data *s) 423 { 424 size_t i; 425 426 for (i = 0; s[i].size != 0; i++) 427 /* nothing */; 428 return (i); 429 } 430 431 /* Get UTF-8 string width. */ 432 u_int 433 utf8_strwidth(const struct utf8_data *s, ssize_t n) 434 { 435 ssize_t i; 436 u_int width = 0; 437 438 for (i = 0; s[i].size != 0; i++) { 439 if (n != -1 && n == i) 440 break; 441 width += s[i].width; 442 } 443 return (width); 444 } 445 446 /* 447 * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0. 448 * Caller frees. 449 */ 450 struct utf8_data * 451 utf8_fromcstr(const char *src) 452 { 453 struct utf8_data *dst = NULL; 454 size_t n = 0; 455 enum utf8_state more; 456 457 while (*src != '\0') { 458 dst = xreallocarray(dst, n + 1, sizeof *dst); 459 if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) { 460 while (*++src != '\0' && more == UTF8_MORE) 461 more = utf8_append(&dst[n], *src); 462 if (more == UTF8_DONE) { 463 n++; 464 continue; 465 } 466 src -= dst[n].have; 467 } 468 utf8_set(&dst[n], *src); 469 n++; 470 src++; 471 } 472 dst = xreallocarray(dst, n + 1, sizeof *dst); 473 dst[n].size = 0; 474 return (dst); 475 } 476 477 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */ 478 char * 479 utf8_tocstr(struct utf8_data *src) 480 { 481 char *dst = NULL; 482 size_t n = 0; 483 484 for(; src->size != 0; src++) { 485 dst = xreallocarray(dst, n + src->size, 1); 486 memcpy(dst + n, src->data, src->size); 487 n += src->size; 488 } 489 dst = xreallocarray(dst, n + 1, 1); 490 dst[n] = '\0'; 491 return (dst); 492 } 493 494 /* Get width of UTF-8 string. */ 495 u_int 496 utf8_cstrwidth(const char *s) 497 { 498 struct utf8_data tmp; 499 u_int width; 500 enum utf8_state more; 501 502 width = 0; 503 while (*s != '\0') { 504 if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) { 505 while (*++s != '\0' && more == UTF8_MORE) 506 more = utf8_append(&tmp, *s); 507 if (more == UTF8_DONE) { 508 width += tmp.width; 509 continue; 510 } 511 s -= tmp.have; 512 } 513 if (*s > 0x1f && *s != 0x7f) 514 width++; 515 s++; 516 } 517 return (width); 518 } 519 520 /* Pad UTF-8 string to width on the left. Caller frees. */ 521 char * 522 utf8_padcstr(const char *s, u_int width) 523 { 524 size_t slen; 525 char *out; 526 u_int n, i; 527 528 n = utf8_cstrwidth(s); 529 if (n >= width) 530 return (xstrdup(s)); 531 532 slen = strlen(s); 533 out = xmalloc(slen + 1 + (width - n)); 534 memcpy(out, s, slen); 535 for (i = n; i < width; i++) 536 out[slen++] = ' '; 537 out[slen] = '\0'; 538 return (out); 539 } 540 541 /* Pad UTF-8 string to width on the right. Caller frees. */ 542 char * 543 utf8_rpadcstr(const char *s, u_int width) 544 { 545 size_t slen; 546 char *out; 547 u_int n, i; 548 549 n = utf8_cstrwidth(s); 550 if (n >= width) 551 return (xstrdup(s)); 552 553 slen = strlen(s); 554 out = xmalloc(slen + 1 + (width - n)); 555 for (i = 0; i < width - n; i++) 556 out[i] = ' '; 557 memcpy(out + i, s, slen); 558 out[i + slen] = '\0'; 559 return (out); 560 } 561 562 int 563 utf8_cstrhas(const char *s, const struct utf8_data *ud) 564 { 565 struct utf8_data *copy, *loop; 566 int found = 0; 567 568 copy = utf8_fromcstr(s); 569 for (loop = copy; loop->size != 0; loop++) { 570 if (loop->size != ud->size) 571 continue; 572 if (memcmp(loop->data, ud->data, loop->size) == 0) { 573 found = 1; 574 break; 575 } 576 } 577 free(copy); 578 579 return (found); 580 } 581