1 /* $OpenBSD: utf8.c,v 1.58 2021/06/10 07:56:47 nicm Exp $ */ 2 3 /* 4 * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER 15 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING 16 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/types.h> 20 21 #include <ctype.h> 22 #include <errno.h> 23 #include <stdlib.h> 24 #include <string.h> 25 #include <vis.h> 26 #include <wchar.h> 27 28 #include "tmux.h" 29 30 struct utf8_item { 31 RB_ENTRY(utf8_item) index_entry; 32 u_int index; 33 34 RB_ENTRY(utf8_item) data_entry; 35 char data[UTF8_SIZE]; 36 u_char size; 37 }; 38 39 static int 40 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2) 41 { 42 if (ui1->size < ui2->size) 43 return (-1); 44 if (ui1->size > ui2->size) 45 return (1); 46 return (memcmp(ui1->data, ui2->data, ui1->size)); 47 } 48 RB_HEAD(utf8_data_tree, utf8_item); 49 RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp); 50 static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree); 51 52 static int 53 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2) 54 { 55 if (ui1->index < ui2->index) 56 return (-1); 57 if (ui1->index > ui2->index) 58 return (1); 59 return (0); 60 } 61 RB_HEAD(utf8_index_tree, utf8_item); 62 RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp); 63 static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree); 64 65 static u_int utf8_next_index; 66 67 #define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f) 68 #define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1) 69 70 #define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24) 71 #define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29) 72 73 /* Get a UTF-8 item from data. */ 74 static struct utf8_item * 75 utf8_item_by_data(const char *data, size_t size) 76 { 77 struct utf8_item ui; 78 79 memcpy(ui.data, data, size); 80 ui.size = size; 81 82 return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui)); 83 } 84 85 /* Get a UTF-8 item from data. */ 86 static struct utf8_item * 87 utf8_item_by_index(u_int index) 88 { 89 struct utf8_item ui; 90 91 ui.index = index; 92 93 return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui)); 94 } 95 96 /* Add a UTF-8 item. */ 97 static int 98 utf8_put_item(const char *data, size_t size, u_int *index) 99 { 100 struct utf8_item *ui; 101 102 ui = utf8_item_by_data(data, size); 103 if (ui != NULL) { 104 *index = ui->index; 105 log_debug("%s: found %.*s = %u", __func__, (int)size, data, 106 *index); 107 return (0); 108 } 109 110 if (utf8_next_index == 0xffffff + 1) 111 return (-1); 112 113 ui = xcalloc(1, sizeof *ui); 114 ui->index = utf8_next_index++; 115 RB_INSERT(utf8_index_tree, &utf8_index_tree, ui); 116 117 memcpy(ui->data, data, size); 118 ui->size = size; 119 RB_INSERT(utf8_data_tree, &utf8_data_tree, ui); 120 121 *index = ui->index; 122 log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index); 123 return (0); 124 } 125 126 /* Get UTF-8 character from data. */ 127 enum utf8_state 128 utf8_from_data(const struct utf8_data *ud, utf8_char *uc) 129 { 130 u_int index; 131 132 if (ud->width > 2) 133 fatalx("invalid UTF-8 width: %u", ud->width); 134 135 if (ud->size > UTF8_SIZE) 136 goto fail; 137 if (ud->size <= 3) { 138 index = (((utf8_char)ud->data[2] << 16)| 139 ((utf8_char)ud->data[1] << 8)| 140 ((utf8_char)ud->data[0])); 141 } else if (utf8_put_item(ud->data, ud->size, &index) != 0) 142 goto fail; 143 *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index; 144 log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size, 145 (int)ud->size, ud->data, *uc); 146 return (UTF8_DONE); 147 148 fail: 149 if (ud->width == 0) 150 *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0); 151 else if (ud->width == 1) 152 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20; 153 else 154 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020; 155 return (UTF8_ERROR); 156 } 157 158 /* Get UTF-8 data from character. */ 159 void 160 utf8_to_data(utf8_char uc, struct utf8_data *ud) 161 { 162 struct utf8_item *ui; 163 u_int index; 164 165 memset(ud, 0, sizeof *ud); 166 ud->size = ud->have = UTF8_GET_SIZE(uc); 167 ud->width = UTF8_GET_WIDTH(uc); 168 169 if (ud->size <= 3) { 170 ud->data[2] = (uc >> 16); 171 ud->data[1] = ((uc >> 8) & 0xff); 172 ud->data[0] = (uc & 0xff); 173 } else { 174 index = (uc & 0xffffff); 175 if ((ui = utf8_item_by_index(index)) == NULL) 176 memset(ud->data, ' ', ud->size); 177 else 178 memcpy(ud->data, ui->data, ud->size); 179 } 180 181 log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size, 182 (int)ud->size, ud->data); 183 } 184 185 /* Get UTF-8 character from a single ASCII character. */ 186 u_int 187 utf8_build_one(u_char ch) 188 { 189 return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch); 190 } 191 192 /* Set a single character. */ 193 void 194 utf8_set(struct utf8_data *ud, u_char ch) 195 { 196 static const struct utf8_data empty = { { 0 }, 1, 1, 1 }; 197 198 memcpy(ud, &empty, sizeof *ud); 199 *ud->data = ch; 200 } 201 202 /* Copy UTF-8 character. */ 203 void 204 utf8_copy(struct utf8_data *to, const struct utf8_data *from) 205 { 206 u_int i; 207 208 memcpy(to, from, sizeof *to); 209 210 for (i = to->size; i < sizeof to->data; i++) 211 to->data[i] = '\0'; 212 } 213 214 /* Get width of Unicode character. */ 215 static enum utf8_state 216 utf8_width(struct utf8_data *ud, int *width) 217 { 218 wchar_t wc; 219 220 switch (mbtowc(&wc, ud->data, ud->size)) { 221 case -1: 222 log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data, 223 errno); 224 mbtowc(NULL, NULL, MB_CUR_MAX); 225 return (UTF8_ERROR); 226 case 0: 227 return (UTF8_ERROR); 228 } 229 *width = wcwidth(wc); 230 if (*width < 0 || *width > 0xff) { 231 log_debug("UTF-8 %.*s, wcwidth() %d", (int)ud->size, ud->data, 232 *width); 233 return (UTF8_ERROR); 234 } 235 return (UTF8_DONE); 236 } 237 238 /* 239 * Open UTF-8 sequence. 240 * 241 * 11000010-11011111 C2-DF start of 2-byte sequence 242 * 11100000-11101111 E0-EF start of 3-byte sequence 243 * 11110000-11110100 F0-F4 start of 4-byte sequence 244 */ 245 enum utf8_state 246 utf8_open(struct utf8_data *ud, u_char ch) 247 { 248 memset(ud, 0, sizeof *ud); 249 if (ch >= 0xc2 && ch <= 0xdf) 250 ud->size = 2; 251 else if (ch >= 0xe0 && ch <= 0xef) 252 ud->size = 3; 253 else if (ch >= 0xf0 && ch <= 0xf4) 254 ud->size = 4; 255 else 256 return (UTF8_ERROR); 257 utf8_append(ud, ch); 258 return (UTF8_MORE); 259 } 260 261 /* Append character to UTF-8, closing if finished. */ 262 enum utf8_state 263 utf8_append(struct utf8_data *ud, u_char ch) 264 { 265 int width; 266 267 if (ud->have >= ud->size) 268 fatalx("UTF-8 character overflow"); 269 if (ud->size > sizeof ud->data) 270 fatalx("UTF-8 character size too large"); 271 272 if (ud->have != 0 && (ch & 0xc0) != 0x80) 273 ud->width = 0xff; 274 275 ud->data[ud->have++] = ch; 276 if (ud->have != ud->size) 277 return (UTF8_MORE); 278 279 if (ud->width == 0xff) 280 return (UTF8_ERROR); 281 if (utf8_width(ud, &width) != UTF8_DONE) 282 return (UTF8_ERROR); 283 ud->width = width; 284 285 return (UTF8_DONE); 286 } 287 288 /* 289 * Encode len characters from src into dst, which is guaranteed to have four 290 * bytes available for each character from src (for \abc or UTF-8) plus space 291 * for \0. 292 */ 293 int 294 utf8_strvis(char *dst, const char *src, size_t len, int flag) 295 { 296 struct utf8_data ud; 297 const char *start = dst, *end = src + len; 298 enum utf8_state more; 299 size_t i; 300 301 while (src < end) { 302 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { 303 while (++src < end && more == UTF8_MORE) 304 more = utf8_append(&ud, *src); 305 if (more == UTF8_DONE) { 306 /* UTF-8 character finished. */ 307 for (i = 0; i < ud.size; i++) 308 *dst++ = ud.data[i]; 309 continue; 310 } 311 /* Not a complete, valid UTF-8 character. */ 312 src -= ud.have; 313 } 314 if (src[0] == '$' && src < end - 1) { 315 if (isalpha((u_char)src[1]) || 316 src[1] == '_' || 317 src[1] == '{') 318 *dst++ = '\\'; 319 *dst++ = '$'; 320 } else if (src < end - 1) 321 dst = vis(dst, src[0], flag, src[1]); 322 else if (src < end) 323 dst = vis(dst, src[0], flag, '\0'); 324 src++; 325 } 326 *dst = '\0'; 327 return (dst - start); 328 } 329 330 /* Same as utf8_strvis but allocate the buffer. */ 331 int 332 utf8_stravis(char **dst, const char *src, int flag) 333 { 334 char *buf; 335 int len; 336 337 buf = xreallocarray(NULL, 4, strlen(src) + 1); 338 len = utf8_strvis(buf, src, strlen(src), flag); 339 340 *dst = xrealloc(buf, len + 1); 341 return (len); 342 } 343 344 /* Same as utf8_strvis but allocate the buffer. */ 345 int 346 utf8_stravisx(char **dst, const char *src, size_t srclen, int flag) 347 { 348 char *buf; 349 int len; 350 351 buf = xreallocarray(NULL, 4, srclen + 1); 352 len = utf8_strvis(buf, src, srclen, flag); 353 354 *dst = xrealloc(buf, len + 1); 355 return (len); 356 } 357 358 /* Does this string contain anything that isn't valid UTF-8? */ 359 int 360 utf8_isvalid(const char *s) 361 { 362 struct utf8_data ud; 363 const char *end; 364 enum utf8_state more; 365 366 end = s + strlen(s); 367 while (s < end) { 368 if ((more = utf8_open(&ud, *s)) == UTF8_MORE) { 369 while (++s < end && more == UTF8_MORE) 370 more = utf8_append(&ud, *s); 371 if (more == UTF8_DONE) 372 continue; 373 return (0); 374 } 375 if (*s < 0x20 || *s > 0x7e) 376 return (0); 377 s++; 378 } 379 return (1); 380 } 381 382 /* 383 * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free 384 * the returned string. Anything not valid printable ASCII or UTF-8 is 385 * stripped. 386 */ 387 char * 388 utf8_sanitize(const char *src) 389 { 390 char *dst = NULL; 391 size_t n = 0; 392 enum utf8_state more; 393 struct utf8_data ud; 394 u_int i; 395 396 while (*src != '\0') { 397 dst = xreallocarray(dst, n + 1, sizeof *dst); 398 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { 399 while (*++src != '\0' && more == UTF8_MORE) 400 more = utf8_append(&ud, *src); 401 if (more == UTF8_DONE) { 402 dst = xreallocarray(dst, n + ud.width, 403 sizeof *dst); 404 for (i = 0; i < ud.width; i++) 405 dst[n++] = '_'; 406 continue; 407 } 408 src -= ud.have; 409 } 410 if (*src > 0x1f && *src < 0x7f) 411 dst[n++] = *src; 412 else 413 dst[n++] = '_'; 414 src++; 415 } 416 dst = xreallocarray(dst, n + 1, sizeof *dst); 417 dst[n] = '\0'; 418 return (dst); 419 } 420 421 /* Get UTF-8 buffer length. */ 422 size_t 423 utf8_strlen(const struct utf8_data *s) 424 { 425 size_t i; 426 427 for (i = 0; s[i].size != 0; i++) 428 /* nothing */; 429 return (i); 430 } 431 432 /* Get UTF-8 string width. */ 433 u_int 434 utf8_strwidth(const struct utf8_data *s, ssize_t n) 435 { 436 ssize_t i; 437 u_int width = 0; 438 439 for (i = 0; s[i].size != 0; i++) { 440 if (n != -1 && n == i) 441 break; 442 width += s[i].width; 443 } 444 return (width); 445 } 446 447 /* 448 * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0. 449 * Caller frees. 450 */ 451 struct utf8_data * 452 utf8_fromcstr(const char *src) 453 { 454 struct utf8_data *dst = NULL; 455 size_t n = 0; 456 enum utf8_state more; 457 458 while (*src != '\0') { 459 dst = xreallocarray(dst, n + 1, sizeof *dst); 460 if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) { 461 while (*++src != '\0' && more == UTF8_MORE) 462 more = utf8_append(&dst[n], *src); 463 if (more == UTF8_DONE) { 464 n++; 465 continue; 466 } 467 src -= dst[n].have; 468 } 469 utf8_set(&dst[n], *src); 470 n++; 471 src++; 472 } 473 dst = xreallocarray(dst, n + 1, sizeof *dst); 474 dst[n].size = 0; 475 return (dst); 476 } 477 478 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */ 479 char * 480 utf8_tocstr(struct utf8_data *src) 481 { 482 char *dst = NULL; 483 size_t n = 0; 484 485 for(; src->size != 0; src++) { 486 dst = xreallocarray(dst, n + src->size, 1); 487 memcpy(dst + n, src->data, src->size); 488 n += src->size; 489 } 490 dst = xreallocarray(dst, n + 1, 1); 491 dst[n] = '\0'; 492 return (dst); 493 } 494 495 /* Get width of UTF-8 string. */ 496 u_int 497 utf8_cstrwidth(const char *s) 498 { 499 struct utf8_data tmp; 500 u_int width; 501 enum utf8_state more; 502 503 width = 0; 504 while (*s != '\0') { 505 if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) { 506 while (*++s != '\0' && more == UTF8_MORE) 507 more = utf8_append(&tmp, *s); 508 if (more == UTF8_DONE) { 509 width += tmp.width; 510 continue; 511 } 512 s -= tmp.have; 513 } 514 if (*s > 0x1f && *s != 0x7f) 515 width++; 516 s++; 517 } 518 return (width); 519 } 520 521 /* Pad UTF-8 string to width on the left. Caller frees. */ 522 char * 523 utf8_padcstr(const char *s, u_int width) 524 { 525 size_t slen; 526 char *out; 527 u_int n, i; 528 529 n = utf8_cstrwidth(s); 530 if (n >= width) 531 return (xstrdup(s)); 532 533 slen = strlen(s); 534 out = xmalloc(slen + 1 + (width - n)); 535 memcpy(out, s, slen); 536 for (i = n; i < width; i++) 537 out[slen++] = ' '; 538 out[slen] = '\0'; 539 return (out); 540 } 541 542 /* Pad UTF-8 string to width on the right. Caller frees. */ 543 char * 544 utf8_rpadcstr(const char *s, u_int width) 545 { 546 size_t slen; 547 char *out; 548 u_int n, i; 549 550 n = utf8_cstrwidth(s); 551 if (n >= width) 552 return (xstrdup(s)); 553 554 slen = strlen(s); 555 out = xmalloc(slen + 1 + (width - n)); 556 for (i = 0; i < width - n; i++) 557 out[i] = ' '; 558 memcpy(out + i, s, slen); 559 out[i + slen] = '\0'; 560 return (out); 561 } 562 563 int 564 utf8_cstrhas(const char *s, const struct utf8_data *ud) 565 { 566 struct utf8_data *copy, *loop; 567 int found = 0; 568 569 copy = utf8_fromcstr(s); 570 for (loop = copy; loop->size != 0; loop++) { 571 if (loop->size != ud->size) 572 continue; 573 if (memcmp(loop->data, ud->data, loop->size) == 0) { 574 found = 1; 575 break; 576 } 577 } 578 free(copy); 579 580 return (found); 581 } 582