1 /* $OpenBSD: utf8.c,v 1.55 2020/06/09 10:37:00 nicm Exp $ */ 2 3 /* 4 * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER 15 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING 16 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/types.h> 20 21 #include <ctype.h> 22 #include <errno.h> 23 #include <stdlib.h> 24 #include <string.h> 25 #include <vis.h> 26 #include <wchar.h> 27 28 #include "tmux.h" 29 30 struct utf8_item { 31 RB_ENTRY(utf8_item) index_entry; 32 u_int index; 33 34 RB_ENTRY(utf8_item) data_entry; 35 char data[UTF8_SIZE]; 36 u_char size; 37 }; 38 39 static int 40 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2) 41 { 42 if (ui1->size < ui2->size) 43 return (-1); 44 if (ui1->size > ui2->size) 45 return (1); 46 return (memcmp(ui1->data, ui2->data, ui1->size)); 47 } 48 RB_HEAD(utf8_data_tree, utf8_item); 49 RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp); 50 static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree); 51 52 static int 53 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2) 54 { 55 if (ui1->index < ui2->index) 56 return (-1); 57 if (ui1->index > ui2->index) 58 return (1); 59 return (0); 60 } 61 RB_HEAD(utf8_index_tree, utf8_item); 62 RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp); 63 static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree); 64 65 static u_int utf8_next_index; 66 67 #define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f) 68 #define UTF8_GET_WIDTH(flags) (((uc) >> 29) - 1) 69 70 #define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24) 71 #define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29) 72 73 /* Get a UTF-8 item from data. */ 74 static struct utf8_item * 75 utf8_item_by_data(const char *data, size_t size) 76 { 77 struct utf8_item ui; 78 79 memcpy(ui.data, data, size); 80 ui.size = size; 81 82 return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui)); 83 } 84 85 /* Get a UTF-8 item from data. */ 86 static struct utf8_item * 87 utf8_item_by_index(u_int index) 88 { 89 struct utf8_item ui; 90 91 ui.index = index; 92 93 return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui)); 94 } 95 96 /* Add a UTF-8 item. */ 97 static int 98 utf8_put_item(const char *data, size_t size, u_int *index) 99 { 100 struct utf8_item *ui; 101 102 ui = utf8_item_by_data(data, size); 103 if (ui != NULL) { 104 log_debug("%s: found %.*s = %u", __func__, (int)size, data, 105 *index); 106 *index = ui->index; 107 return (0); 108 } 109 110 if (utf8_next_index == 0xffffff + 1) 111 return (-1); 112 113 ui = xcalloc(1, sizeof *ui); 114 ui->index = utf8_next_index++; 115 RB_INSERT(utf8_index_tree, &utf8_index_tree, ui); 116 117 memcpy(ui->data, data, size); 118 ui->size = size; 119 RB_INSERT(utf8_data_tree, &utf8_data_tree, ui); 120 121 log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index); 122 *index = ui->index; 123 return (0); 124 } 125 126 /* Get UTF-8 character from data. */ 127 enum utf8_state 128 utf8_from_data(const struct utf8_data *ud, utf8_char *uc) 129 { 130 u_int index; 131 132 if (ud->width > 2) 133 fatalx("invalid UTF-8 width: %u", ud->width); 134 135 if (ud->size > UTF8_SIZE) 136 goto fail; 137 if (ud->size <= 3) { 138 index = (((utf8_char)ud->data[2] << 16)| 139 ((utf8_char)ud->data[1] << 8)| 140 ((utf8_char)ud->data[0])); 141 } else if (utf8_put_item(ud->data, ud->size, &index) != 0) 142 goto fail; 143 *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index; 144 log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size, 145 (int)ud->size, ud->data, *uc); 146 return (UTF8_DONE); 147 148 fail: 149 if (ud->width == 0) 150 *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0); 151 else if (ud->width == 1) 152 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20; 153 else 154 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020; 155 return (UTF8_ERROR); 156 } 157 158 /* Get UTF-8 data from character. */ 159 void 160 utf8_to_data(utf8_char uc, struct utf8_data *ud) 161 { 162 struct utf8_item *ui; 163 u_int index; 164 165 memset(ud, 0, sizeof *ud); 166 ud->size = ud->have = UTF8_GET_SIZE(uc); 167 ud->width = UTF8_GET_WIDTH(uc); 168 169 if (ud->size <= 3) { 170 ud->data[2] = (uc >> 16); 171 ud->data[1] = ((uc >> 8) & 0xff); 172 ud->data[0] = (uc & 0xff); 173 } else { 174 index = (uc & 0xffffff); 175 if ((ui = utf8_item_by_index(index)) == NULL) 176 memset(ud->data, ' ', ud->size); 177 else 178 memcpy(ud->data, ui->data, ud->size); 179 } 180 181 log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size, 182 (int)ud->size, ud->data); 183 } 184 185 /* Get UTF-8 character from a single ASCII character. */ 186 u_int 187 utf8_build_one(u_char ch) 188 { 189 return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch); 190 } 191 192 /* Set a single character. */ 193 void 194 utf8_set(struct utf8_data *ud, u_char ch) 195 { 196 static const struct utf8_data empty = { { 0 }, 1, 1, 1 }; 197 198 memcpy(ud, &empty, sizeof *ud); 199 *ud->data = ch; 200 } 201 202 /* Copy UTF-8 character. */ 203 void 204 utf8_copy(struct utf8_data *to, const struct utf8_data *from) 205 { 206 u_int i; 207 208 memcpy(to, from, sizeof *to); 209 210 for (i = to->size; i < sizeof to->data; i++) 211 to->data[i] = '\0'; 212 } 213 214 /* Get width of Unicode character. */ 215 static enum utf8_state 216 utf8_width(struct utf8_data *ud, int *width) 217 { 218 wchar_t wc; 219 220 switch (mbtowc(&wc, ud->data, ud->size)) { 221 case -1: 222 log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data, 223 errno); 224 mbtowc(NULL, NULL, MB_CUR_MAX); 225 return (UTF8_ERROR); 226 case 0: 227 return (UTF8_ERROR); 228 } 229 *width = wcwidth(wc); 230 if (*width < 0 || *width > 0xff) { 231 log_debug("UTF-8 %.*s, wcwidth() %d", (int)ud->size, ud->data, 232 *width); 233 return (UTF8_ERROR); 234 } 235 return (UTF8_DONE); 236 } 237 238 /* 239 * Open UTF-8 sequence. 240 * 241 * 11000010-11011111 C2-DF start of 2-byte sequence 242 * 11100000-11101111 E0-EF start of 3-byte sequence 243 * 11110000-11110100 F0-F4 start of 4-byte sequence 244 */ 245 enum utf8_state 246 utf8_open(struct utf8_data *ud, u_char ch) 247 { 248 memset(ud, 0, sizeof *ud); 249 if (ch >= 0xc2 && ch <= 0xdf) 250 ud->size = 2; 251 else if (ch >= 0xe0 && ch <= 0xef) 252 ud->size = 3; 253 else if (ch >= 0xf0 && ch <= 0xf4) 254 ud->size = 4; 255 else 256 return (UTF8_ERROR); 257 utf8_append(ud, ch); 258 return (UTF8_MORE); 259 } 260 261 /* Append character to UTF-8, closing if finished. */ 262 enum utf8_state 263 utf8_append(struct utf8_data *ud, u_char ch) 264 { 265 int width; 266 267 if (ud->have >= ud->size) 268 fatalx("UTF-8 character overflow"); 269 if (ud->size > sizeof ud->data) 270 fatalx("UTF-8 character size too large"); 271 272 if (ud->have != 0 && (ch & 0xc0) != 0x80) 273 ud->width = 0xff; 274 275 ud->data[ud->have++] = ch; 276 if (ud->have != ud->size) 277 return (UTF8_MORE); 278 279 if (ud->width == 0xff) 280 return (UTF8_ERROR); 281 if (utf8_width(ud, &width) != UTF8_DONE) 282 return (UTF8_ERROR); 283 ud->width = width; 284 285 return (UTF8_DONE); 286 } 287 288 /* 289 * Encode len characters from src into dst, which is guaranteed to have four 290 * bytes available for each character from src (for \abc or UTF-8) plus space 291 * for \0. 292 */ 293 int 294 utf8_strvis(char *dst, const char *src, size_t len, int flag) 295 { 296 struct utf8_data ud; 297 const char *start = dst, *end = src + len; 298 enum utf8_state more; 299 size_t i; 300 301 while (src < end) { 302 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { 303 while (++src < end && more == UTF8_MORE) 304 more = utf8_append(&ud, *src); 305 if (more == UTF8_DONE) { 306 /* UTF-8 character finished. */ 307 for (i = 0; i < ud.size; i++) 308 *dst++ = ud.data[i]; 309 continue; 310 } 311 /* Not a complete, valid UTF-8 character. */ 312 src -= ud.have; 313 } 314 if (src[0] == '$' && src < end - 1) { 315 if (isalpha((u_char)src[1]) || 316 src[1] == '_' || 317 src[1] == '{') 318 *dst++ = '\\'; 319 *dst++ = '$'; 320 } else if (src < end - 1) 321 dst = vis(dst, src[0], flag, src[1]); 322 else if (src < end) 323 dst = vis(dst, src[0], flag, '\0'); 324 src++; 325 } 326 *dst = '\0'; 327 return (dst - start); 328 } 329 330 /* Same as utf8_strvis but allocate the buffer. */ 331 int 332 utf8_stravis(char **dst, const char *src, int flag) 333 { 334 char *buf; 335 int len; 336 337 buf = xreallocarray(NULL, 4, strlen(src) + 1); 338 len = utf8_strvis(buf, src, strlen(src), flag); 339 340 *dst = xrealloc(buf, len + 1); 341 return (len); 342 } 343 344 /* Does this string contain anything that isn't valid UTF-8? */ 345 int 346 utf8_isvalid(const char *s) 347 { 348 struct utf8_data ud; 349 const char *end; 350 enum utf8_state more; 351 352 end = s + strlen(s); 353 while (s < end) { 354 if ((more = utf8_open(&ud, *s)) == UTF8_MORE) { 355 while (++s < end && more == UTF8_MORE) 356 more = utf8_append(&ud, *s); 357 if (more == UTF8_DONE) 358 continue; 359 return (0); 360 } 361 if (*s < 0x20 || *s > 0x7e) 362 return (0); 363 s++; 364 } 365 return (1); 366 } 367 368 /* 369 * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free 370 * the returned string. Anything not valid printable ASCII or UTF-8 is 371 * stripped. 372 */ 373 char * 374 utf8_sanitize(const char *src) 375 { 376 char *dst = NULL; 377 size_t n = 0; 378 enum utf8_state more; 379 struct utf8_data ud; 380 u_int i; 381 382 while (*src != '\0') { 383 dst = xreallocarray(dst, n + 1, sizeof *dst); 384 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { 385 while (*++src != '\0' && more == UTF8_MORE) 386 more = utf8_append(&ud, *src); 387 if (more == UTF8_DONE) { 388 dst = xreallocarray(dst, n + ud.width, 389 sizeof *dst); 390 for (i = 0; i < ud.width; i++) 391 dst[n++] = '_'; 392 continue; 393 } 394 src -= ud.have; 395 } 396 if (*src > 0x1f && *src < 0x7f) 397 dst[n++] = *src; 398 else 399 dst[n++] = '_'; 400 src++; 401 } 402 dst = xreallocarray(dst, n + 1, sizeof *dst); 403 dst[n] = '\0'; 404 return (dst); 405 } 406 407 /* Get UTF-8 buffer length. */ 408 size_t 409 utf8_strlen(const struct utf8_data *s) 410 { 411 size_t i; 412 413 for (i = 0; s[i].size != 0; i++) 414 /* nothing */; 415 return (i); 416 } 417 418 /* Get UTF-8 string width. */ 419 u_int 420 utf8_strwidth(const struct utf8_data *s, ssize_t n) 421 { 422 ssize_t i; 423 u_int width = 0; 424 425 for (i = 0; s[i].size != 0; i++) { 426 if (n != -1 && n == i) 427 break; 428 width += s[i].width; 429 } 430 return (width); 431 } 432 433 /* 434 * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0. 435 * Caller frees. 436 */ 437 struct utf8_data * 438 utf8_fromcstr(const char *src) 439 { 440 struct utf8_data *dst = NULL; 441 size_t n = 0; 442 enum utf8_state more; 443 444 while (*src != '\0') { 445 dst = xreallocarray(dst, n + 1, sizeof *dst); 446 if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) { 447 while (*++src != '\0' && more == UTF8_MORE) 448 more = utf8_append(&dst[n], *src); 449 if (more == UTF8_DONE) { 450 n++; 451 continue; 452 } 453 src -= dst[n].have; 454 } 455 utf8_set(&dst[n], *src); 456 n++; 457 src++; 458 } 459 dst = xreallocarray(dst, n + 1, sizeof *dst); 460 dst[n].size = 0; 461 return (dst); 462 } 463 464 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */ 465 char * 466 utf8_tocstr(struct utf8_data *src) 467 { 468 char *dst = NULL; 469 size_t n = 0; 470 471 for(; src->size != 0; src++) { 472 dst = xreallocarray(dst, n + src->size, 1); 473 memcpy(dst + n, src->data, src->size); 474 n += src->size; 475 } 476 dst = xreallocarray(dst, n + 1, 1); 477 dst[n] = '\0'; 478 return (dst); 479 } 480 481 /* Get width of UTF-8 string. */ 482 u_int 483 utf8_cstrwidth(const char *s) 484 { 485 struct utf8_data tmp; 486 u_int width; 487 enum utf8_state more; 488 489 width = 0; 490 while (*s != '\0') { 491 if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) { 492 while (*++s != '\0' && more == UTF8_MORE) 493 more = utf8_append(&tmp, *s); 494 if (more == UTF8_DONE) { 495 width += tmp.width; 496 continue; 497 } 498 s -= tmp.have; 499 } 500 if (*s > 0x1f && *s != 0x7f) 501 width++; 502 s++; 503 } 504 return (width); 505 } 506 507 /* Pad UTF-8 string to width on the left. Caller frees. */ 508 char * 509 utf8_padcstr(const char *s, u_int width) 510 { 511 size_t slen; 512 char *out; 513 u_int n, i; 514 515 n = utf8_cstrwidth(s); 516 if (n >= width) 517 return (xstrdup(s)); 518 519 slen = strlen(s); 520 out = xmalloc(slen + 1 + (width - n)); 521 memcpy(out, s, slen); 522 for (i = n; i < width; i++) 523 out[slen++] = ' '; 524 out[slen] = '\0'; 525 return (out); 526 } 527 528 /* Pad UTF-8 string to width on the right. Caller frees. */ 529 char * 530 utf8_rpadcstr(const char *s, u_int width) 531 { 532 size_t slen; 533 char *out; 534 u_int n, i; 535 536 n = utf8_cstrwidth(s); 537 if (n >= width) 538 return (xstrdup(s)); 539 540 slen = strlen(s); 541 out = xmalloc(slen + 1 + (width - n)); 542 for (i = 0; i < width - n; i++) 543 out[i] = ' '; 544 memcpy(out + i, s, slen); 545 out[i + slen] = '\0'; 546 return (out); 547 } 548 549 int 550 utf8_cstrhas(const char *s, const struct utf8_data *ud) 551 { 552 struct utf8_data *copy, *loop; 553 int found = 0; 554 555 copy = utf8_fromcstr(s); 556 for (loop = copy; loop->size != 0; loop++) { 557 if (loop->size != ud->size) 558 continue; 559 if (memcmp(loop->data, ud->data, loop->size) == 0) { 560 found = 1; 561 break; 562 } 563 } 564 free(copy); 565 566 return (found); 567 } 568