1 /* $OpenBSD: utf8.c,v 1.67 2025/01/01 15:17:36 nicm Exp $ */ 2 3 /* 4 * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com> 5 * 6 * Permission to use, copy, modify, and distribute this software for any 7 * purpose with or without fee is hereby granted, provided that the above 8 * copyright notice and this permission notice appear in all copies. 9 * 10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14 * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER 15 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING 16 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17 */ 18 19 #include <sys/types.h> 20 21 #include <ctype.h> 22 #include <errno.h> 23 #include <stdlib.h> 24 #include <string.h> 25 #include <vis.h> 26 27 #include "tmux.h" 28 29 struct utf8_width_item { 30 wchar_t wc; 31 u_int width; 32 int allocated; 33 34 RB_ENTRY(utf8_width_item) entry; 35 }; 36 37 static int 38 utf8_width_cache_cmp(struct utf8_width_item *uw1, struct utf8_width_item *uw2) 39 { 40 if (uw1->wc < uw2->wc) 41 return (-1); 42 if (uw1->wc > uw2->wc) 43 return (1); 44 return (0); 45 } 46 RB_HEAD(utf8_width_cache, utf8_width_item); 47 RB_GENERATE_STATIC(utf8_width_cache, utf8_width_item, entry, 48 utf8_width_cache_cmp); 49 static struct utf8_width_cache utf8_width_cache = 50 RB_INITIALIZER(utf8_width_cache); 51 52 static struct utf8_width_item utf8_default_width_cache[] = { 53 { .wc = 0x0261D, .width = 2 }, 54 { .wc = 0x026F9, .width = 2 }, 55 { .wc = 0x0270A, .width = 2 }, 56 { .wc = 0x0270B, .width = 2 }, 57 { .wc = 0x0270C, .width = 2 }, 58 { .wc = 0x0270D, .width = 2 }, 59 { .wc = 0x1F1E6, .width = 2 }, 60 { .wc = 0x1F1E7, .width = 2 }, 61 { .wc = 0x1F1E8, .width = 2 }, 62 { .wc = 0x1F1E9, .width = 2 }, 63 { .wc = 0x1F1EA, .width = 2 }, 64 { .wc = 0x1F1EB, .width = 2 }, 65 { .wc = 0x1F1EC, .width = 2 }, 66 { .wc = 0x1F1ED, .width = 2 }, 67 { .wc = 0x1F1EE, .width = 2 }, 68 { .wc = 0x1F1EF, .width = 2 }, 69 { .wc = 0x1F1F0, .width = 2 }, 70 { .wc = 0x1F1F1, .width = 2 }, 71 { .wc = 0x1F1F2, .width = 2 }, 72 { .wc = 0x1F1F3, .width = 2 }, 73 { .wc = 0x1F1F4, .width = 2 }, 74 { .wc = 0x1F1F5, .width = 2 }, 75 { .wc = 0x1F1F6, .width = 2 }, 76 { .wc = 0x1F1F7, .width = 2 }, 77 { .wc = 0x1F1F8, .width = 2 }, 78 { .wc = 0x1F1F9, .width = 2 }, 79 { .wc = 0x1F1FA, .width = 2 }, 80 { .wc = 0x1F1FB, .width = 2 }, 81 { .wc = 0x1F1FC, .width = 2 }, 82 { .wc = 0x1F1FD, .width = 2 }, 83 { .wc = 0x1F1FE, .width = 2 }, 84 { .wc = 0x1F1FF, .width = 2 }, 85 { .wc = 0x1F385, .width = 2 }, 86 { .wc = 0x1F3C2, .width = 2 }, 87 { .wc = 0x1F3C3, .width = 2 }, 88 { .wc = 0x1F3C4, .width = 2 }, 89 { .wc = 0x1F3C7, .width = 2 }, 90 { .wc = 0x1F3CA, .width = 2 }, 91 { .wc = 0x1F3CB, .width = 2 }, 92 { .wc = 0x1F3CC, .width = 2 }, 93 { .wc = 0x1F3FB, .width = 2 }, 94 { .wc = 0x1F3FC, .width = 2 }, 95 { .wc = 0x1F3FD, .width = 2 }, 96 { .wc = 0x1F3FE, .width = 2 }, 97 { .wc = 0x1F3FF, .width = 2 }, 98 { .wc = 0x1F442, .width = 2 }, 99 { .wc = 0x1F443, .width = 2 }, 100 { .wc = 0x1F446, .width = 2 }, 101 { .wc = 0x1F447, .width = 2 }, 102 { .wc = 0x1F448, .width = 2 }, 103 { .wc = 0x1F449, .width = 2 }, 104 { .wc = 0x1F44A, .width = 2 }, 105 { .wc = 0x1F44B, .width = 2 }, 106 { .wc = 0x1F44C, .width = 2 }, 107 { .wc = 0x1F44D, .width = 2 }, 108 { .wc = 0x1F44E, .width = 2 }, 109 { .wc = 0x1F44F, .width = 2 }, 110 { .wc = 0x1F450, .width = 2 }, 111 { .wc = 0x1F466, .width = 2 }, 112 { .wc = 0x1F467, .width = 2 }, 113 { .wc = 0x1F468, .width = 2 }, 114 { .wc = 0x1F469, .width = 2 }, 115 { .wc = 0x1F46B, .width = 2 }, 116 { .wc = 0x1F46C, .width = 2 }, 117 { .wc = 0x1F46D, .width = 2 }, 118 { .wc = 0x1F46E, .width = 2 }, 119 { .wc = 0x1F470, .width = 2 }, 120 { .wc = 0x1F471, .width = 2 }, 121 { .wc = 0x1F472, .width = 2 }, 122 { .wc = 0x1F473, .width = 2 }, 123 { .wc = 0x1F474, .width = 2 }, 124 { .wc = 0x1F475, .width = 2 }, 125 { .wc = 0x1F476, .width = 2 }, 126 { .wc = 0x1F477, .width = 2 }, 127 { .wc = 0x1F478, .width = 2 }, 128 { .wc = 0x1F47C, .width = 2 }, 129 { .wc = 0x1F481, .width = 2 }, 130 { .wc = 0x1F482, .width = 2 }, 131 { .wc = 0x1F483, .width = 2 }, 132 { .wc = 0x1F485, .width = 2 }, 133 { .wc = 0x1F486, .width = 2 }, 134 { .wc = 0x1F487, .width = 2 }, 135 { .wc = 0x1F48F, .width = 2 }, 136 { .wc = 0x1F491, .width = 2 }, 137 { .wc = 0x1F4AA, .width = 2 }, 138 { .wc = 0x1F574, .width = 2 }, 139 { .wc = 0x1F575, .width = 2 }, 140 { .wc = 0x1F57A, .width = 2 }, 141 { .wc = 0x1F590, .width = 2 }, 142 { .wc = 0x1F595, .width = 2 }, 143 { .wc = 0x1F596, .width = 2 }, 144 { .wc = 0x1F645, .width = 2 }, 145 { .wc = 0x1F646, .width = 2 }, 146 { .wc = 0x1F647, .width = 2 }, 147 { .wc = 0x1F64B, .width = 2 }, 148 { .wc = 0x1F64C, .width = 2 }, 149 { .wc = 0x1F64D, .width = 2 }, 150 { .wc = 0x1F64E, .width = 2 }, 151 { .wc = 0x1F64F, .width = 2 }, 152 { .wc = 0x1F6A3, .width = 2 }, 153 { .wc = 0x1F6B4, .width = 2 }, 154 { .wc = 0x1F6B5, .width = 2 }, 155 { .wc = 0x1F6B6, .width = 2 }, 156 { .wc = 0x1F6C0, .width = 2 }, 157 { .wc = 0x1F6CC, .width = 2 }, 158 { .wc = 0x1F90C, .width = 2 }, 159 { .wc = 0x1F90F, .width = 2 }, 160 { .wc = 0x1F918, .width = 2 }, 161 { .wc = 0x1F919, .width = 2 }, 162 { .wc = 0x1F91A, .width = 2 }, 163 { .wc = 0x1F91B, .width = 2 }, 164 { .wc = 0x1F91C, .width = 2 }, 165 { .wc = 0x1F91D, .width = 2 }, 166 { .wc = 0x1F91E, .width = 2 }, 167 { .wc = 0x1F91F, .width = 2 }, 168 { .wc = 0x1F926, .width = 2 }, 169 { .wc = 0x1F930, .width = 2 }, 170 { .wc = 0x1F931, .width = 2 }, 171 { .wc = 0x1F932, .width = 2 }, 172 { .wc = 0x1F933, .width = 2 }, 173 { .wc = 0x1F934, .width = 2 }, 174 { .wc = 0x1F935, .width = 2 }, 175 { .wc = 0x1F936, .width = 2 }, 176 { .wc = 0x1F937, .width = 2 }, 177 { .wc = 0x1F938, .width = 2 }, 178 { .wc = 0x1F939, .width = 2 }, 179 { .wc = 0x1F93D, .width = 2 }, 180 { .wc = 0x1F93E, .width = 2 }, 181 { .wc = 0x1F977, .width = 2 }, 182 { .wc = 0x1F9B5, .width = 2 }, 183 { .wc = 0x1F9B6, .width = 2 }, 184 { .wc = 0x1F9B8, .width = 2 }, 185 { .wc = 0x1F9B9, .width = 2 }, 186 { .wc = 0x1F9BB, .width = 2 }, 187 { .wc = 0x1F9CD, .width = 2 }, 188 { .wc = 0x1F9CE, .width = 2 }, 189 { .wc = 0x1F9CF, .width = 2 }, 190 { .wc = 0x1F9D1, .width = 2 }, 191 { .wc = 0x1F9D2, .width = 2 }, 192 { .wc = 0x1F9D3, .width = 2 }, 193 { .wc = 0x1F9D4, .width = 2 }, 194 { .wc = 0x1F9D5, .width = 2 }, 195 { .wc = 0x1F9D6, .width = 2 }, 196 { .wc = 0x1F9D7, .width = 2 }, 197 { .wc = 0x1F9D8, .width = 2 }, 198 { .wc = 0x1F9D9, .width = 2 }, 199 { .wc = 0x1F9DA, .width = 2 }, 200 { .wc = 0x1F9DB, .width = 2 }, 201 { .wc = 0x1F9DC, .width = 2 }, 202 { .wc = 0x1F9DD, .width = 2 }, 203 { .wc = 0x1FAC3, .width = 2 }, 204 { .wc = 0x1FAC4, .width = 2 }, 205 { .wc = 0x1FAC5, .width = 2 }, 206 { .wc = 0x1FAF0, .width = 2 }, 207 { .wc = 0x1FAF1, .width = 2 }, 208 { .wc = 0x1FAF2, .width = 2 }, 209 { .wc = 0x1FAF3, .width = 2 }, 210 { .wc = 0x1FAF4, .width = 2 }, 211 { .wc = 0x1FAF5, .width = 2 }, 212 { .wc = 0x1FAF6, .width = 2 }, 213 { .wc = 0x1FAF7, .width = 2 }, 214 { .wc = 0x1FAF8, .width = 2 } 215 }; 216 217 struct utf8_item { 218 RB_ENTRY(utf8_item) index_entry; 219 u_int index; 220 221 RB_ENTRY(utf8_item) data_entry; 222 char data[UTF8_SIZE]; 223 u_char size; 224 }; 225 226 static int 227 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2) 228 { 229 if (ui1->size < ui2->size) 230 return (-1); 231 if (ui1->size > ui2->size) 232 return (1); 233 return (memcmp(ui1->data, ui2->data, ui1->size)); 234 } 235 RB_HEAD(utf8_data_tree, utf8_item); 236 RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp); 237 static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree); 238 239 static int 240 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2) 241 { 242 if (ui1->index < ui2->index) 243 return (-1); 244 if (ui1->index > ui2->index) 245 return (1); 246 return (0); 247 } 248 RB_HEAD(utf8_index_tree, utf8_item); 249 RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp); 250 static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree); 251 252 static int utf8_no_width; 253 static u_int utf8_next_index; 254 255 #define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f) 256 #define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1) 257 258 #define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24) 259 #define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29) 260 261 /* Get a UTF-8 item from data. */ 262 static struct utf8_item * 263 utf8_item_by_data(const u_char *data, size_t size) 264 { 265 struct utf8_item ui; 266 267 memcpy(ui.data, data, size); 268 ui.size = size; 269 270 return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui)); 271 } 272 273 /* Get a UTF-8 item from data. */ 274 static struct utf8_item * 275 utf8_item_by_index(u_int index) 276 { 277 struct utf8_item ui; 278 279 ui.index = index; 280 281 return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui)); 282 } 283 284 /* Find a codepoint in the cache. */ 285 static struct utf8_width_item * 286 utf8_find_in_width_cache(wchar_t wc) 287 { 288 struct utf8_width_item uw; 289 290 uw.wc = wc; 291 return RB_FIND(utf8_width_cache, &utf8_width_cache, &uw); 292 } 293 294 /* Parse a single codepoint option. */ 295 static void 296 utf8_add_to_width_cache(const char *s) 297 { 298 struct utf8_width_item *uw, *old; 299 char *copy, *cp, *endptr; 300 u_int width; 301 const char *errstr; 302 struct utf8_data *ud; 303 wchar_t wc; 304 unsigned long long n; 305 306 copy = xstrdup(s); 307 if ((cp = strchr(copy, '=')) == NULL) { 308 free(copy); 309 return; 310 } 311 *cp++ = '\0'; 312 313 width = strtonum(cp, 0, 2, &errstr); 314 if (errstr != NULL) { 315 free(copy); 316 return; 317 } 318 319 if (strncmp(copy, "U+", 2) == 0) { 320 errno = 0; 321 n = strtoull(copy + 2, &endptr, 16); 322 if (copy[2] == '\0' || 323 *endptr != '\0' || 324 n == 0 || 325 n > WCHAR_MAX || 326 (errno == ERANGE && n == ULLONG_MAX)) { 327 free(copy); 328 return; 329 } 330 wc = n; 331 } else { 332 utf8_no_width = 1; 333 ud = utf8_fromcstr(copy); 334 utf8_no_width = 0; 335 if (ud[0].size == 0 || ud[1].size != 0) { 336 free(ud); 337 free(copy); 338 return; 339 } 340 #ifdef HAVE_UTF8PROC 341 if (utf8proc_mbtowc(&wc, ud[0].data, ud[0].size) <= 0) { 342 #else 343 if (mbtowc(&wc, ud[0].data, ud[0].size) <= 0) { 344 #endif 345 free(ud); 346 free(copy); 347 return; 348 } 349 free(ud); 350 } 351 352 log_debug("Unicode width cache: %08X=%u", (u_int)wc, width); 353 354 uw = xcalloc(1, sizeof *uw); 355 uw->wc = wc; 356 uw->width = width; 357 uw->allocated = 1; 358 359 old = RB_INSERT(utf8_width_cache, &utf8_width_cache, uw); 360 if (old != NULL) { 361 RB_REMOVE(utf8_width_cache, &utf8_width_cache, old); 362 if (old->allocated) 363 free(old); 364 RB_INSERT(utf8_width_cache, &utf8_width_cache, uw); 365 } 366 367 free(copy); 368 } 369 370 /* Rebuild cache of widths. */ 371 void 372 utf8_update_width_cache(void) 373 { 374 struct utf8_width_item *uw, *uw1; 375 struct options_entry *o; 376 struct options_array_item *a; 377 u_int i; 378 379 RB_FOREACH_SAFE (uw, utf8_width_cache, &utf8_width_cache, uw1) { 380 RB_REMOVE(utf8_width_cache, &utf8_width_cache, uw); 381 if (uw->allocated) 382 free(uw); 383 } 384 385 for (i = 0; i < nitems(utf8_default_width_cache); i++) { 386 RB_INSERT(utf8_width_cache, &utf8_width_cache, 387 &utf8_default_width_cache[i]); 388 } 389 390 o = options_get(global_options, "codepoint-widths"); 391 a = options_array_first(o); 392 while (a != NULL) { 393 utf8_add_to_width_cache(options_array_item_value(a)->string); 394 a = options_array_next(a); 395 } 396 } 397 398 /* Add a UTF-8 item. */ 399 static int 400 utf8_put_item(const u_char *data, size_t size, u_int *index) 401 { 402 struct utf8_item *ui; 403 404 ui = utf8_item_by_data(data, size); 405 if (ui != NULL) { 406 *index = ui->index; 407 log_debug("%s: found %.*s = %u", __func__, (int)size, data, 408 *index); 409 return (0); 410 } 411 412 if (utf8_next_index == 0xffffff + 1) 413 return (-1); 414 415 ui = xcalloc(1, sizeof *ui); 416 ui->index = utf8_next_index++; 417 RB_INSERT(utf8_index_tree, &utf8_index_tree, ui); 418 419 memcpy(ui->data, data, size); 420 ui->size = size; 421 RB_INSERT(utf8_data_tree, &utf8_data_tree, ui); 422 423 *index = ui->index; 424 log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index); 425 return (0); 426 } 427 428 /* Get UTF-8 character from data. */ 429 enum utf8_state 430 utf8_from_data(const struct utf8_data *ud, utf8_char *uc) 431 { 432 u_int index; 433 434 if (ud->width > 2) 435 fatalx("invalid UTF-8 width: %u", ud->width); 436 437 if (ud->size > UTF8_SIZE) 438 goto fail; 439 if (ud->size <= 3) { 440 index = (((utf8_char)ud->data[2] << 16)| 441 ((utf8_char)ud->data[1] << 8)| 442 ((utf8_char)ud->data[0])); 443 } else if (utf8_put_item(ud->data, ud->size, &index) != 0) 444 goto fail; 445 *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index; 446 log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size, 447 (int)ud->size, ud->data, *uc); 448 return (UTF8_DONE); 449 450 fail: 451 if (ud->width == 0) 452 *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0); 453 else if (ud->width == 1) 454 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20; 455 else 456 *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020; 457 return (UTF8_ERROR); 458 } 459 460 /* Get UTF-8 data from character. */ 461 void 462 utf8_to_data(utf8_char uc, struct utf8_data *ud) 463 { 464 struct utf8_item *ui; 465 u_int index; 466 467 memset(ud, 0, sizeof *ud); 468 ud->size = ud->have = UTF8_GET_SIZE(uc); 469 ud->width = UTF8_GET_WIDTH(uc); 470 471 if (ud->size <= 3) { 472 ud->data[2] = (uc >> 16); 473 ud->data[1] = ((uc >> 8) & 0xff); 474 ud->data[0] = (uc & 0xff); 475 } else { 476 index = (uc & 0xffffff); 477 if ((ui = utf8_item_by_index(index)) == NULL) 478 memset(ud->data, ' ', ud->size); 479 else 480 memcpy(ud->data, ui->data, ud->size); 481 } 482 483 log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size, 484 (int)ud->size, ud->data); 485 } 486 487 /* Get UTF-8 character from a single ASCII character. */ 488 u_int 489 utf8_build_one(u_char ch) 490 { 491 return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch); 492 } 493 494 /* Set a single character. */ 495 void 496 utf8_set(struct utf8_data *ud, u_char ch) 497 { 498 static const struct utf8_data empty = { { 0 }, 1, 1, 1 }; 499 500 memcpy(ud, &empty, sizeof *ud); 501 *ud->data = ch; 502 } 503 504 /* Copy UTF-8 character. */ 505 void 506 utf8_copy(struct utf8_data *to, const struct utf8_data *from) 507 { 508 u_int i; 509 510 memcpy(to, from, sizeof *to); 511 512 for (i = to->size; i < sizeof to->data; i++) 513 to->data[i] = '\0'; 514 } 515 516 /* Get width of Unicode character. */ 517 static enum utf8_state 518 utf8_width(struct utf8_data *ud, int *width) 519 { 520 struct utf8_width_item *uw; 521 wchar_t wc; 522 523 if (utf8_towc(ud, &wc) != UTF8_DONE) 524 return (UTF8_ERROR); 525 uw = utf8_find_in_width_cache(wc); 526 if (uw != NULL) { 527 *width = uw->width; 528 log_debug("cached width for %08X is %d", (u_int)wc, *width); 529 return (UTF8_DONE); 530 } 531 532 *width = wcwidth(wc); 533 log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width); 534 if (*width < 0) { 535 /* 536 * C1 control characters are nonprintable, so they are always 537 * zero width. 538 */ 539 *width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1; 540 } 541 if (*width >= 0 && *width <= 0xff) 542 return (UTF8_DONE); 543 return (UTF8_ERROR); 544 } 545 546 /* Convert UTF-8 character to wide character. */ 547 enum utf8_state 548 utf8_towc(const struct utf8_data *ud, wchar_t *wc) 549 { 550 switch (mbtowc(wc, ud->data, ud->size)) { 551 case -1: 552 log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data, 553 errno); 554 mbtowc(NULL, NULL, MB_CUR_MAX); 555 return (UTF8_ERROR); 556 case 0: 557 return (UTF8_ERROR); 558 } 559 log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc); 560 return (UTF8_DONE); 561 } 562 563 /* Convert wide character to UTF-8 character. */ 564 enum utf8_state 565 utf8_fromwc(wchar_t wc, struct utf8_data *ud) 566 { 567 int size, width; 568 569 size = wctomb(ud->data, wc); 570 if (size < 0) { 571 log_debug("UTF-8 %d, wctomb() %d", wc, errno); 572 wctomb(NULL, 0); 573 return (UTF8_ERROR); 574 } 575 if (size == 0) 576 return (UTF8_ERROR); 577 ud->size = ud->have = size; 578 if (utf8_width(ud, &width) == UTF8_DONE) { 579 ud->width = width; 580 return (UTF8_DONE); 581 } 582 return (UTF8_ERROR); 583 } 584 585 /* 586 * Open UTF-8 sequence. 587 * 588 * 11000010-11011111 C2-DF start of 2-byte sequence 589 * 11100000-11101111 E0-EF start of 3-byte sequence 590 * 11110000-11110100 F0-F4 start of 4-byte sequence 591 */ 592 enum utf8_state 593 utf8_open(struct utf8_data *ud, u_char ch) 594 { 595 memset(ud, 0, sizeof *ud); 596 if (ch >= 0xc2 && ch <= 0xdf) 597 ud->size = 2; 598 else if (ch >= 0xe0 && ch <= 0xef) 599 ud->size = 3; 600 else if (ch >= 0xf0 && ch <= 0xf4) 601 ud->size = 4; 602 else 603 return (UTF8_ERROR); 604 utf8_append(ud, ch); 605 return (UTF8_MORE); 606 } 607 608 /* Append character to UTF-8, closing if finished. */ 609 enum utf8_state 610 utf8_append(struct utf8_data *ud, u_char ch) 611 { 612 int width; 613 614 if (ud->have >= ud->size) 615 fatalx("UTF-8 character overflow"); 616 if (ud->size > sizeof ud->data) 617 fatalx("UTF-8 character size too large"); 618 619 if (ud->have != 0 && (ch & 0xc0) != 0x80) 620 ud->width = 0xff; 621 622 ud->data[ud->have++] = ch; 623 if (ud->have != ud->size) 624 return (UTF8_MORE); 625 626 if (!utf8_no_width) { 627 if (ud->width == 0xff) 628 return (UTF8_ERROR); 629 if (utf8_width(ud, &width) != UTF8_DONE) 630 return (UTF8_ERROR); 631 ud->width = width; 632 } 633 634 return (UTF8_DONE); 635 } 636 637 /* 638 * Encode len characters from src into dst, which is guaranteed to have four 639 * bytes available for each character from src (for \abc or UTF-8) plus space 640 * for \0. 641 */ 642 int 643 utf8_strvis(char *dst, const char *src, size_t len, int flag) 644 { 645 struct utf8_data ud; 646 const char *start = dst, *end = src + len; 647 enum utf8_state more; 648 size_t i; 649 650 while (src < end) { 651 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { 652 while (++src < end && more == UTF8_MORE) 653 more = utf8_append(&ud, *src); 654 if (more == UTF8_DONE) { 655 /* UTF-8 character finished. */ 656 for (i = 0; i < ud.size; i++) 657 *dst++ = ud.data[i]; 658 continue; 659 } 660 /* Not a complete, valid UTF-8 character. */ 661 src -= ud.have; 662 } 663 if ((flag & VIS_DQ) && src[0] == '$' && src < end - 1) { 664 if (isalpha((u_char)src[1]) || 665 src[1] == '_' || 666 src[1] == '{') 667 *dst++ = '\\'; 668 *dst++ = '$'; 669 } else if (src < end - 1) 670 dst = vis(dst, src[0], flag, src[1]); 671 else if (src < end) 672 dst = vis(dst, src[0], flag, '\0'); 673 src++; 674 } 675 *dst = '\0'; 676 return (dst - start); 677 } 678 679 /* Same as utf8_strvis but allocate the buffer. */ 680 int 681 utf8_stravis(char **dst, const char *src, int flag) 682 { 683 char *buf; 684 int len; 685 686 buf = xreallocarray(NULL, 4, strlen(src) + 1); 687 len = utf8_strvis(buf, src, strlen(src), flag); 688 689 *dst = xrealloc(buf, len + 1); 690 return (len); 691 } 692 693 /* Same as utf8_strvis but allocate the buffer. */ 694 int 695 utf8_stravisx(char **dst, const char *src, size_t srclen, int flag) 696 { 697 char *buf; 698 int len; 699 700 buf = xreallocarray(NULL, 4, srclen + 1); 701 len = utf8_strvis(buf, src, srclen, flag); 702 703 *dst = xrealloc(buf, len + 1); 704 return (len); 705 } 706 707 /* Does this string contain anything that isn't valid UTF-8? */ 708 int 709 utf8_isvalid(const char *s) 710 { 711 struct utf8_data ud; 712 const char *end; 713 enum utf8_state more; 714 715 end = s + strlen(s); 716 while (s < end) { 717 if ((more = utf8_open(&ud, *s)) == UTF8_MORE) { 718 while (++s < end && more == UTF8_MORE) 719 more = utf8_append(&ud, *s); 720 if (more == UTF8_DONE) 721 continue; 722 return (0); 723 } 724 if (*s < 0x20 || *s > 0x7e) 725 return (0); 726 s++; 727 } 728 return (1); 729 } 730 731 /* 732 * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free 733 * the returned string. Anything not valid printable ASCII or UTF-8 is 734 * stripped. 735 */ 736 char * 737 utf8_sanitize(const char *src) 738 { 739 char *dst = NULL; 740 size_t n = 0; 741 enum utf8_state more; 742 struct utf8_data ud; 743 u_int i; 744 745 while (*src != '\0') { 746 dst = xreallocarray(dst, n + 1, sizeof *dst); 747 if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { 748 while (*++src != '\0' && more == UTF8_MORE) 749 more = utf8_append(&ud, *src); 750 if (more == UTF8_DONE) { 751 dst = xreallocarray(dst, n + ud.width, 752 sizeof *dst); 753 for (i = 0; i < ud.width; i++) 754 dst[n++] = '_'; 755 continue; 756 } 757 src -= ud.have; 758 } 759 if (*src > 0x1f && *src < 0x7f) 760 dst[n++] = *src; 761 else 762 dst[n++] = '_'; 763 src++; 764 } 765 dst = xreallocarray(dst, n + 1, sizeof *dst); 766 dst[n] = '\0'; 767 return (dst); 768 } 769 770 /* Get UTF-8 buffer length. */ 771 size_t 772 utf8_strlen(const struct utf8_data *s) 773 { 774 size_t i; 775 776 for (i = 0; s[i].size != 0; i++) 777 /* nothing */; 778 return (i); 779 } 780 781 /* Get UTF-8 string width. */ 782 u_int 783 utf8_strwidth(const struct utf8_data *s, ssize_t n) 784 { 785 ssize_t i; 786 u_int width = 0; 787 788 for (i = 0; s[i].size != 0; i++) { 789 if (n != -1 && n == i) 790 break; 791 width += s[i].width; 792 } 793 return (width); 794 } 795 796 /* 797 * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0. 798 * Caller frees. 799 */ 800 struct utf8_data * 801 utf8_fromcstr(const char *src) 802 { 803 struct utf8_data *dst = NULL; 804 size_t n = 0; 805 enum utf8_state more; 806 807 while (*src != '\0') { 808 dst = xreallocarray(dst, n + 1, sizeof *dst); 809 if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) { 810 while (*++src != '\0' && more == UTF8_MORE) 811 more = utf8_append(&dst[n], *src); 812 if (more == UTF8_DONE) { 813 n++; 814 continue; 815 } 816 src -= dst[n].have; 817 } 818 utf8_set(&dst[n], *src); 819 n++; 820 src++; 821 } 822 dst = xreallocarray(dst, n + 1, sizeof *dst); 823 dst[n].size = 0; 824 return (dst); 825 } 826 827 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */ 828 char * 829 utf8_tocstr(struct utf8_data *src) 830 { 831 char *dst = NULL; 832 size_t n = 0; 833 834 for(; src->size != 0; src++) { 835 dst = xreallocarray(dst, n + src->size, 1); 836 memcpy(dst + n, src->data, src->size); 837 n += src->size; 838 } 839 dst = xreallocarray(dst, n + 1, 1); 840 dst[n] = '\0'; 841 return (dst); 842 } 843 844 /* Get width of UTF-8 string. */ 845 u_int 846 utf8_cstrwidth(const char *s) 847 { 848 struct utf8_data tmp; 849 u_int width; 850 enum utf8_state more; 851 852 width = 0; 853 while (*s != '\0') { 854 if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) { 855 while (*++s != '\0' && more == UTF8_MORE) 856 more = utf8_append(&tmp, *s); 857 if (more == UTF8_DONE) { 858 width += tmp.width; 859 continue; 860 } 861 s -= tmp.have; 862 } 863 if (*s > 0x1f && *s != 0x7f) 864 width++; 865 s++; 866 } 867 return (width); 868 } 869 870 /* Pad UTF-8 string to width on the left. Caller frees. */ 871 char * 872 utf8_padcstr(const char *s, u_int width) 873 { 874 size_t slen; 875 char *out; 876 u_int n, i; 877 878 n = utf8_cstrwidth(s); 879 if (n >= width) 880 return (xstrdup(s)); 881 882 slen = strlen(s); 883 out = xmalloc(slen + 1 + (width - n)); 884 memcpy(out, s, slen); 885 for (i = n; i < width; i++) 886 out[slen++] = ' '; 887 out[slen] = '\0'; 888 return (out); 889 } 890 891 /* Pad UTF-8 string to width on the right. Caller frees. */ 892 char * 893 utf8_rpadcstr(const char *s, u_int width) 894 { 895 size_t slen; 896 char *out; 897 u_int n, i; 898 899 n = utf8_cstrwidth(s); 900 if (n >= width) 901 return (xstrdup(s)); 902 903 slen = strlen(s); 904 out = xmalloc(slen + 1 + (width - n)); 905 for (i = 0; i < width - n; i++) 906 out[i] = ' '; 907 memcpy(out + i, s, slen); 908 out[i + slen] = '\0'; 909 return (out); 910 } 911 912 int 913 utf8_cstrhas(const char *s, const struct utf8_data *ud) 914 { 915 struct utf8_data *copy, *loop; 916 int found = 0; 917 918 copy = utf8_fromcstr(s); 919 for (loop = copy; loop->size != 0; loop++) { 920 if (loop->size != ud->size) 921 continue; 922 if (memcmp(loop->data, ud->data, loop->size) == 0) { 923 found = 1; 924 break; 925 } 926 } 927 free(copy); 928 929 return (found); 930 } 931