1*3d40d63aSnicm /* $OpenBSD: utf8.c,v 1.67 2025/01/01 15:17:36 nicm Exp $ */ 2311827fbSnicm 3311827fbSnicm /* 498ca8272Snicm * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com> 5311827fbSnicm * 6311827fbSnicm * Permission to use, copy, modify, and distribute this software for any 7311827fbSnicm * purpose with or without fee is hereby granted, provided that the above 8311827fbSnicm * copyright notice and this permission notice appear in all copies. 9311827fbSnicm * 10311827fbSnicm * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 11311827fbSnicm * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 12311827fbSnicm * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 13311827fbSnicm * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 14311827fbSnicm * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER 15311827fbSnicm * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING 16311827fbSnicm * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 17311827fbSnicm */ 18311827fbSnicm 19311827fbSnicm #include <sys/types.h> 20311827fbSnicm 215c131106Snicm #include <ctype.h> 22eea13297Snicm #include <errno.h> 234b2ce9a7Snicm #include <stdlib.h> 24311827fbSnicm #include <string.h> 25dbbd1b46Snicm #include <vis.h> 26311827fbSnicm 27311827fbSnicm #include "tmux.h" 28311827fbSnicm 29*3d40d63aSnicm struct utf8_width_item { 30*3d40d63aSnicm wchar_t wc; 31*3d40d63aSnicm u_int width; 32*3d40d63aSnicm int allocated; 33*3d40d63aSnicm 34*3d40d63aSnicm RB_ENTRY(utf8_width_item) entry; 35*3d40d63aSnicm }; 36*3d40d63aSnicm 37*3d40d63aSnicm static int 38*3d40d63aSnicm utf8_width_cache_cmp(struct utf8_width_item *uw1, struct utf8_width_item *uw2) 39*3d40d63aSnicm { 40*3d40d63aSnicm if (uw1->wc < uw2->wc) 41*3d40d63aSnicm return (-1); 42*3d40d63aSnicm if (uw1->wc > uw2->wc) 43*3d40d63aSnicm return (1); 44*3d40d63aSnicm return (0); 45*3d40d63aSnicm } 46*3d40d63aSnicm RB_HEAD(utf8_width_cache, utf8_width_item); 47*3d40d63aSnicm RB_GENERATE_STATIC(utf8_width_cache, utf8_width_item, entry, 48*3d40d63aSnicm utf8_width_cache_cmp); 49*3d40d63aSnicm static struct utf8_width_cache utf8_width_cache = 50*3d40d63aSnicm RB_INITIALIZER(utf8_width_cache); 51*3d40d63aSnicm 52*3d40d63aSnicm static struct utf8_width_item utf8_default_width_cache[] = { 53*3d40d63aSnicm { .wc = 0x0261D, .width = 2 }, 54*3d40d63aSnicm { .wc = 0x026F9, .width = 2 }, 55*3d40d63aSnicm { .wc = 0x0270A, .width = 2 }, 56*3d40d63aSnicm { .wc = 0x0270B, .width = 2 }, 57*3d40d63aSnicm { .wc = 0x0270C, .width = 2 }, 58*3d40d63aSnicm { .wc = 0x0270D, .width = 2 }, 59*3d40d63aSnicm { .wc = 0x1F1E6, .width = 2 }, 60*3d40d63aSnicm { .wc = 0x1F1E7, .width = 2 }, 61*3d40d63aSnicm { .wc = 0x1F1E8, .width = 2 }, 62*3d40d63aSnicm { .wc = 0x1F1E9, .width = 2 }, 63*3d40d63aSnicm { .wc = 0x1F1EA, .width = 2 }, 64*3d40d63aSnicm { .wc = 0x1F1EB, .width = 2 }, 65*3d40d63aSnicm { .wc = 0x1F1EC, .width = 2 }, 66*3d40d63aSnicm { .wc = 0x1F1ED, .width = 2 }, 67*3d40d63aSnicm { .wc = 0x1F1EE, .width = 2 }, 68*3d40d63aSnicm { .wc = 0x1F1EF, .width = 2 }, 69*3d40d63aSnicm { .wc = 0x1F1F0, .width = 2 }, 70*3d40d63aSnicm { .wc = 0x1F1F1, .width = 2 }, 71*3d40d63aSnicm { .wc = 0x1F1F2, .width = 2 }, 72*3d40d63aSnicm { .wc = 0x1F1F3, .width = 2 }, 73*3d40d63aSnicm { .wc = 0x1F1F4, .width = 2 }, 74*3d40d63aSnicm { .wc = 0x1F1F5, .width = 2 }, 75*3d40d63aSnicm { .wc = 0x1F1F6, .width = 2 }, 76*3d40d63aSnicm { .wc = 0x1F1F7, .width = 2 }, 77*3d40d63aSnicm { .wc = 0x1F1F8, .width = 2 }, 78*3d40d63aSnicm { .wc = 0x1F1F9, .width = 2 }, 79*3d40d63aSnicm { .wc = 0x1F1FA, .width = 2 }, 80*3d40d63aSnicm { .wc = 0x1F1FB, .width = 2 }, 81*3d40d63aSnicm { .wc = 0x1F1FC, .width = 2 }, 82*3d40d63aSnicm { .wc = 0x1F1FD, .width = 2 }, 83*3d40d63aSnicm { .wc = 0x1F1FE, .width = 2 }, 84*3d40d63aSnicm { .wc = 0x1F1FF, .width = 2 }, 85*3d40d63aSnicm { .wc = 0x1F385, .width = 2 }, 86*3d40d63aSnicm { .wc = 0x1F3C2, .width = 2 }, 87*3d40d63aSnicm { .wc = 0x1F3C3, .width = 2 }, 88*3d40d63aSnicm { .wc = 0x1F3C4, .width = 2 }, 89*3d40d63aSnicm { .wc = 0x1F3C7, .width = 2 }, 90*3d40d63aSnicm { .wc = 0x1F3CA, .width = 2 }, 91*3d40d63aSnicm { .wc = 0x1F3CB, .width = 2 }, 92*3d40d63aSnicm { .wc = 0x1F3CC, .width = 2 }, 93*3d40d63aSnicm { .wc = 0x1F3FB, .width = 2 }, 94*3d40d63aSnicm { .wc = 0x1F3FC, .width = 2 }, 95*3d40d63aSnicm { .wc = 0x1F3FD, .width = 2 }, 96*3d40d63aSnicm { .wc = 0x1F3FE, .width = 2 }, 97*3d40d63aSnicm { .wc = 0x1F3FF, .width = 2 }, 98*3d40d63aSnicm { .wc = 0x1F442, .width = 2 }, 99*3d40d63aSnicm { .wc = 0x1F443, .width = 2 }, 100*3d40d63aSnicm { .wc = 0x1F446, .width = 2 }, 101*3d40d63aSnicm { .wc = 0x1F447, .width = 2 }, 102*3d40d63aSnicm { .wc = 0x1F448, .width = 2 }, 103*3d40d63aSnicm { .wc = 0x1F449, .width = 2 }, 104*3d40d63aSnicm { .wc = 0x1F44A, .width = 2 }, 105*3d40d63aSnicm { .wc = 0x1F44B, .width = 2 }, 106*3d40d63aSnicm { .wc = 0x1F44C, .width = 2 }, 107*3d40d63aSnicm { .wc = 0x1F44D, .width = 2 }, 108*3d40d63aSnicm { .wc = 0x1F44E, .width = 2 }, 109*3d40d63aSnicm { .wc = 0x1F44F, .width = 2 }, 110*3d40d63aSnicm { .wc = 0x1F450, .width = 2 }, 111*3d40d63aSnicm { .wc = 0x1F466, .width = 2 }, 112*3d40d63aSnicm { .wc = 0x1F467, .width = 2 }, 113*3d40d63aSnicm { .wc = 0x1F468, .width = 2 }, 114*3d40d63aSnicm { .wc = 0x1F469, .width = 2 }, 115*3d40d63aSnicm { .wc = 0x1F46B, .width = 2 }, 116*3d40d63aSnicm { .wc = 0x1F46C, .width = 2 }, 117*3d40d63aSnicm { .wc = 0x1F46D, .width = 2 }, 118*3d40d63aSnicm { .wc = 0x1F46E, .width = 2 }, 119*3d40d63aSnicm { .wc = 0x1F470, .width = 2 }, 120*3d40d63aSnicm { .wc = 0x1F471, .width = 2 }, 121*3d40d63aSnicm { .wc = 0x1F472, .width = 2 }, 122*3d40d63aSnicm { .wc = 0x1F473, .width = 2 }, 123*3d40d63aSnicm { .wc = 0x1F474, .width = 2 }, 124*3d40d63aSnicm { .wc = 0x1F475, .width = 2 }, 125*3d40d63aSnicm { .wc = 0x1F476, .width = 2 }, 126*3d40d63aSnicm { .wc = 0x1F477, .width = 2 }, 127*3d40d63aSnicm { .wc = 0x1F478, .width = 2 }, 128*3d40d63aSnicm { .wc = 0x1F47C, .width = 2 }, 129*3d40d63aSnicm { .wc = 0x1F481, .width = 2 }, 130*3d40d63aSnicm { .wc = 0x1F482, .width = 2 }, 131*3d40d63aSnicm { .wc = 0x1F483, .width = 2 }, 132*3d40d63aSnicm { .wc = 0x1F485, .width = 2 }, 133*3d40d63aSnicm { .wc = 0x1F486, .width = 2 }, 134*3d40d63aSnicm { .wc = 0x1F487, .width = 2 }, 135*3d40d63aSnicm { .wc = 0x1F48F, .width = 2 }, 136*3d40d63aSnicm { .wc = 0x1F491, .width = 2 }, 137*3d40d63aSnicm { .wc = 0x1F4AA, .width = 2 }, 138*3d40d63aSnicm { .wc = 0x1F574, .width = 2 }, 139*3d40d63aSnicm { .wc = 0x1F575, .width = 2 }, 140*3d40d63aSnicm { .wc = 0x1F57A, .width = 2 }, 141*3d40d63aSnicm { .wc = 0x1F590, .width = 2 }, 142*3d40d63aSnicm { .wc = 0x1F595, .width = 2 }, 143*3d40d63aSnicm { .wc = 0x1F596, .width = 2 }, 144*3d40d63aSnicm { .wc = 0x1F645, .width = 2 }, 145*3d40d63aSnicm { .wc = 0x1F646, .width = 2 }, 146*3d40d63aSnicm { .wc = 0x1F647, .width = 2 }, 147*3d40d63aSnicm { .wc = 0x1F64B, .width = 2 }, 148*3d40d63aSnicm { .wc = 0x1F64C, .width = 2 }, 149*3d40d63aSnicm { .wc = 0x1F64D, .width = 2 }, 150*3d40d63aSnicm { .wc = 0x1F64E, .width = 2 }, 151*3d40d63aSnicm { .wc = 0x1F64F, .width = 2 }, 152*3d40d63aSnicm { .wc = 0x1F6A3, .width = 2 }, 153*3d40d63aSnicm { .wc = 0x1F6B4, .width = 2 }, 154*3d40d63aSnicm { .wc = 0x1F6B5, .width = 2 }, 155*3d40d63aSnicm { .wc = 0x1F6B6, .width = 2 }, 156*3d40d63aSnicm { .wc = 0x1F6C0, .width = 2 }, 157*3d40d63aSnicm { .wc = 0x1F6CC, .width = 2 }, 158*3d40d63aSnicm { .wc = 0x1F90C, .width = 2 }, 159*3d40d63aSnicm { .wc = 0x1F90F, .width = 2 }, 160*3d40d63aSnicm { .wc = 0x1F918, .width = 2 }, 161*3d40d63aSnicm { .wc = 0x1F919, .width = 2 }, 162*3d40d63aSnicm { .wc = 0x1F91A, .width = 2 }, 163*3d40d63aSnicm { .wc = 0x1F91B, .width = 2 }, 164*3d40d63aSnicm { .wc = 0x1F91C, .width = 2 }, 165*3d40d63aSnicm { .wc = 0x1F91D, .width = 2 }, 166*3d40d63aSnicm { .wc = 0x1F91E, .width = 2 }, 167*3d40d63aSnicm { .wc = 0x1F91F, .width = 2 }, 168*3d40d63aSnicm { .wc = 0x1F926, .width = 2 }, 169*3d40d63aSnicm { .wc = 0x1F930, .width = 2 }, 170*3d40d63aSnicm { .wc = 0x1F931, .width = 2 }, 171*3d40d63aSnicm { .wc = 0x1F932, .width = 2 }, 172*3d40d63aSnicm { .wc = 0x1F933, .width = 2 }, 173*3d40d63aSnicm { .wc = 0x1F934, .width = 2 }, 174*3d40d63aSnicm { .wc = 0x1F935, .width = 2 }, 175*3d40d63aSnicm { .wc = 0x1F936, .width = 2 }, 176*3d40d63aSnicm { .wc = 0x1F937, .width = 2 }, 177*3d40d63aSnicm { .wc = 0x1F938, .width = 2 }, 178*3d40d63aSnicm { .wc = 0x1F939, .width = 2 }, 179*3d40d63aSnicm { .wc = 0x1F93D, .width = 2 }, 180*3d40d63aSnicm { .wc = 0x1F93E, .width = 2 }, 181*3d40d63aSnicm { .wc = 0x1F977, .width = 2 }, 182*3d40d63aSnicm { .wc = 0x1F9B5, .width = 2 }, 183*3d40d63aSnicm { .wc = 0x1F9B6, .width = 2 }, 184*3d40d63aSnicm { .wc = 0x1F9B8, .width = 2 }, 185*3d40d63aSnicm { .wc = 0x1F9B9, .width = 2 }, 186*3d40d63aSnicm { .wc = 0x1F9BB, .width = 2 }, 187*3d40d63aSnicm { .wc = 0x1F9CD, .width = 2 }, 188*3d40d63aSnicm { .wc = 0x1F9CE, .width = 2 }, 189*3d40d63aSnicm { .wc = 0x1F9CF, .width = 2 }, 190*3d40d63aSnicm { .wc = 0x1F9D1, .width = 2 }, 191*3d40d63aSnicm { .wc = 0x1F9D2, .width = 2 }, 192*3d40d63aSnicm { .wc = 0x1F9D3, .width = 2 }, 193*3d40d63aSnicm { .wc = 0x1F9D4, .width = 2 }, 194*3d40d63aSnicm { .wc = 0x1F9D5, .width = 2 }, 195*3d40d63aSnicm { .wc = 0x1F9D6, .width = 2 }, 196*3d40d63aSnicm { .wc = 0x1F9D7, .width = 2 }, 197*3d40d63aSnicm { .wc = 0x1F9D8, .width = 2 }, 198*3d40d63aSnicm { .wc = 0x1F9D9, .width = 2 }, 199*3d40d63aSnicm { .wc = 0x1F9DA, .width = 2 }, 200*3d40d63aSnicm { .wc = 0x1F9DB, .width = 2 }, 201*3d40d63aSnicm { .wc = 0x1F9DC, .width = 2 }, 202*3d40d63aSnicm { .wc = 0x1F9DD, .width = 2 }, 203*3d40d63aSnicm { .wc = 0x1FAC3, .width = 2 }, 204*3d40d63aSnicm { .wc = 0x1FAC4, .width = 2 }, 205*3d40d63aSnicm { .wc = 0x1FAC5, .width = 2 }, 206*3d40d63aSnicm { .wc = 0x1FAF0, .width = 2 }, 207*3d40d63aSnicm { .wc = 0x1FAF1, .width = 2 }, 208*3d40d63aSnicm { .wc = 0x1FAF2, .width = 2 }, 209*3d40d63aSnicm { .wc = 0x1FAF3, .width = 2 }, 210*3d40d63aSnicm { .wc = 0x1FAF4, .width = 2 }, 211*3d40d63aSnicm { .wc = 0x1FAF5, .width = 2 }, 212*3d40d63aSnicm { .wc = 0x1FAF6, .width = 2 }, 213*3d40d63aSnicm { .wc = 0x1FAF7, .width = 2 }, 214*3d40d63aSnicm { .wc = 0x1FAF8, .width = 2 } 2152af49740Snicm }; 2162af49740Snicm 21770a57860Snicm struct utf8_item { 218c0b83f5fSnicm RB_ENTRY(utf8_item) index_entry; 219c0b83f5fSnicm u_int index; 2205832c8deSnicm 221c0b83f5fSnicm RB_ENTRY(utf8_item) data_entry; 2225832c8deSnicm char data[UTF8_SIZE]; 2235832c8deSnicm u_char size; 2245832c8deSnicm }; 2255832c8deSnicm 2265832c8deSnicm static int 227c0b83f5fSnicm utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2) 2285832c8deSnicm { 22970a57860Snicm if (ui1->size < ui2->size) 2305832c8deSnicm return (-1); 23170a57860Snicm if (ui1->size > ui2->size) 2325832c8deSnicm return (1); 23370a57860Snicm return (memcmp(ui1->data, ui2->data, ui1->size)); 2345832c8deSnicm } 235c0b83f5fSnicm RB_HEAD(utf8_data_tree, utf8_item); 236c0b83f5fSnicm RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp); 237c0b83f5fSnicm static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree); 2385832c8deSnicm 239c0b83f5fSnicm static int 240c0b83f5fSnicm utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2) 241c0b83f5fSnicm { 242c0b83f5fSnicm if (ui1->index < ui2->index) 243c0b83f5fSnicm return (-1); 244c0b83f5fSnicm if (ui1->index > ui2->index) 245c0b83f5fSnicm return (1); 246c0b83f5fSnicm return (0); 247c0b83f5fSnicm } 248c0b83f5fSnicm RB_HEAD(utf8_index_tree, utf8_item); 249c0b83f5fSnicm RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp); 250c0b83f5fSnicm static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree); 251c0b83f5fSnicm 252*3d40d63aSnicm static int utf8_no_width; 253c0b83f5fSnicm static u_int utf8_next_index; 2545832c8deSnicm 255734270a0Snicm #define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f) 2568f36458cSnicm #define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1) 2575832c8deSnicm 258734270a0Snicm #define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24) 259734270a0Snicm #define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29) 2605832c8deSnicm 261c0b83f5fSnicm /* Get a UTF-8 item from data. */ 26270a57860Snicm static struct utf8_item * 263423d19d0Snicm utf8_item_by_data(const u_char *data, size_t size) 2645832c8deSnicm { 26570a57860Snicm struct utf8_item ui; 2665832c8deSnicm 26770a57860Snicm memcpy(ui.data, data, size); 26870a57860Snicm ui.size = size; 2695832c8deSnicm 270c0b83f5fSnicm return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui)); 2715832c8deSnicm } 2725832c8deSnicm 273c0b83f5fSnicm /* Get a UTF-8 item from data. */ 274c0b83f5fSnicm static struct utf8_item * 275c0b83f5fSnicm utf8_item_by_index(u_int index) 2765832c8deSnicm { 277c0b83f5fSnicm struct utf8_item ui; 278c0b83f5fSnicm 279c0b83f5fSnicm ui.index = index; 280c0b83f5fSnicm 281c0b83f5fSnicm return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui)); 2825832c8deSnicm } 2835832c8deSnicm 284*3d40d63aSnicm /* Find a codepoint in the cache. */ 285*3d40d63aSnicm static struct utf8_width_item * 286*3d40d63aSnicm utf8_find_in_width_cache(wchar_t wc) 287*3d40d63aSnicm { 288*3d40d63aSnicm struct utf8_width_item uw; 289*3d40d63aSnicm 290*3d40d63aSnicm uw.wc = wc; 291*3d40d63aSnicm return RB_FIND(utf8_width_cache, &utf8_width_cache, &uw); 292*3d40d63aSnicm } 293*3d40d63aSnicm 294*3d40d63aSnicm /* Parse a single codepoint option. */ 295*3d40d63aSnicm static void 296*3d40d63aSnicm utf8_add_to_width_cache(const char *s) 297*3d40d63aSnicm { 298*3d40d63aSnicm struct utf8_width_item *uw, *old; 299*3d40d63aSnicm char *copy, *cp, *endptr; 300*3d40d63aSnicm u_int width; 301*3d40d63aSnicm const char *errstr; 302*3d40d63aSnicm struct utf8_data *ud; 303*3d40d63aSnicm wchar_t wc; 304*3d40d63aSnicm unsigned long long n; 305*3d40d63aSnicm 306*3d40d63aSnicm copy = xstrdup(s); 307*3d40d63aSnicm if ((cp = strchr(copy, '=')) == NULL) { 308*3d40d63aSnicm free(copy); 309*3d40d63aSnicm return; 310*3d40d63aSnicm } 311*3d40d63aSnicm *cp++ = '\0'; 312*3d40d63aSnicm 313*3d40d63aSnicm width = strtonum(cp, 0, 2, &errstr); 314*3d40d63aSnicm if (errstr != NULL) { 315*3d40d63aSnicm free(copy); 316*3d40d63aSnicm return; 317*3d40d63aSnicm } 318*3d40d63aSnicm 319*3d40d63aSnicm if (strncmp(copy, "U+", 2) == 0) { 320*3d40d63aSnicm errno = 0; 321*3d40d63aSnicm n = strtoull(copy + 2, &endptr, 16); 322*3d40d63aSnicm if (copy[2] == '\0' || 323*3d40d63aSnicm *endptr != '\0' || 324*3d40d63aSnicm n == 0 || 325*3d40d63aSnicm n > WCHAR_MAX || 326*3d40d63aSnicm (errno == ERANGE && n == ULLONG_MAX)) { 327*3d40d63aSnicm free(copy); 328*3d40d63aSnicm return; 329*3d40d63aSnicm } 330*3d40d63aSnicm wc = n; 331*3d40d63aSnicm } else { 332*3d40d63aSnicm utf8_no_width = 1; 333*3d40d63aSnicm ud = utf8_fromcstr(copy); 334*3d40d63aSnicm utf8_no_width = 0; 335*3d40d63aSnicm if (ud[0].size == 0 || ud[1].size != 0) { 336*3d40d63aSnicm free(ud); 337*3d40d63aSnicm free(copy); 338*3d40d63aSnicm return; 339*3d40d63aSnicm } 340*3d40d63aSnicm #ifdef HAVE_UTF8PROC 341*3d40d63aSnicm if (utf8proc_mbtowc(&wc, ud[0].data, ud[0].size) <= 0) { 342*3d40d63aSnicm #else 343*3d40d63aSnicm if (mbtowc(&wc, ud[0].data, ud[0].size) <= 0) { 344*3d40d63aSnicm #endif 345*3d40d63aSnicm free(ud); 346*3d40d63aSnicm free(copy); 347*3d40d63aSnicm return; 348*3d40d63aSnicm } 349*3d40d63aSnicm free(ud); 350*3d40d63aSnicm } 351*3d40d63aSnicm 352*3d40d63aSnicm log_debug("Unicode width cache: %08X=%u", (u_int)wc, width); 353*3d40d63aSnicm 354*3d40d63aSnicm uw = xcalloc(1, sizeof *uw); 355*3d40d63aSnicm uw->wc = wc; 356*3d40d63aSnicm uw->width = width; 357*3d40d63aSnicm uw->allocated = 1; 358*3d40d63aSnicm 359*3d40d63aSnicm old = RB_INSERT(utf8_width_cache, &utf8_width_cache, uw); 360*3d40d63aSnicm if (old != NULL) { 361*3d40d63aSnicm RB_REMOVE(utf8_width_cache, &utf8_width_cache, old); 362*3d40d63aSnicm if (old->allocated) 363*3d40d63aSnicm free(old); 364*3d40d63aSnicm RB_INSERT(utf8_width_cache, &utf8_width_cache, uw); 365*3d40d63aSnicm } 366*3d40d63aSnicm 367*3d40d63aSnicm free(copy); 368*3d40d63aSnicm } 369*3d40d63aSnicm 370*3d40d63aSnicm /* Rebuild cache of widths. */ 371*3d40d63aSnicm void 372*3d40d63aSnicm utf8_update_width_cache(void) 373*3d40d63aSnicm { 374*3d40d63aSnicm struct utf8_width_item *uw, *uw1; 375*3d40d63aSnicm struct options_entry *o; 376*3d40d63aSnicm struct options_array_item *a; 377*3d40d63aSnicm u_int i; 378*3d40d63aSnicm 379*3d40d63aSnicm RB_FOREACH_SAFE (uw, utf8_width_cache, &utf8_width_cache, uw1) { 380*3d40d63aSnicm RB_REMOVE(utf8_width_cache, &utf8_width_cache, uw); 381*3d40d63aSnicm if (uw->allocated) 382*3d40d63aSnicm free(uw); 383*3d40d63aSnicm } 384*3d40d63aSnicm 385*3d40d63aSnicm for (i = 0; i < nitems(utf8_default_width_cache); i++) { 386*3d40d63aSnicm RB_INSERT(utf8_width_cache, &utf8_width_cache, 387*3d40d63aSnicm &utf8_default_width_cache[i]); 388*3d40d63aSnicm } 389*3d40d63aSnicm 390*3d40d63aSnicm o = options_get(global_options, "codepoint-widths"); 391*3d40d63aSnicm a = options_array_first(o); 392*3d40d63aSnicm while (a != NULL) { 393*3d40d63aSnicm utf8_add_to_width_cache(options_array_item_value(a)->string); 394*3d40d63aSnicm a = options_array_next(a); 395*3d40d63aSnicm } 396*3d40d63aSnicm } 397*3d40d63aSnicm 39870a57860Snicm /* Add a UTF-8 item. */ 39970a57860Snicm static int 4006af87e9aSnicm utf8_put_item(const u_char *data, size_t size, u_int *index) 4015832c8deSnicm { 40270a57860Snicm struct utf8_item *ui; 40370a57860Snicm 404c0b83f5fSnicm ui = utf8_item_by_data(data, size); 40570a57860Snicm if (ui != NULL) { 40610e1651aSnicm *index = ui->index; 407c0b83f5fSnicm log_debug("%s: found %.*s = %u", __func__, (int)size, data, 408c0b83f5fSnicm *index); 40970a57860Snicm return (0); 41070a57860Snicm } 41170a57860Snicm 412c0b83f5fSnicm if (utf8_next_index == 0xffffff + 1) 41370a57860Snicm return (-1); 41470a57860Snicm 415c0b83f5fSnicm ui = xcalloc(1, sizeof *ui); 416c0b83f5fSnicm ui->index = utf8_next_index++; 417c0b83f5fSnicm RB_INSERT(utf8_index_tree, &utf8_index_tree, ui); 418c0b83f5fSnicm 41970a57860Snicm memcpy(ui->data, data, size); 42070a57860Snicm ui->size = size; 421c0b83f5fSnicm RB_INSERT(utf8_data_tree, &utf8_data_tree, ui); 42270a57860Snicm 423c0b83f5fSnicm *index = ui->index; 42410e1651aSnicm log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index); 42570a57860Snicm return (0); 42670a57860Snicm } 42770a57860Snicm 42870a57860Snicm /* Get UTF-8 character from data. */ 42970a57860Snicm enum utf8_state 43070a57860Snicm utf8_from_data(const struct utf8_data *ud, utf8_char *uc) 43170a57860Snicm { 432c0b83f5fSnicm u_int index; 4335832c8deSnicm 434a49f5513Snicm if (ud->width > 2) 435051d3296Snicm fatalx("invalid UTF-8 width: %u", ud->width); 4365832c8deSnicm 437a49f5513Snicm if (ud->size > UTF8_SIZE) 4385832c8deSnicm goto fail; 439734270a0Snicm if (ud->size <= 3) { 440c0b83f5fSnicm index = (((utf8_char)ud->data[2] << 16)| 441734270a0Snicm ((utf8_char)ud->data[1] << 8)| 442734270a0Snicm ((utf8_char)ud->data[0])); 443c0b83f5fSnicm } else if (utf8_put_item(ud->data, ud->size, &index) != 0) 4445832c8deSnicm goto fail; 445c0b83f5fSnicm *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index; 446734270a0Snicm log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size, 447734270a0Snicm (int)ud->size, ud->data, *uc); 44870a57860Snicm return (UTF8_DONE); 4495832c8deSnicm 4505832c8deSnicm fail: 451a49f5513Snicm if (ud->width == 0) 452734270a0Snicm *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0); 453a49f5513Snicm else if (ud->width == 1) 454734270a0Snicm *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20; 45570a57860Snicm else 456734270a0Snicm *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020; 45770a57860Snicm return (UTF8_ERROR); 4585832c8deSnicm } 4595832c8deSnicm 46070a57860Snicm /* Get UTF-8 data from character. */ 4615832c8deSnicm void 46270a57860Snicm utf8_to_data(utf8_char uc, struct utf8_data *ud) 4635832c8deSnicm { 46470a57860Snicm struct utf8_item *ui; 465c0b83f5fSnicm u_int index; 4665832c8deSnicm 4675832c8deSnicm memset(ud, 0, sizeof *ud); 468734270a0Snicm ud->size = ud->have = UTF8_GET_SIZE(uc); 469734270a0Snicm ud->width = UTF8_GET_WIDTH(uc); 4705832c8deSnicm 4715832c8deSnicm if (ud->size <= 3) { 472734270a0Snicm ud->data[2] = (uc >> 16); 473734270a0Snicm ud->data[1] = ((uc >> 8) & 0xff); 474734270a0Snicm ud->data[0] = (uc & 0xff); 475734270a0Snicm } else { 476c0b83f5fSnicm index = (uc & 0xffffff); 477c0b83f5fSnicm if ((ui = utf8_item_by_index(index)) == NULL) 4785832c8deSnicm memset(ud->data, ' ', ud->size); 479c0b83f5fSnicm else 48070a57860Snicm memcpy(ud->data, ui->data, ud->size); 4815832c8deSnicm } 4825832c8deSnicm 483734270a0Snicm log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size, 484734270a0Snicm (int)ud->size, ud->data); 485734270a0Snicm } 486734270a0Snicm 48770a57860Snicm /* Get UTF-8 character from a single ASCII character. */ 488eba5d769Snicm u_int 489a49f5513Snicm utf8_build_one(u_char ch) 4905832c8deSnicm { 491734270a0Snicm return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch); 4925832c8deSnicm } 4935832c8deSnicm 4944b2ce9a7Snicm /* Set a single character. */ 4954b2ce9a7Snicm void 4969b3c9bc5Snicm utf8_set(struct utf8_data *ud, u_char ch) 4974b2ce9a7Snicm { 4986eb338b3Snicm static const struct utf8_data empty = { { 0 }, 1, 1, 1 }; 499e931849fSnicm 5006eb338b3Snicm memcpy(ud, &empty, sizeof *ud); 5019b3c9bc5Snicm *ud->data = ch; 502e931849fSnicm } 503e931849fSnicm 504e931849fSnicm /* Copy UTF-8 character. */ 505e931849fSnicm void 506e931849fSnicm utf8_copy(struct utf8_data *to, const struct utf8_data *from) 507e931849fSnicm { 508e931849fSnicm u_int i; 509e931849fSnicm 510e931849fSnicm memcpy(to, from, sizeof *to); 511e931849fSnicm 512e931849fSnicm for (i = to->size; i < sizeof to->data; i++) 513e931849fSnicm to->data[i] = '\0'; 5144b2ce9a7Snicm } 5154b2ce9a7Snicm 51670a57860Snicm /* Get width of Unicode character. */ 5176852c63bSnicm static enum utf8_state 5186852c63bSnicm utf8_width(struct utf8_data *ud, int *width) 51970a57860Snicm { 520*3d40d63aSnicm struct utf8_width_item *uw; 5216852c63bSnicm wchar_t wc; 52270a57860Snicm 5232af49740Snicm if (utf8_towc(ud, &wc) != UTF8_DONE) 5246852c63bSnicm return (UTF8_ERROR); 525*3d40d63aSnicm uw = utf8_find_in_width_cache(wc); 526*3d40d63aSnicm if (uw != NULL) { 527*3d40d63aSnicm *width = uw->width; 528*3d40d63aSnicm log_debug("cached width for %08X is %d", (u_int)wc, *width); 5292af49740Snicm return (UTF8_DONE); 53070a57860Snicm } 5312af49740Snicm 5326852c63bSnicm *width = wcwidth(wc); 533ecd3a22eSnicm log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width); 534cc390fd4Snicm if (*width < 0) { 535cc390fd4Snicm /* 536cc390fd4Snicm * C1 control characters are nonprintable, so they are always 537cc390fd4Snicm * zero width. 538cc390fd4Snicm */ 539cc390fd4Snicm *width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1; 540cc390fd4Snicm } 541485d86f6Snicm if (*width >= 0 && *width <= 0xff) 5426852c63bSnicm return (UTF8_DONE); 543485d86f6Snicm return (UTF8_ERROR); 54470a57860Snicm } 54570a57860Snicm 5462af49740Snicm /* Convert UTF-8 character to wide character. */ 5472af49740Snicm enum utf8_state 5482af49740Snicm utf8_towc(const struct utf8_data *ud, wchar_t *wc) 5492af49740Snicm { 5502af49740Snicm switch (mbtowc(wc, ud->data, ud->size)) { 5512af49740Snicm case -1: 5522af49740Snicm log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data, 5532af49740Snicm errno); 5542af49740Snicm mbtowc(NULL, NULL, MB_CUR_MAX); 5552af49740Snicm return (UTF8_ERROR); 5562af49740Snicm case 0: 5572af49740Snicm return (UTF8_ERROR); 5582af49740Snicm } 5592af49740Snicm log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc); 5602af49740Snicm return (UTF8_DONE); 5612af49740Snicm } 5622af49740Snicm 563b843f94bSnicm /* Convert wide character to UTF-8 character. */ 564b843f94bSnicm enum utf8_state 565b843f94bSnicm utf8_fromwc(wchar_t wc, struct utf8_data *ud) 566b843f94bSnicm { 567b843f94bSnicm int size, width; 568b843f94bSnicm 569b843f94bSnicm size = wctomb(ud->data, wc); 570b843f94bSnicm if (size < 0) { 571b843f94bSnicm log_debug("UTF-8 %d, wctomb() %d", wc, errno); 572b843f94bSnicm wctomb(NULL, 0); 573b843f94bSnicm return (UTF8_ERROR); 574b843f94bSnicm } 575b843f94bSnicm if (size == 0) 576b843f94bSnicm return (UTF8_ERROR); 577b843f94bSnicm ud->size = ud->have = size; 578b843f94bSnicm if (utf8_width(ud, &width) == UTF8_DONE) { 579b843f94bSnicm ud->width = width; 580b843f94bSnicm return (UTF8_DONE); 581b843f94bSnicm } 582b843f94bSnicm return (UTF8_ERROR); 583b843f94bSnicm } 584b843f94bSnicm 58540cac527Snicm /* 58640cac527Snicm * Open UTF-8 sequence. 58740cac527Snicm * 58840cac527Snicm * 11000010-11011111 C2-DF start of 2-byte sequence 58940cac527Snicm * 11100000-11101111 E0-EF start of 3-byte sequence 59040cac527Snicm * 11110000-11110100 F0-F4 start of 4-byte sequence 59140cac527Snicm */ 59239d4fc02Snicm enum utf8_state 5939b3c9bc5Snicm utf8_open(struct utf8_data *ud, u_char ch) 59440cac527Snicm { 5959b3c9bc5Snicm memset(ud, 0, sizeof *ud); 59640cac527Snicm if (ch >= 0xc2 && ch <= 0xdf) 5979b3c9bc5Snicm ud->size = 2; 59840cac527Snicm else if (ch >= 0xe0 && ch <= 0xef) 5999b3c9bc5Snicm ud->size = 3; 60040cac527Snicm else if (ch >= 0xf0 && ch <= 0xf4) 6019b3c9bc5Snicm ud->size = 4; 60240cac527Snicm else 60339d4fc02Snicm return (UTF8_ERROR); 6049b3c9bc5Snicm utf8_append(ud, ch); 60539d4fc02Snicm return (UTF8_MORE); 60640cac527Snicm } 60740cac527Snicm 60839d4fc02Snicm /* Append character to UTF-8, closing if finished. */ 60939d4fc02Snicm enum utf8_state 6109b3c9bc5Snicm utf8_append(struct utf8_data *ud, u_char ch) 61140cac527Snicm { 61298da63d5Snicm int width; 61398da63d5Snicm 6149b3c9bc5Snicm if (ud->have >= ud->size) 61540cac527Snicm fatalx("UTF-8 character overflow"); 6169b3c9bc5Snicm if (ud->size > sizeof ud->data) 61740cac527Snicm fatalx("UTF-8 character size too large"); 61840cac527Snicm 61927a2633fSnicm if (ud->have != 0 && (ch & 0xc0) != 0x80) 62027a2633fSnicm ud->width = 0xff; 62127a2633fSnicm 6229b3c9bc5Snicm ud->data[ud->have++] = ch; 6239b3c9bc5Snicm if (ud->have != ud->size) 62439d4fc02Snicm return (UTF8_MORE); 62540cac527Snicm 626*3d40d63aSnicm if (!utf8_no_width) { 62727a2633fSnicm if (ud->width == 0xff) 62839d4fc02Snicm return (UTF8_ERROR); 6296852c63bSnicm if (utf8_width(ud, &width) != UTF8_DONE) 63098da63d5Snicm return (UTF8_ERROR); 63198da63d5Snicm ud->width = width; 632*3d40d63aSnicm } 63398da63d5Snicm 63439d4fc02Snicm return (UTF8_DONE); 635311827fbSnicm } 636311827fbSnicm 637dbbd1b46Snicm /* 638dbbd1b46Snicm * Encode len characters from src into dst, which is guaranteed to have four 639dbbd1b46Snicm * bytes available for each character from src (for \abc or UTF-8) plus space 640dbbd1b46Snicm * for \0. 641dbbd1b46Snicm */ 642dbbd1b46Snicm int 643dbbd1b46Snicm utf8_strvis(char *dst, const char *src, size_t len, int flag) 644dbbd1b46Snicm { 6459b3c9bc5Snicm struct utf8_data ud; 64670a57860Snicm const char *start = dst, *end = src + len; 64739d4fc02Snicm enum utf8_state more; 648dbbd1b46Snicm size_t i; 649dbbd1b46Snicm 650dbbd1b46Snicm while (src < end) { 65139d4fc02Snicm if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { 65239d4fc02Snicm while (++src < end && more == UTF8_MORE) 6539b3c9bc5Snicm more = utf8_append(&ud, *src); 65439d4fc02Snicm if (more == UTF8_DONE) { 655dbbd1b46Snicm /* UTF-8 character finished. */ 6569b3c9bc5Snicm for (i = 0; i < ud.size; i++) 6579b3c9bc5Snicm *dst++ = ud.data[i]; 658dbbd1b46Snicm continue; 65939d4fc02Snicm } 66027a2633fSnicm /* Not a complete, valid UTF-8 character. */ 6619b3c9bc5Snicm src -= ud.have; 662dbbd1b46Snicm } 6637e151e3fSnicm if ((flag & VIS_DQ) && src[0] == '$' && src < end - 1) { 664be9b6b3fSnicm if (isalpha((u_char)src[1]) || 665be9b6b3fSnicm src[1] == '_' || 666be9b6b3fSnicm src[1] == '{') 6675c131106Snicm *dst++ = '\\'; 6685c131106Snicm *dst++ = '$'; 6695c131106Snicm } else if (src < end - 1) 670dbbd1b46Snicm dst = vis(dst, src[0], flag, src[1]); 671dbbd1b46Snicm else if (src < end) 672dbbd1b46Snicm dst = vis(dst, src[0], flag, '\0'); 673dbbd1b46Snicm src++; 674dbbd1b46Snicm } 675dbbd1b46Snicm *dst = '\0'; 676dbbd1b46Snicm return (dst - start); 677dbbd1b46Snicm } 6784b2ce9a7Snicm 679f50390e0Snicm /* Same as utf8_strvis but allocate the buffer. */ 680f50390e0Snicm int 681f50390e0Snicm utf8_stravis(char **dst, const char *src, int flag) 682f50390e0Snicm { 683f50390e0Snicm char *buf; 684f50390e0Snicm int len; 685f50390e0Snicm 686f50390e0Snicm buf = xreallocarray(NULL, 4, strlen(src) + 1); 687f50390e0Snicm len = utf8_strvis(buf, src, strlen(src), flag); 688f50390e0Snicm 689f50390e0Snicm *dst = xrealloc(buf, len + 1); 690f50390e0Snicm return (len); 691f50390e0Snicm } 692f50390e0Snicm 6936523adafSnicm /* Same as utf8_strvis but allocate the buffer. */ 6946523adafSnicm int 6956523adafSnicm utf8_stravisx(char **dst, const char *src, size_t srclen, int flag) 6966523adafSnicm { 6976523adafSnicm char *buf; 6986523adafSnicm int len; 6996523adafSnicm 7006523adafSnicm buf = xreallocarray(NULL, 4, srclen + 1); 7016523adafSnicm len = utf8_strvis(buf, src, srclen, flag); 7026523adafSnicm 7036523adafSnicm *dst = xrealloc(buf, len + 1); 7046523adafSnicm return (len); 7056523adafSnicm } 7066523adafSnicm 7079d9ffcabSnicm /* Does this string contain anything that isn't valid UTF-8? */ 7089d9ffcabSnicm int 7099d9ffcabSnicm utf8_isvalid(const char *s) 7109d9ffcabSnicm { 7119d9ffcabSnicm struct utf8_data ud; 7129d9ffcabSnicm const char *end; 7139d9ffcabSnicm enum utf8_state more; 7149d9ffcabSnicm 7159d9ffcabSnicm end = s + strlen(s); 7169d9ffcabSnicm while (s < end) { 7179d9ffcabSnicm if ((more = utf8_open(&ud, *s)) == UTF8_MORE) { 7189d9ffcabSnicm while (++s < end && more == UTF8_MORE) 7199d9ffcabSnicm more = utf8_append(&ud, *s); 7209d9ffcabSnicm if (more == UTF8_DONE) 7219d9ffcabSnicm continue; 7229d9ffcabSnicm return (0); 7239d9ffcabSnicm } 7249d9ffcabSnicm if (*s < 0x20 || *s > 0x7e) 7259d9ffcabSnicm return (0); 7269d9ffcabSnicm s++; 7279d9ffcabSnicm } 7289d9ffcabSnicm return (1); 7299d9ffcabSnicm } 7309d9ffcabSnicm 7314b2ce9a7Snicm /* 73262f1fdfdSnicm * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free 73362f1fdfdSnicm * the returned string. Anything not valid printable ASCII or UTF-8 is 73462f1fdfdSnicm * stripped. 73562f1fdfdSnicm */ 73662f1fdfdSnicm char * 73762f1fdfdSnicm utf8_sanitize(const char *src) 73862f1fdfdSnicm { 73970a57860Snicm char *dst = NULL; 74070a57860Snicm size_t n = 0; 74139d4fc02Snicm enum utf8_state more; 7429b3c9bc5Snicm struct utf8_data ud; 74362f1fdfdSnicm u_int i; 74462f1fdfdSnicm 74562f1fdfdSnicm while (*src != '\0') { 74662f1fdfdSnicm dst = xreallocarray(dst, n + 1, sizeof *dst); 74739d4fc02Snicm if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { 74839d4fc02Snicm while (*++src != '\0' && more == UTF8_MORE) 7499b3c9bc5Snicm more = utf8_append(&ud, *src); 75039d4fc02Snicm if (more == UTF8_DONE) { 7519b3c9bc5Snicm dst = xreallocarray(dst, n + ud.width, 75262f1fdfdSnicm sizeof *dst); 7539b3c9bc5Snicm for (i = 0; i < ud.width; i++) 75462f1fdfdSnicm dst[n++] = '_'; 75562f1fdfdSnicm continue; 75662f1fdfdSnicm } 7579b3c9bc5Snicm src -= ud.have; 75862f1fdfdSnicm } 75962f1fdfdSnicm if (*src > 0x1f && *src < 0x7f) 76027a2633fSnicm dst[n++] = *src; 76139d4fc02Snicm else 76239d4fc02Snicm dst[n++] = '_'; 76362f1fdfdSnicm src++; 76462f1fdfdSnicm } 76562f1fdfdSnicm dst = xreallocarray(dst, n + 1, sizeof *dst); 76662f1fdfdSnicm dst[n] = '\0'; 76762f1fdfdSnicm return (dst); 76862f1fdfdSnicm } 76962f1fdfdSnicm 770746b61e4Snicm /* Get UTF-8 buffer length. */ 771746b61e4Snicm size_t 772746b61e4Snicm utf8_strlen(const struct utf8_data *s) 773746b61e4Snicm { 774746b61e4Snicm size_t i; 775746b61e4Snicm 776746b61e4Snicm for (i = 0; s[i].size != 0; i++) 777746b61e4Snicm /* nothing */; 778746b61e4Snicm return (i); 779746b61e4Snicm } 780746b61e4Snicm 781746b61e4Snicm /* Get UTF-8 string width. */ 782746b61e4Snicm u_int 783746b61e4Snicm utf8_strwidth(const struct utf8_data *s, ssize_t n) 784746b61e4Snicm { 785746b61e4Snicm ssize_t i; 78670a57860Snicm u_int width = 0; 787746b61e4Snicm 788746b61e4Snicm for (i = 0; s[i].size != 0; i++) { 789746b61e4Snicm if (n != -1 && n == i) 790746b61e4Snicm break; 791746b61e4Snicm width += s[i].width; 792746b61e4Snicm } 793746b61e4Snicm return (width); 794746b61e4Snicm } 795746b61e4Snicm 79662f1fdfdSnicm /* 7974b2ce9a7Snicm * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0. 7984b2ce9a7Snicm * Caller frees. 7994b2ce9a7Snicm */ 8004b2ce9a7Snicm struct utf8_data * 8014b2ce9a7Snicm utf8_fromcstr(const char *src) 8024b2ce9a7Snicm { 80370a57860Snicm struct utf8_data *dst = NULL; 80470a57860Snicm size_t n = 0; 80539d4fc02Snicm enum utf8_state more; 8064b2ce9a7Snicm 8074b2ce9a7Snicm while (*src != '\0') { 80864cf113cSnicm dst = xreallocarray(dst, n + 1, sizeof *dst); 80939d4fc02Snicm if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) { 81039d4fc02Snicm while (*++src != '\0' && more == UTF8_MORE) 8114b2ce9a7Snicm more = utf8_append(&dst[n], *src); 81239d4fc02Snicm if (more == UTF8_DONE) { 8134b2ce9a7Snicm n++; 8144b2ce9a7Snicm continue; 8154b2ce9a7Snicm } 8164b2ce9a7Snicm src -= dst[n].have; 8174b2ce9a7Snicm } 8184b2ce9a7Snicm utf8_set(&dst[n], *src); 8194b2ce9a7Snicm n++; 82027a2633fSnicm src++; 82127a2633fSnicm } 82264cf113cSnicm dst = xreallocarray(dst, n + 1, sizeof *dst); 8234b2ce9a7Snicm dst[n].size = 0; 8244b2ce9a7Snicm return (dst); 8254b2ce9a7Snicm } 8264b2ce9a7Snicm 8274b2ce9a7Snicm /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */ 8284b2ce9a7Snicm char * 8294b2ce9a7Snicm utf8_tocstr(struct utf8_data *src) 8304b2ce9a7Snicm { 83170a57860Snicm char *dst = NULL; 83270a57860Snicm size_t n = 0; 8334b2ce9a7Snicm 8344b2ce9a7Snicm for(; src->size != 0; src++) { 83564cf113cSnicm dst = xreallocarray(dst, n + src->size, 1); 8364b2ce9a7Snicm memcpy(dst + n, src->data, src->size); 8374b2ce9a7Snicm n += src->size; 8384b2ce9a7Snicm } 83964cf113cSnicm dst = xreallocarray(dst, n + 1, 1); 8404b2ce9a7Snicm dst[n] = '\0'; 8414b2ce9a7Snicm return (dst); 8424b2ce9a7Snicm } 8434b2ce9a7Snicm 8444b2ce9a7Snicm /* Get width of UTF-8 string. */ 8454b2ce9a7Snicm u_int 8464b2ce9a7Snicm utf8_cstrwidth(const char *s) 8474b2ce9a7Snicm { 8484b2ce9a7Snicm struct utf8_data tmp; 8494b2ce9a7Snicm u_int width; 85039d4fc02Snicm enum utf8_state more; 8514b2ce9a7Snicm 8524b2ce9a7Snicm width = 0; 8534b2ce9a7Snicm while (*s != '\0') { 85439d4fc02Snicm if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) { 85539d4fc02Snicm while (*++s != '\0' && more == UTF8_MORE) 8564b2ce9a7Snicm more = utf8_append(&tmp, *s); 85739d4fc02Snicm if (more == UTF8_DONE) { 8584b2ce9a7Snicm width += tmp.width; 8594b2ce9a7Snicm continue; 8604b2ce9a7Snicm } 8614b2ce9a7Snicm s -= tmp.have; 8624b2ce9a7Snicm } 86339d4fc02Snicm if (*s > 0x1f && *s != 0x7f) 8644b2ce9a7Snicm width++; 8654b2ce9a7Snicm s++; 8664b2ce9a7Snicm } 8674b2ce9a7Snicm return (width); 8684b2ce9a7Snicm } 8694b2ce9a7Snicm 870a318a7faSnicm /* Pad UTF-8 string to width on the left. Caller frees. */ 8711d1963bbSnicm char * 8721d1963bbSnicm utf8_padcstr(const char *s, u_int width) 8731d1963bbSnicm { 8741d1963bbSnicm size_t slen; 8751d1963bbSnicm char *out; 8761d1963bbSnicm u_int n, i; 8771d1963bbSnicm 8781d1963bbSnicm n = utf8_cstrwidth(s); 8791d1963bbSnicm if (n >= width) 8801d1963bbSnicm return (xstrdup(s)); 8811d1963bbSnicm 8821d1963bbSnicm slen = strlen(s); 8831d1963bbSnicm out = xmalloc(slen + 1 + (width - n)); 8841d1963bbSnicm memcpy(out, s, slen); 8851d1963bbSnicm for (i = n; i < width; i++) 8861d1963bbSnicm out[slen++] = ' '; 8871d1963bbSnicm out[slen] = '\0'; 8881d1963bbSnicm return (out); 8891d1963bbSnicm } 8907db4c597Snicm 891a318a7faSnicm /* Pad UTF-8 string to width on the right. Caller frees. */ 892a318a7faSnicm char * 893a318a7faSnicm utf8_rpadcstr(const char *s, u_int width) 894a318a7faSnicm { 895a318a7faSnicm size_t slen; 896a318a7faSnicm char *out; 897a318a7faSnicm u_int n, i; 898a318a7faSnicm 899a318a7faSnicm n = utf8_cstrwidth(s); 900a318a7faSnicm if (n >= width) 901a318a7faSnicm return (xstrdup(s)); 902a318a7faSnicm 903a318a7faSnicm slen = strlen(s); 904a318a7faSnicm out = xmalloc(slen + 1 + (width - n)); 905a318a7faSnicm for (i = 0; i < width - n; i++) 906a318a7faSnicm out[i] = ' '; 907a318a7faSnicm memcpy(out + i, s, slen); 908a318a7faSnicm out[i + slen] = '\0'; 909a318a7faSnicm return (out); 910a318a7faSnicm } 911a318a7faSnicm 9127db4c597Snicm int 9137db4c597Snicm utf8_cstrhas(const char *s, const struct utf8_data *ud) 9147db4c597Snicm { 9157db4c597Snicm struct utf8_data *copy, *loop; 9167db4c597Snicm int found = 0; 9177db4c597Snicm 9187db4c597Snicm copy = utf8_fromcstr(s); 9197db4c597Snicm for (loop = copy; loop->size != 0; loop++) { 9207db4c597Snicm if (loop->size != ud->size) 9217db4c597Snicm continue; 9227db4c597Snicm if (memcmp(loop->data, ud->data, loop->size) == 0) { 9237db4c597Snicm found = 1; 9247db4c597Snicm break; 9257db4c597Snicm } 9267db4c597Snicm } 9277db4c597Snicm free(copy); 9287db4c597Snicm 9297db4c597Snicm return (found); 9307db4c597Snicm } 931