xref: /openbsd-src/usr.bin/tmux/utf8.c (revision 3d40d63a87e7d477e956d9dfca1e5d50688e719c)
1*3d40d63aSnicm /* $OpenBSD: utf8.c,v 1.67 2025/01/01 15:17:36 nicm Exp $ */
2311827fbSnicm 
3311827fbSnicm /*
498ca8272Snicm  * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
5311827fbSnicm  *
6311827fbSnicm  * Permission to use, copy, modify, and distribute this software for any
7311827fbSnicm  * purpose with or without fee is hereby granted, provided that the above
8311827fbSnicm  * copyright notice and this permission notice appear in all copies.
9311827fbSnicm  *
10311827fbSnicm  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11311827fbSnicm  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12311827fbSnicm  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13311827fbSnicm  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14311827fbSnicm  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15311827fbSnicm  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16311827fbSnicm  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17311827fbSnicm  */
18311827fbSnicm 
19311827fbSnicm #include <sys/types.h>
20311827fbSnicm 
215c131106Snicm #include <ctype.h>
22eea13297Snicm #include <errno.h>
234b2ce9a7Snicm #include <stdlib.h>
24311827fbSnicm #include <string.h>
25dbbd1b46Snicm #include <vis.h>
26311827fbSnicm 
27311827fbSnicm #include "tmux.h"
28311827fbSnicm 
29*3d40d63aSnicm struct utf8_width_item {
30*3d40d63aSnicm 	wchar_t				wc;
31*3d40d63aSnicm 	u_int				width;
32*3d40d63aSnicm 	int				allocated;
33*3d40d63aSnicm 
34*3d40d63aSnicm 	RB_ENTRY(utf8_width_item)	entry;
35*3d40d63aSnicm };
36*3d40d63aSnicm 
37*3d40d63aSnicm static int
38*3d40d63aSnicm utf8_width_cache_cmp(struct utf8_width_item *uw1, struct utf8_width_item *uw2)
39*3d40d63aSnicm {
40*3d40d63aSnicm 	if (uw1->wc < uw2->wc)
41*3d40d63aSnicm 		return (-1);
42*3d40d63aSnicm 	if (uw1->wc > uw2->wc)
43*3d40d63aSnicm 		return (1);
44*3d40d63aSnicm 	return (0);
45*3d40d63aSnicm }
46*3d40d63aSnicm RB_HEAD(utf8_width_cache, utf8_width_item);
47*3d40d63aSnicm RB_GENERATE_STATIC(utf8_width_cache, utf8_width_item, entry,
48*3d40d63aSnicm     utf8_width_cache_cmp);
49*3d40d63aSnicm static struct utf8_width_cache utf8_width_cache =
50*3d40d63aSnicm     RB_INITIALIZER(utf8_width_cache);
51*3d40d63aSnicm 
52*3d40d63aSnicm static struct utf8_width_item utf8_default_width_cache[] = {
53*3d40d63aSnicm 	{ .wc = 0x0261D, .width = 2 },
54*3d40d63aSnicm 	{ .wc = 0x026F9, .width = 2 },
55*3d40d63aSnicm 	{ .wc = 0x0270A, .width = 2 },
56*3d40d63aSnicm 	{ .wc = 0x0270B, .width = 2 },
57*3d40d63aSnicm 	{ .wc = 0x0270C, .width = 2 },
58*3d40d63aSnicm 	{ .wc = 0x0270D, .width = 2 },
59*3d40d63aSnicm 	{ .wc = 0x1F1E6, .width = 2 },
60*3d40d63aSnicm 	{ .wc = 0x1F1E7, .width = 2 },
61*3d40d63aSnicm 	{ .wc = 0x1F1E8, .width = 2 },
62*3d40d63aSnicm 	{ .wc = 0x1F1E9, .width = 2 },
63*3d40d63aSnicm 	{ .wc = 0x1F1EA, .width = 2 },
64*3d40d63aSnicm 	{ .wc = 0x1F1EB, .width = 2 },
65*3d40d63aSnicm 	{ .wc = 0x1F1EC, .width = 2 },
66*3d40d63aSnicm 	{ .wc = 0x1F1ED, .width = 2 },
67*3d40d63aSnicm 	{ .wc = 0x1F1EE, .width = 2 },
68*3d40d63aSnicm 	{ .wc = 0x1F1EF, .width = 2 },
69*3d40d63aSnicm 	{ .wc = 0x1F1F0, .width = 2 },
70*3d40d63aSnicm 	{ .wc = 0x1F1F1, .width = 2 },
71*3d40d63aSnicm 	{ .wc = 0x1F1F2, .width = 2 },
72*3d40d63aSnicm 	{ .wc = 0x1F1F3, .width = 2 },
73*3d40d63aSnicm 	{ .wc = 0x1F1F4, .width = 2 },
74*3d40d63aSnicm 	{ .wc = 0x1F1F5, .width = 2 },
75*3d40d63aSnicm 	{ .wc = 0x1F1F6, .width = 2 },
76*3d40d63aSnicm 	{ .wc = 0x1F1F7, .width = 2 },
77*3d40d63aSnicm 	{ .wc = 0x1F1F8, .width = 2 },
78*3d40d63aSnicm 	{ .wc = 0x1F1F9, .width = 2 },
79*3d40d63aSnicm 	{ .wc = 0x1F1FA, .width = 2 },
80*3d40d63aSnicm 	{ .wc = 0x1F1FB, .width = 2 },
81*3d40d63aSnicm 	{ .wc = 0x1F1FC, .width = 2 },
82*3d40d63aSnicm 	{ .wc = 0x1F1FD, .width = 2 },
83*3d40d63aSnicm 	{ .wc = 0x1F1FE, .width = 2 },
84*3d40d63aSnicm 	{ .wc = 0x1F1FF, .width = 2 },
85*3d40d63aSnicm 	{ .wc = 0x1F385, .width = 2 },
86*3d40d63aSnicm 	{ .wc = 0x1F3C2, .width = 2 },
87*3d40d63aSnicm 	{ .wc = 0x1F3C3, .width = 2 },
88*3d40d63aSnicm 	{ .wc = 0x1F3C4, .width = 2 },
89*3d40d63aSnicm 	{ .wc = 0x1F3C7, .width = 2 },
90*3d40d63aSnicm 	{ .wc = 0x1F3CA, .width = 2 },
91*3d40d63aSnicm 	{ .wc = 0x1F3CB, .width = 2 },
92*3d40d63aSnicm 	{ .wc = 0x1F3CC, .width = 2 },
93*3d40d63aSnicm 	{ .wc = 0x1F3FB, .width = 2 },
94*3d40d63aSnicm 	{ .wc = 0x1F3FC, .width = 2 },
95*3d40d63aSnicm 	{ .wc = 0x1F3FD, .width = 2 },
96*3d40d63aSnicm 	{ .wc = 0x1F3FE, .width = 2 },
97*3d40d63aSnicm 	{ .wc = 0x1F3FF, .width = 2 },
98*3d40d63aSnicm 	{ .wc = 0x1F442, .width = 2 },
99*3d40d63aSnicm 	{ .wc = 0x1F443, .width = 2 },
100*3d40d63aSnicm 	{ .wc = 0x1F446, .width = 2 },
101*3d40d63aSnicm 	{ .wc = 0x1F447, .width = 2 },
102*3d40d63aSnicm 	{ .wc = 0x1F448, .width = 2 },
103*3d40d63aSnicm 	{ .wc = 0x1F449, .width = 2 },
104*3d40d63aSnicm 	{ .wc = 0x1F44A, .width = 2 },
105*3d40d63aSnicm 	{ .wc = 0x1F44B, .width = 2 },
106*3d40d63aSnicm 	{ .wc = 0x1F44C, .width = 2 },
107*3d40d63aSnicm 	{ .wc = 0x1F44D, .width = 2 },
108*3d40d63aSnicm 	{ .wc = 0x1F44E, .width = 2 },
109*3d40d63aSnicm 	{ .wc = 0x1F44F, .width = 2 },
110*3d40d63aSnicm 	{ .wc = 0x1F450, .width = 2 },
111*3d40d63aSnicm 	{ .wc = 0x1F466, .width = 2 },
112*3d40d63aSnicm 	{ .wc = 0x1F467, .width = 2 },
113*3d40d63aSnicm 	{ .wc = 0x1F468, .width = 2 },
114*3d40d63aSnicm 	{ .wc = 0x1F469, .width = 2 },
115*3d40d63aSnicm 	{ .wc = 0x1F46B, .width = 2 },
116*3d40d63aSnicm 	{ .wc = 0x1F46C, .width = 2 },
117*3d40d63aSnicm 	{ .wc = 0x1F46D, .width = 2 },
118*3d40d63aSnicm 	{ .wc = 0x1F46E, .width = 2 },
119*3d40d63aSnicm 	{ .wc = 0x1F470, .width = 2 },
120*3d40d63aSnicm 	{ .wc = 0x1F471, .width = 2 },
121*3d40d63aSnicm 	{ .wc = 0x1F472, .width = 2 },
122*3d40d63aSnicm 	{ .wc = 0x1F473, .width = 2 },
123*3d40d63aSnicm 	{ .wc = 0x1F474, .width = 2 },
124*3d40d63aSnicm 	{ .wc = 0x1F475, .width = 2 },
125*3d40d63aSnicm 	{ .wc = 0x1F476, .width = 2 },
126*3d40d63aSnicm 	{ .wc = 0x1F477, .width = 2 },
127*3d40d63aSnicm 	{ .wc = 0x1F478, .width = 2 },
128*3d40d63aSnicm 	{ .wc = 0x1F47C, .width = 2 },
129*3d40d63aSnicm 	{ .wc = 0x1F481, .width = 2 },
130*3d40d63aSnicm 	{ .wc = 0x1F482, .width = 2 },
131*3d40d63aSnicm 	{ .wc = 0x1F483, .width = 2 },
132*3d40d63aSnicm 	{ .wc = 0x1F485, .width = 2 },
133*3d40d63aSnicm 	{ .wc = 0x1F486, .width = 2 },
134*3d40d63aSnicm 	{ .wc = 0x1F487, .width = 2 },
135*3d40d63aSnicm 	{ .wc = 0x1F48F, .width = 2 },
136*3d40d63aSnicm 	{ .wc = 0x1F491, .width = 2 },
137*3d40d63aSnicm 	{ .wc = 0x1F4AA, .width = 2 },
138*3d40d63aSnicm 	{ .wc = 0x1F574, .width = 2 },
139*3d40d63aSnicm 	{ .wc = 0x1F575, .width = 2 },
140*3d40d63aSnicm 	{ .wc = 0x1F57A, .width = 2 },
141*3d40d63aSnicm 	{ .wc = 0x1F590, .width = 2 },
142*3d40d63aSnicm 	{ .wc = 0x1F595, .width = 2 },
143*3d40d63aSnicm 	{ .wc = 0x1F596, .width = 2 },
144*3d40d63aSnicm 	{ .wc = 0x1F645, .width = 2 },
145*3d40d63aSnicm 	{ .wc = 0x1F646, .width = 2 },
146*3d40d63aSnicm 	{ .wc = 0x1F647, .width = 2 },
147*3d40d63aSnicm 	{ .wc = 0x1F64B, .width = 2 },
148*3d40d63aSnicm 	{ .wc = 0x1F64C, .width = 2 },
149*3d40d63aSnicm 	{ .wc = 0x1F64D, .width = 2 },
150*3d40d63aSnicm 	{ .wc = 0x1F64E, .width = 2 },
151*3d40d63aSnicm 	{ .wc = 0x1F64F, .width = 2 },
152*3d40d63aSnicm 	{ .wc = 0x1F6A3, .width = 2 },
153*3d40d63aSnicm 	{ .wc = 0x1F6B4, .width = 2 },
154*3d40d63aSnicm 	{ .wc = 0x1F6B5, .width = 2 },
155*3d40d63aSnicm 	{ .wc = 0x1F6B6, .width = 2 },
156*3d40d63aSnicm 	{ .wc = 0x1F6C0, .width = 2 },
157*3d40d63aSnicm 	{ .wc = 0x1F6CC, .width = 2 },
158*3d40d63aSnicm 	{ .wc = 0x1F90C, .width = 2 },
159*3d40d63aSnicm 	{ .wc = 0x1F90F, .width = 2 },
160*3d40d63aSnicm 	{ .wc = 0x1F918, .width = 2 },
161*3d40d63aSnicm 	{ .wc = 0x1F919, .width = 2 },
162*3d40d63aSnicm 	{ .wc = 0x1F91A, .width = 2 },
163*3d40d63aSnicm 	{ .wc = 0x1F91B, .width = 2 },
164*3d40d63aSnicm 	{ .wc = 0x1F91C, .width = 2 },
165*3d40d63aSnicm 	{ .wc = 0x1F91D, .width = 2 },
166*3d40d63aSnicm 	{ .wc = 0x1F91E, .width = 2 },
167*3d40d63aSnicm 	{ .wc = 0x1F91F, .width = 2 },
168*3d40d63aSnicm 	{ .wc = 0x1F926, .width = 2 },
169*3d40d63aSnicm 	{ .wc = 0x1F930, .width = 2 },
170*3d40d63aSnicm 	{ .wc = 0x1F931, .width = 2 },
171*3d40d63aSnicm 	{ .wc = 0x1F932, .width = 2 },
172*3d40d63aSnicm 	{ .wc = 0x1F933, .width = 2 },
173*3d40d63aSnicm 	{ .wc = 0x1F934, .width = 2 },
174*3d40d63aSnicm 	{ .wc = 0x1F935, .width = 2 },
175*3d40d63aSnicm 	{ .wc = 0x1F936, .width = 2 },
176*3d40d63aSnicm 	{ .wc = 0x1F937, .width = 2 },
177*3d40d63aSnicm 	{ .wc = 0x1F938, .width = 2 },
178*3d40d63aSnicm 	{ .wc = 0x1F939, .width = 2 },
179*3d40d63aSnicm 	{ .wc = 0x1F93D, .width = 2 },
180*3d40d63aSnicm 	{ .wc = 0x1F93E, .width = 2 },
181*3d40d63aSnicm 	{ .wc = 0x1F977, .width = 2 },
182*3d40d63aSnicm 	{ .wc = 0x1F9B5, .width = 2 },
183*3d40d63aSnicm 	{ .wc = 0x1F9B6, .width = 2 },
184*3d40d63aSnicm 	{ .wc = 0x1F9B8, .width = 2 },
185*3d40d63aSnicm 	{ .wc = 0x1F9B9, .width = 2 },
186*3d40d63aSnicm 	{ .wc = 0x1F9BB, .width = 2 },
187*3d40d63aSnicm 	{ .wc = 0x1F9CD, .width = 2 },
188*3d40d63aSnicm 	{ .wc = 0x1F9CE, .width = 2 },
189*3d40d63aSnicm 	{ .wc = 0x1F9CF, .width = 2 },
190*3d40d63aSnicm 	{ .wc = 0x1F9D1, .width = 2 },
191*3d40d63aSnicm 	{ .wc = 0x1F9D2, .width = 2 },
192*3d40d63aSnicm 	{ .wc = 0x1F9D3, .width = 2 },
193*3d40d63aSnicm 	{ .wc = 0x1F9D4, .width = 2 },
194*3d40d63aSnicm 	{ .wc = 0x1F9D5, .width = 2 },
195*3d40d63aSnicm 	{ .wc = 0x1F9D6, .width = 2 },
196*3d40d63aSnicm 	{ .wc = 0x1F9D7, .width = 2 },
197*3d40d63aSnicm 	{ .wc = 0x1F9D8, .width = 2 },
198*3d40d63aSnicm 	{ .wc = 0x1F9D9, .width = 2 },
199*3d40d63aSnicm 	{ .wc = 0x1F9DA, .width = 2 },
200*3d40d63aSnicm 	{ .wc = 0x1F9DB, .width = 2 },
201*3d40d63aSnicm 	{ .wc = 0x1F9DC, .width = 2 },
202*3d40d63aSnicm 	{ .wc = 0x1F9DD, .width = 2 },
203*3d40d63aSnicm 	{ .wc = 0x1FAC3, .width = 2 },
204*3d40d63aSnicm 	{ .wc = 0x1FAC4, .width = 2 },
205*3d40d63aSnicm 	{ .wc = 0x1FAC5, .width = 2 },
206*3d40d63aSnicm 	{ .wc = 0x1FAF0, .width = 2 },
207*3d40d63aSnicm 	{ .wc = 0x1FAF1, .width = 2 },
208*3d40d63aSnicm 	{ .wc = 0x1FAF2, .width = 2 },
209*3d40d63aSnicm 	{ .wc = 0x1FAF3, .width = 2 },
210*3d40d63aSnicm 	{ .wc = 0x1FAF4, .width = 2 },
211*3d40d63aSnicm 	{ .wc = 0x1FAF5, .width = 2 },
212*3d40d63aSnicm 	{ .wc = 0x1FAF6, .width = 2 },
213*3d40d63aSnicm 	{ .wc = 0x1FAF7, .width = 2 },
214*3d40d63aSnicm 	{ .wc = 0x1FAF8, .width = 2 }
2152af49740Snicm };
2162af49740Snicm 
21770a57860Snicm struct utf8_item {
218c0b83f5fSnicm 	RB_ENTRY(utf8_item)	index_entry;
219c0b83f5fSnicm 	u_int			index;
2205832c8deSnicm 
221c0b83f5fSnicm 	RB_ENTRY(utf8_item)	data_entry;
2225832c8deSnicm 	char			data[UTF8_SIZE];
2235832c8deSnicm 	u_char			size;
2245832c8deSnicm };
2255832c8deSnicm 
2265832c8deSnicm static int
227c0b83f5fSnicm utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
2285832c8deSnicm {
22970a57860Snicm 	if (ui1->size < ui2->size)
2305832c8deSnicm 		return (-1);
23170a57860Snicm 	if (ui1->size > ui2->size)
2325832c8deSnicm 		return (1);
23370a57860Snicm 	return (memcmp(ui1->data, ui2->data, ui1->size));
2345832c8deSnicm }
235c0b83f5fSnicm RB_HEAD(utf8_data_tree, utf8_item);
236c0b83f5fSnicm RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
237c0b83f5fSnicm static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
2385832c8deSnicm 
239c0b83f5fSnicm static int
240c0b83f5fSnicm utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
241c0b83f5fSnicm {
242c0b83f5fSnicm 	if (ui1->index < ui2->index)
243c0b83f5fSnicm 		return (-1);
244c0b83f5fSnicm 	if (ui1->index > ui2->index)
245c0b83f5fSnicm 		return (1);
246c0b83f5fSnicm 	return (0);
247c0b83f5fSnicm }
248c0b83f5fSnicm RB_HEAD(utf8_index_tree, utf8_item);
249c0b83f5fSnicm RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
250c0b83f5fSnicm static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
251c0b83f5fSnicm 
252*3d40d63aSnicm static int	utf8_no_width;
253c0b83f5fSnicm static u_int	utf8_next_index;
2545832c8deSnicm 
/*
 * Packing of a utf8_char as used by these macros: the low 24 bits hold the
 * payload, bits 24-28 the size and the bits from 29 up hold width + 1.
 */

/* Extract fields from a packed character. */
#define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)
#define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)

/* Build the size and width fields of a packed character. */
#define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
#define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)
2605832c8deSnicm 
261c0b83f5fSnicm /* Get a UTF-8 item from data. */
26270a57860Snicm static struct utf8_item *
263423d19d0Snicm utf8_item_by_data(const u_char *data, size_t size)
2645832c8deSnicm {
26570a57860Snicm 	struct utf8_item	ui;
2665832c8deSnicm 
26770a57860Snicm 	memcpy(ui.data, data, size);
26870a57860Snicm 	ui.size = size;
2695832c8deSnicm 
270c0b83f5fSnicm 	return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
2715832c8deSnicm }
2725832c8deSnicm 
273c0b83f5fSnicm /* Get a UTF-8 item from data. */
274c0b83f5fSnicm static struct utf8_item *
275c0b83f5fSnicm utf8_item_by_index(u_int index)
2765832c8deSnicm {
277c0b83f5fSnicm 	struct utf8_item	ui;
278c0b83f5fSnicm 
279c0b83f5fSnicm 	ui.index = index;
280c0b83f5fSnicm 
281c0b83f5fSnicm 	return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
2825832c8deSnicm }
2835832c8deSnicm 
284*3d40d63aSnicm /* Find a codepoint in the cache. */
285*3d40d63aSnicm static struct utf8_width_item *
286*3d40d63aSnicm utf8_find_in_width_cache(wchar_t wc)
287*3d40d63aSnicm {
288*3d40d63aSnicm 	struct utf8_width_item	uw;
289*3d40d63aSnicm 
290*3d40d63aSnicm 	uw.wc = wc;
291*3d40d63aSnicm 	return RB_FIND(utf8_width_cache, &utf8_width_cache, &uw);
292*3d40d63aSnicm }
293*3d40d63aSnicm 
294*3d40d63aSnicm /* Parse a single codepoint option. */
295*3d40d63aSnicm static void
296*3d40d63aSnicm utf8_add_to_width_cache(const char *s)
297*3d40d63aSnicm {
298*3d40d63aSnicm 	struct utf8_width_item	*uw, *old;
299*3d40d63aSnicm 	char			*copy, *cp, *endptr;
300*3d40d63aSnicm 	u_int			 width;
301*3d40d63aSnicm 	const char		*errstr;
302*3d40d63aSnicm 	struct utf8_data	*ud;
303*3d40d63aSnicm 	wchar_t			 wc;
304*3d40d63aSnicm 	unsigned long long	 n;
305*3d40d63aSnicm 
306*3d40d63aSnicm 	copy = xstrdup(s);
307*3d40d63aSnicm 	if ((cp = strchr(copy, '=')) == NULL) {
308*3d40d63aSnicm 		free(copy);
309*3d40d63aSnicm 		return;
310*3d40d63aSnicm 	}
311*3d40d63aSnicm 	*cp++ = '\0';
312*3d40d63aSnicm 
313*3d40d63aSnicm 	width = strtonum(cp, 0, 2, &errstr);
314*3d40d63aSnicm 	if (errstr != NULL) {
315*3d40d63aSnicm 		free(copy);
316*3d40d63aSnicm 		return;
317*3d40d63aSnicm 	}
318*3d40d63aSnicm 
319*3d40d63aSnicm 	if (strncmp(copy, "U+", 2) == 0) {
320*3d40d63aSnicm 		errno = 0;
321*3d40d63aSnicm 		n = strtoull(copy + 2, &endptr, 16);
322*3d40d63aSnicm 		if (copy[2] == '\0' ||
323*3d40d63aSnicm 		    *endptr != '\0' ||
324*3d40d63aSnicm 		    n == 0 ||
325*3d40d63aSnicm 		    n > WCHAR_MAX ||
326*3d40d63aSnicm 		    (errno == ERANGE && n == ULLONG_MAX)) {
327*3d40d63aSnicm 			free(copy);
328*3d40d63aSnicm 			return;
329*3d40d63aSnicm 		}
330*3d40d63aSnicm 		wc = n;
331*3d40d63aSnicm 	} else {
332*3d40d63aSnicm 		utf8_no_width = 1;
333*3d40d63aSnicm 		ud = utf8_fromcstr(copy);
334*3d40d63aSnicm 		utf8_no_width = 0;
335*3d40d63aSnicm 		if (ud[0].size == 0 || ud[1].size != 0) {
336*3d40d63aSnicm 			free(ud);
337*3d40d63aSnicm 			free(copy);
338*3d40d63aSnicm 			return;
339*3d40d63aSnicm 		}
340*3d40d63aSnicm #ifdef HAVE_UTF8PROC
341*3d40d63aSnicm 		if (utf8proc_mbtowc(&wc, ud[0].data, ud[0].size) <= 0) {
342*3d40d63aSnicm #else
343*3d40d63aSnicm 		if (mbtowc(&wc, ud[0].data, ud[0].size) <= 0) {
344*3d40d63aSnicm #endif
345*3d40d63aSnicm 			free(ud);
346*3d40d63aSnicm 			free(copy);
347*3d40d63aSnicm 			return;
348*3d40d63aSnicm 		}
349*3d40d63aSnicm 		free(ud);
350*3d40d63aSnicm 	}
351*3d40d63aSnicm 
352*3d40d63aSnicm 	log_debug("Unicode width cache: %08X=%u", (u_int)wc, width);
353*3d40d63aSnicm 
354*3d40d63aSnicm 	uw = xcalloc(1, sizeof *uw);
355*3d40d63aSnicm 	uw->wc = wc;
356*3d40d63aSnicm 	uw->width = width;
357*3d40d63aSnicm 	uw->allocated = 1;
358*3d40d63aSnicm 
359*3d40d63aSnicm 	old = RB_INSERT(utf8_width_cache, &utf8_width_cache, uw);
360*3d40d63aSnicm 	if (old != NULL) {
361*3d40d63aSnicm 		RB_REMOVE(utf8_width_cache, &utf8_width_cache, old);
362*3d40d63aSnicm 		if (old->allocated)
363*3d40d63aSnicm 			free(old);
364*3d40d63aSnicm 		RB_INSERT(utf8_width_cache, &utf8_width_cache, uw);
365*3d40d63aSnicm 	}
366*3d40d63aSnicm 
367*3d40d63aSnicm 	free(copy);
368*3d40d63aSnicm }
369*3d40d63aSnicm 
370*3d40d63aSnicm /* Rebuild cache of widths. */
371*3d40d63aSnicm void
372*3d40d63aSnicm utf8_update_width_cache(void)
373*3d40d63aSnicm {
374*3d40d63aSnicm 	struct utf8_width_item		*uw, *uw1;
375*3d40d63aSnicm 	struct options_entry		*o;
376*3d40d63aSnicm 	struct options_array_item	*a;
377*3d40d63aSnicm 	u_int				 i;
378*3d40d63aSnicm 
379*3d40d63aSnicm 	RB_FOREACH_SAFE (uw, utf8_width_cache, &utf8_width_cache, uw1) {
380*3d40d63aSnicm 		RB_REMOVE(utf8_width_cache, &utf8_width_cache, uw);
381*3d40d63aSnicm 		if (uw->allocated)
382*3d40d63aSnicm 			free(uw);
383*3d40d63aSnicm 	}
384*3d40d63aSnicm 
385*3d40d63aSnicm 	for (i = 0; i < nitems(utf8_default_width_cache); i++) {
386*3d40d63aSnicm 		RB_INSERT(utf8_width_cache, &utf8_width_cache,
387*3d40d63aSnicm 		    &utf8_default_width_cache[i]);
388*3d40d63aSnicm 	}
389*3d40d63aSnicm 
390*3d40d63aSnicm 	o = options_get(global_options, "codepoint-widths");
391*3d40d63aSnicm 	a = options_array_first(o);
392*3d40d63aSnicm 	while (a != NULL) {
393*3d40d63aSnicm 		utf8_add_to_width_cache(options_array_item_value(a)->string);
394*3d40d63aSnicm 		a = options_array_next(a);
395*3d40d63aSnicm 	}
396*3d40d63aSnicm }
397*3d40d63aSnicm 
39870a57860Snicm /* Add a UTF-8 item. */
39970a57860Snicm static int
4006af87e9aSnicm utf8_put_item(const u_char *data, size_t size, u_int *index)
4015832c8deSnicm {
40270a57860Snicm 	struct utf8_item	*ui;
40370a57860Snicm 
404c0b83f5fSnicm 	ui = utf8_item_by_data(data, size);
40570a57860Snicm 	if (ui != NULL) {
40610e1651aSnicm 		*index = ui->index;
407c0b83f5fSnicm 		log_debug("%s: found %.*s = %u", __func__, (int)size, data,
408c0b83f5fSnicm 		    *index);
40970a57860Snicm 		return (0);
41070a57860Snicm 	}
41170a57860Snicm 
412c0b83f5fSnicm 	if (utf8_next_index == 0xffffff + 1)
41370a57860Snicm 		return (-1);
41470a57860Snicm 
415c0b83f5fSnicm 	ui = xcalloc(1, sizeof *ui);
416c0b83f5fSnicm 	ui->index = utf8_next_index++;
417c0b83f5fSnicm 	RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
418c0b83f5fSnicm 
41970a57860Snicm 	memcpy(ui->data, data, size);
42070a57860Snicm 	ui->size = size;
421c0b83f5fSnicm 	RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
42270a57860Snicm 
423c0b83f5fSnicm 	*index = ui->index;
42410e1651aSnicm 	log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
42570a57860Snicm 	return (0);
42670a57860Snicm }
42770a57860Snicm 
42870a57860Snicm /* Get UTF-8 character from data. */
42970a57860Snicm enum utf8_state
43070a57860Snicm utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
43170a57860Snicm {
432c0b83f5fSnicm 	u_int	index;
4335832c8deSnicm 
434a49f5513Snicm 	if (ud->width > 2)
435051d3296Snicm 		fatalx("invalid UTF-8 width: %u", ud->width);
4365832c8deSnicm 
437a49f5513Snicm 	if (ud->size > UTF8_SIZE)
4385832c8deSnicm 		goto fail;
439734270a0Snicm 	if (ud->size <= 3) {
440c0b83f5fSnicm 		index = (((utf8_char)ud->data[2] << 16)|
441734270a0Snicm 			  ((utf8_char)ud->data[1] << 8)|
442734270a0Snicm 			  ((utf8_char)ud->data[0]));
443c0b83f5fSnicm 	} else if (utf8_put_item(ud->data, ud->size, &index) != 0)
4445832c8deSnicm 		goto fail;
445c0b83f5fSnicm 	*uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
446734270a0Snicm 	log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
447734270a0Snicm 	    (int)ud->size, ud->data, *uc);
44870a57860Snicm 	return (UTF8_DONE);
4495832c8deSnicm 
4505832c8deSnicm fail:
451a49f5513Snicm 	if (ud->width == 0)
452734270a0Snicm 		*uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
453a49f5513Snicm 	else if (ud->width == 1)
454734270a0Snicm 		*uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
45570a57860Snicm 	else
456734270a0Snicm 		*uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
45770a57860Snicm 	return (UTF8_ERROR);
4585832c8deSnicm }
4595832c8deSnicm 
46070a57860Snicm /* Get UTF-8 data from character. */
4615832c8deSnicm void
46270a57860Snicm utf8_to_data(utf8_char uc, struct utf8_data *ud)
4635832c8deSnicm {
46470a57860Snicm 	struct utf8_item	*ui;
465c0b83f5fSnicm 	u_int			 index;
4665832c8deSnicm 
4675832c8deSnicm 	memset(ud, 0, sizeof *ud);
468734270a0Snicm 	ud->size = ud->have = UTF8_GET_SIZE(uc);
469734270a0Snicm 	ud->width = UTF8_GET_WIDTH(uc);
4705832c8deSnicm 
4715832c8deSnicm 	if (ud->size <= 3) {
472734270a0Snicm 		ud->data[2] = (uc >> 16);
473734270a0Snicm 		ud->data[1] = ((uc >> 8) & 0xff);
474734270a0Snicm 		ud->data[0] = (uc & 0xff);
475734270a0Snicm 	} else {
476c0b83f5fSnicm 		index = (uc & 0xffffff);
477c0b83f5fSnicm 		if ((ui = utf8_item_by_index(index)) == NULL)
4785832c8deSnicm 			memset(ud->data, ' ', ud->size);
479c0b83f5fSnicm 		else
48070a57860Snicm 			memcpy(ud->data, ui->data, ud->size);
4815832c8deSnicm 	}
4825832c8deSnicm 
483734270a0Snicm 	log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
484734270a0Snicm 	    (int)ud->size, ud->data);
485734270a0Snicm }
486734270a0Snicm 
48770a57860Snicm /* Get UTF-8 character from a single ASCII character. */
488eba5d769Snicm u_int
489a49f5513Snicm utf8_build_one(u_char ch)
4905832c8deSnicm {
491734270a0Snicm 	return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
4925832c8deSnicm }
4935832c8deSnicm 
4944b2ce9a7Snicm /* Set a single character. */
4954b2ce9a7Snicm void
4969b3c9bc5Snicm utf8_set(struct utf8_data *ud, u_char ch)
4974b2ce9a7Snicm {
4986eb338b3Snicm 	static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
499e931849fSnicm 
5006eb338b3Snicm 	memcpy(ud, &empty, sizeof *ud);
5019b3c9bc5Snicm 	*ud->data = ch;
502e931849fSnicm }
503e931849fSnicm 
504e931849fSnicm /* Copy UTF-8 character. */
505e931849fSnicm void
506e931849fSnicm utf8_copy(struct utf8_data *to, const struct utf8_data *from)
507e931849fSnicm {
508e931849fSnicm 	u_int	i;
509e931849fSnicm 
510e931849fSnicm 	memcpy(to, from, sizeof *to);
511e931849fSnicm 
512e931849fSnicm 	for (i = to->size; i < sizeof to->data; i++)
513e931849fSnicm 		to->data[i] = '\0';
5144b2ce9a7Snicm }
5154b2ce9a7Snicm 
51670a57860Snicm /* Get width of Unicode character. */
5176852c63bSnicm static enum utf8_state
5186852c63bSnicm utf8_width(struct utf8_data *ud, int *width)
51970a57860Snicm {
520*3d40d63aSnicm 	struct utf8_width_item	*uw;
5216852c63bSnicm 	wchar_t			 wc;
52270a57860Snicm 
5232af49740Snicm 	if (utf8_towc(ud, &wc) != UTF8_DONE)
5246852c63bSnicm 		return (UTF8_ERROR);
525*3d40d63aSnicm 	uw = utf8_find_in_width_cache(wc);
526*3d40d63aSnicm 	if (uw != NULL) {
527*3d40d63aSnicm 		*width = uw->width;
528*3d40d63aSnicm 		log_debug("cached width for %08X is %d", (u_int)wc, *width);
5292af49740Snicm 		return (UTF8_DONE);
53070a57860Snicm 	}
5312af49740Snicm 
5326852c63bSnicm 	*width = wcwidth(wc);
533ecd3a22eSnicm 	log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width);
534cc390fd4Snicm 	if (*width < 0) {
535cc390fd4Snicm 		/*
536cc390fd4Snicm 		 * C1 control characters are nonprintable, so they are always
537cc390fd4Snicm 		 * zero width.
538cc390fd4Snicm 		 */
539cc390fd4Snicm 		*width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1;
540cc390fd4Snicm 	}
541485d86f6Snicm 	if (*width >= 0 && *width <= 0xff)
5426852c63bSnicm 		return (UTF8_DONE);
543485d86f6Snicm 	return (UTF8_ERROR);
54470a57860Snicm }
54570a57860Snicm 
5462af49740Snicm /* Convert UTF-8 character to wide character. */
5472af49740Snicm enum utf8_state
5482af49740Snicm utf8_towc(const struct utf8_data *ud, wchar_t *wc)
5492af49740Snicm {
5502af49740Snicm 	switch (mbtowc(wc, ud->data, ud->size)) {
5512af49740Snicm 	case -1:
5522af49740Snicm 		log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
5532af49740Snicm 		    errno);
5542af49740Snicm 		mbtowc(NULL, NULL, MB_CUR_MAX);
5552af49740Snicm 		return (UTF8_ERROR);
5562af49740Snicm 	case 0:
5572af49740Snicm 		return (UTF8_ERROR);
5582af49740Snicm 	}
5592af49740Snicm 	log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc);
5602af49740Snicm 	return (UTF8_DONE);
5612af49740Snicm }
5622af49740Snicm 
563b843f94bSnicm /* Convert wide character to UTF-8 character. */
564b843f94bSnicm enum utf8_state
565b843f94bSnicm utf8_fromwc(wchar_t wc, struct utf8_data *ud)
566b843f94bSnicm {
567b843f94bSnicm 	int	size, width;
568b843f94bSnicm 
569b843f94bSnicm 	size = wctomb(ud->data, wc);
570b843f94bSnicm 	if (size < 0) {
571b843f94bSnicm 		log_debug("UTF-8 %d, wctomb() %d", wc, errno);
572b843f94bSnicm 		wctomb(NULL, 0);
573b843f94bSnicm 		return (UTF8_ERROR);
574b843f94bSnicm 	}
575b843f94bSnicm 	if (size == 0)
576b843f94bSnicm 		return (UTF8_ERROR);
577b843f94bSnicm 	ud->size = ud->have = size;
578b843f94bSnicm 	if (utf8_width(ud, &width) == UTF8_DONE) {
579b843f94bSnicm 		ud->width = width;
580b843f94bSnicm 		return (UTF8_DONE);
581b843f94bSnicm 	}
582b843f94bSnicm 	return (UTF8_ERROR);
583b843f94bSnicm }
584b843f94bSnicm 
58540cac527Snicm /*
58640cac527Snicm  * Open UTF-8 sequence.
58740cac527Snicm  *
58840cac527Snicm  * 11000010-11011111 C2-DF start of 2-byte sequence
58940cac527Snicm  * 11100000-11101111 E0-EF start of 3-byte sequence
59040cac527Snicm  * 11110000-11110100 F0-F4 start of 4-byte sequence
59140cac527Snicm  */
59239d4fc02Snicm enum utf8_state
5939b3c9bc5Snicm utf8_open(struct utf8_data *ud, u_char ch)
59440cac527Snicm {
5959b3c9bc5Snicm 	memset(ud, 0, sizeof *ud);
59640cac527Snicm 	if (ch >= 0xc2 && ch <= 0xdf)
5979b3c9bc5Snicm 		ud->size = 2;
59840cac527Snicm 	else if (ch >= 0xe0 && ch <= 0xef)
5999b3c9bc5Snicm 		ud->size = 3;
60040cac527Snicm 	else if (ch >= 0xf0 && ch <= 0xf4)
6019b3c9bc5Snicm 		ud->size = 4;
60240cac527Snicm 	else
60339d4fc02Snicm 		return (UTF8_ERROR);
6049b3c9bc5Snicm 	utf8_append(ud, ch);
60539d4fc02Snicm 	return (UTF8_MORE);
60640cac527Snicm }
60740cac527Snicm 
60839d4fc02Snicm /* Append character to UTF-8, closing if finished. */
60939d4fc02Snicm enum utf8_state
6109b3c9bc5Snicm utf8_append(struct utf8_data *ud, u_char ch)
61140cac527Snicm {
61298da63d5Snicm 	int	width;
61398da63d5Snicm 
6149b3c9bc5Snicm 	if (ud->have >= ud->size)
61540cac527Snicm 		fatalx("UTF-8 character overflow");
6169b3c9bc5Snicm 	if (ud->size > sizeof ud->data)
61740cac527Snicm 		fatalx("UTF-8 character size too large");
61840cac527Snicm 
61927a2633fSnicm 	if (ud->have != 0 && (ch & 0xc0) != 0x80)
62027a2633fSnicm 		ud->width = 0xff;
62127a2633fSnicm 
6229b3c9bc5Snicm 	ud->data[ud->have++] = ch;
6239b3c9bc5Snicm 	if (ud->have != ud->size)
62439d4fc02Snicm 		return (UTF8_MORE);
62540cac527Snicm 
626*3d40d63aSnicm 	if (!utf8_no_width) {
62727a2633fSnicm 		if (ud->width == 0xff)
62839d4fc02Snicm 			return (UTF8_ERROR);
6296852c63bSnicm 		if (utf8_width(ud, &width) != UTF8_DONE)
63098da63d5Snicm 			return (UTF8_ERROR);
63198da63d5Snicm 		ud->width = width;
632*3d40d63aSnicm 	}
63398da63d5Snicm 
63439d4fc02Snicm 	return (UTF8_DONE);
635311827fbSnicm }
636311827fbSnicm 
637dbbd1b46Snicm /*
638dbbd1b46Snicm  * Encode len characters from src into dst, which is guaranteed to have four
639dbbd1b46Snicm  * bytes available for each character from src (for \abc or UTF-8) plus space
640dbbd1b46Snicm  * for \0.
641dbbd1b46Snicm  */
642dbbd1b46Snicm int
643dbbd1b46Snicm utf8_strvis(char *dst, const char *src, size_t len, int flag)
644dbbd1b46Snicm {
6459b3c9bc5Snicm 	struct utf8_data	 ud;
64670a57860Snicm 	const char		*start = dst, *end = src + len;
64739d4fc02Snicm 	enum utf8_state		 more;
648dbbd1b46Snicm 	size_t			 i;
649dbbd1b46Snicm 
650dbbd1b46Snicm 	while (src < end) {
65139d4fc02Snicm 		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
65239d4fc02Snicm 			while (++src < end && more == UTF8_MORE)
6539b3c9bc5Snicm 				more = utf8_append(&ud, *src);
65439d4fc02Snicm 			if (more == UTF8_DONE) {
655dbbd1b46Snicm 				/* UTF-8 character finished. */
6569b3c9bc5Snicm 				for (i = 0; i < ud.size; i++)
6579b3c9bc5Snicm 					*dst++ = ud.data[i];
658dbbd1b46Snicm 				continue;
65939d4fc02Snicm 			}
66027a2633fSnicm 			/* Not a complete, valid UTF-8 character. */
6619b3c9bc5Snicm 			src -= ud.have;
662dbbd1b46Snicm 		}
6637e151e3fSnicm 		if ((flag & VIS_DQ) && src[0] == '$' && src < end - 1) {
664be9b6b3fSnicm 			if (isalpha((u_char)src[1]) ||
665be9b6b3fSnicm 			    src[1] == '_' ||
666be9b6b3fSnicm 			    src[1] == '{')
6675c131106Snicm 				*dst++ = '\\';
6685c131106Snicm 			*dst++ = '$';
6695c131106Snicm 		} else if (src < end - 1)
670dbbd1b46Snicm 			dst = vis(dst, src[0], flag, src[1]);
671dbbd1b46Snicm 		else if (src < end)
672dbbd1b46Snicm 			dst = vis(dst, src[0], flag, '\0');
673dbbd1b46Snicm 		src++;
674dbbd1b46Snicm 	}
675dbbd1b46Snicm 	*dst = '\0';
676dbbd1b46Snicm 	return (dst - start);
677dbbd1b46Snicm }
6784b2ce9a7Snicm 
/*
 * Same as utf8_strvis for a \0-terminated string, but allocate the buffer.
 * The result is stored in *dst for the caller to free; returns its length.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*buf;
	int	 len;

	/* Worst case: four bytes out for each byte in, plus the \0. */
	buf = xreallocarray(NULL, 4, srclen + 1);
	len = utf8_strvis(buf, src, srclen, flag);

	/* Trim the buffer to what was used. */
	*dst = xrealloc(buf, len + 1);
	return (len);
}
692f50390e0Snicm 
6936523adafSnicm /* Same as utf8_strvis but allocate the buffer. */
6946523adafSnicm int
6956523adafSnicm utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
6966523adafSnicm {
6976523adafSnicm 	char	*buf;
6986523adafSnicm 	int	 len;
6996523adafSnicm 
7006523adafSnicm 	buf = xreallocarray(NULL, 4, srclen + 1);
7016523adafSnicm 	len = utf8_strvis(buf, src, srclen, flag);
7026523adafSnicm 
7036523adafSnicm 	*dst = xrealloc(buf, len + 1);
7046523adafSnicm 	return (len);
7056523adafSnicm }
7066523adafSnicm 
7079d9ffcabSnicm /* Does this string contain anything that isn't valid UTF-8? */
7089d9ffcabSnicm int
7099d9ffcabSnicm utf8_isvalid(const char *s)
7109d9ffcabSnicm {
7119d9ffcabSnicm 	struct utf8_data ud;
7129d9ffcabSnicm 	const char	*end;
7139d9ffcabSnicm 	enum utf8_state	 more;
7149d9ffcabSnicm 
7159d9ffcabSnicm 	end = s + strlen(s);
7169d9ffcabSnicm 	while (s < end) {
7179d9ffcabSnicm 		if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
7189d9ffcabSnicm 			while (++s < end && more == UTF8_MORE)
7199d9ffcabSnicm 				more = utf8_append(&ud, *s);
7209d9ffcabSnicm 			if (more == UTF8_DONE)
7219d9ffcabSnicm 				continue;
7229d9ffcabSnicm 			return (0);
7239d9ffcabSnicm 		}
7249d9ffcabSnicm 		if (*s < 0x20 || *s > 0x7e)
7259d9ffcabSnicm 			return (0);
7269d9ffcabSnicm 		s++;
7279d9ffcabSnicm 	}
7289d9ffcabSnicm 	return (1);
7299d9ffcabSnicm }
7309d9ffcabSnicm 
7314b2ce9a7Snicm /*
73262f1fdfdSnicm  * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
73362f1fdfdSnicm  * the returned string. Anything not valid printable ASCII or UTF-8 is
73462f1fdfdSnicm  * stripped.
73562f1fdfdSnicm  */
73662f1fdfdSnicm char *
73762f1fdfdSnicm utf8_sanitize(const char *src)
73862f1fdfdSnicm {
73970a57860Snicm 	char		*dst = NULL;
74070a57860Snicm 	size_t		 n = 0;
74139d4fc02Snicm 	enum utf8_state	 more;
7429b3c9bc5Snicm 	struct utf8_data ud;
74362f1fdfdSnicm 	u_int		 i;
74462f1fdfdSnicm 
74562f1fdfdSnicm 	while (*src != '\0') {
74662f1fdfdSnicm 		dst = xreallocarray(dst, n + 1, sizeof *dst);
74739d4fc02Snicm 		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
74839d4fc02Snicm 			while (*++src != '\0' && more == UTF8_MORE)
7499b3c9bc5Snicm 				more = utf8_append(&ud, *src);
75039d4fc02Snicm 			if (more == UTF8_DONE) {
7519b3c9bc5Snicm 				dst = xreallocarray(dst, n + ud.width,
75262f1fdfdSnicm 				    sizeof *dst);
7539b3c9bc5Snicm 				for (i = 0; i < ud.width; i++)
75462f1fdfdSnicm 					dst[n++] = '_';
75562f1fdfdSnicm 				continue;
75662f1fdfdSnicm 			}
7579b3c9bc5Snicm 			src -= ud.have;
75862f1fdfdSnicm 		}
75962f1fdfdSnicm 		if (*src > 0x1f && *src < 0x7f)
76027a2633fSnicm 			dst[n++] = *src;
76139d4fc02Snicm 		else
76239d4fc02Snicm 			dst[n++] = '_';
76362f1fdfdSnicm 		src++;
76462f1fdfdSnicm 	}
76562f1fdfdSnicm 	dst = xreallocarray(dst, n + 1, sizeof *dst);
76662f1fdfdSnicm 	dst[n] = '\0';
76762f1fdfdSnicm 	return (dst);
76862f1fdfdSnicm }
76962f1fdfdSnicm 
770746b61e4Snicm /* Get UTF-8 buffer length. */
771746b61e4Snicm size_t
772746b61e4Snicm utf8_strlen(const struct utf8_data *s)
773746b61e4Snicm {
774746b61e4Snicm 	size_t	i;
775746b61e4Snicm 
776746b61e4Snicm 	for (i = 0; s[i].size != 0; i++)
777746b61e4Snicm 		/* nothing */;
778746b61e4Snicm 	return (i);
779746b61e4Snicm }
780746b61e4Snicm 
781746b61e4Snicm /* Get UTF-8 string width. */
782746b61e4Snicm u_int
783746b61e4Snicm utf8_strwidth(const struct utf8_data *s, ssize_t n)
784746b61e4Snicm {
785746b61e4Snicm 	ssize_t	i;
78670a57860Snicm 	u_int	width = 0;
787746b61e4Snicm 
788746b61e4Snicm 	for (i = 0; s[i].size != 0; i++) {
789746b61e4Snicm 		if (n != -1 && n == i)
790746b61e4Snicm 			break;
791746b61e4Snicm 		width += s[i].width;
792746b61e4Snicm 	}
793746b61e4Snicm 	return (width);
794746b61e4Snicm }
795746b61e4Snicm 
79662f1fdfdSnicm /*
7974b2ce9a7Snicm  * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
7984b2ce9a7Snicm  * Caller frees.
7994b2ce9a7Snicm  */
8004b2ce9a7Snicm struct utf8_data *
8014b2ce9a7Snicm utf8_fromcstr(const char *src)
8024b2ce9a7Snicm {
80370a57860Snicm 	struct utf8_data	*dst = NULL;
80470a57860Snicm 	size_t			 n = 0;
80539d4fc02Snicm 	enum utf8_state		 more;
8064b2ce9a7Snicm 
8074b2ce9a7Snicm 	while (*src != '\0') {
80864cf113cSnicm 		dst = xreallocarray(dst, n + 1, sizeof *dst);
80939d4fc02Snicm 		if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
81039d4fc02Snicm 			while (*++src != '\0' && more == UTF8_MORE)
8114b2ce9a7Snicm 				more = utf8_append(&dst[n], *src);
81239d4fc02Snicm 			if (more == UTF8_DONE) {
8134b2ce9a7Snicm 				n++;
8144b2ce9a7Snicm 				continue;
8154b2ce9a7Snicm 			}
8164b2ce9a7Snicm 			src -= dst[n].have;
8174b2ce9a7Snicm 		}
8184b2ce9a7Snicm 		utf8_set(&dst[n], *src);
8194b2ce9a7Snicm 		n++;
82027a2633fSnicm 		src++;
82127a2633fSnicm 	}
82264cf113cSnicm 	dst = xreallocarray(dst, n + 1, sizeof *dst);
8234b2ce9a7Snicm 	dst[n].size = 0;
8244b2ce9a7Snicm 	return (dst);
8254b2ce9a7Snicm }
8264b2ce9a7Snicm 
8274b2ce9a7Snicm /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
8284b2ce9a7Snicm char *
8294b2ce9a7Snicm utf8_tocstr(struct utf8_data *src)
8304b2ce9a7Snicm {
83170a57860Snicm 	char	*dst = NULL;
83270a57860Snicm 	size_t	 n = 0;
8334b2ce9a7Snicm 
8344b2ce9a7Snicm 	for(; src->size != 0; src++) {
83564cf113cSnicm 		dst = xreallocarray(dst, n + src->size, 1);
8364b2ce9a7Snicm 		memcpy(dst + n, src->data, src->size);
8374b2ce9a7Snicm 		n += src->size;
8384b2ce9a7Snicm 	}
83964cf113cSnicm 	dst = xreallocarray(dst, n + 1, 1);
8404b2ce9a7Snicm 	dst[n] = '\0';
8414b2ce9a7Snicm 	return (dst);
8424b2ce9a7Snicm }
8434b2ce9a7Snicm 
8444b2ce9a7Snicm /* Get width of UTF-8 string. */
8454b2ce9a7Snicm u_int
8464b2ce9a7Snicm utf8_cstrwidth(const char *s)
8474b2ce9a7Snicm {
8484b2ce9a7Snicm 	struct utf8_data	tmp;
8494b2ce9a7Snicm 	u_int			width;
85039d4fc02Snicm 	enum utf8_state		more;
8514b2ce9a7Snicm 
8524b2ce9a7Snicm 	width = 0;
8534b2ce9a7Snicm 	while (*s != '\0') {
85439d4fc02Snicm 		if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
85539d4fc02Snicm 			while (*++s != '\0' && more == UTF8_MORE)
8564b2ce9a7Snicm 				more = utf8_append(&tmp, *s);
85739d4fc02Snicm 			if (more == UTF8_DONE) {
8584b2ce9a7Snicm 				width += tmp.width;
8594b2ce9a7Snicm 				continue;
8604b2ce9a7Snicm 			}
8614b2ce9a7Snicm 			s -= tmp.have;
8624b2ce9a7Snicm 		}
86339d4fc02Snicm 		if (*s > 0x1f && *s != 0x7f)
8644b2ce9a7Snicm 			width++;
8654b2ce9a7Snicm 		s++;
8664b2ce9a7Snicm 	}
8674b2ce9a7Snicm 	return (width);
8684b2ce9a7Snicm }
8694b2ce9a7Snicm 
870a318a7faSnicm /* Pad UTF-8 string to width on the left. Caller frees. */
8711d1963bbSnicm char *
8721d1963bbSnicm utf8_padcstr(const char *s, u_int width)
8731d1963bbSnicm {
8741d1963bbSnicm 	size_t	 slen;
8751d1963bbSnicm 	char	*out;
8761d1963bbSnicm 	u_int	 n, i;
8771d1963bbSnicm 
8781d1963bbSnicm 	n = utf8_cstrwidth(s);
8791d1963bbSnicm 	if (n >= width)
8801d1963bbSnicm 		return (xstrdup(s));
8811d1963bbSnicm 
8821d1963bbSnicm 	slen = strlen(s);
8831d1963bbSnicm 	out = xmalloc(slen + 1 + (width - n));
8841d1963bbSnicm 	memcpy(out, s, slen);
8851d1963bbSnicm 	for (i = n; i < width; i++)
8861d1963bbSnicm 		out[slen++] = ' ';
8871d1963bbSnicm 	out[slen] = '\0';
8881d1963bbSnicm 	return (out);
8891d1963bbSnicm }
8907db4c597Snicm 
891a318a7faSnicm /* Pad UTF-8 string to width on the right. Caller frees. */
892a318a7faSnicm char *
893a318a7faSnicm utf8_rpadcstr(const char *s, u_int width)
894a318a7faSnicm {
895a318a7faSnicm 	size_t	 slen;
896a318a7faSnicm 	char	*out;
897a318a7faSnicm 	u_int	 n, i;
898a318a7faSnicm 
899a318a7faSnicm 	n = utf8_cstrwidth(s);
900a318a7faSnicm 	if (n >= width)
901a318a7faSnicm 		return (xstrdup(s));
902a318a7faSnicm 
903a318a7faSnicm 	slen = strlen(s);
904a318a7faSnicm 	out = xmalloc(slen + 1 + (width - n));
905a318a7faSnicm 	for (i = 0; i < width - n; i++)
906a318a7faSnicm 		out[i] = ' ';
907a318a7faSnicm 	memcpy(out + i, s, slen);
908a318a7faSnicm 	out[i + slen] = '\0';
909a318a7faSnicm 	return (out);
910a318a7faSnicm }
911a318a7faSnicm 
9127db4c597Snicm int
9137db4c597Snicm utf8_cstrhas(const char *s, const struct utf8_data *ud)
9147db4c597Snicm {
9157db4c597Snicm 	struct utf8_data	*copy, *loop;
9167db4c597Snicm 	int			 found = 0;
9177db4c597Snicm 
9187db4c597Snicm 	copy = utf8_fromcstr(s);
9197db4c597Snicm 	for (loop = copy; loop->size != 0; loop++) {
9207db4c597Snicm 		if (loop->size != ud->size)
9217db4c597Snicm 			continue;
9227db4c597Snicm 		if (memcmp(loop->data, ud->data, loop->size) == 0) {
9237db4c597Snicm 			found = 1;
9247db4c597Snicm 			break;
9257db4c597Snicm 		}
9267db4c597Snicm 	}
9277db4c597Snicm 	free(copy);
9287db4c597Snicm 
9297db4c597Snicm 	return (found);
9307db4c597Snicm }
931