xref: /openbsd-src/usr.bin/tmux/utf8.c (revision dcc91c2622318df8f66a9bca2d2864253df1bfc3)
1 /* $OpenBSD: utf8.c,v 1.66 2024/07/12 11:21:18 nicm Exp $ */
2 
3 /*
4  * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 
21 #include <ctype.h>
22 #include <errno.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <vis.h>
26 
27 #include "tmux.h"
28 
/*
 * Code points that utf8_width() always reports as two columns wide, without
 * asking wcwidth(). The table is searched with bsearch() by utf8_in_table()
 * and so must stay sorted in ascending order.
 */
static const wchar_t utf8_force_wide[] = {
	/* U+261D - U+270D */
	0x0261D, 0x026F9, 0x0270A, 0x0270B, 0x0270C, 0x0270D,

	/* U+1F1E6 - U+1F1FF */
	0x1F1E6, 0x1F1E7, 0x1F1E8, 0x1F1E9, 0x1F1EA, 0x1F1EB,
	0x1F1EC, 0x1F1ED, 0x1F1EE, 0x1F1EF, 0x1F1F0, 0x1F1F1,
	0x1F1F2, 0x1F1F3, 0x1F1F4, 0x1F1F5, 0x1F1F6, 0x1F1F7,
	0x1F1F8, 0x1F1F9, 0x1F1FA, 0x1F1FB, 0x1F1FC, 0x1F1FD,
	0x1F1FE, 0x1F1FF,

	/* U+1F385 - U+1F3FF */
	0x1F385, 0x1F3C2, 0x1F3C3, 0x1F3C4, 0x1F3C7, 0x1F3CA,
	0x1F3CB, 0x1F3CC, 0x1F3FB, 0x1F3FC, 0x1F3FD, 0x1F3FE,
	0x1F3FF,

	/* U+1F442 - U+1F4AA */
	0x1F442, 0x1F443, 0x1F446, 0x1F447, 0x1F448, 0x1F449,
	0x1F44A, 0x1F44B, 0x1F44C, 0x1F44D, 0x1F44E, 0x1F44F,
	0x1F450, 0x1F466, 0x1F467, 0x1F468, 0x1F469, 0x1F46B,
	0x1F46C, 0x1F46D, 0x1F46E, 0x1F470, 0x1F471, 0x1F472,
	0x1F473, 0x1F474, 0x1F475, 0x1F476, 0x1F477, 0x1F478,
	0x1F47C, 0x1F481, 0x1F482, 0x1F483, 0x1F485, 0x1F486,
	0x1F487, 0x1F48F, 0x1F491, 0x1F4AA,

	/* U+1F574 - U+1F596 */
	0x1F574, 0x1F575, 0x1F57A, 0x1F590, 0x1F595, 0x1F596,

	/* U+1F645 - U+1F6CC */
	0x1F645, 0x1F646, 0x1F647, 0x1F64B, 0x1F64C, 0x1F64D,
	0x1F64E, 0x1F64F, 0x1F6A3, 0x1F6B4, 0x1F6B5, 0x1F6B6,
	0x1F6C0, 0x1F6CC,

	/* U+1F90C - U+1F9DD */
	0x1F90C, 0x1F90F, 0x1F918, 0x1F919, 0x1F91A, 0x1F91B,
	0x1F91C, 0x1F91D, 0x1F91E, 0x1F91F, 0x1F926, 0x1F930,
	0x1F931, 0x1F932, 0x1F933, 0x1F934, 0x1F935, 0x1F936,
	0x1F937, 0x1F938, 0x1F939, 0x1F93D, 0x1F93E, 0x1F977,
	0x1F9B5, 0x1F9B6, 0x1F9B8, 0x1F9B9, 0x1F9BB, 0x1F9CD,
	0x1F9CE, 0x1F9CF, 0x1F9D1, 0x1F9D2, 0x1F9D3, 0x1F9D4,
	0x1F9D5, 0x1F9D6, 0x1F9D7, 0x1F9D8, 0x1F9D9, 0x1F9DA,
	0x1F9DB, 0x1F9DC, 0x1F9DD,

	/* U+1FAC3 - U+1FAF8 */
	0x1FAC3, 0x1FAC4, 0x1FAC5, 0x1FAF0, 0x1FAF1, 0x1FAF2,
	0x1FAF3, 0x1FAF4, 0x1FAF5, 0x1FAF6, 0x1FAF7, 0x1FAF8
};
193 
194 struct utf8_item {
195 	RB_ENTRY(utf8_item)	index_entry;
196 	u_int			index;
197 
198 	RB_ENTRY(utf8_item)	data_entry;
199 	char			data[UTF8_SIZE];
200 	u_char			size;
201 };
202 
203 static int
204 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
205 {
206 	if (ui1->size < ui2->size)
207 		return (-1);
208 	if (ui1->size > ui2->size)
209 		return (1);
210 	return (memcmp(ui1->data, ui2->data, ui1->size));
211 }
212 RB_HEAD(utf8_data_tree, utf8_item);
213 RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
214 static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
215 
216 static int
217 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
218 {
219 	if (ui1->index < ui2->index)
220 		return (-1);
221 	if (ui1->index > ui2->index)
222 		return (1);
223 	return (0);
224 }
225 RB_HEAD(utf8_index_tree, utf8_item);
226 RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
227 static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
228 
229 static u_int utf8_next_index;
230 
231 #define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)
232 #define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)
233 
234 #define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
235 #define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)
236 
237 /* Get a UTF-8 item from data. */
238 static struct utf8_item *
239 utf8_item_by_data(const u_char *data, size_t size)
240 {
241 	struct utf8_item	ui;
242 
243 	memcpy(ui.data, data, size);
244 	ui.size = size;
245 
246 	return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
247 }
248 
249 /* Get a UTF-8 item from data. */
250 static struct utf8_item *
251 utf8_item_by_index(u_int index)
252 {
253 	struct utf8_item	ui;
254 
255 	ui.index = index;
256 
257 	return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
258 }
259 
260 /* Add a UTF-8 item. */
261 static int
262 utf8_put_item(const u_char *data, size_t size, u_int *index)
263 {
264 	struct utf8_item	*ui;
265 
266 	ui = utf8_item_by_data(data, size);
267 	if (ui != NULL) {
268 		*index = ui->index;
269 		log_debug("%s: found %.*s = %u", __func__, (int)size, data,
270 		    *index);
271 		return (0);
272 	}
273 
274 	if (utf8_next_index == 0xffffff + 1)
275 		return (-1);
276 
277 	ui = xcalloc(1, sizeof *ui);
278 	ui->index = utf8_next_index++;
279 	RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
280 
281 	memcpy(ui->data, data, size);
282 	ui->size = size;
283 	RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
284 
285 	*index = ui->index;
286 	log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
287 	return (0);
288 }
289 
/*
 * bsearch() comparison callback for tables of wide characters. Returns -1, 0
 * or 1 when the first character is less than, equal to or greater than the
 * second.
 */
static int
utf8_table_cmp(const void *vp1, const void *vp2)
{
	const wchar_t	lhs = *(const wchar_t *)vp1;
	const wchar_t	rhs = *(const wchar_t *)vp2;

	return ((lhs > rhs) - (lhs < rhs));
}
301 
302 /* Check if character in table. */
303 int
304 utf8_in_table(wchar_t find, const wchar_t *table, u_int count)
305 {
306 	wchar_t	*found;
307 
308 	found = bsearch(&find, table, count, sizeof *table, utf8_table_cmp);
309 	return (found != NULL);
310 }
311 
312 /* Get UTF-8 character from data. */
313 enum utf8_state
314 utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
315 {
316 	u_int	index;
317 
318 	if (ud->width > 2)
319 		fatalx("invalid UTF-8 width: %u", ud->width);
320 
321 	if (ud->size > UTF8_SIZE)
322 		goto fail;
323 	if (ud->size <= 3) {
324 		index = (((utf8_char)ud->data[2] << 16)|
325 			  ((utf8_char)ud->data[1] << 8)|
326 			  ((utf8_char)ud->data[0]));
327 	} else if (utf8_put_item(ud->data, ud->size, &index) != 0)
328 		goto fail;
329 	*uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
330 	log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
331 	    (int)ud->size, ud->data, *uc);
332 	return (UTF8_DONE);
333 
334 fail:
335 	if (ud->width == 0)
336 		*uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
337 	else if (ud->width == 1)
338 		*uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
339 	else
340 		*uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
341 	return (UTF8_ERROR);
342 }
343 
344 /* Get UTF-8 data from character. */
345 void
346 utf8_to_data(utf8_char uc, struct utf8_data *ud)
347 {
348 	struct utf8_item	*ui;
349 	u_int			 index;
350 
351 	memset(ud, 0, sizeof *ud);
352 	ud->size = ud->have = UTF8_GET_SIZE(uc);
353 	ud->width = UTF8_GET_WIDTH(uc);
354 
355 	if (ud->size <= 3) {
356 		ud->data[2] = (uc >> 16);
357 		ud->data[1] = ((uc >> 8) & 0xff);
358 		ud->data[0] = (uc & 0xff);
359 	} else {
360 		index = (uc & 0xffffff);
361 		if ((ui = utf8_item_by_index(index)) == NULL)
362 			memset(ud->data, ' ', ud->size);
363 		else
364 			memcpy(ud->data, ui->data, ud->size);
365 	}
366 
367 	log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
368 	    (int)ud->size, ud->data);
369 }
370 
371 /* Get UTF-8 character from a single ASCII character. */
372 u_int
373 utf8_build_one(u_char ch)
374 {
375 	return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
376 }
377 
378 /* Set a single character. */
379 void
380 utf8_set(struct utf8_data *ud, u_char ch)
381 {
382 	static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
383 
384 	memcpy(ud, &empty, sizeof *ud);
385 	*ud->data = ch;
386 }
387 
388 /* Copy UTF-8 character. */
389 void
390 utf8_copy(struct utf8_data *to, const struct utf8_data *from)
391 {
392 	u_int	i;
393 
394 	memcpy(to, from, sizeof *to);
395 
396 	for (i = to->size; i < sizeof to->data; i++)
397 		to->data[i] = '\0';
398 }
399 
400 /* Get width of Unicode character. */
401 static enum utf8_state
402 utf8_width(struct utf8_data *ud, int *width)
403 {
404 	wchar_t	wc;
405 
406 	if (utf8_towc(ud, &wc) != UTF8_DONE)
407 		return (UTF8_ERROR);
408 	if (utf8_in_table(wc, utf8_force_wide, nitems(utf8_force_wide))) {
409 		*width = 2;
410 		return (UTF8_DONE);
411 	}
412 
413 	*width = wcwidth(wc);
414 	log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width);
415 	if (*width < 0) {
416 		/*
417 		 * C1 control characters are nonprintable, so they are always
418 		 * zero width.
419 		 */
420 		*width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1;
421 	}
422 	if (*width >= 0 && *width <= 0xff)
423 		return (UTF8_DONE);
424 	return (UTF8_ERROR);
425 }
426 
427 /* Convert UTF-8 character to wide character. */
428 enum utf8_state
429 utf8_towc(const struct utf8_data *ud, wchar_t *wc)
430 {
431 	switch (mbtowc(wc, ud->data, ud->size)) {
432 	case -1:
433 		log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
434 		    errno);
435 		mbtowc(NULL, NULL, MB_CUR_MAX);
436 		return (UTF8_ERROR);
437 	case 0:
438 		return (UTF8_ERROR);
439 	}
440 	log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc);
441 	return (UTF8_DONE);
442 }
443 
444 /* Convert wide character to UTF-8 character. */
445 enum utf8_state
446 utf8_fromwc(wchar_t wc, struct utf8_data *ud)
447 {
448 	int	size, width;
449 
450 	size = wctomb(ud->data, wc);
451 	if (size < 0) {
452 		log_debug("UTF-8 %d, wctomb() %d", wc, errno);
453 		wctomb(NULL, 0);
454 		return (UTF8_ERROR);
455 	}
456 	if (size == 0)
457 		return (UTF8_ERROR);
458 	ud->size = ud->have = size;
459 	if (utf8_width(ud, &width) == UTF8_DONE) {
460 		ud->width = width;
461 		return (UTF8_DONE);
462 	}
463 	return (UTF8_ERROR);
464 }
465 
466 /*
467  * Open UTF-8 sequence.
468  *
469  * 11000010-11011111 C2-DF start of 2-byte sequence
470  * 11100000-11101111 E0-EF start of 3-byte sequence
471  * 11110000-11110100 F0-F4 start of 4-byte sequence
472  */
473 enum utf8_state
474 utf8_open(struct utf8_data *ud, u_char ch)
475 {
476 	memset(ud, 0, sizeof *ud);
477 	if (ch >= 0xc2 && ch <= 0xdf)
478 		ud->size = 2;
479 	else if (ch >= 0xe0 && ch <= 0xef)
480 		ud->size = 3;
481 	else if (ch >= 0xf0 && ch <= 0xf4)
482 		ud->size = 4;
483 	else
484 		return (UTF8_ERROR);
485 	utf8_append(ud, ch);
486 	return (UTF8_MORE);
487 }
488 
489 /* Append character to UTF-8, closing if finished. */
490 enum utf8_state
491 utf8_append(struct utf8_data *ud, u_char ch)
492 {
493 	int	width;
494 
495 	if (ud->have >= ud->size)
496 		fatalx("UTF-8 character overflow");
497 	if (ud->size > sizeof ud->data)
498 		fatalx("UTF-8 character size too large");
499 
500 	if (ud->have != 0 && (ch & 0xc0) != 0x80)
501 		ud->width = 0xff;
502 
503 	ud->data[ud->have++] = ch;
504 	if (ud->have != ud->size)
505 		return (UTF8_MORE);
506 
507 	if (ud->width == 0xff)
508 		return (UTF8_ERROR);
509 	if (utf8_width(ud, &width) != UTF8_DONE)
510 		return (UTF8_ERROR);
511 	ud->width = width;
512 
513 	return (UTF8_DONE);
514 }
515 
516 /*
517  * Encode len characters from src into dst, which is guaranteed to have four
518  * bytes available for each character from src (for \abc or UTF-8) plus space
519  * for \0.
520  */
521 int
522 utf8_strvis(char *dst, const char *src, size_t len, int flag)
523 {
524 	struct utf8_data	 ud;
525 	const char		*start = dst, *end = src + len;
526 	enum utf8_state		 more;
527 	size_t			 i;
528 
529 	while (src < end) {
530 		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
531 			while (++src < end && more == UTF8_MORE)
532 				more = utf8_append(&ud, *src);
533 			if (more == UTF8_DONE) {
534 				/* UTF-8 character finished. */
535 				for (i = 0; i < ud.size; i++)
536 					*dst++ = ud.data[i];
537 				continue;
538 			}
539 			/* Not a complete, valid UTF-8 character. */
540 			src -= ud.have;
541 		}
542 		if ((flag & VIS_DQ) && src[0] == '$' && src < end - 1) {
543 			if (isalpha((u_char)src[1]) ||
544 			    src[1] == '_' ||
545 			    src[1] == '{')
546 				*dst++ = '\\';
547 			*dst++ = '$';
548 		} else if (src < end - 1)
549 			dst = vis(dst, src[0], flag, src[1]);
550 		else if (src < end)
551 			dst = vis(dst, src[0], flag, '\0');
552 		src++;
553 	}
554 	*dst = '\0';
555 	return (dst - start);
556 }
557 
/*
 * Same as utf8_strvis, but for a NUL-terminated string and with the output
 * buffer allocated here. The buffer is stored in *dst and the encoded length,
 * not counting the \0, is returned.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*out;
	int	 n;

	/* Worst case is four output bytes for each input byte. */
	out = xreallocarray(NULL, 4, srclen + 1);
	n = utf8_strvis(out, src, srclen, flag);

	/* Trim the buffer to what was actually used. */
	*dst = xrealloc(out, n + 1);
	return (n);
}
571 
/*
 * Same as utf8_strvis but allocate the buffer; unlike utf8_stravis the source
 * length is given by the caller instead of being taken from strlen(). The
 * buffer is stored in *dst and the encoded length, not counting the \0, is
 * returned.
 */
int
utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
{
	char	*out;
	int	 n;

	/* Worst case is four output bytes for each input byte. */
	out = xreallocarray(NULL, 4, srclen + 1);
	n = utf8_strvis(out, src, srclen, flag);

	/* Trim the buffer to what was actually used. */
	*dst = xrealloc(out, n + 1);
	return (n);
}
585 
586 /* Does this string contain anything that isn't valid UTF-8? */
587 int
588 utf8_isvalid(const char *s)
589 {
590 	struct utf8_data ud;
591 	const char	*end;
592 	enum utf8_state	 more;
593 
594 	end = s + strlen(s);
595 	while (s < end) {
596 		if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
597 			while (++s < end && more == UTF8_MORE)
598 				more = utf8_append(&ud, *s);
599 			if (more == UTF8_DONE)
600 				continue;
601 			return (0);
602 		}
603 		if (*s < 0x20 || *s > 0x7e)
604 			return (0);
605 		s++;
606 	}
607 	return (1);
608 }
609 
610 /*
611  * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
612  * the returned string. Anything not valid printable ASCII or UTF-8 is
613  * stripped.
614  */
615 char *
616 utf8_sanitize(const char *src)
617 {
618 	char		*dst = NULL;
619 	size_t		 n = 0;
620 	enum utf8_state	 more;
621 	struct utf8_data ud;
622 	u_int		 i;
623 
624 	while (*src != '\0') {
625 		dst = xreallocarray(dst, n + 1, sizeof *dst);
626 		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
627 			while (*++src != '\0' && more == UTF8_MORE)
628 				more = utf8_append(&ud, *src);
629 			if (more == UTF8_DONE) {
630 				dst = xreallocarray(dst, n + ud.width,
631 				    sizeof *dst);
632 				for (i = 0; i < ud.width; i++)
633 					dst[n++] = '_';
634 				continue;
635 			}
636 			src -= ud.have;
637 		}
638 		if (*src > 0x1f && *src < 0x7f)
639 			dst[n++] = *src;
640 		else
641 			dst[n++] = '_';
642 		src++;
643 	}
644 	dst = xreallocarray(dst, n + 1, sizeof *dst);
645 	dst[n] = '\0';
646 	return (dst);
647 }
648 
649 /* Get UTF-8 buffer length. */
650 size_t
651 utf8_strlen(const struct utf8_data *s)
652 {
653 	size_t	i;
654 
655 	for (i = 0; s[i].size != 0; i++)
656 		/* nothing */;
657 	return (i);
658 }
659 
660 /* Get UTF-8 string width. */
661 u_int
662 utf8_strwidth(const struct utf8_data *s, ssize_t n)
663 {
664 	ssize_t	i;
665 	u_int	width = 0;
666 
667 	for (i = 0; s[i].size != 0; i++) {
668 		if (n != -1 && n == i)
669 			break;
670 		width += s[i].width;
671 	}
672 	return (width);
673 }
674 
675 /*
676  * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
677  * Caller frees.
678  */
679 struct utf8_data *
680 utf8_fromcstr(const char *src)
681 {
682 	struct utf8_data	*dst = NULL;
683 	size_t			 n = 0;
684 	enum utf8_state		 more;
685 
686 	while (*src != '\0') {
687 		dst = xreallocarray(dst, n + 1, sizeof *dst);
688 		if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
689 			while (*++src != '\0' && more == UTF8_MORE)
690 				more = utf8_append(&dst[n], *src);
691 			if (more == UTF8_DONE) {
692 				n++;
693 				continue;
694 			}
695 			src -= dst[n].have;
696 		}
697 		utf8_set(&dst[n], *src);
698 		n++;
699 		src++;
700 	}
701 	dst = xreallocarray(dst, n + 1, sizeof *dst);
702 	dst[n].size = 0;
703 	return (dst);
704 }
705 
706 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
707 char *
708 utf8_tocstr(struct utf8_data *src)
709 {
710 	char	*dst = NULL;
711 	size_t	 n = 0;
712 
713 	for(; src->size != 0; src++) {
714 		dst = xreallocarray(dst, n + src->size, 1);
715 		memcpy(dst + n, src->data, src->size);
716 		n += src->size;
717 	}
718 	dst = xreallocarray(dst, n + 1, 1);
719 	dst[n] = '\0';
720 	return (dst);
721 }
722 
723 /* Get width of UTF-8 string. */
724 u_int
725 utf8_cstrwidth(const char *s)
726 {
727 	struct utf8_data	tmp;
728 	u_int			width;
729 	enum utf8_state		more;
730 
731 	width = 0;
732 	while (*s != '\0') {
733 		if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
734 			while (*++s != '\0' && more == UTF8_MORE)
735 				more = utf8_append(&tmp, *s);
736 			if (more == UTF8_DONE) {
737 				width += tmp.width;
738 				continue;
739 			}
740 			s -= tmp.have;
741 		}
742 		if (*s > 0x1f && *s != 0x7f)
743 			width++;
744 		s++;
745 	}
746 	return (width);
747 }
748 
749 /* Pad UTF-8 string to width on the left. Caller frees. */
750 char *
751 utf8_padcstr(const char *s, u_int width)
752 {
753 	size_t	 slen;
754 	char	*out;
755 	u_int	 n, i;
756 
757 	n = utf8_cstrwidth(s);
758 	if (n >= width)
759 		return (xstrdup(s));
760 
761 	slen = strlen(s);
762 	out = xmalloc(slen + 1 + (width - n));
763 	memcpy(out, s, slen);
764 	for (i = n; i < width; i++)
765 		out[slen++] = ' ';
766 	out[slen] = '\0';
767 	return (out);
768 }
769 
770 /* Pad UTF-8 string to width on the right. Caller frees. */
771 char *
772 utf8_rpadcstr(const char *s, u_int width)
773 {
774 	size_t	 slen;
775 	char	*out;
776 	u_int	 n, i;
777 
778 	n = utf8_cstrwidth(s);
779 	if (n >= width)
780 		return (xstrdup(s));
781 
782 	slen = strlen(s);
783 	out = xmalloc(slen + 1 + (width - n));
784 	for (i = 0; i < width - n; i++)
785 		out[i] = ' ';
786 	memcpy(out + i, s, slen);
787 	out[i + slen] = '\0';
788 	return (out);
789 }
790 
791 int
792 utf8_cstrhas(const char *s, const struct utf8_data *ud)
793 {
794 	struct utf8_data	*copy, *loop;
795 	int			 found = 0;
796 
797 	copy = utf8_fromcstr(s);
798 	for (loop = copy; loop->size != 0; loop++) {
799 		if (loop->size != ud->size)
800 			continue;
801 		if (memcmp(loop->data, ud->data, loop->size) == 0) {
802 			found = 1;
803 			break;
804 		}
805 	}
806 	free(copy);
807 
808 	return (found);
809 }
810