xref: /netbsd-src/external/bsd/tmux/dist/utf8.c (revision 890b6d91a44b7fcb2dfbcbc1e93463086e462d2d)
1 /* $OpenBSD$ */
2 
3 /*
4  * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 
21 #include <ctype.h>
22 #include <errno.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <wchar.h>
26 
27 #include "compat.h"
28 #include "tmux.h"
29 
/*
 * Code points that utf8_width() always reports as two columns wide, without
 * asking the width function. The table is searched with bsearch() through
 * utf8_in_table(), so the entries must stay in ascending order.
 */
static const wchar_t utf8_force_wide[] = {
	/* U+261D - U+270D */
	0x0261D, 0x026F9, 0x0270A, 0x0270B, 0x0270C, 0x0270D,

	/* U+1F1E6 - U+1F1FF (contiguous run) */
	0x1F1E6, 0x1F1E7, 0x1F1E8, 0x1F1E9, 0x1F1EA, 0x1F1EB,
	0x1F1EC, 0x1F1ED, 0x1F1EE, 0x1F1EF, 0x1F1F0, 0x1F1F1,
	0x1F1F2, 0x1F1F3, 0x1F1F4, 0x1F1F5, 0x1F1F6, 0x1F1F7,
	0x1F1F8, 0x1F1F9, 0x1F1FA, 0x1F1FB, 0x1F1FC, 0x1F1FD,
	0x1F1FE, 0x1F1FF,

	/* U+1F385 - U+1F3FF */
	0x1F385, 0x1F3C2, 0x1F3C3, 0x1F3C4, 0x1F3C7, 0x1F3CA,
	0x1F3CB, 0x1F3CC, 0x1F3FB, 0x1F3FC, 0x1F3FD, 0x1F3FE,
	0x1F3FF,

	/* U+1F442 - U+1F4AA */
	0x1F442, 0x1F443, 0x1F446, 0x1F447, 0x1F448, 0x1F449,
	0x1F44A, 0x1F44B, 0x1F44C, 0x1F44D, 0x1F44E, 0x1F44F,
	0x1F450, 0x1F466, 0x1F467, 0x1F468, 0x1F469, 0x1F46B,
	0x1F46C, 0x1F46D, 0x1F46E, 0x1F470, 0x1F471, 0x1F472,
	0x1F473, 0x1F474, 0x1F475, 0x1F476, 0x1F477, 0x1F478,
	0x1F47C, 0x1F481, 0x1F482, 0x1F483, 0x1F485, 0x1F486,
	0x1F487, 0x1F48F, 0x1F491, 0x1F4AA,

	/* U+1F574 - U+1F596 */
	0x1F574, 0x1F575, 0x1F57A, 0x1F590, 0x1F595, 0x1F596,

	/* U+1F645 - U+1F6CC */
	0x1F645, 0x1F646, 0x1F647, 0x1F64B, 0x1F64C, 0x1F64D,
	0x1F64E, 0x1F64F, 0x1F6A3, 0x1F6B4, 0x1F6B5, 0x1F6B6,
	0x1F6C0, 0x1F6CC,

	/* U+1F90C - U+1F9DD */
	0x1F90C, 0x1F90F, 0x1F918, 0x1F919, 0x1F91A, 0x1F91B,
	0x1F91C, 0x1F91D, 0x1F91E, 0x1F91F, 0x1F926, 0x1F930,
	0x1F931, 0x1F932, 0x1F933, 0x1F934, 0x1F935, 0x1F936,
	0x1F937, 0x1F938, 0x1F939, 0x1F93D, 0x1F93E, 0x1F977,
	0x1F9B5, 0x1F9B6, 0x1F9B8, 0x1F9B9, 0x1F9BB, 0x1F9CD,
	0x1F9CE, 0x1F9CF, 0x1F9D1, 0x1F9D2, 0x1F9D3, 0x1F9D4,
	0x1F9D5, 0x1F9D6, 0x1F9D7, 0x1F9D8, 0x1F9D9, 0x1F9DA,
	0x1F9DB, 0x1F9DC, 0x1F9DD,

	/* U+1FAC3 - U+1FAF8 */
	0x1FAC3, 0x1FAC4, 0x1FAC5, 0x1FAF0, 0x1FAF1, 0x1FAF2,
	0x1FAF3, 0x1FAF4, 0x1FAF5, 0x1FAF6, 0x1FAF7, 0x1FAF8
};
194 
195 struct utf8_item {
196 	RB_ENTRY(utf8_item)	index_entry;
197 	u_int			index;
198 
199 	RB_ENTRY(utf8_item)	data_entry;
200 	char			data[UTF8_SIZE];
201 	u_char			size;
202 };
203 
204 static int
205 utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
206 {
207 	if (ui1->size < ui2->size)
208 		return (-1);
209 	if (ui1->size > ui2->size)
210 		return (1);
211 	return (memcmp(ui1->data, ui2->data, ui1->size));
212 }
213 RB_HEAD(utf8_data_tree, utf8_item);
214 RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
215 static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
216 
217 static int
218 utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
219 {
220 	if (ui1->index < ui2->index)
221 		return (-1);
222 	if (ui1->index > ui2->index)
223 		return (1);
224 	return (0);
225 }
226 RB_HEAD(utf8_index_tree, utf8_item);
227 RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
228 static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
229 
230 static u_int utf8_next_index;
231 
232 #define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)
233 #define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)
234 
235 #define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
236 #define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)
237 
238 /* Get a UTF-8 item from data. */
239 static struct utf8_item *
240 utf8_item_by_data(const u_char *data, size_t size)
241 {
242 	struct utf8_item	ui;
243 
244 	memcpy(ui.data, data, size);
245 	ui.size = size;
246 
247 	return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
248 }
249 
250 /* Get a UTF-8 item from data. */
251 static struct utf8_item *
252 utf8_item_by_index(u_int index)
253 {
254 	struct utf8_item	ui;
255 
256 	ui.index = index;
257 
258 	return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
259 }
260 
261 /* Add a UTF-8 item. */
262 static int
263 utf8_put_item(const u_char *data, size_t size, u_int *index)
264 {
265 	struct utf8_item	*ui;
266 
267 	ui = utf8_item_by_data((const unsigned char *)data, size);
268 	if (ui != NULL) {
269 		*index = ui->index;
270 		log_debug("%s: found %.*s = %u", __func__, (int)size, data,
271 		    *index);
272 		return (0);
273 	}
274 
275 	if (utf8_next_index == 0xffffff + 1)
276 		return (-1);
277 
278 	ui = xcalloc(1, sizeof *ui);
279 	ui->index = utf8_next_index++;
280 	RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
281 
282 	memcpy(ui->data, data, size);
283 	ui->size = size;
284 	RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
285 
286 	*index = ui->index;
287 	log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
288 	return (0);
289 }
290 
/*
 * bsearch() comparison callback for tables of wchar_t: returns -1, 0 or 1
 * according to whether the first value is below, equal to or above the second.
 */
static int
utf8_table_cmp(const void *vp1, const void *vp2)
{
	const wchar_t	lhs = *(const wchar_t *)vp1;
	const wchar_t	rhs = *(const wchar_t *)vp2;

	return ((lhs > rhs) - (lhs < rhs));
}
302 
303 /* Check if character in table. */
304 int
305 utf8_in_table(wchar_t find, const wchar_t *table, u_int count)
306 {
307 	wchar_t	*found;
308 
309 	found = bsearch(&find, table, count, sizeof *table, utf8_table_cmp);
310 	return (found != NULL);
311 }
312 
313 /* Get UTF-8 character from data. */
314 enum utf8_state
315 utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
316 {
317 	u_int	index;
318 
319 	if (ud->width > 2)
320 		fatalx("invalid UTF-8 width: %u", ud->width);
321 
322 	if (ud->size > UTF8_SIZE)
323 		goto fail;
324 	if (ud->size <= 3) {
325 		index = (((utf8_char)ud->data[2] << 16)|
326 			  ((utf8_char)ud->data[1] << 8)|
327 			  ((utf8_char)ud->data[0]));
328 	} else if (utf8_put_item(ud->data, ud->size, &index) != 0)
329 		goto fail;
330 	*uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
331 	log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
332 	    (int)ud->size, ud->data, *uc);
333 	return (UTF8_DONE);
334 
335 fail:
336 	if (ud->width == 0)
337 		*uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
338 	else if (ud->width == 1)
339 		*uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
340 	else
341 		*uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
342 	return (UTF8_ERROR);
343 }
344 
345 /* Get UTF-8 data from character. */
346 void
347 utf8_to_data(utf8_char uc, struct utf8_data *ud)
348 {
349 	struct utf8_item	*ui;
350 	u_int			 index;
351 
352 	memset(ud, 0, sizeof *ud);
353 	ud->size = ud->have = UTF8_GET_SIZE(uc);
354 	ud->width = UTF8_GET_WIDTH(uc);
355 
356 	if (ud->size <= 3) {
357 		ud->data[2] = (uc >> 16);
358 		ud->data[1] = ((uc >> 8) & 0xff);
359 		ud->data[0] = (uc & 0xff);
360 	} else {
361 		index = (uc & 0xffffff);
362 		if ((ui = utf8_item_by_index(index)) == NULL)
363 			memset(ud->data, ' ', ud->size);
364 		else
365 			memcpy(ud->data, ui->data, ud->size);
366 	}
367 
368 	log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
369 	    (int)ud->size, ud->data);
370 }
371 
372 /* Get UTF-8 character from a single ASCII character. */
373 u_int
374 utf8_build_one(u_char ch)
375 {
376 	return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
377 }
378 
379 /* Set a single character. */
380 void
381 utf8_set(struct utf8_data *ud, u_char ch)
382 {
383 	static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
384 
385 	memcpy(ud, &empty, sizeof *ud);
386 	*ud->data = ch;
387 }
388 
389 /* Copy UTF-8 character. */
390 void
391 utf8_copy(struct utf8_data *to, const struct utf8_data *from)
392 {
393 	u_int	i;
394 
395 	memcpy(to, from, sizeof *to);
396 
397 	for (i = to->size; i < sizeof to->data; i++)
398 		to->data[i] = '\0';
399 }
400 
401 /* Get width of Unicode character. */
402 static enum utf8_state
403 utf8_width(struct utf8_data *ud, int *width)
404 {
405 	wchar_t	wc;
406 
407 	if (utf8_towc(ud, &wc) != UTF8_DONE)
408 		return (UTF8_ERROR);
409 	if (utf8_in_table(wc, utf8_force_wide, nitems(utf8_force_wide))) {
410 		*width = 2;
411 		return (UTF8_DONE);
412 	}
413 #ifdef HAVE_UTF8PROC
414 	*width = utf8proc_wcwidth(wc);
415 	log_debug("utf8proc_wcwidth(%05X) returned %d", (u_int)wc, *width);
416 #else
417 	*width = wcwidth(wc);
418 	log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width);
419 	if (*width < 0) {
420 		/*
421 		 * C1 control characters are nonprintable, so they are always
422 		 * zero width.
423 		 */
424 		*width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1;
425 	}
426 #endif
427 	if (*width >= 0 && *width <= 0xff)
428 		return (UTF8_DONE);
429 	return (UTF8_ERROR);
430 }
431 
432 /* Convert UTF-8 character to wide character. */
433 enum utf8_state
434 utf8_towc(const struct utf8_data *ud, wchar_t *wc)
435 {
436 #ifdef HAVE_UTF8PROC
437 	switch (utf8proc_mbtowc(wc, ud->data, ud->size)) {
438 #else
439 	switch (mbtowc(wc, __UNCONST(ud->data), ud->size)) {
440 #endif
441 	case -1:
442 		log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
443 		    errno);
444 		mbtowc(NULL, NULL, MB_CUR_MAX);
445 		return (UTF8_ERROR);
446 	case 0:
447 		return (UTF8_ERROR);
448 	}
449 	log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc);
450 	return (UTF8_DONE);
451 }
452 
453 /* Convert wide character to UTF-8 character. */
454 enum utf8_state
455 utf8_fromwc(wchar_t wc, struct utf8_data *ud)
456 {
457 	int	size, width;
458 
459 #ifdef HAVE_UTF8PROC
460 	size = utf8proc_wctomb(ud->data, wc);
461 #else
462 	size = wctomb((char *)ud->data, wc);
463 #endif
464 	if (size < 0) {
465 		log_debug("UTF-8 %d, wctomb() %d", wc, errno);
466 		wctomb(NULL, 0);
467 		return (UTF8_ERROR);
468 	}
469 	if (size == 0)
470 		return (UTF8_ERROR);
471 	ud->size = ud->have = size;
472 	if (utf8_width(ud, &width) == UTF8_DONE) {
473 		ud->width = width;
474 		return (UTF8_DONE);
475 	}
476 	return (UTF8_ERROR);
477 }
478 
479 /*
480  * Open UTF-8 sequence.
481  *
482  * 11000010-11011111 C2-DF start of 2-byte sequence
483  * 11100000-11101111 E0-EF start of 3-byte sequence
484  * 11110000-11110100 F0-F4 start of 4-byte sequence
485  */
486 enum utf8_state
487 utf8_open(struct utf8_data *ud, u_char ch)
488 {
489 	memset(ud, 0, sizeof *ud);
490 	if (ch >= 0xc2 && ch <= 0xdf)
491 		ud->size = 2;
492 	else if (ch >= 0xe0 && ch <= 0xef)
493 		ud->size = 3;
494 	else if (ch >= 0xf0 && ch <= 0xf4)
495 		ud->size = 4;
496 	else
497 		return (UTF8_ERROR);
498 	utf8_append(ud, ch);
499 	return (UTF8_MORE);
500 }
501 
502 /* Append character to UTF-8, closing if finished. */
503 enum utf8_state
504 utf8_append(struct utf8_data *ud, u_char ch)
505 {
506 	int	width;
507 
508 	if (ud->have >= ud->size)
509 		fatalx("UTF-8 character overflow");
510 	if (ud->size > sizeof ud->data)
511 		fatalx("UTF-8 character size too large");
512 
513 	if (ud->have != 0 && (ch & 0xc0) != 0x80)
514 		ud->width = 0xff;
515 
516 	ud->data[ud->have++] = ch;
517 	if (ud->have != ud->size)
518 		return (UTF8_MORE);
519 
520 	if (ud->width == 0xff)
521 		return (UTF8_ERROR);
522 	if (utf8_width(ud, &width) != UTF8_DONE)
523 		return (UTF8_ERROR);
524 	ud->width = width;
525 
526 	return (UTF8_DONE);
527 }
528 
529 /*
530  * Encode len characters from src into dst, which is guaranteed to have four
531  * bytes available for each character from src (for \abc or UTF-8) plus space
532  * for \0.
533  */
534 int
535 utf8_strvis(char *dst, const char *src, size_t len, int flag)
536 {
537 	struct utf8_data	 ud;
538 	const char		*start = dst, *end = src + len;
539 	enum utf8_state		 more;
540 	size_t			 i;
541 
542 	while (src < end) {
543 		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
544 			while (++src < end && more == UTF8_MORE)
545 				more = utf8_append(&ud, *src);
546 			if (more == UTF8_DONE) {
547 				/* UTF-8 character finished. */
548 				for (i = 0; i < ud.size; i++)
549 					*dst++ = ud.data[i];
550 				continue;
551 			}
552 			/* Not a complete, valid UTF-8 character. */
553 			src -= ud.have;
554 		}
555 		if ((flag & VIS_DQ) && src[0] == '$' && src < end - 1) {
556 			if (isalpha((u_char)src[1]) ||
557 			    src[1] == '_' ||
558 			    src[1] == '{')
559 				*dst++ = '\\';
560 			*dst++ = '$';
561 		} else if (src < end - 1)
562 			dst = vis(dst, src[0], flag, src[1]);
563 		else if (src < end)
564 			dst = vis(dst, src[0], flag, '\0');
565 		src++;
566 	}
567 	*dst = '\0';
568 	return (dst - start);
569 }
570 
/*
 * Same as utf8_strvis() for a \0-terminated string, but the output buffer is
 * allocated here: four bytes per input byte plus the terminator, then resized
 * to the length actually used. The buffer is stored in *dst and the encoded
 * length is returned.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*buf;
	int	 len;

	buf = xreallocarray(NULL, 4, srclen + 1);
	len = utf8_strvis(buf, src, srclen, flag);

	*dst = xrealloc(buf, len + 1);
	return (len);
}
584 
585 /* Same as utf8_strvis but allocate the buffer. */
586 int
587 utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
588 {
589 	char	*buf;
590 	int	 len;
591 
592 	buf = xreallocarray(NULL, 4, srclen + 1);
593 	len = utf8_strvis(buf, src, srclen, flag);
594 
595 	*dst = xrealloc(buf, len + 1);
596 	return (len);
597 }
598 
599 /* Does this string contain anything that isn't valid UTF-8? */
600 int
601 utf8_isvalid(const char *s)
602 {
603 	struct utf8_data ud;
604 	const char	*end;
605 	enum utf8_state	 more;
606 
607 	end = s + strlen(s);
608 	while (s < end) {
609 		if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
610 			while (++s < end && more == UTF8_MORE)
611 				more = utf8_append(&ud, *s);
612 			if (more == UTF8_DONE)
613 				continue;
614 			return (0);
615 		}
616 		if (*s < 0x20 || *s > 0x7e)
617 			return (0);
618 		s++;
619 	}
620 	return (1);
621 }
622 
623 /*
624  * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
625  * the returned string. Anything not valid printable ASCII or UTF-8 is
626  * stripped.
627  */
628 char *
629 utf8_sanitize(const char *src)
630 {
631 	char		*dst = NULL;
632 	size_t		 n = 0;
633 	enum utf8_state	 more;
634 	struct utf8_data ud;
635 	u_int		 i;
636 
637 	while (*src != '\0') {
638 		dst = xreallocarray(dst, n + 1, sizeof *dst);
639 		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
640 			while (*++src != '\0' && more == UTF8_MORE)
641 				more = utf8_append(&ud, *src);
642 			if (more == UTF8_DONE) {
643 				dst = xreallocarray(dst, n + ud.width,
644 				    sizeof *dst);
645 				for (i = 0; i < ud.width; i++)
646 					dst[n++] = '_';
647 				continue;
648 			}
649 			src -= ud.have;
650 		}
651 		if (*src > 0x1f && *src < 0x7f)
652 			dst[n++] = *src;
653 		else
654 			dst[n++] = '_';
655 		src++;
656 	}
657 	dst = xreallocarray(dst, n + 1, sizeof *dst);
658 	dst[n] = '\0';
659 	return (dst);
660 }
661 
662 /* Get UTF-8 buffer length. */
663 size_t
664 utf8_strlen(const struct utf8_data *s)
665 {
666 	size_t	i;
667 
668 	for (i = 0; s[i].size != 0; i++)
669 		/* nothing */;
670 	return (i);
671 }
672 
673 /* Get UTF-8 string width. */
674 u_int
675 utf8_strwidth(const struct utf8_data *s, ssize_t n)
676 {
677 	ssize_t	i;
678 	u_int	width = 0;
679 
680 	for (i = 0; s[i].size != 0; i++) {
681 		if (n != -1 && n == i)
682 			break;
683 		width += s[i].width;
684 	}
685 	return (width);
686 }
687 
688 /*
689  * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
690  * Caller frees.
691  */
692 struct utf8_data *
693 utf8_fromcstr(const char *src)
694 {
695 	struct utf8_data	*dst = NULL;
696 	size_t			 n = 0;
697 	enum utf8_state		 more;
698 
699 	while (*src != '\0') {
700 		dst = xreallocarray(dst, n + 1, sizeof *dst);
701 		if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
702 			while (*++src != '\0' && more == UTF8_MORE)
703 				more = utf8_append(&dst[n], *src);
704 			if (more == UTF8_DONE) {
705 				n++;
706 				continue;
707 			}
708 			src -= dst[n].have;
709 		}
710 		utf8_set(&dst[n], *src);
711 		n++;
712 		src++;
713 	}
714 	dst = xreallocarray(dst, n + 1, sizeof *dst);
715 	dst[n].size = 0;
716 	return (dst);
717 }
718 
719 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
720 char *
721 utf8_tocstr(struct utf8_data *src)
722 {
723 	char	*dst = NULL;
724 	size_t	 n = 0;
725 
726 	for(; src->size != 0; src++) {
727 		dst = xreallocarray(dst, n + src->size, 1);
728 		memcpy(dst + n, src->data, src->size);
729 		n += src->size;
730 	}
731 	dst = xreallocarray(dst, n + 1, 1);
732 	dst[n] = '\0';
733 	return (dst);
734 }
735 
736 /* Get width of UTF-8 string. */
737 u_int
738 utf8_cstrwidth(const char *s)
739 {
740 	struct utf8_data	tmp;
741 	u_int			width;
742 	enum utf8_state		more;
743 
744 	width = 0;
745 	while (*s != '\0') {
746 		if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
747 			while (*++s != '\0' && more == UTF8_MORE)
748 				more = utf8_append(&tmp, *s);
749 			if (more == UTF8_DONE) {
750 				width += tmp.width;
751 				continue;
752 			}
753 			s -= tmp.have;
754 		}
755 		if (*s > 0x1f && *s != 0x7f)
756 			width++;
757 		s++;
758 	}
759 	return (width);
760 }
761 
762 /* Pad UTF-8 string to width on the left. Caller frees. */
763 char *
764 utf8_padcstr(const char *s, u_int width)
765 {
766 	size_t	 slen;
767 	char	*out;
768 	u_int	 n, i;
769 
770 	n = utf8_cstrwidth(s);
771 	if (n >= width)
772 		return (xstrdup(s));
773 
774 	slen = strlen(s);
775 	out = xmalloc(slen + 1 + (width - n));
776 	memcpy(out, s, slen);
777 	for (i = n; i < width; i++)
778 		out[slen++] = ' ';
779 	out[slen] = '\0';
780 	return (out);
781 }
782 
783 /* Pad UTF-8 string to width on the right. Caller frees. */
784 char *
785 utf8_rpadcstr(const char *s, u_int width)
786 {
787 	size_t	 slen;
788 	char	*out;
789 	u_int	 n, i;
790 
791 	n = utf8_cstrwidth(s);
792 	if (n >= width)
793 		return (xstrdup(s));
794 
795 	slen = strlen(s);
796 	out = xmalloc(slen + 1 + (width - n));
797 	for (i = 0; i < width - n; i++)
798 		out[i] = ' ';
799 	memcpy(out + i, s, slen);
800 	out[i + slen] = '\0';
801 	return (out);
802 }
803 
804 int
805 utf8_cstrhas(const char *s, const struct utf8_data *ud)
806 {
807 	struct utf8_data	*copy, *loop;
808 	int			 found = 0;
809 
810 	copy = utf8_fromcstr(s);
811 	for (loop = copy; loop->size != 0; loop++) {
812 		if (loop->size != ud->size)
813 			continue;
814 		if (memcmp(loop->data, ud->data, loop->size) == 0) {
815 			found = 1;
816 			break;
817 		}
818 	}
819 	free(copy);
820 
821 	return (found);
822 }
823