xref: /netbsd-src/external/bsd/tmux/dist/utf8.c (revision 796c32c94f6e154afc9de0f63da35c91bb739b45)
1 /* $OpenBSD$ */
2 
3 /*
4  * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 
21 #include <errno.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <wchar.h>
25 
26 #include "tmux.h"
27 
28 static int	utf8_width(wchar_t);
29 
30 /* Set a single character. */
31 void
32 utf8_set(struct utf8_data *ud, u_char ch)
33 {
34 	static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
35 
36 	memcpy(ud, &empty, sizeof *ud);
37 	*ud->data = ch;
38 }
39 
40 /* Copy UTF-8 character. */
41 void
42 utf8_copy(struct utf8_data *to, const struct utf8_data *from)
43 {
44 	u_int	i;
45 
46 	memcpy(to, from, sizeof *to);
47 
48 	for (i = to->size; i < sizeof to->data; i++)
49 		to->data[i] = '\0';
50 }
51 
52 /*
53  * Open UTF-8 sequence.
54  *
55  * 11000010-11011111 C2-DF start of 2-byte sequence
56  * 11100000-11101111 E0-EF start of 3-byte sequence
57  * 11110000-11110100 F0-F4 start of 4-byte sequence
58  */
59 enum utf8_state
60 utf8_open(struct utf8_data *ud, u_char ch)
61 {
62 	memset(ud, 0, sizeof *ud);
63 	if (ch >= 0xc2 && ch <= 0xdf)
64 		ud->size = 2;
65 	else if (ch >= 0xe0 && ch <= 0xef)
66 		ud->size = 3;
67 	else if (ch >= 0xf0 && ch <= 0xf4)
68 		ud->size = 4;
69 	else
70 		return (UTF8_ERROR);
71 	utf8_append(ud, ch);
72 	return (UTF8_MORE);
73 }
74 
75 /* Append character to UTF-8, closing if finished. */
76 enum utf8_state
77 utf8_append(struct utf8_data *ud, u_char ch)
78 {
79 	wchar_t	wc;
80 	int	width;
81 
82 	if (ud->have >= ud->size)
83 		fatalx("UTF-8 character overflow");
84 	if (ud->size > sizeof ud->data)
85 		fatalx("UTF-8 character size too large");
86 
87 	if (ud->have != 0 && (ch & 0xc0) != 0x80)
88 		ud->width = 0xff;
89 
90 	ud->data[ud->have++] = ch;
91 	if (ud->have != ud->size)
92 		return (UTF8_MORE);
93 
94 	if (ud->width == 0xff)
95 		return (UTF8_ERROR);
96 
97 	if (utf8_combine(ud, &wc) != UTF8_DONE)
98 		return (UTF8_ERROR);
99 	if ((width = utf8_width(wc)) < 0)
100 		return (UTF8_ERROR);
101 	ud->width = width;
102 
103 	return (UTF8_DONE);
104 }
105 
/*
 * Return the display width of wc, or -1 if no usable width is available.
 * Widths in the range 0 to 0xff are passed through unchanged.
 */
static int
utf8_width(wchar_t wc)
{
	int	width;

#ifdef HAVE_UTF8PROC
	width = utf8proc_wcwidth(wc);
#else
	width = wcwidth(wc);
#endif
	if (width >= 0 && width <= 0xff)
		return (width);

	log_debug("Unicode %04lx, wcwidth() %d", (long)wc, width);

#ifndef __OpenBSD__
	/*
	 * Plenty of platforms (OS X in particular) report -1 from wcwidth()
	 * for characters that are in fairly common use, so treat a negative
	 * result as width 1. That is wrong for characters that really are
	 * nonprintable, but those ought to be rare; letting them through is
	 * no worse than what the terminal would be sent without tmux.
	 */
	if (width < 0)
		return (1);
#endif
	return (-1);
}
137 
138 /* Combine UTF-8 into Unicode. */
139 enum utf8_state
140 utf8_combine(const struct utf8_data *ud, wchar_t *wc)
141 {
142 #ifdef HAVE_UTF8PROC
143 	switch (utf8proc_mbtowc(wc, ud->data, ud->size)) {
144 #else
145 	switch (mbtowc(wc, (const char *)ud->data, ud->size)) {
146 #endif
147 	case -1:
148 		log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
149 		    errno);
150 		mbtowc(NULL, NULL, MB_CUR_MAX);
151 		return (UTF8_ERROR);
152 	case 0:
153 		return (UTF8_ERROR);
154 	default:
155 		return (UTF8_DONE);
156 	}
157 }
158 
159 /* Split Unicode into UTF-8. */
160 enum utf8_state
161 utf8_split(wchar_t wc, struct utf8_data *ud)
162 {
163 	char	s[MB_LEN_MAX];
164 	int	slen;
165 
166 #ifdef HAVE_UTF8PROC
167 	slen = utf8proc_wctomb(s, wc);
168 #else
169 	slen = wctomb(s, wc);
170 #endif
171 	if (slen <= 0 || slen > (int)sizeof ud->data)
172 		return (UTF8_ERROR);
173 
174 	memcpy(ud->data, s, slen);
175 	ud->size = slen;
176 
177 	ud->width = utf8_width(wc);
178 	return (UTF8_DONE);
179 }
180 
181 /*
182  * Encode len characters from src into dst, which is guaranteed to have four
183  * bytes available for each character from src (for \abc or UTF-8) plus space
184  * for \0.
185  */
186 int
187 utf8_strvis(char *dst, const char *src, size_t len, int flag)
188 {
189 	struct utf8_data	 ud;
190 	const char		*start, *end;
191 	enum utf8_state		 more;
192 	size_t			 i;
193 
194 	start = dst;
195 	end = src + len;
196 
197 	while (src < end) {
198 		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
199 			while (++src < end && more == UTF8_MORE)
200 				more = utf8_append(&ud, *src);
201 			if (more == UTF8_DONE) {
202 				/* UTF-8 character finished. */
203 				for (i = 0; i < ud.size; i++)
204 					*dst++ = ud.data[i];
205 				continue;
206 			}
207 			/* Not a complete, valid UTF-8 character. */
208 			src -= ud.have;
209 		}
210 		if (src < end - 1)
211 			dst = vis(dst, src[0], flag, src[1]);
212 		else if (src < end)
213 			dst = vis(dst, src[0], flag, '\0');
214 		src++;
215 	}
216 
217 	*dst = '\0';
218 	return (dst - start);
219 }
220 
/*
 * Like utf8_strvis() but the output buffer is allocated here and handed back
 * through *dst. Returns the length reported by utf8_strvis().
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	const size_t	 srclen = strlen(src);
	char		*encoded;
	int		 n;

	/* Reserve four bytes for every source byte and for the terminator. */
	encoded = xreallocarray(NULL, 4, srclen + 1);
	n = utf8_strvis(encoded, src, srclen, flag);

	/* Resize to what was actually produced, including the \0. */
	*dst = xrealloc(encoded, n + 1);
	return (n);
}
234 
235 /* Does this string contain anything that isn't valid UTF-8? */
236 int
237 utf8_isvalid(const char *s)
238 {
239 	struct utf8_data	 ud;
240 	const char		*end;
241 	enum utf8_state		 more;
242 
243 	end = s + strlen(s);
244 	while (s < end) {
245 		if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
246 			while (++s < end && more == UTF8_MORE)
247 				more = utf8_append(&ud, *s);
248 			if (more == UTF8_DONE)
249 				continue;
250 			return (0);
251 		}
252 		if (*s < 0x20 || *s > 0x7e)
253 			return (0);
254 		s++;
255 	}
256 	return (1);
257 }
258 
259 /*
260  * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
261  * the returned string. Anything not valid printable ASCII or UTF-8 is
262  * stripped.
263  */
264 char *
265 utf8_sanitize(const char *src)
266 {
267 	char			*dst;
268 	size_t			 n;
269 	enum utf8_state		 more;
270 	struct utf8_data	 ud;
271 	u_int			 i;
272 
273 	dst = NULL;
274 
275 	n = 0;
276 	while (*src != '\0') {
277 		dst = xreallocarray(dst, n + 1, sizeof *dst);
278 		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
279 			while (*++src != '\0' && more == UTF8_MORE)
280 				more = utf8_append(&ud, *src);
281 			if (more == UTF8_DONE) {
282 				dst = xreallocarray(dst, n + ud.width,
283 				    sizeof *dst);
284 				for (i = 0; i < ud.width; i++)
285 					dst[n++] = '_';
286 				continue;
287 			}
288 			src -= ud.have;
289 		}
290 		if (*src > 0x1f && *src < 0x7f)
291 			dst[n++] = *src;
292 		else
293 			dst[n++] = '_';
294 		src++;
295 	}
296 
297 	dst = xreallocarray(dst, n + 1, sizeof *dst);
298 	dst[n] = '\0';
299 	return (dst);
300 }
301 
302 /* Get UTF-8 buffer length. */
303 size_t
304 utf8_strlen(const struct utf8_data *s)
305 {
306 	size_t	i;
307 
308 	for (i = 0; s[i].size != 0; i++)
309 		/* nothing */;
310 	return (i);
311 }
312 
313 /* Get UTF-8 string width. */
314 u_int
315 utf8_strwidth(const struct utf8_data *s, ssize_t n)
316 {
317 	ssize_t	i;
318 	u_int	width;
319 
320 	width = 0;
321 	for (i = 0; s[i].size != 0; i++) {
322 		if (n != -1 && n == i)
323 			break;
324 		width += s[i].width;
325 	}
326 	return (width);
327 }
328 
329 /*
330  * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
331  * Caller frees.
332  */
333 struct utf8_data *
334 utf8_fromcstr(const char *src)
335 {
336 	struct utf8_data	*dst;
337 	size_t			 n;
338 	enum utf8_state		 more;
339 
340 	dst = NULL;
341 
342 	n = 0;
343 	while (*src != '\0') {
344 		dst = xreallocarray(dst, n + 1, sizeof *dst);
345 		if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
346 			while (*++src != '\0' && more == UTF8_MORE)
347 				more = utf8_append(&dst[n], *src);
348 			if (more == UTF8_DONE) {
349 				n++;
350 				continue;
351 			}
352 			src -= dst[n].have;
353 		}
354 		utf8_set(&dst[n], *src);
355 		n++;
356 		src++;
357 	}
358 
359 	dst = xreallocarray(dst, n + 1, sizeof *dst);
360 	dst[n].size = 0;
361 	return (dst);
362 }
363 
364 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
365 char *
366 utf8_tocstr(struct utf8_data *src)
367 {
368 	char	*dst;
369 	size_t	 n;
370 
371 	dst = NULL;
372 
373 	n = 0;
374 	for(; src->size != 0; src++) {
375 		dst = xreallocarray(dst, n + src->size, 1);
376 		memcpy(dst + n, src->data, src->size);
377 		n += src->size;
378 	}
379 
380 	dst = xreallocarray(dst, n + 1, 1);
381 	dst[n] = '\0';
382 	return (dst);
383 }
384 
385 /* Get width of UTF-8 string. */
386 u_int
387 utf8_cstrwidth(const char *s)
388 {
389 	struct utf8_data	tmp;
390 	u_int			width;
391 	enum utf8_state		more;
392 
393 	width = 0;
394 	while (*s != '\0') {
395 		if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
396 			while (*++s != '\0' && more == UTF8_MORE)
397 				more = utf8_append(&tmp, *s);
398 			if (more == UTF8_DONE) {
399 				width += tmp.width;
400 				continue;
401 			}
402 			s -= tmp.have;
403 		}
404 		if (*s > 0x1f && *s != 0x7f)
405 			width++;
406 		s++;
407 	}
408 	return (width);
409 }
410 
411 /* Trim UTF-8 string to width. Caller frees. */
412 char *
413 utf8_trimcstr(const char *s, u_int width)
414 {
415 	struct utf8_data	*tmp, *next;
416 	char			*out;
417 	u_int			 at;
418 
419 	tmp = utf8_fromcstr(s);
420 
421 	at = 0;
422 	for (next = tmp; next->size != 0; next++) {
423 		if (at + next->width > width) {
424 			next->size = 0;
425 			break;
426 		}
427 		at += next->width;
428 	}
429 
430 	out = utf8_tocstr(tmp);
431 	free(tmp);
432 	return (out);
433 }
434 
435 /* Trim UTF-8 string to width. Caller frees. */
436 char *
437 utf8_rtrimcstr(const char *s, u_int width)
438 {
439 	struct utf8_data	*tmp, *next, *end;
440 	char			*out;
441 	u_int			 at;
442 
443 	tmp = utf8_fromcstr(s);
444 
445 	for (end = tmp; end->size != 0; end++)
446 		/* nothing */;
447 	if (end == tmp) {
448 		free(tmp);
449 		return (xstrdup(""));
450 	}
451 	next = end - 1;
452 
453 	at = 0;
454 	for (;;) {
455 		if (at + next->width > width) {
456 			next++;
457 			break;
458 		}
459 		at += next->width;
460 
461 		if (next == tmp)
462 			break;
463 		next--;
464 	}
465 
466 	out = utf8_tocstr(next);
467 	free(tmp);
468 	return (out);
469 }
470 
471 /* Pad UTF-8 string to width. Caller frees. */
472 char *
473 utf8_padcstr(const char *s, u_int width)
474 {
475 	size_t	 slen;
476 	char	*out;
477 	u_int	  n, i;
478 
479 	n = utf8_cstrwidth(s);
480 	if (n >= width)
481 		return (xstrdup(s));
482 
483 	slen = strlen(s);
484 	out = xmalloc(slen + 1 + (width - n));
485 	memcpy(out, s, slen);
486 	for (i = n; i < width; i++)
487 		out[slen++] = ' ';
488 	out[slen] = '\0';
489 	return (out);
490 }
491