xref: /openbsd-src/usr.bin/tmux/utf8.c (revision 99fd087599a8791921855f21bd7e36130f39aadc)
1 /* $OpenBSD: utf8.c,v 1.44 2019/11/25 15:04:15 nicm Exp $ */
2 
3 /*
4  * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 
21 #include <ctype.h>
22 #include <errno.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <vis.h>
26 #include <wchar.h>
27 
28 #include "tmux.h"
29 
30 static int	utf8_width(wchar_t);
31 
32 /* Set a single character. */
33 void
34 utf8_set(struct utf8_data *ud, u_char ch)
35 {
36 	static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
37 
38 	memcpy(ud, &empty, sizeof *ud);
39 	*ud->data = ch;
40 }
41 
42 /* Copy UTF-8 character. */
43 void
44 utf8_copy(struct utf8_data *to, const struct utf8_data *from)
45 {
46 	u_int	i;
47 
48 	memcpy(to, from, sizeof *to);
49 
50 	for (i = to->size; i < sizeof to->data; i++)
51 		to->data[i] = '\0';
52 }
53 
54 /*
55  * Open UTF-8 sequence.
56  *
57  * 11000010-11011111 C2-DF start of 2-byte sequence
58  * 11100000-11101111 E0-EF start of 3-byte sequence
59  * 11110000-11110100 F0-F4 start of 4-byte sequence
60  */
61 enum utf8_state
62 utf8_open(struct utf8_data *ud, u_char ch)
63 {
64 	memset(ud, 0, sizeof *ud);
65 	if (ch >= 0xc2 && ch <= 0xdf)
66 		ud->size = 2;
67 	else if (ch >= 0xe0 && ch <= 0xef)
68 		ud->size = 3;
69 	else if (ch >= 0xf0 && ch <= 0xf4)
70 		ud->size = 4;
71 	else
72 		return (UTF8_ERROR);
73 	utf8_append(ud, ch);
74 	return (UTF8_MORE);
75 }
76 
77 /* Append character to UTF-8, closing if finished. */
78 enum utf8_state
79 utf8_append(struct utf8_data *ud, u_char ch)
80 {
81 	wchar_t	wc;
82 	int	width;
83 
84 	if (ud->have >= ud->size)
85 		fatalx("UTF-8 character overflow");
86 	if (ud->size > sizeof ud->data)
87 		fatalx("UTF-8 character size too large");
88 
89 	if (ud->have != 0 && (ch & 0xc0) != 0x80)
90 		ud->width = 0xff;
91 
92 	ud->data[ud->have++] = ch;
93 	if (ud->have != ud->size)
94 		return (UTF8_MORE);
95 
96 	if (ud->width == 0xff)
97 		return (UTF8_ERROR);
98 
99 	if (utf8_combine(ud, &wc) != UTF8_DONE)
100 		return (UTF8_ERROR);
101 	if ((width = utf8_width(wc)) < 0)
102 		return (UTF8_ERROR);
103 	ud->width = width;
104 
105 	return (UTF8_DONE);
106 }
107 
/*
 * Return the wcwidth() of wc, or -1 (after logging) when wcwidth() reports a
 * negative value or one larger than 0xff.
 */
static int
utf8_width(wchar_t wc)
{
	int	cells = wcwidth(wc);

	if (cells >= 0 && cells <= 0xff)
		return (cells);

	log_debug("Unicode %04lx, wcwidth() %d", (long)wc, cells);
	return (-1);
}
121 
122 /* Combine UTF-8 into Unicode. */
123 enum utf8_state
124 utf8_combine(const struct utf8_data *ud, wchar_t *wc)
125 {
126 	switch (mbtowc(wc, ud->data, ud->size)) {
127 	case -1:
128 		log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
129 		    errno);
130 		mbtowc(NULL, NULL, MB_CUR_MAX);
131 		return (UTF8_ERROR);
132 	case 0:
133 		return (UTF8_ERROR);
134 	default:
135 		return (UTF8_DONE);
136 	}
137 }
138 
139 /* Split Unicode into UTF-8. */
140 enum utf8_state
141 utf8_split(wchar_t wc, struct utf8_data *ud)
142 {
143 	char	s[MB_LEN_MAX];
144 	int	slen;
145 
146 	slen = wctomb(s, wc);
147 	if (slen <= 0 || slen > (int)sizeof ud->data)
148 		return (UTF8_ERROR);
149 
150 	memcpy(ud->data, s, slen);
151 	ud->size = slen;
152 
153 	ud->width = utf8_width(wc);
154 	return (UTF8_DONE);
155 }
156 
157 /*
158  * Encode len characters from src into dst, which is guaranteed to have four
159  * bytes available for each character from src (for \abc or UTF-8) plus space
160  * for \0.
161  */
162 int
163 utf8_strvis(char *dst, const char *src, size_t len, int flag)
164 {
165 	struct utf8_data	 ud;
166 	const char		*start, *end;
167 	enum utf8_state		 more;
168 	size_t			 i;
169 
170 	start = dst;
171 	end = src + len;
172 
173 	while (src < end) {
174 		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
175 			while (++src < end && more == UTF8_MORE)
176 				more = utf8_append(&ud, *src);
177 			if (more == UTF8_DONE) {
178 				/* UTF-8 character finished. */
179 				for (i = 0; i < ud.size; i++)
180 					*dst++ = ud.data[i];
181 				continue;
182 			}
183 			/* Not a complete, valid UTF-8 character. */
184 			src -= ud.have;
185 		}
186 		if (src[0] == '$' && src < end - 1) {
187 			if (isalpha((u_char)src[1]) ||
188 			    src[1] == '_' ||
189 			    src[1] == '{')
190 				*dst++ = '\\';
191 			*dst++ = '$';
192 		} else if (src < end - 1)
193 			dst = vis(dst, src[0], flag, src[1]);
194 		else if (src < end)
195 			dst = vis(dst, src[0], flag, '\0');
196 		src++;
197 	}
198 
199 	*dst = '\0';
200 	return (dst - start);
201 }
202 
/*
 * Like utf8_strvis() but the output buffer is allocated here: four bytes per
 * source byte (plus the terminator) are reserved up front and the buffer is
 * resized to fit once the encoded length is known. The buffer is returned
 * through *dst and the encoded length is the return value.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*buf;
	int	 len;

	buf = xreallocarray(NULL, 4, srclen + 1);
	len = utf8_strvis(buf, src, srclen, flag);

	*dst = xrealloc(buf, len + 1);
	return (len);
}
216 
217 /* Does this string contain anything that isn't valid UTF-8? */
218 int
219 utf8_isvalid(const char *s)
220 {
221 	struct utf8_data	 ud;
222 	const char		*end;
223 	enum utf8_state		 more;
224 
225 	end = s + strlen(s);
226 	while (s < end) {
227 		if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
228 			while (++s < end && more == UTF8_MORE)
229 				more = utf8_append(&ud, *s);
230 			if (more == UTF8_DONE)
231 				continue;
232 			return (0);
233 		}
234 		if (*s < 0x20 || *s > 0x7e)
235 			return (0);
236 		s++;
237 	}
238 	return (1);
239 }
240 
241 /*
242  * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
243  * the returned string. Anything not valid printable ASCII or UTF-8 is
244  * stripped.
245  */
246 char *
247 utf8_sanitize(const char *src)
248 {
249 	char			*dst;
250 	size_t			 n;
251 	enum utf8_state		 more;
252 	struct utf8_data	 ud;
253 	u_int			 i;
254 
255 	dst = NULL;
256 
257 	n = 0;
258 	while (*src != '\0') {
259 		dst = xreallocarray(dst, n + 1, sizeof *dst);
260 		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
261 			while (*++src != '\0' && more == UTF8_MORE)
262 				more = utf8_append(&ud, *src);
263 			if (more == UTF8_DONE) {
264 				dst = xreallocarray(dst, n + ud.width,
265 				    sizeof *dst);
266 				for (i = 0; i < ud.width; i++)
267 					dst[n++] = '_';
268 				continue;
269 			}
270 			src -= ud.have;
271 		}
272 		if (*src > 0x1f && *src < 0x7f)
273 			dst[n++] = *src;
274 		else
275 			dst[n++] = '_';
276 		src++;
277 	}
278 
279 	dst = xreallocarray(dst, n + 1, sizeof *dst);
280 	dst[n] = '\0';
281 	return (dst);
282 }
283 
284 /* Get UTF-8 buffer length. */
285 size_t
286 utf8_strlen(const struct utf8_data *s)
287 {
288 	size_t	i;
289 
290 	for (i = 0; s[i].size != 0; i++)
291 		/* nothing */;
292 	return (i);
293 }
294 
295 /* Get UTF-8 string width. */
296 u_int
297 utf8_strwidth(const struct utf8_data *s, ssize_t n)
298 {
299 	ssize_t	i;
300 	u_int	width;
301 
302 	width = 0;
303 	for (i = 0; s[i].size != 0; i++) {
304 		if (n != -1 && n == i)
305 			break;
306 		width += s[i].width;
307 	}
308 	return (width);
309 }
310 
311 /*
312  * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
313  * Caller frees.
314  */
315 struct utf8_data *
316 utf8_fromcstr(const char *src)
317 {
318 	struct utf8_data	*dst;
319 	size_t			 n;
320 	enum utf8_state		 more;
321 
322 	dst = NULL;
323 
324 	n = 0;
325 	while (*src != '\0') {
326 		dst = xreallocarray(dst, n + 1, sizeof *dst);
327 		if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
328 			while (*++src != '\0' && more == UTF8_MORE)
329 				more = utf8_append(&dst[n], *src);
330 			if (more == UTF8_DONE) {
331 				n++;
332 				continue;
333 			}
334 			src -= dst[n].have;
335 		}
336 		utf8_set(&dst[n], *src);
337 		n++;
338 		src++;
339 	}
340 
341 	dst = xreallocarray(dst, n + 1, sizeof *dst);
342 	dst[n].size = 0;
343 	return (dst);
344 }
345 
346 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
347 char *
348 utf8_tocstr(struct utf8_data *src)
349 {
350 	char	*dst;
351 	size_t	 n;
352 
353 	dst = NULL;
354 
355 	n = 0;
356 	for(; src->size != 0; src++) {
357 		dst = xreallocarray(dst, n + src->size, 1);
358 		memcpy(dst + n, src->data, src->size);
359 		n += src->size;
360 	}
361 
362 	dst = xreallocarray(dst, n + 1, 1);
363 	dst[n] = '\0';
364 	return (dst);
365 }
366 
367 /* Get width of UTF-8 string. */
368 u_int
369 utf8_cstrwidth(const char *s)
370 {
371 	struct utf8_data	tmp;
372 	u_int			width;
373 	enum utf8_state		more;
374 
375 	width = 0;
376 	while (*s != '\0') {
377 		if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
378 			while (*++s != '\0' && more == UTF8_MORE)
379 				more = utf8_append(&tmp, *s);
380 			if (more == UTF8_DONE) {
381 				width += tmp.width;
382 				continue;
383 			}
384 			s -= tmp.have;
385 		}
386 		if (*s > 0x1f && *s != 0x7f)
387 			width++;
388 		s++;
389 	}
390 	return (width);
391 }
392 
393 /* Pad UTF-8 string to width on the left. Caller frees. */
394 char *
395 utf8_padcstr(const char *s, u_int width)
396 {
397 	size_t	 slen;
398 	char	*out;
399 	u_int	  n, i;
400 
401 	n = utf8_cstrwidth(s);
402 	if (n >= width)
403 		return (xstrdup(s));
404 
405 	slen = strlen(s);
406 	out = xmalloc(slen + 1 + (width - n));
407 	memcpy(out, s, slen);
408 	for (i = n; i < width; i++)
409 		out[slen++] = ' ';
410 	out[slen] = '\0';
411 	return (out);
412 }
413 
414 /* Pad UTF-8 string to width on the right. Caller frees. */
415 char *
416 utf8_rpadcstr(const char *s, u_int width)
417 {
418 	size_t	 slen;
419 	char	*out;
420 	u_int	  n, i;
421 
422 	n = utf8_cstrwidth(s);
423 	if (n >= width)
424 		return (xstrdup(s));
425 
426 	slen = strlen(s);
427 	out = xmalloc(slen + 1 + (width - n));
428 	for (i = 0; i < width - n; i++)
429 		out[i] = ' ';
430 	memcpy(out + i, s, slen);
431 	out[i + slen] = '\0';
432 	return (out);
433 }
434 
435 int
436 utf8_cstrhas(const char *s, const struct utf8_data *ud)
437 {
438 	struct utf8_data	*copy, *loop;
439 	int			 found = 0;
440 
441 	copy = utf8_fromcstr(s);
442 	for (loop = copy; loop->size != 0; loop++) {
443 		if (loop->size != ud->size)
444 			continue;
445 		if (memcmp(loop->data, ud->data, loop->size) == 0) {
446 			found = 1;
447 			break;
448 		}
449 	}
450 	free(copy);
451 
452 	return (found);
453 }
454