xref: /openbsd-src/usr.bin/tmux/utf8.c (revision 03adc85b7600a1f8f04886b8321c1c1c0c4933d4)
1 /* $OpenBSD: utf8.c,v 1.35 2017/01/18 10:08:05 nicm Exp $ */
2 
3 /*
4  * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 
21 #include <errno.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <vis.h>
25 #include <wchar.h>
26 
27 #include "tmux.h"
28 
/* Forward declaration: display width of a wide character, or -1 if unusable. */
static int	utf8_width(wchar_t wc);
30 
31 /* Set a single character. */
32 void
33 utf8_set(struct utf8_data *ud, u_char ch)
34 {
35 	static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
36 
37 	memcpy(ud, &empty, sizeof *ud);
38 	*ud->data = ch;
39 }
40 
41 /* Copy UTF-8 character. */
42 void
43 utf8_copy(struct utf8_data *to, const struct utf8_data *from)
44 {
45 	u_int	i;
46 
47 	memcpy(to, from, sizeof *to);
48 
49 	for (i = to->size; i < sizeof to->data; i++)
50 		to->data[i] = '\0';
51 }
52 
53 /*
54  * Open UTF-8 sequence.
55  *
56  * 11000010-11011111 C2-DF start of 2-byte sequence
57  * 11100000-11101111 E0-EF start of 3-byte sequence
58  * 11110000-11110100 F0-F4 start of 4-byte sequence
59  */
60 enum utf8_state
61 utf8_open(struct utf8_data *ud, u_char ch)
62 {
63 	memset(ud, 0, sizeof *ud);
64 	if (ch >= 0xc2 && ch <= 0xdf)
65 		ud->size = 2;
66 	else if (ch >= 0xe0 && ch <= 0xef)
67 		ud->size = 3;
68 	else if (ch >= 0xf0 && ch <= 0xf4)
69 		ud->size = 4;
70 	else
71 		return (UTF8_ERROR);
72 	utf8_append(ud, ch);
73 	return (UTF8_MORE);
74 }
75 
76 /* Append character to UTF-8, closing if finished. */
77 enum utf8_state
78 utf8_append(struct utf8_data *ud, u_char ch)
79 {
80 	wchar_t	wc;
81 	int	width;
82 
83 	if (ud->have >= ud->size)
84 		fatalx("UTF-8 character overflow");
85 	if (ud->size > sizeof ud->data)
86 		fatalx("UTF-8 character size too large");
87 
88 	if (ud->have != 0 && (ch & 0xc0) != 0x80)
89 		ud->width = 0xff;
90 
91 	ud->data[ud->have++] = ch;
92 	if (ud->have != ud->size)
93 		return (UTF8_MORE);
94 
95 	if (ud->width == 0xff)
96 		return (UTF8_ERROR);
97 
98 	if (utf8_combine(ud, &wc) != UTF8_DONE)
99 		return (UTF8_ERROR);
100 	if ((width = utf8_width(wc)) < 0)
101 		return (UTF8_ERROR);
102 	ud->width = width;
103 
104 	return (UTF8_DONE);
105 }
106 
/*
 * Get width of Unicode character.
 *
 * Returns the wcwidth() result when it lies in 0..0xff. Otherwise the
 * character is logged and -1 is returned.
 */
static int
utf8_width(wchar_t wc)
{
	int	width;

	width = wcwidth(wc);
	if (width < 0 || width > 0xff) {
		/*
		 * wchar_t has no printf conversion of its own; convert it
		 * explicitly so the argument type matches the specifier.
		 */
		log_debug("Unicode %04lx, wcwidth() %d", (unsigned long)wc,
		    width);
		return (-1);
	}
	return (width);
}
120 
121 /* Combine UTF-8 into Unicode. */
122 enum utf8_state
123 utf8_combine(const struct utf8_data *ud, wchar_t *wc)
124 {
125 	switch (mbtowc(wc, ud->data, ud->size)) {
126 	case -1:
127 		log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
128 		    errno);
129 		mbtowc(NULL, NULL, MB_CUR_MAX);
130 		return (UTF8_ERROR);
131 	case 0:
132 		return (UTF8_ERROR);
133 	default:
134 		return (UTF8_DONE);
135 	}
136 }
137 
138 /* Split Unicode into UTF-8. */
139 enum utf8_state
140 utf8_split(wchar_t wc, struct utf8_data *ud)
141 {
142 	char	s[MB_LEN_MAX];
143 	int	slen;
144 
145 	slen = wctomb(s, wc);
146 	if (slen <= 0 || slen > (int)sizeof ud->data)
147 		return (UTF8_ERROR);
148 
149 	memcpy(ud->data, s, slen);
150 	ud->size = slen;
151 
152 	ud->width = utf8_width(wc);
153 	return (UTF8_DONE);
154 }
155 
156 /*
157  * Encode len characters from src into dst, which is guaranteed to have four
158  * bytes available for each character from src (for \abc or UTF-8) plus space
159  * for \0.
160  */
161 int
162 utf8_strvis(char *dst, const char *src, size_t len, int flag)
163 {
164 	struct utf8_data	 ud;
165 	const char		*start, *end;
166 	enum utf8_state		 more;
167 	size_t			 i;
168 
169 	start = dst;
170 	end = src + len;
171 
172 	while (src < end) {
173 		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
174 			while (++src < end && more == UTF8_MORE)
175 				more = utf8_append(&ud, *src);
176 			if (more == UTF8_DONE) {
177 				/* UTF-8 character finished. */
178 				for (i = 0; i < ud.size; i++)
179 					*dst++ = ud.data[i];
180 				continue;
181 			}
182 			/* Not a complete, valid UTF-8 character. */
183 			src -= ud.have;
184 		}
185 		if (src < end - 1)
186 			dst = vis(dst, src[0], flag, src[1]);
187 		else if (src < end)
188 			dst = vis(dst, src[0], flag, '\0');
189 		src++;
190 	}
191 
192 	*dst = '\0';
193 	return (dst - start);
194 }
195 
/*
 * Like utf8_strvis() but the output buffer is allocated here and handed
 * back through *dst. Returns the encoded length, not counting the \0.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*out;
	int	 outlen;

	/* Worst case is four output bytes per input byte. */
	out = xreallocarray(NULL, 4, srclen + 1);
	outlen = utf8_strvis(out, src, srclen, flag);

	/* Resize to what was actually used. */
	*dst = xrealloc(out, outlen + 1);
	return (outlen);
}
209 
210 /*
211  * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
212  * the returned string. Anything not valid printable ASCII or UTF-8 is
213  * stripped.
214  */
215 char *
216 utf8_sanitize(const char *src)
217 {
218 	char			*dst;
219 	size_t			 n;
220 	enum utf8_state		 more;
221 	struct utf8_data	 ud;
222 	u_int			 i;
223 
224 	dst = NULL;
225 
226 	n = 0;
227 	while (*src != '\0') {
228 		dst = xreallocarray(dst, n + 1, sizeof *dst);
229 		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
230 			while (*++src != '\0' && more == UTF8_MORE)
231 				more = utf8_append(&ud, *src);
232 			if (more == UTF8_DONE) {
233 				dst = xreallocarray(dst, n + ud.width,
234 				    sizeof *dst);
235 				for (i = 0; i < ud.width; i++)
236 					dst[n++] = '_';
237 				continue;
238 			}
239 			src -= ud.have;
240 		}
241 		if (*src > 0x1f && *src < 0x7f)
242 			dst[n++] = *src;
243 		else
244 			dst[n++] = '_';
245 		src++;
246 	}
247 
248 	dst = xreallocarray(dst, n + 1, sizeof *dst);
249 	dst[n] = '\0';
250 	return (dst);
251 }
252 
253 /* Get UTF-8 buffer length. */
254 size_t
255 utf8_strlen(const struct utf8_data *s)
256 {
257 	size_t	i;
258 
259 	for (i = 0; s[i].size != 0; i++)
260 		/* nothing */;
261 	return (i);
262 }
263 
264 /* Get UTF-8 string width. */
265 u_int
266 utf8_strwidth(const struct utf8_data *s, ssize_t n)
267 {
268 	ssize_t	i;
269 	u_int	width;
270 
271 	width = 0;
272 	for (i = 0; s[i].size != 0; i++) {
273 		if (n != -1 && n == i)
274 			break;
275 		width += s[i].width;
276 	}
277 	return (width);
278 }
279 
280 /*
281  * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
282  * Caller frees.
283  */
284 struct utf8_data *
285 utf8_fromcstr(const char *src)
286 {
287 	struct utf8_data	*dst;
288 	size_t			 n;
289 	enum utf8_state		 more;
290 
291 	dst = NULL;
292 
293 	n = 0;
294 	while (*src != '\0') {
295 		dst = xreallocarray(dst, n + 1, sizeof *dst);
296 		if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
297 			while (*++src != '\0' && more == UTF8_MORE)
298 				more = utf8_append(&dst[n], *src);
299 			if (more == UTF8_DONE) {
300 				n++;
301 				continue;
302 			}
303 			src -= dst[n].have;
304 		}
305 		utf8_set(&dst[n], *src);
306 		n++;
307 		src++;
308 	}
309 
310 	dst = xreallocarray(dst, n + 1, sizeof *dst);
311 	dst[n].size = 0;
312 	return (dst);
313 }
314 
315 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
316 char *
317 utf8_tocstr(struct utf8_data *src)
318 {
319 	char	*dst;
320 	size_t	 n;
321 
322 	dst = NULL;
323 
324 	n = 0;
325 	for(; src->size != 0; src++) {
326 		dst = xreallocarray(dst, n + src->size, 1);
327 		memcpy(dst + n, src->data, src->size);
328 		n += src->size;
329 	}
330 
331 	dst = xreallocarray(dst, n + 1, 1);
332 	dst[n] = '\0';
333 	return (dst);
334 }
335 
336 /* Get width of UTF-8 string. */
337 u_int
338 utf8_cstrwidth(const char *s)
339 {
340 	struct utf8_data	tmp;
341 	u_int			width;
342 	enum utf8_state		more;
343 
344 	width = 0;
345 	while (*s != '\0') {
346 		if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
347 			while (*++s != '\0' && more == UTF8_MORE)
348 				more = utf8_append(&tmp, *s);
349 			if (more == UTF8_DONE) {
350 				width += tmp.width;
351 				continue;
352 			}
353 			s -= tmp.have;
354 		}
355 		if (*s > 0x1f && *s != 0x7f)
356 			width++;
357 		s++;
358 	}
359 	return (width);
360 }
361 
362 /* Trim UTF-8 string to width. Caller frees. */
363 char *
364 utf8_trimcstr(const char *s, u_int width)
365 {
366 	struct utf8_data	*tmp, *next;
367 	char			*out;
368 	u_int			 at;
369 
370 	tmp = utf8_fromcstr(s);
371 
372 	at = 0;
373 	for (next = tmp; next->size != 0; next++) {
374 		if (at + next->width > width) {
375 			next->size = 0;
376 			break;
377 		}
378 		at += next->width;
379 	}
380 
381 	out = utf8_tocstr(tmp);
382 	free(tmp);
383 	return (out);
384 }
385 
386 /* Trim UTF-8 string to width. Caller frees. */
387 char *
388 utf8_rtrimcstr(const char *s, u_int width)
389 {
390 	struct utf8_data	*tmp, *next, *end;
391 	char			*out;
392 	u_int			 at;
393 
394 	tmp = utf8_fromcstr(s);
395 
396 	for (end = tmp; end->size != 0; end++)
397 		/* nothing */;
398 	if (end == tmp) {
399 		free(tmp);
400 		return (xstrdup(""));
401 	}
402 	next = end - 1;
403 
404 	at = 0;
405 	for (;;)
406 	{
407 		if (at + next->width > width) {
408 			next++;
409 			break;
410 		}
411 		at += next->width;
412 
413 		if (next == tmp)
414 			break;
415 		next--;
416 	}
417 
418 	out = utf8_tocstr(next);
419 	free(tmp);
420 	return (out);
421 }
422 
423 /* Pad UTF-8 string to width. Caller frees. */
424 char *
425 utf8_padcstr(const char *s, u_int width)
426 {
427 	size_t	 slen;
428 	char	*out;
429 	u_int	  n, i;
430 
431 	n = utf8_cstrwidth(s);
432 	if (n >= width)
433 		return (xstrdup(s));
434 
435 	slen = strlen(s);
436 	out = xmalloc(slen + 1 + (width - n));
437 	memcpy(out, s, slen);
438 	for (i = n; i < width; i++)
439 		out[slen++] = ' ';
440 	out[slen] = '\0';
441 	return (out);
442 }
443