xref: /netbsd-src/external/bsd/tmux/dist/utf8.c (revision 8ecbf5f02b752fcb7debe1a8fab1dc82602bc760)
1 /* $OpenBSD$ */
2 
3 /*
4  * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 
21 #include <ctype.h>
22 #include <errno.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <wchar.h>
26 
27 #include "tmux.h"
28 
29 static int	utf8_width(wchar_t);
30 
31 /* Set a single character. */
32 void
33 utf8_set(struct utf8_data *ud, u_char ch)
34 {
35 	static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
36 
37 	memcpy(ud, &empty, sizeof *ud);
38 	*ud->data = ch;
39 }
40 
41 /* Copy UTF-8 character. */
42 void
43 utf8_copy(struct utf8_data *to, const struct utf8_data *from)
44 {
45 	u_int	i;
46 
47 	memcpy(to, from, sizeof *to);
48 
49 	for (i = to->size; i < sizeof to->data; i++)
50 		to->data[i] = '\0';
51 }
52 
53 /*
54  * Open UTF-8 sequence.
55  *
56  * 11000010-11011111 C2-DF start of 2-byte sequence
57  * 11100000-11101111 E0-EF start of 3-byte sequence
58  * 11110000-11110100 F0-F4 start of 4-byte sequence
59  */
60 enum utf8_state
61 utf8_open(struct utf8_data *ud, u_char ch)
62 {
63 	memset(ud, 0, sizeof *ud);
64 	if (ch >= 0xc2 && ch <= 0xdf)
65 		ud->size = 2;
66 	else if (ch >= 0xe0 && ch <= 0xef)
67 		ud->size = 3;
68 	else if (ch >= 0xf0 && ch <= 0xf4)
69 		ud->size = 4;
70 	else
71 		return (UTF8_ERROR);
72 	utf8_append(ud, ch);
73 	return (UTF8_MORE);
74 }
75 
76 /* Append character to UTF-8, closing if finished. */
77 enum utf8_state
78 utf8_append(struct utf8_data *ud, u_char ch)
79 {
80 	wchar_t	wc;
81 	int	width;
82 
83 	if (ud->have >= ud->size)
84 		fatalx("UTF-8 character overflow");
85 	if (ud->size > sizeof ud->data)
86 		fatalx("UTF-8 character size too large");
87 
88 	if (ud->have != 0 && (ch & 0xc0) != 0x80)
89 		ud->width = 0xff;
90 
91 	ud->data[ud->have++] = ch;
92 	if (ud->have != ud->size)
93 		return (UTF8_MORE);
94 
95 	if (ud->width == 0xff)
96 		return (UTF8_ERROR);
97 
98 	if (utf8_combine(ud, &wc) != UTF8_DONE)
99 		return (UTF8_ERROR);
100 	if ((width = utf8_width(wc)) < 0)
101 		return (UTF8_ERROR);
102 	ud->width = width;
103 
104 	return (UTF8_DONE);
105 }
106 
/*
 * Look up the display width of a wide character, using utf8proc when built
 * with HAVE_UTF8PROC and wcwidth() otherwise. Widths in the range 0-0xff are
 * returned unchanged. Anything else is logged and yields -1, except that on
 * platforms other than OpenBSD a negative width is reported as 1.
 */
static int
utf8_width(wchar_t wc)
{
	int	width;

#ifdef HAVE_UTF8PROC
	width = utf8proc_wcwidth(wc);
#else
	width = wcwidth(wc);
#endif
	if (width >= 0 && width <= 0xff)
		return (width);

	log_debug("Unicode %04lx, wcwidth() %d", (long)wc, width);

#ifndef __OpenBSD__
	/*
	 * Plenty of platforms (OS X in particular) return -1 from wcwidth()
	 * for fairly common characters, so treat an unknown width as 1. That
	 * is wrong for truly nonprintable characters, but those should be
	 * rare, and letting them through is no worse than what the terminal
	 * would receive without tmux in between.
	 */
	if (width < 0)
		return (1);
#endif
	return (-1);
}
138 
139 /* Combine UTF-8 into Unicode. */
140 enum utf8_state
141 utf8_combine(const struct utf8_data *ud, wchar_t *wc)
142 {
143 #ifdef HAVE_UTF8PROC
144 	switch (utf8proc_mbtowc(wc, ud->data, ud->size)) {
145 #else
146 	switch (mbtowc(wc, (const char *)ud->data, ud->size)) {
147 #endif
148 	case -1:
149 		log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
150 		    errno);
151 		mbtowc(NULL, NULL, MB_CUR_MAX);
152 		return (UTF8_ERROR);
153 	case 0:
154 		return (UTF8_ERROR);
155 	default:
156 		return (UTF8_DONE);
157 	}
158 }
159 
160 /* Split Unicode into UTF-8. */
161 enum utf8_state
162 utf8_split(wchar_t wc, struct utf8_data *ud)
163 {
164 	char	s[MB_LEN_MAX];
165 	int	slen;
166 
167 #ifdef HAVE_UTF8PROC
168 	slen = utf8proc_wctomb(s, wc);
169 #else
170 	slen = wctomb(s, wc);
171 #endif
172 	if (slen <= 0 || slen > (int)sizeof ud->data)
173 		return (UTF8_ERROR);
174 
175 	memcpy(ud->data, s, slen);
176 	ud->size = slen;
177 
178 	ud->width = utf8_width(wc);
179 	return (UTF8_DONE);
180 }
181 
182 /*
183  * Encode len characters from src into dst, which is guaranteed to have four
184  * bytes available for each character from src (for \abc or UTF-8) plus space
185  * for \0.
186  */
187 int
188 utf8_strvis(char *dst, const char *src, size_t len, int flag)
189 {
190 	struct utf8_data	 ud;
191 	const char		*start, *end;
192 	enum utf8_state		 more;
193 	size_t			 i;
194 
195 	start = dst;
196 	end = src + len;
197 
198 	while (src < end) {
199 		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
200 			while (++src < end && more == UTF8_MORE)
201 				more = utf8_append(&ud, *src);
202 			if (more == UTF8_DONE) {
203 				/* UTF-8 character finished. */
204 				for (i = 0; i < ud.size; i++)
205 					*dst++ = ud.data[i];
206 				continue;
207 			}
208 			/* Not a complete, valid UTF-8 character. */
209 			src -= ud.have;
210 		}
211 		if (src[0] == '$' && src < end - 1) {
212 			if (isalpha((u_char)src[1]) ||
213 			    src[1] == '_' ||
214 			    src[1] == '{')
215 				*dst++ = '\\';
216 			*dst++ = '$';
217 		} else if (src < end - 1)
218 			dst = vis(dst, src[0], flag, src[1]);
219 		else if (src < end)
220 			dst = vis(dst, src[0], flag, '\0');
221 		src++;
222 	}
223 
224 	*dst = '\0';
225 	return (dst - start);
226 }
227 
/*
 * Allocating variant of utf8_strvis(): encode the whole NUL-terminated
 * string src into a new buffer stored in *dst and return the encoded length.
 * The buffer is first sized at four bytes per source byte (terminator
 * included) and then resized to the encoded length plus one.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen;
	char	*buf;
	int	 len;

	srclen = strlen(src);

	buf = xreallocarray(NULL, 4, srclen + 1);
	len = utf8_strvis(buf, src, srclen, flag);

	*dst = xrealloc(buf, len + 1);
	return (len);
}
241 
242 /* Does this string contain anything that isn't valid UTF-8? */
243 int
244 utf8_isvalid(const char *s)
245 {
246 	struct utf8_data	 ud;
247 	const char		*end;
248 	enum utf8_state		 more;
249 
250 	end = s + strlen(s);
251 	while (s < end) {
252 		if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
253 			while (++s < end && more == UTF8_MORE)
254 				more = utf8_append(&ud, *s);
255 			if (more == UTF8_DONE)
256 				continue;
257 			return (0);
258 		}
259 		if (*s < 0x20 || *s > 0x7e)
260 			return (0);
261 		s++;
262 	}
263 	return (1);
264 }
265 
266 /*
267  * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
268  * the returned string. Anything not valid printable ASCII or UTF-8 is
269  * stripped.
270  */
271 char *
272 utf8_sanitize(const char *src)
273 {
274 	char			*dst;
275 	size_t			 n;
276 	enum utf8_state		 more;
277 	struct utf8_data	 ud;
278 	u_int			 i;
279 
280 	dst = NULL;
281 
282 	n = 0;
283 	while (*src != '\0') {
284 		dst = xreallocarray(dst, n + 1, sizeof *dst);
285 		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
286 			while (*++src != '\0' && more == UTF8_MORE)
287 				more = utf8_append(&ud, *src);
288 			if (more == UTF8_DONE) {
289 				dst = xreallocarray(dst, n + ud.width,
290 				    sizeof *dst);
291 				for (i = 0; i < ud.width; i++)
292 					dst[n++] = '_';
293 				continue;
294 			}
295 			src -= ud.have;
296 		}
297 		if (*src > 0x1f && *src < 0x7f)
298 			dst[n++] = *src;
299 		else
300 			dst[n++] = '_';
301 		src++;
302 	}
303 
304 	dst = xreallocarray(dst, n + 1, sizeof *dst);
305 	dst[n] = '\0';
306 	return (dst);
307 }
308 
309 /* Get UTF-8 buffer length. */
310 size_t
311 utf8_strlen(const struct utf8_data *s)
312 {
313 	size_t	i;
314 
315 	for (i = 0; s[i].size != 0; i++)
316 		/* nothing */;
317 	return (i);
318 }
319 
320 /* Get UTF-8 string width. */
321 u_int
322 utf8_strwidth(const struct utf8_data *s, ssize_t n)
323 {
324 	ssize_t	i;
325 	u_int	width;
326 
327 	width = 0;
328 	for (i = 0; s[i].size != 0; i++) {
329 		if (n != -1 && n == i)
330 			break;
331 		width += s[i].width;
332 	}
333 	return (width);
334 }
335 
336 /*
337  * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
338  * Caller frees.
339  */
340 struct utf8_data *
341 utf8_fromcstr(const char *src)
342 {
343 	struct utf8_data	*dst;
344 	size_t			 n;
345 	enum utf8_state		 more;
346 
347 	dst = NULL;
348 
349 	n = 0;
350 	while (*src != '\0') {
351 		dst = xreallocarray(dst, n + 1, sizeof *dst);
352 		if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
353 			while (*++src != '\0' && more == UTF8_MORE)
354 				more = utf8_append(&dst[n], *src);
355 			if (more == UTF8_DONE) {
356 				n++;
357 				continue;
358 			}
359 			src -= dst[n].have;
360 		}
361 		utf8_set(&dst[n], *src);
362 		n++;
363 		src++;
364 	}
365 
366 	dst = xreallocarray(dst, n + 1, sizeof *dst);
367 	dst[n].size = 0;
368 	return (dst);
369 }
370 
371 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
372 char *
373 utf8_tocstr(struct utf8_data *src)
374 {
375 	char	*dst;
376 	size_t	 n;
377 
378 	dst = NULL;
379 
380 	n = 0;
381 	for(; src->size != 0; src++) {
382 		dst = xreallocarray(dst, n + src->size, 1);
383 		memcpy(dst + n, src->data, src->size);
384 		n += src->size;
385 	}
386 
387 	dst = xreallocarray(dst, n + 1, 1);
388 	dst[n] = '\0';
389 	return (dst);
390 }
391 
392 /* Get width of UTF-8 string. */
393 u_int
394 utf8_cstrwidth(const char *s)
395 {
396 	struct utf8_data	tmp;
397 	u_int			width;
398 	enum utf8_state		more;
399 
400 	width = 0;
401 	while (*s != '\0') {
402 		if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
403 			while (*++s != '\0' && more == UTF8_MORE)
404 				more = utf8_append(&tmp, *s);
405 			if (more == UTF8_DONE) {
406 				width += tmp.width;
407 				continue;
408 			}
409 			s -= tmp.have;
410 		}
411 		if (*s > 0x1f && *s != 0x7f)
412 			width++;
413 		s++;
414 	}
415 	return (width);
416 }
417 
418 /* Pad UTF-8 string to width. Caller frees. */
419 char *
420 utf8_padcstr(const char *s, u_int width)
421 {
422 	size_t	 slen;
423 	char	*out;
424 	u_int	  n, i;
425 
426 	n = utf8_cstrwidth(s);
427 	if (n >= width)
428 		return (xstrdup(s));
429 
430 	slen = strlen(s);
431 	out = xmalloc(slen + 1 + (width - n));
432 	memcpy(out, s, slen);
433 	for (i = n; i < width; i++)
434 		out[slen++] = ' ';
435 	out[slen] = '\0';
436 	return (out);
437 }
438 
439 int
440 utf8_cstrhas(const char *s, const struct utf8_data *ud)
441 {
442 	struct utf8_data	*copy, *loop;
443 	int			 found = 0;
444 
445 	copy = utf8_fromcstr(s);
446 	for (loop = copy; loop->size != 0; loop++) {
447 		if (loop->size != ud->size)
448 			continue;
449 		if (memcmp(loop->data, ud->data, loop->size) == 0) {
450 			found = 1;
451 			break;
452 		}
453 	}
454 	free(copy);
455 
456 	return (found);
457 }
458