xref: /netbsd-src/external/bsd/tmux/dist/utf8.c (revision e89934bbf778a6d6d6894877c4da59d0c7835b0f)
1 /* $OpenBSD$ */
2 
3 /*
4  * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 
19 #include <sys/types.h>
20 
21 #include <stdlib.h>
22 #include <string.h>
23 #include <wchar.h>
24 
25 #include "tmux.h"
26 
/* Column width of a wide character, or -1 if it has no usable width. */
static int	utf8_width(wchar_t wc);
28 
29 /* Set a single character. */
30 void
31 utf8_set(struct utf8_data *ud, u_char ch)
32 {
33 	u_int	i;
34 
35 	*ud->data = ch;
36 	ud->have = 1;
37 	ud->size = 1;
38 
39 	ud->width = 1;
40 
41 	for (i = ud->size; i < sizeof ud->data; i++)
42 		ud->data[i] = '\0';
43 }
44 
45 /* Copy UTF-8 character. */
46 void
47 utf8_copy(struct utf8_data *to, const struct utf8_data *from)
48 {
49 	u_int	i;
50 
51 	memcpy(to, from, sizeof *to);
52 
53 	for (i = to->size; i < sizeof to->data; i++)
54 		to->data[i] = '\0';
55 }
56 
57 /*
58  * Open UTF-8 sequence.
59  *
60  * 11000010-11011111 C2-DF start of 2-byte sequence
61  * 11100000-11101111 E0-EF start of 3-byte sequence
62  * 11110000-11110100 F0-F4 start of 4-byte sequence
63  */
64 enum utf8_state
65 utf8_open(struct utf8_data *ud, u_char ch)
66 {
67 	memset(ud, 0, sizeof *ud);
68 	if (ch >= 0xc2 && ch <= 0xdf)
69 		ud->size = 2;
70 	else if (ch >= 0xe0 && ch <= 0xef)
71 		ud->size = 3;
72 	else if (ch >= 0xf0 && ch <= 0xf4)
73 		ud->size = 4;
74 	else
75 		return (UTF8_ERROR);
76 	utf8_append(ud, ch);
77 	return (UTF8_MORE);
78 }
79 
80 /* Append character to UTF-8, closing if finished. */
81 enum utf8_state
82 utf8_append(struct utf8_data *ud, u_char ch)
83 {
84 	wchar_t	wc;
85 	int	width;
86 
87 	if (ud->have >= ud->size)
88 		fatalx("UTF-8 character overflow");
89 	if (ud->size > sizeof ud->data)
90 		fatalx("UTF-8 character size too large");
91 
92 	if (ud->have != 0 && (ch & 0xc0) != 0x80)
93 		ud->width = 0xff;
94 
95 	ud->data[ud->have++] = ch;
96 	if (ud->have != ud->size)
97 		return (UTF8_MORE);
98 
99 	if (ud->width == 0xff)
100 		return (UTF8_ERROR);
101 
102 	if (utf8_combine(ud, &wc) != UTF8_DONE)
103 		return (UTF8_ERROR);
104 	if ((width = utf8_width(wc)) < 0)
105 		return (UTF8_ERROR);
106 	ud->width = width;
107 
108 	return (UTF8_DONE);
109 }
110 
/*
 * Look up the column width of a wide character with wcwidth(). Returns the
 * width, or -1 when wcwidth() reports a negative width or one above 0xff.
 */
static int
utf8_width(wchar_t wc)
{
	int	columns = wcwidth(wc);

	if (columns >= 0 && columns <= 0xff)
		return (columns);
	return (-1);
}
122 
123 /* Combine UTF-8 into Unicode. */
124 enum utf8_state
125 utf8_combine(const struct utf8_data *ud, wchar_t *wc)
126 {
127 	switch (mbtowc(wc, (const char *)ud->data, ud->size)) {
128 	case -1:
129 		mbtowc(NULL, NULL, MB_CUR_MAX);
130 		return (UTF8_ERROR);
131 	case 0:
132 		return (UTF8_ERROR);
133 	default:
134 		return (UTF8_DONE);
135 	}
136 }
137 
138 /* Split Unicode into UTF-8. */
139 enum utf8_state
140 utf8_split(wchar_t wc, struct utf8_data *ud)
141 {
142 	char	s[MB_LEN_MAX];
143 	int	slen;
144 
145 	slen = wctomb(s, wc);
146 	if (slen <= 0 || slen > (int)sizeof ud->data)
147 		return (UTF8_ERROR);
148 
149 	memcpy(ud->data, s, slen);
150 	ud->size = slen;
151 
152 	ud->width = utf8_width(wc);
153 	return (UTF8_DONE);
154 }
155 
156 /*
157  * Encode len characters from src into dst, which is guaranteed to have four
158  * bytes available for each character from src (for \abc or UTF-8) plus space
159  * for \0.
160  */
161 int
162 utf8_strvis(char *dst, const char *src, size_t len, int flag)
163 {
164 	struct utf8_data	 ud;
165 	const char		*start, *end;
166 	enum utf8_state		 more;
167 	size_t			 i;
168 
169 	start = dst;
170 	end = src + len;
171 
172 	while (src < end) {
173 		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
174 			while (++src < end && more == UTF8_MORE)
175 				more = utf8_append(&ud, *src);
176 			if (more == UTF8_DONE) {
177 				/* UTF-8 character finished. */
178 				for (i = 0; i < ud.size; i++)
179 					*dst++ = ud.data[i];
180 				continue;
181 			}
182 			/* Not a complete, valid UTF-8 character. */
183 			src -= ud.have;
184 		}
185 		if (src < end - 1)
186 			dst = vis(dst, src[0], flag, src[1]);
187 		else if (src < end)
188 			dst = vis(dst, src[0], flag, '\0');
189 		src++;
190 	}
191 
192 	*dst = '\0';
193 	return (dst - start);
194 }
195 
196 /*
197  * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
198  * the returned string. Anything not valid printable ASCII or UTF-8 is
199  * stripped.
200  */
201 char *
202 utf8_sanitize(const char *src)
203 {
204 	char			*dst;
205 	size_t			 n;
206 	enum utf8_state		 more;
207 	struct utf8_data	 ud;
208 	u_int			 i;
209 
210 	dst = NULL;
211 
212 	n = 0;
213 	while (*src != '\0') {
214 		dst = xreallocarray(dst, n + 1, sizeof *dst);
215 		if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
216 			while (*++src != '\0' && more == UTF8_MORE)
217 				more = utf8_append(&ud, *src);
218 			if (more == UTF8_DONE) {
219 				dst = xreallocarray(dst, n + ud.width,
220 				    sizeof *dst);
221 				for (i = 0; i < ud.width; i++)
222 					dst[n++] = '_';
223 				continue;
224 			}
225 			src -= ud.have;
226 		}
227 		if (*src > 0x1f && *src < 0x7f)
228 			dst[n++] = *src;
229 		else
230 			dst[n++] = '_';
231 		src++;
232 	}
233 
234 	dst = xreallocarray(dst, n + 1, sizeof *dst);
235 	dst[n] = '\0';
236 	return (dst);
237 }
238 
239 /*
240  * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
241  * Caller frees.
242  */
243 struct utf8_data *
244 utf8_fromcstr(const char *src)
245 {
246 	struct utf8_data	*dst;
247 	size_t			 n;
248 	enum utf8_state		 more;
249 
250 	dst = NULL;
251 
252 	n = 0;
253 	while (*src != '\0') {
254 		dst = xreallocarray(dst, n + 1, sizeof *dst);
255 		if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
256 			while (*++src != '\0' && more == UTF8_MORE)
257 				more = utf8_append(&dst[n], *src);
258 			if (more == UTF8_DONE) {
259 				n++;
260 				continue;
261 			}
262 			src -= dst[n].have;
263 		}
264 		utf8_set(&dst[n], *src);
265 		n++;
266 		src++;
267 	}
268 
269 	dst = xreallocarray(dst, n + 1, sizeof *dst);
270 	dst[n].size = 0;
271 	return (dst);
272 }
273 
274 /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
275 char *
276 utf8_tocstr(struct utf8_data *src)
277 {
278 	char	*dst;
279 	size_t	 n;
280 
281 	dst = NULL;
282 
283 	n = 0;
284 	for(; src->size != 0; src++) {
285 		dst = xreallocarray(dst, n + src->size, 1);
286 		memcpy(dst + n, src->data, src->size);
287 		n += src->size;
288 	}
289 
290 	dst = xreallocarray(dst, n + 1, 1);
291 	dst[n] = '\0';
292 	return (dst);
293 }
294 
295 /* Get width of UTF-8 string. */
296 u_int
297 utf8_cstrwidth(const char *s)
298 {
299 	struct utf8_data	tmp;
300 	u_int			width;
301 	enum utf8_state		more;
302 
303 	width = 0;
304 	while (*s != '\0') {
305 		if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
306 			while (*++s != '\0' && more == UTF8_MORE)
307 				more = utf8_append(&tmp, *s);
308 			if (more == UTF8_DONE) {
309 				width += tmp.width;
310 				continue;
311 			}
312 			s -= tmp.have;
313 		}
314 		if (*s > 0x1f && *s != 0x7f)
315 			width++;
316 		s++;
317 	}
318 	return (width);
319 }
320 
321 /* Trim UTF-8 string to width. Caller frees. */
322 char *
323 utf8_trimcstr(const char *s, u_int width)
324 {
325 	struct utf8_data	*tmp, *next;
326 	char			*out;
327 	u_int			 at;
328 
329 	tmp = utf8_fromcstr(s);
330 
331 	at = 0;
332 	for (next = tmp; next->size != 0; next++) {
333 		if (at + next->width > width) {
334 			next->size = 0;
335 			break;
336 		}
337 		at += next->width;
338 	}
339 
340 	out = utf8_tocstr(tmp);
341 	free(tmp);
342 	return (out);
343 }
344 
345 /* Trim UTF-8 string to width. Caller frees. */
346 char *
347 utf8_rtrimcstr(const char *s, u_int width)
348 {
349 	struct utf8_data	*tmp, *next, *end;
350 	char			*out;
351 	u_int			 at;
352 
353 	tmp = utf8_fromcstr(s);
354 
355 	for (end = tmp; end->size != 0; end++)
356 		/* nothing */;
357 	if (end == tmp) {
358 		free(tmp);
359 		return (xstrdup(""));
360 	}
361 	next = end - 1;
362 
363 	at = 0;
364 	for (;;)
365 	{
366 		if (at + next->width > width) {
367 			next++;
368 			break;
369 		}
370 		at += next->width;
371 
372 		if (next == tmp)
373 			break;
374 		next--;
375 	}
376 
377 	out = utf8_tocstr(next);
378 	free(tmp);
379 	return (out);
380 }
381 
382 /* Pad UTF-8 string to width. Caller frees. */
383 char *
384 utf8_padcstr(const char *s, u_int width)
385 {
386 	size_t	 slen;
387 	char	*out;
388 	u_int	  n, i;
389 
390 	n = utf8_cstrwidth(s);
391 	if (n >= width)
392 		return (xstrdup(s));
393 
394 	slen = strlen(s);
395 	out = xmalloc(slen + 1 + (width - n));
396 	memcpy(out, s, slen);
397 	for (i = n; i < width; i++)
398 		out[slen++] = ' ';
399 	out[slen] = '\0';
400 	return (out);
401 }
402