xref: /netbsd-src/lib/libc/gen/vis.c (revision 6de51c519f1b899da63c1bf576f478920b89083f)
1 /*	$NetBSD: vis.c,v 1.53 2013/02/15 00:28:10 christos Exp $	*/
2 
3 /*-
4  * Copyright (c) 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 /*-
33  * Copyright (c) 1999, 2005 The NetBSD Foundation, Inc.
34  * All rights reserved.
35  *
36  * Redistribution and use in source and binary forms, with or without
37  * modification, are permitted provided that the following conditions
38  * are met:
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  *
45  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
46  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
47  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
48  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
49  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
50  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
51  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
52  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
53  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
54  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
55  * POSSIBILITY OF SUCH DAMAGE.
56  */
57 
58 #include <sys/cdefs.h>
59 #if defined(LIBC_SCCS) && !defined(lint)
60 __RCSID("$NetBSD: vis.c,v 1.53 2013/02/15 00:28:10 christos Exp $");
61 #endif /* LIBC_SCCS and not lint */
62 #ifdef __FBSDID
63 __FBSDID("$FreeBSD$");
64 #define	_DIAGASSERT(x)	assert(x)
65 #endif
66 
67 #include "namespace.h"
68 #include <sys/types.h>
69 
70 #include <assert.h>
71 #include <vis.h>
72 #include <errno.h>
73 #include <stdlib.h>
74 #include <wchar.h>
75 #include <wctype.h>
76 
77 #ifdef __weak_alias
78 __weak_alias(strvisx,_strvisx)
79 #endif
80 
81 #if !HAVE_VIS || !HAVE_SVIS
82 #include <ctype.h>
83 #include <limits.h>
84 #include <stdio.h>
85 #include <string.h>
86 
/*
 * The reason for going through the trouble to deal with character encodings
 * in vis(3) is that we use it to safely encode the output of commands. This
 * safe encoding varies depending on the character set. For example, if we
 * display ps output in French, we don't want to display French characters
 * as M-foo.
 */
94 
/* do_hvis() and do_mvis() call do_svis() before it is defined. */
static wchar_t *do_svis(wchar_t *, wint_t, int, wint_t, const wchar_t *);

#undef BELL
#define BELL L'\a'

/*
 * Wide-character classification helpers.  Every argument is parenthesized
 * so that any expression may be passed safely.
 *
 * iswoctal() compares the full wide value; truncating it to a byte first
 * would misclassify characters such as 0x130 as the digit '0'.
 */
#define iswoctal(c)	((c) >= L'0' && (c) <= L'7')
#define iswwhite(c)	((c) == L' ' || (c) == L'\t' || (c) == L'\n')
#define iswsafe(c)	((c) == L'\b' || (c) == BELL || (c) == L'\r')

/* Map a value in the range 0..15 to its lower/upper case hex digit. */
#define xtoa(c)		(L"0123456789abcdef"[(c)])
#define XTOA(c)		(L"0123456789ABCDEF"[(c)])

/* Slots MAKEEXTRALIST reserves beyond the caller's own extra list. */
#define MAXEXTRAS	9
107 
/*
 * MAKEEXTRALIST(flag, extra, orig_str)
 *	Store in `extra' a calloc()ed, NUL-terminated wide string made of
 *	`orig_str' followed by the characters that `flag' adds to the set:
 *	the glob characters for VIS_GLOB, space for VIS_SP, tab for VIS_TAB,
 *	newline for VIS_NL and, unless VIS_NOSLASH is set, the backslash.
 *	`extra' is NULL afterwards if the allocation failed.
 */
#define MAKEEXTRALIST(flag, extra, orig_str)				      \
do {									      \
	const wchar_t *mx_base = orig_str;				      \
	const wchar_t *mx_rd;						      \
	wchar_t *mx_wr;							      \
	size_t mx_n = 0;						      \
	/* count the characters of orig_str, terminator included */	      \
	while (mx_base[mx_n++] != L'\0')				      \
		continue;						      \
	extra = calloc(mx_n + MAXEXTRAS, sizeof(*extra));		      \
	if (extra == NULL)						      \
		break;							      \
	mx_wr = extra;							      \
	for (mx_rd = mx_base; *mx_rd != L'\0'; mx_rd++)			      \
		*mx_wr++ = *mx_rd;					      \
	if ((flag) & VIS_GLOB) {					      \
		for (mx_rd = L"*?[#"; *mx_rd != L'\0'; mx_rd++)		      \
			*mx_wr++ = *mx_rd;				      \
	}								      \
	if ((flag) & VIS_SP)						      \
		*mx_wr++ = L' ';					      \
	if ((flag) & VIS_TAB)						      \
		*mx_wr++ = L'\t';					      \
	if ((flag) & VIS_NL)						      \
		*mx_wr++ = L'\n';					      \
	if (!((flag) & VIS_NOSLASH))					      \
		*mx_wr++ = L'\\';					      \
	*mx_wr = L'\0';							      \
} while (/*CONSTCOND*/0)
132 
/*
 * do_hvis: encoder for HTTP style (RFC 1808).
 *
 * Alphanumerics and the characters $ - _ . + ! * ' ( ) , are handed to
 * do_svis(); every other character is written as '%' followed by two
 * lower case hex digits built from the low eight bits of c.
 * Returns the position in dst just past what was written.
 */
static wchar_t *
do_hvis(wchar_t *dst, wint_t c, int flag, wint_t nextc, const wchar_t *extra)
{
	int passthru;

	switch (c) {
	/* safe */
	case L'$': case L'-': case L'_': case L'.': case L'+':
	/* extra */
	case L'!': case L'*': case L'\'': case L'(': case L')':
	case L',':
		passthru = 1;
		break;
	default:
		passthru = iswalnum(c) != 0;
		break;
	}

	if (passthru)
		return do_svis(dst, c, flag, nextc, extra);

	dst[0] = L'%';
	dst[1] = xtoa(((unsigned int)c >> 4) & 0xf);
	dst[2] = xtoa((unsigned int)c & 0xf);
	return dst + 3;
}
154 
/*
 * do_mvis: encoder for Quoted-Printable MIME (RFC 2045).
 * NB: No handling of long lines or CRLF.
 *
 * A character other than newline is written as '=' plus two upper case
 * hex digits when it is white space directly before CR or LF, when it is
 * a non-space character below 33, equal to 61 ('=') or above 126, or when
 * it appears in the fixed list #$@[\]^`{|}~.  Anything else goes through
 * do_svis().  Returns the position in dst just past what was written.
 */
static wchar_t *
do_mvis(wchar_t *dst, wint_t c, int flag, wint_t nextc, const wchar_t *extra)
{
	int quote = 0;

	if (c != L'\n') {
		if (iswspace(c)) {
			/* Space at the end of the line */
			quote = (nextc == L'\r' || nextc == L'\n');
		} else {
			/* Out of range */
			quote = (c < 33 || c == 61 || c > 126);
		}
		/* Specific char to be escaped */
		if (!quote)
			quote = (wcschr(L"#$@[\\]^`{|}~", c) != NULL);
	}

	if (!quote)
		return do_svis(dst, c, flag, nextc, extra);

	dst[0] = L'=';
	dst[1] = XTOA(((unsigned int)c >> 4) & 0xf);
	dst[2] = XTOA((unsigned int)c & 0xf);
	return dst + 3;
}
176 
177 /*
178  * This is do_vis, the central code of vis.
179  * dst:	      Pointer to the destination buffer
180  * c:	      Character to encode
181  * flag:      Flag word
182  * nextc:     The character following 'c'
183  * extra:     Pointer to the list of extra characters to be
184  *	      backslash-protected.
185  */
186 static wchar_t *
187 do_svis(wchar_t *dst, wint_t c, int flag, wint_t nextc, const wchar_t *extra)
188 {
189 	int iswextra;
190 
191 	iswextra = wcschr(extra, c) != NULL;
192 	if (!iswextra && (iswgraph(c) || iswwhite(c) ||
193 	    ((flag & VIS_SAFE) && iswsafe(c)))) {
194 		*dst++ = c;
195 		return dst;
196 	}
197 	if (flag & VIS_CSTYLE) {
198 		switch (c) {
199 		case L'\n':
200 			*dst++ = L'\\'; *dst++ = L'n';
201 			return dst;
202 		case L'\r':
203 			*dst++ = L'\\'; *dst++ = L'r';
204 			return dst;
205 		case L'\b':
206 			*dst++ = L'\\'; *dst++ = L'b';
207 			return dst;
208 		case BELL:
209 			*dst++ = L'\\'; *dst++ = L'a';
210 			return dst;
211 		case L'\v':
212 			*dst++ = L'\\'; *dst++ = L'v';
213 			return dst;
214 		case L'\t':
215 			*dst++ = L'\\'; *dst++ = L't';
216 			return dst;
217 		case L'\f':
218 			*dst++ = L'\\'; *dst++ = L'f';
219 			return dst;
220 		case L' ':
221 			*dst++ = L'\\'; *dst++ = L's';
222 			return dst;
223 		case L'\0':
224 			*dst++ = L'\\'; *dst++ = L'0';
225 			if (iswoctal(nextc)) {
226 				*dst++ = L'0';
227 				*dst++ = L'0';
228 			}
229 			return dst;
230 		default:
231 			if (iswgraph(c)) {
232 				*dst++ = L'\\';
233 				*dst++ = c;
234 				return dst;
235 			}
236 		}
237 	}
238 	if (iswextra || ((c & 0177) == L' ') || (flag & VIS_OCTAL)) {
239 		*dst++ = L'\\';
240 		*dst++ = (u_char)(((u_int32_t)(u_char)c >> 6) & 03) + L'0';
241 		*dst++ = (u_char)(((u_int32_t)(u_char)c >> 3) & 07) + L'0';
242 		*dst++ =			     (c	      & 07) + L'0';
243 	} else {
244 		if ((flag & VIS_NOSLASH) == 0)
245 			*dst++ = L'\\';
246 
247 		if (c & 0200) {
248 			c &= 0177;
249 			*dst++ = L'M';
250 		}
251 
252 		if (iswcntrl(c)) {
253 			*dst++ = L'^';
254 			if (c == 0177)
255 				*dst++ = L'?';
256 			else
257 				*dst++ = c + L'@';
258 		} else {
259 			*dst++ = L'-';
260 			*dst++ = c;
261 		}
262 	}
263 	return dst;
264 }
265 
266 typedef wchar_t *(*visfun_t)(wchar_t *, wint_t, int, wint_t, const wchar_t *);
267 
268 /*
269  * Return the appropriate encoding function depending on the flags given.
270  */
271 static visfun_t
272 getvisfun(int flag)
273 {
274 	if (flag & VIS_HTTPSTYLE)
275 		return do_hvis;
276 	if (flag & VIS_MIMESTYLE)
277 		return do_mvis;
278 	return do_svis;
279 }
280 
281 /*
282  * istrsnvisx()
283  * 	The main internal function.
284  *	All user-visible functions call this one.
285  */
286 static int
287 istrsnvisx(char *mbdst, size_t *dlen, const char *mbsrc, size_t mblength,
288     int flag, const char *mbextra)
289 {
290 	wchar_t *dst, *src, *pdst, *psrc, *start, *extra, *nextra;
291 	size_t len, olen;
292 	wint_t c;
293 	visfun_t f;
294 	int clen, error = -1;
295 	ssize_t mbslength;
296 
297 	_DIAGASSERT(mbdst != NULL);
298 	_DIAGASSERT(mbsrc != NULL);
299 	_DIAGASSERT(mbextra != NULL);
300 
301 	/*
302 	 * Input (mbsrc) is a char string considered to be multibyte
303 	 * characters.  The input loop will read this string pulling
304 	 * one character, possibly multiple bytes, from mbsrc and
305 	 * converting each to wchar_t in src.
306 	 *
307 	 * The vis conversion will be done using the wide char
308 	 * wchar_t string.
309 	 *
310 	 * This will then be converted back to a multibyte string to
311 	 * return to the caller.
312 	 */
313 
314 	/* Allocate space for the wide char strings */
315 	psrc = pdst = extra = nextra = NULL;
316 	if (!mblength)
317 		mblength = strlen(mbsrc);
318 
319 	if ((psrc = calloc(mblength + 1, sizeof(*psrc))) == NULL)
320 		return -1;
321 	if ((pdst = calloc((4 * mblength) + 1, sizeof(*pdst))) == NULL)
322 		goto out;
323 	if ((extra = calloc((strlen(mbextra) + 1), sizeof(*extra))) == NULL)
324 		goto out;
325 
326 	dst = pdst;
327 	src = psrc;
328 
329 	/*
330 	 * Input loop.
331 	 * Handle up to mblength characters (not bytes).  We do not
332 	 * stop at NULs because we may be processing a block of data
333 	 * that includes NULs.  We process one more than the character
334 	 * count so that we also get the next character of input which
335 	 * is needed under some circumstances as a look-ahead character.
336 	 */
337 	mbslength = (ssize_t)mblength;
338 	/*
339 	 * When inputing a single character, must also read in the
340 	 * next character for nextc, the look-ahead character.
341 	 */
342 	if (mbslength == 1)
343 		mbslength++;
344 	while (mbslength > 0) {
345 		/* Convert one multibyte character to wchar_t. */
346 		clen = mbtowc(src, mbsrc, MB_LEN_MAX);
347 		if (clen < 0) {
348 			/* Conversion error, process as a byte instead. */
349 			*src = (wint_t)*mbsrc;
350 			clen = 1;
351 		}
352 		if (clen == 0)
353 			/*
354 			 * NUL in input gives 0 return value. process
355 			 * as single NUL byte.
356 			 */
357 			clen = 1;
358 		/* Advance output pointer if we still have input left. */
359 		src++;
360 		/* Advance input pointer by number of bytes read. */
361 		mbsrc += clen;
362 		/* Decrement input count */
363 		mbslength -= clen;
364 	}
365 	len = src - psrc;
366 	src = psrc;
367 	/*
368 	 * In the single character input case, we will have actually
369 	 * processed two characters, c and nextc.  Reset len back to
370 	 * just a single character.
371 	 */
372 	if (mblength < len)
373 		len = mblength;
374 
375 	/* Convert extra argument to list of characters for this mode. */
376 	mbstowcs(extra, mbextra, strlen(mbextra));
377 	MAKEEXTRALIST(flag, nextra, extra);
378 	if (!nextra) {
379 		if (dlen && *dlen == 0) {
380 			errno = ENOSPC;
381 			goto out;
382 		}
383 		*mbdst = '\0';		/* can't create nextra, return "" */
384 		error = 0;
385 		goto out;
386 	}
387 
388 	/* Look up which processing function to call. */
389 	f = getvisfun(flag);
390 
391 	/*
392 	 * Main processing loop.
393 	 * Call do_Xvis processing function one character at a time
394 	 * with next character available for look-ahead.
395 	 */
396 	for (start = dst; len > 0; len--) {
397 		c = *src++;
398 		dst = (*f)(dst, c, flag, len >= 1 ? *src : L'\0', nextra);
399 		if (dst == NULL) {
400 			errno = ENOSPC;
401 			goto out;
402 		}
403 	}
404 
405 	/* Terminate the output string. */
406 	*dst = L'\0';
407 
408 	/* Convert wchar_t string back to multibyte output string. */
409 	len = dlen ? *dlen : ((wcslen(start) + 1) * MB_LEN_MAX);
410 	olen = wcstombs(mbdst, start, len * sizeof(*mbdst));
411 
412 	free(nextra);
413 	free(extra);
414 	free(pdst);
415 	free(psrc);
416 
417 	return (int)olen;
418 out:
419 	free(nextra);
420 	free(extra);
421 	free(pdst);
422 	free(psrc);
423 	return error;
424 }
425 #endif
426 
427 #if !HAVE_SVIS
/*
 *	The "svis" variants all take an "extra" arg that is a pointer
 *	to a NUL-terminated list of characters to be encoded, too.
 *	These functions are useful, e.g., to encode strings in such a
 *	way that they are not interpreted by a shell.
 */
434 
/*
 * svis - encode the single character c (with look-ahead nextc) into mbdst,
 * also encoding any character listed in mbextra.
 * Returns a pointer just past the encoded text, or NULL on failure.
 */
char *
svis(char *mbdst, int c, int flag, int nextc, const char *mbextra)
{
	char pair[2];
	int n;

	/* c first, its look-ahead character right behind it */
	pair[1] = nextc;
	pair[0] = c;

	n = istrsnvisx(mbdst, NULL, pair, 1, flag, mbextra);
	return (n < 0) ? NULL : mbdst + n;
}
449 
/*
 * snvis - like svis, but mbdst is dlen bytes long.
 * Returns a pointer just past the encoded text, or NULL on failure.
 */
char *
snvis(char *mbdst, size_t dlen, int c, int flag, int nextc, const char *mbextra)
{
	char pair[2];
	int n;

	/* c first, its look-ahead character right behind it */
	pair[1] = nextc;
	pair[0] = c;

	n = istrsnvisx(mbdst, &dlen, pair, 1, flag, mbextra);
	return (n < 0) ? NULL : mbdst + n;
}
464 
/*
 * strsvis - encode the NUL-terminated string mbsrc into mbdst, also
 * encoding the characters in mbextra.  Returns what istrsnvisx() returns.
 */
int
strsvis(char *mbdst, const char *mbsrc, int flag, const char *mbextra)
{
	int encoded;

	/* no destination size; length 0 makes istrsnvisx() use strlen() */
	encoded = istrsnvisx(mbdst, NULL, mbsrc, 0, flag, mbextra);
	return encoded;
}
470 
/*
 * strsnvis - like strsvis, but mbdst is dlen bytes long.
 * Returns what istrsnvisx() returns.
 */
int
strsnvis(char *mbdst, size_t dlen, const char *mbsrc, int flag, const char *mbextra)
{
	int encoded;

	/* length 0 makes istrsnvisx() use strlen(mbsrc) */
	encoded = istrsnvisx(mbdst, &dlen, mbsrc, 0, flag, mbextra);
	return encoded;
}
476 
/*
 * strsvisx - encode len bytes of mbsrc into mbdst, also encoding the
 * characters in mbextra.  Returns what istrsnvisx() returns.
 */
int
strsvisx(char *mbdst, const char *mbsrc, size_t len, int flag, const char *mbextra)
{
	int encoded;

	/* no destination size is passed along */
	encoded = istrsnvisx(mbdst, NULL, mbsrc, len, flag, mbextra);
	return encoded;
}
482 
/*
 * strsnvisx - like strsvisx, but mbdst is dlen bytes long.
 * Returns what istrsnvisx() returns.
 */
int
strsnvisx(char *mbdst, size_t dlen, const char *mbsrc, size_t len, int flag,
    const char *mbextra)
{
	int encoded;

	encoded = istrsnvisx(mbdst, &dlen, mbsrc, len, flag, mbextra);
	return encoded;
}
489 #endif
490 
491 #if !HAVE_VIS
/*
 * vis - visually encode the single character c (with look-ahead nextc)
 * into mbdst, using an empty list of extra characters.
 * Returns a pointer just past the encoded text, or NULL on failure.
 */
char *
vis(char *mbdst, int c, int flag, int nextc)
{
	char pair[2];
	int n;

	/* c first, its look-ahead character right behind it */
	pair[1] = nextc;
	pair[0] = c;

	n = istrsnvisx(mbdst, NULL, pair, 1, flag, "");
	return (n < 0) ? NULL : mbdst + n;
}
509 
/*
 * nvis - like vis, but mbdst is dlen bytes long.
 * Returns a pointer just past the encoded text, or NULL on failure.
 */
char *
nvis(char *mbdst, size_t dlen, int c, int flag, int nextc)
{
	char pair[2];
	int n;

	/* c first, its look-ahead character right behind it */
	pair[1] = nextc;
	pair[0] = c;

	n = istrsnvisx(mbdst, &dlen, pair, 1, flag, "");
	return (n < 0) ? NULL : mbdst + n;
}
524 
/*
 * strvis - visually encode characters from src into dst
 *
 *	Dst must be 4 times the size of src to account for possible
 *	expansion.  The length of dst, not including the trailing NUL,
 *	is returned.
 */

int
strvis(char *mbdst, const char *mbsrc, int flag)
{
	int encoded;

	/* empty extra list; length 0 makes istrsnvisx() use strlen(mbsrc) */
	encoded = istrsnvisx(mbdst, NULL, mbsrc, 0, flag, "");
	return encoded;
}
538 
/*
 * strnvis - like strvis, but mbdst is dlen bytes long.
 * Returns what istrsnvisx() returns.
 */
int
strnvis(char *mbdst, size_t dlen, const char *mbsrc, int flag)
{
	int encoded;

	/* empty extra list; length 0 makes istrsnvisx() use strlen(mbsrc) */
	encoded = istrsnvisx(mbdst, &dlen, mbsrc, 0, flag, "");
	return encoded;
}
544 
/*
 * strvisx - visually encode characters from src into dst
 *
 *	Dst must be 4 times the size of src to account for possible
 *	expansion.  The length of dst, not including the trailing NUL,
 *	is returned.
 *
 *	Strvisx passes len to istrsnvisx() as the amount of input to
 *	encode, which is useful for encoding a block of data.  A len of 0
 *	makes istrsnvisx() fall back to strlen(src).
 */

int
strvisx(char *mbdst, const char *mbsrc, size_t len, int flag)
{
	int encoded;

	/* empty extra list, no destination size */
	encoded = istrsnvisx(mbdst, NULL, mbsrc, len, flag, "");
	return encoded;
}
561 
/*
 * strnvisx - like strvisx, but mbdst is dlen bytes long.
 * Returns what istrsnvisx() returns.
 */
int
strnvisx(char *mbdst, size_t dlen, const char *mbsrc, size_t len, int flag)
{
	int encoded;

	/* empty extra list */
	encoded = istrsnvisx(mbdst, &dlen, mbsrc, len, flag, "");
	return encoded;
}
567 #endif
568