xref: /netbsd-src/usr.bin/tr/str.c (revision 8d25f7611bdbf7254ba850e36b4404c983a43836)
1*8d25f761Sleot /*	$NetBSD: str.c,v 1.30 2018/05/26 11:20:30 leot Exp $	*/
2a1228f55Sjtc 
39acecc03Sglass /*-
4a1228f55Sjtc  * Copyright (c) 1991, 1993
5a1228f55Sjtc  *	The Regents of the University of California.  All rights reserved.
69acecc03Sglass  *
79acecc03Sglass  * Redistribution and use in source and binary forms, with or without
89acecc03Sglass  * modification, are permitted provided that the following conditions
99acecc03Sglass  * are met:
109acecc03Sglass  * 1. Redistributions of source code must retain the above copyright
119acecc03Sglass  *    notice, this list of conditions and the following disclaimer.
129acecc03Sglass  * 2. Redistributions in binary form must reproduce the above copyright
139acecc03Sglass  *    notice, this list of conditions and the following disclaimer in the
149acecc03Sglass  *    documentation and/or other materials provided with the distribution.
1589aaa1bbSagc  * 3. Neither the name of the University nor the names of its contributors
169acecc03Sglass  *    may be used to endorse or promote products derived from this software
179acecc03Sglass  *    without specific prior written permission.
189acecc03Sglass  *
199acecc03Sglass  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
209acecc03Sglass  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
219acecc03Sglass  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
229acecc03Sglass  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
239acecc03Sglass  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
249acecc03Sglass  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
259acecc03Sglass  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
269acecc03Sglass  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
279acecc03Sglass  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
289acecc03Sglass  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
299acecc03Sglass  * SUCH DAMAGE.
309acecc03Sglass  */
319acecc03Sglass 
327e4d6ea1Slukem #include <sys/cdefs.h>
339acecc03Sglass #ifndef lint
34a1228f55Sjtc #if 0
35f1df59adSjtc static char sccsid[] = "@(#)str.c	8.2 (Berkeley) 4/28/95";
36a1228f55Sjtc #endif
37*8d25f761Sleot __RCSID("$NetBSD: str.c,v 1.30 2018/05/26 11:20:30 leot Exp $");
389acecc03Sglass #endif /* not lint */
399acecc03Sglass 
409acecc03Sglass #include <sys/types.h>
419acecc03Sglass 
427e4d6ea1Slukem #include <err.h>
439acecc03Sglass #include <errno.h>
449acecc03Sglass #include <stddef.h>
459acecc03Sglass #include <stdio.h>
469acecc03Sglass #include <stdlib.h>
479acecc03Sglass #include <string.h>
48be9cb223Sjtc #include <ctype.h>
49d05d2aa5Sdholland #include <assert.h>
509acecc03Sglass 
519acecc03Sglass #include "extern.h"
529acecc03Sglass 
537f7f5495Sdholland struct str {
547f7f5495Sdholland 	enum { STRING1, STRING2 } which;
557f7f5495Sdholland 	enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE, SET } state;
567f7f5495Sdholland 	int cnt;			/* character count */
577f7f5495Sdholland 	int lastch;			/* last character */
587f7f5495Sdholland 	int equiv[2];			/* equivalence set */
597f7f5495Sdholland 	int *set;			/* set of characters */
609f2ae0eaSdholland 	const char *str;		/* user's string */
617f7f5495Sdholland };
627f7f5495Sdholland 
631d578019Sjoerg static int backslash(STR *);
641d578019Sjoerg static int bracket(STR *);
651d578019Sjoerg static int c_class(const void *, const void *);
66aba78f75Sdholland static int *genclass(const char *, size_t);
671d578019Sjoerg static void genequiv(STR *);
681d578019Sjoerg static int genrange(STR *);
691d578019Sjoerg static void genseq(STR *);
709acecc03Sglass 
717f7f5495Sdholland STR *
str_create(int whichstring,const char * txt)728b42fe33Sdholland str_create(int whichstring, const char *txt)
737f7f5495Sdholland {
747f7f5495Sdholland 	STR *s;
757f7f5495Sdholland 
767f7f5495Sdholland 	s = malloc(sizeof(*s));
777f7f5495Sdholland 	if (s == NULL) {
787f7f5495Sdholland 		err(1, "Out of memory");
797f7f5495Sdholland 	}
807f7f5495Sdholland 
817f7f5495Sdholland 	s->which = whichstring == 2 ? STRING2 : STRING1;
827f7f5495Sdholland 	s->state = NORMAL;
837f7f5495Sdholland 	s->cnt = 0;
847f7f5495Sdholland 	s->lastch = OOBCH;
857f7f5495Sdholland 	s->equiv[0] = 0;
867f7f5495Sdholland 	s->equiv[1] = OOBCH;
877f7f5495Sdholland 	s->set = NULL;
888b42fe33Sdholland 	s->str = txt;
897f7f5495Sdholland 
907f7f5495Sdholland 	return s;
917f7f5495Sdholland }
927f7f5495Sdholland 
937f7f5495Sdholland void
str_destroy(STR * s)947f7f5495Sdholland str_destroy(STR *s)
957f7f5495Sdholland {
967f7f5495Sdholland 	if (s->set != NULL && s->set != s->equiv) {
977f7f5495Sdholland 		free(s->set);
987f7f5495Sdholland 	}
997f7f5495Sdholland 	free(s);
1007f7f5495Sdholland }
1017f7f5495Sdholland 
1029acecc03Sglass int
next(STR * s,int * ret)103d05d2aa5Sdholland next(STR *s, int *ret)
1049acecc03Sglass {
1057e4d6ea1Slukem 	int ch;
1069acecc03Sglass 
1079acecc03Sglass 	switch (s->state) {
1089acecc03Sglass 	case EOS:
109d05d2aa5Sdholland 		*ret = s->lastch;
1100ffd30a0Schristos 		return 0;
1119acecc03Sglass 	case INFINITE:
112d05d2aa5Sdholland 		*ret = s->lastch;
1130ffd30a0Schristos 		return 1;
1149acecc03Sglass 	case NORMAL:
1155e8eff37Sdholland 		ch = (unsigned char)s->str[0];
1165e8eff37Sdholland 		switch (ch) {
1179acecc03Sglass 		case '\0':
1189acecc03Sglass 			s->state = EOS;
119d05d2aa5Sdholland 			*ret = s->lastch;
1200ffd30a0Schristos 			return 0;
1219acecc03Sglass 		case '\\':
1229acecc03Sglass 			s->lastch = backslash(s);
1239acecc03Sglass 			break;
1249acecc03Sglass 		case '[':
1255e8eff37Sdholland 			if (bracket(s)) {
126d05d2aa5Sdholland 				return next(s, ret);
1275e8eff37Sdholland 			}
1289acecc03Sglass 			/* FALLTHROUGH */
1299acecc03Sglass 		default:
1309acecc03Sglass 			++s->str;
1319acecc03Sglass 			s->lastch = ch;
1329acecc03Sglass 			break;
1339acecc03Sglass 		}
1349acecc03Sglass 
1359acecc03Sglass 		/* We can start a range at any time. */
136d05d2aa5Sdholland 		if (s->str[0] == '-' && genrange(s)) {
137d05d2aa5Sdholland 			return next(s, ret);
138d05d2aa5Sdholland 		}
139d05d2aa5Sdholland 		*ret = s->lastch;
1400ffd30a0Schristos 		return 1;
1419acecc03Sglass 	case RANGE:
1425e8eff37Sdholland 		if (s->cnt == 0) {
1439acecc03Sglass 			s->state = NORMAL;
144d05d2aa5Sdholland 			return next(s, ret);
1459acecc03Sglass 		}
1465e8eff37Sdholland 		s->cnt--;
1479acecc03Sglass 		++s->lastch;
148d05d2aa5Sdholland 		*ret = s->lastch;
1490ffd30a0Schristos 		return 1;
1509acecc03Sglass 	case SEQUENCE:
1515e8eff37Sdholland 		if (s->cnt == 0) {
1529acecc03Sglass 			s->state = NORMAL;
153d05d2aa5Sdholland 			return next(s, ret);
1549acecc03Sglass 		}
1555e8eff37Sdholland 		s->cnt--;
156d05d2aa5Sdholland 		*ret = s->lastch;
1570ffd30a0Schristos 		return 1;
1589acecc03Sglass 	case SET:
1595e8eff37Sdholland 		s->lastch = s->set[s->cnt++];
1605e8eff37Sdholland 		if (s->lastch == OOBCH) {
1619acecc03Sglass 			s->state = NORMAL;
1625e8eff37Sdholland 			if (s->set != s->equiv) {
1635e8eff37Sdholland 				free(s->set);
1645e8eff37Sdholland 			}
1655e8eff37Sdholland 			s->set = NULL;
166d05d2aa5Sdholland 			return next(s, ret);
1679acecc03Sglass 		}
168d05d2aa5Sdholland 		*ret = s->lastch;
1690ffd30a0Schristos 		return 1;
1709acecc03Sglass 	}
1719acecc03Sglass 	/* NOTREACHED */
172d05d2aa5Sdholland 	assert(0);
173d05d2aa5Sdholland 	*ret = s->lastch;
1740ffd30a0Schristos 	return 0;
1759acecc03Sglass }
1769acecc03Sglass 
1779acecc03Sglass static int
bracket(STR * s)1781d578019Sjoerg bracket(STR *s)
1799acecc03Sglass {
1805e8eff37Sdholland 	const char *p;
181aba78f75Sdholland 	int *q;
1829acecc03Sglass 
1839acecc03Sglass 	switch (s->str[1]) {
1849acecc03Sglass 	case ':':				/* "[:class:]" */
1859acecc03Sglass 		if ((p = strstr(s->str + 2, ":]")) == NULL)
1860ffd30a0Schristos 			return 0;
1879acecc03Sglass 		s->str += 2;
188aba78f75Sdholland 		q = genclass(s->str, p - s->str);
189aba78f75Sdholland 		s->state = SET;
190aba78f75Sdholland 		s->set = q;
191aba78f75Sdholland 		s->cnt = 0;
1929acecc03Sglass 		s->str = p + 2;
1930ffd30a0Schristos 		return 1;
1949acecc03Sglass 	case '=':				/* "[=equiv=]" */
1959acecc03Sglass 		if ((p = strstr(s->str + 2, "=]")) == NULL)
1960ffd30a0Schristos 			return 0;
1979acecc03Sglass 		s->str += 2;
1989acecc03Sglass 		genequiv(s);
1998d0b47d1Sdholland 		s->str = p + 2;
2000ffd30a0Schristos 		return 1;
2019acecc03Sglass 	default:				/* "[\###*n]" or "[#*n]" */
2029acecc03Sglass 		if ((p = strpbrk(s->str + 2, "*]")) == NULL)
2030ffd30a0Schristos 			return 0;
2047e4d6ea1Slukem 		if (p[0] != '*' || strchr(p, ']') == NULL)
2050ffd30a0Schristos 			return 0;
2069acecc03Sglass 		s->str += 1;
2079acecc03Sglass 		genseq(s);
2080ffd30a0Schristos 		return 1;
2099acecc03Sglass 	}
2109acecc03Sglass 	/* NOTREACHED */
2119acecc03Sglass }
2129acecc03Sglass 
2139acecc03Sglass typedef struct {
2147acf67f4Slukem 	const char *name;
2151d578019Sjoerg 	int (*func)(int);
2169acecc03Sglass } CLASS;
2179acecc03Sglass 
2180ffd30a0Schristos static const CLASS classes[] = {
2190ffd30a0Schristos 	{ "alnum",  isalnum  },
2200ffd30a0Schristos 	{ "alpha",  isalpha  },
2210ffd30a0Schristos 	{ "blank",  isblank  },
2220ffd30a0Schristos 	{ "cntrl",  iscntrl  },
2230ffd30a0Schristos 	{ "digit",  isdigit  },
2240ffd30a0Schristos 	{ "graph",  isgraph  },
2250ffd30a0Schristos 	{ "lower",  islower  },
2260ffd30a0Schristos 	{ "print",  isprint  },
2270ffd30a0Schristos 	{ "punct",  ispunct  },
2280ffd30a0Schristos 	{ "space",  isspace  },
2290ffd30a0Schristos 	{ "upper",  isupper  },
2300ffd30a0Schristos 	{ "xdigit", isxdigit },
2319acecc03Sglass };
2329acecc03Sglass 
2335e8eff37Sdholland typedef struct {
2345e8eff37Sdholland 	const char *name;
2355e8eff37Sdholland 	size_t len;
2365e8eff37Sdholland } CLASSKEY;
2375e8eff37Sdholland 
238aba78f75Sdholland static int *
genclass(const char * class,size_t len)239aba78f75Sdholland genclass(const char *class, size_t len)
2409acecc03Sglass {
2415e8eff37Sdholland 	int ch;
2420ffd30a0Schristos 	const CLASS *cp;
2435e8eff37Sdholland 	CLASSKEY key;
2449acecc03Sglass 	int *p;
2455e8eff37Sdholland 	unsigned pos, num;
2469acecc03Sglass 
2475e8eff37Sdholland 	/* Find the class */
2485e8eff37Sdholland 	key.name = class;
2495e8eff37Sdholland 	key.len = len;
2505e8eff37Sdholland 	cp = bsearch(&key, classes, __arraycount(classes), sizeof(classes[0]),
2515e8eff37Sdholland 		     c_class);
2525e8eff37Sdholland 	if (cp == NULL) {
2535e8eff37Sdholland 		errx(1, "unknown class %.*s", (int)len, class);
2545e8eff37Sdholland 	}
2559acecc03Sglass 
2565e8eff37Sdholland 	/*
2575e8eff37Sdholland 	 * Figure out what characters are in the class
2585e8eff37Sdholland 	 */
2595e8eff37Sdholland 
2605e8eff37Sdholland 	num = NCHARS + 1;
2615e8eff37Sdholland 	p = malloc(num * sizeof(*p));
2625e8eff37Sdholland 	if (p == NULL) {
2637e4d6ea1Slukem 		err(1, "malloc");
2645e8eff37Sdholland 	}
2656438f483Schristos 
2665e8eff37Sdholland 	pos = 0;
2675e8eff37Sdholland 	for (ch = 0; ch < NCHARS; ch++) {
2685e8eff37Sdholland 		if (cp->func(ch)) {
2695e8eff37Sdholland 			p[pos++] = ch;
2705e8eff37Sdholland 		}
2715e8eff37Sdholland 	}
2729acecc03Sglass 
2735e8eff37Sdholland 	p[pos++] = OOBCH;
2745e8eff37Sdholland 	for (; pos < num; pos++) {
2755e8eff37Sdholland 		p[pos] = 0;
2765e8eff37Sdholland 	}
2775e8eff37Sdholland 
278aba78f75Sdholland 	return p;
2799acecc03Sglass }
2809acecc03Sglass 
2819acecc03Sglass static int
c_class(const void * av,const void * bv)2825e8eff37Sdholland c_class(const void *av, const void *bv)
2839acecc03Sglass {
2845e8eff37Sdholland 	const CLASSKEY *a = av;
2855e8eff37Sdholland 	const CLASS *b = bv;
2865e8eff37Sdholland 	size_t blen;
2875e8eff37Sdholland 	int r;
2885e8eff37Sdholland 
2895e8eff37Sdholland 	blen = strlen(b->name);
2905e8eff37Sdholland 	r = strncmp(a->name, b->name, a->len);
2915e8eff37Sdholland 	if (r != 0) {
2925e8eff37Sdholland 		return r;
2935e8eff37Sdholland 	}
2945e8eff37Sdholland 	if (a->len < blen) {
2955e8eff37Sdholland 		/* someone gave us a prefix of the right name */
2965e8eff37Sdholland 		return -1;
2975e8eff37Sdholland 	}
2985e8eff37Sdholland 	assert(a-> len == blen);
2995e8eff37Sdholland 	return 0;
3009acecc03Sglass }
3019acecc03Sglass 
3029acecc03Sglass /*
3039acecc03Sglass  * English doesn't have any equivalence classes, so for now
3049acecc03Sglass  * we just syntax check and grab the character.
3059acecc03Sglass  */
3069acecc03Sglass static void
genequiv(STR * s)3071d578019Sjoerg genequiv(STR *s)
3089acecc03Sglass {
309aba78f75Sdholland 	int ch;
310aba78f75Sdholland 
311aba78f75Sdholland 	ch = (unsigned char)s->str[0];
312aba78f75Sdholland 	if (ch == '\\') {
3139acecc03Sglass 		s->equiv[0] = backslash(s);
3149acecc03Sglass 	} else {
315aba78f75Sdholland 		s->equiv[0] = ch;
3168d0b47d1Sdholland 		s->str++;
3178d0b47d1Sdholland 	}
3188d0b47d1Sdholland 	if (s->str[0] != '=') {
3195e8eff37Sdholland 		errx(1, "Misplaced equivalence equals sign");
3205e8eff37Sdholland 	}
3218d0b47d1Sdholland 	s->str++;
3228d0b47d1Sdholland 	if (s->str[0] != ']') {
3238d0b47d1Sdholland 		errx(1, "Misplaced equivalence right bracket");
3249acecc03Sglass 	}
3258d0b47d1Sdholland 	s->str++;
3268d0b47d1Sdholland 
3279acecc03Sglass 	s->cnt = 0;
3289acecc03Sglass 	s->state = SET;
3299acecc03Sglass 	s->set = s->equiv;
3309acecc03Sglass }
3319acecc03Sglass 
3329acecc03Sglass static int
genrange(STR * s)3331d578019Sjoerg genrange(STR *s)
3349acecc03Sglass {
3359acecc03Sglass 	int stopval;
33648dbbc10Sdholland 	const char *savestart;
3379acecc03Sglass 
338d506ddf6Sdholland 	savestart = s->str++;
3395e8eff37Sdholland 	stopval = s->str[0] == '\\' ? backslash(s) : (unsigned char)*s->str++;
3405e8eff37Sdholland 	if (stopval < (unsigned char)s->lastch) {
3419acecc03Sglass 		s->str = savestart;
3420ffd30a0Schristos 		return 0;
3439acecc03Sglass 	}
3449acecc03Sglass 	s->cnt = stopval - s->lastch + 1;
3459acecc03Sglass 	s->state = RANGE;
3469acecc03Sglass 	--s->lastch;
3470ffd30a0Schristos 	return 1;
3489acecc03Sglass }
3499acecc03Sglass 
3509acecc03Sglass static void
genseq(STR * s)3511d578019Sjoerg genseq(STR *s)
3529acecc03Sglass {
3539acecc03Sglass 	char *ep;
3549acecc03Sglass 
3555e8eff37Sdholland 	if (s->which == STRING1) {
3565e8eff37Sdholland 		errx(1, "Sequences only valid in string2");
3575e8eff37Sdholland 	}
3589acecc03Sglass 
3595e8eff37Sdholland 	if (*s->str == '\\') {
3609acecc03Sglass 		s->lastch = backslash(s);
3615e8eff37Sdholland 	} else {
3629f2ae0eaSdholland 		s->lastch = (unsigned char)*s->str++;
3635e8eff37Sdholland 	}
3645e8eff37Sdholland 	if (*s->str != '*') {
3655e8eff37Sdholland 		errx(1, "Misplaced sequence asterisk");
3665e8eff37Sdholland 	}
3679acecc03Sglass 
3685e8eff37Sdholland 	s->str++;
3695e8eff37Sdholland 	switch (s->str[0]) {
3709acecc03Sglass 	case '\\':
3719acecc03Sglass 		s->cnt = backslash(s);
3729acecc03Sglass 		break;
3739acecc03Sglass 	case ']':
3749acecc03Sglass 		s->cnt = 0;
3759acecc03Sglass 		++s->str;
3769acecc03Sglass 		break;
3779acecc03Sglass 	default:
3785e8eff37Sdholland 		if (isdigit((unsigned char)s->str[0])) {
3799acecc03Sglass 			s->cnt = strtol(s->str, &ep, 0);
3809acecc03Sglass 			if (*ep == ']') {
3819acecc03Sglass 				s->str = ep + 1;
3829acecc03Sglass 				break;
3839acecc03Sglass 			}
3849acecc03Sglass 		}
3857e4d6ea1Slukem 		errx(1, "illegal sequence count");
3869acecc03Sglass 		/* NOTREACHED */
3879acecc03Sglass 	}
3889acecc03Sglass 
3899acecc03Sglass 	s->state = s->cnt ? SEQUENCE : INFINITE;
3909acecc03Sglass }
3919acecc03Sglass 
3929acecc03Sglass /*
3939acecc03Sglass  * Translate \??? into a character.  Up to 3 octal digits, if no digits either
3949acecc03Sglass  * an escape code or a literal character.
3959acecc03Sglass  */
3969acecc03Sglass static int
backslash(STR * s)3971d578019Sjoerg backslash(STR *s)
3989acecc03Sglass {
3997e4d6ea1Slukem 	int ch, cnt, val;
4009acecc03Sglass 
401aba78f75Sdholland 	cnt = val = 0;
402aba78f75Sdholland 	for (;;) {
403aba78f75Sdholland 		/* Consume the character we're already on. */
4045e8eff37Sdholland 		s->str++;
405aba78f75Sdholland 
406aba78f75Sdholland 		/* Look at the next character. */
4075e8eff37Sdholland 		ch = (unsigned char)s->str[0];
4085e8eff37Sdholland 		if (!isascii(ch) || !isdigit(ch)) {
4099acecc03Sglass 			break;
4105e8eff37Sdholland 		}
4119acecc03Sglass 		val = val * 8 + ch - '0';
4129acecc03Sglass 		if (++cnt == 3) {
413aba78f75Sdholland 			/* Enough digits; consume this one and stop */
4149acecc03Sglass 			++s->str;
4159acecc03Sglass 			break;
4169acecc03Sglass 		}
4179acecc03Sglass 	}
4185e8eff37Sdholland 	if (cnt) {
419aba78f75Sdholland 		/* We saw digits, so return their value */
420*8d25f761Sleot 		if (val >= OOBCH)
421*8d25f761Sleot 			errx(1, "Invalid octal character value");
4220ffd30a0Schristos 		return val;
4235e8eff37Sdholland 	}
424aba78f75Sdholland 	if (ch == '\0') {
425aba78f75Sdholland 		/* \<end> -> \ */
426aba78f75Sdholland 		s->state = EOS;
427aba78f75Sdholland 		return '\\';
4285e8eff37Sdholland 	}
429aba78f75Sdholland 
430aba78f75Sdholland 	/* Consume the escaped character */
431aba78f75Sdholland 	s->str++;
432aba78f75Sdholland 
4339acecc03Sglass 	switch (ch) {
4349acecc03Sglass 	case 'a':			/* escape characters */
4350ffd30a0Schristos 		return '\7';
4369acecc03Sglass 	case 'b':
4370ffd30a0Schristos 		return '\b';
4380ffd30a0Schristos 	case 'e':
4390ffd30a0Schristos 		return '\033';
4409acecc03Sglass 	case 'f':
4410ffd30a0Schristos 		return '\f';
4429acecc03Sglass 	case 'n':
4430ffd30a0Schristos 		return '\n';
4449acecc03Sglass 	case 'r':
4450ffd30a0Schristos 		return '\r';
4469acecc03Sglass 	case 't':
4470ffd30a0Schristos 		return '\t';
4489acecc03Sglass 	case 'v':
4490ffd30a0Schristos 		return '\13';
450aba78f75Sdholland 	default:			/* \q -> q */
4510ffd30a0Schristos 		return ch;
4529acecc03Sglass 	}
4539acecc03Sglass }
454