xref: /openbsd-src/usr.bin/tr/str.c (revision d13be5d47e4149db2549a9828e244d59dbc43f15)
1 /*	$OpenBSD: str.c,v 1.11 2009/10/27 23:59:46 deraadt Exp $	*/
2 /*	$NetBSD: str.c,v 1.7 1995/08/31 22:13:47 jtc Exp $	*/
3 
4 /*-
5  * Copyright (c) 1991, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 #include <sys/cdefs.h>
34 #include <sys/types.h>
35 
36 #include <errno.h>
37 #include <stddef.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <ctype.h>
42 #include <err.h>
43 
44 #include "extern.h"
45 
46 static int	backslash(STR *);
47 static int	bracket(STR *);
48 static int	c_class(const void *, const void *);
49 static void	genclass(STR *);
50 static void	genequiv(STR *);
51 static int	genrange(STR *);
52 static void	genseq(STR *);
53 
54 int
55 next(s)
56 	STR *s;
57 {
58 	int ch;
59 
60 	switch (s->state) {
61 	case EOS:
62 		return (0);
63 	case INFINITE:
64 		return (1);
65 	case NORMAL:
66 		switch (ch = *s->str) {
67 		case '\0':
68 			s->state = EOS;
69 			return (0);
70 		case '\\':
71 			s->lastch = backslash(s);
72 			break;
73 		case '[':
74 			if (bracket(s))
75 				return (next(s));
76 			/* FALLTHROUGH */
77 		default:
78 			++s->str;
79 			s->lastch = ch;
80 			break;
81 		}
82 
83 		/* We can start a range at any time. */
84 		if (s->str[0] == '-' && genrange(s))
85 			return (next(s));
86 		return (1);
87 	case RANGE:
88 		if (s->cnt-- == 0) {
89 			s->state = NORMAL;
90 			return (next(s));
91 		}
92 		++s->lastch;
93 		return (1);
94 	case SEQUENCE:
95 		if (s->cnt-- == 0) {
96 			s->state = NORMAL;
97 			return (next(s));
98 		}
99 		return (1);
100 	case SET:
101 		if ((s->lastch = s->set[s->cnt++]) == OOBCH) {
102 			s->state = NORMAL;
103 			return (next(s));
104 		}
105 		return (1);
106 	default:
107 		return 0;
108 	}
109 	/* NOTREACHED */
110 }
111 
112 static int
113 bracket(s)
114 	STR *s;
115 {
116 	char *p;
117 
118 	switch (s->str[1]) {
119 	case ':':				/* "[:class:]" */
120 		if ((p = strstr((char *)s->str + 2, ":]")) == NULL)
121 			return (0);
122 		*p = '\0';
123 		s->str += 2;
124 		genclass(s);
125 		s->str = (unsigned char *)p + 2;
126 		return (1);
127 	case '=':				/* "[=equiv=]" */
128 		if ((p = strstr((char *)s->str + 2, "=]")) == NULL)
129 			return (0);
130 		s->str += 2;
131 		genequiv(s);
132 		return (1);
133 	default:				/* "[\###*n]" or "[#*n]" */
134 		if ((p = strpbrk((char *)s->str + 2, "*]")) == NULL)
135 			return (0);
136 		if (p[0] != '*' || strchr(p, ']') == NULL)
137 			return (0);
138 		s->str += 1;
139 		genseq(s);
140 		return (1);
141 	}
142 	/* NOTREACHED */
143 }
144 
/*
 * One entry of the character-class table used for "[:class:]".
 */
typedef struct {
	char *name;		/* class name, the lookup key used by c_class() */
	int (*func)(int);	/* <ctype.h> predicate deciding membership */
	int *set;		/* member list filled in by genclass(), ends with OOBCH */
} CLASS;
150 
151 static CLASS classes[] = {
152 	{ "alnum",  isalnum,  },
153 	{ "alpha",  isalpha,  },
154 	{ "blank",  isblank,  },
155 	{ "cntrl",  iscntrl,  },
156 	{ "digit",  isdigit,  },
157 	{ "graph",  isgraph,  },
158 	{ "lower",  islower,  },
159 	{ "print",  isprint,  },
160 	{ "punct",  ispunct,  },
161 	{ "space",  isspace,  },
162 	{ "upper",  isupper,  },
163 	{ "xdigit", isxdigit, },
164 };
165 
166 static void
167 genclass(s)
168 	STR *s;
169 {
170 	int cnt, (*func)(int);
171 	CLASS *cp, tmp;
172 	int *p;
173 
174 	tmp.name = (char *)s->str;
175 	if ((cp = (CLASS *)bsearch(&tmp, classes, sizeof(classes) /
176 	    sizeof(CLASS), sizeof(CLASS), c_class)) == NULL)
177 		errx(1, "unknown class %s", s->str);
178 
179 	if ((cp->set = p = calloc(NCHARS + 1, sizeof(int))) == NULL)
180 		errx(1, "no memory for a class");
181 	for (cnt = 0, func = cp->func; cnt < NCHARS; ++cnt)
182 		if ((func)(cnt))
183 			*p++ = cnt;
184 	*p = OOBCH;
185 
186 	s->cnt = 0;
187 	s->state = SET;
188 	s->set = cp->set;
189 }
190 
191 static int
192 c_class(a, b)
193 	const void *a, *b;
194 {
195 	return (strcmp(((CLASS *)a)->name, ((CLASS *)b)->name));
196 }
197 
198 /*
199  * English doesn't have any equivalence classes, so for now
200  * we just syntax check and grab the character.
201  */
202 static void
203 genequiv(s)
204 	STR *s;
205 {
206 	if (*s->str == '\\') {
207 		s->equiv[0] = backslash(s);
208 		if (*s->str != '=')
209 			errx(1, "misplaced equivalence equals sign");
210 	} else {
211 		s->equiv[0] = s->str[0];
212 		if (s->str[1] != '=')
213 			errx(1, "misplaced equivalence equals sign");
214 	}
215 	s->str += 2;
216 	s->cnt = 0;
217 	s->state = SET;
218 	s->set = s->equiv;
219 }
220 
221 static int
222 genrange(s)
223 	STR *s;
224 {
225 	int stopval;
226 	unsigned char *savestart;
227 
228 	savestart = s->str;
229 	stopval = *++s->str == '\\' ? backslash(s) : *s->str++;
230 	if (stopval < (u_char)s->lastch) {
231 		s->str = savestart;
232 		return (0);
233 	}
234 	s->cnt = stopval - s->lastch + 1;
235 	s->state = RANGE;
236 	--s->lastch;
237 	return (1);
238 }
239 
240 static void
241 genseq(s)
242 	STR *s;
243 {
244 	char *ep;
245 
246 	if (s->which == STRING1)
247 		errx(1, "sequences only valid in string2");
248 
249 	if (*s->str == '\\')
250 		s->lastch = backslash(s);
251 	else
252 		s->lastch = *s->str++;
253 	if (*s->str != '*')
254 		errx(1, "misplaced sequence asterisk");
255 
256 	switch (*++s->str) {
257 	case '\\':
258 		s->cnt = backslash(s);
259 		break;
260 	case ']':
261 		s->cnt = 0;
262 		++s->str;
263 		break;
264 	default:
265 		if (isdigit(*s->str)) {
266 			s->cnt = strtol((char *)s->str, &ep, 0);
267 			if (*ep == ']') {
268 				s->str = (unsigned char *)ep + 1;
269 				break;
270 			}
271 		}
272 		errx(1, "illegal sequence count");
273 		/* NOTREACHED */
274 	}
275 
276 	s->state = s->cnt ? SEQUENCE : INFINITE;
277 }
278 
279 /*
280  * Translate \??? into a character.  Up to 3 octal digits, if no digits either
281  * an escape code or a literal character.
282  */
283 static int
284 backslash(s)
285 	STR *s;
286 {
287 	int ch, cnt, val;
288 
289 	for (cnt = val = 0;;) {
290 		ch = *++s->str;
291 		if (!isascii(ch) || !isdigit(ch))
292 			break;
293 		val = val * 8 + ch - '0';
294 		if (++cnt == 3) {
295 			++s->str;
296 			break;
297 		}
298 	}
299 	if (cnt)
300 		return (val);
301 	if (ch != '\0')
302 		++s->str;
303 	switch (ch) {
304 		case 'a':			/* escape characters */
305 			return ('\7');
306 		case 'b':
307 			return ('\b');
308 		case 'f':
309 			return ('\f');
310 		case 'n':
311 			return ('\n');
312 		case 'r':
313 			return ('\r');
314 		case 't':
315 			return ('\t');
316 		case 'v':
317 			return ('\13');
318 		case '\0':			/*  \" -> \ */
319 			s->state = EOS;
320 			return ('\\');
321 		default:			/* \x" -> x */
322 			return (ch);
323 	}
324 }
325