xref: /openbsd-src/usr.bin/tr/str.c (revision 5a38ef86d0b61900239c7913d24a05e7b88a58f0)
1 /*	$OpenBSD: str.c,v 1.14 2021/11/02 03:09:15 cheloha Exp $	*/
2 /*	$NetBSD: str.c,v 1.7 1995/08/31 22:13:47 jtc Exp $	*/
3 
4 /*-
5  * Copyright (c) 1991, 1993
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  * 3. Neither the name of the University nor the names of its contributors
17  *    may be used to endorse or promote products derived from this software
18  *    without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30  * SUCH DAMAGE.
31  */
32 
33 #include <sys/types.h>
34 
35 #include <assert.h>
36 #include <errno.h>
37 #include <stddef.h>
38 #include <stdio.h>
39 #include <stdlib.h>
40 #include <string.h>
41 #include <ctype.h>
42 #include <err.h>
43 
44 #include "extern.h"
45 
46 static int	backslash(STR *);
47 static int	bracket(STR *);
48 static int	c_class(const void *, const void *);
49 static void	genclass(STR *);
50 static void	genequiv(STR *);
51 static int	genrange(STR *);
52 static void	genseq(STR *);
53 
54 int
55 next(s)
56 	STR *s;
57 {
58 	int ch;
59 
60 	switch (s->state) {
61 	case EOS:
62 		return (0);
63 	case INFINITE:
64 		return (1);
65 	case NORMAL:
66 		switch (ch = *s->str) {
67 		case '\0':
68 			s->state = EOS;
69 			return (0);
70 		case '\\':
71 			s->lastch = backslash(s);
72 			break;
73 		case '[':
74 			if (bracket(s))
75 				return (next(s));
76 			/* FALLTHROUGH */
77 		default:
78 			++s->str;
79 			s->lastch = ch;
80 			break;
81 		}
82 
83 		/* We can start a range at any time. */
84 		if (s->str[0] == '-' && genrange(s))
85 			return (next(s));
86 		return (1);
87 	case RANGE:
88 		if (s->cnt-- == 0) {
89 			s->state = NORMAL;
90 			return (next(s));
91 		}
92 		++s->lastch;
93 		return (1);
94 	case SEQUENCE:
95 		if (s->cnt-- == 0) {
96 			s->state = NORMAL;
97 			return (next(s));
98 		}
99 		return (1);
100 	case SET:
101 		if ((s->lastch = s->set[s->cnt++]) == OOBCH) {
102 			s->state = NORMAL;
103 			return (next(s));
104 		}
105 		return (1);
106 	default:
107 		return 0;
108 	}
109 	/* NOTREACHED */
110 }
111 
112 static int
113 bracket(s)
114 	STR *s;
115 {
116 	char *p;
117 
118 	switch (s->str[1]) {
119 	case ':':				/* "[:class:]" */
120 		if ((p = strstr((char *)s->str + 2, ":]")) == NULL)
121 			return (0);
122 		*p = '\0';
123 		s->str += 2;
124 		genclass(s);
125 		s->str = (unsigned char *)p + 2;
126 		return (1);
127 	case '=':				/* "[=equiv=]" */
128 		if ((p = strstr((char *)s->str + 2, "=]")) == NULL)
129 			return (0);
130 		s->str += 2;
131 		genequiv(s);
132 		return (1);
133 	default:				/* "[\###*n]" or "[#*n]" */
134 		if ((p = strpbrk((char *)s->str + 2, "*]")) == NULL)
135 			return (0);
136 		if (p[0] != '*' || strchr(p, ']') == NULL)
137 			return (0);
138 		s->str += 1;
139 		genseq(s);
140 		return (1);
141 	}
142 	/* NOTREACHED */
143 }
144 
/*
 * One entry per character class accepted in "[:name:]".
 *
 *   name  class name as written between "[:" and ":]"
 *   func  <ctype.h> predicate that decides membership
 *   set   OOBCH-terminated member list, built on first use by
 *         genclass(); NULL until then
 */
typedef struct {
	char *name;
	int (*func)(int);
	int *set;
} CLASS;

/* Must stay sorted by name: genclass() looks entries up with bsearch(). */
static CLASS classes[] = {
	{ "alnum",  isalnum,  NULL },
	{ "alpha",  isalpha,  NULL },
	{ "blank",  isblank,  NULL },
	{ "cntrl",  iscntrl,  NULL },
	{ "digit",  isdigit,  NULL },
	{ "graph",  isgraph,  NULL },
	{ "lower",  islower,  NULL },
	{ "print",  isprint,  NULL },
	{ "punct",  ispunct,  NULL },
	{ "space",  isspace,  NULL },
	{ "upper",  isupper,  NULL },
	{ "xdigit", isxdigit, NULL },
};
165 
166 static void
167 genclass(STR *s)
168 {
169 	CLASS *cp, tmp;
170 	size_t len;
171 	int i;
172 
173 	tmp.name = (char *)s->str;
174 	if ((cp = (CLASS *)bsearch(&tmp, classes, sizeof(classes) /
175 	    sizeof(CLASS), sizeof(CLASS), c_class)) == NULL)
176 		errx(1, "unknown class %s", s->str);
177 
178 	/*
179 	 * Generate the set of characters in the class if we haven't
180 	 * already done so.
181 	 */
182 	if (cp->set == NULL) {
183 		cp->set = reallocarray(NULL, NCHARS + 1, sizeof(*cp->set));
184 		if (cp->set == NULL)
185 			err(1, NULL);
186 		len = 0;
187 		for (i = 0; i < NCHARS; i++) {
188 			if (cp->func(i)) {
189 				cp->set[len] = i;
190 				len++;
191 			}
192 		}
193 		cp->set[len] = OOBCH;
194 		len++;
195 		cp->set = reallocarray(cp->set, len, sizeof(*cp->set));
196 		if (cp->set == NULL)
197 			err(1, NULL);
198 	}
199 
200 	s->cnt = 0;
201 	s->state = SET;
202 	s->set = cp->set;
203 }
204 
205 static int
206 c_class(a, b)
207 	const void *a, *b;
208 {
209 	return (strcmp(((CLASS *)a)->name, ((CLASS *)b)->name));
210 }
211 
212 /*
213  * English doesn't have any equivalence classes, so for now
214  * we just syntax check and grab the character.
215  */
216 static void
217 genequiv(s)
218 	STR *s;
219 {
220 	if (*s->str == '\\') {
221 		s->equiv[0] = backslash(s);
222 		if (*s->str != '=')
223 			errx(1, "misplaced equivalence equals sign");
224 	} else {
225 		s->equiv[0] = s->str[0];
226 		if (s->str[1] != '=')
227 			errx(1, "misplaced equivalence equals sign");
228 	}
229 	s->str += 2;
230 	s->cnt = 0;
231 	s->state = SET;
232 	s->set = s->equiv;
233 }
234 
235 static int
236 genrange(s)
237 	STR *s;
238 {
239 	int stopval;
240 	unsigned char *savestart;
241 
242 	savestart = s->str;
243 	stopval = *++s->str == '\\' ? backslash(s) : *s->str++;
244 	if (stopval < (u_char)s->lastch) {
245 		s->str = savestart;
246 		return (0);
247 	}
248 	s->cnt = stopval - s->lastch + 1;
249 	s->state = RANGE;
250 	--s->lastch;
251 	return (1);
252 }
253 
254 static void
255 genseq(s)
256 	STR *s;
257 {
258 	char *ep;
259 
260 	if (s->which == STRING1)
261 		errx(1, "sequences only valid in string2");
262 
263 	if (*s->str == '\\')
264 		s->lastch = backslash(s);
265 	else
266 		s->lastch = *s->str++;
267 	if (*s->str != '*')
268 		errx(1, "misplaced sequence asterisk");
269 
270 	switch (*++s->str) {
271 	case '\\':
272 		s->cnt = backslash(s);
273 		break;
274 	case ']':
275 		s->cnt = 0;
276 		++s->str;
277 		break;
278 	default:
279 		if (isdigit(*s->str)) {
280 			s->cnt = strtol((char *)s->str, &ep, 0);
281 			if (*ep == ']') {
282 				s->str = (unsigned char *)ep + 1;
283 				break;
284 			}
285 		}
286 		errx(1, "illegal sequence count");
287 		/* NOTREACHED */
288 	}
289 
290 	s->state = s->cnt ? SEQUENCE : INFINITE;
291 }
292 
293 /*
294  * Translate \??? into a character.  Up to 3 octal digits, if no digits either
295  * an escape code or a literal character.
296  */
297 static int
298 backslash(STR *s)
299 {
300 	size_t i;
301 	int ch, val;
302 
303 	assert(*s->str == '\\');
304 	s->str++;
305 
306 	/* Empty escapes become plain backslashes. */
307 	if (*s->str == '\0') {
308 		s->state = EOS;
309 		return ('\\');
310 	}
311 
312 	val = 0;
313 	for (i = 0; i < 3; i++) {
314 		if (s->str[i] < '0' || '7' < s->str[i])
315 			break;
316 		val = val * 8 + s->str[i] - '0';
317 	}
318 	if (i > 0) {
319 		if (val > UCHAR_MAX)
320 			errx(1, "octal value out of range: %d", val);
321 		s->str += i;
322 		return (val);
323 	}
324 
325 	ch = *s->str++;
326 	switch (ch) {
327 		case 'a':			/* escape characters */
328 			return ('\7');
329 		case 'b':
330 			return ('\b');
331 		case 'f':
332 			return ('\f');
333 		case 'n':
334 			return ('\n');
335 		case 'r':
336 			return ('\r');
337 		case 't':
338 			return ('\t');
339 		case 'v':
340 			return ('\13');
341 		default:			/* \x" -> x */
342 			return (ch);
343 	}
344 }
345