xref: /netbsd-src/usr.bin/tr/str.c (revision 8d25f7611bdbf7254ba850e36b4404c983a43836)
1 /*	$NetBSD: str.c,v 1.30 2018/05/26 11:20:30 leot Exp $	*/
2 
3 /*-
4  * Copyright (c) 1991, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  * 3. Neither the name of the University nor the names of its contributors
16  *    may be used to endorse or promote products derived from this software
17  *    without specific prior written permission.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 #ifndef lint
34 #if 0
35 static char sccsid[] = "@(#)str.c	8.2 (Berkeley) 4/28/95";
36 #endif
37 __RCSID("$NetBSD: str.c,v 1.30 2018/05/26 11:20:30 leot Exp $");
38 #endif /* not lint */
39 
40 #include <sys/types.h>
41 
42 #include <err.h>
43 #include <errno.h>
44 #include <stddef.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <ctype.h>
49 #include <assert.h>
50 
51 #include "extern.h"
52 
53 struct str {
54 	enum { STRING1, STRING2 } which;
55 	enum { EOS, INFINITE, NORMAL, RANGE, SEQUENCE, SET } state;
56 	int cnt;			/* character count */
57 	int lastch;			/* last character */
58 	int equiv[2];			/* equivalence set */
59 	int *set;			/* set of characters */
60 	const char *str;		/* user's string */
61 };
62 
63 static int backslash(STR *);
64 static int bracket(STR *);
65 static int c_class(const void *, const void *);
66 static int *genclass(const char *, size_t);
67 static void genequiv(STR *);
68 static int genrange(STR *);
69 static void genseq(STR *);
70 
71 STR *
str_create(int whichstring,const char * txt)72 str_create(int whichstring, const char *txt)
73 {
74 	STR *s;
75 
76 	s = malloc(sizeof(*s));
77 	if (s == NULL) {
78 		err(1, "Out of memory");
79 	}
80 
81 	s->which = whichstring == 2 ? STRING2 : STRING1;
82 	s->state = NORMAL;
83 	s->cnt = 0;
84 	s->lastch = OOBCH;
85 	s->equiv[0] = 0;
86 	s->equiv[1] = OOBCH;
87 	s->set = NULL;
88 	s->str = txt;
89 
90 	return s;
91 }
92 
93 void
str_destroy(STR * s)94 str_destroy(STR *s)
95 {
96 	if (s->set != NULL && s->set != s->equiv) {
97 		free(s->set);
98 	}
99 	free(s);
100 }
101 
102 int
next(STR * s,int * ret)103 next(STR *s, int *ret)
104 {
105 	int ch;
106 
107 	switch (s->state) {
108 	case EOS:
109 		*ret = s->lastch;
110 		return 0;
111 	case INFINITE:
112 		*ret = s->lastch;
113 		return 1;
114 	case NORMAL:
115 		ch = (unsigned char)s->str[0];
116 		switch (ch) {
117 		case '\0':
118 			s->state = EOS;
119 			*ret = s->lastch;
120 			return 0;
121 		case '\\':
122 			s->lastch = backslash(s);
123 			break;
124 		case '[':
125 			if (bracket(s)) {
126 				return next(s, ret);
127 			}
128 			/* FALLTHROUGH */
129 		default:
130 			++s->str;
131 			s->lastch = ch;
132 			break;
133 		}
134 
135 		/* We can start a range at any time. */
136 		if (s->str[0] == '-' && genrange(s)) {
137 			return next(s, ret);
138 		}
139 		*ret = s->lastch;
140 		return 1;
141 	case RANGE:
142 		if (s->cnt == 0) {
143 			s->state = NORMAL;
144 			return next(s, ret);
145 		}
146 		s->cnt--;
147 		++s->lastch;
148 		*ret = s->lastch;
149 		return 1;
150 	case SEQUENCE:
151 		if (s->cnt == 0) {
152 			s->state = NORMAL;
153 			return next(s, ret);
154 		}
155 		s->cnt--;
156 		*ret = s->lastch;
157 		return 1;
158 	case SET:
159 		s->lastch = s->set[s->cnt++];
160 		if (s->lastch == OOBCH) {
161 			s->state = NORMAL;
162 			if (s->set != s->equiv) {
163 				free(s->set);
164 			}
165 			s->set = NULL;
166 			return next(s, ret);
167 		}
168 		*ret = s->lastch;
169 		return 1;
170 	}
171 	/* NOTREACHED */
172 	assert(0);
173 	*ret = s->lastch;
174 	return 0;
175 }
176 
177 static int
bracket(STR * s)178 bracket(STR *s)
179 {
180 	const char *p;
181 	int *q;
182 
183 	switch (s->str[1]) {
184 	case ':':				/* "[:class:]" */
185 		if ((p = strstr(s->str + 2, ":]")) == NULL)
186 			return 0;
187 		s->str += 2;
188 		q = genclass(s->str, p - s->str);
189 		s->state = SET;
190 		s->set = q;
191 		s->cnt = 0;
192 		s->str = p + 2;
193 		return 1;
194 	case '=':				/* "[=equiv=]" */
195 		if ((p = strstr(s->str + 2, "=]")) == NULL)
196 			return 0;
197 		s->str += 2;
198 		genequiv(s);
199 		s->str = p + 2;
200 		return 1;
201 	default:				/* "[\###*n]" or "[#*n]" */
202 		if ((p = strpbrk(s->str + 2, "*]")) == NULL)
203 			return 0;
204 		if (p[0] != '*' || strchr(p, ']') == NULL)
205 			return 0;
206 		s->str += 1;
207 		genseq(s);
208 		return 1;
209 	}
210 	/* NOTREACHED */
211 }
212 
213 typedef struct {
214 	const char *name;
215 	int (*func)(int);
216 } CLASS;
217 
218 static const CLASS classes[] = {
219 	{ "alnum",  isalnum  },
220 	{ "alpha",  isalpha  },
221 	{ "blank",  isblank  },
222 	{ "cntrl",  iscntrl  },
223 	{ "digit",  isdigit  },
224 	{ "graph",  isgraph  },
225 	{ "lower",  islower  },
226 	{ "print",  isprint  },
227 	{ "punct",  ispunct  },
228 	{ "space",  isspace  },
229 	{ "upper",  isupper  },
230 	{ "xdigit", isxdigit },
231 };
232 
233 typedef struct {
234 	const char *name;
235 	size_t len;
236 } CLASSKEY;
237 
238 static int *
genclass(const char * class,size_t len)239 genclass(const char *class, size_t len)
240 {
241 	int ch;
242 	const CLASS *cp;
243 	CLASSKEY key;
244 	int *p;
245 	unsigned pos, num;
246 
247 	/* Find the class */
248 	key.name = class;
249 	key.len = len;
250 	cp = bsearch(&key, classes, __arraycount(classes), sizeof(classes[0]),
251 		     c_class);
252 	if (cp == NULL) {
253 		errx(1, "unknown class %.*s", (int)len, class);
254 	}
255 
256 	/*
257 	 * Figure out what characters are in the class
258 	 */
259 
260 	num = NCHARS + 1;
261 	p = malloc(num * sizeof(*p));
262 	if (p == NULL) {
263 		err(1, "malloc");
264 	}
265 
266 	pos = 0;
267 	for (ch = 0; ch < NCHARS; ch++) {
268 		if (cp->func(ch)) {
269 			p[pos++] = ch;
270 		}
271 	}
272 
273 	p[pos++] = OOBCH;
274 	for (; pos < num; pos++) {
275 		p[pos] = 0;
276 	}
277 
278 	return p;
279 }
280 
281 static int
c_class(const void * av,const void * bv)282 c_class(const void *av, const void *bv)
283 {
284 	const CLASSKEY *a = av;
285 	const CLASS *b = bv;
286 	size_t blen;
287 	int r;
288 
289 	blen = strlen(b->name);
290 	r = strncmp(a->name, b->name, a->len);
291 	if (r != 0) {
292 		return r;
293 	}
294 	if (a->len < blen) {
295 		/* someone gave us a prefix of the right name */
296 		return -1;
297 	}
298 	assert(a-> len == blen);
299 	return 0;
300 }
301 
302 /*
303  * English doesn't have any equivalence classes, so for now
304  * we just syntax check and grab the character.
305  */
306 static void
genequiv(STR * s)307 genequiv(STR *s)
308 {
309 	int ch;
310 
311 	ch = (unsigned char)s->str[0];
312 	if (ch == '\\') {
313 		s->equiv[0] = backslash(s);
314 	} else {
315 		s->equiv[0] = ch;
316 		s->str++;
317 	}
318 	if (s->str[0] != '=') {
319 		errx(1, "Misplaced equivalence equals sign");
320 	}
321 	s->str++;
322 	if (s->str[0] != ']') {
323 		errx(1, "Misplaced equivalence right bracket");
324 	}
325 	s->str++;
326 
327 	s->cnt = 0;
328 	s->state = SET;
329 	s->set = s->equiv;
330 }
331 
332 static int
genrange(STR * s)333 genrange(STR *s)
334 {
335 	int stopval;
336 	const char *savestart;
337 
338 	savestart = s->str++;
339 	stopval = s->str[0] == '\\' ? backslash(s) : (unsigned char)*s->str++;
340 	if (stopval < (unsigned char)s->lastch) {
341 		s->str = savestart;
342 		return 0;
343 	}
344 	s->cnt = stopval - s->lastch + 1;
345 	s->state = RANGE;
346 	--s->lastch;
347 	return 1;
348 }
349 
350 static void
genseq(STR * s)351 genseq(STR *s)
352 {
353 	char *ep;
354 
355 	if (s->which == STRING1) {
356 		errx(1, "Sequences only valid in string2");
357 	}
358 
359 	if (*s->str == '\\') {
360 		s->lastch = backslash(s);
361 	} else {
362 		s->lastch = (unsigned char)*s->str++;
363 	}
364 	if (*s->str != '*') {
365 		errx(1, "Misplaced sequence asterisk");
366 	}
367 
368 	s->str++;
369 	switch (s->str[0]) {
370 	case '\\':
371 		s->cnt = backslash(s);
372 		break;
373 	case ']':
374 		s->cnt = 0;
375 		++s->str;
376 		break;
377 	default:
378 		if (isdigit((unsigned char)s->str[0])) {
379 			s->cnt = strtol(s->str, &ep, 0);
380 			if (*ep == ']') {
381 				s->str = ep + 1;
382 				break;
383 			}
384 		}
385 		errx(1, "illegal sequence count");
386 		/* NOTREACHED */
387 	}
388 
389 	s->state = s->cnt ? SEQUENCE : INFINITE;
390 }
391 
392 /*
393  * Translate \??? into a character.  Up to 3 octal digits, if no digits either
394  * an escape code or a literal character.
395  */
396 static int
backslash(STR * s)397 backslash(STR *s)
398 {
399 	int ch, cnt, val;
400 
401 	cnt = val = 0;
402 	for (;;) {
403 		/* Consume the character we're already on. */
404 		s->str++;
405 
406 		/* Look at the next character. */
407 		ch = (unsigned char)s->str[0];
408 		if (!isascii(ch) || !isdigit(ch)) {
409 			break;
410 		}
411 		val = val * 8 + ch - '0';
412 		if (++cnt == 3) {
413 			/* Enough digits; consume this one and stop */
414 			++s->str;
415 			break;
416 		}
417 	}
418 	if (cnt) {
419 		/* We saw digits, so return their value */
420 		if (val >= OOBCH)
421 			errx(1, "Invalid octal character value");
422 		return val;
423 	}
424 	if (ch == '\0') {
425 		/* \<end> -> \ */
426 		s->state = EOS;
427 		return '\\';
428 	}
429 
430 	/* Consume the escaped character */
431 	s->str++;
432 
433 	switch (ch) {
434 	case 'a':			/* escape characters */
435 		return '\7';
436 	case 'b':
437 		return '\b';
438 	case 'e':
439 		return '\033';
440 	case 'f':
441 		return '\f';
442 	case 'n':
443 		return '\n';
444 	case 'r':
445 		return '\r';
446 	case 't':
447 		return '\t';
448 	case 'v':
449 		return '\13';
450 	default:			/* \q -> q */
451 		return ch;
452 	}
453 }
454