/* xref: /csrg-svn/usr.bin/tr/str.c (revision 51396) */
/*-
 * Copyright (c) 1991 The Regents of the University of California.
 * All rights reserved.
 *
 * %sccs.include.redist.c%
 */
7*51396Sbostic 
8*51396Sbostic #ifndef lint
9*51396Sbostic static char sccsid[] = "@(#)str.c	5.1 (Berkeley) 10/24/91";
10*51396Sbostic #endif /* not lint */
11*51396Sbostic 
12*51396Sbostic #include <sys/cdefs.h>
13*51396Sbostic #include <sys/types.h>
14*51396Sbostic #include <errno.h>
15*51396Sbostic #include <stdio.h>
16*51396Sbostic #include <stddef.h>
17*51396Sbostic #include <stdlib.h>
18*51396Sbostic #include <string.h>
19*51396Sbostic #include "extern.h"
20*51396Sbostic 
21*51396Sbostic static int	backslash __P((STR *));
22*51396Sbostic static int	bracket __P((STR *));
23*51396Sbostic static int	c_class __P((const void *, const void *));
24*51396Sbostic static void	genclass __P((STR *));
25*51396Sbostic static void	genequiv __P((STR *));
26*51396Sbostic static int	genrange __P((STR *));
27*51396Sbostic static void	genseq __P((STR *));
28*51396Sbostic 
29*51396Sbostic int
30*51396Sbostic next(s)
31*51396Sbostic 	register STR *s;
32*51396Sbostic {
33*51396Sbostic 	register int ch;
34*51396Sbostic 
35*51396Sbostic 	switch (s->state) {
36*51396Sbostic 	case EOS:
37*51396Sbostic 		return (0);
38*51396Sbostic 	case INFINITE:
39*51396Sbostic 		return (1);
40*51396Sbostic 	case NORMAL:
41*51396Sbostic 		switch (ch = *s->str++) {
42*51396Sbostic 		case '\0':
43*51396Sbostic 			--s->str;
44*51396Sbostic 			s->state = EOS;
45*51396Sbostic 			return (0);
46*51396Sbostic 		case '\\':
47*51396Sbostic 			s->lastch = backslash(s);
48*51396Sbostic 			break;
49*51396Sbostic 		case '[':
50*51396Sbostic 			if (bracket(s))
51*51396Sbostic 				return (next(s));
52*51396Sbostic 			/* FALLTHROUGH */
53*51396Sbostic 		default:
54*51396Sbostic 			s->lastch = ch;
55*51396Sbostic 			break;
56*51396Sbostic 		}
57*51396Sbostic 
58*51396Sbostic 		/* We can start a range at any time. */
59*51396Sbostic 		if (s->str[0] == '-' && genrange(s))
60*51396Sbostic 			return (next(s));
61*51396Sbostic 		return (1);
62*51396Sbostic 	case RANGE:
63*51396Sbostic 		if (s->cnt-- == 0) {
64*51396Sbostic 			s->state = NORMAL;
65*51396Sbostic 			return (next(s));
66*51396Sbostic 		}
67*51396Sbostic 		++s->lastch;
68*51396Sbostic 		return (1);
69*51396Sbostic 	case SEQUENCE:
70*51396Sbostic 		if (s->cnt-- == 0) {
71*51396Sbostic 			s->state = NORMAL;
72*51396Sbostic 			return (next(s));
73*51396Sbostic 		}
74*51396Sbostic 		return (1);
75*51396Sbostic 	case SET:
76*51396Sbostic 	case ULSET:
77*51396Sbostic 		if ((s->lastch = s->set[s->cnt++]) == OOBCH) {
78*51396Sbostic 			s->state = NORMAL;
79*51396Sbostic 			return (next(s));
80*51396Sbostic 		}
81*51396Sbostic 		return (1);
82*51396Sbostic 	}
83*51396Sbostic 	/* NOTREACHED */
84*51396Sbostic }
85*51396Sbostic 
86*51396Sbostic static int
87*51396Sbostic bracket(s)
88*51396Sbostic 	register STR *s;
89*51396Sbostic {
90*51396Sbostic 	register char *p;
91*51396Sbostic 
92*51396Sbostic 	switch (*s->str) {
93*51396Sbostic 	case ':':				/* "[:class:]" */
94*51396Sbostic 		if ((p = strpbrk(s->str + 1, ":]")) == NULL)
95*51396Sbostic 			return (0);
96*51396Sbostic 		if (p[0] != ':' || p[1] != ']')
97*51396Sbostic 			return (0);
98*51396Sbostic 		*p = '\0';
99*51396Sbostic 		++s->str;
100*51396Sbostic 		genclass(s);
101*51396Sbostic 		s->str = p + 2;
102*51396Sbostic 		return (1);
103*51396Sbostic 	case '=':				/* "[=equiv=]" */
104*51396Sbostic 		if ((p = strpbrk(s->str + 1, "=]")) == NULL)
105*51396Sbostic 			return (0);
106*51396Sbostic 		if (p[0] != '=' || p[1] != ']')
107*51396Sbostic 			return (0);
108*51396Sbostic 		genequiv(s);
109*51396Sbostic 		return (1);
110*51396Sbostic 	default:				/* "[\###*]" or "[#*]" */
111*51396Sbostic 		if ((p = strpbrk(s->str + 1, "*]")) == NULL)
112*51396Sbostic 			return (0);
113*51396Sbostic 		if (p[0] != '*' || index(p, ']') == NULL)
114*51396Sbostic 			return (0);
115*51396Sbostic 		genseq(s);
116*51396Sbostic 		return (1);
117*51396Sbostic 	}
118*51396Sbostic 	/* NOTREACHED */
119*51396Sbostic }
120*51396Sbostic 
121*51396Sbostic int isalnum __P((int)),
122*51396Sbostic     isalpha __P((int)),
123*51396Sbostic     isblank __P((int)),
124*51396Sbostic     isspace __P((int)),
125*51396Sbostic     iscntrl __P((int)),
126*51396Sbostic     isdigit __P((int)),
127*51396Sbostic     isgraph __P((int)),
128*51396Sbostic     islower __P((int)),
129*51396Sbostic     isprint __P((int)),
130*51396Sbostic     ispunct __P((int)),
131*51396Sbostic     isupper __P((int)),
132*51396Sbostic     isxdigit __P((int));
133*51396Sbostic 
134*51396Sbostic typedef struct {
135*51396Sbostic 	char *name;
136*51396Sbostic 	int (*func) __P((int));
137*51396Sbostic 	u_int type;
138*51396Sbostic 	int *set;
139*51396Sbostic } CLASS;
140*51396Sbostic 
141*51396Sbostic static CLASS classes[] = {
142*51396Sbostic 	{ "alnum",  isalnum,  T_CLASS, },
143*51396Sbostic 	{ "alpha",  isalpha,  T_CLASS, },
144*51396Sbostic 	{ "blank",  isblank,  T_CLASS, },
145*51396Sbostic 	{ "cntrl",  iscntrl,  T_CLASS, },
146*51396Sbostic 	{ "digit",  isdigit,  T_CLASS, },
147*51396Sbostic 	{ "graph",  isgraph,  T_CLASS, },
148*51396Sbostic 	{ "lower",  islower,  T_UL, },
149*51396Sbostic 	{ "print",  isupper,  T_CLASS, },
150*51396Sbostic 	{ "punct",  ispunct,  T_CLASS, },
151*51396Sbostic 	{ "space",  isspace,  T_CLASS, },
152*51396Sbostic 	{ "upper",  isupper,  T_UL, },
153*51396Sbostic 	{ "xdigit", isxdigit, T_CLASS, },
154*51396Sbostic };
155*51396Sbostic 
156*51396Sbostic static void
157*51396Sbostic genclass(s)
158*51396Sbostic 	STR *s;
159*51396Sbostic {
160*51396Sbostic 	register int cnt, (*func) __P((int));
161*51396Sbostic 	CLASS *cp, tmp;
162*51396Sbostic 	int *p;
163*51396Sbostic 
164*51396Sbostic 	tmp.name = s->str;
165*51396Sbostic 	if ((cp = (CLASS *)bsearch(&tmp, classes, sizeof(classes) /
166*51396Sbostic 	    sizeof(CLASS), sizeof(CLASS), c_class)) == NULL)
167*51396Sbostic 		err("unknown class %s", s->str);
168*51396Sbostic 	if (!(cp->type | s->type))
169*51396Sbostic 		err("class %s illegally used");
170*51396Sbostic 
171*51396Sbostic 	if ((cp->set = p = malloc((NCHARS + 1) * sizeof(int))) == NULL)
172*51396Sbostic 		err("%s", strerror(errno));
173*51396Sbostic 	bzero(p, NCHARS);
174*51396Sbostic 	for (cnt = 0, func = cp->func; cnt < NCHARS; ++cnt)
175*51396Sbostic 		if ((func)(cnt))
176*51396Sbostic 			*p++ = cnt;
177*51396Sbostic 	*p = OOBCH;
178*51396Sbostic 
179*51396Sbostic 	s->cnt = 0;
180*51396Sbostic 	s->state = cp->type & T_UL ? ULSET : SET;
181*51396Sbostic 	s->set = cp->set;
182*51396Sbostic }
183*51396Sbostic 
184*51396Sbostic static int
185*51396Sbostic c_class(a, b)
186*51396Sbostic 	const void *a, *b;
187*51396Sbostic {
188*51396Sbostic 	return (strcmp(((CLASS *)a)->name, ((CLASS *)b)->name));
189*51396Sbostic }
190*51396Sbostic 
191*51396Sbostic /*
192*51396Sbostic  * English doesn't have any equivalence classes, so for now
193*51396Sbostic  * we just syntax check and grab the character.
194*51396Sbostic  */
195*51396Sbostic static void
196*51396Sbostic genequiv(s)
197*51396Sbostic 	STR *s;
198*51396Sbostic {
199*51396Sbostic 	static int val[2] = { 0, OOBCH };
200*51396Sbostic 
201*51396Sbostic 	if (*++s->str == '\\') {
202*51396Sbostic 		val[0] = backslash(s);
203*51396Sbostic 		if (*s->str != '=')
204*51396Sbostic 			err("misplaced equivalence equals sign");
205*51396Sbostic 	} else {
206*51396Sbostic 		val[0] = s->str[0];
207*51396Sbostic 		if (s->str[1] != '=')
208*51396Sbostic 			err("misplaced equivalence equals sign");
209*51396Sbostic 	}
210*51396Sbostic 	s->str += 2;
211*51396Sbostic 	s->cnt = 0;
212*51396Sbostic 	s->state = SET;
213*51396Sbostic 	s->set = val;
214*51396Sbostic }
215*51396Sbostic 
216*51396Sbostic static int
217*51396Sbostic genrange(s)
218*51396Sbostic 	STR *s;
219*51396Sbostic {
220*51396Sbostic 	int stopval;
221*51396Sbostic 	char *savestart;
222*51396Sbostic 
223*51396Sbostic 	savestart = s->str;
224*51396Sbostic 	stopval = *++s->str == '\\' ? backslash(s) : *s->str;
225*51396Sbostic 	if (stopval < s->lastch) {
226*51396Sbostic 		s->str = savestart;
227*51396Sbostic 		return (0);
228*51396Sbostic 	}
229*51396Sbostic 	s->cnt = stopval - s->lastch + 1;
230*51396Sbostic 	s->state = RANGE;
231*51396Sbostic 	--s->lastch;
232*51396Sbostic 	return (1);
233*51396Sbostic }
234*51396Sbostic 
235*51396Sbostic static void
236*51396Sbostic genseq(s)
237*51396Sbostic 	STR *s;
238*51396Sbostic {
239*51396Sbostic 	char *ep;
240*51396Sbostic 
241*51396Sbostic 	if (!(s->type & T_SEQ))
242*51396Sbostic 		err("sequences only valid in string1");
243*51396Sbostic 
244*51396Sbostic 	if (*s->str == '\\')
245*51396Sbostic 		s->lastch = backslash(s);
246*51396Sbostic 	else
247*51396Sbostic 		s->lastch = *s->str++;
248*51396Sbostic 	if (*s->str != '*')
249*51396Sbostic 		err("misplaced sequence asterisk");
250*51396Sbostic 
251*51396Sbostic 	switch (*++s->str) {
252*51396Sbostic 	case '\\':
253*51396Sbostic 		s->cnt = backslash(s);
254*51396Sbostic 		break;
255*51396Sbostic 	case ']':
256*51396Sbostic 		s->cnt = 0;
257*51396Sbostic 		++s->str;
258*51396Sbostic 		break;
259*51396Sbostic 	default:
260*51396Sbostic 		if (isdigit(*s->str)) {
261*51396Sbostic 			s->cnt = strtol(s->str, &ep, 0);
262*51396Sbostic 			if (*ep == ']') {
263*51396Sbostic 				s->str = ep + 1;
264*51396Sbostic 				break;
265*51396Sbostic 			}
266*51396Sbostic 		}
267*51396Sbostic 		err("illegal sequence count");
268*51396Sbostic 		/* NOTREACHED */
269*51396Sbostic 	}
270*51396Sbostic 
271*51396Sbostic 	s->state = s->cnt ? SEQUENCE : INFINITE;
272*51396Sbostic }
273*51396Sbostic 
274*51396Sbostic /* Use the #defines here, DON'T use them above. */
275*51396Sbostic #include <ctype.h>
276*51396Sbostic 
277*51396Sbostic /*
278*51396Sbostic  * Translate \??? into a character.  Up to 3 octal digits, if no digits either
279*51396Sbostic  * an escape code or a literal character.
280*51396Sbostic  */
281*51396Sbostic static int
282*51396Sbostic backslash(s)
283*51396Sbostic 	register STR *s;
284*51396Sbostic {
285*51396Sbostic 	register int ch, cnt, val;
286*51396Sbostic 
287*51396Sbostic 	for (cnt = val = 0;;) {
288*51396Sbostic 		ch = *++s->str;
289*51396Sbostic 		if (!isascii(ch) || !isdigit(ch))
290*51396Sbostic 			break;
291*51396Sbostic 		val = val * 8 + ch - '0';
292*51396Sbostic 		if (++cnt == 3)
293*51396Sbostic 			break;
294*51396Sbostic 	}
295*51396Sbostic 	if (cnt)
296*51396Sbostic 		return (val);
297*51396Sbostic 	++s->str;
298*51396Sbostic 	switch (ch) {
299*51396Sbostic 		case 'a':			/* escape characters */
300*51396Sbostic 			return ('\7');
301*51396Sbostic 		case 'b':
302*51396Sbostic 			return ('\b');
303*51396Sbostic 		case 'f':
304*51396Sbostic 			return ('\f');
305*51396Sbostic 		case 'n':
306*51396Sbostic 			return ('\n');
307*51396Sbostic 		case 'r':
308*51396Sbostic 			return ('\r');
309*51396Sbostic 		case 't':
310*51396Sbostic 			return ('\t');
311*51396Sbostic 		case 'v':
312*51396Sbostic 			return ('\13');
313*51396Sbostic 		case '\0':			/*  \" -> \ */
314*51396Sbostic 			s->state = EOS;
315*51396Sbostic 			return ('\\');
316*51396Sbostic 		default:			/* \x" -> x */
317*51396Sbostic 			return (ch);
318*51396Sbostic 	}
319*51396Sbostic }
320