xref: /netbsd-src/usr.bin/sed/compile.c (revision 811e6386f8c5e4a3521c7003da29ec8673e344fa)
1 /*-
2  * Copyright (c) 1992 Diomidis Spinellis.
3  * Copyright (c) 1992 The Regents of the University of California.
4  * All rights reserved.
5  *
6  * This code is derived from software contributed to Berkeley by
7  * Diomidis Spinellis of Imperial College, University of London.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  * 3. All advertising materials mentioning features or use of this software
18  *    must display the following acknowledgement:
19  *	This product includes software developed by the University of
20  *	California, Berkeley and its contributors.
21  * 4. Neither the name of the University nor the names of its contributors
22  *    may be used to endorse or promote products derived from this software
23  *    without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
26  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
28  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
29  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
30  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
31  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
32  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
33  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
34  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35  * SUCH DAMAGE.
36  */
37 
38 #ifndef lint
39 static char sccsid[] = "@(#)compile.c	5.6 (Berkeley) 11/2/92";
40 #endif /* not lint */
41 
42 #include <sys/types.h>
43 #include <sys/stat.h>
44 
45 #include <ctype.h>
46 #include <errno.h>
47 #include <fcntl.h>
48 #include <limits.h>
49 #include <regex.h>
50 #include <stdio.h>
51 #include <stdlib.h>
52 #include <string.h>
53 
54 #include "defs.h"
55 #include "extern.h"
56 
57 static char	 *compile_addr __P((char *, struct s_addr *));
58 static char	 *compile_delimited __P((char *, char *));
59 static char	 *compile_flags __P((char *, struct s_subst *));
60 static char	 *compile_re __P((char *, regex_t **));
61 static char	 *compile_subst __P((char *, struct s_subst *));
62 static char	 *compile_text __P((void));
63 static char	 *compile_tr __P((char *, char **));
64 static struct s_command
65 		**compile_stream __P((char *, struct s_command **, char *));
66 static char	 *duptoeol __P((char *));
67 static struct s_command
68 		 *findlabel __P((struct s_command *, struct s_command *));
69 static void	  fixuplabel __P((struct s_command *, struct s_command *,
70 		  	struct s_command *));
71 
72 /*
73  * Command specification.  This is used to drive the command parser.
74  */
75 struct s_format {
76 	char code;				/* Command code */
77 	int naddr;				/* Number of address args */
78 	enum e_args args;			/* Argument type */
79 };
80 
81 static struct s_format cmd_fmts[] = {
82 	{'{', 2, GROUP},
83 	{'a', 1, TEXT},
84 	{'b', 2, BRANCH},
85 	{'c', 2, TEXT},
86 	{'d', 2, EMPTY},
87 	{'D', 2, EMPTY},
88 	{'g', 2, EMPTY},
89 	{'G', 2, EMPTY},
90 	{'h', 2, EMPTY},
91 	{'H', 2, EMPTY},
92 	{'i', 1, TEXT},
93 	{'l', 2, EMPTY},
94 	{'n', 2, EMPTY},
95 	{'N', 2, EMPTY},
96 	{'p', 2, EMPTY},
97 	{'P', 2, EMPTY},
98 	{'q', 1, EMPTY},
99 	{'r', 1, RFILE},
100 	{'s', 2, SUBST},
101 	{'t', 2, BRANCH},
102 	{'w', 2, WFILE},
103 	{'x', 2, EMPTY},
104 	{'y', 2, TR},
105 	{'!', 2, NONSEL},
106 	{':', 0, LABEL},
107 	{'#', 0, COMMENT},
108 	{'=', 1, EMPTY},
109 	{'\0', 0, COMMENT},
110 };
111 
112 /* The compiled program. */
113 struct s_command *prog;
114 
115 /*
116  * Compile the program into prog.
117  * Initialise appends.
118  */
119 void
120 compile()
121 {
122 	*compile_stream(NULL, &prog, NULL) = NULL;
123 	fixuplabel(prog, prog, NULL);
124 	appends = xmalloc(sizeof(struct s_appends) * appendnum);
125 	match = xmalloc((maxnsub + 1) * sizeof(regmatch_t));
126 }
127 
128 #define EATSPACE() do {							\
129 	if (p)								\
130 		while (*p && isascii(*p) && isspace(*p))		\
131 			p++;						\
132 	} while (0)
133 
134 static struct s_command **
135 compile_stream(terminator, link, p)
136 	char *terminator;
137 	struct s_command **link;
138 	register char *p;
139 {
140 	static char lbuf[_POSIX2_LINE_MAX + 1];	/* To save stack */
141 	struct s_command *cmd, *cmd2;
142 	struct s_format *fp;
143 	int naddr;				/* Number of addresses */
144 
145 	if (p != NULL)
146 		goto semicolon;
147 	for (;;) {
148 		if ((p = cu_fgets(lbuf, sizeof(lbuf))) == NULL) {
149 			if (terminator != NULL)
150 				err(COMPILE, "unexpected EOF (pending }'s)");
151 			return (link);
152 		}
153 
154 semicolon:	EATSPACE();
155 		if (p && (*p == '#' || *p == '\0'))
156 			continue;
157 		if (*p == '}') {
158 			if (terminator == NULL)
159 				err(COMPILE, "unexpected }");
160 			return (link);
161 		}
162 		*link = cmd = xmalloc(sizeof(struct s_command));
163 		link = &cmd->next;
164 		cmd->nonsel = cmd->inrange = 0;
165 		/* First parse the addresses */
166 		naddr = 0;
167 		cmd->a1 = cmd->a2 = NULL;
168 
169 /* Valid characters to start an address */
170 #define	addrchar(c)	(strchr("0123456789/\\$", (c)))
171 		if (addrchar(*p)) {
172 			naddr++;
173 			cmd->a1 = xmalloc(sizeof(struct s_addr));
174 			p = compile_addr(p, cmd->a1);
175 			EATSPACE();				/* EXTENSION */
176 			if (*p == ',') {
177 				naddr++;
178 				p++;
179 				EATSPACE();			/* EXTENSION */
180 				cmd->a2 = xmalloc(sizeof(struct s_addr));
181 				p = compile_addr(p, cmd->a2);
182 			}
183 		}
184 
185 nonsel:		/* Now parse the command */
186 		EATSPACE();
187 		if (!*p)
188 			err(COMPILE, "command expected");
189 		cmd->code = *p;
190 		for (fp = cmd_fmts; fp->code; fp++)
191 			if (fp->code == *p)
192 				break;
193 		if (!fp->code)
194 			err(COMPILE, "invalid command code %c", *p);
195 		if (naddr > fp->naddr)
196 			err(COMPILE,
197 "command %c expects up to %d address(es), found %d", *p, fp->naddr, naddr);
198 		switch (fp->args) {
199 		case NONSEL:			/* ! */
200 			cmd->nonsel = ! cmd->nonsel;
201 			p++;
202 			goto nonsel;
203 		case GROUP:			/* { */
204 			p++;
205 			EATSPACE();
206 			if (!*p)
207 				p = NULL;
208 			cmd2 = xmalloc(sizeof(struct s_command));
209 			cmd2->code = '}';
210 			*compile_stream("}", &cmd->u.c, p) = cmd2;
211 			cmd->next = cmd2;
212 			link = &cmd2->next;
213 			break;
214 		case EMPTY:		/* d D g G h H l n N p P q x = \0 */
215 			p++;
216 			EATSPACE();
217 			if (*p == ';') {
218 				p++;
219 				link = &cmd->next;
220 				goto semicolon;
221 			}
222 			if (*p)
223 				err(COMPILE,
224 "extra characters at the end of %c command", cmd->code);
225 			break;
226 		case TEXT:			/* a c i */
227 			p++;
228 			EATSPACE();
229 			if (*p != '\\')
230 				err(COMPILE,
231 "command %c expects \\ followed by text", cmd->code);
232 			p++;
233 			EATSPACE();
234 			if (*p)
235 				err(COMPILE,
236 "extra characters after \\ at the end of %c command", cmd->code);
237 			cmd->t = compile_text();
238 			break;
239 		case COMMENT:			/* \0 # */
240 			break;
241 		case WFILE:			/* w */
242 			p++;
243 			EATSPACE();
244 			if (*p == '\0')
245 				err(COMPILE, "filename expected");
246 			cmd->t = duptoeol(p);
247 			if (aflag)
248 				cmd->u.fd = -1;
249 			else if ((cmd->u.fd = open(p,
250 			    O_WRONLY|O_APPEND|O_CREAT|O_TRUNC,
251 			    DEFFILEMODE)) == -1)
252 				err(FATAL, "%s: %s\n", p, strerror(errno));
253 			break;
254 		case RFILE:			/* r */
255 			p++;
256 			EATSPACE();
257 			if (*p == '\0')
258 				err(COMPILE, "filename expected");
259 			else
260 				cmd->t = duptoeol(p);
261 			break;
262 		case BRANCH:			/* b t */
263 			p++;
264 			EATSPACE();
265 			if (*p == '\0')
266 				cmd->t = NULL;
267 			else
268 				cmd->t = duptoeol(p);
269 			break;
270 		case LABEL:			/* : */
271 			p++;
272 			EATSPACE();
273 			cmd->t = duptoeol(p);
274 			if (strlen(p) == 0)
275 				err(COMPILE, "empty label");
276 			break;
277 		case SUBST:			/* s */
278 			p++;
279 			if (*p == '\0' || *p == '\\')
280 				err(COMPILE,
281 "substitute pattern can not be delimited by newline or backslash");
282 			cmd->u.s = xmalloc(sizeof(struct s_subst));
283 			p = compile_re(p, &cmd->u.s->re);
284 			if (p == NULL)
285 				err(COMPILE, "unterminated substitute pattern");
286 			--p;
287 			p = compile_subst(p, cmd->u.s);
288 			p = compile_flags(p, cmd->u.s);
289 			EATSPACE();
290 			if (*p == ';') {
291 				p++;
292 				link = &cmd->next;
293 				goto semicolon;
294 			}
295 			break;
296 		case TR:			/* y */
297 			p++;
298 			p = compile_tr(p, (char **)&cmd->u.y);
299 			EATSPACE();
300 			if (*p == ';') {
301 				p++;
302 				link = &cmd->next;
303 				goto semicolon;
304 			}
305 			if (*p)
306 				err(COMPILE,
307 "extra text at the end of a transform command");
308 			break;
309 		}
310 	}
311 }
312 
313 /*
314  * Get a delimited string.  P points to the delimeter of the string; d points
315  * to a buffer area.  Newline and delimiter escapes are processed; other
316  * escapes are ignored.
317  *
318  * Returns a pointer to the first character after the final delimiter or NULL
319  * in the case of a non-terminated string.  The character array d is filled
320  * with the processed string.
321  */
322 static char *
323 compile_delimited(p, d)
324 	char *p, *d;
325 {
326 	char c;
327 
328 	c = *p++;
329 	if (c == '\0')
330 		return (NULL);
331 	else if (c == '\\')
332 		err(COMPILE, "\\ can not be used as a string delimiter");
333 	else if (c == '\n')
334 		err(COMPILE, "newline can not be used as a string delimiter");
335 	while (*p) {
336 		if (*p == '\\' && p[1] == c)
337 			p++;
338 		else if (*p == '\\' && p[1] == 'n') {
339 			*d++ = '\n';
340 			p += 2;
341 			continue;
342 		} else if (*p == '\\' && p[1] == '\\')
343 			*d++ = *p++;
344 		else if (*p == c) {
345 			*d = '\0';
346 			return (p + 1);
347 		}
348 		*d++ = *p++;
349 	}
350 	return (NULL);
351 }
352 
353 /*
354  * Get a regular expression.  P points to the delimiter of the regular
355  * expression; repp points to the address of a regexp pointer.  Newline
356  * and delimiter escapes are processed; other escapes are ignored.
357  * Returns a pointer to the first character after the final delimiter
358  * or NULL in the case of a non terminated regular expression.  The regexp
359  * pointer is set to the compiled regular expression.
360  * Cflags are passed to regcomp.
361  */
362 static char *
363 compile_re(p, repp)
364 	char *p;
365 	regex_t **repp;
366 {
367 	int eval;
368 	char re[_POSIX2_LINE_MAX + 1];
369 
370 	p = compile_delimited(p, re);
371 	if (p && strlen(re) == 0) {
372 		*repp = NULL;
373 		return (p);
374 	}
375 	*repp = xmalloc(sizeof(regex_t));
376 #ifdef GNU_REGEX
377 	/* initialize pattern buffer */
378 	(*repp)->buffer = NULL;
379 	(*repp)->allocated = 0L;
380 	(*repp)->fastmap = (char *) malloc(FASTMAP_SIZE);
381 	(*repp)->translate = 0;
382 #endif
383 	if (p && (eval = regcomp(*repp, re, 0)) != 0)
384 		err(COMPILE, "RE error: %s", strregerror(eval, *repp));
385 	if (maxnsub < (*repp)->re_nsub)
386 		maxnsub = (*repp)->re_nsub;
387 	return (p);
388 }
389 
390 /*
391  * Compile the substitution string of a regular expression and set res to
392  * point to a saved copy of it.  Nsub is the number of parenthesized regular
393  * expressions.
394  */
395 static char *
396 compile_subst(p, s)
397 	char *p;
398 	struct s_subst *s;
399 {
400 	static char lbuf[_POSIX2_LINE_MAX + 1];
401 	int asize, ref, size;
402 	char c, *text, *op, *sp;
403 
404 	c = *p++;			/* Terminator character */
405 	if (c == '\0')
406 		return (NULL);
407 
408 	s->maxbref = 0;
409 	s->linenum = linenum;
410 	asize = 2 * _POSIX2_LINE_MAX + 1;
411 	text = xmalloc(asize);
412 	size = 0;
413 	do {
414 		op = sp = text + size;
415 		for (; *p; p++) {
416 			if (*p == '\\') {
417 				p++;
418 				if (strchr("123456789", *p) != NULL) {
419 					*sp++ = '\\';
420 					ref = *p - '0';
421 					if (s->re != NULL &&
422 					    ref > s->re->re_nsub)
423 						err(COMPILE,
424 "\\%c not defined in the RE", *p);
425 					if (s->maxbref < ref)
426 						s->maxbref = ref;
427 				} else if (*p == '&' || *p == '\\')
428 					*sp++ = '\\';
429 			} else if (*p == c) {
430 				p++;
431 				*sp++ = '\0';
432 				size += sp - op;
433 				s->new = xrealloc(text, size);
434 				return (p);
435 			} else if (*p == '\n') {
436 				err(COMPILE,
437 "unescaped newline inside substitute pattern");
438 				/* NOTREACHED */
439 			}
440 			*sp++ = *p;
441 		}
442 		size += sp - op;
443 		if (asize - size < _POSIX2_LINE_MAX + 1) {
444 			asize *= 2;
445 			text = xmalloc(asize);
446 		}
447 	} while (cu_fgets(p = lbuf, sizeof(lbuf)));
448 	err(COMPILE, "unterminated substitute in regular expression");
449 	/* NOTREACHED */
450 }
451 
452 /*
453  * Compile the flags of the s command
454  */
455 static char *
456 compile_flags(p, s)
457 	char *p;
458 	struct s_subst *s;
459 {
460 	int gn;			/* True if we have seen g or n */
461 	char wfile[_POSIX2_LINE_MAX + 1], *q;
462 
463 	s->n = 1;				/* Default */
464 	s->p = 0;
465 	s->wfile = NULL;
466 	s->wfd = -1;
467 	for (gn = 0;;) {
468 		EATSPACE();			/* EXTENSION */
469 		switch (*p) {
470 		case 'g':
471 			if (gn)
472 				err(COMPILE,
473 "more than one number or 'g' in substitute flags");
474 			gn = 1;
475 			s->n = 0;
476 			break;
477 		case '\0':
478 		case '\n':
479 		case ';':
480 			return (p);
481 		case 'p':
482 			s->p = 1;
483 			break;
484 		case '1': case '2': case '3':
485 		case '4': case '5': case '6':
486 		case '7': case '8': case '9':
487 			if (gn)
488 				err(COMPILE,
489 "more than one number or 'g' in substitute flags");
490 			gn = 1;
491 			/* XXX Check for overflow */
492 			s->n = (int)strtol(p, &p, 10);
493 			break;
494 		case 'w':
495 			p++;
496 #ifdef HISTORIC_PRACTICE
497 			if (*p != ' ') {
498 				err(WARNING, "space missing before w wfile");
499 				return (p);
500 			}
501 #endif
502 			EATSPACE();
503 			q = wfile;
504 			while (*p) {
505 				if (*p == '\n')
506 					break;
507 				*q++ = *p++;
508 			}
509 			*q = '\0';
510 			if (q == wfile)
511 				err(COMPILE, "no wfile specified");
512 			s->wfile = strdup(wfile);
513 			if (!aflag && (s->wfd = open(wfile,
514 			    O_WRONLY|O_APPEND|O_CREAT|O_TRUNC,
515 			    DEFFILEMODE)) == -1)
516 				err(FATAL, "%s: %s\n", wfile, strerror(errno));
517 			return (p);
518 		default:
519 			err(COMPILE,
520 			    "bad flag in substitute command: '%c'", *p);
521 			break;
522 		}
523 		p++;
524 	}
525 }
526 
527 /*
528  * Compile a translation set of strings into a lookup table.
529  */
530 static char *
531 compile_tr(p, transtab)
532 	char *p;
533 	char **transtab;
534 {
535 	int i;
536 	char *lt, *op, *np;
537 	char old[_POSIX2_LINE_MAX + 1];
538 	char new[_POSIX2_LINE_MAX + 1];
539 
540 	if (*p == '\0' || *p == '\\')
541 		err(COMPILE,
542 "transform pattern can not be delimited by newline or backslash");
543 	p = compile_delimited(p, old);
544 	if (p == NULL) {
545 		err(COMPILE, "unterminated transform source string");
546 		return (NULL);
547 	}
548 	p = compile_delimited(--p, new);
549 	if (p == NULL) {
550 		err(COMPILE, "unterminated transform target string");
551 		return (NULL);
552 	}
553 	EATSPACE();
554 	if (strlen(new) != strlen(old)) {
555 		err(COMPILE, "transform strings are not the same length");
556 		return (NULL);
557 	}
558 	/* We assume characters are 8 bits */
559 	lt = xmalloc(UCHAR_MAX);
560 	for (i = 0; i <= UCHAR_MAX; i++)
561 		lt[i] = (char)i;
562 	for (op = old, np = new; *op; op++, np++)
563 		lt[(u_char)*op] = *np;
564 	*transtab = lt;
565 	return (p);
566 }
567 
568 /*
569  * Compile the text following an a or i command.
570  */
571 static char *
572 compile_text()
573 {
574 	int asize, size;
575 	char *text, *p, *op, *s;
576 	char lbuf[_POSIX2_LINE_MAX + 1];
577 
578 	asize = 2 * _POSIX2_LINE_MAX + 1;
579 	text = xmalloc(asize);
580 	size = 0;
581 	while (cu_fgets(lbuf, sizeof(lbuf))) {
582 		op = s = text + size;
583 		p = lbuf;
584 		EATSPACE();
585 		for (; *p; p++) {
586 			if (*p == '\\')
587 				p++;
588 			*s++ = *p;
589 		}
590 		size += s - op;
591 		if (p[-2] != '\\') {
592 			*s = '\0';
593 			break;
594 		}
595 		if (asize - size < _POSIX2_LINE_MAX + 1) {
596 			asize *= 2;
597 			text = xmalloc(asize);
598 		}
599 	}
600 	return (xrealloc(text, size + 1));
601 }
602 
603 /*
604  * Get an address and return a pointer to the first character after
605  * it.  Fill the structure pointed to according to the address.
606  */
607 static char *
608 compile_addr(p, a)
609 	char *p;
610 	struct s_addr *a;
611 {
612 	char *end;
613 
614 	switch (*p) {
615 	case '\\':				/* Context address */
616 		++p;
617 		/* FALLTHROUGH */
618 	case '/':				/* Context address */
619 		p = compile_re(p, &a->u.r);
620 		if (p == NULL)
621 			err(COMPILE, "unterminated regular expression");
622 		a->type = AT_RE;
623 		return (p);
624 
625 	case '$':				/* Last line */
626 		a->type = AT_LAST;
627 		return (p + 1);
628 						/* Line number */
629 	case '0': case '1': case '2': case '3': case '4':
630 	case '5': case '6': case '7': case '8': case '9':
631 		a->type = AT_LINE;
632 		a->u.l = strtol(p, &end, 10);
633 		return (end);
634 	default:
635 		err(COMPILE, "expected context address");
636 		return (NULL);
637 	}
638 }
639 
640 /*
641  * Return a copy of all the characters up to \n or \0
642  */
643 static char *
644 duptoeol(s)
645 	register char *s;
646 {
647 	size_t len;
648 	char *start;
649 
650 	for (start = s; *s != '\0' && *s != '\n'; ++s);
651 	*s = '\0';
652 	len = s - start + 1;
653 	return (memmove(xmalloc(len), start, len));
654 }
655 
656 /*
657  * Find the label contained in the command l in the command linked list cp.
658  * L is excluded from the search.  Return NULL if not found.
659  */
660 static struct s_command *
661 findlabel(l, cp)
662 	struct s_command *l, *cp;
663 {
664 	struct s_command *r;
665 
666 	for (; cp; cp = cp->next)
667 		if (cp->code == ':' && cp != l && strcmp(l->t, cp->t) == 0)
668 			return (cp);
669 		else if (cp->code == '{' && (r = findlabel(l, cp->u.c)))
670 			return (r);
671 	return (NULL);
672 }
673 
674 /*
675  * Convert goto label names to addresses.
676  * Detect duplicate labels.
677  * Set appendnum to the number of a and r commands in the script.
678  * Free the memory used by labels in b and t commands (but not by :)
679  * Root is a pointer to the script linked list; cp points to the
680  * search start.
681  * TODO: Remove } nodes
682  */
683 static void
684 fixuplabel(root, cp, end)
685 	struct s_command *root, *cp, *end;
686 {
687 	struct s_command *cp2;
688 
689 	for (; cp != end; cp = cp->next)
690 		switch (cp->code) {
691 		case ':':
692 			if (findlabel(cp, root))
693 				err(COMPILE2, "duplicate label %s", cp->t);
694 			break;
695 		case 'a':
696 		case 'r':
697 			appendnum++;
698 			break;
699 		case 'b':
700 		case 't':
701 			if (cp->t == NULL) {
702 				cp->u.c = NULL;
703 				break;
704 			}
705 			if ((cp2 = findlabel(cp, root)) == NULL)
706 				err(COMPILE2, "undefined label '%s'", cp->t);
707 			free(cp->t);
708 			cp->u.c = cp2;
709 			break;
710 		case '{':
711 			fixuplabel(root, cp->u.c, cp->next);
712 			break;
713 		}
714 }
715