1 /* $Source: /u/mark/src/pax/RCS/regexp.c,v $
2 *
3 * $Revision: 1.2 $
4 *
5 * regexp.c - regular expression matching
6 *
7 * DESCRIPTION
8 *
9 * Underneath the reformatting and comment blocks which were added to
10 * make it consistent with the rest of the code, you will find a
11 * modified version of Henry Specer's regular expression library.
12 * Henry's functions were modified to provide the minimal regular
13 * expression matching, as required by P1003. Henry's code was
14 * copyrighted, and copy of the copyright message and restrictions
15 * are provided, verbatim, below:
16 *
17 * Copyright (c) 1986 by University of Toronto.
18 * Written by Henry Spencer. Not derived from licensed software.
19 *
20 * Permission is granted to anyone to use this software for any
21 * purpose on any computer system, and to redistribute it freely,
22 * subject to the following restrictions:
23 *
24 * 1. The author is not responsible for the consequences of use of
25 * this software, no matter how awful, even if they arise
26 * from defects in it.
27 *
28 * 2. The origin of this software must not be misrepresented, either
29 * by explicit claim or by omission.
30 *
31 * 3. Altered versions must be plainly marked as such, and must not
32 * be misrepresented as being the original software.
33 *
34 * Beware that some of this code is subtly aware of the way operator
35 * precedence is structured in regular expressions. Serious changes in
36 * regular-expression syntax might require a total rethink.
37 *
38 * AUTHORS
39 *
40 * Mark H. Colburn, NAPS International (mark@jhereg.mn.org)
41 * Henry Spencer, University of Torronto (henry@utzoo.edu)
42 *
43 * Sponsored by The USENIX Association for public distribution.
44 *
45 * $Log: regexp.c,v $
46 * Revision 1.2 89/02/12 10:05:39 mark
47 * 1.2 release fixes
48 *
49 * Revision 1.1 88/12/23 18:02:32 mark
50 * Initial revision
51 *
52 */
53
54 /* Headers */
55
56 #include "pax.h"
57
58 #ifndef lint
59 static char *Ident = "$Id: regexp.c,v 1.2 89/02/12 10:05:39 mark Exp $";
60 #endif
61
62
63 /*
64 * The "internal use only" fields in regexp.h are present to pass info from
65 * compile to execute that permits the execute phase to run lots faster on
66 * simple cases. They are:
67 *
68 * regstart char that must begin a match; '\0' if none obvious
69 * reganch is the match anchored (at beginning-of-line only)?
70 * regmust string (pointer into program) that match must include, or NULL
71 * regmlen length of regmust string
72 *
73 * Regstart and reganch permit very fast decisions on suitable starting points
74 * for a match, cutting down the work a lot. Regmust permits fast rejection
75 * of lines that cannot possibly match. The regmust tests are costly enough
76 * that regcomp() supplies a regmust only if the r.e. contains something
77 * potentially expensive (at present, the only such thing detected is * or +
78 * at the start of the r.e., which can involve a lot of backup). Regmlen is
79 * supplied because the test in regexec() needs it and regcomp() is computing
80 * it anyway.
81 */
82
83 /*
84 * Structure for regexp "program". This is essentially a linear encoding
85 * of a nondeterministic finite-state machine (aka syntax charts or
86 * "railroad normal form" in parsing technology). Each node is an opcode
87 * plus a "nxt" pointer, possibly plus an operand. "Nxt" pointers of
88 * all nodes except BRANCH implement concatenation; a "nxt" pointer with
89 * a BRANCH on both ends of it is connecting two alternatives. (Here we
90 * have one of the subtle syntax dependencies: an individual BRANCH (as
91 * opposed to a collection of them) is never concatenated with anything
92 * because of operator precedence.) The operand of some types of node is
93 * a literal string; for others, it is a node leading into a sub-FSM. In
94 * particular, the operand of a BRANCH node is the first node of the branch.
95 * (NB this is *not* a tree structure: the tail of the branch connects
96 * to the thing following the set of BRANCHes.) The opcodes are:
97 */
98
99 /* definition number opnd? meaning */
100 #define END 0 /* no End of program. */
101 #define BOL 1 /* no Match "" at beginning of line. */
102 #define EOL 2 /* no Match "" at end of line. */
103 #define ANY 3 /* no Match any one character. */
104 #define ANYOF 4 /* str Match any character in this string. */
105 #define ANYBUT 5 /* str Match any character not in this
106 * string. */
107 #define BRANCH 6 /* node Match this alternative, or the
108 * nxt... */
109 #define BACK 7 /* no Match "", "nxt" ptr points backward. */
110 #define EXACTLY 8 /* str Match this string. */
111 #define NOTHING 9 /* no Match empty string. */
112 #define STAR 10 /* node Match this (simple) thing 0 or more
113 * times. */
114 #define OPEN 20 /* no Mark this point in input as start of
115 * #n. */
116 /* OPEN+1 is number 1, etc. */
117 #define CLOSE 30 /* no Analogous to OPEN. */
118
119 /*
120 * Opcode notes:
121 *
122 * BRANCH The set of branches constituting a single choice are hooked
123 * together with their "nxt" pointers, since precedence prevents
124 * anything being concatenated to any individual branch. The
125 * "nxt" pointer of the last BRANCH in a choice points to the
126 * thing following the whole choice. This is also where the
127 * final "nxt" pointer of each individual branch points; each
128 * branch starts with the operand node of a BRANCH node.
129 *
130 * BACK Normal "nxt" pointers all implicitly point forward; BACK
131 * exists to make loop structures possible.
132 *
133 * STAR complex '*', are implemented as circular BRANCH structures
134 * using BACK. Simple cases (one character per match) are
135 * implemented with STAR for speed and to minimize recursive
136 * plunges.
137 *
138 * OPEN,CLOSE ...are numbered at compile time.
139 */
140
141 /*
142 * A node is one char of opcode followed by two chars of "nxt" pointer.
143 * "Nxt" pointers are stored as two 8-bit pieces, high order first. The
144 * value is a positive offset from the opcode of the node containing it.
145 * An operand, if any, simply follows the node. (Note that much of the
146 * code generation knows about this implicit relationship.)
147 *
148 * Using two bytes for the "nxt" pointer is vast overkill for most things,
149 * but allows patterns to get big without disasters.
150 */
151 #define OP(p) (*(p))
152 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
153 #define OPERAND(p) ((p) + 3)
154
155 /*
156 * Utility definitions.
157 */
158
159 #define FAIL(m) { regerror(m); return(NULL); }
160 #define ISMULT(c) ((c) == '*')
161 #define META "^$.[()|*\\"
162 #ifndef CHARBITS
163 #define UCHARAT(p) ((int)*(unsigned char *)(p))
164 #else
165 #define UCHARAT(p) ((int)*(p)&CHARBITS)
166 #endif
167
168 /*
169 * Flags to be passed up and down.
170 */
171 #define HASWIDTH 01 /* Known never to match null string. */
172 #define SIMPLE 02 /* Simple enough to be STAR operand. */
173 #define SPSTART 04 /* Starts with * */
174 #define WORST 0 /* Worst case. */
175
176 /*
177 * Global work variables for regcomp().
178 */
179 static char *regparse; /* Input-scan pointer. */
180 static int regnpar; /* () count. */
181 static char regdummy;
182 static char *regcode; /* Code-emit pointer; ®dummy = don't. */
183 static long regsize; /* Code size. */
184
185 /*
186 * Forward declarations for regcomp()'s friends.
187 */
188 #ifndef STATIC
189 #define STATIC static
190 #endif
191 STATIC char *reg();
192 STATIC char *regbranch();
193 STATIC char *regpiece();
194 STATIC char *regatom();
195 STATIC char *regnode();
196 STATIC char *regnext();
197 STATIC void regc();
198 STATIC void reginsert();
199 STATIC void regtail();
200 STATIC void regoptail();
201 #ifdef STRCSPN
202 STATIC int strcspn();
203 #endif
204
205 /*
206 - regcomp - compile a regular expression into internal code
207 *
208 * We can't allocate space until we know how big the compiled form will be,
209 * but we can't compile it (and thus know how big it is) until we've got a
210 * place to put the code. So we cheat: we compile it twice, once with code
211 * generation turned off and size counting turned on, and once "for real".
212 * This also means that we don't allocate space until we are sure that the
213 * thing really will compile successfully, and we never have to move the
214 * code and thus invalidate pointers into it. (Note that it has to be in
215 * one piece because free() must be able to free it all.)
216 *
217 * Beware that the optimization-preparation code in here knows about some
218 * of the structure of the compiled regexp.
219 */
regcomp(exp)220 regexp *regcomp(exp)
221 char *exp;
222 {
223 register regexp *r;
224 register char *scan;
225 register char *longest;
226 register int len;
227 int flags;
228 extern char *malloc();
229
230 if (exp == (char *)NULL)
231 FAIL("NULL argument");
232
233 /* First pass: determine size, legality. */
234 regparse = exp;
235 regnpar = 1;
236 regsize = 0L;
237 regcode = ®dummy;
238 regc(MAGIC);
239 if (reg(0, &flags) == (char *)NULL)
240 return ((regexp *)NULL);
241
242 /* Small enough for pointer-storage convention? */
243 if (regsize >= 32767L) /* Probably could be 65535L. */
244 FAIL("regexp too big");
245
246 /* Allocate space. */
247 r = (regexp *) malloc(sizeof(regexp) + (unsigned) regsize);
248 if (r == (regexp *) NULL)
249 FAIL("out of space");
250
251 /* Second pass: emit code. */
252 regparse = exp;
253 regnpar = 1;
254 regcode = r->program;
255 regc(MAGIC);
256 if (reg(0, &flags) == NULL)
257 return ((regexp *) NULL);
258
259 /* Dig out information for optimizations. */
260 r->regstart = '\0'; /* Worst-case defaults. */
261 r->reganch = 0;
262 r->regmust = NULL;
263 r->regmlen = 0;
264 scan = r->program + 1; /* First BRANCH. */
265 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
266 scan = OPERAND(scan);
267
268 /* Starting-point info. */
269 if (OP(scan) == EXACTLY)
270 r->regstart = *OPERAND(scan);
271 else if (OP(scan) == BOL)
272 r->reganch++;
273
274 /*
275 * If there's something expensive in the r.e., find the longest
276 * literal string that must appear and make it the regmust. Resolve
277 * ties in favor of later strings, since the regstart check works
278 * with the beginning of the r.e. and avoiding duplication
279 * strengthens checking. Not a strong reason, but sufficient in the
280 * absence of others.
281 */
282 if (flags & SPSTART) {
283 longest = NULL;
284 len = 0;
285 for (; scan != NULL; scan = regnext(scan))
286 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
287 longest = OPERAND(scan);
288 len = strlen(OPERAND(scan));
289 }
290 r->regmust = longest;
291 r->regmlen = len;
292 }
293 }
294 return (r);
295 }
296
297 /*
298 - reg - regular expression, i.e. main body or parenthesized thing
299 *
300 * Caller must absorb opening parenthesis.
301 *
302 * Combining parenthesis handling with the base level of regular expression
303 * is a trifle forced, but the need to tie the tails of the branches to what
304 * follows makes it hard to avoid.
305 */
reg(paren,flagp)306 static char *reg(paren, flagp)
307 int paren; /* Parenthesized? */
308 int *flagp;
309 {
310 register char *ret;
311 register char *br;
312 register char *ender;
313 register int parno;
314 int flags;
315
316 *flagp = HASWIDTH; /* Tentatively. */
317
318 /* Make an OPEN node, if parenthesized. */
319 if (paren) {
320 if (regnpar >= NSUBEXP)
321 FAIL("too many ()");
322 parno = regnpar;
323 regnpar++;
324 ret = regnode(OPEN + parno);
325 } else
326 ret = (char *)NULL;
327
328 /* Pick up the branches, linking them together. */
329 br = regbranch(&flags);
330 if (br == (char *)NULL)
331 return ((char *)NULL);
332 if (ret != (char *)NULL)
333 regtail(ret, br); /* OPEN -> first. */
334 else
335 ret = br;
336 if (!(flags & HASWIDTH))
337 *flagp &= ~HASWIDTH;
338 *flagp |= flags & SPSTART;
339 while (*regparse == '|') {
340 regparse++;
341 br = regbranch(&flags);
342 if (br == (char *)NULL)
343 return ((char *)NULL);
344 regtail(ret, br); /* BRANCH -> BRANCH. */
345 if (!(flags & HASWIDTH))
346 *flagp &= ~HASWIDTH;
347 *flagp |= flags & SPSTART;
348 }
349
350 /* Make a closing node, and hook it on the end. */
351 ender = regnode((paren) ? CLOSE + parno : END);
352 regtail(ret, ender);
353
354 /* Hook the tails of the branches to the closing node. */
355 for (br = ret; br != (char *)NULL; br = regnext(br))
356 regoptail(br, ender);
357
358 /* Check for proper termination. */
359 if (paren && *regparse++ != ')') {
360 FAIL("unmatched ()");
361 } else if (!paren && *regparse != '\0') {
362 if (*regparse == ')') {
363 FAIL("unmatched ()");
364 } else
365 FAIL("junk on end");/* "Can't happen". */
366 /* NOTREACHED */
367 }
368 return (ret);
369 }
370
371 /*
372 - regbranch - one alternative of an | operator
373 *
374 * Implements the concatenation operator.
375 */
regbranch(flagp)376 static char *regbranch(flagp)
377 int *flagp;
378 {
379 register char *ret;
380 register char *chain;
381 register char *latest;
382 int flags;
383
384 *flagp = WORST; /* Tentatively. */
385
386 ret = regnode(BRANCH);
387 chain = (char *)NULL;
388 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
389 latest = regpiece(&flags);
390 if (latest == (char *)NULL)
391 return ((char *)NULL);
392 *flagp |= flags & HASWIDTH;
393 if (chain == (char *)NULL) /* First piece. */
394 *flagp |= flags & SPSTART;
395 else
396 regtail(chain, latest);
397 chain = latest;
398 }
399 if (chain == (char *)NULL) /* Loop ran zero times. */
400 regnode(NOTHING);
401
402 return (ret);
403 }
404
405 /*
406 - regpiece - something followed by possible [*]
407 *
408 * Note that the branching code sequence used for * is somewhat optimized:
409 * they use the same NOTHING node as both the endmarker for their branch
410 * list and the body of the last branch. It might seem that this node could
411 * be dispensed with entirely, but the endmarker role is not redundant.
412 */
regpiece(flagp)413 static char *regpiece(flagp)
414 int *flagp;
415 {
416 register char *ret;
417 register char op;
418 register char *nxt;
419 int flags;
420
421 ret = regatom(&flags);
422 if (ret == (char *)NULL)
423 return ((char *)NULL);
424
425 op = *regparse;
426 if (!ISMULT(op)) {
427 *flagp = flags;
428 return (ret);
429 }
430 if (!(flags & HASWIDTH))
431 FAIL("* operand could be empty");
432 *flagp = (WORST | SPSTART);
433
434 if (op == '*' && (flags & SIMPLE))
435 reginsert(STAR, ret);
436 else if (op == '*') {
437 /* Emit x* as (x&|), where & means "self". */
438 reginsert(BRANCH, ret); /* Either x */
439 regoptail(ret, regnode(BACK)); /* and loop */
440 regoptail(ret, ret); /* back */
441 regtail(ret, regnode(BRANCH)); /* or */
442 regtail(ret, regnode(NOTHING)); /* null. */
443 }
444 regparse++;
445 if (ISMULT(*regparse))
446 FAIL("nested *");
447
448 return (ret);
449 }
450
451 /*
452 - regatom - the lowest level
453 *
454 * Optimization: gobbles an entire sequence of ordinary characters so that
455 * it can turn them into a single node, which is smaller to store and
456 * faster to run. Backslashed characters are exceptions, each becoming a
457 * separate node; the code is simpler that way and it's not worth fixing.
458 */
regatom(flagp)459 static char *regatom(flagp)
460 int *flagp;
461 {
462 register char *ret;
463 int flags;
464
465 *flagp = WORST; /* Tentatively. */
466
467 switch (*regparse++) {
468 case '^':
469 ret = regnode(BOL);
470 break;
471 case '$':
472 ret = regnode(EOL);
473 break;
474 case '.':
475 ret = regnode(ANY);
476 *flagp |= HASWIDTH | SIMPLE;
477 break;
478 case '[':{
479 register int class;
480 register int classend;
481
482 if (*regparse == '^') { /* Complement of range. */
483 ret = regnode(ANYBUT);
484 regparse++;
485 } else
486 ret = regnode(ANYOF);
487 if (*regparse == ']' || *regparse == '-')
488 regc(*regparse++);
489 while (*regparse != '\0' && *regparse != ']') {
490 if (*regparse == '-') {
491 regparse++;
492 if (*regparse == ']' || *regparse == '\0')
493 regc('-');
494 else {
495 class = UCHARAT(regparse - 2) + 1;
496 classend = UCHARAT(regparse);
497 if (class > classend + 1)
498 FAIL("invalid [] range");
499 for (; class <= classend; class++)
500 regc(class);
501 regparse++;
502 }
503 } else
504 regc(*regparse++);
505 }
506 regc('\0');
507 if (*regparse != ']')
508 FAIL("unmatched []");
509 regparse++;
510 *flagp |= HASWIDTH | SIMPLE;
511 }
512 break;
513 case '(':
514 ret = reg(1, &flags);
515 if (ret == (char *)NULL)
516 return ((char *)NULL);
517 *flagp |= flags & (HASWIDTH | SPSTART);
518 break;
519 case '\0':
520 case '|':
521 case ')':
522 FAIL("internal urp"); /* Supposed to be caught earlier. */
523 break;
524 case '*':
525 FAIL("* follows nothing");
526 break;
527 case '\\':
528 if (*regparse == '\0')
529 FAIL("trailing \\");
530 ret = regnode(EXACTLY);
531 regc(*regparse++);
532 regc('\0');
533 *flagp |= HASWIDTH | SIMPLE;
534 break;
535 default:{
536 register int len;
537 register char ender;
538
539 regparse--;
540 len = strcspn(regparse, META);
541 if (len <= 0)
542 FAIL("internal disaster");
543 ender = *(regparse + len);
544 if (len > 1 && ISMULT(ender))
545 len--; /* Back off clear of * operand. */
546 *flagp |= HASWIDTH;
547 if (len == 1)
548 *flagp |= SIMPLE;
549 ret = regnode(EXACTLY);
550 while (len > 0) {
551 regc(*regparse++);
552 len--;
553 }
554 regc('\0');
555 }
556 break;
557 }
558
559 return (ret);
560 }
561
562 /*
563 - regnode - emit a node
564 */
regnode(op)565 static char *regnode(op)
566 char op;
567 {
568 register char *ret;
569 register char *ptr;
570
571 ret = regcode;
572 if (ret == ®dummy) {
573 regsize += 3;
574 return (ret);
575 }
576 ptr = ret;
577 *ptr++ = op;
578 *ptr++ = '\0'; /* Null "nxt" pointer. */
579 *ptr++ = '\0';
580 regcode = ptr;
581
582 return (ret);
583 }
584
585 /*
586 - regc - emit (if appropriate) a byte of code
587 */
regc(b)588 static void regc(b)
589 char b;
590 {
591 if (regcode != ®dummy)
592 *regcode++ = b;
593 else
594 regsize++;
595 }
596
597 /*
598 - reginsert - insert an operator in front of already-emitted operand
599 *
600 * Means relocating the operand.
601 */
reginsert(op,opnd)602 static void reginsert(op, opnd)
603 char op;
604 char *opnd;
605 {
606 register char *src;
607 register char *dst;
608 register char *place;
609
610 if (regcode == ®dummy) {
611 regsize += 3;
612 return;
613 }
614 src = regcode;
615 regcode += 3;
616 dst = regcode;
617 while (src > opnd)
618 *--dst = *--src;
619
620 place = opnd; /* Op node, where operand used to be. */
621 *place++ = op;
622 *place++ = '\0';
623 *place++ = '\0';
624 }
625
626 /*
627 - regtail - set the next-pointer at the end of a node chain
628 */
regtail(p,val)629 static void regtail(p, val)
630 char *p;
631 char *val;
632 {
633 register char *scan;
634 register char *temp;
635 register int offset;
636
637 if (p == ®dummy)
638 return;
639
640 /* Find last node. */
641 scan = p;
642 for (;;) {
643 temp = regnext(scan);
644 if (temp == (char *)NULL)
645 break;
646 scan = temp;
647 }
648
649 if (OP(scan) == BACK)
650 offset = scan - val;
651 else
652 offset = val - scan;
653 *(scan + 1) = (offset >> 8) & 0377;
654 *(scan + 2) = offset & 0377;
655 }
656
657 /*
658 - regoptail - regtail on operand of first argument; nop if operandless
659 */
regoptail(p,val)660 static void regoptail(p, val)
661 char *p;
662 char *val;
663 {
664 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
665 if (p == (char *)NULL || p == ®dummy || OP(p) != BRANCH)
666 return;
667 regtail(OPERAND(p), val);
668 }
669
670 /*
671 * regexec and friends
672 */
673
674 /*
675 * Global work variables for regexec().
676 */
677 static char *reginput; /* String-input pointer. */
678 static char *regbol; /* Beginning of input, for ^ check. */
679 static char **regstartp; /* Pointer to startp array. */
680 static char **regendp; /* Ditto for endp. */
681
682 /*
683 * Forwards.
684 */
685 STATIC int regtry();
686 STATIC int regmatch();
687 STATIC int regrepeat();
688
689 #ifdef DEBUG
690 int regnarrate = 0;
691 void regdump();
692 STATIC char *regprop();
693 #endif
694
695 /*
696 - regexec - match a regexp against a string
697 */
regexec(prog,string)698 int regexec(prog, string)
699 register regexp *prog;
700 register char *string;
701 {
702 register char *s;
703
704 /* Be paranoid... */
705 if (prog == (regexp *)NULL || string == (char *)NULL) {
706 regerror("NULL parameter");
707 return (0);
708 }
709 /* Check validity of program. */
710 if (UCHARAT(prog->program) != MAGIC) {
711 regerror("corrupted program");
712 return (0);
713 }
714 /* If there is a "must appear" string, look for it. */
715 if (prog->regmust != (char *)NULL) {
716 s = string;
717 while ((s = strchr(s, prog->regmust[0])) != (char *)NULL) {
718 if (strncmp(s, prog->regmust, prog->regmlen) == 0)
719 break; /* Found it. */
720 s++;
721 }
722 if (s == (char *)NULL) /* Not present. */
723 return (0);
724 }
725 /* Mark beginning of line for ^ . */
726 regbol = string;
727
728 /* Simplest case: anchored match need be tried only once. */
729 if (prog->reganch)
730 return (regtry(prog, string));
731
732 /* Messy cases: unanchored match. */
733 s = string;
734 if (prog->regstart != '\0')
735 /* We know what char it must start with. */
736 while ((s = strchr(s, prog->regstart)) != (char *)NULL) {
737 if (regtry(prog, s))
738 return (1);
739 s++;
740 }
741 else
742 /* We don't -- general case. */
743 do {
744 if (regtry(prog, s))
745 return (1);
746 } while (*s++ != '\0');
747
748 /* Failure. */
749 return (0);
750 }
751
752 /*
753 - regtry - try match at specific point
754 */
755 #ifdef __STDC__
756
regtry(regexp * prog,char * string)757 static int regtry(regexp *prog, char *string)
758
759 #else
760
761 static int regtry(prog, string)
762 regexp *prog;
763 char *string;
764
765 #endif
766 {
767 register int i;
768 register char **sp;
769 register char **ep;
770
771 reginput = string;
772 regstartp = prog->startp;
773 regendp = prog->endp;
774
775 sp = prog->startp;
776 ep = prog->endp;
777 for (i = NSUBEXP; i > 0; i--) {
778 *sp++ = (char *)NULL;
779 *ep++ = (char *)NULL;
780 }
781 if (regmatch(prog->program + 1)) {
782 prog->startp[0] = string;
783 prog->endp[0] = reginput;
784 return (1);
785 } else
786 return (0);
787 }
788
789 /*
790 - regmatch - main matching routine
791 *
792 * Conceptually the strategy is simple: check to see whether the current
793 * node matches, call self recursively to see whether the rest matches,
794 * and then act accordingly. In practice we make some effort to avoid
795 * recursion, in particular by going through "ordinary" nodes (that don't
796 * need to know whether the rest of the match failed) by a loop instead of
797 * by recursion.
798 */
799 #ifdef __STDC__
800
regmatch(char * prog)801 static int regmatch(char *prog)
802
803 #else
804
805 static int regmatch(prog)
806 char *prog;
807
808 #endif
809 {
810 register char *scan; /* Current node. */
811 char *nxt; /* nxt node. */
812
813 scan = prog;
814 #ifdef DEBUG
815 if (scan != (char *)NULL && regnarrate)
816 fprintf(stderr, "%s(\n", regprop(scan));
817 #endif
818 while (scan != (char *)NULL) {
819 #ifdef DEBUG
820 if (regnarrate)
821 fprintf(stderr, "%s...\n", regprop(scan));
822 #endif
823 nxt = regnext(scan);
824
825 switch (OP(scan)) {
826 case BOL:
827 if (reginput != regbol)
828 return (0);
829 break;
830 case EOL:
831 if (*reginput != '\0')
832 return (0);
833 break;
834 case ANY:
835 if (*reginput == '\0')
836 return (0);
837 reginput++;
838 break;
839 case EXACTLY:{
840 register int len;
841 register char *opnd;
842
843 opnd = OPERAND(scan);
844 /* Inline the first character, for speed. */
845 if (*opnd != *reginput)
846 return (0);
847 len = strlen(opnd);
848 if (len > 1 && strncmp(opnd, reginput, len) != 0)
849 return (0);
850 reginput += len;
851 }
852 break;
853 case ANYOF:
854 if (*reginput == '\0' ||
855 strchr(OPERAND(scan), *reginput) == (char *)NULL)
856 return (0);
857 reginput++;
858 break;
859 case ANYBUT:
860 if (*reginput == '\0' ||
861 strchr(OPERAND(scan), *reginput) != (char *)NULL)
862 return (0);
863 reginput++;
864 break;
865 case NOTHING:
866 break;
867 case BACK:
868 break;
869 case OPEN + 1:
870 case OPEN + 2:
871 case OPEN + 3:
872 case OPEN + 4:
873 case OPEN + 5:
874 case OPEN + 6:
875 case OPEN + 7:
876 case OPEN + 8:
877 case OPEN + 9:{
878 register int no;
879 register char *save;
880
881 no = OP(scan) - OPEN;
882 save = reginput;
883
884 if (regmatch(nxt)) {
885 /*
886 * Don't set startp if some later invocation of the same
887 * parentheses already has.
888 */
889 if (regstartp[no] == (char *)NULL)
890 regstartp[no] = save;
891 return (1);
892 } else
893 return (0);
894 }
895 break;
896 case CLOSE + 1:
897 case CLOSE + 2:
898 case CLOSE + 3:
899 case CLOSE + 4:
900 case CLOSE + 5:
901 case CLOSE + 6:
902 case CLOSE + 7:
903 case CLOSE + 8:
904 case CLOSE + 9:{
905 register int no;
906 register char *save;
907
908 no = OP(scan) - CLOSE;
909 save = reginput;
910
911 if (regmatch(nxt)) {
912 /*
913 * Don't set endp if some later invocation of the same
914 * parentheses already has.
915 */
916 if (regendp[no] == (char *)NULL)
917 regendp[no] = save;
918 return (1);
919 } else
920 return (0);
921 }
922 break;
923 case BRANCH:{
924 register char *save;
925
926 if (OP(nxt) != BRANCH) /* No choice. */
927 nxt = OPERAND(scan); /* Avoid recursion. */
928 else {
929 do {
930 save = reginput;
931 if (regmatch(OPERAND(scan)))
932 return (1);
933 reginput = save;
934 scan = regnext(scan);
935 } while (scan != (char *)NULL && OP(scan) == BRANCH);
936 return (0);
937 /* NOTREACHED */
938 }
939 }
940 break;
941 case STAR:{
942 register char nextch;
943 register int no;
944 register char *save;
945 register int minimum;
946
947 /*
948 * Lookahead to avoid useless match attempts when we know
949 * what character comes next.
950 */
951 nextch = '\0';
952 if (OP(nxt) == EXACTLY)
953 nextch = *OPERAND(nxt);
954 minimum = (OP(scan) == STAR) ? 0 : 1;
955 save = reginput;
956 no = regrepeat(OPERAND(scan));
957 while (no >= minimum) {
958 /* If it could work, try it. */
959 if (nextch == '\0' || *reginput == nextch)
960 if (regmatch(nxt))
961 return (1);
962 /* Couldn't or didn't -- back up. */
963 no--;
964 reginput = save + no;
965 }
966 return (0);
967 }
968 break;
969 case END:
970 return (1); /* Success! */
971 break;
972 default:
973 regerror("memory corruption");
974 return (0);
975 break;
976 }
977
978 scan = nxt;
979 }
980
981 /*
982 * We get here only if there's trouble -- normally "case END" is the
983 * terminating point.
984 */
985 regerror("corrupted pointers");
986 return (0);
987 }
988
989 /*
990 - regrepeat - repeatedly match something simple, report how many
991 */
992 #ifdef __STDC__
993
regrepeat(char * p)994 static int regrepeat(char *p)
995
996 #else
997
998 static int regrepeat(p)
999 char *p;
1000
1001 #endif
1002 {
1003 register int count = 0;
1004 register char *scan;
1005 register char *opnd;
1006
1007 scan = reginput;
1008 opnd = OPERAND(p);
1009 switch (OP(p)) {
1010 case ANY:
1011 count = strlen(scan);
1012 scan += count;
1013 break;
1014 case EXACTLY:
1015 while (*opnd == *scan) {
1016 count++;
1017 scan++;
1018 }
1019 break;
1020 case ANYOF:
1021 while (*scan != '\0' && strchr(opnd, *scan) != (char *)NULL) {
1022 count++;
1023 scan++;
1024 }
1025 break;
1026 case ANYBUT:
1027 while (*scan != '\0' && strchr(opnd, *scan) == (char *)NULL) {
1028 count++;
1029 scan++;
1030 }
1031 break;
1032 default: /* Oh dear. Called inappropriately. */
1033 regerror("internal foulup");
1034 count = 0; /* Best compromise. */
1035 break;
1036 }
1037 reginput = scan;
1038
1039 return (count);
1040 }
1041
1042
1043 /*
1044 - regnext - dig the "nxt" pointer out of a node
1045 */
1046 #ifdef __STDC__
1047
regnext(register char * p)1048 static char *regnext(register char *p)
1049
1050 #else
1051
1052 static char *regnext(p)
1053 register char *p;
1054
1055 #endif
1056 {
1057 register int offset;
1058
1059 if (p == ®dummy)
1060 return ((char *)NULL);
1061
1062 offset = NEXT(p);
1063 if (offset == 0)
1064 return ((char *)NULL);
1065
1066 if (OP(p) == BACK)
1067 return (p - offset);
1068 else
1069 return (p + offset);
1070 }
1071
1072 #ifdef DEBUG
1073
1074 STATIC char *regprop();
1075
1076 /*
1077 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1078 */
1079 #ifdef __STDC__
1080
regdump(regexp * r)1081 void regdump(regexp *r)
1082
1083 #else
1084
1085 void regdump(r)
1086 regexp *r;
1087
1088 #endif
1089 {
1090 register char *s;
1091 register char op = EXACTLY; /* Arbitrary non-END op. */
1092 register char *nxt;
1093 extern char *strchr();
1094
1095
1096 s = r->program + 1;
1097 while (op != END) { /* While that wasn't END last time... */
1098 op = OP(s);
1099 printf("%2d%s", s - r->program, regprop(s)); /* Where, what. */
1100 nxt = regnext(s);
1101 if (nxt == (char *)NULL) /* nxt ptr. */
1102 printf("(0)");
1103 else
1104 printf("(%d)", (s - r->program) + (nxt - s));
1105 s += 3;
1106 if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1107 /* Literal string, where present. */
1108 while (*s != '\0') {
1109 putchar(*s);
1110 s++;
1111 }
1112 s++;
1113 }
1114 putchar('\n');
1115 }
1116
1117 /* Header fields of interest. */
1118 if (r->regstart != '\0')
1119 printf("start `%c' ", r->regstart);
1120 if (r->reganch)
1121 printf("anchored ");
1122 if (r->regmust != (char *)NULL)
1123 printf("must have \"%s\"", r->regmust);
1124 printf("\n");
1125 }
1126
1127 /*
1128 - regprop - printable representation of opcode
1129 */
1130 #ifdef __STDC__
1131
regprop(char * op)1132 static char *regprop(char *op)
1133
1134 #else
1135
1136 static char *regprop(op)
1137 char *op;
1138
1139 #endif
1140 {
1141 register char *p;
1142 static char buf[50];
1143
1144 strcpy(buf, ":");
1145
1146 switch (OP(op)) {
1147 case BOL:
1148 p = "BOL";
1149 break;
1150 case EOL:
1151 p = "EOL";
1152 break;
1153 case ANY:
1154 p = "ANY";
1155 break;
1156 case ANYOF:
1157 p = "ANYOF";
1158 break;
1159 case ANYBUT:
1160 p = "ANYBUT";
1161 break;
1162 case BRANCH:
1163 p = "BRANCH";
1164 break;
1165 case EXACTLY:
1166 p = "EXACTLY";
1167 break;
1168 case NOTHING:
1169 p = "NOTHING";
1170 break;
1171 case BACK:
1172 p = "BACK";
1173 break;
1174 case END:
1175 p = "END";
1176 break;
1177 case OPEN + 1:
1178 case OPEN + 2:
1179 case OPEN + 3:
1180 case OPEN + 4:
1181 case OPEN + 5:
1182 case OPEN + 6:
1183 case OPEN + 7:
1184 case OPEN + 8:
1185 case OPEN + 9:
1186 sprintf(buf + strlen(buf), "OPEN%d", OP(op) - OPEN);
1187 p = (char *)NULL;
1188 break;
1189 case CLOSE + 1:
1190 case CLOSE + 2:
1191 case CLOSE + 3:
1192 case CLOSE + 4:
1193 case CLOSE + 5:
1194 case CLOSE + 6:
1195 case CLOSE + 7:
1196 case CLOSE + 8:
1197 case CLOSE + 9:
1198 sprintf(buf + strlen(buf), "CLOSE%d", OP(op) - CLOSE);
1199 p = (char *)NULL;
1200 break;
1201 case STAR:
1202 p = "STAR";
1203 break;
1204 default:
1205 regerror("corrupted opcode");
1206 break;
1207 }
1208 if (p != (char *)NULL)
1209 strcat(buf, p);
1210 return (buf);
1211 }
1212 #endif
1213
1214 /*
1215 * The following is provided for those people who do not have strcspn() in
1216 * their C libraries. They should get off their butts and do something
1217 * about it; at least one public-domain implementation of those (highly
1218 * useful) string routines has been published on Usenet.
1219 */
1220 #ifdef STRCSPN
1221 /*
1222 * strcspn - find length of initial segment of s1 consisting entirely
1223 * of characters not from s2
1224 */
1225
1226 #ifdef __STDC__
1227
strcspn(char * s1,char * s2)1228 static int strcspn(char *s1, char *s2)
1229
1230 #else
1231
1232 static int strcspn(s1, s2)
1233 char *s1;
1234 char *s2;
1235
1236 #endif
1237 {
1238 register char *scan1;
1239 register char *scan2;
1240 register int count;
1241
1242 count = 0;
1243 for (scan1 = s1; *scan1 != '\0'; scan1++) {
1244 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1245 if (*scan1 == *scan2++)
1246 return (count);
1247 count++;
1248 }
1249 return (count);
1250 }
1251 #endif
1252
1253
1254 /*
1255 - regsub - perform substitutions after a regexp match
1256 */
1257 #ifdef __STDC__
1258
regsub(regexp * prog,char * source,char * dest)1259 void regsub(regexp *prog, char *source, char *dest)
1260
1261 #else
1262
1263 void regsub(prog, source, dest)
1264 regexp *prog;
1265 char *source;
1266 char *dest;
1267
1268 #endif
1269 {
1270 register char *src;
1271 register char *dst;
1272 register char c;
1273 register int no;
1274 register int len;
1275 extern char *strncpy();
1276
1277 if (prog == (regexp *)NULL ||
1278 source == (char *)NULL || dest == (char *)NULL) {
1279 regerror("NULL parm to regsub");
1280 return;
1281 }
1282 if (UCHARAT(prog->program) != MAGIC) {
1283 regerror("damaged regexp fed to regsub");
1284 return;
1285 }
1286 src = source;
1287 dst = dest;
1288 while ((c = *src++) != '\0') {
1289 if (c == '&')
1290 no = 0;
1291 else if (c == '\\' && '0' <= *src && *src <= '9')
1292 no = *src++ - '0';
1293 else
1294 no = -1;
1295
1296 if (no < 0) { /* Ordinary character. */
1297 if (c == '\\' && (*src == '\\' || *src == '&'))
1298 c = *src++;
1299 *dst++ = c;
1300 } else if (prog->startp[no] != (char *)NULL &&
1301 prog->endp[no] != (char *)NULL) {
1302 len = prog->endp[no] - prog->startp[no];
1303 strncpy(dst, prog->startp[no], len);
1304 dst += len;
1305 if (len != 0 && *(dst - 1) == '\0') { /* strncpy hit NUL. */
1306 regerror("damaged match string");
1307 return;
1308 }
1309 }
1310 }
1311 *dst++ = '\0';
1312 }
1313
1314
1315 #ifdef __STDC__
1316
regerror(char * s)1317 void regerror(char *s)
1318
1319 #else
1320
1321 void regerror(s)
1322 char *s;
1323
1324 #endif
1325 {
1326 fprintf(stderr, "regexp(3): %s", s);
1327 exit(1);
1328 }
1329