1 #include "sam.h"
2
3 Rangeset sel;
4 String lastregexp;
5 /*
6 * Machine Information
7 */
8 typedef struct Inst Inst;
9
10 struct Inst
11 {
12 long type; /* <= Runemax ==> literal, otherwise action */
13 union {
14 int rsid;
15 int rsubid;
16 int class;
17 struct Inst *rother;
18 struct Inst *rright;
19 } r;
20 union{
21 struct Inst *lleft;
22 struct Inst *lnext;
23 } l;
24 };
25 #define sid r.rsid
26 #define subid r.rsubid
27 #define rclass r.class
28 #define other r.rother
29 #define right r.rright
30 #define left l.lleft
31 #define next l.lnext
32
33 #define NPROG 1024
34 Inst program[NPROG];
35 Inst *progp;
36 Inst *startinst; /* First inst. of program; might not be program[0] */
37 Inst *bstartinst; /* same for backwards machine */
38
39 typedef struct Ilist Ilist;
40 struct Ilist
41 {
42 Inst *inst; /* Instruction of the thread */
43 Rangeset se;
44 Posn startp; /* first char of match */
45 };
46
47 #define NLIST 127
48
49 Ilist *tl, *nl; /* This list, next list */
50 Ilist list[2][NLIST+1]; /* +1 for trailing null */
51 static Rangeset sempty;
52
53 /*
54 * Actions and Tokens
55 *
56 * 0x100xx are operators, value == precedence
57 * 0x200xx are tokens, i.e. operands for operators
58 */
59 enum {
60 OPERATOR = Runemask+1, /* Bitmask of all operators */
61 START = OPERATOR, /* Start, used for marker on stack */
62 RBRA, /* Right bracket, ) */
63 LBRA, /* Left bracket, ( */
64 OR, /* Alternation, | */
65 CAT, /* Concatentation, implicit operator */
66 STAR, /* Closure, * */
67 PLUS, /* a+ == aa* */
68 QUEST, /* a? == a|nothing, i.e. 0 or 1 a's */
69
70 ANY = OPERATOR<<1, /* Any character but newline, . */
71 NOP, /* No operation, internal use only */
72 BOL, /* Beginning of line, ^ */
73 EOL, /* End of line, $ */
74 CCLASS, /* Character class, [] */
75 NCCLASS, /* Negated character class, [^] */
76 END, /* Terminate: match found */
77
78 ISATOR = OPERATOR,
79 ISAND = OPERATOR<<1,
80 };
81
82 /*
83 * Parser Information
84 */
85 typedef struct Node Node;
86 struct Node
87 {
88 Inst *first;
89 Inst *last;
90 };
91
92 #define NSTACK 20
93 Node andstack[NSTACK];
94 Node *andp;
95 int atorstack[NSTACK];
96 int *atorp;
97 int lastwasand; /* Last token was operand */
98 int cursubid;
99 int subidstack[NSTACK];
100 int *subidp;
101 int backwards;
102 int nbra;
103 Rune *exprp; /* pointer to next character in source expression */
104 #define DCLASS 10 /* allocation increment */
105 int nclass; /* number active */
106 int Nclass; /* high water mark */
107 Rune **class;
108 int negateclass;
109
110 int addinst(Ilist *l, Inst *inst, Rangeset *sep);
111 void newmatch(Rangeset*);
112 void bnewmatch(Rangeset*);
113 void pushand(Inst*, Inst*);
114 void pushator(int);
115 Node *popand(int);
116 int popator(void);
117 void startlex(Rune*);
118 int lex(void);
119 void operator(int);
120 void operand(int);
121 void evaluntil(int);
122 void optimize(Inst*);
123 void bldcclass(void);
124
125 void
regerror(Err e)126 regerror(Err e)
127 {
128 Strzero(&lastregexp);
129 error(e);
130 }
131
132 void
regerror_c(Err e,int c)133 regerror_c(Err e, int c)
134 {
135 Strzero(&lastregexp);
136 error_c(e, c);
137 }
138
139 Inst *
newinst(int t)140 newinst(int t)
141 {
142 if(progp >= &program[NPROG])
143 regerror(Etoolong);
144 progp->type = t;
145 progp->left = 0;
146 progp->right = 0;
147 return progp++;
148 }
149
150 Inst *
realcompile(Rune * s)151 realcompile(Rune *s)
152 {
153 int token;
154
155 startlex(s);
156 atorp = atorstack;
157 andp = andstack;
158 subidp = subidstack;
159 cursubid = 0;
160 lastwasand = FALSE;
161 /* Start with a low priority operator to prime parser */
162 pushator(START-1);
163 while((token=lex()) != END){
164 if((token&ISATOR) == OPERATOR)
165 operator(token);
166 else
167 operand(token);
168 }
169 /* Close with a low priority operator */
170 evaluntil(START);
171 /* Force END */
172 operand(END);
173 evaluntil(START);
174 if(nbra)
175 regerror(Eleftpar);
176 --andp; /* points to first and only operand */
177 return andp->first;
178 }
179
180 void
compile(String * s)181 compile(String *s)
182 {
183 int i;
184 Inst *oprogp;
185
186 if(Strcmp(s, &lastregexp)==0)
187 return;
188 for(i=0; i<nclass; i++)
189 free(class[i]);
190 nclass = 0;
191 progp = program;
192 backwards = FALSE;
193 startinst = realcompile(s->s);
194 optimize(program);
195 oprogp = progp;
196 backwards = TRUE;
197 bstartinst = realcompile(s->s);
198 optimize(oprogp);
199 Strduplstr(&lastregexp, s);
200 }
201
202 void
operand(int t)203 operand(int t)
204 {
205 Inst *i;
206 if(lastwasand)
207 operator(CAT); /* catenate is implicit */
208 i = newinst(t);
209 if(t == CCLASS){
210 if(negateclass)
211 i->type = NCCLASS; /* UGH */
212 i->rclass = nclass-1; /* UGH */
213 }
214 pushand(i, i);
215 lastwasand = TRUE;
216 }
217
218 void
operator(int t)219 operator(int t)
220 {
221 if(t==RBRA && --nbra<0)
222 regerror(Erightpar);
223 if(t==LBRA){
224 /*
225 * if(++cursubid >= NSUBEXP)
226 * regerror(Esubexp);
227 */
228 cursubid++; /* silently ignored */
229 nbra++;
230 if(lastwasand)
231 operator(CAT);
232 }else
233 evaluntil(t);
234 if(t!=RBRA)
235 pushator(t);
236 lastwasand = FALSE;
237 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
238 lastwasand = TRUE; /* these look like operands */
239 }
240
241 void
cant(char * s)242 cant(char *s)
243 {
244 char buf[100];
245
246 sprint(buf, "regexp: can't happen: %s", s);
247 panic(buf);
248 }
249
250 void
pushand(Inst * f,Inst * l)251 pushand(Inst *f, Inst *l)
252 {
253 if(andp >= &andstack[NSTACK])
254 cant("operand stack overflow");
255 andp->first = f;
256 andp->last = l;
257 andp++;
258 }
259
260 void
pushator(int t)261 pushator(int t)
262 {
263 if(atorp >= &atorstack[NSTACK])
264 cant("operator stack overflow");
265 *atorp++=t;
266 if(cursubid >= NSUBEXP)
267 *subidp++= -1;
268 else
269 *subidp++=cursubid;
270 }
271
272 Node *
popand(int op)273 popand(int op)
274 {
275 if(andp <= &andstack[0])
276 if(op)
277 regerror_c(Emissop, op);
278 else
279 regerror(Ebadregexp);
280 return --andp;
281 }
282
283 int
popator(void)284 popator(void)
285 {
286 if(atorp <= &atorstack[0])
287 cant("operator stack underflow");
288 --subidp;
289 return *--atorp;
290 }
291
292 void
evaluntil(int pri)293 evaluntil(int pri)
294 {
295 Node *op1, *op2, *t;
296 Inst *inst1, *inst2;
297
298 while(pri==RBRA || atorp[-1]>=pri){
299 switch(popator()){
300 case LBRA:
301 op1 = popand('(');
302 inst2 = newinst(RBRA);
303 inst2->subid = *subidp;
304 op1->last->next = inst2;
305 inst1 = newinst(LBRA);
306 inst1->subid = *subidp;
307 inst1->next = op1->first;
308 pushand(inst1, inst2);
309 return; /* must have been RBRA */
310 default:
311 panic("unknown regexp operator");
312 break;
313 case OR:
314 op2 = popand('|');
315 op1 = popand('|');
316 inst2 = newinst(NOP);
317 op2->last->next = inst2;
318 op1->last->next = inst2;
319 inst1 = newinst(OR);
320 inst1->right = op1->first;
321 inst1->left = op2->first;
322 pushand(inst1, inst2);
323 break;
324 case CAT:
325 op2 = popand(0);
326 op1 = popand(0);
327 if(backwards && op2->first->type!=END)
328 t = op1, op1 = op2, op2 = t;
329 op1->last->next = op2->first;
330 pushand(op1->first, op2->last);
331 break;
332 case STAR:
333 op2 = popand('*');
334 inst1 = newinst(OR);
335 op2->last->next = inst1;
336 inst1->right = op2->first;
337 pushand(inst1, inst1);
338 break;
339 case PLUS:
340 op2 = popand('+');
341 inst1 = newinst(OR);
342 op2->last->next = inst1;
343 inst1->right = op2->first;
344 pushand(op2->first, inst1);
345 break;
346 case QUEST:
347 op2 = popand('?');
348 inst1 = newinst(OR);
349 inst2 = newinst(NOP);
350 inst1->left = inst2;
351 inst1->right = op2->first;
352 op2->last->next = inst2;
353 pushand(inst1, inst2);
354 break;
355 }
356 }
357 }
358
359
360 void
optimize(Inst * start)361 optimize(Inst *start)
362 {
363 Inst *inst, *target;
364
365 for(inst=start; inst->type!=END; inst++){
366 target = inst->next;
367 while(target->type == NOP)
368 target = target->next;
369 inst->next = target;
370 }
371 }
372
373 #ifdef DEBUG
374 void
dumpstack(void)375 dumpstack(void){
376 Node *stk;
377 int *ip;
378
379 dprint("operators\n");
380 for(ip = atorstack; ip<atorp; ip++)
381 dprint("0%o\n", *ip);
382 dprint("operands\n");
383 for(stk = andstack; stk<andp; stk++)
384 dprint("0%o\t0%o\n", stk->first->type, stk->last->type);
385 }
386 void
dump(void)387 dump(void){
388 Inst *l;
389
390 l = program;
391 do{
392 dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type,
393 l->left-program, l->right-program);
394 }while(l++->type);
395 }
396 #endif
397
398 void
startlex(Rune * s)399 startlex(Rune *s)
400 {
401 exprp = s;
402 nbra = 0;
403 }
404
405
406 int
lex(void)407 lex(void){
408 int c= *exprp++;
409
410 switch(c){
411 case '\\':
412 if(*exprp)
413 if((c= *exprp++)=='n')
414 c='\n';
415 break;
416 case 0:
417 c = END;
418 --exprp; /* In case we come here again */
419 break;
420 case '*':
421 c = STAR;
422 break;
423 case '?':
424 c = QUEST;
425 break;
426 case '+':
427 c = PLUS;
428 break;
429 case '|':
430 c = OR;
431 break;
432 case '.':
433 c = ANY;
434 break;
435 case '(':
436 c = LBRA;
437 break;
438 case ')':
439 c = RBRA;
440 break;
441 case '^':
442 c = BOL;
443 break;
444 case '$':
445 c = EOL;
446 break;
447 case '[':
448 c = CCLASS;
449 bldcclass();
450 break;
451 }
452 return c;
453 }
454
455 long
nextrec(void)456 nextrec(void){
457 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
458 regerror(Ebadclass);
459 if(exprp[0] == '\\'){
460 exprp++;
461 if(*exprp=='n'){
462 exprp++;
463 return '\n';
464 }
465 return *exprp++|(Runemax+1);
466 }
467 return *exprp++;
468 }
469
470 void
bldcclass(void)471 bldcclass(void)
472 {
473 long c1, c2, n, na;
474 Rune *classp;
475
476 classp = emalloc(DCLASS*RUNESIZE);
477 n = 0;
478 na = DCLASS;
479 /* we have already seen the '[' */
480 if(*exprp == '^'){
481 classp[n++] = '\n'; /* don't match newline in negate case */
482 negateclass = TRUE;
483 exprp++;
484 }else
485 negateclass = FALSE;
486 while((c1 = nextrec()) != ']'){
487 if(c1 == '-'){
488 Error:
489 free(classp);
490 regerror(Ebadclass);
491 }
492 if(n+4 >= na){ /* 3 runes plus NUL */
493 na += DCLASS;
494 classp = erealloc(classp, na*RUNESIZE);
495 }
496 if(*exprp == '-'){
497 exprp++; /* eat '-' */
498 if((c2 = nextrec()) == ']')
499 goto Error;
500 classp[n+0] = Runemax;
501 classp[n+1] = c1;
502 classp[n+2] = c2;
503 n += 3;
504 }else
505 classp[n++] = c1;
506 }
507 classp[n] = 0;
508 if(nclass == Nclass){
509 Nclass += DCLASS;
510 class = erealloc(class, Nclass*sizeof(Rune*));
511 }
512 class[nclass++] = classp;
513 }
514
515 int
classmatch(int classno,int c,int negate)516 classmatch(int classno, int c, int negate)
517 {
518 Rune *p;
519
520 p = class[classno];
521 while(*p){
522 if(*p == Runemax){
523 if(p[1]<=c && c<=p[2])
524 return !negate;
525 p += 3;
526 }else if(*p++ == c)
527 return !negate;
528 }
529 return negate;
530 }
531
532 /*
533 * Note optimization in addinst:
534 * *l must be pending when addinst called; if *l has been looked
535 * at already, the optimization is a bug.
536 */
537 int
addinst(Ilist * l,Inst * inst,Rangeset * sep)538 addinst(Ilist *l, Inst *inst, Rangeset *sep)
539 {
540 Ilist *p;
541
542 for(p = l; p->inst; p++){
543 if(p->inst==inst){
544 if((sep)->p[0].p1 < p->se.p[0].p1)
545 p->se= *sep; /* this would be bug */
546 return 0; /* It's already there */
547 }
548 }
549 p->inst = inst;
550 p->se= *sep;
551 (p+1)->inst = 0;
552 return 1;
553 }
554
555 int
execute(File * f,Posn startp,Posn eof)556 execute(File *f, Posn startp, Posn eof)
557 {
558 int flag = 0;
559 Inst *inst;
560 Ilist *tlp;
561 Posn p = startp;
562 int nnl = 0, ntl;
563 int c;
564 int wrapped = 0;
565 int startchar = startinst->type<OPERATOR? startinst->type : 0;
566
567 list[0][0].inst = list[1][0].inst = 0;
568 sel.p[0].p1 = -1;
569 /* Execute machine once for each character */
570 for(;;p++){
571 doloop:
572 c = filereadc(f, p);
573 if(p>=eof || c<0){
574 switch(wrapped++){
575 case 0: /* let loop run one more click */
576 case 2:
577 break;
578 case 1: /* expired; wrap to beginning */
579 if(sel.p[0].p1>=0 || eof!=INFINITY)
580 goto Return;
581 list[0][0].inst = list[1][0].inst = 0;
582 p = 0;
583 goto doloop;
584 default:
585 goto Return;
586 }
587 }else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nnl==0)
588 break;
589 /* fast check for first char */
590 if(startchar && nnl==0 && c!=startchar)
591 continue;
592 tl = list[flag];
593 nl = list[flag^=1];
594 nl->inst = 0;
595 ntl = nnl;
596 nnl = 0;
597 if(sel.p[0].p1<0 && (!wrapped || p<startp || startp==eof)){
598 /* Add first instruction to this list */
599 sempty.p[0].p1 = p;
600 if(addinst(tl, startinst, &sempty))
601 if(++ntl >= NLIST)
602 Overflow:
603 error(Eoverflow);
604 }
605 /* Execute machine until this list is empty */
606 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
607 Switchstmt:
608 switch(inst->type){
609 default: /* regular character */
610 if(inst->type==c){
611 Addinst:
612 if(addinst(nl, inst->next, &tlp->se))
613 if(++nnl >= NLIST)
614 goto Overflow;
615 }
616 break;
617 case LBRA:
618 if(inst->subid>=0)
619 tlp->se.p[inst->subid].p1 = p;
620 inst = inst->next;
621 goto Switchstmt;
622 case RBRA:
623 if(inst->subid>=0)
624 tlp->se.p[inst->subid].p2 = p;
625 inst = inst->next;
626 goto Switchstmt;
627 case ANY:
628 if(c!='\n')
629 goto Addinst;
630 break;
631 case BOL:
632 if(p==0 || filereadc(f, p - 1)=='\n'){
633 Step:
634 inst = inst->next;
635 goto Switchstmt;
636 }
637 break;
638 case EOL:
639 if(c == '\n')
640 goto Step;
641 break;
642 case CCLASS:
643 if(c>=0 && classmatch(inst->rclass, c, 0))
644 goto Addinst;
645 break;
646 case NCCLASS:
647 if(c>=0 && classmatch(inst->rclass, c, 1))
648 goto Addinst;
649 break;
650 case OR:
651 /* evaluate right choice later */
652 if(addinst(tlp, inst->right, &tlp->se))
653 if(++ntl >= NLIST)
654 goto Overflow;
655 /* efficiency: advance and re-evaluate */
656 inst = inst->left;
657 goto Switchstmt;
658 case END: /* Match! */
659 tlp->se.p[0].p2 = p;
660 newmatch(&tlp->se);
661 break;
662 }
663 }
664 }
665 Return:
666 return sel.p[0].p1>=0;
667 }
668
669 void
newmatch(Rangeset * sp)670 newmatch(Rangeset *sp)
671 {
672 int i;
673
674 if(sel.p[0].p1<0 || sp->p[0].p1<sel.p[0].p1 ||
675 (sp->p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2))
676 for(i = 0; i<NSUBEXP; i++)
677 sel.p[i] = sp->p[i];
678 }
679
680 int
bexecute(File * f,Posn startp)681 bexecute(File *f, Posn startp)
682 {
683 int flag = 0;
684 Inst *inst;
685 Ilist *tlp;
686 Posn p = startp;
687 int nnl = 0, ntl;
688 int c;
689 int wrapped = 0;
690 int startchar = bstartinst->type<OPERATOR? bstartinst->type : 0;
691
692 list[0][0].inst = list[1][0].inst = 0;
693 sel.p[0].p1= -1;
694 /* Execute machine once for each character, including terminal NUL */
695 for(;;--p){
696 doloop:
697 if((c = filereadc(f, p - 1))==-1){
698 switch(wrapped++){
699 case 0: /* let loop run one more click */
700 case 2:
701 break;
702 case 1: /* expired; wrap to end */
703 if(sel.p[0].p1>=0)
704 case 3:
705 goto Return;
706 list[0][0].inst = list[1][0].inst = 0;
707 p = f->nc;
708 goto doloop;
709 default:
710 goto Return;
711 }
712 }else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nnl==0)
713 break;
714 /* fast check for first char */
715 if(startchar && nnl==0 && c!=startchar)
716 continue;
717 tl = list[flag];
718 nl = list[flag^=1];
719 nl->inst = 0;
720 ntl = nnl;
721 nnl = 0;
722 if(sel.p[0].p1<0 && (!wrapped || p>startp)){
723 /* Add first instruction to this list */
724 /* the minus is so the optimizations in addinst work */
725 sempty.p[0].p1 = -p;
726 if(addinst(tl, bstartinst, &sempty))
727 if(++ntl >= NLIST)
728 Overflow:
729 error(Eoverflow);
730 }
731 /* Execute machine until this list is empty */
732 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
733 Switchstmt:
734 switch(inst->type){
735 default: /* regular character */
736 if(inst->type == c){
737 Addinst:
738 if(addinst(nl, inst->next, &tlp->se))
739 if(++nnl >= NLIST)
740 goto Overflow;
741 }
742 break;
743 case LBRA:
744 if(inst->subid>=0)
745 tlp->se.p[inst->subid].p1 = p;
746 inst = inst->next;
747 goto Switchstmt;
748 case RBRA:
749 if(inst->subid >= 0)
750 tlp->se.p[inst->subid].p2 = p;
751 inst = inst->next;
752 goto Switchstmt;
753 case ANY:
754 if(c != '\n')
755 goto Addinst;
756 break;
757 case BOL:
758 if(c=='\n' || p==0){
759 Step:
760 inst = inst->next;
761 goto Switchstmt;
762 }
763 break;
764 case EOL:
765 if(p==f->nc || filereadc(f, p)=='\n')
766 goto Step;
767 break;
768 case CCLASS:
769 if(c>=0 && classmatch(inst->rclass, c, 0))
770 goto Addinst;
771 break;
772 case NCCLASS:
773 if(c>=0 && classmatch(inst->rclass, c, 1))
774 goto Addinst;
775 break;
776 case OR:
777 /* evaluate right choice later */
778 if(addinst(tl, inst->right, &tlp->se))
779 if(++ntl >= NLIST)
780 goto Overflow;
781 /* efficiency: advance and re-evaluate */
782 inst = inst->left;
783 goto Switchstmt;
784 case END: /* Match! */
785 tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* minus sign */
786 tlp->se.p[0].p2 = p;
787 bnewmatch(&tlp->se);
788 break;
789 }
790 }
791 }
792 Return:
793 return sel.p[0].p1>=0;
794 }
795
796 void
bnewmatch(Rangeset * sp)797 bnewmatch(Rangeset *sp)
798 {
799 int i;
800 if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 || (sp->p[0].p1==sel.p[0].p2 && sp->p[0].p2<sel.p[0].p1))
801 for(i = 0; i<NSUBEXP; i++){ /* note the reversal; p1<=p2 */
802 sel.p[i].p1 = sp->p[i].p2;
803 sel.p[i].p2 = sp->p[i].p1;
804 }
805 }
806