xref: /plan9/sys/src/cmd/sam/regexp.c (revision f9e1cf08d3be51592e03e639fc848a68dc31a55e)
1 #include "sam.h"
2 
3 Rangeset	sel;
4 String		lastregexp;
5 /*
6  * Machine Information
7  */
8 typedef struct Inst Inst;
9 
10 struct Inst
11 {
12 	long	type;	/* < 0x10000 ==> literal, otherwise action */
13 	union {
14 		int rsid;
15 		int rsubid;
16 		int class;
17 		struct Inst *rother;
18 		struct Inst *rright;
19 	} r;
20 	union{
21 		struct Inst *lleft;
22 		struct Inst *lnext;
23 	} l;
24 };
25 #define	sid	r.rsid
26 #define	subid	r.rsubid
27 #define	rclass	r.class
28 #define	other	r.rother
29 #define	right	r.rright
30 #define	left	l.lleft
31 #define	next	l.lnext
32 
33 #define	NPROG	1024
34 Inst	program[NPROG];
35 Inst	*progp;
36 Inst	*startinst;	/* First inst. of program; might not be program[0] */
37 Inst	*bstartinst;	/* same for backwards machine */
38 
39 typedef struct Ilist Ilist;
40 struct Ilist
41 {
42 	Inst	*inst;		/* Instruction of the thread */
43 	Rangeset se;
44 	Posn	startp;		/* first char of match */
45 };
46 
47 #define	NLIST	127
48 
49 Ilist	*tl, *nl;	/* This list, next list */
50 Ilist	list[2][NLIST+1];	/* +1 for trailing null */
51 static	Rangeset sempty;
52 
53 /*
54  * Actions and Tokens
55  *
56  *	0x100xx are operators, value == precedence
57  *	0x200xx are tokens, i.e. operands for operators
58  */
59 #define	OPERATOR	0x10000	/* Bitmask of all operators */
60 #define	START		0x10000	/* Start, used for marker on stack */
61 #define	RBRA		0x10001	/* Right bracket, ) */
62 #define	LBRA		0x10002	/* Left bracket, ( */
63 #define	OR		0x10003	/* Alternation, | */
64 #define	CAT		0x10004	/* Concatentation, implicit operator */
65 #define	STAR		0x10005	/* Closure, * */
66 #define	PLUS		0x10006	/* a+ == aa* */
67 #define	QUEST		0x10007	/* a? == a|nothing, i.e. 0 or 1 a's */
68 #define	ANY		0x20000	/* Any character but newline, . */
69 #define	NOP		0x20001	/* No operation, internal use only */
70 #define	BOL		0x20002	/* Beginning of line, ^ */
71 #define	EOL		0x20003	/* End of line, $ */
72 #define	CCLASS		0x20004	/* Character class, [] */
73 #define	NCCLASS		0x20005	/* Negated character class, [^] */
74 #define	END		0x20077	/* Terminate: match found */
75 
76 #define	ISATOR		0x10000
77 #define	ISAND		0x20000
78 
79 /*
80  * Parser Information
81  */
82 typedef struct Node Node;
83 struct Node
84 {
85 	Inst	*first;
86 	Inst	*last;
87 };
88 
89 #define	NSTACK	20
90 Node	andstack[NSTACK];
91 Node	*andp;
92 int	atorstack[NSTACK];
93 int	*atorp;
94 int	lastwasand;	/* Last token was operand */
95 int	cursubid;
96 int	subidstack[NSTACK];
97 int	*subidp;
98 int	backwards;
99 int	nbra;
100 Rune	*exprp;		/* pointer to next character in source expression */
101 #define	DCLASS	10	/* allocation increment */
102 int	nclass;		/* number active */
103 int	Nclass;		/* high water mark */
104 Rune	**class;
105 int	negateclass;
106 
107 int	addinst(Ilist *l, Inst *inst, Rangeset *sep);
108 void	newmatch(Rangeset*);
109 void	bnewmatch(Rangeset*);
110 void	pushand(Inst*, Inst*);
111 void	pushator(int);
112 Node	*popand(int);
113 int	popator(void);
114 void	startlex(Rune*);
115 int	lex(void);
116 void	operator(int);
117 void	operand(int);
118 void	evaluntil(int);
119 void	optimize(Inst*);
120 void	bldcclass(void);
121 
122 void
123 regerror(Err e)
124 {
125 	Strzero(&lastregexp);
126 	error(e);
127 }
128 
129 void
130 regerror_c(Err e, int c)
131 {
132 	Strzero(&lastregexp);
133 	error_c(e, c);
134 }
135 
136 Inst *
137 newinst(int t)
138 {
139 	if(progp >= &program[NPROG])
140 		regerror(Etoolong);
141 	progp->type = t;
142 	progp->left = 0;
143 	progp->right = 0;
144 	return progp++;
145 }
146 
147 Inst *
148 realcompile(Rune *s)
149 {
150 	int token;
151 
152 	startlex(s);
153 	atorp = atorstack;
154 	andp = andstack;
155 	subidp = subidstack;
156 	cursubid = 0;
157 	lastwasand = FALSE;
158 	/* Start with a low priority operator to prime parser */
159 	pushator(START-1);
160 	while((token=lex()) != END){
161 		if((token&ISATOR) == OPERATOR)
162 			operator(token);
163 		else
164 			operand(token);
165 	}
166 	/* Close with a low priority operator */
167 	evaluntil(START);
168 	/* Force END */
169 	operand(END);
170 	evaluntil(START);
171 	if(nbra)
172 		regerror(Eleftpar);
173 	--andp;	/* points to first and only operand */
174 	return andp->first;
175 }
176 
177 void
178 compile(String *s)
179 {
180 	int i;
181 	Inst *oprogp;
182 
183 	if(Strcmp(s, &lastregexp)==0)
184 		return;
185 	for(i=0; i<nclass; i++)
186 		free(class[i]);
187 	nclass = 0;
188 	progp = program;
189 	backwards = FALSE;
190 	startinst = realcompile(s->s);
191 	optimize(program);
192 	oprogp = progp;
193 	backwards = TRUE;
194 	bstartinst = realcompile(s->s);
195 	optimize(oprogp);
196 	Strduplstr(&lastregexp, s);
197 }
198 
199 void
200 operand(int t)
201 {
202 	Inst *i;
203 	if(lastwasand)
204 		operator(CAT);	/* catenate is implicit */
205 	i = newinst(t);
206 	if(t == CCLASS){
207 		if(negateclass)
208 			i->type = NCCLASS;	/* UGH */
209 		i->rclass = nclass-1;		/* UGH */
210 	}
211 	pushand(i, i);
212 	lastwasand = TRUE;
213 }
214 
215 void
216 operator(int t)
217 {
218 	if(t==RBRA && --nbra<0)
219 		regerror(Erightpar);
220 	if(t==LBRA){
221 /*
222  *		if(++cursubid >= NSUBEXP)
223  *			regerror(Esubexp);
224  */
225 		cursubid++;	/* silently ignored */
226 		nbra++;
227 		if(lastwasand)
228 			operator(CAT);
229 	}else
230 		evaluntil(t);
231 	if(t!=RBRA)
232 		pushator(t);
233 	lastwasand = FALSE;
234 	if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
235 		lastwasand = TRUE;	/* these look like operands */
236 }
237 
238 void
239 cant(char *s)
240 {
241 	char buf[100];
242 
243 	sprint(buf, "regexp: can't happen: %s", s);
244 	panic(buf);
245 }
246 
247 void
248 pushand(Inst *f, Inst *l)
249 {
250 	if(andp >= &andstack[NSTACK])
251 		cant("operand stack overflow");
252 	andp->first = f;
253 	andp->last = l;
254 	andp++;
255 }
256 
257 void
258 pushator(int t)
259 {
260 	if(atorp >= &atorstack[NSTACK])
261 		cant("operator stack overflow");
262 	*atorp++=t;
263 	if(cursubid >= NSUBEXP)
264 		*subidp++= -1;
265 	else
266 		*subidp++=cursubid;
267 }
268 
269 Node *
270 popand(int op)
271 {
272 	if(andp <= &andstack[0])
273 		if(op)
274 			regerror_c(Emissop, op);
275 		else
276 			regerror(Ebadregexp);
277 	return --andp;
278 }
279 
280 int
281 popator(void)
282 {
283 	if(atorp <= &atorstack[0])
284 		cant("operator stack underflow");
285 	--subidp;
286 	return *--atorp;
287 }
288 
289 void
290 evaluntil(int pri)
291 {
292 	Node *op1, *op2, *t;
293 	Inst *inst1, *inst2;
294 
295 	while(pri==RBRA || atorp[-1]>=pri){
296 		switch(popator()){
297 		case LBRA:
298 			op1 = popand('(');
299 			inst2 = newinst(RBRA);
300 			inst2->subid = *subidp;
301 			op1->last->next = inst2;
302 			inst1 = newinst(LBRA);
303 			inst1->subid = *subidp;
304 			inst1->next = op1->first;
305 			pushand(inst1, inst2);
306 			return;		/* must have been RBRA */
307 		default:
308 			panic("unknown regexp operator");
309 			break;
310 		case OR:
311 			op2 = popand('|');
312 			op1 = popand('|');
313 			inst2 = newinst(NOP);
314 			op2->last->next = inst2;
315 			op1->last->next = inst2;
316 			inst1 = newinst(OR);
317 			inst1->right = op1->first;
318 			inst1->left = op2->first;
319 			pushand(inst1, inst2);
320 			break;
321 		case CAT:
322 			op2 = popand(0);
323 			op1 = popand(0);
324 			if(backwards && op2->first->type!=END)
325 				t = op1, op1 = op2, op2 = t;
326 			op1->last->next = op2->first;
327 			pushand(op1->first, op2->last);
328 			break;
329 		case STAR:
330 			op2 = popand('*');
331 			inst1 = newinst(OR);
332 			op2->last->next = inst1;
333 			inst1->right = op2->first;
334 			pushand(inst1, inst1);
335 			break;
336 		case PLUS:
337 			op2 = popand('+');
338 			inst1 = newinst(OR);
339 			op2->last->next = inst1;
340 			inst1->right = op2->first;
341 			pushand(op2->first, inst1);
342 			break;
343 		case QUEST:
344 			op2 = popand('?');
345 			inst1 = newinst(OR);
346 			inst2 = newinst(NOP);
347 			inst1->left = inst2;
348 			inst1->right = op2->first;
349 			op2->last->next = inst2;
350 			pushand(inst1, inst2);
351 			break;
352 		}
353 	}
354 }
355 
356 
357 void
358 optimize(Inst *start)
359 {
360 	Inst *inst, *target;
361 
362 	for(inst=start; inst->type!=END; inst++){
363 		target = inst->next;
364 		while(target->type == NOP)
365 			target = target->next;
366 		inst->next = target;
367 	}
368 }
369 
370 #ifdef	DEBUG
371 void
372 dumpstack(void){
373 	Node *stk;
374 	int *ip;
375 
376 	dprint("operators\n");
377 	for(ip = atorstack; ip<atorp; ip++)
378 		dprint("0%o\n", *ip);
379 	dprint("operands\n");
380 	for(stk = andstack; stk<andp; stk++)
381 		dprint("0%o\t0%o\n", stk->first->type, stk->last->type);
382 }
383 void
384 dump(void){
385 	Inst *l;
386 
387 	l = program;
388 	do{
389 		dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type,
390 			l->left-program, l->right-program);
391 	}while(l++->type);
392 }
393 #endif
394 
395 void
396 startlex(Rune *s)
397 {
398 	exprp = s;
399 	nbra = 0;
400 }
401 
402 
403 int
404 lex(void){
405 	int c= *exprp++;
406 
407 	switch(c){
408 	case '\\':
409 		if(*exprp)
410 			if((c= *exprp++)=='n')
411 				c='\n';
412 		break;
413 	case 0:
414 		c = END;
415 		--exprp;	/* In case we come here again */
416 		break;
417 	case '*':
418 		c = STAR;
419 		break;
420 	case '?':
421 		c = QUEST;
422 		break;
423 	case '+':
424 		c = PLUS;
425 		break;
426 	case '|':
427 		c = OR;
428 		break;
429 	case '.':
430 		c = ANY;
431 		break;
432 	case '(':
433 		c = LBRA;
434 		break;
435 	case ')':
436 		c = RBRA;
437 		break;
438 	case '^':
439 		c = BOL;
440 		break;
441 	case '$':
442 		c = EOL;
443 		break;
444 	case '[':
445 		c = CCLASS;
446 		bldcclass();
447 		break;
448 	}
449 	return c;
450 }
451 
452 long
453 nextrec(void){
454 	if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
455 		regerror(Ebadclass);
456 	if(exprp[0] == '\\'){
457 		exprp++;
458 		if(*exprp=='n'){
459 			exprp++;
460 			return '\n';
461 		}
462 		return *exprp++|0x10000;
463 	}
464 	return *exprp++;
465 }
466 
467 void
468 bldcclass(void)
469 {
470 	long c1, c2, n, na;
471 	Rune *classp;
472 
473 	classp = emalloc(DCLASS*RUNESIZE);
474 	n = 0;
475 	na = DCLASS;
476 	/* we have already seen the '[' */
477 	if(*exprp == '^'){
478 		classp[n++] = '\n';	/* don't match newline in negate case */
479 		negateclass = TRUE;
480 		exprp++;
481 	}else
482 		negateclass = FALSE;
483 	while((c1 = nextrec()) != ']'){
484 		if(c1 == '-'){
485     Error:
486 			free(classp);
487 			regerror(Ebadclass);
488 		}
489 		if(n+4 >= na){		/* 3 runes plus NUL */
490 			na += DCLASS;
491 			classp = erealloc(classp, na*RUNESIZE);
492 		}
493 		if(*exprp == '-'){
494 			exprp++;	/* eat '-' */
495 			if((c2 = nextrec()) == ']')
496 				goto Error;
497 			classp[n+0] = 0xFFFF;
498 			classp[n+1] = c1;
499 			classp[n+2] = c2;
500 			n += 3;
501 		}else
502 			classp[n++] = c1;
503 	}
504 	classp[n] = 0;
505 	if(nclass == Nclass){
506 		Nclass += DCLASS;
507 		class = erealloc(class, Nclass*sizeof(Rune*));
508 	}
509 	class[nclass++] = classp;
510 }
511 
512 int
513 classmatch(int classno, int c, int negate)
514 {
515 	Rune *p;
516 
517 	p = class[classno];
518 	while(*p){
519 		if(*p == 0xFFFF){
520 			if(p[1]<=c && c<=p[2])
521 				return !negate;
522 			p += 3;
523 		}else if(*p++ == c)
524 			return !negate;
525 	}
526 	return negate;
527 }
528 
529 /*
530  * Note optimization in addinst:
531  * 	*l must be pending when addinst called; if *l has been looked
532  *		at already, the optimization is a bug.
533  */
534 int
535 addinst(Ilist *l, Inst *inst, Rangeset *sep)
536 {
537 	Ilist *p;
538 
539 	for(p = l; p->inst; p++){
540 		if(p->inst==inst){
541 			if((sep)->p[0].p1 < p->se.p[0].p1)
542 				p->se= *sep;	/* this would be bug */
543 			return 0;	/* It's already there */
544 		}
545 	}
546 	p->inst = inst;
547 	p->se= *sep;
548 	(p+1)->inst = 0;
549 	return 1;
550 }
551 
552 int
553 execute(File *f, Posn startp, Posn eof)
554 {
555 	int flag = 0;
556 	Inst *inst;
557 	Ilist *tlp;
558 	Posn p = startp;
559 	int nnl = 0, ntl;
560 	int c;
561 	int wrapped = 0;
562 	int startchar = startinst->type<OPERATOR? startinst->type : 0;
563 
564 	list[0][0].inst = list[1][0].inst = 0;
565 	sel.p[0].p1 = -1;
566 	/* Execute machine once for each character */
567 	for(;;p++){
568 	doloop:
569 		c = filereadc(f, p);
570 		if(p>=eof || c<0){
571 			switch(wrapped++){
572 			case 0:		/* let loop run one more click */
573 			case 2:
574 				break;
575 			case 1:		/* expired; wrap to beginning */
576 				if(sel.p[0].p1>=0 || eof!=INFINITY)
577 					goto Return;
578 				list[0][0].inst = list[1][0].inst = 0;
579 				p = 0;
580 				goto doloop;
581 			default:
582 				goto Return;
583 			}
584 		}else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nnl==0)
585 			break;
586 		/* fast check for first char */
587 		if(startchar && nnl==0 && c!=startchar)
588 			continue;
589 		tl = list[flag];
590 		nl = list[flag^=1];
591 		nl->inst = 0;
592 		ntl = nnl;
593 		nnl = 0;
594 		if(sel.p[0].p1<0 && (!wrapped || p<startp || startp==eof)){
595 			/* Add first instruction to this list */
596 			sempty.p[0].p1 = p;
597 			if(addinst(tl, startinst, &sempty))
598 			if(++ntl >= NLIST)
599 	Overflow:
600 				error(Eoverflow);
601 		}
602 		/* Execute machine until this list is empty */
603 		for(tlp = tl; inst = tlp->inst; tlp++){	/* assignment = */
604 	Switchstmt:
605 			switch(inst->type){
606 			default:	/* regular character */
607 				if(inst->type==c){
608 	Addinst:
609 					if(addinst(nl, inst->next, &tlp->se))
610 					if(++nnl >= NLIST)
611 						goto Overflow;
612 				}
613 				break;
614 			case LBRA:
615 				if(inst->subid>=0)
616 					tlp->se.p[inst->subid].p1 = p;
617 				inst = inst->next;
618 				goto Switchstmt;
619 			case RBRA:
620 				if(inst->subid>=0)
621 					tlp->se.p[inst->subid].p2 = p;
622 				inst = inst->next;
623 				goto Switchstmt;
624 			case ANY:
625 				if(c!='\n')
626 					goto Addinst;
627 				break;
628 			case BOL:
629 				if(p==0 || filereadc(f, p - 1)=='\n'){
630 	Step:
631 					inst = inst->next;
632 					goto Switchstmt;
633 				}
634 				break;
635 			case EOL:
636 				if(c == '\n')
637 					goto Step;
638 				break;
639 			case CCLASS:
640 				if(c>=0 && classmatch(inst->rclass, c, 0))
641 					goto Addinst;
642 				break;
643 			case NCCLASS:
644 				if(c>=0 && classmatch(inst->rclass, c, 1))
645 					goto Addinst;
646 				break;
647 			case OR:
648 				/* evaluate right choice later */
649 				if(addinst(tl, inst->right, &tlp->se))
650 				if(++ntl >= NLIST)
651 					goto Overflow;
652 				/* efficiency: advance and re-evaluate */
653 				inst = inst->left;
654 				goto Switchstmt;
655 			case END:	/* Match! */
656 				tlp->se.p[0].p2 = p;
657 				newmatch(&tlp->se);
658 				break;
659 			}
660 		}
661 	}
662     Return:
663 	return sel.p[0].p1>=0;
664 }
665 
666 void
667 newmatch(Rangeset *sp)
668 {
669 	int i;
670 
671 	if(sel.p[0].p1<0 || sp->p[0].p1<sel.p[0].p1 ||
672 	   (sp->p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2))
673 		for(i = 0; i<NSUBEXP; i++)
674 			sel.p[i] = sp->p[i];
675 }
676 
677 int
678 bexecute(File *f, Posn startp)
679 {
680 	int flag = 0;
681 	Inst *inst;
682 	Ilist *tlp;
683 	Posn p = startp;
684 	int nnl = 0, ntl;
685 	int c;
686 	int wrapped = 0;
687 	int startchar = bstartinst->type<OPERATOR? bstartinst->type : 0;
688 
689 	list[0][0].inst = list[1][0].inst = 0;
690 	sel.p[0].p1= -1;
691 	/* Execute machine once for each character, including terminal NUL */
692 	for(;;--p){
693 	doloop:
694 		if((c = filereadc(f, p - 1))==-1){
695 			switch(wrapped++){
696 			case 0:		/* let loop run one more click */
697 			case 2:
698 				break;
699 			case 1:		/* expired; wrap to end */
700 				if(sel.p[0].p1>=0)
701 			case 3:
702 					goto Return;
703 				list[0][0].inst = list[1][0].inst = 0;
704 				p = f->nc;
705 				goto doloop;
706 			default:
707 				goto Return;
708 			}
709 		}else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nnl==0)
710 			break;
711 		/* fast check for first char */
712 		if(startchar && nnl==0 && c!=startchar)
713 			continue;
714 		tl = list[flag];
715 		nl = list[flag^=1];
716 		nl->inst = 0;
717 		ntl = nnl;
718 		nnl = 0;
719 		if(sel.p[0].p1<0 && (!wrapped || p>startp)){
720 			/* Add first instruction to this list */
721 			/* the minus is so the optimizations in addinst work */
722 			sempty.p[0].p1 = -p;
723 			if(addinst(tl, bstartinst, &sempty))
724 			if(++ntl >= NLIST)
725 	Overflow:
726 				error(Eoverflow);
727 		}
728 		/* Execute machine until this list is empty */
729 		for(tlp = tl; inst = tlp->inst; tlp++){	/* assignment = */
730 	Switchstmt:
731 			switch(inst->type){
732 			default:	/* regular character */
733 				if(inst->type == c){
734 	Addinst:
735 					if(addinst(nl, inst->next, &tlp->se))
736 					if(++nnl >= NLIST)
737 						goto Overflow;
738 				}
739 				break;
740 			case LBRA:
741 				if(inst->subid>=0)
742 					tlp->se.p[inst->subid].p1 = p;
743 				inst = inst->next;
744 				goto Switchstmt;
745 			case RBRA:
746 				if(inst->subid >= 0)
747 					tlp->se.p[inst->subid].p2 = p;
748 				inst = inst->next;
749 				goto Switchstmt;
750 			case ANY:
751 				if(c != '\n')
752 					goto Addinst;
753 				break;
754 			case BOL:
755 				if(c=='\n' || p==0){
756 	Step:
757 					inst = inst->next;
758 					goto Switchstmt;
759 				}
760 				break;
761 			case EOL:
762 				if(p==f->nc || filereadc(f, p)=='\n')
763 					goto Step;
764 				break;
765 			case CCLASS:
766 				if(c>=0 && classmatch(inst->rclass, c, 0))
767 					goto Addinst;
768 				break;
769 			case NCCLASS:
770 				if(c>=0 && classmatch(inst->rclass, c, 1))
771 					goto Addinst;
772 				break;
773 			case OR:
774 				/* evaluate right choice later */
775 				if(addinst(tl, inst->right, &tlp->se))
776 				if(++ntl >= NLIST)
777 					goto Overflow;
778 				/* efficiency: advance and re-evaluate */
779 				inst = inst->left;
780 				goto Switchstmt;
781 			case END:	/* Match! */
782 				tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* minus sign */
783 				tlp->se.p[0].p2 = p;
784 				bnewmatch(&tlp->se);
785 				break;
786 			}
787 		}
788 	}
789     Return:
790 	return sel.p[0].p1>=0;
791 }
792 
793 void
794 bnewmatch(Rangeset *sp)
795 {
796         int  i;
797         if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 || (sp->p[0].p1==sel.p[0].p2 && sp->p[0].p2<sel.p[0].p1))
798                 for(i = 0; i<NSUBEXP; i++){       /* note the reversal; p1<=p2 */
799                         sel.p[i].p1 = sp->p[i].p2;
800                         sel.p[i].p2 = sp->p[i].p1;
801                 }
802 }
803