xref: /plan9/sys/src/cmd/sam/regexp.c (revision 82726826a7b3d40fb66339b4b0e95b60314f98b9)
1 #include "sam.h"
2 
3 Rangeset	sel;
4 String		lastregexp;
5 /*
6  * Machine Information
7  */
8 typedef struct Inst Inst;
9 
10 struct Inst
11 {
12 	long	type;	/* <= Runemax ==> literal, otherwise action */
13 	union {
14 		int rsid;
15 		int rsubid;
16 		int class;
17 		struct Inst *rother;
18 		struct Inst *rright;
19 	} r;
20 	union{
21 		struct Inst *lleft;
22 		struct Inst *lnext;
23 	} l;
24 };
25 #define	sid	r.rsid
26 #define	subid	r.rsubid
27 #define	rclass	r.class
28 #define	other	r.rother
29 #define	right	r.rright
30 #define	left	l.lleft
31 #define	next	l.lnext
32 
33 #define	NPROG	1024
34 Inst	program[NPROG];
35 Inst	*progp;
36 Inst	*startinst;	/* First inst. of program; might not be program[0] */
37 Inst	*bstartinst;	/* same for backwards machine */
38 
39 typedef struct Ilist Ilist;
40 struct Ilist
41 {
42 	Inst	*inst;		/* Instruction of the thread */
43 	Rangeset se;
44 	Posn	startp;		/* first char of match */
45 };
46 
47 #define	NLIST	127
48 
49 Ilist	*tl, *nl;		/* This list, next list */
50 Ilist	list[2][NLIST+1];	/* +1 for trailing null */
51 static	Rangeset sempty;
52 
53 /*
54  * Actions and Tokens
55  *
56  *	0x100xx are operators, value == precedence
57  *	0x200xx are tokens, i.e. operands for operators
58  */
59 enum {
60 	OPERATOR = Runemask+1,	/* Bitmask of all operators */
61 	START	= OPERATOR,	/* Start, used for marker on stack */
62 	RBRA,			/* Right bracket, ) */
63 	LBRA,			/* Left bracket, ( */
64 	OR,			/* Alternation, | */
65 	CAT,			/* Concatentation, implicit operator */
66 	STAR,			/* Closure, * */
67 	PLUS,			/* a+ == aa* */
68 	QUEST,			/* a? == a|nothing, i.e. 0 or 1 a's */
69 
70 	ANY	= OPERATOR<<1,	/* Any character but newline, . */
71 	NOP,			/* No operation, internal use only */
72 	BOL,			/* Beginning of line, ^ */
73 	EOL,			/* End of line, $ */
74 	CCLASS,			/* Character class, [] */
75 	NCCLASS,		/* Negated character class, [^] */
76 	END,			/* Terminate: match found */
77 
78 	ISATOR	= OPERATOR,
79 	ISAND	= OPERATOR<<1,
80 };
81 
82 /*
83  * Parser Information
84  */
85 typedef struct Node Node;
86 struct Node
87 {
88 	Inst	*first;
89 	Inst	*last;
90 };
91 
92 #define	NSTACK	20
93 Node	andstack[NSTACK];
94 Node	*andp;
95 int	atorstack[NSTACK];
96 int	*atorp;
97 int	lastwasand;	/* Last token was operand */
98 int	cursubid;
99 int	subidstack[NSTACK];
100 int	*subidp;
101 int	backwards;
102 int	nbra;
103 Rune	*exprp;		/* pointer to next character in source expression */
104 #define	DCLASS	10	/* allocation increment */
105 int	nclass;		/* number active */
106 int	Nclass;		/* high water mark */
107 Rune	**class;
108 int	negateclass;
109 
110 int	addinst(Ilist *l, Inst *inst, Rangeset *sep);
111 void	newmatch(Rangeset*);
112 void	bnewmatch(Rangeset*);
113 void	pushand(Inst*, Inst*);
114 void	pushator(int);
115 Node	*popand(int);
116 int	popator(void);
117 void	startlex(Rune*);
118 int	lex(void);
119 void	operator(int);
120 void	operand(int);
121 void	evaluntil(int);
122 void	optimize(Inst*);
123 void	bldcclass(void);
124 
125 void
regerror(Err e)126 regerror(Err e)
127 {
128 	Strzero(&lastregexp);
129 	error(e);
130 }
131 
132 void
regerror_c(Err e,int c)133 regerror_c(Err e, int c)
134 {
135 	Strzero(&lastregexp);
136 	error_c(e, c);
137 }
138 
139 Inst *
newinst(int t)140 newinst(int t)
141 {
142 	if(progp >= &program[NPROG])
143 		regerror(Etoolong);
144 	progp->type = t;
145 	progp->left = 0;
146 	progp->right = 0;
147 	return progp++;
148 }
149 
150 Inst *
realcompile(Rune * s)151 realcompile(Rune *s)
152 {
153 	int token;
154 
155 	startlex(s);
156 	atorp = atorstack;
157 	andp = andstack;
158 	subidp = subidstack;
159 	cursubid = 0;
160 	lastwasand = FALSE;
161 	/* Start with a low priority operator to prime parser */
162 	pushator(START-1);
163 	while((token=lex()) != END){
164 		if((token&ISATOR) == OPERATOR)
165 			operator(token);
166 		else
167 			operand(token);
168 	}
169 	/* Close with a low priority operator */
170 	evaluntil(START);
171 	/* Force END */
172 	operand(END);
173 	evaluntil(START);
174 	if(nbra)
175 		regerror(Eleftpar);
176 	--andp;	/* points to first and only operand */
177 	return andp->first;
178 }
179 
180 void
compile(String * s)181 compile(String *s)
182 {
183 	int i;
184 	Inst *oprogp;
185 
186 	if(Strcmp(s, &lastregexp)==0)
187 		return;
188 	for(i=0; i<nclass; i++)
189 		free(class[i]);
190 	nclass = 0;
191 	progp = program;
192 	backwards = FALSE;
193 	startinst = realcompile(s->s);
194 	optimize(program);
195 	oprogp = progp;
196 	backwards = TRUE;
197 	bstartinst = realcompile(s->s);
198 	optimize(oprogp);
199 	Strduplstr(&lastregexp, s);
200 }
201 
202 void
operand(int t)203 operand(int t)
204 {
205 	Inst *i;
206 	if(lastwasand)
207 		operator(CAT);	/* catenate is implicit */
208 	i = newinst(t);
209 	if(t == CCLASS){
210 		if(negateclass)
211 			i->type = NCCLASS;	/* UGH */
212 		i->rclass = nclass-1;		/* UGH */
213 	}
214 	pushand(i, i);
215 	lastwasand = TRUE;
216 }
217 
218 void
operator(int t)219 operator(int t)
220 {
221 	if(t==RBRA && --nbra<0)
222 		regerror(Erightpar);
223 	if(t==LBRA){
224 /*
225  *		if(++cursubid >= NSUBEXP)
226  *			regerror(Esubexp);
227  */
228 		cursubid++;	/* silently ignored */
229 		nbra++;
230 		if(lastwasand)
231 			operator(CAT);
232 	}else
233 		evaluntil(t);
234 	if(t!=RBRA)
235 		pushator(t);
236 	lastwasand = FALSE;
237 	if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
238 		lastwasand = TRUE;	/* these look like operands */
239 }
240 
241 void
cant(char * s)242 cant(char *s)
243 {
244 	char buf[100];
245 
246 	sprint(buf, "regexp: can't happen: %s", s);
247 	panic(buf);
248 }
249 
250 void
pushand(Inst * f,Inst * l)251 pushand(Inst *f, Inst *l)
252 {
253 	if(andp >= &andstack[NSTACK])
254 		cant("operand stack overflow");
255 	andp->first = f;
256 	andp->last = l;
257 	andp++;
258 }
259 
260 void
pushator(int t)261 pushator(int t)
262 {
263 	if(atorp >= &atorstack[NSTACK])
264 		cant("operator stack overflow");
265 	*atorp++=t;
266 	if(cursubid >= NSUBEXP)
267 		*subidp++= -1;
268 	else
269 		*subidp++=cursubid;
270 }
271 
272 Node *
popand(int op)273 popand(int op)
274 {
275 	if(andp <= &andstack[0])
276 		if(op)
277 			regerror_c(Emissop, op);
278 		else
279 			regerror(Ebadregexp);
280 	return --andp;
281 }
282 
283 int
popator(void)284 popator(void)
285 {
286 	if(atorp <= &atorstack[0])
287 		cant("operator stack underflow");
288 	--subidp;
289 	return *--atorp;
290 }
291 
292 void
evaluntil(int pri)293 evaluntil(int pri)
294 {
295 	Node *op1, *op2, *t;
296 	Inst *inst1, *inst2;
297 
298 	while(pri==RBRA || atorp[-1]>=pri){
299 		switch(popator()){
300 		case LBRA:
301 			op1 = popand('(');
302 			inst2 = newinst(RBRA);
303 			inst2->subid = *subidp;
304 			op1->last->next = inst2;
305 			inst1 = newinst(LBRA);
306 			inst1->subid = *subidp;
307 			inst1->next = op1->first;
308 			pushand(inst1, inst2);
309 			return;		/* must have been RBRA */
310 		default:
311 			panic("unknown regexp operator");
312 			break;
313 		case OR:
314 			op2 = popand('|');
315 			op1 = popand('|');
316 			inst2 = newinst(NOP);
317 			op2->last->next = inst2;
318 			op1->last->next = inst2;
319 			inst1 = newinst(OR);
320 			inst1->right = op1->first;
321 			inst1->left = op2->first;
322 			pushand(inst1, inst2);
323 			break;
324 		case CAT:
325 			op2 = popand(0);
326 			op1 = popand(0);
327 			if(backwards && op2->first->type!=END)
328 				t = op1, op1 = op2, op2 = t;
329 			op1->last->next = op2->first;
330 			pushand(op1->first, op2->last);
331 			break;
332 		case STAR:
333 			op2 = popand('*');
334 			inst1 = newinst(OR);
335 			op2->last->next = inst1;
336 			inst1->right = op2->first;
337 			pushand(inst1, inst1);
338 			break;
339 		case PLUS:
340 			op2 = popand('+');
341 			inst1 = newinst(OR);
342 			op2->last->next = inst1;
343 			inst1->right = op2->first;
344 			pushand(op2->first, inst1);
345 			break;
346 		case QUEST:
347 			op2 = popand('?');
348 			inst1 = newinst(OR);
349 			inst2 = newinst(NOP);
350 			inst1->left = inst2;
351 			inst1->right = op2->first;
352 			op2->last->next = inst2;
353 			pushand(inst1, inst2);
354 			break;
355 		}
356 	}
357 }
358 
359 
360 void
optimize(Inst * start)361 optimize(Inst *start)
362 {
363 	Inst *inst, *target;
364 
365 	for(inst=start; inst->type!=END; inst++){
366 		target = inst->next;
367 		while(target->type == NOP)
368 			target = target->next;
369 		inst->next = target;
370 	}
371 }
372 
373 #ifdef	DEBUG
374 void
dumpstack(void)375 dumpstack(void){
376 	Node *stk;
377 	int *ip;
378 
379 	dprint("operators\n");
380 	for(ip = atorstack; ip<atorp; ip++)
381 		dprint("0%o\n", *ip);
382 	dprint("operands\n");
383 	for(stk = andstack; stk<andp; stk++)
384 		dprint("0%o\t0%o\n", stk->first->type, stk->last->type);
385 }
386 void
dump(void)387 dump(void){
388 	Inst *l;
389 
390 	l = program;
391 	do{
392 		dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type,
393 			l->left-program, l->right-program);
394 	}while(l++->type);
395 }
396 #endif
397 
398 void
startlex(Rune * s)399 startlex(Rune *s)
400 {
401 	exprp = s;
402 	nbra = 0;
403 }
404 
405 
406 int
lex(void)407 lex(void){
408 	int c= *exprp++;
409 
410 	switch(c){
411 	case '\\':
412 		if(*exprp)
413 			if((c= *exprp++)=='n')
414 				c='\n';
415 		break;
416 	case 0:
417 		c = END;
418 		--exprp;	/* In case we come here again */
419 		break;
420 	case '*':
421 		c = STAR;
422 		break;
423 	case '?':
424 		c = QUEST;
425 		break;
426 	case '+':
427 		c = PLUS;
428 		break;
429 	case '|':
430 		c = OR;
431 		break;
432 	case '.':
433 		c = ANY;
434 		break;
435 	case '(':
436 		c = LBRA;
437 		break;
438 	case ')':
439 		c = RBRA;
440 		break;
441 	case '^':
442 		c = BOL;
443 		break;
444 	case '$':
445 		c = EOL;
446 		break;
447 	case '[':
448 		c = CCLASS;
449 		bldcclass();
450 		break;
451 	}
452 	return c;
453 }
454 
455 long
nextrec(void)456 nextrec(void){
457 	if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
458 		regerror(Ebadclass);
459 	if(exprp[0] == '\\'){
460 		exprp++;
461 		if(*exprp=='n'){
462 			exprp++;
463 			return '\n';
464 		}
465 		return *exprp++|(Runemax+1);
466 	}
467 	return *exprp++;
468 }
469 
470 void
bldcclass(void)471 bldcclass(void)
472 {
473 	long c1, c2, n, na;
474 	Rune *classp;
475 
476 	classp = emalloc(DCLASS*RUNESIZE);
477 	n = 0;
478 	na = DCLASS;
479 	/* we have already seen the '[' */
480 	if(*exprp == '^'){
481 		classp[n++] = '\n';	/* don't match newline in negate case */
482 		negateclass = TRUE;
483 		exprp++;
484 	}else
485 		negateclass = FALSE;
486 	while((c1 = nextrec()) != ']'){
487 		if(c1 == '-'){
488     Error:
489 			free(classp);
490 			regerror(Ebadclass);
491 		}
492 		if(n+4 >= na){		/* 3 runes plus NUL */
493 			na += DCLASS;
494 			classp = erealloc(classp, na*RUNESIZE);
495 		}
496 		if(*exprp == '-'){
497 			exprp++;	/* eat '-' */
498 			if((c2 = nextrec()) == ']')
499 				goto Error;
500 			classp[n+0] = Runemax;
501 			classp[n+1] = c1;
502 			classp[n+2] = c2;
503 			n += 3;
504 		}else
505 			classp[n++] = c1;
506 	}
507 	classp[n] = 0;
508 	if(nclass == Nclass){
509 		Nclass += DCLASS;
510 		class = erealloc(class, Nclass*sizeof(Rune*));
511 	}
512 	class[nclass++] = classp;
513 }
514 
515 int
classmatch(int classno,int c,int negate)516 classmatch(int classno, int c, int negate)
517 {
518 	Rune *p;
519 
520 	p = class[classno];
521 	while(*p){
522 		if(*p == Runemax){
523 			if(p[1]<=c && c<=p[2])
524 				return !negate;
525 			p += 3;
526 		}else if(*p++ == c)
527 			return !negate;
528 	}
529 	return negate;
530 }
531 
532 /*
533  * Note optimization in addinst:
534  * 	*l must be pending when addinst called; if *l has been looked
535  *		at already, the optimization is a bug.
536  */
537 int
addinst(Ilist * l,Inst * inst,Rangeset * sep)538 addinst(Ilist *l, Inst *inst, Rangeset *sep)
539 {
540 	Ilist *p;
541 
542 	for(p = l; p->inst; p++){
543 		if(p->inst==inst){
544 			if((sep)->p[0].p1 < p->se.p[0].p1)
545 				p->se= *sep;	/* this would be bug */
546 			return 0;	/* It's already there */
547 		}
548 	}
549 	p->inst = inst;
550 	p->se= *sep;
551 	(p+1)->inst = 0;
552 	return 1;
553 }
554 
555 int
execute(File * f,Posn startp,Posn eof)556 execute(File *f, Posn startp, Posn eof)
557 {
558 	int flag = 0;
559 	Inst *inst;
560 	Ilist *tlp;
561 	Posn p = startp;
562 	int nnl = 0, ntl;
563 	int c;
564 	int wrapped = 0;
565 	int startchar = startinst->type<OPERATOR? startinst->type : 0;
566 
567 	list[0][0].inst = list[1][0].inst = 0;
568 	sel.p[0].p1 = -1;
569 	/* Execute machine once for each character */
570 	for(;;p++){
571 	doloop:
572 		c = filereadc(f, p);
573 		if(p>=eof || c<0){
574 			switch(wrapped++){
575 			case 0:		/* let loop run one more click */
576 			case 2:
577 				break;
578 			case 1:		/* expired; wrap to beginning */
579 				if(sel.p[0].p1>=0 || eof!=INFINITY)
580 					goto Return;
581 				list[0][0].inst = list[1][0].inst = 0;
582 				p = 0;
583 				goto doloop;
584 			default:
585 				goto Return;
586 			}
587 		}else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nnl==0)
588 			break;
589 		/* fast check for first char */
590 		if(startchar && nnl==0 && c!=startchar)
591 			continue;
592 		tl = list[flag];
593 		nl = list[flag^=1];
594 		nl->inst = 0;
595 		ntl = nnl;
596 		nnl = 0;
597 		if(sel.p[0].p1<0 && (!wrapped || p<startp || startp==eof)){
598 			/* Add first instruction to this list */
599 			sempty.p[0].p1 = p;
600 			if(addinst(tl, startinst, &sempty))
601 			if(++ntl >= NLIST)
602 	Overflow:
603 				error(Eoverflow);
604 		}
605 		/* Execute machine until this list is empty */
606 		for(tlp = tl; inst = tlp->inst; tlp++){	/* assignment = */
607 	Switchstmt:
608 			switch(inst->type){
609 			default:	/* regular character */
610 				if(inst->type==c){
611 	Addinst:
612 					if(addinst(nl, inst->next, &tlp->se))
613 					if(++nnl >= NLIST)
614 						goto Overflow;
615 				}
616 				break;
617 			case LBRA:
618 				if(inst->subid>=0)
619 					tlp->se.p[inst->subid].p1 = p;
620 				inst = inst->next;
621 				goto Switchstmt;
622 			case RBRA:
623 				if(inst->subid>=0)
624 					tlp->se.p[inst->subid].p2 = p;
625 				inst = inst->next;
626 				goto Switchstmt;
627 			case ANY:
628 				if(c!='\n')
629 					goto Addinst;
630 				break;
631 			case BOL:
632 				if(p==0 || filereadc(f, p - 1)=='\n'){
633 	Step:
634 					inst = inst->next;
635 					goto Switchstmt;
636 				}
637 				break;
638 			case EOL:
639 				if(c == '\n')
640 					goto Step;
641 				break;
642 			case CCLASS:
643 				if(c>=0 && classmatch(inst->rclass, c, 0))
644 					goto Addinst;
645 				break;
646 			case NCCLASS:
647 				if(c>=0 && classmatch(inst->rclass, c, 1))
648 					goto Addinst;
649 				break;
650 			case OR:
651 				/* evaluate right choice later */
652 				if(addinst(tlp, inst->right, &tlp->se))
653 				if(++ntl >= NLIST)
654 					goto Overflow;
655 				/* efficiency: advance and re-evaluate */
656 				inst = inst->left;
657 				goto Switchstmt;
658 			case END:	/* Match! */
659 				tlp->se.p[0].p2 = p;
660 				newmatch(&tlp->se);
661 				break;
662 			}
663 		}
664 	}
665     Return:
666 	return sel.p[0].p1>=0;
667 }
668 
669 void
newmatch(Rangeset * sp)670 newmatch(Rangeset *sp)
671 {
672 	int i;
673 
674 	if(sel.p[0].p1<0 || sp->p[0].p1<sel.p[0].p1 ||
675 	   (sp->p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2))
676 		for(i = 0; i<NSUBEXP; i++)
677 			sel.p[i] = sp->p[i];
678 }
679 
680 int
bexecute(File * f,Posn startp)681 bexecute(File *f, Posn startp)
682 {
683 	int flag = 0;
684 	Inst *inst;
685 	Ilist *tlp;
686 	Posn p = startp;
687 	int nnl = 0, ntl;
688 	int c;
689 	int wrapped = 0;
690 	int startchar = bstartinst->type<OPERATOR? bstartinst->type : 0;
691 
692 	list[0][0].inst = list[1][0].inst = 0;
693 	sel.p[0].p1= -1;
694 	/* Execute machine once for each character, including terminal NUL */
695 	for(;;--p){
696 	doloop:
697 		if((c = filereadc(f, p - 1))==-1){
698 			switch(wrapped++){
699 			case 0:		/* let loop run one more click */
700 			case 2:
701 				break;
702 			case 1:		/* expired; wrap to end */
703 				if(sel.p[0].p1>=0)
704 			case 3:
705 					goto Return;
706 				list[0][0].inst = list[1][0].inst = 0;
707 				p = f->nc;
708 				goto doloop;
709 			default:
710 				goto Return;
711 			}
712 		}else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nnl==0)
713 			break;
714 		/* fast check for first char */
715 		if(startchar && nnl==0 && c!=startchar)
716 			continue;
717 		tl = list[flag];
718 		nl = list[flag^=1];
719 		nl->inst = 0;
720 		ntl = nnl;
721 		nnl = 0;
722 		if(sel.p[0].p1<0 && (!wrapped || p>startp)){
723 			/* Add first instruction to this list */
724 			/* the minus is so the optimizations in addinst work */
725 			sempty.p[0].p1 = -p;
726 			if(addinst(tl, bstartinst, &sempty))
727 			if(++ntl >= NLIST)
728 	Overflow:
729 				error(Eoverflow);
730 		}
731 		/* Execute machine until this list is empty */
732 		for(tlp = tl; inst = tlp->inst; tlp++){	/* assignment = */
733 	Switchstmt:
734 			switch(inst->type){
735 			default:	/* regular character */
736 				if(inst->type == c){
737 	Addinst:
738 					if(addinst(nl, inst->next, &tlp->se))
739 					if(++nnl >= NLIST)
740 						goto Overflow;
741 				}
742 				break;
743 			case LBRA:
744 				if(inst->subid>=0)
745 					tlp->se.p[inst->subid].p1 = p;
746 				inst = inst->next;
747 				goto Switchstmt;
748 			case RBRA:
749 				if(inst->subid >= 0)
750 					tlp->se.p[inst->subid].p2 = p;
751 				inst = inst->next;
752 				goto Switchstmt;
753 			case ANY:
754 				if(c != '\n')
755 					goto Addinst;
756 				break;
757 			case BOL:
758 				if(c=='\n' || p==0){
759 	Step:
760 					inst = inst->next;
761 					goto Switchstmt;
762 				}
763 				break;
764 			case EOL:
765 				if(p==f->nc || filereadc(f, p)=='\n')
766 					goto Step;
767 				break;
768 			case CCLASS:
769 				if(c>=0 && classmatch(inst->rclass, c, 0))
770 					goto Addinst;
771 				break;
772 			case NCCLASS:
773 				if(c>=0 && classmatch(inst->rclass, c, 1))
774 					goto Addinst;
775 				break;
776 			case OR:
777 				/* evaluate right choice later */
778 				if(addinst(tl, inst->right, &tlp->se))
779 				if(++ntl >= NLIST)
780 					goto Overflow;
781 				/* efficiency: advance and re-evaluate */
782 				inst = inst->left;
783 				goto Switchstmt;
784 			case END:	/* Match! */
785 				tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* minus sign */
786 				tlp->se.p[0].p2 = p;
787 				bnewmatch(&tlp->se);
788 				break;
789 			}
790 		}
791 	}
792     Return:
793 	return sel.p[0].p1>=0;
794 }
795 
796 void
bnewmatch(Rangeset * sp)797 bnewmatch(Rangeset *sp)
798 {
799         int  i;
800         if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 || (sp->p[0].p1==sel.p[0].p2 && sp->p[0].p2<sel.p[0].p1))
801                 for(i = 0; i<NSUBEXP; i++){       /* note the reversal; p1<=p2 */
802                         sel.p[i].p1 = sp->p[i].p2;
803                         sel.p[i].p2 = sp->p[i].p1;
804                 }
805 }
806