xref: /plan9-contrib/sys/src/cmd/sam/regexp.c (revision 7dd7cddf99dd7472612f1413b4da293630e6b1bc)
1 #include "sam.h"
2 
3 Rangeset	sel;
4 String		lastregexp;
5 /*
6  * Machine Information
7  */
8 typedef struct Inst Inst;
9 
10 struct Inst
11 {
12 	long	type;	/* < 0x10000 ==> literal, otherwise action */
13 	union {
14 		int rsid;
15 		int rsubid;
16 		int class;
17 		struct Inst *rother;
18 		struct Inst *rright;
19 	} r;
20 	union{
21 		struct Inst *lleft;
22 		struct Inst *lnext;
23 	} l;
24 };
25 #define	sid	r.rsid
26 #define	subid	r.rsubid
27 #define	rclass	r.class
28 #define	other	r.rother
29 #define	right	r.rright
30 #define	left	l.lleft
31 #define	next	l.lnext
32 
33 #define	NPROG	1024
34 Inst	program[NPROG];
35 Inst	*progp;
36 Inst	*startinst;	/* First inst. of program; might not be program[0] */
37 Inst	*bstartinst;	/* same for backwards machine */
38 
39 typedef struct Ilist Ilist;
40 struct Ilist
41 {
42 	Inst	*inst;		/* Instruction of the thread */
43 	Rangeset se;
44 	Posn	startp;		/* first char of match */
45 };
46 
47 #define	NLIST	128
48 
49 Ilist	*tl, *nl;	/* This list, next list */
50 Ilist	list[2][NLIST];
51 static	Rangeset sempty;
52 
53 /*
54  * Actions and Tokens
55  *
56  *	0x100xx are operators, value == precedence
57  *	0x200xx are tokens, i.e. operands for operators
58  */
59 #define	OPERATOR	0x10000	/* Bitmask of all operators */
60 #define	START		0x10000	/* Start, used for marker on stack */
61 #define	RBRA		0x10001	/* Right bracket, ) */
62 #define	LBRA		0x10002	/* Left bracket, ( */
63 #define	OR		0x10003	/* Alternation, | */
64 #define	CAT		0x10004	/* Concatentation, implicit operator */
65 #define	STAR		0x10005	/* Closure, * */
66 #define	PLUS		0x10006	/* a+ == aa* */
67 #define	QUEST		0x10007	/* a? == a|nothing, i.e. 0 or 1 a's */
68 #define	ANY		0x20000	/* Any character but newline, . */
69 #define	NOP		0x20001	/* No operation, internal use only */
70 #define	BOL		0x20002	/* Beginning of line, ^ */
71 #define	EOL		0x20003	/* End of line, $ */
72 #define	CCLASS		0x20004	/* Character class, [] */
73 #define	NCCLASS		0x20005	/* Negated character class, [^] */
74 #define	END		0x20077	/* Terminate: match found */
75 
76 #define	ISATOR		0x10000
77 #define	ISAND		0x20000
78 
79 /*
80  * Parser Information
81  */
82 typedef struct Node Node;
83 struct Node
84 {
85 	Inst	*first;
86 	Inst	*last;
87 };
88 
89 #define	NSTACK	20
90 Node	andstack[NSTACK];
91 Node	*andp;
92 int	atorstack[NSTACK];
93 int	*atorp;
94 int	lastwasand;	/* Last token was operand */
95 int	cursubid;
96 int	subidstack[NSTACK];
97 int	*subidp;
98 int	backwards;
99 int	nbra;
100 Rune	*exprp;		/* pointer to next character in source expression */
101 #define	DCLASS	10	/* allocation increment */
102 int	nclass;		/* number active */
103 int	Nclass;		/* high water mark */
104 Rune	**class;
105 int	negateclass;
106 
107 void	addinst(Ilist *l, Inst *inst, Rangeset *sep);
108 void	newmatch(Rangeset*);
109 void	bnewmatch(Rangeset*);
110 void	pushand(Inst*, Inst*);
111 void	pushator(int);
112 Node	*popand(int);
113 int	popator(void);
114 void	startlex(Rune*);
115 int	lex(void);
116 void	operator(int);
117 void	operand(int);
118 void	evaluntil(int);
119 void	optimize(Inst*);
120 void	bldcclass(void);
121 
122 void
123 regerror(Err e)
124 {
125 	Strzero(&lastregexp);
126 	error(e);
127 }
128 
129 void
130 regerror_c(Err e, int c)
131 {
132 	Strzero(&lastregexp);
133 	error_c(e, c);
134 }
135 
136 Inst *
137 newinst(int t)
138 {
139 	if(progp >= &program[NPROG])
140 		regerror(Etoolong);
141 	progp->type = t;
142 	progp->left = 0;
143 	progp->right = 0;
144 	return progp++;
145 }
146 
147 Inst *
148 realcompile(Rune *s)
149 {
150 	int token;
151 
152 	startlex(s);
153 	atorp = atorstack;
154 	andp = andstack;
155 	subidp = subidstack;
156 	cursubid = 0;
157 	lastwasand = FALSE;
158 	/* Start with a low priority operator to prime parser */
159 	pushator(START-1);
160 	while((token=lex()) != END){
161 		if((token&ISATOR) == OPERATOR)
162 			operator(token);
163 		else
164 			operand(token);
165 	}
166 	/* Close with a low priority operator */
167 	evaluntil(START);
168 	/* Force END */
169 	operand(END);
170 	evaluntil(START);
171 	if(nbra)
172 		regerror(Eleftpar);
173 	--andp;	/* points to first and only operand */
174 	return andp->first;
175 }
176 
177 void
178 compile(String *s)
179 {
180 	int i;
181 	Inst *oprogp;
182 
183 	if(Strcmp(s, &lastregexp)==0)
184 		return;
185 	for(i=0; i<nclass; i++)
186 		free(class[i]);
187 	nclass = 0;
188 	progp = program;
189 	backwards = FALSE;
190 	startinst = realcompile(s->s);
191 	optimize(program);
192 	oprogp = progp;
193 	backwards = TRUE;
194 	bstartinst = realcompile(s->s);
195 	optimize(oprogp);
196 	Strduplstr(&lastregexp, s);
197 }
198 
199 void
200 operand(int t)
201 {
202 	Inst *i;
203 	if(lastwasand)
204 		operator(CAT);	/* catenate is implicit */
205 	i = newinst(t);
206 	if(t == CCLASS){
207 		if(negateclass)
208 			i->type = NCCLASS;	/* UGH */
209 		i->rclass = nclass-1;		/* UGH */
210 	}
211 	pushand(i, i);
212 	lastwasand = TRUE;
213 }
214 
215 void
216 operator(int t)
217 {
218 	if(t==RBRA && --nbra<0)
219 		regerror(Erightpar);
220 	if(t==LBRA){
221 /*
222  *		if(++cursubid >= NSUBEXP)
223  *			regerror(Esubexp);
224  */
225 		cursubid++;	/* silently ignored */
226 		nbra++;
227 		if(lastwasand)
228 			operator(CAT);
229 	}else
230 		evaluntil(t);
231 	if(t!=RBRA)
232 		pushator(t);
233 	lastwasand = FALSE;
234 	if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
235 		lastwasand = TRUE;	/* these look like operands */
236 }
237 
238 void
239 cant(char *s)
240 {
241 	char buf[100];
242 
243 	sprint(buf, "regexp: can't happen: %s", s);
244 	panic(buf);
245 }
246 
247 void
248 pushand(Inst *f, Inst *l)
249 {
250 	if(andp >= &andstack[NSTACK])
251 		cant("operand stack overflow");
252 	andp->first = f;
253 	andp->last = l;
254 	andp++;
255 }
256 
257 void
258 pushator(int t)
259 {
260 	if(atorp >= &atorstack[NSTACK])
261 		cant("operator stack overflow");
262 	*atorp++=t;
263 	if(cursubid >= NSUBEXP)
264 		*subidp++= -1;
265 	else
266 		*subidp++=cursubid;
267 }
268 
269 Node *
270 popand(int op)
271 {
272 	if(andp <= &andstack[0])
273 		if(op)
274 			regerror_c(Emissop, op);
275 		else
276 			regerror(Ebadregexp);
277 	return --andp;
278 }
279 
280 int
281 popator(void)
282 {
283 	if(atorp <= &atorstack[0])
284 		cant("operator stack underflow");
285 	--subidp;
286 	return *--atorp;
287 }
288 
289 void
290 evaluntil(int pri)
291 {
292 	Node *op1, *op2, *t;
293 	Inst *inst1, *inst2;
294 
295 	while(pri==RBRA || atorp[-1]>=pri){
296 		switch(popator()){
297 		case LBRA:
298 			op1 = popand('(');
299 			inst2 = newinst(RBRA);
300 			inst2->subid = *subidp;
301 			op1->last->next = inst2;
302 			inst1 = newinst(LBRA);
303 			inst1->subid = *subidp;
304 			inst1->next = op1->first;
305 			pushand(inst1, inst2);
306 			return;		/* must have been RBRA */
307 		default:
308 			panic("unknown regexp operator");
309 			break;
310 		case OR:
311 			op2 = popand('|');
312 			op1 = popand('|');
313 			inst2 = newinst(NOP);
314 			op2->last->next = inst2;
315 			op1->last->next = inst2;
316 			inst1 = newinst(OR);
317 			inst1->right = op1->first;
318 			inst1->left = op2->first;
319 			pushand(inst1, inst2);
320 			break;
321 		case CAT:
322 			op2 = popand(0);
323 			op1 = popand(0);
324 			if(backwards && op2->first->type!=END)
325 				t = op1, op1 = op2, op2 = t;
326 			op1->last->next = op2->first;
327 			pushand(op1->first, op2->last);
328 			break;
329 		case STAR:
330 			op2 = popand('*');
331 			inst1 = newinst(OR);
332 			op2->last->next = inst1;
333 			inst1->right = op2->first;
334 			pushand(inst1, inst1);
335 			break;
336 		case PLUS:
337 			op2 = popand('+');
338 			inst1 = newinst(OR);
339 			op2->last->next = inst1;
340 			inst1->right = op2->first;
341 			pushand(op2->first, inst1);
342 			break;
343 		case QUEST:
344 			op2 = popand('?');
345 			inst1 = newinst(OR);
346 			inst2 = newinst(NOP);
347 			inst1->left = inst2;
348 			inst1->right = op2->first;
349 			op2->last->next = inst2;
350 			pushand(inst1, inst2);
351 			break;
352 		}
353 	}
354 }
355 
356 
357 void
358 optimize(Inst *start)
359 {
360 	Inst *inst, *target;
361 
362 	for(inst=start; inst->type!=END; inst++){
363 		target = inst->next;
364 		while(target->type == NOP)
365 			target = target->next;
366 		inst->next = target;
367 	}
368 }
369 
370 #ifdef	DEBUG
371 void
372 dumpstack(void){
373 	Node *stk;
374 	int *ip;
375 
376 	dprint("operators\n");
377 	for(ip = atorstack; ip<atorp; ip++)
378 		dprint("0%o\n", *ip);
379 	dprint("operands\n");
380 	for(stk = andstack; stk<andp; stk++)
381 		dprint("0%o\t0%o\n", stk->first->type, stk->last->type);
382 }
383 void
384 dump(void){
385 	Inst *l;
386 
387 	l = program;
388 	do{
389 		dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type,
390 			l->left-program, l->right-program);
391 	}while(l++->type);
392 }
393 #endif
394 
395 void
396 startlex(Rune *s)
397 {
398 	exprp = s;
399 	nbra = 0;
400 }
401 
402 
403 int
404 lex(void){
405 	int c= *exprp++;
406 
407 	switch(c){
408 	case '\\':
409 		if(*exprp)
410 			if((c= *exprp++)=='n')
411 				c='\n';
412 		break;
413 	case 0:
414 		c = END;
415 		--exprp;	/* In case we come here again */
416 		break;
417 	case '*':
418 		c = STAR;
419 		break;
420 	case '?':
421 		c = QUEST;
422 		break;
423 	case '+':
424 		c = PLUS;
425 		break;
426 	case '|':
427 		c = OR;
428 		break;
429 	case '.':
430 		c = ANY;
431 		break;
432 	case '(':
433 		c = LBRA;
434 		break;
435 	case ')':
436 		c = RBRA;
437 		break;
438 	case '^':
439 		c = BOL;
440 		break;
441 	case '$':
442 		c = EOL;
443 		break;
444 	case '[':
445 		c = CCLASS;
446 		bldcclass();
447 		break;
448 	}
449 	return c;
450 }
451 
452 long
453 nextrec(void){
454 	if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
455 		regerror(Ebadclass);
456 	if(exprp[0] == '\\'){
457 		exprp++;
458 		if(*exprp=='n'){
459 			exprp++;
460 			return '\n';
461 		}
462 		return *exprp++|0x10000;
463 	}
464 	return *exprp++;
465 }
466 
467 void
468 bldcclass(void)
469 {
470 	long c1, c2, n, na;
471 	Rune *classp;
472 
473 	classp = emalloc(DCLASS*RUNESIZE);
474 	n = 0;
475 	na = DCLASS;
476 	/* we have already seen the '[' */
477 	if(*exprp == '^'){
478 		classp[n++] = '\n';	/* don't match newline in negate case */
479 		negateclass = TRUE;
480 		exprp++;
481 	}else
482 		negateclass = FALSE;
483 	while((c1 = nextrec()) != ']'){
484 		if(c1 == '-'){
485     Error:
486 			free(classp);
487 			regerror(Ebadclass);
488 		}
489 		if(n+4 >= na){		/* 3 runes plus NUL */
490 			na += DCLASS;
491 			classp = erealloc(classp, na*RUNESIZE);
492 		}
493 		if(*exprp == '-'){
494 			exprp++;	/* eat '-' */
495 			if((c2 = nextrec()) == ']')
496 				goto Error;
497 			classp[n+0] = 0xFFFF;
498 			classp[n+1] = c1;
499 			classp[n+2] = c2;
500 			n += 3;
501 		}else
502 			classp[n++] = c1;
503 	}
504 	classp[n] = 0;
505 	if(nclass == Nclass){
506 		Nclass += DCLASS;
507 		class = erealloc(class, Nclass*sizeof(Rune*));
508 	}
509 	class[nclass++] = classp;
510 }
511 
512 int
513 classmatch(int classno, int c, int negate)
514 {
515 	Rune *p;
516 
517 	p = class[classno];
518 	while(*p){
519 		if(*p == 0xFFFF){
520 			if(p[1]<=c && c<=p[2])
521 				return !negate;
522 			p += 3;
523 		}else if(*p++ == c)
524 			return !negate;
525 	}
526 	return negate;
527 }
528 
529 /*
530  * Note optimization in addinst:
531  * 	*l must be pending when addinst called; if *l has been looked
532  *		at already, the optimization is a bug.
533  */
534 void
535 addinst(Ilist *l, Inst *inst, Rangeset *sep)
536 {
537 	Ilist *p;
538 
539 	for(p = l; p->inst; p++){
540 		if(p->inst==inst){
541 			if((sep)->p[0].p1 < p->se.p[0].p1)
542 				p->se= *sep;	/* this would be bug */
543 			return;	/* It's already there */
544 		}
545 	}
546 	p->inst = inst;
547 	p->se= *sep;
548 	(p+1)->inst = 0;
549 }
550 
551 int
552 execute(File *f, Posn startp, Posn eof)
553 {
554 	int flag = 0;
555 	Inst *inst;
556 	Ilist *tlp;
557 	Posn p = startp;
558 	int nnl = 0, ntl;
559 	int c;
560 	int wrapped = 0;
561 	int startchar = startinst->type<OPERATOR? startinst->type : 0;
562 
563 	list[0][0].inst = list[1][0].inst = 0;
564 	sel.p[0].p1 = -1;
565 	/* Execute machine once for each character */
566 	for(;;p++){
567 	doloop:
568 		c = filereadc(f, p);
569 		if(p>=eof || c<0){
570 			switch(wrapped++){
571 			case 0:		/* let loop run one more click */
572 			case 2:
573 				break;
574 			case 1:		/* expired; wrap to beginning */
575 				if(sel.p[0].p1>=0 || eof!=INFINITY)
576 					goto Return;
577 				list[0][0].inst = list[1][0].inst = 0;
578 				p = 0;
579 				goto doloop;
580 			default:
581 				goto Return;
582 			}
583 		}else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nnl==0)
584 			break;
585 		/* fast check for first char */
586 		if(startchar && nnl==0 && c!=startchar)
587 			continue;
588 		tl = list[flag];
589 		nl = list[flag^=1];
590 		nl->inst = 0;
591 		ntl = nnl;
592 		nnl = 0;
593 		if(sel.p[0].p1<0 && (!wrapped || p<startp || startp==eof)){
594 			/* Add first instruction to this list */
595 			if(++ntl >= NLIST)
596 	Overflow:
597 				error(Eoverflow);
598 			sempty.p[0].p1 = p;
599 			addinst(tl, startinst, &sempty);
600 		}
601 		/* Execute machine until this list is empty */
602 		for(tlp = tl; inst = tlp->inst; tlp++){	/* assignment = */
603 	Switchstmt:
604 			switch(inst->type){
605 			default:	/* regular character */
606 				if(inst->type==c){
607 	Addinst:
608 					if(++nnl >= NLIST)
609 						goto Overflow;
610 					addinst(nl, inst->next, &tlp->se);
611 				}
612 				break;
613 			case LBRA:
614 				if(inst->subid>=0)
615 					tlp->se.p[inst->subid].p1 = p;
616 				inst = inst->next;
617 				goto Switchstmt;
618 			case RBRA:
619 				if(inst->subid>=0)
620 					tlp->se.p[inst->subid].p2 = p;
621 				inst = inst->next;
622 				goto Switchstmt;
623 			case ANY:
624 				if(c!='\n')
625 					goto Addinst;
626 				break;
627 			case BOL:
628 				if(p==0 || filereadc(f, p - 1)=='\n'){
629 	Step:
630 					inst = inst->next;
631 					goto Switchstmt;
632 				}
633 				break;
634 			case EOL:
635 				if(c == '\n')
636 					goto Step;
637 				break;
638 			case CCLASS:
639 				if(c>=0 && classmatch(inst->rclass, c, 0))
640 					goto Addinst;
641 				break;
642 			case NCCLASS:
643 				if(c>=0 && classmatch(inst->rclass, c, 1))
644 					goto Addinst;
645 				break;
646 			case OR:
647 				/* evaluate right choice later */
648 				if(++ntl >= NLIST)
649 					goto Overflow;
650 				addinst(tlp, inst->right, &tlp->se);
651 				/* efficiency: advance and re-evaluate */
652 				inst = inst->left;
653 				goto Switchstmt;
654 			case END:	/* Match! */
655 				tlp->se.p[0].p2 = p;
656 				newmatch(&tlp->se);
657 				break;
658 			}
659 		}
660 	}
661     Return:
662 	return sel.p[0].p1>=0;
663 }
664 
665 void
666 newmatch(Rangeset *sp)
667 {
668 	int i;
669 
670 	if(sel.p[0].p1<0 || sp->p[0].p1<sel.p[0].p1 ||
671 	   (sp->p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2))
672 		for(i = 0; i<NSUBEXP; i++)
673 			sel.p[i] = sp->p[i];
674 }
675 
676 int
677 bexecute(File *f, Posn startp)
678 {
679 	int flag = 0;
680 	Inst *inst;
681 	Ilist *tlp;
682 	Posn p = startp;
683 	int nnl = 0, ntl;
684 	int c;
685 	int wrapped = 0;
686 	int startchar = bstartinst->type<OPERATOR? bstartinst->type : 0;
687 
688 	list[0][0].inst = list[1][0].inst = 0;
689 	sel.p[0].p1= -1;
690 	/* Execute machine once for each character, including terminal NUL */
691 	for(;;--p){
692 	doloop:
693 		if((c = filereadc(f, p - 1))==-1){
694 			switch(wrapped++){
695 			case 0:		/* let loop run one more click */
696 			case 2:
697 				break;
698 			case 1:		/* expired; wrap to end */
699 				if(sel.p[0].p1>=0)
700 			case 3:
701 					goto Return;
702 				list[0][0].inst = list[1][0].inst = 0;
703 				p = f->nc;
704 				goto doloop;
705 			default:
706 				goto Return;
707 			}
708 		}else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nnl==0)
709 			break;
710 		/* fast check for first char */
711 		if(startchar && nnl==0 && c!=startchar)
712 			continue;
713 		tl = list[flag];
714 		nl = list[flag^=1];
715 		nl->inst = 0;
716 		ntl = nnl;
717 		nnl = 0;
718 		if(sel.p[0].p1<0 && (!wrapped || p>startp)){
719 			/* Add first instruction to this list */
720 			if(++ntl >= NLIST)
721 	Overflow:
722 				error(Eoverflow);
723 			/* the minus is so the optimizations in addinst work */
724 			sempty.p[0].p1 = -p;
725 			addinst(tl, bstartinst, &sempty);
726 		}
727 		/* Execute machine until this list is empty */
728 		for(tlp = tl; inst = tlp->inst; tlp++){	/* assignment = */
729 	Switchstmt:
730 			switch(inst->type){
731 			default:	/* regular character */
732 				if(inst->type == c){
733 	Addinst:
734 					if(++nnl >= NLIST)
735 						goto Overflow;
736 					addinst(nl, inst->next, &tlp->se);
737 				}
738 				break;
739 			case LBRA:
740 				if(inst->subid>=0)
741 					tlp->se.p[inst->subid].p1 = p;
742 				inst = inst->next;
743 				goto Switchstmt;
744 			case RBRA:
745 				if(inst->subid >= 0)
746 					tlp->se.p[inst->subid].p2 = p;
747 				inst = inst->next;
748 				goto Switchstmt;
749 			case ANY:
750 				if(c != '\n')
751 					goto Addinst;
752 				break;
753 			case BOL:
754 				if(c=='\n' || p==0){
755 	Step:
756 					inst = inst->next;
757 					goto Switchstmt;
758 				}
759 				break;
760 			case EOL:
761 				if(p==f->nc || filereadc(f, p)=='\n')
762 					goto Step;
763 				break;
764 			case CCLASS:
765 				if(c>=0 && classmatch(inst->rclass, c, 0))
766 					goto Addinst;
767 				break;
768 			case NCCLASS:
769 				if(c>=0 && classmatch(inst->rclass, c, 1))
770 					goto Addinst;
771 				break;
772 			case OR:
773 				/* evaluate right choice later */
774 				if(++ntl >= NLIST)
775 					goto Overflow;
776 				addinst(tlp, inst->right, &tlp->se);
777 				/* efficiency: advance and re-evaluate */
778 				inst = inst->left;
779 				goto Switchstmt;
780 			case END:	/* Match! */
781 				tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* minus sign */
782 				tlp->se.p[0].p2 = p;
783 				bnewmatch(&tlp->se);
784 				break;
785 			}
786 		}
787 	}
788     Return:
789 	return sel.p[0].p1>=0;
790 }
791 
792 void
793 bnewmatch(Rangeset *sp)
794 {
795         int  i;
796         if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 || (sp->p[0].p1==sel.p[0].p2 && sp->p[0].p2<sel.p[0].p1))
797                 for(i = 0; i<NSUBEXP; i++){       /* note the reversal; p1<=p2 */
798                         sel.p[i].p1 = sp->p[i].p2;
799                         sel.p[i].p2 = sp->p[i].p1;
800                 }
801 }
802