xref: /plan9-contrib/sys/src/cmd/sam/regexp.c (revision 219b2ee8daee37f4aad58d63f21287faa8e4ffdc)
1 #include "sam.h"
2 
3 Rangeset	sel;
4 String		lastregexp;
5 /*
6  * Machine Information
7  */
8 typedef struct Inst Inst;
9 
10 struct Inst
11 {
12 	long	type;	/* < 0x10000 ==> literal, otherwise action */
13 	union {
14 		int rsid;
15 		int rsubid;
16 		int class;
17 		struct Inst *rother;
18 		struct Inst *rright;
19 	} r;
20 	union{
21 		struct Inst *lleft;
22 		struct Inst *lnext;
23 	} l;
24 };
25 #define	sid	r.rsid
26 #define	subid	r.rsubid
27 #define	rclass	r.class
28 #define	other	r.rother
29 #define	right	r.rright
30 #define	left	l.lleft
31 #define	next	l.lnext
32 
33 #define	NPROG	1024
34 Inst	program[NPROG];
35 Inst	*progp;
36 Inst	*startinst;	/* First inst. of program; might not be program[0] */
37 Inst	*bstartinst;	/* same for backwards machine */
38 
39 typedef struct Ilist Ilist;
40 struct Ilist
41 {
42 	Inst	*inst;		/* Instruction of the thread */
43 	Rangeset se;
44 	Posn	startp;		/* first char of match */
45 };
46 
47 #define	NLIST	128
48 
49 Ilist	*tl, *nl;	/* This list, next list */
50 Ilist	list[2][NLIST];
51 static	Rangeset sempty;
52 
53 /*
54  * Actions and Tokens
55  *
56  *	0x100xx are operators, value == precedence
57  *	0x200xx are tokens, i.e. operands for operators
58  */
59 #define	OPERATOR	0x10000	/* Bitmask of all operators */
60 #define	START		0x10000	/* Start, used for marker on stack */
61 #define	RBRA		0x10001	/* Right bracket, ) */
62 #define	LBRA		0x10002	/* Left bracket, ( */
63 #define	OR		0x10003	/* Alternation, | */
64 #define	CAT		0x10004	/* Concatentation, implicit operator */
65 #define	STAR		0x10005	/* Closure, * */
66 #define	PLUS		0x10006	/* a+ == aa* */
67 #define	QUEST		0x10007	/* a? == a|nothing, i.e. 0 or 1 a's */
68 #define	ANY		0x20000	/* Any character but newline, . */
69 #define	NOP		0x20001	/* No operation, internal use only */
70 #define	BOL		0x20002	/* Beginning of line, ^ */
71 #define	EOL		0x20003	/* End of line, $ */
72 #define	CCLASS		0x20004	/* Character class, [] */
73 #define	NCCLASS		0x20005	/* Negated character class, [^] */
74 #define	END		0x20077	/* Terminate: match found */
75 
76 #define	ISATOR		0x10000
77 #define	ISAND		0x20000
78 
79 /*
80  * Parser Information
81  */
82 typedef struct Node Node;
83 struct Node
84 {
85 	Inst	*first;
86 	Inst	*last;
87 };
88 
89 #define	NSTACK	20
90 Node	andstack[NSTACK];
91 Node	*andp;
92 int	atorstack[NSTACK];
93 int	*atorp;
94 int	lastwasand;	/* Last token was operand */
95 int	cursubid;
96 int	subidstack[NSTACK];
97 int	*subidp;
98 int	backwards;
99 int	nbra;
100 Rune	*exprp;		/* pointer to next character in source expression */
101 #define	DCLASS	10	/* allocation increment */
102 int	nclass;		/* number active */
103 int	Nclass;		/* high water mark */
104 Rune	**class;
105 int	negateclass;
106 
107 void	addinst(Ilist *l, Inst *inst, Rangeset *sep);
108 void	newmatch(Rangeset*);
109 void	bnewmatch(Rangeset*);
110 void	pushand(Inst*, Inst*);
111 void	pushator(int);
112 Node	*popand(int);
113 int	popator(void);
114 void	startlex(Rune*);
115 int	lex(void);
116 void	operator(int);
117 void	operand(int);
118 void	evaluntil(int);
119 void	optimize(Inst*);
120 void	bldcclass(void);
121 
122 void
123 regerror(Err e)
124 {
125 	Strzero(&lastregexp);
126 	error(e);
127 }
128 
129 void
130 regerror_c(Err e, int c)
131 {
132 	Strzero(&lastregexp);
133 	error_c(e, c);
134 }
135 
136 Inst *
137 newinst(int t)
138 {
139 	if(progp >= &program[NPROG])
140 		regerror(Etoolong);
141 	progp->type = t;
142 	progp->left = 0;
143 	progp->right = 0;
144 	return progp++;
145 }
146 
147 Inst *
148 realcompile(Rune *s)
149 {
150 	int token;
151 
152 	startlex(s);
153 	atorp = atorstack;
154 	andp = andstack;
155 	subidp = subidstack;
156 	cursubid = 0;
157 	lastwasand = FALSE;
158 	/* Start with a low priority operator to prime parser */
159 	pushator(START-1);
160 	while((token=lex()) != END){
161 		if((token&ISATOR) == OPERATOR)
162 			operator(token);
163 		else
164 			operand(token);
165 	}
166 	/* Close with a low priority operator */
167 	evaluntil(START);
168 	/* Force END */
169 	operand(END);
170 	evaluntil(START);
171 	if(nbra)
172 		regerror(Eleftpar);
173 	--andp;	/* points to first and only operand */
174 	return andp->first;
175 }
176 
177 void
178 compile(String *s)
179 {
180 	int i;
181 	Inst *oprogp;
182 
183 	if(Strcmp(s, &lastregexp)==0)
184 		return;
185 	for(i=0; i<nclass; i++)
186 		free(class[i]);
187 	nclass = 0;
188 	progp = program;
189 	backwards = FALSE;
190 	startinst = realcompile(s->s);
191 	optimize(program);
192 	oprogp = progp;
193 	backwards = TRUE;
194 	bstartinst = realcompile(s->s);
195 	optimize(oprogp);
196 	Strduplstr(&lastregexp, s);
197 }
198 
199 void
200 operand(int t)
201 {
202 	Inst *i;
203 	if(lastwasand)
204 		operator(CAT);	/* catenate is implicit */
205 	i = newinst(t);
206 	if(t == CCLASS){
207 		if(negateclass)
208 			i->type = NCCLASS;	/* UGH */
209 		i->rclass = nclass-1;		/* UGH */
210 	}
211 	pushand(i, i);
212 	lastwasand = TRUE;
213 }
214 
215 void
216 operator(int t)
217 {
218 	if(t==RBRA && --nbra<0)
219 		regerror(Erightpar);
220 	if(t==LBRA){
221 /*
222  *		if(++cursubid >= NSUBEXP)
223  *			regerror(Esubexp);
224  */
225 		cursubid++;	/* silently ignored */
226 		nbra++;
227 		if(lastwasand)
228 			operator(CAT);
229 	}else
230 		evaluntil(t);
231 	if(t!=RBRA)
232 		pushator(t);
233 	lastwasand = FALSE;
234 	if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
235 		lastwasand = TRUE;	/* these look like operands */
236 }
237 
238 void
239 cant(char *s)
240 {
241 	char buf[100];
242 
243 	sprint(buf, "regexp: can't happen: %s", s);
244 	panic(buf);
245 }
246 
247 void
248 pushand(Inst *f, Inst *l)
249 {
250 	if(andp >= &andstack[NSTACK])
251 		cant("operand stack overflow");
252 	andp->first = f;
253 	andp->last = l;
254 	andp++;
255 }
256 
257 void
258 pushator(int t)
259 {
260 	if(atorp >= &atorstack[NSTACK])
261 		cant("operator stack overflow");
262 	*atorp++=t;
263 	if(cursubid >= NSUBEXP)
264 		*subidp++= -1;
265 	else
266 		*subidp++=cursubid;
267 }
268 
269 Node *
270 popand(int op)
271 {
272 	if(andp <= &andstack[0])
273 		if(op)
274 			regerror_c(Emissop, op);
275 		else
276 			regerror(Ebadregexp);
277 	return --andp;
278 }
279 
280 int
281 popator(void)
282 {
283 	if(atorp <= &atorstack[0])
284 		cant("operator stack underflow");
285 	--subidp;
286 	return *--atorp;
287 }
288 
289 void
290 evaluntil(int pri)
291 {
292 	Node *op1, *op2, *t;
293 	Inst *inst1, *inst2;
294 
295 	while(pri==RBRA || atorp[-1]>=pri){
296 		switch(popator()){
297 		case LBRA:
298 			op1 = popand('(');
299 			inst2 = newinst(RBRA);
300 			inst2->subid = *subidp;
301 			op1->last->next = inst2;
302 			inst1 = newinst(LBRA);
303 			inst1->subid = *subidp;
304 			inst1->next = op1->first;
305 			pushand(inst1, inst2);
306 			return;		/* must have been RBRA */
307 		default:
308 			panic("unknown regexp operator");
309 			break;
310 		case OR:
311 			op2 = popand('|');
312 			op1 = popand('|');
313 			inst2 = newinst(NOP);
314 			op2->last->next = inst2;
315 			op1->last->next = inst2;
316 			inst1 = newinst(OR);
317 			inst1->right = op1->first;
318 			inst1->left = op2->first;
319 			pushand(inst1, inst2);
320 			break;
321 		case CAT:
322 			op2 = popand(0);
323 			op1 = popand(0);
324 			if(backwards && op2->first->type!=END)
325 				t = op1, op1 = op2, op2 = t;
326 			op1->last->next = op2->first;
327 			pushand(op1->first, op2->last);
328 			break;
329 		case STAR:
330 			op2 = popand('*');
331 			inst1 = newinst(OR);
332 			op2->last->next = inst1;
333 			inst1->right = op2->first;
334 			pushand(inst1, inst1);
335 			break;
336 		case PLUS:
337 			op2 = popand('+');
338 			inst1 = newinst(OR);
339 			op2->last->next = inst1;
340 			inst1->right = op2->first;
341 			pushand(op2->first, inst1);
342 			break;
343 		case QUEST:
344 			op2 = popand('?');
345 			inst1 = newinst(OR);
346 			inst2 = newinst(NOP);
347 			inst1->left = inst2;
348 			inst1->right = op2->first;
349 			op2->last->next = inst2;
350 			pushand(inst1, inst2);
351 			break;
352 		}
353 	}
354 }
355 
356 
357 void
358 optimize(Inst *start)
359 {
360 	Inst *inst, *target;
361 
362 	for(inst=start; inst->type!=END; inst++){
363 		target = inst->next;
364 		while(target->type == NOP)
365 			target = target->next;
366 		inst->next = target;
367 	}
368 }
369 
370 #ifdef	DEBUG
371 void
372 dumpstack(void){
373 	Node *stk;
374 	int *ip;
375 
376 	dprint("operators\n");
377 	for(ip = atorstack; ip<atorp; ip++)
378 		dprint("0%o\n", *ip);
379 	dprint("operands\n");
380 	for(stk = andstack; stk<andp; stk++)
381 		dprint("0%o\t0%o\n", stk->first->type, stk->last->type);
382 }
383 void
384 dump(void){
385 	Inst *l;
386 
387 	l = program;
388 	do{
389 		dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type,
390 			l->left-program, l->right-program);
391 	}while(l++->type);
392 }
393 #endif
394 
395 void
396 startlex(Rune *s)
397 {
398 	exprp = s;
399 	nbra = 0;
400 }
401 
402 
403 int
404 lex(void){
405 	int c= *exprp++;
406 
407 	switch(c){
408 	case '\\':
409 		if(*exprp)
410 			if((c= *exprp++)=='n')
411 				c='\n';
412 		break;
413 	case 0:
414 		c = END;
415 		--exprp;	/* In case we come here again */
416 		break;
417 	case '*':
418 		c = STAR;
419 		break;
420 	case '?':
421 		c = QUEST;
422 		break;
423 	case '+':
424 		c = PLUS;
425 		break;
426 	case '|':
427 		c = OR;
428 		break;
429 	case '.':
430 		c = ANY;
431 		break;
432 	case '(':
433 		c = LBRA;
434 		break;
435 	case ')':
436 		c = RBRA;
437 		break;
438 	case '^':
439 		c = BOL;
440 		break;
441 	case '$':
442 		c = EOL;
443 		break;
444 	case '[':
445 		c = CCLASS;
446 		bldcclass();
447 		break;
448 	}
449 	return c;
450 }
451 
452 long
453 nextrec(void){
454 	if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
455 		regerror(Ebadclass);
456 	if(exprp[0] == '\\'){
457 		exprp++;
458 		if(*exprp=='n'){
459 			exprp++;
460 			return '\n';
461 		}
462 		return *exprp++|0x10000;
463 	}
464 	return *exprp++;
465 }
466 
467 void
468 bldcclass(void)
469 {
470 	long c1, c2, n, na;
471 	Rune *classp;
472 
473 	classp = emalloc(DCLASS*RUNESIZE);
474 	n = 0;
475 	na = DCLASS;
476 	/* we have already seen the '[' */
477 	if(*exprp == '^'){
478 		classp[n++] = '\n';	/* don't match newline in negate case */
479 		negateclass = TRUE;
480 		exprp++;
481 	}else
482 		negateclass = FALSE;
483 	while((c1 = nextrec()) != ']'){
484 		if(c1 == '-'){
485     Error:
486 			free(classp);
487 			regerror(Ebadclass);
488 		}
489 		if(n+4 >= na){		/* 3 runes plus NUL */
490 			na += DCLASS;
491 			classp = erealloc(classp, na*RUNESIZE);
492 		}
493 		if(*exprp == '-'){
494 			exprp++;	/* eat '-' */
495 			if((c2 = nextrec()) == ']')
496 				goto Error;
497 			classp[n+0] = 0xFFFF;
498 			classp[n+1] = c1;
499 			classp[n+2] = c2;
500 			n += 3;
501 		}else
502 			classp[n++] = c1;
503 	}
504 	classp[n] = 0;
505 	if(nclass == Nclass){
506 		Nclass += DCLASS;
507 		class = erealloc(class, Nclass*sizeof(Rune*));
508 	}
509 	class[nclass++] = classp;
510 }
511 
512 int
513 classmatch(int classno, int c, int negate)
514 {
515 	Rune *p;
516 
517 	p = class[classno];
518 	while(*p){
519 		if(*p == 0xFFFF){
520 			if(p[1]<=c && c<=p[2])
521 				return !negate;
522 			p += 3;
523 		}else if(*p++ == c)
524 			return !negate;
525 	}
526 	return negate;
527 }
528 
529 /*
530  * Note optimization in addinst:
531  * 	*l must be pending when addinst called; if *l has been looked
532  *		at already, the optimization is a bug.
533  */
534 void
535 addinst(Ilist *l, Inst *inst, Rangeset *sep)
536 {
537 	Ilist *p;
538 
539 	for(p = l; p->inst; p++){
540 		if(p->inst==inst){
541 			if((sep)->p[0].p1 < p->se.p[0].p1)
542 				p->se= *sep;	/* this would be bug */
543 			return;	/* It's already there */
544 		}
545 	}
546 	p->inst = inst;
547 	p->se= *sep;
548 	(p+1)->inst = 0;
549 }
550 
551 int
552 execute(File *f, Posn startp, Posn eof)
553 {
554 	int flag = 0;
555 	Inst *inst;
556 	Ilist *tlp;
557 	Posn p = startp;
558 	int nnl = 0, ntl;
559 	int c;
560 	int wrapped = 0;
561 	int startchar = startinst->type<OPERATOR? startinst->type : 0;
562 
563 	list[0][0].inst = list[1][0].inst = 0;
564 	sel.p[0].p1 = -1;
565 	Fgetcset(f, startp);
566 	/* Execute machine once for each character */
567 	for(;;p++){
568 	doloop:
569 		c = Fgetc(f);
570 		if(p>=eof || c<0){
571 			switch(wrapped++){
572 			case 0:		/* let loop run one more click */
573 			case 2:
574 				break;
575 			case 1:		/* expired; wrap to beginning */
576 				if(sel.p[0].p1>=0 || eof!=INFINITY)
577 					goto Return;
578 				list[0][0].inst = list[1][0].inst = 0;
579 				Fgetcset(f, (Posn)0);
580 				p = 0;
581 				goto doloop;
582 			default:
583 				goto Return;
584 			}
585 		}else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nnl==0)
586 			break;
587 		/* fast check for first char */
588 		if(startchar && nnl==0 && c!=startchar)
589 			continue;
590 		tl = list[flag];
591 		nl = list[flag^=1];
592 		nl->inst = 0;
593 		ntl = nnl;
594 		nnl = 0;
595 		if(sel.p[0].p1<0 && (!wrapped || p<startp || startp==eof)){
596 			/* Add first instruction to this list */
597 			if(++ntl >= NLIST)
598 	Overflow:
599 				error(Eoverflow);
600 			sempty.p[0].p1 = p;
601 			addinst(tl, startinst, &sempty);
602 		}
603 		/* Execute machine until this list is empty */
604 		for(tlp = tl; inst = tlp->inst; tlp++){	/* assignment = */
605 	Switchstmt:
606 			switch(inst->type){
607 			default:	/* regular character */
608 				if(inst->type==c){
609 	Addinst:
610 					if(++nnl >= NLIST)
611 						goto Overflow;
612 					addinst(nl, inst->next, &tlp->se);
613 				}
614 				break;
615 			case LBRA:
616 				if(inst->subid>=0)
617 					tlp->se.p[inst->subid].p1 = p;
618 				inst = inst->next;
619 				goto Switchstmt;
620 			case RBRA:
621 				if(inst->subid>=0)
622 					tlp->se.p[inst->subid].p2 = p;
623 				inst = inst->next;
624 				goto Switchstmt;
625 			case ANY:
626 				if(c!='\n')
627 					goto Addinst;
628 				break;
629 			case BOL:
630 				if(p == 0){
631 	Step:
632 					inst = inst->next;
633 					goto Switchstmt;
634 				}
635 				if(f->getci > 1){
636 					if(f->getcbuf[f->getci-2]=='\n')
637 						goto Step;
638 				}else{
639 					Rune c;
640 					if(Fchars(f, &c, p-1, p)==1 && c=='\n')
641 						goto Step;
642 				}
643 				break;
644 			case EOL:
645 				if(c == '\n')
646 					goto Step;
647 				break;
648 			case CCLASS:
649 				if(c>=0 && classmatch(inst->rclass, c, 0))
650 					goto Addinst;
651 				break;
652 			case NCCLASS:
653 				if(c>=0 && classmatch(inst->rclass, c, 1))
654 					goto Addinst;
655 				break;
656 			case OR:
657 				/* evaluate right choice later */
658 				if(++ntl >= NLIST)
659 					goto Overflow;
660 				addinst(tlp, inst->right, &tlp->se);
661 				/* efficiency: advance and re-evaluate */
662 				inst = inst->left;
663 				goto Switchstmt;
664 			case END:	/* Match! */
665 				tlp->se.p[0].p2 = p;
666 				newmatch(&tlp->se);
667 				break;
668 			}
669 		}
670 	}
671     Return:
672 	return sel.p[0].p1>=0;
673 }
674 
675 void
676 newmatch(Rangeset *sp)
677 {
678 	int i;
679 
680 	if(sel.p[0].p1<0 || sp->p[0].p1<sel.p[0].p1 ||
681 	   (sp->p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2))
682 		for(i = 0; i<NSUBEXP; i++)
683 			sel.p[i] = sp->p[i];
684 }
685 
686 int
687 bexecute(File *f, Posn startp)
688 {
689 	int flag = 0;
690 	Inst *inst;
691 	Ilist *tlp;
692 	Posn p = startp;
693 	int nnl = 0, ntl;
694 	int c;
695 	int wrapped = 0;
696 	int startchar = bstartinst->type<OPERATOR? bstartinst->type : 0;
697 
698 	list[0][0].inst = list[1][0].inst = 0;
699 	sel.p[0].p1= -1;
700 	Fgetcset(f, startp);
701 	/* Execute machine once for each character, including terminal NUL */
702 	for(;;--p){
703 	doloop:
704 		if((c = Fbgetc(f))==-1){
705 			switch(wrapped++){
706 			case 0:		/* let loop run one more click */
707 			case 2:
708 				break;
709 			case 1:		/* expired; wrap to end */
710 				if(sel.p[0].p1>=0)
711 			case 3:
712 					goto Return;
713 				list[0][0].inst = list[1][0].inst = 0;
714 				Fgetcset(f, f->nrunes);
715 				p = f->nrunes;
716 				goto doloop;
717 			default:
718 				goto Return;
719 			}
720 		}else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nnl==0)
721 			break;
722 		/* fast check for first char */
723 		if(startchar && nnl==0 && c!=startchar)
724 			continue;
725 		tl = list[flag];
726 		nl = list[flag^=1];
727 		nl->inst = 0;
728 		ntl = nnl;
729 		nnl = 0;
730 		if(sel.p[0].p1<0 && (!wrapped || p>startp)){
731 			/* Add first instruction to this list */
732 			if(++ntl >= NLIST)
733 	Overflow:
734 				error(Eoverflow);
735 			/* the minus is so the optimizations in addinst work */
736 			sempty.p[0].p1 = -p;
737 			addinst(tl, bstartinst, &sempty);
738 		}
739 		/* Execute machine until this list is empty */
740 		for(tlp = tl; inst = tlp->inst; tlp++){	/* assignment = */
741 	Switchstmt:
742 			switch(inst->type){
743 			default:	/* regular character */
744 				if(inst->type == c){
745 	Addinst:
746 					if(++nnl >= NLIST)
747 						goto Overflow;
748 					addinst(nl, inst->next, &tlp->se);
749 				}
750 				break;
751 			case LBRA:
752 				if(inst->subid>=0)
753 					tlp->se.p[inst->subid].p1 = p;
754 				inst = inst->next;
755 				goto Switchstmt;
756 			case RBRA:
757 				if(inst->subid >= 0)
758 					tlp->se.p[inst->subid].p2 = p;
759 				inst = inst->next;
760 				goto Switchstmt;
761 			case ANY:
762 				if(c != '\n')
763 					goto Addinst;
764 				break;
765 			case BOL:
766 				if(c=='\n' || p==0){
767 	Step:
768 					inst = inst->next;
769 					goto Switchstmt;
770 				}
771 				break;
772 			case EOL:
773 				if(f->getci<f->ngetc-1){
774 					if(f->getcbuf[f->getci+1]=='\n')
775 						goto Step;
776 				}else if(p<f->nrunes-1){
777 					Rune c;
778 					if(Fchars(f, &c, p, p+1)==1 && c=='\n')
779 						goto Step;
780 				}
781 				break;
782 			case CCLASS:
783 				if(c>=0 && classmatch(inst->rclass, c, 0))
784 					goto Addinst;
785 				break;
786 			case NCCLASS:
787 				if(c>=0 && classmatch(inst->rclass, c, 1))
788 					goto Addinst;
789 				break;
790 			case OR:
791 				/* evaluate right choice later */
792 				if(++ntl >= NLIST)
793 					goto Overflow;
794 				addinst(tlp, inst->right, &tlp->se);
795 				/* efficiency: advance and re-evaluate */
796 				inst = inst->left;
797 				goto Switchstmt;
798 			case END:	/* Match! */
799 				tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* minus sign */
800 				tlp->se.p[0].p2 = p;
801 				bnewmatch(&tlp->se);
802 				break;
803 			}
804 		}
805 	}
806     Return:
807 	return sel.p[0].p1>=0;
808 }
809 
810 void
811 bnewmatch(Rangeset *sp)
812 {
813         int  i;
814         if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 || (sp->p[0].p1==sel.p[0].p2 && sp->p[0].p2<sel.p[0].p1))
815                 for(i = 0; i<NSUBEXP; i++){       /* note the reversal; p1<=p2 */
816                         sel.p[i].p1 = sp->p[i].p2;
817                         sel.p[i].p2 = sp->p[i].p1;
818                 }
819 }
820