xref: /plan9/sys/src/cmd/acme/regx.c (revision ec59a3ddbfceee0efe34584c2c9981a5e5ff1ec4)
1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <thread.h>
5 #include <cursor.h>
6 #include <mouse.h>
7 #include <keyboard.h>
8 #include <frame.h>
9 #include <fcall.h>
10 #include <plumb.h>
11 #include "dat.h"
12 #include "fns.h"
13 
14 Rangeset	sel;
15 Rune		*lastregexp;
16 
17 /*
18  * Machine Information
19  */
20 typedef struct Inst Inst;
21 struct Inst
22 {
23 	uint	type;	/* < 0x10000 ==> literal, otherwise action */
24 	union {
25 		int sid;
26 		int subid;
27 		int class;
28 		Inst *other;
29 		Inst *right;
30 	};
31 	union{
32 		Inst *left;
33 		Inst *next;
34 	};
35 };
36 
37 #define	NPROG	1024
38 Inst	program[NPROG];
39 Inst	*progp;
40 Inst	*startinst;	/* First inst. of program; might not be program[0] */
41 Inst	*bstartinst;	/* same for backwards machine */
42 Channel	*rechan;	/* chan(Inst*) */
43 
44 typedef struct Ilist Ilist;
45 struct Ilist
46 {
47 	Inst	*inst;		/* Instruction of the thread */
48 	Rangeset se;
49 	uint	startp;		/* first char of match */
50 };
51 
52 #define	NLIST	128
53 
54 Ilist	*tl, *nl;	/* This list, next list */
55 Ilist	list[2][NLIST];
56 static	Rangeset sempty;
57 
58 /*
59  * Actions and Tokens
60  *
61  *	0x100xx are operators, value == precedence
62  *	0x200xx are tokens, i.e. operands for operators
63  */
64 #define	OPERATOR	0x10000	/* Bitmask of all operators */
65 #define	START		0x10000	/* Start, used for marker on stack */
66 #define	RBRA		0x10001	/* Right bracket, ) */
67 #define	LBRA		0x10002	/* Left bracket, ( */
68 #define	OR		0x10003	/* Alternation, | */
69 #define	CAT		0x10004	/* Concatentation, implicit operator */
70 #define	STAR		0x10005	/* Closure, * */
71 #define	PLUS		0x10006	/* a+ == aa* */
72 #define	QUEST		0x10007	/* a? == a|nothing, i.e. 0 or 1 a's */
73 #define	ANY		0x20000	/* Any character but newline, . */
74 #define	NOP		0x20001	/* No operation, internal use only */
75 #define	BOL		0x20002	/* Beginning of line, ^ */
76 #define	EOL		0x20003	/* End of line, $ */
77 #define	CCLASS		0x20004	/* Character class, [] */
78 #define	NCCLASS		0x20005	/* Negated character class, [^] */
79 #define	END		0x20077	/* Terminate: match found */
80 
81 #define	ISATOR		0x10000
82 #define	ISAND		0x20000
83 
84 /*
85  * Parser Information
86  */
87 typedef struct Node Node;
88 struct Node
89 {
90 	Inst	*first;
91 	Inst	*last;
92 };
93 
94 #define	NSTACK	20
95 Node	andstack[NSTACK];
96 Node	*andp;
97 int	atorstack[NSTACK];
98 int	*atorp;
99 int	lastwasand;	/* Last token was operand */
100 int	cursubid;
101 int	subidstack[NSTACK];
102 int	*subidp;
103 int	backwards;
104 int	nbra;
105 Rune	*exprp;		/* pointer to next character in source expression */
106 #define	DCLASS	10	/* allocation increment */
107 int	nclass;		/* number active */
108 int	Nclass;		/* high water mark */
109 Rune	**class;
110 int	negateclass;
111 
112 void	addinst(Ilist *l, Inst *inst, Rangeset *sep);
113 void	newmatch(Rangeset*);
114 void	bnewmatch(Rangeset*);
115 void	pushand(Inst*, Inst*);
116 void	pushator(int);
117 Node	*popand(int);
118 int	popator(void);
119 void	startlex(Rune*);
120 int	lex(void);
121 void	operator(int);
122 void	operand(int);
123 void	evaluntil(int);
124 void	optimize(Inst*);
125 void	bldcclass(void);
126 
127 void
128 rxinit(void)
129 {
130 	rechan = chancreate(sizeof(Inst*), 0);
131 	lastregexp = runemalloc(1);
132 }
133 
134 void
135 regerror(char *e)
136 {
137 	lastregexp[0] = 0;
138 	warning(nil, "regexp: %s\n", e);
139 	sendp(rechan, nil);
140 	threadexits(nil);
141 }
142 
143 Inst *
144 newinst(int t)
145 {
146 	if(progp >= &program[NPROG])
147 		regerror("expression too long");
148 	progp->type = t;
149 	progp->left = nil;
150 	progp->right = nil;
151 	return progp++;
152 }
153 
154 void
155 realcompile(void *arg)
156 {
157 	int token;
158 	Rune *s;
159 
160 	threadsetname("regcomp");
161 	s = arg;
162 	startlex(s);
163 	atorp = atorstack;
164 	andp = andstack;
165 	subidp = subidstack;
166 	cursubid = 0;
167 	lastwasand = FALSE;
168 	/* Start with a low priority operator to prime parser */
169 	pushator(START-1);
170 	while((token=lex()) != END){
171 		if((token&ISATOR) == OPERATOR)
172 			operator(token);
173 		else
174 			operand(token);
175 	}
176 	/* Close with a low priority operator */
177 	evaluntil(START);
178 	/* Force END */
179 	operand(END);
180 	evaluntil(START);
181 	if(nbra)
182 		regerror("unmatched `('");
183 	--andp;	/* points to first and only operand */
184 	sendp(rechan, andp->first);
185 	threadexits(nil);
186 }
187 
188 /* r is null terminated */
189 int
190 rxcompile(Rune *r)
191 {
192 	int i, nr;
193 	Inst *oprogp;
194 
195 	nr = runestrlen(r)+1;
196 	if(runeeq(lastregexp, runestrlen(lastregexp)+1, r, nr)==TRUE)
197 		return TRUE;
198 	lastregexp[0] = 0;
199 	for(i=0; i<nclass; i++)
200 		free(class[i]);
201 	nclass = 0;
202 	progp = program;
203 	backwards = FALSE;
204 	bstartinst = nil;
205 	threadcreate(realcompile, r, STACK);
206 	startinst = recvp(rechan);
207 	if(startinst == nil)
208 		return FALSE;
209 	optimize(program);
210 	oprogp = progp;
211 	backwards = TRUE;
212 	threadcreate(realcompile, r, STACK);
213 	bstartinst = recvp(rechan);
214 	if(bstartinst == nil)
215 		return FALSE;
216 	optimize(oprogp);
217 	lastregexp = runerealloc(lastregexp, nr);
218 	runemove(lastregexp, r, nr);
219 	return TRUE;
220 }
221 
222 void
223 operand(int t)
224 {
225 	Inst *i;
226 	if(lastwasand)
227 		operator(CAT);	/* catenate is implicit */
228 	i = newinst(t);
229 	if(t == CCLASS){
230 		if(negateclass)
231 			i->type = NCCLASS;	/* UGH */
232 		i->class = nclass-1;		/* UGH */
233 	}
234 	pushand(i, i);
235 	lastwasand = TRUE;
236 }
237 
238 void
239 operator(int t)
240 {
241 	if(t==RBRA && --nbra<0)
242 		regerror("unmatched `)'");
243 	if(t==LBRA){
244 		cursubid++;	/* silently ignored */
245 		nbra++;
246 		if(lastwasand)
247 			operator(CAT);
248 	}else
249 		evaluntil(t);
250 	if(t!=RBRA)
251 		pushator(t);
252 	lastwasand = FALSE;
253 	if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
254 		lastwasand = TRUE;	/* these look like operands */
255 }
256 
257 void
258 pushand(Inst *f, Inst *l)
259 {
260 	if(andp >= &andstack[NSTACK])
261 		error("operand stack overflow");
262 	andp->first = f;
263 	andp->last = l;
264 	andp++;
265 }
266 
267 void
268 pushator(int t)
269 {
270 	if(atorp >= &atorstack[NSTACK])
271 		error("operator stack overflow");
272 	*atorp++=t;
273 	if(cursubid >= NRange)
274 		*subidp++= -1;
275 	else
276 		*subidp++=cursubid;
277 }
278 
279 Node *
280 popand(int op)
281 {
282 	char buf[64];
283 
284 	if(andp <= &andstack[0])
285 		if(op){
286 			sprint(buf, "missing operand for %c", op);
287 			regerror(buf);
288 		}else
289 			regerror("malformed regexp");
290 	return --andp;
291 }
292 
293 int
294 popator()
295 {
296 	if(atorp <= &atorstack[0])
297 		error("operator stack underflow");
298 	--subidp;
299 	return *--atorp;
300 }
301 
302 void
303 evaluntil(int pri)
304 {
305 	Node *op1, *op2, *t;
306 	Inst *inst1, *inst2;
307 
308 	while(pri==RBRA || atorp[-1]>=pri){
309 		switch(popator()){
310 		case LBRA:
311 			op1 = popand('(');
312 			inst2 = newinst(RBRA);
313 			inst2->subid = *subidp;
314 			op1->last->next = inst2;
315 			inst1 = newinst(LBRA);
316 			inst1->subid = *subidp;
317 			inst1->next = op1->first;
318 			pushand(inst1, inst2);
319 			return;		/* must have been RBRA */
320 		default:
321 			error("unknown regexp operator");
322 			break;
323 		case OR:
324 			op2 = popand('|');
325 			op1 = popand('|');
326 			inst2 = newinst(NOP);
327 			op2->last->next = inst2;
328 			op1->last->next = inst2;
329 			inst1 = newinst(OR);
330 			inst1->right = op1->first;
331 			inst1->left = op2->first;
332 			pushand(inst1, inst2);
333 			break;
334 		case CAT:
335 			op2 = popand(0);
336 			op1 = popand(0);
337 			if(backwards && op2->first->type!=END){
338 				t = op1;
339 				op1 = op2;
340 				op2 = t;
341 			}
342 			op1->last->next = op2->first;
343 			pushand(op1->first, op2->last);
344 			break;
345 		case STAR:
346 			op2 = popand('*');
347 			inst1 = newinst(OR);
348 			op2->last->next = inst1;
349 			inst1->right = op2->first;
350 			pushand(inst1, inst1);
351 			break;
352 		case PLUS:
353 			op2 = popand('+');
354 			inst1 = newinst(OR);
355 			op2->last->next = inst1;
356 			inst1->right = op2->first;
357 			pushand(op2->first, inst1);
358 			break;
359 		case QUEST:
360 			op2 = popand('?');
361 			inst1 = newinst(OR);
362 			inst2 = newinst(NOP);
363 			inst1->left = inst2;
364 			inst1->right = op2->first;
365 			op2->last->next = inst2;
366 			pushand(inst1, inst2);
367 			break;
368 		}
369 	}
370 }
371 
372 
373 void
374 optimize(Inst *start)
375 {
376 	Inst *inst, *target;
377 
378 	for(inst=start; inst->type!=END; inst++){
379 		target = inst->next;
380 		while(target->type == NOP)
381 			target = target->next;
382 		inst->next = target;
383 	}
384 }
385 
386 void
387 startlex(Rune *s)
388 {
389 	exprp = s;
390 	nbra = 0;
391 }
392 
393 
394 int
395 lex(void){
396 	int c;
397 
398 	c = *exprp++;
399 	switch(c){
400 	case '\\':
401 		if(*exprp)
402 			if((c= *exprp++)=='n')
403 				c='\n';
404 		break;
405 	case 0:
406 		c = END;
407 		--exprp;	/* In case we come here again */
408 		break;
409 	case '*':
410 		c = STAR;
411 		break;
412 	case '?':
413 		c = QUEST;
414 		break;
415 	case '+':
416 		c = PLUS;
417 		break;
418 	case '|':
419 		c = OR;
420 		break;
421 	case '.':
422 		c = ANY;
423 		break;
424 	case '(':
425 		c = LBRA;
426 		break;
427 	case ')':
428 		c = RBRA;
429 		break;
430 	case '^':
431 		c = BOL;
432 		break;
433 	case '$':
434 		c = EOL;
435 		break;
436 	case '[':
437 		c = CCLASS;
438 		bldcclass();
439 		break;
440 	}
441 	return c;
442 }
443 
444 int
445 nextrec(void)
446 {
447 	if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
448 		regerror("malformed `[]'");
449 	if(exprp[0] == '\\'){
450 		exprp++;
451 		if(*exprp=='n'){
452 			exprp++;
453 			return '\n';
454 		}
455 		return *exprp++|0x10000;
456 	}
457 	return *exprp++;
458 }
459 
460 void
461 bldcclass(void)
462 {
463 	int c1, c2, n, na;
464 	Rune *classp;
465 
466 	classp = runemalloc(DCLASS);
467 	n = 0;
468 	na = DCLASS;
469 	/* we have already seen the '[' */
470 	if(*exprp == '^'){
471 		classp[n++] = '\n';	/* don't match newline in negate case */
472 		negateclass = TRUE;
473 		exprp++;
474 	}else
475 		negateclass = FALSE;
476 	while((c1 = nextrec()) != ']'){
477 		if(c1 == '-'){
478     Error:
479 			free(classp);
480 			regerror("malformed `[]'");
481 		}
482 		if(n+4 >= na){		/* 3 runes plus NUL */
483 			na += DCLASS;
484 			classp = runerealloc(classp, na);
485 		}
486 		if(*exprp == '-'){
487 			exprp++;	/* eat '-' */
488 			if((c2 = nextrec()) == ']')
489 				goto Error;
490 			classp[n+0] = 0xFFFF;
491 			classp[n+1] = c1;
492 			classp[n+2] = c2;
493 			n += 3;
494 		}else
495 			classp[n++] = c1;
496 	}
497 	classp[n] = 0;
498 	if(nclass == Nclass){
499 		Nclass += DCLASS;
500 		class = realloc(class, Nclass*sizeof(Rune*));
501 	}
502 	class[nclass++] = classp;
503 }
504 
505 int
506 classmatch(int classno, int c, int negate)
507 {
508 	Rune *p;
509 
510 	p = class[classno];
511 	while(*p){
512 		if(*p == 0xFFFF){
513 			if(p[1]<=c && c<=p[2])
514 				return !negate;
515 			p += 3;
516 		}else if(*p++ == c)
517 			return !negate;
518 	}
519 	return negate;
520 }
521 
522 /*
523  * Note optimization in addinst:
524  * 	*l must be pending when addinst called; if *l has been looked
525  *		at already, the optimization is a bug.
526  */
527 void
528 addinst(Ilist *l, Inst *inst, Rangeset *sep)
529 {
530 	Ilist *p;
531 
532 	for(p = l; p->inst; p++){
533 		if(p->inst==inst){
534 			if((sep)->r[0].q0 < p->se.r[0].q0)
535 				p->se= *sep;	/* this would be bug */
536 			return;	/* It's already there */
537 		}
538 	}
539 	p->inst = inst;
540 	p->se= *sep;
541 	(p+1)->inst = nil;
542 }
543 
544 int
545 rxnull(void)
546 {
547 	return startinst==nil || bstartinst==nil;
548 }
549 
550 /* either t!=nil or r!=nil, and we match the string in the appropriate place */
551 int
552 rxexecute(Text *t, Rune *r, uint startp, uint eof, Rangeset *rp)
553 {
554 	int flag;
555 	Inst *inst;
556 	Ilist *tlp;
557 	uint p;
558 	int nnl, ntl;
559 	int nc, c;
560 	int wrapped;
561 	int startchar;
562 
563 	flag = 0;
564 	p = startp;
565 	startchar = 0;
566 	wrapped = 0;
567 	nnl = 0;
568 	if(startinst->type<OPERATOR)
569 		startchar = startinst->type;
570 	list[0][0].inst = list[1][0].inst = nil;
571 	sel.r[0].q0 = -1;
572 	if(t != nil)
573 		nc = t->file->nc;
574 	else
575 		nc = runestrlen(r);
576 	/* Execute machine once for each character */
577 	for(;;p++){
578 	doloop:
579 		if(p>=eof || p>=nc){
580 			switch(wrapped++){
581 			case 0:		/* let loop run one more click */
582 			case 2:
583 				break;
584 			case 1:		/* expired; wrap to beginning */
585 				if(sel.r[0].q0>=0 || eof!=Infinity)
586 					goto Return;
587 				list[0][0].inst = list[1][0].inst = nil;
588 				p = 0;
589 				goto doloop;
590 			default:
591 				goto Return;
592 			}
593 			c = 0;
594 		}else{
595 			if(((wrapped && p>=startp) || sel.r[0].q0>0) && nnl==0)
596 				break;
597 			if(t != nil)
598 				c = textreadc(t, p);
599 			else
600 				c = r[p];
601 		}
602 		/* fast check for first char */
603 		if(startchar && nnl==0 && c!=startchar)
604 			continue;
605 		tl = list[flag];
606 		nl = list[flag^=1];
607 		nl->inst = nil;
608 		ntl = nnl;
609 		nnl = 0;
610 		if(sel.r[0].q0<0 && (!wrapped || p<startp || startp==eof)){
611 			/* Add first instruction to this list */
612 			if(++ntl >= NLIST){
613 	Overflow:
614 				warning(nil, "regexp list overflow\n");
615 				sel.r[0].q0 = -1;
616 				goto Return;
617 			}
618 			sempty.r[0].q0 = p;
619 			addinst(tl, startinst, &sempty);
620 		}
621 		/* Execute machine until this list is empty */
622 		for(tlp = tl; inst = tlp->inst; tlp++){	/* assignment = */
623 	Switchstmt:
624 			switch(inst->type){
625 			default:	/* regular character */
626 				if(inst->type==c){
627 	Addinst:
628 					if(++nnl >= NLIST)
629 						goto Overflow;
630 					addinst(nl, inst->next, &tlp->se);
631 				}
632 				break;
633 			case LBRA:
634 				if(inst->subid>=0)
635 					tlp->se.r[inst->subid].q0 = p;
636 				inst = inst->next;
637 				goto Switchstmt;
638 			case RBRA:
639 				if(inst->subid>=0)
640 					tlp->se.r[inst->subid].q1 = p;
641 				inst = inst->next;
642 				goto Switchstmt;
643 			case ANY:
644 				if(c!='\n')
645 					goto Addinst;
646 				break;
647 			case BOL:
648 				if(p==0 || (t!=nil && textreadc(t, p-1)=='\n') || (r!=nil && r[p-1]=='\n')){
649 	Step:
650 					inst = inst->next;
651 					goto Switchstmt;
652 				}
653 				break;
654 			case EOL:
655 				if(c == '\n')
656 					goto Step;
657 				break;
658 			case CCLASS:
659 				if(c>=0 && classmatch(inst->class, c, 0))
660 					goto Addinst;
661 				break;
662 			case NCCLASS:
663 				if(c>=0 && classmatch(inst->class, c, 1))
664 					goto Addinst;
665 				break;
666 			case OR:
667 				/* evaluate right choice later */
668 				if(++ntl >= NLIST)
669 					goto Overflow;
670 				addinst(tlp, inst->right, &tlp->se);
671 				/* efficiency: advance and re-evaluate */
672 				inst = inst->left;
673 				goto Switchstmt;
674 			case END:	/* Match! */
675 				tlp->se.r[0].q1 = p;
676 				newmatch(&tlp->se);
677 				break;
678 			}
679 		}
680 	}
681     Return:
682 	*rp = sel;
683 	return sel.r[0].q0 >= 0;
684 }
685 
686 void
687 newmatch(Rangeset *sp)
688 {
689 	if(sel.r[0].q0<0 || sp->r[0].q0<sel.r[0].q0 ||
690 	   (sp->r[0].q0==sel.r[0].q0 && sp->r[0].q1>sel.r[0].q1))
691 		sel = *sp;
692 }
693 
694 int
695 rxbexecute(Text *t, uint startp, Rangeset *rp)
696 {
697 	int flag;
698 	Inst *inst;
699 	Ilist *tlp;
700 	int p;
701 	int nnl, ntl;
702 	int c;
703 	int wrapped;
704 	int startchar;
705 
706 	flag = 0;
707 	nnl = 0;
708 	wrapped = 0;
709 	p = startp;
710 	startchar = 0;
711 	if(bstartinst->type<OPERATOR)
712 		startchar = bstartinst->type;
713 	list[0][0].inst = list[1][0].inst = nil;
714 	sel.r[0].q0= -1;
715 	/* Execute machine once for each character, including terminal NUL */
716 	for(;;--p){
717 	doloop:
718 		if(p <= 0){
719 			switch(wrapped++){
720 			case 0:		/* let loop run one more click */
721 			case 2:
722 				break;
723 			case 1:		/* expired; wrap to end */
724 				if(sel.r[0].q0>=0)
725 					goto Return;
726 				list[0][0].inst = list[1][0].inst = nil;
727 				p = t->file->nc;
728 				goto doloop;
729 			case 3:
730 			default:
731 				goto Return;
732 			}
733 			c = 0;
734 		}else{
735 			if(((wrapped && p<=startp) || sel.r[0].q0>0) && nnl==0)
736 				break;
737 			c = textreadc(t, p-1);
738 		}
739 		/* fast check for first char */
740 		if(startchar && nnl==0 && c!=startchar)
741 			continue;
742 		tl = list[flag];
743 		nl = list[flag^=1];
744 		nl->inst = nil;
745 		ntl = nnl;
746 		nnl = 0;
747 		if(sel.r[0].q0<0 && (!wrapped || p>startp)){
748 			/* Add first instruction to this list */
749 			if(++ntl >= NLIST){
750 	Overflow:
751 				warning(nil, "regexp list overflow\n");
752 				sel.r[0].q0 = -1;
753 				goto Return;
754 			}
755 			/* the minus is so the optimizations in addinst work */
756 			sempty.r[0].q0 = -p;
757 			addinst(tl, bstartinst, &sempty);
758 		}
759 		/* Execute machine until this list is empty */
760 		for(tlp = tl; inst = tlp->inst; tlp++){	/* assignment = */
761 	Switchstmt:
762 			switch(inst->type){
763 			default:	/* regular character */
764 				if(inst->type == c){
765 	Addinst:
766 					if(++nnl >= NLIST)
767 						goto Overflow;
768 					addinst(nl, inst->next, &tlp->se);
769 				}
770 				break;
771 			case LBRA:
772 				if(inst->subid>=0)
773 					tlp->se.r[inst->subid].q0 = p;
774 				inst = inst->next;
775 				goto Switchstmt;
776 			case RBRA:
777 				if(inst->subid >= 0)
778 					tlp->se.r[inst->subid].q1 = p;
779 				inst = inst->next;
780 				goto Switchstmt;
781 			case ANY:
782 				if(c != '\n')
783 					goto Addinst;
784 				break;
785 			case BOL:
786 				if(c=='\n' || p==0){
787 	Step:
788 					inst = inst->next;
789 					goto Switchstmt;
790 				}
791 				break;
792 			case EOL:
793 				if(p<t->file->nc && textreadc(t, p)=='\n')
794 					goto Step;
795 				break;
796 			case CCLASS:
797 				if(c>0 && classmatch(inst->class, c, 0))
798 					goto Addinst;
799 				break;
800 			case NCCLASS:
801 				if(c>0 && classmatch(inst->class, c, 1))
802 					goto Addinst;
803 				break;
804 			case OR:
805 				/* evaluate right choice later */
806 				if(++ntl >= NLIST)
807 					goto Overflow;
808 				addinst(tlp, inst->right, &tlp->se);
809 				/* efficiency: advance and re-evaluate */
810 				inst = inst->left;
811 				goto Switchstmt;
812 			case END:	/* Match! */
813 				tlp->se.r[0].q0 = -tlp->se.r[0].q0; /* minus sign */
814 				tlp->se.r[0].q1 = p;
815 				bnewmatch(&tlp->se);
816 				break;
817 			}
818 		}
819 	}
820     Return:
821 	*rp = sel;
822 	return sel.r[0].q0 >= 0;
823 }
824 
825 void
826 bnewmatch(Rangeset *sp)
827 {
828         int  i;
829 
830         if(sel.r[0].q0<0 || sp->r[0].q0>sel.r[0].q1 || (sp->r[0].q0==sel.r[0].q1 && sp->r[0].q1<sel.r[0].q0))
831                 for(i = 0; i<NRange; i++){       /* note the reversal; q0<=q1 */
832                         sel.r[i].q0 = sp->r[i].q1;
833                         sel.r[i].q1 = sp->r[i].q0;
834                 }
835 }
836