1 #include <u.h>
2 #include <libc.h>
3 #include <draw.h>
4 #include <thread.h>
5 #include <cursor.h>
6 #include <mouse.h>
7 #include <keyboard.h>
8 #include <frame.h>
9 #include <fcall.h>
10 #include <plumb.h>
11 #include "dat.h"
12 #include "fns.h"
13
14 Rangeset sel;
15 Rune *lastregexp;
16
17 /*
18 * Machine Information
19 */
20 typedef struct Inst Inst;
21 struct Inst
22 {
23 uint type; /* <= Runemax+1 ==> literal, otherwise action */
24 union {
25 int sid;
26 int subid;
27 int class;
28 Inst *other;
29 Inst *right;
30 };
31 union{
32 Inst *left;
33 Inst *next;
34 };
35 };
36
37 #define NPROG 1024
38 Inst program[NPROG];
39 Inst *progp;
40 Inst *startinst; /* First inst. of program; might not be program[0] */
41 Inst *bstartinst; /* same for backwards machine */
42 Channel *rechan; /* chan(Inst*) */
43
44 typedef struct Ilist Ilist;
45 struct Ilist
46 {
47 Inst *inst; /* Instruction of the thread */
48 Rangeset se;
49 uint startp; /* first char of match */
50 };
51
52 #define NLIST 127
53
54 Ilist *tl, *nl; /* This list, next list */
55 Ilist list[2][NLIST+1]; /* +1 for trailing null */
56 static Rangeset sempty;
57
58 /*
59 * Actions and Tokens
60 *
61 * 0x100xx are operators, value == precedence
62 * 0x200xx are tokens, i.e. operands for operators
63 */
64 enum {
65 OPERATOR = Runemask+1, /* Bitmask of all operators */
66 START = OPERATOR, /* Start, used for marker on stack */
67 RBRA, /* Right bracket, ) */
68 LBRA, /* Left bracket, ( */
69 OR, /* Alternation, | */
70 CAT, /* Concatentation, implicit operator */
71 STAR, /* Closure, * */
72 PLUS, /* a+ == aa* */
73 QUEST, /* a? == a|nothing, i.e. 0 or 1 a's */
74
75 ANY = OPERATOR<<1, /* Any character but newline, . */
76 NOP, /* No operation, internal use only */
77 BOL, /* Beginning of line, ^ */
78 EOL, /* End of line, $ */
79 CCLASS, /* Character class, [] */
80 NCCLASS, /* Negated character class, [^] */
81 END, /* Terminate: match found */
82
83 ISATOR = OPERATOR,
84 ISAND = OPERATOR<<1,
85 };
86
87 /*
88 * Parser Information
89 */
90 typedef struct Node Node;
91 struct Node
92 {
93 Inst *first;
94 Inst *last;
95 };
96
97 #define NSTACK 20
98 Node andstack[NSTACK];
99 Node *andp;
100 int atorstack[NSTACK];
101 int *atorp;
102 int lastwasand; /* Last token was operand */
103 int cursubid;
104 int subidstack[NSTACK];
105 int *subidp;
106 int backwards;
107 int nbra;
108 Rune *exprp; /* pointer to next character in source expression */
109 #define DCLASS 10 /* allocation increment */
110 int nclass; /* number active */
111 int Nclass; /* high water mark */
112 Rune **class;
113 int negateclass;
114
115 int addinst(Ilist *l, Inst *inst, Rangeset *sep);
116 void newmatch(Rangeset*);
117 void bnewmatch(Rangeset*);
118 void pushand(Inst*, Inst*);
119 void pushator(int);
120 Node *popand(int);
121 int popator(void);
122 void startlex(Rune*);
123 int lex(void);
124 void operator(int);
125 void operand(int);
126 void evaluntil(int);
127 void optimize(Inst*);
128 void bldcclass(void);
129
130 void
rxinit(void)131 rxinit(void)
132 {
133 rechan = chancreate(sizeof(Inst*), 0);
134 lastregexp = runemalloc(1);
135 }
136
137 void
regerror(char * e)138 regerror(char *e)
139 {
140 lastregexp[0] = 0;
141 warning(nil, "regexp: %s\n", e);
142 sendp(rechan, nil);
143 threadexits(nil);
144 }
145
146 Inst *
newinst(int t)147 newinst(int t)
148 {
149 if(progp >= &program[NPROG])
150 regerror("expression too long");
151 progp->type = t;
152 progp->left = nil;
153 progp->right = nil;
154 return progp++;
155 }
156
157 void
realcompile(void * arg)158 realcompile(void *arg)
159 {
160 int token;
161 Rune *s;
162
163 threadsetname("regcomp");
164 s = arg;
165 startlex(s);
166 atorp = atorstack;
167 andp = andstack;
168 subidp = subidstack;
169 cursubid = 0;
170 lastwasand = FALSE;
171 /* Start with a low priority operator to prime parser */
172 pushator(START-1);
173 while((token=lex()) != END){
174 if((token&ISATOR) == OPERATOR)
175 operator(token);
176 else
177 operand(token);
178 }
179 /* Close with a low priority operator */
180 evaluntil(START);
181 /* Force END */
182 operand(END);
183 evaluntil(START);
184 if(nbra)
185 regerror("unmatched `('");
186 --andp; /* points to first and only operand */
187 sendp(rechan, andp->first);
188 threadexits(nil);
189 }
190
191 /* r is null terminated */
192 int
rxcompile(Rune * r)193 rxcompile(Rune *r)
194 {
195 int i, nr;
196 Inst *oprogp;
197
198 nr = runestrlen(r)+1;
199 if(runeeq(lastregexp, runestrlen(lastregexp)+1, r, nr)==TRUE)
200 return TRUE;
201 lastregexp[0] = 0;
202 for(i=0; i<nclass; i++)
203 free(class[i]);
204 nclass = 0;
205 progp = program;
206 backwards = FALSE;
207 bstartinst = nil;
208 threadcreate(realcompile, r, STACK);
209 startinst = recvp(rechan);
210 if(startinst == nil)
211 return FALSE;
212 optimize(program);
213 oprogp = progp;
214 backwards = TRUE;
215 threadcreate(realcompile, r, STACK);
216 bstartinst = recvp(rechan);
217 if(bstartinst == nil)
218 return FALSE;
219 optimize(oprogp);
220 lastregexp = runerealloc(lastregexp, nr);
221 runemove(lastregexp, r, nr);
222 return TRUE;
223 }
224
225 void
operand(int t)226 operand(int t)
227 {
228 Inst *i;
229 if(lastwasand)
230 operator(CAT); /* catenate is implicit */
231 i = newinst(t);
232 if(t == CCLASS){
233 if(negateclass)
234 i->type = NCCLASS; /* UGH */
235 i->class = nclass-1; /* UGH */
236 }
237 pushand(i, i);
238 lastwasand = TRUE;
239 }
240
241 void
operator(int t)242 operator(int t)
243 {
244 if(t==RBRA && --nbra<0)
245 regerror("unmatched `)'");
246 if(t==LBRA){
247 cursubid++; /* silently ignored */
248 nbra++;
249 if(lastwasand)
250 operator(CAT);
251 }else
252 evaluntil(t);
253 if(t!=RBRA)
254 pushator(t);
255 lastwasand = FALSE;
256 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
257 lastwasand = TRUE; /* these look like operands */
258 }
259
260 void
pushand(Inst * f,Inst * l)261 pushand(Inst *f, Inst *l)
262 {
263 if(andp >= &andstack[NSTACK])
264 error("operand stack overflow");
265 andp->first = f;
266 andp->last = l;
267 andp++;
268 }
269
270 void
pushator(int t)271 pushator(int t)
272 {
273 if(atorp >= &atorstack[NSTACK])
274 error("operator stack overflow");
275 *atorp++=t;
276 if(cursubid >= NRange)
277 *subidp++= -1;
278 else
279 *subidp++=cursubid;
280 }
281
282 Node *
popand(int op)283 popand(int op)
284 {
285 char buf[64];
286
287 if(andp <= &andstack[0])
288 if(op){
289 sprint(buf, "missing operand for %c", op);
290 regerror(buf);
291 }else
292 regerror("malformed regexp");
293 return --andp;
294 }
295
296 int
popator()297 popator()
298 {
299 if(atorp <= &atorstack[0])
300 error("operator stack underflow");
301 --subidp;
302 return *--atorp;
303 }
304
305 void
evaluntil(int pri)306 evaluntil(int pri)
307 {
308 Node *op1, *op2, *t;
309 Inst *inst1, *inst2;
310
311 while(pri==RBRA || atorp[-1]>=pri){
312 switch(popator()){
313 case LBRA:
314 op1 = popand('(');
315 inst2 = newinst(RBRA);
316 inst2->subid = *subidp;
317 op1->last->next = inst2;
318 inst1 = newinst(LBRA);
319 inst1->subid = *subidp;
320 inst1->next = op1->first;
321 pushand(inst1, inst2);
322 return; /* must have been RBRA */
323 default:
324 error("unknown regexp operator");
325 break;
326 case OR:
327 op2 = popand('|');
328 op1 = popand('|');
329 inst2 = newinst(NOP);
330 op2->last->next = inst2;
331 op1->last->next = inst2;
332 inst1 = newinst(OR);
333 inst1->right = op1->first;
334 inst1->left = op2->first;
335 pushand(inst1, inst2);
336 break;
337 case CAT:
338 op2 = popand(0);
339 op1 = popand(0);
340 if(backwards && op2->first->type!=END){
341 t = op1;
342 op1 = op2;
343 op2 = t;
344 }
345 op1->last->next = op2->first;
346 pushand(op1->first, op2->last);
347 break;
348 case STAR:
349 op2 = popand('*');
350 inst1 = newinst(OR);
351 op2->last->next = inst1;
352 inst1->right = op2->first;
353 pushand(inst1, inst1);
354 break;
355 case PLUS:
356 op2 = popand('+');
357 inst1 = newinst(OR);
358 op2->last->next = inst1;
359 inst1->right = op2->first;
360 pushand(op2->first, inst1);
361 break;
362 case QUEST:
363 op2 = popand('?');
364 inst1 = newinst(OR);
365 inst2 = newinst(NOP);
366 inst1->left = inst2;
367 inst1->right = op2->first;
368 op2->last->next = inst2;
369 pushand(inst1, inst2);
370 break;
371 }
372 }
373 }
374
375
376 void
optimize(Inst * start)377 optimize(Inst *start)
378 {
379 Inst *inst, *target;
380
381 for(inst=start; inst->type!=END; inst++){
382 target = inst->next;
383 while(target->type == NOP)
384 target = target->next;
385 inst->next = target;
386 }
387 }
388
389 void
startlex(Rune * s)390 startlex(Rune *s)
391 {
392 exprp = s;
393 nbra = 0;
394 }
395
396
397 int
lex(void)398 lex(void){
399 int c;
400
401 c = *exprp++;
402 switch(c){
403 case '\\':
404 if(*exprp)
405 if((c= *exprp++)=='n')
406 c='\n';
407 break;
408 case 0:
409 c = END;
410 --exprp; /* In case we come here again */
411 break;
412 case '*':
413 c = STAR;
414 break;
415 case '?':
416 c = QUEST;
417 break;
418 case '+':
419 c = PLUS;
420 break;
421 case '|':
422 c = OR;
423 break;
424 case '.':
425 c = ANY;
426 break;
427 case '(':
428 c = LBRA;
429 break;
430 case ')':
431 c = RBRA;
432 break;
433 case '^':
434 c = BOL;
435 break;
436 case '$':
437 c = EOL;
438 break;
439 case '[':
440 c = CCLASS;
441 bldcclass();
442 break;
443 }
444 return c;
445 }
446
447 int
nextrec(void)448 nextrec(void)
449 {
450 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0))
451 regerror("malformed `[]'");
452 if(exprp[0] == '\\'){
453 exprp++;
454 if(*exprp=='n'){
455 exprp++;
456 return '\n';
457 }
458 return *exprp++|(Runemax+1);
459 }
460 return *exprp++;
461 }
462
463 void
bldcclass(void)464 bldcclass(void)
465 {
466 int c1, c2, n, na;
467 Rune *classp;
468
469 classp = runemalloc(DCLASS);
470 n = 0;
471 na = DCLASS;
472 /* we have already seen the '[' */
473 if(*exprp == '^'){
474 classp[n++] = '\n'; /* don't match newline in negate case */
475 negateclass = TRUE;
476 exprp++;
477 }else
478 negateclass = FALSE;
479 while((c1 = nextrec()) != ']'){
480 if(c1 == '-'){
481 Error:
482 free(classp);
483 regerror("malformed `[]'");
484 }
485 if(n+4 >= na){ /* 3 runes plus NUL */
486 na += DCLASS;
487 classp = runerealloc(classp, na);
488 }
489 if(*exprp == '-'){
490 exprp++; /* eat '-' */
491 if((c2 = nextrec()) == ']')
492 goto Error;
493 classp[n+0] = Runemax;
494 classp[n+1] = c1;
495 classp[n+2] = c2;
496 n += 3;
497 }else
498 classp[n++] = c1;
499 }
500 classp[n] = 0;
501 if(nclass == Nclass){
502 Nclass += DCLASS;
503 class = realloc(class, Nclass*sizeof(Rune*));
504 }
505 class[nclass++] = classp;
506 }
507
508 int
classmatch(int classno,int c,int negate)509 classmatch(int classno, int c, int negate)
510 {
511 Rune *p;
512
513 p = class[classno];
514 while(*p){
515 if(*p == Runemax){
516 if(p[1]<=c && c<=p[2])
517 return !negate;
518 p += 3;
519 }else if(*p++ == c)
520 return !negate;
521 }
522 return negate;
523 }
524
525 /*
526 * Note optimization in addinst:
527 * *l must be pending when addinst called; if *l has been looked
528 * at already, the optimization is a bug.
529 */
530 int
addinst(Ilist * l,Inst * inst,Rangeset * sep)531 addinst(Ilist *l, Inst *inst, Rangeset *sep)
532 {
533 Ilist *p;
534
535 for(p = l; p->inst; p++){
536 if(p->inst==inst){
537 if((sep)->r[0].q0 < p->se.r[0].q0)
538 p->se= *sep; /* this would be bug */
539 return 0; /* It's already there */
540 }
541 }
542 p->inst = inst;
543 p->se= *sep;
544 (p+1)->inst = nil;
545 return 1;
546 }
547
548 int
rxnull(void)549 rxnull(void)
550 {
551 return startinst==nil || bstartinst==nil;
552 }
553
554 /* either t!=nil or r!=nil, and we match the string in the appropriate place */
555 int
rxexecute(Text * t,Rune * r,uint startp,uint eof,Rangeset * rp)556 rxexecute(Text *t, Rune *r, uint startp, uint eof, Rangeset *rp)
557 {
558 int flag;
559 Inst *inst;
560 Ilist *tlp;
561 uint p;
562 int nnl, ntl;
563 int nc, c;
564 int wrapped;
565 int startchar;
566
567 flag = 0;
568 p = startp;
569 startchar = 0;
570 wrapped = 0;
571 nnl = 0;
572 if(startinst->type<OPERATOR)
573 startchar = startinst->type;
574 list[0][0].inst = list[1][0].inst = nil;
575 sel.r[0].q0 = -1;
576 if(t != nil)
577 nc = t->file->nc;
578 else
579 nc = runestrlen(r);
580 /* Execute machine once for each character */
581 for(;;p++){
582 doloop:
583 if(p>=eof || p>=nc){
584 switch(wrapped++){
585 case 0: /* let loop run one more click */
586 case 2:
587 break;
588 case 1: /* expired; wrap to beginning */
589 if(sel.r[0].q0>=0 || eof!=Infinity)
590 goto Return;
591 list[0][0].inst = list[1][0].inst = nil;
592 p = 0;
593 goto doloop;
594 default:
595 goto Return;
596 }
597 c = 0;
598 }else{
599 if(((wrapped && p>=startp) || sel.r[0].q0>0) && nnl==0)
600 break;
601 if(t != nil)
602 c = textreadc(t, p);
603 else
604 c = r[p];
605 }
606 /* fast check for first char */
607 if(startchar && nnl==0 && c!=startchar)
608 continue;
609 tl = list[flag];
610 nl = list[flag^=1];
611 nl->inst = nil;
612 ntl = nnl;
613 nnl = 0;
614 if(sel.r[0].q0<0 && (!wrapped || p<startp || startp==eof)){
615 /* Add first instruction to this list */
616 sempty.r[0].q0 = p;
617 if(addinst(tl, startinst, &sempty))
618 if(++ntl >= NLIST){
619 Overflow:
620 warning(nil, "regexp list overflow\n");
621 sel.r[0].q0 = -1;
622 goto Return;
623 }
624 }
625 /* Execute machine until this list is empty */
626 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
627 Switchstmt:
628 switch(inst->type){
629 default: /* regular character */
630 if(inst->type==c){
631 Addinst:
632 if(addinst(nl, inst->next, &tlp->se))
633 if(++nnl >= NLIST)
634 goto Overflow;
635 }
636 break;
637 case LBRA:
638 if(inst->subid>=0)
639 tlp->se.r[inst->subid].q0 = p;
640 inst = inst->next;
641 goto Switchstmt;
642 case RBRA:
643 if(inst->subid>=0)
644 tlp->se.r[inst->subid].q1 = p;
645 inst = inst->next;
646 goto Switchstmt;
647 case ANY:
648 if(c!='\n')
649 goto Addinst;
650 break;
651 case BOL:
652 if(p==0 || (t!=nil && textreadc(t, p-1)=='\n') || (r!=nil && r[p-1]=='\n')){
653 Step:
654 inst = inst->next;
655 goto Switchstmt;
656 }
657 break;
658 case EOL:
659 if(c == '\n')
660 goto Step;
661 break;
662 case CCLASS:
663 if(c>=0 && classmatch(inst->class, c, 0))
664 goto Addinst;
665 break;
666 case NCCLASS:
667 if(c>=0 && classmatch(inst->class, c, 1))
668 goto Addinst;
669 break;
670 case OR:
671 /* evaluate right choice later */
672 if(addinst(tlp, inst->right, &tlp->se))
673 if(++ntl >= NLIST)
674 goto Overflow;
675 /* efficiency: advance and re-evaluate */
676 inst = inst->left;
677 goto Switchstmt;
678 case END: /* Match! */
679 tlp->se.r[0].q1 = p;
680 newmatch(&tlp->se);
681 break;
682 }
683 }
684 }
685 Return:
686 *rp = sel;
687 return sel.r[0].q0 >= 0;
688 }
689
690 void
newmatch(Rangeset * sp)691 newmatch(Rangeset *sp)
692 {
693 if(sel.r[0].q0<0 || sp->r[0].q0<sel.r[0].q0 ||
694 (sp->r[0].q0==sel.r[0].q0 && sp->r[0].q1>sel.r[0].q1))
695 sel = *sp;
696 }
697
698 int
rxbexecute(Text * t,uint startp,Rangeset * rp)699 rxbexecute(Text *t, uint startp, Rangeset *rp)
700 {
701 int flag;
702 Inst *inst;
703 Ilist *tlp;
704 int p;
705 int nnl, ntl;
706 int c;
707 int wrapped;
708 int startchar;
709
710 flag = 0;
711 nnl = 0;
712 wrapped = 0;
713 p = startp;
714 startchar = 0;
715 if(bstartinst->type<OPERATOR)
716 startchar = bstartinst->type;
717 list[0][0].inst = list[1][0].inst = nil;
718 sel.r[0].q0= -1;
719 /* Execute machine once for each character, including terminal NUL */
720 for(;;--p){
721 doloop:
722 if(p <= 0){
723 switch(wrapped++){
724 case 0: /* let loop run one more click */
725 case 2:
726 break;
727 case 1: /* expired; wrap to end */
728 if(sel.r[0].q0>=0)
729 goto Return;
730 list[0][0].inst = list[1][0].inst = nil;
731 p = t->file->nc;
732 goto doloop;
733 case 3:
734 default:
735 goto Return;
736 }
737 c = 0;
738 }else{
739 if(((wrapped && p<=startp) || sel.r[0].q0>0) && nnl==0)
740 break;
741 c = textreadc(t, p-1);
742 }
743 /* fast check for first char */
744 if(startchar && nnl==0 && c!=startchar)
745 continue;
746 tl = list[flag];
747 nl = list[flag^=1];
748 nl->inst = nil;
749 ntl = nnl;
750 nnl = 0;
751 if(sel.r[0].q0<0 && (!wrapped || p>startp)){
752 /* Add first instruction to this list */
753 /* the minus is so the optimizations in addinst work */
754 sempty.r[0].q0 = -p;
755 if(addinst(tl, bstartinst, &sempty))
756 if(++ntl >= NLIST){
757 Overflow:
758 warning(nil, "regexp list overflow\n");
759 sel.r[0].q0 = -1;
760 goto Return;
761 }
762 }
763 /* Execute machine until this list is empty */
764 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */
765 Switchstmt:
766 switch(inst->type){
767 default: /* regular character */
768 if(inst->type == c){
769 Addinst:
770 if(addinst(nl, inst->next, &tlp->se))
771 if(++nnl >= NLIST)
772 goto Overflow;
773 }
774 break;
775 case LBRA:
776 if(inst->subid>=0)
777 tlp->se.r[inst->subid].q0 = p;
778 inst = inst->next;
779 goto Switchstmt;
780 case RBRA:
781 if(inst->subid >= 0)
782 tlp->se.r[inst->subid].q1 = p;
783 inst = inst->next;
784 goto Switchstmt;
785 case ANY:
786 if(c != '\n')
787 goto Addinst;
788 break;
789 case BOL:
790 if(c=='\n' || p==0){
791 Step:
792 inst = inst->next;
793 goto Switchstmt;
794 }
795 break;
796 case EOL:
797 if(p<t->file->nc && textreadc(t, p)=='\n')
798 goto Step;
799 break;
800 case CCLASS:
801 if(c>0 && classmatch(inst->class, c, 0))
802 goto Addinst;
803 break;
804 case NCCLASS:
805 if(c>0 && classmatch(inst->class, c, 1))
806 goto Addinst;
807 break;
808 case OR:
809 /* evaluate right choice later */
810 if(addinst(tl, inst->right, &tlp->se))
811 if(++ntl >= NLIST)
812 goto Overflow;
813 /* efficiency: advance and re-evaluate */
814 inst = inst->left;
815 goto Switchstmt;
816 case END: /* Match! */
817 tlp->se.r[0].q0 = -tlp->se.r[0].q0; /* minus sign */
818 tlp->se.r[0].q1 = p;
819 bnewmatch(&tlp->se);
820 break;
821 }
822 }
823 }
824 Return:
825 *rp = sel;
826 return sel.r[0].q0 >= 0;
827 }
828
829 void
bnewmatch(Rangeset * sp)830 bnewmatch(Rangeset *sp)
831 {
832 int i;
833
834 if(sel.r[0].q0<0 || sp->r[0].q0>sel.r[0].q1 || (sp->r[0].q0==sel.r[0].q1 && sp->r[0].q1<sel.r[0].q0))
835 for(i = 0; i<NRange; i++){ /* note the reversal; q0<=q1 */
836 sel.r[i].q0 = sp->r[i].q1;
837 sel.r[i].q1 = sp->r[i].q0;
838 }
839 }
840