1 #include <u.h> 2 #include <libc.h> 3 #include <draw.h> 4 #include <thread.h> 5 #include <cursor.h> 6 #include <mouse.h> 7 #include <keyboard.h> 8 #include <frame.h> 9 #include <fcall.h> 10 #include <plumb.h> 11 #include "dat.h" 12 #include "fns.h" 13 14 Rangeset sel; 15 Rune *lastregexp; 16 17 /* 18 * Machine Information 19 */ 20 typedef struct Inst Inst; 21 struct Inst 22 { 23 uint type; /* <= Runemax+1 ==> literal, otherwise action */ 24 union { 25 int sid; 26 int subid; 27 int class; 28 Inst *other; 29 Inst *right; 30 }; 31 union{ 32 Inst *left; 33 Inst *next; 34 }; 35 }; 36 37 #define NPROG 1024 38 Inst program[NPROG]; 39 Inst *progp; 40 Inst *startinst; /* First inst. of program; might not be program[0] */ 41 Inst *bstartinst; /* same for backwards machine */ 42 Channel *rechan; /* chan(Inst*) */ 43 44 typedef struct Ilist Ilist; 45 struct Ilist 46 { 47 Inst *inst; /* Instruction of the thread */ 48 Rangeset se; 49 uint startp; /* first char of match */ 50 }; 51 52 #define NLIST 127 53 54 Ilist *tl, *nl; /* This list, next list */ 55 Ilist list[2][NLIST+1]; /* +1 for trailing null */ 56 static Rangeset sempty; 57 58 /* 59 * Actions and Tokens 60 * 61 * 0x100xx are operators, value == precedence 62 * 0x200xx are tokens, i.e. operands for operators 63 */ 64 enum { 65 OPERATOR = Runemask+1, /* Bitmask of all operators */ 66 START = OPERATOR, /* Start, used for marker on stack */ 67 RBRA, /* Right bracket, ) */ 68 LBRA, /* Left bracket, ( */ 69 OR, /* Alternation, | */ 70 CAT, /* Concatentation, implicit operator */ 71 STAR, /* Closure, * */ 72 PLUS, /* a+ == aa* */ 73 QUEST, /* a? == a|nothing, i.e. 0 or 1 a's */ 74 75 ANY = OPERATOR<<1, /* Any character but newline, . */ 76 NOP, /* No operation, internal use only */ 77 BOL, /* Beginning of line, ^ */ 78 EOL, /* End of line, $ */ 79 CCLASS, /* Character class, [] */ 80 NCCLASS, /* Negated character class, [^] */ 81 END, /* Terminate: match found */ 82 83 ISATOR = OPERATOR, 84 ISAND = OPERATOR<<1, 85 }; 86 87 /* 88 * Parser Information 89 */ 90 typedef struct Node Node; 91 struct Node 92 { 93 Inst *first; 94 Inst *last; 95 }; 96 97 #define NSTACK 20 98 Node andstack[NSTACK]; 99 Node *andp; 100 int atorstack[NSTACK]; 101 int *atorp; 102 int lastwasand; /* Last token was operand */ 103 int cursubid; 104 int subidstack[NSTACK]; 105 int *subidp; 106 int backwards; 107 int nbra; 108 Rune *exprp; /* pointer to next character in source expression */ 109 #define DCLASS 10 /* allocation increment */ 110 int nclass; /* number active */ 111 int Nclass; /* high water mark */ 112 Rune **class; 113 int negateclass; 114 115 int addinst(Ilist *l, Inst *inst, Rangeset *sep); 116 void newmatch(Rangeset*); 117 void bnewmatch(Rangeset*); 118 void pushand(Inst*, Inst*); 119 void pushator(int); 120 Node *popand(int); 121 int popator(void); 122 void startlex(Rune*); 123 int lex(void); 124 void operator(int); 125 void operand(int); 126 void evaluntil(int); 127 void optimize(Inst*); 128 void bldcclass(void); 129 130 void 131 rxinit(void) 132 { 133 rechan = chancreate(sizeof(Inst*), 0); 134 lastregexp = runemalloc(1); 135 } 136 137 void 138 regerror(char *e) 139 { 140 lastregexp[0] = 0; 141 warning(nil, "regexp: %s\n", e); 142 sendp(rechan, nil); 143 threadexits(nil); 144 } 145 146 Inst * 147 newinst(int t) 148 { 149 if(progp >= &program[NPROG]) 150 regerror("expression too long"); 151 progp->type = t; 152 progp->left = nil; 153 progp->right = nil; 154 return progp++; 155 } 156 157 void 158 realcompile(void *arg) 159 { 160 int token; 161 Rune *s; 162 163 threadsetname("regcomp"); 164 s = arg; 165 startlex(s); 166 atorp = atorstack; 167 andp = andstack; 168 subidp = subidstack; 169 cursubid = 0; 170 lastwasand = FALSE; 171 /* Start with a low priority operator to prime parser */ 172 pushator(START-1); 173 while((token=lex()) != END){ 174 if((token&ISATOR) == OPERATOR) 175 operator(token); 176 else 177 operand(token); 178 } 179 /* Close with a low priority operator */ 180 evaluntil(START); 181 /* Force END */ 182 operand(END); 183 evaluntil(START); 184 if(nbra) 185 regerror("unmatched `('"); 186 --andp; /* points to first and only operand */ 187 sendp(rechan, andp->first); 188 threadexits(nil); 189 } 190 191 /* r is null terminated */ 192 int 193 rxcompile(Rune *r) 194 { 195 int i, nr; 196 Inst *oprogp; 197 198 nr = runestrlen(r)+1; 199 if(runeeq(lastregexp, runestrlen(lastregexp)+1, r, nr)==TRUE) 200 return TRUE; 201 lastregexp[0] = 0; 202 for(i=0; i<nclass; i++) 203 free(class[i]); 204 nclass = 0; 205 progp = program; 206 backwards = FALSE; 207 bstartinst = nil; 208 threadcreate(realcompile, r, STACK); 209 startinst = recvp(rechan); 210 if(startinst == nil) 211 return FALSE; 212 optimize(program); 213 oprogp = progp; 214 backwards = TRUE; 215 threadcreate(realcompile, r, STACK); 216 bstartinst = recvp(rechan); 217 if(bstartinst == nil) 218 return FALSE; 219 optimize(oprogp); 220 lastregexp = runerealloc(lastregexp, nr); 221 runemove(lastregexp, r, nr); 222 return TRUE; 223 } 224 225 void 226 operand(int t) 227 { 228 Inst *i; 229 if(lastwasand) 230 operator(CAT); /* catenate is implicit */ 231 i = newinst(t); 232 if(t == CCLASS){ 233 if(negateclass) 234 i->type = NCCLASS; /* UGH */ 235 i->class = nclass-1; /* UGH */ 236 } 237 pushand(i, i); 238 lastwasand = TRUE; 239 } 240 241 void 242 operator(int t) 243 { 244 if(t==RBRA && --nbra<0) 245 regerror("unmatched `)'"); 246 if(t==LBRA){ 247 cursubid++; /* silently ignored */ 248 nbra++; 249 if(lastwasand) 250 operator(CAT); 251 }else 252 evaluntil(t); 253 if(t!=RBRA) 254 pushator(t); 255 lastwasand = FALSE; 256 if(t==STAR || t==QUEST || t==PLUS || t==RBRA) 257 lastwasand = TRUE; /* these look like operands */ 258 } 259 260 void 261 pushand(Inst *f, Inst *l) 262 { 263 if(andp >= &andstack[NSTACK]) 264 error("operand stack overflow"); 265 andp->first = f; 266 andp->last = l; 267 andp++; 268 } 269 270 void 271 pushator(int t) 272 { 273 if(atorp >= &atorstack[NSTACK]) 274 error("operator stack overflow"); 275 *atorp++=t; 276 if(cursubid >= NRange) 277 *subidp++= -1; 278 else 279 *subidp++=cursubid; 280 } 281 282 Node * 283 popand(int op) 284 { 285 char buf[64]; 286 287 if(andp <= &andstack[0]) 288 if(op){ 289 sprint(buf, "missing operand for %c", op); 290 regerror(buf); 291 }else 292 regerror("malformed regexp"); 293 return --andp; 294 } 295 296 int 297 popator() 298 { 299 if(atorp <= &atorstack[0]) 300 error("operator stack underflow"); 301 --subidp; 302 return *--atorp; 303 } 304 305 void 306 evaluntil(int pri) 307 { 308 Node *op1, *op2, *t; 309 Inst *inst1, *inst2; 310 311 while(pri==RBRA || atorp[-1]>=pri){ 312 switch(popator()){ 313 case LBRA: 314 op1 = popand('('); 315 inst2 = newinst(RBRA); 316 inst2->subid = *subidp; 317 op1->last->next = inst2; 318 inst1 = newinst(LBRA); 319 inst1->subid = *subidp; 320 inst1->next = op1->first; 321 pushand(inst1, inst2); 322 return; /* must have been RBRA */ 323 default: 324 error("unknown regexp operator"); 325 break; 326 case OR: 327 op2 = popand('|'); 328 op1 = popand('|'); 329 inst2 = newinst(NOP); 330 op2->last->next = inst2; 331 op1->last->next = inst2; 332 inst1 = newinst(OR); 333 inst1->right = op1->first; 334 inst1->left = op2->first; 335 pushand(inst1, inst2); 336 break; 337 case CAT: 338 op2 = popand(0); 339 op1 = popand(0); 340 if(backwards && op2->first->type!=END){ 341 t = op1; 342 op1 = op2; 343 op2 = t; 344 } 345 op1->last->next = op2->first; 346 pushand(op1->first, op2->last); 347 break; 348 case STAR: 349 op2 = popand('*'); 350 inst1 = newinst(OR); 351 op2->last->next = inst1; 352 inst1->right = op2->first; 353 pushand(inst1, inst1); 354 break; 355 case PLUS: 356 op2 = popand('+'); 357 inst1 = newinst(OR); 358 op2->last->next = inst1; 359 inst1->right = op2->first; 360 pushand(op2->first, inst1); 361 break; 362 case QUEST: 363 op2 = popand('?'); 364 inst1 = newinst(OR); 365 inst2 = newinst(NOP); 366 inst1->left = inst2; 367 inst1->right = op2->first; 368 op2->last->next = inst2; 369 pushand(inst1, inst2); 370 break; 371 } 372 } 373 } 374 375 376 void 377 optimize(Inst *start) 378 { 379 Inst *inst, *target; 380 381 for(inst=start; inst->type!=END; inst++){ 382 target = inst->next; 383 while(target->type == NOP) 384 target = target->next; 385 inst->next = target; 386 } 387 } 388 389 void 390 startlex(Rune *s) 391 { 392 exprp = s; 393 nbra = 0; 394 } 395 396 397 int 398 lex(void){ 399 int c; 400 401 c = *exprp++; 402 switch(c){ 403 case '\\': 404 if(*exprp) 405 if((c= *exprp++)=='n') 406 c='\n'; 407 break; 408 case 0: 409 c = END; 410 --exprp; /* In case we come here again */ 411 break; 412 case '*': 413 c = STAR; 414 break; 415 case '?': 416 c = QUEST; 417 break; 418 case '+': 419 c = PLUS; 420 break; 421 case '|': 422 c = OR; 423 break; 424 case '.': 425 c = ANY; 426 break; 427 case '(': 428 c = LBRA; 429 break; 430 case ')': 431 c = RBRA; 432 break; 433 case '^': 434 c = BOL; 435 break; 436 case '$': 437 c = EOL; 438 break; 439 case '[': 440 c = CCLASS; 441 bldcclass(); 442 break; 443 } 444 return c; 445 } 446 447 int 448 nextrec(void) 449 { 450 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0)) 451 regerror("malformed `[]'"); 452 if(exprp[0] == '\\'){ 453 exprp++; 454 if(*exprp=='n'){ 455 exprp++; 456 return '\n'; 457 } 458 return *exprp++|(Runemax+1); 459 } 460 return *exprp++; 461 } 462 463 void 464 bldcclass(void) 465 { 466 int c1, c2, n, na; 467 Rune *classp; 468 469 classp = runemalloc(DCLASS); 470 n = 0; 471 na = DCLASS; 472 /* we have already seen the '[' */ 473 if(*exprp == '^'){ 474 classp[n++] = '\n'; /* don't match newline in negate case */ 475 negateclass = TRUE; 476 exprp++; 477 }else 478 negateclass = FALSE; 479 while((c1 = nextrec()) != ']'){ 480 if(c1 == '-'){ 481 Error: 482 free(classp); 483 regerror("malformed `[]'"); 484 } 485 if(n+4 >= na){ /* 3 runes plus NUL */ 486 na += DCLASS; 487 classp = runerealloc(classp, na); 488 } 489 if(*exprp == '-'){ 490 exprp++; /* eat '-' */ 491 if((c2 = nextrec()) == ']') 492 goto Error; 493 classp[n+0] = Runemax; 494 classp[n+1] = c1; 495 classp[n+2] = c2; 496 n += 3; 497 }else 498 classp[n++] = c1; 499 } 500 classp[n] = 0; 501 if(nclass == Nclass){ 502 Nclass += DCLASS; 503 class = realloc(class, Nclass*sizeof(Rune*)); 504 } 505 class[nclass++] = classp; 506 } 507 508 int 509 classmatch(int classno, int c, int negate) 510 { 511 Rune *p; 512 513 p = class[classno]; 514 while(*p){ 515 if(*p == Runemax){ 516 if(p[1]<=c && c<=p[2]) 517 return !negate; 518 p += 3; 519 }else if(*p++ == c) 520 return !negate; 521 } 522 return negate; 523 } 524 525 /* 526 * Note optimization in addinst: 527 * *l must be pending when addinst called; if *l has been looked 528 * at already, the optimization is a bug. 529 */ 530 int 531 addinst(Ilist *l, Inst *inst, Rangeset *sep) 532 { 533 Ilist *p; 534 535 for(p = l; p->inst; p++){ 536 if(p->inst==inst){ 537 if((sep)->r[0].q0 < p->se.r[0].q0) 538 p->se= *sep; /* this would be bug */ 539 return 0; /* It's already there */ 540 } 541 } 542 p->inst = inst; 543 p->se= *sep; 544 (p+1)->inst = nil; 545 return 1; 546 } 547 548 int 549 rxnull(void) 550 { 551 return startinst==nil || bstartinst==nil; 552 } 553 554 /* either t!=nil or r!=nil, and we match the string in the appropriate place */ 555 int 556 rxexecute(Text *t, Rune *r, uint startp, uint eof, Rangeset *rp) 557 { 558 int flag; 559 Inst *inst; 560 Ilist *tlp; 561 uint p; 562 int nnl, ntl; 563 int nc, c; 564 int wrapped; 565 int startchar; 566 567 flag = 0; 568 p = startp; 569 startchar = 0; 570 wrapped = 0; 571 nnl = 0; 572 if(startinst->type<OPERATOR) 573 startchar = startinst->type; 574 list[0][0].inst = list[1][0].inst = nil; 575 sel.r[0].q0 = -1; 576 if(t != nil) 577 nc = t->file->nc; 578 else 579 nc = runestrlen(r); 580 /* Execute machine once for each character */ 581 for(;;p++){ 582 doloop: 583 if(p>=eof || p>=nc){ 584 switch(wrapped++){ 585 case 0: /* let loop run one more click */ 586 case 2: 587 break; 588 case 1: /* expired; wrap to beginning */ 589 if(sel.r[0].q0>=0 || eof!=Infinity) 590 goto Return; 591 list[0][0].inst = list[1][0].inst = nil; 592 p = 0; 593 goto doloop; 594 default: 595 goto Return; 596 } 597 c = 0; 598 }else{ 599 if(((wrapped && p>=startp) || sel.r[0].q0>0) && nnl==0) 600 break; 601 if(t != nil) 602 c = textreadc(t, p); 603 else 604 c = r[p]; 605 } 606 /* fast check for first char */ 607 if(startchar && nnl==0 && c!=startchar) 608 continue; 609 tl = list[flag]; 610 nl = list[flag^=1]; 611 nl->inst = nil; 612 ntl = nnl; 613 nnl = 0; 614 if(sel.r[0].q0<0 && (!wrapped || p<startp || startp==eof)){ 615 /* Add first instruction to this list */ 616 sempty.r[0].q0 = p; 617 if(addinst(tl, startinst, &sempty)) 618 if(++ntl >= NLIST){ 619 Overflow: 620 warning(nil, "regexp list overflow\n"); 621 sel.r[0].q0 = -1; 622 goto Return; 623 } 624 } 625 /* Execute machine until this list is empty */ 626 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */ 627 Switchstmt: 628 switch(inst->type){ 629 default: /* regular character */ 630 if(inst->type==c){ 631 Addinst: 632 if(addinst(nl, inst->next, &tlp->se)) 633 if(++nnl >= NLIST) 634 goto Overflow; 635 } 636 break; 637 case LBRA: 638 if(inst->subid>=0) 639 tlp->se.r[inst->subid].q0 = p; 640 inst = inst->next; 641 goto Switchstmt; 642 case RBRA: 643 if(inst->subid>=0) 644 tlp->se.r[inst->subid].q1 = p; 645 inst = inst->next; 646 goto Switchstmt; 647 case ANY: 648 if(c!='\n') 649 goto Addinst; 650 break; 651 case BOL: 652 if(p==0 || (t!=nil && textreadc(t, p-1)=='\n') || (r!=nil && r[p-1]=='\n')){ 653 Step: 654 inst = inst->next; 655 goto Switchstmt; 656 } 657 break; 658 case EOL: 659 if(c == '\n') 660 goto Step; 661 break; 662 case CCLASS: 663 if(c>=0 && classmatch(inst->class, c, 0)) 664 goto Addinst; 665 break; 666 case NCCLASS: 667 if(c>=0 && classmatch(inst->class, c, 1)) 668 goto Addinst; 669 break; 670 case OR: 671 /* evaluate right choice later */ 672 if(addinst(tlp, inst->right, &tlp->se)) 673 if(++ntl >= NLIST) 674 goto Overflow; 675 /* efficiency: advance and re-evaluate */ 676 inst = inst->left; 677 goto Switchstmt; 678 case END: /* Match! */ 679 tlp->se.r[0].q1 = p; 680 newmatch(&tlp->se); 681 break; 682 } 683 } 684 } 685 Return: 686 *rp = sel; 687 return sel.r[0].q0 >= 0; 688 } 689 690 void 691 newmatch(Rangeset *sp) 692 { 693 if(sel.r[0].q0<0 || sp->r[0].q0<sel.r[0].q0 || 694 (sp->r[0].q0==sel.r[0].q0 && sp->r[0].q1>sel.r[0].q1)) 695 sel = *sp; 696 } 697 698 int 699 rxbexecute(Text *t, uint startp, Rangeset *rp) 700 { 701 int flag; 702 Inst *inst; 703 Ilist *tlp; 704 int p; 705 int nnl, ntl; 706 int c; 707 int wrapped; 708 int startchar; 709 710 flag = 0; 711 nnl = 0; 712 wrapped = 0; 713 p = startp; 714 startchar = 0; 715 if(bstartinst->type<OPERATOR) 716 startchar = bstartinst->type; 717 list[0][0].inst = list[1][0].inst = nil; 718 sel.r[0].q0= -1; 719 /* Execute machine once for each character, including terminal NUL */ 720 for(;;--p){ 721 doloop: 722 if(p <= 0){ 723 switch(wrapped++){ 724 case 0: /* let loop run one more click */ 725 case 2: 726 break; 727 case 1: /* expired; wrap to end */ 728 if(sel.r[0].q0>=0) 729 goto Return; 730 list[0][0].inst = list[1][0].inst = nil; 731 p = t->file->nc; 732 goto doloop; 733 case 3: 734 default: 735 goto Return; 736 } 737 c = 0; 738 }else{ 739 if(((wrapped && p<=startp) || sel.r[0].q0>0) && nnl==0) 740 break; 741 c = textreadc(t, p-1); 742 } 743 /* fast check for first char */ 744 if(startchar && nnl==0 && c!=startchar) 745 continue; 746 tl = list[flag]; 747 nl = list[flag^=1]; 748 nl->inst = nil; 749 ntl = nnl; 750 nnl = 0; 751 if(sel.r[0].q0<0 && (!wrapped || p>startp)){ 752 /* Add first instruction to this list */ 753 /* the minus is so the optimizations in addinst work */ 754 sempty.r[0].q0 = -p; 755 if(addinst(tl, bstartinst, &sempty)) 756 if(++ntl >= NLIST){ 757 Overflow: 758 warning(nil, "regexp list overflow\n"); 759 sel.r[0].q0 = -1; 760 goto Return; 761 } 762 } 763 /* Execute machine until this list is empty */ 764 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */ 765 Switchstmt: 766 switch(inst->type){ 767 default: /* regular character */ 768 if(inst->type == c){ 769 Addinst: 770 if(addinst(nl, inst->next, &tlp->se)) 771 if(++nnl >= NLIST) 772 goto Overflow; 773 } 774 break; 775 case LBRA: 776 if(inst->subid>=0) 777 tlp->se.r[inst->subid].q0 = p; 778 inst = inst->next; 779 goto Switchstmt; 780 case RBRA: 781 if(inst->subid >= 0) 782 tlp->se.r[inst->subid].q1 = p; 783 inst = inst->next; 784 goto Switchstmt; 785 case ANY: 786 if(c != '\n') 787 goto Addinst; 788 break; 789 case BOL: 790 if(c=='\n' || p==0){ 791 Step: 792 inst = inst->next; 793 goto Switchstmt; 794 } 795 break; 796 case EOL: 797 if(p<t->file->nc && textreadc(t, p)=='\n') 798 goto Step; 799 break; 800 case CCLASS: 801 if(c>0 && classmatch(inst->class, c, 0)) 802 goto Addinst; 803 break; 804 case NCCLASS: 805 if(c>0 && classmatch(inst->class, c, 1)) 806 goto Addinst; 807 break; 808 case OR: 809 /* evaluate right choice later */ 810 if(addinst(tl, inst->right, &tlp->se)) 811 if(++ntl >= NLIST) 812 goto Overflow; 813 /* efficiency: advance and re-evaluate */ 814 inst = inst->left; 815 goto Switchstmt; 816 case END: /* Match! */ 817 tlp->se.r[0].q0 = -tlp->se.r[0].q0; /* minus sign */ 818 tlp->se.r[0].q1 = p; 819 bnewmatch(&tlp->se); 820 break; 821 } 822 } 823 } 824 Return: 825 *rp = sel; 826 return sel.r[0].q0 >= 0; 827 } 828 829 void 830 bnewmatch(Rangeset *sp) 831 { 832 int i; 833 834 if(sel.r[0].q0<0 || sp->r[0].q0>sel.r[0].q1 || (sp->r[0].q0==sel.r[0].q1 && sp->r[0].q1<sel.r[0].q0)) 835 for(i = 0; i<NRange; i++){ /* note the reversal; q0<=q1 */ 836 sel.r[i].q0 = sp->r[i].q1; 837 sel.r[i].q1 = sp->r[i].q0; 838 } 839 } 840