1 #include <u.h> 2 #include <libc.h> 3 #include <draw.h> 4 #include <thread.h> 5 #include <cursor.h> 6 #include <mouse.h> 7 #include <keyboard.h> 8 #include <frame.h> 9 #include <fcall.h> 10 #include <plumb.h> 11 #include "dat.h" 12 #include "fns.h" 13 14 Rangeset sel; 15 Rune *lastregexp; 16 17 /* 18 * Machine Information 19 */ 20 typedef struct Inst Inst; 21 struct Inst 22 { 23 uint type; /* < 0x10000 ==> literal, otherwise action */ 24 union { 25 int sid; 26 int subid; 27 int class; 28 Inst *other; 29 Inst *right; 30 }; 31 union{ 32 Inst *left; 33 Inst *next; 34 }; 35 }; 36 37 #define NPROG 1024 38 Inst program[NPROG]; 39 Inst *progp; 40 Inst *startinst; /* First inst. of program; might not be program[0] */ 41 Inst *bstartinst; /* same for backwards machine */ 42 Channel *rechan; /* chan(Inst*) */ 43 44 typedef struct Ilist Ilist; 45 struct Ilist 46 { 47 Inst *inst; /* Instruction of the thread */ 48 Rangeset se; 49 uint startp; /* first char of match */ 50 }; 51 52 #define NLIST 128 53 54 Ilist *tl, *nl; /* This list, next list */ 55 Ilist list[2][NLIST]; 56 static Rangeset sempty; 57 58 /* 59 * Actions and Tokens 60 * 61 * 0x100xx are operators, value == precedence 62 * 0x200xx are tokens, i.e. operands for operators 63 */ 64 #define OPERATOR 0x10000 /* Bitmask of all operators */ 65 #define START 0x10000 /* Start, used for marker on stack */ 66 #define RBRA 0x10001 /* Right bracket, ) */ 67 #define LBRA 0x10002 /* Left bracket, ( */ 68 #define OR 0x10003 /* Alternation, | */ 69 #define CAT 0x10004 /* Concatentation, implicit operator */ 70 #define STAR 0x10005 /* Closure, * */ 71 #define PLUS 0x10006 /* a+ == aa* */ 72 #define QUEST 0x10007 /* a? == a|nothing, i.e. 0 or 1 a's */ 73 #define ANY 0x20000 /* Any character but newline, . */ 74 #define NOP 0x20001 /* No operation, internal use only */ 75 #define BOL 0x20002 /* Beginning of line, ^ */ 76 #define EOL 0x20003 /* End of line, $ */ 77 #define CCLASS 0x20004 /* Character class, [] */ 78 #define NCCLASS 0x20005 /* Negated character class, [^] */ 79 #define END 0x20077 /* Terminate: match found */ 80 81 #define ISATOR 0x10000 82 #define ISAND 0x20000 83 84 /* 85 * Parser Information 86 */ 87 typedef struct Node Node; 88 struct Node 89 { 90 Inst *first; 91 Inst *last; 92 }; 93 94 #define NSTACK 20 95 Node andstack[NSTACK]; 96 Node *andp; 97 int atorstack[NSTACK]; 98 int *atorp; 99 int lastwasand; /* Last token was operand */ 100 int cursubid; 101 int subidstack[NSTACK]; 102 int *subidp; 103 int backwards; 104 int nbra; 105 Rune *exprp; /* pointer to next character in source expression */ 106 #define DCLASS 10 /* allocation increment */ 107 int nclass; /* number active */ 108 int Nclass; /* high water mark */ 109 Rune **class; 110 int negateclass; 111 112 void addinst(Ilist *l, Inst *inst, Rangeset *sep); 113 void newmatch(Rangeset*); 114 void bnewmatch(Rangeset*); 115 void pushand(Inst*, Inst*); 116 void pushator(int); 117 Node *popand(int); 118 int popator(void); 119 void startlex(Rune*); 120 int lex(void); 121 void operator(int); 122 void operand(int); 123 void evaluntil(int); 124 void optimize(Inst*); 125 void bldcclass(void); 126 127 void 128 rxinit(void) 129 { 130 rechan = chancreate(sizeof(Inst*), 0); 131 lastregexp = runemalloc(1); 132 } 133 134 void 135 regerror(char *e) 136 { 137 lastregexp[0] = 0; 138 warning(nil, "regexp: %s\n", e); 139 sendp(rechan, nil); 140 threadexits(nil); 141 } 142 143 Inst * 144 newinst(int t) 145 { 146 if(progp >= &program[NPROG]) 147 regerror("expression too long"); 148 progp->type = t; 149 progp->left = nil; 150 progp->right = nil; 151 return progp++; 152 } 153 154 void 155 realcompile(void *arg) 156 { 157 int token; 158 Rune *s; 159 160 threadsetname("regcomp"); 161 s = arg; 162 startlex(s); 163 atorp = atorstack; 164 andp = andstack; 165 subidp = subidstack; 166 cursubid = 0; 167 lastwasand = FALSE; 168 /* Start with a low priority operator to prime parser */ 169 pushator(START-1); 170 while((token=lex()) != END){ 171 if((token&ISATOR) == OPERATOR) 172 operator(token); 173 else 174 operand(token); 175 } 176 /* Close with a low priority operator */ 177 evaluntil(START); 178 /* Force END */ 179 operand(END); 180 evaluntil(START); 181 if(nbra) 182 regerror("unmatched `('"); 183 --andp; /* points to first and only operand */ 184 sendp(rechan, andp->first); 185 threadexits(nil); 186 } 187 188 /* r is null terminated */ 189 int 190 rxcompile(Rune *r) 191 { 192 int i, nr; 193 Inst *oprogp; 194 195 nr = runestrlen(r)+1; 196 if(runeeq(lastregexp, runestrlen(lastregexp)+1, r, nr)==TRUE) 197 return TRUE; 198 lastregexp[0] = 0; 199 for(i=0; i<nclass; i++) 200 free(class[i]); 201 nclass = 0; 202 progp = program; 203 backwards = FALSE; 204 bstartinst = nil; 205 threadcreate(realcompile, r, STACK); 206 startinst = recvp(rechan); 207 if(startinst == nil) 208 return FALSE; 209 optimize(program); 210 oprogp = progp; 211 backwards = TRUE; 212 threadcreate(realcompile, r, STACK); 213 bstartinst = recvp(rechan); 214 if(bstartinst == nil) 215 return FALSE; 216 optimize(oprogp); 217 lastregexp = runerealloc(lastregexp, nr); 218 runemove(lastregexp, r, nr); 219 return TRUE; 220 } 221 222 void 223 operand(int t) 224 { 225 Inst *i; 226 if(lastwasand) 227 operator(CAT); /* catenate is implicit */ 228 i = newinst(t); 229 if(t == CCLASS){ 230 if(negateclass) 231 i->type = NCCLASS; /* UGH */ 232 i->class = nclass-1; /* UGH */ 233 } 234 pushand(i, i); 235 lastwasand = TRUE; 236 } 237 238 void 239 operator(int t) 240 { 241 if(t==RBRA && --nbra<0) 242 regerror("unmatched `)'"); 243 if(t==LBRA){ 244 cursubid++; /* silently ignored */ 245 nbra++; 246 if(lastwasand) 247 operator(CAT); 248 }else 249 evaluntil(t); 250 if(t!=RBRA) 251 pushator(t); 252 lastwasand = FALSE; 253 if(t==STAR || t==QUEST || t==PLUS || t==RBRA) 254 lastwasand = TRUE; /* these look like operands */ 255 } 256 257 void 258 pushand(Inst *f, Inst *l) 259 { 260 if(andp >= &andstack[NSTACK]) 261 error("operand stack overflow"); 262 andp->first = f; 263 andp->last = l; 264 andp++; 265 } 266 267 void 268 pushator(int t) 269 { 270 if(atorp >= &atorstack[NSTACK]) 271 error("operator stack overflow"); 272 *atorp++=t; 273 if(cursubid >= NRange) 274 *subidp++= -1; 275 else 276 *subidp++=cursubid; 277 } 278 279 Node * 280 popand(int op) 281 { 282 char buf[64]; 283 284 if(andp <= &andstack[0]) 285 if(op){ 286 sprint(buf, "missing operand for %c", op); 287 regerror(buf); 288 }else 289 regerror("malformed regexp"); 290 return --andp; 291 } 292 293 int 294 popator() 295 { 296 if(atorp <= &atorstack[0]) 297 error("operator stack underflow"); 298 --subidp; 299 return *--atorp; 300 } 301 302 void 303 evaluntil(int pri) 304 { 305 Node *op1, *op2, *t; 306 Inst *inst1, *inst2; 307 308 while(pri==RBRA || atorp[-1]>=pri){ 309 switch(popator()){ 310 case LBRA: 311 op1 = popand('('); 312 inst2 = newinst(RBRA); 313 inst2->subid = *subidp; 314 op1->last->next = inst2; 315 inst1 = newinst(LBRA); 316 inst1->subid = *subidp; 317 inst1->next = op1->first; 318 pushand(inst1, inst2); 319 return; /* must have been RBRA */ 320 default: 321 error("unknown regexp operator"); 322 break; 323 case OR: 324 op2 = popand('|'); 325 op1 = popand('|'); 326 inst2 = newinst(NOP); 327 op2->last->next = inst2; 328 op1->last->next = inst2; 329 inst1 = newinst(OR); 330 inst1->right = op1->first; 331 inst1->left = op2->first; 332 pushand(inst1, inst2); 333 break; 334 case CAT: 335 op2 = popand(0); 336 op1 = popand(0); 337 if(backwards && op2->first->type!=END){ 338 t = op1; 339 op1 = op2; 340 op2 = t; 341 } 342 op1->last->next = op2->first; 343 pushand(op1->first, op2->last); 344 break; 345 case STAR: 346 op2 = popand('*'); 347 inst1 = newinst(OR); 348 op2->last->next = inst1; 349 inst1->right = op2->first; 350 pushand(inst1, inst1); 351 break; 352 case PLUS: 353 op2 = popand('+'); 354 inst1 = newinst(OR); 355 op2->last->next = inst1; 356 inst1->right = op2->first; 357 pushand(op2->first, inst1); 358 break; 359 case QUEST: 360 op2 = popand('?'); 361 inst1 = newinst(OR); 362 inst2 = newinst(NOP); 363 inst1->left = inst2; 364 inst1->right = op2->first; 365 op2->last->next = inst2; 366 pushand(inst1, inst2); 367 break; 368 } 369 } 370 } 371 372 373 void 374 optimize(Inst *start) 375 { 376 Inst *inst, *target; 377 378 for(inst=start; inst->type!=END; inst++){ 379 target = inst->next; 380 while(target->type == NOP) 381 target = target->next; 382 inst->next = target; 383 } 384 } 385 386 void 387 startlex(Rune *s) 388 { 389 exprp = s; 390 nbra = 0; 391 } 392 393 394 int 395 lex(void){ 396 int c; 397 398 c = *exprp++; 399 switch(c){ 400 case '\\': 401 if(*exprp) 402 if((c= *exprp++)=='n') 403 c='\n'; 404 break; 405 case 0: 406 c = END; 407 --exprp; /* In case we come here again */ 408 break; 409 case '*': 410 c = STAR; 411 break; 412 case '?': 413 c = QUEST; 414 break; 415 case '+': 416 c = PLUS; 417 break; 418 case '|': 419 c = OR; 420 break; 421 case '.': 422 c = ANY; 423 break; 424 case '(': 425 c = LBRA; 426 break; 427 case ')': 428 c = RBRA; 429 break; 430 case '^': 431 c = BOL; 432 break; 433 case '$': 434 c = EOL; 435 break; 436 case '[': 437 c = CCLASS; 438 bldcclass(); 439 break; 440 } 441 return c; 442 } 443 444 int 445 nextrec(void) 446 { 447 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0)) 448 regerror("malformed `[]'"); 449 if(exprp[0] == '\\'){ 450 exprp++; 451 if(*exprp=='n'){ 452 exprp++; 453 return '\n'; 454 } 455 return *exprp++|0x10000; 456 } 457 return *exprp++; 458 } 459 460 void 461 bldcclass(void) 462 { 463 int c1, c2, n, na; 464 Rune *classp; 465 466 classp = runemalloc(DCLASS); 467 n = 0; 468 na = DCLASS; 469 /* we have already seen the '[' */ 470 if(*exprp == '^'){ 471 classp[n++] = '\n'; /* don't match newline in negate case */ 472 negateclass = TRUE; 473 exprp++; 474 }else 475 negateclass = FALSE; 476 while((c1 = nextrec()) != ']'){ 477 if(c1 == '-'){ 478 Error: 479 free(classp); 480 regerror("malformed `[]'"); 481 } 482 if(n+4 >= na){ /* 3 runes plus NUL */ 483 na += DCLASS; 484 classp = runerealloc(classp, na); 485 } 486 if(*exprp == '-'){ 487 exprp++; /* eat '-' */ 488 if((c2 = nextrec()) == ']') 489 goto Error; 490 classp[n+0] = 0xFFFF; 491 classp[n+1] = c1; 492 classp[n+2] = c2; 493 n += 3; 494 }else 495 classp[n++] = c1; 496 } 497 classp[n] = 0; 498 if(nclass == Nclass){ 499 Nclass += DCLASS; 500 class = realloc(class, Nclass*sizeof(Rune*)); 501 } 502 class[nclass++] = classp; 503 } 504 505 int 506 classmatch(int classno, int c, int negate) 507 { 508 Rune *p; 509 510 p = class[classno]; 511 while(*p){ 512 if(*p == 0xFFFF){ 513 if(p[1]<=c && c<=p[2]) 514 return !negate; 515 p += 3; 516 }else if(*p++ == c) 517 return !negate; 518 } 519 return negate; 520 } 521 522 /* 523 * Note optimization in addinst: 524 * *l must be pending when addinst called; if *l has been looked 525 * at already, the optimization is a bug. 526 */ 527 void 528 addinst(Ilist *l, Inst *inst, Rangeset *sep) 529 { 530 Ilist *p; 531 532 for(p = l; p->inst; p++){ 533 if(p->inst==inst){ 534 if((sep)->r[0].q0 < p->se.r[0].q0) 535 p->se= *sep; /* this would be bug */ 536 return; /* It's already there */ 537 } 538 } 539 p->inst = inst; 540 p->se= *sep; 541 (p+1)->inst = nil; 542 } 543 544 int 545 rxnull(void) 546 { 547 return startinst==nil || bstartinst==nil; 548 } 549 550 /* either t!=nil or r!=nil, and we match the string in the appropriate place */ 551 int 552 rxexecute(Text *t, Rune *r, uint startp, uint eof, Rangeset *rp) 553 { 554 int flag; 555 Inst *inst; 556 Ilist *tlp; 557 uint p; 558 int nnl, ntl; 559 int nc, c; 560 int wrapped; 561 int startchar; 562 563 flag = 0; 564 p = startp; 565 startchar = 0; 566 wrapped = 0; 567 nnl = 0; 568 if(startinst->type<OPERATOR) 569 startchar = startinst->type; 570 list[0][0].inst = list[1][0].inst = nil; 571 sel.r[0].q0 = -1; 572 if(t != nil) 573 nc = t->file->nc; 574 else 575 nc = runestrlen(r); 576 /* Execute machine once for each character */ 577 for(;;p++){ 578 doloop: 579 if(p>=eof || p>=nc){ 580 switch(wrapped++){ 581 case 0: /* let loop run one more click */ 582 case 2: 583 break; 584 case 1: /* expired; wrap to beginning */ 585 if(sel.r[0].q0>=0 || eof!=Infinity) 586 goto Return; 587 list[0][0].inst = list[1][0].inst = nil; 588 p = 0; 589 goto doloop; 590 default: 591 goto Return; 592 } 593 c = 0; 594 }else{ 595 if(((wrapped && p>=startp) || sel.r[0].q0>0) && nnl==0) 596 break; 597 if(t != nil) 598 c = textreadc(t, p); 599 else 600 c = r[p]; 601 } 602 /* fast check for first char */ 603 if(startchar && nnl==0 && c!=startchar) 604 continue; 605 tl = list[flag]; 606 nl = list[flag^=1]; 607 nl->inst = nil; 608 ntl = nnl; 609 nnl = 0; 610 if(sel.r[0].q0<0 && (!wrapped || p<startp || startp==eof)){ 611 /* Add first instruction to this list */ 612 if(++ntl >= NLIST){ 613 Overflow: 614 warning(nil, "regexp list overflow\n"); 615 sel.r[0].q0 = -1; 616 goto Return; 617 } 618 sempty.r[0].q0 = p; 619 addinst(tl, startinst, &sempty); 620 } 621 /* Execute machine until this list is empty */ 622 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */ 623 Switchstmt: 624 switch(inst->type){ 625 default: /* regular character */ 626 if(inst->type==c){ 627 Addinst: 628 if(++nnl >= NLIST) 629 goto Overflow; 630 addinst(nl, inst->next, &tlp->se); 631 } 632 break; 633 case LBRA: 634 if(inst->subid>=0) 635 tlp->se.r[inst->subid].q0 = p; 636 inst = inst->next; 637 goto Switchstmt; 638 case RBRA: 639 if(inst->subid>=0) 640 tlp->se.r[inst->subid].q1 = p; 641 inst = inst->next; 642 goto Switchstmt; 643 case ANY: 644 if(c!='\n') 645 goto Addinst; 646 break; 647 case BOL: 648 if(p==0 || (t!=nil && textreadc(t, p-1)=='\n') || (r!=nil && r[p-1]=='\n')){ 649 Step: 650 inst = inst->next; 651 goto Switchstmt; 652 } 653 break; 654 case EOL: 655 if(c == '\n') 656 goto Step; 657 break; 658 case CCLASS: 659 if(c>=0 && classmatch(inst->class, c, 0)) 660 goto Addinst; 661 break; 662 case NCCLASS: 663 if(c>=0 && classmatch(inst->class, c, 1)) 664 goto Addinst; 665 break; 666 case OR: 667 /* evaluate right choice later */ 668 if(++ntl >= NLIST) 669 goto Overflow; 670 addinst(tlp, inst->right, &tlp->se); 671 /* efficiency: advance and re-evaluate */ 672 inst = inst->left; 673 goto Switchstmt; 674 case END: /* Match! */ 675 tlp->se.r[0].q1 = p; 676 newmatch(&tlp->se); 677 break; 678 } 679 } 680 } 681 Return: 682 *rp = sel; 683 return sel.r[0].q0 >= 0; 684 } 685 686 void 687 newmatch(Rangeset *sp) 688 { 689 if(sel.r[0].q0<0 || sp->r[0].q0<sel.r[0].q0 || 690 (sp->r[0].q0==sel.r[0].q0 && sp->r[0].q1>sel.r[0].q1)) 691 sel = *sp; 692 } 693 694 int 695 rxbexecute(Text *t, uint startp, Rangeset *rp) 696 { 697 int flag; 698 Inst *inst; 699 Ilist *tlp; 700 int p; 701 int nnl, ntl; 702 int c; 703 int wrapped; 704 int startchar; 705 706 flag = 0; 707 nnl = 0; 708 wrapped = 0; 709 p = startp; 710 startchar = 0; 711 if(bstartinst->type<OPERATOR) 712 startchar = bstartinst->type; 713 list[0][0].inst = list[1][0].inst = nil; 714 sel.r[0].q0= -1; 715 /* Execute machine once for each character, including terminal NUL */ 716 for(;;--p){ 717 doloop: 718 if(p <= 0){ 719 switch(wrapped++){ 720 case 0: /* let loop run one more click */ 721 case 2: 722 break; 723 case 1: /* expired; wrap to end */ 724 if(sel.r[0].q0>=0) 725 goto Return; 726 list[0][0].inst = list[1][0].inst = nil; 727 p = t->file->nc; 728 goto doloop; 729 case 3: 730 default: 731 goto Return; 732 } 733 c = 0; 734 }else{ 735 if(((wrapped && p<=startp) || sel.r[0].q0>0) && nnl==0) 736 break; 737 c = textreadc(t, p-1); 738 } 739 /* fast check for first char */ 740 if(startchar && nnl==0 && c!=startchar) 741 continue; 742 tl = list[flag]; 743 nl = list[flag^=1]; 744 nl->inst = nil; 745 ntl = nnl; 746 nnl = 0; 747 if(sel.r[0].q0<0 && (!wrapped || p>startp)){ 748 /* Add first instruction to this list */ 749 if(++ntl >= NLIST){ 750 Overflow: 751 warning(nil, "regexp list overflow\n"); 752 sel.r[0].q0 = -1; 753 goto Return; 754 } 755 /* the minus is so the optimizations in addinst work */ 756 sempty.r[0].q0 = -p; 757 addinst(tl, bstartinst, &sempty); 758 } 759 /* Execute machine until this list is empty */ 760 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */ 761 Switchstmt: 762 switch(inst->type){ 763 default: /* regular character */ 764 if(inst->type == c){ 765 Addinst: 766 if(++nnl >= NLIST) 767 goto Overflow; 768 addinst(nl, inst->next, &tlp->se); 769 } 770 break; 771 case LBRA: 772 if(inst->subid>=0) 773 tlp->se.r[inst->subid].q0 = p; 774 inst = inst->next; 775 goto Switchstmt; 776 case RBRA: 777 if(inst->subid >= 0) 778 tlp->se.r[inst->subid].q1 = p; 779 inst = inst->next; 780 goto Switchstmt; 781 case ANY: 782 if(c != '\n') 783 goto Addinst; 784 break; 785 case BOL: 786 if(c=='\n' || p==0){ 787 Step: 788 inst = inst->next; 789 goto Switchstmt; 790 } 791 break; 792 case EOL: 793 if(p<t->file->nc && textreadc(t, p)=='\n') 794 goto Step; 795 break; 796 case CCLASS: 797 if(c>0 && classmatch(inst->class, c, 0)) 798 goto Addinst; 799 break; 800 case NCCLASS: 801 if(c>0 && classmatch(inst->class, c, 1)) 802 goto Addinst; 803 break; 804 case OR: 805 /* evaluate right choice later */ 806 if(++ntl >= NLIST) 807 goto Overflow; 808 addinst(tlp, inst->right, &tlp->se); 809 /* efficiency: advance and re-evaluate */ 810 inst = inst->left; 811 goto Switchstmt; 812 case END: /* Match! */ 813 tlp->se.r[0].q0 = -tlp->se.r[0].q0; /* minus sign */ 814 tlp->se.r[0].q1 = p; 815 bnewmatch(&tlp->se); 816 break; 817 } 818 } 819 } 820 Return: 821 *rp = sel; 822 return sel.r[0].q0 >= 0; 823 } 824 825 void 826 bnewmatch(Rangeset *sp) 827 { 828 int i; 829 830 if(sel.r[0].q0<0 || sp->r[0].q0>sel.r[0].q1 || (sp->r[0].q0==sel.r[0].q1 && sp->r[0].q1<sel.r[0].q0)) 831 for(i = 0; i<NRange; i++){ /* note the reversal; q0<=q1 */ 832 sel.r[i].q0 = sp->r[i].q1; 833 sel.r[i].q1 = sp->r[i].q0; 834 } 835 } 836