1 #include "sam.h" 2 3 Rangeset sel; 4 String lastregexp; 5 /* 6 * Machine Information 7 */ 8 typedef struct Inst Inst; 9 10 struct Inst 11 { 12 long type; /* < 0x10000 ==> literal, otherwise action */ 13 union { 14 int rsid; 15 int rsubid; 16 int class; 17 struct Inst *rother; 18 struct Inst *rright; 19 } r; 20 union{ 21 struct Inst *lleft; 22 struct Inst *lnext; 23 } l; 24 }; 25 #define sid r.rsid 26 #define subid r.rsubid 27 #define rclass r.class 28 #define other r.rother 29 #define right r.rright 30 #define left l.lleft 31 #define next l.lnext 32 33 #define NPROG 1024 34 Inst program[NPROG]; 35 Inst *progp; 36 Inst *startinst; /* First inst. of program; might not be program[0] */ 37 Inst *bstartinst; /* same for backwards machine */ 38 39 typedef struct Ilist Ilist; 40 struct Ilist 41 { 42 Inst *inst; /* Instruction of the thread */ 43 Rangeset se; 44 Posn startp; /* first char of match */ 45 }; 46 47 #define NLIST 128 48 49 Ilist *tl, *nl; /* This list, next list */ 50 Ilist list[2][NLIST]; 51 static Rangeset sempty; 52 53 /* 54 * Actions and Tokens 55 * 56 * 0x100xx are operators, value == precedence 57 * 0x200xx are tokens, i.e. operands for operators 58 */ 59 #define OPERATOR 0x10000 /* Bitmask of all operators */ 60 #define START 0x10000 /* Start, used for marker on stack */ 61 #define RBRA 0x10001 /* Right bracket, ) */ 62 #define LBRA 0x10002 /* Left bracket, ( */ 63 #define OR 0x10003 /* Alternation, | */ 64 #define CAT 0x10004 /* Concatentation, implicit operator */ 65 #define STAR 0x10005 /* Closure, * */ 66 #define PLUS 0x10006 /* a+ == aa* */ 67 #define QUEST 0x10007 /* a? == a|nothing, i.e. 0 or 1 a's */ 68 #define ANY 0x20000 /* Any character but newline, . */ 69 #define NOP 0x20001 /* No operation, internal use only */ 70 #define BOL 0x20002 /* Beginning of line, ^ */ 71 #define EOL 0x20003 /* End of line, $ */ 72 #define CCLASS 0x20004 /* Character class, [] */ 73 #define NCCLASS 0x20005 /* Negated character class, [^] */ 74 #define END 0x20077 /* Terminate: match found */ 75 76 #define ISATOR 0x10000 77 #define ISAND 0x20000 78 79 /* 80 * Parser Information 81 */ 82 typedef struct Node Node; 83 struct Node 84 { 85 Inst *first; 86 Inst *last; 87 }; 88 89 #define NSTACK 20 90 Node andstack[NSTACK]; 91 Node *andp; 92 int atorstack[NSTACK]; 93 int *atorp; 94 int lastwasand; /* Last token was operand */ 95 int cursubid; 96 int subidstack[NSTACK]; 97 int *subidp; 98 int backwards; 99 int nbra; 100 Rune *exprp; /* pointer to next character in source expression */ 101 #define DCLASS 10 /* allocation increment */ 102 int nclass; /* number active */ 103 int Nclass; /* high water mark */ 104 Rune **class; 105 int negateclass; 106 107 void addinst(Ilist *l, Inst *inst, Rangeset *sep); 108 void newmatch(Rangeset*); 109 void bnewmatch(Rangeset*); 110 void pushand(Inst*, Inst*); 111 void pushator(int); 112 Node *popand(int); 113 int popator(void); 114 void startlex(Rune*); 115 int lex(void); 116 void operator(int); 117 void operand(int); 118 void evaluntil(int); 119 void optimize(Inst*); 120 void bldcclass(void); 121 122 void 123 regerror(Err e) 124 { 125 Strzero(&lastregexp); 126 error(e); 127 } 128 129 void 130 regerror_c(Err e, int c) 131 { 132 Strzero(&lastregexp); 133 error_c(e, c); 134 } 135 136 Inst * 137 newinst(int t) 138 { 139 if(progp >= &program[NPROG]) 140 regerror(Etoolong); 141 progp->type = t; 142 progp->left = 0; 143 progp->right = 0; 144 return progp++; 145 } 146 147 Inst * 148 realcompile(Rune *s) 149 { 150 int token; 151 152 startlex(s); 153 atorp = atorstack; 154 andp = andstack; 155 subidp = subidstack; 156 cursubid = 0; 157 lastwasand = FALSE; 158 /* Start with a low priority operator to prime parser */ 159 pushator(START-1); 160 while((token=lex()) != END){ 161 if((token&ISATOR) == OPERATOR) 162 operator(token); 163 else 164 operand(token); 165 } 166 /* Close with a low priority operator */ 167 evaluntil(START); 168 /* Force END */ 169 operand(END); 170 evaluntil(START); 171 if(nbra) 172 regerror(Eleftpar); 173 --andp; /* points to first and only operand */ 174 return andp->first; 175 } 176 177 void 178 compile(String *s) 179 { 180 int i; 181 Inst *oprogp; 182 183 if(Strcmp(s, &lastregexp)==0) 184 return; 185 for(i=0; i<nclass; i++) 186 free(class[i]); 187 nclass = 0; 188 progp = program; 189 backwards = FALSE; 190 startinst = realcompile(s->s); 191 optimize(program); 192 oprogp = progp; 193 backwards = TRUE; 194 bstartinst = realcompile(s->s); 195 optimize(oprogp); 196 Strduplstr(&lastregexp, s); 197 } 198 199 void 200 operand(int t) 201 { 202 Inst *i; 203 if(lastwasand) 204 operator(CAT); /* catenate is implicit */ 205 i = newinst(t); 206 if(t == CCLASS){ 207 if(negateclass) 208 i->type = NCCLASS; /* UGH */ 209 i->rclass = nclass-1; /* UGH */ 210 } 211 pushand(i, i); 212 lastwasand = TRUE; 213 } 214 215 void 216 operator(int t) 217 { 218 if(t==RBRA && --nbra<0) 219 regerror(Erightpar); 220 if(t==LBRA){ 221 /* 222 * if(++cursubid >= NSUBEXP) 223 * regerror(Esubexp); 224 */ 225 cursubid++; /* silently ignored */ 226 nbra++; 227 if(lastwasand) 228 operator(CAT); 229 }else 230 evaluntil(t); 231 if(t!=RBRA) 232 pushator(t); 233 lastwasand = FALSE; 234 if(t==STAR || t==QUEST || t==PLUS || t==RBRA) 235 lastwasand = TRUE; /* these look like operands */ 236 } 237 238 void 239 cant(char *s) 240 { 241 char buf[100]; 242 243 sprint(buf, "regexp: can't happen: %s", s); 244 panic(buf); 245 } 246 247 void 248 pushand(Inst *f, Inst *l) 249 { 250 if(andp >= &andstack[NSTACK]) 251 cant("operand stack overflow"); 252 andp->first = f; 253 andp->last = l; 254 andp++; 255 } 256 257 void 258 pushator(int t) 259 { 260 if(atorp >= &atorstack[NSTACK]) 261 cant("operator stack overflow"); 262 *atorp++=t; 263 if(cursubid >= NSUBEXP) 264 *subidp++= -1; 265 else 266 *subidp++=cursubid; 267 } 268 269 Node * 270 popand(int op) 271 { 272 if(andp <= &andstack[0]) 273 if(op) 274 regerror_c(Emissop, op); 275 else 276 regerror(Ebadregexp); 277 return --andp; 278 } 279 280 int 281 popator(void) 282 { 283 if(atorp <= &atorstack[0]) 284 cant("operator stack underflow"); 285 --subidp; 286 return *--atorp; 287 } 288 289 void 290 evaluntil(int pri) 291 { 292 Node *op1, *op2, *t; 293 Inst *inst1, *inst2; 294 295 while(pri==RBRA || atorp[-1]>=pri){ 296 switch(popator()){ 297 case LBRA: 298 op1 = popand('('); 299 inst2 = newinst(RBRA); 300 inst2->subid = *subidp; 301 op1->last->next = inst2; 302 inst1 = newinst(LBRA); 303 inst1->subid = *subidp; 304 inst1->next = op1->first; 305 pushand(inst1, inst2); 306 return; /* must have been RBRA */ 307 default: 308 panic("unknown regexp operator"); 309 break; 310 case OR: 311 op2 = popand('|'); 312 op1 = popand('|'); 313 inst2 = newinst(NOP); 314 op2->last->next = inst2; 315 op1->last->next = inst2; 316 inst1 = newinst(OR); 317 inst1->right = op1->first; 318 inst1->left = op2->first; 319 pushand(inst1, inst2); 320 break; 321 case CAT: 322 op2 = popand(0); 323 op1 = popand(0); 324 if(backwards && op2->first->type!=END) 325 t = op1, op1 = op2, op2 = t; 326 op1->last->next = op2->first; 327 pushand(op1->first, op2->last); 328 break; 329 case STAR: 330 op2 = popand('*'); 331 inst1 = newinst(OR); 332 op2->last->next = inst1; 333 inst1->right = op2->first; 334 pushand(inst1, inst1); 335 break; 336 case PLUS: 337 op2 = popand('+'); 338 inst1 = newinst(OR); 339 op2->last->next = inst1; 340 inst1->right = op2->first; 341 pushand(op2->first, inst1); 342 break; 343 case QUEST: 344 op2 = popand('?'); 345 inst1 = newinst(OR); 346 inst2 = newinst(NOP); 347 inst1->left = inst2; 348 inst1->right = op2->first; 349 op2->last->next = inst2; 350 pushand(inst1, inst2); 351 break; 352 } 353 } 354 } 355 356 357 void 358 optimize(Inst *start) 359 { 360 Inst *inst, *target; 361 362 for(inst=start; inst->type!=END; inst++){ 363 target = inst->next; 364 while(target->type == NOP) 365 target = target->next; 366 inst->next = target; 367 } 368 } 369 370 #ifdef DEBUG 371 void 372 dumpstack(void){ 373 Node *stk; 374 int *ip; 375 376 dprint("operators\n"); 377 for(ip = atorstack; ip<atorp; ip++) 378 dprint("0%o\n", *ip); 379 dprint("operands\n"); 380 for(stk = andstack; stk<andp; stk++) 381 dprint("0%o\t0%o\n", stk->first->type, stk->last->type); 382 } 383 void 384 dump(void){ 385 Inst *l; 386 387 l = program; 388 do{ 389 dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type, 390 l->left-program, l->right-program); 391 }while(l++->type); 392 } 393 #endif 394 395 void 396 startlex(Rune *s) 397 { 398 exprp = s; 399 nbra = 0; 400 } 401 402 403 int 404 lex(void){ 405 int c= *exprp++; 406 407 switch(c){ 408 case '\\': 409 if(*exprp) 410 if((c= *exprp++)=='n') 411 c='\n'; 412 break; 413 case 0: 414 c = END; 415 --exprp; /* In case we come here again */ 416 break; 417 case '*': 418 c = STAR; 419 break; 420 case '?': 421 c = QUEST; 422 break; 423 case '+': 424 c = PLUS; 425 break; 426 case '|': 427 c = OR; 428 break; 429 case '.': 430 c = ANY; 431 break; 432 case '(': 433 c = LBRA; 434 break; 435 case ')': 436 c = RBRA; 437 break; 438 case '^': 439 c = BOL; 440 break; 441 case '$': 442 c = EOL; 443 break; 444 case '[': 445 c = CCLASS; 446 bldcclass(); 447 break; 448 } 449 return c; 450 } 451 452 long 453 nextrec(void){ 454 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0)) 455 regerror(Ebadclass); 456 if(exprp[0] == '\\'){ 457 exprp++; 458 if(*exprp=='n'){ 459 exprp++; 460 return '\n'; 461 } 462 return *exprp++|0x10000; 463 } 464 return *exprp++; 465 } 466 467 void 468 bldcclass(void) 469 { 470 long c1, c2, n, na; 471 Rune *classp; 472 473 classp = emalloc(DCLASS*RUNESIZE); 474 n = 0; 475 na = DCLASS; 476 /* we have already seen the '[' */ 477 if(*exprp == '^'){ 478 classp[n++] = '\n'; /* don't match newline in negate case */ 479 negateclass = TRUE; 480 exprp++; 481 }else 482 negateclass = FALSE; 483 while((c1 = nextrec()) != ']'){ 484 if(c1 == '-'){ 485 Error: 486 free(classp); 487 regerror(Ebadclass); 488 } 489 if(n+4 >= na){ /* 3 runes plus NUL */ 490 na += DCLASS; 491 classp = erealloc(classp, na*RUNESIZE); 492 } 493 if(*exprp == '-'){ 494 exprp++; /* eat '-' */ 495 if((c2 = nextrec()) == ']') 496 goto Error; 497 classp[n+0] = 0xFFFF; 498 classp[n+1] = c1; 499 classp[n+2] = c2; 500 n += 3; 501 }else 502 classp[n++] = c1; 503 } 504 classp[n] = 0; 505 if(nclass == Nclass){ 506 Nclass += DCLASS; 507 class = erealloc(class, Nclass*sizeof(Rune*)); 508 } 509 class[nclass++] = classp; 510 } 511 512 int 513 classmatch(int classno, int c, int negate) 514 { 515 Rune *p; 516 517 p = class[classno]; 518 while(*p){ 519 if(*p == 0xFFFF){ 520 if(p[1]<=c && c<=p[2]) 521 return !negate; 522 p += 3; 523 }else if(*p++ == c) 524 return !negate; 525 } 526 return negate; 527 } 528 529 /* 530 * Note optimization in addinst: 531 * *l must be pending when addinst called; if *l has been looked 532 * at already, the optimization is a bug. 533 */ 534 void 535 addinst(Ilist *l, Inst *inst, Rangeset *sep) 536 { 537 Ilist *p; 538 539 for(p = l; p->inst; p++){ 540 if(p->inst==inst){ 541 if((sep)->p[0].p1 < p->se.p[0].p1) 542 p->se= *sep; /* this would be bug */ 543 return; /* It's already there */ 544 } 545 } 546 p->inst = inst; 547 p->se= *sep; 548 (p+1)->inst = 0; 549 } 550 551 int 552 execute(File *f, Posn startp, Posn eof) 553 { 554 int flag = 0; 555 Inst *inst; 556 Ilist *tlp; 557 Posn p = startp; 558 int nnl = 0, ntl; 559 int c; 560 int wrapped = 0; 561 int startchar = startinst->type<OPERATOR? startinst->type : 0; 562 563 list[0][0].inst = list[1][0].inst = 0; 564 sel.p[0].p1 = -1; 565 Fgetcset(f, startp); 566 /* Execute machine once for each character */ 567 for(;;p++){ 568 doloop: 569 c = Fgetc(f); 570 if(p>=eof || c<0){ 571 switch(wrapped++){ 572 case 0: /* let loop run one more click */ 573 case 2: 574 break; 575 case 1: /* expired; wrap to beginning */ 576 if(sel.p[0].p1>=0 || eof!=INFINITY) 577 goto Return; 578 list[0][0].inst = list[1][0].inst = 0; 579 Fgetcset(f, (Posn)0); 580 p = 0; 581 goto doloop; 582 default: 583 goto Return; 584 } 585 }else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nnl==0) 586 break; 587 /* fast check for first char */ 588 if(startchar && nnl==0 && c!=startchar) 589 continue; 590 tl = list[flag]; 591 nl = list[flag^=1]; 592 nl->inst = 0; 593 ntl = nnl; 594 nnl = 0; 595 if(sel.p[0].p1<0 && (!wrapped || p<startp || startp==eof)){ 596 /* Add first instruction to this list */ 597 if(++ntl >= NLIST) 598 Overflow: 599 error(Eoverflow); 600 sempty.p[0].p1 = p; 601 addinst(tl, startinst, &sempty); 602 } 603 /* Execute machine until this list is empty */ 604 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */ 605 Switchstmt: 606 switch(inst->type){ 607 default: /* regular character */ 608 if(inst->type==c){ 609 Addinst: 610 if(++nnl >= NLIST) 611 goto Overflow; 612 addinst(nl, inst->next, &tlp->se); 613 } 614 break; 615 case LBRA: 616 if(inst->subid>=0) 617 tlp->se.p[inst->subid].p1 = p; 618 inst = inst->next; 619 goto Switchstmt; 620 case RBRA: 621 if(inst->subid>=0) 622 tlp->se.p[inst->subid].p2 = p; 623 inst = inst->next; 624 goto Switchstmt; 625 case ANY: 626 if(c!='\n') 627 goto Addinst; 628 break; 629 case BOL: 630 if(p == 0){ 631 Step: 632 inst = inst->next; 633 goto Switchstmt; 634 } 635 if(f->getci > 1){ 636 if(f->getcbuf[f->getci-2]=='\n') 637 goto Step; 638 }else{ 639 Rune c; 640 if(Fchars(f, &c, p-1, p)==1 && c=='\n') 641 goto Step; 642 } 643 break; 644 case EOL: 645 if(c == '\n') 646 goto Step; 647 break; 648 case CCLASS: 649 if(c>=0 && classmatch(inst->rclass, c, 0)) 650 goto Addinst; 651 break; 652 case NCCLASS: 653 if(c>=0 && classmatch(inst->rclass, c, 1)) 654 goto Addinst; 655 break; 656 case OR: 657 /* evaluate right choice later */ 658 if(++ntl >= NLIST) 659 goto Overflow; 660 addinst(tlp, inst->right, &tlp->se); 661 /* efficiency: advance and re-evaluate */ 662 inst = inst->left; 663 goto Switchstmt; 664 case END: /* Match! */ 665 tlp->se.p[0].p2 = p; 666 newmatch(&tlp->se); 667 break; 668 } 669 } 670 } 671 Return: 672 return sel.p[0].p1>=0; 673 } 674 675 void 676 newmatch(Rangeset *sp) 677 { 678 int i; 679 680 if(sel.p[0].p1<0 || sp->p[0].p1<sel.p[0].p1 || 681 (sp->p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2)) 682 for(i = 0; i<NSUBEXP; i++) 683 sel.p[i] = sp->p[i]; 684 } 685 686 int 687 bexecute(File *f, Posn startp) 688 { 689 int flag = 0; 690 Inst *inst; 691 Ilist *tlp; 692 Posn p = startp; 693 int nnl = 0, ntl; 694 int c; 695 int wrapped = 0; 696 int startchar = bstartinst->type<OPERATOR? bstartinst->type : 0; 697 698 list[0][0].inst = list[1][0].inst = 0; 699 sel.p[0].p1= -1; 700 Fgetcset(f, startp); 701 /* Execute machine once for each character, including terminal NUL */ 702 for(;;--p){ 703 doloop: 704 if((c = Fbgetc(f))==-1){ 705 switch(wrapped++){ 706 case 0: /* let loop run one more click */ 707 case 2: 708 break; 709 case 1: /* expired; wrap to end */ 710 if(sel.p[0].p1>=0) 711 case 3: 712 goto Return; 713 list[0][0].inst = list[1][0].inst = 0; 714 Fgetcset(f, f->nrunes); 715 p = f->nrunes; 716 goto doloop; 717 default: 718 goto Return; 719 } 720 }else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nnl==0) 721 break; 722 /* fast check for first char */ 723 if(startchar && nnl==0 && c!=startchar) 724 continue; 725 tl = list[flag]; 726 nl = list[flag^=1]; 727 nl->inst = 0; 728 ntl = nnl; 729 nnl = 0; 730 if(sel.p[0].p1<0 && (!wrapped || p>startp)){ 731 /* Add first instruction to this list */ 732 if(++ntl >= NLIST) 733 Overflow: 734 error(Eoverflow); 735 /* the minus is so the optimizations in addinst work */ 736 sempty.p[0].p1 = -p; 737 addinst(tl, bstartinst, &sempty); 738 } 739 /* Execute machine until this list is empty */ 740 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */ 741 Switchstmt: 742 switch(inst->type){ 743 default: /* regular character */ 744 if(inst->type == c){ 745 Addinst: 746 if(++nnl >= NLIST) 747 goto Overflow; 748 addinst(nl, inst->next, &tlp->se); 749 } 750 break; 751 case LBRA: 752 if(inst->subid>=0) 753 tlp->se.p[inst->subid].p1 = p; 754 inst = inst->next; 755 goto Switchstmt; 756 case RBRA: 757 if(inst->subid >= 0) 758 tlp->se.p[inst->subid].p2 = p; 759 inst = inst->next; 760 goto Switchstmt; 761 case ANY: 762 if(c != '\n') 763 goto Addinst; 764 break; 765 case BOL: 766 if(c=='\n' || p==0){ 767 Step: 768 inst = inst->next; 769 goto Switchstmt; 770 } 771 break; 772 case EOL: 773 if(f->getci<f->ngetc-1){ 774 if(f->getcbuf[f->getci+1]=='\n') 775 goto Step; 776 }else if(p<f->nrunes-1){ 777 Rune c; 778 if(Fchars(f, &c, p, p+1)==1 && c=='\n') 779 goto Step; 780 } 781 break; 782 case CCLASS: 783 if(c>=0 && classmatch(inst->rclass, c, 0)) 784 goto Addinst; 785 break; 786 case NCCLASS: 787 if(c>=0 && classmatch(inst->rclass, c, 1)) 788 goto Addinst; 789 break; 790 case OR: 791 /* evaluate right choice later */ 792 if(++ntl >= NLIST) 793 goto Overflow; 794 addinst(tlp, inst->right, &tlp->se); 795 /* efficiency: advance and re-evaluate */ 796 inst = inst->left; 797 goto Switchstmt; 798 case END: /* Match! */ 799 tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* minus sign */ 800 tlp->se.p[0].p2 = p; 801 bnewmatch(&tlp->se); 802 break; 803 } 804 } 805 } 806 Return: 807 return sel.p[0].p1>=0; 808 } 809 810 void 811 bnewmatch(Rangeset *sp) 812 { 813 int i; 814 if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 || (sp->p[0].p1==sel.p[0].p2 && sp->p[0].p2<sel.p[0].p1)) 815 for(i = 0; i<NSUBEXP; i++){ /* note the reversal; p1<=p2 */ 816 sel.p[i].p1 = sp->p[i].p2; 817 sel.p[i].p2 = sp->p[i].p1; 818 } 819 } 820