1 #include "sam.h" 2 3 Rangeset sel; 4 String lastregexp; 5 /* 6 * Machine Information 7 */ 8 typedef struct Inst Inst; 9 10 struct Inst 11 { 12 long type; /* <= Runemax ==> literal, otherwise action */ 13 union { 14 int rsid; 15 int rsubid; 16 int class; 17 struct Inst *rother; 18 struct Inst *rright; 19 } r; 20 union{ 21 struct Inst *lleft; 22 struct Inst *lnext; 23 } l; 24 }; 25 #define sid r.rsid 26 #define subid r.rsubid 27 #define rclass r.class 28 #define other r.rother 29 #define right r.rright 30 #define left l.lleft 31 #define next l.lnext 32 33 #define NPROG 1024 34 Inst program[NPROG]; 35 Inst *progp; 36 Inst *startinst; /* First inst. of program; might not be program[0] */ 37 Inst *bstartinst; /* same for backwards machine */ 38 39 typedef struct Ilist Ilist; 40 struct Ilist 41 { 42 Inst *inst; /* Instruction of the thread */ 43 Rangeset se; 44 Posn startp; /* first char of match */ 45 }; 46 47 #define NLIST 127 48 49 Ilist *tl, *nl; /* This list, next list */ 50 Ilist list[2][NLIST+1]; /* +1 for trailing null */ 51 static Rangeset sempty; 52 53 /* 54 * Actions and Tokens 55 * 56 * 0x100xx are operators, value == precedence 57 * 0x200xx are tokens, i.e. operands for operators 58 */ 59 enum { 60 OPERATOR = Runemask+1, /* Bitmask of all operators */ 61 START = OPERATOR, /* Start, used for marker on stack */ 62 RBRA, /* Right bracket, ) */ 63 LBRA, /* Left bracket, ( */ 64 OR, /* Alternation, | */ 65 CAT, /* Concatentation, implicit operator */ 66 STAR, /* Closure, * */ 67 PLUS, /* a+ == aa* */ 68 QUEST, /* a? == a|nothing, i.e. 0 or 1 a's */ 69 70 ANY = OPERATOR<<1, /* Any character but newline, . */ 71 NOP, /* No operation, internal use only */ 72 BOL, /* Beginning of line, ^ */ 73 EOL, /* End of line, $ */ 74 CCLASS, /* Character class, [] */ 75 NCCLASS, /* Negated character class, [^] */ 76 END, /* Terminate: match found */ 77 78 ISATOR = OPERATOR, 79 ISAND = OPERATOR<<1, 80 }; 81 82 /* 83 * Parser Information 84 */ 85 typedef struct Node Node; 86 struct Node 87 { 88 Inst *first; 89 Inst *last; 90 }; 91 92 #define NSTACK 20 93 Node andstack[NSTACK]; 94 Node *andp; 95 int atorstack[NSTACK]; 96 int *atorp; 97 int lastwasand; /* Last token was operand */ 98 int cursubid; 99 int subidstack[NSTACK]; 100 int *subidp; 101 int backwards; 102 int nbra; 103 Rune *exprp; /* pointer to next character in source expression */ 104 #define DCLASS 10 /* allocation increment */ 105 int nclass; /* number active */ 106 int Nclass; /* high water mark */ 107 Rune **class; 108 int negateclass; 109 110 int addinst(Ilist *l, Inst *inst, Rangeset *sep); 111 void newmatch(Rangeset*); 112 void bnewmatch(Rangeset*); 113 void pushand(Inst*, Inst*); 114 void pushator(int); 115 Node *popand(int); 116 int popator(void); 117 void startlex(Rune*); 118 int lex(void); 119 void operator(int); 120 void operand(int); 121 void evaluntil(int); 122 void optimize(Inst*); 123 void bldcclass(void); 124 125 void 126 regerror(Err e) 127 { 128 Strzero(&lastregexp); 129 error(e); 130 } 131 132 void 133 regerror_c(Err e, int c) 134 { 135 Strzero(&lastregexp); 136 error_c(e, c); 137 } 138 139 Inst * 140 newinst(int t) 141 { 142 if(progp >= &program[NPROG]) 143 regerror(Etoolong); 144 progp->type = t; 145 progp->left = 0; 146 progp->right = 0; 147 return progp++; 148 } 149 150 Inst * 151 realcompile(Rune *s) 152 { 153 int token; 154 155 startlex(s); 156 atorp = atorstack; 157 andp = andstack; 158 subidp = subidstack; 159 cursubid = 0; 160 lastwasand = FALSE; 161 /* Start with a low priority operator to prime parser */ 162 pushator(START-1); 163 while((token=lex()) != END){ 164 if((token&ISATOR) == OPERATOR) 165 operator(token); 166 else 167 operand(token); 168 } 169 /* Close with a low priority operator */ 170 evaluntil(START); 171 /* Force END */ 172 operand(END); 173 evaluntil(START); 174 if(nbra) 175 regerror(Eleftpar); 176 --andp; /* points to first and only operand */ 177 return andp->first; 178 } 179 180 void 181 compile(String *s) 182 { 183 int i; 184 Inst *oprogp; 185 186 if(Strcmp(s, &lastregexp)==0) 187 return; 188 for(i=0; i<nclass; i++) 189 free(class[i]); 190 nclass = 0; 191 progp = program; 192 backwards = FALSE; 193 startinst = realcompile(s->s); 194 optimize(program); 195 oprogp = progp; 196 backwards = TRUE; 197 bstartinst = realcompile(s->s); 198 optimize(oprogp); 199 Strduplstr(&lastregexp, s); 200 } 201 202 void 203 operand(int t) 204 { 205 Inst *i; 206 if(lastwasand) 207 operator(CAT); /* catenate is implicit */ 208 i = newinst(t); 209 if(t == CCLASS){ 210 if(negateclass) 211 i->type = NCCLASS; /* UGH */ 212 i->rclass = nclass-1; /* UGH */ 213 } 214 pushand(i, i); 215 lastwasand = TRUE; 216 } 217 218 void 219 operator(int t) 220 { 221 if(t==RBRA && --nbra<0) 222 regerror(Erightpar); 223 if(t==LBRA){ 224 /* 225 * if(++cursubid >= NSUBEXP) 226 * regerror(Esubexp); 227 */ 228 cursubid++; /* silently ignored */ 229 nbra++; 230 if(lastwasand) 231 operator(CAT); 232 }else 233 evaluntil(t); 234 if(t!=RBRA) 235 pushator(t); 236 lastwasand = FALSE; 237 if(t==STAR || t==QUEST || t==PLUS || t==RBRA) 238 lastwasand = TRUE; /* these look like operands */ 239 } 240 241 void 242 cant(char *s) 243 { 244 char buf[100]; 245 246 sprint(buf, "regexp: can't happen: %s", s); 247 panic(buf); 248 } 249 250 void 251 pushand(Inst *f, Inst *l) 252 { 253 if(andp >= &andstack[NSTACK]) 254 cant("operand stack overflow"); 255 andp->first = f; 256 andp->last = l; 257 andp++; 258 } 259 260 void 261 pushator(int t) 262 { 263 if(atorp >= &atorstack[NSTACK]) 264 cant("operator stack overflow"); 265 *atorp++=t; 266 if(cursubid >= NSUBEXP) 267 *subidp++= -1; 268 else 269 *subidp++=cursubid; 270 } 271 272 Node * 273 popand(int op) 274 { 275 if(andp <= &andstack[0]) 276 if(op) 277 regerror_c(Emissop, op); 278 else 279 regerror(Ebadregexp); 280 return --andp; 281 } 282 283 int 284 popator(void) 285 { 286 if(atorp <= &atorstack[0]) 287 cant("operator stack underflow"); 288 --subidp; 289 return *--atorp; 290 } 291 292 void 293 evaluntil(int pri) 294 { 295 Node *op1, *op2, *t; 296 Inst *inst1, *inst2; 297 298 while(pri==RBRA || atorp[-1]>=pri){ 299 switch(popator()){ 300 case LBRA: 301 op1 = popand('('); 302 inst2 = newinst(RBRA); 303 inst2->subid = *subidp; 304 op1->last->next = inst2; 305 inst1 = newinst(LBRA); 306 inst1->subid = *subidp; 307 inst1->next = op1->first; 308 pushand(inst1, inst2); 309 return; /* must have been RBRA */ 310 default: 311 panic("unknown regexp operator"); 312 break; 313 case OR: 314 op2 = popand('|'); 315 op1 = popand('|'); 316 inst2 = newinst(NOP); 317 op2->last->next = inst2; 318 op1->last->next = inst2; 319 inst1 = newinst(OR); 320 inst1->right = op1->first; 321 inst1->left = op2->first; 322 pushand(inst1, inst2); 323 break; 324 case CAT: 325 op2 = popand(0); 326 op1 = popand(0); 327 if(backwards && op2->first->type!=END) 328 t = op1, op1 = op2, op2 = t; 329 op1->last->next = op2->first; 330 pushand(op1->first, op2->last); 331 break; 332 case STAR: 333 op2 = popand('*'); 334 inst1 = newinst(OR); 335 op2->last->next = inst1; 336 inst1->right = op2->first; 337 pushand(inst1, inst1); 338 break; 339 case PLUS: 340 op2 = popand('+'); 341 inst1 = newinst(OR); 342 op2->last->next = inst1; 343 inst1->right = op2->first; 344 pushand(op2->first, inst1); 345 break; 346 case QUEST: 347 op2 = popand('?'); 348 inst1 = newinst(OR); 349 inst2 = newinst(NOP); 350 inst1->left = inst2; 351 inst1->right = op2->first; 352 op2->last->next = inst2; 353 pushand(inst1, inst2); 354 break; 355 } 356 } 357 } 358 359 360 void 361 optimize(Inst *start) 362 { 363 Inst *inst, *target; 364 365 for(inst=start; inst->type!=END; inst++){ 366 target = inst->next; 367 while(target->type == NOP) 368 target = target->next; 369 inst->next = target; 370 } 371 } 372 373 #ifdef DEBUG 374 void 375 dumpstack(void){ 376 Node *stk; 377 int *ip; 378 379 dprint("operators\n"); 380 for(ip = atorstack; ip<atorp; ip++) 381 dprint("0%o\n", *ip); 382 dprint("operands\n"); 383 for(stk = andstack; stk<andp; stk++) 384 dprint("0%o\t0%o\n", stk->first->type, stk->last->type); 385 } 386 void 387 dump(void){ 388 Inst *l; 389 390 l = program; 391 do{ 392 dprint("%d:\t0%o\t%d\t%d\n", l-program, l->type, 393 l->left-program, l->right-program); 394 }while(l++->type); 395 } 396 #endif 397 398 void 399 startlex(Rune *s) 400 { 401 exprp = s; 402 nbra = 0; 403 } 404 405 406 int 407 lex(void){ 408 int c= *exprp++; 409 410 switch(c){ 411 case '\\': 412 if(*exprp) 413 if((c= *exprp++)=='n') 414 c='\n'; 415 break; 416 case 0: 417 c = END; 418 --exprp; /* In case we come here again */ 419 break; 420 case '*': 421 c = STAR; 422 break; 423 case '?': 424 c = QUEST; 425 break; 426 case '+': 427 c = PLUS; 428 break; 429 case '|': 430 c = OR; 431 break; 432 case '.': 433 c = ANY; 434 break; 435 case '(': 436 c = LBRA; 437 break; 438 case ')': 439 c = RBRA; 440 break; 441 case '^': 442 c = BOL; 443 break; 444 case '$': 445 c = EOL; 446 break; 447 case '[': 448 c = CCLASS; 449 bldcclass(); 450 break; 451 } 452 return c; 453 } 454 455 long 456 nextrec(void){ 457 if(exprp[0]==0 || (exprp[0]=='\\' && exprp[1]==0)) 458 regerror(Ebadclass); 459 if(exprp[0] == '\\'){ 460 exprp++; 461 if(*exprp=='n'){ 462 exprp++; 463 return '\n'; 464 } 465 return *exprp++|(Runemax+1); 466 } 467 return *exprp++; 468 } 469 470 void 471 bldcclass(void) 472 { 473 long c1, c2, n, na; 474 Rune *classp; 475 476 classp = emalloc(DCLASS*RUNESIZE); 477 n = 0; 478 na = DCLASS; 479 /* we have already seen the '[' */ 480 if(*exprp == '^'){ 481 classp[n++] = '\n'; /* don't match newline in negate case */ 482 negateclass = TRUE; 483 exprp++; 484 }else 485 negateclass = FALSE; 486 while((c1 = nextrec()) != ']'){ 487 if(c1 == '-'){ 488 Error: 489 free(classp); 490 regerror(Ebadclass); 491 } 492 if(n+4 >= na){ /* 3 runes plus NUL */ 493 na += DCLASS; 494 classp = erealloc(classp, na*RUNESIZE); 495 } 496 if(*exprp == '-'){ 497 exprp++; /* eat '-' */ 498 if((c2 = nextrec()) == ']') 499 goto Error; 500 classp[n+0] = Runemax; 501 classp[n+1] = c1; 502 classp[n+2] = c2; 503 n += 3; 504 }else 505 classp[n++] = c1; 506 } 507 classp[n] = 0; 508 if(nclass == Nclass){ 509 Nclass += DCLASS; 510 class = erealloc(class, Nclass*sizeof(Rune*)); 511 } 512 class[nclass++] = classp; 513 } 514 515 int 516 classmatch(int classno, int c, int negate) 517 { 518 Rune *p; 519 520 p = class[classno]; 521 while(*p){ 522 if(*p == Runemax){ 523 if(p[1]<=c && c<=p[2]) 524 return !negate; 525 p += 3; 526 }else if(*p++ == c) 527 return !negate; 528 } 529 return negate; 530 } 531 532 /* 533 * Note optimization in addinst: 534 * *l must be pending when addinst called; if *l has been looked 535 * at already, the optimization is a bug. 536 */ 537 int 538 addinst(Ilist *l, Inst *inst, Rangeset *sep) 539 { 540 Ilist *p; 541 542 for(p = l; p->inst; p++){ 543 if(p->inst==inst){ 544 if((sep)->p[0].p1 < p->se.p[0].p1) 545 p->se= *sep; /* this would be bug */ 546 return 0; /* It's already there */ 547 } 548 } 549 p->inst = inst; 550 p->se= *sep; 551 (p+1)->inst = 0; 552 return 1; 553 } 554 555 int 556 execute(File *f, Posn startp, Posn eof) 557 { 558 int flag = 0; 559 Inst *inst; 560 Ilist *tlp; 561 Posn p = startp; 562 int nnl = 0, ntl; 563 int c; 564 int wrapped = 0; 565 int startchar = startinst->type<OPERATOR? startinst->type : 0; 566 567 list[0][0].inst = list[1][0].inst = 0; 568 sel.p[0].p1 = -1; 569 /* Execute machine once for each character */ 570 for(;;p++){ 571 doloop: 572 c = filereadc(f, p); 573 if(p>=eof || c<0){ 574 switch(wrapped++){ 575 case 0: /* let loop run one more click */ 576 case 2: 577 break; 578 case 1: /* expired; wrap to beginning */ 579 if(sel.p[0].p1>=0 || eof!=INFINITY) 580 goto Return; 581 list[0][0].inst = list[1][0].inst = 0; 582 p = 0; 583 goto doloop; 584 default: 585 goto Return; 586 } 587 }else if(((wrapped && p>=startp) || sel.p[0].p1>0) && nnl==0) 588 break; 589 /* fast check for first char */ 590 if(startchar && nnl==0 && c!=startchar) 591 continue; 592 tl = list[flag]; 593 nl = list[flag^=1]; 594 nl->inst = 0; 595 ntl = nnl; 596 nnl = 0; 597 if(sel.p[0].p1<0 && (!wrapped || p<startp || startp==eof)){ 598 /* Add first instruction to this list */ 599 sempty.p[0].p1 = p; 600 if(addinst(tl, startinst, &sempty)) 601 if(++ntl >= NLIST) 602 Overflow: 603 error(Eoverflow); 604 } 605 /* Execute machine until this list is empty */ 606 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */ 607 Switchstmt: 608 switch(inst->type){ 609 default: /* regular character */ 610 if(inst->type==c){ 611 Addinst: 612 if(addinst(nl, inst->next, &tlp->se)) 613 if(++nnl >= NLIST) 614 goto Overflow; 615 } 616 break; 617 case LBRA: 618 if(inst->subid>=0) 619 tlp->se.p[inst->subid].p1 = p; 620 inst = inst->next; 621 goto Switchstmt; 622 case RBRA: 623 if(inst->subid>=0) 624 tlp->se.p[inst->subid].p2 = p; 625 inst = inst->next; 626 goto Switchstmt; 627 case ANY: 628 if(c!='\n') 629 goto Addinst; 630 break; 631 case BOL: 632 if(p==0 || filereadc(f, p - 1)=='\n'){ 633 Step: 634 inst = inst->next; 635 goto Switchstmt; 636 } 637 break; 638 case EOL: 639 if(c == '\n') 640 goto Step; 641 break; 642 case CCLASS: 643 if(c>=0 && classmatch(inst->rclass, c, 0)) 644 goto Addinst; 645 break; 646 case NCCLASS: 647 if(c>=0 && classmatch(inst->rclass, c, 1)) 648 goto Addinst; 649 break; 650 case OR: 651 /* evaluate right choice later */ 652 if(addinst(tlp, inst->right, &tlp->se)) 653 if(++ntl >= NLIST) 654 goto Overflow; 655 /* efficiency: advance and re-evaluate */ 656 inst = inst->left; 657 goto Switchstmt; 658 case END: /* Match! */ 659 tlp->se.p[0].p2 = p; 660 newmatch(&tlp->se); 661 break; 662 } 663 } 664 } 665 Return: 666 return sel.p[0].p1>=0; 667 } 668 669 void 670 newmatch(Rangeset *sp) 671 { 672 int i; 673 674 if(sel.p[0].p1<0 || sp->p[0].p1<sel.p[0].p1 || 675 (sp->p[0].p1==sel.p[0].p1 && sp->p[0].p2>sel.p[0].p2)) 676 for(i = 0; i<NSUBEXP; i++) 677 sel.p[i] = sp->p[i]; 678 } 679 680 int 681 bexecute(File *f, Posn startp) 682 { 683 int flag = 0; 684 Inst *inst; 685 Ilist *tlp; 686 Posn p = startp; 687 int nnl = 0, ntl; 688 int c; 689 int wrapped = 0; 690 int startchar = bstartinst->type<OPERATOR? bstartinst->type : 0; 691 692 list[0][0].inst = list[1][0].inst = 0; 693 sel.p[0].p1= -1; 694 /* Execute machine once for each character, including terminal NUL */ 695 for(;;--p){ 696 doloop: 697 if((c = filereadc(f, p - 1))==-1){ 698 switch(wrapped++){ 699 case 0: /* let loop run one more click */ 700 case 2: 701 break; 702 case 1: /* expired; wrap to end */ 703 if(sel.p[0].p1>=0) 704 case 3: 705 goto Return; 706 list[0][0].inst = list[1][0].inst = 0; 707 p = f->nc; 708 goto doloop; 709 default: 710 goto Return; 711 } 712 }else if(((wrapped && p<=startp) || sel.p[0].p1>0) && nnl==0) 713 break; 714 /* fast check for first char */ 715 if(startchar && nnl==0 && c!=startchar) 716 continue; 717 tl = list[flag]; 718 nl = list[flag^=1]; 719 nl->inst = 0; 720 ntl = nnl; 721 nnl = 0; 722 if(sel.p[0].p1<0 && (!wrapped || p>startp)){ 723 /* Add first instruction to this list */ 724 /* the minus is so the optimizations in addinst work */ 725 sempty.p[0].p1 = -p; 726 if(addinst(tl, bstartinst, &sempty)) 727 if(++ntl >= NLIST) 728 Overflow: 729 error(Eoverflow); 730 } 731 /* Execute machine until this list is empty */ 732 for(tlp = tl; inst = tlp->inst; tlp++){ /* assignment = */ 733 Switchstmt: 734 switch(inst->type){ 735 default: /* regular character */ 736 if(inst->type == c){ 737 Addinst: 738 if(addinst(nl, inst->next, &tlp->se)) 739 if(++nnl >= NLIST) 740 goto Overflow; 741 } 742 break; 743 case LBRA: 744 if(inst->subid>=0) 745 tlp->se.p[inst->subid].p1 = p; 746 inst = inst->next; 747 goto Switchstmt; 748 case RBRA: 749 if(inst->subid >= 0) 750 tlp->se.p[inst->subid].p2 = p; 751 inst = inst->next; 752 goto Switchstmt; 753 case ANY: 754 if(c != '\n') 755 goto Addinst; 756 break; 757 case BOL: 758 if(c=='\n' || p==0){ 759 Step: 760 inst = inst->next; 761 goto Switchstmt; 762 } 763 break; 764 case EOL: 765 if(p==f->nc || filereadc(f, p)=='\n') 766 goto Step; 767 break; 768 case CCLASS: 769 if(c>=0 && classmatch(inst->rclass, c, 0)) 770 goto Addinst; 771 break; 772 case NCCLASS: 773 if(c>=0 && classmatch(inst->rclass, c, 1)) 774 goto Addinst; 775 break; 776 case OR: 777 /* evaluate right choice later */ 778 if(addinst(tl, inst->right, &tlp->se)) 779 if(++ntl >= NLIST) 780 goto Overflow; 781 /* efficiency: advance and re-evaluate */ 782 inst = inst->left; 783 goto Switchstmt; 784 case END: /* Match! */ 785 tlp->se.p[0].p1 = -tlp->se.p[0].p1; /* minus sign */ 786 tlp->se.p[0].p2 = p; 787 bnewmatch(&tlp->se); 788 break; 789 } 790 } 791 } 792 Return: 793 return sel.p[0].p1>=0; 794 } 795 796 void 797 bnewmatch(Rangeset *sp) 798 { 799 int i; 800 if(sel.p[0].p1<0 || sp->p[0].p1>sel.p[0].p2 || (sp->p[0].p1==sel.p[0].p2 && sp->p[0].p2<sel.p[0].p1)) 801 for(i = 0; i<NSUBEXP; i++){ /* note the reversal; p1<=p2 */ 802 sel.p[i].p1 = sp->p[i].p2; 803 sel.p[i].p2 = sp->p[i].p1; 804 } 805 } 806