1 #include <lib9.h> 2 #include "regexp.h" 3 #include "regcomp.h" 4 5 #define TRUE 1 6 #define FALSE 0 7 8 /* 9 * Parser Information 10 */ 11 typedef 12 struct Node 13 { 14 Reinst* first; 15 Reinst* last; 16 }Node; 17 18 Reprog RePrOg; 19 20 #define NSTACK 20 21 static Node andstack[NSTACK]; 22 static Node *andp; 23 static int atorstack[NSTACK]; 24 static int* atorp; 25 static int cursubid; /* id of current subexpression */ 26 static int subidstack[NSTACK]; /* parallel to atorstack */ 27 static int* subidp; 28 static int lastwasand; /* Last token was operand */ 29 static int nbra; 30 static char* exprp; /* pointer to next character in source expression */ 31 static int lexdone; 32 static int nclass; 33 static Reclass*classp; 34 static Reinst* freep; 35 static int errors; 36 static Rune yyrune; /* last lex'd rune */ 37 static Reclass*yyclassp; /* last lex'd class */ 38 39 /* predeclared crap */ 40 static void operator(int); 41 static void pushand(Reinst*, Reinst*); 42 static void pushator(int); 43 static void evaluntil(int); 44 static int bldcclass(void); 45 46 static jmp_buf regkaboom; 47 48 static void 49 rcerror(char *s) 50 { 51 errors++; 52 regerror(s); 53 longjmp(regkaboom, 1); 54 } 55 56 static Reinst* 57 newinst(int t) 58 { 59 freep->type = t; 60 freep->u2.left = 0; 61 freep->u1.right = 0; 62 return freep++; 63 } 64 65 static void 66 operand(int t) 67 { 68 Reinst *i; 69 70 if(lastwasand) 71 operator(CAT); /* catenate is implicit */ 72 i = newinst(t); 73 74 if(t == CCLASS || t == NCCLASS) 75 i->u1.cp = yyclassp; 76 if(t == RUNE) 77 i->u1.r = yyrune; 78 79 pushand(i, i); 80 lastwasand = TRUE; 81 } 82 83 static void 84 operator(int t) 85 { 86 if(t==RBRA && --nbra<0) 87 rcerror("unmatched right paren"); 88 if(t==LBRA){ 89 if(++cursubid >= NSUBEXP) 90 rcerror ("too many subexpressions"); 91 nbra++; 92 if(lastwasand) 93 operator(CAT); 94 } else 95 evaluntil(t); 96 if(t != RBRA) 97 pushator(t); 98 lastwasand = FALSE; 99 if(t==STAR || t==QUEST || t==PLUS || t==RBRA) 100 lastwasand = TRUE; /* these look like operands */ 101 } 102 103 static void 104 regerr2(char *s, int c) 105 { 106 char buf[100]; 107 char *cp = buf; 108 while(*s) 109 *cp++ = *s++; 110 *cp++ = c; 111 *cp = '\0'; 112 rcerror(buf); 113 } 114 115 static void 116 cant(char *s) 117 { 118 char buf[100]; 119 strcpy(buf, "can't happen: "); 120 strcat(buf, s); 121 rcerror(buf); 122 } 123 124 static void 125 pushand(Reinst *f, Reinst *l) 126 { 127 if(andp >= &andstack[NSTACK]) 128 cant("operand stack overflow"); 129 andp->first = f; 130 andp->last = l; 131 andp++; 132 } 133 134 static void 135 pushator(int t) 136 { 137 if(atorp >= &atorstack[NSTACK]) 138 cant("operator stack overflow"); 139 *atorp++ = t; 140 *subidp++ = cursubid; 141 } 142 143 static Node* 144 popand(int op) 145 { 146 Reinst *inst; 147 148 if(andp <= &andstack[0]){ 149 regerr2("missing operand for ", op); 150 inst = newinst(NOP); 151 pushand(inst,inst); 152 } 153 return --andp; 154 } 155 156 static int 157 popator(void) 158 { 159 if(atorp <= &atorstack[0]) 160 cant("operator stack underflow"); 161 --subidp; 162 return *--atorp; 163 } 164 165 static void 166 evaluntil(int pri) 167 { 168 Node *op1, *op2; 169 Reinst *inst1, *inst2; 170 171 while(pri==RBRA || atorp[-1]>=pri){ 172 switch(popator()){ 173 default: 174 rcerror("unknown operator in evaluntil"); 175 break; 176 case LBRA: /* must have been RBRA */ 177 op1 = popand('('); 178 inst2 = newinst(RBRA); 179 inst2->u1.subid = *subidp; 180 op1->last->u2.next = inst2; 181 inst1 = newinst(LBRA); 182 inst1->u1.subid = *subidp; 183 inst1->u2.next = op1->first; 184 pushand(inst1, inst2); 185 return; 186 case OR: 187 op2 = popand('|'); 188 op1 = popand('|'); 189 inst2 = newinst(NOP); 190 op2->last->u2.next = inst2; 191 op1->last->u2.next = inst2; 192 inst1 = newinst(OR); 193 inst1->u1.right = op1->first; 194 inst1->u2.left = op2->first; 195 pushand(inst1, inst2); 196 break; 197 case CAT: 198 op2 = popand(0); 199 op1 = popand(0); 200 op1->last->u2.next = op2->first; 201 pushand(op1->first, op2->last); 202 break; 203 case STAR: 204 op2 = popand('*'); 205 inst1 = newinst(OR); 206 op2->last->u2.next = inst1; 207 inst1->u1.right = op2->first; 208 pushand(inst1, inst1); 209 break; 210 case PLUS: 211 op2 = popand('+'); 212 inst1 = newinst(OR); 213 op2->last->u2.next = inst1; 214 inst1->u1.right = op2->first; 215 pushand(op2->first, inst1); 216 break; 217 case QUEST: 218 op2 = popand('?'); 219 inst1 = newinst(OR); 220 inst2 = newinst(NOP); 221 inst1->u2.left = inst2; 222 inst1->u1.right = op2->first; 223 op2->last->u2.next = inst2; 224 pushand(inst1, inst2); 225 break; 226 } 227 } 228 } 229 230 static Reprog* 231 optimize(Reprog *pp) 232 { 233 Reinst *inst, *target; 234 int size; 235 Reprog *npp; 236 Reclass *cl; 237 int diff; 238 239 /* 240 * get rid of NOOP chains 241 */ 242 for(inst=pp->firstinst; inst->type!=END; inst++){ 243 target = inst->u2.next; 244 while(target->type == NOP) 245 target = target->u2.next; 246 inst->u2.next = target; 247 } 248 249 /* 250 * The original allocation is for an area larger than 251 * necessary. Reallocate to the actual space used 252 * and then relocate the code. 253 */ 254 size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst); 255 npp = (Reprog *)realloc(pp, size); 256 if(npp==0 || npp==pp) 257 return pp; 258 diff = (char *)npp - (char *)pp; 259 freep = (Reinst *)((char *)freep + diff); 260 for(inst=npp->firstinst; inst<freep; inst++){ 261 switch(inst->type){ 262 case OR: 263 case STAR: 264 case PLUS: 265 case QUEST: 266 *(char **)&inst->u1.right += diff; 267 break; 268 case CCLASS: 269 case NCCLASS: 270 *(char **)&inst->u1.right += diff; 271 cl = inst->u1.cp; 272 *(char **)&cl->end += diff; 273 break; 274 } 275 *(char **)&inst->u2.left += diff; 276 } 277 *(char **)&npp->startinst += diff; 278 return npp; 279 } 280 281 #ifdef DEBUG 282 static void 283 dumpstack(void){ 284 Node *stk; 285 int *ip; 286 287 print("operators\n"); 288 for(ip=atorstack; ip<atorp; ip++) 289 print("0%o\n", *ip); 290 print("operands\n"); 291 for(stk=andstack; stk<andp; stk++) 292 print("0%o\t0%o\n", stk->first->type, stk->last->type); 293 } 294 295 static void 296 dump(Reprog *pp) 297 { 298 Reinst *l; 299 Rune *p; 300 301 l = pp->firstinst; 302 do{ 303 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type, 304 l->u2.left-pp->firstinst, l->u1.right-pp->firstinst); 305 if(l->type == RUNE) 306 print("\t%C\n", l->r); 307 else if(l->type == CCLASS || l->type == NCCLASS){ 308 print("\t["); 309 if(l->type == NCCLASS) 310 print("^"); 311 for(p = l->cp->spans; p < l->cp->end; p += 2) 312 if(p[0] == p[1]) 313 print("%C", p[0]); 314 else 315 print("%C-%C", p[0], p[1]); 316 print("]\n"); 317 } else 318 print("\n"); 319 }while(l++->type); 320 } 321 #endif 322 323 static Reclass* 324 newclass(void) 325 { 326 if(nclass >= NCLASS) 327 regerr2("too many character classes; limit", NCLASS+'0'); 328 return &(classp[nclass++]); 329 } 330 331 static int 332 nextc(Rune *rp) 333 { 334 if(lexdone){ 335 *rp = 0; 336 return 1; 337 } 338 exprp += chartorune(rp, exprp); 339 if(*rp == L'\\'){ 340 exprp += chartorune(rp, exprp); 341 return 1; 342 } 343 if(*rp == 0) 344 lexdone = 1; 345 return 0; 346 } 347 348 static int 349 lex(int literal, int dot_type) 350 { 351 int quoted; 352 353 quoted = nextc(&yyrune); 354 if(literal || quoted){ 355 if(yyrune == 0) 356 return END; 357 return RUNE; 358 } 359 360 switch(yyrune){ 361 case 0: 362 return END; 363 case L'*': 364 return STAR; 365 case L'?': 366 return QUEST; 367 case L'+': 368 return PLUS; 369 case L'|': 370 return OR; 371 case L'.': 372 return dot_type; 373 case L'(': 374 return LBRA; 375 case L')': 376 return RBRA; 377 case L'^': 378 return BOL; 379 case L'$': 380 return EOL; 381 case L'[': 382 return bldcclass(); 383 } 384 return RUNE; 385 } 386 387 static int 388 bldcclass(void) 389 { 390 int type; 391 Rune r[NCCRUNE]; 392 Rune *p, *ep, *np; 393 Rune rune; 394 int quoted; 395 396 /* we have already seen the '[' */ 397 type = CCLASS; 398 yyclassp = newclass(); 399 400 /* look ahead for negation */ 401 /* SPECIAL CASE!!! negated classes don't match \n */ 402 ep = r; 403 quoted = nextc(&rune); 404 if(!quoted && rune == L'^'){ 405 type = NCCLASS; 406 quoted = nextc(&rune); 407 *ep++ = L'\n'; 408 *ep++ = L'\n'; 409 } 410 411 /* parse class into a set of spans */ 412 for(; ep<&r[NCCRUNE];){ 413 if(rune == 0){ 414 rcerror("malformed '[]'"); 415 return 0; 416 } 417 if(!quoted && rune == L']') 418 break; 419 if(!quoted && rune == L'-'){ 420 if(ep == r){ 421 rcerror("malformed '[]'"); 422 return 0; 423 } 424 quoted = nextc(&rune); 425 if((!quoted && rune == L']') || rune == 0){ 426 rcerror("malformed '[]'"); 427 return 0; 428 } 429 *(ep-1) = rune; 430 } else { 431 *ep++ = rune; 432 *ep++ = rune; 433 } 434 quoted = nextc(&rune); 435 } 436 437 /* sort on span start */ 438 for(p = r; p < ep; p += 2){ 439 for(np = p; np < ep; np += 2) 440 if(*np < *p){ 441 rune = np[0]; 442 np[0] = p[0]; 443 p[0] = rune; 444 rune = np[1]; 445 np[1] = p[1]; 446 p[1] = rune; 447 } 448 } 449 450 /* merge spans */ 451 np = yyclassp->spans; 452 p = r; 453 if(r == ep) 454 yyclassp->end = np; 455 else { 456 np[0] = *p++; 457 np[1] = *p++; 458 for(; p < ep; p += 2) 459 if(p[0] <= np[1]){ 460 if(p[1] > np[1]) 461 np[1] = p[1]; 462 } else { 463 np += 2; 464 np[0] = p[0]; 465 np[1] = p[1]; 466 } 467 yyclassp->end = np+2; 468 } 469 470 return type; 471 } 472 473 static Reprog* 474 regcomp1(char *s, int literal, int dot_type) 475 { 476 int token; 477 Reprog *pp; 478 479 /* get memory for the program */ 480 pp = (Reprog *)malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s)); 481 if(pp == 0){ 482 regerror("out of memory"); 483 return 0; 484 } 485 freep = pp->firstinst; 486 classp = pp->class; 487 errors = 0; 488 489 if(setjmp(regkaboom)) 490 goto out; 491 492 /* go compile the sucker */ 493 lexdone = 0; 494 exprp = s; 495 nclass = 0; 496 nbra = 0; 497 atorp = atorstack; 498 andp = andstack; 499 subidp = subidstack; 500 lastwasand = FALSE; 501 cursubid = 0; 502 503 /* Start with a low priority operator to prime parser */ 504 pushator(START-1); 505 while((token = lex(literal, dot_type)) != END){ 506 if((token&0300) == OPERATOR) 507 operator(token); 508 else 509 operand(token); 510 } 511 512 /* Close with a low priority operator */ 513 evaluntil(START); 514 515 /* Force END */ 516 operand(END); 517 evaluntil(START); 518 #ifdef DEBUG 519 dumpstack(); 520 #endif 521 if(nbra) 522 rcerror("unmatched left paren"); 523 --andp; /* points to first and only operand */ 524 pp->startinst = andp->first; 525 #ifdef DEBUG 526 dump(pp); 527 #endif 528 pp = optimize(pp); 529 #ifdef DEBUG 530 print("start: %d\n", andp->first-pp->firstinst); 531 dump(pp); 532 #endif 533 out: 534 if(errors){ 535 free(pp); 536 pp = 0; 537 } 538 return pp; 539 } 540 541 extern Reprog* 542 regcomp(char *s) 543 { 544 return regcomp1(s, 0, ANY); 545 } 546 547 extern Reprog* 548 regcomplit(char *s) 549 { 550 return regcomp1(s, 1, ANY); 551 } 552 553 extern Reprog* 554 regcompnl(char *s) 555 { 556 return regcomp1(s, 0, ANYNL); 557 } 558