1 #include <u.h> 2 #include <libc.h> 3 #include "regexp.h" 4 #include "regcomp.h" 5 6 #define TRUE 1 7 #define FALSE 0 8 9 /* 10 * Parser Information 11 */ 12 typedef 13 struct Node 14 { 15 Reinst* first; 16 Reinst* last; 17 }Node; 18 19 /* max character classes per program is nelem(reprog->class) */ 20 static Reprog *reprog; 21 22 /* max rune ranges per character class is nelem(classp->spans)/2 */ 23 #define NCCRUNE nelem(classp->spans) 24 25 #define NSTACK 20 26 static Node andstack[NSTACK]; 27 static Node *andp; 28 static int atorstack[NSTACK]; 29 static int* atorp; 30 static int cursubid; /* id of current subexpression */ 31 static int subidstack[NSTACK]; /* parallel to atorstack */ 32 static int* subidp; 33 static int lastwasand; /* Last token was operand */ 34 static int nbra; 35 static char* exprp; /* pointer to next character in source expression */ 36 static int lexdone; 37 static int nclass; 38 static Reclass*classp; 39 static Reinst* freep; 40 static int errors; 41 static Rune yyrune; /* last lex'd rune */ 42 static Reclass*yyclassp; /* last lex'd class */ 43 44 /* predeclared crap */ 45 static void operator(int); 46 static void pushand(Reinst*, Reinst*); 47 static void pushator(int); 48 static void evaluntil(int); 49 static int bldcclass(void); 50 51 static jmp_buf regkaboom; 52 53 static void 54 rcerror(char *s) 55 { 56 errors++; 57 regerror(s); 58 longjmp(regkaboom, 1); 59 } 60 61 static Reinst* 62 newinst(int t) 63 { 64 freep->type = t; 65 freep->left = 0; 66 freep->right = 0; 67 return freep++; 68 } 69 70 static void 71 operand(int t) 72 { 73 Reinst *i; 74 75 if(lastwasand) 76 operator(CAT); /* catenate is implicit */ 77 i = newinst(t); 78 79 if(t == CCLASS || t == NCCLASS) 80 i->cp = yyclassp; 81 if(t == RUNE) 82 i->r = yyrune; 83 84 pushand(i, i); 85 lastwasand = TRUE; 86 } 87 88 static void 89 operator(int t) 90 { 91 if(t==RBRA && --nbra<0) 92 rcerror("unmatched right paren"); 93 if(t==LBRA){ 94 if(++cursubid >= NSUBEXP) 95 rcerror ("too many subexpressions"); 96 nbra++; 97 if(lastwasand) 98 operator(CAT); 99 } else 100 evaluntil(t); 101 if(t != RBRA) 102 pushator(t); 103 lastwasand = FALSE; 104 if(t==STAR || t==QUEST || t==PLUS || t==RBRA) 105 lastwasand = TRUE; /* these look like operands */ 106 } 107 108 static void 109 regerr2(char *s, int c) 110 { 111 char buf[100]; 112 char *cp = buf; 113 while(*s) 114 *cp++ = *s++; 115 *cp++ = c; 116 *cp = '\0'; 117 rcerror(buf); 118 } 119 120 static void 121 cant(char *s) 122 { 123 char buf[100]; 124 strcpy(buf, "can't happen: "); 125 strcat(buf, s); 126 rcerror(buf); 127 } 128 129 static void 130 pushand(Reinst *f, Reinst *l) 131 { 132 if(andp >= &andstack[NSTACK]) 133 cant("operand stack overflow"); 134 andp->first = f; 135 andp->last = l; 136 andp++; 137 } 138 139 static void 140 pushator(int t) 141 { 142 if(atorp >= &atorstack[NSTACK]) 143 cant("operator stack overflow"); 144 *atorp++ = t; 145 *subidp++ = cursubid; 146 } 147 148 static Node* 149 popand(int op) 150 { 151 Reinst *inst; 152 153 if(andp <= &andstack[0]){ 154 regerr2("missing operand for ", op); 155 inst = newinst(NOP); 156 pushand(inst,inst); 157 } 158 return --andp; 159 } 160 161 static int 162 popator(void) 163 { 164 if(atorp <= &atorstack[0]) 165 cant("operator stack underflow"); 166 --subidp; 167 return *--atorp; 168 } 169 170 static void 171 evaluntil(int pri) 172 { 173 Node *op1, *op2; 174 Reinst *inst1, *inst2; 175 176 while(pri==RBRA || atorp[-1]>=pri){ 177 switch(popator()){ 178 default: 179 rcerror("unknown operator in evaluntil"); 180 break; 181 case LBRA: /* must have been RBRA */ 182 op1 = popand('('); 183 inst2 = newinst(RBRA); 184 inst2->subid = *subidp; 185 op1->last->next = inst2; 186 inst1 = newinst(LBRA); 187 inst1->subid = *subidp; 188 inst1->next = op1->first; 189 pushand(inst1, inst2); 190 return; 191 case OR: 192 op2 = popand('|'); 193 op1 = popand('|'); 194 inst2 = newinst(NOP); 195 op2->last->next = inst2; 196 op1->last->next = inst2; 197 inst1 = newinst(OR); 198 inst1->right = op1->first; 199 inst1->left = op2->first; 200 pushand(inst1, inst2); 201 break; 202 case CAT: 203 op2 = popand(0); 204 op1 = popand(0); 205 op1->last->next = op2->first; 206 pushand(op1->first, op2->last); 207 break; 208 case STAR: 209 op2 = popand('*'); 210 inst1 = newinst(OR); 211 op2->last->next = inst1; 212 inst1->right = op2->first; 213 pushand(inst1, inst1); 214 break; 215 case PLUS: 216 op2 = popand('+'); 217 inst1 = newinst(OR); 218 op2->last->next = inst1; 219 inst1->right = op2->first; 220 pushand(op2->first, inst1); 221 break; 222 case QUEST: 223 op2 = popand('?'); 224 inst1 = newinst(OR); 225 inst2 = newinst(NOP); 226 inst1->left = inst2; 227 inst1->right = op2->first; 228 op2->last->next = inst2; 229 pushand(inst1, inst2); 230 break; 231 } 232 } 233 } 234 235 static Reprog* 236 optimize(Reprog *pp) 237 { 238 Reinst *inst, *target; 239 int size; 240 Reprog *npp; 241 Reclass *cl; 242 int diff; 243 244 /* 245 * get rid of NOOP chains 246 */ 247 for(inst=pp->firstinst; inst->type!=END; inst++){ 248 target = inst->next; 249 while(target->type == NOP) 250 target = target->next; 251 inst->next = target; 252 } 253 254 /* 255 * The original allocation is for an area larger than 256 * necessary. Reallocate to the actual space used 257 * and then relocate the code. 258 */ 259 size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst); 260 npp = realloc(pp, size); 261 if(npp==0 || npp==pp) 262 return pp; 263 diff = (char *)npp - (char *)pp; 264 freep = (Reinst *)((char *)freep + diff); 265 for(inst=npp->firstinst; inst<freep; inst++){ 266 switch(inst->type){ 267 case OR: 268 case STAR: 269 case PLUS: 270 case QUEST: 271 *(char **)&inst->right += diff; 272 break; 273 case CCLASS: 274 case NCCLASS: 275 *(char **)&inst->right += diff; 276 cl = inst->cp; 277 *(char **)&cl->end += diff; 278 break; 279 } 280 *(char **)&inst->left += diff; 281 } 282 *(char **)&npp->startinst += diff; 283 return npp; 284 } 285 286 #ifdef DEBUG 287 static void 288 dumpstack(void){ 289 Node *stk; 290 int *ip; 291 292 print("operators\n"); 293 for(ip=atorstack; ip<atorp; ip++) 294 print("0%o\n", *ip); 295 print("operands\n"); 296 for(stk=andstack; stk<andp; stk++) 297 print("0%o\t0%o\n", stk->first->type, stk->last->type); 298 } 299 300 static void 301 dump(Reprog *pp) 302 { 303 Reinst *l; 304 Rune *p; 305 306 l = pp->firstinst; 307 do{ 308 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type, 309 l->left-pp->firstinst, l->right-pp->firstinst); 310 if(l->type == RUNE) 311 print("\t%C\n", l->r); 312 else if(l->type == CCLASS || l->type == NCCLASS){ 313 print("\t["); 314 if(l->type == NCCLASS) 315 print("^"); 316 for(p = l->cp->spans; p < l->cp->end; p += 2) 317 if(p[0] == p[1]) 318 print("%C", p[0]); 319 else 320 print("%C-%C", p[0], p[1]); 321 print("]\n"); 322 } else 323 print("\n"); 324 }while(l++->type); 325 } 326 #endif 327 328 static Reclass* 329 newclass(void) 330 { 331 if(nclass >= nelem(reprog->class)) 332 rcerror("too many character classes; increase Reprog.class size"); 333 return &(classp[nclass++]); 334 } 335 336 static int 337 nextc(Rune *rp) 338 { 339 if(lexdone){ 340 *rp = 0; 341 return 1; 342 } 343 exprp += chartorune(rp, exprp); 344 if(*rp == L'\\'){ 345 exprp += chartorune(rp, exprp); 346 return 1; 347 } 348 if(*rp == 0) 349 lexdone = 1; 350 return 0; 351 } 352 353 static int 354 lex(int literal, int dot_type) 355 { 356 int quoted; 357 358 quoted = nextc(&yyrune); 359 if(literal || quoted){ 360 if(yyrune == 0) 361 return END; 362 return RUNE; 363 } 364 365 switch(yyrune){ 366 case 0: 367 return END; 368 case L'*': 369 return STAR; 370 case L'?': 371 return QUEST; 372 case L'+': 373 return PLUS; 374 case L'|': 375 return OR; 376 case L'.': 377 return dot_type; 378 case L'(': 379 return LBRA; 380 case L')': 381 return RBRA; 382 case L'^': 383 return BOL; 384 case L'$': 385 return EOL; 386 case L'[': 387 return bldcclass(); 388 } 389 return RUNE; 390 } 391 392 static int 393 bldcclass(void) 394 { 395 int type; 396 Rune r[NCCRUNE]; 397 Rune *p, *ep, *np; 398 Rune rune; 399 int quoted; 400 401 /* we have already seen the '[' */ 402 type = CCLASS; 403 yyclassp = newclass(); 404 405 /* look ahead for negation */ 406 /* SPECIAL CASE!!! negated classes don't match \n */ 407 ep = r; 408 quoted = nextc(&rune); 409 if(!quoted && rune == L'^'){ 410 type = NCCLASS; 411 quoted = nextc(&rune); 412 *ep++ = L'\n'; 413 *ep++ = L'\n'; 414 } 415 416 /* parse class into a set of spans */ 417 while(ep < &r[NCCRUNE-1]){ 418 if(rune == 0){ 419 rcerror("malformed '[]'"); 420 return 0; 421 } 422 if(!quoted && rune == L']') 423 break; 424 if(!quoted && rune == L'-'){ 425 if(ep == r){ 426 rcerror("malformed '[]'"); 427 return 0; 428 } 429 quoted = nextc(&rune); 430 if((!quoted && rune == L']') || rune == 0){ 431 rcerror("malformed '[]'"); 432 return 0; 433 } 434 *(ep-1) = rune; 435 } else { 436 *ep++ = rune; 437 *ep++ = rune; 438 } 439 quoted = nextc(&rune); 440 } 441 if(ep >= &r[NCCRUNE-1]) { 442 rcerror("char class too large; increase Reclass.spans size"); 443 return 0; 444 } 445 446 /* sort on span start */ 447 for(p = r; p < ep; p += 2){ 448 for(np = p; np < ep; np += 2) 449 if(*np < *p){ 450 rune = np[0]; 451 np[0] = p[0]; 452 p[0] = rune; 453 rune = np[1]; 454 np[1] = p[1]; 455 p[1] = rune; 456 } 457 } 458 459 /* merge spans */ 460 np = yyclassp->spans; 461 p = r; 462 if(r == ep) 463 yyclassp->end = np; 464 else { 465 np[0] = *p++; 466 np[1] = *p++; 467 for(; p < ep; p += 2) 468 /* overlapping or adjacent ranges? */ 469 if(p[0] <= np[1] + 1){ 470 if(p[1] >= np[1]) 471 np[1] = p[1]; /* coalesce */ 472 } else { 473 np += 2; 474 np[0] = p[0]; 475 np[1] = p[1]; 476 } 477 yyclassp->end = np+2; 478 } 479 480 return type; 481 } 482 483 static Reprog* 484 regcomp1(char *s, int literal, int dot_type) 485 { 486 int token; 487 Reprog *pp; 488 489 /* get memory for the program */ 490 pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s)); 491 if(pp == 0){ 492 regerror("out of memory"); 493 return 0; 494 } 495 freep = pp->firstinst; 496 classp = pp->class; 497 errors = 0; 498 499 if(setjmp(regkaboom)) 500 goto out; 501 502 /* go compile the sucker */ 503 lexdone = 0; 504 exprp = s; 505 nclass = 0; 506 nbra = 0; 507 atorp = atorstack; 508 andp = andstack; 509 subidp = subidstack; 510 lastwasand = FALSE; 511 cursubid = 0; 512 513 /* Start with a low priority operator to prime parser */ 514 pushator(START-1); 515 while((token = lex(literal, dot_type)) != END){ 516 if((token&0300) == OPERATOR) 517 operator(token); 518 else 519 operand(token); 520 } 521 522 /* Close with a low priority operator */ 523 evaluntil(START); 524 525 /* Force END */ 526 operand(END); 527 evaluntil(START); 528 #ifdef DEBUG 529 dumpstack(); 530 #endif 531 if(nbra) 532 rcerror("unmatched left paren"); 533 --andp; /* points to first and only operand */ 534 pp->startinst = andp->first; 535 #ifdef DEBUG 536 dump(pp); 537 #endif 538 pp = optimize(pp); 539 #ifdef DEBUG 540 print("start: %d\n", andp->first-pp->firstinst); 541 dump(pp); 542 #endif 543 out: 544 if(errors){ 545 free(pp); 546 pp = 0; 547 } 548 return pp; 549 } 550 551 extern Reprog* 552 regcomp(char *s) 553 { 554 return regcomp1(s, 0, ANY); 555 } 556 557 extern Reprog* 558 regcomplit(char *s) 559 { 560 return regcomp1(s, 1, ANY); 561 } 562 563 extern Reprog* 564 regcompnl(char *s) 565 { 566 return regcomp1(s, 0, ANYNL); 567 } 568