1 #include <u.h> 2 #include <libc.h> 3 #include "regexp.h" 4 #include "regcomp.h" 5 6 #define TRUE 1 7 #define FALSE 0 8 9 /* 10 * Parser Information 11 */ 12 typedef 13 struct Node 14 { 15 Reinst* first; 16 Reinst* last; 17 }Node; 18 19 #define NSTACK 20 20 static Node andstack[NSTACK]; 21 static Node *andp; 22 static int atorstack[NSTACK]; 23 static int* atorp; 24 static int cursubid; /* id of current subexpression */ 25 static int subidstack[NSTACK]; /* parallel to atorstack */ 26 static int* subidp; 27 static int lastwasand; /* Last token was operand */ 28 static int nbra; 29 static char* exprp; /* pointer to next character in source expression */ 30 static int lexdone; 31 static int nclass; 32 static Reclass*classp; 33 static Reinst* freep; 34 static int errors; 35 static Rune yyrune; /* last lex'd rune */ 36 static Reclass*yyclassp; /* last lex'd class */ 37 38 /* predeclared crap */ 39 static void operator(int); 40 static void pushand(Reinst*, Reinst*); 41 static void pushator(int); 42 static void evaluntil(int); 43 static int bldcclass(void); 44 45 static jmp_buf regkaboom; 46 47 static void 48 rcerror(char *s) 49 { 50 errors++; 51 regerror(s); 52 longjmp(regkaboom, 1); 53 } 54 55 static Reinst* 56 newinst(int t) 57 { 58 freep->type = t; 59 freep->left = 0; 60 freep->right = 0; 61 return freep++; 62 } 63 64 static void 65 operand(int t) 66 { 67 Reinst *i; 68 69 if(lastwasand) 70 operator(CAT); /* catenate is implicit */ 71 i = newinst(t); 72 73 if(t == CCLASS || t == NCCLASS) 74 i->cp = yyclassp; 75 if(t == RUNE) 76 i->r = yyrune; 77 78 pushand(i, i); 79 lastwasand = TRUE; 80 } 81 82 static void 83 operator(int t) 84 { 85 if(t==RBRA && --nbra<0) 86 rcerror("unmatched right paren"); 87 if(t==LBRA){ 88 if(++cursubid >= NSUBEXP) 89 rcerror ("too many subexpressions"); 90 nbra++; 91 if(lastwasand) 92 operator(CAT); 93 } else 94 evaluntil(t); 95 if(t != RBRA) 96 pushator(t); 97 lastwasand = FALSE; 98 if(t==STAR || t==QUEST || t==PLUS || t==RBRA) 99 lastwasand = TRUE; /* these look like operands */ 100 } 101 102 static void 103 regerr2(char *s, int c) 104 { 105 char buf[100]; 106 char *cp = buf; 107 while(*s) 108 *cp++ = *s++; 109 *cp++ = c; 110 *cp = '\0'; 111 rcerror(buf); 112 } 113 114 static void 115 cant(char *s) 116 { 117 char buf[100]; 118 strcpy(buf, "can't happen: "); 119 strcat(buf, s); 120 rcerror(buf); 121 } 122 123 static void 124 pushand(Reinst *f, Reinst *l) 125 { 126 if(andp >= &andstack[NSTACK]) 127 cant("operand stack overflow"); 128 andp->first = f; 129 andp->last = l; 130 andp++; 131 } 132 133 static void 134 pushator(int t) 135 { 136 if(atorp >= &atorstack[NSTACK]) 137 cant("operator stack overflow"); 138 *atorp++ = t; 139 *subidp++ = cursubid; 140 } 141 142 static Node* 143 popand(int op) 144 { 145 Reinst *inst; 146 147 if(andp <= &andstack[0]){ 148 regerr2("missing operand for ", op); 149 inst = newinst(NOP); 150 pushand(inst,inst); 151 } 152 return --andp; 153 } 154 155 static int 156 popator(void) 157 { 158 if(atorp <= &atorstack[0]) 159 cant("operator stack underflow"); 160 --subidp; 161 return *--atorp; 162 } 163 164 static void 165 evaluntil(int pri) 166 { 167 Node *op1, *op2; 168 Reinst *inst1, *inst2; 169 170 while(pri==RBRA || atorp[-1]>=pri){ 171 switch(popator()){ 172 default: 173 rcerror("unknown operator in evaluntil"); 174 break; 175 case LBRA: /* must have been RBRA */ 176 op1 = popand('('); 177 inst2 = newinst(RBRA); 178 inst2->subid = *subidp; 179 op1->last->next = inst2; 180 inst1 = newinst(LBRA); 181 inst1->subid = *subidp; 182 inst1->next = op1->first; 183 pushand(inst1, inst2); 184 return; 185 case OR: 186 op2 = popand('|'); 187 op1 = popand('|'); 188 inst2 = newinst(NOP); 189 op2->last->next = inst2; 190 op1->last->next = inst2; 191 inst1 = newinst(OR); 192 inst1->right = op1->first; 193 inst1->left = op2->first; 194 pushand(inst1, inst2); 195 break; 196 case CAT: 197 op2 = popand(0); 198 op1 = popand(0); 199 op1->last->next = op2->first; 200 pushand(op1->first, op2->last); 201 break; 202 case STAR: 203 op2 = popand('*'); 204 inst1 = newinst(OR); 205 op2->last->next = inst1; 206 inst1->right = op2->first; 207 pushand(inst1, inst1); 208 break; 209 case PLUS: 210 op2 = popand('+'); 211 inst1 = newinst(OR); 212 op2->last->next = inst1; 213 inst1->right = op2->first; 214 pushand(op2->first, inst1); 215 break; 216 case QUEST: 217 op2 = popand('?'); 218 inst1 = newinst(OR); 219 inst2 = newinst(NOP); 220 inst1->left = inst2; 221 inst1->right = op2->first; 222 op2->last->next = inst2; 223 pushand(inst1, inst2); 224 break; 225 } 226 } 227 } 228 229 static Reprog* 230 optimize(Reprog *pp) 231 { 232 Reinst *inst, *target; 233 int size; 234 Reprog *npp; 235 Reclass *cl; 236 int diff; 237 238 /* 239 * get rid of NOOP chains 240 */ 241 for(inst=pp->firstinst; inst->type!=END; inst++){ 242 target = inst->next; 243 while(target->type == NOP) 244 target = target->next; 245 inst->next = target; 246 } 247 248 /* 249 * The original allocation is for an area larger than 250 * necessary. Reallocate to the actual space used 251 * and then relocate the code. 252 */ 253 size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst); 254 npp = realloc(pp, size); 255 if(npp==0 || npp==pp) 256 return pp; 257 diff = (char *)npp - (char *)pp; 258 freep = (Reinst *)((char *)freep + diff); 259 for(inst=npp->firstinst; inst<freep; inst++){ 260 switch(inst->type){ 261 case OR: 262 case STAR: 263 case PLUS: 264 case QUEST: 265 *(char **)&inst->right += diff; 266 break; 267 case CCLASS: 268 case NCCLASS: 269 *(char **)&inst->right += diff; 270 cl = inst->cp; 271 *(char **)&cl->end += diff; 272 break; 273 } 274 *(char **)&inst->left += diff; 275 } 276 *(char **)&npp->startinst += diff; 277 return npp; 278 } 279 280 #ifdef DEBUG 281 static void 282 dumpstack(void){ 283 Node *stk; 284 int *ip; 285 286 print("operators\n"); 287 for(ip=atorstack; ip<atorp; ip++) 288 print("0%o\n", *ip); 289 print("operands\n"); 290 for(stk=andstack; stk<andp; stk++) 291 print("0%o\t0%o\n", stk->first->type, stk->last->type); 292 } 293 294 static void 295 dump(Reprog *pp) 296 { 297 Reinst *l; 298 Rune *p; 299 300 l = pp->firstinst; 301 do{ 302 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type, 303 l->left-pp->firstinst, l->right-pp->firstinst); 304 if(l->type == RUNE) 305 print("\t%C\n", l->r); 306 else if(l->type == CCLASS || l->type == NCCLASS){ 307 print("\t["); 308 if(l->type == NCCLASS) 309 print("^"); 310 for(p = l->cp->spans; p < l->cp->end; p += 2) 311 if(p[0] == p[1]) 312 print("%C", p[0]); 313 else 314 print("%C-%C", p[0], p[1]); 315 print("]\n"); 316 } else 317 print("\n"); 318 }while(l++->type); 319 } 320 #endif 321 322 static Reclass* 323 newclass(void) 324 { 325 if(nclass >= NCLASS) 326 regerr2("too many character classes; limit", NCLASS+'0'); 327 return &(classp[nclass++]); 328 } 329 330 static int 331 nextc(Rune *rp) 332 { 333 if(lexdone){ 334 *rp = 0; 335 return 1; 336 } 337 exprp += chartorune(rp, exprp); 338 if(*rp == L'\\'){ 339 exprp += chartorune(rp, exprp); 340 return 1; 341 } 342 if(*rp == 0) 343 lexdone = 1; 344 return 0; 345 } 346 347 static int 348 lex(int literal, int dot_type) 349 { 350 int quoted; 351 352 quoted = nextc(&yyrune); 353 if(literal || quoted){ 354 if(yyrune == 0) 355 return END; 356 return RUNE; 357 } 358 359 switch(yyrune){ 360 case 0: 361 return END; 362 case L'*': 363 return STAR; 364 case L'?': 365 return QUEST; 366 case L'+': 367 return PLUS; 368 case L'|': 369 return OR; 370 case L'.': 371 return dot_type; 372 case L'(': 373 return LBRA; 374 case L')': 375 return RBRA; 376 case L'^': 377 return BOL; 378 case L'$': 379 return EOL; 380 case L'[': 381 return bldcclass(); 382 } 383 return RUNE; 384 } 385 386 static int 387 bldcclass(void) 388 { 389 int type; 390 Rune r[NCCRUNE]; 391 Rune *p, *ep, *np; 392 Rune rune; 393 int quoted; 394 395 /* we have already seen the '[' */ 396 type = CCLASS; 397 yyclassp = newclass(); 398 399 /* look ahead for negation */ 400 /* SPECIAL CASE!!! negated classes don't match \n */ 401 ep = r; 402 quoted = nextc(&rune); 403 if(!quoted && rune == L'^'){ 404 type = NCCLASS; 405 quoted = nextc(&rune); 406 *ep++ = L'\n'; 407 *ep++ = L'\n'; 408 } 409 410 /* parse class into a set of spans */ 411 for(; ep<&r[NCCRUNE];){ 412 if(rune == 0){ 413 rcerror("malformed '[]'"); 414 return 0; 415 } 416 if(!quoted && rune == L']') 417 break; 418 if(!quoted && rune == L'-'){ 419 if(ep == r){ 420 rcerror("malformed '[]'"); 421 return 0; 422 } 423 quoted = nextc(&rune); 424 if((!quoted && rune == L']') || rune == 0){ 425 rcerror("malformed '[]'"); 426 return 0; 427 } 428 *(ep-1) = rune; 429 } else { 430 *ep++ = rune; 431 *ep++ = rune; 432 } 433 quoted = nextc(&rune); 434 } 435 436 /* sort on span start */ 437 for(p = r; p < ep; p += 2){ 438 for(np = p; np < ep; np += 2) 439 if(*np < *p){ 440 rune = np[0]; 441 np[0] = p[0]; 442 p[0] = rune; 443 rune = np[1]; 444 np[1] = p[1]; 445 p[1] = rune; 446 } 447 } 448 449 /* merge spans */ 450 np = yyclassp->spans; 451 p = r; 452 if(r == ep) 453 yyclassp->end = np; 454 else { 455 np[0] = *p++; 456 np[1] = *p++; 457 for(; p < ep; p += 2) 458 if(p[0] <= np[1]){ 459 if(p[1] > np[1]) 460 np[1] = p[1]; 461 } else { 462 np += 2; 463 np[0] = p[0]; 464 np[1] = p[1]; 465 } 466 yyclassp->end = np+2; 467 } 468 469 return type; 470 } 471 472 static Reprog* 473 regcomp1(char *s, int literal, int dot_type) 474 { 475 int token; 476 Reprog *pp; 477 478 /* get memory for the program */ 479 pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s)); 480 if(pp == 0){ 481 regerror("out of memory"); 482 return 0; 483 } 484 freep = pp->firstinst; 485 classp = pp->class; 486 errors = 0; 487 488 if(setjmp(regkaboom)) 489 goto out; 490 491 /* go compile the sucker */ 492 lexdone = 0; 493 exprp = s; 494 nclass = 0; 495 nbra = 0; 496 atorp = atorstack; 497 andp = andstack; 498 subidp = subidstack; 499 lastwasand = FALSE; 500 cursubid = 0; 501 502 /* Start with a low priority operator to prime parser */ 503 pushator(START-1); 504 while((token = lex(literal, dot_type)) != END){ 505 if((token&0300) == OPERATOR) 506 operator(token); 507 else 508 operand(token); 509 } 510 511 /* Close with a low priority operator */ 512 evaluntil(START); 513 514 /* Force END */ 515 operand(END); 516 evaluntil(START); 517 #ifdef DEBUG 518 dumpstack(); 519 #endif 520 if(nbra) 521 rcerror("unmatched left paren"); 522 --andp; /* points to first and only operand */ 523 pp->startinst = andp->first; 524 #ifdef DEBUG 525 dump(pp); 526 #endif 527 pp = optimize(pp); 528 #ifdef DEBUG 529 print("start: %d\n", andp->first-pp->firstinst); 530 dump(pp); 531 #endif 532 out: 533 if(errors){ 534 free(pp); 535 pp = 0; 536 } 537 return pp; 538 } 539 540 extern Reprog* 541 regcomp(char *s) 542 { 543 return regcomp1(s, 0, ANY); 544 } 545 546 extern Reprog* 547 regcomplit(char *s) 548 { 549 return regcomp1(s, 1, ANY); 550 } 551 552 extern Reprog* 553 regcompnl(char *s) 554 { 555 return regcomp1(s, 0, ANYNL); 556 } 557