1 #include <u.h> 2 #include <libc.h> 3 #include "regexp.h" 4 #include "regcomp.h" 5 6 #define TRUE 1 7 #define FALSE 0 8 9 /* 10 * Parser Information 11 */ 12 typedef 13 struct Node 14 { 15 Reinst* first; 16 Reinst* last; 17 }Node; 18 19 #define NSTACK 20 20 static Node andstack[NSTACK]; 21 static Node *andp; 22 static int atorstack[NSTACK]; 23 static int* atorp; 24 static int cursubid; /* id of current subexpression */ 25 static int subidstack[NSTACK]; /* parallel to atorstack */ 26 static int* subidp; 27 static int lastwasand; /* Last token was operand */ 28 static int nbra; 29 static char* exprp; /* pointer to next character in source expression */ 30 static int lexdone; 31 static int nclass; 32 static Reclass*classp; 33 static Reinst* freep; 34 static int errors; 35 static Rune yyrune; /* last lex'd rune */ 36 static Reclass*yyclassp; /* last lex'd class */ 37 38 /* predeclared crap */ 39 static void operator(int); 40 static void pushand(Reinst*, Reinst*); 41 static void pushator(int); 42 static void evaluntil(int); 43 static int bldcclass(void); 44 45 static jmp_buf regkaboom; 46 47 static void 48 rcerror(char *s) 49 { 50 errors++; 51 regerror(s); 52 longjmp(regkaboom, 1); 53 } 54 55 static Reinst* 56 newinst(int t) 57 { 58 freep->type = t; 59 freep->left = 0; 60 freep->right = 0; 61 return freep++; 62 } 63 64 static void 65 operand(int t) 66 { 67 Reinst *i; 68 69 if(lastwasand) 70 operator(CAT); /* catenate is implicit */ 71 i = newinst(t); 72 73 if(t == CCLASS || t == NCCLASS) 74 i->cp = yyclassp; 75 if(t == RUNE) 76 i->r = yyrune; 77 78 pushand(i, i); 79 lastwasand = TRUE; 80 } 81 82 static void 83 operator(int t) 84 { 85 if(t==RBRA && --nbra<0) 86 rcerror("unmatched right paren"); 87 if(t==LBRA){ 88 if(++cursubid >= NSUBEXP) 89 rcerror ("too many subexpressions"); 90 nbra++; 91 if(lastwasand) 92 operator(CAT); 93 } else 94 evaluntil(t); 95 if(t != RBRA) 96 pushator(t); 97 lastwasand = FALSE; 98 if(t==STAR || t==QUEST || t==PLUS || t==RBRA) 99 lastwasand = TRUE; /* these look like operands */ 100 } 101 102 static void 103 regerr2(char *s, int c) 104 { 105 char buf[100]; 106 char *cp = buf; 107 while(*s) 108 *cp++ = *s++; 109 *cp++ = c; 110 *cp = '\0'; 111 rcerror(buf); 112 } 113 114 static void 115 cant(char *s) 116 { 117 char buf[100]; 118 strcpy(buf, "can't happen: "); 119 strcat(buf, s); 120 rcerror(buf); 121 } 122 123 static void 124 pushand(Reinst *f, Reinst *l) 125 { 126 if(andp >= &andstack[NSTACK]) 127 cant("operand stack overflow"); 128 andp->first = f; 129 andp->last = l; 130 andp++; 131 } 132 133 static void 134 pushator(int t) 135 { 136 if(atorp >= &atorstack[NSTACK]) 137 cant("operator stack overflow"); 138 *atorp++ = t; 139 *subidp++ = cursubid; 140 } 141 142 static Node* 143 popand(int op) 144 { 145 Reinst *inst; 146 147 if(andp <= &andstack[0]){ 148 regerr2("missing operand for ", op); 149 inst = newinst(NOP); 150 pushand(inst,inst); 151 } 152 return --andp; 153 } 154 155 static int 156 popator(void) 157 { 158 if(atorp <= &atorstack[0]) 159 cant("operator stack underflow"); 160 --subidp; 161 return *--atorp; 162 } 163 164 static void 165 evaluntil(int pri) 166 { 167 Node *op1, *op2; 168 Reinst *inst1, *inst2; 169 170 while(pri==RBRA || atorp[-1]>=pri){ 171 switch(popator()){ 172 default: 173 rcerror("unknown operator in evaluntil"); 174 break; 175 case LBRA: /* must have been RBRA */ 176 op1 = popand('('); 177 inst2 = newinst(RBRA); 178 inst2->subid = *subidp; 179 op1->last->next = inst2; 180 inst1 = newinst(LBRA); 181 inst1->subid = *subidp; 182 inst1->next = op1->first; 183 pushand(inst1, inst2); 184 return; 185 case OR: 186 op2 = popand('|'); 187 op1 = popand('|'); 188 inst2 = newinst(NOP); 189 op2->last->next = inst2; 190 op1->last->next = inst2; 191 inst1 = newinst(OR); 192 inst1->right = op1->first; 193 inst1->left = op2->first; 194 pushand(inst1, inst2); 195 break; 196 case CAT: 197 op2 = popand(0); 198 op1 = popand(0); 199 op1->last->next = op2->first; 200 pushand(op1->first, op2->last); 201 break; 202 case STAR: 203 op2 = popand('*'); 204 inst1 = newinst(OR); 205 op2->last->next = inst1; 206 inst1->right = op2->first; 207 pushand(inst1, inst1); 208 break; 209 case PLUS: 210 op2 = popand('+'); 211 inst1 = newinst(OR); 212 op2->last->next = inst1; 213 inst1->right = op2->first; 214 pushand(op2->first, inst1); 215 break; 216 case QUEST: 217 op2 = popand('?'); 218 inst1 = newinst(OR); 219 inst2 = newinst(NOP); 220 inst1->left = inst2; 221 inst1->right = op2->first; 222 op2->last->next = inst2; 223 pushand(inst1, inst2); 224 break; 225 } 226 } 227 } 228 229 static Reprog* 230 optimize(Reprog *pp) 231 { 232 Reinst *inst, *target; 233 int size; 234 Reprog *npp; 235 int diff; 236 237 /* 238 * get rid of NOOP chains 239 */ 240 for(inst=pp->firstinst; inst->type!=END; inst++){ 241 target = inst->next; 242 while(target->type == NOP) 243 target = target->next; 244 inst->next = target; 245 } 246 247 /* 248 * The original allocation is for an area larger than 249 * necessary. Reallocate to the actual space used 250 * and then relocate the code. 251 */ 252 size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst); 253 npp = realloc(pp, size); 254 if(npp==0 || npp==pp) 255 return pp; 256 diff = (char *)npp - (char *)pp; 257 freep = (Reinst *)((char *)freep + diff); 258 for(inst=npp->firstinst; inst<freep; inst++){ 259 switch(inst->type){ 260 case OR: 261 case STAR: 262 case PLUS: 263 case QUEST: 264 case CCLASS: 265 case NCCLASS: 266 *(char **)&inst->right += diff; 267 break; 268 } 269 *(char **)&inst->left += diff; 270 } 271 *(char **)&npp->startinst += diff; 272 return npp; 273 } 274 275 #ifdef DEBUG 276 static void 277 dumpstack(void){ 278 Node *stk; 279 int *ip; 280 281 print("operators\n"); 282 for(ip=atorstack; ip<atorp; ip++) 283 print("0%o\n", *ip); 284 print("operands\n"); 285 for(stk=andstack; stk<andp; stk++) 286 print("0%o\t0%o\n", stk->first->type, stk->last->type); 287 } 288 289 static void 290 dump(Reprog *pp) 291 { 292 Reinst *l; 293 Rune *p; 294 295 l = pp->firstinst; 296 do{ 297 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type, 298 l->left-pp->firstinst, l->right-pp->firstinst); 299 if(l->type == RUNE) 300 print("\t%C\n", l->r); 301 else if(l->type == CCLASS || l->type == NCCLASS){ 302 print("\t["); 303 if(l->type == NCCLASS) 304 print("^"); 305 for(p = l->cp->spans; p < l->cp->end; p += 2) 306 if(p[0] == p[1]) 307 print("%C", p[0]); 308 else 309 print("%C-%C", p[0], p[1]); 310 print("]\n"); 311 } else 312 print("\n"); 313 }while(l++->type); 314 } 315 #endif 316 317 static Reclass* 318 newclass(void) 319 { 320 if(nclass >= NCLASS) 321 regerr2("too many character classes; limit", NCLASS+'0'); 322 return &(classp[nclass++]); 323 } 324 325 static int 326 nextc(Rune *rp) 327 { 328 if(lexdone){ 329 *rp = 0; 330 return 1; 331 } 332 exprp += chartorune(rp, exprp); 333 if(*rp == L'\\'){ 334 exprp += chartorune(rp, exprp); 335 return 1; 336 } 337 if(*rp == 0) 338 lexdone = 1; 339 return 0; 340 } 341 342 static int 343 lex(int literal, int dot_type) 344 { 345 int quoted; 346 347 quoted = nextc(&yyrune); 348 if(literal || quoted){ 349 if(yyrune == 0) 350 return END; 351 return RUNE; 352 } 353 354 switch(yyrune){ 355 case 0: 356 return END; 357 case L'*': 358 return STAR; 359 case L'?': 360 return QUEST; 361 case L'+': 362 return PLUS; 363 case L'|': 364 return OR; 365 case L'.': 366 return dot_type; 367 case L'(': 368 return LBRA; 369 case L')': 370 return RBRA; 371 case L'^': 372 return BOL; 373 case L'$': 374 return EOL; 375 case L'[': 376 return bldcclass(); 377 } 378 return RUNE; 379 } 380 381 static int 382 bldcclass(void) 383 { 384 int type; 385 Rune r[NCCRUNE]; 386 Rune *p, *ep, *np; 387 Rune rune; 388 int quoted; 389 390 /* we have already seen the '[' */ 391 type = CCLASS; 392 yyclassp = newclass(); 393 394 /* look ahead for negation */ 395 /* SPECIAL CASE!!! negated classes don't match \n */ 396 ep = r; 397 quoted = nextc(&rune); 398 if(!quoted && rune == L'^'){ 399 type = NCCLASS; 400 quoted = nextc(&rune); 401 *ep++ = L'\n'; 402 *ep++ = L'\n'; 403 } 404 405 /* parse class into a set of spans */ 406 for(; ep<&r[NCCRUNE];){ 407 if(rune == 0){ 408 rcerror("malformed '[]'"); 409 return 0; 410 } 411 if(!quoted && rune == L']') 412 break; 413 if(!quoted && rune == L'-'){ 414 if(ep == r){ 415 rcerror("malformed '[]'"); 416 return 0; 417 } 418 quoted = nextc(&rune); 419 if((!quoted && rune == L']') || rune == 0){ 420 rcerror("malformed '[]'"); 421 return 0; 422 } 423 *(ep-1) = rune; 424 } else { 425 *ep++ = rune; 426 *ep++ = rune; 427 } 428 quoted = nextc(&rune); 429 } 430 431 /* sort on span start */ 432 for(p = r; p < ep; p += 2){ 433 for(np = p; np < ep; np += 2) 434 if(*np < *p){ 435 rune = np[0]; 436 np[0] = p[0]; 437 p[0] = rune; 438 rune = np[1]; 439 np[1] = p[1]; 440 p[1] = rune; 441 } 442 } 443 444 /* merge spans */ 445 np = yyclassp->spans; 446 p = r; 447 if(r == ep) 448 yyclassp->end = np; 449 else { 450 np[0] = *p++; 451 np[1] = *p++; 452 for(; p < ep; p += 2) 453 if(p[0] <= np[1]){ 454 if(p[1] > np[1]) 455 np[1] = p[1]; 456 } else { 457 np += 2; 458 np[0] = p[0]; 459 np[1] = p[1]; 460 } 461 yyclassp->end = np+2; 462 } 463 464 return type; 465 } 466 467 static Reprog* 468 regcomp1(char *s, int literal, int dot_type) 469 { 470 int token; 471 Reprog *pp; 472 473 /* get memory for the program */ 474 pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s)); 475 if(pp == 0){ 476 regerror("out of memory"); 477 return 0; 478 } 479 freep = pp->firstinst; 480 classp = pp->class; 481 errors = 0; 482 483 if(setjmp(regkaboom)) 484 goto out; 485 486 /* go compile the sucker */ 487 lexdone = 0; 488 exprp = s; 489 nclass = 0; 490 nbra = 0; 491 atorp = atorstack; 492 andp = andstack; 493 subidp = subidstack; 494 lastwasand = FALSE; 495 cursubid = 0; 496 497 /* Start with a low priority operator to prime parser */ 498 pushator(START-1); 499 while((token = lex(literal, dot_type)) != END){ 500 if((token&0300) == OPERATOR) 501 operator(token); 502 else 503 operand(token); 504 } 505 506 /* Close with a low priority operator */ 507 evaluntil(START); 508 509 /* Force END */ 510 operand(END); 511 evaluntil(START); 512 #ifdef DEBUG 513 dumpstack(); 514 #endif 515 if(nbra) 516 rcerror("unmatched left paren"); 517 --andp; /* points to first and only operand */ 518 pp->startinst = andp->first; 519 #ifdef DEBUG 520 dump(pp); 521 #endif 522 pp = optimize(pp); 523 #ifdef DEBUG 524 print("start: %d\n", andp->first-pp->firstinst); 525 dump(pp); 526 #endif 527 out: 528 if(errors){ 529 free(pp); 530 pp = 0; 531 } 532 return pp; 533 } 534 535 extern Reprog* 536 regcomp(char *s) 537 { 538 return regcomp1(s, 0, ANY); 539 } 540 541 extern Reprog* 542 regcomplit(char *s) 543 { 544 return regcomp1(s, 1, ANY); 545 } 546 547 extern Reprog* 548 regcompnl(char *s) 549 { 550 return regcomp1(s, 0, ANYNL); 551 } 552