1 #include <stdlib.h> 2 #include <stdio.h> 3 #include <setjmp.h> 4 #include <string.h> 5 #include "regexp.h" 6 #include "regcomp.h" 7 8 #define TRUE 1 9 #define FALSE 0 10 11 /* 12 * Parser Information 13 */ 14 typedef 15 struct Node 16 { 17 Reinst* first; 18 Reinst* last; 19 }Node; 20 21 #define NSTACK 20 22 static Node andstack[NSTACK]; 23 static Node *andp; 24 static int atorstack[NSTACK]; 25 static int* atorp; 26 static int cursubid; /* id of current subexpression */ 27 static int subidstack[NSTACK]; /* parallel to atorstack */ 28 static int* subidp; 29 static int lastwasand; /* Last token was operand */ 30 static int nbra; 31 static char* exprp; /* pointer to next character in source expression */ 32 static int lexdone; 33 static int nclass; 34 static Reclass*classp; 35 static Reinst* freep; 36 static int errors; 37 static wchar_t yyrune; /* last lex'd rune */ 38 static Reclass*yyclassp; /* last lex'd class */ 39 40 /* predeclared crap */ 41 static void operator(int); 42 static void pushand(Reinst*, Reinst*); 43 static void pushator(int); 44 static void evaluntil(int); 45 static int bldcclass(void); 46 47 static jmp_buf regkaboom; 48 49 static void 50 rcerror(char *s) 51 { 52 errors++; 53 regerror(s); 54 longjmp(regkaboom, 1); 55 } 56 57 static Reinst* 58 newinst(int t) 59 { 60 freep->type = t; 61 freep->l.left = 0; 62 freep->r.right = 0; 63 return freep++; 64 } 65 66 static void 67 operand(int t) 68 { 69 Reinst *i; 70 71 if(lastwasand) 72 operator(CAT); /* catenate is implicit */ 73 i = newinst(t); 74 75 if(t == CCLASS || t == NCCLASS) 76 i->r.cp = yyclassp; 77 if(t == RUNE) 78 i->r.r = yyrune; 79 80 pushand(i, i); 81 lastwasand = TRUE; 82 } 83 84 static void 85 operator(int t) 86 { 87 if(t==RBRA && --nbra<0) 88 rcerror("unmatched right paren"); 89 if(t==LBRA){ 90 if(++cursubid >= NSUBEXP) 91 rcerror ("too many subexpressions"); 92 nbra++; 93 if(lastwasand) 94 operator(CAT); 95 } else 96 evaluntil(t); 97 if(t != RBRA) 98 pushator(t); 99 lastwasand = FALSE; 100 if(t==STAR || t==QUEST || t==PLUS || t==RBRA) 101 lastwasand = TRUE; /* these look like operands */ 102 } 103 104 static void 105 regerr2(char *s, int c) 106 { 107 char buf[100]; 108 char *cp = buf; 109 while(*s) 110 *cp++ = *s++; 111 *cp++ = c; 112 *cp = '\0'; 113 rcerror(buf); 114 } 115 116 static void 117 cant(char *s) 118 { 119 char buf[100]; 120 strcpy(buf, "can't happen: "); 121 strcat(buf, s); 122 rcerror(buf); 123 } 124 125 static void 126 pushand(Reinst *f, Reinst *l) 127 { 128 if(andp >= &andstack[NSTACK]) 129 cant("operand stack overflow"); 130 andp->first = f; 131 andp->last = l; 132 andp++; 133 } 134 135 static void 136 pushator(int t) 137 { 138 if(atorp >= &atorstack[NSTACK]) 139 cant("operator stack overflow"); 140 *atorp++ = t; 141 *subidp++ = cursubid; 142 } 143 144 static Node* 145 popand(int op) 146 { 147 Reinst *inst; 148 149 if(andp <= &andstack[0]){ 150 regerr2("missing operand for ", op); 151 inst = newinst(NOP); 152 pushand(inst,inst); 153 } 154 return --andp; 155 } 156 157 static int 158 popator(void) 159 { 160 if(atorp <= &atorstack[0]) 161 cant("operator stack underflow"); 162 --subidp; 163 return *--atorp; 164 } 165 166 static void 167 evaluntil(int pri) 168 { 169 Node *op1, *op2; 170 Reinst *inst1, *inst2; 171 172 while(pri==RBRA || atorp[-1]>=pri){ 173 switch(popator()){ 174 default: 175 rcerror("unknown operator in evaluntil"); 176 break; 177 case LBRA: /* must have been RBRA */ 178 op1 = popand('('); 179 inst2 = newinst(RBRA); 180 inst2->r.subid = *subidp; 181 op1->last->l.next = inst2; 182 inst1 = newinst(LBRA); 183 inst1->r.subid = *subidp; 184 inst1->l.next = op1->first; 185 pushand(inst1, inst2); 186 return; 187 case OR: 188 op2 = popand('|'); 189 op1 = popand('|'); 190 inst2 = newinst(NOP); 191 op2->last->l.next = inst2; 192 op1->last->l.next = inst2; 193 inst1 = newinst(OR); 194 inst1->r.right = op1->first; 195 inst1->l.left = op2->first; 196 pushand(inst1, inst2); 197 break; 198 case CAT: 199 op2 = popand(0); 200 op1 = popand(0); 201 op1->last->l.next = op2->first; 202 pushand(op1->first, op2->last); 203 break; 204 case STAR: 205 op2 = popand('*'); 206 inst1 = newinst(OR); 207 op2->last->l.next = inst1; 208 inst1->r.right = op2->first; 209 pushand(inst1, inst1); 210 break; 211 case PLUS: 212 op2 = popand('+'); 213 inst1 = newinst(OR); 214 op2->last->l.next = inst1; 215 inst1->r.right = op2->first; 216 pushand(op2->first, inst1); 217 break; 218 case QUEST: 219 op2 = popand('?'); 220 inst1 = newinst(OR); 221 inst2 = newinst(NOP); 222 inst1->l.left = inst2; 223 inst1->r.right = op2->first; 224 op2->last->l.next = inst2; 225 pushand(inst1, inst2); 226 break; 227 } 228 } 229 } 230 231 static Reprog* 232 optimize(Reprog *pp) 233 { 234 Reinst *inst, *target; 235 int size; 236 Reprog *npp; 237 int diff; 238 239 /* 240 * get rid of NOOP chains 241 */ 242 for(inst=pp->firstinst; inst->type!=END; inst++){ 243 target = inst->l.next; 244 while(target->type == NOP) 245 target = target->l.next; 246 inst->l.next = target; 247 } 248 249 /* 250 * The original allocation is for an area larger than 251 * necessary. Reallocate to the actual space used 252 * and then relocate the code. 253 */ 254 size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst); 255 npp = realloc(pp, size); 256 if(npp==0 || npp==pp) 257 return pp; 258 diff = (char *)npp - (char *)pp; 259 freep = (Reinst *)((char *)freep + diff); 260 for(inst=npp->firstinst; inst<freep; inst++){ 261 switch(inst->type){ 262 case OR: 263 case STAR: 264 case PLUS: 265 case QUEST: 266 case CCLASS: 267 case NCCLASS: 268 *(char **)&inst->r.right += diff; 269 break; 270 } 271 *(char **)&inst->l.left += diff; 272 } 273 *(char **)&npp->startinst += diff; 274 return npp; 275 } 276 277 #ifdef DEBUG 278 static void 279 dumpstack(void){ 280 Node *stk; 281 int *ip; 282 283 print("operators\n"); 284 for(ip=atorstack; ip<atorp; ip++) 285 print("0%o\n", *ip); 286 print("operands\n"); 287 for(stk=andstack; stk<andp; stk++) 288 print("0%o\t0%o\n", stk->first->type, stk->last->type); 289 } 290 291 static void 292 dump(Reprog *pp) 293 { 294 Reinst *l; 295 wchar_t *p; 296 297 l = pp->firstinst; 298 do{ 299 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type, 300 l->l.left-pp->firstinst, l->l.right-pp->firstinst); 301 if(l->type == RUNE) 302 print("\t%C\n", l->r); 303 else if(l->type == CCLASS || l->type == NCCLASS){ 304 print("\t["); 305 if(l->type == NCCLASS) 306 print("^"); 307 for(p = l->r.cp->spans; p < l->r.cp->end; p += 2) 308 if(p[0] == p[1]) 309 print("%C", p[0]); 310 else 311 print("%C-%C", p[0], p[1]); 312 print("]\n"); 313 } else 314 print("\n"); 315 }while(l++->type); 316 } 317 #endif 318 319 static Reclass* 320 newclass(void) 321 { 322 if(nclass >= NCLASS) 323 regerr2("too many character classes; limit", NCLASS+'0'); 324 return &(classp[nclass++]); 325 } 326 327 static int 328 nextc(wchar_t *rp) 329 { 330 int n; 331 332 if(lexdone){ 333 *rp = 0; 334 return 1; 335 } 336 n = mbtowc(rp, exprp, MB_CUR_MAX); 337 if (n <= 0) 338 n = 1; 339 exprp += n; 340 if(*rp == L'\\'){ 341 n = mbtowc(rp, exprp, MB_CUR_MAX); 342 if (n <= 0) 343 n = 1; 344 exprp += n; 345 return 1; 346 } 347 if(*rp == 0) 348 lexdone = 1; 349 return 0; 350 } 351 352 static int 353 lex(int literal, int dot_type) 354 { 355 int quoted; 356 357 quoted = nextc(&yyrune); 358 if(literal || quoted){ 359 if(yyrune == 0) 360 return END; 361 return RUNE; 362 } 363 364 switch(yyrune){ 365 case 0: 366 return END; 367 case L'*': 368 return STAR; 369 case L'?': 370 return QUEST; 371 case L'+': 372 return PLUS; 373 case L'|': 374 return OR; 375 case L'.': 376 return dot_type; 377 case L'(': 378 return LBRA; 379 case L')': 380 return RBRA; 381 case L'^': 382 return BOL; 383 case L'$': 384 return EOL; 385 case L'[': 386 return bldcclass(); 387 } 388 return RUNE; 389 } 390 391 static int 392 bldcclass(void) 393 { 394 int type; 395 wchar_t r[NCCRUNE]; 396 wchar_t *p, *ep, *np; 397 wchar_t rune; 398 int quoted; 399 400 /* we have already seen the '[' */ 401 type = CCLASS; 402 yyclassp = newclass(); 403 404 /* look ahead for negation */ 405 ep = r; 406 quoted = nextc(&rune); 407 if(!quoted && rune == L'^'){ 408 type = NCCLASS; 409 quoted = nextc(&rune); 410 *ep++ = L'\n'; 411 *ep++ = L'\n'; 412 } 413 414 /* parse class into a set of spans */ 415 for(; ep<&r[NCCRUNE];){ 416 if(rune == 0){ 417 rcerror("malformed '[]'"); 418 return 0; 419 } 420 if(!quoted && rune == L']') 421 break; 422 if(!quoted && rune == L'-'){ 423 if(ep == r){ 424 rcerror("malformed '[]'"); 425 return 0; 426 } 427 quoted = nextc(&rune); 428 if((!quoted && rune == L']') || rune == 0){ 429 rcerror("malformed '[]'"); 430 return 0; 431 } 432 *(ep-1) = rune; 433 } else { 434 *ep++ = rune; 435 *ep++ = rune; 436 } 437 quoted = nextc(&rune); 438 } 439 440 /* sort on span start */ 441 for(p = r; p < ep; p += 2){ 442 for(np = p; np < ep; np += 2) 443 if(*np < *p){ 444 rune = np[0]; 445 np[0] = p[0]; 446 p[0] = rune; 447 rune = np[1]; 448 np[1] = p[1]; 449 p[1] = rune; 450 } 451 } 452 453 /* merge spans */ 454 np = yyclassp->spans; 455 p = r; 456 if(r == ep) 457 yyclassp->end = np; 458 else { 459 np[0] = *p++; 460 np[1] = *p++; 461 for(; p < ep; p += 2) 462 if(p[0] <= np[1]){ 463 if(p[1] > np[1]) 464 np[1] = p[1]; 465 } else { 466 np += 2; 467 np[0] = p[0]; 468 np[1] = p[1]; 469 } 470 yyclassp->end = np+2; 471 } 472 473 return type; 474 } 475 476 static Reprog* 477 regcomp1(char *s, int literal, int dot_type) 478 { 479 int token; 480 Reprog *pp; 481 482 /* get memory for the program */ 483 pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s)); 484 if(pp == 0){ 485 regerror("out of memory"); 486 return 0; 487 } 488 freep = pp->firstinst; 489 classp = pp->class; 490 errors = 0; 491 492 if(setjmp(regkaboom)) 493 goto out; 494 495 /* go compile the sucker */ 496 lexdone = 0; 497 exprp = s; 498 nclass = 0; 499 nbra = 0; 500 atorp = atorstack; 501 andp = andstack; 502 subidp = subidstack; 503 lastwasand = FALSE; 504 cursubid = 0; 505 506 /* Start with a low priority operator to prime parser */ 507 pushator(START-1); 508 while((token = lex(literal, dot_type)) != END){ 509 if((token&0300) == OPERATOR) 510 operator(token); 511 else 512 operand(token); 513 } 514 515 /* Close with a low priority operator */ 516 evaluntil(START); 517 518 /* Force END */ 519 operand(END); 520 evaluntil(START); 521 #ifdef DEBUG 522 dumpstack(); 523 #endif 524 if(nbra) 525 rcerror("unmatched left paren"); 526 --andp; /* points to first and only operand */ 527 pp->startinst = andp->first; 528 #ifdef DEBUG 529 dump(pp); 530 #endif 531 pp = optimize(pp); 532 #ifdef DEBUG 533 print("start: %d\n", andp->first-pp->firstinst); 534 dump(pp); 535 #endif 536 out: 537 if(errors){ 538 free(pp); 539 pp = 0; 540 } 541 return pp; 542 } 543 544 extern Reprog* 545 regcomp(char *s) 546 { 547 return regcomp1(s, 0, ANY); 548 } 549 550 extern Reprog* 551 regcomplit(char *s) 552 { 553 return regcomp1(s, 1, ANY); 554 } 555 556 extern Reprog* 557 regcompnl(char *s) 558 { 559 return regcomp1(s, 0, ANYNL); 560 } 561