1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <setjmp.h>
4 #include <string.h>
5 #include "regexp.h"
6 #include "regcomp.h"
7
8 #define TRUE 1
9 #define FALSE 0
10
11 /*
12 * Parser Information
13 */
14 typedef
15 struct Node
16 {
17 Reinst* first;
18 Reinst* last;
19 }Node;
20
21 #define NSTACK 20
22 static Node andstack[NSTACK];
23 static Node *andp;
24 static int atorstack[NSTACK];
25 static int* atorp;
26 static int cursubid; /* id of current subexpression */
27 static int subidstack[NSTACK]; /* parallel to atorstack */
28 static int* subidp;
29 static int lastwasand; /* Last token was operand */
30 static int nbra;
31 static char* exprp; /* pointer to next character in source expression */
32 static int lexdone;
33 static int nclass;
34 static Reclass*classp;
35 static Reinst* freep;
36 static int errors;
37 static wchar_t yyrune; /* last lex'd rune */
38 static Reclass*yyclassp; /* last lex'd class */
39
40 /* predeclared crap */
41 static void operator(int);
42 static void pushand(Reinst*, Reinst*);
43 static void pushator(int);
44 static void evaluntil(int);
45 static int bldcclass(void);
46
47 static jmp_buf regkaboom;
48
49 static void
rcerror(char * s)50 rcerror(char *s)
51 {
52 errors++;
53 regerror(s);
54 longjmp(regkaboom, 1);
55 }
56
57 static Reinst*
newinst(int t)58 newinst(int t)
59 {
60 freep->type = t;
61 freep->l.left = 0;
62 freep->r.right = 0;
63 return freep++;
64 }
65
66 static void
operand(int t)67 operand(int t)
68 {
69 Reinst *i;
70
71 if(lastwasand)
72 operator(CAT); /* catenate is implicit */
73 i = newinst(t);
74
75 if(t == CCLASS || t == NCCLASS)
76 i->r.cp = yyclassp;
77 if(t == RUNE)
78 i->r.r = yyrune;
79
80 pushand(i, i);
81 lastwasand = TRUE;
82 }
83
84 static void
operator(int t)85 operator(int t)
86 {
87 if(t==RBRA && --nbra<0)
88 rcerror("unmatched right paren");
89 if(t==LBRA){
90 if(++cursubid >= NSUBEXP)
91 rcerror ("too many subexpressions");
92 nbra++;
93 if(lastwasand)
94 operator(CAT);
95 } else
96 evaluntil(t);
97 if(t != RBRA)
98 pushator(t);
99 lastwasand = FALSE;
100 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
101 lastwasand = TRUE; /* these look like operands */
102 }
103
104 static void
regerr2(char * s,int c)105 regerr2(char *s, int c)
106 {
107 char buf[100];
108 char *cp = buf;
109 while(*s)
110 *cp++ = *s++;
111 *cp++ = c;
112 *cp = '\0';
113 rcerror(buf);
114 }
115
116 static void
cant(char * s)117 cant(char *s)
118 {
119 char buf[100];
120 strcpy(buf, "can't happen: ");
121 strcat(buf, s);
122 rcerror(buf);
123 }
124
125 static void
pushand(Reinst * f,Reinst * l)126 pushand(Reinst *f, Reinst *l)
127 {
128 if(andp >= &andstack[NSTACK])
129 cant("operand stack overflow");
130 andp->first = f;
131 andp->last = l;
132 andp++;
133 }
134
135 static void
pushator(int t)136 pushator(int t)
137 {
138 if(atorp >= &atorstack[NSTACK])
139 cant("operator stack overflow");
140 *atorp++ = t;
141 *subidp++ = cursubid;
142 }
143
144 static Node*
popand(int op)145 popand(int op)
146 {
147 Reinst *inst;
148
149 if(andp <= &andstack[0]){
150 regerr2("missing operand for ", op);
151 inst = newinst(NOP);
152 pushand(inst,inst);
153 }
154 return --andp;
155 }
156
157 static int
popator(void)158 popator(void)
159 {
160 if(atorp <= &atorstack[0])
161 cant("operator stack underflow");
162 --subidp;
163 return *--atorp;
164 }
165
166 static void
evaluntil(int pri)167 evaluntil(int pri)
168 {
169 Node *op1, *op2;
170 Reinst *inst1, *inst2;
171
172 while(pri==RBRA || atorp[-1]>=pri){
173 switch(popator()){
174 default:
175 rcerror("unknown operator in evaluntil");
176 break;
177 case LBRA: /* must have been RBRA */
178 op1 = popand('(');
179 inst2 = newinst(RBRA);
180 inst2->r.subid = *subidp;
181 op1->last->l.next = inst2;
182 inst1 = newinst(LBRA);
183 inst1->r.subid = *subidp;
184 inst1->l.next = op1->first;
185 pushand(inst1, inst2);
186 return;
187 case OR:
188 op2 = popand('|');
189 op1 = popand('|');
190 inst2 = newinst(NOP);
191 op2->last->l.next = inst2;
192 op1->last->l.next = inst2;
193 inst1 = newinst(OR);
194 inst1->r.right = op1->first;
195 inst1->l.left = op2->first;
196 pushand(inst1, inst2);
197 break;
198 case CAT:
199 op2 = popand(0);
200 op1 = popand(0);
201 op1->last->l.next = op2->first;
202 pushand(op1->first, op2->last);
203 break;
204 case STAR:
205 op2 = popand('*');
206 inst1 = newinst(OR);
207 op2->last->l.next = inst1;
208 inst1->r.right = op2->first;
209 pushand(inst1, inst1);
210 break;
211 case PLUS:
212 op2 = popand('+');
213 inst1 = newinst(OR);
214 op2->last->l.next = inst1;
215 inst1->r.right = op2->first;
216 pushand(op2->first, inst1);
217 break;
218 case QUEST:
219 op2 = popand('?');
220 inst1 = newinst(OR);
221 inst2 = newinst(NOP);
222 inst1->l.left = inst2;
223 inst1->r.right = op2->first;
224 op2->last->l.next = inst2;
225 pushand(inst1, inst2);
226 break;
227 }
228 }
229 }
230
231 static Reprog*
optimize(Reprog * pp)232 optimize(Reprog *pp)
233 {
234 Reinst *inst, *target;
235 int size;
236 Reprog *npp;
237 int diff;
238
239 /*
240 * get rid of NOOP chains
241 */
242 for(inst=pp->firstinst; inst->type!=END; inst++){
243 target = inst->l.next;
244 while(target->type == NOP)
245 target = target->l.next;
246 inst->l.next = target;
247 }
248
249 /*
250 * The original allocation is for an area larger than
251 * necessary. Reallocate to the actual space used
252 * and then relocate the code.
253 */
254 size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
255 npp = realloc(pp, size);
256 if(npp==0 || npp==pp)
257 return pp;
258 diff = (char *)npp - (char *)pp;
259 freep = (Reinst *)((char *)freep + diff);
260 for(inst=npp->firstinst; inst<freep; inst++){
261 switch(inst->type){
262 case OR:
263 case STAR:
264 case PLUS:
265 case QUEST:
266 case CCLASS:
267 case NCCLASS:
268 *(char **)&inst->r.right += diff;
269 break;
270 }
271 *(char **)&inst->l.left += diff;
272 }
273 *(char **)&npp->startinst += diff;
274 return npp;
275 }
276
277 #ifdef DEBUG
278 static void
dumpstack(void)279 dumpstack(void){
280 Node *stk;
281 int *ip;
282
283 print("operators\n");
284 for(ip=atorstack; ip<atorp; ip++)
285 print("0%o\n", *ip);
286 print("operands\n");
287 for(stk=andstack; stk<andp; stk++)
288 print("0%o\t0%o\n", stk->first->type, stk->last->type);
289 }
290
291 static void
dump(Reprog * pp)292 dump(Reprog *pp)
293 {
294 Reinst *l;
295 wchar_t *p;
296
297 l = pp->firstinst;
298 do{
299 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
300 l->l.left-pp->firstinst, l->l.right-pp->firstinst);
301 if(l->type == RUNE)
302 print("\t%C\n", l->r);
303 else if(l->type == CCLASS || l->type == NCCLASS){
304 print("\t[");
305 if(l->type == NCCLASS)
306 print("^");
307 for(p = l->r.cp->spans; p < l->r.cp->end; p += 2)
308 if(p[0] == p[1])
309 print("%C", p[0]);
310 else
311 print("%C-%C", p[0], p[1]);
312 print("]\n");
313 } else
314 print("\n");
315 }while(l++->type);
316 }
317 #endif
318
319 static Reclass*
newclass(void)320 newclass(void)
321 {
322 if(nclass >= NCLASS)
323 regerr2("too many character classes; limit", NCLASS+'0');
324 return &(classp[nclass++]);
325 }
326
327 static int
nextc(wchar_t * rp)328 nextc(wchar_t *rp)
329 {
330 int n;
331
332 if(lexdone){
333 *rp = 0;
334 return 1;
335 }
336 n = mbtowc(rp, exprp, MB_CUR_MAX);
337 if (n <= 0)
338 n = 1;
339 exprp += n;
340 if(*rp == L'\\'){
341 n = mbtowc(rp, exprp, MB_CUR_MAX);
342 if (n <= 0)
343 n = 1;
344 exprp += n;
345 return 1;
346 }
347 if(*rp == 0)
348 lexdone = 1;
349 return 0;
350 }
351
352 static int
lex(int literal,int dot_type)353 lex(int literal, int dot_type)
354 {
355 int quoted;
356
357 quoted = nextc(&yyrune);
358 if(literal || quoted){
359 if(yyrune == 0)
360 return END;
361 return RUNE;
362 }
363
364 switch(yyrune){
365 case 0:
366 return END;
367 case L'*':
368 return STAR;
369 case L'?':
370 return QUEST;
371 case L'+':
372 return PLUS;
373 case L'|':
374 return OR;
375 case L'.':
376 return dot_type;
377 case L'(':
378 return LBRA;
379 case L')':
380 return RBRA;
381 case L'^':
382 return BOL;
383 case L'$':
384 return EOL;
385 case L'[':
386 return bldcclass();
387 }
388 return RUNE;
389 }
390
391 static int
bldcclass(void)392 bldcclass(void)
393 {
394 int type;
395 wchar_t r[NCCRUNE];
396 wchar_t *p, *ep, *np;
397 wchar_t rune;
398 int quoted;
399
400 /* we have already seen the '[' */
401 type = CCLASS;
402 yyclassp = newclass();
403
404 /* look ahead for negation */
405 ep = r;
406 quoted = nextc(&rune);
407 if(!quoted && rune == L'^'){
408 type = NCCLASS;
409 quoted = nextc(&rune);
410 *ep++ = L'\n';
411 *ep++ = L'\n';
412 }
413
414 /* parse class into a set of spans */
415 for(; ep<&r[NCCRUNE];){
416 if(rune == 0){
417 rcerror("malformed '[]'");
418 return 0;
419 }
420 if(!quoted && rune == L']')
421 break;
422 if(!quoted && rune == L'-'){
423 if(ep == r){
424 rcerror("malformed '[]'");
425 return 0;
426 }
427 quoted = nextc(&rune);
428 if((!quoted && rune == L']') || rune == 0){
429 rcerror("malformed '[]'");
430 return 0;
431 }
432 *(ep-1) = rune;
433 } else {
434 *ep++ = rune;
435 *ep++ = rune;
436 }
437 quoted = nextc(&rune);
438 }
439
440 /* sort on span start */
441 for(p = r; p < ep; p += 2){
442 for(np = p; np < ep; np += 2)
443 if(*np < *p){
444 rune = np[0];
445 np[0] = p[0];
446 p[0] = rune;
447 rune = np[1];
448 np[1] = p[1];
449 p[1] = rune;
450 }
451 }
452
453 /* merge spans */
454 np = yyclassp->spans;
455 p = r;
456 if(r == ep)
457 yyclassp->end = np;
458 else {
459 np[0] = *p++;
460 np[1] = *p++;
461 for(; p < ep; p += 2)
462 if(p[0] <= np[1]){
463 if(p[1] > np[1])
464 np[1] = p[1];
465 } else {
466 np += 2;
467 np[0] = p[0];
468 np[1] = p[1];
469 }
470 yyclassp->end = np+2;
471 }
472
473 return type;
474 }
475
476 static Reprog*
regcomp1(char * s,int literal,int dot_type)477 regcomp1(char *s, int literal, int dot_type)
478 {
479 int token;
480 Reprog *pp;
481
482 /* get memory for the program */
483 pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
484 if(pp == 0){
485 regerror("out of memory");
486 return 0;
487 }
488 freep = pp->firstinst;
489 classp = pp->class;
490 errors = 0;
491
492 if(setjmp(regkaboom))
493 goto out;
494
495 /* go compile the sucker */
496 lexdone = 0;
497 exprp = s;
498 nclass = 0;
499 nbra = 0;
500 atorp = atorstack;
501 andp = andstack;
502 subidp = subidstack;
503 lastwasand = FALSE;
504 cursubid = 0;
505
506 /* Start with a low priority operator to prime parser */
507 pushator(START-1);
508 while((token = lex(literal, dot_type)) != END){
509 if((token&0300) == OPERATOR)
510 operator(token);
511 else
512 operand(token);
513 }
514
515 /* Close with a low priority operator */
516 evaluntil(START);
517
518 /* Force END */
519 operand(END);
520 evaluntil(START);
521 #ifdef DEBUG
522 dumpstack();
523 #endif
524 if(nbra)
525 rcerror("unmatched left paren");
526 --andp; /* points to first and only operand */
527 pp->startinst = andp->first;
528 #ifdef DEBUG
529 dump(pp);
530 #endif
531 pp = optimize(pp);
532 #ifdef DEBUG
533 print("start: %d\n", andp->first-pp->firstinst);
534 dump(pp);
535 #endif
536 out:
537 if(errors){
538 free(pp);
539 pp = 0;
540 }
541 return pp;
542 }
543
544 extern Reprog*
regcomp(char * s)545 regcomp(char *s)
546 {
547 return regcomp1(s, 0, ANY);
548 }
549
550 extern Reprog*
regcomplit(char * s)551 regcomplit(char *s)
552 {
553 return regcomp1(s, 1, ANY);
554 }
555
556 extern Reprog*
regcompnl(char * s)557 regcompnl(char *s)
558 {
559 return regcomp1(s, 0, ANYNL);
560 }
561