1 #include <lib9.h>
2 #include "regexp.h"
3 #include "regcomp.h"
4
5 #define TRUE 1
6 #define FALSE 0
7
8 /*
9 * Parser Information
10 */
11 typedef
12 struct Node
13 {
14 Reinst* first;
15 Reinst* last;
16 }Node;
17
18 Reprog RePrOg;
19
20 #define NSTACK 20
21 static Node andstack[NSTACK];
22 static Node *andp;
23 static int atorstack[NSTACK];
24 static int* atorp;
25 static int cursubid; /* id of current subexpression */
26 static int subidstack[NSTACK]; /* parallel to atorstack */
27 static int* subidp;
28 static int lastwasand; /* Last token was operand */
29 static int nbra;
30 static char* exprp; /* pointer to next character in source expression */
31 static int lexdone;
32 static int nclass;
33 static Reclass*classp;
34 static Reinst* freep;
35 static int errors;
36 static Rune yyrune; /* last lex'd rune */
37 static Reclass*yyclassp; /* last lex'd class */
38
39 /* predeclared crap */
40 static void operator(int);
41 static void pushand(Reinst*, Reinst*);
42 static void pushator(int);
43 static void evaluntil(int);
44 static int bldcclass(void);
45
46 static jmp_buf regkaboom;
47
48 static void
rcerror(char * s)49 rcerror(char *s)
50 {
51 errors++;
52 regerror(s);
53 longjmp(regkaboom, 1);
54 }
55
56 static Reinst*
newinst(int t)57 newinst(int t)
58 {
59 freep->type = t;
60 freep->u2.left = 0;
61 freep->u1.right = 0;
62 return freep++;
63 }
64
65 static void
operand(int t)66 operand(int t)
67 {
68 Reinst *i;
69
70 if(lastwasand)
71 operator(CAT); /* catenate is implicit */
72 i = newinst(t);
73
74 if(t == CCLASS || t == NCCLASS)
75 i->u1.cp = yyclassp;
76 if(t == RUNE)
77 i->u1.r = yyrune;
78
79 pushand(i, i);
80 lastwasand = TRUE;
81 }
82
83 static void
operator(int t)84 operator(int t)
85 {
86 if(t==RBRA && --nbra<0)
87 rcerror("unmatched right paren");
88 if(t==LBRA){
89 if(++cursubid >= NSUBEXP)
90 rcerror ("too many subexpressions");
91 nbra++;
92 if(lastwasand)
93 operator(CAT);
94 } else
95 evaluntil(t);
96 if(t != RBRA)
97 pushator(t);
98 lastwasand = FALSE;
99 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
100 lastwasand = TRUE; /* these look like operands */
101 }
102
103 static void
regerr2(char * s,int c)104 regerr2(char *s, int c)
105 {
106 char buf[100];
107 char *cp = buf;
108 while(*s)
109 *cp++ = *s++;
110 *cp++ = c;
111 *cp = '\0';
112 rcerror(buf);
113 }
114
115 static void
cant(char * s)116 cant(char *s)
117 {
118 char buf[100];
119 strcpy(buf, "can't happen: ");
120 strcat(buf, s);
121 rcerror(buf);
122 }
123
124 static void
pushand(Reinst * f,Reinst * l)125 pushand(Reinst *f, Reinst *l)
126 {
127 if(andp >= &andstack[NSTACK])
128 cant("operand stack overflow");
129 andp->first = f;
130 andp->last = l;
131 andp++;
132 }
133
134 static void
pushator(int t)135 pushator(int t)
136 {
137 if(atorp >= &atorstack[NSTACK])
138 cant("operator stack overflow");
139 *atorp++ = t;
140 *subidp++ = cursubid;
141 }
142
143 static Node*
popand(int op)144 popand(int op)
145 {
146 Reinst *inst;
147
148 if(andp <= &andstack[0]){
149 regerr2("missing operand for ", op);
150 inst = newinst(NOP);
151 pushand(inst,inst);
152 }
153 return --andp;
154 }
155
156 static int
popator(void)157 popator(void)
158 {
159 if(atorp <= &atorstack[0])
160 cant("operator stack underflow");
161 --subidp;
162 return *--atorp;
163 }
164
165 static void
evaluntil(int pri)166 evaluntil(int pri)
167 {
168 Node *op1, *op2;
169 Reinst *inst1, *inst2;
170
171 while(pri==RBRA || atorp[-1]>=pri){
172 switch(popator()){
173 default:
174 rcerror("unknown operator in evaluntil");
175 break;
176 case LBRA: /* must have been RBRA */
177 op1 = popand('(');
178 inst2 = newinst(RBRA);
179 inst2->u1.subid = *subidp;
180 op1->last->u2.next = inst2;
181 inst1 = newinst(LBRA);
182 inst1->u1.subid = *subidp;
183 inst1->u2.next = op1->first;
184 pushand(inst1, inst2);
185 return;
186 case OR:
187 op2 = popand('|');
188 op1 = popand('|');
189 inst2 = newinst(NOP);
190 op2->last->u2.next = inst2;
191 op1->last->u2.next = inst2;
192 inst1 = newinst(OR);
193 inst1->u1.right = op1->first;
194 inst1->u2.left = op2->first;
195 pushand(inst1, inst2);
196 break;
197 case CAT:
198 op2 = popand(0);
199 op1 = popand(0);
200 op1->last->u2.next = op2->first;
201 pushand(op1->first, op2->last);
202 break;
203 case STAR:
204 op2 = popand('*');
205 inst1 = newinst(OR);
206 op2->last->u2.next = inst1;
207 inst1->u1.right = op2->first;
208 pushand(inst1, inst1);
209 break;
210 case PLUS:
211 op2 = popand('+');
212 inst1 = newinst(OR);
213 op2->last->u2.next = inst1;
214 inst1->u1.right = op2->first;
215 pushand(op2->first, inst1);
216 break;
217 case QUEST:
218 op2 = popand('?');
219 inst1 = newinst(OR);
220 inst2 = newinst(NOP);
221 inst1->u2.left = inst2;
222 inst1->u1.right = op2->first;
223 op2->last->u2.next = inst2;
224 pushand(inst1, inst2);
225 break;
226 }
227 }
228 }
229
230 static Reprog*
optimize(Reprog * pp)231 optimize(Reprog *pp)
232 {
233 Reinst *inst, *target;
234 int size;
235 Reprog *npp;
236 Reclass *cl;
237 int diff;
238
239 /*
240 * get rid of NOOP chains
241 */
242 for(inst=pp->firstinst; inst->type!=END; inst++){
243 target = inst->u2.next;
244 while(target->type == NOP)
245 target = target->u2.next;
246 inst->u2.next = target;
247 }
248
249 /*
250 * The original allocation is for an area larger than
251 * necessary. Reallocate to the actual space used
252 * and then relocate the code.
253 */
254 size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
255 npp = (Reprog *)realloc(pp, size);
256 if(npp==0 || npp==pp)
257 return pp;
258 diff = (char *)npp - (char *)pp;
259 freep = (Reinst *)((char *)freep + diff);
260 for(inst=npp->firstinst; inst<freep; inst++){
261 switch(inst->type){
262 case OR:
263 case STAR:
264 case PLUS:
265 case QUEST:
266 *(char **)&inst->u1.right += diff;
267 break;
268 case CCLASS:
269 case NCCLASS:
270 *(char **)&inst->u1.right += diff;
271 cl = inst->u1.cp;
272 *(char **)&cl->end += diff;
273 break;
274 }
275 *(char **)&inst->u2.left += diff;
276 }
277 *(char **)&npp->startinst += diff;
278 return npp;
279 }
280
281 #ifdef DEBUG
282 static void
dumpstack(void)283 dumpstack(void){
284 Node *stk;
285 int *ip;
286
287 print("operators\n");
288 for(ip=atorstack; ip<atorp; ip++)
289 print("0%o\n", *ip);
290 print("operands\n");
291 for(stk=andstack; stk<andp; stk++)
292 print("0%o\t0%o\n", stk->first->type, stk->last->type);
293 }
294
295 static void
dump(Reprog * pp)296 dump(Reprog *pp)
297 {
298 Reinst *l;
299 Rune *p;
300
301 l = pp->firstinst;
302 do{
303 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
304 l->u2.left-pp->firstinst, l->u1.right-pp->firstinst);
305 if(l->type == RUNE)
306 print("\t%C\n", l->r);
307 else if(l->type == CCLASS || l->type == NCCLASS){
308 print("\t[");
309 if(l->type == NCCLASS)
310 print("^");
311 for(p = l->cp->spans; p < l->cp->end; p += 2)
312 if(p[0] == p[1])
313 print("%C", p[0]);
314 else
315 print("%C-%C", p[0], p[1]);
316 print("]\n");
317 } else
318 print("\n");
319 }while(l++->type);
320 }
321 #endif
322
323 static Reclass*
newclass(void)324 newclass(void)
325 {
326 if(nclass >= NCLASS)
327 regerr2("too many character classes; limit", NCLASS+'0');
328 return &(classp[nclass++]);
329 }
330
331 static int
nextc(Rune * rp)332 nextc(Rune *rp)
333 {
334 if(lexdone){
335 *rp = 0;
336 return 1;
337 }
338 exprp += chartorune(rp, exprp);
339 if(*rp == L'\\'){
340 exprp += chartorune(rp, exprp);
341 return 1;
342 }
343 if(*rp == 0)
344 lexdone = 1;
345 return 0;
346 }
347
348 static int
lex(int literal,int dot_type)349 lex(int literal, int dot_type)
350 {
351 int quoted;
352
353 quoted = nextc(&yyrune);
354 if(literal || quoted){
355 if(yyrune == 0)
356 return END;
357 return RUNE;
358 }
359
360 switch(yyrune){
361 case 0:
362 return END;
363 case L'*':
364 return STAR;
365 case L'?':
366 return QUEST;
367 case L'+':
368 return PLUS;
369 case L'|':
370 return OR;
371 case L'.':
372 return dot_type;
373 case L'(':
374 return LBRA;
375 case L')':
376 return RBRA;
377 case L'^':
378 return BOL;
379 case L'$':
380 return EOL;
381 case L'[':
382 return bldcclass();
383 }
384 return RUNE;
385 }
386
387 static int
bldcclass(void)388 bldcclass(void)
389 {
390 int type;
391 Rune r[NCCRUNE];
392 Rune *p, *ep, *np;
393 Rune rune;
394 int quoted;
395
396 /* we have already seen the '[' */
397 type = CCLASS;
398 yyclassp = newclass();
399
400 /* look ahead for negation */
401 /* SPECIAL CASE!!! negated classes don't match \n */
402 ep = r;
403 quoted = nextc(&rune);
404 if(!quoted && rune == L'^'){
405 type = NCCLASS;
406 quoted = nextc(&rune);
407 *ep++ = L'\n';
408 *ep++ = L'\n';
409 }
410
411 /* parse class into a set of spans */
412 for(; ep<&r[NCCRUNE];){
413 if(rune == 0){
414 rcerror("malformed '[]'");
415 return 0;
416 }
417 if(!quoted && rune == L']')
418 break;
419 if(!quoted && rune == L'-'){
420 if(ep == r){
421 rcerror("malformed '[]'");
422 return 0;
423 }
424 quoted = nextc(&rune);
425 if((!quoted && rune == L']') || rune == 0){
426 rcerror("malformed '[]'");
427 return 0;
428 }
429 *(ep-1) = rune;
430 } else {
431 *ep++ = rune;
432 *ep++ = rune;
433 }
434 quoted = nextc(&rune);
435 }
436
437 /* sort on span start */
438 for(p = r; p < ep; p += 2){
439 for(np = p; np < ep; np += 2)
440 if(*np < *p){
441 rune = np[0];
442 np[0] = p[0];
443 p[0] = rune;
444 rune = np[1];
445 np[1] = p[1];
446 p[1] = rune;
447 }
448 }
449
450 /* merge spans */
451 np = yyclassp->spans;
452 p = r;
453 if(r == ep)
454 yyclassp->end = np;
455 else {
456 np[0] = *p++;
457 np[1] = *p++;
458 for(; p < ep; p += 2)
459 if(p[0] <= np[1]){
460 if(p[1] > np[1])
461 np[1] = p[1];
462 } else {
463 np += 2;
464 np[0] = p[0];
465 np[1] = p[1];
466 }
467 yyclassp->end = np+2;
468 }
469
470 return type;
471 }
472
473 static Reprog*
regcomp1(char * s,int literal,int dot_type)474 regcomp1(char *s, int literal, int dot_type)
475 {
476 int token;
477 Reprog *pp;
478
479 /* get memory for the program */
480 pp = (Reprog *)malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
481 if(pp == 0){
482 regerror("out of memory");
483 return 0;
484 }
485 freep = pp->firstinst;
486 classp = pp->class;
487 errors = 0;
488
489 if(setjmp(regkaboom))
490 goto out;
491
492 /* go compile the sucker */
493 lexdone = 0;
494 exprp = s;
495 nclass = 0;
496 nbra = 0;
497 atorp = atorstack;
498 andp = andstack;
499 subidp = subidstack;
500 lastwasand = FALSE;
501 cursubid = 0;
502
503 /* Start with a low priority operator to prime parser */
504 pushator(START-1);
505 while((token = lex(literal, dot_type)) != END){
506 if((token&0300) == OPERATOR)
507 operator(token);
508 else
509 operand(token);
510 }
511
512 /* Close with a low priority operator */
513 evaluntil(START);
514
515 /* Force END */
516 operand(END);
517 evaluntil(START);
518 #ifdef DEBUG
519 dumpstack();
520 #endif
521 if(nbra)
522 rcerror("unmatched left paren");
523 --andp; /* points to first and only operand */
524 pp->startinst = andp->first;
525 #ifdef DEBUG
526 dump(pp);
527 #endif
528 pp = optimize(pp);
529 #ifdef DEBUG
530 print("start: %d\n", andp->first-pp->firstinst);
531 dump(pp);
532 #endif
533 out:
534 if(errors){
535 free(pp);
536 pp = 0;
537 }
538 return pp;
539 }
540
541 extern Reprog*
regcomp(char * s)542 regcomp(char *s)
543 {
544 return regcomp1(s, 0, ANY);
545 }
546
547 extern Reprog*
regcomplit(char * s)548 regcomplit(char *s)
549 {
550 return regcomp1(s, 1, ANY);
551 }
552
553 extern Reprog*
regcompnl(char * s)554 regcompnl(char *s)
555 {
556 return regcomp1(s, 0, ANYNL);
557 }
558