1 #include <u.h>
2 #include <libc.h>
3 #include "regexp.h"
4 #include "regcomp.h"
5
6 #define TRUE 1
7 #define FALSE 0
8
9 /*
10 * Parser Information
11 */
12 typedef
13 struct Node
14 {
15 Reinst* first;
16 Reinst* last;
17 }Node;
18
19 /* max character classes per program is nelem(reprog->class) */
20 static Reprog *reprog;
21
22 /* max rune ranges per character class is nelem(classp->spans)/2 */
23 #define NCCRUNE nelem(classp->spans)
24
25 #define NSTACK 20
26 static Node andstack[NSTACK];
27 static Node *andp;
28 static int atorstack[NSTACK];
29 static int* atorp;
30 static int cursubid; /* id of current subexpression */
31 static int subidstack[NSTACK]; /* parallel to atorstack */
32 static int* subidp;
33 static int lastwasand; /* Last token was operand */
34 static int nbra;
35 static char* exprp; /* pointer to next character in source expression */
36 static int lexdone;
37 static int nclass;
38 static Reclass*classp;
39 static Reinst* freep;
40 static int errors;
41 static Rune yyrune; /* last lex'd rune */
42 static Reclass*yyclassp; /* last lex'd class */
43
44 /* predeclared crap */
45 static void operator(int);
46 static void pushand(Reinst*, Reinst*);
47 static void pushator(int);
48 static void evaluntil(int);
49 static int bldcclass(void);
50
51 static jmp_buf regkaboom;
52
53 static void
rcerror(char * s)54 rcerror(char *s)
55 {
56 errors++;
57 regerror(s);
58 longjmp(regkaboom, 1);
59 }
60
61 static Reinst*
newinst(int t)62 newinst(int t)
63 {
64 freep->type = t;
65 freep->left = 0;
66 freep->right = 0;
67 return freep++;
68 }
69
70 static void
operand(int t)71 operand(int t)
72 {
73 Reinst *i;
74
75 if(lastwasand)
76 operator(CAT); /* catenate is implicit */
77 i = newinst(t);
78
79 if(t == CCLASS || t == NCCLASS)
80 i->cp = yyclassp;
81 if(t == RUNE)
82 i->r = yyrune;
83
84 pushand(i, i);
85 lastwasand = TRUE;
86 }
87
88 static void
operator(int t)89 operator(int t)
90 {
91 if(t==RBRA && --nbra<0)
92 rcerror("unmatched right paren");
93 if(t==LBRA){
94 if(++cursubid >= NSUBEXP)
95 rcerror ("too many subexpressions");
96 nbra++;
97 if(lastwasand)
98 operator(CAT);
99 } else
100 evaluntil(t);
101 if(t != RBRA)
102 pushator(t);
103 lastwasand = FALSE;
104 if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
105 lastwasand = TRUE; /* these look like operands */
106 }
107
108 static void
regerr2(char * s,int c)109 regerr2(char *s, int c)
110 {
111 char buf[100];
112 char *cp = buf;
113 while(*s)
114 *cp++ = *s++;
115 *cp++ = c;
116 *cp = '\0';
117 rcerror(buf);
118 }
119
120 static void
cant(char * s)121 cant(char *s)
122 {
123 char buf[100];
124 strcpy(buf, "can't happen: ");
125 strcat(buf, s);
126 rcerror(buf);
127 }
128
129 static void
pushand(Reinst * f,Reinst * l)130 pushand(Reinst *f, Reinst *l)
131 {
132 if(andp >= &andstack[NSTACK])
133 cant("operand stack overflow");
134 andp->first = f;
135 andp->last = l;
136 andp++;
137 }
138
139 static void
pushator(int t)140 pushator(int t)
141 {
142 if(atorp >= &atorstack[NSTACK])
143 cant("operator stack overflow");
144 *atorp++ = t;
145 *subidp++ = cursubid;
146 }
147
148 static Node*
popand(int op)149 popand(int op)
150 {
151 Reinst *inst;
152
153 if(andp <= &andstack[0]){
154 regerr2("missing operand for ", op);
155 inst = newinst(NOP);
156 pushand(inst,inst);
157 }
158 return --andp;
159 }
160
161 static int
popator(void)162 popator(void)
163 {
164 if(atorp <= &atorstack[0])
165 cant("operator stack underflow");
166 --subidp;
167 return *--atorp;
168 }
169
170 static void
evaluntil(int pri)171 evaluntil(int pri)
172 {
173 Node *op1, *op2;
174 Reinst *inst1, *inst2;
175
176 while(pri==RBRA || atorp[-1]>=pri){
177 switch(popator()){
178 default:
179 rcerror("unknown operator in evaluntil");
180 break;
181 case LBRA: /* must have been RBRA */
182 op1 = popand('(');
183 inst2 = newinst(RBRA);
184 inst2->subid = *subidp;
185 op1->last->next = inst2;
186 inst1 = newinst(LBRA);
187 inst1->subid = *subidp;
188 inst1->next = op1->first;
189 pushand(inst1, inst2);
190 return;
191 case OR:
192 op2 = popand('|');
193 op1 = popand('|');
194 inst2 = newinst(NOP);
195 op2->last->next = inst2;
196 op1->last->next = inst2;
197 inst1 = newinst(OR);
198 inst1->right = op1->first;
199 inst1->left = op2->first;
200 pushand(inst1, inst2);
201 break;
202 case CAT:
203 op2 = popand(0);
204 op1 = popand(0);
205 op1->last->next = op2->first;
206 pushand(op1->first, op2->last);
207 break;
208 case STAR:
209 op2 = popand('*');
210 inst1 = newinst(OR);
211 op2->last->next = inst1;
212 inst1->right = op2->first;
213 pushand(inst1, inst1);
214 break;
215 case PLUS:
216 op2 = popand('+');
217 inst1 = newinst(OR);
218 op2->last->next = inst1;
219 inst1->right = op2->first;
220 pushand(op2->first, inst1);
221 break;
222 case QUEST:
223 op2 = popand('?');
224 inst1 = newinst(OR);
225 inst2 = newinst(NOP);
226 inst1->left = inst2;
227 inst1->right = op2->first;
228 op2->last->next = inst2;
229 pushand(inst1, inst2);
230 break;
231 }
232 }
233 }
234
235 static Reprog*
optimize(Reprog * pp)236 optimize(Reprog *pp)
237 {
238 Reinst *inst, *target;
239 int size;
240 Reprog *npp;
241 Reclass *cl;
242 int diff;
243
244 /*
245 * get rid of NOOP chains
246 */
247 for(inst=pp->firstinst; inst->type!=END; inst++){
248 target = inst->next;
249 while(target->type == NOP)
250 target = target->next;
251 inst->next = target;
252 }
253
254 /*
255 * The original allocation is for an area larger than
256 * necessary. Reallocate to the actual space used
257 * and then relocate the code.
258 */
259 size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
260 npp = realloc(pp, size);
261 if(npp==0 || npp==pp)
262 return pp;
263 diff = (char *)npp - (char *)pp;
264 freep = (Reinst *)((char *)freep + diff);
265 for(inst=npp->firstinst; inst<freep; inst++){
266 switch(inst->type){
267 case OR:
268 case STAR:
269 case PLUS:
270 case QUEST:
271 *(char **)&inst->right += diff;
272 break;
273 case CCLASS:
274 case NCCLASS:
275 *(char **)&inst->right += diff;
276 cl = inst->cp;
277 *(char **)&cl->end += diff;
278 break;
279 }
280 *(char **)&inst->left += diff;
281 }
282 *(char **)&npp->startinst += diff;
283 return npp;
284 }
285
286 #ifdef DEBUG
287 static void
dumpstack(void)288 dumpstack(void){
289 Node *stk;
290 int *ip;
291
292 print("operators\n");
293 for(ip=atorstack; ip<atorp; ip++)
294 print("0%o\n", *ip);
295 print("operands\n");
296 for(stk=andstack; stk<andp; stk++)
297 print("0%o\t0%o\n", stk->first->type, stk->last->type);
298 }
299
300 static void
dump(Reprog * pp)301 dump(Reprog *pp)
302 {
303 Reinst *l;
304 Rune *p;
305
306 l = pp->firstinst;
307 do{
308 print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
309 l->left-pp->firstinst, l->right-pp->firstinst);
310 if(l->type == RUNE)
311 print("\t%C\n", l->r);
312 else if(l->type == CCLASS || l->type == NCCLASS){
313 print("\t[");
314 if(l->type == NCCLASS)
315 print("^");
316 for(p = l->cp->spans; p < l->cp->end; p += 2)
317 if(p[0] == p[1])
318 print("%C", p[0]);
319 else
320 print("%C-%C", p[0], p[1]);
321 print("]\n");
322 } else
323 print("\n");
324 }while(l++->type);
325 }
326 #endif
327
328 static Reclass*
newclass(void)329 newclass(void)
330 {
331 if(nclass >= nelem(reprog->class))
332 rcerror("too many character classes; increase Reprog.class size");
333 return &(classp[nclass++]);
334 }
335
336 static int
nextc(Rune * rp)337 nextc(Rune *rp)
338 {
339 if(lexdone){
340 *rp = 0;
341 return 1;
342 }
343 exprp += chartorune(rp, exprp);
344 if(*rp == L'\\'){
345 exprp += chartorune(rp, exprp);
346 return 1;
347 }
348 if(*rp == 0)
349 lexdone = 1;
350 return 0;
351 }
352
353 static int
lex(int literal,int dot_type)354 lex(int literal, int dot_type)
355 {
356 int quoted;
357
358 quoted = nextc(&yyrune);
359 if(literal || quoted){
360 if(yyrune == 0)
361 return END;
362 return RUNE;
363 }
364
365 switch(yyrune){
366 case 0:
367 return END;
368 case L'*':
369 return STAR;
370 case L'?':
371 return QUEST;
372 case L'+':
373 return PLUS;
374 case L'|':
375 return OR;
376 case L'.':
377 return dot_type;
378 case L'(':
379 return LBRA;
380 case L')':
381 return RBRA;
382 case L'^':
383 return BOL;
384 case L'$':
385 return EOL;
386 case L'[':
387 return bldcclass();
388 }
389 return RUNE;
390 }
391
392 static int
bldcclass(void)393 bldcclass(void)
394 {
395 int type;
396 Rune r[NCCRUNE];
397 Rune *p, *ep, *np;
398 Rune rune;
399 int quoted;
400
401 /* we have already seen the '[' */
402 type = CCLASS;
403 yyclassp = newclass();
404
405 /* look ahead for negation */
406 /* SPECIAL CASE!!! negated classes don't match \n */
407 ep = r;
408 quoted = nextc(&rune);
409 if(!quoted && rune == L'^'){
410 type = NCCLASS;
411 quoted = nextc(&rune);
412 *ep++ = L'\n';
413 *ep++ = L'\n';
414 }
415
416 /* parse class into a set of spans */
417 while(ep < &r[NCCRUNE-1]){
418 if(rune == 0){
419 rcerror("malformed '[]'");
420 return 0;
421 }
422 if(!quoted && rune == L']')
423 break;
424 if(!quoted && rune == L'-'){
425 if(ep == r){
426 rcerror("malformed '[]'");
427 return 0;
428 }
429 quoted = nextc(&rune);
430 if((!quoted && rune == L']') || rune == 0){
431 rcerror("malformed '[]'");
432 return 0;
433 }
434 *(ep-1) = rune;
435 } else {
436 *ep++ = rune;
437 *ep++ = rune;
438 }
439 quoted = nextc(&rune);
440 }
441 if(ep >= &r[NCCRUNE-1]) {
442 rcerror("char class too large; increase Reclass.spans size");
443 return 0;
444 }
445
446 /* sort on span start */
447 for(p = r; p < ep; p += 2){
448 for(np = p; np < ep; np += 2)
449 if(*np < *p){
450 rune = np[0];
451 np[0] = p[0];
452 p[0] = rune;
453 rune = np[1];
454 np[1] = p[1];
455 p[1] = rune;
456 }
457 }
458
459 /* merge spans */
460 np = yyclassp->spans;
461 p = r;
462 if(r == ep)
463 yyclassp->end = np;
464 else {
465 np[0] = *p++;
466 np[1] = *p++;
467 for(; p < ep; p += 2)
468 /* overlapping or adjacent ranges? */
469 if(p[0] <= np[1] + 1){
470 if(p[1] >= np[1])
471 np[1] = p[1]; /* coalesce */
472 } else {
473 np += 2;
474 np[0] = p[0];
475 np[1] = p[1];
476 }
477 yyclassp->end = np+2;
478 }
479
480 return type;
481 }
482
483 static Reprog*
regcomp1(char * s,int literal,int dot_type)484 regcomp1(char *s, int literal, int dot_type)
485 {
486 int token;
487 Reprog *pp;
488
489 /* get memory for the program */
490 pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
491 if(pp == 0){
492 regerror("out of memory");
493 return 0;
494 }
495 freep = pp->firstinst;
496 classp = pp->class;
497 errors = 0;
498
499 if(setjmp(regkaboom))
500 goto out;
501
502 /* go compile the sucker */
503 lexdone = 0;
504 exprp = s;
505 nclass = 0;
506 nbra = 0;
507 atorp = atorstack;
508 andp = andstack;
509 subidp = subidstack;
510 lastwasand = FALSE;
511 cursubid = 0;
512
513 /* Start with a low priority operator to prime parser */
514 pushator(START-1);
515 while((token = lex(literal, dot_type)) != END){
516 if((token&0300) == OPERATOR)
517 operator(token);
518 else
519 operand(token);
520 }
521
522 /* Close with a low priority operator */
523 evaluntil(START);
524
525 /* Force END */
526 operand(END);
527 evaluntil(START);
528 #ifdef DEBUG
529 dumpstack();
530 #endif
531 if(nbra)
532 rcerror("unmatched left paren");
533 --andp; /* points to first and only operand */
534 pp->startinst = andp->first;
535 #ifdef DEBUG
536 dump(pp);
537 #endif
538 pp = optimize(pp);
539 #ifdef DEBUG
540 print("start: %d\n", andp->first-pp->firstinst);
541 dump(pp);
542 #endif
543 out:
544 if(errors){
545 free(pp);
546 pp = 0;
547 }
548 return pp;
549 }
550
551 extern Reprog*
regcomp(char * s)552 regcomp(char *s)
553 {
554 return regcomp1(s, 0, ANY);
555 }
556
557 extern Reprog*
regcomplit(char * s)558 regcomplit(char *s)
559 {
560 return regcomp1(s, 1, ANY);
561 }
562
563 extern Reprog*
regcompnl(char * s)564 regcompnl(char *s)
565 {
566 return regcomp1(s, 0, ANYNL);
567 }
568