xref: /plan9/sys/src/libregexp/regcomp.c (revision 3468a4915d661daa200976acc4f80f51aae144b2)
1 #include <u.h>
2 #include <libc.h>
3 #include "regexp.h"
4 #include "regcomp.h"
5 
6 #define	TRUE	1
7 #define	FALSE	0
8 
9 /*
10  * Parser Information
11  */
12 typedef
13 struct Node
14 {
15 	Reinst*	first;
16 	Reinst*	last;
17 }Node;
18 
19 /* max character classes per program is nelem(reprog->class) */
20 static Reprog	*reprog;
21 
22 /* max rune ranges per character class is nelem(classp->spans)/2 */
23 #define NCCRUNE	nelem(classp->spans)
24 
25 #define	NSTACK	20
26 static	Node	andstack[NSTACK];
27 static	Node	*andp;
28 static	int	atorstack[NSTACK];
29 static	int*	atorp;
30 static	int	cursubid;		/* id of current subexpression */
31 static	int	subidstack[NSTACK];	/* parallel to atorstack */
32 static	int*	subidp;
33 static	int	lastwasand;	/* Last token was operand */
34 static	int	nbra;
35 static	char*	exprp;		/* pointer to next character in source expression */
36 static	int	lexdone;
37 static	int	nclass;
38 static	Reclass*classp;
39 static	Reinst*	freep;
40 static	int	errors;
41 static	Rune	yyrune;		/* last lex'd rune */
42 static	Reclass*yyclassp;	/* last lex'd class */
43 
44 /* predeclared crap */
45 static	void	operator(int);
46 static	void	pushand(Reinst*, Reinst*);
47 static	void	pushator(int);
48 static	void	evaluntil(int);
49 static	int	bldcclass(void);
50 
51 static jmp_buf regkaboom;
52 
53 static	void
rcerror(char * s)54 rcerror(char *s)
55 {
56 	errors++;
57 	regerror(s);
58 	longjmp(regkaboom, 1);
59 }
60 
61 static	Reinst*
newinst(int t)62 newinst(int t)
63 {
64 	freep->type = t;
65 	freep->left = 0;
66 	freep->right = 0;
67 	return freep++;
68 }
69 
70 static	void
operand(int t)71 operand(int t)
72 {
73 	Reinst *i;
74 
75 	if(lastwasand)
76 		operator(CAT);	/* catenate is implicit */
77 	i = newinst(t);
78 
79 	if(t == CCLASS || t == NCCLASS)
80 		i->cp = yyclassp;
81 	if(t == RUNE)
82 		i->r = yyrune;
83 
84 	pushand(i, i);
85 	lastwasand = TRUE;
86 }
87 
88 static	void
operator(int t)89 operator(int t)
90 {
91 	if(t==RBRA && --nbra<0)
92 		rcerror("unmatched right paren");
93 	if(t==LBRA){
94 		if(++cursubid >= NSUBEXP)
95 			rcerror ("too many subexpressions");
96 		nbra++;
97 		if(lastwasand)
98 			operator(CAT);
99 	} else
100 		evaluntil(t);
101 	if(t != RBRA)
102 		pushator(t);
103 	lastwasand = FALSE;
104 	if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
105 		lastwasand = TRUE;	/* these look like operands */
106 }
107 
108 static	void
regerr2(char * s,int c)109 regerr2(char *s, int c)
110 {
111 	char buf[100];
112 	char *cp = buf;
113 	while(*s)
114 		*cp++ = *s++;
115 	*cp++ = c;
116 	*cp = '\0';
117 	rcerror(buf);
118 }
119 
120 static	void
cant(char * s)121 cant(char *s)
122 {
123 	char buf[100];
124 	strcpy(buf, "can't happen: ");
125 	strcat(buf, s);
126 	rcerror(buf);
127 }
128 
129 static	void
pushand(Reinst * f,Reinst * l)130 pushand(Reinst *f, Reinst *l)
131 {
132 	if(andp >= &andstack[NSTACK])
133 		cant("operand stack overflow");
134 	andp->first = f;
135 	andp->last = l;
136 	andp++;
137 }
138 
139 static	void
pushator(int t)140 pushator(int t)
141 {
142 	if(atorp >= &atorstack[NSTACK])
143 		cant("operator stack overflow");
144 	*atorp++ = t;
145 	*subidp++ = cursubid;
146 }
147 
148 static	Node*
popand(int op)149 popand(int op)
150 {
151 	Reinst *inst;
152 
153 	if(andp <= &andstack[0]){
154 		regerr2("missing operand for ", op);
155 		inst = newinst(NOP);
156 		pushand(inst,inst);
157 	}
158 	return --andp;
159 }
160 
161 static	int
popator(void)162 popator(void)
163 {
164 	if(atorp <= &atorstack[0])
165 		cant("operator stack underflow");
166 	--subidp;
167 	return *--atorp;
168 }
169 
170 static	void
evaluntil(int pri)171 evaluntil(int pri)
172 {
173 	Node *op1, *op2;
174 	Reinst *inst1, *inst2;
175 
176 	while(pri==RBRA || atorp[-1]>=pri){
177 		switch(popator()){
178 		default:
179 			rcerror("unknown operator in evaluntil");
180 			break;
181 		case LBRA:		/* must have been RBRA */
182 			op1 = popand('(');
183 			inst2 = newinst(RBRA);
184 			inst2->subid = *subidp;
185 			op1->last->next = inst2;
186 			inst1 = newinst(LBRA);
187 			inst1->subid = *subidp;
188 			inst1->next = op1->first;
189 			pushand(inst1, inst2);
190 			return;
191 		case OR:
192 			op2 = popand('|');
193 			op1 = popand('|');
194 			inst2 = newinst(NOP);
195 			op2->last->next = inst2;
196 			op1->last->next = inst2;
197 			inst1 = newinst(OR);
198 			inst1->right = op1->first;
199 			inst1->left = op2->first;
200 			pushand(inst1, inst2);
201 			break;
202 		case CAT:
203 			op2 = popand(0);
204 			op1 = popand(0);
205 			op1->last->next = op2->first;
206 			pushand(op1->first, op2->last);
207 			break;
208 		case STAR:
209 			op2 = popand('*');
210 			inst1 = newinst(OR);
211 			op2->last->next = inst1;
212 			inst1->right = op2->first;
213 			pushand(inst1, inst1);
214 			break;
215 		case PLUS:
216 			op2 = popand('+');
217 			inst1 = newinst(OR);
218 			op2->last->next = inst1;
219 			inst1->right = op2->first;
220 			pushand(op2->first, inst1);
221 			break;
222 		case QUEST:
223 			op2 = popand('?');
224 			inst1 = newinst(OR);
225 			inst2 = newinst(NOP);
226 			inst1->left = inst2;
227 			inst1->right = op2->first;
228 			op2->last->next = inst2;
229 			pushand(inst1, inst2);
230 			break;
231 		}
232 	}
233 }
234 
235 static	Reprog*
optimize(Reprog * pp)236 optimize(Reprog *pp)
237 {
238 	Reinst *inst, *target;
239 	int size;
240 	Reprog *npp;
241 	Reclass *cl;
242 	int diff;
243 
244 	/*
245 	 *  get rid of NOOP chains
246 	 */
247 	for(inst=pp->firstinst; inst->type!=END; inst++){
248 		target = inst->next;
249 		while(target->type == NOP)
250 			target = target->next;
251 		inst->next = target;
252 	}
253 
254 	/*
255 	 *  The original allocation is for an area larger than
256 	 *  necessary.  Reallocate to the actual space used
257 	 *  and then relocate the code.
258 	 */
259 	size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
260 	npp = realloc(pp, size);
261 	if(npp==0 || npp==pp)
262 		return pp;
263 	diff = (char *)npp - (char *)pp;
264 	freep = (Reinst *)((char *)freep + diff);
265 	for(inst=npp->firstinst; inst<freep; inst++){
266 		switch(inst->type){
267 		case OR:
268 		case STAR:
269 		case PLUS:
270 		case QUEST:
271 			*(char **)&inst->right += diff;
272 			break;
273 		case CCLASS:
274 		case NCCLASS:
275 			*(char **)&inst->right += diff;
276 			cl = inst->cp;
277 			*(char **)&cl->end += diff;
278 			break;
279 		}
280 		*(char **)&inst->left += diff;
281 	}
282 	*(char **)&npp->startinst += diff;
283 	return npp;
284 }
285 
286 #ifdef	DEBUG
287 static	void
dumpstack(void)288 dumpstack(void){
289 	Node *stk;
290 	int *ip;
291 
292 	print("operators\n");
293 	for(ip=atorstack; ip<atorp; ip++)
294 		print("0%o\n", *ip);
295 	print("operands\n");
296 	for(stk=andstack; stk<andp; stk++)
297 		print("0%o\t0%o\n", stk->first->type, stk->last->type);
298 }
299 
300 static	void
dump(Reprog * pp)301 dump(Reprog *pp)
302 {
303 	Reinst *l;
304 	Rune *p;
305 
306 	l = pp->firstinst;
307 	do{
308 		print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
309 			l->left-pp->firstinst, l->right-pp->firstinst);
310 		if(l->type == RUNE)
311 			print("\t%C\n", l->r);
312 		else if(l->type == CCLASS || l->type == NCCLASS){
313 			print("\t[");
314 			if(l->type == NCCLASS)
315 				print("^");
316 			for(p = l->cp->spans; p < l->cp->end; p += 2)
317 				if(p[0] == p[1])
318 					print("%C", p[0]);
319 				else
320 					print("%C-%C", p[0], p[1]);
321 			print("]\n");
322 		} else
323 			print("\n");
324 	}while(l++->type);
325 }
326 #endif
327 
328 static	Reclass*
newclass(void)329 newclass(void)
330 {
331 	if(nclass >= nelem(reprog->class))
332 		rcerror("too many character classes; increase Reprog.class size");
333 	return &(classp[nclass++]);
334 }
335 
336 static	int
nextc(Rune * rp)337 nextc(Rune *rp)
338 {
339 	if(lexdone){
340 		*rp = 0;
341 		return 1;
342 	}
343 	exprp += chartorune(rp, exprp);
344 	if(*rp == L'\\'){
345 		exprp += chartorune(rp, exprp);
346 		return 1;
347 	}
348 	if(*rp == 0)
349 		lexdone = 1;
350 	return 0;
351 }
352 
353 static	int
lex(int literal,int dot_type)354 lex(int literal, int dot_type)
355 {
356 	int quoted;
357 
358 	quoted = nextc(&yyrune);
359 	if(literal || quoted){
360 		if(yyrune == 0)
361 			return END;
362 		return RUNE;
363 	}
364 
365 	switch(yyrune){
366 	case 0:
367 		return END;
368 	case L'*':
369 		return STAR;
370 	case L'?':
371 		return QUEST;
372 	case L'+':
373 		return PLUS;
374 	case L'|':
375 		return OR;
376 	case L'.':
377 		return dot_type;
378 	case L'(':
379 		return LBRA;
380 	case L')':
381 		return RBRA;
382 	case L'^':
383 		return BOL;
384 	case L'$':
385 		return EOL;
386 	case L'[':
387 		return bldcclass();
388 	}
389 	return RUNE;
390 }
391 
392 static int
bldcclass(void)393 bldcclass(void)
394 {
395 	int type;
396 	Rune r[NCCRUNE];
397 	Rune *p, *ep, *np;
398 	Rune rune;
399 	int quoted;
400 
401 	/* we have already seen the '[' */
402 	type = CCLASS;
403 	yyclassp = newclass();
404 
405 	/* look ahead for negation */
406 	/* SPECIAL CASE!!! negated classes don't match \n */
407 	ep = r;
408 	quoted = nextc(&rune);
409 	if(!quoted && rune == L'^'){
410 		type = NCCLASS;
411 		quoted = nextc(&rune);
412 		*ep++ = L'\n';
413 		*ep++ = L'\n';
414 	}
415 
416 	/* parse class into a set of spans */
417 	while(ep < &r[NCCRUNE-1]){
418 		if(rune == 0){
419 			rcerror("malformed '[]'");
420 			return 0;
421 		}
422 		if(!quoted && rune == L']')
423 			break;
424 		if(!quoted && rune == L'-'){
425 			if(ep == r){
426 				rcerror("malformed '[]'");
427 				return 0;
428 			}
429 			quoted = nextc(&rune);
430 			if((!quoted && rune == L']') || rune == 0){
431 				rcerror("malformed '[]'");
432 				return 0;
433 			}
434 			*(ep-1) = rune;
435 		} else {
436 			*ep++ = rune;
437 			*ep++ = rune;
438 		}
439 		quoted = nextc(&rune);
440 	}
441 	if(ep >= &r[NCCRUNE-1]) {
442 		rcerror("char class too large; increase Reclass.spans size");
443 		return 0;
444 	}
445 
446 	/* sort on span start */
447 	for(p = r; p < ep; p += 2){
448 		for(np = p; np < ep; np += 2)
449 			if(*np < *p){
450 				rune = np[0];
451 				np[0] = p[0];
452 				p[0] = rune;
453 				rune = np[1];
454 				np[1] = p[1];
455 				p[1] = rune;
456 			}
457 	}
458 
459 	/* merge spans */
460 	np = yyclassp->spans;
461 	p = r;
462 	if(r == ep)
463 		yyclassp->end = np;
464 	else {
465 		np[0] = *p++;
466 		np[1] = *p++;
467 		for(; p < ep; p += 2)
468 			/* overlapping or adjacent ranges? */
469 			if(p[0] <= np[1] + 1){
470 				if(p[1] >= np[1])
471 					np[1] = p[1];	/* coalesce */
472 			} else {
473 				np += 2;
474 				np[0] = p[0];
475 				np[1] = p[1];
476 			}
477 		yyclassp->end = np+2;
478 	}
479 
480 	return type;
481 }
482 
483 static	Reprog*
regcomp1(char * s,int literal,int dot_type)484 regcomp1(char *s, int literal, int dot_type)
485 {
486 	int token;
487 	Reprog *pp;
488 
489 	/* get memory for the program */
490 	pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
491 	if(pp == 0){
492 		regerror("out of memory");
493 		return 0;
494 	}
495 	freep = pp->firstinst;
496 	classp = pp->class;
497 	errors = 0;
498 
499 	if(setjmp(regkaboom))
500 		goto out;
501 
502 	/* go compile the sucker */
503 	lexdone = 0;
504 	exprp = s;
505 	nclass = 0;
506 	nbra = 0;
507 	atorp = atorstack;
508 	andp = andstack;
509 	subidp = subidstack;
510 	lastwasand = FALSE;
511 	cursubid = 0;
512 
513 	/* Start with a low priority operator to prime parser */
514 	pushator(START-1);
515 	while((token = lex(literal, dot_type)) != END){
516 		if((token&0300) == OPERATOR)
517 			operator(token);
518 		else
519 			operand(token);
520 	}
521 
522 	/* Close with a low priority operator */
523 	evaluntil(START);
524 
525 	/* Force END */
526 	operand(END);
527 	evaluntil(START);
528 #ifdef DEBUG
529 	dumpstack();
530 #endif
531 	if(nbra)
532 		rcerror("unmatched left paren");
533 	--andp;	/* points to first and only operand */
534 	pp->startinst = andp->first;
535 #ifdef DEBUG
536 	dump(pp);
537 #endif
538 	pp = optimize(pp);
539 #ifdef DEBUG
540 	print("start: %d\n", andp->first-pp->firstinst);
541 	dump(pp);
542 #endif
543 out:
544 	if(errors){
545 		free(pp);
546 		pp = 0;
547 	}
548 	return pp;
549 }
550 
551 extern	Reprog*
regcomp(char * s)552 regcomp(char *s)
553 {
554 	return regcomp1(s, 0, ANY);
555 }
556 
557 extern	Reprog*
regcomplit(char * s)558 regcomplit(char *s)
559 {
560 	return regcomp1(s, 1, ANY);
561 }
562 
563 extern	Reprog*
regcompnl(char * s)564 regcompnl(char *s)
565 {
566 	return regcomp1(s, 0, ANYNL);
567 }
568