xref: /plan9-contrib/sys/src/libregexp/regcomp.c (revision 7dd7cddf99dd7472612f1413b4da293630e6b1bc)
1 #include <u.h>
2 #include <libc.h>
3 #include "regexp.h"
4 #include "regcomp.h"
5 
6 #define	TRUE	1
7 #define	FALSE	0
8 
9 /*
10  * Parser Information
11  */
12 typedef
13 struct Node
14 {
15 	Reinst*	first;
16 	Reinst*	last;
17 }Node;
18 
19 #define	NSTACK	20
20 static	Node	andstack[NSTACK];
21 static	Node	*andp;
22 static	int	atorstack[NSTACK];
23 static	int*	atorp;
24 static	int	cursubid;		/* id of current subexpression */
25 static	int	subidstack[NSTACK];	/* parallel to atorstack */
26 static	int*	subidp;
27 static	int	lastwasand;	/* Last token was operand */
28 static	int	nbra;
29 static	char*	exprp;		/* pointer to next character in source expression */
30 static	int	lexdone;
31 static	int	nclass;
32 static	Reclass*classp;
33 static	Reinst*	freep;
34 static	int	errors;
35 static	Rune	yyrune;		/* last lex'd rune */
36 static	Reclass*yyclassp;	/* last lex'd class */
37 
38 /* predeclared crap */
39 static	void	operator(int);
40 static	void	pushand(Reinst*, Reinst*);
41 static	void	pushator(int);
42 static	void	evaluntil(int);
43 static	int	bldcclass(void);
44 
45 static jmp_buf regkaboom;
46 
47 static	void
48 rcerror(char *s)
49 {
50 	errors++;
51 	regerror(s);
52 	longjmp(regkaboom, 1);
53 }
54 
55 static	Reinst*
56 newinst(int t)
57 {
58 	freep->type = t;
59 	freep->left = 0;
60 	freep->right = 0;
61 	return freep++;
62 }
63 
64 static	void
65 operand(int t)
66 {
67 	Reinst *i;
68 
69 	if(lastwasand)
70 		operator(CAT);	/* catenate is implicit */
71 	i = newinst(t);
72 
73 	if(t == CCLASS || t == NCCLASS)
74 		i->cp = yyclassp;
75 	if(t == RUNE)
76 		i->r = yyrune;
77 
78 	pushand(i, i);
79 	lastwasand = TRUE;
80 }
81 
82 static	void
83 operator(int t)
84 {
85 	if(t==RBRA && --nbra<0)
86 		rcerror("unmatched right paren");
87 	if(t==LBRA){
88 		if(++cursubid >= NSUBEXP)
89 			rcerror ("too many subexpressions");
90 		nbra++;
91 		if(lastwasand)
92 			operator(CAT);
93 	} else
94 		evaluntil(t);
95 	if(t != RBRA)
96 		pushator(t);
97 	lastwasand = FALSE;
98 	if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
99 		lastwasand = TRUE;	/* these look like operands */
100 }
101 
102 static	void
103 regerr2(char *s, int c)
104 {
105 	char buf[100];
106 	char *cp = buf;
107 	while(*s)
108 		*cp++ = *s++;
109 	*cp++ = c;
110 	*cp = '\0';
111 	rcerror(buf);
112 }
113 
114 static	void
115 cant(char *s)
116 {
117 	char buf[100];
118 	strcpy(buf, "can't happen: ");
119 	strcat(buf, s);
120 	rcerror(buf);
121 }
122 
123 static	void
124 pushand(Reinst *f, Reinst *l)
125 {
126 	if(andp >= &andstack[NSTACK])
127 		cant("operand stack overflow");
128 	andp->first = f;
129 	andp->last = l;
130 	andp++;
131 }
132 
133 static	void
134 pushator(int t)
135 {
136 	if(atorp >= &atorstack[NSTACK])
137 		cant("operator stack overflow");
138 	*atorp++ = t;
139 	*subidp++ = cursubid;
140 }
141 
142 static	Node*
143 popand(int op)
144 {
145 	Reinst *inst;
146 
147 	if(andp <= &andstack[0]){
148 		regerr2("missing operand for ", op);
149 		inst = newinst(NOP);
150 		pushand(inst,inst);
151 	}
152 	return --andp;
153 }
154 
155 static	int
156 popator(void)
157 {
158 	if(atorp <= &atorstack[0])
159 		cant("operator stack underflow");
160 	--subidp;
161 	return *--atorp;
162 }
163 
164 static	void
165 evaluntil(int pri)
166 {
167 	Node *op1, *op2;
168 	Reinst *inst1, *inst2;
169 
170 	while(pri==RBRA || atorp[-1]>=pri){
171 		switch(popator()){
172 		default:
173 			rcerror("unknown operator in evaluntil");
174 			break;
175 		case LBRA:		/* must have been RBRA */
176 			op1 = popand('(');
177 			inst2 = newinst(RBRA);
178 			inst2->subid = *subidp;
179 			op1->last->next = inst2;
180 			inst1 = newinst(LBRA);
181 			inst1->subid = *subidp;
182 			inst1->next = op1->first;
183 			pushand(inst1, inst2);
184 			return;
185 		case OR:
186 			op2 = popand('|');
187 			op1 = popand('|');
188 			inst2 = newinst(NOP);
189 			op2->last->next = inst2;
190 			op1->last->next = inst2;
191 			inst1 = newinst(OR);
192 			inst1->right = op1->first;
193 			inst1->left = op2->first;
194 			pushand(inst1, inst2);
195 			break;
196 		case CAT:
197 			op2 = popand(0);
198 			op1 = popand(0);
199 			op1->last->next = op2->first;
200 			pushand(op1->first, op2->last);
201 			break;
202 		case STAR:
203 			op2 = popand('*');
204 			inst1 = newinst(OR);
205 			op2->last->next = inst1;
206 			inst1->right = op2->first;
207 			pushand(inst1, inst1);
208 			break;
209 		case PLUS:
210 			op2 = popand('+');
211 			inst1 = newinst(OR);
212 			op2->last->next = inst1;
213 			inst1->right = op2->first;
214 			pushand(op2->first, inst1);
215 			break;
216 		case QUEST:
217 			op2 = popand('?');
218 			inst1 = newinst(OR);
219 			inst2 = newinst(NOP);
220 			inst1->left = inst2;
221 			inst1->right = op2->first;
222 			op2->last->next = inst2;
223 			pushand(inst1, inst2);
224 			break;
225 		}
226 	}
227 }
228 
229 static	Reprog*
230 optimize(Reprog *pp)
231 {
232 	Reinst *inst, *target;
233 	int size;
234 	Reprog *npp;
235 	Reclass *cl;
236 	int diff;
237 
238 	/*
239 	 *  get rid of NOOP chains
240 	 */
241 	for(inst=pp->firstinst; inst->type!=END; inst++){
242 		target = inst->next;
243 		while(target->type == NOP)
244 			target = target->next;
245 		inst->next = target;
246 	}
247 
248 	/*
249 	 *  The original allocation is for an area larger than
250 	 *  necessary.  Reallocate to the actual space used
251 	 *  and then relocate the code.
252 	 */
253 	size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
254 	npp = realloc(pp, size);
255 	if(npp==0 || npp==pp)
256 		return pp;
257 	diff = (char *)npp - (char *)pp;
258 	freep = (Reinst *)((char *)freep + diff);
259 	for(inst=npp->firstinst; inst<freep; inst++){
260 		switch(inst->type){
261 		case OR:
262 		case STAR:
263 		case PLUS:
264 		case QUEST:
265 			*(char **)&inst->right += diff;
266 			break;
267 		case CCLASS:
268 		case NCCLASS:
269 			*(char **)&inst->right += diff;
270 			cl = inst->cp;
271 			*(char **)&cl->end += diff;
272 			break;
273 		}
274 		*(char **)&inst->left += diff;
275 	}
276 	*(char **)&npp->startinst += diff;
277 	return npp;
278 }
279 
280 #ifdef	DEBUG
281 static	void
282 dumpstack(void){
283 	Node *stk;
284 	int *ip;
285 
286 	print("operators\n");
287 	for(ip=atorstack; ip<atorp; ip++)
288 		print("0%o\n", *ip);
289 	print("operands\n");
290 	for(stk=andstack; stk<andp; stk++)
291 		print("0%o\t0%o\n", stk->first->type, stk->last->type);
292 }
293 
294 static	void
295 dump(Reprog *pp)
296 {
297 	Reinst *l;
298 	Rune *p;
299 
300 	l = pp->firstinst;
301 	do{
302 		print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
303 			l->left-pp->firstinst, l->right-pp->firstinst);
304 		if(l->type == RUNE)
305 			print("\t%C\n", l->r);
306 		else if(l->type == CCLASS || l->type == NCCLASS){
307 			print("\t[");
308 			if(l->type == NCCLASS)
309 				print("^");
310 			for(p = l->cp->spans; p < l->cp->end; p += 2)
311 				if(p[0] == p[1])
312 					print("%C", p[0]);
313 				else
314 					print("%C-%C", p[0], p[1]);
315 			print("]\n");
316 		} else
317 			print("\n");
318 	}while(l++->type);
319 }
320 #endif
321 
322 static	Reclass*
323 newclass(void)
324 {
325 	if(nclass >= NCLASS)
326 		regerr2("too many character classes; limit", NCLASS+'0');
327 	return &(classp[nclass++]);
328 }
329 
330 static	int
331 nextc(Rune *rp)
332 {
333 	if(lexdone){
334 		*rp = 0;
335 		return 1;
336 	}
337 	exprp += chartorune(rp, exprp);
338 	if(*rp == L'\\'){
339 		exprp += chartorune(rp, exprp);
340 		return 1;
341 	}
342 	if(*rp == 0)
343 		lexdone = 1;
344 	return 0;
345 }
346 
347 static	int
348 lex(int literal, int dot_type)
349 {
350 	int quoted;
351 
352 	quoted = nextc(&yyrune);
353 	if(literal || quoted){
354 		if(yyrune == 0)
355 			return END;
356 		return RUNE;
357 	}
358 
359 	switch(yyrune){
360 	case 0:
361 		return END;
362 	case L'*':
363 		return STAR;
364 	case L'?':
365 		return QUEST;
366 	case L'+':
367 		return PLUS;
368 	case L'|':
369 		return OR;
370 	case L'.':
371 		return dot_type;
372 	case L'(':
373 		return LBRA;
374 	case L')':
375 		return RBRA;
376 	case L'^':
377 		return BOL;
378 	case L'$':
379 		return EOL;
380 	case L'[':
381 		return bldcclass();
382 	}
383 	return RUNE;
384 }
385 
386 static int
387 bldcclass(void)
388 {
389 	int type;
390 	Rune r[NCCRUNE];
391 	Rune *p, *ep, *np;
392 	Rune rune;
393 	int quoted;
394 
395 	/* we have already seen the '[' */
396 	type = CCLASS;
397 	yyclassp = newclass();
398 
399 	/* look ahead for negation */
400 	/* SPECIAL CASE!!! negated classes don't match \n */
401 	ep = r;
402 	quoted = nextc(&rune);
403 	if(!quoted && rune == L'^'){
404 		type = NCCLASS;
405 		quoted = nextc(&rune);
406 		*ep++ = L'\n';
407 		*ep++ = L'\n';
408 	}
409 
410 	/* parse class into a set of spans */
411 	for(; ep<&r[NCCRUNE];){
412 		if(rune == 0){
413 			rcerror("malformed '[]'");
414 			return 0;
415 		}
416 		if(!quoted && rune == L']')
417 			break;
418 		if(!quoted && rune == L'-'){
419 			if(ep == r){
420 				rcerror("malformed '[]'");
421 				return 0;
422 			}
423 			quoted = nextc(&rune);
424 			if((!quoted && rune == L']') || rune == 0){
425 				rcerror("malformed '[]'");
426 				return 0;
427 			}
428 			*(ep-1) = rune;
429 		} else {
430 			*ep++ = rune;
431 			*ep++ = rune;
432 		}
433 		quoted = nextc(&rune);
434 	}
435 
436 	/* sort on span start */
437 	for(p = r; p < ep; p += 2){
438 		for(np = p; np < ep; np += 2)
439 			if(*np < *p){
440 				rune = np[0];
441 				np[0] = p[0];
442 				p[0] = rune;
443 				rune = np[1];
444 				np[1] = p[1];
445 				p[1] = rune;
446 			}
447 	}
448 
449 	/* merge spans */
450 	np = yyclassp->spans;
451 	p = r;
452 	if(r == ep)
453 		yyclassp->end = np;
454 	else {
455 		np[0] = *p++;
456 		np[1] = *p++;
457 		for(; p < ep; p += 2)
458 			if(p[0] <= np[1]){
459 				if(p[1] > np[1])
460 					np[1] = p[1];
461 			} else {
462 				np += 2;
463 				np[0] = p[0];
464 				np[1] = p[1];
465 			}
466 		yyclassp->end = np+2;
467 	}
468 
469 	return type;
470 }
471 
472 static	Reprog*
473 regcomp1(char *s, int literal, int dot_type)
474 {
475 	int token;
476 	Reprog *pp;
477 
478 	/* get memory for the program */
479 	pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
480 	if(pp == 0){
481 		regerror("out of memory");
482 		return 0;
483 	}
484 	freep = pp->firstinst;
485 	classp = pp->class;
486 	errors = 0;
487 
488 	if(setjmp(regkaboom))
489 		goto out;
490 
491 	/* go compile the sucker */
492 	lexdone = 0;
493 	exprp = s;
494 	nclass = 0;
495 	nbra = 0;
496 	atorp = atorstack;
497 	andp = andstack;
498 	subidp = subidstack;
499 	lastwasand = FALSE;
500 	cursubid = 0;
501 
502 	/* Start with a low priority operator to prime parser */
503 	pushator(START-1);
504 	while((token = lex(literal, dot_type)) != END){
505 		if((token&0300) == OPERATOR)
506 			operator(token);
507 		else
508 			operand(token);
509 	}
510 
511 	/* Close with a low priority operator */
512 	evaluntil(START);
513 
514 	/* Force END */
515 	operand(END);
516 	evaluntil(START);
517 #ifdef DEBUG
518 	dumpstack();
519 #endif
520 	if(nbra)
521 		rcerror("unmatched left paren");
522 	--andp;	/* points to first and only operand */
523 	pp->startinst = andp->first;
524 #ifdef DEBUG
525 	dump(pp);
526 #endif
527 	pp = optimize(pp);
528 #ifdef DEBUG
529 	print("start: %d\n", andp->first-pp->firstinst);
530 	dump(pp);
531 #endif
532 out:
533 	if(errors){
534 		free(pp);
535 		pp = 0;
536 	}
537 	return pp;
538 }
539 
540 extern	Reprog*
541 regcomp(char *s)
542 {
543 	return regcomp1(s, 0, ANY);
544 }
545 
546 extern	Reprog*
547 regcomplit(char *s)
548 {
549 	return regcomp1(s, 1, ANY);
550 }
551 
552 extern	Reprog*
553 regcompnl(char *s)
554 {
555 	return regcomp1(s, 0, ANYNL);
556 }
557