xref: /plan9/sys/src/ape/lib/regexp/regcomp.c (revision 3e12c5d1bb89fc02707907988834ef147769ddaf)
1 #include <stdlib.h>
2 #include <stdio.h>
3 #include <setjmp.h>
4 #include <string.h>
5 #include "regexp.h"
6 #include "regcomp.h"
7 
8 #define	TRUE	1
9 #define	FALSE	0
10 
11 /*
12  * Parser Information
13  */
14 typedef
15 struct Node
16 {
17 	Reinst*	first;
18 	Reinst*	last;
19 }Node;
20 
21 #define	NSTACK	20
22 static	Node	andstack[NSTACK];
23 static	Node	*andp;
24 static	int	atorstack[NSTACK];
25 static	int*	atorp;
26 static	int	cursubid;		/* id of current subexpression */
27 static	int	subidstack[NSTACK];	/* parallel to atorstack */
28 static	int*	subidp;
29 static	int	lastwasand;	/* Last token was operand */
30 static	int	nbra;
31 static	char*	exprp;		/* pointer to next character in source expression */
32 static	int	lexdone;
33 static	int	nclass;
34 static	Reclass*classp;
35 static	Reinst*	freep;
36 static	int	errors;
37 static	wchar_t	yyrune;		/* last lex'd rune */
38 static	Reclass*yyclassp;	/* last lex'd class */
39 
40 /* predeclared crap */
41 static	void	operator(int);
42 static	void	pushand(Reinst*, Reinst*);
43 static	void	pushator(int);
44 static	void	evaluntil(int);
45 static	int	bldcclass(void);
46 
47 static jmp_buf regkaboom;
48 
49 static	void
rcerror(char * s)50 rcerror(char *s)
51 {
52 	errors++;
53 	regerror(s);
54 	longjmp(regkaboom, 1);
55 }
56 
57 static	Reinst*
newinst(int t)58 newinst(int t)
59 {
60 	freep->type = t;
61 	freep->l.left = 0;
62 	freep->r.right = 0;
63 	return freep++;
64 }
65 
66 static	void
operand(int t)67 operand(int t)
68 {
69 	Reinst *i;
70 
71 	if(lastwasand)
72 		operator(CAT);	/* catenate is implicit */
73 	i = newinst(t);
74 
75 	if(t == CCLASS || t == NCCLASS)
76 		i->r.cp = yyclassp;
77 	if(t == RUNE)
78 		i->r.r = yyrune;
79 
80 	pushand(i, i);
81 	lastwasand = TRUE;
82 }
83 
84 static	void
operator(int t)85 operator(int t)
86 {
87 	if(t==RBRA && --nbra<0)
88 		rcerror("unmatched right paren");
89 	if(t==LBRA){
90 		if(++cursubid >= NSUBEXP)
91 			rcerror ("too many subexpressions");
92 		nbra++;
93 		if(lastwasand)
94 			operator(CAT);
95 	} else
96 		evaluntil(t);
97 	if(t != RBRA)
98 		pushator(t);
99 	lastwasand = FALSE;
100 	if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
101 		lastwasand = TRUE;	/* these look like operands */
102 }
103 
104 static	void
regerr2(char * s,int c)105 regerr2(char *s, int c)
106 {
107 	char buf[100];
108 	char *cp = buf;
109 	while(*s)
110 		*cp++ = *s++;
111 	*cp++ = c;
112 	*cp = '\0';
113 	rcerror(buf);
114 }
115 
116 static	void
cant(char * s)117 cant(char *s)
118 {
119 	char buf[100];
120 	strcpy(buf, "can't happen: ");
121 	strcat(buf, s);
122 	rcerror(buf);
123 }
124 
125 static	void
pushand(Reinst * f,Reinst * l)126 pushand(Reinst *f, Reinst *l)
127 {
128 	if(andp >= &andstack[NSTACK])
129 		cant("operand stack overflow");
130 	andp->first = f;
131 	andp->last = l;
132 	andp++;
133 }
134 
135 static	void
pushator(int t)136 pushator(int t)
137 {
138 	if(atorp >= &atorstack[NSTACK])
139 		cant("operator stack overflow");
140 	*atorp++ = t;
141 	*subidp++ = cursubid;
142 }
143 
144 static	Node*
popand(int op)145 popand(int op)
146 {
147 	Reinst *inst;
148 
149 	if(andp <= &andstack[0]){
150 		regerr2("missing operand for ", op);
151 		inst = newinst(NOP);
152 		pushand(inst,inst);
153 	}
154 	return --andp;
155 }
156 
157 static	int
popator(void)158 popator(void)
159 {
160 	if(atorp <= &atorstack[0])
161 		cant("operator stack underflow");
162 	--subidp;
163 	return *--atorp;
164 }
165 
166 static	void
evaluntil(int pri)167 evaluntil(int pri)
168 {
169 	Node *op1, *op2;
170 	Reinst *inst1, *inst2;
171 
172 	while(pri==RBRA || atorp[-1]>=pri){
173 		switch(popator()){
174 		default:
175 			rcerror("unknown operator in evaluntil");
176 			break;
177 		case LBRA:		/* must have been RBRA */
178 			op1 = popand('(');
179 			inst2 = newinst(RBRA);
180 			inst2->r.subid = *subidp;
181 			op1->last->l.next = inst2;
182 			inst1 = newinst(LBRA);
183 			inst1->r.subid = *subidp;
184 			inst1->l.next = op1->first;
185 			pushand(inst1, inst2);
186 			return;
187 		case OR:
188 			op2 = popand('|');
189 			op1 = popand('|');
190 			inst2 = newinst(NOP);
191 			op2->last->l.next = inst2;
192 			op1->last->l.next = inst2;
193 			inst1 = newinst(OR);
194 			inst1->r.right = op1->first;
195 			inst1->l.left = op2->first;
196 			pushand(inst1, inst2);
197 			break;
198 		case CAT:
199 			op2 = popand(0);
200 			op1 = popand(0);
201 			op1->last->l.next = op2->first;
202 			pushand(op1->first, op2->last);
203 			break;
204 		case STAR:
205 			op2 = popand('*');
206 			inst1 = newinst(OR);
207 			op2->last->l.next = inst1;
208 			inst1->r.right = op2->first;
209 			pushand(inst1, inst1);
210 			break;
211 		case PLUS:
212 			op2 = popand('+');
213 			inst1 = newinst(OR);
214 			op2->last->l.next = inst1;
215 			inst1->r.right = op2->first;
216 			pushand(op2->first, inst1);
217 			break;
218 		case QUEST:
219 			op2 = popand('?');
220 			inst1 = newinst(OR);
221 			inst2 = newinst(NOP);
222 			inst1->l.left = inst2;
223 			inst1->r.right = op2->first;
224 			op2->last->l.next = inst2;
225 			pushand(inst1, inst2);
226 			break;
227 		}
228 	}
229 }
230 
231 static	Reprog*
optimize(Reprog * pp)232 optimize(Reprog *pp)
233 {
234 	Reinst *inst, *target;
235 	int size;
236 	Reprog *npp;
237 	int diff;
238 
239 	/*
240 	 *  get rid of NOOP chains
241 	 */
242 	for(inst=pp->firstinst; inst->type!=END; inst++){
243 		target = inst->l.next;
244 		while(target->type == NOP)
245 			target = target->l.next;
246 		inst->l.next = target;
247 	}
248 
249 	/*
250 	 *  The original allocation is for an area larger than
251 	 *  necessary.  Reallocate to the actual space used
252 	 *  and then relocate the code.
253 	 */
254 	size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
255 	npp = realloc(pp, size);
256 	if(npp==0 || npp==pp)
257 		return pp;
258 	diff = (char *)npp - (char *)pp;
259 	freep = (Reinst *)((char *)freep + diff);
260 	for(inst=npp->firstinst; inst<freep; inst++){
261 		switch(inst->type){
262 		case OR:
263 		case STAR:
264 		case PLUS:
265 		case QUEST:
266 		case CCLASS:
267 		case NCCLASS:
268 			*(char **)&inst->r.right += diff;
269 			break;
270 		}
271 		*(char **)&inst->l.left += diff;
272 	}
273 	*(char **)&npp->startinst += diff;
274 	return npp;
275 }
276 
277 #ifdef	DEBUG
278 static	void
dumpstack(void)279 dumpstack(void){
280 	Node *stk;
281 	int *ip;
282 
283 	print("operators\n");
284 	for(ip=atorstack; ip<atorp; ip++)
285 		print("0%o\n", *ip);
286 	print("operands\n");
287 	for(stk=andstack; stk<andp; stk++)
288 		print("0%o\t0%o\n", stk->first->type, stk->last->type);
289 }
290 
291 static	void
dump(Reprog * pp)292 dump(Reprog *pp)
293 {
294 	Reinst *l;
295 	wchar_t *p;
296 
297 	l = pp->firstinst;
298 	do{
299 		print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
300 			l->l.left-pp->firstinst, l->l.right-pp->firstinst);
301 		if(l->type == RUNE)
302 			print("\t%C\n", l->r);
303 		else if(l->type == CCLASS || l->type == NCCLASS){
304 			print("\t[");
305 			if(l->type == NCCLASS)
306 				print("^");
307 			for(p = l->r.cp->spans; p < l->r.cp->end; p += 2)
308 				if(p[0] == p[1])
309 					print("%C", p[0]);
310 				else
311 					print("%C-%C", p[0], p[1]);
312 			print("]\n");
313 		} else
314 			print("\n");
315 	}while(l++->type);
316 }
317 #endif
318 
319 static	Reclass*
newclass(void)320 newclass(void)
321 {
322 	if(nclass >= NCLASS)
323 		regerr2("too many character classes; limit", NCLASS+'0');
324 	return &(classp[nclass++]);
325 }
326 
327 static	int
nextc(wchar_t * rp)328 nextc(wchar_t *rp)
329 {
330 	int n;
331 
332 	if(lexdone){
333 		*rp = 0;
334 		return 1;
335 	}
336 	n = mbtowc(rp, exprp, MB_CUR_MAX);
337 	if (n <= 0)
338 		n = 1;
339 	exprp += n;
340 	if(*rp == L'\\'){
341 		n = mbtowc(rp, exprp, MB_CUR_MAX);
342 		if (n <= 0)
343 			n = 1;
344 		exprp += n;
345 		return 1;
346 	}
347 	if(*rp == 0)
348 		lexdone = 1;
349 	return 0;
350 }
351 
352 static	int
lex(int literal,int dot_type)353 lex(int literal, int dot_type)
354 {
355 	int quoted;
356 
357 	quoted = nextc(&yyrune);
358 	if(literal || quoted){
359 		if(yyrune == 0)
360 			return END;
361 		return RUNE;
362 	}
363 
364 	switch(yyrune){
365 	case 0:
366 		return END;
367 	case L'*':
368 		return STAR;
369 	case L'?':
370 		return QUEST;
371 	case L'+':
372 		return PLUS;
373 	case L'|':
374 		return OR;
375 	case L'.':
376 		return dot_type;
377 	case L'(':
378 		return LBRA;
379 	case L')':
380 		return RBRA;
381 	case L'^':
382 		return BOL;
383 	case L'$':
384 		return EOL;
385 	case L'[':
386 		return bldcclass();
387 	}
388 	return RUNE;
389 }
390 
391 static int
bldcclass(void)392 bldcclass(void)
393 {
394 	int type;
395 	wchar_t r[NCCRUNE];
396 	wchar_t *p, *ep, *np;
397 	wchar_t rune;
398 	int quoted;
399 
400 	/* we have already seen the '[' */
401 	type = CCLASS;
402 	yyclassp = newclass();
403 
404 	/* look ahead for negation */
405 	ep = r;
406 	quoted = nextc(&rune);
407 	if(!quoted && rune == L'^'){
408 		type = NCCLASS;
409 		quoted = nextc(&rune);
410 		*ep++ = L'\n';
411 		*ep++ = L'\n';
412 	}
413 
414 	/* parse class into a set of spans */
415 	for(; ep<&r[NCCRUNE];){
416 		if(rune == 0){
417 			rcerror("malformed '[]'");
418 			return 0;
419 		}
420 		if(!quoted && rune == L']')
421 			break;
422 		if(!quoted && rune == L'-'){
423 			if(ep == r){
424 				rcerror("malformed '[]'");
425 				return 0;
426 			}
427 			quoted = nextc(&rune);
428 			if((!quoted && rune == L']') || rune == 0){
429 				rcerror("malformed '[]'");
430 				return 0;
431 			}
432 			*(ep-1) = rune;
433 		} else {
434 			*ep++ = rune;
435 			*ep++ = rune;
436 		}
437 		quoted = nextc(&rune);
438 	}
439 
440 	/* sort on span start */
441 	for(p = r; p < ep; p += 2){
442 		for(np = p; np < ep; np += 2)
443 			if(*np < *p){
444 				rune = np[0];
445 				np[0] = p[0];
446 				p[0] = rune;
447 				rune = np[1];
448 				np[1] = p[1];
449 				p[1] = rune;
450 			}
451 	}
452 
453 	/* merge spans */
454 	np = yyclassp->spans;
455 	p = r;
456 	if(r == ep)
457 		yyclassp->end = np;
458 	else {
459 		np[0] = *p++;
460 		np[1] = *p++;
461 		for(; p < ep; p += 2)
462 			if(p[0] <= np[1]){
463 				if(p[1] > np[1])
464 					np[1] = p[1];
465 			} else {
466 				np += 2;
467 				np[0] = p[0];
468 				np[1] = p[1];
469 			}
470 		yyclassp->end = np+2;
471 	}
472 
473 	return type;
474 }
475 
476 static	Reprog*
regcomp1(char * s,int literal,int dot_type)477 regcomp1(char *s, int literal, int dot_type)
478 {
479 	int token;
480 	Reprog *pp;
481 
482 	/* get memory for the program */
483 	pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
484 	if(pp == 0){
485 		regerror("out of memory");
486 		return 0;
487 	}
488 	freep = pp->firstinst;
489 	classp = pp->class;
490 	errors = 0;
491 
492 	if(setjmp(regkaboom))
493 		goto out;
494 
495 	/* go compile the sucker */
496 	lexdone = 0;
497 	exprp = s;
498 	nclass = 0;
499 	nbra = 0;
500 	atorp = atorstack;
501 	andp = andstack;
502 	subidp = subidstack;
503 	lastwasand = FALSE;
504 	cursubid = 0;
505 
506 	/* Start with a low priority operator to prime parser */
507 	pushator(START-1);
508 	while((token = lex(literal, dot_type)) != END){
509 		if((token&0300) == OPERATOR)
510 			operator(token);
511 		else
512 			operand(token);
513 	}
514 
515 	/* Close with a low priority operator */
516 	evaluntil(START);
517 
518 	/* Force END */
519 	operand(END);
520 	evaluntil(START);
521 #ifdef DEBUG
522 	dumpstack();
523 #endif
524 	if(nbra)
525 		rcerror("unmatched left paren");
526 	--andp;	/* points to first and only operand */
527 	pp->startinst = andp->first;
528 #ifdef DEBUG
529 	dump(pp);
530 #endif
531 	pp = optimize(pp);
532 #ifdef DEBUG
533 	print("start: %d\n", andp->first-pp->firstinst);
534 	dump(pp);
535 #endif
536 out:
537 	if(errors){
538 		free(pp);
539 		pp = 0;
540 	}
541 	return pp;
542 }
543 
544 extern	Reprog*
regcomp(char * s)545 regcomp(char *s)
546 {
547 	return regcomp1(s, 0, ANY);
548 }
549 
550 extern	Reprog*
regcomplit(char * s)551 regcomplit(char *s)
552 {
553 	return regcomp1(s, 1, ANY);
554 }
555 
556 extern	Reprog*
regcompnl(char * s)557 regcompnl(char *s)
558 {
559 	return regcomp1(s, 0, ANYNL);
560 }
561