xref: /plan9-contrib/sys/src/libregexp/regcomp.c (revision 3e12c5d1bb89fc02707907988834ef147769ddaf)
1 #include <u.h>
2 #include <libc.h>
3 #include "regexp.h"
4 #include "regcomp.h"
5 
6 #define	TRUE	1
7 #define	FALSE	0
8 
9 /*
10  * Parser Information
11  */
12 typedef
13 struct Node
14 {
15 	Reinst*	first;
16 	Reinst*	last;
17 }Node;
18 
19 #define	NSTACK	20
20 static	Node	andstack[NSTACK];
21 static	Node	*andp;
22 static	int	atorstack[NSTACK];
23 static	int*	atorp;
24 static	int	cursubid;		/* id of current subexpression */
25 static	int	subidstack[NSTACK];	/* parallel to atorstack */
26 static	int*	subidp;
27 static	int	lastwasand;	/* Last token was operand */
28 static	int	nbra;
29 static	char*	exprp;		/* pointer to next character in source expression */
30 static	int	lexdone;
31 static	int	nclass;
32 static	Reclass*classp;
33 static	Reinst*	freep;
34 static	int	errors;
35 static	Rune	yyrune;		/* last lex'd rune */
36 static	Reclass*yyclassp;	/* last lex'd class */
37 
38 /* predeclared crap */
39 static	void	operator(int);
40 static	void	pushand(Reinst*, Reinst*);
41 static	void	pushator(int);
42 static	void	evaluntil(int);
43 static	int	bldcclass(void);
44 
45 static jmp_buf regkaboom;
46 
47 static	void
48 rcerror(char *s)
49 {
50 	errors++;
51 	regerror(s);
52 	longjmp(regkaboom, 1);
53 }
54 
55 static	Reinst*
56 newinst(int t)
57 {
58 	freep->type = t;
59 	freep->left = 0;
60 	freep->right = 0;
61 	return freep++;
62 }
63 
64 static	void
65 operand(int t)
66 {
67 	Reinst *i;
68 
69 	if(lastwasand)
70 		operator(CAT);	/* catenate is implicit */
71 	i = newinst(t);
72 
73 	if(t == CCLASS || t == NCCLASS)
74 		i->cp = yyclassp;
75 	if(t == RUNE)
76 		i->r = yyrune;
77 
78 	pushand(i, i);
79 	lastwasand = TRUE;
80 }
81 
82 static	void
83 operator(int t)
84 {
85 	if(t==RBRA && --nbra<0)
86 		rcerror("unmatched right paren");
87 	if(t==LBRA){
88 		if(++cursubid >= NSUBEXP)
89 			rcerror ("too many subexpressions");
90 		nbra++;
91 		if(lastwasand)
92 			operator(CAT);
93 	} else
94 		evaluntil(t);
95 	if(t != RBRA)
96 		pushator(t);
97 	lastwasand = FALSE;
98 	if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
99 		lastwasand = TRUE;	/* these look like operands */
100 }
101 
102 static	void
103 regerr2(char *s, int c)
104 {
105 	char buf[100];
106 	char *cp = buf;
107 	while(*s)
108 		*cp++ = *s++;
109 	*cp++ = c;
110 	*cp = '\0';
111 	rcerror(buf);
112 }
113 
114 static	void
115 cant(char *s)
116 {
117 	char buf[100];
118 	strcpy(buf, "can't happen: ");
119 	strcat(buf, s);
120 	rcerror(buf);
121 }
122 
123 static	void
124 pushand(Reinst *f, Reinst *l)
125 {
126 	if(andp >= &andstack[NSTACK])
127 		cant("operand stack overflow");
128 	andp->first = f;
129 	andp->last = l;
130 	andp++;
131 }
132 
133 static	void
134 pushator(int t)
135 {
136 	if(atorp >= &atorstack[NSTACK])
137 		cant("operator stack overflow");
138 	*atorp++ = t;
139 	*subidp++ = cursubid;
140 }
141 
142 static	Node*
143 popand(int op)
144 {
145 	Reinst *inst;
146 
147 	if(andp <= &andstack[0]){
148 		regerr2("missing operand for ", op);
149 		inst = newinst(NOP);
150 		pushand(inst,inst);
151 	}
152 	return --andp;
153 }
154 
155 static	int
156 popator(void)
157 {
158 	if(atorp <= &atorstack[0])
159 		cant("operator stack underflow");
160 	--subidp;
161 	return *--atorp;
162 }
163 
164 static	void
165 evaluntil(int pri)
166 {
167 	Node *op1, *op2;
168 	Reinst *inst1, *inst2;
169 
170 	while(pri==RBRA || atorp[-1]>=pri){
171 		switch(popator()){
172 		default:
173 			rcerror("unknown operator in evaluntil");
174 			break;
175 		case LBRA:		/* must have been RBRA */
176 			op1 = popand('(');
177 			inst2 = newinst(RBRA);
178 			inst2->subid = *subidp;
179 			op1->last->next = inst2;
180 			inst1 = newinst(LBRA);
181 			inst1->subid = *subidp;
182 			inst1->next = op1->first;
183 			pushand(inst1, inst2);
184 			return;
185 		case OR:
186 			op2 = popand('|');
187 			op1 = popand('|');
188 			inst2 = newinst(NOP);
189 			op2->last->next = inst2;
190 			op1->last->next = inst2;
191 			inst1 = newinst(OR);
192 			inst1->right = op1->first;
193 			inst1->left = op2->first;
194 			pushand(inst1, inst2);
195 			break;
196 		case CAT:
197 			op2 = popand(0);
198 			op1 = popand(0);
199 			op1->last->next = op2->first;
200 			pushand(op1->first, op2->last);
201 			break;
202 		case STAR:
203 			op2 = popand('*');
204 			inst1 = newinst(OR);
205 			op2->last->next = inst1;
206 			inst1->right = op2->first;
207 			pushand(inst1, inst1);
208 			break;
209 		case PLUS:
210 			op2 = popand('+');
211 			inst1 = newinst(OR);
212 			op2->last->next = inst1;
213 			inst1->right = op2->first;
214 			pushand(op2->first, inst1);
215 			break;
216 		case QUEST:
217 			op2 = popand('?');
218 			inst1 = newinst(OR);
219 			inst2 = newinst(NOP);
220 			inst1->left = inst2;
221 			inst1->right = op2->first;
222 			op2->last->next = inst2;
223 			pushand(inst1, inst2);
224 			break;
225 		}
226 	}
227 }
228 
229 static	Reprog*
230 optimize(Reprog *pp)
231 {
232 	Reinst *inst, *target;
233 	int size;
234 	Reprog *npp;
235 	int diff;
236 
237 	/*
238 	 *  get rid of NOOP chains
239 	 */
240 	for(inst=pp->firstinst; inst->type!=END; inst++){
241 		target = inst->next;
242 		while(target->type == NOP)
243 			target = target->next;
244 		inst->next = target;
245 	}
246 
247 	/*
248 	 *  The original allocation is for an area larger than
249 	 *  necessary.  Reallocate to the actual space used
250 	 *  and then relocate the code.
251 	 */
252 	size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
253 	npp = realloc(pp, size);
254 	if(npp==0 || npp==pp)
255 		return pp;
256 	diff = (char *)npp - (char *)pp;
257 	freep = (Reinst *)((char *)freep + diff);
258 	for(inst=npp->firstinst; inst<freep; inst++){
259 		switch(inst->type){
260 		case OR:
261 		case STAR:
262 		case PLUS:
263 		case QUEST:
264 		case CCLASS:
265 		case NCCLASS:
266 			*(char **)&inst->right += diff;
267 			break;
268 		}
269 		*(char **)&inst->left += diff;
270 	}
271 	*(char **)&npp->startinst += diff;
272 	return npp;
273 }
274 
275 #ifdef	DEBUG
276 static	void
277 dumpstack(void){
278 	Node *stk;
279 	int *ip;
280 
281 	print("operators\n");
282 	for(ip=atorstack; ip<atorp; ip++)
283 		print("0%o\n", *ip);
284 	print("operands\n");
285 	for(stk=andstack; stk<andp; stk++)
286 		print("0%o\t0%o\n", stk->first->type, stk->last->type);
287 }
288 
289 static	void
290 dump(Reprog *pp)
291 {
292 	Reinst *l;
293 	Rune *p;
294 
295 	l = pp->firstinst;
296 	do{
297 		print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
298 			l->left-pp->firstinst, l->right-pp->firstinst);
299 		if(l->type == RUNE)
300 			print("\t%C\n", l->r);
301 		else if(l->type == CCLASS || l->type == NCCLASS){
302 			print("\t[");
303 			if(l->type == NCCLASS)
304 				print("^");
305 			for(p = l->cp->spans; p < l->cp->end; p += 2)
306 				if(p[0] == p[1])
307 					print("%C", p[0]);
308 				else
309 					print("%C-%C", p[0], p[1]);
310 			print("]\n");
311 		} else
312 			print("\n");
313 	}while(l++->type);
314 }
315 #endif
316 
317 static	Reclass*
318 newclass(void)
319 {
320 	if(nclass >= NCLASS)
321 		regerr2("too many character classes; limit", NCLASS+'0');
322 	return &(classp[nclass++]);
323 }
324 
325 static	int
326 nextc(Rune *rp)
327 {
328 	if(lexdone){
329 		*rp = 0;
330 		return 1;
331 	}
332 	exprp += chartorune(rp, exprp);
333 	if(*rp == L'\\'){
334 		exprp += chartorune(rp, exprp);
335 		return 1;
336 	}
337 	if(*rp == 0)
338 		lexdone = 1;
339 	return 0;
340 }
341 
342 static	int
343 lex(int literal, int dot_type)
344 {
345 	int quoted;
346 
347 	quoted = nextc(&yyrune);
348 	if(literal || quoted){
349 		if(yyrune == 0)
350 			return END;
351 		return RUNE;
352 	}
353 
354 	switch(yyrune){
355 	case 0:
356 		return END;
357 	case L'*':
358 		return STAR;
359 	case L'?':
360 		return QUEST;
361 	case L'+':
362 		return PLUS;
363 	case L'|':
364 		return OR;
365 	case L'.':
366 		return dot_type;
367 	case L'(':
368 		return LBRA;
369 	case L')':
370 		return RBRA;
371 	case L'^':
372 		return BOL;
373 	case L'$':
374 		return EOL;
375 	case L'[':
376 		return bldcclass();
377 	}
378 	return RUNE;
379 }
380 
381 static int
382 bldcclass(void)
383 {
384 	int type;
385 	Rune r[NCCRUNE];
386 	Rune *p, *ep, *np;
387 	Rune rune;
388 	int quoted;
389 
390 	/* we have already seen the '[' */
391 	type = CCLASS;
392 	yyclassp = newclass();
393 
394 	/* look ahead for negation */
395 	/* SPECIAL CASE!!! negated classes don't match \n */
396 	ep = r;
397 	quoted = nextc(&rune);
398 	if(!quoted && rune == L'^'){
399 		type = NCCLASS;
400 		quoted = nextc(&rune);
401 		*ep++ = L'\n';
402 		*ep++ = L'\n';
403 	}
404 
405 	/* parse class into a set of spans */
406 	for(; ep<&r[NCCRUNE];){
407 		if(rune == 0){
408 			rcerror("malformed '[]'");
409 			return 0;
410 		}
411 		if(!quoted && rune == L']')
412 			break;
413 		if(!quoted && rune == L'-'){
414 			if(ep == r){
415 				rcerror("malformed '[]'");
416 				return 0;
417 			}
418 			quoted = nextc(&rune);
419 			if((!quoted && rune == L']') || rune == 0){
420 				rcerror("malformed '[]'");
421 				return 0;
422 			}
423 			*(ep-1) = rune;
424 		} else {
425 			*ep++ = rune;
426 			*ep++ = rune;
427 		}
428 		quoted = nextc(&rune);
429 	}
430 
431 	/* sort on span start */
432 	for(p = r; p < ep; p += 2){
433 		for(np = p; np < ep; np += 2)
434 			if(*np < *p){
435 				rune = np[0];
436 				np[0] = p[0];
437 				p[0] = rune;
438 				rune = np[1];
439 				np[1] = p[1];
440 				p[1] = rune;
441 			}
442 	}
443 
444 	/* merge spans */
445 	np = yyclassp->spans;
446 	p = r;
447 	if(r == ep)
448 		yyclassp->end = np;
449 	else {
450 		np[0] = *p++;
451 		np[1] = *p++;
452 		for(; p < ep; p += 2)
453 			if(p[0] <= np[1]){
454 				if(p[1] > np[1])
455 					np[1] = p[1];
456 			} else {
457 				np += 2;
458 				np[0] = p[0];
459 				np[1] = p[1];
460 			}
461 		yyclassp->end = np+2;
462 	}
463 
464 	return type;
465 }
466 
467 static	Reprog*
468 regcomp1(char *s, int literal, int dot_type)
469 {
470 	int token;
471 	Reprog *pp;
472 
473 	/* get memory for the program */
474 	pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
475 	if(pp == 0){
476 		regerror("out of memory");
477 		return 0;
478 	}
479 	freep = pp->firstinst;
480 	classp = pp->class;
481 	errors = 0;
482 
483 	if(setjmp(regkaboom))
484 		goto out;
485 
486 	/* go compile the sucker */
487 	lexdone = 0;
488 	exprp = s;
489 	nclass = 0;
490 	nbra = 0;
491 	atorp = atorstack;
492 	andp = andstack;
493 	subidp = subidstack;
494 	lastwasand = FALSE;
495 	cursubid = 0;
496 
497 	/* Start with a low priority operator to prime parser */
498 	pushator(START-1);
499 	while((token = lex(literal, dot_type)) != END){
500 		if((token&0300) == OPERATOR)
501 			operator(token);
502 		else
503 			operand(token);
504 	}
505 
506 	/* Close with a low priority operator */
507 	evaluntil(START);
508 
509 	/* Force END */
510 	operand(END);
511 	evaluntil(START);
512 #ifdef DEBUG
513 	dumpstack();
514 #endif
515 	if(nbra)
516 		rcerror("unmatched left paren");
517 	--andp;	/* points to first and only operand */
518 	pp->startinst = andp->first;
519 #ifdef DEBUG
520 	dump(pp);
521 #endif
522 	pp = optimize(pp);
523 #ifdef DEBUG
524 	print("start: %d\n", andp->first-pp->firstinst);
525 	dump(pp);
526 #endif
527 out:
528 	if(errors){
529 		free(pp);
530 		pp = 0;
531 	}
532 	return pp;
533 }
534 
535 extern	Reprog*
536 regcomp(char *s)
537 {
538 	return regcomp1(s, 0, ANY);
539 }
540 
541 extern	Reprog*
542 regcomplit(char *s)
543 {
544 	return regcomp1(s, 1, ANY);
545 }
546 
547 extern	Reprog*
548 regcompnl(char *s)
549 {
550 	return regcomp1(s, 0, ANYNL);
551 }
552