xref: /plan9/sys/src/cmd/upas/bayes/regcomp.c (revision 4246b6162acdbb658503b8bdc98024362bfbf0fe)
1 /* From libregexp but leaks extra classes when it runs out */
2 
3 
4 #include <u.h>
5 #include <libc.h>
6 #include "regexp.h"
7 #include "/sys/src/libregexp/regcomp.h"
8 
9 #define	TRUE	1
10 #define	FALSE	0
11 
12 /*
13  * Parser Information
14  */
15 typedef
16 struct Node
17 {
18 	Reinst*	first;
19 	Reinst*	last;
20 }Node;
21 
22 #define	NSTACK	20
23 static	Node	andstack[NSTACK];
24 static	Node	*andp;
25 static	int	atorstack[NSTACK];
26 static	int*	atorp;
27 static	int	cursubid;		/* id of current subexpression */
28 static	int	subidstack[NSTACK];	/* parallel to atorstack */
29 static	int*	subidp;
30 static	int	lastwasand;	/* Last token was operand */
31 static	int	nbra;
32 static	char*	exprp;		/* pointer to next character in source expression */
33 static	int	lexdone;
34 static	int	nclass;
35 static	Reclass*classp;
36 static	Reinst*	freep;
37 static	int	errors;
38 static	Rune	yyrune;		/* last lex'd rune */
39 static	Reclass*yyclassp;	/* last lex'd class */
40 
41 /* predeclared crap */
42 static	void	operator(int);
43 static	void	pushand(Reinst*, Reinst*);
44 static	void	pushator(int);
45 static	void	evaluntil(int);
46 static	int	bldcclass(void);
47 
48 static jmp_buf regkaboom;
49 
50 static	void
rcerror(char * s)51 rcerror(char *s)
52 {
53 	errors++;
54 	regerror(s);
55 	longjmp(regkaboom, 1);
56 }
57 
58 static	Reinst*
newinst(int t)59 newinst(int t)
60 {
61 	freep->type = t;
62 	freep->left = 0;
63 	freep->right = 0;
64 	return freep++;
65 }
66 
67 static	void
operand(int t)68 operand(int t)
69 {
70 	Reinst *i;
71 
72 	if(lastwasand)
73 		operator(CAT);	/* catenate is implicit */
74 	i = newinst(t);
75 
76 	if(t == CCLASS || t == NCCLASS)
77 		i->cp = yyclassp;
78 	if(t == RUNE)
79 		i->r = yyrune;
80 
81 	pushand(i, i);
82 	lastwasand = TRUE;
83 }
84 
85 static	void
operator(int t)86 operator(int t)
87 {
88 	if(t==RBRA && --nbra<0)
89 		rcerror("unmatched right paren");
90 	if(t==LBRA){
91 		if(++cursubid >= NSUBEXP)
92 			rcerror ("too many subexpressions");
93 		nbra++;
94 		if(lastwasand)
95 			operator(CAT);
96 	} else
97 		evaluntil(t);
98 	if(t != RBRA)
99 		pushator(t);
100 	lastwasand = FALSE;
101 	if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
102 		lastwasand = TRUE;	/* these look like operands */
103 }
104 
105 static	void
regerr2(char * s,int c)106 regerr2(char *s, int c)
107 {
108 	char buf[100];
109 	char *cp = buf;
110 	while(*s)
111 		*cp++ = *s++;
112 	*cp++ = c;
113 	*cp = '\0';
114 	rcerror(buf);
115 }
116 
117 static	void
cant(char * s)118 cant(char *s)
119 {
120 	char buf[100];
121 	strcpy(buf, "can't happen: ");
122 	strcat(buf, s);
123 	rcerror(buf);
124 }
125 
126 static	void
pushand(Reinst * f,Reinst * l)127 pushand(Reinst *f, Reinst *l)
128 {
129 	if(andp >= &andstack[NSTACK])
130 		cant("operand stack overflow");
131 	andp->first = f;
132 	andp->last = l;
133 	andp++;
134 }
135 
136 static	void
pushator(int t)137 pushator(int t)
138 {
139 	if(atorp >= &atorstack[NSTACK])
140 		cant("operator stack overflow");
141 	*atorp++ = t;
142 	*subidp++ = cursubid;
143 }
144 
145 static	Node*
popand(int op)146 popand(int op)
147 {
148 	Reinst *inst;
149 
150 	if(andp <= &andstack[0]){
151 		regerr2("missing operand for ", op);
152 		inst = newinst(NOP);
153 		pushand(inst,inst);
154 	}
155 	return --andp;
156 }
157 
158 static	int
popator(void)159 popator(void)
160 {
161 	if(atorp <= &atorstack[0])
162 		cant("operator stack underflow");
163 	--subidp;
164 	return *--atorp;
165 }
166 
167 static	void
evaluntil(int pri)168 evaluntil(int pri)
169 {
170 	Node *op1, *op2;
171 	Reinst *inst1, *inst2;
172 
173 	while(pri==RBRA || atorp[-1]>=pri){
174 		switch(popator()){
175 		default:
176 			rcerror("unknown operator in evaluntil");
177 			break;
178 		case LBRA:		/* must have been RBRA */
179 			op1 = popand('(');
180 			inst2 = newinst(RBRA);
181 			inst2->subid = *subidp;
182 			op1->last->next = inst2;
183 			inst1 = newinst(LBRA);
184 			inst1->subid = *subidp;
185 			inst1->next = op1->first;
186 			pushand(inst1, inst2);
187 			return;
188 		case OR:
189 			op2 = popand('|');
190 			op1 = popand('|');
191 			inst2 = newinst(NOP);
192 			op2->last->next = inst2;
193 			op1->last->next = inst2;
194 			inst1 = newinst(OR);
195 			inst1->right = op1->first;
196 			inst1->left = op2->first;
197 			pushand(inst1, inst2);
198 			break;
199 		case CAT:
200 			op2 = popand(0);
201 			op1 = popand(0);
202 			op1->last->next = op2->first;
203 			pushand(op1->first, op2->last);
204 			break;
205 		case STAR:
206 			op2 = popand('*');
207 			inst1 = newinst(OR);
208 			op2->last->next = inst1;
209 			inst1->right = op2->first;
210 			pushand(inst1, inst1);
211 			break;
212 		case PLUS:
213 			op2 = popand('+');
214 			inst1 = newinst(OR);
215 			op2->last->next = inst1;
216 			inst1->right = op2->first;
217 			pushand(op2->first, inst1);
218 			break;
219 		case QUEST:
220 			op2 = popand('?');
221 			inst1 = newinst(OR);
222 			inst2 = newinst(NOP);
223 			inst1->left = inst2;
224 			inst1->right = op2->first;
225 			op2->last->next = inst2;
226 			pushand(inst1, inst2);
227 			break;
228 		}
229 	}
230 }
231 
232 static	Reprog*
optimize(Reprog * pp)233 optimize(Reprog *pp)
234 {
235 	Reinst *inst, *target;
236 	int size;
237 	Reprog *npp;
238 	Reclass *cl;
239 	int diff;
240 
241 	/*
242 	 *  get rid of NOOP chains
243 	 */
244 	for(inst=pp->firstinst; inst->type!=END; inst++){
245 		target = inst->next;
246 		while(target->type == NOP)
247 			target = target->next;
248 		inst->next = target;
249 	}
250 
251 	/*
252 	 *  The original allocation is for an area larger than
253 	 *  necessary.  Reallocate to the actual space used
254 	 *  and then relocate the code.
255 	 */
256 	size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
257 	npp = realloc(pp, size);
258 	if(npp==0 || npp==pp)
259 		return pp;
260 	diff = (char *)npp - (char *)pp;
261 	freep = (Reinst *)((char *)freep + diff);
262 	for(inst=npp->firstinst; inst<freep; inst++){
263 		switch(inst->type){
264 		case OR:
265 		case STAR:
266 		case PLUS:
267 		case QUEST:
268 			*(char **)&inst->right += diff;
269 			break;
270 		case CCLASS:
271 		case NCCLASS:
272 			*(char **)&inst->right += diff;
273 			cl = inst->cp;
274 			*(char **)&cl->end += diff;
275 			break;
276 		}
277 		*(char **)&inst->left += diff;
278 	}
279 	*(char **)&npp->startinst += diff;
280 	return npp;
281 }
282 
283 #ifdef	DEBUG
284 static	void
dumpstack(void)285 dumpstack(void){
286 	Node *stk;
287 	int *ip;
288 
289 	print("operators\n");
290 	for(ip=atorstack; ip<atorp; ip++)
291 		print("0%o\n", *ip);
292 	print("operands\n");
293 	for(stk=andstack; stk<andp; stk++)
294 		print("0%o\t0%o\n", stk->first->type, stk->last->type);
295 }
296 
297 static	void
dump(Reprog * pp)298 dump(Reprog *pp)
299 {
300 	Reinst *l;
301 	Rune *p;
302 
303 	l = pp->firstinst;
304 	do{
305 		print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
306 			l->left-pp->firstinst, l->right-pp->firstinst);
307 		if(l->type == RUNE)
308 			print("\t%C\n", l->r);
309 		else if(l->type == CCLASS || l->type == NCCLASS){
310 			print("\t[");
311 			if(l->type == NCCLASS)
312 				print("^");
313 			for(p = l->cp->spans; p < l->cp->end; p += 2)
314 				if(p[0] == p[1])
315 					print("%C", p[0]);
316 				else
317 					print("%C-%C", p[0], p[1]);
318 			print("]\n");
319 		} else
320 			print("\n");
321 	}while(l++->type);
322 }
323 #endif
324 
325 static	Reclass*
newclass(void)326 newclass(void)
327 {
328 	if(nclass <= 0){
329 		classp = mallocz(128*sizeof(Reclass), 1);
330 		if(classp == nil)
331 			regerror("out of memory");
332 		nclass = 128;
333 	}
334 	return &classp[--nclass];
335 }
336 
337 static	int
nextc(Rune * rp)338 nextc(Rune *rp)
339 {
340 	if(lexdone){
341 		*rp = 0;
342 		return 1;
343 	}
344 	exprp += chartorune(rp, exprp);
345 	if(*rp == L'\\'){
346 		exprp += chartorune(rp, exprp);
347 		return 1;
348 	}
349 	if(*rp == 0)
350 		lexdone = 1;
351 	return 0;
352 }
353 
354 static	int
lex(int literal,int dot_type)355 lex(int literal, int dot_type)
356 {
357 	int quoted;
358 
359 	quoted = nextc(&yyrune);
360 	if(literal || quoted){
361 		if(yyrune == 0)
362 			return END;
363 		return RUNE;
364 	}
365 
366 	switch(yyrune){
367 	case 0:
368 		return END;
369 	case L'*':
370 		return STAR;
371 	case L'?':
372 		return QUEST;
373 	case L'+':
374 		return PLUS;
375 	case L'|':
376 		return OR;
377 	case L'.':
378 		return dot_type;
379 	case L'(':
380 		return LBRA;
381 	case L')':
382 		return RBRA;
383 	case L'^':
384 		return BOL;
385 	case L'$':
386 		return EOL;
387 	case L'[':
388 		return bldcclass();
389 	}
390 	return RUNE;
391 }
392 
393 static int
bldcclass(void)394 bldcclass(void)
395 {
396 	int type;
397 	Rune r[NCCRUNE];
398 	Rune *p, *ep, *np;
399 	Rune rune;
400 	int quoted;
401 
402 	/* we have already seen the '[' */
403 	type = CCLASS;
404 	yyclassp = newclass();
405 
406 	/* look ahead for negation */
407 	/* SPECIAL CASE!!! negated classes don't match \n */
408 	ep = r;
409 	quoted = nextc(&rune);
410 	if(!quoted && rune == L'^'){
411 		type = NCCLASS;
412 		quoted = nextc(&rune);
413 		*ep++ = L'\n';
414 		*ep++ = L'\n';
415 	}
416 
417 	/* parse class into a set of spans */
418 	for(; ep<&r[NCCRUNE];){
419 		if(rune == 0){
420 			rcerror("malformed '[]'");
421 			return 0;
422 		}
423 		if(!quoted && rune == L']')
424 			break;
425 		if(!quoted && rune == L'-'){
426 			if(ep == r){
427 				rcerror("malformed '[]'");
428 				return 0;
429 			}
430 			quoted = nextc(&rune);
431 			if((!quoted && rune == L']') || rune == 0){
432 				rcerror("malformed '[]'");
433 				return 0;
434 			}
435 			*(ep-1) = rune;
436 		} else {
437 			*ep++ = rune;
438 			*ep++ = rune;
439 		}
440 		quoted = nextc(&rune);
441 	}
442 
443 	/* sort on span start */
444 	for(p = r; p < ep; p += 2){
445 		for(np = p; np < ep; np += 2)
446 			if(*np < *p){
447 				rune = np[0];
448 				np[0] = p[0];
449 				p[0] = rune;
450 				rune = np[1];
451 				np[1] = p[1];
452 				p[1] = rune;
453 			}
454 	}
455 
456 	/* merge spans */
457 	np = yyclassp->spans;
458 	p = r;
459 	if(r == ep)
460 		yyclassp->end = np;
461 	else {
462 		np[0] = *p++;
463 		np[1] = *p++;
464 		for(; p < ep; p += 2)
465 			if(p[0] <= np[1]){
466 				if(p[1] > np[1])
467 					np[1] = p[1];
468 			} else {
469 				np += 2;
470 				np[0] = p[0];
471 				np[1] = p[1];
472 			}
473 		yyclassp->end = np+2;
474 	}
475 
476 	return type;
477 }
478 
479 static	Reprog*
regcomp1(char * s,int literal,int dot_type)480 regcomp1(char *s, int literal, int dot_type)
481 {
482 	int token;
483 	Reprog *pp;
484 
485 	/* get memory for the program */
486 	pp = malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
487 	if(pp == 0){
488 		regerror("out of memory");
489 		return 0;
490 	}
491 	freep = pp->firstinst;
492 	classp = pp->class;
493 	errors = 0;
494 
495 	if(setjmp(regkaboom))
496 		goto out;
497 
498 	/* go compile the sucker */
499 	lexdone = 0;
500 	exprp = s;
501 	nclass = NCLASS;
502 	nbra = 0;
503 	atorp = atorstack;
504 	andp = andstack;
505 	subidp = subidstack;
506 	lastwasand = FALSE;
507 	cursubid = 0;
508 
509 	/* Start with a low priority operator to prime parser */
510 	pushator(START-1);
511 	while((token = lex(literal, dot_type)) != END){
512 		if((token&0300) == OPERATOR)
513 			operator(token);
514 		else
515 			operand(token);
516 	}
517 
518 	/* Close with a low priority operator */
519 	evaluntil(START);
520 
521 	/* Force END */
522 	operand(END);
523 	evaluntil(START);
524 #ifdef DEBUG
525 	dumpstack();
526 #endif
527 	if(nbra)
528 		rcerror("unmatched left paren");
529 	--andp;	/* points to first and only operand */
530 	pp->startinst = andp->first;
531 #ifdef DEBUG
532 	dump(pp);
533 #endif
534 	pp = optimize(pp);
535 #ifdef DEBUG
536 	print("start: %d\n", andp->first-pp->firstinst);
537 	dump(pp);
538 #endif
539 out:
540 	if(errors){
541 		free(pp);
542 		pp = 0;
543 	}
544 	return pp;
545 }
546 
547 extern	Reprog*
regcomp(char * s)548 regcomp(char *s)
549 {
550 	return regcomp1(s, 0, ANY);
551 }
552 
553 extern	Reprog*
regcomplit(char * s)554 regcomplit(char *s)
555 {
556 	return regcomp1(s, 1, ANY);
557 }
558 
559 extern	Reprog*
regcompnl(char * s)560 regcompnl(char *s)
561 {
562 	return regcomp1(s, 0, ANYNL);
563 }
564