xref: /inferno-os/utils/libregexp/regcomp.c (revision 7e00430948d8af545f880e82bb30cd3ee50deb04)
1 #include <lib9.h>
2 #include "regexp.h"
3 #include "regcomp.h"
4 
5 #define	TRUE	1
6 #define	FALSE	0
7 
8 /*
9  * Parser Information
10  */
11 typedef
12 struct Node
13 {
14 	Reinst*	first;
15 	Reinst*	last;
16 }Node;
17 
18 Reprog	RePrOg;
19 
20 #define	NSTACK	20
21 static	Node	andstack[NSTACK];
22 static	Node	*andp;
23 static	int	atorstack[NSTACK];
24 static	int*	atorp;
25 static	int	cursubid;		/* id of current subexpression */
26 static	int	subidstack[NSTACK];	/* parallel to atorstack */
27 static	int*	subidp;
28 static	int	lastwasand;	/* Last token was operand */
29 static	int	nbra;
30 static	char*	exprp;		/* pointer to next character in source expression */
31 static	int	lexdone;
32 static	int	nclass;
33 static	Reclass*classp;
34 static	Reinst*	freep;
35 static	int	errors;
36 static	Rune	yyrune;		/* last lex'd rune */
37 static	Reclass*yyclassp;	/* last lex'd class */
38 
39 /* predeclared crap */
40 static	void	operator(int);
41 static	void	pushand(Reinst*, Reinst*);
42 static	void	pushator(int);
43 static	void	evaluntil(int);
44 static	int	bldcclass(void);
45 
46 static jmp_buf regkaboom;
47 
48 static	void
49 rcerror(char *s)
50 {
51 	errors++;
52 	regerror(s);
53 	longjmp(regkaboom, 1);
54 }
55 
56 static	Reinst*
57 newinst(int t)
58 {
59 	freep->type = t;
60 	freep->u2.left = 0;
61 	freep->u1.right = 0;
62 	return freep++;
63 }
64 
65 static	void
66 operand(int t)
67 {
68 	Reinst *i;
69 
70 	if(lastwasand)
71 		operator(CAT);	/* catenate is implicit */
72 	i = newinst(t);
73 
74 	if(t == CCLASS || t == NCCLASS)
75 		i->u1.cp = yyclassp;
76 	if(t == RUNE)
77 		i->u1.r = yyrune;
78 
79 	pushand(i, i);
80 	lastwasand = TRUE;
81 }
82 
83 static	void
84 operator(int t)
85 {
86 	if(t==RBRA && --nbra<0)
87 		rcerror("unmatched right paren");
88 	if(t==LBRA){
89 		if(++cursubid >= NSUBEXP)
90 			rcerror ("too many subexpressions");
91 		nbra++;
92 		if(lastwasand)
93 			operator(CAT);
94 	} else
95 		evaluntil(t);
96 	if(t != RBRA)
97 		pushator(t);
98 	lastwasand = FALSE;
99 	if(t==STAR || t==QUEST || t==PLUS || t==RBRA)
100 		lastwasand = TRUE;	/* these look like operands */
101 }
102 
103 static	void
104 regerr2(char *s, int c)
105 {
106 	char buf[100];
107 	char *cp = buf;
108 	while(*s)
109 		*cp++ = *s++;
110 	*cp++ = c;
111 	*cp = '\0';
112 	rcerror(buf);
113 }
114 
115 static	void
116 cant(char *s)
117 {
118 	char buf[100];
119 	strcpy(buf, "can't happen: ");
120 	strcat(buf, s);
121 	rcerror(buf);
122 }
123 
124 static	void
125 pushand(Reinst *f, Reinst *l)
126 {
127 	if(andp >= &andstack[NSTACK])
128 		cant("operand stack overflow");
129 	andp->first = f;
130 	andp->last = l;
131 	andp++;
132 }
133 
134 static	void
135 pushator(int t)
136 {
137 	if(atorp >= &atorstack[NSTACK])
138 		cant("operator stack overflow");
139 	*atorp++ = t;
140 	*subidp++ = cursubid;
141 }
142 
143 static	Node*
144 popand(int op)
145 {
146 	Reinst *inst;
147 
148 	if(andp <= &andstack[0]){
149 		regerr2("missing operand for ", op);
150 		inst = newinst(NOP);
151 		pushand(inst,inst);
152 	}
153 	return --andp;
154 }
155 
156 static	int
157 popator(void)
158 {
159 	if(atorp <= &atorstack[0])
160 		cant("operator stack underflow");
161 	--subidp;
162 	return *--atorp;
163 }
164 
165 static	void
166 evaluntil(int pri)
167 {
168 	Node *op1, *op2;
169 	Reinst *inst1, *inst2;
170 
171 	while(pri==RBRA || atorp[-1]>=pri){
172 		switch(popator()){
173 		default:
174 			rcerror("unknown operator in evaluntil");
175 			break;
176 		case LBRA:		/* must have been RBRA */
177 			op1 = popand('(');
178 			inst2 = newinst(RBRA);
179 			inst2->u1.subid = *subidp;
180 			op1->last->u2.next = inst2;
181 			inst1 = newinst(LBRA);
182 			inst1->u1.subid = *subidp;
183 			inst1->u2.next = op1->first;
184 			pushand(inst1, inst2);
185 			return;
186 		case OR:
187 			op2 = popand('|');
188 			op1 = popand('|');
189 			inst2 = newinst(NOP);
190 			op2->last->u2.next = inst2;
191 			op1->last->u2.next = inst2;
192 			inst1 = newinst(OR);
193 			inst1->u1.right = op1->first;
194 			inst1->u2.left = op2->first;
195 			pushand(inst1, inst2);
196 			break;
197 		case CAT:
198 			op2 = popand(0);
199 			op1 = popand(0);
200 			op1->last->u2.next = op2->first;
201 			pushand(op1->first, op2->last);
202 			break;
203 		case STAR:
204 			op2 = popand('*');
205 			inst1 = newinst(OR);
206 			op2->last->u2.next = inst1;
207 			inst1->u1.right = op2->first;
208 			pushand(inst1, inst1);
209 			break;
210 		case PLUS:
211 			op2 = popand('+');
212 			inst1 = newinst(OR);
213 			op2->last->u2.next = inst1;
214 			inst1->u1.right = op2->first;
215 			pushand(op2->first, inst1);
216 			break;
217 		case QUEST:
218 			op2 = popand('?');
219 			inst1 = newinst(OR);
220 			inst2 = newinst(NOP);
221 			inst1->u2.left = inst2;
222 			inst1->u1.right = op2->first;
223 			op2->last->u2.next = inst2;
224 			pushand(inst1, inst2);
225 			break;
226 		}
227 	}
228 }
229 
230 static	Reprog*
231 optimize(Reprog *pp)
232 {
233 	Reinst *inst, *target;
234 	int size;
235 	Reprog *npp;
236 	Reclass *cl;
237 	int diff;
238 
239 	/*
240 	 *  get rid of NOOP chains
241 	 */
242 	for(inst=pp->firstinst; inst->type!=END; inst++){
243 		target = inst->u2.next;
244 		while(target->type == NOP)
245 			target = target->u2.next;
246 		inst->u2.next = target;
247 	}
248 
249 	/*
250 	 *  The original allocation is for an area larger than
251 	 *  necessary.  Reallocate to the actual space used
252 	 *  and then relocate the code.
253 	 */
254 	size = sizeof(Reprog) + (freep - pp->firstinst)*sizeof(Reinst);
255 	npp = (Reprog *)realloc(pp, size);
256 	if(npp==0 || npp==pp)
257 		return pp;
258 	diff = (char *)npp - (char *)pp;
259 	freep = (Reinst *)((char *)freep + diff);
260 	for(inst=npp->firstinst; inst<freep; inst++){
261 		switch(inst->type){
262 		case OR:
263 		case STAR:
264 		case PLUS:
265 		case QUEST:
266 			*(char **)&inst->u1.right += diff;
267 			break;
268 		case CCLASS:
269 		case NCCLASS:
270 			*(char **)&inst->u1.right += diff;
271 			cl = inst->u1.cp;
272 			*(char **)&cl->end += diff;
273 			break;
274 		}
275 		*(char **)&inst->u2.left += diff;
276 	}
277 	*(char **)&npp->startinst += diff;
278 	return npp;
279 }
280 
281 #ifdef	DEBUG
282 static	void
283 dumpstack(void){
284 	Node *stk;
285 	int *ip;
286 
287 	print("operators\n");
288 	for(ip=atorstack; ip<atorp; ip++)
289 		print("0%o\n", *ip);
290 	print("operands\n");
291 	for(stk=andstack; stk<andp; stk++)
292 		print("0%o\t0%o\n", stk->first->type, stk->last->type);
293 }
294 
295 static	void
296 dump(Reprog *pp)
297 {
298 	Reinst *l;
299 	Rune *p;
300 
301 	l = pp->firstinst;
302 	do{
303 		print("%d:\t0%o\t%d\t%d", l-pp->firstinst, l->type,
304 			l->u2.left-pp->firstinst, l->u1.right-pp->firstinst);
305 		if(l->type == RUNE)
306 			print("\t%C\n", l->r);
307 		else if(l->type == CCLASS || l->type == NCCLASS){
308 			print("\t[");
309 			if(l->type == NCCLASS)
310 				print("^");
311 			for(p = l->cp->spans; p < l->cp->end; p += 2)
312 				if(p[0] == p[1])
313 					print("%C", p[0]);
314 				else
315 					print("%C-%C", p[0], p[1]);
316 			print("]\n");
317 		} else
318 			print("\n");
319 	}while(l++->type);
320 }
321 #endif
322 
323 static	Reclass*
324 newclass(void)
325 {
326 	if(nclass >= NCLASS)
327 		regerr2("too many character classes; limit", NCLASS+'0');
328 	return &(classp[nclass++]);
329 }
330 
331 static	int
332 nextc(Rune *rp)
333 {
334 	if(lexdone){
335 		*rp = 0;
336 		return 1;
337 	}
338 	exprp += chartorune(rp, exprp);
339 	if(*rp == L'\\'){
340 		exprp += chartorune(rp, exprp);
341 		return 1;
342 	}
343 	if(*rp == 0)
344 		lexdone = 1;
345 	return 0;
346 }
347 
348 static	int
349 lex(int literal, int dot_type)
350 {
351 	int quoted;
352 
353 	quoted = nextc(&yyrune);
354 	if(literal || quoted){
355 		if(yyrune == 0)
356 			return END;
357 		return RUNE;
358 	}
359 
360 	switch(yyrune){
361 	case 0:
362 		return END;
363 	case L'*':
364 		return STAR;
365 	case L'?':
366 		return QUEST;
367 	case L'+':
368 		return PLUS;
369 	case L'|':
370 		return OR;
371 	case L'.':
372 		return dot_type;
373 	case L'(':
374 		return LBRA;
375 	case L')':
376 		return RBRA;
377 	case L'^':
378 		return BOL;
379 	case L'$':
380 		return EOL;
381 	case L'[':
382 		return bldcclass();
383 	}
384 	return RUNE;
385 }
386 
387 static int
388 bldcclass(void)
389 {
390 	int type;
391 	Rune r[NCCRUNE];
392 	Rune *p, *ep, *np;
393 	Rune rune;
394 	int quoted;
395 
396 	/* we have already seen the '[' */
397 	type = CCLASS;
398 	yyclassp = newclass();
399 
400 	/* look ahead for negation */
401 	/* SPECIAL CASE!!! negated classes don't match \n */
402 	ep = r;
403 	quoted = nextc(&rune);
404 	if(!quoted && rune == L'^'){
405 		type = NCCLASS;
406 		quoted = nextc(&rune);
407 		*ep++ = L'\n';
408 		*ep++ = L'\n';
409 	}
410 
411 	/* parse class into a set of spans */
412 	for(; ep<&r[NCCRUNE];){
413 		if(rune == 0){
414 			rcerror("malformed '[]'");
415 			return 0;
416 		}
417 		if(!quoted && rune == L']')
418 			break;
419 		if(!quoted && rune == L'-'){
420 			if(ep == r){
421 				rcerror("malformed '[]'");
422 				return 0;
423 			}
424 			quoted = nextc(&rune);
425 			if((!quoted && rune == L']') || rune == 0){
426 				rcerror("malformed '[]'");
427 				return 0;
428 			}
429 			*(ep-1) = rune;
430 		} else {
431 			*ep++ = rune;
432 			*ep++ = rune;
433 		}
434 		quoted = nextc(&rune);
435 	}
436 
437 	/* sort on span start */
438 	for(p = r; p < ep; p += 2){
439 		for(np = p; np < ep; np += 2)
440 			if(*np < *p){
441 				rune = np[0];
442 				np[0] = p[0];
443 				p[0] = rune;
444 				rune = np[1];
445 				np[1] = p[1];
446 				p[1] = rune;
447 			}
448 	}
449 
450 	/* merge spans */
451 	np = yyclassp->spans;
452 	p = r;
453 	if(r == ep)
454 		yyclassp->end = np;
455 	else {
456 		np[0] = *p++;
457 		np[1] = *p++;
458 		for(; p < ep; p += 2)
459 			if(p[0] <= np[1]){
460 				if(p[1] > np[1])
461 					np[1] = p[1];
462 			} else {
463 				np += 2;
464 				np[0] = p[0];
465 				np[1] = p[1];
466 			}
467 		yyclassp->end = np+2;
468 	}
469 
470 	return type;
471 }
472 
473 static	Reprog*
474 regcomp1(char *s, int literal, int dot_type)
475 {
476 	int token;
477 	Reprog *pp;
478 
479 	/* get memory for the program */
480 	pp = (Reprog *)malloc(sizeof(Reprog) + 6*sizeof(Reinst)*strlen(s));
481 	if(pp == 0){
482 		regerror("out of memory");
483 		return 0;
484 	}
485 	freep = pp->firstinst;
486 	classp = pp->class;
487 	errors = 0;
488 
489 	if(setjmp(regkaboom))
490 		goto out;
491 
492 	/* go compile the sucker */
493 	lexdone = 0;
494 	exprp = s;
495 	nclass = 0;
496 	nbra = 0;
497 	atorp = atorstack;
498 	andp = andstack;
499 	subidp = subidstack;
500 	lastwasand = FALSE;
501 	cursubid = 0;
502 
503 	/* Start with a low priority operator to prime parser */
504 	pushator(START-1);
505 	while((token = lex(literal, dot_type)) != END){
506 		if((token&0300) == OPERATOR)
507 			operator(token);
508 		else
509 			operand(token);
510 	}
511 
512 	/* Close with a low priority operator */
513 	evaluntil(START);
514 
515 	/* Force END */
516 	operand(END);
517 	evaluntil(START);
518 #ifdef DEBUG
519 	dumpstack();
520 #endif
521 	if(nbra)
522 		rcerror("unmatched left paren");
523 	--andp;	/* points to first and only operand */
524 	pp->startinst = andp->first;
525 #ifdef DEBUG
526 	dump(pp);
527 #endif
528 	pp = optimize(pp);
529 #ifdef DEBUG
530 	print("start: %d\n", andp->first-pp->firstinst);
531 	dump(pp);
532 #endif
533 out:
534 	if(errors){
535 		free(pp);
536 		pp = 0;
537 	}
538 	return pp;
539 }
540 
541 extern	Reprog*
542 regcomp(char *s)
543 {
544 	return regcomp1(s, 0, ANY);
545 }
546 
547 extern	Reprog*
548 regcomplit(char *s)
549 {
550 	return regcomp1(s, 1, ANY);
551 }
552 
553 extern	Reprog*
554 regcompnl(char *s)
555 {
556 	return regcomp1(s, 0, ANYNL);
557 }
558