xref: /minix3/external/bsd/nvi/dist/regex/regcomp.c (revision b5e2faaaaf60a8b9a02f8d72f64caa56a87eb312)
1 /*	$NetBSD: regcomp.c,v 1.2 2013/11/22 15:52:06 christos Exp $ */
2 /*-
3  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
4  * Copyright (c) 1992, 1993, 1994
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Henry Spencer of the University of Toronto.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. All advertising materials mentioning features or use of this software
19  *    must display the following acknowledgement:
20  *	This product includes software developed by the University of
21  *	California, Berkeley and its contributors.
22  * 4. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)regcomp.c	8.4 (Berkeley) 3/19/94
39  */
40 
41 #if defined(LIBC_SCCS) && !defined(lint)
42 static char sccsid[] = "@(#)regcomp.c	8.4 (Berkeley) 3/19/94";
43 #endif /* LIBC_SCCS and not lint */
44 
45 #include <sys/types.h>
46 #include <stdio.h>
47 #include <string.h>
48 #include <ctype.h>
49 #include <limits.h>
50 #include <stdlib.h>
51 #include <regex.h>
52 
53 #include "utils.h"
54 #include "regex2.h"
55 
56 #include "cclass.h"
57 #include "cname.h"
58 
59 /*
60  * parse structure, passed up and down to avoid global variables and
61  * other clumsinesses
62  */
63 struct parse {
64 	RCHAR_T *next;		/* next character in RE */
65 	RCHAR_T *end;		/* end of string (-> NUL normally) */
66 	int error;		/* has an error been seen? */
67 	sop *strip;		/* malloced strip */
68 	RCHAR_T *stripdata;	/* malloced stripdata */
69 	sopno ssize;		/* malloced strip size (allocated) */
70 	sopno slen;		/* malloced strip length (used) */
71 	int ncsalloc;		/* number of csets allocated */
72 	struct re_guts *g;
73 #	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
74 	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
75 	sopno pend[NPAREN];	/* -> ) ([0] unused) */
76 };
77 
78 /* ========= begin header generated by ./mkh ========= */
79 #ifdef __cplusplus
80 extern "C" {
81 #endif
82 
83 /* === regcomp.c === */
84 static void p_ere __P((struct parse *p, int stop, size_t reclimit));
85 static void p_ere_exp __P((struct parse *p, size_t reclimit));
86 static void p_str __P((struct parse *p));
87 static void p_bre __P((struct parse *p, int end1, int end2, size_t reclimit));
88 static int p_simp_re __P((struct parse *p, int starordinary, size_t reclimit));
89 static int p_count __P((struct parse *p));
90 static void p_bracket __P((struct parse *p));
91 static void p_b_term __P((struct parse *p, cset *cs));
92 static void p_b_cclass __P((struct parse *p, cset *cs));
93 static void p_b_eclass __P((struct parse *p, cset *cs));
94 static char p_b_symbol __P((struct parse *p));
95 static char p_b_coll_elem __P((struct parse *p, int endc));
96 static char othercase __P((int ch));
97 static void bothcases __P((struct parse *p, int ch));
98 static void ordinary __P((struct parse *p, int ch));
99 static void nonnewline __P((struct parse *p));
100 static void repeat __P((struct parse *p, sopno start, int from, int to, size_t reclimit));
101 static int seterr __P((struct parse *p, int e));
102 static cset *allocset __P((struct parse *p));
103 static void freeset __P((struct parse *p, cset *cs));
104 static int freezeset __P((struct parse *p, cset *cs));
105 static int firstch __P((struct parse *p, cset *cs));
106 static int nch __P((struct parse *p, cset *cs));
107 static void mcadd __P((struct parse *p, cset *cs, const char *cp));
108 #ifdef notdef
109 static void mcsub __P((cset *cs, char *cp));
110 static int mcin __P((cset *cs, char *cp));
111 static char *mcfind __P((cset *cs, char *cp));
112 #endif
113 static void mcinvert __P((struct parse *p, cset *cs));
114 static void mccase __P((struct parse *p, cset *cs));
115 #ifdef notdef
116 static int isinsets __P((struct re_guts *g, int c));
117 static int samesets __P((struct re_guts *g, int c1, int c2));
118 #endif
119 static void categorize __P((struct parse *p, struct re_guts *g));
120 static sopno dupl __P((struct parse *p, sopno start, sopno finish));
121 static void doemit __P((struct parse *p, sop op, size_t opnd));
122 static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos));
123 static void dofwd __P((struct parse *p, sopno pos, sop value));
124 static int enlarge __P((struct parse *p, sopno size));
125 static void stripsnug __P((struct parse *p, struct re_guts *g));
126 static void findmust __P((struct parse *p, struct re_guts *g));
127 static sopno pluscount __P((struct parse *p, struct re_guts *g));
128 
129 #ifdef __cplusplus
130 }
131 #endif
132 /* ========= end header generated by ./mkh ========= */
133 
134 static RCHAR_T nuls[10];		/* place to point scanner in event of error */
135 
136 /*
137  * macros for use with parse structure
138  * BEWARE:  these know that the parse structure is named `p' !!!
139  */
140 #define	PEEK()	(*p->next)
141 #define	PEEK2()	(*(p->next+1))
142 #define	MORE()	(p->next < p->end)
143 #define	MORE2()	(p->next+1 < p->end)
144 #define	SEE(c)	(MORE() && PEEK() == (c))
145 #define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
146 #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
147 #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
148 #define	NEXT()	(p->next++)
149 #define	NEXT2()	(p->next += 2)
150 #define	NEXTn(n)	(p->next += (n))
151 #define	GETNEXT()	(*p->next++)
152 #define	SETERROR(e)	seterr(p, (e))
153 #define	REQUIRE(co, e)	((co) || SETERROR(e))
154 #define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
155 #define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
156 #define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
157 #define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
158 #define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
159 #define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
160 #define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
161 #define	HERE()		(p->slen)
162 #define	THERE()		(p->slen - 1)
163 #define	THERETHERE()	(p->slen - 2)
164 #define	DROP(n)	(p->slen -= (n))
165 
166 #ifndef NDEBUG
167 static int never = 0;		/* for use in asserts; shuts lint up */
168 #else
169 #define	never	0		/* some <assert.h>s have bugs too */
170 #endif
171 
172 #define	MEMLIMIT	0x8000000
173 #define MEMSIZE(p) \
174 	((p)->ncsalloc / CHAR_BIT * (p)->g->csetsize + \
175 	(p)->ncsalloc * sizeof(cset) + \
176 	(p)->ssize * sizeof(sop))
177 #define	RECLIMIT	256
178 
179 /*
180  - regcomp - interface for parser and compilation
181  = extern int regcomp(regex_t *, const RCHAR_T *, int);
182  = #define	REG_BASIC	0000
183  = #define	REG_EXTENDED	0001
184  = #define	REG_ICASE	0002
185  = #define	REG_NOSUB	0004
186  = #define	REG_NEWLINE	0010
187  = #define	REG_NOSPEC	0020
188  = #define	REG_PEND	0040
189  = #define	REG_DUMP	0200
190  */
191 int				/* 0 success, otherwise REG_something */
192 regcomp(regex_t *preg, const RCHAR_T *pattern, int cflags)
193 {
194 	struct parse pa;
195 	register struct re_guts *g;
196 	register struct parse *p = &pa;
197 	register int i;
198 	register size_t len;
199 #ifdef REDEBUG
200 #	define	GOODFLAGS(f)	(f)
201 #else
202 #	define	GOODFLAGS(f)	((f)&~REG_DUMP)
203 #endif
204 
205 	cflags = GOODFLAGS(cflags);
206 	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
207 		return(REG_INVARG);
208 
209 	if (cflags&REG_PEND) {
210 		if (preg->re_endp < pattern)
211 			return(REG_INVARG);
212 		len = preg->re_endp - pattern;
213 	} else
214 		len = STRLEN(pattern);
215 
216 	/* do the mallocs early so failure handling is easy */
217 	g = (struct re_guts *)malloc(sizeof(struct re_guts) +
218 							(NC-1)*sizeof(cat_t));
219 	if (g == NULL)
220 		return(REG_ESPACE);
221 	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
222 	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
223 	if (p->strip == NULL) {
224 		free((char *)g);
225 		return(REG_ESPACE);
226 	}
227 	p->stripdata = (RCHAR_T *)malloc(p->ssize * sizeof(RCHAR_T));
228 	if (p->stripdata == NULL) {
229 		free((char *)p->strip);
230 		free((char *)g);
231 		return(REG_ESPACE);
232 	}
233 	p->slen = 0;
234 
235 	/* set things up */
236 	p->g = g;
237 	p->next = (RCHAR_T *)__UNCONST(pattern);	/* convenience; we do not modify it */
238 	p->end = p->next + len;
239 	p->error = 0;
240 	p->ncsalloc = 0;
241 	for (i = 0; i < NPAREN; i++) {
242 		p->pbegin[i] = 0;
243 		p->pend[i] = 0;
244 	}
245 	g->csetsize = NC;
246 	g->sets = NULL;
247 	g->setbits = NULL;
248 	g->ncsets = 0;
249 	g->cflags = cflags;
250 	g->iflags = 0;
251 	g->nbol = 0;
252 	g->neol = 0;
253 	g->must = NULL;
254 	g->mlen = 0;
255 	g->nsub = 0;
256 #if 0
257 	g->ncategories = 1;	/* category 0 is "everything else" */
258 	g->categories = &g->catspace[-(CHAR_MIN)];
259 	(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
260 #endif
261 	g->backrefs = 0;
262 
263 	/* do it */
264 	EMIT(OEND, 0);
265 	g->firststate = THERE();
266 	if (cflags&REG_EXTENDED)
267 		p_ere(p, OUT, 0);
268 	else if (cflags&REG_NOSPEC)
269 		p_str(p);
270 	else
271 		p_bre(p, OUT, OUT, 0);
272 	EMIT(OEND, 0);
273 	g->laststate = THERE();
274 
275 	/* tidy up loose ends and fill things in */
276 	categorize(p, g);
277 	stripsnug(p, g);
278 	findmust(p, g);
279 	g->nplus = pluscount(p, g);
280 	g->magic = MAGIC2;
281 	preg->re_nsub = g->nsub;
282 	preg->re_g = g;
283 	preg->re_magic = MAGIC1;
284 #ifndef REDEBUG
285 	/* not debugging, so can't rely on the assert() in regexec() */
286 	if (g->iflags&BAD)
287 		SETERROR(REG_ASSERT);
288 #endif
289 
290 	/* win or lose, we're done */
291 	if (p->error != 0)	/* lose */
292 		regfree(preg);
293 	return(p->error);
294 }
295 
296 /*
297  - p_ere - ERE parser top level, concatenation and alternation
298  == static void p_ere(register struct parse *p, int stop, size_t reclimit);
299  */
300 static void
301 p_ere(register struct parse *p, int stop, size_t reclimit)
302 
303          			/* character this ERE should end at */
304 {
305 	register char c;
306 	register sopno prevback = 0;
307 	register sopno prevfwd = 0;
308 	register sopno conc;
309 	register int first = 1;		/* is this the first alternative? */
310 
311 	if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) {
312 		p->error = REG_ESPACE;
313 		return;
314 	}
315 
316 	for (;;) {
317 		/* do a bunch of concatenated expressions */
318 		conc = HERE();
319 		while (MORE() && (c = PEEK()) != '|' && c != stop)
320 			p_ere_exp(p, reclimit);
321 		(void)REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
322 
323 		if (!EAT('|'))
324 			break;		/* NOTE BREAK OUT */
325 
326 		if (first) {
327 			INSERT(OCH_, conc);	/* offset is wrong */
328 			prevfwd = conc;
329 			prevback = conc;
330 			first = 0;
331 		}
332 		ASTERN(OOR1, prevback);
333 		prevback = THERE();
334 		AHEAD(prevfwd);			/* fix previous offset */
335 		prevfwd = HERE();
336 		EMIT(OOR2, 0);			/* offset is very wrong */
337 	}
338 
339 	if (!first) {		/* tail-end fixups */
340 		AHEAD(prevfwd);
341 		ASTERN(O_CH, prevback);
342 	}
343 
344 	assert(!MORE() || SEE(stop));
345 }
346 
347 /*
348  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
349  == static void p_ere_exp(register struct parse *p);
350  */
351 static void
352 p_ere_exp(register struct parse *p, size_t reclimit)
353 {
354 	register char c;
355 	register sopno pos;
356 	register int count;
357 	register int count2;
358 	register sopno subno;
359 	int wascaret = 0;
360 
361 	assert(MORE());		/* caller should have ensured this */
362 	c = GETNEXT();
363 
364 	pos = HERE();
365 	switch (c) {
366 	case '(':
367 		(void)REQUIRE(MORE(), REG_EPAREN);
368 		p->g->nsub++;
369 		subno = p->g->nsub;
370 		if (subno < NPAREN)
371 			p->pbegin[subno] = HERE();
372 		EMIT(OLPAREN, subno);
373 		if (!SEE(')'))
374 			p_ere(p, ')', reclimit);
375 		if (subno < NPAREN) {
376 			p->pend[subno] = HERE();
377 			assert(p->pend[subno] != 0);
378 		}
379 		EMIT(ORPAREN, subno);
380 		(void)MUSTEAT(')', REG_EPAREN);
381 		break;
382 #ifndef POSIX_MISTAKE
383 	case ')':		/* happens only if no current unmatched ( */
384 		/*
385 		 * You may ask, why the ifndef?  Because I didn't notice
386 		 * this until slightly too late for 1003.2, and none of the
387 		 * other 1003.2 regular-expression reviewers noticed it at
388 		 * all.  So an unmatched ) is legal POSIX, at least until
389 		 * we can get it fixed.
390 		 */
391 		SETERROR(REG_EPAREN);
392 		break;
393 #endif
394 	case '^':
395 		EMIT(OBOL, 0);
396 		p->g->iflags |= USEBOL;
397 		p->g->nbol++;
398 		wascaret = 1;
399 		break;
400 	case '$':
401 		EMIT(OEOL, 0);
402 		p->g->iflags |= USEEOL;
403 		p->g->neol++;
404 		break;
405 	case '|':
406 		SETERROR(REG_EMPTY);
407 		break;
408 	case '*':
409 	case '+':
410 	case '?':
411 		SETERROR(REG_BADRPT);
412 		break;
413 	case '.':
414 		if (p->g->cflags&REG_NEWLINE)
415 			nonnewline(p);
416 		else
417 			EMIT(OANY, 0);
418 		break;
419 	case '[':
420 		p_bracket(p);
421 		break;
422 	case '\\':
423 		(void)REQUIRE(MORE(), REG_EESCAPE);
424 		c = GETNEXT();
425 		ordinary(p, c);
426 		break;
427 	case '{':		/* okay as ordinary except if digit follows */
428 		(void)REQUIRE(!MORE() || !ISDIGIT((UCHAR_T)PEEK()), REG_BADRPT);
429 		/* FALLTHROUGH */
430 	default:
431 		ordinary(p, c);
432 		break;
433 	}
434 
435 	if (!MORE())
436 		return;
437 	c = PEEK();
438 	/* we call { a repetition if followed by a digit */
439 	if (!( c == '*' || c == '+' || c == '?' ||
440 				(c == '{' && MORE2() && ISDIGIT((UCHAR_T)PEEK2())) ))
441 		return;		/* no repetition, we're done */
442 	NEXT();
443 
444 	(void)REQUIRE(!wascaret, REG_BADRPT);
445 	switch (c) {
446 	case '*':	/* implemented as +? */
447 		/* this case does not require the (y|) trick, noKLUDGE */
448 		INSERT(OPLUS_, pos);
449 		ASTERN(O_PLUS, pos);
450 		INSERT(OQUEST_, pos);
451 		ASTERN(O_QUEST, pos);
452 		break;
453 	case '+':
454 		INSERT(OPLUS_, pos);
455 		ASTERN(O_PLUS, pos);
456 		break;
457 	case '?':
458 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
459 		INSERT(OCH_, pos);		/* offset slightly wrong */
460 		ASTERN(OOR1, pos);		/* this one's right */
461 		AHEAD(pos);			/* fix the OCH_ */
462 		EMIT(OOR2, 0);			/* offset very wrong... */
463 		AHEAD(THERE());			/* ...so fix it */
464 		ASTERN(O_CH, THERETHERE());
465 		break;
466 	case '{':
467 		count = p_count(p);
468 		if (EAT(',')) {
469 			if (ISDIGIT((UCHAR_T)PEEK())) {
470 				count2 = p_count(p);
471 				(void)REQUIRE(count <= count2, REG_BADBR);
472 			} else		/* single number with comma */
473 				count2 = INFINITY;
474 		} else		/* just a single number */
475 			count2 = count;
476 		repeat(p, pos, count, count2, 0);
477 		if (!EAT('}')) {	/* error heuristics */
478 			while (MORE() && PEEK() != '}')
479 				NEXT();
480 			(void)REQUIRE(MORE(), REG_EBRACE);
481 			SETERROR(REG_BADBR);
482 		}
483 		break;
484 	}
485 
486 	if (!MORE())
487 		return;
488 	c = PEEK();
489 	if (!( c == '*' || c == '+' || c == '?' ||
490 				(c == '{' && MORE2() && ISDIGIT((UCHAR_T)PEEK2())) ) )
491 		return;
492 	SETERROR(REG_BADRPT);
493 }
494 
495 /*
496  - p_str - string (no metacharacters) "parser"
497  == static void p_str(register struct parse *p);
498  */
499 static void
500 p_str(register struct parse *p)
501 {
502 	(void)REQUIRE(MORE(), REG_EMPTY);
503 	while (MORE())
504 		ordinary(p, GETNEXT());
505 }
506 
507 /*
508  - p_bre - BRE parser top level, anchoring and concatenation
509  == static void p_bre(register struct parse *p, register int end1, \
510  ==	register int end2, size_t reclimit);
511  * Giving end1 as OUT essentially eliminates the end1/end2 check.
512  *
513  * This implementation is a bit of a kludge, in that a trailing $ is first
514  * taken as an ordinary character and then revised to be an anchor.  The
515  * only undesirable side effect is that '$' gets included as a character
516  * category in such cases.  This is fairly harmless; not worth fixing.
517  * The amount of lookahead needed to avoid this kludge is excessive.
518  */
519 static void
520 p_bre(register struct parse *p, register int end1, register int end2, size_t reclimit)
521 
522                   		/* first terminating character */
523                   		/* second terminating character */
524 {
525 	register sopno start;
526 	register int first = 1;			/* first subexpression? */
527 	register int wasdollar = 0;
528 
529 	if (reclimit++ > RECLIMIT || p->error == REG_ESPACE) {
530 		p->error = REG_ESPACE;
531 		return;
532 	}
533 
534 	start = HERE();
535 
536 	if (EAT('^')) {
537 		EMIT(OBOL, 0);
538 		p->g->iflags |= USEBOL;
539 		p->g->nbol++;
540 	}
541 	while (MORE() && !SEETWO(end1, end2)) {
542 		wasdollar = p_simp_re(p, first, reclimit);
543 		first = 0;
544 	}
545 	if (wasdollar) {	/* oops, that was a trailing anchor */
546 		DROP(1);
547 		EMIT(OEOL, 0);
548 		p->g->iflags |= USEEOL;
549 		p->g->neol++;
550 	}
551 
552 	(void)REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
553 }
554 
555 /*
556  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
557  == static int p_simp_re(register struct parse *p, int starordinary, size_t reclimit);
558  */
559 static int			/* was the simple RE an unbackslashed $? */
560 p_simp_re(register struct parse *p, int starordinary, size_t reclimit)
561 
562                  		/* is a leading * an ordinary character? */
563 {
564 	register int c;
565 	register int count;
566 	register int count2;
567 	register sopno pos;
568 	register int i;
569 	register sopno subno;
570 	int backsl;
571 
572 	pos = HERE();		/* repetion op, if any, covers from here */
573 
574 	assert(MORE());		/* caller should have ensured this */
575 	c = GETNEXT();
576 	backsl = c == '\\';
577 	if (backsl) {
578 		(void)REQUIRE(MORE(), REG_EESCAPE);
579 		c = (unsigned char)GETNEXT();
580 		switch (c) {
581 		case '{':
582 			SETERROR(REG_BADRPT);
583 			break;
584 		case '(':
585 			p->g->nsub++;
586 			subno = p->g->nsub;
587 			if (subno < NPAREN)
588 				p->pbegin[subno] = HERE();
589 			EMIT(OLPAREN, subno);
590 			/* the MORE here is an error heuristic */
591 			if (MORE() && !SEETWO('\\', ')'))
592 				p_bre(p, '\\', ')', reclimit);
593 			if (subno < NPAREN) {
594 				p->pend[subno] = HERE();
595 				assert(p->pend[subno] != 0);
596 			}
597 			EMIT(ORPAREN, subno);
598 			(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
599 			break;
600 		case ')':	/* should not get here -- must be user */
601 		case '}':
602 			SETERROR(REG_EPAREN);
603 			break;
604 		case '1':
605 		case '2':
606 		case '3':
607 		case '4':
608 		case '5':
609 		case '6':
610 		case '7':
611 		case '8':
612 		case '9':
613 			i = c - '0';
614 			assert(i < NPAREN);
615 			if (p->pend[i] != 0) {
616 				assert(i <= p->g->nsub);
617 				EMIT(OBACK_, i);
618 				assert(p->pbegin[i] != 0);
619 				assert(p->strip[p->pbegin[i]] == OLPAREN);
620 				assert(p->strip[p->pend[i]] == ORPAREN);
621 				(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
622 				EMIT(O_BACK, i);
623 			} else
624 				SETERROR(REG_ESUBREG);
625 			p->g->backrefs = 1;
626 			break;
627 		default:
628 			ordinary(p, c);
629 			break;
630 		}
631 	} else {
632 		switch (c) {
633 		case '.':
634 			if (p->g->cflags&REG_NEWLINE)
635 				nonnewline(p);
636 			else
637 				EMIT(OANY, 0);
638 			break;
639 		case '[':
640 			p_bracket(p);
641 			break;
642 		case '*':
643 			(void)REQUIRE(starordinary, REG_BADRPT);
644 			/* FALLTHROUGH */
645 		default:
646 			ordinary(p, c);
647 			break;
648 		}
649 	}
650 
651 	if (EAT('*')) {		/* implemented as +? */
652 		/* this case does not require the (y|) trick, noKLUDGE */
653 		INSERT(OPLUS_, pos);
654 		ASTERN(O_PLUS, pos);
655 		INSERT(OQUEST_, pos);
656 		ASTERN(O_QUEST, pos);
657 	} else if (EATTWO('\\', '{')) {
658 		count = p_count(p);
659 		if (EAT(',')) {
660 			if (MORE() && ISDIGIT((UCHAR_T)PEEK())) {
661 				count2 = p_count(p);
662 				(void)REQUIRE(count <= count2, REG_BADBR);
663 			} else		/* single number with comma */
664 				count2 = INFINITY;
665 		} else		/* just a single number */
666 			count2 = count;
667 		repeat(p, pos, count, count2, reclimit);
668 		if (!EATTWO('\\', '}')) {	/* error heuristics */
669 			while (MORE() && !SEETWO('\\', '}'))
670 				NEXT();
671 			(void)REQUIRE(MORE(), REG_EBRACE);
672 			SETERROR(REG_BADBR);
673 		}
674 	} else if (!backsl && c == (unsigned char)'$')	/* $ (but not \$) ends it */
675 		return(1);
676 
677 	return(0);
678 }
679 
680 /*
681  - p_count - parse a repetition count
682  == static int p_count(register struct parse *p);
683  */
684 static int			/* the value */
685 p_count(register struct parse *p)
686 {
687 	register int count = 0;
688 	register int ndigits = 0;
689 
690 	while (MORE() && ISDIGIT((UCHAR_T)PEEK()) && count <= DUPMAX) {
691 		count = count*10 + (GETNEXT() - '0');
692 		ndigits++;
693 	}
694 
695 	(void)REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
696 	return(count);
697 }
698 
699 /*
700  - p_bracket - parse a bracketed character list
701  == static void p_bracket(register struct parse *p);
702  *
703  * Note a significant property of this code:  if the allocset() did SETERROR,
704  * no set operations are done.
705  */
706 static void
707 p_bracket(register struct parse *p)
708 {
709 	register cset *cs;
710 	register int invert = 0;
711 	static RCHAR_T bow[] = { '[', ':', '<', ':', ']', ']' };
712 	static RCHAR_T eow[] = { '[', ':', '>', ':', ']', ']' };
713 
714 	cs = allocset(p);
715 	if (cs == NULL)
716 		return;
717 
718 	/* Dept of Truly Sickening Special-Case Kludges */
719 	if (p->next + 5 < p->end && MEMCMP(p->next, bow, 6) == 0) {
720 		EMIT(OBOW, 0);
721 		NEXTn(6);
722 		return;
723 	}
724 	if (p->next + 5 < p->end && MEMCMP(p->next, eow, 6) == 0) {
725 		EMIT(OEOW, 0);
726 		NEXTn(6);
727 		return;
728 	}
729 
730 	if (EAT('^'))
731 		invert++;	/* make note to invert set at end */
732 	if (EAT(']'))
733 		CHadd(cs, ']');
734 	else if (EAT('-'))
735 		CHadd(cs, '-');
736 	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
737 		p_b_term(p, cs);
738 	if (EAT('-'))
739 		CHadd(cs, '-');
740 	(void)MUSTEAT(']', REG_EBRACK);
741 
742 	if (p->error != 0)	/* don't mess things up further */
743 		return;
744 
745 	if (p->g->cflags&REG_ICASE) {
746 		register int i;
747 		register int ci;
748 
749 		for (i = p->g->csetsize - 1; i >= 0; i--)
750 			if (CHIN(cs, i) && isalpha(i)) {
751 				ci = othercase(i);
752 				if (ci != i)
753 					CHadd(cs, ci);
754 			}
755 		if (cs->multis != NULL)
756 			mccase(p, cs);
757 	}
758 	if (invert) {
759 		register int i;
760 
761 		for (i = p->g->csetsize - 1; i >= 0; i--)
762 			if (CHIN(cs, i))
763 				CHsub(cs, i);
764 			else
765 				CHadd(cs, i);
766 		if (p->g->cflags&REG_NEWLINE)
767 			CHsub(cs, '\n');
768 		if (cs->multis != NULL)
769 			mcinvert(p, cs);
770 	}
771 
772 	assert(cs->multis == NULL);		/* xxx */
773 
774 	if (nch(p, cs) == 1) {		/* optimize singleton sets */
775 		ordinary(p, firstch(p, cs));
776 		freeset(p, cs);
777 	} else
778 		EMIT(OANYOF, freezeset(p, cs));
779 }
780 
781 /*
782  - p_b_term - parse one term of a bracketed character list
783  == static void p_b_term(register struct parse *p, register cset *cs);
784  */
785 static void
786 p_b_term(register struct parse *p, register cset *cs)
787 {
788 	register char c;
789 	register char start, finish;
790 	register int i;
791 
792 	/* classify what we've got */
793 	switch ((MORE()) ? PEEK() : '\0') {
794 	case '[':
795 		c = (MORE2()) ? PEEK2() : '\0';
796 		break;
797 	case '-':
798 		SETERROR(REG_ERANGE);
799 		return;			/* NOTE RETURN */
800 		break;
801 	default:
802 		c = '\0';
803 		break;
804 	}
805 
806 	switch (c) {
807 	case ':':		/* character class */
808 		NEXT2();
809 		(void)REQUIRE(MORE(), REG_EBRACK);
810 		c = PEEK();
811 		(void)REQUIRE(c != '-' && c != ']', REG_ECTYPE);
812 		p_b_cclass(p, cs);
813 		(void)REQUIRE(MORE(), REG_EBRACK);
814 		(void)REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
815 		break;
816 	case '=':		/* equivalence class */
817 		NEXT2();
818 		(void)REQUIRE(MORE(), REG_EBRACK);
819 		c = PEEK();
820 		(void)REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
821 		p_b_eclass(p, cs);
822 		(void)REQUIRE(MORE(), REG_EBRACK);
823 		(void)REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
824 		break;
825 	default:		/* symbol, ordinary character, or range */
826 /* xxx revision needed for multichar stuff */
827 		start = p_b_symbol(p);
828 		if (SEE('-') && MORE2() && PEEK2() != ']') {
829 			/* range */
830 			NEXT();
831 			if (EAT('-'))
832 				finish = '-';
833 			else
834 				finish = p_b_symbol(p);
835 		} else
836 			finish = start;
837 /* xxx what about signed chars here... */
838 		(void)REQUIRE(start <= finish, REG_ERANGE);
839 		for (i = start; i <= finish; i++)
840 			CHadd(cs, i);
841 		break;
842 	}
843 }
844 
845 /*
846  - p_b_cclass - parse a character-class name and deal with it
847  == static void p_b_cclass(register struct parse *p, register cset *cs);
848  */
849 static void
850 p_b_cclass(register struct parse *p, register cset *cs)
851 {
852 	register RCHAR_T *sp = p->next;
853 	register struct cclass *cp;
854 	register size_t len;
855 	register const char *u;
856 	register char c;
857 
858 	while (MORE() && isalpha(PEEK()))
859 		NEXT();
860 	len = p->next - sp;
861 	for (cp = cclasses; cp->name != NULL; cp++)
862 		if (STRLEN(cp->name) == len && !MEMCMP(cp->name, sp, len))
863 			break;
864 	if (cp->name == NULL) {
865 		/* oops, didn't find it */
866 		SETERROR(REG_ECTYPE);
867 		return;
868 	}
869 
870 	u = cp->chars;
871 	while ((c = *u++) != '\0')
872 		CHadd(cs, c);
873 	for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
874 		MCadd(p, cs, u);
875 }
876 
877 /*
878  - p_b_eclass - parse an equivalence-class name and deal with it
879  == static void p_b_eclass(register struct parse *p, register cset *cs);
880  *
881  * This implementation is incomplete. xxx
882  */
883 static void
884 p_b_eclass(register struct parse *p, register cset *cs)
885 {
886 	register char c;
887 
888 	c = p_b_coll_elem(p, '=');
889 	CHadd(cs, c);
890 }
891 
892 /*
893  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
894  == static char p_b_symbol(register struct parse *p);
895  */
896 static char			/* value of symbol */
897 p_b_symbol(register struct parse *p)
898 {
899 	register char value;
900 
901 	(void)REQUIRE(MORE(), REG_EBRACK);
902 	if (!EATTWO('[', '.'))
903 		return(GETNEXT());
904 
905 	/* collating symbol */
906 	value = p_b_coll_elem(p, '.');
907 	(void)REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
908 	return(value);
909 }
910 
911 /*
912  - p_b_coll_elem - parse a collating-element name and look it up
913  == static char p_b_coll_elem(register struct parse *p, int endc);
914  */
915 static char			/* value of collating element */
916 p_b_coll_elem(register struct parse *p, int endc)
917 
918          			/* name ended by endc,']' */
919 {
920 	register RCHAR_T *sp = p->next;
921 	register struct cname *cp;
922 	register size_t len;
923 
924 	while (MORE() && !SEETWO(endc, ']'))
925 		NEXT();
926 	if (!MORE()) {
927 		SETERROR(REG_EBRACK);
928 		return(0);
929 	}
930 	len = p->next - sp;
931 	for (cp = cnames; cp->name != NULL; cp++)
932 		if (STRLEN(cp->name) == len && MEMCMP(cp->name, sp, len))
933 			return(cp->code);	/* known name */
934 	if (len == 1)
935 		return(*sp);	/* single character */
936 	SETERROR(REG_ECOLLATE);			/* neither */
937 	return(0);
938 }
939 
940 /*
941  - othercase - return the case counterpart of an alphabetic
942  == static char othercase(int ch);
943  */
944 static char			/* if no counterpart, return ch */
945 othercase(int ch)
946 {
947 	assert(isalpha(ch));
948 	if (isupper(ch))
949 		return(tolower(ch));
950 	else if (islower(ch))
951 		return(toupper(ch));
952 	else			/* peculiar, but could happen */
953 		return(ch);
954 }
955 
956 /*
957  - bothcases - emit a dualcase version of a two-case character
958  == static void bothcases(register struct parse *p, int ch);
959  *
960  * Boy, is this implementation ever a kludge...
961  */
962 static void
963 bothcases(register struct parse *p, int ch)
964 {
965 	register RCHAR_T *oldnext = p->next;
966 	register RCHAR_T *oldend = p->end;
967 	RCHAR_T bracket[3];
968 
969 	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
970 	p->next = bracket;
971 	p->end = bracket+2;
972 	bracket[0] = ch;
973 	bracket[1] = ']';
974 	bracket[2] = '\0';
975 	p_bracket(p);
976 	assert(p->next == bracket+2);
977 	p->next = oldnext;
978 	p->end = oldend;
979 }
980 
981 /*
982  - ordinary - emit an ordinary character
983  == static void ordinary(register struct parse *p, register int ch);
984  */
985 static void
986 ordinary(register struct parse *p, register int ch)
987 {
988 /*
989 	register cat_t *cap = p->g->categories;
990 */
991 
992 	if ((p->g->cflags&REG_ICASE) && isalpha(ch) && othercase(ch) != ch)
993 		bothcases(p, ch);
994 	else {
995 		EMIT(OCHAR, (UCHAR_T)ch);
996 /*
997 		if (cap[ch] == 0)
998 			cap[ch] = p->g->ncategories++;
999 */
1000 	}
1001 }
1002 
1003 /*
1004  - nonnewline - emit REG_NEWLINE version of OANY
1005  == static void nonnewline(register struct parse *p);
1006  *
1007  * Boy, is this implementation ever a kludge...
1008  */
1009 static void
1010 nonnewline(register struct parse *p)
1011 {
1012 	register RCHAR_T *oldnext = p->next;
1013 	register RCHAR_T *oldend = p->end;
1014 	RCHAR_T bracket[4];
1015 
1016 	p->next = bracket;
1017 	p->end = bracket+3;
1018 	bracket[0] = '^';
1019 	bracket[1] = '\n';
1020 	bracket[2] = ']';
1021 	bracket[3] = '\0';
1022 	p_bracket(p);
1023 	assert(p->next == bracket+3);
1024 	p->next = oldnext;
1025 	p->end = oldend;
1026 }
1027 
1028 /*
1029  - repeat - generate code for a bounded repetition, recursively if needed
1030  == static void repeat(register struct parse *p, sopno start, int from, int to, size_t reclimit);
1031  */
1032 static void
1033 repeat(register struct parse *p, sopno start, int from, int to, size_t reclimit)
1034 
1035             			/* operand from here to end of strip */
1036          			/* repeated from this number */
1037        				/* to this number of times (maybe INFINITY) */
1038 {
1039 	register sopno finish;
1040 #	define	N	2
1041 #	define	INF	3
1042 #	define	REP(f, t)	((f)*8 + (t))
1043 #	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1044 	register sopno copy;
1045 
1046 	if (reclimit++ > RECLIMIT)
1047 		p->error = REG_ESPACE;
1048 	if (p->error)
1049 		return;
1050 
1051 	finish = HERE();
1052 
1053 	assert(from <= to);
1054 
1055 	switch (REP(MAP(from), MAP(to))) {
1056 	case REP(0, 0):			/* must be user doing this */
1057 		DROP(finish-start);	/* drop the operand */
1058 		break;
1059 	case REP(0, 1):			/* as x{1,1}? */
1060 	case REP(0, N):			/* as x{1,n}? */
1061 	case REP(0, INF):		/* as x{1,}? */
1062 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1063 		INSERT(OCH_, start);		/* offset is wrong... */
1064 		repeat(p, start+1, 1, to, reclimit);
1065 		ASTERN(OOR1, start);
1066 		AHEAD(start);			/* ... fix it */
1067 		EMIT(OOR2, 0);
1068 		AHEAD(THERE());
1069 		ASTERN(O_CH, THERETHERE());
1070 		break;
1071 	case REP(1, 1):			/* trivial case */
1072 		/* done */
1073 		break;
1074 	case REP(1, N):			/* as x?x{1,n-1} */
1075 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1076 		INSERT(OCH_, start);
1077 		ASTERN(OOR1, start);
1078 		AHEAD(start);
1079 		EMIT(OOR2, 0);			/* offset very wrong... */
1080 		AHEAD(THERE());			/* ...so fix it */
1081 		ASTERN(O_CH, THERETHERE());
1082 		copy = dupl(p, start+1, finish+1);
1083 		assert(copy == finish+4);
1084 		repeat(p, copy, 1, to-1, reclimit);
1085 		break;
1086 	case REP(1, INF):		/* as x+ */
1087 		INSERT(OPLUS_, start);
1088 		ASTERN(O_PLUS, start);
1089 		break;
1090 	case REP(N, N):			/* as xx{m-1,n-1} */
1091 		copy = dupl(p, start, finish);
1092 		repeat(p, copy, from-1, to-1, reclimit);
1093 		break;
1094 	case REP(N, INF):		/* as xx{n-1,INF} */
1095 		copy = dupl(p, start, finish);
1096 		repeat(p, copy, from-1, to, reclimit);
1097 		break;
1098 	default:			/* "can't happen" */
1099 		SETERROR(REG_ASSERT);	/* just in case */
1100 		break;
1101 	}
1102 }
1103 
1104 /*
1105  - seterr - set an error condition
1106  == static int seterr(register struct parse *p, int e);
1107  */
1108 static int			/* useless but makes type checking happy */
1109 seterr(register struct parse *p, int e)
1110 {
1111 	if (p->error == 0)	/* keep earliest error condition */
1112 		p->error = e;
1113 	p->next = nuls;		/* try to bring things to a halt */
1114 	p->end = nuls;
1115 	return(0);		/* make the return value well-defined */
1116 }
1117 
1118 /*
1119  - allocset - allocate a set of characters for []
1120  == static cset *allocset(register struct parse *p);
1121  */
1122 static cset *
1123 allocset(register struct parse *p)
1124 {
1125 	register int no = p->g->ncsets++;
1126 	register size_t nc;
1127 	register size_t nbytes;
1128 	register cset *cs;
1129 	register size_t css = (size_t)p->g->csetsize;
1130 	register int i;
1131 
1132 	if (no >= p->ncsalloc) {	/* need another column of space */
1133 		p->ncsalloc += CHAR_BIT;
1134 		nc = p->ncsalloc;
1135 		assert(nc % CHAR_BIT == 0);
1136 		nbytes = nc / CHAR_BIT * css;
1137 		if (MEMSIZE(p) > MEMLIMIT)
1138 			goto oomem;
1139 		if (p->g->sets == NULL)
1140 			p->g->sets = (cset *)malloc(nc * sizeof(cset));
1141 		else
1142 			p->g->sets = (cset *)realloc((char *)p->g->sets,
1143 							nc * sizeof(cset));
1144 		if (p->g->setbits == NULL)
1145 			p->g->setbits = (uch *)malloc(nbytes);
1146 		else {
1147 			p->g->setbits = (uch *)realloc((char *)p->g->setbits,
1148 								nbytes);
1149 			/* xxx this isn't right if setbits is now NULL */
1150 			for (i = 0; i < no; i++)
1151 				p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1152 		}
1153 		if (p->g->sets != NULL && p->g->setbits != NULL)
1154 			(void) memset((char *)p->g->setbits + (nbytes - css),
1155 								0, css);
1156 		else {
1157 oomem:
1158 			no = 0;
1159 			SETERROR(REG_ESPACE);
1160 			/* caller's responsibility not to do set ops */
1161 			return NULL;
1162 		}
1163 	}
1164 
1165 	cs = &p->g->sets[no];
1166 	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1167 	cs->mask = 1 << ((no) % CHAR_BIT);
1168 	cs->hash = 0;
1169 	cs->smultis = 0;
1170 	cs->multis = NULL;
1171 
1172 	return(cs);
1173 }
1174 
1175 /*
1176  - freeset - free a now-unused set
1177  == static void freeset(register struct parse *p, register cset *cs);
1178  */
1179 static void
1180 freeset(register struct parse *p, register cset *cs)
1181 {
1182 	register size_t i;
1183 	register cset *top = &p->g->sets[p->g->ncsets];
1184 	register size_t css = (size_t)p->g->csetsize;
1185 
1186 	for (i = 0; i < css; i++)
1187 		CHsub(cs, i);
1188 	if (cs == top-1)	/* recover only the easy case */
1189 		p->g->ncsets--;
1190 }
1191 
1192 /*
1193  - freezeset - final processing on a set of characters
1194  == static int freezeset(register struct parse *p, register cset *cs);
1195  *
1196  * The main task here is merging identical sets.  This is usually a waste
1197  * of time (although the hash code minimizes the overhead), but can win
1198  * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
1199  * is done using addition rather than xor -- all ASCII [aA] sets xor to
1200  * the same value!
1201  */
1202 static int			/* set number */
1203 freezeset(register struct parse *p, register cset *cs)
1204 {
1205 	register uch h = cs->hash;
1206 	register size_t i;
1207 	register cset *top = &p->g->sets[p->g->ncsets];
1208 	register cset *cs2;
1209 	register size_t css = (size_t)p->g->csetsize;
1210 
1211 	/* look for an earlier one which is the same */
1212 	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1213 		if (cs2->hash == h && cs2 != cs) {
1214 			/* maybe */
1215 			for (i = 0; i < css; i++)
1216 				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1217 					break;		/* no */
1218 			if (i == css)
1219 				break;			/* yes */
1220 		}
1221 
1222 	if (cs2 < top) {	/* found one */
1223 		freeset(p, cs);
1224 		cs = cs2;
1225 	}
1226 
1227 	return((int)(cs - p->g->sets));
1228 }
1229 
1230 /*
1231  - firstch - return first character in a set (which must have at least one)
1232  == static int firstch(register struct parse *p, register cset *cs);
1233  */
1234 static int			/* character; there is no "none" value */
1235 firstch(register struct parse *p, register cset *cs)
1236 {
1237 	register size_t i;
1238 	register size_t css = (size_t)p->g->csetsize;
1239 
1240 	for (i = 0; i < css; i++)
1241 		if (CHIN(cs, i))
1242 			return((char)i);
1243 	assert(never);
1244 	return(0);		/* arbitrary */
1245 }
1246 
1247 /*
1248  - nch - number of characters in a set
1249  == static int nch(register struct parse *p, register cset *cs);
1250  */
1251 static int
1252 nch(register struct parse *p, register cset *cs)
1253 {
1254 	register size_t i;
1255 	register size_t css = (size_t)p->g->csetsize;
1256 	register int n = 0;
1257 
1258 	for (i = 0; i < css; i++)
1259 		if (CHIN(cs, i))
1260 			n++;
1261 	return(n);
1262 }
1263 
1264 /*
1265  - mcadd - add a collating element to a cset
1266  == static void mcadd(register struct parse *p, register cset *cs, \
1267  ==	register char *cp);
1268  */
1269 static void
1270 mcadd(register struct parse *p, register cset *cs, register const char *cp)
1271 {
1272 	register size_t oldend = cs->smultis;
1273 
1274 	cs->smultis += strlen(cp) + 1;
1275 	if (cs->multis == NULL)
1276 		cs->multis = malloc(cs->smultis);
1277 	else
1278 		cs->multis = realloc(cs->multis, cs->smultis);
1279 	if (cs->multis == NULL) {
1280 		SETERROR(REG_ESPACE);
1281 		return;
1282 	}
1283 
1284 	(void) strcpy(cs->multis + oldend - 1, cp);
1285 	cs->multis[cs->smultis - 1] = '\0';
1286 }
1287 
1288 #ifdef notdef
1289 /*
1290  - mcsub - subtract a collating element from a cset
1291  == static void mcsub(register cset *cs, register char *cp);
1292  */
1293 static void
1294 mcsub(register cset *cs, register char *cp)
1295 {
1296 	register char *fp = mcfind(cs, cp);
1297 	register size_t len = strlen(fp);
1298 
1299 	assert(fp != NULL);
1300 	(void) memmove(fp, fp + len + 1,
1301 				cs->smultis - (fp + len + 1 - cs->multis));
1302 	cs->smultis -= len;
1303 
1304 	if (cs->smultis == 0) {
1305 		free(cs->multis);
1306 		cs->multis = NULL;
1307 		return;
1308 	}
1309 
1310 	cs->multis = realloc(cs->multis, cs->smultis);
1311 	assert(cs->multis != NULL);
1312 }
1313 
1314 /*
1315  - mcin - is a collating element in a cset?
1316  == static int mcin(register cset *cs, register char *cp);
1317  */
1318 static int
1319 mcin(register cset *cs, register char *cp)
1320 {
1321 	return(mcfind(cs, cp) != NULL);
1322 }
1323 
1324 /*
1325  - mcfind - find a collating element in a cset
1326  == static char *mcfind(register cset *cs, register char *cp);
1327  */
1328 static char *
1329 mcfind(register cset *cs, register char *cp)
1330 {
1331 	register char *p;
1332 
1333 	if (cs->multis == NULL)
1334 		return(NULL);
1335 	for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1336 		if (strcmp(cp, p) == 0)
1337 			return(p);
1338 	return(NULL);
1339 }
1340 #endif
1341 
1342 /*
1343  - mcinvert - invert the list of collating elements in a cset
1344  == static void mcinvert(register struct parse *p, register cset *cs);
1345  *
1346  * This would have to know the set of possibilities.  Implementation
1347  * is deferred.
1348  */
1349 static void
1350 mcinvert(register struct parse *p, register cset *cs)
1351 {
1352 	assert(cs->multis == NULL);	/* xxx */
1353 }
1354 
1355 /*
1356  - mccase - add case counterparts of the list of collating elements in a cset
1357  == static void mccase(register struct parse *p, register cset *cs);
1358  *
1359  * This would have to know the set of possibilities.  Implementation
1360  * is deferred.
1361  */
1362 static void
1363 mccase(register struct parse *p, register cset *cs)
1364 {
1365 	assert(cs->multis == NULL);	/* xxx */
1366 }
1367 
1368 #ifdef notdef
1369 /*
1370  - isinsets - is this character in any sets?
1371  == static int isinsets(register struct re_guts *g, int c);
1372  */
1373 static int			/* predicate */
1374 isinsets(register struct re_guts *g, int c)
1375 {
1376 	register uch *col;
1377 	register int i;
1378 	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1379 	register unsigned uc = (unsigned char)c;
1380 
1381 	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1382 		if (col[uc] != 0)
1383 			return(1);
1384 	return(0);
1385 }
1386 
1387 /*
1388  - samesets - are these two characters in exactly the same sets?
1389  == static int samesets(register struct re_guts *g, int c1, int c2);
1390  */
1391 static int			/* predicate */
1392 samesets(register struct re_guts *g, int c1, int c2)
1393 {
1394 	register uch *col;
1395 	register int i;
1396 	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1397 	register unsigned uc1 = (unsigned char)c1;
1398 	register unsigned uc2 = (unsigned char)c2;
1399 
1400 	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1401 		if (col[uc1] != col[uc2])
1402 			return(0);
1403 	return(1);
1404 }
1405 #endif
1406 
1407 /*
1408  - categorize - sort out character categories
1409  == static void categorize(struct parse *p, register struct re_guts *g);
1410  */
1411 static void
1412 categorize(struct parse *p, register struct re_guts *g)
1413 {
1414 #ifdef notdef
1415 	register cat_t *cats = g->categories;
1416 	register int c;
1417 	register int c2;
1418 	register cat_t cat;
1419 
1420 	/* avoid making error situations worse */
1421 	if (p->error != 0)
1422 		return;
1423 
1424 	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1425 		if (cats[c] == 0 && isinsets(g, c)) {
1426 			cat = g->ncategories++;
1427 			cats[c] = cat;
1428 			for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1429 				if (cats[c2] == 0 && samesets(g, c, c2))
1430 					cats[c2] = cat;
1431 		}
1432 #endif
1433 }
1434 
1435 /*
1436  - dupl - emit a duplicate of a bunch of sops
1437  == static sopno dupl(register struct parse *p, sopno start, sopno finish);
1438  */
1439 static sopno			/* start of duplicate */
1440 dupl(register struct parse *p, sopno start, sopno finish)
1441 
1442             			/* from here */
1443              			/* to this less one */
1444 {
1445 	register sopno ret = HERE();
1446 	register sopno len = finish - start;
1447 
1448 	assert(finish >= start);
1449 	if (len == 0)
1450 		return(ret);
1451 	if (!enlarge(p, p->ssize + len))	/* this many unexpected additions */
1452 		return ret;
1453 	assert(p->ssize >= p->slen + len);
1454 	(void) memcpy((char *)(p->strip + p->slen),
1455 		(char *)(p->strip + start), (size_t)len*sizeof(sop));
1456 	(void) memcpy((char *)(p->stripdata + p->slen),
1457 		(char *)(p->stripdata + start), (size_t)len*sizeof(RCHAR_T));
1458 	p->slen += len;
1459 	return(ret);
1460 }
1461 
1462 /*
1463  - doemit - emit a strip operator
1464  == static void doemit(register struct parse *p, sop op, size_t opnd);
1465  *
1466  * It might seem better to implement this as a macro with a function as
1467  * hard-case backup, but it's just too big and messy unless there are
1468  * some changes to the data structures.  Maybe later.
1469  */
1470 static void
1471 doemit(register struct parse *p, sop op, size_t opnd)
1472 {
1473 	/* avoid making error situations worse */
1474 	if (p->error != 0)
1475 		return;
1476 
1477 	/* deal with oversize operands ("can't happen", more or less) */
1478 	assert(opnd < 1);
1479 
1480 	/* deal with undersized strip */
1481 	if (p->slen >= p->ssize)
1482 		if (!enlarge(p, (p->ssize+1) / 2 * 3))	/* +50% */
1483 			return;
1484 
1485 	/* finally, it's all reduced to the easy case */
1486 	p->strip[p->slen] = op;
1487 	p->stripdata[p->slen] = opnd;
1488 	p->slen++;
1489 }
1490 
1491 /*
1492  - doinsert - insert a sop into the strip
1493  == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
1494  */
1495 static void
1496 doinsert(register struct parse *p, sop op, size_t opnd, sopno pos)
1497 {
1498 	register sopno sn;
1499 	register sop s;
1500 	register RCHAR_T d;
1501 	register int i;
1502 
1503 	/* avoid making error situations worse */
1504 	if (p->error != 0)
1505 		return;
1506 
1507 	sn = HERE();
1508 	EMIT(op, opnd);		/* do checks, ensure space */
1509 	assert(HERE() == sn+1);
1510 	s = p->strip[sn];
1511 	d = p->stripdata[sn];
1512 
1513 	/* adjust paren pointers */
1514 	assert(pos > 0);
1515 	for (i = 1; i < NPAREN; i++) {
1516 		if (p->pbegin[i] >= pos) {
1517 			p->pbegin[i]++;
1518 		}
1519 		if (p->pend[i] >= pos) {
1520 			p->pend[i]++;
1521 		}
1522 	}
1523 
1524 	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1525 						(HERE()-pos-1)*sizeof(sop));
1526 	memmove((char *)&p->stripdata[pos+1], (char *)&p->stripdata[pos],
1527 						(HERE()-pos-1)*sizeof(RCHAR_T));
1528 	p->strip[pos] = s;
1529 	p->stripdata[pos] = d;
1530 }
1531 
1532 /*
1533  - dofwd - complete a forward reference
1534  == static void dofwd(register struct parse *p, sopno pos, sop value);
1535  */
1536 static void
1537 dofwd(register struct parse *p, register sopno pos, sop value)
1538 {
1539 	/* avoid making error situations worse */
1540 	if (p->error != 0)
1541 		return;
1542 
1543 	assert(value < 1);
1544 	p->stripdata[pos] = value;
1545 }
1546 
1547 /*
1548  - enlarge - enlarge the strip
1549  == static int enlarge(register struct parse *p, sopno size);
1550  */
1551 static int
1552 enlarge(register struct parse *p, register sopno size)
1553 {
1554 	register sop *sp;
1555 	register RCHAR_T *dp;
1556 	sopno osize;
1557 
1558 	if (p->ssize >= size)
1559 		return 1;
1560 
1561 	osize = p->ssize;
1562 	p->ssize = size;
1563 	if (MEMSIZE(p) > MEMLIMIT)
1564 		goto oomem;
1565 	sp = realloc(p->strip, p->ssize * sizeof(sop));
1566 	if (sp == NULL)
1567 		goto oomem;
1568 	p->strip = sp;
1569 	dp = realloc(p->stripdata, p->ssize * sizeof(RCHAR_T));
1570 	if (dp == NULL) {
1571 oomem:
1572 		p->ssize = osize;
1573 		SETERROR(REG_ESPACE);
1574 		return 0;
1575 	}
1576 	p->stripdata = dp;
1577 	return 1;
1578 }
1579 
1580 /*
1581  - stripsnug - compact the strip
1582  == static void stripsnug(register struct parse *p, register struct re_guts *g);
1583  */
1584 static void
1585 stripsnug(register struct parse *p, register struct re_guts *g)
1586 {
1587 	g->nstates = p->slen;
1588 	g->strip = (sop *)realloc((char *)p->strip,
1589 	    p->slen * sizeof(sop));
1590 	if (g->strip == NULL) {
1591 		SETERROR(REG_ESPACE);
1592 		g->strip = p->strip;
1593 	}
1594 	g->stripdata = (RCHAR_T *)realloc((char *)p->stripdata,
1595 	    p->slen * sizeof(RCHAR_T));
1596 	if (g->stripdata == NULL) {
1597 		SETERROR(REG_ESPACE);
1598 		g->stripdata = p->stripdata;
1599 	}
1600 }
1601 
1602 /*
1603  - findmust - fill in must and mlen with longest mandatory literal string
1604  == static void findmust(register struct parse *p, register struct re_guts *g);
1605  *
1606  * This algorithm could do fancy things like analyzing the operands of |
1607  * for common subsequences.  Someday.  This code is simple and finds most
1608  * of the interesting cases.
1609  *
1610  * Note that must and mlen got initialized during setup.
1611  */
1612 static void
1613 findmust(struct parse *p, register struct re_guts *g)
1614 {
1615 	register sop *scans;
1616 	register RCHAR_T *scand;
1617 	sop *starts = 0;
1618 	RCHAR_T *startd = NULL;
1619 	register sop *newstarts = 0;
1620 	register RCHAR_T *newstartd = NULL;
1621 	register sopno newlen;
1622 	register sop s;
1623 	register RCHAR_T d;
1624 	register RCHAR_T *cp;
1625 	register sopno i;
1626 
1627 	/* avoid making error situations worse */
1628 	if (p->error != 0)
1629 		return;
1630 
1631 	/* find the longest OCHAR sequence in strip */
1632 	newlen = 0;
1633 	scans = g->strip + 1;
1634 	scand = g->stripdata + 1;
1635 	do {
1636 		s = *scans++;
1637 		d = *scand++;
1638 		switch (s) {
1639 		case OCHAR:		/* sequence member */
1640 			if (newlen == 0) {		/* new sequence */
1641 				newstarts = scans - 1;
1642 				newstartd = scand - 1;
1643 			}
1644 			newlen++;
1645 			break;
1646 		case OPLUS_:		/* things that don't break one */
1647 		case OLPAREN:
1648 		case ORPAREN:
1649 			break;
1650 		case OQUEST_:		/* things that must be skipped */
1651 		case OCH_:
1652 			scans--;
1653 			scand--;
1654 			do {
1655 				scans += d;
1656 				scand += d;
1657 				s = *scans;
1658 				d = *scand;
1659 				/* assert() interferes w debug printouts */
1660 				if (s != O_QUEST && s != O_CH && s != OOR2) {
1661 					g->iflags |= BAD;
1662 					return;
1663 				}
1664 			} while (s != O_QUEST && s != O_CH);
1665 			/* fallthrough */
1666 		default:		/* things that break a sequence */
1667 			if (newlen > g->mlen) {		/* ends one */
1668 				starts = newstarts;
1669 				startd = newstartd;
1670 				g->mlen = newlen;
1671 			}
1672 			newlen = 0;
1673 			break;
1674 		}
1675 	} while (s != OEND);
1676 
1677 	if (g->mlen == 0)		/* there isn't one */
1678 		return;
1679 
1680 	/* turn it into a character string */
1681 	g->must = malloc(((size_t)g->mlen + 1) * sizeof(RCHAR_T));
1682 	if (g->must == NULL) {		/* argh; just forget it */
1683 		g->mlen = 0;
1684 		return;
1685 	}
1686 	cp = g->must;
1687 	scans = starts;
1688 	scand = startd;
1689 	for (i = g->mlen; i > 0; i--) {
1690 		for (;;) {
1691 			s = *scans++;
1692 			d = *scand++;
1693 			if (s == OCHAR)
1694 				break;
1695 		}
1696 		assert(cp < g->must + g->mlen);
1697 		*cp++ = d;
1698 	}
1699 	assert(cp == g->must + g->mlen);
1700 	*cp++ = '\0';		/* just on general principles */
1701 }
1702 
1703 /*
1704  - pluscount - count + nesting
1705  == static sopno pluscount(register struct parse *p, register struct re_guts *g);
1706  */
1707 static sopno			/* nesting depth */
1708 pluscount(struct parse *p, register struct re_guts *g)
1709 {
1710 	register sop *scan;
1711 	register sop s;
1712 	register sopno plusnest = 0;
1713 	register sopno maxnest = 0;
1714 
1715 	if (p->error != 0)
1716 		return(0);	/* there may not be an OEND */
1717 
1718 	scan = g->strip + 1;
1719 	do {
1720 		s = *scan++;
1721 		switch (s) {
1722 		case OPLUS_:
1723 			plusnest++;
1724 			break;
1725 		case O_PLUS:
1726 			if (plusnest > maxnest)
1727 				maxnest = plusnest;
1728 			plusnest--;
1729 			break;
1730 		}
1731 	} while (s != OEND);
1732 	if (plusnest != 0)
1733 		g->iflags |= BAD;
1734 	return(maxnest);
1735 }
1736