xref: /netbsd-src/lib/libc/regex/regcomp.c (revision 2a399c6883d870daece976daec6ffa7bb7f934ce)
1 /*	$NetBSD: regcomp.c,v 1.8 1997/07/21 14:08:15 jtc Exp $	*/
2 
3 /*-
4  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5  * Copyright (c) 1992, 1993, 1994
6  *	The Regents of the University of California.  All rights reserved.
7  *
8  * This code is derived from software contributed to Berkeley by
9  * Henry Spencer.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  * 3. All advertising materials mentioning features or use of this software
20  *    must display the following acknowledgement:
21  *	This product includes software developed by the University of
22  *	California, Berkeley and its contributors.
23  * 4. Neither the name of the University nor the names of its contributors
24  *    may be used to endorse or promote products derived from this software
25  *    without specific prior written permission.
26  *
27  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37  * SUCH DAMAGE.
38  *
39  *	@(#)regcomp.c	8.5 (Berkeley) 3/20/94
40  */
41 
42 #include <sys/cdefs.h>
43 #if defined(LIBC_SCCS) && !defined(lint)
44 #if 0
45 static char sccsid[] = "@(#)regcomp.c	8.5 (Berkeley) 3/20/94";
46 #else
47 __RCSID("$NetBSD: regcomp.c,v 1.8 1997/07/21 14:08:15 jtc Exp $");
48 #endif
49 #endif /* LIBC_SCCS and not lint */
50 
51 #include "namespace.h"
52 #include <sys/types.h>
53 #include <stdio.h>
54 #include <string.h>
55 #include <ctype.h>
56 #include <limits.h>
57 #include <stdlib.h>
58 #include <regex.h>
59 
60 #ifdef __weak_alias
61 __weak_alias(regcomp,_regcomp);
62 #endif
63 
64 #include "utils.h"
65 #include "regex2.h"
66 
67 #include "cclass.h"
68 #include "cname.h"
69 
70 /*
71  * parse structure, passed up and down to avoid global variables and
72  * other clumsinesses
73  */
74 struct parse {
75 	char *next;		/* next character in RE */
76 	char *end;		/* end of string (-> NUL normally) */
77 	int error;		/* has an error been seen? */
78 	sop *strip;		/* malloced strip */
79 	sopno ssize;		/* malloced strip size (allocated) */
80 	sopno slen;		/* malloced strip length (used) */
81 	int ncsalloc;		/* number of csets allocated */
82 	struct re_guts *g;
83 #	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
84 	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
85 	sopno pend[NPAREN];	/* -> ) ([0] unused) */
86 };
87 
88 /* ========= begin header generated by ./mkh ========= */
89 #ifdef __cplusplus
90 extern "C" {
91 #endif
92 
93 /* === regcomp.c === */
94 static void p_ere __P((struct parse *p, int stop));
95 static void p_ere_exp __P((struct parse *p));
96 static void p_str __P((struct parse *p));
97 static void p_bre __P((struct parse *p, int end1, int end2));
98 static int p_simp_re __P((struct parse *p, int starordinary));
99 static int p_count __P((struct parse *p));
100 static void p_bracket __P((struct parse *p));
101 static void p_b_term __P((struct parse *p, cset *cs));
102 static void p_b_cclass __P((struct parse *p, cset *cs));
103 static void p_b_eclass __P((struct parse *p, cset *cs));
104 static char p_b_symbol __P((struct parse *p));
105 static char p_b_coll_elem __P((struct parse *p, int endc));
106 static char othercase __P((int ch));
107 static void bothcases __P((struct parse *p, int ch));
108 static void ordinary __P((struct parse *p, int ch));
109 static void nonnewline __P((struct parse *p));
110 static void repeat __P((struct parse *p, sopno start, int from, int to));
111 static int seterr __P((struct parse *p, int e));
112 static cset *allocset __P((struct parse *p));
113 static void freeset __P((struct parse *p, cset *cs));
114 static int freezeset __P((struct parse *p, cset *cs));
115 static int firstch __P((struct parse *p, cset *cs));
116 static int nch __P((struct parse *p, cset *cs));
117 static void mcadd __P((struct parse *p, cset *cs, char *cp));
118 #if 0
119 static void mcsub __P((cset *cs, char *cp));
120 static int mcin __P((cset *cs, char *cp));
121 static char *mcfind __P((cset *cs, char *cp));
122 #endif
123 static void mcinvert __P((struct parse *p, cset *cs));
124 static void mccase __P((struct parse *p, cset *cs));
125 static int isinsets __P((struct re_guts *g, int c));
126 static int samesets __P((struct re_guts *g, int c1, int c2));
127 static void categorize __P((struct parse *p, struct re_guts *g));
128 static sopno dupl __P((struct parse *p, sopno start, sopno finish));
129 static void doemit __P((struct parse *p, sop op, size_t opnd));
130 static void doinsert __P((struct parse *p, sop op, size_t opnd, sopno pos));
131 static void dofwd __P((struct parse *p, sopno pos, sop value));
132 static void enlarge __P((struct parse *p, sopno size));
133 static void stripsnug __P((struct parse *p, struct re_guts *g));
134 static void findmust __P((struct parse *p, struct re_guts *g));
135 static sopno pluscount __P((struct parse *p, struct re_guts *g));
136 
137 #ifdef __cplusplus
138 }
139 #endif
140 /* ========= end header generated by ./mkh ========= */
141 
142 static char nuls[10];		/* place to point scanner in event of error */
143 
144 /*
145  * macros for use with parse structure
146  * BEWARE:  these know that the parse structure is named `p' !!!
147  */
148 #define	PEEK()	(*p->next)
149 #define	PEEK2()	(*(p->next+1))
150 #define	MORE()	(p->next < p->end)
151 #define	MORE2()	(p->next+1 < p->end)
152 #define	SEE(c)	(MORE() && PEEK() == (c))
153 #define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
154 #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
155 #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
156 #define	NEXT()	(p->next++)
157 #define	NEXT2()	(p->next += 2)
158 #define	NEXTn(n)	(p->next += (n))
159 #define	GETNEXT()	(*p->next++)
160 #define	SETERROR(e)	seterr(p, (e))
161 #define	REQUIRE(co, e)	(void) ((co) || SETERROR(e))
162 #define	MUSTSEE(c, e)	(REQUIRE(MORE() && PEEK() == (c), e))
163 #define	MUSTEAT(c, e)	(void) (REQUIRE(MORE() && GETNEXT() == (c), e))
164 #define	MUSTNOTSEE(c, e)	(REQUIRE(!MORE() || PEEK() != (c), e))
165 #define	EMIT(op, sopnd)	doemit(p, (sop)(op), (size_t)(sopnd))
166 #define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
167 #define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
168 #define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
169 #define	HERE()		(p->slen)
170 #define	THERE()		(p->slen - 1)
171 #define	THERETHERE()	(p->slen - 2)
172 #define	DROP(n)	(p->slen -= (n))
173 
174 #ifndef NDEBUG
175 static int never = 0;		/* for use in asserts; shuts lint up */
176 #else
177 #define	never	0		/* some <assert.h>s have bugs too */
178 #endif
179 
180 /*
181  - regcomp - interface for parser and compilation
182  = extern int regcomp(regex_t *, const char *, int);
183  = #define	REG_BASIC	0000
184  = #define	REG_EXTENDED	0001
185  = #define	REG_ICASE	0002
186  = #define	REG_NOSUB	0004
187  = #define	REG_NEWLINE	0010
188  = #define	REG_NOSPEC	0020
189  = #define	REG_PEND	0040
190  = #define	REG_DUMP	0200
191  */
192 int				/* 0 success, otherwise REG_something */
193 regcomp(preg, pattern, cflags)
194 regex_t *preg;
195 const char *pattern;
196 int cflags;
197 {
198 	struct parse pa;
199 	register struct re_guts *g;
200 	register struct parse *p = &pa;
201 	register int i;
202 	register size_t len;
203 #ifdef REDEBUG
204 #	define	GOODFLAGS(f)	(f)
205 #else
206 #	define	GOODFLAGS(f)	((f)&~REG_DUMP)
207 #endif
208 
209 	cflags = GOODFLAGS(cflags);
210 	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
211 		return(REG_INVARG);
212 
213 	if (cflags&REG_PEND) {
214 		if (preg->re_endp < pattern)
215 			return(REG_INVARG);
216 		len = preg->re_endp - pattern;
217 	} else
218 		len = strlen((char *)pattern);
219 
220 	/* do the mallocs early so failure handling is easy */
221 	g = (struct re_guts *)malloc(sizeof(struct re_guts) +
222 							(NC-1)*sizeof(cat_t));
223 	if (g == NULL)
224 		return(REG_ESPACE);
225 	p->ssize = len/(size_t)2*(size_t)3 + (size_t)1;	/* ugh */
226 	p->strip = (sop *)malloc(p->ssize * sizeof(sop));
227 	p->slen = 0;
228 	if (p->strip == NULL) {
229 		free((char *)g);
230 		return(REG_ESPACE);
231 	}
232 
233 	/* set things up */
234 	p->g = g;
235 	p->next = (char *)pattern;	/* convenience; we do not modify it */
236 	p->end = p->next + len;
237 	p->error = 0;
238 	p->ncsalloc = 0;
239 	for (i = 0; i < NPAREN; i++) {
240 		p->pbegin[i] = 0;
241 		p->pend[i] = 0;
242 	}
243 	g->csetsize = NC;
244 	g->sets = NULL;
245 	g->setbits = NULL;
246 	g->ncsets = 0;
247 	g->cflags = cflags;
248 	g->iflags = 0;
249 	g->nbol = 0;
250 	g->neol = 0;
251 	g->must = NULL;
252 	g->mlen = 0;
253 	g->nsub = 0;
254 	g->ncategories = 1;	/* category 0 is "everything else" */
255 	g->categories = &g->catspace[-(CHAR_MIN)];
256 	(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
257 	g->backrefs = 0;
258 
259 	/* do it */
260 	EMIT(OEND, 0);
261 	g->firststate = THERE();
262 	if (cflags&REG_EXTENDED)
263 		p_ere(p, OUT);
264 	else if (cflags&REG_NOSPEC)
265 		p_str(p);
266 	else
267 		p_bre(p, OUT, OUT);
268 	EMIT(OEND, 0);
269 	g->laststate = THERE();
270 
271 	/* tidy up loose ends and fill things in */
272 	categorize(p, g);
273 	stripsnug(p, g);
274 	findmust(p, g);
275 	g->nplus = pluscount(p, g);
276 	g->magic = MAGIC2;
277 	preg->re_nsub = g->nsub;
278 	preg->re_g = g;
279 	preg->re_magic = MAGIC1;
280 #ifndef REDEBUG
281 	/* not debugging, so can't rely on the assert() in regexec() */
282 	if (g->iflags&BAD)
283 		SETERROR(REG_ASSERT);
284 #endif
285 
286 	/* win or lose, we're done */
287 	if (p->error != 0)	/* lose */
288 		regfree(preg);
289 	return(p->error);
290 }
291 
292 /*
293  - p_ere - ERE parser top level, concatenation and alternation
294  == static void p_ere(register struct parse *p, int stop);
295  */
296 static void
297 p_ere(p, stop)
298 register struct parse *p;
299 int stop;			/* character this ERE should end at */
300 {
301 	register char c;
302 	register sopno prevback = 0;	/* pacify gcc */
303 	register sopno prevfwd = 0; 	/* pacify gcc */
304 	register sopno conc;
305 	register int first = 1;		/* is this the first alternative? */
306 
307 	for (;;) {
308 		/* do a bunch of concatenated expressions */
309 		conc = HERE();
310 		while (MORE() && (c = PEEK()) != '|' && c != stop)
311 			p_ere_exp(p);
312 		REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
313 
314 		if (!EAT('|'))
315 			break;		/* NOTE BREAK OUT */
316 
317 		if (first) {
318 			INSERT(OCH_, conc);	/* offset is wrong */
319 			prevfwd = conc;
320 			prevback = conc;
321 			first = 0;
322 		}
323 		ASTERN(OOR1, prevback);
324 		prevback = THERE();
325 		AHEAD(prevfwd);			/* fix previous offset */
326 		prevfwd = HERE();
327 		EMIT(OOR2, 0);			/* offset is very wrong */
328 	}
329 
330 	if (!first) {		/* tail-end fixups */
331 		AHEAD(prevfwd);
332 		ASTERN(O_CH, prevback);
333 	}
334 
335 	assert(!MORE() || SEE(stop));
336 }
337 
338 /*
339  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
340  == static void p_ere_exp(register struct parse *p);
341  */
342 static void
343 p_ere_exp(p)
344 register struct parse *p;
345 {
346 	register char c;
347 	register sopno pos;
348 	register int count;
349 	register int count2;
350 	register sopno subno;
351 	int wascaret = 0;
352 
353 	assert(MORE());		/* caller should have ensured this */
354 	c = GETNEXT();
355 
356 	pos = HERE();
357 	switch (c) {
358 	case '(':
359 		REQUIRE(MORE(), REG_EPAREN);
360 		p->g->nsub++;
361 		subno = p->g->nsub;
362 		if (subno < NPAREN)
363 			p->pbegin[subno] = HERE();
364 		EMIT(OLPAREN, subno);
365 		if (!SEE(')'))
366 			p_ere(p, ')');
367 		if (subno < NPAREN) {
368 			p->pend[subno] = HERE();
369 			assert(p->pend[subno] != 0);
370 		}
371 		EMIT(ORPAREN, subno);
372 		MUSTEAT(')', REG_EPAREN);
373 		break;
374 #ifndef POSIX_MISTAKE
375 	case ')':		/* happens only if no current unmatched ( */
376 		/*
377 		 * You may ask, why the ifndef?  Because I didn't notice
378 		 * this until slightly too late for 1003.2, and none of the
379 		 * other 1003.2 regular-expression reviewers noticed it at
380 		 * all.  So an unmatched ) is legal POSIX, at least until
381 		 * we can get it fixed.
382 		 */
383 		SETERROR(REG_EPAREN);
384 		break;
385 #endif
386 	case '^':
387 		EMIT(OBOL, 0);
388 		p->g->iflags |= USEBOL;
389 		p->g->nbol++;
390 		wascaret = 1;
391 		break;
392 	case '$':
393 		EMIT(OEOL, 0);
394 		p->g->iflags |= USEEOL;
395 		p->g->neol++;
396 		break;
397 	case '|':
398 		SETERROR(REG_EMPTY);
399 		break;
400 	case '*':
401 	case '+':
402 	case '?':
403 		SETERROR(REG_BADRPT);
404 		break;
405 	case '.':
406 		if (p->g->cflags&REG_NEWLINE)
407 			nonnewline(p);
408 		else
409 			EMIT(OANY, 0);
410 		break;
411 	case '[':
412 		p_bracket(p);
413 		break;
414 	case '\\':
415 		REQUIRE(MORE(), REG_EESCAPE);
416 		c = GETNEXT();
417 		ordinary(p, c);
418 		break;
419 	case '{':		/* okay as ordinary except if digit follows */
420 		REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT);
421 		/* FALLTHROUGH */
422 	default:
423 		ordinary(p, c);
424 		break;
425 	}
426 
427 	if (!MORE())
428 		return;
429 	c = PEEK();
430 	/* we call { a repetition if followed by a digit */
431 	if (!( c == '*' || c == '+' || c == '?' ||
432 				(c == '{' && MORE2() && isdigit(PEEK2())) ))
433 		return;		/* no repetition, we're done */
434 	NEXT();
435 
436 	REQUIRE(!wascaret, REG_BADRPT);
437 	switch (c) {
438 	case '*':	/* implemented as +? */
439 		/* this case does not require the (y|) trick, noKLUDGE */
440 		INSERT(OPLUS_, pos);
441 		ASTERN(O_PLUS, pos);
442 		INSERT(OQUEST_, pos);
443 		ASTERN(O_QUEST, pos);
444 		break;
445 	case '+':
446 		INSERT(OPLUS_, pos);
447 		ASTERN(O_PLUS, pos);
448 		break;
449 	case '?':
450 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
451 		INSERT(OCH_, pos);		/* offset slightly wrong */
452 		ASTERN(OOR1, pos);		/* this one's right */
453 		AHEAD(pos);			/* fix the OCH_ */
454 		EMIT(OOR2, 0);			/* offset very wrong... */
455 		AHEAD(THERE());			/* ...so fix it */
456 		ASTERN(O_CH, THERETHERE());
457 		break;
458 	case '{':
459 		count = p_count(p);
460 		if (EAT(',')) {
461 			if (isdigit(PEEK())) {
462 				count2 = p_count(p);
463 				REQUIRE(count <= count2, REG_BADBR);
464 			} else		/* single number with comma */
465 				count2 = INFINITY;
466 		} else		/* just a single number */
467 			count2 = count;
468 		repeat(p, pos, count, count2);
469 		if (!EAT('}')) {	/* error heuristics */
470 			while (MORE() && PEEK() != '}')
471 				NEXT();
472 			REQUIRE(MORE(), REG_EBRACE);
473 			SETERROR(REG_BADBR);
474 		}
475 		break;
476 	}
477 
478 	if (!MORE())
479 		return;
480 	c = PEEK();
481 	if (!( c == '*' || c == '+' || c == '?' ||
482 				(c == '{' && MORE2() && isdigit(PEEK2())) ) )
483 		return;
484 	SETERROR(REG_BADRPT);
485 }
486 
487 /*
488  - p_str - string (no metacharacters) "parser"
489  == static void p_str(register struct parse *p);
490  */
491 static void
492 p_str(p)
493 register struct parse *p;
494 {
495 	REQUIRE(MORE(), REG_EMPTY);
496 	while (MORE())
497 		ordinary(p, GETNEXT());
498 }
499 
500 /*
501  - p_bre - BRE parser top level, anchoring and concatenation
502  == static void p_bre(register struct parse *p, register int end1, \
503  ==	register int end2);
504  * Giving end1 as OUT essentially eliminates the end1/end2 check.
505  *
506  * This implementation is a bit of a kludge, in that a trailing $ is first
507  * taken as an ordinary character and then revised to be an anchor.  The
508  * only undesirable side effect is that '$' gets included as a character
509  * category in such cases.  This is fairly harmless; not worth fixing.
510  * The amount of lookahead needed to avoid this kludge is excessive.
511  */
512 static void
513 p_bre(p, end1, end2)
514 register struct parse *p;
515 register int end1;		/* first terminating character */
516 register int end2;		/* second terminating character */
517 {
518 	register sopno start = HERE();
519 	register int first = 1;			/* first subexpression? */
520 	register int wasdollar = 0;
521 
522 	if (EAT('^')) {
523 		EMIT(OBOL, 0);
524 		p->g->iflags |= USEBOL;
525 		p->g->nbol++;
526 	}
527 	while (MORE() && !SEETWO(end1, end2)) {
528 		wasdollar = p_simp_re(p, first);
529 		first = 0;
530 	}
531 	if (wasdollar) {	/* oops, that was a trailing anchor */
532 		DROP(1);
533 		EMIT(OEOL, 0);
534 		p->g->iflags |= USEEOL;
535 		p->g->neol++;
536 	}
537 
538 	REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
539 }
540 
541 /*
542  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
543  == static int p_simp_re(register struct parse *p, int starordinary);
544  */
545 static int			/* was the simple RE an unbackslashed $? */
546 p_simp_re(p, starordinary)
547 register struct parse *p;
548 int starordinary;		/* is a leading * an ordinary character? */
549 {
550 	register int c;
551 	register int count;
552 	register int count2;
553 	register sopno pos;
554 	register int i;
555 	register sopno subno;
556 #	define	BACKSL	(1<<CHAR_BIT)
557 
558 	pos = HERE();		/* repetion op, if any, covers from here */
559 
560 	assert(MORE());		/* caller should have ensured this */
561 	c = GETNEXT();
562 	if (c == '\\') {
563 		REQUIRE(MORE(), REG_EESCAPE);
564 		c = BACKSL | (unsigned char)GETNEXT();
565 	}
566 	switch (c) {
567 	case '.':
568 		if (p->g->cflags&REG_NEWLINE)
569 			nonnewline(p);
570 		else
571 			EMIT(OANY, 0);
572 		break;
573 	case '[':
574 		p_bracket(p);
575 		break;
576 	case BACKSL|'{':
577 		SETERROR(REG_BADRPT);
578 		break;
579 	case BACKSL|'(':
580 		p->g->nsub++;
581 		subno = p->g->nsub;
582 		if (subno < NPAREN)
583 			p->pbegin[subno] = HERE();
584 		EMIT(OLPAREN, subno);
585 		/* the MORE here is an error heuristic */
586 		if (MORE() && !SEETWO('\\', ')'))
587 			p_bre(p, '\\', ')');
588 		if (subno < NPAREN) {
589 			p->pend[subno] = HERE();
590 			assert(p->pend[subno] != 0);
591 		}
592 		EMIT(ORPAREN, subno);
593 		REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
594 		break;
595 	case BACKSL|')':	/* should not get here -- must be user */
596 	case BACKSL|'}':
597 		SETERROR(REG_EPAREN);
598 		break;
599 	case BACKSL|'1':
600 	case BACKSL|'2':
601 	case BACKSL|'3':
602 	case BACKSL|'4':
603 	case BACKSL|'5':
604 	case BACKSL|'6':
605 	case BACKSL|'7':
606 	case BACKSL|'8':
607 	case BACKSL|'9':
608 		i = (c&~BACKSL) - '0';
609 		assert(i < NPAREN);
610 		if (p->pend[i] != 0) {
611 			assert(i <= p->g->nsub);
612 			EMIT(OBACK_, i);
613 			assert(p->pbegin[i] != 0);
614 			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
615 			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
616 			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
617 			EMIT(O_BACK, i);
618 		} else
619 			SETERROR(REG_ESUBREG);
620 		p->g->backrefs = 1;
621 		break;
622 	case '*':
623 		REQUIRE(starordinary, REG_BADRPT);
624 		/* FALLTHROUGH */
625 	default:
626 		ordinary(p, c &~ BACKSL);
627 		break;
628 	}
629 
630 	if (EAT('*')) {		/* implemented as +? */
631 		/* this case does not require the (y|) trick, noKLUDGE */
632 		INSERT(OPLUS_, pos);
633 		ASTERN(O_PLUS, pos);
634 		INSERT(OQUEST_, pos);
635 		ASTERN(O_QUEST, pos);
636 	} else if (EATTWO('\\', '{')) {
637 		count = p_count(p);
638 		if (EAT(',')) {
639 			if (MORE() && isdigit(PEEK())) {
640 				count2 = p_count(p);
641 				REQUIRE(count <= count2, REG_BADBR);
642 			} else		/* single number with comma */
643 				count2 = INFINITY;
644 		} else		/* just a single number */
645 			count2 = count;
646 		repeat(p, pos, count, count2);
647 		if (!EATTWO('\\', '}')) {	/* error heuristics */
648 			while (MORE() && !SEETWO('\\', '}'))
649 				NEXT();
650 			REQUIRE(MORE(), REG_EBRACE);
651 			SETERROR(REG_BADBR);
652 		}
653 	} else if (c == (unsigned char)'$')	/* $ (but not \$) ends it */
654 		return(1);
655 
656 	return(0);
657 }
658 
659 /*
660  - p_count - parse a repetition count
661  == static int p_count(register struct parse *p);
662  */
663 static int			/* the value */
664 p_count(p)
665 register struct parse *p;
666 {
667 	register int count = 0;
668 	register int ndigits = 0;
669 
670 	while (MORE() && isdigit(PEEK()) && count <= DUPMAX) {
671 		count = count*10 + (GETNEXT() - '0');
672 		ndigits++;
673 	}
674 
675 	REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
676 	return(count);
677 }
678 
679 /*
680  - p_bracket - parse a bracketed character list
681  == static void p_bracket(register struct parse *p);
682  *
683  * Note a significant property of this code:  if the allocset() did SETERROR,
684  * no set operations are done.
685  */
686 static void
687 p_bracket(p)
688 register struct parse *p;
689 {
690 	register cset *cs = allocset(p);
691 	register int invert = 0;
692 
693 	/* Dept of Truly Sickening Special-Case Kludges */
694 	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
695 		EMIT(OBOW, 0);
696 		NEXTn(6);
697 		return;
698 	}
699 	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
700 		EMIT(OEOW, 0);
701 		NEXTn(6);
702 		return;
703 	}
704 
705 	if (EAT('^'))
706 		invert++;	/* make note to invert set at end */
707 	if (EAT(']'))
708 		CHadd(cs, ']');
709 	else if (EAT('-'))
710 		CHadd(cs, '-');
711 	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
712 		p_b_term(p, cs);
713 	if (EAT('-'))
714 		CHadd(cs, '-');
715 	MUSTEAT(']', REG_EBRACK);
716 
717 	if (p->error != 0)	/* don't mess things up further */
718 		return;
719 
720 	if (p->g->cflags&REG_ICASE) {
721 		register int i;
722 		register int ci;
723 
724 		for (i = p->g->csetsize - 1; i >= 0; i--)
725 			if (CHIN(cs, i) && isalpha(i)) {
726 				ci = othercase(i);
727 				if (ci != i)
728 					CHadd(cs, ci);
729 			}
730 		if (cs->multis != NULL)
731 			mccase(p, cs);
732 	}
733 	if (invert) {
734 		register int i;
735 
736 		for (i = p->g->csetsize - 1; i >= 0; i--)
737 			if (CHIN(cs, i))
738 				CHsub(cs, i);
739 			else
740 				CHadd(cs, i);
741 		if (p->g->cflags&REG_NEWLINE)
742 			CHsub(cs, '\n');
743 		if (cs->multis != NULL)
744 			mcinvert(p, cs);
745 	}
746 
747 	assert(cs->multis == NULL);		/* xxx */
748 
749 	if (nch(p, cs) == 1) {		/* optimize singleton sets */
750 		ordinary(p, firstch(p, cs));
751 		freeset(p, cs);
752 	} else
753 		EMIT(OANYOF, freezeset(p, cs));
754 }
755 
756 /*
757  - p_b_term - parse one term of a bracketed character list
758  == static void p_b_term(register struct parse *p, register cset *cs);
759  */
760 static void
761 p_b_term(p, cs)
762 register struct parse *p;
763 register cset *cs;
764 {
765 	register char c;
766 	register char start, finish;
767 	register int i;
768 
769 	/* classify what we've got */
770 	switch ((MORE()) ? PEEK() : '\0') {
771 	case '[':
772 		c = (MORE2()) ? PEEK2() : '\0';
773 		break;
774 	case '-':
775 		SETERROR(REG_ERANGE);
776 		return;			/* NOTE RETURN */
777 		break;
778 	default:
779 		c = '\0';
780 		break;
781 	}
782 
783 	switch (c) {
784 	case ':':		/* character class */
785 		NEXT2();
786 		REQUIRE(MORE(), REG_EBRACK);
787 		c = PEEK();
788 		REQUIRE(c != '-' && c != ']', REG_ECTYPE);
789 		p_b_cclass(p, cs);
790 		REQUIRE(MORE(), REG_EBRACK);
791 		REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
792 		break;
793 	case '=':		/* equivalence class */
794 		NEXT2();
795 		REQUIRE(MORE(), REG_EBRACK);
796 		c = PEEK();
797 		REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
798 		p_b_eclass(p, cs);
799 		REQUIRE(MORE(), REG_EBRACK);
800 		REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
801 		break;
802 	default:		/* symbol, ordinary character, or range */
803 /* xxx revision needed for multichar stuff */
804 		start = p_b_symbol(p);
805 		if (SEE('-') && MORE2() && PEEK2() != ']') {
806 			/* range */
807 			NEXT();
808 			if (EAT('-'))
809 				finish = '-';
810 			else
811 				finish = p_b_symbol(p);
812 		} else
813 			finish = start;
814 /* xxx what about signed chars here... */
815 		REQUIRE(start <= finish, REG_ERANGE);
816 		for (i = start; i <= finish; i++)
817 			CHadd(cs, i);
818 		break;
819 	}
820 }
821 
822 /*
823  - p_b_cclass - parse a character-class name and deal with it
824  == static void p_b_cclass(register struct parse *p, register cset *cs);
825  */
826 static void
827 p_b_cclass(p, cs)
828 register struct parse *p;
829 register cset *cs;
830 {
831 	register char *sp = p->next;
832 	register struct cclass *cp;
833 	register size_t len;
834 	register char *u;
835 	register char c;
836 
837 	while (MORE() && isalpha(PEEK()))
838 		NEXT();
839 	len = p->next - sp;
840 	for (cp = cclasses; cp->name != NULL; cp++)
841 		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
842 			break;
843 	if (cp->name == NULL) {
844 		/* oops, didn't find it */
845 		SETERROR(REG_ECTYPE);
846 		return;
847 	}
848 
849 	u = cp->chars;
850 	while ((c = *u++) != '\0')
851 		CHadd(cs, c);
852 	for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
853 		MCadd(p, cs, u);
854 }
855 
856 /*
857  - p_b_eclass - parse an equivalence-class name and deal with it
858  == static void p_b_eclass(register struct parse *p, register cset *cs);
859  *
860  * This implementation is incomplete. xxx
861  */
862 static void
863 p_b_eclass(p, cs)
864 register struct parse *p;
865 register cset *cs;
866 {
867 	register char c;
868 
869 	c = p_b_coll_elem(p, '=');
870 	CHadd(cs, c);
871 }
872 
873 /*
874  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
875  == static char p_b_symbol(register struct parse *p);
876  */
877 static char			/* value of symbol */
878 p_b_symbol(p)
879 register struct parse *p;
880 {
881 	register char value;
882 
883 	REQUIRE(MORE(), REG_EBRACK);
884 	if (!EATTWO('[', '.'))
885 		return(GETNEXT());
886 
887 	/* collating symbol */
888 	value = p_b_coll_elem(p, '.');
889 	REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
890 	return(value);
891 }
892 
893 /*
894  - p_b_coll_elem - parse a collating-element name and look it up
895  == static char p_b_coll_elem(register struct parse *p, int endc);
896  */
897 static char			/* value of collating element */
898 p_b_coll_elem(p, endc)
899 register struct parse *p;
900 int endc;			/* name ended by endc,']' */
901 {
902 	register char *sp = p->next;
903 	register struct cname *cp;
904 	register int len;
905 
906 	while (MORE() && !SEETWO(endc, ']'))
907 		NEXT();
908 	if (!MORE()) {
909 		SETERROR(REG_EBRACK);
910 		return(0);
911 	}
912 	len = p->next - sp;
913 	for (cp = cnames; cp->name != NULL; cp++)
914 		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
915 			return(cp->code);	/* known name */
916 	if (len == 1)
917 		return(*sp);	/* single character */
918 	SETERROR(REG_ECOLLATE);			/* neither */
919 	return(0);
920 }
921 
922 /*
923  - othercase - return the case counterpart of an alphabetic
924  == static char othercase(int ch);
925  */
926 static char			/* if no counterpart, return ch */
927 othercase(ch)
928 int ch;
929 {
930 	assert(isalpha(ch));
931 	if (isupper(ch))
932 		return(tolower(ch));
933 	else if (islower(ch))
934 		return(toupper(ch));
935 	else			/* peculiar, but could happen */
936 		return(ch);
937 }
938 
939 /*
940  - bothcases - emit a dualcase version of a two-case character
941  == static void bothcases(register struct parse *p, int ch);
942  *
943  * Boy, is this implementation ever a kludge...
944  */
945 static void
946 bothcases(p, ch)
947 register struct parse *p;
948 int ch;
949 {
950 	register char *oldnext = p->next;
951 	register char *oldend = p->end;
952 	char bracket[3];
953 
954 	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
955 	p->next = bracket;
956 	p->end = bracket+2;
957 	bracket[0] = ch;
958 	bracket[1] = ']';
959 	bracket[2] = '\0';
960 	p_bracket(p);
961 	assert(p->next == bracket+2);
962 	p->next = oldnext;
963 	p->end = oldend;
964 }
965 
966 /*
967  - ordinary - emit an ordinary character
968  == static void ordinary(register struct parse *p, register int ch);
969  */
970 static void
971 ordinary(p, ch)
972 register struct parse *p;
973 register int ch;
974 {
975 	register cat_t *cap = p->g->categories;
976 
977 	if ((p->g->cflags&REG_ICASE) && isalpha(ch) && othercase(ch) != ch)
978 		bothcases(p, ch);
979 	else {
980 		EMIT(OCHAR, (unsigned char)ch);
981 		if (cap[ch] == 0)
982 			cap[ch] = p->g->ncategories++;
983 	}
984 }
985 
986 /*
987  - nonnewline - emit REG_NEWLINE version of OANY
988  == static void nonnewline(register struct parse *p);
989  *
990  * Boy, is this implementation ever a kludge...
991  */
992 static void
993 nonnewline(p)
994 register struct parse *p;
995 {
996 	register char *oldnext = p->next;
997 	register char *oldend = p->end;
998 	char bracket[4];
999 
1000 	p->next = bracket;
1001 	p->end = bracket+3;
1002 	bracket[0] = '^';
1003 	bracket[1] = '\n';
1004 	bracket[2] = ']';
1005 	bracket[3] = '\0';
1006 	p_bracket(p);
1007 	assert(p->next == bracket+3);
1008 	p->next = oldnext;
1009 	p->end = oldend;
1010 }
1011 
1012 /*
1013  - repeat - generate code for a bounded repetition, recursively if needed
1014  == static void repeat(register struct parse *p, sopno start, int from, int to);
1015  */
1016 static void
1017 repeat(p, start, from, to)
1018 register struct parse *p;
1019 sopno start;			/* operand from here to end of strip */
1020 int from;			/* repeated from this number */
1021 int to;				/* to this number of times (maybe INFINITY) */
1022 {
1023 	register sopno finish = HERE();
1024 #	define	N	2
1025 #	define	INF	3
1026 #	define	REP(f, t)	((f)*8 + (t))
1027 #	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1028 	register sopno copy;
1029 
1030 	if (p->error != 0)	/* head off possible runaway recursion */
1031 		return;
1032 
1033 	assert(from <= to);
1034 
1035 	switch (REP(MAP(from), MAP(to))) {
1036 	case REP(0, 0):			/* must be user doing this */
1037 		DROP(finish-start);	/* drop the operand */
1038 		break;
1039 	case REP(0, 1):			/* as x{1,1}? */
1040 	case REP(0, N):			/* as x{1,n}? */
1041 	case REP(0, INF):		/* as x{1,}? */
1042 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1043 		INSERT(OCH_, start);		/* offset is wrong... */
1044 		repeat(p, start+1, 1, to);
1045 		ASTERN(OOR1, start);
1046 		AHEAD(start);			/* ... fix it */
1047 		EMIT(OOR2, 0);
1048 		AHEAD(THERE());
1049 		ASTERN(O_CH, THERETHERE());
1050 		break;
1051 	case REP(1, 1):			/* trivial case */
1052 		/* done */
1053 		break;
1054 	case REP(1, N):			/* as x?x{1,n-1} */
1055 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1056 		INSERT(OCH_, start);
1057 		ASTERN(OOR1, start);
1058 		AHEAD(start);
1059 		EMIT(OOR2, 0);			/* offset very wrong... */
1060 		AHEAD(THERE());			/* ...so fix it */
1061 		ASTERN(O_CH, THERETHERE());
1062 		copy = dupl(p, start+1, finish+1);
1063 		assert(copy == finish+4);
1064 		repeat(p, copy, 1, to-1);
1065 		break;
1066 	case REP(1, INF):		/* as x+ */
1067 		INSERT(OPLUS_, start);
1068 		ASTERN(O_PLUS, start);
1069 		break;
1070 	case REP(N, N):			/* as xx{m-1,n-1} */
1071 		copy = dupl(p, start, finish);
1072 		repeat(p, copy, from-1, to-1);
1073 		break;
1074 	case REP(N, INF):		/* as xx{n-1,INF} */
1075 		copy = dupl(p, start, finish);
1076 		repeat(p, copy, from-1, to);
1077 		break;
1078 	default:			/* "can't happen" */
1079 		SETERROR(REG_ASSERT);	/* just in case */
1080 		break;
1081 	}
1082 }
1083 
1084 /*
1085  - seterr - set an error condition
1086  == static int seterr(register struct parse *p, int e);
1087  */
1088 static int			/* useless but makes type checking happy */
1089 seterr(p, e)
1090 register struct parse *p;
1091 int e;
1092 {
1093 	if (p->error == 0)	/* keep earliest error condition */
1094 		p->error = e;
1095 	p->next = nuls;		/* try to bring things to a halt */
1096 	p->end = nuls;
1097 	return(0);		/* make the return value well-defined */
1098 }
1099 
1100 /*
1101  - allocset - allocate a set of characters for []
1102  == static cset *allocset(register struct parse *p);
1103  */
1104 static cset *
1105 allocset(p)
1106 register struct parse *p;
1107 {
1108 	register int no = p->g->ncsets++;
1109 	register size_t nc;
1110 	register size_t nbytes;
1111 	register cset *cs;
1112 	register size_t css = (size_t)p->g->csetsize;
1113 	register int i;
1114 
1115 	if (no >= p->ncsalloc) {	/* need another column of space */
1116 		p->ncsalloc += CHAR_BIT;
1117 		nc = p->ncsalloc;
1118 		assert(nc % CHAR_BIT == 0);
1119 		nbytes = nc / CHAR_BIT * css;
1120 		if (p->g->sets == NULL)
1121 			p->g->sets = (cset *)malloc(nc * sizeof(cset));
1122 		else
1123 			p->g->sets = (cset *)realloc((char *)p->g->sets,
1124 							nc * sizeof(cset));
1125 		if (p->g->setbits == NULL)
1126 			p->g->setbits = (uch *)malloc(nbytes);
1127 		else {
1128 			p->g->setbits = (uch *)realloc((char *)p->g->setbits,
1129 								nbytes);
1130 			/* xxx this isn't right if setbits is now NULL */
1131 			for (i = 0; i < no; i++)
1132 				p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1133 		}
1134 		if (p->g->sets != NULL && p->g->setbits != NULL)
1135 			(void) memset((char *)p->g->setbits + (nbytes - css),
1136 								0, css);
1137 		else {
1138 			no = 0;
1139 			SETERROR(REG_ESPACE);
1140 			/* caller's responsibility not to do set ops */
1141 		}
1142 	}
1143 
1144 	assert(p->g->sets != NULL);	/* xxx */
1145 	cs = &p->g->sets[no];
1146 	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1147 	cs->mask = 1 << ((no) % CHAR_BIT);
1148 	cs->hash = 0;
1149 	cs->smultis = 0;
1150 	cs->multis = NULL;
1151 
1152 	return(cs);
1153 }
1154 
1155 /*
1156  - freeset - free a now-unused set
1157  == static void freeset(register struct parse *p, register cset *cs);
1158  */
1159 static void
1160 freeset(p, cs)
1161 register struct parse *p;
1162 register cset *cs;
1163 {
1164 	register int i;
1165 	register cset *top = &p->g->sets[p->g->ncsets];
1166 	register size_t css = (size_t)p->g->csetsize;
1167 
1168 	for (i = 0; i < css; i++)
1169 		CHsub(cs, i);
1170 	if (cs == top-1)	/* recover only the easy case */
1171 		p->g->ncsets--;
1172 }
1173 
1174 /*
1175  - freezeset - final processing on a set of characters
1176  == static int freezeset(register struct parse *p, register cset *cs);
1177  *
1178  * The main task here is merging identical sets.  This is usually a waste
1179  * of time (although the hash code minimizes the overhead), but can win
1180  * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
1181  * is done using addition rather than xor -- all ASCII [aA] sets xor to
1182  * the same value!
1183  */
1184 static int			/* set number */
1185 freezeset(p, cs)
1186 register struct parse *p;
1187 register cset *cs;
1188 {
1189 	register uch h = cs->hash;
1190 	register int i;
1191 	register cset *top = &p->g->sets[p->g->ncsets];
1192 	register cset *cs2;
1193 	register size_t css = (size_t)p->g->csetsize;
1194 
1195 	/* look for an earlier one which is the same */
1196 	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1197 		if (cs2->hash == h && cs2 != cs) {
1198 			/* maybe */
1199 			for (i = 0; i < css; i++)
1200 				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1201 					break;		/* no */
1202 			if (i == css)
1203 				break;			/* yes */
1204 		}
1205 
1206 	if (cs2 < top) {	/* found one */
1207 		freeset(p, cs);
1208 		cs = cs2;
1209 	}
1210 
1211 	return((int)(cs - p->g->sets));
1212 }
1213 
1214 /*
1215  - firstch - return first character in a set (which must have at least one)
1216  == static int firstch(register struct parse *p, register cset *cs);
1217  */
1218 static int			/* character; there is no "none" value */
1219 firstch(p, cs)
1220 register struct parse *p;
1221 register cset *cs;
1222 {
1223 	register int i;
1224 	register size_t css = (size_t)p->g->csetsize;
1225 
1226 	for (i = 0; i < css; i++)
1227 		if (CHIN(cs, i))
1228 			return((char)i);
1229 	assert(never);
1230 	return(0);		/* arbitrary */
1231 }
1232 
1233 /*
1234  - nch - number of characters in a set
1235  == static int nch(register struct parse *p, register cset *cs);
1236  */
1237 static int
1238 nch(p, cs)
1239 register struct parse *p;
1240 register cset *cs;
1241 {
1242 	register int i;
1243 	register size_t css = (size_t)p->g->csetsize;
1244 	register int n = 0;
1245 
1246 	for (i = 0; i < css; i++)
1247 		if (CHIN(cs, i))
1248 			n++;
1249 	return(n);
1250 }
1251 
1252 /*
1253  - mcadd - add a collating element to a cset
1254  == static void mcadd(register struct parse *p, register cset *cs, \
1255  ==	register char *cp);
1256  */
1257 static void
1258 mcadd(p, cs, cp)
1259 register struct parse *p;
1260 register cset *cs;
1261 register char *cp;
1262 {
1263 	register size_t oldend = cs->smultis;
1264 
1265 	cs->smultis += strlen(cp) + 1;
1266 	if (cs->multis == NULL)
1267 		cs->multis = malloc(cs->smultis);
1268 	else
1269 		cs->multis = realloc(cs->multis, cs->smultis);
1270 	if (cs->multis == NULL) {
1271 		SETERROR(REG_ESPACE);
1272 		return;
1273 	}
1274 
1275 	(void) strcpy(cs->multis + oldend - 1, cp);
1276 	cs->multis[cs->smultis - 1] = '\0';
1277 }
1278 
1279 #if 0
1280 /*
1281  - mcsub - subtract a collating element from a cset
1282  == static void mcsub(register cset *cs, register char *cp);
1283  */
1284 static void
1285 mcsub(cs, cp)
1286 register cset *cs;
1287 register char *cp;
1288 {
1289 	register char *fp = mcfind(cs, cp);
1290 	register size_t len = strlen(fp);
1291 
1292 	assert(fp != NULL);
1293 	(void) memmove(fp, fp + len + 1,
1294 				cs->smultis - (fp + len + 1 - cs->multis));
1295 	cs->smultis -= len;
1296 
1297 	if (cs->smultis == 0) {
1298 		free(cs->multis);
1299 		cs->multis = NULL;
1300 		return;
1301 	}
1302 
1303 	cs->multis = realloc(cs->multis, cs->smultis);
1304 	assert(cs->multis != NULL);
1305 }
1306 
1307 /*
1308  - mcin - is a collating element in a cset?
1309  == static int mcin(register cset *cs, register char *cp);
1310  */
1311 static int
1312 mcin(cs, cp)
1313 register cset *cs;
1314 register char *cp;
1315 {
1316 	return(mcfind(cs, cp) != NULL);
1317 }
1318 
1319 /*
1320  - mcfind - find a collating element in a cset
1321  == static char *mcfind(register cset *cs, register char *cp);
1322  */
1323 static char *
1324 mcfind(cs, cp)
1325 register cset *cs;
1326 register char *cp;
1327 {
1328 	register char *p;
1329 
1330 	if (cs->multis == NULL)
1331 		return(NULL);
1332 	for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1333 		if (strcmp(cp, p) == 0)
1334 			return(p);
1335 	return(NULL);
1336 }
1337 #endif
1338 
1339 /*
1340  - mcinvert - invert the list of collating elements in a cset
1341  == static void mcinvert(register struct parse *p, register cset *cs);
1342  *
1343  * This would have to know the set of possibilities.  Implementation
1344  * is deferred.
1345  */
1346 static void
1347 mcinvert(p, cs)
1348 register struct parse *p;
1349 register cset *cs;
1350 {
1351 	assert(cs->multis == NULL);	/* xxx */
1352 }
1353 
1354 /*
1355  - mccase - add case counterparts of the list of collating elements in a cset
1356  == static void mccase(register struct parse *p, register cset *cs);
1357  *
1358  * This would have to know the set of possibilities.  Implementation
1359  * is deferred.
1360  */
1361 static void
1362 mccase(p, cs)
1363 register struct parse *p;
1364 register cset *cs;
1365 {
1366 	assert(cs->multis == NULL);	/* xxx */
1367 }
1368 
1369 /*
1370  - isinsets - is this character in any sets?
1371  == static int isinsets(register struct re_guts *g, int c);
1372  */
1373 static int			/* predicate */
1374 isinsets(g, c)
1375 register struct re_guts *g;
1376 int c;
1377 {
1378 	register uch *col;
1379 	register int i;
1380 	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1381 	register unsigned uc = (unsigned char)c;
1382 
1383 	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1384 		if (col[uc] != 0)
1385 			return(1);
1386 	return(0);
1387 }
1388 
1389 /*
1390  - samesets - are these two characters in exactly the same sets?
1391  == static int samesets(register struct re_guts *g, int c1, int c2);
1392  */
1393 static int			/* predicate */
1394 samesets(g, c1, c2)
1395 register struct re_guts *g;
1396 int c1;
1397 int c2;
1398 {
1399 	register uch *col;
1400 	register int i;
1401 	register int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1402 	register unsigned uc1 = (unsigned char)c1;
1403 	register unsigned uc2 = (unsigned char)c2;
1404 
1405 	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1406 		if (col[uc1] != col[uc2])
1407 			return(0);
1408 	return(1);
1409 }
1410 
1411 /*
1412  - categorize - sort out character categories
1413  == static void categorize(struct parse *p, register struct re_guts *g);
1414  */
1415 static void
1416 categorize(p, g)
1417 struct parse *p;
1418 register struct re_guts *g;
1419 {
1420 	register cat_t *cats = g->categories;
1421 	register int c;
1422 	register int c2;
1423 	register cat_t cat;
1424 
1425 	/* avoid making error situations worse */
1426 	if (p->error != 0)
1427 		return;
1428 
1429 	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1430 		if (cats[c] == 0 && isinsets(g, c)) {
1431 			cat = g->ncategories++;
1432 			cats[c] = cat;
1433 			for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1434 				if (cats[c2] == 0 && samesets(g, c, c2))
1435 					cats[c2] = cat;
1436 		}
1437 }
1438 
1439 /*
1440  - dupl - emit a duplicate of a bunch of sops
1441  == static sopno dupl(register struct parse *p, sopno start, sopno finish);
1442  */
1443 static sopno			/* start of duplicate */
1444 dupl(p, start, finish)
1445 register struct parse *p;
1446 sopno start;			/* from here */
1447 sopno finish;			/* to this less one */
1448 {
1449 	register sopno ret = HERE();
1450 	register sopno len = finish - start;
1451 
1452 	assert(finish >= start);
1453 	if (len == 0)
1454 		return(ret);
1455 	enlarge(p, p->ssize + len);	/* this many unexpected additions */
1456 	assert(p->ssize >= p->slen + len);
1457 	(void) memcpy((char *)(p->strip + p->slen),
1458 		(char *)(p->strip + start), (size_t)len*sizeof(sop));
1459 	p->slen += len;
1460 	return(ret);
1461 }
1462 
1463 /*
1464  - doemit - emit a strip operator
1465  == static void doemit(register struct parse *p, sop op, size_t opnd);
1466  *
1467  * It might seem better to implement this as a macro with a function as
1468  * hard-case backup, but it's just too big and messy unless there are
1469  * some changes to the data structures.  Maybe later.
1470  */
1471 static void
1472 doemit(p, op, opnd)
1473 register struct parse *p;
1474 sop op;
1475 size_t opnd;
1476 {
1477 	/* avoid making error situations worse */
1478 	if (p->error != 0)
1479 		return;
1480 
1481 	/* deal with oversize operands ("can't happen", more or less) */
1482 	assert(opnd < 1<<OPSHIFT);
1483 
1484 	/* deal with undersized strip */
1485 	if (p->slen >= p->ssize)
1486 		enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */
1487 	assert(p->slen < p->ssize);
1488 
1489 	/* finally, it's all reduced to the easy case */
1490 	p->strip[p->slen++] = SOP(op, opnd);
1491 }
1492 
1493 /*
1494  - doinsert - insert a sop into the strip
1495  == static void doinsert(register struct parse *p, sop op, size_t opnd, sopno pos);
1496  */
1497 static void
1498 doinsert(p, op, opnd, pos)
1499 register struct parse *p;
1500 sop op;
1501 size_t opnd;
1502 sopno pos;
1503 {
1504 	register sopno sn;
1505 	register sop s;
1506 	register int i;
1507 
1508 	/* avoid making error situations worse */
1509 	if (p->error != 0)
1510 		return;
1511 
1512 	sn = HERE();
1513 	EMIT(op, opnd);		/* do checks, ensure space */
1514 	assert(HERE() == sn+1);
1515 	s = p->strip[sn];
1516 
1517 	/* adjust paren pointers */
1518 	assert(pos > 0);
1519 	for (i = 1; i < NPAREN; i++) {
1520 		if (p->pbegin[i] >= pos) {
1521 			p->pbegin[i]++;
1522 		}
1523 		if (p->pend[i] >= pos) {
1524 			p->pend[i]++;
1525 		}
1526 	}
1527 
1528 	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1529 						(HERE()-pos-1)*sizeof(sop));
1530 	p->strip[pos] = s;
1531 }
1532 
1533 /*
1534  - dofwd - complete a forward reference
1535  == static void dofwd(register struct parse *p, sopno pos, sop value);
1536  */
1537 static void
1538 dofwd(p, pos, value)
1539 register struct parse *p;
1540 register sopno pos;
1541 sop value;
1542 {
1543 	/* avoid making error situations worse */
1544 	if (p->error != 0)
1545 		return;
1546 
1547 	assert(value < 1<<OPSHIFT);
1548 	p->strip[pos] = OP(p->strip[pos]) | value;
1549 }
1550 
1551 /*
1552  - enlarge - enlarge the strip
1553  == static void enlarge(register struct parse *p, sopno size);
1554  */
1555 static void
1556 enlarge(p, size)
1557 register struct parse *p;
1558 register sopno size;
1559 {
1560 	register sop *sp;
1561 
1562 	if (p->ssize >= size)
1563 		return;
1564 
1565 	sp = (sop *)realloc(p->strip, size*sizeof(sop));
1566 	if (sp == NULL) {
1567 		SETERROR(REG_ESPACE);
1568 		return;
1569 	}
1570 	p->strip = sp;
1571 	p->ssize = size;
1572 }
1573 
1574 /*
1575  - stripsnug - compact the strip
1576  == static void stripsnug(register struct parse *p, register struct re_guts *g);
1577  */
1578 static void
1579 stripsnug(p, g)
1580 register struct parse *p;
1581 register struct re_guts *g;
1582 {
1583 	g->nstates = p->slen;
1584 	g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1585 	if (g->strip == NULL) {
1586 		SETERROR(REG_ESPACE);
1587 		g->strip = p->strip;
1588 	}
1589 }
1590 
1591 /*
1592  - findmust - fill in must and mlen with longest mandatory literal string
1593  == static void findmust(register struct parse *p, register struct re_guts *g);
1594  *
1595  * This algorithm could do fancy things like analyzing the operands of |
1596  * for common subsequences.  Someday.  This code is simple and finds most
1597  * of the interesting cases.
1598  *
1599  * Note that must and mlen got initialized during setup.
1600  */
1601 static void
1602 findmust(p, g)
1603 struct parse *p;
1604 register struct re_guts *g;
1605 {
1606 	register sop *scan;
1607 	sop *start = NULL;
1608 	register sop *newstart = NULL;
1609 	register sopno newlen;
1610 	register sop s;
1611 	register char *cp;
1612 	register sopno i;
1613 
1614 	/* avoid making error situations worse */
1615 	if (p->error != 0)
1616 		return;
1617 
1618 	/* find the longest OCHAR sequence in strip */
1619 	newlen = 0;
1620 	scan = g->strip + 1;
1621 	do {
1622 		s = *scan++;
1623 		switch (OP(s)) {
1624 		case OCHAR:		/* sequence member */
1625 			if (newlen == 0)		/* new sequence */
1626 				newstart = scan - 1;
1627 			newlen++;
1628 			break;
1629 		case OPLUS_:		/* things that don't break one */
1630 		case OLPAREN:
1631 		case ORPAREN:
1632 			break;
1633 		case OQUEST_:		/* things that must be skipped */
1634 		case OCH_:
1635 			scan--;
1636 			do {
1637 				scan += OPND(s);
1638 				s = *scan;
1639 				/* assert() interferes w debug printouts */
1640 				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1641 							OP(s) != OOR2) {
1642 					g->iflags |= BAD;
1643 					return;
1644 				}
1645 			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1646 			/* fallthrough */
1647 		default:		/* things that break a sequence */
1648 			if (newlen > g->mlen) {		/* ends one */
1649 				start = newstart;
1650 				g->mlen = newlen;
1651 			}
1652 			newlen = 0;
1653 			break;
1654 		}
1655 	} while (OP(s) != OEND);
1656 
1657 	if (g->mlen == 0)		/* there isn't one */
1658 		return;
1659 
1660 	/* turn it into a character string */
1661 	g->must = malloc((size_t)g->mlen + 1);
1662 	if (g->must == NULL) {		/* argh; just forget it */
1663 		g->mlen = 0;
1664 		return;
1665 	}
1666 	cp = g->must;
1667 	scan = start;
1668 	for (i = g->mlen; i > 0; i--) {
1669 		while (OP(s = *scan++) != OCHAR)
1670 			continue;
1671 		assert(cp < g->must + g->mlen);
1672 		*cp++ = (char)OPND(s);
1673 	}
1674 	assert(cp == g->must + g->mlen);
1675 	*cp++ = '\0';		/* just on general principles */
1676 }
1677 
1678 /*
1679  - pluscount - count + nesting
1680  == static sopno pluscount(register struct parse *p, register struct re_guts *g);
1681  */
1682 static sopno			/* nesting depth */
1683 pluscount(p, g)
1684 struct parse *p;
1685 register struct re_guts *g;
1686 {
1687 	register sop *scan;
1688 	register sop s;
1689 	register sopno plusnest = 0;
1690 	register sopno maxnest = 0;
1691 
1692 	if (p->error != 0)
1693 		return(0);	/* there may not be an OEND */
1694 
1695 	scan = g->strip + 1;
1696 	do {
1697 		s = *scan++;
1698 		switch (OP(s)) {
1699 		case OPLUS_:
1700 			plusnest++;
1701 			break;
1702 		case O_PLUS:
1703 			if (plusnest > maxnest)
1704 				maxnest = plusnest;
1705 			plusnest--;
1706 			break;
1707 		}
1708 	} while (OP(s) != OEND);
1709 	if (plusnest != 0)
1710 		g->iflags |= BAD;
1711 	return(maxnest);
1712 }
1713