xref: /freebsd-src/contrib/llvm-project/llvm/lib/Support/regex2.h (revision 0b57cec536236d46e3dba9bd041533462f33dbb7)
1*0b57cec5SDimitry Andric /*-
2*0b57cec5SDimitry Andric  * This code is derived from OpenBSD's libc/regex, original license follows:
3*0b57cec5SDimitry Andric  *
4*0b57cec5SDimitry Andric  * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5*0b57cec5SDimitry Andric  * Copyright (c) 1992, 1993, 1994
6*0b57cec5SDimitry Andric  *	The Regents of the University of California.  All rights reserved.
7*0b57cec5SDimitry Andric  *
8*0b57cec5SDimitry Andric  * This code is derived from software contributed to Berkeley by
9*0b57cec5SDimitry Andric  * Henry Spencer.
10*0b57cec5SDimitry Andric  *
11*0b57cec5SDimitry Andric  * Redistribution and use in source and binary forms, with or without
12*0b57cec5SDimitry Andric  * modification, are permitted provided that the following conditions
13*0b57cec5SDimitry Andric  * are met:
14*0b57cec5SDimitry Andric  * 1. Redistributions of source code must retain the above copyright
15*0b57cec5SDimitry Andric  *    notice, this list of conditions and the following disclaimer.
16*0b57cec5SDimitry Andric  * 2. Redistributions in binary form must reproduce the above copyright
17*0b57cec5SDimitry Andric  *    notice, this list of conditions and the following disclaimer in the
18*0b57cec5SDimitry Andric  *    documentation and/or other materials provided with the distribution.
19*0b57cec5SDimitry Andric  * 3. Neither the name of the University nor the names of its contributors
20*0b57cec5SDimitry Andric  *    may be used to endorse or promote products derived from this software
21*0b57cec5SDimitry Andric  *    without specific prior written permission.
22*0b57cec5SDimitry Andric  *
23*0b57cec5SDimitry Andric  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24*0b57cec5SDimitry Andric  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25*0b57cec5SDimitry Andric  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26*0b57cec5SDimitry Andric  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27*0b57cec5SDimitry Andric  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28*0b57cec5SDimitry Andric  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29*0b57cec5SDimitry Andric  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30*0b57cec5SDimitry Andric  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31*0b57cec5SDimitry Andric  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32*0b57cec5SDimitry Andric  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33*0b57cec5SDimitry Andric  * SUCH DAMAGE.
34*0b57cec5SDimitry Andric  *
35*0b57cec5SDimitry Andric  *	@(#)regex2.h	8.4 (Berkeley) 3/20/94
36*0b57cec5SDimitry Andric  */
37*0b57cec5SDimitry Andric 
38*0b57cec5SDimitry Andric #ifndef LLVM_SUPPORT_REGEX2_H
39*0b57cec5SDimitry Andric #define LLVM_SUPPORT_REGEX2_H
40*0b57cec5SDimitry Andric 
41*0b57cec5SDimitry Andric #include "regutils.h"
42*0b57cec5SDimitry Andric #include <stddef.h>
43*0b57cec5SDimitry Andric 
44*0b57cec5SDimitry Andric /*
45*0b57cec5SDimitry Andric  * internals of regex_t
46*0b57cec5SDimitry Andric  */
47*0b57cec5SDimitry Andric #define	MAGIC1	((('r'^0200)<<8) | 'e')
48*0b57cec5SDimitry Andric 
49*0b57cec5SDimitry Andric /*
50*0b57cec5SDimitry Andric  * The internal representation is a *strip*, a sequence of
51*0b57cec5SDimitry Andric  * operators ending with an endmarker.  (Some terminology etc. is a
52*0b57cec5SDimitry Andric  * historical relic of earlier versions which used multiple strips.)
53*0b57cec5SDimitry Andric  * Certain oddities in the representation are there to permit running
54*0b57cec5SDimitry Andric  * the machinery backwards; in particular, any deviation from sequential
55*0b57cec5SDimitry Andric  * flow must be marked at both its source and its destination.  Some
56*0b57cec5SDimitry Andric  * fine points:
57*0b57cec5SDimitry Andric  *
58*0b57cec5SDimitry Andric  * - OPLUS_ and O_PLUS are *inside* the loop they create.
59*0b57cec5SDimitry Andric  * - OQUEST_ and O_QUEST are *outside* the bypass they create.
60*0b57cec5SDimitry Andric  * - OCH_ and O_CH are *outside* the multi-way branch they create, while
61*0b57cec5SDimitry Andric  *   OOR1 and OOR2 are respectively the end and the beginning of one of
62*0b57cec5SDimitry Andric  *   the branches.  Note that there is an implicit OOR2 following OCH_
63*0b57cec5SDimitry Andric  *   and an implicit OOR1 preceding O_CH.
64*0b57cec5SDimitry Andric  *
65*0b57cec5SDimitry Andric  * In state representations, an operator's bit is on to signify a state
66*0b57cec5SDimitry Andric  * immediately *preceding* "execution" of that operator.
67*0b57cec5SDimitry Andric  */
68*0b57cec5SDimitry Andric typedef unsigned long sop;	/* strip operator */
69*0b57cec5SDimitry Andric typedef long sopno;
70*0b57cec5SDimitry Andric #define	OPRMASK	0xf8000000LU
71*0b57cec5SDimitry Andric #define	OPDMASK	0x07ffffffLU
72*0b57cec5SDimitry Andric #define	OPSHIFT	((unsigned)27)
73*0b57cec5SDimitry Andric #define	OP(n)	((n)&OPRMASK)
74*0b57cec5SDimitry Andric #define	OPND(n)	((n)&OPDMASK)
75*0b57cec5SDimitry Andric #define	SOP(op, opnd)	((op)|(opnd))
76*0b57cec5SDimitry Andric /* operators			   meaning	operand			*/
77*0b57cec5SDimitry Andric /*						(back, fwd are offsets)	*/
78*0b57cec5SDimitry Andric #define	OEND	(1LU<<OPSHIFT)	/* endmarker	-			*/
79*0b57cec5SDimitry Andric #define	OCHAR	(2LU<<OPSHIFT)	/* character	unsigned char		*/
80*0b57cec5SDimitry Andric #define	OBOL	(3LU<<OPSHIFT)	/* left anchor	-			*/
81*0b57cec5SDimitry Andric #define	OEOL	(4LU<<OPSHIFT)	/* right anchor	-			*/
82*0b57cec5SDimitry Andric #define	OANY	(5LU<<OPSHIFT)	/* .		-			*/
83*0b57cec5SDimitry Andric #define	OANYOF	(6LU<<OPSHIFT)	/* [...]	set number		*/
84*0b57cec5SDimitry Andric #define	OBACK_	(7LU<<OPSHIFT)	/* begin \d	paren number		*/
85*0b57cec5SDimitry Andric #define	O_BACK	(8LU<<OPSHIFT)	/* end \d	paren number		*/
86*0b57cec5SDimitry Andric #define	OPLUS_	(9LU<<OPSHIFT)	/* + prefix	fwd to suffix		*/
87*0b57cec5SDimitry Andric #define	O_PLUS	(10LU<<OPSHIFT)	/* + suffix	back to prefix		*/
88*0b57cec5SDimitry Andric #define	OQUEST_	(11LU<<OPSHIFT)	/* ? prefix	fwd to suffix		*/
89*0b57cec5SDimitry Andric #define	O_QUEST	(12LU<<OPSHIFT)	/* ? suffix	back to prefix		*/
90*0b57cec5SDimitry Andric #define	OLPAREN	(13LU<<OPSHIFT)	/* (		fwd to )		*/
91*0b57cec5SDimitry Andric #define	ORPAREN	(14LU<<OPSHIFT)	/* )		back to (		*/
92*0b57cec5SDimitry Andric #define	OCH_	(15LU<<OPSHIFT)	/* begin choice	fwd to OOR2		*/
93*0b57cec5SDimitry Andric #define	OOR1	(16LU<<OPSHIFT)	/* | pt. 1	back to OOR1 or OCH_	*/
94*0b57cec5SDimitry Andric #define	OOR2	(17LU<<OPSHIFT)	/* | pt. 2	fwd to OOR2 or O_CH	*/
95*0b57cec5SDimitry Andric #define	O_CH	(18LU<<OPSHIFT)	/* end choice	back to OOR1		*/
96*0b57cec5SDimitry Andric #define	OBOW	(19LU<<OPSHIFT)	/* begin word	-			*/
97*0b57cec5SDimitry Andric #define	OEOW	(20LU<<OPSHIFT)	/* end word	-			*/
98*0b57cec5SDimitry Andric 
99*0b57cec5SDimitry Andric /*
100*0b57cec5SDimitry Andric  * Structure for [] character-set representation.  Character sets are
101*0b57cec5SDimitry Andric  * done as bit vectors, grouped 8 to a byte vector for compactness.
102*0b57cec5SDimitry Andric  * The individual set therefore has both a pointer to the byte vector
103*0b57cec5SDimitry Andric  * and a mask to pick out the relevant bit of each byte.  A hash code
104*0b57cec5SDimitry Andric  * simplifies testing whether two sets could be identical.
105*0b57cec5SDimitry Andric  *
106*0b57cec5SDimitry Andric  * This will get trickier for multicharacter collating elements.  As
107*0b57cec5SDimitry Andric  * preliminary hooks for dealing with such things, we also carry along
108*0b57cec5SDimitry Andric  * a string of multi-character elements, and decide the size of the
109*0b57cec5SDimitry Andric  * vectors at run time.
110*0b57cec5SDimitry Andric  */
111*0b57cec5SDimitry Andric typedef struct {
112*0b57cec5SDimitry Andric 	uch *ptr;		/* -> uch [csetsize] */
113*0b57cec5SDimitry Andric 	uch mask;		/* bit within array */
114*0b57cec5SDimitry Andric 	uch hash;		/* hash code */
115*0b57cec5SDimitry Andric 	size_t smultis;
116*0b57cec5SDimitry Andric 	char *multis;		/* -> char[smulti]  ab\0cd\0ef\0\0 */
117*0b57cec5SDimitry Andric } cset;
118*0b57cec5SDimitry Andric /* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */
119*0b57cec5SDimitry Andric #define	CHadd(cs, c)	((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (c))
120*0b57cec5SDimitry Andric #define	CHsub(cs, c)	((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (c))
121*0b57cec5SDimitry Andric #define	CHIN(cs, c)	((cs)->ptr[(uch)(c)] & (cs)->mask)
122*0b57cec5SDimitry Andric #define	MCadd(p, cs, cp)	mcadd(p, cs, cp)	/* llvm_regcomp() internal fns */
123*0b57cec5SDimitry Andric #define	MCsub(p, cs, cp)	mcsub(p, cs, cp)
124*0b57cec5SDimitry Andric #define	MCin(p, cs, cp)	mcin(p, cs, cp)
125*0b57cec5SDimitry Andric 
126*0b57cec5SDimitry Andric /* stuff for character categories */
127*0b57cec5SDimitry Andric typedef unsigned char cat_t;
128*0b57cec5SDimitry Andric 
129*0b57cec5SDimitry Andric /*
130*0b57cec5SDimitry Andric  * main compiled-expression structure
131*0b57cec5SDimitry Andric  */
132*0b57cec5SDimitry Andric struct re_guts {
133*0b57cec5SDimitry Andric 	int magic;
134*0b57cec5SDimitry Andric #		define	MAGIC2	((('R'^0200)<<8)|'E')
135*0b57cec5SDimitry Andric 	sop *strip;		/* malloced area for strip */
136*0b57cec5SDimitry Andric 	int csetsize;		/* number of bits in a cset vector */
137*0b57cec5SDimitry Andric 	int ncsets;		/* number of csets in use */
138*0b57cec5SDimitry Andric 	cset *sets;		/* -> cset [ncsets] */
139*0b57cec5SDimitry Andric 	uch *setbits;		/* -> uch[csetsize][ncsets/CHAR_BIT] */
140*0b57cec5SDimitry Andric 	int cflags;		/* copy of llvm_regcomp() cflags argument */
141*0b57cec5SDimitry Andric 	sopno nstates;		/* = number of sops */
142*0b57cec5SDimitry Andric 	sopno firststate;	/* the initial OEND (normally 0) */
143*0b57cec5SDimitry Andric 	sopno laststate;	/* the final OEND */
144*0b57cec5SDimitry Andric 	int iflags;		/* internal flags */
145*0b57cec5SDimitry Andric #		define	USEBOL	01	/* used ^ */
146*0b57cec5SDimitry Andric #		define	USEEOL	02	/* used $ */
147*0b57cec5SDimitry Andric #		define	REGEX_BAD	04	/* something wrong */
148*0b57cec5SDimitry Andric 	int nbol;		/* number of ^ used */
149*0b57cec5SDimitry Andric 	int neol;		/* number of $ used */
150*0b57cec5SDimitry Andric 	int ncategories;	/* how many character categories */
151*0b57cec5SDimitry Andric 	cat_t *categories;	/* ->catspace[-CHAR_MIN] */
152*0b57cec5SDimitry Andric 	char *must;		/* match must contain this string */
153*0b57cec5SDimitry Andric 	int mlen;		/* length of must */
154*0b57cec5SDimitry Andric 	size_t nsub;		/* copy of re_nsub */
155*0b57cec5SDimitry Andric 	int backrefs;		/* does it use back references? */
156*0b57cec5SDimitry Andric 	sopno nplus;		/* how deep does it nest +s? */
157*0b57cec5SDimitry Andric 	/* catspace must be last */
158*0b57cec5SDimitry Andric 	cat_t catspace[1];	/* actually [NC] */
159*0b57cec5SDimitry Andric };
160*0b57cec5SDimitry Andric 
161*0b57cec5SDimitry Andric /* misc utilities */
162*0b57cec5SDimitry Andric #define	OUT	(CHAR_MAX+1)	/* a non-character value */
163*0b57cec5SDimitry Andric #define	ISWORD(c)	(isalnum(c&0xff) || (c) == '_')
164*0b57cec5SDimitry Andric 
165*0b57cec5SDimitry Andric #endif
166