1*f14fb602SLionel Sambuc /* $NetBSD: regex2.h,v 1.13 2011/10/09 18:23:00 christos Exp $ */ 22fe8fb19SBen Gras 3b7061124SArun Thomas /*- 4b7061124SArun Thomas * Copyright (c) 1992, 1993, 1994 5b7061124SArun Thomas * The Regents of the University of California. All rights reserved. 6b7061124SArun Thomas * 7b7061124SArun Thomas * This code is derived from software contributed to Berkeley by 8b7061124SArun Thomas * Henry Spencer. 9b7061124SArun Thomas * 10b7061124SArun Thomas * Redistribution and use in source and binary forms, with or without 11b7061124SArun Thomas * modification, are permitted provided that the following conditions 12b7061124SArun Thomas * are met: 13b7061124SArun Thomas * 1. Redistributions of source code must retain the above copyright 14b7061124SArun Thomas * notice, this list of conditions and the following disclaimer. 15b7061124SArun Thomas * 2. Redistributions in binary form must reproduce the above copyright 16b7061124SArun Thomas * notice, this list of conditions and the following disclaimer in the 17b7061124SArun Thomas * documentation and/or other materials provided with the distribution. 182fe8fb19SBen Gras * 3. Neither the name of the University nor the names of its contributors 192fe8fb19SBen Gras * may be used to endorse or promote products derived from this software 202fe8fb19SBen Gras * without specific prior written permission. 212fe8fb19SBen Gras * 222fe8fb19SBen Gras * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 232fe8fb19SBen Gras * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 242fe8fb19SBen Gras * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 252fe8fb19SBen Gras * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 262fe8fb19SBen Gras * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 272fe8fb19SBen Gras * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 282fe8fb19SBen Gras * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 292fe8fb19SBen Gras * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 302fe8fb19SBen Gras * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 312fe8fb19SBen Gras * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 322fe8fb19SBen Gras * SUCH DAMAGE. 332fe8fb19SBen Gras * 342fe8fb19SBen Gras * @(#)regex2.h 8.4 (Berkeley) 3/20/94 352fe8fb19SBen Gras */ 362fe8fb19SBen Gras 372fe8fb19SBen Gras /*- 382fe8fb19SBen Gras * Copyright (c) 1992, 1993, 1994 Henry Spencer. 392fe8fb19SBen Gras * 402fe8fb19SBen Gras * This code is derived from software contributed to Berkeley by 412fe8fb19SBen Gras * Henry Spencer. 422fe8fb19SBen Gras * 432fe8fb19SBen Gras * Redistribution and use in source and binary forms, with or without 442fe8fb19SBen Gras * modification, are permitted provided that the following conditions 452fe8fb19SBen Gras * are met: 462fe8fb19SBen Gras * 1. Redistributions of source code must retain the above copyright 472fe8fb19SBen Gras * notice, this list of conditions and the following disclaimer. 482fe8fb19SBen Gras * 2. Redistributions in binary form must reproduce the above copyright 492fe8fb19SBen Gras * notice, this list of conditions and the following disclaimer in the 502fe8fb19SBen Gras * documentation and/or other materials provided with the distribution. 51b7061124SArun Thomas * 3. All advertising materials mentioning features or use of this software 52b7061124SArun Thomas * must display the following acknowledgement: 53b7061124SArun Thomas * This product includes software developed by the University of 54b7061124SArun Thomas * California, Berkeley and its contributors. 55b7061124SArun Thomas * 4. Neither the name of the University nor the names of its contributors 56b7061124SArun Thomas * may be used to endorse or promote products derived from this software 57b7061124SArun Thomas * without specific prior written permission. 58b7061124SArun Thomas * 59b7061124SArun Thomas * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 60b7061124SArun Thomas * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 61b7061124SArun Thomas * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 62b7061124SArun Thomas * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 63b7061124SArun Thomas * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 64b7061124SArun Thomas * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 65b7061124SArun Thomas * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 66b7061124SArun Thomas * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 67b7061124SArun Thomas * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 68b7061124SArun Thomas * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 69b7061124SArun Thomas * SUCH DAMAGE. 70b7061124SArun Thomas * 71b7061124SArun Thomas * @(#)regex2.h 8.4 (Berkeley) 3/20/94 72b7061124SArun Thomas */ 73b7061124SArun Thomas 74b7061124SArun Thomas /* 75b7061124SArun Thomas * First, the stuff that ends up in the outside-world include file 76b7061124SArun Thomas = typedef off_t regoff_t; 77b7061124SArun Thomas = typedef struct { 78b7061124SArun Thomas = int re_magic; 79b7061124SArun Thomas = size_t re_nsub; // number of parenthesized subexpressions 80b7061124SArun Thomas = const char *re_endp; // end pointer for REG_PEND 81b7061124SArun Thomas = struct re_guts *re_g; // none of your business :-) 82b7061124SArun Thomas = } regex_t; 83b7061124SArun Thomas = typedef struct { 84b7061124SArun Thomas = regoff_t rm_so; // start of match 85b7061124SArun Thomas = regoff_t rm_eo; // end of match 86b7061124SArun Thomas = } regmatch_t; 87b7061124SArun Thomas */ 88b7061124SArun Thomas /* 89b7061124SArun Thomas * internals of regex_t 90b7061124SArun Thomas */ 91b7061124SArun Thomas #define MAGIC1 ((('r'^0200)<<8) | 'e') 92b7061124SArun Thomas 93b7061124SArun Thomas /* 94b7061124SArun Thomas * The internal representation is a *strip*, a sequence of 95b7061124SArun Thomas * operators ending with an endmarker. (Some terminology etc. is a 96b7061124SArun Thomas * historical relic of earlier versions which used multiple strips.) 97b7061124SArun Thomas * Certain oddities in the representation are there to permit running 98b7061124SArun Thomas * the machinery backwards; in particular, any deviation from sequential 99b7061124SArun Thomas * flow must be marked at both its source and its destination. Some 100b7061124SArun Thomas * fine points: 101b7061124SArun Thomas * 102b7061124SArun Thomas * - OPLUS_ and O_PLUS are *inside* the loop they create. 103b7061124SArun Thomas * - OQUEST_ and O_QUEST are *outside* the bypass they create. 104b7061124SArun Thomas * - OCH_ and O_CH are *outside* the multi-way branch they create, while 105b7061124SArun Thomas * OOR1 and OOR2 are respectively the end and the beginning of one of 106b7061124SArun Thomas * the branches. Note that there is an implicit OOR2 following OCH_ 107b7061124SArun Thomas * and an implicit OOR1 preceding O_CH. 108b7061124SArun Thomas * 109b7061124SArun Thomas * In state representations, an operator's bit is on to signify a state 110b7061124SArun Thomas * immediately *preceding* "execution" of that operator. 111b7061124SArun Thomas */ 1122fe8fb19SBen Gras typedef u_int32_t sop; /* strip operator */ 113*f14fb602SLionel Sambuc typedef size_t sopno; 1142fe8fb19SBen Gras #define OPRMASK ((u_int32_t)0xf8000000UL) 1152fe8fb19SBen Gras #define OPDMASK ((u_int32_t)0x07ffffffUL) 116b7061124SArun Thomas #define OPSHIFT ((unsigned)27) 117b7061124SArun Thomas #define OP(n) ((n)&OPRMASK) 1182fe8fb19SBen Gras #define OPND(n) ((int)((n)&OPDMASK)) 119b7061124SArun Thomas #define SOP(op, opnd) ((op)|(opnd)) 1202fe8fb19SBen Gras 1212fe8fb19SBen Gras #define OPC(n) (((u_int32_t)(n))<<OPSHIFT) 122b7061124SArun Thomas /* operators meaning operand */ 123b7061124SArun Thomas /* (back, fwd are offsets) */ 1242fe8fb19SBen Gras #define OEND OPC(1) /* endmarker - */ 1252fe8fb19SBen Gras #define OCHAR OPC(2) /* character unsigned char */ 1262fe8fb19SBen Gras #define OBOL OPC(3) /* left anchor - */ 1272fe8fb19SBen Gras #define OEOL OPC(4) /* right anchor - */ 1282fe8fb19SBen Gras #define OANY OPC(5) /* . - */ 1292fe8fb19SBen Gras #define OANYOF OPC(6) /* [...] set number */ 1302fe8fb19SBen Gras #define OBACK_ OPC(7) /* begin \d paren number */ 1312fe8fb19SBen Gras #define O_BACK OPC(8) /* end \d paren number */ 1322fe8fb19SBen Gras #define OPLUS_ OPC(9) /* + prefix fwd to suffix */ 1332fe8fb19SBen Gras #define O_PLUS OPC(10) /* + suffix back to prefix */ 1342fe8fb19SBen Gras #define OQUEST_ OPC(11) /* ? prefix fwd to suffix */ 1352fe8fb19SBen Gras #define O_QUEST OPC(12) /* ? suffix back to prefix */ 1362fe8fb19SBen Gras #define OLPAREN OPC(13) /* ( fwd to ) */ 1372fe8fb19SBen Gras #define ORPAREN OPC(14) /* ) back to ( */ 1382fe8fb19SBen Gras #define OCH_ OPC(15) /* begin choice fwd to OOR2 */ 1392fe8fb19SBen Gras #define OOR1 OPC(16) /* | pt. 1 back to OOR1 or OCH_ */ 1402fe8fb19SBen Gras #define OOR2 OPC(17) /* | pt. 2 fwd to OOR2 or O_CH */ 1412fe8fb19SBen Gras #define O_CH OPC(18) /* end choice back to OOR1 */ 1422fe8fb19SBen Gras #define OBOW OPC(19) /* begin word - */ 1432fe8fb19SBen Gras #define OEOW OPC(20) /* end word - */ 144b7061124SArun Thomas 145b7061124SArun Thomas /* 146b7061124SArun Thomas * Structure for [] character-set representation. Character sets are 147b7061124SArun Thomas * done as bit vectors, grouped 8 to a byte vector for compactness. 148b7061124SArun Thomas * The individual set therefore has both a pointer to the byte vector 149b7061124SArun Thomas * and a mask to pick out the relevant bit of each byte. A hash code 150b7061124SArun Thomas * simplifies testing whether two sets could be identical. 151b7061124SArun Thomas * 152b7061124SArun Thomas * This will get trickier for multicharacter collating elements. As 153b7061124SArun Thomas * preliminary hooks for dealing with such things, we also carry along 154b7061124SArun Thomas * a string of multi-character elements, and decide the size of the 155b7061124SArun Thomas * vectors at run time. 156b7061124SArun Thomas */ 157b7061124SArun Thomas typedef struct { 158b7061124SArun Thomas uch *ptr; /* -> uch [csetsize] */ 159b7061124SArun Thomas uch mask; /* bit within array */ 160b7061124SArun Thomas uch hash; /* hash code */ 161b7061124SArun Thomas size_t smultis; 162b7061124SArun Thomas char *multis; /* -> char[smulti] ab\0cd\0ef\0\0 */ 163b7061124SArun Thomas } cset; 164b7061124SArun Thomas /* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */ 165b7061124SArun Thomas #define CHadd(cs, c) ((cs)->ptr[(uch)(c)] |= (cs)->mask, (cs)->hash += (c)) 166b7061124SArun Thomas #define CHsub(cs, c) ((cs)->ptr[(uch)(c)] &= ~(cs)->mask, (cs)->hash -= (c)) 167b7061124SArun Thomas #define CHIN(cs, c) ((cs)->ptr[(uch)(c)] & (cs)->mask) 168b7061124SArun Thomas #define MCadd(p, cs, cp) mcadd(p, cs, cp) /* regcomp() internal fns */ 169b7061124SArun Thomas #define MCsub(p, cs, cp) mcsub(p, cs, cp) 170b7061124SArun Thomas #define MCin(p, cs, cp) mcin(p, cs, cp) 171b7061124SArun Thomas 172b7061124SArun Thomas /* stuff for character categories */ 173b7061124SArun Thomas typedef unsigned char cat_t; 174b7061124SArun Thomas 175b7061124SArun Thomas /* 176b7061124SArun Thomas * main compiled-expression structure 177b7061124SArun Thomas */ 178b7061124SArun Thomas struct re_guts { 179b7061124SArun Thomas int magic; 180b7061124SArun Thomas # define MAGIC2 ((('R'^0200)<<8)|'E') 181b7061124SArun Thomas sop *strip; /* malloced area for strip */ 182*f14fb602SLionel Sambuc size_t csetsize; /* number of bits in a cset vector */ 183*f14fb602SLionel Sambuc size_t ncsets; /* number of csets in use */ 184b7061124SArun Thomas cset *sets; /* -> cset [ncsets] */ 185b7061124SArun Thomas uch *setbits; /* -> uch[csetsize][ncsets/CHAR_BIT] */ 186b7061124SArun Thomas int cflags; /* copy of regcomp() cflags argument */ 187b7061124SArun Thomas sopno nstates; /* = number of sops */ 188b7061124SArun Thomas sopno firststate; /* the initial OEND (normally 0) */ 189b7061124SArun Thomas sopno laststate; /* the final OEND */ 190b7061124SArun Thomas int iflags; /* internal flags */ 191b7061124SArun Thomas # define USEBOL 01 /* used ^ */ 192b7061124SArun Thomas # define USEEOL 02 /* used $ */ 193b7061124SArun Thomas # define BAD 04 /* something wrong */ 194*f14fb602SLionel Sambuc size_t nbol; /* number of ^ used */ 195*f14fb602SLionel Sambuc size_t neol; /* number of $ used */ 196*f14fb602SLionel Sambuc size_t ncategories; /* how many character categories */ 197b7061124SArun Thomas cat_t *categories; /* ->catspace[-CHAR_MIN] */ 198b7061124SArun Thomas char *must; /* match must contain this string */ 199*f14fb602SLionel Sambuc size_t mlen; /* length of must */ 200b7061124SArun Thomas size_t nsub; /* copy of re_nsub */ 201b7061124SArun Thomas int backrefs; /* does it use back references? */ 202b7061124SArun Thomas sopno nplus; /* how deep does it nest +s? */ 203b7061124SArun Thomas /* catspace must be last */ 204b7061124SArun Thomas cat_t catspace[1]; /* actually [NC] */ 205b7061124SArun Thomas }; 206b7061124SArun Thomas 207b7061124SArun Thomas /* misc utilities */ 208b7061124SArun Thomas #define OUT (CHAR_MAX+1) /* a non-character value */ 2092fe8fb19SBen Gras #define ISWORD(c) (isalnum((unsigned char)c) || (c) == '_') 210