1 /* regcomp.h 2 * 3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 4 * 2000, 2001, 2002, 2003, by Larry Wall and others 5 * 6 * You may distribute under the terms of either the GNU General Public 7 * License or the Artistic License, as specified in the README file. 8 * 9 */ 10 11 typedef OP OP_4tree; /* Will be redefined later. */ 12 13 /* 14 * The "internal use only" fields in regexp.h are present to pass info from 15 * compile to execute that permits the execute phase to run lots faster on 16 * simple cases. They are: 17 * 18 * regstart sv that must begin a match; Nullch if none obvious 19 * reganch is the match anchored (at beginning-of-line only)? 20 * regmust string (pointer into program) that match must include, or NULL 21 * [regmust changed to SV* for bminstr()--law] 22 * regmlen length of regmust string 23 * [regmlen not used currently] 24 * 25 * Regstart and reganch permit very fast decisions on suitable starting points 26 * for a match, cutting down the work a lot. Regmust permits fast rejection 27 * of lines that cannot possibly match. The regmust tests are costly enough 28 * that pregcomp() supplies a regmust only if the r.e. contains something 29 * potentially expensive (at present, the only such thing detected is * or + 30 * at the start of the r.e., which can involve a lot of backup). Regmlen is 31 * supplied because the test in pregexec() needs it and pregcomp() is computing 32 * it anyway. 33 * [regmust is now supplied always. The tests that use regmust have a 34 * heuristic that disables the test if it usually matches.] 35 * 36 * [In fact, we now use regmust in many cases to locate where the search 37 * starts in the string, so if regback is >= 0, the regmust search is never 38 * wasted effort. The regback variable says how many characters back from 39 * where regmust matched is the earliest possible start of the match. 40 * For instance, /[a-z].foo/ has a regmust of 'foo' and a regback of 2.] 41 */ 42 43 /* 44 * Structure for regexp "program". This is essentially a linear encoding 45 * of a nondeterministic finite-state machine (aka syntax charts or 46 * "railroad normal form" in parsing technology). Each node is an opcode 47 * plus a "next" pointer, possibly plus an operand. "Next" pointers of 48 * all nodes except BRANCH implement concatenation; a "next" pointer with 49 * a BRANCH on both ends of it is connecting two alternatives. (Here we 50 * have one of the subtle syntax dependencies: an individual BRANCH (as 51 * opposed to a collection of them) is never concatenated with anything 52 * because of operator precedence.) The operand of some types of node is 53 * a literal string; for others, it is a node leading into a sub-FSM. In 54 * particular, the operand of a BRANCH node is the first node of the branch. 55 * (NB this is *not* a tree structure: the tail of the branch connects 56 * to the thing following the set of BRANCHes.) The opcodes are: 57 */ 58 59 /* 60 * A node is one char of opcode followed by two chars of "next" pointer. 61 * "Next" pointers are stored as two 8-bit pieces, high order first. The 62 * value is a positive offset from the opcode of the node containing it. 63 * An operand, if any, simply follows the node. (Note that much of the 64 * code generation knows about this implicit relationship.) 65 * 66 * Using two bytes for the "next" pointer is vast overkill for most things, 67 * but allows patterns to get big without disasters. 68 * 69 * [The "next" pointer is always aligned on an even 70 * boundary, and reads the offset directly as a short. Also, there is no 71 * special test to reverse the sign of BACK pointers since the offset is 72 * stored negative.] 73 */ 74 75 struct regnode_string { 76 U8 str_len; 77 U8 type; 78 U16 next_off; 79 char string[1]; 80 }; 81 82 struct regnode_1 { 83 U8 flags; 84 U8 type; 85 U16 next_off; 86 U32 arg1; 87 }; 88 89 struct regnode_2 { 90 U8 flags; 91 U8 type; 92 U16 next_off; 93 U16 arg1; 94 U16 arg2; 95 }; 96 97 #define ANYOF_BITMAP_SIZE 32 /* 256 b/(8 b/B) */ 98 #define ANYOF_CLASSBITMAP_SIZE 4 /* up to 32 (8*4) named classes */ 99 100 struct regnode_charclass { 101 U8 flags; 102 U8 type; 103 U16 next_off; 104 U32 arg1; 105 char bitmap[ANYOF_BITMAP_SIZE]; /* only compile-time */ 106 }; 107 108 struct regnode_charclass_class { /* has [[:blah:]] classes */ 109 U8 flags; /* should have ANYOF_CLASS here */ 110 U8 type; 111 U16 next_off; 112 U32 arg1; 113 char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time */ 114 char classflags[ANYOF_CLASSBITMAP_SIZE]; /* and run-time */ 115 }; 116 117 /* XXX fix this description. 118 Impose a limit of REG_INFTY on various pattern matching operations 119 to limit stack growth and to avoid "infinite" recursions. 120 */ 121 /* The default size for REG_INFTY is I16_MAX, which is the same as 122 SHORT_MAX (see perl.h). Unfortunately I16 isn't necessarily 16 bits 123 (see handy.h). On the Cray C90, sizeof(short)==4 and hence I16_MAX is 124 ((1<<31)-1), while on the Cray T90, sizeof(short)==8 and I16_MAX is 125 ((1<<63)-1). To limit stack growth to reasonable sizes, supply a 126 smaller default. 127 --Andy Dougherty 11 June 1998 128 */ 129 #if SHORTSIZE > 2 130 # ifndef REG_INFTY 131 # define REG_INFTY ((1<<15)-1) 132 # endif 133 #endif 134 135 #ifndef REG_INFTY 136 # define REG_INFTY I16_MAX 137 #endif 138 139 #define ARG_VALUE(arg) (arg) 140 #define ARG__SET(arg,val) ((arg) = (val)) 141 142 #undef ARG 143 #undef ARG1 144 #undef ARG2 145 146 #define ARG(p) ARG_VALUE(ARG_LOC(p)) 147 #define ARG1(p) ARG_VALUE(ARG1_LOC(p)) 148 #define ARG2(p) ARG_VALUE(ARG2_LOC(p)) 149 #define ARG_SET(p, val) ARG__SET(ARG_LOC(p), (val)) 150 #define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val)) 151 #define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val)) 152 153 #undef NEXT_OFF 154 #undef NODE_ALIGN 155 156 #ifndef lint 157 # define NEXT_OFF(p) ((p)->next_off) 158 # define NODE_ALIGN(node) 159 # define NODE_ALIGN_FILL(node) ((node)->flags = 0xde) /* deadbeef */ 160 #else /* lint */ 161 # define NEXT_OFF(p) 0 162 # define NODE_ALIGN(node) 163 # define NODE_ALIGN_FILL(node) 164 #endif /* lint */ 165 166 #define SIZE_ALIGN NODE_ALIGN 167 168 #undef OP 169 #undef OPERAND 170 #undef MASK 171 #undef STRING 172 173 #define OP(p) ((p)->type) 174 #define OPERAND(p) (((struct regnode_string *)p)->string) 175 #define MASK(p) ((char*)OPERAND(p)) 176 #define STR_LEN(p) (((struct regnode_string *)p)->str_len) 177 #define STRING(p) (((struct regnode_string *)p)->string) 178 #define STR_SZ(l) ((l + sizeof(regnode) - 1) / sizeof(regnode)) 179 #define NODE_SZ_STR(p) (STR_SZ(STR_LEN(p))+1) 180 181 #undef NODE_ALIGN 182 #undef ARG_LOC 183 #undef NEXTOPER 184 #undef PREVOPER 185 186 #define NODE_ALIGN(node) 187 #define ARG_LOC(p) (((struct regnode_1 *)p)->arg1) 188 #define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1) 189 #define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2) 190 #define NODE_STEP_REGNODE 1 /* sizeof(regnode)/sizeof(regnode) */ 191 #define EXTRA_STEP_2ARGS EXTRA_SIZE(struct regnode_2) 192 193 #define NODE_STEP_B 4 194 195 #define NEXTOPER(p) ((p) + NODE_STEP_REGNODE) 196 #define PREVOPER(p) ((p) - NODE_STEP_REGNODE) 197 198 #define FILL_ADVANCE_NODE(ptr, op) STMT_START { \ 199 (ptr)->type = op; (ptr)->next_off = 0; (ptr)++; } STMT_END 200 #define FILL_ADVANCE_NODE_ARG(ptr, op, arg) STMT_START { \ 201 ARG_SET(ptr, arg); FILL_ADVANCE_NODE(ptr, op); (ptr) += 1; } STMT_END 202 203 #define REG_MAGIC 0234 204 205 #define SIZE_ONLY (RExC_emit == &PL_regdummy) 206 207 /* Flags for node->flags of ANYOF */ 208 209 #define ANYOF_CLASS 0x08 /* has [[:blah:]] classes */ 210 #define ANYOF_INVERT 0x04 211 #define ANYOF_FOLD 0x02 212 #define ANYOF_LOCALE 0x01 213 214 /* Used for regstclass only */ 215 #define ANYOF_EOS 0x10 /* Can match an empty string too */ 216 217 /* There is a character or a range past 0xff */ 218 #define ANYOF_UNICODE 0x20 219 #define ANYOF_UNICODE_ALL 0x40 /* Can match any char past 0xff */ 220 221 /* size of node is large (includes class pointer) */ 222 #define ANYOF_LARGE 0x80 223 224 /* Are there any runtime flags on in this node? */ 225 #define ANYOF_RUNTIME(s) (ANYOF_FLAGS(s) & 0x0f) 226 227 #define ANYOF_FLAGS_ALL 0xff 228 229 /* Character classes for node->classflags of ANYOF */ 230 /* Should be synchronized with a table in regprop() */ 231 /* 2n should pair with 2n+1 */ 232 233 #define ANYOF_ALNUM 0 /* \w, PL_utf8_alnum, utf8::IsWord, ALNUM */ 234 #define ANYOF_NALNUM 1 235 #define ANYOF_SPACE 2 /* \s */ 236 #define ANYOF_NSPACE 3 237 #define ANYOF_DIGIT 4 238 #define ANYOF_NDIGIT 5 239 #define ANYOF_ALNUMC 6 /* isalnum(3), utf8::IsAlnum, ALNUMC */ 240 #define ANYOF_NALNUMC 7 241 #define ANYOF_ALPHA 8 242 #define ANYOF_NALPHA 9 243 #define ANYOF_ASCII 10 244 #define ANYOF_NASCII 11 245 #define ANYOF_CNTRL 12 246 #define ANYOF_NCNTRL 13 247 #define ANYOF_GRAPH 14 248 #define ANYOF_NGRAPH 15 249 #define ANYOF_LOWER 16 250 #define ANYOF_NLOWER 17 251 #define ANYOF_PRINT 18 252 #define ANYOF_NPRINT 19 253 #define ANYOF_PUNCT 20 254 #define ANYOF_NPUNCT 21 255 #define ANYOF_UPPER 22 256 #define ANYOF_NUPPER 23 257 #define ANYOF_XDIGIT 24 258 #define ANYOF_NXDIGIT 25 259 #define ANYOF_PSXSPC 26 /* POSIX space: \s plus the vertical tab */ 260 #define ANYOF_NPSXSPC 27 261 #define ANYOF_BLANK 28 /* GNU extension: space and tab: non-vertical space */ 262 #define ANYOF_NBLANK 29 263 264 #define ANYOF_MAX 32 265 266 /* Backward source code compatibility. */ 267 268 #define ANYOF_ALNUML ANYOF_ALNUM 269 #define ANYOF_NALNUML ANYOF_NALNUM 270 #define ANYOF_SPACEL ANYOF_SPACE 271 #define ANYOF_NSPACEL ANYOF_NSPACE 272 273 /* Utility macros for the bitmap and classes of ANYOF */ 274 275 #define ANYOF_SIZE (sizeof(struct regnode_charclass)) 276 #define ANYOF_CLASS_SIZE (sizeof(struct regnode_charclass_class)) 277 278 #define ANYOF_FLAGS(p) ((p)->flags) 279 280 #define ANYOF_BIT(c) (1 << ((c) & 7)) 281 282 #define ANYOF_CLASS_BYTE(p, c) (((struct regnode_charclass_class*)(p))->classflags[((c) >> 3) & 3]) 283 #define ANYOF_CLASS_SET(p, c) (ANYOF_CLASS_BYTE(p, c) |= ANYOF_BIT(c)) 284 #define ANYOF_CLASS_CLEAR(p, c) (ANYOF_CLASS_BYTE(p, c) &= ~ANYOF_BIT(c)) 285 #define ANYOF_CLASS_TEST(p, c) (ANYOF_CLASS_BYTE(p, c) & ANYOF_BIT(c)) 286 287 #define ANYOF_CLASS_ZERO(ret) Zero(((struct regnode_charclass_class*)(ret))->classflags, ANYOF_CLASSBITMAP_SIZE, char) 288 #define ANYOF_BITMAP_ZERO(ret) Zero(((struct regnode_charclass*)(ret))->bitmap, ANYOF_BITMAP_SIZE, char) 289 290 #define ANYOF_BITMAP(p) (((struct regnode_charclass*)(p))->bitmap) 291 #define ANYOF_BITMAP_BYTE(p, c) (ANYOF_BITMAP(p)[((c) >> 3) & 31]) 292 #define ANYOF_BITMAP_SET(p, c) (ANYOF_BITMAP_BYTE(p, c) |= ANYOF_BIT(c)) 293 #define ANYOF_BITMAP_CLEAR(p,c) (ANYOF_BITMAP_BYTE(p, c) &= ~ANYOF_BIT(c)) 294 #define ANYOF_BITMAP_TEST(p, c) (ANYOF_BITMAP_BYTE(p, c) & ANYOF_BIT(c)) 295 296 #define ANYOF_BITMAP_SETALL(p) \ 297 memset (ANYOF_BITMAP(p), 255, ANYOF_BITMAP_SIZE) 298 #define ANYOF_BITMAP_CLEARALL(p) \ 299 Zero (ANYOF_BITMAP(p), ANYOF_BITMAP_SIZE) 300 /* Check that all 256 bits are all set. Used in S_cl_is_anything() */ 301 #define ANYOF_BITMAP_TESTALLSET(p) \ 302 memEQ (ANYOF_BITMAP(p), "\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377\377", ANYOF_BITMAP_SIZE) 303 304 #define ANYOF_SKIP ((ANYOF_SIZE - 1)/sizeof(regnode)) 305 #define ANYOF_CLASS_SKIP ((ANYOF_CLASS_SIZE - 1)/sizeof(regnode)) 306 #define ANYOF_CLASS_ADD_SKIP (ANYOF_CLASS_SKIP - ANYOF_SKIP) 307 308 /* 309 * Utility definitions. 310 */ 311 #ifndef lint 312 #ifndef CHARMASK 313 #define UCHARAT(p) ((int)*(U8*)(p)) 314 #else 315 #define UCHARAT(p) ((int)*(p)&CHARMASK) 316 #endif 317 #else /* lint */ 318 #define UCHARAT(p) PL_regdummy 319 #endif /* lint */ 320 321 #define EXTRA_SIZE(guy) ((sizeof(guy)-1)/sizeof(struct regnode)) 322 323 #define REG_SEEN_ZERO_LEN 1 324 #define REG_SEEN_LOOKBEHIND 2 325 #define REG_SEEN_GPOS 4 326 #define REG_SEEN_EVAL 8 327 #define REG_SEEN_CANY 16 328 #define REG_SEEN_SANY REG_SEEN_CANY /* src bckwrd cmpt */ 329 330 START_EXTERN_C 331 332 #include "regnodes.h" 333 334 /* The following have no fixed length. U8 so we can do strchr() on it. */ 335 #ifndef DOINIT 336 EXTCONST U8 PL_varies[]; 337 #else 338 EXTCONST U8 PL_varies[] = { 339 BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, REFF, REFFL, 340 WHILEM, CURLYM, CURLYN, BRANCHJ, IFTHEN, SUSPEND, CLUMP, 0 341 }; 342 #endif 343 344 /* The following always have a length of 1. U8 we can do strchr() on it. */ 345 /* (Note that length 1 means "one character" under UTF8, not "one octet".) */ 346 #ifndef DOINIT 347 EXTCONST U8 PL_simple[]; 348 #else 349 EXTCONST U8 PL_simple[] = { 350 REG_ANY, SANY, CANY, 351 ANYOF, 352 ALNUM, ALNUML, 353 NALNUM, NALNUML, 354 SPACE, SPACEL, 355 NSPACE, NSPACEL, 356 DIGIT, NDIGIT, 357 0 358 }; 359 #endif 360 361 END_EXTERN_C 362 363 typedef struct re_scream_pos_data_s 364 { 365 char **scream_olds; /* match pos */ 366 I32 *scream_pos; /* Internal iterator of scream. */ 367 } re_scream_pos_data; 368 369 /* .what is a character array with one character for each member of .data 370 * The character describes the function of the corresponding .data item: 371 * f - start-class data for regstclass optimization 372 * n - Root of op tree for (?{EVAL}) item 373 * o - Start op for (?{EVAL}) item 374 * p - Pad for (?{EVAL} item 375 * s - swash for unicode-style character class, and the multicharacter 376 * strings resulting from casefolding the single-character entries 377 * in the character class 378 * 20010712 mjd@plover.com 379 * (Remember to update re_dup() and pregfree() if you add any items.) 380 */ 381 struct reg_data { 382 U32 count; 383 U8 *what; 384 void* data[1]; 385 }; 386 387 struct reg_substr_datum { 388 I32 min_offset; 389 I32 max_offset; 390 SV *substr; /* non-utf8 variant */ 391 SV *utf8_substr; /* utf8 variant */ 392 }; 393 394 struct reg_substr_data { 395 struct reg_substr_datum data[3]; /* Actual array */ 396 }; 397 398 #define anchored_substr substrs->data[0].substr 399 #define anchored_utf8 substrs->data[0].utf8_substr 400 #define anchored_offset substrs->data[0].min_offset 401 #define float_substr substrs->data[1].substr 402 #define float_utf8 substrs->data[1].utf8_substr 403 #define float_min_offset substrs->data[1].min_offset 404 #define float_max_offset substrs->data[1].max_offset 405 #define check_substr substrs->data[2].substr 406 #define check_utf8 substrs->data[2].utf8_substr 407 #define check_offset_min substrs->data[2].min_offset 408 #define check_offset_max substrs->data[2].max_offset 409