1 #ifdef PERL_EXT_RE_BUILD 2 #include "re_top.h" 3 #endif 4 5 #include "EXTERN.h" 6 #define PERL_IN_REGEX_ENGINE 7 #define PERL_IN_REGCOMP_ANY 8 #define PERL_IN_REGCOMP_TRIE_C 9 #include "perl.h" 10 11 #ifdef PERL_IN_XSUB_RE 12 # include "re_comp.h" 13 #else 14 # include "regcomp.h" 15 #endif 16 17 #include "invlist_inline.h" 18 #include "unicode_constants.h" 19 #include "regcomp_internal.h" 20 21 #define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ] 22 #define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid ) 23 #define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate ) 24 #define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \ 25 ? (TRIE_LIST_CUR( idx ) - 1) \ 26 : 0 ) 27 28 29 #ifdef DEBUGGING 30 /* 31 dump_trie(trie,widecharmap,revcharmap) 32 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc) 33 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc) 34 35 These routines dump out a trie in a somewhat readable format. 36 The _interim_ variants are used for debugging the interim 37 tables that are used to generate the final compressed 38 representation which is what dump_trie expects. 39 40 Part of the reason for their existence is to provide a form 41 of documentation as to how the different representations function. 42 43 */ 44 45 /* 46 Dumps the final compressed table form of the trie to Perl_debug_log. 47 Used for debugging make_trie(). 48 */ 49 50 STATIC void 51 S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, 52 AV *revcharmap, U32 depth) 53 { 54 U32 state; 55 SV *sv=sv_newmortal(); 56 int colwidth= widecharmap ? 6 : 4; 57 U16 word; 58 DECLARE_AND_GET_RE_DEBUG_FLAGS; 59 60 PERL_ARGS_ASSERT_DUMP_TRIE; 61 62 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ", 63 depth+1, "Match","Base","Ofs" ); 64 65 for( state = 0 ; state < trie->uniquecharcount ; state++ ) { 66 SV ** const tmp = av_fetch_simple( revcharmap, state, 0); 67 if ( tmp ) { 68 Perl_re_printf( aTHX_ "%*s", 69 colwidth, 70 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, 71 PL_colors[0], PL_colors[1], 72 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 73 PERL_PV_ESCAPE_FIRSTCHAR 74 ) 75 ); 76 } 77 } 78 Perl_re_printf( aTHX_ "\n"); 79 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1); 80 81 for( state = 0 ; state < trie->uniquecharcount ; state++ ) 82 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------"); 83 Perl_re_printf( aTHX_ "\n"); 84 85 for( state = 1 ; state < trie->statecount ; state++ ) { 86 const U32 base = trie->states[ state ].trans.base; 87 88 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state); 89 90 if ( trie->states[ state ].wordnum ) { 91 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum ); 92 } else { 93 Perl_re_printf( aTHX_ "%6s", "" ); 94 } 95 96 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base ); 97 98 if ( base ) { 99 U32 ofs = 0; 100 101 while( ( base + ofs < trie->uniquecharcount ) || 102 ( base + ofs - trie->uniquecharcount < trie->lasttrans 103 && trie->trans[ base + ofs - trie->uniquecharcount ].check 104 != state)) 105 ofs++; 106 107 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs); 108 109 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { 110 if ( ( base + ofs >= trie->uniquecharcount ) 111 && ( base + ofs - trie->uniquecharcount 112 < trie->lasttrans ) 113 && trie->trans[ base + ofs 114 - trie->uniquecharcount ].check == state ) 115 { 116 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, 117 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next 118 ); 119 } else { 120 Perl_re_printf( aTHX_ "%*s", colwidth," ." ); 121 } 122 } 123 124 Perl_re_printf( aTHX_ "]"); 125 126 } 127 Perl_re_printf( aTHX_ "\n" ); 128 } 129 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=", 130 depth); 131 for (word=1; word <= trie->wordcount; word++) { 132 Perl_re_printf( aTHX_ " %d:(%d,%d)", 133 (int)word, (int)(trie->wordinfo[word].prev), 134 (int)(trie->wordinfo[word].len)); 135 } 136 Perl_re_printf( aTHX_ "\n" ); 137 } 138 /* 139 Dumps a fully constructed but uncompressed trie in list form. 140 List tries normally only are used for construction when the number of 141 possible chars (trie->uniquecharcount) is very high. 142 Used for debugging make_trie(). 143 */ 144 STATIC void 145 S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, 146 HV *widecharmap, AV *revcharmap, U32 next_alloc, 147 U32 depth) 148 { 149 U32 state; 150 SV *sv=sv_newmortal(); 151 int colwidth= widecharmap ? 6 : 4; 152 DECLARE_AND_GET_RE_DEBUG_FLAGS; 153 154 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST; 155 156 /* print out the table precompression. */ 157 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n", 158 depth+1 ); 159 Perl_re_indentf( aTHX_ "%s", 160 depth+1, "------:-----+-----------------\n" ); 161 162 for( state=1 ; state < next_alloc ; state ++ ) { 163 U16 charid; 164 165 Perl_re_indentf( aTHX_ " %4" UVXf " :", 166 depth+1, (UV)state ); 167 if ( ! trie->states[ state ].wordnum ) { 168 Perl_re_printf( aTHX_ "%5s| ",""); 169 } else { 170 Perl_re_printf( aTHX_ "W%4x| ", 171 trie->states[ state ].wordnum 172 ); 173 } 174 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) { 175 SV ** const tmp = av_fetch_simple( revcharmap, 176 TRIE_LIST_ITEM(state, charid).forid, 0); 177 if ( tmp ) { 178 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ", 179 colwidth, 180 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 181 colwidth, 182 PL_colors[0], PL_colors[1], 183 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) 184 | PERL_PV_ESCAPE_FIRSTCHAR 185 ) , 186 TRIE_LIST_ITEM(state, charid).forid, 187 (UV)TRIE_LIST_ITEM(state, charid).newstate 188 ); 189 if (!(charid % 10)) 190 Perl_re_printf( aTHX_ "\n%*s| ", 191 (int)((depth * 2) + 14), ""); 192 } 193 } 194 Perl_re_printf( aTHX_ "\n"); 195 } 196 } 197 198 /* 199 Dumps a fully constructed but uncompressed trie in table form. 200 This is the normal DFA style state transition table, with a few 201 twists to facilitate compression later. 202 Used for debugging make_trie(). 203 */ 204 STATIC void 205 S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, 206 HV *widecharmap, AV *revcharmap, U32 next_alloc, 207 U32 depth) 208 { 209 U32 state; 210 U16 charid; 211 SV *sv=sv_newmortal(); 212 int colwidth= widecharmap ? 6 : 4; 213 DECLARE_AND_GET_RE_DEBUG_FLAGS; 214 215 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE; 216 217 /* 218 print out the table precompression so that we can do a visual check 219 that they are identical. 220 */ 221 222 Perl_re_indentf( aTHX_ "Char : ", depth+1 ); 223 224 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { 225 SV ** const tmp = av_fetch_simple( revcharmap, charid, 0); 226 if ( tmp ) { 227 Perl_re_printf( aTHX_ "%*s", 228 colwidth, 229 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, 230 PL_colors[0], PL_colors[1], 231 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 232 PERL_PV_ESCAPE_FIRSTCHAR 233 ) 234 ); 235 } 236 } 237 238 Perl_re_printf( aTHX_ "\n"); 239 Perl_re_indentf( aTHX_ "State+-", depth+1 ); 240 241 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) { 242 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------"); 243 } 244 245 Perl_re_printf( aTHX_ "\n" ); 246 247 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) { 248 249 Perl_re_indentf( aTHX_ "%4" UVXf " : ", 250 depth+1, 251 (UV)TRIE_NODENUM( state ) ); 252 253 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { 254 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ); 255 if (v) 256 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v ); 257 else 258 Perl_re_printf( aTHX_ "%*s", colwidth, "." ); 259 } 260 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { 261 Perl_re_printf( aTHX_ " (%4" UVXf ")\n", 262 (UV)trie->trans[ state ].check ); 263 } else { 264 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n", 265 (UV)trie->trans[ state ].check, 266 trie->states[ TRIE_NODENUM( state ) ].wordnum ); 267 } 268 } 269 } 270 271 #endif 272 273 274 /* make_trie(startbranch,first,last,tail,word_count,flags,depth) 275 startbranch: the first branch in the whole branch sequence 276 first : start branch of sequence of branch-exact nodes. 277 May be the same as startbranch 278 last : Thing following the last branch. 279 May be the same as tail. 280 tail : item following the branch sequence 281 count : words in the sequence 282 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/ 283 depth : indent depth 284 285 Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. 286 287 A trie is an N'ary tree where the branches are determined by digital 288 decomposition of the key. IE, at the root node you look up the 1st character and 289 follow that branch repeat until you find the end of the branches. Nodes can be 290 marked as "accepting" meaning they represent a complete word. Eg: 291 292 /he|she|his|hers/ 293 294 would convert into the following structure. Numbers represent states, letters 295 following numbers represent valid transitions on the letter from that state, if 296 the number is in square brackets it represents an accepting state, otherwise it 297 will be in parenthesis. 298 299 +-h->+-e->[3]-+-r->(8)-+-s->[9] 300 | | 301 | (2) 302 | | 303 (1) +-i->(6)-+-s->[7] 304 | 305 +-s->(3)-+-h->(4)-+-e->[5] 306 307 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers) 308 309 This shows that when matching against the string 'hers' we will begin at state 1 310 read 'h' and move to state 2, read 'e' and move to state 3 which is accepting, 311 then read 'r' and go to state 8 followed by 's' which takes us to state 9 which 312 is also accepting. Thus we know that we can match both 'he' and 'hers' with a 313 single traverse. We store a mapping from accepting to state to which word was 314 matched, and then when we have multiple possibilities we try to complete the 315 rest of the regex in the order in which they occurred in the alternation. 316 317 The only prior NFA like behaviour that would be changed by the TRIE support is 318 the silent ignoring of duplicate alternations which are of the form: 319 320 / (DUPE|DUPE) X? (?{ ... }) Y /x 321 322 Thus EVAL blocks following a trie may be called a different number of times with 323 and without the optimisation. With the optimisations dupes will be silently 324 ignored. This inconsistent behaviour of EVAL type nodes is well established as 325 the following demonstrates: 326 327 'words'=~/(word|word|word)(?{ print $1 })[xyz]/ 328 329 which prints out 'word' three times, but 330 331 'words'=~/(word|word|word)(?{ print $1 })S/ 332 333 which doesnt print it out at all. This is due to other optimisations kicking in. 334 335 Example of what happens on a structural level: 336 337 The regexp /(ac|ad|ab)+/ will produce the following debug output: 338 339 1: CURLYM[1] {1,32767}(18) 340 5: BRANCH(8) 341 6: EXACT <ac>(16) 342 8: BRANCH(11) 343 9: EXACT <ad>(16) 344 11: BRANCH(14) 345 12: EXACT <ab>(16) 346 16: SUCCEED(0) 347 17: NOTHING(18) 348 18: END(0) 349 350 This would be optimizable with startbranch=5, first=5, last=16, tail=16 351 and should turn into: 352 353 1: CURLYM[1] {1,32767}(18) 354 5: TRIE(16) 355 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1] 356 <ac> 357 <ad> 358 <ab> 359 16: SUCCEED(0) 360 17: NOTHING(18) 361 18: END(0) 362 363 Cases where tail != last would be like /(?foo|bar)baz/: 364 365 1: BRANCH(4) 366 2: EXACT <foo>(8) 367 4: BRANCH(7) 368 5: EXACT <bar>(8) 369 7: TAIL(8) 370 8: EXACT <baz>(10) 371 10: END(0) 372 373 which would be optimizable with startbranch=1, first=1, last=7, tail=8 374 and would end up looking like: 375 376 1: TRIE(8) 377 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1] 378 <foo> 379 <bar> 380 7: TAIL(8) 381 8: EXACT <baz>(10) 382 10: END(0) 383 384 d = uvchr_to_utf8_flags(d, uv, 0); 385 386 is the recommended Unicode-aware way of saying 387 388 *(d++) = uv; 389 */ 390 391 #define TRIE_STORE_REVCHAR(val) \ 392 STMT_START { \ 393 if (UTF) { \ 394 SV *zlopp = newSV(UTF8_MAXBYTES); \ 395 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ 396 unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \ 397 *kapow = '\0'; \ 398 SvCUR_set(zlopp, kapow - flrbbbbb); \ 399 SvPOK_on(zlopp); \ 400 SvUTF8_on(zlopp); \ 401 av_push_simple(revcharmap, zlopp); \ 402 } else { \ 403 char ooooff = (char)val; \ 404 av_push_simple(revcharmap, newSVpvn(&ooooff, 1)); \ 405 } \ 406 } STMT_END 407 408 /* This gets the next character from the input, folding it if not already 409 * folded. */ 410 #define TRIE_READ_CHAR STMT_START { \ 411 wordlen++; \ 412 if ( UTF ) { \ 413 /* if it is UTF then it is either already folded, or does not need \ 414 * folding */ \ 415 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \ 416 } \ 417 else if (folder == PL_fold_latin1) { \ 418 /* This folder implies Unicode rules, which in the range expressible \ 419 * by not UTF is the lower case, with the two exceptions, one of \ 420 * which should have been taken care of before calling this */ \ 421 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \ 422 uvc = toLOWER_L1(*uc); \ 423 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \ 424 len = 1; \ 425 } else { \ 426 /* raw data, will be folded later if needed */ \ 427 uvc = (U32)*uc; \ 428 len = 1; \ 429 } \ 430 } STMT_END 431 432 433 434 #define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \ 435 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \ 436 U32 ging = TRIE_LIST_LEN( state ) * 2; \ 437 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \ 438 TRIE_LIST_LEN( state ) = ging; \ 439 } \ 440 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \ 441 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \ 442 TRIE_LIST_CUR( state )++; \ 443 } STMT_END 444 445 #define TRIE_LIST_NEW(state) STMT_START { \ 446 Newx( trie->states[ state ].trans.list, \ 447 4, reg_trie_trans_le ); \ 448 TRIE_LIST_CUR( state ) = 1; \ 449 TRIE_LIST_LEN( state ) = 4; \ 450 } STMT_END 451 452 #define TRIE_HANDLE_WORD(state) STMT_START { \ 453 U16 dupe= trie->states[ state ].wordnum; \ 454 regnode * const noper_next = regnext( noper ); \ 455 \ 456 DEBUG_r({ \ 457 /* store the word for dumping */ \ 458 SV* tmp; \ 459 if (OP(noper) != NOTHING) \ 460 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \ 461 else \ 462 tmp = newSVpvn_utf8( "", 0, UTF ); \ 463 av_push_simple( trie_words, tmp ); \ 464 }); \ 465 \ 466 curword++; \ 467 trie->wordinfo[curword].prev = 0; \ 468 trie->wordinfo[curword].len = wordlen; \ 469 trie->wordinfo[curword].accept = state; \ 470 \ 471 if ( noper_next < tail ) { \ 472 if (!trie->jump) { \ 473 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \ 474 sizeof(U16) ); \ 475 trie->j_before_paren = (U16 *) PerlMemShared_calloc( word_count + 1, \ 476 sizeof(U16) ); \ 477 trie->j_after_paren = (U16 *) PerlMemShared_calloc( word_count + 1, \ 478 sizeof(U16) ); \ 479 } \ 480 trie->jump[curword] = (U16)(noper_next - convert); \ 481 U16 set_before_paren; \ 482 U16 set_after_paren; \ 483 if (OP(cur) == BRANCH) { \ 484 set_before_paren = ARG1a(cur); \ 485 set_after_paren = ARG1b(cur); \ 486 } else { \ 487 set_before_paren = ARG2a(cur); \ 488 set_after_paren = ARG2b(cur); \ 489 } \ 490 trie->j_before_paren[curword] = set_before_paren; \ 491 trie->j_after_paren[curword] = set_after_paren; \ 492 if (!jumper) \ 493 jumper = noper_next; \ 494 if (!nextbranch) \ 495 nextbranch= regnext(cur); \ 496 } \ 497 \ 498 if ( dupe ) { \ 499 /* It's a dupe. Pre-insert into the wordinfo[].prev */\ 500 /* chain, so that when the bits of chain are later */\ 501 /* linked together, the dups appear in the chain */\ 502 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \ 503 trie->wordinfo[dupe].prev = curword; \ 504 } else { \ 505 /* we haven't inserted this word yet. */ \ 506 trie->states[ state ].wordnum = curword; \ 507 } \ 508 } STMT_END 509 510 511 #define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \ 512 ( ( base + charid >= ucharcount \ 513 && base + charid < ubound \ 514 && state == trie->trans[ base - ucharcount + charid ].check \ 515 && trie->trans[ base - ucharcount + charid ].next ) \ 516 ? trie->trans[ base - ucharcount + charid ].next \ 517 : ( state==1 ? special : 0 ) \ 518 ) 519 520 /* Helper function for make_trie(): set a trie bit for both the character 521 * and its folded variant, and for the first byte of a variant codepoint, 522 * if any */ 523 524 STATIC void 525 S_trie_bitmap_set_folded(pTHX_ RExC_state_t *pRExC_state, 526 reg_trie_data *trie, U8 ch, const U8 * folder) 527 { 528 TRIE_BITMAP_SET(trie, ch); 529 /* store the folded codepoint */ 530 if ( folder ) 531 TRIE_BITMAP_SET(trie, folder[ch]); 532 533 if ( !UTF ) { 534 /* store first byte of utf8 representation of */ 535 /* variant codepoints */ 536 if (! UVCHR_IS_INVARIANT(ch)) { 537 U8 hi = UTF8_TWO_BYTE_HI(ch); 538 /* Note that hi will be either 0xc2 or 0xc3 (apart from EBCDIC 539 * systems), and TRIE_BITMAP_SET() will do >>3 to get the byte 540 * offset within the bit table, which is constant, and 541 * Coverity complained about this (CID 488118). */ 542 TRIE_BITMAP_SET(trie, hi); 543 } 544 } 545 } 546 547 548 I32 549 Perl_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, 550 regnode *first, regnode *last, regnode *tail, 551 U32 word_count, U32 flags, U32 depth) 552 { 553 /* first pass, loop through and scan words */ 554 reg_trie_data *trie; 555 HV *widecharmap = NULL; 556 AV *revcharmap = newAV(); 557 regnode *cur; 558 STRLEN len = 0; 559 UV uvc = 0; 560 U16 curword = 0; 561 U32 next_alloc = 0; 562 regnode *jumper = NULL; 563 regnode *nextbranch = NULL; 564 regnode *lastbranch = NULL; 565 regnode *convert = NULL; 566 U32 *prev_states; /* temp array mapping each state to previous one */ 567 /* we just use folder as a flag in utf8 */ 568 const U8 * folder = NULL; 569 570 /* in the below reg_add_data call we are storing either 'tu' or 'tuaa' 571 * which stands for one trie structure, one hash, optionally followed 572 * by two arrays */ 573 #ifdef DEBUGGING 574 const U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN("tuaa")); 575 AV *trie_words = NULL; 576 /* along with revcharmap, this only used during construction but both are 577 * useful during debugging so we store them in the struct when debugging. 578 */ 579 #else 580 const U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN("tu")); 581 STRLEN trie_charcount=0; 582 #endif 583 SV *re_trie_maxbuff; 584 DECLARE_AND_GET_RE_DEBUG_FLAGS; 585 586 PERL_ARGS_ASSERT_MAKE_TRIE; 587 #ifndef DEBUGGING 588 PERL_UNUSED_ARG(depth); 589 #endif 590 591 switch (flags) { 592 case EXACT: case EXACT_REQ8: case EXACTL: break; 593 case EXACTFAA: 594 case EXACTFUP: 595 case EXACTFU: 596 case EXACTFLU8: folder = PL_fold_latin1; break; 597 case EXACTF: folder = PL_fold; break; 598 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, REGNODE_NAME(flags) ); 599 } 600 601 /* create the trie struct, all zeroed */ 602 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); 603 trie->refcount = 1; 604 trie->startstate = 1; 605 trie->wordcount = word_count; 606 RExC_rxi->data->data[ data_slot ] = (void*)trie; 607 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); 608 if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL) 609 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); 610 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc( 611 trie->wordcount+1, sizeof(reg_trie_wordinfo)); 612 613 DEBUG_r({ 614 trie_words = newAV(); 615 }); 616 617 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD); 618 assert(re_trie_maxbuff); 619 if (!SvIOK(re_trie_maxbuff)) { 620 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); 621 } 622 DEBUG_TRIE_COMPILE_r({ 623 Perl_re_indentf( aTHX_ 624 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", 625 depth+1, 626 REG_NODE_NUM(startbranch), REG_NODE_NUM(first), 627 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth); 628 }); 629 630 /* Find the node we are going to overwrite */ 631 if ( first == startbranch && OP( last ) != BRANCH ) { 632 /* whole branch chain */ 633 convert = first; 634 } else { 635 /* branch sub-chain */ 636 convert = REGNODE_AFTER( first ); 637 } 638 639 /* -- First loop and Setup -- 640 641 We first traverse the branches and scan each word to determine if it 642 contains widechars, and how many unique chars there are, this is 643 important as we have to build a table with at least as many columns as we 644 have unique chars. 645 646 We use an array of integers to represent the character codes 0..255 647 (trie->charmap) and we use a an HV* to store Unicode characters. We use 648 the native representation of the character value as the key and IV's for 649 the coded index. 650 651 *TODO* If we keep track of how many times each character is used we can 652 remap the columns so that the table compression later on is more 653 efficient in terms of memory by ensuring the most common value is in the 654 middle and the least common are on the outside. IMO this would be better 655 than a most to least common mapping as theres a decent chance the most 656 common letter will share a node with the least common, meaning the node 657 will not be compressible. With a middle is most common approach the worst 658 case is when we have the least common nodes twice. 659 660 */ 661 662 for ( cur = first ; cur < last ; cur = regnext( cur ) ) { 663 regnode *noper = REGNODE_AFTER( cur ); 664 const U8 *uc; 665 const U8 *e; 666 int foldlen = 0; 667 U32 wordlen = 0; /* required init */ 668 STRLEN minchars = 0; 669 STRLEN maxchars = 0; 670 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the 671 bitmap?*/ 672 673 /* wordlen is needed for the TRIE_READ_CHAR() macro, but we don't use its 674 value in this scope, we only modify it. clang 17 warns about this. 675 The later definitions of wordlen in this function do have their values 676 used. 677 */ 678 PERL_UNUSED_VAR(wordlen); 679 680 lastbranch = cur; 681 682 if (OP(noper) == NOTHING) { 683 /* skip past a NOTHING at the start of an alternation 684 * eg, /(?:)a|(?:b)/ should be the same as /a|b/ 685 * 686 * If the next node is not something we are supposed to process 687 * we will just ignore it due to the condition guarding the 688 * next block. 689 */ 690 691 regnode *noper_next= regnext(noper); 692 if (noper_next < tail) 693 noper= noper_next; 694 } 695 696 if ( noper < tail 697 && ( OP(noper) == flags 698 || (flags == EXACT && OP(noper) == EXACT_REQ8) 699 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8 700 || OP(noper) == EXACTFUP)))) 701 { 702 uc= (U8*)STRING(noper); 703 e= uc + STR_LEN(noper); 704 } else { 705 trie->minlen= 0; 706 continue; 707 } 708 709 710 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */ 711 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte 712 regardless of encoding */ 713 if (OP( noper ) == EXACTFUP) { 714 /* false positives are ok, so just set this */ 715 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S); 716 } 717 } 718 719 for ( ; uc < e ; uc += len ) { /* Look at each char in the current 720 branch */ 721 TRIE_CHARCOUNT(trie)++; 722 TRIE_READ_CHAR; 723 724 /* TRIE_READ_CHAR returns the current character, or its fold if /i 725 * is in effect. Under /i, this character can match itself, or 726 * anything that folds to it. If not under /i, it can match just 727 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN 728 * all fold to k, and all are single characters. But some folds 729 * expand to more than one character, so for example LATIN SMALL 730 * LIGATURE FFI folds to the three character sequence 'ffi'. If 731 * the string beginning at 'uc' is 'ffi', it could be matched by 732 * three characters, or just by the one ligature character. (It 733 * could also be matched by two characters: LATIN SMALL LIGATURE FF 734 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI). 735 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also 736 * match.) The trie needs to know the minimum and maximum number 737 * of characters that could match so that it can use size alone to 738 * quickly reject many match attempts. The max is simple: it is 739 * the number of folded characters in this branch (since a fold is 740 * never shorter than what folds to it. */ 741 742 maxchars++; 743 744 /* And the min is equal to the max if not under /i (indicated by 745 * 'folder' being NULL), or there are no multi-character folds. If 746 * there is a multi-character fold, the min is incremented just 747 * once, for the character that folds to the sequence. Each 748 * character in the sequence needs to be added to the list below of 749 * characters in the trie, but we count only the first towards the 750 * min number of characters needed. This is done through the 751 * variable 'foldlen', which is returned by the macros that look 752 * for these sequences as the number of bytes the sequence 753 * occupies. Each time through the loop, we decrement 'foldlen' by 754 * how many bytes the current char occupies. Only when it reaches 755 * 0 do we increment 'minchars' or look for another multi-character 756 * sequence. */ 757 if (folder == NULL) { 758 minchars++; 759 } 760 else if (foldlen > 0) { 761 foldlen -= (UTF) ? UTF8SKIP(uc) : 1; 762 } 763 else { 764 minchars++; 765 766 /* See if *uc is the beginning of a multi-character fold. If 767 * so, we decrement the length remaining to look at, to account 768 * for the current character this iteration. (We can use 'uc' 769 * instead of the fold returned by TRIE_READ_CHAR because the 770 * macro is smart enough to account for any unfolded 771 * characters. */ 772 if (UTF) { 773 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) { 774 foldlen -= UTF8SKIP(uc); 775 } 776 } 777 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) { 778 foldlen--; 779 } 780 } 781 782 /* The current character (and any potential folds) should be added 783 * to the possible matching characters for this position in this 784 * branch */ 785 if ( uvc < 256 ) { 786 if ( folder ) { 787 U8 folded= folder[ (U8) uvc ]; 788 if ( !trie->charmap[ folded ] ) { 789 trie->charmap[ folded ]=( ++trie->uniquecharcount ); 790 TRIE_STORE_REVCHAR( folded ); 791 } 792 } 793 if ( !trie->charmap[ uvc ] ) { 794 trie->charmap[ uvc ]=( ++trie->uniquecharcount ); 795 TRIE_STORE_REVCHAR( uvc ); 796 } 797 if ( set_bit ) { 798 /* store the codepoint in the bitmap, and its folded 799 * equivalent. */ 800 S_trie_bitmap_set_folded(aTHX_ pRExC_state, trie, uvc, folder); 801 set_bit = 0; /* We've done our bit :-) */ 802 } 803 } else { 804 805 /* XXX We could come up with the list of code points that fold 806 * to this using PL_utf8_foldclosures, except not for 807 * multi-char folds, as there may be multiple combinations 808 * there that could work, which needs to wait until runtime to 809 * resolve (The comment about LIGATURE FFI above is such an 810 * example */ 811 812 SV** svpp; 813 if ( !widecharmap ) 814 widecharmap = newHV(); 815 816 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 ); 817 818 if ( !svpp ) 819 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc ); 820 821 if ( !SvTRUE( *svpp ) ) { 822 sv_setiv( *svpp, ++trie->uniquecharcount ); 823 TRIE_STORE_REVCHAR(uvc); 824 } 825 } 826 } /* end loop through characters in this branch of the trie */ 827 828 /* We take the min and max for this branch and combine to find the min 829 * and max for all branches processed so far */ 830 if( cur == first ) { 831 trie->minlen = minchars; 832 trie->maxlen = maxchars; 833 } else if (minchars < trie->minlen) { 834 trie->minlen = minchars; 835 } else if (maxchars > trie->maxlen) { 836 trie->maxlen = maxchars; 837 } 838 } /* end first pass */ 839 trie->before_paren = OP(first) == BRANCH 840 ? ARG1a(first) 841 : ARG2a(first); /* BRANCHJ */ 842 843 trie->after_paren = OP(lastbranch) == BRANCH 844 ? ARG1b(lastbranch) 845 : ARG2b(lastbranch); /* BRANCHJ */ 846 DEBUG_TRIE_COMPILE_r( 847 Perl_re_indentf( aTHX_ 848 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", 849 depth+1, 850 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, 851 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, 852 (int)trie->minlen, (int)trie->maxlen ) 853 ); 854 855 /* 856 We now know what we are dealing with in terms of unique chars and 857 string sizes so we can calculate how much memory a naive 858 representation using a flat table will take. If it's over a reasonable 859 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory 860 conservative but potentially much slower representation using an array 861 of lists. 862 863 At the end we convert both representations into the same compressed 864 form that will be used in regexec.c for matching with. The latter 865 is a form that cannot be used to construct with but has memory 866 properties similar to the list form and access properties similar 867 to the table form making it both suitable for fast searches and 868 small enough that its feasable to store for the duration of a program. 869 870 See the comment in the code where the compressed table is produced 871 inplace from the flat tabe representation for an explanation of how 872 the compression works. 873 874 */ 875 876 877 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32); 878 prev_states[1] = 0; 879 880 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) 881 > SvIV(re_trie_maxbuff) ) 882 { 883 /* 884 Second Pass -- Array Of Lists Representation 885 886 Each state will be represented by a list of charid:state records 887 (reg_trie_trans_le) the first such element holds the CUR and LEN 888 points of the allocated array. (See defines above). 889 890 We build the initial structure using the lists, and then convert 891 it into the compressed table form which allows faster lookups 892 (but cant be modified once converted). 893 */ 894 895 STRLEN transcount = 1; 896 897 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n", 898 depth+1)); 899 900 trie->states = (reg_trie_state *) 901 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, 902 sizeof(reg_trie_state) ); 903 TRIE_LIST_NEW(1); 904 next_alloc = 2; 905 906 for ( cur = first ; cur < last ; cur = regnext( cur ) ) { 907 908 regnode *noper = REGNODE_AFTER( cur ); 909 U32 state = 1; /* required init */ 910 U16 charid = 0; /* sanity init */ 911 U32 wordlen = 0; /* required init */ 912 913 if (OP(noper) == NOTHING) { 914 regnode *noper_next= regnext(noper); 915 if (noper_next < tail) 916 noper= noper_next; 917 /* we will undo this assignment if noper does not 918 * point at a trieable type in the else clause of 919 * the following statement. */ 920 } 921 922 if ( noper < tail 923 && ( OP(noper) == flags 924 || (flags == EXACT && OP(noper) == EXACT_REQ8) 925 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8 926 || OP(noper) == EXACTFUP)))) 927 { 928 const U8 *uc= (U8*)STRING(noper); 929 const U8 *e= uc + STR_LEN(noper); 930 931 for ( ; uc < e ; uc += len ) { 932 933 TRIE_READ_CHAR; 934 935 if ( uvc < 256 ) { 936 charid = trie->charmap[ uvc ]; 937 } else { 938 SV** const svpp = hv_fetch( widecharmap, 939 (char*)&uvc, 940 sizeof( UV ), 941 0); 942 if ( !svpp ) { 943 charid = 0; 944 } else { 945 charid=(U16)SvIV( *svpp ); 946 } 947 } 948 /* charid is now 0 if we dont know the char read, or 949 * nonzero if we do */ 950 if ( charid ) { 951 952 U16 check; 953 U32 newstate = 0; 954 955 charid--; 956 if ( !trie->states[ state ].trans.list ) { 957 TRIE_LIST_NEW( state ); 958 } 959 for ( check = 1; 960 check <= TRIE_LIST_USED( state ); 961 check++ ) 962 { 963 if ( TRIE_LIST_ITEM( state, check ).forid 964 == charid ) 965 { 966 newstate = TRIE_LIST_ITEM( state, check ).newstate; 967 break; 968 } 969 } 970 if ( ! newstate ) { 971 newstate = next_alloc++; 972 prev_states[newstate] = state; 973 TRIE_LIST_PUSH( state, charid, newstate ); 974 transcount++; 975 } 976 state = newstate; 977 } else { 978 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc ); 979 } 980 } 981 } else { 982 /* If we end up here it is because we skipped past a NOTHING, but did not end up 983 * on a trieable type. So we need to reset noper back to point at the first regop 984 * in the branch before we call TRIE_HANDLE_WORD() 985 */ 986 noper= REGNODE_AFTER(cur); 987 } 988 TRIE_HANDLE_WORD(state); 989 990 } /* end second pass */ 991 992 /* next alloc is the NEXT state to be allocated */ 993 trie->statecount = next_alloc; 994 trie->states = (reg_trie_state *) 995 PerlMemShared_realloc( trie->states, 996 next_alloc 997 * sizeof(reg_trie_state) ); 998 999 /* and now dump it out before we compress it */ 1000 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap, 1001 revcharmap, next_alloc, 1002 depth+1) 1003 ); 1004 1005 trie->trans = (reg_trie_trans *) 1006 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) ); 1007 { 1008 U32 state; 1009 U32 tp = 0; 1010 U32 zp = 0; 1011 1012 1013 for( state=1 ; state < next_alloc ; state ++ ) { 1014 U32 base=0; 1015 1016 /* 1017 DEBUG_TRIE_COMPILE_MORE_r( 1018 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp) 1019 ); 1020 */ 1021 1022 if (trie->states[state].trans.list) { 1023 U16 minid=TRIE_LIST_ITEM( state, 1).forid; 1024 U16 maxid=minid; 1025 U16 idx; 1026 1027 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) { 1028 const U16 forid = TRIE_LIST_ITEM( state, idx).forid; 1029 if ( forid < minid ) { 1030 minid=forid; 1031 } else if ( forid > maxid ) { 1032 maxid=forid; 1033 } 1034 } 1035 if ( transcount < tp + maxid - minid + 1) { 1036 transcount *= 2; 1037 trie->trans = (reg_trie_trans *) 1038 PerlMemShared_realloc( trie->trans, 1039 transcount 1040 * sizeof(reg_trie_trans) ); 1041 Zero( trie->trans + (transcount / 2), 1042 transcount / 2, 1043 reg_trie_trans ); 1044 } 1045 base = trie->uniquecharcount + tp - minid; 1046 if ( maxid == minid ) { 1047 U32 set = 0; 1048 for ( ; zp < tp ; zp++ ) { 1049 if ( ! trie->trans[ zp ].next ) { 1050 base = trie->uniquecharcount + zp - minid; 1051 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1052 1).newstate; 1053 trie->trans[ zp ].check = state; 1054 set = 1; 1055 break; 1056 } 1057 } 1058 if ( !set ) { 1059 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1060 1).newstate; 1061 trie->trans[ tp ].check = state; 1062 tp++; 1063 zp = tp; 1064 } 1065 } else { 1066 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) { 1067 const U32 tid = base 1068 - trie->uniquecharcount 1069 + TRIE_LIST_ITEM( state, idx ).forid; 1070 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, 1071 idx ).newstate; 1072 trie->trans[ tid ].check = state; 1073 } 1074 tp += ( maxid - minid + 1 ); 1075 } 1076 Safefree(trie->states[ state ].trans.list); 1077 } 1078 /* 1079 DEBUG_TRIE_COMPILE_MORE_r( 1080 Perl_re_printf( aTHX_ " base: %d\n",base); 1081 ); 1082 */ 1083 trie->states[ state ].trans.base=base; 1084 } 1085 trie->lasttrans = tp + 1; 1086 } 1087 } else { 1088 /* 1089 Second Pass -- Flat Table Representation. 1090 1091 we dont use the 0 slot of either trans[] or states[] so we add 1 to 1092 each. We know that we will need Charcount+1 trans at most to store 1093 the data (one row per char at worst case) So we preallocate both 1094 structures assuming worst case. 1095 1096 We then construct the trie using only the .next slots of the entry 1097 structs. 1098 1099 We use the .check field of the first entry of the node temporarily 1100 to make compression both faster and easier by keeping track of how 1101 many non zero fields are in the node. 1102 1103 Since trans are numbered from 1 any 0 pointer in the table is a FAIL 1104 transition. 1105 1106 There are two terms at use here: state as a TRIE_NODEIDX() which is 1107 a number representing the first entry of the node, and state as a 1108 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) 1109 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) 1110 if there are 2 entrys per node. eg: 1111 1112 A B A B 1113 1. 2 4 1. 3 7 1114 2. 0 3 3. 0 5 1115 3. 0 0 5. 0 0 1116 4. 0 0 7. 0 0 1117 1118 The table is internally in the right hand, idx form. However as we 1119 also have to deal with the states array which is indexed by nodenum 1120 we have to use TRIE_NODENUM() to convert. 1121 1122 */ 1123 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n", 1124 depth+1)); 1125 1126 trie->trans = (reg_trie_trans *) 1127 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 ) 1128 * trie->uniquecharcount + 1, 1129 sizeof(reg_trie_trans) ); 1130 trie->states = (reg_trie_state *) 1131 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, 1132 sizeof(reg_trie_state) ); 1133 next_alloc = trie->uniquecharcount + 1; 1134 1135 1136 for ( cur = first ; cur < last ; cur = regnext( cur ) ) { 1137 1138 regnode *noper = REGNODE_AFTER( cur ); 1139 1140 U32 state = 1; /* required init */ 1141 1142 U16 charid = 0; /* sanity init */ 1143 U32 accept_state = 0; /* sanity init */ 1144 1145 U32 wordlen = 0; /* required init */ 1146 1147 if (OP(noper) == NOTHING) { 1148 regnode *noper_next= regnext(noper); 1149 if (noper_next < tail) 1150 noper= noper_next; 1151 /* we will undo this assignment if noper does not 1152 * point at a trieable type in the else clause of 1153 * the following statement. */ 1154 } 1155 1156 if ( noper < tail 1157 && ( OP(noper) == flags 1158 || (flags == EXACT && OP(noper) == EXACT_REQ8) 1159 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8 1160 || OP(noper) == EXACTFUP)))) 1161 { 1162 const U8 *uc= (U8*)STRING(noper); 1163 const U8 *e= uc + STR_LEN(noper); 1164 1165 for ( ; uc < e ; uc += len ) { 1166 1167 TRIE_READ_CHAR; 1168 1169 if ( uvc < 256 ) { 1170 charid = trie->charmap[ uvc ]; 1171 } else { 1172 SV* const * const svpp = hv_fetch( widecharmap, 1173 (char*)&uvc, 1174 sizeof( UV ), 1175 0); 1176 charid = svpp ? (U16)SvIV(*svpp) : 0; 1177 } 1178 if ( charid ) { 1179 charid--; 1180 if ( !trie->trans[ state + charid ].next ) { 1181 trie->trans[ state + charid ].next = next_alloc; 1182 trie->trans[ state ].check++; 1183 prev_states[TRIE_NODENUM(next_alloc)] 1184 = TRIE_NODENUM(state); 1185 next_alloc += trie->uniquecharcount; 1186 } 1187 state = trie->trans[ state + charid ].next; 1188 } else { 1189 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc ); 1190 } 1191 /* charid is now 0 if we dont know the char read, or 1192 * nonzero if we do */ 1193 } 1194 } else { 1195 /* If we end up here it is because we skipped past a NOTHING, but did not end up 1196 * on a trieable type. So we need to reset noper back to point at the first regop 1197 * in the branch before we call TRIE_HANDLE_WORD(). 1198 */ 1199 noper= REGNODE_AFTER(cur); 1200 } 1201 accept_state = TRIE_NODENUM( state ); 1202 TRIE_HANDLE_WORD(accept_state); 1203 1204 } /* end second pass */ 1205 1206 /* and now dump it out before we compress it */ 1207 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap, 1208 revcharmap, 1209 next_alloc, depth+1)); 1210 1211 { 1212 /* 1213 * Inplace compress the table.* 1214 1215 For sparse data sets the table constructed by the trie algorithm will 1216 be mostly 0/FAIL transitions or to put it another way mostly empty. 1217 (Note that leaf nodes will not contain any transitions.) 1218 1219 This algorithm compresses the tables by eliminating most such 1220 transitions, at the cost of a modest bit of extra work during lookup: 1221 1222 - Each states[] entry contains a .base field which indicates the 1223 index in the state[] array wheres its transition data is stored. 1224 1225 - If .base is 0 there are no valid transitions from that node. 1226 1227 - If .base is nonzero then charid is added to it to find an entry in 1228 the trans array. 1229 1230 -If trans[states[state].base+charid].check!=state then the 1231 transition is taken to be a 0/Fail transition. Thus if there are fail 1232 transitions at the front of the node then the .base offset will point 1233 somewhere inside the previous nodes data (or maybe even into a node 1234 even earlier), but the .check field determines if the transition is 1235 valid. 1236 1237 XXX - wrong maybe? 1238 The following process inplace converts the table to the compressed 1239 table: We first do not compress the root node 1,and mark all its 1240 .check pointers as 1 and set its .base pointer as 1 as well. This 1241 allows us to do a DFA construction from the compressed table later, 1242 and ensures that any .base pointers we calculate later are greater 1243 than 0. 1244 1245 - We set 'pos' to indicate the first entry of the second node. 1246 1247 - We then iterate over the columns of the node, finding the first and 1248 last used entry at l and m. We then copy l..m into pos..(pos+m-l), 1249 and set the .check pointers accordingly, and advance pos 1250 appropriately and repreat for the next node. Note that when we copy 1251 the next pointers we have to convert them from the original 1252 NODEIDX form to NODENUM form as the former is not valid post 1253 compression. 1254 1255 - If a node has no transitions used we mark its base as 0 and do not 1256 advance the pos pointer. 1257 1258 - If a node only has one transition we use a second pointer into the 1259 structure to fill in allocated fail transitions from other states. 1260 This pointer is independent of the main pointer and scans forward 1261 looking for null transitions that are allocated to a state. When it 1262 finds one it writes the single transition into the "hole". If the 1263 pointer doesnt find one the single transition is appended as normal. 1264 1265 - Once compressed we can Renew/realloc the structures to release the 1266 excess space. 1267 1268 See "Table-Compression Methods" in sec 3.9 of the Red Dragon, 1269 specifically Fig 3.47 and the associated pseudocode. 1270 1271 demq 1272 */ 1273 const U32 laststate = TRIE_NODENUM( next_alloc ); 1274 U32 state, charid; 1275 U32 pos = 0, zp=0; 1276 trie->statecount = laststate; 1277 1278 for ( state = 1 ; state < laststate ; state++ ) { 1279 U8 flag = 0; 1280 const U32 stateidx = TRIE_NODEIDX( state ); 1281 const U32 o_used = trie->trans[ stateidx ].check; 1282 U32 used = trie->trans[ stateidx ].check; 1283 trie->trans[ stateidx ].check = 0; 1284 1285 for ( charid = 0; 1286 used && charid < trie->uniquecharcount; 1287 charid++ ) 1288 { 1289 if ( flag || trie->trans[ stateidx + charid ].next ) { 1290 if ( trie->trans[ stateidx + charid ].next ) { 1291 if (o_used == 1) { 1292 for ( ; zp < pos ; zp++ ) { 1293 if ( ! trie->trans[ zp ].next ) { 1294 break; 1295 } 1296 } 1297 trie->states[ state ].trans.base 1298 = zp 1299 + trie->uniquecharcount 1300 - charid ; 1301 trie->trans[ zp ].next 1302 = SAFE_TRIE_NODENUM( trie->trans[ stateidx 1303 + charid ].next ); 1304 trie->trans[ zp ].check = state; 1305 if ( ++zp > pos ) pos = zp; 1306 break; 1307 } 1308 used--; 1309 } 1310 if ( !flag ) { 1311 flag = 1; 1312 trie->states[ state ].trans.base 1313 = pos + trie->uniquecharcount - charid ; 1314 } 1315 trie->trans[ pos ].next 1316 = SAFE_TRIE_NODENUM( 1317 trie->trans[ stateidx + charid ].next ); 1318 trie->trans[ pos ].check = state; 1319 pos++; 1320 } 1321 } 1322 } 1323 trie->lasttrans = pos + 1; 1324 trie->states = (reg_trie_state *) 1325 PerlMemShared_realloc( trie->states, laststate 1326 * sizeof(reg_trie_state) ); 1327 DEBUG_TRIE_COMPILE_MORE_r( 1328 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n", 1329 depth+1, 1330 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount 1331 + 1 ), 1332 (IV)next_alloc, 1333 (IV)pos, 1334 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc ); 1335 ); 1336 1337 } /* end table compress */ 1338 } 1339 DEBUG_TRIE_COMPILE_MORE_r( 1340 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n", 1341 depth+1, 1342 (UV)trie->statecount, 1343 (UV)trie->lasttrans) 1344 ); 1345 /* resize the trans array to remove unused space */ 1346 trie->trans = (reg_trie_trans *) 1347 PerlMemShared_realloc( trie->trans, trie->lasttrans 1348 * sizeof(reg_trie_trans) ); 1349 1350 { /* Modify the program and insert the new TRIE node */ 1351 U8 nodetype =(U8) flags; 1352 char *str=NULL; 1353 1354 #ifdef DEBUGGING 1355 regnode *optimize = NULL; 1356 #endif /* DEBUGGING */ 1357 /* make sure we have enough room to inject the TRIE op */ 1358 assert((!trie->jump) || !trie->jump[1] || 1359 (trie->jump[1] >= (sizeof(tregnode_TRIE)/sizeof(struct regnode)))); 1360 /* 1361 This means we convert either the first branch or the first Exact, 1362 depending on whether the thing following (in 'last') is a branch 1363 or not and whther first is the startbranch (ie is it a sub part of 1364 the alternation or is it the whole thing.) 1365 Assuming its a sub part we convert the EXACT otherwise we convert 1366 the whole branch sequence, including the first. 1367 */ 1368 /* Find the node we are going to overwrite */ 1369 if ( first != startbranch || OP( last ) == BRANCH ) { 1370 /* branch sub-chain */ 1371 NEXT_OFF( first ) = (U16)(last - first); 1372 /* whole branch chain */ 1373 } 1374 /* But first we check to see if there is a common prefix we can 1375 split out as an EXACT and put in front of the TRIE node. */ 1376 trie->startstate= 1; 1377 if ( trie->bitmap && !widecharmap && !trie->jump ) { 1378 /* we want to find the first state that has more than 1379 * one transition, if that state is not the first state 1380 * then we have a common prefix which we can remove. 1381 */ 1382 U32 state; 1383 for ( state = 1 ; state < trie->statecount-1 ; state++ ) { 1384 U32 ofs = 0; 1385 I32 first_ofs = -1; /* keeps track of the ofs of the first 1386 transition, -1 means none */ 1387 U32 count = 0; 1388 const U32 base = trie->states[ state ].trans.base; 1389 1390 /* does this state terminate an alternation? */ 1391 if ( trie->states[state].wordnum ) 1392 count = 1; 1393 1394 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { 1395 if ( ( base + ofs >= trie->uniquecharcount ) && 1396 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) && 1397 trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) 1398 { 1399 if ( ++count > 1 ) { 1400 /* we have more than one transition */ 1401 SV **tmp; 1402 U8 *ch; 1403 /* if this is the first state there is no common prefix 1404 * to extract, so we can exit */ 1405 if ( state == 1 ) break; 1406 tmp = av_fetch_simple( revcharmap, ofs, 0); 1407 ch = (U8*)SvPV_nolen_const( *tmp ); 1408 1409 /* if we are on count 2 then we need to initialize the 1410 * bitmap, and store the previous char if there was one 1411 * in it*/ 1412 if ( count == 2 ) { 1413 /* clear the bitmap */ 1414 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char); 1415 DEBUG_OPTIMISE_r( 1416 Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [", 1417 depth+1, 1418 (UV)state)); 1419 if (first_ofs >= 0) { 1420 SV ** const tmp = av_fetch_simple( revcharmap, first_ofs, 0); 1421 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp ); 1422 1423 S_trie_bitmap_set_folded(aTHX_ pRExC_state, trie, *ch, folder); 1424 DEBUG_OPTIMISE_r( 1425 Perl_re_printf( aTHX_ "%s", (char*)ch) 1426 ); 1427 } 1428 } 1429 /* store the current firstchar in the bitmap */ 1430 S_trie_bitmap_set_folded(aTHX_ pRExC_state, trie, *ch, folder); 1431 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch)); 1432 } 1433 first_ofs = ofs; 1434 } 1435 } 1436 if ( count == 1 ) { 1437 /* This state has only one transition, its transition is part 1438 * of a common prefix - we need to concatenate the char it 1439 * represents to what we have so far. */ 1440 SV **tmp = av_fetch_simple( revcharmap, first_ofs, 0); 1441 STRLEN len; 1442 char *ch = SvPV( *tmp, len ); 1443 DEBUG_OPTIMISE_r({ 1444 SV *sv=sv_newmortal(); 1445 Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n", 1446 depth+1, 1447 (UV)state, (UV)first_ofs, 1448 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, 1449 PL_colors[0], PL_colors[1], 1450 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | 1451 PERL_PV_ESCAPE_FIRSTCHAR 1452 ) 1453 ); 1454 }); 1455 if ( state==1 ) { 1456 OP( convert ) = nodetype; 1457 str=STRING(convert); 1458 setSTR_LEN(convert, 0); 1459 } 1460 assert( ( STR_LEN(convert) + len ) < 256 ); 1461 setSTR_LEN(convert, (U8)(STR_LEN(convert) + len)); 1462 while (len--) 1463 *str++ = *ch++; 1464 } else { 1465 #ifdef DEBUGGING 1466 if (state>1) 1467 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n")); 1468 #endif 1469 break; 1470 } 1471 } 1472 trie->prefixlen = (state-1); 1473 if (str) { 1474 regnode *n = REGNODE_AFTER(convert); 1475 assert( n - convert <= U16_MAX ); 1476 NEXT_OFF(convert) = n - convert; 1477 trie->startstate = state; 1478 trie->minlen -= (state - 1); 1479 trie->maxlen -= (state - 1); 1480 #ifdef DEBUGGING 1481 /* At least the UNICOS C compiler choked on this 1482 * being argument to DEBUG_r(), so let's just have 1483 * it right here. */ 1484 if ( 1485 #ifdef PERL_EXT_RE_BUILD 1486 1 1487 #else 1488 DEBUG_r_TEST 1489 #endif 1490 ) { 1491 U32 word = trie->wordcount; 1492 while (word--) { 1493 SV ** const tmp = av_fetch_simple( trie_words, word, 0 ); 1494 if (tmp) { 1495 if ( STR_LEN(convert) <= SvCUR(*tmp) ) 1496 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert)); 1497 else 1498 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp)); 1499 } 1500 } 1501 } 1502 #endif 1503 if (trie->maxlen) { 1504 convert = n; 1505 } else { 1506 NEXT_OFF(convert) = (U16)(tail - convert); 1507 DEBUG_r(optimize= n); 1508 } 1509 } 1510 } 1511 if (!jumper) 1512 jumper = last; 1513 if ( trie->maxlen ) { 1514 NEXT_OFF( convert ) = (U16)(tail - convert); 1515 ARG1u_SET( convert, data_slot ); 1516 /* Store the offset to the first unabsorbed branch in 1517 jump[0], which is otherwise unused by the jump logic. 1518 We use this when dumping a trie and during optimisation. */ 1519 if (trie->jump) 1520 trie->jump[0] = (U16)(nextbranch - convert); 1521 1522 /* If the start state is not accepting (meaning there is no empty string/NOTHING) 1523 * and there is a bitmap 1524 * and the first "jump target" node we found leaves enough room 1525 * then convert the TRIE node into a TRIEC node, with the bitmap 1526 * embedded inline in the opcode - this is hypothetically faster. 1527 */ 1528 if ( !trie->states[trie->startstate].wordnum 1529 && trie->bitmap 1530 && ( (char *)jumper - (char *)convert) >= (int)sizeof(tregnode_TRIEC) ) 1531 { 1532 OP( convert ) = TRIEC; 1533 Copy(trie->bitmap, ((tregnode_TRIEC *)convert)->bitmap, ANYOF_BITMAP_SIZE, char); 1534 PerlMemShared_free(trie->bitmap); 1535 trie->bitmap= NULL; 1536 } else 1537 OP( convert ) = TRIE; 1538 1539 /* store the type in the flags */ 1540 FLAGS(convert) = nodetype; 1541 DEBUG_r({ 1542 optimize = convert 1543 + NODE_STEP_REGNODE 1544 + REGNODE_ARG_LEN( OP( convert ) ); 1545 }); 1546 /* XXX We really should free up the resource in trie now, 1547 as we won't use them - (which resources?) dmq */ 1548 } 1549 /* needed for dumping*/ 1550 DEBUG_r(if (optimize) { 1551 /* 1552 Try to clean up some of the debris left after the 1553 optimisation. 1554 */ 1555 while( optimize < jumper ) { 1556 OP( optimize ) = OPTIMIZED; 1557 optimize++; 1558 } 1559 }); 1560 } /* end node insert */ 1561 1562 /* Finish populating the prev field of the wordinfo array. Walk back 1563 * from each accept state until we find another accept state, and if 1564 * so, point the first word's .prev field at the second word. If the 1565 * second already has a .prev field set, stop now. This will be the 1566 * case either if we've already processed that word's accept state, 1567 * or that state had multiple words, and the overspill words were 1568 * already linked up earlier. 1569 */ 1570 { 1571 U16 word; 1572 U32 state; 1573 U16 prev; 1574 1575 for (word=1; word <= trie->wordcount; word++) { 1576 prev = 0; 1577 if (trie->wordinfo[word].prev) 1578 continue; 1579 state = trie->wordinfo[word].accept; 1580 while (state) { 1581 state = prev_states[state]; 1582 if (!state) 1583 break; 1584 prev = trie->states[state].wordnum; 1585 if (prev) 1586 break; 1587 } 1588 trie->wordinfo[word].prev = prev; 1589 } 1590 Safefree(prev_states); 1591 } 1592 1593 1594 /* and now dump out the compressed format */ 1595 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1)); 1596 1597 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap; 1598 #ifdef DEBUGGING 1599 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words; 1600 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap; 1601 #else 1602 SvREFCNT_dec_NN(revcharmap); 1603 #endif 1604 return trie->jump 1605 ? MADE_JUMP_TRIE 1606 : trie->startstate>1 1607 ? MADE_EXACT_TRIE 1608 : MADE_TRIE; 1609 } 1610 1611 regnode * 1612 Perl_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth) 1613 { 1614 /* The Trie is constructed and compressed now so we can build a fail array if 1615 * it's needed 1616 1617 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 1618 3.32 in the 1619 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, 1620 Ullman 1985/88 1621 ISBN 0-201-10088-6 1622 1623 We find the fail state for each state in the trie, this state is the longest 1624 proper suffix of the current state's 'word' that is also a proper prefix of 1625 another word in our trie. State 1 represents the word '' and is thus the 1626 default fail state. This allows the DFA not to have to restart after its 1627 tried and failed a word at a given point, it simply continues as though it 1628 had been matching the other word in the first place. 1629 Consider 1630 'abcdgu'=~/abcdefg|cdgu/ 1631 When we get to 'd' we are still matching the first word, we would encounter 1632 'g' which would fail, which would bring us to the state representing 'd' in 1633 the second word where we would try 'g' and succeed, proceeding to match 1634 'cdgu'. 1635 */ 1636 /* add a fail transition */ 1637 const U32 trie_offset = ARG1u(source); 1638 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset]; 1639 U32 *q; 1640 const U32 ucharcount = trie->uniquecharcount; 1641 const U32 numstates = trie->statecount; 1642 const U32 ubound = trie->lasttrans + ucharcount; 1643 U32 q_read = 0; 1644 U32 q_write = 0; 1645 U32 charid; 1646 U32 base = trie->states[ 1 ].trans.base; 1647 U32 *fail; 1648 reg_ac_data *aho; 1649 const U32 data_slot = reg_add_data( pRExC_state, STR_WITH_LEN("T")); 1650 regnode *stclass; 1651 DECLARE_AND_GET_RE_DEBUG_FLAGS; 1652 1653 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE; 1654 PERL_UNUSED_CONTEXT; 1655 #ifndef DEBUGGING 1656 PERL_UNUSED_ARG(depth); 1657 #endif 1658 1659 if ( OP(source) == TRIE ) { 1660 tregnode_TRIE *op = (tregnode_TRIE *) 1661 PerlMemShared_calloc(1, sizeof(tregnode_TRIE)); 1662 StructCopy(source, op, tregnode_TRIE); 1663 stclass = (regnode *)op; 1664 } else { 1665 tregnode_TRIEC *op = (tregnode_TRIEC *) 1666 PerlMemShared_calloc(1, sizeof(tregnode_TRIEC)); 1667 StructCopy(source, op, tregnode_TRIEC); 1668 stclass = (regnode *)op; 1669 } 1670 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */ 1671 1672 ARG1u_SET( stclass, data_slot ); 1673 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); 1674 RExC_rxi->data->data[ data_slot ] = (void*)aho; 1675 aho->trie=trie_offset; 1676 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) ); 1677 Copy( trie->states, aho->states, numstates, reg_trie_state ); 1678 Newx( q, numstates, U32); 1679 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) ); 1680 aho->refcount = 1; 1681 fail = aho->fail; 1682 /* initialize fail[0..1] to be 1 so that we always have 1683 a valid final fail state */ 1684 fail[ 0 ] = fail[ 1 ] = 1; 1685 1686 for ( charid = 0; charid < ucharcount ; charid++ ) { 1687 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 ); 1688 if ( newstate ) { 1689 q[ q_write ] = newstate; 1690 /* set to point at the root */ 1691 fail[ q[ q_write++ ] ]=1; 1692 } 1693 } 1694 while ( q_read < q_write) { 1695 const U32 cur = q[ q_read++ % numstates ]; 1696 base = trie->states[ cur ].trans.base; 1697 1698 for ( charid = 0 ; charid < ucharcount ; charid++ ) { 1699 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 ); 1700 if (ch_state) { 1701 U32 fail_state = cur; 1702 U32 fail_base; 1703 do { 1704 fail_state = fail[ fail_state ]; 1705 fail_base = aho->states[ fail_state ].trans.base; 1706 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) ); 1707 1708 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ); 1709 fail[ ch_state ] = fail_state; 1710 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum ) 1711 { 1712 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum; 1713 } 1714 q[ q_write++ % numstates] = ch_state; 1715 } 1716 } 1717 } 1718 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop 1719 when we fail in state 1, this allows us to use the 1720 charclass scan to find a valid start char. This is based on the principle 1721 that theres a good chance the string being searched contains lots of stuff 1722 that cant be a start char. 1723 */ 1724 fail[ 0 ] = fail[ 1 ] = 0; 1725 DEBUG_TRIE_COMPILE_r({ 1726 Perl_re_indentf( aTHX_ "Stclass Failtable (%" UVuf " states): 0", 1727 depth, (UV)numstates 1728 ); 1729 for( q_read=1; q_read<numstates; q_read++ ) { 1730 Perl_re_printf( aTHX_ ", %" UVuf, (UV)fail[q_read]); 1731 } 1732 Perl_re_printf( aTHX_ "\n"); 1733 }); 1734 Safefree(q); 1735 /*RExC_seen |= REG_TRIEDFA_SEEN;*/ 1736 return stclass; 1737 } 1738