1 #ifdef PERL_EXT_RE_BUILD 2 #include "re_top.h" 3 #endif 4 5 #include "EXTERN.h" 6 #define PERL_IN_REGEX_ENGINE 7 #define PERL_IN_REGCOMP_ANY 8 #define PERL_IN_REGCOMP_INVLIST_C 9 #include "perl.h" 10 11 #ifdef PERL_IN_XSUB_RE 12 # include "re_comp.h" 13 #else 14 # include "regcomp.h" 15 #endif 16 17 #include "invlist_inline.h" 18 #include "unicode_constants.h" 19 #include "regcomp_internal.h" 20 21 #ifdef PERL_RE_BUILD_AUX 22 void 23 Perl_populate_bitmap_from_invlist(pTHX_ SV * invlist, const UV offset, const U8 * bitmap, const Size_t len) 24 { 25 PERL_ARGS_ASSERT_POPULATE_BITMAP_FROM_INVLIST; 26 27 /* As the name says. The zeroth bit corresponds to the code point given by 28 * 'offset' */ 29 30 UV start, end; 31 32 Zero(bitmap, len, U8); 33 34 invlist_iterinit(invlist); 35 while (invlist_iternext(invlist, &start, &end)) { 36 assert(start >= offset); 37 38 for (UV i = start; i <= end; i++) { 39 UV adjusted = i - offset; 40 41 BITMAP_BYTE(bitmap, adjusted) |= BITMAP_BIT(adjusted); 42 } 43 } 44 invlist_iterfinish(invlist); 45 } 46 47 void 48 Perl_populate_invlist_from_bitmap(pTHX_ const U8 * bitmap, const Size_t bitmap_len, SV ** invlist, const UV offset) 49 { 50 PERL_ARGS_ASSERT_POPULATE_INVLIST_FROM_BITMAP; 51 52 /* As the name says. The zeroth bit corresponds to the code point given by 53 * 'offset' */ 54 55 Size_t i; 56 57 for (i = 0; i < bitmap_len; i++) { 58 if (BITMAP_TEST(bitmap, i)) { 59 int start = i++; 60 61 /* Save a little work by adding a range all at once instead of bit 62 * by bit */ 63 while (i < bitmap_len && BITMAP_TEST(bitmap, i)) { 64 i++; 65 } 66 67 *invlist = _add_range_to_invlist(*invlist, 68 start + offset, 69 i + offset - 1); 70 } 71 } 72 } 73 #endif /* PERL_RE_BUILD_AUX */ 74 75 /* This section of code defines the inversion list object and its methods. The 76 * interfaces are highly subject to change, so as much as possible is static to 77 * this file. An inversion list is here implemented as a malloc'd C UV array 78 * as an SVt_INVLIST scalar. 79 * 80 * An inversion list for Unicode is an array of code points, sorted by ordinal 81 * number. Each element gives the code point that begins a range that extends 82 * up-to but not including the code point given by the next element. The final 83 * element gives the first code point of a range that extends to the platform's 84 * infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4], 85 * ...) give ranges whose code points are all in the inversion list. We say 86 * that those ranges are in the set. The odd-numbered elements give ranges 87 * whose code points are not in the inversion list, and hence not in the set. 88 * Thus, element [0] is the first code point in the list. Element [1] 89 * is the first code point beyond that not in the list; and element [2] is the 90 * first code point beyond that that is in the list. In other words, the first 91 * range is invlist[0]..(invlist[1]-1), and all code points in that range are 92 * in the inversion list. The second range is invlist[1]..(invlist[2]-1), and 93 * all code points in that range are not in the inversion list. The third 94 * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion 95 * list, and so forth. Thus every element whose index is divisible by two 96 * gives the beginning of a range that is in the list, and every element whose 97 * index is not divisible by two gives the beginning of a range not in the 98 * list. If the final element's index is divisible by two, the inversion list 99 * extends to the platform's infinity; otherwise the highest code point in the 100 * inversion list is the contents of that element minus 1. 101 * 102 * A range that contains just a single code point N will look like 103 * invlist[i] == N 104 * invlist[i+1] == N+1 105 * 106 * If N is UV_MAX (the highest representable code point on the machine), N+1 is 107 * impossible to represent, so element [i+1] is omitted. The single element 108 * inversion list 109 * invlist[0] == UV_MAX 110 * contains just UV_MAX, but is interpreted as matching to infinity. 111 * 112 * Taking the complement (inverting) an inversion list is quite simple, if the 113 * first element is 0, remove it; otherwise add a 0 element at the beginning. 114 * This implementation reserves an element at the beginning of each inversion 115 * list to always contain 0; there is an additional flag in the header which 116 * indicates if the list begins at the 0, or is offset to begin at the next 117 * element. This means that the inversion list can be inverted without any 118 * copying; just flip the flag. 119 * 120 * More about inversion lists can be found in "Unicode Demystified" 121 * Chapter 13 by Richard Gillam, published by Addison-Wesley. 122 * 123 * The inversion list data structure is currently implemented as an SV pointing 124 * to an array of UVs that the SV thinks are bytes. This allows us to have an 125 * array of UV whose memory management is automatically handled by the existing 126 * facilities for SV's. 127 * 128 * Some of the methods should always be private to the implementation, and some 129 * should eventually be made public */ 130 131 /* The header definitions are in F<invlist_inline.h> */ 132 133 #ifndef PERL_IN_XSUB_RE 134 135 PERL_STATIC_INLINE UV* 136 S__invlist_array_init(SV* const invlist, const bool will_have_0) 137 { 138 /* Returns a pointer to the first element in the inversion list's array. 139 * This is called upon initialization of an inversion list. Where the 140 * array begins depends on whether the list has the code point U+0000 in it 141 * or not. The other parameter tells it whether the code that follows this 142 * call is about to put a 0 in the inversion list or not. The first 143 * element is either the element reserved for 0, if TRUE, or the element 144 * after it, if FALSE */ 145 146 bool* offset = get_invlist_offset_addr(invlist); 147 UV* zero_addr = (UV *) SvPVX(invlist); 148 149 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT; 150 151 /* Must be empty */ 152 assert(! _invlist_len(invlist)); 153 154 *zero_addr = 0; 155 156 /* 1^1 = 0; 1^0 = 1 */ 157 *offset = 1 ^ will_have_0; 158 return zero_addr + *offset; 159 } 160 161 STATIC void 162 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src) 163 { 164 /* Replaces the inversion list in 'dest' with the one from 'src'. It 165 * steals the list from 'src', so 'src' is made to have a NULL list. This 166 * is similar to what SvSetMagicSV() would do, if it were implemented on 167 * inversion lists, though this routine avoids a copy */ 168 169 const UV src_len = _invlist_len(src); 170 const bool src_offset = *get_invlist_offset_addr(src); 171 const STRLEN src_byte_len = SvLEN(src); 172 char * array = SvPVX(src); 173 174 #ifndef NO_TAINT_SUPPORT 175 const int oldtainted = TAINT_get; 176 #endif 177 178 PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC; 179 180 assert(is_invlist(src)); 181 assert(is_invlist(dest)); 182 assert(! invlist_is_iterating(src)); 183 assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src)); 184 185 /* Make sure it ends in the right place with a NUL, as our inversion list 186 * manipulations aren't careful to keep this true, but sv_usepvn_flags() 187 * asserts it */ 188 array[src_byte_len - 1] = '\0'; 189 190 TAINT_NOT; /* Otherwise it breaks */ 191 sv_usepvn_flags(dest, 192 (char *) array, 193 src_byte_len - 1, 194 195 /* This flag is documented to cause a copy to be avoided */ 196 SV_HAS_TRAILING_NUL); 197 TAINT_set(oldtainted); 198 SvPV_set(src, 0); 199 SvLEN_set(src, 0); 200 SvCUR_set(src, 0); 201 202 /* Finish up copying over the other fields in an inversion list */ 203 *get_invlist_offset_addr(dest) = src_offset; 204 invlist_set_len(dest, src_len, src_offset); 205 *get_invlist_previous_index_addr(dest) = 0; 206 invlist_iterfinish(dest); 207 } 208 209 PERL_STATIC_INLINE IV* 210 S_get_invlist_previous_index_addr(SV* invlist) 211 { 212 /* Return the address of the IV that is reserved to hold the cached index 213 * */ 214 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR; 215 216 assert(is_invlist(invlist)); 217 218 return &(((XINVLIST*) SvANY(invlist))->prev_index); 219 } 220 221 PERL_STATIC_INLINE IV 222 S_invlist_previous_index(SV* const invlist) 223 { 224 /* Returns cached index of previous search */ 225 226 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX; 227 228 return *get_invlist_previous_index_addr(invlist); 229 } 230 231 PERL_STATIC_INLINE void 232 S_invlist_set_previous_index(SV* const invlist, const IV index) 233 { 234 /* Caches <index> for later retrieval */ 235 236 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX; 237 238 assert(index == 0 || index < (int) _invlist_len(invlist)); 239 240 *get_invlist_previous_index_addr(invlist) = index; 241 } 242 243 PERL_STATIC_INLINE void 244 S_invlist_trim(SV* invlist) 245 { 246 /* Free the not currently-being-used space in an inversion list */ 247 248 /* But don't free up the space needed for the 0 UV that is always at the 249 * beginning of the list, nor the trailing NUL */ 250 const UV min_size = TO_INTERNAL_SIZE(1) + 1; 251 252 PERL_ARGS_ASSERT_INVLIST_TRIM; 253 254 assert(is_invlist(invlist)); 255 256 SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1)); 257 } 258 259 PERL_STATIC_INLINE void 260 S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */ 261 { 262 PERL_ARGS_ASSERT_INVLIST_CLEAR; 263 264 assert(is_invlist(invlist)); 265 266 invlist_set_len(invlist, 0, 0); 267 invlist_trim(invlist); 268 } 269 270 PERL_STATIC_INLINE UV 271 S_invlist_max(const SV* const invlist) 272 { 273 /* Returns the maximum number of elements storable in the inversion list's 274 * array, without having to realloc() */ 275 276 PERL_ARGS_ASSERT_INVLIST_MAX; 277 278 assert(is_invlist(invlist)); 279 280 /* Assumes worst case, in which the 0 element is not counted in the 281 * inversion list, so subtracts 1 for that */ 282 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */ 283 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1 284 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1; 285 } 286 287 STATIC void 288 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size) 289 { 290 PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS; 291 292 /* First 1 is in case the zero element isn't in the list; second 1 is for 293 * trailing NUL */ 294 SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1); 295 invlist_set_len(invlist, 0, 0); 296 297 /* Force iterinit() to be used to get iteration to work */ 298 invlist_iterfinish(invlist); 299 300 *get_invlist_previous_index_addr(invlist) = 0; 301 SvPOK_on(invlist); /* This allows B to extract the PV */ 302 } 303 304 SV* 305 Perl__new_invlist(pTHX_ IV initial_size) 306 { 307 308 /* Return a pointer to a newly constructed inversion list, with enough 309 * space to store 'initial_size' elements. If that number is negative, a 310 * system default is used instead */ 311 312 SV* new_list; 313 314 if (initial_size < 0) { 315 initial_size = 10; 316 } 317 318 new_list = newSV_type(SVt_INVLIST); 319 initialize_invlist_guts(new_list, initial_size); 320 321 return new_list; 322 } 323 324 SV* 325 Perl__new_invlist_C_array(pTHX_ const UV* const list) 326 { 327 /* Return a pointer to a newly constructed inversion list, initialized to 328 * point to <list>, which has to be in the exact correct inversion list 329 * form, including internal fields. Thus this is a dangerous routine that 330 * should not be used in the wrong hands. The passed in 'list' contains 331 * several header fields at the beginning that are not part of the 332 * inversion list body proper */ 333 334 const STRLEN length = (STRLEN) list[0]; 335 const UV version_id = list[1]; 336 const bool offset = cBOOL(list[2]); 337 #define HEADER_LENGTH 3 338 /* If any of the above changes in any way, you must change HEADER_LENGTH 339 * (if appropriate) and regenerate INVLIST_VERSION_ID by running 340 * perl -E 'say int(rand 2**31-1)' 341 */ 342 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and 343 data structure type, so that one being 344 passed in can be validated to be an 345 inversion list of the correct vintage. 346 */ 347 348 SV* invlist = newSV_type(SVt_INVLIST); 349 350 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY; 351 352 if (version_id != INVLIST_VERSION_ID) { 353 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); 354 } 355 356 /* The generated array passed in includes header elements that aren't part 357 * of the list proper, so start it just after them */ 358 SvPV_set(invlist, (char *) (list + HEADER_LENGTH)); 359 360 SvLEN_set(invlist, 0); /* Means we own the contents, and the system 361 shouldn't touch it */ 362 363 *(get_invlist_offset_addr(invlist)) = offset; 364 365 /* The 'length' passed to us is the physical number of elements in the 366 * inversion list. But if there is an offset the logical number is one 367 * less than that */ 368 invlist_set_len(invlist, length - offset, offset); 369 370 invlist_set_previous_index(invlist, 0); 371 372 /* Initialize the iteration pointer. */ 373 invlist_iterfinish(invlist); 374 375 SvREADONLY_on(invlist); 376 SvPOK_on(invlist); 377 378 return invlist; 379 } 380 381 STATIC void 382 S__append_range_to_invlist(pTHX_ SV* const invlist, 383 const UV start, const UV end) 384 { 385 /* Subject to change or removal. Append the range from 'start' to 'end' at 386 * the end of the inversion list. The range must be above any existing 387 * ones. */ 388 389 UV* array; 390 UV max = invlist_max(invlist); 391 UV len = _invlist_len(invlist); 392 bool offset; 393 394 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST; 395 396 if (len == 0) { /* Empty lists must be initialized */ 397 offset = start != 0; 398 array = _invlist_array_init(invlist, ! offset); 399 } 400 else { 401 /* Here, the existing list is non-empty. The current max entry in the 402 * list is generally the first value not in the set, except when the 403 * set extends to the end of permissible values, in which case it is 404 * the first entry in that final set, and so this call is an attempt to 405 * append out-of-order */ 406 407 UV final_element = len - 1; 408 array = invlist_array(invlist); 409 if ( array[final_element] > start 410 || ELEMENT_RANGE_MATCHES_INVLIST(final_element)) 411 { 412 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c", 413 array[final_element], start, 414 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); 415 } 416 417 /* Here, it is a legal append. If the new range begins 1 above the end 418 * of the range below it, it is extending the range below it, so the 419 * new first value not in the set is one greater than the newly 420 * extended range. */ 421 offset = *get_invlist_offset_addr(invlist); 422 if (array[final_element] == start) { 423 if (end != UV_MAX) { 424 array[final_element] = end + 1; 425 } 426 else { 427 /* But if the end is the maximum representable on the machine, 428 * assume that infinity was actually what was meant. Just let 429 * the range that this would extend to have no end */ 430 invlist_set_len(invlist, len - 1, offset); 431 } 432 return; 433 } 434 } 435 436 /* Here the new range doesn't extend any existing set. Add it */ 437 438 len += 2; /* Includes an element each for the start and end of range */ 439 440 /* If wll overflow the existing space, extend, which may cause the array to 441 * be moved */ 442 if (max < len) { 443 invlist_extend(invlist, len); 444 445 /* Have to set len here to avoid assert failure in invlist_array() */ 446 invlist_set_len(invlist, len, offset); 447 448 array = invlist_array(invlist); 449 } 450 else { 451 invlist_set_len(invlist, len, offset); 452 } 453 454 /* The next item on the list starts the range, the one after that is 455 * one past the new range. */ 456 array[len - 2] = start; 457 if (end != UV_MAX) { 458 array[len - 1] = end + 1; 459 } 460 else { 461 /* But if the end is the maximum representable on the machine, just let 462 * the range have no end */ 463 invlist_set_len(invlist, len - 1, offset); 464 } 465 } 466 467 SSize_t 468 Perl__invlist_search(SV* const invlist, const UV cp) 469 { 470 /* Searches the inversion list for the entry that contains the input code 471 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the 472 * return value is the index into the list's array of the range that 473 * contains <cp>, that is, 'i' such that 474 * array[i] <= cp < array[i+1] 475 */ 476 477 IV low = 0; 478 IV mid; 479 IV high = _invlist_len(invlist); 480 const IV highest_element = high - 1; 481 const UV* array; 482 483 PERL_ARGS_ASSERT__INVLIST_SEARCH; 484 485 /* If list is empty, return failure. */ 486 if (UNLIKELY(high == 0)) { 487 return -1; 488 } 489 490 /* (We can't get the array unless we know the list is non-empty) */ 491 array = invlist_array(invlist); 492 493 mid = invlist_previous_index(invlist); 494 assert(mid >=0); 495 if (UNLIKELY(mid > highest_element)) { 496 mid = highest_element; 497 } 498 499 /* <mid> contains the cache of the result of the previous call to this 500 * function (0 the first time). See if this call is for the same result, 501 * or if it is for mid-1. This is under the theory that calls to this 502 * function will often be for related code points that are near each other. 503 * And benchmarks show that caching gives better results. We also test 504 * here if the code point is within the bounds of the list. These tests 505 * replace others that would have had to be made anyway to make sure that 506 * the array bounds were not exceeded, and these give us extra information 507 * at the same time */ 508 if (cp >= array[mid]) { 509 if (cp >= array[highest_element]) { 510 return highest_element; 511 } 512 513 /* Here, array[mid] <= cp < array[highest_element]. This means that 514 * the final element is not the answer, so can exclude it; it also 515 * means that <mid> is not the final element, so can refer to 'mid + 1' 516 * safely */ 517 if (cp < array[mid + 1]) { 518 return mid; 519 } 520 high--; 521 low = mid + 1; 522 } 523 else { /* cp < aray[mid] */ 524 if (cp < array[0]) { /* Fail if outside the array */ 525 return -1; 526 } 527 high = mid; 528 if (cp >= array[mid - 1]) { 529 goto found_entry; 530 } 531 } 532 533 /* Binary search. What we are looking for is <i> such that 534 * array[i] <= cp < array[i+1] 535 * The loop below converges on the i+1. Note that there may not be an 536 * (i+1)th element in the array, and things work nonetheless */ 537 while (low < high) { 538 mid = (low + high) / 2; 539 assert(mid <= highest_element); 540 if (array[mid] <= cp) { /* cp >= array[mid] */ 541 low = mid + 1; 542 543 /* We could do this extra test to exit the loop early. 544 if (cp < array[low]) { 545 return mid; 546 } 547 */ 548 } 549 else { /* cp < array[mid] */ 550 high = mid; 551 } 552 } 553 554 found_entry: 555 high--; 556 invlist_set_previous_index(invlist, high); 557 return high; 558 } 559 560 void 561 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, 562 const bool complement_b, SV** output) 563 { 564 /* Take the union of two inversion lists and point '*output' to it. On 565 * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly 566 * even 'a' or 'b'). If to an inversion list, the contents of the original 567 * list will be replaced by the union. The first list, 'a', may be 568 * NULL, in which case a copy of the second list is placed in '*output'. 569 * If 'complement_b' is TRUE, the union is taken of the complement 570 * (inversion) of 'b' instead of b itself. 571 * 572 * The basis for this comes from "Unicode Demystified" Chapter 13 by 573 * Richard Gillam, published by Addison-Wesley, and explained at some 574 * length there. The preface says to incorporate its examples into your 575 * code at your own risk. 576 * 577 * The algorithm is like a merge sort. */ 578 579 const UV* array_a; /* a's array */ 580 const UV* array_b; 581 UV len_a; /* length of a's array */ 582 UV len_b; 583 584 SV* u; /* the resulting union */ 585 UV* array_u; 586 UV len_u = 0; 587 588 UV i_a = 0; /* current index into a's array */ 589 UV i_b = 0; 590 UV i_u = 0; 591 592 /* running count, as explained in the algorithm source book; items are 593 * stopped accumulating and are output when the count changes to/from 0. 594 * The count is incremented when we start a range that's in an input's set, 595 * and decremented when we start a range that's not in a set. So this 596 * variable can be 0, 1, or 2. When it is 0 neither input is in their set, 597 * and hence nothing goes into the union; 1, just one of the inputs is in 598 * its set (and its current range gets added to the union); and 2 when both 599 * inputs are in their sets. */ 600 UV count = 0; 601 602 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND; 603 assert(a != b); 604 assert(*output == NULL || is_invlist(*output)); 605 606 len_b = _invlist_len(b); 607 if (len_b == 0) { 608 609 /* Here, 'b' is empty, hence it's complement is all possible code 610 * points. So if the union includes the complement of 'b', it includes 611 * everything, and we need not even look at 'a'. It's easiest to 612 * create a new inversion list that matches everything. */ 613 if (complement_b) { 614 SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX); 615 616 if (*output == NULL) { /* If the output didn't exist, just point it 617 at the new list */ 618 *output = everything; 619 } 620 else { /* Otherwise, replace its contents with the new list */ 621 invlist_replace_list_destroys_src(*output, everything); 622 SvREFCNT_dec_NN(everything); 623 } 624 625 return; 626 } 627 628 /* Here, we don't want the complement of 'b', and since 'b' is empty, 629 * the union will come entirely from 'a'. If 'a' is NULL or empty, the 630 * output will be empty */ 631 632 if (a == NULL || _invlist_len(a) == 0) { 633 if (*output == NULL) { 634 *output = _new_invlist(0); 635 } 636 else { 637 invlist_clear(*output); 638 } 639 return; 640 } 641 642 /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the 643 * union. We can just return a copy of 'a' if '*output' doesn't point 644 * to an existing list */ 645 if (*output == NULL) { 646 *output = invlist_clone(a, NULL); 647 return; 648 } 649 650 /* If the output is to overwrite 'a', we have a no-op, as it's 651 * already in 'a' */ 652 if (*output == a) { 653 return; 654 } 655 656 /* Here, '*output' is to be overwritten by 'a' */ 657 u = invlist_clone(a, NULL); 658 invlist_replace_list_destroys_src(*output, u); 659 SvREFCNT_dec_NN(u); 660 661 return; 662 } 663 664 /* Here 'b' is not empty. See about 'a' */ 665 666 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { 667 668 /* Here, 'a' is empty (and b is not). That means the union will come 669 * entirely from 'b'. If '*output' is NULL, we can directly return a 670 * clone of 'b'. Otherwise, we replace the contents of '*output' with 671 * the clone */ 672 673 SV ** dest = (*output == NULL) ? output : &u; 674 *dest = invlist_clone(b, NULL); 675 if (complement_b) { 676 _invlist_invert(*dest); 677 } 678 679 if (dest == &u) { 680 invlist_replace_list_destroys_src(*output, u); 681 SvREFCNT_dec_NN(u); 682 } 683 684 return; 685 } 686 687 /* Here both lists exist and are non-empty */ 688 array_a = invlist_array(a); 689 array_b = invlist_array(b); 690 691 /* If are to take the union of 'a' with the complement of b, set it 692 * up so are looking at b's complement. */ 693 if (complement_b) { 694 695 /* To complement, we invert: if the first element is 0, remove it. To 696 * do this, we just pretend the array starts one later */ 697 if (array_b[0] == 0) { 698 array_b++; 699 len_b--; 700 } 701 else { 702 703 /* But if the first element is not zero, we pretend the list starts 704 * at the 0 that is always stored immediately before the array. */ 705 array_b--; 706 len_b++; 707 } 708 } 709 710 /* Size the union for the worst case: that the sets are completely 711 * disjoint */ 712 u = _new_invlist(len_a + len_b); 713 714 /* Will contain U+0000 if either component does */ 715 array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0) 716 || (len_b > 0 && array_b[0] == 0)); 717 718 /* Go through each input list item by item, stopping when have exhausted 719 * one of them */ 720 while (i_a < len_a && i_b < len_b) { 721 UV cp; /* The element to potentially add to the union's array */ 722 bool cp_in_set; /* is it in the input list's set or not */ 723 724 /* We need to take one or the other of the two inputs for the union. 725 * Since we are merging two sorted lists, we take the smaller of the 726 * next items. In case of a tie, we take first the one that is in its 727 * set. If we first took the one not in its set, it would decrement 728 * the count, possibly to 0 which would cause it to be output as ending 729 * the range, and the next time through we would take the same number, 730 * and output it again as beginning the next range. By doing it the 731 * opposite way, there is no possibility that the count will be 732 * momentarily decremented to 0, and thus the two adjoining ranges will 733 * be seamlessly merged. (In a tie and both are in the set or both not 734 * in the set, it doesn't matter which we take first.) */ 735 if ( array_a[i_a] < array_b[i_b] 736 || ( array_a[i_a] == array_b[i_b] 737 && ELEMENT_RANGE_MATCHES_INVLIST(i_a))) 738 { 739 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); 740 cp = array_a[i_a++]; 741 } 742 else { 743 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); 744 cp = array_b[i_b++]; 745 } 746 747 /* Here, have chosen which of the two inputs to look at. Only output 748 * if the running count changes to/from 0, which marks the 749 * beginning/end of a range that's in the set */ 750 if (cp_in_set) { 751 if (count == 0) { 752 array_u[i_u++] = cp; 753 } 754 count++; 755 } 756 else { 757 count--; 758 if (count == 0) { 759 array_u[i_u++] = cp; 760 } 761 } 762 } 763 764 765 /* The loop above increments the index into exactly one of the input lists 766 * each iteration, and ends when either index gets to its list end. That 767 * means the other index is lower than its end, and so something is 768 * remaining in that one. We decrement 'count', as explained below, if 769 * that list is in its set. (i_a and i_b each currently index the element 770 * beyond the one we care about.) */ 771 if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) 772 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) 773 { 774 count--; 775 } 776 777 /* Above we decremented 'count' if the list that had unexamined elements in 778 * it was in its set. This has made it so that 'count' being non-zero 779 * means there isn't anything left to output; and 'count' equal to 0 means 780 * that what is left to output is precisely that which is left in the 781 * non-exhausted input list. 782 * 783 * To see why, note first that the exhausted input obviously has nothing 784 * left to add to the union. If it was in its set at its end, that means 785 * the set extends from here to the platform's infinity, and hence so does 786 * the union and the non-exhausted set is irrelevant. The exhausted set 787 * also contributed 1 to 'count'. If 'count' was 2, it got decremented to 788 * 1, but if it was 1, the non-exhausted set wasn't in its set, and so 789 * 'count' remains at 1. This is consistent with the decremented 'count' 790 * != 0 meaning there's nothing left to add to the union. 791 * 792 * But if the exhausted input wasn't in its set, it contributed 0 to 793 * 'count', and the rest of the union will be whatever the other input is. 794 * If 'count' was 0, neither list was in its set, and 'count' remains 0; 795 * otherwise it gets decremented to 0. This is consistent with 'count' 796 * == 0 meaning the remainder of the union is whatever is left in the 797 * non-exhausted list. */ 798 if (count != 0) { 799 len_u = i_u; 800 } 801 else { 802 IV copy_count = len_a - i_a; 803 if (copy_count > 0) { /* The non-exhausted input is 'a' */ 804 Copy(array_a + i_a, array_u + i_u, copy_count, UV); 805 } 806 else { /* The non-exhausted input is b */ 807 copy_count = len_b - i_b; 808 Copy(array_b + i_b, array_u + i_u, copy_count, UV); 809 } 810 len_u = i_u + copy_count; 811 } 812 813 /* Set the result to the final length, which can change the pointer to 814 * array_u, so re-find it. (Note that it is unlikely that this will 815 * change, as we are shrinking the space, not enlarging it) */ 816 if (len_u != _invlist_len(u)) { 817 invlist_set_len(u, len_u, *get_invlist_offset_addr(u)); 818 invlist_trim(u); 819 array_u = invlist_array(u); 820 } 821 822 if (*output == NULL) { /* Simply return the new inversion list */ 823 *output = u; 824 } 825 else { 826 /* Otherwise, overwrite the inversion list that was in '*output'. We 827 * could instead free '*output', and then set it to 'u', but experience 828 * has shown [perl #127392] that if the input is a mortal, we can get a 829 * huge build-up of these during regex compilation before they get 830 * freed. */ 831 invlist_replace_list_destroys_src(*output, u); 832 SvREFCNT_dec_NN(u); 833 } 834 835 return; 836 } 837 838 void 839 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, 840 const bool complement_b, SV** i) 841 { 842 /* Take the intersection of two inversion lists and point '*i' to it. On 843 * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly 844 * even 'a' or 'b'). If to an inversion list, the contents of the original 845 * list will be replaced by the intersection. The first list, 'a', may be 846 * NULL, in which case '*i' will be an empty list. If 'complement_b' is 847 * TRUE, the result will be the intersection of 'a' and the complement (or 848 * inversion) of 'b' instead of 'b' directly. 849 * 850 * The basis for this comes from "Unicode Demystified" Chapter 13 by 851 * Richard Gillam, published by Addison-Wesley, and explained at some 852 * length there. The preface says to incorporate its examples into your 853 * code at your own risk. In fact, it had bugs 854 * 855 * The algorithm is like a merge sort, and is essentially the same as the 856 * union above 857 */ 858 859 const UV* array_a; /* a's array */ 860 const UV* array_b; 861 UV len_a; /* length of a's array */ 862 UV len_b; 863 864 SV* r; /* the resulting intersection */ 865 UV* array_r; 866 UV len_r = 0; 867 868 UV i_a = 0; /* current index into a's array */ 869 UV i_b = 0; 870 UV i_r = 0; 871 872 /* running count of how many of the two inputs are postitioned at ranges 873 * that are in their sets. As explained in the algorithm source book, 874 * items are stopped accumulating and are output when the count changes 875 * to/from 2. The count is incremented when we start a range that's in an 876 * input's set, and decremented when we start a range that's not in a set. 877 * Only when it is 2 are we in the intersection. */ 878 UV count = 0; 879 880 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND; 881 assert(a != b); 882 assert(*i == NULL || is_invlist(*i)); 883 884 /* Special case if either one is empty */ 885 len_a = (a == NULL) ? 0 : _invlist_len(a); 886 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { 887 if (len_a != 0 && complement_b) { 888 889 /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b' 890 * must be empty. Here, also we are using 'b's complement, which 891 * hence must be every possible code point. Thus the intersection 892 * is simply 'a'. */ 893 894 if (*i == a) { /* No-op */ 895 return; 896 } 897 898 if (*i == NULL) { 899 *i = invlist_clone(a, NULL); 900 return; 901 } 902 903 r = invlist_clone(a, NULL); 904 invlist_replace_list_destroys_src(*i, r); 905 SvREFCNT_dec_NN(r); 906 return; 907 } 908 909 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The 910 * intersection must be empty */ 911 if (*i == NULL) { 912 *i = _new_invlist(0); 913 return; 914 } 915 916 invlist_clear(*i); 917 return; 918 } 919 920 /* Here both lists exist and are non-empty */ 921 array_a = invlist_array(a); 922 array_b = invlist_array(b); 923 924 /* If are to take the intersection of 'a' with the complement of b, set it 925 * up so are looking at b's complement. */ 926 if (complement_b) { 927 928 /* To complement, we invert: if the first element is 0, remove it. To 929 * do this, we just pretend the array starts one later */ 930 if (array_b[0] == 0) { 931 array_b++; 932 len_b--; 933 } 934 else { 935 936 /* But if the first element is not zero, we pretend the list starts 937 * at the 0 that is always stored immediately before the array. */ 938 array_b--; 939 len_b++; 940 } 941 } 942 943 /* Size the intersection for the worst case: that the intersection ends up 944 * fragmenting everything to be completely disjoint */ 945 r= _new_invlist(len_a + len_b); 946 947 /* Will contain U+0000 iff both components do */ 948 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0 949 && len_b > 0 && array_b[0] == 0); 950 951 /* Go through each list item by item, stopping when have exhausted one of 952 * them */ 953 while (i_a < len_a && i_b < len_b) { 954 UV cp; /* The element to potentially add to the intersection's 955 array */ 956 bool cp_in_set; /* Is it in the input list's set or not */ 957 958 /* We need to take one or the other of the two inputs for the 959 * intersection. Since we are merging two sorted lists, we take the 960 * smaller of the next items. In case of a tie, we take first the one 961 * that is not in its set (a difference from the union algorithm). If 962 * we first took the one in its set, it would increment the count, 963 * possibly to 2 which would cause it to be output as starting a range 964 * in the intersection, and the next time through we would take that 965 * same number, and output it again as ending the set. By doing the 966 * opposite of this, there is no possibility that the count will be 967 * momentarily incremented to 2. (In a tie and both are in the set or 968 * both not in the set, it doesn't matter which we take first.) */ 969 if ( array_a[i_a] < array_b[i_b] 970 || ( array_a[i_a] == array_b[i_b] 971 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a))) 972 { 973 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); 974 cp = array_a[i_a++]; 975 } 976 else { 977 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); 978 cp= array_b[i_b++]; 979 } 980 981 /* Here, have chosen which of the two inputs to look at. Only output 982 * if the running count changes to/from 2, which marks the 983 * beginning/end of a range that's in the intersection */ 984 if (cp_in_set) { 985 count++; 986 if (count == 2) { 987 array_r[i_r++] = cp; 988 } 989 } 990 else { 991 if (count == 2) { 992 array_r[i_r++] = cp; 993 } 994 count--; 995 } 996 997 } 998 999 /* The loop above increments the index into exactly one of the input lists 1000 * each iteration, and ends when either index gets to its list end. That 1001 * means the other index is lower than its end, and so something is 1002 * remaining in that one. We increment 'count', as explained below, if the 1003 * exhausted list was in its set. (i_a and i_b each currently index the 1004 * element beyond the one we care about.) */ 1005 if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) 1006 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) 1007 { 1008 count++; 1009 } 1010 1011 /* Above we incremented 'count' if the exhausted list was in its set. This 1012 * has made it so that 'count' being below 2 means there is nothing left to 1013 * output; otheriwse what's left to add to the intersection is precisely 1014 * that which is left in the non-exhausted input list. 1015 * 1016 * To see why, note first that the exhausted input obviously has nothing 1017 * left to affect the intersection. If it was in its set at its end, that 1018 * means the set extends from here to the platform's infinity, and hence 1019 * anything in the non-exhausted's list will be in the intersection, and 1020 * anything not in it won't be. Hence, the rest of the intersection is 1021 * precisely what's in the non-exhausted list The exhausted set also 1022 * contributed 1 to 'count', meaning 'count' was at least 1. Incrementing 1023 * it means 'count' is now at least 2. This is consistent with the 1024 * incremented 'count' being >= 2 means to add the non-exhausted list to 1025 * the intersection. 1026 * 1027 * But if the exhausted input wasn't in its set, it contributed 0 to 1028 * 'count', and the intersection can't include anything further; the 1029 * non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get 1030 * incremented. This is consistent with 'count' being < 2 meaning nothing 1031 * further to add to the intersection. */ 1032 if (count < 2) { /* Nothing left to put in the intersection. */ 1033 len_r = i_r; 1034 } 1035 else { /* copy the non-exhausted list, unchanged. */ 1036 IV copy_count = len_a - i_a; 1037 if (copy_count > 0) { /* a is the one with stuff left */ 1038 Copy(array_a + i_a, array_r + i_r, copy_count, UV); 1039 } 1040 else { /* b is the one with stuff left */ 1041 copy_count = len_b - i_b; 1042 Copy(array_b + i_b, array_r + i_r, copy_count, UV); 1043 } 1044 len_r = i_r + copy_count; 1045 } 1046 1047 /* Set the result to the final length, which can change the pointer to 1048 * array_r, so re-find it. (Note that it is unlikely that this will 1049 * change, as we are shrinking the space, not enlarging it) */ 1050 if (len_r != _invlist_len(r)) { 1051 invlist_set_len(r, len_r, *get_invlist_offset_addr(r)); 1052 invlist_trim(r); 1053 array_r = invlist_array(r); 1054 } 1055 1056 if (*i == NULL) { /* Simply return the calculated intersection */ 1057 *i = r; 1058 } 1059 else { /* Otherwise, replace the existing inversion list in '*i'. We could 1060 instead free '*i', and then set it to 'r', but experience has 1061 shown [perl #127392] that if the input is a mortal, we can get a 1062 huge build-up of these during regex compilation before they get 1063 freed. */ 1064 if (len_r) { 1065 invlist_replace_list_destroys_src(*i, r); 1066 } 1067 else { 1068 invlist_clear(*i); 1069 } 1070 SvREFCNT_dec_NN(r); 1071 } 1072 1073 return; 1074 } 1075 1076 SV* 1077 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end) 1078 { 1079 /* Add the range from 'start' to 'end' inclusive to the inversion list's 1080 * set. A pointer to the inversion list is returned. This may actually be 1081 * a new list, in which case the passed in one has been destroyed. The 1082 * passed-in inversion list can be NULL, in which case a new one is created 1083 * with just the one range in it. The new list is not necessarily 1084 * NUL-terminated. Space is not freed if the inversion list shrinks as a 1085 * result of this function. The gain would not be large, and in many 1086 * cases, this is called multiple times on a single inversion list, so 1087 * anything freed may almost immediately be needed again. 1088 * 1089 * This used to mostly call the 'union' routine, but that is much more 1090 * heavyweight than really needed for a single range addition */ 1091 1092 UV* array; /* The array implementing the inversion list */ 1093 UV len; /* How many elements in 'array' */ 1094 SSize_t i_s; /* index into the invlist array where 'start' 1095 should go */ 1096 SSize_t i_e = 0; /* And the index where 'end' should go */ 1097 UV cur_highest; /* The highest code point in the inversion list 1098 upon entry to this function */ 1099 1100 /* This range becomes the whole inversion list if none already existed */ 1101 if (invlist == NULL) { 1102 invlist = _new_invlist(2); 1103 _append_range_to_invlist(invlist, start, end); 1104 return invlist; 1105 } 1106 1107 /* Likewise, if the inversion list is currently empty */ 1108 len = _invlist_len(invlist); 1109 if (len == 0) { 1110 _append_range_to_invlist(invlist, start, end); 1111 return invlist; 1112 } 1113 1114 /* Starting here, we have to know the internals of the list */ 1115 array = invlist_array(invlist); 1116 1117 /* If the new range ends higher than the current highest ... */ 1118 cur_highest = invlist_highest(invlist); 1119 if (end > cur_highest) { 1120 1121 /* If the whole range is higher, we can just append it */ 1122 if (start > cur_highest) { 1123 _append_range_to_invlist(invlist, start, end); 1124 return invlist; 1125 } 1126 1127 /* Otherwise, add the portion that is higher ... */ 1128 _append_range_to_invlist(invlist, cur_highest + 1, end); 1129 1130 /* ... and continue on below to handle the rest. As a result of the 1131 * above append, we know that the index of the end of the range is the 1132 * final even numbered one of the array. Recall that the final element 1133 * always starts a range that extends to infinity. If that range is in 1134 * the set (meaning the set goes from here to infinity), it will be an 1135 * even index, but if it isn't in the set, it's odd, and the final 1136 * range in the set is one less, which is even. */ 1137 if (end == UV_MAX) { 1138 i_e = len; 1139 } 1140 else { 1141 i_e = len - 2; 1142 } 1143 } 1144 1145 /* We have dealt with appending, now see about prepending. If the new 1146 * range starts lower than the current lowest ... */ 1147 if (start < array[0]) { 1148 1149 /* Adding something which has 0 in it is somewhat tricky, and uncommon. 1150 * Let the union code handle it, rather than having to know the 1151 * trickiness in two code places. */ 1152 if (UNLIKELY(start == 0)) { 1153 SV* range_invlist; 1154 1155 range_invlist = _new_invlist(2); 1156 _append_range_to_invlist(range_invlist, start, end); 1157 1158 _invlist_union(invlist, range_invlist, &invlist); 1159 1160 SvREFCNT_dec_NN(range_invlist); 1161 1162 return invlist; 1163 } 1164 1165 /* If the whole new range comes before the first entry, and doesn't 1166 * extend it, we have to insert it as an additional range */ 1167 if (end < array[0] - 1) { 1168 i_s = i_e = -1; 1169 goto splice_in_new_range; 1170 } 1171 1172 /* Here the new range adjoins the existing first range, extending it 1173 * downwards. */ 1174 array[0] = start; 1175 1176 /* And continue on below to handle the rest. We know that the index of 1177 * the beginning of the range is the first one of the array */ 1178 i_s = 0; 1179 } 1180 else { /* Not prepending any part of the new range to the existing list. 1181 * Find where in the list it should go. This finds i_s, such that: 1182 * invlist[i_s] <= start < array[i_s+1] 1183 */ 1184 i_s = _invlist_search(invlist, start); 1185 } 1186 1187 /* At this point, any extending before the beginning of the inversion list 1188 * and/or after the end has been done. This has made it so that, in the 1189 * code below, each endpoint of the new range is either in a range that is 1190 * in the set, or is in a gap between two ranges that are. This means we 1191 * don't have to worry about exceeding the array bounds. 1192 * 1193 * Find where in the list the new range ends (but we can skip this if we 1194 * have already determined what it is, or if it will be the same as i_s, 1195 * which we already have computed) */ 1196 if (i_e == 0) { 1197 i_e = (start == end) 1198 ? i_s 1199 : _invlist_search(invlist, end); 1200 } 1201 1202 /* Here generally invlist[i_e] <= end < array[i_e+1]. But if invlist[i_e] 1203 * is a range that goes to infinity there is no element at invlist[i_e+1], 1204 * so only the first relation holds. */ 1205 1206 if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) { 1207 1208 /* Here, the ranges on either side of the beginning of the new range 1209 * are in the set, and this range starts in the gap between them. 1210 * 1211 * The new range extends the range above it downwards if the new range 1212 * ends at or above that range's start */ 1213 const bool extends_the_range_above = ( end == UV_MAX 1214 || end + 1 >= array[i_s+1]); 1215 1216 /* The new range extends the range below it upwards if it begins just 1217 * after where that range ends */ 1218 if (start == array[i_s]) { 1219 1220 /* If the new range fills the entire gap between the other ranges, 1221 * they will get merged together. Other ranges may also get 1222 * merged, depending on how many of them the new range spans. In 1223 * the general case, we do the merge later, just once, after we 1224 * figure out how many to merge. But in the case where the new 1225 * range exactly spans just this one gap (possibly extending into 1226 * the one above), we do the merge here, and an early exit. This 1227 * is done here to avoid having to special case later. */ 1228 if (i_e - i_s <= 1) { 1229 1230 /* If i_e - i_s == 1, it means that the new range terminates 1231 * within the range above, and hence 'extends_the_range_above' 1232 * must be true. (If the range above it extends to infinity, 1233 * 'i_s+2' will be above the array's limit, but 'len-i_s-2' 1234 * will be 0, so no harm done.) */ 1235 if (extends_the_range_above) { 1236 Move(array + i_s + 2, array + i_s, len - i_s - 2, UV); 1237 invlist_set_len(invlist, 1238 len - 2, 1239 *(get_invlist_offset_addr(invlist))); 1240 return invlist; 1241 } 1242 1243 /* Here, i_e must == i_s. We keep them in sync, as they apply 1244 * to the same range, and below we are about to decrement i_s 1245 * */ 1246 i_e--; 1247 } 1248 1249 /* Here, the new range is adjacent to the one below. (It may also 1250 * span beyond the range above, but that will get resolved later.) 1251 * Extend the range below to include this one. */ 1252 array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1; 1253 i_s--; 1254 start = array[i_s]; 1255 } 1256 else if (extends_the_range_above) { 1257 1258 /* Here the new range only extends the range above it, but not the 1259 * one below. It merges with the one above. Again, we keep i_e 1260 * and i_s in sync if they point to the same range */ 1261 if (i_e == i_s) { 1262 i_e++; 1263 } 1264 i_s++; 1265 array[i_s] = start; 1266 } 1267 } 1268 1269 /* Here, we've dealt with the new range start extending any adjoining 1270 * existing ranges. 1271 * 1272 * If the new range extends to infinity, it is now the final one, 1273 * regardless of what was there before */ 1274 if (UNLIKELY(end == UV_MAX)) { 1275 invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist))); 1276 return invlist; 1277 } 1278 1279 /* If i_e started as == i_s, it has also been dealt with, 1280 * and been updated to the new i_s, which will fail the following if */ 1281 if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) { 1282 1283 /* Here, the ranges on either side of the end of the new range are in 1284 * the set, and this range ends in the gap between them. 1285 * 1286 * If this range is adjacent to (hence extends) the range above it, it 1287 * becomes part of that range; likewise if it extends the range below, 1288 * it becomes part of that range */ 1289 if (end + 1 == array[i_e+1]) { 1290 i_e++; 1291 array[i_e] = start; 1292 } 1293 else if (start <= array[i_e]) { 1294 array[i_e] = end + 1; 1295 i_e--; 1296 } 1297 } 1298 1299 if (i_s == i_e) { 1300 1301 /* If the range fits entirely in an existing range (as possibly already 1302 * extended above), it doesn't add anything new */ 1303 if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) { 1304 return invlist; 1305 } 1306 1307 /* Here, no part of the range is in the list. Must add it. It will 1308 * occupy 2 more slots */ 1309 splice_in_new_range: 1310 1311 invlist_extend(invlist, len + 2); 1312 array = invlist_array(invlist); 1313 /* Move the rest of the array down two slots. Don't include any 1314 * trailing NUL */ 1315 Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV); 1316 1317 /* Do the actual splice */ 1318 array[i_e+1] = start; 1319 array[i_e+2] = end + 1; 1320 invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist))); 1321 return invlist; 1322 } 1323 1324 /* Here the new range crossed the boundaries of a pre-existing range. The 1325 * code above has adjusted things so that both ends are in ranges that are 1326 * in the set. This means everything in between must also be in the set. 1327 * Just squash things together */ 1328 Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV); 1329 invlist_set_len(invlist, 1330 len - i_e + i_s, 1331 *(get_invlist_offset_addr(invlist))); 1332 1333 return invlist; 1334 } 1335 1336 SV* 1337 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, 1338 UV** other_elements_ptr) 1339 { 1340 /* Create and return an inversion list whose contents are to be populated 1341 * by the caller. The caller gives the number of elements (in 'size') and 1342 * the very first element ('element0'). This function will set 1343 * '*other_elements_ptr' to an array of UVs, where the remaining elements 1344 * are to be placed. 1345 * 1346 * Obviously there is some trust involved that the caller will properly 1347 * fill in the other elements of the array. 1348 * 1349 * (The first element needs to be passed in, as the underlying code does 1350 * things differently depending on whether it is zero or non-zero) */ 1351 1352 SV* invlist = _new_invlist(size); 1353 bool offset; 1354 1355 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST; 1356 1357 invlist = add_cp_to_invlist(invlist, element0); 1358 offset = *get_invlist_offset_addr(invlist); 1359 1360 invlist_set_len(invlist, size, offset); 1361 *other_elements_ptr = invlist_array(invlist) + 1; 1362 return invlist; 1363 } 1364 1365 #endif 1366 1367 #ifndef PERL_IN_XSUB_RE 1368 void 1369 Perl__invlist_invert(pTHX_ SV* const invlist) 1370 { 1371 /* Complement the input inversion list. This adds a 0 if the list didn't 1372 * have a zero; removes it otherwise. As described above, the data 1373 * structure is set up so that this is very efficient */ 1374 1375 PERL_ARGS_ASSERT__INVLIST_INVERT; 1376 1377 assert(! invlist_is_iterating(invlist)); 1378 1379 /* The inverse of matching nothing is matching everything */ 1380 if (_invlist_len(invlist) == 0) { 1381 _append_range_to_invlist(invlist, 0, UV_MAX); 1382 return; 1383 } 1384 1385 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist); 1386 } 1387 1388 SV* 1389 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist) 1390 { 1391 /* Return a new inversion list that is a copy of the input one, which is 1392 * unchanged. The new list will not be mortal even if the old one was. */ 1393 1394 const STRLEN nominal_length = _invlist_len(invlist); 1395 const STRLEN physical_length = SvCUR(invlist); 1396 const bool offset = *(get_invlist_offset_addr(invlist)); 1397 1398 PERL_ARGS_ASSERT_INVLIST_CLONE; 1399 1400 if (new_invlist == NULL) { 1401 new_invlist = _new_invlist(nominal_length); 1402 } 1403 else { 1404 sv_upgrade(new_invlist, SVt_INVLIST); 1405 initialize_invlist_guts(new_invlist, nominal_length); 1406 } 1407 1408 *(get_invlist_offset_addr(new_invlist)) = offset; 1409 invlist_set_len(new_invlist, nominal_length, offset); 1410 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char); 1411 1412 return new_invlist; 1413 } 1414 1415 #endif 1416 1417 1418 #ifndef PERL_IN_XSUB_RE 1419 void 1420 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, 1421 const char * const indent, SV* const invlist) 1422 { 1423 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the 1424 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by 1425 * the string 'indent'. The output looks like this: 1426 [0] 0x000A .. 0x000D 1427 [2] 0x0085 1428 [4] 0x2028 .. 0x2029 1429 [6] 0x3104 .. INFTY 1430 * This means that the first range of code points matched by the list are 1431 * 0xA through 0xD; the second range contains only the single code point 1432 * 0x85, etc. An inversion list is an array of UVs. Two array elements 1433 * are used to define each range (except if the final range extends to 1434 * infinity, only a single element is needed). The array index of the 1435 * first element for the corresponding range is given in brackets. */ 1436 1437 UV start, end; 1438 STRLEN count = 0; 1439 1440 PERL_ARGS_ASSERT__INVLIST_DUMP; 1441 1442 if (invlist_is_iterating(invlist)) { 1443 Perl_dump_indent(aTHX_ level, file, 1444 "%sCan't dump inversion list because is in middle of iterating\n", 1445 indent); 1446 return; 1447 } 1448 1449 invlist_iterinit(invlist); 1450 while (invlist_iternext(invlist, &start, &end)) { 1451 if (end == UV_MAX) { 1452 Perl_dump_indent(aTHX_ level, file, 1453 "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n", 1454 indent, (UV)count, start); 1455 } 1456 else if (end != start) { 1457 Perl_dump_indent(aTHX_ level, file, 1458 "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n", 1459 indent, (UV)count, start, end); 1460 } 1461 else { 1462 Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n", 1463 indent, (UV)count, start); 1464 } 1465 count += 2; 1466 } 1467 } 1468 1469 #endif 1470 1471 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE) 1472 bool 1473 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) 1474 { 1475 /* Return a boolean as to if the two passed in inversion lists are 1476 * identical. The final argument, if TRUE, says to take the complement of 1477 * the second inversion list before doing the comparison */ 1478 1479 const UV len_a = _invlist_len(a); 1480 UV len_b = _invlist_len(b); 1481 1482 const UV* array_a = NULL; 1483 const UV* array_b = NULL; 1484 1485 PERL_ARGS_ASSERT__INVLISTEQ; 1486 1487 /* This code avoids accessing the arrays unless it knows the length is 1488 * non-zero */ 1489 1490 if (len_a == 0) { 1491 if (len_b == 0) { 1492 return ! complement_b; 1493 } 1494 } 1495 else { 1496 array_a = invlist_array(a); 1497 } 1498 1499 if (len_b != 0) { 1500 array_b = invlist_array(b); 1501 } 1502 1503 /* If are to compare 'a' with the complement of b, set it 1504 * up so are looking at b's complement. */ 1505 if (complement_b) { 1506 1507 /* The complement of nothing is everything, so <a> would have to have 1508 * just one element, starting at zero (ending at infinity) */ 1509 if (len_b == 0) { 1510 return (len_a == 1 && array_a[0] == 0); 1511 } 1512 if (array_b[0] == 0) { 1513 1514 /* Otherwise, to complement, we invert. Here, the first element is 1515 * 0, just remove it. To do this, we just pretend the array starts 1516 * one later */ 1517 1518 array_b++; 1519 len_b--; 1520 } 1521 else { 1522 1523 /* But if the first element is not zero, we pretend the list starts 1524 * at the 0 that is always stored immediately before the array. */ 1525 array_b--; 1526 len_b++; 1527 } 1528 } 1529 1530 return len_a == len_b 1531 && memEQ(array_a, array_b, len_a * sizeof(array_a[0])); 1532 1533 } 1534 #endif 1535 1536 #undef HEADER_LENGTH 1537 #undef TO_INTERNAL_SIZE 1538 #undef FROM_INTERNAL_SIZE 1539 #undef INVLIST_VERSION_ID 1540 1541 /* End of inversion list object */ 1542