1 /* pp_pack.c 2 * 3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others 5 * 6 * You may distribute under the terms of either the GNU General Public 7 * License or the Artistic License, as specified in the README file. 8 * 9 */ 10 11 /* 12 * He still hopefully carried some of his gear in his pack: a small tinder-box, 13 * two small shallow pans, the smaller fitting into the larger; inside them a 14 * wooden spoon, a short two-pronged fork and some skewers were stowed; and 15 * hidden at the bottom of the pack in a flat wooden box a dwindling treasure, 16 * some salt. 17 * 18 * [p.653 of _The Lord of the Rings_, IV/iv: "Of Herbs and Stewed Rabbit"] 19 */ 20 21 /* This file contains pp ("push/pop") functions that 22 * execute the opcodes that make up a perl program. A typical pp function 23 * expects to find its arguments on the stack, and usually pushes its 24 * results onto the stack, hence the 'pp' terminology. Each OP structure 25 * contains a pointer to the relevant pp_foo() function. 26 * 27 * This particular file just contains pp_pack() and pp_unpack(). See the 28 * other pp*.c files for the rest of the pp_ functions. 29 */ 30 31 #include "EXTERN.h" 32 #define PERL_IN_PP_PACK_C 33 #include "perl.h" 34 35 /* Types used by pack/unpack */ 36 typedef enum { 37 e_no_len, /* no length */ 38 e_number, /* number, [] */ 39 e_star /* asterisk */ 40 } howlen_t; 41 42 typedef struct tempsym { 43 const char* patptr; /* current template char */ 44 const char* patend; /* one after last char */ 45 const char* grpbeg; /* 1st char of ()-group */ 46 const char* grpend; /* end of ()-group */ 47 I32 code; /* template code (!<>) */ 48 I32 length; /* length/repeat count */ 49 howlen_t howlen; /* how length is given */ 50 int level; /* () nesting level */ 51 U32 flags; /* /=4, comma=2, pack=1 */ 52 /* and group modifiers */ 53 STRLEN strbeg; /* offset of group start */ 54 struct tempsym *previous; /* previous group */ 55 } tempsym_t; 56 57 #define TEMPSYM_INIT(symptr, p, e, f) \ 58 STMT_START { \ 59 (symptr)->patptr = (p); \ 60 (symptr)->patend = (e); \ 61 (symptr)->grpbeg = NULL; \ 62 (symptr)->grpend = NULL; \ 63 (symptr)->grpend = NULL; \ 64 (symptr)->code = 0; \ 65 (symptr)->length = 0; \ 66 (symptr)->howlen = e_no_len; \ 67 (symptr)->level = 0; \ 68 (symptr)->flags = (f); \ 69 (symptr)->strbeg = 0; \ 70 (symptr)->previous = NULL; \ 71 } STMT_END 72 73 typedef union { 74 NV nv; 75 U8 bytes[sizeof(NV)]; 76 } NV_bytes; 77 78 #if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE) 79 typedef union { 80 long double ld; 81 U8 bytes[sizeof(long double)]; 82 } ld_bytes; 83 #endif 84 85 #ifndef CHAR_BIT 86 # define CHAR_BIT 8 87 #endif 88 /* Maximum number of bytes to which a byte can grow due to upgrade */ 89 #define UTF8_EXPAND 2 90 91 /* 92 * Offset for integer pack/unpack. 93 * 94 * On architectures where I16 and I32 aren't really 16 and 32 bits, 95 * which for now are all Crays, pack and unpack have to play games. 96 */ 97 98 /* 99 * These values are required for portability of pack() output. 100 * If they're not right on your machine, then pack() and unpack() 101 * wouldn't work right anyway; you'll need to apply the Cray hack. 102 * (I'd like to check them with #if, but you can't use sizeof() in 103 * the preprocessor.) --??? 104 */ 105 /* 106 The appropriate SHORTSIZE, INTSIZE, LONGSIZE, and LONGLONGSIZE 107 defines are now in config.h. --Andy Dougherty April 1998 108 */ 109 #define SIZE16 2 110 #define SIZE32 4 111 112 /* CROSSCOMPILE and MULTIARCH are going to affect pp_pack() and pp_unpack(). 113 --jhi Feb 1999 */ 114 115 #if U16SIZE > SIZE16 || U32SIZE > SIZE32 116 # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 /* little-endian */ 117 # define OFF16(p) ((char*)(p)) 118 # define OFF32(p) ((char*)(p)) 119 # else 120 # if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 /* big-endian */ 121 # define OFF16(p) ((char*)(p) + (sizeof(U16) - SIZE16)) 122 # define OFF32(p) ((char*)(p) + (sizeof(U32) - SIZE32)) 123 # else 124 ++++ bad cray byte order 125 # endif 126 # endif 127 #else 128 # define OFF16(p) ((char *) (p)) 129 # define OFF32(p) ((char *) (p)) 130 #endif 131 132 #define PUSH16(utf8, cur, p, needs_swap) \ 133 PUSH_BYTES(utf8, cur, OFF16(p), SIZE16, needs_swap) 134 #define PUSH32(utf8, cur, p, needs_swap) \ 135 PUSH_BYTES(utf8, cur, OFF32(p), SIZE32, needs_swap) 136 137 #if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 /* big-endian */ 138 # define NEEDS_SWAP(d) (TYPE_ENDIANNESS(d) == TYPE_IS_LITTLE_ENDIAN) 139 #elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 /* little-endian */ 140 # define NEEDS_SWAP(d) (TYPE_ENDIANNESS(d) == TYPE_IS_BIG_ENDIAN) 141 #else 142 # error "Unsupported byteorder" 143 /* Need to add code here to re-instate mixed endian support. 144 NEEDS_SWAP would need to hold a flag indicating which action to 145 take, and S_reverse_copy and the code in S_utf8_to_bytes would need 146 logic adding to deal with any mixed-endian transformations needed. 147 */ 148 #endif 149 150 /* Only to be used inside a loop (see the break) */ 151 #define SHIFT_BYTES(utf8, s, strend, buf, len, datumtype, needs_swap) \ 152 STMT_START { \ 153 if (UNLIKELY(utf8)) { \ 154 if (!S_utf8_to_bytes(aTHX_ &s, strend, \ 155 (char *) (buf), len, datumtype)) break; \ 156 } else { \ 157 if (UNLIKELY(needs_swap)) \ 158 S_reverse_copy(s, (char *) (buf), len); \ 159 else \ 160 Copy(s, (char *) (buf), len, char); \ 161 s += len; \ 162 } \ 163 } STMT_END 164 165 #define SHIFT16(utf8, s, strend, p, datumtype, needs_swap) \ 166 SHIFT_BYTES(utf8, s, strend, OFF16(p), SIZE16, datumtype, needs_swap) 167 168 #define SHIFT32(utf8, s, strend, p, datumtype, needs_swap) \ 169 SHIFT_BYTES(utf8, s, strend, OFF32(p), SIZE32, datumtype, needs_swap) 170 171 #define SHIFT_VAR(utf8, s, strend, var, datumtype, needs_swap) \ 172 SHIFT_BYTES(utf8, s, strend, &(var), sizeof(var), datumtype, needs_swap) 173 174 #define PUSH_VAR(utf8, aptr, var, needs_swap) \ 175 PUSH_BYTES(utf8, aptr, &(var), sizeof(var), needs_swap) 176 177 /* Avoid stack overflow due to pathological templates. 100 should be plenty. */ 178 #define MAX_SUB_TEMPLATE_LEVEL 100 179 180 /* flags (note that type modifiers can also be used as flags!) */ 181 #define FLAG_WAS_UTF8 0x40 182 #define FLAG_PARSE_UTF8 0x20 /* Parse as utf8 */ 183 #define FLAG_UNPACK_ONLY_ONE 0x10 184 #define FLAG_DO_UTF8 0x08 /* The underlying string is utf8 */ 185 #define FLAG_SLASH 0x04 186 #define FLAG_COMMA 0x02 187 #define FLAG_PACK 0x01 188 189 STATIC SV * 190 S_mul128(pTHX_ SV *sv, U8 m) 191 { 192 STRLEN len; 193 char *s = SvPV(sv, len); 194 char *t; 195 196 PERL_ARGS_ASSERT_MUL128; 197 198 if (!strnEQ(s, "0000", 4)) { /* need to grow sv */ 199 SV * const tmpNew = newSVpvs("0000000000"); 200 201 sv_catsv(tmpNew, sv); 202 SvREFCNT_dec(sv); /* free old sv */ 203 sv = tmpNew; 204 s = SvPV(sv, len); 205 } 206 t = s + len - 1; 207 while (!*t) /* trailing '\0'? */ 208 t--; 209 while (t > s) { 210 const U32 i = ((*t - '0') << 7) + m; 211 *(t--) = '0' + (char)(i % 10); 212 m = (char)(i / 10); 213 } 214 return (sv); 215 } 216 217 /* Explosives and implosives. */ 218 219 #define ISUUCHAR(ch) (NATIVE_TO_LATIN1(ch) >= NATIVE_TO_LATIN1(' ') \ 220 && NATIVE_TO_LATIN1(ch) < NATIVE_TO_LATIN1('a')) 221 222 /* type modifiers */ 223 #define TYPE_IS_SHRIEKING 0x100 224 #define TYPE_IS_BIG_ENDIAN 0x200 225 #define TYPE_IS_LITTLE_ENDIAN 0x400 226 #define TYPE_IS_PACK 0x800 227 #define TYPE_ENDIANNESS_MASK (TYPE_IS_BIG_ENDIAN|TYPE_IS_LITTLE_ENDIAN) 228 #define TYPE_MODIFIERS(t) ((t) & ~0xFF) 229 #define TYPE_NO_MODIFIERS(t) ((t) & 0xFF) 230 231 # define TYPE_ENDIANNESS(t) ((t) & TYPE_ENDIANNESS_MASK) 232 # define TYPE_NO_ENDIANNESS(t) ((t) & ~TYPE_ENDIANNESS_MASK) 233 234 # define ENDIANNESS_ALLOWED_TYPES "sSiIlLqQjJfFdDpP(" 235 236 #define PACK_SIZE_CANNOT_CSUM 0x80 237 #define PACK_SIZE_UNPREDICTABLE 0x40 /* Not a fixed size element */ 238 #define PACK_SIZE_MASK 0x3F 239 240 #include "packsizetables.inc" 241 242 static void 243 S_reverse_copy(const char *src, char *dest, STRLEN len) 244 { 245 dest += len; 246 while (len--) 247 *--dest = *src++; 248 } 249 250 STATIC U8 251 utf8_to_byte(pTHX_ const char **s, const char *end, I32 datumtype) 252 { 253 STRLEN retlen; 254 UV val = utf8n_to_uvchr((U8 *) *s, end-*s, &retlen, 255 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); 256 /* We try to process malformed UTF-8 as much as possible (preferably with 257 warnings), but these two mean we make no progress in the string and 258 might enter an infinite loop */ 259 if (retlen == (STRLEN) -1 || retlen == 0) 260 Perl_croak(aTHX_ "Malformed UTF-8 string in '%c' format in unpack", 261 (int) TYPE_NO_MODIFIERS(datumtype)); 262 if (val >= 0x100) { 263 Perl_ck_warner(aTHX_ packWARN(WARN_UNPACK), 264 "Character in '%c' format wrapped in unpack", 265 (int) TYPE_NO_MODIFIERS(datumtype)); 266 val &= 0xff; 267 } 268 *s += retlen; 269 return (U8)val; 270 } 271 272 #define SHIFT_BYTE(utf8, s, strend, datumtype) ((utf8) ? \ 273 utf8_to_byte(aTHX_ &(s), (strend), (datumtype)) : \ 274 *(U8 *)(s)++) 275 276 STATIC bool 277 S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, int buf_len, I32 datumtype) 278 { 279 UV val; 280 STRLEN retlen; 281 const char *from = *s; 282 int bad = 0; 283 const U32 flags = ckWARN(WARN_UTF8) ? 284 UTF8_CHECK_ONLY : (UTF8_CHECK_ONLY | UTF8_ALLOW_ANY); 285 const bool needs_swap = NEEDS_SWAP(datumtype); 286 287 if (UNLIKELY(needs_swap)) 288 buf += buf_len; 289 290 for (;buf_len > 0; buf_len--) { 291 if (from >= end) return FALSE; 292 val = utf8n_to_uvchr((U8 *) from, end-from, &retlen, flags); 293 if (retlen == (STRLEN) -1 || retlen == 0) { 294 from += UTF8SKIP(from); 295 bad |= 1; 296 } else from += retlen; 297 if (val >= 0x100) { 298 bad |= 2; 299 val &= 0xff; 300 } 301 if (UNLIKELY(needs_swap)) 302 *(U8 *)--buf = (U8)val; 303 else 304 *(U8 *)buf++ = (U8)val; 305 } 306 /* We have enough characters for the buffer. Did we have problems ? */ 307 if (bad) { 308 if (bad & 1) { 309 /* Rewalk the string fragment while warning */ 310 const char *ptr; 311 const int flags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY; 312 for (ptr = *s; ptr < from; ptr += UTF8SKIP(ptr)) { 313 if (ptr >= end) break; 314 utf8n_to_uvchr((U8 *) ptr, end-ptr, &retlen, flags); 315 } 316 if (from > end) from = end; 317 } 318 if ((bad & 2)) 319 Perl_ck_warner(aTHX_ packWARN(datumtype & TYPE_IS_PACK ? 320 WARN_PACK : WARN_UNPACK), 321 "Character(s) in '%c' format wrapped in %s", 322 (int) TYPE_NO_MODIFIERS(datumtype), 323 datumtype & TYPE_IS_PACK ? "pack" : "unpack"); 324 } 325 *s = from; 326 return TRUE; 327 } 328 329 STATIC char * 330 S_my_bytes_to_utf8(const U8 *start, STRLEN len, char *dest, const bool needs_swap) { 331 PERL_ARGS_ASSERT_MY_BYTES_TO_UTF8; 332 333 if (UNLIKELY(needs_swap)) { 334 const U8 *p = start + len; 335 while (p-- > start) { 336 append_utf8_from_native_byte(*p, (U8 **) & dest); 337 } 338 } else { 339 const U8 * const end = start + len; 340 while (start < end) { 341 append_utf8_from_native_byte(*start, (U8 **) & dest); 342 start++; 343 } 344 } 345 return dest; 346 } 347 348 #define PUSH_BYTES(utf8, cur, buf, len, needs_swap) \ 349 STMT_START { \ 350 if (UNLIKELY(utf8)) \ 351 (cur) = my_bytes_to_utf8((U8 *) buf, len, (cur), needs_swap); \ 352 else { \ 353 if (UNLIKELY(needs_swap)) \ 354 S_reverse_copy((char *)(buf), cur, len); \ 355 else \ 356 Copy(buf, cur, len, char); \ 357 (cur) += (len); \ 358 } \ 359 } STMT_END 360 361 #define GROWING(utf8, cat, start, cur, in_len) \ 362 STMT_START { \ 363 STRLEN glen = (in_len); \ 364 if (utf8) glen *= UTF8_EXPAND; \ 365 if ((cur) + glen >= (start) + SvLEN(cat)) { \ 366 (start) = sv_exp_grow(cat, glen); \ 367 (cur) = (start) + SvCUR(cat); \ 368 } \ 369 } STMT_END 370 371 #define PUSH_GROWING_BYTES(utf8, cat, start, cur, buf, in_len) \ 372 STMT_START { \ 373 const STRLEN glen = (in_len); \ 374 STRLEN gl = glen; \ 375 if (utf8) gl *= UTF8_EXPAND; \ 376 if ((cur) + gl >= (start) + SvLEN(cat)) { \ 377 *cur = '\0'; \ 378 SvCUR_set((cat), (cur) - (start)); \ 379 (start) = sv_exp_grow(cat, gl); \ 380 (cur) = (start) + SvCUR(cat); \ 381 } \ 382 PUSH_BYTES(utf8, cur, buf, glen, 0); \ 383 } STMT_END 384 385 #define PUSH_BYTE(utf8, s, byte) \ 386 STMT_START { \ 387 if (utf8) { \ 388 const U8 au8 = (byte); \ 389 (s) = my_bytes_to_utf8(&au8, 1, (s), 0);\ 390 } else *(U8 *)(s)++ = (byte); \ 391 } STMT_END 392 393 /* Only to be used inside a loop (see the break) */ 394 #define NEXT_UNI_VAL(val, cur, str, end, utf8_flags) \ 395 STMT_START { \ 396 STRLEN retlen; \ 397 if (str >= end) break; \ 398 val = utf8n_to_uvchr((U8 *) str, end-str, &retlen, utf8_flags); \ 399 if (retlen == (STRLEN) -1 || retlen == 0) { \ 400 *cur = '\0'; \ 401 Perl_croak(aTHX_ "Malformed UTF-8 string in pack"); \ 402 } \ 403 str += retlen; \ 404 } STMT_END 405 406 static const char *_action( const tempsym_t* symptr ) 407 { 408 return (const char *)(( symptr->flags & FLAG_PACK ) ? "pack" : "unpack"); 409 } 410 411 /* Returns the sizeof() struct described by pat */ 412 STATIC I32 413 S_measure_struct(pTHX_ tempsym_t* symptr) 414 { 415 I32 total = 0; 416 417 PERL_ARGS_ASSERT_MEASURE_STRUCT; 418 419 while (next_symbol(symptr)) { 420 I32 len; 421 int size; 422 423 switch (symptr->howlen) { 424 case e_star: 425 Perl_croak(aTHX_ "Within []-length '*' not allowed in %s", 426 _action( symptr ) ); 427 428 default: 429 /* e_no_len and e_number */ 430 len = symptr->length; 431 break; 432 } 433 434 size = packprops[TYPE_NO_ENDIANNESS(symptr->code)] & PACK_SIZE_MASK; 435 if (!size) { 436 int star; 437 /* endianness doesn't influence the size of a type */ 438 switch(TYPE_NO_ENDIANNESS(symptr->code)) { 439 default: 440 Perl_croak(aTHX_ "Invalid type '%c' in %s", 441 (int)TYPE_NO_MODIFIERS(symptr->code), 442 _action( symptr ) ); 443 case '.' | TYPE_IS_SHRIEKING: 444 case '@' | TYPE_IS_SHRIEKING: 445 case '@': 446 case '.': 447 case '/': 448 case 'U': /* XXXX Is it correct? */ 449 case 'w': 450 case 'u': 451 Perl_croak(aTHX_ "Within []-length '%c' not allowed in %s", 452 (int) TYPE_NO_MODIFIERS(symptr->code), 453 _action( symptr ) ); 454 case '%': 455 size = 0; 456 break; 457 case '(': 458 { 459 tempsym_t savsym = *symptr; 460 symptr->patptr = savsym.grpbeg; 461 symptr->patend = savsym.grpend; 462 /* XXXX Theoretically, we need to measure many times at 463 different positions, since the subexpression may contain 464 alignment commands, but be not of aligned length. 465 Need to detect this and croak(). */ 466 size = measure_struct(symptr); 467 *symptr = savsym; 468 break; 469 } 470 case 'X' | TYPE_IS_SHRIEKING: 471 /* XXXX Is this useful? Then need to treat MEASURE_BACKWARDS. 472 */ 473 if (!len) /* Avoid division by 0 */ 474 len = 1; 475 len = total % len; /* Assumed: the start is aligned. */ 476 /* FALLTHROUGH */ 477 case 'X': 478 size = -1; 479 if (total < len) 480 Perl_croak(aTHX_ "'X' outside of string in %s", _action( symptr ) ); 481 break; 482 case 'x' | TYPE_IS_SHRIEKING: 483 if (!len) /* Avoid division by 0 */ 484 len = 1; 485 star = total % len; /* Assumed: the start is aligned. */ 486 if (star) /* Other portable ways? */ 487 len = len - star; 488 else 489 len = 0; 490 /* FALLTHROUGH */ 491 case 'x': 492 case 'A': 493 case 'Z': 494 case 'a': 495 size = 1; 496 break; 497 case 'B': 498 case 'b': 499 len = (len + 7)/8; 500 size = 1; 501 break; 502 case 'H': 503 case 'h': 504 len = (len + 1)/2; 505 size = 1; 506 break; 507 508 case 'P': 509 len = 1; 510 size = sizeof(char*); 511 break; 512 } 513 } 514 total += len * size; 515 } 516 return total; 517 } 518 519 520 /* locate matching closing parenthesis or bracket 521 * returns char pointer to char after match, or NULL 522 */ 523 STATIC const char * 524 S_group_end(pTHX_ const char *patptr, const char *patend, char ender) 525 { 526 PERL_ARGS_ASSERT_GROUP_END; 527 528 while (patptr < patend) { 529 const char c = *patptr++; 530 531 if (isSPACE(c)) 532 continue; 533 else if (c == ender) 534 return patptr-1; 535 else if (c == '#') { 536 while (patptr < patend && *patptr != '\n') 537 patptr++; 538 continue; 539 } else if (c == '(') 540 patptr = group_end(patptr, patend, ')') + 1; 541 else if (c == '[') 542 patptr = group_end(patptr, patend, ']') + 1; 543 } 544 Perl_croak(aTHX_ "No group ending character '%c' found in template", 545 ender); 546 NOT_REACHED; /* NOTREACHED */ 547 } 548 549 550 /* Convert unsigned decimal number to binary. 551 * Expects a pointer to the first digit and address of length variable 552 * Advances char pointer to 1st non-digit char and returns number 553 */ 554 STATIC const char * 555 S_get_num(pTHX_ const char *patptr, I32 *lenptr ) 556 { 557 I32 len = *patptr++ - '0'; 558 559 PERL_ARGS_ASSERT_GET_NUM; 560 561 while (isDIGIT(*patptr)) { 562 if (len >= 0x7FFFFFFF/10) 563 Perl_croak(aTHX_ "pack/unpack repeat count overflow"); 564 len = (len * 10) + (*patptr++ - '0'); 565 } 566 *lenptr = len; 567 return patptr; 568 } 569 570 /* The marvellous template parsing routine: Using state stored in *symptr, 571 * locates next template code and count 572 */ 573 STATIC bool 574 S_next_symbol(pTHX_ tempsym_t* symptr ) 575 { 576 const char* patptr = symptr->patptr; 577 const char* const patend = symptr->patend; 578 579 PERL_ARGS_ASSERT_NEXT_SYMBOL; 580 581 symptr->flags &= ~FLAG_SLASH; 582 583 while (patptr < patend) { 584 if (isSPACE(*patptr)) 585 patptr++; 586 else if (*patptr == '#') { 587 patptr++; 588 while (patptr < patend && *patptr != '\n') 589 patptr++; 590 if (patptr < patend) 591 patptr++; 592 } else { 593 /* We should have found a template code */ 594 I32 code = *patptr++ & 0xFF; 595 U32 inherited_modifiers = 0; 596 597 if (code == ','){ /* grandfather in commas but with a warning */ 598 if (((symptr->flags & FLAG_COMMA) == 0) && ckWARN(WARN_UNPACK)){ 599 symptr->flags |= FLAG_COMMA; 600 Perl_warner(aTHX_ packWARN(WARN_UNPACK), 601 "Invalid type ',' in %s", _action( symptr ) ); 602 } 603 continue; 604 } 605 606 /* for '(', skip to ')' */ 607 if (code == '(') { 608 if( isDIGIT(*patptr) || *patptr == '*' || *patptr == '[' ) 609 Perl_croak(aTHX_ "()-group starts with a count in %s", 610 _action( symptr ) ); 611 symptr->grpbeg = patptr; 612 patptr = 1 + ( symptr->grpend = group_end(patptr, patend, ')') ); 613 if( symptr->level >= MAX_SUB_TEMPLATE_LEVEL ) 614 Perl_croak(aTHX_ "Too deeply nested ()-groups in %s", 615 _action( symptr ) ); 616 } 617 618 /* look for group modifiers to inherit */ 619 if (TYPE_ENDIANNESS(symptr->flags)) { 620 if (strchr(ENDIANNESS_ALLOWED_TYPES, TYPE_NO_MODIFIERS(code))) 621 inherited_modifiers |= TYPE_ENDIANNESS(symptr->flags); 622 } 623 624 /* look for modifiers */ 625 while (patptr < patend) { 626 const char *allowed; 627 I32 modifier; 628 switch (*patptr) { 629 case '!': 630 modifier = TYPE_IS_SHRIEKING; 631 allowed = "sSiIlLxXnNvV@."; 632 break; 633 case '>': 634 modifier = TYPE_IS_BIG_ENDIAN; 635 allowed = ENDIANNESS_ALLOWED_TYPES; 636 break; 637 case '<': 638 modifier = TYPE_IS_LITTLE_ENDIAN; 639 allowed = ENDIANNESS_ALLOWED_TYPES; 640 break; 641 default: 642 allowed = ""; 643 modifier = 0; 644 break; 645 } 646 647 if (modifier == 0) 648 break; 649 650 if (!strchr(allowed, TYPE_NO_MODIFIERS(code))) 651 Perl_croak(aTHX_ "'%c' allowed only after types %s in %s", *patptr, 652 allowed, _action( symptr ) ); 653 654 if (TYPE_ENDIANNESS(code | modifier) == TYPE_ENDIANNESS_MASK) 655 Perl_croak(aTHX_ "Can't use both '<' and '>' after type '%c' in %s", 656 (int) TYPE_NO_MODIFIERS(code), _action( symptr ) ); 657 else if (TYPE_ENDIANNESS(code | modifier | inherited_modifiers) == 658 TYPE_ENDIANNESS_MASK) 659 Perl_croak(aTHX_ "Can't use '%c' in a group with different byte-order in %s", 660 *patptr, _action( symptr ) ); 661 662 if ((code & modifier)) { 663 Perl_ck_warner(aTHX_ packWARN(WARN_UNPACK), 664 "Duplicate modifier '%c' after '%c' in %s", 665 *patptr, (int) TYPE_NO_MODIFIERS(code), 666 _action( symptr ) ); 667 } 668 669 code |= modifier; 670 patptr++; 671 } 672 673 /* inherit modifiers */ 674 code |= inherited_modifiers; 675 676 /* look for count and/or / */ 677 if (patptr < patend) { 678 if (isDIGIT(*patptr)) { 679 patptr = get_num( patptr, &symptr->length ); 680 symptr->howlen = e_number; 681 682 } else if (*patptr == '*') { 683 patptr++; 684 symptr->howlen = e_star; 685 686 } else if (*patptr == '[') { 687 const char* lenptr = ++patptr; 688 symptr->howlen = e_number; 689 patptr = group_end( patptr, patend, ']' ) + 1; 690 /* what kind of [] is it? */ 691 if (isDIGIT(*lenptr)) { 692 lenptr = get_num( lenptr, &symptr->length ); 693 if( *lenptr != ']' ) 694 Perl_croak(aTHX_ "Malformed integer in [] in %s", 695 _action( symptr ) ); 696 } else { 697 tempsym_t savsym = *symptr; 698 symptr->patend = patptr-1; 699 symptr->patptr = lenptr; 700 savsym.length = measure_struct(symptr); 701 *symptr = savsym; 702 } 703 } else { 704 symptr->howlen = e_no_len; 705 symptr->length = 1; 706 } 707 708 /* try to find / */ 709 while (patptr < patend) { 710 if (isSPACE(*patptr)) 711 patptr++; 712 else if (*patptr == '#') { 713 patptr++; 714 while (patptr < patend && *patptr != '\n') 715 patptr++; 716 if (patptr < patend) 717 patptr++; 718 } else { 719 if (*patptr == '/') { 720 symptr->flags |= FLAG_SLASH; 721 patptr++; 722 if (patptr < patend && 723 (isDIGIT(*patptr) || *patptr == '*' || *patptr == '[')) 724 Perl_croak(aTHX_ "'/' does not take a repeat count in %s", 725 _action( symptr ) ); 726 } 727 break; 728 } 729 } 730 } else { 731 /* at end - no count, no / */ 732 symptr->howlen = e_no_len; 733 symptr->length = 1; 734 } 735 736 symptr->code = code; 737 symptr->patptr = patptr; 738 return TRUE; 739 } 740 } 741 symptr->patptr = patptr; 742 return FALSE; 743 } 744 745 /* 746 There is no way to cleanly handle the case where we should process the 747 string per byte in its upgraded form while it's really in downgraded form 748 (e.g. estimates like strend-s as an upper bound for the number of 749 characters left wouldn't work). So if we foresee the need of this 750 (pattern starts with U or contains U0), we want to work on the encoded 751 version of the string. Users are advised to upgrade their pack string 752 themselves if they need to do a lot of unpacks like this on it 753 */ 754 STATIC bool 755 need_utf8(const char *pat, const char *patend) 756 { 757 bool first = TRUE; 758 759 PERL_ARGS_ASSERT_NEED_UTF8; 760 761 while (pat < patend) { 762 if (pat[0] == '#') { 763 pat++; 764 pat = (const char *) memchr(pat, '\n', patend-pat); 765 if (!pat) return FALSE; 766 } else if (pat[0] == 'U') { 767 if (first || pat[1] == '0') return TRUE; 768 } else first = FALSE; 769 pat++; 770 } 771 return FALSE; 772 } 773 774 STATIC char 775 first_symbol(const char *pat, const char *patend) { 776 PERL_ARGS_ASSERT_FIRST_SYMBOL; 777 778 while (pat < patend) { 779 if (pat[0] != '#') return pat[0]; 780 pat++; 781 pat = (const char *) memchr(pat, '\n', patend-pat); 782 if (!pat) return 0; 783 pat++; 784 } 785 return 0; 786 } 787 788 /* 789 790 =head1 Pack and Unpack 791 792 =for apidoc unpackstring 793 794 The engine implementing the C<unpack()> Perl function. 795 796 Using the template C<pat..patend>, this function unpacks the string 797 C<s..strend> into a number of mortal SVs, which it pushes onto the perl 798 argument (C<@_>) stack (so you will need to issue a C<PUTBACK> before and 799 C<SPAGAIN> after the call to this function). It returns the number of 800 pushed elements. 801 802 The C<strend> and C<patend> pointers should point to the byte following the 803 last character of each string. 804 805 Although this function returns its values on the perl argument stack, it 806 doesn't take any parameters from that stack (and thus in particular 807 there's no need to do a C<PUSHMARK> before calling it, unlike L</call_pv> for 808 example). 809 810 =cut */ 811 812 I32 813 Perl_unpackstring(pTHX_ const char *pat, const char *patend, const char *s, const char *strend, U32 flags) 814 { 815 tempsym_t sym; 816 817 PERL_ARGS_ASSERT_UNPACKSTRING; 818 819 if (flags & FLAG_DO_UTF8) flags |= FLAG_WAS_UTF8; 820 else if (need_utf8(pat, patend)) { 821 /* We probably should try to avoid this in case a scalar context call 822 wouldn't get to the "U0" */ 823 STRLEN len = strend - s; 824 s = (char *) bytes_to_utf8((U8 *) s, &len); 825 SAVEFREEPV(s); 826 strend = s + len; 827 flags |= FLAG_DO_UTF8; 828 } 829 830 if (first_symbol(pat, patend) != 'U' && (flags & FLAG_DO_UTF8)) 831 flags |= FLAG_PARSE_UTF8; 832 833 TEMPSYM_INIT(&sym, pat, patend, flags); 834 835 return unpack_rec(&sym, s, s, strend, NULL ); 836 } 837 838 STATIC I32 839 S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const char *strend, const char **new_s ) 840 { 841 dSP; 842 SV *sv = NULL; 843 const I32 start_sp_offset = SP - PL_stack_base; 844 howlen_t howlen; 845 I32 checksum = 0; 846 UV cuv = 0; 847 NV cdouble = 0.0; 848 const int bits_in_uv = CHAR_BIT * sizeof(cuv); 849 bool beyond = FALSE; 850 bool explicit_length; 851 const bool unpack_only_one = (symptr->flags & FLAG_UNPACK_ONLY_ONE) != 0; 852 bool utf8 = (symptr->flags & FLAG_PARSE_UTF8) ? 1 : 0; 853 854 PERL_ARGS_ASSERT_UNPACK_REC; 855 856 symptr->strbeg = s - strbeg; 857 858 while (next_symbol(symptr)) { 859 packprops_t props; 860 I32 len; 861 I32 datumtype = symptr->code; 862 bool needs_swap; 863 /* do first one only unless in list context 864 / is implemented by unpacking the count, then popping it from the 865 stack, so must check that we're not in the middle of a / */ 866 if ( unpack_only_one 867 && (SP - PL_stack_base == start_sp_offset + 1) 868 && (datumtype != '/') ) /* XXX can this be omitted */ 869 break; 870 871 switch (howlen = symptr->howlen) { 872 case e_star: 873 len = strend - strbeg; /* long enough */ 874 break; 875 default: 876 /* e_no_len and e_number */ 877 len = symptr->length; 878 break; 879 } 880 881 explicit_length = TRUE; 882 redo_switch: 883 beyond = s >= strend; 884 885 props = packprops[TYPE_NO_ENDIANNESS(datumtype)]; 886 if (props) { 887 /* props nonzero means we can process this letter. */ 888 const long size = props & PACK_SIZE_MASK; 889 const long howmany = (strend - s) / size; 890 if (len > howmany) 891 len = howmany; 892 893 if (!checksum || (props & PACK_SIZE_CANNOT_CSUM)) { 894 if (len && unpack_only_one) len = 1; 895 EXTEND(SP, len); 896 EXTEND_MORTAL(len); 897 } 898 } 899 900 needs_swap = NEEDS_SWAP(datumtype); 901 902 switch(TYPE_NO_ENDIANNESS(datumtype)) { 903 default: 904 Perl_croak(aTHX_ "Invalid type '%c' in unpack", (int)TYPE_NO_MODIFIERS(datumtype) ); 905 906 case '%': 907 if (howlen == e_no_len) 908 len = 16; /* len is not specified */ 909 checksum = len; 910 cuv = 0; 911 cdouble = 0; 912 continue; 913 914 case '(': 915 { 916 tempsym_t savsym = *symptr; 917 const U32 group_modifiers = TYPE_MODIFIERS(datumtype & ~symptr->flags); 918 symptr->flags |= group_modifiers; 919 symptr->patend = savsym.grpend; 920 symptr->previous = &savsym; 921 symptr->level++; 922 PUTBACK; 923 if (len && unpack_only_one) len = 1; 924 while (len--) { 925 symptr->patptr = savsym.grpbeg; 926 if (utf8) symptr->flags |= FLAG_PARSE_UTF8; 927 else symptr->flags &= ~FLAG_PARSE_UTF8; 928 unpack_rec(symptr, s, strbeg, strend, &s); 929 if (s == strend && savsym.howlen == e_star) 930 break; /* No way to continue */ 931 } 932 SPAGAIN; 933 savsym.flags = symptr->flags & ~group_modifiers; 934 *symptr = savsym; 935 break; 936 } 937 case '.' | TYPE_IS_SHRIEKING: 938 case '.': { 939 const char *from; 940 SV *sv; 941 const bool u8 = utf8 && !(datumtype & TYPE_IS_SHRIEKING); 942 if (howlen == e_star) from = strbeg; 943 else if (len <= 0) from = s; 944 else { 945 tempsym_t *group = symptr; 946 947 while (--len && group) group = group->previous; 948 from = group ? strbeg + group->strbeg : strbeg; 949 } 950 sv = from <= s ? 951 newSVuv( u8 ? (UV) utf8_length((const U8*)from, (const U8*)s) : (UV) (s-from)) : 952 newSViv(-(u8 ? (IV) utf8_length((const U8*)s, (const U8*)from) : (IV) (from-s))); 953 mXPUSHs(sv); 954 break; 955 } 956 case '@' | TYPE_IS_SHRIEKING: 957 case '@': 958 s = strbeg + symptr->strbeg; 959 if (utf8 && !(datumtype & TYPE_IS_SHRIEKING)) 960 { 961 while (len > 0) { 962 if (s >= strend) 963 Perl_croak(aTHX_ "'@' outside of string in unpack"); 964 s += UTF8SKIP(s); 965 len--; 966 } 967 if (s > strend) 968 Perl_croak(aTHX_ "'@' outside of string with malformed UTF-8 in unpack"); 969 } else { 970 if (strend-s < len) 971 Perl_croak(aTHX_ "'@' outside of string in unpack"); 972 s += len; 973 } 974 break; 975 case 'X' | TYPE_IS_SHRIEKING: 976 if (!len) /* Avoid division by 0 */ 977 len = 1; 978 if (utf8) { 979 const char *hop, *last; 980 I32 l = len; 981 hop = last = strbeg; 982 while (hop < s) { 983 hop += UTF8SKIP(hop); 984 if (--l == 0) { 985 last = hop; 986 l = len; 987 } 988 } 989 if (last > s) 990 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); 991 s = last; 992 break; 993 } 994 len = (s - strbeg) % len; 995 /* FALLTHROUGH */ 996 case 'X': 997 if (utf8) { 998 while (len > 0) { 999 if (s <= strbeg) 1000 Perl_croak(aTHX_ "'X' outside of string in unpack"); 1001 while (--s, UTF8_IS_CONTINUATION(*s)) { 1002 if (s <= strbeg) 1003 Perl_croak(aTHX_ "'X' outside of string in unpack"); 1004 } 1005 len--; 1006 } 1007 } else { 1008 if (len > s - strbeg) 1009 Perl_croak(aTHX_ "'X' outside of string in unpack" ); 1010 s -= len; 1011 } 1012 break; 1013 case 'x' | TYPE_IS_SHRIEKING: { 1014 I32 ai32; 1015 if (!len) /* Avoid division by 0 */ 1016 len = 1; 1017 if (utf8) ai32 = utf8_length((U8 *) strbeg, (U8 *) s) % len; 1018 else ai32 = (s - strbeg) % len; 1019 if (ai32 == 0) break; 1020 len -= ai32; 1021 } 1022 /* FALLTHROUGH */ 1023 case 'x': 1024 if (utf8) { 1025 while (len>0) { 1026 if (s >= strend) 1027 Perl_croak(aTHX_ "'x' outside of string in unpack"); 1028 s += UTF8SKIP(s); 1029 len--; 1030 } 1031 } else { 1032 if (len > strend - s) 1033 Perl_croak(aTHX_ "'x' outside of string in unpack"); 1034 s += len; 1035 } 1036 break; 1037 case '/': 1038 Perl_croak(aTHX_ "'/' must follow a numeric type in unpack"); 1039 1040 case 'A': 1041 case 'Z': 1042 case 'a': 1043 if (checksum) { 1044 /* Preliminary length estimate is assumed done in 'W' */ 1045 if (len > strend - s) len = strend - s; 1046 goto W_checksum; 1047 } 1048 if (utf8) { 1049 I32 l; 1050 const char *hop; 1051 for (l=len, hop=s; l>0; l--, hop += UTF8SKIP(hop)) { 1052 if (hop >= strend) { 1053 if (hop > strend) 1054 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); 1055 break; 1056 } 1057 } 1058 if (hop > strend) 1059 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); 1060 len = hop - s; 1061 } else if (len > strend - s) 1062 len = strend - s; 1063 1064 if (datumtype == 'Z') { 1065 /* 'Z' strips stuff after first null */ 1066 const char *ptr, *end; 1067 end = s + len; 1068 for (ptr = s; ptr < end; ptr++) if (*ptr == 0) break; 1069 sv = newSVpvn(s, ptr-s); 1070 if (howlen == e_star) /* exact for 'Z*' */ 1071 len = ptr-s + (ptr != strend ? 1 : 0); 1072 } else if (datumtype == 'A') { 1073 /* 'A' strips both nulls and spaces */ 1074 const char *ptr; 1075 if (utf8 && (symptr->flags & FLAG_WAS_UTF8)) { 1076 for (ptr = s+len-1; ptr >= s; ptr--) 1077 if (*ptr != 0 && !UTF8_IS_CONTINUATION(*ptr) && 1078 !isSPACE_utf8(ptr)) break; 1079 if (ptr >= s) ptr += UTF8SKIP(ptr); 1080 else ptr++; 1081 if (ptr > s+len) 1082 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); 1083 } else { 1084 for (ptr = s+len-1; ptr >= s; ptr--) 1085 if (*ptr != 0 && !isSPACE(*ptr)) break; 1086 ptr++; 1087 } 1088 sv = newSVpvn(s, ptr-s); 1089 } else sv = newSVpvn(s, len); 1090 1091 if (utf8) { 1092 SvUTF8_on(sv); 1093 /* Undo any upgrade done due to need_utf8() */ 1094 if (!(symptr->flags & FLAG_WAS_UTF8)) 1095 sv_utf8_downgrade(sv, 0); 1096 } 1097 mXPUSHs(sv); 1098 s += len; 1099 break; 1100 case 'B': 1101 case 'b': { 1102 char *str; 1103 if (howlen == e_star || len > (strend - s) * 8) 1104 len = (strend - s) * 8; 1105 if (checksum) { 1106 if (utf8) 1107 while (len >= 8 && s < strend) { 1108 cuv += PL_bitcount[utf8_to_byte(aTHX_ &s, strend, datumtype)]; 1109 len -= 8; 1110 } 1111 else 1112 while (len >= 8) { 1113 cuv += PL_bitcount[*(U8 *)s++]; 1114 len -= 8; 1115 } 1116 if (len && s < strend) { 1117 U8 bits; 1118 bits = SHIFT_BYTE(utf8, s, strend, datumtype); 1119 if (datumtype == 'b') 1120 while (len-- > 0) { 1121 if (bits & 1) cuv++; 1122 bits >>= 1; 1123 } 1124 else 1125 while (len-- > 0) { 1126 if (bits & 0x80) cuv++; 1127 bits <<= 1; 1128 } 1129 } 1130 break; 1131 } 1132 1133 sv = sv_2mortal(newSV(len ? len : 1)); 1134 SvPOK_on(sv); 1135 str = SvPVX(sv); 1136 if (datumtype == 'b') { 1137 U8 bits = 0; 1138 const I32 ai32 = len; 1139 for (len = 0; len < ai32; len++) { 1140 if (len & 7) bits >>= 1; 1141 else if (utf8) { 1142 if (s >= strend) break; 1143 bits = utf8_to_byte(aTHX_ &s, strend, datumtype); 1144 } else bits = *(U8 *) s++; 1145 *str++ = bits & 1 ? '1' : '0'; 1146 } 1147 } else { 1148 U8 bits = 0; 1149 const I32 ai32 = len; 1150 for (len = 0; len < ai32; len++) { 1151 if (len & 7) bits <<= 1; 1152 else if (utf8) { 1153 if (s >= strend) break; 1154 bits = utf8_to_byte(aTHX_ &s, strend, datumtype); 1155 } else bits = *(U8 *) s++; 1156 *str++ = bits & 0x80 ? '1' : '0'; 1157 } 1158 } 1159 *str = '\0'; 1160 SvCUR_set(sv, str - SvPVX_const(sv)); 1161 XPUSHs(sv); 1162 break; 1163 } 1164 case 'H': 1165 case 'h': { 1166 char *str = NULL; 1167 /* Preliminary length estimate, acceptable for utf8 too */ 1168 if (howlen == e_star || len > (strend - s) * 2) 1169 len = (strend - s) * 2; 1170 if (!checksum) { 1171 sv = sv_2mortal(newSV(len ? len : 1)); 1172 SvPOK_on(sv); 1173 str = SvPVX(sv); 1174 } 1175 if (datumtype == 'h') { 1176 U8 bits = 0; 1177 I32 ai32 = len; 1178 for (len = 0; len < ai32; len++) { 1179 if (len & 1) bits >>= 4; 1180 else if (utf8) { 1181 if (s >= strend) break; 1182 bits = utf8_to_byte(aTHX_ &s, strend, datumtype); 1183 } else bits = * (U8 *) s++; 1184 if (!checksum) 1185 *str++ = PL_hexdigit[bits & 15]; 1186 } 1187 } else { 1188 U8 bits = 0; 1189 const I32 ai32 = len; 1190 for (len = 0; len < ai32; len++) { 1191 if (len & 1) bits <<= 4; 1192 else if (utf8) { 1193 if (s >= strend) break; 1194 bits = utf8_to_byte(aTHX_ &s, strend, datumtype); 1195 } else bits = *(U8 *) s++; 1196 if (!checksum) 1197 *str++ = PL_hexdigit[(bits >> 4) & 15]; 1198 } 1199 } 1200 if (!checksum) { 1201 *str = '\0'; 1202 SvCUR_set(sv, str - SvPVX_const(sv)); 1203 XPUSHs(sv); 1204 } 1205 break; 1206 } 1207 case 'C': 1208 if (len == 0) { 1209 if (explicit_length) 1210 /* Switch to "character" mode */ 1211 utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0; 1212 break; 1213 } 1214 /* FALLTHROUGH */ 1215 case 'c': 1216 while (len-- > 0 && s < strend) { 1217 int aint; 1218 if (utf8) 1219 { 1220 STRLEN retlen; 1221 aint = utf8n_to_uvchr((U8 *) s, strend-s, &retlen, 1222 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); 1223 if (retlen == (STRLEN) -1 || retlen == 0) 1224 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); 1225 s += retlen; 1226 } 1227 else 1228 aint = *(U8 *)(s)++; 1229 if (aint >= 128 && datumtype != 'C') /* fake up signed chars */ 1230 aint -= 256; 1231 if (!checksum) 1232 mPUSHi(aint); 1233 else if (checksum > bits_in_uv) 1234 cdouble += (NV)aint; 1235 else 1236 cuv += aint; 1237 } 1238 break; 1239 case 'W': 1240 W_checksum: 1241 if (utf8) { 1242 while (len-- > 0 && s < strend) { 1243 STRLEN retlen; 1244 const UV val = utf8n_to_uvchr((U8 *) s, strend-s, &retlen, 1245 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); 1246 if (retlen == (STRLEN) -1 || retlen == 0) 1247 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); 1248 s += retlen; 1249 if (!checksum) 1250 mPUSHu(val); 1251 else if (checksum > bits_in_uv) 1252 cdouble += (NV) val; 1253 else 1254 cuv += val; 1255 } 1256 } else if (!checksum) 1257 while (len-- > 0) { 1258 const U8 ch = *(U8 *) s++; 1259 mPUSHu(ch); 1260 } 1261 else if (checksum > bits_in_uv) 1262 while (len-- > 0) cdouble += (NV) *(U8 *) s++; 1263 else 1264 while (len-- > 0) cuv += *(U8 *) s++; 1265 break; 1266 case 'U': 1267 if (len == 0) { 1268 if (explicit_length && howlen != e_star) { 1269 /* Switch to "bytes in UTF-8" mode */ 1270 if (symptr->flags & FLAG_DO_UTF8) utf8 = 0; 1271 else 1272 /* Should be impossible due to the need_utf8() test */ 1273 Perl_croak(aTHX_ "U0 mode on a byte string"); 1274 } 1275 break; 1276 } 1277 if (len > strend - s) len = strend - s; 1278 if (!checksum) { 1279 if (len && unpack_only_one) len = 1; 1280 EXTEND(SP, len); 1281 EXTEND_MORTAL(len); 1282 } 1283 while (len-- > 0 && s < strend) { 1284 STRLEN retlen; 1285 UV auv; 1286 if (utf8) { 1287 U8 result[UTF8_MAXLEN]; 1288 const char *ptr = s; 1289 STRLEN len; 1290 /* Bug: warns about bad utf8 even if we are short on bytes 1291 and will break out of the loop */ 1292 if (!S_utf8_to_bytes(aTHX_ &ptr, strend, (char *) result, 1, 1293 'U')) 1294 break; 1295 len = UTF8SKIP(result); 1296 if (!S_utf8_to_bytes(aTHX_ &ptr, strend, 1297 (char *) &result[1], len-1, 'U')) break; 1298 auv = NATIVE_TO_UNI(utf8n_to_uvchr(result, 1299 len, 1300 &retlen, 1301 UTF8_ALLOW_DEFAULT)); 1302 s = ptr; 1303 } else { 1304 auv = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, 1305 strend - s, 1306 &retlen, 1307 UTF8_ALLOW_DEFAULT)); 1308 if (retlen == (STRLEN) -1 || retlen == 0) 1309 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); 1310 s += retlen; 1311 } 1312 if (!checksum) 1313 mPUSHu(auv); 1314 else if (checksum > bits_in_uv) 1315 cdouble += (NV) auv; 1316 else 1317 cuv += auv; 1318 } 1319 break; 1320 case 's' | TYPE_IS_SHRIEKING: 1321 #if SHORTSIZE != SIZE16 1322 while (len-- > 0) { 1323 short ashort; 1324 SHIFT_VAR(utf8, s, strend, ashort, datumtype, needs_swap); 1325 if (!checksum) 1326 mPUSHi(ashort); 1327 else if (checksum > bits_in_uv) 1328 cdouble += (NV)ashort; 1329 else 1330 cuv += ashort; 1331 } 1332 break; 1333 #else 1334 /* FALLTHROUGH */ 1335 #endif 1336 case 's': 1337 while (len-- > 0) { 1338 I16 ai16; 1339 1340 #if U16SIZE > SIZE16 1341 ai16 = 0; 1342 #endif 1343 SHIFT16(utf8, s, strend, &ai16, datumtype, needs_swap); 1344 #if U16SIZE > SIZE16 1345 if (ai16 > 32767) 1346 ai16 -= 65536; 1347 #endif 1348 if (!checksum) 1349 mPUSHi(ai16); 1350 else if (checksum > bits_in_uv) 1351 cdouble += (NV)ai16; 1352 else 1353 cuv += ai16; 1354 } 1355 break; 1356 case 'S' | TYPE_IS_SHRIEKING: 1357 #if SHORTSIZE != SIZE16 1358 while (len-- > 0) { 1359 unsigned short aushort; 1360 SHIFT_VAR(utf8, s, strend, aushort, datumtype, needs_swap, 1361 needs_swap); 1362 if (!checksum) 1363 mPUSHu(aushort); 1364 else if (checksum > bits_in_uv) 1365 cdouble += (NV)aushort; 1366 else 1367 cuv += aushort; 1368 } 1369 break; 1370 #else 1371 /* FALLTHROUGH */ 1372 #endif 1373 case 'v': 1374 case 'n': 1375 case 'S': 1376 while (len-- > 0) { 1377 U16 au16; 1378 #if U16SIZE > SIZE16 1379 au16 = 0; 1380 #endif 1381 SHIFT16(utf8, s, strend, &au16, datumtype, needs_swap); 1382 if (datumtype == 'n') 1383 au16 = PerlSock_ntohs(au16); 1384 if (datumtype == 'v') 1385 au16 = vtohs(au16); 1386 if (!checksum) 1387 mPUSHu(au16); 1388 else if (checksum > bits_in_uv) 1389 cdouble += (NV) au16; 1390 else 1391 cuv += au16; 1392 } 1393 break; 1394 case 'v' | TYPE_IS_SHRIEKING: 1395 case 'n' | TYPE_IS_SHRIEKING: 1396 while (len-- > 0) { 1397 I16 ai16; 1398 # if U16SIZE > SIZE16 1399 ai16 = 0; 1400 # endif 1401 SHIFT16(utf8, s, strend, &ai16, datumtype, needs_swap); 1402 /* There should never be any byte-swapping here. */ 1403 assert(!TYPE_ENDIANNESS(datumtype)); 1404 if (datumtype == ('n' | TYPE_IS_SHRIEKING)) 1405 ai16 = (I16) PerlSock_ntohs((U16) ai16); 1406 if (datumtype == ('v' | TYPE_IS_SHRIEKING)) 1407 ai16 = (I16) vtohs((U16) ai16); 1408 if (!checksum) 1409 mPUSHi(ai16); 1410 else if (checksum > bits_in_uv) 1411 cdouble += (NV) ai16; 1412 else 1413 cuv += ai16; 1414 } 1415 break; 1416 case 'i': 1417 case 'i' | TYPE_IS_SHRIEKING: 1418 while (len-- > 0) { 1419 int aint; 1420 SHIFT_VAR(utf8, s, strend, aint, datumtype, needs_swap); 1421 if (!checksum) 1422 mPUSHi(aint); 1423 else if (checksum > bits_in_uv) 1424 cdouble += (NV)aint; 1425 else 1426 cuv += aint; 1427 } 1428 break; 1429 case 'I': 1430 case 'I' | TYPE_IS_SHRIEKING: 1431 while (len-- > 0) { 1432 unsigned int auint; 1433 SHIFT_VAR(utf8, s, strend, auint, datumtype, needs_swap); 1434 if (!checksum) 1435 mPUSHu(auint); 1436 else if (checksum > bits_in_uv) 1437 cdouble += (NV)auint; 1438 else 1439 cuv += auint; 1440 } 1441 break; 1442 case 'j': 1443 while (len-- > 0) { 1444 IV aiv; 1445 SHIFT_VAR(utf8, s, strend, aiv, datumtype, needs_swap); 1446 if (!checksum) 1447 mPUSHi(aiv); 1448 else if (checksum > bits_in_uv) 1449 cdouble += (NV)aiv; 1450 else 1451 cuv += aiv; 1452 } 1453 break; 1454 case 'J': 1455 while (len-- > 0) { 1456 UV auv; 1457 SHIFT_VAR(utf8, s, strend, auv, datumtype, needs_swap); 1458 if (!checksum) 1459 mPUSHu(auv); 1460 else if (checksum > bits_in_uv) 1461 cdouble += (NV)auv; 1462 else 1463 cuv += auv; 1464 } 1465 break; 1466 case 'l' | TYPE_IS_SHRIEKING: 1467 #if LONGSIZE != SIZE32 1468 while (len-- > 0) { 1469 long along; 1470 SHIFT_VAR(utf8, s, strend, along, datumtype, needs_swap); 1471 if (!checksum) 1472 mPUSHi(along); 1473 else if (checksum > bits_in_uv) 1474 cdouble += (NV)along; 1475 else 1476 cuv += along; 1477 } 1478 break; 1479 #else 1480 /* FALLTHROUGH */ 1481 #endif 1482 case 'l': 1483 while (len-- > 0) { 1484 I32 ai32; 1485 #if U32SIZE > SIZE32 1486 ai32 = 0; 1487 #endif 1488 SHIFT32(utf8, s, strend, &ai32, datumtype, needs_swap); 1489 #if U32SIZE > SIZE32 1490 if (ai32 > 2147483647) ai32 -= 4294967296; 1491 #endif 1492 if (!checksum) 1493 mPUSHi(ai32); 1494 else if (checksum > bits_in_uv) 1495 cdouble += (NV)ai32; 1496 else 1497 cuv += ai32; 1498 } 1499 break; 1500 case 'L' | TYPE_IS_SHRIEKING: 1501 #if LONGSIZE != SIZE32 1502 while (len-- > 0) { 1503 unsigned long aulong; 1504 SHIFT_VAR(utf8, s, strend, aulong, datumtype, needs_swap); 1505 if (!checksum) 1506 mPUSHu(aulong); 1507 else if (checksum > bits_in_uv) 1508 cdouble += (NV)aulong; 1509 else 1510 cuv += aulong; 1511 } 1512 break; 1513 #else 1514 /* FALLTHROUGH */ 1515 #endif 1516 case 'V': 1517 case 'N': 1518 case 'L': 1519 while (len-- > 0) { 1520 U32 au32; 1521 #if U32SIZE > SIZE32 1522 au32 = 0; 1523 #endif 1524 SHIFT32(utf8, s, strend, &au32, datumtype, needs_swap); 1525 if (datumtype == 'N') 1526 au32 = PerlSock_ntohl(au32); 1527 if (datumtype == 'V') 1528 au32 = vtohl(au32); 1529 if (!checksum) 1530 mPUSHu(au32); 1531 else if (checksum > bits_in_uv) 1532 cdouble += (NV)au32; 1533 else 1534 cuv += au32; 1535 } 1536 break; 1537 case 'V' | TYPE_IS_SHRIEKING: 1538 case 'N' | TYPE_IS_SHRIEKING: 1539 while (len-- > 0) { 1540 I32 ai32; 1541 #if U32SIZE > SIZE32 1542 ai32 = 0; 1543 #endif 1544 SHIFT32(utf8, s, strend, &ai32, datumtype, needs_swap); 1545 /* There should never be any byte swapping here. */ 1546 assert(!TYPE_ENDIANNESS(datumtype)); 1547 if (datumtype == ('N' | TYPE_IS_SHRIEKING)) 1548 ai32 = (I32)PerlSock_ntohl((U32)ai32); 1549 if (datumtype == ('V' | TYPE_IS_SHRIEKING)) 1550 ai32 = (I32)vtohl((U32)ai32); 1551 if (!checksum) 1552 mPUSHi(ai32); 1553 else if (checksum > bits_in_uv) 1554 cdouble += (NV)ai32; 1555 else 1556 cuv += ai32; 1557 } 1558 break; 1559 case 'p': 1560 while (len-- > 0) { 1561 const char *aptr; 1562 SHIFT_VAR(utf8, s, strend, aptr, datumtype, needs_swap); 1563 /* newSVpv generates undef if aptr is NULL */ 1564 mPUSHs(newSVpv(aptr, 0)); 1565 } 1566 break; 1567 case 'w': 1568 { 1569 UV auv = 0; 1570 U32 bytes = 0; 1571 1572 while (len > 0 && s < strend) { 1573 U8 ch; 1574 ch = SHIFT_BYTE(utf8, s, strend, datumtype); 1575 auv = (auv << 7) | (ch & 0x7f); 1576 /* UTF8_IS_XXXXX not right here because this is a BER, not 1577 * UTF-8 format - using constant 0x80 */ 1578 if (ch < 0x80) { 1579 bytes = 0; 1580 mPUSHu(auv); 1581 len--; 1582 auv = 0; 1583 continue; 1584 } 1585 if (++bytes >= sizeof(UV)) { /* promote to string */ 1586 const char *t; 1587 1588 sv = Perl_newSVpvf(aTHX_ "%.*"UVuf, (int)TYPE_DIGITS(UV), auv); 1589 while (s < strend) { 1590 ch = SHIFT_BYTE(utf8, s, strend, datumtype); 1591 sv = mul128(sv, (U8)(ch & 0x7f)); 1592 if (!(ch & 0x80)) { 1593 bytes = 0; 1594 break; 1595 } 1596 } 1597 t = SvPV_nolen_const(sv); 1598 while (*t == '0') 1599 t++; 1600 sv_chop(sv, t); 1601 mPUSHs(sv); 1602 len--; 1603 auv = 0; 1604 } 1605 } 1606 if ((s >= strend) && bytes) 1607 Perl_croak(aTHX_ "Unterminated compressed integer in unpack"); 1608 } 1609 break; 1610 case 'P': 1611 if (symptr->howlen == e_star) 1612 Perl_croak(aTHX_ "'P' must have an explicit size in unpack"); 1613 EXTEND(SP, 1); 1614 if (s + sizeof(char*) <= strend) { 1615 char *aptr; 1616 SHIFT_VAR(utf8, s, strend, aptr, datumtype, needs_swap); 1617 /* newSVpvn generates undef if aptr is NULL */ 1618 PUSHs(newSVpvn_flags(aptr, len, SVs_TEMP)); 1619 } 1620 break; 1621 #if defined(HAS_QUAD) && IVSIZE >= 8 1622 case 'q': 1623 while (len-- > 0) { 1624 Quad_t aquad; 1625 SHIFT_VAR(utf8, s, strend, aquad, datumtype, needs_swap); 1626 if (!checksum) 1627 mPUSHs(newSViv((IV)aquad)); 1628 else if (checksum > bits_in_uv) 1629 cdouble += (NV)aquad; 1630 else 1631 cuv += aquad; 1632 } 1633 break; 1634 case 'Q': 1635 while (len-- > 0) { 1636 Uquad_t auquad; 1637 SHIFT_VAR(utf8, s, strend, auquad, datumtype, needs_swap); 1638 if (!checksum) 1639 mPUSHs(newSVuv((UV)auquad)); 1640 else if (checksum > bits_in_uv) 1641 cdouble += (NV)auquad; 1642 else 1643 cuv += auquad; 1644 } 1645 break; 1646 #endif 1647 /* float and double added gnb@melba.bby.oz.au 22/11/89 */ 1648 case 'f': 1649 while (len-- > 0) { 1650 float afloat; 1651 SHIFT_VAR(utf8, s, strend, afloat, datumtype, needs_swap); 1652 if (!checksum) 1653 mPUSHn(afloat); 1654 else 1655 cdouble += afloat; 1656 } 1657 break; 1658 case 'd': 1659 while (len-- > 0) { 1660 double adouble; 1661 SHIFT_VAR(utf8, s, strend, adouble, datumtype, needs_swap); 1662 if (!checksum) 1663 mPUSHn(adouble); 1664 else 1665 cdouble += adouble; 1666 } 1667 break; 1668 case 'F': 1669 while (len-- > 0) { 1670 NV_bytes anv; 1671 SHIFT_BYTES(utf8, s, strend, anv.bytes, sizeof(anv.bytes), 1672 datumtype, needs_swap); 1673 if (!checksum) 1674 mPUSHn(anv.nv); 1675 else 1676 cdouble += anv.nv; 1677 } 1678 break; 1679 #if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE) 1680 case 'D': 1681 while (len-- > 0) { 1682 ld_bytes aldouble; 1683 SHIFT_BYTES(utf8, s, strend, aldouble.bytes, 1684 sizeof(aldouble.bytes), datumtype, needs_swap); 1685 /* The most common long double format, the x86 80-bit 1686 * extended precision, has either 2 or 6 unused bytes, 1687 * which may contain garbage, which may contain 1688 * unintentional data. While we do zero the bytes of 1689 * the long double data in pack(), here in unpack() we 1690 * don't, because it's really hard to envision that 1691 * reading the long double off aldouble would be 1692 * affected by the unused bytes. 1693 * 1694 * Note that trying to unpack 'long doubles' of 'long 1695 * doubles' packed in another system is in the general 1696 * case doomed without having more detail. */ 1697 if (!checksum) 1698 mPUSHn(aldouble.ld); 1699 else 1700 cdouble += aldouble.ld; 1701 } 1702 break; 1703 #endif 1704 case 'u': 1705 if (!checksum) { 1706 const STRLEN l = (STRLEN) (strend - s) * 3 / 4; 1707 sv = sv_2mortal(newSV(l)); 1708 if (l) SvPOK_on(sv); 1709 } 1710 1711 /* Note that all legal uuencoded strings are ASCII printables, so 1712 * have the same representation under UTF-8 vs not. This means we 1713 * can ignore UTF8ness on legal input. For illegal we stop at the 1714 * first failure, and don't report where/what that is, so again we 1715 * can ignore UTF8ness */ 1716 1717 while (s < strend && *s != ' ' && ISUUCHAR(*s)) { 1718 I32 a, b, c, d; 1719 char hunk[3]; 1720 1721 len = PL_uudmap[*(U8*)s++] & 077; 1722 while (len > 0) { 1723 if (s < strend && ISUUCHAR(*s)) 1724 a = PL_uudmap[*(U8*)s++] & 077; 1725 else 1726 a = 0; 1727 if (s < strend && ISUUCHAR(*s)) 1728 b = PL_uudmap[*(U8*)s++] & 077; 1729 else 1730 b = 0; 1731 if (s < strend && ISUUCHAR(*s)) 1732 c = PL_uudmap[*(U8*)s++] & 077; 1733 else 1734 c = 0; 1735 if (s < strend && ISUUCHAR(*s)) 1736 d = PL_uudmap[*(U8*)s++] & 077; 1737 else 1738 d = 0; 1739 hunk[0] = (char)((a << 2) | (b >> 4)); 1740 hunk[1] = (char)((b << 4) | (c >> 2)); 1741 hunk[2] = (char)((c << 6) | d); 1742 if (!checksum) 1743 sv_catpvn(sv, hunk, (len > 3) ? 3 : len); 1744 len -= 3; 1745 } 1746 if (*s == '\n') 1747 s++; 1748 else /* possible checksum byte */ 1749 if (s + 1 < strend && s[1] == '\n') 1750 s += 2; 1751 } 1752 if (!checksum) 1753 XPUSHs(sv); 1754 break; 1755 } /* End of switch */ 1756 1757 if (checksum) { 1758 if (strchr("fFdD", TYPE_NO_MODIFIERS(datumtype)) || 1759 (checksum > bits_in_uv && 1760 strchr("cCsSiIlLnNUWvVqQjJ", TYPE_NO_MODIFIERS(datumtype))) ) { 1761 NV trouble, anv; 1762 1763 anv = (NV) (1 << (checksum & 15)); 1764 while (checksum >= 16) { 1765 checksum -= 16; 1766 anv *= 65536.0; 1767 } 1768 while (cdouble < 0.0) 1769 cdouble += anv; 1770 cdouble = Perl_modf(cdouble / anv, &trouble); 1771 #ifdef LONGDOUBLE_DOUBLEDOUBLE 1772 /* Workaround for powerpc doubledouble modfl bug: 1773 * close to 1.0L and -1.0L cdouble is 0, and trouble 1774 * is cdouble / anv. */ 1775 if (trouble != Perl_ceil(trouble)) { 1776 cdouble = trouble; 1777 if (cdouble > 1.0L) cdouble -= 1.0L; 1778 if (cdouble < -1.0L) cdouble += 1.0L; 1779 } 1780 #endif 1781 cdouble *= anv; 1782 sv = newSVnv(cdouble); 1783 } 1784 else { 1785 if (checksum < bits_in_uv) { 1786 UV mask = ((UV)1 << checksum) - 1; 1787 cuv &= mask; 1788 } 1789 sv = newSVuv(cuv); 1790 } 1791 mXPUSHs(sv); 1792 checksum = 0; 1793 } 1794 1795 if (symptr->flags & FLAG_SLASH){ 1796 if (SP - PL_stack_base - start_sp_offset <= 0) 1797 break; 1798 if( next_symbol(symptr) ){ 1799 if( symptr->howlen == e_number ) 1800 Perl_croak(aTHX_ "Count after length/code in unpack" ); 1801 if( beyond ){ 1802 /* ...end of char buffer then no decent length available */ 1803 Perl_croak(aTHX_ "length/code after end of string in unpack" ); 1804 } else { 1805 /* take top of stack (hope it's numeric) */ 1806 len = POPi; 1807 if( len < 0 ) 1808 Perl_croak(aTHX_ "Negative '/' count in unpack" ); 1809 } 1810 } else { 1811 Perl_croak(aTHX_ "Code missing after '/' in unpack" ); 1812 } 1813 datumtype = symptr->code; 1814 explicit_length = FALSE; 1815 goto redo_switch; 1816 } 1817 } 1818 1819 if (new_s) 1820 *new_s = s; 1821 PUTBACK; 1822 return SP - PL_stack_base - start_sp_offset; 1823 } 1824 1825 PP(pp_unpack) 1826 { 1827 dSP; 1828 dPOPPOPssrl; 1829 U8 gimme = GIMME_V; 1830 STRLEN llen; 1831 STRLEN rlen; 1832 const char *pat = SvPV_const(left, llen); 1833 const char *s = SvPV_const(right, rlen); 1834 const char *strend = s + rlen; 1835 const char *patend = pat + llen; 1836 I32 cnt; 1837 1838 PUTBACK; 1839 cnt = unpackstring(pat, patend, s, strend, 1840 ((gimme == G_SCALAR) ? FLAG_UNPACK_ONLY_ONE : 0) 1841 | (DO_UTF8(right) ? FLAG_DO_UTF8 : 0)); 1842 1843 SPAGAIN; 1844 if ( !cnt && gimme == G_SCALAR ) 1845 PUSHs(&PL_sv_undef); 1846 RETURN; 1847 } 1848 1849 STATIC U8 * 1850 doencodes(U8 *h, const U8 *s, I32 len) 1851 { 1852 *h++ = PL_uuemap[len]; 1853 while (len > 2) { 1854 *h++ = PL_uuemap[(077 & (s[0] >> 2))]; 1855 *h++ = PL_uuemap[(077 & (((s[0] << 4) & 060) | ((s[1] >> 4) & 017)))]; 1856 *h++ = PL_uuemap[(077 & (((s[1] << 2) & 074) | ((s[2] >> 6) & 03)))]; 1857 *h++ = PL_uuemap[(077 & (s[2] & 077))]; 1858 s += 3; 1859 len -= 3; 1860 } 1861 if (len > 0) { 1862 const U8 r = (len > 1 ? s[1] : '\0'); 1863 *h++ = PL_uuemap[(077 & (s[0] >> 2))]; 1864 *h++ = PL_uuemap[(077 & (((s[0] << 4) & 060) | ((r >> 4) & 017)))]; 1865 *h++ = PL_uuemap[(077 & ((r << 2) & 074))]; 1866 *h++ = PL_uuemap[0]; 1867 } 1868 *h++ = '\n'; 1869 return h; 1870 } 1871 1872 STATIC SV * 1873 S_is_an_int(pTHX_ const char *s, STRLEN l) 1874 { 1875 SV *result = newSVpvn(s, l); 1876 char *const result_c = SvPV_nolen(result); /* convenience */ 1877 char *out = result_c; 1878 bool skip = 1; 1879 bool ignore = 0; 1880 1881 PERL_ARGS_ASSERT_IS_AN_INT; 1882 1883 while (*s) { 1884 switch (*s) { 1885 case ' ': 1886 break; 1887 case '+': 1888 if (!skip) { 1889 SvREFCNT_dec(result); 1890 return (NULL); 1891 } 1892 break; 1893 case '0': 1894 case '1': 1895 case '2': 1896 case '3': 1897 case '4': 1898 case '5': 1899 case '6': 1900 case '7': 1901 case '8': 1902 case '9': 1903 skip = 0; 1904 if (!ignore) { 1905 *(out++) = *s; 1906 } 1907 break; 1908 case '.': 1909 ignore = 1; 1910 break; 1911 default: 1912 SvREFCNT_dec(result); 1913 return (NULL); 1914 } 1915 s++; 1916 } 1917 *(out++) = '\0'; 1918 SvCUR_set(result, out - result_c); 1919 return (result); 1920 } 1921 1922 /* pnum must be '\0' terminated */ 1923 STATIC int 1924 S_div128(pTHX_ SV *pnum, bool *done) 1925 { 1926 STRLEN len; 1927 char * const s = SvPV(pnum, len); 1928 char *t = s; 1929 int m = 0; 1930 1931 PERL_ARGS_ASSERT_DIV128; 1932 1933 *done = 1; 1934 while (*t) { 1935 const int i = m * 10 + (*t - '0'); 1936 const int r = (i >> 7); /* r < 10 */ 1937 m = i & 0x7F; 1938 if (r) { 1939 *done = 0; 1940 } 1941 *(t++) = '0' + r; 1942 } 1943 *(t++) = '\0'; 1944 SvCUR_set(pnum, (STRLEN) (t - s)); 1945 return (m); 1946 } 1947 1948 /* 1949 =for apidoc packlist 1950 1951 The engine implementing C<pack()> Perl function. 1952 1953 =cut 1954 */ 1955 1956 void 1957 Perl_packlist(pTHX_ SV *cat, const char *pat, const char *patend, SV **beglist, SV **endlist ) 1958 { 1959 tempsym_t sym; 1960 1961 PERL_ARGS_ASSERT_PACKLIST; 1962 1963 TEMPSYM_INIT(&sym, pat, patend, FLAG_PACK); 1964 1965 /* We're going to do changes through SvPVX(cat). Make sure it's valid. 1966 Also make sure any UTF8 flag is loaded */ 1967 SvPV_force_nolen(cat); 1968 if (DO_UTF8(cat)) 1969 sym.flags |= FLAG_PARSE_UTF8 | FLAG_DO_UTF8; 1970 1971 (void)pack_rec( cat, &sym, beglist, endlist ); 1972 } 1973 1974 /* like sv_utf8_upgrade, but also repoint the group start markers */ 1975 STATIC void 1976 marked_upgrade(pTHX_ SV *sv, tempsym_t *sym_ptr) { 1977 STRLEN len; 1978 tempsym_t *group; 1979 const char *from_ptr, *from_start, *from_end, **marks, **m; 1980 char *to_start, *to_ptr; 1981 1982 if (SvUTF8(sv)) return; 1983 1984 from_start = SvPVX_const(sv); 1985 from_end = from_start + SvCUR(sv); 1986 for (from_ptr = from_start; from_ptr < from_end; from_ptr++) 1987 if (!NATIVE_BYTE_IS_INVARIANT(*from_ptr)) break; 1988 if (from_ptr == from_end) { 1989 /* Simple case: no character needs to be changed */ 1990 SvUTF8_on(sv); 1991 return; 1992 } 1993 1994 len = (from_end-from_ptr)*UTF8_EXPAND+(from_ptr-from_start)+1; 1995 Newx(to_start, len, char); 1996 Copy(from_start, to_start, from_ptr-from_start, char); 1997 to_ptr = to_start + (from_ptr-from_start); 1998 1999 Newx(marks, sym_ptr->level+2, const char *); 2000 for (group=sym_ptr; group; group = group->previous) 2001 marks[group->level] = from_start + group->strbeg; 2002 marks[sym_ptr->level+1] = from_end+1; 2003 for (m = marks; *m < from_ptr; m++) 2004 *m = to_start + (*m-from_start); 2005 2006 for (;from_ptr < from_end; from_ptr++) { 2007 while (*m == from_ptr) *m++ = to_ptr; 2008 to_ptr = (char *) uvchr_to_utf8((U8 *) to_ptr, *(U8 *) from_ptr); 2009 } 2010 *to_ptr = 0; 2011 2012 while (*m == from_ptr) *m++ = to_ptr; 2013 if (m != marks + sym_ptr->level+1) { 2014 Safefree(marks); 2015 Safefree(to_start); 2016 Perl_croak(aTHX_ "panic: marks beyond string end, m=%p, marks=%p, " 2017 "level=%d", m, marks, sym_ptr->level); 2018 } 2019 for (group=sym_ptr; group; group = group->previous) 2020 group->strbeg = marks[group->level] - to_start; 2021 Safefree(marks); 2022 2023 if (SvOOK(sv)) { 2024 if (SvIVX(sv)) { 2025 SvLEN_set(sv, SvLEN(sv) + SvIVX(sv)); 2026 from_start -= SvIVX(sv); 2027 SvIV_set(sv, 0); 2028 } 2029 SvFLAGS(sv) &= ~SVf_OOK; 2030 } 2031 if (SvLEN(sv) != 0) 2032 Safefree(from_start); 2033 SvPV_set(sv, to_start); 2034 SvCUR_set(sv, to_ptr - to_start); 2035 SvLEN_set(sv, len); 2036 SvUTF8_on(sv); 2037 } 2038 2039 /* Exponential string grower. Makes string extension effectively O(n) 2040 needed says how many extra bytes we need (not counting the final '\0') 2041 Only grows the string if there is an actual lack of space 2042 */ 2043 STATIC char * 2044 S_sv_exp_grow(pTHX_ SV *sv, STRLEN needed) { 2045 const STRLEN cur = SvCUR(sv); 2046 const STRLEN len = SvLEN(sv); 2047 STRLEN extend; 2048 2049 PERL_ARGS_ASSERT_SV_EXP_GROW; 2050 2051 if (len - cur > needed) return SvPVX(sv); 2052 extend = needed > len ? needed : len; 2053 return SvGROW(sv, len+extend+1); 2054 } 2055 2056 static SV * 2057 S_sv_check_infnan(pTHX_ SV *sv, I32 datumtype) 2058 { 2059 SvGETMAGIC(sv); 2060 if (UNLIKELY(SvAMAGIC(sv))) 2061 sv = sv_2num(sv); 2062 if (UNLIKELY(isinfnansv(sv))) { 2063 const I32 c = TYPE_NO_MODIFIERS(datumtype); 2064 const NV nv = SvNV_nomg(sv); 2065 if (c == 'w') 2066 Perl_croak(aTHX_ "Cannot compress %"NVgf" in pack", nv); 2067 else 2068 Perl_croak(aTHX_ "Cannot pack %"NVgf" with '%c'", nv, (int) c); 2069 } 2070 return sv; 2071 } 2072 2073 #define SvIV_no_inf(sv,d) \ 2074 ((sv) = S_sv_check_infnan(aTHX_ sv,d), SvIV_nomg(sv)) 2075 #define SvUV_no_inf(sv,d) \ 2076 ((sv) = S_sv_check_infnan(aTHX_ sv,d), SvUV_nomg(sv)) 2077 2078 STATIC 2079 SV ** 2080 S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist ) 2081 { 2082 tempsym_t lookahead; 2083 I32 items = endlist - beglist; 2084 bool found = next_symbol(symptr); 2085 bool utf8 = (symptr->flags & FLAG_PARSE_UTF8) ? 1 : 0; 2086 bool warn_utf8 = ckWARN(WARN_UTF8); 2087 char* from; 2088 2089 PERL_ARGS_ASSERT_PACK_REC; 2090 2091 if (symptr->level == 0 && found && symptr->code == 'U') { 2092 marked_upgrade(aTHX_ cat, symptr); 2093 symptr->flags |= FLAG_DO_UTF8; 2094 utf8 = 0; 2095 } 2096 symptr->strbeg = SvCUR(cat); 2097 2098 while (found) { 2099 SV *fromstr; 2100 STRLEN fromlen; 2101 I32 len; 2102 SV *lengthcode = NULL; 2103 I32 datumtype = symptr->code; 2104 howlen_t howlen = symptr->howlen; 2105 char *start = SvPVX(cat); 2106 char *cur = start + SvCUR(cat); 2107 bool needs_swap; 2108 2109 #define NEXTFROM (lengthcode ? lengthcode : items > 0 ? (--items, *beglist++) : &PL_sv_no) 2110 #define PEEKFROM (lengthcode ? lengthcode : items > 0 ? *beglist : &PL_sv_no) 2111 2112 switch (howlen) { 2113 case e_star: 2114 len = strchr("@Xxu", TYPE_NO_MODIFIERS(datumtype)) ? 2115 0 : items; 2116 break; 2117 default: 2118 /* e_no_len and e_number */ 2119 len = symptr->length; 2120 break; 2121 } 2122 2123 if (len) { 2124 packprops_t props = packprops[TYPE_NO_ENDIANNESS(datumtype)]; 2125 2126 if (props && !(props & PACK_SIZE_UNPREDICTABLE)) { 2127 /* We can process this letter. */ 2128 STRLEN size = props & PACK_SIZE_MASK; 2129 GROWING(utf8, cat, start, cur, (STRLEN) len * size); 2130 } 2131 } 2132 2133 /* Look ahead for next symbol. Do we have code/code? */ 2134 lookahead = *symptr; 2135 found = next_symbol(&lookahead); 2136 if (symptr->flags & FLAG_SLASH) { 2137 IV count; 2138 if (!found) Perl_croak(aTHX_ "Code missing after '/' in pack"); 2139 if (strchr("aAZ", lookahead.code)) { 2140 if (lookahead.howlen == e_number) count = lookahead.length; 2141 else { 2142 if (items > 0) { 2143 count = sv_len_utf8(*beglist); 2144 } 2145 else count = 0; 2146 if (lookahead.code == 'Z') count++; 2147 } 2148 } else { 2149 if (lookahead.howlen == e_number && lookahead.length < items) 2150 count = lookahead.length; 2151 else count = items; 2152 } 2153 lookahead.howlen = e_number; 2154 lookahead.length = count; 2155 lengthcode = sv_2mortal(newSViv(count)); 2156 } 2157 2158 needs_swap = NEEDS_SWAP(datumtype); 2159 2160 /* Code inside the switch must take care to properly update 2161 cat (CUR length and '\0' termination) if it updated *cur and 2162 doesn't simply leave using break */ 2163 switch (TYPE_NO_ENDIANNESS(datumtype)) { 2164 default: 2165 Perl_croak(aTHX_ "Invalid type '%c' in pack", 2166 (int) TYPE_NO_MODIFIERS(datumtype)); 2167 case '%': 2168 Perl_croak(aTHX_ "'%%' may not be used in pack"); 2169 2170 case '.' | TYPE_IS_SHRIEKING: 2171 case '.': 2172 if (howlen == e_star) from = start; 2173 else if (len == 0) from = cur; 2174 else { 2175 tempsym_t *group = symptr; 2176 2177 while (--len && group) group = group->previous; 2178 from = group ? start + group->strbeg : start; 2179 } 2180 fromstr = NEXTFROM; 2181 len = SvIV_no_inf(fromstr, datumtype); 2182 goto resize; 2183 case '@' | TYPE_IS_SHRIEKING: 2184 case '@': 2185 from = start + symptr->strbeg; 2186 resize: 2187 if (utf8 && !(datumtype & TYPE_IS_SHRIEKING)) 2188 if (len >= 0) { 2189 while (len && from < cur) { 2190 from += UTF8SKIP(from); 2191 len--; 2192 } 2193 if (from > cur) 2194 Perl_croak(aTHX_ "Malformed UTF-8 string in pack"); 2195 if (len) { 2196 /* Here we know from == cur */ 2197 grow: 2198 GROWING(0, cat, start, cur, len); 2199 Zero(cur, len, char); 2200 cur += len; 2201 } else if (from < cur) { 2202 len = cur - from; 2203 goto shrink; 2204 } else goto no_change; 2205 } else { 2206 cur = from; 2207 len = -len; 2208 goto utf8_shrink; 2209 } 2210 else { 2211 len -= cur - from; 2212 if (len > 0) goto grow; 2213 if (len == 0) goto no_change; 2214 len = -len; 2215 goto shrink; 2216 } 2217 break; 2218 2219 case '(': { 2220 tempsym_t savsym = *symptr; 2221 U32 group_modifiers = TYPE_MODIFIERS(datumtype & ~symptr->flags); 2222 symptr->flags |= group_modifiers; 2223 symptr->patend = savsym.grpend; 2224 symptr->level++; 2225 symptr->previous = &lookahead; 2226 while (len--) { 2227 U32 was_utf8; 2228 if (utf8) symptr->flags |= FLAG_PARSE_UTF8; 2229 else symptr->flags &= ~FLAG_PARSE_UTF8; 2230 was_utf8 = SvUTF8(cat); 2231 symptr->patptr = savsym.grpbeg; 2232 beglist = pack_rec(cat, symptr, beglist, endlist); 2233 if (SvUTF8(cat) != was_utf8) 2234 /* This had better be an upgrade while in utf8==0 mode */ 2235 utf8 = 1; 2236 2237 if (savsym.howlen == e_star && beglist == endlist) 2238 break; /* No way to continue */ 2239 } 2240 items = endlist - beglist; 2241 lookahead.flags = symptr->flags & ~group_modifiers; 2242 goto no_change; 2243 } 2244 case 'X' | TYPE_IS_SHRIEKING: 2245 if (!len) /* Avoid division by 0 */ 2246 len = 1; 2247 if (utf8) { 2248 char *hop, *last; 2249 I32 l = len; 2250 hop = last = start; 2251 while (hop < cur) { 2252 hop += UTF8SKIP(hop); 2253 if (--l == 0) { 2254 last = hop; 2255 l = len; 2256 } 2257 } 2258 if (last > cur) 2259 Perl_croak(aTHX_ "Malformed UTF-8 string in pack"); 2260 cur = last; 2261 break; 2262 } 2263 len = (cur-start) % len; 2264 /* FALLTHROUGH */ 2265 case 'X': 2266 if (utf8) { 2267 if (len < 1) goto no_change; 2268 utf8_shrink: 2269 while (len > 0) { 2270 if (cur <= start) 2271 Perl_croak(aTHX_ "'%c' outside of string in pack", 2272 (int) TYPE_NO_MODIFIERS(datumtype)); 2273 while (--cur, UTF8_IS_CONTINUATION(*cur)) { 2274 if (cur <= start) 2275 Perl_croak(aTHX_ "'%c' outside of string in pack", 2276 (int) TYPE_NO_MODIFIERS(datumtype)); 2277 } 2278 len--; 2279 } 2280 } else { 2281 shrink: 2282 if (cur - start < len) 2283 Perl_croak(aTHX_ "'%c' outside of string in pack", 2284 (int) TYPE_NO_MODIFIERS(datumtype)); 2285 cur -= len; 2286 } 2287 if (cur < start+symptr->strbeg) { 2288 /* Make sure group starts don't point into the void */ 2289 tempsym_t *group; 2290 const STRLEN length = cur-start; 2291 for (group = symptr; 2292 group && length < group->strbeg; 2293 group = group->previous) group->strbeg = length; 2294 lookahead.strbeg = length; 2295 } 2296 break; 2297 case 'x' | TYPE_IS_SHRIEKING: { 2298 I32 ai32; 2299 if (!len) /* Avoid division by 0 */ 2300 len = 1; 2301 if (utf8) ai32 = utf8_length((U8 *) start, (U8 *) cur) % len; 2302 else ai32 = (cur - start) % len; 2303 if (ai32 == 0) goto no_change; 2304 len -= ai32; 2305 } 2306 /* FALLTHROUGH */ 2307 case 'x': 2308 goto grow; 2309 case 'A': 2310 case 'Z': 2311 case 'a': { 2312 const char *aptr; 2313 2314 fromstr = NEXTFROM; 2315 aptr = SvPV_const(fromstr, fromlen); 2316 if (DO_UTF8(fromstr)) { 2317 const char *end, *s; 2318 2319 if (!utf8 && !SvUTF8(cat)) { 2320 marked_upgrade(aTHX_ cat, symptr); 2321 lookahead.flags |= FLAG_DO_UTF8; 2322 lookahead.strbeg = symptr->strbeg; 2323 utf8 = 1; 2324 start = SvPVX(cat); 2325 cur = start + SvCUR(cat); 2326 } 2327 if (howlen == e_star) { 2328 if (utf8) goto string_copy; 2329 len = fromlen+1; 2330 } 2331 s = aptr; 2332 end = aptr + fromlen; 2333 fromlen = datumtype == 'Z' ? len-1 : len; 2334 while ((I32) fromlen > 0 && s < end) { 2335 s += UTF8SKIP(s); 2336 fromlen--; 2337 } 2338 if (s > end) 2339 Perl_croak(aTHX_ "Malformed UTF-8 string in pack"); 2340 if (utf8) { 2341 len = fromlen; 2342 if (datumtype == 'Z') len++; 2343 fromlen = s-aptr; 2344 len += fromlen; 2345 2346 goto string_copy; 2347 } 2348 fromlen = len - fromlen; 2349 if (datumtype == 'Z') fromlen--; 2350 if (howlen == e_star) { 2351 len = fromlen; 2352 if (datumtype == 'Z') len++; 2353 } 2354 GROWING(0, cat, start, cur, len); 2355 if (!S_utf8_to_bytes(aTHX_ &aptr, end, cur, fromlen, 2356 datumtype | TYPE_IS_PACK)) 2357 Perl_croak(aTHX_ "panic: predicted utf8 length not available, " 2358 "for '%c', aptr=%p end=%p cur=%p, fromlen=%"UVuf, 2359 (int)datumtype, aptr, end, cur, (UV)fromlen); 2360 cur += fromlen; 2361 len -= fromlen; 2362 } else if (utf8) { 2363 if (howlen == e_star) { 2364 len = fromlen; 2365 if (datumtype == 'Z') len++; 2366 } 2367 if (len <= (I32) fromlen) { 2368 fromlen = len; 2369 if (datumtype == 'Z' && fromlen > 0) fromlen--; 2370 } 2371 /* assumes a byte expands to at most UTF8_EXPAND bytes on 2372 upgrade, so: 2373 expected_length <= from_len*UTF8_EXPAND + (len-from_len) */ 2374 GROWING(0, cat, start, cur, fromlen*(UTF8_EXPAND-1)+len); 2375 len -= fromlen; 2376 while (fromlen > 0) { 2377 cur = (char *) uvchr_to_utf8((U8 *) cur, * (U8 *) aptr); 2378 aptr++; 2379 fromlen--; 2380 } 2381 } else { 2382 string_copy: 2383 if (howlen == e_star) { 2384 len = fromlen; 2385 if (datumtype == 'Z') len++; 2386 } 2387 if (len <= (I32) fromlen) { 2388 fromlen = len; 2389 if (datumtype == 'Z' && fromlen > 0) fromlen--; 2390 } 2391 GROWING(0, cat, start, cur, len); 2392 Copy(aptr, cur, fromlen, char); 2393 cur += fromlen; 2394 len -= fromlen; 2395 } 2396 memset(cur, datumtype == 'A' ? ' ' : '\0', len); 2397 cur += len; 2398 SvTAINT(cat); 2399 break; 2400 } 2401 case 'B': 2402 case 'b': { 2403 const char *str, *end; 2404 I32 l, field_len; 2405 U8 bits; 2406 bool utf8_source; 2407 U32 utf8_flags; 2408 2409 fromstr = NEXTFROM; 2410 str = SvPV_const(fromstr, fromlen); 2411 end = str + fromlen; 2412 if (DO_UTF8(fromstr)) { 2413 utf8_source = TRUE; 2414 utf8_flags = warn_utf8 ? 0 : UTF8_ALLOW_ANY; 2415 } else { 2416 utf8_source = FALSE; 2417 utf8_flags = 0; /* Unused, but keep compilers happy */ 2418 } 2419 if (howlen == e_star) len = fromlen; 2420 field_len = (len+7)/8; 2421 GROWING(utf8, cat, start, cur, field_len); 2422 if (len > (I32)fromlen) len = fromlen; 2423 bits = 0; 2424 l = 0; 2425 if (datumtype == 'B') 2426 while (l++ < len) { 2427 if (utf8_source) { 2428 UV val = 0; 2429 NEXT_UNI_VAL(val, cur, str, end, utf8_flags); 2430 bits |= val & 1; 2431 } else bits |= *str++ & 1; 2432 if (l & 7) bits <<= 1; 2433 else { 2434 PUSH_BYTE(utf8, cur, bits); 2435 bits = 0; 2436 } 2437 } 2438 else 2439 /* datumtype == 'b' */ 2440 while (l++ < len) { 2441 if (utf8_source) { 2442 UV val = 0; 2443 NEXT_UNI_VAL(val, cur, str, end, utf8_flags); 2444 if (val & 1) bits |= 0x80; 2445 } else if (*str++ & 1) 2446 bits |= 0x80; 2447 if (l & 7) bits >>= 1; 2448 else { 2449 PUSH_BYTE(utf8, cur, bits); 2450 bits = 0; 2451 } 2452 } 2453 l--; 2454 if (l & 7) { 2455 if (datumtype == 'B') 2456 bits <<= 7 - (l & 7); 2457 else 2458 bits >>= 7 - (l & 7); 2459 PUSH_BYTE(utf8, cur, bits); 2460 l += 7; 2461 } 2462 /* Determine how many chars are left in the requested field */ 2463 l /= 8; 2464 if (howlen == e_star) field_len = 0; 2465 else field_len -= l; 2466 Zero(cur, field_len, char); 2467 cur += field_len; 2468 break; 2469 } 2470 case 'H': 2471 case 'h': { 2472 const char *str, *end; 2473 I32 l, field_len; 2474 U8 bits; 2475 bool utf8_source; 2476 U32 utf8_flags; 2477 2478 fromstr = NEXTFROM; 2479 str = SvPV_const(fromstr, fromlen); 2480 end = str + fromlen; 2481 if (DO_UTF8(fromstr)) { 2482 utf8_source = TRUE; 2483 utf8_flags = warn_utf8 ? 0 : UTF8_ALLOW_ANY; 2484 } else { 2485 utf8_source = FALSE; 2486 utf8_flags = 0; /* Unused, but keep compilers happy */ 2487 } 2488 if (howlen == e_star) len = fromlen; 2489 field_len = (len+1)/2; 2490 GROWING(utf8, cat, start, cur, field_len); 2491 if (!utf8_source && len > (I32)fromlen) len = fromlen; 2492 bits = 0; 2493 l = 0; 2494 if (datumtype == 'H') 2495 while (l++ < len) { 2496 if (utf8_source) { 2497 UV val = 0; 2498 NEXT_UNI_VAL(val, cur, str, end, utf8_flags); 2499 if (val < 256 && isALPHA(val)) 2500 bits |= (val + 9) & 0xf; 2501 else 2502 bits |= val & 0xf; 2503 } else if (isALPHA(*str)) 2504 bits |= (*str++ + 9) & 0xf; 2505 else 2506 bits |= *str++ & 0xf; 2507 if (l & 1) bits <<= 4; 2508 else { 2509 PUSH_BYTE(utf8, cur, bits); 2510 bits = 0; 2511 } 2512 } 2513 else 2514 while (l++ < len) { 2515 if (utf8_source) { 2516 UV val = 0; 2517 NEXT_UNI_VAL(val, cur, str, end, utf8_flags); 2518 if (val < 256 && isALPHA(val)) 2519 bits |= ((val + 9) & 0xf) << 4; 2520 else 2521 bits |= (val & 0xf) << 4; 2522 } else if (isALPHA(*str)) 2523 bits |= ((*str++ + 9) & 0xf) << 4; 2524 else 2525 bits |= (*str++ & 0xf) << 4; 2526 if (l & 1) bits >>= 4; 2527 else { 2528 PUSH_BYTE(utf8, cur, bits); 2529 bits = 0; 2530 } 2531 } 2532 l--; 2533 if (l & 1) { 2534 PUSH_BYTE(utf8, cur, bits); 2535 l++; 2536 } 2537 /* Determine how many chars are left in the requested field */ 2538 l /= 2; 2539 if (howlen == e_star) field_len = 0; 2540 else field_len -= l; 2541 Zero(cur, field_len, char); 2542 cur += field_len; 2543 break; 2544 } 2545 case 'c': 2546 while (len-- > 0) { 2547 IV aiv; 2548 fromstr = NEXTFROM; 2549 aiv = SvIV_no_inf(fromstr, datumtype); 2550 if ((-128 > aiv || aiv > 127)) 2551 Perl_ck_warner(aTHX_ packWARN(WARN_PACK), 2552 "Character in 'c' format wrapped in pack"); 2553 PUSH_BYTE(utf8, cur, (U8)(aiv & 0xff)); 2554 } 2555 break; 2556 case 'C': 2557 if (len == 0) { 2558 utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0; 2559 break; 2560 } 2561 while (len-- > 0) { 2562 IV aiv; 2563 fromstr = NEXTFROM; 2564 aiv = SvIV_no_inf(fromstr, datumtype); 2565 if ((0 > aiv || aiv > 0xff)) 2566 Perl_ck_warner(aTHX_ packWARN(WARN_PACK), 2567 "Character in 'C' format wrapped in pack"); 2568 PUSH_BYTE(utf8, cur, (U8)(aiv & 0xff)); 2569 } 2570 break; 2571 case 'W': { 2572 char *end; 2573 U8 in_bytes = (U8)IN_BYTES; 2574 2575 end = start+SvLEN(cat)-1; 2576 if (utf8) end -= UTF8_MAXLEN-1; 2577 while (len-- > 0) { 2578 UV auv; 2579 fromstr = NEXTFROM; 2580 auv = SvUV_no_inf(fromstr, datumtype); 2581 if (in_bytes) auv = auv % 0x100; 2582 if (utf8) { 2583 W_utf8: 2584 if (cur > end) { 2585 *cur = '\0'; 2586 SvCUR_set(cat, cur - start); 2587 2588 GROWING(0, cat, start, cur, len+UTF8_MAXLEN); 2589 end = start+SvLEN(cat)-UTF8_MAXLEN; 2590 } 2591 cur = (char *) uvchr_to_utf8_flags((U8 *) cur, 2592 auv, 2593 warn_utf8 ? 2594 0 : UNICODE_ALLOW_ANY); 2595 } else { 2596 if (auv >= 0x100) { 2597 if (!SvUTF8(cat)) { 2598 *cur = '\0'; 2599 SvCUR_set(cat, cur - start); 2600 marked_upgrade(aTHX_ cat, symptr); 2601 lookahead.flags |= FLAG_DO_UTF8; 2602 lookahead.strbeg = symptr->strbeg; 2603 utf8 = 1; 2604 start = SvPVX(cat); 2605 cur = start + SvCUR(cat); 2606 end = start+SvLEN(cat)-UTF8_MAXLEN; 2607 goto W_utf8; 2608 } 2609 Perl_ck_warner(aTHX_ packWARN(WARN_PACK), 2610 "Character in 'W' format wrapped in pack"); 2611 auv &= 0xff; 2612 } 2613 if (cur >= end) { 2614 *cur = '\0'; 2615 SvCUR_set(cat, cur - start); 2616 GROWING(0, cat, start, cur, len+1); 2617 end = start+SvLEN(cat)-1; 2618 } 2619 *(U8 *) cur++ = (U8)auv; 2620 } 2621 } 2622 break; 2623 } 2624 case 'U': { 2625 char *end; 2626 2627 if (len == 0) { 2628 if (!(symptr->flags & FLAG_DO_UTF8)) { 2629 marked_upgrade(aTHX_ cat, symptr); 2630 lookahead.flags |= FLAG_DO_UTF8; 2631 lookahead.strbeg = symptr->strbeg; 2632 } 2633 utf8 = 0; 2634 goto no_change; 2635 } 2636 2637 end = start+SvLEN(cat); 2638 if (!utf8) end -= UTF8_MAXLEN; 2639 while (len-- > 0) { 2640 UV auv; 2641 fromstr = NEXTFROM; 2642 auv = SvUV_no_inf(fromstr, datumtype); 2643 if (utf8) { 2644 U8 buffer[UTF8_MAXLEN], *endb; 2645 endb = uvchr_to_utf8_flags(buffer, UNI_TO_NATIVE(auv), 2646 warn_utf8 ? 2647 0 : UNICODE_ALLOW_ANY); 2648 if (cur+(endb-buffer)*UTF8_EXPAND >= end) { 2649 *cur = '\0'; 2650 SvCUR_set(cat, cur - start); 2651 GROWING(0, cat, start, cur, 2652 len+(endb-buffer)*UTF8_EXPAND); 2653 end = start+SvLEN(cat); 2654 } 2655 cur = my_bytes_to_utf8(buffer, endb-buffer, cur, 0); 2656 } else { 2657 if (cur >= end) { 2658 *cur = '\0'; 2659 SvCUR_set(cat, cur - start); 2660 GROWING(0, cat, start, cur, len+UTF8_MAXLEN); 2661 end = start+SvLEN(cat)-UTF8_MAXLEN; 2662 } 2663 cur = (char *) uvchr_to_utf8_flags((U8 *) cur, UNI_TO_NATIVE(auv), 2664 warn_utf8 ? 2665 0 : UNICODE_ALLOW_ANY); 2666 } 2667 } 2668 break; 2669 } 2670 /* Float and double added by gnb@melba.bby.oz.au 22/11/89 */ 2671 case 'f': 2672 while (len-- > 0) { 2673 float afloat; 2674 NV anv; 2675 fromstr = NEXTFROM; 2676 anv = SvNV(fromstr); 2677 # if defined(VMS) && !defined(_IEEE_FP) 2678 /* IEEE fp overflow shenanigans are unavailable on VAX and optional 2679 * on Alpha; fake it if we don't have them. 2680 */ 2681 if (anv > FLT_MAX) 2682 afloat = FLT_MAX; 2683 else if (anv < -FLT_MAX) 2684 afloat = -FLT_MAX; 2685 else afloat = (float)anv; 2686 # else 2687 #if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan) 2688 if(Perl_isnan(anv)) 2689 afloat = (float)NV_NAN; 2690 else 2691 #endif 2692 /* a simple cast to float is undefined if outside 2693 * the range of values that can be represented */ 2694 afloat = (float)(anv > FLT_MAX ? NV_INF : 2695 anv < -FLT_MAX ? -NV_INF : anv); 2696 # endif 2697 PUSH_VAR(utf8, cur, afloat, needs_swap); 2698 } 2699 break; 2700 case 'd': 2701 while (len-- > 0) { 2702 double adouble; 2703 NV anv; 2704 fromstr = NEXTFROM; 2705 anv = SvNV(fromstr); 2706 # if defined(VMS) && !defined(_IEEE_FP) 2707 /* IEEE fp overflow shenanigans are unavailable on VAX and optional 2708 * on Alpha; fake it if we don't have them. 2709 */ 2710 if (anv > DBL_MAX) 2711 adouble = DBL_MAX; 2712 else if (anv < -DBL_MAX) 2713 adouble = -DBL_MAX; 2714 else adouble = (double)anv; 2715 # else 2716 adouble = (double)anv; 2717 # endif 2718 PUSH_VAR(utf8, cur, adouble, needs_swap); 2719 } 2720 break; 2721 case 'F': { 2722 NV_bytes anv; 2723 Zero(&anv, 1, NV); /* can be long double with unused bits */ 2724 while (len-- > 0) { 2725 fromstr = NEXTFROM; 2726 #ifdef __GNUC__ 2727 /* to work round a gcc/x86 bug; don't use SvNV */ 2728 anv.nv = sv_2nv(fromstr); 2729 # if defined(LONGDOUBLE_X86_80_BIT) && defined(USE_LONG_DOUBLE) \ 2730 && LONG_DOUBLESIZE > 10 2731 /* GCC sometimes overwrites the padding in the 2732 assignment above */ 2733 Zero(anv.bytes+10, sizeof(anv.bytes) - 10, U8); 2734 # endif 2735 #else 2736 anv.nv = SvNV(fromstr); 2737 #endif 2738 PUSH_BYTES(utf8, cur, anv.bytes, sizeof(anv.bytes), needs_swap); 2739 } 2740 break; 2741 } 2742 #if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE) 2743 case 'D': { 2744 ld_bytes aldouble; 2745 /* long doubles can have unused bits, which may be nonzero */ 2746 Zero(&aldouble, 1, long double); 2747 while (len-- > 0) { 2748 fromstr = NEXTFROM; 2749 # ifdef __GNUC__ 2750 /* to work round a gcc/x86 bug; don't use SvNV */ 2751 aldouble.ld = (long double)sv_2nv(fromstr); 2752 # if defined(LONGDOUBLE_X86_80_BIT) && LONG_DOUBLESIZE > 10 2753 /* GCC sometimes overwrites the padding in the 2754 assignment above */ 2755 Zero(aldouble.bytes+10, sizeof(aldouble.bytes) - 10, U8); 2756 # endif 2757 # else 2758 aldouble.ld = (long double)SvNV(fromstr); 2759 # endif 2760 PUSH_BYTES(utf8, cur, aldouble.bytes, sizeof(aldouble.bytes), 2761 needs_swap); 2762 } 2763 break; 2764 } 2765 #endif 2766 case 'n' | TYPE_IS_SHRIEKING: 2767 case 'n': 2768 while (len-- > 0) { 2769 I16 ai16; 2770 fromstr = NEXTFROM; 2771 ai16 = (I16)SvIV_no_inf(fromstr, datumtype); 2772 ai16 = PerlSock_htons(ai16); 2773 PUSH16(utf8, cur, &ai16, FALSE); 2774 } 2775 break; 2776 case 'v' | TYPE_IS_SHRIEKING: 2777 case 'v': 2778 while (len-- > 0) { 2779 I16 ai16; 2780 fromstr = NEXTFROM; 2781 ai16 = (I16)SvIV_no_inf(fromstr, datumtype); 2782 ai16 = htovs(ai16); 2783 PUSH16(utf8, cur, &ai16, FALSE); 2784 } 2785 break; 2786 case 'S' | TYPE_IS_SHRIEKING: 2787 #if SHORTSIZE != SIZE16 2788 while (len-- > 0) { 2789 unsigned short aushort; 2790 fromstr = NEXTFROM; 2791 aushort = SvUV_no_inf(fromstr, datumtype); 2792 PUSH_VAR(utf8, cur, aushort, needs_swap); 2793 } 2794 break; 2795 #else 2796 /* FALLTHROUGH */ 2797 #endif 2798 case 'S': 2799 while (len-- > 0) { 2800 U16 au16; 2801 fromstr = NEXTFROM; 2802 au16 = (U16)SvUV_no_inf(fromstr, datumtype); 2803 PUSH16(utf8, cur, &au16, needs_swap); 2804 } 2805 break; 2806 case 's' | TYPE_IS_SHRIEKING: 2807 #if SHORTSIZE != SIZE16 2808 while (len-- > 0) { 2809 short ashort; 2810 fromstr = NEXTFROM; 2811 ashort = SvIV_no_inf(fromstr, datumtype); 2812 PUSH_VAR(utf8, cur, ashort, needs_swap); 2813 } 2814 break; 2815 #else 2816 /* FALLTHROUGH */ 2817 #endif 2818 case 's': 2819 while (len-- > 0) { 2820 I16 ai16; 2821 fromstr = NEXTFROM; 2822 ai16 = (I16)SvIV_no_inf(fromstr, datumtype); 2823 PUSH16(utf8, cur, &ai16, needs_swap); 2824 } 2825 break; 2826 case 'I': 2827 case 'I' | TYPE_IS_SHRIEKING: 2828 while (len-- > 0) { 2829 unsigned int auint; 2830 fromstr = NEXTFROM; 2831 auint = SvUV_no_inf(fromstr, datumtype); 2832 PUSH_VAR(utf8, cur, auint, needs_swap); 2833 } 2834 break; 2835 case 'j': 2836 while (len-- > 0) { 2837 IV aiv; 2838 fromstr = NEXTFROM; 2839 aiv = SvIV_no_inf(fromstr, datumtype); 2840 PUSH_VAR(utf8, cur, aiv, needs_swap); 2841 } 2842 break; 2843 case 'J': 2844 while (len-- > 0) { 2845 UV auv; 2846 fromstr = NEXTFROM; 2847 auv = SvUV_no_inf(fromstr, datumtype); 2848 PUSH_VAR(utf8, cur, auv, needs_swap); 2849 } 2850 break; 2851 case 'w': 2852 while (len-- > 0) { 2853 NV anv; 2854 fromstr = NEXTFROM; 2855 S_sv_check_infnan(aTHX_ fromstr, datumtype); 2856 anv = SvNV_nomg(fromstr); 2857 2858 if (anv < 0) { 2859 *cur = '\0'; 2860 SvCUR_set(cat, cur - start); 2861 Perl_croak(aTHX_ "Cannot compress negative numbers in pack"); 2862 } 2863 2864 /* 0xFFFFFFFFFFFFFFFF may cast to 18446744073709551616.0, 2865 which is == UV_MAX_P1. IOK is fine (instead of UV_only), as 2866 any negative IVs will have already been got by the croak() 2867 above. IOK is untrue for fractions, so we test them 2868 against UV_MAX_P1. */ 2869 if (SvIOK(fromstr) || anv < UV_MAX_P1) { 2870 char buf[(sizeof(UV)*CHAR_BIT)/7+1]; 2871 char *in = buf + sizeof(buf); 2872 UV auv = SvUV_nomg(fromstr); 2873 2874 do { 2875 *--in = (char)((auv & 0x7f) | 0x80); 2876 auv >>= 7; 2877 } while (auv); 2878 buf[sizeof(buf) - 1] &= 0x7f; /* clear continue bit */ 2879 PUSH_GROWING_BYTES(utf8, cat, start, cur, 2880 in, (buf + sizeof(buf)) - in); 2881 } else if (SvPOKp(fromstr)) 2882 goto w_string; 2883 else if (SvNOKp(fromstr)) { 2884 /* 10**NV_MAX_10_EXP is the largest power of 10 2885 so 10**(NV_MAX_10_EXP+1) is definitely unrepresentable 2886 given 10**(NV_MAX_10_EXP+1) == 128 ** x solve for x: 2887 x = (NV_MAX_10_EXP+1) * log (10) / log (128) 2888 And with that many bytes only Inf can overflow. 2889 Some C compilers are strict about integral constant 2890 expressions so we conservatively divide by a slightly 2891 smaller integer instead of multiplying by the exact 2892 floating-point value. 2893 */ 2894 #ifdef NV_MAX_10_EXP 2895 /* char buf[1 + (int)((NV_MAX_10_EXP + 1) * 0.47456)]; -- invalid C */ 2896 char buf[1 + (int)((NV_MAX_10_EXP + 1) / 2)]; /* valid C */ 2897 #else 2898 /* char buf[1 + (int)((308 + 1) * 0.47456)]; -- invalid C */ 2899 char buf[1 + (int)((308 + 1) / 2)]; /* valid C */ 2900 #endif 2901 char *in = buf + sizeof(buf); 2902 2903 anv = Perl_floor(anv); 2904 do { 2905 const NV next = Perl_floor(anv / 128); 2906 if (in <= buf) /* this cannot happen ;-) */ 2907 Perl_croak(aTHX_ "Cannot compress integer in pack"); 2908 *--in = (unsigned char)(anv - (next * 128)) | 0x80; 2909 anv = next; 2910 } while (anv > 0); 2911 buf[sizeof(buf) - 1] &= 0x7f; /* clear continue bit */ 2912 PUSH_GROWING_BYTES(utf8, cat, start, cur, 2913 in, (buf + sizeof(buf)) - in); 2914 } else { 2915 const char *from; 2916 char *result, *in; 2917 SV *norm; 2918 STRLEN len; 2919 bool done; 2920 2921 w_string: 2922 /* Copy string and check for compliance */ 2923 from = SvPV_nomg_const(fromstr, len); 2924 if ((norm = is_an_int(from, len)) == NULL) 2925 Perl_croak(aTHX_ "Can only compress unsigned integers in pack"); 2926 2927 Newx(result, len, char); 2928 in = result + len; 2929 done = FALSE; 2930 while (!done) *--in = div128(norm, &done) | 0x80; 2931 result[len - 1] &= 0x7F; /* clear continue bit */ 2932 PUSH_GROWING_BYTES(utf8, cat, start, cur, 2933 in, (result + len) - in); 2934 Safefree(result); 2935 SvREFCNT_dec(norm); /* free norm */ 2936 } 2937 } 2938 break; 2939 case 'i': 2940 case 'i' | TYPE_IS_SHRIEKING: 2941 while (len-- > 0) { 2942 int aint; 2943 fromstr = NEXTFROM; 2944 aint = SvIV_no_inf(fromstr, datumtype); 2945 PUSH_VAR(utf8, cur, aint, needs_swap); 2946 } 2947 break; 2948 case 'N' | TYPE_IS_SHRIEKING: 2949 case 'N': 2950 while (len-- > 0) { 2951 U32 au32; 2952 fromstr = NEXTFROM; 2953 au32 = SvUV_no_inf(fromstr, datumtype); 2954 au32 = PerlSock_htonl(au32); 2955 PUSH32(utf8, cur, &au32, FALSE); 2956 } 2957 break; 2958 case 'V' | TYPE_IS_SHRIEKING: 2959 case 'V': 2960 while (len-- > 0) { 2961 U32 au32; 2962 fromstr = NEXTFROM; 2963 au32 = SvUV_no_inf(fromstr, datumtype); 2964 au32 = htovl(au32); 2965 PUSH32(utf8, cur, &au32, FALSE); 2966 } 2967 break; 2968 case 'L' | TYPE_IS_SHRIEKING: 2969 #if LONGSIZE != SIZE32 2970 while (len-- > 0) { 2971 unsigned long aulong; 2972 fromstr = NEXTFROM; 2973 aulong = SvUV_no_inf(fromstr, datumtype); 2974 PUSH_VAR(utf8, cur, aulong, needs_swap); 2975 } 2976 break; 2977 #else 2978 /* Fall though! */ 2979 #endif 2980 case 'L': 2981 while (len-- > 0) { 2982 U32 au32; 2983 fromstr = NEXTFROM; 2984 au32 = SvUV_no_inf(fromstr, datumtype); 2985 PUSH32(utf8, cur, &au32, needs_swap); 2986 } 2987 break; 2988 case 'l' | TYPE_IS_SHRIEKING: 2989 #if LONGSIZE != SIZE32 2990 while (len-- > 0) { 2991 long along; 2992 fromstr = NEXTFROM; 2993 along = SvIV_no_inf(fromstr, datumtype); 2994 PUSH_VAR(utf8, cur, along, needs_swap); 2995 } 2996 break; 2997 #else 2998 /* Fall though! */ 2999 #endif 3000 case 'l': 3001 while (len-- > 0) { 3002 I32 ai32; 3003 fromstr = NEXTFROM; 3004 ai32 = SvIV_no_inf(fromstr, datumtype); 3005 PUSH32(utf8, cur, &ai32, needs_swap); 3006 } 3007 break; 3008 #if defined(HAS_QUAD) && IVSIZE >= 8 3009 case 'Q': 3010 while (len-- > 0) { 3011 Uquad_t auquad; 3012 fromstr = NEXTFROM; 3013 auquad = (Uquad_t) SvUV_no_inf(fromstr, datumtype); 3014 PUSH_VAR(utf8, cur, auquad, needs_swap); 3015 } 3016 break; 3017 case 'q': 3018 while (len-- > 0) { 3019 Quad_t aquad; 3020 fromstr = NEXTFROM; 3021 aquad = (Quad_t)SvIV_no_inf(fromstr, datumtype); 3022 PUSH_VAR(utf8, cur, aquad, needs_swap); 3023 } 3024 break; 3025 #endif 3026 case 'P': 3027 len = 1; /* assume SV is correct length */ 3028 GROWING(utf8, cat, start, cur, sizeof(char *)); 3029 /* FALLTHROUGH */ 3030 case 'p': 3031 while (len-- > 0) { 3032 const char *aptr; 3033 3034 fromstr = NEXTFROM; 3035 SvGETMAGIC(fromstr); 3036 if (!SvOK(fromstr)) aptr = NULL; 3037 else { 3038 /* XXX better yet, could spirit away the string to 3039 * a safe spot and hang on to it until the result 3040 * of pack() (and all copies of the result) are 3041 * gone. 3042 */ 3043 if ((SvTEMP(fromstr) || (SvPADTMP(fromstr) && 3044 !SvREADONLY(fromstr)))) { 3045 Perl_ck_warner(aTHX_ packWARN(WARN_PACK), 3046 "Attempt to pack pointer to temporary value"); 3047 } 3048 if (SvPOK(fromstr) || SvNIOK(fromstr)) 3049 aptr = SvPV_nomg_const_nolen(fromstr); 3050 else 3051 aptr = SvPV_force_flags_nolen(fromstr, 0); 3052 } 3053 PUSH_VAR(utf8, cur, aptr, needs_swap); 3054 } 3055 break; 3056 case 'u': { 3057 const char *aptr, *aend; 3058 bool from_utf8; 3059 3060 fromstr = NEXTFROM; 3061 if (len <= 2) len = 45; 3062 else len = len / 3 * 3; 3063 if (len >= 64) { 3064 Perl_ck_warner(aTHX_ packWARN(WARN_PACK), 3065 "Field too wide in 'u' format in pack"); 3066 len = 63; 3067 } 3068 aptr = SvPV_const(fromstr, fromlen); 3069 from_utf8 = DO_UTF8(fromstr); 3070 if (from_utf8) { 3071 aend = aptr + fromlen; 3072 fromlen = sv_len_utf8_nomg(fromstr); 3073 } else aend = NULL; /* Unused, but keep compilers happy */ 3074 GROWING(utf8, cat, start, cur, (fromlen+2) / 3 * 4 + (fromlen+len-1)/len * 2); 3075 while (fromlen > 0) { 3076 U8 *end; 3077 I32 todo; 3078 U8 hunk[1+63/3*4+1]; 3079 3080 if ((I32)fromlen > len) 3081 todo = len; 3082 else 3083 todo = fromlen; 3084 if (from_utf8) { 3085 char buffer[64]; 3086 if (!S_utf8_to_bytes(aTHX_ &aptr, aend, buffer, todo, 3087 'u' | TYPE_IS_PACK)) { 3088 *cur = '\0'; 3089 SvCUR_set(cat, cur - start); 3090 Perl_croak(aTHX_ "panic: string is shorter than advertised, " 3091 "aptr=%p, aend=%p, buffer=%p, todo=%ld", 3092 aptr, aend, buffer, (long) todo); 3093 } 3094 end = doencodes(hunk, (const U8 *)buffer, todo); 3095 } else { 3096 end = doencodes(hunk, (const U8 *)aptr, todo); 3097 aptr += todo; 3098 } 3099 PUSH_BYTES(utf8, cur, hunk, end-hunk, 0); 3100 fromlen -= todo; 3101 } 3102 break; 3103 } 3104 } 3105 *cur = '\0'; 3106 SvCUR_set(cat, cur - start); 3107 no_change: 3108 *symptr = lookahead; 3109 } 3110 return beglist; 3111 } 3112 #undef NEXTFROM 3113 3114 3115 PP(pp_pack) 3116 { 3117 dSP; dMARK; dORIGMARK; dTARGET; 3118 SV *cat = TARG; 3119 STRLEN fromlen; 3120 SV *pat_sv = *++MARK; 3121 const char *pat = SvPV_const(pat_sv, fromlen); 3122 const char *patend = pat + fromlen; 3123 3124 MARK++; 3125 sv_setpvs(cat, ""); 3126 SvUTF8_off(cat); 3127 3128 packlist(cat, pat, patend, MARK, SP + 1); 3129 3130 SvSETMAGIC(cat); 3131 SP = ORIGMARK; 3132 PUSHs(cat); 3133 RETURN; 3134 } 3135 3136 /* 3137 * ex: set ts=8 sts=4 sw=4 et: 3138 */ 3139