1 /* pp_pack.c 2 * 3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others 5 * 6 * You may distribute under the terms of either the GNU General Public 7 * License or the Artistic License, as specified in the README file. 8 * 9 */ 10 11 /* 12 * He still hopefully carried some of his gear in his pack: a small tinder-box, 13 * two small shallow pans, the smaller fitting into the larger; inside them a 14 * wooden spoon, a short two-pronged fork and some skewers were stowed; and 15 * hidden at the bottom of the pack in a flat wooden box a dwindling treasure, 16 * some salt. 17 * 18 * [p.653 of _The Lord of the Rings_, IV/iv: "Of Herbs and Stewed Rabbit"] 19 */ 20 21 /* This file contains pp ("push/pop") functions that 22 * execute the opcodes that make up a perl program. A typical pp function 23 * expects to find its arguments on the stack, and usually pushes its 24 * results onto the stack, hence the 'pp' terminology. Each OP structure 25 * contains a pointer to the relevant pp_foo() function. 26 * 27 * This particular file just contains pp_pack() and pp_unpack(). See the 28 * other pp*.c files for the rest of the pp_ functions. 29 */ 30 31 #include "EXTERN.h" 32 #define PERL_IN_PP_PACK_C 33 #include "perl.h" 34 35 /* Types used by pack/unpack */ 36 typedef enum { 37 e_no_len, /* no length */ 38 e_number, /* number, [] */ 39 e_star /* asterisk */ 40 } howlen_t; 41 42 typedef struct tempsym { 43 const char* patptr; /* current template char */ 44 const char* patend; /* one after last char */ 45 const char* grpbeg; /* 1st char of ()-group */ 46 const char* grpend; /* end of ()-group */ 47 I32 code; /* template code (!<>) */ 48 U32 flags; /* /=4, comma=2, pack=1 */ 49 /* and group modifiers */ 50 SSize_t length; /* length/repeat count */ 51 howlen_t howlen; /* how length is given */ 52 int level; /* () nesting level */ 53 STRLEN strbeg; /* offset of group start */ 54 struct tempsym *previous; /* previous group */ 55 } tempsym_t; 56 57 #define TEMPSYM_INIT(symptr, p, e, f) \ 58 STMT_START { \ 59 (symptr)->patptr = (p); \ 60 (symptr)->patend = (e); \ 61 (symptr)->grpbeg = NULL; \ 62 (symptr)->grpend = NULL; \ 63 (symptr)->grpend = NULL; \ 64 (symptr)->code = 0; \ 65 (symptr)->length = 0; \ 66 (symptr)->howlen = e_no_len; \ 67 (symptr)->level = 0; \ 68 (symptr)->flags = (f); \ 69 (symptr)->strbeg = 0; \ 70 (symptr)->previous = NULL; \ 71 } STMT_END 72 73 typedef union { 74 NV nv; 75 U8 bytes[sizeof(NV)]; 76 } NV_bytes; 77 78 #if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE) 79 typedef union { 80 long double ld; 81 U8 bytes[sizeof(long double)]; 82 } ld_bytes; 83 #endif 84 85 #ifndef CHAR_BIT 86 # define CHAR_BIT 8 87 #endif 88 /* Maximum number of bytes to which a byte can grow due to upgrade */ 89 #define UTF8_EXPAND 2 90 91 /* 92 * Offset for integer pack/unpack. 93 * 94 * On architectures where I16 and I32 aren't really 16 and 32 bits, 95 * which for now are all Crays, pack and unpack have to play games. 96 */ 97 98 /* 99 * These values are required for portability of pack() output. 100 * If they're not right on your machine, then pack() and unpack() 101 * wouldn't work right anyway; you'll need to apply the Cray hack. 102 * (I'd like to check them with #if, but you can't use sizeof() in 103 * the preprocessor.) --??? 104 */ 105 /* 106 The appropriate SHORTSIZE, INTSIZE, LONGSIZE, and LONGLONGSIZE 107 defines are now in config.h. --Andy Dougherty April 1998 108 */ 109 #define SIZE16 2 110 #define SIZE32 4 111 112 /* CROSSCOMPILE and MULTIARCH are going to affect pp_pack() and pp_unpack(). 113 --jhi Feb 1999 */ 114 115 #if U16SIZE <= SIZE16 && U32SIZE <= SIZE32 116 # define OFF16(p) ((char *) (p)) 117 # define OFF32(p) ((char *) (p)) 118 #elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 /* little-endian */ 119 # define OFF16(p) ((char*)(p)) 120 # define OFF32(p) ((char*)(p)) 121 #elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 /* big-endian */ 122 # define OFF16(p) ((char*)(p) + (sizeof(U16) - SIZE16)) 123 # define OFF32(p) ((char*)(p) + (sizeof(U32) - SIZE32)) 124 #else 125 # error "bad cray byte order" 126 #endif 127 128 #define PUSH16(utf8, cur, p, needs_swap) \ 129 PUSH_BYTES(utf8, cur, OFF16(p), SIZE16, needs_swap) 130 #define PUSH32(utf8, cur, p, needs_swap) \ 131 PUSH_BYTES(utf8, cur, OFF32(p), SIZE32, needs_swap) 132 133 #if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 /* big-endian */ 134 # define NEEDS_SWAP(d) (TYPE_ENDIANNESS(d) == TYPE_IS_LITTLE_ENDIAN) 135 #elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 /* little-endian */ 136 # define NEEDS_SWAP(d) (TYPE_ENDIANNESS(d) == TYPE_IS_BIG_ENDIAN) 137 #else 138 # error "Unsupported byteorder" 139 /* Need to add code here to re-instate mixed endian support. 140 NEEDS_SWAP would need to hold a flag indicating which action to 141 take, and S_reverse_copy and the code in S_utf8_to_bytes would need 142 logic adding to deal with any mixed-endian transformations needed. 143 */ 144 #endif 145 146 /* Only to be used inside a loop (see the break) */ 147 #define SHIFT_BYTES(utf8, s, strend, buf, len, datumtype, needs_swap) \ 148 STMT_START { \ 149 if (UNLIKELY(utf8)) { \ 150 if (!S_utf8_to_bytes(aTHX_ &s, strend, \ 151 (char *) (buf), len, datumtype)) break; \ 152 } else { \ 153 if (UNLIKELY(needs_swap)) \ 154 S_reverse_copy(s, (char *) (buf), len); \ 155 else \ 156 Copy(s, (char *) (buf), len, char); \ 157 s += len; \ 158 } \ 159 } STMT_END 160 161 #define SHIFT16(utf8, s, strend, p, datumtype, needs_swap) \ 162 SHIFT_BYTES(utf8, s, strend, OFF16(p), SIZE16, datumtype, needs_swap) 163 164 #define SHIFT32(utf8, s, strend, p, datumtype, needs_swap) \ 165 SHIFT_BYTES(utf8, s, strend, OFF32(p), SIZE32, datumtype, needs_swap) 166 167 #define SHIFT_VAR(utf8, s, strend, var, datumtype, needs_swap) \ 168 SHIFT_BYTES(utf8, s, strend, &(var), sizeof(var), datumtype, needs_swap) 169 170 #define PUSH_VAR(utf8, aptr, var, needs_swap) \ 171 PUSH_BYTES(utf8, aptr, &(var), sizeof(var), needs_swap) 172 173 /* Avoid stack overflow due to pathological templates. 100 should be plenty. */ 174 #define MAX_SUB_TEMPLATE_LEVEL 100 175 176 /* flags (note that type modifiers can also be used as flags!) */ 177 #define FLAG_WAS_UTF8 0x40 178 #define FLAG_PARSE_UTF8 0x20 /* Parse as utf8 */ 179 #define FLAG_UNPACK_ONLY_ONE 0x10 180 #define FLAG_DO_UTF8 0x08 /* The underlying string is utf8 */ 181 #define FLAG_SLASH 0x04 182 #define FLAG_COMMA 0x02 183 #define FLAG_PACK 0x01 184 185 STATIC SV * 186 S_mul128(pTHX_ SV *sv, U8 m) 187 { 188 STRLEN len; 189 char *s = SvPV(sv, len); 190 char *t; 191 192 PERL_ARGS_ASSERT_MUL128; 193 194 if (! memBEGINs(s, len, "0000")) { /* need to grow sv */ 195 SV * const tmpNew = newSVpvs("0000000000"); 196 197 sv_catsv(tmpNew, sv); 198 SvREFCNT_dec(sv); /* free old sv */ 199 sv = tmpNew; 200 s = SvPV(sv, len); 201 } 202 t = s + len - 1; 203 while (!*t) /* trailing '\0'? */ 204 t--; 205 while (t > s) { 206 const U32 i = ((*t - '0') << 7) + m; 207 *(t--) = '0' + (char)(i % 10); 208 m = (char)(i / 10); 209 } 210 return (sv); 211 } 212 213 /* Explosives and implosives. */ 214 215 #define ISUUCHAR(ch) (NATIVE_TO_LATIN1(ch) >= NATIVE_TO_LATIN1(' ') \ 216 && NATIVE_TO_LATIN1(ch) < NATIVE_TO_LATIN1('a')) 217 218 /* type modifiers */ 219 #define TYPE_IS_SHRIEKING 0x100 220 #define TYPE_IS_BIG_ENDIAN 0x200 221 #define TYPE_IS_LITTLE_ENDIAN 0x400 222 #define TYPE_IS_PACK 0x800 223 #define TYPE_ENDIANNESS_MASK (TYPE_IS_BIG_ENDIAN|TYPE_IS_LITTLE_ENDIAN) 224 #define TYPE_MODIFIERS(t) ((t) & ~0xFF) 225 #define TYPE_NO_MODIFIERS(t) ((t) & 0xFF) 226 227 # define TYPE_ENDIANNESS(t) ((t) & TYPE_ENDIANNESS_MASK) 228 # define TYPE_NO_ENDIANNESS(t) ((t) & ~TYPE_ENDIANNESS_MASK) 229 230 # define ENDIANNESS_ALLOWED_TYPES "sSiIlLqQjJfFdDpP(" 231 232 #define PACK_SIZE_CANNOT_CSUM 0x80 233 #define PACK_SIZE_UNPREDICTABLE 0x40 /* Not a fixed size element */ 234 #define PACK_SIZE_MASK 0x3F 235 236 #include "packsizetables.inc" 237 238 static void 239 S_reverse_copy(const char *src, char *dest, STRLEN len) 240 { 241 dest += len; 242 while (len--) 243 *--dest = *src++; 244 } 245 246 STATIC U8 247 utf8_to_byte(pTHX_ const char **s, const char *end, I32 datumtype) 248 { 249 STRLEN retlen; 250 UV val; 251 252 if (*s >= end) { 253 goto croak; 254 } 255 val = utf8n_to_uvchr((U8 *) *s, end-*s, &retlen, 256 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); 257 if (retlen == (STRLEN) -1) 258 croak: 259 Perl_croak(aTHX_ "Malformed UTF-8 string in '%c' format in unpack", 260 (int) TYPE_NO_MODIFIERS(datumtype)); 261 if (val >= 0x100) { 262 Perl_ck_warner(aTHX_ packWARN(WARN_UNPACK), 263 "Character in '%c' format wrapped in unpack", 264 (int) TYPE_NO_MODIFIERS(datumtype)); 265 val &= 0xff; 266 } 267 *s += retlen; 268 return (U8)val; 269 } 270 271 #define SHIFT_BYTE(utf8, s, strend, datumtype) ((utf8) ? \ 272 utf8_to_byte(aTHX_ &(s), (strend), (datumtype)) : \ 273 *(U8 *)(s)++) 274 275 STATIC bool 276 S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t buf_len, I32 datumtype) 277 { 278 UV val; 279 STRLEN retlen; 280 const char *from = *s; 281 int bad = 0; 282 const U32 flags = ckWARN(WARN_UTF8) ? 283 UTF8_CHECK_ONLY : (UTF8_CHECK_ONLY | UTF8_ALLOW_ANY); 284 const bool needs_swap = NEEDS_SWAP(datumtype); 285 286 if (UNLIKELY(needs_swap)) 287 buf += buf_len; 288 289 for (;buf_len > 0; buf_len--) { 290 if (from >= end) return FALSE; 291 val = utf8n_to_uvchr((U8 *) from, end-from, &retlen, flags); 292 if (retlen == (STRLEN) -1) { 293 from += UTF8_SAFE_SKIP(from, end); 294 bad |= 1; 295 } else from += retlen; 296 if (val >= 0x100) { 297 bad |= 2; 298 val &= 0xff; 299 } 300 if (UNLIKELY(needs_swap)) 301 *(U8 *)--buf = (U8)val; 302 else 303 *(U8 *)buf++ = (U8)val; 304 } 305 /* We have enough characters for the buffer. Did we have problems ? */ 306 if (bad) { 307 if (bad & 1) { 308 /* Rewalk the string fragment while warning */ 309 const char *ptr; 310 const U32 flags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY; 311 for (ptr = *s; ptr < from; ptr += UTF8SKIP(ptr)) { 312 if (ptr >= end) break; 313 utf8n_to_uvchr((U8 *) ptr, end-ptr, &retlen, flags); 314 } 315 if (from > end) from = end; 316 } 317 if ((bad & 2)) 318 Perl_ck_warner(aTHX_ packWARN(datumtype & TYPE_IS_PACK ? 319 WARN_PACK : WARN_UNPACK), 320 "Character(s) in '%c' format wrapped in %s", 321 (int) TYPE_NO_MODIFIERS(datumtype), 322 datumtype & TYPE_IS_PACK ? "pack" : "unpack"); 323 } 324 *s = from; 325 return TRUE; 326 } 327 328 STATIC char * 329 S_my_bytes_to_utf8(const U8 *start, STRLEN len, char *dest, const bool needs_swap) { 330 PERL_ARGS_ASSERT_MY_BYTES_TO_UTF8; 331 332 if (UNLIKELY(needs_swap)) { 333 const U8 *p = start + len; 334 while (p-- > start) { 335 append_utf8_from_native_byte(*p, (U8 **) & dest); 336 } 337 } else { 338 const U8 * const end = start + len; 339 while (start < end) { 340 append_utf8_from_native_byte(*start, (U8 **) & dest); 341 start++; 342 } 343 } 344 return dest; 345 } 346 347 #define PUSH_BYTES(utf8, cur, buf, len, needs_swap) \ 348 STMT_START { \ 349 if (UNLIKELY(utf8)) \ 350 (cur) = my_bytes_to_utf8((U8 *) buf, len, (cur), needs_swap); \ 351 else { \ 352 if (UNLIKELY(needs_swap)) \ 353 S_reverse_copy((char *)(buf), cur, len); \ 354 else \ 355 Copy(buf, cur, len, char); \ 356 (cur) += (len); \ 357 } \ 358 } STMT_END 359 360 #define SAFE_UTF8_EXPAND(var) \ 361 STMT_START { \ 362 if ((var) > SSize_t_MAX / UTF8_EXPAND) \ 363 Perl_croak(aTHX_ "%s", "Out of memory during pack()"); \ 364 (var) = (var) * UTF8_EXPAND; \ 365 } STMT_END 366 367 #define GROWING2(utf8, cat, start, cur, item_size, item_count) \ 368 STMT_START { \ 369 if (SSize_t_MAX / (item_size) < (item_count)) \ 370 Perl_croak(aTHX_ "%s", "Out of memory during pack()"); \ 371 GROWING((utf8), (cat), (start), (cur), (item_size) * (item_count)); \ 372 } STMT_END 373 374 #define GROWING(utf8, cat, start, cur, in_len) \ 375 STMT_START { \ 376 STRLEN glen = (in_len); \ 377 STRLEN catcur = (STRLEN)((cur) - (start)); \ 378 if (utf8) SAFE_UTF8_EXPAND(glen); \ 379 if (SSize_t_MAX - glen < catcur) \ 380 Perl_croak(aTHX_ "%s", "Out of memory during pack()"); \ 381 if (catcur + glen >= SvLEN(cat)) { \ 382 (start) = sv_exp_grow(cat, glen); \ 383 (cur) = (start) + SvCUR(cat); \ 384 } \ 385 } STMT_END 386 387 #define PUSH_GROWING_BYTES(utf8, cat, start, cur, buf, in_len) \ 388 STMT_START { \ 389 const STRLEN glen = (in_len); \ 390 STRLEN gl = glen; \ 391 if (utf8) SAFE_UTF8_EXPAND(gl); \ 392 if ((cur) + gl >= (start) + SvLEN(cat)) { \ 393 *cur = '\0'; \ 394 SvCUR_set((cat), (cur) - (start)); \ 395 (start) = sv_exp_grow(cat, gl); \ 396 (cur) = (start) + SvCUR(cat); \ 397 } \ 398 PUSH_BYTES(utf8, cur, buf, glen, 0); \ 399 } STMT_END 400 401 #define PUSH_BYTE(utf8, s, byte) \ 402 STMT_START { \ 403 if (utf8) { \ 404 const U8 au8 = (byte); \ 405 (s) = my_bytes_to_utf8(&au8, 1, (s), 0);\ 406 } else *(U8 *)(s)++ = (byte); \ 407 } STMT_END 408 409 /* Only to be used inside a loop (see the break) */ 410 #define NEXT_UNI_VAL(val, cur, str, end, utf8_flags) \ 411 STMT_START { \ 412 STRLEN retlen; \ 413 if (str >= end) break; \ 414 val = utf8n_to_uvchr((U8 *) str, end-str, &retlen, utf8_flags); \ 415 if (retlen == (STRLEN) -1) { \ 416 *cur = '\0'; \ 417 Perl_croak(aTHX_ "Malformed UTF-8 string in pack"); \ 418 } \ 419 str += retlen; \ 420 } STMT_END 421 422 static const char *_action( const tempsym_t* symptr ) 423 { 424 return (const char *)(( symptr->flags & FLAG_PACK ) ? "pack" : "unpack"); 425 } 426 427 /* Returns the sizeof() struct described by pat */ 428 STATIC SSize_t 429 S_measure_struct(pTHX_ tempsym_t* symptr) 430 { 431 SSize_t total = 0; 432 433 PERL_ARGS_ASSERT_MEASURE_STRUCT; 434 435 while (next_symbol(symptr)) { 436 SSize_t len, size; 437 438 switch (symptr->howlen) { 439 case e_star: 440 Perl_croak(aTHX_ "Within []-length '*' not allowed in %s", 441 _action( symptr ) ); 442 443 default: 444 /* e_no_len and e_number */ 445 len = symptr->length; 446 break; 447 } 448 449 size = packprops[TYPE_NO_ENDIANNESS(symptr->code)] & PACK_SIZE_MASK; 450 if (!size) { 451 SSize_t star; 452 /* endianness doesn't influence the size of a type */ 453 switch(TYPE_NO_ENDIANNESS(symptr->code)) { 454 default: 455 Perl_croak(aTHX_ "Invalid type '%c' in %s", 456 (int)TYPE_NO_MODIFIERS(symptr->code), 457 _action( symptr ) ); 458 case '.' | TYPE_IS_SHRIEKING: 459 case '@' | TYPE_IS_SHRIEKING: 460 case '@': 461 case '.': 462 case '/': 463 case 'U': /* XXXX Is it correct? */ 464 case 'w': 465 case 'u': 466 Perl_croak(aTHX_ "Within []-length '%c' not allowed in %s", 467 (int) TYPE_NO_MODIFIERS(symptr->code), 468 _action( symptr ) ); 469 case '%': 470 size = 0; 471 break; 472 case '(': 473 { 474 tempsym_t savsym = *symptr; 475 symptr->patptr = savsym.grpbeg; 476 symptr->patend = savsym.grpend; 477 /* XXXX Theoretically, we need to measure many times at 478 different positions, since the subexpression may contain 479 alignment commands, but be not of aligned length. 480 Need to detect this and croak(). */ 481 size = measure_struct(symptr); 482 *symptr = savsym; 483 break; 484 } 485 case 'X' | TYPE_IS_SHRIEKING: 486 /* XXXX Is this useful? Then need to treat MEASURE_BACKWARDS. 487 */ 488 if (!len) /* Avoid division by 0 */ 489 len = 1; 490 len = total % len; /* Assumed: the start is aligned. */ 491 /* FALLTHROUGH */ 492 case 'X': 493 size = -1; 494 if (total < len) 495 Perl_croak(aTHX_ "'X' outside of string in %s", _action( symptr ) ); 496 break; 497 case 'x' | TYPE_IS_SHRIEKING: 498 if (!len) /* Avoid division by 0 */ 499 len = 1; 500 star = total % len; /* Assumed: the start is aligned. */ 501 if (star) /* Other portable ways? */ 502 len = len - star; 503 else 504 len = 0; 505 /* FALLTHROUGH */ 506 case 'x': 507 case 'A': 508 case 'Z': 509 case 'a': 510 size = 1; 511 break; 512 case 'B': 513 case 'b': 514 len = (len + 7)/8; 515 size = 1; 516 break; 517 case 'H': 518 case 'h': 519 len = (len + 1)/2; 520 size = 1; 521 break; 522 523 case 'P': 524 len = 1; 525 size = sizeof(char*); 526 break; 527 } 528 } 529 total += len * size; 530 } 531 return total; 532 } 533 534 535 /* locate matching closing parenthesis or bracket 536 * returns char pointer to char after match, or NULL 537 */ 538 STATIC const char * 539 S_group_end(pTHX_ const char *patptr, const char *patend, char ender) 540 { 541 PERL_ARGS_ASSERT_GROUP_END; 542 543 while (patptr < patend) { 544 const char c = *patptr++; 545 546 if (isSPACE(c)) 547 continue; 548 else if (c == ender) 549 return patptr-1; 550 else if (c == '#') { 551 while (patptr < patend && *patptr != '\n') 552 patptr++; 553 continue; 554 } else if (c == '(') 555 patptr = group_end(patptr, patend, ')') + 1; 556 else if (c == '[') 557 patptr = group_end(patptr, patend, ']') + 1; 558 } 559 Perl_croak(aTHX_ "No group ending character '%c' found in template", 560 ender); 561 NOT_REACHED; /* NOTREACHED */ 562 } 563 564 565 /* Convert unsigned decimal number to binary. 566 * Expects a pointer to the first digit and address of length variable 567 * Advances char pointer to 1st non-digit char and returns number 568 */ 569 STATIC const char * 570 S_get_num(pTHX_ const char *patptr, SSize_t *lenptr ) 571 { 572 SSize_t len = *patptr++ - '0'; 573 574 PERL_ARGS_ASSERT_GET_NUM; 575 576 while (isDIGIT(*patptr)) { 577 SSize_t nlen = (len * 10) + (*patptr++ - '0'); 578 if (nlen < 0 || nlen/10 != len) 579 Perl_croak(aTHX_ "pack/unpack repeat count overflow"); 580 len = nlen; 581 } 582 *lenptr = len; 583 return patptr; 584 } 585 586 /* The marvellous template parsing routine: Using state stored in *symptr, 587 * locates next template code and count 588 */ 589 STATIC bool 590 S_next_symbol(pTHX_ tempsym_t* symptr ) 591 { 592 const char* patptr = symptr->patptr; 593 const char* const patend = symptr->patend; 594 595 PERL_ARGS_ASSERT_NEXT_SYMBOL; 596 597 symptr->flags &= ~FLAG_SLASH; 598 599 while (patptr < patend) { 600 if (isSPACE(*patptr)) 601 patptr++; 602 else if (*patptr == '#') { 603 patptr++; 604 while (patptr < patend && *patptr != '\n') 605 patptr++; 606 if (patptr < patend) 607 patptr++; 608 } else { 609 /* We should have found a template code */ 610 I32 code = *patptr++ & 0xFF; 611 U32 inherited_modifiers = 0; 612 613 if (code == ','){ /* grandfather in commas but with a warning */ 614 if (((symptr->flags & FLAG_COMMA) == 0) && ckWARN(WARN_UNPACK)){ 615 symptr->flags |= FLAG_COMMA; 616 Perl_warner(aTHX_ packWARN(WARN_UNPACK), 617 "Invalid type ',' in %s", _action( symptr ) ); 618 } 619 continue; 620 } 621 622 /* for '(', skip to ')' */ 623 if (code == '(') { 624 if( isDIGIT(*patptr) || *patptr == '*' || *patptr == '[' ) 625 Perl_croak(aTHX_ "()-group starts with a count in %s", 626 _action( symptr ) ); 627 symptr->grpbeg = patptr; 628 patptr = 1 + ( symptr->grpend = group_end(patptr, patend, ')') ); 629 if( symptr->level >= MAX_SUB_TEMPLATE_LEVEL ) 630 Perl_croak(aTHX_ "Too deeply nested ()-groups in %s", 631 _action( symptr ) ); 632 } 633 634 /* look for group modifiers to inherit */ 635 if (TYPE_ENDIANNESS(symptr->flags)) { 636 if (strchr(ENDIANNESS_ALLOWED_TYPES, TYPE_NO_MODIFIERS(code))) 637 inherited_modifiers |= TYPE_ENDIANNESS(symptr->flags); 638 } 639 640 /* look for modifiers */ 641 while (patptr < patend) { 642 const char *allowed; 643 I32 modifier; 644 switch (*patptr) { 645 case '!': 646 modifier = TYPE_IS_SHRIEKING; 647 allowed = "sSiIlLxXnNvV@."; 648 break; 649 case '>': 650 modifier = TYPE_IS_BIG_ENDIAN; 651 allowed = ENDIANNESS_ALLOWED_TYPES; 652 break; 653 case '<': 654 modifier = TYPE_IS_LITTLE_ENDIAN; 655 allowed = ENDIANNESS_ALLOWED_TYPES; 656 break; 657 default: 658 allowed = ""; 659 modifier = 0; 660 break; 661 } 662 663 if (modifier == 0) 664 break; 665 666 if (!strchr(allowed, TYPE_NO_MODIFIERS(code))) 667 Perl_croak(aTHX_ "'%c' allowed only after types %s in %s", *patptr, 668 allowed, _action( symptr ) ); 669 670 if (TYPE_ENDIANNESS(code | modifier) == TYPE_ENDIANNESS_MASK) 671 Perl_croak(aTHX_ "Can't use both '<' and '>' after type '%c' in %s", 672 (int) TYPE_NO_MODIFIERS(code), _action( symptr ) ); 673 else if (TYPE_ENDIANNESS(code | modifier | inherited_modifiers) == 674 TYPE_ENDIANNESS_MASK) 675 Perl_croak(aTHX_ "Can't use '%c' in a group with different byte-order in %s", 676 *patptr, _action( symptr ) ); 677 678 if ((code & modifier)) { 679 Perl_ck_warner(aTHX_ packWARN(WARN_UNPACK), 680 "Duplicate modifier '%c' after '%c' in %s", 681 *patptr, (int) TYPE_NO_MODIFIERS(code), 682 _action( symptr ) ); 683 } 684 685 code |= modifier; 686 patptr++; 687 } 688 689 /* inherit modifiers */ 690 code |= inherited_modifiers; 691 692 /* look for count and/or / */ 693 if (patptr < patend) { 694 if (isDIGIT(*patptr)) { 695 patptr = get_num( patptr, &symptr->length ); 696 symptr->howlen = e_number; 697 698 } else if (*patptr == '*') { 699 patptr++; 700 symptr->howlen = e_star; 701 702 } else if (*patptr == '[') { 703 const char* lenptr = ++patptr; 704 symptr->howlen = e_number; 705 patptr = group_end( patptr, patend, ']' ) + 1; 706 /* what kind of [] is it? */ 707 if (isDIGIT(*lenptr)) { 708 lenptr = get_num( lenptr, &symptr->length ); 709 if( *lenptr != ']' ) 710 Perl_croak(aTHX_ "Malformed integer in [] in %s", 711 _action( symptr ) ); 712 } else { 713 tempsym_t savsym = *symptr; 714 symptr->patend = patptr-1; 715 symptr->patptr = lenptr; 716 savsym.length = measure_struct(symptr); 717 *symptr = savsym; 718 } 719 } else { 720 symptr->howlen = e_no_len; 721 symptr->length = 1; 722 } 723 724 /* try to find / */ 725 while (patptr < patend) { 726 if (isSPACE(*patptr)) 727 patptr++; 728 else if (*patptr == '#') { 729 patptr++; 730 while (patptr < patend && *patptr != '\n') 731 patptr++; 732 if (patptr < patend) 733 patptr++; 734 } else { 735 if (*patptr == '/') { 736 symptr->flags |= FLAG_SLASH; 737 patptr++; 738 if (patptr < patend && 739 (isDIGIT(*patptr) || *patptr == '*' || *patptr == '[')) 740 Perl_croak(aTHX_ "'/' does not take a repeat count in %s", 741 _action( symptr ) ); 742 } 743 break; 744 } 745 } 746 } else { 747 /* at end - no count, no / */ 748 symptr->howlen = e_no_len; 749 symptr->length = 1; 750 } 751 752 symptr->code = code; 753 symptr->patptr = patptr; 754 return TRUE; 755 } 756 } 757 symptr->patptr = patptr; 758 return FALSE; 759 } 760 761 /* 762 There is no way to cleanly handle the case where we should process the 763 string per byte in its upgraded form while it's really in downgraded form 764 (e.g. estimates like strend-s as an upper bound for the number of 765 characters left wouldn't work). So if we foresee the need of this 766 (pattern starts with U or contains U0), we want to work on the encoded 767 version of the string. Users are advised to upgrade their pack string 768 themselves if they need to do a lot of unpacks like this on it 769 */ 770 STATIC bool 771 need_utf8(const char *pat, const char *patend) 772 { 773 bool first = TRUE; 774 775 PERL_ARGS_ASSERT_NEED_UTF8; 776 777 while (pat < patend) { 778 if (pat[0] == '#') { 779 pat++; 780 pat = (const char *) memchr(pat, '\n', patend-pat); 781 if (!pat) return FALSE; 782 } else if (pat[0] == 'U') { 783 if (first || pat[1] == '0') return TRUE; 784 } else first = FALSE; 785 pat++; 786 } 787 return FALSE; 788 } 789 790 STATIC char 791 first_symbol(const char *pat, const char *patend) { 792 PERL_ARGS_ASSERT_FIRST_SYMBOL; 793 794 while (pat < patend) { 795 if (pat[0] != '#') return pat[0]; 796 pat++; 797 pat = (const char *) memchr(pat, '\n', patend-pat); 798 if (!pat) return 0; 799 pat++; 800 } 801 return 0; 802 } 803 804 /* 805 806 =head1 Pack and Unpack 807 808 =for apidoc unpackstring 809 810 The engine implementing the C<unpack()> Perl function. 811 812 Using the template C<pat..patend>, this function unpacks the string 813 C<s..strend> into a number of mortal SVs, which it pushes onto the perl 814 argument (C<@_>) stack (so you will need to issue a C<PUTBACK> before and 815 C<SPAGAIN> after the call to this function). It returns the number of 816 pushed elements. 817 818 The C<strend> and C<patend> pointers should point to the byte following the 819 last character of each string. 820 821 Although this function returns its values on the perl argument stack, it 822 doesn't take any parameters from that stack (and thus in particular 823 there's no need to do a C<PUSHMARK> before calling it, unlike L</call_pv> for 824 example). 825 826 =cut */ 827 828 SSize_t 829 Perl_unpackstring(pTHX_ const char *pat, const char *patend, const char *s, const char *strend, U32 flags) 830 { 831 tempsym_t sym; 832 833 PERL_ARGS_ASSERT_UNPACKSTRING; 834 835 if (flags & FLAG_DO_UTF8) flags |= FLAG_WAS_UTF8; 836 else if (need_utf8(pat, patend)) { 837 /* We probably should try to avoid this in case a scalar context call 838 wouldn't get to the "U0" */ 839 STRLEN len = strend - s; 840 s = (char *) bytes_to_utf8((U8 *) s, &len); 841 SAVEFREEPV(s); 842 strend = s + len; 843 flags |= FLAG_DO_UTF8; 844 } 845 846 if (first_symbol(pat, patend) != 'U' && (flags & FLAG_DO_UTF8)) 847 flags |= FLAG_PARSE_UTF8; 848 849 TEMPSYM_INIT(&sym, pat, patend, flags); 850 851 return unpack_rec(&sym, s, s, strend, NULL ); 852 } 853 854 STATIC SSize_t 855 S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const char *strend, const char **new_s ) 856 { 857 dSP; 858 SV *sv = NULL; 859 const SSize_t start_sp_offset = SP - PL_stack_base; 860 howlen_t howlen; 861 SSize_t checksum = 0; 862 UV cuv = 0; 863 NV cdouble = 0.0; 864 const SSize_t bits_in_uv = CHAR_BIT * sizeof(cuv); 865 bool beyond = FALSE; 866 bool explicit_length; 867 const bool unpack_only_one = (symptr->flags & FLAG_UNPACK_ONLY_ONE) != 0; 868 bool utf8 = (symptr->flags & FLAG_PARSE_UTF8) ? 1 : 0; 869 870 PERL_ARGS_ASSERT_UNPACK_REC; 871 872 symptr->strbeg = s - strbeg; 873 874 while (next_symbol(symptr)) { 875 packprops_t props; 876 SSize_t len; 877 I32 datumtype = symptr->code; 878 bool needs_swap; 879 /* do first one only unless in list context 880 / is implemented by unpacking the count, then popping it from the 881 stack, so must check that we're not in the middle of a / */ 882 if ( unpack_only_one 883 && (SP - PL_stack_base == start_sp_offset + 1) 884 && (datumtype != '/') ) /* XXX can this be omitted */ 885 break; 886 887 switch (howlen = symptr->howlen) { 888 case e_star: 889 len = strend - strbeg; /* long enough */ 890 break; 891 default: 892 /* e_no_len and e_number */ 893 len = symptr->length; 894 break; 895 } 896 897 explicit_length = TRUE; 898 redo_switch: 899 beyond = s >= strend; 900 901 props = packprops[TYPE_NO_ENDIANNESS(datumtype)]; 902 if (props) { 903 /* props nonzero means we can process this letter. */ 904 const SSize_t size = props & PACK_SIZE_MASK; 905 const SSize_t howmany = (strend - s) / size; 906 if (len > howmany) 907 len = howmany; 908 909 if (!checksum || (props & PACK_SIZE_CANNOT_CSUM)) { 910 if (len && unpack_only_one) len = 1; 911 EXTEND(SP, len); 912 EXTEND_MORTAL(len); 913 } 914 } 915 916 needs_swap = NEEDS_SWAP(datumtype); 917 918 switch(TYPE_NO_ENDIANNESS(datumtype)) { 919 default: 920 Perl_croak(aTHX_ "Invalid type '%c' in unpack", (int)TYPE_NO_MODIFIERS(datumtype) ); 921 922 case '%': 923 if (howlen == e_no_len) 924 len = 16; /* len is not specified */ 925 checksum = len; 926 cuv = 0; 927 cdouble = 0; 928 continue; 929 930 case '(': 931 { 932 tempsym_t savsym = *symptr; 933 const U32 group_modifiers = TYPE_MODIFIERS(datumtype & ~symptr->flags); 934 symptr->flags |= group_modifiers; 935 symptr->patend = savsym.grpend; 936 symptr->previous = &savsym; 937 symptr->level++; 938 PUTBACK; 939 if (len && unpack_only_one) len = 1; 940 while (len--) { 941 symptr->patptr = savsym.grpbeg; 942 if (utf8) symptr->flags |= FLAG_PARSE_UTF8; 943 else symptr->flags &= ~FLAG_PARSE_UTF8; 944 unpack_rec(symptr, s, strbeg, strend, &s); 945 if (s == strend && savsym.howlen == e_star) 946 break; /* No way to continue */ 947 } 948 SPAGAIN; 949 savsym.flags = symptr->flags & ~group_modifiers; 950 *symptr = savsym; 951 break; 952 } 953 case '.' | TYPE_IS_SHRIEKING: 954 case '.': { 955 const char *from; 956 SV *sv; 957 const bool u8 = utf8 && !(datumtype & TYPE_IS_SHRIEKING); 958 if (howlen == e_star) from = strbeg; 959 else if (len <= 0) from = s; 960 else { 961 tempsym_t *group = symptr; 962 963 while (--len && group) group = group->previous; 964 from = group ? strbeg + group->strbeg : strbeg; 965 } 966 sv = from <= s ? 967 newSVuv( u8 ? (UV) utf8_length((const U8*)from, (const U8*)s) : (UV) (s-from)) : 968 newSViv(-(u8 ? (IV) utf8_length((const U8*)s, (const U8*)from) : (IV) (from-s))); 969 mXPUSHs(sv); 970 break; 971 } 972 case '@' | TYPE_IS_SHRIEKING: 973 case '@': 974 s = strbeg + symptr->strbeg; 975 if (utf8 && !(datumtype & TYPE_IS_SHRIEKING)) 976 { 977 while (len > 0) { 978 if (s >= strend) 979 Perl_croak(aTHX_ "'@' outside of string in unpack"); 980 s += UTF8SKIP(s); 981 len--; 982 } 983 if (s > strend) 984 Perl_croak(aTHX_ "'@' outside of string with malformed UTF-8 in unpack"); 985 } else { 986 if (strend-s < len) 987 Perl_croak(aTHX_ "'@' outside of string in unpack"); 988 s += len; 989 } 990 break; 991 case 'X' | TYPE_IS_SHRIEKING: 992 if (!len) /* Avoid division by 0 */ 993 len = 1; 994 if (utf8) { 995 const char *hop, *last; 996 SSize_t l = len; 997 hop = last = strbeg; 998 while (hop < s) { 999 hop += UTF8SKIP(hop); 1000 if (--l == 0) { 1001 last = hop; 1002 l = len; 1003 } 1004 } 1005 if (last > s) 1006 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); 1007 s = last; 1008 break; 1009 } 1010 len = (s - strbeg) % len; 1011 /* FALLTHROUGH */ 1012 case 'X': 1013 if (utf8) { 1014 while (len > 0) { 1015 if (s <= strbeg) 1016 Perl_croak(aTHX_ "'X' outside of string in unpack"); 1017 while (--s, UTF8_IS_CONTINUATION(*s)) { 1018 if (s <= strbeg) 1019 Perl_croak(aTHX_ "'X' outside of string in unpack"); 1020 } 1021 len--; 1022 } 1023 } else { 1024 if (len > s - strbeg) 1025 Perl_croak(aTHX_ "'X' outside of string in unpack" ); 1026 s -= len; 1027 } 1028 break; 1029 case 'x' | TYPE_IS_SHRIEKING: { 1030 SSize_t ai32; 1031 if (!len) /* Avoid division by 0 */ 1032 len = 1; 1033 if (utf8) ai32 = utf8_length((U8 *) strbeg, (U8 *) s) % len; 1034 else ai32 = (s - strbeg) % len; 1035 if (ai32 == 0) break; 1036 len -= ai32; 1037 } 1038 /* FALLTHROUGH */ 1039 case 'x': 1040 if (utf8) { 1041 while (len>0) { 1042 if (s >= strend) 1043 Perl_croak(aTHX_ "'x' outside of string in unpack"); 1044 s += UTF8SKIP(s); 1045 len--; 1046 } 1047 } else { 1048 if (len > strend - s) 1049 Perl_croak(aTHX_ "'x' outside of string in unpack"); 1050 s += len; 1051 } 1052 break; 1053 case '/': 1054 Perl_croak(aTHX_ "'/' must follow a numeric type in unpack"); 1055 1056 case 'A': 1057 case 'Z': 1058 case 'a': 1059 if (checksum) { 1060 /* Preliminary length estimate is assumed done in 'W' */ 1061 if (len > strend - s) len = strend - s; 1062 goto W_checksum; 1063 } 1064 if (utf8) { 1065 SSize_t l; 1066 const char *hop; 1067 for (l=len, hop=s; l>0; l--, hop += UTF8SKIP(hop)) { 1068 if (hop >= strend) { 1069 if (hop > strend) 1070 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); 1071 break; 1072 } 1073 } 1074 if (hop > strend) 1075 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); 1076 len = hop - s; 1077 } else if (len > strend - s) 1078 len = strend - s; 1079 1080 if (datumtype == 'Z') { 1081 /* 'Z' strips stuff after first null */ 1082 const char *ptr, *end; 1083 end = s + len; 1084 for (ptr = s; ptr < end; ptr++) if (*ptr == 0) break; 1085 sv = newSVpvn(s, ptr-s); 1086 if (howlen == e_star) /* exact for 'Z*' */ 1087 len = ptr-s + (ptr != strend ? 1 : 0); 1088 } else if (datumtype == 'A') { 1089 /* 'A' strips both nulls and spaces */ 1090 const char *ptr; 1091 if (utf8 && (symptr->flags & FLAG_WAS_UTF8)) { 1092 for (ptr = s+len-1; ptr >= s; ptr--) { 1093 if ( *ptr != 0 1094 && !UTF8_IS_CONTINUATION(*ptr) 1095 && !isSPACE_utf8_safe(ptr, strend)) 1096 { 1097 break; 1098 } 1099 } 1100 if (ptr >= s) ptr += UTF8SKIP(ptr); 1101 else ptr++; 1102 if (ptr > s+len) 1103 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); 1104 } else { 1105 for (ptr = s+len-1; ptr >= s; ptr--) 1106 if (*ptr != 0 && !isSPACE(*ptr)) break; 1107 ptr++; 1108 } 1109 sv = newSVpvn(s, ptr-s); 1110 } else sv = newSVpvn(s, len); 1111 1112 if (utf8) { 1113 SvUTF8_on(sv); 1114 /* Undo any upgrade done due to need_utf8() */ 1115 if (!(symptr->flags & FLAG_WAS_UTF8)) 1116 sv_utf8_downgrade(sv, 0); 1117 } 1118 mXPUSHs(sv); 1119 s += len; 1120 break; 1121 case 'B': 1122 case 'b': { 1123 char *str; 1124 if (howlen == e_star || len > (strend - s) * 8) 1125 len = (strend - s) * 8; 1126 if (checksum) { 1127 if (utf8) 1128 while (len >= 8 && s < strend) { 1129 cuv += PL_bitcount[utf8_to_byte(aTHX_ &s, strend, datumtype)]; 1130 len -= 8; 1131 } 1132 else 1133 while (len >= 8) { 1134 cuv += PL_bitcount[*(U8 *)s++]; 1135 len -= 8; 1136 } 1137 if (len && s < strend) { 1138 U8 bits; 1139 bits = SHIFT_BYTE(utf8, s, strend, datumtype); 1140 if (datumtype == 'b') 1141 while (len-- > 0) { 1142 if (bits & 1) cuv++; 1143 bits >>= 1; 1144 } 1145 else 1146 while (len-- > 0) { 1147 if (bits & 0x80) cuv++; 1148 bits <<= 1; 1149 } 1150 } 1151 break; 1152 } 1153 1154 sv = sv_2mortal(newSV(len ? len : 1)); 1155 SvPOK_on(sv); 1156 str = SvPVX(sv); 1157 if (datumtype == 'b') { 1158 U8 bits = 0; 1159 const SSize_t ai32 = len; 1160 for (len = 0; len < ai32; len++) { 1161 if (len & 7) bits >>= 1; 1162 else if (utf8) { 1163 if (s >= strend) break; 1164 bits = utf8_to_byte(aTHX_ &s, strend, datumtype); 1165 } else bits = *(U8 *) s++; 1166 *str++ = bits & 1 ? '1' : '0'; 1167 } 1168 } else { 1169 U8 bits = 0; 1170 const SSize_t ai32 = len; 1171 for (len = 0; len < ai32; len++) { 1172 if (len & 7) bits <<= 1; 1173 else if (utf8) { 1174 if (s >= strend) break; 1175 bits = utf8_to_byte(aTHX_ &s, strend, datumtype); 1176 } else bits = *(U8 *) s++; 1177 *str++ = bits & 0x80 ? '1' : '0'; 1178 } 1179 } 1180 *str = '\0'; 1181 SvCUR_set(sv, str - SvPVX_const(sv)); 1182 XPUSHs(sv); 1183 break; 1184 } 1185 case 'H': 1186 case 'h': { 1187 char *str = NULL; 1188 /* Preliminary length estimate, acceptable for utf8 too */ 1189 if (howlen == e_star || len > (strend - s) * 2) 1190 len = (strend - s) * 2; 1191 if (!checksum) { 1192 sv = sv_2mortal(newSV(len ? len : 1)); 1193 SvPOK_on(sv); 1194 str = SvPVX(sv); 1195 } 1196 if (datumtype == 'h') { 1197 U8 bits = 0; 1198 SSize_t ai32 = len; 1199 for (len = 0; len < ai32; len++) { 1200 if (len & 1) bits >>= 4; 1201 else if (utf8) { 1202 if (s >= strend) break; 1203 bits = utf8_to_byte(aTHX_ &s, strend, datumtype); 1204 } else bits = * (U8 *) s++; 1205 if (!checksum) 1206 *str++ = PL_hexdigit[bits & 15]; 1207 } 1208 } else { 1209 U8 bits = 0; 1210 const SSize_t ai32 = len; 1211 for (len = 0; len < ai32; len++) { 1212 if (len & 1) bits <<= 4; 1213 else if (utf8) { 1214 if (s >= strend) break; 1215 bits = utf8_to_byte(aTHX_ &s, strend, datumtype); 1216 } else bits = *(U8 *) s++; 1217 if (!checksum) 1218 *str++ = PL_hexdigit[(bits >> 4) & 15]; 1219 } 1220 } 1221 if (!checksum) { 1222 *str = '\0'; 1223 SvCUR_set(sv, str - SvPVX_const(sv)); 1224 XPUSHs(sv); 1225 } 1226 break; 1227 } 1228 case 'C': 1229 if (len == 0) { 1230 if (explicit_length) 1231 /* Switch to "character" mode */ 1232 utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0; 1233 break; 1234 } 1235 /* FALLTHROUGH */ 1236 case 'c': 1237 while (len-- > 0 && s < strend) { 1238 int aint; 1239 if (utf8) 1240 { 1241 STRLEN retlen; 1242 aint = utf8n_to_uvchr((U8 *) s, strend-s, &retlen, 1243 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); 1244 if (retlen == (STRLEN) -1) 1245 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); 1246 s += retlen; 1247 } 1248 else 1249 aint = *(U8 *)(s)++; 1250 if (aint >= 128 && datumtype != 'C') /* fake up signed chars */ 1251 aint -= 256; 1252 if (!checksum) 1253 mPUSHi(aint); 1254 else if (checksum > bits_in_uv) 1255 cdouble += (NV)aint; 1256 else 1257 cuv += aint; 1258 } 1259 break; 1260 case 'W': 1261 W_checksum: 1262 if (utf8) { 1263 while (len-- > 0 && s < strend) { 1264 STRLEN retlen; 1265 const UV val = utf8n_to_uvchr((U8 *) s, strend-s, &retlen, 1266 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); 1267 if (retlen == (STRLEN) -1) 1268 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); 1269 s += retlen; 1270 if (!checksum) 1271 mPUSHu(val); 1272 else if (checksum > bits_in_uv) 1273 cdouble += (NV) val; 1274 else 1275 cuv += val; 1276 } 1277 } else if (!checksum) 1278 while (len-- > 0) { 1279 const U8 ch = *(U8 *) s++; 1280 mPUSHu(ch); 1281 } 1282 else if (checksum > bits_in_uv) 1283 while (len-- > 0) cdouble += (NV) *(U8 *) s++; 1284 else 1285 while (len-- > 0) cuv += *(U8 *) s++; 1286 break; 1287 case 'U': 1288 if (len == 0) { 1289 if (explicit_length && howlen != e_star) { 1290 /* Switch to "bytes in UTF-8" mode */ 1291 if (symptr->flags & FLAG_DO_UTF8) utf8 = 0; 1292 else 1293 /* Should be impossible due to the need_utf8() test */ 1294 Perl_croak(aTHX_ "U0 mode on a byte string"); 1295 } 1296 break; 1297 } 1298 if (len > strend - s) len = strend - s; 1299 if (!checksum) { 1300 if (len && unpack_only_one) len = 1; 1301 EXTEND(SP, len); 1302 EXTEND_MORTAL(len); 1303 } 1304 while (len-- > 0 && s < strend) { 1305 STRLEN retlen; 1306 UV auv; 1307 if (utf8) { 1308 U8 result[UTF8_MAXLEN+1]; 1309 const char *ptr = s; 1310 STRLEN len; 1311 /* Bug: warns about bad utf8 even if we are short on bytes 1312 and will break out of the loop */ 1313 if (!S_utf8_to_bytes(aTHX_ &ptr, strend, (char *) result, 1, 1314 'U')) 1315 break; 1316 len = UTF8SKIP(result); 1317 if (!S_utf8_to_bytes(aTHX_ &ptr, strend, 1318 (char *) &result[1], len-1, 'U')) break; 1319 auv = NATIVE_TO_UNI(utf8n_to_uvchr(result, 1320 len, 1321 &retlen, 1322 UTF8_ALLOW_DEFAULT)); 1323 s = ptr; 1324 } else { 1325 auv = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, 1326 strend - s, 1327 &retlen, 1328 UTF8_ALLOW_DEFAULT)); 1329 if (retlen == (STRLEN) -1) 1330 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); 1331 s += retlen; 1332 } 1333 if (!checksum) 1334 mPUSHu(auv); 1335 else if (checksum > bits_in_uv) 1336 cdouble += (NV) auv; 1337 else 1338 cuv += auv; 1339 } 1340 break; 1341 case 's' | TYPE_IS_SHRIEKING: 1342 #if SHORTSIZE != SIZE16 1343 while (len-- > 0) { 1344 short ashort; 1345 SHIFT_VAR(utf8, s, strend, ashort, datumtype, needs_swap); 1346 if (!checksum) 1347 mPUSHi(ashort); 1348 else if (checksum > bits_in_uv) 1349 cdouble += (NV)ashort; 1350 else 1351 cuv += ashort; 1352 } 1353 break; 1354 #else 1355 /* FALLTHROUGH */ 1356 #endif 1357 case 's': 1358 while (len-- > 0) { 1359 I16 ai16; 1360 1361 #if U16SIZE > SIZE16 1362 ai16 = 0; 1363 #endif 1364 SHIFT16(utf8, s, strend, &ai16, datumtype, needs_swap); 1365 #if U16SIZE > SIZE16 1366 if (ai16 > 32767) 1367 ai16 -= 65536; 1368 #endif 1369 if (!checksum) 1370 mPUSHi(ai16); 1371 else if (checksum > bits_in_uv) 1372 cdouble += (NV)ai16; 1373 else 1374 cuv += ai16; 1375 } 1376 break; 1377 case 'S' | TYPE_IS_SHRIEKING: 1378 #if SHORTSIZE != SIZE16 1379 while (len-- > 0) { 1380 unsigned short aushort; 1381 SHIFT_VAR(utf8, s, strend, aushort, datumtype, needs_swap); 1382 if (!checksum) 1383 mPUSHu(aushort); 1384 else if (checksum > bits_in_uv) 1385 cdouble += (NV)aushort; 1386 else 1387 cuv += aushort; 1388 } 1389 break; 1390 #else 1391 /* FALLTHROUGH */ 1392 #endif 1393 case 'v': 1394 case 'n': 1395 case 'S': 1396 while (len-- > 0) { 1397 U16 au16; 1398 #if U16SIZE > SIZE16 1399 au16 = 0; 1400 #endif 1401 SHIFT16(utf8, s, strend, &au16, datumtype, needs_swap); 1402 if (datumtype == 'n') 1403 au16 = PerlSock_ntohs(au16); 1404 if (datumtype == 'v') 1405 au16 = vtohs(au16); 1406 if (!checksum) 1407 mPUSHu(au16); 1408 else if (checksum > bits_in_uv) 1409 cdouble += (NV) au16; 1410 else 1411 cuv += au16; 1412 } 1413 break; 1414 case 'v' | TYPE_IS_SHRIEKING: 1415 case 'n' | TYPE_IS_SHRIEKING: 1416 while (len-- > 0) { 1417 I16 ai16; 1418 # if U16SIZE > SIZE16 1419 ai16 = 0; 1420 # endif 1421 SHIFT16(utf8, s, strend, &ai16, datumtype, needs_swap); 1422 /* There should never be any byte-swapping here. */ 1423 assert(!TYPE_ENDIANNESS(datumtype)); 1424 if (datumtype == ('n' | TYPE_IS_SHRIEKING)) 1425 ai16 = (I16) PerlSock_ntohs((U16) ai16); 1426 if (datumtype == ('v' | TYPE_IS_SHRIEKING)) 1427 ai16 = (I16) vtohs((U16) ai16); 1428 if (!checksum) 1429 mPUSHi(ai16); 1430 else if (checksum > bits_in_uv) 1431 cdouble += (NV) ai16; 1432 else 1433 cuv += ai16; 1434 } 1435 break; 1436 case 'i': 1437 case 'i' | TYPE_IS_SHRIEKING: 1438 while (len-- > 0) { 1439 int aint; 1440 SHIFT_VAR(utf8, s, strend, aint, datumtype, needs_swap); 1441 if (!checksum) 1442 mPUSHi(aint); 1443 else if (checksum > bits_in_uv) 1444 cdouble += (NV)aint; 1445 else 1446 cuv += aint; 1447 } 1448 break; 1449 case 'I': 1450 case 'I' | TYPE_IS_SHRIEKING: 1451 while (len-- > 0) { 1452 unsigned int auint; 1453 SHIFT_VAR(utf8, s, strend, auint, datumtype, needs_swap); 1454 if (!checksum) 1455 mPUSHu(auint); 1456 else if (checksum > bits_in_uv) 1457 cdouble += (NV)auint; 1458 else 1459 cuv += auint; 1460 } 1461 break; 1462 case 'j': 1463 while (len-- > 0) { 1464 IV aiv; 1465 SHIFT_VAR(utf8, s, strend, aiv, datumtype, needs_swap); 1466 if (!checksum) 1467 mPUSHi(aiv); 1468 else if (checksum > bits_in_uv) 1469 cdouble += (NV)aiv; 1470 else 1471 cuv += aiv; 1472 } 1473 break; 1474 case 'J': 1475 while (len-- > 0) { 1476 UV auv; 1477 SHIFT_VAR(utf8, s, strend, auv, datumtype, needs_swap); 1478 if (!checksum) 1479 mPUSHu(auv); 1480 else if (checksum > bits_in_uv) 1481 cdouble += (NV)auv; 1482 else 1483 cuv += auv; 1484 } 1485 break; 1486 case 'l' | TYPE_IS_SHRIEKING: 1487 #if LONGSIZE != SIZE32 1488 while (len-- > 0) { 1489 long along; 1490 SHIFT_VAR(utf8, s, strend, along, datumtype, needs_swap); 1491 if (!checksum) 1492 mPUSHi(along); 1493 else if (checksum > bits_in_uv) 1494 cdouble += (NV)along; 1495 else 1496 cuv += along; 1497 } 1498 break; 1499 #else 1500 /* FALLTHROUGH */ 1501 #endif 1502 case 'l': 1503 while (len-- > 0) { 1504 I32 ai32; 1505 #if U32SIZE > SIZE32 1506 ai32 = 0; 1507 #endif 1508 SHIFT32(utf8, s, strend, &ai32, datumtype, needs_swap); 1509 #if U32SIZE > SIZE32 1510 if (ai32 > 2147483647) ai32 -= 4294967296; 1511 #endif 1512 if (!checksum) 1513 mPUSHi(ai32); 1514 else if (checksum > bits_in_uv) 1515 cdouble += (NV)ai32; 1516 else 1517 cuv += ai32; 1518 } 1519 break; 1520 case 'L' | TYPE_IS_SHRIEKING: 1521 #if LONGSIZE != SIZE32 1522 while (len-- > 0) { 1523 unsigned long aulong; 1524 SHIFT_VAR(utf8, s, strend, aulong, datumtype, needs_swap); 1525 if (!checksum) 1526 mPUSHu(aulong); 1527 else if (checksum > bits_in_uv) 1528 cdouble += (NV)aulong; 1529 else 1530 cuv += aulong; 1531 } 1532 break; 1533 #else 1534 /* FALLTHROUGH */ 1535 #endif 1536 case 'V': 1537 case 'N': 1538 case 'L': 1539 while (len-- > 0) { 1540 U32 au32; 1541 #if U32SIZE > SIZE32 1542 au32 = 0; 1543 #endif 1544 SHIFT32(utf8, s, strend, &au32, datumtype, needs_swap); 1545 if (datumtype == 'N') 1546 au32 = PerlSock_ntohl(au32); 1547 if (datumtype == 'V') 1548 au32 = vtohl(au32); 1549 if (!checksum) 1550 mPUSHu(au32); 1551 else if (checksum > bits_in_uv) 1552 cdouble += (NV)au32; 1553 else 1554 cuv += au32; 1555 } 1556 break; 1557 case 'V' | TYPE_IS_SHRIEKING: 1558 case 'N' | TYPE_IS_SHRIEKING: 1559 while (len-- > 0) { 1560 I32 ai32; 1561 #if U32SIZE > SIZE32 1562 ai32 = 0; 1563 #endif 1564 SHIFT32(utf8, s, strend, &ai32, datumtype, needs_swap); 1565 /* There should never be any byte swapping here. */ 1566 assert(!TYPE_ENDIANNESS(datumtype)); 1567 if (datumtype == ('N' | TYPE_IS_SHRIEKING)) 1568 ai32 = (I32)PerlSock_ntohl((U32)ai32); 1569 if (datumtype == ('V' | TYPE_IS_SHRIEKING)) 1570 ai32 = (I32)vtohl((U32)ai32); 1571 if (!checksum) 1572 mPUSHi(ai32); 1573 else if (checksum > bits_in_uv) 1574 cdouble += (NV)ai32; 1575 else 1576 cuv += ai32; 1577 } 1578 break; 1579 case 'p': 1580 while (len-- > 0) { 1581 const char *aptr; 1582 SHIFT_VAR(utf8, s, strend, aptr, datumtype, needs_swap); 1583 /* newSVpv generates undef if aptr is NULL */ 1584 mPUSHs(newSVpv(aptr, 0)); 1585 } 1586 break; 1587 case 'w': 1588 { 1589 UV auv = 0; 1590 size_t bytes = 0; 1591 1592 while (len > 0 && s < strend) { 1593 U8 ch; 1594 ch = SHIFT_BYTE(utf8, s, strend, datumtype); 1595 auv = (auv << 7) | (ch & 0x7f); 1596 /* UTF8_IS_XXXXX not right here because this is a BER, not 1597 * UTF-8 format - using constant 0x80 */ 1598 if (ch < 0x80) { 1599 bytes = 0; 1600 mPUSHu(auv); 1601 len--; 1602 auv = 0; 1603 continue; 1604 } 1605 if (++bytes >= sizeof(UV)) { /* promote to string */ 1606 const char *t; 1607 1608 sv = Perl_newSVpvf(aTHX_ "%.*" UVuf, 1609 (int)TYPE_DIGITS(UV), auv); 1610 while (s < strend) { 1611 ch = SHIFT_BYTE(utf8, s, strend, datumtype); 1612 sv = mul128(sv, (U8)(ch & 0x7f)); 1613 if (!(ch & 0x80)) { 1614 bytes = 0; 1615 break; 1616 } 1617 } 1618 t = SvPV_nolen_const(sv); 1619 while (*t == '0') 1620 t++; 1621 sv_chop(sv, t); 1622 mPUSHs(sv); 1623 len--; 1624 auv = 0; 1625 } 1626 } 1627 if ((s >= strend) && bytes) 1628 Perl_croak(aTHX_ "Unterminated compressed integer in unpack"); 1629 } 1630 break; 1631 case 'P': 1632 if (symptr->howlen == e_star) 1633 Perl_croak(aTHX_ "'P' must have an explicit size in unpack"); 1634 EXTEND(SP, 1); 1635 if (s + sizeof(char*) <= strend) { 1636 char *aptr; 1637 SHIFT_VAR(utf8, s, strend, aptr, datumtype, needs_swap); 1638 /* newSVpvn generates undef if aptr is NULL */ 1639 PUSHs(newSVpvn_flags(aptr, len, SVs_TEMP)); 1640 } 1641 break; 1642 #if defined(HAS_QUAD) && IVSIZE >= 8 1643 case 'q': 1644 while (len-- > 0) { 1645 Quad_t aquad; 1646 SHIFT_VAR(utf8, s, strend, aquad, datumtype, needs_swap); 1647 if (!checksum) 1648 mPUSHs(newSViv((IV)aquad)); 1649 else if (checksum > bits_in_uv) 1650 cdouble += (NV)aquad; 1651 else 1652 cuv += aquad; 1653 } 1654 break; 1655 case 'Q': 1656 while (len-- > 0) { 1657 Uquad_t auquad; 1658 SHIFT_VAR(utf8, s, strend, auquad, datumtype, needs_swap); 1659 if (!checksum) 1660 mPUSHs(newSVuv((UV)auquad)); 1661 else if (checksum > bits_in_uv) 1662 cdouble += (NV)auquad; 1663 else 1664 cuv += auquad; 1665 } 1666 break; 1667 #endif 1668 /* float and double added gnb@melba.bby.oz.au 22/11/89 */ 1669 case 'f': 1670 while (len-- > 0) { 1671 float afloat; 1672 SHIFT_VAR(utf8, s, strend, afloat, datumtype, needs_swap); 1673 if (!checksum) 1674 mPUSHn(afloat); 1675 else 1676 cdouble += afloat; 1677 } 1678 break; 1679 case 'd': 1680 while (len-- > 0) { 1681 double adouble; 1682 SHIFT_VAR(utf8, s, strend, adouble, datumtype, needs_swap); 1683 if (!checksum) 1684 mPUSHn(adouble); 1685 else 1686 cdouble += adouble; 1687 } 1688 break; 1689 case 'F': 1690 while (len-- > 0) { 1691 NV_bytes anv; 1692 SHIFT_BYTES(utf8, s, strend, anv.bytes, sizeof(anv.bytes), 1693 datumtype, needs_swap); 1694 if (!checksum) 1695 mPUSHn(anv.nv); 1696 else 1697 cdouble += anv.nv; 1698 } 1699 break; 1700 #if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE) 1701 case 'D': 1702 while (len-- > 0) { 1703 ld_bytes aldouble; 1704 SHIFT_BYTES(utf8, s, strend, aldouble.bytes, 1705 sizeof(aldouble.bytes), datumtype, needs_swap); 1706 /* The most common long double format, the x86 80-bit 1707 * extended precision, has either 2 or 6 unused bytes, 1708 * which may contain garbage, which may contain 1709 * unintentional data. While we do zero the bytes of 1710 * the long double data in pack(), here in unpack() we 1711 * don't, because it's really hard to envision that 1712 * reading the long double off aldouble would be 1713 * affected by the unused bytes. 1714 * 1715 * Note that trying to unpack 'long doubles' of 'long 1716 * doubles' packed in another system is in the general 1717 * case doomed without having more detail. */ 1718 if (!checksum) 1719 mPUSHn(aldouble.ld); 1720 else 1721 cdouble += aldouble.ld; 1722 } 1723 break; 1724 #endif 1725 case 'u': 1726 if (!checksum) { 1727 const STRLEN l = (STRLEN) (strend - s) * 3 / 4; 1728 sv = sv_2mortal(newSV(l)); 1729 if (l) { 1730 SvPOK_on(sv); 1731 *SvEND(sv) = '\0'; 1732 } 1733 } 1734 1735 /* Note that all legal uuencoded strings are ASCII printables, so 1736 * have the same representation under UTF-8 vs not. This means we 1737 * can ignore UTF8ness on legal input. For illegal we stop at the 1738 * first failure, and don't report where/what that is, so again we 1739 * can ignore UTF8ness */ 1740 1741 while (s < strend && *s != ' ' && ISUUCHAR(*s)) { 1742 I32 a, b, c, d; 1743 char hunk[3]; 1744 1745 len = PL_uudmap[*(U8*)s++] & 077; 1746 while (len > 0) { 1747 if (s < strend && ISUUCHAR(*s)) 1748 a = PL_uudmap[*(U8*)s++] & 077; 1749 else 1750 a = 0; 1751 if (s < strend && ISUUCHAR(*s)) 1752 b = PL_uudmap[*(U8*)s++] & 077; 1753 else 1754 b = 0; 1755 if (s < strend && ISUUCHAR(*s)) 1756 c = PL_uudmap[*(U8*)s++] & 077; 1757 else 1758 c = 0; 1759 if (s < strend && ISUUCHAR(*s)) 1760 d = PL_uudmap[*(U8*)s++] & 077; 1761 else 1762 d = 0; 1763 hunk[0] = (char)((a << 2) | (b >> 4)); 1764 hunk[1] = (char)((b << 4) | (c >> 2)); 1765 hunk[2] = (char)((c << 6) | d); 1766 if (!checksum) 1767 sv_catpvn(sv, hunk, (len > 3) ? 3 : len); 1768 len -= 3; 1769 } 1770 if (*s == '\n') 1771 s++; 1772 else /* possible checksum byte */ 1773 if (s + 1 < strend && s[1] == '\n') 1774 s += 2; 1775 } 1776 if (!checksum) 1777 XPUSHs(sv); 1778 break; 1779 } /* End of switch */ 1780 1781 if (checksum) { 1782 if (strchr("fFdD", TYPE_NO_MODIFIERS(datumtype)) || 1783 (checksum > bits_in_uv && 1784 strchr("cCsSiIlLnNUWvVqQjJ", TYPE_NO_MODIFIERS(datumtype))) ) { 1785 NV trouble, anv; 1786 1787 anv = (NV) (1 << (checksum & 15)); 1788 while (checksum >= 16) { 1789 checksum -= 16; 1790 anv *= 65536.0; 1791 } 1792 while (cdouble < 0.0) 1793 cdouble += anv; 1794 cdouble = Perl_modf(cdouble / anv, &trouble); 1795 #ifdef LONGDOUBLE_DOUBLEDOUBLE 1796 /* Workaround for powerpc doubledouble modfl bug: 1797 * close to 1.0L and -1.0L cdouble is 0, and trouble 1798 * is cdouble / anv. */ 1799 if (trouble != Perl_ceil(trouble)) { 1800 cdouble = trouble; 1801 if (cdouble > 1.0L) cdouble -= 1.0L; 1802 if (cdouble < -1.0L) cdouble += 1.0L; 1803 } 1804 #endif 1805 cdouble *= anv; 1806 sv = newSVnv(cdouble); 1807 } 1808 else { 1809 if (checksum < bits_in_uv) { 1810 UV mask = ((UV)1 << checksum) - 1; 1811 cuv &= mask; 1812 } 1813 sv = newSVuv(cuv); 1814 } 1815 mXPUSHs(sv); 1816 checksum = 0; 1817 } 1818 1819 if (symptr->flags & FLAG_SLASH){ 1820 if (SP - PL_stack_base - start_sp_offset <= 0) 1821 break; 1822 if( next_symbol(symptr) ){ 1823 if( symptr->howlen == e_number ) 1824 Perl_croak(aTHX_ "Count after length/code in unpack" ); 1825 if( beyond ){ 1826 /* ...end of char buffer then no decent length available */ 1827 Perl_croak(aTHX_ "length/code after end of string in unpack" ); 1828 } else { 1829 /* take top of stack (hope it's numeric) */ 1830 len = POPi; 1831 if( len < 0 ) 1832 Perl_croak(aTHX_ "Negative '/' count in unpack" ); 1833 } 1834 } else { 1835 Perl_croak(aTHX_ "Code missing after '/' in unpack" ); 1836 } 1837 datumtype = symptr->code; 1838 explicit_length = FALSE; 1839 goto redo_switch; 1840 } 1841 } 1842 1843 if (new_s) 1844 *new_s = s; 1845 PUTBACK; 1846 return SP - PL_stack_base - start_sp_offset; 1847 } 1848 1849 PP(pp_unpack) 1850 { 1851 dSP; 1852 dPOPPOPssrl; 1853 U8 gimme = GIMME_V; 1854 STRLEN llen; 1855 STRLEN rlen; 1856 const char *pat = SvPV_const(left, llen); 1857 const char *s = SvPV_const(right, rlen); 1858 const char *strend = s + rlen; 1859 const char *patend = pat + llen; 1860 SSize_t cnt; 1861 1862 PUTBACK; 1863 cnt = unpackstring(pat, patend, s, strend, 1864 ((gimme == G_SCALAR) ? FLAG_UNPACK_ONLY_ONE : 0) 1865 | (DO_UTF8(right) ? FLAG_DO_UTF8 : 0)); 1866 1867 SPAGAIN; 1868 if ( !cnt && gimme == G_SCALAR ) 1869 PUSHs(&PL_sv_undef); 1870 RETURN; 1871 } 1872 1873 STATIC U8 * 1874 doencodes(U8 *h, const U8 *s, SSize_t len) 1875 { 1876 *h++ = PL_uuemap[len]; 1877 while (len > 2) { 1878 *h++ = PL_uuemap[(077 & (s[0] >> 2))]; 1879 *h++ = PL_uuemap[(077 & (((s[0] << 4) & 060) | ((s[1] >> 4) & 017)))]; 1880 *h++ = PL_uuemap[(077 & (((s[1] << 2) & 074) | ((s[2] >> 6) & 03)))]; 1881 *h++ = PL_uuemap[(077 & (s[2] & 077))]; 1882 s += 3; 1883 len -= 3; 1884 } 1885 if (len > 0) { 1886 const U8 r = (len > 1 ? s[1] : '\0'); 1887 *h++ = PL_uuemap[(077 & (s[0] >> 2))]; 1888 *h++ = PL_uuemap[(077 & (((s[0] << 4) & 060) | ((r >> 4) & 017)))]; 1889 *h++ = PL_uuemap[(077 & ((r << 2) & 074))]; 1890 *h++ = PL_uuemap[0]; 1891 } 1892 *h++ = '\n'; 1893 return h; 1894 } 1895 1896 STATIC SV * 1897 S_is_an_int(pTHX_ const char *s, STRLEN l) 1898 { 1899 SV *result = newSVpvn(s, l); 1900 char *const result_c = SvPV_nolen(result); /* convenience */ 1901 char *out = result_c; 1902 bool skip = 1; 1903 bool ignore = 0; 1904 1905 PERL_ARGS_ASSERT_IS_AN_INT; 1906 1907 while (*s) { 1908 switch (*s) { 1909 case ' ': 1910 break; 1911 case '+': 1912 if (!skip) { 1913 SvREFCNT_dec(result); 1914 return (NULL); 1915 } 1916 break; 1917 case '0': 1918 case '1': 1919 case '2': 1920 case '3': 1921 case '4': 1922 case '5': 1923 case '6': 1924 case '7': 1925 case '8': 1926 case '9': 1927 skip = 0; 1928 if (!ignore) { 1929 *(out++) = *s; 1930 } 1931 break; 1932 case '.': 1933 ignore = 1; 1934 break; 1935 default: 1936 SvREFCNT_dec(result); 1937 return (NULL); 1938 } 1939 s++; 1940 } 1941 *(out++) = '\0'; 1942 SvCUR_set(result, out - result_c); 1943 return (result); 1944 } 1945 1946 /* pnum must be '\0' terminated */ 1947 STATIC int 1948 S_div128(pTHX_ SV *pnum, bool *done) 1949 { 1950 STRLEN len; 1951 char * const s = SvPV(pnum, len); 1952 char *t = s; 1953 int m = 0; 1954 1955 PERL_ARGS_ASSERT_DIV128; 1956 1957 *done = 1; 1958 while (*t) { 1959 const int i = m * 10 + (*t - '0'); 1960 const int r = (i >> 7); /* r < 10 */ 1961 m = i & 0x7F; 1962 if (r) { 1963 *done = 0; 1964 } 1965 *(t++) = '0' + r; 1966 } 1967 *(t++) = '\0'; 1968 SvCUR_set(pnum, (STRLEN) (t - s)); 1969 return (m); 1970 } 1971 1972 /* 1973 =for apidoc packlist 1974 1975 The engine implementing C<pack()> Perl function. 1976 1977 =cut 1978 */ 1979 1980 void 1981 Perl_packlist(pTHX_ SV *cat, const char *pat, const char *patend, SV **beglist, SV **endlist ) 1982 { 1983 tempsym_t sym; 1984 1985 PERL_ARGS_ASSERT_PACKLIST; 1986 1987 TEMPSYM_INIT(&sym, pat, patend, FLAG_PACK); 1988 1989 /* We're going to do changes through SvPVX(cat). Make sure it's valid. 1990 Also make sure any UTF8 flag is loaded */ 1991 SvPV_force_nolen(cat); 1992 if (DO_UTF8(cat)) 1993 sym.flags |= FLAG_PARSE_UTF8 | FLAG_DO_UTF8; 1994 1995 (void)pack_rec( cat, &sym, beglist, endlist ); 1996 } 1997 1998 /* like sv_utf8_upgrade, but also repoint the group start markers */ 1999 STATIC void 2000 marked_upgrade(pTHX_ SV *sv, tempsym_t *sym_ptr) { 2001 STRLEN len; 2002 tempsym_t *group; 2003 const char *from_ptr, *from_start, *from_end, **marks, **m; 2004 char *to_start, *to_ptr; 2005 2006 if (SvUTF8(sv)) return; 2007 2008 from_start = SvPVX_const(sv); 2009 from_end = from_start + SvCUR(sv); 2010 for (from_ptr = from_start; from_ptr < from_end; from_ptr++) 2011 if (!NATIVE_BYTE_IS_INVARIANT(*from_ptr)) break; 2012 if (from_ptr == from_end) { 2013 /* Simple case: no character needs to be changed */ 2014 SvUTF8_on(sv); 2015 return; 2016 } 2017 2018 len = (from_end-from_ptr)*UTF8_EXPAND+(from_ptr-from_start)+1; 2019 Newx(to_start, len, char); 2020 Copy(from_start, to_start, from_ptr-from_start, char); 2021 to_ptr = to_start + (from_ptr-from_start); 2022 2023 Newx(marks, sym_ptr->level+2, const char *); 2024 for (group=sym_ptr; group; group = group->previous) 2025 marks[group->level] = from_start + group->strbeg; 2026 marks[sym_ptr->level+1] = from_end+1; 2027 for (m = marks; *m < from_ptr; m++) 2028 *m = to_start + (*m-from_start); 2029 2030 for (;from_ptr < from_end; from_ptr++) { 2031 while (*m == from_ptr) *m++ = to_ptr; 2032 to_ptr = (char *) uvchr_to_utf8((U8 *) to_ptr, *(U8 *) from_ptr); 2033 } 2034 *to_ptr = 0; 2035 2036 while (*m == from_ptr) *m++ = to_ptr; 2037 if (m != marks + sym_ptr->level+1) { 2038 Safefree(marks); 2039 Safefree(to_start); 2040 Perl_croak(aTHX_ "panic: marks beyond string end, m=%p, marks=%p, " 2041 "level=%d", m, marks, sym_ptr->level); 2042 } 2043 for (group=sym_ptr; group; group = group->previous) 2044 group->strbeg = marks[group->level] - to_start; 2045 Safefree(marks); 2046 2047 if (SvOOK(sv)) { 2048 if (SvIVX(sv)) { 2049 SvLEN_set(sv, SvLEN(sv) + SvIVX(sv)); 2050 from_start -= SvIVX(sv); 2051 SvIV_set(sv, 0); 2052 } 2053 SvFLAGS(sv) &= ~SVf_OOK; 2054 } 2055 if (SvLEN(sv) != 0) 2056 Safefree(from_start); 2057 SvPV_set(sv, to_start); 2058 SvCUR_set(sv, to_ptr - to_start); 2059 SvLEN_set(sv, len); 2060 SvUTF8_on(sv); 2061 } 2062 2063 /* Exponential string grower. Makes string extension effectively O(n) 2064 needed says how many extra bytes we need (not counting the final '\0') 2065 Only grows the string if there is an actual lack of space 2066 */ 2067 STATIC char * 2068 S_sv_exp_grow(pTHX_ SV *sv, STRLEN needed) { 2069 const STRLEN cur = SvCUR(sv); 2070 const STRLEN len = SvLEN(sv); 2071 STRLEN extend; 2072 2073 PERL_ARGS_ASSERT_SV_EXP_GROW; 2074 2075 if (len - cur > needed) return SvPVX(sv); 2076 extend = needed > len ? needed : len; 2077 return SvGROW(sv, len+extend+1); 2078 } 2079 2080 static SV * 2081 S_sv_check_infnan(pTHX_ SV *sv, I32 datumtype) 2082 { 2083 SvGETMAGIC(sv); 2084 if (UNLIKELY(SvAMAGIC(sv))) 2085 sv = sv_2num(sv); 2086 if (UNLIKELY(isinfnansv(sv))) { 2087 const I32 c = TYPE_NO_MODIFIERS(datumtype); 2088 const NV nv = SvNV_nomg(sv); 2089 if (c == 'w') 2090 Perl_croak(aTHX_ "Cannot compress %" NVgf " in pack", nv); 2091 else 2092 Perl_croak(aTHX_ "Cannot pack %" NVgf " with '%c'", nv, (int) c); 2093 } 2094 return sv; 2095 } 2096 2097 #define SvIV_no_inf(sv,d) \ 2098 ((sv) = S_sv_check_infnan(aTHX_ sv,d), SvIV_nomg(sv)) 2099 #define SvUV_no_inf(sv,d) \ 2100 ((sv) = S_sv_check_infnan(aTHX_ sv,d), SvUV_nomg(sv)) 2101 2102 STATIC 2103 SV ** 2104 S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist ) 2105 { 2106 tempsym_t lookahead; 2107 SSize_t items = endlist - beglist; 2108 bool found = next_symbol(symptr); 2109 bool utf8 = (symptr->flags & FLAG_PARSE_UTF8) ? 1 : 0; 2110 bool warn_utf8 = ckWARN(WARN_UTF8); 2111 char* from; 2112 2113 PERL_ARGS_ASSERT_PACK_REC; 2114 2115 if (symptr->level == 0 && found && symptr->code == 'U') { 2116 marked_upgrade(aTHX_ cat, symptr); 2117 symptr->flags |= FLAG_DO_UTF8; 2118 utf8 = 0; 2119 } 2120 symptr->strbeg = SvCUR(cat); 2121 2122 while (found) { 2123 SV *fromstr; 2124 STRLEN fromlen; 2125 SSize_t len; 2126 SV *lengthcode = NULL; 2127 I32 datumtype = symptr->code; 2128 howlen_t howlen = symptr->howlen; 2129 char *start = SvPVX(cat); 2130 char *cur = start + SvCUR(cat); 2131 bool needs_swap; 2132 2133 #define NEXTFROM (lengthcode ? lengthcode : items > 0 ? (--items, *beglist++) : &PL_sv_no) 2134 #define PEEKFROM (lengthcode ? lengthcode : items > 0 ? *beglist : &PL_sv_no) 2135 2136 switch (howlen) { 2137 case e_star: 2138 len = strchr("@Xxu", TYPE_NO_MODIFIERS(datumtype)) ? 2139 0 : items; 2140 break; 2141 default: 2142 /* e_no_len and e_number */ 2143 len = symptr->length; 2144 break; 2145 } 2146 2147 if (len) { 2148 packprops_t props = packprops[TYPE_NO_ENDIANNESS(datumtype)]; 2149 2150 if (props && !(props & PACK_SIZE_UNPREDICTABLE)) { 2151 /* We can process this letter. */ 2152 STRLEN size = props & PACK_SIZE_MASK; 2153 GROWING2(utf8, cat, start, cur, size, (STRLEN)len); 2154 } 2155 } 2156 2157 /* Look ahead for next symbol. Do we have code/code? */ 2158 lookahead = *symptr; 2159 found = next_symbol(&lookahead); 2160 if (symptr->flags & FLAG_SLASH) { 2161 IV count; 2162 if (!found) Perl_croak(aTHX_ "Code missing after '/' in pack"); 2163 if (strchr("aAZ", lookahead.code)) { 2164 if (lookahead.howlen == e_number) count = lookahead.length; 2165 else { 2166 if (items > 0) { 2167 count = sv_len_utf8(*beglist); 2168 } 2169 else count = 0; 2170 if (lookahead.code == 'Z') count++; 2171 } 2172 } else { 2173 if (lookahead.howlen == e_number && lookahead.length < items) 2174 count = lookahead.length; 2175 else count = items; 2176 } 2177 lookahead.howlen = e_number; 2178 lookahead.length = count; 2179 lengthcode = sv_2mortal(newSViv(count)); 2180 } 2181 2182 needs_swap = NEEDS_SWAP(datumtype); 2183 2184 /* Code inside the switch must take care to properly update 2185 cat (CUR length and '\0' termination) if it updated *cur and 2186 doesn't simply leave using break */ 2187 switch (TYPE_NO_ENDIANNESS(datumtype)) { 2188 default: 2189 Perl_croak(aTHX_ "Invalid type '%c' in pack", 2190 (int) TYPE_NO_MODIFIERS(datumtype)); 2191 case '%': 2192 Perl_croak(aTHX_ "'%%' may not be used in pack"); 2193 2194 case '.' | TYPE_IS_SHRIEKING: 2195 case '.': 2196 if (howlen == e_star) from = start; 2197 else if (len == 0) from = cur; 2198 else { 2199 tempsym_t *group = symptr; 2200 2201 while (--len && group) group = group->previous; 2202 from = group ? start + group->strbeg : start; 2203 } 2204 fromstr = NEXTFROM; 2205 len = SvIV_no_inf(fromstr, datumtype); 2206 goto resize; 2207 case '@' | TYPE_IS_SHRIEKING: 2208 case '@': 2209 from = start + symptr->strbeg; 2210 resize: 2211 if (utf8 && !(datumtype & TYPE_IS_SHRIEKING)) 2212 if (len >= 0) { 2213 while (len && from < cur) { 2214 from += UTF8SKIP(from); 2215 len--; 2216 } 2217 if (from > cur) 2218 Perl_croak(aTHX_ "Malformed UTF-8 string in pack"); 2219 if (len) { 2220 /* Here we know from == cur */ 2221 grow: 2222 GROWING(0, cat, start, cur, len); 2223 Zero(cur, len, char); 2224 cur += len; 2225 } else if (from < cur) { 2226 len = cur - from; 2227 goto shrink; 2228 } else goto no_change; 2229 } else { 2230 cur = from; 2231 len = -len; 2232 goto utf8_shrink; 2233 } 2234 else { 2235 len -= cur - from; 2236 if (len > 0) goto grow; 2237 if (len == 0) goto no_change; 2238 len = -len; 2239 goto shrink; 2240 } 2241 break; 2242 2243 case '(': { 2244 tempsym_t savsym = *symptr; 2245 U32 group_modifiers = TYPE_MODIFIERS(datumtype & ~symptr->flags); 2246 symptr->flags |= group_modifiers; 2247 symptr->patend = savsym.grpend; 2248 symptr->level++; 2249 symptr->previous = &lookahead; 2250 while (len--) { 2251 U32 was_utf8; 2252 if (utf8) symptr->flags |= FLAG_PARSE_UTF8; 2253 else symptr->flags &= ~FLAG_PARSE_UTF8; 2254 was_utf8 = SvUTF8(cat); 2255 symptr->patptr = savsym.grpbeg; 2256 beglist = pack_rec(cat, symptr, beglist, endlist); 2257 if (SvUTF8(cat) != was_utf8) 2258 /* This had better be an upgrade while in utf8==0 mode */ 2259 utf8 = 1; 2260 2261 if (savsym.howlen == e_star && beglist == endlist) 2262 break; /* No way to continue */ 2263 } 2264 items = endlist - beglist; 2265 lookahead.flags = symptr->flags & ~group_modifiers; 2266 goto no_change; 2267 } 2268 case 'X' | TYPE_IS_SHRIEKING: 2269 if (!len) /* Avoid division by 0 */ 2270 len = 1; 2271 if (utf8) { 2272 char *hop, *last; 2273 SSize_t l = len; 2274 hop = last = start; 2275 while (hop < cur) { 2276 hop += UTF8SKIP(hop); 2277 if (--l == 0) { 2278 last = hop; 2279 l = len; 2280 } 2281 } 2282 if (last > cur) 2283 Perl_croak(aTHX_ "Malformed UTF-8 string in pack"); 2284 cur = last; 2285 break; 2286 } 2287 len = (cur-start) % len; 2288 /* FALLTHROUGH */ 2289 case 'X': 2290 if (utf8) { 2291 if (len < 1) goto no_change; 2292 utf8_shrink: 2293 while (len > 0) { 2294 if (cur <= start) 2295 Perl_croak(aTHX_ "'%c' outside of string in pack", 2296 (int) TYPE_NO_MODIFIERS(datumtype)); 2297 while (--cur, UTF8_IS_CONTINUATION(*cur)) { 2298 if (cur <= start) 2299 Perl_croak(aTHX_ "'%c' outside of string in pack", 2300 (int) TYPE_NO_MODIFIERS(datumtype)); 2301 } 2302 len--; 2303 } 2304 } else { 2305 shrink: 2306 if (cur - start < len) 2307 Perl_croak(aTHX_ "'%c' outside of string in pack", 2308 (int) TYPE_NO_MODIFIERS(datumtype)); 2309 cur -= len; 2310 } 2311 if (cur < start+symptr->strbeg) { 2312 /* Make sure group starts don't point into the void */ 2313 tempsym_t *group; 2314 const STRLEN length = cur-start; 2315 for (group = symptr; 2316 group && length < group->strbeg; 2317 group = group->previous) group->strbeg = length; 2318 lookahead.strbeg = length; 2319 } 2320 break; 2321 case 'x' | TYPE_IS_SHRIEKING: { 2322 SSize_t ai32; 2323 if (!len) /* Avoid division by 0 */ 2324 len = 1; 2325 if (utf8) ai32 = utf8_length((U8 *) start, (U8 *) cur) % len; 2326 else ai32 = (cur - start) % len; 2327 if (ai32 == 0) goto no_change; 2328 len -= ai32; 2329 } 2330 /* FALLTHROUGH */ 2331 case 'x': 2332 goto grow; 2333 case 'A': 2334 case 'Z': 2335 case 'a': { 2336 const char *aptr; 2337 2338 fromstr = NEXTFROM; 2339 aptr = SvPV_const(fromstr, fromlen); 2340 if (DO_UTF8(fromstr)) { 2341 const char *end, *s; 2342 2343 if (!utf8 && !SvUTF8(cat)) { 2344 marked_upgrade(aTHX_ cat, symptr); 2345 lookahead.flags |= FLAG_DO_UTF8; 2346 lookahead.strbeg = symptr->strbeg; 2347 utf8 = 1; 2348 start = SvPVX(cat); 2349 cur = start + SvCUR(cat); 2350 } 2351 if (howlen == e_star) { 2352 if (utf8) goto string_copy; 2353 len = fromlen+1; 2354 } 2355 s = aptr; 2356 end = aptr + fromlen; 2357 fromlen = datumtype == 'Z' ? len-1 : len; 2358 while ((SSize_t) fromlen > 0 && s < end) { 2359 s += UTF8SKIP(s); 2360 fromlen--; 2361 } 2362 if (s > end) 2363 Perl_croak(aTHX_ "Malformed UTF-8 string in pack"); 2364 if (utf8) { 2365 len = fromlen; 2366 if (datumtype == 'Z') len++; 2367 fromlen = s-aptr; 2368 len += fromlen; 2369 2370 goto string_copy; 2371 } 2372 fromlen = len - fromlen; 2373 if (datumtype == 'Z') fromlen--; 2374 if (howlen == e_star) { 2375 len = fromlen; 2376 if (datumtype == 'Z') len++; 2377 } 2378 GROWING(0, cat, start, cur, len); 2379 if (!S_utf8_to_bytes(aTHX_ &aptr, end, cur, fromlen, 2380 datumtype | TYPE_IS_PACK)) 2381 Perl_croak(aTHX_ "panic: predicted utf8 length not available, " 2382 "for '%c', aptr=%p end=%p cur=%p, fromlen=%zu", 2383 (int)datumtype, aptr, end, cur, fromlen); 2384 cur += fromlen; 2385 len -= fromlen; 2386 } else if (utf8) { 2387 if (howlen == e_star) { 2388 len = fromlen; 2389 if (datumtype == 'Z') len++; 2390 } 2391 if (len <= (SSize_t) fromlen) { 2392 fromlen = len; 2393 if (datumtype == 'Z' && fromlen > 0) fromlen--; 2394 } 2395 /* assumes a byte expands to at most UTF8_EXPAND bytes on 2396 upgrade, so: 2397 expected_length <= from_len*UTF8_EXPAND + (len-from_len) */ 2398 GROWING(0, cat, start, cur, fromlen*(UTF8_EXPAND-1)+len); 2399 len -= fromlen; 2400 while (fromlen > 0) { 2401 cur = (char *) uvchr_to_utf8((U8 *) cur, * (U8 *) aptr); 2402 aptr++; 2403 fromlen--; 2404 } 2405 } else { 2406 string_copy: 2407 if (howlen == e_star) { 2408 len = fromlen; 2409 if (datumtype == 'Z') len++; 2410 } 2411 if (len <= (SSize_t) fromlen) { 2412 fromlen = len; 2413 if (datumtype == 'Z' && fromlen > 0) fromlen--; 2414 } 2415 GROWING(0, cat, start, cur, len); 2416 Copy(aptr, cur, fromlen, char); 2417 cur += fromlen; 2418 len -= fromlen; 2419 } 2420 memset(cur, datumtype == 'A' ? ' ' : '\0', len); 2421 cur += len; 2422 SvTAINT(cat); 2423 break; 2424 } 2425 case 'B': 2426 case 'b': { 2427 const char *str, *end; 2428 SSize_t l, field_len; 2429 U8 bits; 2430 bool utf8_source; 2431 U32 utf8_flags; 2432 2433 fromstr = NEXTFROM; 2434 str = SvPV_const(fromstr, fromlen); 2435 end = str + fromlen; 2436 if (DO_UTF8(fromstr)) { 2437 utf8_source = TRUE; 2438 utf8_flags = warn_utf8 ? 0 : UTF8_ALLOW_ANY; 2439 } else { 2440 utf8_source = FALSE; 2441 utf8_flags = 0; /* Unused, but keep compilers happy */ 2442 } 2443 if (howlen == e_star) len = fromlen; 2444 field_len = (len+7)/8; 2445 GROWING(utf8, cat, start, cur, field_len); 2446 if (len > (SSize_t)fromlen) len = fromlen; 2447 bits = 0; 2448 l = 0; 2449 if (datumtype == 'B') 2450 while (l++ < len) { 2451 if (utf8_source) { 2452 UV val = 0; 2453 NEXT_UNI_VAL(val, cur, str, end, utf8_flags); 2454 bits |= val & 1; 2455 } else bits |= *str++ & 1; 2456 if (l & 7) bits <<= 1; 2457 else { 2458 PUSH_BYTE(utf8, cur, bits); 2459 bits = 0; 2460 } 2461 } 2462 else 2463 /* datumtype == 'b' */ 2464 while (l++ < len) { 2465 if (utf8_source) { 2466 UV val = 0; 2467 NEXT_UNI_VAL(val, cur, str, end, utf8_flags); 2468 if (val & 1) bits |= 0x80; 2469 } else if (*str++ & 1) 2470 bits |= 0x80; 2471 if (l & 7) bits >>= 1; 2472 else { 2473 PUSH_BYTE(utf8, cur, bits); 2474 bits = 0; 2475 } 2476 } 2477 l--; 2478 if (l & 7) { 2479 if (datumtype == 'B') 2480 bits <<= 7 - (l & 7); 2481 else 2482 bits >>= 7 - (l & 7); 2483 PUSH_BYTE(utf8, cur, bits); 2484 l += 7; 2485 } 2486 /* Determine how many chars are left in the requested field */ 2487 l /= 8; 2488 if (howlen == e_star) field_len = 0; 2489 else field_len -= l; 2490 Zero(cur, field_len, char); 2491 cur += field_len; 2492 break; 2493 } 2494 case 'H': 2495 case 'h': { 2496 const char *str, *end; 2497 SSize_t l, field_len; 2498 U8 bits; 2499 bool utf8_source; 2500 U32 utf8_flags; 2501 2502 fromstr = NEXTFROM; 2503 str = SvPV_const(fromstr, fromlen); 2504 end = str + fromlen; 2505 if (DO_UTF8(fromstr)) { 2506 utf8_source = TRUE; 2507 utf8_flags = warn_utf8 ? 0 : UTF8_ALLOW_ANY; 2508 } else { 2509 utf8_source = FALSE; 2510 utf8_flags = 0; /* Unused, but keep compilers happy */ 2511 } 2512 if (howlen == e_star) len = fromlen; 2513 field_len = (len+1)/2; 2514 GROWING(utf8, cat, start, cur, field_len); 2515 if (!utf8_source && len > (SSize_t)fromlen) len = fromlen; 2516 bits = 0; 2517 l = 0; 2518 if (datumtype == 'H') 2519 while (l++ < len) { 2520 if (utf8_source) { 2521 UV val = 0; 2522 NEXT_UNI_VAL(val, cur, str, end, utf8_flags); 2523 if (val < 256 && isALPHA(val)) 2524 bits |= (val + 9) & 0xf; 2525 else 2526 bits |= val & 0xf; 2527 } else if (isALPHA(*str)) 2528 bits |= (*str++ + 9) & 0xf; 2529 else 2530 bits |= *str++ & 0xf; 2531 if (l & 1) bits <<= 4; 2532 else { 2533 PUSH_BYTE(utf8, cur, bits); 2534 bits = 0; 2535 } 2536 } 2537 else 2538 while (l++ < len) { 2539 if (utf8_source) { 2540 UV val = 0; 2541 NEXT_UNI_VAL(val, cur, str, end, utf8_flags); 2542 if (val < 256 && isALPHA(val)) 2543 bits |= ((val + 9) & 0xf) << 4; 2544 else 2545 bits |= (val & 0xf) << 4; 2546 } else if (isALPHA(*str)) 2547 bits |= ((*str++ + 9) & 0xf) << 4; 2548 else 2549 bits |= (*str++ & 0xf) << 4; 2550 if (l & 1) bits >>= 4; 2551 else { 2552 PUSH_BYTE(utf8, cur, bits); 2553 bits = 0; 2554 } 2555 } 2556 l--; 2557 if (l & 1) { 2558 PUSH_BYTE(utf8, cur, bits); 2559 l++; 2560 } 2561 /* Determine how many chars are left in the requested field */ 2562 l /= 2; 2563 if (howlen == e_star) field_len = 0; 2564 else field_len -= l; 2565 Zero(cur, field_len, char); 2566 cur += field_len; 2567 break; 2568 } 2569 case 'c': 2570 while (len-- > 0) { 2571 IV aiv; 2572 fromstr = NEXTFROM; 2573 aiv = SvIV_no_inf(fromstr, datumtype); 2574 if ((-128 > aiv || aiv > 127)) 2575 Perl_ck_warner(aTHX_ packWARN(WARN_PACK), 2576 "Character in 'c' format wrapped in pack"); 2577 PUSH_BYTE(utf8, cur, (U8)(aiv & 0xff)); 2578 } 2579 break; 2580 case 'C': 2581 if (len == 0) { 2582 utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0; 2583 break; 2584 } 2585 while (len-- > 0) { 2586 IV aiv; 2587 fromstr = NEXTFROM; 2588 aiv = SvIV_no_inf(fromstr, datumtype); 2589 if ((0 > aiv || aiv > 0xff)) 2590 Perl_ck_warner(aTHX_ packWARN(WARN_PACK), 2591 "Character in 'C' format wrapped in pack"); 2592 PUSH_BYTE(utf8, cur, (U8)(aiv & 0xff)); 2593 } 2594 break; 2595 case 'W': { 2596 char *end; 2597 U8 in_bytes = (U8)IN_BYTES; 2598 2599 end = start+SvLEN(cat)-1; 2600 if (utf8) end -= UTF8_MAXLEN-1; 2601 while (len-- > 0) { 2602 UV auv; 2603 fromstr = NEXTFROM; 2604 auv = SvUV_no_inf(fromstr, datumtype); 2605 if (in_bytes) auv = auv % 0x100; 2606 if (utf8) { 2607 W_utf8: 2608 if (cur >= end) { 2609 *cur = '\0'; 2610 SvCUR_set(cat, cur - start); 2611 2612 GROWING(0, cat, start, cur, len+UTF8_MAXLEN); 2613 end = start+SvLEN(cat)-UTF8_MAXLEN; 2614 } 2615 cur = (char *) uvchr_to_utf8_flags((U8 *) cur, auv, 0); 2616 } else { 2617 if (auv >= 0x100) { 2618 if (!SvUTF8(cat)) { 2619 *cur = '\0'; 2620 SvCUR_set(cat, cur - start); 2621 marked_upgrade(aTHX_ cat, symptr); 2622 lookahead.flags |= FLAG_DO_UTF8; 2623 lookahead.strbeg = symptr->strbeg; 2624 utf8 = 1; 2625 start = SvPVX(cat); 2626 cur = start + SvCUR(cat); 2627 end = start+SvLEN(cat)-UTF8_MAXLEN; 2628 goto W_utf8; 2629 } 2630 Perl_ck_warner(aTHX_ packWARN(WARN_PACK), 2631 "Character in 'W' format wrapped in pack"); 2632 auv &= 0xff; 2633 } 2634 if (cur >= end) { 2635 *cur = '\0'; 2636 SvCUR_set(cat, cur - start); 2637 GROWING(0, cat, start, cur, len+1); 2638 end = start+SvLEN(cat)-1; 2639 } 2640 *(U8 *) cur++ = (U8)auv; 2641 } 2642 } 2643 break; 2644 } 2645 case 'U': { 2646 char *end; 2647 2648 if (len == 0) { 2649 if (!(symptr->flags & FLAG_DO_UTF8)) { 2650 marked_upgrade(aTHX_ cat, symptr); 2651 lookahead.flags |= FLAG_DO_UTF8; 2652 lookahead.strbeg = symptr->strbeg; 2653 } 2654 utf8 = 0; 2655 goto no_change; 2656 } 2657 2658 end = start+SvLEN(cat); 2659 if (!utf8) end -= UTF8_MAXLEN; 2660 while (len-- > 0) { 2661 UV auv; 2662 fromstr = NEXTFROM; 2663 auv = SvUV_no_inf(fromstr, datumtype); 2664 if (utf8) { 2665 U8 buffer[UTF8_MAXLEN+1], *endb; 2666 endb = uvchr_to_utf8_flags(buffer, UNI_TO_NATIVE(auv), 0); 2667 if (cur+(endb-buffer)*UTF8_EXPAND >= end) { 2668 *cur = '\0'; 2669 SvCUR_set(cat, cur - start); 2670 GROWING(0, cat, start, cur, 2671 len+(endb-buffer)*UTF8_EXPAND); 2672 end = start+SvLEN(cat); 2673 } 2674 cur = my_bytes_to_utf8(buffer, endb-buffer, cur, 0); 2675 } else { 2676 if (cur >= end) { 2677 *cur = '\0'; 2678 SvCUR_set(cat, cur - start); 2679 GROWING(0, cat, start, cur, len+UTF8_MAXLEN); 2680 end = start+SvLEN(cat)-UTF8_MAXLEN; 2681 } 2682 cur = (char *) uvchr_to_utf8_flags((U8 *) cur, 2683 UNI_TO_NATIVE(auv), 2684 0); 2685 } 2686 } 2687 break; 2688 } 2689 /* Float and double added by gnb@melba.bby.oz.au 22/11/89 */ 2690 case 'f': 2691 while (len-- > 0) { 2692 float afloat; 2693 NV anv; 2694 fromstr = NEXTFROM; 2695 anv = SvNV(fromstr); 2696 # if (defined(VMS) && !defined(_IEEE_FP)) || defined(DOUBLE_IS_VAX_FLOAT) 2697 /* IEEE fp overflow shenanigans are unavailable on VAX and optional 2698 * on Alpha; fake it if we don't have them. 2699 */ 2700 if (anv > FLT_MAX) 2701 afloat = FLT_MAX; 2702 else if (anv < -FLT_MAX) 2703 afloat = -FLT_MAX; 2704 else afloat = (float)anv; 2705 # else 2706 # if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan) 2707 if(Perl_isnan(anv)) 2708 afloat = (float)NV_NAN; 2709 else 2710 # endif 2711 # ifdef NV_INF 2712 /* a simple cast to float is undefined if outside 2713 * the range of values that can be represented */ 2714 afloat = (float)(anv > FLT_MAX ? NV_INF : 2715 anv < -FLT_MAX ? -NV_INF : anv); 2716 # endif 2717 # endif 2718 PUSH_VAR(utf8, cur, afloat, needs_swap); 2719 } 2720 break; 2721 case 'd': 2722 while (len-- > 0) { 2723 double adouble; 2724 NV anv; 2725 fromstr = NEXTFROM; 2726 anv = SvNV(fromstr); 2727 # if (defined(VMS) && !defined(_IEEE_FP)) || defined(DOUBLE_IS_VAX_FLOAT) 2728 /* IEEE fp overflow shenanigans are unavailable on VAX and optional 2729 * on Alpha; fake it if we don't have them. 2730 */ 2731 if (anv > DBL_MAX) 2732 adouble = DBL_MAX; 2733 else if (anv < -DBL_MAX) 2734 adouble = -DBL_MAX; 2735 else adouble = (double)anv; 2736 # else 2737 adouble = (double)anv; 2738 # endif 2739 PUSH_VAR(utf8, cur, adouble, needs_swap); 2740 } 2741 break; 2742 case 'F': { 2743 NV_bytes anv; 2744 Zero(&anv, 1, NV); /* can be long double with unused bits */ 2745 while (len-- > 0) { 2746 fromstr = NEXTFROM; 2747 #ifdef __GNUC__ 2748 /* to work round a gcc/x86 bug; don't use SvNV */ 2749 anv.nv = sv_2nv(fromstr); 2750 # if defined(LONGDOUBLE_X86_80_BIT) && defined(USE_LONG_DOUBLE) \ 2751 && LONG_DOUBLESIZE > 10 2752 /* GCC sometimes overwrites the padding in the 2753 assignment above */ 2754 Zero(anv.bytes+10, sizeof(anv.bytes) - 10, U8); 2755 # endif 2756 #else 2757 anv.nv = SvNV(fromstr); 2758 #endif 2759 PUSH_BYTES(utf8, cur, anv.bytes, sizeof(anv.bytes), needs_swap); 2760 } 2761 break; 2762 } 2763 #if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE) 2764 case 'D': { 2765 ld_bytes aldouble; 2766 /* long doubles can have unused bits, which may be nonzero */ 2767 Zero(&aldouble, 1, long double); 2768 while (len-- > 0) { 2769 fromstr = NEXTFROM; 2770 # ifdef __GNUC__ 2771 /* to work round a gcc/x86 bug; don't use SvNV */ 2772 aldouble.ld = (long double)sv_2nv(fromstr); 2773 # if defined(LONGDOUBLE_X86_80_BIT) && LONG_DOUBLESIZE > 10 2774 /* GCC sometimes overwrites the padding in the 2775 assignment above */ 2776 Zero(aldouble.bytes+10, sizeof(aldouble.bytes) - 10, U8); 2777 # endif 2778 # else 2779 aldouble.ld = (long double)SvNV(fromstr); 2780 # endif 2781 PUSH_BYTES(utf8, cur, aldouble.bytes, sizeof(aldouble.bytes), 2782 needs_swap); 2783 } 2784 break; 2785 } 2786 #endif 2787 case 'n' | TYPE_IS_SHRIEKING: 2788 case 'n': 2789 while (len-- > 0) { 2790 I16 ai16; 2791 fromstr = NEXTFROM; 2792 ai16 = (I16)SvIV_no_inf(fromstr, datumtype); 2793 ai16 = PerlSock_htons(ai16); 2794 PUSH16(utf8, cur, &ai16, FALSE); 2795 } 2796 break; 2797 case 'v' | TYPE_IS_SHRIEKING: 2798 case 'v': 2799 while (len-- > 0) { 2800 I16 ai16; 2801 fromstr = NEXTFROM; 2802 ai16 = (I16)SvIV_no_inf(fromstr, datumtype); 2803 ai16 = htovs(ai16); 2804 PUSH16(utf8, cur, &ai16, FALSE); 2805 } 2806 break; 2807 case 'S' | TYPE_IS_SHRIEKING: 2808 #if SHORTSIZE != SIZE16 2809 while (len-- > 0) { 2810 unsigned short aushort; 2811 fromstr = NEXTFROM; 2812 aushort = SvUV_no_inf(fromstr, datumtype); 2813 PUSH_VAR(utf8, cur, aushort, needs_swap); 2814 } 2815 break; 2816 #else 2817 /* FALLTHROUGH */ 2818 #endif 2819 case 'S': 2820 while (len-- > 0) { 2821 U16 au16; 2822 fromstr = NEXTFROM; 2823 au16 = (U16)SvUV_no_inf(fromstr, datumtype); 2824 PUSH16(utf8, cur, &au16, needs_swap); 2825 } 2826 break; 2827 case 's' | TYPE_IS_SHRIEKING: 2828 #if SHORTSIZE != SIZE16 2829 while (len-- > 0) { 2830 short ashort; 2831 fromstr = NEXTFROM; 2832 ashort = SvIV_no_inf(fromstr, datumtype); 2833 PUSH_VAR(utf8, cur, ashort, needs_swap); 2834 } 2835 break; 2836 #else 2837 /* FALLTHROUGH */ 2838 #endif 2839 case 's': 2840 while (len-- > 0) { 2841 I16 ai16; 2842 fromstr = NEXTFROM; 2843 ai16 = (I16)SvIV_no_inf(fromstr, datumtype); 2844 PUSH16(utf8, cur, &ai16, needs_swap); 2845 } 2846 break; 2847 case 'I': 2848 case 'I' | TYPE_IS_SHRIEKING: 2849 while (len-- > 0) { 2850 unsigned int auint; 2851 fromstr = NEXTFROM; 2852 auint = SvUV_no_inf(fromstr, datumtype); 2853 PUSH_VAR(utf8, cur, auint, needs_swap); 2854 } 2855 break; 2856 case 'j': 2857 while (len-- > 0) { 2858 IV aiv; 2859 fromstr = NEXTFROM; 2860 aiv = SvIV_no_inf(fromstr, datumtype); 2861 PUSH_VAR(utf8, cur, aiv, needs_swap); 2862 } 2863 break; 2864 case 'J': 2865 while (len-- > 0) { 2866 UV auv; 2867 fromstr = NEXTFROM; 2868 auv = SvUV_no_inf(fromstr, datumtype); 2869 PUSH_VAR(utf8, cur, auv, needs_swap); 2870 } 2871 break; 2872 case 'w': 2873 while (len-- > 0) { 2874 NV anv; 2875 fromstr = NEXTFROM; 2876 S_sv_check_infnan(aTHX_ fromstr, datumtype); 2877 anv = SvNV_nomg(fromstr); 2878 2879 if (anv < 0) { 2880 *cur = '\0'; 2881 SvCUR_set(cat, cur - start); 2882 Perl_croak(aTHX_ "Cannot compress negative numbers in pack"); 2883 } 2884 2885 /* 0xFFFFFFFFFFFFFFFF may cast to 18446744073709551616.0, 2886 which is == UV_MAX_P1. IOK is fine (instead of UV_only), as 2887 any negative IVs will have already been got by the croak() 2888 above. IOK is untrue for fractions, so we test them 2889 against UV_MAX_P1. */ 2890 if (SvIOK(fromstr) || anv < UV_MAX_P1) { 2891 char buf[(sizeof(UV)*CHAR_BIT)/7+1]; 2892 char *in = buf + sizeof(buf); 2893 UV auv = SvUV_nomg(fromstr); 2894 2895 do { 2896 *--in = (char)((auv & 0x7f) | 0x80); 2897 auv >>= 7; 2898 } while (auv); 2899 buf[sizeof(buf) - 1] &= 0x7f; /* clear continue bit */ 2900 PUSH_GROWING_BYTES(utf8, cat, start, cur, 2901 in, (buf + sizeof(buf)) - in); 2902 } else if (SvPOKp(fromstr)) 2903 goto w_string; 2904 else if (SvNOKp(fromstr)) { 2905 /* 10**NV_MAX_10_EXP is the largest power of 10 2906 so 10**(NV_MAX_10_EXP+1) is definitely unrepresentable 2907 given 10**(NV_MAX_10_EXP+1) == 128 ** x solve for x: 2908 x = (NV_MAX_10_EXP+1) * log (10) / log (128) 2909 And with that many bytes only Inf can overflow. 2910 Some C compilers are strict about integral constant 2911 expressions so we conservatively divide by a slightly 2912 smaller integer instead of multiplying by the exact 2913 floating-point value. 2914 */ 2915 #ifdef NV_MAX_10_EXP 2916 /* char buf[1 + (int)((NV_MAX_10_EXP + 1) * 0.47456)]; -- invalid C */ 2917 char buf[1 + (int)((NV_MAX_10_EXP + 1) / 2)]; /* valid C */ 2918 #else 2919 /* char buf[1 + (int)((308 + 1) * 0.47456)]; -- invalid C */ 2920 char buf[1 + (int)((308 + 1) / 2)]; /* valid C */ 2921 #endif 2922 char *in = buf + sizeof(buf); 2923 2924 anv = Perl_floor(anv); 2925 do { 2926 const NV next = Perl_floor(anv / 128); 2927 if (in <= buf) /* this cannot happen ;-) */ 2928 Perl_croak(aTHX_ "Cannot compress integer in pack"); 2929 *--in = (unsigned char)(anv - (next * 128)) | 0x80; 2930 anv = next; 2931 } while (anv > 0); 2932 buf[sizeof(buf) - 1] &= 0x7f; /* clear continue bit */ 2933 PUSH_GROWING_BYTES(utf8, cat, start, cur, 2934 in, (buf + sizeof(buf)) - in); 2935 } else { 2936 const char *from; 2937 char *result, *in; 2938 SV *norm; 2939 STRLEN len; 2940 bool done; 2941 2942 w_string: 2943 /* Copy string and check for compliance */ 2944 from = SvPV_nomg_const(fromstr, len); 2945 if ((norm = is_an_int(from, len)) == NULL) 2946 Perl_croak(aTHX_ "Can only compress unsigned integers in pack"); 2947 2948 Newx(result, len, char); 2949 in = result + len; 2950 done = FALSE; 2951 while (!done) *--in = div128(norm, &done) | 0x80; 2952 result[len - 1] &= 0x7F; /* clear continue bit */ 2953 PUSH_GROWING_BYTES(utf8, cat, start, cur, 2954 in, (result + len) - in); 2955 Safefree(result); 2956 SvREFCNT_dec(norm); /* free norm */ 2957 } 2958 } 2959 break; 2960 case 'i': 2961 case 'i' | TYPE_IS_SHRIEKING: 2962 while (len-- > 0) { 2963 int aint; 2964 fromstr = NEXTFROM; 2965 aint = SvIV_no_inf(fromstr, datumtype); 2966 PUSH_VAR(utf8, cur, aint, needs_swap); 2967 } 2968 break; 2969 case 'N' | TYPE_IS_SHRIEKING: 2970 case 'N': 2971 while (len-- > 0) { 2972 U32 au32; 2973 fromstr = NEXTFROM; 2974 au32 = SvUV_no_inf(fromstr, datumtype); 2975 au32 = PerlSock_htonl(au32); 2976 PUSH32(utf8, cur, &au32, FALSE); 2977 } 2978 break; 2979 case 'V' | TYPE_IS_SHRIEKING: 2980 case 'V': 2981 while (len-- > 0) { 2982 U32 au32; 2983 fromstr = NEXTFROM; 2984 au32 = SvUV_no_inf(fromstr, datumtype); 2985 au32 = htovl(au32); 2986 PUSH32(utf8, cur, &au32, FALSE); 2987 } 2988 break; 2989 case 'L' | TYPE_IS_SHRIEKING: 2990 #if LONGSIZE != SIZE32 2991 while (len-- > 0) { 2992 unsigned long aulong; 2993 fromstr = NEXTFROM; 2994 aulong = SvUV_no_inf(fromstr, datumtype); 2995 PUSH_VAR(utf8, cur, aulong, needs_swap); 2996 } 2997 break; 2998 #else 2999 /* Fall though! */ 3000 #endif 3001 case 'L': 3002 while (len-- > 0) { 3003 U32 au32; 3004 fromstr = NEXTFROM; 3005 au32 = SvUV_no_inf(fromstr, datumtype); 3006 PUSH32(utf8, cur, &au32, needs_swap); 3007 } 3008 break; 3009 case 'l' | TYPE_IS_SHRIEKING: 3010 #if LONGSIZE != SIZE32 3011 while (len-- > 0) { 3012 long along; 3013 fromstr = NEXTFROM; 3014 along = SvIV_no_inf(fromstr, datumtype); 3015 PUSH_VAR(utf8, cur, along, needs_swap); 3016 } 3017 break; 3018 #else 3019 /* Fall though! */ 3020 #endif 3021 case 'l': 3022 while (len-- > 0) { 3023 I32 ai32; 3024 fromstr = NEXTFROM; 3025 ai32 = SvIV_no_inf(fromstr, datumtype); 3026 PUSH32(utf8, cur, &ai32, needs_swap); 3027 } 3028 break; 3029 #if defined(HAS_QUAD) && IVSIZE >= 8 3030 case 'Q': 3031 while (len-- > 0) { 3032 Uquad_t auquad; 3033 fromstr = NEXTFROM; 3034 auquad = (Uquad_t) SvUV_no_inf(fromstr, datumtype); 3035 PUSH_VAR(utf8, cur, auquad, needs_swap); 3036 } 3037 break; 3038 case 'q': 3039 while (len-- > 0) { 3040 Quad_t aquad; 3041 fromstr = NEXTFROM; 3042 aquad = (Quad_t)SvIV_no_inf(fromstr, datumtype); 3043 PUSH_VAR(utf8, cur, aquad, needs_swap); 3044 } 3045 break; 3046 #endif 3047 case 'P': 3048 len = 1; /* assume SV is correct length */ 3049 GROWING(utf8, cat, start, cur, sizeof(char *)); 3050 /* FALLTHROUGH */ 3051 case 'p': 3052 while (len-- > 0) { 3053 const char *aptr; 3054 3055 fromstr = NEXTFROM; 3056 SvGETMAGIC(fromstr); 3057 if (!SvOK(fromstr)) aptr = NULL; 3058 else { 3059 /* XXX better yet, could spirit away the string to 3060 * a safe spot and hang on to it until the result 3061 * of pack() (and all copies of the result) are 3062 * gone. 3063 */ 3064 if (((SvTEMP(fromstr) && SvREFCNT(fromstr) == 1) 3065 || (SvPADTMP(fromstr) && 3066 !SvREADONLY(fromstr)))) { 3067 Perl_ck_warner(aTHX_ packWARN(WARN_PACK), 3068 "Attempt to pack pointer to temporary value"); 3069 } 3070 if (SvPOK(fromstr) || SvNIOK(fromstr)) 3071 aptr = SvPV_nomg_const_nolen(fromstr); 3072 else 3073 aptr = SvPV_force_flags_nolen(fromstr, 0); 3074 } 3075 PUSH_VAR(utf8, cur, aptr, needs_swap); 3076 } 3077 break; 3078 case 'u': { 3079 const char *aptr, *aend; 3080 bool from_utf8; 3081 3082 fromstr = NEXTFROM; 3083 if (len <= 2) len = 45; 3084 else len = len / 3 * 3; 3085 if (len >= 64) { 3086 Perl_ck_warner(aTHX_ packWARN(WARN_PACK), 3087 "Field too wide in 'u' format in pack"); 3088 len = 63; 3089 } 3090 aptr = SvPV_const(fromstr, fromlen); 3091 from_utf8 = DO_UTF8(fromstr); 3092 if (from_utf8) { 3093 aend = aptr + fromlen; 3094 fromlen = sv_len_utf8_nomg(fromstr); 3095 } else aend = NULL; /* Unused, but keep compilers happy */ 3096 GROWING(utf8, cat, start, cur, (fromlen+2) / 3 * 4 + (fromlen+len-1)/len * 2); 3097 while (fromlen > 0) { 3098 U8 *end; 3099 SSize_t todo; 3100 U8 hunk[1+63/3*4+1]; 3101 3102 if ((SSize_t)fromlen > len) 3103 todo = len; 3104 else 3105 todo = fromlen; 3106 if (from_utf8) { 3107 char buffer[64]; 3108 if (!S_utf8_to_bytes(aTHX_ &aptr, aend, buffer, todo, 3109 'u' | TYPE_IS_PACK)) { 3110 *cur = '\0'; 3111 SvCUR_set(cat, cur - start); 3112 Perl_croak(aTHX_ "panic: string is shorter than advertised, " 3113 "aptr=%p, aend=%p, buffer=%p, todo=%zd", 3114 aptr, aend, buffer, todo); 3115 } 3116 end = doencodes(hunk, (const U8 *)buffer, todo); 3117 } else { 3118 end = doencodes(hunk, (const U8 *)aptr, todo); 3119 aptr += todo; 3120 } 3121 PUSH_BYTES(utf8, cur, hunk, end-hunk, 0); 3122 fromlen -= todo; 3123 } 3124 break; 3125 } 3126 } 3127 *cur = '\0'; 3128 SvCUR_set(cat, cur - start); 3129 no_change: 3130 *symptr = lookahead; 3131 } 3132 return beglist; 3133 } 3134 #undef NEXTFROM 3135 3136 3137 PP(pp_pack) 3138 { 3139 dSP; dMARK; dORIGMARK; dTARGET; 3140 SV *cat = TARG; 3141 STRLEN fromlen; 3142 SV *pat_sv = *++MARK; 3143 const char *pat = SvPV_const(pat_sv, fromlen); 3144 const char *patend = pat + fromlen; 3145 3146 MARK++; 3147 SvPVCLEAR(cat); 3148 SvUTF8_off(cat); 3149 3150 packlist(cat, pat, patend, MARK, SP + 1); 3151 3152 if (SvUTF8(cat)) { 3153 STRLEN result_len; 3154 const char * result = SvPV_nomg(cat, result_len); 3155 const U8 * error_pos; 3156 3157 if (! is_utf8_string_loc((U8 *) result, result_len, &error_pos)) { 3158 _force_out_malformed_utf8_message(error_pos, 3159 (U8 *) result + result_len, 3160 0, /* no flags */ 3161 1 /* Die */ 3162 ); 3163 NOT_REACHED; /* NOTREACHED */ 3164 } 3165 } 3166 3167 SvSETMAGIC(cat); 3168 SP = ORIGMARK; 3169 PUSHs(cat); 3170 RETURN; 3171 } 3172 3173 /* 3174 * ex: set ts=8 sts=4 sw=4 et: 3175 */ 3176