1 /* pp_pack.c 2 * 3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others 5 * 6 * You may distribute under the terms of either the GNU General Public 7 * License or the Artistic License, as specified in the README file. 8 * 9 */ 10 11 /* 12 * He still hopefully carried some of his gear in his pack: a small tinder-box, 13 * two small shallow pans, the smaller fitting into the larger; inside them a 14 * wooden spoon, a short two-pronged fork and some skewers were stowed; and 15 * hidden at the bottom of the pack in a flat wooden box a dwindling treasure, 16 * some salt. 17 * 18 * [p.653 of _The Lord of the Rings_, IV/iv: "Of Herbs and Stewed Rabbit"] 19 */ 20 21 /* This file contains pp ("push/pop") functions that 22 * execute the opcodes that make up a perl program. A typical pp function 23 * expects to find its arguments on the stack, and usually pushes its 24 * results onto the stack, hence the 'pp' terminology. Each OP structure 25 * contains a pointer to the relevant pp_foo() function. 26 * 27 * This particular file just contains pp_pack() and pp_unpack(). See the 28 * other pp*.c files for the rest of the pp_ functions. 29 */ 30 31 #include "EXTERN.h" 32 #define PERL_IN_PP_PACK_C 33 #include "perl.h" 34 35 /* Types used by pack/unpack */ 36 typedef enum { 37 e_no_len, /* no length */ 38 e_number, /* number, [] */ 39 e_star /* asterisk */ 40 } howlen_t; 41 42 typedef struct tempsym { 43 const char* patptr; /* current template char */ 44 const char* patend; /* one after last char */ 45 const char* grpbeg; /* 1st char of ()-group */ 46 const char* grpend; /* end of ()-group */ 47 I32 code; /* template code (!<>) */ 48 U32 flags; /* /=4, comma=2, pack=1 */ 49 /* and group modifiers */ 50 SSize_t length; /* length/repeat count */ 51 howlen_t howlen; /* how length is given */ 52 int level; /* () nesting level */ 53 STRLEN strbeg; /* offset of group start */ 54 struct tempsym *previous; /* previous group */ 55 } tempsym_t; 56 57 #define TEMPSYM_INIT(symptr, p, e, f) \ 58 STMT_START { \ 59 (symptr)->patptr = (p); \ 60 (symptr)->patend = (e); \ 61 (symptr)->grpbeg = NULL; \ 62 (symptr)->grpend = NULL; \ 63 (symptr)->grpend = NULL; \ 64 (symptr)->code = 0; \ 65 (symptr)->length = 0; \ 66 (symptr)->howlen = e_no_len; \ 67 (symptr)->level = 0; \ 68 (symptr)->flags = (f); \ 69 (symptr)->strbeg = 0; \ 70 (symptr)->previous = NULL; \ 71 } STMT_END 72 73 typedef union { 74 NV nv; 75 U8 bytes[sizeof(NV)]; 76 } NV_bytes; 77 78 #if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE) 79 typedef union { 80 long double ld; 81 U8 bytes[sizeof(long double)]; 82 } ld_bytes; 83 #endif 84 85 #ifndef CHAR_BIT 86 # define CHAR_BIT 8 87 #endif 88 /* Maximum number of bytes to which a byte can grow due to upgrade */ 89 #define UTF8_EXPAND 2 90 91 /* 92 * Offset for integer pack/unpack. 93 * 94 * On architectures where I16 and I32 aren't really 16 and 32 bits, 95 * which for now are all Crays, pack and unpack have to play games. 96 */ 97 98 /* 99 * These values are required for portability of pack() output. 100 * If they're not right on your machine, then pack() and unpack() 101 * wouldn't work right anyway; you'll need to apply the Cray hack. 102 * (I'd like to check them with #if, but you can't use sizeof() in 103 * the preprocessor.) --??? 104 */ 105 /* 106 The appropriate SHORTSIZE, INTSIZE, LONGSIZE, and LONGLONGSIZE 107 defines are now in config.h. --Andy Dougherty April 1998 108 */ 109 #define SIZE16 2 110 #define SIZE32 4 111 112 /* CROSSCOMPILE and MULTIARCH are going to affect pp_pack() and pp_unpack(). 113 --jhi Feb 1999 */ 114 115 #if U16SIZE <= SIZE16 && U32SIZE <= SIZE32 116 # define OFF16(p) ((char *) (p)) 117 # define OFF32(p) ((char *) (p)) 118 #elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 /* little-endian */ 119 # define OFF16(p) ((char*)(p)) 120 # define OFF32(p) ((char*)(p)) 121 #elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 /* big-endian */ 122 # define OFF16(p) ((char*)(p) + (sizeof(U16) - SIZE16)) 123 # define OFF32(p) ((char*)(p) + (sizeof(U32) - SIZE32)) 124 #else 125 # error "bad cray byte order" 126 #endif 127 128 #define PUSH16(utf8, cur, p, needs_swap) \ 129 PUSH_BYTES(utf8, cur, OFF16(p), SIZE16, needs_swap) 130 #define PUSH32(utf8, cur, p, needs_swap) \ 131 PUSH_BYTES(utf8, cur, OFF32(p), SIZE32, needs_swap) 132 133 #if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321 /* big-endian */ 134 # define NEEDS_SWAP(d) (TYPE_ENDIANNESS(d) == TYPE_IS_LITTLE_ENDIAN) 135 #elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 /* little-endian */ 136 # define NEEDS_SWAP(d) (TYPE_ENDIANNESS(d) == TYPE_IS_BIG_ENDIAN) 137 #else 138 # error "Unsupported byteorder" 139 /* Need to add code here to re-instate mixed endian support. 140 NEEDS_SWAP would need to hold a flag indicating which action to 141 take, and S_reverse_copy and the code in S_utf8_to_bytes would need 142 logic adding to deal with any mixed-endian transformations needed. 143 */ 144 #endif 145 146 /* Only to be used inside a loop (see the break) */ 147 #define SHIFT_BYTES(utf8, s, strend, buf, len, datumtype, needs_swap) \ 148 STMT_START { \ 149 if (UNLIKELY(utf8)) { \ 150 if (!S_utf8_to_bytes(aTHX_ &s, strend, \ 151 (char *) (buf), len, datumtype)) break; \ 152 } else { \ 153 if (UNLIKELY(needs_swap)) \ 154 S_reverse_copy(s, (char *) (buf), len); \ 155 else \ 156 Copy(s, (char *) (buf), len, char); \ 157 s += len; \ 158 } \ 159 } STMT_END 160 161 #define SHIFT16(utf8, s, strend, p, datumtype, needs_swap) \ 162 SHIFT_BYTES(utf8, s, strend, OFF16(p), SIZE16, datumtype, needs_swap) 163 164 #define SHIFT32(utf8, s, strend, p, datumtype, needs_swap) \ 165 SHIFT_BYTES(utf8, s, strend, OFF32(p), SIZE32, datumtype, needs_swap) 166 167 #define SHIFT_VAR(utf8, s, strend, var, datumtype, needs_swap) \ 168 SHIFT_BYTES(utf8, s, strend, &(var), sizeof(var), datumtype, needs_swap) 169 170 #define PUSH_VAR(utf8, aptr, var, needs_swap) \ 171 PUSH_BYTES(utf8, aptr, &(var), sizeof(var), needs_swap) 172 173 /* Avoid stack overflow due to pathological templates. 100 should be plenty. */ 174 #define MAX_SUB_TEMPLATE_LEVEL 100 175 176 /* flags (note that type modifiers can also be used as flags!) */ 177 #define FLAG_WAS_UTF8 0x40 178 #define FLAG_PARSE_UTF8 0x20 /* Parse as utf8 */ 179 #define FLAG_UNPACK_ONLY_ONE 0x10 180 #define FLAG_DO_UTF8 0x08 /* The underlying string is utf8 */ 181 #define FLAG_SLASH 0x04 182 #define FLAG_COMMA 0x02 183 #define FLAG_PACK 0x01 184 185 STATIC SV * 186 S_mul128(pTHX_ SV *sv, U8 m) 187 { 188 STRLEN len; 189 char *s = SvPV(sv, len); 190 char *t; 191 192 PERL_ARGS_ASSERT_MUL128; 193 194 if (! memBEGINs(s, len, "0000")) { /* need to grow sv */ 195 SV * const tmpNew = newSVpvs("0000000000"); 196 197 sv_catsv(tmpNew, sv); 198 SvREFCNT_dec(sv); /* free old sv */ 199 sv = tmpNew; 200 s = SvPV(sv, len); 201 } 202 t = s + len - 1; 203 while (!*t) /* trailing '\0'? */ 204 t--; 205 while (t > s) { 206 const U32 i = ((*t - '0') << 7) + m; 207 *(t--) = '0' + (char)(i % 10); 208 m = (char)(i / 10); 209 } 210 return (sv); 211 } 212 213 /* Explosives and implosives. */ 214 215 #define ISUUCHAR(ch) inRANGE(NATIVE_TO_LATIN1(ch), \ 216 NATIVE_TO_LATIN1(' '), \ 217 NATIVE_TO_LATIN1('a') - 1) 218 219 /* type modifiers */ 220 #define TYPE_IS_SHRIEKING 0x100 221 #define TYPE_IS_BIG_ENDIAN 0x200 222 #define TYPE_IS_LITTLE_ENDIAN 0x400 223 #define TYPE_IS_PACK 0x800 224 #define TYPE_ENDIANNESS_MASK (TYPE_IS_BIG_ENDIAN|TYPE_IS_LITTLE_ENDIAN) 225 #define TYPE_MODIFIERS(t) ((t) & ~0xFF) 226 #define TYPE_NO_MODIFIERS(t) ((t) & 0xFF) 227 228 # define TYPE_ENDIANNESS(t) ((t) & TYPE_ENDIANNESS_MASK) 229 # define TYPE_NO_ENDIANNESS(t) ((t) & ~TYPE_ENDIANNESS_MASK) 230 231 # define ENDIANNESS_ALLOWED_TYPES "sSiIlLqQjJfFdDpP(" 232 233 #define PACK_SIZE_CANNOT_CSUM 0x80 234 #define PACK_SIZE_UNPREDICTABLE 0x40 /* Not a fixed size element */ 235 #define PACK_SIZE_MASK 0x3F 236 237 #include "packsizetables.inc" 238 239 static void 240 S_reverse_copy(const char *src, char *dest, STRLEN len) 241 { 242 dest += len; 243 while (len--) 244 *--dest = *src++; 245 } 246 247 STATIC U8 248 utf8_to_byte(pTHX_ const char **s, const char *end, I32 datumtype) 249 { 250 STRLEN retlen; 251 UV val; 252 253 if (*s >= end) { 254 goto croak; 255 } 256 val = utf8n_to_uvchr((U8 *) *s, end-*s, &retlen, 257 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); 258 if (retlen == (STRLEN) -1) 259 croak: 260 Perl_croak(aTHX_ "Malformed UTF-8 string in '%c' format in unpack", 261 (int) TYPE_NO_MODIFIERS(datumtype)); 262 if (val >= 0x100) { 263 Perl_ck_warner(aTHX_ packWARN(WARN_UNPACK), 264 "Character in '%c' format wrapped in unpack", 265 (int) TYPE_NO_MODIFIERS(datumtype)); 266 val &= 0xff; 267 } 268 *s += retlen; 269 return (U8)val; 270 } 271 272 #define SHIFT_BYTE(utf8, s, strend, datumtype) ((utf8) ? \ 273 utf8_to_byte(aTHX_ &(s), (strend), (datumtype)) : \ 274 *(U8 *)(s)++) 275 276 STATIC bool 277 S_utf8_to_bytes(pTHX_ const char **s, const char *end, const char *buf, SSize_t buf_len, I32 datumtype) 278 { 279 UV val; 280 STRLEN retlen; 281 const char *from = *s; 282 int bad = 0; 283 const U32 flags = ckWARN(WARN_UTF8) ? 284 UTF8_CHECK_ONLY : (UTF8_CHECK_ONLY | UTF8_ALLOW_ANY); 285 const bool needs_swap = NEEDS_SWAP(datumtype); 286 287 if (UNLIKELY(needs_swap)) 288 buf += buf_len; 289 290 for (;buf_len > 0; buf_len--) { 291 if (from >= end) return FALSE; 292 val = utf8n_to_uvchr((U8 *) from, end-from, &retlen, flags); 293 if (retlen == (STRLEN) -1) { 294 from += UTF8_SAFE_SKIP(from, end); 295 bad |= 1; 296 } else from += retlen; 297 if (val >= 0x100) { 298 bad |= 2; 299 val &= 0xff; 300 } 301 if (UNLIKELY(needs_swap)) 302 *(U8 *)--buf = (U8)val; 303 else 304 *(U8 *)buf++ = (U8)val; 305 } 306 /* We have enough characters for the buffer. Did we have problems ? */ 307 if (bad) { 308 if (bad & 1) { 309 /* Rewalk the string fragment while warning */ 310 const char *ptr; 311 const U32 flags = ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY; 312 for (ptr = *s; ptr < from; ptr += UTF8SKIP(ptr)) { 313 if (ptr >= end) break; 314 utf8n_to_uvchr((U8 *) ptr, end-ptr, &retlen, flags); 315 } 316 if (from > end) from = end; 317 } 318 if ((bad & 2)) 319 Perl_ck_warner(aTHX_ packWARN(datumtype & TYPE_IS_PACK ? 320 WARN_PACK : WARN_UNPACK), 321 "Character(s) in '%c' format wrapped in %s", 322 (int) TYPE_NO_MODIFIERS(datumtype), 323 datumtype & TYPE_IS_PACK ? "pack" : "unpack"); 324 } 325 *s = from; 326 return TRUE; 327 } 328 329 STATIC char * 330 S_my_bytes_to_utf8(const U8 *start, STRLEN len, char *dest, const bool needs_swap) { 331 PERL_ARGS_ASSERT_MY_BYTES_TO_UTF8; 332 333 if (UNLIKELY(needs_swap)) { 334 const U8 *p = start + len; 335 while (p-- > start) { 336 append_utf8_from_native_byte(*p, (U8 **) & dest); 337 } 338 } else { 339 const U8 * const end = start + len; 340 while (start < end) { 341 append_utf8_from_native_byte(*start, (U8 **) & dest); 342 start++; 343 } 344 } 345 return dest; 346 } 347 348 #define PUSH_BYTES(utf8, cur, buf, len, needs_swap) \ 349 STMT_START { \ 350 if (UNLIKELY(utf8)) \ 351 (cur) = my_bytes_to_utf8((U8 *) buf, len, (cur), needs_swap); \ 352 else { \ 353 if (UNLIKELY(needs_swap)) \ 354 S_reverse_copy((char *)(buf), cur, len); \ 355 else \ 356 Copy(buf, cur, len, char); \ 357 (cur) += (len); \ 358 } \ 359 } STMT_END 360 361 #define SAFE_UTF8_EXPAND(var) \ 362 STMT_START { \ 363 if ((var) > SSize_t_MAX / UTF8_EXPAND) \ 364 Perl_croak(aTHX_ "%s", "Out of memory during pack()"); \ 365 (var) = (var) * UTF8_EXPAND; \ 366 } STMT_END 367 368 #define GROWING2(utf8, cat, start, cur, item_size, item_count) \ 369 STMT_START { \ 370 if (SSize_t_MAX / (item_size) < (item_count)) \ 371 Perl_croak(aTHX_ "%s", "Out of memory during pack()"); \ 372 GROWING((utf8), (cat), (start), (cur), (item_size) * (item_count)); \ 373 } STMT_END 374 375 #define GROWING(utf8, cat, start, cur, in_len) \ 376 STMT_START { \ 377 STRLEN glen = (in_len); \ 378 STRLEN catcur = (STRLEN)((cur) - (start)); \ 379 if (utf8) SAFE_UTF8_EXPAND(glen); \ 380 if (SSize_t_MAX - glen < catcur) \ 381 Perl_croak(aTHX_ "%s", "Out of memory during pack()"); \ 382 if (catcur + glen >= SvLEN(cat)) { \ 383 (start) = sv_exp_grow(cat, glen); \ 384 (cur) = (start) + SvCUR(cat); \ 385 } \ 386 } STMT_END 387 388 #define PUSH_GROWING_BYTES(utf8, cat, start, cur, buf, in_len) \ 389 STMT_START { \ 390 const STRLEN glen = (in_len); \ 391 STRLEN gl = glen; \ 392 if (utf8) SAFE_UTF8_EXPAND(gl); \ 393 if ((cur) + gl >= (start) + SvLEN(cat)) { \ 394 *cur = '\0'; \ 395 SvCUR_set((cat), (cur) - (start)); \ 396 (start) = sv_exp_grow(cat, gl); \ 397 (cur) = (start) + SvCUR(cat); \ 398 } \ 399 PUSH_BYTES(utf8, cur, buf, glen, 0); \ 400 } STMT_END 401 402 #define PUSH_BYTE(utf8, s, byte) \ 403 STMT_START { \ 404 if (utf8) { \ 405 const U8 au8 = (byte); \ 406 (s) = my_bytes_to_utf8(&au8, 1, (s), 0);\ 407 } else *(U8 *)(s)++ = (byte); \ 408 } STMT_END 409 410 /* Only to be used inside a loop (see the break) */ 411 #define NEXT_UNI_VAL(val, cur, str, end, utf8_flags) \ 412 STMT_START { \ 413 STRLEN retlen; \ 414 if (str >= end) break; \ 415 val = utf8n_to_uvchr((U8 *) str, end-str, &retlen, utf8_flags); \ 416 if (retlen == (STRLEN) -1) { \ 417 *cur = '\0'; \ 418 Perl_croak(aTHX_ "Malformed UTF-8 string in pack"); \ 419 } \ 420 str += retlen; \ 421 } STMT_END 422 423 static const char *_action( const tempsym_t* symptr ) 424 { 425 return (const char *)(( symptr->flags & FLAG_PACK ) ? "pack" : "unpack"); 426 } 427 428 /* Returns the sizeof() struct described by pat */ 429 STATIC SSize_t 430 S_measure_struct(pTHX_ tempsym_t* symptr) 431 { 432 SSize_t total = 0; 433 434 PERL_ARGS_ASSERT_MEASURE_STRUCT; 435 436 while (next_symbol(symptr)) { 437 SSize_t len, size; 438 439 switch (symptr->howlen) { 440 case e_star: 441 Perl_croak(aTHX_ "Within []-length '*' not allowed in %s", 442 _action( symptr ) ); 443 444 default: 445 /* e_no_len and e_number */ 446 len = symptr->length; 447 break; 448 } 449 450 size = packprops[TYPE_NO_ENDIANNESS(symptr->code)] & PACK_SIZE_MASK; 451 if (!size) { 452 SSize_t star; 453 /* endianness doesn't influence the size of a type */ 454 switch(TYPE_NO_ENDIANNESS(symptr->code)) { 455 default: 456 Perl_croak(aTHX_ "Invalid type '%c' in %s", 457 (int)TYPE_NO_MODIFIERS(symptr->code), 458 _action( symptr ) ); 459 case '.' | TYPE_IS_SHRIEKING: 460 case '@' | TYPE_IS_SHRIEKING: 461 case '@': 462 case '.': 463 case '/': 464 case 'U': /* XXXX Is it correct? */ 465 case 'w': 466 case 'u': 467 Perl_croak(aTHX_ "Within []-length '%c' not allowed in %s", 468 (int) TYPE_NO_MODIFIERS(symptr->code), 469 _action( symptr ) ); 470 case '%': 471 size = 0; 472 break; 473 case '(': 474 { 475 tempsym_t savsym = *symptr; 476 symptr->patptr = savsym.grpbeg; 477 symptr->patend = savsym.grpend; 478 /* XXXX Theoretically, we need to measure many times at 479 different positions, since the subexpression may contain 480 alignment commands, but be not of aligned length. 481 Need to detect this and croak(). */ 482 size = measure_struct(symptr); 483 *symptr = savsym; 484 break; 485 } 486 case 'X' | TYPE_IS_SHRIEKING: 487 /* XXXX Is this useful? Then need to treat MEASURE_BACKWARDS. 488 */ 489 if (!len) /* Avoid division by 0 */ 490 len = 1; 491 len = total % len; /* Assumed: the start is aligned. */ 492 /* FALLTHROUGH */ 493 case 'X': 494 size = -1; 495 if (total < len) 496 Perl_croak(aTHX_ "'X' outside of string in %s", _action( symptr ) ); 497 break; 498 case 'x' | TYPE_IS_SHRIEKING: 499 if (!len) /* Avoid division by 0 */ 500 len = 1; 501 star = total % len; /* Assumed: the start is aligned. */ 502 if (star) /* Other portable ways? */ 503 len = len - star; 504 else 505 len = 0; 506 /* FALLTHROUGH */ 507 case 'x': 508 case 'A': 509 case 'Z': 510 case 'a': 511 size = 1; 512 break; 513 case 'B': 514 case 'b': 515 len = (len + 7)/8; 516 size = 1; 517 break; 518 case 'H': 519 case 'h': 520 len = (len + 1)/2; 521 size = 1; 522 break; 523 524 case 'P': 525 len = 1; 526 size = sizeof(char*); 527 break; 528 } 529 } 530 total += len * size; 531 } 532 return total; 533 } 534 535 536 /* locate matching closing parenthesis or bracket 537 * returns char pointer to char after match, or NULL 538 */ 539 STATIC const char * 540 S_group_end(pTHX_ const char *patptr, const char *patend, char ender) 541 { 542 PERL_ARGS_ASSERT_GROUP_END; 543 544 while (patptr < patend) { 545 const char c = *patptr++; 546 547 if (isSPACE(c)) 548 continue; 549 else if (c == ender) 550 return patptr-1; 551 else if (c == '#') { 552 while (patptr < patend && *patptr != '\n') 553 patptr++; 554 continue; 555 } else if (c == '(') 556 patptr = group_end(patptr, patend, ')') + 1; 557 else if (c == '[') 558 patptr = group_end(patptr, patend, ']') + 1; 559 } 560 Perl_croak(aTHX_ "No group ending character '%c' found in template", 561 ender); 562 NOT_REACHED; /* NOTREACHED */ 563 } 564 565 566 /* Convert unsigned decimal number to binary. 567 * Expects a pointer to the first digit and address of length variable 568 * Advances char pointer to 1st non-digit char and returns number 569 */ 570 STATIC const char * 571 S_get_num(pTHX_ const char *patptr, SSize_t *lenptr ) 572 { 573 SSize_t len = *patptr++ - '0'; 574 575 PERL_ARGS_ASSERT_GET_NUM; 576 577 while (isDIGIT(*patptr)) { 578 SSize_t nlen = (len * 10) + (*patptr++ - '0'); 579 if (nlen < 0 || nlen/10 != len) 580 Perl_croak(aTHX_ "pack/unpack repeat count overflow"); 581 len = nlen; 582 } 583 *lenptr = len; 584 return patptr; 585 } 586 587 /* The marvellous template parsing routine: Using state stored in *symptr, 588 * locates next template code and count 589 */ 590 STATIC bool 591 S_next_symbol(pTHX_ tempsym_t* symptr ) 592 { 593 const char* patptr = symptr->patptr; 594 const char* const patend = symptr->patend; 595 596 PERL_ARGS_ASSERT_NEXT_SYMBOL; 597 598 symptr->flags &= ~FLAG_SLASH; 599 600 while (patptr < patend) { 601 if (isSPACE(*patptr)) 602 patptr++; 603 else if (*patptr == '#') { 604 patptr++; 605 while (patptr < patend && *patptr != '\n') 606 patptr++; 607 if (patptr < patend) 608 patptr++; 609 } else { 610 /* We should have found a template code */ 611 I32 code = *patptr++ & 0xFF; 612 U32 inherited_modifiers = 0; 613 614 if (code == ','){ /* grandfather in commas but with a warning */ 615 if (((symptr->flags & FLAG_COMMA) == 0) && ckWARN(WARN_UNPACK)){ 616 symptr->flags |= FLAG_COMMA; 617 Perl_warner(aTHX_ packWARN(WARN_UNPACK), 618 "Invalid type ',' in %s", _action( symptr ) ); 619 } 620 continue; 621 } 622 623 /* for '(', skip to ')' */ 624 if (code == '(') { 625 if( isDIGIT(*patptr) || *patptr == '*' || *patptr == '[' ) 626 Perl_croak(aTHX_ "()-group starts with a count in %s", 627 _action( symptr ) ); 628 symptr->grpbeg = patptr; 629 patptr = 1 + ( symptr->grpend = group_end(patptr, patend, ')') ); 630 if( symptr->level >= MAX_SUB_TEMPLATE_LEVEL ) 631 Perl_croak(aTHX_ "Too deeply nested ()-groups in %s", 632 _action( symptr ) ); 633 } 634 635 /* look for group modifiers to inherit */ 636 if (TYPE_ENDIANNESS(symptr->flags)) { 637 if (strchr(ENDIANNESS_ALLOWED_TYPES, TYPE_NO_MODIFIERS(code))) 638 inherited_modifiers |= TYPE_ENDIANNESS(symptr->flags); 639 } 640 641 /* look for modifiers */ 642 while (patptr < patend) { 643 const char *allowed; 644 I32 modifier; 645 switch (*patptr) { 646 case '!': 647 modifier = TYPE_IS_SHRIEKING; 648 allowed = "sSiIlLxXnNvV@."; 649 break; 650 case '>': 651 modifier = TYPE_IS_BIG_ENDIAN; 652 allowed = ENDIANNESS_ALLOWED_TYPES; 653 break; 654 case '<': 655 modifier = TYPE_IS_LITTLE_ENDIAN; 656 allowed = ENDIANNESS_ALLOWED_TYPES; 657 break; 658 default: 659 allowed = ""; 660 modifier = 0; 661 break; 662 } 663 664 if (modifier == 0) 665 break; 666 667 if (!strchr(allowed, TYPE_NO_MODIFIERS(code))) 668 Perl_croak(aTHX_ "'%c' allowed only after types %s in %s", *patptr, 669 allowed, _action( symptr ) ); 670 671 if (TYPE_ENDIANNESS(code | modifier) == TYPE_ENDIANNESS_MASK) 672 Perl_croak(aTHX_ "Can't use both '<' and '>' after type '%c' in %s", 673 (int) TYPE_NO_MODIFIERS(code), _action( symptr ) ); 674 else if (TYPE_ENDIANNESS(code | modifier | inherited_modifiers) == 675 TYPE_ENDIANNESS_MASK) 676 Perl_croak(aTHX_ "Can't use '%c' in a group with different byte-order in %s", 677 *patptr, _action( symptr ) ); 678 679 if ((code & modifier)) { 680 Perl_ck_warner(aTHX_ packWARN(WARN_UNPACK), 681 "Duplicate modifier '%c' after '%c' in %s", 682 *patptr, (int) TYPE_NO_MODIFIERS(code), 683 _action( symptr ) ); 684 } 685 686 code |= modifier; 687 patptr++; 688 } 689 690 /* inherit modifiers */ 691 code |= inherited_modifiers; 692 693 /* look for count and/or / */ 694 if (patptr < patend) { 695 if (isDIGIT(*patptr)) { 696 patptr = get_num( patptr, &symptr->length ); 697 symptr->howlen = e_number; 698 699 } else if (*patptr == '*') { 700 patptr++; 701 symptr->howlen = e_star; 702 703 } else if (*patptr == '[') { 704 const char* lenptr = ++patptr; 705 symptr->howlen = e_number; 706 patptr = group_end( patptr, patend, ']' ) + 1; 707 /* what kind of [] is it? */ 708 if (isDIGIT(*lenptr)) { 709 lenptr = get_num( lenptr, &symptr->length ); 710 if( *lenptr != ']' ) 711 Perl_croak(aTHX_ "Malformed integer in [] in %s", 712 _action( symptr ) ); 713 } else { 714 tempsym_t savsym = *symptr; 715 symptr->patend = patptr-1; 716 symptr->patptr = lenptr; 717 savsym.length = measure_struct(symptr); 718 *symptr = savsym; 719 } 720 } else { 721 symptr->howlen = e_no_len; 722 symptr->length = 1; 723 } 724 725 /* try to find / */ 726 while (patptr < patend) { 727 if (isSPACE(*patptr)) 728 patptr++; 729 else if (*patptr == '#') { 730 patptr++; 731 while (patptr < patend && *patptr != '\n') 732 patptr++; 733 if (patptr < patend) 734 patptr++; 735 } else { 736 if (*patptr == '/') { 737 symptr->flags |= FLAG_SLASH; 738 patptr++; 739 if (patptr < patend && 740 (isDIGIT(*patptr) || *patptr == '*' || *patptr == '[')) 741 Perl_croak(aTHX_ "'/' does not take a repeat count in %s", 742 _action( symptr ) ); 743 } 744 break; 745 } 746 } 747 } else { 748 /* at end - no count, no / */ 749 symptr->howlen = e_no_len; 750 symptr->length = 1; 751 } 752 753 symptr->code = code; 754 symptr->patptr = patptr; 755 return TRUE; 756 } 757 } 758 symptr->patptr = patptr; 759 return FALSE; 760 } 761 762 /* 763 There is no way to cleanly handle the case where we should process the 764 string per byte in its upgraded form while it's really in downgraded form 765 (e.g. estimates like strend-s as an upper bound for the number of 766 characters left wouldn't work). So if we foresee the need of this 767 (pattern starts with U or contains U0), we want to work on the encoded 768 version of the string. Users are advised to upgrade their pack string 769 themselves if they need to do a lot of unpacks like this on it 770 */ 771 STATIC bool 772 need_utf8(const char *pat, const char *patend) 773 { 774 bool first = TRUE; 775 776 PERL_ARGS_ASSERT_NEED_UTF8; 777 778 while (pat < patend) { 779 if (pat[0] == '#') { 780 pat++; 781 pat = (const char *) memchr(pat, '\n', patend-pat); 782 if (!pat) return FALSE; 783 } else if (pat[0] == 'U') { 784 if (first || pat[1] == '0') return TRUE; 785 } else first = FALSE; 786 pat++; 787 } 788 return FALSE; 789 } 790 791 STATIC char 792 first_symbol(const char *pat, const char *patend) { 793 PERL_ARGS_ASSERT_FIRST_SYMBOL; 794 795 while (pat < patend) { 796 if (pat[0] != '#') return pat[0]; 797 pat++; 798 pat = (const char *) memchr(pat, '\n', patend-pat); 799 if (!pat) return 0; 800 pat++; 801 } 802 return 0; 803 } 804 805 /* 806 807 =head1 Pack and Unpack 808 809 =for apidoc unpackstring 810 811 The engine implementing the C<unpack()> Perl function. 812 813 Using the template C<pat..patend>, this function unpacks the string 814 C<s..strend> into a number of mortal SVs, which it pushes onto the perl 815 argument (C<@_>) stack (so you will need to issue a C<PUTBACK> before and 816 C<SPAGAIN> after the call to this function). It returns the number of 817 pushed elements. 818 819 The C<strend> and C<patend> pointers should point to the byte following the 820 last character of each string. 821 822 Although this function returns its values on the perl argument stack, it 823 doesn't take any parameters from that stack (and thus in particular 824 there's no need to do a C<PUSHMARK> before calling it, unlike L</call_pv> for 825 example). 826 827 =cut */ 828 829 SSize_t 830 Perl_unpackstring(pTHX_ const char *pat, const char *patend, const char *s, const char *strend, U32 flags) 831 { 832 tempsym_t sym; 833 834 PERL_ARGS_ASSERT_UNPACKSTRING; 835 836 if (flags & FLAG_DO_UTF8) flags |= FLAG_WAS_UTF8; 837 else if (need_utf8(pat, patend)) { 838 /* We probably should try to avoid this in case a scalar context call 839 wouldn't get to the "U0" */ 840 STRLEN len = strend - s; 841 s = (char *) bytes_to_utf8((U8 *) s, &len); 842 SAVEFREEPV(s); 843 strend = s + len; 844 flags |= FLAG_DO_UTF8; 845 } 846 847 if (first_symbol(pat, patend) != 'U' && (flags & FLAG_DO_UTF8)) 848 flags |= FLAG_PARSE_UTF8; 849 850 TEMPSYM_INIT(&sym, pat, patend, flags); 851 852 return unpack_rec(&sym, s, s, strend, NULL ); 853 } 854 855 STATIC SSize_t 856 S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const char *strend, const char **new_s ) 857 { 858 dSP; 859 SV *sv = NULL; 860 const SSize_t start_sp_offset = SP - PL_stack_base; 861 howlen_t howlen; 862 SSize_t checksum = 0; 863 UV cuv = 0; 864 NV cdouble = 0.0; 865 const SSize_t bits_in_uv = CHAR_BIT * sizeof(cuv); 866 bool beyond = FALSE; 867 bool explicit_length; 868 const bool unpack_only_one = (symptr->flags & FLAG_UNPACK_ONLY_ONE) != 0; 869 bool utf8 = (symptr->flags & FLAG_PARSE_UTF8) ? 1 : 0; 870 871 PERL_ARGS_ASSERT_UNPACK_REC; 872 873 symptr->strbeg = s - strbeg; 874 875 while (next_symbol(symptr)) { 876 packprops_t props; 877 SSize_t len; 878 I32 datumtype = symptr->code; 879 bool needs_swap; 880 /* do first one only unless in list context 881 / is implemented by unpacking the count, then popping it from the 882 stack, so must check that we're not in the middle of a / */ 883 if ( unpack_only_one 884 && (SP - PL_stack_base == start_sp_offset + 1) 885 && (datumtype != '/') ) /* XXX can this be omitted */ 886 break; 887 888 switch (howlen = symptr->howlen) { 889 case e_star: 890 len = strend - strbeg; /* long enough */ 891 break; 892 default: 893 /* e_no_len and e_number */ 894 len = symptr->length; 895 break; 896 } 897 898 explicit_length = TRUE; 899 redo_switch: 900 beyond = s >= strend; 901 902 props = packprops[TYPE_NO_ENDIANNESS(datumtype)]; 903 if (props) { 904 /* props nonzero means we can process this letter. */ 905 const SSize_t size = props & PACK_SIZE_MASK; 906 const SSize_t howmany = (strend - s) / size; 907 if (len > howmany) 908 len = howmany; 909 910 if (!checksum || (props & PACK_SIZE_CANNOT_CSUM)) { 911 if (len && unpack_only_one) len = 1; 912 EXTEND(SP, len); 913 EXTEND_MORTAL(len); 914 } 915 } 916 917 needs_swap = NEEDS_SWAP(datumtype); 918 919 switch(TYPE_NO_ENDIANNESS(datumtype)) { 920 default: 921 Perl_croak(aTHX_ "Invalid type '%c' in unpack", (int)TYPE_NO_MODIFIERS(datumtype) ); 922 923 case '%': 924 if (howlen == e_no_len) 925 len = 16; /* len is not specified */ 926 checksum = len; 927 cuv = 0; 928 cdouble = 0; 929 continue; 930 931 case '(': 932 { 933 tempsym_t savsym = *symptr; 934 const U32 group_modifiers = TYPE_MODIFIERS(datumtype & ~symptr->flags); 935 symptr->flags |= group_modifiers; 936 symptr->patend = savsym.grpend; 937 symptr->previous = &savsym; 938 symptr->level++; 939 PUTBACK; 940 if (len && unpack_only_one) len = 1; 941 while (len--) { 942 symptr->patptr = savsym.grpbeg; 943 if (utf8) symptr->flags |= FLAG_PARSE_UTF8; 944 else symptr->flags &= ~FLAG_PARSE_UTF8; 945 unpack_rec(symptr, s, strbeg, strend, &s); 946 if (s == strend && savsym.howlen == e_star) 947 break; /* No way to continue */ 948 } 949 SPAGAIN; 950 savsym.flags = symptr->flags & ~group_modifiers; 951 *symptr = savsym; 952 break; 953 } 954 case '.' | TYPE_IS_SHRIEKING: 955 case '.': { 956 const char *from; 957 SV *sv; 958 const bool u8 = utf8 && !(datumtype & TYPE_IS_SHRIEKING); 959 if (howlen == e_star) from = strbeg; 960 else if (len <= 0) from = s; 961 else { 962 tempsym_t *group = symptr; 963 964 while (--len && group) group = group->previous; 965 from = group ? strbeg + group->strbeg : strbeg; 966 } 967 sv = from <= s ? 968 newSVuv( u8 ? (UV) utf8_length((const U8*)from, (const U8*)s) : (UV) (s-from)) : 969 newSViv(-(u8 ? (IV) utf8_length((const U8*)s, (const U8*)from) : (IV) (from-s))); 970 mXPUSHs(sv); 971 break; 972 } 973 case '@' | TYPE_IS_SHRIEKING: 974 case '@': 975 s = strbeg + symptr->strbeg; 976 if (utf8 && !(datumtype & TYPE_IS_SHRIEKING)) 977 { 978 while (len > 0) { 979 if (s >= strend) 980 Perl_croak(aTHX_ "'@' outside of string in unpack"); 981 s += UTF8SKIP(s); 982 len--; 983 } 984 if (s > strend) 985 Perl_croak(aTHX_ "'@' outside of string with malformed UTF-8 in unpack"); 986 } else { 987 if (strend-s < len) 988 Perl_croak(aTHX_ "'@' outside of string in unpack"); 989 s += len; 990 } 991 break; 992 case 'X' | TYPE_IS_SHRIEKING: 993 if (!len) /* Avoid division by 0 */ 994 len = 1; 995 if (utf8) { 996 const char *hop, *last; 997 SSize_t l = len; 998 hop = last = strbeg; 999 while (hop < s) { 1000 hop += UTF8SKIP(hop); 1001 if (--l == 0) { 1002 last = hop; 1003 l = len; 1004 } 1005 } 1006 if (last > s) 1007 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); 1008 s = last; 1009 break; 1010 } 1011 len = (s - strbeg) % len; 1012 /* FALLTHROUGH */ 1013 case 'X': 1014 if (utf8) { 1015 while (len > 0) { 1016 if (s <= strbeg) 1017 Perl_croak(aTHX_ "'X' outside of string in unpack"); 1018 while (--s, UTF8_IS_CONTINUATION(*s)) { 1019 if (s <= strbeg) 1020 Perl_croak(aTHX_ "'X' outside of string in unpack"); 1021 } 1022 len--; 1023 } 1024 } else { 1025 if (len > s - strbeg) 1026 Perl_croak(aTHX_ "'X' outside of string in unpack" ); 1027 s -= len; 1028 } 1029 break; 1030 case 'x' | TYPE_IS_SHRIEKING: { 1031 SSize_t ai32; 1032 if (!len) /* Avoid division by 0 */ 1033 len = 1; 1034 if (utf8) ai32 = utf8_length((U8 *) strbeg, (U8 *) s) % len; 1035 else ai32 = (s - strbeg) % len; 1036 if (ai32 == 0) break; 1037 len -= ai32; 1038 } 1039 /* FALLTHROUGH */ 1040 case 'x': 1041 if (utf8) { 1042 while (len>0) { 1043 if (s >= strend) 1044 Perl_croak(aTHX_ "'x' outside of string in unpack"); 1045 s += UTF8SKIP(s); 1046 len--; 1047 } 1048 } else { 1049 if (len > strend - s) 1050 Perl_croak(aTHX_ "'x' outside of string in unpack"); 1051 s += len; 1052 } 1053 break; 1054 case '/': 1055 Perl_croak(aTHX_ "'/' must follow a numeric type in unpack"); 1056 1057 case 'A': 1058 case 'Z': 1059 case 'a': 1060 if (checksum) { 1061 /* Preliminary length estimate is assumed done in 'W' */ 1062 if (len > strend - s) len = strend - s; 1063 goto W_checksum; 1064 } 1065 if (utf8) { 1066 SSize_t l; 1067 const char *hop; 1068 for (l=len, hop=s; l>0; l--, hop += UTF8SKIP(hop)) { 1069 if (hop >= strend) { 1070 if (hop > strend) 1071 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); 1072 break; 1073 } 1074 } 1075 if (hop > strend) 1076 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); 1077 len = hop - s; 1078 } else if (len > strend - s) 1079 len = strend - s; 1080 1081 if (datumtype == 'Z') { 1082 /* 'Z' strips stuff after first null */ 1083 const char *ptr, *end; 1084 end = s + len; 1085 for (ptr = s; ptr < end; ptr++) if (*ptr == 0) break; 1086 sv = newSVpvn(s, ptr-s); 1087 if (howlen == e_star) /* exact for 'Z*' */ 1088 len = ptr-s + (ptr != strend ? 1 : 0); 1089 } else if (datumtype == 'A') { 1090 /* 'A' strips both nulls and spaces */ 1091 const char *ptr; 1092 if (utf8 && (symptr->flags & FLAG_WAS_UTF8)) { 1093 for (ptr = s+len-1; ptr >= s; ptr--) { 1094 if ( *ptr != 0 1095 && !UTF8_IS_CONTINUATION(*ptr) 1096 && !isSPACE_utf8_safe(ptr, strend)) 1097 { 1098 break; 1099 } 1100 } 1101 if (ptr >= s) ptr += UTF8SKIP(ptr); 1102 else ptr++; 1103 if (ptr > s+len) 1104 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); 1105 } else { 1106 for (ptr = s+len-1; ptr >= s; ptr--) 1107 if (*ptr != 0 && !isSPACE(*ptr)) break; 1108 ptr++; 1109 } 1110 sv = newSVpvn(s, ptr-s); 1111 } else sv = newSVpvn(s, len); 1112 1113 if (utf8) { 1114 SvUTF8_on(sv); 1115 /* Undo any upgrade done due to need_utf8() */ 1116 if (!(symptr->flags & FLAG_WAS_UTF8)) 1117 sv_utf8_downgrade(sv, 0); 1118 } 1119 mXPUSHs(sv); 1120 s += len; 1121 break; 1122 case 'B': 1123 case 'b': { 1124 char *str; 1125 if (howlen == e_star || len > (strend - s) * 8) 1126 len = (strend - s) * 8; 1127 if (checksum) { 1128 if (utf8) 1129 while (len >= 8 && s < strend) { 1130 cuv += PL_bitcount[utf8_to_byte(aTHX_ &s, strend, datumtype)]; 1131 len -= 8; 1132 } 1133 else 1134 while (len >= 8) { 1135 cuv += PL_bitcount[*(U8 *)s++]; 1136 len -= 8; 1137 } 1138 if (len && s < strend) { 1139 U8 bits; 1140 bits = SHIFT_BYTE(utf8, s, strend, datumtype); 1141 if (datumtype == 'b') 1142 while (len-- > 0) { 1143 if (bits & 1) cuv++; 1144 bits >>= 1; 1145 } 1146 else 1147 while (len-- > 0) { 1148 if (bits & 0x80) cuv++; 1149 bits <<= 1; 1150 } 1151 } 1152 break; 1153 } 1154 1155 sv = sv_2mortal(newSV(len ? len : 1)); 1156 SvPOK_on(sv); 1157 str = SvPVX(sv); 1158 if (datumtype == 'b') { 1159 U8 bits = 0; 1160 const SSize_t ai32 = len; 1161 for (len = 0; len < ai32; len++) { 1162 if (len & 7) bits >>= 1; 1163 else if (utf8) { 1164 if (s >= strend) break; 1165 bits = utf8_to_byte(aTHX_ &s, strend, datumtype); 1166 } else bits = *(U8 *) s++; 1167 *str++ = bits & 1 ? '1' : '0'; 1168 } 1169 } else { 1170 U8 bits = 0; 1171 const SSize_t ai32 = len; 1172 for (len = 0; len < ai32; len++) { 1173 if (len & 7) bits <<= 1; 1174 else if (utf8) { 1175 if (s >= strend) break; 1176 bits = utf8_to_byte(aTHX_ &s, strend, datumtype); 1177 } else bits = *(U8 *) s++; 1178 *str++ = bits & 0x80 ? '1' : '0'; 1179 } 1180 } 1181 *str = '\0'; 1182 SvCUR_set(sv, str - SvPVX_const(sv)); 1183 XPUSHs(sv); 1184 break; 1185 } 1186 case 'H': 1187 case 'h': { 1188 char *str = NULL; 1189 /* Preliminary length estimate, acceptable for utf8 too */ 1190 if (howlen == e_star || len > (strend - s) * 2) 1191 len = (strend - s) * 2; 1192 if (!checksum) { 1193 sv = sv_2mortal(newSV(len ? len : 1)); 1194 SvPOK_on(sv); 1195 str = SvPVX(sv); 1196 } 1197 if (datumtype == 'h') { 1198 U8 bits = 0; 1199 SSize_t ai32 = len; 1200 for (len = 0; len < ai32; len++) { 1201 if (len & 1) bits >>= 4; 1202 else if (utf8) { 1203 if (s >= strend) break; 1204 bits = utf8_to_byte(aTHX_ &s, strend, datumtype); 1205 } else bits = * (U8 *) s++; 1206 if (!checksum) 1207 *str++ = PL_hexdigit[bits & 15]; 1208 } 1209 } else { 1210 U8 bits = 0; 1211 const SSize_t ai32 = len; 1212 for (len = 0; len < ai32; len++) { 1213 if (len & 1) bits <<= 4; 1214 else if (utf8) { 1215 if (s >= strend) break; 1216 bits = utf8_to_byte(aTHX_ &s, strend, datumtype); 1217 } else bits = *(U8 *) s++; 1218 if (!checksum) 1219 *str++ = PL_hexdigit[(bits >> 4) & 15]; 1220 } 1221 } 1222 if (!checksum) { 1223 *str = '\0'; 1224 SvCUR_set(sv, str - SvPVX_const(sv)); 1225 XPUSHs(sv); 1226 } 1227 break; 1228 } 1229 case 'C': 1230 if (len == 0) { 1231 if (explicit_length) 1232 /* Switch to "character" mode */ 1233 utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0; 1234 break; 1235 } 1236 /* FALLTHROUGH */ 1237 case 'c': 1238 while (len-- > 0 && s < strend) { 1239 int aint; 1240 if (utf8) 1241 { 1242 STRLEN retlen; 1243 aint = utf8n_to_uvchr((U8 *) s, strend-s, &retlen, 1244 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); 1245 if (retlen == (STRLEN) -1) 1246 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); 1247 s += retlen; 1248 } 1249 else 1250 aint = *(U8 *)(s)++; 1251 if (aint >= 128 && datumtype != 'C') /* fake up signed chars */ 1252 aint -= 256; 1253 if (!checksum) 1254 mPUSHi(aint); 1255 else if (checksum > bits_in_uv) 1256 cdouble += (NV)aint; 1257 else 1258 cuv += aint; 1259 } 1260 break; 1261 case 'W': 1262 W_checksum: 1263 if (utf8) { 1264 while (len-- > 0 && s < strend) { 1265 STRLEN retlen; 1266 const UV val = utf8n_to_uvchr((U8 *) s, strend-s, &retlen, 1267 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); 1268 if (retlen == (STRLEN) -1) 1269 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); 1270 s += retlen; 1271 if (!checksum) 1272 mPUSHu(val); 1273 else if (checksum > bits_in_uv) 1274 cdouble += (NV) val; 1275 else 1276 cuv += val; 1277 } 1278 } else if (!checksum) 1279 while (len-- > 0) { 1280 const U8 ch = *(U8 *) s++; 1281 mPUSHu(ch); 1282 } 1283 else if (checksum > bits_in_uv) 1284 while (len-- > 0) cdouble += (NV) *(U8 *) s++; 1285 else 1286 while (len-- > 0) cuv += *(U8 *) s++; 1287 break; 1288 case 'U': 1289 if (len == 0) { 1290 if (explicit_length && howlen != e_star) { 1291 /* Switch to "bytes in UTF-8" mode */ 1292 if (symptr->flags & FLAG_DO_UTF8) utf8 = 0; 1293 else 1294 /* Should be impossible due to the need_utf8() test */ 1295 Perl_croak(aTHX_ "U0 mode on a byte string"); 1296 } 1297 break; 1298 } 1299 if (len > strend - s) len = strend - s; 1300 if (!checksum) { 1301 if (len && unpack_only_one) len = 1; 1302 EXTEND(SP, len); 1303 EXTEND_MORTAL(len); 1304 } 1305 while (len-- > 0 && s < strend) { 1306 STRLEN retlen; 1307 UV auv; 1308 if (utf8) { 1309 U8 result[UTF8_MAXLEN+1]; 1310 const char *ptr = s; 1311 STRLEN len; 1312 /* Bug: warns about bad utf8 even if we are short on bytes 1313 and will break out of the loop */ 1314 if (!S_utf8_to_bytes(aTHX_ &ptr, strend, (char *) result, 1, 1315 'U')) 1316 break; 1317 len = UTF8SKIP(result); 1318 if (!S_utf8_to_bytes(aTHX_ &ptr, strend, 1319 (char *) &result[1], len-1, 'U')) break; 1320 auv = NATIVE_TO_UNI(utf8n_to_uvchr(result, 1321 len, 1322 &retlen, 1323 UTF8_ALLOW_DEFAULT)); 1324 s = ptr; 1325 } else { 1326 auv = NATIVE_TO_UNI(utf8n_to_uvchr((U8*)s, 1327 strend - s, 1328 &retlen, 1329 UTF8_ALLOW_DEFAULT)); 1330 if (retlen == (STRLEN) -1) 1331 Perl_croak(aTHX_ "Malformed UTF-8 string in unpack"); 1332 s += retlen; 1333 } 1334 if (!checksum) 1335 mPUSHu(auv); 1336 else if (checksum > bits_in_uv) 1337 cdouble += (NV) auv; 1338 else 1339 cuv += auv; 1340 } 1341 break; 1342 case 's' | TYPE_IS_SHRIEKING: 1343 #if SHORTSIZE != SIZE16 1344 while (len-- > 0) { 1345 short ashort; 1346 SHIFT_VAR(utf8, s, strend, ashort, datumtype, needs_swap); 1347 if (!checksum) 1348 mPUSHi(ashort); 1349 else if (checksum > bits_in_uv) 1350 cdouble += (NV)ashort; 1351 else 1352 cuv += ashort; 1353 } 1354 break; 1355 #else 1356 /* FALLTHROUGH */ 1357 #endif 1358 case 's': 1359 while (len-- > 0) { 1360 I16 ai16; 1361 1362 #if U16SIZE > SIZE16 1363 ai16 = 0; 1364 #endif 1365 SHIFT16(utf8, s, strend, &ai16, datumtype, needs_swap); 1366 #if U16SIZE > SIZE16 1367 if (ai16 > 32767) 1368 ai16 -= 65536; 1369 #endif 1370 if (!checksum) 1371 mPUSHi(ai16); 1372 else if (checksum > bits_in_uv) 1373 cdouble += (NV)ai16; 1374 else 1375 cuv += ai16; 1376 } 1377 break; 1378 case 'S' | TYPE_IS_SHRIEKING: 1379 #if SHORTSIZE != SIZE16 1380 while (len-- > 0) { 1381 unsigned short aushort; 1382 SHIFT_VAR(utf8, s, strend, aushort, datumtype, needs_swap); 1383 if (!checksum) 1384 mPUSHu(aushort); 1385 else if (checksum > bits_in_uv) 1386 cdouble += (NV)aushort; 1387 else 1388 cuv += aushort; 1389 } 1390 break; 1391 #else 1392 /* FALLTHROUGH */ 1393 #endif 1394 case 'v': 1395 case 'n': 1396 case 'S': 1397 while (len-- > 0) { 1398 U16 au16; 1399 #if U16SIZE > SIZE16 1400 au16 = 0; 1401 #endif 1402 SHIFT16(utf8, s, strend, &au16, datumtype, needs_swap); 1403 if (datumtype == 'n') 1404 au16 = PerlSock_ntohs(au16); 1405 if (datumtype == 'v') 1406 au16 = vtohs(au16); 1407 if (!checksum) 1408 mPUSHu(au16); 1409 else if (checksum > bits_in_uv) 1410 cdouble += (NV) au16; 1411 else 1412 cuv += au16; 1413 } 1414 break; 1415 case 'v' | TYPE_IS_SHRIEKING: 1416 case 'n' | TYPE_IS_SHRIEKING: 1417 while (len-- > 0) { 1418 I16 ai16; 1419 # if U16SIZE > SIZE16 1420 ai16 = 0; 1421 # endif 1422 SHIFT16(utf8, s, strend, &ai16, datumtype, needs_swap); 1423 /* There should never be any byte-swapping here. */ 1424 assert(!TYPE_ENDIANNESS(datumtype)); 1425 if (datumtype == ('n' | TYPE_IS_SHRIEKING)) 1426 ai16 = (I16) PerlSock_ntohs((U16) ai16); 1427 if (datumtype == ('v' | TYPE_IS_SHRIEKING)) 1428 ai16 = (I16) vtohs((U16) ai16); 1429 if (!checksum) 1430 mPUSHi(ai16); 1431 else if (checksum > bits_in_uv) 1432 cdouble += (NV) ai16; 1433 else 1434 cuv += ai16; 1435 } 1436 break; 1437 case 'i': 1438 case 'i' | TYPE_IS_SHRIEKING: 1439 while (len-- > 0) { 1440 int aint; 1441 SHIFT_VAR(utf8, s, strend, aint, datumtype, needs_swap); 1442 if (!checksum) 1443 mPUSHi(aint); 1444 else if (checksum > bits_in_uv) 1445 cdouble += (NV)aint; 1446 else 1447 cuv += aint; 1448 } 1449 break; 1450 case 'I': 1451 case 'I' | TYPE_IS_SHRIEKING: 1452 while (len-- > 0) { 1453 unsigned int auint; 1454 SHIFT_VAR(utf8, s, strend, auint, datumtype, needs_swap); 1455 if (!checksum) 1456 mPUSHu(auint); 1457 else if (checksum > bits_in_uv) 1458 cdouble += (NV)auint; 1459 else 1460 cuv += auint; 1461 } 1462 break; 1463 case 'j': 1464 while (len-- > 0) { 1465 IV aiv; 1466 SHIFT_VAR(utf8, s, strend, aiv, datumtype, needs_swap); 1467 if (!checksum) 1468 mPUSHi(aiv); 1469 else if (checksum > bits_in_uv) 1470 cdouble += (NV)aiv; 1471 else 1472 cuv += aiv; 1473 } 1474 break; 1475 case 'J': 1476 while (len-- > 0) { 1477 UV auv; 1478 SHIFT_VAR(utf8, s, strend, auv, datumtype, needs_swap); 1479 if (!checksum) 1480 mPUSHu(auv); 1481 else if (checksum > bits_in_uv) 1482 cdouble += (NV)auv; 1483 else 1484 cuv += auv; 1485 } 1486 break; 1487 case 'l' | TYPE_IS_SHRIEKING: 1488 #if LONGSIZE != SIZE32 1489 while (len-- > 0) { 1490 long along; 1491 SHIFT_VAR(utf8, s, strend, along, datumtype, needs_swap); 1492 if (!checksum) 1493 mPUSHi(along); 1494 else if (checksum > bits_in_uv) 1495 cdouble += (NV)along; 1496 else 1497 cuv += along; 1498 } 1499 break; 1500 #else 1501 /* FALLTHROUGH */ 1502 #endif 1503 case 'l': 1504 while (len-- > 0) { 1505 I32 ai32; 1506 #if U32SIZE > SIZE32 1507 ai32 = 0; 1508 #endif 1509 SHIFT32(utf8, s, strend, &ai32, datumtype, needs_swap); 1510 #if U32SIZE > SIZE32 1511 if (ai32 > 2147483647) ai32 -= 4294967296; 1512 #endif 1513 if (!checksum) 1514 mPUSHi(ai32); 1515 else if (checksum > bits_in_uv) 1516 cdouble += (NV)ai32; 1517 else 1518 cuv += ai32; 1519 } 1520 break; 1521 case 'L' | TYPE_IS_SHRIEKING: 1522 #if LONGSIZE != SIZE32 1523 while (len-- > 0) { 1524 unsigned long aulong; 1525 SHIFT_VAR(utf8, s, strend, aulong, datumtype, needs_swap); 1526 if (!checksum) 1527 mPUSHu(aulong); 1528 else if (checksum > bits_in_uv) 1529 cdouble += (NV)aulong; 1530 else 1531 cuv += aulong; 1532 } 1533 break; 1534 #else 1535 /* FALLTHROUGH */ 1536 #endif 1537 case 'V': 1538 case 'N': 1539 case 'L': 1540 while (len-- > 0) { 1541 U32 au32; 1542 #if U32SIZE > SIZE32 1543 au32 = 0; 1544 #endif 1545 SHIFT32(utf8, s, strend, &au32, datumtype, needs_swap); 1546 if (datumtype == 'N') 1547 au32 = PerlSock_ntohl(au32); 1548 if (datumtype == 'V') 1549 au32 = vtohl(au32); 1550 if (!checksum) 1551 mPUSHu(au32); 1552 else if (checksum > bits_in_uv) 1553 cdouble += (NV)au32; 1554 else 1555 cuv += au32; 1556 } 1557 break; 1558 case 'V' | TYPE_IS_SHRIEKING: 1559 case 'N' | TYPE_IS_SHRIEKING: 1560 while (len-- > 0) { 1561 I32 ai32; 1562 #if U32SIZE > SIZE32 1563 ai32 = 0; 1564 #endif 1565 SHIFT32(utf8, s, strend, &ai32, datumtype, needs_swap); 1566 /* There should never be any byte swapping here. */ 1567 assert(!TYPE_ENDIANNESS(datumtype)); 1568 if (datumtype == ('N' | TYPE_IS_SHRIEKING)) 1569 ai32 = (I32)PerlSock_ntohl((U32)ai32); 1570 if (datumtype == ('V' | TYPE_IS_SHRIEKING)) 1571 ai32 = (I32)vtohl((U32)ai32); 1572 if (!checksum) 1573 mPUSHi(ai32); 1574 else if (checksum > bits_in_uv) 1575 cdouble += (NV)ai32; 1576 else 1577 cuv += ai32; 1578 } 1579 break; 1580 case 'p': 1581 while (len-- > 0) { 1582 const char *aptr; 1583 SHIFT_VAR(utf8, s, strend, aptr, datumtype, needs_swap); 1584 /* newSVpv generates undef if aptr is NULL */ 1585 mPUSHs(newSVpv(aptr, 0)); 1586 } 1587 break; 1588 case 'w': 1589 { 1590 UV auv = 0; 1591 size_t bytes = 0; 1592 1593 while (len > 0 && s < strend) { 1594 U8 ch; 1595 ch = SHIFT_BYTE(utf8, s, strend, datumtype); 1596 auv = (auv << 7) | (ch & 0x7f); 1597 /* UTF8_IS_XXXXX not right here because this is a BER, not 1598 * UTF-8 format - using constant 0x80 */ 1599 if (ch < 0x80) { 1600 bytes = 0; 1601 mPUSHu(auv); 1602 len--; 1603 auv = 0; 1604 continue; 1605 } 1606 if (++bytes >= sizeof(UV)) { /* promote to string */ 1607 const char *t; 1608 1609 sv = Perl_newSVpvf(aTHX_ "%.*" UVuf, 1610 (int)TYPE_DIGITS(UV), auv); 1611 while (s < strend) { 1612 ch = SHIFT_BYTE(utf8, s, strend, datumtype); 1613 sv = mul128(sv, (U8)(ch & 0x7f)); 1614 if (!(ch & 0x80)) { 1615 bytes = 0; 1616 break; 1617 } 1618 } 1619 t = SvPV_nolen_const(sv); 1620 while (*t == '0') 1621 t++; 1622 sv_chop(sv, t); 1623 mPUSHs(sv); 1624 len--; 1625 auv = 0; 1626 } 1627 } 1628 if ((s >= strend) && bytes) 1629 Perl_croak(aTHX_ "Unterminated compressed integer in unpack"); 1630 } 1631 break; 1632 case 'P': 1633 if (symptr->howlen == e_star) 1634 Perl_croak(aTHX_ "'P' must have an explicit size in unpack"); 1635 EXTEND(SP, 1); 1636 if (s + sizeof(char*) <= strend) { 1637 char *aptr; 1638 SHIFT_VAR(utf8, s, strend, aptr, datumtype, needs_swap); 1639 /* newSVpvn generates undef if aptr is NULL */ 1640 PUSHs(newSVpvn_flags(aptr, len, SVs_TEMP)); 1641 } 1642 break; 1643 #if defined(HAS_QUAD) && IVSIZE >= 8 1644 case 'q': 1645 while (len-- > 0) { 1646 Quad_t aquad; 1647 SHIFT_VAR(utf8, s, strend, aquad, datumtype, needs_swap); 1648 if (!checksum) 1649 mPUSHs(newSViv((IV)aquad)); 1650 else if (checksum > bits_in_uv) 1651 cdouble += (NV)aquad; 1652 else 1653 cuv += aquad; 1654 } 1655 break; 1656 case 'Q': 1657 while (len-- > 0) { 1658 Uquad_t auquad; 1659 SHIFT_VAR(utf8, s, strend, auquad, datumtype, needs_swap); 1660 if (!checksum) 1661 mPUSHs(newSVuv((UV)auquad)); 1662 else if (checksum > bits_in_uv) 1663 cdouble += (NV)auquad; 1664 else 1665 cuv += auquad; 1666 } 1667 break; 1668 #endif 1669 /* float and double added gnb@melba.bby.oz.au 22/11/89 */ 1670 case 'f': 1671 while (len-- > 0) { 1672 float afloat; 1673 SHIFT_VAR(utf8, s, strend, afloat, datumtype, needs_swap); 1674 if (!checksum) 1675 mPUSHn(afloat); 1676 else 1677 cdouble += afloat; 1678 } 1679 break; 1680 case 'd': 1681 while (len-- > 0) { 1682 double adouble; 1683 SHIFT_VAR(utf8, s, strend, adouble, datumtype, needs_swap); 1684 if (!checksum) 1685 mPUSHn(adouble); 1686 else 1687 cdouble += adouble; 1688 } 1689 break; 1690 case 'F': 1691 while (len-- > 0) { 1692 NV_bytes anv; 1693 SHIFT_BYTES(utf8, s, strend, anv.bytes, sizeof(anv.bytes), 1694 datumtype, needs_swap); 1695 if (!checksum) 1696 mPUSHn(anv.nv); 1697 else 1698 cdouble += anv.nv; 1699 } 1700 break; 1701 #if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE) 1702 case 'D': 1703 while (len-- > 0) { 1704 ld_bytes aldouble; 1705 SHIFT_BYTES(utf8, s, strend, aldouble.bytes, 1706 sizeof(aldouble.bytes), datumtype, needs_swap); 1707 /* The most common long double format, the x86 80-bit 1708 * extended precision, has either 2 or 6 unused bytes, 1709 * which may contain garbage, which may contain 1710 * unintentional data. While we do zero the bytes of 1711 * the long double data in pack(), here in unpack() we 1712 * don't, because it's really hard to envision that 1713 * reading the long double off aldouble would be 1714 * affected by the unused bytes. 1715 * 1716 * Note that trying to unpack 'long doubles' of 'long 1717 * doubles' packed in another system is in the general 1718 * case doomed without having more detail. */ 1719 if (!checksum) 1720 mPUSHn(aldouble.ld); 1721 else 1722 cdouble += aldouble.ld; 1723 } 1724 break; 1725 #endif 1726 case 'u': 1727 if (!checksum) { 1728 const STRLEN l = (STRLEN) (strend - s) * 3 / 4; 1729 sv = sv_2mortal(newSV(l)); 1730 if (l) { 1731 SvPOK_on(sv); 1732 *SvEND(sv) = '\0'; 1733 } 1734 } 1735 1736 /* Note that all legal uuencoded strings are ASCII printables, so 1737 * have the same representation under UTF-8 vs not. This means we 1738 * can ignore UTF8ness on legal input. For illegal we stop at the 1739 * first failure, and don't report where/what that is, so again we 1740 * can ignore UTF8ness */ 1741 1742 while (s < strend && *s != ' ' && ISUUCHAR(*s)) { 1743 I32 a, b, c, d; 1744 char hunk[3]; 1745 1746 len = PL_uudmap[*(U8*)s++] & 077; 1747 while (len > 0) { 1748 if (s < strend && ISUUCHAR(*s)) 1749 a = PL_uudmap[*(U8*)s++] & 077; 1750 else 1751 a = 0; 1752 if (s < strend && ISUUCHAR(*s)) 1753 b = PL_uudmap[*(U8*)s++] & 077; 1754 else 1755 b = 0; 1756 if (s < strend && ISUUCHAR(*s)) 1757 c = PL_uudmap[*(U8*)s++] & 077; 1758 else 1759 c = 0; 1760 if (s < strend && ISUUCHAR(*s)) 1761 d = PL_uudmap[*(U8*)s++] & 077; 1762 else 1763 d = 0; 1764 hunk[0] = (char)((a << 2) | (b >> 4)); 1765 hunk[1] = (char)((b << 4) | (c >> 2)); 1766 hunk[2] = (char)((c << 6) | d); 1767 if (!checksum) 1768 sv_catpvn(sv, hunk, (len > 3) ? 3 : len); 1769 len -= 3; 1770 } 1771 if (*s == '\n') 1772 s++; 1773 else /* possible checksum byte */ 1774 if (s + 1 < strend && s[1] == '\n') 1775 s += 2; 1776 } 1777 if (!checksum) 1778 XPUSHs(sv); 1779 break; 1780 } /* End of switch */ 1781 1782 if (checksum) { 1783 if (memCHRs("fFdD", TYPE_NO_MODIFIERS(datumtype)) || 1784 (checksum > bits_in_uv && 1785 memCHRs("cCsSiIlLnNUWvVqQjJ", TYPE_NO_MODIFIERS(datumtype))) ) { 1786 NV trouble, anv; 1787 1788 anv = (NV) (1 << (checksum & 15)); 1789 while (checksum >= 16) { 1790 checksum -= 16; 1791 anv *= 65536.0; 1792 } 1793 while (cdouble < 0.0) 1794 cdouble += anv; 1795 cdouble = Perl_modf(cdouble / anv, &trouble); 1796 #ifdef LONGDOUBLE_DOUBLEDOUBLE 1797 /* Workaround for powerpc doubledouble modfl bug: 1798 * close to 1.0L and -1.0L cdouble is 0, and trouble 1799 * is cdouble / anv. */ 1800 if (trouble != Perl_ceil(trouble)) { 1801 cdouble = trouble; 1802 if (cdouble > 1.0L) cdouble -= 1.0L; 1803 if (cdouble < -1.0L) cdouble += 1.0L; 1804 } 1805 #endif 1806 cdouble *= anv; 1807 sv = newSVnv(cdouble); 1808 } 1809 else { 1810 if (checksum < bits_in_uv) { 1811 UV mask = ((UV)1 << checksum) - 1; 1812 cuv &= mask; 1813 } 1814 sv = newSVuv(cuv); 1815 } 1816 mXPUSHs(sv); 1817 checksum = 0; 1818 } 1819 1820 if (symptr->flags & FLAG_SLASH){ 1821 if (SP - PL_stack_base - start_sp_offset <= 0) 1822 break; 1823 if( next_symbol(symptr) ){ 1824 if( symptr->howlen == e_number ) 1825 Perl_croak(aTHX_ "Count after length/code in unpack" ); 1826 if( beyond ){ 1827 /* ...end of char buffer then no decent length available */ 1828 Perl_croak(aTHX_ "length/code after end of string in unpack" ); 1829 } else { 1830 /* take top of stack (hope it's numeric) */ 1831 len = POPi; 1832 if( len < 0 ) 1833 Perl_croak(aTHX_ "Negative '/' count in unpack" ); 1834 } 1835 } else { 1836 Perl_croak(aTHX_ "Code missing after '/' in unpack" ); 1837 } 1838 datumtype = symptr->code; 1839 explicit_length = FALSE; 1840 goto redo_switch; 1841 } 1842 } 1843 1844 if (new_s) 1845 *new_s = s; 1846 PUTBACK; 1847 return SP - PL_stack_base - start_sp_offset; 1848 } 1849 1850 PP(pp_unpack) 1851 { 1852 dSP; 1853 dPOPPOPssrl; 1854 U8 gimme = GIMME_V; 1855 STRLEN llen; 1856 STRLEN rlen; 1857 const char *pat = SvPV_const(left, llen); 1858 const char *s = SvPV_const(right, rlen); 1859 const char *strend = s + rlen; 1860 const char *patend = pat + llen; 1861 SSize_t cnt; 1862 1863 PUTBACK; 1864 cnt = unpackstring(pat, patend, s, strend, 1865 ((gimme == G_SCALAR) ? FLAG_UNPACK_ONLY_ONE : 0) 1866 | (DO_UTF8(right) ? FLAG_DO_UTF8 : 0)); 1867 1868 SPAGAIN; 1869 if ( !cnt && gimme == G_SCALAR ) 1870 PUSHs(&PL_sv_undef); 1871 RETURN; 1872 } 1873 1874 STATIC U8 * 1875 doencodes(U8 *h, const U8 *s, SSize_t len) 1876 { 1877 *h++ = PL_uuemap[len]; 1878 while (len > 2) { 1879 *h++ = PL_uuemap[(077 & (s[0] >> 2))]; 1880 *h++ = PL_uuemap[(077 & (((s[0] << 4) & 060) | ((s[1] >> 4) & 017)))]; 1881 *h++ = PL_uuemap[(077 & (((s[1] << 2) & 074) | ((s[2] >> 6) & 03)))]; 1882 *h++ = PL_uuemap[(077 & (s[2] & 077))]; 1883 s += 3; 1884 len -= 3; 1885 } 1886 if (len > 0) { 1887 const U8 r = (len > 1 ? s[1] : '\0'); 1888 *h++ = PL_uuemap[(077 & (s[0] >> 2))]; 1889 *h++ = PL_uuemap[(077 & (((s[0] << 4) & 060) | ((r >> 4) & 017)))]; 1890 *h++ = PL_uuemap[(077 & ((r << 2) & 074))]; 1891 *h++ = PL_uuemap[0]; 1892 } 1893 *h++ = '\n'; 1894 return h; 1895 } 1896 1897 STATIC SV * 1898 S_is_an_int(pTHX_ const char *s, STRLEN l) 1899 { 1900 SV *result = newSVpvn(s, l); 1901 char *const result_c = SvPV_nolen(result); /* convenience */ 1902 char *out = result_c; 1903 bool skip = 1; 1904 bool ignore = 0; 1905 1906 PERL_ARGS_ASSERT_IS_AN_INT; 1907 1908 while (*s) { 1909 switch (*s) { 1910 case ' ': 1911 break; 1912 case '+': 1913 if (!skip) { 1914 SvREFCNT_dec(result); 1915 return (NULL); 1916 } 1917 break; 1918 case '0': 1919 case '1': 1920 case '2': 1921 case '3': 1922 case '4': 1923 case '5': 1924 case '6': 1925 case '7': 1926 case '8': 1927 case '9': 1928 skip = 0; 1929 if (!ignore) { 1930 *(out++) = *s; 1931 } 1932 break; 1933 case '.': 1934 ignore = 1; 1935 break; 1936 default: 1937 SvREFCNT_dec(result); 1938 return (NULL); 1939 } 1940 s++; 1941 } 1942 *(out++) = '\0'; 1943 SvCUR_set(result, out - result_c); 1944 return (result); 1945 } 1946 1947 /* pnum must be '\0' terminated */ 1948 STATIC int 1949 S_div128(pTHX_ SV *pnum, bool *done) 1950 { 1951 STRLEN len; 1952 char * const s = SvPV(pnum, len); 1953 char *t = s; 1954 int m = 0; 1955 1956 PERL_ARGS_ASSERT_DIV128; 1957 1958 *done = 1; 1959 while (*t) { 1960 const int i = m * 10 + (*t - '0'); 1961 const int r = (i >> 7); /* r < 10 */ 1962 m = i & 0x7F; 1963 if (r) { 1964 *done = 0; 1965 } 1966 *(t++) = '0' + r; 1967 } 1968 *(t++) = '\0'; 1969 SvCUR_set(pnum, (STRLEN) (t - s)); 1970 return (m); 1971 } 1972 1973 /* 1974 =for apidoc packlist 1975 1976 The engine implementing C<pack()> Perl function. 1977 1978 =cut 1979 */ 1980 1981 void 1982 Perl_packlist(pTHX_ SV *cat, const char *pat, const char *patend, SV **beglist, SV **endlist ) 1983 { 1984 tempsym_t sym; 1985 1986 PERL_ARGS_ASSERT_PACKLIST; 1987 1988 TEMPSYM_INIT(&sym, pat, patend, FLAG_PACK); 1989 1990 /* We're going to do changes through SvPVX(cat). Make sure it's valid. 1991 Also make sure any UTF8 flag is loaded */ 1992 SvPV_force_nolen(cat); 1993 if (DO_UTF8(cat)) 1994 sym.flags |= FLAG_PARSE_UTF8 | FLAG_DO_UTF8; 1995 1996 (void)pack_rec( cat, &sym, beglist, endlist ); 1997 } 1998 1999 /* like sv_utf8_upgrade, but also repoint the group start markers */ 2000 STATIC void 2001 marked_upgrade(pTHX_ SV *sv, tempsym_t *sym_ptr) { 2002 STRLEN len; 2003 tempsym_t *group; 2004 const char *from_ptr, *from_start, *from_end, **marks, **m; 2005 char *to_start, *to_ptr; 2006 2007 if (SvUTF8(sv)) return; 2008 2009 from_start = SvPVX_const(sv); 2010 from_end = from_start + SvCUR(sv); 2011 for (from_ptr = from_start; from_ptr < from_end; from_ptr++) 2012 if (!NATIVE_BYTE_IS_INVARIANT(*from_ptr)) break; 2013 if (from_ptr == from_end) { 2014 /* Simple case: no character needs to be changed */ 2015 SvUTF8_on(sv); 2016 return; 2017 } 2018 2019 len = (from_end-from_ptr)*UTF8_EXPAND+(from_ptr-from_start)+1; 2020 Newx(to_start, len, char); 2021 Copy(from_start, to_start, from_ptr-from_start, char); 2022 to_ptr = to_start + (from_ptr-from_start); 2023 2024 Newx(marks, sym_ptr->level+2, const char *); 2025 for (group=sym_ptr; group; group = group->previous) 2026 marks[group->level] = from_start + group->strbeg; 2027 marks[sym_ptr->level+1] = from_end+1; 2028 for (m = marks; *m < from_ptr; m++) 2029 *m = to_start + (*m-from_start); 2030 2031 for (;from_ptr < from_end; from_ptr++) { 2032 while (*m == from_ptr) *m++ = to_ptr; 2033 to_ptr = (char *) uvchr_to_utf8((U8 *) to_ptr, *(U8 *) from_ptr); 2034 } 2035 *to_ptr = 0; 2036 2037 while (*m == from_ptr) *m++ = to_ptr; 2038 if (m != marks + sym_ptr->level+1) { 2039 Safefree(marks); 2040 Safefree(to_start); 2041 Perl_croak(aTHX_ "panic: marks beyond string end, m=%p, marks=%p, " 2042 "level=%d", m, marks, sym_ptr->level); 2043 } 2044 for (group=sym_ptr; group; group = group->previous) 2045 group->strbeg = marks[group->level] - to_start; 2046 Safefree(marks); 2047 2048 if (SvOOK(sv)) { 2049 if (SvIVX(sv)) { 2050 SvLEN_set(sv, SvLEN(sv) + SvIVX(sv)); 2051 from_start -= SvIVX(sv); 2052 SvIV_set(sv, 0); 2053 } 2054 SvFLAGS(sv) &= ~SVf_OOK; 2055 } 2056 if (SvLEN(sv) != 0) 2057 Safefree(from_start); 2058 SvPV_set(sv, to_start); 2059 SvCUR_set(sv, to_ptr - to_start); 2060 SvLEN_set(sv, len); 2061 SvUTF8_on(sv); 2062 } 2063 2064 /* Exponential string grower. Makes string extension effectively O(n) 2065 needed says how many extra bytes we need (not counting the final '\0') 2066 Only grows the string if there is an actual lack of space 2067 */ 2068 STATIC char * 2069 S_sv_exp_grow(pTHX_ SV *sv, STRLEN needed) { 2070 const STRLEN cur = SvCUR(sv); 2071 const STRLEN len = SvLEN(sv); 2072 STRLEN extend; 2073 2074 PERL_ARGS_ASSERT_SV_EXP_GROW; 2075 2076 if (len - cur > needed) return SvPVX(sv); 2077 extend = needed > len ? needed : len; 2078 return SvGROW(sv, len+extend+1); 2079 } 2080 2081 static SV * 2082 S_sv_check_infnan(pTHX_ SV *sv, I32 datumtype) 2083 { 2084 SvGETMAGIC(sv); 2085 if (UNLIKELY(SvAMAGIC(sv))) 2086 sv = sv_2num(sv); 2087 if (UNLIKELY(isinfnansv(sv))) { 2088 const I32 c = TYPE_NO_MODIFIERS(datumtype); 2089 const NV nv = SvNV_nomg(sv); 2090 if (c == 'w') 2091 Perl_croak(aTHX_ "Cannot compress %" NVgf " in pack", nv); 2092 else 2093 Perl_croak(aTHX_ "Cannot pack %" NVgf " with '%c'", nv, (int) c); 2094 } 2095 return sv; 2096 } 2097 2098 #define SvIV_no_inf(sv,d) \ 2099 ((sv) = S_sv_check_infnan(aTHX_ sv,d), SvIV_nomg(sv)) 2100 #define SvUV_no_inf(sv,d) \ 2101 ((sv) = S_sv_check_infnan(aTHX_ sv,d), SvUV_nomg(sv)) 2102 2103 STATIC 2104 SV ** 2105 S_pack_rec(pTHX_ SV *cat, tempsym_t* symptr, SV **beglist, SV **endlist ) 2106 { 2107 tempsym_t lookahead; 2108 SSize_t items = endlist - beglist; 2109 bool found = next_symbol(symptr); 2110 bool utf8 = (symptr->flags & FLAG_PARSE_UTF8) ? 1 : 0; 2111 bool warn_utf8 = ckWARN(WARN_UTF8); 2112 char* from; 2113 2114 PERL_ARGS_ASSERT_PACK_REC; 2115 2116 if (symptr->level == 0 && found && symptr->code == 'U') { 2117 marked_upgrade(aTHX_ cat, symptr); 2118 symptr->flags |= FLAG_DO_UTF8; 2119 utf8 = 0; 2120 } 2121 symptr->strbeg = SvCUR(cat); 2122 2123 while (found) { 2124 SV *fromstr; 2125 STRLEN fromlen; 2126 SSize_t len; 2127 SV *lengthcode = NULL; 2128 I32 datumtype = symptr->code; 2129 howlen_t howlen = symptr->howlen; 2130 char *start = SvPVX(cat); 2131 char *cur = start + SvCUR(cat); 2132 bool needs_swap; 2133 2134 #define NEXTFROM (lengthcode ? lengthcode : items > 0 ? (--items, *beglist++) : &PL_sv_no) 2135 #define PEEKFROM (lengthcode ? lengthcode : items > 0 ? *beglist : &PL_sv_no) 2136 2137 switch (howlen) { 2138 case e_star: 2139 len = memCHRs("@Xxu", TYPE_NO_MODIFIERS(datumtype)) ? 2140 0 : items; 2141 break; 2142 default: 2143 /* e_no_len and e_number */ 2144 len = symptr->length; 2145 break; 2146 } 2147 2148 if (len) { 2149 packprops_t props = packprops[TYPE_NO_ENDIANNESS(datumtype)]; 2150 2151 if (props && !(props & PACK_SIZE_UNPREDICTABLE)) { 2152 /* We can process this letter. */ 2153 STRLEN size = props & PACK_SIZE_MASK; 2154 GROWING2(utf8, cat, start, cur, size, (STRLEN)len); 2155 } 2156 } 2157 2158 /* Look ahead for next symbol. Do we have code/code? */ 2159 lookahead = *symptr; 2160 found = next_symbol(&lookahead); 2161 if (symptr->flags & FLAG_SLASH) { 2162 IV count; 2163 if (!found) Perl_croak(aTHX_ "Code missing after '/' in pack"); 2164 if (memCHRs("aAZ", lookahead.code)) { 2165 if (lookahead.howlen == e_number) count = lookahead.length; 2166 else { 2167 if (items > 0) { 2168 count = sv_len_utf8(*beglist); 2169 } 2170 else count = 0; 2171 if (lookahead.code == 'Z') count++; 2172 } 2173 } else { 2174 if (lookahead.howlen == e_number && lookahead.length < items) 2175 count = lookahead.length; 2176 else count = items; 2177 } 2178 lookahead.howlen = e_number; 2179 lookahead.length = count; 2180 lengthcode = sv_2mortal(newSViv(count)); 2181 } 2182 2183 needs_swap = NEEDS_SWAP(datumtype); 2184 2185 /* Code inside the switch must take care to properly update 2186 cat (CUR length and '\0' termination) if it updated *cur and 2187 doesn't simply leave using break */ 2188 switch (TYPE_NO_ENDIANNESS(datumtype)) { 2189 default: 2190 Perl_croak(aTHX_ "Invalid type '%c' in pack", 2191 (int) TYPE_NO_MODIFIERS(datumtype)); 2192 case '%': 2193 Perl_croak(aTHX_ "'%%' may not be used in pack"); 2194 2195 case '.' | TYPE_IS_SHRIEKING: 2196 case '.': 2197 if (howlen == e_star) from = start; 2198 else if (len == 0) from = cur; 2199 else { 2200 tempsym_t *group = symptr; 2201 2202 while (--len && group) group = group->previous; 2203 from = group ? start + group->strbeg : start; 2204 } 2205 fromstr = NEXTFROM; 2206 len = SvIV_no_inf(fromstr, datumtype); 2207 goto resize; 2208 case '@' | TYPE_IS_SHRIEKING: 2209 case '@': 2210 from = start + symptr->strbeg; 2211 resize: 2212 if (utf8 && !(datumtype & TYPE_IS_SHRIEKING)) 2213 if (len >= 0) { 2214 while (len && from < cur) { 2215 from += UTF8SKIP(from); 2216 len--; 2217 } 2218 if (from > cur) 2219 Perl_croak(aTHX_ "Malformed UTF-8 string in pack"); 2220 if (len) { 2221 /* Here we know from == cur */ 2222 grow: 2223 GROWING(0, cat, start, cur, len); 2224 Zero(cur, len, char); 2225 cur += len; 2226 } else if (from < cur) { 2227 len = cur - from; 2228 goto shrink; 2229 } else goto no_change; 2230 } else { 2231 cur = from; 2232 len = -len; 2233 goto utf8_shrink; 2234 } 2235 else { 2236 len -= cur - from; 2237 if (len > 0) goto grow; 2238 if (len == 0) goto no_change; 2239 len = -len; 2240 goto shrink; 2241 } 2242 break; 2243 2244 case '(': { 2245 tempsym_t savsym = *symptr; 2246 U32 group_modifiers = TYPE_MODIFIERS(datumtype & ~symptr->flags); 2247 symptr->flags |= group_modifiers; 2248 symptr->patend = savsym.grpend; 2249 symptr->level++; 2250 symptr->previous = &lookahead; 2251 while (len--) { 2252 U32 was_utf8; 2253 if (utf8) symptr->flags |= FLAG_PARSE_UTF8; 2254 else symptr->flags &= ~FLAG_PARSE_UTF8; 2255 was_utf8 = SvUTF8(cat); 2256 symptr->patptr = savsym.grpbeg; 2257 beglist = pack_rec(cat, symptr, beglist, endlist); 2258 if (SvUTF8(cat) != was_utf8) 2259 /* This had better be an upgrade while in utf8==0 mode */ 2260 utf8 = 1; 2261 2262 if (savsym.howlen == e_star && beglist == endlist) 2263 break; /* No way to continue */ 2264 } 2265 items = endlist - beglist; 2266 lookahead.flags = symptr->flags & ~group_modifiers; 2267 goto no_change; 2268 } 2269 case 'X' | TYPE_IS_SHRIEKING: 2270 if (!len) /* Avoid division by 0 */ 2271 len = 1; 2272 if (utf8) { 2273 char *hop, *last; 2274 SSize_t l = len; 2275 hop = last = start; 2276 while (hop < cur) { 2277 hop += UTF8SKIP(hop); 2278 if (--l == 0) { 2279 last = hop; 2280 l = len; 2281 } 2282 } 2283 if (last > cur) 2284 Perl_croak(aTHX_ "Malformed UTF-8 string in pack"); 2285 cur = last; 2286 break; 2287 } 2288 len = (cur-start) % len; 2289 /* FALLTHROUGH */ 2290 case 'X': 2291 if (utf8) { 2292 if (len < 1) goto no_change; 2293 utf8_shrink: 2294 while (len > 0) { 2295 if (cur <= start) 2296 Perl_croak(aTHX_ "'%c' outside of string in pack", 2297 (int) TYPE_NO_MODIFIERS(datumtype)); 2298 while (--cur, UTF8_IS_CONTINUATION(*cur)) { 2299 if (cur <= start) 2300 Perl_croak(aTHX_ "'%c' outside of string in pack", 2301 (int) TYPE_NO_MODIFIERS(datumtype)); 2302 } 2303 len--; 2304 } 2305 } else { 2306 shrink: 2307 if (cur - start < len) 2308 Perl_croak(aTHX_ "'%c' outside of string in pack", 2309 (int) TYPE_NO_MODIFIERS(datumtype)); 2310 cur -= len; 2311 } 2312 if (cur < start+symptr->strbeg) { 2313 /* Make sure group starts don't point into the void */ 2314 tempsym_t *group; 2315 const STRLEN length = cur-start; 2316 for (group = symptr; 2317 group && length < group->strbeg; 2318 group = group->previous) group->strbeg = length; 2319 lookahead.strbeg = length; 2320 } 2321 break; 2322 case 'x' | TYPE_IS_SHRIEKING: { 2323 SSize_t ai32; 2324 if (!len) /* Avoid division by 0 */ 2325 len = 1; 2326 if (utf8) ai32 = utf8_length((U8 *) start, (U8 *) cur) % len; 2327 else ai32 = (cur - start) % len; 2328 if (ai32 == 0) goto no_change; 2329 len -= ai32; 2330 } 2331 /* FALLTHROUGH */ 2332 case 'x': 2333 goto grow; 2334 case 'A': 2335 case 'Z': 2336 case 'a': { 2337 const char *aptr; 2338 2339 fromstr = NEXTFROM; 2340 aptr = SvPV_const(fromstr, fromlen); 2341 if (DO_UTF8(fromstr)) { 2342 const char *end, *s; 2343 2344 if (!utf8 && !SvUTF8(cat)) { 2345 marked_upgrade(aTHX_ cat, symptr); 2346 lookahead.flags |= FLAG_DO_UTF8; 2347 lookahead.strbeg = symptr->strbeg; 2348 utf8 = 1; 2349 start = SvPVX(cat); 2350 cur = start + SvCUR(cat); 2351 } 2352 if (howlen == e_star) { 2353 if (utf8) goto string_copy; 2354 len = fromlen+1; 2355 } 2356 s = aptr; 2357 end = aptr + fromlen; 2358 fromlen = datumtype == 'Z' ? len-1 : len; 2359 while ((SSize_t) fromlen > 0 && s < end) { 2360 s += UTF8SKIP(s); 2361 fromlen--; 2362 } 2363 if (s > end) 2364 Perl_croak(aTHX_ "Malformed UTF-8 string in pack"); 2365 if (utf8) { 2366 len = fromlen; 2367 if (datumtype == 'Z') len++; 2368 fromlen = s-aptr; 2369 len += fromlen; 2370 2371 goto string_copy; 2372 } 2373 fromlen = len - fromlen; 2374 if (datumtype == 'Z') fromlen--; 2375 if (howlen == e_star) { 2376 len = fromlen; 2377 if (datumtype == 'Z') len++; 2378 } 2379 GROWING(0, cat, start, cur, len); 2380 if (!S_utf8_to_bytes(aTHX_ &aptr, end, cur, fromlen, 2381 datumtype | TYPE_IS_PACK)) 2382 Perl_croak(aTHX_ "panic: predicted utf8 length not available, " 2383 "for '%c', aptr=%p end=%p cur=%p, fromlen=%zu", 2384 (int)datumtype, aptr, end, cur, fromlen); 2385 cur += fromlen; 2386 len -= fromlen; 2387 } else if (utf8) { 2388 if (howlen == e_star) { 2389 len = fromlen; 2390 if (datumtype == 'Z') len++; 2391 } 2392 if (len <= (SSize_t) fromlen) { 2393 fromlen = len; 2394 if (datumtype == 'Z' && fromlen > 0) fromlen--; 2395 } 2396 /* assumes a byte expands to at most UTF8_EXPAND bytes on 2397 upgrade, so: 2398 expected_length <= from_len*UTF8_EXPAND + (len-from_len) */ 2399 GROWING(0, cat, start, cur, fromlen*(UTF8_EXPAND-1)+len); 2400 len -= fromlen; 2401 while (fromlen > 0) { 2402 cur = (char *) uvchr_to_utf8((U8 *) cur, * (U8 *) aptr); 2403 aptr++; 2404 fromlen--; 2405 } 2406 } else { 2407 string_copy: 2408 if (howlen == e_star) { 2409 len = fromlen; 2410 if (datumtype == 'Z') len++; 2411 } 2412 if (len <= (SSize_t) fromlen) { 2413 fromlen = len; 2414 if (datumtype == 'Z' && fromlen > 0) fromlen--; 2415 } 2416 GROWING(0, cat, start, cur, len); 2417 Copy(aptr, cur, fromlen, char); 2418 cur += fromlen; 2419 len -= fromlen; 2420 } 2421 memset(cur, datumtype == 'A' ? ' ' : '\0', len); 2422 cur += len; 2423 SvTAINT(cat); 2424 break; 2425 } 2426 case 'B': 2427 case 'b': { 2428 const char *str, *end; 2429 SSize_t l, field_len; 2430 U8 bits; 2431 bool utf8_source; 2432 U32 utf8_flags; 2433 2434 fromstr = NEXTFROM; 2435 str = SvPV_const(fromstr, fromlen); 2436 end = str + fromlen; 2437 if (DO_UTF8(fromstr)) { 2438 utf8_source = TRUE; 2439 utf8_flags = warn_utf8 ? 0 : UTF8_ALLOW_ANY; 2440 } else { 2441 utf8_source = FALSE; 2442 utf8_flags = 0; /* Unused, but keep compilers happy */ 2443 } 2444 if (howlen == e_star) len = fromlen; 2445 field_len = (len+7)/8; 2446 GROWING(utf8, cat, start, cur, field_len); 2447 if (len > (SSize_t)fromlen) len = fromlen; 2448 bits = 0; 2449 l = 0; 2450 if (datumtype == 'B') 2451 while (l++ < len) { 2452 if (utf8_source) { 2453 UV val = 0; 2454 NEXT_UNI_VAL(val, cur, str, end, utf8_flags); 2455 bits |= val & 1; 2456 } else bits |= *str++ & 1; 2457 if (l & 7) bits <<= 1; 2458 else { 2459 PUSH_BYTE(utf8, cur, bits); 2460 bits = 0; 2461 } 2462 } 2463 else 2464 /* datumtype == 'b' */ 2465 while (l++ < len) { 2466 if (utf8_source) { 2467 UV val = 0; 2468 NEXT_UNI_VAL(val, cur, str, end, utf8_flags); 2469 if (val & 1) bits |= 0x80; 2470 } else if (*str++ & 1) 2471 bits |= 0x80; 2472 if (l & 7) bits >>= 1; 2473 else { 2474 PUSH_BYTE(utf8, cur, bits); 2475 bits = 0; 2476 } 2477 } 2478 l--; 2479 if (l & 7) { 2480 if (datumtype == 'B') 2481 bits <<= 7 - (l & 7); 2482 else 2483 bits >>= 7 - (l & 7); 2484 PUSH_BYTE(utf8, cur, bits); 2485 l += 7; 2486 } 2487 /* Determine how many chars are left in the requested field */ 2488 l /= 8; 2489 if (howlen == e_star) field_len = 0; 2490 else field_len -= l; 2491 Zero(cur, field_len, char); 2492 cur += field_len; 2493 break; 2494 } 2495 case 'H': 2496 case 'h': { 2497 const char *str, *end; 2498 SSize_t l, field_len; 2499 U8 bits; 2500 bool utf8_source; 2501 U32 utf8_flags; 2502 2503 fromstr = NEXTFROM; 2504 str = SvPV_const(fromstr, fromlen); 2505 end = str + fromlen; 2506 if (DO_UTF8(fromstr)) { 2507 utf8_source = TRUE; 2508 utf8_flags = warn_utf8 ? 0 : UTF8_ALLOW_ANY; 2509 } else { 2510 utf8_source = FALSE; 2511 utf8_flags = 0; /* Unused, but keep compilers happy */ 2512 } 2513 if (howlen == e_star) len = fromlen; 2514 field_len = (len+1)/2; 2515 GROWING(utf8, cat, start, cur, field_len); 2516 if (!utf8_source && len > (SSize_t)fromlen) len = fromlen; 2517 bits = 0; 2518 l = 0; 2519 if (datumtype == 'H') 2520 while (l++ < len) { 2521 if (utf8_source) { 2522 UV val = 0; 2523 NEXT_UNI_VAL(val, cur, str, end, utf8_flags); 2524 if (val < 256 && isALPHA(val)) 2525 bits |= (val + 9) & 0xf; 2526 else 2527 bits |= val & 0xf; 2528 } else if (isALPHA(*str)) 2529 bits |= (*str++ + 9) & 0xf; 2530 else 2531 bits |= *str++ & 0xf; 2532 if (l & 1) bits <<= 4; 2533 else { 2534 PUSH_BYTE(utf8, cur, bits); 2535 bits = 0; 2536 } 2537 } 2538 else 2539 while (l++ < len) { 2540 if (utf8_source) { 2541 UV val = 0; 2542 NEXT_UNI_VAL(val, cur, str, end, utf8_flags); 2543 if (val < 256 && isALPHA(val)) 2544 bits |= ((val + 9) & 0xf) << 4; 2545 else 2546 bits |= (val & 0xf) << 4; 2547 } else if (isALPHA(*str)) 2548 bits |= ((*str++ + 9) & 0xf) << 4; 2549 else 2550 bits |= (*str++ & 0xf) << 4; 2551 if (l & 1) bits >>= 4; 2552 else { 2553 PUSH_BYTE(utf8, cur, bits); 2554 bits = 0; 2555 } 2556 } 2557 l--; 2558 if (l & 1) { 2559 PUSH_BYTE(utf8, cur, bits); 2560 l++; 2561 } 2562 /* Determine how many chars are left in the requested field */ 2563 l /= 2; 2564 if (howlen == e_star) field_len = 0; 2565 else field_len -= l; 2566 Zero(cur, field_len, char); 2567 cur += field_len; 2568 break; 2569 } 2570 case 'c': 2571 while (len-- > 0) { 2572 IV aiv; 2573 fromstr = NEXTFROM; 2574 aiv = SvIV_no_inf(fromstr, datumtype); 2575 if ((-128 > aiv || aiv > 127)) 2576 Perl_ck_warner(aTHX_ packWARN(WARN_PACK), 2577 "Character in 'c' format wrapped in pack"); 2578 PUSH_BYTE(utf8, cur, (U8)(aiv & 0xff)); 2579 } 2580 break; 2581 case 'C': 2582 if (len == 0) { 2583 utf8 = (symptr->flags & FLAG_DO_UTF8) ? 1 : 0; 2584 break; 2585 } 2586 while (len-- > 0) { 2587 IV aiv; 2588 fromstr = NEXTFROM; 2589 aiv = SvIV_no_inf(fromstr, datumtype); 2590 if ((0 > aiv || aiv > 0xff)) 2591 Perl_ck_warner(aTHX_ packWARN(WARN_PACK), 2592 "Character in 'C' format wrapped in pack"); 2593 PUSH_BYTE(utf8, cur, (U8)(aiv & 0xff)); 2594 } 2595 break; 2596 case 'W': { 2597 char *end; 2598 U8 in_bytes = (U8)IN_BYTES; 2599 2600 end = start+SvLEN(cat)-1; 2601 if (utf8) end -= UTF8_MAXLEN-1; 2602 while (len-- > 0) { 2603 UV auv; 2604 fromstr = NEXTFROM; 2605 auv = SvUV_no_inf(fromstr, datumtype); 2606 if (in_bytes) auv = auv % 0x100; 2607 if (utf8) { 2608 W_utf8: 2609 if (cur >= end) { 2610 *cur = '\0'; 2611 SvCUR_set(cat, cur - start); 2612 2613 GROWING(0, cat, start, cur, len+UTF8_MAXLEN); 2614 end = start+SvLEN(cat)-UTF8_MAXLEN; 2615 } 2616 cur = (char *) uvchr_to_utf8_flags((U8 *) cur, auv, 0); 2617 } else { 2618 if (auv >= 0x100) { 2619 if (!SvUTF8(cat)) { 2620 *cur = '\0'; 2621 SvCUR_set(cat, cur - start); 2622 marked_upgrade(aTHX_ cat, symptr); 2623 lookahead.flags |= FLAG_DO_UTF8; 2624 lookahead.strbeg = symptr->strbeg; 2625 utf8 = 1; 2626 start = SvPVX(cat); 2627 cur = start + SvCUR(cat); 2628 end = start+SvLEN(cat)-UTF8_MAXLEN; 2629 goto W_utf8; 2630 } 2631 Perl_ck_warner(aTHX_ packWARN(WARN_PACK), 2632 "Character in 'W' format wrapped in pack"); 2633 auv &= 0xff; 2634 } 2635 if (cur >= end) { 2636 *cur = '\0'; 2637 SvCUR_set(cat, cur - start); 2638 GROWING(0, cat, start, cur, len+1); 2639 end = start+SvLEN(cat)-1; 2640 } 2641 *(U8 *) cur++ = (U8)auv; 2642 } 2643 } 2644 break; 2645 } 2646 case 'U': { 2647 char *end; 2648 2649 if (len == 0) { 2650 if (!(symptr->flags & FLAG_DO_UTF8)) { 2651 marked_upgrade(aTHX_ cat, symptr); 2652 lookahead.flags |= FLAG_DO_UTF8; 2653 lookahead.strbeg = symptr->strbeg; 2654 } 2655 utf8 = 0; 2656 goto no_change; 2657 } 2658 2659 end = start+SvLEN(cat); 2660 if (!utf8) end -= UTF8_MAXLEN; 2661 while (len-- > 0) { 2662 UV auv; 2663 fromstr = NEXTFROM; 2664 auv = SvUV_no_inf(fromstr, datumtype); 2665 if (utf8) { 2666 U8 buffer[UTF8_MAXLEN+1], *endb; 2667 endb = uvchr_to_utf8_flags(buffer, UNI_TO_NATIVE(auv), 0); 2668 if (cur+(endb-buffer)*UTF8_EXPAND >= end) { 2669 *cur = '\0'; 2670 SvCUR_set(cat, cur - start); 2671 GROWING(0, cat, start, cur, 2672 len+(endb-buffer)*UTF8_EXPAND); 2673 end = start+SvLEN(cat); 2674 } 2675 cur = my_bytes_to_utf8(buffer, endb-buffer, cur, 0); 2676 } else { 2677 if (cur >= end) { 2678 *cur = '\0'; 2679 SvCUR_set(cat, cur - start); 2680 GROWING(0, cat, start, cur, len+UTF8_MAXLEN); 2681 end = start+SvLEN(cat)-UTF8_MAXLEN; 2682 } 2683 cur = (char *) uvchr_to_utf8_flags((U8 *) cur, 2684 UNI_TO_NATIVE(auv), 2685 0); 2686 } 2687 } 2688 break; 2689 } 2690 /* Float and double added by gnb@melba.bby.oz.au 22/11/89 */ 2691 case 'f': 2692 while (len-- > 0) { 2693 float afloat; 2694 NV anv; 2695 fromstr = NEXTFROM; 2696 anv = SvNV(fromstr); 2697 # if (defined(VMS) && !defined(_IEEE_FP)) || defined(DOUBLE_IS_VAX_FLOAT) 2698 /* IEEE fp overflow shenanigans are unavailable on VAX and optional 2699 * on Alpha; fake it if we don't have them. 2700 */ 2701 if (anv > FLT_MAX) 2702 afloat = FLT_MAX; 2703 else if (anv < -FLT_MAX) 2704 afloat = -FLT_MAX; 2705 else afloat = (float)anv; 2706 # else 2707 # if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan) 2708 if(Perl_isnan(anv)) 2709 afloat = (float)NV_NAN; 2710 else 2711 # endif 2712 # ifdef NV_INF 2713 /* a simple cast to float is undefined if outside 2714 * the range of values that can be represented */ 2715 afloat = (float)(anv > FLT_MAX ? NV_INF : 2716 anv < -FLT_MAX ? -NV_INF : anv); 2717 # endif 2718 # endif 2719 PUSH_VAR(utf8, cur, afloat, needs_swap); 2720 } 2721 break; 2722 case 'd': 2723 while (len-- > 0) { 2724 double adouble; 2725 NV anv; 2726 fromstr = NEXTFROM; 2727 anv = SvNV(fromstr); 2728 # if (defined(VMS) && !defined(_IEEE_FP)) || defined(DOUBLE_IS_VAX_FLOAT) 2729 /* IEEE fp overflow shenanigans are unavailable on VAX and optional 2730 * on Alpha; fake it if we don't have them. 2731 */ 2732 if (anv > DBL_MAX) 2733 adouble = DBL_MAX; 2734 else if (anv < -DBL_MAX) 2735 adouble = -DBL_MAX; 2736 else adouble = (double)anv; 2737 # else 2738 adouble = (double)anv; 2739 # endif 2740 PUSH_VAR(utf8, cur, adouble, needs_swap); 2741 } 2742 break; 2743 case 'F': { 2744 NV_bytes anv; 2745 Zero(&anv, 1, NV); /* can be long double with unused bits */ 2746 while (len-- > 0) { 2747 fromstr = NEXTFROM; 2748 #ifdef __GNUC__ 2749 /* to work round a gcc/x86 bug; don't use SvNV */ 2750 anv.nv = sv_2nv(fromstr); 2751 # if defined(LONGDOUBLE_X86_80_BIT) && defined(USE_LONG_DOUBLE) \ 2752 && LONG_DOUBLESIZE > 10 2753 /* GCC sometimes overwrites the padding in the 2754 assignment above */ 2755 Zero(anv.bytes+10, sizeof(anv.bytes) - 10, U8); 2756 # endif 2757 #else 2758 anv.nv = SvNV(fromstr); 2759 #endif 2760 PUSH_BYTES(utf8, cur, anv.bytes, sizeof(anv.bytes), needs_swap); 2761 } 2762 break; 2763 } 2764 #if defined(HAS_LONG_DOUBLE) && defined(USE_LONG_DOUBLE) 2765 case 'D': { 2766 ld_bytes aldouble; 2767 /* long doubles can have unused bits, which may be nonzero */ 2768 Zero(&aldouble, 1, long double); 2769 while (len-- > 0) { 2770 fromstr = NEXTFROM; 2771 # ifdef __GNUC__ 2772 /* to work round a gcc/x86 bug; don't use SvNV */ 2773 aldouble.ld = (long double)sv_2nv(fromstr); 2774 # if defined(LONGDOUBLE_X86_80_BIT) && LONG_DOUBLESIZE > 10 2775 /* GCC sometimes overwrites the padding in the 2776 assignment above */ 2777 Zero(aldouble.bytes+10, sizeof(aldouble.bytes) - 10, U8); 2778 # endif 2779 # else 2780 aldouble.ld = (long double)SvNV(fromstr); 2781 # endif 2782 PUSH_BYTES(utf8, cur, aldouble.bytes, sizeof(aldouble.bytes), 2783 needs_swap); 2784 } 2785 break; 2786 } 2787 #endif 2788 case 'n' | TYPE_IS_SHRIEKING: 2789 case 'n': 2790 while (len-- > 0) { 2791 I16 ai16; 2792 fromstr = NEXTFROM; 2793 ai16 = (I16)SvIV_no_inf(fromstr, datumtype); 2794 ai16 = PerlSock_htons(ai16); 2795 PUSH16(utf8, cur, &ai16, FALSE); 2796 } 2797 break; 2798 case 'v' | TYPE_IS_SHRIEKING: 2799 case 'v': 2800 while (len-- > 0) { 2801 I16 ai16; 2802 fromstr = NEXTFROM; 2803 ai16 = (I16)SvIV_no_inf(fromstr, datumtype); 2804 ai16 = htovs(ai16); 2805 PUSH16(utf8, cur, &ai16, FALSE); 2806 } 2807 break; 2808 case 'S' | TYPE_IS_SHRIEKING: 2809 #if SHORTSIZE != SIZE16 2810 while (len-- > 0) { 2811 unsigned short aushort; 2812 fromstr = NEXTFROM; 2813 aushort = SvUV_no_inf(fromstr, datumtype); 2814 PUSH_VAR(utf8, cur, aushort, needs_swap); 2815 } 2816 break; 2817 #else 2818 /* FALLTHROUGH */ 2819 #endif 2820 case 'S': 2821 while (len-- > 0) { 2822 U16 au16; 2823 fromstr = NEXTFROM; 2824 au16 = (U16)SvUV_no_inf(fromstr, datumtype); 2825 PUSH16(utf8, cur, &au16, needs_swap); 2826 } 2827 break; 2828 case 's' | TYPE_IS_SHRIEKING: 2829 #if SHORTSIZE != SIZE16 2830 while (len-- > 0) { 2831 short ashort; 2832 fromstr = NEXTFROM; 2833 ashort = SvIV_no_inf(fromstr, datumtype); 2834 PUSH_VAR(utf8, cur, ashort, needs_swap); 2835 } 2836 break; 2837 #else 2838 /* FALLTHROUGH */ 2839 #endif 2840 case 's': 2841 while (len-- > 0) { 2842 I16 ai16; 2843 fromstr = NEXTFROM; 2844 ai16 = (I16)SvIV_no_inf(fromstr, datumtype); 2845 PUSH16(utf8, cur, &ai16, needs_swap); 2846 } 2847 break; 2848 case 'I': 2849 case 'I' | TYPE_IS_SHRIEKING: 2850 while (len-- > 0) { 2851 unsigned int auint; 2852 fromstr = NEXTFROM; 2853 auint = SvUV_no_inf(fromstr, datumtype); 2854 PUSH_VAR(utf8, cur, auint, needs_swap); 2855 } 2856 break; 2857 case 'j': 2858 while (len-- > 0) { 2859 IV aiv; 2860 fromstr = NEXTFROM; 2861 aiv = SvIV_no_inf(fromstr, datumtype); 2862 PUSH_VAR(utf8, cur, aiv, needs_swap); 2863 } 2864 break; 2865 case 'J': 2866 while (len-- > 0) { 2867 UV auv; 2868 fromstr = NEXTFROM; 2869 auv = SvUV_no_inf(fromstr, datumtype); 2870 PUSH_VAR(utf8, cur, auv, needs_swap); 2871 } 2872 break; 2873 case 'w': 2874 while (len-- > 0) { 2875 NV anv; 2876 fromstr = NEXTFROM; 2877 S_sv_check_infnan(aTHX_ fromstr, datumtype); 2878 anv = SvNV_nomg(fromstr); 2879 2880 if (anv < 0) { 2881 *cur = '\0'; 2882 SvCUR_set(cat, cur - start); 2883 Perl_croak(aTHX_ "Cannot compress negative numbers in pack"); 2884 } 2885 2886 /* 0xFFFFFFFFFFFFFFFF may cast to 18446744073709551616.0, 2887 which is == UV_MAX_P1. IOK is fine (instead of UV_only), as 2888 any negative IVs will have already been got by the croak() 2889 above. IOK is untrue for fractions, so we test them 2890 against UV_MAX_P1. */ 2891 if (SvIOK(fromstr) || anv < UV_MAX_P1) { 2892 char buf[(sizeof(UV)*CHAR_BIT)/7+1]; 2893 char *in = buf + sizeof(buf); 2894 UV auv = SvUV_nomg(fromstr); 2895 2896 do { 2897 *--in = (char)((auv & 0x7f) | 0x80); 2898 auv >>= 7; 2899 } while (auv); 2900 buf[sizeof(buf) - 1] &= 0x7f; /* clear continue bit */ 2901 PUSH_GROWING_BYTES(utf8, cat, start, cur, 2902 in, (buf + sizeof(buf)) - in); 2903 } else if (SvPOKp(fromstr)) 2904 goto w_string; 2905 else if (SvNOKp(fromstr)) { 2906 /* 10**NV_MAX_10_EXP is the largest power of 10 2907 so 10**(NV_MAX_10_EXP+1) is definitely unrepresentable 2908 given 10**(NV_MAX_10_EXP+1) == 128 ** x solve for x: 2909 x = (NV_MAX_10_EXP+1) * log (10) / log (128) 2910 And with that many bytes only Inf can overflow. 2911 Some C compilers are strict about integral constant 2912 expressions so we conservatively divide by a slightly 2913 smaller integer instead of multiplying by the exact 2914 floating-point value. 2915 */ 2916 #ifdef NV_MAX_10_EXP 2917 /* char buf[1 + (int)((NV_MAX_10_EXP + 1) * 0.47456)]; -- invalid C */ 2918 char buf[1 + (int)((NV_MAX_10_EXP + 1) / 2)]; /* valid C */ 2919 #else 2920 /* char buf[1 + (int)((308 + 1) * 0.47456)]; -- invalid C */ 2921 char buf[1 + (int)((308 + 1) / 2)]; /* valid C */ 2922 #endif 2923 char *in = buf + sizeof(buf); 2924 2925 anv = Perl_floor(anv); 2926 do { 2927 const NV next = Perl_floor(anv / 128); 2928 if (in <= buf) /* this cannot happen ;-) */ 2929 Perl_croak(aTHX_ "Cannot compress integer in pack"); 2930 *--in = (unsigned char)(anv - (next * 128)) | 0x80; 2931 anv = next; 2932 } while (anv > 0); 2933 buf[sizeof(buf) - 1] &= 0x7f; /* clear continue bit */ 2934 PUSH_GROWING_BYTES(utf8, cat, start, cur, 2935 in, (buf + sizeof(buf)) - in); 2936 } else { 2937 const char *from; 2938 char *result, *in; 2939 SV *norm; 2940 STRLEN len; 2941 bool done; 2942 2943 w_string: 2944 /* Copy string and check for compliance */ 2945 from = SvPV_nomg_const(fromstr, len); 2946 if ((norm = is_an_int(from, len)) == NULL) 2947 Perl_croak(aTHX_ "Can only compress unsigned integers in pack"); 2948 2949 Newx(result, len, char); 2950 in = result + len; 2951 done = FALSE; 2952 while (!done) *--in = div128(norm, &done) | 0x80; 2953 result[len - 1] &= 0x7F; /* clear continue bit */ 2954 PUSH_GROWING_BYTES(utf8, cat, start, cur, 2955 in, (result + len) - in); 2956 Safefree(result); 2957 SvREFCNT_dec(norm); /* free norm */ 2958 } 2959 } 2960 break; 2961 case 'i': 2962 case 'i' | TYPE_IS_SHRIEKING: 2963 while (len-- > 0) { 2964 int aint; 2965 fromstr = NEXTFROM; 2966 aint = SvIV_no_inf(fromstr, datumtype); 2967 PUSH_VAR(utf8, cur, aint, needs_swap); 2968 } 2969 break; 2970 case 'N' | TYPE_IS_SHRIEKING: 2971 case 'N': 2972 while (len-- > 0) { 2973 U32 au32; 2974 fromstr = NEXTFROM; 2975 au32 = SvUV_no_inf(fromstr, datumtype); 2976 au32 = PerlSock_htonl(au32); 2977 PUSH32(utf8, cur, &au32, FALSE); 2978 } 2979 break; 2980 case 'V' | TYPE_IS_SHRIEKING: 2981 case 'V': 2982 while (len-- > 0) { 2983 U32 au32; 2984 fromstr = NEXTFROM; 2985 au32 = SvUV_no_inf(fromstr, datumtype); 2986 au32 = htovl(au32); 2987 PUSH32(utf8, cur, &au32, FALSE); 2988 } 2989 break; 2990 case 'L' | TYPE_IS_SHRIEKING: 2991 #if LONGSIZE != SIZE32 2992 while (len-- > 0) { 2993 unsigned long aulong; 2994 fromstr = NEXTFROM; 2995 aulong = SvUV_no_inf(fromstr, datumtype); 2996 PUSH_VAR(utf8, cur, aulong, needs_swap); 2997 } 2998 break; 2999 #else 3000 /* Fall though! */ 3001 #endif 3002 case 'L': 3003 while (len-- > 0) { 3004 U32 au32; 3005 fromstr = NEXTFROM; 3006 au32 = SvUV_no_inf(fromstr, datumtype); 3007 PUSH32(utf8, cur, &au32, needs_swap); 3008 } 3009 break; 3010 case 'l' | TYPE_IS_SHRIEKING: 3011 #if LONGSIZE != SIZE32 3012 while (len-- > 0) { 3013 long along; 3014 fromstr = NEXTFROM; 3015 along = SvIV_no_inf(fromstr, datumtype); 3016 PUSH_VAR(utf8, cur, along, needs_swap); 3017 } 3018 break; 3019 #else 3020 /* Fall though! */ 3021 #endif 3022 case 'l': 3023 while (len-- > 0) { 3024 I32 ai32; 3025 fromstr = NEXTFROM; 3026 ai32 = SvIV_no_inf(fromstr, datumtype); 3027 PUSH32(utf8, cur, &ai32, needs_swap); 3028 } 3029 break; 3030 #if defined(HAS_QUAD) && IVSIZE >= 8 3031 case 'Q': 3032 while (len-- > 0) { 3033 Uquad_t auquad; 3034 fromstr = NEXTFROM; 3035 auquad = (Uquad_t) SvUV_no_inf(fromstr, datumtype); 3036 PUSH_VAR(utf8, cur, auquad, needs_swap); 3037 } 3038 break; 3039 case 'q': 3040 while (len-- > 0) { 3041 Quad_t aquad; 3042 fromstr = NEXTFROM; 3043 aquad = (Quad_t)SvIV_no_inf(fromstr, datumtype); 3044 PUSH_VAR(utf8, cur, aquad, needs_swap); 3045 } 3046 break; 3047 #endif 3048 case 'P': 3049 len = 1; /* assume SV is correct length */ 3050 GROWING(utf8, cat, start, cur, sizeof(char *)); 3051 /* FALLTHROUGH */ 3052 case 'p': 3053 while (len-- > 0) { 3054 const char *aptr; 3055 3056 fromstr = NEXTFROM; 3057 SvGETMAGIC(fromstr); 3058 if (!SvOK(fromstr)) aptr = NULL; 3059 else { 3060 /* XXX better yet, could spirit away the string to 3061 * a safe spot and hang on to it until the result 3062 * of pack() (and all copies of the result) are 3063 * gone. 3064 */ 3065 if (((SvTEMP(fromstr) && SvREFCNT(fromstr) == 1) 3066 || (SvPADTMP(fromstr) && 3067 !SvREADONLY(fromstr)))) { 3068 Perl_ck_warner(aTHX_ packWARN(WARN_PACK), 3069 "Attempt to pack pointer to temporary value"); 3070 } 3071 if (SvPOK(fromstr) || SvNIOK(fromstr)) 3072 aptr = SvPV_nomg_const_nolen(fromstr); 3073 else 3074 aptr = SvPV_force_flags_nolen(fromstr, 0); 3075 } 3076 PUSH_VAR(utf8, cur, aptr, needs_swap); 3077 } 3078 break; 3079 case 'u': { 3080 const char *aptr, *aend; 3081 bool from_utf8; 3082 3083 fromstr = NEXTFROM; 3084 if (len <= 2) len = 45; 3085 else len = len / 3 * 3; 3086 if (len >= 64) { 3087 Perl_ck_warner(aTHX_ packWARN(WARN_PACK), 3088 "Field too wide in 'u' format in pack"); 3089 len = 63; 3090 } 3091 aptr = SvPV_const(fromstr, fromlen); 3092 from_utf8 = DO_UTF8(fromstr); 3093 if (from_utf8) { 3094 aend = aptr + fromlen; 3095 fromlen = sv_len_utf8_nomg(fromstr); 3096 } else aend = NULL; /* Unused, but keep compilers happy */ 3097 GROWING(utf8, cat, start, cur, (fromlen+2) / 3 * 4 + (fromlen+len-1)/len * 2); 3098 while (fromlen > 0) { 3099 U8 *end; 3100 SSize_t todo; 3101 U8 hunk[1+63/3*4+1]; 3102 3103 if ((SSize_t)fromlen > len) 3104 todo = len; 3105 else 3106 todo = fromlen; 3107 if (from_utf8) { 3108 char buffer[64]; 3109 if (!S_utf8_to_bytes(aTHX_ &aptr, aend, buffer, todo, 3110 'u' | TYPE_IS_PACK)) { 3111 *cur = '\0'; 3112 SvCUR_set(cat, cur - start); 3113 Perl_croak(aTHX_ "panic: string is shorter than advertised, " 3114 "aptr=%p, aend=%p, buffer=%p, todo=%zd", 3115 aptr, aend, buffer, todo); 3116 } 3117 end = doencodes(hunk, (const U8 *)buffer, todo); 3118 } else { 3119 end = doencodes(hunk, (const U8 *)aptr, todo); 3120 aptr += todo; 3121 } 3122 PUSH_BYTES(utf8, cur, hunk, end-hunk, 0); 3123 fromlen -= todo; 3124 } 3125 break; 3126 } 3127 } 3128 *cur = '\0'; 3129 SvCUR_set(cat, cur - start); 3130 no_change: 3131 *symptr = lookahead; 3132 } 3133 return beglist; 3134 } 3135 #undef NEXTFROM 3136 3137 3138 PP(pp_pack) 3139 { 3140 dSP; dMARK; dORIGMARK; dTARGET; 3141 SV *cat = TARG; 3142 STRLEN fromlen; 3143 SV *pat_sv = *++MARK; 3144 const char *pat = SvPV_const(pat_sv, fromlen); 3145 const char *patend = pat + fromlen; 3146 3147 MARK++; 3148 SvPVCLEAR(cat); 3149 SvUTF8_off(cat); 3150 3151 packlist(cat, pat, patend, MARK, SP + 1); 3152 3153 if (SvUTF8(cat)) { 3154 STRLEN result_len; 3155 const char * result = SvPV_nomg(cat, result_len); 3156 const U8 * error_pos; 3157 3158 if (! is_utf8_string_loc((U8 *) result, result_len, &error_pos)) { 3159 _force_out_malformed_utf8_message(error_pos, 3160 (U8 *) result + result_len, 3161 0, /* no flags */ 3162 1 /* Die */ 3163 ); 3164 NOT_REACHED; /* NOTREACHED */ 3165 } 3166 } 3167 3168 SvSETMAGIC(cat); 3169 SP = ORIGMARK; 3170 PUSHs(cat); 3171 RETURN; 3172 } 3173 3174 /* 3175 * ex: set ts=8 sts=4 sw=4 et: 3176 */ 3177