/* strings -- print the strings of printable characters in files
   Copyright (C) 1993-2022 Free Software Foundation, Inc.

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
   02110-1301, USA. */

/* Usage: strings [options] file...

   Options:
   --all
   -a
   -            Scan each file in its entirety.

   --data
   -d           Scan only the initialized data section(s) of object files.

   --print-file-name
   -f           Print the name of the file before each string.

   --bytes=min-len
   -n min-len
   -min-len     Print graphic char sequences, MIN-LEN or more bytes long,
                that are followed by a NUL or a non-displayable character.
                Default is 4.

   --radix={o,x,d}
   -t {o,x,d}   Print the offset within the file before each string,
                in octal/hex/decimal.

   --include-all-whitespace
   -w           By default tab and space are the only whitespace included in
                graphic char sequences. This option considers all of
                isspace() valid.

   -o           Like -to. (Some other implementations have -o like -to,
                others like -td. We chose one arbitrarily.)

   --encoding={s,S,b,l,B,L}
   -e {s,S,b,l,B,L}
                Select character encoding: 7-bit-character, 8-bit-character,
                bigendian 16-bit, littleendian 16-bit, bigendian 32-bit,
                littleendian 32-bit.

   --target=BFDNAME
   -T {bfdname}
                Specify a non-default object file format.
58 59 --unicode={default|locale|invalid|hex|escape|highlight} 60 -U {d|l|i|x|e|h} 61 Determine how to handle UTF-8 unicode characters. The default 62 is no special treatment. All other versions of this option 63 only apply if the encoding is valid and enabling the option 64 implies --encoding=S. 65 The 'locale' option displays the characters according to the 66 current locale. The 'invalid' option treats them as 67 non-string characters. The 'hex' option displays them as hex 68 byte sequences. The 'escape' option displays them as escape 69 sequences and the 'highlight' option displays them as 70 coloured escape sequences. 71 72 --output-separator=sep_string 73 -s sep_string String used to separate parsed strings in output. 74 Default is newline. 75 76 --help 77 -h Print the usage message on the standard output. 78 79 --version 80 -V 81 -v Print the program version number. 82 83 Written by Richard Stallman <rms@gnu.ai.mit.edu> 84 and David MacKenzie <djm@gnu.ai.mit.edu>. */ 85 86 #include "sysdep.h" 87 #include "bfd.h" 88 #include "getopt.h" 89 #include "libiberty.h" 90 #include "safe-ctype.h" 91 #include "bucomm.h" 92 93 #ifndef streq 94 #define streq(a,b) (strcmp ((a),(b)) == 0) 95 #endif 96 97 typedef enum unicode_display_type 98 { 99 unicode_default = 0, 100 unicode_locale, 101 unicode_escape, 102 unicode_hex, 103 unicode_highlight, 104 unicode_invalid 105 } unicode_display_type; 106 107 static unicode_display_type unicode_display = unicode_default; 108 109 #define STRING_ISGRAPHIC(c) \ 110 ( (c) >= 0 \ 111 && (c) <= 255 \ 112 && ((c) == '\t' || ISPRINT (c) || (encoding == 'S' && (c) > 127) \ 113 || (include_all_whitespace && ISSPACE (c))) \ 114 ) 115 116 #ifndef errno 117 extern int errno; 118 #endif 119 120 /* The BFD section flags that identify an initialized data section. */ 121 #define DATA_FLAGS (SEC_ALLOC | SEC_LOAD | SEC_HAS_CONTENTS) 122 123 /* Radix for printing addresses (must be 8, 10 or 16). 
*/ 124 static int address_radix; 125 126 /* Minimum length of sequence of graphic chars to trigger output. */ 127 static unsigned int string_min; 128 129 /* Whether or not we include all whitespace as a graphic char. */ 130 static bool include_all_whitespace; 131 132 /* TRUE means print address within file for each string. */ 133 static bool print_addresses; 134 135 /* TRUE means print filename for each string. */ 136 static bool print_filenames; 137 138 /* TRUE means for object files scan only the data section. */ 139 static bool datasection_only; 140 141 /* The BFD object file format. */ 142 static char *target; 143 144 /* The character encoding format. */ 145 static char encoding; 146 static int encoding_bytes; 147 148 /* Output string used to separate parsed strings */ 149 static char *output_separator; 150 151 static struct option long_options[] = 152 { 153 {"all", no_argument, NULL, 'a'}, 154 {"bytes", required_argument, NULL, 'n'}, 155 {"data", no_argument, NULL, 'd'}, 156 {"encoding", required_argument, NULL, 'e'}, 157 {"help", no_argument, NULL, 'h'}, 158 {"include-all-whitespace", no_argument, NULL, 'w'}, 159 {"output-separator", required_argument, NULL, 's'}, 160 {"print-file-name", no_argument, NULL, 'f'}, 161 {"radix", required_argument, NULL, 't'}, 162 {"target", required_argument, NULL, 'T'}, 163 {"unicode", required_argument, NULL, 'U'}, 164 {"version", no_argument, NULL, 'v'}, 165 {NULL, 0, NULL, 0} 166 }; 167 168 static bool strings_file (char *); 169 static void print_strings (const char *, FILE *, file_ptr, int, char *); 170 static void usage (FILE *, int) ATTRIBUTE_NORETURN; 171 172 int main (int, char **); 173 174 int 175 main (int argc, char **argv) 176 { 177 int optc; 178 int exit_status = 0; 179 bool files_given = false; 180 char *s; 181 int numeric_opt = 0; 182 183 setlocale (LC_ALL, ""); 184 bindtextdomain (PACKAGE, LOCALEDIR); 185 textdomain (PACKAGE); 186 187 program_name = argv[0]; 188 xmalloc_set_program_name (program_name); 189 
bfd_set_error_program_name (program_name); 190 191 expandargv (&argc, &argv); 192 193 string_min = 4; 194 include_all_whitespace = false; 195 print_addresses = false; 196 print_filenames = false; 197 if (DEFAULT_STRINGS_ALL) 198 datasection_only = false; 199 else 200 datasection_only = true; 201 target = NULL; 202 encoding = 's'; 203 output_separator = NULL; 204 205 while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789", 206 long_options, (int *) 0)) != EOF) 207 { 208 switch (optc) 209 { 210 case 'a': 211 datasection_only = false; 212 break; 213 214 case 'd': 215 datasection_only = true; 216 break; 217 218 case 'f': 219 print_filenames = true; 220 break; 221 222 case 'H': 223 case 'h': 224 usage (stdout, 0); 225 226 case 'n': 227 string_min = (int) strtoul (optarg, &s, 0); 228 if (s != NULL && *s != 0) 229 fatal (_("invalid integer argument %s"), optarg); 230 break; 231 232 case 'w': 233 include_all_whitespace = true; 234 break; 235 236 case 'o': 237 print_addresses = true; 238 address_radix = 8; 239 break; 240 241 case 't': 242 print_addresses = true; 243 if (optarg[1] != '\0') 244 usage (stderr, 1); 245 switch (optarg[0]) 246 { 247 case 'o': 248 address_radix = 8; 249 break; 250 251 case 'd': 252 address_radix = 10; 253 break; 254 255 case 'x': 256 address_radix = 16; 257 break; 258 259 default: 260 usage (stderr, 1); 261 } 262 break; 263 264 case 'T': 265 target = optarg; 266 break; 267 268 case 'e': 269 if (optarg[1] != '\0') 270 usage (stderr, 1); 271 encoding = optarg[0]; 272 break; 273 274 case 's': 275 output_separator = optarg; 276 break; 277 278 case 'U': 279 if (streq (optarg, "default") || streq (optarg, "d")) 280 unicode_display = unicode_default; 281 else if (streq (optarg, "locale") || streq (optarg, "l")) 282 unicode_display = unicode_locale; 283 else if (streq (optarg, "escape") || streq (optarg, "e")) 284 unicode_display = unicode_escape; 285 else if (streq (optarg, "invalid") || streq (optarg, "i")) 286 unicode_display = 
unicode_invalid; 287 else if (streq (optarg, "hex") || streq (optarg, "x")) 288 unicode_display = unicode_hex; 289 else if (streq (optarg, "highlight") || streq (optarg, "h")) 290 unicode_display = unicode_highlight; 291 else 292 fatal (_("invalid argument to -U/--unicode: %s"), optarg); 293 break; 294 295 case 'V': 296 case 'v': 297 print_version ("strings"); 298 break; 299 300 case '?': 301 usage (stderr, 1); 302 303 default: 304 numeric_opt = optind; 305 break; 306 } 307 } 308 309 if (unicode_display != unicode_default) 310 encoding = 'S'; 311 312 if (numeric_opt != 0) 313 { 314 string_min = (int) strtoul (argv[numeric_opt - 1] + 1, &s, 0); 315 if (s != NULL && *s != 0) 316 fatal (_("invalid integer argument %s"), argv[numeric_opt - 1] + 1); 317 } 318 if (string_min < 1) 319 fatal (_("invalid minimum string length %d"), string_min); 320 321 switch (encoding) 322 { 323 case 'S': 324 case 's': 325 encoding_bytes = 1; 326 break; 327 case 'b': 328 case 'l': 329 encoding_bytes = 2; 330 break; 331 case 'B': 332 case 'L': 333 encoding_bytes = 4; 334 break; 335 default: 336 usage (stderr, 1); 337 } 338 339 if (bfd_init () != BFD_INIT_MAGIC) 340 fatal (_("fatal error: libbfd ABI mismatch")); 341 set_default_bfd_target (); 342 343 if (optind >= argc) 344 { 345 datasection_only = false; 346 SET_BINARY (fileno (stdin)); 347 print_strings ("{standard input}", stdin, 0, 0, (char *) NULL); 348 files_given = true; 349 } 350 else 351 { 352 for (; optind < argc; ++optind) 353 { 354 if (streq (argv[optind], "-")) 355 datasection_only = false; 356 else 357 { 358 files_given = true; 359 exit_status |= !strings_file (argv[optind]); 360 } 361 } 362 } 363 364 if (!files_given) 365 usage (stderr, 1); 366 367 return (exit_status); 368 } 369 370 /* Scan section SECT of the file ABFD, whose printable name is 371 FILENAME. If it contains initialized data set GOT_A_SECTION and 372 print the strings in it. 
*/ 373 374 static void 375 strings_a_section (bfd *abfd, asection *sect, const char *filename, 376 bool *got_a_section) 377 { 378 bfd_size_type sectsize; 379 bfd_byte *mem; 380 381 if ((sect->flags & DATA_FLAGS) != DATA_FLAGS) 382 return; 383 384 sectsize = bfd_section_size (sect); 385 if (sectsize == 0) 386 return; 387 388 if (!bfd_malloc_and_get_section (abfd, sect, &mem)) 389 { 390 non_fatal (_("%s: Reading section %s failed: %s"), 391 filename, sect->name, bfd_errmsg (bfd_get_error ())); 392 return; 393 } 394 395 *got_a_section = true; 396 print_strings (filename, NULL, sect->filepos, sectsize, (char *) mem); 397 free (mem); 398 } 399 400 /* Scan all of the sections in FILE, and print the strings 401 in the initialized data section(s). 402 403 Return TRUE if successful, 404 FALSE if not (such as if FILE is not an object file). */ 405 406 static bool 407 strings_object_file (const char *file) 408 { 409 bfd *abfd; 410 asection *s; 411 bool got_a_section; 412 413 abfd = bfd_openr (file, target); 414 415 if (abfd == NULL) 416 /* Treat the file as a non-object file. */ 417 return false; 418 419 /* This call is mainly for its side effect of reading in the sections. 420 We follow the traditional behavior of `strings' in that we don't 421 complain if we don't recognize a file to be an object file. */ 422 if (!bfd_check_format (abfd, bfd_object)) 423 { 424 bfd_close (abfd); 425 return false; 426 } 427 428 got_a_section = false; 429 for (s = abfd->sections; s != NULL; s = s->next) 430 strings_a_section (abfd, s, file, &got_a_section); 431 432 if (!bfd_close (abfd)) 433 { 434 bfd_nonfatal (file); 435 return false; 436 } 437 438 return got_a_section; 439 } 440 441 /* Print the strings in FILE. Return TRUE if ok, FALSE if an error occurs. */ 442 443 static bool 444 strings_file (char *file) 445 { 446 struct stat st; 447 448 /* get_file_size does not support non-S_ISREG files. 
*/ 449 450 if (stat (file, &st) < 0) 451 { 452 if (errno == ENOENT) 453 non_fatal (_("'%s': No such file"), file); 454 else 455 non_fatal (_("Warning: could not locate '%s'. reason: %s"), 456 file, strerror (errno)); 457 return false; 458 } 459 else if (S_ISDIR (st.st_mode)) 460 { 461 non_fatal (_("Warning: '%s' is a directory"), file); 462 return false; 463 } 464 465 /* If we weren't told to scan the whole file, 466 try to open it as an object file and only look at 467 initialized data sections. If that fails, fall back to the 468 whole file. */ 469 if (!datasection_only || !strings_object_file (file)) 470 { 471 FILE *stream; 472 473 stream = fopen (file, FOPEN_RB); 474 if (stream == NULL) 475 { 476 fprintf (stderr, "%s: ", program_name); 477 perror (file); 478 return false; 479 } 480 481 print_strings (file, stream, (file_ptr) 0, 0, (char *) NULL); 482 483 if (fclose (stream) == EOF) 484 { 485 fprintf (stderr, "%s: ", program_name); 486 perror (file); 487 return false; 488 } 489 } 490 491 return true; 492 } 493 494 /* Read the next character, return EOF if none available. 495 Assume that STREAM is positioned so that the next byte read 496 is at address ADDRESS in the file. 497 498 If STREAM is NULL, do not read from it. 499 The caller can supply a buffer of characters 500 to be processed before the data in STREAM. 501 MAGIC is the address of the buffer and 502 MAGICCOUNT is how many characters are in it. */ 503 504 static long 505 get_char (FILE *stream, file_ptr *address, int *magiccount, char **magic) 506 { 507 int c, i; 508 long r = 0; 509 510 for (i = 0; i < encoding_bytes; i++) 511 { 512 if (*magiccount) 513 { 514 (*magiccount)--; 515 c = *(*magic)++; 516 } 517 else 518 { 519 if (stream == NULL) 520 return EOF; 521 522 /* Only use getc_unlocked if we found a declaration for it. 523 Otherwise, libc is not thread safe by default, and we 524 should not use it. 
*/ 525 526 #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED 527 c = getc_unlocked (stream); 528 #else 529 c = getc (stream); 530 #endif 531 if (c == EOF) 532 return EOF; 533 } 534 535 (*address)++; 536 r = (r << 8) | (c & 0xff); 537 } 538 539 switch (encoding) 540 { 541 default: 542 break; 543 case 'l': 544 r = ((r & 0xff) << 8) | ((r & 0xff00) >> 8); 545 break; 546 case 'L': 547 r = (((r & 0xff) << 24) | ((r & 0xff00) << 8) 548 | ((r & 0xff0000) >> 8) | ((r & 0xff000000) >> 24)); 549 break; 550 } 551 552 return r; 553 } 554 555 /* Throw away one byte of a (possibly) multi-byte char C, updating 556 address and buffer to suit. */ 557 558 static void 559 unget_part_char (long c, file_ptr *address, int *magiccount, char **magic) 560 { 561 static char tmp[4]; 562 563 if (encoding_bytes > 1) 564 { 565 *address -= encoding_bytes - 1; 566 567 if (*magiccount == 0) 568 { 569 /* If no magic buffer exists, use temp buffer. */ 570 switch (encoding) 571 { 572 default: 573 break; 574 case 'b': 575 tmp[0] = c & 0xff; 576 *magiccount = 1; 577 break; 578 case 'l': 579 tmp[0] = (c >> 8) & 0xff; 580 *magiccount = 1; 581 break; 582 case 'B': 583 tmp[0] = (c >> 16) & 0xff; 584 tmp[1] = (c >> 8) & 0xff; 585 tmp[2] = c & 0xff; 586 *magiccount = 3; 587 break; 588 case 'L': 589 tmp[0] = (c >> 8) & 0xff; 590 tmp[1] = (c >> 16) & 0xff; 591 tmp[2] = (c >> 24) & 0xff; 592 *magiccount = 3; 593 break; 594 } 595 *magic = tmp; 596 } 597 else 598 { 599 /* If magic buffer exists, rewind. */ 600 *magic -= encoding_bytes - 1; 601 *magiccount += encoding_bytes - 1; 602 } 603 } 604 } 605 606 static void 607 print_filename_and_address (const char * filename, file_ptr address) 608 { 609 if (print_filenames) 610 printf ("%s: ", filename); 611 612 if (! 
print_addresses) 613 return; 614 615 switch (address_radix) 616 { 617 case 8: 618 if (sizeof (address) > sizeof (long)) 619 { 620 #ifndef __MSVCRT__ 621 printf ("%7llo ", (unsigned long long) address); 622 #else 623 printf ("%7I64o ", (unsigned long long) address); 624 #endif 625 } 626 else 627 printf ("%7lo ", (unsigned long) address); 628 break; 629 630 case 10: 631 if (sizeof (address) > sizeof (long)) 632 { 633 #ifndef __MSVCRT__ 634 printf ("%7llu ", (unsigned long long) address); 635 #else 636 printf ("%7I64d ", (unsigned long long) address); 637 #endif 638 } 639 else 640 printf ("%7ld ", (long) address); 641 break; 642 643 case 16: 644 if (sizeof (address) > sizeof (long)) 645 { 646 #ifndef __MSVCRT__ 647 printf ("%7llx ", (unsigned long long) address); 648 #else 649 printf ("%7I64x ", (unsigned long long) address); 650 #endif 651 } 652 else 653 printf ("%7lx ", (unsigned long) address); 654 break; 655 } 656 } 657 658 /* Return non-zero if the bytes starting at BUFFER form a valid UTF-8 encoding. 659 If the encoding is valid then returns the number of bytes it uses. */ 660 661 static unsigned int 662 is_valid_utf8 (const unsigned char * buffer, unsigned long buflen) 663 { 664 if (buffer[0] < 0xc0) 665 return 0; 666 667 if (buflen < 2) 668 return 0; 669 670 if ((buffer[1] & 0xc0) != 0x80) 671 return 0; 672 673 if ((buffer[0] & 0x20) == 0) 674 return 2; 675 676 if (buflen < 3) 677 return 0; 678 679 if ((buffer[2] & 0xc0) != 0x80) 680 return 0; 681 682 if ((buffer[0] & 0x10) == 0) 683 return 3; 684 685 if (buflen < 4) 686 return 0; 687 688 if ((buffer[3] & 0xc0) != 0x80) 689 return 0; 690 691 return 4; 692 } 693 694 /* Display a UTF-8 encoded character in BUFFER according to the setting 695 of unicode_display. The character is known to be valid. 696 Returns the number of bytes consumed. 
*/

/* Output by display mode:
     escape     \uXXXX (two hex bytes) for 2- and 3-byte sequences,
                \uXXXXXX (three hex bytes) for 4-byte sequences;
     highlight  as escape, wrapped in colour codes when fd 1 is a tty;
     hex        <0x...> holding the raw bytes;
     locale     the raw bytes of the sequence, unchanged.  */

static unsigned int
display_utf8_char (const unsigned char * buffer)
{
  unsigned int j;
  unsigned int utf8_len;

  /* Bits 5 and 4 of the lead byte give the sequence length:
     110xxxxx -> 2, 1110xxxx -> 3, 1111xxxx -> 4. */
  switch (buffer[0] & 0x30)
    {
    case 0x00:
    case 0x10:
      utf8_len = 2;
      break;
    case 0x20:
      utf8_len = 3;
      break;
    default:
      utf8_len = 4;
    }

  switch (unicode_display)
    {
    default:
      fprintf (stderr, "ICE: unexpected unicode display type\n");
      break;

    case unicode_escape:
    case unicode_highlight:
      if (unicode_display == unicode_highlight && isatty (1))
        printf ("\x1B[31;47m"); /* Red. */

      switch (utf8_len)
        {
        case 2:
          /* 11-bit code point: 5 bits from byte 0, 6 from byte 1. */
          printf ("\\u%02x%02x",
                  ((buffer[0] & 0x1c) >> 2),
                  ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f));
          break;

        case 3:
          /* 16-bit code point: 4 bits from byte 0, 6 from each of
             bytes 1 and 2. */
          printf ("\\u%02x%02x",
                  ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2),
                  ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f)));
          break;

        case 4:
          /* 21-bit code point: 3 bits from byte 0, 6 from each of
             bytes 1, 2 and 3. Split into bits 20-16, 15-8 and 7-0. */
          printf ("\\u%02x%02x%02x",
                  ((buffer[0] & 0x07) << 2) | ((buffer[1] & 0x30) >> 4),
                  ((buffer[1] & 0x0f) << 4) | ((buffer[2] & 0x3c) >> 2),
                  ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f)));
          break;
        default:
          /* URG. */
          break;
        }

      if (unicode_display == unicode_highlight && isatty (1))
        printf ("\033[0m"); /* Default colour. */
      break;

    case unicode_hex:
      putchar ('<');
      printf ("0x");
      for (j = 0; j < utf8_len; j++)
        printf ("%02x", buffer [j]);
      putchar ('>');
      break;

    case unicode_locale:
      /* Emit every byte of the sequence so that the terminal can render
         the character. A "%.1s" conversion would stop after the lead
         byte, because a precision on %s counts bytes. */
      fwrite (buffer, 1, utf8_len, stdout);
      break;
    }

  return utf8_len;
}

/* Display strings in BUFFER. Treat any UTF-8 encoded characters encountered
   according to the setting of the unicode_display variable. The buffer
   contains BUFLEN bytes.

   Display the characters as if they started at ADDRESS and are contained in
   FILENAME.
*/ 779 780 static void 781 print_unicode_buffer (const char * filename, 782 file_ptr address, 783 const unsigned char * buffer, 784 unsigned long buflen) 785 { 786 /* Paranoia checks... */ 787 if (filename == NULL 788 || buffer == NULL 789 || unicode_display == unicode_default 790 || encoding != 'S' 791 || encoding_bytes != 1) 792 { 793 fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n"); 794 return; 795 } 796 797 if (buflen == 0) 798 return; 799 800 /* We must only display strings that are at least string_min *characters* 801 long. So we scan the buffer in two stages. First we locate the start 802 of a potential string. Then we walk along it until we have found 803 string_min characters. Then we go back to the start point and start 804 displaying characters according to the unicode_display setting. */ 805 806 unsigned long start_point = 0; 807 unsigned long i = 0; 808 unsigned int char_len = 1; 809 unsigned int num_found = 0; 810 811 for (i = 0; i < buflen; i += char_len) 812 { 813 int c = buffer[i]; 814 815 char_len = 1; 816 817 /* Find the first potential character of a string. */ 818 if (! STRING_ISGRAPHIC (c)) 819 { 820 num_found = 0; 821 continue; 822 } 823 824 if (c > 126) 825 { 826 if (c < 0xc0) 827 { 828 num_found = 0; 829 continue; 830 } 831 832 if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0) 833 { 834 char_len = 1; 835 num_found = 0; 836 continue; 837 } 838 839 if (unicode_display == unicode_invalid) 840 { 841 /* We have found a valid UTF-8 character, but we treat it as non-graphic. */ 842 num_found = 0; 843 continue; 844 } 845 } 846 847 if (num_found == 0) 848 /* We have found a potential starting point for a string. */ 849 start_point = i; 850 851 ++ num_found; 852 853 if (num_found >= string_min) 854 break; 855 } 856 857 if (num_found < string_min) 858 return; 859 860 print_filename_and_address (filename, address + start_point); 861 862 /* We have found string_min characters. Display them and any 863 more that follow. 
*/ 864 for (i = start_point; i < buflen; i += char_len) 865 { 866 int c = buffer[i]; 867 868 char_len = 1; 869 870 if (! STRING_ISGRAPHIC (c)) 871 break; 872 else if (c < 127) 873 putchar (c); 874 else if (! is_valid_utf8 (buffer + i, buflen - i)) 875 break; 876 else if (unicode_display == unicode_invalid) 877 break; 878 else 879 char_len = display_utf8_char (buffer + i); 880 } 881 882 if (output_separator) 883 fputs (output_separator, stdout); 884 else 885 putchar ('\n'); 886 887 /* FIXME: Using tail recursion here is lazy programming... */ 888 print_unicode_buffer (filename, address + i, buffer + i, buflen - i); 889 } 890 891 static int 892 get_unicode_byte (FILE * stream, 893 unsigned char * putback, 894 unsigned int * num_putback, 895 unsigned int * num_read) 896 { 897 if (* num_putback > 0) 898 { 899 * num_putback = * num_putback - 1; 900 return putback [* num_putback]; 901 } 902 903 * num_read = * num_read + 1; 904 905 #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED 906 return getc_unlocked (stream); 907 #else 908 return getc (stream); 909 #endif 910 } 911 912 /* Helper function for print_unicode_stream. */ 913 914 static void 915 print_unicode_stream_body (const char * filename, 916 file_ptr address, 917 FILE * stream, 918 unsigned char * putback_buf, 919 unsigned int num_putback, 920 unsigned char * print_buf) 921 { 922 /* It would be nice if we could just read the stream into a buffer 923 and then process if with print_unicode_buffer. But the input 924 might be huge or it might time-locked (eg stdin). So instead 925 we go one byte at a time... */ 926 927 file_ptr start_point = 0; 928 unsigned int num_read = 0; 929 unsigned int num_chars = 0; 930 unsigned int num_print = 0; 931 int c = 0; 932 933 /* Find a series of string_min characters. Put them into print_buf. */ 934 do 935 { 936 if (num_chars >= string_min) 937 break; 938 939 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read); 940 if (c == EOF) 941 break; 942 943 if (! 
STRING_ISGRAPHIC (c)) 944 { 945 num_chars = num_print = 0; 946 continue; 947 } 948 949 if (num_chars == 0) 950 start_point = num_read - 1; 951 952 if (c < 127) 953 { 954 print_buf[num_print] = c; 955 num_chars ++; 956 num_print ++; 957 continue; 958 } 959 960 if (c < 0xc0) 961 { 962 num_chars = num_print = 0; 963 continue; 964 } 965 966 /* We *might* have a UTF-8 sequence. Time to start peeking. */ 967 char utf8[4]; 968 969 utf8[0] = c; 970 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read); 971 if (c == EOF) 972 break; 973 utf8[1] = c; 974 975 if ((utf8[1] & 0xc0) != 0x80) 976 { 977 /* Invalid UTF-8. */ 978 putback_buf[num_putback++] = utf8[1]; 979 num_chars = num_print = 0; 980 continue; 981 } 982 else if ((utf8[0] & 0x20) == 0) 983 { 984 /* A valid 2-byte UTF-8 encoding. */ 985 if (unicode_display == unicode_invalid) 986 { 987 putback_buf[num_putback++] = utf8[1]; 988 num_chars = num_print = 0; 989 } 990 else 991 { 992 print_buf[num_print ++] = utf8[0]; 993 print_buf[num_print ++] = utf8[1]; 994 num_chars ++; 995 } 996 continue; 997 } 998 999 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read); 1000 if (c == EOF) 1001 break; 1002 utf8[2] = c; 1003 1004 if ((utf8[2] & 0xc0) != 0x80) 1005 { 1006 /* Invalid UTF-8. */ 1007 putback_buf[num_putback++] = utf8[2]; 1008 putback_buf[num_putback++] = utf8[1]; 1009 num_chars = num_print = 0; 1010 continue; 1011 } 1012 else if ((utf8[0] & 0x10) == 0) 1013 { 1014 /* A valid 3-byte UTF-8 encoding. 
*/ 1015 if (unicode_display == unicode_invalid) 1016 { 1017 putback_buf[num_putback++] = utf8[2]; 1018 putback_buf[num_putback++] = utf8[1]; 1019 num_chars = num_print = 0; 1020 } 1021 else 1022 { 1023 print_buf[num_print ++] = utf8[0]; 1024 print_buf[num_print ++] = utf8[1]; 1025 print_buf[num_print ++] = utf8[2]; 1026 num_chars ++; 1027 } 1028 continue; 1029 } 1030 1031 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read); 1032 if (c == EOF) 1033 break; 1034 utf8[3] = c; 1035 1036 if ((utf8[3] & 0xc0) != 0x80) 1037 { 1038 /* Invalid UTF-8. */ 1039 putback_buf[num_putback++] = utf8[3]; 1040 putback_buf[num_putback++] = utf8[2]; 1041 putback_buf[num_putback++] = utf8[1]; 1042 num_chars = num_print = 0; 1043 } 1044 /* We have a valid 4-byte UTF-8 encoding. */ 1045 else if (unicode_display == unicode_invalid) 1046 { 1047 putback_buf[num_putback++] = utf8[3]; 1048 putback_buf[num_putback++] = utf8[1]; 1049 putback_buf[num_putback++] = utf8[2]; 1050 num_chars = num_print = 0; 1051 } 1052 else 1053 { 1054 print_buf[num_print ++] = utf8[0]; 1055 print_buf[num_print ++] = utf8[1]; 1056 print_buf[num_print ++] = utf8[2]; 1057 print_buf[num_print ++] = utf8[3]; 1058 num_chars ++; 1059 } 1060 } 1061 while (1); 1062 1063 if (num_chars >= string_min) 1064 { 1065 /* We know that we have string_min valid characters in print_buf, 1066 and there may be more to come in the stream. Start displaying 1067 them. */ 1068 1069 print_filename_and_address (filename, address + start_point); 1070 1071 unsigned int i; 1072 for (i = 0; i < num_print;) 1073 { 1074 if (print_buf[i] < 127) 1075 putchar (print_buf[i++]); 1076 else 1077 i += display_utf8_char (print_buf + i); 1078 } 1079 1080 /* OK so now we have to start read unchecked bytes. */ 1081 1082 /* Find a series of string_min characters. Put them into print_buf. */ 1083 do 1084 { 1085 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read); 1086 if (c == EOF) 1087 break; 1088 1089 if (! 
STRING_ISGRAPHIC (c)) 1090 break; 1091 1092 if (c < 127) 1093 { 1094 putchar (c); 1095 continue; 1096 } 1097 1098 if (c < 0xc0) 1099 break; 1100 1101 /* We *might* have a UTF-8 sequence. Time to start peeking. */ 1102 unsigned char utf8[4]; 1103 1104 utf8[0] = c; 1105 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read); 1106 if (c == EOF) 1107 break; 1108 utf8[1] = c; 1109 1110 if ((utf8[1] & 0xc0) != 0x80) 1111 { 1112 /* Invalid UTF-8. */ 1113 putback_buf[num_putback++] = utf8[1]; 1114 break; 1115 } 1116 else if ((utf8[0] & 0x20) == 0) 1117 { 1118 /* Valid 2-byte UTF-8. */ 1119 if (unicode_display == unicode_invalid) 1120 { 1121 putback_buf[num_putback++] = utf8[1]; 1122 break; 1123 } 1124 else 1125 { 1126 (void) display_utf8_char (utf8); 1127 continue; 1128 } 1129 } 1130 1131 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read); 1132 if (c == EOF) 1133 break; 1134 utf8[2] = c; 1135 1136 if ((utf8[2] & 0xc0) != 0x80) 1137 { 1138 /* Invalid UTF-8. */ 1139 putback_buf[num_putback++] = utf8[2]; 1140 putback_buf[num_putback++] = utf8[1]; 1141 break; 1142 } 1143 else if ((utf8[0] & 0x10) == 0) 1144 { 1145 /* Valid 3-byte UTF-8. */ 1146 if (unicode_display == unicode_invalid) 1147 { 1148 putback_buf[num_putback++] = utf8[2]; 1149 putback_buf[num_putback++] = utf8[1]; 1150 break; 1151 } 1152 else 1153 { 1154 (void) display_utf8_char (utf8); 1155 continue; 1156 } 1157 } 1158 1159 c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read); 1160 if (c == EOF) 1161 break; 1162 utf8[3] = c; 1163 1164 if ((utf8[3] & 0xc0) != 0x80) 1165 { 1166 /* Invalid UTF-8. 
*/ 1167 putback_buf[num_putback++] = utf8[3]; 1168 putback_buf[num_putback++] = utf8[2]; 1169 putback_buf[num_putback++] = utf8[1]; 1170 break; 1171 } 1172 else if (unicode_display == unicode_invalid) 1173 { 1174 putback_buf[num_putback++] = utf8[3]; 1175 putback_buf[num_putback++] = utf8[2]; 1176 putback_buf[num_putback++] = utf8[1]; 1177 break; 1178 } 1179 else 1180 /* A valid 4-byte UTF-8 encoding. */ 1181 (void) display_utf8_char (utf8); 1182 } 1183 while (1); 1184 1185 if (output_separator) 1186 fputs (output_separator, stdout); 1187 else 1188 putchar ('\n'); 1189 } 1190 1191 if (c != EOF) 1192 /* FIXME: Using tail recursion here is lazy, but it works. */ 1193 print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf); 1194 } 1195 1196 /* Display strings read in from STREAM. Treat any UTF-8 encoded characters 1197 encountered according to the setting of the unicode_display variable. 1198 The stream is positioned at ADDRESS and is attached to FILENAME. */ 1199 1200 static void 1201 print_unicode_stream (const char * filename, 1202 file_ptr address, 1203 FILE * stream) 1204 { 1205 /* Paranoia checks... */ 1206 if (filename == NULL 1207 || stream == NULL 1208 || unicode_display == unicode_default 1209 || encoding != 'S' 1210 || encoding_bytes != 1) 1211 { 1212 fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n"); 1213 return; 1214 } 1215 1216 /* Allocate space for string_min 4-byte utf-8 characters. */ 1217 unsigned char * print_buf = xmalloc ((4 * string_min) + 1); 1218 /* We should never have to put back more than 4 bytes. */ 1219 unsigned char putback_buf[5]; 1220 unsigned int num_putback = 0; 1221 1222 print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf); 1223 free (print_buf); 1224 } 1225 1226 /* Find the strings in file FILENAME, read from STREAM. 1227 Assume that STREAM is positioned so that the next byte read 1228 is at address ADDRESS in the file. 
1229 1230 If STREAM is NULL, do not read from it. 1231 The caller can supply a buffer of characters 1232 to be processed before the data in STREAM. 1233 MAGIC is the address of the buffer and 1234 MAGICCOUNT is how many characters are in it. 1235 Those characters come at address ADDRESS and the data in STREAM follow. */ 1236 1237 static void 1238 print_strings (const char *filename, FILE *stream, file_ptr address, 1239 int magiccount, char *magic) 1240 { 1241 if (unicode_display != unicode_default) 1242 { 1243 if (magic != NULL) 1244 print_unicode_buffer (filename, address, 1245 (const unsigned char *) magic, magiccount); 1246 1247 if (stream != NULL) 1248 print_unicode_stream (filename, address, stream); 1249 return; 1250 } 1251 1252 char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1)); 1253 1254 while (1) 1255 { 1256 file_ptr start; 1257 unsigned int i; 1258 long c; 1259 1260 /* See if the next `string_min' chars are all graphic chars. */ 1261 tryline: 1262 start = address; 1263 for (i = 0; i < string_min; i++) 1264 { 1265 c = get_char (stream, &address, &magiccount, &magic); 1266 if (c == EOF) 1267 { 1268 free (buf); 1269 return; 1270 } 1271 1272 if (! STRING_ISGRAPHIC (c)) 1273 { 1274 /* Found a non-graphic. Try again starting with next byte. */ 1275 unget_part_char (c, &address, &magiccount, &magic); 1276 goto tryline; 1277 } 1278 buf[i] = c; 1279 } 1280 1281 /* We found a run of `string_min' graphic characters. Print up 1282 to the next non-graphic character. */ 1283 print_filename_and_address (filename, start); 1284 1285 buf[i] = '\0'; 1286 fputs (buf, stdout); 1287 1288 while (1) 1289 { 1290 c = get_char (stream, &address, &magiccount, &magic); 1291 if (c == EOF) 1292 break; 1293 if (! 
STRING_ISGRAPHIC (c)) 1294 { 1295 unget_part_char (c, &address, &magiccount, &magic); 1296 break; 1297 } 1298 putchar (c); 1299 } 1300 1301 if (output_separator) 1302 fputs (output_separator, stdout); 1303 else 1304 putchar ('\n'); 1305 } 1306 free (buf); 1307 } 1308 1309 static void 1310 usage (FILE *stream, int status) 1311 { 1312 fprintf (stream, _("Usage: %s [option(s)] [file(s)]\n"), program_name); 1313 fprintf (stream, _(" Display printable strings in [file(s)] (stdin by default)\n")); 1314 fprintf (stream, _(" The options are:\n")); 1315 1316 if (DEFAULT_STRINGS_ALL) 1317 fprintf (stream, _("\ 1318 -a - --all Scan the entire file, not just the data section [default]\n\ 1319 -d --data Only scan the data sections in the file\n")); 1320 else 1321 fprintf (stream, _("\ 1322 -a - --all Scan the entire file, not just the data section\n\ 1323 -d --data Only scan the data sections in the file [default]\n")); 1324 1325 fprintf (stream, _("\ 1326 -f --print-file-name Print the name of the file before each string\n\ 1327 -n <number> Locate & print any sequence of at least <number>\n\ 1328 --bytes=<number> displayable characters. 
(The default is 4).\n\ 1329 -t --radix={o,d,x} Print the location of the string in base 8, 10 or 16\n\ 1330 -w --include-all-whitespace Include all whitespace as valid string characters\n\ 1331 -o An alias for --radix=o\n\ 1332 -T --target=<BFDNAME> Specify the binary file format\n\ 1333 -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\ 1334 s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\ 1335 --unicode={default|show|invalid|hex|escape|highlight}\n\ 1336 -U {d|s|i|x|e|h} Specify how to treat UTF-8 encoded unicode characters\n\ 1337 -s --output-separator=<string> String used to separate strings in output.\n\ 1338 @<file> Read options from <file>\n\ 1339 -h --help Display this information\n\ 1340 -v -V --version Print the program's version number\n")); 1341 list_supported_targets (program_name, stream); 1342 if (REPORT_BUGS_TO[0] && status == 0) 1343 fprintf (stream, _("Report bugs to %s\n"), REPORT_BUGS_TO); 1344 exit (status); 1345 } 1346