xref: /netbsd-src/external/gpl3/binutils.old/dist/binutils/strings.c (revision e992f068c547fd6e84b3f104dc2340adcc955732)
1 /* strings -- print the strings of printable characters in files
2    Copyright (C) 1993-2022 Free Software Foundation, Inc.
3 
4    This program is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; either version 3, or (at your option)
7    any later version.
8 
9    This program is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with this program; if not, write to the Free Software
16    Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
17    02110-1301, USA.  */
18 
19 /* Usage: strings [options] file...
20 
21    Options:
22    --all
23    -a
24    -		Scan each file in its entirety.
25 
26    --data
27    -d		Scan only the initialized data section(s) of object files.
28 
29    --print-file-name
30    -f		Print the name of the file before each string.
31 
32    --bytes=min-len
33    -n min-len
34    -min-len	Print graphic char sequences, MIN-LEN or more bytes long,
35 		that are followed by a NUL or a non-displayable character.
36 		Default is 4.
37 
38    --radix={o,x,d}
39    -t {o,x,d}	Print the offset within the file before each string,
40 		in octal/hex/decimal.
41 
42   --include-all-whitespace
43   -w		By default tab and space are the only whitespace included in graphic
44 		char sequences.  This option considers all of isspace() valid.
45 
46    -o		Like -to.  (Some other implementations have -o like -to,
47 		others like -td.  We chose one arbitrarily.)
48 
49    --encoding={s,S,b,l,B,L}
50    -e {s,S,b,l,B,L}
51 		Select character encoding: 7-bit-character, 8-bit-character,
52 		bigendian 16-bit, littleendian 16-bit, bigendian 32-bit,
53 		littleendian 32-bit.
54 
55    --target=BFDNAME
56    -T {bfdname}
57 		Specify a non-default object file format.
58 
59   --unicode={default|locale|invalid|hex|escape|highlight}
60   -U {d|l|i|x|e|h}
61 		Determine how to handle UTF-8 unicode characters.  The default
62 		is no special treatment.  All other versions of this option
63 		only apply if the encoding is valid and enabling the option
64 		implies --encoding=S.
65 		The 'locale' option displays the characters according to the
66 		current locale.  The 'invalid' option treats them as
67 		non-string characters.  The 'hex' option displays them as hex
68 		byte sequences.  The 'escape' option displays them as escape
69 		sequences and the 'highlight' option displays them as
70 		coloured escape sequences.
71 
72   --output-separator=sep_string
73   -s sep_string	String used to separate parsed strings in output.
74 		Default is newline.
75 
76    --help
77    -h		Print the usage message on the standard output.
78 
79    --version
80    -V
81    -v		Print the program version number.
82 
83    Written by Richard Stallman <rms@gnu.ai.mit.edu>
84    and David MacKenzie <djm@gnu.ai.mit.edu>.  */
85 
86 #include "sysdep.h"
87 #include "bfd.h"
88 #include "getopt.h"
89 #include "libiberty.h"
90 #include "safe-ctype.h"
91 #include "bucomm.h"
92 
93 #ifndef streq
94 #define streq(a,b) (strcmp ((a),(b)) == 0)
95 #endif
96 
97 typedef enum unicode_display_type
98 {
99   unicode_default = 0,
100   unicode_locale,
101   unicode_escape,
102   unicode_hex,
103   unicode_highlight,
104   unicode_invalid
105 } unicode_display_type;
106 
107 static unicode_display_type unicode_display = unicode_default;
108 
/* Non-zero if C may be part of a displayed string: it must lie in the
   range 0..255 and be a tab, a printable character, a byte above 127 when
   the encoding is 'S', or any whitespace character when
   include_all_whitespace is set.  C is evaluated several times, so it
   must not have side effects.  */
#define STRING_ISGRAPHIC(c) \
  ((c) >= 0 \
   && (c) <= 255 \
   && ((c) == '\t' \
       || ISPRINT (c) \
       || (encoding == 'S' && (c) > 127) \
       || (include_all_whitespace && ISSPACE (c))))
115 
/* Declare errno ourselves unless it is already available as a macro.  */
#ifndef errno
extern int errno;
#endif

/* A section counts as initialized data only if it has all of these
   BFD section flags.  */
#define DATA_FLAGS (SEC_ALLOC | SEC_LOAD | SEC_HAS_CONTENTS)

/* What to scan.  */

/* TRUE means that only the data section(s) of object files are scanned.  */
static bool datasection_only;

/* The BFD object file format to use.  */
static char *target;

/* What counts as a string.  */

/* The shortest run of graphic characters that is printed.  */
static unsigned int string_min;

/* TRUE means that every whitespace character is treated as graphic.  */
static bool include_all_whitespace;

/* The character encoding format, and the number of bytes per character
   that it implies.  */
static char encoding;
static int encoding_bytes;

/* How each string is printed.  */

/* TRUE means prefix each string with the name of its file.  */
static bool print_filenames;

/* TRUE means prefix each string with its address within the file.  */
static bool print_addresses;

/* The radix used when printing addresses (must be 8, 10 or 16).  */
static int address_radix;

/* The text printed after each string.  */
static char *output_separator;
150 
/* The long options.  Each one maps onto the short option character
   that main() handles.  */
static struct option long_options[] =
{
  { "all",                    no_argument,       NULL, 'a' },
  { "bytes",                  required_argument, NULL, 'n' },
  { "data",                   no_argument,       NULL, 'd' },
  { "encoding",               required_argument, NULL, 'e' },
  { "help",                   no_argument,       NULL, 'h' },
  { "include-all-whitespace", no_argument,       NULL, 'w' },
  { "output-separator",       required_argument, NULL, 's' },
  { "print-file-name",        no_argument,       NULL, 'f' },
  { "radix",                  required_argument, NULL, 't' },
  { "target",                 required_argument, NULL, 'T' },
  { "unicode",                required_argument, NULL, 'U' },
  { "version",                no_argument,       NULL, 'v' },
  /* End of table marker.  */
  { NULL,                     0,                 NULL, 0 }
};
167 
168 static bool strings_file (char *);
169 static void print_strings (const char *, FILE *, file_ptr, int, char *);
170 static void usage (FILE *, int) ATTRIBUTE_NORETURN;
171 
172 int main (int, char **);
173 
174 int
main(int argc,char ** argv)175 main (int argc, char **argv)
176 {
177   int optc;
178   int exit_status = 0;
179   bool files_given = false;
180   char *s;
181   int numeric_opt = 0;
182 
183   setlocale (LC_ALL, "");
184   bindtextdomain (PACKAGE, LOCALEDIR);
185   textdomain (PACKAGE);
186 
187   program_name = argv[0];
188   xmalloc_set_program_name (program_name);
189   bfd_set_error_program_name (program_name);
190 
191   expandargv (&argc, &argv);
192 
193   string_min = 4;
194   include_all_whitespace = false;
195   print_addresses = false;
196   print_filenames = false;
197   if (DEFAULT_STRINGS_ALL)
198     datasection_only = false;
199   else
200     datasection_only = true;
201   target = NULL;
202   encoding = 's';
203   output_separator = NULL;
204 
205   while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789",
206 			      long_options, (int *) 0)) != EOF)
207     {
208       switch (optc)
209 	{
210 	case 'a':
211 	  datasection_only = false;
212 	  break;
213 
214 	case 'd':
215 	  datasection_only = true;
216 	  break;
217 
218 	case 'f':
219 	  print_filenames = true;
220 	  break;
221 
222 	case 'H':
223 	case 'h':
224 	  usage (stdout, 0);
225 
226 	case 'n':
227 	  string_min = (int) strtoul (optarg, &s, 0);
228 	  if (s != NULL && *s != 0)
229 	    fatal (_("invalid integer argument %s"), optarg);
230 	  break;
231 
232 	case 'w':
233 	  include_all_whitespace = true;
234 	  break;
235 
236 	case 'o':
237 	  print_addresses = true;
238 	  address_radix = 8;
239 	  break;
240 
241 	case 't':
242 	  print_addresses = true;
243 	  if (optarg[1] != '\0')
244 	    usage (stderr, 1);
245 	  switch (optarg[0])
246 	    {
247 	    case 'o':
248 	      address_radix = 8;
249 	      break;
250 
251 	    case 'd':
252 	      address_radix = 10;
253 	      break;
254 
255 	    case 'x':
256 	      address_radix = 16;
257 	      break;
258 
259 	    default:
260 	      usage (stderr, 1);
261 	    }
262 	  break;
263 
264 	case 'T':
265 	  target = optarg;
266 	  break;
267 
268 	case 'e':
269 	  if (optarg[1] != '\0')
270 	    usage (stderr, 1);
271 	  encoding = optarg[0];
272 	  break;
273 
274 	case 's':
275 	  output_separator = optarg;
276 	  break;
277 
278 	case 'U':
279 	  if (streq (optarg, "default") || streq (optarg, "d"))
280 	    unicode_display = unicode_default;
281 	  else if (streq (optarg, "locale") || streq (optarg, "l"))
282 	    unicode_display = unicode_locale;
283 	  else if (streq (optarg, "escape") || streq (optarg, "e"))
284 	    unicode_display = unicode_escape;
285 	  else if (streq (optarg, "invalid") || streq (optarg, "i"))
286 	    unicode_display = unicode_invalid;
287 	  else if (streq (optarg, "hex") || streq (optarg, "x"))
288 	    unicode_display = unicode_hex;
289 	  else if (streq (optarg, "highlight") || streq (optarg, "h"))
290 	    unicode_display = unicode_highlight;
291 	  else
292 	    fatal (_("invalid argument to -U/--unicode: %s"), optarg);
293 	  break;
294 
295 	case 'V':
296 	case 'v':
297 	  print_version ("strings");
298 	  break;
299 
300 	case '?':
301 	  usage (stderr, 1);
302 
303 	default:
304 	  numeric_opt = optind;
305 	  break;
306 	}
307     }
308 
309   if (unicode_display != unicode_default)
310     encoding = 'S';
311 
312   if (numeric_opt != 0)
313     {
314       string_min = (int) strtoul (argv[numeric_opt - 1] + 1, &s, 0);
315       if (s != NULL && *s != 0)
316 	fatal (_("invalid integer argument %s"), argv[numeric_opt - 1] + 1);
317     }
318   if (string_min < 1)
319     fatal (_("invalid minimum string length %d"), string_min);
320 
321   switch (encoding)
322     {
323     case 'S':
324     case 's':
325       encoding_bytes = 1;
326       break;
327     case 'b':
328     case 'l':
329       encoding_bytes = 2;
330       break;
331     case 'B':
332     case 'L':
333       encoding_bytes = 4;
334       break;
335     default:
336       usage (stderr, 1);
337     }
338 
339   if (bfd_init () != BFD_INIT_MAGIC)
340     fatal (_("fatal error: libbfd ABI mismatch"));
341   set_default_bfd_target ();
342 
343   if (optind >= argc)
344     {
345       datasection_only = false;
346       SET_BINARY (fileno (stdin));
347       print_strings ("{standard input}", stdin, 0, 0, (char *) NULL);
348       files_given = true;
349     }
350   else
351     {
352       for (; optind < argc; ++optind)
353 	{
354 	  if (streq (argv[optind], "-"))
355 	    datasection_only = false;
356 	  else
357 	    {
358 	      files_given = true;
359 	      exit_status |= !strings_file (argv[optind]);
360 	    }
361 	}
362     }
363 
364   if (!files_given)
365     usage (stderr, 1);
366 
367   return (exit_status);
368 }
369 
370 /* Scan section SECT of the file ABFD, whose printable name is
371    FILENAME.  If it contains initialized data set GOT_A_SECTION and
372    print the strings in it.  */
373 
374 static void
strings_a_section(bfd * abfd,asection * sect,const char * filename,bool * got_a_section)375 strings_a_section (bfd *abfd, asection *sect, const char *filename,
376 		   bool *got_a_section)
377 {
378   bfd_size_type sectsize;
379   bfd_byte *mem;
380 
381   if ((sect->flags & DATA_FLAGS) != DATA_FLAGS)
382     return;
383 
384   sectsize = bfd_section_size (sect);
385   if (sectsize == 0)
386     return;
387 
388   if (!bfd_malloc_and_get_section (abfd, sect, &mem))
389     {
390       non_fatal (_("%s: Reading section %s failed: %s"),
391 		 filename, sect->name, bfd_errmsg (bfd_get_error ()));
392       return;
393     }
394 
395   *got_a_section = true;
396   print_strings (filename, NULL, sect->filepos, sectsize, (char *) mem);
397   free (mem);
398 }
399 
400 /* Scan all of the sections in FILE, and print the strings
401    in the initialized data section(s).
402 
403    Return TRUE if successful,
404    FALSE if not (such as if FILE is not an object file).  */
405 
406 static bool
strings_object_file(const char * file)407 strings_object_file (const char *file)
408 {
409   bfd *abfd;
410   asection *s;
411   bool got_a_section;
412 
413   abfd = bfd_openr (file, target);
414 
415   if (abfd == NULL)
416     /* Treat the file as a non-object file.  */
417     return false;
418 
419   /* This call is mainly for its side effect of reading in the sections.
420      We follow the traditional behavior of `strings' in that we don't
421      complain if we don't recognize a file to be an object file.  */
422   if (!bfd_check_format (abfd, bfd_object))
423     {
424       bfd_close (abfd);
425       return false;
426     }
427 
428   got_a_section = false;
429   for (s = abfd->sections; s != NULL; s = s->next)
430     strings_a_section (abfd, s, file, &got_a_section);
431 
432   if (!bfd_close (abfd))
433     {
434       bfd_nonfatal (file);
435       return false;
436     }
437 
438   return got_a_section;
439 }
440 
441 /* Print the strings in FILE.  Return TRUE if ok, FALSE if an error occurs.  */
442 
443 static bool
strings_file(char * file)444 strings_file (char *file)
445 {
446   struct stat st;
447 
448   /* get_file_size does not support non-S_ISREG files.  */
449 
450   if (stat (file, &st) < 0)
451     {
452       if (errno == ENOENT)
453 	non_fatal (_("'%s': No such file"), file);
454       else
455 	non_fatal (_("Warning: could not locate '%s'.  reason: %s"),
456 		   file, strerror (errno));
457       return false;
458     }
459   else if (S_ISDIR (st.st_mode))
460     {
461       non_fatal (_("Warning: '%s' is a directory"), file);
462       return false;
463     }
464 
465   /* If we weren't told to scan the whole file,
466      try to open it as an object file and only look at
467      initialized data sections.  If that fails, fall back to the
468      whole file.  */
469   if (!datasection_only || !strings_object_file (file))
470     {
471       FILE *stream;
472 
473       stream = fopen (file, FOPEN_RB);
474       if (stream == NULL)
475 	{
476 	  fprintf (stderr, "%s: ", program_name);
477 	  perror (file);
478 	  return false;
479 	}
480 
481       print_strings (file, stream, (file_ptr) 0, 0, (char *) NULL);
482 
483       if (fclose (stream) == EOF)
484 	{
485 	  fprintf (stderr, "%s: ", program_name);
486 	  perror (file);
487 	  return false;
488 	}
489     }
490 
491   return true;
492 }
493 
494 /* Read the next character, return EOF if none available.
495    Assume that STREAM is positioned so that the next byte read
496    is at address ADDRESS in the file.
497 
498    If STREAM is NULL, do not read from it.
499    The caller can supply a buffer of characters
500    to be processed before the data in STREAM.
501    MAGIC is the address of the buffer and
502    MAGICCOUNT is how many characters are in it.  */
503 
504 static long
get_char(FILE * stream,file_ptr * address,int * magiccount,char ** magic)505 get_char (FILE *stream, file_ptr *address, int *magiccount, char **magic)
506 {
507   int c, i;
508   long r = 0;
509 
510   for (i = 0; i < encoding_bytes; i++)
511     {
512       if (*magiccount)
513 	{
514 	  (*magiccount)--;
515 	  c = *(*magic)++;
516 	}
517       else
518 	{
519 	  if (stream == NULL)
520 	    return EOF;
521 
522 	  /* Only use getc_unlocked if we found a declaration for it.
523 	     Otherwise, libc is not thread safe by default, and we
524 	     should not use it.  */
525 
526 #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
527 	  c = getc_unlocked (stream);
528 #else
529 	  c = getc (stream);
530 #endif
531 	  if (c == EOF)
532 	    return EOF;
533 	}
534 
535       (*address)++;
536       r = (r << 8) | (c & 0xff);
537     }
538 
539   switch (encoding)
540     {
541     default:
542       break;
543     case 'l':
544       r = ((r & 0xff) << 8) | ((r & 0xff00) >> 8);
545       break;
546     case 'L':
547       r = (((r & 0xff) << 24) | ((r & 0xff00) << 8)
548 	   | ((r & 0xff0000) >> 8) | ((r & 0xff000000) >> 24));
549       break;
550     }
551 
552   return r;
553 }
554 
555 /* Throw away one byte of a (possibly) multi-byte char C, updating
556    address and buffer to suit.  */
557 
558 static void
unget_part_char(long c,file_ptr * address,int * magiccount,char ** magic)559 unget_part_char (long c, file_ptr *address, int *magiccount, char **magic)
560 {
561   static char tmp[4];
562 
563   if (encoding_bytes > 1)
564     {
565       *address -= encoding_bytes - 1;
566 
567       if (*magiccount == 0)
568 	{
569 	  /* If no magic buffer exists, use temp buffer.  */
570 	  switch (encoding)
571 	    {
572 	    default:
573 	      break;
574 	    case 'b':
575 	      tmp[0] = c & 0xff;
576 	      *magiccount = 1;
577 	      break;
578 	    case 'l':
579 	      tmp[0] = (c >> 8) & 0xff;
580 	      *magiccount = 1;
581 	      break;
582 	    case 'B':
583 	      tmp[0] = (c >> 16) & 0xff;
584 	      tmp[1] = (c >> 8) & 0xff;
585 	      tmp[2] = c & 0xff;
586 	      *magiccount = 3;
587 	      break;
588 	    case 'L':
589 	      tmp[0] = (c >> 8) & 0xff;
590 	      tmp[1] = (c >> 16) & 0xff;
591 	      tmp[2] = (c >> 24) & 0xff;
592 	      *magiccount = 3;
593 	      break;
594 	    }
595 	  *magic = tmp;
596 	}
597       else
598 	{
599 	  /* If magic buffer exists, rewind.  */
600 	  *magic -= encoding_bytes - 1;
601 	  *magiccount += encoding_bytes - 1;
602 	}
603     }
604 }
605 
606 static void
print_filename_and_address(const char * filename,file_ptr address)607 print_filename_and_address (const char * filename, file_ptr address)
608 {
609   if (print_filenames)
610     printf ("%s: ", filename);
611 
612   if (! print_addresses)
613     return;
614 
615   switch (address_radix)
616     {
617     case 8:
618       if (sizeof (address) > sizeof (long))
619 	{
620 #ifndef __MSVCRT__
621 	  printf ("%7llo ", (unsigned long long) address);
622 #else
623 	  printf ("%7I64o ", (unsigned long long) address);
624 #endif
625 	}
626       else
627 	printf ("%7lo ", (unsigned long) address);
628       break;
629 
630     case 10:
631       if (sizeof (address) > sizeof (long))
632 	{
633 #ifndef __MSVCRT__
634 	  printf ("%7llu ", (unsigned long long) address);
635 #else
636 	  printf ("%7I64d ", (unsigned long long) address);
637 #endif
638 	}
639       else
640 	printf ("%7ld ", (long) address);
641       break;
642 
643     case 16:
644       if (sizeof (address) > sizeof (long))
645 	{
646 #ifndef __MSVCRT__
647 	  printf ("%7llx ", (unsigned long long) address);
648 #else
649 	  printf ("%7I64x ", (unsigned long long) address);
650 #endif
651 	}
652       else
653 	printf ("%7lx ", (unsigned long) address);
654       break;
655     }
656 }
657 
/* Check whether the bytes starting at BUFFER, of which BUFLEN are
   available, begin with a multi-byte UTF-8 encoding.

   Returns the number of bytes (2, 3 or 4) used by the encoding if there
   is a lead byte in the range 0xc0..0xf7 followed by the number of
   continuation bytes that it calls for.  Returns 0 otherwise: for an
   empty buffer, an ASCII or continuation byte in first place, a lead
   byte of 0xf8 or more (which no UTF-8 encoding uses), a sequence cut
   short by the end of the buffer, or a bad continuation byte.

   Overlong encodings and surrogate code points are not diagnosed.  */

static unsigned int
is_valid_utf8 (const unsigned char * buffer, unsigned long buflen)
{
  unsigned int len;
  unsigned int k;

  if (buflen == 0)
    return 0;

  if (buffer[0] < 0xc0 || buffer[0] >= 0xf8)
    return 0;

  /* The lead byte determines the length: 110xxxxx starts a two byte
     sequence, 1110xxxx a three byte one and 11110xxx a four byte one.  */
  if ((buffer[0] & 0x20) == 0)
    len = 2;
  else if ((buffer[0] & 0x10) == 0)
    len = 3;
  else
    len = 4;

  if (buflen < len)
    return 0;

  /* Every continuation byte must look like 10xxxxxx.  */
  for (k = 1; k < len; k++)
    if ((buffer[k] & 0xc0) != 0x80)
      return 0;

  return len;
}
693 
694 /* Display a UTF-8 encoded character in BUFFER according to the setting
695    of unicode_display.  The character is known to be valid.
696    Returns the number of bytes consumed.  */
697 
698 static unsigned int
display_utf8_char(const unsigned char * buffer)699 display_utf8_char (const unsigned char * buffer)
700 {
701   unsigned int j;
702   unsigned int utf8_len;
703 
704   switch (buffer[0] & 0x30)
705     {
706     case 0x00:
707     case 0x10:
708       utf8_len = 2;
709       break;
710     case 0x20:
711       utf8_len = 3;
712       break;
713     default:
714       utf8_len = 4;
715     }
716 
717   switch (unicode_display)
718     {
719     default:
720       fprintf (stderr, "ICE: unexpected unicode display type\n");
721       break;
722 
723     case unicode_escape:
724     case unicode_highlight:
725       if (unicode_display == unicode_highlight && isatty (1))
726 	printf ("\x1B[31;47m"); /* Red.  */
727 
728       switch (utf8_len)
729 	{
730 	case 2:
731 	  printf ("\\u%02x%02x",
732 		  ((buffer[0] & 0x1c) >> 2),
733 		  ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f));
734 	  break;
735 
736 	case 3:
737 	  printf ("\\u%02x%02x",
738 		  ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2),
739 		  ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f)));
740 	  break;
741 
742 	case 4:
743 	  printf ("\\u%02x%02x%02x",
744 		  ((buffer[0] & 0x07) << 6) | ((buffer[1] & 0x3c) >> 2),
745 		  ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3c) >> 2),
746 		  ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f)));
747 	  break;
748 	default:
749 	  /* URG.  */
750 	  break;
751 	}
752 
753       if (unicode_display == unicode_highlight && isatty (1))
754 	printf ("\033[0m"); /* Default colour.  */
755       break;
756 
757     case unicode_hex:
758       putchar ('<');
759       printf ("0x");
760       for (j = 0; j < utf8_len; j++)
761 	printf ("%02x", buffer [j]);
762       putchar ('>');
763       break;
764 
765     case unicode_locale:
766       printf ("%.1s", buffer);
767       break;
768     }
769 
770   return utf8_len;
771 }
772 
773 /* Display strings in BUFFER.  Treat any UTF-8 encoded characters encountered
774    according to the setting of the unicode_display variable.  The buffer
775    contains BUFLEN bytes.
776 
777    Display the characters as if they started at ADDRESS and are contained in
778    FILENAME.  */
779 
780 static void
print_unicode_buffer(const char * filename,file_ptr address,const unsigned char * buffer,unsigned long buflen)781 print_unicode_buffer (const char *            filename,
782 		      file_ptr                address,
783 		      const unsigned char *   buffer,
784 		      unsigned long           buflen)
785 {
786   /* Paranoia checks...  */
787   if (filename == NULL
788       || buffer == NULL
789       || unicode_display == unicode_default
790       || encoding != 'S'
791       || encoding_bytes != 1)
792     {
793       fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n");
794       return;
795     }
796 
797   if (buflen == 0)
798     return;
799 
800   /* We must only display strings that are at least string_min *characters*
801      long.  So we scan the buffer in two stages.  First we locate the start
802      of a potential string.  Then we walk along it until we have found
803      string_min characters.  Then we go back to the start point and start
804      displaying characters according to the unicode_display setting.  */
805 
806   unsigned long start_point = 0;
807   unsigned long i = 0;
808   unsigned int char_len = 1;
809   unsigned int num_found = 0;
810 
811   for (i = 0; i < buflen; i += char_len)
812     {
813       int c = buffer[i];
814 
815       char_len = 1;
816 
817       /* Find the first potential character of a string.  */
818       if (! STRING_ISGRAPHIC (c))
819 	{
820 	  num_found = 0;
821 	  continue;
822 	}
823 
824       if (c > 126)
825 	{
826 	  if (c < 0xc0)
827 	    {
828 	      num_found = 0;
829 	      continue;
830 	    }
831 
832 	  if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0)
833 	    {
834 	      char_len = 1;
835 	      num_found = 0;
836 	      continue;
837 	    }
838 
839 	  if (unicode_display == unicode_invalid)
840 	    {
841 	      /* We have found a valid UTF-8 character, but we treat it as non-graphic.  */
842 	      num_found = 0;
843 	      continue;
844 	    }
845 	}
846 
847       if (num_found == 0)
848 	/* We have found a potential starting point for a string.  */
849 	start_point = i;
850 
851       ++ num_found;
852 
853       if (num_found >= string_min)
854 	break;
855     }
856 
857   if (num_found < string_min)
858     return;
859 
860   print_filename_and_address (filename, address + start_point);
861 
862   /* We have found string_min characters.  Display them and any
863      more that follow.  */
864   for (i = start_point; i < buflen; i += char_len)
865     {
866       int c = buffer[i];
867 
868       char_len = 1;
869 
870       if (! STRING_ISGRAPHIC (c))
871 	break;
872       else if (c < 127)
873 	putchar (c);
874       else if (! is_valid_utf8 (buffer + i, buflen - i))
875 	break;
876       else if (unicode_display == unicode_invalid)
877 	break;
878       else
879 	char_len = display_utf8_char (buffer + i);
880     }
881 
882   if (output_separator)
883     fputs (output_separator, stdout);
884   else
885     putchar ('\n');
886 
887   /* FIXME: Using tail recursion here is lazy programming...  */
888   print_unicode_buffer (filename, address + i, buffer + i, buflen - i);
889 }
890 
/* Return the next byte to be scanned.  If any bytes have been put back
   then the one stored last in PUTBACK is returned and *NUM_PUTBACK is
   decremented.  Otherwise *NUM_READ is incremented and the result of
   reading one byte from STREAM is returned, which is EOF if no byte
   could be read.  */

static int
get_unicode_byte (FILE *          stream,
                  unsigned char * putback,
                  unsigned int *  num_putback,
                  unsigned int *  num_read)
{
  unsigned int pending = * num_putback;

  if (pending == 0)
    {
      ++ * num_read;

#if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
      return getc_unlocked (stream);
#else
      return getc (stream);
#endif
    }

  pending --;
  * num_putback = pending;
  return putback [pending];
}
911 
912 /* Helper function for print_unicode_stream.  */
913 
914 static void
print_unicode_stream_body(const char * filename,file_ptr address,FILE * stream,unsigned char * putback_buf,unsigned int num_putback,unsigned char * print_buf)915 print_unicode_stream_body (const char *     filename,
916 			   file_ptr         address,
917 			   FILE *           stream,
918 			   unsigned char *  putback_buf,
919 			   unsigned int     num_putback,
920 			   unsigned char *  print_buf)
921 {
922   /* It would be nice if we could just read the stream into a buffer
923      and then process if with print_unicode_buffer.  But the input
924      might be huge or it might time-locked (eg stdin).  So instead
925      we go one byte at a time...  */
926 
927   file_ptr start_point = 0;
928   unsigned int num_read = 0;
929   unsigned int num_chars = 0;
930   unsigned int num_print = 0;
931   int c = 0;
932 
933   /* Find a series of string_min characters.  Put them into print_buf.  */
934   do
935     {
936       if (num_chars >= string_min)
937 	break;
938 
939       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
940       if (c == EOF)
941 	break;
942 
943       if (! STRING_ISGRAPHIC (c))
944 	{
945 	  num_chars = num_print = 0;
946 	  continue;
947 	}
948 
949       if (num_chars == 0)
950 	start_point = num_read - 1;
951 
952       if (c < 127)
953 	{
954 	  print_buf[num_print] = c;
955 	  num_chars ++;
956 	  num_print ++;
957 	  continue;
958 	}
959 
960       if (c < 0xc0)
961 	{
962 	  num_chars = num_print = 0;
963 	  continue;
964 	}
965 
966       /* We *might* have a UTF-8 sequence.  Time to start peeking.  */
967       char utf8[4];
968 
969       utf8[0] = c;
970       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
971       if (c == EOF)
972 	break;
973       utf8[1] = c;
974 
975       if ((utf8[1] & 0xc0) != 0x80)
976 	{
977 	  /* Invalid UTF-8.  */
978 	  putback_buf[num_putback++] = utf8[1];
979 	  num_chars = num_print = 0;
980 	  continue;
981 	}
982       else if ((utf8[0] & 0x20) == 0)
983 	{
984 	  /* A valid 2-byte UTF-8 encoding.  */
985 	  if (unicode_display == unicode_invalid)
986 	    {
987 	      putback_buf[num_putback++] = utf8[1];
988 	      num_chars = num_print = 0;
989 	    }
990 	  else
991 	    {
992 	      print_buf[num_print ++] = utf8[0];
993 	      print_buf[num_print ++] = utf8[1];
994 	      num_chars ++;
995 	    }
996 	  continue;
997 	}
998 
999       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1000       if (c == EOF)
1001 	break;
1002       utf8[2] = c;
1003 
1004       if ((utf8[2] & 0xc0) != 0x80)
1005 	{
1006 	  /* Invalid UTF-8.  */
1007 	  putback_buf[num_putback++] = utf8[2];
1008 	  putback_buf[num_putback++] = utf8[1];
1009 	  num_chars = num_print = 0;
1010 	  continue;
1011 	}
1012       else if ((utf8[0] & 0x10) == 0)
1013 	{
1014 	  /* A valid 3-byte UTF-8 encoding.  */
1015 	  if (unicode_display == unicode_invalid)
1016 	    {
1017 	      putback_buf[num_putback++] = utf8[2];
1018 	      putback_buf[num_putback++] = utf8[1];
1019 	      num_chars = num_print = 0;
1020 	    }
1021 	  else
1022 	    {
1023 	      print_buf[num_print ++] = utf8[0];
1024 	      print_buf[num_print ++] = utf8[1];
1025 	      print_buf[num_print ++] = utf8[2];
1026 	      num_chars ++;
1027 	    }
1028 	  continue;
1029 	}
1030 
1031       c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1032       if (c == EOF)
1033 	break;
1034       utf8[3] = c;
1035 
1036       if ((utf8[3] & 0xc0) != 0x80)
1037 	{
1038 	  /* Invalid UTF-8.  */
1039 	  putback_buf[num_putback++] = utf8[3];
1040 	  putback_buf[num_putback++] = utf8[2];
1041 	  putback_buf[num_putback++] = utf8[1];
1042 	  num_chars = num_print = 0;
1043 	}
1044       /* We have a valid 4-byte UTF-8 encoding.  */
1045       else if (unicode_display == unicode_invalid)
1046 	{
1047 	  putback_buf[num_putback++] = utf8[3];
1048 	  putback_buf[num_putback++] = utf8[1];
1049 	  putback_buf[num_putback++] = utf8[2];
1050 	  num_chars = num_print = 0;
1051 	}
1052       else
1053 	{
1054 	  print_buf[num_print ++] = utf8[0];
1055 	  print_buf[num_print ++] = utf8[1];
1056 	  print_buf[num_print ++] = utf8[2];
1057 	  print_buf[num_print ++] = utf8[3];
1058 	  num_chars ++;
1059 	}
1060     }
1061   while (1);
1062 
1063   if (num_chars >= string_min)
1064     {
1065       /* We know that we have string_min valid characters in print_buf,
1066 	 and there may be more to come in the stream.  Start displaying
1067 	 them.  */
1068 
1069       print_filename_and_address (filename, address + start_point);
1070 
1071       unsigned int i;
1072       for (i = 0; i < num_print;)
1073 	{
1074 	  if (print_buf[i] < 127)
1075 	    putchar (print_buf[i++]);
1076 	  else
1077 	    i += display_utf8_char (print_buf + i);
1078 	}
1079 
1080       /* OK so now we have to start read unchecked bytes.  */
1081 
1082       /* Find a series of string_min characters.  Put them into print_buf.  */
1083       do
1084 	{
1085 	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1086 	  if (c == EOF)
1087 	    break;
1088 
1089 	  if (! STRING_ISGRAPHIC (c))
1090 	    break;
1091 
1092 	  if (c < 127)
1093 	    {
1094 	      putchar (c);
1095 	      continue;
1096 	    }
1097 
1098 	  if (c < 0xc0)
1099 	    break;
1100 
1101 	  /* We *might* have a UTF-8 sequence.  Time to start peeking.  */
1102 	  unsigned char utf8[4];
1103 
1104 	  utf8[0] = c;
1105 	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1106 	  if (c == EOF)
1107 	    break;
1108 	  utf8[1] = c;
1109 
1110 	  if ((utf8[1] & 0xc0) != 0x80)
1111 	    {
1112 	      /* Invalid UTF-8.  */
1113 	      putback_buf[num_putback++] = utf8[1];
1114 	      break;
1115 	    }
1116 	  else if ((utf8[0] & 0x20) == 0)
1117 	    {
1118 	      /* Valid 2-byte UTF-8.  */
1119 	      if (unicode_display == unicode_invalid)
1120 		{
1121 		  putback_buf[num_putback++] = utf8[1];
1122 		  break;
1123 		}
1124 	      else
1125 		{
1126 		  (void) display_utf8_char (utf8);
1127 		  continue;
1128 		}
1129 	    }
1130 
1131 	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1132 	  if (c == EOF)
1133 	    break;
1134 	  utf8[2] = c;
1135 
1136 	  if ((utf8[2] & 0xc0) != 0x80)
1137 	    {
1138 	      /* Invalid UTF-8.  */
1139 	      putback_buf[num_putback++] = utf8[2];
1140 	      putback_buf[num_putback++] = utf8[1];
1141 	      break;
1142 	    }
1143 	  else if ((utf8[0] & 0x10) == 0)
1144 	    {
1145 	      /* Valid 3-byte UTF-8.  */
1146 	      if (unicode_display == unicode_invalid)
1147 		{
1148 		  putback_buf[num_putback++] = utf8[2];
1149 		  putback_buf[num_putback++] = utf8[1];
1150 		  break;
1151 		}
1152 	      else
1153 		{
1154 		  (void) display_utf8_char (utf8);
1155 		  continue;
1156 		}
1157 	    }
1158 
1159 	  c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1160 	  if (c == EOF)
1161 	    break;
1162 	  utf8[3] = c;
1163 
1164 	  if ((utf8[3] & 0xc0) != 0x80)
1165 	    {
1166 	      /* Invalid UTF-8.  */
1167 	      putback_buf[num_putback++] = utf8[3];
1168 	      putback_buf[num_putback++] = utf8[2];
1169 	      putback_buf[num_putback++] = utf8[1];
1170 	      break;
1171 	    }
1172 	  else if (unicode_display == unicode_invalid)
1173 	    {
1174 	      putback_buf[num_putback++] = utf8[3];
1175 	      putback_buf[num_putback++] = utf8[2];
1176 	      putback_buf[num_putback++] = utf8[1];
1177 	      break;
1178 	    }
1179 	  else
1180 	    /* A valid 4-byte UTF-8 encoding.  */
1181 	    (void) display_utf8_char (utf8);
1182 	}
1183       while (1);
1184 
1185       if (output_separator)
1186 	fputs (output_separator, stdout);
1187       else
1188 	putchar ('\n');
1189     }
1190 
1191   if (c != EOF)
1192     /* FIXME: Using tail recursion here is lazy, but it works.  */
1193     print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf);
1194 }
1195 
1196 /* Display strings read in from STREAM.  Treat any UTF-8 encoded characters
1197    encountered according to the setting of the unicode_display variable.
1198    The stream is positioned at ADDRESS and is attached to FILENAME.  */
1199 
1200 static void
print_unicode_stream(const char * filename,file_ptr address,FILE * stream)1201 print_unicode_stream (const char * filename,
1202 		      file_ptr     address,
1203 		      FILE *       stream)
1204 {
1205   /* Paranoia checks...  */
1206   if (filename == NULL
1207       || stream == NULL
1208       || unicode_display == unicode_default
1209       || encoding != 'S'
1210       || encoding_bytes != 1)
1211     {
1212       fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n");
1213       return;
1214     }
1215 
1216   /* Allocate space for string_min 4-byte utf-8 characters.  */
1217   unsigned char * print_buf = xmalloc ((4 * string_min) + 1);
1218   /* We should never have to put back more than 4 bytes.  */
1219   unsigned char putback_buf[5];
1220   unsigned int num_putback = 0;
1221 
1222   print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf);
1223   free (print_buf);
1224 }
1225 
1226 /* Find the strings in file FILENAME, read from STREAM.
1227    Assume that STREAM is positioned so that the next byte read
1228    is at address ADDRESS in the file.
1229 
1230    If STREAM is NULL, do not read from it.
1231    The caller can supply a buffer of characters
1232    to be processed before the data in STREAM.
1233    MAGIC is the address of the buffer and
1234    MAGICCOUNT is how many characters are in it.
1235    Those characters come at address ADDRESS and the data in STREAM follow.  */
1236 
1237 static void
print_strings(const char * filename,FILE * stream,file_ptr address,int magiccount,char * magic)1238 print_strings (const char *filename, FILE *stream, file_ptr address,
1239 	       int magiccount, char *magic)
1240 {
1241   if (unicode_display != unicode_default)
1242     {
1243       if (magic != NULL)
1244 	print_unicode_buffer (filename, address,
1245 			      (const unsigned char *) magic, magiccount);
1246 
1247       if (stream != NULL)
1248 	print_unicode_stream (filename, address, stream);
1249       return;
1250     }
1251 
1252   char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1));
1253 
1254   while (1)
1255     {
1256       file_ptr start;
1257       unsigned int i;
1258       long c;
1259 
1260       /* See if the next `string_min' chars are all graphic chars.  */
1261     tryline:
1262       start = address;
1263       for (i = 0; i < string_min; i++)
1264 	{
1265 	  c = get_char (stream, &address, &magiccount, &magic);
1266 	  if (c == EOF)
1267 	    {
1268 	      free (buf);
1269 	      return;
1270 	    }
1271 
1272 	  if (! STRING_ISGRAPHIC (c))
1273 	    {
1274 	      /* Found a non-graphic.  Try again starting with next byte.  */
1275 	      unget_part_char (c, &address, &magiccount, &magic);
1276 	      goto tryline;
1277 	    }
1278 	  buf[i] = c;
1279 	}
1280 
1281       /* We found a run of `string_min' graphic characters.  Print up
1282 	 to the next non-graphic character.  */
1283       print_filename_and_address (filename, start);
1284 
1285       buf[i] = '\0';
1286       fputs (buf, stdout);
1287 
1288       while (1)
1289 	{
1290 	  c = get_char (stream, &address, &magiccount, &magic);
1291 	  if (c == EOF)
1292 	    break;
1293 	  if (! STRING_ISGRAPHIC (c))
1294 	    {
1295 	      unget_part_char (c, &address, &magiccount, &magic);
1296 	      break;
1297 	    }
1298 	  putchar (c);
1299 	}
1300 
1301       if (output_separator)
1302 	fputs (output_separator, stdout);
1303       else
1304 	putchar ('\n');
1305     }
1306   free (buf);
1307 }
1308 
1309 static void
usage(FILE * stream,int status)1310 usage (FILE *stream, int status)
1311 {
1312   fprintf (stream, _("Usage: %s [option(s)] [file(s)]\n"), program_name);
1313   fprintf (stream, _(" Display printable strings in [file(s)] (stdin by default)\n"));
1314   fprintf (stream, _(" The options are:\n"));
1315 
1316   if (DEFAULT_STRINGS_ALL)
1317     fprintf (stream, _("\
1318   -a - --all                Scan the entire file, not just the data section [default]\n\
1319   -d --data                 Only scan the data sections in the file\n"));
1320   else
1321     fprintf (stream, _("\
1322   -a - --all                Scan the entire file, not just the data section\n\
1323   -d --data                 Only scan the data sections in the file [default]\n"));
1324 
1325   fprintf (stream, _("\
1326   -f --print-file-name      Print the name of the file before each string\n\
1327   -n <number>               Locate & print any sequence of at least <number>\n\
1328     --bytes=<number>         displayable characters.  (The default is 4).\n\
1329   -t --radix={o,d,x}        Print the location of the string in base 8, 10 or 16\n\
1330   -w --include-all-whitespace Include all whitespace as valid string characters\n\
1331   -o                        An alias for --radix=o\n\
1332   -T --target=<BFDNAME>     Specify the binary file format\n\
1333   -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\
1334                             s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\
1335   --unicode={default|show|invalid|hex|escape|highlight}\n\
1336   -U {d|s|i|x|e|h}          Specify how to treat UTF-8 encoded unicode characters\n\
1337   -s --output-separator=<string> String used to separate strings in output.\n\
1338   @<file>                   Read options from <file>\n\
1339   -h --help                 Display this information\n\
1340   -v -V --version           Print the program's version number\n"));
1341   list_supported_targets (program_name, stream);
1342   if (REPORT_BUGS_TO[0] && status == 0)
1343     fprintf (stream, _("Report bugs to %s\n"), REPORT_BUGS_TO);
1344   exit (status);
1345 }
1346