175fd0b74Schristos /* strings -- print the strings of printable characters in files
2*e992f068Schristos Copyright (C) 1993-2022 Free Software Foundation, Inc.
375fd0b74Schristos
475fd0b74Schristos This program is free software; you can redistribute it and/or modify
575fd0b74Schristos it under the terms of the GNU General Public License as published by
675fd0b74Schristos the Free Software Foundation; either version 3, or (at your option)
775fd0b74Schristos any later version.
875fd0b74Schristos
975fd0b74Schristos This program is distributed in the hope that it will be useful,
1075fd0b74Schristos but WITHOUT ANY WARRANTY; without even the implied warranty of
1175fd0b74Schristos MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1275fd0b74Schristos GNU General Public License for more details.
1375fd0b74Schristos
1475fd0b74Schristos You should have received a copy of the GNU General Public License
1575fd0b74Schristos along with this program; if not, write to the Free Software
1675fd0b74Schristos Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
1775fd0b74Schristos 02110-1301, USA. */
1875fd0b74Schristos
1975fd0b74Schristos /* Usage: strings [options] file...
2075fd0b74Schristos
2175fd0b74Schristos Options:
2275fd0b74Schristos --all
2375fd0b74Schristos -a
2475fd0b74Schristos - Scan each file in its entirety.
2575fd0b74Schristos
2675fd0b74Schristos --data
2775fd0b74Schristos -d Scan only the initialized data section(s) of object files.
2875fd0b74Schristos
2975fd0b74Schristos --print-file-name
3075fd0b74Schristos -f Print the name of the file before each string.
3175fd0b74Schristos
3275fd0b74Schristos --bytes=min-len
3375fd0b74Schristos -n min-len
3475fd0b74Schristos -min-len Print graphic char sequences, MIN-LEN or more bytes long,
35*e992f068Schristos that are followed by a NUL or a non-displayable character.
36*e992f068Schristos Default is 4.
3775fd0b74Schristos
3875fd0b74Schristos --radix={o,x,d}
3975fd0b74Schristos -t {o,x,d} Print the offset within the file before each string,
4075fd0b74Schristos in octal/hex/decimal.
4175fd0b74Schristos
4275fd0b74Schristos --include-all-whitespace
4375fd0b74Schristos -w By default tab and space are the only whitespace included in graphic
4475fd0b74Schristos char sequences. This option considers all of isspace() valid.
4575fd0b74Schristos
4675fd0b74Schristos -o Like -to. (Some other implementations have -o like -to,
4775fd0b74Schristos others like -td. We chose one arbitrarily.)
4875fd0b74Schristos
4975fd0b74Schristos --encoding={s,S,b,l,B,L}
5075fd0b74Schristos -e {s,S,b,l,B,L}
5175fd0b74Schristos Select character encoding: 7-bit-character, 8-bit-character,
5275fd0b74Schristos bigendian 16-bit, littleendian 16-bit, bigendian 32-bit,
5375fd0b74Schristos littleendian 32-bit.
5475fd0b74Schristos
5575fd0b74Schristos --target=BFDNAME
5675fd0b74Schristos -T {bfdname}
5775fd0b74Schristos Specify a non-default object file format.
5875fd0b74Schristos
59*e992f068Schristos --unicode={default|locale|invalid|hex|escape|highlight}
60*e992f068Schristos -U {d|l|i|x|e|h}
61*e992f068Schristos Determine how to handle UTF-8 unicode characters. The default
62*e992f068Schristos is no special treatment. All other versions of this option
63*e992f068Schristos only apply if the encoding is valid and enabling the option
64*e992f068Schristos implies --encoding=S.
65*e992f068Schristos The 'locale' option displays the characters according to the
66*e992f068Schristos current locale. The 'invalid' option treats them as
67*e992f068Schristos non-string characters. The 'hex' option displays them as hex
68*e992f068Schristos byte sequences. The 'escape' option displays them as escape
69*e992f068Schristos sequences and the 'highlight' option displays them as
70*e992f068Schristos coloured escape sequences.
71*e992f068Schristos
7275fd0b74Schristos --output-separator=sep_string
7375fd0b74Schristos -s sep_string String used to separate parsed strings in output.
7475fd0b74Schristos Default is newline.
7575fd0b74Schristos
7675fd0b74Schristos --help
7775fd0b74Schristos -h Print the usage message on the standard output.
7875fd0b74Schristos
7975fd0b74Schristos --version
8075fd0b74Schristos -V
8175fd0b74Schristos -v Print the program version number.
8275fd0b74Schristos
8375fd0b74Schristos Written by Richard Stallman <rms@gnu.ai.mit.edu>
8475fd0b74Schristos and David MacKenzie <djm@gnu.ai.mit.edu>. */
8575fd0b74Schristos
8675fd0b74Schristos #include "sysdep.h"
8775fd0b74Schristos #include "bfd.h"
8875fd0b74Schristos #include "getopt.h"
8975fd0b74Schristos #include "libiberty.h"
9075fd0b74Schristos #include "safe-ctype.h"
9175fd0b74Schristos #include "bucomm.h"
9275fd0b74Schristos
/* String equality test.  Only defined here when nothing included
   earlier has already supplied a macro of this name.  */
#if !defined (streq)
# define streq(a, b) (strcmp ((a), (b)) == 0)
#endif
96*e992f068Schristos
/* The ways in which UTF-8 encoded characters can be handled, as
   selected by the -U/--unicode option.  */
typedef enum unicode_display_type
{
  unicode_default   = 0,  /* No special treatment.  */
  unicode_locale    = 1,  /* Display according to the current locale.  */
  unicode_escape    = 2,  /* Display as escape sequences.  */
  unicode_hex       = 3,  /* Display as hex byte sequences.  */
  unicode_highlight = 4,  /* Display as coloured escape sequences.  */
  unicode_invalid   = 5   /* Treat as non-string characters.  */
} unicode_display_type;

/* The handling currently in force.  */
static unicode_display_type unicode_display = unicode_default;
108*e992f068Schristos
/* Non-zero if C may form part of a reported string.  C must lie in the
   range 0..255 and be a tab, a printable character, a byte above 127
   when the encoding is 'S', or - when include_all_whitespace is set -
   any whitespace character.
   C is evaluated more than once, so it must have no side effects.  */
#define STRING_ISGRAPHIC(c)                                     \
  ((c) >= 0                                                     \
   && (c) <= 255                                                \
   && ((c) == '\t'                                              \
       || ISPRINT (c)                                           \
       || (encoding == 'S' && (c) > 127)                        \
       || (include_all_whitespace && ISSPACE (c))))
11575fd0b74Schristos
/* Declare errno ourselves when it is not already defined as a macro.  */
#ifndef errno
extern int errno;
#endif

/* The flags that must all be set for a BFD section to be treated as
   an initialized data section.  */
#define DATA_FLAGS (SEC_ALLOC | SEC_LOAD | SEC_HAS_CONTENTS)

/* Base in which string addresses are printed; one of 8, 10 or 16.  */
static int address_radix;

/* Shortest sequence of graphic characters that is reported.  */
static unsigned int string_min;

/* True if every whitespace character counts as a graphic character.  */
static bool include_all_whitespace;

/* True if each string is preceded by its address within the file.  */
static bool print_addresses;

/* True if each string is preceded by the name of its file.  */
static bool print_filenames;

/* True if only the data section(s) of object files are scanned.  */
static bool datasection_only;

/* Name of the BFD object file format to use.  */
static char *target;

/* The character encoding letter, and the number of bytes that make up
   one character in that encoding.  */
static char encoding;
static int encoding_bytes;

/* String written between the strings that are found.  */
static char *output_separator;
15075fd0b74Schristos
/* Long option table for getopt_long.  Each long option maps onto the
   short option letter that main's switch handles.  The all-zero entry
   terminates the table.  */
static struct option long_options[] =
{
  { "all",                    no_argument,       NULL, 'a' },
  { "bytes",                  required_argument, NULL, 'n' },
  { "data",                   no_argument,       NULL, 'd' },
  { "encoding",               required_argument, NULL, 'e' },
  { "help",                   no_argument,       NULL, 'h' },
  { "include-all-whitespace", no_argument,       NULL, 'w' },
  { "output-separator",       required_argument, NULL, 's' },
  { "print-file-name",        no_argument,       NULL, 'f' },
  { "radix",                  required_argument, NULL, 't' },
  { "target",                 required_argument, NULL, 'T' },
  { "unicode",                required_argument, NULL, 'U' },
  { "version",                no_argument,       NULL, 'v' },
  { NULL,                     0,                 NULL, 0 }
};
16775fd0b74Schristos
168*e992f068Schristos static bool strings_file (char *);
169*e992f068Schristos static void print_strings (const char *, FILE *, file_ptr, int, char *);
170ede78133Schristos static void usage (FILE *, int) ATTRIBUTE_NORETURN;
17175fd0b74Schristos
17275fd0b74Schristos int main (int, char **);
17375fd0b74Schristos
17475fd0b74Schristos int
main(int argc,char ** argv)17575fd0b74Schristos main (int argc, char **argv)
17675fd0b74Schristos {
17775fd0b74Schristos int optc;
17875fd0b74Schristos int exit_status = 0;
179*e992f068Schristos bool files_given = false;
18075fd0b74Schristos char *s;
18175fd0b74Schristos int numeric_opt = 0;
18275fd0b74Schristos
18375fd0b74Schristos setlocale (LC_ALL, "");
18475fd0b74Schristos bindtextdomain (PACKAGE, LOCALEDIR);
18575fd0b74Schristos textdomain (PACKAGE);
18675fd0b74Schristos
18775fd0b74Schristos program_name = argv[0];
18875fd0b74Schristos xmalloc_set_program_name (program_name);
18975fd0b74Schristos bfd_set_error_program_name (program_name);
19075fd0b74Schristos
19175fd0b74Schristos expandargv (&argc, &argv);
19275fd0b74Schristos
19375fd0b74Schristos string_min = 4;
194*e992f068Schristos include_all_whitespace = false;
195*e992f068Schristos print_addresses = false;
196*e992f068Schristos print_filenames = false;
19775fd0b74Schristos if (DEFAULT_STRINGS_ALL)
198*e992f068Schristos datasection_only = false;
19975fd0b74Schristos else
200*e992f068Schristos datasection_only = true;
20175fd0b74Schristos target = NULL;
20275fd0b74Schristos encoding = 's';
20375fd0b74Schristos output_separator = NULL;
20475fd0b74Schristos
205*e992f068Schristos while ((optc = getopt_long (argc, argv, "adfhHn:wot:e:T:s:U:Vv0123456789",
20675fd0b74Schristos long_options, (int *) 0)) != EOF)
20775fd0b74Schristos {
20875fd0b74Schristos switch (optc)
20975fd0b74Schristos {
21075fd0b74Schristos case 'a':
211*e992f068Schristos datasection_only = false;
21275fd0b74Schristos break;
21375fd0b74Schristos
21475fd0b74Schristos case 'd':
215*e992f068Schristos datasection_only = true;
21675fd0b74Schristos break;
21775fd0b74Schristos
21875fd0b74Schristos case 'f':
219*e992f068Schristos print_filenames = true;
22075fd0b74Schristos break;
22175fd0b74Schristos
22275fd0b74Schristos case 'H':
22375fd0b74Schristos case 'h':
22475fd0b74Schristos usage (stdout, 0);
22575fd0b74Schristos
22675fd0b74Schristos case 'n':
22775fd0b74Schristos string_min = (int) strtoul (optarg, &s, 0);
22875fd0b74Schristos if (s != NULL && *s != 0)
22975fd0b74Schristos fatal (_("invalid integer argument %s"), optarg);
23075fd0b74Schristos break;
23175fd0b74Schristos
23275fd0b74Schristos case 'w':
233*e992f068Schristos include_all_whitespace = true;
23475fd0b74Schristos break;
23575fd0b74Schristos
23675fd0b74Schristos case 'o':
237*e992f068Schristos print_addresses = true;
23875fd0b74Schristos address_radix = 8;
23975fd0b74Schristos break;
24075fd0b74Schristos
24175fd0b74Schristos case 't':
242*e992f068Schristos print_addresses = true;
24375fd0b74Schristos if (optarg[1] != '\0')
24475fd0b74Schristos usage (stderr, 1);
24575fd0b74Schristos switch (optarg[0])
24675fd0b74Schristos {
24775fd0b74Schristos case 'o':
24875fd0b74Schristos address_radix = 8;
24975fd0b74Schristos break;
25075fd0b74Schristos
25175fd0b74Schristos case 'd':
25275fd0b74Schristos address_radix = 10;
25375fd0b74Schristos break;
25475fd0b74Schristos
25575fd0b74Schristos case 'x':
25675fd0b74Schristos address_radix = 16;
25775fd0b74Schristos break;
25875fd0b74Schristos
25975fd0b74Schristos default:
26075fd0b74Schristos usage (stderr, 1);
26175fd0b74Schristos }
26275fd0b74Schristos break;
26375fd0b74Schristos
26475fd0b74Schristos case 'T':
26575fd0b74Schristos target = optarg;
26675fd0b74Schristos break;
26775fd0b74Schristos
26875fd0b74Schristos case 'e':
26975fd0b74Schristos if (optarg[1] != '\0')
27075fd0b74Schristos usage (stderr, 1);
27175fd0b74Schristos encoding = optarg[0];
27275fd0b74Schristos break;
27375fd0b74Schristos
27475fd0b74Schristos case 's':
27575fd0b74Schristos output_separator = optarg;
27675fd0b74Schristos break;
27775fd0b74Schristos
278*e992f068Schristos case 'U':
279*e992f068Schristos if (streq (optarg, "default") || streq (optarg, "d"))
280*e992f068Schristos unicode_display = unicode_default;
281*e992f068Schristos else if (streq (optarg, "locale") || streq (optarg, "l"))
282*e992f068Schristos unicode_display = unicode_locale;
283*e992f068Schristos else if (streq (optarg, "escape") || streq (optarg, "e"))
284*e992f068Schristos unicode_display = unicode_escape;
285*e992f068Schristos else if (streq (optarg, "invalid") || streq (optarg, "i"))
286*e992f068Schristos unicode_display = unicode_invalid;
287*e992f068Schristos else if (streq (optarg, "hex") || streq (optarg, "x"))
288*e992f068Schristos unicode_display = unicode_hex;
289*e992f068Schristos else if (streq (optarg, "highlight") || streq (optarg, "h"))
290*e992f068Schristos unicode_display = unicode_highlight;
291*e992f068Schristos else
292*e992f068Schristos fatal (_("invalid argument to -U/--unicode: %s"), optarg);
293*e992f068Schristos break;
294*e992f068Schristos
29575fd0b74Schristos case 'V':
29675fd0b74Schristos case 'v':
29775fd0b74Schristos print_version ("strings");
29875fd0b74Schristos break;
29975fd0b74Schristos
30075fd0b74Schristos case '?':
30175fd0b74Schristos usage (stderr, 1);
30275fd0b74Schristos
30375fd0b74Schristos default:
30475fd0b74Schristos numeric_opt = optind;
30575fd0b74Schristos break;
30675fd0b74Schristos }
30775fd0b74Schristos }
30875fd0b74Schristos
309*e992f068Schristos if (unicode_display != unicode_default)
310*e992f068Schristos encoding = 'S';
311*e992f068Schristos
31275fd0b74Schristos if (numeric_opt != 0)
31375fd0b74Schristos {
31475fd0b74Schristos string_min = (int) strtoul (argv[numeric_opt - 1] + 1, &s, 0);
31575fd0b74Schristos if (s != NULL && *s != 0)
31675fd0b74Schristos fatal (_("invalid integer argument %s"), argv[numeric_opt - 1] + 1);
31775fd0b74Schristos }
31875fd0b74Schristos if (string_min < 1)
31975fd0b74Schristos fatal (_("invalid minimum string length %d"), string_min);
32075fd0b74Schristos
32175fd0b74Schristos switch (encoding)
32275fd0b74Schristos {
32375fd0b74Schristos case 'S':
32475fd0b74Schristos case 's':
32575fd0b74Schristos encoding_bytes = 1;
32675fd0b74Schristos break;
32775fd0b74Schristos case 'b':
32875fd0b74Schristos case 'l':
32975fd0b74Schristos encoding_bytes = 2;
33075fd0b74Schristos break;
33175fd0b74Schristos case 'B':
33275fd0b74Schristos case 'L':
33375fd0b74Schristos encoding_bytes = 4;
33475fd0b74Schristos break;
33575fd0b74Schristos default:
33675fd0b74Schristos usage (stderr, 1);
33775fd0b74Schristos }
33875fd0b74Schristos
339012573ebSchristos if (bfd_init () != BFD_INIT_MAGIC)
340012573ebSchristos fatal (_("fatal error: libbfd ABI mismatch"));
34175fd0b74Schristos set_default_bfd_target ();
34275fd0b74Schristos
34375fd0b74Schristos if (optind >= argc)
34475fd0b74Schristos {
345*e992f068Schristos datasection_only = false;
34675fd0b74Schristos SET_BINARY (fileno (stdin));
347*e992f068Schristos print_strings ("{standard input}", stdin, 0, 0, (char *) NULL);
348*e992f068Schristos files_given = true;
34975fd0b74Schristos }
35075fd0b74Schristos else
35175fd0b74Schristos {
35275fd0b74Schristos for (; optind < argc; ++optind)
35375fd0b74Schristos {
354*e992f068Schristos if (streq (argv[optind], "-"))
355*e992f068Schristos datasection_only = false;
35675fd0b74Schristos else
35775fd0b74Schristos {
358*e992f068Schristos files_given = true;
359ede78133Schristos exit_status |= !strings_file (argv[optind]);
36075fd0b74Schristos }
36175fd0b74Schristos }
36275fd0b74Schristos }
36375fd0b74Schristos
36475fd0b74Schristos if (!files_given)
36575fd0b74Schristos usage (stderr, 1);
36675fd0b74Schristos
36775fd0b74Schristos return (exit_status);
36875fd0b74Schristos }
36975fd0b74Schristos
370ede78133Schristos /* Scan section SECT of the file ABFD, whose printable name is
371ede78133Schristos FILENAME. If it contains initialized data set GOT_A_SECTION and
372ede78133Schristos print the strings in it. */
37375fd0b74Schristos
37475fd0b74Schristos static void
strings_a_section(bfd * abfd,asection * sect,const char * filename,bool * got_a_section)375ede78133Schristos strings_a_section (bfd *abfd, asection *sect, const char *filename,
376*e992f068Schristos bool *got_a_section)
37775fd0b74Schristos {
37875fd0b74Schristos bfd_size_type sectsize;
379ede78133Schristos bfd_byte *mem;
38075fd0b74Schristos
38175fd0b74Schristos if ((sect->flags & DATA_FLAGS) != DATA_FLAGS)
38275fd0b74Schristos return;
38375fd0b74Schristos
384012573ebSchristos sectsize = bfd_section_size (sect);
385ede78133Schristos if (sectsize == 0)
38675fd0b74Schristos return;
38775fd0b74Schristos
388ede78133Schristos if (!bfd_malloc_and_get_section (abfd, sect, &mem))
38975fd0b74Schristos {
390ede78133Schristos non_fatal (_("%s: Reading section %s failed: %s"),
391ede78133Schristos filename, sect->name, bfd_errmsg (bfd_get_error ()));
39275fd0b74Schristos return;
39375fd0b74Schristos }
39475fd0b74Schristos
395*e992f068Schristos *got_a_section = true;
396*e992f068Schristos print_strings (filename, NULL, sect->filepos, sectsize, (char *) mem);
39775fd0b74Schristos free (mem);
39875fd0b74Schristos }
39975fd0b74Schristos
40075fd0b74Schristos /* Scan all of the sections in FILE, and print the strings
40175fd0b74Schristos in the initialized data section(s).
40275fd0b74Schristos
40375fd0b74Schristos Return TRUE if successful,
40475fd0b74Schristos FALSE if not (such as if FILE is not an object file). */
40575fd0b74Schristos
406*e992f068Schristos static bool
strings_object_file(const char * file)40775fd0b74Schristos strings_object_file (const char *file)
40875fd0b74Schristos {
40975fd0b74Schristos bfd *abfd;
410ede78133Schristos asection *s;
411*e992f068Schristos bool got_a_section;
41275fd0b74Schristos
41375fd0b74Schristos abfd = bfd_openr (file, target);
41475fd0b74Schristos
41575fd0b74Schristos if (abfd == NULL)
41675fd0b74Schristos /* Treat the file as a non-object file. */
417*e992f068Schristos return false;
41875fd0b74Schristos
41975fd0b74Schristos /* This call is mainly for its side effect of reading in the sections.
42075fd0b74Schristos We follow the traditional behavior of `strings' in that we don't
42175fd0b74Schristos complain if we don't recognize a file to be an object file. */
42275fd0b74Schristos if (!bfd_check_format (abfd, bfd_object))
42375fd0b74Schristos {
42475fd0b74Schristos bfd_close (abfd);
425*e992f068Schristos return false;
42675fd0b74Schristos }
42775fd0b74Schristos
428*e992f068Schristos got_a_section = false;
429ede78133Schristos for (s = abfd->sections; s != NULL; s = s->next)
430ede78133Schristos strings_a_section (abfd, s, file, &got_a_section);
43175fd0b74Schristos
43275fd0b74Schristos if (!bfd_close (abfd))
43375fd0b74Schristos {
43475fd0b74Schristos bfd_nonfatal (file);
435*e992f068Schristos return false;
43675fd0b74Schristos }
43775fd0b74Schristos
43875fd0b74Schristos return got_a_section;
43975fd0b74Schristos }
44075fd0b74Schristos
44175fd0b74Schristos /* Print the strings in FILE. Return TRUE if ok, FALSE if an error occurs. */
44275fd0b74Schristos
443*e992f068Schristos static bool
strings_file(char * file)44475fd0b74Schristos strings_file (char *file)
44575fd0b74Schristos {
44675fd0b74Schristos struct stat st;
44775fd0b74Schristos
44875fd0b74Schristos /* get_file_size does not support non-S_ISREG files. */
44975fd0b74Schristos
45075fd0b74Schristos if (stat (file, &st) < 0)
45175fd0b74Schristos {
45275fd0b74Schristos if (errno == ENOENT)
45375fd0b74Schristos non_fatal (_("'%s': No such file"), file);
45475fd0b74Schristos else
45575fd0b74Schristos non_fatal (_("Warning: could not locate '%s'. reason: %s"),
45675fd0b74Schristos file, strerror (errno));
457*e992f068Schristos return false;
45875fd0b74Schristos }
459ede78133Schristos else if (S_ISDIR (st.st_mode))
460ede78133Schristos {
461ede78133Schristos non_fatal (_("Warning: '%s' is a directory"), file);
462*e992f068Schristos return false;
463ede78133Schristos }
46475fd0b74Schristos
46575fd0b74Schristos /* If we weren't told to scan the whole file,
46675fd0b74Schristos try to open it as an object file and only look at
46775fd0b74Schristos initialized data sections. If that fails, fall back to the
46875fd0b74Schristos whole file. */
46975fd0b74Schristos if (!datasection_only || !strings_object_file (file))
47075fd0b74Schristos {
47175fd0b74Schristos FILE *stream;
47275fd0b74Schristos
47375fd0b74Schristos stream = fopen (file, FOPEN_RB);
47475fd0b74Schristos if (stream == NULL)
47575fd0b74Schristos {
47675fd0b74Schristos fprintf (stderr, "%s: ", program_name);
47775fd0b74Schristos perror (file);
478*e992f068Schristos return false;
47975fd0b74Schristos }
48075fd0b74Schristos
481*e992f068Schristos print_strings (file, stream, (file_ptr) 0, 0, (char *) NULL);
48275fd0b74Schristos
48375fd0b74Schristos if (fclose (stream) == EOF)
48475fd0b74Schristos {
48575fd0b74Schristos fprintf (stderr, "%s: ", program_name);
48675fd0b74Schristos perror (file);
487*e992f068Schristos return false;
48875fd0b74Schristos }
48975fd0b74Schristos }
49075fd0b74Schristos
491*e992f068Schristos return true;
49275fd0b74Schristos }
49375fd0b74Schristos
49475fd0b74Schristos /* Read the next character, return EOF if none available.
49575fd0b74Schristos Assume that STREAM is positioned so that the next byte read
49675fd0b74Schristos is at address ADDRESS in the file.
49775fd0b74Schristos
49875fd0b74Schristos If STREAM is NULL, do not read from it.
49975fd0b74Schristos The caller can supply a buffer of characters
50075fd0b74Schristos to be processed before the data in STREAM.
50175fd0b74Schristos MAGIC is the address of the buffer and
50275fd0b74Schristos MAGICCOUNT is how many characters are in it. */
50375fd0b74Schristos
50475fd0b74Schristos static long
get_char(FILE * stream,file_ptr * address,int * magiccount,char ** magic)50575fd0b74Schristos get_char (FILE *stream, file_ptr *address, int *magiccount, char **magic)
50675fd0b74Schristos {
50775fd0b74Schristos int c, i;
50875fd0b74Schristos long r = 0;
50975fd0b74Schristos
51075fd0b74Schristos for (i = 0; i < encoding_bytes; i++)
51175fd0b74Schristos {
51275fd0b74Schristos if (*magiccount)
51375fd0b74Schristos {
51475fd0b74Schristos (*magiccount)--;
51575fd0b74Schristos c = *(*magic)++;
51675fd0b74Schristos }
51775fd0b74Schristos else
51875fd0b74Schristos {
51975fd0b74Schristos if (stream == NULL)
52075fd0b74Schristos return EOF;
52175fd0b74Schristos
52275fd0b74Schristos /* Only use getc_unlocked if we found a declaration for it.
52375fd0b74Schristos Otherwise, libc is not thread safe by default, and we
52475fd0b74Schristos should not use it. */
52575fd0b74Schristos
52675fd0b74Schristos #if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
52775fd0b74Schristos c = getc_unlocked (stream);
52875fd0b74Schristos #else
52975fd0b74Schristos c = getc (stream);
53075fd0b74Schristos #endif
53175fd0b74Schristos if (c == EOF)
53275fd0b74Schristos return EOF;
53375fd0b74Schristos }
53475fd0b74Schristos
53575fd0b74Schristos (*address)++;
53675fd0b74Schristos r = (r << 8) | (c & 0xff);
53775fd0b74Schristos }
53875fd0b74Schristos
53975fd0b74Schristos switch (encoding)
54075fd0b74Schristos {
54175fd0b74Schristos default:
54275fd0b74Schristos break;
54375fd0b74Schristos case 'l':
54475fd0b74Schristos r = ((r & 0xff) << 8) | ((r & 0xff00) >> 8);
54575fd0b74Schristos break;
54675fd0b74Schristos case 'L':
54775fd0b74Schristos r = (((r & 0xff) << 24) | ((r & 0xff00) << 8)
54875fd0b74Schristos | ((r & 0xff0000) >> 8) | ((r & 0xff000000) >> 24));
54975fd0b74Schristos break;
55075fd0b74Schristos }
55175fd0b74Schristos
55275fd0b74Schristos return r;
55375fd0b74Schristos }
554012573ebSchristos
555012573ebSchristos /* Throw away one byte of a (possibly) multi-byte char C, updating
556012573ebSchristos address and buffer to suit. */
557012573ebSchristos
558012573ebSchristos static void
unget_part_char(long c,file_ptr * address,int * magiccount,char ** magic)559012573ebSchristos unget_part_char (long c, file_ptr *address, int *magiccount, char **magic)
560012573ebSchristos {
561012573ebSchristos static char tmp[4];
562012573ebSchristos
563012573ebSchristos if (encoding_bytes > 1)
564012573ebSchristos {
565012573ebSchristos *address -= encoding_bytes - 1;
566012573ebSchristos
567012573ebSchristos if (*magiccount == 0)
568012573ebSchristos {
569012573ebSchristos /* If no magic buffer exists, use temp buffer. */
570012573ebSchristos switch (encoding)
571012573ebSchristos {
572012573ebSchristos default:
573012573ebSchristos break;
574012573ebSchristos case 'b':
575012573ebSchristos tmp[0] = c & 0xff;
576012573ebSchristos *magiccount = 1;
577012573ebSchristos break;
578012573ebSchristos case 'l':
579012573ebSchristos tmp[0] = (c >> 8) & 0xff;
580012573ebSchristos *magiccount = 1;
581012573ebSchristos break;
582012573ebSchristos case 'B':
583012573ebSchristos tmp[0] = (c >> 16) & 0xff;
584012573ebSchristos tmp[1] = (c >> 8) & 0xff;
585012573ebSchristos tmp[2] = c & 0xff;
586012573ebSchristos *magiccount = 3;
587012573ebSchristos break;
588012573ebSchristos case 'L':
589012573ebSchristos tmp[0] = (c >> 8) & 0xff;
590012573ebSchristos tmp[1] = (c >> 16) & 0xff;
591012573ebSchristos tmp[2] = (c >> 24) & 0xff;
592012573ebSchristos *magiccount = 3;
593012573ebSchristos break;
594012573ebSchristos }
595012573ebSchristos *magic = tmp;
596012573ebSchristos }
597012573ebSchristos else
598012573ebSchristos {
599012573ebSchristos /* If magic buffer exists, rewind. */
600012573ebSchristos *magic -= encoding_bytes - 1;
601012573ebSchristos *magiccount += encoding_bytes - 1;
602012573ebSchristos }
603012573ebSchristos }
604012573ebSchristos }
605*e992f068Schristos
606*e992f068Schristos static void
print_filename_and_address(const char * filename,file_ptr address)607*e992f068Schristos print_filename_and_address (const char * filename, file_ptr address)
608*e992f068Schristos {
609*e992f068Schristos if (print_filenames)
610*e992f068Schristos printf ("%s: ", filename);
611*e992f068Schristos
612*e992f068Schristos if (! print_addresses)
613*e992f068Schristos return;
614*e992f068Schristos
615*e992f068Schristos switch (address_radix)
616*e992f068Schristos {
617*e992f068Schristos case 8:
618*e992f068Schristos if (sizeof (address) > sizeof (long))
619*e992f068Schristos {
620*e992f068Schristos #ifndef __MSVCRT__
621*e992f068Schristos printf ("%7llo ", (unsigned long long) address);
622*e992f068Schristos #else
623*e992f068Schristos printf ("%7I64o ", (unsigned long long) address);
624*e992f068Schristos #endif
625*e992f068Schristos }
626*e992f068Schristos else
627*e992f068Schristos printf ("%7lo ", (unsigned long) address);
628*e992f068Schristos break;
629*e992f068Schristos
630*e992f068Schristos case 10:
631*e992f068Schristos if (sizeof (address) > sizeof (long))
632*e992f068Schristos {
633*e992f068Schristos #ifndef __MSVCRT__
634*e992f068Schristos printf ("%7llu ", (unsigned long long) address);
635*e992f068Schristos #else
636*e992f068Schristos printf ("%7I64d ", (unsigned long long) address);
637*e992f068Schristos #endif
638*e992f068Schristos }
639*e992f068Schristos else
640*e992f068Schristos printf ("%7ld ", (long) address);
641*e992f068Schristos break;
642*e992f068Schristos
643*e992f068Schristos case 16:
644*e992f068Schristos if (sizeof (address) > sizeof (long))
645*e992f068Schristos {
646*e992f068Schristos #ifndef __MSVCRT__
647*e992f068Schristos printf ("%7llx ", (unsigned long long) address);
648*e992f068Schristos #else
649*e992f068Schristos printf ("%7I64x ", (unsigned long long) address);
650*e992f068Schristos #endif
651*e992f068Schristos }
652*e992f068Schristos else
653*e992f068Schristos printf ("%7lx ", (unsigned long) address);
654*e992f068Schristos break;
655*e992f068Schristos }
656*e992f068Schristos }
657*e992f068Schristos
/* Check whether the bytes at BUFFER start with a multi-byte UTF-8
   sequence.  BUFLEN is the number of bytes available at BUFFER.

   Returns the length of the sequence (2, 3 or 4) when the lead byte is
   followed by the right number of continuation bytes and the whole
   sequence fits within BUFLEN.  Returns 0 otherwise.  In particular 0
   is returned for an empty buffer, for ASCII bytes, for a continuation
   byte in the lead position, and for the lead bytes 0xf8..0xff, which
   cannot start a sequence of four bytes or fewer.

   Only the structure of the sequence is checked: overlong forms and
   code points outside the Unicode range are not rejected.  */

static unsigned int
is_valid_utf8 (const unsigned char * buffer, unsigned long buflen)
{
  unsigned char lead;
  unsigned int len;
  unsigned int i;

  /* Nothing to examine; do not touch BUFFER at all.  */
  if (buflen == 0)
    return 0;

  lead = buffer[0];

  /* Lead bytes of multi-byte sequences lie in 0xc0..0xf7.  */
  if (lead < 0xc0 || lead >= 0xf8)
    return 0;

  if ((lead & 0x20) == 0)
    len = 2;            /* 110xxxxx  */
  else if ((lead & 0x10) == 0)
    len = 3;            /* 1110xxxx  */
  else
    len = 4;            /* 11110xxx  */

  if (buflen < len)
    return 0;

  /* Every byte after the first must look like 10xxxxxx.  */
  for (i = 1; i < len; i++)
    if ((buffer[i] & 0xc0) != 0x80)
      return 0;

  return len;
}
693*e992f068Schristos
694*e992f068Schristos /* Display a UTF-8 encoded character in BUFFER according to the setting
695*e992f068Schristos of unicode_display. The character is known to be valid.
696*e992f068Schristos Returns the number of bytes consumed. */
697*e992f068Schristos
698*e992f068Schristos static unsigned int
display_utf8_char(const unsigned char * buffer)699*e992f068Schristos display_utf8_char (const unsigned char * buffer)
700*e992f068Schristos {
701*e992f068Schristos unsigned int j;
702*e992f068Schristos unsigned int utf8_len;
703*e992f068Schristos
704*e992f068Schristos switch (buffer[0] & 0x30)
705*e992f068Schristos {
706*e992f068Schristos case 0x00:
707*e992f068Schristos case 0x10:
708*e992f068Schristos utf8_len = 2;
709*e992f068Schristos break;
710*e992f068Schristos case 0x20:
711*e992f068Schristos utf8_len = 3;
712*e992f068Schristos break;
713*e992f068Schristos default:
714*e992f068Schristos utf8_len = 4;
715*e992f068Schristos }
716*e992f068Schristos
717*e992f068Schristos switch (unicode_display)
718*e992f068Schristos {
719*e992f068Schristos default:
720*e992f068Schristos fprintf (stderr, "ICE: unexpected unicode display type\n");
721*e992f068Schristos break;
722*e992f068Schristos
723*e992f068Schristos case unicode_escape:
724*e992f068Schristos case unicode_highlight:
725*e992f068Schristos if (unicode_display == unicode_highlight && isatty (1))
726*e992f068Schristos printf ("\x1B[31;47m"); /* Red. */
727*e992f068Schristos
728*e992f068Schristos switch (utf8_len)
729*e992f068Schristos {
730*e992f068Schristos case 2:
731*e992f068Schristos printf ("\\u%02x%02x",
732*e992f068Schristos ((buffer[0] & 0x1c) >> 2),
733*e992f068Schristos ((buffer[0] & 0x03) << 6) | (buffer[1] & 0x3f));
734*e992f068Schristos break;
735*e992f068Schristos
736*e992f068Schristos case 3:
737*e992f068Schristos printf ("\\u%02x%02x",
738*e992f068Schristos ((buffer[0] & 0x0f) << 4) | ((buffer[1] & 0x3c) >> 2),
739*e992f068Schristos ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3f)));
740*e992f068Schristos break;
741*e992f068Schristos
742*e992f068Schristos case 4:
743*e992f068Schristos printf ("\\u%02x%02x%02x",
744*e992f068Schristos ((buffer[0] & 0x07) << 6) | ((buffer[1] & 0x3c) >> 2),
745*e992f068Schristos ((buffer[1] & 0x03) << 6) | ((buffer[2] & 0x3c) >> 2),
746*e992f068Schristos ((buffer[2] & 0x03) << 6) | ((buffer[3] & 0x3f)));
747*e992f068Schristos break;
748*e992f068Schristos default:
749*e992f068Schristos /* URG. */
750*e992f068Schristos break;
751*e992f068Schristos }
752*e992f068Schristos
753*e992f068Schristos if (unicode_display == unicode_highlight && isatty (1))
754*e992f068Schristos printf ("\033[0m"); /* Default colour. */
755*e992f068Schristos break;
756*e992f068Schristos
757*e992f068Schristos case unicode_hex:
758*e992f068Schristos putchar ('<');
759*e992f068Schristos printf ("0x");
760*e992f068Schristos for (j = 0; j < utf8_len; j++)
761*e992f068Schristos printf ("%02x", buffer [j]);
762*e992f068Schristos putchar ('>');
763*e992f068Schristos break;
764*e992f068Schristos
765*e992f068Schristos case unicode_locale:
766*e992f068Schristos printf ("%.1s", buffer);
767*e992f068Schristos break;
768*e992f068Schristos }
769*e992f068Schristos
770*e992f068Schristos return utf8_len;
771*e992f068Schristos }
772*e992f068Schristos
773*e992f068Schristos /* Display strings in BUFFER. Treat any UTF-8 encoded characters encountered
774*e992f068Schristos according to the setting of the unicode_display variable. The buffer
775*e992f068Schristos contains BUFLEN bytes.
776*e992f068Schristos
777*e992f068Schristos Display the characters as if they started at ADDRESS and are contained in
778*e992f068Schristos FILENAME. */
779*e992f068Schristos
780*e992f068Schristos static void
print_unicode_buffer(const char * filename,file_ptr address,const unsigned char * buffer,unsigned long buflen)781*e992f068Schristos print_unicode_buffer (const char * filename,
782*e992f068Schristos file_ptr address,
783*e992f068Schristos const unsigned char * buffer,
784*e992f068Schristos unsigned long buflen)
785*e992f068Schristos {
786*e992f068Schristos /* Paranoia checks... */
787*e992f068Schristos if (filename == NULL
788*e992f068Schristos || buffer == NULL
789*e992f068Schristos || unicode_display == unicode_default
790*e992f068Schristos || encoding != 'S'
791*e992f068Schristos || encoding_bytes != 1)
792*e992f068Schristos {
793*e992f068Schristos fprintf (stderr, "ICE: bad arguments to print_unicode_buffer\n");
794*e992f068Schristos return;
795*e992f068Schristos }
796*e992f068Schristos
797*e992f068Schristos if (buflen == 0)
798*e992f068Schristos return;
799*e992f068Schristos
800*e992f068Schristos /* We must only display strings that are at least string_min *characters*
801*e992f068Schristos long. So we scan the buffer in two stages. First we locate the start
802*e992f068Schristos of a potential string. Then we walk along it until we have found
803*e992f068Schristos string_min characters. Then we go back to the start point and start
804*e992f068Schristos displaying characters according to the unicode_display setting. */
805*e992f068Schristos
806*e992f068Schristos unsigned long start_point = 0;
807*e992f068Schristos unsigned long i = 0;
808*e992f068Schristos unsigned int char_len = 1;
809*e992f068Schristos unsigned int num_found = 0;
810*e992f068Schristos
811*e992f068Schristos for (i = 0; i < buflen; i += char_len)
812*e992f068Schristos {
813*e992f068Schristos int c = buffer[i];
814*e992f068Schristos
815*e992f068Schristos char_len = 1;
816*e992f068Schristos
817*e992f068Schristos /* Find the first potential character of a string. */
818*e992f068Schristos if (! STRING_ISGRAPHIC (c))
819*e992f068Schristos {
820*e992f068Schristos num_found = 0;
821*e992f068Schristos continue;
822*e992f068Schristos }
823*e992f068Schristos
824*e992f068Schristos if (c > 126)
825*e992f068Schristos {
826*e992f068Schristos if (c < 0xc0)
827*e992f068Schristos {
828*e992f068Schristos num_found = 0;
829*e992f068Schristos continue;
830*e992f068Schristos }
831*e992f068Schristos
832*e992f068Schristos if ((char_len = is_valid_utf8 (buffer + i, buflen - i)) == 0)
833*e992f068Schristos {
834*e992f068Schristos char_len = 1;
835*e992f068Schristos num_found = 0;
836*e992f068Schristos continue;
837*e992f068Schristos }
838*e992f068Schristos
839*e992f068Schristos if (unicode_display == unicode_invalid)
840*e992f068Schristos {
841*e992f068Schristos /* We have found a valid UTF-8 character, but we treat it as non-graphic. */
842*e992f068Schristos num_found = 0;
843*e992f068Schristos continue;
844*e992f068Schristos }
845*e992f068Schristos }
846*e992f068Schristos
847*e992f068Schristos if (num_found == 0)
848*e992f068Schristos /* We have found a potential starting point for a string. */
849*e992f068Schristos start_point = i;
850*e992f068Schristos
851*e992f068Schristos ++ num_found;
852*e992f068Schristos
853*e992f068Schristos if (num_found >= string_min)
854*e992f068Schristos break;
855*e992f068Schristos }
856*e992f068Schristos
857*e992f068Schristos if (num_found < string_min)
858*e992f068Schristos return;
859*e992f068Schristos
860*e992f068Schristos print_filename_and_address (filename, address + start_point);
861*e992f068Schristos
862*e992f068Schristos /* We have found string_min characters. Display them and any
863*e992f068Schristos more that follow. */
864*e992f068Schristos for (i = start_point; i < buflen; i += char_len)
865*e992f068Schristos {
866*e992f068Schristos int c = buffer[i];
867*e992f068Schristos
868*e992f068Schristos char_len = 1;
869*e992f068Schristos
870*e992f068Schristos if (! STRING_ISGRAPHIC (c))
871*e992f068Schristos break;
872*e992f068Schristos else if (c < 127)
873*e992f068Schristos putchar (c);
874*e992f068Schristos else if (! is_valid_utf8 (buffer + i, buflen - i))
875*e992f068Schristos break;
876*e992f068Schristos else if (unicode_display == unicode_invalid)
877*e992f068Schristos break;
878*e992f068Schristos else
879*e992f068Schristos char_len = display_utf8_char (buffer + i);
880*e992f068Schristos }
881*e992f068Schristos
882*e992f068Schristos if (output_separator)
883*e992f068Schristos fputs (output_separator, stdout);
884*e992f068Schristos else
885*e992f068Schristos putchar ('\n');
886*e992f068Schristos
887*e992f068Schristos /* FIXME: Using tail recursion here is lazy programming... */
888*e992f068Schristos print_unicode_buffer (filename, address + i, buffer + i, buflen - i);
889*e992f068Schristos }
890*e992f068Schristos
/* Return the next byte to be examined.

   PUTBACK is used as a stack holding *NUM_PUTBACK bytes; if it is not
   empty the top entry is popped and returned.  Otherwise a byte is fetched
   from STREAM and *NUM_READ is incremented (this happens even when the
   fetch yields EOF).  The result is whatever getc would return, so EOF
   signals the end of the input.  */

static int
get_unicode_byte (FILE *          stream,
                  unsigned char * putback,
                  unsigned int *  num_putback,
                  unsigned int *  num_read)
{
  int next_byte;

  if (*num_putback != 0)
    {
      unsigned int top = *num_putback - 1;

      *num_putback = top;
      return putback[top];
    }

  ++*num_read;

#if defined(HAVE_GETC_UNLOCKED) && HAVE_DECL_GETC_UNLOCKED
  next_byte = getc_unlocked (stream);
#else
  next_byte = getc (stream);
#endif

  return next_byte;
}
911*e992f068Schristos
912*e992f068Schristos /* Helper function for print_unicode_stream. */
913*e992f068Schristos
914*e992f068Schristos static void
print_unicode_stream_body(const char * filename,file_ptr address,FILE * stream,unsigned char * putback_buf,unsigned int num_putback,unsigned char * print_buf)915*e992f068Schristos print_unicode_stream_body (const char * filename,
916*e992f068Schristos file_ptr address,
917*e992f068Schristos FILE * stream,
918*e992f068Schristos unsigned char * putback_buf,
919*e992f068Schristos unsigned int num_putback,
920*e992f068Schristos unsigned char * print_buf)
921*e992f068Schristos {
922*e992f068Schristos /* It would be nice if we could just read the stream into a buffer
923*e992f068Schristos and then process if with print_unicode_buffer. But the input
924*e992f068Schristos might be huge or it might time-locked (eg stdin). So instead
925*e992f068Schristos we go one byte at a time... */
926*e992f068Schristos
927*e992f068Schristos file_ptr start_point = 0;
928*e992f068Schristos unsigned int num_read = 0;
929*e992f068Schristos unsigned int num_chars = 0;
930*e992f068Schristos unsigned int num_print = 0;
931*e992f068Schristos int c = 0;
932*e992f068Schristos
933*e992f068Schristos /* Find a series of string_min characters. Put them into print_buf. */
934*e992f068Schristos do
935*e992f068Schristos {
936*e992f068Schristos if (num_chars >= string_min)
937*e992f068Schristos break;
938*e992f068Schristos
939*e992f068Schristos c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
940*e992f068Schristos if (c == EOF)
941*e992f068Schristos break;
942*e992f068Schristos
943*e992f068Schristos if (! STRING_ISGRAPHIC (c))
944*e992f068Schristos {
945*e992f068Schristos num_chars = num_print = 0;
946*e992f068Schristos continue;
947*e992f068Schristos }
948*e992f068Schristos
949*e992f068Schristos if (num_chars == 0)
950*e992f068Schristos start_point = num_read - 1;
951*e992f068Schristos
952*e992f068Schristos if (c < 127)
953*e992f068Schristos {
954*e992f068Schristos print_buf[num_print] = c;
955*e992f068Schristos num_chars ++;
956*e992f068Schristos num_print ++;
957*e992f068Schristos continue;
958*e992f068Schristos }
959*e992f068Schristos
960*e992f068Schristos if (c < 0xc0)
961*e992f068Schristos {
962*e992f068Schristos num_chars = num_print = 0;
963*e992f068Schristos continue;
964*e992f068Schristos }
965*e992f068Schristos
966*e992f068Schristos /* We *might* have a UTF-8 sequence. Time to start peeking. */
967*e992f068Schristos char utf8[4];
968*e992f068Schristos
969*e992f068Schristos utf8[0] = c;
970*e992f068Schristos c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
971*e992f068Schristos if (c == EOF)
972*e992f068Schristos break;
973*e992f068Schristos utf8[1] = c;
974*e992f068Schristos
975*e992f068Schristos if ((utf8[1] & 0xc0) != 0x80)
976*e992f068Schristos {
977*e992f068Schristos /* Invalid UTF-8. */
978*e992f068Schristos putback_buf[num_putback++] = utf8[1];
979*e992f068Schristos num_chars = num_print = 0;
980*e992f068Schristos continue;
981*e992f068Schristos }
982*e992f068Schristos else if ((utf8[0] & 0x20) == 0)
983*e992f068Schristos {
984*e992f068Schristos /* A valid 2-byte UTF-8 encoding. */
985*e992f068Schristos if (unicode_display == unicode_invalid)
986*e992f068Schristos {
987*e992f068Schristos putback_buf[num_putback++] = utf8[1];
988*e992f068Schristos num_chars = num_print = 0;
989*e992f068Schristos }
990*e992f068Schristos else
991*e992f068Schristos {
992*e992f068Schristos print_buf[num_print ++] = utf8[0];
993*e992f068Schristos print_buf[num_print ++] = utf8[1];
994*e992f068Schristos num_chars ++;
995*e992f068Schristos }
996*e992f068Schristos continue;
997*e992f068Schristos }
998*e992f068Schristos
999*e992f068Schristos c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1000*e992f068Schristos if (c == EOF)
1001*e992f068Schristos break;
1002*e992f068Schristos utf8[2] = c;
1003*e992f068Schristos
1004*e992f068Schristos if ((utf8[2] & 0xc0) != 0x80)
1005*e992f068Schristos {
1006*e992f068Schristos /* Invalid UTF-8. */
1007*e992f068Schristos putback_buf[num_putback++] = utf8[2];
1008*e992f068Schristos putback_buf[num_putback++] = utf8[1];
1009*e992f068Schristos num_chars = num_print = 0;
1010*e992f068Schristos continue;
1011*e992f068Schristos }
1012*e992f068Schristos else if ((utf8[0] & 0x10) == 0)
1013*e992f068Schristos {
1014*e992f068Schristos /* A valid 3-byte UTF-8 encoding. */
1015*e992f068Schristos if (unicode_display == unicode_invalid)
1016*e992f068Schristos {
1017*e992f068Schristos putback_buf[num_putback++] = utf8[2];
1018*e992f068Schristos putback_buf[num_putback++] = utf8[1];
1019*e992f068Schristos num_chars = num_print = 0;
1020*e992f068Schristos }
1021*e992f068Schristos else
1022*e992f068Schristos {
1023*e992f068Schristos print_buf[num_print ++] = utf8[0];
1024*e992f068Schristos print_buf[num_print ++] = utf8[1];
1025*e992f068Schristos print_buf[num_print ++] = utf8[2];
1026*e992f068Schristos num_chars ++;
1027*e992f068Schristos }
1028*e992f068Schristos continue;
1029*e992f068Schristos }
1030*e992f068Schristos
1031*e992f068Schristos c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1032*e992f068Schristos if (c == EOF)
1033*e992f068Schristos break;
1034*e992f068Schristos utf8[3] = c;
1035*e992f068Schristos
1036*e992f068Schristos if ((utf8[3] & 0xc0) != 0x80)
1037*e992f068Schristos {
1038*e992f068Schristos /* Invalid UTF-8. */
1039*e992f068Schristos putback_buf[num_putback++] = utf8[3];
1040*e992f068Schristos putback_buf[num_putback++] = utf8[2];
1041*e992f068Schristos putback_buf[num_putback++] = utf8[1];
1042*e992f068Schristos num_chars = num_print = 0;
1043*e992f068Schristos }
1044*e992f068Schristos /* We have a valid 4-byte UTF-8 encoding. */
1045*e992f068Schristos else if (unicode_display == unicode_invalid)
1046*e992f068Schristos {
1047*e992f068Schristos putback_buf[num_putback++] = utf8[3];
1048*e992f068Schristos putback_buf[num_putback++] = utf8[1];
1049*e992f068Schristos putback_buf[num_putback++] = utf8[2];
1050*e992f068Schristos num_chars = num_print = 0;
1051*e992f068Schristos }
1052*e992f068Schristos else
1053*e992f068Schristos {
1054*e992f068Schristos print_buf[num_print ++] = utf8[0];
1055*e992f068Schristos print_buf[num_print ++] = utf8[1];
1056*e992f068Schristos print_buf[num_print ++] = utf8[2];
1057*e992f068Schristos print_buf[num_print ++] = utf8[3];
1058*e992f068Schristos num_chars ++;
1059*e992f068Schristos }
1060*e992f068Schristos }
1061*e992f068Schristos while (1);
1062*e992f068Schristos
1063*e992f068Schristos if (num_chars >= string_min)
1064*e992f068Schristos {
1065*e992f068Schristos /* We know that we have string_min valid characters in print_buf,
1066*e992f068Schristos and there may be more to come in the stream. Start displaying
1067*e992f068Schristos them. */
1068*e992f068Schristos
1069*e992f068Schristos print_filename_and_address (filename, address + start_point);
1070*e992f068Schristos
1071*e992f068Schristos unsigned int i;
1072*e992f068Schristos for (i = 0; i < num_print;)
1073*e992f068Schristos {
1074*e992f068Schristos if (print_buf[i] < 127)
1075*e992f068Schristos putchar (print_buf[i++]);
1076*e992f068Schristos else
1077*e992f068Schristos i += display_utf8_char (print_buf + i);
1078*e992f068Schristos }
1079*e992f068Schristos
1080*e992f068Schristos /* OK so now we have to start read unchecked bytes. */
1081*e992f068Schristos
1082*e992f068Schristos /* Find a series of string_min characters. Put them into print_buf. */
1083*e992f068Schristos do
1084*e992f068Schristos {
1085*e992f068Schristos c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1086*e992f068Schristos if (c == EOF)
1087*e992f068Schristos break;
1088*e992f068Schristos
1089*e992f068Schristos if (! STRING_ISGRAPHIC (c))
1090*e992f068Schristos break;
1091*e992f068Schristos
1092*e992f068Schristos if (c < 127)
1093*e992f068Schristos {
1094*e992f068Schristos putchar (c);
1095*e992f068Schristos continue;
1096*e992f068Schristos }
1097*e992f068Schristos
1098*e992f068Schristos if (c < 0xc0)
1099*e992f068Schristos break;
1100*e992f068Schristos
1101*e992f068Schristos /* We *might* have a UTF-8 sequence. Time to start peeking. */
1102*e992f068Schristos unsigned char utf8[4];
1103*e992f068Schristos
1104*e992f068Schristos utf8[0] = c;
1105*e992f068Schristos c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1106*e992f068Schristos if (c == EOF)
1107*e992f068Schristos break;
1108*e992f068Schristos utf8[1] = c;
1109*e992f068Schristos
1110*e992f068Schristos if ((utf8[1] & 0xc0) != 0x80)
1111*e992f068Schristos {
1112*e992f068Schristos /* Invalid UTF-8. */
1113*e992f068Schristos putback_buf[num_putback++] = utf8[1];
1114*e992f068Schristos break;
1115*e992f068Schristos }
1116*e992f068Schristos else if ((utf8[0] & 0x20) == 0)
1117*e992f068Schristos {
1118*e992f068Schristos /* Valid 2-byte UTF-8. */
1119*e992f068Schristos if (unicode_display == unicode_invalid)
1120*e992f068Schristos {
1121*e992f068Schristos putback_buf[num_putback++] = utf8[1];
1122*e992f068Schristos break;
1123*e992f068Schristos }
1124*e992f068Schristos else
1125*e992f068Schristos {
1126*e992f068Schristos (void) display_utf8_char (utf8);
1127*e992f068Schristos continue;
1128*e992f068Schristos }
1129*e992f068Schristos }
1130*e992f068Schristos
1131*e992f068Schristos c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1132*e992f068Schristos if (c == EOF)
1133*e992f068Schristos break;
1134*e992f068Schristos utf8[2] = c;
1135*e992f068Schristos
1136*e992f068Schristos if ((utf8[2] & 0xc0) != 0x80)
1137*e992f068Schristos {
1138*e992f068Schristos /* Invalid UTF-8. */
1139*e992f068Schristos putback_buf[num_putback++] = utf8[2];
1140*e992f068Schristos putback_buf[num_putback++] = utf8[1];
1141*e992f068Schristos break;
1142*e992f068Schristos }
1143*e992f068Schristos else if ((utf8[0] & 0x10) == 0)
1144*e992f068Schristos {
1145*e992f068Schristos /* Valid 3-byte UTF-8. */
1146*e992f068Schristos if (unicode_display == unicode_invalid)
1147*e992f068Schristos {
1148*e992f068Schristos putback_buf[num_putback++] = utf8[2];
1149*e992f068Schristos putback_buf[num_putback++] = utf8[1];
1150*e992f068Schristos break;
1151*e992f068Schristos }
1152*e992f068Schristos else
1153*e992f068Schristos {
1154*e992f068Schristos (void) display_utf8_char (utf8);
1155*e992f068Schristos continue;
1156*e992f068Schristos }
1157*e992f068Schristos }
1158*e992f068Schristos
1159*e992f068Schristos c = get_unicode_byte (stream, putback_buf, & num_putback, & num_read);
1160*e992f068Schristos if (c == EOF)
1161*e992f068Schristos break;
1162*e992f068Schristos utf8[3] = c;
1163*e992f068Schristos
1164*e992f068Schristos if ((utf8[3] & 0xc0) != 0x80)
1165*e992f068Schristos {
1166*e992f068Schristos /* Invalid UTF-8. */
1167*e992f068Schristos putback_buf[num_putback++] = utf8[3];
1168*e992f068Schristos putback_buf[num_putback++] = utf8[2];
1169*e992f068Schristos putback_buf[num_putback++] = utf8[1];
1170*e992f068Schristos break;
1171*e992f068Schristos }
1172*e992f068Schristos else if (unicode_display == unicode_invalid)
1173*e992f068Schristos {
1174*e992f068Schristos putback_buf[num_putback++] = utf8[3];
1175*e992f068Schristos putback_buf[num_putback++] = utf8[2];
1176*e992f068Schristos putback_buf[num_putback++] = utf8[1];
1177*e992f068Schristos break;
1178*e992f068Schristos }
1179*e992f068Schristos else
1180*e992f068Schristos /* A valid 4-byte UTF-8 encoding. */
1181*e992f068Schristos (void) display_utf8_char (utf8);
1182*e992f068Schristos }
1183*e992f068Schristos while (1);
1184*e992f068Schristos
1185*e992f068Schristos if (output_separator)
1186*e992f068Schristos fputs (output_separator, stdout);
1187*e992f068Schristos else
1188*e992f068Schristos putchar ('\n');
1189*e992f068Schristos }
1190*e992f068Schristos
1191*e992f068Schristos if (c != EOF)
1192*e992f068Schristos /* FIXME: Using tail recursion here is lazy, but it works. */
1193*e992f068Schristos print_unicode_stream_body (filename, address + num_read, stream, putback_buf, num_putback, print_buf);
1194*e992f068Schristos }
1195*e992f068Schristos
1196*e992f068Schristos /* Display strings read in from STREAM. Treat any UTF-8 encoded characters
1197*e992f068Schristos encountered according to the setting of the unicode_display variable.
1198*e992f068Schristos The stream is positioned at ADDRESS and is attached to FILENAME. */
1199*e992f068Schristos
1200*e992f068Schristos static void
print_unicode_stream(const char * filename,file_ptr address,FILE * stream)1201*e992f068Schristos print_unicode_stream (const char * filename,
1202*e992f068Schristos file_ptr address,
1203*e992f068Schristos FILE * stream)
1204*e992f068Schristos {
1205*e992f068Schristos /* Paranoia checks... */
1206*e992f068Schristos if (filename == NULL
1207*e992f068Schristos || stream == NULL
1208*e992f068Schristos || unicode_display == unicode_default
1209*e992f068Schristos || encoding != 'S'
1210*e992f068Schristos || encoding_bytes != 1)
1211*e992f068Schristos {
1212*e992f068Schristos fprintf (stderr, "ICE: bad arguments to print_unicode_stream\n");
1213*e992f068Schristos return;
1214*e992f068Schristos }
1215*e992f068Schristos
1216*e992f068Schristos /* Allocate space for string_min 4-byte utf-8 characters. */
1217*e992f068Schristos unsigned char * print_buf = xmalloc ((4 * string_min) + 1);
1218*e992f068Schristos /* We should never have to put back more than 4 bytes. */
1219*e992f068Schristos unsigned char putback_buf[5];
1220*e992f068Schristos unsigned int num_putback = 0;
1221*e992f068Schristos
1222*e992f068Schristos print_unicode_stream_body (filename, address, stream, putback_buf, num_putback, print_buf);
1223*e992f068Schristos free (print_buf);
1224*e992f068Schristos }
122575fd0b74Schristos
122675fd0b74Schristos /* Find the strings in file FILENAME, read from STREAM.
122775fd0b74Schristos Assume that STREAM is positioned so that the next byte read
122875fd0b74Schristos is at address ADDRESS in the file.
122975fd0b74Schristos
123075fd0b74Schristos If STREAM is NULL, do not read from it.
123175fd0b74Schristos The caller can supply a buffer of characters
123275fd0b74Schristos to be processed before the data in STREAM.
123375fd0b74Schristos MAGIC is the address of the buffer and
123475fd0b74Schristos MAGICCOUNT is how many characters are in it.
123575fd0b74Schristos Those characters come at address ADDRESS and the data in STREAM follow. */
123675fd0b74Schristos
123775fd0b74Schristos static void
print_strings(const char * filename,FILE * stream,file_ptr address,int magiccount,char * magic)123875fd0b74Schristos print_strings (const char *filename, FILE *stream, file_ptr address,
1239*e992f068Schristos int magiccount, char *magic)
124075fd0b74Schristos {
1241*e992f068Schristos if (unicode_display != unicode_default)
1242*e992f068Schristos {
1243*e992f068Schristos if (magic != NULL)
1244*e992f068Schristos print_unicode_buffer (filename, address,
1245*e992f068Schristos (const unsigned char *) magic, magiccount);
1246*e992f068Schristos
1247*e992f068Schristos if (stream != NULL)
1248*e992f068Schristos print_unicode_stream (filename, address, stream);
1249*e992f068Schristos return;
1250*e992f068Schristos }
1251*e992f068Schristos
125275fd0b74Schristos char *buf = (char *) xmalloc (sizeof (char) * (string_min + 1));
125375fd0b74Schristos
125475fd0b74Schristos while (1)
125575fd0b74Schristos {
125675fd0b74Schristos file_ptr start;
1257*e992f068Schristos unsigned int i;
125875fd0b74Schristos long c;
125975fd0b74Schristos
126075fd0b74Schristos /* See if the next `string_min' chars are all graphic chars. */
126175fd0b74Schristos tryline:
126275fd0b74Schristos start = address;
126375fd0b74Schristos for (i = 0; i < string_min; i++)
126475fd0b74Schristos {
126575fd0b74Schristos c = get_char (stream, &address, &magiccount, &magic);
126675fd0b74Schristos if (c == EOF)
126775fd0b74Schristos {
126875fd0b74Schristos free (buf);
126975fd0b74Schristos return;
127075fd0b74Schristos }
1271012573ebSchristos
127275fd0b74Schristos if (! STRING_ISGRAPHIC (c))
1273012573ebSchristos {
1274012573ebSchristos /* Found a non-graphic. Try again starting with next byte. */
1275012573ebSchristos unget_part_char (c, &address, &magiccount, &magic);
127675fd0b74Schristos goto tryline;
1277012573ebSchristos }
127875fd0b74Schristos buf[i] = c;
127975fd0b74Schristos }
128075fd0b74Schristos
128175fd0b74Schristos /* We found a run of `string_min' graphic characters. Print up
128275fd0b74Schristos to the next non-graphic character. */
1283*e992f068Schristos print_filename_and_address (filename, start);
128475fd0b74Schristos
128575fd0b74Schristos buf[i] = '\0';
128675fd0b74Schristos fputs (buf, stdout);
128775fd0b74Schristos
128875fd0b74Schristos while (1)
128975fd0b74Schristos {
129075fd0b74Schristos c = get_char (stream, &address, &magiccount, &magic);
129175fd0b74Schristos if (c == EOF)
129275fd0b74Schristos break;
129375fd0b74Schristos if (! STRING_ISGRAPHIC (c))
1294012573ebSchristos {
1295012573ebSchristos unget_part_char (c, &address, &magiccount, &magic);
129675fd0b74Schristos break;
1297012573ebSchristos }
129875fd0b74Schristos putchar (c);
129975fd0b74Schristos }
130075fd0b74Schristos
130175fd0b74Schristos if (output_separator)
130275fd0b74Schristos fputs (output_separator, stdout);
130375fd0b74Schristos else
130475fd0b74Schristos putchar ('\n');
130575fd0b74Schristos }
130675fd0b74Schristos free (buf);
130775fd0b74Schristos }
130875fd0b74Schristos
/* Print the help text for the program to STREAM, followed by the list of
   supported targets, and then exit with STATUS.  This function does not
   return.  */
130975fd0b74Schristos static void
usage(FILE * stream,int status)131075fd0b74Schristos usage (FILE *stream, int status)
131175fd0b74Schristos {
131275fd0b74Schristos fprintf (stream, _("Usage: %s [option(s)] [file(s)]\n"), program_name);
131375fd0b74Schristos fprintf (stream, _(" Display printable strings in [file(s)] (stdin by default)\n"));
131475fd0b74Schristos fprintf (stream, _(" The options are:\n"));
131575fd0b74Schristos
/* Tag whichever of --all / --data is selected by DEFAULT_STRINGS_ALL as
   the default.  */
131675fd0b74Schristos if (DEFAULT_STRINGS_ALL)
131775fd0b74Schristos fprintf (stream, _("\
131875fd0b74Schristos -a - --all Scan the entire file, not just the data section [default]\n\
131975fd0b74Schristos -d --data Only scan the data sections in the file\n"));
132075fd0b74Schristos else
132175fd0b74Schristos fprintf (stream, _("\
132275fd0b74Schristos -a - --all Scan the entire file, not just the data section\n\
132375fd0b74Schristos -d --data Only scan the data sections in the file [default]\n"));
132475fd0b74Schristos
132575fd0b74Schristos fprintf (stream, _("\
132675fd0b74Schristos -f --print-file-name Print the name of the file before each string\n\
1327*e992f068Schristos -n <number> Locate & print any sequence of at least <number>\n\
1328*e992f068Schristos --bytes=<number> displayable characters. (The default is 4).\n\
132975fd0b74Schristos -t --radix={o,d,x} Print the location of the string in base 8, 10 or 16\n\
133075fd0b74Schristos -w --include-all-whitespace Include all whitespace as valid string characters\n\
133175fd0b74Schristos -o An alias for --radix=o\n\
133275fd0b74Schristos -T --target=<BFDNAME> Specify the binary file format\n\
133375fd0b74Schristos -e --encoding={s,S,b,l,B,L} Select character size and endianness:\n\
133475fd0b74Schristos s = 7-bit, S = 8-bit, {b,l} = 16-bit, {B,L} = 32-bit\n\
1335*e992f068Schristos --unicode={default|show|invalid|hex|escape|highlight}\n\
1336*e992f068Schristos -U {d|s|i|x|e|h} Specify how to treat UTF-8 encoded unicode characters\n\
133775fd0b74Schristos -s --output-separator=<string> String used to separate strings in output.\n\
133875fd0b74Schristos @<file> Read options from <file>\n\
133975fd0b74Schristos -h --help Display this information\n\
134075fd0b74Schristos -v -V --version Print the program's version number\n"));
134175fd0b74Schristos list_supported_targets (program_name, stream);
/* The bug reporting address is only shown when one is configured and
   STATUS is zero.  */
134275fd0b74Schristos if (REPORT_BUGS_TO[0] && status == 0)
134375fd0b74Schristos fprintf (stream, _("Report bugs to %s\n"), REPORT_BUGS_TO);
134475fd0b74Schristos exit (status);
134575fd0b74Schristos }
1346