xref: /netbsd-src/external/gpl2/gettext/dist/gettext-tools/src/x-sh.c (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1 /* xgettext sh backend.
2    Copyright (C) 2003, 2005-2006 Free Software Foundation, Inc.
3    Written by Bruno Haible <bruno@clisp.org>, 2003.
4 
5    This program is free software; you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 2, or (at your option)
8    any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program; if not, write to the Free Software Foundation,
17    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
18 
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22 
23 #include <errno.h>
24 #include <limits.h>
25 #include <stdbool.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 
30 #include "message.h"
31 #include "xgettext.h"
32 #include "x-sh.h"
33 #include "error.h"
34 #include "xalloc.h"
35 #include "exit.h"
36 #include "hash.h"
37 #include "gettext.h"
38 
39 #define _(s) gettext(s)
40 
/* Number of elements of the array A.  A must be a true array, not a pointer.
   The argument is parenthesized so that an expression argument indexes as
   intended.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
42 
43 
44 /* The sh syntax is defined in POSIX:2001, see
45      http://www.opengroup.org/onlinepubs/007904975/utilities/xcu_chap02.html
46    Summary of sh syntax:
47    - Input is broken into words, which are then subject to
48      - tilde expansion ~...
49      - command substitution `...`
50      - variable substitution $var
51      - arithmetic substitution $((...))
52      - field splitting at whitespace (IFS)
53      - wildcard pattern expansion *?
54      - quote removal
55    - Strings are enclosed in "..."; command substitution, variable
56      substitution and arithmetic substitution are performed here as well.
57    - '...' is a string without substitutions.
58    - The list of resulting words is split into commands by semicolon and
59      newline.
60    - '#' at the beginning of a word introduces a comment until end of line.
61    The parser is implemented in bash-2.05b/parse.y.  */
62 
63 
64 /* ====================== Keyword set customization.  ====================== */
65 
66 /* If true extract all strings.  Set by x_sh_extract_all ().  */
67 static bool extract_all = false;
68 
/* Keyword call shapes registered through x_sh_keyword ().  The table is
   created lazily, when the first non-NULL keyword is added.  */
69 static hash_table keywords;
/* True until the built-in keywords have been registered by init_keywords ()
   or switched off by x_sh_keyword (NULL).  */
70 static bool default_keywords = true;
71 
72 
73 void
x_sh_extract_all()74 x_sh_extract_all ()
75 {
76   extract_all = true;
77 }
78 
79 
80 void
x_sh_keyword(const char * name)81 x_sh_keyword (const char *name)
82 {
83   if (name == NULL)
84     default_keywords = false;
85   else
86     {
87       const char *end;
88       struct callshape shape;
89       const char *colon;
90 
91       if (keywords.table == NULL)
92 	hash_init (&keywords, 100);
93 
94       split_keywordspec (name, &end, &shape);
95 
96       /* The characters between name and end should form a valid C identifier.
97 	 A colon means an invalid parse in split_keywordspec().  */
98       colon = strchr (name, ':');
99       if (colon == NULL || colon >= end)
100 	insert_keyword_callshape (&keywords, name, end - name, &shape);
101     }
102 }
103 
104 /* Finish initializing the keywords hash table.
105    Called after argument processing, before each file is processed.  */
106 static void
init_keywords()107 init_keywords ()
108 {
109   if (default_keywords)
110     {
111       /* When adding new keywords here, also update the documentation in
112 	 xgettext.texi!  */
113       x_sh_keyword ("gettext");
114       x_sh_keyword ("ngettext:1,2");
115       x_sh_keyword ("eval_gettext");
116       x_sh_keyword ("eval_ngettext:1,2");
117       default_keywords = false;
118     }
119 }
120 
/* Record the format-string flags of the built-in sh keywords.
   Argument 1 of gettext and arguments 1 and 2 of ngettext are recorded as
   "pass-sh-format"; argument 1 of eval_gettext and arguments 1 and 2 of
   eval_ngettext are recorded as "sh-format".  The specifications are passed
   to xgettext_record_flag () in the order listed.  */
void
init_flag_table_sh (void)
{
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-sh-format",
      "ngettext:1:pass-sh-format",
      "ngettext:2:pass-sh-format",
      "eval_gettext:1:sh-format",
      "eval_ngettext:1:sh-format",
      "eval_ngettext:2:sh-format"
    };
  size_t i;

  for (i = 0; i < sizeof flag_specs / sizeof flag_specs[0]; i++)
    xgettext_record_flag (flag_specs[i]);
}
131 
132 
133 /* ======================== Reading of characters.  ======================== */
134 
135 /* Real filename, used in error messages about the input file.  */
136 static const char *real_file_name;
137 
138 /* Logical filename and line number, used to label the extracted messages.
   line_number is kept up to date by do_getc () / do_ungetc () and by the
   pushback routines of phase 1 and phase 2, which adjust it whenever a
   newline is read or pushed back.  */
139 static char *logical_file_name;
140 static int line_number;
141 
142 /* The input file stream.  */
143 static FILE *fp;
144 
145 
146 /* Fetch the next character from the input file.  */
147 static int
do_getc()148 do_getc ()
149 {
150   int c = getc (fp);
151 
152   if (c == EOF)
153     {
154       if (ferror (fp))
155 	error (EXIT_FAILURE, errno, _("\
156 error while reading \"%s\""), real_file_name);
157     }
158   else if (c == '\n')
159    line_number++;
160 
161   return c;
162 }
163 
164 /* Put back the last fetched character, not EOF.  */
165 static void
do_ungetc(int c)166 do_ungetc (int c)
167 {
168   if (c == '\n')
169     line_number--;
170   ungetc (c, fp);
171 }
172 
173 
174 /* Remove backslash followed by newline from the input stream.  */
175 
/* Storage for the single character that phase1_ungetc () can push back.  */
176 static int phase1_pushback[1];
/* Number of characters currently held in phase1_pushback.  */
177 static int phase1_pushback_length;
178 
179 static int
phase1_getc()180 phase1_getc ()
181 {
182   int c;
183 
184   if (phase1_pushback_length)
185     {
186       c = phase1_pushback[--phase1_pushback_length];
187       if (c == '\n')
188 	++line_number;
189       return c;
190     }
191   for (;;)
192     {
193       c = do_getc ();
194       if (c != '\\')
195 	return c;
196       c = do_getc ();
197       if (c != '\n')
198 	{
199 	  if (c != EOF)
200 	    do_ungetc (c);
201 	  return '\\';
202 	}
203     }
204 }
205 
206 /* Supports only one pushback character.  */
207 static void
phase1_ungetc(int c)208 phase1_ungetc (int c)
209 {
210   switch (c)
211     {
212     case EOF:
213       break;
214 
215     case '\n':
216       --line_number;
217       /* FALLTHROUGH */
218 
219     default:
220       if (phase1_pushback_length == SIZEOF (phase1_pushback))
221 	abort ();
222       phase1_pushback[phase1_pushback_length++] = c;
223       break;
224     }
225 }
226 
227 
228 /* ========================== Reading of tokens.  ========================== */
229 
230 
231 /* A token consists of a sequence of characters.
   'chars' is a growable buffer managed by init_token (), grow_token () and
   free_token (); none of them stores a terminating NUL, string_of_token ()
   adds one to the copy it returns.  */
232 struct token
233 {
234   int allocated;		/* capacity of 'chars', in characters */
235   int charcount;		/* number of characters of 'chars' in use */
236   char *chars;			/* the token's constituents */
237 };
238 
239 /* Initialize a 'struct token'.  */
240 static inline void
init_token(struct token * tp)241 init_token (struct token *tp)
242 {
243   tp->allocated = 10;
244   tp->chars = (char *) xmalloc (tp->allocated * sizeof (char));
245   tp->charcount = 0;
246 }
247 
248 /* Free the memory pointed to by a 'struct token'.  */
249 static inline void
free_token(struct token * tp)250 free_token (struct token *tp)
251 {
252   free (tp->chars);
253 }
254 
255 /* Ensure there is enough room in the token for one more character.  */
256 static inline void
grow_token(struct token * tp)257 grow_token (struct token *tp)
258 {
259   if (tp->charcount == tp->allocated)
260     {
261       tp->allocated *= 2;
262       tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char));
263     }
264 }
265 
266 /* Convert a struct token * to a char*.  */
267 static char *
string_of_token(const struct token * tp)268 string_of_token (const struct token *tp)
269 {
270   char *str;
271   int n;
272 
273   n = tp->charcount;
274   str = (char *) xmalloc (n + 1);
275   memcpy (str, tp->chars, n);
276   str[n] = '\0';
277   return str;
278 }
279 
280 
281 /* ========================= Accumulating messages ========================= */
282 
283 
/* The message list that read_word () hands to remember_a_message ().  */
284 static message_list_ty *mlp;
285 
286 
287 /* ========================= Accumulating comments ========================= */
288 
289 
/* Buffer holding the comment line currently being accumulated.  It is
   grown with xrealloc () by comment_add () and comment_line_end () and is
   reused from one comment line to the next.  */
290 static char *buffer;
/* Allocated size of 'buffer', in bytes.  */
291 static size_t bufmax;
/* Number of bytes of 'buffer' in use.  */
292 static size_t buflen;
293 
294 static inline void
comment_start()295 comment_start ()
296 {
297   buflen = 0;
298 }
299 
300 static inline void
comment_add(int c)301 comment_add (int c)
302 {
303   if (buflen >= bufmax)
304     {
305       bufmax = 2 * bufmax + 10;
306       buffer = xrealloc (buffer, bufmax);
307     }
308   buffer[buflen++] = c;
309 }
310 
311 static inline void
comment_line_end()312 comment_line_end ()
313 {
314   while (buflen >= 1
315 	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
316     --buflen;
317   if (buflen >= bufmax)
318     {
319       bufmax = 2 * bufmax + 10;
320       buffer = xrealloc (buffer, bufmax);
321     }
322   buffer[buflen] = '\0';
323   savable_comment_add (buffer);
324 }
325 
326 
327 /* These are for tracking whether comments count as immediately before
328    keyword.
   read_word () stores the current line number in last_comment_line when it
   meets a '#' comment and in last_non_comment_line when it finishes a
   non-comment word; at a newline it resets the saved comments if
   last_non_comment_line > last_comment_line.  */
329 static int last_comment_line;
330 static int last_non_comment_line;
331 
332 
333 /* ========================= Debackslashification ========================== */
334 
335 /* This state tracks the effect of backquotes, double-quotes and single-quotes
336    on the parsing of backslashes.  We make a single pass through the input
337    file, keeping the state up to date.  This is much faster than accumulating
338    strings and processing them with explicit debackslashification, like the
339    shell does it.  */
340 
341 /* The number of nested `...` or "`...`" constructs.  Assumed to be <= 32.  */
/* NOTE(review): phase2_getc () computes 1 << (nested_backquotes + 1) when no
   single-quote is open, and saw_opening_backquote () computes
   1 << nested_backquotes.  With a 32-bit 'unsigned int' those shifts are
   presumably only defined for a nesting depth below 31 -- confirm.  */
342 static unsigned int nested_backquotes;
343 
344 /* A bit mask indicating which of the currently open `...` or "`...`"
345    constructs is with double-quotes: "`...`".
346    A bit value of 1 stands for "`...`", a bit value of 0 stands for `...`.
347    Bit position 0 designates the outermost backquotes nesting,
348    bit position 1 the second-outermost backquotes nesting,
349    ...
350    bit position (nested_backquotes-1) the innermost backquotes nesting.  */
351 static unsigned int open_doublequotes_mask;
352 
353 /* A bit indicating whether a double-quote is currently open inside the
354    innermost backquotes nesting.  */
355 static bool open_doublequote;
356 
357 /* A bit indicating whether a single-quote is currently open inside the
358    innermost backquotes nesting.  */
359 static bool open_singlequote;
360 
361 /* The expected terminator of the currently open single-quote.
362    Usually '\'', but can be '"' for i18n-quotes.
   saw_opening_singlequote () sets it to '\''; read_word () overrides it
   with '"' right after opening a $"..." string.  */
363 static char open_singlequote_terminator;
364 
365 
366 /* Functions to update the state.  */
367 
368 static inline void
saw_opening_backquote()369 saw_opening_backquote ()
370 {
371   if (open_singlequote)
372     abort ();
373   if (open_doublequote)
374     open_doublequotes_mask |= (unsigned int) 1 << nested_backquotes;
375   nested_backquotes++;
376   open_doublequote = false;
377 }
378 
379 static inline void
saw_closing_backquote()380 saw_closing_backquote ()
381 {
382   nested_backquotes--;
383   open_doublequote = (open_doublequotes_mask >> nested_backquotes) & 1;
384   open_doublequotes_mask &= ((unsigned int) 1 << nested_backquotes) - 1;
385   open_singlequote = false; /* just for safety */
386 }
387 
388 static inline void
saw_opening_doublequote()389 saw_opening_doublequote ()
390 {
391   if (open_singlequote || open_doublequote)
392     abort ();
393   open_doublequote = true;
394 }
395 
396 static inline void
saw_closing_doublequote()397 saw_closing_doublequote ()
398 {
399   if (open_singlequote || !open_doublequote)
400     abort ();
401   open_doublequote = false;
402 }
403 
404 static inline void
saw_opening_singlequote()405 saw_opening_singlequote ()
406 {
407   if (open_doublequote || open_singlequote)
408     abort ();
409   open_singlequote = true;
410   open_singlequote_terminator = '\'';
411 }
412 
413 static inline void
saw_closing_singlequote()414 saw_closing_singlequote ()
415 {
416   if (open_doublequote || !open_singlequote)
417     abort ();
418   open_singlequote = false;
419 }
420 
421 
422 /* ========================== Reading of commands ========================== */
423 
424 /* We are only interested in constant strings.  Other words need not be
425    represented precisely.
   This is the classification that read_word () stores in 'struct word'.  */
426 enum word_type
427 {
428   t_string,	/* constant string */
429   t_other,	/* other string */
430   t_separator,	/* command separator: semicolon or newline */
431   t_redirect,	/* redirection: one of < > >| << <<- >> <> <& >& */
432   t_backquote,	/* closing '`' pseudo word */
433   t_paren,	/* closing ')' pseudo word */
434   t_eof		/* EOF marker */
435 };
436 
/* A word as produced by read_word ().  'token' and 'line_number_at_start'
   are meaningful only when 'type' is t_string; for that type 'token' points
   to a heap-allocated 'struct token' that free_word () releases.  */
437 struct word
438 {
439   enum word_type type;
440   struct token *token;		/* for t_string */
441   int line_number_at_start;	/* for t_string */
442 };
443 
444 /* Free the memory pointed to by a 'struct word'.  */
445 static inline void
free_word(struct word * wp)446 free_word (struct word *wp)
447 {
448   if (wp->type == t_string)
449     {
450       free_token (wp->token);
451       free (wp->token);
452     }
453 }
454 
455 /* Convert a t_string token to a char*.  */
456 static char *
string_of_word(const struct word * wp)457 string_of_word (const struct word *wp)
458 {
459   char *str;
460   int n;
461 
462   if (!(wp->type == t_string))
463     abort ();
464   n = wp->token->charcount;
465   str = (char *) xmalloc (n + 1);
466   memcpy (str, wp->token->chars, n);
467   str[n] = '\0';
468   return str;
469 }
470 
471 
/* Whitespace recognition.
   Returns true exactly for an unquoted space, tab or newline.  */

static inline bool
is_whitespace (int c)
{
  switch (c)
    {
    case ' ':
    case '\t':
    case '\n':
      return true;

    default:
      return false;
    }
}
479 
/* Operator character recognition.
   Returns true exactly for an unquoted character that can begin a shell
   operator: one of | & ; < > ( )  */

static inline bool
is_operator_start (int c)
{
  switch (c)
    {
    case '|':
    case '&':
    case ';':
    case '<':
    case '>':
    case '(':
    case ')':
      return true;

    default:
      return false;
    }
}
488 
489 
490 /* Denotation of a quoted character.
491    The distinction between quoted and unquoted character is important only for
492    the special, whitespace and operator characters; it is irrelevant for
493    alphanumeric characters, '\\' and many others.
   QUOTED (c) lies just above the 'unsigned char' range; storing it through
   a cast to 'unsigned char', as read_word () does, yields c again.  */
494 #define QUOTED(c) (UCHAR_MAX + 1 + (c))
495 /* Values in the 'unsigned char' range are implicitly unquoted.  Among these,
496    the following are important:
497      '"'         opening or closing double quote
498      '\''        opening or closing single quote
499      '$'         the unknown result of a dollar expansion
500      '`'         does not occur - replaced with OPENING_BACKQUOTE or
501                  CLOSING_BACKQUOTE
502  */
/* Pseudo characters for backquotes; they lie above every QUOTED (c) value
   of an 'unsigned char' c.  */
503 #define OPENING_BACKQUOTE (2 * (UCHAR_MAX + 1) + '`')
504 #define CLOSING_BACKQUOTE (3 * (UCHAR_MAX + 1) + '`')
505 
506 /* 2 characters of pushback are supported.
507    2 characters of pushback occur only when the first is an 'x'; in all
508    other cases only one character of pushback is needed.  */
509 static int phase2_pushback[2];
/* Number of characters currently held in phase2_pushback.  */
510 static int phase2_pushback_length;
511 
512 /* Return the next character, with backslashes removed.
513    The result is QUOTED(c) for some unsigned char c, if the next character
514    is escaped sufficiently often to make it a regular constituent character,
515    or simply an 'unsigned char' if it has its special meaning (of special,
516    whitespace or operator character), or OPENING_BACKQUOTE, CLOSING_BACKQUOTE,
517    EOF.
518    It's the caller's responsibility to update the state.
   This function only reads the quoting state (nested_backquotes,
   open_doublequotes_mask, open_doublequote, open_singlequote,
   open_singlequote_terminator); it never modifies it.
   Characters pushed back with phase2_ungetc () are returned first, exactly
   as they were pushed back.  */
519 static int
phase2_getc()520 phase2_getc ()
521 {
522   int c;
523 
  /* Pending pushback takes precedence over fresh input.  */
524   if (phase2_pushback_length)
525     {
526       c = phase2_pushback[--phase2_pushback_length];
527       if (c == '\n')
528 	++line_number;
529       return c;
530     }
531 
532   c = phase1_getc ();
533   if (c == EOF)
534     return c;
  /* A single quote is an ordinary character inside "..." and inside a
     single-quote construct whose terminator is not '\''; otherwise it
     opens or closes a single-quote.  */
535   if (c == '\'')
536     return ((open_doublequote
537 	     || (open_singlequote && open_singlequote_terminator != c))
538 	    ? QUOTED (c)
539 	    : c);
  /* Inside an open single-quote only its terminator is special; outside,
     '"', '$' and '`' keep their special meaning.  */
540   if (open_singlequote)
541     {
542       if (c == open_singlequote_terminator)
543 	return c;
544     }
545   else
546     {
547       if (c == '"' || c == '$')
548 	return c;
549       if (c == '`')
550 	return (nested_backquotes > 0 ? CLOSING_BACKQUOTE : OPENING_BACKQUOTE);
551     }
552   if (c == '\\')
553     {
554       /* Number of debackslashification passes that are active at the
555 	 current point.  */
556       unsigned int debackslahify =
557 	nested_backquotes + (open_singlequote ? 0 : 1);
558       /* Normal number of backslashes that yield a single backslash in the
559 	 final output.  */
560       unsigned int expected_count =
561 	(unsigned int) 1 << debackslahify;
562       /* Number of backslashes found.  */
563       unsigned int count;
564 
      /* Consume further backslashes, up to expected_count in total.  When
	 the loop is left through 'break', c is the first character that is
	 not a backslash.  */
565       for (count = 1; count < expected_count; count++)
566 	{
567 	  c = phase1_getc ();
568 	  if (c != '\\')
569 	    break;
570 	}
571       if (count == expected_count)
572 	return '\\';
573 
574       /* The count of backslashes is > 0 and < expected_count, therefore the
575 	 result depends on c, the first character after the backslashes.
576 	 Note: The formulas below don't necessarily have a logic; they were
577 	 empirically determined such that 1. the xgettext-30 test succeeds,
578 	 2. the behaviour for count == 0 would correspond to the one without
579 	 any backslash.  */
580       if (c == '\'')
581 	{
582 	  if (!open_singlequote && count > (expected_count >> 1))
583 	    {
584 	      phase1_ungetc (c);
585 	      return '\\';
586 	    }
587 	  else
588 	    return ((open_doublequote
589 		     || (open_singlequote && open_singlequote_terminator != c))
590 		    ? QUOTED (c)
591 		    : c);
592 	}
593       else if (c == '"')
594 	{
595 	  /* Each debackslashification pass converts \\ to \ and \" to ";
596 	     passes corresponding to `...` drop a lone " whereas passes
597 	     corresponding to "`...`" leave it alone.  Therefore, the
598 	     minimum number of backslashes needed to get one double-quote
599 	     in the end is  open_doublequotes_mask + 1.  */
600 	  if (open_singlequote)
601 	    {
602 	      if (count > open_doublequotes_mask)
603 		{
604 		  phase1_ungetc (c);
605 		  return '\\';
606 		}
607 	      else
608 		return (open_singlequote_terminator != c ? QUOTED (c) : c);
609 	    }
610 	  else
611 	    {
612 	      if (count > open_doublequotes_mask)
613 		return QUOTED (c);
614 	      else
615 	        /* Some of the count values <= open_doublequotes_mask are
616 		   actually invalid here, but we assume a syntactically
617 		   correct input file anyway.  */
618 		return c;
619 	    }
620 	}
621       else if (c == '`')
622 	{
623 	  /* FIXME: This code looks fishy.  */
624 	  if (count == expected_count - 1)
625 	    return c;
626 	  else
627 	    /* Some of the count values < expected_count - 1 are
628 	       actually invalid here, but we assume a syntactically
629 	       correct input file anyway.  */
630 	    if (nested_backquotes > 0 && !open_singlequote
631 		&& count >= (expected_count >> 2))
632 	      return OPENING_BACKQUOTE;
633 	    else
634 	      return CLOSING_BACKQUOTE;
635 	}
636       else if (c == '$')
637 	{
638 	  if (open_singlequote)
639 	    return QUOTED (c);
640 	  if (count >= (expected_count >> 1))
641 	    return QUOTED (c);
642 	  else
643 	    return c;
644 	}
645       else
646 	{
647 	  /* When not followed by a quoting character or backslash or dollar,
648 	     a backslash survives a debackslashification pass unmodified.
649 	     Therefore each debackslashification pass performs a
650 	       count := (count + 1) >> 1
651 	     operation.  Therefore the minimum number of backslashes needed
652 	     to get one backslash in the end is  (expected_count >> 1) + 1.  */
653 	  if (open_doublequote || open_singlequote)
654 	    {
655 	      if (count > 0)
656 		{
657 		  phase1_ungetc (c);
658 		  return '\\';
659 		}
660 	      else
661 		return QUOTED (c);
662 	    }
663 	  else
664 	    {
665 	      if (count > (expected_count >> 1))
666 		{
667 		  phase1_ungetc (c);
668 		  return '\\';
669 		}
670 	      else if (count > 0)
671 		return QUOTED (c);
672 	      else
673 		return c;
674 	    }
675 	}
676     }
677 
  /* Any other character is a regular constituent when it appears inside
     quotes, and keeps its own meaning otherwise.  */
678   return (open_singlequote || open_doublequote ? QUOTED (c) : c);
679 }
680 
681 /* Supports 2 characters of pushback.  */
682 static void
phase2_ungetc(int c)683 phase2_ungetc (int c)
684 {
685   switch (c)
686     {
687     case EOF:
688       break;
689 
690     case '\n':
691       --line_number;
692       /* FALLTHROUGH */
693 
694     default:
695       if (phase2_pushback_length == SIZEOF (phase2_pushback))
696 	abort ();
697       phase2_pushback[phase2_pushback_length++] = c;
698       break;
699     }
700 }
701 
702 
703 /* Context lookup table.  */
704 static flag_context_list_table_ty *flag_context_list_table;
705 
706 
707 /* Forward declaration of local functions.  */
/* read_word () calls read_command_list () for nested command lists
   (backquotes, $(...) and <(...) / >(...)), so it has to be declared ahead
   of read_word ().  */
708 static enum word_type read_command_list (int looking_for,
709 					 flag_context_ty outer_context);
710 
711 
712 
713 /* Read the next word.
714    'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
715    or '\0'.
   The result is stored in *WP.  Only for type t_string does WP own a token
   (allocated here, to be released with free_word ()) and carry a valid
   line_number_at_start; for every other type the token allocated here has
   already been freed when the function returns.
   'context' is handed on to nested command lists and to the message
   recorded for a $"..." string.
   Leading whitespace is skipped; '#' comments are collected through
   comment_start () / comment_add () / comment_line_end ().  */
716 static void
read_word(struct word * wp,int looking_for,flag_context_ty context)717 read_word (struct word *wp, int looking_for, flag_context_ty context)
718 {
719   int c;
720   bool all_unquoted_digits;
721 
  /* Skip whitespace and comments.  A newline ends the search at once and is
     reported as a command separator.  */
722   do
723     {
724       c = phase2_getc ();
725       if (c == '#')
726 	{
727 	  /* Skip a comment up to end of line.  */
728 	  last_comment_line = line_number;
729 	  comment_start ();
730 	  for (;;)
731 	    {
732 	      c = phase1_getc ();
733 	      if (c == EOF || c == '\n')
734 		break;
735 	      /* We skip all leading white space, but not EOLs.  */
736 	      if (!(buflen == 0 && (c == ' ' || c == '\t')))
737 		comment_add (c);
738 	    }
739 	  comment_line_end ();
740 	}
741       if (c == '\n')
742 	{
743 	  /* Comments assumed to be grouped with a message must immediately
744 	     precede it, with no non-whitespace token on a line between
745 	     both.  */
746 	  if (last_non_comment_line > last_comment_line)
747 	    savable_comment_reset ();
748 	  wp->type = t_separator;
749 	  return;
750 	}
751     }
752   while (is_whitespace (c));
753 
754   if (c == EOF)
755     {
756       wp->type = t_eof;
757       return;
758     }
759 
760   if (c == '<' || c == '>')
761     {
762       /* Recognize the redirection operators < > >| << <<- >> <> <& >&
763 	 But <( and >) are handled below, not here.  */
764       int c2 = phase2_getc ();
765       if (c2 != '(')
766 	{
767 	  if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&')
768 	    {
769 	      if (c == '<' && c2 == '<')
770 		{
771 		  int c3 = phase2_getc ();
772 		  if (c3 != '-')
773 		    phase2_ungetc (c3);
774 		}
775 	    }
776 	  else
777 	    phase2_ungetc (c2);
778 	  wp->type = t_redirect;
779 	  return;
780 	}
781       else
782 	phase2_ungetc (c2);
783     }
784 
  /* The terminator the caller is waiting for is reported as a pseudo word.  */
785   if (looking_for == CLOSING_BACKQUOTE && c == CLOSING_BACKQUOTE)
786     {
787       saw_closing_backquote ();
788       wp->type = t_backquote;
789       last_non_comment_line = line_number;
790       return;
791     }
792 
793   if (looking_for == ')' && c == ')')
794     {
795       wp->type = t_paren;
796       last_non_comment_line = line_number;
797       return;
798     }
799 
  /* Any other operator character forms a word of its own; only ';' counts
     as a command separator.  */
800   if (is_operator_start (c))
801     {
802       wp->type = (c == ';' ? t_separator : t_other);
803       return;
804     }
805 
  /* From here on a real word is read.  It starts out as a constant string
     and is downgraded to t_other as soon as a substitution is met.  */
806   wp->type = t_string;
807   wp->token = (struct token *) xmalloc (sizeof (struct token));
808   init_token (wp->token);
809   wp->line_number_at_start = line_number;
810   all_unquoted_digits = true;
811 
812   for (;; c = phase2_getc ())
813     {
814       if (c == EOF)
815 	break;
816 
817       if (all_unquoted_digits && (c == '<' || c == '>'))
818 	{
819 	  /* Recognize the redirection operators < > >| << <<- >> <> <& >&
820 	     prefixed with a nonempty sequence of unquoted digits.  */
821 	  int c2 = phase2_getc ();
822 	  if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&')
823 	    {
824 	      if (c == '<' && c2 == '<')
825 		{
826 		  int c3 = phase2_getc ();
827 		  if (c3 != '-')
828 		    phase2_ungetc (c3);
829 		}
830 	    }
831 	  else
832 	    phase2_ungetc (c2);
833 
834 	  wp->type = t_redirect;
835 	  free_token (wp->token);
836 	  free (wp->token);
837 
838 	  last_non_comment_line = line_number;
839 
840 	  return;
841 	}
842 
843       all_unquoted_digits = all_unquoted_digits && (c >= '0' && c <= '9');
844 
845       if (c == '$')
846 	{
847 	  int c2;
848 
849 	  /* An unquoted dollar indicates we are not inside '...'.  */
850 	  if (open_singlequote)
851 	    abort ();
852 	  /* After reading a dollar, we know that there is no pushed back
853 	     character from an earlier lookahead.  */
854 	  if (phase2_pushback_length > 0)
855 	    abort ();
856 	  /* Therefore we can use phase1 without interfering with phase2.
857 	     We need to recognize $( outside and inside double-quotes.
858 	     It would be incorrect to do
859 		c2 = phase2_getc ();
860 		if (c2 == '(' || c2 == QUOTED ('('))
861 	     because that would also trigger for $\(.  */
862 	  c2 = phase1_getc ();
863 	  if (c2 == '(')
864 	    {
865 	      bool saved_open_doublequote;
866 	      int c3;
867 
868 	      phase1_ungetc (c2);
869 
870 	      /* The entire inner command or arithmetic expression is read
871 		 ignoring possible surrounding double-quotes.  */
872 	      saved_open_doublequote = open_doublequote;
873 	      open_doublequote = false;
874 
875 	      c2 = phase2_getc ();
876 	      if (c2 != '(')
877 		abort ();
878 
879 	      c3 = phase2_getc ();
880 	      if (c3 == '(')
881 		{
882 		  /* Arithmetic expression (Bash syntax).  Skip until the
883 		     matching closing parenthesis.  */
884 		  unsigned int depth = 2;
885 
886 		  do
887 		    {
888 		      c = phase2_getc ();
889 		      if (c == '(')
890 			depth++;
891 		      else if (c == ')')
892 			if (--depth == 0)
893 			  break;
894 		    }
895 		  while (c != EOF);
896 		}
897 	      else
898 		{
899 		  /* Command substitution (Bash syntax).  */
900 		  phase2_ungetc (c3);
901 		  read_command_list (')', context);
902 		}
903 
904 	      open_doublequote = saved_open_doublequote;
905 	    }
906 	  else
907 	    {
908 	      phase1_ungetc (c2);
909 	      c2 = phase2_getc ();
910 
911 	      if (c2 == '\'' && !open_singlequote)
912 		{
913 		  /* Bash builtin for string with ANSI-C escape sequences.  */
914 		  saw_opening_singlequote ();
915 		  for (;;)
916 		    {
917 		      c = phase2_getc ();
918 		      if (c == EOF)
919 			break;
920 		      if (c == '\'')
921 			{
922 			  saw_closing_singlequote ();
923 			  break;
924 			}
925 		      if (c == '\\')
926 			{
927 			  c = phase2_getc ();
928 			  switch (c)
929 			    {
930 			    default:
			      /* Not a recognized escape: keep the backslash
				 and re-read the character after it.  */
931 			      phase2_ungetc (c);
932 			      c = '\\';
933 			      break;
934 
935 			    case '\\':
936 			      break;
937 			    case '\'':
938 			      /* Don't call saw_closing_singlequote ()
939 				 here.  */
940 			      break;
941 
942 			    case 'a':
943 			      c = '\a';
944 			      break;
945 			    case 'b':
946 			      c = '\b';
947 			      break;
948 			    case 'e':
949 			      c = 0x1b; /* ESC */
950 			      break;
951 			    case 'f':
952 			      c = '\f';
953 			      break;
954 			    case 'n':
955 			      c = '\n';
956 			      break;
957 			    case 'r':
958 			      c = '\r';
959 			      break;
960 			    case 't':
961 			      c = '\t';
962 			      break;
963 			    case 'v':
964 			      c = '\v';
965 			      break;
966 
			    /* \x followed by one or two hexadecimal digits.  */
967 			    case 'x':
968 			      c = phase2_getc ();
969 			      if ((c >= '0' && c <= '9')
970 				  || (c >= 'A' && c <= 'F')
971 				  || (c >= 'a' && c <= 'f'))
972 				{
973 				  int n;
974 
975 				  if (c >= '0' && c <= '9')
976 				    n = c - '0';
977 				  else if (c >= 'A' && c <= 'F')
978 				    n = 10 + c - 'A';
979 				  else if (c >= 'a' && c <= 'f')
980 				    n = 10 + c - 'a';
981 				  else
982 				    abort ();
983 
984 				  c = phase2_getc ();
985 				  if ((c >= '0' && c <= '9')
986 				      || (c >= 'A' && c <= 'F')
987 				      || (c >= 'a' && c <= 'f'))
988 				    {
989 				      if (c >= '0' && c <= '9')
990 					n = n * 16 + c - '0';
991 				      else if (c >= 'A' && c <= 'F')
992 					n = n * 16 + 10 + c - 'A';
993 				      else if (c >= 'a' && c <= 'f')
994 					n = n * 16 + 10 + c - 'a';
995 				      else
996 					abort ();
997 				    }
998 				  else
999 				    phase2_ungetc (c);
1000 
1001 				  c = n;
1002 				}
1003 			      else
1004 				{
				  /* No hex digit follows: this is the one
				     place that needs 2 characters of
				     pushback.  */
1005 				  phase2_ungetc (c);
1006 				  phase2_ungetc ('x');
1007 				  c = '\\';
1008 				}
1009 			      break;
1010 
			    /* Backslash followed by one to three octal
			       digits.  */
1011 			    case '0': case '1': case '2': case '3':
1012 			    case '4': case '5': case '6': case '7':
1013 			      {
1014 				int n = c - '0';
1015 
1016 				c = phase2_getc ();
1017 				if (c >= '0' && c <= '7')
1018 				  {
1019 				    n = n * 8 + c - '0';
1020 
1021 				    c = phase2_getc ();
1022 				    if (c >= '0' && c <= '7')
1023 				      n = n * 8 + c - '0';
1024 				    else
1025 				      phase2_ungetc (c);
1026 				  }
1027 				else
1028 				  phase2_ungetc (c);
1029 
1030 				c = n;
1031 			      }
1032 			      break;
1033 			    }
1034 			}
1035 		      if (wp->type == t_string)
1036 			{
1037 			  grow_token (wp->token);
1038 			  wp->token->chars[wp->token->charcount++] =
1039 			    (unsigned char) c;
1040 			}
1041 		    }
1042 		  /* The result is a literal string.  Don't change wp->type.  */
1043 		  continue;
1044 		}
1045 	      else if (c2 == '"' && !open_doublequote)
1046 		{
1047 		  /* Bash builtin for internationalized string.  */
1048 		  lex_pos_ty pos;
1049 		  struct token string;
1050 
		  /* The string is tracked as a single-quote construct whose
		     terminator is '"'.  */
1051 		  saw_opening_singlequote ();
1052 		  open_singlequote_terminator = '"';
1053 		  pos.file_name = logical_file_name;
1054 		  pos.line_number = line_number;
1055 		  init_token (&string);
1056 		  for (;;)
1057 		    {
1058 		      c = phase2_getc ();
1059 		      if (c == EOF)
1060 			break;
1061 		      if (c == '"')
1062 			{
1063 			  saw_closing_singlequote ();
1064 			  break;
1065 			}
1066 		      grow_token (&string);
1067 		      string.chars[string.charcount++] = (unsigned char) c;
1068 		    }
1069 		  remember_a_message (mlp, NULL, string_of_token (&string),
1070 				      context, &pos, savable_comment);
1071 		  free_token (&string);
1072 
1073 		  error_with_progname = false;
1074 		  error (0, 0, _("%s:%lu: warning: the syntax $\"...\" is deprecated due to security reasons; use eval_gettext instead"),
1075 			 pos.file_name, (unsigned long) pos.line_number);
1076 		  error_with_progname = true;
1077 
1078 		  /* The result at runtime is not constant. Therefore we
1079 		     change wp->type.  */
1080 		}
1081 	      else
1082 		phase2_ungetc (c2);
1083 	    }
1084 	  wp->type = t_other;
1085 	  continue;
1086 	}
1087 
1088       if (c == '\'')
1089 	{
1090 	  if (!open_singlequote)
1091 	    {
1092 	      /* Handle an opening single quote.  */
1093 	      saw_opening_singlequote ();
1094 	    }
1095 	  else
1096 	    {
1097 	      /* Handle a closing single quote.  */
1098 	      saw_closing_singlequote ();
1099 	    }
1100 	  continue;
1101 	}
1102 
1103       if (c == '"')
1104 	{
1105 	  if (open_singlequote && open_singlequote_terminator == '"')
1106 	    {
1107 	      /* Handle a closing i18n quote.  */
1108 	      saw_closing_singlequote ();
1109 	    }
1110 	  else if (!open_doublequote)
1111 	    {
1112 	      /* Handle an opening double quote.  */
1113 	      saw_opening_doublequote ();
1114 	    }
1115 	  else
1116 	    {
1117 	      /* Handle a closing double quote.  */
1118 	      saw_closing_doublequote ();
1119 	    }
1120 	  continue;
1121 	}
1122 
1123       if (c == OPENING_BACKQUOTE)
1124 	{
1125 	  /* Handle an opening backquote.  */
1126 	  saw_opening_backquote ();
1127 
1128 	  read_command_list (CLOSING_BACKQUOTE, context);
1129 
1130 	  wp->type = t_other;
1131 	  continue;
1132 	}
      /* A closing backquote ends the word; it is pushed back below for the
	 next call to see.  */
1133       if (c == CLOSING_BACKQUOTE)
1134 	break;
1135 
1136       if (c == '<' || c == '>')
1137 	{
1138 	  int c2;
1139 
1140 	  /* An unquoted c indicates we are not inside '...' nor "...".  */
1141 	  if (open_singlequote || open_doublequote)
1142 	    abort ();
1143 
1144 	  c2 = phase2_getc ();
1145 	  if (c2 == '(')
1146 	    {
1147 	      /* Process substitution (Bash syntax).  */
1148 	      read_command_list (')', context);
1149 
1150 	      wp->type = t_other;
1151 	      continue;
1152 	    }
1153 	  else
1154 	    phase2_ungetc (c2);
1155 	}
1156 
      /* Outside of quotes, whitespace or an operator character ends the
	 word.  */
1157       if (!open_singlequote && !open_doublequote
1158 	  && (is_whitespace (c) || is_operator_start (c)))
1159 	break;
1160 
      /* Characters are collected only as long as the word is still a
	 constant string.  */
1161       if (wp->type == t_string)
1162 	{
1163 	  grow_token (wp->token);
1164 	  wp->token->chars[wp->token->charcount++] = (unsigned char) c;
1165 	}
1166     }
1167 
  /* Give the terminating character back (a no-op for EOF).  */
1168   phase2_ungetc (c);
1169 
  /* Only a constant string keeps its token.  */
1170   if (wp->type != t_string)
1171     {
1172       free_token (wp->token);
1173       free (wp->token);
1174     }
1175   last_non_comment_line = line_number;
1176 }
1177 
1178 
1179 /* Read the next command.
1180    'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
1181    or '\0'.
1182    Returns the type of the word that terminated the command.  */
1183 static enum word_type
read_command(int looking_for,flag_context_ty outer_context)1184 read_command (int looking_for, flag_context_ty outer_context)
1185 {
1186   /* Read the words that make up the command.
1187      Here we completely ignore field splitting at whitespace and wildcard
1188      expansions; i.e. we assume that the source is written in such a way that
1189      every word in the program determines exactly one word in the resulting
1190      command.
1191      But we do not require that the 'gettext'/'ngettext' command is the
1192      first in the command; this is because 1. we want to allow for prefixes
1193      like "$verbose" that may expand to nothing, and 2. it's a big effort
1194      to know where a command starts in a $(for ...) or $(case ...) compound
1195      command.  */
1196   int arg = 0;			/* Current argument number.  */
1197   bool arg_of_redirect = false;	/* True right after a redirection operator.  */
1198   flag_context_list_iterator_ty context_iter;
1199   const struct callshapes *shapes = NULL;
1200   struct arglist_parser *argparser = NULL;
1201 
1202   for (;;)
1203     {
1204       struct word inner;
1205       flag_context_ty inner_context;
1206 
1207       if (arg == 0)
1208 	inner_context = null_context;
1209       else
1210 	inner_context =
1211 	  inherited_context (outer_context,
1212 			     flag_context_list_iterator_advance (
1213 			       &context_iter));
1214 
1215       read_word (&inner, looking_for, inner_context);
1216 
1217       /* Recognize end of command.  */
1218       if (inner.type == t_separator
1219 	  || inner.type == t_backquote || inner.type == t_paren
1220 	  || inner.type == t_eof)
1221 	{
1222 	  if (argparser != NULL)
1223 	    arglist_parser_done (argparser, arg);
1224 	  return inner.type;
1225 	}
1226 
1227       if (extract_all)
1228 	{
1229 	  if (inner.type == t_string)
1230 	    {
1231 	      lex_pos_ty pos;
1232 
1233 	      pos.file_name = logical_file_name;
1234 	      pos.line_number = inner.line_number_at_start;
1235 	      remember_a_message (mlp, NULL, string_of_word (&inner),
1236 				  inner_context, &pos, savable_comment);
1237 	    }
1238 	}
1239 
1240       if (arg_of_redirect)
1241 	{
1242 	  /* Ignore arguments of redirection operators.  */
1243 	  arg_of_redirect = false;
1244 	}
1245       else if (inner.type == t_redirect)
1246 	{
1247 	  /* Ignore this word and the following one.  */
1248 	  arg_of_redirect = true;
1249 	}
1250       else
1251 	{
1252 	  if (argparser == NULL)
1253 	    {
1254 	      /* This is the function position.  */
1255 	      arg = 0;
1256 	      if (inner.type == t_string)
1257 		{
1258 		  char *function_name = string_of_word (&inner);
1259 		  void *keyword_value;
1260 
1261 		  if (hash_find_entry (&keywords,
1262 				       function_name, strlen (function_name),
1263 				       &keyword_value)
1264 		      == 0)
1265 		    shapes = (const struct callshapes *) keyword_value;
1266 
1267 		  argparser = arglist_parser_alloc (mlp, shapes);
1268 
1269 		  context_iter =
1270 		    flag_context_list_iterator (
1271 		      flag_context_list_table_lookup (
1272 			flag_context_list_table,
1273 			function_name, strlen (function_name)));
1274 
1275 		  free (function_name);
1276 		}
1277 	      else
1278 		context_iter = null_context_list_iterator;
1279 	    }
1280 	  else
1281 	    {
1282 	      /* These are the argument positions.  */
1283 	      if (inner.type == t_string)
1284 		arglist_parser_remember (argparser, arg,
1285 					 string_of_word (&inner),
1286 					 inner_context,
1287 					 logical_file_name,
1288 					 inner.line_number_at_start,
1289 					 savable_comment);
1290 
1291 	      if (arglist_parser_decidedp (argparser, arg))
1292 		{
1293 		  /* Stop looking for arguments of the last function_name.  */
1294 		  /* FIXME: What about context_iter?  */
1295 		  arglist_parser_done (argparser, arg);
1296 		  shapes = NULL;
1297 		  argparser = NULL;
1298 		}
1299 	    }
1300 
1301 	  arg++;
1302 	}
1303 
1304       free_word (&inner);
1305     }
1306 }
1307 
1308 
1309 /* Read a list of commands.
1310    'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
1311    or '\0'.
1312    Returns the type of the word that terminated the command list.  */
1313 static enum word_type
read_command_list(int looking_for,flag_context_ty outer_context)1314 read_command_list (int looking_for, flag_context_ty outer_context)
1315 {
1316   for (;;)
1317     {
1318       enum word_type terminator;
1319 
1320       terminator = read_command (looking_for, outer_context);
1321       if (terminator != t_separator)
1322 	return terminator;
1323     }
1324 }
1325 
1326 
1327 void
extract_sh(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1328 extract_sh (FILE *f,
1329 	    const char *real_filename, const char *logical_filename,
1330 	    flag_context_list_table_ty *flag_table,
1331 	    msgdomain_list_ty *mdlp)
1332 {
1333   mlp = mdlp->item[0]->messages;
1334 
1335   fp = f;
1336   real_file_name = real_filename;
1337   logical_file_name = xstrdup (logical_filename);
1338   line_number = 1;
1339 
1340   last_comment_line = -1;
1341   last_non_comment_line = -1;
1342 
1343   nested_backquotes = 0;
1344   open_doublequotes_mask = 0;
1345   open_doublequote = false;
1346   open_singlequote = false;
1347 
1348   flag_context_list_table = flag_table;
1349 
1350   init_keywords ();
1351 
1352   /* Eat tokens until eof is seen.  */
1353   read_command_list ('\0', null_context);
1354 
1355   fp = NULL;
1356   real_file_name = NULL;
1357   logical_file_name = NULL;
1358   line_number = 0;
1359 }
1360