1 /* xgettext sh backend.
2 Copyright (C) 2003, 2005-2006 Free Software Foundation, Inc.
3 Written by Bruno Haible <bruno@clisp.org>, 2003.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option)
8 any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software Foundation,
17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
18
19 #ifdef HAVE_CONFIG_H
20 # include "config.h"
21 #endif
22
23 #include <errno.h>
24 #include <limits.h>
25 #include <stdbool.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29
30 #include "message.h"
31 #include "xgettext.h"
32 #include "x-sh.h"
33 #include "error.h"
34 #include "xalloc.h"
35 #include "exit.h"
36 #include "hash.h"
37 #include "gettext.h"
38
39 #define _(s) gettext(s)
40
41 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
42
43
44 /* The sh syntax is defined in POSIX:2001, see
45 http://www.opengroup.org/onlinepubs/007904975/utilities/xcu_chap02.html
46 Summary of sh syntax:
47 - Input is broken into words, which are then subject to
48 - tilde expansion ~...
49 - command substitution `...`
50 - variable substitution $var
51 - arithmetic substitution $((...))
52 - field splitting at whitespace (IFS)
53 - wildcard pattern expansion *?
54 - quote removal
55 - Strings are enclosed in "..."; command substitution, variable
56 substitution and arithmetic substitution are performed here as well.
57 - '...' is a string without substitutions.
58 - The list of resulting words is split into commands by semicolon and
59 newline.
60 - '#' at the beginning of a word introduces a comment until end of line.
61 The parser is implemented in bash-2.05b/parse.y. */
62
63
64 /* ====================== Keyword set customization. ====================== */
65
/* If true, read_command records every constant string word as a message,
   in addition to the keyword-driven extraction.  Set by x_sh_extract_all.  */
static bool extract_all = false;

/* Keyword table.  Filled through x_sh_keyword and consulted by read_command
   for each word in function position.  It is initialized lazily, on the
   first x_sh_keyword call with a non-NULL name.  */
static hash_table keywords;
/* True until the default keywords have been installed by init_keywords, or
   until they have been disabled by x_sh_keyword (NULL).  */
static bool default_keywords = true;
71
72
/* Request that all constant string words be extracted, not only the
   arguments selected by the keyword table.  */
void
x_sh_extract_all ()
{
  extract_all = true;
}
78
79
80 void
x_sh_keyword(const char * name)81 x_sh_keyword (const char *name)
82 {
83 if (name == NULL)
84 default_keywords = false;
85 else
86 {
87 const char *end;
88 struct callshape shape;
89 const char *colon;
90
91 if (keywords.table == NULL)
92 hash_init (&keywords, 100);
93
94 split_keywordspec (name, &end, &shape);
95
96 /* The characters between name and end should form a valid C identifier.
97 A colon means an invalid parse in split_keywordspec(). */
98 colon = strchr (name, ':');
99 if (colon == NULL || colon >= end)
100 insert_keyword_callshape (&keywords, name, end - name, &shape);
101 }
102 }
103
104 /* Finish initializing the keywords hash table.
105 Called after argument processing, before each file is processed. */
106 static void
init_keywords()107 init_keywords ()
108 {
109 if (default_keywords)
110 {
111 /* When adding new keywords here, also update the documentation in
112 xgettext.texi! */
113 x_sh_keyword ("gettext");
114 x_sh_keyword ("ngettext:1,2");
115 x_sh_keyword ("eval_gettext");
116 x_sh_keyword ("eval_ngettext:1,2");
117 default_keywords = false;
118 }
119 }
120
/* Record the format string flags of the arguments of the default
   keywords.  The flags are recorded in the order listed.  */
void
init_flag_table_sh ()
{
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-sh-format",
      "ngettext:1:pass-sh-format",
      "ngettext:2:pass-sh-format",
      "eval_gettext:1:sh-format",
      "eval_ngettext:1:sh-format",
      "eval_ngettext:2:sh-format"
    };
  size_t i;

  for (i = 0; i < SIZEOF (flag_specs); i++)
    xgettext_record_flag (flag_specs[i]);
}
131
132
133 /* ======================== Reading of characters. ======================== */
134
/* Real filename, used in error messages about the input file.
   Set by extract_sh for the duration of one file.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.
   line_number starts at 1 in extract_sh; it is incremented whenever a
   newline is fetched and decremented whenever a newline is pushed back, at
   every reader level (do_getc, phase1, phase2).  */
static char *logical_file_name;
static int line_number;

/* The input file stream.  Set by extract_sh for the duration of one file.  */
static FILE *fp;
144
145
146 /* Fetch the next character from the input file. */
147 static int
do_getc()148 do_getc ()
149 {
150 int c = getc (fp);
151
152 if (c == EOF)
153 {
154 if (ferror (fp))
155 error (EXIT_FAILURE, errno, _("\
156 error while reading \"%s\""), real_file_name);
157 }
158 else if (c == '\n')
159 line_number++;
160
161 return c;
162 }
163
/* Put back the last fetched character, not EOF.
   C is the character to return to the stream.  If it is a newline, the
   increment of line_number that do_getc made for it is undone.  */
static void
do_ungetc (int c)
{
  if (c == '\n')
    line_number--;
  ungetc (c, fp);
}
172
173
174 /* Remove backslash followed by newline from the input stream. */
175
176 static int phase1_pushback[1];
177 static int phase1_pushback_length;
178
179 static int
phase1_getc()180 phase1_getc ()
181 {
182 int c;
183
184 if (phase1_pushback_length)
185 {
186 c = phase1_pushback[--phase1_pushback_length];
187 if (c == '\n')
188 ++line_number;
189 return c;
190 }
191 for (;;)
192 {
193 c = do_getc ();
194 if (c != '\\')
195 return c;
196 c = do_getc ();
197 if (c != '\n')
198 {
199 if (c != EOF)
200 do_ungetc (c);
201 return '\\';
202 }
203 }
204 }
205
206 /* Supports only one pushback character. */
207 static void
phase1_ungetc(int c)208 phase1_ungetc (int c)
209 {
210 switch (c)
211 {
212 case EOF:
213 break;
214
215 case '\n':
216 --line_number;
217 /* FALLTHROUGH */
218
219 default:
220 if (phase1_pushback_length == SIZEOF (phase1_pushback))
221 abort ();
222 phase1_pushback[phase1_pushback_length++] = c;
223 break;
224 }
225 }
226
227
228 /* ========================== Reading of tokens. ========================== */
229
230
/* A token consists of a sequence of characters.
   The characters are kept in a heap-allocated array that is not
   NUL-terminated; use string_of_token to obtain a C string.  */
struct token
{
  int allocated;                /* number of allocated chars in 'chars' */
  int charcount;                /* number of used chars in 'chars' */
  char *chars;                  /* the token's constituents */
};
238
239 /* Initialize a 'struct token'. */
240 static inline void
init_token(struct token * tp)241 init_token (struct token *tp)
242 {
243 tp->allocated = 10;
244 tp->chars = (char *) xmalloc (tp->allocated * sizeof (char));
245 tp->charcount = 0;
246 }
247
/* Free the memory pointed to by a 'struct token'.
   Only the character array is released; the 'struct token' itself is not
   freed here.  */
static inline void
free_token (struct token *tp)
{
  free (tp->chars);
}
254
255 /* Ensure there is enough room in the token for one more character. */
256 static inline void
grow_token(struct token * tp)257 grow_token (struct token *tp)
258 {
259 if (tp->charcount == tp->allocated)
260 {
261 tp->allocated *= 2;
262 tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char));
263 }
264 }
265
266 /* Convert a struct token * to a char*. */
267 static char *
string_of_token(const struct token * tp)268 string_of_token (const struct token *tp)
269 {
270 char *str;
271 int n;
272
273 n = tp->charcount;
274 str = (char *) xmalloc (n + 1);
275 memcpy (str, tp->chars, n);
276 str[n] = '\0';
277 return str;
278 }
279
280
281 /* ========================= Accumulating messages ========================= */
282
283
/* The message list that receives the extracted messages.  Set by extract_sh
   from the first domain of the list it is given.  */
static message_list_ty *mlp;
285
286
287 /* ========================= Accumulating comments ========================= */
288
289
/* Accumulator for the text of the comment line being read.
   buffer holds bufmax allocated bytes, of which the first buflen are in
   use.  It is grown on demand by comment_add and comment_line_end.  */
static char *buffer;
static size_t bufmax;
static size_t buflen;
293
/* Start accumulating a new comment line: empty the accumulator.  The
   allocated buffer is kept for reuse.  */
static inline void
comment_start ()
{
  buflen = 0;
}
299
300 static inline void
comment_add(int c)301 comment_add (int c)
302 {
303 if (buflen >= bufmax)
304 {
305 bufmax = 2 * bufmax + 10;
306 buffer = xrealloc (buffer, bufmax);
307 }
308 buffer[buflen++] = c;
309 }
310
311 static inline void
comment_line_end()312 comment_line_end ()
313 {
314 while (buflen >= 1
315 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
316 --buflen;
317 if (buflen >= bufmax)
318 {
319 bufmax = 2 * bufmax + 10;
320 buffer = xrealloc (buffer, bufmax);
321 }
322 buffer[buflen] = '\0';
323 savable_comment_add (buffer);
324 }
325
326
/* These are for tracking whether comments count as immediately before
   keyword.
   last_comment_line is the line of the most recent '#' comment;
   last_non_comment_line is the line on which the most recent word ended.
   read_word compares the two at each newline and discards the saved
   comments when a non-comment word came after the last comment.  Both are
   reset to -1 by extract_sh.  */
static int last_comment_line;
static int last_non_comment_line;
331
332
333 /* ========================= Debackslashification ========================== */
334
/* This state tracks the effect of backquotes, double-quotes and single-quotes
   on the parsing of backslashes.  We make a single pass through the input
   file, keeping the state up to date.  This is much faster than accumulating
   strings and processing them with explicit debackslashification, like the
   shell does it.  */

/* The number of nested `...` or "`...`" constructs.  Assumed to be <= 32.
   NOTE(review): this value is used as a shift count in
   saw_opening_backquote and saw_closing_backquote, and this value plus one
   in phase2_getc, so with a 32-bit 'unsigned int' the safe limit looks
   lower than 32.  Nothing visible here enforces the limit.  */
static unsigned int nested_backquotes;

/* A bit mask indicating which of the currently open `...` or "`...`"
   constructs is with double-quotes: "`...`".
   A bit value of 1 stands for "`...`", a bit value of 0 stands for `...`.
   Bit position 0 designates the outermost backquotes nesting,
   bit position 1 the second-outermost backquotes nesting,
   ...
   bit position (nested_backquotes-1) the innermost backquotes nesting.  */
static unsigned int open_doublequotes_mask;

/* A bit indicating whether a double-quote is currently open inside the
   innermost backquotes nesting.  */
static bool open_doublequote;

/* A bit indicating whether a single-quote is currently open inside the
   innermost backquotes nesting.  */
static bool open_singlequote;

/* The expected terminator of the currently open single-quote.
   Usually '\'', but can be '"' for i18n-quotes.
   Only meaningful while open_singlequote is true.  */
static char open_singlequote_terminator;
364
365
366 /* Functions to update the state. */
367
368 static inline void
saw_opening_backquote()369 saw_opening_backquote ()
370 {
371 if (open_singlequote)
372 abort ();
373 if (open_doublequote)
374 open_doublequotes_mask |= (unsigned int) 1 << nested_backquotes;
375 nested_backquotes++;
376 open_doublequote = false;
377 }
378
379 static inline void
saw_closing_backquote()380 saw_closing_backquote ()
381 {
382 nested_backquotes--;
383 open_doublequote = (open_doublequotes_mask >> nested_backquotes) & 1;
384 open_doublequotes_mask &= ((unsigned int) 1 << nested_backquotes) - 1;
385 open_singlequote = false; /* just for safety */
386 }
387
388 static inline void
saw_opening_doublequote()389 saw_opening_doublequote ()
390 {
391 if (open_singlequote || open_doublequote)
392 abort ();
393 open_doublequote = true;
394 }
395
396 static inline void
saw_closing_doublequote()397 saw_closing_doublequote ()
398 {
399 if (open_singlequote || !open_doublequote)
400 abort ();
401 open_doublequote = false;
402 }
403
404 static inline void
saw_opening_singlequote()405 saw_opening_singlequote ()
406 {
407 if (open_doublequote || open_singlequote)
408 abort ();
409 open_singlequote = true;
410 open_singlequote_terminator = '\'';
411 }
412
413 static inline void
saw_closing_singlequote()414 saw_closing_singlequote ()
415 {
416 if (open_doublequote || !open_singlequote)
417 abort ();
418 open_singlequote = false;
419 }
420
421
422 /* ========================== Reading of commands ========================== */
423
/* We are only interested in constant strings.  Other words need not to be
   represented precisely.  */
enum word_type
{
  t_string,     /* constant string */
  t_other,      /* other string */
  t_separator,  /* command separator: semicolon or newline */
  t_redirect,   /* redirection: one of < > >| << <<- >> <> <& >& */
  t_backquote,  /* closing '`' pseudo word */
  t_paren,      /* closing ')' pseudo word */
  t_eof         /* EOF marker */
};

/* A word as returned by read_word.
   The token is heap-allocated and owned by the word; it is valid only when
   the type is t_string, and is released by free_word.  */
struct word
{
  enum word_type type;
  struct token *token;          /* for t_string */
  int line_number_at_start;     /* for t_string */
};
443
444 /* Free the memory pointed to by a 'struct word'. */
445 static inline void
free_word(struct word * wp)446 free_word (struct word *wp)
447 {
448 if (wp->type == t_string)
449 {
450 free_token (wp->token);
451 free (wp->token);
452 }
453 }
454
455 /* Convert a t_string token to a char*. */
456 static char *
string_of_word(const struct word * wp)457 string_of_word (const struct word *wp)
458 {
459 char *str;
460 int n;
461
462 if (!(wp->type == t_string))
463 abort ();
464 n = wp->token->charcount;
465 str = (char *) xmalloc (n + 1);
466 memcpy (str, wp->token->chars, n);
467 str[n] = '\0';
468 return str;
469 }
470
471
/* Whitespace recognition.
   Returns true if C is an unquoted space, tab or newline.  Quoted
   characters (values outside the 'unsigned char' range) and EOF never
   match.  */

static inline bool
is_whitespace (int c)
{
  switch (c)
    {
    case ' ':
    case '\t':
    case '\n':
      return true;

    default:
      return false;
    }
}
479
/* Operator character recognition.
   Returns true if C is an unquoted character that can start a shell
   operator: one of | & ; < > ( ).  Quoted characters (values outside the
   'unsigned char' range) and EOF never match.  */

static inline bool
is_operator_start (int c)
{
  switch (c)
    {
    case '|':
    case '&':
    case ';':
    case '<':
    case '>':
    case '(':
    case ')':
      return true;

    default:
      return false;
    }
}
488
489
/* Denotation of a quoted character.
   The distinction between quoted and unquoted character is important only for
   the special, whitespace and operator characters; it is irrelevant for
   alphanumeric characters, '\\' and many others.
   QUOTED (c) lies outside the 'unsigned char' range, so it never compares
   equal to a plain character constant; converting it back with a cast to
   'unsigned char' yields c again.  */
#define QUOTED(c) (UCHAR_MAX + 1 + (c))
/* Values in the 'unsigned char' range are implicitly unquoted.  Among these,
   the following are important:
     '"'         opening or closing double quote
     '\''        opening or closing single quote
     '$'         the unknown result of a dollar expansion
     '`'         does not occur - replaced with OPENING_BACKQUOTE or
                 CLOSING_BACKQUOTE
 */
#define OPENING_BACKQUOTE (2 * (UCHAR_MAX + 1) + '`')
#define CLOSING_BACKQUOTE (3 * (UCHAR_MAX + 1) + '`')

/* 2 characters of pushback are supported.
   2 characters of pushback occur only when the first is an 'x'; in all
   other cases only one character of pushback is needed.
   (The 'x' case is the incomplete \x escape in read_word, which pushes
   back the character after the 'x' and then the 'x' itself.)  */
static int phase2_pushback[2];
static int phase2_pushback_length;
511
/* Return the next character, with backslashes removed.
   The result is QUOTED(c) for some unsigned char c, if the next character
   is escaped sufficiently often to make it a regular constituent character,
   or simply an 'unsigned char' if it has its special meaning (of special,
   whitespace or operator character), or OPENING_BACKQUOTE, CLOSING_BACKQUOTE,
   EOF.
   The decision depends on the current quoting state (nested_backquotes,
   open_doublequotes_mask, open_doublequote, open_singlequote and
   open_singlequote_terminator), which this function only reads.
   It's the caller's responsibility to update the state.  */
static int
phase2_getc ()
{
  int c;

  /* Serve pushed-back characters first.  phase2_ungetc un-counted a
     pushed-back newline, so count it again here.  */
  if (phase2_pushback_length)
    {
      c = phase2_pushback[--phase2_pushback_length];
      if (c == '\n')
        ++line_number;
      return c;
    }

  c = phase1_getc ();
  if (c == EOF)
    return c;
  /* A single-quote is an ordinary character inside "..." and inside an
     i18n-quote (whose terminator is '"'); otherwise it is a delimiter.  */
  if (c == '\'')
    return ((open_doublequote
             || (open_singlequote && open_singlequote_terminator != c))
            ? QUOTED (c)
            : c);
  if (open_singlequote)
    {
      /* Inside single-quotes only the terminator is special.  */
      if (c == open_singlequote_terminator)
        return c;
    }
  else
    {
      if (c == '"' || c == '$')
        return c;
      /* A backquote closes the innermost open `...` if there is one,
         otherwise it opens a new one.  */
      if (c == '`')
        return (nested_backquotes > 0 ? CLOSING_BACKQUOTE : OPENING_BACKQUOTE);
    }
  if (c == '\\')
    {
      /* Number of debackslashification passes that are active at the
         current point.  */
      unsigned int debackslahify =
        nested_backquotes + (open_singlequote ? 0 : 1);
      /* Normal number of backslashes that yield a single backslash in the
         final output.  */
      unsigned int expected_count =
        (unsigned int) 1 << debackslahify;
      /* Number of backslashes found.  */
      unsigned int count;

      /* Read up to expected_count consecutive backslashes.  If the loop is
         left early, c is the first character after the backslashes.  */
      for (count = 1; count < expected_count; count++)
        {
          c = phase1_getc ();
          if (c != '\\')
            break;
        }
      if (count == expected_count)
        return '\\';

      /* The count of backslashes is > 0 and < expected_count, therefore the
         result depends on c, the first character after the backslashes.
         Note: The formulas below don't necessarily have a logic; they were
         empirically determined such that 1. the xgettext-30 test succeeds,
         2. the behaviour for count == 0 would correspond to the one without
         any backslash.  */
      if (c == '\'')
        {
          if (!open_singlequote && count > (expected_count >> 1))
            {
              phase1_ungetc (c);
              return '\\';
            }
          else
            return ((open_doublequote
                     || (open_singlequote && open_singlequote_terminator != c))
                    ? QUOTED (c)
                    : c);
        }
      else if (c == '"')
        {
          /* Each debackslashification pass converts \\ to \ and \" to ";
             passes corresponding to `...` drop a lone " whereas passes
             corresponding to "`...`" leave it alone.  Therefore, the
             minimum number of backslashes needed to get one double-quote
             in the end is open_doublequotes_mask + 1.  */
          if (open_singlequote)
            {
              if (count > open_doublequotes_mask)
                {
                  phase1_ungetc (c);
                  return '\\';
                }
              else
                return (open_singlequote_terminator != c ? QUOTED (c) : c);
            }
          else
            {
              if (count > open_doublequotes_mask)
                return QUOTED (c);
              else
                /* Some of the count values <= open_doublequotes_mask are
                   actually invalid here, but we assume a syntactically
                   correct input file anyway.  */
                return c;
            }
        }
      else if (c == '`')
        {
          /* FIXME: This code looks fishy.  */
          if (count == expected_count - 1)
            return c;
          else
            /* Some of the count values < expected_count - 1 are
               actually invalid here, but we assume a syntactically
               correct input file anyway.  */
            if (nested_backquotes > 0 && !open_singlequote
                && count >= (expected_count >> 2))
              return OPENING_BACKQUOTE;
            else
              return CLOSING_BACKQUOTE;
        }
      else if (c == '$')
        {
          if (open_singlequote)
            return QUOTED (c);
          if (count >= (expected_count >> 1))
            return QUOTED (c);
          else
            return c;
        }
      else
        {
          /* When not followed by a quoting character or backslash or dollar,
             a backslash survives a debackslashification pass unmodified.
             Therefore each debackslashification pass performs a
               count := (count + 1) >> 1
             operation.  Therefore the minimum number of backslashes needed
             to get one backslash in the end is (expected_count >> 1) + 1.  */
          if (open_doublequote || open_singlequote)
            {
              if (count > 0)
                {
                  phase1_ungetc (c);
                  return '\\';
                }
              else
                return QUOTED (c);
            }
          else
            {
              if (count > (expected_count >> 1))
                {
                  phase1_ungetc (c);
                  return '\\';
                }
              else if (count > 0)
                return QUOTED (c);
              else
                return c;
            }
        }
    }

  /* Any other character is a regular constituent inside quotes, and keeps
     its possible special meaning outside of quotes.  */
  return (open_singlequote || open_doublequote ? QUOTED (c) : c);
}
680
681 /* Supports 2 characters of pushback. */
682 static void
phase2_ungetc(int c)683 phase2_ungetc (int c)
684 {
685 switch (c)
686 {
687 case EOF:
688 break;
689
690 case '\n':
691 --line_number;
692 /* FALLTHROUGH */
693
694 default:
695 if (phase2_pushback_length == SIZEOF (phase2_pushback))
696 abort ();
697 phase2_pushback[phase2_pushback_length++] = c;
698 break;
699 }
700 }
701
702
/* Context lookup table.  Set by extract_sh from its flag_table argument;
   read_command looks up the flag contexts of each command name in it.  */
static flag_context_list_table_ty *flag_context_list_table;


/* Forward declaration of local functions.
   read_word and read_command_list call each other recursively (through
   read_command), so one of them has to be declared ahead.  */
static enum word_type read_command_list (int looking_for,
                                         flag_context_ty outer_context);
710
711
712
/* Read the next word.
   'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
   or '\0'.
   The result is stored in *WP.  Only when its type is t_string on return
   does WP own a token (to be released with free_word); in that case
   line_number_at_start is the line on which the word began.
   CONTEXT is the flag context that applies to this word; it is passed on to
   nested command lists and to messages remembered directly from here.
   Comments encountered before the word are accumulated as savable
   comments.  */
static void
read_word (struct word *wp, int looking_for, flag_context_ty context)
{
  int c;
  bool all_unquoted_digits;

  /* Skip whitespace and comments in front of the word.  An unquoted newline
     is itself returned as a command separator.  */
  do
    {
      c = phase2_getc ();
      if (c == '#')
        {
          /* Skip a comment up to end of line.  */
          last_comment_line = line_number;
          comment_start ();
          for (;;)
            {
              c = phase1_getc ();
              if (c == EOF || c == '\n')
                break;
              /* We skip all leading white space, but not EOLs.  */
              if (!(buflen == 0 && (c == ' ' || c == '\t')))
                comment_add (c);
            }
          comment_line_end ();
        }
      if (c == '\n')
        {
          /* Comments assumed to be grouped with a message must immediately
             precede it, with no non-whitespace token on a line between
             both.  */
          if (last_non_comment_line > last_comment_line)
            savable_comment_reset ();
          wp->type = t_separator;
          return;
        }
    }
  while (is_whitespace (c));

  if (c == EOF)
    {
      wp->type = t_eof;
      return;
    }

  if (c == '<' || c == '>')
    {
      /* Recognize the redirection operators < > >| << <<- >> <> <& >&
         But <( and >) are handled below, not here.  */
      int c2 = phase2_getc ();
      if (c2 != '(')
        {
          if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&')
            {
              /* Two-character operator; << may be followed by '-'.  */
              if (c == '<' && c2 == '<')
                {
                  int c3 = phase2_getc ();
                  if (c3 != '-')
                    phase2_ungetc (c3);
                }
            }
          else
            phase2_ungetc (c2);
          wp->type = t_redirect;
          return;
        }
      else
        phase2_ungetc (c2);
    }

  /* The terminator we are looking for ends the enclosing command list.  */
  if (looking_for == CLOSING_BACKQUOTE && c == CLOSING_BACKQUOTE)
    {
      saw_closing_backquote ();
      wp->type = t_backquote;
      last_non_comment_line = line_number;
      return;
    }

  if (looking_for == ')' && c == ')')
    {
      wp->type = t_paren;
      last_non_comment_line = line_number;
      return;
    }

  /* Any other operator character: a semicolon separates commands, the
     remaining ones are returned as an uninteresting word.  */
  if (is_operator_start (c))
    {
      wp->type = (c == ';' ? t_separator : t_other);
      return;
    }

  /* A real word starts here.  It is assumed to be a constant string until
     something is seen that makes its value non-constant; from then on
     wp->type is t_other and no more characters are collected.  */
  wp->type = t_string;
  wp->token = (struct token *) xmalloc (sizeof (struct token));
  init_token (wp->token);
  wp->line_number_at_start = line_number;
  all_unquoted_digits = true;

  for (;; c = phase2_getc ())
    {
      if (c == EOF)
        break;

      if (all_unquoted_digits && (c == '<' || c == '>'))
        {
          /* Recognize the redirection operators < > >| << <<- >> <> <& >&
             prefixed with a nonempty sequence of unquoted digits.  */
          int c2 = phase2_getc ();
          if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&')
            {
              if (c == '<' && c2 == '<')
                {
                  int c3 = phase2_getc ();
                  if (c3 != '-')
                    phase2_ungetc (c3);
                }
            }
          else
            phase2_ungetc (c2);

          /* The digits collected so far were a file descriptor number, not
             a string: discard them.  */
          wp->type = t_redirect;
          free_token (wp->token);
          free (wp->token);

          last_non_comment_line = line_number;

          return;
        }

      all_unquoted_digits = all_unquoted_digits && (c >= '0' && c <= '9');

      if (c == '$')
        {
          int c2;

          /* An unquoted dollar indicates we are not inside '...'.  */
          if (open_singlequote)
            abort ();
          /* After reading a dollar, we know that there is no pushed back
             character from an earlier lookahead.  */
          if (phase2_pushback_length > 0)
            abort ();
          /* Therefore we can use phase1 without interfering with phase2.
             We need to recognize $( outside and inside double-quotes.
             It would be incorrect to do
                c2 = phase2_getc ();
                if (c2 == '(' || c2 == QUOTED ('('))
             because that would also trigger for $\(.  */
          c2 = phase1_getc ();
          if (c2 == '(')
            {
              bool saved_open_doublequote;
              int c3;

              phase1_ungetc (c2);

              /* The entire inner command or arithmetic expression is read
                 ignoring possible surrounding double-quotes.  */
              saved_open_doublequote = open_doublequote;
              open_doublequote = false;

              /* With double-quotes switched off, phase2 must deliver the
                 parenthesis unquoted.  */
              c2 = phase2_getc ();
              if (c2 != '(')
                abort ();

              c3 = phase2_getc ();
              if (c3 == '(')
                {
                  /* Arithmetic expression (Bash syntax).  Skip until the
                     matching closing parenthesis.  */
                  unsigned int depth = 2;

                  do
                    {
                      c = phase2_getc ();
                      if (c == '(')
                        depth++;
                      else if (c == ')')
                        if (--depth == 0)
                          break;
                    }
                  while (c != EOF);
                }
              else
                {
                  /* Command substitution (Bash syntax).  */
                  phase2_ungetc (c3);
                  read_command_list (')', context);
                }

              open_doublequote = saved_open_doublequote;
            }
          else
            {
              phase1_ungetc (c2);
              c2 = phase2_getc ();

              if (c2 == '\'' && !open_singlequote)
                {
                  /* Bash builtin for string with ANSI-C escape sequences.  */
                  saw_opening_singlequote ();
                  for (;;)
                    {
                      c = phase2_getc ();
                      if (c == EOF)
                        break;
                      if (c == '\'')
                        {
                          saw_closing_singlequote ();
                          break;
                        }
                      if (c == '\\')
                        {
                          c = phase2_getc ();
                          switch (c)
                            {
                            default:
                              /* Not a known escape: keep the backslash and
                                 read the character again afterwards.  */
                              phase2_ungetc (c);
                              c = '\\';
                              break;

                            case '\\':
                              break;
                            case '\'':
                              /* Don't call saw_closing_singlequote ()
                                 here.  */
                              break;

                            case 'a':
                              c = '\a';
                              break;
                            case 'b':
                              c = '\b';
                              break;
                            case 'e':
                              c = 0x1b; /* ESC */
                              break;
                            case 'f':
                              c = '\f';
                              break;
                            case 'n':
                              c = '\n';
                              break;
                            case 'r':
                              c = '\r';
                              break;
                            case 't':
                              c = '\t';
                              break;
                            case 'v':
                              c = '\v';
                              break;

                            case 'x':
                              /* Hexadecimal escape: one or two digits.  */
                              c = phase2_getc ();
                              if ((c >= '0' && c <= '9')
                                  || (c >= 'A' && c <= 'F')
                                  || (c >= 'a' && c <= 'f'))
                                {
                                  int n;

                                  if (c >= '0' && c <= '9')
                                    n = c - '0';
                                  else if (c >= 'A' && c <= 'F')
                                    n = 10 + c - 'A';
                                  else if (c >= 'a' && c <= 'f')
                                    n = 10 + c - 'a';
                                  else
                                    abort ();

                                  c = phase2_getc ();
                                  if ((c >= '0' && c <= '9')
                                      || (c >= 'A' && c <= 'F')
                                      || (c >= 'a' && c <= 'f'))
                                    {
                                      if (c >= '0' && c <= '9')
                                        n = n * 16 + c - '0';
                                      else if (c >= 'A' && c <= 'F')
                                        n = n * 16 + 10 + c - 'A';
                                      else if (c >= 'a' && c <= 'f')
                                        n = n * 16 + 10 + c - 'a';
                                      else
                                        abort ();
                                    }
                                  else
                                    phase2_ungetc (c);

                                  c = n;
                                }
                              else
                                {
                                  /* No hex digit follows: keep the backslash
                                     and re-read the 'x' and the character
                                     after it.  This is the one place that
                                     needs 2 characters of pushback.  */
                                  phase2_ungetc (c);
                                  phase2_ungetc ('x');
                                  c = '\\';
                                }
                              break;

                            case '0': case '1': case '2': case '3':
                            case '4': case '5': case '6': case '7':
                              {
                                /* Octal escape: one to three digits.  */
                                int n = c - '0';

                                c = phase2_getc ();
                                if (c >= '0' && c <= '7')
                                  {
                                    n = n * 8 + c - '0';

                                    c = phase2_getc ();
                                    if (c >= '0' && c <= '7')
                                      n = n * 8 + c - '0';
                                    else
                                      phase2_ungetc (c);
                                  }
                                else
                                  phase2_ungetc (c);

                                c = n;
                              }
                              break;
                            }
                        }
                      if (wp->type == t_string)
                        {
                          grow_token (wp->token);
                          wp->token->chars[wp->token->charcount++] =
                            (unsigned char) c;
                        }
                    }
                  /* The result is a literal string.  Don't change wp->type.  */
                  continue;
                }
              else if (c2 == '"' && !open_doublequote)
                {
                  /* Bash builtin for internationalized string.  */
                  lex_pos_ty pos;
                  struct token string;

                  /* The i18n-quote is tracked as a single-quote whose
                     terminator is '"'.  */
                  saw_opening_singlequote ();
                  open_singlequote_terminator = '"';
                  pos.file_name = logical_file_name;
                  pos.line_number = line_number;
                  init_token (&string);
                  for (;;)
                    {
                      c = phase2_getc ();
                      if (c == EOF)
                        break;
                      if (c == '"')
                        {
                          saw_closing_singlequote ();
                          break;
                        }
                      grow_token (&string);
                      string.chars[string.charcount++] = (unsigned char) c;
                    }
                  /* The contents are a message regardless of keywords.  */
                  remember_a_message (mlp, NULL, string_of_token (&string),
                                      context, &pos, savable_comment);
                  free_token (&string);

                  error_with_progname = false;
                  error (0, 0, _("%s:%lu: warning: the syntax $\"...\" is deprecated due to security reasons; use eval_gettext instead"),
                         pos.file_name, (unsigned long) pos.line_number);
                  error_with_progname = true;

                  /* The result at runtime is not constant.  Therefore we
                     change wp->type.  */
                }
              else
                phase2_ungetc (c2);
            }
          /* Every dollar construct except $'...' makes the word
             non-constant.  */
          wp->type = t_other;
          continue;
        }

      if (c == '\'')
        {
          if (!open_singlequote)
            {
              /* Handle an opening single quote.  */
              saw_opening_singlequote ();
            }
          else
            {
              /* Handle a closing single quote.  */
              saw_closing_singlequote ();
            }
          continue;
        }

      if (c == '"')
        {
          if (open_singlequote && open_singlequote_terminator == '"')
            {
              /* Handle a closing i18n quote.  */
              saw_closing_singlequote ();
            }
          else if (!open_doublequote)
            {
              /* Handle an opening double quote.  */
              saw_opening_doublequote ();
            }
          else
            {
              /* Handle a closing double quote.  */
              saw_closing_doublequote ();
            }
          continue;
        }

      if (c == OPENING_BACKQUOTE)
        {
          /* Handle an opening backquote.  */
          saw_opening_backquote ();

          read_command_list (CLOSING_BACKQUOTE, context);

          wp->type = t_other;
          continue;
        }
      /* A closing backquote ends the word.  It is pushed back below and
         handled by the next read_word call.
         NOTE(review): when looking_for is not CLOSING_BACKQUOTE, the next
         call appears to start a new, empty word and stop here again; check
         whether malformed input can make the caller loop.  */
      if (c == CLOSING_BACKQUOTE)
        break;

      if (c == '<' || c == '>')
        {
          int c2;

          /* An unquoted c indicates we are not inside '...' nor "...".  */
          if (open_singlequote || open_doublequote)
            abort ();

          c2 = phase2_getc ();
          if (c2 == '(')
            {
              /* Process substitution (Bash syntax).  */
              read_command_list (')', context);

              wp->type = t_other;
              continue;
            }
          else
            phase2_ungetc (c2);
        }

      /* Outside of quotes, whitespace and operator characters end the
         word.  */
      if (!open_singlequote && !open_doublequote
          && (is_whitespace (c) || is_operator_start (c)))
        break;

      if (wp->type == t_string)
        {
          grow_token (wp->token);
          wp->token->chars[wp->token->charcount++] = (unsigned char) c;
        }
    }

  /* Leave the character that ended the word for the next call.  */
  phase2_ungetc (c);

  /* Only t_string words keep their token.  */
  if (wp->type != t_string)
    {
      free_token (wp->token);
      free (wp->token);
    }
  last_non_comment_line = line_number;
}
1177
1178
1179 /* Read the next command.
1180 'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
1181 or '\0'.
1182 Returns the type of the word that terminated the command. */
1183 static enum word_type
read_command(int looking_for,flag_context_ty outer_context)1184 read_command (int looking_for, flag_context_ty outer_context)
1185 {
1186 /* Read the words that make up the command.
1187 Here we completely ignore field splitting at whitespace and wildcard
1188 expansions; i.e. we assume that the source is written in such a way that
1189 every word in the program determines exactly one word in the resulting
1190 command.
1191 But we do not require that the 'gettext'/'ngettext' command is the
1192 first in the command; this is because 1. we want to allow for prefixes
1193 like "$verbose" that may expand to nothing, and 2. it's a big effort
1194 to know where a command starts in a $(for ...) or $(case ...) compound
1195 command. */
1196 int arg = 0; /* Current argument number. */
1197 bool arg_of_redirect = false; /* True right after a redirection operator. */
1198 flag_context_list_iterator_ty context_iter;
1199 const struct callshapes *shapes = NULL;
1200 struct arglist_parser *argparser = NULL;
1201
1202 for (;;)
1203 {
1204 struct word inner;
1205 flag_context_ty inner_context;
1206
1207 if (arg == 0)
1208 inner_context = null_context;
1209 else
1210 inner_context =
1211 inherited_context (outer_context,
1212 flag_context_list_iterator_advance (
1213 &context_iter));
1214
1215 read_word (&inner, looking_for, inner_context);
1216
1217 /* Recognize end of command. */
1218 if (inner.type == t_separator
1219 || inner.type == t_backquote || inner.type == t_paren
1220 || inner.type == t_eof)
1221 {
1222 if (argparser != NULL)
1223 arglist_parser_done (argparser, arg);
1224 return inner.type;
1225 }
1226
1227 if (extract_all)
1228 {
1229 if (inner.type == t_string)
1230 {
1231 lex_pos_ty pos;
1232
1233 pos.file_name = logical_file_name;
1234 pos.line_number = inner.line_number_at_start;
1235 remember_a_message (mlp, NULL, string_of_word (&inner),
1236 inner_context, &pos, savable_comment);
1237 }
1238 }
1239
1240 if (arg_of_redirect)
1241 {
1242 /* Ignore arguments of redirection operators. */
1243 arg_of_redirect = false;
1244 }
1245 else if (inner.type == t_redirect)
1246 {
1247 /* Ignore this word and the following one. */
1248 arg_of_redirect = true;
1249 }
1250 else
1251 {
1252 if (argparser == NULL)
1253 {
1254 /* This is the function position. */
1255 arg = 0;
1256 if (inner.type == t_string)
1257 {
1258 char *function_name = string_of_word (&inner);
1259 void *keyword_value;
1260
1261 if (hash_find_entry (&keywords,
1262 function_name, strlen (function_name),
1263 &keyword_value)
1264 == 0)
1265 shapes = (const struct callshapes *) keyword_value;
1266
1267 argparser = arglist_parser_alloc (mlp, shapes);
1268
1269 context_iter =
1270 flag_context_list_iterator (
1271 flag_context_list_table_lookup (
1272 flag_context_list_table,
1273 function_name, strlen (function_name)));
1274
1275 free (function_name);
1276 }
1277 else
1278 context_iter = null_context_list_iterator;
1279 }
1280 else
1281 {
1282 /* These are the argument positions. */
1283 if (inner.type == t_string)
1284 arglist_parser_remember (argparser, arg,
1285 string_of_word (&inner),
1286 inner_context,
1287 logical_file_name,
1288 inner.line_number_at_start,
1289 savable_comment);
1290
1291 if (arglist_parser_decidedp (argparser, arg))
1292 {
1293 /* Stop looking for arguments of the last function_name. */
1294 /* FIXME: What about context_iter? */
1295 arglist_parser_done (argparser, arg);
1296 shapes = NULL;
1297 argparser = NULL;
1298 }
1299 }
1300
1301 arg++;
1302 }
1303
1304 free_word (&inner);
1305 }
1306 }
1307
1308
1309 /* Read a list of commands.
1310 'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')'
1311 or '\0'.
1312 Returns the type of the word that terminated the command list. */
1313 static enum word_type
read_command_list(int looking_for,flag_context_ty outer_context)1314 read_command_list (int looking_for, flag_context_ty outer_context)
1315 {
1316 for (;;)
1317 {
1318 enum word_type terminator;
1319
1320 terminator = read_command (looking_for, outer_context);
1321 if (terminator != t_separator)
1322 return terminator;
1323 }
1324 }
1325
1326
1327 void
extract_sh(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1328 extract_sh (FILE *f,
1329 const char *real_filename, const char *logical_filename,
1330 flag_context_list_table_ty *flag_table,
1331 msgdomain_list_ty *mdlp)
1332 {
1333 mlp = mdlp->item[0]->messages;
1334
1335 fp = f;
1336 real_file_name = real_filename;
1337 logical_file_name = xstrdup (logical_filename);
1338 line_number = 1;
1339
1340 last_comment_line = -1;
1341 last_non_comment_line = -1;
1342
1343 nested_backquotes = 0;
1344 open_doublequotes_mask = 0;
1345 open_doublequote = false;
1346 open_singlequote = false;
1347
1348 flag_context_list_table = flag_table;
1349
1350 init_keywords ();
1351
1352 /* Eat tokens until eof is seen. */
1353 read_command_list ('\0', null_context);
1354
1355 fp = NULL;
1356 real_file_name = NULL;
1357 logical_file_name = NULL;
1358 line_number = 0;
1359 }
1360