1 /* xgettext sh backend. 2 Copyright (C) 2003, 2005-2006 Free Software Foundation, Inc. 3 Written by Bruno Haible <bruno@clisp.org>, 2003. 4 5 This program is free software; you can redistribute it and/or modify 6 it under the terms of the GNU General Public License as published by 7 the Free Software Foundation; either version 2, or (at your option) 8 any later version. 9 10 This program is distributed in the hope that it will be useful, 11 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 GNU General Public License for more details. 14 15 You should have received a copy of the GNU General Public License 16 along with this program; if not, write to the Free Software Foundation, 17 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 18 19 #ifdef HAVE_CONFIG_H 20 # include "config.h" 21 #endif 22 23 #include <errno.h> 24 #include <limits.h> 25 #include <stdbool.h> 26 #include <stdio.h> 27 #include <stdlib.h> 28 #include <string.h> 29 30 #include "message.h" 31 #include "xgettext.h" 32 #include "x-sh.h" 33 #include "error.h" 34 #include "xalloc.h" 35 #include "exit.h" 36 #include "hash.h" 37 #include "gettext.h" 38 39 #define _(s) gettext(s) 40 41 #define SIZEOF(a) (sizeof(a) / sizeof(a[0])) 42 43 44 /* The sh syntax is defined in POSIX:2001, see 45 http://www.opengroup.org/onlinepubs/007904975/utilities/xcu_chap02.html 46 Summary of sh syntax: 47 - Input is broken into words, which are then subject to 48 - tilde expansion ~... 49 - command substitution `...` 50 - variable substitution $var 51 - arithmetic substitution $((...)) 52 - field splitting at whitespace (IFS) 53 - wildcard pattern expansion *? 54 - quote removal 55 - Strings are enclosed in "..."; command substitution, variable 56 substitution and arithmetic substitution are performed here as well. 57 - '...' is a string without substitutions. 
58 - The list of resulting words is split into commands by semicolon and 59 newline. 60 - '#' at the beginning of a word introduces a comment until end of line. 61 The parser is implemented in bash-2.05b/parse.y. */ 62 63 64 /* ====================== Keyword set customization. ====================== */ 65 66 /* If true extract all strings. */ 67 static bool extract_all = false; 68 69 static hash_table keywords; 70 static bool default_keywords = true; 71 72 73 void 74 x_sh_extract_all () 75 { 76 extract_all = true; 77 } 78 79 80 void 81 x_sh_keyword (const char *name) 82 { 83 if (name == NULL) 84 default_keywords = false; 85 else 86 { 87 const char *end; 88 struct callshape shape; 89 const char *colon; 90 91 if (keywords.table == NULL) 92 hash_init (&keywords, 100); 93 94 split_keywordspec (name, &end, &shape); 95 96 /* The characters between name and end should form a valid C identifier. 97 A colon means an invalid parse in split_keywordspec(). */ 98 colon = strchr (name, ':'); 99 if (colon == NULL || colon >= end) 100 insert_keyword_callshape (&keywords, name, end - name, &shape); 101 } 102 } 103 104 /* Finish initializing the keywords hash table. 105 Called after argument processing, before each file is processed. */ 106 static void 107 init_keywords () 108 { 109 if (default_keywords) 110 { 111 /* When adding new keywords here, also update the documentation in 112 xgettext.texi! 
*/ 113 x_sh_keyword ("gettext"); 114 x_sh_keyword ("ngettext:1,2"); 115 x_sh_keyword ("eval_gettext"); 116 x_sh_keyword ("eval_ngettext:1,2"); 117 default_keywords = false; 118 } 119 } 120 121 void 122 init_flag_table_sh () 123 { 124 xgettext_record_flag ("gettext:1:pass-sh-format"); 125 xgettext_record_flag ("ngettext:1:pass-sh-format"); 126 xgettext_record_flag ("ngettext:2:pass-sh-format"); 127 xgettext_record_flag ("eval_gettext:1:sh-format"); 128 xgettext_record_flag ("eval_ngettext:1:sh-format"); 129 xgettext_record_flag ("eval_ngettext:2:sh-format"); 130 } 131 132 133 /* ======================== Reading of characters. ======================== */ 134 135 /* Real filename, used in error messages about the input file. */ 136 static const char *real_file_name; 137 138 /* Logical filename and line number, used to label the extracted messages. */ 139 static char *logical_file_name; 140 static int line_number; 141 142 /* The input file stream. */ 143 static FILE *fp; 144 145 146 /* Fetch the next character from the input file. */ 147 static int 148 do_getc () 149 { 150 int c = getc (fp); 151 152 if (c == EOF) 153 { 154 if (ferror (fp)) 155 error (EXIT_FAILURE, errno, _("\ 156 error while reading \"%s\""), real_file_name); 157 } 158 else if (c == '\n') 159 line_number++; 160 161 return c; 162 } 163 164 /* Put back the last fetched character, not EOF. */ 165 static void 166 do_ungetc (int c) 167 { 168 if (c == '\n') 169 line_number--; 170 ungetc (c, fp); 171 } 172 173 174 /* Remove backslash followed by newline from the input stream. 
*/ 175 176 static int phase1_pushback[1]; 177 static int phase1_pushback_length; 178 179 static int 180 phase1_getc () 181 { 182 int c; 183 184 if (phase1_pushback_length) 185 { 186 c = phase1_pushback[--phase1_pushback_length]; 187 if (c == '\n') 188 ++line_number; 189 return c; 190 } 191 for (;;) 192 { 193 c = do_getc (); 194 if (c != '\\') 195 return c; 196 c = do_getc (); 197 if (c != '\n') 198 { 199 if (c != EOF) 200 do_ungetc (c); 201 return '\\'; 202 } 203 } 204 } 205 206 /* Supports only one pushback character. */ 207 static void 208 phase1_ungetc (int c) 209 { 210 switch (c) 211 { 212 case EOF: 213 break; 214 215 case '\n': 216 --line_number; 217 /* FALLTHROUGH */ 218 219 default: 220 if (phase1_pushback_length == SIZEOF (phase1_pushback)) 221 abort (); 222 phase1_pushback[phase1_pushback_length++] = c; 223 break; 224 } 225 } 226 227 228 /* ========================== Reading of tokens. ========================== */ 229 230 231 /* A token consists of a sequence of characters. */ 232 struct token 233 { 234 int allocated; /* number of allocated 'token_char's */ 235 int charcount; /* number of used 'token_char's */ 236 char *chars; /* the token's constituents */ 237 }; 238 239 /* Initialize a 'struct token'. */ 240 static inline void 241 init_token (struct token *tp) 242 { 243 tp->allocated = 10; 244 tp->chars = (char *) xmalloc (tp->allocated * sizeof (char)); 245 tp->charcount = 0; 246 } 247 248 /* Free the memory pointed to by a 'struct token'. */ 249 static inline void 250 free_token (struct token *tp) 251 { 252 free (tp->chars); 253 } 254 255 /* Ensure there is enough room in the token for one more character. */ 256 static inline void 257 grow_token (struct token *tp) 258 { 259 if (tp->charcount == tp->allocated) 260 { 261 tp->allocated *= 2; 262 tp->chars = (char *) xrealloc (tp->chars, tp->allocated * sizeof (char)); 263 } 264 } 265 266 /* Convert a struct token * to a char*. 
*/ 267 static char * 268 string_of_token (const struct token *tp) 269 { 270 char *str; 271 int n; 272 273 n = tp->charcount; 274 str = (char *) xmalloc (n + 1); 275 memcpy (str, tp->chars, n); 276 str[n] = '\0'; 277 return str; 278 } 279 280 281 /* ========================= Accumulating messages ========================= */ 282 283 284 static message_list_ty *mlp; 285 286 287 /* ========================= Accumulating comments ========================= */ 288 289 290 static char *buffer; 291 static size_t bufmax; 292 static size_t buflen; 293 294 static inline void 295 comment_start () 296 { 297 buflen = 0; 298 } 299 300 static inline void 301 comment_add (int c) 302 { 303 if (buflen >= bufmax) 304 { 305 bufmax = 2 * bufmax + 10; 306 buffer = xrealloc (buffer, bufmax); 307 } 308 buffer[buflen++] = c; 309 } 310 311 static inline void 312 comment_line_end () 313 { 314 while (buflen >= 1 315 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) 316 --buflen; 317 if (buflen >= bufmax) 318 { 319 bufmax = 2 * bufmax + 10; 320 buffer = xrealloc (buffer, bufmax); 321 } 322 buffer[buflen] = '\0'; 323 savable_comment_add (buffer); 324 } 325 326 327 /* These are for tracking whether comments count as immediately before 328 keyword. */ 329 static int last_comment_line; 330 static int last_non_comment_line; 331 332 333 /* ========================= Debackslashification ========================== */ 334 335 /* This state tracks the effect of backquotes, double-quotes and single-quotes 336 on the parsing of backslashes. We make a single pass through the input 337 file, keeping the state up to date. This is much faster than accumulating 338 strings and processing them with explicit debackslashification, like the 339 shell does it. */ 340 341 /* The number of nested `...` or "`...`" constructs. Assumed to be <= 32. 
/* The number of nested `...` or "`...`" constructs.  Assumed to be <= 32.
   Incremented by saw_opening_backquote, decremented by
   saw_closing_backquote.
   The value is used as a shift count on an 'unsigned int'
   (1 << nested_backquotes in saw_opening_backquote, and
   1 << (nested_backquotes + 1) in phase2_getc outside single-quotes), so
   those shifts are only well-defined while the count stays below the width
   of 'unsigned int'.  */
static unsigned int nested_backquotes;

/* A bit mask indicating which of the currently open `...` or "`...`"
   constructs is with double-quotes: "`...`".
   A bit value of 1 stands for "`...`", a bit value of 0 stands for `...`.
   Bit position 0 designates the outermost backquotes nesting,
   bit position 1 the second-outermost backquotes nesting,
   ...
   bit position (nested_backquotes-1) the innermost backquotes nesting.  */
static unsigned int open_doublequotes_mask;

/* A bit indicating whether a double-quote is currently open inside the
   innermost backquotes nesting.  */
static bool open_doublequote;

/* A bit indicating whether a single-quote is currently open inside the
   innermost backquotes nesting.  */
static bool open_singlequote;

/* The expected terminator of the currently open single-quote.
   Usually '\'', but can be '"' for i18n-quotes ($"...").
   Only meaningful while open_singlequote is true.  */
static char open_singlequote_terminator;
*/ 367 368 static inline void 369 saw_opening_backquote () 370 { 371 if (open_singlequote) 372 abort (); 373 if (open_doublequote) 374 open_doublequotes_mask |= (unsigned int) 1 << nested_backquotes; 375 nested_backquotes++; 376 open_doublequote = false; 377 } 378 379 static inline void 380 saw_closing_backquote () 381 { 382 nested_backquotes--; 383 open_doublequote = (open_doublequotes_mask >> nested_backquotes) & 1; 384 open_doublequotes_mask &= ((unsigned int) 1 << nested_backquotes) - 1; 385 open_singlequote = false; /* just for safety */ 386 } 387 388 static inline void 389 saw_opening_doublequote () 390 { 391 if (open_singlequote || open_doublequote) 392 abort (); 393 open_doublequote = true; 394 } 395 396 static inline void 397 saw_closing_doublequote () 398 { 399 if (open_singlequote || !open_doublequote) 400 abort (); 401 open_doublequote = false; 402 } 403 404 static inline void 405 saw_opening_singlequote () 406 { 407 if (open_doublequote || open_singlequote) 408 abort (); 409 open_singlequote = true; 410 open_singlequote_terminator = '\''; 411 } 412 413 static inline void 414 saw_closing_singlequote () 415 { 416 if (open_doublequote || !open_singlequote) 417 abort (); 418 open_singlequote = false; 419 } 420 421 422 /* ========================== Reading of commands ========================== */ 423 424 /* We are only interested in constant strings. Other words need not to be 425 represented precisely. */ 426 enum word_type 427 { 428 t_string, /* constant string */ 429 t_other, /* other string */ 430 t_separator, /* command separator: semicolon or newline */ 431 t_redirect, /* redirection: one of < > >| << <<- >> <> <& >& */ 432 t_backquote, /* closing '`' pseudo word */ 433 t_paren, /* closing ')' pseudo word */ 434 t_eof /* EOF marker */ 435 }; 436 437 struct word 438 { 439 enum word_type type; 440 struct token *token; /* for t_string */ 441 int line_number_at_start; /* for t_string */ 442 }; 443 444 /* Free the memory pointed to by a 'struct word'. 
*/ 445 static inline void 446 free_word (struct word *wp) 447 { 448 if (wp->type == t_string) 449 { 450 free_token (wp->token); 451 free (wp->token); 452 } 453 } 454 455 /* Convert a t_string token to a char*. */ 456 static char * 457 string_of_word (const struct word *wp) 458 { 459 char *str; 460 int n; 461 462 if (!(wp->type == t_string)) 463 abort (); 464 n = wp->token->charcount; 465 str = (char *) xmalloc (n + 1); 466 memcpy (str, wp->token->chars, n); 467 str[n] = '\0'; 468 return str; 469 } 470 471 472 /* Whitespace recognition. */ 473 474 static inline bool 475 is_whitespace (int c) 476 { 477 return (c == ' ' || c == '\t' || c == '\n'); 478 } 479 480 /* Operator character recognition. */ 481 482 static inline bool 483 is_operator_start (int c) 484 { 485 return (c == '|' || c == '&' || c == ';' || c == '<' || c == '>' 486 || c == '(' || c == ')'); 487 } 488 489 490 /* Denotation of a quoted character. 491 The distinction between quoted and unquoted character is important only for 492 the special, whitespace and operator characters; it is irrelevant for 493 alphanumeric characters, '\\' and many others. */ 494 #define QUOTED(c) (UCHAR_MAX + 1 + (c)) 495 /* Values in the 'unsigned char' range are implicitly unquoted. Among these, 496 the following are important: 497 '"' opening or closing double quote 498 '\'' opening or closing single quote 499 '$' the unknown result of a dollar expansion 500 '`' does not occur - replaced with OPENING_BACKQUOTE or 501 CLOSING_BACKQUOTE 502 */ 503 #define OPENING_BACKQUOTE (2 * (UCHAR_MAX + 1) + '`') 504 #define CLOSING_BACKQUOTE (3 * (UCHAR_MAX + 1) + '`') 505 506 /* 2 characters of pushback are supported. 507 2 characters of pushback occur only when the first is an 'x'; in all 508 other cases only one character of pushback is needed. */ 509 static int phase2_pushback[2]; 510 static int phase2_pushback_length; 511 512 /* Return the next character, with backslashes removed. 
513 The result is QUOTED(c) for some unsigned char c, if the next character 514 is escaped sufficiently often to make it a regular constituent character, 515 or simply an 'unsigned char' if it has its special meaning (of special, 516 whitespace or operator charcter), or OPENING_BACKQUOTE, CLOSING_BACKQUOTE, 517 EOF. 518 It's the caller's responsibility to update the state. */ 519 static int 520 phase2_getc () 521 { 522 int c; 523 524 if (phase2_pushback_length) 525 { 526 c = phase2_pushback[--phase2_pushback_length]; 527 if (c == '\n') 528 ++line_number; 529 return c; 530 } 531 532 c = phase1_getc (); 533 if (c == EOF) 534 return c; 535 if (c == '\'') 536 return ((open_doublequote 537 || (open_singlequote && open_singlequote_terminator != c)) 538 ? QUOTED (c) 539 : c); 540 if (open_singlequote) 541 { 542 if (c == open_singlequote_terminator) 543 return c; 544 } 545 else 546 { 547 if (c == '"' || c == '$') 548 return c; 549 if (c == '`') 550 return (nested_backquotes > 0 ? CLOSING_BACKQUOTE : OPENING_BACKQUOTE); 551 } 552 if (c == '\\') 553 { 554 /* Number of debackslahificication passes that are active at the 555 current point. */ 556 unsigned int debackslahify = 557 nested_backquotes + (open_singlequote ? 0 : 1); 558 /* Normal number of backslashes that yield a single backslash in the 559 final output. */ 560 unsigned int expected_count = 561 (unsigned int) 1 << debackslahify; 562 /* Number of backslashes found. */ 563 unsigned int count; 564 565 for (count = 1; count < expected_count; count++) 566 { 567 c = phase1_getc (); 568 if (c != '\\') 569 break; 570 } 571 if (count == expected_count) 572 return '\\'; 573 574 /* The count of backslashes is > 0 and < expected_count, therefore the 575 result depends on c, the first character after the backslashes. 576 Note: The formulas below don't necessarily have a logic; they were 577 empirically determined such that 1. the xgettext-30 test succeeds, 578 2. 
the behaviour for count == 0 would correspond to the one without 579 any baskslash. */ 580 if (c == '\'') 581 { 582 if (!open_singlequote && count > (expected_count >> 1)) 583 { 584 phase1_ungetc (c); 585 return '\\'; 586 } 587 else 588 return ((open_doublequote 589 || (open_singlequote && open_singlequote_terminator != c)) 590 ? QUOTED (c) 591 : c); 592 } 593 else if (c == '"') 594 { 595 /* Each debackslahificication pass converts \\ to \ and \" to "; 596 passes corresponding to `...` drop a lone " whereas passes 597 corresponding to "`...`" leave it alone. Therefore, the 598 minimum number of backslashes needed to get one double-quote 599 in the end is open_doublequotes_mask + 1. */ 600 if (open_singlequote) 601 { 602 if (count > open_doublequotes_mask) 603 { 604 phase1_ungetc (c); 605 return '\\'; 606 } 607 else 608 return (open_singlequote_terminator != c ? QUOTED (c) : c); 609 } 610 else 611 { 612 if (count > open_doublequotes_mask) 613 return QUOTED (c); 614 else 615 /* Some of the count values <= open_doublequotes_mask are 616 actually invalid here, but we assume a syntactically 617 correct input file anyway. */ 618 return c; 619 } 620 } 621 else if (c == '`') 622 { 623 /* FIXME: This code looks fishy. */ 624 if (count == expected_count - 1) 625 return c; 626 else 627 /* Some of the count values < expected_count - 1 are 628 actually invalid here, but we assume a syntactically 629 correct input file anyway. */ 630 if (nested_backquotes > 0 && !open_singlequote 631 && count >= (expected_count >> 2)) 632 return OPENING_BACKQUOTE; 633 else 634 return CLOSING_BACKQUOTE; 635 } 636 else if (c == '$') 637 { 638 if (open_singlequote) 639 return QUOTED (c); 640 if (count >= (expected_count >> 1)) 641 return QUOTED (c); 642 else 643 return c; 644 } 645 else 646 { 647 /* When not followed by a quoting character or backslash or dollar, 648 a backslash survives a debackslahificication pass unmodified. 
649 Therefore each debackslahificication pass performs a 650 count := (count + 1) >> 1 651 operation. Therefore the minimum number of backslashes needed 652 to get one backslash in the end is (expected_count >> 1) + 1. */ 653 if (open_doublequote || open_singlequote) 654 { 655 if (count > 0) 656 { 657 phase1_ungetc (c); 658 return '\\'; 659 } 660 else 661 return QUOTED (c); 662 } 663 else 664 { 665 if (count > (expected_count >> 1)) 666 { 667 phase1_ungetc (c); 668 return '\\'; 669 } 670 else if (count > 0) 671 return QUOTED (c); 672 else 673 return c; 674 } 675 } 676 } 677 678 return (open_singlequote || open_doublequote ? QUOTED (c) : c); 679 } 680 681 /* Supports 2 characters of pushback. */ 682 static void 683 phase2_ungetc (int c) 684 { 685 switch (c) 686 { 687 case EOF: 688 break; 689 690 case '\n': 691 --line_number; 692 /* FALLTHROUGH */ 693 694 default: 695 if (phase2_pushback_length == SIZEOF (phase2_pushback)) 696 abort (); 697 phase2_pushback[phase2_pushback_length++] = c; 698 break; 699 } 700 } 701 702 703 /* Context lookup table. */ 704 static flag_context_list_table_ty *flag_context_list_table; 705 706 707 /* Forward declaration of local functions. */ 708 static enum word_type read_command_list (int looking_for, 709 flag_context_ty outer_context); 710 711 712 713 /* Read the next word. 714 'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')' 715 or '\0'. */ 716 static void 717 read_word (struct word *wp, int looking_for, flag_context_ty context) 718 { 719 int c; 720 bool all_unquoted_digits; 721 722 do 723 { 724 c = phase2_getc (); 725 if (c == '#') 726 { 727 /* Skip a comment up to end of line. */ 728 last_comment_line = line_number; 729 comment_start (); 730 for (;;) 731 { 732 c = phase1_getc (); 733 if (c == EOF || c == '\n') 734 break; 735 /* We skip all leading white space, but not EOLs. 
*/ 736 if (!(buflen == 0 && (c == ' ' || c == '\t'))) 737 comment_add (c); 738 } 739 comment_line_end (); 740 } 741 if (c == '\n') 742 { 743 /* Comments assumed to be grouped with a message must immediately 744 precede it, with no non-whitespace token on a line between 745 both. */ 746 if (last_non_comment_line > last_comment_line) 747 savable_comment_reset (); 748 wp->type = t_separator; 749 return; 750 } 751 } 752 while (is_whitespace (c)); 753 754 if (c == EOF) 755 { 756 wp->type = t_eof; 757 return; 758 } 759 760 if (c == '<' || c == '>') 761 { 762 /* Recognize the redirection operators < > >| << <<- >> <> <& >& 763 But <( and >) are handled below, not here. */ 764 int c2 = phase2_getc (); 765 if (c2 != '(') 766 { 767 if ((c == '<' ? c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&') 768 { 769 if (c == '<' && c2 == '<') 770 { 771 int c3 = phase2_getc (); 772 if (c3 != '-') 773 phase2_ungetc (c3); 774 } 775 } 776 else 777 phase2_ungetc (c2); 778 wp->type = t_redirect; 779 return; 780 } 781 else 782 phase2_ungetc (c2); 783 } 784 785 if (looking_for == CLOSING_BACKQUOTE && c == CLOSING_BACKQUOTE) 786 { 787 saw_closing_backquote (); 788 wp->type = t_backquote; 789 last_non_comment_line = line_number; 790 return; 791 } 792 793 if (looking_for == ')' && c == ')') 794 { 795 wp->type = t_paren; 796 last_non_comment_line = line_number; 797 return; 798 } 799 800 if (is_operator_start (c)) 801 { 802 wp->type = (c == ';' ? t_separator : t_other); 803 return; 804 } 805 806 wp->type = t_string; 807 wp->token = (struct token *) xmalloc (sizeof (struct token)); 808 init_token (wp->token); 809 wp->line_number_at_start = line_number; 810 all_unquoted_digits = true; 811 812 for (;; c = phase2_getc ()) 813 { 814 if (c == EOF) 815 break; 816 817 if (all_unquoted_digits && (c == '<' || c == '>')) 818 { 819 /* Recognize the redirection operators < > >| << <<- >> <> <& >& 820 prefixed with a nonempty sequence of unquoted digits. */ 821 int c2 = phase2_getc (); 822 if ((c == '<' ? 
c2 == '<' : c2 == '|') || c2 == '>' || c2 == '&') 823 { 824 if (c == '<' && c2 == '<') 825 { 826 int c3 = phase2_getc (); 827 if (c3 != '-') 828 phase2_ungetc (c3); 829 } 830 } 831 else 832 phase2_ungetc (c2); 833 834 wp->type = t_redirect; 835 free_token (wp->token); 836 free (wp->token); 837 838 last_non_comment_line = line_number; 839 840 return; 841 } 842 843 all_unquoted_digits = all_unquoted_digits && (c >= '0' && c <= '9'); 844 845 if (c == '$') 846 { 847 int c2; 848 849 /* An unquoted dollar indicates we are not inside '...'. */ 850 if (open_singlequote) 851 abort (); 852 /* After reading a dollar, we know that there is no pushed back 853 character from an earlier lookahead. */ 854 if (phase2_pushback_length > 0) 855 abort (); 856 /* Therefore we can use phase1 without interfering with phase2. 857 We need to recognize $( outside and inside double-quotes. 858 It would be incorrect to do 859 c2 = phase2_getc (); 860 if (c2 == '(' || c2 == QUOTED ('(')) 861 because that would also trigger for $\(. */ 862 c2 = phase1_getc (); 863 if (c2 == '(') 864 { 865 bool saved_open_doublequote; 866 int c3; 867 868 phase1_ungetc (c2); 869 870 /* The entire inner command or arithmetic expression is read 871 ignoring possible surrounding double-quotes. */ 872 saved_open_doublequote = open_doublequote; 873 open_doublequote = false; 874 875 c2 = phase2_getc (); 876 if (c2 != '(') 877 abort (); 878 879 c3 = phase2_getc (); 880 if (c3 == '(') 881 { 882 /* Arithmetic expression (Bash syntax). Skip until the 883 matching closing parenthesis. */ 884 unsigned int depth = 2; 885 886 do 887 { 888 c = phase2_getc (); 889 if (c == '(') 890 depth++; 891 else if (c == ')') 892 if (--depth == 0) 893 break; 894 } 895 while (c != EOF); 896 } 897 else 898 { 899 /* Command substitution (Bash syntax). 
*/ 900 phase2_ungetc (c3); 901 read_command_list (')', context); 902 } 903 904 open_doublequote = saved_open_doublequote; 905 } 906 else 907 { 908 phase1_ungetc (c2); 909 c2 = phase2_getc (); 910 911 if (c2 == '\'' && !open_singlequote) 912 { 913 /* Bash builtin for string with ANSI-C escape sequences. */ 914 saw_opening_singlequote (); 915 for (;;) 916 { 917 c = phase2_getc (); 918 if (c == EOF) 919 break; 920 if (c == '\'') 921 { 922 saw_closing_singlequote (); 923 break; 924 } 925 if (c == '\\') 926 { 927 c = phase2_getc (); 928 switch (c) 929 { 930 default: 931 phase2_ungetc (c); 932 c = '\\'; 933 break; 934 935 case '\\': 936 break; 937 case '\'': 938 /* Don't call saw_closing_singlequote () 939 here. */ 940 break; 941 942 case 'a': 943 c = '\a'; 944 break; 945 case 'b': 946 c = '\b'; 947 break; 948 case 'e': 949 c = 0x1b; /* ESC */ 950 break; 951 case 'f': 952 c = '\f'; 953 break; 954 case 'n': 955 c = '\n'; 956 break; 957 case 'r': 958 c = '\r'; 959 break; 960 case 't': 961 c = '\t'; 962 break; 963 case 'v': 964 c = '\v'; 965 break; 966 967 case 'x': 968 c = phase2_getc (); 969 if ((c >= '0' && c <= '9') 970 || (c >= 'A' && c <= 'F') 971 || (c >= 'a' && c <= 'f')) 972 { 973 int n; 974 975 if (c >= '0' && c <= '9') 976 n = c - '0'; 977 else if (c >= 'A' && c <= 'F') 978 n = 10 + c - 'A'; 979 else if (c >= 'a' && c <= 'f') 980 n = 10 + c - 'a'; 981 else 982 abort (); 983 984 c = phase2_getc (); 985 if ((c >= '0' && c <= '9') 986 || (c >= 'A' && c <= 'F') 987 || (c >= 'a' && c <= 'f')) 988 { 989 if (c >= '0' && c <= '9') 990 n = n * 16 + c - '0'; 991 else if (c >= 'A' && c <= 'F') 992 n = n * 16 + 10 + c - 'A'; 993 else if (c >= 'a' && c <= 'f') 994 n = n * 16 + 10 + c - 'a'; 995 else 996 abort (); 997 } 998 else 999 phase2_ungetc (c); 1000 1001 c = n; 1002 } 1003 else 1004 { 1005 phase2_ungetc (c); 1006 phase2_ungetc ('x'); 1007 c = '\\'; 1008 } 1009 break; 1010 1011 case '0': case '1': case '2': case '3': 1012 case '4': case '5': case '6': case '7': 1013 { 
1014 int n = c - '0'; 1015 1016 c = phase2_getc (); 1017 if (c >= '0' && c <= '7') 1018 { 1019 n = n * 8 + c - '0'; 1020 1021 c = phase2_getc (); 1022 if (c >= '0' && c <= '7') 1023 n = n * 8 + c - '0'; 1024 else 1025 phase2_ungetc (c); 1026 } 1027 else 1028 phase2_ungetc (c); 1029 1030 c = n; 1031 } 1032 break; 1033 } 1034 } 1035 if (wp->type == t_string) 1036 { 1037 grow_token (wp->token); 1038 wp->token->chars[wp->token->charcount++] = 1039 (unsigned char) c; 1040 } 1041 } 1042 /* The result is a literal string. Don't change wp->type. */ 1043 continue; 1044 } 1045 else if (c2 == '"' && !open_doublequote) 1046 { 1047 /* Bash builtin for internationalized string. */ 1048 lex_pos_ty pos; 1049 struct token string; 1050 1051 saw_opening_singlequote (); 1052 open_singlequote_terminator = '"'; 1053 pos.file_name = logical_file_name; 1054 pos.line_number = line_number; 1055 init_token (&string); 1056 for (;;) 1057 { 1058 c = phase2_getc (); 1059 if (c == EOF) 1060 break; 1061 if (c == '"') 1062 { 1063 saw_closing_singlequote (); 1064 break; 1065 } 1066 grow_token (&string); 1067 string.chars[string.charcount++] = (unsigned char) c; 1068 } 1069 remember_a_message (mlp, NULL, string_of_token (&string), 1070 context, &pos, savable_comment); 1071 free_token (&string); 1072 1073 error_with_progname = false; 1074 error (0, 0, _("%s:%lu: warning: the syntax $\"...\" is deprecated due to security reasons; use eval_gettext instead"), 1075 pos.file_name, (unsigned long) pos.line_number); 1076 error_with_progname = true; 1077 1078 /* The result at runtime is not constant. Therefore we 1079 change wp->type. */ 1080 } 1081 else 1082 phase2_ungetc (c2); 1083 } 1084 wp->type = t_other; 1085 continue; 1086 } 1087 1088 if (c == '\'') 1089 { 1090 if (!open_singlequote) 1091 { 1092 /* Handle an opening single quote. */ 1093 saw_opening_singlequote (); 1094 } 1095 else 1096 { 1097 /* Handle a closing single quote. 
*/ 1098 saw_closing_singlequote (); 1099 } 1100 continue; 1101 } 1102 1103 if (c == '"') 1104 { 1105 if (open_singlequote && open_singlequote_terminator == '"') 1106 { 1107 /* Handle a closing i18n quote. */ 1108 saw_closing_singlequote (); 1109 } 1110 else if (!open_doublequote) 1111 { 1112 /* Handle an opening double quote. */ 1113 saw_opening_doublequote (); 1114 } 1115 else 1116 { 1117 /* Handle a closing double quote. */ 1118 saw_closing_doublequote (); 1119 } 1120 continue; 1121 } 1122 1123 if (c == OPENING_BACKQUOTE) 1124 { 1125 /* Handle an opening backquote. */ 1126 saw_opening_backquote (); 1127 1128 read_command_list (CLOSING_BACKQUOTE, context); 1129 1130 wp->type = t_other; 1131 continue; 1132 } 1133 if (c == CLOSING_BACKQUOTE) 1134 break; 1135 1136 if (c == '<' || c == '>') 1137 { 1138 int c2; 1139 1140 /* An unquoted c indicates we are not inside '...' nor "...". */ 1141 if (open_singlequote || open_doublequote) 1142 abort (); 1143 1144 c2 = phase2_getc (); 1145 if (c2 == '(') 1146 { 1147 /* Process substitution (Bash syntax). */ 1148 read_command_list (')', context); 1149 1150 wp->type = t_other; 1151 continue; 1152 } 1153 else 1154 phase2_ungetc (c2); 1155 } 1156 1157 if (!open_singlequote && !open_doublequote 1158 && (is_whitespace (c) || is_operator_start (c))) 1159 break; 1160 1161 if (wp->type == t_string) 1162 { 1163 grow_token (wp->token); 1164 wp->token->chars[wp->token->charcount++] = (unsigned char) c; 1165 } 1166 } 1167 1168 phase2_ungetc (c); 1169 1170 if (wp->type != t_string) 1171 { 1172 free_token (wp->token); 1173 free (wp->token); 1174 } 1175 last_non_comment_line = line_number; 1176 } 1177 1178 1179 /* Read the next command. 1180 'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')' 1181 or '\0'. 1182 Returns the type of the word that terminated the command. */ 1183 static enum word_type 1184 read_command (int looking_for, flag_context_ty outer_context) 1185 { 1186 /* Read the words that make up the command. 
1187 Here we completely ignore field splitting at whitespace and wildcard 1188 expansions; i.e. we assume that the source is written in such a way that 1189 every word in the program determines exactly one word in the resulting 1190 command. 1191 But we do not require that the 'gettext'/'ngettext' command is the 1192 first in the command; this is because 1. we want to allow for prefixes 1193 like "$verbose" that may expand to nothing, and 2. it's a big effort 1194 to know where a command starts in a $(for ...) or $(case ...) compound 1195 command. */ 1196 int arg = 0; /* Current argument number. */ 1197 bool arg_of_redirect = false; /* True right after a redirection operator. */ 1198 flag_context_list_iterator_ty context_iter; 1199 const struct callshapes *shapes = NULL; 1200 struct arglist_parser *argparser = NULL; 1201 1202 for (;;) 1203 { 1204 struct word inner; 1205 flag_context_ty inner_context; 1206 1207 if (arg == 0) 1208 inner_context = null_context; 1209 else 1210 inner_context = 1211 inherited_context (outer_context, 1212 flag_context_list_iterator_advance ( 1213 &context_iter)); 1214 1215 read_word (&inner, looking_for, inner_context); 1216 1217 /* Recognize end of command. */ 1218 if (inner.type == t_separator 1219 || inner.type == t_backquote || inner.type == t_paren 1220 || inner.type == t_eof) 1221 { 1222 if (argparser != NULL) 1223 arglist_parser_done (argparser, arg); 1224 return inner.type; 1225 } 1226 1227 if (extract_all) 1228 { 1229 if (inner.type == t_string) 1230 { 1231 lex_pos_ty pos; 1232 1233 pos.file_name = logical_file_name; 1234 pos.line_number = inner.line_number_at_start; 1235 remember_a_message (mlp, NULL, string_of_word (&inner), 1236 inner_context, &pos, savable_comment); 1237 } 1238 } 1239 1240 if (arg_of_redirect) 1241 { 1242 /* Ignore arguments of redirection operators. */ 1243 arg_of_redirect = false; 1244 } 1245 else if (inner.type == t_redirect) 1246 { 1247 /* Ignore this word and the following one. 
*/ 1248 arg_of_redirect = true; 1249 } 1250 else 1251 { 1252 if (argparser == NULL) 1253 { 1254 /* This is the function position. */ 1255 arg = 0; 1256 if (inner.type == t_string) 1257 { 1258 char *function_name = string_of_word (&inner); 1259 void *keyword_value; 1260 1261 if (hash_find_entry (&keywords, 1262 function_name, strlen (function_name), 1263 &keyword_value) 1264 == 0) 1265 shapes = (const struct callshapes *) keyword_value; 1266 1267 argparser = arglist_parser_alloc (mlp, shapes); 1268 1269 context_iter = 1270 flag_context_list_iterator ( 1271 flag_context_list_table_lookup ( 1272 flag_context_list_table, 1273 function_name, strlen (function_name))); 1274 1275 free (function_name); 1276 } 1277 else 1278 context_iter = null_context_list_iterator; 1279 } 1280 else 1281 { 1282 /* These are the argument positions. */ 1283 if (inner.type == t_string) 1284 arglist_parser_remember (argparser, arg, 1285 string_of_word (&inner), 1286 inner_context, 1287 logical_file_name, 1288 inner.line_number_at_start, 1289 savable_comment); 1290 1291 if (arglist_parser_decidedp (argparser, arg)) 1292 { 1293 /* Stop looking for arguments of the last function_name. */ 1294 /* FIXME: What about context_iter? */ 1295 arglist_parser_done (argparser, arg); 1296 shapes = NULL; 1297 argparser = NULL; 1298 } 1299 } 1300 1301 arg++; 1302 } 1303 1304 free_word (&inner); 1305 } 1306 } 1307 1308 1309 /* Read a list of commands. 1310 'looking_for' denotes a parse terminator, either CLOSING_BACKQUOTE, ')' 1311 or '\0'. 1312 Returns the type of the word that terminated the command list. 
*/ 1313 static enum word_type 1314 read_command_list (int looking_for, flag_context_ty outer_context) 1315 { 1316 for (;;) 1317 { 1318 enum word_type terminator; 1319 1320 terminator = read_command (looking_for, outer_context); 1321 if (terminator != t_separator) 1322 return terminator; 1323 } 1324 } 1325 1326 1327 void 1328 extract_sh (FILE *f, 1329 const char *real_filename, const char *logical_filename, 1330 flag_context_list_table_ty *flag_table, 1331 msgdomain_list_ty *mdlp) 1332 { 1333 mlp = mdlp->item[0]->messages; 1334 1335 fp = f; 1336 real_file_name = real_filename; 1337 logical_file_name = xstrdup (logical_filename); 1338 line_number = 1; 1339 1340 last_comment_line = -1; 1341 last_non_comment_line = -1; 1342 1343 nested_backquotes = 0; 1344 open_doublequotes_mask = 0; 1345 open_doublequote = false; 1346 open_singlequote = false; 1347 1348 flag_context_list_table = flag_table; 1349 1350 init_keywords (); 1351 1352 /* Eat tokens until eof is seen. */ 1353 read_command_list ('\0', null_context); 1354 1355 fp = NULL; 1356 real_file_name = NULL; 1357 logical_file_name = NULL; 1358 line_number = 0; 1359 } 1360