1 /* xgettext PHP backend. 2 Copyright (C) 2001-2003, 2005-2006 Free Software Foundation, Inc. 3 4 This file was written by Bruno Haible <bruno@clisp.org>, 2002. 5 6 This program is free software; you can redistribute it and/or modify 7 it under the terms of the GNU General Public License as published by 8 the Free Software Foundation; either version 2, or (at your option) 9 any later version. 10 11 This program is distributed in the hope that it will be useful, 12 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 GNU General Public License for more details. 15 16 You should have received a copy of the GNU General Public License 17 along with this program; if not, write to the Free Software Foundation, 18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ 19 20 #ifdef HAVE_CONFIG_H 21 # include "config.h" 22 #endif 23 24 #include <errno.h> 25 #include <stdbool.h> 26 #include <stdio.h> 27 #include <stdlib.h> 28 29 #include "message.h" 30 #include "xgettext.h" 31 #include "x-php.h" 32 #include "error.h" 33 #include "xalloc.h" 34 #include "exit.h" 35 #include "gettext.h" 36 37 #define _(s) gettext(s) 38 39 #define SIZEOF(a) (sizeof(a) / sizeof(a[0])) 40 41 42 /* The PHP syntax is defined in phpdoc/manual/langref.html. 43 See also php-4.1.0/Zend/zend_language_scanner.l. 44 Note that variable and function names can contain bytes in the range 45 0x7f..0xff; see 46 http://www.php.net/manual/en/language.variables.php 47 http://www.php.net/manual/en/language.functions.php */ 48 49 50 /* ====================== Keyword set customization. ====================== */ 51 52 /* If true extract all strings. */ 53 static bool extract_all = false; 54 55 static hash_table keywords; 56 static bool default_keywords = true; 57 58 59 void 60 x_php_extract_all () 61 { 62 extract_all = true; 63 } 64 65 66 void 67 x_php_keyword (const char *name) 68 { 69 if (name == NULL) 70 default_keywords = false; 71 else 72 { 73 const char *end; 74 struct callshape shape; 75 const char *colon; 76 77 if (keywords.table == NULL) 78 hash_init (&keywords, 100); 79 80 split_keywordspec (name, &end, &shape); 81 82 /* The characters between name and end should form a valid C identifier. 83 A colon means an invalid parse in split_keywordspec(). */ 84 colon = strchr (name, ':'); 85 if (colon == NULL || colon >= end) 86 insert_keyword_callshape (&keywords, name, end - name, &shape); 87 } 88 } 89 90 /* Finish initializing the keywords hash table. 91 Called after argument processing, before each file is processed. */ 92 static void 93 init_keywords () 94 { 95 if (default_keywords) 96 { 97 /* When adding new keywords here, also update the documentation in 98 xgettext.texi! */ 99 x_php_keyword ("_"); 100 x_php_keyword ("gettext"); 101 x_php_keyword ("dgettext:2"); 102 x_php_keyword ("dcgettext:2"); 103 /* The following were added in PHP 4.2.0. */ 104 x_php_keyword ("ngettext:1,2"); 105 x_php_keyword ("dngettext:2,3"); 106 x_php_keyword ("dcngettext:2,3"); 107 default_keywords = false; 108 } 109 } 110 111 void 112 init_flag_table_php () 113 { 114 xgettext_record_flag ("_:1:pass-php-format"); 115 xgettext_record_flag ("gettext:1:pass-php-format"); 116 xgettext_record_flag ("dgettext:2:pass-php-format"); 117 xgettext_record_flag ("dcgettext:2:pass-php-format"); 118 xgettext_record_flag ("ngettext:1:pass-php-format"); 119 xgettext_record_flag ("ngettext:2:pass-php-format"); 120 xgettext_record_flag ("dngettext:2:pass-php-format"); 121 xgettext_record_flag ("dngettext:3:pass-php-format"); 122 xgettext_record_flag ("dcngettext:2:pass-php-format"); 123 xgettext_record_flag ("dcngettext:3:pass-php-format"); 124 xgettext_record_flag ("sprintf:1:php-format"); 125 xgettext_record_flag ("printf:1:php-format"); 126 } 127 128 129 /* ======================== Reading of characters. ======================== */ 130 131 132 /* Real filename, used in error messages about the input file. */ 133 static const char *real_file_name; 134 135 /* Logical filename and line number, used to label the extracted messages. */ 136 static char *logical_file_name; 137 static int line_number; 138 139 /* The input file stream. */ 140 static FILE *fp; 141 142 143 /* 1. line_number handling. */ 144 145 static unsigned char phase1_pushback[2]; 146 static int phase1_pushback_length; 147 148 static int 149 phase1_getc () 150 { 151 int c; 152 153 if (phase1_pushback_length) 154 c = phase1_pushback[--phase1_pushback_length]; 155 else 156 { 157 c = getc (fp); 158 159 if (c == EOF) 160 { 161 if (ferror (fp)) 162 error (EXIT_FAILURE, errno, _("error while reading \"%s\""), 163 real_file_name); 164 return EOF; 165 } 166 } 167 168 if (c == '\n') 169 line_number++; 170 171 return c; 172 } 173 174 /* Supports 2 characters of pushback. */ 175 static void 176 phase1_ungetc (int c) 177 { 178 if (c != EOF) 179 { 180 if (c == '\n') 181 --line_number; 182 183 if (phase1_pushback_length == SIZEOF (phase1_pushback)) 184 abort (); 185 phase1_pushback[phase1_pushback_length++] = c; 186 } 187 } 188 189 190 /* 2. Ignore HTML sections. They are equivalent to PHP echo commands and 191 therefore don't contain translatable strings. */ 192 193 static void 194 skip_html () 195 { 196 for (;;) 197 { 198 int c = phase1_getc (); 199 200 if (c == EOF) 201 return; 202 203 if (c == '<') 204 { 205 int c2 = phase1_getc (); 206 207 if (c2 == EOF) 208 break; 209 210 if (c2 == '?') 211 { 212 /* <?php is the normal way to enter PHP mode. <? and <?= are 213 recognized by PHP depending on a configuration setting. */ 214 int c3 = phase1_getc (); 215 216 if (c3 != '=') 217 phase1_ungetc (c3); 218 219 return; 220 } 221 222 if (c2 == '%') 223 { 224 /* <% and <%= are recognized by PHP depending on a configuration 225 setting. */ 226 int c3 = phase1_getc (); 227 228 if (c3 != '=') 229 phase1_ungetc (c3); 230 231 return; 232 } 233 234 if (c2 == '<') 235 { 236 phase1_ungetc (c2); 237 continue; 238 } 239 240 /* < script language = php > 241 < script language = "php" > 242 < script language = 'php' > 243 are always recognized. */ 244 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') 245 c2 = phase1_getc (); 246 if (c2 != 's' && c2 != 'S') 247 { 248 phase1_ungetc (c2); 249 continue; 250 } 251 c2 = phase1_getc (); 252 if (c2 != 'c' && c2 != 'C') 253 { 254 phase1_ungetc (c2); 255 continue; 256 } 257 c2 = phase1_getc (); 258 if (c2 != 'r' && c2 != 'R') 259 { 260 phase1_ungetc (c2); 261 continue; 262 } 263 c2 = phase1_getc (); 264 if (c2 != 'i' && c2 != 'I') 265 { 266 phase1_ungetc (c2); 267 continue; 268 } 269 c2 = phase1_getc (); 270 if (c2 != 'p' && c2 != 'P') 271 { 272 phase1_ungetc (c2); 273 continue; 274 } 275 c2 = phase1_getc (); 276 if (c2 != 't' && c2 != 'T') 277 { 278 phase1_ungetc (c2); 279 continue; 280 } 281 c2 = phase1_getc (); 282 if (!(c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')) 283 { 284 phase1_ungetc (c2); 285 continue; 286 } 287 do 288 c2 = phase1_getc (); 289 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r'); 290 if (c2 != 'l' && c2 != 'L') 291 { 292 phase1_ungetc (c2); 293 continue; 294 } 295 c2 = phase1_getc (); 296 if (c2 != 'a' && c2 != 'A') 297 { 298 phase1_ungetc (c2); 299 continue; 300 } 301 c2 = phase1_getc (); 302 if (c2 != 'n' && c2 != 'N') 303 { 304 phase1_ungetc (c2); 305 continue; 306 } 307 c2 = phase1_getc (); 308 if (c2 != 'g' && c2 != 'G') 309 { 310 phase1_ungetc (c2); 311 continue; 312 } 313 c2 = phase1_getc (); 314 if (c2 != 'u' && c2 != 'U') 315 { 316 phase1_ungetc (c2); 317 continue; 318 } 319 c2 = phase1_getc (); 320 if (c2 != 'a' && c2 != 'A') 321 { 322 phase1_ungetc (c2); 323 continue; 324 } 325 c2 = phase1_getc (); 326 if (c2 != 'g' && c2 != 'G') 327 { 328 phase1_ungetc (c2); 329 continue; 330 } 331 c2 = phase1_getc (); 332 if (c2 != 'e' && c2 != 'E') 333 { 334 phase1_ungetc (c2); 335 continue; 336 } 337 c2 = phase1_getc (); 338 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') 339 c2 = phase1_getc (); 340 if (c2 != '=') 341 { 342 phase1_ungetc (c2); 343 continue; 344 } 345 c2 = phase1_getc (); 346 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') 347 c2 = phase1_getc (); 348 if (c2 == '"') 349 { 350 c2 = phase1_getc (); 351 if (c2 != 'p') 352 { 353 phase1_ungetc (c2); 354 continue; 355 } 356 c2 = phase1_getc (); 357 if (c2 != 'h') 358 { 359 phase1_ungetc (c2); 360 continue; 361 } 362 c2 = phase1_getc (); 363 if (c2 != 'p') 364 { 365 phase1_ungetc (c2); 366 continue; 367 } 368 c2 = phase1_getc (); 369 if (c2 != '"') 370 { 371 phase1_ungetc (c2); 372 continue; 373 } 374 } 375 else if (c2 == '\'') 376 { 377 c2 = phase1_getc (); 378 if (c2 != 'p') 379 { 380 phase1_ungetc (c2); 381 continue; 382 } 383 c2 = phase1_getc (); 384 if (c2 != 'h') 385 { 386 phase1_ungetc (c2); 387 continue; 388 } 389 c2 = phase1_getc (); 390 if (c2 != 'p') 391 { 392 phase1_ungetc (c2); 393 continue; 394 } 395 c2 = phase1_getc (); 396 if (c2 != '\'') 397 { 398 phase1_ungetc (c2); 399 continue; 400 } 401 } 402 else 403 { 404 if (c2 != 'p') 405 { 406 phase1_ungetc (c2); 407 continue; 408 } 409 c2 = phase1_getc (); 410 if (c2 != 'h') 411 { 412 phase1_ungetc (c2); 413 continue; 414 } 415 c2 = phase1_getc (); 416 if (c2 != 'p') 417 { 418 phase1_ungetc (c2); 419 continue; 420 } 421 } 422 c2 = phase1_getc (); 423 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') 424 c2 = phase1_getc (); 425 if (c2 != '>') 426 { 427 phase1_ungetc (c2); 428 continue; 429 } 430 return; 431 } 432 } 433 } 434 435 #if 0 436 437 static unsigned char phase2_pushback[1]; 438 static int phase2_pushback_length; 439 440 static int 441 phase2_getc () 442 { 443 int c; 444 445 if (phase2_pushback_length) 446 return phase2_pushback[--phase2_pushback_length]; 447 448 c = phase1_getc (); 449 switch (c) 450 { 451 case '?': 452 case '%': 453 { 454 int c2 = phase1_getc (); 455 if (c2 == '>') 456 { 457 /* ?> and %> terminate PHP mode and switch back to HTML mode. */ 458 skip_html (); 459 return ' '; 460 } 461 phase1_ungetc (c2); 462 } 463 break; 464 465 case '<': 466 { 467 int c2 = phase1_getc (); 468 469 /* < / script > terminates PHP mode and switches back to HTML mode. */ 470 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') 471 c2 = phase1_getc (); 472 if (c2 == '/') 473 { 474 do 475 c2 = phase1_getc (); 476 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r'); 477 if (c2 == 's' || c2 == 'S') 478 { 479 c2 = phase1_getc (); 480 if (c2 == 'c' || c2 == 'C') 481 { 482 c2 = phase1_getc (); 483 if (c2 == 'r' || c2 == 'R') 484 { 485 c2 = phase1_getc (); 486 if (c2 == 'i' || c2 == 'I') 487 { 488 c2 = phase1_getc (); 489 if (c2 == 'p' || c2 == 'P') 490 { 491 c2 = phase1_getc (); 492 if (c2 == 't' || c2 == 'T') 493 { 494 do 495 c2 = phase1_getc (); 496 while (c2 == ' ' || c2 == '\t' 497 || c2 == '\n' || c2 == '\r'); 498 if (c2 == '>') 499 { 500 skip_html (); 501 return ' '; 502 } 503 } 504 } 505 } 506 } 507 } 508 } 509 } 510 phase1_ungetc (c2); 511 } 512 break; 513 } 514 515 return c; 516 } 517 518 static void 519 phase2_ungetc (int c) 520 { 521 if (c != EOF) 522 { 523 if (phase2_pushback_length == SIZEOF (phase2_pushback)) 524 abort (); 525 phase2_pushback[phase2_pushback_length++] = c; 526 } 527 } 528 529 #endif 530 531 532 /* Accumulating comments. */ 533 534 static char *buffer; 535 static size_t bufmax; 536 static size_t buflen; 537 538 static inline void 539 comment_start () 540 { 541 buflen = 0; 542 } 543 544 static inline void 545 comment_add (int c) 546 { 547 if (buflen >= bufmax) 548 { 549 bufmax = 2 * bufmax + 10; 550 buffer = xrealloc (buffer, bufmax); 551 } 552 buffer[buflen++] = c; 553 } 554 555 static inline void 556 comment_line_end (size_t chars_to_remove) 557 { 558 buflen -= chars_to_remove; 559 while (buflen >= 1 560 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t')) 561 --buflen; 562 if (chars_to_remove == 0 && buflen >= bufmax) 563 { 564 bufmax = 2 * bufmax + 10; 565 buffer = xrealloc (buffer, bufmax); 566 } 567 buffer[buflen] = '\0'; 568 savable_comment_add (buffer); 569 } 570 571 572 /* 3. Replace each comment that is not inside a string literal with a 573 space character. We need to remember the comment for later, because 574 it may be attached to a keyword string. */ 575 576 /* These are for tracking whether comments count as immediately before 577 keyword. */ 578 static int last_comment_line; 579 static int last_non_comment_line; 580 581 static unsigned char phase3_pushback[1]; 582 static int phase3_pushback_length; 583 584 static int 585 phase3_getc () 586 { 587 int lineno; 588 int c; 589 590 if (phase3_pushback_length) 591 return phase3_pushback[--phase3_pushback_length]; 592 593 c = phase1_getc (); 594 595 if (c == '#') 596 { 597 /* sh comment. */ 598 bool last_was_qmark = false; 599 600 comment_start (); 601 lineno = line_number; 602 for (;;) 603 { 604 c = phase1_getc (); 605 if (c == '\n' || c == EOF) 606 { 607 comment_line_end (0); 608 break; 609 } 610 if (last_was_qmark && c == '>') 611 { 612 comment_line_end (1); 613 skip_html (); 614 break; 615 } 616 /* We skip all leading white space, but not EOLs. */ 617 if (!(buflen == 0 && (c == ' ' || c == '\t'))) 618 comment_add (c); 619 last_was_qmark = (c == '?' || c == '%'); 620 } 621 last_comment_line = lineno; 622 return '\n'; 623 } 624 else if (c == '/') 625 { 626 c = phase1_getc (); 627 628 switch (c) 629 { 630 default: 631 phase1_ungetc (c); 632 return '/'; 633 634 case '*': 635 { 636 /* C comment. */ 637 bool last_was_star; 638 639 comment_start (); 640 lineno = line_number; 641 last_was_star = false; 642 for (;;) 643 { 644 c = phase1_getc (); 645 if (c == EOF) 646 break; 647 /* We skip all leading white space, but not EOLs. */ 648 if (buflen == 0 && (c == ' ' || c == '\t')) 649 continue; 650 comment_add (c); 651 switch (c) 652 { 653 case '\n': 654 comment_line_end (1); 655 comment_start (); 656 lineno = line_number; 657 last_was_star = false; 658 continue; 659 660 case '*': 661 last_was_star = true; 662 continue; 663 664 case '/': 665 if (last_was_star) 666 { 667 comment_line_end (2); 668 break; 669 } 670 /* FALLTHROUGH */ 671 672 default: 673 last_was_star = false; 674 continue; 675 } 676 break; 677 } 678 last_comment_line = lineno; 679 return ' '; 680 } 681 682 case '/': 683 { 684 /* C++ comment. */ 685 bool last_was_qmark = false; 686 687 comment_start (); 688 lineno = line_number; 689 for (;;) 690 { 691 c = phase1_getc (); 692 if (c == '\n' || c == EOF) 693 { 694 comment_line_end (0); 695 break; 696 } 697 if (last_was_qmark && c == '>') 698 { 699 comment_line_end (1); 700 skip_html (); 701 break; 702 } 703 /* We skip all leading white space, but not EOLs. */ 704 if (!(buflen == 0 && (c == ' ' || c == '\t'))) 705 comment_add (c); 706 last_was_qmark = (c == '?' || c == '%'); 707 } 708 last_comment_line = lineno; 709 return '\n'; 710 } 711 } 712 } 713 else 714 return c; 715 } 716 717 #ifdef unused 718 static void 719 phase3_ungetc (int c) 720 { 721 if (c != EOF) 722 { 723 if (phase3_pushback_length == SIZEOF (phase3_pushback)) 724 abort (); 725 phase3_pushback[phase3_pushback_length++] = c; 726 } 727 } 728 #endif 729 730 731 /* ========================== Reading of tokens. ========================== */ 732 733 734 enum token_type_ty 735 { 736 token_type_eof, 737 token_type_lparen, /* ( */ 738 token_type_rparen, /* ) */ 739 token_type_comma, /* , */ 740 token_type_string_literal, /* "abc" */ 741 token_type_symbol, /* symbol, number */ 742 token_type_other /* misc. operator */ 743 }; 744 typedef enum token_type_ty token_type_ty; 745 746 typedef struct token_ty token_ty; 747 struct token_ty 748 { 749 token_type_ty type; 750 char *string; /* for token_type_string_literal, token_type_symbol */ 751 int line_number; 752 }; 753 754 755 /* Free the memory pointed to by a 'struct token_ty'. */ 756 static inline void 757 free_token (token_ty *tp) 758 { 759 if (tp->type == token_type_string_literal || tp->type == token_type_symbol) 760 free (tp->string); 761 } 762 763 764 /* 4. Combine characters into tokens. Discard whitespace. */ 765 766 static void 767 x_php_lex (token_ty *tp) 768 { 769 static char *buffer; 770 static int bufmax; 771 int bufpos; 772 int c; 773 774 tp->string = NULL; 775 776 for (;;) 777 { 778 tp->line_number = line_number; 779 c = phase3_getc (); 780 switch (c) 781 { 782 case EOF: 783 tp->type = token_type_eof; 784 return; 785 786 case '\n': 787 if (last_non_comment_line > last_comment_line) 788 savable_comment_reset (); 789 /* FALLTHROUGH */ 790 case ' ': 791 case '\t': 792 case '\r': 793 /* Ignore whitespace. */ 794 continue; 795 } 796 797 last_non_comment_line = tp->line_number; 798 799 switch (c) 800 { 801 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 802 case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N': 803 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': 804 case 'V': case 'W': case 'X': case 'Y': case 'Z': 805 case '_': 806 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 807 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 808 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': 809 case 'v': case 'w': case 'x': case 'y': case 'z': 810 case 127: case 128: case 129: case 130: case 131: case 132: case 133: 811 case 134: case 135: case 136: case 137: case 138: case 139: case 140: 812 case 141: case 142: case 143: case 144: case 145: case 146: case 147: 813 case 148: case 149: case 150: case 151: case 152: case 153: case 154: 814 case 155: case 156: case 157: case 158: case 159: case 160: case 161: 815 case 162: case 163: case 164: case 165: case 166: case 167: case 168: 816 case 169: case 170: case 171: case 172: case 173: case 174: case 175: 817 case 176: case 177: case 178: case 179: case 180: case 181: case 182: 818 case 183: case 184: case 185: case 186: case 187: case 188: case 189: 819 case 190: case 191: case 192: case 193: case 194: case 195: case 196: 820 case 197: case 198: case 199: case 200: case 201: case 202: case 203: 821 case 204: case 205: case 206: case 207: case 208: case 209: case 210: 822 case 211: case 212: case 213: case 214: case 215: case 216: case 217: 823 case 218: case 219: case 220: case 221: case 222: case 223: case 224: 824 case 225: case 226: case 227: case 228: case 229: case 230: case 231: 825 case 232: case 233: case 234: case 235: case 236: case 237: case 238: 826 case 239: case 240: case 241: case 242: case 243: case 244: case 245: 827 case 246: case 247: case 248: case 249: case 250: case 251: case 252: 828 case 253: case 254: case 255: 829 bufpos = 0; 830 for (;;) 831 { 832 if (bufpos >= bufmax) 833 { 834 bufmax = 2 * bufmax + 10; 835 buffer = xrealloc (buffer, bufmax); 836 } 837 buffer[bufpos++] = c; 838 c = phase1_getc (); 839 switch (c) 840 { 841 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 842 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 843 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 844 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 845 case 'Y': case 'Z': 846 case '_': 847 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 848 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 849 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 850 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 851 case 'y': case 'z': 852 case '0': case '1': case '2': case '3': case '4': 853 case '5': case '6': case '7': case '8': case '9': 854 case 127: case 128: case 129: case 130: case 131: case 132: 855 case 133: case 134: case 135: case 136: case 137: case 138: 856 case 139: case 140: case 141: case 142: case 143: case 144: 857 case 145: case 146: case 147: case 148: case 149: case 150: 858 case 151: case 152: case 153: case 154: case 155: case 156: 859 case 157: case 158: case 159: case 160: case 161: case 162: 860 case 163: case 164: case 165: case 166: case 167: case 168: 861 case 169: case 170: case 171: case 172: case 173: case 174: 862 case 175: case 176: case 177: case 178: case 179: case 180: 863 case 181: case 182: case 183: case 184: case 185: case 186: 864 case 187: case 188: case 189: case 190: case 191: case 192: 865 case 193: case 194: case 195: case 196: case 197: case 198: 866 case 199: case 200: case 201: case 202: case 203: case 204: 867 case 205: case 206: case 207: case 208: case 209: case 210: 868 case 211: case 212: case 213: case 214: case 215: case 216: 869 case 217: case 218: case 219: case 220: case 221: case 222: 870 case 223: case 224: case 225: case 226: case 227: case 228: 871 case 229: case 230: case 231: case 232: case 233: case 234: 872 case 235: case 236: case 237: case 238: case 239: case 240: 873 case 241: case 242: case 243: case 244: case 245: case 246: 874 case 247: case 248: case 249: case 250: case 251: case 252: 875 case 253: case 254: case 255: 876 continue; 877 878 default: 879 phase1_ungetc (c); 880 break; 881 } 882 break; 883 } 884 if (bufpos >= bufmax) 885 { 886 bufmax = 2 * bufmax + 10; 887 buffer = xrealloc (buffer, bufmax); 888 } 889 buffer[bufpos] = 0; 890 tp->string = xstrdup (buffer); 891 tp->type = token_type_symbol; 892 return; 893 894 case '\'': 895 /* Single-quoted string literal. */ 896 bufpos = 0; 897 for (;;) 898 { 899 c = phase1_getc (); 900 if (c == EOF || c == '\'') 901 break; 902 if (c == '\\') 903 { 904 c = phase1_getc (); 905 if (c != '\\' && c != '\'') 906 { 907 phase1_ungetc (c); 908 c = '\\'; 909 } 910 } 911 if (bufpos >= bufmax) 912 { 913 bufmax = 2 * bufmax + 10; 914 buffer = xrealloc (buffer, bufmax); 915 } 916 buffer[bufpos++] = c; 917 } 918 if (bufpos >= bufmax) 919 { 920 bufmax = 2 * bufmax + 10; 921 buffer = xrealloc (buffer, bufmax); 922 } 923 buffer[bufpos] = 0; 924 tp->type = token_type_string_literal; 925 tp->string = xstrdup (buffer); 926 return; 927 928 case '"': 929 /* Double-quoted string literal. */ 930 tp->type = token_type_string_literal; 931 bufpos = 0; 932 for (;;) 933 { 934 c = phase1_getc (); 935 if (c == EOF || c == '"') 936 break; 937 if (c == '$') 938 { 939 c = phase1_getc (); 940 if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') 941 || c == '_' || c == '{' || c >= 0x7f) 942 { 943 /* String with variables. */ 944 tp->type = token_type_other; 945 continue; 946 } 947 phase1_ungetc (c); 948 c = '$'; 949 } 950 if (c == '{') 951 { 952 c = phase1_getc (); 953 if (c == '$') 954 { 955 /* String with expressions. */ 956 tp->type = token_type_other; 957 continue; 958 } 959 phase1_ungetc (c); 960 c = '{'; 961 } 962 if (c == '\\') 963 { 964 int n, j; 965 966 c = phase1_getc (); 967 switch (c) 968 { 969 case '"': 970 case '\\': 971 case '$': 972 break; 973 974 case '0': case '1': case '2': case '3': 975 case '4': case '5': case '6': case '7': 976 n = 0; 977 for (j = 0; j < 3; ++j) 978 { 979 n = n * 8 + c - '0'; 980 c = phase1_getc (); 981 switch (c) 982 { 983 default: 984 break; 985 986 case '0': case '1': case '2': case '3': 987 case '4': case '5': case '6': case '7': 988 continue; 989 } 990 break; 991 } 992 phase1_ungetc (c); 993 c = n; 994 break; 995 996 case 'x': 997 n = 0; 998 for (j = 0; j < 2; ++j) 999 { 1000 c = phase1_getc (); 1001 switch (c) 1002 { 1003 case '0': case '1': case '2': case '3': case '4': 1004 case '5': case '6': case '7': case '8': case '9': 1005 n = n * 16 + c - '0'; 1006 break; 1007 case 'A': case 'B': case 'C': case 'D': case 'E': 1008 case 'F': 1009 n = n * 16 + 10 + c - 'A'; 1010 break; 1011 case 'a': case 'b': case 'c': case 'd': case 'e': 1012 case 'f': 1013 n = n * 16 + 10 + c - 'a'; 1014 break; 1015 default: 1016 phase1_ungetc (c); 1017 c = 0; 1018 break; 1019 } 1020 if (c == 0) 1021 break; 1022 } 1023 if (j == 0) 1024 { 1025 phase1_ungetc ('x'); 1026 c = '\\'; 1027 } 1028 else 1029 c = n; 1030 break; 1031 1032 case 'n': 1033 c = '\n'; 1034 break; 1035 case 't': 1036 c = '\t'; 1037 break; 1038 case 'r': 1039 c = '\r'; 1040 break; 1041 1042 default: 1043 phase1_ungetc (c); 1044 c = '\\'; 1045 break; 1046 } 1047 } 1048 if (bufpos >= bufmax) 1049 { 1050 bufmax = 2 * bufmax + 10; 1051 buffer = xrealloc (buffer, bufmax); 1052 } 1053 buffer[bufpos++] = c; 1054 } 1055 if (bufpos >= bufmax) 1056 { 1057 bufmax = 2 * bufmax + 10; 1058 buffer = xrealloc (buffer, bufmax); 1059 } 1060 buffer[bufpos] = 0; 1061 if (tp->type == token_type_string_literal) 1062 tp->string = xstrdup (buffer); 1063 return; 1064 1065 case '?': 1066 case '%': 1067 { 1068 int c2 = phase1_getc (); 1069 if (c2 == '>') 1070 { 1071 /* ?> and %> terminate PHP mode and switch back to HTML 1072 mode. */ 1073 skip_html (); 1074 } 1075 else 1076 phase1_ungetc (c2); 1077 tp->type = token_type_other; 1078 return; 1079 } 1080 1081 case '(': 1082 tp->type = token_type_lparen; 1083 return; 1084 1085 case ')': 1086 tp->type = token_type_rparen; 1087 return; 1088 1089 case ',': 1090 tp->type = token_type_comma; 1091 return; 1092 1093 case '<': 1094 { 1095 int c2 = phase1_getc (); 1096 if (c2 == '<') 1097 { 1098 int c3 = phase1_getc (); 1099 if (c3 == '<') 1100 { 1101 /* Start of here document. 1102 Parse whitespace, then label, then newline. */ 1103 do 1104 c = phase3_getc (); 1105 while (c == ' ' || c == '\t' || c == '\n' || c == '\r'); 1106 1107 bufpos = 0; 1108 do 1109 { 1110 if (bufpos >= bufmax) 1111 { 1112 bufmax = 2 * bufmax + 10; 1113 buffer = xrealloc (buffer, bufmax); 1114 } 1115 buffer[bufpos++] = c; 1116 c = phase3_getc (); 1117 } 1118 while (c != EOF && c != '\n' && c != '\r'); 1119 /* buffer[0..bufpos-1] now contains the label. */ 1120 1121 /* Now skip the here document. */ 1122 for (;;) 1123 { 1124 c = phase1_getc (); 1125 if (c == EOF) 1126 break; 1127 if (c == '\n' || c == '\r') 1128 { 1129 int bufidx = 0; 1130 1131 while (bufidx < bufpos) 1132 { 1133 c = phase1_getc (); 1134 if (c == EOF) 1135 break; 1136 if (c != buffer[bufidx]) 1137 { 1138 phase1_ungetc (c); 1139 break; 1140 } 1141 bufidx++; 1142 } 1143 if (bufidx == bufpos) 1144 { 1145 c = phase1_getc (); 1146 if (c != ';') 1147 phase1_ungetc (c); 1148 c = phase1_getc (); 1149 if (c == '\n' || c == '\r') 1150 break; 1151 } 1152 } 1153 } 1154 1155 /* FIXME: Ideally we should turn the here document into a 1156 string literal if it didn't contain $ substitution. And 1157 we should also respect backslash escape sequences like 1158 in double-quoted strings. */ 1159 tp->type = token_type_other; 1160 return; 1161 } 1162 phase1_ungetc (c3); 1163 } 1164 1165 /* < / script > terminates PHP mode and switches back to HTML 1166 mode. */ 1167 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r') 1168 c2 = phase1_getc (); 1169 if (c2 == '/') 1170 { 1171 do 1172 c2 = phase1_getc (); 1173 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r'); 1174 if (c2 == 's' || c2 == 'S') 1175 { 1176 c2 = phase1_getc (); 1177 if (c2 == 'c' || c2 == 'C') 1178 { 1179 c2 = phase1_getc (); 1180 if (c2 == 'r' || c2 == 'R') 1181 { 1182 c2 = phase1_getc (); 1183 if (c2 == 'i' || c2 == 'I') 1184 { 1185 c2 = phase1_getc (); 1186 if (c2 == 'p' || c2 == 'P') 1187 { 1188 c2 = phase1_getc (); 1189 if (c2 == 't' || c2 == 'T') 1190 { 1191 do 1192 c2 = phase1_getc (); 1193 while (c2 == ' ' || c2 == '\t' 1194 || c2 == '\n' || c2 == '\r'); 1195 if (c2 == '>') 1196 { 1197 skip_html (); 1198 } 1199 else 1200 phase1_ungetc (c2); 1201 } 1202 else 1203 phase1_ungetc (c2); 1204 } 1205 else 1206 phase1_ungetc (c2); 1207 } 1208 else 1209 phase1_ungetc (c2); 1210 } 1211 else 1212 phase1_ungetc (c2); 1213 } 1214 else 1215 phase1_ungetc (c2); 1216 } 1217 else 1218 phase1_ungetc (c2); 1219 } 1220 else 1221 phase1_ungetc (c2); 1222 1223 tp->type = token_type_other; 1224 return; 1225 } 1226 1227 case '`': 1228 /* Execution operator. */ 1229 default: 1230 /* We could carefully recognize each of the 2 and 3 character 1231 operators, but it is not necessary, as we only need to recognize 1232 gettext invocations. Don't bother. */ 1233 tp->type = token_type_other; 1234 return; 1235 } 1236 } 1237 } 1238 1239 1240 /* ========================= Extracting strings. ========================== */ 1241 1242 1243 /* Context lookup table. */ 1244 static flag_context_list_table_ty *flag_context_list_table; 1245 1246 1247 /* The file is broken into tokens. Scan the token stream, looking for 1248 a keyword, followed by a left paren, followed by a string. When we 1249 see this sequence, we have something to remember. We assume we are 1250 looking at a valid C or C++ program, and leave the complaints about 1251 the grammar to the compiler. 1252 1253 Normal handling: Look for 1254 keyword ( ... msgid ... ) 1255 Plural handling: Look for 1256 keyword ( ... msgid ... msgid_plural ... ) 1257 1258 We use recursion because the arguments before msgid or between msgid 1259 and msgid_plural can contain subexpressions of the same form. */ 1260 1261 1262 /* Extract messages until the next balanced closing parenthesis. 1263 Extracted messages are added to MLP. 1264 Return true upon eof, false upon closing parenthesis. */ 1265 static bool 1266 extract_parenthesized (message_list_ty *mlp, 1267 flag_context_ty outer_context, 1268 flag_context_list_iterator_ty context_iter, 1269 struct arglist_parser *argparser) 1270 { 1271 /* Current argument number. */ 1272 int arg = 1; 1273 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */ 1274 int state; 1275 /* Parameters of the keyword just seen. Defined only in state 1. */ 1276 const struct callshapes *next_shapes = NULL; 1277 /* Context iterator that will be used if the next token is a '('. */ 1278 flag_context_list_iterator_ty next_context_iter = 1279 passthrough_context_list_iterator; 1280 /* Current context. */ 1281 flag_context_ty inner_context = 1282 inherited_context (outer_context, 1283 flag_context_list_iterator_advance (&context_iter)); 1284 1285 /* Start state is 0. */ 1286 state = 0; 1287 1288 for (;;) 1289 { 1290 token_ty token; 1291 1292 x_php_lex (&token); 1293 switch (token.type) 1294 { 1295 case token_type_symbol: 1296 { 1297 void *keyword_value; 1298 1299 if (hash_find_entry (&keywords, token.string, strlen (token.string), 1300 &keyword_value) 1301 == 0) 1302 { 1303 next_shapes = (const struct callshapes *) keyword_value; 1304 state = 1; 1305 } 1306 else 1307 state = 0; 1308 } 1309 next_context_iter = 1310 flag_context_list_iterator ( 1311 flag_context_list_table_lookup ( 1312 flag_context_list_table, 1313 token.string, strlen (token.string))); 1314 free (token.string); 1315 continue; 1316 1317 case token_type_lparen: 1318 if (extract_parenthesized (mlp, inner_context, next_context_iter, 1319 arglist_parser_alloc (mlp, 1320 state ? next_shapes : NULL))) 1321 { 1322 arglist_parser_done (argparser, arg); 1323 return true; 1324 } 1325 next_context_iter = null_context_list_iterator; 1326 state = 0; 1327 continue; 1328 1329 case token_type_rparen: 1330 arglist_parser_done (argparser, arg); 1331 return false; 1332 1333 case token_type_comma: 1334 arg++; 1335 inner_context = 1336 inherited_context (outer_context, 1337 flag_context_list_iterator_advance ( 1338 &context_iter)); 1339 next_context_iter = passthrough_context_list_iterator; 1340 state = 0; 1341 continue; 1342 1343 case token_type_string_literal: 1344 { 1345 lex_pos_ty pos; 1346 pos.file_name = logical_file_name; 1347 pos.line_number = token.line_number; 1348 1349 if (extract_all) 1350 remember_a_message (mlp, NULL, token.string, inner_context, 1351 &pos, savable_comment); 1352 else 1353 arglist_parser_remember (argparser, arg, token.string, 1354 inner_context, 1355 pos.file_name, pos.line_number, 1356 savable_comment); 1357 } 1358 next_context_iter = null_context_list_iterator; 1359 state = 0; 1360 continue; 1361 1362 case token_type_other: 1363 next_context_iter = null_context_list_iterator; 1364 state = 0; 1365 continue; 1366 1367 case token_type_eof: 1368 arglist_parser_done (argparser, arg); 1369 return true; 1370 1371 default: 1372 abort (); 1373 } 1374 } 1375 } 1376 1377 1378 void 1379 extract_php (FILE *f, 1380 const char *real_filename, const char *logical_filename, 1381 flag_context_list_table_ty *flag_table, 1382 msgdomain_list_ty *mdlp) 1383 { 1384 message_list_ty *mlp = mdlp->item[0]->messages; 1385 1386 fp = f; 1387 real_file_name = real_filename; 1388 logical_file_name = xstrdup (logical_filename); 1389 line_number = 1; 1390 1391 last_comment_line = -1; 1392 last_non_comment_line = -1; 1393 1394 flag_context_list_table = flag_table; 1395 1396 init_keywords (); 1397 1398 /* Initial mode is HTML mode, not PHP mode. */ 1399 skip_html (); 1400 1401 /* Eat tokens until eof is seen. When extract_parenthesized returns 1402 due to an unbalanced closing parenthesis, just restart it. */ 1403 while (!extract_parenthesized (mlp, null_context, null_context_list_iterator, 1404 arglist_parser_alloc (mlp, NULL))) 1405 ; 1406 1407 /* Close scanner. */ 1408 fp = NULL; 1409 real_file_name = NULL; 1410 logical_file_name = NULL; 1411 line_number = 0; 1412 } 1413