1 /* xgettext PHP backend.
2 Copyright (C) 2001-2003, 2005-2006 Free Software Foundation, Inc.
3
4 This file was written by Bruno Haible <bruno@clisp.org>, 2002.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23
24 #include <errno.h>
25 #include <stdbool.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28
29 #include "message.h"
30 #include "xgettext.h"
31 #include "x-php.h"
32 #include "error.h"
33 #include "xalloc.h"
34 #include "exit.h"
35 #include "gettext.h"
36
/* Shorthand for gettext (S).  */
#define _(s) gettext (s)

/* Number of elements of the array A.  A must be a real array, not a
   pointer.  The argument is parenthesized before it is subscripted so
   that an arbitrary array-valued expression can be passed.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
40
41
42 /* The PHP syntax is defined in phpdoc/manual/langref.html.
43 See also php-4.1.0/Zend/zend_language_scanner.l.
44 Note that variable and function names can contain bytes in the range
45 0x7f..0xff; see
46 http://www.php.net/manual/en/language.variables.php
47 http://www.php.net/manual/en/language.functions.php */
48
49
50 /* ====================== Keyword set customization. ====================== */
51
52 /* If true extract all strings. */
53 static bool extract_all = false;
54
55 static hash_table keywords;
56 static bool default_keywords = true;
57
58
59 void
x_php_extract_all()60 x_php_extract_all ()
61 {
62 extract_all = true;
63 }
64
65
66 void
x_php_keyword(const char * name)67 x_php_keyword (const char *name)
68 {
69 if (name == NULL)
70 default_keywords = false;
71 else
72 {
73 const char *end;
74 struct callshape shape;
75 const char *colon;
76
77 if (keywords.table == NULL)
78 hash_init (&keywords, 100);
79
80 split_keywordspec (name, &end, &shape);
81
82 /* The characters between name and end should form a valid C identifier.
83 A colon means an invalid parse in split_keywordspec(). */
84 colon = strchr (name, ':');
85 if (colon == NULL || colon >= end)
86 insert_keyword_callshape (&keywords, name, end - name, &shape);
87 }
88 }
89
90 /* Finish initializing the keywords hash table.
91 Called after argument processing, before each file is processed. */
92 static void
init_keywords()93 init_keywords ()
94 {
95 if (default_keywords)
96 {
97 /* When adding new keywords here, also update the documentation in
98 xgettext.texi! */
99 x_php_keyword ("_");
100 x_php_keyword ("gettext");
101 x_php_keyword ("dgettext:2");
102 x_php_keyword ("dcgettext:2");
103 /* The following were added in PHP 4.2.0. */
104 x_php_keyword ("ngettext:1,2");
105 x_php_keyword ("dngettext:2,3");
106 x_php_keyword ("dcngettext:2,3");
107 default_keywords = false;
108 }
109 }
110
/* Record the format-string flags of the built-in PHP functions, one
   xgettext_record_flag() call per "function:argument:flag" entry.  */
void
init_flag_table_php ()
{
  static const char *const flag_specs[] =
    {
      "_:1:pass-php-format",
      "gettext:1:pass-php-format",
      "dgettext:2:pass-php-format",
      "dcgettext:2:pass-php-format",
      "ngettext:1:pass-php-format",
      "ngettext:2:pass-php-format",
      "dngettext:2:pass-php-format",
      "dngettext:3:pass-php-format",
      "dcngettext:2:pass-php-format",
      "dcngettext:3:pass-php-format",
      "sprintf:1:php-format",
      "printf:1:php-format"
    };
  size_t i;

  for (i = 0; i < sizeof flag_specs / sizeof flag_specs[0]; i++)
    xgettext_record_flag (flag_specs[i]);
}
127
128
129 /* ======================== Reading of characters. ======================== */
130
131
/* Name of the file actually being read; it appears in the diagnostics
   about read errors.  */
static const char *real_file_name;

/* File name and current line number under which the extracted messages
   are recorded.  */
static char *logical_file_name;
static int line_number;

/* Stream the scanner reads from.  */
static FILE *fp;


/* 1. line_number handling.  */

/* Stack of the characters pushed back with phase1_ungetc(), and the
   number of characters currently on it.  */
static unsigned char phase1_pushback[2];
static int phase1_pushback_length;
147
148 static int
phase1_getc()149 phase1_getc ()
150 {
151 int c;
152
153 if (phase1_pushback_length)
154 c = phase1_pushback[--phase1_pushback_length];
155 else
156 {
157 c = getc (fp);
158
159 if (c == EOF)
160 {
161 if (ferror (fp))
162 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
163 real_file_name);
164 return EOF;
165 }
166 }
167
168 if (c == '\n')
169 line_number++;
170
171 return c;
172 }
173
174 /* Supports 2 characters of pushback. */
175 static void
phase1_ungetc(int c)176 phase1_ungetc (int c)
177 {
178 if (c != EOF)
179 {
180 if (c == '\n')
181 --line_number;
182
183 if (phase1_pushback_length == SIZEOF (phase1_pushback))
184 abort ();
185 phase1_pushback[phase1_pushback_length++] = c;
186 }
187 }
188
189
/* 2. Ignore HTML sections.  They are equivalent to PHP echo commands and
   therefore don't contain translatable strings.  */

/* Return true if C is one of the whitespace characters that may separate
   the parts of a <script ...> tag.  */
static inline bool
html_is_space (int c)
{
  return c == ' ' || c == '\t' || c == '\n' || c == '\r';
}

/* Read one character, then keep reading as long as whitespace comes.
   Return the first character that is not whitespace (possibly EOF).  */
static int
html_getc_after_space (void)
{
  int c;

  do
    c = phase1_getc ();
  while (html_is_space (c));

  return c;
}

/* Consume input characters as long as they spell the word given in
   LOWER; at each position the corresponding character of UPPER is
   accepted as well.  Pass the same string twice for an exact match.
   Return true if the whole word was read.  At the first character that
   does not fit, push that character back and return false; the characters
   consumed before it are dropped, as HTML text is ignored anyway.  */
static bool
html_match (const char *lower, const char *upper)
{
  for (; *lower != '\0'; lower++, upper++)
    {
      int c = phase1_getc ();

      if (c != *lower && c != *upper)
        {
          phase1_ungetc (c);
          return false;
        }
    }
  return true;
}

/* Skip HTML text up to and including the next tag that enters PHP mode,
   or up to the end of the input.  */
static void
skip_html (void)
{
  for (;;)
    {
      int c = phase1_getc ();
      int c2;

      if (c == EOF)
        return;
      if (c != '<')
        continue;

      c2 = phase1_getc ();
      if (c2 == EOF)
        return;

      if (c2 == '?' || c2 == '%')
        {
          /* <?php is the normal way to enter PHP mode.  <? and <?= as well
             as <% and <%= are recognized by PHP depending on a
             configuration setting.  Only a '=' is swallowed here; the
             "php" of <?php is left in the input.  */
          int c3 = phase1_getc ();

          if (c3 != '=')
            phase1_ungetc (c3);
          return;
        }

      if (c2 == '<')
        {
          /* This '<' may itself start a tag.  */
          phase1_ungetc (c2);
          continue;
        }

      /* < script language = php >
         < script language = "php" >
         < script language = 'php' >
         are always recognized.  */
      while (html_is_space (c2))
        c2 = phase1_getc ();
      if (c2 != 's' && c2 != 'S')
        {
          phase1_ungetc (c2);
          continue;
        }
      if (!html_match ("cript", "CRIPT"))
        continue;

      /* At least one whitespace character must follow "script".  */
      c2 = phase1_getc ();
      if (!html_is_space (c2))
        {
          phase1_ungetc (c2);
          continue;
        }
      c2 = html_getc_after_space ();
      if (c2 != 'l' && c2 != 'L')
        {
          phase1_ungetc (c2);
          continue;
        }
      if (!html_match ("anguage", "ANGUAGE"))
        continue;

      c2 = html_getc_after_space ();
      if (c2 != '=')
        {
          phase1_ungetc (c2);
          continue;
        }

      /* The value must be php in lower case, bare or with matching
         quotes.  */
      c2 = html_getc_after_space ();
      if (c2 == '"')
        {
          if (!html_match ("php\"", "php\""))
            continue;
        }
      else if (c2 == '\'')
        {
          if (!html_match ("php'", "php'"))
            continue;
        }
      else
        {
          if (c2 != 'p')
            {
              phase1_ungetc (c2);
              continue;
            }
          if (!html_match ("hp", "hp"))
            continue;
        }

      c2 = html_getc_after_space ();
      if (c2 != '>')
        {
          phase1_ungetc (c2);
          continue;
        }
      return;
    }
}
434
#if 0

/* Disabled phase 2: a character-level filter that would detect the end
   of PHP mode.  x_php_lex performs this detection itself.  */

static unsigned char phase2_pushback[1];
static int phase2_pushback_length;

static int
phase2_getc ()
{
  int c;

  if (phase2_pushback_length)
    return phase2_pushback[--phase2_pushback_length];

  c = phase1_getc ();
  if (c == '?' || c == '%')
    {
      int next = phase1_getc ();

      if (next == '>')
        {
          /* ?> and %> terminate PHP mode and switch back to HTML mode.  */
          skip_html ();
          return ' ';
        }
      phase1_ungetc (next);
    }
  else if (c == '<')
    {
      int next = phase1_getc ();

      /* < / script > terminates PHP mode and switches back to HTML mode.  */
      while (next == ' ' || next == '\t' || next == '\n' || next == '\r')
        next = phase1_getc ();
      if (next == '/')
        {
          static const char lower[] = "script";
          static const char upper[] = "SCRIPT";
          size_t matched = 0;

          do
            next = phase1_getc ();
          while (next == ' ' || next == '\t' || next == '\n' || next == '\r');

          while (matched < sizeof lower - 1
                 && (next == lower[matched] || next == upper[matched]))
            {
              matched++;
              next = phase1_getc ();
            }
          if (matched == sizeof lower - 1)
            {
              while (next == ' ' || next == '\t'
                     || next == '\n' || next == '\r')
                next = phase1_getc ();
              if (next == '>')
                {
                  skip_html ();
                  return ' ';
                }
            }
        }
      /* Not a closing script tag: only the last character read is put
         back.  */
      phase1_ungetc (next);
    }

  return c;
}

static void
phase2_ungetc (int c)
{
  if (c == EOF)
    return;

  if (phase2_pushback_length == SIZEOF (phase2_pushback))
    abort ();
  phase2_pushback[phase2_pushback_length++] = c;
}

#endif
530
531
/* Accumulating comments.  */

/* Growable buffer holding the comment line being collected: BUFFER has
   room for BUFMAX bytes, of which the first BUFLEN are in use.  */
static char *buffer;
static size_t bufmax;
static size_t buflen;
537
538 static inline void
comment_start()539 comment_start ()
540 {
541 buflen = 0;
542 }
543
544 static inline void
comment_add(int c)545 comment_add (int c)
546 {
547 if (buflen >= bufmax)
548 {
549 bufmax = 2 * bufmax + 10;
550 buffer = xrealloc (buffer, bufmax);
551 }
552 buffer[buflen++] = c;
553 }
554
555 static inline void
comment_line_end(size_t chars_to_remove)556 comment_line_end (size_t chars_to_remove)
557 {
558 buflen -= chars_to_remove;
559 while (buflen >= 1
560 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
561 --buflen;
562 if (chars_to_remove == 0 && buflen >= bufmax)
563 {
564 bufmax = 2 * bufmax + 10;
565 buffer = xrealloc (buffer, bufmax);
566 }
567 buffer[buflen] = '\0';
568 savable_comment_add (buffer);
569 }
570
571
/* 3. Replace each comment that is not inside a string literal with a
   space character.  We need to remember the comment for later, because
   it may be attached to a keyword string.  */

/* Line of the most recent comment and line of the most recent token that
   is not a comment.  They are compared to decide whether a comment still
   counts as immediately preceding a keyword.  */
static int last_comment_line;
static int last_non_comment_line;

/* Pushback stack for phase3_getc(), and the number of characters on
   it.  */
static unsigned char phase3_pushback[1];
static int phase3_pushback_length;
583
584 static int
phase3_getc()585 phase3_getc ()
586 {
587 int lineno;
588 int c;
589
590 if (phase3_pushback_length)
591 return phase3_pushback[--phase3_pushback_length];
592
593 c = phase1_getc ();
594
595 if (c == '#')
596 {
597 /* sh comment. */
598 bool last_was_qmark = false;
599
600 comment_start ();
601 lineno = line_number;
602 for (;;)
603 {
604 c = phase1_getc ();
605 if (c == '\n' || c == EOF)
606 {
607 comment_line_end (0);
608 break;
609 }
610 if (last_was_qmark && c == '>')
611 {
612 comment_line_end (1);
613 skip_html ();
614 break;
615 }
616 /* We skip all leading white space, but not EOLs. */
617 if (!(buflen == 0 && (c == ' ' || c == '\t')))
618 comment_add (c);
619 last_was_qmark = (c == '?' || c == '%');
620 }
621 last_comment_line = lineno;
622 return '\n';
623 }
624 else if (c == '/')
625 {
626 c = phase1_getc ();
627
628 switch (c)
629 {
630 default:
631 phase1_ungetc (c);
632 return '/';
633
634 case '*':
635 {
636 /* C comment. */
637 bool last_was_star;
638
639 comment_start ();
640 lineno = line_number;
641 last_was_star = false;
642 for (;;)
643 {
644 c = phase1_getc ();
645 if (c == EOF)
646 break;
647 /* We skip all leading white space, but not EOLs. */
648 if (buflen == 0 && (c == ' ' || c == '\t'))
649 continue;
650 comment_add (c);
651 switch (c)
652 {
653 case '\n':
654 comment_line_end (1);
655 comment_start ();
656 lineno = line_number;
657 last_was_star = false;
658 continue;
659
660 case '*':
661 last_was_star = true;
662 continue;
663
664 case '/':
665 if (last_was_star)
666 {
667 comment_line_end (2);
668 break;
669 }
670 /* FALLTHROUGH */
671
672 default:
673 last_was_star = false;
674 continue;
675 }
676 break;
677 }
678 last_comment_line = lineno;
679 return ' ';
680 }
681
682 case '/':
683 {
684 /* C++ comment. */
685 bool last_was_qmark = false;
686
687 comment_start ();
688 lineno = line_number;
689 for (;;)
690 {
691 c = phase1_getc ();
692 if (c == '\n' || c == EOF)
693 {
694 comment_line_end (0);
695 break;
696 }
697 if (last_was_qmark && c == '>')
698 {
699 comment_line_end (1);
700 skip_html ();
701 break;
702 }
703 /* We skip all leading white space, but not EOLs. */
704 if (!(buflen == 0 && (c == ' ' || c == '\t')))
705 comment_add (c);
706 last_was_qmark = (c == '?' || c == '%');
707 }
708 last_comment_line = lineno;
709 return '\n';
710 }
711 }
712 }
713 else
714 return c;
715 }
716
#ifdef unused
/* Push the character C back for phase3_getc().  EOF is ignored; pushing
   back more characters than phase3_pushback can hold aborts.  */
static void
phase3_ungetc (int c)
{
  if (c == EOF)
    return;

  if (phase3_pushback_length == SIZEOF (phase3_pushback))
    abort ();
  phase3_pushback[phase3_pushback_length] = c;
  phase3_pushback_length++;
}
#endif
729
730
731 /* ========================== Reading of tokens. ========================== */
732
733
/* The kinds of tokens that x_php_lex distinguishes.  */
enum token_type_ty
{
  token_type_eof,               /* end of input */
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_comma,             /* , */
  token_type_string_literal,    /* "abc" */
  token_type_symbol,            /* symbol, number */
  token_type_other              /* misc. operator */
};
typedef enum token_type_ty token_type_ty;

/* A token as delivered by x_php_lex.  */
typedef struct token_ty token_ty;
struct token_ty
{
  /* Kind of the token.  */
  token_type_ty type;
  /* Text of the token; used for token_type_string_literal and
     token_type_symbol.  */
  char *string;
  /* Line on which the token was read.  */
  int line_number;
};
753
754
755 /* Free the memory pointed to by a 'struct token_ty'. */
756 static inline void
free_token(token_ty * tp)757 free_token (token_ty *tp)
758 {
759 if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
760 free (tp->string);
761 }
762
763
764 /* 4. Combine characters into tokens. Discard whitespace. */
765
766 static void
x_php_lex(token_ty * tp)767 x_php_lex (token_ty *tp)
768 {
769 static char *buffer;
770 static int bufmax;
771 int bufpos;
772 int c;
773
774 tp->string = NULL;
775
776 for (;;)
777 {
778 tp->line_number = line_number;
779 c = phase3_getc ();
780 switch (c)
781 {
782 case EOF:
783 tp->type = token_type_eof;
784 return;
785
786 case '\n':
787 if (last_non_comment_line > last_comment_line)
788 savable_comment_reset ();
789 /* FALLTHROUGH */
790 case ' ':
791 case '\t':
792 case '\r':
793 /* Ignore whitespace. */
794 continue;
795 }
796
797 last_non_comment_line = tp->line_number;
798
799 switch (c)
800 {
801 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
802 case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
803 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
804 case 'V': case 'W': case 'X': case 'Y': case 'Z':
805 case '_':
806 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
807 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
808 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
809 case 'v': case 'w': case 'x': case 'y': case 'z':
810 case 127: case 128: case 129: case 130: case 131: case 132: case 133:
811 case 134: case 135: case 136: case 137: case 138: case 139: case 140:
812 case 141: case 142: case 143: case 144: case 145: case 146: case 147:
813 case 148: case 149: case 150: case 151: case 152: case 153: case 154:
814 case 155: case 156: case 157: case 158: case 159: case 160: case 161:
815 case 162: case 163: case 164: case 165: case 166: case 167: case 168:
816 case 169: case 170: case 171: case 172: case 173: case 174: case 175:
817 case 176: case 177: case 178: case 179: case 180: case 181: case 182:
818 case 183: case 184: case 185: case 186: case 187: case 188: case 189:
819 case 190: case 191: case 192: case 193: case 194: case 195: case 196:
820 case 197: case 198: case 199: case 200: case 201: case 202: case 203:
821 case 204: case 205: case 206: case 207: case 208: case 209: case 210:
822 case 211: case 212: case 213: case 214: case 215: case 216: case 217:
823 case 218: case 219: case 220: case 221: case 222: case 223: case 224:
824 case 225: case 226: case 227: case 228: case 229: case 230: case 231:
825 case 232: case 233: case 234: case 235: case 236: case 237: case 238:
826 case 239: case 240: case 241: case 242: case 243: case 244: case 245:
827 case 246: case 247: case 248: case 249: case 250: case 251: case 252:
828 case 253: case 254: case 255:
829 bufpos = 0;
830 for (;;)
831 {
832 if (bufpos >= bufmax)
833 {
834 bufmax = 2 * bufmax + 10;
835 buffer = xrealloc (buffer, bufmax);
836 }
837 buffer[bufpos++] = c;
838 c = phase1_getc ();
839 switch (c)
840 {
841 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
842 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
843 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
844 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
845 case 'Y': case 'Z':
846 case '_':
847 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
848 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
849 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
850 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
851 case 'y': case 'z':
852 case '0': case '1': case '2': case '3': case '4':
853 case '5': case '6': case '7': case '8': case '9':
854 case 127: case 128: case 129: case 130: case 131: case 132:
855 case 133: case 134: case 135: case 136: case 137: case 138:
856 case 139: case 140: case 141: case 142: case 143: case 144:
857 case 145: case 146: case 147: case 148: case 149: case 150:
858 case 151: case 152: case 153: case 154: case 155: case 156:
859 case 157: case 158: case 159: case 160: case 161: case 162:
860 case 163: case 164: case 165: case 166: case 167: case 168:
861 case 169: case 170: case 171: case 172: case 173: case 174:
862 case 175: case 176: case 177: case 178: case 179: case 180:
863 case 181: case 182: case 183: case 184: case 185: case 186:
864 case 187: case 188: case 189: case 190: case 191: case 192:
865 case 193: case 194: case 195: case 196: case 197: case 198:
866 case 199: case 200: case 201: case 202: case 203: case 204:
867 case 205: case 206: case 207: case 208: case 209: case 210:
868 case 211: case 212: case 213: case 214: case 215: case 216:
869 case 217: case 218: case 219: case 220: case 221: case 222:
870 case 223: case 224: case 225: case 226: case 227: case 228:
871 case 229: case 230: case 231: case 232: case 233: case 234:
872 case 235: case 236: case 237: case 238: case 239: case 240:
873 case 241: case 242: case 243: case 244: case 245: case 246:
874 case 247: case 248: case 249: case 250: case 251: case 252:
875 case 253: case 254: case 255:
876 continue;
877
878 default:
879 phase1_ungetc (c);
880 break;
881 }
882 break;
883 }
884 if (bufpos >= bufmax)
885 {
886 bufmax = 2 * bufmax + 10;
887 buffer = xrealloc (buffer, bufmax);
888 }
889 buffer[bufpos] = 0;
890 tp->string = xstrdup (buffer);
891 tp->type = token_type_symbol;
892 return;
893
894 case '\'':
895 /* Single-quoted string literal. */
896 bufpos = 0;
897 for (;;)
898 {
899 c = phase1_getc ();
900 if (c == EOF || c == '\'')
901 break;
902 if (c == '\\')
903 {
904 c = phase1_getc ();
905 if (c != '\\' && c != '\'')
906 {
907 phase1_ungetc (c);
908 c = '\\';
909 }
910 }
911 if (bufpos >= bufmax)
912 {
913 bufmax = 2 * bufmax + 10;
914 buffer = xrealloc (buffer, bufmax);
915 }
916 buffer[bufpos++] = c;
917 }
918 if (bufpos >= bufmax)
919 {
920 bufmax = 2 * bufmax + 10;
921 buffer = xrealloc (buffer, bufmax);
922 }
923 buffer[bufpos] = 0;
924 tp->type = token_type_string_literal;
925 tp->string = xstrdup (buffer);
926 return;
927
928 case '"':
929 /* Double-quoted string literal. */
930 tp->type = token_type_string_literal;
931 bufpos = 0;
932 for (;;)
933 {
934 c = phase1_getc ();
935 if (c == EOF || c == '"')
936 break;
937 if (c == '$')
938 {
939 c = phase1_getc ();
940 if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
941 || c == '_' || c == '{' || c >= 0x7f)
942 {
943 /* String with variables. */
944 tp->type = token_type_other;
945 continue;
946 }
947 phase1_ungetc (c);
948 c = '$';
949 }
950 if (c == '{')
951 {
952 c = phase1_getc ();
953 if (c == '$')
954 {
955 /* String with expressions. */
956 tp->type = token_type_other;
957 continue;
958 }
959 phase1_ungetc (c);
960 c = '{';
961 }
962 if (c == '\\')
963 {
964 int n, j;
965
966 c = phase1_getc ();
967 switch (c)
968 {
969 case '"':
970 case '\\':
971 case '$':
972 break;
973
974 case '0': case '1': case '2': case '3':
975 case '4': case '5': case '6': case '7':
976 n = 0;
977 for (j = 0; j < 3; ++j)
978 {
979 n = n * 8 + c - '0';
980 c = phase1_getc ();
981 switch (c)
982 {
983 default:
984 break;
985
986 case '0': case '1': case '2': case '3':
987 case '4': case '5': case '6': case '7':
988 continue;
989 }
990 break;
991 }
992 phase1_ungetc (c);
993 c = n;
994 break;
995
996 case 'x':
997 n = 0;
998 for (j = 0; j < 2; ++j)
999 {
1000 c = phase1_getc ();
1001 switch (c)
1002 {
1003 case '0': case '1': case '2': case '3': case '4':
1004 case '5': case '6': case '7': case '8': case '9':
1005 n = n * 16 + c - '0';
1006 break;
1007 case 'A': case 'B': case 'C': case 'D': case 'E':
1008 case 'F':
1009 n = n * 16 + 10 + c - 'A';
1010 break;
1011 case 'a': case 'b': case 'c': case 'd': case 'e':
1012 case 'f':
1013 n = n * 16 + 10 + c - 'a';
1014 break;
1015 default:
1016 phase1_ungetc (c);
1017 c = 0;
1018 break;
1019 }
1020 if (c == 0)
1021 break;
1022 }
1023 if (j == 0)
1024 {
1025 phase1_ungetc ('x');
1026 c = '\\';
1027 }
1028 else
1029 c = n;
1030 break;
1031
1032 case 'n':
1033 c = '\n';
1034 break;
1035 case 't':
1036 c = '\t';
1037 break;
1038 case 'r':
1039 c = '\r';
1040 break;
1041
1042 default:
1043 phase1_ungetc (c);
1044 c = '\\';
1045 break;
1046 }
1047 }
1048 if (bufpos >= bufmax)
1049 {
1050 bufmax = 2 * bufmax + 10;
1051 buffer = xrealloc (buffer, bufmax);
1052 }
1053 buffer[bufpos++] = c;
1054 }
1055 if (bufpos >= bufmax)
1056 {
1057 bufmax = 2 * bufmax + 10;
1058 buffer = xrealloc (buffer, bufmax);
1059 }
1060 buffer[bufpos] = 0;
1061 if (tp->type == token_type_string_literal)
1062 tp->string = xstrdup (buffer);
1063 return;
1064
1065 case '?':
1066 case '%':
1067 {
1068 int c2 = phase1_getc ();
1069 if (c2 == '>')
1070 {
1071 /* ?> and %> terminate PHP mode and switch back to HTML
1072 mode. */
1073 skip_html ();
1074 }
1075 else
1076 phase1_ungetc (c2);
1077 tp->type = token_type_other;
1078 return;
1079 }
1080
1081 case '(':
1082 tp->type = token_type_lparen;
1083 return;
1084
1085 case ')':
1086 tp->type = token_type_rparen;
1087 return;
1088
1089 case ',':
1090 tp->type = token_type_comma;
1091 return;
1092
1093 case '<':
1094 {
1095 int c2 = phase1_getc ();
1096 if (c2 == '<')
1097 {
1098 int c3 = phase1_getc ();
1099 if (c3 == '<')
1100 {
1101 /* Start of here document.
1102 Parse whitespace, then label, then newline. */
1103 do
1104 c = phase3_getc ();
1105 while (c == ' ' || c == '\t' || c == '\n' || c == '\r');
1106
1107 bufpos = 0;
1108 do
1109 {
1110 if (bufpos >= bufmax)
1111 {
1112 bufmax = 2 * bufmax + 10;
1113 buffer = xrealloc (buffer, bufmax);
1114 }
1115 buffer[bufpos++] = c;
1116 c = phase3_getc ();
1117 }
1118 while (c != EOF && c != '\n' && c != '\r');
1119 /* buffer[0..bufpos-1] now contains the label. */
1120
1121 /* Now skip the here document. */
1122 for (;;)
1123 {
1124 c = phase1_getc ();
1125 if (c == EOF)
1126 break;
1127 if (c == '\n' || c == '\r')
1128 {
1129 int bufidx = 0;
1130
1131 while (bufidx < bufpos)
1132 {
1133 c = phase1_getc ();
1134 if (c == EOF)
1135 break;
1136 if (c != buffer[bufidx])
1137 {
1138 phase1_ungetc (c);
1139 break;
1140 }
1141 bufidx++;
1142 }
1143 if (bufidx == bufpos)
1144 {
1145 c = phase1_getc ();
1146 if (c != ';')
1147 phase1_ungetc (c);
1148 c = phase1_getc ();
1149 if (c == '\n' || c == '\r')
1150 break;
1151 }
1152 }
1153 }
1154
1155 /* FIXME: Ideally we should turn the here document into a
1156 string literal if it didn't contain $ substitution. And
1157 we should also respect backslash escape sequences like
1158 in double-quoted strings. */
1159 tp->type = token_type_other;
1160 return;
1161 }
1162 phase1_ungetc (c3);
1163 }
1164
1165 /* < / script > terminates PHP mode and switches back to HTML
1166 mode. */
1167 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
1168 c2 = phase1_getc ();
1169 if (c2 == '/')
1170 {
1171 do
1172 c2 = phase1_getc ();
1173 while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r');
1174 if (c2 == 's' || c2 == 'S')
1175 {
1176 c2 = phase1_getc ();
1177 if (c2 == 'c' || c2 == 'C')
1178 {
1179 c2 = phase1_getc ();
1180 if (c2 == 'r' || c2 == 'R')
1181 {
1182 c2 = phase1_getc ();
1183 if (c2 == 'i' || c2 == 'I')
1184 {
1185 c2 = phase1_getc ();
1186 if (c2 == 'p' || c2 == 'P')
1187 {
1188 c2 = phase1_getc ();
1189 if (c2 == 't' || c2 == 'T')
1190 {
1191 do
1192 c2 = phase1_getc ();
1193 while (c2 == ' ' || c2 == '\t'
1194 || c2 == '\n' || c2 == '\r');
1195 if (c2 == '>')
1196 {
1197 skip_html ();
1198 }
1199 else
1200 phase1_ungetc (c2);
1201 }
1202 else
1203 phase1_ungetc (c2);
1204 }
1205 else
1206 phase1_ungetc (c2);
1207 }
1208 else
1209 phase1_ungetc (c2);
1210 }
1211 else
1212 phase1_ungetc (c2);
1213 }
1214 else
1215 phase1_ungetc (c2);
1216 }
1217 else
1218 phase1_ungetc (c2);
1219 }
1220 else
1221 phase1_ungetc (c2);
1222
1223 tp->type = token_type_other;
1224 return;
1225 }
1226
1227 case '`':
1228 /* Execution operator. */
1229 default:
1230 /* We could carefully recognize each of the 2 and 3 character
1231 operators, but it is not necessary, as we only need to recognize
1232 gettext invocations. Don't bother. */
1233 tp->type = token_type_other;
1234 return;
1235 }
1236 }
1237 }
1238
1239
1240 /* ========================= Extracting strings. ========================== */
1241
1242
1243 /* Context lookup table. */
1244 static flag_context_list_table_ty *flag_context_list_table;
1245
1246
/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid PHP program, and leave the complaints about
   the grammar to the PHP interpreter.

   Normal handling: Look for
     keyword ( ... msgid ... )
   Plural handling: Look for
     keyword ( ... msgid ... msgid_plural ... )

   We use recursion because the arguments before msgid or between msgid
   and msgid_plural can contain subexpressions of the same form.  */
1260
1261
1262 /* Extract messages until the next balanced closing parenthesis.
1263 Extracted messages are added to MLP.
1264 Return true upon eof, false upon closing parenthesis. */
1265 static bool
extract_parenthesized(message_list_ty * mlp,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,struct arglist_parser * argparser)1266 extract_parenthesized (message_list_ty *mlp,
1267 flag_context_ty outer_context,
1268 flag_context_list_iterator_ty context_iter,
1269 struct arglist_parser *argparser)
1270 {
1271 /* Current argument number. */
1272 int arg = 1;
1273 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1274 int state;
1275 /* Parameters of the keyword just seen. Defined only in state 1. */
1276 const struct callshapes *next_shapes = NULL;
1277 /* Context iterator that will be used if the next token is a '('. */
1278 flag_context_list_iterator_ty next_context_iter =
1279 passthrough_context_list_iterator;
1280 /* Current context. */
1281 flag_context_ty inner_context =
1282 inherited_context (outer_context,
1283 flag_context_list_iterator_advance (&context_iter));
1284
1285 /* Start state is 0. */
1286 state = 0;
1287
1288 for (;;)
1289 {
1290 token_ty token;
1291
1292 x_php_lex (&token);
1293 switch (token.type)
1294 {
1295 case token_type_symbol:
1296 {
1297 void *keyword_value;
1298
1299 if (hash_find_entry (&keywords, token.string, strlen (token.string),
1300 &keyword_value)
1301 == 0)
1302 {
1303 next_shapes = (const struct callshapes *) keyword_value;
1304 state = 1;
1305 }
1306 else
1307 state = 0;
1308 }
1309 next_context_iter =
1310 flag_context_list_iterator (
1311 flag_context_list_table_lookup (
1312 flag_context_list_table,
1313 token.string, strlen (token.string)));
1314 free (token.string);
1315 continue;
1316
1317 case token_type_lparen:
1318 if (extract_parenthesized (mlp, inner_context, next_context_iter,
1319 arglist_parser_alloc (mlp,
1320 state ? next_shapes : NULL)))
1321 {
1322 arglist_parser_done (argparser, arg);
1323 return true;
1324 }
1325 next_context_iter = null_context_list_iterator;
1326 state = 0;
1327 continue;
1328
1329 case token_type_rparen:
1330 arglist_parser_done (argparser, arg);
1331 return false;
1332
1333 case token_type_comma:
1334 arg++;
1335 inner_context =
1336 inherited_context (outer_context,
1337 flag_context_list_iterator_advance (
1338 &context_iter));
1339 next_context_iter = passthrough_context_list_iterator;
1340 state = 0;
1341 continue;
1342
1343 case token_type_string_literal:
1344 {
1345 lex_pos_ty pos;
1346 pos.file_name = logical_file_name;
1347 pos.line_number = token.line_number;
1348
1349 if (extract_all)
1350 remember_a_message (mlp, NULL, token.string, inner_context,
1351 &pos, savable_comment);
1352 else
1353 arglist_parser_remember (argparser, arg, token.string,
1354 inner_context,
1355 pos.file_name, pos.line_number,
1356 savable_comment);
1357 }
1358 next_context_iter = null_context_list_iterator;
1359 state = 0;
1360 continue;
1361
1362 case token_type_other:
1363 next_context_iter = null_context_list_iterator;
1364 state = 0;
1365 continue;
1366
1367 case token_type_eof:
1368 arglist_parser_done (argparser, arg);
1369 return true;
1370
1371 default:
1372 abort ();
1373 }
1374 }
1375 }
1376
1377
1378 void
extract_php(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1379 extract_php (FILE *f,
1380 const char *real_filename, const char *logical_filename,
1381 flag_context_list_table_ty *flag_table,
1382 msgdomain_list_ty *mdlp)
1383 {
1384 message_list_ty *mlp = mdlp->item[0]->messages;
1385
1386 fp = f;
1387 real_file_name = real_filename;
1388 logical_file_name = xstrdup (logical_filename);
1389 line_number = 1;
1390
1391 last_comment_line = -1;
1392 last_non_comment_line = -1;
1393
1394 flag_context_list_table = flag_table;
1395
1396 init_keywords ();
1397
1398 /* Initial mode is HTML mode, not PHP mode. */
1399 skip_html ();
1400
1401 /* Eat tokens until eof is seen. When extract_parenthesized returns
1402 due to an unbalanced closing parenthesis, just restart it. */
1403 while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
1404 arglist_parser_alloc (mlp, NULL)))
1405 ;
1406
1407 /* Close scanner. */
1408 fp = NULL;
1409 real_file_name = NULL;
1410 logical_file_name = NULL;
1411 line_number = 0;
1412 }
1413