xref: /netbsd-src/external/gpl2/gettext/dist/gettext-tools/src/x-php.c (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1 /* xgettext PHP backend.
2    Copyright (C) 2001-2003, 2005-2006 Free Software Foundation, Inc.
3 
4    This file was written by Bruno Haible <bruno@clisp.org>, 2002.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 2, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program; if not, write to the Free Software Foundation,
18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
19 
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23 
24 #include <errno.h>
25 #include <stdbool.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 
29 #include "message.h"
30 #include "xgettext.h"
31 #include "x-php.h"
32 #include "error.h"
33 #include "xalloc.h"
34 #include "exit.h"
35 #include "gettext.h"
36 
37 #define _(s) gettext(s)
38 
39 #define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
40 
41 
42 /* The PHP syntax is defined in phpdoc/manual/langref.html.
43    See also php-4.1.0/Zend/zend_language_scanner.l.
44    Note that variable and function names can contain bytes in the range
45    0x7f..0xff; see
46      http://www.php.net/manual/en/language.variables.php
47      http://www.php.net/manual/en/language.functions.php  */
48 
49 
50 /* ====================== Keyword set customization.  ====================== */
51 
52 /* If true extract all strings.  */
53 static bool extract_all = false;
54 
55 static hash_table keywords;
56 static bool default_keywords = true;
57 
58 
59 void
x_php_extract_all()60 x_php_extract_all ()
61 {
62   extract_all = true;
63 }
64 
65 
66 void
x_php_keyword(const char * name)67 x_php_keyword (const char *name)
68 {
69   if (name == NULL)
70     default_keywords = false;
71   else
72     {
73       const char *end;
74       struct callshape shape;
75       const char *colon;
76 
77       if (keywords.table == NULL)
78 	hash_init (&keywords, 100);
79 
80       split_keywordspec (name, &end, &shape);
81 
82       /* The characters between name and end should form a valid C identifier.
83 	 A colon means an invalid parse in split_keywordspec().  */
84       colon = strchr (name, ':');
85       if (colon == NULL || colon >= end)
86 	insert_keyword_callshape (&keywords, name, end - name, &shape);
87     }
88 }
89 
90 /* Finish initializing the keywords hash table.
91    Called after argument processing, before each file is processed.  */
92 static void
init_keywords()93 init_keywords ()
94 {
95   if (default_keywords)
96     {
97       /* When adding new keywords here, also update the documentation in
98 	 xgettext.texi!  */
99       x_php_keyword ("_");
100       x_php_keyword ("gettext");
101       x_php_keyword ("dgettext:2");
102       x_php_keyword ("dcgettext:2");
103       /* The following were added in PHP 4.2.0.  */
104       x_php_keyword ("ngettext:1,2");
105       x_php_keyword ("dngettext:2,3");
106       x_php_keyword ("dcngettext:2,3");
107       default_keywords = false;
108     }
109 }
110 
/* Record the format-string flags for the PHP backend.  Each entry of the
   table is passed unchanged, and in order, to xgettext_record_flag.  */
void
init_flag_table_php ()
{
  static const char *const flag_specs[] =
    {
      "_:1:pass-php-format",
      "gettext:1:pass-php-format",
      "dgettext:2:pass-php-format",
      "dcgettext:2:pass-php-format",
      "ngettext:1:pass-php-format",
      "ngettext:2:pass-php-format",
      "dngettext:2:pass-php-format",
      "dngettext:3:pass-php-format",
      "dcngettext:2:pass-php-format",
      "dcngettext:3:pass-php-format",
      "sprintf:1:php-format",
      "printf:1:php-format"
    };
  size_t i;

  for (i = 0; i < SIZEOF (flag_specs); i++)
    xgettext_record_flag (flag_specs[i]);
}
127 
128 
129 /* ======================== Reading of characters.  ======================== */
130 
131 
/* Real filename, used in error messages about the input file.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.  */
static char *logical_file_name;
static int line_number;

/* The input file stream.  */
static FILE *fp;


/* 1. line_number handling.  */

/* Characters pushed back by phase1_ungetc; the most recently pushed one
   is returned first.  */
static unsigned char phase1_pushback[2];
static int phase1_pushback_length;

/* Return the next input character, taking pushed-back characters before
   reading from fp.  Every newline returned increments line_number.
   Returns EOF at end of input; a read error is reported through error ()
   with status EXIT_FAILURE.  */
static int
phase1_getc ()
{
  int ch;

  if (phase1_pushback_length)
    {
      phase1_pushback_length--;
      ch = phase1_pushback[phase1_pushback_length];
    }
  else
    {
      ch = getc (fp);
      if (ch == EOF)
        {
          if (ferror (fp))
            error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
                   real_file_name);
          return EOF;
        }
    }

  if (ch == '\n')
    line_number++;
  return ch;
}
173 
174 /* Supports 2 characters of pushback.  */
175 static void
phase1_ungetc(int c)176 phase1_ungetc (int c)
177 {
178   if (c != EOF)
179     {
180       if (c == '\n')
181 	--line_number;
182 
183       if (phase1_pushback_length == SIZEOF (phase1_pushback))
184 	abort ();
185       phase1_pushback[phase1_pushback_length++] = c;
186     }
187 }
188 
189 
/* 2. Ignore HTML sections.  They are equivalent to PHP echo commands and
   therefore don't contain translatable strings.  */

/* Tell whether C is a blank that may appear inside an HTML tag.  */
static bool
skip_html_is_blank (int c)
{
  return c == ' ' || c == '\t' || c == '\n' || c == '\r';
}

/* Starting with the already-read character C, skip blanks and return the
   first character that is not a blank (possibly EOF).  */
static int
skip_html_blanks (int c)
{
  while (skip_html_is_blank (c))
    c = phase1_getc ();
  return c;
}

/* Match a word whose first candidate character C has already been read.
   The character at position i must equal LOWER[i] or UPPER[i]; pass the
   same string twice for a case-sensitive match.  Both strings must be
   non-empty and of equal length.
   Returns true once the whole word has been consumed.  On the first
   mismatch the offending character is pushed back and false is returned;
   the characters matched before it stay consumed.  */
static bool
skip_html_word (int c, const char *lower, const char *upper)
{
  size_t i = 0;

  for (;;)
    {
      if (c != lower[i] && c != upper[i])
        {
          phase1_ungetc (c);
          return false;
        }
      if (lower[++i] == '\0')
        return true;
      c = phase1_getc ();
    }
}

/* Consume input up to and including the next tag that enters PHP mode,
   or up to EOF.  The recognized tags are
     <?  <?=  <%  <%=
     < script language = php >
     < script language = "php" >
     < script language = 'php' >
   where "script" and "language" are matched case-insensitively and "php"
   case-sensitively.  */
static void
skip_html (void)
{
  for (;;)
    {
      bool matched;
      int c = phase1_getc ();

      if (c == EOF)
        return;
      if (c != '<')
        continue;

      c = phase1_getc ();
      if (c == EOF)
        return;

      if (c == '?' || c == '%')
        {
          /* <?php is the normal way to enter PHP mode.  <? and <?= as
             well as <% and <%= are recognized by PHP depending on a
             configuration setting.  Only a following '=' is swallowed.  */
          int c3 = phase1_getc ();

          if (c3 != '=')
            phase1_ungetc (c3);
          return;
        }

      /* The script tag is always recognized.  Any character that breaks
         the pattern, including a second '<', is pushed back so that it
         can start a new tag on the next iteration.  */
      c = skip_html_blanks (c);
      if (!skip_html_word (c, "script", "SCRIPT"))
        continue;

      /* At least one blank must separate "script" from "language".  */
      c = phase1_getc ();
      if (!skip_html_is_blank (c))
        {
          phase1_ungetc (c);
          continue;
        }
      c = skip_html_blanks (c);
      if (!skip_html_word (c, "language", "LANGUAGE"))
        continue;

      c = skip_html_blanks (phase1_getc ());
      if (c != '=')
        {
          phase1_ungetc (c);
          continue;
        }

      /* The value is php, optionally enclosed in matching quotes.  */
      c = skip_html_blanks (phase1_getc ());
      if (c == '"')
        matched = skip_html_word (phase1_getc (), "php\"", "php\"");
      else if (c == '\'')
        matched = skip_html_word (phase1_getc (), "php'", "php'");
      else
        matched = skip_html_word (c, "php", "php");
      if (!matched)
        continue;

      c = skip_html_blanks (phase1_getc ());
      if (c == '>')
        return;
      phase1_ungetc (c);
    }
}
434 
/* There is deliberately no separate phase 2 reader for the ?>, %> and
   </script> terminators that leave PHP mode.  They are handled by the
   later stages instead: x_php_lex calls skip_html when it meets one of
   them as a token, and phase3_getc does so for ?> and %> inside one-line
   comments.  */
530 
531 
/* Accumulating comments.  */

/* Text of the comment line being collected: BUFFER holds BUFLEN
   characters inside an allocation of BUFMAX bytes.  */
static char *buffer;
static size_t bufmax;
static size_t buflen;

/* Begin a new comment line by discarding the characters collected so
   far.  The allocation is kept for reuse.  */
static inline void
comment_start (void)
{
  buflen = 0;
}
543 
544 static inline void
comment_add(int c)545 comment_add (int c)
546 {
547   if (buflen >= bufmax)
548     {
549       bufmax = 2 * bufmax + 10;
550       buffer = xrealloc (buffer, bufmax);
551     }
552   buffer[buflen++] = c;
553 }
554 
555 static inline void
comment_line_end(size_t chars_to_remove)556 comment_line_end (size_t chars_to_remove)
557 {
558   buflen -= chars_to_remove;
559   while (buflen >= 1
560 	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
561     --buflen;
562   if (chars_to_remove == 0 && buflen >= bufmax)
563     {
564       bufmax = 2 * bufmax + 10;
565       buffer = xrealloc (buffer, bufmax);
566     }
567   buffer[buflen] = '\0';
568   savable_comment_add (buffer);
569 }
570 
571 
572 /* 3. Replace each comment that is not inside a string literal with a
573    space character.  We need to remember the comment for later, because
574    it may be attached to a keyword string.  */
575 
576 /* These are for tracking whether comments count as immediately before
577    keyword.  */
578 static int last_comment_line;
579 static int last_non_comment_line;
580 
581 static unsigned char phase3_pushback[1];
582 static int phase3_pushback_length;
583 
584 static int
phase3_getc()585 phase3_getc ()
586 {
587   int lineno;
588   int c;
589 
590   if (phase3_pushback_length)
591     return phase3_pushback[--phase3_pushback_length];
592 
593   c = phase1_getc ();
594 
595   if (c == '#')
596     {
597       /* sh comment.  */
598       bool last_was_qmark = false;
599 
600       comment_start ();
601       lineno = line_number;
602       for (;;)
603 	{
604 	  c = phase1_getc ();
605 	  if (c == '\n' || c == EOF)
606 	    {
607 	      comment_line_end (0);
608 	      break;
609 	    }
610 	  if (last_was_qmark && c == '>')
611 	    {
612 	      comment_line_end (1);
613 	      skip_html ();
614 	      break;
615 	    }
616 	  /* We skip all leading white space, but not EOLs.  */
617 	  if (!(buflen == 0 && (c == ' ' || c == '\t')))
618 	    comment_add (c);
619 	  last_was_qmark = (c == '?' || c == '%');
620 	}
621       last_comment_line = lineno;
622       return '\n';
623     }
624   else if (c == '/')
625     {
626       c = phase1_getc ();
627 
628       switch (c)
629 	{
630 	default:
631 	  phase1_ungetc (c);
632 	  return '/';
633 
634 	case '*':
635 	  {
636 	    /* C comment.  */
637 	    bool last_was_star;
638 
639 	    comment_start ();
640 	    lineno = line_number;
641 	    last_was_star = false;
642 	    for (;;)
643 	      {
644 		c = phase1_getc ();
645 		if (c == EOF)
646 		  break;
647 		/* We skip all leading white space, but not EOLs.  */
648 		if (buflen == 0 && (c == ' ' || c == '\t'))
649 		  continue;
650 		comment_add (c);
651 		switch (c)
652 		  {
653 		  case '\n':
654 		    comment_line_end (1);
655 		    comment_start ();
656 		    lineno = line_number;
657 		    last_was_star = false;
658 		    continue;
659 
660 		  case '*':
661 		    last_was_star = true;
662 		    continue;
663 
664 		  case '/':
665 		    if (last_was_star)
666 		      {
667 			comment_line_end (2);
668 			break;
669 		      }
670 		    /* FALLTHROUGH */
671 
672 		  default:
673 		    last_was_star = false;
674 		    continue;
675 		  }
676 		break;
677 	      }
678 	    last_comment_line = lineno;
679 	    return ' ';
680 	  }
681 
682 	case '/':
683 	  {
684 	    /* C++ comment.  */
685 	    bool last_was_qmark = false;
686 
687 	    comment_start ();
688 	    lineno = line_number;
689 	    for (;;)
690 	      {
691 		c = phase1_getc ();
692 		if (c == '\n' || c == EOF)
693 		  {
694 		    comment_line_end (0);
695 		    break;
696 		  }
697 		if (last_was_qmark && c == '>')
698 		  {
699 		    comment_line_end (1);
700 		    skip_html ();
701 		    break;
702 		  }
703 		/* We skip all leading white space, but not EOLs.  */
704 		if (!(buflen == 0 && (c == ' ' || c == '\t')))
705 		  comment_add (c);
706 		last_was_qmark = (c == '?' || c == '%');
707 	      }
708 	    last_comment_line = lineno;
709 	    return '\n';
710 	  }
711 	}
712     }
713   else
714     return c;
715 }
716 
#ifdef unused
/* Push C back so that the next phase3_getc call returns it.  EOF is
   ignored; pushing back more characters than phase3_pushback can hold
   aborts.  */
static void
phase3_ungetc (int c)
{
  if (c == EOF)
    return;

  if (phase3_pushback_length == SIZEOF (phase3_pushback))
    abort ();
  phase3_pushback[phase3_pushback_length] = c;
  phase3_pushback_length++;
}
#endif
729 
730 
731 /* ========================== Reading of tokens.  ========================== */
732 
733 
/* The kinds of token distinguished by the lexer.  */
typedef enum token_type_ty
{
  token_type_eof,
  token_type_lparen,            /* ( */
  token_type_rparen,            /* ) */
  token_type_comma,             /* , */
  token_type_string_literal,    /* "abc" */
  token_type_symbol,            /* symbol, number */
  token_type_other              /* misc. operator */
} token_type_ty;

/* A token together with the line on which it was read.  */
typedef struct token_ty
{
  token_type_ty type;
  char *string;         /* for token_type_string_literal, token_type_symbol */
  int line_number;
} token_ty;
753 
754 
755 /* Free the memory pointed to by a 'struct token_ty'.  */
756 static inline void
free_token(token_ty * tp)757 free_token (token_ty *tp)
758 {
759   if (tp->type == token_type_string_literal || tp->type == token_type_symbol)
760     free (tp->string);
761 }
762 
763 
764 /* 4. Combine characters into tokens.  Discard whitespace.  */
765 
766 static void
x_php_lex(token_ty * tp)767 x_php_lex (token_ty *tp)
768 {
769   static char *buffer;
770   static int bufmax;
771   int bufpos;
772   int c;
773 
774   tp->string = NULL;
775 
776   for (;;)
777     {
778       tp->line_number = line_number;
779       c = phase3_getc ();
780       switch (c)
781 	{
782 	case EOF:
783 	  tp->type = token_type_eof;
784 	  return;
785 
786 	case '\n':
787 	  if (last_non_comment_line > last_comment_line)
788 	    savable_comment_reset ();
789 	  /* FALLTHROUGH */
790 	case ' ':
791 	case '\t':
792 	case '\r':
793 	  /* Ignore whitespace.  */
794 	  continue;
795 	}
796 
797       last_non_comment_line = tp->line_number;
798 
799       switch (c)
800 	{
801 	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
802 	case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
803 	case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
804 	case 'V': case 'W': case 'X': case 'Y': case 'Z':
805 	case '_':
806 	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
807 	case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
808 	case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
809 	case 'v': case 'w': case 'x': case 'y': case 'z':
810 	case 127: case 128: case 129: case 130: case 131: case 132: case 133:
811 	case 134: case 135: case 136: case 137: case 138: case 139: case 140:
812 	case 141: case 142: case 143: case 144: case 145: case 146: case 147:
813 	case 148: case 149: case 150: case 151: case 152: case 153: case 154:
814 	case 155: case 156: case 157: case 158: case 159: case 160: case 161:
815 	case 162: case 163: case 164: case 165: case 166: case 167: case 168:
816 	case 169: case 170: case 171: case 172: case 173: case 174: case 175:
817 	case 176: case 177: case 178: case 179: case 180: case 181: case 182:
818 	case 183: case 184: case 185: case 186: case 187: case 188: case 189:
819 	case 190: case 191: case 192: case 193: case 194: case 195: case 196:
820 	case 197: case 198: case 199: case 200: case 201: case 202: case 203:
821 	case 204: case 205: case 206: case 207: case 208: case 209: case 210:
822 	case 211: case 212: case 213: case 214: case 215: case 216: case 217:
823 	case 218: case 219: case 220: case 221: case 222: case 223: case 224:
824 	case 225: case 226: case 227: case 228: case 229: case 230: case 231:
825 	case 232: case 233: case 234: case 235: case 236: case 237: case 238:
826 	case 239: case 240: case 241: case 242: case 243: case 244: case 245:
827 	case 246: case 247: case 248: case 249: case 250: case 251: case 252:
828 	case 253: case 254: case 255:
829 	  bufpos = 0;
830 	  for (;;)
831 	    {
832 	      if (bufpos >= bufmax)
833 		{
834 		  bufmax = 2 * bufmax + 10;
835 		  buffer = xrealloc (buffer, bufmax);
836 		}
837 	      buffer[bufpos++] = c;
838 	      c = phase1_getc ();
839 	      switch (c)
840 		{
841 		case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
842 		case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
843 		case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
844 		case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
845 		case 'Y': case 'Z':
846 		case '_':
847 		case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
848 		case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
849 		case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
850 		case 's': case 't': case 'u': case 'v': case 'w': case 'x':
851 		case 'y': case 'z':
852 		case '0': case '1': case '2': case '3': case '4':
853 		case '5': case '6': case '7': case '8': case '9':
854 		case 127: case 128: case 129: case 130: case 131: case 132:
855 		case 133: case 134: case 135: case 136: case 137: case 138:
856 		case 139: case 140: case 141: case 142: case 143: case 144:
857 		case 145: case 146: case 147: case 148: case 149: case 150:
858 		case 151: case 152: case 153: case 154: case 155: case 156:
859 		case 157: case 158: case 159: case 160: case 161: case 162:
860 		case 163: case 164: case 165: case 166: case 167: case 168:
861 		case 169: case 170: case 171: case 172: case 173: case 174:
862 		case 175: case 176: case 177: case 178: case 179: case 180:
863 		case 181: case 182: case 183: case 184: case 185: case 186:
864 		case 187: case 188: case 189: case 190: case 191: case 192:
865 		case 193: case 194: case 195: case 196: case 197: case 198:
866 		case 199: case 200: case 201: case 202: case 203: case 204:
867 		case 205: case 206: case 207: case 208: case 209: case 210:
868 		case 211: case 212: case 213: case 214: case 215: case 216:
869 		case 217: case 218: case 219: case 220: case 221: case 222:
870 		case 223: case 224: case 225: case 226: case 227: case 228:
871 		case 229: case 230: case 231: case 232: case 233: case 234:
872 		case 235: case 236: case 237: case 238: case 239: case 240:
873 		case 241: case 242: case 243: case 244: case 245: case 246:
874 		case 247: case 248: case 249: case 250: case 251: case 252:
875 		case 253: case 254: case 255:
876 		  continue;
877 
878 		default:
879 		  phase1_ungetc (c);
880 		  break;
881 		}
882 	      break;
883 	    }
884 	  if (bufpos >= bufmax)
885 	    {
886 	      bufmax = 2 * bufmax + 10;
887 	      buffer = xrealloc (buffer, bufmax);
888 	    }
889 	  buffer[bufpos] = 0;
890 	  tp->string = xstrdup (buffer);
891 	  tp->type = token_type_symbol;
892 	  return;
893 
894 	case '\'':
895 	  /* Single-quoted string literal.  */
896 	  bufpos = 0;
897 	  for (;;)
898 	    {
899 	      c = phase1_getc ();
900 	      if (c == EOF || c == '\'')
901 		break;
902 	      if (c == '\\')
903 		{
904 		  c = phase1_getc ();
905 		  if (c != '\\' && c != '\'')
906 		    {
907 		      phase1_ungetc (c);
908 		      c = '\\';
909 		    }
910 		}
911 	      if (bufpos >= bufmax)
912 		{
913 		  bufmax = 2 * bufmax + 10;
914 		  buffer = xrealloc (buffer, bufmax);
915 		}
916 	      buffer[bufpos++] = c;
917 	    }
918 	  if (bufpos >= bufmax)
919 	    {
920 	      bufmax = 2 * bufmax + 10;
921 	      buffer = xrealloc (buffer, bufmax);
922 	    }
923 	  buffer[bufpos] = 0;
924 	  tp->type = token_type_string_literal;
925 	  tp->string = xstrdup (buffer);
926 	  return;
927 
928 	case '"':
929 	  /* Double-quoted string literal.  */
930 	  tp->type = token_type_string_literal;
931 	  bufpos = 0;
932 	  for (;;)
933 	    {
934 	      c = phase1_getc ();
935 	      if (c == EOF || c == '"')
936 		break;
937 	      if (c == '$')
938 		{
939 		  c = phase1_getc ();
940 		  if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
941 		      || c == '_' || c == '{' || c >= 0x7f)
942 		    {
943 		      /* String with variables.  */
944 		      tp->type = token_type_other;
945 		      continue;
946 		    }
947 		  phase1_ungetc (c);
948 		  c = '$';
949 		}
950 	      if (c == '{')
951 		{
952 		  c = phase1_getc ();
953 		  if (c == '$')
954 		    {
955 		      /* String with expressions.  */
956 		      tp->type = token_type_other;
957 		      continue;
958 		    }
959 		  phase1_ungetc (c);
960 		  c = '{';
961 		}
962 	      if (c == '\\')
963 		{
964 		  int n, j;
965 
966 		  c = phase1_getc ();
967 		  switch (c)
968 		    {
969 		    case '"':
970 		    case '\\':
971 		    case '$':
972 		      break;
973 
974 		    case '0': case '1': case '2': case '3':
975 		    case '4': case '5': case '6': case '7':
976 		      n = 0;
977 		      for (j = 0; j < 3; ++j)
978 			{
979 			  n = n * 8 + c - '0';
980 			  c = phase1_getc ();
981 			  switch (c)
982 			    {
983 			    default:
984 			      break;
985 
986 			    case '0': case '1': case '2': case '3':
987 			    case '4': case '5': case '6': case '7':
988 			      continue;
989 			    }
990 			  break;
991 			}
992 		      phase1_ungetc (c);
993 		      c = n;
994 		      break;
995 
996 		    case 'x':
997 		      n = 0;
998 		      for (j = 0; j < 2; ++j)
999 			{
1000 			  c = phase1_getc ();
1001 			  switch (c)
1002 			    {
1003 			    case '0': case '1': case '2': case '3': case '4':
1004 			    case '5': case '6': case '7': case '8': case '9':
1005 			      n = n * 16 + c - '0';
1006 			      break;
1007 			    case 'A': case 'B': case 'C': case 'D': case 'E':
1008 			    case 'F':
1009 			      n = n * 16 + 10 + c - 'A';
1010 			      break;
1011 			    case 'a': case 'b': case 'c': case 'd': case 'e':
1012 			    case 'f':
1013 			      n = n * 16 + 10 + c - 'a';
1014 			      break;
1015 			    default:
1016 			      phase1_ungetc (c);
1017 			      c = 0;
1018 			      break;
1019 			    }
1020 			  if (c == 0)
1021 			    break;
1022 			}
1023 		      if (j == 0)
1024 			{
1025 			  phase1_ungetc ('x');
1026 			  c = '\\';
1027 			}
1028 		      else
1029 			c = n;
1030 		      break;
1031 
1032 		    case 'n':
1033 		      c = '\n';
1034 		      break;
1035 		    case 't':
1036 		      c = '\t';
1037 		      break;
1038 		    case 'r':
1039 		      c = '\r';
1040 		      break;
1041 
1042 		    default:
1043 		      phase1_ungetc (c);
1044 		      c = '\\';
1045 		      break;
1046 		    }
1047 		}
1048 	      if (bufpos >= bufmax)
1049 		{
1050 		  bufmax = 2 * bufmax + 10;
1051 		  buffer = xrealloc (buffer, bufmax);
1052 		}
1053 	      buffer[bufpos++] = c;
1054 	    }
1055 	  if (bufpos >= bufmax)
1056 	    {
1057 	      bufmax = 2 * bufmax + 10;
1058 	      buffer = xrealloc (buffer, bufmax);
1059 	    }
1060 	  buffer[bufpos] = 0;
1061 	  if (tp->type == token_type_string_literal)
1062 	    tp->string = xstrdup (buffer);
1063 	  return;
1064 
1065 	case '?':
1066 	case '%':
1067 	  {
1068 	    int c2 = phase1_getc ();
1069 	    if (c2 == '>')
1070 	      {
1071 		/* ?> and %> terminate PHP mode and switch back to HTML
1072 		   mode.  */
1073 		skip_html ();
1074 	      }
1075 	    else
1076 	      phase1_ungetc (c2);
1077 	    tp->type = token_type_other;
1078 	    return;
1079 	  }
1080 
1081 	case '(':
1082 	  tp->type = token_type_lparen;
1083 	  return;
1084 
1085 	case ')':
1086 	  tp->type = token_type_rparen;
1087 	  return;
1088 
1089 	case ',':
1090 	  tp->type = token_type_comma;
1091 	  return;
1092 
1093 	case '<':
1094 	  {
1095 	    int c2 = phase1_getc ();
1096 	    if (c2 == '<')
1097 	      {
1098 		int c3 = phase1_getc ();
1099 		if (c3 == '<')
1100 		  {
1101 		    /* Start of here document.
1102 		       Parse whitespace, then label, then newline.  */
1103 		    do
1104 		      c = phase3_getc ();
1105 		    while (c == ' ' || c == '\t' || c == '\n' || c == '\r');
1106 
1107 		    bufpos = 0;
1108 		    do
1109 		      {
1110 			if (bufpos >= bufmax)
1111 			  {
1112 			    bufmax = 2 * bufmax + 10;
1113 			    buffer = xrealloc (buffer, bufmax);
1114 			  }
1115 			buffer[bufpos++] = c;
1116 			c = phase3_getc ();
1117 		      }
1118 		    while (c != EOF && c != '\n' && c != '\r');
1119 		    /* buffer[0..bufpos-1] now contains the label.  */
1120 
1121 		    /* Now skip the here document.  */
1122 		    for (;;)
1123 		      {
1124 			c = phase1_getc ();
1125 			if (c == EOF)
1126 			  break;
1127 			if (c == '\n' || c == '\r')
1128 			  {
1129 			    int bufidx = 0;
1130 
1131 			    while (bufidx < bufpos)
1132 			      {
1133 				c = phase1_getc ();
1134 				if (c == EOF)
1135 				  break;
1136 				if (c != buffer[bufidx])
1137 				  {
1138 				    phase1_ungetc (c);
1139 				    break;
1140 				  }
1141 				bufidx++;
1142 			      }
1143 			    if (bufidx == bufpos)
1144 			      {
1145 				c = phase1_getc ();
1146 				if (c != ';')
1147 				  phase1_ungetc (c);
1148 				c = phase1_getc ();
1149 				if (c == '\n' || c == '\r')
1150 				  break;
1151 			      }
1152 			  }
1153 		      }
1154 
1155 		    /* FIXME: Ideally we should turn the here document into a
1156 		       string literal if it didn't contain $ substitution.  And
1157 		       we should also respect backslash escape sequences like
1158 		       in double-quoted strings.  */
1159 		    tp->type = token_type_other;
1160 		    return;
1161 		  }
1162 		phase1_ungetc (c3);
1163 	      }
1164 
1165 	    /* < / script > terminates PHP mode and switches back to HTML
1166 	       mode.  */
1167 	    while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r')
1168 	      c2 = phase1_getc ();
1169 	    if (c2 == '/')
1170 	      {
1171 		do
1172 		  c2 = phase1_getc ();
1173 		while (c2 == ' ' || c2 == '\t' || c2 == '\n' || c2 == '\r');
1174 		if (c2 == 's' || c2 == 'S')
1175 		  {
1176 		    c2 = phase1_getc ();
1177 		    if (c2 == 'c' || c2 == 'C')
1178 		      {
1179 			c2 = phase1_getc ();
1180 			if (c2 == 'r' || c2 == 'R')
1181 			  {
1182 			    c2 = phase1_getc ();
1183 			    if (c2 == 'i' || c2 == 'I')
1184 			      {
1185 				c2 = phase1_getc ();
1186 				if (c2 == 'p' || c2 == 'P')
1187 				  {
1188 				    c2 = phase1_getc ();
1189 				    if (c2 == 't' || c2 == 'T')
1190 				      {
1191 					do
1192 					  c2 = phase1_getc ();
1193 					while (c2 == ' ' || c2 == '\t'
1194 					       || c2 == '\n' || c2 == '\r');
1195 					if (c2 == '>')
1196 					  {
1197 					    skip_html ();
1198 					  }
1199 					else
1200 					  phase1_ungetc (c2);
1201 				      }
1202 				    else
1203 				      phase1_ungetc (c2);
1204 				  }
1205 				else
1206 				  phase1_ungetc (c2);
1207 			      }
1208 			    else
1209 			      phase1_ungetc (c2);
1210 			  }
1211 			else
1212 			  phase1_ungetc (c2);
1213 		      }
1214 		    else
1215 		      phase1_ungetc (c2);
1216 		  }
1217 		else
1218 		  phase1_ungetc (c2);
1219 	      }
1220 	    else
1221 	      phase1_ungetc (c2);
1222 
1223 	    tp->type = token_type_other;
1224 	    return;
1225 	  }
1226 
1227 	case '`':
1228 	  /* Execution operator.  */
1229 	default:
1230 	  /* We could carefully recognize each of the 2 and 3 character
1231 	     operators, but it is not necessary, as we only need to recognize
1232 	     gettext invocations.  Don't bother.  */
1233 	  tp->type = token_type_other;
1234 	  return;
1235 	}
1236     }
1237 }
1238 
1239 
1240 /* ========================= Extracting strings.  ========================== */
1241 
1242 
1243 /* Flag context lookup table used while scanning the current file.
   extract_php stores its FLAG_TABLE argument here, and
   extract_parenthesized looks up every symbol token in it to obtain the
   context iterator that applies if the next token is a '('.  */
1244 static flag_context_list_table_ty *flag_context_list_table;
1245 
1246 
/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid PHP program, and leave the complaints about
   the grammar to the PHP interpreter.

     Normal handling: Look for
       keyword ( ... msgid ... )
     Plural handling: Look for
       keyword ( ... msgid ... msgid_plural ... )

   We use recursion because the arguments before msgid or between msgid
   and msgid_plural can contain subexpressions of the same form.  */
1260 
1261 
1262 /* Extract messages until the next balanced closing parenthesis.
1263    Extracted messages are added to MLP.
1264    Return true upon eof, false upon closing parenthesis.  */
1265 static bool
extract_parenthesized(message_list_ty * mlp,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,struct arglist_parser * argparser)1266 extract_parenthesized (message_list_ty *mlp,
1267 		       flag_context_ty outer_context,
1268 		       flag_context_list_iterator_ty context_iter,
1269 		       struct arglist_parser *argparser)
1270 {
1271   /* Current argument number.  */
1272   int arg = 1;
1273   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1274   int state;
1275   /* Parameters of the keyword just seen.  Defined only in state 1.  */
1276   const struct callshapes *next_shapes = NULL;
1277   /* Context iterator that will be used if the next token is a '('.  */
1278   flag_context_list_iterator_ty next_context_iter =
1279     passthrough_context_list_iterator;
1280   /* Current context.  */
1281   flag_context_ty inner_context =
1282     inherited_context (outer_context,
1283 		       flag_context_list_iterator_advance (&context_iter));
1284 
1285   /* Start state is 0.  */
1286   state = 0;
1287 
1288   for (;;)
1289     {
1290       token_ty token;
1291 
1292       x_php_lex (&token);
1293       switch (token.type)
1294 	{
1295 	case token_type_symbol:
1296 	  {
1297 	    void *keyword_value;
1298 
1299 	    if (hash_find_entry (&keywords, token.string, strlen (token.string),
1300 				 &keyword_value)
1301 		== 0)
1302 	      {
1303 		next_shapes = (const struct callshapes *) keyword_value;
1304 		state = 1;
1305 	      }
1306 	    else
1307 	      state = 0;
1308 	  }
1309 	  next_context_iter =
1310 	    flag_context_list_iterator (
1311 	      flag_context_list_table_lookup (
1312 		flag_context_list_table,
1313 		token.string, strlen (token.string)));
1314 	  free (token.string);
1315 	  continue;
1316 
1317 	case token_type_lparen:
1318 	  if (extract_parenthesized (mlp, inner_context, next_context_iter,
1319 				     arglist_parser_alloc (mlp,
1320 							   state ? next_shapes : NULL)))
1321 	    {
1322 	      arglist_parser_done (argparser, arg);
1323 	      return true;
1324 	    }
1325 	  next_context_iter = null_context_list_iterator;
1326 	  state = 0;
1327 	  continue;
1328 
1329 	case token_type_rparen:
1330 	  arglist_parser_done (argparser, arg);
1331 	  return false;
1332 
1333 	case token_type_comma:
1334 	  arg++;
1335 	  inner_context =
1336 	    inherited_context (outer_context,
1337 			       flag_context_list_iterator_advance (
1338 				 &context_iter));
1339 	  next_context_iter = passthrough_context_list_iterator;
1340 	  state = 0;
1341 	  continue;
1342 
1343 	case token_type_string_literal:
1344 	  {
1345 	    lex_pos_ty pos;
1346 	    pos.file_name = logical_file_name;
1347 	    pos.line_number = token.line_number;
1348 
1349 	    if (extract_all)
1350 	      remember_a_message (mlp, NULL, token.string, inner_context,
1351 				  &pos, savable_comment);
1352 	    else
1353 	      arglist_parser_remember (argparser, arg, token.string,
1354 				       inner_context,
1355 				       pos.file_name, pos.line_number,
1356 				       savable_comment);
1357 	  }
1358 	  next_context_iter = null_context_list_iterator;
1359 	  state = 0;
1360 	  continue;
1361 
1362 	case token_type_other:
1363 	  next_context_iter = null_context_list_iterator;
1364 	  state = 0;
1365 	  continue;
1366 
1367 	case token_type_eof:
1368 	  arglist_parser_done (argparser, arg);
1369 	  return true;
1370 
1371 	default:
1372 	  abort ();
1373 	}
1374     }
1375 }
1376 
1377 
1378 void
extract_php(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1379 extract_php (FILE *f,
1380 	     const char *real_filename, const char *logical_filename,
1381 	     flag_context_list_table_ty *flag_table,
1382 	     msgdomain_list_ty *mdlp)
1383 {
1384   message_list_ty *mlp = mdlp->item[0]->messages;
1385 
1386   fp = f;
1387   real_file_name = real_filename;
1388   logical_file_name = xstrdup (logical_filename);
1389   line_number = 1;
1390 
1391   last_comment_line = -1;
1392   last_non_comment_line = -1;
1393 
1394   flag_context_list_table = flag_table;
1395 
1396   init_keywords ();
1397 
1398   /* Initial mode is HTML mode, not PHP mode.  */
1399   skip_html ();
1400 
1401   /* Eat tokens until eof is seen.  When extract_parenthesized returns
1402      due to an unbalanced closing parenthesis, just restart it.  */
1403   while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
1404 				 arglist_parser_alloc (mlp, NULL)))
1405     ;
1406 
1407   /* Close scanner.  */
1408   fp = NULL;
1409   real_file_name = NULL;
1410   logical_file_name = NULL;
1411   line_number = 0;
1412 }
1413