xref: /netbsd-src/external/gpl2/gettext/dist/gettext-tools/src/x-perl.c (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1 /* xgettext Perl backend.
2    Copyright (C) 2002-2006 Free Software Foundation, Inc.
3 
4    This file was written by Guido Flohr <guido@imperia.net>, 2002-2003.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 2, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program; if not, write to the Free Software Foundation,
18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
19 
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23 
24 #include <errno.h>
25 #include <stdbool.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 
30 #include "message.h"
31 #include "xgettext.h"
32 #include "x-perl.h"
33 #include "error.h"
34 #include "error-progname.h"
35 #include "xalloc.h"
36 #include "exit.h"
37 #include "po-charset.h"
38 #include "ucs4-utf8.h"
39 #include "uniname.h"
40 #include "getline.h"
41 #include "gettext.h"
42 
43 #define _(s) gettext(s)
44 
45 /* The Perl syntax is defined in perlsyn.pod.  Try the command
46    "man perlsyn" or "perldoc perlsyn".
47    Also, the syntax after the 'sub' keyword is specified in perlsub.pod.
48    Try the command "man perlsub" or "perldoc perlsub".  */
49 
50 #define DEBUG_PERL 0
51 
52 
53 /* ====================== Keyword set customization.  ====================== */
54 
55 /* If true extract all strings.  */
56 static bool extract_all = false;
57 
58 static hash_table keywords;
59 static bool default_keywords = true;
60 
61 
62 void
x_perl_extract_all()63 x_perl_extract_all ()
64 {
65   extract_all = true;
66 }
67 
68 
69 void
x_perl_keyword(const char * name)70 x_perl_keyword (const char *name)
71 {
72   if (name == NULL)
73     default_keywords = false;
74   else
75     {
76       const char *end;
77       struct callshape shape;
78       const char *colon;
79 
80       if (keywords.table == NULL)
81 	hash_init (&keywords, 100);
82 
83       split_keywordspec (name, &end, &shape);
84 
85       /* The characters between name and end should form a valid C identifier.
86 	 A colon means an invalid parse in split_keywordspec().  */
87       colon = strchr (name, ':');
88       if (colon == NULL || colon >= end)
89 	insert_keyword_callshape (&keywords, name, end - name, &shape);
90     }
91 }
92 
93 /* Finish initializing the keywords hash table.
94    Called after argument processing, before each file is processed.  */
95 static void
init_keywords()96 init_keywords ()
97 {
98   if (default_keywords)
99     {
100       /* When adding new keywords here, also update the documentation in
101 	 xgettext.texi!  */
102       x_perl_keyword ("gettext");
103       x_perl_keyword ("%gettext");
104       x_perl_keyword ("$gettext");
105       x_perl_keyword ("dgettext:2");
106       x_perl_keyword ("dcgettext:2");
107       x_perl_keyword ("ngettext:1,2");
108       x_perl_keyword ("dngettext:2,3");
109       x_perl_keyword ("dcngettext:2,3");
110       x_perl_keyword ("gettext_noop");
111 #if 0
112       x_perl_keyword ("__");
113       x_perl_keyword ("$__");
114       x_perl_keyword ("%__");
115       x_perl_keyword ("__x");
116       x_perl_keyword ("__n:1,2");
117       x_perl_keyword ("__nx:1,2");
118       x_perl_keyword ("__xn:1,2");
119       x_perl_keyword ("N__");
120 #endif
121       default_keywords = false;
122     }
123 }
124 
125 void
init_flag_table_perl()126 init_flag_table_perl ()
127 {
128   xgettext_record_flag ("gettext:1:pass-perl-format");
129   xgettext_record_flag ("gettext:1:pass-perl-brace-format");
130   xgettext_record_flag ("%gettext:1:pass-perl-format");
131   xgettext_record_flag ("%gettext:1:pass-perl-brace-format");
132   xgettext_record_flag ("$gettext:1:pass-perl-format");
133   xgettext_record_flag ("$gettext:1:pass-perl-brace-format");
134   xgettext_record_flag ("dgettext:2:pass-perl-format");
135   xgettext_record_flag ("dgettext:2:pass-perl-brace-format");
136   xgettext_record_flag ("dcgettext:2:pass-perl-format");
137   xgettext_record_flag ("dcgettext:2:pass-perl-brace-format");
138   xgettext_record_flag ("ngettext:1:pass-perl-format");
139   xgettext_record_flag ("ngettext:2:pass-perl-format");
140   xgettext_record_flag ("ngettext:1:pass-perl-brace-format");
141   xgettext_record_flag ("ngettext:2:pass-perl-brace-format");
142   xgettext_record_flag ("dngettext:2:pass-perl-format");
143   xgettext_record_flag ("dngettext:3:pass-perl-format");
144   xgettext_record_flag ("dngettext:2:pass-perl-brace-format");
145   xgettext_record_flag ("dngettext:3:pass-perl-brace-format");
146   xgettext_record_flag ("dcngettext:2:pass-perl-format");
147   xgettext_record_flag ("dcngettext:3:pass-perl-format");
148   xgettext_record_flag ("dcngettext:2:pass-perl-brace-format");
149   xgettext_record_flag ("dcngettext:3:pass-perl-brace-format");
150   xgettext_record_flag ("gettext_noop:1:pass-perl-format");
151   xgettext_record_flag ("gettext_noop:1:pass-perl-brace-format");
152   xgettext_record_flag ("printf:1:perl-format"); /* argument 1 or 2 ?? */
153   xgettext_record_flag ("sprintf:1:perl-format");
154 #if 0
155   xgettext_record_flag ("__:1:pass-perl-format");
156   xgettext_record_flag ("__:1:pass-perl-brace-format");
157   xgettext_record_flag ("%__:1:pass-perl-format");
158   xgettext_record_flag ("%__:1:pass-perl-brace-format");
159   xgettext_record_flag ("$__:1:pass-perl-format");
160   xgettext_record_flag ("$__:1:pass-perl-brace-format");
161   xgettext_record_flag ("__x:1:perl-brace-format");
162   xgettext_record_flag ("__n:1:pass-perl-format");
163   xgettext_record_flag ("__n:2:pass-perl-format");
164   xgettext_record_flag ("__n:1:pass-perl-brace-format");
165   xgettext_record_flag ("__n:2:pass-perl-brace-format");
166   xgettext_record_flag ("__nx:1:perl-brace-format");
167   xgettext_record_flag ("__nx:2:perl-brace-format");
168   xgettext_record_flag ("__xn:1:perl-brace-format");
169   xgettext_record_flag ("__xn:2:perl-brace-format");
170   xgettext_record_flag ("N__:1:pass-perl-format");
171   xgettext_record_flag ("N__:1:pass-perl-brace-format");
172 #endif
173 }
174 
175 
176 /* ======================== Reading of characters.  ======================== */
177 
178 /* Real filename, used in error messages about the input file.  */
179 static const char *real_file_name;
180 
181 /* Logical filename and line number, used to label the extracted messages.  */
182 static char *logical_file_name;
183 static int line_number;
184 
185 /* The input file stream.  */
186 static FILE *fp;
187 
188 /* The current line buffer.  */
189 static char *linebuf;
190 
191 /* The size of the current line.  */
192 static int linesize;
193 
194 /* The position in the current line.  */
195 static int linepos;
196 
197 /* The size of the input buffer.  */
198 static size_t linebuf_size;
199 
200 /* Number of lines eaten for here documents.  */
201 static int here_eaten;
202 
203 /* Paranoia: EOF marker for __END__ or __DATA__.  */
204 static bool end_of_file;
205 
206 
207 /* 1. line_number handling.  */
208 
209 /* Returns the next character from the input stream or EOF.  */
210 static int
phase1_getc()211 phase1_getc ()
212 {
213   line_number += here_eaten;
214   here_eaten = 0;
215 
216   if (end_of_file)
217     return EOF;
218 
219   if (linepos >= linesize)
220     {
221       linesize = getline (&linebuf, &linebuf_size, fp);
222 
223       if (linesize < 0)
224 	{
225 	  if (ferror (fp))
226 	    error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
227 		   real_file_name);
228 	  end_of_file = true;
229 	  return EOF;
230 	}
231 
232       linepos = 0;
233       ++line_number;
234 
235       /* Undosify.  This is important for catching the end of <<EOF and
236 	 <<'EOF'.  We could rely on stdio doing this for us but you
237 	 it is not uncommon to to come across Perl scripts with CRLF
238 	 newline conventions on systems that do not follow this
239 	 convention.  */
240       if (linesize >= 2 && linebuf[linesize - 1] == '\n'
241 	  && linebuf[linesize - 2] == '\r')
242 	{
243 	  linebuf[linesize - 2] = '\n';
244 	  linebuf[linesize - 1] = '\0';
245 	  --linesize;
246 	}
247     }
248 
249   return linebuf[linepos++];
250 }
251 
252 /* Supports only one pushback character.  */
253 static void
phase1_ungetc(int c)254 phase1_ungetc (int c)
255 {
256   if (c != EOF)
257     {
258       if (linepos == 0)
259 	/* Attempt to ungetc across line boundary.  Shouldn't happen.
260 	   No two phase1_ungetc calls are permitted in a row.  */
261 	abort ();
262 
263       --linepos;
264     }
265 }
266 
267 /* Read a here document and return its contents.
268    The delimiter is an UTF-8 encoded string; the resulting string is UTF-8
269    encoded as well.  */
270 
271 static char *
get_here_document(const char * delimiter)272 get_here_document (const char *delimiter)
273 {
274   /* Accumulator for the entire here document, including a NUL byte
275      at the end.  */
276   static char *buffer;
277   static size_t bufmax = 0;
278   size_t bufpos = 0;
279   /* Current line being appended.  */
280   static char *my_linebuf = NULL;
281   static size_t my_linebuf_size = 0;
282 
283   /* Allocate the initial buffer.  Later on, bufmax > 0.  */
284   if (bufmax == 0)
285     {
286       buffer = xrealloc (NULL, 1);
287       buffer[0] = '\0';
288       bufmax = 1;
289     }
290 
291   for (;;)
292     {
293       int read_bytes = getline (&my_linebuf, &my_linebuf_size, fp);
294       char *my_line_utf8;
295       bool chomp;
296 
297       if (read_bytes < 0)
298 	{
299 	  if (ferror (fp))
300 	    {
301 	      error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
302 		     real_file_name);
303 	    }
304 	  else
305 	    {
306 	      error_with_progname = false;
307 	      error (EXIT_SUCCESS, 0, _("\
308 %s:%d: can't find string terminator \"%s\" anywhere before EOF"),
309 		     real_file_name, line_number, delimiter);
310 	      error_with_progname = true;
311 
312 	      break;
313 	    }
314 	}
315 
316       ++here_eaten;
317 
318       /* Convert to UTF-8.  */
319       my_line_utf8 =
320 	from_current_source_encoding (my_linebuf, logical_file_name,
321 				      line_number + here_eaten);
322       if (my_line_utf8 != my_linebuf)
323 	{
324 	  if (strlen (my_line_utf8) >= my_linebuf_size)
325 	    {
326 	      my_linebuf_size = strlen (my_line_utf8) + 1;
327 	      my_linebuf = xrealloc (my_linebuf, my_linebuf_size);
328 	    }
329 	  strcpy (my_linebuf, my_line_utf8);
330 	  free (my_line_utf8);
331 	}
332 
333       /* Undosify.  This is important for catching the end of <<EOF and
334 	 <<'EOF'.  We could rely on stdio doing this for us but you
335 	 it is not uncommon to to come across Perl scripts with CRLF
336 	 newline conventions on systems that do not follow this
337 	 convention.  */
338       if (read_bytes >= 2 && my_linebuf[read_bytes - 1] == '\n'
339 	  && my_linebuf[read_bytes - 2] == '\r')
340 	{
341 	  my_linebuf[read_bytes - 2] = '\n';
342 	  my_linebuf[read_bytes - 1] = '\0';
343 	  --read_bytes;
344 	}
345 
346       /* Temporarily remove the trailing newline from my_linebuf.  */
347       chomp = false;
348       if (read_bytes >= 1 && my_linebuf[read_bytes - 1] == '\n')
349 	{
350 	  chomp = true;
351 	  my_linebuf[read_bytes - 1] = '\0';
352 	}
353 
354       /* See whether this line terminates the here document.  */
355       if (strcmp (my_linebuf, delimiter) == 0)
356 	break;
357 
358       /* Add back the trailing newline to my_linebuf.  */
359       if (chomp)
360 	my_linebuf[read_bytes - 1] = '\n';
361 
362       /* Ensure room for read_bytes + 1 bytes.  */
363       if (bufpos + read_bytes >= bufmax)
364 	{
365 	  do
366 	    bufmax = 2 * bufmax + 10;
367 	  while (bufpos + read_bytes >= bufmax);
368 	  buffer = xrealloc (buffer, bufmax);
369 	}
370       /* Append this line to the accumulator.  */
371       strcpy (buffer + bufpos, my_linebuf);
372       bufpos += read_bytes;
373     }
374 
375   /* Done accumulating the here document.  */
376   return xstrdup (buffer);
377 }
378 
379 /* Skips pod sections.  */
380 static void
skip_pod()381 skip_pod ()
382 {
383   line_number += here_eaten;
384   here_eaten = 0;
385   linepos = 0;
386 
387   for (;;)
388     {
389       linesize = getline (&linebuf, &linebuf_size, fp);
390 
391       if (linesize < 0)
392 	{
393 	  if (ferror (fp))
394 	    error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
395 		   real_file_name);
396 	  return;
397 	}
398 
399       ++line_number;
400 
401       if (strncmp ("=cut", linebuf, 4) == 0)
402 	{
403 	  /* Force reading of a new line on next call to phase1_getc().  */
404 	  linepos = linesize;
405 	  return;
406 	}
407     }
408 }
409 
410 
411 /* These are for tracking whether comments count as immediately before
412    keyword.  */
413 static int last_comment_line;
414 static int last_non_comment_line;
415 
416 
417 /* 2. Replace each comment that is not inside a string literal or regular
418    expression with a newline character.  We need to remember the comment
419    for later, because it may be attached to a keyword string.  */
420 
421 static int
phase2_getc()422 phase2_getc ()
423 {
424   static char *buffer;
425   static size_t bufmax;
426   size_t buflen;
427   int lineno;
428   int c;
429   char *utf8_string;
430 
431   c = phase1_getc ();
432   if (c == '#')
433     {
434       buflen = 0;
435       lineno = line_number;
436       /* Skip leading whitespace.  */
437       for (;;)
438 	{
439 	  c = phase1_getc ();
440 	  if (c == EOF)
441 	    break;
442 	  if (c != ' ' && c != '\t' && c != '\r' && c != '\f')
443 	    {
444 	      phase1_ungetc (c);
445 	      break;
446 	    }
447 	}
448       /* Accumulate the comment.  */
449       for (;;)
450 	{
451 	  c = phase1_getc ();
452 	  if (c == '\n' || c == EOF)
453 	    break;
454 	  if (buflen >= bufmax)
455 	    {
456 	      bufmax = 2 * bufmax + 10;
457 	      buffer = xrealloc (buffer, bufmax);
458 	    }
459 	  buffer[buflen++] = c;
460 	}
461       if (buflen >= bufmax)
462 	{
463 	  bufmax = 2 * bufmax + 10;
464 	  buffer = xrealloc (buffer, bufmax);
465 	}
466       buffer[buflen] = '\0';
467       /* Convert it to UTF-8.  */
468       utf8_string =
469 	from_current_source_encoding (buffer, logical_file_name, lineno);
470       /* Save it until we encounter the corresponding string.  */
471       savable_comment_add (utf8_string);
472       last_comment_line = lineno;
473     }
474   return c;
475 }
476 
477 /* Supports only one pushback character.  */
478 static void
phase2_ungetc(int c)479 phase2_ungetc (int c)
480 {
481   if (c != EOF)
482     phase1_ungetc (c);
483 }
484 
485 /* Whitespace recognition.  */
486 
487 #define case_whitespace \
488   case ' ': case '\t': case '\r': case '\n': case '\f'
489 
490 static inline bool
is_whitespace(int c)491 is_whitespace (int c)
492 {
493   return (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f');
494 }
495 
496 
497 /* ========================== Reading of tokens.  ========================== */
498 
499 
500 enum token_type_ty
501 {
502   token_type_eof,
503   token_type_lparen,		/* ( */
504   token_type_rparen,		/* ) */
505   token_type_comma,		/* , */
506   token_type_fat_comma,		/* => */
507   token_type_dereference,	/* , */
508   token_type_semicolon,         /* ; */
509   token_type_lbrace,            /* { */
510   token_type_rbrace,            /* } */
511   token_type_lbracket,          /* [ */
512   token_type_rbracket,          /* ] */
513   token_type_string,		/* quote-like */
514   token_type_named_op,          /* if, unless, while, ... */
515   token_type_variable,          /* $... */
516   token_type_symbol,		/* symbol, number */
517   token_type_regex_op,		/* s, tr, y, m.  */
518   token_type_dot,               /* . */
519   token_type_other,		/* regexp, misc. operator */
520   /* The following are not really token types, but variants used by
521      the parser.  */
522   token_type_keyword_symbol	/* keyword symbol */
523 };
524 typedef enum token_type_ty token_type_ty;
525 
526 /* Subtypes for strings, important for interpolation.  */
527 enum string_type_ty
528 {
529   string_type_verbatim,     /* "<<'EOF'", "m'...'", "s'...''...'",
530 			       "tr/.../.../", "y/.../.../".  */
531   string_type_q,            /* "'..'", "q/.../".  */
532   string_type_qq,           /* '"..."', "`...`", "qq/.../", "qx/.../",
533 			       "<file*glob>".  */
534   string_type_qr            /* Not supported.  */
535 };
536 
537 /* Subtypes for symbols, important for dollar interpretation.  */
538 enum symbol_type_ty
539 {
540   symbol_type_none,         /* Nothing special.  */
541   symbol_type_sub,          /* 'sub'.  */
542   symbol_type_function      /* Function name after 'sub'.  */
543 };
544 
545 typedef struct token_ty token_ty;
546 struct token_ty
547 {
548   token_type_ty type;
549   int sub_type;			/* for token_type_string, token_type_symbol */
550   char *string;			/* for:			in encoding:
551 				   token_type_named_op	ASCII
552 				   token_type_string	UTF-8
553 				   token_type_symbol	ASCII
554 				   token_type_variable	global_source_encoding
555 				 */
556   int line_number;
557 };
558 
559 #if DEBUG_PERL
560 static const char *
token2string(const token_ty * token)561 token2string (const token_ty *token)
562 {
563   switch (token->type)
564     {
565     case token_type_eof:
566       return "token_type_eof";
567     case token_type_lparen:
568       return "token_type_lparen";
569     case token_type_rparen:
570       return "token_type_rparen";
571     case token_type_comma:
572       return "token_type_comma";
573     case token_type_fat_comma:
574       return "token_type_fat_comma";
575     case token_type_dereference:
576       return "token_type_dereference";
577     case token_type_semicolon:
578       return "token_type_semicolon";
579     case token_type_lbrace:
580       return "token_type_lbrace";
581     case token_type_rbrace:
582       return "token_type_rbrace";
583     case token_type_lbracket:
584       return "token_type_lbracket";
585     case token_type_rbracket:
586       return "token_type_rbracket";
587     case token_type_string:
588       return "token_type_string";
589     case token_type_named_op:
590       return "token_type_named_op";
591     case token_type_variable:
592       return "token_type_variable";
593     case token_type_symbol:
594       return "token_type_symbol";
595     case token_type_regex_op:
596       return "token_type_regex_op";
597     case token_type_dot:
598       return "token_type_dot";
599     case token_type_other:
600       return "token_type_other";
601     default:
602       return "unknown";
603     }
604 }
605 #endif
606 
607 /* Free the memory pointed to by a 'struct token_ty'.  */
608 static inline void
free_token(token_ty * tp)609 free_token (token_ty *tp)
610 {
611   switch (tp->type)
612     {
613     case token_type_named_op:
614     case token_type_string:
615     case token_type_symbol:
616     case token_type_variable:
617       free (tp->string);
618       break;
619     default:
620       break;
621     }
622   free (tp);
623 }
624 
625 /* Pass 1 of extracting quotes: Find the end of the string, regardless
626    of the semantics of the construct.  Return the complete string,
627    including the starting and the trailing delimiter, with backslashes
628    removed where appropriate.  */
629 static char *
extract_quotelike_pass1(int delim)630 extract_quotelike_pass1 (int delim)
631 {
632   /* This function is called recursively.  No way to allocate stuff
633      statically.  Also alloca() is inappropriate due to limited stack
634      size on some platforms.  So we use malloc().  */
635   int bufmax = 10;
636   char *buffer = (char *) xmalloc (bufmax);
637   int bufpos = 0;
638   bool nested = true;
639   int counter_delim;
640 
641   buffer[bufpos++] = delim;
642 
643   /* Find the closing delimiter.  */
644   switch (delim)
645     {
646     case '(':
647       counter_delim = ')';
648       break;
649     case '{':
650       counter_delim = '}';
651       break;
652     case '[':
653       counter_delim = ']';
654       break;
655     case '<':
656       counter_delim = '>';
657       break;
658     default: /* "..." or '...' or |...| etc. */
659       nested = false;
660       counter_delim = delim;
661       break;
662     }
663 
664   for (;;)
665     {
666       int c = phase1_getc ();
667 
668       /* This round can produce 1 or 2 bytes.  Ensure room for 2 bytes.  */
669       if (bufpos + 2 > bufmax)
670 	{
671 	  bufmax = 2 * bufmax + 10;
672 	  buffer = xrealloc (buffer, bufmax);
673 	}
674 
675       if (c == counter_delim || c == EOF)
676 	{
677 	  buffer[bufpos++] = counter_delim; /* will be stripped off later */
678 	  buffer[bufpos++] = '\0';
679 #if DEBUG_PERL
680 	  fprintf (stderr, "PASS1: %s\n", buffer);
681 #endif
682 	  return buffer;
683 	}
684 
685       if (nested && c == delim)
686 	{
687 	  char *inner = extract_quotelike_pass1 (delim);
688 	  size_t len = strlen (inner);
689 
690 	  /* Ensure room for len + 1 bytes.  */
691 	  if (bufpos + len >= bufmax)
692 	    {
693 	      do
694 		bufmax = 2 * bufmax + 10;
695 	      while (bufpos + len >= bufmax);
696 	      buffer = xrealloc (buffer, bufmax);
697 	    }
698 	  strcpy (buffer + bufpos, inner);
699 	  free (inner);
700 	  bufpos += len;
701 	}
702       else if (c == '\\')
703 	{
704 	  c = phase1_getc ();
705 	  if (c == '\\')
706 	    {
707 	      buffer[bufpos++] = '\\';
708 	      buffer[bufpos++] = '\\';
709 	    }
710 	  else if (c == delim || c == counter_delim)
711 	    {
712 	      /* This is pass2 in Perl.  */
713 	      buffer[bufpos++] = c;
714 	    }
715 	  else
716 	    {
717 	      buffer[bufpos++] = '\\';
718 	      phase1_ungetc (c);
719 	    }
720 	}
721       else
722 	{
723 	  buffer[bufpos++] = c;
724 	}
725     }
726 }
727 
728 /* Like extract_quotelike_pass1, but return the complete string in UTF-8
729    encoding.  */
730 static char *
extract_quotelike_pass1_utf8(int delim)731 extract_quotelike_pass1_utf8 (int delim)
732 {
733   char *string = extract_quotelike_pass1 (delim);
734   char *utf8_string =
735     from_current_source_encoding (string, logical_file_name, line_number);
736   if (utf8_string != string)
737     free (string);
738   return utf8_string;
739 }
740 
741 
742 /* ========= Reading of tokens and commands.  Extracting strings.  ========= */
743 
744 
745 /* There is an ambiguity about '/': It can start a division operator ('/' or
746    '/=') or it can start a regular expression.  The distinction is important
747    because inside regular expressions, '#' loses its special meaning.
748    The distinction is possible depending on the parsing state: After a
749    variable or simple expression, it's a division operator; at the beginning
750    of an expression, it's a regexp.  */
751 static bool prefer_division_over_regexp;
752 
753 /* Context lookup table.  */
754 static flag_context_list_table_ty *flag_context_list_table;
755 
756 
757 /* Forward declaration of local functions.  */
758 static void interpolate_keywords (message_list_ty *mlp, const char *string,
759 				  int lineno);
760 static token_ty *x_perl_lex (message_list_ty *mlp);
761 static void x_perl_unlex (token_ty *tp);
762 static bool extract_balanced (message_list_ty *mlp,
763 			      token_type_ty delim, bool eat_delim,
764 			      bool comma_delim,
765 			      flag_context_ty outer_context,
766 			      flag_context_list_iterator_ty context_iter,
767 			      int arg, struct arglist_parser *argparser);
768 
769 
770 /* Extract an unsigned hexadecimal number from STRING, considering at
771    most LEN bytes and place the result in *RESULT.  Returns a pointer
772    to the first character past the hexadecimal number.  */
773 static const char *
extract_hex(const char * string,size_t len,unsigned int * result)774 extract_hex (const char *string, size_t len, unsigned int *result)
775 {
776   size_t i;
777 
778   *result = 0;
779 
780   for (i = 0; i < len; i++)
781     {
782       char c = string[i];
783       int number;
784 
785       if (c >= 'A' && c <= 'F')
786 	number = c - 'A' + 10;
787       else if (c >= 'a' && c <= 'f')
788 	number = c - 'a' + 10;
789       else if (c >= '0' && c <= '9')
790 	number = c - '0';
791       else
792 	break;
793 
794       *result <<= 4;
795       *result |= number;
796     }
797 
798   return string + i;
799 }
800 
801 /* Extract an unsigned octal number from STRING, considering at
802    most LEN bytes and place the result in *RESULT.  Returns a pointer
803    to the first character past the octal number.  */
804 static const char *
extract_oct(const char * string,size_t len,unsigned int * result)805 extract_oct (const char *string, size_t len, unsigned int *result)
806 {
807   size_t i;
808 
809   *result = 0;
810 
811   for (i = 0; i < len; i++)
812     {
813       char c = string[i];
814       int number;
815 
816       if (c >= '0' && c <= '7')
817 	number = c - '0';
818       else
819 	break;
820 
821       *result <<= 3;
822       *result |= number;
823     }
824 
825   return string + i;
826 }
827 
828 /* Extract the various quotelike constructs except for <<EOF.  See the
829    section "Gory details of parsing quoted constructs" in perlop.pod.
830    Return the resulting token in *tp; tp->type == token_type_string.  */
831 static void
extract_quotelike(token_ty * tp,int delim)832 extract_quotelike (token_ty *tp, int delim)
833 {
834   char *string = extract_quotelike_pass1_utf8 (delim);
835   size_t len = strlen (string);
836 
837   tp->type = token_type_string;
838   /* Take the string without the delimiters at the start and at the end.  */
839   if (!(len >= 2))
840     abort ();
841   string[len - 1] = '\0';
842   tp->string = xstrdup (string + 1);
843   free (string);
844 }
845 
846 /* Extract the quotelike constructs with double delimiters, like
847    s/[SEARCH]/[REPLACE]/.  This function does not eat up trailing
848    modifiers (left to the caller).
849    Return the resulting token in *tp; tp->type == token_type_regex_op.  */
850 static void
extract_triple_quotelike(message_list_ty * mlp,token_ty * tp,int delim,bool interpolate)851 extract_triple_quotelike (message_list_ty *mlp, token_ty *tp, int delim,
852 			  bool interpolate)
853 {
854   char *string;
855 
856   tp->type = token_type_regex_op;
857 
858   string = extract_quotelike_pass1_utf8 (delim);
859   if (interpolate)
860     interpolate_keywords (mlp, string, line_number);
861   free (string);
862 
863   if (delim == '(' || delim == '<' || delim == '{' || delim == '[')
864     {
865       /* The delimiter for the second string can be different, e.g.
866 	 s{SEARCH}{REPLACE} or s{SEARCH}/REPLACE/.  See "man perlrequick".  */
867       delim = phase1_getc ();
868       while (is_whitespace (delim))
869 	{
870 	  /* The hash-sign is not a valid delimiter after whitespace, ergo
871 	     use phase2_getc() and not phase1_getc() now.  */
872 	  delim = phase2_getc ();
873 	}
874     }
875   string = extract_quotelike_pass1_utf8 (delim);
876   if (interpolate)
877     interpolate_keywords (mlp, string, line_number);
878   free (string);
879 }
880 
881 /* Perform pass 3 of quotelike extraction (interpolation).
882    *tp is a token of type token_type_string.
883    This function replaces tp->string.  */
884 /* FIXME: Currently may writes null-bytes into the string.  */
885 static void
extract_quotelike_pass3(token_ty * tp,int error_level)886 extract_quotelike_pass3 (token_ty *tp, int error_level)
887 {
888   static char *buffer;
889   static int bufmax = 0;
890   int bufpos = 0;
891   const char *crs;
892   bool uppercase;
893   bool lowercase;
894   bool quotemeta;
895 
896 #if DEBUG_PERL
897   switch (tp->sub_type)
898     {
899     case string_type_verbatim:
900       fprintf (stderr, "Interpolating string_type_verbatim:\n");
901       break;
902     case string_type_q:
903       fprintf (stderr, "Interpolating string_type_q:\n");
904       break;
905     case string_type_qq:
906       fprintf (stderr, "Interpolating string_type_qq:\n");
907       break;
908     case string_type_qr:
909       fprintf (stderr, "Interpolating string_type_qr:\n");
910       break;
911     }
912   fprintf (stderr, "%s\n", tp->string);
913   if (tp->sub_type == string_type_verbatim)
914     fprintf (stderr, "---> %s\n", tp->string);
915 #endif
916 
917   if (tp->sub_type == string_type_verbatim)
918     return;
919 
920   /* Loop over tp->string, accumulating the expansion in buffer.  */
921   crs = tp->string;
922   uppercase = false;
923   lowercase = false;
924   quotemeta = false;
925   while (*crs)
926     {
927       bool backslashed;
928 
929       /* Ensure room for 7 bytes, 6 (multi-)bytes plus a leading backslash
930 	 if \Q modifier is present.  */
931       if (bufpos + 7 > bufmax)
932 	{
933 	  bufmax = 2 * bufmax + 10;
934 	  buffer = xrealloc (buffer, bufmax);
935 	}
936 
937       if (tp->sub_type == string_type_q)
938 	{
939 	  switch (*crs)
940 	    {
941 	    case '\\':
942 	      if (crs[1] == '\\')
943 		{
944 		  crs += 2;
945 		  buffer[bufpos++] = '\\';
946 		  break;
947 		}
948 	      /* FALLTHROUGH */
949 	    default:
950 	      buffer[bufpos++] = *crs++;
951 	      break;
952 	    }
953 	  continue;
954 	}
955 
956       /* We only get here for double-quoted strings or regular expressions.
957 	 Unescape escape sequences.  */
958       if (*crs == '\\')
959 	{
960 	  switch (crs[1])
961 	    {
962 	    case 't':
963 	      crs += 2;
964 	      buffer[bufpos++] = '\t';
965 	      continue;
966 	    case 'n':
967 	      crs += 2;
968 	      buffer[bufpos++] = '\n';
969 	      continue;
970 	    case 'r':
971 	      crs += 2;
972 	      buffer[bufpos++] = '\r';
973 	      continue;
974 	    case 'f':
975 	      crs += 2;
976 	      buffer[bufpos++] = '\f';
977 	      continue;
978 	    case 'b':
979 	      crs += 2;
980 	      buffer[bufpos++] = '\b';
981 	      continue;
982 	    case 'a':
983 	      crs += 2;
984 	      buffer[bufpos++] = '\a';
985 	      continue;
986 	    case 'e':
987 	      crs += 2;
988 	      buffer[bufpos++] = 0x1b;
989 	      continue;
990 	    case '0': case '1': case '2': case '3':
991 	    case '4': case '5': case '6': case '7':
992 	      {
993 		unsigned int oct_number;
994 		int length;
995 
996 		crs = extract_oct (crs + 1, 3, &oct_number);
997 
998 		/* FIXME: If one of the variables UPPERCASE or LOWERCASE is
999 		   true, the character should be converted to its uppercase
1000 		   resp. lowercase equivalent.  I don't know if the necessary
1001 		   facilities are already included in gettext.  For US-Ascii
1002 		   the conversion can be already be done, however.  */
1003 		if (uppercase && oct_number >= 'a' && oct_number <= 'z')
1004 		  {
1005 		    oct_number = oct_number - 'a' + 'A';
1006 		  }
1007 		else if (lowercase && oct_number >= 'A' && oct_number <= 'Z')
1008 		  {
1009 		    oct_number = oct_number - 'A' + 'a';
1010 		  }
1011 
1012 
1013 		/* Yes, octal escape sequences in the range 0x100..0x1ff are
1014 		   valid.  */
1015 		length = u8_uctomb ((unsigned char *) (buffer + bufpos),
1016 				    oct_number, 2);
1017 		if (length > 0)
1018 		  bufpos += length;
1019 	      }
1020 	      continue;
1021 	    case 'x':
1022 	      {
1023 		unsigned int hex_number = 0;
1024 		int length;
1025 
1026 		crs += 2;
1027 		if (*crs == '{')
1028 		  {
1029 		    const char *end = strchr (crs, '}');
1030 		    if (end == NULL)
1031 		      {
1032 			error_with_progname = false;
1033 			error (error_level, 0, _("\
1034 %s:%d: missing right brace on \\x{HEXNUMBER}"), real_file_name, line_number);
1035 			error_with_progname = true;
1036 			++crs;
1037 			continue;
1038 		      }
1039 		    else
1040 		      {
1041 			++crs;
1042 			(void) extract_hex (crs, end - crs, &hex_number);
1043 			crs = end + 1;
1044 		      }
1045 		  }
1046 		else
1047 		  {
1048 		    crs = extract_hex (crs, 2, &hex_number);
1049 		  }
1050 
1051 		/* FIXME: If one of the variables UPPERCASE or LOWERCASE is
1052 		   true, the character should be converted to its uppercase
1053 		   resp. lowercase equivalent.  I don't know if the necessary
1054 		   facilities are already included in gettext.  For US-Ascii
1055 		   the conversion can be already be done, however.  */
1056 		if (uppercase && hex_number >= 'a' && hex_number <= 'z')
1057 		  {
1058 		    hex_number = hex_number - 'a' + 'A';
1059 		  }
1060 		else if (lowercase && hex_number >= 'A' && hex_number <= 'Z')
1061 		  {
1062 		    hex_number = hex_number - 'A' + 'a';
1063 		  }
1064 
1065 		length = u8_uctomb ((unsigned char *) (buffer + bufpos),
1066 				    hex_number, 6);
1067 
1068 		if (length > 0)
1069 		  bufpos += length;
1070 	      }
1071 	      continue;
1072 	    case 'c':
1073 	      /* Perl's notion of control characters.  */
1074 	      crs += 2;
1075 	      if (*crs)
1076 		{
1077 		  int the_char = (unsigned char) *crs;
1078 		  if (the_char >= 'a' || the_char <= 'z')
1079 		    the_char = the_char - 'a' + 'A';
1080 		  buffer[bufpos++] = the_char ^ 0x40;
1081 		}
1082 	      continue;
1083 	    case 'N':
1084 	      crs += 2;
1085 	      if (*crs == '{')
1086 		{
1087 		  const char *end = strchr (crs + 1, '}');
1088 		  if (end != NULL)
1089 		    {
1090 		      char *name;
1091 		      unsigned int unicode;
1092 
1093 		      name = (char *) xmalloc (end - (crs + 1) + 1);
1094 		      memcpy (name, crs + 1, end - (crs + 1));
1095 		      name[end - (crs + 1)] = '\0';
1096 
1097 		      unicode = unicode_name_character (name);
1098 		      if (unicode != UNINAME_INVALID)
1099 			{
1100 			  /* FIXME: Convert to upper/lowercase if the
1101 			     corresponding flag is set to true.  */
1102 			  int length =
1103 			    u8_uctomb ((unsigned char *) (buffer + bufpos),
1104 				       unicode, 6);
1105 			  if (length > 0)
1106 			    bufpos += length;
1107 			}
1108 
1109 		      free (name);
1110 
1111 		      crs = end + 1;
1112 		    }
1113 		}
1114 	      continue;
1115 	    }
1116 	}
1117 
1118       /* No escape sequence, go on.  */
1119       if (*crs == '\\')
1120 	{
1121 	  ++crs;
1122 	  switch (*crs)
1123 	    {
1124 	    case 'E':
1125 	      uppercase = false;
1126 	      lowercase = false;
1127 	      quotemeta = false;
1128 	      ++crs;
1129 	      continue;
1130 	    case 'L':
1131 	      uppercase = false;
1132 	      lowercase = true;
1133 	      ++crs;
1134 	      continue;
1135 	    case 'U':
1136 	      uppercase = true;
1137 	      lowercase = false;
1138 	      ++crs;
1139 	      continue;
1140 	    case 'Q':
1141 	      quotemeta = true;
1142 	      ++crs;
1143 	      continue;
1144 	    case 'l':
1145 	      ++crs;
1146 	      if (*crs >= 'A' && *crs <= 'Z')
1147 		{
1148 		  buffer[bufpos++] = *crs - 'A' + 'a';
1149 		}
1150 	      else if ((unsigned char) *crs >= 0x80)
1151 		{
1152 		  error_with_progname = false;
1153 		  error (error_level, 0, _("\
1154 %s:%d: invalid interpolation (\"\\l\") of 8bit character \"%c\""),
1155 			 real_file_name, line_number, *crs);
1156 		  error_with_progname = true;
1157 		}
1158 	      else
1159 	        {
1160 		  buffer[bufpos++] = *crs;
1161 		}
1162 	      ++crs;
1163 	      continue;
1164 	    case 'u':
1165 	      ++crs;
1166 	      if (*crs >= 'a' && *crs <= 'z')
1167 		{
1168 		  buffer[bufpos++] = *crs - 'a' + 'A';
1169 		}
1170 	      else if ((unsigned char) *crs >= 0x80)
1171 		{
1172 		  error_with_progname = false;
1173 		  error (error_level, 0, _("\
1174 %s:%d: invalid interpolation (\"\\u\") of 8bit character \"%c\""),
1175 			 real_file_name, line_number, *crs);
1176 		  error_with_progname = true;
1177 		}
1178 	      else
1179 	        {
1180 		  buffer[bufpos++] = *crs;
1181 		}
1182 	      ++crs;
1183 	      continue;
1184 	    case '\\':
1185 	      buffer[bufpos++] = *crs;
1186 	      ++crs;
1187 	      continue;
1188 	    default:
1189 	      backslashed = true;
1190 	      break;
1191 	    }
1192 	}
1193       else
1194 	backslashed = false;
1195 
1196       if (quotemeta
1197 	  && !((*crs >= 'A' && *crs <= 'Z') || (*crs >= 'A' && *crs <= 'z')
1198 	       || (*crs >= '0' && *crs <= '9') || *crs == '_'))
1199 	{
1200 	  buffer[bufpos++] = '\\';
1201 	  backslashed = true;
1202 	}
1203 
1204       if (!backslashed && !extract_all && (*crs == '$' || *crs == '@'))
1205 	{
1206 	  error_with_progname = false;
1207 	  error (error_level, 0, _("\
1208 %s:%d: invalid variable interpolation at \"%c\""),
1209 		 real_file_name, line_number, *crs);
1210 	  error_with_progname = true;
1211 	  ++crs;
1212 	}
1213       else if (lowercase)
1214 	{
1215 	  if (*crs >= 'A' && *crs <= 'Z')
1216 	    buffer[bufpos++] = *crs - 'A' + 'a';
1217 	  else if ((unsigned char) *crs >= 0x80)
1218 	    {
1219 	      error_with_progname = false;
1220 	      error (error_level, 0, _("\
1221 %s:%d: invalid interpolation (\"\\L\") of 8bit character \"%c\""),
1222 		     real_file_name, line_number, *crs);
1223 	      error_with_progname = true;
1224 	      buffer[bufpos++] = *crs;
1225 	    }
1226 	  else
1227 	    buffer[bufpos++] = *crs;
1228 	  ++crs;
1229 	}
1230       else if (uppercase)
1231 	{
1232 	  if (*crs >= 'a' && *crs <= 'z')
1233 	    buffer[bufpos++] = *crs - 'a' + 'A';
1234 	  else if ((unsigned char) *crs >= 0x80)
1235 	    {
1236 	      error_with_progname = false;
1237 	      error (error_level, 0, _("\
1238 %s:%d: invalid interpolation (\"\\U\") of 8bit character \"%c\""),
1239 		     real_file_name, line_number, *crs);
1240 	      error_with_progname = true;
1241 	      buffer[bufpos++] = *crs;
1242 	    }
1243 	  else
1244 	    buffer[bufpos++] = *crs;
1245 	  ++crs;
1246 	}
1247       else
1248 	{
1249 	  buffer[bufpos++] = *crs++;
1250 	}
1251     }
1252 
1253   /* Ensure room for 1 more byte.  */
1254   if (bufpos >= bufmax)
1255     {
1256       bufmax = 2 * bufmax + 10;
1257       buffer = xrealloc (buffer, bufmax);
1258     }
1259 
1260   buffer[bufpos++] = '\0';
1261 
1262 #if DEBUG_PERL
1263   fprintf (stderr, "---> %s\n", buffer);
1264 #endif
1265 
1266   /* Replace tp->string.  */
1267   free (tp->string);
1268   tp->string = xstrdup (buffer);
1269 }
1270 
1271 /* Parse a variable.  This is done in several steps:
1272      1) Consume all leading occurencies of '$', '@', '%', and '*'.
1273      2) Determine the name of the variable from the following input.
1274      3) Parse possible following hash keys or array indexes.
1275  */
1276 static void
extract_variable(message_list_ty * mlp,token_ty * tp,int first)1277 extract_variable (message_list_ty *mlp, token_ty *tp, int first)
1278 {
1279   static char *buffer;
1280   static int bufmax = 0;
1281   int bufpos = 0;
1282   int c = first;
1283   size_t varbody_length = 0;
1284   bool maybe_hash_deref = false;
1285   bool maybe_hash_value = false;
1286 
1287   tp->type = token_type_variable;
1288 
1289 #if DEBUG_PERL
1290   fprintf (stderr, "%s:%d: extracting variable type '%c'\n",
1291 	   real_file_name, line_number, first);
1292 #endif
1293 
1294   /*
1295    * 1) Consume dollars and so on (not euros ...).  Unconditionally
1296    *    accepting the hash sign (#) will maybe lead to inaccurate
1297    *    results.  FIXME!
1298    */
1299   while (c == '$' || c == '*' || c == '#' || c == '@' || c == '%')
1300     {
1301       if (bufpos >= bufmax)
1302 	{
1303 	  bufmax = 2 * bufmax + 10;
1304 	  buffer = xrealloc (buffer, bufmax);
1305 	}
1306       buffer[bufpos++] = c;
1307       c = phase1_getc ();
1308     }
1309 
1310   if (c == EOF)
1311     {
1312       tp->type = token_type_eof;
1313       return;
1314     }
1315 
1316   /* Hash references are treated in a special way, when looking for
1317      our keywords.  */
1318   if (buffer[0] == '$')
1319     {
1320       if (bufpos == 1)
1321 	maybe_hash_value = true;
1322       else if (bufpos == 2 && buffer[1] == '$')
1323 	{
1324 	  if (!(c == '{'
1325 		|| (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1326 		|| (c >= '0' && c <= '9')
1327 		|| c == '_' || c == ':' || c == '\'' || c >= 0x80))
1328 	    {
1329 	      /* Special variable $$ for pid.  */
1330 	      if (bufpos >= bufmax)
1331 		{
1332 		  bufmax = 2 * bufmax + 10;
1333 		  buffer = xrealloc (buffer, bufmax);
1334 		}
1335 	      buffer[bufpos++] = '\0';
1336 	      tp->string = xstrdup (buffer);
1337 #if DEBUG_PERL
1338 	      fprintf (stderr, "%s:%d: is PID ($$)\n",
1339 		       real_file_name, line_number);
1340 #endif
1341 
1342 	      phase1_ungetc (c);
1343 	      return;
1344 	    }
1345 
1346 	  maybe_hash_deref = true;
1347 	  bufpos = 1;
1348 	}
1349     }
1350 
1351   /*
1352    * 2) Get the name of the variable.  The first character is practically
1353    *    arbitrary.  Punctuation and numbers automagically put a variable
1354    *    in the global namespace but that subtle difference is not interesting
1355    *    for us.
1356    */
1357   if (bufpos >= bufmax)
1358     {
1359       bufmax = 2 * bufmax + 10;
1360       buffer = xrealloc (buffer, bufmax);
1361     }
1362   if (c == '{')
1363     {
1364       /* Yuck, we cannot accept ${gettext} as a keyword...  Except for
1365        * debugging purposes it is also harmless, that we suppress the
1366        * real name of the variable.
1367        */
1368 #if DEBUG_PERL
1369       fprintf (stderr, "%s:%d: braced {variable_name}\n",
1370 	       real_file_name, line_number);
1371 #endif
1372 
1373       if (extract_balanced (mlp, token_type_rbrace, true, false,
1374 			    null_context, null_context_list_iterator,
1375 			    1, arglist_parser_alloc (mlp, NULL)))
1376 	return;
1377       buffer[bufpos++] = c;
1378     }
1379   else
1380     {
1381       while ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1382 	     || (c >= '0' && c <= '9')
1383 	     || c == '_' || c == ':' || c == '\'' || c >= 0x80)
1384 	{
1385 	  ++varbody_length;
1386 	  if (bufpos >= bufmax)
1387 	    {
1388 	      bufmax = 2 * bufmax + 10;
1389 	      buffer = xrealloc (buffer, bufmax);
1390 	    }
1391 	  buffer[bufpos++] = c;
1392 	  c = phase1_getc ();
1393 	}
1394       phase1_ungetc (c);
1395     }
1396 
1397   /* Probably some strange Perl variable like $`.  */
1398   if (varbody_length == 0)
1399     {
1400       c = phase1_getc ();
1401       if (c == EOF || is_whitespace (c))
1402 	phase1_ungetc (c);  /* Loser.  */
1403       else
1404 	{
1405 	  if (bufpos >= bufmax)
1406 	    {
1407 	      bufmax = 2 * bufmax + 10;
1408 	      buffer = xrealloc (buffer, bufmax);
1409 	    }
1410 	  buffer[bufpos++] = c;
1411 	}
1412     }
1413 
1414   if (bufpos >= bufmax)
1415     {
1416       bufmax = 2 * bufmax + 10;
1417       buffer = xrealloc (buffer, bufmax);
1418     }
1419   buffer[bufpos++] = '\0';
1420 
1421   tp->string = xstrdup (buffer);
1422 
1423 #if DEBUG_PERL
1424   fprintf (stderr, "%s:%d: complete variable name: %s\n",
1425 	   real_file_name, line_number, tp->string);
1426 #endif
1427 
1428   prefer_division_over_regexp = true;
1429 
1430   /*
1431    * 3) If the following looks strange to you, this is valid Perl syntax:
1432    *
1433    *      $var = $$hashref    # We can place a
1434    *                          # comment here and then ...
1435    *             {key_into_hashref};
1436    *
1437    *    POD sections are not allowed but we leave complaints about
1438    *    that to the compiler/interpreter.
1439    */
1440   /* We only extract strings from the first hash key (if present).  */
1441 
1442   if (maybe_hash_deref || maybe_hash_value)
1443     {
1444       bool is_dereference = false;
1445       int c;
1446 
1447       do
1448 	c = phase2_getc ();
1449       while (is_whitespace (c));
1450 
1451       if (c == '-')
1452 	{
1453 	  int c2 = phase1_getc ();
1454 
1455 	  if (c2 == '>')
1456 	    {
1457 	      is_dereference = true;
1458 
1459 	      do
1460 		c = phase2_getc ();
1461 	      while (is_whitespace (c));
1462 	    }
1463 	  else if (c2 != '\n')
1464 	    {
1465 	      /* Discarding the newline is harmless here.  The only
1466 		 special character recognized after a minus is greater-than
1467 		 for dereference.  However, the sequence "-\n>" that we
1468 	         treat incorrectly here, is a syntax error.  */
1469 	      phase1_ungetc (c2);
1470 	    }
1471 	}
1472 
1473       if (maybe_hash_value && is_dereference)
1474 	{
1475 #if DEBUG_PERL
1476 	  fprintf (stderr, "%s:%d: first keys preceded by \"->\"\n",
1477 		   real_file_name, line_number);
1478 #endif
1479 	}
1480       else if (maybe_hash_value)
1481 	{
1482 	  /* Fake it into a hash.  */
1483 	  tp->string[0] = '%';
1484 	}
1485 
1486       /* Do NOT change that into else if (see above).  */
1487       if ((maybe_hash_value || maybe_hash_deref) && c == '{')
1488 	{
1489 	  void *keyword_value;
1490 
1491 #if DEBUG_PERL
1492 	  fprintf (stderr, "%s:%d: first keys preceded by '{'\n",
1493 		   real_file_name, line_number);
1494 #endif
1495 
1496 	  if (hash_find_entry (&keywords, tp->string, strlen (tp->string),
1497 			       &keyword_value) == 0)
1498 	    {
1499 	      /* TODO: Shouldn't we use the shapes of the keyword, instead
1500 		 of hardwiring argnum1 = 1 ?
1501 	      const struct callshapes *shapes =
1502 		(const struct callshapes *) keyword_value;
1503 	      */
1504 	      struct callshapes shapes;
1505 	      shapes.keyword = tp->string; /* XXX storage duration? */
1506 	      shapes.keyword_len = strlen (tp->string);
1507 	      shapes.nshapes = 1;
1508 	      shapes.shapes[0].argnum1 = 1;
1509 	      shapes.shapes[0].argnum2 = 0;
1510 	      shapes.shapes[0].argnumc = 0;
1511 	      shapes.shapes[0].argnum1_glib_context = false;
1512 	      shapes.shapes[0].argnum2_glib_context = false;
1513 	      shapes.shapes[0].argtotal = 0;
1514 	      string_list_init (&shapes.shapes[0].xcomments);
1515 
1516 	      {
1517 		/* Extract a possible string from the key.  Before proceeding
1518 		   we check whether the open curly is followed by a symbol and
1519 		   then by a right curly.  */
1520 		flag_context_list_iterator_ty context_iter =
1521 		  flag_context_list_iterator (
1522 		    flag_context_list_table_lookup (
1523 		      flag_context_list_table,
1524 		      tp->string, strlen (tp->string)));
1525 		token_ty *t1 = x_perl_lex (mlp);
1526 
1527 #if DEBUG_PERL
1528 		fprintf (stderr, "%s:%d: extracting string key\n",
1529 			 real_file_name, line_number);
1530 #endif
1531 
1532 		if (t1->type == token_type_symbol
1533 		    || t1->type == token_type_named_op)
1534 		  {
1535 		    token_ty *t2 = x_perl_lex (mlp);
1536 		    if (t2->type == token_type_rbrace)
1537 		      {
1538 			flag_context_ty context;
1539 			lex_pos_ty pos;
1540 
1541 			context =
1542 			  inherited_context (null_context,
1543 					     flag_context_list_iterator_advance (
1544 					       &context_iter));
1545 
1546 			pos.line_number = line_number;
1547 			pos.file_name = logical_file_name;
1548 
1549 			xgettext_current_source_encoding = po_charset_utf8;
1550 			remember_a_message (mlp, NULL, xstrdup (t1->string),
1551 					    context, &pos, savable_comment);
1552 			xgettext_current_source_encoding = xgettext_global_source_encoding;
1553 			free_token (t2);
1554 			free_token (t1);
1555 		      }
1556 		    else
1557 		      {
1558 			x_perl_unlex (t2);
1559 		      }
1560 		  }
1561 		else
1562 		  {
1563 		    x_perl_unlex (t1);
1564 		    if (extract_balanced (mlp, token_type_rbrace, true, false,
1565 					  null_context, context_iter,
1566 					  1, arglist_parser_alloc (mlp, &shapes)))
1567 		      return;
1568 		  }
1569 	      }
1570 	    }
1571 	  else
1572 	    {
1573 	      phase2_ungetc (c);
1574 	    }
1575 	}
1576       else
1577 	{
1578 	  phase2_ungetc (c);
1579 	}
1580     }
1581 
1582   /* Now consume "->", "[...]", and "{...}".  */
1583   for (;;)
1584     {
1585       int c = phase2_getc ();
1586       int c2;
1587 
1588       switch (c)
1589 	{
1590 	case '{':
1591 #if DEBUG_PERL
1592 	  fprintf (stderr, "%s:%d: extracting balanced '{' after varname\n",
1593 		   real_file_name, line_number);
1594 #endif
1595 	  extract_balanced (mlp, token_type_rbrace, true, false,
1596 			    null_context, null_context_list_iterator,
1597 			    1, arglist_parser_alloc (mlp, NULL));
1598 	  break;
1599 
1600 	case '[':
1601 #if DEBUG_PERL
1602 	  fprintf (stderr, "%s:%d: extracting balanced '[' after varname\n",
1603 		   real_file_name, line_number);
1604 #endif
1605 	  extract_balanced (mlp, token_type_rbracket, true, false,
1606 			    null_context, null_context_list_iterator,
1607 			    1, arglist_parser_alloc (mlp, NULL));
1608 	  break;
1609 
1610 	case '-':
1611 	  c2 = phase1_getc ();
1612 	  if (c2 == '>')
1613 	    {
1614 #if DEBUG_PERL
1615 	      fprintf (stderr, "%s:%d: another \"->\" after varname\n",
1616 		       real_file_name, line_number);
1617 #endif
1618 	      break;
1619 	    }
1620 	  else if (c2 != '\n')
1621 	    {
1622 	      /* Discarding the newline is harmless here.  The only
1623 		 special character recognized after a minus is greater-than
1624 		 for dereference.  However, the sequence "-\n>" that we
1625 	         treat incorrectly here, is a syntax error.  */
1626 	      phase1_ungetc (c2);
1627 	    }
1628 	  /* FALLTHROUGH */
1629 
1630 	default:
1631 #if DEBUG_PERL
1632 	  fprintf (stderr, "%s:%d: variable finished\n",
1633 		   real_file_name, line_number);
1634 #endif
1635 	  phase2_ungetc (c);
1636 	  return;
1637 	}
1638     }
1639 }
1640 
1641 /* Actually a simplified version of extract_variable().  It searches for
1642    variables inside a double-quoted string that may interpolate to
1643    some keyword hash (reference).  The string is UTF-8 encoded.  */
1644 static void
interpolate_keywords(message_list_ty * mlp,const char * string,int lineno)1645 interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
1646 {
1647   static char *buffer;
1648   static int bufmax = 0;
1649   int bufpos = 0;
1650   flag_context_ty context;
1651   int c;
1652   bool maybe_hash_deref = false;
1653   enum parser_state
1654     {
1655       initial,
1656       one_dollar,
1657       two_dollars,
1658       identifier,
1659       minus,
1660       wait_lbrace,
1661       wait_quote,
1662       dquote,
1663       squote,
1664       barekey,
1665       wait_rbrace
1666     } state;
1667   token_ty token;
1668 
1669   lex_pos_ty pos;
1670 
1671   /* States are:
1672    *
1673    * initial:      initial
1674    * one_dollar:   dollar sign seen in state INITIAL
1675    * two_dollars:  another dollar-sign has been seen in state ONE_DOLLAR
1676    * identifier:   a valid identifier character has been seen in state
1677    *               ONE_DOLLAR or TWO_DOLLARS
1678    * minus:        a minus-sign has been seen in state IDENTIFIER
1679    * wait_lbrace:  a greater-than has been seen in state MINUS
1680    * wait_quote:   a left brace has been seen in state IDENTIFIER or in
1681    *               state WAIT_LBRACE
1682    * dquote:       a double-quote has been seen in state WAIT_QUOTE
1683    * squote:       a single-quote has been seen in state WAIT_QUOTE
1684    * barekey:      an bareword character has been seen in state WAIT_QUOTE
1685    * wait_rbrace:  closing quote has been seen in state DQUOTE or SQUOTE
1686    *
1687    * In the states initial...identifier the context is null_context; in the
1688    * states minus...wait_rbrace the context is the one suitable for the first
1689    * argument of the last seen identifier.
1690    */
1691   state = initial;
1692   context = null_context;
1693 
1694   token.type = token_type_string;
1695   token.sub_type = string_type_qq;
1696   token.line_number = line_number;
1697   pos.file_name = logical_file_name;
1698   pos.line_number = lineno;
1699 
1700   while ((c = (unsigned char) *string++) != '\0')
1701     {
1702       void *keyword_value;
1703 
1704       if (state == initial)
1705 	bufpos = 0;
1706 
1707       if (c == '\n')
1708 	lineno++;
1709 
1710       if (bufpos + 1 >= bufmax)
1711 	{
1712 	  bufmax = 2 * bufmax + 10;
1713 	  buffer = xrealloc (buffer, bufmax);
1714 	}
1715 
1716       switch (state)
1717 	{
1718 	case initial:
1719 	  switch (c)
1720 	    {
1721 	    case '\\':
1722 	      c = (unsigned char) *string++;
1723 	      if (c == '\0')
1724 		return;
1725 	      break;
1726 	    case '$':
1727 	      buffer[bufpos++] = '$';
1728 	      maybe_hash_deref = false;
1729 	      state = one_dollar;
1730 	      break;
1731 	    default:
1732 	      break;
1733 	    }
1734 	  break;
1735 	case one_dollar:
1736 	  switch (c)
1737 	    {
1738 	    case '$':
1739 	      /*
1740 	       * This is enough to make us believe later that we dereference
1741 	       * a hash reference.
1742 	       */
1743 	      maybe_hash_deref = true;
1744 	      state = two_dollars;
1745 	      break;
1746 	    default:
1747 	      if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1748 		  || (c >= 'A' && c <= 'Z')
1749 		  || (c >= 'a' && c <= 'z')
1750 		  || (c >= '0' && c <= '9'))
1751 		{
1752 		  buffer[bufpos++] = c;
1753 		  state = identifier;
1754 		}
1755 	      else
1756 		state = initial;
1757 	      break;
1758 	    }
1759 	  break;
1760 	case two_dollars:
1761 	  if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1762 	      || (c >= 'A' && c <= 'Z')
1763 	      || (c >= 'a' && c <= 'z')
1764 	      || (c >= '0' && c <= '9'))
1765 	    {
1766 	      buffer[bufpos++] = c;
1767 	      state = identifier;
1768 	    }
1769 	  else
1770 	    state = initial;
1771 	  break;
1772 	case identifier:
1773 	  switch (c)
1774 	    {
1775 	    case '-':
1776 	      if (hash_find_entry (&keywords, buffer, bufpos, &keyword_value)
1777 		  == 0)
1778 		{
1779 		  flag_context_list_iterator_ty context_iter =
1780 		    flag_context_list_iterator (
1781 		      flag_context_list_table_lookup (
1782 			flag_context_list_table,
1783 			buffer, bufpos));
1784 		  context =
1785 		    inherited_context (null_context,
1786 				       flag_context_list_iterator_advance (
1787 					 &context_iter));
1788 		  state = minus;
1789 		}
1790 	      else
1791 		state = initial;
1792 	      break;
1793 	    case '{':
1794 	      if (!maybe_hash_deref)
1795 		buffer[0] = '%';
1796 	      if (hash_find_entry (&keywords, buffer, bufpos, &keyword_value)
1797 		  == 0)
1798 		{
1799 		  flag_context_list_iterator_ty context_iter =
1800 		    flag_context_list_iterator (
1801 		      flag_context_list_table_lookup (
1802 			flag_context_list_table,
1803 			buffer, bufpos));
1804 		  context =
1805 		    inherited_context (null_context,
1806 				       flag_context_list_iterator_advance (
1807 					 &context_iter));
1808 		  state = wait_quote;
1809 		}
1810 	      else
1811 		state = initial;
1812 	      break;
1813 	    default:
1814 	      if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1815 		  || (c >= 'A' && c <= 'Z')
1816 		  || (c >= 'a' && c <= 'z')
1817 		  || (c >= '0' && c <= '9'))
1818 		{
1819 		  buffer[bufpos++] = c;
1820 		}
1821 	      else
1822 		state = initial;
1823 	      break;
1824 	    }
1825 	  break;
1826 	case minus:
1827 	  switch (c)
1828 	    {
1829 	    case '>':
1830 	      state = wait_lbrace;
1831 	      break;
1832 	    default:
1833 	      context = null_context;
1834 	      state = initial;
1835 	      break;
1836 	    }
1837 	  break;
1838 	case wait_lbrace:
1839 	  switch (c)
1840 	    {
1841 	    case '{':
1842 	      state = wait_quote;
1843 	      break;
1844 	    default:
1845 	      context = null_context;
1846 	      state = initial;
1847 	      break;
1848 	    }
1849 	  break;
1850 	case wait_quote:
1851 	  switch (c)
1852 	    {
1853 	    case_whitespace:
1854 	      break;
1855 	    case '\'':
1856 	      pos.line_number = lineno;
1857 	      bufpos = 0;
1858 	      state = squote;
1859 	      break;
1860 	    case '"':
1861 	      pos.line_number = lineno;
1862 	      bufpos = 0;
1863 	      state = dquote;
1864 	      break;
1865 	    default:
1866 	      if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80
1867 		  || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
1868 		{
1869 		  pos.line_number = lineno;
1870 		  bufpos = 0;
1871 		  buffer[bufpos++] = c;
1872 		  state = barekey;
1873 		}
1874 	      else
1875 		{
1876 		  context = null_context;
1877 		  state = initial;
1878 		}
1879 	      break;
1880 	    }
1881 	  break;
1882 	case dquote:
1883 	  switch (c)
1884 	    {
1885 	    case '"':
1886 	      /* The resulting string has to be interpolated twice.  */
1887 	      buffer[bufpos] = '\0';
1888 	      token.string = xstrdup (buffer);
1889 	      extract_quotelike_pass3 (&token, EXIT_FAILURE);
1890 	      /* The string can only shrink with interpolation (because
1891 		 we ignore \Q).  */
1892 	      if (!(strlen (token.string) <= bufpos))
1893 		abort ();
1894 	      strcpy (buffer, token.string);
1895 	      free (token.string);
1896 	      state = wait_rbrace;
1897 	      break;
1898 	    case '\\':
1899 	      if (string[0] == '\"')
1900 		{
1901 		  buffer[bufpos++] = string++[0];
1902 		}
1903 	      else if (string[0])
1904 		{
1905 		  buffer[bufpos++] = '\\';
1906 		  buffer[bufpos++] = string++[0];
1907 		}
1908 	      else
1909 		{
1910 		  context = null_context;
1911 		  state = initial;
1912 		}
1913 	      break;
1914 	    default:
1915 	      buffer[bufpos++] = c;
1916 	      break;
1917 	    }
1918 	  break;
1919 	case squote:
1920 	  switch (c)
1921 	    {
1922 	    case '\'':
1923 	      state = wait_rbrace;
1924 	      break;
1925 	    case '\\':
1926 	      if (string[0] == '\'')
1927 		{
1928 		  buffer[bufpos++] = string++[0];
1929 		}
1930 	      else if (string[0])
1931 		{
1932 		  buffer[bufpos++] = '\\';
1933 		  buffer[bufpos++] = string++[0];
1934 		}
1935 	      else
1936 		{
1937 		  context = null_context;
1938 		  state = initial;
1939 		}
1940 	      break;
1941 	    default:
1942 	      buffer[bufpos++] = c;
1943 	      break;
1944 	    }
1945 	  break;
1946 	case barekey:
1947 	  if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80
1948 	      || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
1949 	    {
1950 	      buffer[bufpos++] = c;
1951 	      break;
1952 	    }
1953 	  else if (is_whitespace (c))
1954 	    {
1955 	      state = wait_rbrace;
1956 	      break;
1957 	    }
1958 	  else if (c != '}')
1959 	    {
1960 	      context = null_context;
1961 	      state = initial;
1962 	      break;
1963 	    }
1964 	  /* Must be right brace.  */
1965 	  /* FALLTHROUGH */
1966 	case wait_rbrace:
1967 	  switch (c)
1968 	    {
1969 	    case_whitespace:
1970 	      break;
1971 	    case '}':
1972 	      buffer[bufpos] = '\0';
1973 	      token.string = xstrdup (buffer);
1974 	      extract_quotelike_pass3 (&token, EXIT_FAILURE);
1975 	      xgettext_current_source_encoding = po_charset_utf8;
1976 	      remember_a_message (mlp, NULL, token.string, context, &pos,
1977 				  savable_comment);
1978 	      xgettext_current_source_encoding = xgettext_global_source_encoding;
1979 	      /* FALLTHROUGH */
1980 	    default:
1981 	      context = null_context;
1982 	      state = initial;
1983 	      break;
1984 	    }
1985 	  break;
1986 	}
1987     }
1988 }
1989 
1990 /* The last token seen in the token stream.  This is important for the
1991    interpretation of '?' and '/'.  */
1992 static token_type_ty last_token;
1993 
1994 /* Combine characters into tokens.  Discard whitespace.  */
1995 
1996 static void
x_perl_prelex(message_list_ty * mlp,token_ty * tp)1997 x_perl_prelex (message_list_ty *mlp, token_ty *tp)
1998 {
1999   static char *buffer;
2000   static int bufmax;
2001   int bufpos;
2002   int c;
2003 
2004   for (;;)
2005     {
2006       c = phase2_getc ();
2007       tp->line_number = line_number;
2008 
2009       switch (c)
2010 	{
2011 	case EOF:
2012 	  tp->type = token_type_eof;
2013 	  return;
2014 
2015 	case '\n':
2016 	  if (last_non_comment_line > last_comment_line)
2017 	    savable_comment_reset ();
2018 	  /* FALLTHROUGH */
2019 	case '\t':
2020 	case ' ':
2021 	  /* Ignore whitespace.  */
2022 	  continue;
2023 
2024 	case '%':
2025 	case '@':
2026 	case '*':
2027 	case '$':
2028 	  if (!extract_all)
2029 	    {
2030 	      extract_variable (mlp, tp, c);
2031 	      prefer_division_over_regexp = true;
2032 	      return;
2033 	    }
2034 	  break;
2035 	}
2036 
2037       last_non_comment_line = tp->line_number;
2038 
2039       switch (c)
2040 	{
2041 	case '.':
2042 	  {
2043 	    int c2 = phase1_getc ();
2044 	    phase1_ungetc (c2);
2045 	    if (c2 == '.')
2046 	      {
2047 		tp->type = token_type_other;
2048 		prefer_division_over_regexp = false;
2049 		return;
2050 	      }
2051 	    else if (c2 >= '0' && c2 <= '9')
2052 	      {
2053 		prefer_division_over_regexp = false;
2054 	      }
2055 	    else
2056 	      {
2057 		tp->type = token_type_dot;
2058 		prefer_division_over_regexp = true;
2059 		return;
2060 	      }
2061 	  }
2062 	  /* FALLTHROUGH */
2063 	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2064 	case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
2065 	case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
2066 	case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
2067 	case 'Y': case 'Z':
2068 	case '_':
2069 	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2070 	case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2071 	case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2072 	case 's': case 't': case 'u': case 'v': case 'w': case 'x':
2073 	case 'y': case 'z':
2074 	case '0': case '1': case '2': case '3': case '4':
2075 	case '5': case '6': case '7': case '8': case '9':
2076 	  /* Symbol, or part of a number.  */
2077 	  prefer_division_over_regexp = true;
2078 	  bufpos = 0;
2079 	  for (;;)
2080 	    {
2081 	      if (bufpos >= bufmax)
2082 		{
2083 		  bufmax = 2 * bufmax + 10;
2084 		  buffer = xrealloc (buffer, bufmax);
2085 		}
2086 	      buffer[bufpos++] = c;
2087 	      c = phase1_getc ();
2088 	      switch (c)
2089 		{
2090 		case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2091 		case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
2092 		case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
2093 		case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
2094 		case 'Y': case 'Z':
2095 		case '_':
2096 		case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2097 		case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2098 		case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2099 		case 's': case 't': case 'u': case 'v': case 'w': case 'x':
2100 		case 'y': case 'z':
2101 		case '0': case '1': case '2': case '3': case '4':
2102 		case '5': case '6': case '7': case '8': case '9':
2103 		  continue;
2104 
2105 		default:
2106 		  phase1_ungetc (c);
2107 		  break;
2108 		}
2109 	      break;
2110 	    }
2111 	  if (bufpos >= bufmax)
2112 	    {
2113 	      bufmax = 2 * bufmax + 10;
2114 	      buffer = xrealloc (buffer, bufmax);
2115 	    }
2116 	  buffer[bufpos] = '\0';
2117 
2118 	  if (strcmp (buffer, "__END__") == 0
2119 	      || strcmp (buffer, "__DATA__") == 0)
2120 	    {
2121 	      end_of_file = true;
2122 	      tp->type = token_type_eof;
2123 	      return;
2124 	    }
2125 	  else if (strcmp (buffer, "and") == 0
2126 		   || strcmp (buffer, "cmp") == 0
2127 		   || strcmp (buffer, "eq") == 0
2128 		   || strcmp (buffer, "if") == 0
2129 		   || strcmp (buffer, "ge") == 0
2130 		   || strcmp (buffer, "gt") == 0
2131 		   || strcmp (buffer, "le") == 0
2132 		   || strcmp (buffer, "lt") == 0
2133 		   || strcmp (buffer, "ne") == 0
2134 		   || strcmp (buffer, "not") == 0
2135 		   || strcmp (buffer, "or") == 0
2136 		   || strcmp (buffer, "unless") == 0
2137 		   || strcmp (buffer, "while") == 0
2138 		   || strcmp (buffer, "xor") == 0)
2139 	    {
2140 	      tp->type = token_type_named_op;
2141 	      tp->string = xstrdup (buffer);
2142 	      prefer_division_over_regexp = false;
2143 	      return;
2144 	    }
2145 	  else if (strcmp (buffer, "s") == 0
2146 		 || strcmp (buffer, "y") == 0
2147 		 || strcmp (buffer, "tr") == 0)
2148 	    {
2149 	      int delim = phase1_getc ();
2150 
2151 	      while (is_whitespace (delim))
2152 		delim = phase2_getc ();
2153 
2154 	      if (delim == EOF)
2155 		{
2156 		  tp->type = token_type_eof;
2157 		  return;
2158 		}
2159 	      if ((delim >= '0' && delim <= '9')
2160 		  || (delim >= 'A' && delim <= 'Z')
2161 		  || (delim >= 'a' && delim <= 'z'))
2162 		{
2163 		  /* False positive.  */
2164 		  phase2_ungetc (delim);
2165 		  tp->type = token_type_symbol;
2166 		  tp->sub_type = symbol_type_none;
2167 		  tp->string = xstrdup (buffer);
2168 		  prefer_division_over_regexp = true;
2169 		  return;
2170 		}
2171 	      extract_triple_quotelike (mlp, tp, delim,
2172 					buffer[0] == 's' && delim != '\'');
2173 
2174 	      /* Eat the following modifiers.  */
2175 	      do
2176 		c = phase1_getc ();
2177 	      while (c >= 'a' && c <= 'z');
2178 	      phase1_ungetc (c);
2179 	      return;
2180 	    }
2181 	  else if (strcmp (buffer, "m") == 0)
2182 	    {
2183 	      int delim = phase1_getc ();
2184 
2185 	      while (is_whitespace (delim))
2186 		delim = phase2_getc ();
2187 
2188 	      if (delim == EOF)
2189 		{
2190 		  tp->type = token_type_eof;
2191 		  return;
2192 		}
2193 	      if ((delim >= '0' && delim <= '9')
2194 		  || (delim >= 'A' && delim <= 'Z')
2195 		  || (delim >= 'a' && delim <= 'z'))
2196 		{
2197 		  /* False positive.  */
2198 		  phase2_ungetc (delim);
2199 		  tp->type = token_type_symbol;
2200 		  tp->sub_type = symbol_type_none;
2201 		  tp->string = xstrdup (buffer);
2202 		  prefer_division_over_regexp = true;
2203 		  return;
2204 		}
2205 	      extract_quotelike (tp, delim);
2206 	      if (delim != '\'')
2207 		interpolate_keywords (mlp, tp->string, line_number);
2208 	      free (tp->string);
2209 	      tp->type = token_type_regex_op;
2210 	      prefer_division_over_regexp = true;
2211 
2212 	      /* Eat the following modifiers.  */
2213 	      do
2214 		c = phase1_getc ();
2215 	      while (c >= 'a' && c <= 'z');
2216 	      phase1_ungetc (c);
2217 	      return;
2218 	    }
2219 	  else if (strcmp (buffer, "qq") == 0
2220 		   || strcmp (buffer, "q") == 0
2221 		   || strcmp (buffer, "qx") == 0
2222 		   || strcmp (buffer, "qw") == 0
2223 		   || strcmp (buffer, "qr") == 0)
2224 	    {
2225 	      /* The qw (...) construct is not really a string but we
2226 		 can treat in the same manner and then pretend it is
2227 		 a symbol.  Rationale: Saying "qw (foo bar)" is the
2228 		 same as "my @list = ('foo', 'bar'); @list;".  */
2229 
2230 	      int delim = phase1_getc ();
2231 
2232 	      while (is_whitespace (delim))
2233 		delim = phase2_getc ();
2234 
2235 	      if (delim == EOF)
2236 		{
2237 		  tp->type = token_type_eof;
2238 		  return;
2239 		}
2240 	      prefer_division_over_regexp = true;
2241 
2242 	      if ((delim >= '0' && delim <= '9')
2243 		  || (delim >= 'A' && delim <= 'Z')
2244 		  || (delim >= 'a' && delim <= 'z'))
2245 		{
2246 		  /* False positive.  */
2247 		  phase2_ungetc (delim);
2248 		  tp->type = token_type_symbol;
2249 		  tp->sub_type = symbol_type_none;
2250 		  tp->string = xstrdup (buffer);
2251 		  prefer_division_over_regexp = true;
2252 		  return;
2253 		}
2254 
2255 	      extract_quotelike (tp, delim);
2256 
2257 	      switch (buffer[1])
2258 		{
2259 		case 'q':
2260 		case 'x':
2261 		  tp->type = token_type_string;
2262 		  tp->sub_type = string_type_qq;
2263 		  interpolate_keywords (mlp, tp->string, line_number);
2264 		  break;
2265 		case 'r':
2266 		  tp->type = token_type_regex_op;
2267 		  break;
2268 		case 'w':
2269 		  tp->type = token_type_symbol;
2270 		  tp->sub_type = symbol_type_none;
2271 		  break;
2272 		case '\0':
2273 		  tp->type = token_type_string;
2274 		  tp->sub_type = string_type_q;
2275 		  break;
2276 		default:
2277 		  abort ();
2278 		}
2279 	      return;
2280 	    }
2281 	  else if (strcmp (buffer, "grep") == 0
2282 		   || strcmp (buffer, "split") == 0)
2283 	    {
2284 	      prefer_division_over_regexp = false;
2285 	    }
2286 	  tp->type = token_type_symbol;
2287 	  tp->sub_type = (strcmp (buffer, "sub") == 0
2288 			  ? symbol_type_sub
2289 			  : symbol_type_none);
2290 	  tp->string = xstrdup (buffer);
2291 	  return;
2292 
2293 	case '"':
2294 	  prefer_division_over_regexp = true;
2295 	  extract_quotelike (tp, c);
2296 	  tp->sub_type = string_type_qq;
2297 	  interpolate_keywords (mlp, tp->string, line_number);
2298 	  return;
2299 
2300 	case '`':
2301 	  prefer_division_over_regexp = true;
2302 	  extract_quotelike (tp, c);
2303 	  tp->sub_type = string_type_qq;
2304 	  interpolate_keywords (mlp, tp->string, line_number);
2305 	  return;
2306 
2307 	case '\'':
2308 	  prefer_division_over_regexp = true;
2309 	  extract_quotelike (tp, c);
2310 	  tp->sub_type = string_type_q;
2311 	  return;
2312 
2313 	case '(':
2314 	  c = phase2_getc ();
2315 	  if (c == ')')
2316 	    /* Ignore empty list.  */
2317 	    continue;
2318 	  else
2319 	    phase2_ungetc (c);
2320 	  tp->type = token_type_lparen;
2321 	  prefer_division_over_regexp = false;
2322 	  return;
2323 
2324 	case ')':
2325 	  tp->type = token_type_rparen;
2326 	  prefer_division_over_regexp = true;
2327 	  return;
2328 
2329 	case '{':
2330 	  tp->type = token_type_lbrace;
2331 	  prefer_division_over_regexp = false;
2332 	  return;
2333 
2334 	case '}':
2335 	  tp->type = token_type_rbrace;
2336 	  prefer_division_over_regexp = false;
2337 	  return;
2338 
2339 	case '[':
2340 	  tp->type = token_type_lbracket;
2341 	  prefer_division_over_regexp = false;
2342 	  return;
2343 
2344 	case ']':
2345 	  tp->type = token_type_rbracket;
2346 	  prefer_division_over_regexp = false;
2347 	  return;
2348 
2349 	case ';':
2350 	  tp->type = token_type_semicolon;
2351 	  prefer_division_over_regexp = false;
2352 	  return;
2353 
2354 	case ',':
2355 	  tp->type = token_type_comma;
2356 	  prefer_division_over_regexp = false;
2357 	  return;
2358 
2359 	case '=':
2360 	  /* Check for fat comma.  */
2361 	  c = phase1_getc ();
2362 	  if (c == '>')
2363 	    {
2364 	      tp->type = token_type_fat_comma;
2365 	      return;
2366 	    }
2367 	  else if (linepos == 2
2368 		   && (last_token == token_type_semicolon
2369 		       || last_token == token_type_rbrace)
2370 		   && ((c >= 'A' && c <='Z')
2371 		       || (c >= 'a' && c <= 'z')))
2372 	    {
2373 #if DEBUG_PERL
2374 	      fprintf (stderr, "%s:%d: start pod section\n",
2375 		       real_file_name, line_number);
2376 #endif
2377 	      skip_pod ();
2378 #if DEBUG_PERL
2379 	      fprintf (stderr, "%s:%d: end pod section\n",
2380 		       real_file_name, line_number);
2381 #endif
2382 	      continue;
2383 	    }
2384 	  phase1_ungetc (c);
2385 	  tp->type = token_type_other;
2386 	  prefer_division_over_regexp = false;
2387 	  return;
2388 
2389 	case '<':
2390 	  /* Check for <<EOF and friends.  */
2391 	  prefer_division_over_regexp = false;
2392 	  c = phase1_getc ();
2393 	  if (c == '<')
2394 	    {
2395 	      c = phase1_getc ();
2396 	      if (c == '\'')
2397 		{
2398 		  char *string;
2399 		  extract_quotelike (tp, c);
2400 		  string = get_here_document (tp->string);
2401 		  free (tp->string);
2402 		  tp->string = string;
2403 		  tp->type = token_type_string;
2404 		  tp->sub_type = string_type_verbatim;
2405 		  tp->line_number = line_number + 1;
2406 		  return;
2407 		}
2408 	      else if (c == '"')
2409 		{
2410 		  char *string;
2411 		  extract_quotelike (tp, c);
2412 		  string = get_here_document (tp->string);
2413 		  free (tp->string);
2414 		  tp->string = string;
2415 		  tp->type = token_type_string;
2416 		  tp->sub_type = string_type_qq;
2417 		  tp->line_number = line_number + 1;
2418 		  interpolate_keywords (mlp, tp->string, line_number + 1);
2419 		  return;
2420 		}
2421 	      else if ((c >= 'A' && c <= 'Z')
2422 		       || (c >= 'a' && c <= 'z')
2423 		       || c == '_')
2424 		{
2425 		  bufpos = 0;
2426 		  while ((c >= 'A' && c <= 'Z')
2427 			 || (c >= 'a' && c <= 'z')
2428 			 || (c >= '0' && c <= '9')
2429 			 || c == '_' || c >= 0x80)
2430 		    {
2431 		      if (bufpos >= bufmax)
2432 			{
2433 			  bufmax = 2 * bufmax + 10;
2434 			  buffer = xrealloc (buffer, bufmax);
2435 			}
2436 		      buffer[bufpos++] = c;
2437 		      c = phase1_getc ();
2438 		    }
2439 		  if (c == EOF)
2440 		    {
2441 		      tp->type = token_type_eof;
2442 		      return;
2443 		    }
2444 		  else
2445 		    {
2446 		      char *string;
2447 		      phase1_ungetc (c);
2448 		      if (bufpos >= bufmax)
2449 			{
2450 			  bufmax = 2 * bufmax + 10;
2451 			  buffer = xrealloc (buffer, bufmax);
2452 			}
2453 		      buffer[bufpos++] = '\0';
2454 		      string = get_here_document (buffer);
2455 		      tp->string = string;
2456 		      tp->type = token_type_string;
2457 		      tp->sub_type = string_type_qq;
2458 		      tp->line_number = line_number + 1;
2459 		      interpolate_keywords (mlp, tp->string, line_number + 1);
2460 		      return;
2461 		    }
2462 		}
2463 	      else
2464 		{
2465 		  tp->type = token_type_other;
2466 		  return;
2467 		}
2468 	    }
2469 	  else
2470 	    {
2471 	      phase1_ungetc (c);
2472 	      tp->type = token_type_other;
2473 	    }
2474 	  return;  /* End of case '>'.  */
2475 
2476 	case '-':
2477 	  /* Check for dereferencing operator.  */
2478 	  c = phase1_getc ();
2479 	  if (c == '>')
2480 	    {
2481 	      tp->type = token_type_dereference;
2482 	      return;
2483 	    }
2484 	  else if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
2485 	    {
2486 	      /* One of the -X (filetest) functions.  We play safe
2487 		 and accept all alphabetical characters here.  */
2488 	      tp->type = token_type_other;
2489 	      return;
2490 	    }
2491 	  phase1_ungetc (c);
2492 	  tp->type = token_type_other;
2493 	  prefer_division_over_regexp = false;
2494 	  return;
2495 
2496 	case '/':
2497 	case '?':
2498 	  if (!prefer_division_over_regexp)
2499 	    {
2500 	      extract_quotelike (tp, c);
2501 	      interpolate_keywords (mlp, tp->string, line_number);
2502 	      free (tp->string);
2503 	      tp->type = token_type_other;
2504 	      prefer_division_over_regexp = true;
2505 	      /* Eat the following modifiers.  */
2506 	      do
2507 		c = phase1_getc ();
2508 	      while (c >= 'a' && c <= 'z');
2509 	      phase1_ungetc (c);
2510 	      return;
2511 	    }
2512 	  /* FALLTHROUGH */
2513 
2514 	default:
2515 	  /* We could carefully recognize each of the 2 and 3 character
2516 	     operators, but it is not necessary, as we only need to recognize
2517 	     gettext invocations.  Don't bother.  */
2518 	  tp->type = token_type_other;
2519 	  prefer_division_over_regexp = false;
2520 	  return;
2521 	}
2522     }
2523 }
2524 
2525 
2526 /* A token stack used as a lookahead buffer.  */
2527 
2528 typedef struct token_stack_ty token_stack_ty;
2529 struct token_stack_ty
2530 {
2531   token_ty **items;
2532   size_t nitems;
2533   size_t nitems_max;
2534 };
2535 
2536 static struct token_stack_ty token_stack;
2537 
2538 #if DEBUG_PERL
2539 /* Dumps all resources allocated by stack STACK.  */
2540 static int
token_stack_dump(token_stack_ty * stack)2541 token_stack_dump (token_stack_ty *stack)
2542 {
2543   size_t i;
2544 
2545   fprintf (stderr, "BEGIN STACK DUMP\n");
2546   for (i = 0; i < stack->nitems; i++)
2547     {
2548       token_ty *token = stack->items[i];
2549       fprintf (stderr, "  [%s]\n", token2string (token));
2550       switch (token->type)
2551 	{
2552 	case token_type_named_op:
2553 	case token_type_string:
2554 	case token_type_symbol:
2555 	case token_type_variable:
2556 	  fprintf (stderr, "    string: %s\n", token->string);
2557 	  break;
2558 	}
2559     }
2560   fprintf (stderr, "END STACK DUMP\n");
2561   return 0;
2562 }
2563 #endif
2564 
2565 /* Pushes the token TOKEN onto the stack STACK.  */
2566 static inline void
token_stack_push(token_stack_ty * stack,token_ty * token)2567 token_stack_push (token_stack_ty *stack, token_ty *token)
2568 {
2569   if (stack->nitems >= stack->nitems_max)
2570     {
2571       size_t nbytes;
2572 
2573       stack->nitems_max = 2 * stack->nitems_max + 4;
2574       nbytes = stack->nitems_max * sizeof (token_ty *);
2575       stack->items = xrealloc (stack->items, nbytes);
2576     }
2577   stack->items[stack->nitems++] = token;
2578 }
2579 
2580 /* Pops the most recently pushed token from the stack STACK and returns it.
2581    Returns NULL if the stack is empty.  */
2582 static inline token_ty *
token_stack_pop(token_stack_ty * stack)2583 token_stack_pop (token_stack_ty *stack)
2584 {
2585   if (stack->nitems > 0)
2586     return stack->items[--(stack->nitems)];
2587   else
2588     return NULL;
2589 }
2590 
2591 /* Return the top of the stack without removing it from the stack, or
2592    NULL if the stack is empty.  */
2593 static inline token_ty *
token_stack_peek(const token_stack_ty * stack)2594 token_stack_peek (const token_stack_ty *stack)
2595 {
2596   if (stack->nitems > 0)
2597     return stack->items[stack->nitems - 1];
2598   else
2599     return NULL;
2600 }
2601 
2602 /* Frees all resources allocated by stack STACK.  */
2603 static inline void
token_stack_free(token_stack_ty * stack)2604 token_stack_free (token_stack_ty *stack)
2605 {
2606   size_t i;
2607 
2608   for (i = 0; i < stack->nitems; i++)
2609     free_token (stack->items[i]);
2610   free (stack->items);
2611 }
2612 
2613 
2614 static token_ty *
x_perl_lex(message_list_ty * mlp)2615 x_perl_lex (message_list_ty *mlp)
2616 {
2617 #if DEBUG_PERL
2618   int dummy = token_stack_dump (&token_stack);
2619 #endif
2620   token_ty *tp = token_stack_pop (&token_stack);
2621 
2622   if (!tp)
2623     {
2624       tp = (token_ty *) xmalloc (sizeof (token_ty));
2625       x_perl_prelex (mlp, tp);
2626 #if DEBUG_PERL
2627       fprintf (stderr, "%s:%d: x_perl_prelex returned %s\n",
2628 	       real_file_name, line_number, token2string (tp));
2629 #endif
2630     }
2631 #if DEBUG_PERL
2632   else
2633     {
2634       fprintf (stderr, "%s:%d: %s recycled from stack\n",
2635 	       real_file_name, line_number, token2string (tp));
2636     }
2637 #endif
2638 
2639   /* A symbol followed by a fat comma is really a single-quoted string.
2640      Function definitions or forward declarations also need a special
2641      handling because the dollars and at signs inside the parentheses
2642      must not be interpreted as the beginning of a variable ')'.  */
2643   if (tp->type == token_type_symbol || tp->type == token_type_named_op)
2644     {
2645       token_ty *next = token_stack_peek (&token_stack);
2646 
2647       if (!next)
2648 	{
2649 #if DEBUG_PERL
2650 	  fprintf (stderr, "%s:%d: pre-fetching next token\n",
2651 		   real_file_name, line_number);
2652 #endif
2653 	  next = x_perl_lex (mlp);
2654 	  x_perl_unlex (next);
2655 #if DEBUG_PERL
2656 	  fprintf (stderr, "%s:%d: unshifted next token\n",
2657 		   real_file_name, line_number);
2658 #endif
2659 	}
2660 
2661 #if DEBUG_PERL
2662       fprintf (stderr, "%s:%d: next token is %s\n",
2663 	       real_file_name, line_number, token2string (next));
2664 #endif
2665 
2666       if (next->type == token_type_fat_comma)
2667 	{
2668 	  tp->type = token_type_string;
2669 	  tp->sub_type = string_type_q;
2670 #if DEBUG_PERL
2671 	  fprintf (stderr,
2672 		   "%s:%d: token %s mutated to token_type_string\n",
2673 		   real_file_name, line_number, token2string (tp));
2674 #endif
2675 	}
2676       else if (tp->type == token_type_symbol && tp->sub_type == symbol_type_sub
2677 	       && next->type == token_type_symbol)
2678         {
2679 	  /* Start of a function declaration or definition.  Mark this
2680 	     symbol as a function name, so that we can later eat up
2681 	     possible prototype information.  */
2682 #if DEBUG_PERL
2683 	  fprintf (stderr, "%s:%d: subroutine declaration/definition '%s'\n",
2684 		   real_file_name, line_number, next->string);
2685 #endif
2686 	  next->sub_type = symbol_type_function;
2687 	}
2688       else if (tp->type == token_type_symbol
2689 	       && (tp->sub_type == symbol_type_sub
2690 		   || tp->sub_type == symbol_type_function)
2691 	       && next->type == token_type_lparen)
2692         {
2693 	  /* For simplicity we simply consume everything up to the
2694 	     closing parenthesis.  Actually only a limited set of
2695 	     characters is allowed inside parentheses but we leave
2696 	     complaints to the interpreter and are prepared for
2697 	     future extensions to the Perl syntax.  */
2698 	  int c;
2699 
2700 #if DEBUG_PERL
2701 	  fprintf (stderr, "%s:%d: consuming prototype information\n",
2702 		   real_file_name, line_number);
2703 #endif
2704 
2705 	  do
2706 	    {
2707 	      c = phase1_getc ();
2708 #if DEBUG_PERL
2709 	      fprintf (stderr, "  consuming character '%c'\n", c);
2710 #endif
2711 	    }
2712 	  while (c != EOF && c != ')');
2713 	  phase1_ungetc (c);
2714 	}
2715     }
2716 
2717   return tp;
2718 }
2719 
2720 static void
x_perl_unlex(token_ty * tp)2721 x_perl_unlex (token_ty *tp)
2722 {
2723   token_stack_push (&token_stack, tp);
2724 }
2725 
2726 
2727 /* ========================= Extracting strings.  ========================== */
2728 
2729 /* Assuming TP is a string token, this function accumulates all subsequent
2730    . string2 . string3 ... to the string.  (String concatenation.)  */
2731 
2732 static char *
collect_message(message_list_ty * mlp,token_ty * tp,int error_level)2733 collect_message (message_list_ty *mlp, token_ty *tp, int error_level)
2734 {
2735   char *string;
2736   size_t len;
2737 
2738   extract_quotelike_pass3 (tp, error_level);
2739   string = xstrdup (tp->string);
2740   len = strlen (tp->string) + 1;
2741 
2742   for (;;)
2743     {
2744       int c;
2745 
2746       do
2747 	c = phase2_getc ();
2748       while (is_whitespace (c));
2749 
2750       if (c != '.')
2751 	{
2752 	  phase2_ungetc (c);
2753 	  return string;
2754 	}
2755 
2756       do
2757 	c = phase2_getc ();
2758       while (is_whitespace (c));
2759 
2760       phase2_ungetc (c);
2761 
2762       if (c == '"' || c == '\'' || c == '`'
2763 	  || (!prefer_division_over_regexp && (c == '/' || c == '?'))
2764 	  || c == 'q')
2765 	{
2766 	  token_ty *qstring = x_perl_lex (mlp);
2767 	  if (qstring->type != token_type_string)
2768 	    {
2769 	      /* assert (qstring->type == token_type_symbol) */
2770 	      x_perl_unlex (qstring);
2771 	      return string;
2772 	    }
2773 
2774 	  extract_quotelike_pass3 (qstring, error_level);
2775 	  len += strlen (qstring->string);
2776 	  string = xrealloc (string, len);
2777 	  strcat (string, qstring->string);
2778 	  free_token (qstring);
2779 	}
2780     }
2781 }
2782 
2783 /* The file is broken into tokens.  Scan the token stream, looking for
2784    a keyword, followed by a left paren, followed by a string.  When we
2785    see this sequence, we have something to remember.  We assume we are
2786    looking at a valid Perl program, and leave the complaints about
2787    the grammar to the compiler.
2788 
2789      Normal handling: Look for
2790        keyword ( ... msgid ... )
2791      Plural handling: Look for
2792        keyword ( ... msgid ... msgid_plural ... )
2793 
2794    We use recursion because the arguments before msgid or between msgid
2795    and msgid_plural can contain subexpressions of the same form.
2796 
2797    In Perl, parentheses around function arguments can be omitted.
2798 
2799    The general rules are:
2800      1) Functions declared with a prototype take exactly the specified number
2801         of arguments.
2802           sub one_arg ($) { ... }
2803           sub two_args ($$) { ... }
2804      2) When a function name is immediately followed by an opening parenthesis,
2805         the argument list ends at the corresponding closing parenthesis.
2806 
2807    If rule 1 and rule 2 are contradictory, i.e. when the program calls a
2808    function with an explicit argument list and the wrong number of arguments,
2809    the program is invalid:
2810      sub two_args ($$) { ... }
2811      foo two_args (x), y             - invalid due to rules 1 and 2
2812 
2813    Ambiguities are resolved as follows:
2814      3) Some built-ins, such as 'abs', 'sqrt', 'sin', 'cos', ..., and functions
2815         declared with a prototype of exactly one argument take exactly one
2816         argument:
2817           foo sin x, y  ==>  foo (sin (x), y)
2818           sub one_arg ($) { ... }
2819           foo one_arg x, y, z  ==>  foo (one_arg (x), y, z)
2820      4) Other identifiers, if not immediately followed by an opening
2821         parenthesis, consume the entire remaining argument list:
2822           foo bar x, y  ==>  foo (bar (x, y))
2823           sub two_args ($$) { ... }
2824           foo two_args x, y  ==>  foo (two_args (x, y))
2825 
2826    Other series of comma separated expressions without a function name at
2827    the beginning are comma expressions:
2828           sub two_args ($$) { ... }
2829           foo two_args x, (y, z)  ==>  foo (two_args (x, (y, z)))
2830    Note that the evaluation of comma expressions returns a list of values
2831    when in list context (e.g. inside the argument list of a function without
2832    prototype) but only one value when inside the argument list of a function
2833    with a prototype:
2834           sub print3 ($$$) { print @_ }
2835           print3 5, (6, 7), 8  ==>  578
2836           print 5, (6, 7), 8  ==>  5678
2837 
2838    Where rule 3 or 4 contradict rule 1 or 2, the program is invalid:
2839      sin (x, y)                      - invalid due to rules 2 and 3
2840      sub one_arg ($) { ... }
2841      one_arg (x, y)                  - invalid due to rules 2 and 3
2842      sub two_args ($$) { ... }
2843      foo two_args x, y, z            - invalid due to rules 1 and 4
2844  */
2845 
2846 /* Extract messages until the next balanced closing parenthesis.
2847    Extracted messages are added to MLP.
2848 
2849    DELIM can be either token_type_rbrace, token_type_rbracket,
2850    token_type_rparen.  Additionally, if COMMA_DELIM is true, parsing
2851    stops at the next comma outside parentheses.
2852 
2853    ARG is the current argument list position, starts with 1.
2854    ARGPARSER is the corresponding argument list parser.
2855 
2856    Returns true for EOF, false otherwise.  */
2857 
2858 static bool
extract_balanced(message_list_ty * mlp,token_type_ty delim,bool eat_delim,bool comma_delim,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,int arg,struct arglist_parser * argparser)2859 extract_balanced (message_list_ty *mlp,
2860 		  token_type_ty delim, bool eat_delim, bool comma_delim,
2861 		  flag_context_ty outer_context,
2862 		  flag_context_list_iterator_ty context_iter,
2863 		  int arg, struct arglist_parser *argparser)
2864 {
2865   /* Whether to implicitly assume the next tokens are arguments even without
2866      a '('.  */
2867   bool next_is_argument = false;
2868   /* Parameters of the keyword just seen.  Defined only when next_is_argument
2869      is true.  */
2870   const struct callshapes *next_shapes = NULL;
2871   struct arglist_parser *next_argparser = NULL;
2872 
2873   /* Whether to not consider strings until the next comma.  */
2874   bool skip_until_comma = false;
2875 
2876   /* Context iterator that will be used if the next token is a '('.  */
2877   flag_context_list_iterator_ty next_context_iter =
2878     passthrough_context_list_iterator;
2879   /* Current context.  */
2880   flag_context_ty inner_context =
2881     inherited_context (outer_context,
2882 		       flag_context_list_iterator_advance (&context_iter));
2883 
2884 #if DEBUG_PERL
2885   static int nesting_level = 0;
2886 
2887   ++nesting_level;
2888 #endif
2889 
2890   last_token = token_type_semicolon;  /* Safe assumption.  */
2891   prefer_division_over_regexp = false;
2892 
2893   for (;;)
2894     {
2895       /* The current token.  */
2896       token_ty *tp;
2897 
2898       tp = x_perl_lex (mlp);
2899 
2900       last_token = tp->type;
2901 
2902       if (delim == tp->type)
2903 	{
2904 	  xgettext_current_source_encoding = po_charset_utf8;
2905 	  arglist_parser_done (argparser, arg);
2906 	  xgettext_current_source_encoding = xgettext_global_source_encoding;
2907 	  if (next_argparser != NULL)
2908 	    free (next_argparser);
2909 #if DEBUG_PERL
2910 	  fprintf (stderr, "%s:%d: extract_balanced finished (%d)\n",
2911 		   logical_file_name, tp->line_number, --nesting_level);
2912 #endif
2913 	  if (eat_delim)
2914 	    free_token (tp);
2915 	  else
2916 	    /* Preserve the delimiter for the caller.  */
2917 	    x_perl_unlex (tp);
2918 	  return false;
2919 	}
2920 
2921       if (comma_delim && tp->type == token_type_comma)
2922 	{
2923 	  xgettext_current_source_encoding = po_charset_utf8;
2924 	  arglist_parser_done (argparser, arg);
2925 	  xgettext_current_source_encoding = xgettext_global_source_encoding;
2926 	  if (next_argparser != NULL)
2927 	    free (next_argparser);
2928 #if DEBUG_PERL
2929 	  fprintf (stderr, "%s:%d: extract_balanced finished at comma (%d)\n",
2930 		   logical_file_name, tp->line_number, --nesting_level);
2931 #endif
2932 	  x_perl_unlex (tp);
2933 	  return false;
2934 	}
2935 
2936       if (next_is_argument && tp->type != token_type_lparen)
2937 	{
2938 	  /* An argument list starts, even though there is no '('.  */
2939 	  bool next_comma_delim;
2940 
2941 	  x_perl_unlex (tp);
2942 
2943 	  if (next_shapes != NULL)
2944 	    /* We know something about the function being called.  Assume
2945 	       that it consumes only one argument if no argument number or
2946 	       total > 1 is specified.  */
2947 	    {
2948 	      size_t i;
2949 
2950 	      next_comma_delim = true;
2951 	      for (i = 0; i < next_shapes->nshapes; i++)
2952 		{
2953 		  const struct callshape *shape = &next_shapes->shapes[i];
2954 
2955 		  if (shape->argnum1 > 1
2956 		      || shape->argnum2 > 1
2957 		      || shape->argnumc > 1
2958 		      || shape->argtotal > 1)
2959 		    next_comma_delim = false;
2960 		}
2961 	    }
2962 	  else
2963 	    /* We know nothing about the function being called.  It could be
2964 	       a function prototyped to take only one argument, or on the other
2965 	       hand it could be prototyped to take more than one argument or an
2966 	       arbitrary argument list or it could be unprototyped.  Due to
2967 	       the way the parser works, assuming the first case gives the
2968 	       best results.  */
2969 	    next_comma_delim = true;
2970 
2971 	  if (extract_balanced (mlp, delim, false, next_comma_delim,
2972 				inner_context, next_context_iter,
2973 				1, next_argparser))
2974 	    {
2975 	      xgettext_current_source_encoding = po_charset_utf8;
2976 	      arglist_parser_done (argparser, arg);
2977 	      xgettext_current_source_encoding = xgettext_global_source_encoding;
2978 	      return true;
2979 	    }
2980 
2981 	  next_is_argument = false;
2982 	  next_argparser = NULL;
2983 	  next_context_iter = null_context_list_iterator;
2984 	  continue;
2985 	}
2986 
2987       switch (tp->type)
2988 	{
2989 	case token_type_symbol:
2990 #if DEBUG_PERL
2991 	  fprintf (stderr, "%s:%d: type symbol (%d) \"%s\"\n",
2992 		   logical_file_name, tp->line_number, nesting_level,
2993 		   tp->string);
2994 #endif
2995 
2996 	  {
2997 	    void *keyword_value;
2998 
2999 	    if (hash_find_entry (&keywords, tp->string, strlen (tp->string),
3000 				 &keyword_value) == 0)
3001 	      {
3002 		const struct callshapes *shapes =
3003 		  (const struct callshapes *) keyword_value;
3004 
3005 		last_token = token_type_keyword_symbol;
3006 		next_shapes = shapes;
3007 		next_argparser = arglist_parser_alloc (mlp, shapes);
3008 	      }
3009 	    else
3010 	      {
3011 		next_shapes = NULL;
3012 		next_argparser = arglist_parser_alloc (mlp, NULL);
3013 	      }
3014 	  }
3015 	  next_is_argument = true;
3016 	  next_context_iter =
3017 	    flag_context_list_iterator (
3018 	      flag_context_list_table_lookup (
3019 		flag_context_list_table,
3020 		tp->string, strlen (tp->string)));
3021 	  break;
3022 
3023 	case token_type_variable:
3024 #if DEBUG_PERL
3025 	  fprintf (stderr, "%s:%d: type variable (%d) \"%s\"\n",
3026 		   logical_file_name, tp->line_number, nesting_level, tp->string);
3027 #endif
3028 	  prefer_division_over_regexp = true;
3029 	  next_is_argument = false;
3030 	  if (next_argparser != NULL)
3031 	    free (next_argparser);
3032 	  next_argparser = NULL;
3033 	  next_context_iter = null_context_list_iterator;
3034 	  break;
3035 
3036 	case token_type_lparen:
3037 #if DEBUG_PERL
3038 	  fprintf (stderr, "%s:%d: type left parenthesis (%d)\n",
3039 		   logical_file_name, tp->line_number, nesting_level);
3040 #endif
3041 	  if (next_is_argument)
3042 	    {
3043 	      /* Parse the argument list of a function call.  */
3044 	      if (extract_balanced (mlp, token_type_rparen, true, false,
3045 				    inner_context, next_context_iter,
3046 				    1, next_argparser))
3047 		{
3048 		  xgettext_current_source_encoding = po_charset_utf8;
3049 		  arglist_parser_done (argparser, arg);
3050 		  xgettext_current_source_encoding = xgettext_global_source_encoding;
3051 		  return true;
3052 		}
3053 	      next_is_argument = false;
3054 	      next_argparser = NULL;
3055 	    }
3056 	  else
3057 	    {
3058 	      /* Parse a parenthesized expression or comma expression.  */
3059 	      if (extract_balanced (mlp, token_type_rparen, true, false,
3060 				    inner_context, next_context_iter,
3061 				    arg, arglist_parser_clone (argparser)))
3062 		{
3063 		  xgettext_current_source_encoding = po_charset_utf8;
3064 		  arglist_parser_done (argparser, arg);
3065 		  xgettext_current_source_encoding = xgettext_global_source_encoding;
3066 		  if (next_argparser != NULL)
3067 		    free (next_argparser);
3068 		  free_token (tp);
3069 		  return true;
3070 		}
3071 	      next_is_argument = false;
3072 	      if (next_argparser != NULL)
3073 		free (next_argparser);
3074 	      next_argparser = NULL;
3075 	    }
3076 	  skip_until_comma = true;
3077 	  next_context_iter = null_context_list_iterator;
3078 	  break;
3079 
3080 	case token_type_rparen:
3081 #if DEBUG_PERL
3082 	  fprintf (stderr, "%s:%d: type right parenthesis (%d)\n",
3083 		   logical_file_name, tp->line_number, nesting_level);
3084 #endif
3085 	  next_is_argument = false;
3086 	  if (next_argparser != NULL)
3087 	    free (next_argparser);
3088 	  next_argparser = NULL;
3089 	  skip_until_comma = true;
3090 	  next_context_iter = null_context_list_iterator;
3091 	  break;
3092 
3093 	case token_type_comma:
3094 	case token_type_fat_comma:
3095 #if DEBUG_PERL
3096 	  fprintf (stderr, "%s:%d: type comma (%d)\n",
3097 		   logical_file_name, tp->line_number, nesting_level);
3098 #endif
3099 	  if (arglist_parser_decidedp (argparser, arg))
3100 	    {
3101 	      /* We have missed the argument.  */
3102 	      xgettext_current_source_encoding = po_charset_utf8;
3103 	      arglist_parser_done (argparser, arg);
3104 	      xgettext_current_source_encoding = xgettext_global_source_encoding;
3105 	      argparser = arglist_parser_alloc (mlp, NULL);
3106 	      arg = 0;
3107 	    }
3108 	  arg++;
3109 #if DEBUG_PERL
3110 	  fprintf (stderr, "%s:%d: arg: %d\n",
3111 		   real_file_name, tp->line_number, arg);
3112 #endif
3113 	  inner_context =
3114 	    inherited_context (outer_context,
3115 			       flag_context_list_iterator_advance (
3116 				 &context_iter));
3117 	  next_is_argument = false;
3118 	  if (next_argparser != NULL)
3119 	    free (next_argparser);
3120 	  next_argparser = NULL;
3121 	  skip_until_comma = false;
3122 	  next_context_iter = passthrough_context_list_iterator;
3123 	  break;
3124 
3125 	case token_type_string:
3126 #if DEBUG_PERL
3127 	  fprintf (stderr, "%s:%d: type string (%d): \"%s\"\n",
3128 		   logical_file_name, tp->line_number, nesting_level,
3129 		   tp->string);
3130 #endif
3131 
3132 	  if (extract_all)
3133 	    {
3134 	      char *string = collect_message (mlp, tp, EXIT_SUCCESS);
3135 	      lex_pos_ty pos;
3136 
3137 	      pos.file_name = logical_file_name;
3138 	      pos.line_number = tp->line_number;
3139 	      xgettext_current_source_encoding = po_charset_utf8;
3140 	      remember_a_message (mlp, NULL, string, inner_context, &pos, savable_comment);
3141 	      xgettext_current_source_encoding = xgettext_global_source_encoding;
3142 	    }
3143 	  else if (!skip_until_comma)
3144 	    {
3145 	      /* Need to collect the complete string, with error checking,
3146 		 only if the argument ARG is used in ARGPARSER.  */
3147 	      bool must_collect = false;
3148 	      {
3149 		size_t nalternatives = argparser->nalternatives;
3150 		size_t i;
3151 
3152 		for (i = 0; i < nalternatives; i++)
3153 		  {
3154 		    struct partial_call *cp = &argparser->alternative[i];
3155 
3156 		    if (arg == cp->argnumc
3157 			|| arg == cp->argnum1 || arg == cp->argnum2)
3158 		      must_collect = true;
3159 		  }
3160 	      }
3161 
3162 	      if (must_collect)
3163 		{
3164 		  char *string = collect_message (mlp, tp, EXIT_FAILURE);
3165 
3166 		  xgettext_current_source_encoding = po_charset_utf8;
3167 		  arglist_parser_remember (argparser, arg,
3168 					   string, inner_context,
3169 					   logical_file_name, tp->line_number,
3170 					   savable_comment);
3171 		  xgettext_current_source_encoding = xgettext_global_source_encoding;
3172 		}
3173 	    }
3174 
3175 	  if (arglist_parser_decidedp (argparser, arg))
3176 	    {
3177 	      xgettext_current_source_encoding = po_charset_utf8;
3178 	      arglist_parser_done (argparser, arg);
3179 	      xgettext_current_source_encoding = xgettext_global_source_encoding;
3180 	      argparser = arglist_parser_alloc (mlp, NULL);
3181 	    }
3182 
3183 	  next_is_argument = false;
3184 	  if (next_argparser != NULL)
3185 	    free (next_argparser);
3186 	  next_argparser = NULL;
3187 	  next_context_iter = null_context_list_iterator;
3188 	  break;
3189 
3190 	case token_type_eof:
3191 #if DEBUG_PERL
3192 	  fprintf (stderr, "%s:%d: type EOF (%d)\n",
3193 		   logical_file_name, tp->line_number, nesting_level);
3194 #endif
3195 	  xgettext_current_source_encoding = po_charset_utf8;
3196 	  arglist_parser_done (argparser, arg);
3197 	  xgettext_current_source_encoding = xgettext_global_source_encoding;
3198 	  if (next_argparser != NULL)
3199 	    free (next_argparser);
3200 	  next_argparser = NULL;
3201 	  free_token (tp);
3202 	  return true;
3203 
3204 	case token_type_lbrace:
3205 #if DEBUG_PERL
3206 	  fprintf (stderr, "%s:%d: type lbrace (%d)\n",
3207 		   logical_file_name, tp->line_number, nesting_level);
3208 #endif
3209 	  if (extract_balanced (mlp, token_type_rbrace, true, false,
3210 				null_context, null_context_list_iterator,
3211 				1, arglist_parser_alloc (mlp, NULL)))
3212 	    {
3213 	      xgettext_current_source_encoding = po_charset_utf8;
3214 	      arglist_parser_done (argparser, arg);
3215 	      xgettext_current_source_encoding = xgettext_global_source_encoding;
3216 	      if (next_argparser != NULL)
3217 		free (next_argparser);
3218 	      free_token (tp);
3219 	      return true;
3220 	    }
3221 	  next_is_argument = false;
3222 	  if (next_argparser != NULL)
3223 	    free (next_argparser);
3224 	  next_argparser = NULL;
3225 	  next_context_iter = null_context_list_iterator;
3226 	  break;
3227 
3228 	case token_type_rbrace:
3229 #if DEBUG_PERL
3230 	  fprintf (stderr, "%s:%d: type rbrace (%d)\n",
3231 		   logical_file_name, tp->line_number, nesting_level);
3232 #endif
3233 	  next_is_argument = false;
3234 	  if (next_argparser != NULL)
3235 	    free (next_argparser);
3236 	  next_argparser = NULL;
3237 	  next_context_iter = null_context_list_iterator;
3238 	  break;
3239 
3240 	case token_type_lbracket:
3241 #if DEBUG_PERL
3242 	  fprintf (stderr, "%s:%d: type lbracket (%d)\n",
3243 		   logical_file_name, tp->line_number, nesting_level);
3244 #endif
3245 	  if (extract_balanced (mlp, token_type_rbracket, true, false,
3246 				null_context, null_context_list_iterator,
3247 				1, arglist_parser_alloc (mlp, NULL)))
3248 	    {
3249 	      xgettext_current_source_encoding = po_charset_utf8;
3250 	      arglist_parser_done (argparser, arg);
3251 	      xgettext_current_source_encoding = xgettext_global_source_encoding;
3252 	      if (next_argparser != NULL)
3253 		free (next_argparser);
3254 	      free_token (tp);
3255 	      return true;
3256 	    }
3257 	  next_is_argument = false;
3258 	  if (next_argparser != NULL)
3259 	    free (next_argparser);
3260 	  next_argparser = NULL;
3261 	  next_context_iter = null_context_list_iterator;
3262 	  break;
3263 
3264 	case token_type_rbracket:
3265 #if DEBUG_PERL
3266 	  fprintf (stderr, "%s:%d: type rbracket (%d)\n",
3267 		   logical_file_name, tp->line_number, nesting_level);
3268 #endif
3269 	  next_is_argument = false;
3270 	  if (next_argparser != NULL)
3271 	    free (next_argparser);
3272 	  next_argparser = NULL;
3273 	  next_context_iter = null_context_list_iterator;
3274 	  break;
3275 
3276 	case token_type_semicolon:
3277 #if DEBUG_PERL
3278 	  fprintf (stderr, "%s:%d: type semicolon (%d)\n",
3279 		   logical_file_name, tp->line_number, nesting_level);
3280 #endif
3281 
3282 	  /* The ultimate sign.  */
3283 	  xgettext_current_source_encoding = po_charset_utf8;
3284 	  arglist_parser_done (argparser, arg);
3285 	  xgettext_current_source_encoding = xgettext_global_source_encoding;
3286 	  argparser = arglist_parser_alloc (mlp, NULL);
3287 
3288 	  /* FIXME: Instead of resetting outer_context here, it may be better
3289 	     to recurse in the next_is_argument handling above, waiting for
3290 	     the next semicolon or other statement terminator.  */
3291 	  outer_context = null_context;
3292 	  context_iter = null_context_list_iterator;
3293 	  next_is_argument = false;
3294 	  if (next_argparser != NULL)
3295 	    free (next_argparser);
3296 	  next_argparser = NULL;
3297 	  next_context_iter = passthrough_context_list_iterator;
3298 	  inner_context =
3299 	    inherited_context (outer_context,
3300 			       flag_context_list_iterator_advance (
3301 				 &context_iter));
3302 	  break;
3303 
3304 	case token_type_dereference:
3305 #if DEBUG_PERL
3306 	  fprintf (stderr, "%s:%d: type dereference (%d)\n",
3307 		   logical_file_name, tp->line_number, nesting_level);
3308 #endif
3309 	  next_is_argument = false;
3310 	  if (next_argparser != NULL)
3311 	    free (next_argparser);
3312 	  next_argparser = NULL;
3313 	  next_context_iter = null_context_list_iterator;
3314 	  break;
3315 
3316 	case token_type_dot:
3317 #if DEBUG_PERL
3318 	  fprintf (stderr, "%s:%d: type dot (%d)\n",
3319 		   logical_file_name, tp->line_number, nesting_level);
3320 #endif
3321 	  next_is_argument = false;
3322 	  if (next_argparser != NULL)
3323 	    free (next_argparser);
3324 	  next_argparser = NULL;
3325 	  next_context_iter = null_context_list_iterator;
3326 	  break;
3327 
3328 	case token_type_named_op:
3329 #if DEBUG_PERL
3330 	  fprintf (stderr, "%s:%d: type named operator (%d): %s\n",
3331 		   logical_file_name, tp->line_number, nesting_level,
3332 		   tp->string);
3333 #endif
3334 	  next_is_argument = false;
3335 	  if (next_argparser != NULL)
3336 	    free (next_argparser);
3337 	  next_argparser = NULL;
3338 	  next_context_iter = null_context_list_iterator;
3339 	  break;
3340 
3341 	case token_type_regex_op:
3342 #if DEBUG_PERL
3343 	  fprintf (stderr, "%s:%d: type regex operator (%d)\n",
3344 		   logical_file_name, tp->line_number, nesting_level);
3345 #endif
3346 	  next_is_argument = false;
3347 	  if (next_argparser != NULL)
3348 	    free (next_argparser);
3349 	  next_argparser = NULL;
3350 	  next_context_iter = null_context_list_iterator;
3351 	  break;
3352 
3353 	case token_type_other:
3354 #if DEBUG_PERL
3355 	  fprintf (stderr, "%s:%d: type other (%d)\n",
3356 		   logical_file_name, tp->line_number, nesting_level);
3357 #endif
3358 	  next_is_argument = false;
3359 	  if (next_argparser != NULL)
3360 	    free (next_argparser);
3361 	  next_argparser = NULL;
3362 	  next_context_iter = null_context_list_iterator;
3363 	  break;
3364 
3365 	default:
3366 	  fprintf (stderr, "%s:%d: unknown token type %d\n",
3367 		   real_file_name, tp->line_number, tp->type);
3368 	  abort ();
3369 	}
3370 
3371       free_token (tp);
3372     }
3373 }
3374 
3375 void
extract_perl(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)3376 extract_perl (FILE *f, const char *real_filename, const char *logical_filename,
3377 	      flag_context_list_table_ty *flag_table,
3378 	      msgdomain_list_ty *mdlp)
3379 {
3380   message_list_ty *mlp = mdlp->item[0]->messages;
3381 
3382   fp = f;
3383   real_file_name = real_filename;
3384   logical_file_name = xstrdup (logical_filename);
3385   line_number = 0;
3386 
3387   last_comment_line = -1;
3388   last_non_comment_line = -1;
3389 
3390   flag_context_list_table = flag_table;
3391 
3392   init_keywords ();
3393 
3394   token_stack.items = NULL;
3395   token_stack.nitems = 0;
3396   token_stack.nitems_max = 0;
3397   linesize = 0;
3398   linepos = 0;
3399   here_eaten = 0;
3400   end_of_file = false;
3401 
3402   /* Eat tokens until eof is seen.  When extract_balanced returns
3403      due to an unbalanced closing brace, just restart it.  */
3404   while (!extract_balanced (mlp, token_type_rbrace, true, false,
3405 			    null_context, null_context_list_iterator,
3406 			    1, arglist_parser_alloc (mlp, NULL)))
3407     ;
3408 
3409   fp = NULL;
3410   real_file_name = NULL;
3411   free (logical_file_name);
3412   logical_file_name = NULL;
3413   line_number = 0;
3414   last_token = token_type_semicolon;
3415   token_stack_free (&token_stack);
3416   here_eaten = 0;
3417   end_of_file = true;
3418 }
3419