1 /* xgettext Perl backend.
2 Copyright (C) 2002-2006 Free Software Foundation, Inc.
3
4 This file was written by Guido Flohr <guido@imperia.net>, 2002-2003.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23
24 #include <errno.h>
25 #include <stdbool.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29
30 #include "message.h"
31 #include "xgettext.h"
32 #include "x-perl.h"
33 #include "error.h"
34 #include "error-progname.h"
35 #include "xalloc.h"
36 #include "exit.h"
37 #include "po-charset.h"
38 #include "ucs4-utf8.h"
39 #include "uniname.h"
40 #include "getline.h"
41 #include "gettext.h"
42
43 #define _(s) gettext(s)
44
45 /* The Perl syntax is defined in perlsyn.pod. Try the command
46 "man perlsyn" or "perldoc perlsyn".
47 Also, the syntax after the 'sub' keyword is specified in perlsub.pod.
48 Try the command "man perlsub" or "perldoc perlsub". */
49
50 #define DEBUG_PERL 0
51
52
53 /* ====================== Keyword set customization. ====================== */
54
55 /* If true extract all strings. */
56 static bool extract_all = false;
57
58 static hash_table keywords;
59 static bool default_keywords = true;
60
61
62 void
x_perl_extract_all()63 x_perl_extract_all ()
64 {
65 extract_all = true;
66 }
67
68
69 void
x_perl_keyword(const char * name)70 x_perl_keyword (const char *name)
71 {
72 if (name == NULL)
73 default_keywords = false;
74 else
75 {
76 const char *end;
77 struct callshape shape;
78 const char *colon;
79
80 if (keywords.table == NULL)
81 hash_init (&keywords, 100);
82
83 split_keywordspec (name, &end, &shape);
84
85 /* The characters between name and end should form a valid C identifier.
86 A colon means an invalid parse in split_keywordspec(). */
87 colon = strchr (name, ':');
88 if (colon == NULL || colon >= end)
89 insert_keyword_callshape (&keywords, name, end - name, &shape);
90 }
91 }
92
93 /* Finish initializing the keywords hash table.
94 Called after argument processing, before each file is processed. */
95 static void
init_keywords()96 init_keywords ()
97 {
98 if (default_keywords)
99 {
100 /* When adding new keywords here, also update the documentation in
101 xgettext.texi! */
102 x_perl_keyword ("gettext");
103 x_perl_keyword ("%gettext");
104 x_perl_keyword ("$gettext");
105 x_perl_keyword ("dgettext:2");
106 x_perl_keyword ("dcgettext:2");
107 x_perl_keyword ("ngettext:1,2");
108 x_perl_keyword ("dngettext:2,3");
109 x_perl_keyword ("dcngettext:2,3");
110 x_perl_keyword ("gettext_noop");
111 #if 0
112 x_perl_keyword ("__");
113 x_perl_keyword ("$__");
114 x_perl_keyword ("%__");
115 x_perl_keyword ("__x");
116 x_perl_keyword ("__n:1,2");
117 x_perl_keyword ("__nx:1,2");
118 x_perl_keyword ("__xn:1,2");
119 x_perl_keyword ("N__");
120 #endif
121 default_keywords = false;
122 }
123 }
124
125 void
init_flag_table_perl()126 init_flag_table_perl ()
127 {
128 xgettext_record_flag ("gettext:1:pass-perl-format");
129 xgettext_record_flag ("gettext:1:pass-perl-brace-format");
130 xgettext_record_flag ("%gettext:1:pass-perl-format");
131 xgettext_record_flag ("%gettext:1:pass-perl-brace-format");
132 xgettext_record_flag ("$gettext:1:pass-perl-format");
133 xgettext_record_flag ("$gettext:1:pass-perl-brace-format");
134 xgettext_record_flag ("dgettext:2:pass-perl-format");
135 xgettext_record_flag ("dgettext:2:pass-perl-brace-format");
136 xgettext_record_flag ("dcgettext:2:pass-perl-format");
137 xgettext_record_flag ("dcgettext:2:pass-perl-brace-format");
138 xgettext_record_flag ("ngettext:1:pass-perl-format");
139 xgettext_record_flag ("ngettext:2:pass-perl-format");
140 xgettext_record_flag ("ngettext:1:pass-perl-brace-format");
141 xgettext_record_flag ("ngettext:2:pass-perl-brace-format");
142 xgettext_record_flag ("dngettext:2:pass-perl-format");
143 xgettext_record_flag ("dngettext:3:pass-perl-format");
144 xgettext_record_flag ("dngettext:2:pass-perl-brace-format");
145 xgettext_record_flag ("dngettext:3:pass-perl-brace-format");
146 xgettext_record_flag ("dcngettext:2:pass-perl-format");
147 xgettext_record_flag ("dcngettext:3:pass-perl-format");
148 xgettext_record_flag ("dcngettext:2:pass-perl-brace-format");
149 xgettext_record_flag ("dcngettext:3:pass-perl-brace-format");
150 xgettext_record_flag ("gettext_noop:1:pass-perl-format");
151 xgettext_record_flag ("gettext_noop:1:pass-perl-brace-format");
152 xgettext_record_flag ("printf:1:perl-format"); /* argument 1 or 2 ?? */
153 xgettext_record_flag ("sprintf:1:perl-format");
154 #if 0
155 xgettext_record_flag ("__:1:pass-perl-format");
156 xgettext_record_flag ("__:1:pass-perl-brace-format");
157 xgettext_record_flag ("%__:1:pass-perl-format");
158 xgettext_record_flag ("%__:1:pass-perl-brace-format");
159 xgettext_record_flag ("$__:1:pass-perl-format");
160 xgettext_record_flag ("$__:1:pass-perl-brace-format");
161 xgettext_record_flag ("__x:1:perl-brace-format");
162 xgettext_record_flag ("__n:1:pass-perl-format");
163 xgettext_record_flag ("__n:2:pass-perl-format");
164 xgettext_record_flag ("__n:1:pass-perl-brace-format");
165 xgettext_record_flag ("__n:2:pass-perl-brace-format");
166 xgettext_record_flag ("__nx:1:perl-brace-format");
167 xgettext_record_flag ("__nx:2:perl-brace-format");
168 xgettext_record_flag ("__xn:1:perl-brace-format");
169 xgettext_record_flag ("__xn:2:perl-brace-format");
170 xgettext_record_flag ("N__:1:pass-perl-format");
171 xgettext_record_flag ("N__:1:pass-perl-brace-format");
172 #endif
173 }
174
175
176 /* ======================== Reading of characters. ======================== */
177
178 /* Real filename, used in error messages about the input file. */
179 static const char *real_file_name;
180
181 /* Logical filename and line number, used to label the extracted messages. */
182 static char *logical_file_name;
183 static int line_number;
184
185 /* The input file stream. */
186 static FILE *fp;
187
188 /* The current line buffer. */
189 static char *linebuf;
190
191 /* The size of the current line. */
192 static int linesize;
193
194 /* The position in the current line. */
195 static int linepos;
196
197 /* The size of the input buffer. */
198 static size_t linebuf_size;
199
200 /* Number of lines eaten for here documents. */
201 static int here_eaten;
202
203 /* Paranoia: EOF marker for __END__ or __DATA__. */
204 static bool end_of_file;
205
206
207 /* 1. line_number handling. */
208
209 /* Returns the next character from the input stream or EOF. */
210 static int
phase1_getc()211 phase1_getc ()
212 {
213 line_number += here_eaten;
214 here_eaten = 0;
215
216 if (end_of_file)
217 return EOF;
218
219 if (linepos >= linesize)
220 {
221 linesize = getline (&linebuf, &linebuf_size, fp);
222
223 if (linesize < 0)
224 {
225 if (ferror (fp))
226 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
227 real_file_name);
228 end_of_file = true;
229 return EOF;
230 }
231
232 linepos = 0;
233 ++line_number;
234
235 /* Undosify. This is important for catching the end of <<EOF and
236 <<'EOF'. We could rely on stdio doing this for us but you
237 it is not uncommon to to come across Perl scripts with CRLF
238 newline conventions on systems that do not follow this
239 convention. */
240 if (linesize >= 2 && linebuf[linesize - 1] == '\n'
241 && linebuf[linesize - 2] == '\r')
242 {
243 linebuf[linesize - 2] = '\n';
244 linebuf[linesize - 1] = '\0';
245 --linesize;
246 }
247 }
248
249 return linebuf[linepos++];
250 }
251
252 /* Supports only one pushback character. */
253 static void
phase1_ungetc(int c)254 phase1_ungetc (int c)
255 {
256 if (c != EOF)
257 {
258 if (linepos == 0)
259 /* Attempt to ungetc across line boundary. Shouldn't happen.
260 No two phase1_ungetc calls are permitted in a row. */
261 abort ();
262
263 --linepos;
264 }
265 }
266
267 /* Read a here document and return its contents.
268 The delimiter is an UTF-8 encoded string; the resulting string is UTF-8
269 encoded as well. */
270
271 static char *
get_here_document(const char * delimiter)272 get_here_document (const char *delimiter)
273 {
274 /* Accumulator for the entire here document, including a NUL byte
275 at the end. */
276 static char *buffer;
277 static size_t bufmax = 0;
278 size_t bufpos = 0;
279 /* Current line being appended. */
280 static char *my_linebuf = NULL;
281 static size_t my_linebuf_size = 0;
282
283 /* Allocate the initial buffer. Later on, bufmax > 0. */
284 if (bufmax == 0)
285 {
286 buffer = xrealloc (NULL, 1);
287 buffer[0] = '\0';
288 bufmax = 1;
289 }
290
291 for (;;)
292 {
293 int read_bytes = getline (&my_linebuf, &my_linebuf_size, fp);
294 char *my_line_utf8;
295 bool chomp;
296
297 if (read_bytes < 0)
298 {
299 if (ferror (fp))
300 {
301 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
302 real_file_name);
303 }
304 else
305 {
306 error_with_progname = false;
307 error (EXIT_SUCCESS, 0, _("\
308 %s:%d: can't find string terminator \"%s\" anywhere before EOF"),
309 real_file_name, line_number, delimiter);
310 error_with_progname = true;
311
312 break;
313 }
314 }
315
316 ++here_eaten;
317
318 /* Convert to UTF-8. */
319 my_line_utf8 =
320 from_current_source_encoding (my_linebuf, logical_file_name,
321 line_number + here_eaten);
322 if (my_line_utf8 != my_linebuf)
323 {
324 if (strlen (my_line_utf8) >= my_linebuf_size)
325 {
326 my_linebuf_size = strlen (my_line_utf8) + 1;
327 my_linebuf = xrealloc (my_linebuf, my_linebuf_size);
328 }
329 strcpy (my_linebuf, my_line_utf8);
330 free (my_line_utf8);
331 }
332
333 /* Undosify. This is important for catching the end of <<EOF and
334 <<'EOF'. We could rely on stdio doing this for us but you
335 it is not uncommon to to come across Perl scripts with CRLF
336 newline conventions on systems that do not follow this
337 convention. */
338 if (read_bytes >= 2 && my_linebuf[read_bytes - 1] == '\n'
339 && my_linebuf[read_bytes - 2] == '\r')
340 {
341 my_linebuf[read_bytes - 2] = '\n';
342 my_linebuf[read_bytes - 1] = '\0';
343 --read_bytes;
344 }
345
346 /* Temporarily remove the trailing newline from my_linebuf. */
347 chomp = false;
348 if (read_bytes >= 1 && my_linebuf[read_bytes - 1] == '\n')
349 {
350 chomp = true;
351 my_linebuf[read_bytes - 1] = '\0';
352 }
353
354 /* See whether this line terminates the here document. */
355 if (strcmp (my_linebuf, delimiter) == 0)
356 break;
357
358 /* Add back the trailing newline to my_linebuf. */
359 if (chomp)
360 my_linebuf[read_bytes - 1] = '\n';
361
362 /* Ensure room for read_bytes + 1 bytes. */
363 if (bufpos + read_bytes >= bufmax)
364 {
365 do
366 bufmax = 2 * bufmax + 10;
367 while (bufpos + read_bytes >= bufmax);
368 buffer = xrealloc (buffer, bufmax);
369 }
370 /* Append this line to the accumulator. */
371 strcpy (buffer + bufpos, my_linebuf);
372 bufpos += read_bytes;
373 }
374
375 /* Done accumulating the here document. */
376 return xstrdup (buffer);
377 }
378
379 /* Skips pod sections. */
380 static void
skip_pod()381 skip_pod ()
382 {
383 line_number += here_eaten;
384 here_eaten = 0;
385 linepos = 0;
386
387 for (;;)
388 {
389 linesize = getline (&linebuf, &linebuf_size, fp);
390
391 if (linesize < 0)
392 {
393 if (ferror (fp))
394 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
395 real_file_name);
396 return;
397 }
398
399 ++line_number;
400
401 if (strncmp ("=cut", linebuf, 4) == 0)
402 {
403 /* Force reading of a new line on next call to phase1_getc(). */
404 linepos = linesize;
405 return;
406 }
407 }
408 }
409
410
411 /* These are for tracking whether comments count as immediately before
412 keyword. */
413 static int last_comment_line;
414 static int last_non_comment_line;
415
416
417 /* 2. Replace each comment that is not inside a string literal or regular
418 expression with a newline character. We need to remember the comment
419 for later, because it may be attached to a keyword string. */
420
421 static int
phase2_getc()422 phase2_getc ()
423 {
424 static char *buffer;
425 static size_t bufmax;
426 size_t buflen;
427 int lineno;
428 int c;
429 char *utf8_string;
430
431 c = phase1_getc ();
432 if (c == '#')
433 {
434 buflen = 0;
435 lineno = line_number;
436 /* Skip leading whitespace. */
437 for (;;)
438 {
439 c = phase1_getc ();
440 if (c == EOF)
441 break;
442 if (c != ' ' && c != '\t' && c != '\r' && c != '\f')
443 {
444 phase1_ungetc (c);
445 break;
446 }
447 }
448 /* Accumulate the comment. */
449 for (;;)
450 {
451 c = phase1_getc ();
452 if (c == '\n' || c == EOF)
453 break;
454 if (buflen >= bufmax)
455 {
456 bufmax = 2 * bufmax + 10;
457 buffer = xrealloc (buffer, bufmax);
458 }
459 buffer[buflen++] = c;
460 }
461 if (buflen >= bufmax)
462 {
463 bufmax = 2 * bufmax + 10;
464 buffer = xrealloc (buffer, bufmax);
465 }
466 buffer[buflen] = '\0';
467 /* Convert it to UTF-8. */
468 utf8_string =
469 from_current_source_encoding (buffer, logical_file_name, lineno);
470 /* Save it until we encounter the corresponding string. */
471 savable_comment_add (utf8_string);
472 last_comment_line = lineno;
473 }
474 return c;
475 }
476
477 /* Supports only one pushback character. */
478 static void
phase2_ungetc(int c)479 phase2_ungetc (int c)
480 {
481 if (c != EOF)
482 phase1_ungetc (c);
483 }
484
485 /* Whitespace recognition. */
486
487 #define case_whitespace \
488 case ' ': case '\t': case '\r': case '\n': case '\f'
489
490 static inline bool
is_whitespace(int c)491 is_whitespace (int c)
492 {
493 return (c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f');
494 }
495
496
497 /* ========================== Reading of tokens. ========================== */
498
499
500 enum token_type_ty
501 {
502 token_type_eof,
503 token_type_lparen, /* ( */
504 token_type_rparen, /* ) */
505 token_type_comma, /* , */
506 token_type_fat_comma, /* => */
507 token_type_dereference, /* , */
508 token_type_semicolon, /* ; */
509 token_type_lbrace, /* { */
510 token_type_rbrace, /* } */
511 token_type_lbracket, /* [ */
512 token_type_rbracket, /* ] */
513 token_type_string, /* quote-like */
514 token_type_named_op, /* if, unless, while, ... */
515 token_type_variable, /* $... */
516 token_type_symbol, /* symbol, number */
517 token_type_regex_op, /* s, tr, y, m. */
518 token_type_dot, /* . */
519 token_type_other, /* regexp, misc. operator */
520 /* The following are not really token types, but variants used by
521 the parser. */
522 token_type_keyword_symbol /* keyword symbol */
523 };
524 typedef enum token_type_ty token_type_ty;
525
526 /* Subtypes for strings, important for interpolation. */
527 enum string_type_ty
528 {
529 string_type_verbatim, /* "<<'EOF'", "m'...'", "s'...''...'",
530 "tr/.../.../", "y/.../.../". */
531 string_type_q, /* "'..'", "q/.../". */
532 string_type_qq, /* '"..."', "`...`", "qq/.../", "qx/.../",
533 "<file*glob>". */
534 string_type_qr /* Not supported. */
535 };
536
537 /* Subtypes for symbols, important for dollar interpretation. */
538 enum symbol_type_ty
539 {
540 symbol_type_none, /* Nothing special. */
541 symbol_type_sub, /* 'sub'. */
542 symbol_type_function /* Function name after 'sub'. */
543 };
544
545 typedef struct token_ty token_ty;
546 struct token_ty
547 {
548 token_type_ty type;
549 int sub_type; /* for token_type_string, token_type_symbol */
550 char *string; /* for: in encoding:
551 token_type_named_op ASCII
552 token_type_string UTF-8
553 token_type_symbol ASCII
554 token_type_variable global_source_encoding
555 */
556 int line_number;
557 };
558
559 #if DEBUG_PERL
560 static const char *
token2string(const token_ty * token)561 token2string (const token_ty *token)
562 {
563 switch (token->type)
564 {
565 case token_type_eof:
566 return "token_type_eof";
567 case token_type_lparen:
568 return "token_type_lparen";
569 case token_type_rparen:
570 return "token_type_rparen";
571 case token_type_comma:
572 return "token_type_comma";
573 case token_type_fat_comma:
574 return "token_type_fat_comma";
575 case token_type_dereference:
576 return "token_type_dereference";
577 case token_type_semicolon:
578 return "token_type_semicolon";
579 case token_type_lbrace:
580 return "token_type_lbrace";
581 case token_type_rbrace:
582 return "token_type_rbrace";
583 case token_type_lbracket:
584 return "token_type_lbracket";
585 case token_type_rbracket:
586 return "token_type_rbracket";
587 case token_type_string:
588 return "token_type_string";
589 case token_type_named_op:
590 return "token_type_named_op";
591 case token_type_variable:
592 return "token_type_variable";
593 case token_type_symbol:
594 return "token_type_symbol";
595 case token_type_regex_op:
596 return "token_type_regex_op";
597 case token_type_dot:
598 return "token_type_dot";
599 case token_type_other:
600 return "token_type_other";
601 default:
602 return "unknown";
603 }
604 }
605 #endif
606
607 /* Free the memory pointed to by a 'struct token_ty'. */
608 static inline void
free_token(token_ty * tp)609 free_token (token_ty *tp)
610 {
611 switch (tp->type)
612 {
613 case token_type_named_op:
614 case token_type_string:
615 case token_type_symbol:
616 case token_type_variable:
617 free (tp->string);
618 break;
619 default:
620 break;
621 }
622 free (tp);
623 }
624
625 /* Pass 1 of extracting quotes: Find the end of the string, regardless
626 of the semantics of the construct. Return the complete string,
627 including the starting and the trailing delimiter, with backslashes
628 removed where appropriate. */
629 static char *
extract_quotelike_pass1(int delim)630 extract_quotelike_pass1 (int delim)
631 {
632 /* This function is called recursively. No way to allocate stuff
633 statically. Also alloca() is inappropriate due to limited stack
634 size on some platforms. So we use malloc(). */
635 int bufmax = 10;
636 char *buffer = (char *) xmalloc (bufmax);
637 int bufpos = 0;
638 bool nested = true;
639 int counter_delim;
640
641 buffer[bufpos++] = delim;
642
643 /* Find the closing delimiter. */
644 switch (delim)
645 {
646 case '(':
647 counter_delim = ')';
648 break;
649 case '{':
650 counter_delim = '}';
651 break;
652 case '[':
653 counter_delim = ']';
654 break;
655 case '<':
656 counter_delim = '>';
657 break;
658 default: /* "..." or '...' or |...| etc. */
659 nested = false;
660 counter_delim = delim;
661 break;
662 }
663
664 for (;;)
665 {
666 int c = phase1_getc ();
667
668 /* This round can produce 1 or 2 bytes. Ensure room for 2 bytes. */
669 if (bufpos + 2 > bufmax)
670 {
671 bufmax = 2 * bufmax + 10;
672 buffer = xrealloc (buffer, bufmax);
673 }
674
675 if (c == counter_delim || c == EOF)
676 {
677 buffer[bufpos++] = counter_delim; /* will be stripped off later */
678 buffer[bufpos++] = '\0';
679 #if DEBUG_PERL
680 fprintf (stderr, "PASS1: %s\n", buffer);
681 #endif
682 return buffer;
683 }
684
685 if (nested && c == delim)
686 {
687 char *inner = extract_quotelike_pass1 (delim);
688 size_t len = strlen (inner);
689
690 /* Ensure room for len + 1 bytes. */
691 if (bufpos + len >= bufmax)
692 {
693 do
694 bufmax = 2 * bufmax + 10;
695 while (bufpos + len >= bufmax);
696 buffer = xrealloc (buffer, bufmax);
697 }
698 strcpy (buffer + bufpos, inner);
699 free (inner);
700 bufpos += len;
701 }
702 else if (c == '\\')
703 {
704 c = phase1_getc ();
705 if (c == '\\')
706 {
707 buffer[bufpos++] = '\\';
708 buffer[bufpos++] = '\\';
709 }
710 else if (c == delim || c == counter_delim)
711 {
712 /* This is pass2 in Perl. */
713 buffer[bufpos++] = c;
714 }
715 else
716 {
717 buffer[bufpos++] = '\\';
718 phase1_ungetc (c);
719 }
720 }
721 else
722 {
723 buffer[bufpos++] = c;
724 }
725 }
726 }
727
728 /* Like extract_quotelike_pass1, but return the complete string in UTF-8
729 encoding. */
730 static char *
extract_quotelike_pass1_utf8(int delim)731 extract_quotelike_pass1_utf8 (int delim)
732 {
733 char *string = extract_quotelike_pass1 (delim);
734 char *utf8_string =
735 from_current_source_encoding (string, logical_file_name, line_number);
736 if (utf8_string != string)
737 free (string);
738 return utf8_string;
739 }
740
741
742 /* ========= Reading of tokens and commands. Extracting strings. ========= */
743
744
745 /* There is an ambiguity about '/': It can start a division operator ('/' or
746 '/=') or it can start a regular expression. The distinction is important
747 because inside regular expressions, '#' loses its special meaning.
748 The distinction is possible depending on the parsing state: After a
749 variable or simple expression, it's a division operator; at the beginning
750 of an expression, it's a regexp. */
751 static bool prefer_division_over_regexp;
752
753 /* Context lookup table. */
754 static flag_context_list_table_ty *flag_context_list_table;
755
756
757 /* Forward declaration of local functions. */
758 static void interpolate_keywords (message_list_ty *mlp, const char *string,
759 int lineno);
760 static token_ty *x_perl_lex (message_list_ty *mlp);
761 static void x_perl_unlex (token_ty *tp);
762 static bool extract_balanced (message_list_ty *mlp,
763 token_type_ty delim, bool eat_delim,
764 bool comma_delim,
765 flag_context_ty outer_context,
766 flag_context_list_iterator_ty context_iter,
767 int arg, struct arglist_parser *argparser);
768
769
770 /* Extract an unsigned hexadecimal number from STRING, considering at
771 most LEN bytes and place the result in *RESULT. Returns a pointer
772 to the first character past the hexadecimal number. */
773 static const char *
extract_hex(const char * string,size_t len,unsigned int * result)774 extract_hex (const char *string, size_t len, unsigned int *result)
775 {
776 size_t i;
777
778 *result = 0;
779
780 for (i = 0; i < len; i++)
781 {
782 char c = string[i];
783 int number;
784
785 if (c >= 'A' && c <= 'F')
786 number = c - 'A' + 10;
787 else if (c >= 'a' && c <= 'f')
788 number = c - 'a' + 10;
789 else if (c >= '0' && c <= '9')
790 number = c - '0';
791 else
792 break;
793
794 *result <<= 4;
795 *result |= number;
796 }
797
798 return string + i;
799 }
800
801 /* Extract an unsigned octal number from STRING, considering at
802 most LEN bytes and place the result in *RESULT. Returns a pointer
803 to the first character past the octal number. */
804 static const char *
extract_oct(const char * string,size_t len,unsigned int * result)805 extract_oct (const char *string, size_t len, unsigned int *result)
806 {
807 size_t i;
808
809 *result = 0;
810
811 for (i = 0; i < len; i++)
812 {
813 char c = string[i];
814 int number;
815
816 if (c >= '0' && c <= '7')
817 number = c - '0';
818 else
819 break;
820
821 *result <<= 3;
822 *result |= number;
823 }
824
825 return string + i;
826 }
827
828 /* Extract the various quotelike constructs except for <<EOF. See the
829 section "Gory details of parsing quoted constructs" in perlop.pod.
830 Return the resulting token in *tp; tp->type == token_type_string. */
831 static void
extract_quotelike(token_ty * tp,int delim)832 extract_quotelike (token_ty *tp, int delim)
833 {
834 char *string = extract_quotelike_pass1_utf8 (delim);
835 size_t len = strlen (string);
836
837 tp->type = token_type_string;
838 /* Take the string without the delimiters at the start and at the end. */
839 if (!(len >= 2))
840 abort ();
841 string[len - 1] = '\0';
842 tp->string = xstrdup (string + 1);
843 free (string);
844 }
845
846 /* Extract the quotelike constructs with double delimiters, like
847 s/[SEARCH]/[REPLACE]/. This function does not eat up trailing
848 modifiers (left to the caller).
849 Return the resulting token in *tp; tp->type == token_type_regex_op. */
850 static void
extract_triple_quotelike(message_list_ty * mlp,token_ty * tp,int delim,bool interpolate)851 extract_triple_quotelike (message_list_ty *mlp, token_ty *tp, int delim,
852 bool interpolate)
853 {
854 char *string;
855
856 tp->type = token_type_regex_op;
857
858 string = extract_quotelike_pass1_utf8 (delim);
859 if (interpolate)
860 interpolate_keywords (mlp, string, line_number);
861 free (string);
862
863 if (delim == '(' || delim == '<' || delim == '{' || delim == '[')
864 {
865 /* The delimiter for the second string can be different, e.g.
866 s{SEARCH}{REPLACE} or s{SEARCH}/REPLACE/. See "man perlrequick". */
867 delim = phase1_getc ();
868 while (is_whitespace (delim))
869 {
870 /* The hash-sign is not a valid delimiter after whitespace, ergo
871 use phase2_getc() and not phase1_getc() now. */
872 delim = phase2_getc ();
873 }
874 }
875 string = extract_quotelike_pass1_utf8 (delim);
876 if (interpolate)
877 interpolate_keywords (mlp, string, line_number);
878 free (string);
879 }
880
881 /* Perform pass 3 of quotelike extraction (interpolation).
882 *tp is a token of type token_type_string.
883 This function replaces tp->string. */
884 /* FIXME: Currently may writes null-bytes into the string. */
885 static void
extract_quotelike_pass3(token_ty * tp,int error_level)886 extract_quotelike_pass3 (token_ty *tp, int error_level)
887 {
888 static char *buffer;
889 static int bufmax = 0;
890 int bufpos = 0;
891 const char *crs;
892 bool uppercase;
893 bool lowercase;
894 bool quotemeta;
895
896 #if DEBUG_PERL
897 switch (tp->sub_type)
898 {
899 case string_type_verbatim:
900 fprintf (stderr, "Interpolating string_type_verbatim:\n");
901 break;
902 case string_type_q:
903 fprintf (stderr, "Interpolating string_type_q:\n");
904 break;
905 case string_type_qq:
906 fprintf (stderr, "Interpolating string_type_qq:\n");
907 break;
908 case string_type_qr:
909 fprintf (stderr, "Interpolating string_type_qr:\n");
910 break;
911 }
912 fprintf (stderr, "%s\n", tp->string);
913 if (tp->sub_type == string_type_verbatim)
914 fprintf (stderr, "---> %s\n", tp->string);
915 #endif
916
917 if (tp->sub_type == string_type_verbatim)
918 return;
919
920 /* Loop over tp->string, accumulating the expansion in buffer. */
921 crs = tp->string;
922 uppercase = false;
923 lowercase = false;
924 quotemeta = false;
925 while (*crs)
926 {
927 bool backslashed;
928
929 /* Ensure room for 7 bytes, 6 (multi-)bytes plus a leading backslash
930 if \Q modifier is present. */
931 if (bufpos + 7 > bufmax)
932 {
933 bufmax = 2 * bufmax + 10;
934 buffer = xrealloc (buffer, bufmax);
935 }
936
937 if (tp->sub_type == string_type_q)
938 {
939 switch (*crs)
940 {
941 case '\\':
942 if (crs[1] == '\\')
943 {
944 crs += 2;
945 buffer[bufpos++] = '\\';
946 break;
947 }
948 /* FALLTHROUGH */
949 default:
950 buffer[bufpos++] = *crs++;
951 break;
952 }
953 continue;
954 }
955
956 /* We only get here for double-quoted strings or regular expressions.
957 Unescape escape sequences. */
958 if (*crs == '\\')
959 {
960 switch (crs[1])
961 {
962 case 't':
963 crs += 2;
964 buffer[bufpos++] = '\t';
965 continue;
966 case 'n':
967 crs += 2;
968 buffer[bufpos++] = '\n';
969 continue;
970 case 'r':
971 crs += 2;
972 buffer[bufpos++] = '\r';
973 continue;
974 case 'f':
975 crs += 2;
976 buffer[bufpos++] = '\f';
977 continue;
978 case 'b':
979 crs += 2;
980 buffer[bufpos++] = '\b';
981 continue;
982 case 'a':
983 crs += 2;
984 buffer[bufpos++] = '\a';
985 continue;
986 case 'e':
987 crs += 2;
988 buffer[bufpos++] = 0x1b;
989 continue;
990 case '0': case '1': case '2': case '3':
991 case '4': case '5': case '6': case '7':
992 {
993 unsigned int oct_number;
994 int length;
995
996 crs = extract_oct (crs + 1, 3, &oct_number);
997
998 /* FIXME: If one of the variables UPPERCASE or LOWERCASE is
999 true, the character should be converted to its uppercase
1000 resp. lowercase equivalent. I don't know if the necessary
1001 facilities are already included in gettext. For US-Ascii
1002 the conversion can be already be done, however. */
1003 if (uppercase && oct_number >= 'a' && oct_number <= 'z')
1004 {
1005 oct_number = oct_number - 'a' + 'A';
1006 }
1007 else if (lowercase && oct_number >= 'A' && oct_number <= 'Z')
1008 {
1009 oct_number = oct_number - 'A' + 'a';
1010 }
1011
1012
1013 /* Yes, octal escape sequences in the range 0x100..0x1ff are
1014 valid. */
1015 length = u8_uctomb ((unsigned char *) (buffer + bufpos),
1016 oct_number, 2);
1017 if (length > 0)
1018 bufpos += length;
1019 }
1020 continue;
1021 case 'x':
1022 {
1023 unsigned int hex_number = 0;
1024 int length;
1025
1026 crs += 2;
1027 if (*crs == '{')
1028 {
1029 const char *end = strchr (crs, '}');
1030 if (end == NULL)
1031 {
1032 error_with_progname = false;
1033 error (error_level, 0, _("\
1034 %s:%d: missing right brace on \\x{HEXNUMBER}"), real_file_name, line_number);
1035 error_with_progname = true;
1036 ++crs;
1037 continue;
1038 }
1039 else
1040 {
1041 ++crs;
1042 (void) extract_hex (crs, end - crs, &hex_number);
1043 crs = end + 1;
1044 }
1045 }
1046 else
1047 {
1048 crs = extract_hex (crs, 2, &hex_number);
1049 }
1050
1051 /* FIXME: If one of the variables UPPERCASE or LOWERCASE is
1052 true, the character should be converted to its uppercase
1053 resp. lowercase equivalent. I don't know if the necessary
1054 facilities are already included in gettext. For US-Ascii
1055 the conversion can be already be done, however. */
1056 if (uppercase && hex_number >= 'a' && hex_number <= 'z')
1057 {
1058 hex_number = hex_number - 'a' + 'A';
1059 }
1060 else if (lowercase && hex_number >= 'A' && hex_number <= 'Z')
1061 {
1062 hex_number = hex_number - 'A' + 'a';
1063 }
1064
1065 length = u8_uctomb ((unsigned char *) (buffer + bufpos),
1066 hex_number, 6);
1067
1068 if (length > 0)
1069 bufpos += length;
1070 }
1071 continue;
1072 case 'c':
1073 /* Perl's notion of control characters. */
1074 crs += 2;
1075 if (*crs)
1076 {
1077 int the_char = (unsigned char) *crs;
1078 if (the_char >= 'a' || the_char <= 'z')
1079 the_char = the_char - 'a' + 'A';
1080 buffer[bufpos++] = the_char ^ 0x40;
1081 }
1082 continue;
1083 case 'N':
1084 crs += 2;
1085 if (*crs == '{')
1086 {
1087 const char *end = strchr (crs + 1, '}');
1088 if (end != NULL)
1089 {
1090 char *name;
1091 unsigned int unicode;
1092
1093 name = (char *) xmalloc (end - (crs + 1) + 1);
1094 memcpy (name, crs + 1, end - (crs + 1));
1095 name[end - (crs + 1)] = '\0';
1096
1097 unicode = unicode_name_character (name);
1098 if (unicode != UNINAME_INVALID)
1099 {
1100 /* FIXME: Convert to upper/lowercase if the
1101 corresponding flag is set to true. */
1102 int length =
1103 u8_uctomb ((unsigned char *) (buffer + bufpos),
1104 unicode, 6);
1105 if (length > 0)
1106 bufpos += length;
1107 }
1108
1109 free (name);
1110
1111 crs = end + 1;
1112 }
1113 }
1114 continue;
1115 }
1116 }
1117
1118 /* No escape sequence, go on. */
1119 if (*crs == '\\')
1120 {
1121 ++crs;
1122 switch (*crs)
1123 {
1124 case 'E':
1125 uppercase = false;
1126 lowercase = false;
1127 quotemeta = false;
1128 ++crs;
1129 continue;
1130 case 'L':
1131 uppercase = false;
1132 lowercase = true;
1133 ++crs;
1134 continue;
1135 case 'U':
1136 uppercase = true;
1137 lowercase = false;
1138 ++crs;
1139 continue;
1140 case 'Q':
1141 quotemeta = true;
1142 ++crs;
1143 continue;
1144 case 'l':
1145 ++crs;
1146 if (*crs >= 'A' && *crs <= 'Z')
1147 {
1148 buffer[bufpos++] = *crs - 'A' + 'a';
1149 }
1150 else if ((unsigned char) *crs >= 0x80)
1151 {
1152 error_with_progname = false;
1153 error (error_level, 0, _("\
1154 %s:%d: invalid interpolation (\"\\l\") of 8bit character \"%c\""),
1155 real_file_name, line_number, *crs);
1156 error_with_progname = true;
1157 }
1158 else
1159 {
1160 buffer[bufpos++] = *crs;
1161 }
1162 ++crs;
1163 continue;
1164 case 'u':
1165 ++crs;
1166 if (*crs >= 'a' && *crs <= 'z')
1167 {
1168 buffer[bufpos++] = *crs - 'a' + 'A';
1169 }
1170 else if ((unsigned char) *crs >= 0x80)
1171 {
1172 error_with_progname = false;
1173 error (error_level, 0, _("\
1174 %s:%d: invalid interpolation (\"\\u\") of 8bit character \"%c\""),
1175 real_file_name, line_number, *crs);
1176 error_with_progname = true;
1177 }
1178 else
1179 {
1180 buffer[bufpos++] = *crs;
1181 }
1182 ++crs;
1183 continue;
1184 case '\\':
1185 buffer[bufpos++] = *crs;
1186 ++crs;
1187 continue;
1188 default:
1189 backslashed = true;
1190 break;
1191 }
1192 }
1193 else
1194 backslashed = false;
1195
1196 if (quotemeta
1197 && !((*crs >= 'A' && *crs <= 'Z') || (*crs >= 'A' && *crs <= 'z')
1198 || (*crs >= '0' && *crs <= '9') || *crs == '_'))
1199 {
1200 buffer[bufpos++] = '\\';
1201 backslashed = true;
1202 }
1203
1204 if (!backslashed && !extract_all && (*crs == '$' || *crs == '@'))
1205 {
1206 error_with_progname = false;
1207 error (error_level, 0, _("\
1208 %s:%d: invalid variable interpolation at \"%c\""),
1209 real_file_name, line_number, *crs);
1210 error_with_progname = true;
1211 ++crs;
1212 }
1213 else if (lowercase)
1214 {
1215 if (*crs >= 'A' && *crs <= 'Z')
1216 buffer[bufpos++] = *crs - 'A' + 'a';
1217 else if ((unsigned char) *crs >= 0x80)
1218 {
1219 error_with_progname = false;
1220 error (error_level, 0, _("\
1221 %s:%d: invalid interpolation (\"\\L\") of 8bit character \"%c\""),
1222 real_file_name, line_number, *crs);
1223 error_with_progname = true;
1224 buffer[bufpos++] = *crs;
1225 }
1226 else
1227 buffer[bufpos++] = *crs;
1228 ++crs;
1229 }
1230 else if (uppercase)
1231 {
1232 if (*crs >= 'a' && *crs <= 'z')
1233 buffer[bufpos++] = *crs - 'a' + 'A';
1234 else if ((unsigned char) *crs >= 0x80)
1235 {
1236 error_with_progname = false;
1237 error (error_level, 0, _("\
1238 %s:%d: invalid interpolation (\"\\U\") of 8bit character \"%c\""),
1239 real_file_name, line_number, *crs);
1240 error_with_progname = true;
1241 buffer[bufpos++] = *crs;
1242 }
1243 else
1244 buffer[bufpos++] = *crs;
1245 ++crs;
1246 }
1247 else
1248 {
1249 buffer[bufpos++] = *crs++;
1250 }
1251 }
1252
1253 /* Ensure room for 1 more byte. */
1254 if (bufpos >= bufmax)
1255 {
1256 bufmax = 2 * bufmax + 10;
1257 buffer = xrealloc (buffer, bufmax);
1258 }
1259
1260 buffer[bufpos++] = '\0';
1261
1262 #if DEBUG_PERL
1263 fprintf (stderr, "---> %s\n", buffer);
1264 #endif
1265
1266 /* Replace tp->string. */
1267 free (tp->string);
1268 tp->string = xstrdup (buffer);
1269 }
1270
1271 /* Parse a variable. This is done in several steps:
1272 1) Consume all leading occurencies of '$', '@', '%', and '*'.
1273 2) Determine the name of the variable from the following input.
1274 3) Parse possible following hash keys or array indexes.
1275 */
1276 static void
extract_variable(message_list_ty * mlp,token_ty * tp,int first)1277 extract_variable (message_list_ty *mlp, token_ty *tp, int first)
1278 {
1279 static char *buffer;
1280 static int bufmax = 0;
1281 int bufpos = 0;
1282 int c = first;
1283 size_t varbody_length = 0;
1284 bool maybe_hash_deref = false;
1285 bool maybe_hash_value = false;
1286
1287 tp->type = token_type_variable;
1288
1289 #if DEBUG_PERL
1290 fprintf (stderr, "%s:%d: extracting variable type '%c'\n",
1291 real_file_name, line_number, first);
1292 #endif
1293
1294 /*
1295 * 1) Consume dollars and so on (not euros ...). Unconditionally
1296 * accepting the hash sign (#) will maybe lead to inaccurate
1297 * results. FIXME!
1298 */
1299 while (c == '$' || c == '*' || c == '#' || c == '@' || c == '%')
1300 {
1301 if (bufpos >= bufmax)
1302 {
1303 bufmax = 2 * bufmax + 10;
1304 buffer = xrealloc (buffer, bufmax);
1305 }
1306 buffer[bufpos++] = c;
1307 c = phase1_getc ();
1308 }
1309
1310 if (c == EOF)
1311 {
1312 tp->type = token_type_eof;
1313 return;
1314 }
1315
1316 /* Hash references are treated in a special way, when looking for
1317 our keywords. */
1318 if (buffer[0] == '$')
1319 {
1320 if (bufpos == 1)
1321 maybe_hash_value = true;
1322 else if (bufpos == 2 && buffer[1] == '$')
1323 {
1324 if (!(c == '{'
1325 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1326 || (c >= '0' && c <= '9')
1327 || c == '_' || c == ':' || c == '\'' || c >= 0x80))
1328 {
1329 /* Special variable $$ for pid. */
1330 if (bufpos >= bufmax)
1331 {
1332 bufmax = 2 * bufmax + 10;
1333 buffer = xrealloc (buffer, bufmax);
1334 }
1335 buffer[bufpos++] = '\0';
1336 tp->string = xstrdup (buffer);
1337 #if DEBUG_PERL
1338 fprintf (stderr, "%s:%d: is PID ($$)\n",
1339 real_file_name, line_number);
1340 #endif
1341
1342 phase1_ungetc (c);
1343 return;
1344 }
1345
1346 maybe_hash_deref = true;
1347 bufpos = 1;
1348 }
1349 }
1350
1351 /*
1352 * 2) Get the name of the variable. The first character is practically
1353 * arbitrary. Punctuation and numbers automagically put a variable
1354 * in the global namespace but that subtle difference is not interesting
1355 * for us.
1356 */
1357 if (bufpos >= bufmax)
1358 {
1359 bufmax = 2 * bufmax + 10;
1360 buffer = xrealloc (buffer, bufmax);
1361 }
1362 if (c == '{')
1363 {
1364 /* Yuck, we cannot accept ${gettext} as a keyword... Except for
1365 * debugging purposes it is also harmless, that we suppress the
1366 * real name of the variable.
1367 */
1368 #if DEBUG_PERL
1369 fprintf (stderr, "%s:%d: braced {variable_name}\n",
1370 real_file_name, line_number);
1371 #endif
1372
1373 if (extract_balanced (mlp, token_type_rbrace, true, false,
1374 null_context, null_context_list_iterator,
1375 1, arglist_parser_alloc (mlp, NULL)))
1376 return;
1377 buffer[bufpos++] = c;
1378 }
1379 else
1380 {
1381 while ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
1382 || (c >= '0' && c <= '9')
1383 || c == '_' || c == ':' || c == '\'' || c >= 0x80)
1384 {
1385 ++varbody_length;
1386 if (bufpos >= bufmax)
1387 {
1388 bufmax = 2 * bufmax + 10;
1389 buffer = xrealloc (buffer, bufmax);
1390 }
1391 buffer[bufpos++] = c;
1392 c = phase1_getc ();
1393 }
1394 phase1_ungetc (c);
1395 }
1396
1397 /* Probably some strange Perl variable like $`. */
1398 if (varbody_length == 0)
1399 {
1400 c = phase1_getc ();
1401 if (c == EOF || is_whitespace (c))
1402 phase1_ungetc (c); /* Loser. */
1403 else
1404 {
1405 if (bufpos >= bufmax)
1406 {
1407 bufmax = 2 * bufmax + 10;
1408 buffer = xrealloc (buffer, bufmax);
1409 }
1410 buffer[bufpos++] = c;
1411 }
1412 }
1413
1414 if (bufpos >= bufmax)
1415 {
1416 bufmax = 2 * bufmax + 10;
1417 buffer = xrealloc (buffer, bufmax);
1418 }
1419 buffer[bufpos++] = '\0';
1420
1421 tp->string = xstrdup (buffer);
1422
1423 #if DEBUG_PERL
1424 fprintf (stderr, "%s:%d: complete variable name: %s\n",
1425 real_file_name, line_number, tp->string);
1426 #endif
1427
1428 prefer_division_over_regexp = true;
1429
1430 /*
1431 * 3) If the following looks strange to you, this is valid Perl syntax:
1432 *
1433 * $var = $$hashref # We can place a
1434 * # comment here and then ...
1435 * {key_into_hashref};
1436 *
1437 * POD sections are not allowed but we leave complaints about
1438 * that to the compiler/interpreter.
1439 */
1440 /* We only extract strings from the first hash key (if present). */
1441
1442 if (maybe_hash_deref || maybe_hash_value)
1443 {
1444 bool is_dereference = false;
1445 int c;
1446
1447 do
1448 c = phase2_getc ();
1449 while (is_whitespace (c));
1450
1451 if (c == '-')
1452 {
1453 int c2 = phase1_getc ();
1454
1455 if (c2 == '>')
1456 {
1457 is_dereference = true;
1458
1459 do
1460 c = phase2_getc ();
1461 while (is_whitespace (c));
1462 }
1463 else if (c2 != '\n')
1464 {
1465 /* Discarding the newline is harmless here. The only
1466 special character recognized after a minus is greater-than
1467 for dereference. However, the sequence "-\n>" that we
1468 treat incorrectly here, is a syntax error. */
1469 phase1_ungetc (c2);
1470 }
1471 }
1472
1473 if (maybe_hash_value && is_dereference)
1474 {
1475 #if DEBUG_PERL
1476 fprintf (stderr, "%s:%d: first keys preceded by \"->\"\n",
1477 real_file_name, line_number);
1478 #endif
1479 }
1480 else if (maybe_hash_value)
1481 {
1482 /* Fake it into a hash. */
1483 tp->string[0] = '%';
1484 }
1485
1486 /* Do NOT change that into else if (see above). */
1487 if ((maybe_hash_value || maybe_hash_deref) && c == '{')
1488 {
1489 void *keyword_value;
1490
1491 #if DEBUG_PERL
1492 fprintf (stderr, "%s:%d: first keys preceded by '{'\n",
1493 real_file_name, line_number);
1494 #endif
1495
1496 if (hash_find_entry (&keywords, tp->string, strlen (tp->string),
1497 &keyword_value) == 0)
1498 {
1499 /* TODO: Shouldn't we use the shapes of the keyword, instead
1500 of hardwiring argnum1 = 1 ?
1501 const struct callshapes *shapes =
1502 (const struct callshapes *) keyword_value;
1503 */
1504 struct callshapes shapes;
1505 shapes.keyword = tp->string; /* XXX storage duration? */
1506 shapes.keyword_len = strlen (tp->string);
1507 shapes.nshapes = 1;
1508 shapes.shapes[0].argnum1 = 1;
1509 shapes.shapes[0].argnum2 = 0;
1510 shapes.shapes[0].argnumc = 0;
1511 shapes.shapes[0].argnum1_glib_context = false;
1512 shapes.shapes[0].argnum2_glib_context = false;
1513 shapes.shapes[0].argtotal = 0;
1514 string_list_init (&shapes.shapes[0].xcomments);
1515
1516 {
1517 /* Extract a possible string from the key. Before proceeding
1518 we check whether the open curly is followed by a symbol and
1519 then by a right curly. */
1520 flag_context_list_iterator_ty context_iter =
1521 flag_context_list_iterator (
1522 flag_context_list_table_lookup (
1523 flag_context_list_table,
1524 tp->string, strlen (tp->string)));
1525 token_ty *t1 = x_perl_lex (mlp);
1526
1527 #if DEBUG_PERL
1528 fprintf (stderr, "%s:%d: extracting string key\n",
1529 real_file_name, line_number);
1530 #endif
1531
1532 if (t1->type == token_type_symbol
1533 || t1->type == token_type_named_op)
1534 {
1535 token_ty *t2 = x_perl_lex (mlp);
1536 if (t2->type == token_type_rbrace)
1537 {
1538 flag_context_ty context;
1539 lex_pos_ty pos;
1540
1541 context =
1542 inherited_context (null_context,
1543 flag_context_list_iterator_advance (
1544 &context_iter));
1545
1546 pos.line_number = line_number;
1547 pos.file_name = logical_file_name;
1548
1549 xgettext_current_source_encoding = po_charset_utf8;
1550 remember_a_message (mlp, NULL, xstrdup (t1->string),
1551 context, &pos, savable_comment);
1552 xgettext_current_source_encoding = xgettext_global_source_encoding;
1553 free_token (t2);
1554 free_token (t1);
1555 }
1556 else
1557 {
1558 x_perl_unlex (t2);
1559 }
1560 }
1561 else
1562 {
1563 x_perl_unlex (t1);
1564 if (extract_balanced (mlp, token_type_rbrace, true, false,
1565 null_context, context_iter,
1566 1, arglist_parser_alloc (mlp, &shapes)))
1567 return;
1568 }
1569 }
1570 }
1571 else
1572 {
1573 phase2_ungetc (c);
1574 }
1575 }
1576 else
1577 {
1578 phase2_ungetc (c);
1579 }
1580 }
1581
1582 /* Now consume "->", "[...]", and "{...}". */
1583 for (;;)
1584 {
1585 int c = phase2_getc ();
1586 int c2;
1587
1588 switch (c)
1589 {
1590 case '{':
1591 #if DEBUG_PERL
1592 fprintf (stderr, "%s:%d: extracting balanced '{' after varname\n",
1593 real_file_name, line_number);
1594 #endif
1595 extract_balanced (mlp, token_type_rbrace, true, false,
1596 null_context, null_context_list_iterator,
1597 1, arglist_parser_alloc (mlp, NULL));
1598 break;
1599
1600 case '[':
1601 #if DEBUG_PERL
1602 fprintf (stderr, "%s:%d: extracting balanced '[' after varname\n",
1603 real_file_name, line_number);
1604 #endif
1605 extract_balanced (mlp, token_type_rbracket, true, false,
1606 null_context, null_context_list_iterator,
1607 1, arglist_parser_alloc (mlp, NULL));
1608 break;
1609
1610 case '-':
1611 c2 = phase1_getc ();
1612 if (c2 == '>')
1613 {
1614 #if DEBUG_PERL
1615 fprintf (stderr, "%s:%d: another \"->\" after varname\n",
1616 real_file_name, line_number);
1617 #endif
1618 break;
1619 }
1620 else if (c2 != '\n')
1621 {
1622 /* Discarding the newline is harmless here. The only
1623 special character recognized after a minus is greater-than
1624 for dereference. However, the sequence "-\n>" that we
1625 treat incorrectly here, is a syntax error. */
1626 phase1_ungetc (c2);
1627 }
1628 /* FALLTHROUGH */
1629
1630 default:
1631 #if DEBUG_PERL
1632 fprintf (stderr, "%s:%d: variable finished\n",
1633 real_file_name, line_number);
1634 #endif
1635 phase2_ungetc (c);
1636 return;
1637 }
1638 }
1639 }
1640
1641 /* Actually a simplified version of extract_variable(). It searches for
1642 variables inside a double-quoted string that may interpolate to
1643 some keyword hash (reference). The string is UTF-8 encoded. */
1644 static void
interpolate_keywords(message_list_ty * mlp,const char * string,int lineno)1645 interpolate_keywords (message_list_ty *mlp, const char *string, int lineno)
1646 {
1647 static char *buffer;
1648 static int bufmax = 0;
1649 int bufpos = 0;
1650 flag_context_ty context;
1651 int c;
1652 bool maybe_hash_deref = false;
1653 enum parser_state
1654 {
1655 initial,
1656 one_dollar,
1657 two_dollars,
1658 identifier,
1659 minus,
1660 wait_lbrace,
1661 wait_quote,
1662 dquote,
1663 squote,
1664 barekey,
1665 wait_rbrace
1666 } state;
1667 token_ty token;
1668
1669 lex_pos_ty pos;
1670
1671 /* States are:
1672 *
1673 * initial: initial
1674 * one_dollar: dollar sign seen in state INITIAL
1675 * two_dollars: another dollar-sign has been seen in state ONE_DOLLAR
1676 * identifier: a valid identifier character has been seen in state
1677 * ONE_DOLLAR or TWO_DOLLARS
1678 * minus: a minus-sign has been seen in state IDENTIFIER
1679 * wait_lbrace: a greater-than has been seen in state MINUS
1680 * wait_quote: a left brace has been seen in state IDENTIFIER or in
1681 * state WAIT_LBRACE
1682 * dquote: a double-quote has been seen in state WAIT_QUOTE
1683 * squote: a single-quote has been seen in state WAIT_QUOTE
1684 * barekey: an bareword character has been seen in state WAIT_QUOTE
1685 * wait_rbrace: closing quote has been seen in state DQUOTE or SQUOTE
1686 *
1687 * In the states initial...identifier the context is null_context; in the
1688 * states minus...wait_rbrace the context is the one suitable for the first
1689 * argument of the last seen identifier.
1690 */
1691 state = initial;
1692 context = null_context;
1693
1694 token.type = token_type_string;
1695 token.sub_type = string_type_qq;
1696 token.line_number = line_number;
1697 pos.file_name = logical_file_name;
1698 pos.line_number = lineno;
1699
1700 while ((c = (unsigned char) *string++) != '\0')
1701 {
1702 void *keyword_value;
1703
1704 if (state == initial)
1705 bufpos = 0;
1706
1707 if (c == '\n')
1708 lineno++;
1709
1710 if (bufpos + 1 >= bufmax)
1711 {
1712 bufmax = 2 * bufmax + 10;
1713 buffer = xrealloc (buffer, bufmax);
1714 }
1715
1716 switch (state)
1717 {
1718 case initial:
1719 switch (c)
1720 {
1721 case '\\':
1722 c = (unsigned char) *string++;
1723 if (c == '\0')
1724 return;
1725 break;
1726 case '$':
1727 buffer[bufpos++] = '$';
1728 maybe_hash_deref = false;
1729 state = one_dollar;
1730 break;
1731 default:
1732 break;
1733 }
1734 break;
1735 case one_dollar:
1736 switch (c)
1737 {
1738 case '$':
1739 /*
1740 * This is enough to make us believe later that we dereference
1741 * a hash reference.
1742 */
1743 maybe_hash_deref = true;
1744 state = two_dollars;
1745 break;
1746 default:
1747 if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1748 || (c >= 'A' && c <= 'Z')
1749 || (c >= 'a' && c <= 'z')
1750 || (c >= '0' && c <= '9'))
1751 {
1752 buffer[bufpos++] = c;
1753 state = identifier;
1754 }
1755 else
1756 state = initial;
1757 break;
1758 }
1759 break;
1760 case two_dollars:
1761 if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1762 || (c >= 'A' && c <= 'Z')
1763 || (c >= 'a' && c <= 'z')
1764 || (c >= '0' && c <= '9'))
1765 {
1766 buffer[bufpos++] = c;
1767 state = identifier;
1768 }
1769 else
1770 state = initial;
1771 break;
1772 case identifier:
1773 switch (c)
1774 {
1775 case '-':
1776 if (hash_find_entry (&keywords, buffer, bufpos, &keyword_value)
1777 == 0)
1778 {
1779 flag_context_list_iterator_ty context_iter =
1780 flag_context_list_iterator (
1781 flag_context_list_table_lookup (
1782 flag_context_list_table,
1783 buffer, bufpos));
1784 context =
1785 inherited_context (null_context,
1786 flag_context_list_iterator_advance (
1787 &context_iter));
1788 state = minus;
1789 }
1790 else
1791 state = initial;
1792 break;
1793 case '{':
1794 if (!maybe_hash_deref)
1795 buffer[0] = '%';
1796 if (hash_find_entry (&keywords, buffer, bufpos, &keyword_value)
1797 == 0)
1798 {
1799 flag_context_list_iterator_ty context_iter =
1800 flag_context_list_iterator (
1801 flag_context_list_table_lookup (
1802 flag_context_list_table,
1803 buffer, bufpos));
1804 context =
1805 inherited_context (null_context,
1806 flag_context_list_iterator_advance (
1807 &context_iter));
1808 state = wait_quote;
1809 }
1810 else
1811 state = initial;
1812 break;
1813 default:
1814 if (c == '_' || c == ':' || c == '\'' || c >= 0x80
1815 || (c >= 'A' && c <= 'Z')
1816 || (c >= 'a' && c <= 'z')
1817 || (c >= '0' && c <= '9'))
1818 {
1819 buffer[bufpos++] = c;
1820 }
1821 else
1822 state = initial;
1823 break;
1824 }
1825 break;
1826 case minus:
1827 switch (c)
1828 {
1829 case '>':
1830 state = wait_lbrace;
1831 break;
1832 default:
1833 context = null_context;
1834 state = initial;
1835 break;
1836 }
1837 break;
1838 case wait_lbrace:
1839 switch (c)
1840 {
1841 case '{':
1842 state = wait_quote;
1843 break;
1844 default:
1845 context = null_context;
1846 state = initial;
1847 break;
1848 }
1849 break;
1850 case wait_quote:
1851 switch (c)
1852 {
1853 case_whitespace:
1854 break;
1855 case '\'':
1856 pos.line_number = lineno;
1857 bufpos = 0;
1858 state = squote;
1859 break;
1860 case '"':
1861 pos.line_number = lineno;
1862 bufpos = 0;
1863 state = dquote;
1864 break;
1865 default:
1866 if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80
1867 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
1868 {
1869 pos.line_number = lineno;
1870 bufpos = 0;
1871 buffer[bufpos++] = c;
1872 state = barekey;
1873 }
1874 else
1875 {
1876 context = null_context;
1877 state = initial;
1878 }
1879 break;
1880 }
1881 break;
1882 case dquote:
1883 switch (c)
1884 {
1885 case '"':
1886 /* The resulting string has to be interpolated twice. */
1887 buffer[bufpos] = '\0';
1888 token.string = xstrdup (buffer);
1889 extract_quotelike_pass3 (&token, EXIT_FAILURE);
1890 /* The string can only shrink with interpolation (because
1891 we ignore \Q). */
1892 if (!(strlen (token.string) <= bufpos))
1893 abort ();
1894 strcpy (buffer, token.string);
1895 free (token.string);
1896 state = wait_rbrace;
1897 break;
1898 case '\\':
1899 if (string[0] == '\"')
1900 {
1901 buffer[bufpos++] = string++[0];
1902 }
1903 else if (string[0])
1904 {
1905 buffer[bufpos++] = '\\';
1906 buffer[bufpos++] = string++[0];
1907 }
1908 else
1909 {
1910 context = null_context;
1911 state = initial;
1912 }
1913 break;
1914 default:
1915 buffer[bufpos++] = c;
1916 break;
1917 }
1918 break;
1919 case squote:
1920 switch (c)
1921 {
1922 case '\'':
1923 state = wait_rbrace;
1924 break;
1925 case '\\':
1926 if (string[0] == '\'')
1927 {
1928 buffer[bufpos++] = string++[0];
1929 }
1930 else if (string[0])
1931 {
1932 buffer[bufpos++] = '\\';
1933 buffer[bufpos++] = string++[0];
1934 }
1935 else
1936 {
1937 context = null_context;
1938 state = initial;
1939 }
1940 break;
1941 default:
1942 buffer[bufpos++] = c;
1943 break;
1944 }
1945 break;
1946 case barekey:
1947 if (c == '_' || (c >= '0' && c <= '9') || c >= 0x80
1948 || (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
1949 {
1950 buffer[bufpos++] = c;
1951 break;
1952 }
1953 else if (is_whitespace (c))
1954 {
1955 state = wait_rbrace;
1956 break;
1957 }
1958 else if (c != '}')
1959 {
1960 context = null_context;
1961 state = initial;
1962 break;
1963 }
1964 /* Must be right brace. */
1965 /* FALLTHROUGH */
1966 case wait_rbrace:
1967 switch (c)
1968 {
1969 case_whitespace:
1970 break;
1971 case '}':
1972 buffer[bufpos] = '\0';
1973 token.string = xstrdup (buffer);
1974 extract_quotelike_pass3 (&token, EXIT_FAILURE);
1975 xgettext_current_source_encoding = po_charset_utf8;
1976 remember_a_message (mlp, NULL, token.string, context, &pos,
1977 savable_comment);
1978 xgettext_current_source_encoding = xgettext_global_source_encoding;
1979 /* FALLTHROUGH */
1980 default:
1981 context = null_context;
1982 state = initial;
1983 break;
1984 }
1985 break;
1986 }
1987 }
1988 }
1989
1990 /* The last token seen in the token stream. This is important for the
1991 interpretation of '?' and '/'. */
1992 static token_type_ty last_token;
1993
1994 /* Combine characters into tokens. Discard whitespace. */
1995
1996 static void
x_perl_prelex(message_list_ty * mlp,token_ty * tp)1997 x_perl_prelex (message_list_ty *mlp, token_ty *tp)
1998 {
1999 static char *buffer;
2000 static int bufmax;
2001 int bufpos;
2002 int c;
2003
2004 for (;;)
2005 {
2006 c = phase2_getc ();
2007 tp->line_number = line_number;
2008
2009 switch (c)
2010 {
2011 case EOF:
2012 tp->type = token_type_eof;
2013 return;
2014
2015 case '\n':
2016 if (last_non_comment_line > last_comment_line)
2017 savable_comment_reset ();
2018 /* FALLTHROUGH */
2019 case '\t':
2020 case ' ':
2021 /* Ignore whitespace. */
2022 continue;
2023
2024 case '%':
2025 case '@':
2026 case '*':
2027 case '$':
2028 if (!extract_all)
2029 {
2030 extract_variable (mlp, tp, c);
2031 prefer_division_over_regexp = true;
2032 return;
2033 }
2034 break;
2035 }
2036
2037 last_non_comment_line = tp->line_number;
2038
2039 switch (c)
2040 {
2041 case '.':
2042 {
2043 int c2 = phase1_getc ();
2044 phase1_ungetc (c2);
2045 if (c2 == '.')
2046 {
2047 tp->type = token_type_other;
2048 prefer_division_over_regexp = false;
2049 return;
2050 }
2051 else if (c2 >= '0' && c2 <= '9')
2052 {
2053 prefer_division_over_regexp = false;
2054 }
2055 else
2056 {
2057 tp->type = token_type_dot;
2058 prefer_division_over_regexp = true;
2059 return;
2060 }
2061 }
2062 /* FALLTHROUGH */
2063 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2064 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
2065 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
2066 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
2067 case 'Y': case 'Z':
2068 case '_':
2069 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2070 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2071 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2072 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
2073 case 'y': case 'z':
2074 case '0': case '1': case '2': case '3': case '4':
2075 case '5': case '6': case '7': case '8': case '9':
2076 /* Symbol, or part of a number. */
2077 prefer_division_over_regexp = true;
2078 bufpos = 0;
2079 for (;;)
2080 {
2081 if (bufpos >= bufmax)
2082 {
2083 bufmax = 2 * bufmax + 10;
2084 buffer = xrealloc (buffer, bufmax);
2085 }
2086 buffer[bufpos++] = c;
2087 c = phase1_getc ();
2088 switch (c)
2089 {
2090 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
2091 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
2092 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
2093 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
2094 case 'Y': case 'Z':
2095 case '_':
2096 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
2097 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
2098 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
2099 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
2100 case 'y': case 'z':
2101 case '0': case '1': case '2': case '3': case '4':
2102 case '5': case '6': case '7': case '8': case '9':
2103 continue;
2104
2105 default:
2106 phase1_ungetc (c);
2107 break;
2108 }
2109 break;
2110 }
2111 if (bufpos >= bufmax)
2112 {
2113 bufmax = 2 * bufmax + 10;
2114 buffer = xrealloc (buffer, bufmax);
2115 }
2116 buffer[bufpos] = '\0';
2117
2118 if (strcmp (buffer, "__END__") == 0
2119 || strcmp (buffer, "__DATA__") == 0)
2120 {
2121 end_of_file = true;
2122 tp->type = token_type_eof;
2123 return;
2124 }
2125 else if (strcmp (buffer, "and") == 0
2126 || strcmp (buffer, "cmp") == 0
2127 || strcmp (buffer, "eq") == 0
2128 || strcmp (buffer, "if") == 0
2129 || strcmp (buffer, "ge") == 0
2130 || strcmp (buffer, "gt") == 0
2131 || strcmp (buffer, "le") == 0
2132 || strcmp (buffer, "lt") == 0
2133 || strcmp (buffer, "ne") == 0
2134 || strcmp (buffer, "not") == 0
2135 || strcmp (buffer, "or") == 0
2136 || strcmp (buffer, "unless") == 0
2137 || strcmp (buffer, "while") == 0
2138 || strcmp (buffer, "xor") == 0)
2139 {
2140 tp->type = token_type_named_op;
2141 tp->string = xstrdup (buffer);
2142 prefer_division_over_regexp = false;
2143 return;
2144 }
2145 else if (strcmp (buffer, "s") == 0
2146 || strcmp (buffer, "y") == 0
2147 || strcmp (buffer, "tr") == 0)
2148 {
2149 int delim = phase1_getc ();
2150
2151 while (is_whitespace (delim))
2152 delim = phase2_getc ();
2153
2154 if (delim == EOF)
2155 {
2156 tp->type = token_type_eof;
2157 return;
2158 }
2159 if ((delim >= '0' && delim <= '9')
2160 || (delim >= 'A' && delim <= 'Z')
2161 || (delim >= 'a' && delim <= 'z'))
2162 {
2163 /* False positive. */
2164 phase2_ungetc (delim);
2165 tp->type = token_type_symbol;
2166 tp->sub_type = symbol_type_none;
2167 tp->string = xstrdup (buffer);
2168 prefer_division_over_regexp = true;
2169 return;
2170 }
2171 extract_triple_quotelike (mlp, tp, delim,
2172 buffer[0] == 's' && delim != '\'');
2173
2174 /* Eat the following modifiers. */
2175 do
2176 c = phase1_getc ();
2177 while (c >= 'a' && c <= 'z');
2178 phase1_ungetc (c);
2179 return;
2180 }
2181 else if (strcmp (buffer, "m") == 0)
2182 {
2183 int delim = phase1_getc ();
2184
2185 while (is_whitespace (delim))
2186 delim = phase2_getc ();
2187
2188 if (delim == EOF)
2189 {
2190 tp->type = token_type_eof;
2191 return;
2192 }
2193 if ((delim >= '0' && delim <= '9')
2194 || (delim >= 'A' && delim <= 'Z')
2195 || (delim >= 'a' && delim <= 'z'))
2196 {
2197 /* False positive. */
2198 phase2_ungetc (delim);
2199 tp->type = token_type_symbol;
2200 tp->sub_type = symbol_type_none;
2201 tp->string = xstrdup (buffer);
2202 prefer_division_over_regexp = true;
2203 return;
2204 }
2205 extract_quotelike (tp, delim);
2206 if (delim != '\'')
2207 interpolate_keywords (mlp, tp->string, line_number);
2208 free (tp->string);
2209 tp->type = token_type_regex_op;
2210 prefer_division_over_regexp = true;
2211
2212 /* Eat the following modifiers. */
2213 do
2214 c = phase1_getc ();
2215 while (c >= 'a' && c <= 'z');
2216 phase1_ungetc (c);
2217 return;
2218 }
2219 else if (strcmp (buffer, "qq") == 0
2220 || strcmp (buffer, "q") == 0
2221 || strcmp (buffer, "qx") == 0
2222 || strcmp (buffer, "qw") == 0
2223 || strcmp (buffer, "qr") == 0)
2224 {
2225 /* The qw (...) construct is not really a string but we
2226 can treat in the same manner and then pretend it is
2227 a symbol. Rationale: Saying "qw (foo bar)" is the
2228 same as "my @list = ('foo', 'bar'); @list;". */
2229
2230 int delim = phase1_getc ();
2231
2232 while (is_whitespace (delim))
2233 delim = phase2_getc ();
2234
2235 if (delim == EOF)
2236 {
2237 tp->type = token_type_eof;
2238 return;
2239 }
2240 prefer_division_over_regexp = true;
2241
2242 if ((delim >= '0' && delim <= '9')
2243 || (delim >= 'A' && delim <= 'Z')
2244 || (delim >= 'a' && delim <= 'z'))
2245 {
2246 /* False positive. */
2247 phase2_ungetc (delim);
2248 tp->type = token_type_symbol;
2249 tp->sub_type = symbol_type_none;
2250 tp->string = xstrdup (buffer);
2251 prefer_division_over_regexp = true;
2252 return;
2253 }
2254
2255 extract_quotelike (tp, delim);
2256
2257 switch (buffer[1])
2258 {
2259 case 'q':
2260 case 'x':
2261 tp->type = token_type_string;
2262 tp->sub_type = string_type_qq;
2263 interpolate_keywords (mlp, tp->string, line_number);
2264 break;
2265 case 'r':
2266 tp->type = token_type_regex_op;
2267 break;
2268 case 'w':
2269 tp->type = token_type_symbol;
2270 tp->sub_type = symbol_type_none;
2271 break;
2272 case '\0':
2273 tp->type = token_type_string;
2274 tp->sub_type = string_type_q;
2275 break;
2276 default:
2277 abort ();
2278 }
2279 return;
2280 }
2281 else if (strcmp (buffer, "grep") == 0
2282 || strcmp (buffer, "split") == 0)
2283 {
2284 prefer_division_over_regexp = false;
2285 }
2286 tp->type = token_type_symbol;
2287 tp->sub_type = (strcmp (buffer, "sub") == 0
2288 ? symbol_type_sub
2289 : symbol_type_none);
2290 tp->string = xstrdup (buffer);
2291 return;
2292
2293 case '"':
2294 prefer_division_over_regexp = true;
2295 extract_quotelike (tp, c);
2296 tp->sub_type = string_type_qq;
2297 interpolate_keywords (mlp, tp->string, line_number);
2298 return;
2299
2300 case '`':
2301 prefer_division_over_regexp = true;
2302 extract_quotelike (tp, c);
2303 tp->sub_type = string_type_qq;
2304 interpolate_keywords (mlp, tp->string, line_number);
2305 return;
2306
2307 case '\'':
2308 prefer_division_over_regexp = true;
2309 extract_quotelike (tp, c);
2310 tp->sub_type = string_type_q;
2311 return;
2312
2313 case '(':
2314 c = phase2_getc ();
2315 if (c == ')')
2316 /* Ignore empty list. */
2317 continue;
2318 else
2319 phase2_ungetc (c);
2320 tp->type = token_type_lparen;
2321 prefer_division_over_regexp = false;
2322 return;
2323
2324 case ')':
2325 tp->type = token_type_rparen;
2326 prefer_division_over_regexp = true;
2327 return;
2328
2329 case '{':
2330 tp->type = token_type_lbrace;
2331 prefer_division_over_regexp = false;
2332 return;
2333
2334 case '}':
2335 tp->type = token_type_rbrace;
2336 prefer_division_over_regexp = false;
2337 return;
2338
2339 case '[':
2340 tp->type = token_type_lbracket;
2341 prefer_division_over_regexp = false;
2342 return;
2343
2344 case ']':
2345 tp->type = token_type_rbracket;
2346 prefer_division_over_regexp = false;
2347 return;
2348
2349 case ';':
2350 tp->type = token_type_semicolon;
2351 prefer_division_over_regexp = false;
2352 return;
2353
2354 case ',':
2355 tp->type = token_type_comma;
2356 prefer_division_over_regexp = false;
2357 return;
2358
2359 case '=':
2360 /* Check for fat comma. */
2361 c = phase1_getc ();
2362 if (c == '>')
2363 {
2364 tp->type = token_type_fat_comma;
2365 return;
2366 }
2367 else if (linepos == 2
2368 && (last_token == token_type_semicolon
2369 || last_token == token_type_rbrace)
2370 && ((c >= 'A' && c <='Z')
2371 || (c >= 'a' && c <= 'z')))
2372 {
2373 #if DEBUG_PERL
2374 fprintf (stderr, "%s:%d: start pod section\n",
2375 real_file_name, line_number);
2376 #endif
2377 skip_pod ();
2378 #if DEBUG_PERL
2379 fprintf (stderr, "%s:%d: end pod section\n",
2380 real_file_name, line_number);
2381 #endif
2382 continue;
2383 }
2384 phase1_ungetc (c);
2385 tp->type = token_type_other;
2386 prefer_division_over_regexp = false;
2387 return;
2388
2389 case '<':
2390 /* Check for <<EOF and friends. */
2391 prefer_division_over_regexp = false;
2392 c = phase1_getc ();
2393 if (c == '<')
2394 {
2395 c = phase1_getc ();
2396 if (c == '\'')
2397 {
2398 char *string;
2399 extract_quotelike (tp, c);
2400 string = get_here_document (tp->string);
2401 free (tp->string);
2402 tp->string = string;
2403 tp->type = token_type_string;
2404 tp->sub_type = string_type_verbatim;
2405 tp->line_number = line_number + 1;
2406 return;
2407 }
2408 else if (c == '"')
2409 {
2410 char *string;
2411 extract_quotelike (tp, c);
2412 string = get_here_document (tp->string);
2413 free (tp->string);
2414 tp->string = string;
2415 tp->type = token_type_string;
2416 tp->sub_type = string_type_qq;
2417 tp->line_number = line_number + 1;
2418 interpolate_keywords (mlp, tp->string, line_number + 1);
2419 return;
2420 }
2421 else if ((c >= 'A' && c <= 'Z')
2422 || (c >= 'a' && c <= 'z')
2423 || c == '_')
2424 {
2425 bufpos = 0;
2426 while ((c >= 'A' && c <= 'Z')
2427 || (c >= 'a' && c <= 'z')
2428 || (c >= '0' && c <= '9')
2429 || c == '_' || c >= 0x80)
2430 {
2431 if (bufpos >= bufmax)
2432 {
2433 bufmax = 2 * bufmax + 10;
2434 buffer = xrealloc (buffer, bufmax);
2435 }
2436 buffer[bufpos++] = c;
2437 c = phase1_getc ();
2438 }
2439 if (c == EOF)
2440 {
2441 tp->type = token_type_eof;
2442 return;
2443 }
2444 else
2445 {
2446 char *string;
2447 phase1_ungetc (c);
2448 if (bufpos >= bufmax)
2449 {
2450 bufmax = 2 * bufmax + 10;
2451 buffer = xrealloc (buffer, bufmax);
2452 }
2453 buffer[bufpos++] = '\0';
2454 string = get_here_document (buffer);
2455 tp->string = string;
2456 tp->type = token_type_string;
2457 tp->sub_type = string_type_qq;
2458 tp->line_number = line_number + 1;
2459 interpolate_keywords (mlp, tp->string, line_number + 1);
2460 return;
2461 }
2462 }
2463 else
2464 {
2465 tp->type = token_type_other;
2466 return;
2467 }
2468 }
2469 else
2470 {
2471 phase1_ungetc (c);
2472 tp->type = token_type_other;
2473 }
2474 return; /* End of case '>'. */
2475
2476 case '-':
2477 /* Check for dereferencing operator. */
2478 c = phase1_getc ();
2479 if (c == '>')
2480 {
2481 tp->type = token_type_dereference;
2482 return;
2483 }
2484 else if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
2485 {
2486 /* One of the -X (filetest) functions. We play safe
2487 and accept all alphabetical characters here. */
2488 tp->type = token_type_other;
2489 return;
2490 }
2491 phase1_ungetc (c);
2492 tp->type = token_type_other;
2493 prefer_division_over_regexp = false;
2494 return;
2495
2496 case '/':
2497 case '?':
2498 if (!prefer_division_over_regexp)
2499 {
2500 extract_quotelike (tp, c);
2501 interpolate_keywords (mlp, tp->string, line_number);
2502 free (tp->string);
2503 tp->type = token_type_other;
2504 prefer_division_over_regexp = true;
2505 /* Eat the following modifiers. */
2506 do
2507 c = phase1_getc ();
2508 while (c >= 'a' && c <= 'z');
2509 phase1_ungetc (c);
2510 return;
2511 }
2512 /* FALLTHROUGH */
2513
2514 default:
2515 /* We could carefully recognize each of the 2 and 3 character
2516 operators, but it is not necessary, as we only need to recognize
2517 gettext invocations. Don't bother. */
2518 tp->type = token_type_other;
2519 prefer_division_over_regexp = false;
2520 return;
2521 }
2522 }
2523 }
2524
2525
2526 /* A token stack used as a lookahead buffer. */
2527
2528 typedef struct token_stack_ty token_stack_ty;
2529 struct token_stack_ty
2530 {
2531 token_ty **items;
2532 size_t nitems;
2533 size_t nitems_max;
2534 };
2535
2536 static struct token_stack_ty token_stack;
2537
2538 #if DEBUG_PERL
2539 /* Dumps all resources allocated by stack STACK. */
2540 static int
token_stack_dump(token_stack_ty * stack)2541 token_stack_dump (token_stack_ty *stack)
2542 {
2543 size_t i;
2544
2545 fprintf (stderr, "BEGIN STACK DUMP\n");
2546 for (i = 0; i < stack->nitems; i++)
2547 {
2548 token_ty *token = stack->items[i];
2549 fprintf (stderr, " [%s]\n", token2string (token));
2550 switch (token->type)
2551 {
2552 case token_type_named_op:
2553 case token_type_string:
2554 case token_type_symbol:
2555 case token_type_variable:
2556 fprintf (stderr, " string: %s\n", token->string);
2557 break;
2558 }
2559 }
2560 fprintf (stderr, "END STACK DUMP\n");
2561 return 0;
2562 }
2563 #endif
2564
2565 /* Pushes the token TOKEN onto the stack STACK. */
2566 static inline void
token_stack_push(token_stack_ty * stack,token_ty * token)2567 token_stack_push (token_stack_ty *stack, token_ty *token)
2568 {
2569 if (stack->nitems >= stack->nitems_max)
2570 {
2571 size_t nbytes;
2572
2573 stack->nitems_max = 2 * stack->nitems_max + 4;
2574 nbytes = stack->nitems_max * sizeof (token_ty *);
2575 stack->items = xrealloc (stack->items, nbytes);
2576 }
2577 stack->items[stack->nitems++] = token;
2578 }
2579
2580 /* Pops the most recently pushed token from the stack STACK and returns it.
2581 Returns NULL if the stack is empty. */
2582 static inline token_ty *
token_stack_pop(token_stack_ty * stack)2583 token_stack_pop (token_stack_ty *stack)
2584 {
2585 if (stack->nitems > 0)
2586 return stack->items[--(stack->nitems)];
2587 else
2588 return NULL;
2589 }
2590
2591 /* Return the top of the stack without removing it from the stack, or
2592 NULL if the stack is empty. */
2593 static inline token_ty *
token_stack_peek(const token_stack_ty * stack)2594 token_stack_peek (const token_stack_ty *stack)
2595 {
2596 if (stack->nitems > 0)
2597 return stack->items[stack->nitems - 1];
2598 else
2599 return NULL;
2600 }
2601
2602 /* Frees all resources allocated by stack STACK. */
2603 static inline void
token_stack_free(token_stack_ty * stack)2604 token_stack_free (token_stack_ty *stack)
2605 {
2606 size_t i;
2607
2608 for (i = 0; i < stack->nitems; i++)
2609 free_token (stack->items[i]);
2610 free (stack->items);
2611 }
2612
2613
2614 static token_ty *
x_perl_lex(message_list_ty * mlp)2615 x_perl_lex (message_list_ty *mlp)
2616 {
2617 #if DEBUG_PERL
2618 int dummy = token_stack_dump (&token_stack);
2619 #endif
2620 token_ty *tp = token_stack_pop (&token_stack);
2621
2622 if (!tp)
2623 {
2624 tp = (token_ty *) xmalloc (sizeof (token_ty));
2625 x_perl_prelex (mlp, tp);
2626 #if DEBUG_PERL
2627 fprintf (stderr, "%s:%d: x_perl_prelex returned %s\n",
2628 real_file_name, line_number, token2string (tp));
2629 #endif
2630 }
2631 #if DEBUG_PERL
2632 else
2633 {
2634 fprintf (stderr, "%s:%d: %s recycled from stack\n",
2635 real_file_name, line_number, token2string (tp));
2636 }
2637 #endif
2638
2639 /* A symbol followed by a fat comma is really a single-quoted string.
2640 Function definitions or forward declarations also need a special
2641 handling because the dollars and at signs inside the parentheses
2642 must not be interpreted as the beginning of a variable ')'. */
2643 if (tp->type == token_type_symbol || tp->type == token_type_named_op)
2644 {
2645 token_ty *next = token_stack_peek (&token_stack);
2646
2647 if (!next)
2648 {
2649 #if DEBUG_PERL
2650 fprintf (stderr, "%s:%d: pre-fetching next token\n",
2651 real_file_name, line_number);
2652 #endif
2653 next = x_perl_lex (mlp);
2654 x_perl_unlex (next);
2655 #if DEBUG_PERL
2656 fprintf (stderr, "%s:%d: unshifted next token\n",
2657 real_file_name, line_number);
2658 #endif
2659 }
2660
2661 #if DEBUG_PERL
2662 fprintf (stderr, "%s:%d: next token is %s\n",
2663 real_file_name, line_number, token2string (next));
2664 #endif
2665
2666 if (next->type == token_type_fat_comma)
2667 {
2668 tp->type = token_type_string;
2669 tp->sub_type = string_type_q;
2670 #if DEBUG_PERL
2671 fprintf (stderr,
2672 "%s:%d: token %s mutated to token_type_string\n",
2673 real_file_name, line_number, token2string (tp));
2674 #endif
2675 }
2676 else if (tp->type == token_type_symbol && tp->sub_type == symbol_type_sub
2677 && next->type == token_type_symbol)
2678 {
2679 /* Start of a function declaration or definition. Mark this
2680 symbol as a function name, so that we can later eat up
2681 possible prototype information. */
2682 #if DEBUG_PERL
2683 fprintf (stderr, "%s:%d: subroutine declaration/definition '%s'\n",
2684 real_file_name, line_number, next->string);
2685 #endif
2686 next->sub_type = symbol_type_function;
2687 }
2688 else if (tp->type == token_type_symbol
2689 && (tp->sub_type == symbol_type_sub
2690 || tp->sub_type == symbol_type_function)
2691 && next->type == token_type_lparen)
2692 {
2693 /* For simplicity we simply consume everything up to the
2694 closing parenthesis. Actually only a limited set of
2695 characters is allowed inside parentheses but we leave
2696 complaints to the interpreter and are prepared for
2697 future extensions to the Perl syntax. */
2698 int c;
2699
2700 #if DEBUG_PERL
2701 fprintf (stderr, "%s:%d: consuming prototype information\n",
2702 real_file_name, line_number);
2703 #endif
2704
2705 do
2706 {
2707 c = phase1_getc ();
2708 #if DEBUG_PERL
2709 fprintf (stderr, " consuming character '%c'\n", c);
2710 #endif
2711 }
2712 while (c != EOF && c != ')');
2713 phase1_ungetc (c);
2714 }
2715 }
2716
2717 return tp;
2718 }
2719
2720 static void
x_perl_unlex(token_ty * tp)2721 x_perl_unlex (token_ty *tp)
2722 {
2723 token_stack_push (&token_stack, tp);
2724 }
2725
2726
2727 /* ========================= Extracting strings. ========================== */
2728
2729 /* Assuming TP is a string token, this function accumulates all subsequent
2730 . string2 . string3 ... to the string. (String concatenation.) */
2731
2732 static char *
collect_message(message_list_ty * mlp,token_ty * tp,int error_level)2733 collect_message (message_list_ty *mlp, token_ty *tp, int error_level)
2734 {
2735 char *string;
2736 size_t len;
2737
2738 extract_quotelike_pass3 (tp, error_level);
2739 string = xstrdup (tp->string);
2740 len = strlen (tp->string) + 1;
2741
2742 for (;;)
2743 {
2744 int c;
2745
2746 do
2747 c = phase2_getc ();
2748 while (is_whitespace (c));
2749
2750 if (c != '.')
2751 {
2752 phase2_ungetc (c);
2753 return string;
2754 }
2755
2756 do
2757 c = phase2_getc ();
2758 while (is_whitespace (c));
2759
2760 phase2_ungetc (c);
2761
2762 if (c == '"' || c == '\'' || c == '`'
2763 || (!prefer_division_over_regexp && (c == '/' || c == '?'))
2764 || c == 'q')
2765 {
2766 token_ty *qstring = x_perl_lex (mlp);
2767 if (qstring->type != token_type_string)
2768 {
2769 /* assert (qstring->type == token_type_symbol) */
2770 x_perl_unlex (qstring);
2771 return string;
2772 }
2773
2774 extract_quotelike_pass3 (qstring, error_level);
2775 len += strlen (qstring->string);
2776 string = xrealloc (string, len);
2777 strcat (string, qstring->string);
2778 free_token (qstring);
2779 }
2780 }
2781 }
2782
2783 /* The file is broken into tokens. Scan the token stream, looking for
2784 a keyword, followed by a left paren, followed by a string. When we
2785 see this sequence, we have something to remember. We assume we are
2786 looking at a valid Perl program, and leave the complaints about
2787 the grammar to the compiler.
2788
2789 Normal handling: Look for
2790 keyword ( ... msgid ... )
2791 Plural handling: Look for
2792 keyword ( ... msgid ... msgid_plural ... )
2793
2794 We use recursion because the arguments before msgid or between msgid
2795 and msgid_plural can contain subexpressions of the same form.
2796
2797 In Perl, parentheses around function arguments can be omitted.
2798
2799 The general rules are:
2800 1) Functions declared with a prototype take exactly the specified number
2801 of arguments.
2802 sub one_arg ($) { ... }
2803 sub two_args ($$) { ... }
2804 2) When a function name is immediately followed by an opening parenthesis,
2805 the argument list ends at the corresponding closing parenthesis.
2806
2807 If rule 1 and rule 2 are contradictory, i.e. when the program calls a
2808 function with an explicit argument list and the wrong number of arguments,
2809 the program is invalid:
2810 sub two_args ($$) { ... }
2811 foo two_args (x), y - invalid due to rules 1 and 2
2812
2813 Ambiguities are resolved as follows:
2814 3) Some built-ins, such as 'abs', 'sqrt', 'sin', 'cos', ..., and functions
2815 declared with a prototype of exactly one argument take exactly one
2816 argument:
2817 foo sin x, y ==> foo (sin (x), y)
2818 sub one_arg ($) { ... }
2819 foo one_arg x, y, z ==> foo (one_arg (x), y, z)
2820 4) Other identifiers, if not immediately followed by an opening
2821 parenthesis, consume the entire remaining argument list:
2822 foo bar x, y ==> foo (bar (x, y))
2823 sub two_args ($$) { ... }
2824 foo two_args x, y ==> foo (two_args (x, y))
2825
2826 Other series of comma separated expressions without a function name at
2827 the beginning are comma expressions:
2828 sub two_args ($$) { ... }
2829 foo two_args x, (y, z) ==> foo (two_args (x, (y, z)))
2830 Note that the evaluation of comma expressions returns a list of values
2831 when in list context (e.g. inside the argument list of a function without
2832 prototype) but only one value when inside the argument list of a function
2833 with a prototype:
2834 sub print3 ($$$) { print @_ }
2835 print3 5, (6, 7), 8 ==> 578
2836 print 5, (6, 7), 8 ==> 5678
2837
2838 Where rule 3 or 4 contradict rule 1 or 2, the program is invalid:
2839 sin (x, y) - invalid due to rules 2 and 3
2840 sub one_arg ($) { ... }
2841 one_arg (x, y) - invalid due to rules 2 and 3
2842 sub two_args ($$) { ... }
2843 foo two_args x, y, z - invalid due to rules 1 and 4
2844 */
2845
2846 /* Extract messages until the next balanced closing parenthesis.
2847 Extracted messages are added to MLP.
2848
2849 DELIM can be either token_type_rbrace, token_type_rbracket,
2850 token_type_rparen. Additionally, if COMMA_DELIM is true, parsing
2851 stops at the next comma outside parentheses.
2852
2853 ARG is the current argument list position, starts with 1.
2854 ARGPARSER is the corresponding argument list parser.
2855
2856 Returns true for EOF, false otherwise. */
2857
2858 static bool
extract_balanced(message_list_ty * mlp,token_type_ty delim,bool eat_delim,bool comma_delim,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,int arg,struct arglist_parser * argparser)2859 extract_balanced (message_list_ty *mlp,
2860 token_type_ty delim, bool eat_delim, bool comma_delim,
2861 flag_context_ty outer_context,
2862 flag_context_list_iterator_ty context_iter,
2863 int arg, struct arglist_parser *argparser)
2864 {
2865 /* Whether to implicitly assume the next tokens are arguments even without
2866 a '('. */
2867 bool next_is_argument = false;
2868 /* Parameters of the keyword just seen. Defined only when next_is_argument
2869 is true. */
2870 const struct callshapes *next_shapes = NULL;
2871 struct arglist_parser *next_argparser = NULL;
2872
2873 /* Whether to not consider strings until the next comma. */
2874 bool skip_until_comma = false;
2875
2876 /* Context iterator that will be used if the next token is a '('. */
2877 flag_context_list_iterator_ty next_context_iter =
2878 passthrough_context_list_iterator;
2879 /* Current context. */
2880 flag_context_ty inner_context =
2881 inherited_context (outer_context,
2882 flag_context_list_iterator_advance (&context_iter));
2883
2884 #if DEBUG_PERL
2885 static int nesting_level = 0;
2886
2887 ++nesting_level;
2888 #endif
2889
2890 last_token = token_type_semicolon; /* Safe assumption. */
2891 prefer_division_over_regexp = false;
2892
2893 for (;;)
2894 {
2895 /* The current token. */
2896 token_ty *tp;
2897
2898 tp = x_perl_lex (mlp);
2899
2900 last_token = tp->type;
2901
2902 if (delim == tp->type)
2903 {
2904 xgettext_current_source_encoding = po_charset_utf8;
2905 arglist_parser_done (argparser, arg);
2906 xgettext_current_source_encoding = xgettext_global_source_encoding;
2907 if (next_argparser != NULL)
2908 free (next_argparser);
2909 #if DEBUG_PERL
2910 fprintf (stderr, "%s:%d: extract_balanced finished (%d)\n",
2911 logical_file_name, tp->line_number, --nesting_level);
2912 #endif
2913 if (eat_delim)
2914 free_token (tp);
2915 else
2916 /* Preserve the delimiter for the caller. */
2917 x_perl_unlex (tp);
2918 return false;
2919 }
2920
2921 if (comma_delim && tp->type == token_type_comma)
2922 {
2923 xgettext_current_source_encoding = po_charset_utf8;
2924 arglist_parser_done (argparser, arg);
2925 xgettext_current_source_encoding = xgettext_global_source_encoding;
2926 if (next_argparser != NULL)
2927 free (next_argparser);
2928 #if DEBUG_PERL
2929 fprintf (stderr, "%s:%d: extract_balanced finished at comma (%d)\n",
2930 logical_file_name, tp->line_number, --nesting_level);
2931 #endif
2932 x_perl_unlex (tp);
2933 return false;
2934 }
2935
2936 if (next_is_argument && tp->type != token_type_lparen)
2937 {
2938 /* An argument list starts, even though there is no '('. */
2939 bool next_comma_delim;
2940
2941 x_perl_unlex (tp);
2942
2943 if (next_shapes != NULL)
2944 /* We know something about the function being called. Assume
2945 that it consumes only one argument if no argument number or
2946 total > 1 is specified. */
2947 {
2948 size_t i;
2949
2950 next_comma_delim = true;
2951 for (i = 0; i < next_shapes->nshapes; i++)
2952 {
2953 const struct callshape *shape = &next_shapes->shapes[i];
2954
2955 if (shape->argnum1 > 1
2956 || shape->argnum2 > 1
2957 || shape->argnumc > 1
2958 || shape->argtotal > 1)
2959 next_comma_delim = false;
2960 }
2961 }
2962 else
2963 /* We know nothing about the function being called. It could be
2964 a function prototyped to take only one argument, or on the other
2965 hand it could be prototyped to take more than one argument or an
2966 arbitrary argument list or it could be unprototyped. Due to
2967 the way the parser works, assuming the first case gives the
2968 best results. */
2969 next_comma_delim = true;
2970
2971 if (extract_balanced (mlp, delim, false, next_comma_delim,
2972 inner_context, next_context_iter,
2973 1, next_argparser))
2974 {
2975 xgettext_current_source_encoding = po_charset_utf8;
2976 arglist_parser_done (argparser, arg);
2977 xgettext_current_source_encoding = xgettext_global_source_encoding;
2978 return true;
2979 }
2980
2981 next_is_argument = false;
2982 next_argparser = NULL;
2983 next_context_iter = null_context_list_iterator;
2984 continue;
2985 }
2986
2987 switch (tp->type)
2988 {
2989 case token_type_symbol:
2990 #if DEBUG_PERL
2991 fprintf (stderr, "%s:%d: type symbol (%d) \"%s\"\n",
2992 logical_file_name, tp->line_number, nesting_level,
2993 tp->string);
2994 #endif
2995
2996 {
2997 void *keyword_value;
2998
2999 if (hash_find_entry (&keywords, tp->string, strlen (tp->string),
3000 &keyword_value) == 0)
3001 {
3002 const struct callshapes *shapes =
3003 (const struct callshapes *) keyword_value;
3004
3005 last_token = token_type_keyword_symbol;
3006 next_shapes = shapes;
3007 next_argparser = arglist_parser_alloc (mlp, shapes);
3008 }
3009 else
3010 {
3011 next_shapes = NULL;
3012 next_argparser = arglist_parser_alloc (mlp, NULL);
3013 }
3014 }
3015 next_is_argument = true;
3016 next_context_iter =
3017 flag_context_list_iterator (
3018 flag_context_list_table_lookup (
3019 flag_context_list_table,
3020 tp->string, strlen (tp->string)));
3021 break;
3022
3023 case token_type_variable:
3024 #if DEBUG_PERL
3025 fprintf (stderr, "%s:%d: type variable (%d) \"%s\"\n",
3026 logical_file_name, tp->line_number, nesting_level, tp->string);
3027 #endif
3028 prefer_division_over_regexp = true;
3029 next_is_argument = false;
3030 if (next_argparser != NULL)
3031 free (next_argparser);
3032 next_argparser = NULL;
3033 next_context_iter = null_context_list_iterator;
3034 break;
3035
3036 case token_type_lparen:
3037 #if DEBUG_PERL
3038 fprintf (stderr, "%s:%d: type left parenthesis (%d)\n",
3039 logical_file_name, tp->line_number, nesting_level);
3040 #endif
3041 if (next_is_argument)
3042 {
3043 /* Parse the argument list of a function call. */
3044 if (extract_balanced (mlp, token_type_rparen, true, false,
3045 inner_context, next_context_iter,
3046 1, next_argparser))
3047 {
3048 xgettext_current_source_encoding = po_charset_utf8;
3049 arglist_parser_done (argparser, arg);
3050 xgettext_current_source_encoding = xgettext_global_source_encoding;
3051 return true;
3052 }
3053 next_is_argument = false;
3054 next_argparser = NULL;
3055 }
3056 else
3057 {
3058 /* Parse a parenthesized expression or comma expression. */
3059 if (extract_balanced (mlp, token_type_rparen, true, false,
3060 inner_context, next_context_iter,
3061 arg, arglist_parser_clone (argparser)))
3062 {
3063 xgettext_current_source_encoding = po_charset_utf8;
3064 arglist_parser_done (argparser, arg);
3065 xgettext_current_source_encoding = xgettext_global_source_encoding;
3066 if (next_argparser != NULL)
3067 free (next_argparser);
3068 free_token (tp);
3069 return true;
3070 }
3071 next_is_argument = false;
3072 if (next_argparser != NULL)
3073 free (next_argparser);
3074 next_argparser = NULL;
3075 }
3076 skip_until_comma = true;
3077 next_context_iter = null_context_list_iterator;
3078 break;
3079
3080 case token_type_rparen:
3081 #if DEBUG_PERL
3082 fprintf (stderr, "%s:%d: type right parenthesis (%d)\n",
3083 logical_file_name, tp->line_number, nesting_level);
3084 #endif
3085 next_is_argument = false;
3086 if (next_argparser != NULL)
3087 free (next_argparser);
3088 next_argparser = NULL;
3089 skip_until_comma = true;
3090 next_context_iter = null_context_list_iterator;
3091 break;
3092
3093 case token_type_comma:
3094 case token_type_fat_comma:
3095 #if DEBUG_PERL
3096 fprintf (stderr, "%s:%d: type comma (%d)\n",
3097 logical_file_name, tp->line_number, nesting_level);
3098 #endif
3099 if (arglist_parser_decidedp (argparser, arg))
3100 {
3101 /* We have missed the argument. */
3102 xgettext_current_source_encoding = po_charset_utf8;
3103 arglist_parser_done (argparser, arg);
3104 xgettext_current_source_encoding = xgettext_global_source_encoding;
3105 argparser = arglist_parser_alloc (mlp, NULL);
3106 arg = 0;
3107 }
3108 arg++;
3109 #if DEBUG_PERL
3110 fprintf (stderr, "%s:%d: arg: %d\n",
3111 real_file_name, tp->line_number, arg);
3112 #endif
3113 inner_context =
3114 inherited_context (outer_context,
3115 flag_context_list_iterator_advance (
3116 &context_iter));
3117 next_is_argument = false;
3118 if (next_argparser != NULL)
3119 free (next_argparser);
3120 next_argparser = NULL;
3121 skip_until_comma = false;
3122 next_context_iter = passthrough_context_list_iterator;
3123 break;
3124
3125 case token_type_string:
3126 #if DEBUG_PERL
3127 fprintf (stderr, "%s:%d: type string (%d): \"%s\"\n",
3128 logical_file_name, tp->line_number, nesting_level,
3129 tp->string);
3130 #endif
3131
3132 if (extract_all)
3133 {
3134 char *string = collect_message (mlp, tp, EXIT_SUCCESS);
3135 lex_pos_ty pos;
3136
3137 pos.file_name = logical_file_name;
3138 pos.line_number = tp->line_number;
3139 xgettext_current_source_encoding = po_charset_utf8;
3140 remember_a_message (mlp, NULL, string, inner_context, &pos, savable_comment);
3141 xgettext_current_source_encoding = xgettext_global_source_encoding;
3142 }
3143 else if (!skip_until_comma)
3144 {
3145 /* Need to collect the complete string, with error checking,
3146 only if the argument ARG is used in ARGPARSER. */
3147 bool must_collect = false;
3148 {
3149 size_t nalternatives = argparser->nalternatives;
3150 size_t i;
3151
3152 for (i = 0; i < nalternatives; i++)
3153 {
3154 struct partial_call *cp = &argparser->alternative[i];
3155
3156 if (arg == cp->argnumc
3157 || arg == cp->argnum1 || arg == cp->argnum2)
3158 must_collect = true;
3159 }
3160 }
3161
3162 if (must_collect)
3163 {
3164 char *string = collect_message (mlp, tp, EXIT_FAILURE);
3165
3166 xgettext_current_source_encoding = po_charset_utf8;
3167 arglist_parser_remember (argparser, arg,
3168 string, inner_context,
3169 logical_file_name, tp->line_number,
3170 savable_comment);
3171 xgettext_current_source_encoding = xgettext_global_source_encoding;
3172 }
3173 }
3174
3175 if (arglist_parser_decidedp (argparser, arg))
3176 {
3177 xgettext_current_source_encoding = po_charset_utf8;
3178 arglist_parser_done (argparser, arg);
3179 xgettext_current_source_encoding = xgettext_global_source_encoding;
3180 argparser = arglist_parser_alloc (mlp, NULL);
3181 }
3182
3183 next_is_argument = false;
3184 if (next_argparser != NULL)
3185 free (next_argparser);
3186 next_argparser = NULL;
3187 next_context_iter = null_context_list_iterator;
3188 break;
3189
3190 case token_type_eof:
3191 #if DEBUG_PERL
3192 fprintf (stderr, "%s:%d: type EOF (%d)\n",
3193 logical_file_name, tp->line_number, nesting_level);
3194 #endif
3195 xgettext_current_source_encoding = po_charset_utf8;
3196 arglist_parser_done (argparser, arg);
3197 xgettext_current_source_encoding = xgettext_global_source_encoding;
3198 if (next_argparser != NULL)
3199 free (next_argparser);
3200 next_argparser = NULL;
3201 free_token (tp);
3202 return true;
3203
3204 case token_type_lbrace:
3205 #if DEBUG_PERL
3206 fprintf (stderr, "%s:%d: type lbrace (%d)\n",
3207 logical_file_name, tp->line_number, nesting_level);
3208 #endif
3209 if (extract_balanced (mlp, token_type_rbrace, true, false,
3210 null_context, null_context_list_iterator,
3211 1, arglist_parser_alloc (mlp, NULL)))
3212 {
3213 xgettext_current_source_encoding = po_charset_utf8;
3214 arglist_parser_done (argparser, arg);
3215 xgettext_current_source_encoding = xgettext_global_source_encoding;
3216 if (next_argparser != NULL)
3217 free (next_argparser);
3218 free_token (tp);
3219 return true;
3220 }
3221 next_is_argument = false;
3222 if (next_argparser != NULL)
3223 free (next_argparser);
3224 next_argparser = NULL;
3225 next_context_iter = null_context_list_iterator;
3226 break;
3227
3228 case token_type_rbrace:
3229 #if DEBUG_PERL
3230 fprintf (stderr, "%s:%d: type rbrace (%d)\n",
3231 logical_file_name, tp->line_number, nesting_level);
3232 #endif
3233 next_is_argument = false;
3234 if (next_argparser != NULL)
3235 free (next_argparser);
3236 next_argparser = NULL;
3237 next_context_iter = null_context_list_iterator;
3238 break;
3239
3240 case token_type_lbracket:
3241 #if DEBUG_PERL
3242 fprintf (stderr, "%s:%d: type lbracket (%d)\n",
3243 logical_file_name, tp->line_number, nesting_level);
3244 #endif
3245 if (extract_balanced (mlp, token_type_rbracket, true, false,
3246 null_context, null_context_list_iterator,
3247 1, arglist_parser_alloc (mlp, NULL)))
3248 {
3249 xgettext_current_source_encoding = po_charset_utf8;
3250 arglist_parser_done (argparser, arg);
3251 xgettext_current_source_encoding = xgettext_global_source_encoding;
3252 if (next_argparser != NULL)
3253 free (next_argparser);
3254 free_token (tp);
3255 return true;
3256 }
3257 next_is_argument = false;
3258 if (next_argparser != NULL)
3259 free (next_argparser);
3260 next_argparser = NULL;
3261 next_context_iter = null_context_list_iterator;
3262 break;
3263
3264 case token_type_rbracket:
3265 #if DEBUG_PERL
3266 fprintf (stderr, "%s:%d: type rbracket (%d)\n",
3267 logical_file_name, tp->line_number, nesting_level);
3268 #endif
3269 next_is_argument = false;
3270 if (next_argparser != NULL)
3271 free (next_argparser);
3272 next_argparser = NULL;
3273 next_context_iter = null_context_list_iterator;
3274 break;
3275
3276 case token_type_semicolon:
3277 #if DEBUG_PERL
3278 fprintf (stderr, "%s:%d: type semicolon (%d)\n",
3279 logical_file_name, tp->line_number, nesting_level);
3280 #endif
3281
3282 /* The ultimate sign. */
3283 xgettext_current_source_encoding = po_charset_utf8;
3284 arglist_parser_done (argparser, arg);
3285 xgettext_current_source_encoding = xgettext_global_source_encoding;
3286 argparser = arglist_parser_alloc (mlp, NULL);
3287
3288 /* FIXME: Instead of resetting outer_context here, it may be better
3289 to recurse in the next_is_argument handling above, waiting for
3290 the next semicolon or other statement terminator. */
3291 outer_context = null_context;
3292 context_iter = null_context_list_iterator;
3293 next_is_argument = false;
3294 if (next_argparser != NULL)
3295 free (next_argparser);
3296 next_argparser = NULL;
3297 next_context_iter = passthrough_context_list_iterator;
3298 inner_context =
3299 inherited_context (outer_context,
3300 flag_context_list_iterator_advance (
3301 &context_iter));
3302 break;
3303
3304 case token_type_dereference:
3305 #if DEBUG_PERL
3306 fprintf (stderr, "%s:%d: type dereference (%d)\n",
3307 logical_file_name, tp->line_number, nesting_level);
3308 #endif
3309 next_is_argument = false;
3310 if (next_argparser != NULL)
3311 free (next_argparser);
3312 next_argparser = NULL;
3313 next_context_iter = null_context_list_iterator;
3314 break;
3315
3316 case token_type_dot:
3317 #if DEBUG_PERL
3318 fprintf (stderr, "%s:%d: type dot (%d)\n",
3319 logical_file_name, tp->line_number, nesting_level);
3320 #endif
3321 next_is_argument = false;
3322 if (next_argparser != NULL)
3323 free (next_argparser);
3324 next_argparser = NULL;
3325 next_context_iter = null_context_list_iterator;
3326 break;
3327
3328 case token_type_named_op:
3329 #if DEBUG_PERL
3330 fprintf (stderr, "%s:%d: type named operator (%d): %s\n",
3331 logical_file_name, tp->line_number, nesting_level,
3332 tp->string);
3333 #endif
3334 next_is_argument = false;
3335 if (next_argparser != NULL)
3336 free (next_argparser);
3337 next_argparser = NULL;
3338 next_context_iter = null_context_list_iterator;
3339 break;
3340
3341 case token_type_regex_op:
3342 #if DEBUG_PERL
3343 fprintf (stderr, "%s:%d: type regex operator (%d)\n",
3344 logical_file_name, tp->line_number, nesting_level);
3345 #endif
3346 next_is_argument = false;
3347 if (next_argparser != NULL)
3348 free (next_argparser);
3349 next_argparser = NULL;
3350 next_context_iter = null_context_list_iterator;
3351 break;
3352
3353 case token_type_other:
3354 #if DEBUG_PERL
3355 fprintf (stderr, "%s:%d: type other (%d)\n",
3356 logical_file_name, tp->line_number, nesting_level);
3357 #endif
3358 next_is_argument = false;
3359 if (next_argparser != NULL)
3360 free (next_argparser);
3361 next_argparser = NULL;
3362 next_context_iter = null_context_list_iterator;
3363 break;
3364
3365 default:
3366 fprintf (stderr, "%s:%d: unknown token type %d\n",
3367 real_file_name, tp->line_number, tp->type);
3368 abort ();
3369 }
3370
3371 free_token (tp);
3372 }
3373 }
3374
3375 void
extract_perl(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)3376 extract_perl (FILE *f, const char *real_filename, const char *logical_filename,
3377 flag_context_list_table_ty *flag_table,
3378 msgdomain_list_ty *mdlp)
3379 {
3380 message_list_ty *mlp = mdlp->item[0]->messages;
3381
3382 fp = f;
3383 real_file_name = real_filename;
3384 logical_file_name = xstrdup (logical_filename);
3385 line_number = 0;
3386
3387 last_comment_line = -1;
3388 last_non_comment_line = -1;
3389
3390 flag_context_list_table = flag_table;
3391
3392 init_keywords ();
3393
3394 token_stack.items = NULL;
3395 token_stack.nitems = 0;
3396 token_stack.nitems_max = 0;
3397 linesize = 0;
3398 linepos = 0;
3399 here_eaten = 0;
3400 end_of_file = false;
3401
3402 /* Eat tokens until eof is seen. When extract_balanced returns
3403 due to an unbalanced closing brace, just restart it. */
3404 while (!extract_balanced (mlp, token_type_rbrace, true, false,
3405 null_context, null_context_list_iterator,
3406 1, arglist_parser_alloc (mlp, NULL)))
3407 ;
3408
3409 fp = NULL;
3410 real_file_name = NULL;
3411 free (logical_file_name);
3412 logical_file_name = NULL;
3413 line_number = 0;
3414 last_token = token_type_semicolon;
3415 token_stack_free (&token_stack);
3416 here_eaten = 0;
3417 end_of_file = true;
3418 }
3419