1 /* xgettext Python backend.
2 Copyright (C) 2002-2003, 2005-2006 Free Software Foundation, Inc.
3
4 This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
5
6 This program is free software; you can redistribute it and/or modify
7 it under the terms of the GNU General Public License as published by
8 the Free Software Foundation; either version 2, or (at your option)
9 any later version.
10
11 This program is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 GNU General Public License for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with this program; if not, write to the Free Software Foundation,
18 Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
19
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23
24 #include <assert.h>
25 #include <errno.h>
26 #include <stdbool.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30
31 #include "message.h"
32 #include "xgettext.h"
33 #include "x-python.h"
34 #include "error.h"
35 #include "error-progname.h"
36 #include "progname.h"
37 #include "basename.h"
38 #include "xerror.h"
39 #include "xvasprintf.h"
40 #include "xalloc.h"
41 #include "exit.h"
42 #include "c-strstr.h"
43 #include "c-ctype.h"
44 #include "po-charset.h"
45 #include "uniname.h"
46 #include "utf16-ucs4.h"
47 #include "utf8-ucs4.h"
48 #include "ucs4-utf8.h"
49 #include "gettext.h"
50
/* Shorthand for looking up the translation of a message.  */
#define _(s) gettext (s)

/* The larger of A and B.  Each argument may be evaluated twice, so do not
   pass expressions with side effects.  */
#define max(a,b) ((a) > (b) ? (a) : (b))

/* Number of elements of the array A.  A must be a real array object, not a
   pointer.  The argument is parenthesized so that any array-valued
   expression can be passed safely.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
56
57
58 /* The Python syntax is defined in the Python Reference Manual
59 /usr/share/doc/packages/python/html/ref/index.html.
60 See also Python-2.0/Parser/tokenizer.c, Python-2.0/Python/compile.c,
61 Python-2.0/Objects/unicodeobject.c. */
62
63
64 /* ====================== Keyword set customization. ====================== */
65
66 /* If true extract all strings. */
67 static bool extract_all = false;
68
69 static hash_table keywords;
70 static bool default_keywords = true;
71
72
73 void
x_python_extract_all()74 x_python_extract_all ()
75 {
76 extract_all = true;
77 }
78
79
80 void
x_python_keyword(const char * name)81 x_python_keyword (const char *name)
82 {
83 if (name == NULL)
84 default_keywords = false;
85 else
86 {
87 const char *end;
88 struct callshape shape;
89 const char *colon;
90
91 if (keywords.table == NULL)
92 hash_init (&keywords, 100);
93
94 split_keywordspec (name, &end, &shape);
95
96 /* The characters between name and end should form a valid C identifier.
97 A colon means an invalid parse in split_keywordspec(). */
98 colon = strchr (name, ':');
99 if (colon == NULL || colon >= end)
100 insert_keyword_callshape (&keywords, name, end - name, &shape);
101 }
102 }
103
104 /* Finish initializing the keywords hash table.
105 Called after argument processing, before each file is processed. */
106 static void
init_keywords()107 init_keywords ()
108 {
109 if (default_keywords)
110 {
111 /* When adding new keywords here, also update the documentation in
112 xgettext.texi! */
113 x_python_keyword ("gettext");
114 x_python_keyword ("ugettext");
115 x_python_keyword ("dgettext:2");
116 x_python_keyword ("ngettext:1,2");
117 x_python_keyword ("ungettext:1,2");
118 x_python_keyword ("dngettext:2,3");
119 x_python_keyword ("_");
120 default_keywords = false;
121 }
122 }
123
/* Record the format string flags of the arguments of the default
   keywords.  */
void
init_flag_table_python ()
{
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-python-format",
      "ugettext:1:pass-python-format",
      "dgettext:2:pass-python-format",
      "ngettext:1:pass-python-format",
      "ngettext:2:pass-python-format",
      "ungettext:1:pass-python-format",
      "ungettext:2:pass-python-format",
      "dngettext:2:pass-python-format",
      "dngettext:3:pass-python-format",
      "_:1:pass-python-format"
      /* Not recorded: "%:1:python-format" -- % is an infix operator!  */
    };
  size_t i;

  for (i = 0; i < sizeof flag_specs / sizeof flag_specs[0]; i++)
    xgettext_record_flag (flag_specs[i]);
}
139
140
141 /* ======================== Reading of characters. ======================== */
142
/* Name of the file actually being read; used in error messages about the
   input file.  */
static const char *real_file_name;

/* File name and line number used to label the extracted messages.  */
static char *logical_file_name;
static int line_number;

/* The stream the input is read from.  */
static FILE *fp;
152
153
154 /* 1. line_number handling. */
155
156 /* Maximum used, roughly a safer MB_LEN_MAX. */
157 #define MAX_PHASE1_PUSHBACK 16
158 static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
159 static int phase1_pushback_length;
160
161 /* Read the next single byte from the input file. */
162 static int
phase1_getc()163 phase1_getc ()
164 {
165 int c;
166
167 if (phase1_pushback_length)
168 c = phase1_pushback[--phase1_pushback_length];
169 else
170 {
171 c = getc (fp);
172
173 if (c == EOF)
174 {
175 if (ferror (fp))
176 error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
177 real_file_name);
178 return EOF;
179 }
180 }
181
182 if (c == '\n')
183 ++line_number;
184
185 return c;
186 }
187
188 /* Supports MAX_PHASE1_PUSHBACK characters of pushback. */
189 static void
phase1_ungetc(int c)190 phase1_ungetc (int c)
191 {
192 if (c != EOF)
193 {
194 if (c == '\n')
195 --line_number;
196
197 if (phase1_pushback_length == SIZEOF (phase1_pushback))
198 abort ();
199 phase1_pushback[phase1_pushback_length++] = c;
200 }
201 }
202
203
204 /* Phase 2: Conversion to Unicode.
205 This is done early because PEP 0263 specifies that conversion to Unicode
206 conceptually occurs before tokenization. A test case where it matters
207 is with encodings like BIG5: when a double-byte character ending in 0x5C
208 is followed by '\' or 'u0021', the tokenizer must not treat the second
209 half of the double-byte character as a backslash. */
210
211 /* End-of-file indicator for functions returning an UCS-4 character. */
212 #define UEOF -1
213
214 static int phase2_pushback[max (9, UNINAME_MAX + 3)];
215 static int phase2_pushback_length;
216
217 /* Read the next Unicode UCS-4 character from the input file. */
218 static int
phase2_getc()219 phase2_getc ()
220 {
221 if (phase2_pushback_length)
222 return phase2_pushback[--phase2_pushback_length];
223
224 if (xgettext_current_source_encoding == po_charset_ascii)
225 {
226 int c = phase1_getc ();
227 if (c == EOF)
228 return UEOF;
229 if (!c_isascii (c))
230 {
231 char buffer[21];
232 sprintf (buffer, ":%ld", (long) line_number);
233 multiline_error (xstrdup (""),
234 xasprintf (_("\
235 Non-ASCII string at %s%s.\n\
236 Please specify the source encoding through --from-code or through a comment\n\
237 as specified in http://www.python.org/peps/pep-0263.html.\n"),
238 real_file_name, buffer));
239 exit (EXIT_FAILURE);
240 }
241 return c;
242 }
243 else if (xgettext_current_source_encoding != po_charset_utf8)
244 {
245 #if HAVE_ICONV
246 /* Use iconv on an increasing number of bytes. Read only as many bytes
247 through phase1_getc as needed. This is needed to give reasonable
248 interactive behaviour when fp is connected to an interactive tty. */
249 unsigned char buf[MAX_PHASE1_PUSHBACK];
250 size_t bufcount;
251 int c = phase1_getc ();
252 if (c == EOF)
253 return UEOF;
254 buf[0] = (unsigned char) c;
255 bufcount = 1;
256
257 for (;;)
258 {
259 unsigned char scratchbuf[6];
260 const char *inptr = (const char *) &buf[0];
261 size_t insize = bufcount;
262 char *outptr = (char *) &scratchbuf[0];
263 size_t outsize = sizeof (scratchbuf);
264
265 size_t res = iconv (xgettext_current_source_iconv,
266 (ICONV_CONST char **) &inptr, &insize,
267 &outptr, &outsize);
268 /* We expect that a character has been produced if and only if
269 some input bytes have been consumed. */
270 if ((insize < bufcount) != (outsize < sizeof (scratchbuf)))
271 abort ();
272 if (outsize == sizeof (scratchbuf))
273 {
274 /* No character has been produced. Must be an error. */
275 if (res != (size_t)(-1))
276 abort ();
277
278 if (errno == EILSEQ)
279 {
280 /* An invalid multibyte sequence was encountered. */
281 multiline_error (xstrdup (""),
282 xasprintf (_("\
283 %s:%d: Invalid multibyte sequence.\n\
284 Please specify the correct source encoding through --from-code or through a\n\
285 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
286 real_file_name, line_number));
287 exit (EXIT_FAILURE);
288 }
289 else if (errno == EINVAL)
290 {
291 /* An incomplete multibyte character. */
292 int c;
293
294 if (bufcount == MAX_PHASE1_PUSHBACK)
295 {
296 /* An overlong incomplete multibyte sequence was
297 encountered. */
298 multiline_error (xstrdup (""),
299 xasprintf (_("\
300 %s:%d: Long incomplete multibyte sequence.\n\
301 Please specify the correct source encoding through --from-code or through a\n\
302 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
303 real_file_name, line_number));
304 exit (EXIT_FAILURE);
305 }
306
307 /* Read one more byte and retry iconv. */
308 c = phase1_getc ();
309 if (c == EOF)
310 {
311 multiline_error (xstrdup (""),
312 xasprintf (_("\
313 %s:%d: Incomplete multibyte sequence at end of file.\n\
314 Please specify the correct source encoding through --from-code or through a\n\
315 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
316 real_file_name, line_number));
317 exit (EXIT_FAILURE);
318 }
319 if (c == '\n')
320 {
321 multiline_error (xstrdup (""),
322 xasprintf (_("\
323 %s:%d: Incomplete multibyte sequence at end of line.\n\
324 Please specify the correct source encoding through --from-code or through a\n\
325 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
326 real_file_name, line_number - 1));
327 exit (EXIT_FAILURE);
328 }
329 buf[bufcount++] = (unsigned char) c;
330 }
331 else
332 error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"),
333 real_file_name, line_number);
334 }
335 else
336 {
337 size_t outbytes = sizeof (scratchbuf) - outsize;
338 size_t bytes = bufcount - insize;
339 unsigned int uc;
340
341 /* We expect that one character has been produced. */
342 if (bytes == 0)
343 abort ();
344 if (outbytes == 0)
345 abort ();
346 /* Push back the unused bytes. */
347 while (insize > 0)
348 phase1_ungetc (buf[--insize]);
349 /* Convert the character from UTF-8 to UCS-4. */
350 if (u8_mbtouc (&uc, scratchbuf, outbytes) < outbytes)
351 {
352 /* scratchbuf contains an out-of-range Unicode character
353 (> 0x10ffff). */
354 multiline_error (xstrdup (""),
355 xasprintf (_("\
356 %s:%d: Invalid multibyte sequence.\n\
357 Please specify the source encoding through --from-code or through a comment\n\
358 as specified in http://www.python.org/peps/pep-0263.html.\n"),
359 real_file_name, line_number));
360 exit (EXIT_FAILURE);
361 }
362 return uc;
363 }
364 }
365 #else
366 /* If we don't have iconv(), the only supported values for
367 xgettext_global_source_encoding and thus also for
368 xgettext_current_source_encoding are ASCII and UTF-8. */
369 abort ();
370 #endif
371 }
372 else
373 {
374 /* Read an UTF-8 encoded character. */
375 unsigned char buf[6];
376 unsigned int count;
377 int c;
378 unsigned int uc;
379
380 c = phase1_getc ();
381 if (c == EOF)
382 return UEOF;
383 buf[0] = c;
384 count = 1;
385
386 if (buf[0] >= 0xc0)
387 {
388 c = phase1_getc ();
389 if (c == EOF)
390 return UEOF;
391 buf[1] = c;
392 count = 2;
393 }
394
395 if (buf[0] >= 0xe0
396 && ((buf[1] ^ 0x80) < 0x40))
397 {
398 c = phase1_getc ();
399 if (c == EOF)
400 return UEOF;
401 buf[2] = c;
402 count = 3;
403 }
404
405 if (buf[0] >= 0xf0
406 && ((buf[1] ^ 0x80) < 0x40)
407 && ((buf[2] ^ 0x80) < 0x40))
408 {
409 c = phase1_getc ();
410 if (c == EOF)
411 return UEOF;
412 buf[3] = c;
413 count = 4;
414 }
415
416 if (buf[0] >= 0xf8
417 && ((buf[1] ^ 0x80) < 0x40)
418 && ((buf[2] ^ 0x80) < 0x40)
419 && ((buf[3] ^ 0x80) < 0x40))
420 {
421 c = phase1_getc ();
422 if (c == EOF)
423 return UEOF;
424 buf[4] = c;
425 count = 5;
426 }
427
428 if (buf[0] >= 0xfc
429 && ((buf[1] ^ 0x80) < 0x40)
430 && ((buf[2] ^ 0x80) < 0x40)
431 && ((buf[3] ^ 0x80) < 0x40)
432 && ((buf[4] ^ 0x80) < 0x40))
433 {
434 c = phase1_getc ();
435 if (c == EOF)
436 return UEOF;
437 buf[5] = c;
438 count = 6;
439 }
440
441 u8_mbtouc (&uc, buf, count);
442 return uc;
443 }
444 }
445
446 /* Supports max (9, UNINAME_MAX + 3) pushback characters. */
447 static void
phase2_ungetc(int c)448 phase2_ungetc (int c)
449 {
450 if (c != UEOF)
451 {
452 if (phase2_pushback_length == SIZEOF (phase2_pushback))
453 abort ();
454 phase2_pushback[phase2_pushback_length++] = c;
455 }
456 }
457
458
459 /* ========================= Accumulating strings. ======================== */
460
/* A growable string buffer to which Unicode characters are appended.
   The accumulated string is delivered in UTF-8 encoding.  */

struct unicode_string_buffer
{
  /* The string converted to UTF-8 so far: storage, number of bytes in use,
     and number of bytes allocated.  */
  char *utf8_buffer;
  size_t utf8_buflen;
  size_t utf8_allocated;
};
471
472 /* Initialize a 'struct unicode_string_buffer' to empty. */
473 static inline void
init_unicode_string_buffer(struct unicode_string_buffer * bp)474 init_unicode_string_buffer (struct unicode_string_buffer *bp)
475 {
476 bp->utf8_buffer = NULL;
477 bp->utf8_buflen = 0;
478 bp->utf8_allocated = 0;
479 }
480
481 /* Auxiliary function: Ensure count more bytes are available in bp->utf8. */
482 static inline void
unicode_string_buffer_append_unicode_grow(struct unicode_string_buffer * bp,size_t count)483 unicode_string_buffer_append_unicode_grow (struct unicode_string_buffer *bp,
484 size_t count)
485 {
486 if (bp->utf8_buflen + count > bp->utf8_allocated)
487 {
488 size_t new_allocated = 2 * bp->utf8_allocated + 10;
489 if (new_allocated < bp->utf8_buflen + count)
490 new_allocated = bp->utf8_buflen + count;
491 bp->utf8_allocated = new_allocated;
492 bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
493 }
494 }
495
496 /* Auxiliary function: Append a Unicode character to bp->utf8.
497 uc must be < 0x110000. */
498 static inline void
unicode_string_buffer_append_unicode(struct unicode_string_buffer * bp,unsigned int uc)499 unicode_string_buffer_append_unicode (struct unicode_string_buffer *bp,
500 unsigned int uc)
501 {
502 unsigned char utf8buf[6];
503 int count = u8_uctomb (utf8buf, uc, 6);
504
505 if (count < 0)
506 /* The caller should have ensured that uc is not out-of-range. */
507 abort ();
508
509 unicode_string_buffer_append_unicode_grow (bp, count);
510 memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
511 bp->utf8_buflen += count;
512 }
513
514 /* Return the string buffer's contents. */
515 static char *
unicode_string_buffer_result(struct unicode_string_buffer * bp)516 unicode_string_buffer_result (struct unicode_string_buffer *bp)
517 {
518 /* NUL-terminate it. */
519 unicode_string_buffer_append_unicode_grow (bp, 1);
520 bp->utf8_buffer[bp->utf8_buflen] = '\0';
521 /* Return it. */
522 return bp->utf8_buffer;
523 }
524
525 /* Free the memory pointed to by a 'struct unicode_string_buffer'. */
526 static inline void
free_unicode_string_buffer(struct unicode_string_buffer * bp)527 free_unicode_string_buffer (struct unicode_string_buffer *bp)
528 {
529 free (bp->utf8_buffer);
530 }
531
532
533 /* ======================== Accumulating comments. ======================== */
534
535
536 /* Accumulating a single comment line. */
537
538 static struct unicode_string_buffer comment_buffer;
539
540 static inline void
comment_start()541 comment_start ()
542 {
543 comment_buffer.utf8_buflen = 0;
544 }
545
546 static inline bool
comment_at_start()547 comment_at_start ()
548 {
549 return (comment_buffer.utf8_buflen == 0);
550 }
551
552 static inline void
comment_add(int c)553 comment_add (int c)
554 {
555 unicode_string_buffer_append_unicode (&comment_buffer, c);
556 }
557
558 static inline const char *
comment_line_end()559 comment_line_end ()
560 {
561 char *buffer = unicode_string_buffer_result (&comment_buffer);
562 size_t buflen = strlen (buffer);
563
564 while (buflen >= 1
565 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
566 --buflen;
567 buffer[buflen] = '\0';
568 savable_comment_add (buffer);
569 return buffer;
570 }
571
572
/* Line numbers kept in order to decide whether comments count as
   immediately preceding a keyword.  */
static int last_comment_line;
static int last_non_comment_line;
577
578
579 /* ======================== Recognizing comments. ======================== */
580
581
/* Recognizing the "coding" comment.
   As specified in PEP 0263, it takes the form
     "coding" [":"|"="] {space or tab}* {alphanumeric or "-" or "_" or "."}*
   and is located in a comment in a line that
     - is either the first or second line,
     - is not a continuation line,
     - contains no other tokens except this comment.  */
589
/* The canonicalized name of the encoding of the current input file.  */
static const char *xgettext_current_file_source_encoding;

#if HAVE_ICONV
/* Conversion descriptor from xgettext_current_file_source_encoding to UTF-8.
   It is not used for ASCII and UTF-8, where the conversion is a no-op.  */
static iconv_t xgettext_current_file_source_iconv;
#endif
598
599 static inline void
set_current_file_source_encoding(const char * canon_encoding)600 set_current_file_source_encoding (const char *canon_encoding)
601 {
602 xgettext_current_file_source_encoding = canon_encoding;
603
604 if (xgettext_current_file_source_encoding != po_charset_ascii
605 && xgettext_current_file_source_encoding != po_charset_utf8)
606 {
607 #if HAVE_ICONV
608 iconv_t cd;
609
610 /* Avoid glibc-2.1 bug with EUC-KR. */
611 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
612 if (strcmp (xgettext_current_file_source_encoding, "EUC-KR") == 0)
613 cd = (iconv_t)(-1);
614 else
615 # endif
616 cd = iconv_open (po_charset_utf8, xgettext_current_file_source_encoding);
617 if (cd == (iconv_t)(-1))
618 error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\
619 Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), \
620 and iconv() does not support this conversion."),
621 xgettext_current_file_source_encoding, po_charset_utf8,
622 basename (program_name));
623 xgettext_current_file_source_iconv = cd;
624 #else
625 error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\
626 Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). \
627 This version was built without iconv()."),
628 xgettext_global_source_encoding, po_charset_utf8,
629 basename (program_name));
630 #endif
631 }
632
633 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
634 #if HAVE_ICONV
635 xgettext_current_source_iconv = xgettext_current_file_source_iconv;
636 #endif
637 }
638
639 static inline void
try_to_extract_coding(const char * comment)640 try_to_extract_coding (const char *comment)
641 {
642 const char *p = c_strstr (comment, "coding");
643
644 if (p != NULL)
645 {
646 p += 6;
647 if (*p == ':' || *p == '=')
648 {
649 p++;
650 while (*p == ' ' || *p == '\t')
651 p++;
652 {
653 const char *encoding_start = p;
654
655 while (c_isalnum (*p) || *p == '-' || *p == '_' || *p == '.')
656 p++;
657 {
658 const char *encoding_end = p;
659
660 if (encoding_end > encoding_start)
661 {
662 /* Extract the encoding string. */
663 size_t encoding_len = encoding_end - encoding_start;
664 char *encoding = (char *) xmalloc (encoding_len + 1);
665
666 memcpy (encoding, encoding_start, encoding_len);
667 encoding[encoding_len] = '\0';
668
669 {
670 /* Canonicalize it. */
671 const char *canon_encoding = po_charset_canonicalize (encoding);
672 if (canon_encoding == NULL)
673 {
674 error_at_line (0, 0,
675 logical_file_name, line_number - 1, _("\
676 Unknown encoding \"%s\". Proceeding with ASCII instead."),
677 encoding);
678 canon_encoding = po_charset_ascii;
679 }
680
681 /* Activate it. */
682 set_current_file_source_encoding (canon_encoding);
683 }
684
685 free (encoding);
686 }
687 }
688 }
689 }
690 }
691 }
692
693 /* Tracking whether the current line is a continuation line or contains a
694 non-blank character. */
695 static bool continuation_or_nonblank_line = false;
696
697
698 /* Phase 3: Outside strings, replace backslash-newline with nothing and a
699 comment with nothing. */
700
701 static int
phase3_getc()702 phase3_getc ()
703 {
704 int c;
705
706 for (;;)
707 {
708 c = phase2_getc ();
709 if (c == '\\')
710 {
711 c = phase2_getc ();
712 if (c != '\n')
713 {
714 phase2_ungetc (c);
715 /* This shouldn't happen usually, because "A backslash is
716 illegal elsewhere on a line outside a string literal." */
717 return '\\';
718 }
719 /* Eat backslash-newline. */
720 continuation_or_nonblank_line = true;
721 }
722 else if (c == '#')
723 {
724 /* Eat a comment. */
725 const char *comment;
726
727 last_comment_line = line_number;
728 comment_start ();
729 for (;;)
730 {
731 c = phase2_getc ();
732 if (c == UEOF || c == '\n')
733 break;
734 /* We skip all leading white space, but not EOLs. */
735 if (!(comment_at_start () && (c == ' ' || c == '\t')))
736 comment_add (c);
737 }
738 comment = comment_line_end ();
739 if (line_number - 1 <= 2 && !continuation_or_nonblank_line)
740 try_to_extract_coding (comment);
741 continuation_or_nonblank_line = false;
742 return c;
743 }
744 else
745 {
746 if (c == '\n')
747 continuation_or_nonblank_line = false;
748 else if (!(c == ' ' || c == '\t' || c == '\f'))
749 continuation_or_nonblank_line = true;
750 return c;
751 }
752 }
753 }
754
/* Push the character C back for phase 3, by handing it to phase2_ungetc.
   Supports only one pushback character.  */
static void
phase3_ungetc (int c)
{
  phase2_ungetc (c);
}
761
762
763 /* ========================= Accumulating strings. ======================== */
764
/* Special return values of phase7_getuc: end of file, and end of the
   string.  */
#define P7_EOF (-1)
#define P7_STRING_END (-2)

/* Convert an UTF-16 or UTF-32 code point to a return value that can be
   distinguished from a single-byte return value: code points are shifted
   up by 0x100, above the range of a single byte.  */
#define UNICODE(code) (0x100 + (code))

/* Test whether a return value of phase7_getuc designates an UTF-16 or
   UTF-32 code point.  */
#define IS_UNICODE(p7_result) ((p7_result) >= 0x100)

/* Extract the UTF-16 or UTF-32 code of a return value that satisfies
   IS_UNICODE.  */
#define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
780
/* A growable string buffer to which both bytes (in the
   xgettext_current_source_encoding) and Unicode characters are appended.
   The accumulated string is delivered in UTF-8 encoding.  */

struct mixed_string_buffer
{
  /* The part of the string that has already been converted to UTF-8:
     storage, number of bytes in use, and number of bytes allocated.  */
  char *utf8_buffer;
  size_t utf8_buflen;
  size_t utf8_allocated;
  /* The first half of an UTF-16 surrogate pair that is still waiting for
     its second half, or 0 if there is none.  */
  unsigned short utf16_surr;
  /* The part of the string that is still in the source encoding:
     storage, number of bytes in use, and number of bytes allocated.  */
  char *curr_buffer;
  size_t curr_buflen;
  size_t curr_allocated;
};
798
799 /* Initialize a 'struct mixed_string_buffer' to empty. */
800 static inline void
init_mixed_string_buffer(struct mixed_string_buffer * bp)801 init_mixed_string_buffer (struct mixed_string_buffer *bp)
802 {
803 bp->utf8_buffer = NULL;
804 bp->utf8_buflen = 0;
805 bp->utf8_allocated = 0;
806 bp->utf16_surr = 0;
807 bp->curr_buffer = NULL;
808 bp->curr_buflen = 0;
809 bp->curr_allocated = 0;
810 }
811
812 /* Auxiliary function: Append a byte to bp->curr. */
813 static inline void
mixed_string_buffer_append_byte(struct mixed_string_buffer * bp,unsigned char c)814 mixed_string_buffer_append_byte (struct mixed_string_buffer *bp, unsigned char c)
815 {
816 if (bp->curr_buflen == bp->curr_allocated)
817 {
818 bp->curr_allocated = 2 * bp->curr_allocated + 10;
819 bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated);
820 }
821 bp->curr_buffer[bp->curr_buflen++] = c;
822 }
823
824 /* Auxiliary function: Ensure count more bytes are available in bp->utf8. */
825 static inline void
mixed_string_buffer_append_unicode_grow(struct mixed_string_buffer * bp,size_t count)826 mixed_string_buffer_append_unicode_grow (struct mixed_string_buffer *bp, size_t count)
827 {
828 if (bp->utf8_buflen + count > bp->utf8_allocated)
829 {
830 size_t new_allocated = 2 * bp->utf8_allocated + 10;
831 if (new_allocated < bp->utf8_buflen + count)
832 new_allocated = bp->utf8_buflen + count;
833 bp->utf8_allocated = new_allocated;
834 bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
835 }
836 }
837
838 /* Auxiliary function: Append a Unicode character to bp->utf8.
839 uc must be < 0x110000. */
840 static inline void
mixed_string_buffer_append_unicode(struct mixed_string_buffer * bp,unsigned int uc)841 mixed_string_buffer_append_unicode (struct mixed_string_buffer *bp, unsigned int uc)
842 {
843 unsigned char utf8buf[6];
844 int count = u8_uctomb (utf8buf, uc, 6);
845
846 if (count < 0)
847 /* The caller should have ensured that uc is not out-of-range. */
848 abort ();
849
850 mixed_string_buffer_append_unicode_grow (bp, count);
851 memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
852 bp->utf8_buflen += count;
853 }
854
855 /* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer. */
856 static inline void
mixed_string_buffer_flush_utf16_surr(struct mixed_string_buffer * bp)857 mixed_string_buffer_flush_utf16_surr (struct mixed_string_buffer *bp)
858 {
859 if (bp->utf16_surr != 0)
860 {
861 /* A half surrogate is invalid, therefore use U+FFFD instead. */
862 mixed_string_buffer_append_unicode (bp, 0xfffd);
863 bp->utf16_surr = 0;
864 }
865 }
866
867 /* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer. */
868 static inline void
mixed_string_buffer_flush_curr_buffer(struct mixed_string_buffer * bp,int lineno)869 mixed_string_buffer_flush_curr_buffer (struct mixed_string_buffer *bp, int lineno)
870 {
871 if (bp->curr_buflen > 0)
872 {
873 char *curr;
874 size_t count;
875
876 mixed_string_buffer_append_byte (bp, '\0');
877
878 /* Convert from the source encoding to UTF-8. */
879 curr = from_current_source_encoding (bp->curr_buffer,
880 logical_file_name, lineno);
881
882 /* Append it to bp->utf8_buffer. */
883 count = strlen (curr);
884 mixed_string_buffer_append_unicode_grow (bp, count);
885 memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count);
886 bp->utf8_buflen += count;
887
888 if (curr != bp->curr_buffer)
889 free (curr);
890 bp->curr_buflen = 0;
891 }
892 }
893
894 /* Append a character or Unicode character to a 'struct mixed_string_buffer'. */
895 static void
mixed_string_buffer_append(struct mixed_string_buffer * bp,int c)896 mixed_string_buffer_append (struct mixed_string_buffer *bp, int c)
897 {
898 if (IS_UNICODE (c))
899 {
900 /* Append a Unicode character. */
901
902 /* Switch from multibyte character mode to Unicode character mode. */
903 mixed_string_buffer_flush_curr_buffer (bp, line_number);
904
905 /* Test whether this character and the previous one form a Unicode
906 surrogate character pair. */
907 if (bp->utf16_surr != 0
908 && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
909 {
910 unsigned short utf16buf[2];
911 unsigned int uc;
912
913 utf16buf[0] = bp->utf16_surr;
914 utf16buf[1] = UNICODE_VALUE (c);
915 if (u16_mbtouc_aux (&uc, utf16buf, 2) != 2)
916 abort ();
917
918 mixed_string_buffer_append_unicode (bp, uc);
919 bp->utf16_surr = 0;
920 }
921 else
922 {
923 mixed_string_buffer_flush_utf16_surr (bp);
924
925 if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
926 bp->utf16_surr = UNICODE_VALUE (c);
927 else
928 mixed_string_buffer_append_unicode (bp, UNICODE_VALUE (c));
929 }
930 }
931 else
932 {
933 /* Append a single byte. */
934
935 /* Switch from Unicode character mode to multibyte character mode. */
936 mixed_string_buffer_flush_utf16_surr (bp);
937
938 /* When a newline is seen, convert the accumulated multibyte sequence.
939 This ensures a correct line number in the error message in case of
940 a conversion error. The "- 1" is to account for the newline. */
941 if (c == '\n')
942 mixed_string_buffer_flush_curr_buffer (bp, line_number - 1);
943
944 mixed_string_buffer_append_byte (bp, (unsigned char) c);
945 }
946 }
947
948 /* Return the string buffer's contents. */
949 static char *
mixed_string_buffer_result(struct mixed_string_buffer * bp)950 mixed_string_buffer_result (struct mixed_string_buffer *bp)
951 {
952 /* Flush all into bp->utf8_buffer. */
953 mixed_string_buffer_flush_utf16_surr (bp);
954 mixed_string_buffer_flush_curr_buffer (bp, line_number);
955 /* NUL-terminate it. */
956 mixed_string_buffer_append_unicode_grow (bp, 1);
957 bp->utf8_buffer[bp->utf8_buflen] = '\0';
958 /* Return it. */
959 return bp->utf8_buffer;
960 }
961
962 /* Free the memory pointed to by a 'struct mixed_string_buffer'. */
963 static inline void
free_mixed_string_buffer(struct mixed_string_buffer * bp)964 free_mixed_string_buffer (struct mixed_string_buffer *bp)
965 {
966 free (bp->utf8_buffer);
967 free (bp->curr_buffer);
968 }
969
970
971 /* ========================== Reading of tokens. ========================== */
972
973
974 enum token_type_ty
975 {
976 token_type_eof,
977 token_type_lparen, /* ( */
978 token_type_rparen, /* ) */
979 token_type_comma, /* , */
980 token_type_string, /* "abc", 'abc', """abc""", '''abc''' */
981 token_type_symbol, /* symbol, number */
982 token_type_other /* misc. operator */
983 };
984 typedef enum token_type_ty token_type_ty;
985
986 typedef struct token_ty token_ty;
987 struct token_ty
988 {
989 token_type_ty type;
990 char *string; /* for token_type_string, token_type_symbol */
991 refcounted_string_list_ty *comment; /* for token_type_string */
992 int line_number;
993 };
994
995
996 /* There are two different input syntaxes for strings, "abc" and r"abc",
997 and two different input syntaxes for Unicode strings, u"abc" and ur"abc".
998 Which escape sequences are understood, i.e. what is interpreted specially
999 after backslash?
1000 "abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn
1001 r"abc"
1002 u"abc" \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...}
1003 ur"abc" \unnnn
1004 The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two
1005 \unnnn items. The \ooo and \xnn values are in the current source encoding.
1006 */
1007
1008 static int
phase7_getuc(int quote_char,bool triple,bool interpret_ansic,bool interpret_unicode,unsigned int * backslash_counter)1009 phase7_getuc (int quote_char,
1010 bool triple, bool interpret_ansic, bool interpret_unicode,
1011 unsigned int *backslash_counter)
1012 {
1013 int c;
1014
1015 for (;;)
1016 {
1017 /* Use phase 2, because phase 3 elides comments. */
1018 c = phase2_getc ();
1019
1020 if (c == UEOF)
1021 return P7_EOF;
1022
1023 if (c == quote_char && (interpret_ansic || (*backslash_counter & 1) == 0))
1024 {
1025 if (triple)
1026 {
1027 int c1 = phase2_getc ();
1028 if (c1 == quote_char)
1029 {
1030 int c2 = phase2_getc ();
1031 if (c2 == quote_char)
1032 return P7_STRING_END;
1033 phase2_ungetc (c2);
1034 }
1035 phase2_ungetc (c1);
1036 return UNICODE (c);
1037 }
1038 else
1039 return P7_STRING_END;
1040 }
1041
1042 if (c == '\n')
1043 {
1044 if (triple)
1045 {
1046 *backslash_counter = 0;
1047 return UNICODE ('\n');
1048 }
1049 /* In r"..." and ur"..." strings, newline is only allowed
1050 immediately after an odd number of backslashes (although the
1051 backslashes are not interpreted!). */
1052 if (!(interpret_ansic || (*backslash_counter & 1) == 0))
1053 {
1054 *backslash_counter = 0;
1055 return UNICODE ('\n');
1056 }
1057 phase2_ungetc (c);
1058 error_with_progname = false;
1059 error (0, 0, _("%s:%d: warning: unterminated string"),
1060 logical_file_name, line_number);
1061 error_with_progname = true;
1062 return P7_STRING_END;
1063 }
1064
1065 if (c != '\\')
1066 {
1067 *backslash_counter = 0;
1068 return UNICODE (c);
1069 }
1070
1071 /* Backslash handling. */
1072
1073 if (!interpret_ansic && !interpret_unicode)
1074 {
1075 ++*backslash_counter;
1076 return UNICODE ('\\');
1077 }
1078
1079 /* Dispatch according to the character following the backslash. */
1080 c = phase2_getc ();
1081 if (c == UEOF)
1082 {
1083 ++*backslash_counter;
1084 return UNICODE ('\\');
1085 }
1086
1087 if (interpret_ansic)
1088 switch (c)
1089 {
1090 case '\n':
1091 continue;
1092 case '\\':
1093 ++*backslash_counter;
1094 return UNICODE (c);
1095 case '\'': case '"':
1096 *backslash_counter = 0;
1097 return UNICODE (c);
1098 case 'a':
1099 *backslash_counter = 0;
1100 return UNICODE ('\a');
1101 case 'b':
1102 *backslash_counter = 0;
1103 return UNICODE ('\b');
1104 case 'f':
1105 *backslash_counter = 0;
1106 return UNICODE ('\f');
1107 case 'n':
1108 *backslash_counter = 0;
1109 return UNICODE ('\n');
1110 case 'r':
1111 *backslash_counter = 0;
1112 return UNICODE ('\r');
1113 case 't':
1114 *backslash_counter = 0;
1115 return UNICODE ('\t');
1116 case 'v':
1117 *backslash_counter = 0;
1118 return UNICODE ('\v');
1119 case '0': case '1': case '2': case '3': case '4':
1120 case '5': case '6': case '7':
1121 {
1122 int n = c - '0';
1123
1124 c = phase2_getc ();
1125 if (c != UEOF)
1126 {
1127 if (c >= '0' && c <= '7')
1128 {
1129 n = (n << 3) + (c - '0');
1130 c = phase2_getc ();
1131 if (c != UEOF)
1132 {
1133 if (c >= '0' && c <= '7')
1134 n = (n << 3) + (c - '0');
1135 else
1136 phase2_ungetc (c);
1137 }
1138 }
1139 else
1140 phase2_ungetc (c);
1141 }
1142 *backslash_counter = 0;
1143 return (unsigned char) n;
1144 }
1145 case 'x':
1146 {
1147 int c1 = phase2_getc ();
1148 int n1;
1149
1150 if (c1 >= '0' && c1 <= '9')
1151 n1 = c1 - '0';
1152 else if (c1 >= 'A' && c1 <= 'F')
1153 n1 = c1 - 'A' + 10;
1154 else if (c1 >= 'a' && c1 <= 'f')
1155 n1 = c1 - 'a' + 10;
1156 else
1157 n1 = -1;
1158
1159 if (n1 >= 0)
1160 {
1161 int c2 = phase2_getc ();
1162 int n2;
1163
1164 if (c2 >= '0' && c2 <= '9')
1165 n2 = c2 - '0';
1166 else if (c2 >= 'A' && c2 <= 'F')
1167 n2 = c2 - 'A' + 10;
1168 else if (c2 >= 'a' && c2 <= 'f')
1169 n2 = c2 - 'a' + 10;
1170 else
1171 n2 = -1;
1172
1173 if (n2 >= 0)
1174 {
1175 *backslash_counter = 0;
1176 return (unsigned char) ((n1 << 4) + n2);
1177 }
1178
1179 phase2_ungetc (c2);
1180 }
1181 phase2_ungetc (c1);
1182 phase2_ungetc (c);
1183 ++*backslash_counter;
1184 return UNICODE ('\\');
1185 }
1186 }
1187
1188 if (interpret_unicode)
1189 {
1190 if (c == 'u')
1191 {
1192 unsigned char buf[4];
1193 unsigned int n = 0;
1194 int i;
1195
1196 for (i = 0; i < 4; i++)
1197 {
1198 int c1 = phase2_getc ();
1199
1200 if (c1 >= '0' && c1 <= '9')
1201 n = (n << 4) + (c1 - '0');
1202 else if (c1 >= 'A' && c1 <= 'F')
1203 n = (n << 4) + (c1 - 'A' + 10);
1204 else if (c1 >= 'a' && c1 <= 'f')
1205 n = (n << 4) + (c1 - 'a' + 10);
1206 else
1207 {
1208 phase2_ungetc (c1);
1209 while (--i >= 0)
1210 phase2_ungetc (buf[i]);
1211 phase2_ungetc (c);
1212 ++*backslash_counter;
1213 return UNICODE ('\\');
1214 }
1215
1216 buf[i] = c1;
1217 }
1218 *backslash_counter = 0;
1219 return UNICODE (n);
1220 }
1221
1222 if (interpret_ansic)
1223 {
1224 if (c == 'U')
1225 {
1226 unsigned char buf[8];
1227 unsigned int n = 0;
1228 int i;
1229
1230 for (i = 0; i < 8; i++)
1231 {
1232 int c1 = phase2_getc ();
1233
1234 if (c1 >= '0' && c1 <= '9')
1235 n = (n << 4) + (c1 - '0');
1236 else if (c1 >= 'A' && c1 <= 'F')
1237 n = (n << 4) + (c1 - 'A' + 10);
1238 else if (c1 >= 'a' && c1 <= 'f')
1239 n = (n << 4) + (c1 - 'a' + 10);
1240 else
1241 {
1242 phase2_ungetc (c1);
1243 while (--i >= 0)
1244 phase2_ungetc (buf[i]);
1245 phase2_ungetc (c);
1246 ++*backslash_counter;
1247 return UNICODE ('\\');
1248 }
1249
1250 buf[i] = c1;
1251 }
1252 if (n < 0x110000)
1253 {
1254 *backslash_counter = 0;
1255 return UNICODE (n);
1256 }
1257
1258 error_with_progname = false;
1259 error (0, 0, _("%s:%d: warning: invalid Unicode character"),
1260 logical_file_name, line_number);
1261 error_with_progname = true;
1262
1263 while (--i >= 0)
1264 phase2_ungetc (buf[i]);
1265 phase2_ungetc (c);
1266 ++*backslash_counter;
1267 return UNICODE ('\\');
1268 }
1269
1270 if (c == 'N')
1271 {
1272 int c1 = phase2_getc ();
1273 if (c1 == '{')
1274 {
1275 unsigned char buf[UNINAME_MAX + 1];
1276 int i;
1277 unsigned int n;
1278
1279 for (i = 0; i < UNINAME_MAX; i++)
1280 {
1281 int c2 = phase2_getc ();
1282 if (!(c2 >= ' ' && c2 <= '~'))
1283 {
1284 phase2_ungetc (c2);
1285 while (--i >= 0)
1286 phase2_ungetc (buf[i]);
1287 phase2_ungetc (c1);
1288 phase2_ungetc (c);
1289 ++*backslash_counter;
1290 return UNICODE ('\\');
1291 }
1292 if (c2 == '}')
1293 break;
1294 buf[i] = c2;
1295 }
1296 buf[i] = '\0';
1297
1298 n = unicode_name_character ((char *) buf);
1299 if (n != UNINAME_INVALID)
1300 {
1301 *backslash_counter = 0;
1302 return UNICODE (n);
1303 }
1304
1305 phase2_ungetc ('}');
1306 while (--i >= 0)
1307 phase2_ungetc (buf[i]);
1308 }
1309 phase2_ungetc (c1);
1310 phase2_ungetc (c);
1311 ++*backslash_counter;
1312 return UNICODE ('\\');
1313 }
1314 }
1315 }
1316
1317 phase2_ungetc (c);
1318 ++*backslash_counter;
1319 return UNICODE ('\\');
1320 }
1321 }
1322
1323
/* Combine characters into tokens.  Discard whitespace except newlines at
   the end of logical lines.  */

/* Number of pending open parentheses/braces/brackets.
   Incremented by phase5_get on '(' '[' '{' and decremented (never below
   zero) on ')' ']' '}'.  While it is positive, phase5_get skips newlines
   instead of returning them as tokens.  extract_python resets it to 0.  */
static int open_pbb;

/* Pushback storage for phase 5: phase5_unget stores a token here and the
   next phase5_get call returns it.  The array holds a single token;
   phase5_pushback_length is the number of tokens currently stored.  */
static token_ty phase5_pushback[1];
static int phase5_pushback_length;
1332
1333 static void
phase5_get(token_ty * tp)1334 phase5_get (token_ty *tp)
1335 {
1336 int c;
1337
1338 if (phase5_pushback_length)
1339 {
1340 *tp = phase5_pushback[--phase5_pushback_length];
1341 return;
1342 }
1343
1344 for (;;)
1345 {
1346 tp->line_number = line_number;
1347 c = phase3_getc ();
1348
1349 switch (c)
1350 {
1351 case UEOF:
1352 tp->type = token_type_eof;
1353 return;
1354
1355 case ' ':
1356 case '\t':
1357 case '\f':
1358 /* Ignore whitespace and comments. */
1359 continue;
1360
1361 case '\n':
1362 if (last_non_comment_line > last_comment_line)
1363 savable_comment_reset ();
1364 /* Ignore newline if and only if it is used for implicit line
1365 joining. */
1366 if (open_pbb > 0)
1367 continue;
1368 tp->type = token_type_other;
1369 return;
1370 }
1371
1372 last_non_comment_line = tp->line_number;
1373
1374 switch (c)
1375 {
1376 case '.':
1377 {
1378 int c1 = phase3_getc ();
1379 phase3_ungetc (c1);
1380 if (!(c1 >= '0' && c1 <= '9'))
1381 {
1382
1383 tp->type = token_type_other;
1384 return;
1385 }
1386 }
1387 /* FALLTHROUGH */
1388 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1389 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1390 case 'M': case 'N': case 'O': case 'P': case 'Q':
1391 case 'S': case 'T': case 'V': case 'W': case 'X':
1392 case 'Y': case 'Z':
1393 case '_':
1394 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1395 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1396 case 'm': case 'n': case 'o': case 'p': case 'q':
1397 case 's': case 't': case 'v': case 'w': case 'x':
1398 case 'y': case 'z':
1399 case '0': case '1': case '2': case '3': case '4':
1400 case '5': case '6': case '7': case '8': case '9':
1401 symbol:
1402 /* Symbol, or part of a number. */
1403 {
1404 static char *buffer;
1405 static int bufmax;
1406 int bufpos;
1407
1408 bufpos = 0;
1409 for (;;)
1410 {
1411 if (bufpos >= bufmax)
1412 {
1413 bufmax = 2 * bufmax + 10;
1414 buffer = xrealloc (buffer, bufmax);
1415 }
1416 buffer[bufpos++] = c;
1417 c = phase3_getc ();
1418 switch (c)
1419 {
1420 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1421 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1422 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1423 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1424 case 'Y': case 'Z':
1425 case '_':
1426 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1427 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1428 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1429 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1430 case 'y': case 'z':
1431 case '0': case '1': case '2': case '3': case '4':
1432 case '5': case '6': case '7': case '8': case '9':
1433 continue;
1434 default:
1435 phase3_ungetc (c);
1436 break;
1437 }
1438 break;
1439 }
1440 if (bufpos >= bufmax)
1441 {
1442 bufmax = 2 * bufmax + 10;
1443 buffer = xrealloc (buffer, bufmax);
1444 }
1445 buffer[bufpos] = '\0';
1446 tp->string = xstrdup (buffer);
1447 tp->type = token_type_symbol;
1448 return;
1449 }
1450
1451 /* Strings. */
1452 {
1453 struct mixed_string_buffer literal;
1454 int quote_char;
1455 bool interpret_ansic;
1456 bool interpret_unicode;
1457 bool triple;
1458 unsigned int backslash_counter;
1459
1460 case 'R': case 'r':
1461 {
1462 int c1 = phase2_getc ();
1463 if (c1 == '"' || c1 == '\'')
1464 {
1465 quote_char = c1;
1466 interpret_ansic = false;
1467 interpret_unicode = false;
1468 goto string;
1469 }
1470 phase2_ungetc (c1);
1471 goto symbol;
1472 }
1473
1474 case 'U': case 'u':
1475 {
1476 int c1 = phase2_getc ();
1477 if (c1 == '"' || c1 == '\'')
1478 {
1479 quote_char = c1;
1480 interpret_ansic = true;
1481 interpret_unicode = true;
1482 goto string;
1483 }
1484 if (c1 == 'R' || c1 == 'r')
1485 {
1486 int c2 = phase2_getc ();
1487 if (c2 == '"' || c2 == '\'')
1488 {
1489 quote_char = c2;
1490 interpret_ansic = false;
1491 interpret_unicode = true;
1492 goto string;
1493 }
1494 phase2_ungetc (c2);
1495 }
1496 phase2_ungetc (c1);
1497 goto symbol;
1498 }
1499
1500 case '"': case '\'':
1501 quote_char = c;
1502 interpret_ansic = true;
1503 interpret_unicode = false;
1504 string:
1505 triple = false;
1506 {
1507 int c1 = phase2_getc ();
1508 if (c1 == quote_char)
1509 {
1510 int c2 = phase2_getc ();
1511 if (c2 == quote_char)
1512 triple = true;
1513 else
1514 {
1515 phase2_ungetc (c2);
1516 phase2_ungetc (c1);
1517 }
1518 }
1519 else
1520 phase2_ungetc (c1);
1521 }
1522 backslash_counter = 0;
1523 /* Start accumulating the string. */
1524 init_mixed_string_buffer (&literal);
1525 for (;;)
1526 {
1527 int uc = phase7_getuc (quote_char, triple, interpret_ansic,
1528 interpret_unicode, &backslash_counter);
1529
1530 if (uc == P7_EOF || uc == P7_STRING_END)
1531 break;
1532
1533 if (IS_UNICODE (uc))
1534 assert (UNICODE_VALUE (uc) >= 0
1535 && UNICODE_VALUE (uc) < 0x110000);
1536
1537 mixed_string_buffer_append (&literal, uc);
1538 }
1539 tp->string = xstrdup (mixed_string_buffer_result (&literal));
1540 free_mixed_string_buffer (&literal);
1541 tp->comment = add_reference (savable_comment);
1542 tp->type = token_type_string;
1543 return;
1544 }
1545
1546 case '(':
1547 open_pbb++;
1548 tp->type = token_type_lparen;
1549 return;
1550
1551 case ')':
1552 if (open_pbb > 0)
1553 open_pbb--;
1554 tp->type = token_type_rparen;
1555 return;
1556
1557 case ',':
1558 tp->type = token_type_comma;
1559 return;
1560
1561 case '[': case '{':
1562 open_pbb++;
1563 tp->type = token_type_other;
1564 return;
1565
1566 case ']': case '}':
1567 if (open_pbb > 0)
1568 open_pbb--;
1569 tp->type = token_type_other;
1570 return;
1571
1572 default:
1573 /* We could carefully recognize each of the 2 and 3 character
1574 operators, but it is not necessary, as we only need to recognize
1575 gettext invocations. Don't bother. */
1576 tp->type = token_type_other;
1577 return;
1578 }
1579 }
1580 }
1581
1582 /* Supports only one pushback token. */
1583 static void
phase5_unget(token_ty * tp)1584 phase5_unget (token_ty *tp)
1585 {
1586 if (tp->type != token_type_eof)
1587 {
1588 if (phase5_pushback_length == SIZEOF (phase5_pushback))
1589 abort ();
1590 phase5_pushback[phase5_pushback_length++] = *tp;
1591 }
1592 }
1593
1594
1595 /* Combine adjacent strings to form a single string. Note that the end
1596 of a logical line appears as a token of its own, therefore strings that
1597 belong to different logical lines will not be concatenated. */
1598
1599 static void
x_python_lex(token_ty * tp)1600 x_python_lex (token_ty *tp)
1601 {
1602 phase5_get (tp);
1603 if (tp->type != token_type_string)
1604 return;
1605 for (;;)
1606 {
1607 token_ty tmp;
1608 size_t len;
1609
1610 phase5_get (&tmp);
1611 if (tmp.type != token_type_string)
1612 {
1613 phase5_unget (&tmp);
1614 return;
1615 }
1616 len = strlen (tp->string);
1617 tp->string = xrealloc (tp->string, len + strlen (tmp.string) + 1);
1618 strcpy (tp->string + len, tmp.string);
1619 free (tmp.string);
1620 }
1621 }
1622
1623
1624 /* ========================= Extracting strings. ========================== */
1625
1626
/* Context lookup table.
   Set by extract_python from its flag_table argument; consulted by
   extract_parenthesized for every symbol token, to obtain the context
   iterator to use if that symbol is followed by a '('.  */
static flag_context_list_table_ty *flag_context_list_table;
1629
1630
/* The file is broken into tokens.  Scan the token stream, looking for
   a keyword, followed by a left paren, followed by a string.  When we
   see this sequence, we have something to remember.  We assume we are
   looking at a valid Python program, and leave the complaints about
   the grammar to the Python interpreter.
1636
1637 Normal handling: Look for
1638 keyword ( ... msgid ... )
1639 Plural handling: Look for
1640 keyword ( ... msgid ... msgid_plural ... )
1641
1642 We use recursion because the arguments before msgid or between msgid
1643 and msgid_plural can contain subexpressions of the same form. */
1644
1645
1646 /* Extract messages until the next balanced closing parenthesis.
1647 Extracted messages are added to MLP.
1648 Return true upon eof, false upon closing parenthesis. */
1649 static bool
extract_parenthesized(message_list_ty * mlp,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,struct arglist_parser * argparser)1650 extract_parenthesized (message_list_ty *mlp,
1651 flag_context_ty outer_context,
1652 flag_context_list_iterator_ty context_iter,
1653 struct arglist_parser *argparser)
1654 {
1655 /* Current argument number. */
1656 int arg = 1;
1657 /* 0 when no keyword has been seen. 1 right after a keyword is seen. */
1658 int state;
1659 /* Parameters of the keyword just seen. Defined only in state 1. */
1660 const struct callshapes *next_shapes = NULL;
1661 /* Context iterator that will be used if the next token is a '('. */
1662 flag_context_list_iterator_ty next_context_iter =
1663 passthrough_context_list_iterator;
1664 /* Current context. */
1665 flag_context_ty inner_context =
1666 inherited_context (outer_context,
1667 flag_context_list_iterator_advance (&context_iter));
1668
1669 /* Start state is 0. */
1670 state = 0;
1671
1672 for (;;)
1673 {
1674 token_ty token;
1675
1676 x_python_lex (&token);
1677 switch (token.type)
1678 {
1679 case token_type_symbol:
1680 {
1681 void *keyword_value;
1682
1683 if (hash_find_entry (&keywords, token.string, strlen (token.string),
1684 &keyword_value)
1685 == 0)
1686 {
1687 next_shapes = (const struct callshapes *) keyword_value;
1688 state = 1;
1689 }
1690 else
1691 state = 0;
1692 }
1693 next_context_iter =
1694 flag_context_list_iterator (
1695 flag_context_list_table_lookup (
1696 flag_context_list_table,
1697 token.string, strlen (token.string)));
1698 free (token.string);
1699 continue;
1700
1701 case token_type_lparen:
1702 if (extract_parenthesized (mlp, inner_context, next_context_iter,
1703 arglist_parser_alloc (mlp,
1704 state ? next_shapes : NULL)))
1705 {
1706 xgettext_current_source_encoding = po_charset_utf8;
1707 arglist_parser_done (argparser, arg);
1708 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1709 return true;
1710 }
1711 next_context_iter = null_context_list_iterator;
1712 state = 0;
1713 continue;
1714
1715 case token_type_rparen:
1716 xgettext_current_source_encoding = po_charset_utf8;
1717 arglist_parser_done (argparser, arg);
1718 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1719 return false;
1720
1721 case token_type_comma:
1722 arg++;
1723 inner_context =
1724 inherited_context (outer_context,
1725 flag_context_list_iterator_advance (
1726 &context_iter));
1727 next_context_iter = passthrough_context_list_iterator;
1728 state = 0;
1729 continue;
1730
1731 case token_type_string:
1732 {
1733 lex_pos_ty pos;
1734 pos.file_name = logical_file_name;
1735 pos.line_number = token.line_number;
1736
1737 xgettext_current_source_encoding = po_charset_utf8;
1738 if (extract_all)
1739 remember_a_message (mlp, NULL, token.string, inner_context,
1740 &pos, token.comment);
1741 else
1742 arglist_parser_remember (argparser, arg, token.string,
1743 inner_context,
1744 pos.file_name, pos.line_number,
1745 token.comment);
1746 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1747 }
1748 drop_reference (token.comment);
1749 next_context_iter = null_context_list_iterator;
1750 state = 0;
1751 continue;
1752
1753 case token_type_eof:
1754 xgettext_current_source_encoding = po_charset_utf8;
1755 arglist_parser_done (argparser, arg);
1756 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1757 return true;
1758
1759 case token_type_other:
1760 next_context_iter = null_context_list_iterator;
1761 state = 0;
1762 continue;
1763
1764 default:
1765 abort ();
1766 }
1767 }
1768 }
1769
1770
1771 void
extract_python(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1772 extract_python (FILE *f,
1773 const char *real_filename, const char *logical_filename,
1774 flag_context_list_table_ty *flag_table,
1775 msgdomain_list_ty *mdlp)
1776 {
1777 message_list_ty *mlp = mdlp->item[0]->messages;
1778
1779 fp = f;
1780 real_file_name = real_filename;
1781 logical_file_name = xstrdup (logical_filename);
1782 line_number = 1;
1783
1784 last_comment_line = -1;
1785 last_non_comment_line = -1;
1786
1787 xgettext_current_file_source_encoding = xgettext_global_source_encoding;
1788 #if HAVE_ICONV
1789 xgettext_current_file_source_iconv = xgettext_global_source_iconv;
1790 #endif
1791
1792 xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1793 #if HAVE_ICONV
1794 xgettext_current_source_iconv = xgettext_current_file_source_iconv;
1795 #endif
1796
1797 continuation_or_nonblank_line = false;
1798
1799 open_pbb = 0;
1800
1801 flag_context_list_table = flag_table;
1802
1803 init_keywords ();
1804
1805 /* Eat tokens until eof is seen. When extract_parenthesized returns
1806 due to an unbalanced closing parenthesis, just restart it. */
1807 while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
1808 arglist_parser_alloc (mlp, NULL)))
1809 ;
1810
1811 fp = NULL;
1812 real_file_name = NULL;
1813 logical_file_name = NULL;
1814 line_number = 0;
1815 }
1816