xref: /netbsd-src/external/gpl2/gettext/dist/gettext-tools/src/x-python.c (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
1 /* xgettext Python backend.
2    Copyright (C) 2002-2003, 2005-2006 Free Software Foundation, Inc.
3 
4    This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
5 
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 2, or (at your option)
9    any later version.
10 
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15 
16    You should have received a copy of the GNU General Public License
17    along with this program; if not, write to the Free Software Foundation,
18    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
19 
20 #ifdef HAVE_CONFIG_H
21 # include "config.h"
22 #endif
23 
24 #include <assert.h>
25 #include <errno.h>
26 #include <stdbool.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 
31 #include "message.h"
32 #include "xgettext.h"
33 #include "x-python.h"
34 #include "error.h"
35 #include "error-progname.h"
36 #include "progname.h"
37 #include "basename.h"
38 #include "xerror.h"
39 #include "xvasprintf.h"
40 #include "xalloc.h"
41 #include "exit.h"
42 #include "c-strstr.h"
43 #include "c-ctype.h"
44 #include "po-charset.h"
45 #include "uniname.h"
46 #include "utf16-ucs4.h"
47 #include "utf8-ucs4.h"
48 #include "ucs4-utf8.h"
49 #include "gettext.h"
50 
51 #define _(s) gettext(s)
52 
53 #define max(a,b) ((a) > (b) ? (a) : (b))
54 
/* Number of elements of the array A.  A must be a real array object, not a
   pointer.  The argument is parenthesized before indexing so that any
   array-valued expression can be passed safely.  */
#define SIZEOF(a) (sizeof (a) / sizeof ((a)[0]))
56 
57 
58 /* The Python syntax is defined in the Python Reference Manual
59    /usr/share/doc/packages/python/html/ref/index.html.
60    See also Python-2.0/Parser/tokenizer.c, Python-2.0/Python/compile.c,
61    Python-2.0/Objects/unicodeobject.c.  */
62 
63 
64 /* ====================== Keyword set customization.  ====================== */
65 
66 /* If true extract all strings.  */
67 static bool extract_all = false;
68 
69 static hash_table keywords;
70 static bool default_keywords = true;
71 
72 
73 void
x_python_extract_all()74 x_python_extract_all ()
75 {
76   extract_all = true;
77 }
78 
79 
80 void
x_python_keyword(const char * name)81 x_python_keyword (const char *name)
82 {
83   if (name == NULL)
84     default_keywords = false;
85   else
86     {
87       const char *end;
88       struct callshape shape;
89       const char *colon;
90 
91       if (keywords.table == NULL)
92 	hash_init (&keywords, 100);
93 
94       split_keywordspec (name, &end, &shape);
95 
96       /* The characters between name and end should form a valid C identifier.
97 	 A colon means an invalid parse in split_keywordspec().  */
98       colon = strchr (name, ':');
99       if (colon == NULL || colon >= end)
100 	insert_keyword_callshape (&keywords, name, end - name, &shape);
101     }
102 }
103 
104 /* Finish initializing the keywords hash table.
105    Called after argument processing, before each file is processed.  */
106 static void
init_keywords()107 init_keywords ()
108 {
109   if (default_keywords)
110     {
111       /* When adding new keywords here, also update the documentation in
112 	 xgettext.texi!  */
113       x_python_keyword ("gettext");
114       x_python_keyword ("ugettext");
115       x_python_keyword ("dgettext:2");
116       x_python_keyword ("ngettext:1,2");
117       x_python_keyword ("ungettext:1,2");
118       x_python_keyword ("dngettext:2,3");
119       x_python_keyword ("_");
120       default_keywords = false;
121     }
122 }
123 
/* Record the format-string flags of the default Python keywords by passing
   each "NAME:ARGNUM:FLAG" specification to xgettext_record_flag().  */
void
init_flag_table_python (void)
{
  /* Note: "%:1:python-format" is deliberately not listed, because % is an
     infix operator, not a function.  */
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-python-format",
      "ugettext:1:pass-python-format",
      "dgettext:2:pass-python-format",
      "ngettext:1:pass-python-format",
      "ngettext:2:pass-python-format",
      "ungettext:1:pass-python-format",
      "ungettext:2:pass-python-format",
      "dngettext:2:pass-python-format",
      "dngettext:3:pass-python-format",
      "_:1:pass-python-format"
    };
  size_t i;

  for (i = 0; i < sizeof (flag_specs) / sizeof (flag_specs[0]); i++)
    xgettext_record_flag (flag_specs[i]);
}
139 
140 
141 /* ======================== Reading of characters.  ======================== */
142 
143 /* Real filename, used in error messages about the input file.  */
144 static const char *real_file_name;
145 
146 /* Logical filename and line number, used to label the extracted messages.  */
147 static char *logical_file_name;
148 static int line_number;
149 
150 /* The input file stream.  */
151 static FILE *fp;
152 
153 
154 /* 1. line_number handling.  */
155 
156 /* Maximum used, roughly a safer MB_LEN_MAX.  */
157 #define MAX_PHASE1_PUSHBACK 16
158 static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
159 static int phase1_pushback_length;
160 
161 /* Read the next single byte from the input file.  */
162 static int
phase1_getc()163 phase1_getc ()
164 {
165   int c;
166 
167   if (phase1_pushback_length)
168     c = phase1_pushback[--phase1_pushback_length];
169   else
170     {
171       c = getc (fp);
172 
173       if (c == EOF)
174 	{
175 	  if (ferror (fp))
176 	    error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
177 		   real_file_name);
178 	  return EOF;
179 	}
180     }
181 
182   if (c == '\n')
183     ++line_number;
184 
185   return c;
186 }
187 
188 /* Supports MAX_PHASE1_PUSHBACK characters of pushback.  */
189 static void
phase1_ungetc(int c)190 phase1_ungetc (int c)
191 {
192   if (c != EOF)
193     {
194       if (c == '\n')
195 	--line_number;
196 
197       if (phase1_pushback_length == SIZEOF (phase1_pushback))
198 	abort ();
199       phase1_pushback[phase1_pushback_length++] = c;
200     }
201 }
202 
203 
204 /* Phase 2: Conversion to Unicode.
205    This is done early because PEP 0263 specifies that conversion to Unicode
206    conceptually occurs before tokenization.  A test case where it matters
207    is with encodings like BIG5: when a double-byte character ending in 0x5C
208    is followed by '\' or 'u0021', the tokenizer must not treat the second
209    half of the double-byte character as a backslash.  */
210 
211 /* End-of-file indicator for functions returning an UCS-4 character.  */
212 #define UEOF -1
213 
214 static int phase2_pushback[max (9, UNINAME_MAX + 3)];
215 static int phase2_pushback_length;
216 
217 /* Read the next Unicode UCS-4 character from the input file.  */
218 static int
phase2_getc()219 phase2_getc ()
220 {
221   if (phase2_pushback_length)
222     return phase2_pushback[--phase2_pushback_length];
223 
224   if (xgettext_current_source_encoding == po_charset_ascii)
225     {
226       int c = phase1_getc ();
227       if (c == EOF)
228 	return UEOF;
229       if (!c_isascii (c))
230 	{
231 	  char buffer[21];
232 	  sprintf (buffer, ":%ld", (long) line_number);
233 	  multiline_error (xstrdup (""),
234 			   xasprintf (_("\
235 Non-ASCII string at %s%s.\n\
236 Please specify the source encoding through --from-code or through a comment\n\
237 as specified in http://www.python.org/peps/pep-0263.html.\n"),
238 			   real_file_name, buffer));
239 	  exit (EXIT_FAILURE);
240 	}
241       return c;
242     }
243   else if (xgettext_current_source_encoding != po_charset_utf8)
244     {
245 #if HAVE_ICONV
246       /* Use iconv on an increasing number of bytes.  Read only as many bytes
247 	 through phase1_getc as needed.  This is needed to give reasonable
248 	 interactive behaviour when fp is connected to an interactive tty.  */
249       unsigned char buf[MAX_PHASE1_PUSHBACK];
250       size_t bufcount;
251       int c = phase1_getc ();
252       if (c == EOF)
253 	return UEOF;
254       buf[0] = (unsigned char) c;
255       bufcount = 1;
256 
257       for (;;)
258 	{
259 	  unsigned char scratchbuf[6];
260 	  const char *inptr = (const char *) &buf[0];
261 	  size_t insize = bufcount;
262 	  char *outptr = (char *) &scratchbuf[0];
263 	  size_t outsize = sizeof (scratchbuf);
264 
265 	  size_t res = iconv (xgettext_current_source_iconv,
266 			      (ICONV_CONST char **) &inptr, &insize,
267 			      &outptr, &outsize);
268 	  /* We expect that a character has been produced if and only if
269 	     some input bytes have been consumed.  */
270 	  if ((insize < bufcount) != (outsize < sizeof (scratchbuf)))
271 	    abort ();
272 	  if (outsize == sizeof (scratchbuf))
273 	    {
274 	      /* No character has been produced.  Must be an error.  */
275 	      if (res != (size_t)(-1))
276 		abort ();
277 
278 	      if (errno == EILSEQ)
279 		{
280 		  /* An invalid multibyte sequence was encountered.  */
281 		  multiline_error (xstrdup (""),
282 				   xasprintf (_("\
283 %s:%d: Invalid multibyte sequence.\n\
284 Please specify the correct source encoding through --from-code or through a\n\
285 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
286 				   real_file_name, line_number));
287 		  exit (EXIT_FAILURE);
288 		}
289 	      else if (errno == EINVAL)
290 		{
291 		  /* An incomplete multibyte character.  */
292 		  int c;
293 
294 		  if (bufcount == MAX_PHASE1_PUSHBACK)
295 		    {
296 		      /* An overlong incomplete multibyte sequence was
297 			 encountered.  */
298 		      multiline_error (xstrdup (""),
299 				       xasprintf (_("\
300 %s:%d: Long incomplete multibyte sequence.\n\
301 Please specify the correct source encoding through --from-code or through a\n\
302 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
303 				       real_file_name, line_number));
304 		      exit (EXIT_FAILURE);
305 		    }
306 
307 		  /* Read one more byte and retry iconv.  */
308 		  c = phase1_getc ();
309 		  if (c == EOF)
310 		    {
311 		      multiline_error (xstrdup (""),
312 				       xasprintf (_("\
313 %s:%d: Incomplete multibyte sequence at end of file.\n\
314 Please specify the correct source encoding through --from-code or through a\n\
315 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
316 				       real_file_name, line_number));
317 		      exit (EXIT_FAILURE);
318 		    }
319 		  if (c == '\n')
320 		    {
321 		      multiline_error (xstrdup (""),
322 				       xasprintf (_("\
323 %s:%d: Incomplete multibyte sequence at end of line.\n\
324 Please specify the correct source encoding through --from-code or through a\n\
325 comment as specified in http://www.python.org/peps/pep-0263.html.\n"),
326 				       real_file_name, line_number - 1));
327 		      exit (EXIT_FAILURE);
328 		    }
329 		  buf[bufcount++] = (unsigned char) c;
330 		}
331 	      else
332 		error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"),
333 		       real_file_name, line_number);
334 	    }
335 	  else
336 	    {
337 	      size_t outbytes = sizeof (scratchbuf) - outsize;
338 	      size_t bytes = bufcount - insize;
339 	      unsigned int uc;
340 
341 	      /* We expect that one character has been produced.  */
342 	      if (bytes == 0)
343 		abort ();
344 	      if (outbytes == 0)
345 		abort ();
346 	      /* Push back the unused bytes.  */
347 	      while (insize > 0)
348 		phase1_ungetc (buf[--insize]);
349 	      /* Convert the character from UTF-8 to UCS-4.  */
350 	      if (u8_mbtouc (&uc, scratchbuf, outbytes) < outbytes)
351 		{
352 		  /* scratchbuf contains an out-of-range Unicode character
353 		     (> 0x10ffff).  */
354 		  multiline_error (xstrdup (""),
355 				   xasprintf (_("\
356 %s:%d: Invalid multibyte sequence.\n\
357 Please specify the source encoding through --from-code or through a comment\n\
358 as specified in http://www.python.org/peps/pep-0263.html.\n"),
359 				   real_file_name, line_number));
360 		  exit (EXIT_FAILURE);
361 		}
362 	      return uc;
363 	    }
364 	}
365 #else
366       /* If we don't have iconv(), the only supported values for
367 	 xgettext_global_source_encoding and thus also for
368 	 xgettext_current_source_encoding are ASCII and UTF-8.  */
369       abort ();
370 #endif
371     }
372   else
373     {
374       /* Read an UTF-8 encoded character.  */
375       unsigned char buf[6];
376       unsigned int count;
377       int c;
378       unsigned int uc;
379 
380       c = phase1_getc ();
381       if (c == EOF)
382 	return UEOF;
383       buf[0] = c;
384       count = 1;
385 
386       if (buf[0] >= 0xc0)
387 	{
388 	  c = phase1_getc ();
389 	  if (c == EOF)
390 	    return UEOF;
391 	  buf[1] = c;
392 	  count = 2;
393 	}
394 
395       if (buf[0] >= 0xe0
396 	  && ((buf[1] ^ 0x80) < 0x40))
397 	{
398 	  c = phase1_getc ();
399 	  if (c == EOF)
400 	    return UEOF;
401 	  buf[2] = c;
402 	  count = 3;
403 	}
404 
405       if (buf[0] >= 0xf0
406 	  && ((buf[1] ^ 0x80) < 0x40)
407 	  && ((buf[2] ^ 0x80) < 0x40))
408 	{
409 	  c = phase1_getc ();
410 	  if (c == EOF)
411 	    return UEOF;
412 	  buf[3] = c;
413 	  count = 4;
414 	}
415 
416       if (buf[0] >= 0xf8
417 	  && ((buf[1] ^ 0x80) < 0x40)
418 	  && ((buf[2] ^ 0x80) < 0x40)
419 	  && ((buf[3] ^ 0x80) < 0x40))
420 	{
421 	  c = phase1_getc ();
422 	  if (c == EOF)
423 	    return UEOF;
424 	  buf[4] = c;
425 	  count = 5;
426 	}
427 
428       if (buf[0] >= 0xfc
429 	  && ((buf[1] ^ 0x80) < 0x40)
430 	  && ((buf[2] ^ 0x80) < 0x40)
431 	  && ((buf[3] ^ 0x80) < 0x40)
432 	  && ((buf[4] ^ 0x80) < 0x40))
433 	{
434 	  c = phase1_getc ();
435 	  if (c == EOF)
436 	    return UEOF;
437 	  buf[5] = c;
438 	  count = 6;
439 	}
440 
441       u8_mbtouc (&uc, buf, count);
442       return uc;
443     }
444 }
445 
446 /* Supports max (9, UNINAME_MAX + 3) pushback characters.  */
447 static void
phase2_ungetc(int c)448 phase2_ungetc (int c)
449 {
450   if (c != UEOF)
451     {
452       if (phase2_pushback_length == SIZEOF (phase2_pushback))
453 	abort ();
454       phase2_pushback[phase2_pushback_length++] = c;
455     }
456 }
457 
458 
459 /* ========================= Accumulating strings.  ======================== */
460 
461 /* A string buffer type that allows appending Unicode characters.
462    Returns the entire string in UTF-8 encoding.  */
463 
struct unicode_string_buffer
{
  /* The part of the string that has already been converted to UTF-8.  */
  /* Storage for the UTF-8 bytes; NULL after initialization, (re)allocated
     on demand by the append functions.  */
  char *utf8_buffer;
  /* Number of bytes of utf8_buffer that are in use.  */
  size_t utf8_buflen;
  /* Number of bytes allocated for utf8_buffer.  */
  size_t utf8_allocated;
};
471 
472 /* Initialize a 'struct unicode_string_buffer' to empty.  */
473 static inline void
init_unicode_string_buffer(struct unicode_string_buffer * bp)474 init_unicode_string_buffer (struct unicode_string_buffer *bp)
475 {
476   bp->utf8_buffer = NULL;
477   bp->utf8_buflen = 0;
478   bp->utf8_allocated = 0;
479 }
480 
481 /* Auxiliary function: Ensure count more bytes are available in bp->utf8.  */
482 static inline void
unicode_string_buffer_append_unicode_grow(struct unicode_string_buffer * bp,size_t count)483 unicode_string_buffer_append_unicode_grow (struct unicode_string_buffer *bp,
484 					   size_t count)
485 {
486   if (bp->utf8_buflen + count > bp->utf8_allocated)
487     {
488       size_t new_allocated = 2 * bp->utf8_allocated + 10;
489       if (new_allocated < bp->utf8_buflen + count)
490 	new_allocated = bp->utf8_buflen + count;
491       bp->utf8_allocated = new_allocated;
492       bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
493     }
494 }
495 
496 /* Auxiliary function: Append a Unicode character to bp->utf8.
497    uc must be < 0x110000.  */
498 static inline void
unicode_string_buffer_append_unicode(struct unicode_string_buffer * bp,unsigned int uc)499 unicode_string_buffer_append_unicode (struct unicode_string_buffer *bp,
500 				      unsigned int uc)
501 {
502   unsigned char utf8buf[6];
503   int count = u8_uctomb (utf8buf, uc, 6);
504 
505   if (count < 0)
506     /* The caller should have ensured that uc is not out-of-range.  */
507     abort ();
508 
509   unicode_string_buffer_append_unicode_grow (bp, count);
510   memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
511   bp->utf8_buflen += count;
512 }
513 
514 /* Return the string buffer's contents.  */
515 static char *
unicode_string_buffer_result(struct unicode_string_buffer * bp)516 unicode_string_buffer_result (struct unicode_string_buffer *bp)
517 {
518   /* NUL-terminate it.  */
519   unicode_string_buffer_append_unicode_grow (bp, 1);
520   bp->utf8_buffer[bp->utf8_buflen] = '\0';
521   /* Return it.  */
522   return bp->utf8_buffer;
523 }
524 
525 /* Free the memory pointed to by a 'struct unicode_string_buffer'.  */
526 static inline void
free_unicode_string_buffer(struct unicode_string_buffer * bp)527 free_unicode_string_buffer (struct unicode_string_buffer *bp)
528 {
529   free (bp->utf8_buffer);
530 }
531 
532 
533 /* ======================== Accumulating comments.  ======================== */
534 
535 
536 /* Accumulating a single comment line.  */
537 
538 static struct unicode_string_buffer comment_buffer;
539 
540 static inline void
comment_start()541 comment_start ()
542 {
543   comment_buffer.utf8_buflen = 0;
544 }
545 
546 static inline bool
comment_at_start()547 comment_at_start ()
548 {
549   return (comment_buffer.utf8_buflen == 0);
550 }
551 
552 static inline void
comment_add(int c)553 comment_add (int c)
554 {
555   unicode_string_buffer_append_unicode (&comment_buffer, c);
556 }
557 
558 static inline const char *
comment_line_end()559 comment_line_end ()
560 {
561   char *buffer = unicode_string_buffer_result (&comment_buffer);
562   size_t buflen = strlen (buffer);
563 
564   while (buflen >= 1
565 	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
566     --buflen;
567   buffer[buflen] = '\0';
568   savable_comment_add (buffer);
569   return buffer;
570 }
571 
572 
573 /* These are for tracking whether comments count as immediately before
574    keyword.  */
575 static int last_comment_line;
576 static int last_non_comment_line;
577 
578 
579 /* ======================== Recognizing comments.  ======================== */
580 
581 
/* Recognizing the "coding" comment.
   As specified in PEP 0263, it takes the form
     "coding" [":"|"="] {" "|"\t"}* {alphanumeric or "-" or "_" or "."}+
   and is located in a comment in a line that
     - is either the first or second line,
     - is not a continuation line,
     - contains no other tokens except this comment.  */
589 
590 /* Canonicalized encoding name for the current input file.  */
591 static const char *xgettext_current_file_source_encoding;
592 
593 #if HAVE_ICONV
594 /* Converter from xgettext_current_file_source_encoding to UTF-8 (except from
595    ASCII or UTF-8, when this conversion is a no-op).  */
596 static iconv_t xgettext_current_file_source_iconv;
597 #endif
598 
599 static inline void
set_current_file_source_encoding(const char * canon_encoding)600 set_current_file_source_encoding (const char *canon_encoding)
601 {
602   xgettext_current_file_source_encoding = canon_encoding;
603 
604   if (xgettext_current_file_source_encoding != po_charset_ascii
605       && xgettext_current_file_source_encoding != po_charset_utf8)
606     {
607 #if HAVE_ICONV
608       iconv_t cd;
609 
610       /* Avoid glibc-2.1 bug with EUC-KR.  */
611 # if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
612       if (strcmp (xgettext_current_file_source_encoding, "EUC-KR") == 0)
613 	cd = (iconv_t)(-1);
614       else
615 # endif
616       cd = iconv_open (po_charset_utf8, xgettext_current_file_source_encoding);
617       if (cd == (iconv_t)(-1))
618 	error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\
619 Cannot convert from \"%s\" to \"%s\". %s relies on iconv(), \
620 and iconv() does not support this conversion."),
621 	       xgettext_current_file_source_encoding, po_charset_utf8,
622 	       basename (program_name));
623       xgettext_current_file_source_iconv = cd;
624 #else
625       error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\
626 Cannot convert from \"%s\" to \"%s\". %s relies on iconv(). \
627 This version was built without iconv()."),
628 	     xgettext_global_source_encoding, po_charset_utf8,
629 	     basename (program_name));
630 #endif
631     }
632 
633   xgettext_current_source_encoding = xgettext_current_file_source_encoding;
634 #if HAVE_ICONV
635   xgettext_current_source_iconv = xgettext_current_file_source_iconv;
636 #endif
637 }
638 
639 static inline void
try_to_extract_coding(const char * comment)640 try_to_extract_coding (const char *comment)
641 {
642   const char *p = c_strstr (comment, "coding");
643 
644   if (p != NULL)
645     {
646       p += 6;
647       if (*p == ':' || *p == '=')
648 	{
649 	  p++;
650 	  while (*p == ' ' || *p == '\t')
651 	    p++;
652 	  {
653 	    const char *encoding_start = p;
654 
655 	    while (c_isalnum (*p) || *p == '-' || *p == '_' || *p == '.')
656 	      p++;
657 	    {
658 	      const char *encoding_end = p;
659 
660 	      if (encoding_end > encoding_start)
661 		{
662 		  /* Extract the encoding string.  */
663 		  size_t encoding_len = encoding_end - encoding_start;
664 		  char *encoding = (char *) xmalloc (encoding_len + 1);
665 
666 		  memcpy (encoding, encoding_start, encoding_len);
667 		  encoding[encoding_len] = '\0';
668 
669 		  {
670 		    /* Canonicalize it.  */
671 		    const char *canon_encoding = po_charset_canonicalize (encoding);
672 		    if (canon_encoding == NULL)
673 		      {
674 			error_at_line (0, 0,
675 				       logical_file_name, line_number - 1, _("\
676 Unknown encoding \"%s\". Proceeding with ASCII instead."),
677 				       encoding);
678 		        canon_encoding = po_charset_ascii;
679 		      }
680 
681 		    /* Activate it.  */
682 		    set_current_file_source_encoding (canon_encoding);
683 		  }
684 
685 		  free (encoding);
686 		}
687 	    }
688 	  }
689 	}
690     }
691 }
692 
693 /* Tracking whether the current line is a continuation line or contains a
694    non-blank character.  */
695 static bool continuation_or_nonblank_line = false;
696 
697 
698 /* Phase 3: Outside strings, replace backslash-newline with nothing and a
699    comment with nothing.  */
700 
701 static int
phase3_getc()702 phase3_getc ()
703 {
704   int c;
705 
706   for (;;)
707     {
708       c = phase2_getc ();
709       if (c == '\\')
710 	{
711 	  c = phase2_getc ();
712 	  if (c != '\n')
713 	    {
714 	      phase2_ungetc (c);
715 	      /* This shouldn't happen usually, because "A backslash is
716 		 illegal elsewhere on a line outside a string literal."  */
717 	      return '\\';
718 	    }
719 	  /* Eat backslash-newline.  */
720 	  continuation_or_nonblank_line = true;
721 	}
722       else if (c == '#')
723 	{
724 	  /* Eat a comment.  */
725 	  const char *comment;
726 
727 	  last_comment_line = line_number;
728 	  comment_start ();
729 	  for (;;)
730 	    {
731 	      c = phase2_getc ();
732 	      if (c == UEOF || c == '\n')
733 		break;
734 	      /* We skip all leading white space, but not EOLs.  */
735 	      if (!(comment_at_start () && (c == ' ' || c == '\t')))
736 		comment_add (c);
737 	    }
738 	  comment = comment_line_end ();
739 	  if (line_number - 1 <= 2 && !continuation_or_nonblank_line)
740 	    try_to_extract_coding (comment);
741 	  continuation_or_nonblank_line = false;
742 	  return c;
743 	}
744       else
745 	{
746 	  if (c == '\n')
747 	    continuation_or_nonblank_line = false;
748 	  else if (!(c == ' ' || c == '\t' || c == '\f'))
749 	    continuation_or_nonblank_line = true;
750 	  return c;
751 	}
752     }
753 }
754 
/* Push C back for phase3_getc() by handing it to the phase 2 pushback.
   Supports only one pushback character.  */
static void
phase3_ungetc (int c)
{
  /* Phase 3 keeps no pushback of its own.  */
  phase2_ungetc (c);
}
761 
762 
763 /* ========================= Accumulating strings.  ======================== */
764 
765 /* Return value of phase7_getuc when EOF is reached.  */
766 #define P7_EOF (-1)
767 #define P7_STRING_END (-2)
768 
769 /* Convert an UTF-16 or UTF-32 code point to a return value that can be
770    distinguished from a single-byte return value.  */
771 #define UNICODE(code) (0x100 + (code))
772 
773 /* Test a return value of phase7_getuc whether it designates an UTF-16 or
774    UTF-32 code point.  */
775 #define IS_UNICODE(p7_result) ((p7_result) >= 0x100)
776 
777 /* Extract the UTF-16 or UTF-32 code of a return value that satisfies
778    IS_UNICODE.  */
779 #define UNICODE_VALUE(p7_result) ((p7_result) - 0x100)
780 
781 /* A string buffer type that allows appending bytes (in the
782    xgettext_current_source_encoding) or Unicode characters.
783    Returns the entire string in UTF-8 encoding.  */
784 
struct mixed_string_buffer
{
  /* The part of the string that has already been converted to UTF-8.  */
  /* utf8_buffer is NULL after initialization and is (re)allocated on
     demand; utf8_buflen bytes of its utf8_allocated bytes are in use.  */
  char *utf8_buffer;
  size_t utf8_buflen;
  size_t utf8_allocated;
  /* The first half of an UTF-16 surrogate character.  */
  /* Zero means that no first half is pending.  */
  unsigned short utf16_surr;
  /* The part of the string that is still in the source encoding.  */
  /* curr_buffer is NULL after initialization and is (re)allocated on
     demand; curr_buflen bytes of its curr_allocated bytes are in use.  */
  char *curr_buffer;
  size_t curr_buflen;
  size_t curr_allocated;
};
798 
799 /* Initialize a 'struct mixed_string_buffer' to empty.  */
800 static inline void
init_mixed_string_buffer(struct mixed_string_buffer * bp)801 init_mixed_string_buffer (struct mixed_string_buffer *bp)
802 {
803   bp->utf8_buffer = NULL;
804   bp->utf8_buflen = 0;
805   bp->utf8_allocated = 0;
806   bp->utf16_surr = 0;
807   bp->curr_buffer = NULL;
808   bp->curr_buflen = 0;
809   bp->curr_allocated = 0;
810 }
811 
812 /* Auxiliary function: Append a byte to bp->curr.  */
813 static inline void
mixed_string_buffer_append_byte(struct mixed_string_buffer * bp,unsigned char c)814 mixed_string_buffer_append_byte (struct mixed_string_buffer *bp, unsigned char c)
815 {
816   if (bp->curr_buflen == bp->curr_allocated)
817     {
818       bp->curr_allocated = 2 * bp->curr_allocated + 10;
819       bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated);
820     }
821   bp->curr_buffer[bp->curr_buflen++] = c;
822 }
823 
824 /* Auxiliary function: Ensure count more bytes are available in bp->utf8.  */
825 static inline void
mixed_string_buffer_append_unicode_grow(struct mixed_string_buffer * bp,size_t count)826 mixed_string_buffer_append_unicode_grow (struct mixed_string_buffer *bp, size_t count)
827 {
828   if (bp->utf8_buflen + count > bp->utf8_allocated)
829     {
830       size_t new_allocated = 2 * bp->utf8_allocated + 10;
831       if (new_allocated < bp->utf8_buflen + count)
832 	new_allocated = bp->utf8_buflen + count;
833       bp->utf8_allocated = new_allocated;
834       bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
835     }
836 }
837 
838 /* Auxiliary function: Append a Unicode character to bp->utf8.
839    uc must be < 0x110000.  */
840 static inline void
mixed_string_buffer_append_unicode(struct mixed_string_buffer * bp,unsigned int uc)841 mixed_string_buffer_append_unicode (struct mixed_string_buffer *bp, unsigned int uc)
842 {
843   unsigned char utf8buf[6];
844   int count = u8_uctomb (utf8buf, uc, 6);
845 
846   if (count < 0)
847     /* The caller should have ensured that uc is not out-of-range.  */
848     abort ();
849 
850   mixed_string_buffer_append_unicode_grow (bp, count);
851   memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
852   bp->utf8_buflen += count;
853 }
854 
855 /* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer.  */
856 static inline void
mixed_string_buffer_flush_utf16_surr(struct mixed_string_buffer * bp)857 mixed_string_buffer_flush_utf16_surr (struct mixed_string_buffer *bp)
858 {
859   if (bp->utf16_surr != 0)
860     {
861       /* A half surrogate is invalid, therefore use U+FFFD instead.  */
862       mixed_string_buffer_append_unicode (bp, 0xfffd);
863       bp->utf16_surr = 0;
864     }
865 }
866 
867 /* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer.  */
868 static inline void
mixed_string_buffer_flush_curr_buffer(struct mixed_string_buffer * bp,int lineno)869 mixed_string_buffer_flush_curr_buffer (struct mixed_string_buffer *bp, int lineno)
870 {
871   if (bp->curr_buflen > 0)
872     {
873       char *curr;
874       size_t count;
875 
876       mixed_string_buffer_append_byte (bp, '\0');
877 
878       /* Convert from the source encoding to UTF-8.  */
879       curr = from_current_source_encoding (bp->curr_buffer,
880 					   logical_file_name, lineno);
881 
882       /* Append it to bp->utf8_buffer.  */
883       count = strlen (curr);
884       mixed_string_buffer_append_unicode_grow (bp, count);
885       memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count);
886       bp->utf8_buflen += count;
887 
888       if (curr != bp->curr_buffer)
889 	free (curr);
890       bp->curr_buflen = 0;
891     }
892 }
893 
894 /* Append a character or Unicode character to a 'struct mixed_string_buffer'.  */
895 static void
mixed_string_buffer_append(struct mixed_string_buffer * bp,int c)896 mixed_string_buffer_append (struct mixed_string_buffer *bp, int c)
897 {
898   if (IS_UNICODE (c))
899     {
900       /* Append a Unicode character.  */
901 
902       /* Switch from multibyte character mode to Unicode character mode.  */
903       mixed_string_buffer_flush_curr_buffer (bp, line_number);
904 
905       /* Test whether this character and the previous one form a Unicode
906 	 surrogate character pair.  */
907       if (bp->utf16_surr != 0
908 	  && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
909 	{
910 	  unsigned short utf16buf[2];
911 	  unsigned int uc;
912 
913 	  utf16buf[0] = bp->utf16_surr;
914 	  utf16buf[1] = UNICODE_VALUE (c);
915 	  if (u16_mbtouc_aux (&uc, utf16buf, 2) != 2)
916 	    abort ();
917 
918 	  mixed_string_buffer_append_unicode (bp, uc);
919 	  bp->utf16_surr = 0;
920 	}
921       else
922 	{
923 	  mixed_string_buffer_flush_utf16_surr (bp);
924 
925 	  if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
926 	    bp->utf16_surr = UNICODE_VALUE (c);
927 	  else
928 	    mixed_string_buffer_append_unicode (bp, UNICODE_VALUE (c));
929 	}
930     }
931   else
932     {
933       /* Append a single byte.  */
934 
935       /* Switch from Unicode character mode to multibyte character mode.  */
936       mixed_string_buffer_flush_utf16_surr (bp);
937 
938       /* When a newline is seen, convert the accumulated multibyte sequence.
939 	 This ensures a correct line number in the error message in case of
940 	 a conversion error.  The "- 1" is to account for the newline.  */
941       if (c == '\n')
942 	mixed_string_buffer_flush_curr_buffer (bp, line_number - 1);
943 
944       mixed_string_buffer_append_byte (bp, (unsigned char) c);
945     }
946 }
947 
948 /* Return the string buffer's contents.  */
949 static char *
mixed_string_buffer_result(struct mixed_string_buffer * bp)950 mixed_string_buffer_result (struct mixed_string_buffer *bp)
951 {
952   /* Flush all into bp->utf8_buffer.  */
953   mixed_string_buffer_flush_utf16_surr (bp);
954   mixed_string_buffer_flush_curr_buffer (bp, line_number);
955   /* NUL-terminate it.  */
956   mixed_string_buffer_append_unicode_grow (bp, 1);
957   bp->utf8_buffer[bp->utf8_buflen] = '\0';
958   /* Return it.  */
959   return bp->utf8_buffer;
960 }
961 
962 /* Free the memory pointed to by a 'struct mixed_string_buffer'.  */
963 static inline void
free_mixed_string_buffer(struct mixed_string_buffer * bp)964 free_mixed_string_buffer (struct mixed_string_buffer *bp)
965 {
966   free (bp->utf8_buffer);
967   free (bp->curr_buffer);
968 }
969 
970 
971 /* ========================== Reading of tokens.  ========================== */
972 
973 
/* The kinds of tokens distinguished by the tokenizer.  */
enum token_type_ty
{
  token_type_eof,
  token_type_lparen,		/* ( */
  token_type_rparen,		/* ) */
  token_type_comma,		/* , */
  token_type_string,		/* "abc", 'abc', """abc""", '''abc''' */
  token_type_symbol,		/* symbol, number */
  token_type_other		/* misc. operator */
};
typedef enum token_type_ty token_type_ty;
985 
986 typedef struct token_ty token_ty;
987 struct token_ty
988 {
989   token_type_ty type;
990   char *string;		/* for token_type_string, token_type_symbol */
991   refcounted_string_list_ty *comment;	/* for token_type_string */
992   int line_number;
993 };
994 
995 
996 /* There are two different input syntaxes for strings, "abc" and r"abc",
997    and two different input syntaxes for Unicode strings, u"abc" and ur"abc".
998    Which escape sequences are understood, i.e. what is interpreted specially
999    after backslash?
1000     "abc"     \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn
1001     r"abc"
1002     u"abc"    \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...}
1003     ur"abc"                                           \unnnn
1004    The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two
1005    \unnnn items.  The \ooo and \xnn values are in the current source encoding.
1006  */
1007 
1008 static int
phase7_getuc(int quote_char,bool triple,bool interpret_ansic,bool interpret_unicode,unsigned int * backslash_counter)1009 phase7_getuc (int quote_char,
1010 	      bool triple, bool interpret_ansic, bool interpret_unicode,
1011 	      unsigned int *backslash_counter)
1012 {
1013   int c;
1014 
1015   for (;;)
1016     {
1017       /* Use phase 2, because phase 3 elides comments.  */
1018       c = phase2_getc ();
1019 
1020       if (c == UEOF)
1021 	return P7_EOF;
1022 
1023       if (c == quote_char && (interpret_ansic || (*backslash_counter & 1) == 0))
1024 	{
1025 	  if (triple)
1026 	    {
1027 	      int c1 = phase2_getc ();
1028 	      if (c1 == quote_char)
1029 		{
1030 		  int c2 = phase2_getc ();
1031 		  if (c2 == quote_char)
1032 		    return P7_STRING_END;
1033 		  phase2_ungetc (c2);
1034 		}
1035 	      phase2_ungetc (c1);
1036 	      return UNICODE (c);
1037 	    }
1038 	  else
1039 	    return P7_STRING_END;
1040 	}
1041 
1042       if (c == '\n')
1043 	{
1044 	  if (triple)
1045 	    {
1046 	      *backslash_counter = 0;
1047 	      return UNICODE ('\n');
1048 	    }
1049 	  /* In r"..." and ur"..." strings, newline is only allowed
1050 	     immediately after an odd number of backslashes (although the
1051 	     backslashes are not interpreted!).  */
1052 	  if (!(interpret_ansic || (*backslash_counter & 1) == 0))
1053 	    {
1054 	      *backslash_counter = 0;
1055 	      return UNICODE ('\n');
1056 	    }
1057 	  phase2_ungetc (c);
1058 	  error_with_progname = false;
1059 	  error (0, 0, _("%s:%d: warning: unterminated string"),
1060 		 logical_file_name, line_number);
1061 	  error_with_progname = true;
1062 	  return P7_STRING_END;
1063 	}
1064 
1065       if (c != '\\')
1066 	{
1067 	  *backslash_counter = 0;
1068 	  return UNICODE (c);
1069 	}
1070 
1071       /* Backslash handling.  */
1072 
1073       if (!interpret_ansic && !interpret_unicode)
1074 	{
1075 	  ++*backslash_counter;
1076 	  return UNICODE ('\\');
1077 	}
1078 
1079       /* Dispatch according to the character following the backslash.  */
1080       c = phase2_getc ();
1081       if (c == UEOF)
1082 	{
1083 	  ++*backslash_counter;
1084 	  return UNICODE ('\\');
1085 	}
1086 
1087       if (interpret_ansic)
1088 	switch (c)
1089 	  {
1090 	  case '\n':
1091 	    continue;
1092 	  case '\\':
1093 	    ++*backslash_counter;
1094 	    return UNICODE (c);
1095 	  case '\'': case '"':
1096 	    *backslash_counter = 0;
1097 	    return UNICODE (c);
1098 	  case 'a':
1099 	    *backslash_counter = 0;
1100 	    return UNICODE ('\a');
1101 	  case 'b':
1102 	    *backslash_counter = 0;
1103 	    return UNICODE ('\b');
1104 	  case 'f':
1105 	    *backslash_counter = 0;
1106 	    return UNICODE ('\f');
1107 	  case 'n':
1108 	    *backslash_counter = 0;
1109 	    return UNICODE ('\n');
1110 	  case 'r':
1111 	    *backslash_counter = 0;
1112 	    return UNICODE ('\r');
1113 	  case 't':
1114 	    *backslash_counter = 0;
1115 	    return UNICODE ('\t');
1116 	  case 'v':
1117 	    *backslash_counter = 0;
1118 	    return UNICODE ('\v');
1119 	  case '0': case '1': case '2': case '3': case '4':
1120 	  case '5': case '6': case '7':
1121 	    {
1122 	      int n = c - '0';
1123 
1124 	      c = phase2_getc ();
1125 	      if (c != UEOF)
1126 		{
1127 		  if (c >= '0' && c <= '7')
1128 		    {
1129 		      n = (n << 3) + (c - '0');
1130 		      c = phase2_getc ();
1131 		      if (c != UEOF)
1132 			{
1133 			  if (c >= '0' && c <= '7')
1134 			    n = (n << 3) + (c - '0');
1135 			  else
1136 			    phase2_ungetc (c);
1137 			}
1138 		    }
1139 		  else
1140 		    phase2_ungetc (c);
1141 		}
1142 	      *backslash_counter = 0;
1143 	      return (unsigned char) n;
1144 	    }
1145 	  case 'x':
1146 	    {
1147 	      int c1 = phase2_getc ();
1148 	      int n1;
1149 
1150 	      if (c1 >= '0' && c1 <= '9')
1151 		n1 = c1 - '0';
1152 	      else if (c1 >= 'A' && c1 <= 'F')
1153 		n1 = c1 - 'A' + 10;
1154 	      else if (c1 >= 'a' && c1 <= 'f')
1155 		n1 = c1 - 'a' + 10;
1156 	      else
1157 		n1 = -1;
1158 
1159 	      if (n1 >= 0)
1160 		{
1161 		  int c2 = phase2_getc ();
1162 		  int n2;
1163 
1164 		  if (c2 >= '0' && c2 <= '9')
1165 		    n2 = c2 - '0';
1166 		  else if (c2 >= 'A' && c2 <= 'F')
1167 		    n2 = c2 - 'A' + 10;
1168 		  else if (c2 >= 'a' && c2 <= 'f')
1169 		    n2 = c2 - 'a' + 10;
1170 		  else
1171 		    n2 = -1;
1172 
1173 		  if (n2 >= 0)
1174 		    {
1175 		      *backslash_counter = 0;
1176 		      return (unsigned char) ((n1 << 4) + n2);
1177 		    }
1178 
1179 		  phase2_ungetc (c2);
1180 		}
1181 	      phase2_ungetc (c1);
1182 	      phase2_ungetc (c);
1183 	      ++*backslash_counter;
1184 	      return UNICODE ('\\');
1185 	    }
1186 	  }
1187 
1188       if (interpret_unicode)
1189 	{
1190 	  if (c == 'u')
1191 	    {
1192 	      unsigned char buf[4];
1193 	      unsigned int n = 0;
1194 	      int i;
1195 
1196 	      for (i = 0; i < 4; i++)
1197 		{
1198 		  int c1 = phase2_getc ();
1199 
1200 		  if (c1 >= '0' && c1 <= '9')
1201 		    n = (n << 4) + (c1 - '0');
1202 		  else if (c1 >= 'A' && c1 <= 'F')
1203 		    n = (n << 4) + (c1 - 'A' + 10);
1204 		  else if (c1 >= 'a' && c1 <= 'f')
1205 		    n = (n << 4) + (c1 - 'a' + 10);
1206 		  else
1207 		    {
1208 		      phase2_ungetc (c1);
1209 		      while (--i >= 0)
1210 			phase2_ungetc (buf[i]);
1211 		      phase2_ungetc (c);
1212 		      ++*backslash_counter;
1213 		      return UNICODE ('\\');
1214 		    }
1215 
1216 		  buf[i] = c1;
1217 		}
1218 	      *backslash_counter = 0;
1219 	      return UNICODE (n);
1220 	    }
1221 
1222 	  if (interpret_ansic)
1223 	    {
1224 	      if (c == 'U')
1225 		{
1226 		  unsigned char buf[8];
1227 		  unsigned int n = 0;
1228 		  int i;
1229 
1230 		  for (i = 0; i < 8; i++)
1231 		    {
1232 		      int c1 = phase2_getc ();
1233 
1234 		      if (c1 >= '0' && c1 <= '9')
1235 			n = (n << 4) + (c1 - '0');
1236 		      else if (c1 >= 'A' && c1 <= 'F')
1237 			n = (n << 4) + (c1 - 'A' + 10);
1238 		      else if (c1 >= 'a' && c1 <= 'f')
1239 			n = (n << 4) + (c1 - 'a' + 10);
1240 		      else
1241 			{
1242 			  phase2_ungetc (c1);
1243 			  while (--i >= 0)
1244 			    phase2_ungetc (buf[i]);
1245 			  phase2_ungetc (c);
1246 			  ++*backslash_counter;
1247 			  return UNICODE ('\\');
1248 			}
1249 
1250 		      buf[i] = c1;
1251 		    }
1252 		  if (n < 0x110000)
1253 		    {
1254 		      *backslash_counter = 0;
1255 		      return UNICODE (n);
1256 		    }
1257 
1258 		  error_with_progname = false;
1259 		  error (0, 0, _("%s:%d: warning: invalid Unicode character"),
1260 			 logical_file_name, line_number);
1261 		  error_with_progname = true;
1262 
1263 		  while (--i >= 0)
1264 		    phase2_ungetc (buf[i]);
1265 		  phase2_ungetc (c);
1266 		  ++*backslash_counter;
1267 		  return UNICODE ('\\');
1268 		}
1269 
1270 	      if (c == 'N')
1271 		{
1272 		  int c1 = phase2_getc ();
1273 		  if (c1 == '{')
1274 		    {
1275 		      unsigned char buf[UNINAME_MAX + 1];
1276 		      int i;
1277 		      unsigned int n;
1278 
1279 		      for (i = 0; i < UNINAME_MAX; i++)
1280 			{
1281 			  int c2 = phase2_getc ();
1282 			  if (!(c2 >= ' ' && c2 <= '~'))
1283 			    {
1284 			      phase2_ungetc (c2);
1285 			      while (--i >= 0)
1286 				phase2_ungetc (buf[i]);
1287 			      phase2_ungetc (c1);
1288 			      phase2_ungetc (c);
1289 			      ++*backslash_counter;
1290 			      return UNICODE ('\\');
1291 			    }
1292 			  if (c2 == '}')
1293 			    break;
1294 			  buf[i] = c2;
1295 			}
1296 		      buf[i] = '\0';
1297 
1298 		      n = unicode_name_character ((char *) buf);
1299 		      if (n != UNINAME_INVALID)
1300 			{
1301 			  *backslash_counter = 0;
1302 			  return UNICODE (n);
1303 			}
1304 
1305 		      phase2_ungetc ('}');
1306 		      while (--i >= 0)
1307 			phase2_ungetc (buf[i]);
1308 		    }
1309 		  phase2_ungetc (c1);
1310 		  phase2_ungetc (c);
1311 		  ++*backslash_counter;
1312 		  return UNICODE ('\\');
1313 		}
1314 	    }
1315 	}
1316 
1317       phase2_ungetc (c);
1318       ++*backslash_counter;
1319       return UNICODE ('\\');
1320     }
1321 }
1322 
1323 
1324 /* Combine characters into tokens.  Discard whitespace except newlines at
1325    the end of logical lines.  */
1326 
1327 /* Number of pending open parentheses/braces/brackets.  */
1328 static int open_pbb;
1329 
1330 static token_ty phase5_pushback[1];
1331 static int phase5_pushback_length;
1332 
1333 static void
phase5_get(token_ty * tp)1334 phase5_get (token_ty *tp)
1335 {
1336   int c;
1337 
1338   if (phase5_pushback_length)
1339     {
1340       *tp = phase5_pushback[--phase5_pushback_length];
1341       return;
1342     }
1343 
1344   for (;;)
1345     {
1346       tp->line_number = line_number;
1347       c = phase3_getc ();
1348 
1349       switch (c)
1350 	{
1351 	case UEOF:
1352 	  tp->type = token_type_eof;
1353 	  return;
1354 
1355 	case ' ':
1356 	case '\t':
1357 	case '\f':
1358 	  /* Ignore whitespace and comments.  */
1359 	  continue;
1360 
1361 	case '\n':
1362 	  if (last_non_comment_line > last_comment_line)
1363 	    savable_comment_reset ();
1364 	  /* Ignore newline if and only if it is used for implicit line
1365 	     joining.  */
1366 	  if (open_pbb > 0)
1367 	    continue;
1368 	  tp->type = token_type_other;
1369 	  return;
1370 	}
1371 
1372       last_non_comment_line = tp->line_number;
1373 
1374       switch (c)
1375 	{
1376 	case '.':
1377 	  {
1378 	    int c1 = phase3_getc ();
1379 	    phase3_ungetc (c1);
1380 	    if (!(c1 >= '0' && c1 <= '9'))
1381 	      {
1382 
1383 		tp->type = token_type_other;
1384 		return;
1385 	      }
1386 	  }
1387 	  /* FALLTHROUGH */
1388 	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1389 	case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1390 	case 'M': case 'N': case 'O': case 'P': case 'Q':
1391 	case 'S': case 'T':           case 'V': case 'W': case 'X':
1392 	case 'Y': case 'Z':
1393 	case '_':
1394 	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1395 	case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1396 	case 'm': case 'n': case 'o': case 'p': case 'q':
1397 	case 's': case 't':           case 'v': case 'w': case 'x':
1398 	case 'y': case 'z':
1399 	case '0': case '1': case '2': case '3': case '4':
1400 	case '5': case '6': case '7': case '8': case '9':
1401 	symbol:
1402 	  /* Symbol, or part of a number.  */
1403 	  {
1404 	    static char *buffer;
1405 	    static int bufmax;
1406 	    int bufpos;
1407 
1408 	    bufpos = 0;
1409 	    for (;;)
1410 	      {
1411 		if (bufpos >= bufmax)
1412 		  {
1413 		    bufmax = 2 * bufmax + 10;
1414 		    buffer = xrealloc (buffer, bufmax);
1415 		  }
1416 		buffer[bufpos++] = c;
1417 		c = phase3_getc ();
1418 		switch (c)
1419 		  {
1420 		  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1421 		  case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1422 		  case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1423 		  case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1424 		  case 'Y': case 'Z':
1425 		  case '_':
1426 		  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1427 		  case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1428 		  case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1429 		  case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1430 		  case 'y': case 'z':
1431 		  case '0': case '1': case '2': case '3': case '4':
1432 		  case '5': case '6': case '7': case '8': case '9':
1433 		    continue;
1434 		  default:
1435 		    phase3_ungetc (c);
1436 		    break;
1437 		  }
1438 		break;
1439 	      }
1440 	    if (bufpos >= bufmax)
1441 	      {
1442 		bufmax = 2 * bufmax + 10;
1443 		buffer = xrealloc (buffer, bufmax);
1444 	      }
1445 	    buffer[bufpos] = '\0';
1446 	    tp->string = xstrdup (buffer);
1447 	    tp->type = token_type_symbol;
1448 	    return;
1449 	  }
1450 
1451 	/* Strings.  */
1452 	  {
1453 	    struct mixed_string_buffer literal;
1454 	    int quote_char;
1455 	    bool interpret_ansic;
1456 	    bool interpret_unicode;
1457 	    bool triple;
1458 	    unsigned int backslash_counter;
1459 
1460 	    case 'R': case 'r':
1461 	      {
1462 		int c1 = phase2_getc ();
1463 		if (c1 == '"' || c1 == '\'')
1464 		  {
1465 		    quote_char = c1;
1466 		    interpret_ansic = false;
1467 		    interpret_unicode = false;
1468 		    goto string;
1469 		  }
1470 		phase2_ungetc (c1);
1471 		goto symbol;
1472 	      }
1473 
1474 	    case 'U': case 'u':
1475 	      {
1476 		int c1 = phase2_getc ();
1477 		if (c1 == '"' || c1 == '\'')
1478 		  {
1479 		    quote_char = c1;
1480 		    interpret_ansic = true;
1481 		    interpret_unicode = true;
1482 		    goto string;
1483 		  }
1484 		if (c1 == 'R' || c1 == 'r')
1485 		  {
1486 		    int c2 = phase2_getc ();
1487 		    if (c2 == '"' || c2 == '\'')
1488 		      {
1489 			quote_char = c2;
1490 			interpret_ansic = false;
1491 			interpret_unicode = true;
1492 			goto string;
1493 		      }
1494 		    phase2_ungetc (c2);
1495 		  }
1496 		phase2_ungetc (c1);
1497 		goto symbol;
1498 	      }
1499 
1500 	    case '"': case '\'':
1501 	      quote_char = c;
1502 	      interpret_ansic = true;
1503 	      interpret_unicode = false;
1504 	    string:
1505 	      triple = false;
1506 	      {
1507 		int c1 = phase2_getc ();
1508 		if (c1 == quote_char)
1509 		  {
1510 		    int c2 = phase2_getc ();
1511 		    if (c2 == quote_char)
1512 		      triple = true;
1513 		    else
1514 		      {
1515 			phase2_ungetc (c2);
1516 			phase2_ungetc (c1);
1517 		      }
1518 		  }
1519 		else
1520 		  phase2_ungetc (c1);
1521 	      }
1522 	      backslash_counter = 0;
1523 	      /* Start accumulating the string.  */
1524 	      init_mixed_string_buffer (&literal);
1525 	      for (;;)
1526 		{
1527 		  int uc = phase7_getuc (quote_char, triple, interpret_ansic,
1528 					 interpret_unicode, &backslash_counter);
1529 
1530 		  if (uc == P7_EOF || uc == P7_STRING_END)
1531 		    break;
1532 
1533 		  if (IS_UNICODE (uc))
1534 		    assert (UNICODE_VALUE (uc) >= 0
1535 			    && UNICODE_VALUE (uc) < 0x110000);
1536 
1537 		  mixed_string_buffer_append (&literal, uc);
1538 		}
1539 	      tp->string = xstrdup (mixed_string_buffer_result (&literal));
1540 	      free_mixed_string_buffer (&literal);
1541 	      tp->comment = add_reference (savable_comment);
1542 	      tp->type = token_type_string;
1543 	      return;
1544 	  }
1545 
1546 	case '(':
1547 	  open_pbb++;
1548 	  tp->type = token_type_lparen;
1549 	  return;
1550 
1551 	case ')':
1552 	  if (open_pbb > 0)
1553 	    open_pbb--;
1554 	  tp->type = token_type_rparen;
1555 	  return;
1556 
1557 	case ',':
1558 	  tp->type = token_type_comma;
1559 	  return;
1560 
1561 	case '[': case '{':
1562 	  open_pbb++;
1563 	  tp->type = token_type_other;
1564 	  return;
1565 
1566 	case ']': case '}':
1567 	  if (open_pbb > 0)
1568 	    open_pbb--;
1569 	  tp->type = token_type_other;
1570 	  return;
1571 
1572 	default:
1573 	  /* We could carefully recognize each of the 2 and 3 character
1574 	     operators, but it is not necessary, as we only need to recognize
1575 	     gettext invocations.  Don't bother.  */
1576 	  tp->type = token_type_other;
1577 	  return;
1578 	}
1579     }
1580 }
1581 
1582 /* Supports only one pushback token.  */
1583 static void
phase5_unget(token_ty * tp)1584 phase5_unget (token_ty *tp)
1585 {
1586   if (tp->type != token_type_eof)
1587     {
1588       if (phase5_pushback_length == SIZEOF (phase5_pushback))
1589 	abort ();
1590       phase5_pushback[phase5_pushback_length++] = *tp;
1591     }
1592 }
1593 
1594 
1595 /* Combine adjacent strings to form a single string.  Note that the end
1596    of a logical line appears as a token of its own, therefore strings that
1597    belong to different logical lines will not be concatenated.  */
1598 
1599 static void
x_python_lex(token_ty * tp)1600 x_python_lex (token_ty *tp)
1601 {
1602   phase5_get (tp);
1603   if (tp->type != token_type_string)
1604     return;
1605   for (;;)
1606     {
1607       token_ty tmp;
1608       size_t len;
1609 
1610       phase5_get (&tmp);
1611       if (tmp.type != token_type_string)
1612 	{
1613 	  phase5_unget (&tmp);
1614 	  return;
1615 	}
1616       len = strlen (tp->string);
1617       tp->string = xrealloc (tp->string, len + strlen (tmp.string) + 1);
1618       strcpy (tp->string + len, tmp.string);
1619       free (tmp.string);
1620     }
1621 }
1622 
1623 
1624 /* ========================= Extracting strings.  ========================== */
1625 
1626 
1627 /* Context lookup table.  */
1628 static flag_context_list_table_ty *flag_context_list_table;
1629 
1630 
1631 /* The file is broken into tokens.  Scan the token stream, looking for
1632    a keyword, followed by a left paren, followed by a string.  When we
1633    see this sequence, we have something to remember.  We assume we are
1634    looking at a valid Python program, and leave the complaints about
1635    the grammar to the Python interpreter.
1636 
1637      Normal handling: Look for
1638        keyword ( ... msgid ... )
1639      Plural handling: Look for
1640        keyword ( ... msgid ... msgid_plural ... )
1641 
1642    We use recursion because the arguments before msgid or between msgid
1643    and msgid_plural can contain subexpressions of the same form.  */
1644 
1645 
1646 /* Extract messages until the next balanced closing parenthesis.
1647    Extracted messages are added to MLP.
1648    Return true upon eof, false upon closing parenthesis.  */
1649 static bool
extract_parenthesized(message_list_ty * mlp,flag_context_ty outer_context,flag_context_list_iterator_ty context_iter,struct arglist_parser * argparser)1650 extract_parenthesized (message_list_ty *mlp,
1651 		       flag_context_ty outer_context,
1652 		       flag_context_list_iterator_ty context_iter,
1653 		       struct arglist_parser *argparser)
1654 {
1655   /* Current argument number.  */
1656   int arg = 1;
1657   /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
1658   int state;
1659   /* Parameters of the keyword just seen.  Defined only in state 1.  */
1660   const struct callshapes *next_shapes = NULL;
1661   /* Context iterator that will be used if the next token is a '('.  */
1662   flag_context_list_iterator_ty next_context_iter =
1663     passthrough_context_list_iterator;
1664   /* Current context.  */
1665   flag_context_ty inner_context =
1666     inherited_context (outer_context,
1667 		       flag_context_list_iterator_advance (&context_iter));
1668 
1669   /* Start state is 0.  */
1670   state = 0;
1671 
1672   for (;;)
1673     {
1674       token_ty token;
1675 
1676       x_python_lex (&token);
1677       switch (token.type)
1678 	{
1679 	case token_type_symbol:
1680 	  {
1681 	    void *keyword_value;
1682 
1683 	    if (hash_find_entry (&keywords, token.string, strlen (token.string),
1684 				 &keyword_value)
1685 		== 0)
1686 	      {
1687 		next_shapes = (const struct callshapes *) keyword_value;
1688 		state = 1;
1689 	      }
1690 	    else
1691 	      state = 0;
1692 	  }
1693 	  next_context_iter =
1694 	    flag_context_list_iterator (
1695 	      flag_context_list_table_lookup (
1696 		flag_context_list_table,
1697 		token.string, strlen (token.string)));
1698 	  free (token.string);
1699 	  continue;
1700 
1701 	case token_type_lparen:
1702 	  if (extract_parenthesized (mlp, inner_context, next_context_iter,
1703 				     arglist_parser_alloc (mlp,
1704 							   state ? next_shapes : NULL)))
1705 	    {
1706 	      xgettext_current_source_encoding = po_charset_utf8;
1707 	      arglist_parser_done (argparser, arg);
1708 	      xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1709 	      return true;
1710 	    }
1711 	  next_context_iter = null_context_list_iterator;
1712 	  state = 0;
1713 	  continue;
1714 
1715 	case token_type_rparen:
1716 	  xgettext_current_source_encoding = po_charset_utf8;
1717 	  arglist_parser_done (argparser, arg);
1718 	  xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1719 	  return false;
1720 
1721 	case token_type_comma:
1722 	  arg++;
1723 	  inner_context =
1724 	    inherited_context (outer_context,
1725 			       flag_context_list_iterator_advance (
1726 				 &context_iter));
1727 	  next_context_iter = passthrough_context_list_iterator;
1728 	  state = 0;
1729 	  continue;
1730 
1731 	case token_type_string:
1732 	  {
1733 	    lex_pos_ty pos;
1734 	    pos.file_name = logical_file_name;
1735 	    pos.line_number = token.line_number;
1736 
1737 	    xgettext_current_source_encoding = po_charset_utf8;
1738 	    if (extract_all)
1739 	      remember_a_message (mlp, NULL, token.string, inner_context,
1740 				  &pos, token.comment);
1741 	    else
1742 	      arglist_parser_remember (argparser, arg, token.string,
1743 				       inner_context,
1744 				       pos.file_name, pos.line_number,
1745 				       token.comment);
1746 	    xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1747 	  }
1748 	  drop_reference (token.comment);
1749 	  next_context_iter = null_context_list_iterator;
1750 	  state = 0;
1751 	  continue;
1752 
1753 	case token_type_eof:
1754 	  xgettext_current_source_encoding = po_charset_utf8;
1755 	  arglist_parser_done (argparser, arg);
1756 	  xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1757 	  return true;
1758 
1759 	case token_type_other:
1760 	  next_context_iter = null_context_list_iterator;
1761 	  state = 0;
1762 	  continue;
1763 
1764 	default:
1765 	  abort ();
1766 	}
1767     }
1768 }
1769 
1770 
1771 void
extract_python(FILE * f,const char * real_filename,const char * logical_filename,flag_context_list_table_ty * flag_table,msgdomain_list_ty * mdlp)1772 extract_python (FILE *f,
1773 		const char *real_filename, const char *logical_filename,
1774 		flag_context_list_table_ty *flag_table,
1775 		msgdomain_list_ty *mdlp)
1776 {
1777   message_list_ty *mlp = mdlp->item[0]->messages;
1778 
1779   fp = f;
1780   real_file_name = real_filename;
1781   logical_file_name = xstrdup (logical_filename);
1782   line_number = 1;
1783 
1784   last_comment_line = -1;
1785   last_non_comment_line = -1;
1786 
1787   xgettext_current_file_source_encoding = xgettext_global_source_encoding;
1788 #if HAVE_ICONV
1789   xgettext_current_file_source_iconv = xgettext_global_source_iconv;
1790 #endif
1791 
1792   xgettext_current_source_encoding = xgettext_current_file_source_encoding;
1793 #if HAVE_ICONV
1794   xgettext_current_source_iconv = xgettext_current_file_source_iconv;
1795 #endif
1796 
1797   continuation_or_nonblank_line = false;
1798 
1799   open_pbb = 0;
1800 
1801   flag_context_list_table = flag_table;
1802 
1803   init_keywords ();
1804 
1805   /* Eat tokens until eof is seen.  When extract_parenthesized returns
1806      due to an unbalanced closing parenthesis, just restart it.  */
1807   while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
1808 				 arglist_parser_alloc (mlp, NULL)))
1809     ;
1810 
1811   fp = NULL;
1812   real_file_name = NULL;
1813   logical_file_name = NULL;
1814   line_number = 0;
1815 }
1816