gettext-tools/src/x-python.c

*946379e7Schristos/* xgettext Python backend.
*946379e7Schristos   Copyright (C) 2002-2003, 2005-2006 Free Software Foundation, Inc.
*946379e7Schristos
*946379e7Schristos   This file was written by Bruno Haible <haible@clisp.cons.org>, 2002.
*946379e7Schristos
*946379e7Schristos   This program is free software; you can redistribute it and/or modify
*946379e7Schristos   it under the terms of the GNU General Public License as published by
*946379e7Schristos   the Free Software Foundation; either version 2, or (at your option)
*946379e7Schristos   any later version.
*946379e7Schristos
*946379e7Schristos   This program is distributed in the hope that it will be useful,
*946379e7Schristos   but WITHOUT ANY WARRANTY; without even the implied warranty of
*946379e7Schristos   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
*946379e7Schristos   GNU General Public License for more details.
*946379e7Schristos
*946379e7Schristos   You should have received a copy of the GNU General Public License
*946379e7Schristos   along with this program; if not, write to the Free Software Foundation,
*946379e7Schristos   Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
*946379e7Schristos
*946379e7Schristos#ifdef HAVE_CONFIG_H
*946379e7Schristos# include "config.h"
*946379e7Schristos#endif
*946379e7Schristos
*946379e7Schristos#include <assert.h>
*946379e7Schristos#include <errno.h>
*946379e7Schristos#include <stdbool.h>
*946379e7Schristos#include <stdio.h>
*946379e7Schristos#include <stdlib.h>
*946379e7Schristos#include <string.h>
*946379e7Schristos
*946379e7Schristos#include "message.h"
*946379e7Schristos#include "xgettext.h"
*946379e7Schristos#include "x-python.h"
*946379e7Schristos#include "error.h"
*946379e7Schristos#include "error-progname.h"
*946379e7Schristos#include "progname.h"
*946379e7Schristos#include "basename.h"
*946379e7Schristos#include "xerror.h"
*946379e7Schristos#include "xvasprintf.h"
*946379e7Schristos#include "xalloc.h"
*946379e7Schristos#include "exit.h"
*946379e7Schristos#include "c-strstr.h"
*946379e7Schristos#include "c-ctype.h"
*946379e7Schristos#include "po-charset.h"
*946379e7Schristos#include "uniname.h"
*946379e7Schristos#include "utf16-ucs4.h"
*946379e7Schristos#include "utf8-ucs4.h"
*946379e7Schristos#include "ucs4-utf8.h"
*946379e7Schristos#include "gettext.h"
*946379e7Schristos
*946379e7Schristos#define _(s) gettext(s)
*946379e7Schristos
*946379e7Schristos#define max(a,b) ((a) > (b) ? (a) : (b))
*946379e7Schristos
*946379e7Schristos#define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
*946379e7Schristos
*946379e7Schristos
*946379e7Schristos/* The Python syntax is defined in the Python Reference Manual
*946379e7Schristos   /usr/share/doc/packages/python/html/ref/index.html.
*946379e7Schristos   See also Python-2.0/Parser/tokenizer.c, Python-2.0/Python/compile.c,
*946379e7Schristos   Python-2.0/Objects/unicodeobject.c.  */
*946379e7Schristos
*946379e7Schristos
*946379e7Schristos/* ====================== Keyword set customization.  ====================== */
*946379e7Schristos
*946379e7Schristos/* If true extract all strings.  */
*946379e7Schristosstatic bool extract_all = false;
*946379e7Schristos
*946379e7Schristosstatic hash_table keywords;
*946379e7Schristosstatic bool default_keywords = true;
*946379e7Schristos
*946379e7Schristos
*946379e7Schristosvoid
*946379e7Schristosx_python_extract_all ()
*946379e7Schristos{
*946379e7Schristos  extract_all = true;
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos
*946379e7Schristosvoid
*946379e7Schristosx_python_keyword (const char *name)
*946379e7Schristos{
*946379e7Schristos  if (name == NULL)
*946379e7Schristos    default_keywords = false;
*946379e7Schristos  else
*946379e7Schristos    {
*946379e7Schristos      const char *end;
*946379e7Schristos      struct callshape shape;
*946379e7Schristos      const char *colon;
*946379e7Schristos
*946379e7Schristos      if (keywords.table == NULL)
*946379e7Schristos	hash_init (&keywords, 100);
*946379e7Schristos
*946379e7Schristos      split_keywordspec (name, &end, &shape);
*946379e7Schristos
*946379e7Schristos      /* The characters between name and end should form a valid C identifier.
*946379e7Schristos	 A colon means an invalid parse in split_keywordspec().  */
*946379e7Schristos      colon = strchr (name, ':');
*946379e7Schristos      if (colon == NULL || colon >= end)
*946379e7Schristos	insert_keyword_callshape (&keywords, name, end - name, &shape);
*946379e7Schristos    }
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos/* Finish initializing the keywords hash table.
*946379e7Schristos   Called after argument processing, before each file is processed.  */
*946379e7Schristosstatic void
*946379e7Schristosinit_keywords ()
*946379e7Schristos{
*946379e7Schristos  if (default_keywords)
*946379e7Schristos    {
*946379e7Schristos      /* When adding new keywords here, also update the documentation in
*946379e7Schristos	 xgettext.texi!  */
*946379e7Schristos      x_python_keyword ("gettext");
*946379e7Schristos      x_python_keyword ("ugettext");
*946379e7Schristos      x_python_keyword ("dgettext:2");
*946379e7Schristos      x_python_keyword ("ngettext:1,2");
*946379e7Schristos      x_python_keyword ("ungettext:1,2");
*946379e7Schristos      x_python_keyword ("dngettext:2,3");
*946379e7Schristos      x_python_keyword ("_");
*946379e7Schristos      default_keywords = false;
*946379e7Schristos    }
*946379e7Schristos}
*946379e7Schristos
/* Record the pass-python-format flag for the string arguments of the
   built-in keywords.  */
void
init_flag_table_python ()
{
  /* "%:1:python-format" is deliberately absent: % is an infix operator!  */
  static const char *const flag_specs[] =
    {
      "gettext:1:pass-python-format",
      "ugettext:1:pass-python-format",
      "dgettext:2:pass-python-format",
      "ngettext:1:pass-python-format",
      "ngettext:2:pass-python-format",
      "ungettext:1:pass-python-format",
      "ungettext:2:pass-python-format",
      "dngettext:2:pass-python-format",
      "dngettext:3:pass-python-format",
      "_:1:pass-python-format"
    };
  size_t i;

  for (i = 0; i < SIZEOF (flag_specs); i++)
    xgettext_record_flag (flag_specs[i]);
}
*946379e7Schristos
*946379e7Schristos
*946379e7Schristos/* ======================== Reading of characters.  ======================== */
*946379e7Schristos
/* Real filename, used in error messages about the input file.  */
static const char *real_file_name;

/* Logical filename and line number, used to label the extracted messages.  */
static char *logical_file_name;
static int line_number;

/* The input file stream.  */
static FILE *fp;


/* Phase 1: line_number handling.  */

/* Capacity of the byte pushback stack; roughly a safer MB_LEN_MAX.  */
#define MAX_PHASE1_PUSHBACK 16
static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
static int phase1_pushback_length;

/* Read the next single byte from the input file.
   Pushed-back bytes are delivered first, most recently pushed first.
   Returns EOF at end of input; a read error terminates the program.
   line_number is incremented for every newline that is returned.  */
static int
phase1_getc ()
{
  int byte;

  if (phase1_pushback_length > 0)
    byte = phase1_pushback[--phase1_pushback_length];
  else if ((byte = getc (fp)) == EOF)
    {
      if (ferror (fp))
        error (EXIT_FAILURE, errno, _("error while reading \"%s\""),
               real_file_name);
      return EOF;
    }

  if (byte == '\n')
    line_number++;

  return byte;
}
*946379e7Schristos
*946379e7Schristos/* Supports MAX_PHASE1_PUSHBACK characters of pushback.  */
*946379e7Schristosstatic void
*946379e7Schristosphase1_ungetc (int c)
*946379e7Schristos{
*946379e7Schristos  if (c != EOF)
*946379e7Schristos    {
*946379e7Schristos      if (c == '\n')
*946379e7Schristos	--line_number;
*946379e7Schristos
*946379e7Schristos      if (phase1_pushback_length == SIZEOF (phase1_pushback))
*946379e7Schristos	abort ();
*946379e7Schristos      phase1_pushback[phase1_pushback_length++] = c;
*946379e7Schristos    }
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos
*946379e7Schristos/* Phase 2: Conversion to Unicode.
*946379e7Schristos   This is done early because PEP 0263 specifies that conversion to Unicode
*946379e7Schristos   conceptually occurs before tokenization.  A test case where it matters
*946379e7Schristos   is with encodings like BIG5: when a double-byte character ending in 0x5C
*946379e7Schristos   is followed by '\' or 'u0021', the tokenizer must not treat the second
*946379e7Schristos   half of the double-byte character as a backslash.  */
*946379e7Schristos
*946379e7Schristos/* End-of-file indicator for functions returning an UCS-4 character.  */
*946379e7Schristos#define UEOF -1
*946379e7Schristos
*946379e7Schristosstatic int phase2_pushback[max (9, UNINAME_MAX + 3)];
*946379e7Schristosstatic int phase2_pushback_length;
*946379e7Schristos
*946379e7Schristos/* Read the next Unicode UCS-4 character from the input file.  */
*946379e7Schristosstatic int
*946379e7Schristosphase2_getc ()
*946379e7Schristos{
*946379e7Schristos  if (phase2_pushback_length)
*946379e7Schristos    return phase2_pushback[--phase2_pushback_length];
*946379e7Schristos
*946379e7Schristos  if (xgettext_current_source_encoding == po_charset_ascii)
*946379e7Schristos    {
*946379e7Schristos      int c = phase1_getc ();
*946379e7Schristos      if (c == EOF)
*946379e7Schristos	return UEOF;
*946379e7Schristos      if (!c_isascii (c))
*946379e7Schristos	{
*946379e7Schristos	  char buffer[21];
*946379e7Schristos	  sprintf (buffer, ":%ld", (long) line_number);
*946379e7Schristos	  multiline_error (xstrdup (""),
*946379e7Schristos			   xasprintf (_("\
*946379e7SchristosNon-ASCII string at %s%s.\n\
*946379e7SchristosPlease specify the source encoding through --from-code or through a comment\n\
*946379e7Schristosas specified in http://www.python.org/peps/pep-0263.html.\n"),
*946379e7Schristos			   real_file_name, buffer));
*946379e7Schristos	  exit (EXIT_FAILURE);
*946379e7Schristos	}
*946379e7Schristos      return c;
*946379e7Schristos    }
*946379e7Schristos  else if (xgettext_current_source_encoding != po_charset_utf8)
*946379e7Schristos    {
*946379e7Schristos#if HAVE_ICONV
*946379e7Schristos      /* Use iconv on an increasing number of bytes.  Read only as many bytes
*946379e7Schristos	 through phase1_getc as needed.  This is needed to give reasonable
*946379e7Schristos	 interactive behaviour when fp is connected to an interactive tty.  */
*946379e7Schristos      unsigned char buf[MAX_PHASE1_PUSHBACK];
*946379e7Schristos      size_t bufcount;
*946379e7Schristos      int c = phase1_getc ();
*946379e7Schristos      if (c == EOF)
*946379e7Schristos	return UEOF;
*946379e7Schristos      buf[0] = (unsigned char) c;
*946379e7Schristos      bufcount = 1;
*946379e7Schristos
*946379e7Schristos      for (;;)
*946379e7Schristos	{
*946379e7Schristos	  unsigned char scratchbuf[6];
*946379e7Schristos	  const char *inptr = (const char *) &buf[0];
*946379e7Schristos	  size_t insize = bufcount;
*946379e7Schristos	  char *outptr = (char *) &scratchbuf[0];
*946379e7Schristos	  size_t outsize = sizeof (scratchbuf);
*946379e7Schristos
*946379e7Schristos	  size_t res = iconv (xgettext_current_source_iconv,
*946379e7Schristos			      (ICONV_CONST char **) &inptr, &insize,
*946379e7Schristos			      &outptr, &outsize);
*946379e7Schristos	  /* We expect that a character has been produced if and only if
*946379e7Schristos	     some input bytes have been consumed.  */
*946379e7Schristos	  if ((insize < bufcount) != (outsize < sizeof (scratchbuf)))
*946379e7Schristos	    abort ();
*946379e7Schristos	  if (outsize == sizeof (scratchbuf))
*946379e7Schristos	    {
*946379e7Schristos	      /* No character has been produced.  Must be an error.  */
*946379e7Schristos	      if (res != (size_t)(-1))
*946379e7Schristos		abort ();
*946379e7Schristos
*946379e7Schristos	      if (errno == EILSEQ)
*946379e7Schristos		{
*946379e7Schristos		  /* An invalid multibyte sequence was encountered.  */
*946379e7Schristos		  multiline_error (xstrdup (""),
*946379e7Schristos				   xasprintf (_("\
*946379e7Schristos%s:%d: Invalid multibyte sequence.\n\
*946379e7SchristosPlease specify the correct source encoding through --from-code or through a\n\
*946379e7Schristoscomment as specified in http://www.python.org/peps/pep-0263.html.\n"),
*946379e7Schristos				   real_file_name, line_number));
*946379e7Schristos		  exit (EXIT_FAILURE);
*946379e7Schristos		}
*946379e7Schristos	      else if (errno == EINVAL)
*946379e7Schristos		{
*946379e7Schristos		  /* An incomplete multibyte character.  */
*946379e7Schristos		  int c;
*946379e7Schristos
*946379e7Schristos		  if (bufcount == MAX_PHASE1_PUSHBACK)
*946379e7Schristos		    {
*946379e7Schristos		      /* An overlong incomplete multibyte sequence was
*946379e7Schristos			 encountered.  */
*946379e7Schristos		      multiline_error (xstrdup (""),
*946379e7Schristos				       xasprintf (_("\
*946379e7Schristos%s:%d: Long incomplete multibyte sequence.\n\
*946379e7SchristosPlease specify the correct source encoding through --from-code or through a\n\
*946379e7Schristoscomment as specified in http://www.python.org/peps/pep-0263.html.\n"),
*946379e7Schristos				       real_file_name, line_number));
*946379e7Schristos		      exit (EXIT_FAILURE);
*946379e7Schristos		    }
*946379e7Schristos
*946379e7Schristos		  /* Read one more byte and retry iconv.  */
*946379e7Schristos		  c = phase1_getc ();
*946379e7Schristos		  if (c == EOF)
*946379e7Schristos		    {
*946379e7Schristos		      multiline_error (xstrdup (""),
*946379e7Schristos				       xasprintf (_("\
*946379e7Schristos%s:%d: Incomplete multibyte sequence at end of file.\n\
*946379e7SchristosPlease specify the correct source encoding through --from-code or through a\n\
*946379e7Schristoscomment as specified in http://www.python.org/peps/pep-0263.html.\n"),
*946379e7Schristos				       real_file_name, line_number));
*946379e7Schristos		      exit (EXIT_FAILURE);
*946379e7Schristos		    }
*946379e7Schristos		  if (c == '\n')
*946379e7Schristos		    {
*946379e7Schristos		      multiline_error (xstrdup (""),
*946379e7Schristos				       xasprintf (_("\
*946379e7Schristos%s:%d: Incomplete multibyte sequence at end of line.\n\
*946379e7SchristosPlease specify the correct source encoding through --from-code or through a\n\
*946379e7Schristoscomment as specified in http://www.python.org/peps/pep-0263.html.\n"),
*946379e7Schristos				       real_file_name, line_number - 1));
*946379e7Schristos		      exit (EXIT_FAILURE);
*946379e7Schristos		    }
*946379e7Schristos		  buf[bufcount++] = (unsigned char) c;
*946379e7Schristos		}
*946379e7Schristos	      else
*946379e7Schristos		error (EXIT_FAILURE, errno, _("%s:%d: iconv failure"),
*946379e7Schristos		       real_file_name, line_number);
*946379e7Schristos	    }
*946379e7Schristos	  else
*946379e7Schristos	    {
*946379e7Schristos	      size_t outbytes = sizeof (scratchbuf) - outsize;
*946379e7Schristos	      size_t bytes = bufcount - insize;
*946379e7Schristos	      unsigned int uc;
*946379e7Schristos
*946379e7Schristos	      /* We expect that one character has been produced.  */
*946379e7Schristos	      if (bytes == 0)
*946379e7Schristos		abort ();
*946379e7Schristos	      if (outbytes == 0)
*946379e7Schristos		abort ();
*946379e7Schristos	      /* Push back the unused bytes.  */
*946379e7Schristos	      while (insize > 0)
*946379e7Schristos		phase1_ungetc (buf[--insize]);
*946379e7Schristos	      /* Convert the character from UTF-8 to UCS-4.  */
*946379e7Schristos	      if (u8_mbtouc (&uc, scratchbuf, outbytes) < outbytes)
*946379e7Schristos		{
*946379e7Schristos		  /* scratchbuf contains an out-of-range Unicode character
*946379e7Schristos		     (> 0x10ffff).  */
*946379e7Schristos		  multiline_error (xstrdup (""),
*946379e7Schristos				   xasprintf (_("\
*946379e7Schristos%s:%d: Invalid multibyte sequence.\n\
*946379e7SchristosPlease specify the source encoding through --from-code or through a comment\n\
*946379e7Schristosas specified in http://www.python.org/peps/pep-0263.html.\n"),
*946379e7Schristos				   real_file_name, line_number));
*946379e7Schristos		  exit (EXIT_FAILURE);
*946379e7Schristos		}
*946379e7Schristos	      return uc;
*946379e7Schristos	    }
*946379e7Schristos	}
*946379e7Schristos#else
*946379e7Schristos      /* If we don't have iconv(), the only supported values for
*946379e7Schristos	 xgettext_global_source_encoding and thus also for
*946379e7Schristos	 xgettext_current_source_encoding are ASCII and UTF-8.  */
*946379e7Schristos      abort ();
*946379e7Schristos#endif
*946379e7Schristos    }
*946379e7Schristos  else
*946379e7Schristos    {
*946379e7Schristos      /* Read an UTF-8 encoded character.  */
*946379e7Schristos      unsigned char buf[6];
*946379e7Schristos      unsigned int count;
*946379e7Schristos      int c;
*946379e7Schristos      unsigned int uc;
*946379e7Schristos
*946379e7Schristos      c = phase1_getc ();
*946379e7Schristos      if (c == EOF)
*946379e7Schristos	return UEOF;
*946379e7Schristos      buf[0] = c;
*946379e7Schristos      count = 1;
*946379e7Schristos
*946379e7Schristos      if (buf[0] >= 0xc0)
*946379e7Schristos	{
*946379e7Schristos	  c = phase1_getc ();
*946379e7Schristos	  if (c == EOF)
*946379e7Schristos	    return UEOF;
*946379e7Schristos	  buf[1] = c;
*946379e7Schristos	  count = 2;
*946379e7Schristos	}
*946379e7Schristos
*946379e7Schristos      if (buf[0] >= 0xe0
*946379e7Schristos	  && ((buf[1] ^ 0x80) < 0x40))
*946379e7Schristos	{
*946379e7Schristos	  c = phase1_getc ();
*946379e7Schristos	  if (c == EOF)
*946379e7Schristos	    return UEOF;
*946379e7Schristos	  buf[2] = c;
*946379e7Schristos	  count = 3;
*946379e7Schristos	}
*946379e7Schristos
*946379e7Schristos      if (buf[0] >= 0xf0
*946379e7Schristos	  && ((buf[1] ^ 0x80) < 0x40)
*946379e7Schristos	  && ((buf[2] ^ 0x80) < 0x40))
*946379e7Schristos	{
*946379e7Schristos	  c = phase1_getc ();
*946379e7Schristos	  if (c == EOF)
*946379e7Schristos	    return UEOF;
*946379e7Schristos	  buf[3] = c;
*946379e7Schristos	  count = 4;
*946379e7Schristos	}
*946379e7Schristos
*946379e7Schristos      if (buf[0] >= 0xf8
*946379e7Schristos	  && ((buf[1] ^ 0x80) < 0x40)
*946379e7Schristos	  && ((buf[2] ^ 0x80) < 0x40)
*946379e7Schristos	  && ((buf[3] ^ 0x80) < 0x40))
*946379e7Schristos	{
*946379e7Schristos	  c = phase1_getc ();
*946379e7Schristos	  if (c == EOF)
*946379e7Schristos	    return UEOF;
*946379e7Schristos	  buf[4] = c;
*946379e7Schristos	  count = 5;
*946379e7Schristos	}
*946379e7Schristos
*946379e7Schristos      if (buf[0] >= 0xfc
*946379e7Schristos	  && ((buf[1] ^ 0x80) < 0x40)
*946379e7Schristos	  && ((buf[2] ^ 0x80) < 0x40)
*946379e7Schristos	  && ((buf[3] ^ 0x80) < 0x40)
*946379e7Schristos	  && ((buf[4] ^ 0x80) < 0x40))
*946379e7Schristos	{
*946379e7Schristos	  c = phase1_getc ();
*946379e7Schristos	  if (c == EOF)
*946379e7Schristos	    return UEOF;
*946379e7Schristos	  buf[5] = c;
*946379e7Schristos	  count = 6;
*946379e7Schristos	}
*946379e7Schristos
*946379e7Schristos      u8_mbtouc (&uc, buf, count);
*946379e7Schristos      return uc;
*946379e7Schristos    }
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos/* Supports max (9, UNINAME_MAX + 3) pushback characters.  */
*946379e7Schristosstatic void
*946379e7Schristosphase2_ungetc (int c)
*946379e7Schristos{
*946379e7Schristos  if (c != UEOF)
*946379e7Schristos    {
*946379e7Schristos      if (phase2_pushback_length == SIZEOF (phase2_pushback))
*946379e7Schristos	abort ();
*946379e7Schristos      phase2_pushback[phase2_pushback_length++] = c;
*946379e7Schristos    }
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos
*946379e7Schristos/* ========================= Accumulating strings.  ======================== */
*946379e7Schristos
/* A string buffer type that allows appending Unicode characters.
   Returns the entire string in UTF-8 encoding.  */

struct unicode_string_buffer
{
  /* The part of the string that has already been converted to UTF-8:
     the storage, the number of bytes in use, and the number of bytes
     allocated.  */
  char *utf8_buffer;
  size_t utf8_buflen;
  size_t utf8_allocated;
};

/* Initialize a 'struct unicode_string_buffer' to empty: no storage, no
   contents.  */
static inline void
init_unicode_string_buffer (struct unicode_string_buffer *bp)
{
  bp->utf8_allocated = 0;
  bp->utf8_buflen = 0;
  bp->utf8_buffer = NULL;
}
*946379e7Schristos
*946379e7Schristos/* Auxiliary function: Ensure count more bytes are available in bp->utf8.  */
*946379e7Schristosstatic inline void
*946379e7Schristosunicode_string_buffer_append_unicode_grow (struct unicode_string_buffer *bp,
*946379e7Schristos					   size_t count)
*946379e7Schristos{
*946379e7Schristos  if (bp->utf8_buflen + count > bp->utf8_allocated)
*946379e7Schristos    {
*946379e7Schristos      size_t new_allocated = 2 * bp->utf8_allocated + 10;
*946379e7Schristos      if (new_allocated < bp->utf8_buflen + count)
*946379e7Schristos	new_allocated = bp->utf8_buflen + count;
*946379e7Schristos      bp->utf8_allocated = new_allocated;
*946379e7Schristos      bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
*946379e7Schristos    }
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos/* Auxiliary function: Append a Unicode character to bp->utf8.
*946379e7Schristos   uc must be < 0x110000.  */
*946379e7Schristosstatic inline void
*946379e7Schristosunicode_string_buffer_append_unicode (struct unicode_string_buffer *bp,
*946379e7Schristos				      unsigned int uc)
*946379e7Schristos{
*946379e7Schristos  unsigned char utf8buf[6];
*946379e7Schristos  int count = u8_uctomb (utf8buf, uc, 6);
*946379e7Schristos
*946379e7Schristos  if (count < 0)
*946379e7Schristos    /* The caller should have ensured that uc is not out-of-range.  */
*946379e7Schristos    abort ();
*946379e7Schristos
*946379e7Schristos  unicode_string_buffer_append_unicode_grow (bp, count);
*946379e7Schristos  memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
*946379e7Schristos  bp->utf8_buflen += count;
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos/* Return the string buffer's contents.  */
*946379e7Schristosstatic char *
*946379e7Schristosunicode_string_buffer_result (struct unicode_string_buffer *bp)
*946379e7Schristos{
*946379e7Schristos  /* NUL-terminate it.  */
*946379e7Schristos  unicode_string_buffer_append_unicode_grow (bp, 1);
*946379e7Schristos  bp->utf8_buffer[bp->utf8_buflen] = '\0';
*946379e7Schristos  /* Return it.  */
*946379e7Schristos  return bp->utf8_buffer;
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos/* Free the memory pointed to by a 'struct unicode_string_buffer'.  */
*946379e7Schristosstatic inline void
*946379e7Schristosfree_unicode_string_buffer (struct unicode_string_buffer *bp)
*946379e7Schristos{
*946379e7Schristos  free (bp->utf8_buffer);
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos
*946379e7Schristos/* ======================== Accumulating comments.  ======================== */
*946379e7Schristos
*946379e7Schristos
*946379e7Schristos/* Accumulating a single comment line.  */
*946379e7Schristos
*946379e7Schristosstatic struct unicode_string_buffer comment_buffer;
*946379e7Schristos
*946379e7Schristosstatic inline void
*946379e7Schristoscomment_start ()
*946379e7Schristos{
*946379e7Schristos  comment_buffer.utf8_buflen = 0;
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristosstatic inline bool
*946379e7Schristoscomment_at_start ()
*946379e7Schristos{
*946379e7Schristos  return (comment_buffer.utf8_buflen == 0);
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristosstatic inline void
*946379e7Schristoscomment_add (int c)
*946379e7Schristos{
*946379e7Schristos  unicode_string_buffer_append_unicode (&comment_buffer, c);
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristosstatic inline const char *
*946379e7Schristoscomment_line_end ()
*946379e7Schristos{
*946379e7Schristos  char *buffer = unicode_string_buffer_result (&comment_buffer);
*946379e7Schristos  size_t buflen = strlen (buffer);
*946379e7Schristos
*946379e7Schristos  while (buflen >= 1
*946379e7Schristos	 && (buffer[buflen - 1] == ' ' || buffer[buflen - 1] == '\t'))
*946379e7Schristos    --buflen;
*946379e7Schristos  buffer[buflen] = '\0';
*946379e7Schristos  savable_comment_add (buffer);
*946379e7Schristos  return buffer;
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos
/* These are for tracking whether comments count as immediately before
   keyword.
   last_comment_line receives the current line number whenever phase 3
   starts reading a '#' comment.
   NOTE(review): last_non_comment_line is not assigned in this part of the
   file; presumably it is maintained by the tokenizer - confirm.  */
static int last_comment_line;
static int last_non_comment_line;
*946379e7Schristos
*946379e7Schristos
*946379e7Schristos/* ======================== Recognizing comments.  ======================== */
*946379e7Schristos
*946379e7Schristos
/* Recognizing the "coding" comment.
   As specified in PEP 0263, it takes the form
     "coding" [":"|"="] {" " or TAB}* {alphanumeric or "-" or "_" or "."}+
   and is located in a comment in a line that
     - is either the first or second line,
     - is not a continuation line,
     - contains no other tokens except this comment.  */
*946379e7Schristos
*946379e7Schristos/* Canonicalized encoding name for the current input file.  */
*946379e7Schristosstatic const char *xgettext_current_file_source_encoding;
*946379e7Schristos
*946379e7Schristos#if HAVE_ICONV
*946379e7Schristos/* Converter from xgettext_current_file_source_encoding to UTF-8 (except from
*946379e7Schristos   ASCII or UTF-8, when this conversion is a no-op).  */
*946379e7Schristosstatic iconv_t xgettext_current_file_source_iconv;
*946379e7Schristos#endif
*946379e7Schristos
*946379e7Schristosstatic inline void
*946379e7Schristosset_current_file_source_encoding (const char *canon_encoding)
*946379e7Schristos{
*946379e7Schristos  xgettext_current_file_source_encoding = canon_encoding;
*946379e7Schristos
*946379e7Schristos  if (xgettext_current_file_source_encoding != po_charset_ascii
*946379e7Schristos      && xgettext_current_file_source_encoding != po_charset_utf8)
*946379e7Schristos    {
*946379e7Schristos#if HAVE_ICONV
*946379e7Schristos      iconv_t cd;
*946379e7Schristos
*946379e7Schristos      /* Avoid glibc-2.1 bug with EUC-KR.  */
*946379e7Schristos# if (__GLIBC__ - 0 == 2 && __GLIBC_MINOR__ - 0 <= 1) && !defined _LIBICONV_VERSION
*946379e7Schristos      if (strcmp (xgettext_current_file_source_encoding, "EUC-KR") == 0)
*946379e7Schristos	cd = (iconv_t)(-1);
*946379e7Schristos      else
*946379e7Schristos# endif
*946379e7Schristos      cd = iconv_open (po_charset_utf8, xgettext_current_file_source_encoding);
*946379e7Schristos      if (cd == (iconv_t)(-1))
*946379e7Schristos	error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\
*946379e7SchristosCannot convert from \"%s\" to \"%s\". %s relies on iconv(), \
*946379e7Schristosand iconv() does not support this conversion."),
*946379e7Schristos	       xgettext_current_file_source_encoding, po_charset_utf8,
*946379e7Schristos	       basename (program_name));
*946379e7Schristos      xgettext_current_file_source_iconv = cd;
*946379e7Schristos#else
*946379e7Schristos      error_at_line (EXIT_FAILURE, 0, logical_file_name, line_number - 1, _("\
*946379e7SchristosCannot convert from \"%s\" to \"%s\". %s relies on iconv(). \
*946379e7SchristosThis version was built without iconv()."),
*946379e7Schristos	     xgettext_global_source_encoding, po_charset_utf8,
*946379e7Schristos	     basename (program_name));
*946379e7Schristos#endif
*946379e7Schristos    }
*946379e7Schristos
*946379e7Schristos  xgettext_current_source_encoding = xgettext_current_file_source_encoding;
*946379e7Schristos#if HAVE_ICONV
*946379e7Schristos  xgettext_current_source_iconv = xgettext_current_file_source_iconv;
*946379e7Schristos#endif
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristosstatic inline void
*946379e7Schristostry_to_extract_coding (const char *comment)
*946379e7Schristos{
*946379e7Schristos  const char *p = c_strstr (comment, "coding");
*946379e7Schristos
*946379e7Schristos  if (p != NULL)
*946379e7Schristos    {
*946379e7Schristos      p += 6;
*946379e7Schristos      if (*p == ':' || *p == '=')
*946379e7Schristos	{
*946379e7Schristos	  p++;
*946379e7Schristos	  while (*p == ' ' || *p == '\t')
*946379e7Schristos	    p++;
*946379e7Schristos	  {
*946379e7Schristos	    const char *encoding_start = p;
*946379e7Schristos
*946379e7Schristos	    while (c_isalnum (*p) || *p == '-' || *p == '_' || *p == '.')
*946379e7Schristos	      p++;
*946379e7Schristos	    {
*946379e7Schristos	      const char *encoding_end = p;
*946379e7Schristos
*946379e7Schristos	      if (encoding_end > encoding_start)
*946379e7Schristos		{
*946379e7Schristos		  /* Extract the encoding string.  */
*946379e7Schristos		  size_t encoding_len = encoding_end - encoding_start;
*946379e7Schristos		  char *encoding = (char *) xmalloc (encoding_len + 1);
*946379e7Schristos
*946379e7Schristos		  memcpy (encoding, encoding_start, encoding_len);
*946379e7Schristos		  encoding[encoding_len] = '\0';
*946379e7Schristos
*946379e7Schristos		  {
*946379e7Schristos		    /* Canonicalize it.  */
*946379e7Schristos		    const char *canon_encoding = po_charset_canonicalize (encoding);
*946379e7Schristos		    if (canon_encoding == NULL)
*946379e7Schristos		      {
*946379e7Schristos			error_at_line (0, 0,
*946379e7Schristos				       logical_file_name, line_number - 1, _("\
*946379e7SchristosUnknown encoding \"%s\". Proceeding with ASCII instead."),
*946379e7Schristos				       encoding);
*946379e7Schristos		        canon_encoding = po_charset_ascii;
*946379e7Schristos		      }
*946379e7Schristos
*946379e7Schristos		    /* Activate it.  */
*946379e7Schristos		    set_current_file_source_encoding (canon_encoding);
*946379e7Schristos		  }
*946379e7Schristos
*946379e7Schristos		  free (encoding);
*946379e7Schristos		}
*946379e7Schristos	    }
*946379e7Schristos	  }
*946379e7Schristos	}
*946379e7Schristos    }
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos/* Tracking whether the current line is a continuation line or contains a
*946379e7Schristos   non-blank character.  */
*946379e7Schristosstatic bool continuation_or_nonblank_line = false;
*946379e7Schristos
*946379e7Schristos
*946379e7Schristos/* Phase 3: Outside strings, replace backslash-newline with nothing and a
*946379e7Schristos   comment with nothing.  */
*946379e7Schristos
*946379e7Schristosstatic int
*946379e7Schristosphase3_getc ()
*946379e7Schristos{
*946379e7Schristos  int c;
*946379e7Schristos
*946379e7Schristos  for (;;)
*946379e7Schristos    {
*946379e7Schristos      c = phase2_getc ();
*946379e7Schristos      if (c == '\\')
*946379e7Schristos	{
*946379e7Schristos	  c = phase2_getc ();
*946379e7Schristos	  if (c != '\n')
*946379e7Schristos	    {
*946379e7Schristos	      phase2_ungetc (c);
*946379e7Schristos	      /* This shouldn't happen usually, because "A backslash is
*946379e7Schristos		 illegal elsewhere on a line outside a string literal."  */
*946379e7Schristos	      return '\\';
*946379e7Schristos	    }
*946379e7Schristos	  /* Eat backslash-newline.  */
*946379e7Schristos	  continuation_or_nonblank_line = true;
*946379e7Schristos	}
*946379e7Schristos      else if (c == '#')
*946379e7Schristos	{
*946379e7Schristos	  /* Eat a comment.  */
*946379e7Schristos	  const char *comment;
*946379e7Schristos
*946379e7Schristos	  last_comment_line = line_number;
*946379e7Schristos	  comment_start ();
*946379e7Schristos	  for (;;)
*946379e7Schristos	    {
*946379e7Schristos	      c = phase2_getc ();
*946379e7Schristos	      if (c == UEOF || c == '\n')
*946379e7Schristos		break;
*946379e7Schristos	      /* We skip all leading white space, but not EOLs.  */
*946379e7Schristos	      if (!(comment_at_start () && (c == ' ' || c == '\t')))
*946379e7Schristos		comment_add (c);
*946379e7Schristos	    }
*946379e7Schristos	  comment = comment_line_end ();
*946379e7Schristos	  if (line_number - 1 <= 2 && !continuation_or_nonblank_line)
*946379e7Schristos	    try_to_extract_coding (comment);
*946379e7Schristos	  continuation_or_nonblank_line = false;
*946379e7Schristos	  return c;
*946379e7Schristos	}
*946379e7Schristos      else
*946379e7Schristos	{
*946379e7Schristos	  if (c == '\n')
*946379e7Schristos	    continuation_or_nonblank_line = false;
*946379e7Schristos	  else if (!(c == ' ' || c == '\t' || c == '\f'))
*946379e7Schristos	    continuation_or_nonblank_line = true;
*946379e7Schristos	  return c;
*946379e7Schristos	}
*946379e7Schristos    }
*946379e7Schristos}
*946379e7Schristos
/* Hand C back to phase 2, from which phase3_getc reads.
   Callers may rely on one character of pushback only.  */
static void
phase3_ungetc (int c)
{
  phase2_ungetc (c);
}
*946379e7Schristos
*946379e7Schristos
*946379e7Schristos/* ========================= Accumulating strings.  ======================== */
*946379e7Schristos
/* Out-of-band return values of phase7_getuc.  */
#define P7_EOF (-1)		/* the input ended */
#define P7_STRING_END (-2)	/* the string literal ended */

/* phase7_getuc returns single bytes as values below 0x100.  To keep UTF-16
   or UTF-32 code points apart from them, a code point is returned shifted
   up by 0x100.  */
#define UNICODE(cp) ((cp) + 0x100)

/* Tell whether a return value of phase7_getuc stands for an UTF-16 or
   UTF-32 code point rather than for a single byte or an out-of-band value.  */
#define IS_UNICODE(result) (0x100 <= (result))

/* Recover the UTF-16 or UTF-32 code from a return value for which
   IS_UNICODE holds.  */
#define UNICODE_VALUE(result) ((result) - 0x100)
*946379e7Schristos
/* Accumulator for a string whose pieces arrive either as bytes in the
   xgettext_current_source_encoding or as Unicode characters.  The complete
   string is delivered in UTF-8 encoding.  */

struct mixed_string_buffer
{
  /* Text that is already in UTF-8: storage, bytes in use, bytes
     allocated.  */
  char *utf8_buffer;
  size_t utf8_buflen;
  size_t utf8_allocated;
  /* Pending first half of an UTF-16 surrogate pair, or 0 if there is
     none.  */
  unsigned short utf16_surr;
  /* Text that still awaits conversion from the source encoding: storage,
     bytes in use, bytes allocated.  */
  char *curr_buffer;
  size_t curr_buflen;
  size_t curr_allocated;
};
*946379e7Schristos
*946379e7Schristos/* Initialize a 'struct mixed_string_buffer' to empty.  */
*946379e7Schristosstatic inline void
*946379e7Schristosinit_mixed_string_buffer (struct mixed_string_buffer *bp)
*946379e7Schristos{
*946379e7Schristos  bp->utf8_buffer = NULL;
*946379e7Schristos  bp->utf8_buflen = 0;
*946379e7Schristos  bp->utf8_allocated = 0;
*946379e7Schristos  bp->utf16_surr = 0;
*946379e7Schristos  bp->curr_buffer = NULL;
*946379e7Schristos  bp->curr_buflen = 0;
*946379e7Schristos  bp->curr_allocated = 0;
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos/* Auxiliary function: Append a byte to bp->curr.  */
*946379e7Schristosstatic inline void
*946379e7Schristosmixed_string_buffer_append_byte (struct mixed_string_buffer *bp, unsigned char c)
*946379e7Schristos{
*946379e7Schristos  if (bp->curr_buflen == bp->curr_allocated)
*946379e7Schristos    {
*946379e7Schristos      bp->curr_allocated = 2 * bp->curr_allocated + 10;
*946379e7Schristos      bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated);
*946379e7Schristos    }
*946379e7Schristos  bp->curr_buffer[bp->curr_buflen++] = c;
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos/* Auxiliary function: Ensure count more bytes are available in bp->utf8.  */
*946379e7Schristosstatic inline void
*946379e7Schristosmixed_string_buffer_append_unicode_grow (struct mixed_string_buffer *bp, size_t count)
*946379e7Schristos{
*946379e7Schristos  if (bp->utf8_buflen + count > bp->utf8_allocated)
*946379e7Schristos    {
*946379e7Schristos      size_t new_allocated = 2 * bp->utf8_allocated + 10;
*946379e7Schristos      if (new_allocated < bp->utf8_buflen + count)
*946379e7Schristos	new_allocated = bp->utf8_buflen + count;
*946379e7Schristos      bp->utf8_allocated = new_allocated;
*946379e7Schristos      bp->utf8_buffer = xrealloc (bp->utf8_buffer, new_allocated);
*946379e7Schristos    }
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos/* Auxiliary function: Append a Unicode character to bp->utf8.
*946379e7Schristos   uc must be < 0x110000.  */
*946379e7Schristosstatic inline void
*946379e7Schristosmixed_string_buffer_append_unicode (struct mixed_string_buffer *bp, unsigned int uc)
*946379e7Schristos{
*946379e7Schristos  unsigned char utf8buf[6];
*946379e7Schristos  int count = u8_uctomb (utf8buf, uc, 6);
*946379e7Schristos
*946379e7Schristos  if (count < 0)
*946379e7Schristos    /* The caller should have ensured that uc is not out-of-range.  */
*946379e7Schristos    abort ();
*946379e7Schristos
*946379e7Schristos  mixed_string_buffer_append_unicode_grow (bp, count);
*946379e7Schristos  memcpy (bp->utf8_buffer + bp->utf8_buflen, utf8buf, count);
*946379e7Schristos  bp->utf8_buflen += count;
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos/* Auxiliary function: Flush bp->utf16_surr into bp->utf8_buffer.  */
*946379e7Schristosstatic inline void
*946379e7Schristosmixed_string_buffer_flush_utf16_surr (struct mixed_string_buffer *bp)
*946379e7Schristos{
*946379e7Schristos  if (bp->utf16_surr != 0)
*946379e7Schristos    {
*946379e7Schristos      /* A half surrogate is invalid, therefore use U+FFFD instead.  */
*946379e7Schristos      mixed_string_buffer_append_unicode (bp, 0xfffd);
*946379e7Schristos      bp->utf16_surr = 0;
*946379e7Schristos    }
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos/* Auxiliary function: Flush bp->curr_buffer into bp->utf8_buffer.  */
*946379e7Schristosstatic inline void
*946379e7Schristosmixed_string_buffer_flush_curr_buffer (struct mixed_string_buffer *bp, int lineno)
*946379e7Schristos{
*946379e7Schristos  if (bp->curr_buflen > 0)
*946379e7Schristos    {
*946379e7Schristos      char *curr;
*946379e7Schristos      size_t count;
*946379e7Schristos
*946379e7Schristos      mixed_string_buffer_append_byte (bp, '\0');
*946379e7Schristos
*946379e7Schristos      /* Convert from the source encoding to UTF-8.  */
*946379e7Schristos      curr = from_current_source_encoding (bp->curr_buffer,
*946379e7Schristos					   logical_file_name, lineno);
*946379e7Schristos
*946379e7Schristos      /* Append it to bp->utf8_buffer.  */
*946379e7Schristos      count = strlen (curr);
*946379e7Schristos      mixed_string_buffer_append_unicode_grow (bp, count);
*946379e7Schristos      memcpy (bp->utf8_buffer + bp->utf8_buflen, curr, count);
*946379e7Schristos      bp->utf8_buflen += count;
*946379e7Schristos
*946379e7Schristos      if (curr != bp->curr_buffer)
*946379e7Schristos	free (curr);
*946379e7Schristos      bp->curr_buflen = 0;
*946379e7Schristos    }
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos/* Append a character or Unicode character to a 'struct mixed_string_buffer'.  */
*946379e7Schristosstatic void
*946379e7Schristosmixed_string_buffer_append (struct mixed_string_buffer *bp, int c)
*946379e7Schristos{
*946379e7Schristos  if (IS_UNICODE (c))
*946379e7Schristos    {
*946379e7Schristos      /* Append a Unicode character.  */
*946379e7Schristos
*946379e7Schristos      /* Switch from multibyte character mode to Unicode character mode.  */
*946379e7Schristos      mixed_string_buffer_flush_curr_buffer (bp, line_number);
*946379e7Schristos
*946379e7Schristos      /* Test whether this character and the previous one form a Unicode
*946379e7Schristos	 surrogate character pair.  */
*946379e7Schristos      if (bp->utf16_surr != 0
*946379e7Schristos	  && (c >= UNICODE (0xdc00) && c < UNICODE (0xe000)))
*946379e7Schristos	{
*946379e7Schristos	  unsigned short utf16buf[2];
*946379e7Schristos	  unsigned int uc;
*946379e7Schristos
*946379e7Schristos	  utf16buf[0] = bp->utf16_surr;
*946379e7Schristos	  utf16buf[1] = UNICODE_VALUE (c);
*946379e7Schristos	  if (u16_mbtouc_aux (&uc, utf16buf, 2) != 2)
*946379e7Schristos	    abort ();
*946379e7Schristos
*946379e7Schristos	  mixed_string_buffer_append_unicode (bp, uc);
*946379e7Schristos	  bp->utf16_surr = 0;
*946379e7Schristos	}
*946379e7Schristos      else
*946379e7Schristos	{
*946379e7Schristos	  mixed_string_buffer_flush_utf16_surr (bp);
*946379e7Schristos
*946379e7Schristos	  if (c >= UNICODE (0xd800) && c < UNICODE (0xdc00))
*946379e7Schristos	    bp->utf16_surr = UNICODE_VALUE (c);
*946379e7Schristos	  else
*946379e7Schristos	    mixed_string_buffer_append_unicode (bp, UNICODE_VALUE (c));
*946379e7Schristos	}
*946379e7Schristos    }
*946379e7Schristos  else
*946379e7Schristos    {
*946379e7Schristos      /* Append a single byte.  */
*946379e7Schristos
*946379e7Schristos      /* Switch from Unicode character mode to multibyte character mode.  */
*946379e7Schristos      mixed_string_buffer_flush_utf16_surr (bp);
*946379e7Schristos
*946379e7Schristos      /* When a newline is seen, convert the accumulated multibyte sequence.
*946379e7Schristos	 This ensures a correct line number in the error message in case of
*946379e7Schristos	 a conversion error.  The "- 1" is to account for the newline.  */
*946379e7Schristos      if (c == '\n')
*946379e7Schristos	mixed_string_buffer_flush_curr_buffer (bp, line_number - 1);
*946379e7Schristos
*946379e7Schristos      mixed_string_buffer_append_byte (bp, (unsigned char) c);
*946379e7Schristos    }
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos/* Return the string buffer's contents.  */
*946379e7Schristosstatic char *
*946379e7Schristosmixed_string_buffer_result (struct mixed_string_buffer *bp)
*946379e7Schristos{
*946379e7Schristos  /* Flush all into bp->utf8_buffer.  */
*946379e7Schristos  mixed_string_buffer_flush_utf16_surr (bp);
*946379e7Schristos  mixed_string_buffer_flush_curr_buffer (bp, line_number);
*946379e7Schristos  /* NUL-terminate it.  */
*946379e7Schristos  mixed_string_buffer_append_unicode_grow (bp, 1);
*946379e7Schristos  bp->utf8_buffer[bp->utf8_buflen] = '\0';
*946379e7Schristos  /* Return it.  */
*946379e7Schristos  return bp->utf8_buffer;
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos/* Free the memory pointed to by a 'struct mixed_string_buffer'.  */
*946379e7Schristosstatic inline void
*946379e7Schristosfree_mixed_string_buffer (struct mixed_string_buffer *bp)
*946379e7Schristos{
*946379e7Schristos  free (bp->utf8_buffer);
*946379e7Schristos  free (bp->curr_buffer);
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos
*946379e7Schristos/* ========================== Reading of tokens.  ========================== */
*946379e7Schristos
*946379e7Schristos
/* The kinds of tokens that are told apart.  */
typedef enum token_type_ty
{
  token_type_eof,
  token_type_lparen,		/* ( */
  token_type_rparen,		/* ) */
  token_type_comma,		/* , */
  token_type_string,		/* "abc", 'abc', """abc""", '''abc''' */
  token_type_symbol,		/* symbol, number */
  token_type_other		/* misc. operator */
}
token_type_ty;
*946379e7Schristos
*946379e7Schristostypedef struct token_ty token_ty;
*946379e7Schristosstruct token_ty
*946379e7Schristos{
*946379e7Schristos  token_type_ty type;
*946379e7Schristos  char *string;		/* for token_type_string, token_type_symbol */
*946379e7Schristos  refcounted_string_list_ty *comment;	/* for token_type_string */
*946379e7Schristos  int line_number;
*946379e7Schristos};
*946379e7Schristos
*946379e7Schristos
*946379e7Schristos/* There are two different input syntaxes for strings, "abc" and r"abc",
*946379e7Schristos   and two different input syntaxes for Unicode strings, u"abc" and ur"abc".
*946379e7Schristos   Which escape sequences are understood, i.e. what is interpreted specially
*946379e7Schristos   after backslash?
*946379e7Schristos    "abc"     \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn
*946379e7Schristos    r"abc"
*946379e7Schristos    u"abc"    \<nl> \\ \' \" \a\b\f\n\r\t\v \ooo \xnn \unnnn \Unnnnnnnn \N{...}
*946379e7Schristos    ur"abc"                                           \unnnn
*946379e7Schristos   The \unnnn values are UTF-16 values; a single \Unnnnnnnn can expand to two
*946379e7Schristos   \unnnn items.  The \ooo and \xnn values are in the current source encoding.
*946379e7Schristos */
*946379e7Schristos
*946379e7Schristosstatic int
*946379e7Schristosphase7_getuc (int quote_char,
*946379e7Schristos	      bool triple, bool interpret_ansic, bool interpret_unicode,
*946379e7Schristos	      unsigned int *backslash_counter)
*946379e7Schristos{
*946379e7Schristos  int c;
*946379e7Schristos
*946379e7Schristos  for (;;)
*946379e7Schristos    {
*946379e7Schristos      /* Use phase 2, because phase 3 elides comments.  */
*946379e7Schristos      c = phase2_getc ();
*946379e7Schristos
*946379e7Schristos      if (c == UEOF)
*946379e7Schristos	return P7_EOF;
*946379e7Schristos
*946379e7Schristos      if (c == quote_char && (interpret_ansic || (*backslash_counter & 1) == 0))
*946379e7Schristos	{
*946379e7Schristos	  if (triple)
*946379e7Schristos	    {
*946379e7Schristos	      int c1 = phase2_getc ();
*946379e7Schristos	      if (c1 == quote_char)
*946379e7Schristos		{
*946379e7Schristos		  int c2 = phase2_getc ();
*946379e7Schristos		  if (c2 == quote_char)
*946379e7Schristos		    return P7_STRING_END;
*946379e7Schristos		  phase2_ungetc (c2);
*946379e7Schristos		}
*946379e7Schristos	      phase2_ungetc (c1);
*946379e7Schristos	      return UNICODE (c);
*946379e7Schristos	    }
*946379e7Schristos	  else
*946379e7Schristos	    return P7_STRING_END;
*946379e7Schristos	}
*946379e7Schristos
*946379e7Schristos      if (c == '\n')
*946379e7Schristos	{
*946379e7Schristos	  if (triple)
*946379e7Schristos	    {
*946379e7Schristos	      *backslash_counter = 0;
*946379e7Schristos	      return UNICODE ('\n');
*946379e7Schristos	    }
*946379e7Schristos	  /* In r"..." and ur"..." strings, newline is only allowed
*946379e7Schristos	     immediately after an odd number of backslashes (although the
*946379e7Schristos	     backslashes are not interpreted!).  */
*946379e7Schristos	  if (!(interpret_ansic || (*backslash_counter & 1) == 0))
*946379e7Schristos	    {
*946379e7Schristos	      *backslash_counter = 0;
*946379e7Schristos	      return UNICODE ('\n');
*946379e7Schristos	    }
*946379e7Schristos	  phase2_ungetc (c);
*946379e7Schristos	  error_with_progname = false;
*946379e7Schristos	  error (0, 0, _("%s:%d: warning: unterminated string"),
*946379e7Schristos		 logical_file_name, line_number);
*946379e7Schristos	  error_with_progname = true;
*946379e7Schristos	  return P7_STRING_END;
*946379e7Schristos	}
*946379e7Schristos
*946379e7Schristos      if (c != '\\')
*946379e7Schristos	{
*946379e7Schristos	  *backslash_counter = 0;
*946379e7Schristos	  return UNICODE (c);
*946379e7Schristos	}
*946379e7Schristos
*946379e7Schristos      /* Backslash handling.  */
*946379e7Schristos
*946379e7Schristos      if (!interpret_ansic && !interpret_unicode)
*946379e7Schristos	{
*946379e7Schristos	  ++*backslash_counter;
*946379e7Schristos	  return UNICODE ('\\');
*946379e7Schristos	}
*946379e7Schristos
*946379e7Schristos      /* Dispatch according to the character following the backslash.  */
*946379e7Schristos      c = phase2_getc ();
*946379e7Schristos      if (c == UEOF)
*946379e7Schristos	{
*946379e7Schristos	  ++*backslash_counter;
*946379e7Schristos	  return UNICODE ('\\');
*946379e7Schristos	}
*946379e7Schristos
*946379e7Schristos      if (interpret_ansic)
*946379e7Schristos	switch (c)
*946379e7Schristos	  {
*946379e7Schristos	  case '\n':
*946379e7Schristos	    continue;
*946379e7Schristos	  case '\\':
*946379e7Schristos	    ++*backslash_counter;
*946379e7Schristos	    return UNICODE (c);
*946379e7Schristos	  case '\'': case '"':
*946379e7Schristos	    *backslash_counter = 0;
*946379e7Schristos	    return UNICODE (c);
*946379e7Schristos	  case 'a':
*946379e7Schristos	    *backslash_counter = 0;
*946379e7Schristos	    return UNICODE ('\a');
*946379e7Schristos	  case 'b':
*946379e7Schristos	    *backslash_counter = 0;
*946379e7Schristos	    return UNICODE ('\b');
*946379e7Schristos	  case 'f':
*946379e7Schristos	    *backslash_counter = 0;
*946379e7Schristos	    return UNICODE ('\f');
*946379e7Schristos	  case 'n':
*946379e7Schristos	    *backslash_counter = 0;
*946379e7Schristos	    return UNICODE ('\n');
*946379e7Schristos	  case 'r':
*946379e7Schristos	    *backslash_counter = 0;
*946379e7Schristos	    return UNICODE ('\r');
*946379e7Schristos	  case 't':
*946379e7Schristos	    *backslash_counter = 0;
*946379e7Schristos	    return UNICODE ('\t');
*946379e7Schristos	  case 'v':
*946379e7Schristos	    *backslash_counter = 0;
*946379e7Schristos	    return UNICODE ('\v');
*946379e7Schristos	  case '0': case '1': case '2': case '3': case '4':
*946379e7Schristos	  case '5': case '6': case '7':
*946379e7Schristos	    {
*946379e7Schristos	      int n = c - '0';
*946379e7Schristos
*946379e7Schristos	      c = phase2_getc ();
*946379e7Schristos	      if (c != UEOF)
*946379e7Schristos		{
*946379e7Schristos		  if (c >= '0' && c <= '7')
*946379e7Schristos		    {
*946379e7Schristos		      n = (n << 3) + (c - '0');
*946379e7Schristos		      c = phase2_getc ();
*946379e7Schristos		      if (c != UEOF)
*946379e7Schristos			{
*946379e7Schristos			  if (c >= '0' && c <= '7')
*946379e7Schristos			    n = (n << 3) + (c - '0');
*946379e7Schristos			  else
*946379e7Schristos			    phase2_ungetc (c);
*946379e7Schristos			}
*946379e7Schristos		    }
*946379e7Schristos		  else
*946379e7Schristos		    phase2_ungetc (c);
*946379e7Schristos		}
*946379e7Schristos	      *backslash_counter = 0;
*946379e7Schristos	      return (unsigned char) n;
*946379e7Schristos	    }
*946379e7Schristos	  case 'x':
*946379e7Schristos	    {
*946379e7Schristos	      int c1 = phase2_getc ();
*946379e7Schristos	      int n1;
*946379e7Schristos
*946379e7Schristos	      if (c1 >= '0' && c1 <= '9')
*946379e7Schristos		n1 = c1 - '0';
*946379e7Schristos	      else if (c1 >= 'A' && c1 <= 'F')
*946379e7Schristos		n1 = c1 - 'A' + 10;
*946379e7Schristos	      else if (c1 >= 'a' && c1 <= 'f')
*946379e7Schristos		n1 = c1 - 'a' + 10;
*946379e7Schristos	      else
*946379e7Schristos		n1 = -1;
*946379e7Schristos
*946379e7Schristos	      if (n1 >= 0)
*946379e7Schristos		{
*946379e7Schristos		  int c2 = phase2_getc ();
*946379e7Schristos		  int n2;
*946379e7Schristos
*946379e7Schristos		  if (c2 >= '0' && c2 <= '9')
*946379e7Schristos		    n2 = c2 - '0';
*946379e7Schristos		  else if (c2 >= 'A' && c2 <= 'F')
*946379e7Schristos		    n2 = c2 - 'A' + 10;
*946379e7Schristos		  else if (c2 >= 'a' && c2 <= 'f')
*946379e7Schristos		    n2 = c2 - 'a' + 10;
*946379e7Schristos		  else
*946379e7Schristos		    n2 = -1;
*946379e7Schristos
*946379e7Schristos		  if (n2 >= 0)
*946379e7Schristos		    {
*946379e7Schristos		      *backslash_counter = 0;
*946379e7Schristos		      return (unsigned char) ((n1 << 4) + n2);
*946379e7Schristos		    }
*946379e7Schristos
*946379e7Schristos		  phase2_ungetc (c2);
*946379e7Schristos		}
*946379e7Schristos	      phase2_ungetc (c1);
*946379e7Schristos	      phase2_ungetc (c);
*946379e7Schristos	      ++*backslash_counter;
*946379e7Schristos	      return UNICODE ('\\');
*946379e7Schristos	    }
*946379e7Schristos	  }
*946379e7Schristos
*946379e7Schristos      if (interpret_unicode)
*946379e7Schristos	{
*946379e7Schristos	  if (c == 'u')
*946379e7Schristos	    {
*946379e7Schristos	      unsigned char buf[4];
*946379e7Schristos	      unsigned int n = 0;
*946379e7Schristos	      int i;
*946379e7Schristos
*946379e7Schristos	      for (i = 0; i < 4; i++)
*946379e7Schristos		{
*946379e7Schristos		  int c1 = phase2_getc ();
*946379e7Schristos
*946379e7Schristos		  if (c1 >= '0' && c1 <= '9')
*946379e7Schristos		    n = (n << 4) + (c1 - '0');
*946379e7Schristos		  else if (c1 >= 'A' && c1 <= 'F')
*946379e7Schristos		    n = (n << 4) + (c1 - 'A' + 10);
*946379e7Schristos		  else if (c1 >= 'a' && c1 <= 'f')
*946379e7Schristos		    n = (n << 4) + (c1 - 'a' + 10);
*946379e7Schristos		  else
*946379e7Schristos		    {
*946379e7Schristos		      phase2_ungetc (c1);
*946379e7Schristos		      while (--i >= 0)
*946379e7Schristos			phase2_ungetc (buf[i]);
*946379e7Schristos		      phase2_ungetc (c);
*946379e7Schristos		      ++*backslash_counter;
*946379e7Schristos		      return UNICODE ('\\');
*946379e7Schristos		    }
*946379e7Schristos
*946379e7Schristos		  buf[i] = c1;
*946379e7Schristos		}
*946379e7Schristos	      *backslash_counter = 0;
*946379e7Schristos	      return UNICODE (n);
*946379e7Schristos	    }
*946379e7Schristos
*946379e7Schristos	  if (interpret_ansic)
*946379e7Schristos	    {
*946379e7Schristos	      if (c == 'U')
*946379e7Schristos		{
*946379e7Schristos		  unsigned char buf[8];
*946379e7Schristos		  unsigned int n = 0;
*946379e7Schristos		  int i;
*946379e7Schristos
*946379e7Schristos		  for (i = 0; i < 8; i++)
*946379e7Schristos		    {
*946379e7Schristos		      int c1 = phase2_getc ();
*946379e7Schristos
*946379e7Schristos		      if (c1 >= '0' && c1 <= '9')
*946379e7Schristos			n = (n << 4) + (c1 - '0');
*946379e7Schristos		      else if (c1 >= 'A' && c1 <= 'F')
*946379e7Schristos			n = (n << 4) + (c1 - 'A' + 10);
*946379e7Schristos		      else if (c1 >= 'a' && c1 <= 'f')
*946379e7Schristos			n = (n << 4) + (c1 - 'a' + 10);
*946379e7Schristos		      else
*946379e7Schristos			{
*946379e7Schristos			  phase2_ungetc (c1);
*946379e7Schristos			  while (--i >= 0)
*946379e7Schristos			    phase2_ungetc (buf[i]);
*946379e7Schristos			  phase2_ungetc (c);
*946379e7Schristos			  ++*backslash_counter;
*946379e7Schristos			  return UNICODE ('\\');
*946379e7Schristos			}
*946379e7Schristos
*946379e7Schristos		      buf[i] = c1;
*946379e7Schristos		    }
*946379e7Schristos		  if (n < 0x110000)
*946379e7Schristos		    {
*946379e7Schristos		      *backslash_counter = 0;
*946379e7Schristos		      return UNICODE (n);
*946379e7Schristos		    }
*946379e7Schristos
*946379e7Schristos		  error_with_progname = false;
*946379e7Schristos		  error (0, 0, _("%s:%d: warning: invalid Unicode character"),
*946379e7Schristos			 logical_file_name, line_number);
*946379e7Schristos		  error_with_progname = true;
*946379e7Schristos
*946379e7Schristos		  while (--i >= 0)
*946379e7Schristos		    phase2_ungetc (buf[i]);
*946379e7Schristos		  phase2_ungetc (c);
*946379e7Schristos		  ++*backslash_counter;
*946379e7Schristos		  return UNICODE ('\\');
*946379e7Schristos		}
*946379e7Schristos
*946379e7Schristos	      if (c == 'N')
*946379e7Schristos		{
*946379e7Schristos		  int c1 = phase2_getc ();
*946379e7Schristos		  if (c1 == '{')
*946379e7Schristos		    {
*946379e7Schristos		      unsigned char buf[UNINAME_MAX + 1];
*946379e7Schristos		      int i;
*946379e7Schristos		      unsigned int n;
*946379e7Schristos
*946379e7Schristos		      for (i = 0; i < UNINAME_MAX; i++)
*946379e7Schristos			{
*946379e7Schristos			  int c2 = phase2_getc ();
*946379e7Schristos			  if (!(c2 >= ' ' && c2 <= '~'))
*946379e7Schristos			    {
*946379e7Schristos			      phase2_ungetc (c2);
*946379e7Schristos			      while (--i >= 0)
*946379e7Schristos				phase2_ungetc (buf[i]);
*946379e7Schristos			      phase2_ungetc (c1);
*946379e7Schristos			      phase2_ungetc (c);
*946379e7Schristos			      ++*backslash_counter;
*946379e7Schristos			      return UNICODE ('\\');
*946379e7Schristos			    }
*946379e7Schristos			  if (c2 == '}')
*946379e7Schristos			    break;
*946379e7Schristos			  buf[i] = c2;
*946379e7Schristos			}
*946379e7Schristos		      buf[i] = '\0';
*946379e7Schristos
*946379e7Schristos		      n = unicode_name_character ((char *) buf);
*946379e7Schristos		      if (n != UNINAME_INVALID)
*946379e7Schristos			{
*946379e7Schristos			  *backslash_counter = 0;
*946379e7Schristos			  return UNICODE (n);
*946379e7Schristos			}
*946379e7Schristos
*946379e7Schristos		      phase2_ungetc ('}');
*946379e7Schristos		      while (--i >= 0)
*946379e7Schristos			phase2_ungetc (buf[i]);
*946379e7Schristos		    }
*946379e7Schristos		  phase2_ungetc (c1);
*946379e7Schristos		  phase2_ungetc (c);
*946379e7Schristos		  ++*backslash_counter;
*946379e7Schristos		  return UNICODE ('\\');
*946379e7Schristos		}
*946379e7Schristos	    }
*946379e7Schristos	}
*946379e7Schristos
*946379e7Schristos      phase2_ungetc (c);
*946379e7Schristos      ++*backslash_counter;
*946379e7Schristos      return UNICODE ('\\');
*946379e7Schristos    }
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos
*946379e7Schristos/* Combine characters into tokens.  Discard whitespace except newlines at
*946379e7Schristos   the end of logical lines.  */
*946379e7Schristos
*946379e7Schristos/* Number of pending open parentheses/braces/brackets.  */
*946379e7Schristosstatic int open_pbb;
*946379e7Schristos
*946379e7Schristosstatic token_ty phase5_pushback[1];
*946379e7Schristosstatic int phase5_pushback_length;
*946379e7Schristos
*946379e7Schristosstatic void
*946379e7Schristosphase5_get (token_ty *tp)
*946379e7Schristos{
*946379e7Schristos  int c;
*946379e7Schristos
*946379e7Schristos  if (phase5_pushback_length)
*946379e7Schristos    {
*946379e7Schristos      *tp = phase5_pushback[--phase5_pushback_length];
*946379e7Schristos      return;
*946379e7Schristos    }
*946379e7Schristos
*946379e7Schristos  for (;;)
*946379e7Schristos    {
*946379e7Schristos      tp->line_number = line_number;
*946379e7Schristos      c = phase3_getc ();
*946379e7Schristos
*946379e7Schristos      switch (c)
*946379e7Schristos	{
*946379e7Schristos	case UEOF:
*946379e7Schristos	  tp->type = token_type_eof;
*946379e7Schristos	  return;
*946379e7Schristos
*946379e7Schristos	case ' ':
*946379e7Schristos	case '\t':
*946379e7Schristos	case '\f':
*946379e7Schristos	  /* Ignore whitespace and comments.  */
*946379e7Schristos	  continue;
*946379e7Schristos
*946379e7Schristos	case '\n':
*946379e7Schristos	  if (last_non_comment_line > last_comment_line)
*946379e7Schristos	    savable_comment_reset ();
*946379e7Schristos	  /* Ignore newline if and only if it is used for implicit line
*946379e7Schristos	     joining.  */
*946379e7Schristos	  if (open_pbb > 0)
*946379e7Schristos	    continue;
*946379e7Schristos	  tp->type = token_type_other;
*946379e7Schristos	  return;
*946379e7Schristos	}
*946379e7Schristos
*946379e7Schristos      last_non_comment_line = tp->line_number;
*946379e7Schristos
*946379e7Schristos      switch (c)
*946379e7Schristos	{
*946379e7Schristos	case '.':
*946379e7Schristos	  {
*946379e7Schristos	    int c1 = phase3_getc ();
*946379e7Schristos	    phase3_ungetc (c1);
*946379e7Schristos	    if (!(c1 >= '0' && c1 <= '9'))
*946379e7Schristos	      {
*946379e7Schristos
*946379e7Schristos		tp->type = token_type_other;
*946379e7Schristos		return;
*946379e7Schristos	      }
*946379e7Schristos	  }
*946379e7Schristos	  /* FALLTHROUGH */
*946379e7Schristos	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
*946379e7Schristos	case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
*946379e7Schristos	case 'M': case 'N': case 'O': case 'P': case 'Q':
*946379e7Schristos	case 'S': case 'T':           case 'V': case 'W': case 'X':
*946379e7Schristos	case 'Y': case 'Z':
*946379e7Schristos	case '_':
*946379e7Schristos	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
*946379e7Schristos	case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
*946379e7Schristos	case 'm': case 'n': case 'o': case 'p': case 'q':
*946379e7Schristos	case 's': case 't':           case 'v': case 'w': case 'x':
*946379e7Schristos	case 'y': case 'z':
*946379e7Schristos	case '0': case '1': case '2': case '3': case '4':
*946379e7Schristos	case '5': case '6': case '7': case '8': case '9':
*946379e7Schristos	symbol:
*946379e7Schristos	  /* Symbol, or part of a number.  */
*946379e7Schristos	  {
*946379e7Schristos	    static char *buffer;
*946379e7Schristos	    static int bufmax;
*946379e7Schristos	    int bufpos;
*946379e7Schristos
*946379e7Schristos	    bufpos = 0;
*946379e7Schristos	    for (;;)
*946379e7Schristos	      {
*946379e7Schristos		if (bufpos >= bufmax)
*946379e7Schristos		  {
*946379e7Schristos		    bufmax = 2 * bufmax + 10;
*946379e7Schristos		    buffer = xrealloc (buffer, bufmax);
*946379e7Schristos		  }
*946379e7Schristos		buffer[bufpos++] = c;
*946379e7Schristos		c = phase3_getc ();
*946379e7Schristos		switch (c)
*946379e7Schristos		  {
*946379e7Schristos		  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
*946379e7Schristos		  case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
*946379e7Schristos		  case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
*946379e7Schristos		  case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
*946379e7Schristos		  case 'Y': case 'Z':
*946379e7Schristos		  case '_':
*946379e7Schristos		  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
*946379e7Schristos		  case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
*946379e7Schristos		  case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
*946379e7Schristos		  case 's': case 't': case 'u': case 'v': case 'w': case 'x':
*946379e7Schristos		  case 'y': case 'z':
*946379e7Schristos		  case '0': case '1': case '2': case '3': case '4':
*946379e7Schristos		  case '5': case '6': case '7': case '8': case '9':
*946379e7Schristos		    continue;
*946379e7Schristos		  default:
*946379e7Schristos		    phase3_ungetc (c);
*946379e7Schristos		    break;
*946379e7Schristos		  }
*946379e7Schristos		break;
*946379e7Schristos	      }
*946379e7Schristos	    if (bufpos >= bufmax)
*946379e7Schristos	      {
*946379e7Schristos		bufmax = 2 * bufmax + 10;
*946379e7Schristos		buffer = xrealloc (buffer, bufmax);
*946379e7Schristos	      }
*946379e7Schristos	    buffer[bufpos] = '\0';
*946379e7Schristos	    tp->string = xstrdup (buffer);
*946379e7Schristos	    tp->type = token_type_symbol;
*946379e7Schristos	    return;
*946379e7Schristos	  }
*946379e7Schristos
*946379e7Schristos	/* Strings.  */
*946379e7Schristos	  {
*946379e7Schristos	    struct mixed_string_buffer literal;
*946379e7Schristos	    int quote_char;
*946379e7Schristos	    bool interpret_ansic;
*946379e7Schristos	    bool interpret_unicode;
*946379e7Schristos	    bool triple;
*946379e7Schristos	    unsigned int backslash_counter;
*946379e7Schristos
*946379e7Schristos	    case 'R': case 'r':
*946379e7Schristos	      {
*946379e7Schristos		int c1 = phase2_getc ();
*946379e7Schristos		if (c1 == '"' || c1 == '\'')
*946379e7Schristos		  {
*946379e7Schristos		    quote_char = c1;
*946379e7Schristos		    interpret_ansic = false;
*946379e7Schristos		    interpret_unicode = false;
*946379e7Schristos		    goto string;
*946379e7Schristos		  }
*946379e7Schristos		phase2_ungetc (c1);
*946379e7Schristos		goto symbol;
*946379e7Schristos	      }
*946379e7Schristos
*946379e7Schristos	    case 'U': case 'u':
*946379e7Schristos	      {
*946379e7Schristos		int c1 = phase2_getc ();
*946379e7Schristos		if (c1 == '"' || c1 == '\'')
*946379e7Schristos		  {
*946379e7Schristos		    quote_char = c1;
*946379e7Schristos		    interpret_ansic = true;
*946379e7Schristos		    interpret_unicode = true;
*946379e7Schristos		    goto string;
*946379e7Schristos		  }
*946379e7Schristos		if (c1 == 'R' || c1 == 'r')
*946379e7Schristos		  {
*946379e7Schristos		    int c2 = phase2_getc ();
*946379e7Schristos		    if (c2 == '"' || c2 == '\'')
*946379e7Schristos		      {
*946379e7Schristos			quote_char = c2;
*946379e7Schristos			interpret_ansic = false;
*946379e7Schristos			interpret_unicode = true;
*946379e7Schristos			goto string;
*946379e7Schristos		      }
*946379e7Schristos		    phase2_ungetc (c2);
*946379e7Schristos		  }
*946379e7Schristos		phase2_ungetc (c1);
*946379e7Schristos		goto symbol;
*946379e7Schristos	      }
*946379e7Schristos
*946379e7Schristos	    case '"': case '\'':
*946379e7Schristos	      quote_char = c;
*946379e7Schristos	      interpret_ansic = true;
*946379e7Schristos	      interpret_unicode = false;
*946379e7Schristos	    string:
*946379e7Schristos	      triple = false;
*946379e7Schristos	      {
*946379e7Schristos		int c1 = phase2_getc ();
*946379e7Schristos		if (c1 == quote_char)
*946379e7Schristos		  {
*946379e7Schristos		    int c2 = phase2_getc ();
*946379e7Schristos		    if (c2 == quote_char)
*946379e7Schristos		      triple = true;
*946379e7Schristos		    else
*946379e7Schristos		      {
*946379e7Schristos			phase2_ungetc (c2);
*946379e7Schristos			phase2_ungetc (c1);
*946379e7Schristos		      }
*946379e7Schristos		  }
*946379e7Schristos		else
*946379e7Schristos		  phase2_ungetc (c1);
*946379e7Schristos	      }
*946379e7Schristos	      backslash_counter = 0;
*946379e7Schristos	      /* Start accumulating the string.  */
*946379e7Schristos	      init_mixed_string_buffer (&literal);
*946379e7Schristos	      for (;;)
*946379e7Schristos		{
*946379e7Schristos		  int uc = phase7_getuc (quote_char, triple, interpret_ansic,
*946379e7Schristos					 interpret_unicode, &backslash_counter);
*946379e7Schristos
*946379e7Schristos		  if (uc == P7_EOF || uc == P7_STRING_END)
*946379e7Schristos		    break;
*946379e7Schristos
*946379e7Schristos		  if (IS_UNICODE (uc))
*946379e7Schristos		    assert (UNICODE_VALUE (uc) >= 0
*946379e7Schristos			    && UNICODE_VALUE (uc) < 0x110000);
*946379e7Schristos
*946379e7Schristos		  mixed_string_buffer_append (&literal, uc);
*946379e7Schristos		}
*946379e7Schristos	      tp->string = xstrdup (mixed_string_buffer_result (&literal));
*946379e7Schristos	      free_mixed_string_buffer (&literal);
*946379e7Schristos	      tp->comment = add_reference (savable_comment);
*946379e7Schristos	      tp->type = token_type_string;
*946379e7Schristos	      return;
*946379e7Schristos	  }
*946379e7Schristos
*946379e7Schristos	case '(':
*946379e7Schristos	  open_pbb++;
*946379e7Schristos	  tp->type = token_type_lparen;
*946379e7Schristos	  return;
*946379e7Schristos
*946379e7Schristos	case ')':
*946379e7Schristos	  if (open_pbb > 0)
*946379e7Schristos	    open_pbb--;
*946379e7Schristos	  tp->type = token_type_rparen;
*946379e7Schristos	  return;
*946379e7Schristos
*946379e7Schristos	case ',':
*946379e7Schristos	  tp->type = token_type_comma;
*946379e7Schristos	  return;
*946379e7Schristos
*946379e7Schristos	case '[': case '{':
*946379e7Schristos	  open_pbb++;
*946379e7Schristos	  tp->type = token_type_other;
*946379e7Schristos	  return;
*946379e7Schristos
*946379e7Schristos	case ']': case '}':
*946379e7Schristos	  if (open_pbb > 0)
*946379e7Schristos	    open_pbb--;
*946379e7Schristos	  tp->type = token_type_other;
*946379e7Schristos	  return;
*946379e7Schristos
*946379e7Schristos	default:
*946379e7Schristos	  /* We could carefully recognize each of the 2 and 3 character
*946379e7Schristos	     operators, but it is not necessary, as we only need to recognize
*946379e7Schristos	     gettext invocations.  Don't bother.  */
*946379e7Schristos	  tp->type = token_type_other;
*946379e7Schristos	  return;
*946379e7Schristos	}
*946379e7Schristos    }
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos/* Supports only one pushback token.  */
*946379e7Schristosstatic void
*946379e7Schristosphase5_unget (token_ty *tp)
*946379e7Schristos{
*946379e7Schristos  if (tp->type != token_type_eof)
*946379e7Schristos    {
*946379e7Schristos      if (phase5_pushback_length == SIZEOF (phase5_pushback))
*946379e7Schristos	abort ();
*946379e7Schristos      phase5_pushback[phase5_pushback_length++] = *tp;
*946379e7Schristos    }
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos
*946379e7Schristos/* Combine adjacent strings to form a single string.  Note that the end
*946379e7Schristos   of a logical line appears as a token of its own, therefore strings that
*946379e7Schristos   belong to different logical lines will not be concatenated.  */
*946379e7Schristos
*946379e7Schristosstatic void
*946379e7Schristosx_python_lex (token_ty *tp)
*946379e7Schristos{
*946379e7Schristos  phase5_get (tp);
*946379e7Schristos  if (tp->type != token_type_string)
*946379e7Schristos    return;
*946379e7Schristos  for (;;)
*946379e7Schristos    {
*946379e7Schristos      token_ty tmp;
*946379e7Schristos      size_t len;
*946379e7Schristos
*946379e7Schristos      phase5_get (&tmp);
*946379e7Schristos      if (tmp.type != token_type_string)
*946379e7Schristos	{
*946379e7Schristos	  phase5_unget (&tmp);
*946379e7Schristos	  return;
*946379e7Schristos	}
*946379e7Schristos      len = strlen (tp->string);
*946379e7Schristos      tp->string = xrealloc (tp->string, len + strlen (tmp.string) + 1);
*946379e7Schristos      strcpy (tp->string + len, tmp.string);
*946379e7Schristos      free (tmp.string);
*946379e7Schristos    }
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos
*946379e7Schristos/* ========================= Extracting strings.  ========================== */
*946379e7Schristos
*946379e7Schristos
*946379e7Schristos/* Context lookup table.  Assigned by extract_python from its flag_table
*946379e7Schristos   argument and consulted by extract_parenthesized for every symbol token.  */
*946379e7Schristosstatic flag_context_list_table_ty *flag_context_list_table;
*946379e7Schristos
*946379e7Schristos
*946379e7Schristos/* The file is broken into tokens.  Scan the token stream, looking for
*946379e7Schristos   a keyword, followed by a left paren, followed by a string.  When we
*946379e7Schristos   see this sequence, we have something to remember.  We assume we are
*946379e7Schristos   looking at a valid Python program, and leave the complaints about
*946379e7Schristos   the grammar to the Python interpreter.
*946379e7Schristos
*946379e7Schristos     Normal handling: Look for
*946379e7Schristos       keyword ( ... msgid ... )
*946379e7Schristos     Plural handling: Look for
*946379e7Schristos       keyword ( ... msgid ... msgid_plural ... )
*946379e7Schristos
*946379e7Schristos   We use recursion because the arguments before msgid or between msgid
*946379e7Schristos   and msgid_plural can contain subexpressions of the same form.  */
*946379e7Schristos
*946379e7Schristos
*946379e7Schristos/* Extract messages until the next balanced closing parenthesis.
*946379e7Schristos   Extracted messages are added to MLP.
*946379e7Schristos   Return true upon eof, false upon closing parenthesis.  */
*946379e7Schristosstatic bool
*946379e7Schristosextract_parenthesized (message_list_ty *mlp,
*946379e7Schristos		       flag_context_ty outer_context,
*946379e7Schristos		       flag_context_list_iterator_ty context_iter,
*946379e7Schristos		       struct arglist_parser *argparser)
*946379e7Schristos{
*946379e7Schristos  /* Current argument number.  */
*946379e7Schristos  int arg = 1;
*946379e7Schristos  /* 0 when no keyword has been seen.  1 right after a keyword is seen.  */
*946379e7Schristos  int state;
*946379e7Schristos  /* Parameters of the keyword just seen.  Defined only in state 1.  */
*946379e7Schristos  const struct callshapes *next_shapes = NULL;
*946379e7Schristos  /* Context iterator that will be used if the next token is a '('.  */
*946379e7Schristos  flag_context_list_iterator_ty next_context_iter =
*946379e7Schristos    passthrough_context_list_iterator;
*946379e7Schristos  /* Current context.  */
*946379e7Schristos  flag_context_ty inner_context =
*946379e7Schristos    inherited_context (outer_context,
*946379e7Schristos		       flag_context_list_iterator_advance (&context_iter));
*946379e7Schristos
*946379e7Schristos  /* Start state is 0.  */
*946379e7Schristos  state = 0;
*946379e7Schristos
*946379e7Schristos  for (;;)
*946379e7Schristos    {
*946379e7Schristos      token_ty token;
*946379e7Schristos
*946379e7Schristos      x_python_lex (&token);
*946379e7Schristos      switch (token.type)
*946379e7Schristos	{
*946379e7Schristos	case token_type_symbol:
*946379e7Schristos	  {
*946379e7Schristos	    void *keyword_value;
*946379e7Schristos
*946379e7Schristos	    if (hash_find_entry (&keywords, token.string, strlen (token.string),
*946379e7Schristos				 &keyword_value)
*946379e7Schristos		== 0)
*946379e7Schristos	      {
*946379e7Schristos		next_shapes = (const struct callshapes *) keyword_value;
*946379e7Schristos		state = 1;
*946379e7Schristos	      }
*946379e7Schristos	    else
*946379e7Schristos	      state = 0;
*946379e7Schristos	  }
*946379e7Schristos	  next_context_iter =
*946379e7Schristos	    flag_context_list_iterator (
*946379e7Schristos	      flag_context_list_table_lookup (
*946379e7Schristos		flag_context_list_table,
*946379e7Schristos		token.string, strlen (token.string)));
*946379e7Schristos	  free (token.string);
*946379e7Schristos	  continue;
*946379e7Schristos
*946379e7Schristos	case token_type_lparen:
*946379e7Schristos	  if (extract_parenthesized (mlp, inner_context, next_context_iter,
*946379e7Schristos				     arglist_parser_alloc (mlp,
*946379e7Schristos							   state ? next_shapes : NULL)))
*946379e7Schristos	    {
*946379e7Schristos	      xgettext_current_source_encoding = po_charset_utf8;
*946379e7Schristos	      arglist_parser_done (argparser, arg);
*946379e7Schristos	      xgettext_current_source_encoding = xgettext_current_file_source_encoding;
*946379e7Schristos	      return true;
*946379e7Schristos	    }
*946379e7Schristos	  next_context_iter = null_context_list_iterator;
*946379e7Schristos	  state = 0;
*946379e7Schristos	  continue;
*946379e7Schristos
*946379e7Schristos	case token_type_rparen:
*946379e7Schristos	  xgettext_current_source_encoding = po_charset_utf8;
*946379e7Schristos	  arglist_parser_done (argparser, arg);
*946379e7Schristos	  xgettext_current_source_encoding = xgettext_current_file_source_encoding;
*946379e7Schristos	  return false;
*946379e7Schristos
*946379e7Schristos	case token_type_comma:
*946379e7Schristos	  arg++;
*946379e7Schristos	  inner_context =
*946379e7Schristos	    inherited_context (outer_context,
*946379e7Schristos			       flag_context_list_iterator_advance (
*946379e7Schristos				 &context_iter));
*946379e7Schristos	  next_context_iter = passthrough_context_list_iterator;
*946379e7Schristos	  state = 0;
*946379e7Schristos	  continue;
*946379e7Schristos
*946379e7Schristos	case token_type_string:
*946379e7Schristos	  {
*946379e7Schristos	    lex_pos_ty pos;
*946379e7Schristos	    pos.file_name = logical_file_name;
*946379e7Schristos	    pos.line_number = token.line_number;
*946379e7Schristos
*946379e7Schristos	    xgettext_current_source_encoding = po_charset_utf8;
*946379e7Schristos	    if (extract_all)
*946379e7Schristos	      remember_a_message (mlp, NULL, token.string, inner_context,
*946379e7Schristos				  &pos, token.comment);
*946379e7Schristos	    else
*946379e7Schristos	      arglist_parser_remember (argparser, arg, token.string,
*946379e7Schristos				       inner_context,
*946379e7Schristos				       pos.file_name, pos.line_number,
*946379e7Schristos				       token.comment);
*946379e7Schristos	    xgettext_current_source_encoding = xgettext_current_file_source_encoding;
*946379e7Schristos	  }
*946379e7Schristos	  drop_reference (token.comment);
*946379e7Schristos	  next_context_iter = null_context_list_iterator;
*946379e7Schristos	  state = 0;
*946379e7Schristos	  continue;
*946379e7Schristos
*946379e7Schristos	case token_type_eof:
*946379e7Schristos	  xgettext_current_source_encoding = po_charset_utf8;
*946379e7Schristos	  arglist_parser_done (argparser, arg);
*946379e7Schristos	  xgettext_current_source_encoding = xgettext_current_file_source_encoding;
*946379e7Schristos	  return true;
*946379e7Schristos
*946379e7Schristos	case token_type_other:
*946379e7Schristos	  next_context_iter = null_context_list_iterator;
*946379e7Schristos	  state = 0;
*946379e7Schristos	  continue;
*946379e7Schristos
*946379e7Schristos	default:
*946379e7Schristos	  abort ();
*946379e7Schristos	}
*946379e7Schristos    }
*946379e7Schristos}
*946379e7Schristos
*946379e7Schristos
*946379e7Schristosvoid
*946379e7Schristosextract_python (FILE *f,
*946379e7Schristos		const char *real_filename, const char *logical_filename,
*946379e7Schristos		flag_context_list_table_ty *flag_table,
*946379e7Schristos		msgdomain_list_ty *mdlp)
*946379e7Schristos{
*946379e7Schristos  message_list_ty *mlp = mdlp->item[0]->messages;
*946379e7Schristos
*946379e7Schristos  fp = f;
*946379e7Schristos  real_file_name = real_filename;
*946379e7Schristos  logical_file_name = xstrdup (logical_filename);
*946379e7Schristos  line_number = 1;
*946379e7Schristos
*946379e7Schristos  last_comment_line = -1;
*946379e7Schristos  last_non_comment_line = -1;
*946379e7Schristos
*946379e7Schristos  xgettext_current_file_source_encoding = xgettext_global_source_encoding;
*946379e7Schristos#if HAVE_ICONV
*946379e7Schristos  xgettext_current_file_source_iconv = xgettext_global_source_iconv;
*946379e7Schristos#endif
*946379e7Schristos
*946379e7Schristos  xgettext_current_source_encoding = xgettext_current_file_source_encoding;
*946379e7Schristos#if HAVE_ICONV
*946379e7Schristos  xgettext_current_source_iconv = xgettext_current_file_source_iconv;
*946379e7Schristos#endif
*946379e7Schristos
*946379e7Schristos  continuation_or_nonblank_line = false;
*946379e7Schristos
*946379e7Schristos  open_pbb = 0;
*946379e7Schristos
*946379e7Schristos  flag_context_list_table = flag_table;
*946379e7Schristos
*946379e7Schristos  init_keywords ();
*946379e7Schristos
*946379e7Schristos  /* Eat tokens until eof is seen.  When extract_parenthesized returns
*946379e7Schristos     due to an unbalanced closing parenthesis, just restart it.  */
*946379e7Schristos  while (!extract_parenthesized (mlp, null_context, null_context_list_iterator,
*946379e7Schristos				 arglist_parser_alloc (mlp, NULL)))
*946379e7Schristos    ;
*946379e7Schristos
*946379e7Schristos  fp = NULL;
*946379e7Schristos  real_file_name = NULL;
*946379e7Schristos  logical_file_name = NULL;
*946379e7Schristos  line_number = 0;
*946379e7Schristos}