gcc/libcpp/charset.c

404b540aSrobert/* CPP Library - charsets
404b540aSrobert   Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004
404b540aSrobert   Free Software Foundation, Inc.
404b540aSrobert
404b540aSrobert   Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
404b540aSrobert
404b540aSrobertThis program is free software; you can redistribute it and/or modify it
404b540aSrobertunder the terms of the GNU General Public License as published by the
404b540aSrobertFree Software Foundation; either version 2, or (at your option) any
404b540aSrobertlater version.
404b540aSrobert
404b540aSrobertThis program is distributed in the hope that it will be useful,
404b540aSrobertbut WITHOUT ANY WARRANTY; without even the implied warranty of
404b540aSrobertMERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
404b540aSrobertGNU General Public License for more details.
404b540aSrobert
404b540aSrobertYou should have received a copy of the GNU General Public License
404b540aSrobertalong with this program; if not, write to the Free Software
404b540aSrobertFoundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
404b540aSrobert
404b540aSrobert#include "config.h"
404b540aSrobert#include "system.h"
404b540aSrobert#include "cpplib.h"
404b540aSrobert#include "internal.h"
404b540aSrobert
404b540aSrobert/* Character set handling for C-family languages.
404b540aSrobert
404b540aSrobert   Terminological note: In what follows, "charset" or "character set"
404b540aSrobert   will be taken to mean both an abstract set of characters and an
404b540aSrobert   encoding for that set.
404b540aSrobert
404b540aSrobert   The C99 standard discusses two character sets: source and execution.
404b540aSrobert   The source character set is used for internal processing in translation
404b540aSrobert   phases 1 through 4; the execution character set is used thereafter.
404b540aSrobert   Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
404b540aSrobert   character encodings (see 3.7.2, 3.7.3 for the standardese meanings
404b540aSrobert   of these terms).  Furthermore, the "basic character set" (listed in
404b540aSrobert   5.2.1p3) is to be encoded in each with values one byte wide, and is
404b540aSrobert   to appear in the initial shift state.
404b540aSrobert
404b540aSrobert   It is not explicitly mentioned, but there is also a "wide execution
404b540aSrobert   character set" used to encode wide character constants and wide
404b540aSrobert   string literals; this is supposed to be the result of applying the
404b540aSrobert   standard library function mbstowcs() to an equivalent narrow string
404b540aSrobert   (6.4.5p5).  However, the behavior of hexadecimal and octal
404b540aSrobert   \-escapes is at odds with this; they are supposed to be translated
404b540aSrobert   directly to wchar_t values (6.4.4.4p5,6).
404b540aSrobert
404b540aSrobert   The source character set is not necessarily the character set used
404b540aSrobert   to encode physical source files on disk; translation phase 1 converts
404b540aSrobert   from whatever that encoding is to the source character set.
404b540aSrobert
404b540aSrobert   The presence of universal character names in C99 (6.4.3 et seq.)
404b540aSrobert   forces the source character set to be isomorphic to ISO 10646,
404b540aSrobert   that is, Unicode.  There is no such constraint on the execution
404b540aSrobert   character set; note also that the conversion from source to
404b540aSrobert   execution character set does not occur for identifiers (5.1.1.2p1#5).
404b540aSrobert
404b540aSrobert   For convenience of implementation, the source character set's
404b540aSrobert   encoding of the basic character set should be identical to the
404b540aSrobert   execution character set OF THE HOST SYSTEM's encoding of the basic
404b540aSrobert   character set, and it should not be a state-dependent encoding.
404b540aSrobert
404b540aSrobert   cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
404b540aSrobert   depending on whether the host is based on ASCII or EBCDIC (see
404b540aSrobert   respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
404b540aSrobert   Technical Report #16).  With limited exceptions, it relies on the
404b540aSrobert   system library's iconv() primitive to do charset conversion
404b540aSrobert   (specified in SUSv2).  */
404b540aSrobert
/* Stand-ins for the iconv API on hosts that lack it.  The calls to
   iconv(), iconv_open() and iconv_close() further down are guarded only
   by ordinary if statements whose conditions are compile-time constants,
   so they are still compiled and must not cause link errors.  The stubs
   always fail: errno is set to EINVAL and the API's error value
   ((iconv_t)-1 or (size_t)-1) is produced.  ICONV_CONST expands to
   nothing.  */
#if !HAVE_ICONV
#define ICONV_CONST
#define iconv_open(tocode, fromcode) (errno = EINVAL, (iconv_t)-1)
#define iconv(cd, inbuf, inleft, outbuf, outleft) (errno = EINVAL, (size_t)-1)
#define iconv_close(cd) (void)0
#endif
404b540aSrobert
/* Choose the internal source character set from the host's basic
   character set.  SOURCE_CHARSET is the charset name that
   cpp_init_iconv() hands to init_iconv_desc() as the conversion source;
   LAST_POSSIBLY_BASIC_SOURCE_CHAR is the upper bound that
   cpp_host_to_exec_charset() compares its argument against.  Any other
   host character set is a build-time error.  */
#if HOST_CHARSET == HOST_CHARSET_ASCII
#define SOURCE_CHARSET "UTF-8"
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
#define SOURCE_CHARSET "UTF-EBCDIC"
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
#else
#error "Unrecognized basic host character set"
#endif
404b540aSrobert
/* Hosts whose errno definitions lack EILSEQ report ill-formed input
   sequences as EINVAL instead.  */
#if !defined (EILSEQ)
# define EILSEQ EINVAL
#endif
404b540aSrobert
/* This structure is used for a resizable string buffer throughout.  */
/* Don't call it strbuf, as that conflicts with unistd.h on systems
   such as DYNIX/ptx where unistd.h includes stropts.h.  */
struct _cpp_strbuf
{
  uchar *text;   /* Start of the buffer; the converters below grow it
		    with XRESIZEVEC when they run out of room.  */
  size_t asize;  /* Number of bytes currently allocated for TEXT.  */
  size_t len;    /* Number of bytes of TEXT already filled; converted
		    output is appended starting at TEXT + LEN.  */
};
404b540aSrobert
/* Number of bytes by which conversion_loop() and convert_using_iconv()
   enlarge an output buffer each time a conversion reports E2BIG.

   This is enough to hold any string that fits on a single 80-column
   line, even if iconv quadruples its size (e.g. conversion from
   ASCII to UTF-32) rounded up to a power of two.

   NOTE(review): 80 * 4 is 320, which is larger than 256, so the claim
   above does not hold literally.  This is harmless because the buffer
   is grown repeatedly until the output fits; it only costs an extra
   reallocation.  */
#define OUTBUF_BLOCK_SIZE 256
404b540aSrobert
404b540aSrobert/* Conversions between UTF-8 and UTF-16/32 are implemented by custom
404b540aSrobert   logic.  This is because a depressing number of systems lack iconv,
   or have iconv libraries that do not do these conversions, so
404b540aSrobert   we need a fallback implementation for them.  To ensure the fallback
404b540aSrobert   doesn't break due to neglect, it is used on all systems.
404b540aSrobert
404b540aSrobert   UTF-32 encoding is nice and simple: a four-byte binary number,
404b540aSrobert   constrained to the range 00000000-7FFFFFFF to avoid questions of
404b540aSrobert   signedness.  We do have to cope with big- and little-endian
404b540aSrobert   variants.
404b540aSrobert
404b540aSrobert   UTF-16 encoding uses two-byte binary numbers, again in big- and
404b540aSrobert   little-endian variants, for all values in the 00000000-0000FFFF
404b540aSrobert   range.  Values in the 00010000-0010FFFF range are encoded as pairs
404b540aSrobert   of two-byte numbers, called "surrogate pairs": given a number S in
404b540aSrobert   this range, it is mapped to a pair (H, L) as follows:
404b540aSrobert
404b540aSrobert     H = (S - 0x10000) / 0x400 + 0xD800
404b540aSrobert     L = (S - 0x10000) % 0x400 + 0xDC00
404b540aSrobert
404b540aSrobert   Two-byte values in the D800...DFFF range are ill-formed except as a
404b540aSrobert   component of a surrogate pair.  Even if the encoding within a
404b540aSrobert   two-byte value is little-endian, the H member of the surrogate pair
404b540aSrobert   comes first.
404b540aSrobert
404b540aSrobert   There is no way to encode values in the 00110000-7FFFFFFF range,
404b540aSrobert   which is not currently a problem as there are no assigned code
404b540aSrobert   points in that range; however, the author expects that it will
404b540aSrobert   eventually become necessary to abandon UTF-16 due to this
404b540aSrobert   limitation.  Note also that, because of these pairs, UTF-16 does
404b540aSrobert   not meet the requirements of the C standard for a wide character
404b540aSrobert   encoding (see 3.7.3 and 6.4.4.4p11).
404b540aSrobert
404b540aSrobert   UTF-8 encoding looks like this:
404b540aSrobert
404b540aSrobert   value range	       encoded as
404b540aSrobert   00000000-0000007F   0xxxxxxx
404b540aSrobert   00000080-000007FF   110xxxxx 10xxxxxx
404b540aSrobert   00000800-0000FFFF   1110xxxx 10xxxxxx 10xxxxxx
404b540aSrobert   00010000-001FFFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
404b540aSrobert   00200000-03FFFFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
404b540aSrobert   04000000-7FFFFFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
404b540aSrobert
404b540aSrobert   Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
404b540aSrobert   which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
404b540aSrobert   never occur.  Note also that any value that can be encoded by a
404b540aSrobert   given row of the table can also be encoded by all successive rows,
404b540aSrobert   but this is not done; only the shortest possible encoding for any
404b540aSrobert   given value is valid.  For instance, the character 07C0 could be
404b540aSrobert   encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
404b540aSrobert   FC 80 80 80 9F 80.  Only the first is valid.
404b540aSrobert
404b540aSrobert   An implementation note: the transformation from UTF-16 to UTF-8, or
404b540aSrobert   vice versa, is easiest done by using UTF-32 as an intermediary.  */
404b540aSrobert
/* Internal primitives which go from an UTF-8 byte stream to native-endian
   UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
   operation in several places below.  */

/* Decode one UTF-8 character starting at *INBUFP, of which at most
   *INBYTESLEFTP bytes may be read, and store its value in *CP.

   On success, returns 0, advances *INBUFP past the character and
   reduces *INBYTESLEFTP by the number of bytes consumed.  On failure
   *INBUFP, *INBYTESLEFTP and *CP are left untouched and an errno value
   is returned: EINVAL if the input ends before the character is
   complete, or EILSEQ if the bytes are not well-formed UTF-8 (invalid
   lead byte, invalid continuation byte, overlong encoding, surrogate,
   or a value above 0x7FFFFFFF).  */
static inline int
one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
		     cppchar_t *cp)
{
  /* For an N-byte sequence, masks[N-1] selects the payload bits of the
     lead byte and patns[N-1] is the fixed prefix held in its other
     bits.  The lead byte of a five-byte sequence is 111110xx, so its
     payload mask is 0x03.  */
  static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01 };
  static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };

  cppchar_t c;
  const uchar *inbuf = *inbufp;
  size_t nbytes, i;

  if (*inbytesleftp < 1)
    return EINVAL;

  /* Fast path: a single-byte (ASCII range) character.  */
  c = *inbuf;
  if (c < 0x80)
    {
      *cp = c;
      *inbytesleftp -= 1;
      *inbufp += 1;
      return 0;
    }

  /* The number of leading 1-bits in the first byte indicates how many
     bytes follow.  */
  for (nbytes = 2; nbytes < 7; nbytes++)
    if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
      goto found;
  return EILSEQ;
 found:

  if (*inbytesleftp < nbytes)
    return EINVAL;

  /* Accumulate six payload bits from each continuation byte.  */
  c = (c & masks[nbytes-1]);
  inbuf++;
  for (i = 1; i < nbytes; i++)
    {
      cppchar_t n = *inbuf++;
      if ((n & 0xC0) != 0x80)
	return EILSEQ;
      c = ((c << 6) + (n & 0x3F));
    }

  /* Make sure the shortest possible encoding was used.  */
  if (c <=      0x7F && nbytes > 1) return EILSEQ;
  if (c <=     0x7FF && nbytes > 2) return EILSEQ;
  if (c <=    0xFFFF && nbytes > 3) return EILSEQ;
  if (c <=  0x1FFFFF && nbytes > 4) return EILSEQ;
  if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;

  /* Make sure the character is valid.  */
  if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;

  *cp = c;
  *inbufp = inbuf;
  *inbytesleftp -= nbytes;
  return 0;
}
404b540aSrobert
/* Encode the character C as UTF-8 at *OUTBUFP, which has room for
   *OUTBYTESLEFTP bytes.

   On success, returns 0, advances *OUTBUFP past the bytes written and
   reduces *OUTBYTESLEFTP accordingly.  On failure nothing is written
   and an errno value is returned: E2BIG if the output buffer is too
   small, or EILSEQ if C is above 0x7FFFFFFF and therefore cannot be
   represented in at most six bytes.  Surrogate values are not checked
   here; callers that care must reject them beforehand.  */
static inline int
one_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
{
  /* masks[N-1] is the prefix of the lead byte of an N-byte sequence;
     limits[N-1] has a bit set wherever the remaining value would not
     fit in that lead byte's payload.  */
  static const uchar masks[6] =  { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
  static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
  size_t nbytes;
  uchar buf[6], *p = &buf[6];
  uchar *outbuf = *outbufp;

  /* Anything wider than 31 bits would need a seventh byte, which would
     run off the front of BUF and the end of LIMITS.  */
  if (c > 0x7FFFFFFF)
    return EILSEQ;

  /* Build the sequence backwards in BUF, lowest six bits first.  */
  nbytes = 1;
  if (c < 0x80)
    *--p = c;
  else
    {
      do
	{
	  *--p = ((c & 0x3F) | 0x80);
	  c >>= 6;
	  nbytes++;
	}
      while (c >= 0x3F || (c & limits[nbytes-1]));
      *--p = (c | masks[nbytes-1]);
    }

  if (*outbytesleftp < nbytes)
    return E2BIG;

  while (p < &buf[6])
    *outbuf++ = *p++;
  *outbytesleftp -= nbytes;
  *outbufp = outbuf;
  return 0;
}
404b540aSrobert
404b540aSrobert/* The following four functions transform one character between the two
404b540aSrobert   encodings named in the function name.  All have the signature
404b540aSrobert   int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
404b540aSrobert           uchar **outbufp, size_t *outbytesleftp)
404b540aSrobert
404b540aSrobert   BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
404b540aSrobert   interpreted as a boolean indicating whether big-endian or
404b540aSrobert   little-endian encoding is to be used for the member of the pair
404b540aSrobert   that is not UTF-8.
404b540aSrobert
404b540aSrobert   INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
404b540aSrobert   do for iconv.
404b540aSrobert
404b540aSrobert   The return value is either 0 for success, or an errno value for
404b540aSrobert   failure, which may be E2BIG (need more space), EILSEQ (ill-formed
   input sequence), or EINVAL (incomplete input sequence).  */
404b540aSrobert
404b540aSrobertstatic inline int
404b540aSrobertone_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
404b540aSrobert		   uchar **outbufp, size_t *outbytesleftp)
404b540aSrobert{
404b540aSrobert  uchar *outbuf;
404b540aSrobert  cppchar_t s = 0;
404b540aSrobert  int rval;
404b540aSrobert
404b540aSrobert  /* Check for space first, since we know exactly how much we need.  */
404b540aSrobert  if (*outbytesleftp < 4)
404b540aSrobert    return E2BIG;
404b540aSrobert
404b540aSrobert  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
404b540aSrobert  if (rval)
404b540aSrobert    return rval;
404b540aSrobert
404b540aSrobert  outbuf = *outbufp;
404b540aSrobert  outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
404b540aSrobert  outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
404b540aSrobert  outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
404b540aSrobert  outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
404b540aSrobert
404b540aSrobert  *outbufp += 4;
404b540aSrobert  *outbytesleftp -= 4;
404b540aSrobert  return 0;
404b540aSrobert}
404b540aSrobert
404b540aSrobertstatic inline int
404b540aSrobertone_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
404b540aSrobert		   uchar **outbufp, size_t *outbytesleftp)
404b540aSrobert{
404b540aSrobert  cppchar_t s;
404b540aSrobert  int rval;
404b540aSrobert  const uchar *inbuf;
404b540aSrobert
404b540aSrobert  if (*inbytesleftp < 4)
404b540aSrobert    return EINVAL;
404b540aSrobert
404b540aSrobert  inbuf = *inbufp;
404b540aSrobert
404b540aSrobert  s  = inbuf[bigend ? 0 : 3] << 24;
404b540aSrobert  s += inbuf[bigend ? 1 : 2] << 16;
404b540aSrobert  s += inbuf[bigend ? 2 : 1] << 8;
404b540aSrobert  s += inbuf[bigend ? 3 : 0];
404b540aSrobert
404b540aSrobert  if (s >= 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
404b540aSrobert    return EILSEQ;
404b540aSrobert
404b540aSrobert  rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
404b540aSrobert  if (rval)
404b540aSrobert    return rval;
404b540aSrobert
404b540aSrobert  *inbufp += 4;
404b540aSrobert  *inbytesleftp -= 4;
404b540aSrobert  return 0;
404b540aSrobert}
404b540aSrobert
404b540aSrobertstatic inline int
404b540aSrobertone_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
404b540aSrobert		   uchar **outbufp, size_t *outbytesleftp)
404b540aSrobert{
404b540aSrobert  int rval;
404b540aSrobert  cppchar_t s = 0;
404b540aSrobert  const uchar *save_inbuf = *inbufp;
404b540aSrobert  size_t save_inbytesleft = *inbytesleftp;
404b540aSrobert  uchar *outbuf = *outbufp;
404b540aSrobert
404b540aSrobert  rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
404b540aSrobert  if (rval)
404b540aSrobert    return rval;
404b540aSrobert
404b540aSrobert  if (s > 0x0010FFFF)
404b540aSrobert    {
404b540aSrobert      *inbufp = save_inbuf;
404b540aSrobert      *inbytesleftp = save_inbytesleft;
404b540aSrobert      return EILSEQ;
404b540aSrobert    }
404b540aSrobert
404b540aSrobert  if (s < 0xFFFF)
404b540aSrobert    {
404b540aSrobert      if (*outbytesleftp < 2)
404b540aSrobert	{
404b540aSrobert	  *inbufp = save_inbuf;
404b540aSrobert	  *inbytesleftp = save_inbytesleft;
404b540aSrobert	  return E2BIG;
404b540aSrobert	}
404b540aSrobert      outbuf[bigend ? 1 : 0] = (s & 0x00FF);
404b540aSrobert      outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
404b540aSrobert
404b540aSrobert      *outbufp += 2;
404b540aSrobert      *outbytesleftp -= 2;
404b540aSrobert      return 0;
404b540aSrobert    }
404b540aSrobert  else
404b540aSrobert    {
404b540aSrobert      cppchar_t hi, lo;
404b540aSrobert
404b540aSrobert      if (*outbytesleftp < 4)
404b540aSrobert	{
404b540aSrobert	  *inbufp = save_inbuf;
404b540aSrobert	  *inbytesleftp = save_inbytesleft;
404b540aSrobert	  return E2BIG;
404b540aSrobert	}
404b540aSrobert
404b540aSrobert      hi = (s - 0x10000) / 0x400 + 0xD800;
404b540aSrobert      lo = (s - 0x10000) % 0x400 + 0xDC00;
404b540aSrobert
404b540aSrobert      /* Even if we are little-endian, put the high surrogate first.
404b540aSrobert	 ??? Matches practice?  */
404b540aSrobert      outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
404b540aSrobert      outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
404b540aSrobert      outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
404b540aSrobert      outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
404b540aSrobert
404b540aSrobert      *outbufp += 4;
404b540aSrobert      *outbytesleftp -= 4;
404b540aSrobert      return 0;
404b540aSrobert    }
404b540aSrobert}
404b540aSrobert
404b540aSrobertstatic inline int
404b540aSrobertone_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
404b540aSrobert		   uchar **outbufp, size_t *outbytesleftp)
404b540aSrobert{
404b540aSrobert  cppchar_t s;
404b540aSrobert  const uchar *inbuf = *inbufp;
404b540aSrobert  int rval;
404b540aSrobert
404b540aSrobert  if (*inbytesleftp < 2)
404b540aSrobert    return EINVAL;
404b540aSrobert  s  = inbuf[bigend ? 0 : 1] << 8;
404b540aSrobert  s += inbuf[bigend ? 1 : 0];
404b540aSrobert
404b540aSrobert  /* Low surrogate without immediately preceding high surrogate is invalid.  */
404b540aSrobert  if (s >= 0xDC00 && s <= 0xDFFF)
404b540aSrobert    return EILSEQ;
404b540aSrobert  /* High surrogate must have a following low surrogate.  */
404b540aSrobert  else if (s >= 0xD800 && s <= 0xDBFF)
404b540aSrobert    {
404b540aSrobert      cppchar_t hi = s, lo;
404b540aSrobert      if (*inbytesleftp < 4)
404b540aSrobert	return EINVAL;
404b540aSrobert
404b540aSrobert      lo  = inbuf[bigend ? 2 : 3] << 8;
404b540aSrobert      lo += inbuf[bigend ? 3 : 2];
404b540aSrobert
404b540aSrobert      if (lo < 0xDC00 || lo > 0xDFFF)
404b540aSrobert	return EILSEQ;
404b540aSrobert
404b540aSrobert      s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
404b540aSrobert    }
404b540aSrobert
404b540aSrobert  rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
404b540aSrobert  if (rval)
404b540aSrobert    return rval;
404b540aSrobert
404b540aSrobert  /* Success - update the input pointers (one_cppchar_to_utf8 has done
404b540aSrobert     the output pointers for us).  */
404b540aSrobert  if (s <= 0xFFFF)
404b540aSrobert    {
404b540aSrobert      *inbufp += 2;
404b540aSrobert      *inbytesleftp -= 2;
404b540aSrobert    }
404b540aSrobert  else
404b540aSrobert    {
404b540aSrobert      *inbufp += 4;
404b540aSrobert      *inbytesleftp -= 4;
404b540aSrobert    }
404b540aSrobert  return 0;
404b540aSrobert}
404b540aSrobert
404b540aSrobert/* Helper routine for the next few functions.  The 'const' on
404b540aSrobert   one_conversion means that we promise not to modify what function is
404b540aSrobert   pointed to, which lets the inliner see through it.  */
404b540aSrobert
404b540aSrobertstatic inline bool
404b540aSrobertconversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
404b540aSrobert					     uchar **, size_t *),
404b540aSrobert		 iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to)
404b540aSrobert{
404b540aSrobert  const uchar *inbuf;
404b540aSrobert  uchar *outbuf;
404b540aSrobert  size_t inbytesleft, outbytesleft;
404b540aSrobert  int rval;
404b540aSrobert
404b540aSrobert  inbuf = from;
404b540aSrobert  inbytesleft = flen;
404b540aSrobert  outbuf = to->text + to->len;
404b540aSrobert  outbytesleft = to->asize - to->len;
404b540aSrobert
404b540aSrobert  for (;;)
404b540aSrobert    {
404b540aSrobert      do
404b540aSrobert	rval = one_conversion (cd, &inbuf, &inbytesleft,
404b540aSrobert			       &outbuf, &outbytesleft);
404b540aSrobert      while (inbytesleft && !rval);
404b540aSrobert
404b540aSrobert      if (__builtin_expect (inbytesleft == 0, 1))
404b540aSrobert	{
404b540aSrobert	  to->len = to->asize - outbytesleft;
404b540aSrobert	  return true;
404b540aSrobert	}
404b540aSrobert      if (rval != E2BIG)
404b540aSrobert	{
404b540aSrobert	  errno = rval;
404b540aSrobert	  return false;
404b540aSrobert	}
404b540aSrobert
404b540aSrobert      outbytesleft += OUTBUF_BLOCK_SIZE;
404b540aSrobert      to->asize += OUTBUF_BLOCK_SIZE;
404b540aSrobert      to->text = XRESIZEVEC (uchar, to->text, to->asize);
404b540aSrobert      outbuf = to->text + to->asize - outbytesleft;
404b540aSrobert    }
404b540aSrobert}
404b540aSrobert
404b540aSrobert
404b540aSrobert/* These functions convert entire strings between character sets.
404b540aSrobert   They all have the signature
404b540aSrobert
404b540aSrobert   bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
404b540aSrobert
404b540aSrobert   The input string FROM is converted as specified by the function
404b540aSrobert   name plus the iconv descriptor CD (which may be fake), and the
404b540aSrobert   result appended to TO.  On any error, false is returned, otherwise true.  */
404b540aSrobert
404b540aSrobert/* These four use the custom conversion code above.  */
404b540aSrobertstatic bool
404b540aSrobertconvert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
404b540aSrobert		    struct _cpp_strbuf *to)
404b540aSrobert{
404b540aSrobert  return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
404b540aSrobert}
404b540aSrobert
404b540aSrobertstatic bool
404b540aSrobertconvert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
404b540aSrobert		    struct _cpp_strbuf *to)
404b540aSrobert{
404b540aSrobert  return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
404b540aSrobert}
404b540aSrobert
404b540aSrobertstatic bool
404b540aSrobertconvert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
404b540aSrobert		    struct _cpp_strbuf *to)
404b540aSrobert{
404b540aSrobert  return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
404b540aSrobert}
404b540aSrobert
404b540aSrobertstatic bool
404b540aSrobertconvert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
404b540aSrobert		    struct _cpp_strbuf *to)
404b540aSrobert{
404b540aSrobert  return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
404b540aSrobert}
404b540aSrobert
404b540aSrobert/* Identity conversion, used when we have no alternative.  */
404b540aSrobertstatic bool
404b540aSrobertconvert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
404b540aSrobert		       const uchar *from, size_t flen, struct _cpp_strbuf *to)
404b540aSrobert{
404b540aSrobert  if (to->len + flen > to->asize)
404b540aSrobert    {
404b540aSrobert      to->asize = to->len + flen;
404b540aSrobert      to->text = XRESIZEVEC (uchar, to->text, to->asize);
404b540aSrobert    }
404b540aSrobert  memcpy (to->text + to->len, from, flen);
404b540aSrobert  to->len += flen;
404b540aSrobert  return true;
404b540aSrobert}
404b540aSrobert
/* Converter backed by the system iconv primitive.  Resets the
   descriptor CD, then converts the FLEN bytes at FROM and appends the
   result to TO, enlarging TO by OUTBUF_BLOCK_SIZE each time iconv fails
   with E2BIG.  Returns true once all input is consumed; returns false
   if the reset fails or iconv stops with any other errno.

   Without iconv the name is defined as 0 so that the references to it
   elsewhere in this file still compile.  */
#if HAVE_ICONV
static bool
convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
		     struct _cpp_strbuf *to)
{
  ICONV_CONST char *src = (ICONV_CONST char *)from;
  size_t src_left = flen;
  char *dst;
  size_t dst_left;

  /* Reset conversion descriptor and check that it is valid.  */
  if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
    return false;

  dst = (char *)to->text + to->len;
  dst_left = to->asize - to->len;

  while (1)
    {
      size_t filled;

      /* The return value is not inspected; progress is judged from the
	 remaining input and, when some is left, from errno.  */
      iconv (cd, &src, &src_left, &dst, &dst_left);
      if (__builtin_expect (src_left == 0, 1))
	{
	  to->len = to->asize - dst_left;
	  return true;
	}
      if (errno != E2BIG)
	return false;

      /* Grow the buffer by one block and re-derive the write position,
	 since the buffer may have moved.  */
      filled = to->asize - dst_left;
      to->asize += OUTBUF_BLOCK_SIZE;
      to->text = XRESIZEVEC (uchar, to->text, to->asize);
      dst = (char *)to->text + filled;
      dst_left = to->asize - filled;
    }
}
#else
#define convert_using_iconv 0 /* prevent undefined symbol error below */
#endif
404b540aSrobert
/* Arrange for the above custom conversion logic to be used automatically
   when conversion between a suitable pair of character sets is requested.  */

/* Run CONVERTER's conversion function on the FLEN bytes at FROM,
   passing CONVERTER's own descriptor and appending to TO; the value is
   whatever that function returns.  CONVERTER is evaluated twice, so it
   must not have side effects.  The arguments and the expansion are
   parenthesized so that expressions such as *ptr can be passed as
   CONVERTER.  */
#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
   ((CONVERTER).func ((CONVERTER).cd, (FROM), (FLEN), (TO)))
404b540aSrobert
/* One entry of the table of conversions handled by the custom code
   above instead of by iconv.  */
struct conversion
{
  const char *pair;   /* "FROM/TO" charset names; init_iconv_desc
			 compares against this ignoring case.  */
  convert_f func;     /* Whole-string converter to use for this pair.  */
  iconv_t fake_cd;    /* Stored as the converter's descriptor; the
			 custom converters read it as their BIGEND flag
			 (0 = little-endian, 1 = big-endian).  */
};

/* Charset pairs with a custom converter.  Each UTF-16/UTF-32 flavour
   appears once per byte order, in both directions to and from UTF-8.  */
static const struct conversion conversion_tab[] = {
  { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
  { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
  { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
  { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
  { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
  { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
  { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
  { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
};
404b540aSrobert
404b540aSrobert/* Subroutine of cpp_init_iconv: initialize and return a
404b540aSrobert   cset_converter structure for conversion from FROM to TO.  If
404b540aSrobert   iconv_open() fails, issue an error and return an identity
404b540aSrobert   converter.  Silently return an identity converter if FROM and TO
404b540aSrobert   are identical.  */
404b540aSrobertstatic struct cset_converter
404b540aSrobertinit_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
404b540aSrobert{
404b540aSrobert  struct cset_converter ret;
404b540aSrobert  char *pair;
404b540aSrobert  size_t i;
404b540aSrobert
404b540aSrobert  if (!strcasecmp (to, from))
404b540aSrobert    {
404b540aSrobert      ret.func = convert_no_conversion;
404b540aSrobert      ret.cd = (iconv_t) -1;
404b540aSrobert      return ret;
404b540aSrobert    }
404b540aSrobert
404b540aSrobert  pair = (char *) alloca(strlen(to) + strlen(from) + 2);
404b540aSrobert
404b540aSrobert  strcpy(pair, from);
404b540aSrobert  strcat(pair, "/");
404b540aSrobert  strcat(pair, to);
404b540aSrobert  for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
404b540aSrobert    if (!strcasecmp (pair, conversion_tab[i].pair))
404b540aSrobert      {
404b540aSrobert	ret.func = conversion_tab[i].func;
404b540aSrobert	ret.cd = conversion_tab[i].fake_cd;
404b540aSrobert	return ret;
404b540aSrobert      }
404b540aSrobert
404b540aSrobert  /* No custom converter - try iconv.  */
404b540aSrobert  if (HAVE_ICONV)
404b540aSrobert    {
404b540aSrobert      ret.func = convert_using_iconv;
404b540aSrobert      ret.cd = iconv_open (to, from);
404b540aSrobert
404b540aSrobert      if (ret.cd == (iconv_t) -1)
404b540aSrobert	{
404b540aSrobert	  if (errno == EINVAL)
404b540aSrobert	    cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
404b540aSrobert		       "conversion from %s to %s not supported by iconv",
404b540aSrobert		       from, to);
404b540aSrobert	  else
404b540aSrobert	    cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
404b540aSrobert
404b540aSrobert	  ret.func = convert_no_conversion;
404b540aSrobert	}
404b540aSrobert    }
404b540aSrobert  else
404b540aSrobert    {
404b540aSrobert      cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
404b540aSrobert		 "no iconv implementation, cannot convert from %s to %s",
404b540aSrobert		 from, to);
404b540aSrobert      ret.func = convert_no_conversion;
404b540aSrobert      ret.cd = (iconv_t) -1;
404b540aSrobert    }
404b540aSrobert  return ret;
404b540aSrobert}
404b540aSrobert
404b540aSrobert/* If charset conversion is requested, initialize iconv(3) descriptors
404b540aSrobert   for conversion from the source character set to the execution
404b540aSrobert   character sets.  If iconv is not present in the C library, and
404b540aSrobert   conversion is requested, issue an error.  */
404b540aSrobert
404b540aSrobertvoid
404b540aSrobertcpp_init_iconv (cpp_reader *pfile)
404b540aSrobert{
404b540aSrobert  const char *ncset = CPP_OPTION (pfile, narrow_charset);
404b540aSrobert  const char *wcset = CPP_OPTION (pfile, wide_charset);
404b540aSrobert  const char *default_wcset;
404b540aSrobert
404b540aSrobert  bool be = CPP_OPTION (pfile, bytes_big_endian);
404b540aSrobert
404b540aSrobert  if (CPP_OPTION (pfile, wchar_precision) >= 32)
404b540aSrobert    default_wcset = be ? "UTF-32BE" : "UTF-32LE";
404b540aSrobert  else if (CPP_OPTION (pfile, wchar_precision) >= 16)
404b540aSrobert    default_wcset = be ? "UTF-16BE" : "UTF-16LE";
404b540aSrobert  else
404b540aSrobert    /* This effectively means that wide strings are not supported,
404b540aSrobert       so don't do any conversion at all.  */
404b540aSrobert   default_wcset = SOURCE_CHARSET;
404b540aSrobert
404b540aSrobert  if (!ncset)
404b540aSrobert    ncset = SOURCE_CHARSET;
404b540aSrobert  if (!wcset)
404b540aSrobert    wcset = default_wcset;
404b540aSrobert
404b540aSrobert  pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
404b540aSrobert  pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
404b540aSrobert}
404b540aSrobert
404b540aSrobert/* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary.  */
404b540aSrobertvoid
404b540aSrobert_cpp_destroy_iconv (cpp_reader *pfile)
404b540aSrobert{
404b540aSrobert  if (HAVE_ICONV)
404b540aSrobert    {
404b540aSrobert      if (pfile->narrow_cset_desc.func == convert_using_iconv)
404b540aSrobert	iconv_close (pfile->narrow_cset_desc.cd);
404b540aSrobert      if (pfile->wide_cset_desc.func == convert_using_iconv)
404b540aSrobert	iconv_close (pfile->wide_cset_desc.cd);
404b540aSrobert    }
404b540aSrobert}
404b540aSrobert
404b540aSrobert/* Utility routine for use by a full compiler.  C is a character taken
404b540aSrobert   from the *basic* source character set, encoded in the host's
404b540aSrobert   execution encoding.  Convert it to (the target's) execution
404b540aSrobert   encoding, and return that value.
404b540aSrobert
404b540aSrobert   Issues an internal error if C's representation in the narrow
404b540aSrobert   execution character set fails to be a single-byte value (C99
404b540aSrobert   5.2.1p3: "The representation of each member of the source and
404b540aSrobert   execution character sets shall fit in a byte.")  May also issue an
404b540aSrobert   internal error if C fails to be a member of the basic source
404b540aSrobert   character set (testing this exactly is too hard, especially when
404b540aSrobert   the host character set is EBCDIC).  */
404b540aSrobertcppchar_t
404b540aSrobertcpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
404b540aSrobert{
404b540aSrobert  uchar sbuf[1];
404b540aSrobert  struct _cpp_strbuf tbuf;
404b540aSrobert
404b540aSrobert  /* This test is merely an approximation, but it suffices to catch
404b540aSrobert     the most important thing, which is that we don't get handed a
404b540aSrobert     character outside the unibyte range of the host character set.  */
404b540aSrobert  if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
404b540aSrobert    {
404b540aSrobert      cpp_error (pfile, CPP_DL_ICE,
404b540aSrobert		 "character 0x%lx is not in the basic source character set\n",
404b540aSrobert		 (unsigned long)c);
404b540aSrobert      return 0;
404b540aSrobert    }
404b540aSrobert
404b540aSrobert  /* Being a character in the unibyte range of the host character set,
404b540aSrobert     we can safely splat it into a one-byte buffer and trust that that
404b540aSrobert     is a well-formed string.  */
404b540aSrobert  sbuf[0] = c;
404b540aSrobert
404b540aSrobert  /* This should never need to reallocate, but just in case... */
404b540aSrobert  tbuf.asize = 1;
404b540aSrobert  tbuf.text = XNEWVEC (uchar, tbuf.asize);
404b540aSrobert  tbuf.len = 0;
404b540aSrobert
404b540aSrobert  if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
404b540aSrobert    {
404b540aSrobert      cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
404b540aSrobert      return 0;
404b540aSrobert    }
404b540aSrobert  if (tbuf.len != 1)
404b540aSrobert    {
404b540aSrobert      cpp_error (pfile, CPP_DL_ICE,
404b540aSrobert		 "character 0x%lx is not unibyte in execution character set",
404b540aSrobert		 (unsigned long)c);
404b540aSrobert      return 0;
404b540aSrobert    }
404b540aSrobert  c = tbuf.text[0];
404b540aSrobert  free(tbuf.text);
404b540aSrobert  return c;
404b540aSrobert}
404b540aSrobert
404b540aSrobert
404b540aSrobert
404b540aSrobert/* Utility routine that computes a mask of the form 0000...111... with
404b540aSrobert   WIDTH 1-bits.  */
404b540aSrobertstatic inline size_t
404b540aSrobertwidth_to_mask (size_t width)
404b540aSrobert{
404b540aSrobert  width = MIN (width, BITS_PER_CPPCHAR_T);
404b540aSrobert  if (width >= CHAR_BIT * sizeof (size_t))
404b540aSrobert    return ~(size_t) 0;
404b540aSrobert  else
404b540aSrobert    return ((size_t) 1 << width) - 1;
404b540aSrobert}
404b540aSrobert
/* A large table of unicode character information.  The constants
   below are the individual bits stored in each table entry's flags
   field.  */
enum {
  /* Valid in a C99 identifier?  */
  C99 = 1 << 0,
  /* Valid in a C99 identifier, but not as the first character?  */
  DIG = 1 << 1,
  /* Valid in a C++ identifier?  */
  CXX = 1 << 2,
  /* NFC representation is not valid in an identifier?  */
  CID = 1 << 3,
  /* Might be valid NFC form?  */
  NFC = 1 << 4,
  /* Might be valid NFKC form?  */
  NKC = 1 << 5,
  /* Certain preceding characters might make it not valid NFC/NFKC
     form?  */
  CTX = 1 << 6
};
404b540aSrobert
404b540aSrobertstatic const struct {
404b540aSrobert  /* Bitmap of flags above.  */
404b540aSrobert  unsigned char flags;
404b540aSrobert  /* Combining class of the character.  */
404b540aSrobert  unsigned char combine;
404b540aSrobert  /* Last character in the range described by this entry.  */
404b540aSrobert  unsigned short end;
404b540aSrobert} ucnranges[] = {
404b540aSrobert#include "ucnid.h"
404b540aSrobert};
404b540aSrobert
404b540aSrobert/* Returns 1 if C is valid in an identifier, 2 if C is valid except at
404b540aSrobert   the start of an identifier, and 0 if C is not valid in an
404b540aSrobert   identifier.  We assume C has already gone through the checks of
404b540aSrobert   _cpp_valid_ucn.  Also update NST for C if returning nonzero.  The
404b540aSrobert   algorithm is a simple binary search on the table defined in
404b540aSrobert   ucnid.h.  */
404b540aSrobert
404b540aSrobertstatic int
404b540aSrobertucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
404b540aSrobert			 struct normalize_state *nst)
404b540aSrobert{
404b540aSrobert  int mn, mx, md;
404b540aSrobert
404b540aSrobert  if (c > 0xFFFF)
404b540aSrobert    return 0;
404b540aSrobert
404b540aSrobert  mn = 0;
404b540aSrobert  mx = ARRAY_SIZE (ucnranges) - 1;
404b540aSrobert  while (mx != mn)
404b540aSrobert    {
404b540aSrobert      md = (mn + mx) / 2;
404b540aSrobert      if (c <= ucnranges[md].end)
404b540aSrobert	mx = md;
404b540aSrobert      else
404b540aSrobert	mn = md + 1;
404b540aSrobert    }
404b540aSrobert
404b540aSrobert  /* When -pedantic, we require the character to have been listed by
404b540aSrobert     the standard for the current language.  Otherwise, we accept the
404b540aSrobert     union of the acceptable sets for C++98 and C99.  */
404b540aSrobert  if (! (ucnranges[mn].flags & (C99 | CXX)))
404b540aSrobert      return 0;
404b540aSrobert
404b540aSrobert  if (CPP_PEDANTIC (pfile)
404b540aSrobert      && ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99))
404b540aSrobert	  || (CPP_OPTION (pfile, cplusplus)
404b540aSrobert	      && !(ucnranges[mn].flags & CXX))))
404b540aSrobert    return 0;
404b540aSrobert
404b540aSrobert  /* Update NST.  */
404b540aSrobert  if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
404b540aSrobert    nst->level = normalized_none;
404b540aSrobert  else if (ucnranges[mn].flags & CTX)
404b540aSrobert    {
404b540aSrobert      bool safe;
404b540aSrobert      cppchar_t p = nst->previous;
404b540aSrobert
404b540aSrobert      /* Easy cases from Bengali, Oriya, Tamil, Jannada, and Malayalam.  */
404b540aSrobert      if (c == 0x09BE)
404b540aSrobert	safe = p != 0x09C7;  /* Use 09CB instead of 09C7 09BE.  */
404b540aSrobert      else if (c == 0x0B3E)
404b540aSrobert	safe = p != 0x0B47;  /* Use 0B4B instead of 0B47 0B3E.  */
404b540aSrobert      else if (c == 0x0BBE)
404b540aSrobert	safe = p != 0x0BC6 && p != 0x0BC7;  /* Use 0BCA/0BCB instead.  */
404b540aSrobert      else if (c == 0x0CC2)
404b540aSrobert	safe = p != 0x0CC6;  /* Use 0CCA instead of 0CC6 0CC2.  */
404b540aSrobert      else if (c == 0x0D3E)
404b540aSrobert	safe = p != 0x0D46 && p != 0x0D47;  /* Use 0D4A/0D4B instead.  */
404b540aSrobert      /* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
404b540aSrobert	 and are combined algorithmically from a sequence of the form
404b540aSrobert	 1100-1112 1161-1175 11A8-11C2
404b540aSrobert	 (if the third is not present, it is treated as 11A7, which is not
404b540aSrobert	 really a valid character).
404b540aSrobert	 Unfortunately, C99 allows (only) the NFC form, but C++ allows
404b540aSrobert	 only the combining characters.  */
404b540aSrobert      else if (c >= 0x1161 && c <= 0x1175)
404b540aSrobert	safe = p < 0x1100 || p > 0x1112;
404b540aSrobert      else if (c >= 0x11A8 && c <= 0x11C2)
404b540aSrobert	safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0);
404b540aSrobert      else
404b540aSrobert	{
404b540aSrobert	  /* Uh-oh, someone updated ucnid.h without updating this code.  */
404b540aSrobert	  cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c);
404b540aSrobert	  safe = true;
404b540aSrobert	}
404b540aSrobert      if (!safe && c < 0x1161)
404b540aSrobert	nst->level = normalized_none;
404b540aSrobert      else if (!safe)
404b540aSrobert	nst->level = MAX (nst->level, normalized_identifier_C);
404b540aSrobert    }
404b540aSrobert  else if (ucnranges[mn].flags & NKC)
404b540aSrobert    ;
404b540aSrobert  else if (ucnranges[mn].flags & NFC)
404b540aSrobert    nst->level = MAX (nst->level, normalized_C);
404b540aSrobert  else if (ucnranges[mn].flags & CID)
404b540aSrobert    nst->level = MAX (nst->level, normalized_identifier_C);
404b540aSrobert  else
404b540aSrobert    nst->level = normalized_none;
404b540aSrobert  nst->previous = c;
404b540aSrobert  nst->prev_class = ucnranges[mn].combine;
404b540aSrobert
404b540aSrobert  /* In C99, UCN digits may not begin identifiers.  */
404b540aSrobert  if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG))
404b540aSrobert    return 2;
404b540aSrobert
404b540aSrobert  return 1;
404b540aSrobert}
404b540aSrobert
404b540aSrobert/* [lex.charset]: The character designated by the universal character
404b540aSrobert   name \UNNNNNNNN is that character whose character short name in
404b540aSrobert   ISO/IEC 10646 is NNNNNNNN; the character designated by the
404b540aSrobert   universal character name \uNNNN is that character whose character
404b540aSrobert   short name in ISO/IEC 10646 is 0000NNNN.  If the hexadecimal value
404b540aSrobert   for a universal character name is less than 0x20 or in the range
404b540aSrobert   0x7F-0x9F (inclusive), or if the universal character name
404b540aSrobert   designates a character in the basic source character set, then the
404b540aSrobert   program is ill-formed.
404b540aSrobert
404b540aSrobert   *PSTR must be preceded by "\u" or "\U"; it is assumed that the
404b540aSrobert   buffer end is delimited by a non-hex digit.  Returns zero if the
404b540aSrobert   UCN has not been consumed.
404b540aSrobert
404b540aSrobert   Otherwise the nonzero value of the UCN, whether valid or invalid,
404b540aSrobert   is returned.  Diagnostics are emitted for invalid values.  PSTR
404b540aSrobert   is updated to point one beyond the UCN, or to the syntactically
404b540aSrobert   invalid character.
404b540aSrobert
404b540aSrobert   IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
404b540aSrobert   an identifier, or 2 otherwise.  */
404b540aSrobert
404b540aSrobertcppchar_t
404b540aSrobert_cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
404b540aSrobert		const uchar *limit, int identifier_pos,
404b540aSrobert		struct normalize_state *nst)
404b540aSrobert{
404b540aSrobert  cppchar_t result, c;
404b540aSrobert  unsigned int length;
404b540aSrobert  const uchar *str = *pstr;
404b540aSrobert  const uchar *base = str - 2;
404b540aSrobert
404b540aSrobert  if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
404b540aSrobert    cpp_error (pfile, CPP_DL_WARNING,
404b540aSrobert	       "universal character names are only valid in C++ and C99");
404b540aSrobert  else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
404b540aSrobert    cpp_error (pfile, CPP_DL_WARNING,
404b540aSrobert	       "the meaning of '\\%c' is different in traditional C",
404b540aSrobert	       (int) str[-1]);
404b540aSrobert
404b540aSrobert  if (str[-1] == 'u')
404b540aSrobert    length = 4;
404b540aSrobert  else if (str[-1] == 'U')
404b540aSrobert    length = 8;
404b540aSrobert  else
404b540aSrobert    {
404b540aSrobert      cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN");
404b540aSrobert      length = 4;
404b540aSrobert    }
404b540aSrobert
404b540aSrobert  result = 0;
404b540aSrobert  do
404b540aSrobert    {
404b540aSrobert      c = *str;
404b540aSrobert      if (!ISXDIGIT (c))
404b540aSrobert	break;
404b540aSrobert      str++;
404b540aSrobert      result = (result << 4) + hex_value (c);
404b540aSrobert    }
404b540aSrobert  while (--length && str < limit);
404b540aSrobert
404b540aSrobert  /* Partial UCNs are not valid in strings, but decompose into
404b540aSrobert     multiple tokens in identifiers, so we can't give a helpful
404b540aSrobert     error message in that case.  */
404b540aSrobert  if (length && identifier_pos)
404b540aSrobert    return 0;
404b540aSrobert
404b540aSrobert  *pstr = str;
404b540aSrobert  if (length)
404b540aSrobert    {
404b540aSrobert      cpp_error (pfile, CPP_DL_ERROR,
404b540aSrobert		 "incomplete universal character name %.*s",
404b540aSrobert		 (int) (str - base), base);
404b540aSrobert      result = 1;
404b540aSrobert    }
404b540aSrobert  /* The standard permits $, @ and ` to be specified as UCNs.  We use
404b540aSrobert     hex escapes so that this also works with EBCDIC hosts.  */
404b540aSrobert  else if ((result < 0xa0
404b540aSrobert	    && (result != 0x24 && result != 0x40 && result != 0x60))
404b540aSrobert	   || (result & 0x80000000)
404b540aSrobert	   || (result >= 0xD800 && result <= 0xDFFF))
404b540aSrobert    {
404b540aSrobert      cpp_error (pfile, CPP_DL_ERROR,
404b540aSrobert		 "%.*s is not a valid universal character",
404b540aSrobert		 (int) (str - base), base);
404b540aSrobert      result = 1;
404b540aSrobert    }
404b540aSrobert  else if (identifier_pos && result == 0x24
404b540aSrobert	   && CPP_OPTION (pfile, dollars_in_ident))
404b540aSrobert    {
404b540aSrobert      if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
404b540aSrobert	{
404b540aSrobert	  CPP_OPTION (pfile, warn_dollars) = 0;
404b540aSrobert	  cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
404b540aSrobert	}
404b540aSrobert      NORMALIZE_STATE_UPDATE_IDNUM (nst);
404b540aSrobert    }
404b540aSrobert  else if (identifier_pos)
404b540aSrobert    {
404b540aSrobert      int validity = ucn_valid_in_identifier (pfile, result, nst);
404b540aSrobert
404b540aSrobert      if (validity == 0)
404b540aSrobert	cpp_error (pfile, CPP_DL_ERROR,
404b540aSrobert		   "universal character %.*s is not valid in an identifier",
404b540aSrobert		   (int) (str - base), base);
404b540aSrobert      else if (validity == 2 && identifier_pos == 1)
404b540aSrobert	cpp_error (pfile, CPP_DL_ERROR,
404b540aSrobert   "universal character %.*s is not valid at the start of an identifier",
404b540aSrobert		   (int) (str - base), base);
404b540aSrobert    }
404b540aSrobert
404b540aSrobert  if (result == 0)
404b540aSrobert    result = 1;
404b540aSrobert
404b540aSrobert  return result;
404b540aSrobert}
404b540aSrobert
404b540aSrobert/* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
404b540aSrobert   it to the execution character set and write the result into TBUF.
404b540aSrobert   An advanced pointer is returned.  Issues all relevant diagnostics.  */
404b540aSrobertstatic const uchar *
404b540aSrobertconvert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
404b540aSrobert	     struct _cpp_strbuf *tbuf, bool wide)
404b540aSrobert{
404b540aSrobert  cppchar_t ucn;
404b540aSrobert  uchar buf[6];
404b540aSrobert  uchar *bufp = buf;
404b540aSrobert  size_t bytesleft = 6;
404b540aSrobert  int rval;
404b540aSrobert  struct cset_converter cvt
404b540aSrobert    = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
404b540aSrobert  struct normalize_state nst = INITIAL_NORMALIZE_STATE;
404b540aSrobert
404b540aSrobert  from++;  /* Skip u/U.  */
404b540aSrobert  ucn = _cpp_valid_ucn (pfile, &from, limit, 0, &nst);
404b540aSrobert
404b540aSrobert  rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
404b540aSrobert  if (rval)
404b540aSrobert    {
404b540aSrobert      errno = rval;
404b540aSrobert      cpp_errno (pfile, CPP_DL_ERROR,
404b540aSrobert		 "converting UCN to source character set");
404b540aSrobert    }
404b540aSrobert  else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
404b540aSrobert    cpp_errno (pfile, CPP_DL_ERROR,
404b540aSrobert	       "converting UCN to execution character set");
404b540aSrobert
404b540aSrobert  return from;
404b540aSrobert}
404b540aSrobert
404b540aSrobert/* Subroutine of convert_hex and convert_oct.  N is the representation
404b540aSrobert   in the execution character set of a numeric escape; write it into the
404b540aSrobert   string buffer TBUF and update the end-of-string pointer therein.  WIDE
404b540aSrobert   is true if it's a wide string that's being assembled in TBUF.  This
404b540aSrobert   function issues no diagnostics and never fails.  */
404b540aSrobertstatic void
404b540aSrobertemit_numeric_escape (cpp_reader *pfile, cppchar_t n,
404b540aSrobert		     struct _cpp_strbuf *tbuf, bool wide)
404b540aSrobert{
404b540aSrobert  if (wide)
404b540aSrobert    {
404b540aSrobert      /* We have to render this into the target byte order, which may not
404b540aSrobert	 be our byte order.  */
404b540aSrobert      bool bigend = CPP_OPTION (pfile, bytes_big_endian);
404b540aSrobert      size_t width = CPP_OPTION (pfile, wchar_precision);
404b540aSrobert      size_t cwidth = CPP_OPTION (pfile, char_precision);
404b540aSrobert      size_t cmask = width_to_mask (cwidth);
404b540aSrobert      size_t nbwc = width / cwidth;
404b540aSrobert      size_t i;
404b540aSrobert      size_t off = tbuf->len;
404b540aSrobert      cppchar_t c;
404b540aSrobert
404b540aSrobert      if (tbuf->len + nbwc > tbuf->asize)
404b540aSrobert	{
404b540aSrobert	  tbuf->asize += OUTBUF_BLOCK_SIZE;
404b540aSrobert	  tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
404b540aSrobert	}
404b540aSrobert
404b540aSrobert      for (i = 0; i < nbwc; i++)
404b540aSrobert	{
404b540aSrobert	  c = n & cmask;
404b540aSrobert	  n >>= cwidth;
404b540aSrobert	  tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
404b540aSrobert	}
404b540aSrobert      tbuf->len += nbwc;
404b540aSrobert    }
404b540aSrobert  else
404b540aSrobert    {
404b540aSrobert      /* Note: this code does not handle the case where the target
404b540aSrobert	 and host have a different number of bits in a byte.  */
404b540aSrobert      if (tbuf->len + 1 > tbuf->asize)
404b540aSrobert	{
404b540aSrobert	  tbuf->asize += OUTBUF_BLOCK_SIZE;
404b540aSrobert	  tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
404b540aSrobert	}
404b540aSrobert      tbuf->text[tbuf->len++] = n;
404b540aSrobert    }
404b540aSrobert}
404b540aSrobert
404b540aSrobert/* Convert a hexadecimal escape, pointed to by FROM, to the execution
404b540aSrobert   character set and write it into the string buffer TBUF.  Returns an
404b540aSrobert   advanced pointer, and issues diagnostics as necessary.
404b540aSrobert   No character set translation occurs; this routine always produces the
404b540aSrobert   execution-set character with numeric value equal to the given hex
404b540aSrobert   number.  You can, e.g. generate surrogate pairs this way.  */
404b540aSrobertstatic const uchar *
404b540aSrobertconvert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
404b540aSrobert	     struct _cpp_strbuf *tbuf, bool wide)
404b540aSrobert{
404b540aSrobert  cppchar_t c, n = 0, overflow = 0;
404b540aSrobert  int digits_found = 0;
404b540aSrobert  size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
404b540aSrobert		  : CPP_OPTION (pfile, char_precision));
404b540aSrobert  size_t mask = width_to_mask (width);
404b540aSrobert
404b540aSrobert  if (CPP_WTRADITIONAL (pfile))
404b540aSrobert    cpp_error (pfile, CPP_DL_WARNING,
404b540aSrobert	       "the meaning of '\\x' is different in traditional C");
404b540aSrobert
404b540aSrobert  from++;  /* Skip 'x'.  */
404b540aSrobert  while (from < limit)
404b540aSrobert    {
404b540aSrobert      c = *from;
404b540aSrobert      if (! hex_p (c))
404b540aSrobert	break;
404b540aSrobert      from++;
404b540aSrobert      overflow |= n ^ (n << 4 >> 4);
404b540aSrobert      n = (n << 4) + hex_value (c);
404b540aSrobert      digits_found = 1;
404b540aSrobert    }
404b540aSrobert
404b540aSrobert  if (!digits_found)
404b540aSrobert    {
404b540aSrobert      cpp_error (pfile, CPP_DL_ERROR,
404b540aSrobert		 "\\x used with no following hex digits");
404b540aSrobert      return from;
404b540aSrobert    }
404b540aSrobert
404b540aSrobert  if (overflow | (n != (n & mask)))
404b540aSrobert    {
404b540aSrobert      cpp_error (pfile, CPP_DL_PEDWARN,
404b540aSrobert		 "hex escape sequence out of range");
404b540aSrobert      n &= mask;
404b540aSrobert    }
404b540aSrobert
404b540aSrobert  emit_numeric_escape (pfile, n, tbuf, wide);
404b540aSrobert
404b540aSrobert  return from;
404b540aSrobert}
404b540aSrobert
404b540aSrobert/* Convert an octal escape, pointed to by FROM, to the execution
404b540aSrobert   character set and write it into the string buffer TBUF.  Returns an
404b540aSrobert   advanced pointer, and issues diagnostics as necessary.
404b540aSrobert   No character set translation occurs; this routine always produces the
404b540aSrobert   execution-set character with numeric value equal to the given octal
404b540aSrobert   number.  */
404b540aSrobertstatic const uchar *
404b540aSrobertconvert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
404b540aSrobert	     struct _cpp_strbuf *tbuf, bool wide)
404b540aSrobert{
404b540aSrobert  size_t count = 0;
404b540aSrobert  cppchar_t c, n = 0;
404b540aSrobert  size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
404b540aSrobert		  : CPP_OPTION (pfile, char_precision));
404b540aSrobert  size_t mask = width_to_mask (width);
404b540aSrobert  bool overflow = false;
404b540aSrobert
404b540aSrobert  while (from < limit && count++ < 3)
404b540aSrobert    {
404b540aSrobert      c = *from;
404b540aSrobert      if (c < '0' || c > '7')
404b540aSrobert	break;
404b540aSrobert      from++;
404b540aSrobert      overflow |= n ^ (n << 3 >> 3);
404b540aSrobert      n = (n << 3) + c - '0';
404b540aSrobert    }
404b540aSrobert
404b540aSrobert  if (n != (n & mask))
404b540aSrobert    {
404b540aSrobert      cpp_error (pfile, CPP_DL_PEDWARN,
404b540aSrobert		 "octal escape sequence out of range");
404b540aSrobert      n &= mask;
404b540aSrobert    }
404b540aSrobert
404b540aSrobert  emit_numeric_escape (pfile, n, tbuf, wide);
404b540aSrobert
404b540aSrobert  return from;
404b540aSrobert}
404b540aSrobert
404b540aSrobert/* Convert an escape sequence (pointed to by FROM) to its value on
404b540aSrobert   the target, and to the execution character set.  Do not scan past
404b540aSrobert   LIMIT.  Write the converted value into TBUF.  Returns an advanced
404b540aSrobert   pointer.  Handles all relevant diagnostics.  */
404b540aSrobertstatic const uchar *
404b540aSrobertconvert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
404b540aSrobert		struct _cpp_strbuf *tbuf, bool wide)
404b540aSrobert{
404b540aSrobert  /* Values of \a \b \e \f \n \r \t \v respectively.  */
404b540aSrobert#if HOST_CHARSET == HOST_CHARSET_ASCII
404b540aSrobert  static const uchar charconsts[] = {  7,  8, 27, 12, 10, 13,  9, 11 };
404b540aSrobert#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
404b540aSrobert  static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13,  5, 11 };
404b540aSrobert#else
404b540aSrobert#error "unknown host character set"
404b540aSrobert#endif
404b540aSrobert
404b540aSrobert  uchar c;
404b540aSrobert  struct cset_converter cvt
404b540aSrobert    = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
404b540aSrobert
404b540aSrobert  c = *from;
404b540aSrobert  switch (c)
404b540aSrobert    {
404b540aSrobert      /* UCNs, hex escapes, and octal escapes are processed separately.  */
404b540aSrobert    case 'u': case 'U':
404b540aSrobert      return convert_ucn (pfile, from, limit, tbuf, wide);
404b540aSrobert
404b540aSrobert    case 'x':
404b540aSrobert      return convert_hex (pfile, from, limit, tbuf, wide);
404b540aSrobert      break;
404b540aSrobert
404b540aSrobert    case '0':  case '1':  case '2':  case '3':
404b540aSrobert    case '4':  case '5':  case '6':  case '7':
404b540aSrobert      return convert_oct (pfile, from, limit, tbuf, wide);
404b540aSrobert
404b540aSrobert      /* Various letter escapes.  Get the appropriate host-charset
404b540aSrobert	 value into C.  */
404b540aSrobert    case '\\': case '\'': case '"': case '?': break;
404b540aSrobert
404b540aSrobert    case '(': case '{': case '[': case '%':
404b540aSrobert      /* '\(', etc, can be used at the beginning of a line in a long
404b540aSrobert	 string split onto multiple lines with \-newline, to prevent
404b540aSrobert	 Emacs or other text editors from getting confused.  '\%' can
404b540aSrobert	 be used to prevent SCCS from mangling printf format strings.  */
404b540aSrobert      if (CPP_PEDANTIC (pfile))
404b540aSrobert	goto unknown;
404b540aSrobert      break;
404b540aSrobert
404b540aSrobert    case 'b': c = charconsts[1];  break;
404b540aSrobert    case 'f': c = charconsts[3];  break;
404b540aSrobert    case 'n': c = charconsts[4];  break;
404b540aSrobert    case 'r': c = charconsts[5];  break;
404b540aSrobert    case 't': c = charconsts[6];  break;
404b540aSrobert    case 'v': c = charconsts[7];  break;
404b540aSrobert
404b540aSrobert    case 'a':
404b540aSrobert      if (CPP_WTRADITIONAL (pfile))
404b540aSrobert	cpp_error (pfile, CPP_DL_WARNING,
404b540aSrobert		   "the meaning of '\\a' is different in traditional C");
404b540aSrobert      c = charconsts[0];
404b540aSrobert      break;
404b540aSrobert
404b540aSrobert    case 'e': case 'E':
404b540aSrobert      if (CPP_PEDANTIC (pfile))
404b540aSrobert	cpp_error (pfile, CPP_DL_PEDWARN,
404b540aSrobert		   "non-ISO-standard escape sequence, '\\%c'", (int) c);
404b540aSrobert      c = charconsts[2];
404b540aSrobert      break;
404b540aSrobert
404b540aSrobert    default:
404b540aSrobert    unknown:
404b540aSrobert      if (ISGRAPH (c))
404b540aSrobert	cpp_error (pfile, CPP_DL_PEDWARN,
404b540aSrobert		   "unknown escape sequence '\\%c'", (int) c);
404b540aSrobert      else
404b540aSrobert	{
404b540aSrobert	  /* diagnostic.c does not support "%03o".  When it does, this
404b540aSrobert	     code can use %03o directly in the diagnostic again.  */
404b540aSrobert	  char buf[32];
404b540aSrobert	  sprintf(buf, "%03o", (int) c);
404b540aSrobert	  cpp_error (pfile, CPP_DL_PEDWARN,
404b540aSrobert		     "unknown escape sequence: '\\%s'", buf);
404b540aSrobert	}
404b540aSrobert    }
404b540aSrobert
404b540aSrobert  /* Now convert what we have to the execution character set.  */
404b540aSrobert  if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
404b540aSrobert    cpp_errno (pfile, CPP_DL_ERROR,
404b540aSrobert	       "converting escape sequence to execution character set");
404b540aSrobert
404b540aSrobert  return from + 1;
404b540aSrobert}
404b540aSrobert
404b540aSrobert/* FROM is an array of cpp_string structures of length COUNT.  These
404b540aSrobert   are to be converted from the source to the execution character set,
404b540aSrobert   escape sequences translated, and finally all are to be
404b540aSrobert   concatenated.  WIDE indicates whether or not to produce a wide
404b540aSrobert   string.  The result is written into TO.  Returns true for success,
404b540aSrobert   false for failure.  */
404b540aSrobertbool
404b540aSrobertcpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
404b540aSrobert		      cpp_string *to, bool wide)
404b540aSrobert{
404b540aSrobert  struct _cpp_strbuf tbuf;
404b540aSrobert  const uchar *p, *base, *limit;
404b540aSrobert  size_t i;
404b540aSrobert  struct cset_converter cvt
404b540aSrobert    = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
404b540aSrobert
404b540aSrobert  tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
404b540aSrobert  tbuf.text = XNEWVEC (uchar, tbuf.asize);
404b540aSrobert  tbuf.len = 0;
404b540aSrobert
404b540aSrobert  for (i = 0; i < count; i++)
404b540aSrobert    {
404b540aSrobert      p = from[i].text;
404b540aSrobert      if (*p == 'L') p++;
404b540aSrobert      p++; /* Skip leading quote.  */
404b540aSrobert      limit = from[i].text + from[i].len - 1; /* Skip trailing quote.  */
404b540aSrobert
404b540aSrobert      for (;;)
404b540aSrobert	{
404b540aSrobert	  base = p;
404b540aSrobert	  while (p < limit && *p != '\\')
404b540aSrobert	    p++;
404b540aSrobert	  if (p > base)
404b540aSrobert	    {
404b540aSrobert	      /* We have a run of normal characters; these can be fed
404b540aSrobert		 directly to convert_cset.  */
404b540aSrobert	      if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
404b540aSrobert		goto fail;
404b540aSrobert	    }
404b540aSrobert	  if (p == limit)
404b540aSrobert	    break;
404b540aSrobert
404b540aSrobert	  p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
404b540aSrobert	}
404b540aSrobert    }
404b540aSrobert  /* NUL-terminate the 'to' buffer and translate it to a cpp_string
404b540aSrobert     structure.  */
404b540aSrobert  emit_numeric_escape (pfile, 0, &tbuf, wide);
404b540aSrobert  tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len);
404b540aSrobert  to->text = tbuf.text;
404b540aSrobert  to->len = tbuf.len;
404b540aSrobert  return true;
404b540aSrobert
404b540aSrobert fail:
404b540aSrobert  cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
404b540aSrobert  free (tbuf.text);
404b540aSrobert  return false;
404b540aSrobert}
404b540aSrobert
404b540aSrobert/* Subroutine of do_line and do_linemarker.  Convert escape sequences
404b540aSrobert   in a string, but do not perform character set conversion.  */
404b540aSrobertbool
404b540aSrobertcpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
404b540aSrobert				  size_t count,	cpp_string *to, bool wide)
404b540aSrobert{
404b540aSrobert  struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
404b540aSrobert  bool retval;
404b540aSrobert
404b540aSrobert  pfile->narrow_cset_desc.func = convert_no_conversion;
404b540aSrobert  pfile->narrow_cset_desc.cd = (iconv_t) -1;
404b540aSrobert
404b540aSrobert  retval = cpp_interpret_string (pfile, from, count, to, wide);
404b540aSrobert
404b540aSrobert  pfile->narrow_cset_desc = save_narrow_cset_desc;
404b540aSrobert  return retval;
404b540aSrobert}
404b540aSrobert
404b540aSrobert
404b540aSrobert/* Subroutine of cpp_interpret_charconst which performs the conversion
404b540aSrobert   to a number, for narrow strings.  STR is the string structure returned
404b540aSrobert   by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
404b540aSrobert   cpp_interpret_charconst.  */
404b540aSrobertstatic cppchar_t
404b540aSrobertnarrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
404b540aSrobert			 unsigned int *pchars_seen, int *unsignedp)
404b540aSrobert{
404b540aSrobert  size_t width = CPP_OPTION (pfile, char_precision);
404b540aSrobert  size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
404b540aSrobert  size_t mask = width_to_mask (width);
404b540aSrobert  size_t i;
404b540aSrobert  cppchar_t result, c;
404b540aSrobert  bool unsigned_p;
404b540aSrobert
404b540aSrobert  /* The value of a multi-character character constant, or a
404b540aSrobert     single-character character constant whose representation in the
404b540aSrobert     execution character set is more than one byte long, is
404b540aSrobert     implementation defined.  This implementation defines it to be the
404b540aSrobert     number formed by interpreting the byte sequence in memory as a
404b540aSrobert     big-endian binary number.  If overflow occurs, the high bytes are
404b540aSrobert     lost, and a warning is issued.
404b540aSrobert
404b540aSrobert     We don't want to process the NUL terminator handed back by
404b540aSrobert     cpp_interpret_string.  */
404b540aSrobert  result = 0;
404b540aSrobert  for (i = 0; i < str.len - 1; i++)
404b540aSrobert    {
404b540aSrobert      c = str.text[i] & mask;
404b540aSrobert      if (width < BITS_PER_CPPCHAR_T)
404b540aSrobert	result = (result << width) | c;
404b540aSrobert      else
404b540aSrobert	result = c;
404b540aSrobert    }
404b540aSrobert
404b540aSrobert  if (i > max_chars)
404b540aSrobert    {
404b540aSrobert      i = max_chars;
404b540aSrobert      cpp_error (pfile, CPP_DL_WARNING,
404b540aSrobert		 "character constant too long for its type");
404b540aSrobert    }
404b540aSrobert  else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
404b540aSrobert    cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant");
404b540aSrobert
404b540aSrobert  /* Multichar constants are of type int and therefore signed.  */
404b540aSrobert  if (i > 1)
404b540aSrobert    unsigned_p = 0;
404b540aSrobert  else
404b540aSrobert    unsigned_p = CPP_OPTION (pfile, unsigned_char);
404b540aSrobert
404b540aSrobert  /* Truncate the constant to its natural width, and simultaneously
404b540aSrobert     sign- or zero-extend to the full width of cppchar_t.
404b540aSrobert     For single-character constants, the value is WIDTH bits wide.
404b540aSrobert     For multi-character constants, the value is INT_PRECISION bits wide.  */
404b540aSrobert  if (i > 1)
404b540aSrobert    width = CPP_OPTION (pfile, int_precision);
404b540aSrobert  if (width < BITS_PER_CPPCHAR_T)
404b540aSrobert    {
404b540aSrobert      mask = ((cppchar_t) 1 << width) - 1;
404b540aSrobert      if (unsigned_p || !(result & (1 << (width - 1))))
404b540aSrobert	result &= mask;
404b540aSrobert      else
404b540aSrobert	result |= ~mask;
404b540aSrobert    }
404b540aSrobert  *pchars_seen = i;
404b540aSrobert  *unsignedp = unsigned_p;
404b540aSrobert  return result;
404b540aSrobert}
404b540aSrobert
404b540aSrobert/* Subroutine of cpp_interpret_charconst which performs the conversion
404b540aSrobert   to a number, for wide strings.  STR is the string structure returned
404b540aSrobert   by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
404b540aSrobert   cpp_interpret_charconst.  */
404b540aSrobertstatic cppchar_t
404b540aSrobertwide_str_to_charconst (cpp_reader *pfile, cpp_string str,
404b540aSrobert		       unsigned int *pchars_seen, int *unsignedp)
404b540aSrobert{
404b540aSrobert  bool bigend = CPP_OPTION (pfile, bytes_big_endian);
404b540aSrobert  size_t width = CPP_OPTION (pfile, wchar_precision);
404b540aSrobert  size_t cwidth = CPP_OPTION (pfile, char_precision);
404b540aSrobert  size_t mask = width_to_mask (width);
404b540aSrobert  size_t cmask = width_to_mask (cwidth);
404b540aSrobert  size_t nbwc = width / cwidth;
404b540aSrobert  size_t off, i;
404b540aSrobert  cppchar_t result = 0, c;
404b540aSrobert
404b540aSrobert  /* This is finicky because the string is in the target's byte order,
404b540aSrobert     which may not be our byte order.  Only the last character, ignoring
404b540aSrobert     the NUL terminator, is relevant.  */
404b540aSrobert  off = str.len - (nbwc * 2);
404b540aSrobert  result = 0;
404b540aSrobert  for (i = 0; i < nbwc; i++)
404b540aSrobert    {
404b540aSrobert      c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
404b540aSrobert      result = (result << cwidth) | (c & cmask);
404b540aSrobert    }
404b540aSrobert
404b540aSrobert  /* Wide character constants have type wchar_t, and a single
404b540aSrobert     character exactly fills a wchar_t, so a multi-character wide
404b540aSrobert     character constant is guaranteed to overflow.  */
404b540aSrobert  if (off > 0)
404b540aSrobert    cpp_error (pfile, CPP_DL_WARNING,
404b540aSrobert	       "character constant too long for its type");
404b540aSrobert
404b540aSrobert  /* Truncate the constant to its natural width, and simultaneously
404b540aSrobert     sign- or zero-extend to the full width of cppchar_t.  */
404b540aSrobert  if (width < BITS_PER_CPPCHAR_T)
404b540aSrobert    {
404b540aSrobert      if (CPP_OPTION (pfile, unsigned_wchar) || !(result & (1 << (width - 1))))
404b540aSrobert	result &= mask;
404b540aSrobert      else
404b540aSrobert	result |= ~mask;
404b540aSrobert    }
404b540aSrobert
404b540aSrobert  *unsignedp = CPP_OPTION (pfile, unsigned_wchar);
404b540aSrobert  *pchars_seen = 1;
404b540aSrobert  return result;
404b540aSrobert}
404b540aSrobert
404b540aSrobert/* Interpret a (possibly wide) character constant in TOKEN.
404b540aSrobert   PCHARS_SEEN points to a variable that is filled in with the number
404b540aSrobert   of characters seen, and UNSIGNEDP to a variable that indicates
404b540aSrobert   whether the result has signed type.  */
404b540aSrobertcppchar_t
404b540aSrobertcpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
404b540aSrobert			 unsigned int *pchars_seen, int *unsignedp)
404b540aSrobert{
404b540aSrobert  cpp_string str = { 0, 0 };
404b540aSrobert  bool wide = (token->type == CPP_WCHAR);
404b540aSrobert  cppchar_t result;
404b540aSrobert
404b540aSrobert  /* an empty constant will appear as L'' or '' */
404b540aSrobert  if (token->val.str.len == (size_t) (2 + wide))
404b540aSrobert    {
404b540aSrobert      cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
404b540aSrobert      return 0;
404b540aSrobert    }
404b540aSrobert  else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
404b540aSrobert    return 0;
404b540aSrobert
404b540aSrobert  if (wide)
404b540aSrobert    result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
404b540aSrobert  else
404b540aSrobert    result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
404b540aSrobert
404b540aSrobert  if (str.text != token->val.str.text)
404b540aSrobert    free ((void *)str.text);
404b540aSrobert
404b540aSrobert  return result;
404b540aSrobert}
404b540aSrobert
404b540aSrobert/* Convert an identifier denoted by ID and LEN, which might contain
404b540aSrobert   UCN escapes, to the source character set, either UTF-8 or
404b540aSrobert   UTF-EBCDIC.  Assumes that the identifier is actually a valid identifier.  */
404b540aSrobertcpp_hashnode *
404b540aSrobert_cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
404b540aSrobert{
404b540aSrobert  /* It turns out that a UCN escape always turns into fewer characters
404b540aSrobert     than the escape itself, so we can allocate a temporary in advance.  */
404b540aSrobert  uchar * buf = (uchar *) alloca (len + 1);
404b540aSrobert  uchar * bufp = buf;
404b540aSrobert  size_t idp;
404b540aSrobert
404b540aSrobert  for (idp = 0; idp < len; idp++)
404b540aSrobert    if (id[idp] != '\\')
404b540aSrobert      *bufp++ = id[idp];
404b540aSrobert    else
404b540aSrobert      {
404b540aSrobert	unsigned length = id[idp+1] == 'u' ? 4 : 8;
404b540aSrobert	cppchar_t value = 0;
404b540aSrobert	size_t bufleft = len - (bufp - buf);
404b540aSrobert	int rval;
404b540aSrobert
404b540aSrobert	idp += 2;
404b540aSrobert	while (length && idp < len && ISXDIGIT (id[idp]))
404b540aSrobert	  {
404b540aSrobert	    value = (value << 4) + hex_value (id[idp]);
404b540aSrobert	    idp++;
404b540aSrobert	    length--;
404b540aSrobert	  }
404b540aSrobert	idp--;
404b540aSrobert
404b540aSrobert	/* Special case for EBCDIC: if the identifier contains
404b540aSrobert	   a '$' specified using a UCN, translate it to EBCDIC.  */
404b540aSrobert	if (value == 0x24)
404b540aSrobert	  {
404b540aSrobert	    *bufp++ = '$';
404b540aSrobert	    continue;
404b540aSrobert	  }
404b540aSrobert
404b540aSrobert	rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
404b540aSrobert	if (rval)
404b540aSrobert	  {
404b540aSrobert	    errno = rval;
404b540aSrobert	    cpp_errno (pfile, CPP_DL_ERROR,
404b540aSrobert		       "converting UCN to source character set");
404b540aSrobert	    break;
404b540aSrobert	  }
404b540aSrobert      }
404b540aSrobert
404b540aSrobert  return CPP_HASHNODE (ht_lookup (pfile->hash_table,
404b540aSrobert				  buf, bufp - buf, HT_ALLOC));
404b540aSrobert}
404b540aSrobert
404b540aSrobert/* Convert an input buffer (containing the complete contents of one
404b540aSrobert   source file) from INPUT_CHARSET to the source character set.  INPUT
404b540aSrobert   points to the input buffer, SIZE is its allocated size, and LEN is
404b540aSrobert   the length of the meaningful data within the buffer.  The
404b540aSrobert   translated buffer is returned, and *ST_SIZE is set to the length of
404b540aSrobert   the meaningful data within the translated buffer.
404b540aSrobert
404b540aSrobert   INPUT is expected to have been allocated with xmalloc.  This function
404b540aSrobert   will either return INPUT, or free it and return a pointer to another
404b540aSrobert   xmalloc-allocated block of memory.  */
404b540aSrobertuchar *
404b540aSrobert_cpp_convert_input (cpp_reader *pfile, const char *input_charset,
404b540aSrobert		    uchar *input, size_t size, size_t len, off_t *st_size)
404b540aSrobert{
404b540aSrobert  struct cset_converter input_cset;
404b540aSrobert  struct _cpp_strbuf to;
404b540aSrobert
404b540aSrobert  input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
404b540aSrobert  if (input_cset.func == convert_no_conversion)
404b540aSrobert    {
404b540aSrobert      to.text = input;
404b540aSrobert      to.asize = size;
404b540aSrobert      to.len = len;
404b540aSrobert    }
404b540aSrobert  else
404b540aSrobert    {
404b540aSrobert      to.asize = MAX (65536, len);
404b540aSrobert      to.text = XNEWVEC (uchar, to.asize);
404b540aSrobert      to.len = 0;
404b540aSrobert
404b540aSrobert      if (!APPLY_CONVERSION (input_cset, input, len, &to))
404b540aSrobert	cpp_error (pfile, CPP_DL_ERROR,
404b540aSrobert		   "failure to convert %s to %s",
404b540aSrobert		   CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
404b540aSrobert
404b540aSrobert      free (input);
404b540aSrobert    }
404b540aSrobert
404b540aSrobert  /* Clean up the mess.  */
404b540aSrobert  if (input_cset.func == convert_using_iconv)
404b540aSrobert    iconv_close (input_cset.cd);
404b540aSrobert
404b540aSrobert  /* Resize buffer if we allocated substantially too much, or if we
404b540aSrobert     haven't enough space for the \n-terminator.  */
404b540aSrobert  if (to.len + 4096 < to.asize || to.len >= to.asize)
404b540aSrobert    to.text = XRESIZEVEC (uchar, to.text, to.len + 1);
404b540aSrobert
404b540aSrobert  /* If the file is using old-school Mac line endings (\r only),
404b540aSrobert     terminate with another \r, not an \n, so that we do not mistake
404b540aSrobert     the \r\n sequence for a single DOS line ending and erroneously
404b540aSrobert     issue the "No newline at end of file" diagnostic.  */
*1c4aaf6cSkili  if (to.len > 0 && to.text[to.len - 1] == '\r')
404b540aSrobert    to.text[to.len] = '\r';
404b540aSrobert  else
404b540aSrobert    to.text[to.len] = '\n';
404b540aSrobert
404b540aSrobert  *st_size = to.len;
404b540aSrobert  return to.text;
404b540aSrobert}
404b540aSrobert
404b540aSrobert/* Decide on the default encoding to assume for input files.  */
404b540aSrobertconst char *
404b540aSrobert_cpp_default_encoding (void)
404b540aSrobert{
404b540aSrobert  const char *current_encoding = NULL;
404b540aSrobert
404b540aSrobert  /* We disable this because the default codeset is 7-bit ASCII on
404b540aSrobert     most platforms, and this causes conversion failures on every
404b540aSrobert     file in GCC that happens to have one of the upper 128 characters
404b540aSrobert     in it -- most likely, as part of the name of a contributor.
404b540aSrobert     We should definitely recognize in-band markers of file encoding,
404b540aSrobert     like:
404b540aSrobert     - the appropriate Unicode byte-order mark (FE FF) to recognize
404b540aSrobert       UTF16 and UCS4 (in both big-endian and little-endian flavors)
404b540aSrobert       and UTF8
404b540aSrobert     - a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
404b540aSrobert       distinguish ASCII and EBCDIC.
404b540aSrobert     - now we can parse something like "#pragma GCC encoding <xyz>
404b540aSrobert       on the first line, or even Emacs/VIM's mode line tags (there's
404b540aSrobert       a problem here in that VIM uses the last line, and Emacs has
404b540aSrobert       its more elaborate "local variables" convention).
404b540aSrobert     - investigate whether Java has another common convention, which
404b540aSrobert       would be friendly to support.
404b540aSrobert     (Zack Weinberg and Paolo Bonzini, May 20th 2004)  */
404b540aSrobert#if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
404b540aSrobert  setlocale (LC_CTYPE, "");
404b540aSrobert  current_encoding = nl_langinfo (CODESET);
404b540aSrobert#endif
404b540aSrobert  if (current_encoding == NULL || *current_encoding == '\0')
404b540aSrobert    current_encoding = SOURCE_CHARSET;
404b540aSrobert
404b540aSrobert  return current_encoding;
404b540aSrobert}