xref: /openbsd-src/gnu/gcc/libcpp/charset.c (revision 1c4aaf6cd14a97342bfb0950f36366339645d326)
1404b540aSrobert /* CPP Library - charsets
2404b540aSrobert    Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004
3404b540aSrobert    Free Software Foundation, Inc.
4404b540aSrobert 
5404b540aSrobert    Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
6404b540aSrobert 
7404b540aSrobert This program is free software; you can redistribute it and/or modify it
8404b540aSrobert under the terms of the GNU General Public License as published by the
9404b540aSrobert Free Software Foundation; either version 2, or (at your option) any
10404b540aSrobert later version.
11404b540aSrobert 
12404b540aSrobert This program is distributed in the hope that it will be useful,
13404b540aSrobert but WITHOUT ANY WARRANTY; without even the implied warranty of
14404b540aSrobert MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15404b540aSrobert GNU General Public License for more details.
16404b540aSrobert 
17404b540aSrobert You should have received a copy of the GNU General Public License
18404b540aSrobert along with this program; if not, write to the Free Software
19404b540aSrobert Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.  */
20404b540aSrobert 
21404b540aSrobert #include "config.h"
22404b540aSrobert #include "system.h"
23404b540aSrobert #include "cpplib.h"
24404b540aSrobert #include "internal.h"
25404b540aSrobert 
26404b540aSrobert /* Character set handling for C-family languages.
27404b540aSrobert 
28404b540aSrobert    Terminological note: In what follows, "charset" or "character set"
29404b540aSrobert    will be taken to mean both an abstract set of characters and an
30404b540aSrobert    encoding for that set.
31404b540aSrobert 
32404b540aSrobert    The C99 standard discusses two character sets: source and execution.
33404b540aSrobert    The source character set is used for internal processing in translation
34404b540aSrobert    phases 1 through 4; the execution character set is used thereafter.
35404b540aSrobert    Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
36404b540aSrobert    character encodings (see 3.7.2, 3.7.3 for the standardese meanings
37404b540aSrobert    of these terms).  Furthermore, the "basic character set" (listed in
38404b540aSrobert    5.2.1p3) is to be encoded in each with values one byte wide, and is
39404b540aSrobert    to appear in the initial shift state.
40404b540aSrobert 
41404b540aSrobert    It is not explicitly mentioned, but there is also a "wide execution
42404b540aSrobert    character set" used to encode wide character constants and wide
43404b540aSrobert    string literals; this is supposed to be the result of applying the
44404b540aSrobert    standard library function mbstowcs() to an equivalent narrow string
45404b540aSrobert    (6.4.5p5).  However, the behavior of hexadecimal and octal
46404b540aSrobert    \-escapes is at odds with this; they are supposed to be translated
47404b540aSrobert    directly to wchar_t values (6.4.4.4p5,6).
48404b540aSrobert 
49404b540aSrobert    The source character set is not necessarily the character set used
50404b540aSrobert    to encode physical source files on disk; translation phase 1 converts
51404b540aSrobert    from whatever that encoding is to the source character set.
52404b540aSrobert 
53404b540aSrobert    The presence of universal character names in C99 (6.4.3 et seq.)
54404b540aSrobert    forces the source character set to be isomorphic to ISO 10646,
55404b540aSrobert    that is, Unicode.  There is no such constraint on the execution
56404b540aSrobert    character set; note also that the conversion from source to
57404b540aSrobert    execution character set does not occur for identifiers (5.1.1.2p1#5).
58404b540aSrobert 
59404b540aSrobert    For convenience of implementation, the source character set's
60404b540aSrobert    encoding of the basic character set should be identical to the
61404b540aSrobert    execution character set OF THE HOST SYSTEM's encoding of the basic
62404b540aSrobert    character set, and it should not be a state-dependent encoding.
63404b540aSrobert 
64404b540aSrobert    cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
65404b540aSrobert    depending on whether the host is based on ASCII or EBCDIC (see
66404b540aSrobert    respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
67404b540aSrobert    Technical Report #16).  With limited exceptions, it relies on the
68404b540aSrobert    system library's iconv() primitive to do charset conversion
69404b540aSrobert    (specified in SUSv2).  */
70404b540aSrobert 
#if !HAVE_ICONV
/* No system iconv is available.  The calls to iconv(), iconv_open()
   and iconv_close() further down are protected only by ordinary `if'
   statements whose conditions are compile-time constants, so the
   names must still resolve.  These stand-ins always fail with EINVAL
   (or do nothing, for iconv_close) and keep the file linkable.  */
#define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
#define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
#define iconv_close(x)   (void)0
#define ICONV_CONST
#endif
80404b540aSrobert 
/* Choose the source character set name, and the largest byte value
   that can still belong to the basic source character set, from the
   host's basic character set.  */
#if HOST_CHARSET == HOST_CHARSET_ASCII
#define SOURCE_CHARSET "UTF-8"
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
#elif HOST_CHARSET == HOST_CHARSET_EBCDIC
#define SOURCE_CHARSET "UTF-EBCDIC"
#define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
#else
#error "Unrecognized basic host character set"
#endif
90404b540aSrobert 
/* Hosts whose <errno.h> lacks EILSEQ report ill-formed input as
   EINVAL instead.  */
#ifndef EILSEQ
#define EILSEQ EINVAL
#endif
94404b540aSrobert 
95404b540aSrobert /* This structure is used for a resizable string buffer throughout.  */
96404b540aSrobert /* Don't call it strbuf, as that conflicts with unistd.h on systems
97404b540aSrobert    such as DYNIX/ptx where unistd.h includes stropts.h.  */
98404b540aSrobert struct _cpp_strbuf
99404b540aSrobert {
100404b540aSrobert   uchar *text;
101404b540aSrobert   size_t asize;
102404b540aSrobert   size_t len;
103404b540aSrobert };
104404b540aSrobert 
/* Growth increment for output buffers.  An 80-column line whose size
   is quadrupled by conversion (ASCII to UTF-32, say) needs 320
   bytes; the value chosen here is a power of two.  */
#define OUTBUF_BLOCK_SIZE 256
109404b540aSrobert 
110404b540aSrobert /* Conversions between UTF-8 and UTF-16/32 are implemented by custom
111404b540aSrobert    logic.  This is because a depressing number of systems lack iconv,
112404b540aSrobert    or have have iconv libraries that do not do these conversions, so
113404b540aSrobert    we need a fallback implementation for them.  To ensure the fallback
114404b540aSrobert    doesn't break due to neglect, it is used on all systems.
115404b540aSrobert 
116404b540aSrobert    UTF-32 encoding is nice and simple: a four-byte binary number,
117404b540aSrobert    constrained to the range 00000000-7FFFFFFF to avoid questions of
118404b540aSrobert    signedness.  We do have to cope with big- and little-endian
119404b540aSrobert    variants.
120404b540aSrobert 
121404b540aSrobert    UTF-16 encoding uses two-byte binary numbers, again in big- and
122404b540aSrobert    little-endian variants, for all values in the 00000000-0000FFFF
123404b540aSrobert    range.  Values in the 00010000-0010FFFF range are encoded as pairs
124404b540aSrobert    of two-byte numbers, called "surrogate pairs": given a number S in
125404b540aSrobert    this range, it is mapped to a pair (H, L) as follows:
126404b540aSrobert 
127404b540aSrobert      H = (S - 0x10000) / 0x400 + 0xD800
128404b540aSrobert      L = (S - 0x10000) % 0x400 + 0xDC00
129404b540aSrobert 
130404b540aSrobert    Two-byte values in the D800...DFFF range are ill-formed except as a
131404b540aSrobert    component of a surrogate pair.  Even if the encoding within a
132404b540aSrobert    two-byte value is little-endian, the H member of the surrogate pair
133404b540aSrobert    comes first.
134404b540aSrobert 
135404b540aSrobert    There is no way to encode values in the 00110000-7FFFFFFF range,
136404b540aSrobert    which is not currently a problem as there are no assigned code
137404b540aSrobert    points in that range; however, the author expects that it will
138404b540aSrobert    eventually become necessary to abandon UTF-16 due to this
139404b540aSrobert    limitation.  Note also that, because of these pairs, UTF-16 does
140404b540aSrobert    not meet the requirements of the C standard for a wide character
141404b540aSrobert    encoding (see 3.7.3 and 6.4.4.4p11).
142404b540aSrobert 
143404b540aSrobert    UTF-8 encoding looks like this:
144404b540aSrobert 
145404b540aSrobert    value range	       encoded as
146404b540aSrobert    00000000-0000007F   0xxxxxxx
147404b540aSrobert    00000080-000007FF   110xxxxx 10xxxxxx
148404b540aSrobert    00000800-0000FFFF   1110xxxx 10xxxxxx 10xxxxxx
149404b540aSrobert    00010000-001FFFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
150404b540aSrobert    00200000-03FFFFFF   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
151404b540aSrobert    04000000-7FFFFFFF   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
152404b540aSrobert 
153404b540aSrobert    Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
154404b540aSrobert    which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
155404b540aSrobert    never occur.  Note also that any value that can be encoded by a
156404b540aSrobert    given row of the table can also be encoded by all successive rows,
157404b540aSrobert    but this is not done; only the shortest possible encoding for any
158404b540aSrobert    given value is valid.  For instance, the character 07C0 could be
159404b540aSrobert    encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
160404b540aSrobert    FC 80 80 80 9F 80.  Only the first is valid.
161404b540aSrobert 
162404b540aSrobert    An implementation note: the transformation from UTF-16 to UTF-8, or
163404b540aSrobert    vice versa, is easiest done by using UTF-32 as an intermediary.  */
164404b540aSrobert 
165404b540aSrobert /* Internal primitives which go from an UTF-8 byte stream to native-endian
166404b540aSrobert    UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
167404b540aSrobert    operation in several places below.  */
168404b540aSrobert static inline int
one_utf8_to_cppchar(const uchar ** inbufp,size_t * inbytesleftp,cppchar_t * cp)169404b540aSrobert one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
170404b540aSrobert 		     cppchar_t *cp)
171404b540aSrobert {
172404b540aSrobert   static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
173404b540aSrobert   static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
174404b540aSrobert 
175404b540aSrobert   cppchar_t c;
176404b540aSrobert   const uchar *inbuf = *inbufp;
177404b540aSrobert   size_t nbytes, i;
178404b540aSrobert 
179404b540aSrobert   if (*inbytesleftp < 1)
180404b540aSrobert     return EINVAL;
181404b540aSrobert 
182404b540aSrobert   c = *inbuf;
183404b540aSrobert   if (c < 0x80)
184404b540aSrobert     {
185404b540aSrobert       *cp = c;
186404b540aSrobert       *inbytesleftp -= 1;
187404b540aSrobert       *inbufp += 1;
188404b540aSrobert       return 0;
189404b540aSrobert     }
190404b540aSrobert 
191404b540aSrobert   /* The number of leading 1-bits in the first byte indicates how many
192404b540aSrobert      bytes follow.  */
193404b540aSrobert   for (nbytes = 2; nbytes < 7; nbytes++)
194404b540aSrobert     if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
195404b540aSrobert       goto found;
196404b540aSrobert   return EILSEQ;
197404b540aSrobert  found:
198404b540aSrobert 
199404b540aSrobert   if (*inbytesleftp < nbytes)
200404b540aSrobert     return EINVAL;
201404b540aSrobert 
202404b540aSrobert   c = (c & masks[nbytes-1]);
203404b540aSrobert   inbuf++;
204404b540aSrobert   for (i = 1; i < nbytes; i++)
205404b540aSrobert     {
206404b540aSrobert       cppchar_t n = *inbuf++;
207404b540aSrobert       if ((n & 0xC0) != 0x80)
208404b540aSrobert 	return EILSEQ;
209404b540aSrobert       c = ((c << 6) + (n & 0x3F));
210404b540aSrobert     }
211404b540aSrobert 
212404b540aSrobert   /* Make sure the shortest possible encoding was used.  */
213404b540aSrobert   if (c <=      0x7F && nbytes > 1) return EILSEQ;
214404b540aSrobert   if (c <=     0x7FF && nbytes > 2) return EILSEQ;
215404b540aSrobert   if (c <=    0xFFFF && nbytes > 3) return EILSEQ;
216404b540aSrobert   if (c <=  0x1FFFFF && nbytes > 4) return EILSEQ;
217404b540aSrobert   if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
218404b540aSrobert 
219404b540aSrobert   /* Make sure the character is valid.  */
220404b540aSrobert   if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
221404b540aSrobert 
222404b540aSrobert   *cp = c;
223404b540aSrobert   *inbufp = inbuf;
224404b540aSrobert   *inbytesleftp -= nbytes;
225404b540aSrobert   return 0;
226404b540aSrobert }
227404b540aSrobert 
228404b540aSrobert static inline int
one_cppchar_to_utf8(cppchar_t c,uchar ** outbufp,size_t * outbytesleftp)229404b540aSrobert one_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
230404b540aSrobert {
231404b540aSrobert   static const uchar masks[6] =  { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
232404b540aSrobert   static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
233404b540aSrobert   size_t nbytes;
234404b540aSrobert   uchar buf[6], *p = &buf[6];
235404b540aSrobert   uchar *outbuf = *outbufp;
236404b540aSrobert 
237404b540aSrobert   nbytes = 1;
238404b540aSrobert   if (c < 0x80)
239404b540aSrobert     *--p = c;
240404b540aSrobert   else
241404b540aSrobert     {
242404b540aSrobert       do
243404b540aSrobert 	{
244404b540aSrobert 	  *--p = ((c & 0x3F) | 0x80);
245404b540aSrobert 	  c >>= 6;
246404b540aSrobert 	  nbytes++;
247404b540aSrobert 	}
248404b540aSrobert       while (c >= 0x3F || (c & limits[nbytes-1]));
249404b540aSrobert       *--p = (c | masks[nbytes-1]);
250404b540aSrobert     }
251404b540aSrobert 
252404b540aSrobert   if (*outbytesleftp < nbytes)
253404b540aSrobert     return E2BIG;
254404b540aSrobert 
255404b540aSrobert   while (p < &buf[6])
256404b540aSrobert     *outbuf++ = *p++;
257404b540aSrobert   *outbytesleftp -= nbytes;
258404b540aSrobert   *outbufp = outbuf;
259404b540aSrobert   return 0;
260404b540aSrobert }
261404b540aSrobert 
262404b540aSrobert /* The following four functions transform one character between the two
263404b540aSrobert    encodings named in the function name.  All have the signature
264404b540aSrobert    int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
265404b540aSrobert            uchar **outbufp, size_t *outbytesleftp)
266404b540aSrobert 
267404b540aSrobert    BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
268404b540aSrobert    interpreted as a boolean indicating whether big-endian or
269404b540aSrobert    little-endian encoding is to be used for the member of the pair
270404b540aSrobert    that is not UTF-8.
271404b540aSrobert 
272404b540aSrobert    INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
273404b540aSrobert    do for iconv.
274404b540aSrobert 
275404b540aSrobert    The return value is either 0 for success, or an errno value for
276404b540aSrobert    failure, which may be E2BIG (need more space), EILSEQ (ill-formed
277404b540aSrobert    input sequence), ir EINVAL (incomplete input sequence).  */
278404b540aSrobert 
279404b540aSrobert static inline int
one_utf8_to_utf32(iconv_t bigend,const uchar ** inbufp,size_t * inbytesleftp,uchar ** outbufp,size_t * outbytesleftp)280404b540aSrobert one_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
281404b540aSrobert 		   uchar **outbufp, size_t *outbytesleftp)
282404b540aSrobert {
283404b540aSrobert   uchar *outbuf;
284404b540aSrobert   cppchar_t s = 0;
285404b540aSrobert   int rval;
286404b540aSrobert 
287404b540aSrobert   /* Check for space first, since we know exactly how much we need.  */
288404b540aSrobert   if (*outbytesleftp < 4)
289404b540aSrobert     return E2BIG;
290404b540aSrobert 
291404b540aSrobert   rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
292404b540aSrobert   if (rval)
293404b540aSrobert     return rval;
294404b540aSrobert 
295404b540aSrobert   outbuf = *outbufp;
296404b540aSrobert   outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
297404b540aSrobert   outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
298404b540aSrobert   outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
299404b540aSrobert   outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
300404b540aSrobert 
301404b540aSrobert   *outbufp += 4;
302404b540aSrobert   *outbytesleftp -= 4;
303404b540aSrobert   return 0;
304404b540aSrobert }
305404b540aSrobert 
306404b540aSrobert static inline int
one_utf32_to_utf8(iconv_t bigend,const uchar ** inbufp,size_t * inbytesleftp,uchar ** outbufp,size_t * outbytesleftp)307404b540aSrobert one_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
308404b540aSrobert 		   uchar **outbufp, size_t *outbytesleftp)
309404b540aSrobert {
310404b540aSrobert   cppchar_t s;
311404b540aSrobert   int rval;
312404b540aSrobert   const uchar *inbuf;
313404b540aSrobert 
314404b540aSrobert   if (*inbytesleftp < 4)
315404b540aSrobert     return EINVAL;
316404b540aSrobert 
317404b540aSrobert   inbuf = *inbufp;
318404b540aSrobert 
319404b540aSrobert   s  = inbuf[bigend ? 0 : 3] << 24;
320404b540aSrobert   s += inbuf[bigend ? 1 : 2] << 16;
321404b540aSrobert   s += inbuf[bigend ? 2 : 1] << 8;
322404b540aSrobert   s += inbuf[bigend ? 3 : 0];
323404b540aSrobert 
324404b540aSrobert   if (s >= 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
325404b540aSrobert     return EILSEQ;
326404b540aSrobert 
327404b540aSrobert   rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
328404b540aSrobert   if (rval)
329404b540aSrobert     return rval;
330404b540aSrobert 
331404b540aSrobert   *inbufp += 4;
332404b540aSrobert   *inbytesleftp -= 4;
333404b540aSrobert   return 0;
334404b540aSrobert }
335404b540aSrobert 
336404b540aSrobert static inline int
one_utf8_to_utf16(iconv_t bigend,const uchar ** inbufp,size_t * inbytesleftp,uchar ** outbufp,size_t * outbytesleftp)337404b540aSrobert one_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
338404b540aSrobert 		   uchar **outbufp, size_t *outbytesleftp)
339404b540aSrobert {
340404b540aSrobert   int rval;
341404b540aSrobert   cppchar_t s = 0;
342404b540aSrobert   const uchar *save_inbuf = *inbufp;
343404b540aSrobert   size_t save_inbytesleft = *inbytesleftp;
344404b540aSrobert   uchar *outbuf = *outbufp;
345404b540aSrobert 
346404b540aSrobert   rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
347404b540aSrobert   if (rval)
348404b540aSrobert     return rval;
349404b540aSrobert 
350404b540aSrobert   if (s > 0x0010FFFF)
351404b540aSrobert     {
352404b540aSrobert       *inbufp = save_inbuf;
353404b540aSrobert       *inbytesleftp = save_inbytesleft;
354404b540aSrobert       return EILSEQ;
355404b540aSrobert     }
356404b540aSrobert 
357404b540aSrobert   if (s < 0xFFFF)
358404b540aSrobert     {
359404b540aSrobert       if (*outbytesleftp < 2)
360404b540aSrobert 	{
361404b540aSrobert 	  *inbufp = save_inbuf;
362404b540aSrobert 	  *inbytesleftp = save_inbytesleft;
363404b540aSrobert 	  return E2BIG;
364404b540aSrobert 	}
365404b540aSrobert       outbuf[bigend ? 1 : 0] = (s & 0x00FF);
366404b540aSrobert       outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
367404b540aSrobert 
368404b540aSrobert       *outbufp += 2;
369404b540aSrobert       *outbytesleftp -= 2;
370404b540aSrobert       return 0;
371404b540aSrobert     }
372404b540aSrobert   else
373404b540aSrobert     {
374404b540aSrobert       cppchar_t hi, lo;
375404b540aSrobert 
376404b540aSrobert       if (*outbytesleftp < 4)
377404b540aSrobert 	{
378404b540aSrobert 	  *inbufp = save_inbuf;
379404b540aSrobert 	  *inbytesleftp = save_inbytesleft;
380404b540aSrobert 	  return E2BIG;
381404b540aSrobert 	}
382404b540aSrobert 
383404b540aSrobert       hi = (s - 0x10000) / 0x400 + 0xD800;
384404b540aSrobert       lo = (s - 0x10000) % 0x400 + 0xDC00;
385404b540aSrobert 
386404b540aSrobert       /* Even if we are little-endian, put the high surrogate first.
387404b540aSrobert 	 ??? Matches practice?  */
388404b540aSrobert       outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
389404b540aSrobert       outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
390404b540aSrobert       outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
391404b540aSrobert       outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
392404b540aSrobert 
393404b540aSrobert       *outbufp += 4;
394404b540aSrobert       *outbytesleftp -= 4;
395404b540aSrobert       return 0;
396404b540aSrobert     }
397404b540aSrobert }
398404b540aSrobert 
399404b540aSrobert static inline int
one_utf16_to_utf8(iconv_t bigend,const uchar ** inbufp,size_t * inbytesleftp,uchar ** outbufp,size_t * outbytesleftp)400404b540aSrobert one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
401404b540aSrobert 		   uchar **outbufp, size_t *outbytesleftp)
402404b540aSrobert {
403404b540aSrobert   cppchar_t s;
404404b540aSrobert   const uchar *inbuf = *inbufp;
405404b540aSrobert   int rval;
406404b540aSrobert 
407404b540aSrobert   if (*inbytesleftp < 2)
408404b540aSrobert     return EINVAL;
409404b540aSrobert   s  = inbuf[bigend ? 0 : 1] << 8;
410404b540aSrobert   s += inbuf[bigend ? 1 : 0];
411404b540aSrobert 
412404b540aSrobert   /* Low surrogate without immediately preceding high surrogate is invalid.  */
413404b540aSrobert   if (s >= 0xDC00 && s <= 0xDFFF)
414404b540aSrobert     return EILSEQ;
415404b540aSrobert   /* High surrogate must have a following low surrogate.  */
416404b540aSrobert   else if (s >= 0xD800 && s <= 0xDBFF)
417404b540aSrobert     {
418404b540aSrobert       cppchar_t hi = s, lo;
419404b540aSrobert       if (*inbytesleftp < 4)
420404b540aSrobert 	return EINVAL;
421404b540aSrobert 
422404b540aSrobert       lo  = inbuf[bigend ? 2 : 3] << 8;
423404b540aSrobert       lo += inbuf[bigend ? 3 : 2];
424404b540aSrobert 
425404b540aSrobert       if (lo < 0xDC00 || lo > 0xDFFF)
426404b540aSrobert 	return EILSEQ;
427404b540aSrobert 
428404b540aSrobert       s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
429404b540aSrobert     }
430404b540aSrobert 
431404b540aSrobert   rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
432404b540aSrobert   if (rval)
433404b540aSrobert     return rval;
434404b540aSrobert 
435404b540aSrobert   /* Success - update the input pointers (one_cppchar_to_utf8 has done
436404b540aSrobert      the output pointers for us).  */
437404b540aSrobert   if (s <= 0xFFFF)
438404b540aSrobert     {
439404b540aSrobert       *inbufp += 2;
440404b540aSrobert       *inbytesleftp -= 2;
441404b540aSrobert     }
442404b540aSrobert   else
443404b540aSrobert     {
444404b540aSrobert       *inbufp += 4;
445404b540aSrobert       *inbytesleftp -= 4;
446404b540aSrobert     }
447404b540aSrobert   return 0;
448404b540aSrobert }
449404b540aSrobert 
450404b540aSrobert /* Helper routine for the next few functions.  The 'const' on
451404b540aSrobert    one_conversion means that we promise not to modify what function is
452404b540aSrobert    pointed to, which lets the inliner see through it.  */
453404b540aSrobert 
454404b540aSrobert static inline bool
conversion_loop(int (* const one_conversion)(iconv_t,const uchar **,size_t *,uchar **,size_t *),iconv_t cd,const uchar * from,size_t flen,struct _cpp_strbuf * to)455404b540aSrobert conversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
456404b540aSrobert 					     uchar **, size_t *),
457404b540aSrobert 		 iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to)
458404b540aSrobert {
459404b540aSrobert   const uchar *inbuf;
460404b540aSrobert   uchar *outbuf;
461404b540aSrobert   size_t inbytesleft, outbytesleft;
462404b540aSrobert   int rval;
463404b540aSrobert 
464404b540aSrobert   inbuf = from;
465404b540aSrobert   inbytesleft = flen;
466404b540aSrobert   outbuf = to->text + to->len;
467404b540aSrobert   outbytesleft = to->asize - to->len;
468404b540aSrobert 
469404b540aSrobert   for (;;)
470404b540aSrobert     {
471404b540aSrobert       do
472404b540aSrobert 	rval = one_conversion (cd, &inbuf, &inbytesleft,
473404b540aSrobert 			       &outbuf, &outbytesleft);
474404b540aSrobert       while (inbytesleft && !rval);
475404b540aSrobert 
476404b540aSrobert       if (__builtin_expect (inbytesleft == 0, 1))
477404b540aSrobert 	{
478404b540aSrobert 	  to->len = to->asize - outbytesleft;
479404b540aSrobert 	  return true;
480404b540aSrobert 	}
481404b540aSrobert       if (rval != E2BIG)
482404b540aSrobert 	{
483404b540aSrobert 	  errno = rval;
484404b540aSrobert 	  return false;
485404b540aSrobert 	}
486404b540aSrobert 
487404b540aSrobert       outbytesleft += OUTBUF_BLOCK_SIZE;
488404b540aSrobert       to->asize += OUTBUF_BLOCK_SIZE;
489404b540aSrobert       to->text = XRESIZEVEC (uchar, to->text, to->asize);
490404b540aSrobert       outbuf = to->text + to->asize - outbytesleft;
491404b540aSrobert     }
492404b540aSrobert }
493404b540aSrobert 
494404b540aSrobert 
495404b540aSrobert /* These functions convert entire strings between character sets.
496404b540aSrobert    They all have the signature
497404b540aSrobert 
498404b540aSrobert    bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
499404b540aSrobert 
500404b540aSrobert    The input string FROM is converted as specified by the function
501404b540aSrobert    name plus the iconv descriptor CD (which may be fake), and the
502404b540aSrobert    result appended to TO.  On any error, false is returned, otherwise true.  */
503404b540aSrobert 
504404b540aSrobert /* These four use the custom conversion code above.  */
505404b540aSrobert static bool
convert_utf8_utf16(iconv_t cd,const uchar * from,size_t flen,struct _cpp_strbuf * to)506404b540aSrobert convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
507404b540aSrobert 		    struct _cpp_strbuf *to)
508404b540aSrobert {
509404b540aSrobert   return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
510404b540aSrobert }
511404b540aSrobert 
512404b540aSrobert static bool
convert_utf8_utf32(iconv_t cd,const uchar * from,size_t flen,struct _cpp_strbuf * to)513404b540aSrobert convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
514404b540aSrobert 		    struct _cpp_strbuf *to)
515404b540aSrobert {
516404b540aSrobert   return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
517404b540aSrobert }
518404b540aSrobert 
519404b540aSrobert static bool
convert_utf16_utf8(iconv_t cd,const uchar * from,size_t flen,struct _cpp_strbuf * to)520404b540aSrobert convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
521404b540aSrobert 		    struct _cpp_strbuf *to)
522404b540aSrobert {
523404b540aSrobert   return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
524404b540aSrobert }
525404b540aSrobert 
526404b540aSrobert static bool
convert_utf32_utf8(iconv_t cd,const uchar * from,size_t flen,struct _cpp_strbuf * to)527404b540aSrobert convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
528404b540aSrobert 		    struct _cpp_strbuf *to)
529404b540aSrobert {
530404b540aSrobert   return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
531404b540aSrobert }
532404b540aSrobert 
533404b540aSrobert /* Identity conversion, used when we have no alternative.  */
534404b540aSrobert static bool
convert_no_conversion(iconv_t cd ATTRIBUTE_UNUSED,const uchar * from,size_t flen,struct _cpp_strbuf * to)535404b540aSrobert convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
536404b540aSrobert 		       const uchar *from, size_t flen, struct _cpp_strbuf *to)
537404b540aSrobert {
538404b540aSrobert   if (to->len + flen > to->asize)
539404b540aSrobert     {
540404b540aSrobert       to->asize = to->len + flen;
541404b540aSrobert       to->text = XRESIZEVEC (uchar, to->text, to->asize);
542404b540aSrobert     }
543404b540aSrobert   memcpy (to->text + to->len, from, flen);
544404b540aSrobert   to->len += flen;
545404b540aSrobert   return true;
546404b540aSrobert }
547404b540aSrobert 
548404b540aSrobert /* And this one uses the system iconv primitive.  It's a little
549404b540aSrobert    different, since iconv's interface is a little different.  */
550404b540aSrobert #if HAVE_ICONV
551404b540aSrobert static bool
convert_using_iconv(iconv_t cd,const uchar * from,size_t flen,struct _cpp_strbuf * to)552404b540aSrobert convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
553404b540aSrobert 		     struct _cpp_strbuf *to)
554404b540aSrobert {
555404b540aSrobert   ICONV_CONST char *inbuf;
556404b540aSrobert   char *outbuf;
557404b540aSrobert   size_t inbytesleft, outbytesleft;
558404b540aSrobert 
559404b540aSrobert   /* Reset conversion descriptor and check that it is valid.  */
560404b540aSrobert   if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
561404b540aSrobert     return false;
562404b540aSrobert 
563404b540aSrobert   inbuf = (ICONV_CONST char *)from;
564404b540aSrobert   inbytesleft = flen;
565404b540aSrobert   outbuf = (char *)to->text + to->len;
566404b540aSrobert   outbytesleft = to->asize - to->len;
567404b540aSrobert 
568404b540aSrobert   for (;;)
569404b540aSrobert     {
570404b540aSrobert       iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
571404b540aSrobert       if (__builtin_expect (inbytesleft == 0, 1))
572404b540aSrobert 	{
573404b540aSrobert 	  to->len = to->asize - outbytesleft;
574404b540aSrobert 	  return true;
575404b540aSrobert 	}
576404b540aSrobert       if (errno != E2BIG)
577404b540aSrobert 	return false;
578404b540aSrobert 
579404b540aSrobert       outbytesleft += OUTBUF_BLOCK_SIZE;
580404b540aSrobert       to->asize += OUTBUF_BLOCK_SIZE;
581404b540aSrobert       to->text = XRESIZEVEC (uchar, to->text, to->asize);
582404b540aSrobert       outbuf = (char *)to->text + to->asize - outbytesleft;
583404b540aSrobert     }
584404b540aSrobert }
585404b540aSrobert #else
586404b540aSrobert #define convert_using_iconv 0 /* prevent undefined symbol error below */
587404b540aSrobert #endif
588404b540aSrobert 
589404b540aSrobert /* Arrange for the above custom conversion logic to be used automatically
590404b540aSrobert    when conversion between a suitable pair of character sets is requested.  */
591404b540aSrobert 
/* Run the converter described by CONVERTER (an object with `func' and
   `cd' members) on the FLEN bytes at FROM, appending to TO; the value
   is whatever CONVERTER's func returns.  Every argument and the whole
   expansion are parenthesized so that expressions such as `*ptr' can
   be passed as CONVERTER.  CONVERTER is evaluated twice.  */
#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
   ((CONVERTER).func ((CONVERTER).cd, (FROM), (FLEN), (TO)))
594404b540aSrobert 
595404b540aSrobert struct conversion
596404b540aSrobert {
597404b540aSrobert   const char *pair;
598404b540aSrobert   convert_f func;
599404b540aSrobert   iconv_t fake_cd;
600404b540aSrobert };
601404b540aSrobert static const struct conversion conversion_tab[] = {
602404b540aSrobert   { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
603404b540aSrobert   { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
604404b540aSrobert   { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
605404b540aSrobert   { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
606404b540aSrobert   { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
607404b540aSrobert   { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
608404b540aSrobert   { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
609404b540aSrobert   { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
610404b540aSrobert };
611404b540aSrobert 
612404b540aSrobert /* Subroutine of cpp_init_iconv: initialize and return a
613404b540aSrobert    cset_converter structure for conversion from FROM to TO.  If
614404b540aSrobert    iconv_open() fails, issue an error and return an identity
615404b540aSrobert    converter.  Silently return an identity converter if FROM and TO
616404b540aSrobert    are identical.  */
617404b540aSrobert static struct cset_converter
init_iconv_desc(cpp_reader * pfile,const char * to,const char * from)618404b540aSrobert init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
619404b540aSrobert {
620404b540aSrobert   struct cset_converter ret;
621404b540aSrobert   char *pair;
622404b540aSrobert   size_t i;
623404b540aSrobert 
624404b540aSrobert   if (!strcasecmp (to, from))
625404b540aSrobert     {
626404b540aSrobert       ret.func = convert_no_conversion;
627404b540aSrobert       ret.cd = (iconv_t) -1;
628404b540aSrobert       return ret;
629404b540aSrobert     }
630404b540aSrobert 
631404b540aSrobert   pair = (char *) alloca(strlen(to) + strlen(from) + 2);
632404b540aSrobert 
633404b540aSrobert   strcpy(pair, from);
634404b540aSrobert   strcat(pair, "/");
635404b540aSrobert   strcat(pair, to);
636404b540aSrobert   for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
637404b540aSrobert     if (!strcasecmp (pair, conversion_tab[i].pair))
638404b540aSrobert       {
639404b540aSrobert 	ret.func = conversion_tab[i].func;
640404b540aSrobert 	ret.cd = conversion_tab[i].fake_cd;
641404b540aSrobert 	return ret;
642404b540aSrobert       }
643404b540aSrobert 
644404b540aSrobert   /* No custom converter - try iconv.  */
645404b540aSrobert   if (HAVE_ICONV)
646404b540aSrobert     {
647404b540aSrobert       ret.func = convert_using_iconv;
648404b540aSrobert       ret.cd = iconv_open (to, from);
649404b540aSrobert 
650404b540aSrobert       if (ret.cd == (iconv_t) -1)
651404b540aSrobert 	{
652404b540aSrobert 	  if (errno == EINVAL)
653404b540aSrobert 	    cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
654404b540aSrobert 		       "conversion from %s to %s not supported by iconv",
655404b540aSrobert 		       from, to);
656404b540aSrobert 	  else
657404b540aSrobert 	    cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
658404b540aSrobert 
659404b540aSrobert 	  ret.func = convert_no_conversion;
660404b540aSrobert 	}
661404b540aSrobert     }
662404b540aSrobert   else
663404b540aSrobert     {
664404b540aSrobert       cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
665404b540aSrobert 		 "no iconv implementation, cannot convert from %s to %s",
666404b540aSrobert 		 from, to);
667404b540aSrobert       ret.func = convert_no_conversion;
668404b540aSrobert       ret.cd = (iconv_t) -1;
669404b540aSrobert     }
670404b540aSrobert   return ret;
671404b540aSrobert }
672404b540aSrobert 
673404b540aSrobert /* If charset conversion is requested, initialize iconv(3) descriptors
674404b540aSrobert    for conversion from the source character set to the execution
675404b540aSrobert    character sets.  If iconv is not present in the C library, and
676404b540aSrobert    conversion is requested, issue an error.  */
677404b540aSrobert 
678404b540aSrobert void
cpp_init_iconv(cpp_reader * pfile)679404b540aSrobert cpp_init_iconv (cpp_reader *pfile)
680404b540aSrobert {
681404b540aSrobert   const char *ncset = CPP_OPTION (pfile, narrow_charset);
682404b540aSrobert   const char *wcset = CPP_OPTION (pfile, wide_charset);
683404b540aSrobert   const char *default_wcset;
684404b540aSrobert 
685404b540aSrobert   bool be = CPP_OPTION (pfile, bytes_big_endian);
686404b540aSrobert 
687404b540aSrobert   if (CPP_OPTION (pfile, wchar_precision) >= 32)
688404b540aSrobert     default_wcset = be ? "UTF-32BE" : "UTF-32LE";
689404b540aSrobert   else if (CPP_OPTION (pfile, wchar_precision) >= 16)
690404b540aSrobert     default_wcset = be ? "UTF-16BE" : "UTF-16LE";
691404b540aSrobert   else
692404b540aSrobert     /* This effectively means that wide strings are not supported,
693404b540aSrobert        so don't do any conversion at all.  */
694404b540aSrobert    default_wcset = SOURCE_CHARSET;
695404b540aSrobert 
696404b540aSrobert   if (!ncset)
697404b540aSrobert     ncset = SOURCE_CHARSET;
698404b540aSrobert   if (!wcset)
699404b540aSrobert     wcset = default_wcset;
700404b540aSrobert 
701404b540aSrobert   pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
702404b540aSrobert   pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
703404b540aSrobert }
704404b540aSrobert 
705404b540aSrobert /* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary.  */
706404b540aSrobert void
_cpp_destroy_iconv(cpp_reader * pfile)707404b540aSrobert _cpp_destroy_iconv (cpp_reader *pfile)
708404b540aSrobert {
709404b540aSrobert   if (HAVE_ICONV)
710404b540aSrobert     {
711404b540aSrobert       if (pfile->narrow_cset_desc.func == convert_using_iconv)
712404b540aSrobert 	iconv_close (pfile->narrow_cset_desc.cd);
713404b540aSrobert       if (pfile->wide_cset_desc.func == convert_using_iconv)
714404b540aSrobert 	iconv_close (pfile->wide_cset_desc.cd);
715404b540aSrobert     }
716404b540aSrobert }
717404b540aSrobert 
718404b540aSrobert /* Utility routine for use by a full compiler.  C is a character taken
719404b540aSrobert    from the *basic* source character set, encoded in the host's
720404b540aSrobert    execution encoding.  Convert it to (the target's) execution
721404b540aSrobert    encoding, and return that value.
722404b540aSrobert 
723404b540aSrobert    Issues an internal error if C's representation in the narrow
724404b540aSrobert    execution character set fails to be a single-byte value (C99
725404b540aSrobert    5.2.1p3: "The representation of each member of the source and
726404b540aSrobert    execution character sets shall fit in a byte.")  May also issue an
727404b540aSrobert    internal error if C fails to be a member of the basic source
728404b540aSrobert    character set (testing this exactly is too hard, especially when
729404b540aSrobert    the host character set is EBCDIC).  */
730404b540aSrobert cppchar_t
cpp_host_to_exec_charset(cpp_reader * pfile,cppchar_t c)731404b540aSrobert cpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
732404b540aSrobert {
733404b540aSrobert   uchar sbuf[1];
734404b540aSrobert   struct _cpp_strbuf tbuf;
735404b540aSrobert 
736404b540aSrobert   /* This test is merely an approximation, but it suffices to catch
737404b540aSrobert      the most important thing, which is that we don't get handed a
738404b540aSrobert      character outside the unibyte range of the host character set.  */
739404b540aSrobert   if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
740404b540aSrobert     {
741404b540aSrobert       cpp_error (pfile, CPP_DL_ICE,
742404b540aSrobert 		 "character 0x%lx is not in the basic source character set\n",
743404b540aSrobert 		 (unsigned long)c);
744404b540aSrobert       return 0;
745404b540aSrobert     }
746404b540aSrobert 
747404b540aSrobert   /* Being a character in the unibyte range of the host character set,
748404b540aSrobert      we can safely splat it into a one-byte buffer and trust that that
749404b540aSrobert      is a well-formed string.  */
750404b540aSrobert   sbuf[0] = c;
751404b540aSrobert 
752404b540aSrobert   /* This should never need to reallocate, but just in case... */
753404b540aSrobert   tbuf.asize = 1;
754404b540aSrobert   tbuf.text = XNEWVEC (uchar, tbuf.asize);
755404b540aSrobert   tbuf.len = 0;
756404b540aSrobert 
757404b540aSrobert   if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
758404b540aSrobert     {
759404b540aSrobert       cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
760404b540aSrobert       return 0;
761404b540aSrobert     }
762404b540aSrobert   if (tbuf.len != 1)
763404b540aSrobert     {
764404b540aSrobert       cpp_error (pfile, CPP_DL_ICE,
765404b540aSrobert 		 "character 0x%lx is not unibyte in execution character set",
766404b540aSrobert 		 (unsigned long)c);
767404b540aSrobert       return 0;
768404b540aSrobert     }
769404b540aSrobert   c = tbuf.text[0];
770404b540aSrobert   free(tbuf.text);
771404b540aSrobert   return c;
772404b540aSrobert }
773404b540aSrobert 
774404b540aSrobert 
775404b540aSrobert 
776404b540aSrobert /* Utility routine that computes a mask of the form 0000...111... with
777404b540aSrobert    WIDTH 1-bits.  */
778404b540aSrobert static inline size_t
width_to_mask(size_t width)779404b540aSrobert width_to_mask (size_t width)
780404b540aSrobert {
781404b540aSrobert   width = MIN (width, BITS_PER_CPPCHAR_T);
782404b540aSrobert   if (width >= CHAR_BIT * sizeof (size_t))
783404b540aSrobert     return ~(size_t) 0;
784404b540aSrobert   else
785404b540aSrobert     return ((size_t) 1 << width) - 1;
786404b540aSrobert }
787404b540aSrobert 
788404b540aSrobert /* A large table of unicode character information.  */
/* Flag bits stored in the flags field of each ucnranges entry.  */
enum {
  C99 = 1 << 0,	/* Valid in a C99 identifier?  */
  DIG = 1 << 1,	/* Valid in a C99 identifier, but not as the first
		   character?  */
  CXX = 1 << 2,	/* Valid in a C++ identifier?  */
  CID = 1 << 3,	/* NFC representation is not valid in an identifier?  */
  NFC = 1 << 4,	/* Might be valid NFC form?  */
  NKC = 1 << 5,	/* Might be valid NFKC form?  */
  CTX = 1 << 6	/* Certain preceding characters might make it not
		   valid NFC/NFKC form?  */
};
805404b540aSrobert 
806404b540aSrobert static const struct {
807404b540aSrobert   /* Bitmap of flags above.  */
808404b540aSrobert   unsigned char flags;
809404b540aSrobert   /* Combining class of the character.  */
810404b540aSrobert   unsigned char combine;
811404b540aSrobert   /* Last character in the range described by this entry.  */
812404b540aSrobert   unsigned short end;
813404b540aSrobert } ucnranges[] = {
814404b540aSrobert #include "ucnid.h"
815404b540aSrobert };
816404b540aSrobert 
817404b540aSrobert /* Returns 1 if C is valid in an identifier, 2 if C is valid except at
818404b540aSrobert    the start of an identifier, and 0 if C is not valid in an
819404b540aSrobert    identifier.  We assume C has already gone through the checks of
820404b540aSrobert    _cpp_valid_ucn.  Also update NST for C if returning nonzero.  The
821404b540aSrobert    algorithm is a simple binary search on the table defined in
822404b540aSrobert    ucnid.h.  */
823404b540aSrobert 
824404b540aSrobert static int
ucn_valid_in_identifier(cpp_reader * pfile,cppchar_t c,struct normalize_state * nst)825404b540aSrobert ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
826404b540aSrobert 			 struct normalize_state *nst)
827404b540aSrobert {
828404b540aSrobert   int mn, mx, md;
829404b540aSrobert 
830404b540aSrobert   if (c > 0xFFFF)
831404b540aSrobert     return 0;
832404b540aSrobert 
833404b540aSrobert   mn = 0;
834404b540aSrobert   mx = ARRAY_SIZE (ucnranges) - 1;
835404b540aSrobert   while (mx != mn)
836404b540aSrobert     {
837404b540aSrobert       md = (mn + mx) / 2;
838404b540aSrobert       if (c <= ucnranges[md].end)
839404b540aSrobert 	mx = md;
840404b540aSrobert       else
841404b540aSrobert 	mn = md + 1;
842404b540aSrobert     }
843404b540aSrobert 
844404b540aSrobert   /* When -pedantic, we require the character to have been listed by
845404b540aSrobert      the standard for the current language.  Otherwise, we accept the
846404b540aSrobert      union of the acceptable sets for C++98 and C99.  */
847404b540aSrobert   if (! (ucnranges[mn].flags & (C99 | CXX)))
848404b540aSrobert       return 0;
849404b540aSrobert 
850404b540aSrobert   if (CPP_PEDANTIC (pfile)
851404b540aSrobert       && ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99))
852404b540aSrobert 	  || (CPP_OPTION (pfile, cplusplus)
853404b540aSrobert 	      && !(ucnranges[mn].flags & CXX))))
854404b540aSrobert     return 0;
855404b540aSrobert 
856404b540aSrobert   /* Update NST.  */
857404b540aSrobert   if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
858404b540aSrobert     nst->level = normalized_none;
859404b540aSrobert   else if (ucnranges[mn].flags & CTX)
860404b540aSrobert     {
861404b540aSrobert       bool safe;
862404b540aSrobert       cppchar_t p = nst->previous;
863404b540aSrobert 
864404b540aSrobert       /* Easy cases from Bengali, Oriya, Tamil, Jannada, and Malayalam.  */
865404b540aSrobert       if (c == 0x09BE)
866404b540aSrobert 	safe = p != 0x09C7;  /* Use 09CB instead of 09C7 09BE.  */
867404b540aSrobert       else if (c == 0x0B3E)
868404b540aSrobert 	safe = p != 0x0B47;  /* Use 0B4B instead of 0B47 0B3E.  */
869404b540aSrobert       else if (c == 0x0BBE)
870404b540aSrobert 	safe = p != 0x0BC6 && p != 0x0BC7;  /* Use 0BCA/0BCB instead.  */
871404b540aSrobert       else if (c == 0x0CC2)
872404b540aSrobert 	safe = p != 0x0CC6;  /* Use 0CCA instead of 0CC6 0CC2.  */
873404b540aSrobert       else if (c == 0x0D3E)
874404b540aSrobert 	safe = p != 0x0D46 && p != 0x0D47;  /* Use 0D4A/0D4B instead.  */
875404b540aSrobert       /* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
876404b540aSrobert 	 and are combined algorithmically from a sequence of the form
877404b540aSrobert 	 1100-1112 1161-1175 11A8-11C2
878404b540aSrobert 	 (if the third is not present, it is treated as 11A7, which is not
879404b540aSrobert 	 really a valid character).
880404b540aSrobert 	 Unfortunately, C99 allows (only) the NFC form, but C++ allows
881404b540aSrobert 	 only the combining characters.  */
882404b540aSrobert       else if (c >= 0x1161 && c <= 0x1175)
883404b540aSrobert 	safe = p < 0x1100 || p > 0x1112;
884404b540aSrobert       else if (c >= 0x11A8 && c <= 0x11C2)
885404b540aSrobert 	safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0);
886404b540aSrobert       else
887404b540aSrobert 	{
888404b540aSrobert 	  /* Uh-oh, someone updated ucnid.h without updating this code.  */
889404b540aSrobert 	  cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c);
890404b540aSrobert 	  safe = true;
891404b540aSrobert 	}
892404b540aSrobert       if (!safe && c < 0x1161)
893404b540aSrobert 	nst->level = normalized_none;
894404b540aSrobert       else if (!safe)
895404b540aSrobert 	nst->level = MAX (nst->level, normalized_identifier_C);
896404b540aSrobert     }
897404b540aSrobert   else if (ucnranges[mn].flags & NKC)
898404b540aSrobert     ;
899404b540aSrobert   else if (ucnranges[mn].flags & NFC)
900404b540aSrobert     nst->level = MAX (nst->level, normalized_C);
901404b540aSrobert   else if (ucnranges[mn].flags & CID)
902404b540aSrobert     nst->level = MAX (nst->level, normalized_identifier_C);
903404b540aSrobert   else
904404b540aSrobert     nst->level = normalized_none;
905404b540aSrobert   nst->previous = c;
906404b540aSrobert   nst->prev_class = ucnranges[mn].combine;
907404b540aSrobert 
908404b540aSrobert   /* In C99, UCN digits may not begin identifiers.  */
909404b540aSrobert   if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG))
910404b540aSrobert     return 2;
911404b540aSrobert 
912404b540aSrobert   return 1;
913404b540aSrobert }
914404b540aSrobert 
915404b540aSrobert /* [lex.charset]: The character designated by the universal character
916404b540aSrobert    name \UNNNNNNNN is that character whose character short name in
917404b540aSrobert    ISO/IEC 10646 is NNNNNNNN; the character designated by the
918404b540aSrobert    universal character name \uNNNN is that character whose character
919404b540aSrobert    short name in ISO/IEC 10646 is 0000NNNN.  If the hexadecimal value
920404b540aSrobert    for a universal character name is less than 0x20 or in the range
921404b540aSrobert    0x7F-0x9F (inclusive), or if the universal character name
922404b540aSrobert    designates a character in the basic source character set, then the
923404b540aSrobert    program is ill-formed.
924404b540aSrobert 
925404b540aSrobert    *PSTR must be preceded by "\u" or "\U"; it is assumed that the
926404b540aSrobert    buffer end is delimited by a non-hex digit.  Returns zero if the
927404b540aSrobert    UCN has not been consumed.
928404b540aSrobert 
929404b540aSrobert    Otherwise the nonzero value of the UCN, whether valid or invalid,
930404b540aSrobert    is returned.  Diagnostics are emitted for invalid values.  PSTR
931404b540aSrobert    is updated to point one beyond the UCN, or to the syntactically
932404b540aSrobert    invalid character.
933404b540aSrobert 
934404b540aSrobert    IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
935404b540aSrobert    an identifier, or 2 otherwise.  */
936404b540aSrobert 
937404b540aSrobert cppchar_t
_cpp_valid_ucn(cpp_reader * pfile,const uchar ** pstr,const uchar * limit,int identifier_pos,struct normalize_state * nst)938404b540aSrobert _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
939404b540aSrobert 		const uchar *limit, int identifier_pos,
940404b540aSrobert 		struct normalize_state *nst)
941404b540aSrobert {
942404b540aSrobert   cppchar_t result, c;
943404b540aSrobert   unsigned int length;
944404b540aSrobert   const uchar *str = *pstr;
945404b540aSrobert   const uchar *base = str - 2;
946404b540aSrobert 
947404b540aSrobert   if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
948404b540aSrobert     cpp_error (pfile, CPP_DL_WARNING,
949404b540aSrobert 	       "universal character names are only valid in C++ and C99");
950404b540aSrobert   else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
951404b540aSrobert     cpp_error (pfile, CPP_DL_WARNING,
952404b540aSrobert 	       "the meaning of '\\%c' is different in traditional C",
953404b540aSrobert 	       (int) str[-1]);
954404b540aSrobert 
955404b540aSrobert   if (str[-1] == 'u')
956404b540aSrobert     length = 4;
957404b540aSrobert   else if (str[-1] == 'U')
958404b540aSrobert     length = 8;
959404b540aSrobert   else
960404b540aSrobert     {
961404b540aSrobert       cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN");
962404b540aSrobert       length = 4;
963404b540aSrobert     }
964404b540aSrobert 
965404b540aSrobert   result = 0;
966404b540aSrobert   do
967404b540aSrobert     {
968404b540aSrobert       c = *str;
969404b540aSrobert       if (!ISXDIGIT (c))
970404b540aSrobert 	break;
971404b540aSrobert       str++;
972404b540aSrobert       result = (result << 4) + hex_value (c);
973404b540aSrobert     }
974404b540aSrobert   while (--length && str < limit);
975404b540aSrobert 
976404b540aSrobert   /* Partial UCNs are not valid in strings, but decompose into
977404b540aSrobert      multiple tokens in identifiers, so we can't give a helpful
978404b540aSrobert      error message in that case.  */
979404b540aSrobert   if (length && identifier_pos)
980404b540aSrobert     return 0;
981404b540aSrobert 
982404b540aSrobert   *pstr = str;
983404b540aSrobert   if (length)
984404b540aSrobert     {
985404b540aSrobert       cpp_error (pfile, CPP_DL_ERROR,
986404b540aSrobert 		 "incomplete universal character name %.*s",
987404b540aSrobert 		 (int) (str - base), base);
988404b540aSrobert       result = 1;
989404b540aSrobert     }
990404b540aSrobert   /* The standard permits $, @ and ` to be specified as UCNs.  We use
991404b540aSrobert      hex escapes so that this also works with EBCDIC hosts.  */
992404b540aSrobert   else if ((result < 0xa0
993404b540aSrobert 	    && (result != 0x24 && result != 0x40 && result != 0x60))
994404b540aSrobert 	   || (result & 0x80000000)
995404b540aSrobert 	   || (result >= 0xD800 && result <= 0xDFFF))
996404b540aSrobert     {
997404b540aSrobert       cpp_error (pfile, CPP_DL_ERROR,
998404b540aSrobert 		 "%.*s is not a valid universal character",
999404b540aSrobert 		 (int) (str - base), base);
1000404b540aSrobert       result = 1;
1001404b540aSrobert     }
1002404b540aSrobert   else if (identifier_pos && result == 0x24
1003404b540aSrobert 	   && CPP_OPTION (pfile, dollars_in_ident))
1004404b540aSrobert     {
1005404b540aSrobert       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1006404b540aSrobert 	{
1007404b540aSrobert 	  CPP_OPTION (pfile, warn_dollars) = 0;
1008404b540aSrobert 	  cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1009404b540aSrobert 	}
1010404b540aSrobert       NORMALIZE_STATE_UPDATE_IDNUM (nst);
1011404b540aSrobert     }
1012404b540aSrobert   else if (identifier_pos)
1013404b540aSrobert     {
1014404b540aSrobert       int validity = ucn_valid_in_identifier (pfile, result, nst);
1015404b540aSrobert 
1016404b540aSrobert       if (validity == 0)
1017404b540aSrobert 	cpp_error (pfile, CPP_DL_ERROR,
1018404b540aSrobert 		   "universal character %.*s is not valid in an identifier",
1019404b540aSrobert 		   (int) (str - base), base);
1020404b540aSrobert       else if (validity == 2 && identifier_pos == 1)
1021404b540aSrobert 	cpp_error (pfile, CPP_DL_ERROR,
1022404b540aSrobert    "universal character %.*s is not valid at the start of an identifier",
1023404b540aSrobert 		   (int) (str - base), base);
1024404b540aSrobert     }
1025404b540aSrobert 
1026404b540aSrobert   if (result == 0)
1027404b540aSrobert     result = 1;
1028404b540aSrobert 
1029404b540aSrobert   return result;
1030404b540aSrobert }
1031404b540aSrobert 
1032404b540aSrobert /* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
1033404b540aSrobert    it to the execution character set and write the result into TBUF.
1034404b540aSrobert    An advanced pointer is returned.  Issues all relevant diagnostics.  */
1035404b540aSrobert static const uchar *
convert_ucn(cpp_reader * pfile,const uchar * from,const uchar * limit,struct _cpp_strbuf * tbuf,bool wide)1036404b540aSrobert convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
1037404b540aSrobert 	     struct _cpp_strbuf *tbuf, bool wide)
1038404b540aSrobert {
1039404b540aSrobert   cppchar_t ucn;
1040404b540aSrobert   uchar buf[6];
1041404b540aSrobert   uchar *bufp = buf;
1042404b540aSrobert   size_t bytesleft = 6;
1043404b540aSrobert   int rval;
1044404b540aSrobert   struct cset_converter cvt
1045404b540aSrobert     = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1046404b540aSrobert   struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1047404b540aSrobert 
1048404b540aSrobert   from++;  /* Skip u/U.  */
1049404b540aSrobert   ucn = _cpp_valid_ucn (pfile, &from, limit, 0, &nst);
1050404b540aSrobert 
1051404b540aSrobert   rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
1052404b540aSrobert   if (rval)
1053404b540aSrobert     {
1054404b540aSrobert       errno = rval;
1055404b540aSrobert       cpp_errno (pfile, CPP_DL_ERROR,
1056404b540aSrobert 		 "converting UCN to source character set");
1057404b540aSrobert     }
1058404b540aSrobert   else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
1059404b540aSrobert     cpp_errno (pfile, CPP_DL_ERROR,
1060404b540aSrobert 	       "converting UCN to execution character set");
1061404b540aSrobert 
1062404b540aSrobert   return from;
1063404b540aSrobert }
1064404b540aSrobert 
1065404b540aSrobert /* Subroutine of convert_hex and convert_oct.  N is the representation
1066404b540aSrobert    in the execution character set of a numeric escape; write it into the
1067404b540aSrobert    string buffer TBUF and update the end-of-string pointer therein.  WIDE
1068404b540aSrobert    is true if it's a wide string that's being assembled in TBUF.  This
1069404b540aSrobert    function issues no diagnostics and never fails.  */
1070404b540aSrobert static void
emit_numeric_escape(cpp_reader * pfile,cppchar_t n,struct _cpp_strbuf * tbuf,bool wide)1071404b540aSrobert emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
1072404b540aSrobert 		     struct _cpp_strbuf *tbuf, bool wide)
1073404b540aSrobert {
1074404b540aSrobert   if (wide)
1075404b540aSrobert     {
1076404b540aSrobert       /* We have to render this into the target byte order, which may not
1077404b540aSrobert 	 be our byte order.  */
1078404b540aSrobert       bool bigend = CPP_OPTION (pfile, bytes_big_endian);
1079404b540aSrobert       size_t width = CPP_OPTION (pfile, wchar_precision);
1080404b540aSrobert       size_t cwidth = CPP_OPTION (pfile, char_precision);
1081404b540aSrobert       size_t cmask = width_to_mask (cwidth);
1082404b540aSrobert       size_t nbwc = width / cwidth;
1083404b540aSrobert       size_t i;
1084404b540aSrobert       size_t off = tbuf->len;
1085404b540aSrobert       cppchar_t c;
1086404b540aSrobert 
1087404b540aSrobert       if (tbuf->len + nbwc > tbuf->asize)
1088404b540aSrobert 	{
1089404b540aSrobert 	  tbuf->asize += OUTBUF_BLOCK_SIZE;
1090404b540aSrobert 	  tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
1091404b540aSrobert 	}
1092404b540aSrobert 
1093404b540aSrobert       for (i = 0; i < nbwc; i++)
1094404b540aSrobert 	{
1095404b540aSrobert 	  c = n & cmask;
1096404b540aSrobert 	  n >>= cwidth;
1097404b540aSrobert 	  tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
1098404b540aSrobert 	}
1099404b540aSrobert       tbuf->len += nbwc;
1100404b540aSrobert     }
1101404b540aSrobert   else
1102404b540aSrobert     {
1103404b540aSrobert       /* Note: this code does not handle the case where the target
1104404b540aSrobert 	 and host have a different number of bits in a byte.  */
1105404b540aSrobert       if (tbuf->len + 1 > tbuf->asize)
1106404b540aSrobert 	{
1107404b540aSrobert 	  tbuf->asize += OUTBUF_BLOCK_SIZE;
1108404b540aSrobert 	  tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
1109404b540aSrobert 	}
1110404b540aSrobert       tbuf->text[tbuf->len++] = n;
1111404b540aSrobert     }
1112404b540aSrobert }
1113404b540aSrobert 
1114404b540aSrobert /* Convert a hexadecimal escape, pointed to by FROM, to the execution
1115404b540aSrobert    character set and write it into the string buffer TBUF.  Returns an
1116404b540aSrobert    advanced pointer, and issues diagnostics as necessary.
1117404b540aSrobert    No character set translation occurs; this routine always produces the
1118404b540aSrobert    execution-set character with numeric value equal to the given hex
1119404b540aSrobert    number.  You can, e.g. generate surrogate pairs this way.  */
1120404b540aSrobert static const uchar *
convert_hex(cpp_reader * pfile,const uchar * from,const uchar * limit,struct _cpp_strbuf * tbuf,bool wide)1121404b540aSrobert convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
1122404b540aSrobert 	     struct _cpp_strbuf *tbuf, bool wide)
1123404b540aSrobert {
1124404b540aSrobert   cppchar_t c, n = 0, overflow = 0;
1125404b540aSrobert   int digits_found = 0;
1126404b540aSrobert   size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
1127404b540aSrobert 		  : CPP_OPTION (pfile, char_precision));
1128404b540aSrobert   size_t mask = width_to_mask (width);
1129404b540aSrobert 
1130404b540aSrobert   if (CPP_WTRADITIONAL (pfile))
1131404b540aSrobert     cpp_error (pfile, CPP_DL_WARNING,
1132404b540aSrobert 	       "the meaning of '\\x' is different in traditional C");
1133404b540aSrobert 
1134404b540aSrobert   from++;  /* Skip 'x'.  */
1135404b540aSrobert   while (from < limit)
1136404b540aSrobert     {
1137404b540aSrobert       c = *from;
1138404b540aSrobert       if (! hex_p (c))
1139404b540aSrobert 	break;
1140404b540aSrobert       from++;
1141404b540aSrobert       overflow |= n ^ (n << 4 >> 4);
1142404b540aSrobert       n = (n << 4) + hex_value (c);
1143404b540aSrobert       digits_found = 1;
1144404b540aSrobert     }
1145404b540aSrobert 
1146404b540aSrobert   if (!digits_found)
1147404b540aSrobert     {
1148404b540aSrobert       cpp_error (pfile, CPP_DL_ERROR,
1149404b540aSrobert 		 "\\x used with no following hex digits");
1150404b540aSrobert       return from;
1151404b540aSrobert     }
1152404b540aSrobert 
1153404b540aSrobert   if (overflow | (n != (n & mask)))
1154404b540aSrobert     {
1155404b540aSrobert       cpp_error (pfile, CPP_DL_PEDWARN,
1156404b540aSrobert 		 "hex escape sequence out of range");
1157404b540aSrobert       n &= mask;
1158404b540aSrobert     }
1159404b540aSrobert 
1160404b540aSrobert   emit_numeric_escape (pfile, n, tbuf, wide);
1161404b540aSrobert 
1162404b540aSrobert   return from;
1163404b540aSrobert }
1164404b540aSrobert 
1165404b540aSrobert /* Convert an octal escape, pointed to by FROM, to the execution
1166404b540aSrobert    character set and write it into the string buffer TBUF.  Returns an
1167404b540aSrobert    advanced pointer, and issues diagnostics as necessary.
1168404b540aSrobert    No character set translation occurs; this routine always produces the
1169404b540aSrobert    execution-set character with numeric value equal to the given octal
1170404b540aSrobert    number.  */
1171404b540aSrobert static const uchar *
convert_oct(cpp_reader * pfile,const uchar * from,const uchar * limit,struct _cpp_strbuf * tbuf,bool wide)1172404b540aSrobert convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
1173404b540aSrobert 	     struct _cpp_strbuf *tbuf, bool wide)
1174404b540aSrobert {
1175404b540aSrobert   size_t count = 0;
1176404b540aSrobert   cppchar_t c, n = 0;
1177404b540aSrobert   size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
1178404b540aSrobert 		  : CPP_OPTION (pfile, char_precision));
1179404b540aSrobert   size_t mask = width_to_mask (width);
1180404b540aSrobert   bool overflow = false;
1181404b540aSrobert 
1182404b540aSrobert   while (from < limit && count++ < 3)
1183404b540aSrobert     {
1184404b540aSrobert       c = *from;
1185404b540aSrobert       if (c < '0' || c > '7')
1186404b540aSrobert 	break;
1187404b540aSrobert       from++;
1188404b540aSrobert       overflow |= n ^ (n << 3 >> 3);
1189404b540aSrobert       n = (n << 3) + c - '0';
1190404b540aSrobert     }
1191404b540aSrobert 
1192404b540aSrobert   if (n != (n & mask))
1193404b540aSrobert     {
1194404b540aSrobert       cpp_error (pfile, CPP_DL_PEDWARN,
1195404b540aSrobert 		 "octal escape sequence out of range");
1196404b540aSrobert       n &= mask;
1197404b540aSrobert     }
1198404b540aSrobert 
1199404b540aSrobert   emit_numeric_escape (pfile, n, tbuf, wide);
1200404b540aSrobert 
1201404b540aSrobert   return from;
1202404b540aSrobert }
1203404b540aSrobert 
1204404b540aSrobert /* Convert an escape sequence (pointed to by FROM) to its value on
1205404b540aSrobert    the target, and to the execution character set.  Do not scan past
1206404b540aSrobert    LIMIT.  Write the converted value into TBUF.  Returns an advanced
1207404b540aSrobert    pointer.  Handles all relevant diagnostics.  */
1208404b540aSrobert static const uchar *
convert_escape(cpp_reader * pfile,const uchar * from,const uchar * limit,struct _cpp_strbuf * tbuf,bool wide)1209404b540aSrobert convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
1210404b540aSrobert 		struct _cpp_strbuf *tbuf, bool wide)
1211404b540aSrobert {
1212404b540aSrobert   /* Values of \a \b \e \f \n \r \t \v respectively.  */
1213404b540aSrobert #if HOST_CHARSET == HOST_CHARSET_ASCII
1214404b540aSrobert   static const uchar charconsts[] = {  7,  8, 27, 12, 10, 13,  9, 11 };
1215404b540aSrobert #elif HOST_CHARSET == HOST_CHARSET_EBCDIC
1216404b540aSrobert   static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13,  5, 11 };
1217404b540aSrobert #else
1218404b540aSrobert #error "unknown host character set"
1219404b540aSrobert #endif
1220404b540aSrobert 
1221404b540aSrobert   uchar c;
1222404b540aSrobert   struct cset_converter cvt
1223404b540aSrobert     = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1224404b540aSrobert 
1225404b540aSrobert   c = *from;
1226404b540aSrobert   switch (c)
1227404b540aSrobert     {
1228404b540aSrobert       /* UCNs, hex escapes, and octal escapes are processed separately.  */
1229404b540aSrobert     case 'u': case 'U':
1230404b540aSrobert       return convert_ucn (pfile, from, limit, tbuf, wide);
1231404b540aSrobert 
1232404b540aSrobert     case 'x':
1233404b540aSrobert       return convert_hex (pfile, from, limit, tbuf, wide);
1234404b540aSrobert       break;
1235404b540aSrobert 
1236404b540aSrobert     case '0':  case '1':  case '2':  case '3':
1237404b540aSrobert     case '4':  case '5':  case '6':  case '7':
1238404b540aSrobert       return convert_oct (pfile, from, limit, tbuf, wide);
1239404b540aSrobert 
1240404b540aSrobert       /* Various letter escapes.  Get the appropriate host-charset
1241404b540aSrobert 	 value into C.  */
1242404b540aSrobert     case '\\': case '\'': case '"': case '?': break;
1243404b540aSrobert 
1244404b540aSrobert     case '(': case '{': case '[': case '%':
1245404b540aSrobert       /* '\(', etc, can be used at the beginning of a line in a long
1246404b540aSrobert 	 string split onto multiple lines with \-newline, to prevent
1247404b540aSrobert 	 Emacs or other text editors from getting confused.  '\%' can
1248404b540aSrobert 	 be used to prevent SCCS from mangling printf format strings.  */
1249404b540aSrobert       if (CPP_PEDANTIC (pfile))
1250404b540aSrobert 	goto unknown;
1251404b540aSrobert       break;
1252404b540aSrobert 
1253404b540aSrobert     case 'b': c = charconsts[1];  break;
1254404b540aSrobert     case 'f': c = charconsts[3];  break;
1255404b540aSrobert     case 'n': c = charconsts[4];  break;
1256404b540aSrobert     case 'r': c = charconsts[5];  break;
1257404b540aSrobert     case 't': c = charconsts[6];  break;
1258404b540aSrobert     case 'v': c = charconsts[7];  break;
1259404b540aSrobert 
1260404b540aSrobert     case 'a':
1261404b540aSrobert       if (CPP_WTRADITIONAL (pfile))
1262404b540aSrobert 	cpp_error (pfile, CPP_DL_WARNING,
1263404b540aSrobert 		   "the meaning of '\\a' is different in traditional C");
1264404b540aSrobert       c = charconsts[0];
1265404b540aSrobert       break;
1266404b540aSrobert 
1267404b540aSrobert     case 'e': case 'E':
1268404b540aSrobert       if (CPP_PEDANTIC (pfile))
1269404b540aSrobert 	cpp_error (pfile, CPP_DL_PEDWARN,
1270404b540aSrobert 		   "non-ISO-standard escape sequence, '\\%c'", (int) c);
1271404b540aSrobert       c = charconsts[2];
1272404b540aSrobert       break;
1273404b540aSrobert 
1274404b540aSrobert     default:
1275404b540aSrobert     unknown:
1276404b540aSrobert       if (ISGRAPH (c))
1277404b540aSrobert 	cpp_error (pfile, CPP_DL_PEDWARN,
1278404b540aSrobert 		   "unknown escape sequence '\\%c'", (int) c);
1279404b540aSrobert       else
1280404b540aSrobert 	{
1281404b540aSrobert 	  /* diagnostic.c does not support "%03o".  When it does, this
1282404b540aSrobert 	     code can use %03o directly in the diagnostic again.  */
1283404b540aSrobert 	  char buf[32];
1284404b540aSrobert 	  sprintf(buf, "%03o", (int) c);
1285404b540aSrobert 	  cpp_error (pfile, CPP_DL_PEDWARN,
1286404b540aSrobert 		     "unknown escape sequence: '\\%s'", buf);
1287404b540aSrobert 	}
1288404b540aSrobert     }
1289404b540aSrobert 
1290404b540aSrobert   /* Now convert what we have to the execution character set.  */
1291404b540aSrobert   if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
1292404b540aSrobert     cpp_errno (pfile, CPP_DL_ERROR,
1293404b540aSrobert 	       "converting escape sequence to execution character set");
1294404b540aSrobert 
1295404b540aSrobert   return from + 1;
1296404b540aSrobert }
1297404b540aSrobert 
1298404b540aSrobert /* FROM is an array of cpp_string structures of length COUNT.  These
1299404b540aSrobert    are to be converted from the source to the execution character set,
1300404b540aSrobert    escape sequences translated, and finally all are to be
1301404b540aSrobert    concatenated.  WIDE indicates whether or not to produce a wide
1302404b540aSrobert    string.  The result is written into TO.  Returns true for success,
1303404b540aSrobert    false for failure.  */
1304404b540aSrobert bool
cpp_interpret_string(cpp_reader * pfile,const cpp_string * from,size_t count,cpp_string * to,bool wide)1305404b540aSrobert cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
1306404b540aSrobert 		      cpp_string *to, bool wide)
1307404b540aSrobert {
1308404b540aSrobert   struct _cpp_strbuf tbuf;
1309404b540aSrobert   const uchar *p, *base, *limit;
1310404b540aSrobert   size_t i;
1311404b540aSrobert   struct cset_converter cvt
1312404b540aSrobert     = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1313404b540aSrobert 
1314404b540aSrobert   tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
1315404b540aSrobert   tbuf.text = XNEWVEC (uchar, tbuf.asize);
1316404b540aSrobert   tbuf.len = 0;
1317404b540aSrobert 
1318404b540aSrobert   for (i = 0; i < count; i++)
1319404b540aSrobert     {
1320404b540aSrobert       p = from[i].text;
1321404b540aSrobert       if (*p == 'L') p++;
1322404b540aSrobert       p++; /* Skip leading quote.  */
1323404b540aSrobert       limit = from[i].text + from[i].len - 1; /* Skip trailing quote.  */
1324404b540aSrobert 
1325404b540aSrobert       for (;;)
1326404b540aSrobert 	{
1327404b540aSrobert 	  base = p;
1328404b540aSrobert 	  while (p < limit && *p != '\\')
1329404b540aSrobert 	    p++;
1330404b540aSrobert 	  if (p > base)
1331404b540aSrobert 	    {
1332404b540aSrobert 	      /* We have a run of normal characters; these can be fed
1333404b540aSrobert 		 directly to convert_cset.  */
1334404b540aSrobert 	      if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
1335404b540aSrobert 		goto fail;
1336404b540aSrobert 	    }
1337404b540aSrobert 	  if (p == limit)
1338404b540aSrobert 	    break;
1339404b540aSrobert 
1340404b540aSrobert 	  p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
1341404b540aSrobert 	}
1342404b540aSrobert     }
1343404b540aSrobert   /* NUL-terminate the 'to' buffer and translate it to a cpp_string
1344404b540aSrobert      structure.  */
1345404b540aSrobert   emit_numeric_escape (pfile, 0, &tbuf, wide);
1346404b540aSrobert   tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len);
1347404b540aSrobert   to->text = tbuf.text;
1348404b540aSrobert   to->len = tbuf.len;
1349404b540aSrobert   return true;
1350404b540aSrobert 
1351404b540aSrobert  fail:
1352404b540aSrobert   cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
1353404b540aSrobert   free (tbuf.text);
1354404b540aSrobert   return false;
1355404b540aSrobert }
1356404b540aSrobert 
1357404b540aSrobert /* Subroutine of do_line and do_linemarker.  Convert escape sequences
1358404b540aSrobert    in a string, but do not perform character set conversion.  */
1359404b540aSrobert bool
cpp_interpret_string_notranslate(cpp_reader * pfile,const cpp_string * from,size_t count,cpp_string * to,bool wide)1360404b540aSrobert cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
1361404b540aSrobert 				  size_t count,	cpp_string *to, bool wide)
1362404b540aSrobert {
1363404b540aSrobert   struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
1364404b540aSrobert   bool retval;
1365404b540aSrobert 
1366404b540aSrobert   pfile->narrow_cset_desc.func = convert_no_conversion;
1367404b540aSrobert   pfile->narrow_cset_desc.cd = (iconv_t) -1;
1368404b540aSrobert 
1369404b540aSrobert   retval = cpp_interpret_string (pfile, from, count, to, wide);
1370404b540aSrobert 
1371404b540aSrobert   pfile->narrow_cset_desc = save_narrow_cset_desc;
1372404b540aSrobert   return retval;
1373404b540aSrobert }
1374404b540aSrobert 
1375404b540aSrobert 
1376404b540aSrobert /* Subroutine of cpp_interpret_charconst which performs the conversion
1377404b540aSrobert    to a number, for narrow strings.  STR is the string structure returned
1378404b540aSrobert    by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
1379404b540aSrobert    cpp_interpret_charconst.  */
1380404b540aSrobert static cppchar_t
narrow_str_to_charconst(cpp_reader * pfile,cpp_string str,unsigned int * pchars_seen,int * unsignedp)1381404b540aSrobert narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
1382404b540aSrobert 			 unsigned int *pchars_seen, int *unsignedp)
1383404b540aSrobert {
1384404b540aSrobert   size_t width = CPP_OPTION (pfile, char_precision);
1385404b540aSrobert   size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
1386404b540aSrobert   size_t mask = width_to_mask (width);
1387404b540aSrobert   size_t i;
1388404b540aSrobert   cppchar_t result, c;
1389404b540aSrobert   bool unsigned_p;
1390404b540aSrobert 
1391404b540aSrobert   /* The value of a multi-character character constant, or a
1392404b540aSrobert      single-character character constant whose representation in the
1393404b540aSrobert      execution character set is more than one byte long, is
1394404b540aSrobert      implementation defined.  This implementation defines it to be the
1395404b540aSrobert      number formed by interpreting the byte sequence in memory as a
1396404b540aSrobert      big-endian binary number.  If overflow occurs, the high bytes are
1397404b540aSrobert      lost, and a warning is issued.
1398404b540aSrobert 
1399404b540aSrobert      We don't want to process the NUL terminator handed back by
1400404b540aSrobert      cpp_interpret_string.  */
1401404b540aSrobert   result = 0;
1402404b540aSrobert   for (i = 0; i < str.len - 1; i++)
1403404b540aSrobert     {
1404404b540aSrobert       c = str.text[i] & mask;
1405404b540aSrobert       if (width < BITS_PER_CPPCHAR_T)
1406404b540aSrobert 	result = (result << width) | c;
1407404b540aSrobert       else
1408404b540aSrobert 	result = c;
1409404b540aSrobert     }
1410404b540aSrobert 
1411404b540aSrobert   if (i > max_chars)
1412404b540aSrobert     {
1413404b540aSrobert       i = max_chars;
1414404b540aSrobert       cpp_error (pfile, CPP_DL_WARNING,
1415404b540aSrobert 		 "character constant too long for its type");
1416404b540aSrobert     }
1417404b540aSrobert   else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
1418404b540aSrobert     cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant");
1419404b540aSrobert 
1420404b540aSrobert   /* Multichar constants are of type int and therefore signed.  */
1421404b540aSrobert   if (i > 1)
1422404b540aSrobert     unsigned_p = 0;
1423404b540aSrobert   else
1424404b540aSrobert     unsigned_p = CPP_OPTION (pfile, unsigned_char);
1425404b540aSrobert 
1426404b540aSrobert   /* Truncate the constant to its natural width, and simultaneously
1427404b540aSrobert      sign- or zero-extend to the full width of cppchar_t.
1428404b540aSrobert      For single-character constants, the value is WIDTH bits wide.
1429404b540aSrobert      For multi-character constants, the value is INT_PRECISION bits wide.  */
1430404b540aSrobert   if (i > 1)
1431404b540aSrobert     width = CPP_OPTION (pfile, int_precision);
1432404b540aSrobert   if (width < BITS_PER_CPPCHAR_T)
1433404b540aSrobert     {
1434404b540aSrobert       mask = ((cppchar_t) 1 << width) - 1;
1435404b540aSrobert       if (unsigned_p || !(result & (1 << (width - 1))))
1436404b540aSrobert 	result &= mask;
1437404b540aSrobert       else
1438404b540aSrobert 	result |= ~mask;
1439404b540aSrobert     }
1440404b540aSrobert   *pchars_seen = i;
1441404b540aSrobert   *unsignedp = unsigned_p;
1442404b540aSrobert   return result;
1443404b540aSrobert }
1444404b540aSrobert 
1445404b540aSrobert /* Subroutine of cpp_interpret_charconst which performs the conversion
1446404b540aSrobert    to a number, for wide strings.  STR is the string structure returned
1447404b540aSrobert    by cpp_interpret_string.  PCHARS_SEEN and UNSIGNEDP are as for
1448404b540aSrobert    cpp_interpret_charconst.  */
1449404b540aSrobert static cppchar_t
wide_str_to_charconst(cpp_reader * pfile,cpp_string str,unsigned int * pchars_seen,int * unsignedp)1450404b540aSrobert wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
1451404b540aSrobert 		       unsigned int *pchars_seen, int *unsignedp)
1452404b540aSrobert {
1453404b540aSrobert   bool bigend = CPP_OPTION (pfile, bytes_big_endian);
1454404b540aSrobert   size_t width = CPP_OPTION (pfile, wchar_precision);
1455404b540aSrobert   size_t cwidth = CPP_OPTION (pfile, char_precision);
1456404b540aSrobert   size_t mask = width_to_mask (width);
1457404b540aSrobert   size_t cmask = width_to_mask (cwidth);
1458404b540aSrobert   size_t nbwc = width / cwidth;
1459404b540aSrobert   size_t off, i;
1460404b540aSrobert   cppchar_t result = 0, c;
1461404b540aSrobert 
1462404b540aSrobert   /* This is finicky because the string is in the target's byte order,
1463404b540aSrobert      which may not be our byte order.  Only the last character, ignoring
1464404b540aSrobert      the NUL terminator, is relevant.  */
1465404b540aSrobert   off = str.len - (nbwc * 2);
1466404b540aSrobert   result = 0;
1467404b540aSrobert   for (i = 0; i < nbwc; i++)
1468404b540aSrobert     {
1469404b540aSrobert       c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
1470404b540aSrobert       result = (result << cwidth) | (c & cmask);
1471404b540aSrobert     }
1472404b540aSrobert 
1473404b540aSrobert   /* Wide character constants have type wchar_t, and a single
1474404b540aSrobert      character exactly fills a wchar_t, so a multi-character wide
1475404b540aSrobert      character constant is guaranteed to overflow.  */
1476404b540aSrobert   if (off > 0)
1477404b540aSrobert     cpp_error (pfile, CPP_DL_WARNING,
1478404b540aSrobert 	       "character constant too long for its type");
1479404b540aSrobert 
1480404b540aSrobert   /* Truncate the constant to its natural width, and simultaneously
1481404b540aSrobert      sign- or zero-extend to the full width of cppchar_t.  */
1482404b540aSrobert   if (width < BITS_PER_CPPCHAR_T)
1483404b540aSrobert     {
1484404b540aSrobert       if (CPP_OPTION (pfile, unsigned_wchar) || !(result & (1 << (width - 1))))
1485404b540aSrobert 	result &= mask;
1486404b540aSrobert       else
1487404b540aSrobert 	result |= ~mask;
1488404b540aSrobert     }
1489404b540aSrobert 
1490404b540aSrobert   *unsignedp = CPP_OPTION (pfile, unsigned_wchar);
1491404b540aSrobert   *pchars_seen = 1;
1492404b540aSrobert   return result;
1493404b540aSrobert }
1494404b540aSrobert 
1495404b540aSrobert /* Interpret a (possibly wide) character constant in TOKEN.
1496404b540aSrobert    PCHARS_SEEN points to a variable that is filled in with the number
1497404b540aSrobert    of characters seen, and UNSIGNEDP to a variable that indicates
1498404b540aSrobert    whether the result has signed type.  */
1499404b540aSrobert cppchar_t
cpp_interpret_charconst(cpp_reader * pfile,const cpp_token * token,unsigned int * pchars_seen,int * unsignedp)1500404b540aSrobert cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
1501404b540aSrobert 			 unsigned int *pchars_seen, int *unsignedp)
1502404b540aSrobert {
1503404b540aSrobert   cpp_string str = { 0, 0 };
1504404b540aSrobert   bool wide = (token->type == CPP_WCHAR);
1505404b540aSrobert   cppchar_t result;
1506404b540aSrobert 
1507404b540aSrobert   /* an empty constant will appear as L'' or '' */
1508404b540aSrobert   if (token->val.str.len == (size_t) (2 + wide))
1509404b540aSrobert     {
1510404b540aSrobert       cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
1511404b540aSrobert       return 0;
1512404b540aSrobert     }
1513404b540aSrobert   else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
1514404b540aSrobert     return 0;
1515404b540aSrobert 
1516404b540aSrobert   if (wide)
1517404b540aSrobert     result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1518404b540aSrobert   else
1519404b540aSrobert     result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1520404b540aSrobert 
1521404b540aSrobert   if (str.text != token->val.str.text)
1522404b540aSrobert     free ((void *)str.text);
1523404b540aSrobert 
1524404b540aSrobert   return result;
1525404b540aSrobert }
1526404b540aSrobert 
1527404b540aSrobert /* Convert an identifier denoted by ID and LEN, which might contain
1528404b540aSrobert    UCN escapes, to the source character set, either UTF-8 or
1529404b540aSrobert    UTF-EBCDIC.  Assumes that the identifier is actually a valid identifier.  */
1530404b540aSrobert cpp_hashnode *
_cpp_interpret_identifier(cpp_reader * pfile,const uchar * id,size_t len)1531404b540aSrobert _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
1532404b540aSrobert {
1533404b540aSrobert   /* It turns out that a UCN escape always turns into fewer characters
1534404b540aSrobert      than the escape itself, so we can allocate a temporary in advance.  */
1535404b540aSrobert   uchar * buf = (uchar *) alloca (len + 1);
1536404b540aSrobert   uchar * bufp = buf;
1537404b540aSrobert   size_t idp;
1538404b540aSrobert 
1539404b540aSrobert   for (idp = 0; idp < len; idp++)
1540404b540aSrobert     if (id[idp] != '\\')
1541404b540aSrobert       *bufp++ = id[idp];
1542404b540aSrobert     else
1543404b540aSrobert       {
1544404b540aSrobert 	unsigned length = id[idp+1] == 'u' ? 4 : 8;
1545404b540aSrobert 	cppchar_t value = 0;
1546404b540aSrobert 	size_t bufleft = len - (bufp - buf);
1547404b540aSrobert 	int rval;
1548404b540aSrobert 
1549404b540aSrobert 	idp += 2;
1550404b540aSrobert 	while (length && idp < len && ISXDIGIT (id[idp]))
1551404b540aSrobert 	  {
1552404b540aSrobert 	    value = (value << 4) + hex_value (id[idp]);
1553404b540aSrobert 	    idp++;
1554404b540aSrobert 	    length--;
1555404b540aSrobert 	  }
1556404b540aSrobert 	idp--;
1557404b540aSrobert 
1558404b540aSrobert 	/* Special case for EBCDIC: if the identifier contains
1559404b540aSrobert 	   a '$' specified using a UCN, translate it to EBCDIC.  */
1560404b540aSrobert 	if (value == 0x24)
1561404b540aSrobert 	  {
1562404b540aSrobert 	    *bufp++ = '$';
1563404b540aSrobert 	    continue;
1564404b540aSrobert 	  }
1565404b540aSrobert 
1566404b540aSrobert 	rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
1567404b540aSrobert 	if (rval)
1568404b540aSrobert 	  {
1569404b540aSrobert 	    errno = rval;
1570404b540aSrobert 	    cpp_errno (pfile, CPP_DL_ERROR,
1571404b540aSrobert 		       "converting UCN to source character set");
1572404b540aSrobert 	    break;
1573404b540aSrobert 	  }
1574404b540aSrobert       }
1575404b540aSrobert 
1576404b540aSrobert   return CPP_HASHNODE (ht_lookup (pfile->hash_table,
1577404b540aSrobert 				  buf, bufp - buf, HT_ALLOC));
1578404b540aSrobert }
1579404b540aSrobert 
1580404b540aSrobert /* Convert an input buffer (containing the complete contents of one
1581404b540aSrobert    source file) from INPUT_CHARSET to the source character set.  INPUT
1582404b540aSrobert    points to the input buffer, SIZE is its allocated size, and LEN is
1583404b540aSrobert    the length of the meaningful data within the buffer.  The
1584404b540aSrobert    translated buffer is returned, and *ST_SIZE is set to the length of
1585404b540aSrobert    the meaningful data within the translated buffer.
1586404b540aSrobert 
1587404b540aSrobert    INPUT is expected to have been allocated with xmalloc.  This function
1588404b540aSrobert    will either return INPUT, or free it and return a pointer to another
1589404b540aSrobert    xmalloc-allocated block of memory.  */
1590404b540aSrobert uchar *
_cpp_convert_input(cpp_reader * pfile,const char * input_charset,uchar * input,size_t size,size_t len,off_t * st_size)1591404b540aSrobert _cpp_convert_input (cpp_reader *pfile, const char *input_charset,
1592404b540aSrobert 		    uchar *input, size_t size, size_t len, off_t *st_size)
1593404b540aSrobert {
1594404b540aSrobert   struct cset_converter input_cset;
1595404b540aSrobert   struct _cpp_strbuf to;
1596404b540aSrobert 
1597404b540aSrobert   input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
1598404b540aSrobert   if (input_cset.func == convert_no_conversion)
1599404b540aSrobert     {
1600404b540aSrobert       to.text = input;
1601404b540aSrobert       to.asize = size;
1602404b540aSrobert       to.len = len;
1603404b540aSrobert     }
1604404b540aSrobert   else
1605404b540aSrobert     {
1606404b540aSrobert       to.asize = MAX (65536, len);
1607404b540aSrobert       to.text = XNEWVEC (uchar, to.asize);
1608404b540aSrobert       to.len = 0;
1609404b540aSrobert 
1610404b540aSrobert       if (!APPLY_CONVERSION (input_cset, input, len, &to))
1611404b540aSrobert 	cpp_error (pfile, CPP_DL_ERROR,
1612404b540aSrobert 		   "failure to convert %s to %s",
1613404b540aSrobert 		   CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
1614404b540aSrobert 
1615404b540aSrobert       free (input);
1616404b540aSrobert     }
1617404b540aSrobert 
1618404b540aSrobert   /* Clean up the mess.  */
1619404b540aSrobert   if (input_cset.func == convert_using_iconv)
1620404b540aSrobert     iconv_close (input_cset.cd);
1621404b540aSrobert 
1622404b540aSrobert   /* Resize buffer if we allocated substantially too much, or if we
1623404b540aSrobert      haven't enough space for the \n-terminator.  */
1624404b540aSrobert   if (to.len + 4096 < to.asize || to.len >= to.asize)
1625404b540aSrobert     to.text = XRESIZEVEC (uchar, to.text, to.len + 1);
1626404b540aSrobert 
1627404b540aSrobert   /* If the file is using old-school Mac line endings (\r only),
1628404b540aSrobert      terminate with another \r, not an \n, so that we do not mistake
1629404b540aSrobert      the \r\n sequence for a single DOS line ending and erroneously
1630404b540aSrobert      issue the "No newline at end of file" diagnostic.  */
1631*1c4aaf6cSkili   if (to.len > 0 && to.text[to.len - 1] == '\r')
1632404b540aSrobert     to.text[to.len] = '\r';
1633404b540aSrobert   else
1634404b540aSrobert     to.text[to.len] = '\n';
1635404b540aSrobert 
1636404b540aSrobert   *st_size = to.len;
1637404b540aSrobert   return to.text;
1638404b540aSrobert }
1639404b540aSrobert 
1640404b540aSrobert /* Decide on the default encoding to assume for input files.  */
1641404b540aSrobert const char *
_cpp_default_encoding(void)1642404b540aSrobert _cpp_default_encoding (void)
1643404b540aSrobert {
1644404b540aSrobert   const char *current_encoding = NULL;
1645404b540aSrobert 
1646404b540aSrobert   /* We disable this because the default codeset is 7-bit ASCII on
1647404b540aSrobert      most platforms, and this causes conversion failures on every
1648404b540aSrobert      file in GCC that happens to have one of the upper 128 characters
1649404b540aSrobert      in it -- most likely, as part of the name of a contributor.
1650404b540aSrobert      We should definitely recognize in-band markers of file encoding,
1651404b540aSrobert      like:
1652404b540aSrobert      - the appropriate Unicode byte-order mark (FE FF) to recognize
1653404b540aSrobert        UTF16 and UCS4 (in both big-endian and little-endian flavors)
1654404b540aSrobert        and UTF8
1655404b540aSrobert      - a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
1656404b540aSrobert        distinguish ASCII and EBCDIC.
1657404b540aSrobert      - now we can parse something like "#pragma GCC encoding <xyz>
1658404b540aSrobert        on the first line, or even Emacs/VIM's mode line tags (there's
1659404b540aSrobert        a problem here in that VIM uses the last line, and Emacs has
1660404b540aSrobert        its more elaborate "local variables" convention).
1661404b540aSrobert      - investigate whether Java has another common convention, which
1662404b540aSrobert        would be friendly to support.
1663404b540aSrobert      (Zack Weinberg and Paolo Bonzini, May 20th 2004)  */
1664404b540aSrobert #if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
1665404b540aSrobert   setlocale (LC_CTYPE, "");
1666404b540aSrobert   current_encoding = nl_langinfo (CODESET);
1667404b540aSrobert #endif
1668404b540aSrobert   if (current_encoding == NULL || *current_encoding == '\0')
1669404b540aSrobert     current_encoding = SOURCE_CHARSET;
1670404b540aSrobert 
1671404b540aSrobert   return current_encoding;
1672404b540aSrobert }
1673