1404b540aSrobert /* CPP Library - charsets
2404b540aSrobert Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003, 2004
3404b540aSrobert Free Software Foundation, Inc.
4404b540aSrobert
5404b540aSrobert Broken out of c-lex.c Apr 2003, adding valid C99 UCN ranges.
6404b540aSrobert
7404b540aSrobert This program is free software; you can redistribute it and/or modify it
8404b540aSrobert under the terms of the GNU General Public License as published by the
9404b540aSrobert Free Software Foundation; either version 2, or (at your option) any
10404b540aSrobert later version.
11404b540aSrobert
12404b540aSrobert This program is distributed in the hope that it will be useful,
13404b540aSrobert but WITHOUT ANY WARRANTY; without even the implied warranty of
14404b540aSrobert MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15404b540aSrobert GNU General Public License for more details.
16404b540aSrobert
17404b540aSrobert You should have received a copy of the GNU General Public License
18404b540aSrobert along with this program; if not, write to the Free Software
19404b540aSrobert Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
20404b540aSrobert
21404b540aSrobert #include "config.h"
22404b540aSrobert #include "system.h"
23404b540aSrobert #include "cpplib.h"
24404b540aSrobert #include "internal.h"
25404b540aSrobert
26404b540aSrobert /* Character set handling for C-family languages.
27404b540aSrobert
28404b540aSrobert Terminological note: In what follows, "charset" or "character set"
29404b540aSrobert will be taken to mean both an abstract set of characters and an
30404b540aSrobert encoding for that set.
31404b540aSrobert
32404b540aSrobert The C99 standard discusses two character sets: source and execution.
33404b540aSrobert The source character set is used for internal processing in translation
34404b540aSrobert phases 1 through 4; the execution character set is used thereafter.
35404b540aSrobert Both are required by 5.2.1.2p1 to be multibyte encodings, not wide
36404b540aSrobert character encodings (see 3.7.2, 3.7.3 for the standardese meanings
37404b540aSrobert of these terms). Furthermore, the "basic character set" (listed in
38404b540aSrobert 5.2.1p3) is to be encoded in each with values one byte wide, and is
39404b540aSrobert to appear in the initial shift state.
40404b540aSrobert
41404b540aSrobert It is not explicitly mentioned, but there is also a "wide execution
42404b540aSrobert character set" used to encode wide character constants and wide
43404b540aSrobert string literals; this is supposed to be the result of applying the
44404b540aSrobert standard library function mbstowcs() to an equivalent narrow string
45404b540aSrobert (6.4.5p5). However, the behavior of hexadecimal and octal
46404b540aSrobert \-escapes is at odds with this; they are supposed to be translated
47404b540aSrobert directly to wchar_t values (6.4.4.4p5,6).
48404b540aSrobert
49404b540aSrobert The source character set is not necessarily the character set used
50404b540aSrobert to encode physical source files on disk; translation phase 1 converts
51404b540aSrobert from whatever that encoding is to the source character set.
52404b540aSrobert
53404b540aSrobert The presence of universal character names in C99 (6.4.3 et seq.)
54404b540aSrobert forces the source character set to be isomorphic to ISO 10646,
55404b540aSrobert that is, Unicode. There is no such constraint on the execution
56404b540aSrobert character set; note also that the conversion from source to
57404b540aSrobert execution character set does not occur for identifiers (5.1.1.2p1#5).
58404b540aSrobert
59404b540aSrobert For convenience of implementation, the source character set's
60404b540aSrobert encoding of the basic character set should be identical to the
61404b540aSrobert execution character set OF THE HOST SYSTEM's encoding of the basic
62404b540aSrobert character set, and it should not be a state-dependent encoding.
63404b540aSrobert
64404b540aSrobert cpplib uses UTF-8 or UTF-EBCDIC for the source character set,
65404b540aSrobert depending on whether the host is based on ASCII or EBCDIC (see
66404b540aSrobert respectively Unicode section 2.3/ISO10646 Amendment 2, and Unicode
67404b540aSrobert Technical Report #16). With limited exceptions, it relies on the
68404b540aSrobert system library's iconv() primitive to do charset conversion
69404b540aSrobert (specified in SUSv2). */
70404b540aSrobert
71404b540aSrobert #if !HAVE_ICONV
72404b540aSrobert /* Make certain that the uses of iconv(), iconv_open(), iconv_close()
73404b540aSrobert below, which are guarded only by if statements with compile-time
74404b540aSrobert constant conditions, do not cause link errors. */
75404b540aSrobert #define iconv_open(x, y) (errno = EINVAL, (iconv_t)-1)
76404b540aSrobert #define iconv(a,b,c,d,e) (errno = EINVAL, (size_t)-1)
77404b540aSrobert #define iconv_close(x) (void)0
78404b540aSrobert #define ICONV_CONST
79404b540aSrobert #endif
80404b540aSrobert
81404b540aSrobert #if HOST_CHARSET == HOST_CHARSET_ASCII
82404b540aSrobert #define SOURCE_CHARSET "UTF-8"
83404b540aSrobert #define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0x7e
84404b540aSrobert #elif HOST_CHARSET == HOST_CHARSET_EBCDIC
85404b540aSrobert #define SOURCE_CHARSET "UTF-EBCDIC"
86404b540aSrobert #define LAST_POSSIBLY_BASIC_SOURCE_CHAR 0xFF
87404b540aSrobert #else
88404b540aSrobert #error "Unrecognized basic host character set"
89404b540aSrobert #endif
90404b540aSrobert
91404b540aSrobert #ifndef EILSEQ
92404b540aSrobert #define EILSEQ EINVAL
93404b540aSrobert #endif
94404b540aSrobert
95404b540aSrobert /* This structure is used for a resizable string buffer throughout. */
96404b540aSrobert /* Don't call it strbuf, as that conflicts with unistd.h on systems
97404b540aSrobert such as DYNIX/ptx where unistd.h includes stropts.h. */
struct _cpp_strbuf
{
  /* Start of the buffer; the converters in this file grow it with
     XRESIZEVEC as needed.  */
  uchar *text;
  /* Number of bytes currently allocated at TEXT.  */
  size_t asize;
  /* Number of bytes of TEXT already in use; converters append at
     TEXT + LEN and treat ASIZE - LEN as the free space.  */
  size_t len;
};
104404b540aSrobert
105404b540aSrobert /* This is enough to hold any string that fits on a single 80-column
106404b540aSrobert line, even if iconv quadruples its size (e.g. conversion from
107404b540aSrobert ASCII to UTF-32) rounded up to a power of two. */
108404b540aSrobert #define OUTBUF_BLOCK_SIZE 256
109404b540aSrobert
110404b540aSrobert /* Conversions between UTF-8 and UTF-16/32 are implemented by custom
111404b540aSrobert logic. This is because a depressing number of systems lack iconv,
   or have iconv libraries that do not do these conversions, so
113404b540aSrobert we need a fallback implementation for them. To ensure the fallback
114404b540aSrobert doesn't break due to neglect, it is used on all systems.
115404b540aSrobert
116404b540aSrobert UTF-32 encoding is nice and simple: a four-byte binary number,
117404b540aSrobert constrained to the range 00000000-7FFFFFFF to avoid questions of
118404b540aSrobert signedness. We do have to cope with big- and little-endian
119404b540aSrobert variants.
120404b540aSrobert
121404b540aSrobert UTF-16 encoding uses two-byte binary numbers, again in big- and
122404b540aSrobert little-endian variants, for all values in the 00000000-0000FFFF
123404b540aSrobert range. Values in the 00010000-0010FFFF range are encoded as pairs
124404b540aSrobert of two-byte numbers, called "surrogate pairs": given a number S in
125404b540aSrobert this range, it is mapped to a pair (H, L) as follows:
126404b540aSrobert
127404b540aSrobert H = (S - 0x10000) / 0x400 + 0xD800
128404b540aSrobert L = (S - 0x10000) % 0x400 + 0xDC00
129404b540aSrobert
130404b540aSrobert Two-byte values in the D800...DFFF range are ill-formed except as a
131404b540aSrobert component of a surrogate pair. Even if the encoding within a
132404b540aSrobert two-byte value is little-endian, the H member of the surrogate pair
133404b540aSrobert comes first.
134404b540aSrobert
135404b540aSrobert There is no way to encode values in the 00110000-7FFFFFFF range,
136404b540aSrobert which is not currently a problem as there are no assigned code
137404b540aSrobert points in that range; however, the author expects that it will
138404b540aSrobert eventually become necessary to abandon UTF-16 due to this
139404b540aSrobert limitation. Note also that, because of these pairs, UTF-16 does
140404b540aSrobert not meet the requirements of the C standard for a wide character
141404b540aSrobert encoding (see 3.7.3 and 6.4.4.4p11).
142404b540aSrobert
143404b540aSrobert UTF-8 encoding looks like this:
144404b540aSrobert
145404b540aSrobert value range encoded as
146404b540aSrobert 00000000-0000007F 0xxxxxxx
147404b540aSrobert 00000080-000007FF 110xxxxx 10xxxxxx
148404b540aSrobert 00000800-0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
149404b540aSrobert 00010000-001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
150404b540aSrobert 00200000-03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
151404b540aSrobert 04000000-7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
152404b540aSrobert
153404b540aSrobert Values in the 0000D800 ... 0000DFFF range (surrogates) are invalid,
154404b540aSrobert which means that three-byte sequences ED xx yy, with A0 <= xx <= BF,
155404b540aSrobert never occur. Note also that any value that can be encoded by a
156404b540aSrobert given row of the table can also be encoded by all successive rows,
157404b540aSrobert but this is not done; only the shortest possible encoding for any
158404b540aSrobert given value is valid. For instance, the character 07C0 could be
159404b540aSrobert encoded as any of DF 80, E0 9F 80, F0 80 9F 80, F8 80 80 9F 80, or
160404b540aSrobert FC 80 80 80 9F 80. Only the first is valid.
161404b540aSrobert
162404b540aSrobert An implementation note: the transformation from UTF-16 to UTF-8, or
163404b540aSrobert vice versa, is easiest done by using UTF-32 as an intermediary. */
164404b540aSrobert
/* Internal primitives which go from a UTF-8 byte stream to native-endian
166404b540aSrobert UTF-32 in a cppchar_t, or vice versa; this avoids an extra marshal/unmarshal
167404b540aSrobert operation in several places below. */
168404b540aSrobert static inline int
one_utf8_to_cppchar(const uchar ** inbufp,size_t * inbytesleftp,cppchar_t * cp)169404b540aSrobert one_utf8_to_cppchar (const uchar **inbufp, size_t *inbytesleftp,
170404b540aSrobert cppchar_t *cp)
171404b540aSrobert {
172404b540aSrobert static const uchar masks[6] = { 0x7F, 0x1F, 0x0F, 0x07, 0x02, 0x01 };
173404b540aSrobert static const uchar patns[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
174404b540aSrobert
175404b540aSrobert cppchar_t c;
176404b540aSrobert const uchar *inbuf = *inbufp;
177404b540aSrobert size_t nbytes, i;
178404b540aSrobert
179404b540aSrobert if (*inbytesleftp < 1)
180404b540aSrobert return EINVAL;
181404b540aSrobert
182404b540aSrobert c = *inbuf;
183404b540aSrobert if (c < 0x80)
184404b540aSrobert {
185404b540aSrobert *cp = c;
186404b540aSrobert *inbytesleftp -= 1;
187404b540aSrobert *inbufp += 1;
188404b540aSrobert return 0;
189404b540aSrobert }
190404b540aSrobert
191404b540aSrobert /* The number of leading 1-bits in the first byte indicates how many
192404b540aSrobert bytes follow. */
193404b540aSrobert for (nbytes = 2; nbytes < 7; nbytes++)
194404b540aSrobert if ((c & ~masks[nbytes-1]) == patns[nbytes-1])
195404b540aSrobert goto found;
196404b540aSrobert return EILSEQ;
197404b540aSrobert found:
198404b540aSrobert
199404b540aSrobert if (*inbytesleftp < nbytes)
200404b540aSrobert return EINVAL;
201404b540aSrobert
202404b540aSrobert c = (c & masks[nbytes-1]);
203404b540aSrobert inbuf++;
204404b540aSrobert for (i = 1; i < nbytes; i++)
205404b540aSrobert {
206404b540aSrobert cppchar_t n = *inbuf++;
207404b540aSrobert if ((n & 0xC0) != 0x80)
208404b540aSrobert return EILSEQ;
209404b540aSrobert c = ((c << 6) + (n & 0x3F));
210404b540aSrobert }
211404b540aSrobert
212404b540aSrobert /* Make sure the shortest possible encoding was used. */
213404b540aSrobert if (c <= 0x7F && nbytes > 1) return EILSEQ;
214404b540aSrobert if (c <= 0x7FF && nbytes > 2) return EILSEQ;
215404b540aSrobert if (c <= 0xFFFF && nbytes > 3) return EILSEQ;
216404b540aSrobert if (c <= 0x1FFFFF && nbytes > 4) return EILSEQ;
217404b540aSrobert if (c <= 0x3FFFFFF && nbytes > 5) return EILSEQ;
218404b540aSrobert
219404b540aSrobert /* Make sure the character is valid. */
220404b540aSrobert if (c > 0x7FFFFFFF || (c >= 0xD800 && c <= 0xDFFF)) return EILSEQ;
221404b540aSrobert
222404b540aSrobert *cp = c;
223404b540aSrobert *inbufp = inbuf;
224404b540aSrobert *inbytesleftp -= nbytes;
225404b540aSrobert return 0;
226404b540aSrobert }
227404b540aSrobert
228404b540aSrobert static inline int
one_cppchar_to_utf8(cppchar_t c,uchar ** outbufp,size_t * outbytesleftp)229404b540aSrobert one_cppchar_to_utf8 (cppchar_t c, uchar **outbufp, size_t *outbytesleftp)
230404b540aSrobert {
231404b540aSrobert static const uchar masks[6] = { 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
232404b540aSrobert static const uchar limits[6] = { 0x80, 0xE0, 0xF0, 0xF8, 0xFC, 0xFE };
233404b540aSrobert size_t nbytes;
234404b540aSrobert uchar buf[6], *p = &buf[6];
235404b540aSrobert uchar *outbuf = *outbufp;
236404b540aSrobert
237404b540aSrobert nbytes = 1;
238404b540aSrobert if (c < 0x80)
239404b540aSrobert *--p = c;
240404b540aSrobert else
241404b540aSrobert {
242404b540aSrobert do
243404b540aSrobert {
244404b540aSrobert *--p = ((c & 0x3F) | 0x80);
245404b540aSrobert c >>= 6;
246404b540aSrobert nbytes++;
247404b540aSrobert }
248404b540aSrobert while (c >= 0x3F || (c & limits[nbytes-1]));
249404b540aSrobert *--p = (c | masks[nbytes-1]);
250404b540aSrobert }
251404b540aSrobert
252404b540aSrobert if (*outbytesleftp < nbytes)
253404b540aSrobert return E2BIG;
254404b540aSrobert
255404b540aSrobert while (p < &buf[6])
256404b540aSrobert *outbuf++ = *p++;
257404b540aSrobert *outbytesleftp -= nbytes;
258404b540aSrobert *outbufp = outbuf;
259404b540aSrobert return 0;
260404b540aSrobert }
261404b540aSrobert
262404b540aSrobert /* The following four functions transform one character between the two
263404b540aSrobert encodings named in the function name. All have the signature
264404b540aSrobert int (*)(iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
265404b540aSrobert uchar **outbufp, size_t *outbytesleftp)
266404b540aSrobert
267404b540aSrobert BIGEND must have the value 0 or 1, coerced to (iconv_t); it is
268404b540aSrobert interpreted as a boolean indicating whether big-endian or
269404b540aSrobert little-endian encoding is to be used for the member of the pair
270404b540aSrobert that is not UTF-8.
271404b540aSrobert
272404b540aSrobert INBUFP, INBYTESLEFTP, OUTBUFP, OUTBYTESLEFTP work exactly as they
273404b540aSrobert do for iconv.
274404b540aSrobert
275404b540aSrobert The return value is either 0 for success, or an errno value for
276404b540aSrobert failure, which may be E2BIG (need more space), EILSEQ (ill-formed
   input sequence), or EINVAL (incomplete input sequence).  */
278404b540aSrobert
279404b540aSrobert static inline int
one_utf8_to_utf32(iconv_t bigend,const uchar ** inbufp,size_t * inbytesleftp,uchar ** outbufp,size_t * outbytesleftp)280404b540aSrobert one_utf8_to_utf32 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
281404b540aSrobert uchar **outbufp, size_t *outbytesleftp)
282404b540aSrobert {
283404b540aSrobert uchar *outbuf;
284404b540aSrobert cppchar_t s = 0;
285404b540aSrobert int rval;
286404b540aSrobert
287404b540aSrobert /* Check for space first, since we know exactly how much we need. */
288404b540aSrobert if (*outbytesleftp < 4)
289404b540aSrobert return E2BIG;
290404b540aSrobert
291404b540aSrobert rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
292404b540aSrobert if (rval)
293404b540aSrobert return rval;
294404b540aSrobert
295404b540aSrobert outbuf = *outbufp;
296404b540aSrobert outbuf[bigend ? 3 : 0] = (s & 0x000000FF);
297404b540aSrobert outbuf[bigend ? 2 : 1] = (s & 0x0000FF00) >> 8;
298404b540aSrobert outbuf[bigend ? 1 : 2] = (s & 0x00FF0000) >> 16;
299404b540aSrobert outbuf[bigend ? 0 : 3] = (s & 0xFF000000) >> 24;
300404b540aSrobert
301404b540aSrobert *outbufp += 4;
302404b540aSrobert *outbytesleftp -= 4;
303404b540aSrobert return 0;
304404b540aSrobert }
305404b540aSrobert
306404b540aSrobert static inline int
one_utf32_to_utf8(iconv_t bigend,const uchar ** inbufp,size_t * inbytesleftp,uchar ** outbufp,size_t * outbytesleftp)307404b540aSrobert one_utf32_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
308404b540aSrobert uchar **outbufp, size_t *outbytesleftp)
309404b540aSrobert {
310404b540aSrobert cppchar_t s;
311404b540aSrobert int rval;
312404b540aSrobert const uchar *inbuf;
313404b540aSrobert
314404b540aSrobert if (*inbytesleftp < 4)
315404b540aSrobert return EINVAL;
316404b540aSrobert
317404b540aSrobert inbuf = *inbufp;
318404b540aSrobert
319404b540aSrobert s = inbuf[bigend ? 0 : 3] << 24;
320404b540aSrobert s += inbuf[bigend ? 1 : 2] << 16;
321404b540aSrobert s += inbuf[bigend ? 2 : 1] << 8;
322404b540aSrobert s += inbuf[bigend ? 3 : 0];
323404b540aSrobert
324404b540aSrobert if (s >= 0x7FFFFFFF || (s >= 0xD800 && s <= 0xDFFF))
325404b540aSrobert return EILSEQ;
326404b540aSrobert
327404b540aSrobert rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
328404b540aSrobert if (rval)
329404b540aSrobert return rval;
330404b540aSrobert
331404b540aSrobert *inbufp += 4;
332404b540aSrobert *inbytesleftp -= 4;
333404b540aSrobert return 0;
334404b540aSrobert }
335404b540aSrobert
336404b540aSrobert static inline int
one_utf8_to_utf16(iconv_t bigend,const uchar ** inbufp,size_t * inbytesleftp,uchar ** outbufp,size_t * outbytesleftp)337404b540aSrobert one_utf8_to_utf16 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
338404b540aSrobert uchar **outbufp, size_t *outbytesleftp)
339404b540aSrobert {
340404b540aSrobert int rval;
341404b540aSrobert cppchar_t s = 0;
342404b540aSrobert const uchar *save_inbuf = *inbufp;
343404b540aSrobert size_t save_inbytesleft = *inbytesleftp;
344404b540aSrobert uchar *outbuf = *outbufp;
345404b540aSrobert
346404b540aSrobert rval = one_utf8_to_cppchar (inbufp, inbytesleftp, &s);
347404b540aSrobert if (rval)
348404b540aSrobert return rval;
349404b540aSrobert
350404b540aSrobert if (s > 0x0010FFFF)
351404b540aSrobert {
352404b540aSrobert *inbufp = save_inbuf;
353404b540aSrobert *inbytesleftp = save_inbytesleft;
354404b540aSrobert return EILSEQ;
355404b540aSrobert }
356404b540aSrobert
357404b540aSrobert if (s < 0xFFFF)
358404b540aSrobert {
359404b540aSrobert if (*outbytesleftp < 2)
360404b540aSrobert {
361404b540aSrobert *inbufp = save_inbuf;
362404b540aSrobert *inbytesleftp = save_inbytesleft;
363404b540aSrobert return E2BIG;
364404b540aSrobert }
365404b540aSrobert outbuf[bigend ? 1 : 0] = (s & 0x00FF);
366404b540aSrobert outbuf[bigend ? 0 : 1] = (s & 0xFF00) >> 8;
367404b540aSrobert
368404b540aSrobert *outbufp += 2;
369404b540aSrobert *outbytesleftp -= 2;
370404b540aSrobert return 0;
371404b540aSrobert }
372404b540aSrobert else
373404b540aSrobert {
374404b540aSrobert cppchar_t hi, lo;
375404b540aSrobert
376404b540aSrobert if (*outbytesleftp < 4)
377404b540aSrobert {
378404b540aSrobert *inbufp = save_inbuf;
379404b540aSrobert *inbytesleftp = save_inbytesleft;
380404b540aSrobert return E2BIG;
381404b540aSrobert }
382404b540aSrobert
383404b540aSrobert hi = (s - 0x10000) / 0x400 + 0xD800;
384404b540aSrobert lo = (s - 0x10000) % 0x400 + 0xDC00;
385404b540aSrobert
386404b540aSrobert /* Even if we are little-endian, put the high surrogate first.
387404b540aSrobert ??? Matches practice? */
388404b540aSrobert outbuf[bigend ? 1 : 0] = (hi & 0x00FF);
389404b540aSrobert outbuf[bigend ? 0 : 1] = (hi & 0xFF00) >> 8;
390404b540aSrobert outbuf[bigend ? 3 : 2] = (lo & 0x00FF);
391404b540aSrobert outbuf[bigend ? 2 : 3] = (lo & 0xFF00) >> 8;
392404b540aSrobert
393404b540aSrobert *outbufp += 4;
394404b540aSrobert *outbytesleftp -= 4;
395404b540aSrobert return 0;
396404b540aSrobert }
397404b540aSrobert }
398404b540aSrobert
399404b540aSrobert static inline int
one_utf16_to_utf8(iconv_t bigend,const uchar ** inbufp,size_t * inbytesleftp,uchar ** outbufp,size_t * outbytesleftp)400404b540aSrobert one_utf16_to_utf8 (iconv_t bigend, const uchar **inbufp, size_t *inbytesleftp,
401404b540aSrobert uchar **outbufp, size_t *outbytesleftp)
402404b540aSrobert {
403404b540aSrobert cppchar_t s;
404404b540aSrobert const uchar *inbuf = *inbufp;
405404b540aSrobert int rval;
406404b540aSrobert
407404b540aSrobert if (*inbytesleftp < 2)
408404b540aSrobert return EINVAL;
409404b540aSrobert s = inbuf[bigend ? 0 : 1] << 8;
410404b540aSrobert s += inbuf[bigend ? 1 : 0];
411404b540aSrobert
412404b540aSrobert /* Low surrogate without immediately preceding high surrogate is invalid. */
413404b540aSrobert if (s >= 0xDC00 && s <= 0xDFFF)
414404b540aSrobert return EILSEQ;
415404b540aSrobert /* High surrogate must have a following low surrogate. */
416404b540aSrobert else if (s >= 0xD800 && s <= 0xDBFF)
417404b540aSrobert {
418404b540aSrobert cppchar_t hi = s, lo;
419404b540aSrobert if (*inbytesleftp < 4)
420404b540aSrobert return EINVAL;
421404b540aSrobert
422404b540aSrobert lo = inbuf[bigend ? 2 : 3] << 8;
423404b540aSrobert lo += inbuf[bigend ? 3 : 2];
424404b540aSrobert
425404b540aSrobert if (lo < 0xDC00 || lo > 0xDFFF)
426404b540aSrobert return EILSEQ;
427404b540aSrobert
428404b540aSrobert s = (hi - 0xD800) * 0x400 + (lo - 0xDC00) + 0x10000;
429404b540aSrobert }
430404b540aSrobert
431404b540aSrobert rval = one_cppchar_to_utf8 (s, outbufp, outbytesleftp);
432404b540aSrobert if (rval)
433404b540aSrobert return rval;
434404b540aSrobert
435404b540aSrobert /* Success - update the input pointers (one_cppchar_to_utf8 has done
436404b540aSrobert the output pointers for us). */
437404b540aSrobert if (s <= 0xFFFF)
438404b540aSrobert {
439404b540aSrobert *inbufp += 2;
440404b540aSrobert *inbytesleftp -= 2;
441404b540aSrobert }
442404b540aSrobert else
443404b540aSrobert {
444404b540aSrobert *inbufp += 4;
445404b540aSrobert *inbytesleftp -= 4;
446404b540aSrobert }
447404b540aSrobert return 0;
448404b540aSrobert }
449404b540aSrobert
450404b540aSrobert /* Helper routine for the next few functions. The 'const' on
451404b540aSrobert one_conversion means that we promise not to modify what function is
452404b540aSrobert pointed to, which lets the inliner see through it. */
453404b540aSrobert
454404b540aSrobert static inline bool
conversion_loop(int (* const one_conversion)(iconv_t,const uchar **,size_t *,uchar **,size_t *),iconv_t cd,const uchar * from,size_t flen,struct _cpp_strbuf * to)455404b540aSrobert conversion_loop (int (*const one_conversion)(iconv_t, const uchar **, size_t *,
456404b540aSrobert uchar **, size_t *),
457404b540aSrobert iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to)
458404b540aSrobert {
459404b540aSrobert const uchar *inbuf;
460404b540aSrobert uchar *outbuf;
461404b540aSrobert size_t inbytesleft, outbytesleft;
462404b540aSrobert int rval;
463404b540aSrobert
464404b540aSrobert inbuf = from;
465404b540aSrobert inbytesleft = flen;
466404b540aSrobert outbuf = to->text + to->len;
467404b540aSrobert outbytesleft = to->asize - to->len;
468404b540aSrobert
469404b540aSrobert for (;;)
470404b540aSrobert {
471404b540aSrobert do
472404b540aSrobert rval = one_conversion (cd, &inbuf, &inbytesleft,
473404b540aSrobert &outbuf, &outbytesleft);
474404b540aSrobert while (inbytesleft && !rval);
475404b540aSrobert
476404b540aSrobert if (__builtin_expect (inbytesleft == 0, 1))
477404b540aSrobert {
478404b540aSrobert to->len = to->asize - outbytesleft;
479404b540aSrobert return true;
480404b540aSrobert }
481404b540aSrobert if (rval != E2BIG)
482404b540aSrobert {
483404b540aSrobert errno = rval;
484404b540aSrobert return false;
485404b540aSrobert }
486404b540aSrobert
487404b540aSrobert outbytesleft += OUTBUF_BLOCK_SIZE;
488404b540aSrobert to->asize += OUTBUF_BLOCK_SIZE;
489404b540aSrobert to->text = XRESIZEVEC (uchar, to->text, to->asize);
490404b540aSrobert outbuf = to->text + to->asize - outbytesleft;
491404b540aSrobert }
492404b540aSrobert }
493404b540aSrobert
494404b540aSrobert
495404b540aSrobert /* These functions convert entire strings between character sets.
496404b540aSrobert They all have the signature
497404b540aSrobert
498404b540aSrobert bool (*)(iconv_t cd, const uchar *from, size_t flen, struct _cpp_strbuf *to);
499404b540aSrobert
500404b540aSrobert The input string FROM is converted as specified by the function
501404b540aSrobert name plus the iconv descriptor CD (which may be fake), and the
502404b540aSrobert result appended to TO. On any error, false is returned, otherwise true. */
503404b540aSrobert
504404b540aSrobert /* These four use the custom conversion code above. */
505404b540aSrobert static bool
convert_utf8_utf16(iconv_t cd,const uchar * from,size_t flen,struct _cpp_strbuf * to)506404b540aSrobert convert_utf8_utf16 (iconv_t cd, const uchar *from, size_t flen,
507404b540aSrobert struct _cpp_strbuf *to)
508404b540aSrobert {
509404b540aSrobert return conversion_loop (one_utf8_to_utf16, cd, from, flen, to);
510404b540aSrobert }
511404b540aSrobert
512404b540aSrobert static bool
convert_utf8_utf32(iconv_t cd,const uchar * from,size_t flen,struct _cpp_strbuf * to)513404b540aSrobert convert_utf8_utf32 (iconv_t cd, const uchar *from, size_t flen,
514404b540aSrobert struct _cpp_strbuf *to)
515404b540aSrobert {
516404b540aSrobert return conversion_loop (one_utf8_to_utf32, cd, from, flen, to);
517404b540aSrobert }
518404b540aSrobert
519404b540aSrobert static bool
convert_utf16_utf8(iconv_t cd,const uchar * from,size_t flen,struct _cpp_strbuf * to)520404b540aSrobert convert_utf16_utf8 (iconv_t cd, const uchar *from, size_t flen,
521404b540aSrobert struct _cpp_strbuf *to)
522404b540aSrobert {
523404b540aSrobert return conversion_loop (one_utf16_to_utf8, cd, from, flen, to);
524404b540aSrobert }
525404b540aSrobert
526404b540aSrobert static bool
convert_utf32_utf8(iconv_t cd,const uchar * from,size_t flen,struct _cpp_strbuf * to)527404b540aSrobert convert_utf32_utf8 (iconv_t cd, const uchar *from, size_t flen,
528404b540aSrobert struct _cpp_strbuf *to)
529404b540aSrobert {
530404b540aSrobert return conversion_loop (one_utf32_to_utf8, cd, from, flen, to);
531404b540aSrobert }
532404b540aSrobert
533404b540aSrobert /* Identity conversion, used when we have no alternative. */
534404b540aSrobert static bool
convert_no_conversion(iconv_t cd ATTRIBUTE_UNUSED,const uchar * from,size_t flen,struct _cpp_strbuf * to)535404b540aSrobert convert_no_conversion (iconv_t cd ATTRIBUTE_UNUSED,
536404b540aSrobert const uchar *from, size_t flen, struct _cpp_strbuf *to)
537404b540aSrobert {
538404b540aSrobert if (to->len + flen > to->asize)
539404b540aSrobert {
540404b540aSrobert to->asize = to->len + flen;
541404b540aSrobert to->text = XRESIZEVEC (uchar, to->text, to->asize);
542404b540aSrobert }
543404b540aSrobert memcpy (to->text + to->len, from, flen);
544404b540aSrobert to->len += flen;
545404b540aSrobert return true;
546404b540aSrobert }
547404b540aSrobert
548404b540aSrobert /* And this one uses the system iconv primitive. It's a little
549404b540aSrobert different, since iconv's interface is a little different. */
550404b540aSrobert #if HAVE_ICONV
551404b540aSrobert static bool
convert_using_iconv(iconv_t cd,const uchar * from,size_t flen,struct _cpp_strbuf * to)552404b540aSrobert convert_using_iconv (iconv_t cd, const uchar *from, size_t flen,
553404b540aSrobert struct _cpp_strbuf *to)
554404b540aSrobert {
555404b540aSrobert ICONV_CONST char *inbuf;
556404b540aSrobert char *outbuf;
557404b540aSrobert size_t inbytesleft, outbytesleft;
558404b540aSrobert
559404b540aSrobert /* Reset conversion descriptor and check that it is valid. */
560404b540aSrobert if (iconv (cd, 0, 0, 0, 0) == (size_t)-1)
561404b540aSrobert return false;
562404b540aSrobert
563404b540aSrobert inbuf = (ICONV_CONST char *)from;
564404b540aSrobert inbytesleft = flen;
565404b540aSrobert outbuf = (char *)to->text + to->len;
566404b540aSrobert outbytesleft = to->asize - to->len;
567404b540aSrobert
568404b540aSrobert for (;;)
569404b540aSrobert {
570404b540aSrobert iconv (cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
571404b540aSrobert if (__builtin_expect (inbytesleft == 0, 1))
572404b540aSrobert {
573404b540aSrobert to->len = to->asize - outbytesleft;
574404b540aSrobert return true;
575404b540aSrobert }
576404b540aSrobert if (errno != E2BIG)
577404b540aSrobert return false;
578404b540aSrobert
579404b540aSrobert outbytesleft += OUTBUF_BLOCK_SIZE;
580404b540aSrobert to->asize += OUTBUF_BLOCK_SIZE;
581404b540aSrobert to->text = XRESIZEVEC (uchar, to->text, to->asize);
582404b540aSrobert outbuf = (char *)to->text + to->asize - outbytesleft;
583404b540aSrobert }
584404b540aSrobert }
585404b540aSrobert #else
586404b540aSrobert #define convert_using_iconv 0 /* prevent undefined symbol error below */
587404b540aSrobert #endif
588404b540aSrobert
589404b540aSrobert /* Arrange for the above custom conversion logic to be used automatically
590404b540aSrobert when conversion between a suitable pair of character sets is requested. */
591404b540aSrobert
/* Run CONVERTER's conversion function on the FLEN bytes at FROM,
   appending to TO; the value is whatever that function returns.
   Every argument is parenthesized so that arbitrary expressions (for
   example *ptr) may be passed; note that CONVERTER is evaluated
   twice.  */
#define APPLY_CONVERSION(CONVERTER, FROM, FLEN, TO) \
   ((CONVERTER).func ((CONVERTER).cd, (FROM), (FLEN), (TO)))
594404b540aSrobert
/* One entry of the table of built-in converters.  */
struct conversion
{
  /* Lookup key of the form "FROM/TO"; init_iconv_desc compares it
     case-insensitively against the requested charset names.  */
  const char *pair;
  /* Conversion function to use for this pair.  */
  convert_f func;
  /* Value handed to FUNC in place of a real iconv descriptor; the
     built-in converters read it as the big-endian flag (0 or 1).  */
  iconv_t fake_cd;
};
/* Charset pairs handled by the custom converters in this file rather
   than by iconv.  The fake descriptor is 0 for the little-endian
   variant and 1 for the big-endian variant of the non-UTF-8 side.  */
static const struct conversion conversion_tab[] = {
  { "UTF-8/UTF-32LE", convert_utf8_utf32, (iconv_t)0 },
  { "UTF-8/UTF-32BE", convert_utf8_utf32, (iconv_t)1 },
  { "UTF-8/UTF-16LE", convert_utf8_utf16, (iconv_t)0 },
  { "UTF-8/UTF-16BE", convert_utf8_utf16, (iconv_t)1 },
  { "UTF-32LE/UTF-8", convert_utf32_utf8, (iconv_t)0 },
  { "UTF-32BE/UTF-8", convert_utf32_utf8, (iconv_t)1 },
  { "UTF-16LE/UTF-8", convert_utf16_utf8, (iconv_t)0 },
  { "UTF-16BE/UTF-8", convert_utf16_utf8, (iconv_t)1 },
};
611404b540aSrobert
612404b540aSrobert /* Subroutine of cpp_init_iconv: initialize and return a
613404b540aSrobert cset_converter structure for conversion from FROM to TO. If
614404b540aSrobert iconv_open() fails, issue an error and return an identity
615404b540aSrobert converter. Silently return an identity converter if FROM and TO
616404b540aSrobert are identical. */
617404b540aSrobert static struct cset_converter
init_iconv_desc(cpp_reader * pfile,const char * to,const char * from)618404b540aSrobert init_iconv_desc (cpp_reader *pfile, const char *to, const char *from)
619404b540aSrobert {
620404b540aSrobert struct cset_converter ret;
621404b540aSrobert char *pair;
622404b540aSrobert size_t i;
623404b540aSrobert
624404b540aSrobert if (!strcasecmp (to, from))
625404b540aSrobert {
626404b540aSrobert ret.func = convert_no_conversion;
627404b540aSrobert ret.cd = (iconv_t) -1;
628404b540aSrobert return ret;
629404b540aSrobert }
630404b540aSrobert
631404b540aSrobert pair = (char *) alloca(strlen(to) + strlen(from) + 2);
632404b540aSrobert
633404b540aSrobert strcpy(pair, from);
634404b540aSrobert strcat(pair, "/");
635404b540aSrobert strcat(pair, to);
636404b540aSrobert for (i = 0; i < ARRAY_SIZE (conversion_tab); i++)
637404b540aSrobert if (!strcasecmp (pair, conversion_tab[i].pair))
638404b540aSrobert {
639404b540aSrobert ret.func = conversion_tab[i].func;
640404b540aSrobert ret.cd = conversion_tab[i].fake_cd;
641404b540aSrobert return ret;
642404b540aSrobert }
643404b540aSrobert
644404b540aSrobert /* No custom converter - try iconv. */
645404b540aSrobert if (HAVE_ICONV)
646404b540aSrobert {
647404b540aSrobert ret.func = convert_using_iconv;
648404b540aSrobert ret.cd = iconv_open (to, from);
649404b540aSrobert
650404b540aSrobert if (ret.cd == (iconv_t) -1)
651404b540aSrobert {
652404b540aSrobert if (errno == EINVAL)
653404b540aSrobert cpp_error (pfile, CPP_DL_ERROR, /* FIXME should be DL_SORRY */
654404b540aSrobert "conversion from %s to %s not supported by iconv",
655404b540aSrobert from, to);
656404b540aSrobert else
657404b540aSrobert cpp_errno (pfile, CPP_DL_ERROR, "iconv_open");
658404b540aSrobert
659404b540aSrobert ret.func = convert_no_conversion;
660404b540aSrobert }
661404b540aSrobert }
662404b540aSrobert else
663404b540aSrobert {
664404b540aSrobert cpp_error (pfile, CPP_DL_ERROR, /* FIXME: should be DL_SORRY */
665404b540aSrobert "no iconv implementation, cannot convert from %s to %s",
666404b540aSrobert from, to);
667404b540aSrobert ret.func = convert_no_conversion;
668404b540aSrobert ret.cd = (iconv_t) -1;
669404b540aSrobert }
670404b540aSrobert return ret;
671404b540aSrobert }
672404b540aSrobert
673404b540aSrobert /* If charset conversion is requested, initialize iconv(3) descriptors
674404b540aSrobert for conversion from the source character set to the execution
675404b540aSrobert character sets. If iconv is not present in the C library, and
676404b540aSrobert conversion is requested, issue an error. */
677404b540aSrobert
678404b540aSrobert void
cpp_init_iconv(cpp_reader * pfile)679404b540aSrobert cpp_init_iconv (cpp_reader *pfile)
680404b540aSrobert {
681404b540aSrobert const char *ncset = CPP_OPTION (pfile, narrow_charset);
682404b540aSrobert const char *wcset = CPP_OPTION (pfile, wide_charset);
683404b540aSrobert const char *default_wcset;
684404b540aSrobert
685404b540aSrobert bool be = CPP_OPTION (pfile, bytes_big_endian);
686404b540aSrobert
687404b540aSrobert if (CPP_OPTION (pfile, wchar_precision) >= 32)
688404b540aSrobert default_wcset = be ? "UTF-32BE" : "UTF-32LE";
689404b540aSrobert else if (CPP_OPTION (pfile, wchar_precision) >= 16)
690404b540aSrobert default_wcset = be ? "UTF-16BE" : "UTF-16LE";
691404b540aSrobert else
692404b540aSrobert /* This effectively means that wide strings are not supported,
693404b540aSrobert so don't do any conversion at all. */
694404b540aSrobert default_wcset = SOURCE_CHARSET;
695404b540aSrobert
696404b540aSrobert if (!ncset)
697404b540aSrobert ncset = SOURCE_CHARSET;
698404b540aSrobert if (!wcset)
699404b540aSrobert wcset = default_wcset;
700404b540aSrobert
701404b540aSrobert pfile->narrow_cset_desc = init_iconv_desc (pfile, ncset, SOURCE_CHARSET);
702404b540aSrobert pfile->wide_cset_desc = init_iconv_desc (pfile, wcset, SOURCE_CHARSET);
703404b540aSrobert }
704404b540aSrobert
705404b540aSrobert /* Destroy iconv(3) descriptors set up by cpp_init_iconv, if necessary. */
706404b540aSrobert void
_cpp_destroy_iconv(cpp_reader * pfile)707404b540aSrobert _cpp_destroy_iconv (cpp_reader *pfile)
708404b540aSrobert {
709404b540aSrobert if (HAVE_ICONV)
710404b540aSrobert {
711404b540aSrobert if (pfile->narrow_cset_desc.func == convert_using_iconv)
712404b540aSrobert iconv_close (pfile->narrow_cset_desc.cd);
713404b540aSrobert if (pfile->wide_cset_desc.func == convert_using_iconv)
714404b540aSrobert iconv_close (pfile->wide_cset_desc.cd);
715404b540aSrobert }
716404b540aSrobert }
717404b540aSrobert
718404b540aSrobert /* Utility routine for use by a full compiler. C is a character taken
719404b540aSrobert from the *basic* source character set, encoded in the host's
720404b540aSrobert execution encoding. Convert it to (the target's) execution
721404b540aSrobert encoding, and return that value.
722404b540aSrobert
723404b540aSrobert Issues an internal error if C's representation in the narrow
724404b540aSrobert execution character set fails to be a single-byte value (C99
725404b540aSrobert 5.2.1p3: "The representation of each member of the source and
726404b540aSrobert execution character sets shall fit in a byte.") May also issue an
727404b540aSrobert internal error if C fails to be a member of the basic source
728404b540aSrobert character set (testing this exactly is too hard, especially when
729404b540aSrobert the host character set is EBCDIC). */
730404b540aSrobert cppchar_t
cpp_host_to_exec_charset(cpp_reader * pfile,cppchar_t c)731404b540aSrobert cpp_host_to_exec_charset (cpp_reader *pfile, cppchar_t c)
732404b540aSrobert {
733404b540aSrobert uchar sbuf[1];
734404b540aSrobert struct _cpp_strbuf tbuf;
735404b540aSrobert
736404b540aSrobert /* This test is merely an approximation, but it suffices to catch
737404b540aSrobert the most important thing, which is that we don't get handed a
738404b540aSrobert character outside the unibyte range of the host character set. */
739404b540aSrobert if (c > LAST_POSSIBLY_BASIC_SOURCE_CHAR)
740404b540aSrobert {
741404b540aSrobert cpp_error (pfile, CPP_DL_ICE,
742404b540aSrobert "character 0x%lx is not in the basic source character set\n",
743404b540aSrobert (unsigned long)c);
744404b540aSrobert return 0;
745404b540aSrobert }
746404b540aSrobert
747404b540aSrobert /* Being a character in the unibyte range of the host character set,
748404b540aSrobert we can safely splat it into a one-byte buffer and trust that that
749404b540aSrobert is a well-formed string. */
750404b540aSrobert sbuf[0] = c;
751404b540aSrobert
752404b540aSrobert /* This should never need to reallocate, but just in case... */
753404b540aSrobert tbuf.asize = 1;
754404b540aSrobert tbuf.text = XNEWVEC (uchar, tbuf.asize);
755404b540aSrobert tbuf.len = 0;
756404b540aSrobert
757404b540aSrobert if (!APPLY_CONVERSION (pfile->narrow_cset_desc, sbuf, 1, &tbuf))
758404b540aSrobert {
759404b540aSrobert cpp_errno (pfile, CPP_DL_ICE, "converting to execution character set");
760404b540aSrobert return 0;
761404b540aSrobert }
762404b540aSrobert if (tbuf.len != 1)
763404b540aSrobert {
764404b540aSrobert cpp_error (pfile, CPP_DL_ICE,
765404b540aSrobert "character 0x%lx is not unibyte in execution character set",
766404b540aSrobert (unsigned long)c);
767404b540aSrobert return 0;
768404b540aSrobert }
769404b540aSrobert c = tbuf.text[0];
770404b540aSrobert free(tbuf.text);
771404b540aSrobert return c;
772404b540aSrobert }
773404b540aSrobert
774404b540aSrobert
775404b540aSrobert
776404b540aSrobert /* Utility routine that computes a mask of the form 0000...111... with
777404b540aSrobert WIDTH 1-bits. */
778404b540aSrobert static inline size_t
width_to_mask(size_t width)779404b540aSrobert width_to_mask (size_t width)
780404b540aSrobert {
781404b540aSrobert width = MIN (width, BITS_PER_CPPCHAR_T);
782404b540aSrobert if (width >= CHAR_BIT * sizeof (size_t))
783404b540aSrobert return ~(size_t) 0;
784404b540aSrobert else
785404b540aSrobert return ((size_t) 1 << width) - 1;
786404b540aSrobert }
787404b540aSrobert
788404b540aSrobert /* A large table of unicode character information. */
789404b540aSrobert enum {
790404b540aSrobert /* Valid in a C99 identifier? */
791404b540aSrobert C99 = 1,
792404b540aSrobert /* Valid in a C99 identifier, but not as the first character? */
793404b540aSrobert DIG = 2,
794404b540aSrobert /* Valid in a C++ identifier? */
795404b540aSrobert CXX = 4,
796404b540aSrobert /* NFC representation is not valid in an identifier? */
797404b540aSrobert CID = 8,
798404b540aSrobert /* Might be valid NFC form? */
799404b540aSrobert NFC = 16,
800404b540aSrobert /* Might be valid NFKC form? */
801404b540aSrobert NKC = 32,
802404b540aSrobert /* Certain preceding characters might make it not valid NFC/NKFC form? */
803404b540aSrobert CTX = 64
804404b540aSrobert };
805404b540aSrobert
806404b540aSrobert static const struct {
807404b540aSrobert /* Bitmap of flags above. */
808404b540aSrobert unsigned char flags;
809404b540aSrobert /* Combining class of the character. */
810404b540aSrobert unsigned char combine;
811404b540aSrobert /* Last character in the range described by this entry. */
812404b540aSrobert unsigned short end;
813404b540aSrobert } ucnranges[] = {
814404b540aSrobert #include "ucnid.h"
815404b540aSrobert };
816404b540aSrobert
817404b540aSrobert /* Returns 1 if C is valid in an identifier, 2 if C is valid except at
818404b540aSrobert the start of an identifier, and 0 if C is not valid in an
819404b540aSrobert identifier. We assume C has already gone through the checks of
820404b540aSrobert _cpp_valid_ucn. Also update NST for C if returning nonzero. The
821404b540aSrobert algorithm is a simple binary search on the table defined in
822404b540aSrobert ucnid.h. */
823404b540aSrobert
824404b540aSrobert static int
ucn_valid_in_identifier(cpp_reader * pfile,cppchar_t c,struct normalize_state * nst)825404b540aSrobert ucn_valid_in_identifier (cpp_reader *pfile, cppchar_t c,
826404b540aSrobert struct normalize_state *nst)
827404b540aSrobert {
828404b540aSrobert int mn, mx, md;
829404b540aSrobert
830404b540aSrobert if (c > 0xFFFF)
831404b540aSrobert return 0;
832404b540aSrobert
833404b540aSrobert mn = 0;
834404b540aSrobert mx = ARRAY_SIZE (ucnranges) - 1;
835404b540aSrobert while (mx != mn)
836404b540aSrobert {
837404b540aSrobert md = (mn + mx) / 2;
838404b540aSrobert if (c <= ucnranges[md].end)
839404b540aSrobert mx = md;
840404b540aSrobert else
841404b540aSrobert mn = md + 1;
842404b540aSrobert }
843404b540aSrobert
844404b540aSrobert /* When -pedantic, we require the character to have been listed by
845404b540aSrobert the standard for the current language. Otherwise, we accept the
846404b540aSrobert union of the acceptable sets for C++98 and C99. */
847404b540aSrobert if (! (ucnranges[mn].flags & (C99 | CXX)))
848404b540aSrobert return 0;
849404b540aSrobert
850404b540aSrobert if (CPP_PEDANTIC (pfile)
851404b540aSrobert && ((CPP_OPTION (pfile, c99) && !(ucnranges[mn].flags & C99))
852404b540aSrobert || (CPP_OPTION (pfile, cplusplus)
853404b540aSrobert && !(ucnranges[mn].flags & CXX))))
854404b540aSrobert return 0;
855404b540aSrobert
856404b540aSrobert /* Update NST. */
857404b540aSrobert if (ucnranges[mn].combine != 0 && ucnranges[mn].combine < nst->prev_class)
858404b540aSrobert nst->level = normalized_none;
859404b540aSrobert else if (ucnranges[mn].flags & CTX)
860404b540aSrobert {
861404b540aSrobert bool safe;
862404b540aSrobert cppchar_t p = nst->previous;
863404b540aSrobert
864404b540aSrobert /* Easy cases from Bengali, Oriya, Tamil, Jannada, and Malayalam. */
865404b540aSrobert if (c == 0x09BE)
866404b540aSrobert safe = p != 0x09C7; /* Use 09CB instead of 09C7 09BE. */
867404b540aSrobert else if (c == 0x0B3E)
868404b540aSrobert safe = p != 0x0B47; /* Use 0B4B instead of 0B47 0B3E. */
869404b540aSrobert else if (c == 0x0BBE)
870404b540aSrobert safe = p != 0x0BC6 && p != 0x0BC7; /* Use 0BCA/0BCB instead. */
871404b540aSrobert else if (c == 0x0CC2)
872404b540aSrobert safe = p != 0x0CC6; /* Use 0CCA instead of 0CC6 0CC2. */
873404b540aSrobert else if (c == 0x0D3E)
874404b540aSrobert safe = p != 0x0D46 && p != 0x0D47; /* Use 0D4A/0D4B instead. */
875404b540aSrobert /* For Hangul, characters in the range AC00-D7A3 are NFC/NFKC,
876404b540aSrobert and are combined algorithmically from a sequence of the form
877404b540aSrobert 1100-1112 1161-1175 11A8-11C2
878404b540aSrobert (if the third is not present, it is treated as 11A7, which is not
879404b540aSrobert really a valid character).
880404b540aSrobert Unfortunately, C99 allows (only) the NFC form, but C++ allows
881404b540aSrobert only the combining characters. */
882404b540aSrobert else if (c >= 0x1161 && c <= 0x1175)
883404b540aSrobert safe = p < 0x1100 || p > 0x1112;
884404b540aSrobert else if (c >= 0x11A8 && c <= 0x11C2)
885404b540aSrobert safe = (p < 0xAC00 || p > 0xD7A3 || (p - 0xAC00) % 28 != 0);
886404b540aSrobert else
887404b540aSrobert {
888404b540aSrobert /* Uh-oh, someone updated ucnid.h without updating this code. */
889404b540aSrobert cpp_error (pfile, CPP_DL_ICE, "Character %x might not be NFKC", c);
890404b540aSrobert safe = true;
891404b540aSrobert }
892404b540aSrobert if (!safe && c < 0x1161)
893404b540aSrobert nst->level = normalized_none;
894404b540aSrobert else if (!safe)
895404b540aSrobert nst->level = MAX (nst->level, normalized_identifier_C);
896404b540aSrobert }
897404b540aSrobert else if (ucnranges[mn].flags & NKC)
898404b540aSrobert ;
899404b540aSrobert else if (ucnranges[mn].flags & NFC)
900404b540aSrobert nst->level = MAX (nst->level, normalized_C);
901404b540aSrobert else if (ucnranges[mn].flags & CID)
902404b540aSrobert nst->level = MAX (nst->level, normalized_identifier_C);
903404b540aSrobert else
904404b540aSrobert nst->level = normalized_none;
905404b540aSrobert nst->previous = c;
906404b540aSrobert nst->prev_class = ucnranges[mn].combine;
907404b540aSrobert
908404b540aSrobert /* In C99, UCN digits may not begin identifiers. */
909404b540aSrobert if (CPP_OPTION (pfile, c99) && (ucnranges[mn].flags & DIG))
910404b540aSrobert return 2;
911404b540aSrobert
912404b540aSrobert return 1;
913404b540aSrobert }
914404b540aSrobert
915404b540aSrobert /* [lex.charset]: The character designated by the universal character
916404b540aSrobert name \UNNNNNNNN is that character whose character short name in
917404b540aSrobert ISO/IEC 10646 is NNNNNNNN; the character designated by the
918404b540aSrobert universal character name \uNNNN is that character whose character
919404b540aSrobert short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value
920404b540aSrobert for a universal character name is less than 0x20 or in the range
921404b540aSrobert 0x7F-0x9F (inclusive), or if the universal character name
922404b540aSrobert designates a character in the basic source character set, then the
923404b540aSrobert program is ill-formed.
924404b540aSrobert
925404b540aSrobert *PSTR must be preceded by "\u" or "\U"; it is assumed that the
926404b540aSrobert buffer end is delimited by a non-hex digit. Returns zero if the
927404b540aSrobert UCN has not been consumed.
928404b540aSrobert
929404b540aSrobert Otherwise the nonzero value of the UCN, whether valid or invalid,
930404b540aSrobert is returned. Diagnostics are emitted for invalid values. PSTR
931404b540aSrobert is updated to point one beyond the UCN, or to the syntactically
932404b540aSrobert invalid character.
933404b540aSrobert
934404b540aSrobert IDENTIFIER_POS is 0 when not in an identifier, 1 for the start of
935404b540aSrobert an identifier, or 2 otherwise. */
936404b540aSrobert
937404b540aSrobert cppchar_t
_cpp_valid_ucn(cpp_reader * pfile,const uchar ** pstr,const uchar * limit,int identifier_pos,struct normalize_state * nst)938404b540aSrobert _cpp_valid_ucn (cpp_reader *pfile, const uchar **pstr,
939404b540aSrobert const uchar *limit, int identifier_pos,
940404b540aSrobert struct normalize_state *nst)
941404b540aSrobert {
942404b540aSrobert cppchar_t result, c;
943404b540aSrobert unsigned int length;
944404b540aSrobert const uchar *str = *pstr;
945404b540aSrobert const uchar *base = str - 2;
946404b540aSrobert
947404b540aSrobert if (!CPP_OPTION (pfile, cplusplus) && !CPP_OPTION (pfile, c99))
948404b540aSrobert cpp_error (pfile, CPP_DL_WARNING,
949404b540aSrobert "universal character names are only valid in C++ and C99");
950404b540aSrobert else if (CPP_WTRADITIONAL (pfile) && identifier_pos == 0)
951404b540aSrobert cpp_error (pfile, CPP_DL_WARNING,
952404b540aSrobert "the meaning of '\\%c' is different in traditional C",
953404b540aSrobert (int) str[-1]);
954404b540aSrobert
955404b540aSrobert if (str[-1] == 'u')
956404b540aSrobert length = 4;
957404b540aSrobert else if (str[-1] == 'U')
958404b540aSrobert length = 8;
959404b540aSrobert else
960404b540aSrobert {
961404b540aSrobert cpp_error (pfile, CPP_DL_ICE, "In _cpp_valid_ucn but not a UCN");
962404b540aSrobert length = 4;
963404b540aSrobert }
964404b540aSrobert
965404b540aSrobert result = 0;
966404b540aSrobert do
967404b540aSrobert {
968404b540aSrobert c = *str;
969404b540aSrobert if (!ISXDIGIT (c))
970404b540aSrobert break;
971404b540aSrobert str++;
972404b540aSrobert result = (result << 4) + hex_value (c);
973404b540aSrobert }
974404b540aSrobert while (--length && str < limit);
975404b540aSrobert
976404b540aSrobert /* Partial UCNs are not valid in strings, but decompose into
977404b540aSrobert multiple tokens in identifiers, so we can't give a helpful
978404b540aSrobert error message in that case. */
979404b540aSrobert if (length && identifier_pos)
980404b540aSrobert return 0;
981404b540aSrobert
982404b540aSrobert *pstr = str;
983404b540aSrobert if (length)
984404b540aSrobert {
985404b540aSrobert cpp_error (pfile, CPP_DL_ERROR,
986404b540aSrobert "incomplete universal character name %.*s",
987404b540aSrobert (int) (str - base), base);
988404b540aSrobert result = 1;
989404b540aSrobert }
990404b540aSrobert /* The standard permits $, @ and ` to be specified as UCNs. We use
991404b540aSrobert hex escapes so that this also works with EBCDIC hosts. */
992404b540aSrobert else if ((result < 0xa0
993404b540aSrobert && (result != 0x24 && result != 0x40 && result != 0x60))
994404b540aSrobert || (result & 0x80000000)
995404b540aSrobert || (result >= 0xD800 && result <= 0xDFFF))
996404b540aSrobert {
997404b540aSrobert cpp_error (pfile, CPP_DL_ERROR,
998404b540aSrobert "%.*s is not a valid universal character",
999404b540aSrobert (int) (str - base), base);
1000404b540aSrobert result = 1;
1001404b540aSrobert }
1002404b540aSrobert else if (identifier_pos && result == 0x24
1003404b540aSrobert && CPP_OPTION (pfile, dollars_in_ident))
1004404b540aSrobert {
1005404b540aSrobert if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1006404b540aSrobert {
1007404b540aSrobert CPP_OPTION (pfile, warn_dollars) = 0;
1008404b540aSrobert cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1009404b540aSrobert }
1010404b540aSrobert NORMALIZE_STATE_UPDATE_IDNUM (nst);
1011404b540aSrobert }
1012404b540aSrobert else if (identifier_pos)
1013404b540aSrobert {
1014404b540aSrobert int validity = ucn_valid_in_identifier (pfile, result, nst);
1015404b540aSrobert
1016404b540aSrobert if (validity == 0)
1017404b540aSrobert cpp_error (pfile, CPP_DL_ERROR,
1018404b540aSrobert "universal character %.*s is not valid in an identifier",
1019404b540aSrobert (int) (str - base), base);
1020404b540aSrobert else if (validity == 2 && identifier_pos == 1)
1021404b540aSrobert cpp_error (pfile, CPP_DL_ERROR,
1022404b540aSrobert "universal character %.*s is not valid at the start of an identifier",
1023404b540aSrobert (int) (str - base), base);
1024404b540aSrobert }
1025404b540aSrobert
1026404b540aSrobert if (result == 0)
1027404b540aSrobert result = 1;
1028404b540aSrobert
1029404b540aSrobert return result;
1030404b540aSrobert }
1031404b540aSrobert
1032404b540aSrobert /* Convert an UCN, pointed to by FROM, to UTF-8 encoding, then translate
1033404b540aSrobert it to the execution character set and write the result into TBUF.
1034404b540aSrobert An advanced pointer is returned. Issues all relevant diagnostics. */
1035404b540aSrobert static const uchar *
convert_ucn(cpp_reader * pfile,const uchar * from,const uchar * limit,struct _cpp_strbuf * tbuf,bool wide)1036404b540aSrobert convert_ucn (cpp_reader *pfile, const uchar *from, const uchar *limit,
1037404b540aSrobert struct _cpp_strbuf *tbuf, bool wide)
1038404b540aSrobert {
1039404b540aSrobert cppchar_t ucn;
1040404b540aSrobert uchar buf[6];
1041404b540aSrobert uchar *bufp = buf;
1042404b540aSrobert size_t bytesleft = 6;
1043404b540aSrobert int rval;
1044404b540aSrobert struct cset_converter cvt
1045404b540aSrobert = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1046404b540aSrobert struct normalize_state nst = INITIAL_NORMALIZE_STATE;
1047404b540aSrobert
1048404b540aSrobert from++; /* Skip u/U. */
1049404b540aSrobert ucn = _cpp_valid_ucn (pfile, &from, limit, 0, &nst);
1050404b540aSrobert
1051404b540aSrobert rval = one_cppchar_to_utf8 (ucn, &bufp, &bytesleft);
1052404b540aSrobert if (rval)
1053404b540aSrobert {
1054404b540aSrobert errno = rval;
1055404b540aSrobert cpp_errno (pfile, CPP_DL_ERROR,
1056404b540aSrobert "converting UCN to source character set");
1057404b540aSrobert }
1058404b540aSrobert else if (!APPLY_CONVERSION (cvt, buf, 6 - bytesleft, tbuf))
1059404b540aSrobert cpp_errno (pfile, CPP_DL_ERROR,
1060404b540aSrobert "converting UCN to execution character set");
1061404b540aSrobert
1062404b540aSrobert return from;
1063404b540aSrobert }
1064404b540aSrobert
1065404b540aSrobert /* Subroutine of convert_hex and convert_oct. N is the representation
1066404b540aSrobert in the execution character set of a numeric escape; write it into the
1067404b540aSrobert string buffer TBUF and update the end-of-string pointer therein. WIDE
1068404b540aSrobert is true if it's a wide string that's being assembled in TBUF. This
1069404b540aSrobert function issues no diagnostics and never fails. */
1070404b540aSrobert static void
emit_numeric_escape(cpp_reader * pfile,cppchar_t n,struct _cpp_strbuf * tbuf,bool wide)1071404b540aSrobert emit_numeric_escape (cpp_reader *pfile, cppchar_t n,
1072404b540aSrobert struct _cpp_strbuf *tbuf, bool wide)
1073404b540aSrobert {
1074404b540aSrobert if (wide)
1075404b540aSrobert {
1076404b540aSrobert /* We have to render this into the target byte order, which may not
1077404b540aSrobert be our byte order. */
1078404b540aSrobert bool bigend = CPP_OPTION (pfile, bytes_big_endian);
1079404b540aSrobert size_t width = CPP_OPTION (pfile, wchar_precision);
1080404b540aSrobert size_t cwidth = CPP_OPTION (pfile, char_precision);
1081404b540aSrobert size_t cmask = width_to_mask (cwidth);
1082404b540aSrobert size_t nbwc = width / cwidth;
1083404b540aSrobert size_t i;
1084404b540aSrobert size_t off = tbuf->len;
1085404b540aSrobert cppchar_t c;
1086404b540aSrobert
1087404b540aSrobert if (tbuf->len + nbwc > tbuf->asize)
1088404b540aSrobert {
1089404b540aSrobert tbuf->asize += OUTBUF_BLOCK_SIZE;
1090404b540aSrobert tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
1091404b540aSrobert }
1092404b540aSrobert
1093404b540aSrobert for (i = 0; i < nbwc; i++)
1094404b540aSrobert {
1095404b540aSrobert c = n & cmask;
1096404b540aSrobert n >>= cwidth;
1097404b540aSrobert tbuf->text[off + (bigend ? nbwc - i - 1 : i)] = c;
1098404b540aSrobert }
1099404b540aSrobert tbuf->len += nbwc;
1100404b540aSrobert }
1101404b540aSrobert else
1102404b540aSrobert {
1103404b540aSrobert /* Note: this code does not handle the case where the target
1104404b540aSrobert and host have a different number of bits in a byte. */
1105404b540aSrobert if (tbuf->len + 1 > tbuf->asize)
1106404b540aSrobert {
1107404b540aSrobert tbuf->asize += OUTBUF_BLOCK_SIZE;
1108404b540aSrobert tbuf->text = XRESIZEVEC (uchar, tbuf->text, tbuf->asize);
1109404b540aSrobert }
1110404b540aSrobert tbuf->text[tbuf->len++] = n;
1111404b540aSrobert }
1112404b540aSrobert }
1113404b540aSrobert
1114404b540aSrobert /* Convert a hexadecimal escape, pointed to by FROM, to the execution
1115404b540aSrobert character set and write it into the string buffer TBUF. Returns an
1116404b540aSrobert advanced pointer, and issues diagnostics as necessary.
1117404b540aSrobert No character set translation occurs; this routine always produces the
1118404b540aSrobert execution-set character with numeric value equal to the given hex
1119404b540aSrobert number. You can, e.g. generate surrogate pairs this way. */
1120404b540aSrobert static const uchar *
convert_hex(cpp_reader * pfile,const uchar * from,const uchar * limit,struct _cpp_strbuf * tbuf,bool wide)1121404b540aSrobert convert_hex (cpp_reader *pfile, const uchar *from, const uchar *limit,
1122404b540aSrobert struct _cpp_strbuf *tbuf, bool wide)
1123404b540aSrobert {
1124404b540aSrobert cppchar_t c, n = 0, overflow = 0;
1125404b540aSrobert int digits_found = 0;
1126404b540aSrobert size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
1127404b540aSrobert : CPP_OPTION (pfile, char_precision));
1128404b540aSrobert size_t mask = width_to_mask (width);
1129404b540aSrobert
1130404b540aSrobert if (CPP_WTRADITIONAL (pfile))
1131404b540aSrobert cpp_error (pfile, CPP_DL_WARNING,
1132404b540aSrobert "the meaning of '\\x' is different in traditional C");
1133404b540aSrobert
1134404b540aSrobert from++; /* Skip 'x'. */
1135404b540aSrobert while (from < limit)
1136404b540aSrobert {
1137404b540aSrobert c = *from;
1138404b540aSrobert if (! hex_p (c))
1139404b540aSrobert break;
1140404b540aSrobert from++;
1141404b540aSrobert overflow |= n ^ (n << 4 >> 4);
1142404b540aSrobert n = (n << 4) + hex_value (c);
1143404b540aSrobert digits_found = 1;
1144404b540aSrobert }
1145404b540aSrobert
1146404b540aSrobert if (!digits_found)
1147404b540aSrobert {
1148404b540aSrobert cpp_error (pfile, CPP_DL_ERROR,
1149404b540aSrobert "\\x used with no following hex digits");
1150404b540aSrobert return from;
1151404b540aSrobert }
1152404b540aSrobert
1153404b540aSrobert if (overflow | (n != (n & mask)))
1154404b540aSrobert {
1155404b540aSrobert cpp_error (pfile, CPP_DL_PEDWARN,
1156404b540aSrobert "hex escape sequence out of range");
1157404b540aSrobert n &= mask;
1158404b540aSrobert }
1159404b540aSrobert
1160404b540aSrobert emit_numeric_escape (pfile, n, tbuf, wide);
1161404b540aSrobert
1162404b540aSrobert return from;
1163404b540aSrobert }
1164404b540aSrobert
1165404b540aSrobert /* Convert an octal escape, pointed to by FROM, to the execution
1166404b540aSrobert character set and write it into the string buffer TBUF. Returns an
1167404b540aSrobert advanced pointer, and issues diagnostics as necessary.
1168404b540aSrobert No character set translation occurs; this routine always produces the
1169404b540aSrobert execution-set character with numeric value equal to the given octal
1170404b540aSrobert number. */
1171404b540aSrobert static const uchar *
convert_oct(cpp_reader * pfile,const uchar * from,const uchar * limit,struct _cpp_strbuf * tbuf,bool wide)1172404b540aSrobert convert_oct (cpp_reader *pfile, const uchar *from, const uchar *limit,
1173404b540aSrobert struct _cpp_strbuf *tbuf, bool wide)
1174404b540aSrobert {
1175404b540aSrobert size_t count = 0;
1176404b540aSrobert cppchar_t c, n = 0;
1177404b540aSrobert size_t width = (wide ? CPP_OPTION (pfile, wchar_precision)
1178404b540aSrobert : CPP_OPTION (pfile, char_precision));
1179404b540aSrobert size_t mask = width_to_mask (width);
1180404b540aSrobert bool overflow = false;
1181404b540aSrobert
1182404b540aSrobert while (from < limit && count++ < 3)
1183404b540aSrobert {
1184404b540aSrobert c = *from;
1185404b540aSrobert if (c < '0' || c > '7')
1186404b540aSrobert break;
1187404b540aSrobert from++;
1188404b540aSrobert overflow |= n ^ (n << 3 >> 3);
1189404b540aSrobert n = (n << 3) + c - '0';
1190404b540aSrobert }
1191404b540aSrobert
1192404b540aSrobert if (n != (n & mask))
1193404b540aSrobert {
1194404b540aSrobert cpp_error (pfile, CPP_DL_PEDWARN,
1195404b540aSrobert "octal escape sequence out of range");
1196404b540aSrobert n &= mask;
1197404b540aSrobert }
1198404b540aSrobert
1199404b540aSrobert emit_numeric_escape (pfile, n, tbuf, wide);
1200404b540aSrobert
1201404b540aSrobert return from;
1202404b540aSrobert }
1203404b540aSrobert
1204404b540aSrobert /* Convert an escape sequence (pointed to by FROM) to its value on
1205404b540aSrobert the target, and to the execution character set. Do not scan past
1206404b540aSrobert LIMIT. Write the converted value into TBUF. Returns an advanced
1207404b540aSrobert pointer. Handles all relevant diagnostics. */
1208404b540aSrobert static const uchar *
convert_escape(cpp_reader * pfile,const uchar * from,const uchar * limit,struct _cpp_strbuf * tbuf,bool wide)1209404b540aSrobert convert_escape (cpp_reader *pfile, const uchar *from, const uchar *limit,
1210404b540aSrobert struct _cpp_strbuf *tbuf, bool wide)
1211404b540aSrobert {
1212404b540aSrobert /* Values of \a \b \e \f \n \r \t \v respectively. */
1213404b540aSrobert #if HOST_CHARSET == HOST_CHARSET_ASCII
1214404b540aSrobert static const uchar charconsts[] = { 7, 8, 27, 12, 10, 13, 9, 11 };
1215404b540aSrobert #elif HOST_CHARSET == HOST_CHARSET_EBCDIC
1216404b540aSrobert static const uchar charconsts[] = { 47, 22, 39, 12, 21, 13, 5, 11 };
1217404b540aSrobert #else
1218404b540aSrobert #error "unknown host character set"
1219404b540aSrobert #endif
1220404b540aSrobert
1221404b540aSrobert uchar c;
1222404b540aSrobert struct cset_converter cvt
1223404b540aSrobert = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1224404b540aSrobert
1225404b540aSrobert c = *from;
1226404b540aSrobert switch (c)
1227404b540aSrobert {
1228404b540aSrobert /* UCNs, hex escapes, and octal escapes are processed separately. */
1229404b540aSrobert case 'u': case 'U':
1230404b540aSrobert return convert_ucn (pfile, from, limit, tbuf, wide);
1231404b540aSrobert
1232404b540aSrobert case 'x':
1233404b540aSrobert return convert_hex (pfile, from, limit, tbuf, wide);
1234404b540aSrobert break;
1235404b540aSrobert
1236404b540aSrobert case '0': case '1': case '2': case '3':
1237404b540aSrobert case '4': case '5': case '6': case '7':
1238404b540aSrobert return convert_oct (pfile, from, limit, tbuf, wide);
1239404b540aSrobert
1240404b540aSrobert /* Various letter escapes. Get the appropriate host-charset
1241404b540aSrobert value into C. */
1242404b540aSrobert case '\\': case '\'': case '"': case '?': break;
1243404b540aSrobert
1244404b540aSrobert case '(': case '{': case '[': case '%':
1245404b540aSrobert /* '\(', etc, can be used at the beginning of a line in a long
1246404b540aSrobert string split onto multiple lines with \-newline, to prevent
1247404b540aSrobert Emacs or other text editors from getting confused. '\%' can
1248404b540aSrobert be used to prevent SCCS from mangling printf format strings. */
1249404b540aSrobert if (CPP_PEDANTIC (pfile))
1250404b540aSrobert goto unknown;
1251404b540aSrobert break;
1252404b540aSrobert
1253404b540aSrobert case 'b': c = charconsts[1]; break;
1254404b540aSrobert case 'f': c = charconsts[3]; break;
1255404b540aSrobert case 'n': c = charconsts[4]; break;
1256404b540aSrobert case 'r': c = charconsts[5]; break;
1257404b540aSrobert case 't': c = charconsts[6]; break;
1258404b540aSrobert case 'v': c = charconsts[7]; break;
1259404b540aSrobert
1260404b540aSrobert case 'a':
1261404b540aSrobert if (CPP_WTRADITIONAL (pfile))
1262404b540aSrobert cpp_error (pfile, CPP_DL_WARNING,
1263404b540aSrobert "the meaning of '\\a' is different in traditional C");
1264404b540aSrobert c = charconsts[0];
1265404b540aSrobert break;
1266404b540aSrobert
1267404b540aSrobert case 'e': case 'E':
1268404b540aSrobert if (CPP_PEDANTIC (pfile))
1269404b540aSrobert cpp_error (pfile, CPP_DL_PEDWARN,
1270404b540aSrobert "non-ISO-standard escape sequence, '\\%c'", (int) c);
1271404b540aSrobert c = charconsts[2];
1272404b540aSrobert break;
1273404b540aSrobert
1274404b540aSrobert default:
1275404b540aSrobert unknown:
1276404b540aSrobert if (ISGRAPH (c))
1277404b540aSrobert cpp_error (pfile, CPP_DL_PEDWARN,
1278404b540aSrobert "unknown escape sequence '\\%c'", (int) c);
1279404b540aSrobert else
1280404b540aSrobert {
1281404b540aSrobert /* diagnostic.c does not support "%03o". When it does, this
1282404b540aSrobert code can use %03o directly in the diagnostic again. */
1283404b540aSrobert char buf[32];
1284404b540aSrobert sprintf(buf, "%03o", (int) c);
1285404b540aSrobert cpp_error (pfile, CPP_DL_PEDWARN,
1286404b540aSrobert "unknown escape sequence: '\\%s'", buf);
1287404b540aSrobert }
1288404b540aSrobert }
1289404b540aSrobert
1290404b540aSrobert /* Now convert what we have to the execution character set. */
1291404b540aSrobert if (!APPLY_CONVERSION (cvt, &c, 1, tbuf))
1292404b540aSrobert cpp_errno (pfile, CPP_DL_ERROR,
1293404b540aSrobert "converting escape sequence to execution character set");
1294404b540aSrobert
1295404b540aSrobert return from + 1;
1296404b540aSrobert }
1297404b540aSrobert
1298404b540aSrobert /* FROM is an array of cpp_string structures of length COUNT. These
1299404b540aSrobert are to be converted from the source to the execution character set,
1300404b540aSrobert escape sequences translated, and finally all are to be
1301404b540aSrobert concatenated. WIDE indicates whether or not to produce a wide
1302404b540aSrobert string. The result is written into TO. Returns true for success,
1303404b540aSrobert false for failure. */
1304404b540aSrobert bool
cpp_interpret_string(cpp_reader * pfile,const cpp_string * from,size_t count,cpp_string * to,bool wide)1305404b540aSrobert cpp_interpret_string (cpp_reader *pfile, const cpp_string *from, size_t count,
1306404b540aSrobert cpp_string *to, bool wide)
1307404b540aSrobert {
1308404b540aSrobert struct _cpp_strbuf tbuf;
1309404b540aSrobert const uchar *p, *base, *limit;
1310404b540aSrobert size_t i;
1311404b540aSrobert struct cset_converter cvt
1312404b540aSrobert = wide ? pfile->wide_cset_desc : pfile->narrow_cset_desc;
1313404b540aSrobert
1314404b540aSrobert tbuf.asize = MAX (OUTBUF_BLOCK_SIZE, from->len);
1315404b540aSrobert tbuf.text = XNEWVEC (uchar, tbuf.asize);
1316404b540aSrobert tbuf.len = 0;
1317404b540aSrobert
1318404b540aSrobert for (i = 0; i < count; i++)
1319404b540aSrobert {
1320404b540aSrobert p = from[i].text;
1321404b540aSrobert if (*p == 'L') p++;
1322404b540aSrobert p++; /* Skip leading quote. */
1323404b540aSrobert limit = from[i].text + from[i].len - 1; /* Skip trailing quote. */
1324404b540aSrobert
1325404b540aSrobert for (;;)
1326404b540aSrobert {
1327404b540aSrobert base = p;
1328404b540aSrobert while (p < limit && *p != '\\')
1329404b540aSrobert p++;
1330404b540aSrobert if (p > base)
1331404b540aSrobert {
1332404b540aSrobert /* We have a run of normal characters; these can be fed
1333404b540aSrobert directly to convert_cset. */
1334404b540aSrobert if (!APPLY_CONVERSION (cvt, base, p - base, &tbuf))
1335404b540aSrobert goto fail;
1336404b540aSrobert }
1337404b540aSrobert if (p == limit)
1338404b540aSrobert break;
1339404b540aSrobert
1340404b540aSrobert p = convert_escape (pfile, p + 1, limit, &tbuf, wide);
1341404b540aSrobert }
1342404b540aSrobert }
1343404b540aSrobert /* NUL-terminate the 'to' buffer and translate it to a cpp_string
1344404b540aSrobert structure. */
1345404b540aSrobert emit_numeric_escape (pfile, 0, &tbuf, wide);
1346404b540aSrobert tbuf.text = XRESIZEVEC (uchar, tbuf.text, tbuf.len);
1347404b540aSrobert to->text = tbuf.text;
1348404b540aSrobert to->len = tbuf.len;
1349404b540aSrobert return true;
1350404b540aSrobert
1351404b540aSrobert fail:
1352404b540aSrobert cpp_errno (pfile, CPP_DL_ERROR, "converting to execution character set");
1353404b540aSrobert free (tbuf.text);
1354404b540aSrobert return false;
1355404b540aSrobert }
1356404b540aSrobert
1357404b540aSrobert /* Subroutine of do_line and do_linemarker. Convert escape sequences
1358404b540aSrobert in a string, but do not perform character set conversion. */
1359404b540aSrobert bool
cpp_interpret_string_notranslate(cpp_reader * pfile,const cpp_string * from,size_t count,cpp_string * to,bool wide)1360404b540aSrobert cpp_interpret_string_notranslate (cpp_reader *pfile, const cpp_string *from,
1361404b540aSrobert size_t count, cpp_string *to, bool wide)
1362404b540aSrobert {
1363404b540aSrobert struct cset_converter save_narrow_cset_desc = pfile->narrow_cset_desc;
1364404b540aSrobert bool retval;
1365404b540aSrobert
1366404b540aSrobert pfile->narrow_cset_desc.func = convert_no_conversion;
1367404b540aSrobert pfile->narrow_cset_desc.cd = (iconv_t) -1;
1368404b540aSrobert
1369404b540aSrobert retval = cpp_interpret_string (pfile, from, count, to, wide);
1370404b540aSrobert
1371404b540aSrobert pfile->narrow_cset_desc = save_narrow_cset_desc;
1372404b540aSrobert return retval;
1373404b540aSrobert }
1374404b540aSrobert
1375404b540aSrobert
1376404b540aSrobert /* Subroutine of cpp_interpret_charconst which performs the conversion
1377404b540aSrobert to a number, for narrow strings. STR is the string structure returned
1378404b540aSrobert by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
1379404b540aSrobert cpp_interpret_charconst. */
1380404b540aSrobert static cppchar_t
narrow_str_to_charconst(cpp_reader * pfile,cpp_string str,unsigned int * pchars_seen,int * unsignedp)1381404b540aSrobert narrow_str_to_charconst (cpp_reader *pfile, cpp_string str,
1382404b540aSrobert unsigned int *pchars_seen, int *unsignedp)
1383404b540aSrobert {
1384404b540aSrobert size_t width = CPP_OPTION (pfile, char_precision);
1385404b540aSrobert size_t max_chars = CPP_OPTION (pfile, int_precision) / width;
1386404b540aSrobert size_t mask = width_to_mask (width);
1387404b540aSrobert size_t i;
1388404b540aSrobert cppchar_t result, c;
1389404b540aSrobert bool unsigned_p;
1390404b540aSrobert
1391404b540aSrobert /* The value of a multi-character character constant, or a
1392404b540aSrobert single-character character constant whose representation in the
1393404b540aSrobert execution character set is more than one byte long, is
1394404b540aSrobert implementation defined. This implementation defines it to be the
1395404b540aSrobert number formed by interpreting the byte sequence in memory as a
1396404b540aSrobert big-endian binary number. If overflow occurs, the high bytes are
1397404b540aSrobert lost, and a warning is issued.
1398404b540aSrobert
1399404b540aSrobert We don't want to process the NUL terminator handed back by
1400404b540aSrobert cpp_interpret_string. */
1401404b540aSrobert result = 0;
1402404b540aSrobert for (i = 0; i < str.len - 1; i++)
1403404b540aSrobert {
1404404b540aSrobert c = str.text[i] & mask;
1405404b540aSrobert if (width < BITS_PER_CPPCHAR_T)
1406404b540aSrobert result = (result << width) | c;
1407404b540aSrobert else
1408404b540aSrobert result = c;
1409404b540aSrobert }
1410404b540aSrobert
1411404b540aSrobert if (i > max_chars)
1412404b540aSrobert {
1413404b540aSrobert i = max_chars;
1414404b540aSrobert cpp_error (pfile, CPP_DL_WARNING,
1415404b540aSrobert "character constant too long for its type");
1416404b540aSrobert }
1417404b540aSrobert else if (i > 1 && CPP_OPTION (pfile, warn_multichar))
1418404b540aSrobert cpp_error (pfile, CPP_DL_WARNING, "multi-character character constant");
1419404b540aSrobert
1420404b540aSrobert /* Multichar constants are of type int and therefore signed. */
1421404b540aSrobert if (i > 1)
1422404b540aSrobert unsigned_p = 0;
1423404b540aSrobert else
1424404b540aSrobert unsigned_p = CPP_OPTION (pfile, unsigned_char);
1425404b540aSrobert
1426404b540aSrobert /* Truncate the constant to its natural width, and simultaneously
1427404b540aSrobert sign- or zero-extend to the full width of cppchar_t.
1428404b540aSrobert For single-character constants, the value is WIDTH bits wide.
1429404b540aSrobert For multi-character constants, the value is INT_PRECISION bits wide. */
1430404b540aSrobert if (i > 1)
1431404b540aSrobert width = CPP_OPTION (pfile, int_precision);
1432404b540aSrobert if (width < BITS_PER_CPPCHAR_T)
1433404b540aSrobert {
1434404b540aSrobert mask = ((cppchar_t) 1 << width) - 1;
1435404b540aSrobert if (unsigned_p || !(result & (1 << (width - 1))))
1436404b540aSrobert result &= mask;
1437404b540aSrobert else
1438404b540aSrobert result |= ~mask;
1439404b540aSrobert }
1440404b540aSrobert *pchars_seen = i;
1441404b540aSrobert *unsignedp = unsigned_p;
1442404b540aSrobert return result;
1443404b540aSrobert }
1444404b540aSrobert
1445404b540aSrobert /* Subroutine of cpp_interpret_charconst which performs the conversion
1446404b540aSrobert to a number, for wide strings. STR is the string structure returned
1447404b540aSrobert by cpp_interpret_string. PCHARS_SEEN and UNSIGNEDP are as for
1448404b540aSrobert cpp_interpret_charconst. */
1449404b540aSrobert static cppchar_t
wide_str_to_charconst(cpp_reader * pfile,cpp_string str,unsigned int * pchars_seen,int * unsignedp)1450404b540aSrobert wide_str_to_charconst (cpp_reader *pfile, cpp_string str,
1451404b540aSrobert unsigned int *pchars_seen, int *unsignedp)
1452404b540aSrobert {
1453404b540aSrobert bool bigend = CPP_OPTION (pfile, bytes_big_endian);
1454404b540aSrobert size_t width = CPP_OPTION (pfile, wchar_precision);
1455404b540aSrobert size_t cwidth = CPP_OPTION (pfile, char_precision);
1456404b540aSrobert size_t mask = width_to_mask (width);
1457404b540aSrobert size_t cmask = width_to_mask (cwidth);
1458404b540aSrobert size_t nbwc = width / cwidth;
1459404b540aSrobert size_t off, i;
1460404b540aSrobert cppchar_t result = 0, c;
1461404b540aSrobert
1462404b540aSrobert /* This is finicky because the string is in the target's byte order,
1463404b540aSrobert which may not be our byte order. Only the last character, ignoring
1464404b540aSrobert the NUL terminator, is relevant. */
1465404b540aSrobert off = str.len - (nbwc * 2);
1466404b540aSrobert result = 0;
1467404b540aSrobert for (i = 0; i < nbwc; i++)
1468404b540aSrobert {
1469404b540aSrobert c = bigend ? str.text[off + i] : str.text[off + nbwc - i - 1];
1470404b540aSrobert result = (result << cwidth) | (c & cmask);
1471404b540aSrobert }
1472404b540aSrobert
1473404b540aSrobert /* Wide character constants have type wchar_t, and a single
1474404b540aSrobert character exactly fills a wchar_t, so a multi-character wide
1475404b540aSrobert character constant is guaranteed to overflow. */
1476404b540aSrobert if (off > 0)
1477404b540aSrobert cpp_error (pfile, CPP_DL_WARNING,
1478404b540aSrobert "character constant too long for its type");
1479404b540aSrobert
1480404b540aSrobert /* Truncate the constant to its natural width, and simultaneously
1481404b540aSrobert sign- or zero-extend to the full width of cppchar_t. */
1482404b540aSrobert if (width < BITS_PER_CPPCHAR_T)
1483404b540aSrobert {
1484404b540aSrobert if (CPP_OPTION (pfile, unsigned_wchar) || !(result & (1 << (width - 1))))
1485404b540aSrobert result &= mask;
1486404b540aSrobert else
1487404b540aSrobert result |= ~mask;
1488404b540aSrobert }
1489404b540aSrobert
1490404b540aSrobert *unsignedp = CPP_OPTION (pfile, unsigned_wchar);
1491404b540aSrobert *pchars_seen = 1;
1492404b540aSrobert return result;
1493404b540aSrobert }
1494404b540aSrobert
1495404b540aSrobert /* Interpret a (possibly wide) character constant in TOKEN.
1496404b540aSrobert PCHARS_SEEN points to a variable that is filled in with the number
1497404b540aSrobert of characters seen, and UNSIGNEDP to a variable that indicates
1498404b540aSrobert whether the result has signed type. */
1499404b540aSrobert cppchar_t
cpp_interpret_charconst(cpp_reader * pfile,const cpp_token * token,unsigned int * pchars_seen,int * unsignedp)1500404b540aSrobert cpp_interpret_charconst (cpp_reader *pfile, const cpp_token *token,
1501404b540aSrobert unsigned int *pchars_seen, int *unsignedp)
1502404b540aSrobert {
1503404b540aSrobert cpp_string str = { 0, 0 };
1504404b540aSrobert bool wide = (token->type == CPP_WCHAR);
1505404b540aSrobert cppchar_t result;
1506404b540aSrobert
1507404b540aSrobert /* an empty constant will appear as L'' or '' */
1508404b540aSrobert if (token->val.str.len == (size_t) (2 + wide))
1509404b540aSrobert {
1510404b540aSrobert cpp_error (pfile, CPP_DL_ERROR, "empty character constant");
1511404b540aSrobert return 0;
1512404b540aSrobert }
1513404b540aSrobert else if (!cpp_interpret_string (pfile, &token->val.str, 1, &str, wide))
1514404b540aSrobert return 0;
1515404b540aSrobert
1516404b540aSrobert if (wide)
1517404b540aSrobert result = wide_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1518404b540aSrobert else
1519404b540aSrobert result = narrow_str_to_charconst (pfile, str, pchars_seen, unsignedp);
1520404b540aSrobert
1521404b540aSrobert if (str.text != token->val.str.text)
1522404b540aSrobert free ((void *)str.text);
1523404b540aSrobert
1524404b540aSrobert return result;
1525404b540aSrobert }
1526404b540aSrobert
1527404b540aSrobert /* Convert an identifier denoted by ID and LEN, which might contain
1528404b540aSrobert UCN escapes, to the source character set, either UTF-8 or
1529404b540aSrobert UTF-EBCDIC. Assumes that the identifier is actually a valid identifier. */
1530404b540aSrobert cpp_hashnode *
_cpp_interpret_identifier(cpp_reader * pfile,const uchar * id,size_t len)1531404b540aSrobert _cpp_interpret_identifier (cpp_reader *pfile, const uchar *id, size_t len)
1532404b540aSrobert {
1533404b540aSrobert /* It turns out that a UCN escape always turns into fewer characters
1534404b540aSrobert than the escape itself, so we can allocate a temporary in advance. */
1535404b540aSrobert uchar * buf = (uchar *) alloca (len + 1);
1536404b540aSrobert uchar * bufp = buf;
1537404b540aSrobert size_t idp;
1538404b540aSrobert
1539404b540aSrobert for (idp = 0; idp < len; idp++)
1540404b540aSrobert if (id[idp] != '\\')
1541404b540aSrobert *bufp++ = id[idp];
1542404b540aSrobert else
1543404b540aSrobert {
1544404b540aSrobert unsigned length = id[idp+1] == 'u' ? 4 : 8;
1545404b540aSrobert cppchar_t value = 0;
1546404b540aSrobert size_t bufleft = len - (bufp - buf);
1547404b540aSrobert int rval;
1548404b540aSrobert
1549404b540aSrobert idp += 2;
1550404b540aSrobert while (length && idp < len && ISXDIGIT (id[idp]))
1551404b540aSrobert {
1552404b540aSrobert value = (value << 4) + hex_value (id[idp]);
1553404b540aSrobert idp++;
1554404b540aSrobert length--;
1555404b540aSrobert }
1556404b540aSrobert idp--;
1557404b540aSrobert
1558404b540aSrobert /* Special case for EBCDIC: if the identifier contains
1559404b540aSrobert a '$' specified using a UCN, translate it to EBCDIC. */
1560404b540aSrobert if (value == 0x24)
1561404b540aSrobert {
1562404b540aSrobert *bufp++ = '$';
1563404b540aSrobert continue;
1564404b540aSrobert }
1565404b540aSrobert
1566404b540aSrobert rval = one_cppchar_to_utf8 (value, &bufp, &bufleft);
1567404b540aSrobert if (rval)
1568404b540aSrobert {
1569404b540aSrobert errno = rval;
1570404b540aSrobert cpp_errno (pfile, CPP_DL_ERROR,
1571404b540aSrobert "converting UCN to source character set");
1572404b540aSrobert break;
1573404b540aSrobert }
1574404b540aSrobert }
1575404b540aSrobert
1576404b540aSrobert return CPP_HASHNODE (ht_lookup (pfile->hash_table,
1577404b540aSrobert buf, bufp - buf, HT_ALLOC));
1578404b540aSrobert }
1579404b540aSrobert
1580404b540aSrobert /* Convert an input buffer (containing the complete contents of one
1581404b540aSrobert source file) from INPUT_CHARSET to the source character set. INPUT
1582404b540aSrobert points to the input buffer, SIZE is its allocated size, and LEN is
1583404b540aSrobert the length of the meaningful data within the buffer. The
1584404b540aSrobert translated buffer is returned, and *ST_SIZE is set to the length of
1585404b540aSrobert the meaningful data within the translated buffer.
1586404b540aSrobert
1587404b540aSrobert INPUT is expected to have been allocated with xmalloc. This function
1588404b540aSrobert will either return INPUT, or free it and return a pointer to another
1589404b540aSrobert xmalloc-allocated block of memory. */
1590404b540aSrobert uchar *
_cpp_convert_input(cpp_reader * pfile,const char * input_charset,uchar * input,size_t size,size_t len,off_t * st_size)1591404b540aSrobert _cpp_convert_input (cpp_reader *pfile, const char *input_charset,
1592404b540aSrobert uchar *input, size_t size, size_t len, off_t *st_size)
1593404b540aSrobert {
1594404b540aSrobert struct cset_converter input_cset;
1595404b540aSrobert struct _cpp_strbuf to;
1596404b540aSrobert
1597404b540aSrobert input_cset = init_iconv_desc (pfile, SOURCE_CHARSET, input_charset);
1598404b540aSrobert if (input_cset.func == convert_no_conversion)
1599404b540aSrobert {
1600404b540aSrobert to.text = input;
1601404b540aSrobert to.asize = size;
1602404b540aSrobert to.len = len;
1603404b540aSrobert }
1604404b540aSrobert else
1605404b540aSrobert {
1606404b540aSrobert to.asize = MAX (65536, len);
1607404b540aSrobert to.text = XNEWVEC (uchar, to.asize);
1608404b540aSrobert to.len = 0;
1609404b540aSrobert
1610404b540aSrobert if (!APPLY_CONVERSION (input_cset, input, len, &to))
1611404b540aSrobert cpp_error (pfile, CPP_DL_ERROR,
1612404b540aSrobert "failure to convert %s to %s",
1613404b540aSrobert CPP_OPTION (pfile, input_charset), SOURCE_CHARSET);
1614404b540aSrobert
1615404b540aSrobert free (input);
1616404b540aSrobert }
1617404b540aSrobert
1618404b540aSrobert /* Clean up the mess. */
1619404b540aSrobert if (input_cset.func == convert_using_iconv)
1620404b540aSrobert iconv_close (input_cset.cd);
1621404b540aSrobert
1622404b540aSrobert /* Resize buffer if we allocated substantially too much, or if we
1623404b540aSrobert haven't enough space for the \n-terminator. */
1624404b540aSrobert if (to.len + 4096 < to.asize || to.len >= to.asize)
1625404b540aSrobert to.text = XRESIZEVEC (uchar, to.text, to.len + 1);
1626404b540aSrobert
1627404b540aSrobert /* If the file is using old-school Mac line endings (\r only),
1628404b540aSrobert terminate with another \r, not an \n, so that we do not mistake
1629404b540aSrobert the \r\n sequence for a single DOS line ending and erroneously
1630404b540aSrobert issue the "No newline at end of file" diagnostic. */
1631*1c4aaf6cSkili if (to.len > 0 && to.text[to.len - 1] == '\r')
1632404b540aSrobert to.text[to.len] = '\r';
1633404b540aSrobert else
1634404b540aSrobert to.text[to.len] = '\n';
1635404b540aSrobert
1636404b540aSrobert *st_size = to.len;
1637404b540aSrobert return to.text;
1638404b540aSrobert }
1639404b540aSrobert
1640404b540aSrobert /* Decide on the default encoding to assume for input files. */
1641404b540aSrobert const char *
_cpp_default_encoding(void)1642404b540aSrobert _cpp_default_encoding (void)
1643404b540aSrobert {
1644404b540aSrobert const char *current_encoding = NULL;
1645404b540aSrobert
1646404b540aSrobert /* We disable this because the default codeset is 7-bit ASCII on
1647404b540aSrobert most platforms, and this causes conversion failures on every
1648404b540aSrobert file in GCC that happens to have one of the upper 128 characters
1649404b540aSrobert in it -- most likely, as part of the name of a contributor.
1650404b540aSrobert We should definitely recognize in-band markers of file encoding,
1651404b540aSrobert like:
1652404b540aSrobert - the appropriate Unicode byte-order mark (FE FF) to recognize
1653404b540aSrobert UTF16 and UCS4 (in both big-endian and little-endian flavors)
1654404b540aSrobert and UTF8
1655404b540aSrobert - a "#i", "#d", "/ *", "//", " #p" or "#p" (for #pragma) to
1656404b540aSrobert distinguish ASCII and EBCDIC.
1657404b540aSrobert - now we can parse something like "#pragma GCC encoding <xyz>
1658404b540aSrobert on the first line, or even Emacs/VIM's mode line tags (there's
1659404b540aSrobert a problem here in that VIM uses the last line, and Emacs has
1660404b540aSrobert its more elaborate "local variables" convention).
1661404b540aSrobert - investigate whether Java has another common convention, which
1662404b540aSrobert would be friendly to support.
1663404b540aSrobert (Zack Weinberg and Paolo Bonzini, May 20th 2004) */
1664404b540aSrobert #if defined (HAVE_LOCALE_H) && defined (HAVE_LANGINFO_CODESET) && 0
1665404b540aSrobert setlocale (LC_CTYPE, "");
1666404b540aSrobert current_encoding = nl_langinfo (CODESET);
1667404b540aSrobert #endif
1668404b540aSrobert if (current_encoding == NULL || *current_encoding == '\0')
1669404b540aSrobert current_encoding = SOURCE_CHARSET;
1670404b540aSrobert
1671404b540aSrobert return current_encoding;
1672404b540aSrobert }
1673