xref: /netbsd-src/external/gpl3/gcc.old/dist/libcpp/lex.c (revision 6cd39ddb8550f6fa1bff3fed32053d7f19fd0453)
1 /* CPP Library - lexical analysis.
2    Copyright (C) 2000-2013 Free Software Foundation, Inc.
3    Contributed by Per Bothner, 1994-95.
4    Based on CCCP program by Paul Rubin, June 1986
5    Adapted to ANSI C, Richard Stallman, Jan 1987
6    Broken out to separate file, Zack Weinberg, Mar 2000
7 
8 This program is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by the
10 Free Software Foundation; either version 3, or (at your option) any
11 later version.
12 
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 GNU General Public License for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING3.  If not see
20 <http://www.gnu.org/licenses/>.  */
21 
22 #include "config.h"
23 #include "system.h"
24 #include "cpplib.h"
25 #include "internal.h"
26 
/* Spelling category recorded for each token type in token_spellings.  */
27 enum spell_type
28 {
29   SPELL_OPERATOR = 0,	/* Used by every OP() table entry.  */
30   SPELL_IDENT,		/* The remaining categories are selected by the  */
31   SPELL_LITERAL,		/* second argument of a TK() table entry, which  */
32   SPELL_NONE		/* is pasted onto the SPELL_ prefix.  */
33 };
34 
/* One row of the token_spellings table: the spelling category of a
   token type together with a fixed string associated with it.  */
35 struct token_spelling
36 {
37   enum spell_type category;	/* Category of this token type.  */
38   const unsigned char *name;	/* OP(): the operator text; TK(): the
				   stringified enumerator name.  */
39 };
40 
/* Text of the digraph spellings.  NOTE(review): the index order is
   presumably relied upon by code outside this chunk -- confirm before
   reordering.  */
41 static const unsigned char *const digraph_spellings[] =
42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };
43 
/* Build token_spellings by expanding TTYPE_TABLE with OP and TK
   temporarily defined to produce one initializer per entry.  */
44 #define OP(e, s) { SPELL_OPERATOR, UC s  },
45 #define TK(e, s) { SPELL_ ## s,    UC #e },
46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
47 #undef OP
48 #undef TK
49 
/* Look up the spelling category / fixed string for TOKEN's type.  */
50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name)
52 
/* Forward declarations of helpers local to this file.  */
53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
54 static int skip_line_comment (cpp_reader *);
55 static void skip_whitespace (cpp_reader *, cppchar_t);
56 static void lex_string (cpp_reader *, cpp_token *, const uchar *);
57 static void save_comment (cpp_reader *, cpp_token *, const uchar *, cppchar_t);
58 static void store_comment (cpp_reader *, cpp_token *);
59 static void create_literal (cpp_reader *, cpp_token *, const uchar *,
60 			    unsigned int, enum cpp_ttype);
61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *);
62 static int name_p (cpp_reader *, const cpp_string *);
63 static tokenrun *next_tokenrun (tokenrun *);
64 
65 static _cpp_buff *new_buff (size_t);
66 
67 
68 /* Utility routine:
69 
70    Compares, the token TOKEN to the NUL-terminated string STRING.
71    TOKEN must be a CPP_NAME.  Returns 1 for equal, 0 for unequal.  */
72 int
73 cpp_ideq (const cpp_token *token, const char *string)
74 {
75   if (token->type != CPP_NAME)
76     return 0;
77 
78   return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string);
79 }
80 
81 /* Record a note TYPE at byte POS into the current cleaned logical
82    line.  */
83 static void
84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type)
85 {
86   if (buffer->notes_used == buffer->notes_cap)
87     {
88       buffer->notes_cap = buffer->notes_cap * 2 + 200;
89       buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes,
90                                   buffer->notes_cap);
91     }
92 
93   buffer->notes[buffer->notes_used].pos = pos;
94   buffer->notes[buffer->notes_used].type = type;
95   buffer->notes_used++;
96 }
97 
98 
99 /* Fast path to find line special characters using optimized character
100    scanning algorithms.  Anything complicated falls back to the slow
101    path below.  Since this loop is very hot it's worth doing these kinds
102    of optimizations.
103 
104    One of the paths through the ifdefs should provide
105 
106      const uchar *search_line_fast (const uchar *s, const uchar *end);
107 
108    Between S and END, search for \n, \r, \\, ?.  Return a pointer to
109    the found character.
110 
111    Note that the last character of the buffer is *always* a newline,
112    as forced by _cpp_convert_input.  This fact can be used to avoid
113    explicitly looking for the end of the buffer.  */
114 
115 /* Configure gives us an ifdef test.  */
116 #ifndef WORDS_BIGENDIAN
117 #define WORDS_BIGENDIAN 0
118 #endif
119 
120 /* We'd like the largest integer that fits into a register.  There's nothing
121    in <stdint.h> that gives us that.  For most hosts this is unsigned long,
122    but MS decided on an LLP64 model.  Thankfully when building with GCC we
123    can get the "real" word size.  */
124 #ifdef __GNUC__
125 typedef unsigned int word_type __attribute__((__mode__(__word__)));
126 #else
127 typedef unsigned long word_type;
128 #endif
129 
130 /* The code below is only expecting sizes 4 or 8.
131    Die at compile-time if this expectation is violated.  */
/* The array bound evaluates to 1 when the size test holds and to -1,
   which is not a valid bound, when it does not.  */
132 typedef char check_word_type_size
133   [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1];
134 
135 /* Return X with the first N bytes forced to values that won't match one
136    of the interesting characters.  Note that NUL is not interesting.  */
137 
138 static inline word_type
139 acc_char_mask_misalign (word_type val, unsigned int n)
140 {
141   word_type mask = -1;
142   if (WORDS_BIGENDIAN)
143     mask >>= n * 8;
144   else
145     mask <<= n * 8;
146   return val & mask;
147 }
148 
149 /* Return X replicated to all byte positions within WORD_TYPE.  */
150 
151 static inline word_type
152 acc_char_replicate (uchar x)
153 {
154   word_type ret;
155 
156   ret = (x << 24) | (x << 16) | (x << 8) | x;
157   if (sizeof(word_type) == 8)
158     ret = (ret << 16 << 16) | ret;
159   return ret;
160 }
161 
162 /* Return non-zero if some byte of VAL is (probably) C.  */
163 
164 static inline word_type
165 acc_char_cmp (word_type val, word_type c)
166 {
167 #if defined(__GNUC__) && defined(__alpha__)
168   /* We can get exact results using a compare-bytes instruction.
169      Get (val == c) via (0 >= (val ^ c)).  */
170   return __builtin_alpha_cmpbge (0, val ^ c);
171 #else
172   word_type magic = 0x7efefefeU;
173   if (sizeof(word_type) == 8)
174     magic = (magic << 16 << 16) | 0xfefefefeU;
175   magic |= 1;
176 
177   val ^= c;
178   return ((val + magic) ^ ~val) & ~magic;
179 #endif
180 }
181 
182 /* Given the result of acc_char_cmp is non-zero, return the index of
183    the found character.  If this was a false positive, return -1.  */
184 
185 static inline int
186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED,
187 		word_type val ATTRIBUTE_UNUSED)
188 {
189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN
190   /* The cmpbge instruction sets *bits* of the result corresponding to
191      matches in the bytes with no false positives.  */
192   return __builtin_ctzl (cmp);
193 #else
194   unsigned int i;
195 
196   /* ??? It would be nice to force unrolling here,
197      and have all of these constants folded.  */
198   for (i = 0; i < sizeof(word_type); ++i)
199     {
200       uchar c;
201       if (WORDS_BIGENDIAN)
202 	c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff;
203       else
204 	c = (val >> i * 8) & 0xff;
205 
206       if (c == '\n' || c == '\r' || c == '\\' || c == '?')
207 	return i;
208     }
209 
210   return -1;
211 #endif
212 }
213 
214 /* A version of the fast scanner using bit fiddling techniques.
215 
216    For 32-bit words, one would normally perform 16 comparisons and
217    16 branches.  With this algorithm one performs 24 arithmetic
218    operations and one branch.  Whether this is faster with a 32-bit
219    word size is going to be somewhat system dependent.
220 
221    For 64-bit words, we eliminate twice the number of comparisons
222    and branches without increasing the number of arithmetic operations.
223    It's almost certainly going to be a win with 64-bit word size.  */
224 
225 static const uchar * search_line_acc_char (const uchar *, const uchar *)
226   ATTRIBUTE_UNUSED;
227 
228 static const uchar *
229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
230 {
231   const word_type repl_nl = acc_char_replicate ('\n');
232   const word_type repl_cr = acc_char_replicate ('\r');
233   const word_type repl_bs = acc_char_replicate ('\\');
234   const word_type repl_qm = acc_char_replicate ('?');
235 
236   unsigned int misalign;
237   const word_type *p;
238   word_type val, t;
239 
240   /* Align the buffer.  Mask out any bytes from before the beginning.  */
241   p = (word_type *)((uintptr_t)s & -sizeof(word_type));
242   val = *p;
243   misalign = (uintptr_t)s & (sizeof(word_type) - 1);
244   if (misalign)
245     val = acc_char_mask_misalign (val, misalign);
246 
247   /* Main loop.  */
248   while (1)
249     {
250       t  = acc_char_cmp (val, repl_nl);
251       t |= acc_char_cmp (val, repl_cr);
252       t |= acc_char_cmp (val, repl_bs);
253       t |= acc_char_cmp (val, repl_qm);
254 
255       if (__builtin_expect (t != 0, 0))
256 	{
257 	  int i = acc_char_index (t, val);
258 	  if (i >= 0)
259 	    return (const uchar *)p + i;
260 	}
261 
262       val = *++p;
263     }
264 }
265 
266 /* Disable on Solaris 2/x86 until the following problems can be properly
267    autoconfed:
268 
269    The Solaris 9 assembler cannot assemble SSE4.2 insns.
270    Before Solaris 9 Update 6, SSE insns cannot be executed.
271    The Solaris 10+ assembler tags objects with the instruction set
272    extensions used, so SSE4.2 executables cannot run on machines that
273    don't support that extension.  */
274 
275 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__))
276 
277 /* Replicated character data to be shared between implementations.
278    Recall that outside of a context with vector support we can't
279    define compatible vector types, therefore these are all defined
280    in terms of raw characters.  */
/* Row 0 is '\n', row 1 is '\r', row 2 is '\\' and row 3 is '?'.
   search_line_mmx reads the first 8 bytes of a row as one vector and
   search_line_sse2 reads all 16.  */
281 static const char repl_chars[4][16] __attribute__((aligned(16))) = {
282   { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
283     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
284   { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
285     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
286   { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
287     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
288   { '?', '?', '?', '?', '?', '?', '?', '?',
289     '?', '?', '?', '?', '?', '?', '?', '?' },
290 };
291 
292 /* A version of the fast scanner using MMX vectorized byte compare insns.
293 
294    This uses the PMOVMSKB instruction which was introduced with "MMX2",
295    which was packaged into SSE1; it is also present in the AMD MMX
296    extension.  Mark the function as using "sse" so that we emit a real
297    "emms" instruction, rather than the 3dNOW "femms" instruction.  */
298 
/* S is where the search starts; END is unused.  Returns a pointer to
   the first '\n', '\r', '\\' or '?' at or after S.  The loop has no
   bounds check, so one of those characters must be present.  */
299 static const uchar *
300 #ifndef __SSE__
301 __attribute__((__target__("sse")))
302 #endif
303 search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
304 {
305   typedef char v8qi __attribute__ ((__vector_size__ (8)));
306   typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));
307 
308   const v8qi repl_nl = *(const v8qi *)repl_chars[0];
309   const v8qi repl_cr = *(const v8qi *)repl_chars[1];
310   const v8qi repl_bs = *(const v8qi *)repl_chars[2];
311   const v8qi repl_qm = *(const v8qi *)repl_chars[3];
312 
313   unsigned int misalign, found, mask;
314   const v8qi *p;
315   v8qi data, t, c;
316 
317   /* Align the source pointer.  While MMX doesn't generate unaligned data
318      faults, this allows us to safely scan to the end of the buffer without
319      reading beyond the end of the last page.  */
320   misalign = (uintptr_t)s & 7;
321   p = (const v8qi *)((uintptr_t)s & -8);
322   data = *p;
323 
324   /* Create a mask for the bytes that are valid within the first
325      8-byte block.  The Idea here is that the AND with the mask
326      within the loop is "free", since we need some AND or TEST
327      insn in order to set the flags for the branch anyway.  */
328   mask = -1u << misalign;
329 
330   /* Main loop processing 8 bytes at a time.  */
  /* The first block is already loaded and has its own mask, so enter
     the loop body past the load.  */
331   goto start;
332   do
333     {
334       data = *++p;
335       mask = -1;	/* Every byte of a later block is valid.  */
336 
337     start:
      /* OR together the byte-wise equality results for the four
	 characters, then collect them into the bits of FOUND.  */
338       t = __builtin_ia32_pcmpeqb(data, repl_nl);
339       c = __builtin_ia32_pcmpeqb(data, repl_cr);
340       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
341       c = __builtin_ia32_pcmpeqb(data, repl_bs);
342       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
343       c = __builtin_ia32_pcmpeqb(data, repl_qm);
344       t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
345       found = __builtin_ia32_pmovmskb (t);
346       found &= mask;
347     }
348   while (!found);
349 
350   __builtin_ia32_emms ();
351 
352   /* FOUND contains 1 in bits for which we matched a relevant
353      character.  Conversion to the byte index is trivial.  */
354   found = __builtin_ctz(found);
355   return (const uchar *)p + found;
356 }
357 
358 /* A version of the fast scanner using SSE2 vectorized byte compare insns.  */
359 
/* S is where the search starts; END is unused.  Returns a pointer to
   the first '\n', '\r', '\\' or '?' at or after S.  The loop has no
   bounds check, so one of those characters must be present.  */
360 static const uchar *
361 #ifndef __SSE2__
362 __attribute__((__target__("sse2")))
363 #endif
364 search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
365 {
366   typedef char v16qi __attribute__ ((__vector_size__ (16)));
367 
368   const v16qi repl_nl = *(const v16qi *)repl_chars[0];
369   const v16qi repl_cr = *(const v16qi *)repl_chars[1];
370   const v16qi repl_bs = *(const v16qi *)repl_chars[2];
371   const v16qi repl_qm = *(const v16qi *)repl_chars[3];
372 
373   unsigned int misalign, found, mask;
374   const v16qi *p;
375   v16qi data, t;
376 
377   /* Align the source pointer.  */
378   misalign = (uintptr_t)s & 15;
379   p = (const v16qi *)((uintptr_t)s & -16);
380   data = *p;
381 
382   /* Create a mask for the bytes that are valid within the first
383      16-byte block.  The Idea here is that the AND with the mask
384      within the loop is "free", since we need some AND or TEST
385      insn in order to set the flags for the branch anyway.  */
386   mask = -1u << misalign;
387 
388   /* Main loop processing 16 bytes at a time.  */
  /* The first block is already loaded and has its own mask, so enter
     the loop body past the load.  */
389   goto start;
390   do
391     {
392       data = *++p;
393       mask = -1;	/* Every byte of a later block is valid.  */
394 
395     start:
      /* OR together the byte-wise equality results for the four
	 characters, then collect them into the bits of FOUND.  */
396       t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
397       t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
398       t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
399       t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
400       found = __builtin_ia32_pmovmskb128 (t);
401       found &= mask;
402     }
403   while (!found);
404 
405   /* FOUND contains 1 in bits for which we matched a relevant
406      character.  Conversion to the byte index is trivial.  */
407   found = __builtin_ctz(found);
408   return (const uchar *)p + found;
409 }
410 
411 #ifdef HAVE_SSE4
412 /* A version of the fast scanner using SSE 4.2 vectorized string insns.  */
413 
/* S is where the search starts.  END is only used to decide whether an
   unaligned 16-byte read at S is safe.  Returns a pointer to the first
   character at or after S that occurs in SEARCH; the main loop has no
   bounds check, so such a character must be present.  */
414 static const uchar *
415 #ifndef __SSE4_2__
416 __attribute__((__target__("sse4.2")))
417 #endif
418 search_line_sse42 (const uchar *s, const uchar *end)
419 {
420   typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* Only the first four elements are meaningful; the length 4 is
     passed alongside SEARCH below.  */
421   static const v16qi search = { '\n', '\r', '?', '\\' };
422 
423   uintptr_t si = (uintptr_t)s;
424   uintptr_t index;
425 
426   /* Check for unaligned input.  */
427   if (si & 15)
428     {
429       v16qi sv;
430 
431       if (__builtin_expect (end - s < 16, 0)
432 	  && __builtin_expect ((si & 0xfff) > 0xff0, 0))
433 	{
434 	  /* There are less than 16 bytes left in the buffer, and less
435 	     than 16 bytes left on the page.  Reading 16 bytes at this
436 	     point might generate a spurious page fault.  Defer to the
437 	     SSE2 implementation, which already handles alignment.  */
438 	  return search_line_sse2 (s, end);
439 	}
440 
441       /* ??? The builtin doesn't understand that the PCMPESTRI read from
442 	 memory need not be aligned.  */
443       sv = __builtin_ia32_loaddqu ((const char *) s);
444       index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);
445 
      /* An index below 16 is treated as a match within this block.  */
446       if (__builtin_expect (index < 16, 0))
447 	goto found;
448 
449       /* Advance the pointer to an aligned address.  We will re-scan a
450 	 few bytes, but we no longer need care for reading past the
451 	 end of a page, since we're guaranteed a match.  */
452       s = (const uchar *)((si + 16) & -16);
453     }
454 
455   /* Main loop, processing 16 bytes at a time.  By doing the whole loop
456      in inline assembly, we can make proper use of the flags set.  */
  /* S is pre-decremented so that the add at the top of the loop leaves
     it at the block being examined.  The loop repeats while the carry
     flag is clear; presumably PCMPESTRI sets carry on a match -- see
     the instruction reference.  INDEX is received in the "c" register.
     NOTE(review): the asm reads memory through %1 without a memory
     operand or clobber -- confirm that this is safe for the compiler
     in use.  */
457   __asm (      "sub $16, %1\n"
458 	"	.balign 16\n"
459 	"0:	add $16, %1\n"
460 	"	%vpcmpestri $0, (%1), %2\n"
461 	"	jnc 0b"
462 	: "=&c"(index), "+r"(s)
463 	: "x"(search), "a"(4), "d"(16));
464 
465  found:
466   return s + index;
467 }
468 
469 #else
470 /* Work around out-dated assemblers without sse4 support.  */
471 #define search_line_sse42 search_line_sse2
472 #endif
473 
474 /* Check the CPU capabilities.  */
475 
476 #include "../gcc/config/i386/cpuid.h"
477 
478 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
479 static search_line_fast_type search_line_fast;
480 
481 #define HAVE_init_vectorized_lexer 1
482 static inline void
483 init_vectorized_lexer (void)
484 {
485   unsigned dummy, ecx = 0, edx = 0;
486   search_line_fast_type impl = search_line_acc_char;
487   int minimum = 0;
488 
489 #if defined(__SSE4_2__)
490   minimum = 3;
491 #elif defined(__SSE2__)
492   minimum = 2;
493 #elif defined(__SSE__)
494   minimum = 1;
495 #endif
496 
497   if (minimum == 3)
498     impl = search_line_sse42;
499   else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
500     {
501       if (minimum == 3 || (ecx & bit_SSE4_2))
502         impl = search_line_sse42;
503       else if (minimum == 2 || (edx & bit_SSE2))
504 	impl = search_line_sse2;
505       else if (minimum == 1 || (edx & bit_SSE))
506 	impl = search_line_mmx;
507     }
508   else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
509     {
510       if (minimum == 1
511 	  || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
512 	impl = search_line_mmx;
513     }
514 
515   search_line_fast = impl;
516 }
517 
518 #elif defined(_ARCH_PWR8) && defined(__ALTIVEC__)
519 
520 /* A version of the fast scanner using AltiVec vectorized byte compares
521    and VSX unaligned loads (when VSX is available).  This is otherwise
522    the same as the pre-GCC 5 version.  */
523 
/* S is where the search starts; END is unused.  Returns a pointer to
   the first '\n', '\r', '\\' or '?' at or after S.  The loop has no
   bounds check, so one of those characters must be present.  */
524 static const uchar *
525 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
526 {
527   typedef __attribute__((altivec(vector))) unsigned char vc;
528 
529   const vc repl_nl = {
530     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
531     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
532   };
533   const vc repl_cr = {
534     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
535     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
536   };
537   const vc repl_bs = {
538     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
539     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
540   };
541   const vc repl_qm = {
542     '?', '?', '?', '?', '?', '?', '?', '?',
543     '?', '?', '?', '?', '?', '?', '?', '?',
544   };
545   const vc zero = { 0 };
546 
547   vc data, t;
548 
549   /* Main loop processing 16 bytes at a time.  */
550   do
551     {
552       vc m_nl, m_cr, m_bs, m_qm;
553 
      /* S is dereferenced as given, without being rounded down to a
	 16-byte boundary first.  */
554       data = *((const vc *)s);
555       s += 16;
556 
557       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
558       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
559       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
560       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
561       t = (m_nl | m_cr) | (m_bs | m_qm);
562 
563       /* T now contains 0xff in bytes for which we matched one of the relevant
564 	 characters.  We want to exit the loop if any byte in T is non-zero.
565 	 Below is the expansion of vec_any_ne(t, zero).  */
566     }
567   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
568 
569   /* Restore s to point to the 16 bytes we just processed.  */
570   s -= 16;
571 
572   {
    /* Number of longs that make up one vector.  */
573 #define N  (sizeof(vc) / sizeof(long))
574 
575     union {
576       vc v;
577       /* Statically assert that N is 2 or 4.  */
578       unsigned long l[(N == 2 || N == 4) ? N : -1];
579     } u;
580     unsigned long l, i = 0;
581 
582     u.v = t;
583 
584     /* Find the first word of T that is non-zero.  */
    /* S is advanced past every all-zero word that gets skipped.  */
585     switch (N)
586       {
587       case 4:
588 	l = u.l[i++];
589 	if (l != 0)
590 	  break;
591 	s += sizeof(unsigned long);
592 	l = u.l[i++];
593 	if (l != 0)
594 	  break;
595 	s += sizeof(unsigned long);
	/* Fall through to examine the remaining two words.  */
596       case 2:
597 	l = u.l[i++];
598 	if (l != 0)
599 	  break;
600 	s += sizeof(unsigned long);
601 	l = u.l[i];
602       }
603 
604     /* L now contains 0xff in bytes for which we matched one of the
605        relevant characters.  We can find the byte index by finding
606        its bit index and dividing by 8.  */
607 #ifdef __BIG_ENDIAN__
608     l = __builtin_clzl(l) >> 3;
609 #else
610     l = __builtin_ctzl(l) >> 3;
611 #endif
612     return s + l;
613 
614 #undef N
615   }
616 }
617 
618 #elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)
619 
620 /* A version of the fast scanner using AltiVec vectorized byte compares.
621    This cannot be used for little endian because vec_lvsl/lvsr are
622    deprecated for little endian and the code won't work properly.  */
623 /* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
624    so we can't compile this function without -maltivec on the command line
625    (or implied by some other switch).  */
626 
/* S is where the search starts; END is unused.  Returns a pointer to
   the first '\n', '\r', '\\' or '?' at or after S.  The loop has no
   bounds check, so one of those characters must be present.  */
627 static const uchar *
628 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
629 {
630   typedef __attribute__((altivec(vector))) unsigned char vc;
631 
632   const vc repl_nl = {
633     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
634     '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
635   };
636   const vc repl_cr = {
637     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
638     '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
639   };
640   const vc repl_bs = {
641     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
642     '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
643   };
644   const vc repl_qm = {
645     '?', '?', '?', '?', '?', '?', '?', '?',
646     '?', '?', '?', '?', '?', '?', '?', '?',
647   };
648   const vc ones = {
649     -1, -1, -1, -1, -1, -1, -1, -1,
650     -1, -1, -1, -1, -1, -1, -1, -1,
651   };
652   const vc zero = { 0 };
653 
654   vc data, mask, t;
655 
656   /* Altivec loads automatically mask addresses with -16.  This lets us
657      issue the first load as early as possible.  */
658   data = __builtin_vec_ld(0, (const vc *)s);
659 
660   /* Discard bytes before the beginning of the buffer.  Do this by
661      beginning with all ones and shifting in zeros according to the
662      mis-alignment.  The LVSR instruction pulls the exact shift we
663      want from the address.  */
664   mask = __builtin_vec_lvsr(0, s);
665   mask = __builtin_vec_perm(zero, ones, mask);
666   data &= mask;
667 
668   /* While altivec loads mask addresses, we still need to align S so
669      that the offset we compute at the end is correct.  */
670   s = (const uchar *)((uintptr_t)s & -16);
671 
672   /* Main loop processing 16 bytes at a time.  */
  /* The first block is already loaded and masked, so enter the loop
     body past the load.  */
673   goto start;
674   do
675     {
676       vc m_nl, m_cr, m_bs, m_qm;
677 
678       s += 16;
679       data = __builtin_vec_ld(0, (const vc *)s);
680 
681     start:
682       m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
683       m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
684       m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
685       m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
686       t = (m_nl | m_cr) | (m_bs | m_qm);
687 
688       /* T now contains 0xff in bytes for which we matched one of the relevant
689 	 characters.  We want to exit the loop if any byte in T is non-zero.
690 	 Below is the expansion of vec_any_ne(t, zero).  */
691     }
692   while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));
693 
694   {
    /* Number of longs that make up one vector.  */
695 #define N  (sizeof(vc) / sizeof(long))
696 
697     union {
698       vc v;
699       /* Statically assert that N is 2 or 4.  */
700       unsigned long l[(N == 2 || N == 4) ? N : -1];
701     } u;
702     unsigned long l, i = 0;
703 
704     u.v = t;
705 
706     /* Find the first word of T that is non-zero.  */
    /* S is advanced past every all-zero word that gets skipped.  */
707     switch (N)
708       {
709       case 4:
710 	l = u.l[i++];
711 	if (l != 0)
712 	  break;
713 	s += sizeof(unsigned long);
714 	l = u.l[i++];
715 	if (l != 0)
716 	  break;
717 	s += sizeof(unsigned long);
	/* Fall through to examine the remaining two words.  */
718       case 2:
719 	l = u.l[i++];
720 	if (l != 0)
721 	  break;
722 	s += sizeof(unsigned long);
723 	l = u.l[i];
724       }
725 
726     /* L now contains 0xff in bytes for which we matched one of the
727        relevant characters.  We can find the byte index by finding
728        its bit index and dividing by 8.  */
    /* This variant is big-endian only, so the first byte in memory is
       the most significant one; hence count leading zeros.  */
729     l = __builtin_clzl(l) >> 3;
730     return s + l;
731 
732 #undef N
733   }
734 }
735 
736 #elif defined (__ARM_NEON__)
737 #include "arm_neon.h"
738 
/* A version of the fast scanner using NEON byte compares.  S is where
   the search starts; END is unused.  Returns a pointer to the first
   '\n', '\r', '\\' or '?' at or after S.  The loop has no bounds check,
   so one of those characters must be present.  */
739 static const uchar *
740 search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
741 {
742   const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
743   const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
744   const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
745   const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* One distinct bit per byte within each 8-byte half; used below to
     turn the per-byte compare results into a bit mask.  */
746   const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
747 
748   unsigned int misalign, found, mask;
749   const uint8_t *p;
750   uint8x16_t data;
751 
752   /* Align the source pointer.  */
753   misalign = (uintptr_t)s & 15;
754   p = (const uint8_t *)((uintptr_t)s & -16);
755   data = vld1q_u8 (p);
756 
757   /* Create a mask for the bytes that are valid within the first
758      16-byte block.  The Idea here is that the AND with the mask
759      within the loop is "free", since we need some AND or TEST
760      insn in order to set the flags for the branch anyway.  */
761   mask = (-1u << misalign) & 0xffff;
762 
763   /* Main loop, processing 16 bytes at a time.  */
  /* The first block is already loaded and has its own mask, so enter
     the loop body past the load.  */
764   goto start;
765 
766   do
767     {
768       uint8x8_t l;
769       uint16x4_t m;
770       uint32x2_t n;
771       uint8x16_t t, u, v, w;
772 
773       p += 16;
774       data = vld1q_u8 (p);
775       mask = 0xffff;	/* Every byte of a later block is valid.  */
776 
777     start:
778       t = vceqq_u8 (data, repl_nl);
779       u = vceqq_u8 (data, repl_cr);
780       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
781       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      /* Keep one bit per matching byte, then sum neighbouring lanes
	 three times so that each 8-byte half collapses into an 8-bit
	 mask held in one 32-bit lane of N.  */
782       t = vandq_u8 (vorrq_u8 (v, w), xmask);
783       l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
784       m = vpaddl_u8 (l);
785       n = vpaddl_u16 (m);
786 
      /* Combine the two halves: the upper lane ends up in bits 8..15
	 of lane 0, giving one bit per byte of the block.  */
787       found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
788 	      vshr_n_u64 ((uint64x1_t) n, 24)), 0);
789       found &= mask;
790     }
791   while (!found);
792 
793   /* FOUND contains 1 in bits for which we matched a relevant
794      character.  Conversion to the byte index is trivial.  */
795   found = __builtin_ctz (found);
796   return (const uchar *)p + found;
797 }
798 
799 #else
800 
801 /* We only have one accelerated alternative.  Use a direct call so that
802    we encourage inlining.  */
803 
804 #define search_line_fast  search_line_acc_char
805 
806 #endif
807 
/* Lexer initialization hook.  Runs init_vectorized_lexer when this
   build defines one; otherwise there is nothing to do.  */

void
_cpp_init_lexer (void)
{
#if defined (HAVE_init_vectorized_lexer)
  init_vectorized_lexer ();
#endif
}
817 
818 /* Returns with a logical line that contains no escaped newlines or
819    trigraphs.  This is a time-critical inner loop.  */
/* PFILE is the reader whose current buffer is processed.  The line
   starting at buffer->next_line is cleaned in place: escaped newlines
   are removed and, when the trigraphs option is set, trigraphs are
   replaced, with a line note recorded for each.  On return buffer->cur
   and buffer->line_base point at the start of the line, the cleaned
   line ends in '\n', a sentinel note follows it, and buffer->next_line
   points just past the raw line terminator.  */
820 void
821 _cpp_clean_line (cpp_reader *pfile)
822 {
823   cpp_buffer *buffer;
824   const uchar *s;
825   uchar c, *d, *p;
826 
827   buffer = pfile->buffer;
828   buffer->cur_note = buffer->notes_used = 0;
829   buffer->cur = buffer->line_base = buffer->next_line;
830   buffer->need_line = false;
831   s = buffer->next_line;
832 
833   if (!buffer->from_stage3)
834     {
835       const uchar *pbackslash = NULL;
836 
837       /* Fast path.  This is the common case of an un-escaped line with
838 	 no trigraphs.  The primary win here is by not writing any
839 	 data back to memory until we have to.  */
840       while (1)
841 	{
842 	  /* Perform an optimized search for \n, \r, \\, ?.  */
843 	  s = search_line_fast (s, buffer->rlimit);
844 
845 	  c = *s;
846 	  if (c == '\\')
847 	    {
848 	      /* Record the location of the backslash and continue.  */
849 	      pbackslash = s++;
850 	    }
851 	  else if (__builtin_expect (c == '?', 0))
852 	    {
853 	      if (__builtin_expect (s[1] == '?', false)
854 		   && _cpp_trigraph_map[s[2]])
855 		{
856 		  /* Have a trigraph.  We may or may not have to convert
857 		     it.  Add a line note regardless, for -Wtrigraphs.  */
858 		  add_line_note (buffer, s, s[2]);
859 		  if (CPP_OPTION (pfile, trigraphs))
860 		    {
861 		      /* We do, and that means we have to switch to the
862 		         slow path.  */
863 		      d = (uchar *) s;
864 		      *d = _cpp_trigraph_map[s[2]];
865 		      s += 2;
866 		      goto slow_path;
867 		    }
868 		}
869 	      /* Not a trigraph.  Continue on fast-path.  */
870 	      s++;
871 	    }
872 	  else
873 	    break;
874 	}
875 
876       /* This must be \r or \n.  We're either done, or we'll be forced
877 	 to write back to the buffer and continue on the slow path.  */
878       d = (uchar *) s;
879 
880       if (__builtin_expect (s == buffer->rlimit, false))
881 	goto done;
882 
883       /* DOS line ending? */
884       if (__builtin_expect (c == '\r', false) && s[1] == '\n')
885 	{
886 	  s++;
887 	  if (s == buffer->rlimit)
888 	    goto done;
889 	}
890 
      /* Without a backslash on this line it cannot be escaped.  */
891       if (__builtin_expect (pbackslash == NULL, true))
892 	goto done;
893 
894       /* Check for escaped newline.  */
      /* Only the most recently seen backslash can escape the newline,
	 and only if nothing but horizontal space follows it.  */
895       p = d;
896       while (is_nvspace (p[-1]))
897 	p--;
898       if (p - 1 != pbackslash)
899 	goto done;
900 
901       /* Have an escaped newline; process it and proceed to
902 	 the slow path.  */
      /* The note type is ' ' when space separated the backslash from
	 the newline and '\\' otherwise.  D is left one short of the
	 backslash because the slow path pre-increments it.  */
903       add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
904       d = p - 2;
905       buffer->next_line = p - 1;
906 
      /* From here on S reads raw input and D writes the cleaned line
	 into the same buffer.  */
907     slow_path:
908       while (1)
909 	{
910 	  c = *++s;
911 	  *++d = c;
912 
913 	  if (c == '\n' || c == '\r')
914 	    {
915 	      /* Handle DOS line endings.  */
916 	      if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
917 		s++;
918 	      if (s == buffer->rlimit)
919 		break;
920 
921 	      /* Escaped?  */
	      /* buffer->next_line marks the previous join point; do not
		 scan back past it.  */
922 	      p = d;
923 	      while (p != buffer->next_line && is_nvspace (p[-1]))
924 		p--;
925 	      if (p == buffer->next_line || p[-1] != '\\')
926 		break;
927 
928 	      add_line_note (buffer, p - 1, p != d ? ' ': '\\');
929 	      d = p - 2;
930 	      buffer->next_line = p - 1;
931 	    }
932 	  else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
933 	    {
934 	      /* Add a note regardless, for the benefit of -Wtrigraphs.  */
935 	      add_line_note (buffer, d, s[2]);
936 	      if (CPP_OPTION (pfile, trigraphs))
937 		{
938 		  *d = _cpp_trigraph_map[s[2]];
939 		  s += 2;
940 		}
941 	    }
942 	}
943     }
944   else
945     {
      /* from_stage3 buffers only need the end of the line located; no
	 escaped-newline or trigraph processing is done for them.  */
946       while (*s != '\n' && *s != '\r')
947 	s++;
948       d = (uchar *) s;
949 
950       /* Handle DOS line endings.  */
951       if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
952 	s++;
953     }
954 
955  done:
  /* Terminate the cleaned line and step past the raw terminator.  */
956   *d = '\n';
957   /* A sentinel note that should never be processed.  */
958   add_line_note (buffer, d + 1, '\n');
959   buffer->next_line = s + 1;
960 }
961 
962 /* Return true if the trigraph indicated by NOTE should be warned
963    about in a comment.  */
964 static bool
965 warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
966 {
967   const uchar *p;
968 
969   /* Within comments we don't warn about trigraphs, unless the
970      trigraph forms an escaped newline, as that may change
971      behavior.  */
972   if (note->type != '/')
973     return false;
974 
975   /* If -trigraphs, then this was an escaped newline iff the next note
976      is coincident.  */
977   if (CPP_OPTION (pfile, trigraphs))
978     return note[1].pos == note->pos;
979 
980   /* Otherwise, see if this forms an escaped newline.  */
981   p = note->pos + 3;
982   while (is_nvspace (*p))
983     p++;
984 
985   /* There might have been escaped newlines between the trigraph and the
986      newline we found.  Hence the position test.  */
987   return (*p == '\n' && p < note[1].pos);
988 }
989 
990 /* Process the notes created by add_line_note as far as the current
991    location.  */
992 void
993 _cpp_process_line_notes (cpp_reader *pfile, int in_comment)
994 {
995   cpp_buffer *buffer = pfile->buffer;
996 
997   for (;;)
998     {
999       _cpp_line_note *note = &buffer->notes[buffer->cur_note];
1000       unsigned int col;
1001 
1002       if (note->pos > buffer->cur)
1003 	break;
1004 
1005       buffer->cur_note++;
1006       col = CPP_BUF_COLUMN (buffer, note->pos + 1);
1007 
1008       if (note->type == '\\' || note->type == ' ')
1009 	{
1010 	  if (note->type == ' ' && !in_comment)
1011 	    cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
1012 				 "backslash and newline separated by space");
1013 
1014 	  if (buffer->next_line > buffer->rlimit)
1015 	    {
1016 	      cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
1017 				   "backslash-newline at end of file");
1018 	      /* Prevent "no newline at end of file" warning.  */
1019 	      buffer->next_line = buffer->rlimit;
1020 	    }
1021 
1022 	  buffer->line_base = note->pos;
1023 	  CPP_INCREMENT_LINE (pfile, 0);
1024 	}
1025       else if (_cpp_trigraph_map[note->type])
1026 	{
1027 	  if (CPP_OPTION (pfile, warn_trigraphs)
1028 	      && (!in_comment || warn_in_comment (pfile, note)))
1029 	    {
1030 	      if (CPP_OPTION (pfile, trigraphs))
1031 		cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
1032                                        pfile->line_table->highest_line, col,
1033 				       "trigraph ??%c converted to %c",
1034 				       note->type,
1035 				       (int) _cpp_trigraph_map[note->type]);
1036 	      else
1037 		{
1038 		  cpp_warning_with_line
1039 		    (pfile, CPP_W_TRIGRAPHS,
1040                      pfile->line_table->highest_line, col,
1041 		     "trigraph ??%c ignored, use -trigraphs to enable",
1042 		     note->type);
1043 		}
1044 	    }
1045 	}
1046       else if (note->type == 0)
1047 	/* Already processed in lex_raw_string.  */;
1048       else
1049 	abort ();
1050     }
1051 }
1052 
1053 /* Skip a C-style block comment.  We find the end of the comment by
1054    seeing if an asterisk is before every '/' we encounter.  Returns
1055    nonzero if comment terminated by EOF, zero otherwise.
1056 
1057    Buffer->cur points to the initial asterisk of the comment.  */
1058 bool
1059 _cpp_skip_block_comment (cpp_reader *pfile)
1060 {
1061   cpp_buffer *buffer = pfile->buffer;
1062   const uchar *cur = buffer->cur;
1063   uchar c;
1064 
1065   cur++;
1066   if (*cur == '/')
1067     cur++;
1068 
1069   for (;;)
1070     {
1071       /* People like decorating comments with '*', so check for '/'
1072 	 instead for efficiency.  */
1073       c = *cur++;
1074 
1075       if (c == '/')
1076 	{
1077 	  if (cur[-2] == '*')
1078 	    break;
1079 
1080 	  /* Warn about potential nested comments, but not if the '/'
1081 	     comes immediately before the true comment delimiter.
1082 	     Don't bother to get it right across escaped newlines.  */
1083 	  if (CPP_OPTION (pfile, warn_comments)
1084 	      && cur[0] == '*' && cur[1] != '/')
1085 	    {
1086 	      buffer->cur = cur;
1087 	      cpp_warning_with_line (pfile, CPP_W_COMMENTS,
1088 				     pfile->line_table->highest_line,
1089 				     CPP_BUF_COL (buffer),
1090 				     "\"/*\" within comment");
1091 	    }
1092 	}
1093       else if (c == '\n')
1094 	{
1095 	  unsigned int cols;
1096 	  buffer->cur = cur - 1;
1097 	  _cpp_process_line_notes (pfile, true);
1098 	  if (buffer->next_line >= buffer->rlimit)
1099 	    return true;
1100 	  _cpp_clean_line (pfile);
1101 
1102 	  cols = buffer->next_line - buffer->line_base;
1103 	  CPP_INCREMENT_LINE (pfile, cols);
1104 
1105 	  cur = buffer->cur;
1106 	}
1107     }
1108 
1109   buffer->cur = cur;
1110   _cpp_process_line_notes (pfile, true);
1111   return false;
1112 }
1113 
1114 /* Skip a C++ line comment, leaving buffer->cur pointing to the
1115    terminating newline.  Handles escaped newlines.  Returns nonzero
1116    if a multiline comment.  */
1117 static int
1118 skip_line_comment (cpp_reader *pfile)
1119 {
1120   cpp_buffer *buffer = pfile->buffer;
1121   source_location orig_line = pfile->line_table->highest_line;
1122 
1123   while (*buffer->cur != '\n')
1124     buffer->cur++;
1125 
1126   _cpp_process_line_notes (pfile, true);
1127   return orig_line != pfile->line_table->highest_line;
1128 }
1129 
1130 /* Skips whitespace, saving the next non-whitespace character.  */
1131 static void
1132 skip_whitespace (cpp_reader *pfile, cppchar_t c)
1133 {
1134   cpp_buffer *buffer = pfile->buffer;
1135   bool saw_NUL = false;
1136 
1137   do
1138     {
1139       /* Horizontal space always OK.  */
1140       if (c == ' ' || c == '\t')
1141 	;
1142       /* Just \f \v or \0 left.  */
1143       else if (c == '\0')
1144 	saw_NUL = true;
1145       else if (pfile->state.in_directive && CPP_PEDANTIC (pfile))
1146 	cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line,
1147 			     CPP_BUF_COL (buffer),
1148 			     "%s in preprocessing directive",
1149 			     c == '\f' ? "form feed" : "vertical tab");
1150 
1151       c = *buffer->cur++;
1152     }
1153   /* We only want non-vertical space, i.e. ' ' \t \f \v \0.  */
1154   while (is_nvspace (c));
1155 
1156   if (saw_NUL)
1157     cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored");
1158 
1159   buffer->cur--;
1160 }
1161 
1162 /* See if the characters of a number token are valid in a name (no
1163    '.', '+' or '-').  */
1164 static int
1165 name_p (cpp_reader *pfile, const cpp_string *string)
1166 {
1167   unsigned int i;
1168 
1169   for (i = 0; i < string->len; i++)
1170     if (!is_idchar (string->text[i]))
1171       return 0;
1172 
1173   return 1;
1174 }
1175 
1176 /* After parsing an identifier or other sequence, produce a warning about
1177    sequences not in NFC/NFKC.  */
1178 static void
1179 warn_about_normalization (cpp_reader *pfile,
1180 			  const cpp_token *token,
1181 			  const struct normalize_state *s)
1182 {
1183   if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s)
1184       && !pfile->state.skipping)
1185     {
1186       /* Make sure that the token is printed using UCNs, even
1187 	 if we'd otherwise happily print UTF-8.  */
1188       unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token));
1189       size_t sz;
1190 
1191       sz = cpp_spell_token (pfile, token, buf, false) - buf;
1192       if (NORMALIZE_STATE_RESULT (s) == normalized_C)
1193 	cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1194 			       "`%.*s' is not in NFKC", (int) sz, buf);
1195       else
1196 	cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0,
1197 			       "`%.*s' is not in NFC", (int) sz, buf);
1198       free (buf);
1199     }
1200 }
1201 
1202 /* Returns TRUE if the sequence starting at buffer->cur is invalid in
1203    an identifier.  FIRST is TRUE if this starts an identifier.  */
1204 static bool
1205 forms_identifier_p (cpp_reader *pfile, int first,
1206 		    struct normalize_state *state)
1207 {
1208   cpp_buffer *buffer = pfile->buffer;
1209 
1210   if (*buffer->cur == '$')
1211     {
1212       if (!CPP_OPTION (pfile, dollars_in_ident))
1213 	return false;
1214 
1215       buffer->cur++;
1216       if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping)
1217 	{
1218 	  CPP_OPTION (pfile, warn_dollars) = 0;
1219 	  cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number");
1220 	}
1221 
1222       return true;
1223     }
1224 
1225   /* Is this a syntactically valid UCN?  */
1226   if (CPP_OPTION (pfile, extended_identifiers)
1227       && *buffer->cur == '\\'
1228       && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U'))
1229     {
1230       buffer->cur += 2;
1231       if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
1232 			  state))
1233 	return true;
1234       buffer->cur -= 2;
1235     }
1236 
1237   return false;
1238 }
1239 
1240 /* Helper function to get the cpp_hashnode of the identifier BASE.  */
1241 static cpp_hashnode *
1242 lex_identifier_intern (cpp_reader *pfile, const uchar *base)
1243 {
1244   cpp_hashnode *result;
1245   const uchar *cur;
1246   unsigned int len;
1247   unsigned int hash = HT_HASHSTEP (0, *base);
1248 
1249   cur = base + 1;
1250   while (ISIDNUM (*cur))
1251     {
1252       hash = HT_HASHSTEP (hash, *cur);
1253       cur++;
1254     }
1255   len = cur - base;
1256   hash = HT_HASHFINISH (hash, len);
1257   result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1258 					      base, len, hash, HT_ALLOC));
1259 
1260   /* Rarely, identifiers require diagnostics when lexed.  */
1261   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1262 			&& !pfile->state.skipping, 0))
1263     {
1264       /* It is allowed to poison the same identifier twice.  */
1265       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1266 	cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1267 		   NODE_NAME (result));
1268 
1269       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1270 	 replacement list of a variadic macro.  */
1271       if (result == pfile->spec_nodes.n__VA_ARGS__
1272 	  && !pfile->state.va_args_ok)
1273 	cpp_error (pfile, CPP_DL_PEDWARN,
1274 		   "__VA_ARGS__ can only appear in the expansion"
1275 		   " of a C99 variadic macro");
1276 
1277       /* For -Wc++-compat, warn about use of C++ named operators.  */
1278       if (result->flags & NODE_WARN_OPERATOR)
1279 	cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1280 		     "identifier \"%s\" is a special operator name in C++",
1281 		     NODE_NAME (result));
1282     }
1283 
1284   return result;
1285 }
1286 
1287 /* Get the cpp_hashnode of an identifier specified by NAME in
1288    the current cpp_reader object.  If none is found, NULL is returned.  */
1289 cpp_hashnode *
1290 _cpp_lex_identifier (cpp_reader *pfile, const char *name)
1291 {
1292   cpp_hashnode *result;
1293   result = lex_identifier_intern (pfile, (uchar *) name);
1294   return result;
1295 }
1296 
1297 /* Lex an identifier starting at BUFFER->CUR - 1.  */
1298 static cpp_hashnode *
1299 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn,
1300 		struct normalize_state *nst)
1301 {
1302   cpp_hashnode *result;
1303   const uchar *cur;
1304   unsigned int len;
1305   unsigned int hash = HT_HASHSTEP (0, *base);
1306 
1307   cur = pfile->buffer->cur;
1308   if (! starts_ucn)
1309     while (ISIDNUM (*cur))
1310       {
1311 	hash = HT_HASHSTEP (hash, *cur);
1312 	cur++;
1313       }
1314   pfile->buffer->cur = cur;
1315   if (starts_ucn || forms_identifier_p (pfile, false, nst))
1316     {
1317       /* Slower version for identifiers containing UCNs (or $).  */
1318       do {
1319 	while (ISIDNUM (*pfile->buffer->cur))
1320 	  {
1321 	    pfile->buffer->cur++;
1322 	    NORMALIZE_STATE_UPDATE_IDNUM (nst);
1323 	  }
1324       } while (forms_identifier_p (pfile, false, nst));
1325       result = _cpp_interpret_identifier (pfile, base,
1326 					  pfile->buffer->cur - base);
1327     }
1328   else
1329     {
1330       len = cur - base;
1331       hash = HT_HASHFINISH (hash, len);
1332 
1333       result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
1334 						  base, len, hash, HT_ALLOC));
1335     }
1336 
1337   /* Rarely, identifiers require diagnostics when lexed.  */
1338   if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC)
1339 			&& !pfile->state.skipping, 0))
1340     {
1341       /* It is allowed to poison the same identifier twice.  */
1342       if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
1343 	cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
1344 		   NODE_NAME (result));
1345 
1346       /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
1347 	 replacement list of a variadic macro.  */
1348       if (result == pfile->spec_nodes.n__VA_ARGS__
1349 	  && !pfile->state.va_args_ok)
1350 	cpp_error (pfile, CPP_DL_PEDWARN,
1351 		   "__VA_ARGS__ can only appear in the expansion"
1352 		   " of a C99 variadic macro");
1353 
1354       /* For -Wc++-compat, warn about use of C++ named operators.  */
1355       if (result->flags & NODE_WARN_OPERATOR)
1356 	cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
1357 		     "identifier \"%s\" is a special operator name in C++",
1358 		     NODE_NAME (result));
1359     }
1360 
1361   return result;
1362 }
1363 
1364 /* Lex a number to NUMBER starting at BUFFER->CUR - 1.  */
1365 static void
1366 lex_number (cpp_reader *pfile, cpp_string *number,
1367 	    struct normalize_state *nst)
1368 {
1369   const uchar *cur;
1370   const uchar *base;
1371   uchar *dest;
1372 
1373   base = pfile->buffer->cur - 1;
1374   do
1375     {
1376       cur = pfile->buffer->cur;
1377 
1378       /* N.B. ISIDNUM does not include $.  */
1379       while (ISIDNUM (*cur) || *cur == '.' || VALID_SIGN (*cur, cur[-1]))
1380 	{
1381 	  cur++;
1382 	  NORMALIZE_STATE_UPDATE_IDNUM (nst);
1383 	}
1384 
1385       pfile->buffer->cur = cur;
1386     }
1387   while (forms_identifier_p (pfile, false, nst));
1388 
1389   number->len = cur - base;
1390   dest = _cpp_unaligned_alloc (pfile, number->len + 1);
1391   memcpy (dest, base, number->len);
1392   dest[number->len] = '\0';
1393   number->text = dest;
1394 }
1395 
1396 /* Create a token of type TYPE with a literal spelling.  */
1397 static void
1398 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base,
1399 		unsigned int len, enum cpp_ttype type)
1400 {
1401   uchar *dest = _cpp_unaligned_alloc (pfile, len + 1);
1402 
1403   memcpy (dest, base, len);
1404   dest[len] = '\0';
1405   token->type = type;
1406   token->val.str.len = len;
1407   token->val.str.text = dest;
1408 }
1409 
1410 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer
1411    sequence from *FIRST_BUFF_P to LAST_BUFF_P.  */
1412 
1413 static void
1414 bufring_append (cpp_reader *pfile, const uchar *base, size_t len,
1415 		_cpp_buff **first_buff_p, _cpp_buff **last_buff_p)
1416 {
1417   _cpp_buff *first_buff = *first_buff_p;
1418   _cpp_buff *last_buff = *last_buff_p;
1419 
1420   if (first_buff == NULL)
1421     first_buff = last_buff = _cpp_get_buff (pfile, len);
1422   else if (len > BUFF_ROOM (last_buff))
1423     {
1424       size_t room = BUFF_ROOM (last_buff);
1425       memcpy (BUFF_FRONT (last_buff), base, room);
1426       BUFF_FRONT (last_buff) += room;
1427       base += room;
1428       len -= room;
1429       last_buff = _cpp_append_extend_buff (pfile, last_buff, len);
1430     }
1431 
1432   memcpy (BUFF_FRONT (last_buff), base, len);
1433   BUFF_FRONT (last_buff) += len;
1434 
1435   *first_buff_p = first_buff;
1436   *last_buff_p = last_buff;
1437 }
1438 
1439 /* Lexes a raw string.  The stored string contains the spelling, including
1440    double quotes, delimiter string, '(' and ')', any leading
1441    'L', 'u', 'U' or 'u8' and 'R' modifier.  It returns the type of the
1442    literal, or CPP_OTHER if it was not properly terminated.
1443 
1444    The spelling is NUL-terminated, but it is not guaranteed that this
1445    is the first NUL since embedded NULs are preserved.  */
1446 
1447 static void
1448 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base,
1449 		const uchar *cur)
1450 {
1451   const uchar *raw_prefix;
1452   unsigned int raw_prefix_len = 0;
1453   enum cpp_ttype type;
1454   size_t total_len = 0;
1455   _cpp_buff *first_buff = NULL, *last_buff = NULL;
1456   _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note];
1457 
1458   type = (*base == 'L' ? CPP_WSTRING :
1459 	  *base == 'U' ? CPP_STRING32 :
1460 	  *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1461 	  : CPP_STRING);
1462 
1463   raw_prefix = cur + 1;
1464   while (raw_prefix_len < 16)
1465     {
1466       switch (raw_prefix[raw_prefix_len])
1467 	{
1468 	case ' ': case '(': case ')': case '\\': case '\t':
1469 	case '\v': case '\f': case '\n': default:
1470 	  break;
1471 	/* Basic source charset except the above chars.  */
1472 	case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1473 	case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1474 	case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1475 	case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1476 	case 'y': case 'z':
1477 	case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1478 	case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1479 	case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1480 	case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1481 	case 'Y': case 'Z':
1482 	case '0': case '1': case '2': case '3': case '4': case '5':
1483 	case '6': case '7': case '8': case '9':
1484 	case '_': case '{': case '}': case '#': case '[': case ']':
1485 	case '<': case '>': case '%': case ':': case ';': case '.':
1486 	case '?': case '*': case '+': case '-': case '/': case '^':
1487 	case '&': case '|': case '~': case '!': case '=': case ',':
1488 	case '"': case '\'':
1489 	  raw_prefix_len++;
1490 	  continue;
1491 	}
1492       break;
1493     }
1494 
1495   if (raw_prefix[raw_prefix_len] != '(')
1496     {
1497       int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len)
1498 		+ 1;
1499       if (raw_prefix_len == 16)
1500 	cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1501 			     "raw string delimiter longer than 16 characters");
1502       else
1503 	cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col,
1504 			     "invalid character '%c' in raw string delimiter",
1505 			     (int) raw_prefix[raw_prefix_len]);
1506       pfile->buffer->cur = raw_prefix - 1;
1507       create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER);
1508       return;
1509     }
1510 
1511   cur = raw_prefix + raw_prefix_len + 1;
1512   for (;;)
1513     {
1514 #define BUF_APPEND(STR,LEN)					\
1515       do {							\
1516 	bufring_append (pfile, (const uchar *)(STR), (LEN),	\
1517 			&first_buff, &last_buff);		\
1518 	total_len += (LEN);					\
1519       } while (0);
1520 
1521       cppchar_t c;
1522 
1523       /* If we previously performed any trigraph or line splicing
1524 	 transformations, undo them within the body of the raw string.  */
1525       while (note->pos < cur)
1526 	++note;
1527       for (; note->pos == cur; ++note)
1528 	{
1529 	  switch (note->type)
1530 	    {
1531 	    case '\\':
1532 	    case ' ':
1533 	      /* Restore backslash followed by newline.  */
1534 	      BUF_APPEND (base, cur - base);
1535 	      base = cur;
1536 	      BUF_APPEND ("\\", 1);
1537 	    after_backslash:
1538 	      if (note->type == ' ')
1539 		{
1540 		  /* GNU backslash whitespace newline extension.  FIXME
1541 		     could be any sequence of non-vertical space.  When we
1542 		     can properly restore any such sequence, we should mark
1543 		     this note as handled so _cpp_process_line_notes
1544 		     doesn't warn.  */
1545 		  BUF_APPEND (" ", 1);
1546 		}
1547 
1548 	      BUF_APPEND ("\n", 1);
1549 	      break;
1550 
1551 	    case 0:
1552 	      /* Already handled.  */
1553 	      break;
1554 
1555 	    default:
1556 	      if (_cpp_trigraph_map[note->type])
1557 		{
1558 		  /* Don't warn about this trigraph in
1559 		     _cpp_process_line_notes, since trigraphs show up as
1560 		     trigraphs in raw strings.  */
1561 		  uchar type = note->type;
1562 		  note->type = 0;
1563 
1564 		  if (!CPP_OPTION (pfile, trigraphs))
1565 		    /* If we didn't convert the trigraph in the first
1566 		       place, don't do anything now either.  */
1567 		    break;
1568 
1569 		  BUF_APPEND (base, cur - base);
1570 		  base = cur;
1571 		  BUF_APPEND ("??", 2);
1572 
1573 		  /* ??/ followed by newline gets two line notes, one for
1574 		     the trigraph and one for the backslash/newline.  */
1575 		  if (type == '/' && note[1].pos == cur)
1576 		    {
1577 		      if (note[1].type != '\\'
1578 			  && note[1].type != ' ')
1579 			abort ();
1580 		      BUF_APPEND ("/", 1);
1581 		      ++note;
1582 		      goto after_backslash;
1583 		    }
1584 		  /* The ) from ??) could be part of the suffix.  */
1585 		  else if (type == ')'
1586 			   && strncmp ((const char *) cur+1,
1587 				       (const char *) raw_prefix,
1588 				       raw_prefix_len) == 0
1589 			   && cur[raw_prefix_len+1] == '"')
1590 		    {
1591 		      BUF_APPEND (")", 1);
1592 		      base++;
1593 		      cur += raw_prefix_len + 2;
1594 		      goto break_outer_loop;
1595 		    }
1596 		  else
1597 		    {
1598 		      /* Skip the replacement character.  */
1599 		      base = ++cur;
1600 		      BUF_APPEND (&type, 1);
1601 		    }
1602 		}
1603 	      else
1604 		abort ();
1605 	      break;
1606 	    }
1607 	}
1608       c = *cur++;
1609 
1610       if (c == ')'
1611 	  && strncmp ((const char *) cur, (const char *) raw_prefix,
1612 		      raw_prefix_len) == 0
1613 	  && cur[raw_prefix_len] == '"')
1614 	{
1615 	  cur += raw_prefix_len + 1;
1616 	  break;
1617 	}
1618       else if (c == '\n')
1619 	{
1620 	  if (pfile->state.in_directive
1621 	      || pfile->state.parsing_args
1622 	      || pfile->state.in_deferred_pragma)
1623 	    {
1624 	      cur--;
1625 	      type = CPP_OTHER;
1626 	      cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0,
1627 				   "unterminated raw string");
1628 	      break;
1629 	    }
1630 
1631 	  BUF_APPEND (base, cur - base);
1632 
1633 	  if (pfile->buffer->cur < pfile->buffer->rlimit)
1634 	    CPP_INCREMENT_LINE (pfile, 0);
1635 	  pfile->buffer->need_line = true;
1636 
1637 	  pfile->buffer->cur = cur-1;
1638 	  _cpp_process_line_notes (pfile, false);
1639 	  if (!_cpp_get_fresh_line (pfile))
1640 	    {
1641 	      source_location src_loc = token->src_loc;
1642 	      token->type = CPP_EOF;
1643 	      /* Tell the compiler the line number of the EOF token.  */
1644 	      token->src_loc = pfile->line_table->highest_line;
1645 	      token->flags = BOL;
1646 	      if (first_buff != NULL)
1647 		_cpp_release_buff (pfile, first_buff);
1648 	      cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0,
1649 				   "unterminated raw string");
1650 	      return;
1651 	    }
1652 
1653 	  cur = base = pfile->buffer->cur;
1654 	  note = &pfile->buffer->notes[pfile->buffer->cur_note];
1655 	}
1656     }
1657  break_outer_loop:
1658 
1659   if (CPP_OPTION (pfile, user_literals))
1660     {
1661       /* According to C++11 [lex.ext]p10, a ud-suffix not starting with an
1662 	 underscore is ill-formed.  Since this breaks programs using macros
1663 	 from inttypes.h, we generate a warning and treat the ud-suffix as a
1664 	 separate preprocessing token.  This approach is under discussion by
1665 	 the standards committee, and has been adopted as a conforming
1666 	 extension by other front ends such as clang.
1667          A special exception is made for the suffix 's' which will be
1668 	 standardized as a user-defined literal suffix for strings.  */
1669       if (ISALPHA (*cur) && *cur != 's')
1670 	{
1671 	  /* Raise a warning, but do not consume subsequent tokens.  */
1672 	  if (CPP_OPTION (pfile, warn_literal_suffix))
1673 	    cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1674 				   token->src_loc, 0,
1675 				   "invalid suffix on literal; C++11 requires "
1676 				   "a space between literal and identifier");
1677 	}
1678       /* Grab user defined literal suffix.  */
1679       else if (ISIDST (*cur))
1680 	{
1681 	  type = cpp_userdef_string_add_type (type);
1682 	  ++cur;
1683 
1684 	  while (ISIDNUM (*cur))
1685 	    ++cur;
1686 	}
1687     }
1688 
1689   pfile->buffer->cur = cur;
1690   if (first_buff == NULL)
1691     create_literal (pfile, token, base, cur - base, type);
1692   else
1693     {
1694       uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1);
1695 
1696       token->type = type;
1697       token->val.str.len = total_len + (cur - base);
1698       token->val.str.text = dest;
1699       last_buff = first_buff;
1700       while (last_buff != NULL)
1701 	{
1702 	  memcpy (dest, last_buff->base,
1703 		  BUFF_FRONT (last_buff) - last_buff->base);
1704 	  dest += BUFF_FRONT (last_buff) - last_buff->base;
1705 	  last_buff = last_buff->next;
1706 	}
1707       _cpp_release_buff (pfile, first_buff);
1708       memcpy (dest, base, cur - base);
1709       dest[cur - base] = '\0';
1710     }
1711 }
1712 
1713 /* Lexes a string, character constant, or angle-bracketed header file
1714    name.  The stored string contains the spelling, including opening
1715    quote and any leading 'L', 'u', 'U' or 'u8' and optional
1716    'R' modifier.  It returns the type of the literal, or CPP_OTHER
1717    if it was not properly terminated, or CPP_LESS for an unterminated
1718    header name which must be relexed as normal tokens.
1719 
1720    The spelling is NUL-terminated, but it is not guaranteed that this
1721    is the first NUL since embedded NULs are preserved.  */
1722 static void
1723 lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
1724 {
1725   bool saw_NUL = false;
1726   const uchar *cur;
1727   cppchar_t terminator;
1728   enum cpp_ttype type;
1729 
1730   cur = base;
1731   terminator = *cur++;
1732   if (terminator == 'L' || terminator == 'U')
1733     terminator = *cur++;
1734   else if (terminator == 'u')
1735     {
1736       terminator = *cur++;
1737       if (terminator == '8')
1738 	terminator = *cur++;
1739     }
1740   if (terminator == 'R')
1741     {
1742       lex_raw_string (pfile, token, base, cur);
1743       return;
1744     }
1745   if (terminator == '"')
1746     type = (*base == 'L' ? CPP_WSTRING :
1747 	    *base == 'U' ? CPP_STRING32 :
1748 	    *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16)
1749 			 : CPP_STRING);
1750   else if (terminator == '\'')
1751     type = (*base == 'L' ? CPP_WCHAR :
1752 	    *base == 'U' ? CPP_CHAR32 :
1753 	    *base == 'u' ? CPP_CHAR16 : CPP_CHAR);
1754   else
1755     terminator = '>', type = CPP_HEADER_NAME;
1756 
1757   for (;;)
1758     {
1759       cppchar_t c = *cur++;
1760 
1761       /* In #include-style directives, terminators are not escapable.  */
1762       if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
1763 	cur++;
1764       else if (c == terminator)
1765 	break;
1766       else if (c == '\n')
1767 	{
1768 	  cur--;
1769 	  /* Unmatched quotes always yield undefined behavior, but
1770 	     greedy lexing means that what appears to be an unterminated
1771 	     header name may actually be a legitimate sequence of tokens.  */
1772 	  if (terminator == '>')
1773 	    {
1774 	      token->type = CPP_LESS;
1775 	      return;
1776 	    }
1777 	  type = CPP_OTHER;
1778 	  break;
1779 	}
1780       else if (c == '\0')
1781 	saw_NUL = true;
1782     }
1783 
1784   if (saw_NUL && !pfile->state.skipping)
1785     cpp_error (pfile, CPP_DL_WARNING,
1786 	       "null character(s) preserved in literal");
1787 
1788   if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
1789     cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
1790 	       (int) terminator);
1791 
1792   if (CPP_OPTION (pfile, user_literals))
1793     {
1794       /* According to C++11 [lex.ext]p10, a ud-suffix not starting with an
1795 	 underscore is ill-formed.  Since this breaks programs using macros
1796 	 from inttypes.h, we generate a warning and treat the ud-suffix as a
1797 	 separate preprocessing token.  This approach is under discussion by
1798 	 the standards committee, and has been adopted as a conforming
1799 	 extension by other front ends such as clang.
1800          A special exception is made for the suffix 's' which will be
1801 	 standardized as a user-defined literal suffix for strings.  */
1802       if (ISALPHA (*cur) && *cur != 's')
1803 	{
1804 	  /* Raise a warning, but do not consume subsequent tokens.  */
1805 	  if (CPP_OPTION (pfile, warn_literal_suffix))
1806 	    cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
1807 				   token->src_loc, 0,
1808 				   "invalid suffix on literal; C++11 requires "
1809 				   "a space between literal and identifier");
1810 	}
1811       /* Grab user defined literal suffix.  */
1812       else if (ISIDST (*cur))
1813 	{
1814 	  type = cpp_userdef_char_add_type (type);
1815 	  type = cpp_userdef_string_add_type (type);
1816           ++cur;
1817 
1818 	  while (ISIDNUM (*cur))
1819 	    ++cur;
1820 	}
1821     }
1822 
1823   pfile->buffer->cur = cur;
1824   create_literal (pfile, token, base, cur - base, type);
1825 }
1826 
1827 /* Return the comment table. The client may not make any assumption
1828    about the ordering of the table.  */
1829 cpp_comment_table *
1830 cpp_get_comments (cpp_reader *pfile)
1831 {
1832   return &pfile->comments;
1833 }
1834 
1835 /* Append a comment to the end of the comment table. */
1836 static void
1837 store_comment (cpp_reader *pfile, cpp_token *token)
1838 {
1839   int len;
1840 
1841   if (pfile->comments.allocated == 0)
1842     {
1843       pfile->comments.allocated = 256;
1844       pfile->comments.entries = (cpp_comment *) xmalloc
1845 	(pfile->comments.allocated * sizeof (cpp_comment));
1846     }
1847 
1848   if (pfile->comments.count == pfile->comments.allocated)
1849     {
1850       pfile->comments.allocated *= 2;
1851       pfile->comments.entries = (cpp_comment *) xrealloc
1852 	(pfile->comments.entries,
1853 	 pfile->comments.allocated * sizeof (cpp_comment));
1854     }
1855 
1856   len = token->val.str.len;
1857 
1858   /* Copy comment. Note, token may not be NULL terminated. */
1859   pfile->comments.entries[pfile->comments.count].comment =
1860     (char *) xmalloc (sizeof (char) * (len + 1));
1861   memcpy (pfile->comments.entries[pfile->comments.count].comment,
1862 	  token->val.str.text, len);
1863   pfile->comments.entries[pfile->comments.count].comment[len] = '\0';
1864 
1865   /* Set source location. */
1866   pfile->comments.entries[pfile->comments.count].sloc = token->src_loc;
1867 
1868   /* Increment the count of entries in the comment table. */
1869   pfile->comments.count++;
1870 }
1871 
1872 /* The stored comment includes the comment start and any terminator.  */
1873 static void
1874 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from,
1875 	      cppchar_t type)
1876 {
1877   unsigned char *buffer;
1878   unsigned int len, clen, i;
1879   int convert_to_c = (pfile->state.in_directive || pfile->state.parsing_args)
1880     && type == '/';
1881 
1882   len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'.  */
1883 
1884   /* C++ comments probably (not definitely) have moved past a new
1885      line, which we don't want to save in the comment.  */
1886   if (is_vspace (pfile->buffer->cur[-1]))
1887     len--;
1888 
1889   /* If we are currently in a directive or in argument parsing, then
1890      we need to store all C++ comments as C comments internally, and
1891      so we need to allocate a little extra space in that case.
1892 
1893      Note that the only time we encounter a directive here is
1894      when we are saving comments in a "#define".  */
1895   clen = convert_to_c ? len + 2 : len;
1896 
1897   buffer = _cpp_unaligned_alloc (pfile, clen);
1898 
1899   token->type = CPP_COMMENT;
1900   token->val.str.len = clen;
1901   token->val.str.text = buffer;
1902 
1903   buffer[0] = '/';
1904   memcpy (buffer + 1, from, len - 1);
1905 
1906   /* Finish conversion to a C comment, if necessary.  */
1907   if (convert_to_c)
1908     {
1909       buffer[1] = '*';
1910       buffer[clen - 2] = '*';
1911       buffer[clen - 1] = '/';
1912       /* As there can be in a C++ comments illegal sequences for C comments
1913          we need to filter them out.  */
1914       for (i = 2; i < (clen - 2); i++)
1915         if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*'))
1916           buffer[i] = '|';
1917     }
1918 
1919   /* Finally store this comment for use by clients of libcpp. */
1920   store_comment (pfile, token);
1921 }
1922 
1923 /* Allocate COUNT tokens for RUN.  */
1924 void
1925 _cpp_init_tokenrun (tokenrun *run, unsigned int count)
1926 {
1927   run->base = XNEWVEC (cpp_token, count);
1928   run->limit = run->base + count;
1929   run->next = NULL;
1930 }
1931 
1932 /* Returns the next tokenrun, or creates one if there is none.  */
1933 static tokenrun *
1934 next_tokenrun (tokenrun *run)
1935 {
1936   if (run->next == NULL)
1937     {
1938       run->next = XNEW (tokenrun);
1939       run->next->prev = run;
1940       _cpp_init_tokenrun (run->next, 250);
1941     }
1942 
1943   return run->next;
1944 }
1945 
1946 /* Return the number of not yet processed token in a given
1947    context.  */
1948 int
1949 _cpp_remaining_tokens_num_in_context (cpp_context *context)
1950 {
1951   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1952     return (LAST (context).token - FIRST (context).token);
1953   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1954 	   || context->tokens_kind == TOKENS_KIND_EXTENDED)
1955     return (LAST (context).ptoken - FIRST (context).ptoken);
1956   else
1957       abort ();
1958 }
1959 
1960 /* Returns the token present at index INDEX in a given context.  If
1961    INDEX is zero, the next token to be processed is returned.  */
1962 static const cpp_token*
1963 _cpp_token_from_context_at (cpp_context *context, int index)
1964 {
1965   if (context->tokens_kind == TOKENS_KIND_DIRECT)
1966     return &(FIRST (context).token[index]);
1967   else if (context->tokens_kind == TOKENS_KIND_INDIRECT
1968 	   || context->tokens_kind == TOKENS_KIND_EXTENDED)
1969     return FIRST (context).ptoken[index];
1970  else
1971    abort ();
1972 }
1973 
1974 /* Look ahead in the input stream.  */
1975 const cpp_token *
1976 cpp_peek_token (cpp_reader *pfile, int index)
1977 {
1978   cpp_context *context = pfile->context;
1979   const cpp_token *peektok;
1980   int count;
1981 
1982   /* First, scan through any pending cpp_context objects.  */
1983   while (context->prev)
1984     {
1985       ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context);
1986 
1987       if (index < (int) sz)
1988         return _cpp_token_from_context_at (context, index);
1989       index -= (int) sz;
1990       context = context->prev;
1991     }
1992 
1993   /* We will have to read some new tokens after all (and do so
1994      without invalidating preceding tokens).  */
1995   count = index;
1996   pfile->keep_tokens++;
1997 
1998   do
1999     {
2000       peektok = _cpp_lex_token (pfile);
2001       if (peektok->type == CPP_EOF)
2002 	return peektok;
2003     }
2004   while (index--);
2005 
2006   _cpp_backup_tokens_direct (pfile, count + 1);
2007   pfile->keep_tokens--;
2008 
2009   return peektok;
2010 }
2011 
2012 /* Allocate a single token that is invalidated at the same time as the
2013    rest of the tokens on the line.  Has its line and col set to the
2014    same as the last lexed token, so that diagnostics appear in the
2015    right place.  */
2016 cpp_token *
2017 _cpp_temp_token (cpp_reader *pfile)
2018 {
2019   cpp_token *old, *result;
2020   ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token;
2021   ptrdiff_t la = (ptrdiff_t) pfile->lookaheads;
2022 
2023   old = pfile->cur_token - 1;
2024   /* Any pre-existing lookaheads must not be clobbered.  */
2025   if (la)
2026     {
2027       if (sz <= la)
2028         {
2029           tokenrun *next = next_tokenrun (pfile->cur_run);
2030 
2031           if (sz < la)
2032             memmove (next->base + 1, next->base,
2033                      (la - sz) * sizeof (cpp_token));
2034 
2035           next->base[0] = pfile->cur_run->limit[-1];
2036         }
2037 
2038       if (sz > 1)
2039         memmove (pfile->cur_token + 1, pfile->cur_token,
2040                  MIN (la, sz - 1) * sizeof (cpp_token));
2041     }
2042 
2043   if (!sz && pfile->cur_token == pfile->cur_run->limit)
2044     {
2045       pfile->cur_run = next_tokenrun (pfile->cur_run);
2046       pfile->cur_token = pfile->cur_run->base;
2047     }
2048 
2049   result = pfile->cur_token++;
2050   result->src_loc = old->src_loc;
2051   return result;
2052 }
2053 
2054 /* Lex a token into RESULT (external interface).  Takes care of issues
2055    like directive handling, token lookahead, multiple include
2056    optimization and skipping.  */
2057 const cpp_token *
2058 _cpp_lex_token (cpp_reader *pfile)
2059 {
2060   cpp_token *result;
2061 
2062   for (;;)
2063     {
2064       if (pfile->cur_token == pfile->cur_run->limit)
2065 	{
2066 	  pfile->cur_run = next_tokenrun (pfile->cur_run);
2067 	  pfile->cur_token = pfile->cur_run->base;
2068 	}
2069       /* We assume that the current token is somewhere in the current
2070 	 run.  */
2071       if (pfile->cur_token < pfile->cur_run->base
2072 	  || pfile->cur_token >= pfile->cur_run->limit)
2073 	abort ();
2074 
2075       if (pfile->lookaheads)
2076 	{
2077 	  pfile->lookaheads--;
2078 	  result = pfile->cur_token++;
2079 	}
2080       else
2081 	result = _cpp_lex_direct (pfile);
2082 
2083       if (result->flags & BOL)
2084 	{
2085 	  /* Is this a directive.  If _cpp_handle_directive returns
2086 	     false, it is an assembler #.  */
2087 	  if (result->type == CPP_HASH
2088 	      /* 6.10.3 p 11: Directives in a list of macro arguments
2089 		 gives undefined behavior.  This implementation
2090 		 handles the directive as normal.  */
2091 	      && pfile->state.parsing_args != 1)
2092 	    {
2093 	      if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE))
2094 		{
2095 		  if (pfile->directive_result.type == CPP_PADDING)
2096 		    continue;
2097 		  result = &pfile->directive_result;
2098 		}
2099 	    }
2100 	  else if (pfile->state.in_deferred_pragma)
2101 	    result = &pfile->directive_result;
2102 
2103 	  if (pfile->cb.line_change && !pfile->state.skipping)
2104 	    pfile->cb.line_change (pfile, result, pfile->state.parsing_args);
2105 	}
2106 
2107       /* We don't skip tokens in directives.  */
2108       if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
2109 	break;
2110 
2111       /* Outside a directive, invalidate controlling macros.  At file
2112 	 EOF, _cpp_lex_direct takes care of popping the buffer, so we never
2113 	 get here and MI optimization works.  */
2114       pfile->mi_valid = false;
2115 
2116       if (!pfile->state.skipping || result->type == CPP_EOF)
2117 	break;
2118     }
2119 
2120   return result;
2121 }
2122 
2123 /* Returns true if a fresh line has been loaded.  */
2124 bool
2125 _cpp_get_fresh_line (cpp_reader *pfile)
2126 {
2127   int return_at_eof;
2128 
2129   /* We can't get a new line until we leave the current directive.  */
2130   if (pfile->state.in_directive)
2131     return false;
2132 
2133   for (;;)
2134     {
2135       cpp_buffer *buffer = pfile->buffer;
2136 
2137       if (!buffer->need_line)
2138 	return true;
2139 
2140       if (buffer->next_line < buffer->rlimit)
2141 	{
2142 	  _cpp_clean_line (pfile);
2143 	  return true;
2144 	}
2145 
2146       /* First, get out of parsing arguments state.  */
2147       if (pfile->state.parsing_args)
2148 	return false;
2149 
2150       /* End of buffer.  Non-empty files should end in a newline.  */
2151       if (buffer->buf != buffer->rlimit
2152 	  && buffer->next_line > buffer->rlimit
2153 	  && !buffer->from_stage3)
2154 	{
2155 	  /* Clip to buffer size.  */
2156 	  buffer->next_line = buffer->rlimit;
2157 	}
2158 
2159       return_at_eof = buffer->return_at_eof;
2160       _cpp_pop_buffer (pfile);
2161       if (pfile->buffer == NULL || return_at_eof)
2162 	return false;
2163     }
2164 }
2165 
/* Helper for _cpp_lex_direct: if the next unread character is CHAR,
   consume it and give the token THEN_TYPE; otherwise give it
   ELSE_TYPE.  Relies on `buffer' and `result' being in scope.  */
#define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE)		\
  do							\
    {							\
      if (*buffer->cur == (CHAR))			\
	{						\
	  buffer->cur++;				\
	  result->type = (THEN_TYPE);			\
	}						\
      else						\
	result->type = (ELSE_TYPE);			\
    }							\
  while (0)
2174 
/* Lex a token into pfile->cur_token, which is also incremented, to
   get diagnostics pointing to the correct location.

   Does not handle issues such as token lookahead, multiple-include
   optimization, directives, skipping etc.  This function is only
   suitable for use by _cpp_lex_token, and in special cases like
   lex_expansion_token which doesn't care for any of these issues.

   When meeting a newline, returns CPP_EOF if parsing a directive,
   otherwise returns to the start of the token buffer if permissible.
   Returns a pointer to the lexed token; its src_loc field holds the
   token's location.  */
cpp_token *
_cpp_lex_direct (cpp_reader *pfile)
{
  cppchar_t c;
  cpp_buffer *buffer;
  const unsigned char *comment_start;
  /* Claim the next token slot.  RESULT may be redirected to the start
     of the base run below when a fresh line is begun.  */
  cpp_token *result = pfile->cur_token++;

  /* Entered initially, and again each time a newline has been
     consumed.  */
 fresh_line:
  result->flags = 0;
  buffer = pfile->buffer;
  if (buffer->need_line)
    {
      /* The line of a deferred pragma has ended: emit its terminator
	 token and leave deferred-pragma mode.  */
      if (pfile->state.in_deferred_pragma)
	{
	  result->type = CPP_PRAGMA_EOL;
	  pfile->state.in_deferred_pragma = false;
	  if (!pfile->state.pragma_allow_expansion)
	    pfile->state.prevent_expansion--;
	  return result;
	}
      /* No further line can be supplied right now: report CPP_EOF.  */
      if (!_cpp_get_fresh_line (pfile))
	{
	  result->type = CPP_EOF;
	  if (!pfile->state.in_directive)
	    {
	      /* Tell the compiler the line number of the EOF token.  */
	      result->src_loc = pfile->line_table->highest_line;
	      result->flags = BOL;
	    }
	  return result;
	}
      /* Unless earlier tokens must be kept, restart token storage at
	 the beginning of the base run.  */
      if (!pfile->keep_tokens)
	{
	  pfile->cur_run = &pfile->base_run;
	  result = pfile->base_run.base;
	  pfile->cur_token = result + 1;
	}
      result->flags = BOL;
      if (pfile->state.parsing_args == 2)
	result->flags |= PREV_WHITE;
    }
  /* _cpp_get_fresh_line may have popped buffers, so reload.  */
  buffer = pfile->buffer;
 update_tokens_line:
  result->src_loc = pfile->line_table->highest_line;

  /* Re-entered after each run of horizontal whitespace.  */
 skipped_white:
  /* Handle any line notes that apply at or before the current
     position before reading the next character.  */
  if (buffer->cur >= buffer->notes[buffer->cur_note].pos
      && !pfile->overlaid_buffer)
    {
      _cpp_process_line_notes (pfile, false);
      result->src_loc = pfile->line_table->highest_line;
    }
  c = *buffer->cur++;

  /* A forced location, when set, overrides the column-based one.  */
  if (pfile->forced_token_location_p)
    result->src_loc = *pfile->forced_token_location_p;
  else
    result->src_loc = linemap_position_for_column (pfile->line_table,
					  CPP_BUF_COLUMN (buffer, buffer->cur));

  switch (c)
    {
    case ' ': case '\t': case '\f': case '\v': case '\0':
      result->flags |= PREV_WHITE;
      skip_whitespace (pfile, c);
      goto skipped_white;

    case '\n':
      /* The line counter is only advanced when more of the buffer
	 remains.  */
      if (buffer->cur < buffer->rlimit)
	CPP_INCREMENT_LINE (pfile, 0);
      buffer->need_line = true;
      goto fresh_line;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      {
	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
	result->type = CPP_NUMBER;
	lex_number (pfile, &result->val.str, &nst);
	warn_about_normalization (pfile, result, &nst);
	break;
      }

    case 'L':
    case 'u':
    case 'U':
    case 'R':
      /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
	 wide strings or raw strings.  */
      if (c == 'L' || CPP_OPTION (pfile, rliterals)
	  || (c != 'R' && CPP_OPTION (pfile, uliterals)))
	{
	  if ((*buffer->cur == '\'' && c != 'R')
	      || *buffer->cur == '"'
	      || (*buffer->cur == 'R'
		  && c != 'R'
		  && buffer->cur[1] == '"'
		  && CPP_OPTION (pfile, rliterals))
	      || (*buffer->cur == '8'
		  && c == 'u'
		  && (buffer->cur[1] == '"'
		      || (buffer->cur[1] == 'R' && buffer->cur[2] == '"'
			  && CPP_OPTION (pfile, rliterals)))))
	    {
	      lex_string (pfile, result, buffer->cur - 1);
	      break;
	    }
	}
      /* Fall through.  */

    /* The letters missing from the lists below ('u', 'L', 'R', 'U')
       are the prefix characters handled by the case above.  */
    case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't':           case 'v': case 'w': case 'x':
    case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K':
    case 'M': case 'N': case 'O': case 'P': case 'Q':
    case 'S': case 'T':           case 'V': case 'W': case 'X':
    case 'Y': case 'Z':
      result->type = CPP_NAME;
      {
	struct normalize_state nst = INITIAL_NORMALIZE_STATE;
	result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false,
						&nst);
	warn_about_normalization (pfile, result, &nst);
      }

      /* Convert named operators to their proper types.  */
      if (result->val.node.node->flags & NODE_OPERATOR)
	{
	  result->flags |= NAMED_OP;
	  result->type = (enum cpp_ttype) result->val.node.node->directive_index;
	}
      break;

    case '\'':
    case '"':
      lex_string (pfile, result, buffer->cur - 1);
      break;

    case '/':
      /* A potential block or line comment.  C is reused to hold the
	 character after the slash; it is later passed to save_comment
	 as the comment type.  */
      comment_start = buffer->cur;
      c = *buffer->cur;

      if (c == '*')
	{
	  if (_cpp_skip_block_comment (pfile))
	    cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
	}
      else if (c == '/' && (CPP_OPTION (pfile, cplusplus_comments)
			    || cpp_in_system_header (pfile)))
	{
	  /* Warn about comments only if pedantically GNUC89, and not
	     in system headers.  */
	  if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
	      && ! buffer->warned_cplusplus_comments)
	    {
	      cpp_error (pfile, CPP_DL_PEDWARN,
			 "C++ style comments are not allowed in ISO C90");
	      cpp_error (pfile, CPP_DL_PEDWARN,
			 "(this will be reported only once per input file)");
	      buffer->warned_cplusplus_comments = 1;
	    }

	  if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
	    cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
	}
      else if (c == '=')
	{
	  buffer->cur++;
	  result->type = CPP_DIV_EQ;
	  break;
	}
      else
	{
	  result->type = CPP_DIV;
	  break;
	}

      /* Only the two comment branches reach this point.  A comment
	 that is not being saved counts as whitespace: note that and
	 go on to lex the token that follows it.  */
      if (!pfile->state.save_comments)
	{
	  result->flags |= PREV_WHITE;
	  goto update_tokens_line;
	}

      /* Save the comment as a token in its own right.  */
      save_comment (pfile, result, comment_start, c);
      break;

    case '<':
      /* When angled header names are expected, try to lex one first.
	 Judging from the test below, lex_string leaves the type as
	 CPP_LESS when no header name was formed, in which case the
	 '<' is lexed as an ordinary operator.  */
      if (pfile->state.angled_headers)
	{
	  lex_string (pfile, result, buffer->cur - 1);
	  if (result->type != CPP_LESS)
	    break;
	}

      result->type = CPP_LESS;
      if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_LESS_EQ;
      else if (*buffer->cur == '<')
	{
	  buffer->cur++;
	  IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
	}
      else if (CPP_OPTION (pfile, digraphs))
	{
	  if (*buffer->cur == ':')
	    {
	      /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
		 three characters are <:: and the subsequent character
		 is neither : nor >, the < is treated as a preprocessor
		 token by itself".  */
	      if (CPP_OPTION (pfile, cplusplus)
		  && (CPP_OPTION (pfile, lang) == CLK_CXX11
		      || CPP_OPTION (pfile, lang) == CLK_GNUCXX11)
		  && buffer->cur[1] == ':'
		  && buffer->cur[2] != ':' && buffer->cur[2] != '>')
		break;

	      buffer->cur++;
	      result->flags |= DIGRAPH;
	      result->type = CPP_OPEN_SQUARE;
	    }
	  else if (*buffer->cur == '%')
	    {
	      buffer->cur++;
	      result->flags |= DIGRAPH;
	      result->type = CPP_OPEN_BRACE;
	    }
	}
      break;

    case '>':
      result->type = CPP_GREATER;
      if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_GREATER_EQ;
      else if (*buffer->cur == '>')
	{
	  buffer->cur++;
	  IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
	}
      break;

    case '%':
      result->type = CPP_MOD;
      if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_MOD_EQ;
      else if (CPP_OPTION (pfile, digraphs))
	{
	  if (*buffer->cur == ':')
	    {
	      /* "%:" is the digraph spelling of '#'; a second "%:"
		 directly after it makes the paste operator.  */
	      buffer->cur++;
	      result->flags |= DIGRAPH;
	      result->type = CPP_HASH;
	      if (*buffer->cur == '%' && buffer->cur[1] == ':')
		buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0;
	    }
	  else if (*buffer->cur == '>')
	    {
	      buffer->cur++;
	      result->flags |= DIGRAPH;
	      result->type = CPP_CLOSE_BRACE;
	    }
	}
      break;

    case '.':
      result->type = CPP_DOT;
      /* A dot directly followed by a digit starts a number.  */
      if (ISDIGIT (*buffer->cur))
	{
	  struct normalize_state nst = INITIAL_NORMALIZE_STATE;
	  result->type = CPP_NUMBER;
	  lex_number (pfile, &result->val.str, &nst);
	  warn_about_normalization (pfile, result, &nst);
	}
      else if (*buffer->cur == '.' && buffer->cur[1] == '.')
	buffer->cur += 2, result->type = CPP_ELLIPSIS;
      else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
	buffer->cur++, result->type = CPP_DOT_STAR;
      break;

    case '+':
      result->type = CPP_PLUS;
      if (*buffer->cur == '+')
	buffer->cur++, result->type = CPP_PLUS_PLUS;
      else if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_PLUS_EQ;
      break;

    case '-':
      result->type = CPP_MINUS;
      if (*buffer->cur == '>')
	{
	  buffer->cur++;
	  result->type = CPP_DEREF;
	  if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
	    buffer->cur++, result->type = CPP_DEREF_STAR;
	}
      else if (*buffer->cur == '-')
	buffer->cur++, result->type = CPP_MINUS_MINUS;
      else if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_MINUS_EQ;
      break;

    case '&':
      result->type = CPP_AND;
      if (*buffer->cur == '&')
	buffer->cur++, result->type = CPP_AND_AND;
      else if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_AND_EQ;
      break;

    case '|':
      result->type = CPP_OR;
      if (*buffer->cur == '|')
	buffer->cur++, result->type = CPP_OR_OR;
      else if (*buffer->cur == '=')
	buffer->cur++, result->type = CPP_OR_EQ;
      break;

    case ':':
      result->type = CPP_COLON;
      if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
	buffer->cur++, result->type = CPP_SCOPE;
      else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
	{
	  buffer->cur++;
	  result->flags |= DIGRAPH;
	  result->type = CPP_CLOSE_SQUARE;
	}
      break;

    case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break;
    case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break;
    case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break;
    case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break;
    case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break;

    case '?': result->type = CPP_QUERY; break;
    case '~': result->type = CPP_COMPL; break;
    case ',': result->type = CPP_COMMA; break;
    case '(': result->type = CPP_OPEN_PAREN; break;
    case ')': result->type = CPP_CLOSE_PAREN; break;
    case '[': result->type = CPP_OPEN_SQUARE; break;
    case ']': result->type = CPP_CLOSE_SQUARE; break;
    case '{': result->type = CPP_OPEN_BRACE; break;
    case '}': result->type = CPP_CLOSE_BRACE; break;
    case ';': result->type = CPP_SEMICOLON; break;

      /* @ is a punctuator in Objective-C.  */
    case '@': result->type = CPP_ATSIGN; break;

    case '$':
    case '\\':
      {
	/* Step back onto the character so that forms_identifier_p
	   examines it; BASE marks where the identifier would start.  */
	const uchar *base = --buffer->cur;
	struct normalize_state nst = INITIAL_NORMALIZE_STATE;

	if (forms_identifier_p (pfile, true, &nst))
	  {
	    result->type = CPP_NAME;
	    result->val.node.node = lex_identifier (pfile, base, true, &nst);
	    warn_about_normalization (pfile, result, &nst);
	    break;
	  }
	/* Not an identifier start: step forward again and let the
	   default case emit the character as CPP_OTHER.  */
	buffer->cur++;
      }
      /* Fall through.  */

    default:
      /* Any other character becomes a one-character CPP_OTHER
	 token.  */
      create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
      break;
    }

  return result;
}
2566 
2567 /* An upper bound on the number of bytes needed to spell TOKEN.
2568    Does not include preceding whitespace.  */
2569 unsigned int
2570 cpp_token_len (const cpp_token *token)
2571 {
2572   unsigned int len;
2573 
2574   switch (TOKEN_SPELL (token))
2575     {
2576     default:		len = 6;				break;
2577     case SPELL_LITERAL:	len = token->val.str.len;		break;
2578     case SPELL_IDENT:	len = NODE_LEN (token->val.node.node) * 10;	break;
2579     }
2580 
2581   return len;
2582 }
2583 
/* Decode the UTF-8 sequence starting at NAME and write it to BUFFER
   as a backslash-U escape with eight lowercase hex digits.  Exactly
   10 bytes are written and no terminator is added.  Returns the
   number of bytes of NAME that made up the sequence, as given by the
   count of leading one bits in its first byte.  Aborts if a
   continuation byte is ill-formed.  */

static size_t
utf8_to_ucn (unsigned char *buffer, const unsigned char *name)
{
  static const char hex_digits[] = "0123456789abcdef";
  unsigned long code;
  unsigned lead = *name;
  int seq_len = 0;
  int k, shift;

  /* The number of leading one bits gives the sequence length.  */
  while (lead & 0x80)
    {
      seq_len++;
      lead <<= 1;
    }

  /* Payload bits of the first byte, then six bits from each
     continuation byte.  */
  code = *name & (0x7F >> seq_len);
  for (k = 1; k < seq_len; k++)
    {
      const unsigned char cont = name[k];

      /* Ill-formed UTF-8.  */
      if ((cont & ~0x3F) != 0x80)
	abort ();

      code = (code << 6) | (cont & 0x3F);
    }

  buffer[0] = '\\';
  buffer[1] = 'U';
  /* Most significant nibble first.  */
  for (k = 0, shift = 28; k < 8; k++, shift -= 4)
    buffer[2 + k] = hex_digits[(code >> shift) & 0xF];

  return seq_len;
}
2617 
2618 /* Given a token TYPE corresponding to a digraph, return a pointer to
2619    the spelling of the digraph.  */
2620 static const unsigned char *
2621 cpp_digraph2name (enum cpp_ttype type)
2622 {
2623   return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH];
2624 }
2625 
2626 /* Write the spelling of a token TOKEN to BUFFER.  The buffer must
2627    already contain the enough space to hold the token's spelling.
2628    Returns a pointer to the character after the last character written.
2629    FORSTRING is true if this is to be the spelling after translation
2630    phase 1 (this is different for UCNs).
2631    FIXME: Would be nice if we didn't need the PFILE argument.  */
2632 unsigned char *
2633 cpp_spell_token (cpp_reader *pfile, const cpp_token *token,
2634 		 unsigned char *buffer, bool forstring)
2635 {
2636   switch (TOKEN_SPELL (token))
2637     {
2638     case SPELL_OPERATOR:
2639       {
2640 	const unsigned char *spelling;
2641 	unsigned char c;
2642 
2643 	if (token->flags & DIGRAPH)
2644 	  spelling = cpp_digraph2name (token->type);
2645 	else if (token->flags & NAMED_OP)
2646 	  goto spell_ident;
2647 	else
2648 	  spelling = TOKEN_NAME (token);
2649 
2650 	while ((c = *spelling++) != '\0')
2651 	  *buffer++ = c;
2652       }
2653       break;
2654 
2655     spell_ident:
2656     case SPELL_IDENT:
2657       if (forstring)
2658 	{
2659 	  memcpy (buffer, NODE_NAME (token->val.node.node),
2660 		  NODE_LEN (token->val.node.node));
2661 	  buffer += NODE_LEN (token->val.node.node);
2662 	}
2663       else
2664 	{
2665 	  size_t i;
2666 	  const unsigned char * name = NODE_NAME (token->val.node.node);
2667 
2668 	  for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2669 	    if (name[i] & ~0x7F)
2670 	      {
2671 		i += utf8_to_ucn (buffer, name + i) - 1;
2672 		buffer += 10;
2673 	      }
2674 	    else
2675 	      *buffer++ = NODE_NAME (token->val.node.node)[i];
2676 	}
2677       break;
2678 
2679     case SPELL_LITERAL:
2680       memcpy (buffer, token->val.str.text, token->val.str.len);
2681       buffer += token->val.str.len;
2682       break;
2683 
2684     case SPELL_NONE:
2685       cpp_error (pfile, CPP_DL_ICE,
2686 		 "unspellable token %s", TOKEN_NAME (token));
2687       break;
2688     }
2689 
2690   return buffer;
2691 }
2692 
2693 /* Returns TOKEN spelt as a null-terminated string.  The string is
2694    freed when the reader is destroyed.  Useful for diagnostics.  */
2695 unsigned char *
2696 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token)
2697 {
2698   unsigned int len = cpp_token_len (token) + 1;
2699   unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end;
2700 
2701   end = cpp_spell_token (pfile, token, start, false);
2702   end[0] = '\0';
2703 
2704   return start;
2705 }
2706 
2707 /* Returns a pointer to a string which spells the token defined by
2708    TYPE and FLAGS.  Used by C front ends, which really should move to
2709    using cpp_token_as_text.  */
2710 const char *
2711 cpp_type2name (enum cpp_ttype type, unsigned char flags)
2712 {
2713   if (flags & DIGRAPH)
2714     return (const char *) cpp_digraph2name (type);
2715   else if (flags & NAMED_OP)
2716     return cpp_named_operator2name (type);
2717 
2718   return (const char *) token_spellings[type].name;
2719 }
2720 
2721 /* Writes the spelling of token to FP, without any preceding space.
2722    Separated from cpp_spell_token for efficiency - to avoid stdio
2723    double-buffering.  */
2724 void
2725 cpp_output_token (const cpp_token *token, FILE *fp)
2726 {
2727   switch (TOKEN_SPELL (token))
2728     {
2729     case SPELL_OPERATOR:
2730       {
2731 	const unsigned char *spelling;
2732 	int c;
2733 
2734 	if (token->flags & DIGRAPH)
2735 	  spelling = cpp_digraph2name (token->type);
2736 	else if (token->flags & NAMED_OP)
2737 	  goto spell_ident;
2738 	else
2739 	  spelling = TOKEN_NAME (token);
2740 
2741 	c = *spelling;
2742 	do
2743 	  putc (c, fp);
2744 	while ((c = *++spelling) != '\0');
2745       }
2746       break;
2747 
2748     spell_ident:
2749     case SPELL_IDENT:
2750       {
2751 	size_t i;
2752 	const unsigned char * name = NODE_NAME (token->val.node.node);
2753 
2754 	for (i = 0; i < NODE_LEN (token->val.node.node); i++)
2755 	  if (name[i] & ~0x7F)
2756 	    {
2757 	      unsigned char buffer[10];
2758 	      i += utf8_to_ucn (buffer, name + i) - 1;
2759 	      fwrite (buffer, 1, 10, fp);
2760 	    }
2761 	  else
2762 	    fputc (NODE_NAME (token->val.node.node)[i], fp);
2763       }
2764       break;
2765 
2766     case SPELL_LITERAL:
2767       fwrite (token->val.str.text, 1, token->val.str.len, fp);
2768       break;
2769 
2770     case SPELL_NONE:
2771       /* An error, most probably.  */
2772       break;
2773     }
2774 }
2775 
2776 /* Compare two tokens.  */
2777 int
2778 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b)
2779 {
2780   if (a->type == b->type && a->flags == b->flags)
2781     switch (TOKEN_SPELL (a))
2782       {
2783       default:			/* Keep compiler happy.  */
2784       case SPELL_OPERATOR:
2785 	/* token_no is used to track where multiple consecutive ##
2786 	   tokens were originally located.  */
2787 	return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no);
2788       case SPELL_NONE:
2789 	return (a->type != CPP_MACRO_ARG
2790 		|| a->val.macro_arg.arg_no == b->val.macro_arg.arg_no);
2791       case SPELL_IDENT:
2792 	return a->val.node.node == b->val.node.node;
2793       case SPELL_LITERAL:
2794 	return (a->val.str.len == b->val.str.len
2795 		&& !memcmp (a->val.str.text, b->val.str.text,
2796 			    a->val.str.len));
2797       }
2798 
2799   return 0;
2800 }
2801 
2802 /* Returns nonzero if a space should be inserted to avoid an
2803    accidental token paste for output.  For simplicity, it is
2804    conservative, and occasionally advises a space where one is not
2805    needed, e.g. "." and ".2".  */
2806 int
2807 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1,
2808 		 const cpp_token *token2)
2809 {
2810   enum cpp_ttype a = token1->type, b = token2->type;
2811   cppchar_t c;
2812 
2813   if (token1->flags & NAMED_OP)
2814     a = CPP_NAME;
2815   if (token2->flags & NAMED_OP)
2816     b = CPP_NAME;
2817 
2818   c = EOF;
2819   if (token2->flags & DIGRAPH)
2820     c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0];
2821   else if (token_spellings[b].category == SPELL_OPERATOR)
2822     c = token_spellings[b].name[0];
2823 
2824   /* Quickly get everything that can paste with an '='.  */
2825   if ((int) a <= (int) CPP_LAST_EQ && c == '=')
2826     return 1;
2827 
2828   switch (a)
2829     {
2830     case CPP_GREATER:	return c == '>';
2831     case CPP_LESS:	return c == '<' || c == '%' || c == ':';
2832     case CPP_PLUS:	return c == '+';
2833     case CPP_MINUS:	return c == '-' || c == '>';
2834     case CPP_DIV:	return c == '/' || c == '*'; /* Comments.  */
2835     case CPP_MOD:	return c == ':' || c == '>';
2836     case CPP_AND:	return c == '&';
2837     case CPP_OR:	return c == '|';
2838     case CPP_COLON:	return c == ':' || c == '>';
2839     case CPP_DEREF:	return c == '*';
2840     case CPP_DOT:	return c == '.' || c == '%' || b == CPP_NUMBER;
2841     case CPP_HASH:	return c == '#' || c == '%'; /* Digraph form.  */
2842     case CPP_NAME:	return ((b == CPP_NUMBER
2843 				 && name_p (pfile, &token2->val.str))
2844 				|| b == CPP_NAME
2845 				|| b == CPP_CHAR || b == CPP_STRING); /* L */
2846     case CPP_NUMBER:	return (b == CPP_NUMBER || b == CPP_NAME
2847 				|| c == '.' || c == '+' || c == '-');
2848 				      /* UCNs */
2849     case CPP_OTHER:	return ((token1->val.str.text[0] == '\\'
2850 				 && b == CPP_NAME)
2851 				|| (CPP_OPTION (pfile, objc)
2852 				    && token1->val.str.text[0] == '@'
2853 				    && (b == CPP_NAME || b == CPP_STRING)));
2854     default:		break;
2855     }
2856 
2857   return 0;
2858 }
2859 
2860 /* Output all the remaining tokens on the current line, and a newline
2861    character, to FP.  Leading whitespace is removed.  If there are
2862    macros, special token padding is not performed.  */
2863 void
2864 cpp_output_line (cpp_reader *pfile, FILE *fp)
2865 {
2866   const cpp_token *token;
2867 
2868   token = cpp_get_token (pfile);
2869   while (token->type != CPP_EOF)
2870     {
2871       cpp_output_token (token, fp);
2872       token = cpp_get_token (pfile);
2873       if (token->flags & PREV_WHITE)
2874 	putc (' ', fp);
2875     }
2876 
2877   putc ('\n', fp);
2878 }
2879 
2880 /* Return a string representation of all the remaining tokens on the
2881    current line.  The result is allocated using xmalloc and must be
2882    freed by the caller.  */
2883 unsigned char *
2884 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name)
2885 {
2886   const cpp_token *token;
2887   unsigned int out = dir_name ? ustrlen (dir_name) : 0;
2888   unsigned int alloced = 120 + out;
2889   unsigned char *result = (unsigned char *) xmalloc (alloced);
2890 
2891   /* If DIR_NAME is empty, there are no initial contents.  */
2892   if (dir_name)
2893     {
2894       sprintf ((char *) result, "#%s ", dir_name);
2895       out += 2;
2896     }
2897 
2898   token = cpp_get_token (pfile);
2899   while (token->type != CPP_EOF)
2900     {
2901       unsigned char *last;
2902       /* Include room for a possible space and the terminating nul.  */
2903       unsigned int len = cpp_token_len (token) + 2;
2904 
2905       if (out + len > alloced)
2906 	{
2907 	  alloced *= 2;
2908 	  if (out + len > alloced)
2909 	    alloced = out + len;
2910 	  result = (unsigned char *) xrealloc (result, alloced);
2911 	}
2912 
2913       last = cpp_spell_token (pfile, token, &result[out], 0);
2914       out = last - result;
2915 
2916       token = cpp_get_token (pfile);
2917       if (token->flags & PREV_WHITE)
2918 	result[out++] = ' ';
2919     }
2920 
2921   result[out] = '\0';
2922   return result;
2923 }
2924 
/* Memory buffers.  Changing these three constants can have a dramatic
   effect on performance.  The values here are reasonable defaults,
   but might be tuned.  If you adjust them, be sure to test across a
   range of uses of cpplib, including heavy nested function-like macro
   expansion.  Also check the change in peak memory usage (NJAMD is a
   good tool for this).

   MIN_BUFF_SIZE is the smallest buffer new_buff will create.
   BUFF_SIZE_UPPER_BOUND (MIN_SIZE) is the largest free buffer that
   _cpp_get_buff will reuse for a request of MIN_SIZE bytes.
   EXTENDED_BUFF_SIZE (BUFF, MIN_EXTRA) is the size requested when
   extending BUFF: MIN_EXTRA plus twice the room left in BUFF.  Every
   macro argument is parenthesized so that any expression may be
   passed; note that BUFF is evaluated twice.  */
#define MIN_BUFF_SIZE 8000
#define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2)
#define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \
	((MIN_EXTRA) + ((BUFF)->limit - (BUFF)->cur) * 2)

/* A request for zero bytes must still be able to reuse a buffer of
   the minimum size.  */
#if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0)
  #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE!
#endif
2939 
2940 /* Create a new allocation buffer.  Place the control block at the end
2941    of the buffer, so that buffer overflows will cause immediate chaos.  */
2942 static _cpp_buff *
2943 new_buff (size_t len)
2944 {
2945   _cpp_buff *result;
2946   unsigned char *base;
2947 
2948   if (len < MIN_BUFF_SIZE)
2949     len = MIN_BUFF_SIZE;
2950   len = CPP_ALIGN (len);
2951 
2952 #ifdef ENABLE_VALGRIND_CHECKING
2953   /* Valgrind warns about uses of interior pointers, so put _cpp_buff
2954      struct first.  */
2955   size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT);
2956   base = XNEWVEC (unsigned char, len + slen);
2957   result = (_cpp_buff *) base;
2958   base += slen;
2959 #else
2960   base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff));
2961   result = (_cpp_buff *) (base + len);
2962 #endif
2963   result->base = base;
2964   result->cur = base;
2965   result->limit = base + len;
2966   result->next = NULL;
2967   return result;
2968 }
2969 
2970 /* Place a chain of unwanted allocation buffers on the free list.  */
2971 void
2972 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff)
2973 {
2974   _cpp_buff *end = buff;
2975 
2976   while (end->next)
2977     end = end->next;
2978   end->next = pfile->free_buffs;
2979   pfile->free_buffs = buff;
2980 }
2981 
2982 /* Return a free buffer of size at least MIN_SIZE.  */
2983 _cpp_buff *
2984 _cpp_get_buff (cpp_reader *pfile, size_t min_size)
2985 {
2986   _cpp_buff *result, **p;
2987 
2988   for (p = &pfile->free_buffs;; p = &(*p)->next)
2989     {
2990       size_t size;
2991 
2992       if (*p == NULL)
2993 	return new_buff (min_size);
2994       result = *p;
2995       size = result->limit - result->base;
2996       /* Return a buffer that's big enough, but don't waste one that's
2997          way too big.  */
2998       if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size))
2999 	break;
3000     }
3001 
3002   *p = result->next;
3003   result->next = NULL;
3004   result->cur = result->base;
3005   return result;
3006 }
3007 
3008 /* Creates a new buffer with enough space to hold the uncommitted
3009    remaining bytes of BUFF, and at least MIN_EXTRA more bytes.  Copies
3010    the excess bytes to the new buffer.  Chains the new buffer after
3011    BUFF, and returns the new buffer.  */
3012 _cpp_buff *
3013 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra)
3014 {
3015   size_t size = EXTENDED_BUFF_SIZE (buff, min_extra);
3016   _cpp_buff *new_buff = _cpp_get_buff (pfile, size);
3017 
3018   buff->next = new_buff;
3019   memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff));
3020   return new_buff;
3021 }
3022 
3023 /* Creates a new buffer with enough space to hold the uncommitted
3024    remaining bytes of the buffer pointed to by BUFF, and at least
3025    MIN_EXTRA more bytes.  Copies the excess bytes to the new buffer.
3026    Chains the new buffer before the buffer pointed to by BUFF, and
3027    updates the pointer to point to the new buffer.  */
3028 void
3029 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra)
3030 {
3031   _cpp_buff *new_buff, *old_buff = *pbuff;
3032   size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra);
3033 
3034   new_buff = _cpp_get_buff (pfile, size);
3035   memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff));
3036   new_buff->next = old_buff;
3037   *pbuff = new_buff;
3038 }
3039 
3040 /* Free a chain of buffers starting at BUFF.  */
3041 void
3042 _cpp_free_buff (_cpp_buff *buff)
3043 {
3044   _cpp_buff *next;
3045 
3046   for (; buff; buff = next)
3047     {
3048       next = buff->next;
3049 #ifdef ENABLE_VALGRIND_CHECKING
3050       free (buff);
3051 #else
3052       free (buff->base);
3053 #endif
3054     }
3055 }
3056 
3057 /* Allocate permanent, unaligned storage of length LEN.  */
3058 unsigned char *
3059 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len)
3060 {
3061   _cpp_buff *buff = pfile->u_buff;
3062   unsigned char *result = buff->cur;
3063 
3064   if (len > (size_t) (buff->limit - result))
3065     {
3066       buff = _cpp_get_buff (pfile, len);
3067       buff->next = pfile->u_buff;
3068       pfile->u_buff = buff;
3069       result = buff->cur;
3070     }
3071 
3072   buff->cur = result + len;
3073   return result;
3074 }
3075 
3076 /* Allocate permanent, unaligned storage of length LEN from a_buff.
3077    That buffer is used for growing allocations when saving macro
3078    replacement lists in a #define, and when parsing an answer to an
3079    assertion in #assert, #unassert or #if (and therefore possibly
3080    whilst expanding macros).  It therefore must not be used by any
3081    code that they might call: specifically the lexer and the guts of
3082    the macro expander.
3083 
3084    All existing other uses clearly fit this restriction: storing
3085    registered pragmas during initialization.  */
3086 unsigned char *
3087 _cpp_aligned_alloc (cpp_reader *pfile, size_t len)
3088 {
3089   _cpp_buff *buff = pfile->a_buff;
3090   unsigned char *result = buff->cur;
3091 
3092   if (len > (size_t) (buff->limit - result))
3093     {
3094       buff = _cpp_get_buff (pfile, len);
3095       buff->next = pfile->a_buff;
3096       pfile->a_buff = buff;
3097       result = buff->cur;
3098     }
3099 
3100   buff->cur = result + len;
3101   return result;
3102 }
3103 
3104 /* Say which field of TOK is in use.  */
3105 
3106 enum cpp_token_fld_kind
3107 cpp_token_val_index (cpp_token *tok)
3108 {
3109   switch (TOKEN_SPELL (tok))
3110     {
3111     case SPELL_IDENT:
3112       return CPP_TOKEN_FLD_NODE;
3113     case SPELL_LITERAL:
3114       return CPP_TOKEN_FLD_STR;
3115     case SPELL_OPERATOR:
3116       if (tok->type == CPP_PASTE)
3117 	return CPP_TOKEN_FLD_TOKEN_NO;
3118       else
3119 	return CPP_TOKEN_FLD_NONE;
3120     case SPELL_NONE:
3121       if (tok->type == CPP_MACRO_ARG)
3122 	return CPP_TOKEN_FLD_ARG_NO;
3123       else if (tok->type == CPP_PADDING)
3124 	return CPP_TOKEN_FLD_SOURCE;
3125       else if (tok->type == CPP_PRAGMA)
3126 	return CPP_TOKEN_FLD_PRAGMA;
3127       /* else fall through */
3128     default:
3129       return CPP_TOKEN_FLD_NONE;
3130     }
3131 }
3132 
3133 /* All tokens lexed in R after calling this function will be forced to have
3134    their source_location the same as the location referenced by P, until
3135    cpp_stop_forcing_token_locations is called for R.  */
3136 
3137 void
3138 cpp_force_token_locations (cpp_reader *r, source_location *p)
3139 {
3140   r->forced_token_location_p = p;
3141 }
3142 
3143 /* Go back to assigning locations naturally for lexed tokens.  */
3144 
3145 void
3146 cpp_stop_forcing_token_locations (cpp_reader *r)
3147 {
3148   r->forced_token_location_p = NULL;
3149 }
3150