1 /* CPP Library - lexical analysis. 2 Copyright (C) 2000-2013 Free Software Foundation, Inc. 3 Contributed by Per Bothner, 1994-95. 4 Based on CCCP program by Paul Rubin, June 1986 5 Adapted to ANSI C, Richard Stallman, Jan 1987 6 Broken out to separate file, Zack Weinberg, Mar 2000 7 8 This program is free software; you can redistribute it and/or modify it 9 under the terms of the GNU General Public License as published by the 10 Free Software Foundation; either version 3, or (at your option) any 11 later version. 12 13 This program is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program; see the file COPYING3. If not see 20 <http://www.gnu.org/licenses/>. */ 21 22 #include "config.h" 23 #include "system.h" 24 #include "cpplib.h" 25 #include "internal.h" 26 27 enum spell_type 28 { 29 SPELL_OPERATOR = 0, 30 SPELL_IDENT, 31 SPELL_LITERAL, 32 SPELL_NONE 33 }; 34 35 struct token_spelling 36 { 37 enum spell_type category; 38 const unsigned char *name; 39 }; 40 41 static const unsigned char *const digraph_spellings[] = 42 { UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" }; 43 44 #define OP(e, s) { SPELL_OPERATOR, UC s }, 45 #define TK(e, s) { SPELL_ ## s, UC #e }, 46 static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE }; 47 #undef OP 48 #undef TK 49 50 #define TOKEN_SPELL(token) (token_spellings[(token)->type].category) 51 #define TOKEN_NAME(token) (token_spellings[(token)->type].name) 52 53 static void add_line_note (cpp_buffer *, const uchar *, unsigned int); 54 static int skip_line_comment (cpp_reader *); 55 static void skip_whitespace (cpp_reader *, cppchar_t); 56 static void lex_string (cpp_reader *, cpp_token *, const uchar *); 57 static void save_comment (cpp_reader *, 
cpp_token *, const uchar *, cppchar_t); 58 static void store_comment (cpp_reader *, cpp_token *); 59 static void create_literal (cpp_reader *, cpp_token *, const uchar *, 60 unsigned int, enum cpp_ttype); 61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *); 62 static int name_p (cpp_reader *, const cpp_string *); 63 static tokenrun *next_tokenrun (tokenrun *); 64 65 static _cpp_buff *new_buff (size_t); 66 67 68 /* Utility routine: 69 70 Compares, the token TOKEN to the NUL-terminated string STRING. 71 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */ 72 int 73 cpp_ideq (const cpp_token *token, const char *string) 74 { 75 if (token->type != CPP_NAME) 76 return 0; 77 78 return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string); 79 } 80 81 /* Record a note TYPE at byte POS into the current cleaned logical 82 line. */ 83 static void 84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type) 85 { 86 if (buffer->notes_used == buffer->notes_cap) 87 { 88 buffer->notes_cap = buffer->notes_cap * 2 + 200; 89 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes, 90 buffer->notes_cap); 91 } 92 93 buffer->notes[buffer->notes_used].pos = pos; 94 buffer->notes[buffer->notes_used].type = type; 95 buffer->notes_used++; 96 } 97 98 99 /* Fast path to find line special characters using optimized character 100 scanning algorithms. Anything complicated falls back to the slow 101 path below. Since this loop is very hot it's worth doing these kinds 102 of optimizations. 103 104 One of the paths through the ifdefs should provide 105 106 const uchar *search_line_fast (const uchar *s, const uchar *end); 107 108 Between S and END, search for \n, \r, \\, ?. Return a pointer to 109 the found character. 110 111 Note that the last character of the buffer is *always* a newline, 112 as forced by _cpp_convert_input. This fact can be used to avoid 113 explicitly looking for the end of the buffer. 
*/ 114 115 /* Configure gives us an ifdef test. */ 116 #ifndef WORDS_BIGENDIAN 117 #define WORDS_BIGENDIAN 0 118 #endif 119 120 /* We'd like the largest integer that fits into a register. There's nothing 121 in <stdint.h> that gives us that. For most hosts this is unsigned long, 122 but MS decided on an LLP64 model. Thankfully when building with GCC we 123 can get the "real" word size. */ 124 #ifdef __GNUC__ 125 typedef unsigned int word_type __attribute__((__mode__(__word__))); 126 #else 127 typedef unsigned long word_type; 128 #endif 129 130 /* The code below is only expecting sizes 4 or 8. 131 Die at compile-time if this expectation is violated. */ 132 typedef char check_word_type_size 133 [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1]; 134 135 /* Return X with the first N bytes forced to values that won't match one 136 of the interesting characters. Note that NUL is not interesting. */ 137 138 static inline word_type 139 acc_char_mask_misalign (word_type val, unsigned int n) 140 { 141 word_type mask = -1; 142 if (WORDS_BIGENDIAN) 143 mask >>= n * 8; 144 else 145 mask <<= n * 8; 146 return val & mask; 147 } 148 149 /* Return X replicated to all byte positions within WORD_TYPE. */ 150 151 static inline word_type 152 acc_char_replicate (uchar x) 153 { 154 word_type ret; 155 156 ret = (x << 24) | (x << 16) | (x << 8) | x; 157 if (sizeof(word_type) == 8) 158 ret = (ret << 16 << 16) | ret; 159 return ret; 160 } 161 162 /* Return non-zero if some byte of VAL is (probably) C. */ 163 164 static inline word_type 165 acc_char_cmp (word_type val, word_type c) 166 { 167 #if defined(__GNUC__) && defined(__alpha__) 168 /* We can get exact results using a compare-bytes instruction. 169 Get (val == c) via (0 >= (val ^ c)). 
*/ 170 return __builtin_alpha_cmpbge (0, val ^ c); 171 #else 172 word_type magic = 0x7efefefeU; 173 if (sizeof(word_type) == 8) 174 magic = (magic << 16 << 16) | 0xfefefefeU; 175 magic |= 1; 176 177 val ^= c; 178 return ((val + magic) ^ ~val) & ~magic; 179 #endif 180 } 181 182 /* Given the result of acc_char_cmp is non-zero, return the index of 183 the found character. If this was a false positive, return -1. */ 184 185 static inline int 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED, 187 word_type val ATTRIBUTE_UNUSED) 188 { 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN 190 /* The cmpbge instruction sets *bits* of the result corresponding to 191 matches in the bytes with no false positives. */ 192 return __builtin_ctzl (cmp); 193 #else 194 unsigned int i; 195 196 /* ??? It would be nice to force unrolling here, 197 and have all of these constants folded. */ 198 for (i = 0; i < sizeof(word_type); ++i) 199 { 200 uchar c; 201 if (WORDS_BIGENDIAN) 202 c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff; 203 else 204 c = (val >> i * 8) & 0xff; 205 206 if (c == '\n' || c == '\r' || c == '\\' || c == '?') 207 return i; 208 } 209 210 return -1; 211 #endif 212 } 213 214 /* A version of the fast scanner using bit fiddling techniques. 215 216 For 32-bit words, one would normally perform 16 comparisons and 217 16 branches. With this algorithm one performs 24 arithmetic 218 operations and one branch. Whether this is faster with a 32-bit 219 word size is going to be somewhat system dependent. 220 221 For 64-bit words, we eliminate twice the number of comparisons 222 and branches without increasing the number of arithmetic operations. 223 It's almost certainly going to be a win with 64-bit word size. 
*/ 224 225 static const uchar * search_line_acc_char (const uchar *, const uchar *) 226 ATTRIBUTE_UNUSED; 227 228 static const uchar * 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) 230 { 231 const word_type repl_nl = acc_char_replicate ('\n'); 232 const word_type repl_cr = acc_char_replicate ('\r'); 233 const word_type repl_bs = acc_char_replicate ('\\'); 234 const word_type repl_qm = acc_char_replicate ('?'); 235 236 unsigned int misalign; 237 const word_type *p; 238 word_type val, t; 239 240 /* Align the buffer. Mask out any bytes from before the beginning. */ 241 p = (word_type *)((uintptr_t)s & -sizeof(word_type)); 242 val = *p; 243 misalign = (uintptr_t)s & (sizeof(word_type) - 1); 244 if (misalign) 245 val = acc_char_mask_misalign (val, misalign); 246 247 /* Main loop. */ 248 while (1) 249 { 250 t = acc_char_cmp (val, repl_nl); 251 t |= acc_char_cmp (val, repl_cr); 252 t |= acc_char_cmp (val, repl_bs); 253 t |= acc_char_cmp (val, repl_qm); 254 255 if (__builtin_expect (t != 0, 0)) 256 { 257 int i = acc_char_index (t, val); 258 if (i >= 0) 259 return (const uchar *)p + i; 260 } 261 262 val = *++p; 263 } 264 } 265 266 /* Disable on Solaris 2/x86 until the following problems can be properly 267 autoconfed: 268 269 The Solaris 9 assembler cannot assemble SSE4.2 insns. 270 Before Solaris 9 Update 6, SSE insns cannot be executed. 271 The Solaris 10+ assembler tags objects with the instruction set 272 extensions used, so SSE4.2 executables cannot run on machines that 273 don't support that extension. */ 274 275 #if (GCC_VERSION >= 4005) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__)) 276 277 /* Replicated character data to be shared between implementations. 278 Recall that outside of a context with vector support we can't 279 define compatible vector types, therefore these are all defined 280 in terms of raw characters. 
*/
/* Row 0 is '\n', row 1 is '\r', row 2 is '\\' and row 3 is '?'.  */
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
};

/* A version of the fast scanner using MMX vectorized byte compare insns.

   This uses the PMOVMSKB instruction which was introduced with "MMX2",
   which was packaged into SSE1; it is also present in the AMD MMX
   extension.  Mark the function as using "sse" so that we emit a real
   "emms" instruction, rather than the 3dNOW "femms" instruction.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
   is unused.  */

static const uchar *
#ifndef __SSE__
__attribute__((__target__("sse")))
#endif
search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v8qi __attribute__ ((__vector_size__ (8)));
  typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));

  /* Only the first 8 bytes of each 16-byte row are loaded here.  */
  const v8qi repl_nl = *(const v8qi *)repl_chars[0];
  const v8qi repl_cr = *(const v8qi *)repl_chars[1];
  const v8qi repl_bs = *(const v8qi *)repl_chars[2];
  const v8qi repl_qm = *(const v8qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v8qi *p;
  v8qi data, t, c;

  /* Align the source pointer.  While MMX doesn't generate unaligned data
     faults, this allows us to safely scan to the end of the buffer without
     reading beyond the end of the last page.  */
  misalign = (uintptr_t)s & 7;
  p = (const v8qi *)((uintptr_t)s & -8);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     8-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 8 bytes at a time.  The first iteration is
     entered at START so that the block loaded above is tested with
     the misalignment mask; later iterations use an all-ones mask.  */
  goto start;
  do
    {
      data = *++p;
      mask = -1;

    start:
      t = __builtin_ia32_pcmpeqb(data, repl_nl);
      c = __builtin_ia32_pcmpeqb(data, repl_cr);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_bs);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_qm);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      found = __builtin_ia32_pmovmskb (t);
      found &= mask;
    }
  while (!found);

  __builtin_ia32_emms ();

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}

/* A version of the fast scanner using SSE2 vectorized byte compare insns.  */

static const uchar *
#ifndef __SSE2__
__attribute__((__target__("sse2")))
#endif
search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));

  const v16qi repl_nl = *(const v16qi *)repl_chars[0];
  const v16qi repl_cr = *(const v16qi *)repl_chars[1];
  const v16qi repl_bs = *(const v16qi *)repl_chars[2];
  const v16qi repl_qm = *(const v16qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v16qi *p;
  v16qi data, t;

  /* Align the source pointer.  */
  misalign = (uintptr_t)s & 15;
  p = (const v16qi *)((uintptr_t)s & -16);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     16-byte block.
The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 16 bytes at a time.  The first iteration is
     entered at START so that the block already loaded is tested with
     the misalignment mask; later iterations use an all-ones mask.  */
  goto start;
  do
    {
      data = *++p;
      mask = -1;

    start:
      t = __builtin_ia32_pcmpeqb128(data, repl_nl);
      t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
      t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
      t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
      found = __builtin_ia32_pmovmskb128 (t);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}

#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.
   Unlike the other variants this one reads END, to decide whether an
   unaligned 16-byte load at S is safe.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* The four characters searched for; the remaining lanes are zero.  */
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      v16qi sv;

      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((si & 0xfff) > 0xff0, 0))
        {
          /* There are less than 16 bytes left in the buffer, and less
             than 16 bytes left on the page.  Reading 16 bytes at this
             point might generate a spurious page fault.  Defer to the
             SSE2 implementation, which already handles alignment.  */
          return search_line_sse2 (s, end);
        }

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.
*/
      sv = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);

      if (__builtin_expect (index < 16, 0))
        goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 16) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  By doing the whole loop
     in inline assembly, we can make proper use of the flags set.  */
  __asm ( "sub $16, %1\n"
          " .balign 16\n"
          "0: add $16, %1\n"
          " %vpcmpestri $0, (%1), %2\n"
          " jnc 0b"
          : "=&c"(index), "+r"(s)
          : "x"(search), "a"(4), "d"(16));

 found:
  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif

/* Check the CPU capabilities.  */

#include "../gcc/config/i386/cpuid.h"

typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *);
/* The scanner selected by init_vectorized_lexer.  */
static search_line_fast_type search_line_fast;

#define HAVE_init_vectorized_lexer 1
/* Choose the implementation stored in search_line_fast.  MINIMUM
   records which SSE level the compiler was told it may assume (from
   the __SSE__, __SSE2__ and __SSE4_2__ predefines); beyond that the
   CPUID feature bits decide.  search_line_acc_char is the default
   when nothing better is selected.  */
static inline void
init_vectorized_lexer (void)
{
  unsigned dummy, ecx = 0, edx = 0;
  search_line_fast_type impl = search_line_acc_char;
  int minimum = 0;

#if defined(__SSE4_2__)
  minimum = 3;
#elif defined(__SSE2__)
  minimum = 2;
#elif defined(__SSE__)
  minimum = 1;
#endif

  if (minimum == 3)
    impl = search_line_sse42;
  else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2)
    {
      if (minimum == 3 || (ecx & bit_SSE4_2))
        impl = search_line_sse42;
      else if (minimum == 2 || (edx & bit_SSE2))
        impl = search_line_sse2;
      else if (minimum == 1 || (edx & bit_SSE))
        impl = search_line_mmx;
    }
  else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx))
    {
      if (minimum == 1
          || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV))
        impl = search_line_mmx;
    }

  search_line_fast = impl;
}

#elif defined(_ARCH_PWR8) && defined(__ALTIVEC__)

/* A version of the fast scanner using AltiVec vectorized byte compares
   and VSX unaligned loads (when VSX is available).  This is otherwise
   the same as the pre-GCC 5 version.  */

static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc zero = { 0 };

  vc data, t;

  /* Main loop processing 16 bytes at a time.  S is used as-is, without
     first being rounded down to a 16-byte boundary.  */
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      data = *((const vc *)s);
      s += 16;

      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  /* Restore s to point to the 16 bytes we just processed.
*/
  s -= 16;

  {
#define N (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero.  S is advanced past
       every all-zero word that is skipped.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* Fall through to examine the remaining two words.  */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  */
#ifdef __BIG_ENDIAN__
    l = __builtin_clzl(l) >> 3;
#else
    l = __builtin_ctzl(l) >> 3;
#endif
    return s + l;

#undef N
  }
}

#elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)

/* A version of the fast scanner using AltiVec vectorized byte compares.
   This cannot be used for little endian because vec_lvsl/lvsr are
   deprecated for little endian and the code won't work properly.  */
/* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
   so we can't compile this function without -maltivec on the command line
   (or implied by some other switch).
*/

static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc ones = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
  };
  const vc zero = { 0 };

  vc data, mask, t;

  /* Altivec loads automatically mask addresses with -16.  This lets us
     issue the first load as early as possible.  */
  data = __builtin_vec_ld(0, (const vc *)s);

  /* Discard bytes before the beginning of the buffer.  Do this by
     beginning with all ones and shifting in zeros according to the
     mis-alignment.  The LVSR instruction pulls the exact shift we
     want from the address.  */
  mask = __builtin_vec_lvsr(0, s);
  mask = __builtin_vec_perm(zero, ones, mask);
  data &= mask;

  /* While altivec loads mask addresses, we still need to align S so
     that the offset we compute at the end is correct.  */
  s = (const uchar *)((uintptr_t)s & -16);

  /* Main loop processing 16 bytes at a time.  The first iteration is
     entered at START so that the masked first block is tested before
     any further load.  */
  goto start;
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      s += 16;
      data = __builtin_vec_ld(0, (const vc *)s);

    start:
      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  {
#define N (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero.  S is advanced past
       every all-zero word that is skipped.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* Fall through to examine the remaining two words.  */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.
*/
    l = __builtin_clzl(l) >> 3;
    return s + l;

#undef N
  }
}

#elif defined (__ARM_NEON__)
#include "arm_neon.h"

/* A version of the fast scanner using NEON byte compares
   (vceqq_u8).  Returns a pointer to the first \n, \r, \\ or ? at or
   after S.  END is unused.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* A distinct bit (0x01 ... 0x80) for each byte of a 64-bit half.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  unsigned int misalign, found, mask;
  const uint8_t *p;
  uint8x16_t data;

  /* Align the source pointer.  */
  misalign = (uintptr_t)s & 15;
  p = (const uint8_t *)((uintptr_t)s & -16);
  data = vld1q_u8 (p);

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = (-1u << misalign) & 0xffff;

  /* Main loop, processing 16 bytes at a time.  The first iteration is
     entered at START so that the block already loaded is tested with
     the misalignment mask.  */
  goto start;

  do
    {
      uint8x8_t l;
      uint16x4_t m;
      uint32x2_t n;
      uint8x16_t t, u, v, w;

      p += 16;
      data = vld1q_u8 (p);
      mask = 0xffff;

    start:
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      /* Keep one bit per matching byte, then sum neighbouring lanes to
         gather those bits into a single scalar.  */
      t = vandq_u8 (vorrq_u8 (v, w), xmask);
      l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
      m = vpaddl_u8 (l);
      n = vpaddl_u16 (m);

      found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
              vshr_n_u64 ((uint64x1_t) n, 24)), 0);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.
*/
  found = __builtin_ctz (found);
  return (const uchar *)p + found;
}

#else

/* We only have one accelerated alternative.  Use a direct call so that
   we encourage inlining.  */

#define search_line_fast search_line_acc_char

#endif

/* Initialize the lexer if needed.  This only does work when a
   vectorized lexer initializer was defined above.  */

void
_cpp_init_lexer (void)
{
#ifdef HAVE_init_vectorized_lexer
  init_vectorized_lexer ();
#endif
}

/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   The line starts at PFILE->buffer->next_line.  Line notes are
   recorded with add_line_note for each trigraph and escaped newline
   met on the way.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  const uchar *s;       /* Read position.  */
  uchar c, *d, *p;      /* D is the write position on the slow path.  */

  buffer = pfile->buffer;
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  s = buffer->next_line;

  if (!buffer->from_stage3)
    {
      /* Most recent backslash seen on the fast path, if any.  */
      const uchar *pbackslash = NULL;

      /* Fast path.  This is the common case of an un-escaped line with
         no trigraphs.  The primary win here is by not writing any
         data back to memory until we have to.  */
      while (1)
        {
          /* Perform an optimized search for \n, \r, \\, ?.  */
          s = search_line_fast (s, buffer->rlimit);

          c = *s;
          if (c == '\\')
            {
              /* Record the location of the backslash and continue.  */
              pbackslash = s++;
            }
          else if (__builtin_expect (c == '?', 0))
            {
              if (__builtin_expect (s[1] == '?', false)
                   && _cpp_trigraph_map[s[2]])
                {
                  /* Have a trigraph.  We may or may not have to convert
                     it.  Add a line note regardless, for -Wtrigraphs.  */
                  add_line_note (buffer, s, s[2]);
                  if (CPP_OPTION (pfile, trigraphs))
                    {
                      /* We do, and that means we have to switch to the
                         slow path.  */
                      d = (uchar *) s;
                      *d = _cpp_trigraph_map[s[2]];
                      s += 2;
                      goto slow_path;
                    }
                }
              /* Not a trigraph.  Continue on fast-path.  */
              s++;
            }
          else
            break;
        }

      /* This must be \r or \n.  We're either done, or we'll be forced
         to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;

      if (__builtin_expect (s == buffer->rlimit, false))
        goto done;

      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
        {
          s++;
          if (s == buffer->rlimit)
            goto done;
        }

      if (__builtin_expect (pbackslash == NULL, true))
        goto done;

      /* Check for escaped newline.  Step back over any non-vertical
         space and see whether the last backslash comes right before
         it.  */
      p = d;
      while (is_nvspace (p[-1]))
        p--;
      if (p - 1 != pbackslash)
        goto done;

      /* Have an escaped newline; process it and proceed to
         the slow path.  The note type is ' ' when space separated the
         backslash from the newline, '\\' otherwise.  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      d = p - 2;
      buffer->next_line = p - 1;

    slow_path:
      /* Copy characters from S down to D one at a time.  */
      while (1)
        {
          c = *++s;
          *++d = c;

          if (c == '\n' || c == '\r')
            {
              /* Handle DOS line endings.  */
              if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
                s++;
              if (s == buffer->rlimit)
                break;

              /* Escaped?  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                break;

              add_line_note (buffer, p - 1, p != d ? ' ': '\\');
              d = p - 2;
              buffer->next_line = p - 1;
            }
          else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Add a note regardless, for the benefit of -Wtrigraphs.  */
              add_line_note (buffer, d, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                }
            }
        }
    }
  else
    {
      /* No trigraph or escaped-newline handling on this path; just
         find the end of the line.  */
      while (*s != '\n' && *s != '\r')
        s++;
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
        s++;
    }

 done:
  *d = '\n';
  /* A sentinel note that should never be processed.
*/
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}

/* Return true if the trigraph indicated by NOTE should be warned
   about in a comment.  The note following NOTE is also examined.  */
static bool
warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
{
  const uchar *p;

  /* Within comments we don't warn about trigraphs, unless the
     trigraph forms an escaped newline, as that may change
     behavior.  */
  if (note->type != '/')
    return false;

  /* If -trigraphs, then this was an escaped newline iff the next note
     is coincident.  */
  if (CPP_OPTION (pfile, trigraphs))
    return note[1].pos == note->pos;

  /* Otherwise, see if this forms an escaped newline.  Skip the three
     characters of the trigraph and any non-vertical space after it.  */
  p = note->pos + 3;
  while (is_nvspace (*p))
    p++;

  /* There might have been escaped newlines between the trigraph and the
     newline we found.  Hence the position test.  */
  return (*p == '\n' && p < note[1].pos);
}

/* Process the notes created by add_line_note as far as the current
   location.  IN_COMMENT is nonzero when called while skipping a
   comment; it suppresses the "separated by space" warning and
   restricts trigraph warnings to those warn_in_comment accepts.  */
void
_cpp_process_line_notes (cpp_reader *pfile, int in_comment)
{
  cpp_buffer *buffer = pfile->buffer;

  for (;;)
    {
      _cpp_line_note *note = &buffer->notes[buffer->cur_note];
      unsigned int col;

      /* Stop at the first note beyond the current position.  */
      if (note->pos > buffer->cur)
        break;

      buffer->cur_note++;
      col = CPP_BUF_COLUMN (buffer, note->pos + 1);

      if (note->type == '\\' || note->type == ' ')
        {
          if (note->type == ' ' && !in_comment)
            cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
                                 "backslash and newline separated by space");

          if (buffer->next_line > buffer->rlimit)
            {
              cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
                                   "backslash-newline at end of file");
              /* Prevent "no newline at end of file" warning.
*/
              buffer->next_line = buffer->rlimit;
            }

          /* An escaped newline starts a new physical line.  */
          buffer->line_base = note->pos;
          CPP_INCREMENT_LINE (pfile, 0);
        }
      else if (_cpp_trigraph_map[note->type])
        {
          if (CPP_OPTION (pfile, warn_trigraphs)
              && (!in_comment || warn_in_comment (pfile, note)))
            {
              if (CPP_OPTION (pfile, trigraphs))
                cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
                                       pfile->line_table->highest_line, col,
                                       "trigraph ??%c converted to %c",
                                       note->type,
                                       (int) _cpp_trigraph_map[note->type]);
              else
                {
                  cpp_warning_with_line
                    (pfile, CPP_W_TRIGRAPHS,
                     pfile->line_table->highest_line, col,
                     "trigraph ??%c ignored, use -trigraphs to enable",
                     note->type);
                }
            }
        }
      else if (note->type == 0)
        /* Already processed in lex_raw_string.  */;
      else
        abort ();
    }
}

/* Skip a C-style block comment.  We find the end of the comment by
   seeing if an asterisk is before every '/' we encounter.  Returns
   true if comment terminated by EOF, false otherwise.

   Buffer->cur points to the initial asterisk of the comment.  */
bool
_cpp_skip_block_comment (cpp_reader *pfile)
{
  cpp_buffer *buffer = pfile->buffer;
  const uchar *cur = buffer->cur;
  uchar c;

  /* Step over the initial asterisk.  A '/' directly after it is also
     stepped over, so that the opening asterisk is not taken for the
     end of the comment.  */
  cur++;
  if (*cur == '/')
    cur++;

  for (;;)
    {
      /* People like decorating comments with '*', so check for '/'
         instead for efficiency.  */
      c = *cur++;

      if (c == '/')
        {
          if (cur[-2] == '*')
            break;

          /* Warn about potential nested comments, but not if the '/'
             comes immediately before the true comment delimiter.
             Don't bother to get it right across escaped newlines.
*/ 1083 if (CPP_OPTION (pfile, warn_comments) 1084 && cur[0] == '*' && cur[1] != '/') 1085 { 1086 buffer->cur = cur; 1087 cpp_warning_with_line (pfile, CPP_W_COMMENTS, 1088 pfile->line_table->highest_line, 1089 CPP_BUF_COL (buffer), 1090 "\"/*\" within comment"); 1091 } 1092 } 1093 else if (c == '\n') 1094 { 1095 unsigned int cols; 1096 buffer->cur = cur - 1; 1097 _cpp_process_line_notes (pfile, true); 1098 if (buffer->next_line >= buffer->rlimit) 1099 return true; 1100 _cpp_clean_line (pfile); 1101 1102 cols = buffer->next_line - buffer->line_base; 1103 CPP_INCREMENT_LINE (pfile, cols); 1104 1105 cur = buffer->cur; 1106 } 1107 } 1108 1109 buffer->cur = cur; 1110 _cpp_process_line_notes (pfile, true); 1111 return false; 1112 } 1113 1114 /* Skip a C++ line comment, leaving buffer->cur pointing to the 1115 terminating newline. Handles escaped newlines. Returns nonzero 1116 if a multiline comment. */ 1117 static int 1118 skip_line_comment (cpp_reader *pfile) 1119 { 1120 cpp_buffer *buffer = pfile->buffer; 1121 source_location orig_line = pfile->line_table->highest_line; 1122 1123 while (*buffer->cur != '\n') 1124 buffer->cur++; 1125 1126 _cpp_process_line_notes (pfile, true); 1127 return orig_line != pfile->line_table->highest_line; 1128 } 1129 1130 /* Skips whitespace, saving the next non-whitespace character. */ 1131 static void 1132 skip_whitespace (cpp_reader *pfile, cppchar_t c) 1133 { 1134 cpp_buffer *buffer = pfile->buffer; 1135 bool saw_NUL = false; 1136 1137 do 1138 { 1139 /* Horizontal space always OK. */ 1140 if (c == ' ' || c == '\t') 1141 ; 1142 /* Just \f \v or \0 left. */ 1143 else if (c == '\0') 1144 saw_NUL = true; 1145 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile)) 1146 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, 1147 CPP_BUF_COL (buffer), 1148 "%s in preprocessing directive", 1149 c == '\f' ? "form feed" : "vertical tab"); 1150 1151 c = *buffer->cur++; 1152 } 1153 /* We only want non-vertical space, i.e. 
' ' \t \f \v \0. */ 1154 while (is_nvspace (c)); 1155 1156 if (saw_NUL) 1157 cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored"); 1158 1159 buffer->cur--; 1160 } 1161 1162 /* See if the characters of a number token are valid in a name (no 1163 '.', '+' or '-'). */ 1164 static int 1165 name_p (cpp_reader *pfile, const cpp_string *string) 1166 { 1167 unsigned int i; 1168 1169 for (i = 0; i < string->len; i++) 1170 if (!is_idchar (string->text[i])) 1171 return 0; 1172 1173 return 1; 1174 } 1175 1176 /* After parsing an identifier or other sequence, produce a warning about 1177 sequences not in NFC/NFKC. */ 1178 static void 1179 warn_about_normalization (cpp_reader *pfile, 1180 const cpp_token *token, 1181 const struct normalize_state *s) 1182 { 1183 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s) 1184 && !pfile->state.skipping) 1185 { 1186 /* Make sure that the token is printed using UCNs, even 1187 if we'd otherwise happily print UTF-8. */ 1188 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token)); 1189 size_t sz; 1190 1191 sz = cpp_spell_token (pfile, token, buf, false) - buf; 1192 if (NORMALIZE_STATE_RESULT (s) == normalized_C) 1193 cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0, 1194 "`%.*s' is not in NFKC", (int) sz, buf); 1195 else 1196 cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0, 1197 "`%.*s' is not in NFC", (int) sz, buf); 1198 free (buf); 1199 } 1200 } 1201 1202 /* Returns TRUE if the sequence starting at buffer->cur is invalid in 1203 an identifier. FIRST is TRUE if this starts an identifier. 
*/ 1204 static bool 1205 forms_identifier_p (cpp_reader *pfile, int first, 1206 struct normalize_state *state) 1207 { 1208 cpp_buffer *buffer = pfile->buffer; 1209 1210 if (*buffer->cur == '$') 1211 { 1212 if (!CPP_OPTION (pfile, dollars_in_ident)) 1213 return false; 1214 1215 buffer->cur++; 1216 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping) 1217 { 1218 CPP_OPTION (pfile, warn_dollars) = 0; 1219 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number"); 1220 } 1221 1222 return true; 1223 } 1224 1225 /* Is this a syntactically valid UCN? */ 1226 if (CPP_OPTION (pfile, extended_identifiers) 1227 && *buffer->cur == '\\' 1228 && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U')) 1229 { 1230 buffer->cur += 2; 1231 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first, 1232 state)) 1233 return true; 1234 buffer->cur -= 2; 1235 } 1236 1237 return false; 1238 } 1239 1240 /* Helper function to get the cpp_hashnode of the identifier BASE. */ 1241 static cpp_hashnode * 1242 lex_identifier_intern (cpp_reader *pfile, const uchar *base) 1243 { 1244 cpp_hashnode *result; 1245 const uchar *cur; 1246 unsigned int len; 1247 unsigned int hash = HT_HASHSTEP (0, *base); 1248 1249 cur = base + 1; 1250 while (ISIDNUM (*cur)) 1251 { 1252 hash = HT_HASHSTEP (hash, *cur); 1253 cur++; 1254 } 1255 len = cur - base; 1256 hash = HT_HASHFINISH (hash, len); 1257 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table, 1258 base, len, hash, HT_ALLOC)); 1259 1260 /* Rarely, identifiers require diagnostics when lexed. */ 1261 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC) 1262 && !pfile->state.skipping, 0)) 1263 { 1264 /* It is allowed to poison the same identifier twice. 
*/ 1265 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok) 1266 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"", 1267 NODE_NAME (result)); 1268 1269 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the 1270 replacement list of a variadic macro. */ 1271 if (result == pfile->spec_nodes.n__VA_ARGS__ 1272 && !pfile->state.va_args_ok) 1273 cpp_error (pfile, CPP_DL_PEDWARN, 1274 "__VA_ARGS__ can only appear in the expansion" 1275 " of a C99 variadic macro"); 1276 1277 /* For -Wc++-compat, warn about use of C++ named operators. */ 1278 if (result->flags & NODE_WARN_OPERATOR) 1279 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES, 1280 "identifier \"%s\" is a special operator name in C++", 1281 NODE_NAME (result)); 1282 } 1283 1284 return result; 1285 } 1286 1287 /* Get the cpp_hashnode of an identifier specified by NAME in 1288 the current cpp_reader object. If none is found, NULL is returned. */ 1289 cpp_hashnode * 1290 _cpp_lex_identifier (cpp_reader *pfile, const char *name) 1291 { 1292 cpp_hashnode *result; 1293 result = lex_identifier_intern (pfile, (uchar *) name); 1294 return result; 1295 } 1296 1297 /* Lex an identifier starting at BUFFER->CUR - 1. */ 1298 static cpp_hashnode * 1299 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn, 1300 struct normalize_state *nst) 1301 { 1302 cpp_hashnode *result; 1303 const uchar *cur; 1304 unsigned int len; 1305 unsigned int hash = HT_HASHSTEP (0, *base); 1306 1307 cur = pfile->buffer->cur; 1308 if (! starts_ucn) 1309 while (ISIDNUM (*cur)) 1310 { 1311 hash = HT_HASHSTEP (hash, *cur); 1312 cur++; 1313 } 1314 pfile->buffer->cur = cur; 1315 if (starts_ucn || forms_identifier_p (pfile, false, nst)) 1316 { 1317 /* Slower version for identifiers containing UCNs (or $). 
*/ 1318 do { 1319 while (ISIDNUM (*pfile->buffer->cur)) 1320 { 1321 pfile->buffer->cur++; 1322 NORMALIZE_STATE_UPDATE_IDNUM (nst); 1323 } 1324 } while (forms_identifier_p (pfile, false, nst)); 1325 result = _cpp_interpret_identifier (pfile, base, 1326 pfile->buffer->cur - base); 1327 } 1328 else 1329 { 1330 len = cur - base; 1331 hash = HT_HASHFINISH (hash, len); 1332 1333 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table, 1334 base, len, hash, HT_ALLOC)); 1335 } 1336 1337 /* Rarely, identifiers require diagnostics when lexed. */ 1338 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC) 1339 && !pfile->state.skipping, 0)) 1340 { 1341 /* It is allowed to poison the same identifier twice. */ 1342 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok) 1343 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"", 1344 NODE_NAME (result)); 1345 1346 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the 1347 replacement list of a variadic macro. */ 1348 if (result == pfile->spec_nodes.n__VA_ARGS__ 1349 && !pfile->state.va_args_ok) 1350 cpp_error (pfile, CPP_DL_PEDWARN, 1351 "__VA_ARGS__ can only appear in the expansion" 1352 " of a C99 variadic macro"); 1353 1354 /* For -Wc++-compat, warn about use of C++ named operators. */ 1355 if (result->flags & NODE_WARN_OPERATOR) 1356 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES, 1357 "identifier \"%s\" is a special operator name in C++", 1358 NODE_NAME (result)); 1359 } 1360 1361 return result; 1362 } 1363 1364 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */ 1365 static void 1366 lex_number (cpp_reader *pfile, cpp_string *number, 1367 struct normalize_state *nst) 1368 { 1369 const uchar *cur; 1370 const uchar *base; 1371 uchar *dest; 1372 1373 base = pfile->buffer->cur - 1; 1374 do 1375 { 1376 cur = pfile->buffer->cur; 1377 1378 /* N.B. ISIDNUM does not include $. */ 1379 while (ISIDNUM (*cur) || *cur == '.' 
|| VALID_SIGN (*cur, cur[-1])) 1380 { 1381 cur++; 1382 NORMALIZE_STATE_UPDATE_IDNUM (nst); 1383 } 1384 1385 pfile->buffer->cur = cur; 1386 } 1387 while (forms_identifier_p (pfile, false, nst)); 1388 1389 number->len = cur - base; 1390 dest = _cpp_unaligned_alloc (pfile, number->len + 1); 1391 memcpy (dest, base, number->len); 1392 dest[number->len] = '\0'; 1393 number->text = dest; 1394 } 1395 1396 /* Create a token of type TYPE with a literal spelling. */ 1397 static void 1398 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base, 1399 unsigned int len, enum cpp_ttype type) 1400 { 1401 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1); 1402 1403 memcpy (dest, base, len); 1404 dest[len] = '\0'; 1405 token->type = type; 1406 token->val.str.len = len; 1407 token->val.str.text = dest; 1408 } 1409 1410 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer 1411 sequence from *FIRST_BUFF_P to LAST_BUFF_P. */ 1412 1413 static void 1414 bufring_append (cpp_reader *pfile, const uchar *base, size_t len, 1415 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p) 1416 { 1417 _cpp_buff *first_buff = *first_buff_p; 1418 _cpp_buff *last_buff = *last_buff_p; 1419 1420 if (first_buff == NULL) 1421 first_buff = last_buff = _cpp_get_buff (pfile, len); 1422 else if (len > BUFF_ROOM (last_buff)) 1423 { 1424 size_t room = BUFF_ROOM (last_buff); 1425 memcpy (BUFF_FRONT (last_buff), base, room); 1426 BUFF_FRONT (last_buff) += room; 1427 base += room; 1428 len -= room; 1429 last_buff = _cpp_append_extend_buff (pfile, last_buff, len); 1430 } 1431 1432 memcpy (BUFF_FRONT (last_buff), base, len); 1433 BUFF_FRONT (last_buff) += len; 1434 1435 *first_buff_p = first_buff; 1436 *last_buff_p = last_buff; 1437 } 1438 1439 /* Lexes a raw string. The stored string contains the spelling, including 1440 double quotes, delimiter string, '(' and ')', any leading 1441 'L', 'u', 'U' or 'u8' and 'R' modifier. 
It returns the type of the 1442 literal, or CPP_OTHER if it was not properly terminated. 1443 1444 The spelling is NUL-terminated, but it is not guaranteed that this 1445 is the first NUL since embedded NULs are preserved. */ 1446 1447 static void 1448 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base, 1449 const uchar *cur) 1450 { 1451 const uchar *raw_prefix; 1452 unsigned int raw_prefix_len = 0; 1453 enum cpp_ttype type; 1454 size_t total_len = 0; 1455 _cpp_buff *first_buff = NULL, *last_buff = NULL; 1456 _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note]; 1457 1458 type = (*base == 'L' ? CPP_WSTRING : 1459 *base == 'U' ? CPP_STRING32 : 1460 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16) 1461 : CPP_STRING); 1462 1463 raw_prefix = cur + 1; 1464 while (raw_prefix_len < 16) 1465 { 1466 switch (raw_prefix[raw_prefix_len]) 1467 { 1468 case ' ': case '(': case ')': case '\\': case '\t': 1469 case '\v': case '\f': case '\n': default: 1470 break; 1471 /* Basic source charset except the above chars. 
*/ 1472 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 1473 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 1474 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 1475 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 1476 case 'y': case 'z': 1477 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 1478 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 1479 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 1480 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 1481 case 'Y': case 'Z': 1482 case '0': case '1': case '2': case '3': case '4': case '5': 1483 case '6': case '7': case '8': case '9': 1484 case '_': case '{': case '}': case '#': case '[': case ']': 1485 case '<': case '>': case '%': case ':': case ';': case '.': 1486 case '?': case '*': case '+': case '-': case '/': case '^': 1487 case '&': case '|': case '~': case '!': case '=': case ',': 1488 case '"': case '\'': 1489 raw_prefix_len++; 1490 continue; 1491 } 1492 break; 1493 } 1494 1495 if (raw_prefix[raw_prefix_len] != '(') 1496 { 1497 int col = CPP_BUF_COLUMN (pfile->buffer, raw_prefix + raw_prefix_len) 1498 + 1; 1499 if (raw_prefix_len == 16) 1500 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col, 1501 "raw string delimiter longer than 16 characters"); 1502 else 1503 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, col, 1504 "invalid character '%c' in raw string delimiter", 1505 (int) raw_prefix[raw_prefix_len]); 1506 pfile->buffer->cur = raw_prefix - 1; 1507 create_literal (pfile, token, base, raw_prefix - 1 - base, CPP_OTHER); 1508 return; 1509 } 1510 1511 cur = raw_prefix + raw_prefix_len + 1; 1512 for (;;) 1513 { 1514 #define BUF_APPEND(STR,LEN) \ 1515 do { \ 1516 bufring_append (pfile, (const uchar *)(STR), (LEN), \ 1517 &first_buff, &last_buff); \ 1518 total_len += (LEN); \ 1519 } while (0); 1520 1521 cppchar_t c; 1522 1523 /* If we previously performed any trigraph or line splicing 1524 
transformations, undo them within the body of the raw string. */ 1525 while (note->pos < cur) 1526 ++note; 1527 for (; note->pos == cur; ++note) 1528 { 1529 switch (note->type) 1530 { 1531 case '\\': 1532 case ' ': 1533 /* Restore backslash followed by newline. */ 1534 BUF_APPEND (base, cur - base); 1535 base = cur; 1536 BUF_APPEND ("\\", 1); 1537 after_backslash: 1538 if (note->type == ' ') 1539 { 1540 /* GNU backslash whitespace newline extension. FIXME 1541 could be any sequence of non-vertical space. When we 1542 can properly restore any such sequence, we should mark 1543 this note as handled so _cpp_process_line_notes 1544 doesn't warn. */ 1545 BUF_APPEND (" ", 1); 1546 } 1547 1548 BUF_APPEND ("\n", 1); 1549 break; 1550 1551 case 0: 1552 /* Already handled. */ 1553 break; 1554 1555 default: 1556 if (_cpp_trigraph_map[note->type]) 1557 { 1558 /* Don't warn about this trigraph in 1559 _cpp_process_line_notes, since trigraphs show up as 1560 trigraphs in raw strings. */ 1561 uchar type = note->type; 1562 note->type = 0; 1563 1564 if (!CPP_OPTION (pfile, trigraphs)) 1565 /* If we didn't convert the trigraph in the first 1566 place, don't do anything now either. */ 1567 break; 1568 1569 BUF_APPEND (base, cur - base); 1570 base = cur; 1571 BUF_APPEND ("??", 2); 1572 1573 /* ??/ followed by newline gets two line notes, one for 1574 the trigraph and one for the backslash/newline. */ 1575 if (type == '/' && note[1].pos == cur) 1576 { 1577 if (note[1].type != '\\' 1578 && note[1].type != ' ') 1579 abort (); 1580 BUF_APPEND ("/", 1); 1581 ++note; 1582 goto after_backslash; 1583 } 1584 /* The ) from ??) could be part of the suffix. */ 1585 else if (type == ')' 1586 && strncmp ((const char *) cur+1, 1587 (const char *) raw_prefix, 1588 raw_prefix_len) == 0 1589 && cur[raw_prefix_len+1] == '"') 1590 { 1591 BUF_APPEND (")", 1); 1592 base++; 1593 cur += raw_prefix_len + 2; 1594 goto break_outer_loop; 1595 } 1596 else 1597 { 1598 /* Skip the replacement character. 
*/ 1599 base = ++cur; 1600 BUF_APPEND (&type, 1); 1601 } 1602 } 1603 else 1604 abort (); 1605 break; 1606 } 1607 } 1608 c = *cur++; 1609 1610 if (c == ')' 1611 && strncmp ((const char *) cur, (const char *) raw_prefix, 1612 raw_prefix_len) == 0 1613 && cur[raw_prefix_len] == '"') 1614 { 1615 cur += raw_prefix_len + 1; 1616 break; 1617 } 1618 else if (c == '\n') 1619 { 1620 if (pfile->state.in_directive 1621 || pfile->state.parsing_args 1622 || pfile->state.in_deferred_pragma) 1623 { 1624 cur--; 1625 type = CPP_OTHER; 1626 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0, 1627 "unterminated raw string"); 1628 break; 1629 } 1630 1631 BUF_APPEND (base, cur - base); 1632 1633 if (pfile->buffer->cur < pfile->buffer->rlimit) 1634 CPP_INCREMENT_LINE (pfile, 0); 1635 pfile->buffer->need_line = true; 1636 1637 pfile->buffer->cur = cur-1; 1638 _cpp_process_line_notes (pfile, false); 1639 if (!_cpp_get_fresh_line (pfile)) 1640 { 1641 source_location src_loc = token->src_loc; 1642 token->type = CPP_EOF; 1643 /* Tell the compiler the line number of the EOF token. */ 1644 token->src_loc = pfile->line_table->highest_line; 1645 token->flags = BOL; 1646 if (first_buff != NULL) 1647 _cpp_release_buff (pfile, first_buff); 1648 cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0, 1649 "unterminated raw string"); 1650 return; 1651 } 1652 1653 cur = base = pfile->buffer->cur; 1654 note = &pfile->buffer->notes[pfile->buffer->cur_note]; 1655 } 1656 } 1657 break_outer_loop: 1658 1659 if (CPP_OPTION (pfile, user_literals)) 1660 { 1661 /* According to C++11 [lex.ext]p10, a ud-suffix not starting with an 1662 underscore is ill-formed. Since this breaks programs using macros 1663 from inttypes.h, we generate a warning and treat the ud-suffix as a 1664 separate preprocessing token. This approach is under discussion by 1665 the standards committee, and has been adopted as a conforming 1666 extension by other front ends such as clang. 
1667 A special exception is made for the suffix 's' which will be 1668 standardized as a user-defined literal suffix for strings. */ 1669 if (ISALPHA (*cur) && *cur != 's') 1670 { 1671 /* Raise a warning, but do not consume subsequent tokens. */ 1672 if (CPP_OPTION (pfile, warn_literal_suffix)) 1673 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX, 1674 token->src_loc, 0, 1675 "invalid suffix on literal; C++11 requires " 1676 "a space between literal and identifier"); 1677 } 1678 /* Grab user defined literal suffix. */ 1679 else if (ISIDST (*cur)) 1680 { 1681 type = cpp_userdef_string_add_type (type); 1682 ++cur; 1683 1684 while (ISIDNUM (*cur)) 1685 ++cur; 1686 } 1687 } 1688 1689 pfile->buffer->cur = cur; 1690 if (first_buff == NULL) 1691 create_literal (pfile, token, base, cur - base, type); 1692 else 1693 { 1694 uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1); 1695 1696 token->type = type; 1697 token->val.str.len = total_len + (cur - base); 1698 token->val.str.text = dest; 1699 last_buff = first_buff; 1700 while (last_buff != NULL) 1701 { 1702 memcpy (dest, last_buff->base, 1703 BUFF_FRONT (last_buff) - last_buff->base); 1704 dest += BUFF_FRONT (last_buff) - last_buff->base; 1705 last_buff = last_buff->next; 1706 } 1707 _cpp_release_buff (pfile, first_buff); 1708 memcpy (dest, base, cur - base); 1709 dest[cur - base] = '\0'; 1710 } 1711 } 1712 1713 /* Lexes a string, character constant, or angle-bracketed header file 1714 name. The stored string contains the spelling, including opening 1715 quote and any leading 'L', 'u', 'U' or 'u8' and optional 1716 'R' modifier. It returns the type of the literal, or CPP_OTHER 1717 if it was not properly terminated, or CPP_LESS for an unterminated 1718 header name which must be relexed as normal tokens. 1719 1720 The spelling is NUL-terminated, but it is not guaranteed that this 1721 is the first NUL since embedded NULs are preserved. 
*/
static void
lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
{
  const uchar *cur = base;
  cppchar_t terminator = *cur++;
  enum cpp_ttype type;
  bool nul_seen = false;

  /* Step over an encoding prefix: L, U, u or u8.  */
  switch (terminator)
    {
    case 'L':
    case 'U':
      terminator = *cur++;
      break;

    case 'u':
      terminator = *cur++;
      if (terminator == '8')
        terminator = *cur++;
      break;

    default:
      break;
    }

  /* Raw strings have their own lexer.  */
  if (terminator == 'R')
    {
      lex_raw_string (pfile, token, base, cur);
      return;
    }

  /* The opening character selects the family of token; the prefix at
     BASE selects the member.  Anything that is neither kind of quote
     is treated as a header name closed by '>'.  */
  switch (terminator)
    {
    case '"':
      if (*base == 'L')
        type = CPP_WSTRING;
      else if (*base == 'U')
        type = CPP_STRING32;
      else if (*base == 'u')
        type = (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16);
      else
        type = CPP_STRING;
      break;

    case '\'':
      if (*base == 'L')
        type = CPP_WCHAR;
      else if (*base == 'U')
        type = CPP_CHAR32;
      else if (*base == 'u')
        type = CPP_CHAR16;
      else
        type = CPP_CHAR;
      break;

    default:
      terminator = '>';
      type = CPP_HEADER_NAME;
      break;
    }

  for (;;)
    {
      const cppchar_t c = *cur++;

      /* The terminator is never a backslash, so it can be tested
         first.  */
      if (c == terminator)
        break;

      if (c == '\n')
        {
          cur--;
          /* Unmatched quotes always yield undefined behavior, but
             greedy lexing means that what appears to be an unterminated
             header name may actually be a legitimate sequence of tokens.  */
          if (terminator == '>')
            {
              token->type = CPP_LESS;
              return;
            }
          type = CPP_OTHER;
          break;
        }

      if (c == '\\')
        {
          /* In #include-style directives, terminators are not
             escapable.  A backslash directly before the newline does
             not swallow it either.  */
          if (!pfile->state.angled_headers && *cur != '\n')
            cur++;
        }
      else if (c == '\0')
        nul_seen = true;
    }

  if (nul_seen && !pfile->state.skipping)
    cpp_error (pfile, CPP_DL_WARNING,
               "null character(s) preserved in literal");

  if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
    cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
               (int) terminator);

  if (CPP_OPTION (pfile, user_literals))
    {
      /* According to C++11 [lex.ext]p10, a ud-suffix not starting with an
         underscore is ill-formed.  Since this breaks programs using macros
         from inttypes.h, we generate a warning and treat the ud-suffix as a
         separate preprocessing token.  This approach is under discussion by
         the standards committee, and has been adopted as a conforming
         extension by other front ends such as clang.
         A special exception is made for the suffix 's' which will be
         standardized as a user-defined literal suffix for strings.  */
      const bool ill_formed_suffix = ISALPHA (*cur) && *cur != 's';

      if (ill_formed_suffix)
        {
          /* Raise a warning, but do not consume subsequent tokens.  */
          if (CPP_OPTION (pfile, warn_literal_suffix))
            cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
                                   token->src_loc, 0,
                                   "invalid suffix on literal; C++11 requires "
                                   "a space between literal and identifier");
        }
      else if (ISIDST (*cur))
        {
          /* Grab user defined literal suffix.  */
          type = cpp_userdef_char_add_type (type);
          type = cpp_userdef_string_add_type (type);

          for (++cur; ISIDNUM (*cur); ++cur)
            continue;
        }
    }

  pfile->buffer->cur = cur;
  create_literal (pfile, token, base, cur - base, type);
}

/* Return the comment table.  The client may not make any assumption
   about the ordering of the table.
*/ 1829 cpp_comment_table * 1830 cpp_get_comments (cpp_reader *pfile) 1831 { 1832 return &pfile->comments; 1833 } 1834 1835 /* Append a comment to the end of the comment table. */ 1836 static void 1837 store_comment (cpp_reader *pfile, cpp_token *token) 1838 { 1839 int len; 1840 1841 if (pfile->comments.allocated == 0) 1842 { 1843 pfile->comments.allocated = 256; 1844 pfile->comments.entries = (cpp_comment *) xmalloc 1845 (pfile->comments.allocated * sizeof (cpp_comment)); 1846 } 1847 1848 if (pfile->comments.count == pfile->comments.allocated) 1849 { 1850 pfile->comments.allocated *= 2; 1851 pfile->comments.entries = (cpp_comment *) xrealloc 1852 (pfile->comments.entries, 1853 pfile->comments.allocated * sizeof (cpp_comment)); 1854 } 1855 1856 len = token->val.str.len; 1857 1858 /* Copy comment. Note, token may not be NULL terminated. */ 1859 pfile->comments.entries[pfile->comments.count].comment = 1860 (char *) xmalloc (sizeof (char) * (len + 1)); 1861 memcpy (pfile->comments.entries[pfile->comments.count].comment, 1862 token->val.str.text, len); 1863 pfile->comments.entries[pfile->comments.count].comment[len] = '\0'; 1864 1865 /* Set source location. */ 1866 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc; 1867 1868 /* Increment the count of entries in the comment table. */ 1869 pfile->comments.count++; 1870 } 1871 1872 /* The stored comment includes the comment start and any terminator. */ 1873 static void 1874 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from, 1875 cppchar_t type) 1876 { 1877 unsigned char *buffer; 1878 unsigned int len, clen, i; 1879 int convert_to_c = (pfile->state.in_directive || pfile->state.parsing_args) 1880 && type == '/'; 1881 1882 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */ 1883 1884 /* C++ comments probably (not definitely) have moved past a new 1885 line, which we don't want to save in the comment. 
*/ 1886 if (is_vspace (pfile->buffer->cur[-1])) 1887 len--; 1888 1889 /* If we are currently in a directive or in argument parsing, then 1890 we need to store all C++ comments as C comments internally, and 1891 so we need to allocate a little extra space in that case. 1892 1893 Note that the only time we encounter a directive here is 1894 when we are saving comments in a "#define". */ 1895 clen = convert_to_c ? len + 2 : len; 1896 1897 buffer = _cpp_unaligned_alloc (pfile, clen); 1898 1899 token->type = CPP_COMMENT; 1900 token->val.str.len = clen; 1901 token->val.str.text = buffer; 1902 1903 buffer[0] = '/'; 1904 memcpy (buffer + 1, from, len - 1); 1905 1906 /* Finish conversion to a C comment, if necessary. */ 1907 if (convert_to_c) 1908 { 1909 buffer[1] = '*'; 1910 buffer[clen - 2] = '*'; 1911 buffer[clen - 1] = '/'; 1912 /* As there can be in a C++ comments illegal sequences for C comments 1913 we need to filter them out. */ 1914 for (i = 2; i < (clen - 2); i++) 1915 if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*')) 1916 buffer[i] = '|'; 1917 } 1918 1919 /* Finally store this comment for use by clients of libcpp. */ 1920 store_comment (pfile, token); 1921 } 1922 1923 /* Allocate COUNT tokens for RUN. */ 1924 void 1925 _cpp_init_tokenrun (tokenrun *run, unsigned int count) 1926 { 1927 run->base = XNEWVEC (cpp_token, count); 1928 run->limit = run->base + count; 1929 run->next = NULL; 1930 } 1931 1932 /* Returns the next tokenrun, or creates one if there is none. */ 1933 static tokenrun * 1934 next_tokenrun (tokenrun *run) 1935 { 1936 if (run->next == NULL) 1937 { 1938 run->next = XNEW (tokenrun); 1939 run->next->prev = run; 1940 _cpp_init_tokenrun (run->next, 250); 1941 } 1942 1943 return run->next; 1944 } 1945 1946 /* Return the number of not yet processed token in a given 1947 context. 
*/ 1948 int 1949 _cpp_remaining_tokens_num_in_context (cpp_context *context) 1950 { 1951 if (context->tokens_kind == TOKENS_KIND_DIRECT) 1952 return (LAST (context).token - FIRST (context).token); 1953 else if (context->tokens_kind == TOKENS_KIND_INDIRECT 1954 || context->tokens_kind == TOKENS_KIND_EXTENDED) 1955 return (LAST (context).ptoken - FIRST (context).ptoken); 1956 else 1957 abort (); 1958 } 1959 1960 /* Returns the token present at index INDEX in a given context. If 1961 INDEX is zero, the next token to be processed is returned. */ 1962 static const cpp_token* 1963 _cpp_token_from_context_at (cpp_context *context, int index) 1964 { 1965 if (context->tokens_kind == TOKENS_KIND_DIRECT) 1966 return &(FIRST (context).token[index]); 1967 else if (context->tokens_kind == TOKENS_KIND_INDIRECT 1968 || context->tokens_kind == TOKENS_KIND_EXTENDED) 1969 return FIRST (context).ptoken[index]; 1970 else 1971 abort (); 1972 } 1973 1974 /* Look ahead in the input stream. */ 1975 const cpp_token * 1976 cpp_peek_token (cpp_reader *pfile, int index) 1977 { 1978 cpp_context *context = pfile->context; 1979 const cpp_token *peektok; 1980 int count; 1981 1982 /* First, scan through any pending cpp_context objects. */ 1983 while (context->prev) 1984 { 1985 ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context); 1986 1987 if (index < (int) sz) 1988 return _cpp_token_from_context_at (context, index); 1989 index -= (int) sz; 1990 context = context->prev; 1991 } 1992 1993 /* We will have to read some new tokens after all (and do so 1994 without invalidating preceding tokens). 
*/ 1995 count = index; 1996 pfile->keep_tokens++; 1997 1998 do 1999 { 2000 peektok = _cpp_lex_token (pfile); 2001 if (peektok->type == CPP_EOF) 2002 return peektok; 2003 } 2004 while (index--); 2005 2006 _cpp_backup_tokens_direct (pfile, count + 1); 2007 pfile->keep_tokens--; 2008 2009 return peektok; 2010 } 2011 2012 /* Allocate a single token that is invalidated at the same time as the 2013 rest of the tokens on the line. Has its line and col set to the 2014 same as the last lexed token, so that diagnostics appear in the 2015 right place. */ 2016 cpp_token * 2017 _cpp_temp_token (cpp_reader *pfile) 2018 { 2019 cpp_token *old, *result; 2020 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token; 2021 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads; 2022 2023 old = pfile->cur_token - 1; 2024 /* Any pre-existing lookaheads must not be clobbered. */ 2025 if (la) 2026 { 2027 if (sz <= la) 2028 { 2029 tokenrun *next = next_tokenrun (pfile->cur_run); 2030 2031 if (sz < la) 2032 memmove (next->base + 1, next->base, 2033 (la - sz) * sizeof (cpp_token)); 2034 2035 next->base[0] = pfile->cur_run->limit[-1]; 2036 } 2037 2038 if (sz > 1) 2039 memmove (pfile->cur_token + 1, pfile->cur_token, 2040 MIN (la, sz - 1) * sizeof (cpp_token)); 2041 } 2042 2043 if (!sz && pfile->cur_token == pfile->cur_run->limit) 2044 { 2045 pfile->cur_run = next_tokenrun (pfile->cur_run); 2046 pfile->cur_token = pfile->cur_run->base; 2047 } 2048 2049 result = pfile->cur_token++; 2050 result->src_loc = old->src_loc; 2051 return result; 2052 } 2053 2054 /* Lex a token into RESULT (external interface). Takes care of issues 2055 like directive handling, token lookahead, multiple include 2056 optimization and skipping. 
*/
const cpp_token *
_cpp_lex_token (cpp_reader *pfile)
{
  cpp_token *tok;

  for (;;)
    {
      /* Move on to the following run once this one is used up.  */
      if (pfile->cur_token == pfile->cur_run->limit)
        {
          pfile->cur_run = next_tokenrun (pfile->cur_run);
          pfile->cur_token = pfile->cur_run->base;
        }

      /* We assume that the current token is somewhere in the current
         run.  */
      if (pfile->cur_token < pfile->cur_run->base
          || pfile->cur_token >= pfile->cur_run->limit)
        abort ();

      /* Tokens that were backed up are handed out again before any
         new input is lexed.  */
      if (!pfile->lookaheads)
        tok = _cpp_lex_direct (pfile);
      else
        {
          pfile->lookaheads--;
          tok = pfile->cur_token++;
        }

      if (tok->flags & BOL)
        {
          /* 6.10.3 p 11: Directives in a list of macro arguments
             gives undefined behavior.  This implementation
             handles the directive as normal.  */
          const bool at_directive
            = (tok->type == CPP_HASH && pfile->state.parsing_args != 1);

          if (at_directive)
            {
              /* Is this a directive.  If _cpp_handle_directive returns
                 false, it is an assembler #.  */
              if (_cpp_handle_directive (pfile, tok->flags & PREV_WHITE))
                {
                  if (pfile->directive_result.type == CPP_PADDING)
                    continue;
                  tok = &pfile->directive_result;
                }
            }
          else if (pfile->state.in_deferred_pragma)
            tok = &pfile->directive_result;

          if (pfile->cb.line_change && !pfile->state.skipping)
            pfile->cb.line_change (pfile, tok, pfile->state.parsing_args);
        }

      /* We don't skip tokens in directives.  */
      if (pfile->state.in_directive || pfile->state.in_deferred_pragma)
        break;

      /* Outside a directive, invalidate controlling macros.  At file
         EOF, _cpp_lex_direct takes care of popping the buffer, so we never
         get here and MI optimization works.  */
      pfile->mi_valid = false;

      /* While skipping, only CPP_EOF is passed on to the caller.  */
      if (!pfile->state.skipping || tok->type == CPP_EOF)
        break;
    }

  return tok;
}

/* Returns true if a fresh line has been loaded.
*/ 2124 bool 2125 _cpp_get_fresh_line (cpp_reader *pfile) 2126 { 2127 int return_at_eof; 2128 2129 /* We can't get a new line until we leave the current directive. */ 2130 if (pfile->state.in_directive) 2131 return false; 2132 2133 for (;;) 2134 { 2135 cpp_buffer *buffer = pfile->buffer; 2136 2137 if (!buffer->need_line) 2138 return true; 2139 2140 if (buffer->next_line < buffer->rlimit) 2141 { 2142 _cpp_clean_line (pfile); 2143 return true; 2144 } 2145 2146 /* First, get out of parsing arguments state. */ 2147 if (pfile->state.parsing_args) 2148 return false; 2149 2150 /* End of buffer. Non-empty files should end in a newline. */ 2151 if (buffer->buf != buffer->rlimit 2152 && buffer->next_line > buffer->rlimit 2153 && !buffer->from_stage3) 2154 { 2155 /* Clip to buffer size. */ 2156 buffer->next_line = buffer->rlimit; 2157 } 2158 2159 return_at_eof = buffer->return_at_eof; 2160 _cpp_pop_buffer (pfile); 2161 if (pfile->buffer == NULL || return_at_eof) 2162 return false; 2163 } 2164 } 2165 2166 #define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE) \ 2167 do \ 2168 { \ 2169 result->type = ELSE_TYPE; \ 2170 if (*buffer->cur == CHAR) \ 2171 buffer->cur++, result->type = THEN_TYPE; \ 2172 } \ 2173 while (0) 2174 2175 /* Lex a token into pfile->cur_token, which is also incremented, to 2176 get diagnostics pointing to the correct location. 2177 2178 Does not handle issues such as token lookahead, multiple-include 2179 optimization, directives, skipping etc. This function is only 2180 suitable for use by _cpp_lex_token, and in special cases like 2181 lex_expansion_token which doesn't care for any of these issues. 2182 2183 When meeting a newline, returns CPP_EOF if parsing a directive, 2184 otherwise returns to the start of the token buffer if permissible. 2185 Returns the location of the lexed token. 
*/
cpp_token *
_cpp_lex_direct (cpp_reader *pfile)
{
  cppchar_t ch;
  cpp_buffer *buffer;
  const unsigned char *comment_begin;
  cpp_token *result = pfile->cur_token++;

 fresh_line:
  result->flags = 0;
  buffer = pfile->buffer;
  if (buffer->need_line)
    {
      if (pfile->state.in_deferred_pragma)
        {
          /* The line of a deferred pragma has ended: report that and
             leave deferred-pragma mode.  */
          result->type = CPP_PRAGMA_EOL;
          pfile->state.in_deferred_pragma = false;
          if (!pfile->state.pragma_allow_expansion)
            pfile->state.prevent_expansion--;
          return result;
        }

      if (!_cpp_get_fresh_line (pfile))
        {
          result->type = CPP_EOF;
          if (!pfile->state.in_directive)
            {
              /* Tell the compiler the line number of the EOF token.  */
              result->src_loc = pfile->line_table->highest_line;
              result->flags = BOL;
            }
          return result;
        }

      if (!pfile->keep_tokens)
        {
          /* Earlier tokens need not be kept, so lex into the first
             slot of the base run again.  */
          pfile->cur_run = &pfile->base_run;
          result = pfile->base_run.base;
          pfile->cur_token = result + 1;
        }

      /* First token of a line.  */
      result->flags = BOL;
      if (pfile->state.parsing_args == 2)
        result->flags |= PREV_WHITE;
    }
  buffer = pfile->buffer;

 update_tokens_line:
  result->src_loc = pfile->line_table->highest_line;

 skipped_white:
  if (buffer->cur >= buffer->notes[buffer->cur_note].pos
      && !pfile->overlaid_buffer)
    {
      _cpp_process_line_notes (pfile, false);
      result->src_loc = pfile->line_table->highest_line;
    }
  ch = *buffer->cur++;

  if (!pfile->forced_token_location_p)
    result->src_loc
      = linemap_position_for_column (pfile->line_table,
                                     CPP_BUF_COLUMN (buffer, buffer->cur));
  else
    result->src_loc = *pfile->forced_token_location_p;

  switch (ch)
    {
    case ' ': case '\t': case '\f': case '\v': case '\0':
      result->flags |= PREV_WHITE;
      skip_whitespace (pfile, ch);
      goto skipped_white;

    case '\n':
      if (buffer->cur < buffer->rlimit)
        CPP_INCREMENT_LINE (pfile, 0);
      buffer->need_line = true;
      goto fresh_line;

    case '0': case '1': case '2': case '3': case '4':
    case '5': case '6': case '7': case '8': case '9':
      {
        struct normalize_state norm = INITIAL_NORMALIZE_STATE;

        result->type = CPP_NUMBER;
        lex_number (pfile, &result->val.str, &norm);
        warn_about_normalization (pfile, result, &norm);
      }
      break;

    case 'L':
    case 'u':
    case 'U':
    case 'R':
      /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters,
         wide strings or raw strings.  */
      if (ch == 'L' || CPP_OPTION (pfile, rliterals)
          || (ch != 'R' && CPP_OPTION (pfile, uliterals)))
        {
          bool opens_literal = false;

          /* Decide from the character(s) after the prefix letter.  */
          switch (*buffer->cur)
            {
            case '\'':
              opens_literal = (ch != 'R');
              break;

            case '"':
              opens_literal = true;
              break;

            case 'R':
              opens_literal = (ch != 'R'
                               && buffer->cur[1] == '"'
                               && CPP_OPTION (pfile, rliterals));
              break;

            case '8':
              opens_literal = (ch == 'u'
                               && (buffer->cur[1] == '"'
                                   || (buffer->cur[1] == 'R'
                                       && buffer->cur[2] == '"'
                                       && CPP_OPTION (pfile, rliterals))));
              break;

            default:
              break;
            }

          if (opens_literal)
            {
              lex_string (pfile, result, buffer->cur - 1);
              break;
            }
        }
      /* Fall through: an ordinary identifier.  */

    case '_':
    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
    case 's': case 't':           case 'v': case 'w': case 'x':
    case 'y': case 'z':
    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    case 'G': case 'H': case 'I': case 'J': case 'K':
    case 'M': case 'N': case 'O': case 'P': case 'Q':
    case 'S': case 'T':           case 'V': case 'W': case 'X':
    case 'Y': case 'Z':
      result->type = CPP_NAME;
      {
        struct normalize_state norm = INITIAL_NORMALIZE_STATE;

        result->val.node.node
          = lex_identifier (pfile, buffer->cur - 1, false, &norm);
        warn_about_normalization (pfile, result, &norm);
      }

      /* Convert named operators to their proper types.  */
      if (result->val.node.node->flags & NODE_OPERATOR)
        {
          result->flags |= NAMED_OP;
          result->type
            = (enum cpp_ttype) result->val.node.node->directive_index;
        }
      break;

    case '\'':
    case '"':
      lex_string (pfile, result, buffer->cur - 1);
      break;

    case '/':
      /* A potential block or line comment.  From here on CH holds the
         character after the slash.  */
      comment_begin = buffer->cur;
      ch = *buffer->cur;

      if (ch == '*')
        {
          if (_cpp_skip_block_comment (pfile))
            cpp_error (pfile, CPP_DL_ERROR, "unterminated comment");
        }
      else if (ch == '/' && (CPP_OPTION (pfile, cplusplus_comments)
                             || cpp_in_system_header (pfile)))
        {
          /* Warn about comments only if pedantically GNUC89, and not
             in system headers.  */
          if (CPP_OPTION (pfile, lang) == CLK_GNUC89 && CPP_PEDANTIC (pfile)
              && !buffer->warned_cplusplus_comments)
            {
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "C++ style comments are not allowed in ISO C90");
              cpp_error (pfile, CPP_DL_PEDWARN,
                         "(this will be reported only once per input file)");
              buffer->warned_cplusplus_comments = 1;
            }

          if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments))
            cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment");
        }
      else
        {
          /* Not a comment after all: a division operator.  */
          if (ch == '=')
            {
              buffer->cur++;
              result->type = CPP_DIV_EQ;
            }
          else
            result->type = CPP_DIV;
          break;
        }

      if (!pfile->state.save_comments)
        {
          /* The comment is dropped and counts as whitespace.  */
          result->flags |= PREV_WHITE;
          goto update_tokens_line;
        }

      /* Save the comment as a token in its own right.  */
      save_comment (pfile, result, comment_begin, ch);
      break;

    case '<':
      if (pfile->state.angled_headers)
        {
          lex_string (pfile, result, buffer->cur - 1);
          if (result->type != CPP_LESS)
            break;
        }

      result->type = CPP_LESS;
      if (*buffer->cur == '=')
        {
          buffer->cur++;
          result->type = CPP_LESS_EQ;
        }
      else if (*buffer->cur == '<')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT);
        }
      else if (CPP_OPTION (pfile, digraphs))
        {
          if (*buffer->cur == ':')
            {
              /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next
                 three characters are <:: and the subsequent character
                 is neither : nor >, the < is treated as a preprocessor
                 token by itself".  */
              if (CPP_OPTION (pfile, cplusplus)
                  && (CPP_OPTION (pfile, lang) == CLK_CXX11
                      || CPP_OPTION (pfile, lang) == CLK_GNUCXX11)
                  && buffer->cur[1] == ':'
                  && buffer->cur[2] != ':' && buffer->cur[2] != '>')
                break;

              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_SQUARE;
            }
          else if (*buffer->cur == '%')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_OPEN_BRACE;
            }
        }
      break;

    case '>':
      result->type = CPP_GREATER;
      if (*buffer->cur == '=')
        {
          buffer->cur++;
          result->type = CPP_GREATER_EQ;
        }
      else if (*buffer->cur == '>')
        {
          buffer->cur++;
          IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT);
        }
      break;

    case '%':
      result->type = CPP_MOD;
      if (*buffer->cur == '=')
        {
          buffer->cur++;
          result->type = CPP_MOD_EQ;
        }
      else if (CPP_OPTION (pfile, digraphs))
        {
          if (*buffer->cur == ':')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_HASH;
              if (*buffer->cur == '%' && buffer->cur[1] == ':')
                {
                  buffer->cur += 2;
                  result->type = CPP_PASTE;
                  result->val.token_no = 0;
                }
            }
          else if (*buffer->cur == '>')
            {
              buffer->cur++;
              result->flags |= DIGRAPH;
              result->type = CPP_CLOSE_BRACE;
            }
        }
      break;

    case '.':
      result->type = CPP_DOT;
      if (ISDIGIT (*buffer->cur))
        {
          struct normalize_state norm = INITIAL_NORMALIZE_STATE;

          result->type = CPP_NUMBER;
          lex_number (pfile, &result->val.str, &norm);
          warn_about_normalization (pfile, result, &norm);
        }
      else if (*buffer->cur == '.' && buffer->cur[1] == '.')
        {
          buffer->cur += 2;
          result->type = CPP_ELLIPSIS;
        }
      else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
        {
          buffer->cur++;
          result->type = CPP_DOT_STAR;
        }
      break;

    case '+':
      result->type = CPP_PLUS;
      if (*buffer->cur == '+')
        {
          buffer->cur++;
          result->type = CPP_PLUS_PLUS;
        }
      else if (*buffer->cur == '=')
        {
          buffer->cur++;
          result->type = CPP_PLUS_EQ;
        }
      break;

    case '-':
      result->type = CPP_MINUS;
      if (*buffer->cur == '>')
        {
          buffer->cur++;
          result->type = CPP_DEREF;
          if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus))
            {
              buffer->cur++;
              result->type = CPP_DEREF_STAR;
            }
        }
      else if (*buffer->cur == '-')
        {
          buffer->cur++;
          result->type = CPP_MINUS_MINUS;
        }
      else if (*buffer->cur == '=')
        {
          buffer->cur++;
          result->type = CPP_MINUS_EQ;
        }
      break;

    case '&':
      result->type = CPP_AND;
      if (*buffer->cur == '&')
        {
          buffer->cur++;
          result->type = CPP_AND_AND;
        }
      else if (*buffer->cur == '=')
        {
          buffer->cur++;
          result->type = CPP_AND_EQ;
        }
      break;

    case '|':
      result->type = CPP_OR;
      if (*buffer->cur == '|')
        {
          buffer->cur++;
          result->type = CPP_OR_OR;
        }
      else if (*buffer->cur == '=')
        {
          buffer->cur++;
          result->type = CPP_OR_EQ;
        }
      break;

    case ':':
      result->type = CPP_COLON;
      if (*buffer->cur == ':' && CPP_OPTION (pfile, cplusplus))
        {
          buffer->cur++;
          result->type = CPP_SCOPE;
        }
      else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs))
        {
          buffer->cur++;
          result->flags |= DIGRAPH;
          result->type = CPP_CLOSE_SQUARE;
        }
      break;

    case '*':
      IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT);
      break;

    case '=':
      IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ);
      break;

    case '!':
      IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT);
      break;

    case '^':
      IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR);
      break;

    case '#':
      IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH);
      result->val.token_no = 0;
      break;

    case '?': result->type = CPP_QUERY; break;
    case '~': result->type = CPP_COMPL; break;
    case ',': result->type = CPP_COMMA; break;
    case '(': result->type = CPP_OPEN_PAREN; break;
    case ')': result->type = CPP_CLOSE_PAREN; break;
    case '[': result->type = CPP_OPEN_SQUARE; break;
    case ']': result->type = CPP_CLOSE_SQUARE; break;
    case '{': result->type = CPP_OPEN_BRACE; break;
    case '}': result->type = CPP_CLOSE_BRACE; break;
    case ';': result->type = CPP_SEMICOLON; break;

      /* @ is a punctuator in Objective-C.  */
    case '@': result->type = CPP_ATSIGN; break;

    case '$':
    case '\\':
      {
        /* Step back onto the character and see whether an identifier
           can start here.  */
        const uchar *start = --buffer->cur;
        struct normalize_state norm = INITIAL_NORMALIZE_STATE;

        if (forms_identifier_p (pfile, true, &norm))
          {
            result->type = CPP_NAME;
            result->val.node.node = lex_identifier (pfile, start, true,
                                                    &norm);
            warn_about_normalization (pfile, result, &norm);
            break;
          }

        /* Not an identifier: consume the character again.  */
        buffer->cur++;
      }
      /* Fall through.  */

    default:
      /* Anything else becomes a one-character CPP_OTHER token.  */
      create_literal (pfile, result, buffer->cur - 1, 1, CPP_OTHER);
      break;
    }

  return result;
}

/* An upper bound on the number of bytes needed to spell TOKEN.
   Does not include preceding whitespace.
*/ 2569 unsigned int 2570 cpp_token_len (const cpp_token *token) 2571 { 2572 unsigned int len; 2573 2574 switch (TOKEN_SPELL (token)) 2575 { 2576 default: len = 6; break; 2577 case SPELL_LITERAL: len = token->val.str.len; break; 2578 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break; 2579 } 2580 2581 return len; 2582 } 2583 2584 /* Parse UTF-8 out of NAMEP and place a \U escape in BUFFER. 2585 Return the number of bytes read out of NAME. (There are always 2586 10 bytes written to BUFFER.) */ 2587 2588 static size_t 2589 utf8_to_ucn (unsigned char *buffer, const unsigned char *name) 2590 { 2591 int j; 2592 int ucn_len = 0; 2593 int ucn_len_c; 2594 unsigned t; 2595 unsigned long utf32; 2596 2597 /* Compute the length of the UTF-8 sequence. */ 2598 for (t = *name; t & 0x80; t <<= 1) 2599 ucn_len++; 2600 2601 utf32 = *name & (0x7F >> ucn_len); 2602 for (ucn_len_c = 1; ucn_len_c < ucn_len; ucn_len_c++) 2603 { 2604 utf32 = (utf32 << 6) | (*++name & 0x3F); 2605 2606 /* Ill-formed UTF-8. */ 2607 if ((*name & ~0x3F) != 0x80) 2608 abort (); 2609 } 2610 2611 *buffer++ = '\\'; 2612 *buffer++ = 'U'; 2613 for (j = 7; j >= 0; j--) 2614 *buffer++ = "0123456789abcdef"[(utf32 >> (4 * j)) & 0xF]; 2615 return ucn_len; 2616 } 2617 2618 /* Given a token TYPE corresponding to a digraph, return a pointer to 2619 the spelling of the digraph. */ 2620 static const unsigned char * 2621 cpp_digraph2name (enum cpp_ttype type) 2622 { 2623 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH]; 2624 } 2625 2626 /* Write the spelling of a token TOKEN to BUFFER. The buffer must 2627 already contain the enough space to hold the token's spelling. 2628 Returns a pointer to the character after the last character written. 2629 FORSTRING is true if this is to be the spelling after translation 2630 phase 1 (this is different for UCNs). 2631 FIXME: Would be nice if we didn't need the PFILE argument. 
*/ 2632 unsigned char * 2633 cpp_spell_token (cpp_reader *pfile, const cpp_token *token, 2634 unsigned char *buffer, bool forstring) 2635 { 2636 switch (TOKEN_SPELL (token)) 2637 { 2638 case SPELL_OPERATOR: 2639 { 2640 const unsigned char *spelling; 2641 unsigned char c; 2642 2643 if (token->flags & DIGRAPH) 2644 spelling = cpp_digraph2name (token->type); 2645 else if (token->flags & NAMED_OP) 2646 goto spell_ident; 2647 else 2648 spelling = TOKEN_NAME (token); 2649 2650 while ((c = *spelling++) != '\0') 2651 *buffer++ = c; 2652 } 2653 break; 2654 2655 spell_ident: 2656 case SPELL_IDENT: 2657 if (forstring) 2658 { 2659 memcpy (buffer, NODE_NAME (token->val.node.node), 2660 NODE_LEN (token->val.node.node)); 2661 buffer += NODE_LEN (token->val.node.node); 2662 } 2663 else 2664 { 2665 size_t i; 2666 const unsigned char * name = NODE_NAME (token->val.node.node); 2667 2668 for (i = 0; i < NODE_LEN (token->val.node.node); i++) 2669 if (name[i] & ~0x7F) 2670 { 2671 i += utf8_to_ucn (buffer, name + i) - 1; 2672 buffer += 10; 2673 } 2674 else 2675 *buffer++ = NODE_NAME (token->val.node.node)[i]; 2676 } 2677 break; 2678 2679 case SPELL_LITERAL: 2680 memcpy (buffer, token->val.str.text, token->val.str.len); 2681 buffer += token->val.str.len; 2682 break; 2683 2684 case SPELL_NONE: 2685 cpp_error (pfile, CPP_DL_ICE, 2686 "unspellable token %s", TOKEN_NAME (token)); 2687 break; 2688 } 2689 2690 return buffer; 2691 } 2692 2693 /* Returns TOKEN spelt as a null-terminated string. The string is 2694 freed when the reader is destroyed. Useful for diagnostics. */ 2695 unsigned char * 2696 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token) 2697 { 2698 unsigned int len = cpp_token_len (token) + 1; 2699 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end; 2700 2701 end = cpp_spell_token (pfile, token, start, false); 2702 end[0] = '\0'; 2703 2704 return start; 2705 } 2706 2707 /* Returns a pointer to a string which spells the token defined by 2708 TYPE and FLAGS. 
Used by C front ends, which really should move to 2709 using cpp_token_as_text. */ 2710 const char * 2711 cpp_type2name (enum cpp_ttype type, unsigned char flags) 2712 { 2713 if (flags & DIGRAPH) 2714 return (const char *) cpp_digraph2name (type); 2715 else if (flags & NAMED_OP) 2716 return cpp_named_operator2name (type); 2717 2718 return (const char *) token_spellings[type].name; 2719 } 2720 2721 /* Writes the spelling of token to FP, without any preceding space. 2722 Separated from cpp_spell_token for efficiency - to avoid stdio 2723 double-buffering. */ 2724 void 2725 cpp_output_token (const cpp_token *token, FILE *fp) 2726 { 2727 switch (TOKEN_SPELL (token)) 2728 { 2729 case SPELL_OPERATOR: 2730 { 2731 const unsigned char *spelling; 2732 int c; 2733 2734 if (token->flags & DIGRAPH) 2735 spelling = cpp_digraph2name (token->type); 2736 else if (token->flags & NAMED_OP) 2737 goto spell_ident; 2738 else 2739 spelling = TOKEN_NAME (token); 2740 2741 c = *spelling; 2742 do 2743 putc (c, fp); 2744 while ((c = *++spelling) != '\0'); 2745 } 2746 break; 2747 2748 spell_ident: 2749 case SPELL_IDENT: 2750 { 2751 size_t i; 2752 const unsigned char * name = NODE_NAME (token->val.node.node); 2753 2754 for (i = 0; i < NODE_LEN (token->val.node.node); i++) 2755 if (name[i] & ~0x7F) 2756 { 2757 unsigned char buffer[10]; 2758 i += utf8_to_ucn (buffer, name + i) - 1; 2759 fwrite (buffer, 1, 10, fp); 2760 } 2761 else 2762 fputc (NODE_NAME (token->val.node.node)[i], fp); 2763 } 2764 break; 2765 2766 case SPELL_LITERAL: 2767 fwrite (token->val.str.text, 1, token->val.str.len, fp); 2768 break; 2769 2770 case SPELL_NONE: 2771 /* An error, most probably. */ 2772 break; 2773 } 2774 } 2775 2776 /* Compare two tokens. */ 2777 int 2778 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b) 2779 { 2780 if (a->type == b->type && a->flags == b->flags) 2781 switch (TOKEN_SPELL (a)) 2782 { 2783 default: /* Keep compiler happy. 
*/ 2784 case SPELL_OPERATOR: 2785 /* token_no is used to track where multiple consecutive ## 2786 tokens were originally located. */ 2787 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no); 2788 case SPELL_NONE: 2789 return (a->type != CPP_MACRO_ARG 2790 || a->val.macro_arg.arg_no == b->val.macro_arg.arg_no); 2791 case SPELL_IDENT: 2792 return a->val.node.node == b->val.node.node; 2793 case SPELL_LITERAL: 2794 return (a->val.str.len == b->val.str.len 2795 && !memcmp (a->val.str.text, b->val.str.text, 2796 a->val.str.len)); 2797 } 2798 2799 return 0; 2800 } 2801 2802 /* Returns nonzero if a space should be inserted to avoid an 2803 accidental token paste for output. For simplicity, it is 2804 conservative, and occasionally advises a space where one is not 2805 needed, e.g. "." and ".2". */ 2806 int 2807 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1, 2808 const cpp_token *token2) 2809 { 2810 enum cpp_ttype a = token1->type, b = token2->type; 2811 cppchar_t c; 2812 2813 if (token1->flags & NAMED_OP) 2814 a = CPP_NAME; 2815 if (token2->flags & NAMED_OP) 2816 b = CPP_NAME; 2817 2818 c = EOF; 2819 if (token2->flags & DIGRAPH) 2820 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0]; 2821 else if (token_spellings[b].category == SPELL_OPERATOR) 2822 c = token_spellings[b].name[0]; 2823 2824 /* Quickly get everything that can paste with an '='. */ 2825 if ((int) a <= (int) CPP_LAST_EQ && c == '=') 2826 return 1; 2827 2828 switch (a) 2829 { 2830 case CPP_GREATER: return c == '>'; 2831 case CPP_LESS: return c == '<' || c == '%' || c == ':'; 2832 case CPP_PLUS: return c == '+'; 2833 case CPP_MINUS: return c == '-' || c == '>'; 2834 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */ 2835 case CPP_MOD: return c == ':' || c == '>'; 2836 case CPP_AND: return c == '&'; 2837 case CPP_OR: return c == '|'; 2838 case CPP_COLON: return c == ':' || c == '>'; 2839 case CPP_DEREF: return c == '*'; 2840 case CPP_DOT: return c == '.' 
|| c == '%' || b == CPP_NUMBER; 2841 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */ 2842 case CPP_NAME: return ((b == CPP_NUMBER 2843 && name_p (pfile, &token2->val.str)) 2844 || b == CPP_NAME 2845 || b == CPP_CHAR || b == CPP_STRING); /* L */ 2846 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME 2847 || c == '.' || c == '+' || c == '-'); 2848 /* UCNs */ 2849 case CPP_OTHER: return ((token1->val.str.text[0] == '\\' 2850 && b == CPP_NAME) 2851 || (CPP_OPTION (pfile, objc) 2852 && token1->val.str.text[0] == '@' 2853 && (b == CPP_NAME || b == CPP_STRING))); 2854 default: break; 2855 } 2856 2857 return 0; 2858 } 2859 2860 /* Output all the remaining tokens on the current line, and a newline 2861 character, to FP. Leading whitespace is removed. If there are 2862 macros, special token padding is not performed. */ 2863 void 2864 cpp_output_line (cpp_reader *pfile, FILE *fp) 2865 { 2866 const cpp_token *token; 2867 2868 token = cpp_get_token (pfile); 2869 while (token->type != CPP_EOF) 2870 { 2871 cpp_output_token (token, fp); 2872 token = cpp_get_token (pfile); 2873 if (token->flags & PREV_WHITE) 2874 putc (' ', fp); 2875 } 2876 2877 putc ('\n', fp); 2878 } 2879 2880 /* Return a string representation of all the remaining tokens on the 2881 current line. The result is allocated using xmalloc and must be 2882 freed by the caller. */ 2883 unsigned char * 2884 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name) 2885 { 2886 const cpp_token *token; 2887 unsigned int out = dir_name ? ustrlen (dir_name) : 0; 2888 unsigned int alloced = 120 + out; 2889 unsigned char *result = (unsigned char *) xmalloc (alloced); 2890 2891 /* If DIR_NAME is empty, there are no initial contents. 
*/ 2892 if (dir_name) 2893 { 2894 sprintf ((char *) result, "#%s ", dir_name); 2895 out += 2; 2896 } 2897 2898 token = cpp_get_token (pfile); 2899 while (token->type != CPP_EOF) 2900 { 2901 unsigned char *last; 2902 /* Include room for a possible space and the terminating nul. */ 2903 unsigned int len = cpp_token_len (token) + 2; 2904 2905 if (out + len > alloced) 2906 { 2907 alloced *= 2; 2908 if (out + len > alloced) 2909 alloced = out + len; 2910 result = (unsigned char *) xrealloc (result, alloced); 2911 } 2912 2913 last = cpp_spell_token (pfile, token, &result[out], 0); 2914 out = last - result; 2915 2916 token = cpp_get_token (pfile); 2917 if (token->flags & PREV_WHITE) 2918 result[out++] = ' '; 2919 } 2920 2921 result[out] = '\0'; 2922 return result; 2923 } 2924 2925 /* Memory buffers. Changing these three constants can have a dramatic 2926 effect on performance. The values here are reasonable defaults, 2927 but might be tuned. If you adjust them, be sure to test across a 2928 range of uses of cpplib, including heavy nested function-like macro 2929 expansion. Also check the change in peak memory usage (NJAMD is a 2930 good tool for this). */ 2931 #define MIN_BUFF_SIZE 8000 2932 #define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2) 2933 #define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \ 2934 (MIN_EXTRA + ((BUFF)->limit - (BUFF)->cur) * 2) 2935 2936 #if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0) 2937 #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE! 2938 #endif 2939 2940 /* Create a new allocation buffer. Place the control block at the end 2941 of the buffer, so that buffer overflows will cause immediate chaos. 
*/ 2942 static _cpp_buff * 2943 new_buff (size_t len) 2944 { 2945 _cpp_buff *result; 2946 unsigned char *base; 2947 2948 if (len < MIN_BUFF_SIZE) 2949 len = MIN_BUFF_SIZE; 2950 len = CPP_ALIGN (len); 2951 2952 #ifdef ENABLE_VALGRIND_CHECKING 2953 /* Valgrind warns about uses of interior pointers, so put _cpp_buff 2954 struct first. */ 2955 size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT); 2956 base = XNEWVEC (unsigned char, len + slen); 2957 result = (_cpp_buff *) base; 2958 base += slen; 2959 #else 2960 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff)); 2961 result = (_cpp_buff *) (base + len); 2962 #endif 2963 result->base = base; 2964 result->cur = base; 2965 result->limit = base + len; 2966 result->next = NULL; 2967 return result; 2968 } 2969 2970 /* Place a chain of unwanted allocation buffers on the free list. */ 2971 void 2972 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff) 2973 { 2974 _cpp_buff *end = buff; 2975 2976 while (end->next) 2977 end = end->next; 2978 end->next = pfile->free_buffs; 2979 pfile->free_buffs = buff; 2980 } 2981 2982 /* Return a free buffer of size at least MIN_SIZE. */ 2983 _cpp_buff * 2984 _cpp_get_buff (cpp_reader *pfile, size_t min_size) 2985 { 2986 _cpp_buff *result, **p; 2987 2988 for (p = &pfile->free_buffs;; p = &(*p)->next) 2989 { 2990 size_t size; 2991 2992 if (*p == NULL) 2993 return new_buff (min_size); 2994 result = *p; 2995 size = result->limit - result->base; 2996 /* Return a buffer that's big enough, but don't waste one that's 2997 way too big. */ 2998 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size)) 2999 break; 3000 } 3001 3002 *p = result->next; 3003 result->next = NULL; 3004 result->cur = result->base; 3005 return result; 3006 } 3007 3008 /* Creates a new buffer with enough space to hold the uncommitted 3009 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies 3010 the excess bytes to the new buffer. 
Chains the new buffer after 3011 BUFF, and returns the new buffer. */ 3012 _cpp_buff * 3013 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra) 3014 { 3015 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra); 3016 _cpp_buff *new_buff = _cpp_get_buff (pfile, size); 3017 3018 buff->next = new_buff; 3019 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff)); 3020 return new_buff; 3021 } 3022 3023 /* Creates a new buffer with enough space to hold the uncommitted 3024 remaining bytes of the buffer pointed to by BUFF, and at least 3025 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer. 3026 Chains the new buffer before the buffer pointed to by BUFF, and 3027 updates the pointer to point to the new buffer. */ 3028 void 3029 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra) 3030 { 3031 _cpp_buff *new_buff, *old_buff = *pbuff; 3032 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra); 3033 3034 new_buff = _cpp_get_buff (pfile, size); 3035 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff)); 3036 new_buff->next = old_buff; 3037 *pbuff = new_buff; 3038 } 3039 3040 /* Free a chain of buffers starting at BUFF. */ 3041 void 3042 _cpp_free_buff (_cpp_buff *buff) 3043 { 3044 _cpp_buff *next; 3045 3046 for (; buff; buff = next) 3047 { 3048 next = buff->next; 3049 #ifdef ENABLE_VALGRIND_CHECKING 3050 free (buff); 3051 #else 3052 free (buff->base); 3053 #endif 3054 } 3055 } 3056 3057 /* Allocate permanent, unaligned storage of length LEN. 
*/ 3058 unsigned char * 3059 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len) 3060 { 3061 _cpp_buff *buff = pfile->u_buff; 3062 unsigned char *result = buff->cur; 3063 3064 if (len > (size_t) (buff->limit - result)) 3065 { 3066 buff = _cpp_get_buff (pfile, len); 3067 buff->next = pfile->u_buff; 3068 pfile->u_buff = buff; 3069 result = buff->cur; 3070 } 3071 3072 buff->cur = result + len; 3073 return result; 3074 } 3075 3076 /* Allocate permanent, unaligned storage of length LEN from a_buff. 3077 That buffer is used for growing allocations when saving macro 3078 replacement lists in a #define, and when parsing an answer to an 3079 assertion in #assert, #unassert or #if (and therefore possibly 3080 whilst expanding macros). It therefore must not be used by any 3081 code that they might call: specifically the lexer and the guts of 3082 the macro expander. 3083 3084 All existing other uses clearly fit this restriction: storing 3085 registered pragmas during initialization. */ 3086 unsigned char * 3087 _cpp_aligned_alloc (cpp_reader *pfile, size_t len) 3088 { 3089 _cpp_buff *buff = pfile->a_buff; 3090 unsigned char *result = buff->cur; 3091 3092 if (len > (size_t) (buff->limit - result)) 3093 { 3094 buff = _cpp_get_buff (pfile, len); 3095 buff->next = pfile->a_buff; 3096 pfile->a_buff = buff; 3097 result = buff->cur; 3098 } 3099 3100 buff->cur = result + len; 3101 return result; 3102 } 3103 3104 /* Say which field of TOK is in use. 
*/ 3105 3106 enum cpp_token_fld_kind 3107 cpp_token_val_index (cpp_token *tok) 3108 { 3109 switch (TOKEN_SPELL (tok)) 3110 { 3111 case SPELL_IDENT: 3112 return CPP_TOKEN_FLD_NODE; 3113 case SPELL_LITERAL: 3114 return CPP_TOKEN_FLD_STR; 3115 case SPELL_OPERATOR: 3116 if (tok->type == CPP_PASTE) 3117 return CPP_TOKEN_FLD_TOKEN_NO; 3118 else 3119 return CPP_TOKEN_FLD_NONE; 3120 case SPELL_NONE: 3121 if (tok->type == CPP_MACRO_ARG) 3122 return CPP_TOKEN_FLD_ARG_NO; 3123 else if (tok->type == CPP_PADDING) 3124 return CPP_TOKEN_FLD_SOURCE; 3125 else if (tok->type == CPP_PRAGMA) 3126 return CPP_TOKEN_FLD_PRAGMA; 3127 /* else fall through */ 3128 default: 3129 return CPP_TOKEN_FLD_NONE; 3130 } 3131 } 3132 3133 /* All tokens lexed in R after calling this function will be forced to have 3134 their source_location the same as the location referenced by P, until 3135 cpp_stop_forcing_token_locations is called for R. */ 3136 3137 void 3138 cpp_force_token_locations (cpp_reader *r, source_location *p) 3139 { 3140 r->forced_token_location_p = p; 3141 } 3142 3143 /* Go back to assigning locations naturally for lexed tokens. */ 3144 3145 void 3146 cpp_stop_forcing_token_locations (cpp_reader *r) 3147 { 3148 r->forced_token_location_p = NULL; 3149 } 3150