/* CPP Library - lexical analysis.
   Copyright (C) 2000-2020 Free Software Foundation, Inc.
   Contributed by Per Bothner, 1994-95.
   Based on CCCP program by Paul Rubin, June 1986
   Adapted to ANSI C, Richard Stallman, Jan 1987
   Broken out to separate file, Zack Weinberg, Mar 2000

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "cpplib.h"
#include "internal.h"

/* Spelling category recorded for each token type in the
   token_spellings table below.  */
enum spell_type
{
  SPELL_OPERATOR = 0,
  SPELL_IDENT,
  SPELL_LITERAL,
  SPELL_NONE
};

/* One entry of the token_spellings table: a spelling category and an
   associated name string.  */
struct token_spelling
{
  enum spell_type category;
  const unsigned char *name;
};

/* Spellings of the digraph tokens.  */
static const unsigned char *const digraph_spellings[] =
{ UC"%:", UC"%:%:", UC"<:", UC":>", UC"<%", UC"%>" };

/* Expand TTYPE_TABLE (defined elsewhere) into the table.  OP entries
   are SPELL_OPERATOR and carry the string S; TK entries get the
   category SPELL_<S> and carry the stringified enumerator name E.  */
#define OP(e, s) { SPELL_OPERATOR, UC s  },
#define TK(e, s) { SPELL_ ## s,    UC #e },
static const struct token_spelling token_spellings[N_TTYPES] = { TTYPE_TABLE };
#undef OP
#undef TK

/* Look up the table entry for TOKEN by its type field.  */
#define TOKEN_SPELL(token) (token_spellings[(token)->type].category)
#define TOKEN_NAME(token) (token_spellings[(token)->type].name)

static void add_line_note (cpp_buffer *, const uchar *, unsigned int);
static int skip_line_comment (cpp_reader *);
static void skip_whitespace (cpp_reader *, cppchar_t);
static void lex_string (cpp_reader *, cpp_token *, const uchar *);
static void save_comment (cpp_reader *,
cpp_token *, const uchar *, cppchar_t); 58 static void store_comment (cpp_reader *, cpp_token *); 59 static void create_literal (cpp_reader *, cpp_token *, const uchar *, 60 unsigned int, enum cpp_ttype); 61 static bool warn_in_comment (cpp_reader *, _cpp_line_note *); 62 static int name_p (cpp_reader *, const cpp_string *); 63 static tokenrun *next_tokenrun (tokenrun *); 64 65 static _cpp_buff *new_buff (size_t); 66 67 68 /* Utility routine: 69 70 Compares, the token TOKEN to the NUL-terminated string STRING. 71 TOKEN must be a CPP_NAME. Returns 1 for equal, 0 for unequal. */ 72 int 73 cpp_ideq (const cpp_token *token, const char *string) 74 { 75 if (token->type != CPP_NAME) 76 return 0; 77 78 return !ustrcmp (NODE_NAME (token->val.node.node), (const uchar *) string); 79 } 80 81 /* Record a note TYPE at byte POS into the current cleaned logical 82 line. */ 83 static void 84 add_line_note (cpp_buffer *buffer, const uchar *pos, unsigned int type) 85 { 86 if (buffer->notes_used == buffer->notes_cap) 87 { 88 buffer->notes_cap = buffer->notes_cap * 2 + 200; 89 buffer->notes = XRESIZEVEC (_cpp_line_note, buffer->notes, 90 buffer->notes_cap); 91 } 92 93 buffer->notes[buffer->notes_used].pos = pos; 94 buffer->notes[buffer->notes_used].type = type; 95 buffer->notes_used++; 96 } 97 98 99 /* Fast path to find line special characters using optimized character 100 scanning algorithms. Anything complicated falls back to the slow 101 path below. Since this loop is very hot it's worth doing these kinds 102 of optimizations. 103 104 One of the paths through the ifdefs should provide 105 106 const uchar *search_line_fast (const uchar *s, const uchar *end); 107 108 Between S and END, search for \n, \r, \\, ?. Return a pointer to 109 the found character. 110 111 Note that the last character of the buffer is *always* a newline, 112 as forced by _cpp_convert_input. This fact can be used to avoid 113 explicitly looking for the end of the buffer. 
*/ 114 115 /* Configure gives us an ifdef test. */ 116 #ifndef WORDS_BIGENDIAN 117 #define WORDS_BIGENDIAN 0 118 #endif 119 120 /* We'd like the largest integer that fits into a register. There's nothing 121 in <stdint.h> that gives us that. For most hosts this is unsigned long, 122 but MS decided on an LLP64 model. Thankfully when building with GCC we 123 can get the "real" word size. */ 124 #ifdef __GNUC__ 125 typedef unsigned int word_type __attribute__((__mode__(__word__))); 126 #else 127 typedef unsigned long word_type; 128 #endif 129 130 /* The code below is only expecting sizes 4 or 8. 131 Die at compile-time if this expectation is violated. */ 132 typedef char check_word_type_size 133 [(sizeof(word_type) == 8 || sizeof(word_type) == 4) * 2 - 1]; 134 135 /* Return X with the first N bytes forced to values that won't match one 136 of the interesting characters. Note that NUL is not interesting. */ 137 138 static inline word_type 139 acc_char_mask_misalign (word_type val, unsigned int n) 140 { 141 word_type mask = -1; 142 if (WORDS_BIGENDIAN) 143 mask >>= n * 8; 144 else 145 mask <<= n * 8; 146 return val & mask; 147 } 148 149 /* Return X replicated to all byte positions within WORD_TYPE. */ 150 151 static inline word_type 152 acc_char_replicate (uchar x) 153 { 154 word_type ret; 155 156 ret = (x << 24) | (x << 16) | (x << 8) | x; 157 if (sizeof(word_type) == 8) 158 ret = (ret << 16 << 16) | ret; 159 return ret; 160 } 161 162 /* Return non-zero if some byte of VAL is (probably) C. */ 163 164 static inline word_type 165 acc_char_cmp (word_type val, word_type c) 166 { 167 #if defined(__GNUC__) && defined(__alpha__) 168 /* We can get exact results using a compare-bytes instruction. 169 Get (val == c) via (0 >= (val ^ c)). 
*/ 170 return __builtin_alpha_cmpbge (0, val ^ c); 171 #else 172 word_type magic = 0x7efefefeU; 173 if (sizeof(word_type) == 8) 174 magic = (magic << 16 << 16) | 0xfefefefeU; 175 magic |= 1; 176 177 val ^= c; 178 return ((val + magic) ^ ~val) & ~magic; 179 #endif 180 } 181 182 /* Given the result of acc_char_cmp is non-zero, return the index of 183 the found character. If this was a false positive, return -1. */ 184 185 static inline int 186 acc_char_index (word_type cmp ATTRIBUTE_UNUSED, 187 word_type val ATTRIBUTE_UNUSED) 188 { 189 #if defined(__GNUC__) && defined(__alpha__) && !WORDS_BIGENDIAN 190 /* The cmpbge instruction sets *bits* of the result corresponding to 191 matches in the bytes with no false positives. */ 192 return __builtin_ctzl (cmp); 193 #else 194 unsigned int i; 195 196 /* ??? It would be nice to force unrolling here, 197 and have all of these constants folded. */ 198 for (i = 0; i < sizeof(word_type); ++i) 199 { 200 uchar c; 201 if (WORDS_BIGENDIAN) 202 c = (val >> (sizeof(word_type) - i - 1) * 8) & 0xff; 203 else 204 c = (val >> i * 8) & 0xff; 205 206 if (c == '\n' || c == '\r' || c == '\\' || c == '?') 207 return i; 208 } 209 210 return -1; 211 #endif 212 } 213 214 /* A version of the fast scanner using bit fiddling techniques. 215 216 For 32-bit words, one would normally perform 16 comparisons and 217 16 branches. With this algorithm one performs 24 arithmetic 218 operations and one branch. Whether this is faster with a 32-bit 219 word size is going to be somewhat system dependent. 220 221 For 64-bit words, we eliminate twice the number of comparisons 222 and branches without increasing the number of arithmetic operations. 223 It's almost certainly going to be a win with 64-bit word size. 
*/ 224 225 static const uchar * search_line_acc_char (const uchar *, const uchar *) 226 ATTRIBUTE_UNUSED; 227 228 static const uchar * 229 search_line_acc_char (const uchar *s, const uchar *end ATTRIBUTE_UNUSED) 230 { 231 const word_type repl_nl = acc_char_replicate ('\n'); 232 const word_type repl_cr = acc_char_replicate ('\r'); 233 const word_type repl_bs = acc_char_replicate ('\\'); 234 const word_type repl_qm = acc_char_replicate ('?'); 235 236 unsigned int misalign; 237 const word_type *p; 238 word_type val, t; 239 240 /* Align the buffer. Mask out any bytes from before the beginning. */ 241 p = (word_type *)((uintptr_t)s & -sizeof(word_type)); 242 val = *p; 243 misalign = (uintptr_t)s & (sizeof(word_type) - 1); 244 if (misalign) 245 val = acc_char_mask_misalign (val, misalign); 246 247 /* Main loop. */ 248 while (1) 249 { 250 t = acc_char_cmp (val, repl_nl); 251 t |= acc_char_cmp (val, repl_cr); 252 t |= acc_char_cmp (val, repl_bs); 253 t |= acc_char_cmp (val, repl_qm); 254 255 if (__builtin_expect (t != 0, 0)) 256 { 257 int i = acc_char_index (t, val); 258 if (i >= 0) 259 return (const uchar *)p + i; 260 } 261 262 val = *++p; 263 } 264 } 265 266 /* Disable on Solaris 2/x86 until the following problem can be properly 267 autoconfed: 268 269 The Solaris 10+ assembler tags objects with the instruction set 270 extensions used, so SSE4.2 executables cannot run on machines that 271 don't support that extension. */ 272 273 #if (GCC_VERSION >= 4005) && (__GNUC__ >= 5 || !defined(__PIC__)) && (defined(__i386__) || defined(__x86_64__)) && !(defined(__sun__) && defined(__svr4__)) 274 275 /* Replicated character data to be shared between implementations. 276 Recall that outside of a context with vector support we can't 277 define compatible vector types, therefore these are all defined 278 in terms of raw characters. 
*/
/* Row 0 is '\n', row 1 '\r', row 2 '\\' and row 3 '?', each repeated
   16 times.  */
static const char repl_chars[4][16] __attribute__((aligned(16))) = {
  { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' },
  { '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r' },
  { '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\' },
  { '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?' },
};

/* A version of the fast scanner using MMX vectorized byte compare insns.

   This uses the PMOVMSKB instruction which was introduced with "MMX2",
   which was packaged into SSE1; it is also present in the AMD MMX
   extension.  Mark the function as using "sse" so that we emit a real
   "emms" instruction, rather than the 3dNOW "femms" instruction.

   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
   is unused.  */

static const uchar *
#ifndef __SSE__
__attribute__((__target__("sse")))
#endif
search_line_mmx (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v8qi __attribute__ ((__vector_size__ (8)));
  typedef int __m64 __attribute__ ((__vector_size__ (8), __may_alias__));

  /* Only the first 8 bytes of each repl_chars row are used here.  */
  const v8qi repl_nl = *(const v8qi *)repl_chars[0];
  const v8qi repl_cr = *(const v8qi *)repl_chars[1];
  const v8qi repl_bs = *(const v8qi *)repl_chars[2];
  const v8qi repl_qm = *(const v8qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v8qi *p;
  v8qi data, t, c;

  /* Align the source pointer.  While MMX doesn't generate unaligned data
     faults, this allows us to safely scan to the end of the buffer without
     reading beyond the end of the last page.  */
  misalign = (uintptr_t)s & 7;
  p = (const v8qi *)((uintptr_t)s & -8);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     8-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 8 bytes at a time.  The first block is already
     loaded, so enter the loop body past the load.  */
  goto start;
  do
    {
      data = *++p;
      /* Every byte of a later block is valid.  */
      mask = -1;

    start:
      t = __builtin_ia32_pcmpeqb(data, repl_nl);
      c = __builtin_ia32_pcmpeqb(data, repl_cr);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_bs);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      c = __builtin_ia32_pcmpeqb(data, repl_qm);
      t = (v8qi) __builtin_ia32_por ((__m64)t, (__m64)c);
      found = __builtin_ia32_pmovmskb (t);
      found &= mask;
    }
  while (!found);

  __builtin_ia32_emms ();

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}

/* A version of the fast scanner using SSE2 vectorized byte compare insns.
   Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
   is unused.  */

static const uchar *
#ifndef __SSE2__
__attribute__((__target__("sse2")))
#endif
search_line_sse2 (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));

  const v16qi repl_nl = *(const v16qi *)repl_chars[0];
  const v16qi repl_cr = *(const v16qi *)repl_chars[1];
  const v16qi repl_bs = *(const v16qi *)repl_chars[2];
  const v16qi repl_qm = *(const v16qi *)repl_chars[3];

  unsigned int misalign, found, mask;
  const v16qi *p;
  v16qi data, t;

  /* Align the source pointer.  */
  misalign = (uintptr_t)s & 15;
  p = (const v16qi *)((uintptr_t)s & -16);
  data = *p;

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = -1u << misalign;

  /* Main loop processing 16 bytes at a time.  The first block is
     already loaded, so enter the loop body past the load.  */
  goto start;
  do
    {
      data = *++p;
      /* Every byte of a later block is valid.  */
      mask = -1;

    start:
      t  = __builtin_ia32_pcmpeqb128(data, repl_nl);
      t |= __builtin_ia32_pcmpeqb128(data, repl_cr);
      t |= __builtin_ia32_pcmpeqb128(data, repl_bs);
      t |= __builtin_ia32_pcmpeqb128(data, repl_qm);
      found = __builtin_ia32_pmovmskb128 (t);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz(found);
  return (const uchar *)p + found;
}

#ifdef HAVE_SSE4
/* A version of the fast scanner using SSE 4.2 vectorized string insns.
   Returns a pointer to the first \n, \r, ? or \\ at or after S.  END is
   only used to decide whether an unaligned 16-byte read at S is safe.  */

static const uchar *
#ifndef __SSE4_2__
__attribute__((__target__("sse4.2")))
#endif
search_line_sse42 (const uchar *s, const uchar *end)
{
  typedef char v16qi __attribute__ ((__vector_size__ (16)));
  /* The four characters searched for; the remaining elements are zero.  */
  static const v16qi search = { '\n', '\r', '?', '\\' };

  uintptr_t si = (uintptr_t)s;
  uintptr_t index;

  /* Check for unaligned input.  */
  if (si & 15)
    {
      v16qi sv;

      if (__builtin_expect (end - s < 16, 0)
          && __builtin_expect ((si & 0xfff) > 0xff0, 0))
        {
          /* There are less than 16 bytes left in the buffer, and less
             than 16 bytes left on the page.  Reading 16 bytes at this
             point might generate a spurious page fault.  Defer to the
             SSE2 implementation, which already handles alignment.  */
          return search_line_sse2 (s, end);
        }

      /* ??? The builtin doesn't understand that the PCMPESTRI read from
         memory need not be aligned.  */
      sv = __builtin_ia32_loaddqu ((const char *) s);
      index = __builtin_ia32_pcmpestri128 (search, 4, sv, 16, 0);

      /* An index below 16 means a match within these 16 bytes.  */
      if (__builtin_expect (index < 16, 0))
        goto found;

      /* Advance the pointer to an aligned address.  We will re-scan a
         few bytes, but we no longer need care for reading past the
         end of a page, since we're guaranteed a match.  */
      s = (const uchar *)((si + 15) & -16);
    }

  /* Main loop, processing 16 bytes at a time.  */
#ifdef __GCC_ASM_FLAG_OUTPUTS__
  while (1)
    {
      char f;

      /* By using inline assembly instead of the builtin,
         we can use the result, as well as the flags set.  */
      __asm ("%vpcmpestri\t$0, %2, %3"
             : "=c"(index), "=@ccc"(f)
             : "m"(*s), "x"(search), "a"(4), "d"(16));
      /* F receives the carry flag; stop as soon as it is set.  */
      if (f)
        break;

      s += 16;
    }
#else
  /* The asm loop advances S by 16 before each compare, so step back
     one block first.  */
  s -= 16;
  /* By doing the whole loop in inline assembly,
     we can make proper use of the flags set.  */
  __asm (      ".balign 16\n"
        "0: add $16, %1\n"
        " %vpcmpestri\t$0, (%1), %2\n"
        " jnc 0b"
        : "=&c"(index), "+r"(s)
        : "x"(search), "a"(4), "d"(16));
#endif

 found:
  return s + index;
}

#else
/* Work around out-dated assemblers without sse4 support.  */
#define search_line_sse42 search_line_sse2
#endif

/* Check the CPU capabilities.
*/ 491 492 #include "../gcc/config/i386/cpuid.h" 493 494 typedef const uchar * (*search_line_fast_type) (const uchar *, const uchar *); 495 static search_line_fast_type search_line_fast; 496 497 #define HAVE_init_vectorized_lexer 1 498 static inline void 499 init_vectorized_lexer (void) 500 { 501 unsigned dummy, ecx = 0, edx = 0; 502 search_line_fast_type impl = search_line_acc_char; 503 int minimum = 0; 504 505 #if defined(__SSE4_2__) 506 minimum = 3; 507 #elif defined(__SSE2__) 508 minimum = 2; 509 #elif defined(__SSE__) 510 minimum = 1; 511 #endif 512 513 if (minimum == 3) 514 impl = search_line_sse42; 515 else if (__get_cpuid (1, &dummy, &dummy, &ecx, &edx) || minimum == 2) 516 { 517 if (minimum == 3 || (ecx & bit_SSE4_2)) 518 impl = search_line_sse42; 519 else if (minimum == 2 || (edx & bit_SSE2)) 520 impl = search_line_sse2; 521 else if (minimum == 1 || (edx & bit_SSE)) 522 impl = search_line_mmx; 523 } 524 else if (__get_cpuid (0x80000001, &dummy, &dummy, &dummy, &edx)) 525 { 526 if (minimum == 1 527 || (edx & (bit_MMXEXT | bit_CMOV)) == (bit_MMXEXT | bit_CMOV)) 528 impl = search_line_mmx; 529 } 530 531 search_line_fast = impl; 532 } 533 534 #elif (GCC_VERSION >= 4005) && defined(_ARCH_PWR8) && defined(__ALTIVEC__) 535 536 /* A vection of the fast scanner using AltiVec vectorized byte compares 537 and VSX unaligned loads (when VSX is available). This is otherwise 538 the same as the AltiVec version. 
*/

/* Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
   is unused.  */
ATTRIBUTE_NO_SANITIZE_UNDEFINED
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc zero = { 0 };

  vc data, t;

  /* Main loop processing 16 bytes at a time.  The load is issued at S
     itself; S is not aligned first.  */
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      data = __builtin_vec_vsx_ld (0, s);
      s += 16;

      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  /* Restore S to point to the 16 bytes we just processed.  */
  s -= 16;

  {
    /* Number of unsigned longs that make up one vector.  */
#define N  (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero.  S is advanced past
       each all-zero word that is skipped.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHRU */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  */
#ifdef __BIG_ENDIAN__
    l = __builtin_clzl(l) >> 3;
#else
    l = __builtin_ctzl(l) >> 3;
#endif
    return s + l;

#undef N
  }
}

#elif (GCC_VERSION >= 4005) && defined(__ALTIVEC__) && defined (__BIG_ENDIAN__)

/* A version of the fast scanner using AltiVec vectorized byte compares.
   This cannot be used for little endian because vec_lvsl/lvsr are
   deprecated for little endian and the code won't work properly.  */
/* ??? Unfortunately, attribute(target("altivec")) is not yet supported,
   so we can't compile this function without -maltivec on the command line
   (or implied by some other switch).
*/

/* Returns a pointer to the first \n, \r, \\ or ? at or after S.  END
   is unused.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  typedef __attribute__((altivec(vector))) unsigned char vc;

  const vc repl_nl = {
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
    '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n'
  };
  const vc repl_cr = {
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r',
    '\r', '\r', '\r', '\r', '\r', '\r', '\r', '\r'
  };
  const vc repl_bs = {
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\',
    '\\', '\\', '\\', '\\', '\\', '\\', '\\', '\\'
  };
  const vc repl_qm = {
    '?', '?', '?', '?', '?', '?', '?', '?',
    '?', '?', '?', '?', '?', '?', '?', '?',
  };
  const vc ones = {
    -1, -1, -1, -1, -1, -1, -1, -1,
    -1, -1, -1, -1, -1, -1, -1, -1,
  };
  const vc zero = { 0 };

  vc data, mask, t;

  /* Altivec loads automatically mask addresses with -16.  This lets us
     issue the first load as early as possible.  */
  data = __builtin_vec_ld(0, (const vc *)s);

  /* Discard bytes before the beginning of the buffer.  Do this by
     beginning with all ones and shifting in zeros according to the
     mis-alignment.  The LVSR instruction pulls the exact shift we
     want from the address.  */
  mask = __builtin_vec_lvsr(0, s);
  mask = __builtin_vec_perm(zero, ones, mask);
  data &= mask;

  /* While altivec loads mask addresses, we still need to align S so
     that the offset we compute at the end is correct.  */
  s = (const uchar *)((uintptr_t)s & -16);

  /* Main loop processing 16 bytes at a time.  The first block is
     already loaded and masked, so enter the loop body past the load.  */
  goto start;
  do
    {
      vc m_nl, m_cr, m_bs, m_qm;

      s += 16;
      data = __builtin_vec_ld(0, (const vc *)s);

    start:
      m_nl = (vc) __builtin_vec_cmpeq(data, repl_nl);
      m_cr = (vc) __builtin_vec_cmpeq(data, repl_cr);
      m_bs = (vc) __builtin_vec_cmpeq(data, repl_bs);
      m_qm = (vc) __builtin_vec_cmpeq(data, repl_qm);
      t = (m_nl | m_cr) | (m_bs | m_qm);

      /* T now contains 0xff in bytes for which we matched one of the relevant
         characters.  We want to exit the loop if any byte in T is non-zero.
         Below is the expansion of vec_any_ne(t, zero).  */
    }
  while (!__builtin_vec_vcmpeq_p(/*__CR6_LT_REV*/3, t, zero));

  {
    /* Number of unsigned longs that make up one vector.  */
#define N  (sizeof(vc) / sizeof(long))

    union {
      vc v;
      /* Statically assert that N is 2 or 4.  */
      unsigned long l[(N == 2 || N == 4) ? N : -1];
    } u;
    unsigned long l, i = 0;

    u.v = t;

    /* Find the first word of T that is non-zero.  S is advanced past
       each all-zero word that is skipped.  */
    switch (N)
      {
      case 4:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        /* FALLTHROUGH */
      case 2:
        l = u.l[i++];
        if (l != 0)
          break;
        s += sizeof(unsigned long);
        l = u.l[i];
      }

    /* L now contains 0xff in bytes for which we matched one of the
       relevant characters.  We can find the byte index by finding
       its bit index and dividing by 8.  This variant is only built for
       big-endian, hence the count of leading zeros.  */
    l = __builtin_clzl(l) >> 3;
    return s + l;

#undef N
  }
}

#elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
#include "arm_neon.h"

/* This doesn't have to be the exact page size, but no system may use
   a size smaller than this.  ARMv8 requires a minimum page size of
   4k.  The impact of being conservative here is a small number of
   cases will take the slightly slower entry path into the main
   loop.
*/

#define AARCH64_MIN_PAGE_SIZE 4096

/* AArch64 version of the fast scanner.  Returns a pointer to the first
   \n, \r, \\ or ? at or after S.  END is unused.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* Gives each byte within an 8-byte half its own bit, so that summing
     the bytes of a half yields an 8-bit match mask.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  /* Shift counts that move one half's 8-bit mask up into bits 8..15.  */
#ifdef __ARM_BIG_ENDIAN
  const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
#else
  const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
#endif

  unsigned int found;
  const uint8_t *p;
  uint8x16_t data;
  uint8x16_t t;
  uint16x8_t m;
  uint8x16_t u, v, w;

  /* Align the source pointer.  */
  p = (const uint8_t *)((uintptr_t)s & -16);

  /* Assuming random string start positions, with a 4k page size we'll take
     the slow path about 0.37% of the time.  */
  if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
                         - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
                        < 16, 0))
    {
      /* Slow path: the string starts near a possible page boundary.
         Load from the aligned address P and ignore matches in the bytes
         that precede S.  */
      uint32_t misalign, mask;

      misalign = (uintptr_t)s & 15;
      mask = (-1u << misalign) & 0xffff;
      data = vld1q_u8 (p);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
      t = vandq_u8 (t, xmask);
      m = vpaddlq_u8 (t);
      m = vshlq_u16 (m, shift);
      found = vaddvq_u16 (m);
      found &= mask;
      if (found)
        return (const uchar*)p + __builtin_ctz (found);
    }
  else
    {
      /* Fast entry: load 16 bytes directly from S, aligned or not.  */
      data = vld1q_u8 ((const uint8_t *) s);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
      if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
        goto done;
    }

  /* Main loop: aligned 16-byte blocks, starting with the one after P.  */
  do
    {
      p += 16;
      data = vld1q_u8 (p);
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vorrq_u8 (v, w);
    } while (!vpaddd_u64 ((uint64x2_t)t));

done:
  /* Now that we've found the terminating substring, work out precisely where
     we need to stop.  */
  t = vandq_u8 (t, xmask);
  m = vpaddlq_u8 (t);
  m = vshlq_u16 (m, shift);
  found = vaddvq_u16 (m);
  /* If P was never advanced past S, the match came from the load issued
     at S itself, so the index is relative to S rather than P.  */
  return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
          + __builtin_ctz (found));
}

#elif defined (__ARM_NEON)
#include "arm_neon.h"

/* ARM NEON version of the fast scanner.  Returns a pointer to the first
   \n, \r, \\ or ? at or after S.  END is unused.  */
static const uchar *
search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
{
  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
  /* Gives each byte within an 8-byte half its own bit.  */
  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

  unsigned int misalign, found, mask;
  const uint8_t *p;
  uint8x16_t data;

  /* Align the source pointer.  */
  misalign = (uintptr_t)s & 15;
  p = (const uint8_t *)((uintptr_t)s & -16);
  data = vld1q_u8 (p);

  /* Create a mask for the bytes that are valid within the first
     16-byte block.  The Idea here is that the AND with the mask
     within the loop is "free", since we need some AND or TEST
     insn in order to set the flags for the branch anyway.  */
  mask = (-1u << misalign) & 0xffff;

  /* Main loop, processing 16 bytes at a time.  The first block is
     already loaded, so enter the loop body past the load.  */
  goto start;

  do
    {
      uint8x8_t l;
      uint16x4_t m;
      uint32x2_t n;
      uint8x16_t t, u, v, w;

      p += 16;
      data = vld1q_u8 (p);
      /* Every byte of a later block is valid.  */
      mask = 0xffff;

    start:
      t = vceqq_u8 (data, repl_nl);
      u = vceqq_u8 (data, repl_cr);
      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
      t = vandq_u8 (vorrq_u8 (v, w), xmask);
      /* Reduce the per-byte bits with pairwise additions.  */
      l = vpadd_u8 (vget_low_u8 (t), vget_high_u8 (t));
      m = vpaddl_u8 (l);
      n = vpaddl_u16 (m);

      found = vget_lane_u32 ((uint32x2_t) vorr_u64 ((uint64x1_t) n,
              vshr_n_u64 ((uint64x1_t) n, 24)), 0);
      found &= mask;
    }
  while (!found);

  /* FOUND contains 1 in bits for which we matched a relevant
     character.  Conversion to the byte index is trivial.  */
  found = __builtin_ctz (found);
  return (const uchar *)p + found;
}

#else

/* We only have one accelerated alternative.  Use a direct call so that
   we encourage inlining.  */

#define search_line_fast  search_line_acc_char

#endif

/* Initialize the lexer if needed.  This calls init_vectorized_lexer
   when HAVE_init_vectorized_lexer is defined and does nothing
   otherwise.  */

void
_cpp_init_lexer (void)
{
#ifdef HAVE_init_vectorized_lexer
  init_vectorized_lexer ();
#endif
}

/* Returns with a logical line that contains no escaped newlines or
   trigraphs.  This is a time-critical inner loop.

   The line starting at PFILE->buffer->next_line is cleaned in place.
   On return the cleaned line ends in '\n', the buffer's note list
   holds a note for each escaped newline and trigraph seen plus a
   final sentinel note, and next_line points past the line ending
   that was consumed.  */
void
_cpp_clean_line (cpp_reader *pfile)
{
  cpp_buffer *buffer;
  const uchar *s;   /* Read position.  */
  uchar c, *d, *p;  /* D is the write position, P a scratch pointer.  */

  buffer = pfile->buffer;
  buffer->cur_note = buffer->notes_used = 0;
  buffer->cur = buffer->line_base = buffer->next_line;
  buffer->need_line = false;
  s = buffer->next_line;

  if (!buffer->from_stage3)
    {
      /* Position of the most recent backslash seen on the fast path.  */
      const uchar *pbackslash = NULL;

      /* Fast path.  This is the common case of an un-escaped line with
         no trigraphs.  The primary win here is by not writing any
         data back to memory until we have to.  */
      while (1)
        {
          /* Perform an optimized search for \n, \r, \\, ?.  */
          s = search_line_fast (s, buffer->rlimit);

          c = *s;
          if (c == '\\')
            {
              /* Record the location of the backslash and continue.  */
              pbackslash = s++;
            }
          else if (__builtin_expect (c == '?', 0))
            {
              if (__builtin_expect (s[1] == '?', false)
                   && _cpp_trigraph_map[s[2]])
                {
                  /* Have a trigraph.  We may or may not have to convert
                     it.  Add a line note regardless, for -Wtrigraphs.  */
                  add_line_note (buffer, s, s[2]);
                  if (CPP_OPTION (pfile, trigraphs))
                    {
                      /* We do, and that means we have to switch to the
                         slow path.  */
                      d = (uchar *) s;
                      *d = _cpp_trigraph_map[s[2]];
                      s += 2;
                      goto slow_path;
                    }
                }
              /* Not a trigraph.  Continue on fast-path.  */
              s++;
            }
          else
            break;
        }

      /* This must be \r or \n.  We're either done, or we'll be forced
         to write back to the buffer and continue on the slow path.  */
      d = (uchar *) s;

      if (__builtin_expect (s == buffer->rlimit, false))
        goto done;

      /* DOS line ending? */
      if (__builtin_expect (c == '\r', false) && s[1] == '\n')
        {
          s++;
          if (s == buffer->rlimit)
            goto done;
        }

      if (__builtin_expect (pbackslash == NULL, true))
        goto done;

      /* Check for escaped newline: the last backslash seen must be
         separated from the line ending by nothing but non-vertical
         space.  */
      p = d;
      while (is_nvspace (p[-1]))
        p--;
      if (p - 1 != pbackslash)
        goto done;

      /* Have an escaped newline; process it and proceed to
         the slow path.  The note type is ' ' when space separated the
         backslash from the newline and '\\' otherwise.  */
      add_line_note (buffer, p - 1, p != d ? ' ' : '\\');
      d = p - 2;
      buffer->next_line = p - 1;

    slow_path:
      /* Slow path: copy byte by byte from S to D, dropping escaped
         newlines and, when enabled, replacing trigraphs.  */
      while (1)
        {
          c = *++s;
          *++d = c;

          if (c == '\n' || c == '\r')
            {
              /* Handle DOS line endings.  */
              if (c == '\r' && s != buffer->rlimit && s[1] == '\n')
                s++;
              if (s == buffer->rlimit)
                break;

              /* Escaped?  */
              p = d;
              while (p != buffer->next_line && is_nvspace (p[-1]))
                p--;
              if (p == buffer->next_line || p[-1] != '\\')
                break;

              add_line_note (buffer, p - 1, p != d ? ' ': '\\');
              d = p - 2;
              buffer->next_line = p - 1;
            }
          else if (c == '?' && s[1] == '?' && _cpp_trigraph_map[s[2]])
            {
              /* Add a note regardless, for the benefit of -Wtrigraphs.  */
              add_line_note (buffer, d, s[2]);
              if (CPP_OPTION (pfile, trigraphs))
                {
                  *d = _cpp_trigraph_map[s[2]];
                  s += 2;
                }
            }
        }
    }
  else
    {
      /* No escaped-newline or trigraph processing on this path: just
         find the end of the line.  */
      while (*s != '\n' && *s != '\r')
        s++;
      d = (uchar *) s;

      /* Handle DOS line endings.  */
      if (*s == '\r' && s != buffer->rlimit && s[1] == '\n')
        s++;
    }

 done:
  *d = '\n';
  /* A sentinel note that should never be processed.  */
  add_line_note (buffer, d + 1, '\n');
  buffer->next_line = s + 1;
}

/* Return true if the trigraph indicated by NOTE should be warned
   about in a comment.  NOTE must be followed by another note in the
   array, since note[1] is examined.  */
static bool
warn_in_comment (cpp_reader *pfile, _cpp_line_note *note)
{
  const uchar *p;

  /* Within comments we don't warn about trigraphs, unless the
     trigraph forms an escaped newline, as that may change
     behavior.  */
  if (note->type != '/')
    return false;

  /* If -trigraphs, then this was an escaped newline iff the next note
     is coincident.  */
  if (CPP_OPTION (pfile, trigraphs))
    return note[1].pos == note->pos;

  /* Otherwise, see if this forms an escaped newline.  Skip the three
     characters of the trigraph and any space that follows.  */
  p = note->pos + 3;
  while (is_nvspace (*p))
    p++;

  /* There might have been escaped newlines between the trigraph and the
     newline we found.  Hence the position test.  */
  return (*p == '\n' && p < note[1].pos);
}

/* Process the notes created by add_line_note as far as the current
   location.  IN_COMMENT is nonzero when the caller is inside a
   comment; it suppresses the backslash-space warning and restricts
   the trigraph warnings to those warn_in_comment accepts.  */
void
_cpp_process_line_notes (cpp_reader *pfile, int in_comment)
{
  cpp_buffer *buffer = pfile->buffer;

  for (;;)
    {
      _cpp_line_note *note = &buffer->notes[buffer->cur_note];
      unsigned int col;

      /* Stop at the first note that lies beyond the current position.  */
      if (note->pos > buffer->cur)
        break;

      buffer->cur_note++;
      col = CPP_BUF_COLUMN (buffer, note->pos + 1);

      if (note->type == '\\' || note->type == ' ')
        {
          /* An escaped newline; ' ' marks space between the backslash
             and the newline.  */
          if (note->type == ' ' && !in_comment)
            cpp_error_with_line (pfile, CPP_DL_WARNING, pfile->line_table->highest_line, col,
                                 "backslash and newline separated by space");

          if (buffer->next_line > buffer->rlimit)
            {
              cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, col,
                                   "backslash-newline at end of file");
              /* Prevent "no newline at end of file" warning.  */
              buffer->next_line = buffer->rlimit;
            }

          buffer->line_base = note->pos;
          CPP_INCREMENT_LINE (pfile, 0);
        }
      else if (_cpp_trigraph_map[note->type])
        {
          /* A trigraph note: the type is the trigraph's third
             character.  */
          if (CPP_OPTION (pfile, warn_trigraphs)
              && (!in_comment || warn_in_comment (pfile, note)))
            {
              if (CPP_OPTION (pfile, trigraphs))
                cpp_warning_with_line (pfile, CPP_W_TRIGRAPHS,
                                       pfile->line_table->highest_line, col,
                                       "trigraph ??%c converted to %c",
                                       note->type,
                                       (int) _cpp_trigraph_map[note->type]);
              else
                {
                  cpp_warning_with_line
                    (pfile, CPP_W_TRIGRAPHS,
                     pfile->line_table->highest_line, col,
                     "trigraph ??%c ignored, use -trigraphs to enable",
                     note->type);
                }
            }
        }
      else if (note->type == 0)
        /* Already processed in lex_raw_string.  */;
      else
        abort ();
    }
}

/* Skip a C-style block comment.  We find the end of the comment by
   seeing if an asterisk is before every '/' we encounter.  Returns
   nonzero if comment terminated by EOF, zero otherwise.

   Buffer->cur points to the initial asterisk of the comment.  */
bool
_cpp_skip_block_comment (cpp_reader *pfile)
{
  cpp_buffer *buffer = pfile->buffer;
  const uchar *cur = buffer->cur;
  uchar c;

  cur++;
  if (*cur == '/')
    cur++;

  for (;;)
    {
      /* People like decorating comments with '*', so check for '/'
         instead for efficiency.  */
      c = *cur++;

      if (c == '/')
        {
          if (cur[-2] == '*')
            break;

          /* Warn about potential nested comments, but not if the '/'
             comes immediately before the true comment delimiter.
             Don't bother to get it right across escaped newlines.
*/ 1197 if (CPP_OPTION (pfile, warn_comments) 1198 && cur[0] == '*' && cur[1] != '/') 1199 { 1200 buffer->cur = cur; 1201 cpp_warning_with_line (pfile, CPP_W_COMMENTS, 1202 pfile->line_table->highest_line, 1203 CPP_BUF_COL (buffer), 1204 "\"/*\" within comment"); 1205 } 1206 } 1207 else if (c == '\n') 1208 { 1209 unsigned int cols; 1210 buffer->cur = cur - 1; 1211 _cpp_process_line_notes (pfile, true); 1212 if (buffer->next_line >= buffer->rlimit) 1213 return true; 1214 _cpp_clean_line (pfile); 1215 1216 cols = buffer->next_line - buffer->line_base; 1217 CPP_INCREMENT_LINE (pfile, cols); 1218 1219 cur = buffer->cur; 1220 } 1221 } 1222 1223 buffer->cur = cur; 1224 _cpp_process_line_notes (pfile, true); 1225 return false; 1226 } 1227 1228 /* Skip a C++ line comment, leaving buffer->cur pointing to the 1229 terminating newline. Handles escaped newlines. Returns nonzero 1230 if a multiline comment. */ 1231 static int 1232 skip_line_comment (cpp_reader *pfile) 1233 { 1234 cpp_buffer *buffer = pfile->buffer; 1235 location_t orig_line = pfile->line_table->highest_line; 1236 1237 while (*buffer->cur != '\n') 1238 buffer->cur++; 1239 1240 _cpp_process_line_notes (pfile, true); 1241 return orig_line != pfile->line_table->highest_line; 1242 } 1243 1244 /* Skips whitespace, saving the next non-whitespace character. */ 1245 static void 1246 skip_whitespace (cpp_reader *pfile, cppchar_t c) 1247 { 1248 cpp_buffer *buffer = pfile->buffer; 1249 bool saw_NUL = false; 1250 1251 do 1252 { 1253 /* Horizontal space always OK. */ 1254 if (c == ' ' || c == '\t') 1255 ; 1256 /* Just \f \v or \0 left. */ 1257 else if (c == '\0') 1258 saw_NUL = true; 1259 else if (pfile->state.in_directive && CPP_PEDANTIC (pfile)) 1260 cpp_error_with_line (pfile, CPP_DL_PEDWARN, pfile->line_table->highest_line, 1261 CPP_BUF_COL (buffer), 1262 "%s in preprocessing directive", 1263 c == '\f' ? "form feed" : "vertical tab"); 1264 1265 c = *buffer->cur++; 1266 } 1267 /* We only want non-vertical space, i.e. 
' ' \t \f \v \0. */ 1268 while (is_nvspace (c)); 1269 1270 if (saw_NUL) 1271 cpp_error (pfile, CPP_DL_WARNING, "null character(s) ignored"); 1272 1273 buffer->cur--; 1274 } 1275 1276 /* See if the characters of a number token are valid in a name (no 1277 '.', '+' or '-'). */ 1278 static int 1279 name_p (cpp_reader *pfile, const cpp_string *string) 1280 { 1281 unsigned int i; 1282 1283 for (i = 0; i < string->len; i++) 1284 if (!is_idchar (string->text[i])) 1285 return 0; 1286 1287 return 1; 1288 } 1289 1290 /* After parsing an identifier or other sequence, produce a warning about 1291 sequences not in NFC/NFKC. */ 1292 static void 1293 warn_about_normalization (cpp_reader *pfile, 1294 const cpp_token *token, 1295 const struct normalize_state *s) 1296 { 1297 if (CPP_OPTION (pfile, warn_normalize) < NORMALIZE_STATE_RESULT (s) 1298 && !pfile->state.skipping) 1299 { 1300 /* Make sure that the token is printed using UCNs, even 1301 if we'd otherwise happily print UTF-8. */ 1302 unsigned char *buf = XNEWVEC (unsigned char, cpp_token_len (token)); 1303 size_t sz; 1304 1305 sz = cpp_spell_token (pfile, token, buf, false) - buf; 1306 if (NORMALIZE_STATE_RESULT (s) == normalized_C) 1307 cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0, 1308 "`%.*s' is not in NFKC", (int) sz, buf); 1309 else 1310 cpp_warning_with_line (pfile, CPP_W_NORMALIZE, token->src_loc, 0, 1311 "`%.*s' is not in NFC", (int) sz, buf); 1312 free (buf); 1313 } 1314 } 1315 1316 static const cppchar_t utf8_signifier = 0xC0; 1317 1318 /* Returns TRUE if the sequence starting at buffer->cur is valid in 1319 an identifier. FIRST is TRUE if this starts an identifier. 
*/ 1320 static bool 1321 forms_identifier_p (cpp_reader *pfile, int first, 1322 struct normalize_state *state) 1323 { 1324 cpp_buffer *buffer = pfile->buffer; 1325 1326 if (*buffer->cur == '$') 1327 { 1328 if (!CPP_OPTION (pfile, dollars_in_ident)) 1329 return false; 1330 1331 buffer->cur++; 1332 if (CPP_OPTION (pfile, warn_dollars) && !pfile->state.skipping) 1333 { 1334 CPP_OPTION (pfile, warn_dollars) = 0; 1335 cpp_error (pfile, CPP_DL_PEDWARN, "'$' in identifier or number"); 1336 } 1337 1338 return true; 1339 } 1340 1341 /* Is this a syntactically valid UCN or a valid UTF-8 char? */ 1342 if (CPP_OPTION (pfile, extended_identifiers)) 1343 { 1344 cppchar_t s; 1345 if (*buffer->cur >= utf8_signifier) 1346 { 1347 if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first, 1348 state, &s)) 1349 return true; 1350 } 1351 else if (*buffer->cur == '\\' 1352 && (buffer->cur[1] == 'u' || buffer->cur[1] == 'U')) 1353 { 1354 buffer->cur += 2; 1355 if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first, 1356 state, &s, NULL, NULL)) 1357 return true; 1358 buffer->cur -= 2; 1359 } 1360 } 1361 1362 return false; 1363 } 1364 1365 /* Helper function to issue error about improper __VA_OPT__ use. */ 1366 static void 1367 maybe_va_opt_error (cpp_reader *pfile) 1368 { 1369 if (CPP_PEDANTIC (pfile) && !CPP_OPTION (pfile, va_opt)) 1370 { 1371 /* __VA_OPT__ should not be accepted at all, but allow it in 1372 system headers. */ 1373 if (!cpp_in_system_header (pfile)) 1374 cpp_error (pfile, CPP_DL_PEDWARN, 1375 "__VA_OPT__ is not available until C++2a"); 1376 } 1377 else if (!pfile->state.va_args_ok) 1378 { 1379 /* __VA_OPT__ should only appear in the replacement list of a 1380 variadic macro. */ 1381 cpp_error (pfile, CPP_DL_PEDWARN, 1382 "__VA_OPT__ can only appear in the expansion" 1383 " of a C++2a variadic macro"); 1384 } 1385 } 1386 1387 /* Helper function to get the cpp_hashnode of the identifier BASE. 
*/
static cpp_hashnode *
lex_identifier_intern (cpp_reader *pfile, const uchar *base)
{
  cpp_hashnode *result;
  const uchar *cur;
  unsigned int len;
  unsigned int hash = HT_HASHSTEP (0, *base);

  /* Hash the identifier while finding where it ends.  The first
     character is taken on trust; the rest must satisfy ISIDNUM.  */
  for (cur = base + 1; ISIDNUM (*cur); cur++)
    hash = HT_HASHSTEP (hash, *cur);

  len = cur - base;
  hash = HT_HASHFINISH (hash, len);
  result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table,
                                              base, len, hash, HT_ALLOC));

  /* Rarely, identifiers require diagnostics when lexed.  */
  if (__builtin_expect (!(result->flags & NODE_DIAGNOSTIC)
                        || pfile->state.skipping, 1))
    return result;

  /* It is allowed to poison the same identifier twice.  */
  if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok)
    cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"",
               NODE_NAME (result));

  /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the
     replacement list of a variadic macro.  */
  if (result == pfile->spec_nodes.n__VA_ARGS__
      && !pfile->state.va_args_ok)
    {
      if (CPP_OPTION (pfile, cplusplus))
        cpp_error (pfile, CPP_DL_PEDWARN,
                   "__VA_ARGS__ can only appear in the expansion"
                   " of a C++11 variadic macro");
      else
        cpp_error (pfile, CPP_DL_PEDWARN,
                   "__VA_ARGS__ can only appear in the expansion"
                   " of a C99 variadic macro");
    }

  if (result == pfile->spec_nodes.n__VA_OPT__)
    maybe_va_opt_error (pfile);

  /* For -Wc++-compat, warn about use of C++ named operators.  */
  if (result->flags & NODE_WARN_OPERATOR)
    cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES,
                 "identifier \"%s\" is a special operator name in C++",
                 NODE_NAME (result));

  return result;
}

/* Get the cpp_hashnode of an identifier specified by NAME in
   the current cpp_reader object.  The lookup is made with HT_ALLOC
   and its result is used without a NULL check.
*/ 1446 cpp_hashnode * 1447 _cpp_lex_identifier (cpp_reader *pfile, const char *name) 1448 { 1449 cpp_hashnode *result; 1450 result = lex_identifier_intern (pfile, (uchar *) name); 1451 return result; 1452 } 1453 1454 /* Lex an identifier starting at BUFFER->CUR - 1. */ 1455 static cpp_hashnode * 1456 lex_identifier (cpp_reader *pfile, const uchar *base, bool starts_ucn, 1457 struct normalize_state *nst, cpp_hashnode **spelling) 1458 { 1459 cpp_hashnode *result; 1460 const uchar *cur; 1461 unsigned int len; 1462 unsigned int hash = HT_HASHSTEP (0, *base); 1463 1464 cur = pfile->buffer->cur; 1465 if (! starts_ucn) 1466 { 1467 while (ISIDNUM (*cur)) 1468 { 1469 hash = HT_HASHSTEP (hash, *cur); 1470 cur++; 1471 } 1472 NORMALIZE_STATE_UPDATE_IDNUM (nst, *(cur - 1)); 1473 } 1474 pfile->buffer->cur = cur; 1475 if (starts_ucn || forms_identifier_p (pfile, false, nst)) 1476 { 1477 /* Slower version for identifiers containing UCNs 1478 or extended chars (including $). */ 1479 do { 1480 while (ISIDNUM (*pfile->buffer->cur)) 1481 { 1482 NORMALIZE_STATE_UPDATE_IDNUM (nst, *pfile->buffer->cur); 1483 pfile->buffer->cur++; 1484 } 1485 } while (forms_identifier_p (pfile, false, nst)); 1486 result = _cpp_interpret_identifier (pfile, base, 1487 pfile->buffer->cur - base); 1488 *spelling = cpp_lookup (pfile, base, pfile->buffer->cur - base); 1489 } 1490 else 1491 { 1492 len = cur - base; 1493 hash = HT_HASHFINISH (hash, len); 1494 1495 result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table, 1496 base, len, hash, HT_ALLOC)); 1497 *spelling = result; 1498 } 1499 1500 /* Rarely, identifiers require diagnostics when lexed. */ 1501 if (__builtin_expect ((result->flags & NODE_DIAGNOSTIC) 1502 && !pfile->state.skipping, 0)) 1503 { 1504 /* It is allowed to poison the same identifier twice. 
*/ 1505 if ((result->flags & NODE_POISONED) && !pfile->state.poisoned_ok) 1506 cpp_error (pfile, CPP_DL_ERROR, "attempt to use poisoned \"%s\"", 1507 NODE_NAME (result)); 1508 1509 /* Constraint 6.10.3.5: __VA_ARGS__ should only appear in the 1510 replacement list of a variadic macro. */ 1511 if (result == pfile->spec_nodes.n__VA_ARGS__ 1512 && !pfile->state.va_args_ok) 1513 { 1514 if (CPP_OPTION (pfile, cplusplus)) 1515 cpp_error (pfile, CPP_DL_PEDWARN, 1516 "__VA_ARGS__ can only appear in the expansion" 1517 " of a C++11 variadic macro"); 1518 else 1519 cpp_error (pfile, CPP_DL_PEDWARN, 1520 "__VA_ARGS__ can only appear in the expansion" 1521 " of a C99 variadic macro"); 1522 } 1523 1524 /* __VA_OPT__ should only appear in the replacement list of a 1525 variadic macro. */ 1526 if (result == pfile->spec_nodes.n__VA_OPT__) 1527 maybe_va_opt_error (pfile); 1528 1529 /* For -Wc++-compat, warn about use of C++ named operators. */ 1530 if (result->flags & NODE_WARN_OPERATOR) 1531 cpp_warning (pfile, CPP_W_CXX_OPERATOR_NAMES, 1532 "identifier \"%s\" is a special operator name in C++", 1533 NODE_NAME (result)); 1534 } 1535 1536 return result; 1537 } 1538 1539 /* Lex a number to NUMBER starting at BUFFER->CUR - 1. */ 1540 static void 1541 lex_number (cpp_reader *pfile, cpp_string *number, 1542 struct normalize_state *nst) 1543 { 1544 const uchar *cur; 1545 const uchar *base; 1546 uchar *dest; 1547 1548 base = pfile->buffer->cur - 1; 1549 do 1550 { 1551 cur = pfile->buffer->cur; 1552 1553 /* N.B. ISIDNUM does not include $. */ 1554 while (ISIDNUM (*cur) || *cur == '.' || DIGIT_SEP (*cur) 1555 || VALID_SIGN (*cur, cur[-1])) 1556 { 1557 NORMALIZE_STATE_UPDATE_IDNUM (nst, *cur); 1558 cur++; 1559 } 1560 /* A number can't end with a digit separator. 
*/ 1561 while (cur > pfile->buffer->cur && DIGIT_SEP (cur[-1])) 1562 --cur; 1563 1564 pfile->buffer->cur = cur; 1565 } 1566 while (forms_identifier_p (pfile, false, nst)); 1567 1568 number->len = cur - base; 1569 dest = _cpp_unaligned_alloc (pfile, number->len + 1); 1570 memcpy (dest, base, number->len); 1571 dest[number->len] = '\0'; 1572 number->text = dest; 1573 } 1574 1575 /* Create a token of type TYPE with a literal spelling. */ 1576 static void 1577 create_literal (cpp_reader *pfile, cpp_token *token, const uchar *base, 1578 unsigned int len, enum cpp_ttype type) 1579 { 1580 uchar *dest = _cpp_unaligned_alloc (pfile, len + 1); 1581 1582 memcpy (dest, base, len); 1583 dest[len] = '\0'; 1584 token->type = type; 1585 token->val.str.len = len; 1586 token->val.str.text = dest; 1587 } 1588 1589 /* Subroutine of lex_raw_string: Append LEN chars from BASE to the buffer 1590 sequence from *FIRST_BUFF_P to LAST_BUFF_P. */ 1591 1592 static void 1593 bufring_append (cpp_reader *pfile, const uchar *base, size_t len, 1594 _cpp_buff **first_buff_p, _cpp_buff **last_buff_p) 1595 { 1596 _cpp_buff *first_buff = *first_buff_p; 1597 _cpp_buff *last_buff = *last_buff_p; 1598 1599 if (first_buff == NULL) 1600 first_buff = last_buff = _cpp_get_buff (pfile, len); 1601 else if (len > BUFF_ROOM (last_buff)) 1602 { 1603 size_t room = BUFF_ROOM (last_buff); 1604 memcpy (BUFF_FRONT (last_buff), base, room); 1605 BUFF_FRONT (last_buff) += room; 1606 base += room; 1607 len -= room; 1608 last_buff = _cpp_append_extend_buff (pfile, last_buff, len); 1609 } 1610 1611 memcpy (BUFF_FRONT (last_buff), base, len); 1612 BUFF_FRONT (last_buff) += len; 1613 1614 *first_buff_p = first_buff; 1615 *last_buff_p = last_buff; 1616 } 1617 1618 1619 /* Returns true if a macro has been defined. 1620 This might not work if compile with -save-temps, 1621 or preprocess separately from compilation. 
*/ 1622 1623 static bool 1624 is_macro(cpp_reader *pfile, const uchar *base) 1625 { 1626 const uchar *cur = base; 1627 if (! ISIDST (*cur)) 1628 return false; 1629 unsigned int hash = HT_HASHSTEP (0, *cur); 1630 ++cur; 1631 while (ISIDNUM (*cur)) 1632 { 1633 hash = HT_HASHSTEP (hash, *cur); 1634 ++cur; 1635 } 1636 hash = HT_HASHFINISH (hash, cur - base); 1637 1638 cpp_hashnode *result = CPP_HASHNODE (ht_lookup_with_hash (pfile->hash_table, 1639 base, cur - base, hash, HT_NO_INSERT)); 1640 1641 return result && cpp_macro_p (result); 1642 } 1643 1644 /* Returns true if a literal suffix does not have the expected form 1645 and is defined as a macro. */ 1646 1647 static bool 1648 is_macro_not_literal_suffix(cpp_reader *pfile, const uchar *base) 1649 { 1650 /* User-defined literals outside of namespace std must start with a single 1651 underscore, so assume anything of that form really is a UDL suffix. 1652 We don't need to worry about UDLs defined inside namespace std because 1653 their names are reserved, so cannot be used as macro names in valid 1654 programs. */ 1655 if (base[0] == '_' && base[1] != '_') 1656 return false; 1657 return is_macro (pfile, base); 1658 } 1659 1660 /* Lexes a raw string. The stored string contains the spelling, including 1661 double quotes, delimiter string, '(' and ')', any leading 1662 'L', 'u', 'U' or 'u8' and 'R' modifier. It returns the type of the 1663 literal, or CPP_OTHER if it was not properly terminated. 1664 1665 The spelling is NUL-terminated, but it is not guaranteed that this 1666 is the first NUL since embedded NULs are preserved. 
*/ 1667 1668 static void 1669 lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base, 1670 const uchar *cur) 1671 { 1672 uchar raw_prefix[17]; 1673 uchar temp_buffer[18]; 1674 const uchar *orig_base; 1675 unsigned int raw_prefix_len = 0, raw_suffix_len = 0; 1676 enum raw_str_phase { RAW_STR_PREFIX, RAW_STR, RAW_STR_SUFFIX }; 1677 raw_str_phase phase = RAW_STR_PREFIX; 1678 enum cpp_ttype type; 1679 size_t total_len = 0; 1680 /* Index into temp_buffer during phases other than RAW_STR, 1681 during RAW_STR phase 17 to tell BUF_APPEND that nothing should 1682 be appended to temp_buffer. */ 1683 size_t temp_buffer_len = 0; 1684 _cpp_buff *first_buff = NULL, *last_buff = NULL; 1685 size_t raw_prefix_start; 1686 _cpp_line_note *note = &pfile->buffer->notes[pfile->buffer->cur_note]; 1687 1688 type = (*base == 'L' ? CPP_WSTRING : 1689 *base == 'U' ? CPP_STRING32 : 1690 *base == 'u' ? (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16) 1691 : CPP_STRING); 1692 1693 #define BUF_APPEND(STR,LEN) \ 1694 do { \ 1695 bufring_append (pfile, (const uchar *)(STR), (LEN), \ 1696 &first_buff, &last_buff); \ 1697 total_len += (LEN); \ 1698 if (__builtin_expect (temp_buffer_len < 17, 0) \ 1699 && (const uchar *)(STR) != base \ 1700 && (LEN) <= 2) \ 1701 { \ 1702 memcpy (temp_buffer + temp_buffer_len, \ 1703 (const uchar *)(STR), (LEN)); \ 1704 temp_buffer_len += (LEN); \ 1705 } \ 1706 } while (0) 1707 1708 orig_base = base; 1709 ++cur; 1710 raw_prefix_start = cur - base; 1711 for (;;) 1712 { 1713 cppchar_t c; 1714 1715 /* If we previously performed any trigraph or line splicing 1716 transformations, undo them in between the opening and closing 1717 double quote. */ 1718 while (note->pos < cur) 1719 ++note; 1720 for (; note->pos == cur; ++note) 1721 { 1722 switch (note->type) 1723 { 1724 case '\\': 1725 case ' ': 1726 /* Restore backslash followed by newline. 
*/ 1727 BUF_APPEND (base, cur - base); 1728 base = cur; 1729 BUF_APPEND ("\\", 1); 1730 after_backslash: 1731 if (note->type == ' ') 1732 { 1733 /* GNU backslash whitespace newline extension. FIXME 1734 could be any sequence of non-vertical space. When we 1735 can properly restore any such sequence, we should mark 1736 this note as handled so _cpp_process_line_notes 1737 doesn't warn. */ 1738 BUF_APPEND (" ", 1); 1739 } 1740 1741 BUF_APPEND ("\n", 1); 1742 break; 1743 1744 case 0: 1745 /* Already handled. */ 1746 break; 1747 1748 default: 1749 if (_cpp_trigraph_map[note->type]) 1750 { 1751 /* Don't warn about this trigraph in 1752 _cpp_process_line_notes, since trigraphs show up as 1753 trigraphs in raw strings. */ 1754 uchar type = note->type; 1755 note->type = 0; 1756 1757 if (!CPP_OPTION (pfile, trigraphs)) 1758 /* If we didn't convert the trigraph in the first 1759 place, don't do anything now either. */ 1760 break; 1761 1762 BUF_APPEND (base, cur - base); 1763 base = cur; 1764 BUF_APPEND ("??", 2); 1765 1766 /* ??/ followed by newline gets two line notes, one for 1767 the trigraph and one for the backslash/newline. */ 1768 if (type == '/' && note[1].pos == cur) 1769 { 1770 if (note[1].type != '\\' 1771 && note[1].type != ' ') 1772 abort (); 1773 BUF_APPEND ("/", 1); 1774 ++note; 1775 goto after_backslash; 1776 } 1777 else 1778 { 1779 /* Skip the replacement character. 
*/ 1780 base = ++cur; 1781 BUF_APPEND (&type, 1); 1782 c = type; 1783 goto check_c; 1784 } 1785 } 1786 else 1787 abort (); 1788 break; 1789 } 1790 } 1791 c = *cur++; 1792 if (__builtin_expect (temp_buffer_len < 17, 0)) 1793 temp_buffer[temp_buffer_len++] = c; 1794 1795 check_c: 1796 if (phase == RAW_STR_PREFIX) 1797 { 1798 while (raw_prefix_len < temp_buffer_len) 1799 { 1800 raw_prefix[raw_prefix_len] = temp_buffer[raw_prefix_len]; 1801 switch (raw_prefix[raw_prefix_len]) 1802 { 1803 case ' ': case '(': case ')': case '\\': case '\t': 1804 case '\v': case '\f': case '\n': default: 1805 break; 1806 /* Basic source charset except the above chars. */ 1807 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 1808 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 1809 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 1810 case 's': case 't': case 'u': case 'v': case 'w': case 'x': 1811 case 'y': case 'z': 1812 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 1813 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L': 1814 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R': 1815 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X': 1816 case 'Y': case 'Z': 1817 case '0': case '1': case '2': case '3': case '4': case '5': 1818 case '6': case '7': case '8': case '9': 1819 case '_': case '{': case '}': case '#': case '[': case ']': 1820 case '<': case '>': case '%': case ':': case ';': case '.': 1821 case '?': case '*': case '+': case '-': case '/': case '^': 1822 case '&': case '|': case '~': case '!': case '=': case ',': 1823 case '"': case '\'': 1824 if (raw_prefix_len < 16) 1825 { 1826 raw_prefix_len++; 1827 continue; 1828 } 1829 break; 1830 } 1831 1832 if (raw_prefix[raw_prefix_len] != '(') 1833 { 1834 int col = CPP_BUF_COLUMN (pfile->buffer, cur) + 1; 1835 if (raw_prefix_len == 16) 1836 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 1837 col, "raw string delimiter longer " 1838 "than 16 characters"); 
1839 else if (raw_prefix[raw_prefix_len] == '\n') 1840 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 1841 col, "invalid new-line in raw " 1842 "string delimiter"); 1843 else 1844 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 1845 col, "invalid character '%c' in " 1846 "raw string delimiter", 1847 (int) raw_prefix[raw_prefix_len]); 1848 pfile->buffer->cur = orig_base + raw_prefix_start - 1; 1849 create_literal (pfile, token, orig_base, 1850 raw_prefix_start - 1, CPP_OTHER); 1851 if (first_buff) 1852 _cpp_release_buff (pfile, first_buff); 1853 return; 1854 } 1855 raw_prefix[raw_prefix_len] = '"'; 1856 phase = RAW_STR; 1857 /* Nothing should be appended to temp_buffer during 1858 RAW_STR phase. */ 1859 temp_buffer_len = 17; 1860 break; 1861 } 1862 continue; 1863 } 1864 else if (phase == RAW_STR_SUFFIX) 1865 { 1866 while (raw_suffix_len <= raw_prefix_len 1867 && raw_suffix_len < temp_buffer_len 1868 && temp_buffer[raw_suffix_len] == raw_prefix[raw_suffix_len]) 1869 raw_suffix_len++; 1870 if (raw_suffix_len > raw_prefix_len) 1871 break; 1872 if (raw_suffix_len == temp_buffer_len) 1873 continue; 1874 phase = RAW_STR; 1875 /* Nothing should be appended to temp_buffer during 1876 RAW_STR phase. 
*/ 1877 temp_buffer_len = 17; 1878 } 1879 if (c == ')') 1880 { 1881 phase = RAW_STR_SUFFIX; 1882 raw_suffix_len = 0; 1883 temp_buffer_len = 0; 1884 } 1885 else if (c == '\n') 1886 { 1887 if (pfile->state.in_directive 1888 || (pfile->state.parsing_args 1889 && pfile->buffer->next_line >= pfile->buffer->rlimit)) 1890 { 1891 cur--; 1892 type = CPP_OTHER; 1893 cpp_error_with_line (pfile, CPP_DL_ERROR, token->src_loc, 0, 1894 "unterminated raw string"); 1895 break; 1896 } 1897 1898 BUF_APPEND (base, cur - base); 1899 1900 if (pfile->buffer->cur < pfile->buffer->rlimit) 1901 CPP_INCREMENT_LINE (pfile, 0); 1902 pfile->buffer->need_line = true; 1903 1904 pfile->buffer->cur = cur-1; 1905 _cpp_process_line_notes (pfile, false); 1906 if (!_cpp_get_fresh_line (pfile)) 1907 { 1908 location_t src_loc = token->src_loc; 1909 token->type = CPP_EOF; 1910 /* Tell the compiler the line number of the EOF token. */ 1911 token->src_loc = pfile->line_table->highest_line; 1912 token->flags = BOL; 1913 if (first_buff != NULL) 1914 _cpp_release_buff (pfile, first_buff); 1915 cpp_error_with_line (pfile, CPP_DL_ERROR, src_loc, 0, 1916 "unterminated raw string"); 1917 return; 1918 } 1919 1920 cur = base = pfile->buffer->cur; 1921 note = &pfile->buffer->notes[pfile->buffer->cur_note]; 1922 } 1923 } 1924 1925 if (CPP_OPTION (pfile, user_literals)) 1926 { 1927 /* If a string format macro, say from inttypes.h, is placed touching 1928 a string literal it could be parsed as a C++11 user-defined string 1929 literal thus breaking the program. */ 1930 if (is_macro_not_literal_suffix (pfile, cur)) 1931 { 1932 /* Raise a warning, but do not consume subsequent tokens. */ 1933 if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping) 1934 cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX, 1935 token->src_loc, 0, 1936 "invalid suffix on literal; C++11 requires " 1937 "a space between literal and string macro"); 1938 } 1939 /* Grab user defined literal suffix. 
*/ 1940 else if (ISIDST (*cur)) 1941 { 1942 type = cpp_userdef_string_add_type (type); 1943 ++cur; 1944 1945 while (ISIDNUM (*cur)) 1946 ++cur; 1947 } 1948 } 1949 1950 pfile->buffer->cur = cur; 1951 if (first_buff == NULL) 1952 create_literal (pfile, token, base, cur - base, type); 1953 else 1954 { 1955 uchar *dest = _cpp_unaligned_alloc (pfile, total_len + (cur - base) + 1); 1956 1957 token->type = type; 1958 token->val.str.len = total_len + (cur - base); 1959 token->val.str.text = dest; 1960 last_buff = first_buff; 1961 while (last_buff != NULL) 1962 { 1963 memcpy (dest, last_buff->base, 1964 BUFF_FRONT (last_buff) - last_buff->base); 1965 dest += BUFF_FRONT (last_buff) - last_buff->base; 1966 last_buff = last_buff->next; 1967 } 1968 _cpp_release_buff (pfile, first_buff); 1969 memcpy (dest, base, cur - base); 1970 dest[cur - base] = '\0'; 1971 } 1972 } 1973 1974 /* Lexes a string, character constant, or angle-bracketed header file 1975 name. The stored string contains the spelling, including opening 1976 quote and any leading 'L', 'u', 'U' or 'u8' and optional 1977 'R' modifier. It returns the type of the literal, or CPP_OTHER 1978 if it was not properly terminated, or CPP_LESS for an unterminated 1979 header name which must be relexed as normal tokens. 1980 1981 The spelling is NUL-terminated, but it is not guaranteed that this 1982 is the first NUL since embedded NULs are preserved. 
*/
static void
lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
{
  bool saw_NUL = false;
  const uchar *cur = base;
  cppchar_t terminator;
  enum cpp_ttype type;

  /* Step over any encoding prefix to reach the opening delimiter.  */
  terminator = *cur++;
  if (terminator == 'L' || terminator == 'U')
    terminator = *cur++;
  else if (terminator == 'u')
    {
      terminator = *cur++;
      if (terminator == '8')
        terminator = *cur++;
    }

  /* Raw strings have a lexer of their own.  */
  if (terminator == 'R')
    {
      lex_raw_string (pfile, token, base, cur);
      return;
    }

  if (terminator == '"')
    {
      switch (*base)
        {
        case 'L':
          type = CPP_WSTRING;
          break;
        case 'U':
          type = CPP_STRING32;
          break;
        case 'u':
          type = (base[1] == '8' ? CPP_UTF8STRING : CPP_STRING16);
          break;
        default:
          type = CPP_STRING;
          break;
        }
    }
  else if (terminator == '\'')
    {
      switch (*base)
        {
        case 'L':
          type = CPP_WCHAR;
          break;
        case 'U':
          type = CPP_CHAR32;
          break;
        case 'u':
          type = (base[1] == '8' ? CPP_UTF8CHAR : CPP_CHAR16);
          break;
        default:
          type = CPP_CHAR;
          break;
        }
    }
  else
    {
      /* Anything else is taken to open a header name.  */
      terminator = '>';
      type = CPP_HEADER_NAME;
    }

  for (;;)
    {
      cppchar_t c = *cur++;

      /* In #include-style directives, terminators are not escapable.  */
      if (c == '\\' && !pfile->state.angled_headers && *cur != '\n')
        {
          cur++;
          continue;
        }

      if (c == terminator)
        break;

      if (c == '\n')
        {
          cur--;
          /* Unmatched quotes always yield undefined behavior, but
             greedy lexing means that what appears to be an unterminated
             header name may actually be a legitimate sequence of tokens.  */
          if (terminator == '>')
            {
              token->type = CPP_LESS;
              return;
            }
          type = CPP_OTHER;
          break;
        }

      if (c == '\0')
        saw_NUL = true;
    }

  if (saw_NUL && !pfile->state.skipping)
    cpp_error (pfile, CPP_DL_WARNING,
               "null character(s) preserved in literal");

  if (type == CPP_OTHER && CPP_OPTION (pfile, lang) != CLK_ASM)
    cpp_error (pfile, CPP_DL_PEDWARN, "missing terminating %c character",
               (int) terminator);

  if (CPP_OPTION (pfile, user_literals))
    {
      /* If a string format macro, say from inttypes.h, is placed touching
         a string literal it could be parsed as a C++11 user-defined string
         literal thus breaking the program.  */
      if (is_macro_not_literal_suffix (pfile, cur))
        {
          /* Raise a warning, but do not consume subsequent tokens.  */
          if (CPP_OPTION (pfile, warn_literal_suffix) && !pfile->state.skipping)
            cpp_warning_with_line (pfile, CPP_W_LITERAL_SUFFIX,
                                   token->src_loc, 0,
                                   "invalid suffix on literal; C++11 requires "
                                   "a space between literal and string macro");
        }
      /* Grab user defined literal suffix.  */
      else if (ISIDST (*cur))
        {
          type = cpp_userdef_char_add_type (type);
          type = cpp_userdef_string_add_type (type);
          ++cur;

          while (ISIDNUM (*cur))
            ++cur;
        }
    }
  else if (CPP_OPTION (pfile, cpp_warn_cxx11_compat)
           && is_macro (pfile, cur)
           && !pfile->state.skipping)
    cpp_warning_with_line (pfile, CPP_W_CXX11_COMPAT,
                           token->src_loc, 0, "C++11 requires a space "
                           "between string literal and macro");

  pfile->buffer->cur = cur;
  create_literal (pfile, token, base, cur - base, type);
}

/* Return the comment table.  The client may not make any assumption
   about the ordering of the table.
*/ 2092 cpp_comment_table * 2093 cpp_get_comments (cpp_reader *pfile) 2094 { 2095 return &pfile->comments; 2096 } 2097 2098 /* Append a comment to the end of the comment table. */ 2099 static void 2100 store_comment (cpp_reader *pfile, cpp_token *token) 2101 { 2102 int len; 2103 2104 if (pfile->comments.allocated == 0) 2105 { 2106 pfile->comments.allocated = 256; 2107 pfile->comments.entries = (cpp_comment *) xmalloc 2108 (pfile->comments.allocated * sizeof (cpp_comment)); 2109 } 2110 2111 if (pfile->comments.count == pfile->comments.allocated) 2112 { 2113 pfile->comments.allocated *= 2; 2114 pfile->comments.entries = (cpp_comment *) xrealloc 2115 (pfile->comments.entries, 2116 pfile->comments.allocated * sizeof (cpp_comment)); 2117 } 2118 2119 len = token->val.str.len; 2120 2121 /* Copy comment. Note, token may not be NULL terminated. */ 2122 pfile->comments.entries[pfile->comments.count].comment = 2123 (char *) xmalloc (sizeof (char) * (len + 1)); 2124 memcpy (pfile->comments.entries[pfile->comments.count].comment, 2125 token->val.str.text, len); 2126 pfile->comments.entries[pfile->comments.count].comment[len] = '\0'; 2127 2128 /* Set source location. */ 2129 pfile->comments.entries[pfile->comments.count].sloc = token->src_loc; 2130 2131 /* Increment the count of entries in the comment table. */ 2132 pfile->comments.count++; 2133 } 2134 2135 /* The stored comment includes the comment start and any terminator. */ 2136 static void 2137 save_comment (cpp_reader *pfile, cpp_token *token, const unsigned char *from, 2138 cppchar_t type) 2139 { 2140 unsigned char *buffer; 2141 unsigned int len, clen, i; 2142 int convert_to_c = (pfile->state.in_directive || pfile->state.parsing_args) 2143 && type == '/'; 2144 2145 len = pfile->buffer->cur - from + 1; /* + 1 for the initial '/'. */ 2146 2147 /* C++ comments probably (not definitely) have moved past a new 2148 line, which we don't want to save in the comment. 
*/ 2149 if (is_vspace (pfile->buffer->cur[-1])) 2150 len--; 2151 2152 /* If we are currently in a directive or in argument parsing, then 2153 we need to store all C++ comments as C comments internally, and 2154 so we need to allocate a little extra space in that case. 2155 2156 Note that the only time we encounter a directive here is 2157 when we are saving comments in a "#define". */ 2158 clen = convert_to_c ? len + 2 : len; 2159 2160 buffer = _cpp_unaligned_alloc (pfile, clen); 2161 2162 token->type = CPP_COMMENT; 2163 token->val.str.len = clen; 2164 token->val.str.text = buffer; 2165 2166 buffer[0] = '/'; 2167 memcpy (buffer + 1, from, len - 1); 2168 2169 /* Finish conversion to a C comment, if necessary. */ 2170 if (convert_to_c) 2171 { 2172 buffer[1] = '*'; 2173 buffer[clen - 2] = '*'; 2174 buffer[clen - 1] = '/'; 2175 /* As there can be in a C++ comments illegal sequences for C comments 2176 we need to filter them out. */ 2177 for (i = 2; i < (clen - 2); i++) 2178 if (buffer[i] == '/' && (buffer[i - 1] == '*' || buffer[i + 1] == '*')) 2179 buffer[i] = '|'; 2180 } 2181 2182 /* Finally store this comment for use by clients of libcpp. */ 2183 store_comment (pfile, token); 2184 } 2185 2186 /* Returns true if comment at COMMENT_START is a recognized FALLTHROUGH 2187 comment. */ 2188 2189 static bool 2190 fallthrough_comment_p (cpp_reader *pfile, const unsigned char *comment_start) 2191 { 2192 const unsigned char *from = comment_start + 1; 2193 2194 switch (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough)) 2195 { 2196 /* For both -Wimplicit-fallthrough=0 and -Wimplicit-fallthrough=5 we 2197 don't recognize any comments. The latter only checks attributes, 2198 the former doesn't warn. */ 2199 case 0: 2200 default: 2201 return false; 2202 /* -Wimplicit-fallthrough=1 considers any comment, no matter what 2203 content it has. 
*/ 2204 case 1: 2205 return true; 2206 case 2: 2207 /* -Wimplicit-fallthrough=2 looks for (case insensitive) 2208 .*falls?[ \t-]*thr(u|ough).* regex. */ 2209 for (; (size_t) (pfile->buffer->cur - from) >= sizeof "fallthru" - 1; 2210 from++) 2211 { 2212 /* Is there anything like strpbrk with upper boundary, or 2213 memchr looking for 2 characters rather than just one? */ 2214 if (from[0] != 'f' && from[0] != 'F') 2215 continue; 2216 if (from[1] != 'a' && from[1] != 'A') 2217 continue; 2218 if (from[2] != 'l' && from[2] != 'L') 2219 continue; 2220 if (from[3] != 'l' && from[3] != 'L') 2221 continue; 2222 from += sizeof "fall" - 1; 2223 if (from[0] == 's' || from[0] == 'S') 2224 from++; 2225 while (*from == ' ' || *from == '\t' || *from == '-') 2226 from++; 2227 if (from[0] != 't' && from[0] != 'T') 2228 continue; 2229 if (from[1] != 'h' && from[1] != 'H') 2230 continue; 2231 if (from[2] != 'r' && from[2] != 'R') 2232 continue; 2233 if (from[3] == 'u' || from[3] == 'U') 2234 return true; 2235 if (from[3] != 'o' && from[3] != 'O') 2236 continue; 2237 if (from[4] != 'u' && from[4] != 'U') 2238 continue; 2239 if (from[5] != 'g' && from[5] != 'G') 2240 continue; 2241 if (from[6] != 'h' && from[6] != 'H') 2242 continue; 2243 return true; 2244 } 2245 return false; 2246 case 3: 2247 case 4: 2248 break; 2249 } 2250 2251 /* Whole comment contents: 2252 -fallthrough 2253 @fallthrough@ 2254 */ 2255 if (*from == '-' || *from == '@') 2256 { 2257 size_t len = sizeof "fallthrough" - 1; 2258 if ((size_t) (pfile->buffer->cur - from - 1) < len) 2259 return false; 2260 if (memcmp (from + 1, "fallthrough", len)) 2261 return false; 2262 if (*from == '@') 2263 { 2264 if (from[len + 1] != '@') 2265 return false; 2266 len++; 2267 } 2268 from += 1 + len; 2269 } 2270 /* Whole comment contents (regex): 2271 lint -fallthrough[ \t]* 2272 */ 2273 else if (*from == 'l') 2274 { 2275 size_t len = sizeof "int -fallthrough" - 1; 2276 if ((size_t) (pfile->buffer->cur - from - 1) < len) 2277 return 
false; 2278 if (memcmp (from + 1, "int -fallthrough", len)) 2279 return false; 2280 from += 1 + len; 2281 while (*from == ' ' || *from == '\t') 2282 from++; 2283 } 2284 /* Whole comment contents (regex): 2285 [ \t]*FALLTHR(U|OUGH)[ \t]* 2286 */ 2287 else if (CPP_OPTION (pfile, cpp_warn_implicit_fallthrough) == 4) 2288 { 2289 while (*from == ' ' || *from == '\t') 2290 from++; 2291 if ((size_t) (pfile->buffer->cur - from) < sizeof "FALLTHRU" - 1) 2292 return false; 2293 if (memcmp (from, "FALLTHR", sizeof "FALLTHR" - 1)) 2294 return false; 2295 from += sizeof "FALLTHR" - 1; 2296 if (*from == 'U') 2297 from++; 2298 else if ((size_t) (pfile->buffer->cur - from) < sizeof "OUGH" - 1) 2299 return false; 2300 else if (memcmp (from, "OUGH", sizeof "OUGH" - 1)) 2301 return false; 2302 else 2303 from += sizeof "OUGH" - 1; 2304 while (*from == ' ' || *from == '\t') 2305 from++; 2306 } 2307 /* Whole comment contents (regex): 2308 [ \t.!]*(ELSE,? |INTENTIONAL(LY)? )?FALL(S | |-)?THR(OUGH|U)[ \t.!]*(-[^\n\r]*)? 2309 [ \t.!]*(Else,? |Intentional(ly)? )?Fall((s | |-)[Tt]|t)hr(ough|u)[ \t.!]*(-[^\n\r]*)? 2310 [ \t.!]*([Ee]lse,? |[Ii]ntentional(ly)? )?fall(s | |-)?thr(ough|u)[ \t.!]*(-[^\n\r]*)? 2311 */ 2312 else 2313 { 2314 while (*from == ' ' || *from == '\t' || *from == '.' 
|| *from == '!') 2315 from++; 2316 unsigned char f = *from; 2317 bool all_upper = false; 2318 if (f == 'E' || f == 'e') 2319 { 2320 if ((size_t) (pfile->buffer->cur - from) 2321 < sizeof "else fallthru" - 1) 2322 return false; 2323 if (f == 'E' && memcmp (from + 1, "LSE", sizeof "LSE" - 1) == 0) 2324 all_upper = true; 2325 else if (memcmp (from + 1, "lse", sizeof "lse" - 1)) 2326 return false; 2327 from += sizeof "else" - 1; 2328 if (*from == ',') 2329 from++; 2330 if (*from != ' ') 2331 return false; 2332 from++; 2333 if (all_upper && *from == 'f') 2334 return false; 2335 if (f == 'e' && *from == 'F') 2336 return false; 2337 f = *from; 2338 } 2339 else if (f == 'I' || f == 'i') 2340 { 2341 if ((size_t) (pfile->buffer->cur - from) 2342 < sizeof "intentional fallthru" - 1) 2343 return false; 2344 if (f == 'I' && memcmp (from + 1, "NTENTIONAL", 2345 sizeof "NTENTIONAL" - 1) == 0) 2346 all_upper = true; 2347 else if (memcmp (from + 1, "ntentional", 2348 sizeof "ntentional" - 1)) 2349 return false; 2350 from += sizeof "intentional" - 1; 2351 if (*from == ' ') 2352 { 2353 from++; 2354 if (all_upper && *from == 'f') 2355 return false; 2356 } 2357 else if (all_upper) 2358 { 2359 if (memcmp (from, "LY F", sizeof "LY F" - 1)) 2360 return false; 2361 from += sizeof "LY " - 1; 2362 } 2363 else 2364 { 2365 if (memcmp (from, "ly ", sizeof "ly " - 1)) 2366 return false; 2367 from += sizeof "ly " - 1; 2368 } 2369 if (f == 'i' && *from == 'F') 2370 return false; 2371 f = *from; 2372 } 2373 if (f != 'F' && f != 'f') 2374 return false; 2375 if ((size_t) (pfile->buffer->cur - from) < sizeof "fallthru" - 1) 2376 return false; 2377 if (f == 'F' && memcmp (from + 1, "ALL", sizeof "ALL" - 1) == 0) 2378 all_upper = true; 2379 else if (all_upper) 2380 return false; 2381 else if (memcmp (from + 1, "all", sizeof "all" - 1)) 2382 return false; 2383 from += sizeof "fall" - 1; 2384 if (*from == (all_upper ? 
'S' : 's') && from[1] == ' ') 2385 from += 2; 2386 else if (*from == ' ' || *from == '-') 2387 from++; 2388 else if (*from != (all_upper ? 'T' : 't')) 2389 return false; 2390 if ((f == 'f' || *from != 'T') && (all_upper || *from != 't')) 2391 return false; 2392 if ((size_t) (pfile->buffer->cur - from) < sizeof "thru" - 1) 2393 return false; 2394 if (memcmp (from + 1, all_upper ? "HRU" : "hru", sizeof "hru" - 1)) 2395 { 2396 if ((size_t) (pfile->buffer->cur - from) < sizeof "through" - 1) 2397 return false; 2398 if (memcmp (from + 1, all_upper ? "HROUGH" : "hrough", 2399 sizeof "hrough" - 1)) 2400 return false; 2401 from += sizeof "through" - 1; 2402 } 2403 else 2404 from += sizeof "thru" - 1; 2405 while (*from == ' ' || *from == '\t' || *from == '.' || *from == '!') 2406 from++; 2407 if (*from == '-') 2408 { 2409 from++; 2410 if (*comment_start == '*') 2411 { 2412 do 2413 { 2414 while (*from && *from != '*' 2415 && *from != '\n' && *from != '\r') 2416 from++; 2417 if (*from != '*' || from[1] == '/') 2418 break; 2419 from++; 2420 } 2421 while (1); 2422 } 2423 else 2424 while (*from && *from != '\n' && *from != '\r') 2425 from++; 2426 } 2427 } 2428 /* C block comment. */ 2429 if (*comment_start == '*') 2430 { 2431 if (*from != '*' || from[1] != '/') 2432 return false; 2433 } 2434 /* C++ line comment. */ 2435 else if (*from != '\n') 2436 return false; 2437 2438 return true; 2439 } 2440 2441 /* Allocate COUNT tokens for RUN. */ 2442 void 2443 _cpp_init_tokenrun (tokenrun *run, unsigned int count) 2444 { 2445 run->base = XNEWVEC (cpp_token, count); 2446 run->limit = run->base + count; 2447 run->next = NULL; 2448 } 2449 2450 /* Returns the next tokenrun, or creates one if there is none. 
*/ 2451 static tokenrun * 2452 next_tokenrun (tokenrun *run) 2453 { 2454 if (run->next == NULL) 2455 { 2456 run->next = XNEW (tokenrun); 2457 run->next->prev = run; 2458 _cpp_init_tokenrun (run->next, 250); 2459 } 2460 2461 return run->next; 2462 } 2463 2464 /* Return the number of not yet processed token in a given 2465 context. */ 2466 int 2467 _cpp_remaining_tokens_num_in_context (cpp_context *context) 2468 { 2469 if (context->tokens_kind == TOKENS_KIND_DIRECT) 2470 return (LAST (context).token - FIRST (context).token); 2471 else if (context->tokens_kind == TOKENS_KIND_INDIRECT 2472 || context->tokens_kind == TOKENS_KIND_EXTENDED) 2473 return (LAST (context).ptoken - FIRST (context).ptoken); 2474 else 2475 abort (); 2476 } 2477 2478 /* Returns the token present at index INDEX in a given context. If 2479 INDEX is zero, the next token to be processed is returned. */ 2480 static const cpp_token* 2481 _cpp_token_from_context_at (cpp_context *context, int index) 2482 { 2483 if (context->tokens_kind == TOKENS_KIND_DIRECT) 2484 return &(FIRST (context).token[index]); 2485 else if (context->tokens_kind == TOKENS_KIND_INDIRECT 2486 || context->tokens_kind == TOKENS_KIND_EXTENDED) 2487 return FIRST (context).ptoken[index]; 2488 else 2489 abort (); 2490 } 2491 2492 /* Look ahead in the input stream. */ 2493 const cpp_token * 2494 cpp_peek_token (cpp_reader *pfile, int index) 2495 { 2496 cpp_context *context = pfile->context; 2497 const cpp_token *peektok; 2498 int count; 2499 2500 /* First, scan through any pending cpp_context objects. */ 2501 while (context->prev) 2502 { 2503 ptrdiff_t sz = _cpp_remaining_tokens_num_in_context (context); 2504 2505 if (index < (int) sz) 2506 return _cpp_token_from_context_at (context, index); 2507 index -= (int) sz; 2508 context = context->prev; 2509 } 2510 2511 /* We will have to read some new tokens after all (and do so 2512 without invalidating preceding tokens). 
*/ 2513 count = index; 2514 pfile->keep_tokens++; 2515 2516 /* For peeked tokens temporarily disable line_change reporting, 2517 until the tokens are parsed for real. */ 2518 void (*line_change) (cpp_reader *, const cpp_token *, int) 2519 = pfile->cb.line_change; 2520 pfile->cb.line_change = NULL; 2521 2522 do 2523 { 2524 peektok = _cpp_lex_token (pfile); 2525 if (peektok->type == CPP_EOF) 2526 { 2527 index--; 2528 break; 2529 } 2530 } 2531 while (index--); 2532 2533 _cpp_backup_tokens_direct (pfile, count - index); 2534 pfile->keep_tokens--; 2535 pfile->cb.line_change = line_change; 2536 2537 return peektok; 2538 } 2539 2540 /* Allocate a single token that is invalidated at the same time as the 2541 rest of the tokens on the line. Has its line and col set to the 2542 same as the last lexed token, so that diagnostics appear in the 2543 right place. */ 2544 cpp_token * 2545 _cpp_temp_token (cpp_reader *pfile) 2546 { 2547 cpp_token *old, *result; 2548 ptrdiff_t sz = pfile->cur_run->limit - pfile->cur_token; 2549 ptrdiff_t la = (ptrdiff_t) pfile->lookaheads; 2550 2551 old = pfile->cur_token - 1; 2552 /* Any pre-existing lookaheads must not be clobbered. */ 2553 if (la) 2554 { 2555 if (sz <= la) 2556 { 2557 tokenrun *next = next_tokenrun (pfile->cur_run); 2558 2559 if (sz < la) 2560 memmove (next->base + 1, next->base, 2561 (la - sz) * sizeof (cpp_token)); 2562 2563 next->base[0] = pfile->cur_run->limit[-1]; 2564 } 2565 2566 if (sz > 1) 2567 memmove (pfile->cur_token + 1, pfile->cur_token, 2568 MIN (la, sz - 1) * sizeof (cpp_token)); 2569 } 2570 2571 if (!sz && pfile->cur_token == pfile->cur_run->limit) 2572 { 2573 pfile->cur_run = next_tokenrun (pfile->cur_run); 2574 pfile->cur_token = pfile->cur_run->base; 2575 } 2576 2577 result = pfile->cur_token++; 2578 result->src_loc = old->src_loc; 2579 return result; 2580 } 2581 2582 /* Lex a token into RESULT (external interface). 
Takes care of issues 2583 like directive handling, token lookahead, multiple include 2584 optimization and skipping. */ 2585 const cpp_token * 2586 _cpp_lex_token (cpp_reader *pfile) 2587 { 2588 cpp_token *result; 2589 2590 for (;;) 2591 { 2592 if (pfile->cur_token == pfile->cur_run->limit) 2593 { 2594 pfile->cur_run = next_tokenrun (pfile->cur_run); 2595 pfile->cur_token = pfile->cur_run->base; 2596 } 2597 /* We assume that the current token is somewhere in the current 2598 run. */ 2599 if (pfile->cur_token < pfile->cur_run->base 2600 || pfile->cur_token >= pfile->cur_run->limit) 2601 abort (); 2602 2603 if (pfile->lookaheads) 2604 { 2605 pfile->lookaheads--; 2606 result = pfile->cur_token++; 2607 } 2608 else 2609 result = _cpp_lex_direct (pfile); 2610 2611 if (result->flags & BOL) 2612 { 2613 /* Is this a directive. If _cpp_handle_directive returns 2614 false, it is an assembler #. */ 2615 if (result->type == CPP_HASH 2616 /* 6.10.3 p 11: Directives in a list of macro arguments 2617 gives undefined behavior. This implementation 2618 handles the directive as normal. */ 2619 && pfile->state.parsing_args != 1) 2620 { 2621 if (_cpp_handle_directive (pfile, result->flags & PREV_WHITE)) 2622 { 2623 if (pfile->directive_result.type == CPP_PADDING) 2624 continue; 2625 result = &pfile->directive_result; 2626 } 2627 } 2628 else if (pfile->state.in_deferred_pragma) 2629 result = &pfile->directive_result; 2630 2631 if (pfile->cb.line_change && !pfile->state.skipping) 2632 pfile->cb.line_change (pfile, result, pfile->state.parsing_args); 2633 } 2634 2635 /* We don't skip tokens in directives. */ 2636 if (pfile->state.in_directive || pfile->state.in_deferred_pragma) 2637 break; 2638 2639 /* Outside a directive, invalidate controlling macros. At file 2640 EOF, _cpp_lex_direct takes care of popping the buffer, so we never 2641 get here and MI optimization works. 
*/ 2642 pfile->mi_valid = false; 2643 2644 if (!pfile->state.skipping || result->type == CPP_EOF) 2645 break; 2646 } 2647 2648 return result; 2649 } 2650 2651 /* Returns true if a fresh line has been loaded. */ 2652 bool 2653 _cpp_get_fresh_line (cpp_reader *pfile) 2654 { 2655 int return_at_eof; 2656 2657 /* We can't get a new line until we leave the current directive. */ 2658 if (pfile->state.in_directive) 2659 return false; 2660 2661 for (;;) 2662 { 2663 cpp_buffer *buffer = pfile->buffer; 2664 2665 if (!buffer->need_line) 2666 return true; 2667 2668 if (buffer->next_line < buffer->rlimit) 2669 { 2670 _cpp_clean_line (pfile); 2671 return true; 2672 } 2673 2674 /* First, get out of parsing arguments state. */ 2675 if (pfile->state.parsing_args) 2676 return false; 2677 2678 /* End of buffer. Non-empty files should end in a newline. */ 2679 if (buffer->buf != buffer->rlimit 2680 && buffer->next_line > buffer->rlimit 2681 && !buffer->from_stage3) 2682 { 2683 /* Clip to buffer size. */ 2684 buffer->next_line = buffer->rlimit; 2685 } 2686 2687 return_at_eof = buffer->return_at_eof; 2688 _cpp_pop_buffer (pfile); 2689 if (pfile->buffer == NULL || return_at_eof) 2690 return false; 2691 } 2692 } 2693 2694 #define IF_NEXT_IS(CHAR, THEN_TYPE, ELSE_TYPE) \ 2695 do \ 2696 { \ 2697 result->type = ELSE_TYPE; \ 2698 if (*buffer->cur == CHAR) \ 2699 buffer->cur++, result->type = THEN_TYPE; \ 2700 } \ 2701 while (0) 2702 2703 /* Lex a token into pfile->cur_token, which is also incremented, to 2704 get diagnostics pointing to the correct location. 2705 2706 Does not handle issues such as token lookahead, multiple-include 2707 optimization, directives, skipping etc. This function is only 2708 suitable for use by _cpp_lex_token, and in special cases like 2709 lex_expansion_token which doesn't care for any of these issues. 2710 2711 When meeting a newline, returns CPP_EOF if parsing a directive, 2712 otherwise returns to the start of the token buffer if permissible. 
2713 Returns the location of the lexed token. */ 2714 cpp_token * 2715 _cpp_lex_direct (cpp_reader *pfile) 2716 { 2717 cppchar_t c; 2718 cpp_buffer *buffer; 2719 const unsigned char *comment_start; 2720 bool fallthrough_comment = false; 2721 cpp_token *result = pfile->cur_token++; 2722 2723 fresh_line: 2724 result->flags = 0; 2725 buffer = pfile->buffer; 2726 if (buffer->need_line) 2727 { 2728 if (pfile->state.in_deferred_pragma) 2729 { 2730 result->type = CPP_PRAGMA_EOL; 2731 pfile->state.in_deferred_pragma = false; 2732 if (!pfile->state.pragma_allow_expansion) 2733 pfile->state.prevent_expansion--; 2734 return result; 2735 } 2736 if (!_cpp_get_fresh_line (pfile)) 2737 { 2738 result->type = CPP_EOF; 2739 if (!pfile->state.in_directive) 2740 { 2741 /* Tell the compiler the line number of the EOF token. */ 2742 result->src_loc = pfile->line_table->highest_line; 2743 result->flags = BOL; 2744 } 2745 return result; 2746 } 2747 if (buffer != pfile->buffer) 2748 fallthrough_comment = false; 2749 if (!pfile->keep_tokens) 2750 { 2751 pfile->cur_run = &pfile->base_run; 2752 result = pfile->base_run.base; 2753 pfile->cur_token = result + 1; 2754 } 2755 result->flags = BOL; 2756 if (pfile->state.parsing_args == 2) 2757 result->flags |= PREV_WHITE; 2758 } 2759 buffer = pfile->buffer; 2760 update_tokens_line: 2761 result->src_loc = pfile->line_table->highest_line; 2762 2763 skipped_white: 2764 if (buffer->cur >= buffer->notes[buffer->cur_note].pos 2765 && !pfile->overlaid_buffer) 2766 { 2767 _cpp_process_line_notes (pfile, false); 2768 result->src_loc = pfile->line_table->highest_line; 2769 } 2770 c = *buffer->cur++; 2771 2772 if (pfile->forced_token_location) 2773 result->src_loc = pfile->forced_token_location; 2774 else 2775 result->src_loc = linemap_position_for_column (pfile->line_table, 2776 CPP_BUF_COLUMN (buffer, buffer->cur)); 2777 2778 switch (c) 2779 { 2780 case ' ': case '\t': case '\f': case '\v': case '\0': 2781 result->flags |= PREV_WHITE; 2782 skip_whitespace 
(pfile, c); 2783 goto skipped_white; 2784 2785 case '\n': 2786 /* Increment the line, unless this is the last line ... */ 2787 if (buffer->cur < buffer->rlimit 2788 /* ... or this is a #include, (where _cpp_stack_file needs to 2789 unwind by one line) ... */ 2790 || (pfile->state.in_directive > 1 2791 /* ... except traditional-cpp increments this elsewhere. */ 2792 && !CPP_OPTION (pfile, traditional))) 2793 CPP_INCREMENT_LINE (pfile, 0); 2794 buffer->need_line = true; 2795 goto fresh_line; 2796 2797 case '0': case '1': case '2': case '3': case '4': 2798 case '5': case '6': case '7': case '8': case '9': 2799 { 2800 struct normalize_state nst = INITIAL_NORMALIZE_STATE; 2801 result->type = CPP_NUMBER; 2802 lex_number (pfile, &result->val.str, &nst); 2803 warn_about_normalization (pfile, result, &nst); 2804 break; 2805 } 2806 2807 case 'L': 2808 case 'u': 2809 case 'U': 2810 case 'R': 2811 /* 'L', 'u', 'U', 'u8' or 'R' may introduce wide characters, 2812 wide strings or raw strings. */ 2813 if (c == 'L' || CPP_OPTION (pfile, rliterals) 2814 || (c != 'R' && CPP_OPTION (pfile, uliterals))) 2815 { 2816 if ((*buffer->cur == '\'' && c != 'R') 2817 || *buffer->cur == '"' 2818 || (*buffer->cur == 'R' 2819 && c != 'R' 2820 && buffer->cur[1] == '"' 2821 && CPP_OPTION (pfile, rliterals)) 2822 || (*buffer->cur == '8' 2823 && c == 'u' 2824 && ((buffer->cur[1] == '"' || (buffer->cur[1] == '\'' 2825 && CPP_OPTION (pfile, utf8_char_literals))) 2826 || (buffer->cur[1] == 'R' && buffer->cur[2] == '"' 2827 && CPP_OPTION (pfile, rliterals))))) 2828 { 2829 lex_string (pfile, result, buffer->cur - 1); 2830 break; 2831 } 2832 } 2833 /* Fall through. 
*/ 2834 2835 case '_': 2836 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': 2837 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l': 2838 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r': 2839 case 's': case 't': case 'v': case 'w': case 'x': 2840 case 'y': case 'z': 2841 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': 2842 case 'G': case 'H': case 'I': case 'J': case 'K': 2843 case 'M': case 'N': case 'O': case 'P': case 'Q': 2844 case 'S': case 'T': case 'V': case 'W': case 'X': 2845 case 'Y': case 'Z': 2846 result->type = CPP_NAME; 2847 { 2848 struct normalize_state nst = INITIAL_NORMALIZE_STATE; 2849 result->val.node.node = lex_identifier (pfile, buffer->cur - 1, false, 2850 &nst, 2851 &result->val.node.spelling); 2852 warn_about_normalization (pfile, result, &nst); 2853 } 2854 2855 /* Convert named operators to their proper types. */ 2856 if (result->val.node.node->flags & NODE_OPERATOR) 2857 { 2858 result->flags |= NAMED_OP; 2859 result->type = (enum cpp_ttype) result->val.node.node->directive_index; 2860 } 2861 2862 /* Signal FALLTHROUGH comment followed by another token. */ 2863 if (fallthrough_comment) 2864 result->flags |= PREV_FALLTHROUGH; 2865 break; 2866 2867 case '\'': 2868 case '"': 2869 lex_string (pfile, result, buffer->cur - 1); 2870 break; 2871 2872 case '/': 2873 /* A potential block or line comment. */ 2874 comment_start = buffer->cur; 2875 c = *buffer->cur; 2876 2877 if (c == '*') 2878 { 2879 if (_cpp_skip_block_comment (pfile)) 2880 cpp_error (pfile, CPP_DL_ERROR, "unterminated comment"); 2881 } 2882 else if (c == '/' && ! CPP_OPTION (pfile, traditional)) 2883 { 2884 /* Don't warn for system headers. */ 2885 if (cpp_in_system_header (pfile)) 2886 ; 2887 /* Warn about comments if pedantically GNUC89, and not 2888 in system headers. */ 2889 else if (CPP_OPTION (pfile, lang) == CLK_GNUC89 2890 && CPP_PEDANTIC (pfile) 2891 && ! 
buffer->warned_cplusplus_comments) 2892 { 2893 if (cpp_error (pfile, CPP_DL_PEDWARN, 2894 "C++ style comments are not allowed in ISO C90")) 2895 cpp_error (pfile, CPP_DL_NOTE, 2896 "(this will be reported only once per input file)"); 2897 buffer->warned_cplusplus_comments = 1; 2898 } 2899 /* Or if specifically desired via -Wc90-c99-compat. */ 2900 else if (CPP_OPTION (pfile, cpp_warn_c90_c99_compat) > 0 2901 && ! CPP_OPTION (pfile, cplusplus) 2902 && ! buffer->warned_cplusplus_comments) 2903 { 2904 if (cpp_error (pfile, CPP_DL_WARNING, 2905 "C++ style comments are incompatible with C90")) 2906 cpp_error (pfile, CPP_DL_NOTE, 2907 "(this will be reported only once per input file)"); 2908 buffer->warned_cplusplus_comments = 1; 2909 } 2910 /* In C89/C94, C++ style comments are forbidden. */ 2911 else if ((CPP_OPTION (pfile, lang) == CLK_STDC89 2912 || CPP_OPTION (pfile, lang) == CLK_STDC94)) 2913 { 2914 /* But don't be confused about valid code such as 2915 - // immediately followed by *, 2916 - // in a preprocessing directive, 2917 - // in an #if 0 block. */ 2918 if (buffer->cur[1] == '*' 2919 || pfile->state.in_directive 2920 || pfile->state.skipping) 2921 { 2922 result->type = CPP_DIV; 2923 break; 2924 } 2925 else if (! 
buffer->warned_cplusplus_comments) 2926 { 2927 if (cpp_error (pfile, CPP_DL_ERROR, 2928 "C++ style comments are not allowed in " 2929 "ISO C90")) 2930 cpp_error (pfile, CPP_DL_NOTE, 2931 "(this will be reported only once per input " 2932 "file)"); 2933 buffer->warned_cplusplus_comments = 1; 2934 } 2935 } 2936 if (skip_line_comment (pfile) && CPP_OPTION (pfile, warn_comments)) 2937 cpp_warning (pfile, CPP_W_COMMENTS, "multi-line comment"); 2938 } 2939 else if (c == '=') 2940 { 2941 buffer->cur++; 2942 result->type = CPP_DIV_EQ; 2943 break; 2944 } 2945 else 2946 { 2947 result->type = CPP_DIV; 2948 break; 2949 } 2950 2951 if (fallthrough_comment_p (pfile, comment_start)) 2952 fallthrough_comment = true; 2953 2954 if (pfile->cb.comment) 2955 { 2956 size_t len = pfile->buffer->cur - comment_start; 2957 pfile->cb.comment (pfile, result->src_loc, comment_start - 1, 2958 len + 1); 2959 } 2960 2961 if (!pfile->state.save_comments) 2962 { 2963 result->flags |= PREV_WHITE; 2964 goto update_tokens_line; 2965 } 2966 2967 if (fallthrough_comment) 2968 result->flags |= PREV_FALLTHROUGH; 2969 2970 /* Save the comment as a token in its own right. 
*/ 2971 save_comment (pfile, result, comment_start, c); 2972 break; 2973 2974 case '<': 2975 if (pfile->state.angled_headers) 2976 { 2977 lex_string (pfile, result, buffer->cur - 1); 2978 if (result->type != CPP_LESS) 2979 break; 2980 } 2981 2982 result->type = CPP_LESS; 2983 if (*buffer->cur == '=') 2984 { 2985 buffer->cur++, result->type = CPP_LESS_EQ; 2986 if (*buffer->cur == '>' 2987 && CPP_OPTION (pfile, cplusplus) 2988 && CPP_OPTION (pfile, lang) >= CLK_GNUCXX2A) 2989 buffer->cur++, result->type = CPP_SPACESHIP; 2990 } 2991 else if (*buffer->cur == '<') 2992 { 2993 buffer->cur++; 2994 IF_NEXT_IS ('=', CPP_LSHIFT_EQ, CPP_LSHIFT); 2995 } 2996 else if (CPP_OPTION (pfile, digraphs)) 2997 { 2998 if (*buffer->cur == ':') 2999 { 3000 /* C++11 [2.5/3 lex.pptoken], "Otherwise, if the next 3001 three characters are <:: and the subsequent character 3002 is neither : nor >, the < is treated as a preprocessor 3003 token by itself". */ 3004 if (CPP_OPTION (pfile, cplusplus) 3005 && CPP_OPTION (pfile, lang) != CLK_CXX98 3006 && CPP_OPTION (pfile, lang) != CLK_GNUCXX 3007 && buffer->cur[1] == ':' 3008 && buffer->cur[2] != ':' && buffer->cur[2] != '>') 3009 break; 3010 3011 buffer->cur++; 3012 result->flags |= DIGRAPH; 3013 result->type = CPP_OPEN_SQUARE; 3014 } 3015 else if (*buffer->cur == '%') 3016 { 3017 buffer->cur++; 3018 result->flags |= DIGRAPH; 3019 result->type = CPP_OPEN_BRACE; 3020 } 3021 } 3022 break; 3023 3024 case '>': 3025 result->type = CPP_GREATER; 3026 if (*buffer->cur == '=') 3027 buffer->cur++, result->type = CPP_GREATER_EQ; 3028 else if (*buffer->cur == '>') 3029 { 3030 buffer->cur++; 3031 IF_NEXT_IS ('=', CPP_RSHIFT_EQ, CPP_RSHIFT); 3032 } 3033 break; 3034 3035 case '%': 3036 result->type = CPP_MOD; 3037 if (*buffer->cur == '=') 3038 buffer->cur++, result->type = CPP_MOD_EQ; 3039 else if (CPP_OPTION (pfile, digraphs)) 3040 { 3041 if (*buffer->cur == ':') 3042 { 3043 buffer->cur++; 3044 result->flags |= DIGRAPH; 3045 result->type = CPP_HASH; 3046 if 
(*buffer->cur == '%' && buffer->cur[1] == ':') 3047 buffer->cur += 2, result->type = CPP_PASTE, result->val.token_no = 0; 3048 } 3049 else if (*buffer->cur == '>') 3050 { 3051 buffer->cur++; 3052 result->flags |= DIGRAPH; 3053 result->type = CPP_CLOSE_BRACE; 3054 } 3055 } 3056 break; 3057 3058 case '.': 3059 result->type = CPP_DOT; 3060 if (ISDIGIT (*buffer->cur)) 3061 { 3062 struct normalize_state nst = INITIAL_NORMALIZE_STATE; 3063 result->type = CPP_NUMBER; 3064 lex_number (pfile, &result->val.str, &nst); 3065 warn_about_normalization (pfile, result, &nst); 3066 } 3067 else if (*buffer->cur == '.' && buffer->cur[1] == '.') 3068 buffer->cur += 2, result->type = CPP_ELLIPSIS; 3069 else if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus)) 3070 buffer->cur++, result->type = CPP_DOT_STAR; 3071 break; 3072 3073 case '+': 3074 result->type = CPP_PLUS; 3075 if (*buffer->cur == '+') 3076 buffer->cur++, result->type = CPP_PLUS_PLUS; 3077 else if (*buffer->cur == '=') 3078 buffer->cur++, result->type = CPP_PLUS_EQ; 3079 break; 3080 3081 case '-': 3082 result->type = CPP_MINUS; 3083 if (*buffer->cur == '>') 3084 { 3085 buffer->cur++; 3086 result->type = CPP_DEREF; 3087 if (*buffer->cur == '*' && CPP_OPTION (pfile, cplusplus)) 3088 buffer->cur++, result->type = CPP_DEREF_STAR; 3089 } 3090 else if (*buffer->cur == '-') 3091 buffer->cur++, result->type = CPP_MINUS_MINUS; 3092 else if (*buffer->cur == '=') 3093 buffer->cur++, result->type = CPP_MINUS_EQ; 3094 break; 3095 3096 case '&': 3097 result->type = CPP_AND; 3098 if (*buffer->cur == '&') 3099 buffer->cur++, result->type = CPP_AND_AND; 3100 else if (*buffer->cur == '=') 3101 buffer->cur++, result->type = CPP_AND_EQ; 3102 break; 3103 3104 case '|': 3105 result->type = CPP_OR; 3106 if (*buffer->cur == '|') 3107 buffer->cur++, result->type = CPP_OR_OR; 3108 else if (*buffer->cur == '=') 3109 buffer->cur++, result->type = CPP_OR_EQ; 3110 break; 3111 3112 case ':': 3113 result->type = CPP_COLON; 3114 if (*buffer->cur == 
':' && CPP_OPTION (pfile, scope)) 3115 buffer->cur++, result->type = CPP_SCOPE; 3116 else if (*buffer->cur == '>' && CPP_OPTION (pfile, digraphs)) 3117 { 3118 buffer->cur++; 3119 result->flags |= DIGRAPH; 3120 result->type = CPP_CLOSE_SQUARE; 3121 } 3122 break; 3123 3124 case '*': IF_NEXT_IS ('=', CPP_MULT_EQ, CPP_MULT); break; 3125 case '=': IF_NEXT_IS ('=', CPP_EQ_EQ, CPP_EQ); break; 3126 case '!': IF_NEXT_IS ('=', CPP_NOT_EQ, CPP_NOT); break; 3127 case '^': IF_NEXT_IS ('=', CPP_XOR_EQ, CPP_XOR); break; 3128 case '#': IF_NEXT_IS ('#', CPP_PASTE, CPP_HASH); result->val.token_no = 0; break; 3129 3130 case '?': result->type = CPP_QUERY; break; 3131 case '~': result->type = CPP_COMPL; break; 3132 case ',': result->type = CPP_COMMA; break; 3133 case '(': result->type = CPP_OPEN_PAREN; break; 3134 case ')': result->type = CPP_CLOSE_PAREN; break; 3135 case '[': result->type = CPP_OPEN_SQUARE; break; 3136 case ']': result->type = CPP_CLOSE_SQUARE; break; 3137 case '{': result->type = CPP_OPEN_BRACE; break; 3138 case '}': result->type = CPP_CLOSE_BRACE; break; 3139 case ';': result->type = CPP_SEMICOLON; break; 3140 3141 /* @ is a punctuator in Objective-C. */ 3142 case '@': result->type = CPP_ATSIGN; break; 3143 3144 default: 3145 { 3146 const uchar *base = --buffer->cur; 3147 3148 /* Check for an extended identifier ($ or UCN or UTF-8). */ 3149 struct normalize_state nst = INITIAL_NORMALIZE_STATE; 3150 if (forms_identifier_p (pfile, true, &nst)) 3151 { 3152 result->type = CPP_NAME; 3153 result->val.node.node = lex_identifier (pfile, base, true, &nst, 3154 &result->val.node.spelling); 3155 warn_about_normalization (pfile, result, &nst); 3156 break; 3157 } 3158 3159 /* Otherwise this will form a CPP_OTHER token. Parse valid UTF-8 as a 3160 single token. 
*/ 3161 buffer->cur++; 3162 if (c >= utf8_signifier) 3163 { 3164 const uchar *pstr = base; 3165 cppchar_t s; 3166 if (_cpp_valid_utf8 (pfile, &pstr, buffer->rlimit, 0, NULL, &s)) 3167 buffer->cur = pstr; 3168 } 3169 create_literal (pfile, result, base, buffer->cur - base, CPP_OTHER); 3170 break; 3171 } 3172 3173 } 3174 3175 /* Potentially convert the location of the token to a range. */ 3176 if (result->src_loc >= RESERVED_LOCATION_COUNT 3177 && result->type != CPP_EOF) 3178 { 3179 /* Ensure that any line notes are processed, so that we have the 3180 correct physical line/column for the end-point of the token even 3181 when a logical line is split via one or more backslashes. */ 3182 if (buffer->cur >= buffer->notes[buffer->cur_note].pos 3183 && !pfile->overlaid_buffer) 3184 _cpp_process_line_notes (pfile, false); 3185 3186 source_range tok_range; 3187 tok_range.m_start = result->src_loc; 3188 tok_range.m_finish 3189 = linemap_position_for_column (pfile->line_table, 3190 CPP_BUF_COLUMN (buffer, buffer->cur)); 3191 3192 result->src_loc = COMBINE_LOCATION_DATA (pfile->line_table, 3193 result->src_loc, 3194 tok_range, NULL); 3195 } 3196 3197 return result; 3198 } 3199 3200 /* An upper bound on the number of bytes needed to spell TOKEN. 3201 Does not include preceding whitespace. */ 3202 unsigned int 3203 cpp_token_len (const cpp_token *token) 3204 { 3205 unsigned int len; 3206 3207 switch (TOKEN_SPELL (token)) 3208 { 3209 default: len = 6; break; 3210 case SPELL_LITERAL: len = token->val.str.len; break; 3211 case SPELL_IDENT: len = NODE_LEN (token->val.node.node) * 10; break; 3212 } 3213 3214 return len; 3215 } 3216 3217 /* Parse UTF-8 out of NAMEP and place a \U escape in BUFFER. 3218 Return the number of bytes read out of NAME. (There are always 3219 10 bytes written to BUFFER.) 
*/ 3220 3221 static size_t 3222 utf8_to_ucn (unsigned char *buffer, const unsigned char *name) 3223 { 3224 int j; 3225 int ucn_len = 0; 3226 int ucn_len_c; 3227 unsigned t; 3228 unsigned long utf32; 3229 3230 /* Compute the length of the UTF-8 sequence. */ 3231 for (t = *name; t & 0x80; t <<= 1) 3232 ucn_len++; 3233 3234 utf32 = *name & (0x7F >> ucn_len); 3235 for (ucn_len_c = 1; ucn_len_c < ucn_len; ucn_len_c++) 3236 { 3237 utf32 = (utf32 << 6) | (*++name & 0x3F); 3238 3239 /* Ill-formed UTF-8. */ 3240 if ((*name & ~0x3F) != 0x80) 3241 abort (); 3242 } 3243 3244 *buffer++ = '\\'; 3245 *buffer++ = 'U'; 3246 for (j = 7; j >= 0; j--) 3247 *buffer++ = "0123456789abcdef"[(utf32 >> (4 * j)) & 0xF]; 3248 return ucn_len; 3249 } 3250 3251 /* Given a token TYPE corresponding to a digraph, return a pointer to 3252 the spelling of the digraph. */ 3253 static const unsigned char * 3254 cpp_digraph2name (enum cpp_ttype type) 3255 { 3256 return digraph_spellings[(int) type - (int) CPP_FIRST_DIGRAPH]; 3257 } 3258 3259 /* Write the spelling of an identifier IDENT, using UCNs, to BUFFER. 3260 The buffer must already contain the enough space to hold the 3261 token's spelling. Returns a pointer to the character after the 3262 last character written. */ 3263 unsigned char * 3264 _cpp_spell_ident_ucns (unsigned char *buffer, cpp_hashnode *ident) 3265 { 3266 size_t i; 3267 const unsigned char *name = NODE_NAME (ident); 3268 3269 for (i = 0; i < NODE_LEN (ident); i++) 3270 if (name[i] & ~0x7F) 3271 { 3272 i += utf8_to_ucn (buffer, name + i) - 1; 3273 buffer += 10; 3274 } 3275 else 3276 *buffer++ = name[i]; 3277 3278 return buffer; 3279 } 3280 3281 /* Write the spelling of a token TOKEN to BUFFER. The buffer must 3282 already contain the enough space to hold the token's spelling. 3283 Returns a pointer to the character after the last character written. 
3284 FORSTRING is true if this is to be the spelling after translation 3285 phase 1 (with the original spelling of extended identifiers), false 3286 if extended identifiers should always be written using UCNs (there is 3287 no option for always writing them in the internal UTF-8 form). 3288 FIXME: Would be nice if we didn't need the PFILE argument. */ 3289 unsigned char * 3290 cpp_spell_token (cpp_reader *pfile, const cpp_token *token, 3291 unsigned char *buffer, bool forstring) 3292 { 3293 switch (TOKEN_SPELL (token)) 3294 { 3295 case SPELL_OPERATOR: 3296 { 3297 const unsigned char *spelling; 3298 unsigned char c; 3299 3300 if (token->flags & DIGRAPH) 3301 spelling = cpp_digraph2name (token->type); 3302 else if (token->flags & NAMED_OP) 3303 goto spell_ident; 3304 else 3305 spelling = TOKEN_NAME (token); 3306 3307 while ((c = *spelling++) != '\0') 3308 *buffer++ = c; 3309 } 3310 break; 3311 3312 spell_ident: 3313 case SPELL_IDENT: 3314 if (forstring) 3315 { 3316 memcpy (buffer, NODE_NAME (token->val.node.spelling), 3317 NODE_LEN (token->val.node.spelling)); 3318 buffer += NODE_LEN (token->val.node.spelling); 3319 } 3320 else 3321 buffer = _cpp_spell_ident_ucns (buffer, token->val.node.node); 3322 break; 3323 3324 case SPELL_LITERAL: 3325 memcpy (buffer, token->val.str.text, token->val.str.len); 3326 buffer += token->val.str.len; 3327 break; 3328 3329 case SPELL_NONE: 3330 cpp_error (pfile, CPP_DL_ICE, 3331 "unspellable token %s", TOKEN_NAME (token)); 3332 break; 3333 } 3334 3335 return buffer; 3336 } 3337 3338 /* Returns TOKEN spelt as a null-terminated string. The string is 3339 freed when the reader is destroyed. Useful for diagnostics. 
*/ 3340 unsigned char * 3341 cpp_token_as_text (cpp_reader *pfile, const cpp_token *token) 3342 { 3343 unsigned int len = cpp_token_len (token) + 1; 3344 unsigned char *start = _cpp_unaligned_alloc (pfile, len), *end; 3345 3346 end = cpp_spell_token (pfile, token, start, false); 3347 end[0] = '\0'; 3348 3349 return start; 3350 } 3351 3352 /* Returns a pointer to a string which spells the token defined by 3353 TYPE and FLAGS. Used by C front ends, which really should move to 3354 using cpp_token_as_text. */ 3355 const char * 3356 cpp_type2name (enum cpp_ttype type, unsigned char flags) 3357 { 3358 if (flags & DIGRAPH) 3359 return (const char *) cpp_digraph2name (type); 3360 else if (flags & NAMED_OP) 3361 return cpp_named_operator2name (type); 3362 3363 return (const char *) token_spellings[type].name; 3364 } 3365 3366 /* Writes the spelling of token to FP, without any preceding space. 3367 Separated from cpp_spell_token for efficiency - to avoid stdio 3368 double-buffering. */ 3369 void 3370 cpp_output_token (const cpp_token *token, FILE *fp) 3371 { 3372 switch (TOKEN_SPELL (token)) 3373 { 3374 case SPELL_OPERATOR: 3375 { 3376 const unsigned char *spelling; 3377 int c; 3378 3379 if (token->flags & DIGRAPH) 3380 spelling = cpp_digraph2name (token->type); 3381 else if (token->flags & NAMED_OP) 3382 goto spell_ident; 3383 else 3384 spelling = TOKEN_NAME (token); 3385 3386 c = *spelling; 3387 do 3388 putc (c, fp); 3389 while ((c = *++spelling) != '\0'); 3390 } 3391 break; 3392 3393 spell_ident: 3394 case SPELL_IDENT: 3395 { 3396 size_t i; 3397 const unsigned char * name = NODE_NAME (token->val.node.node); 3398 3399 for (i = 0; i < NODE_LEN (token->val.node.node); i++) 3400 if (name[i] & ~0x7F) 3401 { 3402 unsigned char buffer[10]; 3403 i += utf8_to_ucn (buffer, name + i) - 1; 3404 fwrite (buffer, 1, 10, fp); 3405 } 3406 else 3407 fputc (NODE_NAME (token->val.node.node)[i], fp); 3408 } 3409 break; 3410 3411 case SPELL_LITERAL: 3412 fwrite (token->val.str.text, 1, 
token->val.str.len, fp); 3413 break; 3414 3415 case SPELL_NONE: 3416 /* An error, most probably. */ 3417 break; 3418 } 3419 } 3420 3421 /* Compare two tokens. */ 3422 int 3423 _cpp_equiv_tokens (const cpp_token *a, const cpp_token *b) 3424 { 3425 if (a->type == b->type && a->flags == b->flags) 3426 switch (TOKEN_SPELL (a)) 3427 { 3428 default: /* Keep compiler happy. */ 3429 case SPELL_OPERATOR: 3430 /* token_no is used to track where multiple consecutive ## 3431 tokens were originally located. */ 3432 return (a->type != CPP_PASTE || a->val.token_no == b->val.token_no); 3433 case SPELL_NONE: 3434 return (a->type != CPP_MACRO_ARG 3435 || (a->val.macro_arg.arg_no == b->val.macro_arg.arg_no 3436 && a->val.macro_arg.spelling == b->val.macro_arg.spelling)); 3437 case SPELL_IDENT: 3438 return (a->val.node.node == b->val.node.node 3439 && a->val.node.spelling == b->val.node.spelling); 3440 case SPELL_LITERAL: 3441 return (a->val.str.len == b->val.str.len 3442 && !memcmp (a->val.str.text, b->val.str.text, 3443 a->val.str.len)); 3444 } 3445 3446 return 0; 3447 } 3448 3449 /* Returns nonzero if a space should be inserted to avoid an 3450 accidental token paste for output. For simplicity, it is 3451 conservative, and occasionally advises a space where one is not 3452 needed, e.g. "." and ".2". */ 3453 int 3454 cpp_avoid_paste (cpp_reader *pfile, const cpp_token *token1, 3455 const cpp_token *token2) 3456 { 3457 enum cpp_ttype a = token1->type, b = token2->type; 3458 cppchar_t c; 3459 3460 if (token1->flags & NAMED_OP) 3461 a = CPP_NAME; 3462 if (token2->flags & NAMED_OP) 3463 b = CPP_NAME; 3464 3465 c = EOF; 3466 if (token2->flags & DIGRAPH) 3467 c = digraph_spellings[(int) b - (int) CPP_FIRST_DIGRAPH][0]; 3468 else if (token_spellings[b].category == SPELL_OPERATOR) 3469 c = token_spellings[b].name[0]; 3470 3471 /* Quickly get everything that can paste with an '='. 
*/ 3472 if ((int) a <= (int) CPP_LAST_EQ && c == '=') 3473 return 1; 3474 3475 switch (a) 3476 { 3477 case CPP_GREATER: return c == '>'; 3478 case CPP_LESS: return c == '<' || c == '%' || c == ':'; 3479 case CPP_PLUS: return c == '+'; 3480 case CPP_MINUS: return c == '-' || c == '>'; 3481 case CPP_DIV: return c == '/' || c == '*'; /* Comments. */ 3482 case CPP_MOD: return c == ':' || c == '>'; 3483 case CPP_AND: return c == '&'; 3484 case CPP_OR: return c == '|'; 3485 case CPP_COLON: return c == ':' || c == '>'; 3486 case CPP_DEREF: return c == '*'; 3487 case CPP_DOT: return c == '.' || c == '%' || b == CPP_NUMBER; 3488 case CPP_HASH: return c == '#' || c == '%'; /* Digraph form. */ 3489 case CPP_PRAGMA: 3490 case CPP_NAME: return ((b == CPP_NUMBER 3491 && name_p (pfile, &token2->val.str)) 3492 || b == CPP_NAME 3493 || b == CPP_CHAR || b == CPP_STRING); /* L */ 3494 case CPP_NUMBER: return (b == CPP_NUMBER || b == CPP_NAME 3495 || c == '.' || c == '+' || c == '-'); 3496 /* UCNs */ 3497 case CPP_OTHER: return ((token1->val.str.text[0] == '\\' 3498 && b == CPP_NAME) 3499 || (CPP_OPTION (pfile, objc) 3500 && token1->val.str.text[0] == '@' 3501 && (b == CPP_NAME || b == CPP_STRING))); 3502 case CPP_LESS_EQ: return c == '>'; 3503 case CPP_STRING: 3504 case CPP_WSTRING: 3505 case CPP_UTF8STRING: 3506 case CPP_STRING16: 3507 case CPP_STRING32: return (CPP_OPTION (pfile, user_literals) 3508 && (b == CPP_NAME 3509 || (TOKEN_SPELL (token2) == SPELL_LITERAL 3510 && ISIDST (token2->val.str.text[0])))); 3511 3512 default: break; 3513 } 3514 3515 return 0; 3516 } 3517 3518 /* Output all the remaining tokens on the current line, and a newline 3519 character, to FP. Leading whitespace is removed. If there are 3520 macros, special token padding is not performed. 
*/ 3521 void 3522 cpp_output_line (cpp_reader *pfile, FILE *fp) 3523 { 3524 const cpp_token *token; 3525 3526 token = cpp_get_token (pfile); 3527 while (token->type != CPP_EOF) 3528 { 3529 cpp_output_token (token, fp); 3530 token = cpp_get_token (pfile); 3531 if (token->flags & PREV_WHITE) 3532 putc (' ', fp); 3533 } 3534 3535 putc ('\n', fp); 3536 } 3537 3538 /* Return a string representation of all the remaining tokens on the 3539 current line. The result is allocated using xmalloc and must be 3540 freed by the caller. */ 3541 unsigned char * 3542 cpp_output_line_to_string (cpp_reader *pfile, const unsigned char *dir_name) 3543 { 3544 const cpp_token *token; 3545 unsigned int out = dir_name ? ustrlen (dir_name) : 0; 3546 unsigned int alloced = 120 + out; 3547 unsigned char *result = (unsigned char *) xmalloc (alloced); 3548 3549 /* If DIR_NAME is empty, there are no initial contents. */ 3550 if (dir_name) 3551 { 3552 sprintf ((char *) result, "#%s ", dir_name); 3553 out += 2; 3554 } 3555 3556 token = cpp_get_token (pfile); 3557 while (token->type != CPP_EOF) 3558 { 3559 unsigned char *last; 3560 /* Include room for a possible space and the terminating nul. */ 3561 unsigned int len = cpp_token_len (token) + 2; 3562 3563 if (out + len > alloced) 3564 { 3565 alloced *= 2; 3566 if (out + len > alloced) 3567 alloced = out + len; 3568 result = (unsigned char *) xrealloc (result, alloced); 3569 } 3570 3571 last = cpp_spell_token (pfile, token, &result[out], 0); 3572 out = last - result; 3573 3574 token = cpp_get_token (pfile); 3575 if (token->flags & PREV_WHITE) 3576 result[out++] = ' '; 3577 } 3578 3579 result[out] = '\0'; 3580 return result; 3581 } 3582 3583 /* Memory buffers. Changing these three constants can have a dramatic 3584 effect on performance. The values here are reasonable defaults, 3585 but might be tuned. If you adjust them, be sure to test across a 3586 range of uses of cpplib, including heavy nested function-like macro 3587 expansion. 
Also check the change in peak memory usage (NJAMD is a 3588 good tool for this). */ 3589 #define MIN_BUFF_SIZE 8000 3590 #define BUFF_SIZE_UPPER_BOUND(MIN_SIZE) (MIN_BUFF_SIZE + (MIN_SIZE) * 3 / 2) 3591 #define EXTENDED_BUFF_SIZE(BUFF, MIN_EXTRA) \ 3592 (MIN_EXTRA + ((BUFF)->limit - (BUFF)->cur) * 2) 3593 3594 #if MIN_BUFF_SIZE > BUFF_SIZE_UPPER_BOUND (0) 3595 #error BUFF_SIZE_UPPER_BOUND must be at least as large as MIN_BUFF_SIZE! 3596 #endif 3597 3598 /* Create a new allocation buffer. Place the control block at the end 3599 of the buffer, so that buffer overflows will cause immediate chaos. */ 3600 static _cpp_buff * 3601 new_buff (size_t len) 3602 { 3603 _cpp_buff *result; 3604 unsigned char *base; 3605 3606 if (len < MIN_BUFF_SIZE) 3607 len = MIN_BUFF_SIZE; 3608 len = CPP_ALIGN (len); 3609 3610 #ifdef ENABLE_VALGRIND_ANNOTATIONS 3611 /* Valgrind warns about uses of interior pointers, so put _cpp_buff 3612 struct first. */ 3613 size_t slen = CPP_ALIGN2 (sizeof (_cpp_buff), 2 * DEFAULT_ALIGNMENT); 3614 base = XNEWVEC (unsigned char, len + slen); 3615 result = (_cpp_buff *) base; 3616 base += slen; 3617 #else 3618 base = XNEWVEC (unsigned char, len + sizeof (_cpp_buff)); 3619 result = (_cpp_buff *) (base + len); 3620 #endif 3621 result->base = base; 3622 result->cur = base; 3623 result->limit = base + len; 3624 result->next = NULL; 3625 return result; 3626 } 3627 3628 /* Place a chain of unwanted allocation buffers on the free list. */ 3629 void 3630 _cpp_release_buff (cpp_reader *pfile, _cpp_buff *buff) 3631 { 3632 _cpp_buff *end = buff; 3633 3634 while (end->next) 3635 end = end->next; 3636 end->next = pfile->free_buffs; 3637 pfile->free_buffs = buff; 3638 } 3639 3640 /* Return a free buffer of size at least MIN_SIZE. 
*/ 3641 _cpp_buff * 3642 _cpp_get_buff (cpp_reader *pfile, size_t min_size) 3643 { 3644 _cpp_buff *result, **p; 3645 3646 for (p = &pfile->free_buffs;; p = &(*p)->next) 3647 { 3648 size_t size; 3649 3650 if (*p == NULL) 3651 return new_buff (min_size); 3652 result = *p; 3653 size = result->limit - result->base; 3654 /* Return a buffer that's big enough, but don't waste one that's 3655 way too big. */ 3656 if (size >= min_size && size <= BUFF_SIZE_UPPER_BOUND (min_size)) 3657 break; 3658 } 3659 3660 *p = result->next; 3661 result->next = NULL; 3662 result->cur = result->base; 3663 return result; 3664 } 3665 3666 /* Creates a new buffer with enough space to hold the uncommitted 3667 remaining bytes of BUFF, and at least MIN_EXTRA more bytes. Copies 3668 the excess bytes to the new buffer. Chains the new buffer after 3669 BUFF, and returns the new buffer. */ 3670 _cpp_buff * 3671 _cpp_append_extend_buff (cpp_reader *pfile, _cpp_buff *buff, size_t min_extra) 3672 { 3673 size_t size = EXTENDED_BUFF_SIZE (buff, min_extra); 3674 _cpp_buff *new_buff = _cpp_get_buff (pfile, size); 3675 3676 buff->next = new_buff; 3677 memcpy (new_buff->base, buff->cur, BUFF_ROOM (buff)); 3678 return new_buff; 3679 } 3680 3681 /* Creates a new buffer with enough space to hold the uncommitted 3682 remaining bytes of the buffer pointed to by BUFF, and at least 3683 MIN_EXTRA more bytes. Copies the excess bytes to the new buffer. 3684 Chains the new buffer before the buffer pointed to by BUFF, and 3685 updates the pointer to point to the new buffer. */ 3686 void 3687 _cpp_extend_buff (cpp_reader *pfile, _cpp_buff **pbuff, size_t min_extra) 3688 { 3689 _cpp_buff *new_buff, *old_buff = *pbuff; 3690 size_t size = EXTENDED_BUFF_SIZE (old_buff, min_extra); 3691 3692 new_buff = _cpp_get_buff (pfile, size); 3693 memcpy (new_buff->base, old_buff->cur, BUFF_ROOM (old_buff)); 3694 new_buff->next = old_buff; 3695 *pbuff = new_buff; 3696 } 3697 3698 /* Free a chain of buffers starting at BUFF. 
*/ 3699 void 3700 _cpp_free_buff (_cpp_buff *buff) 3701 { 3702 _cpp_buff *next; 3703 3704 for (; buff; buff = next) 3705 { 3706 next = buff->next; 3707 #ifdef ENABLE_VALGRIND_ANNOTATIONS 3708 free (buff); 3709 #else 3710 free (buff->base); 3711 #endif 3712 } 3713 } 3714 3715 /* Allocate permanent, unaligned storage of length LEN. */ 3716 unsigned char * 3717 _cpp_unaligned_alloc (cpp_reader *pfile, size_t len) 3718 { 3719 _cpp_buff *buff = pfile->u_buff; 3720 unsigned char *result = buff->cur; 3721 3722 if (len > (size_t) (buff->limit - result)) 3723 { 3724 buff = _cpp_get_buff (pfile, len); 3725 buff->next = pfile->u_buff; 3726 pfile->u_buff = buff; 3727 result = buff->cur; 3728 } 3729 3730 buff->cur = result + len; 3731 return result; 3732 } 3733 3734 /* Allocate permanent, unaligned storage of length LEN from a_buff. 3735 That buffer is used for growing allocations when saving macro 3736 replacement lists in a #define, and when parsing an answer to an 3737 assertion in #assert, #unassert or #if (and therefore possibly 3738 whilst expanding macros). It therefore must not be used by any 3739 code that they might call: specifically the lexer and the guts of 3740 the macro expander. 3741 3742 All existing other uses clearly fit this restriction: storing 3743 registered pragmas during initialization. */ 3744 unsigned char * 3745 _cpp_aligned_alloc (cpp_reader *pfile, size_t len) 3746 { 3747 _cpp_buff *buff = pfile->a_buff; 3748 unsigned char *result = buff->cur; 3749 3750 if (len > (size_t) (buff->limit - result)) 3751 { 3752 buff = _cpp_get_buff (pfile, len); 3753 buff->next = pfile->a_buff; 3754 pfile->a_buff = buff; 3755 result = buff->cur; 3756 } 3757 3758 buff->cur = result + len; 3759 return result; 3760 } 3761 3762 /* Commit or allocate storage from a buffer. 
*/ 3763 3764 void * 3765 _cpp_commit_buff (cpp_reader *pfile, size_t size) 3766 { 3767 void *ptr = BUFF_FRONT (pfile->a_buff); 3768 3769 if (pfile->hash_table->alloc_subobject) 3770 { 3771 void *copy = pfile->hash_table->alloc_subobject (size); 3772 memcpy (copy, ptr, size); 3773 ptr = copy; 3774 } 3775 else 3776 BUFF_FRONT (pfile->a_buff) += size; 3777 3778 return ptr; 3779 } 3780 3781 /* Say which field of TOK is in use. */ 3782 3783 enum cpp_token_fld_kind 3784 cpp_token_val_index (const cpp_token *tok) 3785 { 3786 switch (TOKEN_SPELL (tok)) 3787 { 3788 case SPELL_IDENT: 3789 return CPP_TOKEN_FLD_NODE; 3790 case SPELL_LITERAL: 3791 return CPP_TOKEN_FLD_STR; 3792 case SPELL_OPERATOR: 3793 /* Operands which were originally spelled as ident keep around 3794 the node for the exact spelling. */ 3795 if (tok->flags & NAMED_OP) 3796 return CPP_TOKEN_FLD_NODE; 3797 else if (tok->type == CPP_PASTE) 3798 return CPP_TOKEN_FLD_TOKEN_NO; 3799 else 3800 return CPP_TOKEN_FLD_NONE; 3801 case SPELL_NONE: 3802 if (tok->type == CPP_MACRO_ARG) 3803 return CPP_TOKEN_FLD_ARG_NO; 3804 else if (tok->type == CPP_PADDING) 3805 return CPP_TOKEN_FLD_SOURCE; 3806 else if (tok->type == CPP_PRAGMA) 3807 return CPP_TOKEN_FLD_PRAGMA; 3808 /* fall through */ 3809 default: 3810 return CPP_TOKEN_FLD_NONE; 3811 } 3812 } 3813 3814 /* All tokens lexed in R after calling this function will be forced to 3815 have their location_t to be P, until 3816 cpp_stop_forcing_token_locations is called for R. */ 3817 3818 void 3819 cpp_force_token_locations (cpp_reader *r, location_t loc) 3820 { 3821 r->forced_token_location = loc; 3822 } 3823 3824 /* Go back to assigning locations naturally for lexed tokens. */ 3825 3826 void 3827 cpp_stop_forcing_token_locations (cpp_reader *r) 3828 { 3829 r->forced_token_location = 0; 3830 } 3831