xref: /minix3/external/bsd/llvm/dist/clang/lib/Lex/TokenConcatenation.cpp (revision f4a2713ac843a11c696ec80c0a5e3e5d80b4d338)
1*f4a2713aSLionel Sambuc //===--- TokenConcatenation.cpp - Token Concatenation Avoidance -----------===//
2*f4a2713aSLionel Sambuc //
3*f4a2713aSLionel Sambuc //                     The LLVM Compiler Infrastructure
4*f4a2713aSLionel Sambuc //
5*f4a2713aSLionel Sambuc // This file is distributed under the University of Illinois Open Source
6*f4a2713aSLionel Sambuc // License. See LICENSE.TXT for details.
7*f4a2713aSLionel Sambuc //
8*f4a2713aSLionel Sambuc //===----------------------------------------------------------------------===//
9*f4a2713aSLionel Sambuc //
10*f4a2713aSLionel Sambuc // This file implements the TokenConcatenation class.
11*f4a2713aSLionel Sambuc //
12*f4a2713aSLionel Sambuc //===----------------------------------------------------------------------===//
13*f4a2713aSLionel Sambuc 
14*f4a2713aSLionel Sambuc #include "clang/Lex/TokenConcatenation.h"
15*f4a2713aSLionel Sambuc #include "clang/Basic/CharInfo.h"
16*f4a2713aSLionel Sambuc #include "clang/Lex/Preprocessor.h"
17*f4a2713aSLionel Sambuc #include "llvm/Support/ErrorHandling.h"
18*f4a2713aSLionel Sambuc using namespace clang;
19*f4a2713aSLionel Sambuc 
20*f4a2713aSLionel Sambuc 
21*f4a2713aSLionel Sambuc /// IsStringPrefix - Return true if Str is a string prefix.
22*f4a2713aSLionel Sambuc /// 'L', 'u', 'U', or 'u8'. Including raw versions.
23*f4a2713aSLionel Sambuc static bool IsStringPrefix(StringRef Str, bool CPlusPlus11) {
24*f4a2713aSLionel Sambuc 
25*f4a2713aSLionel Sambuc   if (Str[0] == 'L' ||
26*f4a2713aSLionel Sambuc       (CPlusPlus11 && (Str[0] == 'u' || Str[0] == 'U' || Str[0] == 'R'))) {
27*f4a2713aSLionel Sambuc 
28*f4a2713aSLionel Sambuc     if (Str.size() == 1)
29*f4a2713aSLionel Sambuc       return true; // "L", "u", "U", and "R"
30*f4a2713aSLionel Sambuc 
31*f4a2713aSLionel Sambuc     // Check for raw flavors. Need to make sure the first character wasn't
32*f4a2713aSLionel Sambuc     // already R. Need CPlusPlus11 check for "LR".
33*f4a2713aSLionel Sambuc     if (Str[1] == 'R' && Str[0] != 'R' && Str.size() == 2 && CPlusPlus11)
34*f4a2713aSLionel Sambuc       return true; // "LR", "uR", "UR"
35*f4a2713aSLionel Sambuc 
36*f4a2713aSLionel Sambuc     // Check for "u8" and "u8R"
37*f4a2713aSLionel Sambuc     if (Str[0] == 'u' && Str[1] == '8') {
38*f4a2713aSLionel Sambuc       if (Str.size() == 2) return true; // "u8"
39*f4a2713aSLionel Sambuc       if (Str.size() == 3 && Str[2] == 'R') return true; // "u8R"
40*f4a2713aSLionel Sambuc     }
41*f4a2713aSLionel Sambuc   }
42*f4a2713aSLionel Sambuc 
43*f4a2713aSLionel Sambuc   return false;
44*f4a2713aSLionel Sambuc }
45*f4a2713aSLionel Sambuc 
46*f4a2713aSLionel Sambuc /// IsIdentifierStringPrefix - Return true if the spelling of the token
47*f4a2713aSLionel Sambuc /// is literally 'L', 'u', 'U', or 'u8'. Including raw versions.
48*f4a2713aSLionel Sambuc bool TokenConcatenation::IsIdentifierStringPrefix(const Token &Tok) const {
49*f4a2713aSLionel Sambuc   const LangOptions &LangOpts = PP.getLangOpts();
50*f4a2713aSLionel Sambuc 
51*f4a2713aSLionel Sambuc   if (!Tok.needsCleaning()) {
52*f4a2713aSLionel Sambuc     if (Tok.getLength() < 1 || Tok.getLength() > 3)
53*f4a2713aSLionel Sambuc       return false;
54*f4a2713aSLionel Sambuc     SourceManager &SM = PP.getSourceManager();
55*f4a2713aSLionel Sambuc     const char *Ptr = SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation()));
56*f4a2713aSLionel Sambuc     return IsStringPrefix(StringRef(Ptr, Tok.getLength()),
57*f4a2713aSLionel Sambuc                           LangOpts.CPlusPlus11);
58*f4a2713aSLionel Sambuc   }
59*f4a2713aSLionel Sambuc 
60*f4a2713aSLionel Sambuc   if (Tok.getLength() < 256) {
61*f4a2713aSLionel Sambuc     char Buffer[256];
62*f4a2713aSLionel Sambuc     const char *TokPtr = Buffer;
63*f4a2713aSLionel Sambuc     unsigned length = PP.getSpelling(Tok, TokPtr);
64*f4a2713aSLionel Sambuc     return IsStringPrefix(StringRef(TokPtr, length), LangOpts.CPlusPlus11);
65*f4a2713aSLionel Sambuc   }
66*f4a2713aSLionel Sambuc 
67*f4a2713aSLionel Sambuc   return IsStringPrefix(StringRef(PP.getSpelling(Tok)), LangOpts.CPlusPlus11);
68*f4a2713aSLionel Sambuc }
69*f4a2713aSLionel Sambuc 
70*f4a2713aSLionel Sambuc TokenConcatenation::TokenConcatenation(Preprocessor &pp) : PP(pp) {
71*f4a2713aSLionel Sambuc   memset(TokenInfo, 0, sizeof(TokenInfo));
72*f4a2713aSLionel Sambuc 
73*f4a2713aSLionel Sambuc   // These tokens have custom code in AvoidConcat.
74*f4a2713aSLionel Sambuc   TokenInfo[tok::identifier      ] |= aci_custom;
75*f4a2713aSLionel Sambuc   TokenInfo[tok::numeric_constant] |= aci_custom_firstchar;
76*f4a2713aSLionel Sambuc   TokenInfo[tok::period          ] |= aci_custom_firstchar;
77*f4a2713aSLionel Sambuc   TokenInfo[tok::amp             ] |= aci_custom_firstchar;
78*f4a2713aSLionel Sambuc   TokenInfo[tok::plus            ] |= aci_custom_firstchar;
79*f4a2713aSLionel Sambuc   TokenInfo[tok::minus           ] |= aci_custom_firstchar;
80*f4a2713aSLionel Sambuc   TokenInfo[tok::slash           ] |= aci_custom_firstchar;
81*f4a2713aSLionel Sambuc   TokenInfo[tok::less            ] |= aci_custom_firstchar;
82*f4a2713aSLionel Sambuc   TokenInfo[tok::greater         ] |= aci_custom_firstchar;
83*f4a2713aSLionel Sambuc   TokenInfo[tok::pipe            ] |= aci_custom_firstchar;
84*f4a2713aSLionel Sambuc   TokenInfo[tok::percent         ] |= aci_custom_firstchar;
85*f4a2713aSLionel Sambuc   TokenInfo[tok::colon           ] |= aci_custom_firstchar;
86*f4a2713aSLionel Sambuc   TokenInfo[tok::hash            ] |= aci_custom_firstchar;
87*f4a2713aSLionel Sambuc   TokenInfo[tok::arrow           ] |= aci_custom_firstchar;
88*f4a2713aSLionel Sambuc 
89*f4a2713aSLionel Sambuc   // These tokens have custom code in C++11 mode.
90*f4a2713aSLionel Sambuc   if (PP.getLangOpts().CPlusPlus11) {
91*f4a2713aSLionel Sambuc     TokenInfo[tok::string_literal      ] |= aci_custom;
92*f4a2713aSLionel Sambuc     TokenInfo[tok::wide_string_literal ] |= aci_custom;
93*f4a2713aSLionel Sambuc     TokenInfo[tok::utf8_string_literal ] |= aci_custom;
94*f4a2713aSLionel Sambuc     TokenInfo[tok::utf16_string_literal] |= aci_custom;
95*f4a2713aSLionel Sambuc     TokenInfo[tok::utf32_string_literal] |= aci_custom;
96*f4a2713aSLionel Sambuc     TokenInfo[tok::char_constant       ] |= aci_custom;
97*f4a2713aSLionel Sambuc     TokenInfo[tok::wide_char_constant  ] |= aci_custom;
98*f4a2713aSLionel Sambuc     TokenInfo[tok::utf16_char_constant ] |= aci_custom;
99*f4a2713aSLionel Sambuc     TokenInfo[tok::utf32_char_constant ] |= aci_custom;
100*f4a2713aSLionel Sambuc   }
101*f4a2713aSLionel Sambuc 
102*f4a2713aSLionel Sambuc   // These tokens change behavior if followed by an '='.
103*f4a2713aSLionel Sambuc   TokenInfo[tok::amp         ] |= aci_avoid_equal;           // &=
104*f4a2713aSLionel Sambuc   TokenInfo[tok::plus        ] |= aci_avoid_equal;           // +=
105*f4a2713aSLionel Sambuc   TokenInfo[tok::minus       ] |= aci_avoid_equal;           // -=
106*f4a2713aSLionel Sambuc   TokenInfo[tok::slash       ] |= aci_avoid_equal;           // /=
107*f4a2713aSLionel Sambuc   TokenInfo[tok::less        ] |= aci_avoid_equal;           // <=
108*f4a2713aSLionel Sambuc   TokenInfo[tok::greater     ] |= aci_avoid_equal;           // >=
109*f4a2713aSLionel Sambuc   TokenInfo[tok::pipe        ] |= aci_avoid_equal;           // |=
110*f4a2713aSLionel Sambuc   TokenInfo[tok::percent     ] |= aci_avoid_equal;           // %=
111*f4a2713aSLionel Sambuc   TokenInfo[tok::star        ] |= aci_avoid_equal;           // *=
112*f4a2713aSLionel Sambuc   TokenInfo[tok::exclaim     ] |= aci_avoid_equal;           // !=
113*f4a2713aSLionel Sambuc   TokenInfo[tok::lessless    ] |= aci_avoid_equal;           // <<=
114*f4a2713aSLionel Sambuc   TokenInfo[tok::greatergreater] |= aci_avoid_equal;         // >>=
115*f4a2713aSLionel Sambuc   TokenInfo[tok::caret       ] |= aci_avoid_equal;           // ^=
116*f4a2713aSLionel Sambuc   TokenInfo[tok::equal       ] |= aci_avoid_equal;           // ==
117*f4a2713aSLionel Sambuc }
118*f4a2713aSLionel Sambuc 
119*f4a2713aSLionel Sambuc /// GetFirstChar - Get the first character of the token \arg Tok,
120*f4a2713aSLionel Sambuc /// avoiding calls to getSpelling where possible.
121*f4a2713aSLionel Sambuc static char GetFirstChar(Preprocessor &PP, const Token &Tok) {
122*f4a2713aSLionel Sambuc   if (IdentifierInfo *II = Tok.getIdentifierInfo()) {
123*f4a2713aSLionel Sambuc     // Avoid spelling identifiers, the most common form of token.
124*f4a2713aSLionel Sambuc     return II->getNameStart()[0];
125*f4a2713aSLionel Sambuc   } else if (!Tok.needsCleaning()) {
126*f4a2713aSLionel Sambuc     if (Tok.isLiteral() && Tok.getLiteralData()) {
127*f4a2713aSLionel Sambuc       return *Tok.getLiteralData();
128*f4a2713aSLionel Sambuc     } else {
129*f4a2713aSLionel Sambuc       SourceManager &SM = PP.getSourceManager();
130*f4a2713aSLionel Sambuc       return *SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation()));
131*f4a2713aSLionel Sambuc     }
132*f4a2713aSLionel Sambuc   } else if (Tok.getLength() < 256) {
133*f4a2713aSLionel Sambuc     char Buffer[256];
134*f4a2713aSLionel Sambuc     const char *TokPtr = Buffer;
135*f4a2713aSLionel Sambuc     PP.getSpelling(Tok, TokPtr);
136*f4a2713aSLionel Sambuc     return TokPtr[0];
137*f4a2713aSLionel Sambuc   } else {
138*f4a2713aSLionel Sambuc     return PP.getSpelling(Tok)[0];
139*f4a2713aSLionel Sambuc   }
140*f4a2713aSLionel Sambuc }
141*f4a2713aSLionel Sambuc 
142*f4a2713aSLionel Sambuc /// AvoidConcat - If printing PrevTok immediately followed by Tok would cause
143*f4a2713aSLionel Sambuc /// the two individual tokens to be lexed as a single token, return true
144*f4a2713aSLionel Sambuc /// (which causes a space to be printed between them).  This allows the output
145*f4a2713aSLionel Sambuc /// of -E mode to be lexed to the same token stream as lexing the input
146*f4a2713aSLionel Sambuc /// directly would.
147*f4a2713aSLionel Sambuc ///
148*f4a2713aSLionel Sambuc /// This code must conservatively return true if it doesn't want to be 100%
149*f4a2713aSLionel Sambuc /// accurate.  This will cause the output to include extra space characters,
150*f4a2713aSLionel Sambuc /// but the resulting output won't have incorrect concatenations going on.
151*f4a2713aSLionel Sambuc /// Examples include "..", which we print with a space between, because we
152*f4a2713aSLionel Sambuc /// don't want to track enough to tell "x.." from "...".
153*f4a2713aSLionel Sambuc bool TokenConcatenation::AvoidConcat(const Token &PrevPrevTok,
154*f4a2713aSLionel Sambuc                                      const Token &PrevTok,
155*f4a2713aSLionel Sambuc                                      const Token &Tok) const {
156*f4a2713aSLionel Sambuc   // First, check to see if the tokens were directly adjacent in the original
157*f4a2713aSLionel Sambuc   // source.  If they were, it must be okay to stick them together: if there
158*f4a2713aSLionel Sambuc   // were an issue, the tokens would have been lexed differently.
159*f4a2713aSLionel Sambuc   SourceManager &SM = PP.getSourceManager();
160*f4a2713aSLionel Sambuc   SourceLocation PrevSpellLoc = SM.getSpellingLoc(PrevTok.getLocation());
161*f4a2713aSLionel Sambuc   SourceLocation SpellLoc = SM.getSpellingLoc(Tok.getLocation());
162*f4a2713aSLionel Sambuc   if (PrevSpellLoc.getLocWithOffset(PrevTok.getLength()) == SpellLoc)
163*f4a2713aSLionel Sambuc     return false;
164*f4a2713aSLionel Sambuc 
165*f4a2713aSLionel Sambuc   tok::TokenKind PrevKind = PrevTok.getKind();
166*f4a2713aSLionel Sambuc   if (PrevTok.getIdentifierInfo())  // Language keyword or named operator.
167*f4a2713aSLionel Sambuc     PrevKind = tok::identifier;
168*f4a2713aSLionel Sambuc 
169*f4a2713aSLionel Sambuc   // Look up information on when we should avoid concatenation with prevtok.
170*f4a2713aSLionel Sambuc   unsigned ConcatInfo = TokenInfo[PrevKind];
171*f4a2713aSLionel Sambuc 
172*f4a2713aSLionel Sambuc   // If prevtok never causes a problem for anything after it, return quickly.
173*f4a2713aSLionel Sambuc   if (ConcatInfo == 0) return false;
174*f4a2713aSLionel Sambuc 
175*f4a2713aSLionel Sambuc   if (ConcatInfo & aci_avoid_equal) {
176*f4a2713aSLionel Sambuc     // If the next token is '=' or '==', avoid concatenation.
177*f4a2713aSLionel Sambuc     if (Tok.is(tok::equal) || Tok.is(tok::equalequal))
178*f4a2713aSLionel Sambuc       return true;
179*f4a2713aSLionel Sambuc     ConcatInfo &= ~aci_avoid_equal;
180*f4a2713aSLionel Sambuc   }
181*f4a2713aSLionel Sambuc 
182*f4a2713aSLionel Sambuc   if (ConcatInfo == 0) return false;
183*f4a2713aSLionel Sambuc 
184*f4a2713aSLionel Sambuc   // Basic algorithm: we look at the first character of the second token, and
185*f4a2713aSLionel Sambuc   // determine whether it, if appended to the first token, would form (or
186*f4a2713aSLionel Sambuc   // would contribute) to a larger token if concatenated.
187*f4a2713aSLionel Sambuc   char FirstChar = 0;
188*f4a2713aSLionel Sambuc   if (ConcatInfo & aci_custom) {
189*f4a2713aSLionel Sambuc     // If the token does not need to know the first character, don't get it.
190*f4a2713aSLionel Sambuc   } else {
191*f4a2713aSLionel Sambuc     FirstChar = GetFirstChar(PP, Tok);
192*f4a2713aSLionel Sambuc   }
193*f4a2713aSLionel Sambuc 
194*f4a2713aSLionel Sambuc   switch (PrevKind) {
195*f4a2713aSLionel Sambuc   default:
196*f4a2713aSLionel Sambuc     llvm_unreachable("InitAvoidConcatTokenInfo built wrong");
197*f4a2713aSLionel Sambuc 
198*f4a2713aSLionel Sambuc   case tok::raw_identifier:
199*f4a2713aSLionel Sambuc     llvm_unreachable("tok::raw_identifier in non-raw lexing mode!");
200*f4a2713aSLionel Sambuc 
201*f4a2713aSLionel Sambuc   case tok::string_literal:
202*f4a2713aSLionel Sambuc   case tok::wide_string_literal:
203*f4a2713aSLionel Sambuc   case tok::utf8_string_literal:
204*f4a2713aSLionel Sambuc   case tok::utf16_string_literal:
205*f4a2713aSLionel Sambuc   case tok::utf32_string_literal:
206*f4a2713aSLionel Sambuc   case tok::char_constant:
207*f4a2713aSLionel Sambuc   case tok::wide_char_constant:
208*f4a2713aSLionel Sambuc   case tok::utf16_char_constant:
209*f4a2713aSLionel Sambuc   case tok::utf32_char_constant:
210*f4a2713aSLionel Sambuc     if (!PP.getLangOpts().CPlusPlus11)
211*f4a2713aSLionel Sambuc       return false;
212*f4a2713aSLionel Sambuc 
213*f4a2713aSLionel Sambuc     // In C++11, a string or character literal followed by an identifier is a
214*f4a2713aSLionel Sambuc     // single token.
215*f4a2713aSLionel Sambuc     if (Tok.getIdentifierInfo())
216*f4a2713aSLionel Sambuc       return true;
217*f4a2713aSLionel Sambuc 
218*f4a2713aSLionel Sambuc     // A ud-suffix is an identifier. If the previous token ends with one, treat
219*f4a2713aSLionel Sambuc     // it as an identifier.
220*f4a2713aSLionel Sambuc     if (!PrevTok.hasUDSuffix())
221*f4a2713aSLionel Sambuc       return false;
222*f4a2713aSLionel Sambuc     // FALL THROUGH.
223*f4a2713aSLionel Sambuc   case tok::identifier:   // id+id or id+number or id+L"foo".
224*f4a2713aSLionel Sambuc     // id+'.'... will not append.
225*f4a2713aSLionel Sambuc     if (Tok.is(tok::numeric_constant))
226*f4a2713aSLionel Sambuc       return GetFirstChar(PP, Tok) != '.';
227*f4a2713aSLionel Sambuc 
228*f4a2713aSLionel Sambuc     if (Tok.getIdentifierInfo() || Tok.is(tok::wide_string_literal) ||
229*f4a2713aSLionel Sambuc         Tok.is(tok::utf8_string_literal) || Tok.is(tok::utf16_string_literal) ||
230*f4a2713aSLionel Sambuc         Tok.is(tok::utf32_string_literal) || Tok.is(tok::wide_char_constant) ||
231*f4a2713aSLionel Sambuc         Tok.is(tok::utf16_char_constant) || Tok.is(tok::utf32_char_constant))
232*f4a2713aSLionel Sambuc       return true;
233*f4a2713aSLionel Sambuc 
234*f4a2713aSLionel Sambuc     // If this isn't identifier + string, we're done.
235*f4a2713aSLionel Sambuc     if (Tok.isNot(tok::char_constant) && Tok.isNot(tok::string_literal))
236*f4a2713aSLionel Sambuc       return false;
237*f4a2713aSLionel Sambuc 
238*f4a2713aSLionel Sambuc     // Otherwise, this is a narrow character or string.  If the *identifier*
239*f4a2713aSLionel Sambuc     // is a literal 'L', 'u8', 'u' or 'U', avoid pasting L "foo" -> L"foo".
240*f4a2713aSLionel Sambuc     return IsIdentifierStringPrefix(PrevTok);
241*f4a2713aSLionel Sambuc 
242*f4a2713aSLionel Sambuc   case tok::numeric_constant:
243*f4a2713aSLionel Sambuc     return isPreprocessingNumberBody(FirstChar) ||
244*f4a2713aSLionel Sambuc            FirstChar == '+' || FirstChar == '-';
245*f4a2713aSLionel Sambuc   case tok::period:          // ..., .*, .1234
246*f4a2713aSLionel Sambuc     return (FirstChar == '.' && PrevPrevTok.is(tok::period)) ||
247*f4a2713aSLionel Sambuc            isDigit(FirstChar) ||
248*f4a2713aSLionel Sambuc            (PP.getLangOpts().CPlusPlus && FirstChar == '*');
249*f4a2713aSLionel Sambuc   case tok::amp:             // &&
250*f4a2713aSLionel Sambuc     return FirstChar == '&';
251*f4a2713aSLionel Sambuc   case tok::plus:            // ++
252*f4a2713aSLionel Sambuc     return FirstChar == '+';
253*f4a2713aSLionel Sambuc   case tok::minus:           // --, ->, ->*
254*f4a2713aSLionel Sambuc     return FirstChar == '-' || FirstChar == '>';
255*f4a2713aSLionel Sambuc   case tok::slash:           //, /*, //
256*f4a2713aSLionel Sambuc     return FirstChar == '*' || FirstChar == '/';
257*f4a2713aSLionel Sambuc   case tok::less:            // <<, <<=, <:, <%
258*f4a2713aSLionel Sambuc     return FirstChar == '<' || FirstChar == ':' || FirstChar == '%';
259*f4a2713aSLionel Sambuc   case tok::greater:         // >>, >>=
260*f4a2713aSLionel Sambuc     return FirstChar == '>';
261*f4a2713aSLionel Sambuc   case tok::pipe:            // ||
262*f4a2713aSLionel Sambuc     return FirstChar == '|';
263*f4a2713aSLionel Sambuc   case tok::percent:         // %>, %:
264*f4a2713aSLionel Sambuc     return FirstChar == '>' || FirstChar == ':';
265*f4a2713aSLionel Sambuc   case tok::colon:           // ::, :>
266*f4a2713aSLionel Sambuc     return FirstChar == '>' ||
267*f4a2713aSLionel Sambuc     (PP.getLangOpts().CPlusPlus && FirstChar == ':');
268*f4a2713aSLionel Sambuc   case tok::hash:            // ##, #@, %:%:
269*f4a2713aSLionel Sambuc     return FirstChar == '#' || FirstChar == '@' || FirstChar == '%';
270*f4a2713aSLionel Sambuc   case tok::arrow:           // ->*
271*f4a2713aSLionel Sambuc     return PP.getLangOpts().CPlusPlus && FirstChar == '*';
272*f4a2713aSLionel Sambuc   }
273*f4a2713aSLionel Sambuc }
274