1*f4a2713aSLionel Sambuc //===--- TokenConcatenation.cpp - Token Concatenation Avoidance -----------===// 2*f4a2713aSLionel Sambuc // 3*f4a2713aSLionel Sambuc // The LLVM Compiler Infrastructure 4*f4a2713aSLionel Sambuc // 5*f4a2713aSLionel Sambuc // This file is distributed under the University of Illinois Open Source 6*f4a2713aSLionel Sambuc // License. See LICENSE.TXT for details. 7*f4a2713aSLionel Sambuc // 8*f4a2713aSLionel Sambuc //===----------------------------------------------------------------------===// 9*f4a2713aSLionel Sambuc // 10*f4a2713aSLionel Sambuc // This file implements the TokenConcatenation class. 11*f4a2713aSLionel Sambuc // 12*f4a2713aSLionel Sambuc //===----------------------------------------------------------------------===// 13*f4a2713aSLionel Sambuc 14*f4a2713aSLionel Sambuc #include "clang/Lex/TokenConcatenation.h" 15*f4a2713aSLionel Sambuc #include "clang/Basic/CharInfo.h" 16*f4a2713aSLionel Sambuc #include "clang/Lex/Preprocessor.h" 17*f4a2713aSLionel Sambuc #include "llvm/Support/ErrorHandling.h" 18*f4a2713aSLionel Sambuc using namespace clang; 19*f4a2713aSLionel Sambuc 20*f4a2713aSLionel Sambuc 21*f4a2713aSLionel Sambuc /// IsStringPrefix - Return true if Str is a string prefix. 22*f4a2713aSLionel Sambuc /// 'L', 'u', 'U', or 'u8'. Including raw versions. 23*f4a2713aSLionel Sambuc static bool IsStringPrefix(StringRef Str, bool CPlusPlus11) { 24*f4a2713aSLionel Sambuc 25*f4a2713aSLionel Sambuc if (Str[0] == 'L' || 26*f4a2713aSLionel Sambuc (CPlusPlus11 && (Str[0] == 'u' || Str[0] == 'U' || Str[0] == 'R'))) { 27*f4a2713aSLionel Sambuc 28*f4a2713aSLionel Sambuc if (Str.size() == 1) 29*f4a2713aSLionel Sambuc return true; // "L", "u", "U", and "R" 30*f4a2713aSLionel Sambuc 31*f4a2713aSLionel Sambuc // Check for raw flavors. Need to make sure the first character wasn't 32*f4a2713aSLionel Sambuc // already R. Need CPlusPlus11 check for "LR". 33*f4a2713aSLionel Sambuc if (Str[1] == 'R' && Str[0] != 'R' && Str.size() == 2 && CPlusPlus11) 34*f4a2713aSLionel Sambuc return true; // "LR", "uR", "UR" 35*f4a2713aSLionel Sambuc 36*f4a2713aSLionel Sambuc // Check for "u8" and "u8R" 37*f4a2713aSLionel Sambuc if (Str[0] == 'u' && Str[1] == '8') { 38*f4a2713aSLionel Sambuc if (Str.size() == 2) return true; // "u8" 39*f4a2713aSLionel Sambuc if (Str.size() == 3 && Str[2] == 'R') return true; // "u8R" 40*f4a2713aSLionel Sambuc } 41*f4a2713aSLionel Sambuc } 42*f4a2713aSLionel Sambuc 43*f4a2713aSLionel Sambuc return false; 44*f4a2713aSLionel Sambuc } 45*f4a2713aSLionel Sambuc 46*f4a2713aSLionel Sambuc /// IsIdentifierStringPrefix - Return true if the spelling of the token 47*f4a2713aSLionel Sambuc /// is literally 'L', 'u', 'U', or 'u8'. Including raw versions. 48*f4a2713aSLionel Sambuc bool TokenConcatenation::IsIdentifierStringPrefix(const Token &Tok) const { 49*f4a2713aSLionel Sambuc const LangOptions &LangOpts = PP.getLangOpts(); 50*f4a2713aSLionel Sambuc 51*f4a2713aSLionel Sambuc if (!Tok.needsCleaning()) { 52*f4a2713aSLionel Sambuc if (Tok.getLength() < 1 || Tok.getLength() > 3) 53*f4a2713aSLionel Sambuc return false; 54*f4a2713aSLionel Sambuc SourceManager &SM = PP.getSourceManager(); 55*f4a2713aSLionel Sambuc const char *Ptr = SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation())); 56*f4a2713aSLionel Sambuc return IsStringPrefix(StringRef(Ptr, Tok.getLength()), 57*f4a2713aSLionel Sambuc LangOpts.CPlusPlus11); 58*f4a2713aSLionel Sambuc } 59*f4a2713aSLionel Sambuc 60*f4a2713aSLionel Sambuc if (Tok.getLength() < 256) { 61*f4a2713aSLionel Sambuc char Buffer[256]; 62*f4a2713aSLionel Sambuc const char *TokPtr = Buffer; 63*f4a2713aSLionel Sambuc unsigned length = PP.getSpelling(Tok, TokPtr); 64*f4a2713aSLionel Sambuc return IsStringPrefix(StringRef(TokPtr, length), LangOpts.CPlusPlus11); 65*f4a2713aSLionel Sambuc } 66*f4a2713aSLionel Sambuc 67*f4a2713aSLionel Sambuc return IsStringPrefix(StringRef(PP.getSpelling(Tok)), LangOpts.CPlusPlus11); 68*f4a2713aSLionel Sambuc } 69*f4a2713aSLionel Sambuc 70*f4a2713aSLionel Sambuc TokenConcatenation::TokenConcatenation(Preprocessor &pp) : PP(pp) { 71*f4a2713aSLionel Sambuc memset(TokenInfo, 0, sizeof(TokenInfo)); 72*f4a2713aSLionel Sambuc 73*f4a2713aSLionel Sambuc // These tokens have custom code in AvoidConcat. 74*f4a2713aSLionel Sambuc TokenInfo[tok::identifier ] |= aci_custom; 75*f4a2713aSLionel Sambuc TokenInfo[tok::numeric_constant] |= aci_custom_firstchar; 76*f4a2713aSLionel Sambuc TokenInfo[tok::period ] |= aci_custom_firstchar; 77*f4a2713aSLionel Sambuc TokenInfo[tok::amp ] |= aci_custom_firstchar; 78*f4a2713aSLionel Sambuc TokenInfo[tok::plus ] |= aci_custom_firstchar; 79*f4a2713aSLionel Sambuc TokenInfo[tok::minus ] |= aci_custom_firstchar; 80*f4a2713aSLionel Sambuc TokenInfo[tok::slash ] |= aci_custom_firstchar; 81*f4a2713aSLionel Sambuc TokenInfo[tok::less ] |= aci_custom_firstchar; 82*f4a2713aSLionel Sambuc TokenInfo[tok::greater ] |= aci_custom_firstchar; 83*f4a2713aSLionel Sambuc TokenInfo[tok::pipe ] |= aci_custom_firstchar; 84*f4a2713aSLionel Sambuc TokenInfo[tok::percent ] |= aci_custom_firstchar; 85*f4a2713aSLionel Sambuc TokenInfo[tok::colon ] |= aci_custom_firstchar; 86*f4a2713aSLionel Sambuc TokenInfo[tok::hash ] |= aci_custom_firstchar; 87*f4a2713aSLionel Sambuc TokenInfo[tok::arrow ] |= aci_custom_firstchar; 88*f4a2713aSLionel Sambuc 89*f4a2713aSLionel Sambuc // These tokens have custom code in C++11 mode. 90*f4a2713aSLionel Sambuc if (PP.getLangOpts().CPlusPlus11) { 91*f4a2713aSLionel Sambuc TokenInfo[tok::string_literal ] |= aci_custom; 92*f4a2713aSLionel Sambuc TokenInfo[tok::wide_string_literal ] |= aci_custom; 93*f4a2713aSLionel Sambuc TokenInfo[tok::utf8_string_literal ] |= aci_custom; 94*f4a2713aSLionel Sambuc TokenInfo[tok::utf16_string_literal] |= aci_custom; 95*f4a2713aSLionel Sambuc TokenInfo[tok::utf32_string_literal] |= aci_custom; 96*f4a2713aSLionel Sambuc TokenInfo[tok::char_constant ] |= aci_custom; 97*f4a2713aSLionel Sambuc TokenInfo[tok::wide_char_constant ] |= aci_custom; 98*f4a2713aSLionel Sambuc TokenInfo[tok::utf16_char_constant ] |= aci_custom; 99*f4a2713aSLionel Sambuc TokenInfo[tok::utf32_char_constant ] |= aci_custom; 100*f4a2713aSLionel Sambuc } 101*f4a2713aSLionel Sambuc 102*f4a2713aSLionel Sambuc // These tokens change behavior if followed by an '='. 103*f4a2713aSLionel Sambuc TokenInfo[tok::amp ] |= aci_avoid_equal; // &= 104*f4a2713aSLionel Sambuc TokenInfo[tok::plus ] |= aci_avoid_equal; // += 105*f4a2713aSLionel Sambuc TokenInfo[tok::minus ] |= aci_avoid_equal; // -= 106*f4a2713aSLionel Sambuc TokenInfo[tok::slash ] |= aci_avoid_equal; // /= 107*f4a2713aSLionel Sambuc TokenInfo[tok::less ] |= aci_avoid_equal; // <= 108*f4a2713aSLionel Sambuc TokenInfo[tok::greater ] |= aci_avoid_equal; // >= 109*f4a2713aSLionel Sambuc TokenInfo[tok::pipe ] |= aci_avoid_equal; // |= 110*f4a2713aSLionel Sambuc TokenInfo[tok::percent ] |= aci_avoid_equal; // %= 111*f4a2713aSLionel Sambuc TokenInfo[tok::star ] |= aci_avoid_equal; // *= 112*f4a2713aSLionel Sambuc TokenInfo[tok::exclaim ] |= aci_avoid_equal; // != 113*f4a2713aSLionel Sambuc TokenInfo[tok::lessless ] |= aci_avoid_equal; // <<= 114*f4a2713aSLionel Sambuc TokenInfo[tok::greatergreater] |= aci_avoid_equal; // >>= 115*f4a2713aSLionel Sambuc TokenInfo[tok::caret ] |= aci_avoid_equal; // ^= 116*f4a2713aSLionel Sambuc TokenInfo[tok::equal ] |= aci_avoid_equal; // == 117*f4a2713aSLionel Sambuc } 118*f4a2713aSLionel Sambuc 119*f4a2713aSLionel Sambuc /// GetFirstChar - Get the first character of the token \arg Tok, 120*f4a2713aSLionel Sambuc /// avoiding calls to getSpelling where possible. 121*f4a2713aSLionel Sambuc static char GetFirstChar(Preprocessor &PP, const Token &Tok) { 122*f4a2713aSLionel Sambuc if (IdentifierInfo *II = Tok.getIdentifierInfo()) { 123*f4a2713aSLionel Sambuc // Avoid spelling identifiers, the most common form of token. 124*f4a2713aSLionel Sambuc return II->getNameStart()[0]; 125*f4a2713aSLionel Sambuc } else if (!Tok.needsCleaning()) { 126*f4a2713aSLionel Sambuc if (Tok.isLiteral() && Tok.getLiteralData()) { 127*f4a2713aSLionel Sambuc return *Tok.getLiteralData(); 128*f4a2713aSLionel Sambuc } else { 129*f4a2713aSLionel Sambuc SourceManager &SM = PP.getSourceManager(); 130*f4a2713aSLionel Sambuc return *SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation())); 131*f4a2713aSLionel Sambuc } 132*f4a2713aSLionel Sambuc } else if (Tok.getLength() < 256) { 133*f4a2713aSLionel Sambuc char Buffer[256]; 134*f4a2713aSLionel Sambuc const char *TokPtr = Buffer; 135*f4a2713aSLionel Sambuc PP.getSpelling(Tok, TokPtr); 136*f4a2713aSLionel Sambuc return TokPtr[0]; 137*f4a2713aSLionel Sambuc } else { 138*f4a2713aSLionel Sambuc return PP.getSpelling(Tok)[0]; 139*f4a2713aSLionel Sambuc } 140*f4a2713aSLionel Sambuc } 141*f4a2713aSLionel Sambuc 142*f4a2713aSLionel Sambuc /// AvoidConcat - If printing PrevTok immediately followed by Tok would cause 143*f4a2713aSLionel Sambuc /// the two individual tokens to be lexed as a single token, return true 144*f4a2713aSLionel Sambuc /// (which causes a space to be printed between them). This allows the output 145*f4a2713aSLionel Sambuc /// of -E mode to be lexed to the same token stream as lexing the input 146*f4a2713aSLionel Sambuc /// directly would. 147*f4a2713aSLionel Sambuc /// 148*f4a2713aSLionel Sambuc /// This code must conservatively return true if it doesn't want to be 100% 149*f4a2713aSLionel Sambuc /// accurate. This will cause the output to include extra space characters, 150*f4a2713aSLionel Sambuc /// but the resulting output won't have incorrect concatenations going on. 151*f4a2713aSLionel Sambuc /// Examples include "..", which we print with a space between, because we 152*f4a2713aSLionel Sambuc /// don't want to track enough to tell "x.." from "...". 153*f4a2713aSLionel Sambuc bool TokenConcatenation::AvoidConcat(const Token &PrevPrevTok, 154*f4a2713aSLionel Sambuc const Token &PrevTok, 155*f4a2713aSLionel Sambuc const Token &Tok) const { 156*f4a2713aSLionel Sambuc // First, check to see if the tokens were directly adjacent in the original 157*f4a2713aSLionel Sambuc // source. If they were, it must be okay to stick them together: if there 158*f4a2713aSLionel Sambuc // were an issue, the tokens would have been lexed differently. 159*f4a2713aSLionel Sambuc SourceManager &SM = PP.getSourceManager(); 160*f4a2713aSLionel Sambuc SourceLocation PrevSpellLoc = SM.getSpellingLoc(PrevTok.getLocation()); 161*f4a2713aSLionel Sambuc SourceLocation SpellLoc = SM.getSpellingLoc(Tok.getLocation()); 162*f4a2713aSLionel Sambuc if (PrevSpellLoc.getLocWithOffset(PrevTok.getLength()) == SpellLoc) 163*f4a2713aSLionel Sambuc return false; 164*f4a2713aSLionel Sambuc 165*f4a2713aSLionel Sambuc tok::TokenKind PrevKind = PrevTok.getKind(); 166*f4a2713aSLionel Sambuc if (PrevTok.getIdentifierInfo()) // Language keyword or named operator. 167*f4a2713aSLionel Sambuc PrevKind = tok::identifier; 168*f4a2713aSLionel Sambuc 169*f4a2713aSLionel Sambuc // Look up information on when we should avoid concatenation with prevtok. 170*f4a2713aSLionel Sambuc unsigned ConcatInfo = TokenInfo[PrevKind]; 171*f4a2713aSLionel Sambuc 172*f4a2713aSLionel Sambuc // If prevtok never causes a problem for anything after it, return quickly. 173*f4a2713aSLionel Sambuc if (ConcatInfo == 0) return false; 174*f4a2713aSLionel Sambuc 175*f4a2713aSLionel Sambuc if (ConcatInfo & aci_avoid_equal) { 176*f4a2713aSLionel Sambuc // If the next token is '=' or '==', avoid concatenation. 177*f4a2713aSLionel Sambuc if (Tok.is(tok::equal) || Tok.is(tok::equalequal)) 178*f4a2713aSLionel Sambuc return true; 179*f4a2713aSLionel Sambuc ConcatInfo &= ~aci_avoid_equal; 180*f4a2713aSLionel Sambuc } 181*f4a2713aSLionel Sambuc 182*f4a2713aSLionel Sambuc if (ConcatInfo == 0) return false; 183*f4a2713aSLionel Sambuc 184*f4a2713aSLionel Sambuc // Basic algorithm: we look at the first character of the second token, and 185*f4a2713aSLionel Sambuc // determine whether it, if appended to the first token, would form (or 186*f4a2713aSLionel Sambuc // would contribute) to a larger token if concatenated. 187*f4a2713aSLionel Sambuc char FirstChar = 0; 188*f4a2713aSLionel Sambuc if (ConcatInfo & aci_custom) { 189*f4a2713aSLionel Sambuc // If the token does not need to know the first character, don't get it. 190*f4a2713aSLionel Sambuc } else { 191*f4a2713aSLionel Sambuc FirstChar = GetFirstChar(PP, Tok); 192*f4a2713aSLionel Sambuc } 193*f4a2713aSLionel Sambuc 194*f4a2713aSLionel Sambuc switch (PrevKind) { 195*f4a2713aSLionel Sambuc default: 196*f4a2713aSLionel Sambuc llvm_unreachable("InitAvoidConcatTokenInfo built wrong"); 197*f4a2713aSLionel Sambuc 198*f4a2713aSLionel Sambuc case tok::raw_identifier: 199*f4a2713aSLionel Sambuc llvm_unreachable("tok::raw_identifier in non-raw lexing mode!"); 200*f4a2713aSLionel Sambuc 201*f4a2713aSLionel Sambuc case tok::string_literal: 202*f4a2713aSLionel Sambuc case tok::wide_string_literal: 203*f4a2713aSLionel Sambuc case tok::utf8_string_literal: 204*f4a2713aSLionel Sambuc case tok::utf16_string_literal: 205*f4a2713aSLionel Sambuc case tok::utf32_string_literal: 206*f4a2713aSLionel Sambuc case tok::char_constant: 207*f4a2713aSLionel Sambuc case tok::wide_char_constant: 208*f4a2713aSLionel Sambuc case tok::utf16_char_constant: 209*f4a2713aSLionel Sambuc case tok::utf32_char_constant: 210*f4a2713aSLionel Sambuc if (!PP.getLangOpts().CPlusPlus11) 211*f4a2713aSLionel Sambuc return false; 212*f4a2713aSLionel Sambuc 213*f4a2713aSLionel Sambuc // In C++11, a string or character literal followed by an identifier is a 214*f4a2713aSLionel Sambuc // single token. 215*f4a2713aSLionel Sambuc if (Tok.getIdentifierInfo()) 216*f4a2713aSLionel Sambuc return true; 217*f4a2713aSLionel Sambuc 218*f4a2713aSLionel Sambuc // A ud-suffix is an identifier. If the previous token ends with one, treat 219*f4a2713aSLionel Sambuc // it as an identifier. 220*f4a2713aSLionel Sambuc if (!PrevTok.hasUDSuffix()) 221*f4a2713aSLionel Sambuc return false; 222*f4a2713aSLionel Sambuc // FALL THROUGH. 223*f4a2713aSLionel Sambuc case tok::identifier: // id+id or id+number or id+L"foo". 224*f4a2713aSLionel Sambuc // id+'.'... will not append. 225*f4a2713aSLionel Sambuc if (Tok.is(tok::numeric_constant)) 226*f4a2713aSLionel Sambuc return GetFirstChar(PP, Tok) != '.'; 227*f4a2713aSLionel Sambuc 228*f4a2713aSLionel Sambuc if (Tok.getIdentifierInfo() || Tok.is(tok::wide_string_literal) || 229*f4a2713aSLionel Sambuc Tok.is(tok::utf8_string_literal) || Tok.is(tok::utf16_string_literal) || 230*f4a2713aSLionel Sambuc Tok.is(tok::utf32_string_literal) || Tok.is(tok::wide_char_constant) || 231*f4a2713aSLionel Sambuc Tok.is(tok::utf16_char_constant) || Tok.is(tok::utf32_char_constant)) 232*f4a2713aSLionel Sambuc return true; 233*f4a2713aSLionel Sambuc 234*f4a2713aSLionel Sambuc // If this isn't identifier + string, we're done. 235*f4a2713aSLionel Sambuc if (Tok.isNot(tok::char_constant) && Tok.isNot(tok::string_literal)) 236*f4a2713aSLionel Sambuc return false; 237*f4a2713aSLionel Sambuc 238*f4a2713aSLionel Sambuc // Otherwise, this is a narrow character or string. If the *identifier* 239*f4a2713aSLionel Sambuc // is a literal 'L', 'u8', 'u' or 'U', avoid pasting L "foo" -> L"foo". 240*f4a2713aSLionel Sambuc return IsIdentifierStringPrefix(PrevTok); 241*f4a2713aSLionel Sambuc 242*f4a2713aSLionel Sambuc case tok::numeric_constant: 243*f4a2713aSLionel Sambuc return isPreprocessingNumberBody(FirstChar) || 244*f4a2713aSLionel Sambuc FirstChar == '+' || FirstChar == '-'; 245*f4a2713aSLionel Sambuc case tok::period: // ..., .*, .1234 246*f4a2713aSLionel Sambuc return (FirstChar == '.' && PrevPrevTok.is(tok::period)) || 247*f4a2713aSLionel Sambuc isDigit(FirstChar) || 248*f4a2713aSLionel Sambuc (PP.getLangOpts().CPlusPlus && FirstChar == '*'); 249*f4a2713aSLionel Sambuc case tok::amp: // && 250*f4a2713aSLionel Sambuc return FirstChar == '&'; 251*f4a2713aSLionel Sambuc case tok::plus: // ++ 252*f4a2713aSLionel Sambuc return FirstChar == '+'; 253*f4a2713aSLionel Sambuc case tok::minus: // --, ->, ->* 254*f4a2713aSLionel Sambuc return FirstChar == '-' || FirstChar == '>'; 255*f4a2713aSLionel Sambuc case tok::slash: //, /*, // 256*f4a2713aSLionel Sambuc return FirstChar == '*' || FirstChar == '/'; 257*f4a2713aSLionel Sambuc case tok::less: // <<, <<=, <:, <% 258*f4a2713aSLionel Sambuc return FirstChar == '<' || FirstChar == ':' || FirstChar == '%'; 259*f4a2713aSLionel Sambuc case tok::greater: // >>, >>= 260*f4a2713aSLionel Sambuc return FirstChar == '>'; 261*f4a2713aSLionel Sambuc case tok::pipe: // || 262*f4a2713aSLionel Sambuc return FirstChar == '|'; 263*f4a2713aSLionel Sambuc case tok::percent: // %>, %: 264*f4a2713aSLionel Sambuc return FirstChar == '>' || FirstChar == ':'; 265*f4a2713aSLionel Sambuc case tok::colon: // ::, :> 266*f4a2713aSLionel Sambuc return FirstChar == '>' || 267*f4a2713aSLionel Sambuc (PP.getLangOpts().CPlusPlus && FirstChar == ':'); 268*f4a2713aSLionel Sambuc case tok::hash: // ##, #@, %:%: 269*f4a2713aSLionel Sambuc return FirstChar == '#' || FirstChar == '@' || FirstChar == '%'; 270*f4a2713aSLionel Sambuc case tok::arrow: // ->* 271*f4a2713aSLionel Sambuc return PP.getLangOpts().CPlusPlus && FirstChar == '*'; 272*f4a2713aSLionel Sambuc } 273*f4a2713aSLionel Sambuc } 274