1f4a2713aSLionel Sambuc //===--- TokenConcatenation.cpp - Token Concatenation Avoidance -----------===//
2f4a2713aSLionel Sambuc //
3f4a2713aSLionel Sambuc // The LLVM Compiler Infrastructure
4f4a2713aSLionel Sambuc //
5f4a2713aSLionel Sambuc // This file is distributed under the University of Illinois Open Source
6f4a2713aSLionel Sambuc // License. See LICENSE.TXT for details.
7f4a2713aSLionel Sambuc //
8f4a2713aSLionel Sambuc //===----------------------------------------------------------------------===//
9f4a2713aSLionel Sambuc //
10f4a2713aSLionel Sambuc // This file implements the TokenConcatenation class.
11f4a2713aSLionel Sambuc //
12f4a2713aSLionel Sambuc //===----------------------------------------------------------------------===//
13f4a2713aSLionel Sambuc
14f4a2713aSLionel Sambuc #include "clang/Lex/TokenConcatenation.h"
15f4a2713aSLionel Sambuc #include "clang/Basic/CharInfo.h"
16f4a2713aSLionel Sambuc #include "clang/Lex/Preprocessor.h"
17f4a2713aSLionel Sambuc #include "llvm/Support/ErrorHandling.h"
18f4a2713aSLionel Sambuc using namespace clang;
19f4a2713aSLionel Sambuc
20f4a2713aSLionel Sambuc
21f4a2713aSLionel Sambuc /// IsStringPrefix - Return true if Str is a string prefix.
22f4a2713aSLionel Sambuc /// 'L', 'u', 'U', or 'u8'. Including raw versions.
IsStringPrefix(StringRef Str,bool CPlusPlus11)23f4a2713aSLionel Sambuc static bool IsStringPrefix(StringRef Str, bool CPlusPlus11) {
24f4a2713aSLionel Sambuc
25f4a2713aSLionel Sambuc if (Str[0] == 'L' ||
26f4a2713aSLionel Sambuc (CPlusPlus11 && (Str[0] == 'u' || Str[0] == 'U' || Str[0] == 'R'))) {
27f4a2713aSLionel Sambuc
28f4a2713aSLionel Sambuc if (Str.size() == 1)
29f4a2713aSLionel Sambuc return true; // "L", "u", "U", and "R"
30f4a2713aSLionel Sambuc
31f4a2713aSLionel Sambuc // Check for raw flavors. Need to make sure the first character wasn't
32f4a2713aSLionel Sambuc // already R. Need CPlusPlus11 check for "LR".
33f4a2713aSLionel Sambuc if (Str[1] == 'R' && Str[0] != 'R' && Str.size() == 2 && CPlusPlus11)
34f4a2713aSLionel Sambuc return true; // "LR", "uR", "UR"
35f4a2713aSLionel Sambuc
36f4a2713aSLionel Sambuc // Check for "u8" and "u8R"
37f4a2713aSLionel Sambuc if (Str[0] == 'u' && Str[1] == '8') {
38f4a2713aSLionel Sambuc if (Str.size() == 2) return true; // "u8"
39f4a2713aSLionel Sambuc if (Str.size() == 3 && Str[2] == 'R') return true; // "u8R"
40f4a2713aSLionel Sambuc }
41f4a2713aSLionel Sambuc }
42f4a2713aSLionel Sambuc
43f4a2713aSLionel Sambuc return false;
44f4a2713aSLionel Sambuc }
45f4a2713aSLionel Sambuc
46f4a2713aSLionel Sambuc /// IsIdentifierStringPrefix - Return true if the spelling of the token
47f4a2713aSLionel Sambuc /// is literally 'L', 'u', 'U', or 'u8'. Including raw versions.
IsIdentifierStringPrefix(const Token & Tok) const48f4a2713aSLionel Sambuc bool TokenConcatenation::IsIdentifierStringPrefix(const Token &Tok) const {
49f4a2713aSLionel Sambuc const LangOptions &LangOpts = PP.getLangOpts();
50f4a2713aSLionel Sambuc
51f4a2713aSLionel Sambuc if (!Tok.needsCleaning()) {
52f4a2713aSLionel Sambuc if (Tok.getLength() < 1 || Tok.getLength() > 3)
53f4a2713aSLionel Sambuc return false;
54f4a2713aSLionel Sambuc SourceManager &SM = PP.getSourceManager();
55f4a2713aSLionel Sambuc const char *Ptr = SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation()));
56f4a2713aSLionel Sambuc return IsStringPrefix(StringRef(Ptr, Tok.getLength()),
57f4a2713aSLionel Sambuc LangOpts.CPlusPlus11);
58f4a2713aSLionel Sambuc }
59f4a2713aSLionel Sambuc
60f4a2713aSLionel Sambuc if (Tok.getLength() < 256) {
61f4a2713aSLionel Sambuc char Buffer[256];
62f4a2713aSLionel Sambuc const char *TokPtr = Buffer;
63f4a2713aSLionel Sambuc unsigned length = PP.getSpelling(Tok, TokPtr);
64f4a2713aSLionel Sambuc return IsStringPrefix(StringRef(TokPtr, length), LangOpts.CPlusPlus11);
65f4a2713aSLionel Sambuc }
66f4a2713aSLionel Sambuc
67f4a2713aSLionel Sambuc return IsStringPrefix(StringRef(PP.getSpelling(Tok)), LangOpts.CPlusPlus11);
68f4a2713aSLionel Sambuc }
69f4a2713aSLionel Sambuc
TokenConcatenation(Preprocessor & pp)70f4a2713aSLionel Sambuc TokenConcatenation::TokenConcatenation(Preprocessor &pp) : PP(pp) {
71f4a2713aSLionel Sambuc memset(TokenInfo, 0, sizeof(TokenInfo));
72f4a2713aSLionel Sambuc
73f4a2713aSLionel Sambuc // These tokens have custom code in AvoidConcat.
74f4a2713aSLionel Sambuc TokenInfo[tok::identifier ] |= aci_custom;
75f4a2713aSLionel Sambuc TokenInfo[tok::numeric_constant] |= aci_custom_firstchar;
76f4a2713aSLionel Sambuc TokenInfo[tok::period ] |= aci_custom_firstchar;
77f4a2713aSLionel Sambuc TokenInfo[tok::amp ] |= aci_custom_firstchar;
78f4a2713aSLionel Sambuc TokenInfo[tok::plus ] |= aci_custom_firstchar;
79f4a2713aSLionel Sambuc TokenInfo[tok::minus ] |= aci_custom_firstchar;
80f4a2713aSLionel Sambuc TokenInfo[tok::slash ] |= aci_custom_firstchar;
81f4a2713aSLionel Sambuc TokenInfo[tok::less ] |= aci_custom_firstchar;
82f4a2713aSLionel Sambuc TokenInfo[tok::greater ] |= aci_custom_firstchar;
83f4a2713aSLionel Sambuc TokenInfo[tok::pipe ] |= aci_custom_firstchar;
84f4a2713aSLionel Sambuc TokenInfo[tok::percent ] |= aci_custom_firstchar;
85f4a2713aSLionel Sambuc TokenInfo[tok::colon ] |= aci_custom_firstchar;
86f4a2713aSLionel Sambuc TokenInfo[tok::hash ] |= aci_custom_firstchar;
87f4a2713aSLionel Sambuc TokenInfo[tok::arrow ] |= aci_custom_firstchar;
88f4a2713aSLionel Sambuc
89f4a2713aSLionel Sambuc // These tokens have custom code in C++11 mode.
90f4a2713aSLionel Sambuc if (PP.getLangOpts().CPlusPlus11) {
91f4a2713aSLionel Sambuc TokenInfo[tok::string_literal ] |= aci_custom;
92f4a2713aSLionel Sambuc TokenInfo[tok::wide_string_literal ] |= aci_custom;
93f4a2713aSLionel Sambuc TokenInfo[tok::utf8_string_literal ] |= aci_custom;
94f4a2713aSLionel Sambuc TokenInfo[tok::utf16_string_literal] |= aci_custom;
95f4a2713aSLionel Sambuc TokenInfo[tok::utf32_string_literal] |= aci_custom;
96f4a2713aSLionel Sambuc TokenInfo[tok::char_constant ] |= aci_custom;
97f4a2713aSLionel Sambuc TokenInfo[tok::wide_char_constant ] |= aci_custom;
98f4a2713aSLionel Sambuc TokenInfo[tok::utf16_char_constant ] |= aci_custom;
99f4a2713aSLionel Sambuc TokenInfo[tok::utf32_char_constant ] |= aci_custom;
100f4a2713aSLionel Sambuc }
101f4a2713aSLionel Sambuc
102*0a6a1f1dSLionel Sambuc // These tokens have custom code in C++1z mode.
103*0a6a1f1dSLionel Sambuc if (PP.getLangOpts().CPlusPlus1z)
104*0a6a1f1dSLionel Sambuc TokenInfo[tok::utf8_char_constant] |= aci_custom;
105*0a6a1f1dSLionel Sambuc
106f4a2713aSLionel Sambuc // These tokens change behavior if followed by an '='.
107f4a2713aSLionel Sambuc TokenInfo[tok::amp ] |= aci_avoid_equal; // &=
108f4a2713aSLionel Sambuc TokenInfo[tok::plus ] |= aci_avoid_equal; // +=
109f4a2713aSLionel Sambuc TokenInfo[tok::minus ] |= aci_avoid_equal; // -=
110f4a2713aSLionel Sambuc TokenInfo[tok::slash ] |= aci_avoid_equal; // /=
111f4a2713aSLionel Sambuc TokenInfo[tok::less ] |= aci_avoid_equal; // <=
112f4a2713aSLionel Sambuc TokenInfo[tok::greater ] |= aci_avoid_equal; // >=
113f4a2713aSLionel Sambuc TokenInfo[tok::pipe ] |= aci_avoid_equal; // |=
114f4a2713aSLionel Sambuc TokenInfo[tok::percent ] |= aci_avoid_equal; // %=
115f4a2713aSLionel Sambuc TokenInfo[tok::star ] |= aci_avoid_equal; // *=
116f4a2713aSLionel Sambuc TokenInfo[tok::exclaim ] |= aci_avoid_equal; // !=
117f4a2713aSLionel Sambuc TokenInfo[tok::lessless ] |= aci_avoid_equal; // <<=
118f4a2713aSLionel Sambuc TokenInfo[tok::greatergreater] |= aci_avoid_equal; // >>=
119f4a2713aSLionel Sambuc TokenInfo[tok::caret ] |= aci_avoid_equal; // ^=
120f4a2713aSLionel Sambuc TokenInfo[tok::equal ] |= aci_avoid_equal; // ==
121f4a2713aSLionel Sambuc }
122f4a2713aSLionel Sambuc
123f4a2713aSLionel Sambuc /// GetFirstChar - Get the first character of the token \arg Tok,
124f4a2713aSLionel Sambuc /// avoiding calls to getSpelling where possible.
GetFirstChar(Preprocessor & PP,const Token & Tok)125f4a2713aSLionel Sambuc static char GetFirstChar(Preprocessor &PP, const Token &Tok) {
126f4a2713aSLionel Sambuc if (IdentifierInfo *II = Tok.getIdentifierInfo()) {
127f4a2713aSLionel Sambuc // Avoid spelling identifiers, the most common form of token.
128f4a2713aSLionel Sambuc return II->getNameStart()[0];
129f4a2713aSLionel Sambuc } else if (!Tok.needsCleaning()) {
130f4a2713aSLionel Sambuc if (Tok.isLiteral() && Tok.getLiteralData()) {
131f4a2713aSLionel Sambuc return *Tok.getLiteralData();
132f4a2713aSLionel Sambuc } else {
133f4a2713aSLionel Sambuc SourceManager &SM = PP.getSourceManager();
134f4a2713aSLionel Sambuc return *SM.getCharacterData(SM.getSpellingLoc(Tok.getLocation()));
135f4a2713aSLionel Sambuc }
136f4a2713aSLionel Sambuc } else if (Tok.getLength() < 256) {
137f4a2713aSLionel Sambuc char Buffer[256];
138f4a2713aSLionel Sambuc const char *TokPtr = Buffer;
139f4a2713aSLionel Sambuc PP.getSpelling(Tok, TokPtr);
140f4a2713aSLionel Sambuc return TokPtr[0];
141f4a2713aSLionel Sambuc } else {
142f4a2713aSLionel Sambuc return PP.getSpelling(Tok)[0];
143f4a2713aSLionel Sambuc }
144f4a2713aSLionel Sambuc }
145f4a2713aSLionel Sambuc
146f4a2713aSLionel Sambuc /// AvoidConcat - If printing PrevTok immediately followed by Tok would cause
147f4a2713aSLionel Sambuc /// the two individual tokens to be lexed as a single token, return true
148f4a2713aSLionel Sambuc /// (which causes a space to be printed between them). This allows the output
149f4a2713aSLionel Sambuc /// of -E mode to be lexed to the same token stream as lexing the input
150f4a2713aSLionel Sambuc /// directly would.
151f4a2713aSLionel Sambuc ///
152f4a2713aSLionel Sambuc /// This code must conservatively return true if it doesn't want to be 100%
153f4a2713aSLionel Sambuc /// accurate. This will cause the output to include extra space characters,
154f4a2713aSLionel Sambuc /// but the resulting output won't have incorrect concatenations going on.
155f4a2713aSLionel Sambuc /// Examples include "..", which we print with a space between, because we
156f4a2713aSLionel Sambuc /// don't want to track enough to tell "x.." from "...".
AvoidConcat(const Token & PrevPrevTok,const Token & PrevTok,const Token & Tok) const157f4a2713aSLionel Sambuc bool TokenConcatenation::AvoidConcat(const Token &PrevPrevTok,
158f4a2713aSLionel Sambuc const Token &PrevTok,
159f4a2713aSLionel Sambuc const Token &Tok) const {
160f4a2713aSLionel Sambuc // First, check to see if the tokens were directly adjacent in the original
161f4a2713aSLionel Sambuc // source. If they were, it must be okay to stick them together: if there
162f4a2713aSLionel Sambuc // were an issue, the tokens would have been lexed differently.
163f4a2713aSLionel Sambuc SourceManager &SM = PP.getSourceManager();
164f4a2713aSLionel Sambuc SourceLocation PrevSpellLoc = SM.getSpellingLoc(PrevTok.getLocation());
165f4a2713aSLionel Sambuc SourceLocation SpellLoc = SM.getSpellingLoc(Tok.getLocation());
166f4a2713aSLionel Sambuc if (PrevSpellLoc.getLocWithOffset(PrevTok.getLength()) == SpellLoc)
167f4a2713aSLionel Sambuc return false;
168f4a2713aSLionel Sambuc
169f4a2713aSLionel Sambuc tok::TokenKind PrevKind = PrevTok.getKind();
170*0a6a1f1dSLionel Sambuc if (!PrevTok.isAnnotation() && PrevTok.getIdentifierInfo())
171*0a6a1f1dSLionel Sambuc PrevKind = tok::identifier; // Language keyword or named operator.
172f4a2713aSLionel Sambuc
173f4a2713aSLionel Sambuc // Look up information on when we should avoid concatenation with prevtok.
174f4a2713aSLionel Sambuc unsigned ConcatInfo = TokenInfo[PrevKind];
175f4a2713aSLionel Sambuc
176f4a2713aSLionel Sambuc // If prevtok never causes a problem for anything after it, return quickly.
177f4a2713aSLionel Sambuc if (ConcatInfo == 0) return false;
178f4a2713aSLionel Sambuc
179f4a2713aSLionel Sambuc if (ConcatInfo & aci_avoid_equal) {
180f4a2713aSLionel Sambuc // If the next token is '=' or '==', avoid concatenation.
181f4a2713aSLionel Sambuc if (Tok.is(tok::equal) || Tok.is(tok::equalequal))
182f4a2713aSLionel Sambuc return true;
183f4a2713aSLionel Sambuc ConcatInfo &= ~aci_avoid_equal;
184f4a2713aSLionel Sambuc }
185*0a6a1f1dSLionel Sambuc if (Tok.isAnnotation()) {
186*0a6a1f1dSLionel Sambuc // Modules annotation can show up when generated automatically for includes.
187*0a6a1f1dSLionel Sambuc assert((Tok.is(tok::annot_module_include) ||
188*0a6a1f1dSLionel Sambuc Tok.is(tok::annot_module_begin) ||
189*0a6a1f1dSLionel Sambuc Tok.is(tok::annot_module_end)) &&
190*0a6a1f1dSLionel Sambuc "unexpected annotation in AvoidConcat");
191*0a6a1f1dSLionel Sambuc ConcatInfo = 0;
192*0a6a1f1dSLionel Sambuc }
193f4a2713aSLionel Sambuc
194f4a2713aSLionel Sambuc if (ConcatInfo == 0) return false;
195f4a2713aSLionel Sambuc
196f4a2713aSLionel Sambuc // Basic algorithm: we look at the first character of the second token, and
197f4a2713aSLionel Sambuc // determine whether it, if appended to the first token, would form (or
198f4a2713aSLionel Sambuc // would contribute) to a larger token if concatenated.
199f4a2713aSLionel Sambuc char FirstChar = 0;
200f4a2713aSLionel Sambuc if (ConcatInfo & aci_custom) {
201f4a2713aSLionel Sambuc // If the token does not need to know the first character, don't get it.
202f4a2713aSLionel Sambuc } else {
203f4a2713aSLionel Sambuc FirstChar = GetFirstChar(PP, Tok);
204f4a2713aSLionel Sambuc }
205f4a2713aSLionel Sambuc
206f4a2713aSLionel Sambuc switch (PrevKind) {
207f4a2713aSLionel Sambuc default:
208f4a2713aSLionel Sambuc llvm_unreachable("InitAvoidConcatTokenInfo built wrong");
209f4a2713aSLionel Sambuc
210f4a2713aSLionel Sambuc case tok::raw_identifier:
211f4a2713aSLionel Sambuc llvm_unreachable("tok::raw_identifier in non-raw lexing mode!");
212f4a2713aSLionel Sambuc
213f4a2713aSLionel Sambuc case tok::string_literal:
214f4a2713aSLionel Sambuc case tok::wide_string_literal:
215f4a2713aSLionel Sambuc case tok::utf8_string_literal:
216f4a2713aSLionel Sambuc case tok::utf16_string_literal:
217f4a2713aSLionel Sambuc case tok::utf32_string_literal:
218f4a2713aSLionel Sambuc case tok::char_constant:
219f4a2713aSLionel Sambuc case tok::wide_char_constant:
220*0a6a1f1dSLionel Sambuc case tok::utf8_char_constant:
221f4a2713aSLionel Sambuc case tok::utf16_char_constant:
222f4a2713aSLionel Sambuc case tok::utf32_char_constant:
223f4a2713aSLionel Sambuc if (!PP.getLangOpts().CPlusPlus11)
224f4a2713aSLionel Sambuc return false;
225f4a2713aSLionel Sambuc
226f4a2713aSLionel Sambuc // In C++11, a string or character literal followed by an identifier is a
227f4a2713aSLionel Sambuc // single token.
228f4a2713aSLionel Sambuc if (Tok.getIdentifierInfo())
229f4a2713aSLionel Sambuc return true;
230f4a2713aSLionel Sambuc
231f4a2713aSLionel Sambuc // A ud-suffix is an identifier. If the previous token ends with one, treat
232f4a2713aSLionel Sambuc // it as an identifier.
233f4a2713aSLionel Sambuc if (!PrevTok.hasUDSuffix())
234f4a2713aSLionel Sambuc return false;
235f4a2713aSLionel Sambuc // FALL THROUGH.
236f4a2713aSLionel Sambuc case tok::identifier: // id+id or id+number or id+L"foo".
237f4a2713aSLionel Sambuc // id+'.'... will not append.
238f4a2713aSLionel Sambuc if (Tok.is(tok::numeric_constant))
239f4a2713aSLionel Sambuc return GetFirstChar(PP, Tok) != '.';
240f4a2713aSLionel Sambuc
241f4a2713aSLionel Sambuc if (Tok.getIdentifierInfo() || Tok.is(tok::wide_string_literal) ||
242f4a2713aSLionel Sambuc Tok.is(tok::utf8_string_literal) || Tok.is(tok::utf16_string_literal) ||
243f4a2713aSLionel Sambuc Tok.is(tok::utf32_string_literal) || Tok.is(tok::wide_char_constant) ||
244*0a6a1f1dSLionel Sambuc Tok.is(tok::utf8_char_constant) || Tok.is(tok::utf16_char_constant) ||
245*0a6a1f1dSLionel Sambuc Tok.is(tok::utf32_char_constant))
246f4a2713aSLionel Sambuc return true;
247f4a2713aSLionel Sambuc
248f4a2713aSLionel Sambuc // If this isn't identifier + string, we're done.
249f4a2713aSLionel Sambuc if (Tok.isNot(tok::char_constant) && Tok.isNot(tok::string_literal))
250f4a2713aSLionel Sambuc return false;
251f4a2713aSLionel Sambuc
252f4a2713aSLionel Sambuc // Otherwise, this is a narrow character or string. If the *identifier*
253f4a2713aSLionel Sambuc // is a literal 'L', 'u8', 'u' or 'U', avoid pasting L "foo" -> L"foo".
254f4a2713aSLionel Sambuc return IsIdentifierStringPrefix(PrevTok);
255f4a2713aSLionel Sambuc
256f4a2713aSLionel Sambuc case tok::numeric_constant:
257f4a2713aSLionel Sambuc return isPreprocessingNumberBody(FirstChar) ||
258f4a2713aSLionel Sambuc FirstChar == '+' || FirstChar == '-';
259f4a2713aSLionel Sambuc case tok::period: // ..., .*, .1234
260f4a2713aSLionel Sambuc return (FirstChar == '.' && PrevPrevTok.is(tok::period)) ||
261f4a2713aSLionel Sambuc isDigit(FirstChar) ||
262f4a2713aSLionel Sambuc (PP.getLangOpts().CPlusPlus && FirstChar == '*');
263f4a2713aSLionel Sambuc case tok::amp: // &&
264f4a2713aSLionel Sambuc return FirstChar == '&';
265f4a2713aSLionel Sambuc case tok::plus: // ++
266f4a2713aSLionel Sambuc return FirstChar == '+';
267f4a2713aSLionel Sambuc case tok::minus: // --, ->, ->*
268f4a2713aSLionel Sambuc return FirstChar == '-' || FirstChar == '>';
269f4a2713aSLionel Sambuc case tok::slash: //, /*, //
270f4a2713aSLionel Sambuc return FirstChar == '*' || FirstChar == '/';
271f4a2713aSLionel Sambuc case tok::less: // <<, <<=, <:, <%
272f4a2713aSLionel Sambuc return FirstChar == '<' || FirstChar == ':' || FirstChar == '%';
273f4a2713aSLionel Sambuc case tok::greater: // >>, >>=
274f4a2713aSLionel Sambuc return FirstChar == '>';
275f4a2713aSLionel Sambuc case tok::pipe: // ||
276f4a2713aSLionel Sambuc return FirstChar == '|';
277f4a2713aSLionel Sambuc case tok::percent: // %>, %:
278f4a2713aSLionel Sambuc return FirstChar == '>' || FirstChar == ':';
279f4a2713aSLionel Sambuc case tok::colon: // ::, :>
280f4a2713aSLionel Sambuc return FirstChar == '>' ||
281f4a2713aSLionel Sambuc (PP.getLangOpts().CPlusPlus && FirstChar == ':');
282f4a2713aSLionel Sambuc case tok::hash: // ##, #@, %:%:
283f4a2713aSLionel Sambuc return FirstChar == '#' || FirstChar == '@' || FirstChar == '%';
284f4a2713aSLionel Sambuc case tok::arrow: // ->*
285f4a2713aSLionel Sambuc return PP.getLangOpts().CPlusPlus && FirstChar == '*';
286f4a2713aSLionel Sambuc }
287f4a2713aSLionel Sambuc }
288