xref: /netbsd-src/external/apache2/llvm/dist/clang/lib/Tooling/Syntax/Tokens.cpp (revision e038c9c4676b0f19b1b7dd08a940c6ed64a6d5ae)
17330f729Sjoerg //===- Tokens.cpp - collect tokens from preprocessing ---------------------===//
27330f729Sjoerg //
37330f729Sjoerg // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
47330f729Sjoerg // See https://llvm.org/LICENSE.txt for license information.
57330f729Sjoerg // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
67330f729Sjoerg //
77330f729Sjoerg //===----------------------------------------------------------------------===//
87330f729Sjoerg #include "clang/Tooling/Syntax/Tokens.h"
97330f729Sjoerg 
107330f729Sjoerg #include "clang/Basic/Diagnostic.h"
117330f729Sjoerg #include "clang/Basic/IdentifierTable.h"
127330f729Sjoerg #include "clang/Basic/LLVM.h"
137330f729Sjoerg #include "clang/Basic/LangOptions.h"
147330f729Sjoerg #include "clang/Basic/SourceLocation.h"
157330f729Sjoerg #include "clang/Basic/SourceManager.h"
167330f729Sjoerg #include "clang/Basic/TokenKinds.h"
177330f729Sjoerg #include "clang/Lex/PPCallbacks.h"
187330f729Sjoerg #include "clang/Lex/Preprocessor.h"
197330f729Sjoerg #include "clang/Lex/Token.h"
207330f729Sjoerg #include "llvm/ADT/ArrayRef.h"
217330f729Sjoerg #include "llvm/ADT/None.h"
227330f729Sjoerg #include "llvm/ADT/Optional.h"
237330f729Sjoerg #include "llvm/ADT/STLExtras.h"
247330f729Sjoerg #include "llvm/Support/Debug.h"
257330f729Sjoerg #include "llvm/Support/ErrorHandling.h"
267330f729Sjoerg #include "llvm/Support/FormatVariadic.h"
277330f729Sjoerg #include "llvm/Support/raw_ostream.h"
287330f729Sjoerg #include <algorithm>
297330f729Sjoerg #include <cassert>
307330f729Sjoerg #include <iterator>
317330f729Sjoerg #include <string>
327330f729Sjoerg #include <utility>
337330f729Sjoerg #include <vector>
347330f729Sjoerg 
357330f729Sjoerg using namespace clang;
367330f729Sjoerg using namespace clang::syntax;
377330f729Sjoerg 
38*e038c9c4Sjoerg namespace {
39*e038c9c4Sjoerg // Finds the smallest consecutive subsuquence of Toks that covers R.
40*e038c9c4Sjoerg llvm::ArrayRef<syntax::Token>
getTokensCovering(llvm::ArrayRef<syntax::Token> Toks,SourceRange R,const SourceManager & SM)41*e038c9c4Sjoerg getTokensCovering(llvm::ArrayRef<syntax::Token> Toks, SourceRange R,
42*e038c9c4Sjoerg                   const SourceManager &SM) {
43*e038c9c4Sjoerg   if (R.isInvalid())
44*e038c9c4Sjoerg     return {};
45*e038c9c4Sjoerg   const syntax::Token *Begin =
46*e038c9c4Sjoerg       llvm::partition_point(Toks, [&](const syntax::Token &T) {
47*e038c9c4Sjoerg         return SM.isBeforeInTranslationUnit(T.location(), R.getBegin());
48*e038c9c4Sjoerg       });
49*e038c9c4Sjoerg   const syntax::Token *End =
50*e038c9c4Sjoerg       llvm::partition_point(Toks, [&](const syntax::Token &T) {
51*e038c9c4Sjoerg         return !SM.isBeforeInTranslationUnit(R.getEnd(), T.location());
52*e038c9c4Sjoerg       });
53*e038c9c4Sjoerg   if (Begin > End)
54*e038c9c4Sjoerg     return {};
55*e038c9c4Sjoerg   return {Begin, End};
56*e038c9c4Sjoerg }
57*e038c9c4Sjoerg 
58*e038c9c4Sjoerg // Finds the smallest expansion range that contains expanded tokens First and
59*e038c9c4Sjoerg // Last, e.g.:
60*e038c9c4Sjoerg // #define ID(x) x
61*e038c9c4Sjoerg // ID(ID(ID(a1) a2))
62*e038c9c4Sjoerg //          ~~       -> a1
63*e038c9c4Sjoerg //              ~~   -> a2
64*e038c9c4Sjoerg //       ~~~~~~~~~   -> a1 a2
findCommonRangeForMacroArgs(const syntax::Token & First,const syntax::Token & Last,const SourceManager & SM)65*e038c9c4Sjoerg SourceRange findCommonRangeForMacroArgs(const syntax::Token &First,
66*e038c9c4Sjoerg                                         const syntax::Token &Last,
67*e038c9c4Sjoerg                                         const SourceManager &SM) {
68*e038c9c4Sjoerg   SourceRange Res;
69*e038c9c4Sjoerg   auto FirstLoc = First.location(), LastLoc = Last.location();
70*e038c9c4Sjoerg   // Keep traversing up the spelling chain as longs as tokens are part of the
71*e038c9c4Sjoerg   // same expansion.
72*e038c9c4Sjoerg   while (!FirstLoc.isFileID() && !LastLoc.isFileID()) {
73*e038c9c4Sjoerg     auto ExpInfoFirst = SM.getSLocEntry(SM.getFileID(FirstLoc)).getExpansion();
74*e038c9c4Sjoerg     auto ExpInfoLast = SM.getSLocEntry(SM.getFileID(LastLoc)).getExpansion();
75*e038c9c4Sjoerg     // Stop if expansions have diverged.
76*e038c9c4Sjoerg     if (ExpInfoFirst.getExpansionLocStart() !=
77*e038c9c4Sjoerg         ExpInfoLast.getExpansionLocStart())
78*e038c9c4Sjoerg       break;
79*e038c9c4Sjoerg     // Do not continue into macro bodies.
80*e038c9c4Sjoerg     if (!ExpInfoFirst.isMacroArgExpansion() ||
81*e038c9c4Sjoerg         !ExpInfoLast.isMacroArgExpansion())
82*e038c9c4Sjoerg       break;
83*e038c9c4Sjoerg     FirstLoc = SM.getImmediateSpellingLoc(FirstLoc);
84*e038c9c4Sjoerg     LastLoc = SM.getImmediateSpellingLoc(LastLoc);
85*e038c9c4Sjoerg     // Update the result afterwards, as we want the tokens that triggered the
86*e038c9c4Sjoerg     // expansion.
87*e038c9c4Sjoerg     Res = {FirstLoc, LastLoc};
88*e038c9c4Sjoerg   }
89*e038c9c4Sjoerg   // Normally mapping back to expansion location here only changes FileID, as
90*e038c9c4Sjoerg   // we've already found some tokens expanded from the same macro argument, and
91*e038c9c4Sjoerg   // they should map to a consecutive subset of spelled tokens. Unfortunately
92*e038c9c4Sjoerg   // SourceManager::isBeforeInTranslationUnit discriminates sourcelocations
93*e038c9c4Sjoerg   // based on their FileID in addition to offsets. So even though we are
94*e038c9c4Sjoerg   // referring to same tokens, SourceManager might tell us that one is before
95*e038c9c4Sjoerg   // the other if they've got different FileIDs.
96*e038c9c4Sjoerg   return SM.getExpansionRange(CharSourceRange(Res, true)).getAsRange();
97*e038c9c4Sjoerg }
98*e038c9c4Sjoerg 
99*e038c9c4Sjoerg } // namespace
100*e038c9c4Sjoerg 
// Constructs a token of kind \p Kind spanning \p Length characters starting
// at \p Location. The location must be valid.
syntax::Token::Token(SourceLocation Location, unsigned Length,
                     tok::TokenKind Kind)
    : Location(Location), Length(Length), Kind(Kind) {
  assert(Location.isValid());
}
1067330f729Sjoerg 
// Converts a lexer-produced clang::Token. Annotation tokens carry no single
// location/length and are not representable here, hence the assert.
syntax::Token::Token(const clang::Token &T)
    : Token(T.getLocation(), T.getLength(), T.getKind()) {
  assert(!T.isAnnotation());
}
1117330f729Sjoerg 
text(const SourceManager & SM) const1127330f729Sjoerg llvm::StringRef syntax::Token::text(const SourceManager &SM) const {
1137330f729Sjoerg   bool Invalid = false;
1147330f729Sjoerg   const char *Start = SM.getCharacterData(location(), &Invalid);
1157330f729Sjoerg   assert(!Invalid);
1167330f729Sjoerg   return llvm::StringRef(Start, length());
1177330f729Sjoerg }
1187330f729Sjoerg 
range(const SourceManager & SM) const1197330f729Sjoerg FileRange syntax::Token::range(const SourceManager &SM) const {
1207330f729Sjoerg   assert(location().isFileID() && "must be a spelled token");
1217330f729Sjoerg   FileID File;
1227330f729Sjoerg   unsigned StartOffset;
1237330f729Sjoerg   std::tie(File, StartOffset) = SM.getDecomposedLoc(location());
1247330f729Sjoerg   return FileRange(File, StartOffset, StartOffset + length());
1257330f729Sjoerg }
1267330f729Sjoerg 
range(const SourceManager & SM,const syntax::Token & First,const syntax::Token & Last)1277330f729Sjoerg FileRange syntax::Token::range(const SourceManager &SM,
1287330f729Sjoerg                                const syntax::Token &First,
1297330f729Sjoerg                                const syntax::Token &Last) {
1307330f729Sjoerg   auto F = First.range(SM);
1317330f729Sjoerg   auto L = Last.range(SM);
1327330f729Sjoerg   assert(F.file() == L.file() && "tokens from different files");
133*e038c9c4Sjoerg   assert((F == L || F.endOffset() <= L.beginOffset()) &&
134*e038c9c4Sjoerg          "wrong order of tokens");
1357330f729Sjoerg   return FileRange(F.file(), F.beginOffset(), L.endOffset());
1367330f729Sjoerg }
1377330f729Sjoerg 
operator <<(llvm::raw_ostream & OS,const Token & T)1387330f729Sjoerg llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS, const Token &T) {
1397330f729Sjoerg   return OS << T.str();
1407330f729Sjoerg }
1417330f729Sjoerg 
// Constructs the half-open offset range [BeginOffset, EndOffset) inside
// \p File. The file must be valid and the offsets ordered.
FileRange::FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset)
    : File(File), Begin(BeginOffset), End(EndOffset) {
  assert(File.isValid());
  assert(BeginOffset <= EndOffset);
}
1477330f729Sjoerg 
FileRange(const SourceManager & SM,SourceLocation BeginLoc,unsigned Length)1487330f729Sjoerg FileRange::FileRange(const SourceManager &SM, SourceLocation BeginLoc,
1497330f729Sjoerg                      unsigned Length) {
1507330f729Sjoerg   assert(BeginLoc.isValid());
1517330f729Sjoerg   assert(BeginLoc.isFileID());
1527330f729Sjoerg 
1537330f729Sjoerg   std::tie(File, Begin) = SM.getDecomposedLoc(BeginLoc);
1547330f729Sjoerg   End = Begin + Length;
1557330f729Sjoerg }
FileRange(const SourceManager & SM,SourceLocation BeginLoc,SourceLocation EndLoc)1567330f729Sjoerg FileRange::FileRange(const SourceManager &SM, SourceLocation BeginLoc,
1577330f729Sjoerg                      SourceLocation EndLoc) {
1587330f729Sjoerg   assert(BeginLoc.isValid());
1597330f729Sjoerg   assert(BeginLoc.isFileID());
1607330f729Sjoerg   assert(EndLoc.isValid());
1617330f729Sjoerg   assert(EndLoc.isFileID());
1627330f729Sjoerg   assert(SM.getFileID(BeginLoc) == SM.getFileID(EndLoc));
1637330f729Sjoerg   assert(SM.getFileOffset(BeginLoc) <= SM.getFileOffset(EndLoc));
1647330f729Sjoerg 
1657330f729Sjoerg   std::tie(File, Begin) = SM.getDecomposedLoc(BeginLoc);
1667330f729Sjoerg   End = SM.getFileOffset(EndLoc);
1677330f729Sjoerg }
1687330f729Sjoerg 
// Debug-prints the range as "FileRange(file = <hash>, offsets = B-E)".
llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS,
                                      const FileRange &R) {
  return OS << llvm::formatv("FileRange(file = {0}, offsets = {1}-{2})",
                             R.file().getHashValue(), R.beginOffset(),
                             R.endOffset());
}
1757330f729Sjoerg 
text(const SourceManager & SM) const1767330f729Sjoerg llvm::StringRef FileRange::text(const SourceManager &SM) const {
1777330f729Sjoerg   bool Invalid = false;
1787330f729Sjoerg   StringRef Text = SM.getBufferData(File, &Invalid);
1797330f729Sjoerg   if (Invalid)
1807330f729Sjoerg     return "";
1817330f729Sjoerg   assert(Begin <= Text.size());
1827330f729Sjoerg   assert(End <= Text.size());
1837330f729Sjoerg   return Text.substr(Begin, length());
1847330f729Sjoerg }
1857330f729Sjoerg 
indexExpandedTokens()186*e038c9c4Sjoerg void TokenBuffer::indexExpandedTokens() {
187*e038c9c4Sjoerg   // No-op if the index is already created.
188*e038c9c4Sjoerg   if (!ExpandedTokIndex.empty())
189*e038c9c4Sjoerg     return;
190*e038c9c4Sjoerg   ExpandedTokIndex.reserve(ExpandedTokens.size());
191*e038c9c4Sjoerg   // Index ExpandedTokens for faster lookups by SourceLocation.
192*e038c9c4Sjoerg   for (size_t I = 0, E = ExpandedTokens.size(); I != E; ++I) {
193*e038c9c4Sjoerg     SourceLocation Loc = ExpandedTokens[I].location();
194*e038c9c4Sjoerg     if (Loc.isValid())
195*e038c9c4Sjoerg       ExpandedTokIndex[Loc] = I;
196*e038c9c4Sjoerg   }
197*e038c9c4Sjoerg }
198*e038c9c4Sjoerg 
expandedTokens(SourceRange R) const199*e038c9c4Sjoerg llvm::ArrayRef<syntax::Token> TokenBuffer::expandedTokens(SourceRange R) const {
200*e038c9c4Sjoerg   if (R.isInvalid())
201*e038c9c4Sjoerg     return {};
202*e038c9c4Sjoerg   if (!ExpandedTokIndex.empty()) {
203*e038c9c4Sjoerg     // Quick lookup if `R` is a token range.
204*e038c9c4Sjoerg     // This is a huge win since majority of the users use ranges provided by an
205*e038c9c4Sjoerg     // AST. Ranges in AST are token ranges from expanded token stream.
206*e038c9c4Sjoerg     const auto B = ExpandedTokIndex.find(R.getBegin());
207*e038c9c4Sjoerg     const auto E = ExpandedTokIndex.find(R.getEnd());
208*e038c9c4Sjoerg     if (B != ExpandedTokIndex.end() && E != ExpandedTokIndex.end()) {
209*e038c9c4Sjoerg       const Token *L = ExpandedTokens.data() + B->getSecond();
210*e038c9c4Sjoerg       // Add 1 to End to make a half-open range.
211*e038c9c4Sjoerg       const Token *R = ExpandedTokens.data() + E->getSecond() + 1;
212*e038c9c4Sjoerg       if (L > R)
213*e038c9c4Sjoerg         return {};
214*e038c9c4Sjoerg       return {L, R};
215*e038c9c4Sjoerg     }
216*e038c9c4Sjoerg   }
217*e038c9c4Sjoerg   // Slow case. Use `isBeforeInTranslationUnit` to binary search for the
218*e038c9c4Sjoerg   // required range.
219*e038c9c4Sjoerg   return getTokensCovering(expandedTokens(), R, *SourceMgr);
220*e038c9c4Sjoerg }
221*e038c9c4Sjoerg 
// Converts this offset range to a CharSourceRange. The result is a char
// range (IsTokenRange=false): its end location points one past the last
// character rather than at the start of a final token.
CharSourceRange FileRange::toCharRange(const SourceManager &SM) const {
  return CharSourceRange(
      SourceRange(SM.getComposedLoc(File, Begin), SM.getComposedLoc(File, End)),
      /*IsTokenRange=*/false);
}
227*e038c9c4Sjoerg 
2287330f729Sjoerg std::pair<const syntax::Token *, const TokenBuffer::Mapping *>
spelledForExpandedToken(const syntax::Token * Expanded) const2297330f729Sjoerg TokenBuffer::spelledForExpandedToken(const syntax::Token *Expanded) const {
2307330f729Sjoerg   assert(Expanded);
2317330f729Sjoerg   assert(ExpandedTokens.data() <= Expanded &&
2327330f729Sjoerg          Expanded < ExpandedTokens.data() + ExpandedTokens.size());
2337330f729Sjoerg 
2347330f729Sjoerg   auto FileIt = Files.find(
2357330f729Sjoerg       SourceMgr->getFileID(SourceMgr->getExpansionLoc(Expanded->location())));
2367330f729Sjoerg   assert(FileIt != Files.end() && "no file for an expanded token");
2377330f729Sjoerg 
2387330f729Sjoerg   const MarkedFile &File = FileIt->second;
2397330f729Sjoerg 
2407330f729Sjoerg   unsigned ExpandedIndex = Expanded - ExpandedTokens.data();
2417330f729Sjoerg   // Find the first mapping that produced tokens after \p Expanded.
2427330f729Sjoerg   auto It = llvm::partition_point(File.Mappings, [&](const Mapping &M) {
2437330f729Sjoerg     return M.BeginExpanded <= ExpandedIndex;
2447330f729Sjoerg   });
2457330f729Sjoerg   // Our token could only be produced by the previous mapping.
2467330f729Sjoerg   if (It == File.Mappings.begin()) {
2477330f729Sjoerg     // No previous mapping, no need to modify offsets.
248*e038c9c4Sjoerg     return {&File.SpelledTokens[ExpandedIndex - File.BeginExpanded],
249*e038c9c4Sjoerg             /*Mapping=*/nullptr};
2507330f729Sjoerg   }
2517330f729Sjoerg   --It; // 'It' now points to last mapping that started before our token.
2527330f729Sjoerg 
2537330f729Sjoerg   // Check if the token is part of the mapping.
2547330f729Sjoerg   if (ExpandedIndex < It->EndExpanded)
255*e038c9c4Sjoerg     return {&File.SpelledTokens[It->BeginSpelled], /*Mapping=*/&*It};
2567330f729Sjoerg 
2577330f729Sjoerg   // Not part of the mapping, use the index from previous mapping to compute the
2587330f729Sjoerg   // corresponding spelled token.
2597330f729Sjoerg   return {
2607330f729Sjoerg       &File.SpelledTokens[It->EndSpelled + (ExpandedIndex - It->EndExpanded)],
261*e038c9c4Sjoerg       /*Mapping=*/nullptr};
262*e038c9c4Sjoerg }
263*e038c9c4Sjoerg 
264*e038c9c4Sjoerg const TokenBuffer::Mapping *
mappingStartingBeforeSpelled(const MarkedFile & F,const syntax::Token * Spelled)265*e038c9c4Sjoerg TokenBuffer::mappingStartingBeforeSpelled(const MarkedFile &F,
266*e038c9c4Sjoerg                                           const syntax::Token *Spelled) {
267*e038c9c4Sjoerg   assert(F.SpelledTokens.data() <= Spelled);
268*e038c9c4Sjoerg   unsigned SpelledI = Spelled - F.SpelledTokens.data();
269*e038c9c4Sjoerg   assert(SpelledI < F.SpelledTokens.size());
270*e038c9c4Sjoerg 
271*e038c9c4Sjoerg   auto It = llvm::partition_point(F.Mappings, [SpelledI](const Mapping &M) {
272*e038c9c4Sjoerg     return M.BeginSpelled <= SpelledI;
273*e038c9c4Sjoerg   });
274*e038c9c4Sjoerg   if (It == F.Mappings.begin())
275*e038c9c4Sjoerg     return nullptr;
276*e038c9c4Sjoerg   --It;
277*e038c9c4Sjoerg   return &*It;
278*e038c9c4Sjoerg }
279*e038c9c4Sjoerg 
280*e038c9c4Sjoerg llvm::SmallVector<llvm::ArrayRef<syntax::Token>, 1>
expandedForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const281*e038c9c4Sjoerg TokenBuffer::expandedForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const {
282*e038c9c4Sjoerg   if (Spelled.empty())
283*e038c9c4Sjoerg     return {};
284*e038c9c4Sjoerg   const auto &File = fileForSpelled(Spelled);
285*e038c9c4Sjoerg 
286*e038c9c4Sjoerg   auto *FrontMapping = mappingStartingBeforeSpelled(File, &Spelled.front());
287*e038c9c4Sjoerg   unsigned SpelledFrontI = &Spelled.front() - File.SpelledTokens.data();
288*e038c9c4Sjoerg   assert(SpelledFrontI < File.SpelledTokens.size());
289*e038c9c4Sjoerg   unsigned ExpandedBegin;
290*e038c9c4Sjoerg   if (!FrontMapping) {
291*e038c9c4Sjoerg     // No mapping that starts before the first token of Spelled, we don't have
292*e038c9c4Sjoerg     // to modify offsets.
293*e038c9c4Sjoerg     ExpandedBegin = File.BeginExpanded + SpelledFrontI;
294*e038c9c4Sjoerg   } else if (SpelledFrontI < FrontMapping->EndSpelled) {
295*e038c9c4Sjoerg     // This mapping applies to Spelled tokens.
296*e038c9c4Sjoerg     if (SpelledFrontI != FrontMapping->BeginSpelled) {
297*e038c9c4Sjoerg       // Spelled tokens don't cover the entire mapping, returning empty result.
298*e038c9c4Sjoerg       return {}; // FIXME: support macro arguments.
299*e038c9c4Sjoerg     }
300*e038c9c4Sjoerg     // Spelled tokens start at the beginning of this mapping.
301*e038c9c4Sjoerg     ExpandedBegin = FrontMapping->BeginExpanded;
302*e038c9c4Sjoerg   } else {
303*e038c9c4Sjoerg     // Spelled tokens start after the mapping ends (they start in the hole
304*e038c9c4Sjoerg     // between 2 mappings, or between a mapping and end of the file).
305*e038c9c4Sjoerg     ExpandedBegin =
306*e038c9c4Sjoerg         FrontMapping->EndExpanded + (SpelledFrontI - FrontMapping->EndSpelled);
307*e038c9c4Sjoerg   }
308*e038c9c4Sjoerg 
309*e038c9c4Sjoerg   auto *BackMapping = mappingStartingBeforeSpelled(File, &Spelled.back());
310*e038c9c4Sjoerg   unsigned SpelledBackI = &Spelled.back() - File.SpelledTokens.data();
311*e038c9c4Sjoerg   unsigned ExpandedEnd;
312*e038c9c4Sjoerg   if (!BackMapping) {
313*e038c9c4Sjoerg     // No mapping that starts before the last token of Spelled, we don't have to
314*e038c9c4Sjoerg     // modify offsets.
315*e038c9c4Sjoerg     ExpandedEnd = File.BeginExpanded + SpelledBackI + 1;
316*e038c9c4Sjoerg   } else if (SpelledBackI < BackMapping->EndSpelled) {
317*e038c9c4Sjoerg     // This mapping applies to Spelled tokens.
318*e038c9c4Sjoerg     if (SpelledBackI + 1 != BackMapping->EndSpelled) {
319*e038c9c4Sjoerg       // Spelled tokens don't cover the entire mapping, returning empty result.
320*e038c9c4Sjoerg       return {}; // FIXME: support macro arguments.
321*e038c9c4Sjoerg     }
322*e038c9c4Sjoerg     ExpandedEnd = BackMapping->EndExpanded;
323*e038c9c4Sjoerg   } else {
324*e038c9c4Sjoerg     // Spelled tokens end after the mapping ends.
325*e038c9c4Sjoerg     ExpandedEnd =
326*e038c9c4Sjoerg         BackMapping->EndExpanded + (SpelledBackI - BackMapping->EndSpelled) + 1;
327*e038c9c4Sjoerg   }
328*e038c9c4Sjoerg 
329*e038c9c4Sjoerg   assert(ExpandedBegin < ExpandedTokens.size());
330*e038c9c4Sjoerg   assert(ExpandedEnd < ExpandedTokens.size());
331*e038c9c4Sjoerg   // Avoid returning empty ranges.
332*e038c9c4Sjoerg   if (ExpandedBegin == ExpandedEnd)
333*e038c9c4Sjoerg     return {};
334*e038c9c4Sjoerg   return {llvm::makeArrayRef(ExpandedTokens.data() + ExpandedBegin,
335*e038c9c4Sjoerg                              ExpandedTokens.data() + ExpandedEnd)};
3367330f729Sjoerg }
3377330f729Sjoerg 
spelledTokens(FileID FID) const3387330f729Sjoerg llvm::ArrayRef<syntax::Token> TokenBuffer::spelledTokens(FileID FID) const {
3397330f729Sjoerg   auto It = Files.find(FID);
3407330f729Sjoerg   assert(It != Files.end());
3417330f729Sjoerg   return It->second.SpelledTokens;
3427330f729Sjoerg }
3437330f729Sjoerg 
spelledTokenAt(SourceLocation Loc) const344*e038c9c4Sjoerg const syntax::Token *TokenBuffer::spelledTokenAt(SourceLocation Loc) const {
345*e038c9c4Sjoerg   assert(Loc.isFileID());
346*e038c9c4Sjoerg   const auto *Tok = llvm::partition_point(
347*e038c9c4Sjoerg       spelledTokens(SourceMgr->getFileID(Loc)),
348*e038c9c4Sjoerg       [&](const syntax::Token &Tok) { return Tok.location() < Loc; });
349*e038c9c4Sjoerg   if (!Tok || Tok->location() != Loc)
350*e038c9c4Sjoerg     return nullptr;
351*e038c9c4Sjoerg   return Tok;
352*e038c9c4Sjoerg }
353*e038c9c4Sjoerg 
// For debugging: renders the spelled and expanded index ranges of this
// mapping as half-open intervals.
std::string TokenBuffer::Mapping::str() const {
  return std::string(
      llvm::formatv("spelled tokens: [{0},{1}), expanded tokens: [{2},{3})",
                    BeginSpelled, EndSpelled, BeginExpanded, EndExpanded));
}
3597330f729Sjoerg 
3607330f729Sjoerg llvm::Optional<llvm::ArrayRef<syntax::Token>>
spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const3617330f729Sjoerg TokenBuffer::spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const {
3627330f729Sjoerg   // Mapping an empty range is ambiguous in case of empty mappings at either end
3637330f729Sjoerg   // of the range, bail out in that case.
3647330f729Sjoerg   if (Expanded.empty())
3657330f729Sjoerg     return llvm::None;
3667330f729Sjoerg 
3677330f729Sjoerg   const syntax::Token *BeginSpelled;
3687330f729Sjoerg   const Mapping *BeginMapping;
3697330f729Sjoerg   std::tie(BeginSpelled, BeginMapping) =
3707330f729Sjoerg       spelledForExpandedToken(&Expanded.front());
3717330f729Sjoerg 
3727330f729Sjoerg   const syntax::Token *LastSpelled;
3737330f729Sjoerg   const Mapping *LastMapping;
3747330f729Sjoerg   std::tie(LastSpelled, LastMapping) =
3757330f729Sjoerg       spelledForExpandedToken(&Expanded.back());
3767330f729Sjoerg 
3777330f729Sjoerg   FileID FID = SourceMgr->getFileID(BeginSpelled->location());
3787330f729Sjoerg   // FIXME: Handle multi-file changes by trying to map onto a common root.
3797330f729Sjoerg   if (FID != SourceMgr->getFileID(LastSpelled->location()))
3807330f729Sjoerg     return llvm::None;
3817330f729Sjoerg 
3827330f729Sjoerg   const MarkedFile &File = Files.find(FID)->second;
3837330f729Sjoerg 
384*e038c9c4Sjoerg   // If both tokens are coming from a macro argument expansion, try and map to
385*e038c9c4Sjoerg   // smallest part of the macro argument. BeginMapping && LastMapping check is
386*e038c9c4Sjoerg   // only for performance, they are a prerequisite for Expanded.front() and
387*e038c9c4Sjoerg   // Expanded.back() being part of a macro arg expansion.
388*e038c9c4Sjoerg   if (BeginMapping && LastMapping &&
389*e038c9c4Sjoerg       SourceMgr->isMacroArgExpansion(Expanded.front().location()) &&
390*e038c9c4Sjoerg       SourceMgr->isMacroArgExpansion(Expanded.back().location())) {
391*e038c9c4Sjoerg     auto CommonRange = findCommonRangeForMacroArgs(Expanded.front(),
392*e038c9c4Sjoerg                                                    Expanded.back(), *SourceMgr);
393*e038c9c4Sjoerg     // It might be the case that tokens are arguments of different macro calls,
394*e038c9c4Sjoerg     // in that case we should continue with the logic below instead of returning
395*e038c9c4Sjoerg     // an empty range.
396*e038c9c4Sjoerg     if (CommonRange.isValid())
397*e038c9c4Sjoerg       return getTokensCovering(File.SpelledTokens, CommonRange, *SourceMgr);
398*e038c9c4Sjoerg   }
399*e038c9c4Sjoerg 
400*e038c9c4Sjoerg   // Do not allow changes that doesn't cover full expansion.
4017330f729Sjoerg   unsigned BeginExpanded = Expanded.begin() - ExpandedTokens.data();
4027330f729Sjoerg   unsigned EndExpanded = Expanded.end() - ExpandedTokens.data();
403*e038c9c4Sjoerg   if (BeginMapping && BeginExpanded != BeginMapping->BeginExpanded)
4047330f729Sjoerg     return llvm::None;
405*e038c9c4Sjoerg   if (LastMapping && LastMapping->EndExpanded != EndExpanded)
4067330f729Sjoerg     return llvm::None;
4077330f729Sjoerg   // All is good, return the result.
4087330f729Sjoerg   return llvm::makeArrayRef(
4097330f729Sjoerg       BeginMapping ? File.SpelledTokens.data() + BeginMapping->BeginSpelled
4107330f729Sjoerg                    : BeginSpelled,
4117330f729Sjoerg       LastMapping ? File.SpelledTokens.data() + LastMapping->EndSpelled
4127330f729Sjoerg                   : LastSpelled + 1);
4137330f729Sjoerg }
4147330f729Sjoerg 
makeExpansion(const MarkedFile & F,const Mapping & M) const415*e038c9c4Sjoerg TokenBuffer::Expansion TokenBuffer::makeExpansion(const MarkedFile &F,
416*e038c9c4Sjoerg                                                   const Mapping &M) const {
417*e038c9c4Sjoerg   Expansion E;
418*e038c9c4Sjoerg   E.Spelled = llvm::makeArrayRef(F.SpelledTokens.data() + M.BeginSpelled,
419*e038c9c4Sjoerg                                  F.SpelledTokens.data() + M.EndSpelled);
420*e038c9c4Sjoerg   E.Expanded = llvm::makeArrayRef(ExpandedTokens.data() + M.BeginExpanded,
421*e038c9c4Sjoerg                                   ExpandedTokens.data() + M.EndExpanded);
422*e038c9c4Sjoerg   return E;
423*e038c9c4Sjoerg }
424*e038c9c4Sjoerg 
425*e038c9c4Sjoerg const TokenBuffer::MarkedFile &
fileForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const426*e038c9c4Sjoerg TokenBuffer::fileForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const {
427*e038c9c4Sjoerg   assert(!Spelled.empty());
428*e038c9c4Sjoerg   assert(Spelled.front().location().isFileID() && "not a spelled token");
429*e038c9c4Sjoerg   auto FileIt = Files.find(SourceMgr->getFileID(Spelled.front().location()));
430*e038c9c4Sjoerg   assert(FileIt != Files.end() && "file not tracked by token buffer");
431*e038c9c4Sjoerg   const auto &File = FileIt->second;
432*e038c9c4Sjoerg   assert(File.SpelledTokens.data() <= Spelled.data() &&
433*e038c9c4Sjoerg          Spelled.end() <=
434*e038c9c4Sjoerg              (File.SpelledTokens.data() + File.SpelledTokens.size()) &&
435*e038c9c4Sjoerg          "Tokens not in spelled range");
436*e038c9c4Sjoerg #ifndef NDEBUG
437*e038c9c4Sjoerg   auto T1 = Spelled.back().location();
438*e038c9c4Sjoerg   auto T2 = File.SpelledTokens.back().location();
439*e038c9c4Sjoerg   assert(T1 == T2 || sourceManager().isBeforeInTranslationUnit(T1, T2));
440*e038c9c4Sjoerg #endif
441*e038c9c4Sjoerg   return File;
442*e038c9c4Sjoerg }
443*e038c9c4Sjoerg 
4447330f729Sjoerg llvm::Optional<TokenBuffer::Expansion>
expansionStartingAt(const syntax::Token * Spelled) const4457330f729Sjoerg TokenBuffer::expansionStartingAt(const syntax::Token *Spelled) const {
4467330f729Sjoerg   assert(Spelled);
447*e038c9c4Sjoerg   const auto &File = fileForSpelled(*Spelled);
4487330f729Sjoerg 
4497330f729Sjoerg   unsigned SpelledIndex = Spelled - File.SpelledTokens.data();
4507330f729Sjoerg   auto M = llvm::partition_point(File.Mappings, [&](const Mapping &M) {
4517330f729Sjoerg     return M.BeginSpelled < SpelledIndex;
4527330f729Sjoerg   });
4537330f729Sjoerg   if (M == File.Mappings.end() || M->BeginSpelled != SpelledIndex)
4547330f729Sjoerg     return llvm::None;
455*e038c9c4Sjoerg   return makeExpansion(File, *M);
456*e038c9c4Sjoerg }
4577330f729Sjoerg 
expansionsOverlapping(llvm::ArrayRef<syntax::Token> Spelled) const458*e038c9c4Sjoerg std::vector<TokenBuffer::Expansion> TokenBuffer::expansionsOverlapping(
459*e038c9c4Sjoerg     llvm::ArrayRef<syntax::Token> Spelled) const {
460*e038c9c4Sjoerg   if (Spelled.empty())
461*e038c9c4Sjoerg     return {};
462*e038c9c4Sjoerg   const auto &File = fileForSpelled(Spelled);
463*e038c9c4Sjoerg 
464*e038c9c4Sjoerg   // Find the first overlapping range, and then copy until we stop overlapping.
465*e038c9c4Sjoerg   unsigned SpelledBeginIndex = Spelled.begin() - File.SpelledTokens.data();
466*e038c9c4Sjoerg   unsigned SpelledEndIndex = Spelled.end() - File.SpelledTokens.data();
467*e038c9c4Sjoerg   auto M = llvm::partition_point(File.Mappings, [&](const Mapping &M) {
468*e038c9c4Sjoerg     return M.EndSpelled <= SpelledBeginIndex;
469*e038c9c4Sjoerg   });
470*e038c9c4Sjoerg   std::vector<TokenBuffer::Expansion> Expansions;
471*e038c9c4Sjoerg   for (; M != File.Mappings.end() && M->BeginSpelled < SpelledEndIndex; ++M)
472*e038c9c4Sjoerg     Expansions.push_back(makeExpansion(File, *M));
473*e038c9c4Sjoerg   return Expansions;
474*e038c9c4Sjoerg }
475*e038c9c4Sjoerg 
476*e038c9c4Sjoerg llvm::ArrayRef<syntax::Token>
spelledTokensTouching(SourceLocation Loc,llvm::ArrayRef<syntax::Token> Tokens)477*e038c9c4Sjoerg syntax::spelledTokensTouching(SourceLocation Loc,
478*e038c9c4Sjoerg                               llvm::ArrayRef<syntax::Token> Tokens) {
479*e038c9c4Sjoerg   assert(Loc.isFileID());
480*e038c9c4Sjoerg 
481*e038c9c4Sjoerg   auto *Right = llvm::partition_point(
482*e038c9c4Sjoerg       Tokens, [&](const syntax::Token &Tok) { return Tok.location() < Loc; });
483*e038c9c4Sjoerg   bool AcceptRight = Right != Tokens.end() && Right->location() <= Loc;
484*e038c9c4Sjoerg   bool AcceptLeft =
485*e038c9c4Sjoerg       Right != Tokens.begin() && (Right - 1)->endLocation() >= Loc;
486*e038c9c4Sjoerg   return llvm::makeArrayRef(Right - (AcceptLeft ? 1 : 0),
487*e038c9c4Sjoerg                             Right + (AcceptRight ? 1 : 0));
488*e038c9c4Sjoerg }
489*e038c9c4Sjoerg 
490*e038c9c4Sjoerg llvm::ArrayRef<syntax::Token>
spelledTokensTouching(SourceLocation Loc,const syntax::TokenBuffer & Tokens)491*e038c9c4Sjoerg syntax::spelledTokensTouching(SourceLocation Loc,
492*e038c9c4Sjoerg                               const syntax::TokenBuffer &Tokens) {
493*e038c9c4Sjoerg   return spelledTokensTouching(
494*e038c9c4Sjoerg       Loc, Tokens.spelledTokens(Tokens.sourceManager().getFileID(Loc)));
495*e038c9c4Sjoerg }
496*e038c9c4Sjoerg 
497*e038c9c4Sjoerg const syntax::Token *
spelledIdentifierTouching(SourceLocation Loc,llvm::ArrayRef<syntax::Token> Tokens)498*e038c9c4Sjoerg syntax::spelledIdentifierTouching(SourceLocation Loc,
499*e038c9c4Sjoerg                                   llvm::ArrayRef<syntax::Token> Tokens) {
500*e038c9c4Sjoerg   for (const syntax::Token &Tok : spelledTokensTouching(Loc, Tokens)) {
501*e038c9c4Sjoerg     if (Tok.kind() == tok::identifier)
502*e038c9c4Sjoerg       return &Tok;
503*e038c9c4Sjoerg   }
504*e038c9c4Sjoerg   return nullptr;
505*e038c9c4Sjoerg }
506*e038c9c4Sjoerg 
507*e038c9c4Sjoerg const syntax::Token *
spelledIdentifierTouching(SourceLocation Loc,const syntax::TokenBuffer & Tokens)508*e038c9c4Sjoerg syntax::spelledIdentifierTouching(SourceLocation Loc,
509*e038c9c4Sjoerg                                   const syntax::TokenBuffer &Tokens) {
510*e038c9c4Sjoerg   return spelledIdentifierTouching(
511*e038c9c4Sjoerg       Loc, Tokens.spelledTokens(Tokens.sourceManager().getFileID(Loc)));
5127330f729Sjoerg }
5137330f729Sjoerg 
5147330f729Sjoerg std::vector<const syntax::Token *>
macroExpansions(FileID FID) const5157330f729Sjoerg TokenBuffer::macroExpansions(FileID FID) const {
5167330f729Sjoerg   auto FileIt = Files.find(FID);
5177330f729Sjoerg   assert(FileIt != Files.end() && "file not tracked by token buffer");
5187330f729Sjoerg   auto &File = FileIt->second;
5197330f729Sjoerg   std::vector<const syntax::Token *> Expansions;
5207330f729Sjoerg   auto &Spelled = File.SpelledTokens;
5217330f729Sjoerg   for (auto Mapping : File.Mappings) {
5227330f729Sjoerg     const syntax::Token *Token = &Spelled[Mapping.BeginSpelled];
5237330f729Sjoerg     if (Token->kind() == tok::TokenKind::identifier)
5247330f729Sjoerg       Expansions.push_back(Token);
5257330f729Sjoerg   }
5267330f729Sjoerg   return Expansions;
5277330f729Sjoerg }
5287330f729Sjoerg 
tokenize(const FileRange & FR,const SourceManager & SM,const LangOptions & LO)529*e038c9c4Sjoerg std::vector<syntax::Token> syntax::tokenize(const FileRange &FR,
530*e038c9c4Sjoerg                                             const SourceManager &SM,
5317330f729Sjoerg                                             const LangOptions &LO) {
5327330f729Sjoerg   std::vector<syntax::Token> Tokens;
5337330f729Sjoerg   IdentifierTable Identifiers(LO);
5347330f729Sjoerg   auto AddToken = [&](clang::Token T) {
5357330f729Sjoerg     // Fill the proper token kind for keywords, etc.
5367330f729Sjoerg     if (T.getKind() == tok::raw_identifier && !T.needsCleaning() &&
5377330f729Sjoerg         !T.hasUCN()) { // FIXME: support needsCleaning and hasUCN cases.
5387330f729Sjoerg       clang::IdentifierInfo &II = Identifiers.get(T.getRawIdentifier());
5397330f729Sjoerg       T.setIdentifierInfo(&II);
5407330f729Sjoerg       T.setKind(II.getTokenID());
5417330f729Sjoerg     }
5427330f729Sjoerg     Tokens.push_back(syntax::Token(T));
5437330f729Sjoerg   };
5447330f729Sjoerg 
545*e038c9c4Sjoerg   auto SrcBuffer = SM.getBufferData(FR.file());
546*e038c9c4Sjoerg   Lexer L(SM.getLocForStartOfFile(FR.file()), LO, SrcBuffer.data(),
547*e038c9c4Sjoerg           SrcBuffer.data() + FR.beginOffset(),
548*e038c9c4Sjoerg           // We can't make BufEnd point to FR.endOffset, as Lexer requires a
549*e038c9c4Sjoerg           // null terminated buffer.
550*e038c9c4Sjoerg           SrcBuffer.data() + SrcBuffer.size());
5517330f729Sjoerg 
5527330f729Sjoerg   clang::Token T;
553*e038c9c4Sjoerg   while (!L.LexFromRawLexer(T) && L.getCurrentBufferOffset() < FR.endOffset())
5547330f729Sjoerg     AddToken(T);
555*e038c9c4Sjoerg   // LexFromRawLexer returns true when it parses the last token of the file, add
556*e038c9c4Sjoerg   // it iff it starts within the range we are interested in.
557*e038c9c4Sjoerg   if (SM.getFileOffset(T.getLocation()) < FR.endOffset())
5587330f729Sjoerg     AddToken(T);
5597330f729Sjoerg   return Tokens;
5607330f729Sjoerg }
5617330f729Sjoerg 
tokenize(FileID FID,const SourceManager & SM,const LangOptions & LO)562*e038c9c4Sjoerg std::vector<syntax::Token> syntax::tokenize(FileID FID, const SourceManager &SM,
563*e038c9c4Sjoerg                                             const LangOptions &LO) {
564*e038c9c4Sjoerg   return tokenize(syntax::FileRange(FID, 0, SM.getFileIDSize(FID)), SM, LO);
565*e038c9c4Sjoerg }
566*e038c9c4Sjoerg 
/// Records information required to construct mappings for the token buffer that
/// we are collecting.
class TokenCollector::CollectPPExpansions : public PPCallbacks {
public:
  CollectPPExpansions(TokenCollector &C) : Collector(&C) {}

  /// Disabled instance will stop reporting anything to TokenCollector.
  /// This ensures that uses of the preprocessor after TokenCollector::consume()
  /// is called do not access the (possibly invalid) collector instance.
  void disable() { Collector = nullptr; }

  /// Called by the preprocessor for every macro invocation. Records the
  /// spelled source range of each *top-level* expansion into
  /// Collector->Expansions (begin location -> end location).
  void MacroExpands(const clang::Token &MacroNameTok, const MacroDefinition &MD,
                    SourceRange Range, const MacroArgs *Args) override {
    if (!Collector)
      return; // consume() was already called; stop recording.
    const auto &SM = Collector->PP.getSourceManager();
    // Only record top-level expansions that directly produce expanded tokens.
    // This excludes those where:
    //   - the macro use is inside a macro body,
    //   - the macro appears in an argument to another macro.
    // However macro expansion isn't really a tree, it's token rewrite rules,
    // so there are other cases, e.g.
    //   #define B(X) X
    //   #define A 1 + B
    //   A(2)
    // Both A and B produce expanded tokens, though the macro name 'B' comes
    // from an expansion. The best we can do is merge the mappings for both.

    // The *last* token of any top-level macro expansion must be in a file.
    // (In the example above, see the closing paren of the expansion of B).
    if (!Range.getEnd().isFileID())
      return;
    // If there's a current expansion that encloses this one, this one can't be
    // top-level.
    if (LastExpansionEnd.isValid() &&
        !SM.isBeforeInTranslationUnit(LastExpansionEnd, Range.getEnd()))
      return;

    // If the macro invocation (B) starts in a macro (A) but ends in a file,
    // we'll create a merged mapping for A + B by overwriting the endpoint for
    // A's startpoint.
    if (!Range.getBegin().isFileID()) {
      Range.setBegin(SM.getExpansionLoc(Range.getBegin()));
      assert(Collector->Expansions.count(Range.getBegin()) &&
             "Overlapping macros should have same expansion location");
    }

    // Record (or overwrite, in the merged case above) the expansion bounds.
    Collector->Expansions[Range.getBegin()] = Range.getEnd();
    LastExpansionEnd = Range.getEnd();
  }
  // FIXME: handle directives like #pragma, #include, etc.
private:
  // The collector to report to; null after disable().
  TokenCollector *Collector;
  /// Used to detect recursive macro expansions.
  SourceLocation LastExpansionEnd;
};
6237330f729Sjoerg 
6247330f729Sjoerg /// Fills in the TokenBuffer by tracing the run of a preprocessor. The
6257330f729Sjoerg /// implementation tracks the tokens, macro expansions and directives coming
6267330f729Sjoerg /// from the preprocessor and:
6277330f729Sjoerg /// - for each token, figures out if it is a part of an expanded token stream,
6287330f729Sjoerg ///   spelled token stream or both. Stores the tokens appropriately.
6297330f729Sjoerg /// - records mappings from the spelled to expanded token ranges, e.g. for macro
6307330f729Sjoerg ///   expansions.
6317330f729Sjoerg /// FIXME: also properly record:
6327330f729Sjoerg ///          - #include directives,
6337330f729Sjoerg ///          - #pragma, #line and other PP directives,
6347330f729Sjoerg ///          - skipped pp regions,
6357330f729Sjoerg ///          - ...
6367330f729Sjoerg 
TokenCollector(Preprocessor & PP)6377330f729Sjoerg TokenCollector::TokenCollector(Preprocessor &PP) : PP(PP) {
6387330f729Sjoerg   // Collect the expanded token stream during preprocessing.
6397330f729Sjoerg   PP.setTokenWatcher([this](const clang::Token &T) {
6407330f729Sjoerg     if (T.isAnnotation())
6417330f729Sjoerg       return;
6427330f729Sjoerg     DEBUG_WITH_TYPE("collect-tokens", llvm::dbgs()
6437330f729Sjoerg                                           << "Token: "
6447330f729Sjoerg                                           << syntax::Token(T).dumpForTests(
6457330f729Sjoerg                                                  this->PP.getSourceManager())
6467330f729Sjoerg                                           << "\n"
6477330f729Sjoerg 
6487330f729Sjoerg     );
6497330f729Sjoerg     Expanded.push_back(syntax::Token(T));
6507330f729Sjoerg   });
6517330f729Sjoerg   // And locations of macro calls, to properly recover boundaries of those in
6527330f729Sjoerg   // case of empty expansions.
6537330f729Sjoerg   auto CB = std::make_unique<CollectPPExpansions>(*this);
6547330f729Sjoerg   this->Collector = CB.get();
6557330f729Sjoerg   PP.addPPCallbacks(std::move(CB));
6567330f729Sjoerg }
6577330f729Sjoerg 
6587330f729Sjoerg /// Builds mappings and spelled tokens in the TokenBuffer based on the expanded
6597330f729Sjoerg /// token stream.
class TokenCollector::Builder {
public:
  Builder(std::vector<syntax::Token> Expanded, PPExpansions CollectedExpansions,
          const SourceManager &SM, const LangOptions &LangOpts)
      : Result(SM), CollectedExpansions(std::move(CollectedExpansions)), SM(SM),
        LangOpts(LangOpts) {
    Result.ExpandedTokens = std::move(Expanded);
  }

  /// Produces the final TokenBuffer: tokenizes the contributing files, then
  /// walks the expanded stream once, emitting spelled<->expanded mappings.
  TokenBuffer build() && {
    assert(!Result.ExpandedTokens.empty());
    assert(Result.ExpandedTokens.back().kind() == tok::eof);

    // Tokenize every file that contributed tokens to the expanded stream.
    buildSpelledTokens();

    // The expanded token stream consists of runs of tokens that came from
    // the same source (a macro expansion, part of a file etc).
    // Between these runs are the logical positions of spelled tokens that
    // didn't expand to anything.
    while (NextExpanded < Result.ExpandedTokens.size() - 1 /* eof */) {
      // Create empty mappings for spelled tokens that expanded to nothing here.
      // May advance NextSpelled, but NextExpanded is unchanged.
      discard();
      // Create mapping for a contiguous run of expanded tokens.
      // Advances NextExpanded past the run, and NextSpelled accordingly.
      unsigned OldPosition = NextExpanded;
      advance();
      if (NextExpanded == OldPosition)
        diagnoseAdvanceFailure();
    }
    // If any tokens remain in any of the files, they didn't expand to anything.
    // Create empty mappings up until the end of the file.
    for (const auto &File : Result.Files)
      discard(File.first);

#ifndef NDEBUG
    // Sanity-check: mappings in each file must be strictly increasing on all
    // four bounds.
    for (auto &pair : Result.Files) {
      auto &mappings = pair.second.Mappings;
      assert(llvm::is_sorted(mappings, [](const TokenBuffer::Mapping &M1,
                                          const TokenBuffer::Mapping &M2) {
        return M1.BeginSpelled < M2.BeginSpelled &&
               M1.EndSpelled < M2.EndSpelled &&
               M1.BeginExpanded < M2.BeginExpanded &&
               M1.EndExpanded < M2.EndExpanded;
      }));
    }
#endif

    return std::move(Result);
  }

private:
  // Consume a sequence of spelled tokens that didn't expand to anything.
  // In the simplest case, skips spelled tokens until finding one that produced
  // the NextExpanded token, and creates an empty mapping for them.
  // If Drain is provided, skips remaining tokens from that file instead.
  void discard(llvm::Optional<FileID> Drain = llvm::None) {
    // Target is the spelled location we skip up to: end-of-file when draining,
    // otherwise the expansion location of the next expanded token.
    SourceLocation Target =
        Drain ? SM.getLocForEndOfFile(*Drain)
              : SM.getExpansionLoc(
                    Result.ExpandedTokens[NextExpanded].location());
    FileID File = SM.getFileID(Target);
    const auto &SpelledTokens = Result.Files[File].SpelledTokens;
    auto &NextSpelled = this->NextSpelled[File];

    TokenBuffer::Mapping Mapping;
    Mapping.BeginSpelled = NextSpelled;
    // When dropping trailing tokens from a file, the empty mapping should
    // be positioned within the file's expanded-token range (at the end).
    Mapping.BeginExpanded = Mapping.EndExpanded =
        Drain ? Result.Files[*Drain].EndExpanded : NextExpanded;
    // We may want to split into several adjacent empty mappings.
    // FlushMapping() emits the current mapping and starts a new one.
    auto FlushMapping = [&, this] {
      Mapping.EndSpelled = NextSpelled;
      if (Mapping.BeginSpelled != Mapping.EndSpelled)
        Result.Files[File].Mappings.push_back(Mapping);
      Mapping.BeginSpelled = NextSpelled;
    };

    while (NextSpelled < SpelledTokens.size() &&
           SpelledTokens[NextSpelled].location() < Target) {
      // If we know mapping bounds at [NextSpelled, KnownEnd] (macro expansion)
      // then we want to partition our (empty) mapping.
      //   [Start, NextSpelled) [NextSpelled, KnownEnd] (KnownEnd, Target)
      SourceLocation KnownEnd =
          CollectedExpansions.lookup(SpelledTokens[NextSpelled].location());
      if (KnownEnd.isValid()) {
        FlushMapping(); // Emits [Start, NextSpelled)
        while (NextSpelled < SpelledTokens.size() &&
               SpelledTokens[NextSpelled].location() <= KnownEnd)
          ++NextSpelled;
        FlushMapping(); // Emits [NextSpelled, KnownEnd]
        // Now the loop continues and will emit (KnownEnd, Target).
      } else {
        ++NextSpelled;
      }
    }
    FlushMapping();
  }

  // Consumes the NextExpanded token and others that are part of the same run.
  // Increases NextExpanded and NextSpelled by at least one, and adds a mapping
  // (unless this is a run of file tokens, which we represent with no mapping).
  void advance() {
    const syntax::Token &Tok = Result.ExpandedTokens[NextExpanded];
    SourceLocation Expansion = SM.getExpansionLoc(Tok.location());
    FileID File = SM.getFileID(Expansion);
    const auto &SpelledTokens = Result.Files[File].SpelledTokens;
    auto &NextSpelled = this->NextSpelled[File];

    if (Tok.location().isFileID()) {
      // A run of file tokens continues while the expanded/spelled tokens match.
      while (NextSpelled < SpelledTokens.size() &&
             NextExpanded < Result.ExpandedTokens.size() &&
             SpelledTokens[NextSpelled].location() ==
                 Result.ExpandedTokens[NextExpanded].location()) {
        ++NextSpelled;
        ++NextExpanded;
      }
      // We need no mapping for file tokens copied to the expanded stream.
    } else {
      // We found a new macro expansion. We should have its spelling bounds.
      auto End = CollectedExpansions.lookup(Expansion);
      assert(End.isValid() && "Macro expansion wasn't captured?");

      // Mapping starts here...
      TokenBuffer::Mapping Mapping;
      Mapping.BeginExpanded = NextExpanded;
      Mapping.BeginSpelled = NextSpelled;
      // ... consumes spelled tokens within bounds we captured ...
      while (NextSpelled < SpelledTokens.size() &&
             SpelledTokens[NextSpelled].location() <= End)
        ++NextSpelled;
      // ... consumes expanded tokens rooted at the same expansion ...
      while (NextExpanded < Result.ExpandedTokens.size() &&
             SM.getExpansionLoc(
                 Result.ExpandedTokens[NextExpanded].location()) == Expansion)
        ++NextExpanded;
      // ... and ends here.
      Mapping.EndExpanded = NextExpanded;
      Mapping.EndSpelled = NextSpelled;
      Result.Files[File].Mappings.push_back(Mapping);
    }
  }

  // advance() is supposed to consume at least one token - if not, we crash.
  void diagnoseAdvanceFailure() {
#ifndef NDEBUG
    // Show the failed-to-map token in context.
    for (unsigned I = (NextExpanded < 10) ? 0 : NextExpanded - 10;
         I < NextExpanded + 5 && I < Result.ExpandedTokens.size(); ++I) {
      const char *L =
          (I == NextExpanded) ? "!! " : (I < NextExpanded) ? "ok " : "   ";
      llvm::errs() << L << Result.ExpandedTokens[I].dumpForTests(SM) << "\n";
    }
#endif
    llvm_unreachable("Couldn't map expanded token to spelled tokens!");
  }

  /// Initializes TokenBuffer::Files and fills spelled tokens and expanded
  /// ranges for each of the files.
  void buildSpelledTokens() {
    for (unsigned I = 0; I < Result.ExpandedTokens.size(); ++I) {
      const auto &Tok = Result.ExpandedTokens[I];
      auto FID = SM.getFileID(SM.getExpansionLoc(Tok.location()));
      auto It = Result.Files.try_emplace(FID);
      TokenBuffer::MarkedFile &File = It.first->second;

      // The eof token should not be considered part of the main-file's range.
      File.EndExpanded = Tok.kind() == tok::eof ? I : I + 1;

      if (!It.second)
        continue; // we have seen this file before.
      // This is the first time we see this file.
      File.BeginExpanded = I;
      File.SpelledTokens = tokenize(FID, SM, LangOpts);
    }
  }

  TokenBuffer Result;
  unsigned NextExpanded = 0;                    // cursor in ExpandedTokens
  llvm::DenseMap<FileID, unsigned> NextSpelled; // cursor in SpelledTokens
  PPExpansions CollectedExpansions;
  const SourceManager &SM;
  const LangOptions &LangOpts;
};
8487330f729Sjoerg 
consume()8497330f729Sjoerg TokenBuffer TokenCollector::consume() && {
8507330f729Sjoerg   PP.setTokenWatcher(nullptr);
8517330f729Sjoerg   Collector->disable();
8527330f729Sjoerg   return Builder(std::move(Expanded), std::move(Expansions),
8537330f729Sjoerg                  PP.getSourceManager(), PP.getLangOpts())
8547330f729Sjoerg       .build();
8557330f729Sjoerg }
8567330f729Sjoerg 
str() const8577330f729Sjoerg std::string syntax::Token::str() const {
858*e038c9c4Sjoerg   return std::string(llvm::formatv("Token({0}, length = {1})",
859*e038c9c4Sjoerg                                    tok::getTokenName(kind()), length()));
8607330f729Sjoerg }
8617330f729Sjoerg 
dumpForTests(const SourceManager & SM) const8627330f729Sjoerg std::string syntax::Token::dumpForTests(const SourceManager &SM) const {
863*e038c9c4Sjoerg   return std::string(llvm::formatv("Token(`{0}`, {1}, length = {2})", text(SM),
864*e038c9c4Sjoerg                                    tok::getTokenName(kind()), length()));
8657330f729Sjoerg }
8667330f729Sjoerg 
dumpForTests() const8677330f729Sjoerg std::string TokenBuffer::dumpForTests() const {
8687330f729Sjoerg   auto PrintToken = [this](const syntax::Token &T) -> std::string {
8697330f729Sjoerg     if (T.kind() == tok::eof)
8707330f729Sjoerg       return "<eof>";
871*e038c9c4Sjoerg     return std::string(T.text(*SourceMgr));
8727330f729Sjoerg   };
8737330f729Sjoerg 
8747330f729Sjoerg   auto DumpTokens = [this, &PrintToken](llvm::raw_ostream &OS,
8757330f729Sjoerg                                         llvm::ArrayRef<syntax::Token> Tokens) {
8767330f729Sjoerg     if (Tokens.empty()) {
8777330f729Sjoerg       OS << "<empty>";
8787330f729Sjoerg       return;
8797330f729Sjoerg     }
8807330f729Sjoerg     OS << Tokens[0].text(*SourceMgr);
8817330f729Sjoerg     for (unsigned I = 1; I < Tokens.size(); ++I) {
8827330f729Sjoerg       if (Tokens[I].kind() == tok::eof)
8837330f729Sjoerg         continue;
8847330f729Sjoerg       OS << " " << PrintToken(Tokens[I]);
8857330f729Sjoerg     }
8867330f729Sjoerg   };
8877330f729Sjoerg 
8887330f729Sjoerg   std::string Dump;
8897330f729Sjoerg   llvm::raw_string_ostream OS(Dump);
8907330f729Sjoerg 
8917330f729Sjoerg   OS << "expanded tokens:\n"
8927330f729Sjoerg      << "  ";
8937330f729Sjoerg   // (!) we do not show '<eof>'.
8947330f729Sjoerg   DumpTokens(OS, llvm::makeArrayRef(ExpandedTokens).drop_back());
8957330f729Sjoerg   OS << "\n";
8967330f729Sjoerg 
8977330f729Sjoerg   std::vector<FileID> Keys;
8987330f729Sjoerg   for (auto F : Files)
8997330f729Sjoerg     Keys.push_back(F.first);
9007330f729Sjoerg   llvm::sort(Keys);
9017330f729Sjoerg 
9027330f729Sjoerg   for (FileID ID : Keys) {
9037330f729Sjoerg     const MarkedFile &File = Files.find(ID)->second;
9047330f729Sjoerg     auto *Entry = SourceMgr->getFileEntryForID(ID);
9057330f729Sjoerg     if (!Entry)
9067330f729Sjoerg       continue; // Skip builtin files.
9077330f729Sjoerg     OS << llvm::formatv("file '{0}'\n", Entry->getName())
9087330f729Sjoerg        << "  spelled tokens:\n"
9097330f729Sjoerg        << "    ";
9107330f729Sjoerg     DumpTokens(OS, File.SpelledTokens);
9117330f729Sjoerg     OS << "\n";
9127330f729Sjoerg 
9137330f729Sjoerg     if (File.Mappings.empty()) {
9147330f729Sjoerg       OS << "  no mappings.\n";
9157330f729Sjoerg       continue;
9167330f729Sjoerg     }
9177330f729Sjoerg     OS << "  mappings:\n";
9187330f729Sjoerg     for (auto &M : File.Mappings) {
9197330f729Sjoerg       OS << llvm::formatv(
9207330f729Sjoerg           "    ['{0}'_{1}, '{2}'_{3}) => ['{4}'_{5}, '{6}'_{7})\n",
9217330f729Sjoerg           PrintToken(File.SpelledTokens[M.BeginSpelled]), M.BeginSpelled,
9227330f729Sjoerg           M.EndSpelled == File.SpelledTokens.size()
9237330f729Sjoerg               ? "<eof>"
9247330f729Sjoerg               : PrintToken(File.SpelledTokens[M.EndSpelled]),
9257330f729Sjoerg           M.EndSpelled, PrintToken(ExpandedTokens[M.BeginExpanded]),
9267330f729Sjoerg           M.BeginExpanded, PrintToken(ExpandedTokens[M.EndExpanded]),
9277330f729Sjoerg           M.EndExpanded);
9287330f729Sjoerg     }
9297330f729Sjoerg   }
9307330f729Sjoerg   return OS.str();
9317330f729Sjoerg }
932