1e5dd7070Spatrick //===- Tokens.cpp - collect tokens from preprocessing ---------------------===//
2e5dd7070Spatrick //
3e5dd7070Spatrick // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4e5dd7070Spatrick // See https://llvm.org/LICENSE.txt for license information.
5e5dd7070Spatrick // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6e5dd7070Spatrick //
7e5dd7070Spatrick //===----------------------------------------------------------------------===//
8e5dd7070Spatrick #include "clang/Tooling/Syntax/Tokens.h"
9e5dd7070Spatrick
10e5dd7070Spatrick #include "clang/Basic/Diagnostic.h"
11e5dd7070Spatrick #include "clang/Basic/IdentifierTable.h"
12e5dd7070Spatrick #include "clang/Basic/LLVM.h"
13e5dd7070Spatrick #include "clang/Basic/LangOptions.h"
14e5dd7070Spatrick #include "clang/Basic/SourceLocation.h"
15e5dd7070Spatrick #include "clang/Basic/SourceManager.h"
16e5dd7070Spatrick #include "clang/Basic/TokenKinds.h"
17e5dd7070Spatrick #include "clang/Lex/PPCallbacks.h"
18e5dd7070Spatrick #include "clang/Lex/Preprocessor.h"
19e5dd7070Spatrick #include "clang/Lex/Token.h"
20e5dd7070Spatrick #include "llvm/ADT/ArrayRef.h"
21e5dd7070Spatrick #include "llvm/ADT/STLExtras.h"
22e5dd7070Spatrick #include "llvm/Support/Debug.h"
23e5dd7070Spatrick #include "llvm/Support/ErrorHandling.h"
24e5dd7070Spatrick #include "llvm/Support/FormatVariadic.h"
25e5dd7070Spatrick #include "llvm/Support/raw_ostream.h"
26e5dd7070Spatrick #include <algorithm>
27e5dd7070Spatrick #include <cassert>
28e5dd7070Spatrick #include <iterator>
29*12c85518Srobert #include <optional>
30e5dd7070Spatrick #include <string>
31e5dd7070Spatrick #include <utility>
32e5dd7070Spatrick #include <vector>
33e5dd7070Spatrick
34e5dd7070Spatrick using namespace clang;
35e5dd7070Spatrick using namespace clang::syntax;
36e5dd7070Spatrick
37ec727ea7Spatrick namespace {
38ec727ea7Spatrick // Finds the smallest consecutive subsuquence of Toks that covers R.
39ec727ea7Spatrick llvm::ArrayRef<syntax::Token>
getTokensCovering(llvm::ArrayRef<syntax::Token> Toks,SourceRange R,const SourceManager & SM)40ec727ea7Spatrick getTokensCovering(llvm::ArrayRef<syntax::Token> Toks, SourceRange R,
41ec727ea7Spatrick const SourceManager &SM) {
42ec727ea7Spatrick if (R.isInvalid())
43ec727ea7Spatrick return {};
44ec727ea7Spatrick const syntax::Token *Begin =
45ec727ea7Spatrick llvm::partition_point(Toks, [&](const syntax::Token &T) {
46ec727ea7Spatrick return SM.isBeforeInTranslationUnit(T.location(), R.getBegin());
47ec727ea7Spatrick });
48ec727ea7Spatrick const syntax::Token *End =
49ec727ea7Spatrick llvm::partition_point(Toks, [&](const syntax::Token &T) {
50ec727ea7Spatrick return !SM.isBeforeInTranslationUnit(R.getEnd(), T.location());
51ec727ea7Spatrick });
52ec727ea7Spatrick if (Begin > End)
53ec727ea7Spatrick return {};
54ec727ea7Spatrick return {Begin, End};
55ec727ea7Spatrick }
56ec727ea7Spatrick
57*12c85518Srobert // Finds the range within FID corresponding to expanded tokens [First, Last].
58*12c85518Srobert // Prev precedes First and Next follows Last, these must *not* be included.
59*12c85518Srobert // If no range satisfies the criteria, returns an invalid range.
60*12c85518Srobert //
61ec727ea7Spatrick // #define ID(x) x
62ec727ea7Spatrick // ID(ID(ID(a1) a2))
63ec727ea7Spatrick // ~~ -> a1
64ec727ea7Spatrick // ~~ -> a2
65ec727ea7Spatrick // ~~~~~~~~~ -> a1 a2
spelledForExpandedSlow(SourceLocation First,SourceLocation Last,SourceLocation Prev,SourceLocation Next,FileID TargetFile,const SourceManager & SM)66*12c85518Srobert SourceRange spelledForExpandedSlow(SourceLocation First, SourceLocation Last,
67*12c85518Srobert SourceLocation Prev, SourceLocation Next,
68*12c85518Srobert FileID TargetFile,
69ec727ea7Spatrick const SourceManager &SM) {
70*12c85518Srobert // There are two main parts to this algorithm:
71*12c85518Srobert // - identifying which spelled range covers the expanded tokens
72*12c85518Srobert // - validating that this range doesn't cover any extra tokens (First/Last)
73*12c85518Srobert //
74*12c85518Srobert // We do these in order. However as we transform the expanded range into the
75*12c85518Srobert // spelled one, we adjust First/Last so the validation remains simple.
76*12c85518Srobert
77*12c85518Srobert assert(SM.getSLocEntry(TargetFile).isFile());
78*12c85518Srobert // In most cases, to select First and Last we must return their expansion
79*12c85518Srobert // range, i.e. the whole of any macros they are included in.
80*12c85518Srobert //
81*12c85518Srobert // When First and Last are part of the *same macro arg* of a macro written
82*12c85518Srobert // in TargetFile, we that slice of the arg, i.e. their spelling range.
83*12c85518Srobert //
84*12c85518Srobert // Unwrap such macro calls. If the target file has A(B(C)), the
85*12c85518Srobert // SourceLocation stack of a token inside C shows us the expansion of A first,
86*12c85518Srobert // then B, then any macros inside C's body, then C itself.
87*12c85518Srobert // (This is the reverse of the order the PP applies the expansions in).
88*12c85518Srobert while (First.isMacroID() && Last.isMacroID()) {
89*12c85518Srobert auto DecFirst = SM.getDecomposedLoc(First);
90*12c85518Srobert auto DecLast = SM.getDecomposedLoc(Last);
91*12c85518Srobert auto &ExpFirst = SM.getSLocEntry(DecFirst.first).getExpansion();
92*12c85518Srobert auto &ExpLast = SM.getSLocEntry(DecLast.first).getExpansion();
93*12c85518Srobert
94*12c85518Srobert if (!ExpFirst.isMacroArgExpansion() || !ExpLast.isMacroArgExpansion())
95ec727ea7Spatrick break;
96*12c85518Srobert // Locations are in the same macro arg if they expand to the same place.
97*12c85518Srobert // (They may still have different FileIDs - an arg can have >1 chunks!)
98*12c85518Srobert if (ExpFirst.getExpansionLocStart() != ExpLast.getExpansionLocStart())
99ec727ea7Spatrick break;
100*12c85518Srobert // Careful, given:
101*12c85518Srobert // #define HIDE ID(ID(a))
102*12c85518Srobert // ID(ID(HIDE))
103*12c85518Srobert // The token `a` is wrapped in 4 arg-expansions, we only want to unwrap 2.
104*12c85518Srobert // We distinguish them by whether the macro expands into the target file.
105*12c85518Srobert // Fortunately, the target file ones will always appear first.
106*12c85518Srobert auto &ExpMacro =
107*12c85518Srobert SM.getSLocEntry(SM.getFileID(ExpFirst.getExpansionLocStart()))
108*12c85518Srobert .getExpansion();
109*12c85518Srobert if (ExpMacro.getExpansionLocStart().isMacroID())
110*12c85518Srobert break;
111*12c85518Srobert // Replace each endpoint with its spelling inside the macro arg.
112*12c85518Srobert // (This is getImmediateSpellingLoc without repeating lookups).
113*12c85518Srobert First = ExpFirst.getSpellingLoc().getLocWithOffset(DecFirst.second);
114*12c85518Srobert Last = ExpLast.getSpellingLoc().getLocWithOffset(DecLast.second);
115*12c85518Srobert
116*12c85518Srobert // Now: how do we adjust the previous/next bounds? Three cases:
117*12c85518Srobert // A) If they are also part of the same macro arg, we translate them too.
118*12c85518Srobert // This will ensure that we don't select any macros nested within the
119*12c85518Srobert // macro arg that cover extra tokens. Critical case:
120*12c85518Srobert // #define ID(X) X
121*12c85518Srobert // ID(prev target) // selecting 'target' succeeds
122*12c85518Srobert // #define LARGE ID(prev target)
123*12c85518Srobert // LARGE // selecting 'target' fails.
124*12c85518Srobert // B) They are not in the macro at all, then their expansion range is a
125*12c85518Srobert // sibling to it, and we can safely substitute that.
126*12c85518Srobert // #define PREV prev
127*12c85518Srobert // #define ID(X) X
128*12c85518Srobert // PREV ID(target) // selecting 'target' succeeds.
129*12c85518Srobert // #define LARGE PREV ID(target)
130*12c85518Srobert // LARGE // selecting 'target' fails.
131*12c85518Srobert // C) They are in a different arg of this macro, or the macro body.
132*12c85518Srobert // Now selecting the whole macro arg is fine, but the whole macro is not.
133*12c85518Srobert // Model this by setting using the edge of the macro call as the bound.
134*12c85518Srobert // #define ID2(X, Y) X Y
135*12c85518Srobert // ID2(prev, target) // selecting 'target' succeeds
136*12c85518Srobert // #define LARGE ID2(prev, target)
137*12c85518Srobert // LARGE // selecting 'target' fails
138*12c85518Srobert auto AdjustBound = [&](SourceLocation &Bound) {
139*12c85518Srobert if (Bound.isInvalid() || !Bound.isMacroID()) // Non-macro must be case B.
140*12c85518Srobert return;
141*12c85518Srobert auto DecBound = SM.getDecomposedLoc(Bound);
142*12c85518Srobert auto &ExpBound = SM.getSLocEntry(DecBound.first).getExpansion();
143*12c85518Srobert if (ExpBound.isMacroArgExpansion() &&
144*12c85518Srobert ExpBound.getExpansionLocStart() == ExpFirst.getExpansionLocStart()) {
145*12c85518Srobert // Case A: translate to (spelling) loc within the macro arg.
146*12c85518Srobert Bound = ExpBound.getSpellingLoc().getLocWithOffset(DecBound.second);
147*12c85518Srobert return;
148ec727ea7Spatrick }
149*12c85518Srobert while (Bound.isMacroID()) {
150*12c85518Srobert SourceRange Exp = SM.getImmediateExpansionRange(Bound).getAsRange();
151*12c85518Srobert if (Exp.getBegin() == ExpMacro.getExpansionLocStart()) {
152*12c85518Srobert // Case B: bounds become the macro call itself.
153*12c85518Srobert Bound = (&Bound == &Prev) ? Exp.getBegin() : Exp.getEnd();
154*12c85518Srobert return;
155*12c85518Srobert }
156*12c85518Srobert // Either case C, or expansion location will later find case B.
157*12c85518Srobert // We choose the upper bound for Prev and the lower one for Next:
158*12c85518Srobert // ID(prev) target ID(next)
159*12c85518Srobert // ^ ^
160*12c85518Srobert // new-prev new-next
161*12c85518Srobert Bound = (&Bound == &Prev) ? Exp.getEnd() : Exp.getBegin();
162*12c85518Srobert }
163*12c85518Srobert };
164*12c85518Srobert AdjustBound(Prev);
165*12c85518Srobert AdjustBound(Next);
166*12c85518Srobert }
167*12c85518Srobert
168*12c85518Srobert // In all remaining cases we need the full containing macros.
169*12c85518Srobert // If this overlaps Prev or Next, then no range is possible.
170*12c85518Srobert SourceRange Candidate =
171*12c85518Srobert SM.getExpansionRange(SourceRange(First, Last)).getAsRange();
172*12c85518Srobert auto DecFirst = SM.getDecomposedExpansionLoc(Candidate.getBegin());
173*12c85518Srobert auto DecLast = SM.getDecomposedLoc(Candidate.getEnd());
174*12c85518Srobert // Can end up in the wrong file due to bad input or token-pasting shenanigans.
175*12c85518Srobert if (Candidate.isInvalid() || DecFirst.first != TargetFile || DecLast.first != TargetFile)
176*12c85518Srobert return SourceRange();
177*12c85518Srobert // Check bounds, which may still be inside macros.
178*12c85518Srobert if (Prev.isValid()) {
179*12c85518Srobert auto Dec = SM.getDecomposedLoc(SM.getExpansionRange(Prev).getBegin());
180*12c85518Srobert if (Dec.first != DecFirst.first || Dec.second >= DecFirst.second)
181*12c85518Srobert return SourceRange();
182*12c85518Srobert }
183*12c85518Srobert if (Next.isValid()) {
184*12c85518Srobert auto Dec = SM.getDecomposedLoc(SM.getExpansionRange(Next).getEnd());
185*12c85518Srobert if (Dec.first != DecLast.first || Dec.second <= DecLast.second)
186*12c85518Srobert return SourceRange();
187*12c85518Srobert }
188*12c85518Srobert // Now we know that Candidate is a file range that covers [First, Last]
189*12c85518Srobert // without encroaching on {Prev, Next}. Ship it!
190*12c85518Srobert return Candidate;
191ec727ea7Spatrick }
192ec727ea7Spatrick
193ec727ea7Spatrick } // namespace
194ec727ea7Spatrick
Token(SourceLocation Location,unsigned Length,tok::TokenKind Kind)195e5dd7070Spatrick syntax::Token::Token(SourceLocation Location, unsigned Length,
196e5dd7070Spatrick tok::TokenKind Kind)
197e5dd7070Spatrick : Location(Location), Length(Length), Kind(Kind) {
198e5dd7070Spatrick assert(Location.isValid());
199e5dd7070Spatrick }
200e5dd7070Spatrick
Token(const clang::Token & T)201e5dd7070Spatrick syntax::Token::Token(const clang::Token &T)
202e5dd7070Spatrick : Token(T.getLocation(), T.getLength(), T.getKind()) {
203e5dd7070Spatrick assert(!T.isAnnotation());
204e5dd7070Spatrick }
205e5dd7070Spatrick
text(const SourceManager & SM) const206e5dd7070Spatrick llvm::StringRef syntax::Token::text(const SourceManager &SM) const {
207e5dd7070Spatrick bool Invalid = false;
208e5dd7070Spatrick const char *Start = SM.getCharacterData(location(), &Invalid);
209e5dd7070Spatrick assert(!Invalid);
210e5dd7070Spatrick return llvm::StringRef(Start, length());
211e5dd7070Spatrick }
212e5dd7070Spatrick
range(const SourceManager & SM) const213e5dd7070Spatrick FileRange syntax::Token::range(const SourceManager &SM) const {
214e5dd7070Spatrick assert(location().isFileID() && "must be a spelled token");
215e5dd7070Spatrick FileID File;
216e5dd7070Spatrick unsigned StartOffset;
217e5dd7070Spatrick std::tie(File, StartOffset) = SM.getDecomposedLoc(location());
218e5dd7070Spatrick return FileRange(File, StartOffset, StartOffset + length());
219e5dd7070Spatrick }
220e5dd7070Spatrick
range(const SourceManager & SM,const syntax::Token & First,const syntax::Token & Last)221e5dd7070Spatrick FileRange syntax::Token::range(const SourceManager &SM,
222e5dd7070Spatrick const syntax::Token &First,
223e5dd7070Spatrick const syntax::Token &Last) {
224e5dd7070Spatrick auto F = First.range(SM);
225e5dd7070Spatrick auto L = Last.range(SM);
226e5dd7070Spatrick assert(F.file() == L.file() && "tokens from different files");
227ec727ea7Spatrick assert((F == L || F.endOffset() <= L.beginOffset()) &&
228ec727ea7Spatrick "wrong order of tokens");
229e5dd7070Spatrick return FileRange(F.file(), F.beginOffset(), L.endOffset());
230e5dd7070Spatrick }
231e5dd7070Spatrick
operator <<(llvm::raw_ostream & OS,const Token & T)232e5dd7070Spatrick llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS, const Token &T) {
233e5dd7070Spatrick return OS << T.str();
234e5dd7070Spatrick }
235e5dd7070Spatrick
FileRange(FileID File,unsigned BeginOffset,unsigned EndOffset)236e5dd7070Spatrick FileRange::FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset)
237e5dd7070Spatrick : File(File), Begin(BeginOffset), End(EndOffset) {
238e5dd7070Spatrick assert(File.isValid());
239e5dd7070Spatrick assert(BeginOffset <= EndOffset);
240e5dd7070Spatrick }
241e5dd7070Spatrick
FileRange(const SourceManager & SM,SourceLocation BeginLoc,unsigned Length)242e5dd7070Spatrick FileRange::FileRange(const SourceManager &SM, SourceLocation BeginLoc,
243e5dd7070Spatrick unsigned Length) {
244e5dd7070Spatrick assert(BeginLoc.isValid());
245e5dd7070Spatrick assert(BeginLoc.isFileID());
246e5dd7070Spatrick
247e5dd7070Spatrick std::tie(File, Begin) = SM.getDecomposedLoc(BeginLoc);
248e5dd7070Spatrick End = Begin + Length;
249e5dd7070Spatrick }
FileRange(const SourceManager & SM,SourceLocation BeginLoc,SourceLocation EndLoc)250e5dd7070Spatrick FileRange::FileRange(const SourceManager &SM, SourceLocation BeginLoc,
251e5dd7070Spatrick SourceLocation EndLoc) {
252e5dd7070Spatrick assert(BeginLoc.isValid());
253e5dd7070Spatrick assert(BeginLoc.isFileID());
254e5dd7070Spatrick assert(EndLoc.isValid());
255e5dd7070Spatrick assert(EndLoc.isFileID());
256e5dd7070Spatrick assert(SM.getFileID(BeginLoc) == SM.getFileID(EndLoc));
257e5dd7070Spatrick assert(SM.getFileOffset(BeginLoc) <= SM.getFileOffset(EndLoc));
258e5dd7070Spatrick
259e5dd7070Spatrick std::tie(File, Begin) = SM.getDecomposedLoc(BeginLoc);
260e5dd7070Spatrick End = SM.getFileOffset(EndLoc);
261e5dd7070Spatrick }
262e5dd7070Spatrick
operator <<(llvm::raw_ostream & OS,const FileRange & R)263e5dd7070Spatrick llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS,
264e5dd7070Spatrick const FileRange &R) {
265e5dd7070Spatrick return OS << llvm::formatv("FileRange(file = {0}, offsets = {1}-{2})",
266e5dd7070Spatrick R.file().getHashValue(), R.beginOffset(),
267e5dd7070Spatrick R.endOffset());
268e5dd7070Spatrick }
269e5dd7070Spatrick
text(const SourceManager & SM) const270e5dd7070Spatrick llvm::StringRef FileRange::text(const SourceManager &SM) const {
271e5dd7070Spatrick bool Invalid = false;
272e5dd7070Spatrick StringRef Text = SM.getBufferData(File, &Invalid);
273e5dd7070Spatrick if (Invalid)
274e5dd7070Spatrick return "";
275e5dd7070Spatrick assert(Begin <= Text.size());
276e5dd7070Spatrick assert(End <= Text.size());
277e5dd7070Spatrick return Text.substr(Begin, length());
278e5dd7070Spatrick }
279e5dd7070Spatrick
indexExpandedTokens()280a9ac8606Spatrick void TokenBuffer::indexExpandedTokens() {
281a9ac8606Spatrick // No-op if the index is already created.
282a9ac8606Spatrick if (!ExpandedTokIndex.empty())
283a9ac8606Spatrick return;
284a9ac8606Spatrick ExpandedTokIndex.reserve(ExpandedTokens.size());
285a9ac8606Spatrick // Index ExpandedTokens for faster lookups by SourceLocation.
286a9ac8606Spatrick for (size_t I = 0, E = ExpandedTokens.size(); I != E; ++I) {
287a9ac8606Spatrick SourceLocation Loc = ExpandedTokens[I].location();
288a9ac8606Spatrick if (Loc.isValid())
289a9ac8606Spatrick ExpandedTokIndex[Loc] = I;
290a9ac8606Spatrick }
291a9ac8606Spatrick }
292a9ac8606Spatrick
expandedTokens(SourceRange R) const293e5dd7070Spatrick llvm::ArrayRef<syntax::Token> TokenBuffer::expandedTokens(SourceRange R) const {
294a9ac8606Spatrick if (R.isInvalid())
295a9ac8606Spatrick return {};
296a9ac8606Spatrick if (!ExpandedTokIndex.empty()) {
297a9ac8606Spatrick // Quick lookup if `R` is a token range.
298a9ac8606Spatrick // This is a huge win since majority of the users use ranges provided by an
299a9ac8606Spatrick // AST. Ranges in AST are token ranges from expanded token stream.
300a9ac8606Spatrick const auto B = ExpandedTokIndex.find(R.getBegin());
301a9ac8606Spatrick const auto E = ExpandedTokIndex.find(R.getEnd());
302a9ac8606Spatrick if (B != ExpandedTokIndex.end() && E != ExpandedTokIndex.end()) {
303a9ac8606Spatrick const Token *L = ExpandedTokens.data() + B->getSecond();
304a9ac8606Spatrick // Add 1 to End to make a half-open range.
305a9ac8606Spatrick const Token *R = ExpandedTokens.data() + E->getSecond() + 1;
306a9ac8606Spatrick if (L > R)
307a9ac8606Spatrick return {};
308a9ac8606Spatrick return {L, R};
309a9ac8606Spatrick }
310a9ac8606Spatrick }
311a9ac8606Spatrick // Slow case. Use `isBeforeInTranslationUnit` to binary search for the
312a9ac8606Spatrick // required range.
313ec727ea7Spatrick return getTokensCovering(expandedTokens(), R, *SourceMgr);
314e5dd7070Spatrick }
315e5dd7070Spatrick
toCharRange(const SourceManager & SM) const316e5dd7070Spatrick CharSourceRange FileRange::toCharRange(const SourceManager &SM) const {
317e5dd7070Spatrick return CharSourceRange(
318e5dd7070Spatrick SourceRange(SM.getComposedLoc(File, Begin), SM.getComposedLoc(File, End)),
319e5dd7070Spatrick /*IsTokenRange=*/false);
320e5dd7070Spatrick }
321e5dd7070Spatrick
322e5dd7070Spatrick std::pair<const syntax::Token *, const TokenBuffer::Mapping *>
spelledForExpandedToken(const syntax::Token * Expanded) const323e5dd7070Spatrick TokenBuffer::spelledForExpandedToken(const syntax::Token *Expanded) const {
324e5dd7070Spatrick assert(Expanded);
325e5dd7070Spatrick assert(ExpandedTokens.data() <= Expanded &&
326e5dd7070Spatrick Expanded < ExpandedTokens.data() + ExpandedTokens.size());
327e5dd7070Spatrick
328e5dd7070Spatrick auto FileIt = Files.find(
329e5dd7070Spatrick SourceMgr->getFileID(SourceMgr->getExpansionLoc(Expanded->location())));
330e5dd7070Spatrick assert(FileIt != Files.end() && "no file for an expanded token");
331e5dd7070Spatrick
332e5dd7070Spatrick const MarkedFile &File = FileIt->second;
333e5dd7070Spatrick
334e5dd7070Spatrick unsigned ExpandedIndex = Expanded - ExpandedTokens.data();
335e5dd7070Spatrick // Find the first mapping that produced tokens after \p Expanded.
336e5dd7070Spatrick auto It = llvm::partition_point(File.Mappings, [&](const Mapping &M) {
337e5dd7070Spatrick return M.BeginExpanded <= ExpandedIndex;
338e5dd7070Spatrick });
339e5dd7070Spatrick // Our token could only be produced by the previous mapping.
340e5dd7070Spatrick if (It == File.Mappings.begin()) {
341e5dd7070Spatrick // No previous mapping, no need to modify offsets.
342ec727ea7Spatrick return {&File.SpelledTokens[ExpandedIndex - File.BeginExpanded],
343ec727ea7Spatrick /*Mapping=*/nullptr};
344e5dd7070Spatrick }
345e5dd7070Spatrick --It; // 'It' now points to last mapping that started before our token.
346e5dd7070Spatrick
347e5dd7070Spatrick // Check if the token is part of the mapping.
348e5dd7070Spatrick if (ExpandedIndex < It->EndExpanded)
349ec727ea7Spatrick return {&File.SpelledTokens[It->BeginSpelled], /*Mapping=*/&*It};
350e5dd7070Spatrick
351e5dd7070Spatrick // Not part of the mapping, use the index from previous mapping to compute the
352e5dd7070Spatrick // corresponding spelled token.
353e5dd7070Spatrick return {
354e5dd7070Spatrick &File.SpelledTokens[It->EndSpelled + (ExpandedIndex - It->EndExpanded)],
355ec727ea7Spatrick /*Mapping=*/nullptr};
356ec727ea7Spatrick }
357ec727ea7Spatrick
358ec727ea7Spatrick const TokenBuffer::Mapping *
mappingStartingBeforeSpelled(const MarkedFile & F,const syntax::Token * Spelled)359ec727ea7Spatrick TokenBuffer::mappingStartingBeforeSpelled(const MarkedFile &F,
360ec727ea7Spatrick const syntax::Token *Spelled) {
361ec727ea7Spatrick assert(F.SpelledTokens.data() <= Spelled);
362ec727ea7Spatrick unsigned SpelledI = Spelled - F.SpelledTokens.data();
363ec727ea7Spatrick assert(SpelledI < F.SpelledTokens.size());
364ec727ea7Spatrick
365ec727ea7Spatrick auto It = llvm::partition_point(F.Mappings, [SpelledI](const Mapping &M) {
366ec727ea7Spatrick return M.BeginSpelled <= SpelledI;
367ec727ea7Spatrick });
368ec727ea7Spatrick if (It == F.Mappings.begin())
369ec727ea7Spatrick return nullptr;
370ec727ea7Spatrick --It;
371ec727ea7Spatrick return &*It;
372ec727ea7Spatrick }
373ec727ea7Spatrick
374ec727ea7Spatrick llvm::SmallVector<llvm::ArrayRef<syntax::Token>, 1>
expandedForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const375ec727ea7Spatrick TokenBuffer::expandedForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const {
376ec727ea7Spatrick if (Spelled.empty())
377ec727ea7Spatrick return {};
378a9ac8606Spatrick const auto &File = fileForSpelled(Spelled);
379ec727ea7Spatrick
380ec727ea7Spatrick auto *FrontMapping = mappingStartingBeforeSpelled(File, &Spelled.front());
381ec727ea7Spatrick unsigned SpelledFrontI = &Spelled.front() - File.SpelledTokens.data();
382ec727ea7Spatrick assert(SpelledFrontI < File.SpelledTokens.size());
383ec727ea7Spatrick unsigned ExpandedBegin;
384ec727ea7Spatrick if (!FrontMapping) {
385ec727ea7Spatrick // No mapping that starts before the first token of Spelled, we don't have
386ec727ea7Spatrick // to modify offsets.
387ec727ea7Spatrick ExpandedBegin = File.BeginExpanded + SpelledFrontI;
388ec727ea7Spatrick } else if (SpelledFrontI < FrontMapping->EndSpelled) {
389ec727ea7Spatrick // This mapping applies to Spelled tokens.
390ec727ea7Spatrick if (SpelledFrontI != FrontMapping->BeginSpelled) {
391ec727ea7Spatrick // Spelled tokens don't cover the entire mapping, returning empty result.
392ec727ea7Spatrick return {}; // FIXME: support macro arguments.
393ec727ea7Spatrick }
394ec727ea7Spatrick // Spelled tokens start at the beginning of this mapping.
395ec727ea7Spatrick ExpandedBegin = FrontMapping->BeginExpanded;
396ec727ea7Spatrick } else {
397ec727ea7Spatrick // Spelled tokens start after the mapping ends (they start in the hole
398ec727ea7Spatrick // between 2 mappings, or between a mapping and end of the file).
399ec727ea7Spatrick ExpandedBegin =
400ec727ea7Spatrick FrontMapping->EndExpanded + (SpelledFrontI - FrontMapping->EndSpelled);
401ec727ea7Spatrick }
402ec727ea7Spatrick
403ec727ea7Spatrick auto *BackMapping = mappingStartingBeforeSpelled(File, &Spelled.back());
404ec727ea7Spatrick unsigned SpelledBackI = &Spelled.back() - File.SpelledTokens.data();
405ec727ea7Spatrick unsigned ExpandedEnd;
406ec727ea7Spatrick if (!BackMapping) {
407ec727ea7Spatrick // No mapping that starts before the last token of Spelled, we don't have to
408ec727ea7Spatrick // modify offsets.
409ec727ea7Spatrick ExpandedEnd = File.BeginExpanded + SpelledBackI + 1;
410ec727ea7Spatrick } else if (SpelledBackI < BackMapping->EndSpelled) {
411ec727ea7Spatrick // This mapping applies to Spelled tokens.
412ec727ea7Spatrick if (SpelledBackI + 1 != BackMapping->EndSpelled) {
413ec727ea7Spatrick // Spelled tokens don't cover the entire mapping, returning empty result.
414ec727ea7Spatrick return {}; // FIXME: support macro arguments.
415ec727ea7Spatrick }
416ec727ea7Spatrick ExpandedEnd = BackMapping->EndExpanded;
417ec727ea7Spatrick } else {
418ec727ea7Spatrick // Spelled tokens end after the mapping ends.
419ec727ea7Spatrick ExpandedEnd =
420ec727ea7Spatrick BackMapping->EndExpanded + (SpelledBackI - BackMapping->EndSpelled) + 1;
421ec727ea7Spatrick }
422ec727ea7Spatrick
423ec727ea7Spatrick assert(ExpandedBegin < ExpandedTokens.size());
424ec727ea7Spatrick assert(ExpandedEnd < ExpandedTokens.size());
425ec727ea7Spatrick // Avoid returning empty ranges.
426ec727ea7Spatrick if (ExpandedBegin == ExpandedEnd)
427ec727ea7Spatrick return {};
428*12c85518Srobert return {llvm::ArrayRef(ExpandedTokens.data() + ExpandedBegin,
429ec727ea7Spatrick ExpandedTokens.data() + ExpandedEnd)};
430e5dd7070Spatrick }
431e5dd7070Spatrick
spelledTokens(FileID FID) const432e5dd7070Spatrick llvm::ArrayRef<syntax::Token> TokenBuffer::spelledTokens(FileID FID) const {
433e5dd7070Spatrick auto It = Files.find(FID);
434e5dd7070Spatrick assert(It != Files.end());
435e5dd7070Spatrick return It->second.SpelledTokens;
436e5dd7070Spatrick }
437e5dd7070Spatrick
spelledTokenAt(SourceLocation Loc) const438ec727ea7Spatrick const syntax::Token *TokenBuffer::spelledTokenAt(SourceLocation Loc) const {
439ec727ea7Spatrick assert(Loc.isFileID());
440ec727ea7Spatrick const auto *Tok = llvm::partition_point(
441ec727ea7Spatrick spelledTokens(SourceMgr->getFileID(Loc)),
442ec727ea7Spatrick [&](const syntax::Token &Tok) { return Tok.location() < Loc; });
443ec727ea7Spatrick if (!Tok || Tok->location() != Loc)
444ec727ea7Spatrick return nullptr;
445ec727ea7Spatrick return Tok;
446ec727ea7Spatrick }
447ec727ea7Spatrick
str() const448e5dd7070Spatrick std::string TokenBuffer::Mapping::str() const {
449ec727ea7Spatrick return std::string(
450ec727ea7Spatrick llvm::formatv("spelled tokens: [{0},{1}), expanded tokens: [{2},{3})",
451ec727ea7Spatrick BeginSpelled, EndSpelled, BeginExpanded, EndExpanded));
452e5dd7070Spatrick }
453e5dd7070Spatrick
454*12c85518Srobert std::optional<llvm::ArrayRef<syntax::Token>>
spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const455e5dd7070Spatrick TokenBuffer::spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const {
456e5dd7070Spatrick // Mapping an empty range is ambiguous in case of empty mappings at either end
457e5dd7070Spatrick // of the range, bail out in that case.
458e5dd7070Spatrick if (Expanded.empty())
459*12c85518Srobert return std::nullopt;
460*12c85518Srobert const syntax::Token *First = &Expanded.front();
461*12c85518Srobert const syntax::Token *Last = &Expanded.back();
462*12c85518Srobert auto [FirstSpelled, FirstMapping] = spelledForExpandedToken(First);
463*12c85518Srobert auto [LastSpelled, LastMapping] = spelledForExpandedToken(Last);
464e5dd7070Spatrick
465*12c85518Srobert FileID FID = SourceMgr->getFileID(FirstSpelled->location());
466e5dd7070Spatrick // FIXME: Handle multi-file changes by trying to map onto a common root.
467e5dd7070Spatrick if (FID != SourceMgr->getFileID(LastSpelled->location()))
468*12c85518Srobert return std::nullopt;
469e5dd7070Spatrick
470e5dd7070Spatrick const MarkedFile &File = Files.find(FID)->second;
471e5dd7070Spatrick
472*12c85518Srobert // If the range is within one macro argument, the result may be only part of a
473*12c85518Srobert // Mapping. We must use the general (SourceManager-based) algorithm.
474*12c85518Srobert if (FirstMapping && FirstMapping == LastMapping &&
475*12c85518Srobert SourceMgr->isMacroArgExpansion(First->location()) &&
476*12c85518Srobert SourceMgr->isMacroArgExpansion(Last->location())) {
477*12c85518Srobert // We use excluded Prev/Next token for bounds checking.
478*12c85518Srobert SourceLocation Prev = (First == &ExpandedTokens.front())
479*12c85518Srobert ? SourceLocation()
480*12c85518Srobert : (First - 1)->location();
481*12c85518Srobert SourceLocation Next = (Last == &ExpandedTokens.back())
482*12c85518Srobert ? SourceLocation()
483*12c85518Srobert : (Last + 1)->location();
484*12c85518Srobert SourceRange Range = spelledForExpandedSlow(
485*12c85518Srobert First->location(), Last->location(), Prev, Next, FID, *SourceMgr);
486*12c85518Srobert if (Range.isInvalid())
487*12c85518Srobert return std::nullopt;
488*12c85518Srobert return getTokensCovering(File.SpelledTokens, Range, *SourceMgr);
489ec727ea7Spatrick }
490ec727ea7Spatrick
491*12c85518Srobert // Otherwise, use the fast version based on Mappings.
492ec727ea7Spatrick // Do not allow changes that doesn't cover full expansion.
493*12c85518Srobert unsigned FirstExpanded = Expanded.begin() - ExpandedTokens.data();
494*12c85518Srobert unsigned LastExpanded = Expanded.end() - ExpandedTokens.data();
495*12c85518Srobert if (FirstMapping && FirstExpanded != FirstMapping->BeginExpanded)
496*12c85518Srobert return std::nullopt;
497*12c85518Srobert if (LastMapping && LastMapping->EndExpanded != LastExpanded)
498*12c85518Srobert return std::nullopt;
499*12c85518Srobert return llvm::ArrayRef(
500*12c85518Srobert FirstMapping ? File.SpelledTokens.data() + FirstMapping->BeginSpelled
501*12c85518Srobert : FirstSpelled,
502e5dd7070Spatrick LastMapping ? File.SpelledTokens.data() + LastMapping->EndSpelled
503e5dd7070Spatrick : LastSpelled + 1);
504e5dd7070Spatrick }
505e5dd7070Spatrick
makeExpansion(const MarkedFile & F,const Mapping & M) const506a9ac8606Spatrick TokenBuffer::Expansion TokenBuffer::makeExpansion(const MarkedFile &F,
507a9ac8606Spatrick const Mapping &M) const {
508a9ac8606Spatrick Expansion E;
509*12c85518Srobert E.Spelled = llvm::ArrayRef(F.SpelledTokens.data() + M.BeginSpelled,
510a9ac8606Spatrick F.SpelledTokens.data() + M.EndSpelled);
511*12c85518Srobert E.Expanded = llvm::ArrayRef(ExpandedTokens.data() + M.BeginExpanded,
512a9ac8606Spatrick ExpandedTokens.data() + M.EndExpanded);
513a9ac8606Spatrick return E;
514a9ac8606Spatrick }
515a9ac8606Spatrick
516a9ac8606Spatrick const TokenBuffer::MarkedFile &
fileForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const517a9ac8606Spatrick TokenBuffer::fileForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const {
518a9ac8606Spatrick assert(!Spelled.empty());
519a9ac8606Spatrick assert(Spelled.front().location().isFileID() && "not a spelled token");
520a9ac8606Spatrick auto FileIt = Files.find(SourceMgr->getFileID(Spelled.front().location()));
521a9ac8606Spatrick assert(FileIt != Files.end() && "file not tracked by token buffer");
522a9ac8606Spatrick const auto &File = FileIt->second;
523a9ac8606Spatrick assert(File.SpelledTokens.data() <= Spelled.data() &&
524a9ac8606Spatrick Spelled.end() <=
525a9ac8606Spatrick (File.SpelledTokens.data() + File.SpelledTokens.size()) &&
526a9ac8606Spatrick "Tokens not in spelled range");
527a9ac8606Spatrick #ifndef NDEBUG
528a9ac8606Spatrick auto T1 = Spelled.back().location();
529a9ac8606Spatrick auto T2 = File.SpelledTokens.back().location();
530a9ac8606Spatrick assert(T1 == T2 || sourceManager().isBeforeInTranslationUnit(T1, T2));
531a9ac8606Spatrick #endif
532a9ac8606Spatrick return File;
533a9ac8606Spatrick }
534a9ac8606Spatrick
535*12c85518Srobert std::optional<TokenBuffer::Expansion>
expansionStartingAt(const syntax::Token * Spelled) const536e5dd7070Spatrick TokenBuffer::expansionStartingAt(const syntax::Token *Spelled) const {
537e5dd7070Spatrick assert(Spelled);
538a9ac8606Spatrick const auto &File = fileForSpelled(*Spelled);
539e5dd7070Spatrick
540e5dd7070Spatrick unsigned SpelledIndex = Spelled - File.SpelledTokens.data();
541e5dd7070Spatrick auto M = llvm::partition_point(File.Mappings, [&](const Mapping &M) {
542e5dd7070Spatrick return M.BeginSpelled < SpelledIndex;
543e5dd7070Spatrick });
544e5dd7070Spatrick if (M == File.Mappings.end() || M->BeginSpelled != SpelledIndex)
545*12c85518Srobert return std::nullopt;
546a9ac8606Spatrick return makeExpansion(File, *M);
547e5dd7070Spatrick }
548a9ac8606Spatrick
expansionsOverlapping(llvm::ArrayRef<syntax::Token> Spelled) const549a9ac8606Spatrick std::vector<TokenBuffer::Expansion> TokenBuffer::expansionsOverlapping(
550a9ac8606Spatrick llvm::ArrayRef<syntax::Token> Spelled) const {
551a9ac8606Spatrick if (Spelled.empty())
552a9ac8606Spatrick return {};
553a9ac8606Spatrick const auto &File = fileForSpelled(Spelled);
554a9ac8606Spatrick
555a9ac8606Spatrick // Find the first overlapping range, and then copy until we stop overlapping.
556a9ac8606Spatrick unsigned SpelledBeginIndex = Spelled.begin() - File.SpelledTokens.data();
557a9ac8606Spatrick unsigned SpelledEndIndex = Spelled.end() - File.SpelledTokens.data();
558a9ac8606Spatrick auto M = llvm::partition_point(File.Mappings, [&](const Mapping &M) {
559a9ac8606Spatrick return M.EndSpelled <= SpelledBeginIndex;
560a9ac8606Spatrick });
561a9ac8606Spatrick std::vector<TokenBuffer::Expansion> Expansions;
562a9ac8606Spatrick for (; M != File.Mappings.end() && M->BeginSpelled < SpelledEndIndex; ++M)
563a9ac8606Spatrick Expansions.push_back(makeExpansion(File, *M));
564a9ac8606Spatrick return Expansions;
565a9ac8606Spatrick }
566a9ac8606Spatrick
567e5dd7070Spatrick llvm::ArrayRef<syntax::Token>
spelledTokensTouching(SourceLocation Loc,llvm::ArrayRef<syntax::Token> Tokens)568e5dd7070Spatrick syntax::spelledTokensTouching(SourceLocation Loc,
569ec727ea7Spatrick llvm::ArrayRef<syntax::Token> Tokens) {
570e5dd7070Spatrick assert(Loc.isFileID());
571ec727ea7Spatrick
572e5dd7070Spatrick auto *Right = llvm::partition_point(
573ec727ea7Spatrick Tokens, [&](const syntax::Token &Tok) { return Tok.location() < Loc; });
574ec727ea7Spatrick bool AcceptRight = Right != Tokens.end() && Right->location() <= Loc;
575ec727ea7Spatrick bool AcceptLeft =
576ec727ea7Spatrick Right != Tokens.begin() && (Right - 1)->endLocation() >= Loc;
577*12c85518Srobert return llvm::ArrayRef(Right - (AcceptLeft ? 1 : 0),
578e5dd7070Spatrick Right + (AcceptRight ? 1 : 0));
579e5dd7070Spatrick }
580e5dd7070Spatrick
581ec727ea7Spatrick llvm::ArrayRef<syntax::Token>
spelledTokensTouching(SourceLocation Loc,const syntax::TokenBuffer & Tokens)582ec727ea7Spatrick syntax::spelledTokensTouching(SourceLocation Loc,
583ec727ea7Spatrick const syntax::TokenBuffer &Tokens) {
584ec727ea7Spatrick return spelledTokensTouching(
585ec727ea7Spatrick Loc, Tokens.spelledTokens(Tokens.sourceManager().getFileID(Loc)));
586ec727ea7Spatrick }
587ec727ea7Spatrick
588e5dd7070Spatrick const syntax::Token *
spelledIdentifierTouching(SourceLocation Loc,llvm::ArrayRef<syntax::Token> Tokens)589e5dd7070Spatrick syntax::spelledIdentifierTouching(SourceLocation Loc,
590ec727ea7Spatrick llvm::ArrayRef<syntax::Token> Tokens) {
591e5dd7070Spatrick for (const syntax::Token &Tok : spelledTokensTouching(Loc, Tokens)) {
592e5dd7070Spatrick if (Tok.kind() == tok::identifier)
593e5dd7070Spatrick return &Tok;
594e5dd7070Spatrick }
595e5dd7070Spatrick return nullptr;
596e5dd7070Spatrick }
597e5dd7070Spatrick
598ec727ea7Spatrick const syntax::Token *
spelledIdentifierTouching(SourceLocation Loc,const syntax::TokenBuffer & Tokens)599ec727ea7Spatrick syntax::spelledIdentifierTouching(SourceLocation Loc,
600ec727ea7Spatrick const syntax::TokenBuffer &Tokens) {
601ec727ea7Spatrick return spelledIdentifierTouching(
602ec727ea7Spatrick Loc, Tokens.spelledTokens(Tokens.sourceManager().getFileID(Loc)));
603ec727ea7Spatrick }
604ec727ea7Spatrick
605e5dd7070Spatrick std::vector<const syntax::Token *>
macroExpansions(FileID FID) const606e5dd7070Spatrick TokenBuffer::macroExpansions(FileID FID) const {
607e5dd7070Spatrick auto FileIt = Files.find(FID);
608e5dd7070Spatrick assert(FileIt != Files.end() && "file not tracked by token buffer");
609e5dd7070Spatrick auto &File = FileIt->second;
610e5dd7070Spatrick std::vector<const syntax::Token *> Expansions;
611e5dd7070Spatrick auto &Spelled = File.SpelledTokens;
612e5dd7070Spatrick for (auto Mapping : File.Mappings) {
613e5dd7070Spatrick const syntax::Token *Token = &Spelled[Mapping.BeginSpelled];
614e5dd7070Spatrick if (Token->kind() == tok::TokenKind::identifier)
615e5dd7070Spatrick Expansions.push_back(Token);
616e5dd7070Spatrick }
617e5dd7070Spatrick return Expansions;
618e5dd7070Spatrick }
619e5dd7070Spatrick
tokenize(const FileRange & FR,const SourceManager & SM,const LangOptions & LO)620ec727ea7Spatrick std::vector<syntax::Token> syntax::tokenize(const FileRange &FR,
621ec727ea7Spatrick const SourceManager &SM,
622e5dd7070Spatrick const LangOptions &LO) {
623e5dd7070Spatrick std::vector<syntax::Token> Tokens;
624e5dd7070Spatrick IdentifierTable Identifiers(LO);
625e5dd7070Spatrick auto AddToken = [&](clang::Token T) {
626e5dd7070Spatrick // Fill the proper token kind for keywords, etc.
627e5dd7070Spatrick if (T.getKind() == tok::raw_identifier && !T.needsCleaning() &&
628e5dd7070Spatrick !T.hasUCN()) { // FIXME: support needsCleaning and hasUCN cases.
629e5dd7070Spatrick clang::IdentifierInfo &II = Identifiers.get(T.getRawIdentifier());
630e5dd7070Spatrick T.setIdentifierInfo(&II);
631e5dd7070Spatrick T.setKind(II.getTokenID());
632e5dd7070Spatrick }
633e5dd7070Spatrick Tokens.push_back(syntax::Token(T));
634e5dd7070Spatrick };
635e5dd7070Spatrick
636ec727ea7Spatrick auto SrcBuffer = SM.getBufferData(FR.file());
637ec727ea7Spatrick Lexer L(SM.getLocForStartOfFile(FR.file()), LO, SrcBuffer.data(),
638ec727ea7Spatrick SrcBuffer.data() + FR.beginOffset(),
639ec727ea7Spatrick // We can't make BufEnd point to FR.endOffset, as Lexer requires a
640ec727ea7Spatrick // null terminated buffer.
641ec727ea7Spatrick SrcBuffer.data() + SrcBuffer.size());
642e5dd7070Spatrick
643e5dd7070Spatrick clang::Token T;
644ec727ea7Spatrick while (!L.LexFromRawLexer(T) && L.getCurrentBufferOffset() < FR.endOffset())
645e5dd7070Spatrick AddToken(T);
646ec727ea7Spatrick // LexFromRawLexer returns true when it parses the last token of the file, add
647ec727ea7Spatrick // it iff it starts within the range we are interested in.
648ec727ea7Spatrick if (SM.getFileOffset(T.getLocation()) < FR.endOffset())
649e5dd7070Spatrick AddToken(T);
650e5dd7070Spatrick return Tokens;
651e5dd7070Spatrick }
652e5dd7070Spatrick
tokenize(FileID FID,const SourceManager & SM,const LangOptions & LO)653ec727ea7Spatrick std::vector<syntax::Token> syntax::tokenize(FileID FID, const SourceManager &SM,
654ec727ea7Spatrick const LangOptions &LO) {
655ec727ea7Spatrick return tokenize(syntax::FileRange(FID, 0, SM.getFileIDSize(FID)), SM, LO);
656ec727ea7Spatrick }
657ec727ea7Spatrick
658e5dd7070Spatrick /// Records information reqired to construct mappings for the token buffer that
659e5dd7070Spatrick /// we are collecting.
660e5dd7070Spatrick class TokenCollector::CollectPPExpansions : public PPCallbacks {
661e5dd7070Spatrick public:
CollectPPExpansions(TokenCollector & C)662e5dd7070Spatrick CollectPPExpansions(TokenCollector &C) : Collector(&C) {}
663e5dd7070Spatrick
664e5dd7070Spatrick /// Disabled instance will stop reporting anything to TokenCollector.
665e5dd7070Spatrick /// This ensures that uses of the preprocessor after TokenCollector::consume()
666e5dd7070Spatrick /// is called do not access the (possibly invalid) collector instance.
disable()667e5dd7070Spatrick void disable() { Collector = nullptr; }
668e5dd7070Spatrick
MacroExpands(const clang::Token & MacroNameTok,const MacroDefinition & MD,SourceRange Range,const MacroArgs * Args)669e5dd7070Spatrick void MacroExpands(const clang::Token &MacroNameTok, const MacroDefinition &MD,
670e5dd7070Spatrick SourceRange Range, const MacroArgs *Args) override {
671e5dd7070Spatrick if (!Collector)
672e5dd7070Spatrick return;
673389bb291Spatrick const auto &SM = Collector->PP.getSourceManager();
674389bb291Spatrick // Only record top-level expansions that directly produce expanded tokens.
675389bb291Spatrick // This excludes those where:
676e5dd7070Spatrick // - the macro use is inside a macro body,
677e5dd7070Spatrick // - the macro appears in an argument to another macro.
678389bb291Spatrick // However macro expansion isn't really a tree, it's token rewrite rules,
679389bb291Spatrick // so there are other cases, e.g.
680389bb291Spatrick // #define B(X) X
681389bb291Spatrick // #define A 1 + B
682389bb291Spatrick // A(2)
683389bb291Spatrick // Both A and B produce expanded tokens, though the macro name 'B' comes
684389bb291Spatrick // from an expansion. The best we can do is merge the mappings for both.
685389bb291Spatrick
686389bb291Spatrick // The *last* token of any top-level macro expansion must be in a file.
687389bb291Spatrick // (In the example above, see the closing paren of the expansion of B).
688389bb291Spatrick if (!Range.getEnd().isFileID())
689e5dd7070Spatrick return;
690389bb291Spatrick // If there's a current expansion that encloses this one, this one can't be
691389bb291Spatrick // top-level.
692389bb291Spatrick if (LastExpansionEnd.isValid() &&
693389bb291Spatrick !SM.isBeforeInTranslationUnit(LastExpansionEnd, Range.getEnd()))
694389bb291Spatrick return;
695389bb291Spatrick
696389bb291Spatrick // If the macro invocation (B) starts in a macro (A) but ends in a file,
697389bb291Spatrick // we'll create a merged mapping for A + B by overwriting the endpoint for
698389bb291Spatrick // A's startpoint.
699389bb291Spatrick if (!Range.getBegin().isFileID()) {
700389bb291Spatrick Range.setBegin(SM.getExpansionLoc(Range.getBegin()));
701a9ac8606Spatrick assert(Collector->Expansions.count(Range.getBegin()) &&
702389bb291Spatrick "Overlapping macros should have same expansion location");
703389bb291Spatrick }
704389bb291Spatrick
705a9ac8606Spatrick Collector->Expansions[Range.getBegin()] = Range.getEnd();
706e5dd7070Spatrick LastExpansionEnd = Range.getEnd();
707e5dd7070Spatrick }
708e5dd7070Spatrick // FIXME: handle directives like #pragma, #include, etc.
709e5dd7070Spatrick private:
710e5dd7070Spatrick TokenCollector *Collector;
711e5dd7070Spatrick /// Used to detect recursive macro expansions.
712e5dd7070Spatrick SourceLocation LastExpansionEnd;
713e5dd7070Spatrick };
714e5dd7070Spatrick
715e5dd7070Spatrick /// Fills in the TokenBuffer by tracing the run of a preprocessor. The
716e5dd7070Spatrick /// implementation tracks the tokens, macro expansions and directives coming
717e5dd7070Spatrick /// from the preprocessor and:
718e5dd7070Spatrick /// - for each token, figures out if it is a part of an expanded token stream,
719e5dd7070Spatrick /// spelled token stream or both. Stores the tokens appropriately.
720e5dd7070Spatrick /// - records mappings from the spelled to expanded token ranges, e.g. for macro
721e5dd7070Spatrick /// expansions.
722e5dd7070Spatrick /// FIXME: also properly record:
723e5dd7070Spatrick /// - #include directives,
724e5dd7070Spatrick /// - #pragma, #line and other PP directives,
725e5dd7070Spatrick /// - skipped pp regions,
726e5dd7070Spatrick /// - ...
727e5dd7070Spatrick
TokenCollector(Preprocessor & PP)728e5dd7070Spatrick TokenCollector::TokenCollector(Preprocessor &PP) : PP(PP) {
729e5dd7070Spatrick // Collect the expanded token stream during preprocessing.
730e5dd7070Spatrick PP.setTokenWatcher([this](const clang::Token &T) {
731e5dd7070Spatrick if (T.isAnnotation())
732e5dd7070Spatrick return;
733e5dd7070Spatrick DEBUG_WITH_TYPE("collect-tokens", llvm::dbgs()
734e5dd7070Spatrick << "Token: "
735e5dd7070Spatrick << syntax::Token(T).dumpForTests(
736e5dd7070Spatrick this->PP.getSourceManager())
737e5dd7070Spatrick << "\n"
738e5dd7070Spatrick
739e5dd7070Spatrick );
740e5dd7070Spatrick Expanded.push_back(syntax::Token(T));
741e5dd7070Spatrick });
742e5dd7070Spatrick // And locations of macro calls, to properly recover boundaries of those in
743e5dd7070Spatrick // case of empty expansions.
744e5dd7070Spatrick auto CB = std::make_unique<CollectPPExpansions>(*this);
745e5dd7070Spatrick this->Collector = CB.get();
746e5dd7070Spatrick PP.addPPCallbacks(std::move(CB));
747e5dd7070Spatrick }
748e5dd7070Spatrick
749e5dd7070Spatrick /// Builds mappings and spelled tokens in the TokenBuffer based on the expanded
750e5dd7070Spatrick /// token stream.
751e5dd7070Spatrick class TokenCollector::Builder {
752e5dd7070Spatrick public:
Builder(std::vector<syntax::Token> Expanded,PPExpansions CollectedExpansions,const SourceManager & SM,const LangOptions & LangOpts)753e5dd7070Spatrick Builder(std::vector<syntax::Token> Expanded, PPExpansions CollectedExpansions,
754e5dd7070Spatrick const SourceManager &SM, const LangOptions &LangOpts)
755e5dd7070Spatrick : Result(SM), CollectedExpansions(std::move(CollectedExpansions)), SM(SM),
756e5dd7070Spatrick LangOpts(LangOpts) {
757e5dd7070Spatrick Result.ExpandedTokens = std::move(Expanded);
758e5dd7070Spatrick }
759e5dd7070Spatrick
build()760e5dd7070Spatrick TokenBuffer build() && {
761e5dd7070Spatrick assert(!Result.ExpandedTokens.empty());
762e5dd7070Spatrick assert(Result.ExpandedTokens.back().kind() == tok::eof);
763389bb291Spatrick
764389bb291Spatrick // Tokenize every file that contributed tokens to the expanded stream.
765389bb291Spatrick buildSpelledTokens();
766389bb291Spatrick
767389bb291Spatrick // The expanded token stream consists of runs of tokens that came from
768389bb291Spatrick // the same source (a macro expansion, part of a file etc).
769389bb291Spatrick // Between these runs are the logical positions of spelled tokens that
770389bb291Spatrick // didn't expand to anything.
771389bb291Spatrick while (NextExpanded < Result.ExpandedTokens.size() - 1 /* eof */) {
772389bb291Spatrick // Create empty mappings for spelled tokens that expanded to nothing here.
773389bb291Spatrick // May advance NextSpelled, but NextExpanded is unchanged.
774389bb291Spatrick discard();
775389bb291Spatrick // Create mapping for a contiguous run of expanded tokens.
776389bb291Spatrick // Advances NextExpanded past the run, and NextSpelled accordingly.
777389bb291Spatrick unsigned OldPosition = NextExpanded;
778389bb291Spatrick advance();
779389bb291Spatrick if (NextExpanded == OldPosition)
780389bb291Spatrick diagnoseAdvanceFailure();
781e5dd7070Spatrick }
782389bb291Spatrick // If any tokens remain in any of the files, they didn't expand to anything.
783389bb291Spatrick // Create empty mappings up until the end of the file.
784389bb291Spatrick for (const auto &File : Result.Files)
785389bb291Spatrick discard(File.first);
786e5dd7070Spatrick
787ec727ea7Spatrick #ifndef NDEBUG
788ec727ea7Spatrick for (auto &pair : Result.Files) {
789ec727ea7Spatrick auto &mappings = pair.second.Mappings;
790ec727ea7Spatrick assert(llvm::is_sorted(mappings, [](const TokenBuffer::Mapping &M1,
791ec727ea7Spatrick const TokenBuffer::Mapping &M2) {
792ec727ea7Spatrick return M1.BeginSpelled < M2.BeginSpelled &&
793ec727ea7Spatrick M1.EndSpelled < M2.EndSpelled &&
794ec727ea7Spatrick M1.BeginExpanded < M2.BeginExpanded &&
795ec727ea7Spatrick M1.EndExpanded < M2.EndExpanded;
796ec727ea7Spatrick }));
797ec727ea7Spatrick }
798ec727ea7Spatrick #endif
799ec727ea7Spatrick
800e5dd7070Spatrick return std::move(Result);
801e5dd7070Spatrick }
802e5dd7070Spatrick
803e5dd7070Spatrick private:
804389bb291Spatrick // Consume a sequence of spelled tokens that didn't expand to anything.
805389bb291Spatrick // In the simplest case, skips spelled tokens until finding one that produced
806389bb291Spatrick // the NextExpanded token, and creates an empty mapping for them.
807389bb291Spatrick // If Drain is provided, skips remaining tokens from that file instead.
discard(std::optional<FileID> Drain=std::nullopt)808*12c85518Srobert void discard(std::optional<FileID> Drain = std::nullopt) {
809389bb291Spatrick SourceLocation Target =
810389bb291Spatrick Drain ? SM.getLocForEndOfFile(*Drain)
811389bb291Spatrick : SM.getExpansionLoc(
812389bb291Spatrick Result.ExpandedTokens[NextExpanded].location());
813389bb291Spatrick FileID File = SM.getFileID(Target);
814389bb291Spatrick const auto &SpelledTokens = Result.Files[File].SpelledTokens;
815389bb291Spatrick auto &NextSpelled = this->NextSpelled[File];
816389bb291Spatrick
817389bb291Spatrick TokenBuffer::Mapping Mapping;
818389bb291Spatrick Mapping.BeginSpelled = NextSpelled;
819389bb291Spatrick // When dropping trailing tokens from a file, the empty mapping should
820389bb291Spatrick // be positioned within the file's expanded-token range (at the end).
821389bb291Spatrick Mapping.BeginExpanded = Mapping.EndExpanded =
822389bb291Spatrick Drain ? Result.Files[*Drain].EndExpanded : NextExpanded;
823389bb291Spatrick // We may want to split into several adjacent empty mappings.
824389bb291Spatrick // FlushMapping() emits the current mapping and starts a new one.
825389bb291Spatrick auto FlushMapping = [&, this] {
826389bb291Spatrick Mapping.EndSpelled = NextSpelled;
827389bb291Spatrick if (Mapping.BeginSpelled != Mapping.EndSpelled)
828389bb291Spatrick Result.Files[File].Mappings.push_back(Mapping);
829389bb291Spatrick Mapping.BeginSpelled = NextSpelled;
830389bb291Spatrick };
831389bb291Spatrick
832389bb291Spatrick while (NextSpelled < SpelledTokens.size() &&
833389bb291Spatrick SpelledTokens[NextSpelled].location() < Target) {
834389bb291Spatrick // If we know mapping bounds at [NextSpelled, KnownEnd] (macro expansion)
835389bb291Spatrick // then we want to partition our (empty) mapping.
836389bb291Spatrick // [Start, NextSpelled) [NextSpelled, KnownEnd] (KnownEnd, Target)
837a9ac8606Spatrick SourceLocation KnownEnd =
838a9ac8606Spatrick CollectedExpansions.lookup(SpelledTokens[NextSpelled].location());
839389bb291Spatrick if (KnownEnd.isValid()) {
840389bb291Spatrick FlushMapping(); // Emits [Start, NextSpelled)
841389bb291Spatrick while (NextSpelled < SpelledTokens.size() &&
842389bb291Spatrick SpelledTokens[NextSpelled].location() <= KnownEnd)
843389bb291Spatrick ++NextSpelled;
844389bb291Spatrick FlushMapping(); // Emits [NextSpelled, KnownEnd]
845*12c85518Srobert // Now the loop continues and will emit (KnownEnd, Target).
846389bb291Spatrick } else {
847389bb291Spatrick ++NextSpelled;
848e5dd7070Spatrick }
849389bb291Spatrick }
850389bb291Spatrick FlushMapping();
851389bb291Spatrick }
852e5dd7070Spatrick
853389bb291Spatrick // Consumes the NextExpanded token and others that are part of the same run.
854389bb291Spatrick // Increases NextExpanded and NextSpelled by at least one, and adds a mapping
855389bb291Spatrick // (unless this is a run of file tokens, which we represent with no mapping).
advance()856389bb291Spatrick void advance() {
857389bb291Spatrick const syntax::Token &Tok = Result.ExpandedTokens[NextExpanded];
858389bb291Spatrick SourceLocation Expansion = SM.getExpansionLoc(Tok.location());
859389bb291Spatrick FileID File = SM.getFileID(Expansion);
860389bb291Spatrick const auto &SpelledTokens = Result.Files[File].SpelledTokens;
861389bb291Spatrick auto &NextSpelled = this->NextSpelled[File];
862e5dd7070Spatrick
863389bb291Spatrick if (Tok.location().isFileID()) {
864389bb291Spatrick // A run of file tokens continues while the expanded/spelled tokens match.
865389bb291Spatrick while (NextSpelled < SpelledTokens.size() &&
866389bb291Spatrick NextExpanded < Result.ExpandedTokens.size() &&
867389bb291Spatrick SpelledTokens[NextSpelled].location() ==
868389bb291Spatrick Result.ExpandedTokens[NextExpanded].location()) {
869389bb291Spatrick ++NextSpelled;
870389bb291Spatrick ++NextExpanded;
871389bb291Spatrick }
872389bb291Spatrick // We need no mapping for file tokens copied to the expanded stream.
873389bb291Spatrick } else {
874389bb291Spatrick // We found a new macro expansion. We should have its spelling bounds.
875a9ac8606Spatrick auto End = CollectedExpansions.lookup(Expansion);
876389bb291Spatrick assert(End.isValid() && "Macro expansion wasn't captured?");
877389bb291Spatrick
878389bb291Spatrick // Mapping starts here...
879389bb291Spatrick TokenBuffer::Mapping Mapping;
880389bb291Spatrick Mapping.BeginExpanded = NextExpanded;
881389bb291Spatrick Mapping.BeginSpelled = NextSpelled;
882389bb291Spatrick // ... consumes spelled tokens within bounds we captured ...
883389bb291Spatrick while (NextSpelled < SpelledTokens.size() &&
884389bb291Spatrick SpelledTokens[NextSpelled].location() <= End)
885389bb291Spatrick ++NextSpelled;
886389bb291Spatrick // ... consumes expanded tokens rooted at the same expansion ...
887389bb291Spatrick while (NextExpanded < Result.ExpandedTokens.size() &&
888389bb291Spatrick SM.getExpansionLoc(
889389bb291Spatrick Result.ExpandedTokens[NextExpanded].location()) == Expansion)
890389bb291Spatrick ++NextExpanded;
891389bb291Spatrick // ... and ends here.
892389bb291Spatrick Mapping.EndExpanded = NextExpanded;
893389bb291Spatrick Mapping.EndSpelled = NextSpelled;
894389bb291Spatrick Result.Files[File].Mappings.push_back(Mapping);
895e5dd7070Spatrick }
896e5dd7070Spatrick }
897e5dd7070Spatrick
898389bb291Spatrick // advance() is supposed to consume at least one token - if not, we crash.
diagnoseAdvanceFailure()899389bb291Spatrick void diagnoseAdvanceFailure() {
900389bb291Spatrick #ifndef NDEBUG
901389bb291Spatrick // Show the failed-to-map token in context.
902389bb291Spatrick for (unsigned I = (NextExpanded < 10) ? 0 : NextExpanded - 10;
903389bb291Spatrick I < NextExpanded + 5 && I < Result.ExpandedTokens.size(); ++I) {
904389bb291Spatrick const char *L =
905389bb291Spatrick (I == NextExpanded) ? "!! " : (I < NextExpanded) ? "ok " : " ";
906389bb291Spatrick llvm::errs() << L << Result.ExpandedTokens[I].dumpForTests(SM) << "\n";
907e5dd7070Spatrick }
908389bb291Spatrick #endif
909389bb291Spatrick llvm_unreachable("Couldn't map expanded token to spelled tokens!");
910e5dd7070Spatrick }
911e5dd7070Spatrick
912e5dd7070Spatrick /// Initializes TokenBuffer::Files and fills spelled tokens and expanded
913e5dd7070Spatrick /// ranges for each of the files.
buildSpelledTokens()914e5dd7070Spatrick void buildSpelledTokens() {
915e5dd7070Spatrick for (unsigned I = 0; I < Result.ExpandedTokens.size(); ++I) {
916389bb291Spatrick const auto &Tok = Result.ExpandedTokens[I];
917389bb291Spatrick auto FID = SM.getFileID(SM.getExpansionLoc(Tok.location()));
918e5dd7070Spatrick auto It = Result.Files.try_emplace(FID);
919e5dd7070Spatrick TokenBuffer::MarkedFile &File = It.first->second;
920e5dd7070Spatrick
921389bb291Spatrick // The eof token should not be considered part of the main-file's range.
922389bb291Spatrick File.EndExpanded = Tok.kind() == tok::eof ? I : I + 1;
923389bb291Spatrick
924e5dd7070Spatrick if (!It.second)
925e5dd7070Spatrick continue; // we have seen this file before.
926e5dd7070Spatrick // This is the first time we see this file.
927e5dd7070Spatrick File.BeginExpanded = I;
928e5dd7070Spatrick File.SpelledTokens = tokenize(FID, SM, LangOpts);
929e5dd7070Spatrick }
930e5dd7070Spatrick }
931e5dd7070Spatrick
932e5dd7070Spatrick TokenBuffer Result;
933389bb291Spatrick unsigned NextExpanded = 0; // cursor in ExpandedTokens
934389bb291Spatrick llvm::DenseMap<FileID, unsigned> NextSpelled; // cursor in SpelledTokens
935e5dd7070Spatrick PPExpansions CollectedExpansions;
936e5dd7070Spatrick const SourceManager &SM;
937e5dd7070Spatrick const LangOptions &LangOpts;
938e5dd7070Spatrick };
939e5dd7070Spatrick
consume()940e5dd7070Spatrick TokenBuffer TokenCollector::consume() && {
941e5dd7070Spatrick PP.setTokenWatcher(nullptr);
942e5dd7070Spatrick Collector->disable();
943e5dd7070Spatrick return Builder(std::move(Expanded), std::move(Expansions),
944e5dd7070Spatrick PP.getSourceManager(), PP.getLangOpts())
945e5dd7070Spatrick .build();
946e5dd7070Spatrick }
947e5dd7070Spatrick
str() const948e5dd7070Spatrick std::string syntax::Token::str() const {
949ec727ea7Spatrick return std::string(llvm::formatv("Token({0}, length = {1})",
950ec727ea7Spatrick tok::getTokenName(kind()), length()));
951e5dd7070Spatrick }
952e5dd7070Spatrick
dumpForTests(const SourceManager & SM) const953e5dd7070Spatrick std::string syntax::Token::dumpForTests(const SourceManager &SM) const {
954ec727ea7Spatrick return std::string(llvm::formatv("Token(`{0}`, {1}, length = {2})", text(SM),
955ec727ea7Spatrick tok::getTokenName(kind()), length()));
956e5dd7070Spatrick }
957e5dd7070Spatrick
dumpForTests() const958e5dd7070Spatrick std::string TokenBuffer::dumpForTests() const {
959e5dd7070Spatrick auto PrintToken = [this](const syntax::Token &T) -> std::string {
960e5dd7070Spatrick if (T.kind() == tok::eof)
961e5dd7070Spatrick return "<eof>";
962ec727ea7Spatrick return std::string(T.text(*SourceMgr));
963e5dd7070Spatrick };
964e5dd7070Spatrick
965e5dd7070Spatrick auto DumpTokens = [this, &PrintToken](llvm::raw_ostream &OS,
966e5dd7070Spatrick llvm::ArrayRef<syntax::Token> Tokens) {
967e5dd7070Spatrick if (Tokens.empty()) {
968e5dd7070Spatrick OS << "<empty>";
969e5dd7070Spatrick return;
970e5dd7070Spatrick }
971e5dd7070Spatrick OS << Tokens[0].text(*SourceMgr);
972e5dd7070Spatrick for (unsigned I = 1; I < Tokens.size(); ++I) {
973e5dd7070Spatrick if (Tokens[I].kind() == tok::eof)
974e5dd7070Spatrick continue;
975e5dd7070Spatrick OS << " " << PrintToken(Tokens[I]);
976e5dd7070Spatrick }
977e5dd7070Spatrick };
978e5dd7070Spatrick
979e5dd7070Spatrick std::string Dump;
980e5dd7070Spatrick llvm::raw_string_ostream OS(Dump);
981e5dd7070Spatrick
982e5dd7070Spatrick OS << "expanded tokens:\n"
983e5dd7070Spatrick << " ";
984e5dd7070Spatrick // (!) we do not show '<eof>'.
985*12c85518Srobert DumpTokens(OS, llvm::ArrayRef(ExpandedTokens).drop_back());
986e5dd7070Spatrick OS << "\n";
987e5dd7070Spatrick
988e5dd7070Spatrick std::vector<FileID> Keys;
989e5dd7070Spatrick for (auto F : Files)
990e5dd7070Spatrick Keys.push_back(F.first);
991e5dd7070Spatrick llvm::sort(Keys);
992e5dd7070Spatrick
993e5dd7070Spatrick for (FileID ID : Keys) {
994e5dd7070Spatrick const MarkedFile &File = Files.find(ID)->second;
995e5dd7070Spatrick auto *Entry = SourceMgr->getFileEntryForID(ID);
996e5dd7070Spatrick if (!Entry)
997e5dd7070Spatrick continue; // Skip builtin files.
998e5dd7070Spatrick OS << llvm::formatv("file '{0}'\n", Entry->getName())
999e5dd7070Spatrick << " spelled tokens:\n"
1000e5dd7070Spatrick << " ";
1001e5dd7070Spatrick DumpTokens(OS, File.SpelledTokens);
1002e5dd7070Spatrick OS << "\n";
1003e5dd7070Spatrick
1004e5dd7070Spatrick if (File.Mappings.empty()) {
1005e5dd7070Spatrick OS << " no mappings.\n";
1006e5dd7070Spatrick continue;
1007e5dd7070Spatrick }
1008e5dd7070Spatrick OS << " mappings:\n";
1009e5dd7070Spatrick for (auto &M : File.Mappings) {
1010e5dd7070Spatrick OS << llvm::formatv(
1011e5dd7070Spatrick " ['{0}'_{1}, '{2}'_{3}) => ['{4}'_{5}, '{6}'_{7})\n",
1012e5dd7070Spatrick PrintToken(File.SpelledTokens[M.BeginSpelled]), M.BeginSpelled,
1013e5dd7070Spatrick M.EndSpelled == File.SpelledTokens.size()
1014e5dd7070Spatrick ? "<eof>"
1015e5dd7070Spatrick : PrintToken(File.SpelledTokens[M.EndSpelled]),
1016e5dd7070Spatrick M.EndSpelled, PrintToken(ExpandedTokens[M.BeginExpanded]),
1017e5dd7070Spatrick M.BeginExpanded, PrintToken(ExpandedTokens[M.EndExpanded]),
1018e5dd7070Spatrick M.EndExpanded);
1019e5dd7070Spatrick }
1020e5dd7070Spatrick }
1021*12c85518Srobert return Dump;
1022e5dd7070Spatrick }
1023