xref: /openbsd-src/gnu/llvm/clang/lib/Format/Macros.h (revision 12c855180aad702bbcca06e0398d774beeafb155)
1*12c85518Srobert //===--- Macros.h - Format C++ code -----------------------------*- C++ -*-===//
2a9ac8606Spatrick //
3*12c85518Srobert // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*12c85518Srobert // See https://llvm.org/LICENSE.txt for license information.
5*12c85518Srobert // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6a9ac8606Spatrick //
7a9ac8606Spatrick //===----------------------------------------------------------------------===//
8a9ac8606Spatrick ///
9a9ac8606Spatrick /// \file
10a9ac8606Spatrick /// This file contains the main building blocks of macro support in
11a9ac8606Spatrick /// clang-format.
12a9ac8606Spatrick ///
13a9ac8606Spatrick /// In order to not violate the requirement that clang-format can format files
14a9ac8606Spatrick /// in isolation, clang-format's macro support uses expansions users provide
15a9ac8606Spatrick /// as part of clang-format's style configuration.
16a9ac8606Spatrick ///
17a9ac8606Spatrick /// Macro definitions are of the form "MACRO(p1, p2)=p1 + p2", but only support
18a9ac8606Spatrick /// one level of expansion (\see MacroExpander for a full description of what
19a9ac8606Spatrick /// is supported).
20a9ac8606Spatrick ///
21a9ac8606Spatrick /// As part of parsing, clang-format uses the MacroExpander to expand the
22a9ac8606Spatrick /// spelled token streams into expanded token streams when it encounters a
23a9ac8606Spatrick /// macro call. The UnwrappedLineParser continues to parse UnwrappedLines
24a9ac8606Spatrick /// from the expanded token stream.
25*12c85518Srobert /// After the expanded unwrapped lines are parsed, the MacroCallReconstructor
26*12c85518Srobert /// matches the spelled token stream into unwrapped lines that best resemble the
27*12c85518Srobert /// structure of the expanded unwrapped lines. These reconstructed unwrapped
28*12c85518Srobert /// lines are aliasing the tokens in the expanded token stream, so that token
29*12c85518Srobert /// annotations will be reused when formatting the spelled macro calls.
30a9ac8606Spatrick ///
31*12c85518Srobert /// When formatting, clang-format annotates and formats the expanded unwrapped
32*12c85518Srobert /// lines first, determining the token types. Next, it formats the spelled
33*12c85518Srobert /// unwrapped lines, keeping the token types fixed, while allowing other
34*12c85518Srobert /// formatting decisions to change.
35a9ac8606Spatrick ///
36a9ac8606Spatrick //===----------------------------------------------------------------------===//
37a9ac8606Spatrick 
38a9ac8606Spatrick #ifndef CLANG_LIB_FORMAT_MACROS_H
39a9ac8606Spatrick #define CLANG_LIB_FORMAT_MACROS_H
40a9ac8606Spatrick 
41*12c85518Srobert #include <list>
42*12c85518Srobert #include <map>
43a9ac8606Spatrick #include <string>
44a9ac8606Spatrick #include <vector>
45a9ac8606Spatrick 
46a9ac8606Spatrick #include "FormatToken.h"
47a9ac8606Spatrick #include "llvm/ADT/ArrayRef.h"
48*12c85518Srobert #include "llvm/ADT/DenseMap.h"
49a9ac8606Spatrick #include "llvm/ADT/SmallVector.h"
50a9ac8606Spatrick #include "llvm/ADT/StringRef.h"
51a9ac8606Spatrick 
52a9ac8606Spatrick namespace clang {
53a9ac8606Spatrick namespace format {
54*12c85518Srobert 
55*12c85518Srobert struct UnwrappedLine;
56*12c85518Srobert struct UnwrappedLineNode;
57a9ac8606Spatrick 
58a9ac8606Spatrick /// Takes a set of macro definitions as strings and allows expanding calls to
59a9ac8606Spatrick /// those macros.
60a9ac8606Spatrick ///
61a9ac8606Spatrick /// For example:
62a9ac8606Spatrick /// Definition: A(x, y)=x + y
63a9ac8606Spatrick /// Call      : A(int a = 1, 2)
64a9ac8606Spatrick /// Expansion : int a = 1 + 2
65a9ac8606Spatrick ///
66a9ac8606Spatrick /// Expansion does not check arity of the definition.
67a9ac8606Spatrick /// If fewer arguments than expected are provided, the remaining parameters
68a9ac8606Spatrick /// are considered empty:
69a9ac8606Spatrick /// Call     : A(a)
70a9ac8606Spatrick /// Expansion: a +
71a9ac8606Spatrick /// If more arguments than expected are provided, they will be discarded.
72a9ac8606Spatrick ///
73a9ac8606Spatrick /// The expander does not support:
74a9ac8606Spatrick /// - recursive expansion
75a9ac8606Spatrick /// - stringification
76a9ac8606Spatrick /// - concatenation
77a9ac8606Spatrick /// - variadic macros
78a9ac8606Spatrick ///
79a9ac8606Spatrick /// Furthermore, only a single expansion of each macro argument is supported,
80a9ac8606Spatrick /// so that we cannot get conflicting formatting decisions from different
81a9ac8606Spatrick /// expansions.
82a9ac8606Spatrick /// Definition: A(x)=x+x
83a9ac8606Spatrick /// Call      : A(id)
84a9ac8606Spatrick /// Expansion : id+x
85a9ac8606Spatrick ///
86a9ac8606Spatrick class MacroExpander {
87a9ac8606Spatrick public:
88a9ac8606Spatrick   using ArgsList = llvm::ArrayRef<llvm::SmallVector<FormatToken *, 8>>;
89a9ac8606Spatrick 
90a9ac8606Spatrick   /// Construct a macro expander from a set of macro definitions.
91a9ac8606Spatrick   /// Macro definitions must be encoded as UTF-8.
92a9ac8606Spatrick   ///
93a9ac8606Spatrick   /// Each entry in \p Macros must conform to the following simple
94a9ac8606Spatrick   /// macro-definition language:
95a9ac8606Spatrick   /// <definition> ::= <id> <expansion> | <id> "(" <params> ")" <expansion>
96a9ac8606Spatrick   /// <params>     ::= <id-list> | ""
97a9ac8606Spatrick   /// <id-list>    ::= <id> | <id> "," <params>
98a9ac8606Spatrick   /// <expansion>  ::= "=" <tail> | <eof>
99a9ac8606Spatrick   /// <tail>       ::= <tok> <tail> | <eof>
100a9ac8606Spatrick   ///
101a9ac8606Spatrick   /// Macros that cannot be parsed will be silently discarded.
102a9ac8606Spatrick   ///
103a9ac8606Spatrick   MacroExpander(const std::vector<std::string> &Macros,
104a9ac8606Spatrick                 clang::SourceManager &SourceMgr, const FormatStyle &Style,
105a9ac8606Spatrick                 llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
106a9ac8606Spatrick                 IdentifierTable &IdentTable);
107a9ac8606Spatrick   ~MacroExpander();
108a9ac8606Spatrick 
109a9ac8606Spatrick   /// Returns whether a macro \p Name is defined.
110a9ac8606Spatrick   bool defined(llvm::StringRef Name) const;
111a9ac8606Spatrick 
112a9ac8606Spatrick   /// Returns whether the macro has no arguments and should not consume
113a9ac8606Spatrick   /// subsequent parentheses.
114a9ac8606Spatrick   bool objectLike(llvm::StringRef Name) const;
115a9ac8606Spatrick 
116a9ac8606Spatrick   /// Returns the expanded stream of format tokens for \p ID, where
117a9ac8606Spatrick   /// each element in \p Args is a positional argument to the macro call.
118a9ac8606Spatrick   llvm::SmallVector<FormatToken *, 8> expand(FormatToken *ID,
119a9ac8606Spatrick                                              ArgsList Args) const;
120a9ac8606Spatrick 
121a9ac8606Spatrick private:
122a9ac8606Spatrick   struct Definition;
123a9ac8606Spatrick   class DefinitionParser;
124a9ac8606Spatrick 
125a9ac8606Spatrick   void parseDefinition(const std::string &Macro);
126a9ac8606Spatrick 
127a9ac8606Spatrick   clang::SourceManager &SourceMgr;
128a9ac8606Spatrick   const FormatStyle &Style;
129a9ac8606Spatrick   llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator;
130a9ac8606Spatrick   IdentifierTable &IdentTable;
131*12c85518Srobert   SmallVector<std::unique_ptr<llvm::MemoryBuffer>> Buffers;
132a9ac8606Spatrick   llvm::StringMap<Definition> Definitions;
133a9ac8606Spatrick };
134a9ac8606Spatrick 
135*12c85518Srobert /// Converts a sequence of UnwrappedLines containing expanded macros into a
136*12c85518Srobert /// single UnwrappedLine containing the macro calls.  This UnwrappedLine may be
137*12c85518Srobert /// broken into child lines, in a way that best conveys the structure of the
138*12c85518Srobert /// expanded code.
139*12c85518Srobert ///
140*12c85518Srobert /// In the simplest case, a spelled UnwrappedLine contains one macro, and after
141*12c85518Srobert /// expanding it we have one expanded UnwrappedLine.  In general, macro
142*12c85518Srobert /// expansions can span UnwrappedLines, and multiple macros can contribute
143*12c85518Srobert /// tokens to the same line.  We keep consuming expanded lines until:
144*12c85518Srobert /// *   all expansions that started have finished (we're not chopping any macros
145*12c85518Srobert ///     in half)
146*12c85518Srobert /// *   *and* we've reached the end of a *spelled* unwrapped line.
147*12c85518Srobert ///
148*12c85518Srobert /// A single UnwrappedLine represents this chunk of code.
149*12c85518Srobert ///
150*12c85518Srobert /// After this point, the state of the spelled/expanded stream is "in sync"
151*12c85518Srobert /// (both at the start of an UnwrappedLine, with no macros open), so the
152*12c85518Srobert /// Unexpander can be thrown away and parsing can continue.
153*12c85518Srobert ///
154*12c85518Srobert /// Given a mapping from the macro name identifier token in the macro call
155*12c85518Srobert /// to the tokens of the macro call, for example:
156*12c85518Srobert /// CLASSA -> CLASSA({public: void x();})
157*12c85518Srobert ///
158*12c85518Srobert /// When getting the formatted lines of the expansion via the \c addLine method
159*12c85518Srobert /// (each '->' specifies a call to \c addLine ):
160*12c85518Srobert /// -> class A {
161*12c85518Srobert /// -> public:
162*12c85518Srobert /// ->   void x();
163*12c85518Srobert /// -> };
164*12c85518Srobert ///
165*12c85518Srobert /// Creates the tree of unwrapped lines containing the macro call tokens so that
166*12c85518Srobert /// the macro call tokens fit the semantic structure of the expanded formatted
167*12c85518Srobert /// lines:
168*12c85518Srobert /// -> CLASSA({
169*12c85518Srobert /// -> public:
170*12c85518Srobert /// ->   void x();
171*12c85518Srobert /// -> })
172*12c85518Srobert class MacroCallReconstructor {
173*12c85518Srobert public:
174*12c85518Srobert   /// Create an Reconstructor whose resulting \p UnwrappedLine will start at
175*12c85518Srobert   /// \p Level, using the map from name identifier token to the corresponding
176*12c85518Srobert   /// tokens of the spelled macro call.
177*12c85518Srobert   MacroCallReconstructor(
178*12c85518Srobert       unsigned Level,
179*12c85518Srobert       const llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>>
180*12c85518Srobert           &ActiveExpansions);
181*12c85518Srobert 
182*12c85518Srobert   /// For the given \p Line, match all occurences of tokens expanded from a
183*12c85518Srobert   /// macro to unwrapped lines in the spelled macro call so that the resulting
184*12c85518Srobert   /// tree of unwrapped lines best resembles the structure of unwrapped lines
185*12c85518Srobert   /// passed in via \c addLine.
186*12c85518Srobert   void addLine(const UnwrappedLine &Line);
187*12c85518Srobert 
188*12c85518Srobert   /// Check whether at the current state there is no open macro expansion
189*12c85518Srobert   /// that needs to be processed to finish an macro call.
190*12c85518Srobert   /// Only when \c finished() is true, \c takeResult() can be called to retrieve
191*12c85518Srobert   /// the resulting \c UnwrappedLine.
192*12c85518Srobert   /// If there are multiple subsequent macro calls within an unwrapped line in
193*12c85518Srobert   /// the spelled token stream, the calling code may also continue to call
194*12c85518Srobert   /// \c addLine() when \c finished() is true.
finished()195*12c85518Srobert   bool finished() const { return ActiveExpansions.empty(); }
196*12c85518Srobert 
197*12c85518Srobert   /// Retrieve the formatted \c UnwrappedLine containing the orginal
198*12c85518Srobert   /// macro calls, formatted according to the expanded token stream received
199*12c85518Srobert   /// via \c addLine().
200*12c85518Srobert   /// Generally, this line tries to have the same structure as the expanded,
201*12c85518Srobert   /// formatted unwrapped lines handed in via \c addLine(), with the exception
202*12c85518Srobert   /// that for multiple top-level lines, each subsequent line will be the
203*12c85518Srobert   /// child of the last token in its predecessor. This representation is chosen
204*12c85518Srobert   /// because it is a precondition to the formatter that we get what looks like
205*12c85518Srobert   /// a single statement in a single \c UnwrappedLine (i.e. matching parens).
206*12c85518Srobert   ///
207*12c85518Srobert   /// If a token in a macro argument is a child of a token in the expansion,
208*12c85518Srobert   /// the parent will be the corresponding token in the macro call.
209*12c85518Srobert   /// For example:
210*12c85518Srobert   ///   #define C(a, b) class C { a b
211*12c85518Srobert   ///   C(int x;, int y;)
212*12c85518Srobert   /// would expand to
213*12c85518Srobert   ///   class C { int x; int y;
214*12c85518Srobert   /// where in a formatted line "int x;" and "int y;" would both be new separate
215*12c85518Srobert   /// lines.
216*12c85518Srobert   ///
217*12c85518Srobert   /// In the result, "int x;" will be a child of the opening parenthesis in "C("
218*12c85518Srobert   /// and "int y;" will be a child of the "," token:
219*12c85518Srobert   ///   C (
220*12c85518Srobert   ///     \- int x;
221*12c85518Srobert   ///     ,
222*12c85518Srobert   ///     \- int y;
223*12c85518Srobert   ///     )
224*12c85518Srobert   UnwrappedLine takeResult() &&;
225*12c85518Srobert 
226*12c85518Srobert private:
227*12c85518Srobert   void add(FormatToken *Token, FormatToken *ExpandedParent, bool First);
228*12c85518Srobert   void prepareParent(FormatToken *ExpandedParent, bool First);
229*12c85518Srobert   FormatToken *getParentInResult(FormatToken *Parent);
230*12c85518Srobert   void reconstruct(FormatToken *Token);
231*12c85518Srobert   void startReconstruction(FormatToken *Token);
232*12c85518Srobert   bool reconstructActiveCallUntil(FormatToken *Token);
233*12c85518Srobert   void endReconstruction(FormatToken *Token);
234*12c85518Srobert   bool processNextReconstructed();
235*12c85518Srobert   void finalize();
236*12c85518Srobert 
237*12c85518Srobert   struct ReconstructedLine;
238*12c85518Srobert 
239*12c85518Srobert   void appendToken(FormatToken *Token, ReconstructedLine *L = nullptr);
240*12c85518Srobert   UnwrappedLine createUnwrappedLine(const ReconstructedLine &Line, int Level);
241*12c85518Srobert   void debug(const ReconstructedLine &Line, int Level);
242*12c85518Srobert   ReconstructedLine &parentLine();
243*12c85518Srobert   ReconstructedLine *currentLine();
244*12c85518Srobert   void debugParentMap() const;
245*12c85518Srobert 
246*12c85518Srobert #ifndef NDEBUG
247*12c85518Srobert   enum ReconstructorState {
248*12c85518Srobert     Start,      // No macro expansion was found in the input yet.
249*12c85518Srobert     InProgress, // During a macro reconstruction.
250*12c85518Srobert     Finalized,  // Past macro reconstruction, the result is finalized.
251*12c85518Srobert   };
252*12c85518Srobert   ReconstructorState State = Start;
253*12c85518Srobert #endif
254*12c85518Srobert 
255*12c85518Srobert   // Node in which we build up the resulting unwrapped line; this type is
256*12c85518Srobert   // analogous to UnwrappedLineNode.
257*12c85518Srobert   struct LineNode {
258*12c85518Srobert     LineNode() = default;
LineNodeLineNode259*12c85518Srobert     LineNode(FormatToken *Tok) : Tok(Tok) {}
260*12c85518Srobert     FormatToken *Tok = nullptr;
261*12c85518Srobert     llvm::SmallVector<std::unique_ptr<ReconstructedLine>> Children;
262*12c85518Srobert   };
263*12c85518Srobert 
264*12c85518Srobert   // Line in which we build up the resulting unwrapped line.
265*12c85518Srobert   // FIXME: Investigate changing UnwrappedLine to a pointer type and using it
266*12c85518Srobert   // instead of rolling our own type.
267*12c85518Srobert   struct ReconstructedLine {
268*12c85518Srobert     llvm::SmallVector<std::unique_ptr<LineNode>> Tokens;
269*12c85518Srobert   };
270*12c85518Srobert 
271*12c85518Srobert   // The line in which we collect the resulting reconstructed output.
272*12c85518Srobert   // To reduce special cases in the algorithm, the first level of the line
273*12c85518Srobert   // contains a single null token that has the reconstructed incoming
274*12c85518Srobert   // lines as children.
275*12c85518Srobert   // In the end, we stich the lines together so that each subsequent line
276*12c85518Srobert   // is a child of the last token of the previous line. This is necessary
277*12c85518Srobert   // in order to format the overall expression as a single logical line -
278*12c85518Srobert   // if we created separate lines, we'd format them with their own top-level
279*12c85518Srobert   // indent depending on the semantic structure, which is not desired.
280*12c85518Srobert   ReconstructedLine Result;
281*12c85518Srobert 
282*12c85518Srobert   // Stack of currently "open" lines, where each line's predecessor's last
283*12c85518Srobert   // token is the parent token for that line.
284*12c85518Srobert   llvm::SmallVector<ReconstructedLine *> ActiveReconstructedLines;
285*12c85518Srobert 
286*12c85518Srobert   // Maps from the expanded token to the token that takes its place in the
287*12c85518Srobert   // reconstructed token stream in terms of parent-child relationships.
288*12c85518Srobert   // Note that it might take multiple steps to arrive at the correct
289*12c85518Srobert   // parent in the output.
290*12c85518Srobert   // Given: #define C(a, b) []() { a; b; }
291*12c85518Srobert   // And a call: C(f(), g())
292*12c85518Srobert   // The structure in the incoming formatted unwrapped line will be:
293*12c85518Srobert   // []() {
294*12c85518Srobert   //      |- f();
295*12c85518Srobert   //      \- g();
296*12c85518Srobert   // }
297*12c85518Srobert   // with f and g being children of the opening brace.
298*12c85518Srobert   // In the reconstructed call:
299*12c85518Srobert   // C(f(), g())
300*12c85518Srobert   //  \- f()
301*12c85518Srobert   //      \- g()
302*12c85518Srobert   // We want f to be a child of the opening parenthesis and g to be a child
303*12c85518Srobert   // of the comma token in the macro call.
304*12c85518Srobert   // Thus, we map
305*12c85518Srobert   // { -> (
306*12c85518Srobert   // and add
307*12c85518Srobert   // ( -> ,
308*12c85518Srobert   // once we're past the comma in the reconstruction.
309*12c85518Srobert   llvm::DenseMap<FormatToken *, FormatToken *>
310*12c85518Srobert       SpelledParentToReconstructedParent;
311*12c85518Srobert 
312*12c85518Srobert   // Keeps track of a single expansion while we're reconstructing tokens it
313*12c85518Srobert   // generated.
314*12c85518Srobert   struct Expansion {
315*12c85518Srobert     // The identifier token of the macro call.
316*12c85518Srobert     FormatToken *ID;
317*12c85518Srobert     // Our current position in the reconstruction.
318*12c85518Srobert     std::list<UnwrappedLineNode>::iterator SpelledI;
319*12c85518Srobert     // The end of the reconstructed token sequence.
320*12c85518Srobert     std::list<UnwrappedLineNode>::iterator SpelledE;
321*12c85518Srobert   };
322*12c85518Srobert 
323*12c85518Srobert   // Stack of macro calls for which we're in the middle of an expansion.
324*12c85518Srobert   llvm::SmallVector<Expansion> ActiveExpansions;
325*12c85518Srobert 
326*12c85518Srobert   struct MacroCallState {
327*12c85518Srobert     MacroCallState(ReconstructedLine *Line, FormatToken *ParentLastToken,
328*12c85518Srobert                    FormatToken *MacroCallLParen);
329*12c85518Srobert 
330*12c85518Srobert     ReconstructedLine *Line;
331*12c85518Srobert 
332*12c85518Srobert     // The last token in the parent line or expansion, or nullptr if the macro
333*12c85518Srobert     // expansion is on a top-level line.
334*12c85518Srobert     //
335*12c85518Srobert     // For example, in the macro call:
336*12c85518Srobert     //   auto f = []() { ID(1); };
337*12c85518Srobert     // The MacroCallState for ID will have '{' as ParentLastToken.
338*12c85518Srobert     //
339*12c85518Srobert     // In the macro call:
340*12c85518Srobert     //   ID(ID(void f()));
341*12c85518Srobert     // The MacroCallState of the outer ID will have nullptr as ParentLastToken,
342*12c85518Srobert     // while the MacroCallState for the inner ID will have the '(' of the outer
343*12c85518Srobert     // ID as ParentLastToken.
344*12c85518Srobert     //
345*12c85518Srobert     // In the macro call:
346*12c85518Srobert     //   ID2(a, ID(b));
347*12c85518Srobert     // The MacroCallState of ID will have ',' as ParentLastToken.
348*12c85518Srobert     FormatToken *ParentLastToken;
349*12c85518Srobert 
350*12c85518Srobert     // The l_paren of this MacroCallState's macro call.
351*12c85518Srobert     FormatToken *MacroCallLParen;
352*12c85518Srobert   };
353*12c85518Srobert 
354*12c85518Srobert   // Keeps track of the lines into which the opening brace/parenthesis &
355*12c85518Srobert   // argument separating commas for each level in the macro call go in order to
356*12c85518Srobert   // put the corresponding closing brace/parenthesis into the same line in the
357*12c85518Srobert   // output and keep track of which parents in the expanded token stream map to
358*12c85518Srobert   // which tokens in the reconstructed stream.
359*12c85518Srobert   // When an opening brace/parenthesis has children, we want the structure of
360*12c85518Srobert   // the output line to be:
361*12c85518Srobert   // |- MACRO
362*12c85518Srobert   // |- (
363*12c85518Srobert   // |  \- <argument>
364*12c85518Srobert   // |- ,
365*12c85518Srobert   // |  \- <argument>
366*12c85518Srobert   // \- )
367*12c85518Srobert   llvm::SmallVector<MacroCallState> MacroCallStructure;
368*12c85518Srobert 
369*12c85518Srobert   // Level the generated UnwrappedLine will be at.
370*12c85518Srobert   const unsigned Level;
371*12c85518Srobert 
372*12c85518Srobert   // Maps from identifier of the macro call to an unwrapped line containing
373*12c85518Srobert   // all tokens of the macro call.
374*12c85518Srobert   const llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>>
375*12c85518Srobert       &IdToReconstructed;
376*12c85518Srobert };
377*12c85518Srobert 
378a9ac8606Spatrick } // namespace format
379a9ac8606Spatrick } // namespace clang
380a9ac8606Spatrick 
381a9ac8606Spatrick #endif
382