1*12c85518Srobert //===--- Macros.h - Format C++ code -----------------------------*- C++ -*-===// 2a9ac8606Spatrick // 3*12c85518Srobert // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4*12c85518Srobert // See https://llvm.org/LICENSE.txt for license information. 5*12c85518Srobert // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6a9ac8606Spatrick // 7a9ac8606Spatrick //===----------------------------------------------------------------------===// 8a9ac8606Spatrick /// 9a9ac8606Spatrick /// \file 10a9ac8606Spatrick /// This file contains the main building blocks of macro support in 11a9ac8606Spatrick /// clang-format. 12a9ac8606Spatrick /// 13a9ac8606Spatrick /// In order to not violate the requirement that clang-format can format files 14a9ac8606Spatrick /// in isolation, clang-format's macro support uses expansions users provide 15a9ac8606Spatrick /// as part of clang-format's style configuration. 16a9ac8606Spatrick /// 17a9ac8606Spatrick /// Macro definitions are of the form "MACRO(p1, p2)=p1 + p2", but only support 18a9ac8606Spatrick /// one level of expansion (\see MacroExpander for a full description of what 19a9ac8606Spatrick /// is supported). 20a9ac8606Spatrick /// 21a9ac8606Spatrick /// As part of parsing, clang-format uses the MacroExpander to expand the 22a9ac8606Spatrick /// spelled token streams into expanded token streams when it encounters a 23a9ac8606Spatrick /// macro call. The UnwrappedLineParser continues to parse UnwrappedLines 24a9ac8606Spatrick /// from the expanded token stream. 25*12c85518Srobert /// After the expanded unwrapped lines are parsed, the MacroCallReconstructor 26*12c85518Srobert /// matches the spelled token stream into unwrapped lines that best resemble the 27*12c85518Srobert /// structure of the expanded unwrapped lines. These reconstructed unwrapped 28*12c85518Srobert /// lines are aliasing the tokens in the expanded token stream, so that token 29*12c85518Srobert /// annotations will be reused when formatting the spelled macro calls. 30a9ac8606Spatrick /// 31*12c85518Srobert /// When formatting, clang-format annotates and formats the expanded unwrapped 32*12c85518Srobert /// lines first, determining the token types. Next, it formats the spelled 33*12c85518Srobert /// unwrapped lines, keeping the token types fixed, while allowing other 34*12c85518Srobert /// formatting decisions to change. 35a9ac8606Spatrick /// 36a9ac8606Spatrick //===----------------------------------------------------------------------===// 37a9ac8606Spatrick 38a9ac8606Spatrick #ifndef CLANG_LIB_FORMAT_MACROS_H 39a9ac8606Spatrick #define CLANG_LIB_FORMAT_MACROS_H 40a9ac8606Spatrick 41*12c85518Srobert #include <list> 42*12c85518Srobert #include <map> 43a9ac8606Spatrick #include <string> 44a9ac8606Spatrick #include <vector> 45a9ac8606Spatrick 46a9ac8606Spatrick #include "FormatToken.h" 47a9ac8606Spatrick #include "llvm/ADT/ArrayRef.h" 48*12c85518Srobert #include "llvm/ADT/DenseMap.h" 49a9ac8606Spatrick #include "llvm/ADT/SmallVector.h" 50a9ac8606Spatrick #include "llvm/ADT/StringRef.h" 51a9ac8606Spatrick 52a9ac8606Spatrick namespace clang { 53a9ac8606Spatrick namespace format { 54*12c85518Srobert 55*12c85518Srobert struct UnwrappedLine; 56*12c85518Srobert struct UnwrappedLineNode; 57a9ac8606Spatrick 58a9ac8606Spatrick /// Takes a set of macro definitions as strings and allows expanding calls to 59a9ac8606Spatrick /// those macros. 60a9ac8606Spatrick /// 61a9ac8606Spatrick /// For example: 62a9ac8606Spatrick /// Definition: A(x, y)=x + y 63a9ac8606Spatrick /// Call : A(int a = 1, 2) 64a9ac8606Spatrick /// Expansion : int a = 1 + 2 65a9ac8606Spatrick /// 66a9ac8606Spatrick /// Expansion does not check arity of the definition. 67a9ac8606Spatrick /// If fewer arguments than expected are provided, the remaining parameters 68a9ac8606Spatrick /// are considered empty: 69a9ac8606Spatrick /// Call : A(a) 70a9ac8606Spatrick /// Expansion: a + 71a9ac8606Spatrick /// If more arguments than expected are provided, they will be discarded. 72a9ac8606Spatrick /// 73a9ac8606Spatrick /// The expander does not support: 74a9ac8606Spatrick /// - recursive expansion 75a9ac8606Spatrick /// - stringification 76a9ac8606Spatrick /// - concatenation 77a9ac8606Spatrick /// - variadic macros 78a9ac8606Spatrick /// 79a9ac8606Spatrick /// Furthermore, only a single expansion of each macro argument is supported, 80a9ac8606Spatrick /// so that we cannot get conflicting formatting decisions from different 81a9ac8606Spatrick /// expansions. 82a9ac8606Spatrick /// Definition: A(x)=x+x 83a9ac8606Spatrick /// Call : A(id) 84a9ac8606Spatrick /// Expansion : id+x 85a9ac8606Spatrick /// 86a9ac8606Spatrick class MacroExpander { 87a9ac8606Spatrick public: 88a9ac8606Spatrick using ArgsList = llvm::ArrayRef<llvm::SmallVector<FormatToken *, 8>>; 89a9ac8606Spatrick 90a9ac8606Spatrick /// Construct a macro expander from a set of macro definitions. 91a9ac8606Spatrick /// Macro definitions must be encoded as UTF-8. 92a9ac8606Spatrick /// 93a9ac8606Spatrick /// Each entry in \p Macros must conform to the following simple 94a9ac8606Spatrick /// macro-definition language: 95a9ac8606Spatrick /// <definition> ::= <id> <expansion> | <id> "(" <params> ")" <expansion> 96a9ac8606Spatrick /// <params> ::= <id-list> | "" 97a9ac8606Spatrick /// <id-list> ::= <id> | <id> "," <params> 98a9ac8606Spatrick /// <expansion> ::= "=" <tail> | <eof> 99a9ac8606Spatrick /// <tail> ::= <tok> <tail> | <eof> 100a9ac8606Spatrick /// 101a9ac8606Spatrick /// Macros that cannot be parsed will be silently discarded. 102a9ac8606Spatrick /// 103a9ac8606Spatrick MacroExpander(const std::vector<std::string> &Macros, 104a9ac8606Spatrick clang::SourceManager &SourceMgr, const FormatStyle &Style, 105a9ac8606Spatrick llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator, 106a9ac8606Spatrick IdentifierTable &IdentTable); 107a9ac8606Spatrick ~MacroExpander(); 108a9ac8606Spatrick 109a9ac8606Spatrick /// Returns whether a macro \p Name is defined. 110a9ac8606Spatrick bool defined(llvm::StringRef Name) const; 111a9ac8606Spatrick 112a9ac8606Spatrick /// Returns whether the macro has no arguments and should not consume 113a9ac8606Spatrick /// subsequent parentheses. 114a9ac8606Spatrick bool objectLike(llvm::StringRef Name) const; 115a9ac8606Spatrick 116a9ac8606Spatrick /// Returns the expanded stream of format tokens for \p ID, where 117a9ac8606Spatrick /// each element in \p Args is a positional argument to the macro call. 118a9ac8606Spatrick llvm::SmallVector<FormatToken *, 8> expand(FormatToken *ID, 119a9ac8606Spatrick ArgsList Args) const; 120a9ac8606Spatrick 121a9ac8606Spatrick private: 122a9ac8606Spatrick struct Definition; 123a9ac8606Spatrick class DefinitionParser; 124a9ac8606Spatrick 125a9ac8606Spatrick void parseDefinition(const std::string &Macro); 126a9ac8606Spatrick 127a9ac8606Spatrick clang::SourceManager &SourceMgr; 128a9ac8606Spatrick const FormatStyle &Style; 129a9ac8606Spatrick llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator; 130a9ac8606Spatrick IdentifierTable &IdentTable; 131*12c85518Srobert SmallVector<std::unique_ptr<llvm::MemoryBuffer>> Buffers; 132a9ac8606Spatrick llvm::StringMap<Definition> Definitions; 133a9ac8606Spatrick }; 134a9ac8606Spatrick 135*12c85518Srobert /// Converts a sequence of UnwrappedLines containing expanded macros into a 136*12c85518Srobert /// single UnwrappedLine containing the macro calls. This UnwrappedLine may be 137*12c85518Srobert /// broken into child lines, in a way that best conveys the structure of the 138*12c85518Srobert /// expanded code. 139*12c85518Srobert /// 140*12c85518Srobert /// In the simplest case, a spelled UnwrappedLine contains one macro, and after 141*12c85518Srobert /// expanding it we have one expanded UnwrappedLine. In general, macro 142*12c85518Srobert /// expansions can span UnwrappedLines, and multiple macros can contribute 143*12c85518Srobert /// tokens to the same line. We keep consuming expanded lines until: 144*12c85518Srobert /// * all expansions that started have finished (we're not chopping any macros 145*12c85518Srobert /// in half) 146*12c85518Srobert /// * *and* we've reached the end of a *spelled* unwrapped line. 147*12c85518Srobert /// 148*12c85518Srobert /// A single UnwrappedLine represents this chunk of code. 149*12c85518Srobert /// 150*12c85518Srobert /// After this point, the state of the spelled/expanded stream is "in sync" 151*12c85518Srobert /// (both at the start of an UnwrappedLine, with no macros open), so the 152*12c85518Srobert /// Unexpander can be thrown away and parsing can continue. 153*12c85518Srobert /// 154*12c85518Srobert /// Given a mapping from the macro name identifier token in the macro call 155*12c85518Srobert /// to the tokens of the macro call, for example: 156*12c85518Srobert /// CLASSA -> CLASSA({public: void x();}) 157*12c85518Srobert /// 158*12c85518Srobert /// When getting the formatted lines of the expansion via the \c addLine method 159*12c85518Srobert /// (each '->' specifies a call to \c addLine ): 160*12c85518Srobert /// -> class A { 161*12c85518Srobert /// -> public: 162*12c85518Srobert /// -> void x(); 163*12c85518Srobert /// -> }; 164*12c85518Srobert /// 165*12c85518Srobert /// Creates the tree of unwrapped lines containing the macro call tokens so that 166*12c85518Srobert /// the macro call tokens fit the semantic structure of the expanded formatted 167*12c85518Srobert /// lines: 168*12c85518Srobert /// -> CLASSA({ 169*12c85518Srobert /// -> public: 170*12c85518Srobert /// -> void x(); 171*12c85518Srobert /// -> }) 172*12c85518Srobert class MacroCallReconstructor { 173*12c85518Srobert public: 174*12c85518Srobert /// Create an Reconstructor whose resulting \p UnwrappedLine will start at 175*12c85518Srobert /// \p Level, using the map from name identifier token to the corresponding 176*12c85518Srobert /// tokens of the spelled macro call. 177*12c85518Srobert MacroCallReconstructor( 178*12c85518Srobert unsigned Level, 179*12c85518Srobert const llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>> 180*12c85518Srobert &ActiveExpansions); 181*12c85518Srobert 182*12c85518Srobert /// For the given \p Line, match all occurences of tokens expanded from a 183*12c85518Srobert /// macro to unwrapped lines in the spelled macro call so that the resulting 184*12c85518Srobert /// tree of unwrapped lines best resembles the structure of unwrapped lines 185*12c85518Srobert /// passed in via \c addLine. 186*12c85518Srobert void addLine(const UnwrappedLine &Line); 187*12c85518Srobert 188*12c85518Srobert /// Check whether at the current state there is no open macro expansion 189*12c85518Srobert /// that needs to be processed to finish an macro call. 190*12c85518Srobert /// Only when \c finished() is true, \c takeResult() can be called to retrieve 191*12c85518Srobert /// the resulting \c UnwrappedLine. 192*12c85518Srobert /// If there are multiple subsequent macro calls within an unwrapped line in 193*12c85518Srobert /// the spelled token stream, the calling code may also continue to call 194*12c85518Srobert /// \c addLine() when \c finished() is true. finished()195*12c85518Srobert bool finished() const { return ActiveExpansions.empty(); } 196*12c85518Srobert 197*12c85518Srobert /// Retrieve the formatted \c UnwrappedLine containing the orginal 198*12c85518Srobert /// macro calls, formatted according to the expanded token stream received 199*12c85518Srobert /// via \c addLine(). 200*12c85518Srobert /// Generally, this line tries to have the same structure as the expanded, 201*12c85518Srobert /// formatted unwrapped lines handed in via \c addLine(), with the exception 202*12c85518Srobert /// that for multiple top-level lines, each subsequent line will be the 203*12c85518Srobert /// child of the last token in its predecessor. This representation is chosen 204*12c85518Srobert /// because it is a precondition to the formatter that we get what looks like 205*12c85518Srobert /// a single statement in a single \c UnwrappedLine (i.e. matching parens). 206*12c85518Srobert /// 207*12c85518Srobert /// If a token in a macro argument is a child of a token in the expansion, 208*12c85518Srobert /// the parent will be the corresponding token in the macro call. 209*12c85518Srobert /// For example: 210*12c85518Srobert /// #define C(a, b) class C { a b 211*12c85518Srobert /// C(int x;, int y;) 212*12c85518Srobert /// would expand to 213*12c85518Srobert /// class C { int x; int y; 214*12c85518Srobert /// where in a formatted line "int x;" and "int y;" would both be new separate 215*12c85518Srobert /// lines. 216*12c85518Srobert /// 217*12c85518Srobert /// In the result, "int x;" will be a child of the opening parenthesis in "C(" 218*12c85518Srobert /// and "int y;" will be a child of the "," token: 219*12c85518Srobert /// C ( 220*12c85518Srobert /// \- int x; 221*12c85518Srobert /// , 222*12c85518Srobert /// \- int y; 223*12c85518Srobert /// ) 224*12c85518Srobert UnwrappedLine takeResult() &&; 225*12c85518Srobert 226*12c85518Srobert private: 227*12c85518Srobert void add(FormatToken *Token, FormatToken *ExpandedParent, bool First); 228*12c85518Srobert void prepareParent(FormatToken *ExpandedParent, bool First); 229*12c85518Srobert FormatToken *getParentInResult(FormatToken *Parent); 230*12c85518Srobert void reconstruct(FormatToken *Token); 231*12c85518Srobert void startReconstruction(FormatToken *Token); 232*12c85518Srobert bool reconstructActiveCallUntil(FormatToken *Token); 233*12c85518Srobert void endReconstruction(FormatToken *Token); 234*12c85518Srobert bool processNextReconstructed(); 235*12c85518Srobert void finalize(); 236*12c85518Srobert 237*12c85518Srobert struct ReconstructedLine; 238*12c85518Srobert 239*12c85518Srobert void appendToken(FormatToken *Token, ReconstructedLine *L = nullptr); 240*12c85518Srobert UnwrappedLine createUnwrappedLine(const ReconstructedLine &Line, int Level); 241*12c85518Srobert void debug(const ReconstructedLine &Line, int Level); 242*12c85518Srobert ReconstructedLine &parentLine(); 243*12c85518Srobert ReconstructedLine *currentLine(); 244*12c85518Srobert void debugParentMap() const; 245*12c85518Srobert 246*12c85518Srobert #ifndef NDEBUG 247*12c85518Srobert enum ReconstructorState { 248*12c85518Srobert Start, // No macro expansion was found in the input yet. 249*12c85518Srobert InProgress, // During a macro reconstruction. 250*12c85518Srobert Finalized, // Past macro reconstruction, the result is finalized. 251*12c85518Srobert }; 252*12c85518Srobert ReconstructorState State = Start; 253*12c85518Srobert #endif 254*12c85518Srobert 255*12c85518Srobert // Node in which we build up the resulting unwrapped line; this type is 256*12c85518Srobert // analogous to UnwrappedLineNode. 257*12c85518Srobert struct LineNode { 258*12c85518Srobert LineNode() = default; LineNodeLineNode259*12c85518Srobert LineNode(FormatToken *Tok) : Tok(Tok) {} 260*12c85518Srobert FormatToken *Tok = nullptr; 261*12c85518Srobert llvm::SmallVector<std::unique_ptr<ReconstructedLine>> Children; 262*12c85518Srobert }; 263*12c85518Srobert 264*12c85518Srobert // Line in which we build up the resulting unwrapped line. 265*12c85518Srobert // FIXME: Investigate changing UnwrappedLine to a pointer type and using it 266*12c85518Srobert // instead of rolling our own type. 267*12c85518Srobert struct ReconstructedLine { 268*12c85518Srobert llvm::SmallVector<std::unique_ptr<LineNode>> Tokens; 269*12c85518Srobert }; 270*12c85518Srobert 271*12c85518Srobert // The line in which we collect the resulting reconstructed output. 272*12c85518Srobert // To reduce special cases in the algorithm, the first level of the line 273*12c85518Srobert // contains a single null token that has the reconstructed incoming 274*12c85518Srobert // lines as children. 275*12c85518Srobert // In the end, we stich the lines together so that each subsequent line 276*12c85518Srobert // is a child of the last token of the previous line. This is necessary 277*12c85518Srobert // in order to format the overall expression as a single logical line - 278*12c85518Srobert // if we created separate lines, we'd format them with their own top-level 279*12c85518Srobert // indent depending on the semantic structure, which is not desired. 280*12c85518Srobert ReconstructedLine Result; 281*12c85518Srobert 282*12c85518Srobert // Stack of currently "open" lines, where each line's predecessor's last 283*12c85518Srobert // token is the parent token for that line. 284*12c85518Srobert llvm::SmallVector<ReconstructedLine *> ActiveReconstructedLines; 285*12c85518Srobert 286*12c85518Srobert // Maps from the expanded token to the token that takes its place in the 287*12c85518Srobert // reconstructed token stream in terms of parent-child relationships. 288*12c85518Srobert // Note that it might take multiple steps to arrive at the correct 289*12c85518Srobert // parent in the output. 290*12c85518Srobert // Given: #define C(a, b) []() { a; b; } 291*12c85518Srobert // And a call: C(f(), g()) 292*12c85518Srobert // The structure in the incoming formatted unwrapped line will be: 293*12c85518Srobert // []() { 294*12c85518Srobert // |- f(); 295*12c85518Srobert // \- g(); 296*12c85518Srobert // } 297*12c85518Srobert // with f and g being children of the opening brace. 298*12c85518Srobert // In the reconstructed call: 299*12c85518Srobert // C(f(), g()) 300*12c85518Srobert // \- f() 301*12c85518Srobert // \- g() 302*12c85518Srobert // We want f to be a child of the opening parenthesis and g to be a child 303*12c85518Srobert // of the comma token in the macro call. 304*12c85518Srobert // Thus, we map 305*12c85518Srobert // { -> ( 306*12c85518Srobert // and add 307*12c85518Srobert // ( -> , 308*12c85518Srobert // once we're past the comma in the reconstruction. 309*12c85518Srobert llvm::DenseMap<FormatToken *, FormatToken *> 310*12c85518Srobert SpelledParentToReconstructedParent; 311*12c85518Srobert 312*12c85518Srobert // Keeps track of a single expansion while we're reconstructing tokens it 313*12c85518Srobert // generated. 314*12c85518Srobert struct Expansion { 315*12c85518Srobert // The identifier token of the macro call. 316*12c85518Srobert FormatToken *ID; 317*12c85518Srobert // Our current position in the reconstruction. 318*12c85518Srobert std::list<UnwrappedLineNode>::iterator SpelledI; 319*12c85518Srobert // The end of the reconstructed token sequence. 320*12c85518Srobert std::list<UnwrappedLineNode>::iterator SpelledE; 321*12c85518Srobert }; 322*12c85518Srobert 323*12c85518Srobert // Stack of macro calls for which we're in the middle of an expansion. 324*12c85518Srobert llvm::SmallVector<Expansion> ActiveExpansions; 325*12c85518Srobert 326*12c85518Srobert struct MacroCallState { 327*12c85518Srobert MacroCallState(ReconstructedLine *Line, FormatToken *ParentLastToken, 328*12c85518Srobert FormatToken *MacroCallLParen); 329*12c85518Srobert 330*12c85518Srobert ReconstructedLine *Line; 331*12c85518Srobert 332*12c85518Srobert // The last token in the parent line or expansion, or nullptr if the macro 333*12c85518Srobert // expansion is on a top-level line. 334*12c85518Srobert // 335*12c85518Srobert // For example, in the macro call: 336*12c85518Srobert // auto f = []() { ID(1); }; 337*12c85518Srobert // The MacroCallState for ID will have '{' as ParentLastToken. 338*12c85518Srobert // 339*12c85518Srobert // In the macro call: 340*12c85518Srobert // ID(ID(void f())); 341*12c85518Srobert // The MacroCallState of the outer ID will have nullptr as ParentLastToken, 342*12c85518Srobert // while the MacroCallState for the inner ID will have the '(' of the outer 343*12c85518Srobert // ID as ParentLastToken. 344*12c85518Srobert // 345*12c85518Srobert // In the macro call: 346*12c85518Srobert // ID2(a, ID(b)); 347*12c85518Srobert // The MacroCallState of ID will have ',' as ParentLastToken. 348*12c85518Srobert FormatToken *ParentLastToken; 349*12c85518Srobert 350*12c85518Srobert // The l_paren of this MacroCallState's macro call. 351*12c85518Srobert FormatToken *MacroCallLParen; 352*12c85518Srobert }; 353*12c85518Srobert 354*12c85518Srobert // Keeps track of the lines into which the opening brace/parenthesis & 355*12c85518Srobert // argument separating commas for each level in the macro call go in order to 356*12c85518Srobert // put the corresponding closing brace/parenthesis into the same line in the 357*12c85518Srobert // output and keep track of which parents in the expanded token stream map to 358*12c85518Srobert // which tokens in the reconstructed stream. 359*12c85518Srobert // When an opening brace/parenthesis has children, we want the structure of 360*12c85518Srobert // the output line to be: 361*12c85518Srobert // |- MACRO 362*12c85518Srobert // |- ( 363*12c85518Srobert // | \- <argument> 364*12c85518Srobert // |- , 365*12c85518Srobert // | \- <argument> 366*12c85518Srobert // \- ) 367*12c85518Srobert llvm::SmallVector<MacroCallState> MacroCallStructure; 368*12c85518Srobert 369*12c85518Srobert // Level the generated UnwrappedLine will be at. 370*12c85518Srobert const unsigned Level; 371*12c85518Srobert 372*12c85518Srobert // Maps from identifier of the macro call to an unwrapped line containing 373*12c85518Srobert // all tokens of the macro call. 374*12c85518Srobert const llvm::DenseMap<FormatToken *, std::unique_ptr<UnwrappedLine>> 375*12c85518Srobert &IdToReconstructed; 376*12c85518Srobert }; 377*12c85518Srobert 378a9ac8606Spatrick } // namespace format 379a9ac8606Spatrick } // namespace clang 380a9ac8606Spatrick 381a9ac8606Spatrick #endif 382