xref: /llvm-project/mlir/tools/mlir-tblgen/FormatGen.h (revision db791b278a414fb6df1acc1799adcf11d8fb9169)
1 //===- FormatGen.h - Utilities for custom assembly formats ------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file contains common classes for building custom assembly format parsers
10 // and generators.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef MLIR_TOOLS_MLIRTBLGEN_FORMATGEN_H_
15 #define MLIR_TOOLS_MLIRTBLGEN_FORMATGEN_H_
16 
17 #include "mlir/Support/LLVM.h"
18 #include "llvm/ADT/StringRef.h"
19 #include "llvm/ADT/StringSet.h"
20 #include "llvm/Support/Allocator.h"
21 #include "llvm/Support/CommandLine.h"
22 #include "llvm/Support/SMLoc.h"
23 #include <vector>
24 
25 namespace llvm {
26 class SourceMgr;
27 } // namespace llvm
28 
29 namespace mlir {
30 namespace tblgen {
31 
32 //===----------------------------------------------------------------------===//
33 // FormatToken
34 //===----------------------------------------------------------------------===//
35 
36 /// This class represents a specific token in the input format.
37 class FormatToken {
38 public:
39   /// Basic token kinds.
40   enum Kind {
41     // Markers.
42     eof,
43     error,
44 
45     // Tokens with no info.
46     l_paren,
47     r_paren,
48     caret,
49     colon,
50     comma,
51     equal,
52     less,
53     greater,
54     question,
55     star,
56     pipe,
57 
58     // Keywords.
59     keyword_start,
60     kw_attr_dict,
61     kw_attr_dict_w_keyword,
62     kw_prop_dict,
63     kw_custom,
64     kw_functional_type,
65     kw_oilist,
66     kw_operands,
67     kw_params,
68     kw_qualified,
69     kw_ref,
70     kw_regions,
71     kw_results,
72     kw_struct,
73     kw_successors,
74     kw_type,
75     keyword_end,
76 
77     // String valued tokens.
78     identifier,
79     literal,
80     variable,
81     string,
82   };
83 
FormatToken(Kind kind,StringRef spelling)84   FormatToken(Kind kind, StringRef spelling) : kind(kind), spelling(spelling) {}
85 
86   /// Return the bytes that make up this token.
getSpelling()87   StringRef getSpelling() const { return spelling; }
88 
89   /// Return the kind of this token.
getKind()90   Kind getKind() const { return kind; }
91 
92   /// Return a location for this token.
93   SMLoc getLoc() const;
94 
95   /// Returns true if the token is of the given kind.
is(Kind kind)96   bool is(Kind kind) { return getKind() == kind; }
97 
98   /// Return if this token is a keyword.
isKeyword()99   bool isKeyword() const {
100     return getKind() > Kind::keyword_start && getKind() < Kind::keyword_end;
101   }
102 
103 private:
104   /// Discriminator that indicates the kind of token this is.
105   Kind kind;
106 
107   /// A reference to the entire token contents; this is always a pointer into
108   /// a memory buffer owned by the source manager.
109   StringRef spelling;
110 };
111 
112 //===----------------------------------------------------------------------===//
113 // FormatLexer
114 //===----------------------------------------------------------------------===//
115 
116 /// This class implements a simple lexer for operation assembly format strings.
117 class FormatLexer {
118 public:
119   FormatLexer(llvm::SourceMgr &mgr, SMLoc loc);
120 
121   /// Lex the next token and return it.
122   FormatToken lexToken();
123 
124   /// Emit an error to the lexer with the given location and message.
125   FormatToken emitError(SMLoc loc, const Twine &msg);
126   FormatToken emitError(const char *loc, const Twine &msg);
127 
128   FormatToken emitErrorAndNote(SMLoc loc, const Twine &msg, const Twine &note);
129 
130 private:
131   /// Return the next character in the stream.
132   int getNextChar();
133 
134   /// Lex an identifier, literal, variable, or string.
135   FormatToken lexIdentifier(const char *tokStart);
136   FormatToken lexLiteral(const char *tokStart);
137   FormatToken lexVariable(const char *tokStart);
138   FormatToken lexString(const char *tokStart);
139 
140   /// Create a token with the current pointer and a start pointer.
formToken(FormatToken::Kind kind,const char * tokStart)141   FormatToken formToken(FormatToken::Kind kind, const char *tokStart) {
142     return FormatToken(kind, StringRef(tokStart, curPtr - tokStart));
143   }
144 
145   /// The source manager containing the format string.
146   llvm::SourceMgr &mgr;
147   /// Location of the format string.
148   SMLoc loc;
149   /// Buffer containing the format string.
150   StringRef curBuffer;
151   /// Current pointer in the buffer.
152   const char *curPtr;
153 };
154 
155 //===----------------------------------------------------------------------===//
156 // FormatElement
157 //===----------------------------------------------------------------------===//
158 
159 /// This class represents a single format element.
160 ///
161 /// If you squint and take a close look, you can see the outline of a `Format`
162 /// dialect.
class FormatElement {
public:
  // The top-level kinds of format elements.
  enum Kind {
    Literal,
    String,
    Variable,
    Whitespace,
    Directive,
    Optional
  };

  virtual ~FormatElement();

  /// Support LLVM-style RTTI. Every element is a `FormatElement`, so the
  /// argument is never inspected.
  static bool classof(const FormatElement *) { return true; }

  /// Get the element kind.
  Kind getKind() const { return kind; }

protected:
  /// Create a format element with the given kind. Only subclasses may
  /// construct elements.
  FormatElement(Kind kind) : kind(kind) {}

private:
  /// The kind of the element.
  Kind kind;
};
184 
185 /// The base class for all format elements. This class implements common methods
186 /// for LLVM-style RTTI.
187 template <FormatElement::Kind ElementKind>
188 class FormatElementBase : public FormatElement {
189 public:
190   /// Support LLVM-style RTTI.
classof(const FormatElement * el)191   static bool classof(const FormatElement *el) {
192     return ElementKind == el->getKind();
193   }
194 
195 protected:
196   /// Create a format element with the given kind.
FormatElementBase()197   FormatElementBase() : FormatElement(ElementKind) {}
198 };
199 
200 /// This class represents a literal element. A literal is either one of the
201 /// supported punctuation characters (e.g. `(` or `,`) or a string literal (e.g.
202 /// `literal`).
203 class LiteralElement : public FormatElementBase<FormatElement::Literal> {
204 public:
205   /// Create a literal element with the given spelling.
LiteralElement(StringRef spelling)206   explicit LiteralElement(StringRef spelling) : spelling(spelling) {}
207 
208   /// Get the spelling of the literal.
getSpelling()209   StringRef getSpelling() const { return spelling; }
210 
211 private:
212   /// The spelling of the variable, i.e. the string contained within the
213   /// backticks.
214   StringRef spelling;
215 };
216 
217 /// This class represents a raw string that can contain arbitrary C++ code.
218 class StringElement : public FormatElementBase<FormatElement::String> {
219 public:
220   /// Create a string element with the given contents.
StringElement(std::string value)221   explicit StringElement(std::string value) : value(std::move(value)) {}
222 
223   /// Get the value of the string element.
getValue()224   StringRef getValue() const { return value; }
225 
226 private:
227   /// The contents of the string.
228   std::string value;
229 };
230 
231 /// This class represents a variable element. A variable refers to some part of
232 /// the object being parsed, e.g. an attribute or operand on an operation or a
233 /// parameter on an attribute.
234 class VariableElement : public FormatElementBase<FormatElement::Variable> {
235 public:
236   /// These are the kinds of variables.
237   enum Kind {
238     Attribute,
239     Operand,
240     Region,
241     Result,
242     Successor,
243     Parameter,
244     Property
245   };
246 
247   /// Get the kind of variable.
getKind()248   Kind getKind() const { return kind; }
249 
250 protected:
251   /// Create a variable with a kind.
VariableElement(Kind kind)252   VariableElement(Kind kind) : kind(kind) {}
253 
254 private:
255   /// The kind of variable.
256   Kind kind;
257 };
258 
259 /// Base class for variable elements. This class implements common methods for
260 /// LLVM-style RTTI.
261 template <VariableElement::Kind VariableKind>
262 class VariableElementBase : public VariableElement {
263 public:
264   /// An element is of this class if it is a variable and has the same variable
265   /// type.
classof(const FormatElement * el)266   static bool classof(const FormatElement *el) {
267     if (auto *varEl = dyn_cast<VariableElement>(el))
268       return VariableKind == varEl->getKind();
269     return false;
270   }
271 
272 protected:
273   /// Create a variable element with the given variable kind.
VariableElementBase()274   VariableElementBase() : VariableElement(VariableKind) {}
275 };
276 
277 /// This class represents a whitespace element, e.g. a newline or space. It is a
278 /// literal that is printed but never parsed. When the value is empty, i.e. ``,
279 /// a space is elided where one would have been printed automatically.
280 class WhitespaceElement : public FormatElementBase<FormatElement::Whitespace> {
281 public:
282   /// Create a whitespace element.
WhitespaceElement(StringRef value)283   explicit WhitespaceElement(StringRef value) : value(value) {}
284 
285   /// Get the whitespace value.
getValue()286   StringRef getValue() const { return value; }
287 
288 private:
289   /// The value of the whitespace element. Can be empty.
290   StringRef value;
291 };
292 
293 class DirectiveElement : public FormatElementBase<FormatElement::Directive> {
294 public:
295   /// These are the kinds of directives.
296   enum Kind {
297     AttrDict,
298     PropDict,
299     Custom,
300     FunctionalType,
301     OIList,
302     Operands,
303     Ref,
304     Regions,
305     Results,
306     Successors,
307     Type,
308     Params,
309     Struct
310   };
311 
312   /// Get the directive kind.
getKind()313   Kind getKind() const { return kind; }
314 
315 protected:
316   /// Create a directive element with a kind.
DirectiveElement(Kind kind)317   DirectiveElement(Kind kind) : kind(kind) {}
318 
319 private:
320   /// The directive kind.
321   Kind kind;
322 };
323 
324 /// Base class for directive elements. This class implements common methods for
325 /// LLVM-style RTTI.
326 template <DirectiveElement::Kind DirectiveKind>
327 class DirectiveElementBase : public DirectiveElement {
328 public:
329   /// Create a directive element with the specified kind.
DirectiveElementBase()330   DirectiveElementBase() : DirectiveElement(DirectiveKind) {}
331 
332   /// A format element is of this class if it is a directive element and has the
333   /// same kind.
classof(const FormatElement * el)334   static bool classof(const FormatElement *el) {
335     if (auto *directiveEl = dyn_cast<DirectiveElement>(el))
336       return DirectiveKind == directiveEl->getKind();
337     return false;
338   }
339 };
340 
341 /// This class represents a custom format directive that is implemented by the
342 /// user in C++. The directive accepts a list of arguments that is passed to the
343 /// C++ function.
344 class CustomDirective : public DirectiveElementBase<DirectiveElement::Custom> {
345 public:
346   /// Create a custom directive with a name and list of arguments.
CustomDirective(StringRef name,std::vector<FormatElement * > && arguments)347   CustomDirective(StringRef name, std::vector<FormatElement *> &&arguments)
348       : name(name), arguments(std::move(arguments)) {}
349 
350   /// Get the custom directive name.
getName()351   StringRef getName() const { return name; }
352 
353   /// Get the arguments to the custom directive.
getArguments()354   ArrayRef<FormatElement *> getArguments() const { return arguments; }
355 
356 private:
357   /// The name of the custom directive. The name is used to call two C++
358   /// methods: `parse{name}` and `print{name}` with the given arguments.
359   StringRef name;
360   /// The arguments with which to call the custom functions. These are either
361   /// variables (for which the functions are responsible for populating) or
362   /// references to variables.
363   std::vector<FormatElement *> arguments;
364 };
365 
366 /// This class represents a reference directive. This directive can be used to
367 /// reference but not bind a previously bound variable or format object. Its
368 /// current only use is to pass variables as arguments to the custom directive.
369 class RefDirective : public DirectiveElementBase<DirectiveElement::Ref> {
370 public:
371   /// Create a reference directive with the single referenced child.
RefDirective(FormatElement * arg)372   RefDirective(FormatElement *arg) : arg(arg) {}
373 
374   /// Get the reference argument.
getArg()375   FormatElement *getArg() const { return arg; }
376 
377 private:
378   /// The referenced argument.
379   FormatElement *arg;
380 };
381 
382 /// This class represents a group of elements that are optionally emitted based
383 /// on an optional variable "anchor" and a group of elements that are emitted
384 /// when the anchor element is not present.
385 class OptionalElement : public FormatElementBase<FormatElement::Optional> {
386 public:
387   /// Create an optional group with the given child elements.
OptionalElement(std::vector<FormatElement * > && thenElements,std::vector<FormatElement * > && elseElements,unsigned thenParseStart,unsigned elseParseStart,FormatElement * anchor,bool inverted)388   OptionalElement(std::vector<FormatElement *> &&thenElements,
389                   std::vector<FormatElement *> &&elseElements,
390                   unsigned thenParseStart, unsigned elseParseStart,
391                   FormatElement *anchor, bool inverted)
392       : thenElements(std::move(thenElements)),
393         elseElements(std::move(elseElements)), thenParseStart(thenParseStart),
394         elseParseStart(elseParseStart), anchor(anchor), inverted(inverted) {}
395 
396   /// Return the `then` elements of the optional group. Drops the first
397   /// `thenParseStart` whitespace elements if `parseable` is true.
398   ArrayRef<FormatElement *> getThenElements(bool parseable = false) const {
399     return llvm::ArrayRef(thenElements)
400         .drop_front(parseable ? thenParseStart : 0);
401   }
402 
403   /// Return the `else` elements of the optional group. Drops the first
404   /// `elseParseStart` whitespace elements if `parseable` is true.
405   ArrayRef<FormatElement *> getElseElements(bool parseable = false) const {
406     return llvm::ArrayRef(elseElements)
407         .drop_front(parseable ? elseParseStart : 0);
408   }
409 
410   /// Return the anchor of the optional group.
getAnchor()411   FormatElement *getAnchor() const { return anchor; }
412 
413   /// Return true if the optional group is inverted.
isInverted()414   bool isInverted() const { return inverted; }
415 
416 private:
417   /// The child elements emitted when the anchor is present.
418   std::vector<FormatElement *> thenElements;
419   /// The child elements emitted when the anchor is not present.
420   std::vector<FormatElement *> elseElements;
421   /// The index of the first element that is parsed in `thenElements`. That is,
422   /// the first non-whitespace element.
423   unsigned thenParseStart;
424   /// The index of the first element that is parsed in `elseElements`. That is,
425   /// the first non-whitespace element.
426   unsigned elseParseStart;
427   /// The anchor element of the optional group.
428   FormatElement *anchor;
429   /// Whether the optional group condition is inverted and the anchor element is
430   /// in the else group.
431   bool inverted;
432 };
433 
434 //===----------------------------------------------------------------------===//
435 // FormatParser
436 //===----------------------------------------------------------------------===//
437 
438 /// Base class for a parser that implements an assembly format. This class
439 /// defines a common assembly format syntax and the creation of format elements.
440 /// Subclasses will need to implement parsing for the format elements they
441 /// support.
442 class FormatParser {
443 public:
444   /// Vtable anchor.
445   virtual ~FormatParser();
446 
447   /// Parse the assembly format.
448   FailureOr<std::vector<FormatElement *>> parse();
449 
450 protected:
451   /// The current context of the parser when parsing an element.
452   enum Context {
453     /// The element is being parsed in a "top-level" context, i.e. at the top of
454     /// the format or in an optional group.
455     TopLevelContext,
456     /// The element is being parsed as a custom directive child.
457     CustomDirectiveContext,
458     /// The element is being parsed as a type directive child.
459     TypeDirectiveContext,
460     /// The element is being parsed as a reference directive child.
461     RefDirectiveContext,
462     /// The element is being parsed as a struct directive child.
463     StructDirectiveContext
464   };
465 
466   /// Create a format parser with the given source manager and a location.
FormatParser(llvm::SourceMgr & mgr,llvm::SMLoc loc)467   explicit FormatParser(llvm::SourceMgr &mgr, llvm::SMLoc loc)
468       : lexer(mgr, loc), curToken(lexer.lexToken()) {}
469 
470   /// Allocate and construct a format element.
471   template <typename FormatElementT, typename... Args>
create(Args &&...args)472   FormatElementT *create(Args &&...args) {
473     // FormatElementT *ptr = allocator.Allocate<FormatElementT>();
474     // ::new (ptr) FormatElementT(std::forward<Args>(args)...);
475     // return ptr;
476     auto mem = std::make_unique<FormatElementT>(std::forward<Args>(args)...);
477     FormatElementT *ptr = mem.get();
478     allocator.push_back(std::move(mem));
479     return ptr;
480   }
481 
482   //===--------------------------------------------------------------------===//
483   // Element Parsing
484 
485   /// Parse a single element of any kind.
486   FailureOr<FormatElement *> parseElement(Context ctx);
487   /// Parse a literal.
488   FailureOr<FormatElement *> parseLiteral(Context ctx);
489   /// Parse a string.
490   FailureOr<FormatElement *> parseString(Context ctx);
491   /// Parse a variable.
492   FailureOr<FormatElement *> parseVariable(Context ctx);
493   /// Parse a directive.
494   FailureOr<FormatElement *> parseDirective(Context ctx);
495   /// Parse an optional group.
496   FailureOr<FormatElement *> parseOptionalGroup(Context ctx);
497   /// Parse a custom directive.
498   FailureOr<FormatElement *> parseCustomDirective(llvm::SMLoc loc, Context ctx);
499   /// Parse a ref directive.
500   FailureOr<FormatElement *> parseRefDirective(SMLoc loc, Context context);
501   /// Parse a qualified directive.
502   FailureOr<FormatElement *> parseQualifiedDirective(SMLoc loc, Context ctx);
503 
504   /// Parse a format-specific variable kind.
505   virtual FailureOr<FormatElement *>
506   parseVariableImpl(llvm::SMLoc loc, StringRef name, Context ctx) = 0;
507   /// Parse a format-specific directive kind.
508   virtual FailureOr<FormatElement *>
509   parseDirectiveImpl(llvm::SMLoc loc, FormatToken::Kind kind, Context ctx) = 0;
510 
511   //===--------------------------------------------------------------------===//
512   // Format Verification
513 
514   /// Verify that the format is well-formed.
515   virtual LogicalResult verify(llvm::SMLoc loc,
516                                ArrayRef<FormatElement *> elements) = 0;
517   /// Verify the arguments to a custom directive.
518   virtual LogicalResult
519   verifyCustomDirectiveArguments(llvm::SMLoc loc,
520                                  ArrayRef<FormatElement *> arguments) = 0;
521   /// Verify the elements of an optional group.
522   virtual LogicalResult
523   verifyOptionalGroupElements(llvm::SMLoc loc,
524                               ArrayRef<FormatElement *> elements,
525                               FormatElement *anchor) = 0;
526 
527   /// Mark 'element' as qualified. If 'element' cannot be qualified an error
528   /// should be emitted and failure returned.
529   virtual LogicalResult markQualified(llvm::SMLoc loc,
530                                       FormatElement *element) = 0;
531 
532   //===--------------------------------------------------------------------===//
533   // Lexer Utilities
534 
535   /// Emit an error at the given location.
emitError(llvm::SMLoc loc,const Twine & msg)536   LogicalResult emitError(llvm::SMLoc loc, const Twine &msg) {
537     lexer.emitError(loc, msg);
538     return failure();
539   }
540 
541   /// Emit an error and a note at the given notation.
emitErrorAndNote(llvm::SMLoc loc,const Twine & msg,const Twine & note)542   LogicalResult emitErrorAndNote(llvm::SMLoc loc, const Twine &msg,
543                                  const Twine &note) {
544     lexer.emitErrorAndNote(loc, msg, note);
545     return failure();
546   }
547 
548   /// Parse a single token of the expected kind.
parseToken(FormatToken::Kind kind,const Twine & msg)549   FailureOr<FormatToken> parseToken(FormatToken::Kind kind, const Twine &msg) {
550     if (!curToken.is(kind))
551       return emitError(curToken.getLoc(), msg);
552     FormatToken tok = curToken;
553     consumeToken();
554     return tok;
555   }
556 
557   /// Advance the lexer to the next token.
consumeToken()558   void consumeToken() {
559     assert(!curToken.is(FormatToken::eof) && !curToken.is(FormatToken::error) &&
560            "shouldn't advance past EOF or errors");
561     curToken = lexer.lexToken();
562   }
563 
564   /// Get the current token.
peekToken()565   FormatToken peekToken() { return curToken; }
566 
567 private:
568   /// The format parser retains ownership of the format elements in a bump
569   /// pointer allocator.
570   // FIXME: FormatElement with `std::vector` need to be converted to use
571   // trailing objects.
572   // llvm::BumpPtrAllocator allocator;
573   std::vector<std::unique_ptr<FormatElement>> allocator;
574   /// The format lexer to use.
575   FormatLexer lexer;
576   /// The current token in the lexer.
577   FormatToken curToken;
578 };
579 
580 //===----------------------------------------------------------------------===//
581 // Utility Functions
582 //===----------------------------------------------------------------------===//
583 
584 /// Whether a space needs to be emitted before a literal. E.g., two keywords
585 /// back-to-back require a space separator, but a keyword followed by '<' does
586 /// not require a space.
587 bool shouldEmitSpaceBefore(StringRef value, bool lastWasPunctuation);
588 
589 /// Returns true if the given string can be formatted as a keyword.
590 bool canFormatStringAsKeyword(StringRef value,
591                               function_ref<void(Twine)> emitError = nullptr);
592 
593 /// Returns true if the given string is a valid format literal element.
594 /// If `emitError` is provided, it is invoked with the reason for the failure.
595 bool isValidLiteral(StringRef value,
596                     function_ref<void(Twine)> emitError = nullptr);
597 
598 /// Whether a failure in parsing the assembly format should be a fatal error.
599 extern llvm::cl::opt<bool> formatErrorIsFatal;
600 
601 } // namespace tblgen
602 } // namespace mlir
603 
604 #endif // MLIR_TOOLS_MLIRTBLGEN_FORMATGEN_H_
605