xref: /llvm-project/llvm/lib/TableGen/TGLexer.h (revision 4e8c9d28132039a98feb97cec2759cddeb37d934)
1 //===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This class represents the Lexer for tablegen files.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H
14 #define LLVM_LIB_TABLEGEN_TGLEXER_H
15 
16 #include "llvm/ADT/SmallVector.h"
17 #include "llvm/ADT/StringRef.h"
18 #include "llvm/ADT/StringSet.h"
19 #include "llvm/Support/DataTypes.h"
20 #include "llvm/Support/SMLoc.h"
21 #include <cassert>
22 #include <memory>
23 #include <set>
24 #include <string>
25 
26 namespace llvm {
27 template <typename T> class ArrayRef;
28 class SourceMgr;
29 class Twine;
30 
namespace tgtok {
/// Kinds of tokens known to the TableGen lexer.
///
/// Several groups of enumerators are bracketed by *_FIRST / *_LAST aliases so
/// that the classification helpers below (isObjectStart, isBangOperator,
/// isStringValue) can be implemented as simple range checks.  Enumerators
/// must therefore stay inside their group and the groups must stay
/// contiguous.
enum TokKind {
  // Markers
  Eof,
  Error,

  // Tokens with no info.
  minus,     // -
  plus,      // +
  l_square,  // [
  r_square,  // ]
  l_brace,   // {
  r_brace,   // }
  l_paren,   // (
  r_paren,   // )
  less,      // <
  greater,   // >
  colon,     // :
  semi,      // ;
  comma,     // ,
  dot,       // .
  equal,     // =
  question,  // ?
  paste,     // #
  dotdotdot, // ...

  // Boolean literals.
  TrueVal,
  FalseVal,

  // Integer value.
  IntVal,

  // Binary constant.  Note that these are sized according to the number of
  // bits given.
  BinaryIntVal,

  // Preprocessing tokens for internal usage by the lexer.
  // They are never returned as a result of Lex().
  Ifdef,
  Ifndef,
  Else,
  Endif,
  Define,

  // Reserved keywords. ('ElseKW' is named to distinguish it from the
  // existing 'Else' that means the preprocessor #else.)
  Bit,
  Bits,
  Code,
  Dag,
  ElseKW,
  Field,
  In,
  Include,
  Int,
  List,
  String,
  Then,

  // Object start tokens.  The range [OBJECT_START_FIRST, OBJECT_START_LAST]
  // is what isObjectStart() tests against.
  OBJECT_START_FIRST,
  Assert = OBJECT_START_FIRST,
  Class,
  Def,
  Defm,
  Defset,
  Deftype,
  Defvar,
  Dump,
  Foreach,
  If,
  Let,
  MultiClass,
  OBJECT_START_LAST = MultiClass,

  // Bang operators.  The range [BANG_OPERATOR_FIRST, BANG_OPERATOR_LAST]
  // is what isBangOperator() tests against.
  BANG_OPERATOR_FIRST,
  XConcat = BANG_OPERATOR_FIRST,
  XADD,
  XSUB,
  XMUL,
  XDIV,
  XNOT,
  XLOG2,
  XAND,
  XOR,
  XXOR,
  XSRA,
  XSRL,
  XSHL,
  XListConcat,
  XListFlatten,
  XListSplat,
  XStrConcat,
  XInterleave,
  XSubstr,
  XFind,
  XCast,
  XSubst,
  XForEach,
  XFilter,
  XFoldl,
  XHead,
  XTail,
  XSize,
  XEmpty,
  XInitialized,
  XIf,
  XCond,
  XEq,
  XIsA,
  XDag,
  XNe,
  XLe,
  XLt,
  XGe,
  XGt,
  XSetDagOp,
  XGetDagOp,
  XExists,
  XListRemove,
  XToLower,
  XToUpper,
  XRange,
  XGetDagArg,
  XGetDagName,
  XSetDagArg,
  XSetDagName,
  XRepr,
  BANG_OPERATOR_LAST = XRepr,

  // String valued tokens.  The range [STRING_VALUE_FIRST, STRING_VALUE_LAST]
  // is what isStringValue() tests against.
  STRING_VALUE_FIRST,
  Id = STRING_VALUE_FIRST,
  StrVal,
  VarName,
  CodeFragment,
  STRING_VALUE_LAST = CodeFragment,
};

// The helpers below are plain `inline` (external linkage) rather than
// `static inline`.  A `static` function defined in a header is a distinct
// entity in every translation unit, so calling it from an inline function
// with external linkage makes that caller's definitions differ between
// translation units, which the one-definition rule forbids.

/// isBangOperator - Return true if this is a bang operator, i.e. \p Kind lies
/// in [BANG_OPERATOR_FIRST, BANG_OPERATOR_LAST].
inline bool isBangOperator(TokKind Kind) {
  return BANG_OPERATOR_FIRST <= Kind && Kind <= BANG_OPERATOR_LAST;
}

/// isObjectStart - Return true if this is a valid first token for a
/// statement, i.e. \p Kind lies in [OBJECT_START_FIRST, OBJECT_START_LAST].
inline bool isObjectStart(TokKind Kind) {
  return OBJECT_START_FIRST <= Kind && Kind <= OBJECT_START_LAST;
}

/// isStringValue - Return true if this is a string value, i.e. \p Kind lies
/// in [STRING_VALUE_FIRST, STRING_VALUE_LAST].
inline bool isStringValue(TokKind Kind) {
  return STRING_VALUE_FIRST <= Kind && Kind <= STRING_VALUE_LAST;
}
} // namespace tgtok
187 
188 /// TGLexer - TableGen Lexer class.
189 class TGLexer {
190   SourceMgr &SrcMgr;
191 
192   const char *CurPtr = nullptr;
193   StringRef CurBuf;
194 
195   // Information about the current token.
196   const char *TokStart = nullptr;
197   tgtok::TokKind CurCode = tgtok::TokKind::Eof;
198   std::string CurStrVal; // This is valid for Id, StrVal, VarName, CodeFragment
199   int64_t CurIntVal = 0; // This is valid for IntVal.
200 
201   /// CurBuffer - This is the current buffer index we're lexing from as managed
202   /// by the SourceMgr object.
203   unsigned CurBuffer = 0;
204 
205 public:
206   typedef std::set<std::string> DependenciesSetTy;
207 
208 private:
209   /// Dependencies - This is the list of all included files.
210   DependenciesSetTy Dependencies;
211 
212 public:
213   TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros);
214 
215   tgtok::TokKind Lex() {
216     return CurCode = LexToken(CurPtr == CurBuf.begin());
217   }
218 
219   const DependenciesSetTy &getDependencies() const {
220     return Dependencies;
221   }
222 
223   tgtok::TokKind getCode() const { return CurCode; }
224 
225   const std::string &getCurStrVal() const {
226     assert(tgtok::isStringValue(CurCode) &&
227            "This token doesn't have a string value");
228     return CurStrVal;
229   }
230   int64_t getCurIntVal() const {
231     assert(CurCode == tgtok::IntVal && "This token isn't an integer");
232     return CurIntVal;
233   }
234   std::pair<int64_t, unsigned> getCurBinaryIntVal() const {
235     assert(CurCode == tgtok::BinaryIntVal &&
236            "This token isn't a binary integer");
237     return {CurIntVal, (CurPtr - TokStart) - 2};
238   }
239 
240   SMLoc getLoc() const;
241   SMRange getLocRange() const;
242 
243 private:
244   /// LexToken - Read the next token and return its code.
245   tgtok::TokKind LexToken(bool FileOrLineStart = false);
246 
247   tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg);
248   tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg);
249 
250   int getNextChar();
251   int peekNextChar(int Index) const;
252   void SkipBCPLComment();
253   bool SkipCComment();
254   tgtok::TokKind LexIdentifier();
255   bool LexInclude();
256   tgtok::TokKind LexString();
257   tgtok::TokKind LexVarName();
258   tgtok::TokKind LexNumber();
259   tgtok::TokKind LexBracket();
260   tgtok::TokKind LexExclaim();
261 
262   // Process EOF encountered in LexToken().
263   // If EOF is met in an include file, then the method will update
264   // CurPtr, CurBuf and preprocessing include stack, and return true.
265   // If EOF is met in the top-level file, then the method will
266   // update and check the preprocessing include stack, and return false.
267   bool processEOF();
268 
269   // *** Structures and methods for preprocessing support ***
270 
271   // A set of macro names that are defined either via command line or
272   // by using:
273   //     #define NAME
274   StringSet<> DefinedMacros;
275 
276   // Each of #ifdef and #else directives has a descriptor associated
277   // with it.
278   //
279   // An ordered list of preprocessing controls defined by #ifdef/#else
280   // directives that are in effect currently is called preprocessing
281   // control stack.  It is represented as a vector of PreprocessorControlDesc's.
282   //
283   // The control stack is updated according to the following rules:
284   //
285   // For each #ifdef we add an element to the control stack.
286   // For each #else we replace the top element with a descriptor
287   // with an inverted IsDefined value.
288   // For each #endif we pop the top element from the control stack.
289   //
290   // When CurPtr reaches the current buffer's end, the control stack
291   // must be empty, i.e. #ifdef and the corresponding #endif
292   // must be located in the same file.
293   struct PreprocessorControlDesc {
294     // Either tgtok::Ifdef or tgtok::Else.
295     tgtok::TokKind Kind;
296 
297     // True, if the condition for this directive is true, false - otherwise.
298     // Examples:
299     //     #ifdef NAME       : true, if NAME is defined, false - otherwise.
300     //     ...
301     //     #else             : false, if NAME is defined, true - otherwise.
302     bool IsDefined;
303 
304     // Pointer into CurBuf to the beginning of the preprocessing directive
305     // word, e.g.:
306     //     #ifdef NAME
307     //      ^ - SrcPos
308     SMLoc SrcPos;
309   };
310 
311   // We want to disallow code like this:
312   //     file1.td:
313   //         #define NAME
314   //         #ifdef NAME
315   //         include "file2.td"
316   //     EOF
317   //     file2.td:
318   //         #endif
319   //     EOF
320   //
321   // To do this, we clear the preprocessing control stack on entry
322   // to each of the included file.  PrepIncludeStack is used to store
323   // preprocessing control stacks for the current file and all its
324   // parent files.  The back() element is the preprocessing control
325   // stack for the current file.
326   SmallVector<SmallVector<PreprocessorControlDesc>> PrepIncludeStack;
327 
328   // Validate that the current preprocessing control stack is empty,
329   // since we are about to exit a file, and pop the include stack.
330   //
331   // If IncludeStackMustBeEmpty is true, the include stack must be empty
332   // after the popping, otherwise, the include stack must not be empty
333   // after the popping.  Basically, the include stack must be empty
334   // only if we exit the "top-level" file (i.e. finish lexing).
335   //
336   // The method returns false, if the current preprocessing control stack
337   // is not empty (e.g. there is an unterminated #ifdef/#else),
338   // true - otherwise.
339   bool prepExitInclude(bool IncludeStackMustBeEmpty);
340 
341   // Look ahead for a preprocessing directive starting from CurPtr.  The caller
342   // must only call this method, if *(CurPtr - 1) is '#'.  If the method matches
343   // a preprocessing directive word followed by a whitespace, then it returns
344   // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define.
345   //
346   // CurPtr is not adjusted by this method.
347   tgtok::TokKind prepIsDirective() const;
348 
349   // Given a preprocessing token kind, adjusts CurPtr to the end
350   // of the preprocessing directive word.
351   //
352   // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective()
353   // to avoid adjusting CurPtr before we are sure that '#' is followed
354   // by a preprocessing directive.  If it is not, then we fall back to
355   // tgtok::paste interpretation of '#'.
356   void prepEatPreprocessorDirective(tgtok::TokKind Kind);
357 
358   // The main "exit" point from the token parsing to preprocessor.
359   //
360   // The method is called for CurPtr, when prepIsDirective() returns
361   // true.  The first parameter matches the result of prepIsDirective(),
362   // denoting the actual preprocessor directive to be processed.
363   //
364   // If the preprocessing directive disables the tokens processing, e.g.:
365   //     #ifdef NAME // NAME is undefined
366   // then lexPreprocessor() enters the lines-skipping mode.
367   // In this mode, it does not parse any tokens, because the code under
368   // the #ifdef may not even be a correct tablegen code.  The preprocessor
369   // looks for lines containing other preprocessing directives, which
370   // may be prepended with whitespaces and C-style comments.  If the line
371   // does not contain a preprocessing directive, it is skipped completely.
372   // Otherwise, the preprocessing directive is processed by recursively
373   // calling lexPreprocessor().  The processing of the encountered
374   // preprocessing directives includes updating preprocessing control stack
375   // and adding new macros into DefinedMacros set.
376   //
377   // The second parameter controls whether lexPreprocessor() is called from
378   // LexToken() (true) or recursively from lexPreprocessor() (false).
379   //
380   // If ReturnNextLiveToken is true, the method returns the next
381   // LEX token following the current directive or following the end
382   // of the disabled preprocessing region corresponding to this directive.
383   // If ReturnNextLiveToken is false, the method returns the first parameter,
384   // unless there were errors encountered in the disabled preprocessing
385   // region - in this case, it returns tgtok::Error.
386   tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind,
387                                  bool ReturnNextLiveToken = true);
388 
389   // Worker method for lexPreprocessor() to skip lines after some
390   // preprocessing directive up to the buffer end or to the directive
391   // that re-enables token processing.  The method returns true
392   // upon processing the next directive that re-enables tokens
393   // processing.  False is returned if an error was encountered.
394   //
395   // Note that prepSkipRegion() calls lexPreprocessor() to process
396   // encountered preprocessing directives.  In this case, the second
397   // parameter to lexPreprocessor() is set to false.  Being passed
398   // false ReturnNextLiveToken, lexPreprocessor() must never call
399   // prepSkipRegion().  We assert this by passing ReturnNextLiveToken
400   // to prepSkipRegion() and checking that it is never set to false.
401   bool prepSkipRegion(bool MustNeverBeFalse);
402 
403   // Lex name of the macro after either #ifdef or #define.  We could have used
404   // LexIdentifier(), but it has special handling of "include" word, which
405   // could result in awkward diagnostic errors.  Consider:
406   // ----
407   // #ifdef include
408   // class ...
409   // ----
410   // LexIdentifier() will engage LexInclude(), which will complain about
411   // missing file with name "class".  Instead, prepLexMacroName() will treat
412   // "include" as a normal macro name.
413   //
414   // On entry, CurPtr points to the end of a preprocessing directive word.
415   // The method allows for whitespaces between the preprocessing directive
416   // and the macro name.  The allowed whitespaces are ' ' and '\t'.
417   //
418   // If the first non-whitespace symbol after the preprocessing directive
419   // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then
420   // the method updates TokStart to the position of the first non-whitespace
421   // symbol, sets CurPtr to the position of the macro name's last symbol,
422   // and returns a string reference to the macro name.  Otherwise,
423   // TokStart is set to the first non-whitespace symbol after the preprocessing
424   // directive, and the method returns an empty string reference.
425   //
426   // In all cases, TokStart may be used to point to the word following
427   // the preprocessing directive.
428   StringRef prepLexMacroName();
429 
430   // Skip any whitespaces starting from CurPtr.  The method is used
431   // only in the lines-skipping mode to find the first non-whitespace
432   // symbol after or at CurPtr.  Allowed whitespaces are ' ', '\t', '\n'
433   // and '\r'.  The method skips C-style comments as well, because
434   // it is used to find the beginning of the preprocessing directive.
435   // If we do not handle C-style comments the following code would
436   // result in incorrect detection of a preprocessing directive:
437   //     /*
438   //     #ifdef NAME
439   //     */
440   // As long as we skip C-style comments, the following code is correctly
441   // recognized as a preprocessing directive:
442   //     /* first line comment
443   //        second line comment */ #ifdef NAME
444   //
445   // The method returns true upon reaching the first non-whitespace symbol
446   // or EOF, CurPtr is set to point to this symbol.  The method returns false,
447   // if an error occurred during skipping of a C-style comment.
448   bool prepSkipLineBegin();
449 
450   // Skip any whitespaces or comments after a preprocessing directive.
451   // The method returns true upon reaching either end of the line
452   // or end of the file.  If there is a multiline C-style comment
453   // after the preprocessing directive, the method skips
454   // the comment, so the final CurPtr may point to one of the next lines.
455   // The method returns false, if an error occurred during skipping
456   // C- or C++-style comment, or a non-whitespace symbol appears
457   // after the preprocessing directive.
458   //
459   // The method maybe called both during lines-skipping and tokens
460   // processing.  It actually verifies that only whitespaces or/and
461   // comments follow a preprocessing directive.
462   //
463   // After the execution of this mehod, CurPtr points either to new line
464   // symbol, buffer end or non-whitespace symbol following the preprocesing
465   // directive.
466   bool prepSkipDirectiveEnd();
467 
468   // Return true, if the current preprocessor control stack is such that
469   // we should allow lexer to process the next token, false - otherwise.
470   //
471   // In particular, the method returns true, if all the #ifdef/#else
472   // controls on the stack have their IsDefined member set to true.
473   bool prepIsProcessingEnabled();
474 
475   // Report an error, if we reach EOF with non-empty preprocessing control
476   // stack.  This means there is no matching #endif for the previous
477   // #ifdef/#else.
478   void prepReportPreprocessorStackError();
479 };
480 
481 } // end namespace llvm
482 
483 #endif
484