1 //===- TGLexer.h - Lexer for TableGen Files ---------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This class represents the Lexer for tablegen files. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #ifndef LLVM_LIB_TABLEGEN_TGLEXER_H 14 #define LLVM_LIB_TABLEGEN_TGLEXER_H 15 16 #include "llvm/ADT/ArrayRef.h" 17 #include "llvm/ADT/StringRef.h" 18 #include "llvm/ADT/StringSet.h" 19 #include "llvm/Support/DataTypes.h" 20 #include "llvm/Support/SMLoc.h" 21 #include <cassert> 22 #include <map> 23 #include <memory> 24 #include <string> 25 26 namespace llvm { 27 class SourceMgr; 28 class SMLoc; 29 class Twine; 30 31 namespace tgtok { 32 enum TokKind { 33 // Markers 34 Eof, Error, 35 36 // Tokens with no info. 37 minus, plus, // - + 38 l_square, r_square, // [ ] 39 l_brace, r_brace, // { } 40 l_paren, r_paren, // ( ) 41 less, greater, // < > 42 colon, semi, // : ; 43 comma, period, // , . 44 equal, question, // = ? 45 paste, // # 46 47 // Keywords. 48 Bit, Bits, Class, Code, Dag, Def, Foreach, Defm, Field, In, Int, Let, List, 49 MultiClass, String, Defset, 50 51 // !keywords. 52 XConcat, XADD, XMUL, XAND, XOR, XSRA, XSRL, XSHL, XListConcat, XListSplat, 53 XStrConcat, XCast, XSubst, XForEach, XFoldl, XHead, XTail, XSize, XEmpty, 54 XIf, XCond, XEq, XIsA, XDag, XNe, XLe, XLt, XGe, XGt, 55 56 // Integer value. 57 IntVal, 58 59 // Binary constant. Note that these are sized according to the number of 60 // bits given. 61 BinaryIntVal, 62 63 // String valued tokens. 64 Id, StrVal, VarName, CodeFragment, 65 66 // Preprocessing tokens for internal usage by the lexer. 67 // They are never returned as a result of Lex(). 68 Ifdef, Ifndef, Else, Endif, Define 69 }; 70 } 71 72 /// TGLexer - TableGen Lexer class. 73 class TGLexer { 74 SourceMgr &SrcMgr; 75 76 const char *CurPtr; 77 StringRef CurBuf; 78 79 // Information about the current token. 80 const char *TokStart; 81 tgtok::TokKind CurCode; 82 std::string CurStrVal; // This is valid for ID, STRVAL, VARNAME, CODEFRAGMENT 83 int64_t CurIntVal; // This is valid for INTVAL. 84 85 /// CurBuffer - This is the current buffer index we're lexing from as managed 86 /// by the SourceMgr object. 87 unsigned CurBuffer; 88 89 public: 90 typedef std::map<std::string, SMLoc> DependenciesMapTy; 91 private: 92 /// Dependencies - This is the list of all included files. 93 DependenciesMapTy Dependencies; 94 95 public: 96 TGLexer(SourceMgr &SrcMgr, ArrayRef<std::string> Macros); 97 98 tgtok::TokKind Lex() { 99 return CurCode = LexToken(CurPtr == CurBuf.begin()); 100 } 101 102 const DependenciesMapTy &getDependencies() const { 103 return Dependencies; 104 } 105 106 tgtok::TokKind getCode() const { return CurCode; } 107 108 const std::string &getCurStrVal() const { 109 assert((CurCode == tgtok::Id || CurCode == tgtok::StrVal || 110 CurCode == tgtok::VarName || CurCode == tgtok::CodeFragment) && 111 "This token doesn't have a string value"); 112 return CurStrVal; 113 } 114 int64_t getCurIntVal() const { 115 assert(CurCode == tgtok::IntVal && "This token isn't an integer"); 116 return CurIntVal; 117 } 118 std::pair<int64_t, unsigned> getCurBinaryIntVal() const { 119 assert(CurCode == tgtok::BinaryIntVal && 120 "This token isn't a binary integer"); 121 return std::make_pair(CurIntVal, (CurPtr - TokStart)-2); 122 } 123 124 SMLoc getLoc() const; 125 126 private: 127 /// LexToken - Read the next token and return its code. 128 tgtok::TokKind LexToken(bool FileOrLineStart = false); 129 130 tgtok::TokKind ReturnError(SMLoc Loc, const Twine &Msg); 131 tgtok::TokKind ReturnError(const char *Loc, const Twine &Msg); 132 133 int getNextChar(); 134 int peekNextChar(int Index) const; 135 void SkipBCPLComment(); 136 bool SkipCComment(); 137 tgtok::TokKind LexIdentifier(); 138 bool LexInclude(); 139 tgtok::TokKind LexString(); 140 tgtok::TokKind LexVarName(); 141 tgtok::TokKind LexNumber(); 142 tgtok::TokKind LexBracket(); 143 tgtok::TokKind LexExclaim(); 144 145 // Process EOF encountered in LexToken(). 146 // If EOF is met in an include file, then the method will update 147 // CurPtr, CurBuf and preprocessing include stack, and return true. 148 // If EOF is met in the top-level file, then the method will 149 // update and check the preprocessing include stack, and return false. 150 bool processEOF(); 151 152 // *** Structures and methods for preprocessing support *** 153 154 // A set of macro names that are defined either via command line or 155 // by using: 156 // #define NAME 157 StringSet<> DefinedMacros; 158 159 // Each of #ifdef and #else directives has a descriptor associated 160 // with it. 161 // 162 // An ordered list of preprocessing controls defined by #ifdef/#else 163 // directives that are in effect currently is called preprocessing 164 // control stack. It is represented as a vector of PreprocessorControlDesc's. 165 // 166 // The control stack is updated according to the following rules: 167 // 168 // For each #ifdef we add an element to the control stack. 169 // For each #else we replace the top element with a descriptor 170 // with an inverted IsDefined value. 171 // For each #endif we pop the top element from the control stack. 172 // 173 // When CurPtr reaches the current buffer's end, the control stack 174 // must be empty, i.e. #ifdef and the corresponding #endif 175 // must be located in the same file. 176 struct PreprocessorControlDesc { 177 // Either tgtok::Ifdef or tgtok::Else. 178 tgtok::TokKind Kind; 179 180 // True, if the condition for this directive is true, false - otherwise. 181 // Examples: 182 // #ifdef NAME : true, if NAME is defined, false - otherwise. 183 // ... 184 // #else : false, if NAME is defined, true - otherwise. 185 bool IsDefined; 186 187 // Pointer into CurBuf to the beginning of the preprocessing directive 188 // word, e.g.: 189 // #ifdef NAME 190 // ^ - SrcPos 191 SMLoc SrcPos; 192 }; 193 194 // We want to disallow code like this: 195 // file1.td: 196 // #define NAME 197 // #ifdef NAME 198 // include "file2.td" 199 // EOF 200 // file2.td: 201 // #endif 202 // EOF 203 // 204 // To do this, we clear the preprocessing control stack on entry 205 // to each of the included file. PrepIncludeStack is used to store 206 // preprocessing control stacks for the current file and all its 207 // parent files. The back() element is the preprocessing control 208 // stack for the current file. 209 std::vector<std::unique_ptr<std::vector<PreprocessorControlDesc>>> 210 PrepIncludeStack; 211 212 // Validate that the current preprocessing control stack is empty, 213 // since we are about to exit a file, and pop the include stack. 214 // 215 // If IncludeStackMustBeEmpty is true, the include stack must be empty 216 // after the popping, otherwise, the include stack must not be empty 217 // after the popping. Basically, the include stack must be empty 218 // only if we exit the "top-level" file (i.e. finish lexing). 219 // 220 // The method returns false, if the current preprocessing control stack 221 // is not empty (e.g. there is an unterminated #ifdef/#else), 222 // true - otherwise. 223 bool prepExitInclude(bool IncludeStackMustBeEmpty); 224 225 // Look ahead for a preprocessing directive starting from CurPtr. The caller 226 // must only call this method, if *(CurPtr - 1) is '#'. If the method matches 227 // a preprocessing directive word followed by a whitespace, then it returns 228 // one of the internal token kinds, i.e. Ifdef, Else, Endif, Define. 229 // 230 // CurPtr is not adjusted by this method. 231 tgtok::TokKind prepIsDirective() const; 232 233 // Given a preprocessing token kind, adjusts CurPtr to the end 234 // of the preprocessing directive word. Returns true, unless 235 // an unsupported token kind is passed in. 236 // 237 // We use look-ahead prepIsDirective() and prepEatPreprocessorDirective() 238 // to avoid adjusting CurPtr before we are sure that '#' is followed 239 // by a preprocessing directive. If it is not, then we fall back to 240 // tgtok::paste interpretation of '#'. 241 bool prepEatPreprocessorDirective(tgtok::TokKind Kind); 242 243 // The main "exit" point from the token parsing to preprocessor. 244 // 245 // The method is called for CurPtr, when prepIsDirective() returns 246 // true. The first parameter matches the result of prepIsDirective(), 247 // denoting the actual preprocessor directive to be processed. 248 // 249 // If the preprocessing directive disables the tokens processing, e.g.: 250 // #ifdef NAME // NAME is undefined 251 // then lexPreprocessor() enters the lines-skipping mode. 252 // In this mode, it does not parse any tokens, because the code under 253 // the #ifdef may not even be a correct tablegen code. The preprocessor 254 // looks for lines containing other preprocessing directives, which 255 // may be prepended with whitespaces and C-style comments. If the line 256 // does not contain a preprocessing directive, it is skipped completely. 257 // Otherwise, the preprocessing directive is processed by recursively 258 // calling lexPreprocessor(). The processing of the encountered 259 // preprocessing directives includes updating preprocessing control stack 260 // and adding new macros into DefinedMacros set. 261 // 262 // The second parameter controls whether lexPreprocessor() is called from 263 // LexToken() (true) or recursively from lexPreprocessor() (false). 264 // 265 // If ReturnNextLiveToken is true, the method returns the next 266 // LEX token following the current directive or following the end 267 // of the disabled preprocessing region corresponding to this directive. 268 // If ReturnNextLiveToken is false, the method returns the first parameter, 269 // unless there were errors encountered in the disabled preprocessing 270 // region - in this case, it returns tgtok::Error. 271 tgtok::TokKind lexPreprocessor(tgtok::TokKind Kind, 272 bool ReturnNextLiveToken = true); 273 274 // Worker method for lexPreprocessor() to skip lines after some 275 // preprocessing directive up to the buffer end or to the directive 276 // that re-enables token processing. The method returns true 277 // upon processing the next directive that re-enables tokens 278 // processing. False is returned if an error was encountered. 279 // 280 // Note that prepSkipRegion() calls lexPreprocessor() to process 281 // encountered preprocessing directives. In this case, the second 282 // parameter to lexPreprocessor() is set to false. Being passed 283 // false ReturnNextLiveToken, lexPreprocessor() must never call 284 // prepSkipRegion(). We assert this by passing ReturnNextLiveToken 285 // to prepSkipRegion() and checking that it is never set to false. 286 bool prepSkipRegion(bool MustNeverBeFalse); 287 288 // Lex name of the macro after either #ifdef or #define. We could have used 289 // LexIdentifier(), but it has special handling of "include" word, which 290 // could result in awkward diagnostic errors. Consider: 291 // ---- 292 // #ifdef include 293 // class ... 294 // ---- 295 // LexIdentifier() will engage LexInclude(), which will complain about 296 // missing file with name "class". Instead, prepLexMacroName() will treat 297 // "include" as a normal macro name. 298 // 299 // On entry, CurPtr points to the end of a preprocessing directive word. 300 // The method allows for whitespaces between the preprocessing directive 301 // and the macro name. The allowed whitespaces are ' ' and '\t'. 302 // 303 // If the first non-whitespace symbol after the preprocessing directive 304 // is a valid start symbol for an identifier (i.e. [a-zA-Z_]), then 305 // the method updates TokStart to the position of the first non-whitespace 306 // symbol, sets CurPtr to the position of the macro name's last symbol, 307 // and returns a string reference to the macro name. Otherwise, 308 // TokStart is set to the first non-whitespace symbol after the preprocessing 309 // directive, and the method returns an empty string reference. 310 // 311 // In all cases, TokStart may be used to point to the word following 312 // the preprocessing directive. 313 StringRef prepLexMacroName(); 314 315 // Skip any whitespaces starting from CurPtr. The method is used 316 // only in the lines-skipping mode to find the first non-whitespace 317 // symbol after or at CurPtr. Allowed whitespaces are ' ', '\t', '\n' 318 // and '\r'. The method skips C-style comments as well, because 319 // it is used to find the beginning of the preprocessing directive. 320 // If we do not handle C-style comments the following code would 321 // result in incorrect detection of a preprocessing directive: 322 // /* 323 // #ifdef NAME 324 // */ 325 // As long as we skip C-style comments, the following code is correctly 326 // recognized as a preprocessing directive: 327 // /* first line comment 328 // second line comment */ #ifdef NAME 329 // 330 // The method returns true upon reaching the first non-whitespace symbol 331 // or EOF, CurPtr is set to point to this symbol. The method returns false, 332 // if an error occured during skipping of a C-style comment. 333 bool prepSkipLineBegin(); 334 335 // Skip any whitespaces or comments after a preprocessing directive. 336 // The method returns true upon reaching either end of the line 337 // or end of the file. If there is a multiline C-style comment 338 // after the preprocessing directive, the method skips 339 // the comment, so the final CurPtr may point to one of the next lines. 340 // The method returns false, if an error occured during skipping 341 // C- or C++-style comment, or a non-whitespace symbol appears 342 // after the preprocessing directive. 343 // 344 // The method maybe called both during lines-skipping and tokens 345 // processing. It actually verifies that only whitespaces or/and 346 // comments follow a preprocessing directive. 347 // 348 // After the execution of this mehod, CurPtr points either to new line 349 // symbol, buffer end or non-whitespace symbol following the preprocesing 350 // directive. 351 bool prepSkipDirectiveEnd(); 352 353 // Skip all symbols to the end of the line/file. 354 // The method adjusts CurPtr, so that it points to either new line 355 // symbol in the current line or the buffer end. 356 void prepSkipToLineEnd(); 357 358 // Return true, if the current preprocessor control stack is such that 359 // we should allow lexer to process the next token, false - otherwise. 360 // 361 // In particular, the method returns true, if all the #ifdef/#else 362 // controls on the stack have their IsDefined member set to true. 363 bool prepIsProcessingEnabled(); 364 365 // Report an error, if we reach EOF with non-empty preprocessing control 366 // stack. This means there is no matching #endif for the previous 367 // #ifdef/#else. 368 void prepReportPreprocessorStackError(); 369 }; 370 371 } // end namespace llvm 372 373 #endif 374