1 //===- Lexer.h - Lexer for the Toy language -------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements a simple Lexer for the Toy language. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #ifndef TOY_LEXER_H 14 #define TOY_LEXER_H 15 16 #include "llvm/ADT/StringRef.h" 17 18 #include <cstdlib> 19 #include <memory> 20 #include <string> 21 22 namespace toy { 23 24 /// Structure definition a location in a file. 25 struct Location { 26 std::shared_ptr<std::string> file; ///< filename. 27 int line; ///< line number. 28 int col; ///< column number. 29 }; 30 31 // List of Token returned by the lexer. 32 enum Token : int { 33 tok_semicolon = ';', 34 tok_parenthese_open = '(', 35 tok_parenthese_close = ')', 36 tok_bracket_open = '{', 37 tok_bracket_close = '}', 38 tok_sbracket_open = '[', 39 tok_sbracket_close = ']', 40 41 tok_eof = -1, 42 43 // commands 44 tok_return = -2, 45 tok_var = -3, 46 tok_def = -4, 47 48 // primary 49 tok_identifier = -5, 50 tok_number = -6, 51 }; 52 53 /// The Lexer is an abstract base class providing all the facilities that the 54 /// Parser expects. It goes through the stream one token at a time and keeps 55 /// track of the location in the file for debugging purpose. 56 /// It relies on a subclass to provide a `readNextLine()` method. The subclass 57 /// can proceed by reading the next line from the standard input or from a 58 /// memory mapped file. 59 class Lexer { 60 public: 61 /// Create a lexer for the given filename. The filename is kept only for 62 /// debugging purpose (attaching a location to a Token). 63 Lexer(std::string filename) 64 : lastLocation( 65 {std::make_shared<std::string>(std::move(filename)), 0, 0}) {} 66 virtual ~Lexer() = default; 67 68 /// Look at the current token in the stream. 69 Token getCurToken() { return curTok; } 70 71 /// Move to the next token in the stream and return it. 72 Token getNextToken() { return curTok = getTok(); } 73 74 /// Move to the next token in the stream, asserting on the current token 75 /// matching the expectation. 76 void consume(Token tok) { 77 assert(tok == curTok && "consume Token mismatch expectation"); 78 getNextToken(); 79 } 80 81 /// Return the current identifier (prereq: getCurToken() == tok_identifier) 82 llvm::StringRef getId() { 83 assert(curTok == tok_identifier); 84 return identifierStr; 85 } 86 87 /// Return the current number (prereq: getCurToken() == tok_number) 88 double getValue() { 89 assert(curTok == tok_number); 90 return numVal; 91 } 92 93 /// Return the location for the beginning of the current token. 94 Location getLastLocation() { return lastLocation; } 95 96 // Return the current line in the file. 97 int getLine() { return curLineNum; } 98 99 // Return the current column in the file. 100 int getCol() { return curCol; } 101 102 private: 103 /// Delegate to a derived class fetching the next line. Returns an empty 104 /// string to signal end of file (EOF). Lines are expected to always finish 105 /// with "\n" 106 virtual llvm::StringRef readNextLine() = 0; 107 108 /// Return the next character from the stream. This manages the buffer for the 109 /// current line and request the next line buffer to the derived class as 110 /// needed. 111 int getNextChar() { 112 // The current line buffer should not be empty unless it is the end of file. 113 if (curLineBuffer.empty()) 114 return EOF; 115 ++curCol; 116 auto nextchar = curLineBuffer.front(); 117 curLineBuffer = curLineBuffer.drop_front(); 118 if (curLineBuffer.empty()) 119 curLineBuffer = readNextLine(); 120 if (nextchar == '\n') { 121 ++curLineNum; 122 curCol = 0; 123 } 124 return nextchar; 125 } 126 127 /// Return the next token from standard input. 128 Token getTok() { 129 // Skip any whitespace. 130 while (isspace(lastChar)) 131 lastChar = Token(getNextChar()); 132 133 // Save the current location before reading the token characters. 134 lastLocation.line = curLineNum; 135 lastLocation.col = curCol; 136 137 // Identifier: [a-zA-Z][a-zA-Z0-9_]* 138 if (isalpha(lastChar)) { 139 identifierStr = (char)lastChar; 140 while (isalnum((lastChar = Token(getNextChar()))) || lastChar == '_') 141 identifierStr += (char)lastChar; 142 143 if (identifierStr == "return") 144 return tok_return; 145 if (identifierStr == "def") 146 return tok_def; 147 if (identifierStr == "var") 148 return tok_var; 149 return tok_identifier; 150 } 151 152 // Number: [0-9.]+ 153 if (isdigit(lastChar) || lastChar == '.') { 154 std::string numStr; 155 do { 156 numStr += lastChar; 157 lastChar = Token(getNextChar()); 158 } while (isdigit(lastChar) || lastChar == '.'); 159 160 numVal = strtod(numStr.c_str(), nullptr); 161 return tok_number; 162 } 163 164 if (lastChar == '#') { 165 // Comment until end of line. 166 do { 167 lastChar = Token(getNextChar()); 168 } while (lastChar != EOF && lastChar != '\n' && lastChar != '\r'); 169 170 if (lastChar != EOF) 171 return getTok(); 172 } 173 174 // Check for end of file. Don't eat the EOF. 175 if (lastChar == EOF) 176 return tok_eof; 177 178 // Otherwise, just return the character as its ascii value. 179 Token thisChar = Token(lastChar); 180 lastChar = Token(getNextChar()); 181 return thisChar; 182 } 183 184 /// The last token read from the input. 185 Token curTok = tok_eof; 186 187 /// Location for `curTok`. 188 Location lastLocation; 189 190 /// If the current Token is an identifier, this string contains the value. 191 std::string identifierStr; 192 193 /// If the current Token is a number, this contains the value. 194 double numVal = 0; 195 196 /// The last value returned by getNextChar(). We need to keep it around as we 197 /// always need to read ahead one character to decide when to end a token and 198 /// we can't put it back in the stream after reading from it. 199 Token lastChar = Token(' '); 200 201 /// Keep track of the current line number in the input stream 202 int curLineNum = 0; 203 204 /// Keep track of the current column number in the input stream 205 int curCol = 0; 206 207 /// Buffer supplied by the derived class on calls to `readNextLine()` 208 llvm::StringRef curLineBuffer = "\n"; 209 }; 210 211 /// A lexer implementation operating on a buffer in memory. 212 class LexerBuffer final : public Lexer { 213 public: 214 LexerBuffer(const char *begin, const char *end, std::string filename) 215 : Lexer(std::move(filename)), current(begin), end(end) {} 216 217 private: 218 /// Provide one line at a time to the Lexer, return an empty string when 219 /// reaching the end of the buffer. 220 llvm::StringRef readNextLine() override { 221 auto *begin = current; 222 while (current <= end && *current && *current != '\n') 223 ++current; 224 if (current <= end && *current) 225 ++current; 226 llvm::StringRef result{begin, static_cast<size_t>(current - begin)}; 227 return result; 228 } 229 const char *current, *end; 230 }; 231 } // namespace toy 232 233 #endif // TOY_LEXER_H 234