xref: /llvm-project/mlir/examples/toy/Ch2/include/toy/Lexer.h (revision 8ab50da589fd2692052dcb85edf06d1d2d8da42c)
1 //===- Lexer.h - Lexer for the Toy language -------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements a simple Lexer for the Toy language.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #ifndef TOY_LEXER_H
14 #define TOY_LEXER_H
15 
#include "llvm/ADT/StringRef.h"

#include <cassert>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <memory>
#include <string>
#include <utility>
21 
22 namespace toy {
23 
/// Structure describing a location in a file.
///
/// `line` and `col` default to 0 so that a default-constructed Location never
/// carries indeterminate values. The struct remains an aggregate (C++14 and
/// later), so `Location{file, line, col}` keeps working.
struct Location {
  std::shared_ptr<std::string> file; ///< filename.
  int line = 0;                      ///< line number.
  int col = 0;                       ///< column number.
};
30 
/// Kinds of token handed out by the lexer.
///
/// Named multi-character tokens use negative values; single-character
/// punctuation tokens use the value of the character itself.
enum Token : int {
  // End of input.
  tok_eof = -1,

  // Keywords (commands).
  tok_return = -2,
  tok_var = -3,
  tok_def = -4,

  // Primary expressions.
  tok_identifier = -5,
  tok_number = -6,

  // Punctuation, valued as the corresponding character.
  tok_parenthese_open = '(',
  tok_parenthese_close = ')',
  tok_semicolon = ';',
  tok_sbracket_open = '[',
  tok_sbracket_close = ']',
  tok_bracket_open = '{',
  tok_bracket_close = '}',
};
52 
53 /// The Lexer is an abstract base class providing all the facilities that the
54 /// Parser expects. It goes through the stream one token at a time and keeps
55 /// track of the location in the file for debugging purpose.
56 /// It relies on a subclass to provide a `readNextLine()` method. The subclass
57 /// can proceed by reading the next line from the standard input or from a
58 /// memory mapped file.
59 class Lexer {
60 public:
61   /// Create a lexer for the given filename. The filename is kept only for
62   /// debugging purpose (attaching a location to a Token).
63   Lexer(std::string filename)
64       : lastLocation(
65             {std::make_shared<std::string>(std::move(filename)), 0, 0}) {}
66   virtual ~Lexer() = default;
67 
68   /// Look at the current token in the stream.
69   Token getCurToken() { return curTok; }
70 
71   /// Move to the next token in the stream and return it.
72   Token getNextToken() { return curTok = getTok(); }
73 
74   /// Move to the next token in the stream, asserting on the current token
75   /// matching the expectation.
76   void consume(Token tok) {
77     assert(tok == curTok && "consume Token mismatch expectation");
78     getNextToken();
79   }
80 
81   /// Return the current identifier (prereq: getCurToken() == tok_identifier)
82   llvm::StringRef getId() {
83     assert(curTok == tok_identifier);
84     return identifierStr;
85   }
86 
87   /// Return the current number (prereq: getCurToken() == tok_number)
88   double getValue() {
89     assert(curTok == tok_number);
90     return numVal;
91   }
92 
93   /// Return the location for the beginning of the current token.
94   Location getLastLocation() { return lastLocation; }
95 
96   // Return the current line in the file.
97   int getLine() { return curLineNum; }
98 
99   // Return the current column in the file.
100   int getCol() { return curCol; }
101 
102 private:
103   /// Delegate to a derived class fetching the next line. Returns an empty
104   /// string to signal end of file (EOF). Lines are expected to always finish
105   /// with "\n"
106   virtual llvm::StringRef readNextLine() = 0;
107 
108   /// Return the next character from the stream. This manages the buffer for the
109   /// current line and request the next line buffer to the derived class as
110   /// needed.
111   int getNextChar() {
112     // The current line buffer should not be empty unless it is the end of file.
113     if (curLineBuffer.empty())
114       return EOF;
115     ++curCol;
116     auto nextchar = curLineBuffer.front();
117     curLineBuffer = curLineBuffer.drop_front();
118     if (curLineBuffer.empty())
119       curLineBuffer = readNextLine();
120     if (nextchar == '\n') {
121       ++curLineNum;
122       curCol = 0;
123     }
124     return nextchar;
125   }
126 
127   ///  Return the next token from standard input.
128   Token getTok() {
129     // Skip any whitespace.
130     while (isspace(lastChar))
131       lastChar = Token(getNextChar());
132 
133     // Save the current location before reading the token characters.
134     lastLocation.line = curLineNum;
135     lastLocation.col = curCol;
136 
137     // Identifier: [a-zA-Z][a-zA-Z0-9_]*
138     if (isalpha(lastChar)) {
139       identifierStr = (char)lastChar;
140       while (isalnum((lastChar = Token(getNextChar()))) || lastChar == '_')
141         identifierStr += (char)lastChar;
142 
143       if (identifierStr == "return")
144         return tok_return;
145       if (identifierStr == "def")
146         return tok_def;
147       if (identifierStr == "var")
148         return tok_var;
149       return tok_identifier;
150     }
151 
152     // Number: [0-9.]+
153     if (isdigit(lastChar) || lastChar == '.') {
154       std::string numStr;
155       do {
156         numStr += lastChar;
157         lastChar = Token(getNextChar());
158       } while (isdigit(lastChar) || lastChar == '.');
159 
160       numVal = strtod(numStr.c_str(), nullptr);
161       return tok_number;
162     }
163 
164     if (lastChar == '#') {
165       // Comment until end of line.
166       do {
167         lastChar = Token(getNextChar());
168       } while (lastChar != EOF && lastChar != '\n' && lastChar != '\r');
169 
170       if (lastChar != EOF)
171         return getTok();
172     }
173 
174     // Check for end of file.  Don't eat the EOF.
175     if (lastChar == EOF)
176       return tok_eof;
177 
178     // Otherwise, just return the character as its ascii value.
179     Token thisChar = Token(lastChar);
180     lastChar = Token(getNextChar());
181     return thisChar;
182   }
183 
184   /// The last token read from the input.
185   Token curTok = tok_eof;
186 
187   /// Location for `curTok`.
188   Location lastLocation;
189 
190   /// If the current Token is an identifier, this string contains the value.
191   std::string identifierStr;
192 
193   /// If the current Token is a number, this contains the value.
194   double numVal = 0;
195 
196   /// The last value returned by getNextChar(). We need to keep it around as we
197   /// always need to read ahead one character to decide when to end a token and
198   /// we can't put it back in the stream after reading from it.
199   Token lastChar = Token(' ');
200 
201   /// Keep track of the current line number in the input stream
202   int curLineNum = 0;
203 
204   /// Keep track of the current column number in the input stream
205   int curCol = 0;
206 
207   /// Buffer supplied by the derived class on calls to `readNextLine()`
208   llvm::StringRef curLineBuffer = "\n";
209 };
210 
211 /// A lexer implementation operating on a buffer in memory.
212 class LexerBuffer final : public Lexer {
213 public:
214   LexerBuffer(const char *begin, const char *end, std::string filename)
215       : Lexer(std::move(filename)), current(begin), end(end) {}
216 
217 private:
218   /// Provide one line at a time to the Lexer, return an empty string when
219   /// reaching the end of the buffer.
220   llvm::StringRef readNextLine() override {
221     auto *begin = current;
222     while (current <= end && *current && *current != '\n')
223       ++current;
224     if (current <= end && *current)
225       ++current;
226     llvm::StringRef result{begin, static_cast<size_t>(current - begin)};
227     return result;
228   }
229   const char *current, *end;
230 };
231 } // namespace toy
232 
233 #endif // TOY_LEXER_H
234