//===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===---------------------------------------------------------------------===//
//
// This file implements an interface defined in ResourceScriptToken.h.
// In particular, it defines an .rc script tokenizer.
//
//===---------------------------------------------------------------------===//

#include "ResourceScriptToken.h"
#include "llvm/Support/raw_ostream.h"

#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstdlib>
#include <utility>

using namespace llvm;

using Kind = RCToken::Kind;

// Checks if Representation is a correct description of an RC integer.
// It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
// or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
// character (this is what distinguishes our representation from the one
// StringRef understands). If Representation is correct, 'true' is returned
// and the parsed value is stored in Num.
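// For example, "120", "0x1a" and "037L" are all correct representations,
// while "0x" and "9A" are not.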
static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
  size_t Length = Representation.size();
  if (Length == 0)
    return false;
  // Strip the trailing 'L', if present; getAsInteger() does not accept it.
  if (std::toupper(Representation.back()) == 'L')
    Representation = Representation.drop_back(1);

  return !Representation.getAsInteger<uint32_t>(0, Num);
}

RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
    : TokenKind(RCTokenKind), TokenValue(Value) {}

uint32_t RCToken::intValue() const {
  assert(TokenKind == Kind::Int);
  // We assume that the token is already a correct integer (this has been
  // checked by rcGetAsInteger).
  uint32_t Result;
  bool IsSuccess = rcGetAsInteger(TokenValue, Result);
  assert(IsSuccess);
  (void)IsSuccess;  // Silence the compiler warning when the -DNDEBUG flag is on.
  return Result;
}

bool RCToken::isLongInt() const {
  return TokenKind == Kind::Int && std::toupper(TokenValue.back()) == 'L';
}

StringRef RCToken::value() const { return TokenValue; }

Kind RCToken::kind() const { return TokenKind; }

bool RCToken::isBinaryOp() const {
  switch (TokenKind) {
  case Kind::Plus:
  case Kind::Minus:
  case Kind::Pipe:
  case Kind::Amp:
    return true;
  default:
    return false;
  }
}

static Error getStringError(const Twine &message) {
  return make_error<StringError>("Error parsing file: " + message,
                                 inconvertibleErrorCode());
}

namespace {

class Tokenizer {
public:
  Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()) {}

  Expected<std::vector<RCToken>> run();

private:
  // All 'advancing' methods return a boolean value; false means that the
  // stream has ended or the operation failed.
  bool advance(size_t Amount = 1);
  bool skipWhitespaces();

  // Consumes a token. If any problem occurs, a non-empty Error is returned.
  Error consumeToken(const Kind TokenKind);

  // Check if the tokenizer is about to read FollowingChars.
  bool willNowRead(StringRef FollowingChars) const;

  // Check if the tokenizer can start reading an identifier at the current
  // position. The original tool did not specify the rules that determine
  // what a correct identifier is. We assume identifiers follow the C
  // convention: [a-zA-Z_][a-zA-Z0-9_]*.
  bool canStartIdentifier() const;
  // Check if the tokenizer can continue reading an identifier.
  bool canContinueIdentifier() const;

  // Check if the tokenizer can start reading an integer.
  // A correct integer always starts with a 0-9 digit and can contain the
  // characters 0-9A-Fa-f (digits), Ll (marking that the integer is 32-bit)
  // and Xx (marking that the representation is hexadecimal). As some kind
  // of separator should come after the integer, we can keep consuming
  // characters until a non-alphanumeric one is found.
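  // For example, "0x1FL" is consumed here as a single Int token, while a
  // malformed sequence such as "1A" is also consumed as an Int token and
  // only rejected later, by rcGetAsInteger() called from run().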
  bool canStartInt() const;
  bool canContinueInt() const;

  bool canStartString() const;

  bool streamEof() const;

  // Classify the token that is about to be read from the current position.
  Kind classifyCurrentToken() const;

  // Process the Kind::Identifier token - check if it is an identifier
  // describing a block start or end.
  void processIdentifier(RCToken &Token) const;

  StringRef Data;
  size_t DataLength, Pos;
};

Expected<std::vector<RCToken>> Tokenizer::run() {
  Pos = 0;
  std::vector<RCToken> Result;

  // Consume an optional UTF-8 Byte Order Mark.
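  // (The byte sequence 0xEF 0xBB 0xBF is the UTF-8 encoding of U+FEFF.)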
  if (willNowRead("\xef\xbb\xbf"))
    advance(3);

  while (!streamEof()) {
    if (!skipWhitespaces())
      break;

    Kind TokenKind = classifyCurrentToken();
    if (TokenKind == Kind::Invalid)
      return getStringError("Invalid token found at position " + Twine(Pos));

    const size_t TokenStart = Pos;
    if (Error TokenError = consumeToken(TokenKind))
      return std::move(TokenError);

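    // The token text is the half-open slice [TokenStart, Pos) of the input.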
    RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
    if (TokenKind == Kind::Identifier) {
      processIdentifier(Token);
    } else if (TokenKind == Kind::Int) {
      uint32_t TokenInt;
      if (!rcGetAsInteger(Token.value(), TokenInt)) {
        // The integer has an incorrect format or cannot be represented as
        // a 32-bit unsigned integer.
        return getStringError("Integer invalid or too large: " +
                              Token.value().str());
      }
    }

    Result.push_back(Token);
  }

  return Result;
}

bool Tokenizer::advance(size_t Amount) {
  Pos += Amount;
  return !streamEof();
}

bool Tokenizer::skipWhitespaces() {
  while (!streamEof() && std::isspace(Data[Pos]))
    advance();
  return !streamEof();
}

Error Tokenizer::consumeToken(const Kind TokenKind) {
  switch (TokenKind) {
  // One-character token consumption.
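  // ResourceScriptTokenList.h expands each SHORT_TOKEN(Name, Ch) entry into
  // a 'case Kind::Name:' label, while TOKEN(Name) expands to nothing, so
  // only the one-character kinds share the advance() below.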
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch) case Kind::Name:
#include "ResourceScriptTokenList.h"
#undef TOKEN
#undef SHORT_TOKEN
    advance();
    return Error::success();

  case Kind::Identifier:
    while (!streamEof() && canContinueIdentifier())
      advance();
    return Error::success();

  case Kind::Int:
    while (!streamEof() && canContinueInt())
      advance();
    return Error::success();

  case Kind::String:
    // Consume the preceding 'L', if there is any.
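    // (In .rc scripts, the 'L' prefix marks a wide-character string literal,
    // as it does in C.)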
    if (std::toupper(Data[Pos]) == 'L')
      advance();
    // Consume the double-quote.
    advance();

    // Consume the characters until the end of the file, line or string.
    while (true) {
      if (streamEof()) {
        return getStringError("Unterminated string literal.");
      } else if (Data[Pos] == '"') {
        // Consume the ending double-quote.
        advance();
        return Error::success();
      } else if (Data[Pos] == '\n') {
        return getStringError("String literal not terminated in the line.");
      }

      advance();
    }

  case Kind::Invalid:
    assert(false && "Cannot consume an invalid token.");
  }

  llvm_unreachable("Unknown RCToken::Kind");
}

bool Tokenizer::willNowRead(StringRef FollowingChars) const {
  return Data.drop_front(Pos).startswith(FollowingChars);
}

bool Tokenizer::canStartIdentifier() const {
  assert(!streamEof());

  const char CurChar = Data[Pos];
  return std::isalpha(CurChar) || CurChar == '_';
}

bool Tokenizer::canContinueIdentifier() const {
  assert(!streamEof());
  const char CurChar = Data[Pos];
  return std::isalnum(CurChar) || CurChar == '_';
}

bool Tokenizer::canStartInt() const {
  assert(!streamEof());
  return std::isdigit(Data[Pos]);
}

bool Tokenizer::canContinueInt() const {
  assert(!streamEof());
  return std::isalnum(Data[Pos]);
}

bool Tokenizer::canStartString() const {
  return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
}

bool Tokenizer::streamEof() const { return Pos == DataLength; }

Kind Tokenizer::classifyCurrentToken() const {
  if (canStartInt())
    return Kind::Int;
  if (canStartString())
    return Kind::String;
  // At this point of lexing, BEGIN and END are still recognized as
  // identifiers; they are turned into block markers by processIdentifier().
  if (canStartIdentifier())
    return Kind::Identifier;

  const char CurChar = Data[Pos];

  switch (CurChar) {
  // One-character token classification.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch)                                                  \
  case Ch:                                                                     \
    return Kind::Name;
#include "ResourceScriptTokenList.h"
#undef TOKEN
#undef SHORT_TOKEN

  default:
    return Kind::Invalid;
  }
}

void Tokenizer::processIdentifier(RCToken &Token) const {
  assert(Token.kind() == Kind::Identifier);
  StringRef Name = Token.value();

  if (Name.equals_lower("begin"))
    Token = RCToken(Kind::BlockBegin, Name);
  else if (Name.equals_lower("end"))
    Token = RCToken(Kind::BlockEnd, Name);
}

} // anonymous namespace

namespace llvm {

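// Example use of the tokenizer interface (a minimal sketch; 'Contents' here
// stands for whatever buffer the caller has loaded the .rc file into):
//
//   Expected<std::vector<RCToken>> Tokens = tokenizeRC(Contents);
//   if (!Tokens)
//     return Tokens.takeError();
//   for (const RCToken &Token : *Tokens)
//     ...  // inspect Token.kind() and Token.value()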
Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
  return Tokenizer(Input).run();
}

} // namespace llvm