//===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
//
// This file implements an interface defined in ResourceScriptToken.h.
// In particular, it defines an .rc script tokenizer.
//
//===---------------------------------------------------------------------===//

#include "ResourceScriptToken.h"
#include "llvm/Support/raw_ostream.h"

#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstdlib>
#include <utility>

using namespace llvm;

using Kind = RCToken::Kind;

// Checks if Representation is a correct description of an RC integer.
// It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
// or hexadecimal (0x[0-9a-f]+). It may be followed by a single 'L'
// character (this is what distinguishes our representation from the one
// StringRef accepts). If Representation is correct, 'true' is returned and
// the parsed value is stored in Num.
static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
  size_t Length = Representation.size();
  if (Length == 0)
    return false;
  // Strip the trailing 'L', if present.
  if (std::toupper(Representation.back()) == 'L')
    Representation = Representation.drop_back(1);

  return !Representation.getAsInteger<uint32_t>(0, Num);
}
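
// For illustration (example inputs invented for this comment, not taken from
// the original sources): "100", "0x1F", "037" and "1000L" are all accepted
// here, since getAsInteger with radix 0 auto-detects the 0x/0 prefixes and the
// trailing 'L' has already been stripped above, while "0xFFFFFFFFF" is
// rejected because it does not fit in 32 bits.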

RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
    : TokenKind(RCTokenKind), TokenValue(Value) {}

uint32_t RCToken::intValue() const {
  assert(TokenKind == Kind::Int);
  // We assume that the token is already a correct integer (checked by
  // rcGetAsInteger).
  uint32_t Result;
  bool IsSuccess = rcGetAsInteger(TokenValue, Result);
  assert(IsSuccess);
  (void)IsSuccess;  // Silence the compiler warning when the -DNDEBUG flag is on.
  return Result;
}

bool RCToken::isLongInt() const {
  return TokenKind == Kind::Int && std::toupper(TokenValue.back()) == 'L';
}

StringRef RCToken::value() const { return TokenValue; }

Kind RCToken::kind() const { return TokenKind; }

bool RCToken::isBinaryOp() const {
  switch (TokenKind) {
  case Kind::Plus:
  case Kind::Minus:
  case Kind::Pipe:
  case Kind::Amp:
    return true;
  default:
    return false;
  }
}

static Error getStringError(const Twine &message) {
  return make_error<StringError>("Error parsing file: " + message,
                                 inconvertibleErrorCode());
}

namespace {

class Tokenizer {
public:
  Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()) {}

  Expected<std::vector<RCToken>> run();

private:
  // All 'advancing' methods return boolean values; if a method returns false,
  // the stream has ended or the read failed.
  bool advance(size_t Amount = 1);
  bool skipWhitespaces();

  // Consumes a token. If any problem occurred, a non-empty Error is returned.
  Error consumeToken(const Kind TokenKind);

  // Check if the tokenizer is about to read FollowingChars.
  bool willNowRead(StringRef FollowingChars) const;

  // Check if the tokenizer can start reading an identifier at the current
  // position. The original tool did not specify the rules for what constitutes
  // a correct identifier. We assume identifiers follow the C convention:
  // [a-zA-Z_][a-zA-Z0-9_]*.
  bool canStartIdentifier() const;
  // Check if the tokenizer can continue reading an identifier.
  bool canContinueIdentifier() const;
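  // For illustration (hypothetical names, not from the original sources):
  // IDC_MAIN_DIALOG, MyFont.ttf and res\icon.ico would each be consumed as a
  // single Identifier token, since '.', '/' and '\' may appear after the
  // first character.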

  // Check if the tokenizer can start reading an integer.
  // A correct integer always starts with a 0-9 digit and may further contain
  // the characters 0-9A-Fa-f (digits), Ll (marking that the integer is
  // 32-bit) and Xx (marking that the representation is hexadecimal). As some
  // kind of separator should come after the integer, we consume the integer
  // until a non-alphanumeric character is found.
  bool canStartInt() const;
  bool canContinueInt() const;
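  // Note (illustrative example, not from the original sources): consumption is
  // deliberately greedy, so an ill-formed run such as "123xyz" is still lexed
  // as one Int token here and only rejected later, when run() passes it to
  // rcGetAsInteger.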

  bool canStartString() const;

  // Check if the tokenizer can start reading a single-line comment (e.g. a
  // comment that begins with '//').
  bool canStartLineComment() const;

  // Check if the tokenizer can start or finish reading a block comment (e.g. a
  // comment that begins with '/*' and ends with '*/').
  bool canStartBlockComment() const;

  // Throw away all remaining characters on the current line.
  void skipCurrentLine();

  bool streamEof() const;

  // Classify the token that is about to be read from the current position.
  Kind classifyCurrentToken() const;

  // Process the Kind::Identifier token: check if it is
  // an identifier describing a block start or end.
  void processIdentifier(RCToken &Token) const;

  StringRef Data;
  size_t DataLength, Pos;
};

void Tokenizer::skipCurrentLine() {
  // Jump to the first line-break character, then past the whole run of
  // consecutive '\r'/'\n' characters, i.e. to the start of the next line.
  Pos = Data.find_first_of("\r\n", Pos);
  Pos = Data.find_first_not_of("\r\n", Pos);

  // If there is no next line, stop at the end of the data.
  if (Pos == StringRef::npos)
    Pos = DataLength;
}

Expected<std::vector<RCToken>> Tokenizer::run() {
  Pos = 0;
  std::vector<RCToken> Result;

  // Consume an optional UTF-8 Byte Order Mark.
  if (willNowRead("\xef\xbb\xbf"))
    advance(3);

  while (!streamEof()) {
    if (!skipWhitespaces())
      break;

    Kind TokenKind = classifyCurrentToken();
    if (TokenKind == Kind::Invalid)
      return getStringError("Invalid token found at position " + Twine(Pos));

    const size_t TokenStart = Pos;
    if (Error TokenError = consumeToken(TokenKind))
      return std::move(TokenError);

    // Comments are just discarded; don't bother saving them.
    if (TokenKind == Kind::LineComment || TokenKind == Kind::StartComment)
      continue;

    RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
    if (TokenKind == Kind::Identifier) {
      processIdentifier(Token);
    } else if (TokenKind == Kind::Int) {
      uint32_t TokenInt;
      if (!rcGetAsInteger(Token.value(), TokenInt)) {
        // The integer has an incorrect format or cannot be represented as
        // a 32-bit unsigned integer.
        return getStringError("Integer invalid or too large: " +
                              Token.value().str());
      }
    }

    Result.push_back(Token);
  }

  return Result;
}
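
// As an illustration (input invented for this comment, not from the original
// sources), running the tokenizer on the fragment
//   1 BEGIN "hi" END
// yields four tokens: Int ("1"), BlockBegin ("BEGIN"), String (whose value is
// the raw text "hi" with the surrounding quotes kept) and BlockEnd ("END");
// the whitespace between them is skipped and never stored.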

bool Tokenizer::advance(size_t Amount) {
  Pos += Amount;
  return !streamEof();
}

bool Tokenizer::skipWhitespaces() {
  while (!streamEof() && std::isspace(Data[Pos]))
    advance();
  return !streamEof();
}

Error Tokenizer::consumeToken(const Kind TokenKind) {
  switch (TokenKind) {
  // One-character token consumption.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch) case Kind::Name:
#include "ResourceScriptTokenList.def"
    advance();
    return Error::success();

  case Kind::LineComment:
    advance(2);
    skipCurrentLine();
    return Error::success();

  case Kind::StartComment: {
    advance(2);
    auto EndPos = Data.find("*/", Pos);
    if (EndPos == StringRef::npos)
      return getStringError(
          "Unclosed multi-line comment beginning at position " + Twine(Pos));
    advance(EndPos - Pos);
    advance(2);
    return Error::success();
  }
  case Kind::Identifier:
    while (!streamEof() && canContinueIdentifier())
      advance();
    return Error::success();

  case Kind::Int:
    while (!streamEof() && canContinueInt())
      advance();
    return Error::success();

  case Kind::String:
    // Consume the leading 'L', if there is any.
    if (std::toupper(Data[Pos]) == 'L')
      advance();
    // Consume the opening double-quote.
    advance();

    // Consume the characters until the end of the file, line or string.
    while (true) {
      if (streamEof()) {
        return getStringError("Unterminated string literal.");
      } else if (Data[Pos] == '"') {
        // Consume the ending double-quote.
        advance();
        // However, if another '"' follows this one, the string has not ended;
        // the pair stands for a literal '"' inside the string.
        if (!willNowRead("\""))
          return Error::success();
      } else if (Data[Pos] == '\n') {
        return getStringError("String literal not terminated in the line.");
      }

      advance();
    }

  case Kind::Invalid:
    assert(false && "Cannot consume an invalid token.");
  }

  llvm_unreachable("Unknown RCToken::Kind");
}
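
// To illustrate the doubled-quote rule above (example input invented for this
// comment): the literal "a""b" is consumed as a single String token whose raw
// value keeps the doubled quotes; it is not split into two adjacent strings.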

bool Tokenizer::willNowRead(StringRef FollowingChars) const {
  return Data.drop_front(Pos).startswith(FollowingChars);
}

bool Tokenizer::canStartIdentifier() const {
  assert(!streamEof());

  const char CurChar = Data[Pos];
  return std::isalpha(CurChar) || CurChar == '_' || CurChar == '.';
}

bool Tokenizer::canContinueIdentifier() const {
  assert(!streamEof());
  const char CurChar = Data[Pos];
  return std::isalnum(CurChar) || CurChar == '_' || CurChar == '.' ||
         CurChar == '/' || CurChar == '\\';
}

bool Tokenizer::canStartInt() const {
  assert(!streamEof());
  return std::isdigit(Data[Pos]);
}

bool Tokenizer::canStartBlockComment() const {
  assert(!streamEof());
  return Data.drop_front(Pos).startswith("/*");
}

bool Tokenizer::canStartLineComment() const {
  assert(!streamEof());
  return Data.drop_front(Pos).startswith("//");
}

bool Tokenizer::canContinueInt() const {
  assert(!streamEof());
  return std::isalnum(Data[Pos]);
}

bool Tokenizer::canStartString() const {
  return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
}

bool Tokenizer::streamEof() const { return Pos == DataLength; }

Kind Tokenizer::classifyCurrentToken() const {
  if (canStartBlockComment())
    return Kind::StartComment;
  if (canStartLineComment())
    return Kind::LineComment;

  if (canStartInt())
    return Kind::Int;
  if (canStartString())
    return Kind::String;
  // At this point of lexing, BEGIN and END are still recognized as
  // identifiers; they are turned into block markers later, in
  // processIdentifier().
  if (canStartIdentifier())
    return Kind::Identifier;

  const char CurChar = Data[Pos];

  switch (CurChar) {
  // One-character token classification.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch)                                                  \
  case Ch:                                                                     \
    return Kind::Name;
#include "ResourceScriptTokenList.def"

  default:
    return Kind::Invalid;
  }
}

void Tokenizer::processIdentifier(RCToken &Token) const {
  assert(Token.kind() == Kind::Identifier);
  StringRef Name = Token.value();

  if (Name.equals_lower("begin"))
    Token = RCToken(Kind::BlockBegin, Name);
  else if (Name.equals_lower("end"))
    Token = RCToken(Kind::BlockEnd, Name);
}

} // anonymous namespace

namespace llvm {

Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
  return Tokenizer(Input).run();
}

} // namespace llvm
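
// A minimal usage sketch (illustrative only, not part of the original file;
// variable names are invented): a caller tokenizes a script's contents and
// unwraps the Expected<> result roughly like this:
//
//   llvm::Expected<std::vector<RCToken>> Tokens = llvm::tokenizeRC(Contents);
//   if (!Tokens) {
//     llvm::errs() << llvm::toString(Tokens.takeError()) << "\n";
//     return;
//   }
//   for (const RCToken &Tok : *Tokens)
//     llvm::errs() << Tok.value() << " ";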