1 //===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===---------------------------------------------------------------------===// 8 // 9 // This file implements an interface defined in ResourceScriptToken.h. 10 // In particular, it defines an .rc script tokenizer. 11 // 12 //===---------------------------------------------------------------------===// 13 14 #include "ResourceScriptToken.h" 15 #include "llvm/Support/raw_ostream.h" 16 17 #include <algorithm> 18 #include <cassert> 19 #include <cctype> 20 #include <cstdlib> 21 #include <utility> 22 23 using namespace llvm; 24 25 using Kind = RCToken::Kind; 26 27 // Checks if Representation is a correct description of an RC integer. 28 // It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+), 29 // or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L' 30 // character (that is the difference between our representation and 31 // StringRef's one). If Representation is correct, 'true' is returned and 32 // the return value is put back in Num. 33 static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) { 34 size_t Length = Representation.size(); 35 if (Length == 0) 36 return false; 37 // Strip the last 'L' if unnecessary. 38 if (std::toupper(Representation.back()) == 'L') 39 Representation = Representation.drop_back(1); 40 41 return !Representation.getAsInteger<uint32_t>(0, Num); 42 } 43 44 RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value) 45 : TokenKind(RCTokenKind), TokenValue(Value) {} 46 47 uint32_t RCToken::intValue() const { 48 assert(TokenKind == Kind::Int); 49 // We assume that the token already is a correct integer (checked by 50 // rcGetAsInteger). 51 uint32_t Result; 52 bool IsSuccess = rcGetAsInteger(TokenValue, Result); 53 assert(IsSuccess); 54 (void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on. 55 return Result; 56 } 57 58 bool RCToken::isLongInt() const { 59 return TokenKind == Kind::Int && std::toupper(TokenValue.back()) == 'L'; 60 } 61 62 StringRef RCToken::value() const { return TokenValue; } 63 64 Kind RCToken::kind() const { return TokenKind; } 65 66 bool RCToken::isBinaryOp() const { 67 switch (TokenKind) { 68 case Kind::Plus: 69 case Kind::Minus: 70 case Kind::Pipe: 71 case Kind::Amp: 72 return true; 73 default: 74 return false; 75 } 76 } 77 78 static Error getStringError(const Twine &message) { 79 return make_error<StringError>("Error parsing file: " + message, 80 inconvertibleErrorCode()); 81 } 82 83 namespace { 84 85 class Tokenizer { 86 public: 87 Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()) {} 88 89 Expected<std::vector<RCToken>> run(); 90 91 private: 92 // All 'advancing' methods return boolean values; if they're equal to false, 93 // the stream has ended or failed. 94 bool advance(size_t Amount = 1); 95 bool skipWhitespaces(); 96 97 // Consumes a token. If any problem occurred, a non-empty Error is returned. 98 Error consumeToken(const Kind TokenKind); 99 100 // Check if tokenizer is about to read FollowingChars. 101 bool willNowRead(StringRef FollowingChars) const; 102 103 // Check if tokenizer can start reading an identifier at current position. 104 // The original tool did non specify the rules to determine what is a correct 105 // identifier. We assume they should follow the C convention: 106 // [a-zA-Z_][a-zA-Z0-9_]*. 107 bool canStartIdentifier() const; 108 // Check if tokenizer can continue reading an identifier. 109 bool canContinueIdentifier() const; 110 111 // Check if tokenizer can start reading an integer. 112 // A correct integer always starts with a 0-9 digit, 113 // can contain characters 0-9A-Fa-f (digits), 114 // Ll (marking the integer is 32-bit), Xx (marking the representation 115 // is hexadecimal). As some kind of separator should come after the 116 // integer, we can consume the integer until a non-alphanumeric 117 // character. 118 bool canStartInt() const; 119 bool canContinueInt() const; 120 121 bool canStartString() const; 122 123 // Check if tokenizer can start reading a single line comment (e.g. a comment 124 // that begins with '//') 125 bool canStartLineComment() const; 126 127 // Check if tokenizer can start or finish reading a block comment (e.g. a 128 // comment that begins with '/*' and ends with '*/') 129 bool canStartBlockComment() const; 130 131 // Throw away all remaining characters on the current line. 132 void skipCurrentLine(); 133 134 bool streamEof() const; 135 136 // Classify the token that is about to be read from the current position. 137 Kind classifyCurrentToken() const; 138 139 // Process the Kind::Identifier token - check if it is 140 // an identifier describing a block start or end. 141 void processIdentifier(RCToken &token) const; 142 143 StringRef Data; 144 size_t DataLength, Pos; 145 }; 146 147 void Tokenizer::skipCurrentLine() { 148 Pos = Data.find_first_of("\r\n", Pos); 149 Pos = Data.find_first_not_of("\r\n", Pos); 150 151 if (Pos == StringRef::npos) 152 Pos = DataLength; 153 } 154 155 Expected<std::vector<RCToken>> Tokenizer::run() { 156 Pos = 0; 157 std::vector<RCToken> Result; 158 159 // Consume an optional UTF-8 Byte Order Mark. 160 if (willNowRead("\xef\xbb\xbf")) 161 advance(3); 162 163 while (!streamEof()) { 164 if (!skipWhitespaces()) 165 break; 166 167 Kind TokenKind = classifyCurrentToken(); 168 if (TokenKind == Kind::Invalid) 169 return getStringError("Invalid token found at position " + Twine(Pos)); 170 171 const size_t TokenStart = Pos; 172 if (Error TokenError = consumeToken(TokenKind)) 173 return std::move(TokenError); 174 175 // Comments are just deleted, don't bother saving them. 176 if (TokenKind == Kind::LineComment || TokenKind == Kind::StartComment) 177 continue; 178 179 RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart)); 180 if (TokenKind == Kind::Identifier) { 181 processIdentifier(Token); 182 } else if (TokenKind == Kind::Int) { 183 uint32_t TokenInt; 184 if (!rcGetAsInteger(Token.value(), TokenInt)) { 185 // The integer has incorrect format or cannot be represented in 186 // a 32-bit integer. 187 return getStringError("Integer invalid or too large: " + 188 Token.value().str()); 189 } 190 } 191 192 Result.push_back(Token); 193 } 194 195 return Result; 196 } 197 198 bool Tokenizer::advance(size_t Amount) { 199 Pos += Amount; 200 return !streamEof(); 201 } 202 203 bool Tokenizer::skipWhitespaces() { 204 while (!streamEof() && std::isspace(Data[Pos])) 205 advance(); 206 return !streamEof(); 207 } 208 209 Error Tokenizer::consumeToken(const Kind TokenKind) { 210 switch (TokenKind) { 211 // One-character token consumption. 212 #define TOKEN(Name) 213 #define SHORT_TOKEN(Name, Ch) case Kind::Name: 214 #include "ResourceScriptTokenList.def" 215 advance(); 216 return Error::success(); 217 218 case Kind::LineComment: 219 advance(2); 220 skipCurrentLine(); 221 return Error::success(); 222 223 case Kind::StartComment: { 224 advance(2); 225 auto EndPos = Data.find("*/", Pos); 226 if (EndPos == StringRef::npos) 227 return getStringError( 228 "Unclosed multi-line comment beginning at position " + Twine(Pos)); 229 advance(EndPos - Pos); 230 advance(2); 231 return Error::success(); 232 } 233 case Kind::Identifier: 234 while (!streamEof() && canContinueIdentifier()) 235 advance(); 236 return Error::success(); 237 238 case Kind::Int: 239 while (!streamEof() && canContinueInt()) 240 advance(); 241 return Error::success(); 242 243 case Kind::String: 244 // Consume the preceding 'L', if there is any. 245 if (std::toupper(Data[Pos]) == 'L') 246 advance(); 247 // Consume the double-quote. 248 advance(); 249 250 // Consume the characters until the end of the file, line or string. 251 while (true) { 252 if (streamEof()) { 253 return getStringError("Unterminated string literal."); 254 } else if (Data[Pos] == '"') { 255 // Consume the ending double-quote. 256 advance(); 257 // However, if another '"' follows this double-quote, the string didn't 258 // end and we just included '"' into the string. 259 if (!willNowRead("\"")) 260 return Error::success(); 261 } else if (Data[Pos] == '\n') { 262 return getStringError("String literal not terminated in the line."); 263 } 264 265 advance(); 266 } 267 268 case Kind::Invalid: 269 assert(false && "Cannot consume an invalid token."); 270 } 271 272 llvm_unreachable("Unknown RCToken::Kind"); 273 } 274 275 bool Tokenizer::willNowRead(StringRef FollowingChars) const { 276 return Data.drop_front(Pos).startswith(FollowingChars); 277 } 278 279 bool Tokenizer::canStartIdentifier() const { 280 assert(!streamEof()); 281 282 const char CurChar = Data[Pos]; 283 return std::isalpha(CurChar) || CurChar == '_' || CurChar == '.'; 284 } 285 286 bool Tokenizer::canContinueIdentifier() const { 287 assert(!streamEof()); 288 const char CurChar = Data[Pos]; 289 return std::isalnum(CurChar) || CurChar == '_' || CurChar == '.' || 290 CurChar == '/' || CurChar == '\\'; 291 } 292 293 bool Tokenizer::canStartInt() const { 294 assert(!streamEof()); 295 return std::isdigit(Data[Pos]); 296 } 297 298 bool Tokenizer::canStartBlockComment() const { 299 assert(!streamEof()); 300 return Data.drop_front(Pos).startswith("/*"); 301 } 302 303 bool Tokenizer::canStartLineComment() const { 304 assert(!streamEof()); 305 return Data.drop_front(Pos).startswith("//"); 306 } 307 308 bool Tokenizer::canContinueInt() const { 309 assert(!streamEof()); 310 return std::isalnum(Data[Pos]); 311 } 312 313 bool Tokenizer::canStartString() const { 314 return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\""); 315 } 316 317 bool Tokenizer::streamEof() const { return Pos == DataLength; } 318 319 Kind Tokenizer::classifyCurrentToken() const { 320 if (canStartBlockComment()) 321 return Kind::StartComment; 322 if (canStartLineComment()) 323 return Kind::LineComment; 324 325 if (canStartInt()) 326 return Kind::Int; 327 if (canStartString()) 328 return Kind::String; 329 // BEGIN and END are at this point of lexing recognized as identifiers. 330 if (canStartIdentifier()) 331 return Kind::Identifier; 332 333 const char CurChar = Data[Pos]; 334 335 switch (CurChar) { 336 // One-character token classification. 337 #define TOKEN(Name) 338 #define SHORT_TOKEN(Name, Ch) \ 339 case Ch: \ 340 return Kind::Name; 341 #include "ResourceScriptTokenList.def" 342 343 default: 344 return Kind::Invalid; 345 } 346 } 347 348 void Tokenizer::processIdentifier(RCToken &Token) const { 349 assert(Token.kind() == Kind::Identifier); 350 StringRef Name = Token.value(); 351 352 if (Name.equals_lower("begin")) 353 Token = RCToken(Kind::BlockBegin, Name); 354 else if (Name.equals_lower("end")) 355 Token = RCToken(Kind::BlockEnd, Name); 356 } 357 358 } // anonymous namespace 359 360 namespace llvm { 361 362 Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) { 363 return Tokenizer(Input).run(); 364 } 365 366 } // namespace llvm 367