//===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===---------------------------------------------------------------------===//
//
// This file implements an interface defined in ResourceScriptToken.h.
// In particular, it defines an .rc script tokenizer.
//
//===---------------------------------------------------------------------===//

#include "ResourceScriptToken.h"
#include "llvm/Support/raw_ostream.h"

#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstdlib>
#include <utility>

using namespace llvm;

using Kind = RCToken::Kind;

// Checks if Representation is a correct description of an RC integer.
// It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
// or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
// character (this is what makes our representation different from the one
// accepted by StringRef). If Representation is correct, 'true' is returned
// and the parsed value is stored in Num.
static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
  size_t Length = Representation.size();
  if (Length == 0)
    return false;
  // Strip the trailing 'L', if present, before parsing the value.
  if (std::toupper(Representation.back()) == 'L')
    Representation = Representation.drop_back(1);

  return !Representation.getAsInteger<uint32_t>(0, Num);
}
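
// Illustrative examples of the accepted forms (hypothetical inputs, not taken
// from any script in the tree):
//   uint32_t Val;
//   rcGetAsInteger("100", Val);          // true, Val == 100 (decimal)
//   rcGetAsInteger("042", Val);          // true, Val == 34  (octal)
//   rcGetAsInteger("0x2AL", Val);        // true, Val == 42  (hex, 'L' dropped)
//   rcGetAsInteger("0x1FFFFFFFF", Val);  // false: does not fit in 32 bits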

RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
    : TokenKind(RCTokenKind), TokenValue(Value) {}

uint32_t RCToken::intValue() const {
  assert(TokenKind == Kind::Int);
  // We assume that the token already is a correct integer (checked by
  // rcGetAsInteger).
  uint32_t Result;
  bool IsSuccess = rcGetAsInteger(TokenValue, Result);
  assert(IsSuccess);
  (void)IsSuccess; // Silence the compiler warning when the -DNDEBUG flag is on.
  return Result;
}

bool RCToken::isLongInt() const {
  return TokenKind == Kind::Int && std::toupper(TokenValue.back()) == 'L';
}

StringRef RCToken::value() const { return TokenValue; }

Kind RCToken::kind() const { return TokenKind; }

bool RCToken::isBinaryOp() const {
  switch (TokenKind) {
  case Kind::Plus:
  case Kind::Minus:
  case Kind::Pipe:
  case Kind::Amp:
    return true;
  default:
    return false;
  }
}

static Error getStringError(const Twine &Message) {
  return make_error<StringError>("Error parsing file: " + Message,
                                 inconvertibleErrorCode());
}

namespace {

class Tokenizer {
public:
  Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()) {}

  Expected<std::vector<RCToken>> run();

private:
  // All 'advancing' methods return a boolean value; if it is false, the
  // stream has reached its end.
  bool advance(size_t Amount = 1);
  bool skipWhitespaces();

  // Consumes a token. If any problem occurs, a non-empty Error is returned.
  Error consumeToken(const Kind TokenKind);

  // Check if the tokenizer is about to read FollowingChars.
  bool willNowRead(StringRef FollowingChars) const;

  // Check if the tokenizer can start reading an identifier at the current
  // position. The original tool did not specify the rules determining what
  // a correct identifier is. We assume it follows the C convention:
  // [a-zA-Z_][a-zA-Z0-9_]*.
  bool canStartIdentifier() const;
  // Check if the tokenizer can continue reading an identifier.
  bool canContinueIdentifier() const;

  // Check if the tokenizer can start reading an integer.
  // A correct integer always starts with a digit (0-9) and can contain the
  // characters 0-9A-Fa-f (digits), Ll (marking a 32-bit integer) and Xx
  // (marking a hexadecimal representation). As some kind of separator should
  // come after the integer, we can consume characters until we reach a
  // non-alphanumeric one.
  bool canStartInt() const;
  bool canContinueInt() const;

  bool canStartString() const;

  // Check if the tokenizer can start reading a single-line comment (i.e. a
  // comment that begins with '//').
  bool canStartLineComment() const;

  // Check if the tokenizer can start reading a block comment (i.e. a comment
  // that begins with '/*' and ends with '*/').
  bool canStartBlockComment() const;

  // Throw away all remaining characters on the current line.
  void skipCurrentLine();

  bool streamEof() const;

  // Classify the token that is about to be read from the current position.
  Kind classifyCurrentToken() const;

  // Process a Kind::Identifier token: check whether it is an identifier
  // describing a block start or end.
  void processIdentifier(RCToken &Token) const;

  StringRef Data;
  size_t DataLength, Pos;
};

void Tokenizer::skipCurrentLine() {
  Pos = Data.find_first_of("\r\n", Pos);
  Pos = Data.find_first_not_of("\r\n", Pos);

  if (Pos == StringRef::npos)
    Pos = DataLength;
}
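
// To illustrate the tokens produced by run(), consider a hypothetical script
// fragment (not taken from a real test):
//   BEGIN 12 "foo" END
// It is tokenized into BlockBegin("BEGIN"), Int("12"), String("\"foo\"") and
// BlockEnd("END"); note that a String token's value keeps its double-quotes.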

Expected<std::vector<RCToken>> Tokenizer::run() {
  Pos = 0;
  std::vector<RCToken> Result;

  // Consume an optional UTF-8 Byte Order Mark.
  if (willNowRead("\xef\xbb\xbf"))
    advance(3);

  while (!streamEof()) {
    if (!skipWhitespaces())
      break;

    Kind TokenKind = classifyCurrentToken();
    if (TokenKind == Kind::Invalid)
      return getStringError("Invalid token found at position " + Twine(Pos));

    const size_t TokenStart = Pos;
    if (Error TokenError = consumeToken(TokenKind))
      return std::move(TokenError);

    // Comments are just deleted, don't bother saving them.
    if (TokenKind == Kind::LineComment || TokenKind == Kind::StartComment)
      continue;

    RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
    if (TokenKind == Kind::Identifier) {
      processIdentifier(Token);
    } else if (TokenKind == Kind::Int) {
      uint32_t TokenInt;
      if (!rcGetAsInteger(Token.value(), TokenInt)) {
        // The integer has incorrect format or cannot be represented in
        // a 32-bit integer.
        return getStringError("Integer invalid or too large: " +
                              Token.value().str());
      }
    }

    Result.push_back(Token);
  }

  return Result;
}

bool Tokenizer::advance(size_t Amount) {
  Pos += Amount;
  return !streamEof();
}

bool Tokenizer::skipWhitespaces() {
  while (!streamEof() && std::isspace(Data[Pos]))
    advance();
  return !streamEof();
}

Error Tokenizer::consumeToken(const Kind TokenKind) {
  switch (TokenKind) {
  // One-character token consumption.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch) case Kind::Name:
#include "ResourceScriptTokenList.h"
#undef TOKEN
#undef SHORT_TOKEN
    advance();
    return Error::success();

  case Kind::LineComment:
    advance(2);
    skipCurrentLine();
    return Error::success();

  case Kind::StartComment: {
    advance(2);
    auto EndPos = Data.find("*/", Pos);
    if (EndPos == StringRef::npos)
      return getStringError(
          "Unclosed multi-line comment beginning at position " + Twine(Pos));
    advance(EndPos - Pos);
    advance(2);
    return Error::success();
  }
  case Kind::Identifier:
    while (!streamEof() && canContinueIdentifier())
      advance();
    return Error::success();

  case Kind::Int:
    while (!streamEof() && canContinueInt())
      advance();
    return Error::success();

  case Kind::String:
    // Consume the preceding 'L', if there is any.
    if (std::toupper(Data[Pos]) == 'L')
      advance();
    // Consume the double-quote.
    advance();

    // Consume the characters until the end of the file, line or string.
    while (true) {
      if (streamEof()) {
        return getStringError("Unterminated string literal.");
      } else if (Data[Pos] == '"') {
        // Consume the ending double-quote.
        advance();
        // However, if another '"' follows this double-quote, the string didn't
        // end and we just included '"' into the string.
        if (!willNowRead("\""))
          return Error::success();
      } else if (Data[Pos] == '\n') {
        return getStringError("String literal not terminated in the line.");
      }

      advance();
    }

  case Kind::Invalid:
    assert(false && "Cannot consume an invalid token.");
  }

  llvm_unreachable("Unknown RCToken::Kind");
}
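
// A note on the Kind::String case in consumeToken() above: a '"' that is
// immediately followed by another '"' does not terminate the literal. For
// example (hypothetical input), the sequence
//   "an ""escaped"" quote"
// is consumed as a single String token.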

bool Tokenizer::willNowRead(StringRef FollowingChars) const {
  return Data.drop_front(Pos).startswith(FollowingChars);
}

bool Tokenizer::canStartIdentifier() const {
  assert(!streamEof());

  const char CurChar = Data[Pos];
  return std::isalpha(CurChar) || CurChar == '_';
}

bool Tokenizer::canContinueIdentifier() const {
  assert(!streamEof());
  const char CurChar = Data[Pos];
  return std::isalnum(CurChar) || CurChar == '_';
}

bool Tokenizer::canStartInt() const {
  assert(!streamEof());
  return std::isdigit(Data[Pos]);
}

bool Tokenizer::canStartBlockComment() const {
  assert(!streamEof());
  return Data.drop_front(Pos).startswith("/*");
}

bool Tokenizer::canStartLineComment() const {
  assert(!streamEof());
  return Data.drop_front(Pos).startswith("//");
}

bool Tokenizer::canContinueInt() const {
  assert(!streamEof());
  return std::isalnum(Data[Pos]);
}

bool Tokenizer::canStartString() const {
  return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
}

bool Tokenizer::streamEof() const { return Pos == DataLength; }

Kind Tokenizer::classifyCurrentToken() const {
  if (canStartBlockComment())
    return Kind::StartComment;
  if (canStartLineComment())
    return Kind::LineComment;

  if (canStartInt())
    return Kind::Int;
  if (canStartString())
    return Kind::String;
  // At this point of lexing, BEGIN and END are still recognized as
  // identifiers; processIdentifier() reclassifies them later.
  if (canStartIdentifier())
    return Kind::Identifier;

  const char CurChar = Data[Pos];

  switch (CurChar) {
  // One-character token classification.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch)                                                  \
  case Ch:                                                                     \
    return Kind::Name;
#include "ResourceScriptTokenList.h"
#undef TOKEN
#undef SHORT_TOKEN

  default:
    return Kind::Invalid;
  }
}

void Tokenizer::processIdentifier(RCToken &Token) const {
  assert(Token.kind() == Kind::Identifier);
  StringRef Name = Token.value();

  if (Name.equals_lower("begin"))
    Token = RCToken(Kind::BlockBegin, Name);
  else if (Name.equals_lower("end"))
    Token = RCToken(Kind::BlockEnd, Name);
}

} // anonymous namespace

namespace llvm {

Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
  return Tokenizer(Input).run();
}

} // namespace llvm
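
// Minimal usage sketch (assumed caller code, not part of this file): the
// contents of an .rc file can be tokenized and dumped as follows.
//
//   llvm::Expected<std::vector<RCToken>> Tokens = llvm::tokenizeRC(Contents);
//   if (!Tokens)
//     return Tokens.takeError(); // propagate the tokenizer error
//   for (const RCToken &Token : *Tokens)
//     llvm::outs() << Token.value() << "\n";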