//===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===---------------------------------------------------------------------===//
//
// This file implements an interface defined in ResourceScriptToken.h.
// In particular, it defines an .rc script tokenizer.
//
//===---------------------------------------------------------------------===//

#include "ResourceScriptToken.h"
#include "llvm/Support/raw_ostream.h"

#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstdlib>
#include <utility>

using namespace llvm;

using Kind = RCToken::Kind;

// Checks if Representation is a correct description of an RC integer.
// It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
// or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
// character (that is the difference between our representation and
// StringRef's one). If Representation is correct, 'true' is returned and
// the parsed value is stored in Num.
static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
  size_t Length = Representation.size();
  if (Length == 0)
    return false;
  // Strip the last 'L' if present.
  if (std::toupper(Representation.back()) == 'L')
    Representation = Representation.drop_back(1);

  return !Representation.getAsInteger<uint32_t>(0, Num);
}

RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
    : TokenKind(RCTokenKind), TokenValue(Value) {}

uint32_t RCToken::intValue() const {
  assert(TokenKind == Kind::Int);
  // We assume that the token already is a correct integer (checked by
  // rcGetAsInteger).
  uint32_t Result;
  bool IsSuccess = rcGetAsInteger(TokenValue, Result);
  assert(IsSuccess);
  (void)IsSuccess; // Silence the compiler warning when the -DNDEBUG flag is on.
  return Result;
}

StringRef RCToken::value() const { return TokenValue; }

Kind RCToken::kind() const { return TokenKind; }

bool RCToken::isBinaryOp() const {
  switch (TokenKind) {
  case Kind::Plus:
  case Kind::Minus:
  case Kind::Pipe:
  case Kind::Amp:
    return true;
  default:
    return false;
  }
}

static Error getStringError(const Twine &message) {
  return make_error<StringError>("Error parsing file: " + message,
                                 inconvertibleErrorCode());
}

namespace {

class Tokenizer {
public:
  Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()) {}

  Expected<std::vector<RCToken>> run();

private:
  // All 'advancing' methods return boolean values; if they return false,
  // the stream has ended or failed.
  bool advance(size_t Amount = 1);
  bool skipWhitespaces();

  // Consumes a token. If any problem occurred, a non-empty Error is returned.
  Error consumeToken(const Kind TokenKind);

  // Check if tokenizer is about to read FollowingChars.
  bool willNowRead(StringRef FollowingChars) const;

  // Check if tokenizer can start reading an identifier at current position.
  // The original tool did not specify the rules determining what a correct
  // identifier is. We assume identifiers follow the C convention:
  // [a-zA-Z_][a-zA-Z0-9_]*.
  bool canStartIdentifier() const;
  // Check if tokenizer can continue reading an identifier.
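  // For example, given the (purely illustrative) input "IDC_BUTTON1 BEGIN",
  // canStartIdentifier() accepts the leading 'I' and canContinueIdentifier()
  // accepts the remaining alphanumeric characters and underscores, so
  // IDC_BUTTON1 is consumed as a single Identifier token (BEGIN is later
  // turned into a BlockBegin token by processIdentifier()).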
  bool canContinueIdentifier() const;

  // Check if tokenizer can start reading an integer.
  // A correct integer always starts with a 0-9 digit and can contain
  // the characters 0-9A-Fa-f (digits), Ll (marking that the integer is
  // 32-bit) and Xx (marking that the representation is hexadecimal).
  // As some kind of separator should come after the integer, we can
  // consume the integer until the first non-alphanumeric character.
  bool canStartInt() const;
  bool canContinueInt() const;

  bool canStartString() const;

  bool streamEof() const;

  // Classify the token that is about to be read from the current position.
  Kind classifyCurrentToken() const;

  // Process the Kind::Identifier token - check if it is
  // an identifier describing a block start or end.
  void processIdentifier(RCToken &token) const;

  StringRef Data;
  size_t DataLength, Pos;
};

Expected<std::vector<RCToken>> Tokenizer::run() {
  Pos = 0;
  std::vector<RCToken> Result;

  // Consume an optional UTF-8 Byte Order Mark.
  if (willNowRead("\xef\xbb\xbf"))
    advance(3);

  while (!streamEof()) {
    if (!skipWhitespaces())
      break;

    Kind TokenKind = classifyCurrentToken();
    if (TokenKind == Kind::Invalid)
      return getStringError("Invalid token found at position " + Twine(Pos));

    const size_t TokenStart = Pos;
    if (Error TokenError = consumeToken(TokenKind))
      return std::move(TokenError);

    RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
    if (TokenKind == Kind::Identifier) {
      processIdentifier(Token);
    } else if (TokenKind == Kind::Int) {
      uint32_t TokenInt;
      if (!rcGetAsInteger(Token.value(), TokenInt)) {
        // The integer has an incorrect format or cannot be represented in
        // a 32-bit unsigned integer.
        return getStringError("Integer invalid or too large: " +
                              Token.value().str());
      }
    }

    Result.push_back(Token);
  }

  return Result;
}

bool Tokenizer::advance(size_t Amount) {
  Pos += Amount;
  return !streamEof();
}

bool Tokenizer::skipWhitespaces() {
  while (!streamEof() && std::isspace(Data[Pos]))
    advance();
  return !streamEof();
}

Error Tokenizer::consumeToken(const Kind TokenKind) {
  switch (TokenKind) {
  // One-character token consumption.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch) case Kind::Name:
#include "ResourceScriptTokenList.h"
#undef TOKEN
#undef SHORT_TOKEN
    advance();
    return Error::success();

  case Kind::Identifier:
    while (!streamEof() && canContinueIdentifier())
      advance();
    return Error::success();

  case Kind::Int:
    while (!streamEof() && canContinueInt())
      advance();
    return Error::success();

  case Kind::String:
    // Consume the preceding 'L', if there is any.
    if (std::toupper(Data[Pos]) == 'L')
      advance();
    // Consume the double-quote.
    advance();

    // Consume the characters until the end of the file, line or string.
    while (true) {
      if (streamEof()) {
        return getStringError("Unterminated string literal.");
      } else if (Data[Pos] == '"') {
        // Consume the ending double-quote.
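        // (The resulting token value keeps the whole literal, including the
        // optional 'L' prefix and both double-quotes; run() later slices it
        // out of Data starting at TokenStart.)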
        advance();
        return Error::success();
      } else if (Data[Pos] == '\n') {
        return getStringError("String literal not terminated in the line.");
      }

      advance();
    }

  case Kind::Invalid:
    assert(false && "Cannot consume an invalid token.");
  }

  llvm_unreachable("Unknown RCToken::Kind");
}

bool Tokenizer::willNowRead(StringRef FollowingChars) const {
  return Data.drop_front(Pos).startswith(FollowingChars);
}

bool Tokenizer::canStartIdentifier() const {
  assert(!streamEof());

  const char CurChar = Data[Pos];
  return std::isalpha(CurChar) || CurChar == '_';
}

bool Tokenizer::canContinueIdentifier() const {
  assert(!streamEof());
  const char CurChar = Data[Pos];
  return std::isalnum(CurChar) || CurChar == '_';
}

bool Tokenizer::canStartInt() const {
  assert(!streamEof());
  return std::isdigit(Data[Pos]);
}

bool Tokenizer::canContinueInt() const {
  assert(!streamEof());
  return std::isalnum(Data[Pos]);
}

bool Tokenizer::canStartString() const {
  return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
}

bool Tokenizer::streamEof() const { return Pos == DataLength; }

Kind Tokenizer::classifyCurrentToken() const {
  if (canStartInt())
    return Kind::Int;
  if (canStartString())
    return Kind::String;
  // BEGIN and END are at this point of lexing recognized as identifiers.
  if (canStartIdentifier())
    return Kind::Identifier;

  const char CurChar = Data[Pos];

  switch (CurChar) {
  // One-character token classification.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch)                                                  \
  case Ch:                                                                     \
    return Kind::Name;
#include "ResourceScriptTokenList.h"
#undef TOKEN
#undef SHORT_TOKEN

  default:
    return Kind::Invalid;
  }
}

void Tokenizer::processIdentifier(RCToken &Token) const {
  assert(Token.kind() == Kind::Identifier);
  StringRef Name = Token.value();

  if (Name.equals_lower("begin"))
    Token = RCToken(Kind::BlockBegin, Name);
  else if (Name.equals_lower("end"))
    Token = RCToken(Kind::BlockEnd, Name);
}

} // anonymous namespace

namespace llvm {

Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
  return Tokenizer(Input).run();
}

} // namespace llvm
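
// Example usage (a minimal sketch, not part of the tokenizer itself): a caller
// can hand an .rc snippet to tokenizeRC() and walk the returned tokens,
// propagating any failure through the usual llvm::Expected machinery. The
// input string below is purely illustrative.
//
//   Expected<std::vector<RCToken>> Tokens =
//       tokenizeRC("BEGIN 42 \"Hello\" END");
//   if (!Tokens)
//     errs() << toString(Tokens.takeError()) << "\n";
//   else
//     for (const RCToken &Token : *Tokens)
//       errs() << Token.value() << "\n";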