//===-- ResourceScriptToken.cpp --------------------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements an interface defined in ResourceScriptToken.h.
// In particular, it defines an .rc script tokenizer.
//
//===----------------------------------------------------------------------===//

#include "ResourceScriptToken.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"

#include <algorithm>
#include <cassert>
#include <cctype>
#include <cstdlib>
#include <utility>

using namespace llvm;

using Kind = RCToken::Kind;

// Checks whether Representation is a correct description of an RC integer.
// It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+)
// or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
// character (that is the difference between our representation and
// StringRef's one). If Representation is correct, 'true' is returned and
// the parsed value is stored in Num.
static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
  if (Representation.empty())
    return false;
  // Strip the trailing 'L', if present.
  if (std::toupper(Representation.back()) == 'L')
    Representation = Representation.drop_back(1);

  return !Representation.getAsInteger<uint32_t>(0, Num);
}
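
// For illustration, rcGetAsInteger accepts spellings such as "256", "0x100",
// "0400" and "256L" (all of which denote 256), and rejects malformed or
// out-of-range inputs such as "0x", "12Q" or "4294967296L".
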
RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
    : TokenKind(RCTokenKind), TokenValue(Value) {}

uint32_t RCToken::intValue() const {
  assert(TokenKind == Kind::Int);
  // We assume that the token already is a correct integer (checked by
  // rcGetAsInteger).
  uint32_t Result;
  bool IsSuccess = rcGetAsInteger(TokenValue, Result);
  assert(IsSuccess);
  (void)IsSuccess; // Silence the compiler warning when the -DNDEBUG flag is on.
  return Result;
}

StringRef RCToken::value() const { return TokenValue; }

Kind RCToken::kind() const { return TokenKind; }

static Error getStringError(const Twine &Message) {
  return make_error<StringError>("Error parsing file: " + Message,
                                 inconvertibleErrorCode());
}

namespace {

class Tokenizer {
public:
  Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()) {}

  Expected<std::vector<RCToken>> run();

private:
  // All 'advancing' methods return a boolean value; if they return false,
  // the stream has ended or the operation failed.
  bool advance(size_t Amount = 1);
  bool skipWhitespaces();

  // Consumes a token. If any problem occurs, a non-empty Error is returned.
  Error consumeToken(const Kind TokenKind);

  // Checks if the tokenizer is about to read FollowingChars.
  bool willNowRead(StringRef FollowingChars) const;

  // Checks if the tokenizer can start reading an identifier at the current
  // position. The original tool did not specify the rules used to determine
  // what a correct identifier is. We assume it follows the C convention:
  // [a-zA-Z_][a-zA-Z0-9_]*.
  bool canStartIdentifier() const;
  // Checks if the tokenizer can continue reading an identifier.
  bool canContinueIdentifier() const;

  // Checks if the tokenizer can start reading an integer.
  // A correct integer always starts with a 0-9 digit and can contain the
  // characters 0-9A-Fa-f (digits), Ll (marking that the integer is 32-bit)
  // and Xx (marking that the representation is hexadecimal). As some kind
  // of separator should come after the integer, we can consume it until
  // the first non-alphanumeric character.
  bool canStartInt() const;
  bool canContinueInt() const;

  bool canStartString() const;

  bool streamEof() const;

  // Classifies the token that is about to be read from the current position.
  Kind classifyCurrentToken() const;

  // Processes the Kind::Identifier token - checks if it is an identifier
  // describing a block start or end.
  void processIdentifier(RCToken &Token) const;

  StringRef Data;
  size_t DataLength, Pos;
};

Expected<std::vector<RCToken>> Tokenizer::run() {
  Pos = 0;
  std::vector<RCToken> Result;

  // Consume an optional UTF-8 Byte Order Mark.
  if (willNowRead("\xef\xbb\xbf"))
    advance(3);

  while (!streamEof()) {
    if (!skipWhitespaces())
      break;

    Kind TokenKind = classifyCurrentToken();
    if (TokenKind == Kind::Invalid)
      return getStringError("Invalid token found at position " + Twine(Pos));

    const size_t TokenStart = Pos;
    if (Error TokenError = consumeToken(TokenKind))
      return std::move(TokenError);

    RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
    if (TokenKind == Kind::Identifier) {
      processIdentifier(Token);
    } else if (TokenKind == Kind::Int) {
      uint32_t TokenInt;
      if (!rcGetAsInteger(Token.value(), TokenInt)) {
        // The integer has an incorrect format or cannot be represented in
        // a 32-bit unsigned integer.
        return getStringError("Integer invalid or too large: " +
                              Token.value().str());
      }
    }

    Result.push_back(Token);
  }

  return Result;
}
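
// To illustrate the loop above: tokenizing the line
//   MENUITEM "Exit", 100
// produces Identifier("MENUITEM"), String("\"Exit\""), the one-character
// token for ',' and Int("100"). This listing is only a sketch; the exact
// names of the one-character Kinds are defined in ResourceScriptTokenList.h.
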
bool Tokenizer::advance(size_t Amount) {
  Pos += Amount;
  return !streamEof();
}

bool Tokenizer::skipWhitespaces() {
  while (!streamEof() && std::isspace(Data[Pos]))
    advance();
  return !streamEof();
}

Error Tokenizer::consumeToken(const Kind TokenKind) {
  switch (TokenKind) {
  // One-character token consumption.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch) case Kind::Name:
#include "ResourceScriptTokenList.h"
#undef TOKEN
#undef SHORT_TOKEN
    advance();
    return Error::success();

  case Kind::Identifier:
    while (!streamEof() && canContinueIdentifier())
      advance();
    return Error::success();

  case Kind::Int:
    while (!streamEof() && canContinueInt())
      advance();
    return Error::success();

  case Kind::String:
    // Consume the preceding 'L', if there is any.
    if (std::toupper(Data[Pos]) == 'L')
      advance();
    // Consume the double-quote.
    advance();

    // Consume the characters until the end of the file, line or string.
    while (true) {
      if (streamEof()) {
        return getStringError("Unterminated string literal.");
      } else if (Data[Pos] == '"') {
        // Consume the ending double-quote.
        advance();
        return Error::success();
      } else if (Data[Pos] == '\n') {
        return getStringError("String literal not terminated in the line.");
      }

      advance();
    }

  case Kind::Invalid:
    assert(false && "Cannot consume an invalid token.");
  }

  llvm_unreachable("Unknown RCToken::Kind");
}

bool Tokenizer::willNowRead(StringRef FollowingChars) const {
  return Data.drop_front(Pos).startswith(FollowingChars);
}

bool Tokenizer::canStartIdentifier() const {
  assert(!streamEof());

  const char CurChar = Data[Pos];
  return std::isalpha(CurChar) || CurChar == '_';
}

bool Tokenizer::canContinueIdentifier() const {
  assert(!streamEof());
  const char CurChar = Data[Pos];
  return std::isalnum(CurChar) || CurChar == '_';
}

bool Tokenizer::canStartInt() const {
  assert(!streamEof());
  return std::isdigit(Data[Pos]);
}

bool Tokenizer::canContinueInt() const {
  assert(!streamEof());
  return std::isalnum(Data[Pos]);
}

bool Tokenizer::canStartString() const {
  return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
}

bool Tokenizer::streamEof() const { return Pos == DataLength; }

Kind Tokenizer::classifyCurrentToken() const {
  if (canStartInt())
    return Kind::Int;
  if (canStartString())
    return Kind::String;
  // BEGIN and END are at this point of lexing recognized as identifiers.
  if (canStartIdentifier())
    return Kind::Identifier;

  const char CurChar = Data[Pos];

  switch (CurChar) {
  // One-character token classification.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch)                                                  \
  case Ch:                                                                     \
    return Kind::Name;
#include "ResourceScriptTokenList.h"
#undef TOKEN
#undef SHORT_TOKEN

  default:
    return Kind::Invalid;
  }
}

void Tokenizer::processIdentifier(RCToken &Token) const {
  assert(Token.kind() == Kind::Identifier);
  StringRef Name = Token.value();

  if (Name.equals_lower("begin"))
    Token = RCToken(Kind::BlockBegin, Name);
  else if (Name.equals_lower("end"))
    Token = RCToken(Kind::BlockEnd, Name);
}

} // anonymous namespace

namespace llvm {

Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
  return Tokenizer(Input).run();
}

} // namespace llvm
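
// A minimal usage sketch of the public entry point (illustrative only;
// Contents is a hypothetical StringRef holding the .rc file text):
//
//   Expected<std::vector<RCToken>> Tokens = tokenizeRC(Contents);
//   if (!Tokens)
//     return Tokens.takeError();
//   for (const RCToken &Token : *Tokens)
//     outs() << Token.value() << "\n";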