17330f729Sjoerg //===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
27330f729Sjoerg //
37330f729Sjoerg // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
47330f729Sjoerg // See https://llvm.org/LICENSE.txt for license information.
57330f729Sjoerg // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
67330f729Sjoerg //
77330f729Sjoerg //===---------------------------------------------------------------------===//
87330f729Sjoerg //
97330f729Sjoerg // This file implements an interface defined in ResourceScriptToken.h.
107330f729Sjoerg // In particular, it defines an .rc script tokenizer.
117330f729Sjoerg //
127330f729Sjoerg //===---------------------------------------------------------------------===//
137330f729Sjoerg
147330f729Sjoerg #include "ResourceScriptToken.h"
15*82d56013Sjoerg #include "llvm/ADT/StringExtras.h"
167330f729Sjoerg #include "llvm/Support/raw_ostream.h"
177330f729Sjoerg
187330f729Sjoerg #include <algorithm>
197330f729Sjoerg #include <cassert>
207330f729Sjoerg #include <cctype>
217330f729Sjoerg #include <cstdlib>
227330f729Sjoerg #include <utility>
237330f729Sjoerg
247330f729Sjoerg using namespace llvm;
257330f729Sjoerg
267330f729Sjoerg using Kind = RCToken::Kind;
277330f729Sjoerg
287330f729Sjoerg // Checks if Representation is a correct description of an RC integer.
297330f729Sjoerg // It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
307330f729Sjoerg // or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
317330f729Sjoerg // character (that is the difference between our representation and
327330f729Sjoerg // StringRef's one). If Representation is correct, 'true' is returned and
337330f729Sjoerg // the return value is put back in Num.
rcGetAsInteger(StringRef Representation,uint32_t & Num)347330f729Sjoerg static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
357330f729Sjoerg size_t Length = Representation.size();
367330f729Sjoerg if (Length == 0)
377330f729Sjoerg return false;
387330f729Sjoerg // Strip the last 'L' if unnecessary.
397330f729Sjoerg if (std::toupper(Representation.back()) == 'L')
407330f729Sjoerg Representation = Representation.drop_back(1);
417330f729Sjoerg
427330f729Sjoerg return !Representation.getAsInteger<uint32_t>(0, Num);
437330f729Sjoerg }
447330f729Sjoerg
RCToken(RCToken::Kind RCTokenKind,StringRef Value)457330f729Sjoerg RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
467330f729Sjoerg : TokenKind(RCTokenKind), TokenValue(Value) {}
477330f729Sjoerg
intValue() const487330f729Sjoerg uint32_t RCToken::intValue() const {
497330f729Sjoerg assert(TokenKind == Kind::Int);
507330f729Sjoerg // We assume that the token already is a correct integer (checked by
517330f729Sjoerg // rcGetAsInteger).
527330f729Sjoerg uint32_t Result;
537330f729Sjoerg bool IsSuccess = rcGetAsInteger(TokenValue, Result);
547330f729Sjoerg assert(IsSuccess);
557330f729Sjoerg (void)IsSuccess; // Silence the compiler warning when -DNDEBUG flag is on.
567330f729Sjoerg return Result;
577330f729Sjoerg }
587330f729Sjoerg
isLongInt() const597330f729Sjoerg bool RCToken::isLongInt() const {
607330f729Sjoerg return TokenKind == Kind::Int && std::toupper(TokenValue.back()) == 'L';
617330f729Sjoerg }
627330f729Sjoerg
value() const637330f729Sjoerg StringRef RCToken::value() const { return TokenValue; }
647330f729Sjoerg
kind() const657330f729Sjoerg Kind RCToken::kind() const { return TokenKind; }
667330f729Sjoerg
isBinaryOp() const677330f729Sjoerg bool RCToken::isBinaryOp() const {
687330f729Sjoerg switch (TokenKind) {
697330f729Sjoerg case Kind::Plus:
707330f729Sjoerg case Kind::Minus:
717330f729Sjoerg case Kind::Pipe:
727330f729Sjoerg case Kind::Amp:
737330f729Sjoerg return true;
747330f729Sjoerg default:
757330f729Sjoerg return false;
767330f729Sjoerg }
777330f729Sjoerg }
787330f729Sjoerg
getStringError(const Twine & message)797330f729Sjoerg static Error getStringError(const Twine &message) {
807330f729Sjoerg return make_error<StringError>("Error parsing file: " + message,
817330f729Sjoerg inconvertibleErrorCode());
827330f729Sjoerg }
837330f729Sjoerg
847330f729Sjoerg namespace {
857330f729Sjoerg
867330f729Sjoerg class Tokenizer {
877330f729Sjoerg public:
Tokenizer(StringRef Input)88*82d56013Sjoerg Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()), Pos(0) {}
897330f729Sjoerg
907330f729Sjoerg Expected<std::vector<RCToken>> run();
917330f729Sjoerg
927330f729Sjoerg private:
937330f729Sjoerg // All 'advancing' methods return boolean values; if they're equal to false,
947330f729Sjoerg // the stream has ended or failed.
957330f729Sjoerg bool advance(size_t Amount = 1);
967330f729Sjoerg bool skipWhitespaces();
977330f729Sjoerg
987330f729Sjoerg // Consumes a token. If any problem occurred, a non-empty Error is returned.
997330f729Sjoerg Error consumeToken(const Kind TokenKind);
1007330f729Sjoerg
1017330f729Sjoerg // Check if tokenizer is about to read FollowingChars.
1027330f729Sjoerg bool willNowRead(StringRef FollowingChars) const;
1037330f729Sjoerg
1047330f729Sjoerg // Check if tokenizer can start reading an identifier at current position.
1057330f729Sjoerg // The original tool did non specify the rules to determine what is a correct
1067330f729Sjoerg // identifier. We assume they should follow the C convention:
1077330f729Sjoerg // [a-zA-Z_][a-zA-Z0-9_]*.
1087330f729Sjoerg bool canStartIdentifier() const;
1097330f729Sjoerg // Check if tokenizer can continue reading an identifier.
1107330f729Sjoerg bool canContinueIdentifier() const;
1117330f729Sjoerg
1127330f729Sjoerg // Check if tokenizer can start reading an integer.
1137330f729Sjoerg // A correct integer always starts with a 0-9 digit,
1147330f729Sjoerg // can contain characters 0-9A-Fa-f (digits),
1157330f729Sjoerg // Ll (marking the integer is 32-bit), Xx (marking the representation
1167330f729Sjoerg // is hexadecimal). As some kind of separator should come after the
1177330f729Sjoerg // integer, we can consume the integer until a non-alphanumeric
1187330f729Sjoerg // character.
1197330f729Sjoerg bool canStartInt() const;
1207330f729Sjoerg bool canContinueInt() const;
1217330f729Sjoerg
1227330f729Sjoerg bool canStartString() const;
1237330f729Sjoerg
1247330f729Sjoerg // Check if tokenizer can start reading a single line comment (e.g. a comment
1257330f729Sjoerg // that begins with '//')
1267330f729Sjoerg bool canStartLineComment() const;
1277330f729Sjoerg
1287330f729Sjoerg // Check if tokenizer can start or finish reading a block comment (e.g. a
1297330f729Sjoerg // comment that begins with '/*' and ends with '*/')
1307330f729Sjoerg bool canStartBlockComment() const;
1317330f729Sjoerg
1327330f729Sjoerg // Throw away all remaining characters on the current line.
1337330f729Sjoerg void skipCurrentLine();
1347330f729Sjoerg
1357330f729Sjoerg bool streamEof() const;
1367330f729Sjoerg
1377330f729Sjoerg // Classify the token that is about to be read from the current position.
1387330f729Sjoerg Kind classifyCurrentToken() const;
1397330f729Sjoerg
1407330f729Sjoerg // Process the Kind::Identifier token - check if it is
1417330f729Sjoerg // an identifier describing a block start or end.
1427330f729Sjoerg void processIdentifier(RCToken &token) const;
1437330f729Sjoerg
1447330f729Sjoerg StringRef Data;
1457330f729Sjoerg size_t DataLength, Pos;
1467330f729Sjoerg };
1477330f729Sjoerg
skipCurrentLine()1487330f729Sjoerg void Tokenizer::skipCurrentLine() {
1497330f729Sjoerg Pos = Data.find_first_of("\r\n", Pos);
1507330f729Sjoerg Pos = Data.find_first_not_of("\r\n", Pos);
1517330f729Sjoerg
1527330f729Sjoerg if (Pos == StringRef::npos)
1537330f729Sjoerg Pos = DataLength;
1547330f729Sjoerg }
1557330f729Sjoerg
run()1567330f729Sjoerg Expected<std::vector<RCToken>> Tokenizer::run() {
1577330f729Sjoerg Pos = 0;
1587330f729Sjoerg std::vector<RCToken> Result;
1597330f729Sjoerg
1607330f729Sjoerg // Consume an optional UTF-8 Byte Order Mark.
1617330f729Sjoerg if (willNowRead("\xef\xbb\xbf"))
1627330f729Sjoerg advance(3);
1637330f729Sjoerg
1647330f729Sjoerg while (!streamEof()) {
1657330f729Sjoerg if (!skipWhitespaces())
1667330f729Sjoerg break;
1677330f729Sjoerg
1687330f729Sjoerg Kind TokenKind = classifyCurrentToken();
1697330f729Sjoerg if (TokenKind == Kind::Invalid)
1707330f729Sjoerg return getStringError("Invalid token found at position " + Twine(Pos));
1717330f729Sjoerg
1727330f729Sjoerg const size_t TokenStart = Pos;
1737330f729Sjoerg if (Error TokenError = consumeToken(TokenKind))
1747330f729Sjoerg return std::move(TokenError);
1757330f729Sjoerg
1767330f729Sjoerg // Comments are just deleted, don't bother saving them.
1777330f729Sjoerg if (TokenKind == Kind::LineComment || TokenKind == Kind::StartComment)
1787330f729Sjoerg continue;
1797330f729Sjoerg
1807330f729Sjoerg RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
1817330f729Sjoerg if (TokenKind == Kind::Identifier) {
1827330f729Sjoerg processIdentifier(Token);
1837330f729Sjoerg } else if (TokenKind == Kind::Int) {
1847330f729Sjoerg uint32_t TokenInt;
1857330f729Sjoerg if (!rcGetAsInteger(Token.value(), TokenInt)) {
1867330f729Sjoerg // The integer has incorrect format or cannot be represented in
1877330f729Sjoerg // a 32-bit integer.
1887330f729Sjoerg return getStringError("Integer invalid or too large: " +
1897330f729Sjoerg Token.value().str());
1907330f729Sjoerg }
1917330f729Sjoerg }
1927330f729Sjoerg
1937330f729Sjoerg Result.push_back(Token);
1947330f729Sjoerg }
1957330f729Sjoerg
1967330f729Sjoerg return Result;
1977330f729Sjoerg }
1987330f729Sjoerg
advance(size_t Amount)1997330f729Sjoerg bool Tokenizer::advance(size_t Amount) {
2007330f729Sjoerg Pos += Amount;
2017330f729Sjoerg return !streamEof();
2027330f729Sjoerg }
2037330f729Sjoerg
skipWhitespaces()2047330f729Sjoerg bool Tokenizer::skipWhitespaces() {
205*82d56013Sjoerg while (!streamEof() && isSpace(Data[Pos]))
2067330f729Sjoerg advance();
2077330f729Sjoerg return !streamEof();
2087330f729Sjoerg }
2097330f729Sjoerg
consumeToken(const Kind TokenKind)2107330f729Sjoerg Error Tokenizer::consumeToken(const Kind TokenKind) {
2117330f729Sjoerg switch (TokenKind) {
2127330f729Sjoerg // One-character token consumption.
2137330f729Sjoerg #define TOKEN(Name)
2147330f729Sjoerg #define SHORT_TOKEN(Name, Ch) case Kind::Name:
2157330f729Sjoerg #include "ResourceScriptTokenList.def"
2167330f729Sjoerg advance();
2177330f729Sjoerg return Error::success();
2187330f729Sjoerg
2197330f729Sjoerg case Kind::LineComment:
2207330f729Sjoerg advance(2);
2217330f729Sjoerg skipCurrentLine();
2227330f729Sjoerg return Error::success();
2237330f729Sjoerg
2247330f729Sjoerg case Kind::StartComment: {
2257330f729Sjoerg advance(2);
2267330f729Sjoerg auto EndPos = Data.find("*/", Pos);
2277330f729Sjoerg if (EndPos == StringRef::npos)
2287330f729Sjoerg return getStringError(
2297330f729Sjoerg "Unclosed multi-line comment beginning at position " + Twine(Pos));
2307330f729Sjoerg advance(EndPos - Pos);
2317330f729Sjoerg advance(2);
2327330f729Sjoerg return Error::success();
2337330f729Sjoerg }
2347330f729Sjoerg case Kind::Identifier:
2357330f729Sjoerg while (!streamEof() && canContinueIdentifier())
2367330f729Sjoerg advance();
2377330f729Sjoerg return Error::success();
2387330f729Sjoerg
2397330f729Sjoerg case Kind::Int:
2407330f729Sjoerg while (!streamEof() && canContinueInt())
2417330f729Sjoerg advance();
2427330f729Sjoerg return Error::success();
2437330f729Sjoerg
2447330f729Sjoerg case Kind::String:
2457330f729Sjoerg // Consume the preceding 'L', if there is any.
2467330f729Sjoerg if (std::toupper(Data[Pos]) == 'L')
2477330f729Sjoerg advance();
2487330f729Sjoerg // Consume the double-quote.
2497330f729Sjoerg advance();
2507330f729Sjoerg
2517330f729Sjoerg // Consume the characters until the end of the file, line or string.
2527330f729Sjoerg while (true) {
2537330f729Sjoerg if (streamEof()) {
2547330f729Sjoerg return getStringError("Unterminated string literal.");
2557330f729Sjoerg } else if (Data[Pos] == '"') {
2567330f729Sjoerg // Consume the ending double-quote.
2577330f729Sjoerg advance();
2587330f729Sjoerg // However, if another '"' follows this double-quote, the string didn't
2597330f729Sjoerg // end and we just included '"' into the string.
2607330f729Sjoerg if (!willNowRead("\""))
2617330f729Sjoerg return Error::success();
2627330f729Sjoerg } else if (Data[Pos] == '\n') {
2637330f729Sjoerg return getStringError("String literal not terminated in the line.");
2647330f729Sjoerg }
2657330f729Sjoerg
2667330f729Sjoerg advance();
2677330f729Sjoerg }
2687330f729Sjoerg
2697330f729Sjoerg case Kind::Invalid:
2707330f729Sjoerg assert(false && "Cannot consume an invalid token.");
2717330f729Sjoerg }
2727330f729Sjoerg
2737330f729Sjoerg llvm_unreachable("Unknown RCToken::Kind");
2747330f729Sjoerg }
2757330f729Sjoerg
willNowRead(StringRef FollowingChars) const2767330f729Sjoerg bool Tokenizer::willNowRead(StringRef FollowingChars) const {
2777330f729Sjoerg return Data.drop_front(Pos).startswith(FollowingChars);
2787330f729Sjoerg }
2797330f729Sjoerg
canStartIdentifier() const2807330f729Sjoerg bool Tokenizer::canStartIdentifier() const {
2817330f729Sjoerg assert(!streamEof());
2827330f729Sjoerg
2837330f729Sjoerg const char CurChar = Data[Pos];
2847330f729Sjoerg return std::isalpha(CurChar) || CurChar == '_' || CurChar == '.';
2857330f729Sjoerg }
2867330f729Sjoerg
canContinueIdentifier() const2877330f729Sjoerg bool Tokenizer::canContinueIdentifier() const {
2887330f729Sjoerg assert(!streamEof());
2897330f729Sjoerg const char CurChar = Data[Pos];
2907330f729Sjoerg return std::isalnum(CurChar) || CurChar == '_' || CurChar == '.' ||
2917330f729Sjoerg CurChar == '/' || CurChar == '\\';
2927330f729Sjoerg }
2937330f729Sjoerg
canStartInt() const2947330f729Sjoerg bool Tokenizer::canStartInt() const {
2957330f729Sjoerg assert(!streamEof());
2967330f729Sjoerg return std::isdigit(Data[Pos]);
2977330f729Sjoerg }
2987330f729Sjoerg
canStartBlockComment() const2997330f729Sjoerg bool Tokenizer::canStartBlockComment() const {
3007330f729Sjoerg assert(!streamEof());
3017330f729Sjoerg return Data.drop_front(Pos).startswith("/*");
3027330f729Sjoerg }
3037330f729Sjoerg
canStartLineComment() const3047330f729Sjoerg bool Tokenizer::canStartLineComment() const {
3057330f729Sjoerg assert(!streamEof());
3067330f729Sjoerg return Data.drop_front(Pos).startswith("//");
3077330f729Sjoerg }
3087330f729Sjoerg
canContinueInt() const3097330f729Sjoerg bool Tokenizer::canContinueInt() const {
3107330f729Sjoerg assert(!streamEof());
3117330f729Sjoerg return std::isalnum(Data[Pos]);
3127330f729Sjoerg }
3137330f729Sjoerg
canStartString() const3147330f729Sjoerg bool Tokenizer::canStartString() const {
3157330f729Sjoerg return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
3167330f729Sjoerg }
3177330f729Sjoerg
streamEof() const3187330f729Sjoerg bool Tokenizer::streamEof() const { return Pos == DataLength; }
3197330f729Sjoerg
classifyCurrentToken() const3207330f729Sjoerg Kind Tokenizer::classifyCurrentToken() const {
3217330f729Sjoerg if (canStartBlockComment())
3227330f729Sjoerg return Kind::StartComment;
3237330f729Sjoerg if (canStartLineComment())
3247330f729Sjoerg return Kind::LineComment;
3257330f729Sjoerg
3267330f729Sjoerg if (canStartInt())
3277330f729Sjoerg return Kind::Int;
3287330f729Sjoerg if (canStartString())
3297330f729Sjoerg return Kind::String;
3307330f729Sjoerg // BEGIN and END are at this point of lexing recognized as identifiers.
3317330f729Sjoerg if (canStartIdentifier())
3327330f729Sjoerg return Kind::Identifier;
3337330f729Sjoerg
3347330f729Sjoerg const char CurChar = Data[Pos];
3357330f729Sjoerg
3367330f729Sjoerg switch (CurChar) {
3377330f729Sjoerg // One-character token classification.
3387330f729Sjoerg #define TOKEN(Name)
3397330f729Sjoerg #define SHORT_TOKEN(Name, Ch) \
3407330f729Sjoerg case Ch: \
3417330f729Sjoerg return Kind::Name;
3427330f729Sjoerg #include "ResourceScriptTokenList.def"
3437330f729Sjoerg
3447330f729Sjoerg default:
3457330f729Sjoerg return Kind::Invalid;
3467330f729Sjoerg }
3477330f729Sjoerg }
3487330f729Sjoerg
processIdentifier(RCToken & Token) const3497330f729Sjoerg void Tokenizer::processIdentifier(RCToken &Token) const {
3507330f729Sjoerg assert(Token.kind() == Kind::Identifier);
3517330f729Sjoerg StringRef Name = Token.value();
3527330f729Sjoerg
3537330f729Sjoerg if (Name.equals_lower("begin"))
3547330f729Sjoerg Token = RCToken(Kind::BlockBegin, Name);
3557330f729Sjoerg else if (Name.equals_lower("end"))
3567330f729Sjoerg Token = RCToken(Kind::BlockEnd, Name);
3577330f729Sjoerg }
3587330f729Sjoerg
3597330f729Sjoerg } // anonymous namespace
3607330f729Sjoerg
3617330f729Sjoerg namespace llvm {
3627330f729Sjoerg
tokenizeRC(StringRef Input)3637330f729Sjoerg Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
3647330f729Sjoerg return Tokenizer(Input).run();
3657330f729Sjoerg }
3667330f729Sjoerg
3677330f729Sjoerg } // namespace llvm
368