tools/llvm-rc/ResourceScriptToken.cpp

//===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===---------------------------------------------------------------------===//
//
// This file implements an interface defined in ResourceScriptToken.h.
// In particular, it defines an .rc script tokenizer.
//
//===---------------------------------------------------------------------===//
7330f729Sjoerg
7330f729Sjoerg#include "ResourceScriptToken.h"
*82d56013Sjoerg#include "llvm/ADT/StringExtras.h"
7330f729Sjoerg#include "llvm/Support/raw_ostream.h"
7330f729Sjoerg
7330f729Sjoerg#include <algorithm>
7330f729Sjoerg#include <cassert>
7330f729Sjoerg#include <cctype>
7330f729Sjoerg#include <cstdlib>
7330f729Sjoerg#include <utility>
7330f729Sjoerg
7330f729Sjoergusing namespace llvm;
7330f729Sjoerg
7330f729Sjoergusing Kind = RCToken::Kind;
7330f729Sjoerg
7330f729Sjoerg// Checks if Representation is a correct description of an RC integer.
7330f729Sjoerg// It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
7330f729Sjoerg// or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
7330f729Sjoerg// character (that is the difference between our representation and
7330f729Sjoerg// StringRef's one). If Representation is correct, 'true' is returned and
7330f729Sjoerg// the return value is put back in Num.
7330f729Sjoergstatic bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
7330f729Sjoerg  size_t Length = Representation.size();
7330f729Sjoerg  if (Length == 0)
7330f729Sjoerg    return false;
7330f729Sjoerg  // Strip the last 'L' if unnecessary.
7330f729Sjoerg  if (std::toupper(Representation.back()) == 'L')
7330f729Sjoerg    Representation = Representation.drop_back(1);
7330f729Sjoerg
7330f729Sjoerg  return !Representation.getAsInteger<uint32_t>(0, Num);
7330f729Sjoerg}
7330f729Sjoerg
7330f729SjoergRCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
7330f729Sjoerg    : TokenKind(RCTokenKind), TokenValue(Value) {}
7330f729Sjoerg
7330f729Sjoerguint32_t RCToken::intValue() const {
7330f729Sjoerg  assert(TokenKind == Kind::Int);
7330f729Sjoerg  // We assume that the token already is a correct integer (checked by
7330f729Sjoerg  // rcGetAsInteger).
7330f729Sjoerg  uint32_t Result;
7330f729Sjoerg  bool IsSuccess = rcGetAsInteger(TokenValue, Result);
7330f729Sjoerg  assert(IsSuccess);
7330f729Sjoerg  (void)IsSuccess;  // Silence the compiler warning when -DNDEBUG flag is on.
7330f729Sjoerg  return Result;
7330f729Sjoerg}
7330f729Sjoerg
7330f729Sjoergbool RCToken::isLongInt() const {
7330f729Sjoerg  return TokenKind == Kind::Int && std::toupper(TokenValue.back()) == 'L';
7330f729Sjoerg}
7330f729Sjoerg
7330f729SjoergStringRef RCToken::value() const { return TokenValue; }
7330f729Sjoerg
7330f729SjoergKind RCToken::kind() const { return TokenKind; }
7330f729Sjoerg
7330f729Sjoergbool RCToken::isBinaryOp() const {
7330f729Sjoerg  switch (TokenKind) {
7330f729Sjoerg  case Kind::Plus:
7330f729Sjoerg  case Kind::Minus:
7330f729Sjoerg  case Kind::Pipe:
7330f729Sjoerg  case Kind::Amp:
7330f729Sjoerg    return true;
7330f729Sjoerg  default:
7330f729Sjoerg    return false;
7330f729Sjoerg  }
7330f729Sjoerg}
7330f729Sjoerg
7330f729Sjoergstatic Error getStringError(const Twine &message) {
7330f729Sjoerg  return make_error<StringError>("Error parsing file: " + message,
7330f729Sjoerg                                 inconvertibleErrorCode());
7330f729Sjoerg}
7330f729Sjoerg
7330f729Sjoergnamespace {
7330f729Sjoerg
7330f729Sjoergclass Tokenizer {
7330f729Sjoergpublic:
*82d56013Sjoerg  Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()), Pos(0) {}
7330f729Sjoerg
7330f729Sjoerg  Expected<std::vector<RCToken>> run();
7330f729Sjoerg
7330f729Sjoergprivate:
7330f729Sjoerg  // All 'advancing' methods return boolean values; if they're equal to false,
7330f729Sjoerg  // the stream has ended or failed.
7330f729Sjoerg  bool advance(size_t Amount = 1);
7330f729Sjoerg  bool skipWhitespaces();
7330f729Sjoerg
7330f729Sjoerg  // Consumes a token. If any problem occurred, a non-empty Error is returned.
7330f729Sjoerg  Error consumeToken(const Kind TokenKind);
7330f729Sjoerg
7330f729Sjoerg  // Check if tokenizer is about to read FollowingChars.
7330f729Sjoerg  bool willNowRead(StringRef FollowingChars) const;
7330f729Sjoerg
7330f729Sjoerg  // Check if tokenizer can start reading an identifier at current position.
7330f729Sjoerg  // The original tool did non specify the rules to determine what is a correct
7330f729Sjoerg  // identifier. We assume they should follow the C convention:
7330f729Sjoerg  // [a-zA-Z_][a-zA-Z0-9_]*.
7330f729Sjoerg  bool canStartIdentifier() const;
7330f729Sjoerg  // Check if tokenizer can continue reading an identifier.
7330f729Sjoerg  bool canContinueIdentifier() const;
7330f729Sjoerg
7330f729Sjoerg  // Check if tokenizer can start reading an integer.
7330f729Sjoerg  // A correct integer always starts with a 0-9 digit,
7330f729Sjoerg  // can contain characters 0-9A-Fa-f (digits),
7330f729Sjoerg  // Ll (marking the integer is 32-bit), Xx (marking the representation
7330f729Sjoerg  // is hexadecimal). As some kind of separator should come after the
7330f729Sjoerg  // integer, we can consume the integer until a non-alphanumeric
7330f729Sjoerg  // character.
7330f729Sjoerg  bool canStartInt() const;
7330f729Sjoerg  bool canContinueInt() const;
7330f729Sjoerg
7330f729Sjoerg  bool canStartString() const;
7330f729Sjoerg
7330f729Sjoerg  // Check if tokenizer can start reading a single line comment (e.g. a comment
7330f729Sjoerg  // that begins with '//')
7330f729Sjoerg  bool canStartLineComment() const;
7330f729Sjoerg
7330f729Sjoerg  // Check if tokenizer can start or finish reading a block comment (e.g. a
7330f729Sjoerg  // comment that begins with '/*' and ends with '*/')
7330f729Sjoerg  bool canStartBlockComment() const;
7330f729Sjoerg
7330f729Sjoerg  // Throw away all remaining characters on the current line.
7330f729Sjoerg  void skipCurrentLine();
7330f729Sjoerg
7330f729Sjoerg  bool streamEof() const;
7330f729Sjoerg
7330f729Sjoerg  // Classify the token that is about to be read from the current position.
7330f729Sjoerg  Kind classifyCurrentToken() const;
7330f729Sjoerg
7330f729Sjoerg  // Process the Kind::Identifier token - check if it is
7330f729Sjoerg  // an identifier describing a block start or end.
7330f729Sjoerg  void processIdentifier(RCToken &token) const;
7330f729Sjoerg
7330f729Sjoerg  StringRef Data;
7330f729Sjoerg  size_t DataLength, Pos;
7330f729Sjoerg};
7330f729Sjoerg
7330f729Sjoergvoid Tokenizer::skipCurrentLine() {
7330f729Sjoerg  Pos = Data.find_first_of("\r\n", Pos);
7330f729Sjoerg  Pos = Data.find_first_not_of("\r\n", Pos);
7330f729Sjoerg
7330f729Sjoerg  if (Pos == StringRef::npos)
7330f729Sjoerg    Pos = DataLength;
7330f729Sjoerg}
7330f729Sjoerg
7330f729SjoergExpected<std::vector<RCToken>> Tokenizer::run() {
7330f729Sjoerg  Pos = 0;
7330f729Sjoerg  std::vector<RCToken> Result;
7330f729Sjoerg
7330f729Sjoerg  // Consume an optional UTF-8 Byte Order Mark.
7330f729Sjoerg  if (willNowRead("\xef\xbb\xbf"))
7330f729Sjoerg    advance(3);
7330f729Sjoerg
7330f729Sjoerg  while (!streamEof()) {
7330f729Sjoerg    if (!skipWhitespaces())
7330f729Sjoerg      break;
7330f729Sjoerg
7330f729Sjoerg    Kind TokenKind = classifyCurrentToken();
7330f729Sjoerg    if (TokenKind == Kind::Invalid)
7330f729Sjoerg      return getStringError("Invalid token found at position " + Twine(Pos));
7330f729Sjoerg
7330f729Sjoerg    const size_t TokenStart = Pos;
7330f729Sjoerg    if (Error TokenError = consumeToken(TokenKind))
7330f729Sjoerg      return std::move(TokenError);
7330f729Sjoerg
7330f729Sjoerg    // Comments are just deleted, don't bother saving them.
7330f729Sjoerg    if (TokenKind == Kind::LineComment || TokenKind == Kind::StartComment)
7330f729Sjoerg      continue;
7330f729Sjoerg
7330f729Sjoerg    RCToken Token(TokenKind, Data.take_front(Pos).drop_front(TokenStart));
7330f729Sjoerg    if (TokenKind == Kind::Identifier) {
7330f729Sjoerg      processIdentifier(Token);
7330f729Sjoerg    } else if (TokenKind == Kind::Int) {
7330f729Sjoerg      uint32_t TokenInt;
7330f729Sjoerg      if (!rcGetAsInteger(Token.value(), TokenInt)) {
7330f729Sjoerg        // The integer has incorrect format or cannot be represented in
7330f729Sjoerg        // a 32-bit integer.
7330f729Sjoerg        return getStringError("Integer invalid or too large: " +
7330f729Sjoerg                              Token.value().str());
7330f729Sjoerg      }
7330f729Sjoerg    }
7330f729Sjoerg
7330f729Sjoerg    Result.push_back(Token);
7330f729Sjoerg  }
7330f729Sjoerg
7330f729Sjoerg  return Result;
7330f729Sjoerg}
7330f729Sjoerg
7330f729Sjoergbool Tokenizer::advance(size_t Amount) {
7330f729Sjoerg  Pos += Amount;
7330f729Sjoerg  return !streamEof();
7330f729Sjoerg}
7330f729Sjoerg
7330f729Sjoergbool Tokenizer::skipWhitespaces() {
*82d56013Sjoerg  while (!streamEof() && isSpace(Data[Pos]))
7330f729Sjoerg    advance();
7330f729Sjoerg  return !streamEof();
7330f729Sjoerg}
7330f729Sjoerg
7330f729SjoergError Tokenizer::consumeToken(const Kind TokenKind) {
7330f729Sjoerg  switch (TokenKind) {
7330f729Sjoerg  // One-character token consumption.
7330f729Sjoerg#define TOKEN(Name)
7330f729Sjoerg#define SHORT_TOKEN(Name, Ch) case Kind::Name:
7330f729Sjoerg#include "ResourceScriptTokenList.def"
7330f729Sjoerg    advance();
7330f729Sjoerg    return Error::success();
7330f729Sjoerg
7330f729Sjoerg  case Kind::LineComment:
7330f729Sjoerg    advance(2);
7330f729Sjoerg    skipCurrentLine();
7330f729Sjoerg    return Error::success();
7330f729Sjoerg
7330f729Sjoerg  case Kind::StartComment: {
7330f729Sjoerg    advance(2);
7330f729Sjoerg    auto EndPos = Data.find("*/", Pos);
7330f729Sjoerg    if (EndPos == StringRef::npos)
7330f729Sjoerg      return getStringError(
7330f729Sjoerg          "Unclosed multi-line comment beginning at position " + Twine(Pos));
7330f729Sjoerg    advance(EndPos - Pos);
7330f729Sjoerg    advance(2);
7330f729Sjoerg    return Error::success();
7330f729Sjoerg  }
7330f729Sjoerg  case Kind::Identifier:
7330f729Sjoerg    while (!streamEof() && canContinueIdentifier())
7330f729Sjoerg      advance();
7330f729Sjoerg    return Error::success();
7330f729Sjoerg
7330f729Sjoerg  case Kind::Int:
7330f729Sjoerg    while (!streamEof() && canContinueInt())
7330f729Sjoerg      advance();
7330f729Sjoerg    return Error::success();
7330f729Sjoerg
7330f729Sjoerg  case Kind::String:
7330f729Sjoerg    // Consume the preceding 'L', if there is any.
7330f729Sjoerg    if (std::toupper(Data[Pos]) == 'L')
7330f729Sjoerg      advance();
7330f729Sjoerg    // Consume the double-quote.
7330f729Sjoerg    advance();
7330f729Sjoerg
7330f729Sjoerg    // Consume the characters until the end of the file, line or string.
7330f729Sjoerg    while (true) {
7330f729Sjoerg      if (streamEof()) {
7330f729Sjoerg        return getStringError("Unterminated string literal.");
7330f729Sjoerg      } else if (Data[Pos] == '"') {
7330f729Sjoerg        // Consume the ending double-quote.
7330f729Sjoerg        advance();
7330f729Sjoerg        // However, if another '"' follows this double-quote, the string didn't
7330f729Sjoerg        // end and we just included '"' into the string.
7330f729Sjoerg        if (!willNowRead("\""))
7330f729Sjoerg          return Error::success();
7330f729Sjoerg      } else if (Data[Pos] == '\n') {
7330f729Sjoerg        return getStringError("String literal not terminated in the line.");
7330f729Sjoerg      }
7330f729Sjoerg
7330f729Sjoerg      advance();
7330f729Sjoerg    }
7330f729Sjoerg
7330f729Sjoerg  case Kind::Invalid:
7330f729Sjoerg    assert(false && "Cannot consume an invalid token.");
7330f729Sjoerg  }
7330f729Sjoerg
7330f729Sjoerg  llvm_unreachable("Unknown RCToken::Kind");
7330f729Sjoerg}
7330f729Sjoerg
7330f729Sjoergbool Tokenizer::willNowRead(StringRef FollowingChars) const {
7330f729Sjoerg  return Data.drop_front(Pos).startswith(FollowingChars);
7330f729Sjoerg}
7330f729Sjoerg
7330f729Sjoergbool Tokenizer::canStartIdentifier() const {
7330f729Sjoerg  assert(!streamEof());
7330f729Sjoerg
7330f729Sjoerg  const char CurChar = Data[Pos];
7330f729Sjoerg  return std::isalpha(CurChar) || CurChar == '_' || CurChar == '.';
7330f729Sjoerg}
7330f729Sjoerg
7330f729Sjoergbool Tokenizer::canContinueIdentifier() const {
7330f729Sjoerg  assert(!streamEof());
7330f729Sjoerg  const char CurChar = Data[Pos];
7330f729Sjoerg  return std::isalnum(CurChar) || CurChar == '_' || CurChar == '.' ||
7330f729Sjoerg         CurChar == '/' || CurChar == '\\';
7330f729Sjoerg}
7330f729Sjoerg
7330f729Sjoergbool Tokenizer::canStartInt() const {
7330f729Sjoerg  assert(!streamEof());
7330f729Sjoerg  return std::isdigit(Data[Pos]);
7330f729Sjoerg}
7330f729Sjoerg
7330f729Sjoergbool Tokenizer::canStartBlockComment() const {
7330f729Sjoerg  assert(!streamEof());
7330f729Sjoerg  return Data.drop_front(Pos).startswith("/*");
7330f729Sjoerg}
7330f729Sjoerg
7330f729Sjoergbool Tokenizer::canStartLineComment() const {
7330f729Sjoerg  assert(!streamEof());
7330f729Sjoerg  return Data.drop_front(Pos).startswith("//");
7330f729Sjoerg}
7330f729Sjoerg
7330f729Sjoergbool Tokenizer::canContinueInt() const {
7330f729Sjoerg  assert(!streamEof());
7330f729Sjoerg  return std::isalnum(Data[Pos]);
7330f729Sjoerg}
7330f729Sjoerg
7330f729Sjoergbool Tokenizer::canStartString() const {
7330f729Sjoerg  return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
7330f729Sjoerg}
7330f729Sjoerg
7330f729Sjoergbool Tokenizer::streamEof() const { return Pos == DataLength; }
7330f729Sjoerg
7330f729SjoergKind Tokenizer::classifyCurrentToken() const {
7330f729Sjoerg  if (canStartBlockComment())
7330f729Sjoerg    return Kind::StartComment;
7330f729Sjoerg  if (canStartLineComment())
7330f729Sjoerg    return Kind::LineComment;
7330f729Sjoerg
7330f729Sjoerg  if (canStartInt())
7330f729Sjoerg    return Kind::Int;
7330f729Sjoerg  if (canStartString())
7330f729Sjoerg    return Kind::String;
7330f729Sjoerg  // BEGIN and END are at this point of lexing recognized as identifiers.
7330f729Sjoerg  if (canStartIdentifier())
7330f729Sjoerg    return Kind::Identifier;
7330f729Sjoerg
7330f729Sjoerg  const char CurChar = Data[Pos];
7330f729Sjoerg
7330f729Sjoerg  switch (CurChar) {
7330f729Sjoerg  // One-character token classification.
7330f729Sjoerg#define TOKEN(Name)
7330f729Sjoerg#define SHORT_TOKEN(Name, Ch)                                                  \
7330f729Sjoerg  case Ch:                                                                     \
7330f729Sjoerg    return Kind::Name;
7330f729Sjoerg#include "ResourceScriptTokenList.def"
7330f729Sjoerg
7330f729Sjoerg  default:
7330f729Sjoerg    return Kind::Invalid;
7330f729Sjoerg  }
7330f729Sjoerg}
7330f729Sjoerg
7330f729Sjoergvoid Tokenizer::processIdentifier(RCToken &Token) const {
7330f729Sjoerg  assert(Token.kind() == Kind::Identifier);
7330f729Sjoerg  StringRef Name = Token.value();
7330f729Sjoerg
7330f729Sjoerg  if (Name.equals_lower("begin"))
7330f729Sjoerg    Token = RCToken(Kind::BlockBegin, Name);
7330f729Sjoerg  else if (Name.equals_lower("end"))
7330f729Sjoerg    Token = RCToken(Kind::BlockEnd, Name);
7330f729Sjoerg}
7330f729Sjoerg
7330f729Sjoerg} // anonymous namespace
7330f729Sjoerg
7330f729Sjoergnamespace llvm {
7330f729Sjoerg
7330f729SjoergExpected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
7330f729Sjoerg  return Tokenizer(Input).run();
7330f729Sjoerg}
7330f729Sjoerg
7330f729Sjoerg} // namespace llvm