1 //===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 /// 10 /// \file 11 /// \brief Recursive parser implementation for the matcher expression grammar. 12 /// 13 //===----------------------------------------------------------------------===// 14 15 #include <string> 16 #include <vector> 17 18 #include "clang/ASTMatchers/Dynamic/Parser.h" 19 #include "clang/ASTMatchers/Dynamic/Registry.h" 20 #include "clang/Basic/CharInfo.h" 21 #include "llvm/ADT/Optional.h" 22 #include "llvm/ADT/Twine.h" 23 24 namespace clang { 25 namespace ast_matchers { 26 namespace dynamic { 27 28 /// \brief Simple structure to hold information for one token from the parser. 29 struct Parser::TokenInfo { 30 /// \brief Different possible tokens. 31 enum TokenKind { 32 TK_Eof = 0, 33 TK_OpenParen = 1, 34 TK_CloseParen = 2, 35 TK_Comma = 3, 36 TK_Period = 4, 37 TK_Literal = 5, 38 TK_Ident = 6, 39 TK_InvalidChar = 7, 40 TK_Error = 8 41 }; 42 43 /// \brief Some known identifiers. 44 static const char* const ID_Bind; 45 46 TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {} 47 48 StringRef Text; 49 TokenKind Kind; 50 SourceRange Range; 51 VariantValue Value; 52 }; 53 54 const char* const Parser::TokenInfo::ID_Bind = "bind"; 55 56 /// \brief Simple tokenizer for the parser. 57 class Parser::CodeTokenizer { 58 public: 59 explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error) 60 : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error) { 61 NextToken = getNextToken(); 62 } 63 64 /// \brief Returns but doesn't consume the next token. 65 const TokenInfo &peekNextToken() const { return NextToken; } 66 67 /// \brief Consumes and returns the next token. 68 TokenInfo consumeNextToken() { 69 TokenInfo ThisToken = NextToken; 70 NextToken = getNextToken(); 71 return ThisToken; 72 } 73 74 TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; } 75 76 private: 77 TokenInfo getNextToken() { 78 consumeWhitespace(); 79 TokenInfo Result; 80 Result.Range.Start = currentLocation(); 81 82 if (Code.empty()) { 83 Result.Kind = TokenInfo::TK_Eof; 84 Result.Text = ""; 85 return Result; 86 } 87 88 switch (Code[0]) { 89 case ',': 90 Result.Kind = TokenInfo::TK_Comma; 91 Result.Text = Code.substr(0, 1); 92 Code = Code.drop_front(); 93 break; 94 case '.': 95 Result.Kind = TokenInfo::TK_Period; 96 Result.Text = Code.substr(0, 1); 97 Code = Code.drop_front(); 98 break; 99 case '(': 100 Result.Kind = TokenInfo::TK_OpenParen; 101 Result.Text = Code.substr(0, 1); 102 Code = Code.drop_front(); 103 break; 104 case ')': 105 Result.Kind = TokenInfo::TK_CloseParen; 106 Result.Text = Code.substr(0, 1); 107 Code = Code.drop_front(); 108 break; 109 110 case '"': 111 case '\'': 112 // Parse a string literal. 113 consumeStringLiteral(&Result); 114 break; 115 116 case '0': case '1': case '2': case '3': case '4': 117 case '5': case '6': case '7': case '8': case '9': 118 // Parse an unsigned literal. 119 consumeUnsignedLiteral(&Result); 120 break; 121 122 default: 123 if (isAlphanumeric(Code[0])) { 124 // Parse an identifier 125 size_t TokenLength = 1; 126 while (TokenLength < Code.size() && isAlphanumeric(Code[TokenLength])) 127 ++TokenLength; 128 Result.Kind = TokenInfo::TK_Ident; 129 Result.Text = Code.substr(0, TokenLength); 130 Code = Code.drop_front(TokenLength); 131 } else { 132 Result.Kind = TokenInfo::TK_InvalidChar; 133 Result.Text = Code.substr(0, 1); 134 Code = Code.drop_front(1); 135 } 136 break; 137 } 138 139 Result.Range.End = currentLocation(); 140 return Result; 141 } 142 143 /// \brief Consume an unsigned literal. 144 void consumeUnsignedLiteral(TokenInfo *Result) { 145 unsigned Length = 1; 146 if (Code.size() > 1) { 147 // Consume the 'x' or 'b' radix modifier, if present. 148 switch (toLowercase(Code[1])) { 149 case 'x': case 'b': Length = 2; 150 } 151 } 152 while (Length < Code.size() && isHexDigit(Code[Length])) 153 ++Length; 154 155 Result->Text = Code.substr(0, Length); 156 Code = Code.drop_front(Length); 157 158 unsigned Value; 159 if (!Result->Text.getAsInteger(0, Value)) { 160 Result->Kind = TokenInfo::TK_Literal; 161 Result->Value = Value; 162 } else { 163 SourceRange Range; 164 Range.Start = Result->Range.Start; 165 Range.End = currentLocation(); 166 Error->addError(Range, Error->ET_ParserUnsignedError) << Result->Text; 167 Result->Kind = TokenInfo::TK_Error; 168 } 169 } 170 171 /// \brief Consume a string literal. 172 /// 173 /// \c Code must be positioned at the start of the literal (the opening 174 /// quote). Consumed until it finds the same closing quote character. 175 void consumeStringLiteral(TokenInfo *Result) { 176 bool InEscape = false; 177 const char Marker = Code[0]; 178 for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) { 179 if (InEscape) { 180 InEscape = false; 181 continue; 182 } 183 if (Code[Length] == '\\') { 184 InEscape = true; 185 continue; 186 } 187 if (Code[Length] == Marker) { 188 Result->Kind = TokenInfo::TK_Literal; 189 Result->Text = Code.substr(0, Length + 1); 190 Result->Value = Code.substr(1, Length - 1).str(); 191 Code = Code.drop_front(Length + 1); 192 return; 193 } 194 } 195 196 StringRef ErrorText = Code; 197 Code = Code.drop_front(Code.size()); 198 SourceRange Range; 199 Range.Start = Result->Range.Start; 200 Range.End = currentLocation(); 201 Error->addError(Range, Error->ET_ParserStringError) << ErrorText; 202 Result->Kind = TokenInfo::TK_Error; 203 } 204 205 /// \brief Consume all leading whitespace from \c Code. 206 void consumeWhitespace() { 207 while (!Code.empty() && isWhitespace(Code[0])) { 208 if (Code[0] == '\n') { 209 ++Line; 210 StartOfLine = Code.drop_front(); 211 } 212 Code = Code.drop_front(); 213 } 214 } 215 216 SourceLocation currentLocation() { 217 SourceLocation Location; 218 Location.Line = Line; 219 Location.Column = Code.data() - StartOfLine.data() + 1; 220 return Location; 221 } 222 223 StringRef Code; 224 StringRef StartOfLine; 225 unsigned Line; 226 Diagnostics *Error; 227 TokenInfo NextToken; 228 }; 229 230 Parser::Sema::~Sema() {} 231 232 /// \brief Parse and validate a matcher expression. 233 /// \return \c true on success, in which case \c Value has the matcher parsed. 234 /// If the input is malformed, or some argument has an error, it 235 /// returns \c false. 236 bool Parser::parseMatcherExpressionImpl(VariantValue *Value) { 237 const TokenInfo NameToken = Tokenizer->consumeNextToken(); 238 assert(NameToken.Kind == TokenInfo::TK_Ident); 239 const TokenInfo OpenToken = Tokenizer->consumeNextToken(); 240 if (OpenToken.Kind != TokenInfo::TK_OpenParen) { 241 Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen) 242 << OpenToken.Text; 243 return false; 244 } 245 246 llvm::Optional<MatcherCtor> Ctor = 247 S->lookupMatcherCtor(NameToken.Text, NameToken.Range, Error); 248 249 std::vector<ParserValue> Args; 250 TokenInfo EndToken; 251 while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) { 252 if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) { 253 // End of args. 254 EndToken = Tokenizer->consumeNextToken(); 255 break; 256 } 257 if (Args.size() > 0) { 258 // We must find a , token to continue. 259 const TokenInfo CommaToken = Tokenizer->consumeNextToken(); 260 if (CommaToken.Kind != TokenInfo::TK_Comma) { 261 Error->addError(CommaToken.Range, Error->ET_ParserNoComma) 262 << CommaToken.Text; 263 return false; 264 } 265 } 266 267 Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error, 268 NameToken.Text, NameToken.Range, Args.size() + 1); 269 ParserValue ArgValue; 270 ArgValue.Text = Tokenizer->peekNextToken().Text; 271 ArgValue.Range = Tokenizer->peekNextToken().Range; 272 if (!parseExpressionImpl(&ArgValue.Value)) return false; 273 274 Args.push_back(ArgValue); 275 } 276 277 if (EndToken.Kind == TokenInfo::TK_Eof) { 278 Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen); 279 return false; 280 } 281 282 std::string BindID; 283 if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) { 284 // Parse .bind("foo") 285 Tokenizer->consumeNextToken(); // consume the period. 286 const TokenInfo BindToken = Tokenizer->consumeNextToken(); 287 const TokenInfo OpenToken = Tokenizer->consumeNextToken(); 288 const TokenInfo IDToken = Tokenizer->consumeNextToken(); 289 const TokenInfo CloseToken = Tokenizer->consumeNextToken(); 290 291 // TODO: We could use different error codes for each/some to be more 292 // explicit about the syntax error. 293 if (BindToken.Kind != TokenInfo::TK_Ident || 294 BindToken.Text != TokenInfo::ID_Bind) { 295 Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr); 296 return false; 297 } 298 if (OpenToken.Kind != TokenInfo::TK_OpenParen) { 299 Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr); 300 return false; 301 } 302 if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) { 303 Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr); 304 return false; 305 } 306 if (CloseToken.Kind != TokenInfo::TK_CloseParen) { 307 Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr); 308 return false; 309 } 310 BindID = IDToken.Value.getString(); 311 } 312 313 if (!Ctor) 314 return false; 315 316 // Merge the start and end infos. 317 Diagnostics::Context Ctx(Diagnostics::Context::ConstructMatcher, Error, 318 NameToken.Text, NameToken.Range); 319 SourceRange MatcherRange = NameToken.Range; 320 MatcherRange.End = EndToken.Range.End; 321 VariantMatcher Result = S->actOnMatcherExpression( 322 *Ctor, MatcherRange, BindID, Args, Error); 323 if (Result.isNull()) return false; 324 325 *Value = Result; 326 return true; 327 } 328 329 /// \brief Parse an <Expresssion> 330 bool Parser::parseExpressionImpl(VariantValue *Value) { 331 switch (Tokenizer->nextTokenKind()) { 332 case TokenInfo::TK_Literal: 333 *Value = Tokenizer->consumeNextToken().Value; 334 return true; 335 336 case TokenInfo::TK_Ident: 337 return parseMatcherExpressionImpl(Value); 338 339 case TokenInfo::TK_Eof: 340 Error->addError(Tokenizer->consumeNextToken().Range, 341 Error->ET_ParserNoCode); 342 return false; 343 344 case TokenInfo::TK_Error: 345 // This error was already reported by the tokenizer. 346 return false; 347 348 case TokenInfo::TK_OpenParen: 349 case TokenInfo::TK_CloseParen: 350 case TokenInfo::TK_Comma: 351 case TokenInfo::TK_Period: 352 case TokenInfo::TK_InvalidChar: 353 const TokenInfo Token = Tokenizer->consumeNextToken(); 354 Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text; 355 return false; 356 } 357 358 llvm_unreachable("Unknown token kind."); 359 } 360 361 Parser::Parser(CodeTokenizer *Tokenizer, Sema *S, 362 Diagnostics *Error) 363 : Tokenizer(Tokenizer), S(S), Error(Error) {} 364 365 class RegistrySema : public Parser::Sema { 366 public: 367 virtual ~RegistrySema() {} 368 llvm::Optional<MatcherCtor> lookupMatcherCtor(StringRef MatcherName, 369 const SourceRange &NameRange, 370 Diagnostics *Error) { 371 return Registry::lookupMatcherCtor(MatcherName, NameRange, Error); 372 } 373 VariantMatcher actOnMatcherExpression(MatcherCtor Ctor, 374 const SourceRange &NameRange, 375 StringRef BindID, 376 ArrayRef<ParserValue> Args, 377 Diagnostics *Error) { 378 if (BindID.empty()) { 379 return Registry::constructMatcher(Ctor, NameRange, Args, Error); 380 } else { 381 return Registry::constructBoundMatcher(Ctor, NameRange, BindID, Args, 382 Error); 383 } 384 } 385 }; 386 387 bool Parser::parseExpression(StringRef Code, VariantValue *Value, 388 Diagnostics *Error) { 389 RegistrySema S; 390 return parseExpression(Code, &S, Value, Error); 391 } 392 393 bool Parser::parseExpression(StringRef Code, Sema *S, 394 VariantValue *Value, Diagnostics *Error) { 395 CodeTokenizer Tokenizer(Code, Error); 396 if (!Parser(&Tokenizer, S, Error).parseExpressionImpl(Value)) return false; 397 if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) { 398 Error->addError(Tokenizer.peekNextToken().Range, 399 Error->ET_ParserTrailingCode); 400 return false; 401 } 402 return true; 403 } 404 405 llvm::Optional<DynTypedMatcher> 406 Parser::parseMatcherExpression(StringRef Code, Diagnostics *Error) { 407 RegistrySema S; 408 return parseMatcherExpression(Code, &S, Error); 409 } 410 411 llvm::Optional<DynTypedMatcher> 412 Parser::parseMatcherExpression(StringRef Code, Parser::Sema *S, 413 Diagnostics *Error) { 414 VariantValue Value; 415 if (!parseExpression(Code, S, &Value, Error)) 416 return llvm::Optional<DynTypedMatcher>(); 417 if (!Value.isMatcher()) { 418 Error->addError(SourceRange(), Error->ET_ParserNotAMatcher); 419 return llvm::Optional<DynTypedMatcher>(); 420 } 421 llvm::Optional<DynTypedMatcher> Result = 422 Value.getMatcher().getSingleMatcher(); 423 if (!Result.hasValue()) { 424 Error->addError(SourceRange(), Error->ET_ParserOverloadedType) 425 << Value.getTypeAsString(); 426 } 427 return Result; 428 } 429 430 } // namespace dynamic 431 } // namespace ast_matchers 432 } // namespace clang 433