1 //===- Parser.cpp - Matcher expression parser -----------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 /// 10 /// \file 11 /// Recursive parser implementation for the matcher expression grammar. 12 /// 13 //===----------------------------------------------------------------------===// 14 15 #include "clang/ASTMatchers/Dynamic/Parser.h" 16 #include "clang/ASTMatchers/ASTMatchersInternal.h" 17 #include "clang/ASTMatchers/Dynamic/Diagnostics.h" 18 #include "clang/ASTMatchers/Dynamic/Registry.h" 19 #include "clang/Basic/CharInfo.h" 20 #include "llvm/ADT/Optional.h" 21 #include "llvm/ADT/StringRef.h" 22 #include "llvm/Support/ErrorHandling.h" 23 #include "llvm/Support/ManagedStatic.h" 24 #include <algorithm> 25 #include <cassert> 26 #include <cerrno> 27 #include <cstddef> 28 #include <cstdlib> 29 #include <string> 30 #include <utility> 31 #include <vector> 32 33 namespace clang { 34 namespace ast_matchers { 35 namespace dynamic { 36 37 /// Simple structure to hold information for one token from the parser. 38 struct Parser::TokenInfo { 39 /// Different possible tokens. 40 enum TokenKind { 41 TK_Eof, 42 TK_OpenParen, 43 TK_CloseParen, 44 TK_Comma, 45 TK_Period, 46 TK_Literal, 47 TK_Ident, 48 TK_InvalidChar, 49 TK_Error, 50 TK_CodeCompletion 51 }; 52 53 /// Some known identifiers. 54 static const char* const ID_Bind; 55 56 TokenInfo() = default; 57 58 StringRef Text; 59 TokenKind Kind = TK_Eof; 60 SourceRange Range; 61 VariantValue Value; 62 }; 63 64 const char* const Parser::TokenInfo::ID_Bind = "bind"; 65 66 /// Simple tokenizer for the parser. 67 class Parser::CodeTokenizer { 68 public: 69 explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error) 70 : Code(MatcherCode), StartOfLine(MatcherCode), Error(Error) { 71 NextToken = getNextToken(); 72 } 73 74 CodeTokenizer(StringRef MatcherCode, Diagnostics *Error, 75 unsigned CodeCompletionOffset) 76 : Code(MatcherCode), StartOfLine(MatcherCode), Error(Error), 77 CodeCompletionLocation(MatcherCode.data() + CodeCompletionOffset) { 78 NextToken = getNextToken(); 79 } 80 81 /// Returns but doesn't consume the next token. 82 const TokenInfo &peekNextToken() const { return NextToken; } 83 84 /// Consumes and returns the next token. 85 TokenInfo consumeNextToken() { 86 TokenInfo ThisToken = NextToken; 87 NextToken = getNextToken(); 88 return ThisToken; 89 } 90 91 TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; } 92 93 private: 94 TokenInfo getNextToken() { 95 consumeWhitespace(); 96 TokenInfo Result; 97 Result.Range.Start = currentLocation(); 98 99 if (CodeCompletionLocation && CodeCompletionLocation <= Code.data()) { 100 Result.Kind = TokenInfo::TK_CodeCompletion; 101 Result.Text = StringRef(CodeCompletionLocation, 0); 102 CodeCompletionLocation = nullptr; 103 return Result; 104 } 105 106 if (Code.empty()) { 107 Result.Kind = TokenInfo::TK_Eof; 108 Result.Text = ""; 109 return Result; 110 } 111 112 switch (Code[0]) { 113 case ',': 114 Result.Kind = TokenInfo::TK_Comma; 115 Result.Text = Code.substr(0, 1); 116 Code = Code.drop_front(); 117 break; 118 case '.': 119 Result.Kind = TokenInfo::TK_Period; 120 Result.Text = Code.substr(0, 1); 121 Code = Code.drop_front(); 122 break; 123 case '(': 124 Result.Kind = TokenInfo::TK_OpenParen; 125 Result.Text = Code.substr(0, 1); 126 Code = Code.drop_front(); 127 break; 128 case ')': 129 Result.Kind = TokenInfo::TK_CloseParen; 130 Result.Text = Code.substr(0, 1); 131 Code = Code.drop_front(); 132 break; 133 134 case '"': 135 case '\'': 136 // Parse a string literal. 137 consumeStringLiteral(&Result); 138 break; 139 140 case '0': case '1': case '2': case '3': case '4': 141 case '5': case '6': case '7': case '8': case '9': 142 // Parse an unsigned and float literal. 143 consumeNumberLiteral(&Result); 144 break; 145 146 default: 147 if (isAlphanumeric(Code[0])) { 148 // Parse an identifier 149 size_t TokenLength = 1; 150 while (true) { 151 // A code completion location in/immediately after an identifier will 152 // cause the portion of the identifier before the code completion 153 // location to become a code completion token. 154 if (CodeCompletionLocation == Code.data() + TokenLength) { 155 CodeCompletionLocation = nullptr; 156 Result.Kind = TokenInfo::TK_CodeCompletion; 157 Result.Text = Code.substr(0, TokenLength); 158 Code = Code.drop_front(TokenLength); 159 return Result; 160 } 161 if (TokenLength == Code.size() || !isAlphanumeric(Code[TokenLength])) 162 break; 163 ++TokenLength; 164 } 165 if (TokenLength == 4 && Code.startswith("true")) { 166 Result.Kind = TokenInfo::TK_Literal; 167 Result.Value = true; 168 } else if (TokenLength == 5 && Code.startswith("false")) { 169 Result.Kind = TokenInfo::TK_Literal; 170 Result.Value = false; 171 } else { 172 Result.Kind = TokenInfo::TK_Ident; 173 Result.Text = Code.substr(0, TokenLength); 174 } 175 Code = Code.drop_front(TokenLength); 176 } else { 177 Result.Kind = TokenInfo::TK_InvalidChar; 178 Result.Text = Code.substr(0, 1); 179 Code = Code.drop_front(1); 180 } 181 break; 182 } 183 184 Result.Range.End = currentLocation(); 185 return Result; 186 } 187 188 /// Consume an unsigned and float literal. 189 void consumeNumberLiteral(TokenInfo *Result) { 190 bool isFloatingLiteral = false; 191 unsigned Length = 1; 192 if (Code.size() > 1) { 193 // Consume the 'x' or 'b' radix modifier, if present. 194 switch (toLowercase(Code[1])) { 195 case 'x': case 'b': Length = 2; 196 } 197 } 198 while (Length < Code.size() && isHexDigit(Code[Length])) 199 ++Length; 200 201 // Try to recognize a floating point literal. 202 while (Length < Code.size()) { 203 char c = Code[Length]; 204 if (c == '-' || c == '+' || c == '.' || isHexDigit(c)) { 205 isFloatingLiteral = true; 206 Length++; 207 } else { 208 break; 209 } 210 } 211 212 Result->Text = Code.substr(0, Length); 213 Code = Code.drop_front(Length); 214 215 if (isFloatingLiteral) { 216 char *end; 217 errno = 0; 218 std::string Text = Result->Text.str(); 219 double doubleValue = strtod(Text.c_str(), &end); 220 if (*end == 0 && errno == 0) { 221 Result->Kind = TokenInfo::TK_Literal; 222 Result->Value = doubleValue; 223 return; 224 } 225 } else { 226 unsigned Value; 227 if (!Result->Text.getAsInteger(0, Value)) { 228 Result->Kind = TokenInfo::TK_Literal; 229 Result->Value = Value; 230 return; 231 } 232 } 233 234 SourceRange Range; 235 Range.Start = Result->Range.Start; 236 Range.End = currentLocation(); 237 Error->addError(Range, Error->ET_ParserNumberError) << Result->Text; 238 Result->Kind = TokenInfo::TK_Error; 239 } 240 241 /// Consume a string literal. 242 /// 243 /// \c Code must be positioned at the start of the literal (the opening 244 /// quote). Consumed until it finds the same closing quote character. 245 void consumeStringLiteral(TokenInfo *Result) { 246 bool InEscape = false; 247 const char Marker = Code[0]; 248 for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) { 249 if (InEscape) { 250 InEscape = false; 251 continue; 252 } 253 if (Code[Length] == '\\') { 254 InEscape = true; 255 continue; 256 } 257 if (Code[Length] == Marker) { 258 Result->Kind = TokenInfo::TK_Literal; 259 Result->Text = Code.substr(0, Length + 1); 260 Result->Value = Code.substr(1, Length - 1); 261 Code = Code.drop_front(Length + 1); 262 return; 263 } 264 } 265 266 StringRef ErrorText = Code; 267 Code = Code.drop_front(Code.size()); 268 SourceRange Range; 269 Range.Start = Result->Range.Start; 270 Range.End = currentLocation(); 271 Error->addError(Range, Error->ET_ParserStringError) << ErrorText; 272 Result->Kind = TokenInfo::TK_Error; 273 } 274 275 /// Consume all leading whitespace from \c Code. 276 void consumeWhitespace() { 277 while (!Code.empty() && isWhitespace(Code[0])) { 278 if (Code[0] == '\n') { 279 ++Line; 280 StartOfLine = Code.drop_front(); 281 } 282 Code = Code.drop_front(); 283 } 284 } 285 286 SourceLocation currentLocation() { 287 SourceLocation Location; 288 Location.Line = Line; 289 Location.Column = Code.data() - StartOfLine.data() + 1; 290 return Location; 291 } 292 293 StringRef Code; 294 StringRef StartOfLine; 295 unsigned Line = 1; 296 Diagnostics *Error; 297 TokenInfo NextToken; 298 const char *CodeCompletionLocation = nullptr; 299 }; 300 301 Parser::Sema::~Sema() = default; 302 303 std::vector<ArgKind> Parser::Sema::getAcceptedCompletionTypes( 304 llvm::ArrayRef<std::pair<MatcherCtor, unsigned>> Context) { 305 return {}; 306 } 307 308 std::vector<MatcherCompletion> 309 Parser::Sema::getMatcherCompletions(llvm::ArrayRef<ArgKind> AcceptedTypes) { 310 return {}; 311 } 312 313 struct Parser::ScopedContextEntry { 314 Parser *P; 315 316 ScopedContextEntry(Parser *P, MatcherCtor C) : P(P) { 317 P->ContextStack.push_back(std::make_pair(C, 0u)); 318 } 319 320 ~ScopedContextEntry() { 321 P->ContextStack.pop_back(); 322 } 323 324 void nextArg() { 325 ++P->ContextStack.back().second; 326 } 327 }; 328 329 /// Parse expressions that start with an identifier. 330 /// 331 /// This function can parse named values and matchers. 332 /// In case of failure it will try to determine the user's intent to give 333 /// an appropriate error message. 334 bool Parser::parseIdentifierPrefixImpl(VariantValue *Value) { 335 const TokenInfo NameToken = Tokenizer->consumeNextToken(); 336 337 if (Tokenizer->nextTokenKind() != TokenInfo::TK_OpenParen) { 338 // Parse as a named value. 339 if (const VariantValue NamedValue = 340 NamedValues ? NamedValues->lookup(NameToken.Text) 341 : VariantValue()) { 342 *Value = NamedValue; 343 return true; 344 } 345 // If the syntax is correct and the name is not a matcher either, report 346 // unknown named value. 347 if ((Tokenizer->nextTokenKind() == TokenInfo::TK_Comma || 348 Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen || 349 Tokenizer->nextTokenKind() == TokenInfo::TK_Eof) && 350 !S->lookupMatcherCtor(NameToken.Text)) { 351 Error->addError(NameToken.Range, Error->ET_RegistryValueNotFound) 352 << NameToken.Text; 353 return false; 354 } 355 // Otherwise, fallback to the matcher parser. 356 } 357 358 // Parse as a matcher expression. 359 return parseMatcherExpressionImpl(NameToken, Value); 360 } 361 362 bool Parser::parseBindID(std::string &BindID) { 363 // Parse .bind("foo") 364 assert(Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period); 365 Tokenizer->consumeNextToken(); // consume the period. 366 const TokenInfo BindToken = Tokenizer->consumeNextToken(); 367 if (BindToken.Kind == TokenInfo::TK_CodeCompletion) { 368 addCompletion(BindToken, MatcherCompletion("bind(\"", "bind", 1)); 369 return false; 370 } 371 372 const TokenInfo OpenToken = Tokenizer->consumeNextToken(); 373 const TokenInfo IDToken = Tokenizer->consumeNextToken(); 374 const TokenInfo CloseToken = Tokenizer->consumeNextToken(); 375 376 // TODO: We could use different error codes for each/some to be more 377 // explicit about the syntax error. 378 if (BindToken.Kind != TokenInfo::TK_Ident || 379 BindToken.Text != TokenInfo::ID_Bind) { 380 Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr); 381 return false; 382 } 383 if (OpenToken.Kind != TokenInfo::TK_OpenParen) { 384 Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr); 385 return false; 386 } 387 if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) { 388 Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr); 389 return false; 390 } 391 if (CloseToken.Kind != TokenInfo::TK_CloseParen) { 392 Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr); 393 return false; 394 } 395 BindID = IDToken.Value.getString(); 396 return true; 397 } 398 399 /// Parse and validate a matcher expression. 400 /// \return \c true on success, in which case \c Value has the matcher parsed. 401 /// If the input is malformed, or some argument has an error, it 402 /// returns \c false. 403 bool Parser::parseMatcherExpressionImpl(const TokenInfo &NameToken, 404 VariantValue *Value) { 405 assert(NameToken.Kind == TokenInfo::TK_Ident); 406 const TokenInfo OpenToken = Tokenizer->consumeNextToken(); 407 if (OpenToken.Kind != TokenInfo::TK_OpenParen) { 408 Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen) 409 << OpenToken.Text; 410 return false; 411 } 412 413 llvm::Optional<MatcherCtor> Ctor = S->lookupMatcherCtor(NameToken.Text); 414 415 if (!Ctor) { 416 Error->addError(NameToken.Range, Error->ET_RegistryMatcherNotFound) 417 << NameToken.Text; 418 // Do not return here. We need to continue to give completion suggestions. 419 } 420 421 std::vector<ParserValue> Args; 422 TokenInfo EndToken; 423 424 { 425 ScopedContextEntry SCE(this, Ctor ? *Ctor : nullptr); 426 427 while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) { 428 if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) { 429 // End of args. 430 EndToken = Tokenizer->consumeNextToken(); 431 break; 432 } 433 if (!Args.empty()) { 434 // We must find a , token to continue. 435 const TokenInfo CommaToken = Tokenizer->consumeNextToken(); 436 if (CommaToken.Kind != TokenInfo::TK_Comma) { 437 Error->addError(CommaToken.Range, Error->ET_ParserNoComma) 438 << CommaToken.Text; 439 return false; 440 } 441 } 442 443 Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error, 444 NameToken.Text, NameToken.Range, 445 Args.size() + 1); 446 ParserValue ArgValue; 447 ArgValue.Text = Tokenizer->peekNextToken().Text; 448 ArgValue.Range = Tokenizer->peekNextToken().Range; 449 if (!parseExpressionImpl(&ArgValue.Value)) { 450 return false; 451 } 452 453 Args.push_back(ArgValue); 454 SCE.nextArg(); 455 } 456 } 457 458 if (EndToken.Kind == TokenInfo::TK_Eof) { 459 Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen); 460 return false; 461 } 462 463 std::string BindID; 464 if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) { 465 if (!parseBindID(BindID)) 466 return false; 467 } 468 469 if (!Ctor) 470 return false; 471 472 // Merge the start and end infos. 473 Diagnostics::Context Ctx(Diagnostics::Context::ConstructMatcher, Error, 474 NameToken.Text, NameToken.Range); 475 SourceRange MatcherRange = NameToken.Range; 476 MatcherRange.End = EndToken.Range.End; 477 VariantMatcher Result = S->actOnMatcherExpression( 478 *Ctor, MatcherRange, BindID, Args, Error); 479 if (Result.isNull()) return false; 480 481 *Value = Result; 482 return true; 483 } 484 485 // If the prefix of this completion matches the completion token, add it to 486 // Completions minus the prefix. 487 void Parser::addCompletion(const TokenInfo &CompToken, 488 const MatcherCompletion& Completion) { 489 if (StringRef(Completion.TypedText).startswith(CompToken.Text) && 490 Completion.Specificity > 0) { 491 Completions.emplace_back(Completion.TypedText.substr(CompToken.Text.size()), 492 Completion.MatcherDecl, Completion.Specificity); 493 } 494 } 495 496 std::vector<MatcherCompletion> Parser::getNamedValueCompletions( 497 ArrayRef<ArgKind> AcceptedTypes) { 498 if (!NamedValues) return std::vector<MatcherCompletion>(); 499 std::vector<MatcherCompletion> Result; 500 for (const auto &Entry : *NamedValues) { 501 unsigned Specificity; 502 if (Entry.getValue().isConvertibleTo(AcceptedTypes, &Specificity)) { 503 std::string Decl = 504 (Entry.getValue().getTypeAsString() + " " + Entry.getKey()).str(); 505 Result.emplace_back(Entry.getKey(), Decl, Specificity); 506 } 507 } 508 return Result; 509 } 510 511 void Parser::addExpressionCompletions() { 512 const TokenInfo CompToken = Tokenizer->consumeNextToken(); 513 assert(CompToken.Kind == TokenInfo::TK_CodeCompletion); 514 515 // We cannot complete code if there is an invalid element on the context 516 // stack. 517 for (ContextStackTy::iterator I = ContextStack.begin(), 518 E = ContextStack.end(); 519 I != E; ++I) { 520 if (!I->first) 521 return; 522 } 523 524 auto AcceptedTypes = S->getAcceptedCompletionTypes(ContextStack); 525 for (const auto &Completion : S->getMatcherCompletions(AcceptedTypes)) { 526 addCompletion(CompToken, Completion); 527 } 528 529 for (const auto &Completion : getNamedValueCompletions(AcceptedTypes)) { 530 addCompletion(CompToken, Completion); 531 } 532 } 533 534 /// Parse an <Expression> 535 bool Parser::parseExpressionImpl(VariantValue *Value) { 536 switch (Tokenizer->nextTokenKind()) { 537 case TokenInfo::TK_Literal: 538 *Value = Tokenizer->consumeNextToken().Value; 539 return true; 540 541 case TokenInfo::TK_Ident: 542 return parseIdentifierPrefixImpl(Value); 543 544 case TokenInfo::TK_CodeCompletion: 545 addExpressionCompletions(); 546 return false; 547 548 case TokenInfo::TK_Eof: 549 Error->addError(Tokenizer->consumeNextToken().Range, 550 Error->ET_ParserNoCode); 551 return false; 552 553 case TokenInfo::TK_Error: 554 // This error was already reported by the tokenizer. 555 return false; 556 557 case TokenInfo::TK_OpenParen: 558 case TokenInfo::TK_CloseParen: 559 case TokenInfo::TK_Comma: 560 case TokenInfo::TK_Period: 561 case TokenInfo::TK_InvalidChar: 562 const TokenInfo Token = Tokenizer->consumeNextToken(); 563 Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text; 564 return false; 565 } 566 567 llvm_unreachable("Unknown token kind."); 568 } 569 570 static llvm::ManagedStatic<Parser::RegistrySema> DefaultRegistrySema; 571 572 Parser::Parser(CodeTokenizer *Tokenizer, Sema *S, 573 const NamedValueMap *NamedValues, Diagnostics *Error) 574 : Tokenizer(Tokenizer), S(S ? S : &*DefaultRegistrySema), 575 NamedValues(NamedValues), Error(Error) {} 576 577 Parser::RegistrySema::~RegistrySema() = default; 578 579 llvm::Optional<MatcherCtor> 580 Parser::RegistrySema::lookupMatcherCtor(StringRef MatcherName) { 581 return Registry::lookupMatcherCtor(MatcherName); 582 } 583 584 VariantMatcher Parser::RegistrySema::actOnMatcherExpression( 585 MatcherCtor Ctor, SourceRange NameRange, StringRef BindID, 586 ArrayRef<ParserValue> Args, Diagnostics *Error) { 587 if (BindID.empty()) { 588 return Registry::constructMatcher(Ctor, NameRange, Args, Error); 589 } else { 590 return Registry::constructBoundMatcher(Ctor, NameRange, BindID, Args, 591 Error); 592 } 593 } 594 595 std::vector<ArgKind> Parser::RegistrySema::getAcceptedCompletionTypes( 596 ArrayRef<std::pair<MatcherCtor, unsigned>> Context) { 597 return Registry::getAcceptedCompletionTypes(Context); 598 } 599 600 std::vector<MatcherCompletion> Parser::RegistrySema::getMatcherCompletions( 601 ArrayRef<ArgKind> AcceptedTypes) { 602 return Registry::getMatcherCompletions(AcceptedTypes); 603 } 604 605 bool Parser::parseExpression(StringRef Code, Sema *S, 606 const NamedValueMap *NamedValues, 607 VariantValue *Value, Diagnostics *Error) { 608 CodeTokenizer Tokenizer(Code, Error); 609 if (!Parser(&Tokenizer, S, NamedValues, Error).parseExpressionImpl(Value)) 610 return false; 611 if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) { 612 Error->addError(Tokenizer.peekNextToken().Range, 613 Error->ET_ParserTrailingCode); 614 return false; 615 } 616 return true; 617 } 618 619 std::vector<MatcherCompletion> 620 Parser::completeExpression(StringRef Code, unsigned CompletionOffset, Sema *S, 621 const NamedValueMap *NamedValues) { 622 Diagnostics Error; 623 CodeTokenizer Tokenizer(Code, &Error, CompletionOffset); 624 Parser P(&Tokenizer, S, NamedValues, &Error); 625 VariantValue Dummy; 626 P.parseExpressionImpl(&Dummy); 627 628 // Sort by specificity, then by name. 629 llvm::sort(P.Completions.begin(), P.Completions.end(), 630 [](const MatcherCompletion &A, const MatcherCompletion &B) { 631 if (A.Specificity != B.Specificity) 632 return A.Specificity > B.Specificity; 633 return A.TypedText < B.TypedText; 634 }); 635 636 return P.Completions; 637 } 638 639 llvm::Optional<DynTypedMatcher> 640 Parser::parseMatcherExpression(StringRef Code, Sema *S, 641 const NamedValueMap *NamedValues, 642 Diagnostics *Error) { 643 VariantValue Value; 644 if (!parseExpression(Code, S, NamedValues, &Value, Error)) 645 return llvm::Optional<DynTypedMatcher>(); 646 if (!Value.isMatcher()) { 647 Error->addError(SourceRange(), Error->ET_ParserNotAMatcher); 648 return llvm::Optional<DynTypedMatcher>(); 649 } 650 llvm::Optional<DynTypedMatcher> Result = 651 Value.getMatcher().getSingleMatcher(); 652 if (!Result.hasValue()) { 653 Error->addError(SourceRange(), Error->ET_ParserOverloadedType) 654 << Value.getTypeAsString(); 655 } 656 return Result; 657 } 658 659 } // namespace dynamic 660 } // namespace ast_matchers 661 } // namespace clang 662