xref: /llvm-project/clang/lib/ASTMatchers/Dynamic/Parser.cpp (revision 2bbed50a45c5fc445de3fe87cbac13dbbed38c53)
1 //===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief Recursive parser implementation for the matcher expression grammar.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include "clang/ASTMatchers/Dynamic/Parser.h"
16 #include "clang/ASTMatchers/Dynamic/Registry.h"
17 #include "clang/Basic/CharInfo.h"
18 #include "llvm/ADT/Optional.h"
19 #include "llvm/Support/ManagedStatic.h"
20 #include <string>
21 #include <vector>
22 
23 namespace clang {
24 namespace ast_matchers {
25 namespace dynamic {
26 
27 /// \brief Simple structure to hold information for one token from the parser.
28 struct Parser::TokenInfo {
29   /// \brief Different possible tokens.
30   enum TokenKind {
31     TK_Eof,
32     TK_OpenParen,
33     TK_CloseParen,
34     TK_Comma,
35     TK_Period,
36     TK_Literal,
37     TK_Ident,
38     TK_InvalidChar,
39     TK_Error,
40     TK_CodeCompletion
41   };
42 
43   /// \brief Some known identifiers.
44   static const char* const ID_Bind;
45 
46   TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {}
47 
48   StringRef Text;
49   TokenKind Kind;
50   SourceRange Range;
51   VariantValue Value;
52 };
53 
54 const char* const Parser::TokenInfo::ID_Bind = "bind";
55 
56 /// \brief Simple tokenizer for the parser.
57 class Parser::CodeTokenizer {
58 public:
59   explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error)
60       : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error),
61         CodeCompletionLocation(nullptr) {
62     NextToken = getNextToken();
63   }
64 
65   CodeTokenizer(StringRef MatcherCode, Diagnostics *Error,
66                 unsigned CodeCompletionOffset)
67       : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error),
68         CodeCompletionLocation(MatcherCode.data() + CodeCompletionOffset) {
69     NextToken = getNextToken();
70   }
71 
72   /// \brief Returns but doesn't consume the next token.
73   const TokenInfo &peekNextToken() const { return NextToken; }
74 
75   /// \brief Consumes and returns the next token.
76   TokenInfo consumeNextToken() {
77     TokenInfo ThisToken = NextToken;
78     NextToken = getNextToken();
79     return ThisToken;
80   }
81 
82   TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; }
83 
84 private:
85   TokenInfo getNextToken() {
86     consumeWhitespace();
87     TokenInfo Result;
88     Result.Range.Start = currentLocation();
89 
90     if (CodeCompletionLocation && CodeCompletionLocation <= Code.data()) {
91       Result.Kind = TokenInfo::TK_CodeCompletion;
92       Result.Text = StringRef(CodeCompletionLocation, 0);
93       CodeCompletionLocation = nullptr;
94       return Result;
95     }
96 
97     if (Code.empty()) {
98       Result.Kind = TokenInfo::TK_Eof;
99       Result.Text = "";
100       return Result;
101     }
102 
103     switch (Code[0]) {
104     case ',':
105       Result.Kind = TokenInfo::TK_Comma;
106       Result.Text = Code.substr(0, 1);
107       Code = Code.drop_front();
108       break;
109     case '.':
110       Result.Kind = TokenInfo::TK_Period;
111       Result.Text = Code.substr(0, 1);
112       Code = Code.drop_front();
113       break;
114     case '(':
115       Result.Kind = TokenInfo::TK_OpenParen;
116       Result.Text = Code.substr(0, 1);
117       Code = Code.drop_front();
118       break;
119     case ')':
120       Result.Kind = TokenInfo::TK_CloseParen;
121       Result.Text = Code.substr(0, 1);
122       Code = Code.drop_front();
123       break;
124 
125     case '"':
126     case '\'':
127       // Parse a string literal.
128       consumeStringLiteral(&Result);
129       break;
130 
131     case '0': case '1': case '2': case '3': case '4':
132     case '5': case '6': case '7': case '8': case '9':
133       // Parse an unsigned and float literal.
134       consumeNumberLiteral(&Result);
135       break;
136 
137     default:
138       if (isAlphanumeric(Code[0])) {
139         // Parse an identifier
140         size_t TokenLength = 1;
141         while (1) {
142           // A code completion location in/immediately after an identifier will
143           // cause the portion of the identifier before the code completion
144           // location to become a code completion token.
145           if (CodeCompletionLocation == Code.data() + TokenLength) {
146             CodeCompletionLocation = nullptr;
147             Result.Kind = TokenInfo::TK_CodeCompletion;
148             Result.Text = Code.substr(0, TokenLength);
149             Code = Code.drop_front(TokenLength);
150             return Result;
151           }
152           if (TokenLength == Code.size() || !isAlphanumeric(Code[TokenLength]))
153             break;
154           ++TokenLength;
155         }
156         if (TokenLength == 4 && Code.startswith("true")) {
157           Result.Kind = TokenInfo::TK_Literal;
158           Result.Value = true;
159         } else if (TokenLength == 5 && Code.startswith("false")) {
160           Result.Kind = TokenInfo::TK_Literal;
161           Result.Value = false;
162         } else {
163           Result.Kind = TokenInfo::TK_Ident;
164           Result.Text = Code.substr(0, TokenLength);
165         }
166         Code = Code.drop_front(TokenLength);
167       } else {
168         Result.Kind = TokenInfo::TK_InvalidChar;
169         Result.Text = Code.substr(0, 1);
170         Code = Code.drop_front(1);
171       }
172       break;
173     }
174 
175     Result.Range.End = currentLocation();
176     return Result;
177   }
178 
179   /// \brief Consume an unsigned and float literal.
180   void consumeNumberLiteral(TokenInfo *Result) {
181     bool isFloatingLiteral = false;
182     unsigned Length = 1;
183     if (Code.size() > 1) {
184       // Consume the 'x' or 'b' radix modifier, if present.
185       switch (toLowercase(Code[1])) {
186       case 'x': case 'b': Length = 2;
187       }
188     }
189     while (Length < Code.size() && isHexDigit(Code[Length]))
190       ++Length;
191 
192     // Try to recognize a floating point literal.
193     while (Length < Code.size()) {
194       char c = Code[Length];
195       if (c == '-' || c == '+' || c == '.' || isHexDigit(c)) {
196         isFloatingLiteral = true;
197         Length++;
198       } else {
199         break;
200       }
201     }
202 
203     Result->Text = Code.substr(0, Length);
204     Code = Code.drop_front(Length);
205 
206     if (isFloatingLiteral) {
207       char *end;
208       errno = 0;
209       double doubleValue = strtod(Result->Text.str().c_str(), &end);
210       if (*end == 0 && errno == 0) {
211         Result->Kind = TokenInfo::TK_Literal;
212         Result->Value = doubleValue;
213         return;
214       }
215     } else {
216       unsigned Value;
217       if (!Result->Text.getAsInteger(0, Value)) {
218         Result->Kind = TokenInfo::TK_Literal;
219         Result->Value = Value;
220         return;
221       }
222     }
223 
224     SourceRange Range;
225     Range.Start = Result->Range.Start;
226     Range.End = currentLocation();
227     Error->addError(Range, Error->ET_ParserNumberError) << Result->Text;
228     Result->Kind = TokenInfo::TK_Error;
229   }
230 
231   /// \brief Consume a string literal.
232   ///
233   /// \c Code must be positioned at the start of the literal (the opening
234   /// quote). Consumed until it finds the same closing quote character.
235   void consumeStringLiteral(TokenInfo *Result) {
236     bool InEscape = false;
237     const char Marker = Code[0];
238     for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) {
239       if (InEscape) {
240         InEscape = false;
241         continue;
242       }
243       if (Code[Length] == '\\') {
244         InEscape = true;
245         continue;
246       }
247       if (Code[Length] == Marker) {
248         Result->Kind = TokenInfo::TK_Literal;
249         Result->Text = Code.substr(0, Length + 1);
250         Result->Value = Code.substr(1, Length - 1);
251         Code = Code.drop_front(Length + 1);
252         return;
253       }
254     }
255 
256     StringRef ErrorText = Code;
257     Code = Code.drop_front(Code.size());
258     SourceRange Range;
259     Range.Start = Result->Range.Start;
260     Range.End = currentLocation();
261     Error->addError(Range, Error->ET_ParserStringError) << ErrorText;
262     Result->Kind = TokenInfo::TK_Error;
263   }
264 
265   /// \brief Consume all leading whitespace from \c Code.
266   void consumeWhitespace() {
267     while (!Code.empty() && isWhitespace(Code[0])) {
268       if (Code[0] == '\n') {
269         ++Line;
270         StartOfLine = Code.drop_front();
271       }
272       Code = Code.drop_front();
273     }
274   }
275 
276   SourceLocation currentLocation() {
277     SourceLocation Location;
278     Location.Line = Line;
279     Location.Column = Code.data() - StartOfLine.data() + 1;
280     return Location;
281   }
282 
283   StringRef Code;
284   StringRef StartOfLine;
285   unsigned Line;
286   Diagnostics *Error;
287   TokenInfo NextToken;
288   const char *CodeCompletionLocation;
289 };
290 
291 Parser::Sema::~Sema() {}
292 
293 std::vector<ArgKind> Parser::Sema::getAcceptedCompletionTypes(
294     llvm::ArrayRef<std::pair<MatcherCtor, unsigned>> Context) {
295   return std::vector<ArgKind>();
296 }
297 
298 std::vector<MatcherCompletion>
299 Parser::Sema::getMatcherCompletions(llvm::ArrayRef<ArgKind> AcceptedTypes) {
300   return std::vector<MatcherCompletion>();
301 }
302 
303 struct Parser::ScopedContextEntry {
304   Parser *P;
305 
306   ScopedContextEntry(Parser *P, MatcherCtor C) : P(P) {
307     P->ContextStack.push_back(std::make_pair(C, 0u));
308   }
309 
310   ~ScopedContextEntry() {
311     P->ContextStack.pop_back();
312   }
313 
314   void nextArg() {
315     ++P->ContextStack.back().second;
316   }
317 };
318 
319 /// \brief Parse expressions that start with an identifier.
320 ///
321 /// This function can parse named values and matchers.
322 /// In case of failure it will try to determine the user's intent to give
323 /// an appropriate error message.
324 bool Parser::parseIdentifierPrefixImpl(VariantValue *Value) {
325   const TokenInfo NameToken = Tokenizer->consumeNextToken();
326 
327   if (Tokenizer->nextTokenKind() != TokenInfo::TK_OpenParen) {
328     // Parse as a named value.
329     if (const VariantValue NamedValue =
330             NamedValues ? NamedValues->lookup(NameToken.Text)
331                         : VariantValue()) {
332       *Value = NamedValue;
333       return true;
334     }
335     // If the syntax is correct and the name is not a matcher either, report
336     // unknown named value.
337     if ((Tokenizer->nextTokenKind() == TokenInfo::TK_Comma ||
338          Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen ||
339          Tokenizer->nextTokenKind() == TokenInfo::TK_Eof) &&
340         !S->lookupMatcherCtor(NameToken.Text)) {
341       Error->addError(NameToken.Range, Error->ET_RegistryValueNotFound)
342           << NameToken.Text;
343       return false;
344     }
345     // Otherwise, fallback to the matcher parser.
346   }
347 
348   // Parse as a matcher expression.
349   return parseMatcherExpressionImpl(NameToken, Value);
350 }
351 
352 /// \brief Parse and validate a matcher expression.
353 /// \return \c true on success, in which case \c Value has the matcher parsed.
354 ///   If the input is malformed, or some argument has an error, it
355 ///   returns \c false.
356 bool Parser::parseMatcherExpressionImpl(const TokenInfo &NameToken,
357                                         VariantValue *Value) {
358   assert(NameToken.Kind == TokenInfo::TK_Ident);
359   const TokenInfo OpenToken = Tokenizer->consumeNextToken();
360   if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
361     Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen)
362         << OpenToken.Text;
363     return false;
364   }
365 
366   llvm::Optional<MatcherCtor> Ctor = S->lookupMatcherCtor(NameToken.Text);
367 
368   if (!Ctor) {
369     Error->addError(NameToken.Range, Error->ET_RegistryMatcherNotFound)
370         << NameToken.Text;
371     // Do not return here. We need to continue to give completion suggestions.
372   }
373 
374   std::vector<ParserValue> Args;
375   TokenInfo EndToken;
376 
377   {
378     ScopedContextEntry SCE(this, Ctor ? *Ctor : nullptr);
379 
380     while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) {
381       if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) {
382         // End of args.
383         EndToken = Tokenizer->consumeNextToken();
384         break;
385       }
386       if (Args.size() > 0) {
387         // We must find a , token to continue.
388         const TokenInfo CommaToken = Tokenizer->consumeNextToken();
389         if (CommaToken.Kind != TokenInfo::TK_Comma) {
390           Error->addError(CommaToken.Range, Error->ET_ParserNoComma)
391               << CommaToken.Text;
392           return false;
393         }
394       }
395 
396       Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error,
397                                NameToken.Text, NameToken.Range,
398                                Args.size() + 1);
399       ParserValue ArgValue;
400       ArgValue.Text = Tokenizer->peekNextToken().Text;
401       ArgValue.Range = Tokenizer->peekNextToken().Range;
402       if (!parseExpressionImpl(&ArgValue.Value)) {
403         return false;
404       }
405 
406       Args.push_back(ArgValue);
407       SCE.nextArg();
408     }
409   }
410 
411   if (EndToken.Kind == TokenInfo::TK_Eof) {
412     Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen);
413     return false;
414   }
415 
416   std::string BindID;
417   if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) {
418     // Parse .bind("foo")
419     Tokenizer->consumeNextToken();  // consume the period.
420     const TokenInfo BindToken = Tokenizer->consumeNextToken();
421     if (BindToken.Kind == TokenInfo::TK_CodeCompletion) {
422       addCompletion(BindToken, MatcherCompletion("bind(\"", "bind", 1));
423       return false;
424     }
425 
426     const TokenInfo OpenToken = Tokenizer->consumeNextToken();
427     const TokenInfo IDToken = Tokenizer->consumeNextToken();
428     const TokenInfo CloseToken = Tokenizer->consumeNextToken();
429 
430     // TODO: We could use different error codes for each/some to be more
431     //       explicit about the syntax error.
432     if (BindToken.Kind != TokenInfo::TK_Ident ||
433         BindToken.Text != TokenInfo::ID_Bind) {
434       Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr);
435       return false;
436     }
437     if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
438       Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr);
439       return false;
440     }
441     if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) {
442       Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr);
443       return false;
444     }
445     if (CloseToken.Kind != TokenInfo::TK_CloseParen) {
446       Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr);
447       return false;
448     }
449     BindID = IDToken.Value.getString();
450   }
451 
452   if (!Ctor)
453     return false;
454 
455   // Merge the start and end infos.
456   Diagnostics::Context Ctx(Diagnostics::Context::ConstructMatcher, Error,
457                            NameToken.Text, NameToken.Range);
458   SourceRange MatcherRange = NameToken.Range;
459   MatcherRange.End = EndToken.Range.End;
460   VariantMatcher Result = S->actOnMatcherExpression(
461       *Ctor, MatcherRange, BindID, Args, Error);
462   if (Result.isNull()) return false;
463 
464   *Value = Result;
465   return true;
466 }
467 
468 // If the prefix of this completion matches the completion token, add it to
469 // Completions minus the prefix.
470 void Parser::addCompletion(const TokenInfo &CompToken,
471                            const MatcherCompletion& Completion) {
472   if (StringRef(Completion.TypedText).startswith(CompToken.Text) &&
473       Completion.Specificity > 0) {
474     Completions.emplace_back(Completion.TypedText.substr(CompToken.Text.size()),
475                              Completion.MatcherDecl, Completion.Specificity);
476   }
477 }
478 
479 std::vector<MatcherCompletion> Parser::getNamedValueCompletions(
480     ArrayRef<ArgKind> AcceptedTypes) {
481   if (!NamedValues) return std::vector<MatcherCompletion>();
482   std::vector<MatcherCompletion> Result;
483   for (const auto &Entry : *NamedValues) {
484     unsigned Specificity;
485     if (Entry.getValue().isConvertibleTo(AcceptedTypes, &Specificity)) {
486       std::string Decl =
487           (Entry.getValue().getTypeAsString() + " " + Entry.getKey()).str();
488       Result.emplace_back(Entry.getKey(), Decl, Specificity);
489     }
490   }
491   return Result;
492 }
493 
494 void Parser::addExpressionCompletions() {
495   const TokenInfo CompToken = Tokenizer->consumeNextToken();
496   assert(CompToken.Kind == TokenInfo::TK_CodeCompletion);
497 
498   // We cannot complete code if there is an invalid element on the context
499   // stack.
500   for (ContextStackTy::iterator I = ContextStack.begin(),
501                                 E = ContextStack.end();
502        I != E; ++I) {
503     if (!I->first)
504       return;
505   }
506 
507   auto AcceptedTypes = S->getAcceptedCompletionTypes(ContextStack);
508   for (const auto &Completion : S->getMatcherCompletions(AcceptedTypes)) {
509     addCompletion(CompToken, Completion);
510   }
511 
512   for (const auto &Completion : getNamedValueCompletions(AcceptedTypes)) {
513     addCompletion(CompToken, Completion);
514   }
515 }
516 
517 /// \brief Parse an <Expresssion>
518 bool Parser::parseExpressionImpl(VariantValue *Value) {
519   switch (Tokenizer->nextTokenKind()) {
520   case TokenInfo::TK_Literal:
521     *Value = Tokenizer->consumeNextToken().Value;
522     return true;
523 
524   case TokenInfo::TK_Ident:
525     return parseIdentifierPrefixImpl(Value);
526 
527   case TokenInfo::TK_CodeCompletion:
528     addExpressionCompletions();
529     return false;
530 
531   case TokenInfo::TK_Eof:
532     Error->addError(Tokenizer->consumeNextToken().Range,
533                     Error->ET_ParserNoCode);
534     return false;
535 
536   case TokenInfo::TK_Error:
537     // This error was already reported by the tokenizer.
538     return false;
539 
540   case TokenInfo::TK_OpenParen:
541   case TokenInfo::TK_CloseParen:
542   case TokenInfo::TK_Comma:
543   case TokenInfo::TK_Period:
544   case TokenInfo::TK_InvalidChar:
545     const TokenInfo Token = Tokenizer->consumeNextToken();
546     Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text;
547     return false;
548   }
549 
550   llvm_unreachable("Unknown token kind.");
551 }
552 
553 static llvm::ManagedStatic<Parser::RegistrySema> DefaultRegistrySema;
554 
555 Parser::Parser(CodeTokenizer *Tokenizer, Sema *S,
556                const NamedValueMap *NamedValues, Diagnostics *Error)
557     : Tokenizer(Tokenizer), S(S ? S : &*DefaultRegistrySema),
558       NamedValues(NamedValues), Error(Error) {}
559 
560 Parser::RegistrySema::~RegistrySema() {}
561 
562 llvm::Optional<MatcherCtor>
563 Parser::RegistrySema::lookupMatcherCtor(StringRef MatcherName) {
564   return Registry::lookupMatcherCtor(MatcherName);
565 }
566 
567 VariantMatcher Parser::RegistrySema::actOnMatcherExpression(
568     MatcherCtor Ctor, SourceRange NameRange, StringRef BindID,
569     ArrayRef<ParserValue> Args, Diagnostics *Error) {
570   if (BindID.empty()) {
571     return Registry::constructMatcher(Ctor, NameRange, Args, Error);
572   } else {
573     return Registry::constructBoundMatcher(Ctor, NameRange, BindID, Args,
574                                            Error);
575   }
576 }
577 
578 std::vector<ArgKind> Parser::RegistrySema::getAcceptedCompletionTypes(
579     ArrayRef<std::pair<MatcherCtor, unsigned>> Context) {
580   return Registry::getAcceptedCompletionTypes(Context);
581 }
582 
583 std::vector<MatcherCompletion> Parser::RegistrySema::getMatcherCompletions(
584     ArrayRef<ArgKind> AcceptedTypes) {
585   return Registry::getMatcherCompletions(AcceptedTypes);
586 }
587 
588 bool Parser::parseExpression(StringRef Code, Sema *S,
589                              const NamedValueMap *NamedValues,
590                              VariantValue *Value, Diagnostics *Error) {
591   CodeTokenizer Tokenizer(Code, Error);
592   if (!Parser(&Tokenizer, S, NamedValues, Error).parseExpressionImpl(Value))
593     return false;
594   if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) {
595     Error->addError(Tokenizer.peekNextToken().Range,
596                     Error->ET_ParserTrailingCode);
597     return false;
598   }
599   return true;
600 }
601 
602 std::vector<MatcherCompletion>
603 Parser::completeExpression(StringRef Code, unsigned CompletionOffset, Sema *S,
604                            const NamedValueMap *NamedValues) {
605   Diagnostics Error;
606   CodeTokenizer Tokenizer(Code, &Error, CompletionOffset);
607   Parser P(&Tokenizer, S, NamedValues, &Error);
608   VariantValue Dummy;
609   P.parseExpressionImpl(&Dummy);
610 
611   // Sort by specificity, then by name.
612   std::sort(P.Completions.begin(), P.Completions.end(),
613             [](const MatcherCompletion &A, const MatcherCompletion &B) {
614     if (A.Specificity != B.Specificity)
615       return A.Specificity > B.Specificity;
616     return A.TypedText < B.TypedText;
617   });
618 
619   return P.Completions;
620 }
621 
622 llvm::Optional<DynTypedMatcher>
623 Parser::parseMatcherExpression(StringRef Code, Sema *S,
624                                const NamedValueMap *NamedValues,
625                                Diagnostics *Error) {
626   VariantValue Value;
627   if (!parseExpression(Code, S, NamedValues, &Value, Error))
628     return llvm::Optional<DynTypedMatcher>();
629   if (!Value.isMatcher()) {
630     Error->addError(SourceRange(), Error->ET_ParserNotAMatcher);
631     return llvm::Optional<DynTypedMatcher>();
632   }
633   llvm::Optional<DynTypedMatcher> Result =
634       Value.getMatcher().getSingleMatcher();
635   if (!Result.hasValue()) {
636     Error->addError(SourceRange(), Error->ET_ParserOverloadedType)
637         << Value.getTypeAsString();
638   }
639   return Result;
640 }
641 
642 }  // namespace dynamic
643 }  // namespace ast_matchers
644 }  // namespace clang
645