xref: /llvm-project/clang/lib/ASTMatchers/Dynamic/Parser.cpp (revision 31edb51a4f274e97a9c54ae830b1896c690b8cf7)
1 //===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief Recursive parser implementation for the matcher expression grammar.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include <string>
16 #include <vector>
17 
18 #include "clang/ASTMatchers/Dynamic/Parser.h"
19 #include "clang/ASTMatchers/Dynamic/Registry.h"
20 #include "clang/Basic/CharInfo.h"
21 #include "llvm/ADT/Twine.h"
22 
23 namespace clang {
24 namespace ast_matchers {
25 namespace dynamic {
26 
27 /// \brief Simple structure to hold information for one token from the parser.
28 struct Parser::TokenInfo {
29   /// \brief Different possible tokens.
30   enum TokenKind {
31     TK_Eof = 0,
32     TK_OpenParen = 1,
33     TK_CloseParen = 2,
34     TK_Comma = 3,
35     TK_Period = 4,
36     TK_Literal = 5,
37     TK_Ident = 6,
38     TK_InvalidChar = 7,
39     TK_Error = 8
40   };
41 
42   /// \brief Some known identifiers.
43   static const char* const ID_Bind;
44 
45   TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {}
46 
47   StringRef Text;
48   TokenKind Kind;
49   SourceRange Range;
50   VariantValue Value;
51 };
52 
53 const char* const Parser::TokenInfo::ID_Bind = "bind";
54 
55 /// \brief Simple tokenizer for the parser.
56 class Parser::CodeTokenizer {
57 public:
58   explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error)
59       : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error) {
60     NextToken = getNextToken();
61   }
62 
63   /// \brief Returns but doesn't consume the next token.
64   const TokenInfo &peekNextToken() const { return NextToken; }
65 
66   /// \brief Consumes and returns the next token.
67   TokenInfo consumeNextToken() {
68     TokenInfo ThisToken = NextToken;
69     NextToken = getNextToken();
70     return ThisToken;
71   }
72 
73   TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; }
74 
75 private:
76   TokenInfo getNextToken() {
77     consumeWhitespace();
78     TokenInfo Result;
79     Result.Range.Start = currentLocation();
80 
81     if (Code.empty()) {
82       Result.Kind = TokenInfo::TK_Eof;
83       Result.Text = "";
84       return Result;
85     }
86 
87     switch (Code[0]) {
88     case ',':
89       Result.Kind = TokenInfo::TK_Comma;
90       Result.Text = Code.substr(0, 1);
91       Code = Code.drop_front();
92       break;
93     case '.':
94       Result.Kind = TokenInfo::TK_Period;
95       Result.Text = Code.substr(0, 1);
96       Code = Code.drop_front();
97       break;
98     case '(':
99       Result.Kind = TokenInfo::TK_OpenParen;
100       Result.Text = Code.substr(0, 1);
101       Code = Code.drop_front();
102       break;
103     case ')':
104       Result.Kind = TokenInfo::TK_CloseParen;
105       Result.Text = Code.substr(0, 1);
106       Code = Code.drop_front();
107       break;
108 
109     case '"':
110     case '\'':
111       // Parse a string literal.
112       consumeStringLiteral(&Result);
113       break;
114 
115     default:
116       if (isAlphanumeric(Code[0])) {
117         // Parse an identifier
118         size_t TokenLength = 1;
119         while (TokenLength < Code.size() && isAlphanumeric(Code[TokenLength]))
120           ++TokenLength;
121         Result.Kind = TokenInfo::TK_Ident;
122         Result.Text = Code.substr(0, TokenLength);
123         Code = Code.drop_front(TokenLength);
124       } else {
125         Result.Kind = TokenInfo::TK_InvalidChar;
126         Result.Text = Code.substr(0, 1);
127         Code = Code.drop_front(1);
128       }
129       break;
130     }
131 
132     Result.Range.End = currentLocation();
133     return Result;
134   }
135 
136   /// \brief Consume a string literal.
137   ///
138   /// \c Code must be positioned at the start of the literal (the opening
139   /// quote). Consumed until it finds the same closing quote character.
140   void consumeStringLiteral(TokenInfo *Result) {
141     bool InEscape = false;
142     const char Marker = Code[0];
143     for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) {
144       if (InEscape) {
145         InEscape = false;
146         continue;
147       }
148       if (Code[Length] == '\\') {
149         InEscape = true;
150         continue;
151       }
152       if (Code[Length] == Marker) {
153         Result->Kind = TokenInfo::TK_Literal;
154         Result->Text = Code.substr(0, Length + 1);
155         Result->Value = Code.substr(1, Length - 1).str();
156         Code = Code.drop_front(Length + 1);
157         return;
158       }
159     }
160 
161     StringRef ErrorText = Code;
162     Code = Code.drop_front(Code.size());
163     SourceRange Range;
164     Range.Start = Result->Range.Start;
165     Range.End = currentLocation();
166     Error->pushErrorFrame(Range, Error->ET_ParserStringError)
167         << ErrorText;
168     Result->Kind = TokenInfo::TK_Error;
169   }
170 
171   /// \brief Consume all leading whitespace from \c Code.
172   void consumeWhitespace() {
173     while (!Code.empty() && isWhitespace(Code[0])) {
174       if (Code[0] == '\n') {
175         ++Line;
176         StartOfLine = Code.drop_front();
177       }
178       Code = Code.drop_front();
179     }
180   }
181 
182   SourceLocation currentLocation() {
183     SourceLocation Location;
184     Location.Line = Line;
185     Location.Column = Code.data() - StartOfLine.data() + 1;
186     return Location;
187   }
188 
189   StringRef Code;
190   StringRef StartOfLine;
191   unsigned Line;
192   Diagnostics *Error;
193   TokenInfo NextToken;
194 };
195 
196 Parser::Sema::~Sema() {}
197 
198 /// \brief Parse and validate a matcher expression.
199 /// \return \c true on success, in which case \c Value has the matcher parsed.
200 ///   If the input is malformed, or some argument has an error, it
201 ///   returns \c false.
202 bool Parser::parseMatcherExpressionImpl(VariantValue *Value) {
203   const TokenInfo NameToken = Tokenizer->consumeNextToken();
204   assert(NameToken.Kind == TokenInfo::TK_Ident);
205   const TokenInfo OpenToken = Tokenizer->consumeNextToken();
206   if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
207     Error->pushErrorFrame(OpenToken.Range, Error->ET_ParserNoOpenParen)
208         << OpenToken.Text;
209     return false;
210   }
211 
212   std::vector<ParserValue> Args;
213   TokenInfo EndToken;
214   while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) {
215     if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) {
216       // End of args.
217       EndToken = Tokenizer->consumeNextToken();
218       break;
219     }
220     if (Args.size() > 0) {
221       // We must find a , token to continue.
222       const TokenInfo CommaToken = Tokenizer->consumeNextToken();
223       if (CommaToken.Kind != TokenInfo::TK_Comma) {
224         Error->pushErrorFrame(CommaToken.Range, Error->ET_ParserNoComma)
225             << CommaToken.Text;
226         return false;
227       }
228     }
229 
230     ParserValue ArgValue;
231     ArgValue.Text = Tokenizer->peekNextToken().Text;
232     ArgValue.Range = Tokenizer->peekNextToken().Range;
233     if (!parseExpressionImpl(&ArgValue.Value)) {
234       Error->pushErrorFrame(NameToken.Range,
235                             Error->ET_ParserMatcherArgFailure)
236           << (Args.size() + 1) << NameToken.Text;
237       return false;
238     }
239 
240     Args.push_back(ArgValue);
241   }
242 
243   if (EndToken.Kind == TokenInfo::TK_Eof) {
244     Error->pushErrorFrame(OpenToken.Range, Error->ET_ParserNoCloseParen);
245     return false;
246   }
247 
248   std::string BindID;
249   if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) {
250     // Parse .bind("foo")
251     Tokenizer->consumeNextToken();  // consume the period.
252     const TokenInfo BindToken = Tokenizer->consumeNextToken();
253     const TokenInfo OpenToken = Tokenizer->consumeNextToken();
254     const TokenInfo IDToken = Tokenizer->consumeNextToken();
255     const TokenInfo CloseToken = Tokenizer->consumeNextToken();
256 
257     // TODO: We could use different error codes for each/some to be more
258     //       explicit about the syntax error.
259     if (BindToken.Kind != TokenInfo::TK_Ident ||
260         BindToken.Text != TokenInfo::ID_Bind) {
261       Error->pushErrorFrame(BindToken.Range, Error->ET_ParserMalformedBindExpr);
262       return false;
263     }
264     if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
265       Error->pushErrorFrame(OpenToken.Range, Error->ET_ParserMalformedBindExpr);
266       return false;
267     }
268     if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) {
269       Error->pushErrorFrame(IDToken.Range, Error->ET_ParserMalformedBindExpr);
270       return false;
271     }
272     if (CloseToken.Kind != TokenInfo::TK_CloseParen) {
273       Error->pushErrorFrame(CloseToken.Range,
274                             Error->ET_ParserMalformedBindExpr);
275       return false;
276     }
277     BindID = IDToken.Value.getString();
278   }
279 
280   // Merge the start and end infos.
281   SourceRange MatcherRange = NameToken.Range;
282   MatcherRange.End = EndToken.Range.End;
283   DynTypedMatcher *Result = S->actOnMatcherExpression(
284       NameToken.Text, MatcherRange, BindID, Args, Error);
285   if (Result == NULL) {
286     Error->pushErrorFrame(NameToken.Range, Error->ET_ParserMatcherFailure)
287         << NameToken.Text;
288     return false;
289   }
290 
291   Value->takeMatcher(Result);
292   return true;
293 }
294 
295 /// \brief Parse an <Expresssion>
296 bool Parser::parseExpressionImpl(VariantValue *Value) {
297   switch (Tokenizer->nextTokenKind()) {
298   case TokenInfo::TK_Literal:
299     *Value = Tokenizer->consumeNextToken().Value;
300     return true;
301 
302   case TokenInfo::TK_Ident:
303     return parseMatcherExpressionImpl(Value);
304 
305   case TokenInfo::TK_Eof:
306     Error->pushErrorFrame(Tokenizer->consumeNextToken().Range,
307                           Error->ET_ParserNoCode);
308     return false;
309 
310   case TokenInfo::TK_Error:
311     // This error was already reported by the tokenizer.
312     return false;
313 
314   case TokenInfo::TK_OpenParen:
315   case TokenInfo::TK_CloseParen:
316   case TokenInfo::TK_Comma:
317   case TokenInfo::TK_Period:
318   case TokenInfo::TK_InvalidChar:
319     const TokenInfo Token = Tokenizer->consumeNextToken();
320     Error->pushErrorFrame(Token.Range, Error->ET_ParserInvalidToken)
321         << Token.Text;
322     return false;
323   }
324 
325   llvm_unreachable("Unknown token kind.");
326 }
327 
328 Parser::Parser(CodeTokenizer *Tokenizer, Sema *S,
329                Diagnostics *Error)
330     : Tokenizer(Tokenizer), S(S), Error(Error) {}
331 
332 class RegistrySema : public Parser::Sema {
333 public:
334   virtual ~RegistrySema() {}
335   DynTypedMatcher *actOnMatcherExpression(StringRef MatcherName,
336                                           const SourceRange &NameRange,
337                                           StringRef BindID,
338                                           ArrayRef<ParserValue> Args,
339                                           Diagnostics *Error) {
340     if (BindID.empty()) {
341       return Registry::constructMatcher(MatcherName, NameRange, Args, Error);
342     } else {
343       return Registry::constructBoundMatcher(MatcherName, NameRange, BindID,
344                                              Args, Error);
345     }
346   }
347 };
348 
349 bool Parser::parseExpression(StringRef Code, VariantValue *Value,
350                              Diagnostics *Error) {
351   RegistrySema S;
352   return parseExpression(Code, &S, Value, Error);
353 }
354 
355 bool Parser::parseExpression(StringRef Code, Sema *S,
356                              VariantValue *Value, Diagnostics *Error) {
357   CodeTokenizer Tokenizer(Code, Error);
358   if (!Parser(&Tokenizer, S, Error).parseExpressionImpl(Value)) return false;
359   if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) {
360     Error->pushErrorFrame(Tokenizer.peekNextToken().Range,
361                           Error->ET_ParserTrailingCode);
362     return false;
363   }
364   return true;
365 }
366 
367 DynTypedMatcher *Parser::parseMatcherExpression(StringRef Code,
368                                                 Diagnostics *Error) {
369   RegistrySema S;
370   return parseMatcherExpression(Code, &S, Error);
371 }
372 
373 DynTypedMatcher *Parser::parseMatcherExpression(StringRef Code,
374                                                 Parser::Sema *S,
375                                                 Diagnostics *Error) {
376   VariantValue Value;
377   if (!parseExpression(Code, S, &Value, Error))
378     return NULL;
379   if (!Value.isMatcher()) {
380     Error->pushErrorFrame(SourceRange(), Error->ET_ParserNotAMatcher);
381     return NULL;
382   }
383   return Value.getMatcher().clone();
384 }
385 
386 }  // namespace dynamic
387 }  // namespace ast_matchers
388 }  // namespace clang
389