xref: /llvm-project/clang/lib/ASTMatchers/Dynamic/Parser.cpp (revision c6f2c9b5665e1547af171dd9a25672a44e9280a4)
1 //===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief Recursive parser implementation for the matcher expression grammar.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include <string>
16 #include <vector>
17 
18 #include "clang/ASTMatchers/Dynamic/Parser.h"
19 #include "clang/ASTMatchers/Dynamic/Registry.h"
20 #include "clang/Basic/CharInfo.h"
21 #include "llvm/ADT/Twine.h"
22 
23 namespace clang {
24 namespace ast_matchers {
25 namespace dynamic {
26 
27 /// \brief Simple structure to hold information for one token from the parser.
28 struct Parser::TokenInfo {
29   /// \brief Different possible tokens.
30   enum TokenKind {
31     TK_Eof = 0,
32     TK_OpenParen = 1,
33     TK_CloseParen = 2,
34     TK_Comma = 3,
35     TK_Period = 4,
36     TK_Literal = 5,
37     TK_Ident = 6,
38     TK_InvalidChar = 7,
39     TK_Error = 8
40   };
41 
42   /// \brief Some known identifiers.
43   static const char* const ID_Bind;
44 
45   TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {}
46 
47   StringRef Text;
48   TokenKind Kind;
49   SourceRange Range;
50   VariantValue Value;
51 };
52 
53 const char* const Parser::TokenInfo::ID_Bind = "bind";
54 
55 /// \brief Simple tokenizer for the parser.
56 class Parser::CodeTokenizer {
57 public:
58   explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error)
59       : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error) {
60     NextToken = getNextToken();
61   }
62 
63   /// \brief Returns but doesn't consume the next token.
64   const TokenInfo &peekNextToken() const { return NextToken; }
65 
66   /// \brief Consumes and returns the next token.
67   TokenInfo consumeNextToken() {
68     TokenInfo ThisToken = NextToken;
69     NextToken = getNextToken();
70     return ThisToken;
71   }
72 
73   TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; }
74 
75 private:
76   TokenInfo getNextToken() {
77     consumeWhitespace();
78     TokenInfo Result;
79     Result.Range.Start = currentLocation();
80 
81     if (Code.empty()) {
82       Result.Kind = TokenInfo::TK_Eof;
83       Result.Text = "";
84       return Result;
85     }
86 
87     switch (Code[0]) {
88     case ',':
89       Result.Kind = TokenInfo::TK_Comma;
90       Result.Text = Code.substr(0, 1);
91       Code = Code.drop_front();
92       break;
93     case '.':
94       Result.Kind = TokenInfo::TK_Period;
95       Result.Text = Code.substr(0, 1);
96       Code = Code.drop_front();
97       break;
98     case '(':
99       Result.Kind = TokenInfo::TK_OpenParen;
100       Result.Text = Code.substr(0, 1);
101       Code = Code.drop_front();
102       break;
103     case ')':
104       Result.Kind = TokenInfo::TK_CloseParen;
105       Result.Text = Code.substr(0, 1);
106       Code = Code.drop_front();
107       break;
108 
109     case '"':
110     case '\'':
111       // Parse a string literal.
112       consumeStringLiteral(&Result);
113       break;
114 
115     case '0': case '1': case '2': case '3': case '4':
116     case '5': case '6': case '7': case '8': case '9':
117       // Parse an unsigned literal.
118       consumeUnsignedLiteral(&Result);
119       break;
120 
121     default:
122       if (isAlphanumeric(Code[0])) {
123         // Parse an identifier
124         size_t TokenLength = 1;
125         while (TokenLength < Code.size() && isAlphanumeric(Code[TokenLength]))
126           ++TokenLength;
127         Result.Kind = TokenInfo::TK_Ident;
128         Result.Text = Code.substr(0, TokenLength);
129         Code = Code.drop_front(TokenLength);
130       } else {
131         Result.Kind = TokenInfo::TK_InvalidChar;
132         Result.Text = Code.substr(0, 1);
133         Code = Code.drop_front(1);
134       }
135       break;
136     }
137 
138     Result.Range.End = currentLocation();
139     return Result;
140   }
141 
142   /// \brief Consume an unsigned literal.
143   void consumeUnsignedLiteral(TokenInfo *Result) {
144     unsigned Length = 1;
145     if (Code.size() > 1) {
146       // Consume the 'x' or 'b' radix modifier, if present.
147       switch (toLowercase(Code[1])) {
148       case 'x': case 'b': Length = 2;
149       }
150     }
151     while (Length < Code.size() && isHexDigit(Code[Length]))
152       ++Length;
153 
154     Result->Text = Code.substr(0, Length);
155     Code = Code.drop_front(Length);
156 
157     unsigned Value;
158     if (!Result->Text.getAsInteger(0, Value)) {
159       Result->Kind = TokenInfo::TK_Literal;
160       Result->Value = Value;
161     } else {
162       SourceRange Range;
163       Range.Start = Result->Range.Start;
164       Range.End = currentLocation();
165       Error->pushErrorFrame(Range, Error->ET_ParserUnsignedError)
166           << Result->Text;
167       Result->Kind = TokenInfo::TK_Error;
168     }
169   }
170 
171   /// \brief Consume a string literal.
172   ///
173   /// \c Code must be positioned at the start of the literal (the opening
174   /// quote). Consumed until it finds the same closing quote character.
175   void consumeStringLiteral(TokenInfo *Result) {
176     bool InEscape = false;
177     const char Marker = Code[0];
178     for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) {
179       if (InEscape) {
180         InEscape = false;
181         continue;
182       }
183       if (Code[Length] == '\\') {
184         InEscape = true;
185         continue;
186       }
187       if (Code[Length] == Marker) {
188         Result->Kind = TokenInfo::TK_Literal;
189         Result->Text = Code.substr(0, Length + 1);
190         Result->Value = Code.substr(1, Length - 1).str();
191         Code = Code.drop_front(Length + 1);
192         return;
193       }
194     }
195 
196     StringRef ErrorText = Code;
197     Code = Code.drop_front(Code.size());
198     SourceRange Range;
199     Range.Start = Result->Range.Start;
200     Range.End = currentLocation();
201     Error->pushErrorFrame(Range, Error->ET_ParserStringError)
202         << ErrorText;
203     Result->Kind = TokenInfo::TK_Error;
204   }
205 
206   /// \brief Consume all leading whitespace from \c Code.
207   void consumeWhitespace() {
208     while (!Code.empty() && isWhitespace(Code[0])) {
209       if (Code[0] == '\n') {
210         ++Line;
211         StartOfLine = Code.drop_front();
212       }
213       Code = Code.drop_front();
214     }
215   }
216 
217   SourceLocation currentLocation() {
218     SourceLocation Location;
219     Location.Line = Line;
220     Location.Column = Code.data() - StartOfLine.data() + 1;
221     return Location;
222   }
223 
224   StringRef Code;
225   StringRef StartOfLine;
226   unsigned Line;
227   Diagnostics *Error;
228   TokenInfo NextToken;
229 };
230 
/// \brief Out-of-line definition of the (empty) \c Sema destructor.
Parser::Sema::~Sema() {}
232 
233 /// \brief Parse and validate a matcher expression.
234 /// \return \c true on success, in which case \c Value has the matcher parsed.
235 ///   If the input is malformed, or some argument has an error, it
236 ///   returns \c false.
237 bool Parser::parseMatcherExpressionImpl(VariantValue *Value) {
238   const TokenInfo NameToken = Tokenizer->consumeNextToken();
239   assert(NameToken.Kind == TokenInfo::TK_Ident);
240   const TokenInfo OpenToken = Tokenizer->consumeNextToken();
241   if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
242     Error->pushErrorFrame(OpenToken.Range, Error->ET_ParserNoOpenParen)
243         << OpenToken.Text;
244     return false;
245   }
246 
247   std::vector<ParserValue> Args;
248   TokenInfo EndToken;
249   while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) {
250     if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) {
251       // End of args.
252       EndToken = Tokenizer->consumeNextToken();
253       break;
254     }
255     if (Args.size() > 0) {
256       // We must find a , token to continue.
257       const TokenInfo CommaToken = Tokenizer->consumeNextToken();
258       if (CommaToken.Kind != TokenInfo::TK_Comma) {
259         Error->pushErrorFrame(CommaToken.Range, Error->ET_ParserNoComma)
260             << CommaToken.Text;
261         return false;
262       }
263     }
264 
265     ParserValue ArgValue;
266     ArgValue.Text = Tokenizer->peekNextToken().Text;
267     ArgValue.Range = Tokenizer->peekNextToken().Range;
268     if (!parseExpressionImpl(&ArgValue.Value)) {
269       Error->pushErrorFrame(NameToken.Range,
270                             Error->ET_ParserMatcherArgFailure)
271           << (Args.size() + 1) << NameToken.Text;
272       return false;
273     }
274 
275     Args.push_back(ArgValue);
276   }
277 
278   if (EndToken.Kind == TokenInfo::TK_Eof) {
279     Error->pushErrorFrame(OpenToken.Range, Error->ET_ParserNoCloseParen);
280     return false;
281   }
282 
283   std::string BindID;
284   if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) {
285     // Parse .bind("foo")
286     Tokenizer->consumeNextToken();  // consume the period.
287     const TokenInfo BindToken = Tokenizer->consumeNextToken();
288     const TokenInfo OpenToken = Tokenizer->consumeNextToken();
289     const TokenInfo IDToken = Tokenizer->consumeNextToken();
290     const TokenInfo CloseToken = Tokenizer->consumeNextToken();
291 
292     // TODO: We could use different error codes for each/some to be more
293     //       explicit about the syntax error.
294     if (BindToken.Kind != TokenInfo::TK_Ident ||
295         BindToken.Text != TokenInfo::ID_Bind) {
296       Error->pushErrorFrame(BindToken.Range, Error->ET_ParserMalformedBindExpr);
297       return false;
298     }
299     if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
300       Error->pushErrorFrame(OpenToken.Range, Error->ET_ParserMalformedBindExpr);
301       return false;
302     }
303     if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) {
304       Error->pushErrorFrame(IDToken.Range, Error->ET_ParserMalformedBindExpr);
305       return false;
306     }
307     if (CloseToken.Kind != TokenInfo::TK_CloseParen) {
308       Error->pushErrorFrame(CloseToken.Range,
309                             Error->ET_ParserMalformedBindExpr);
310       return false;
311     }
312     BindID = IDToken.Value.getString();
313   }
314 
315   // Merge the start and end infos.
316   SourceRange MatcherRange = NameToken.Range;
317   MatcherRange.End = EndToken.Range.End;
318   MatcherList Result = S->actOnMatcherExpression(
319       NameToken.Text, MatcherRange, BindID, Args, Error);
320   if (Result.empty()) {
321     Error->pushErrorFrame(NameToken.Range, Error->ET_ParserMatcherFailure)
322         << NameToken.Text;
323     return false;
324   }
325 
326   *Value = Result;
327   return true;
328 }
329 
330 /// \brief Parse an <Expresssion>
331 bool Parser::parseExpressionImpl(VariantValue *Value) {
332   switch (Tokenizer->nextTokenKind()) {
333   case TokenInfo::TK_Literal:
334     *Value = Tokenizer->consumeNextToken().Value;
335     return true;
336 
337   case TokenInfo::TK_Ident:
338     return parseMatcherExpressionImpl(Value);
339 
340   case TokenInfo::TK_Eof:
341     Error->pushErrorFrame(Tokenizer->consumeNextToken().Range,
342                           Error->ET_ParserNoCode);
343     return false;
344 
345   case TokenInfo::TK_Error:
346     // This error was already reported by the tokenizer.
347     return false;
348 
349   case TokenInfo::TK_OpenParen:
350   case TokenInfo::TK_CloseParen:
351   case TokenInfo::TK_Comma:
352   case TokenInfo::TK_Period:
353   case TokenInfo::TK_InvalidChar:
354     const TokenInfo Token = Tokenizer->consumeNextToken();
355     Error->pushErrorFrame(Token.Range, Error->ET_ParserInvalidToken)
356         << Token.Text;
357     return false;
358   }
359 
360   llvm_unreachable("Unknown token kind.");
361 }
362 
/// \brief Stores the tokenizer, semantic-action handler and diagnostics sink
/// that the parse methods use.
Parser::Parser(CodeTokenizer *Tokenizer, Sema *S,
               Diagnostics *Error)
    : Tokenizer(Tokenizer), S(S), Error(Error) {}
366 
367 class RegistrySema : public Parser::Sema {
368 public:
369   virtual ~RegistrySema() {}
370   MatcherList actOnMatcherExpression(StringRef MatcherName,
371                                      const SourceRange &NameRange,
372                                      StringRef BindID,
373                                      ArrayRef<ParserValue> Args,
374                                      Diagnostics *Error) {
375     if (BindID.empty()) {
376       return Registry::constructMatcher(MatcherName, NameRange, Args, Error);
377     } else {
378       return Registry::constructBoundMatcher(MatcherName, NameRange, BindID,
379                                              Args, Error);
380     }
381   }
382 };
383 
384 bool Parser::parseExpression(StringRef Code, VariantValue *Value,
385                              Diagnostics *Error) {
386   RegistrySema S;
387   return parseExpression(Code, &S, Value, Error);
388 }
389 
390 bool Parser::parseExpression(StringRef Code, Sema *S,
391                              VariantValue *Value, Diagnostics *Error) {
392   CodeTokenizer Tokenizer(Code, Error);
393   if (!Parser(&Tokenizer, S, Error).parseExpressionImpl(Value)) return false;
394   if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) {
395     Error->pushErrorFrame(Tokenizer.peekNextToken().Range,
396                           Error->ET_ParserTrailingCode);
397     return false;
398   }
399   return true;
400 }
401 
402 DynTypedMatcher *Parser::parseMatcherExpression(StringRef Code,
403                                                 Diagnostics *Error) {
404   RegistrySema S;
405   return parseMatcherExpression(Code, &S, Error);
406 }
407 
408 DynTypedMatcher *Parser::parseMatcherExpression(StringRef Code,
409                                                 Parser::Sema *S,
410                                                 Diagnostics *Error) {
411   VariantValue Value;
412   if (!parseExpression(Code, S, &Value, Error))
413     return NULL;
414   if (!Value.isMatchers()) {
415     Error->pushErrorFrame(SourceRange(), Error->ET_ParserNotAMatcher);
416     return NULL;
417   }
418   if (Value.getMatchers().matchers().size() != 1) {
419     Error->pushErrorFrame(SourceRange(), Error->ET_ParserOverloadedType)
420         << Value.getTypeAsString();
421     return NULL;
422   }
423   return Value.getMatchers().matchers()[0]->clone();
424 }
425 
426 }  // namespace dynamic
427 }  // namespace ast_matchers
428 }  // namespace clang
429