xref: /llvm-project/clang/lib/ASTMatchers/Dynamic/Parser.cpp (revision 24db0f0afd425fdb0854d3d6a6e04f87c76dd27f)
1 //===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief Recursive parser implementation for the matcher expression grammar.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include <string>
16 #include <vector>
17 
18 #include "clang/ASTMatchers/Dynamic/Parser.h"
19 #include "clang/ASTMatchers/Dynamic/Registry.h"
20 #include "clang/Basic/CharInfo.h"
21 #include "llvm/ADT/Twine.h"
22 
23 namespace clang {
24 namespace ast_matchers {
25 namespace dynamic {
26 
27 /// \brief Simple structure to hold information for one token from the parser.
28 struct Parser::TokenInfo {
29   /// \brief Different possible tokens.
30   enum TokenKind {
31     TK_Eof = 0,
32     TK_OpenParen = 1,
33     TK_CloseParen = 2,
34     TK_Comma = 3,
35     TK_Literal = 4,
36     TK_Ident = 5,
37     TK_InvalidChar = 6,
38     TK_Error = 7
39   };
40 
41   TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {}
42 
43   StringRef Text;
44   TokenKind Kind;
45   SourceRange Range;
46   VariantValue Value;
47 };
48 
49 /// \brief Simple tokenizer for the parser.
50 class Parser::CodeTokenizer {
51 public:
52   explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error)
53       : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error) {
54     NextToken = getNextToken();
55   }
56 
57   /// \brief Returns but doesn't consume the next token.
58   const TokenInfo &peekNextToken() const { return NextToken; }
59 
60   /// \brief Consumes and returns the next token.
61   TokenInfo consumeNextToken() {
62     TokenInfo ThisToken = NextToken;
63     NextToken = getNextToken();
64     return ThisToken;
65   }
66 
67   TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; }
68 
69 private:
70   TokenInfo getNextToken() {
71     consumeWhitespace();
72     TokenInfo Result;
73     Result.Range.Start = currentLocation();
74 
75     if (Code.empty()) {
76       Result.Kind = TokenInfo::TK_Eof;
77       Result.Text = "";
78       return Result;
79     }
80 
81     switch (Code[0]) {
82     case ',':
83       Result.Kind = TokenInfo::TK_Comma;
84       Result.Text = Code.substr(0, 1);
85       Code = Code.drop_front();
86       break;
87     case '(':
88       Result.Kind = TokenInfo::TK_OpenParen;
89       Result.Text = Code.substr(0, 1);
90       Code = Code.drop_front();
91       break;
92     case ')':
93       Result.Kind = TokenInfo::TK_CloseParen;
94       Result.Text = Code.substr(0, 1);
95       Code = Code.drop_front();
96       break;
97 
98     case '"':
99     case '\'':
100       // Parse a string literal.
101       consumeStringLiteral(&Result);
102       break;
103 
104     default:
105       if (isAlphanumeric(Code[0])) {
106         // Parse an identifier
107         size_t TokenLength = 1;
108         while (TokenLength < Code.size() && isAlphanumeric(Code[TokenLength]))
109           ++TokenLength;
110         Result.Kind = TokenInfo::TK_Ident;
111         Result.Text = Code.substr(0, TokenLength);
112         Code = Code.drop_front(TokenLength);
113       } else {
114         Result.Kind = TokenInfo::TK_InvalidChar;
115         Result.Text = Code.substr(0, 1);
116         Code = Code.drop_front(1);
117       }
118       break;
119     }
120 
121     Result.Range.End = currentLocation();
122     return Result;
123   }
124 
125   /// \brief Consume a string literal.
126   ///
127   /// \c Code must be positioned at the start of the literal (the opening
128   /// quote). Consumed until it finds the same closing quote character.
129   void consumeStringLiteral(TokenInfo *Result) {
130     bool InEscape = false;
131     const char Marker = Code[0];
132     for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) {
133       if (InEscape) {
134         InEscape = false;
135         continue;
136       }
137       if (Code[Length] == '\\') {
138         InEscape = true;
139         continue;
140       }
141       if (Code[Length] == Marker) {
142         Result->Kind = TokenInfo::TK_Literal;
143         Result->Text = Code.substr(0, Length + 1);
144         Result->Value = Code.substr(1, Length - 1).str();
145         Code = Code.drop_front(Length + 1);
146         return;
147       }
148     }
149 
150     StringRef ErrorText = Code;
151     Code = Code.drop_front(Code.size());
152     SourceRange Range;
153     Range.Start = Result->Range.Start;
154     Range.End = currentLocation();
155     Error->pushErrorFrame(Range, Error->ET_ParserStringError)
156         << ErrorText;
157     Result->Kind = TokenInfo::TK_Error;
158   }
159 
160   /// \brief Consume all leading whitespace from \c Code.
161   void consumeWhitespace() {
162     while (!Code.empty() && isWhitespace(Code[0])) {
163       if (Code[0] == '\n') {
164         ++Line;
165         StartOfLine = Code.drop_front();
166       }
167       Code = Code.drop_front();
168     }
169   }
170 
171   SourceLocation currentLocation() {
172     SourceLocation Location;
173     Location.Line = Line;
174     Location.Column = Code.data() - StartOfLine.data() + 1;
175     return Location;
176   }
177 
178   StringRef Code;
179   StringRef StartOfLine;
180   unsigned Line;
181   Diagnostics *Error;
182   TokenInfo NextToken;
183 };
184 
185 Parser::Sema::~Sema() {}
186 
187 /// \brief Parse and validate a matcher expression.
188 /// \return \c true on success, in which case \c Value has the matcher parsed.
189 ///   If the input is malformed, or some argument has an error, it
190 ///   returns \c false.
191 bool Parser::parseMatcherExpressionImpl(VariantValue *Value) {
192   const TokenInfo NameToken = Tokenizer->consumeNextToken();
193   assert(NameToken.Kind == TokenInfo::TK_Ident);
194   const TokenInfo OpenToken = Tokenizer->consumeNextToken();
195   if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
196     Error->pushErrorFrame(OpenToken.Range, Error->ET_ParserNoOpenParen)
197         << OpenToken.Text;
198     return false;
199   }
200 
201   std::vector<ParserValue> Args;
202   TokenInfo EndToken;
203   while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) {
204     if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) {
205       // End of args.
206       EndToken = Tokenizer->consumeNextToken();
207       break;
208     }
209     if (Args.size() > 0) {
210       // We must find a , token to continue.
211       const TokenInfo CommaToken = Tokenizer->consumeNextToken();
212       if (CommaToken.Kind != TokenInfo::TK_Comma) {
213         Error->pushErrorFrame(CommaToken.Range, Error->ET_ParserNoComma)
214             << CommaToken.Text;
215         return false;
216       }
217     }
218 
219     ParserValue ArgValue;
220     ArgValue.Text = Tokenizer->peekNextToken().Text;
221     ArgValue.Range = Tokenizer->peekNextToken().Range;
222     if (!parseExpressionImpl(&ArgValue.Value)) {
223       Error->pushErrorFrame(NameToken.Range,
224                             Error->ET_ParserMatcherArgFailure)
225           << (Args.size() + 1) << NameToken.Text;
226       return false;
227     }
228 
229     Args.push_back(ArgValue);
230   }
231 
232   if (EndToken.Kind == TokenInfo::TK_Eof) {
233     Error->pushErrorFrame(OpenToken.Range, Error->ET_ParserNoCloseParen);
234     return false;
235   }
236 
237   // Merge the start and end infos.
238   SourceRange MatcherRange = NameToken.Range;
239   MatcherRange.End = EndToken.Range.End;
240   DynTypedMatcher *Result =
241       S->actOnMatcherExpression(NameToken.Text, MatcherRange, Args, Error);
242   if (Result == NULL) {
243     Error->pushErrorFrame(NameToken.Range, Error->ET_ParserMatcherFailure)
244         << NameToken.Text;
245     return false;
246   }
247 
248   Value->takeMatcher(Result);
249   return true;
250 }
251 
252 /// \brief Parse an <Expresssion>
253 bool Parser::parseExpressionImpl(VariantValue *Value) {
254   switch (Tokenizer->nextTokenKind()) {
255   case TokenInfo::TK_Literal:
256     *Value = Tokenizer->consumeNextToken().Value;
257     return true;
258 
259   case TokenInfo::TK_Ident:
260     return parseMatcherExpressionImpl(Value);
261 
262   case TokenInfo::TK_Eof:
263     Error->pushErrorFrame(Tokenizer->consumeNextToken().Range,
264                           Error->ET_ParserNoCode);
265     return false;
266 
267   case TokenInfo::TK_Error:
268     // This error was already reported by the tokenizer.
269     return false;
270 
271   case TokenInfo::TK_OpenParen:
272   case TokenInfo::TK_CloseParen:
273   case TokenInfo::TK_Comma:
274   case TokenInfo::TK_InvalidChar:
275     const TokenInfo Token = Tokenizer->consumeNextToken();
276     Error->pushErrorFrame(Token.Range, Error->ET_ParserInvalidToken)
277         << Token.Text;
278     return false;
279   }
280 
281   llvm_unreachable("Unknown token kind.");
282 }
283 
284 Parser::Parser(CodeTokenizer *Tokenizer, Sema *S,
285                Diagnostics *Error)
286     : Tokenizer(Tokenizer), S(S), Error(Error) {}
287 
288 class RegistrySema : public Parser::Sema {
289 public:
290   virtual ~RegistrySema() {}
291   DynTypedMatcher *actOnMatcherExpression(StringRef MatcherName,
292                                           const SourceRange &NameRange,
293                                           ArrayRef<ParserValue> Args,
294                                           Diagnostics *Error) {
295     return Registry::constructMatcher(MatcherName, NameRange, Args, Error);
296   }
297 };
298 
299 bool Parser::parseExpression(StringRef Code, VariantValue *Value,
300                              Diagnostics *Error) {
301   RegistrySema S;
302   return parseExpression(Code, &S, Value, Error);
303 }
304 
305 bool Parser::parseExpression(StringRef Code, Sema *S,
306                              VariantValue *Value, Diagnostics *Error) {
307   CodeTokenizer Tokenizer(Code, Error);
308   return Parser(&Tokenizer, S, Error).parseExpressionImpl(Value);
309 }
310 
311 DynTypedMatcher *Parser::parseMatcherExpression(StringRef Code,
312                                                 Diagnostics *Error) {
313   RegistrySema S;
314   return parseMatcherExpression(Code, &S, Error);
315 }
316 
317 DynTypedMatcher *Parser::parseMatcherExpression(StringRef Code,
318                                                 Parser::Sema *S,
319                                                 Diagnostics *Error) {
320   VariantValue Value;
321   if (!parseExpression(Code, S, &Value, Error))
322     return NULL;
323   if (!Value.isMatcher()) {
324     Error->pushErrorFrame(SourceRange(), Error->ET_ParserNotAMatcher);
325     return NULL;
326   }
327   return Value.getMatcher().clone();
328 }
329 
330 }  // namespace dynamic
331 }  // namespace ast_matchers
332 }  // namespace clang
333