xref: /llvm-project/clang/lib/ASTMatchers/Dynamic/Parser.cpp (revision 5553d0d4cadc35733a910e7af5f8911105ff529d)
1 //===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief Recursive parser implementation for the matcher expression grammar.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include "clang/ASTMatchers/Dynamic/Parser.h"
16 #include "clang/ASTMatchers/Dynamic/Registry.h"
17 #include "clang/Basic/CharInfo.h"
18 #include "llvm/ADT/Optional.h"
19 #include "llvm/ADT/Twine.h"
20 #include <string>
21 #include <vector>
22 
23 namespace clang {
24 namespace ast_matchers {
25 namespace dynamic {
26 
27 /// \brief Simple structure to hold information for one token from the parser.
28 struct Parser::TokenInfo {
29   /// \brief Different possible tokens.
30   enum TokenKind {
31     TK_Eof = 0,
32     TK_OpenParen = 1,
33     TK_CloseParen = 2,
34     TK_Comma = 3,
35     TK_Period = 4,
36     TK_Literal = 5,
37     TK_Ident = 6,
38     TK_InvalidChar = 7,
39     TK_Error = 8
40   };
41 
42   /// \brief Some known identifiers.
43   static const char* const ID_Bind;
44 
45   TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {}
46 
47   StringRef Text;
48   TokenKind Kind;
49   SourceRange Range;
50   VariantValue Value;
51 };
52 
53 const char* const Parser::TokenInfo::ID_Bind = "bind";
54 
55 /// \brief Simple tokenizer for the parser.
56 class Parser::CodeTokenizer {
57 public:
58   explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error)
59       : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error) {
60     NextToken = getNextToken();
61   }
62 
63   /// \brief Returns but doesn't consume the next token.
64   const TokenInfo &peekNextToken() const { return NextToken; }
65 
66   /// \brief Consumes and returns the next token.
67   TokenInfo consumeNextToken() {
68     TokenInfo ThisToken = NextToken;
69     NextToken = getNextToken();
70     return ThisToken;
71   }
72 
73   TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; }
74 
75 private:
76   TokenInfo getNextToken() {
77     consumeWhitespace();
78     TokenInfo Result;
79     Result.Range.Start = currentLocation();
80 
81     if (Code.empty()) {
82       Result.Kind = TokenInfo::TK_Eof;
83       Result.Text = "";
84       return Result;
85     }
86 
87     switch (Code[0]) {
88     case ',':
89       Result.Kind = TokenInfo::TK_Comma;
90       Result.Text = Code.substr(0, 1);
91       Code = Code.drop_front();
92       break;
93     case '.':
94       Result.Kind = TokenInfo::TK_Period;
95       Result.Text = Code.substr(0, 1);
96       Code = Code.drop_front();
97       break;
98     case '(':
99       Result.Kind = TokenInfo::TK_OpenParen;
100       Result.Text = Code.substr(0, 1);
101       Code = Code.drop_front();
102       break;
103     case ')':
104       Result.Kind = TokenInfo::TK_CloseParen;
105       Result.Text = Code.substr(0, 1);
106       Code = Code.drop_front();
107       break;
108 
109     case '"':
110     case '\'':
111       // Parse a string literal.
112       consumeStringLiteral(&Result);
113       break;
114 
115     case '0': case '1': case '2': case '3': case '4':
116     case '5': case '6': case '7': case '8': case '9':
117       // Parse an unsigned literal.
118       consumeUnsignedLiteral(&Result);
119       break;
120 
121     default:
122       if (isAlphanumeric(Code[0])) {
123         // Parse an identifier
124         size_t TokenLength = 1;
125         while (TokenLength < Code.size() && isAlphanumeric(Code[TokenLength]))
126           ++TokenLength;
127         Result.Kind = TokenInfo::TK_Ident;
128         Result.Text = Code.substr(0, TokenLength);
129         Code = Code.drop_front(TokenLength);
130       } else {
131         Result.Kind = TokenInfo::TK_InvalidChar;
132         Result.Text = Code.substr(0, 1);
133         Code = Code.drop_front(1);
134       }
135       break;
136     }
137 
138     Result.Range.End = currentLocation();
139     return Result;
140   }
141 
142   /// \brief Consume an unsigned literal.
143   void consumeUnsignedLiteral(TokenInfo *Result) {
144     unsigned Length = 1;
145     if (Code.size() > 1) {
146       // Consume the 'x' or 'b' radix modifier, if present.
147       switch (toLowercase(Code[1])) {
148       case 'x': case 'b': Length = 2;
149       }
150     }
151     while (Length < Code.size() && isHexDigit(Code[Length]))
152       ++Length;
153 
154     Result->Text = Code.substr(0, Length);
155     Code = Code.drop_front(Length);
156 
157     unsigned Value;
158     if (!Result->Text.getAsInteger(0, Value)) {
159       Result->Kind = TokenInfo::TK_Literal;
160       Result->Value = Value;
161     } else {
162       SourceRange Range;
163       Range.Start = Result->Range.Start;
164       Range.End = currentLocation();
165       Error->addError(Range, Error->ET_ParserUnsignedError) << Result->Text;
166       Result->Kind = TokenInfo::TK_Error;
167     }
168   }
169 
170   /// \brief Consume a string literal.
171   ///
172   /// \c Code must be positioned at the start of the literal (the opening
173   /// quote). Consumed until it finds the same closing quote character.
174   void consumeStringLiteral(TokenInfo *Result) {
175     bool InEscape = false;
176     const char Marker = Code[0];
177     for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) {
178       if (InEscape) {
179         InEscape = false;
180         continue;
181       }
182       if (Code[Length] == '\\') {
183         InEscape = true;
184         continue;
185       }
186       if (Code[Length] == Marker) {
187         Result->Kind = TokenInfo::TK_Literal;
188         Result->Text = Code.substr(0, Length + 1);
189         Result->Value = Code.substr(1, Length - 1).str();
190         Code = Code.drop_front(Length + 1);
191         return;
192       }
193     }
194 
195     StringRef ErrorText = Code;
196     Code = Code.drop_front(Code.size());
197     SourceRange Range;
198     Range.Start = Result->Range.Start;
199     Range.End = currentLocation();
200     Error->addError(Range, Error->ET_ParserStringError) << ErrorText;
201     Result->Kind = TokenInfo::TK_Error;
202   }
203 
204   /// \brief Consume all leading whitespace from \c Code.
205   void consumeWhitespace() {
206     while (!Code.empty() && isWhitespace(Code[0])) {
207       if (Code[0] == '\n') {
208         ++Line;
209         StartOfLine = Code.drop_front();
210       }
211       Code = Code.drop_front();
212     }
213   }
214 
215   SourceLocation currentLocation() {
216     SourceLocation Location;
217     Location.Line = Line;
218     Location.Column = Code.data() - StartOfLine.data() + 1;
219     return Location;
220   }
221 
222   StringRef Code;
223   StringRef StartOfLine;
224   unsigned Line;
225   Diagnostics *Error;
226   TokenInfo NextToken;
227 };
228 
229 Parser::Sema::~Sema() {}
230 
231 /// \brief Parse and validate a matcher expression.
232 /// \return \c true on success, in which case \c Value has the matcher parsed.
233 ///   If the input is malformed, or some argument has an error, it
234 ///   returns \c false.
235 bool Parser::parseMatcherExpressionImpl(VariantValue *Value) {
236   const TokenInfo NameToken = Tokenizer->consumeNextToken();
237   assert(NameToken.Kind == TokenInfo::TK_Ident);
238   const TokenInfo OpenToken = Tokenizer->consumeNextToken();
239   if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
240     Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen)
241         << OpenToken.Text;
242     return false;
243   }
244 
245   llvm::Optional<MatcherCtor> Ctor =
246       S->lookupMatcherCtor(NameToken.Text, NameToken.Range, Error);
247 
248   std::vector<ParserValue> Args;
249   TokenInfo EndToken;
250   while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) {
251     if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) {
252       // End of args.
253       EndToken = Tokenizer->consumeNextToken();
254       break;
255     }
256     if (Args.size() > 0) {
257       // We must find a , token to continue.
258       const TokenInfo CommaToken = Tokenizer->consumeNextToken();
259       if (CommaToken.Kind != TokenInfo::TK_Comma) {
260         Error->addError(CommaToken.Range, Error->ET_ParserNoComma)
261             << CommaToken.Text;
262         return false;
263       }
264     }
265 
266     Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error,
267                              NameToken.Text, NameToken.Range, Args.size() + 1);
268     ParserValue ArgValue;
269     ArgValue.Text = Tokenizer->peekNextToken().Text;
270     ArgValue.Range = Tokenizer->peekNextToken().Range;
271     if (!parseExpressionImpl(&ArgValue.Value)) return false;
272 
273     Args.push_back(ArgValue);
274   }
275 
276   if (EndToken.Kind == TokenInfo::TK_Eof) {
277     Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen);
278     return false;
279   }
280 
281   std::string BindID;
282   if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) {
283     // Parse .bind("foo")
284     Tokenizer->consumeNextToken();  // consume the period.
285     const TokenInfo BindToken = Tokenizer->consumeNextToken();
286     const TokenInfo OpenToken = Tokenizer->consumeNextToken();
287     const TokenInfo IDToken = Tokenizer->consumeNextToken();
288     const TokenInfo CloseToken = Tokenizer->consumeNextToken();
289 
290     // TODO: We could use different error codes for each/some to be more
291     //       explicit about the syntax error.
292     if (BindToken.Kind != TokenInfo::TK_Ident ||
293         BindToken.Text != TokenInfo::ID_Bind) {
294       Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr);
295       return false;
296     }
297     if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
298       Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr);
299       return false;
300     }
301     if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) {
302       Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr);
303       return false;
304     }
305     if (CloseToken.Kind != TokenInfo::TK_CloseParen) {
306       Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr);
307       return false;
308     }
309     BindID = IDToken.Value.getString();
310   }
311 
312   if (!Ctor)
313     return false;
314 
315   // Merge the start and end infos.
316   Diagnostics::Context Ctx(Diagnostics::Context::ConstructMatcher, Error,
317                            NameToken.Text, NameToken.Range);
318   SourceRange MatcherRange = NameToken.Range;
319   MatcherRange.End = EndToken.Range.End;
320   VariantMatcher Result = S->actOnMatcherExpression(
321       *Ctor, MatcherRange, BindID, Args, Error);
322   if (Result.isNull()) return false;
323 
324   *Value = Result;
325   return true;
326 }
327 
328 /// \brief Parse an <Expresssion>
329 bool Parser::parseExpressionImpl(VariantValue *Value) {
330   switch (Tokenizer->nextTokenKind()) {
331   case TokenInfo::TK_Literal:
332     *Value = Tokenizer->consumeNextToken().Value;
333     return true;
334 
335   case TokenInfo::TK_Ident:
336     return parseMatcherExpressionImpl(Value);
337 
338   case TokenInfo::TK_Eof:
339     Error->addError(Tokenizer->consumeNextToken().Range,
340                     Error->ET_ParserNoCode);
341     return false;
342 
343   case TokenInfo::TK_Error:
344     // This error was already reported by the tokenizer.
345     return false;
346 
347   case TokenInfo::TK_OpenParen:
348   case TokenInfo::TK_CloseParen:
349   case TokenInfo::TK_Comma:
350   case TokenInfo::TK_Period:
351   case TokenInfo::TK_InvalidChar:
352     const TokenInfo Token = Tokenizer->consumeNextToken();
353     Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text;
354     return false;
355   }
356 
357   llvm_unreachable("Unknown token kind.");
358 }
359 
360 Parser::Parser(CodeTokenizer *Tokenizer, Sema *S,
361                Diagnostics *Error)
362     : Tokenizer(Tokenizer), S(S), Error(Error) {}
363 
364 class RegistrySema : public Parser::Sema {
365 public:
366   virtual ~RegistrySema() {}
367   llvm::Optional<MatcherCtor> lookupMatcherCtor(StringRef MatcherName,
368                                                 const SourceRange &NameRange,
369                                                 Diagnostics *Error) {
370     return Registry::lookupMatcherCtor(MatcherName, NameRange, Error);
371   }
372   VariantMatcher actOnMatcherExpression(MatcherCtor Ctor,
373                                         const SourceRange &NameRange,
374                                         StringRef BindID,
375                                         ArrayRef<ParserValue> Args,
376                                         Diagnostics *Error) {
377     if (BindID.empty()) {
378       return Registry::constructMatcher(Ctor, NameRange, Args, Error);
379     } else {
380       return Registry::constructBoundMatcher(Ctor, NameRange, BindID, Args,
381                                              Error);
382     }
383   }
384 };
385 
386 bool Parser::parseExpression(StringRef Code, VariantValue *Value,
387                              Diagnostics *Error) {
388   RegistrySema S;
389   return parseExpression(Code, &S, Value, Error);
390 }
391 
392 bool Parser::parseExpression(StringRef Code, Sema *S,
393                              VariantValue *Value, Diagnostics *Error) {
394   CodeTokenizer Tokenizer(Code, Error);
395   if (!Parser(&Tokenizer, S, Error).parseExpressionImpl(Value)) return false;
396   if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) {
397     Error->addError(Tokenizer.peekNextToken().Range,
398                     Error->ET_ParserTrailingCode);
399     return false;
400   }
401   return true;
402 }
403 
404 llvm::Optional<DynTypedMatcher>
405 Parser::parseMatcherExpression(StringRef Code, Diagnostics *Error) {
406   RegistrySema S;
407   return parseMatcherExpression(Code, &S, Error);
408 }
409 
410 llvm::Optional<DynTypedMatcher>
411 Parser::parseMatcherExpression(StringRef Code, Parser::Sema *S,
412                                Diagnostics *Error) {
413   VariantValue Value;
414   if (!parseExpression(Code, S, &Value, Error))
415     return llvm::Optional<DynTypedMatcher>();
416   if (!Value.isMatcher()) {
417     Error->addError(SourceRange(), Error->ET_ParserNotAMatcher);
418     return llvm::Optional<DynTypedMatcher>();
419   }
420   llvm::Optional<DynTypedMatcher> Result =
421       Value.getMatcher().getSingleMatcher();
422   if (!Result.hasValue()) {
423     Error->addError(SourceRange(), Error->ET_ParserOverloadedType)
424         << Value.getTypeAsString();
425   }
426   return Result;
427 }
428 
429 }  // namespace dynamic
430 }  // namespace ast_matchers
431 }  // namespace clang
432