xref: /llvm-project/clang/lib/ASTMatchers/Dynamic/Parser.cpp (revision 00cba4f6dda891c5f3e7fd904a6f6d992e9e0702)
1 //===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 ///
10 /// \file
11 /// \brief Recursive parser implementation for the matcher expression grammar.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include <string>
16 #include <vector>
17 
18 #include "clang/ASTMatchers/Dynamic/Parser.h"
19 #include "clang/ASTMatchers/Dynamic/Registry.h"
20 #include "clang/Basic/CharInfo.h"
21 #include "llvm/ADT/Optional.h"
22 #include "llvm/ADT/Twine.h"
23 
24 namespace clang {
25 namespace ast_matchers {
26 namespace dynamic {
27 
28 /// \brief Simple structure to hold information for one token from the parser.
29 struct Parser::TokenInfo {
30   /// \brief Different possible tokens.
31   enum TokenKind {
32     TK_Eof = 0,
33     TK_OpenParen = 1,
34     TK_CloseParen = 2,
35     TK_Comma = 3,
36     TK_Period = 4,
37     TK_Literal = 5,
38     TK_Ident = 6,
39     TK_InvalidChar = 7,
40     TK_Error = 8
41   };
42 
43   /// \brief Some known identifiers.
44   static const char* const ID_Bind;
45 
46   TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {}
47 
48   StringRef Text;
49   TokenKind Kind;
50   SourceRange Range;
51   VariantValue Value;
52 };
53 
54 const char* const Parser::TokenInfo::ID_Bind = "bind";
55 
56 /// \brief Simple tokenizer for the parser.
57 class Parser::CodeTokenizer {
58 public:
59   explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error)
60       : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error) {
61     NextToken = getNextToken();
62   }
63 
64   /// \brief Returns but doesn't consume the next token.
65   const TokenInfo &peekNextToken() const { return NextToken; }
66 
67   /// \brief Consumes and returns the next token.
68   TokenInfo consumeNextToken() {
69     TokenInfo ThisToken = NextToken;
70     NextToken = getNextToken();
71     return ThisToken;
72   }
73 
74   TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; }
75 
76 private:
77   TokenInfo getNextToken() {
78     consumeWhitespace();
79     TokenInfo Result;
80     Result.Range.Start = currentLocation();
81 
82     if (Code.empty()) {
83       Result.Kind = TokenInfo::TK_Eof;
84       Result.Text = "";
85       return Result;
86     }
87 
88     switch (Code[0]) {
89     case ',':
90       Result.Kind = TokenInfo::TK_Comma;
91       Result.Text = Code.substr(0, 1);
92       Code = Code.drop_front();
93       break;
94     case '.':
95       Result.Kind = TokenInfo::TK_Period;
96       Result.Text = Code.substr(0, 1);
97       Code = Code.drop_front();
98       break;
99     case '(':
100       Result.Kind = TokenInfo::TK_OpenParen;
101       Result.Text = Code.substr(0, 1);
102       Code = Code.drop_front();
103       break;
104     case ')':
105       Result.Kind = TokenInfo::TK_CloseParen;
106       Result.Text = Code.substr(0, 1);
107       Code = Code.drop_front();
108       break;
109 
110     case '"':
111     case '\'':
112       // Parse a string literal.
113       consumeStringLiteral(&Result);
114       break;
115 
116     case '0': case '1': case '2': case '3': case '4':
117     case '5': case '6': case '7': case '8': case '9':
118       // Parse an unsigned literal.
119       consumeUnsignedLiteral(&Result);
120       break;
121 
122     default:
123       if (isAlphanumeric(Code[0])) {
124         // Parse an identifier
125         size_t TokenLength = 1;
126         while (TokenLength < Code.size() && isAlphanumeric(Code[TokenLength]))
127           ++TokenLength;
128         Result.Kind = TokenInfo::TK_Ident;
129         Result.Text = Code.substr(0, TokenLength);
130         Code = Code.drop_front(TokenLength);
131       } else {
132         Result.Kind = TokenInfo::TK_InvalidChar;
133         Result.Text = Code.substr(0, 1);
134         Code = Code.drop_front(1);
135       }
136       break;
137     }
138 
139     Result.Range.End = currentLocation();
140     return Result;
141   }
142 
143   /// \brief Consume an unsigned literal.
144   void consumeUnsignedLiteral(TokenInfo *Result) {
145     unsigned Length = 1;
146     if (Code.size() > 1) {
147       // Consume the 'x' or 'b' radix modifier, if present.
148       switch (toLowercase(Code[1])) {
149       case 'x': case 'b': Length = 2;
150       }
151     }
152     while (Length < Code.size() && isHexDigit(Code[Length]))
153       ++Length;
154 
155     Result->Text = Code.substr(0, Length);
156     Code = Code.drop_front(Length);
157 
158     unsigned Value;
159     if (!Result->Text.getAsInteger(0, Value)) {
160       Result->Kind = TokenInfo::TK_Literal;
161       Result->Value = Value;
162     } else {
163       SourceRange Range;
164       Range.Start = Result->Range.Start;
165       Range.End = currentLocation();
166       Error->addError(Range, Error->ET_ParserUnsignedError) << Result->Text;
167       Result->Kind = TokenInfo::TK_Error;
168     }
169   }
170 
171   /// \brief Consume a string literal.
172   ///
173   /// \c Code must be positioned at the start of the literal (the opening
174   /// quote). Consumed until it finds the same closing quote character.
175   void consumeStringLiteral(TokenInfo *Result) {
176     bool InEscape = false;
177     const char Marker = Code[0];
178     for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) {
179       if (InEscape) {
180         InEscape = false;
181         continue;
182       }
183       if (Code[Length] == '\\') {
184         InEscape = true;
185         continue;
186       }
187       if (Code[Length] == Marker) {
188         Result->Kind = TokenInfo::TK_Literal;
189         Result->Text = Code.substr(0, Length + 1);
190         Result->Value = Code.substr(1, Length - 1).str();
191         Code = Code.drop_front(Length + 1);
192         return;
193       }
194     }
195 
196     StringRef ErrorText = Code;
197     Code = Code.drop_front(Code.size());
198     SourceRange Range;
199     Range.Start = Result->Range.Start;
200     Range.End = currentLocation();
201     Error->addError(Range, Error->ET_ParserStringError) << ErrorText;
202     Result->Kind = TokenInfo::TK_Error;
203   }
204 
205   /// \brief Consume all leading whitespace from \c Code.
206   void consumeWhitespace() {
207     while (!Code.empty() && isWhitespace(Code[0])) {
208       if (Code[0] == '\n') {
209         ++Line;
210         StartOfLine = Code.drop_front();
211       }
212       Code = Code.drop_front();
213     }
214   }
215 
216   SourceLocation currentLocation() {
217     SourceLocation Location;
218     Location.Line = Line;
219     Location.Column = Code.data() - StartOfLine.data() + 1;
220     return Location;
221   }
222 
223   StringRef Code;
224   StringRef StartOfLine;
225   unsigned Line;
226   Diagnostics *Error;
227   TokenInfo NextToken;
228 };
229 
230 Parser::Sema::~Sema() {}
231 
232 /// \brief Parse and validate a matcher expression.
233 /// \return \c true on success, in which case \c Value has the matcher parsed.
234 ///   If the input is malformed, or some argument has an error, it
235 ///   returns \c false.
236 bool Parser::parseMatcherExpressionImpl(VariantValue *Value) {
237   const TokenInfo NameToken = Tokenizer->consumeNextToken();
238   assert(NameToken.Kind == TokenInfo::TK_Ident);
239   const TokenInfo OpenToken = Tokenizer->consumeNextToken();
240   if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
241     Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen)
242         << OpenToken.Text;
243     return false;
244   }
245 
246   llvm::Optional<MatcherCtor> Ctor =
247       S->lookupMatcherCtor(NameToken.Text, NameToken.Range, Error);
248 
249   std::vector<ParserValue> Args;
250   TokenInfo EndToken;
251   while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) {
252     if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) {
253       // End of args.
254       EndToken = Tokenizer->consumeNextToken();
255       break;
256     }
257     if (Args.size() > 0) {
258       // We must find a , token to continue.
259       const TokenInfo CommaToken = Tokenizer->consumeNextToken();
260       if (CommaToken.Kind != TokenInfo::TK_Comma) {
261         Error->addError(CommaToken.Range, Error->ET_ParserNoComma)
262             << CommaToken.Text;
263         return false;
264       }
265     }
266 
267     Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error,
268                              NameToken.Text, NameToken.Range, Args.size() + 1);
269     ParserValue ArgValue;
270     ArgValue.Text = Tokenizer->peekNextToken().Text;
271     ArgValue.Range = Tokenizer->peekNextToken().Range;
272     if (!parseExpressionImpl(&ArgValue.Value)) return false;
273 
274     Args.push_back(ArgValue);
275   }
276 
277   if (EndToken.Kind == TokenInfo::TK_Eof) {
278     Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen);
279     return false;
280   }
281 
282   std::string BindID;
283   if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) {
284     // Parse .bind("foo")
285     Tokenizer->consumeNextToken();  // consume the period.
286     const TokenInfo BindToken = Tokenizer->consumeNextToken();
287     const TokenInfo OpenToken = Tokenizer->consumeNextToken();
288     const TokenInfo IDToken = Tokenizer->consumeNextToken();
289     const TokenInfo CloseToken = Tokenizer->consumeNextToken();
290 
291     // TODO: We could use different error codes for each/some to be more
292     //       explicit about the syntax error.
293     if (BindToken.Kind != TokenInfo::TK_Ident ||
294         BindToken.Text != TokenInfo::ID_Bind) {
295       Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr);
296       return false;
297     }
298     if (OpenToken.Kind != TokenInfo::TK_OpenParen) {
299       Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr);
300       return false;
301     }
302     if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) {
303       Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr);
304       return false;
305     }
306     if (CloseToken.Kind != TokenInfo::TK_CloseParen) {
307       Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr);
308       return false;
309     }
310     BindID = IDToken.Value.getString();
311   }
312 
313   if (!Ctor)
314     return false;
315 
316   // Merge the start and end infos.
317   Diagnostics::Context Ctx(Diagnostics::Context::ConstructMatcher, Error,
318                            NameToken.Text, NameToken.Range);
319   SourceRange MatcherRange = NameToken.Range;
320   MatcherRange.End = EndToken.Range.End;
321   VariantMatcher Result = S->actOnMatcherExpression(
322       *Ctor, MatcherRange, BindID, Args, Error);
323   if (Result.isNull()) return false;
324 
325   *Value = Result;
326   return true;
327 }
328 
329 /// \brief Parse an <Expresssion>
330 bool Parser::parseExpressionImpl(VariantValue *Value) {
331   switch (Tokenizer->nextTokenKind()) {
332   case TokenInfo::TK_Literal:
333     *Value = Tokenizer->consumeNextToken().Value;
334     return true;
335 
336   case TokenInfo::TK_Ident:
337     return parseMatcherExpressionImpl(Value);
338 
339   case TokenInfo::TK_Eof:
340     Error->addError(Tokenizer->consumeNextToken().Range,
341                     Error->ET_ParserNoCode);
342     return false;
343 
344   case TokenInfo::TK_Error:
345     // This error was already reported by the tokenizer.
346     return false;
347 
348   case TokenInfo::TK_OpenParen:
349   case TokenInfo::TK_CloseParen:
350   case TokenInfo::TK_Comma:
351   case TokenInfo::TK_Period:
352   case TokenInfo::TK_InvalidChar:
353     const TokenInfo Token = Tokenizer->consumeNextToken();
354     Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text;
355     return false;
356   }
357 
358   llvm_unreachable("Unknown token kind.");
359 }
360 
361 Parser::Parser(CodeTokenizer *Tokenizer, Sema *S,
362                Diagnostics *Error)
363     : Tokenizer(Tokenizer), S(S), Error(Error) {}
364 
365 class RegistrySema : public Parser::Sema {
366 public:
367   virtual ~RegistrySema() {}
368   llvm::Optional<MatcherCtor> lookupMatcherCtor(StringRef MatcherName,
369                                                 const SourceRange &NameRange,
370                                                 Diagnostics *Error) {
371     return Registry::lookupMatcherCtor(MatcherName, NameRange, Error);
372   }
373   VariantMatcher actOnMatcherExpression(MatcherCtor Ctor,
374                                         const SourceRange &NameRange,
375                                         StringRef BindID,
376                                         ArrayRef<ParserValue> Args,
377                                         Diagnostics *Error) {
378     if (BindID.empty()) {
379       return Registry::constructMatcher(Ctor, NameRange, Args, Error);
380     } else {
381       return Registry::constructBoundMatcher(Ctor, NameRange, BindID, Args,
382                                              Error);
383     }
384   }
385 };
386 
387 bool Parser::parseExpression(StringRef Code, VariantValue *Value,
388                              Diagnostics *Error) {
389   RegistrySema S;
390   return parseExpression(Code, &S, Value, Error);
391 }
392 
393 bool Parser::parseExpression(StringRef Code, Sema *S,
394                              VariantValue *Value, Diagnostics *Error) {
395   CodeTokenizer Tokenizer(Code, Error);
396   if (!Parser(&Tokenizer, S, Error).parseExpressionImpl(Value)) return false;
397   if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) {
398     Error->addError(Tokenizer.peekNextToken().Range,
399                     Error->ET_ParserTrailingCode);
400     return false;
401   }
402   return true;
403 }
404 
405 llvm::Optional<DynTypedMatcher>
406 Parser::parseMatcherExpression(StringRef Code, Diagnostics *Error) {
407   RegistrySema S;
408   return parseMatcherExpression(Code, &S, Error);
409 }
410 
411 llvm::Optional<DynTypedMatcher>
412 Parser::parseMatcherExpression(StringRef Code, Parser::Sema *S,
413                                Diagnostics *Error) {
414   VariantValue Value;
415   if (!parseExpression(Code, S, &Value, Error))
416     return llvm::Optional<DynTypedMatcher>();
417   if (!Value.isMatcher()) {
418     Error->addError(SourceRange(), Error->ET_ParserNotAMatcher);
419     return llvm::Optional<DynTypedMatcher>();
420   }
421   llvm::Optional<DynTypedMatcher> Result =
422       Value.getMatcher().getSingleMatcher();
423   if (!Result.hasValue()) {
424     Error->addError(SourceRange(), Error->ET_ParserOverloadedType)
425         << Value.getTypeAsString();
426   }
427   return Result;
428 }
429 
430 }  // namespace dynamic
431 }  // namespace ast_matchers
432 }  // namespace clang
433