1 //===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 /// 10 /// \file 11 /// \brief Recursive parser implementation for the matcher expression grammar. 12 /// 13 //===----------------------------------------------------------------------===// 14 15 #include <string> 16 #include <vector> 17 18 #include "clang/ASTMatchers/Dynamic/Parser.h" 19 #include "clang/ASTMatchers/Dynamic/Registry.h" 20 #include "clang/Basic/CharInfo.h" 21 #include "llvm/ADT/Twine.h" 22 23 namespace clang { 24 namespace ast_matchers { 25 namespace dynamic { 26 27 /// \brief Simple structure to hold information for one token from the parser. 28 struct Parser::TokenInfo { 29 /// \brief Different possible tokens. 30 enum TokenKind { 31 TK_Eof = 0, 32 TK_OpenParen = 1, 33 TK_CloseParen = 2, 34 TK_Comma = 3, 35 TK_Period = 4, 36 TK_Literal = 5, 37 TK_Ident = 6, 38 TK_InvalidChar = 7, 39 TK_Error = 8 40 }; 41 42 /// \brief Some known identifiers. 43 static const char* const ID_Bind; 44 45 TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {} 46 47 StringRef Text; 48 TokenKind Kind; 49 SourceRange Range; 50 VariantValue Value; 51 }; 52 53 const char* const Parser::TokenInfo::ID_Bind = "bind"; 54 55 /// \brief Simple tokenizer for the parser. 56 class Parser::CodeTokenizer { 57 public: 58 explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error) 59 : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error) { 60 NextToken = getNextToken(); 61 } 62 63 /// \brief Returns but doesn't consume the next token. 64 const TokenInfo &peekNextToken() const { return NextToken; } 65 66 /// \brief Consumes and returns the next token. 67 TokenInfo consumeNextToken() { 68 TokenInfo ThisToken = NextToken; 69 NextToken = getNextToken(); 70 return ThisToken; 71 } 72 73 TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; } 74 75 private: 76 TokenInfo getNextToken() { 77 consumeWhitespace(); 78 TokenInfo Result; 79 Result.Range.Start = currentLocation(); 80 81 if (Code.empty()) { 82 Result.Kind = TokenInfo::TK_Eof; 83 Result.Text = ""; 84 return Result; 85 } 86 87 switch (Code[0]) { 88 case ',': 89 Result.Kind = TokenInfo::TK_Comma; 90 Result.Text = Code.substr(0, 1); 91 Code = Code.drop_front(); 92 break; 93 case '.': 94 Result.Kind = TokenInfo::TK_Period; 95 Result.Text = Code.substr(0, 1); 96 Code = Code.drop_front(); 97 break; 98 case '(': 99 Result.Kind = TokenInfo::TK_OpenParen; 100 Result.Text = Code.substr(0, 1); 101 Code = Code.drop_front(); 102 break; 103 case ')': 104 Result.Kind = TokenInfo::TK_CloseParen; 105 Result.Text = Code.substr(0, 1); 106 Code = Code.drop_front(); 107 break; 108 109 case '"': 110 case '\'': 111 // Parse a string literal. 112 consumeStringLiteral(&Result); 113 break; 114 115 default: 116 if (isAlphanumeric(Code[0])) { 117 // Parse an identifier 118 size_t TokenLength = 1; 119 while (TokenLength < Code.size() && isAlphanumeric(Code[TokenLength])) 120 ++TokenLength; 121 Result.Kind = TokenInfo::TK_Ident; 122 Result.Text = Code.substr(0, TokenLength); 123 Code = Code.drop_front(TokenLength); 124 } else { 125 Result.Kind = TokenInfo::TK_InvalidChar; 126 Result.Text = Code.substr(0, 1); 127 Code = Code.drop_front(1); 128 } 129 break; 130 } 131 132 Result.Range.End = currentLocation(); 133 return Result; 134 } 135 136 /// \brief Consume a string literal. 137 /// 138 /// \c Code must be positioned at the start of the literal (the opening 139 /// quote). Consumed until it finds the same closing quote character. 140 void consumeStringLiteral(TokenInfo *Result) { 141 bool InEscape = false; 142 const char Marker = Code[0]; 143 for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) { 144 if (InEscape) { 145 InEscape = false; 146 continue; 147 } 148 if (Code[Length] == '\\') { 149 InEscape = true; 150 continue; 151 } 152 if (Code[Length] == Marker) { 153 Result->Kind = TokenInfo::TK_Literal; 154 Result->Text = Code.substr(0, Length + 1); 155 Result->Value = Code.substr(1, Length - 1).str(); 156 Code = Code.drop_front(Length + 1); 157 return; 158 } 159 } 160 161 StringRef ErrorText = Code; 162 Code = Code.drop_front(Code.size()); 163 SourceRange Range; 164 Range.Start = Result->Range.Start; 165 Range.End = currentLocation(); 166 Error->pushErrorFrame(Range, Error->ET_ParserStringError) 167 << ErrorText; 168 Result->Kind = TokenInfo::TK_Error; 169 } 170 171 /// \brief Consume all leading whitespace from \c Code. 172 void consumeWhitespace() { 173 while (!Code.empty() && isWhitespace(Code[0])) { 174 if (Code[0] == '\n') { 175 ++Line; 176 StartOfLine = Code.drop_front(); 177 } 178 Code = Code.drop_front(); 179 } 180 } 181 182 SourceLocation currentLocation() { 183 SourceLocation Location; 184 Location.Line = Line; 185 Location.Column = Code.data() - StartOfLine.data() + 1; 186 return Location; 187 } 188 189 StringRef Code; 190 StringRef StartOfLine; 191 unsigned Line; 192 Diagnostics *Error; 193 TokenInfo NextToken; 194 }; 195 196 Parser::Sema::~Sema() {} 197 198 /// \brief Parse and validate a matcher expression. 199 /// \return \c true on success, in which case \c Value has the matcher parsed. 200 /// If the input is malformed, or some argument has an error, it 201 /// returns \c false. 202 bool Parser::parseMatcherExpressionImpl(VariantValue *Value) { 203 const TokenInfo NameToken = Tokenizer->consumeNextToken(); 204 assert(NameToken.Kind == TokenInfo::TK_Ident); 205 const TokenInfo OpenToken = Tokenizer->consumeNextToken(); 206 if (OpenToken.Kind != TokenInfo::TK_OpenParen) { 207 Error->pushErrorFrame(OpenToken.Range, Error->ET_ParserNoOpenParen) 208 << OpenToken.Text; 209 return false; 210 } 211 212 std::vector<ParserValue> Args; 213 TokenInfo EndToken; 214 while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) { 215 if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) { 216 // End of args. 217 EndToken = Tokenizer->consumeNextToken(); 218 break; 219 } 220 if (Args.size() > 0) { 221 // We must find a , token to continue. 222 const TokenInfo CommaToken = Tokenizer->consumeNextToken(); 223 if (CommaToken.Kind != TokenInfo::TK_Comma) { 224 Error->pushErrorFrame(CommaToken.Range, Error->ET_ParserNoComma) 225 << CommaToken.Text; 226 return false; 227 } 228 } 229 230 ParserValue ArgValue; 231 ArgValue.Text = Tokenizer->peekNextToken().Text; 232 ArgValue.Range = Tokenizer->peekNextToken().Range; 233 if (!parseExpressionImpl(&ArgValue.Value)) { 234 Error->pushErrorFrame(NameToken.Range, 235 Error->ET_ParserMatcherArgFailure) 236 << (Args.size() + 1) << NameToken.Text; 237 return false; 238 } 239 240 Args.push_back(ArgValue); 241 } 242 243 if (EndToken.Kind == TokenInfo::TK_Eof) { 244 Error->pushErrorFrame(OpenToken.Range, Error->ET_ParserNoCloseParen); 245 return false; 246 } 247 248 std::string BindID; 249 if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) { 250 // Parse .bind("foo") 251 Tokenizer->consumeNextToken(); // consume the period. 252 const TokenInfo BindToken = Tokenizer->consumeNextToken(); 253 const TokenInfo OpenToken = Tokenizer->consumeNextToken(); 254 const TokenInfo IDToken = Tokenizer->consumeNextToken(); 255 const TokenInfo CloseToken = Tokenizer->consumeNextToken(); 256 257 // TODO: We could use different error codes for each/some to be more 258 // explicit about the syntax error. 259 if (BindToken.Kind != TokenInfo::TK_Ident || 260 BindToken.Text != TokenInfo::ID_Bind) { 261 Error->pushErrorFrame(BindToken.Range, Error->ET_ParserMalformedBindExpr); 262 return false; 263 } 264 if (OpenToken.Kind != TokenInfo::TK_OpenParen) { 265 Error->pushErrorFrame(OpenToken.Range, Error->ET_ParserMalformedBindExpr); 266 return false; 267 } 268 if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) { 269 Error->pushErrorFrame(IDToken.Range, Error->ET_ParserMalformedBindExpr); 270 return false; 271 } 272 if (CloseToken.Kind != TokenInfo::TK_CloseParen) { 273 Error->pushErrorFrame(CloseToken.Range, 274 Error->ET_ParserMalformedBindExpr); 275 return false; 276 } 277 BindID = IDToken.Value.getString(); 278 } 279 280 // Merge the start and end infos. 281 SourceRange MatcherRange = NameToken.Range; 282 MatcherRange.End = EndToken.Range.End; 283 DynTypedMatcher *Result = S->actOnMatcherExpression( 284 NameToken.Text, MatcherRange, BindID, Args, Error); 285 if (Result == NULL) { 286 Error->pushErrorFrame(NameToken.Range, Error->ET_ParserMatcherFailure) 287 << NameToken.Text; 288 return false; 289 } 290 291 Value->takeMatcher(Result); 292 return true; 293 } 294 295 /// \brief Parse an <Expresssion> 296 bool Parser::parseExpressionImpl(VariantValue *Value) { 297 switch (Tokenizer->nextTokenKind()) { 298 case TokenInfo::TK_Literal: 299 *Value = Tokenizer->consumeNextToken().Value; 300 return true; 301 302 case TokenInfo::TK_Ident: 303 return parseMatcherExpressionImpl(Value); 304 305 case TokenInfo::TK_Eof: 306 Error->pushErrorFrame(Tokenizer->consumeNextToken().Range, 307 Error->ET_ParserNoCode); 308 return false; 309 310 case TokenInfo::TK_Error: 311 // This error was already reported by the tokenizer. 312 return false; 313 314 case TokenInfo::TK_OpenParen: 315 case TokenInfo::TK_CloseParen: 316 case TokenInfo::TK_Comma: 317 case TokenInfo::TK_Period: 318 case TokenInfo::TK_InvalidChar: 319 const TokenInfo Token = Tokenizer->consumeNextToken(); 320 Error->pushErrorFrame(Token.Range, Error->ET_ParserInvalidToken) 321 << Token.Text; 322 return false; 323 } 324 325 llvm_unreachable("Unknown token kind."); 326 } 327 328 Parser::Parser(CodeTokenizer *Tokenizer, Sema *S, 329 Diagnostics *Error) 330 : Tokenizer(Tokenizer), S(S), Error(Error) {} 331 332 class RegistrySema : public Parser::Sema { 333 public: 334 virtual ~RegistrySema() {} 335 DynTypedMatcher *actOnMatcherExpression(StringRef MatcherName, 336 const SourceRange &NameRange, 337 StringRef BindID, 338 ArrayRef<ParserValue> Args, 339 Diagnostics *Error) { 340 if (BindID.empty()) { 341 return Registry::constructMatcher(MatcherName, NameRange, Args, Error); 342 } else { 343 return Registry::constructBoundMatcher(MatcherName, NameRange, BindID, 344 Args, Error); 345 } 346 } 347 }; 348 349 bool Parser::parseExpression(StringRef Code, VariantValue *Value, 350 Diagnostics *Error) { 351 RegistrySema S; 352 return parseExpression(Code, &S, Value, Error); 353 } 354 355 bool Parser::parseExpression(StringRef Code, Sema *S, 356 VariantValue *Value, Diagnostics *Error) { 357 CodeTokenizer Tokenizer(Code, Error); 358 if (!Parser(&Tokenizer, S, Error).parseExpressionImpl(Value)) return false; 359 if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) { 360 Error->pushErrorFrame(Tokenizer.peekNextToken().Range, 361 Error->ET_ParserTrailingCode); 362 return false; 363 } 364 return true; 365 } 366 367 DynTypedMatcher *Parser::parseMatcherExpression(StringRef Code, 368 Diagnostics *Error) { 369 RegistrySema S; 370 return parseMatcherExpression(Code, &S, Error); 371 } 372 373 DynTypedMatcher *Parser::parseMatcherExpression(StringRef Code, 374 Parser::Sema *S, 375 Diagnostics *Error) { 376 VariantValue Value; 377 if (!parseExpression(Code, S, &Value, Error)) 378 return NULL; 379 if (!Value.isMatcher()) { 380 Error->pushErrorFrame(SourceRange(), Error->ET_ParserNotAMatcher); 381 return NULL; 382 } 383 return Value.getMatcher().clone(); 384 } 385 386 } // namespace dynamic 387 } // namespace ast_matchers 388 } // namespace clang 389