1 //===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 /// 10 /// \file 11 /// \brief Recursive parser implementation for the matcher expression grammar. 12 /// 13 //===----------------------------------------------------------------------===// 14 15 #include <string> 16 #include <vector> 17 18 #include "clang/ASTMatchers/Dynamic/Parser.h" 19 #include "clang/ASTMatchers/Dynamic/Registry.h" 20 #include "clang/Basic/CharInfo.h" 21 #include "llvm/ADT/Twine.h" 22 23 namespace clang { 24 namespace ast_matchers { 25 namespace dynamic { 26 27 /// \brief Simple structure to hold information for one token from the parser. 28 struct Parser::TokenInfo { 29 /// \brief Different possible tokens. 30 enum TokenKind { 31 TK_Eof = 0, 32 TK_OpenParen = 1, 33 TK_CloseParen = 2, 34 TK_Comma = 3, 35 TK_Period = 4, 36 TK_Literal = 5, 37 TK_Ident = 6, 38 TK_InvalidChar = 7, 39 TK_Error = 8 40 }; 41 42 /// \brief Some known identifiers. 43 static const char* const ID_Bind; 44 45 TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {} 46 47 StringRef Text; 48 TokenKind Kind; 49 SourceRange Range; 50 VariantValue Value; 51 }; 52 53 const char* const Parser::TokenInfo::ID_Bind = "bind"; 54 55 /// \brief Simple tokenizer for the parser. 56 class Parser::CodeTokenizer { 57 public: 58 explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error) 59 : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error) { 60 NextToken = getNextToken(); 61 } 62 63 /// \brief Returns but doesn't consume the next token. 64 const TokenInfo &peekNextToken() const { return NextToken; } 65 66 /// \brief Consumes and returns the next token. 67 TokenInfo consumeNextToken() { 68 TokenInfo ThisToken = NextToken; 69 NextToken = getNextToken(); 70 return ThisToken; 71 } 72 73 TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; } 74 75 private: 76 TokenInfo getNextToken() { 77 consumeWhitespace(); 78 TokenInfo Result; 79 Result.Range.Start = currentLocation(); 80 81 if (Code.empty()) { 82 Result.Kind = TokenInfo::TK_Eof; 83 Result.Text = ""; 84 return Result; 85 } 86 87 switch (Code[0]) { 88 case ',': 89 Result.Kind = TokenInfo::TK_Comma; 90 Result.Text = Code.substr(0, 1); 91 Code = Code.drop_front(); 92 break; 93 case '.': 94 Result.Kind = TokenInfo::TK_Period; 95 Result.Text = Code.substr(0, 1); 96 Code = Code.drop_front(); 97 break; 98 case '(': 99 Result.Kind = TokenInfo::TK_OpenParen; 100 Result.Text = Code.substr(0, 1); 101 Code = Code.drop_front(); 102 break; 103 case ')': 104 Result.Kind = TokenInfo::TK_CloseParen; 105 Result.Text = Code.substr(0, 1); 106 Code = Code.drop_front(); 107 break; 108 109 case '"': 110 case '\'': 111 // Parse a string literal. 112 consumeStringLiteral(&Result); 113 break; 114 115 case '0': case '1': case '2': case '3': case '4': 116 case '5': case '6': case '7': case '8': case '9': 117 // Parse an unsigned literal. 118 consumeUnsignedLiteral(&Result); 119 break; 120 121 default: 122 if (isAlphanumeric(Code[0])) { 123 // Parse an identifier 124 size_t TokenLength = 1; 125 while (TokenLength < Code.size() && isAlphanumeric(Code[TokenLength])) 126 ++TokenLength; 127 Result.Kind = TokenInfo::TK_Ident; 128 Result.Text = Code.substr(0, TokenLength); 129 Code = Code.drop_front(TokenLength); 130 } else { 131 Result.Kind = TokenInfo::TK_InvalidChar; 132 Result.Text = Code.substr(0, 1); 133 Code = Code.drop_front(1); 134 } 135 break; 136 } 137 138 Result.Range.End = currentLocation(); 139 return Result; 140 } 141 142 /// \brief Consume an unsigned literal. 143 void consumeUnsignedLiteral(TokenInfo *Result) { 144 unsigned Length = 1; 145 if (Code.size() > 1) { 146 // Consume the 'x' or 'b' radix modifier, if present. 147 switch (toLowercase(Code[1])) { 148 case 'x': case 'b': Length = 2; 149 } 150 } 151 while (Length < Code.size() && isHexDigit(Code[Length])) 152 ++Length; 153 154 Result->Text = Code.substr(0, Length); 155 Code = Code.drop_front(Length); 156 157 unsigned Value; 158 if (!Result->Text.getAsInteger(0, Value)) { 159 Result->Kind = TokenInfo::TK_Literal; 160 Result->Value = Value; 161 } else { 162 SourceRange Range; 163 Range.Start = Result->Range.Start; 164 Range.End = currentLocation(); 165 Error->pushErrorFrame(Range, Error->ET_ParserUnsignedError) 166 << Result->Text; 167 Result->Kind = TokenInfo::TK_Error; 168 } 169 } 170 171 /// \brief Consume a string literal. 172 /// 173 /// \c Code must be positioned at the start of the literal (the opening 174 /// quote). Consumed until it finds the same closing quote character. 175 void consumeStringLiteral(TokenInfo *Result) { 176 bool InEscape = false; 177 const char Marker = Code[0]; 178 for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) { 179 if (InEscape) { 180 InEscape = false; 181 continue; 182 } 183 if (Code[Length] == '\\') { 184 InEscape = true; 185 continue; 186 } 187 if (Code[Length] == Marker) { 188 Result->Kind = TokenInfo::TK_Literal; 189 Result->Text = Code.substr(0, Length + 1); 190 Result->Value = Code.substr(1, Length - 1).str(); 191 Code = Code.drop_front(Length + 1); 192 return; 193 } 194 } 195 196 StringRef ErrorText = Code; 197 Code = Code.drop_front(Code.size()); 198 SourceRange Range; 199 Range.Start = Result->Range.Start; 200 Range.End = currentLocation(); 201 Error->pushErrorFrame(Range, Error->ET_ParserStringError) 202 << ErrorText; 203 Result->Kind = TokenInfo::TK_Error; 204 } 205 206 /// \brief Consume all leading whitespace from \c Code. 207 void consumeWhitespace() { 208 while (!Code.empty() && isWhitespace(Code[0])) { 209 if (Code[0] == '\n') { 210 ++Line; 211 StartOfLine = Code.drop_front(); 212 } 213 Code = Code.drop_front(); 214 } 215 } 216 217 SourceLocation currentLocation() { 218 SourceLocation Location; 219 Location.Line = Line; 220 Location.Column = Code.data() - StartOfLine.data() + 1; 221 return Location; 222 } 223 224 StringRef Code; 225 StringRef StartOfLine; 226 unsigned Line; 227 Diagnostics *Error; 228 TokenInfo NextToken; 229 }; 230 231 Parser::Sema::~Sema() {} 232 233 /// \brief Parse and validate a matcher expression. 234 /// \return \c true on success, in which case \c Value has the matcher parsed. 235 /// If the input is malformed, or some argument has an error, it 236 /// returns \c false. 237 bool Parser::parseMatcherExpressionImpl(VariantValue *Value) { 238 const TokenInfo NameToken = Tokenizer->consumeNextToken(); 239 assert(NameToken.Kind == TokenInfo::TK_Ident); 240 const TokenInfo OpenToken = Tokenizer->consumeNextToken(); 241 if (OpenToken.Kind != TokenInfo::TK_OpenParen) { 242 Error->pushErrorFrame(OpenToken.Range, Error->ET_ParserNoOpenParen) 243 << OpenToken.Text; 244 return false; 245 } 246 247 std::vector<ParserValue> Args; 248 TokenInfo EndToken; 249 while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) { 250 if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) { 251 // End of args. 252 EndToken = Tokenizer->consumeNextToken(); 253 break; 254 } 255 if (Args.size() > 0) { 256 // We must find a , token to continue. 257 const TokenInfo CommaToken = Tokenizer->consumeNextToken(); 258 if (CommaToken.Kind != TokenInfo::TK_Comma) { 259 Error->pushErrorFrame(CommaToken.Range, Error->ET_ParserNoComma) 260 << CommaToken.Text; 261 return false; 262 } 263 } 264 265 ParserValue ArgValue; 266 ArgValue.Text = Tokenizer->peekNextToken().Text; 267 ArgValue.Range = Tokenizer->peekNextToken().Range; 268 if (!parseExpressionImpl(&ArgValue.Value)) { 269 Error->pushErrorFrame(NameToken.Range, 270 Error->ET_ParserMatcherArgFailure) 271 << (Args.size() + 1) << NameToken.Text; 272 return false; 273 } 274 275 Args.push_back(ArgValue); 276 } 277 278 if (EndToken.Kind == TokenInfo::TK_Eof) { 279 Error->pushErrorFrame(OpenToken.Range, Error->ET_ParserNoCloseParen); 280 return false; 281 } 282 283 std::string BindID; 284 if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) { 285 // Parse .bind("foo") 286 Tokenizer->consumeNextToken(); // consume the period. 287 const TokenInfo BindToken = Tokenizer->consumeNextToken(); 288 const TokenInfo OpenToken = Tokenizer->consumeNextToken(); 289 const TokenInfo IDToken = Tokenizer->consumeNextToken(); 290 const TokenInfo CloseToken = Tokenizer->consumeNextToken(); 291 292 // TODO: We could use different error codes for each/some to be more 293 // explicit about the syntax error. 294 if (BindToken.Kind != TokenInfo::TK_Ident || 295 BindToken.Text != TokenInfo::ID_Bind) { 296 Error->pushErrorFrame(BindToken.Range, Error->ET_ParserMalformedBindExpr); 297 return false; 298 } 299 if (OpenToken.Kind != TokenInfo::TK_OpenParen) { 300 Error->pushErrorFrame(OpenToken.Range, Error->ET_ParserMalformedBindExpr); 301 return false; 302 } 303 if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) { 304 Error->pushErrorFrame(IDToken.Range, Error->ET_ParserMalformedBindExpr); 305 return false; 306 } 307 if (CloseToken.Kind != TokenInfo::TK_CloseParen) { 308 Error->pushErrorFrame(CloseToken.Range, 309 Error->ET_ParserMalformedBindExpr); 310 return false; 311 } 312 BindID = IDToken.Value.getString(); 313 } 314 315 // Merge the start and end infos. 316 SourceRange MatcherRange = NameToken.Range; 317 MatcherRange.End = EndToken.Range.End; 318 MatcherList Result = S->actOnMatcherExpression( 319 NameToken.Text, MatcherRange, BindID, Args, Error); 320 if (Result.empty()) { 321 Error->pushErrorFrame(NameToken.Range, Error->ET_ParserMatcherFailure) 322 << NameToken.Text; 323 return false; 324 } 325 326 *Value = Result; 327 return true; 328 } 329 330 /// \brief Parse an <Expresssion> 331 bool Parser::parseExpressionImpl(VariantValue *Value) { 332 switch (Tokenizer->nextTokenKind()) { 333 case TokenInfo::TK_Literal: 334 *Value = Tokenizer->consumeNextToken().Value; 335 return true; 336 337 case TokenInfo::TK_Ident: 338 return parseMatcherExpressionImpl(Value); 339 340 case TokenInfo::TK_Eof: 341 Error->pushErrorFrame(Tokenizer->consumeNextToken().Range, 342 Error->ET_ParserNoCode); 343 return false; 344 345 case TokenInfo::TK_Error: 346 // This error was already reported by the tokenizer. 347 return false; 348 349 case TokenInfo::TK_OpenParen: 350 case TokenInfo::TK_CloseParen: 351 case TokenInfo::TK_Comma: 352 case TokenInfo::TK_Period: 353 case TokenInfo::TK_InvalidChar: 354 const TokenInfo Token = Tokenizer->consumeNextToken(); 355 Error->pushErrorFrame(Token.Range, Error->ET_ParserInvalidToken) 356 << Token.Text; 357 return false; 358 } 359 360 llvm_unreachable("Unknown token kind."); 361 } 362 363 Parser::Parser(CodeTokenizer *Tokenizer, Sema *S, 364 Diagnostics *Error) 365 : Tokenizer(Tokenizer), S(S), Error(Error) {} 366 367 class RegistrySema : public Parser::Sema { 368 public: 369 virtual ~RegistrySema() {} 370 MatcherList actOnMatcherExpression(StringRef MatcherName, 371 const SourceRange &NameRange, 372 StringRef BindID, 373 ArrayRef<ParserValue> Args, 374 Diagnostics *Error) { 375 if (BindID.empty()) { 376 return Registry::constructMatcher(MatcherName, NameRange, Args, Error); 377 } else { 378 return Registry::constructBoundMatcher(MatcherName, NameRange, BindID, 379 Args, Error); 380 } 381 } 382 }; 383 384 bool Parser::parseExpression(StringRef Code, VariantValue *Value, 385 Diagnostics *Error) { 386 RegistrySema S; 387 return parseExpression(Code, &S, Value, Error); 388 } 389 390 bool Parser::parseExpression(StringRef Code, Sema *S, 391 VariantValue *Value, Diagnostics *Error) { 392 CodeTokenizer Tokenizer(Code, Error); 393 if (!Parser(&Tokenizer, S, Error).parseExpressionImpl(Value)) return false; 394 if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) { 395 Error->pushErrorFrame(Tokenizer.peekNextToken().Range, 396 Error->ET_ParserTrailingCode); 397 return false; 398 } 399 return true; 400 } 401 402 DynTypedMatcher *Parser::parseMatcherExpression(StringRef Code, 403 Diagnostics *Error) { 404 RegistrySema S; 405 return parseMatcherExpression(Code, &S, Error); 406 } 407 408 DynTypedMatcher *Parser::parseMatcherExpression(StringRef Code, 409 Parser::Sema *S, 410 Diagnostics *Error) { 411 VariantValue Value; 412 if (!parseExpression(Code, S, &Value, Error)) 413 return NULL; 414 if (!Value.isMatchers()) { 415 Error->pushErrorFrame(SourceRange(), Error->ET_ParserNotAMatcher); 416 return NULL; 417 } 418 if (Value.getMatchers().matchers().size() != 1) { 419 Error->pushErrorFrame(SourceRange(), Error->ET_ParserOverloadedType) 420 << Value.getTypeAsString(); 421 return NULL; 422 } 423 return Value.getMatchers().matchers()[0]->clone(); 424 } 425 426 } // namespace dynamic 427 } // namespace ast_matchers 428 } // namespace clang 429