1 //===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 /// 10 /// \file 11 /// \brief Recursive parser implementation for the matcher expression grammar. 12 /// 13 //===----------------------------------------------------------------------===// 14 15 #include "clang/ASTMatchers/Dynamic/Parser.h" 16 #include "clang/ASTMatchers/Dynamic/Registry.h" 17 #include "clang/Basic/CharInfo.h" 18 #include "llvm/ADT/Optional.h" 19 #include "llvm/ADT/Twine.h" 20 #include <string> 21 #include <vector> 22 23 namespace clang { 24 namespace ast_matchers { 25 namespace dynamic { 26 27 /// \brief Simple structure to hold information for one token from the parser. 28 struct Parser::TokenInfo { 29 /// \brief Different possible tokens. 30 enum TokenKind { 31 TK_Eof = 0, 32 TK_OpenParen = 1, 33 TK_CloseParen = 2, 34 TK_Comma = 3, 35 TK_Period = 4, 36 TK_Literal = 5, 37 TK_Ident = 6, 38 TK_InvalidChar = 7, 39 TK_Error = 8 40 }; 41 42 /// \brief Some known identifiers. 43 static const char* const ID_Bind; 44 45 TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {} 46 47 StringRef Text; 48 TokenKind Kind; 49 SourceRange Range; 50 VariantValue Value; 51 }; 52 53 const char* const Parser::TokenInfo::ID_Bind = "bind"; 54 55 /// \brief Simple tokenizer for the parser. 56 class Parser::CodeTokenizer { 57 public: 58 explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error) 59 : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error) { 60 NextToken = getNextToken(); 61 } 62 63 /// \brief Returns but doesn't consume the next token. 64 const TokenInfo &peekNextToken() const { return NextToken; } 65 66 /// \brief Consumes and returns the next token. 67 TokenInfo consumeNextToken() { 68 TokenInfo ThisToken = NextToken; 69 NextToken = getNextToken(); 70 return ThisToken; 71 } 72 73 TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; } 74 75 private: 76 TokenInfo getNextToken() { 77 consumeWhitespace(); 78 TokenInfo Result; 79 Result.Range.Start = currentLocation(); 80 81 if (Code.empty()) { 82 Result.Kind = TokenInfo::TK_Eof; 83 Result.Text = ""; 84 return Result; 85 } 86 87 switch (Code[0]) { 88 case ',': 89 Result.Kind = TokenInfo::TK_Comma; 90 Result.Text = Code.substr(0, 1); 91 Code = Code.drop_front(); 92 break; 93 case '.': 94 Result.Kind = TokenInfo::TK_Period; 95 Result.Text = Code.substr(0, 1); 96 Code = Code.drop_front(); 97 break; 98 case '(': 99 Result.Kind = TokenInfo::TK_OpenParen; 100 Result.Text = Code.substr(0, 1); 101 Code = Code.drop_front(); 102 break; 103 case ')': 104 Result.Kind = TokenInfo::TK_CloseParen; 105 Result.Text = Code.substr(0, 1); 106 Code = Code.drop_front(); 107 break; 108 109 case '"': 110 case '\'': 111 // Parse a string literal. 112 consumeStringLiteral(&Result); 113 break; 114 115 case '0': case '1': case '2': case '3': case '4': 116 case '5': case '6': case '7': case '8': case '9': 117 // Parse an unsigned literal. 118 consumeUnsignedLiteral(&Result); 119 break; 120 121 default: 122 if (isAlphanumeric(Code[0])) { 123 // Parse an identifier 124 size_t TokenLength = 1; 125 while (TokenLength < Code.size() && isAlphanumeric(Code[TokenLength])) 126 ++TokenLength; 127 Result.Kind = TokenInfo::TK_Ident; 128 Result.Text = Code.substr(0, TokenLength); 129 Code = Code.drop_front(TokenLength); 130 } else { 131 Result.Kind = TokenInfo::TK_InvalidChar; 132 Result.Text = Code.substr(0, 1); 133 Code = Code.drop_front(1); 134 } 135 break; 136 } 137 138 Result.Range.End = currentLocation(); 139 return Result; 140 } 141 142 /// \brief Consume an unsigned literal. 143 void consumeUnsignedLiteral(TokenInfo *Result) { 144 unsigned Length = 1; 145 if (Code.size() > 1) { 146 // Consume the 'x' or 'b' radix modifier, if present. 147 switch (toLowercase(Code[1])) { 148 case 'x': case 'b': Length = 2; 149 } 150 } 151 while (Length < Code.size() && isHexDigit(Code[Length])) 152 ++Length; 153 154 Result->Text = Code.substr(0, Length); 155 Code = Code.drop_front(Length); 156 157 unsigned Value; 158 if (!Result->Text.getAsInteger(0, Value)) { 159 Result->Kind = TokenInfo::TK_Literal; 160 Result->Value = Value; 161 } else { 162 SourceRange Range; 163 Range.Start = Result->Range.Start; 164 Range.End = currentLocation(); 165 Error->addError(Range, Error->ET_ParserUnsignedError) << Result->Text; 166 Result->Kind = TokenInfo::TK_Error; 167 } 168 } 169 170 /// \brief Consume a string literal. 171 /// 172 /// \c Code must be positioned at the start of the literal (the opening 173 /// quote). Consumed until it finds the same closing quote character. 174 void consumeStringLiteral(TokenInfo *Result) { 175 bool InEscape = false; 176 const char Marker = Code[0]; 177 for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) { 178 if (InEscape) { 179 InEscape = false; 180 continue; 181 } 182 if (Code[Length] == '\\') { 183 InEscape = true; 184 continue; 185 } 186 if (Code[Length] == Marker) { 187 Result->Kind = TokenInfo::TK_Literal; 188 Result->Text = Code.substr(0, Length + 1); 189 Result->Value = Code.substr(1, Length - 1).str(); 190 Code = Code.drop_front(Length + 1); 191 return; 192 } 193 } 194 195 StringRef ErrorText = Code; 196 Code = Code.drop_front(Code.size()); 197 SourceRange Range; 198 Range.Start = Result->Range.Start; 199 Range.End = currentLocation(); 200 Error->addError(Range, Error->ET_ParserStringError) << ErrorText; 201 Result->Kind = TokenInfo::TK_Error; 202 } 203 204 /// \brief Consume all leading whitespace from \c Code. 205 void consumeWhitespace() { 206 while (!Code.empty() && isWhitespace(Code[0])) { 207 if (Code[0] == '\n') { 208 ++Line; 209 StartOfLine = Code.drop_front(); 210 } 211 Code = Code.drop_front(); 212 } 213 } 214 215 SourceLocation currentLocation() { 216 SourceLocation Location; 217 Location.Line = Line; 218 Location.Column = Code.data() - StartOfLine.data() + 1; 219 return Location; 220 } 221 222 StringRef Code; 223 StringRef StartOfLine; 224 unsigned Line; 225 Diagnostics *Error; 226 TokenInfo NextToken; 227 }; 228 229 Parser::Sema::~Sema() {} 230 231 /// \brief Parse and validate a matcher expression. 232 /// \return \c true on success, in which case \c Value has the matcher parsed. 233 /// If the input is malformed, or some argument has an error, it 234 /// returns \c false. 235 bool Parser::parseMatcherExpressionImpl(VariantValue *Value) { 236 const TokenInfo NameToken = Tokenizer->consumeNextToken(); 237 assert(NameToken.Kind == TokenInfo::TK_Ident); 238 const TokenInfo OpenToken = Tokenizer->consumeNextToken(); 239 if (OpenToken.Kind != TokenInfo::TK_OpenParen) { 240 Error->addError(OpenToken.Range, Error->ET_ParserNoOpenParen) 241 << OpenToken.Text; 242 return false; 243 } 244 245 llvm::Optional<MatcherCtor> Ctor = 246 S->lookupMatcherCtor(NameToken.Text, NameToken.Range, Error); 247 248 std::vector<ParserValue> Args; 249 TokenInfo EndToken; 250 while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) { 251 if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) { 252 // End of args. 253 EndToken = Tokenizer->consumeNextToken(); 254 break; 255 } 256 if (Args.size() > 0) { 257 // We must find a , token to continue. 258 const TokenInfo CommaToken = Tokenizer->consumeNextToken(); 259 if (CommaToken.Kind != TokenInfo::TK_Comma) { 260 Error->addError(CommaToken.Range, Error->ET_ParserNoComma) 261 << CommaToken.Text; 262 return false; 263 } 264 } 265 266 Diagnostics::Context Ctx(Diagnostics::Context::MatcherArg, Error, 267 NameToken.Text, NameToken.Range, Args.size() + 1); 268 ParserValue ArgValue; 269 ArgValue.Text = Tokenizer->peekNextToken().Text; 270 ArgValue.Range = Tokenizer->peekNextToken().Range; 271 if (!parseExpressionImpl(&ArgValue.Value)) return false; 272 273 Args.push_back(ArgValue); 274 } 275 276 if (EndToken.Kind == TokenInfo::TK_Eof) { 277 Error->addError(OpenToken.Range, Error->ET_ParserNoCloseParen); 278 return false; 279 } 280 281 std::string BindID; 282 if (Tokenizer->peekNextToken().Kind == TokenInfo::TK_Period) { 283 // Parse .bind("foo") 284 Tokenizer->consumeNextToken(); // consume the period. 285 const TokenInfo BindToken = Tokenizer->consumeNextToken(); 286 const TokenInfo OpenToken = Tokenizer->consumeNextToken(); 287 const TokenInfo IDToken = Tokenizer->consumeNextToken(); 288 const TokenInfo CloseToken = Tokenizer->consumeNextToken(); 289 290 // TODO: We could use different error codes for each/some to be more 291 // explicit about the syntax error. 292 if (BindToken.Kind != TokenInfo::TK_Ident || 293 BindToken.Text != TokenInfo::ID_Bind) { 294 Error->addError(BindToken.Range, Error->ET_ParserMalformedBindExpr); 295 return false; 296 } 297 if (OpenToken.Kind != TokenInfo::TK_OpenParen) { 298 Error->addError(OpenToken.Range, Error->ET_ParserMalformedBindExpr); 299 return false; 300 } 301 if (IDToken.Kind != TokenInfo::TK_Literal || !IDToken.Value.isString()) { 302 Error->addError(IDToken.Range, Error->ET_ParserMalformedBindExpr); 303 return false; 304 } 305 if (CloseToken.Kind != TokenInfo::TK_CloseParen) { 306 Error->addError(CloseToken.Range, Error->ET_ParserMalformedBindExpr); 307 return false; 308 } 309 BindID = IDToken.Value.getString(); 310 } 311 312 if (!Ctor) 313 return false; 314 315 // Merge the start and end infos. 316 Diagnostics::Context Ctx(Diagnostics::Context::ConstructMatcher, Error, 317 NameToken.Text, NameToken.Range); 318 SourceRange MatcherRange = NameToken.Range; 319 MatcherRange.End = EndToken.Range.End; 320 VariantMatcher Result = S->actOnMatcherExpression( 321 *Ctor, MatcherRange, BindID, Args, Error); 322 if (Result.isNull()) return false; 323 324 *Value = Result; 325 return true; 326 } 327 328 /// \brief Parse an <Expresssion> 329 bool Parser::parseExpressionImpl(VariantValue *Value) { 330 switch (Tokenizer->nextTokenKind()) { 331 case TokenInfo::TK_Literal: 332 *Value = Tokenizer->consumeNextToken().Value; 333 return true; 334 335 case TokenInfo::TK_Ident: 336 return parseMatcherExpressionImpl(Value); 337 338 case TokenInfo::TK_Eof: 339 Error->addError(Tokenizer->consumeNextToken().Range, 340 Error->ET_ParserNoCode); 341 return false; 342 343 case TokenInfo::TK_Error: 344 // This error was already reported by the tokenizer. 345 return false; 346 347 case TokenInfo::TK_OpenParen: 348 case TokenInfo::TK_CloseParen: 349 case TokenInfo::TK_Comma: 350 case TokenInfo::TK_Period: 351 case TokenInfo::TK_InvalidChar: 352 const TokenInfo Token = Tokenizer->consumeNextToken(); 353 Error->addError(Token.Range, Error->ET_ParserInvalidToken) << Token.Text; 354 return false; 355 } 356 357 llvm_unreachable("Unknown token kind."); 358 } 359 360 Parser::Parser(CodeTokenizer *Tokenizer, Sema *S, 361 Diagnostics *Error) 362 : Tokenizer(Tokenizer), S(S), Error(Error) {} 363 364 class RegistrySema : public Parser::Sema { 365 public: 366 virtual ~RegistrySema() {} 367 llvm::Optional<MatcherCtor> lookupMatcherCtor(StringRef MatcherName, 368 const SourceRange &NameRange, 369 Diagnostics *Error) { 370 return Registry::lookupMatcherCtor(MatcherName, NameRange, Error); 371 } 372 VariantMatcher actOnMatcherExpression(MatcherCtor Ctor, 373 const SourceRange &NameRange, 374 StringRef BindID, 375 ArrayRef<ParserValue> Args, 376 Diagnostics *Error) { 377 if (BindID.empty()) { 378 return Registry::constructMatcher(Ctor, NameRange, Args, Error); 379 } else { 380 return Registry::constructBoundMatcher(Ctor, NameRange, BindID, Args, 381 Error); 382 } 383 } 384 }; 385 386 bool Parser::parseExpression(StringRef Code, VariantValue *Value, 387 Diagnostics *Error) { 388 RegistrySema S; 389 return parseExpression(Code, &S, Value, Error); 390 } 391 392 bool Parser::parseExpression(StringRef Code, Sema *S, 393 VariantValue *Value, Diagnostics *Error) { 394 CodeTokenizer Tokenizer(Code, Error); 395 if (!Parser(&Tokenizer, S, Error).parseExpressionImpl(Value)) return false; 396 if (Tokenizer.peekNextToken().Kind != TokenInfo::TK_Eof) { 397 Error->addError(Tokenizer.peekNextToken().Range, 398 Error->ET_ParserTrailingCode); 399 return false; 400 } 401 return true; 402 } 403 404 llvm::Optional<DynTypedMatcher> 405 Parser::parseMatcherExpression(StringRef Code, Diagnostics *Error) { 406 RegistrySema S; 407 return parseMatcherExpression(Code, &S, Error); 408 } 409 410 llvm::Optional<DynTypedMatcher> 411 Parser::parseMatcherExpression(StringRef Code, Parser::Sema *S, 412 Diagnostics *Error) { 413 VariantValue Value; 414 if (!parseExpression(Code, S, &Value, Error)) 415 return llvm::Optional<DynTypedMatcher>(); 416 if (!Value.isMatcher()) { 417 Error->addError(SourceRange(), Error->ET_ParserNotAMatcher); 418 return llvm::Optional<DynTypedMatcher>(); 419 } 420 llvm::Optional<DynTypedMatcher> Result = 421 Value.getMatcher().getSingleMatcher(); 422 if (!Result.hasValue()) { 423 Error->addError(SourceRange(), Error->ET_ParserOverloadedType) 424 << Value.getTypeAsString(); 425 } 426 return Result; 427 } 428 429 } // namespace dynamic 430 } // namespace ast_matchers 431 } // namespace clang 432