1 //===--- Parser.cpp - Matcher expression parser -----*- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 /// 10 /// \file 11 /// \brief Recursive parser implementation for the matcher expression grammar. 12 /// 13 //===----------------------------------------------------------------------===// 14 15 #include <string> 16 #include <vector> 17 18 #include "clang/ASTMatchers/Dynamic/Parser.h" 19 #include "clang/ASTMatchers/Dynamic/Registry.h" 20 #include "clang/Basic/CharInfo.h" 21 #include "llvm/ADT/Twine.h" 22 23 namespace clang { 24 namespace ast_matchers { 25 namespace dynamic { 26 27 /// \brief Simple structure to hold information for one token from the parser. 28 struct Parser::TokenInfo { 29 /// \brief Different possible tokens. 30 enum TokenKind { 31 TK_Eof = 0, 32 TK_OpenParen = 1, 33 TK_CloseParen = 2, 34 TK_Comma = 3, 35 TK_Literal = 4, 36 TK_Ident = 5, 37 TK_InvalidChar = 6, 38 TK_Error = 7 39 }; 40 41 TokenInfo() : Text(), Kind(TK_Eof), Range(), Value() {} 42 43 StringRef Text; 44 TokenKind Kind; 45 SourceRange Range; 46 VariantValue Value; 47 }; 48 49 /// \brief Simple tokenizer for the parser. 50 class Parser::CodeTokenizer { 51 public: 52 explicit CodeTokenizer(StringRef MatcherCode, Diagnostics *Error) 53 : Code(MatcherCode), StartOfLine(MatcherCode), Line(1), Error(Error) { 54 NextToken = getNextToken(); 55 } 56 57 /// \brief Returns but doesn't consume the next token. 58 const TokenInfo &peekNextToken() const { return NextToken; } 59 60 /// \brief Consumes and returns the next token. 61 TokenInfo consumeNextToken() { 62 TokenInfo ThisToken = NextToken; 63 NextToken = getNextToken(); 64 return ThisToken; 65 } 66 67 TokenInfo::TokenKind nextTokenKind() const { return NextToken.Kind; } 68 69 private: 70 TokenInfo getNextToken() { 71 consumeWhitespace(); 72 TokenInfo Result; 73 Result.Range.Start = currentLocation(); 74 75 if (Code.empty()) { 76 Result.Kind = TokenInfo::TK_Eof; 77 Result.Text = ""; 78 return Result; 79 } 80 81 switch (Code[0]) { 82 case ',': 83 Result.Kind = TokenInfo::TK_Comma; 84 Result.Text = Code.substr(0, 1); 85 Code = Code.drop_front(); 86 break; 87 case '(': 88 Result.Kind = TokenInfo::TK_OpenParen; 89 Result.Text = Code.substr(0, 1); 90 Code = Code.drop_front(); 91 break; 92 case ')': 93 Result.Kind = TokenInfo::TK_CloseParen; 94 Result.Text = Code.substr(0, 1); 95 Code = Code.drop_front(); 96 break; 97 98 case '"': 99 case '\'': 100 // Parse a string literal. 101 consumeStringLiteral(&Result); 102 break; 103 104 default: 105 if (isAlphanumeric(Code[0])) { 106 // Parse an identifier 107 size_t TokenLength = 1; 108 while (TokenLength < Code.size() && isAlphanumeric(Code[TokenLength])) 109 ++TokenLength; 110 Result.Kind = TokenInfo::TK_Ident; 111 Result.Text = Code.substr(0, TokenLength); 112 Code = Code.drop_front(TokenLength); 113 } else { 114 Result.Kind = TokenInfo::TK_InvalidChar; 115 Result.Text = Code.substr(0, 1); 116 Code = Code.drop_front(1); 117 } 118 break; 119 } 120 121 Result.Range.End = currentLocation(); 122 return Result; 123 } 124 125 /// \brief Consume a string literal. 126 /// 127 /// \c Code must be positioned at the start of the literal (the opening 128 /// quote). Consumed until it finds the same closing quote character. 129 void consumeStringLiteral(TokenInfo *Result) { 130 bool InEscape = false; 131 const char Marker = Code[0]; 132 for (size_t Length = 1, Size = Code.size(); Length != Size; ++Length) { 133 if (InEscape) { 134 InEscape = false; 135 continue; 136 } 137 if (Code[Length] == '\\') { 138 InEscape = true; 139 continue; 140 } 141 if (Code[Length] == Marker) { 142 Result->Kind = TokenInfo::TK_Literal; 143 Result->Text = Code.substr(0, Length + 1); 144 Result->Value = Code.substr(1, Length - 1).str(); 145 Code = Code.drop_front(Length + 1); 146 return; 147 } 148 } 149 150 StringRef ErrorText = Code; 151 Code = Code.drop_front(Code.size()); 152 SourceRange Range; 153 Range.Start = Result->Range.Start; 154 Range.End = currentLocation(); 155 Error->pushErrorFrame(Range, Error->ET_ParserStringError) 156 << ErrorText; 157 Result->Kind = TokenInfo::TK_Error; 158 } 159 160 /// \brief Consume all leading whitespace from \c Code. 161 void consumeWhitespace() { 162 while (!Code.empty() && isWhitespace(Code[0])) { 163 if (Code[0] == '\n') { 164 ++Line; 165 StartOfLine = Code.drop_front(); 166 } 167 Code = Code.drop_front(); 168 } 169 } 170 171 SourceLocation currentLocation() { 172 SourceLocation Location; 173 Location.Line = Line; 174 Location.Column = Code.data() - StartOfLine.data() + 1; 175 return Location; 176 } 177 178 StringRef Code; 179 StringRef StartOfLine; 180 unsigned Line; 181 Diagnostics *Error; 182 TokenInfo NextToken; 183 }; 184 185 Parser::Sema::~Sema() {} 186 187 /// \brief Parse and validate a matcher expression. 188 /// \return \c true on success, in which case \c Value has the matcher parsed. 189 /// If the input is malformed, or some argument has an error, it 190 /// returns \c false. 191 bool Parser::parseMatcherExpressionImpl(VariantValue *Value) { 192 const TokenInfo NameToken = Tokenizer->consumeNextToken(); 193 assert(NameToken.Kind == TokenInfo::TK_Ident); 194 const TokenInfo OpenToken = Tokenizer->consumeNextToken(); 195 if (OpenToken.Kind != TokenInfo::TK_OpenParen) { 196 Error->pushErrorFrame(OpenToken.Range, Error->ET_ParserNoOpenParen) 197 << OpenToken.Text; 198 return false; 199 } 200 201 std::vector<ParserValue> Args; 202 TokenInfo EndToken; 203 while (Tokenizer->nextTokenKind() != TokenInfo::TK_Eof) { 204 if (Tokenizer->nextTokenKind() == TokenInfo::TK_CloseParen) { 205 // End of args. 206 EndToken = Tokenizer->consumeNextToken(); 207 break; 208 } 209 if (Args.size() > 0) { 210 // We must find a , token to continue. 211 const TokenInfo CommaToken = Tokenizer->consumeNextToken(); 212 if (CommaToken.Kind != TokenInfo::TK_Comma) { 213 Error->pushErrorFrame(CommaToken.Range, Error->ET_ParserNoComma) 214 << CommaToken.Text; 215 return false; 216 } 217 } 218 219 ParserValue ArgValue; 220 ArgValue.Text = Tokenizer->peekNextToken().Text; 221 ArgValue.Range = Tokenizer->peekNextToken().Range; 222 if (!parseExpressionImpl(&ArgValue.Value)) { 223 Error->pushErrorFrame(NameToken.Range, 224 Error->ET_ParserMatcherArgFailure) 225 << (Args.size() + 1) << NameToken.Text; 226 return false; 227 } 228 229 Args.push_back(ArgValue); 230 } 231 232 if (EndToken.Kind == TokenInfo::TK_Eof) { 233 Error->pushErrorFrame(OpenToken.Range, Error->ET_ParserNoCloseParen); 234 return false; 235 } 236 237 // Merge the start and end infos. 238 SourceRange MatcherRange = NameToken.Range; 239 MatcherRange.End = EndToken.Range.End; 240 DynTypedMatcher *Result = 241 S->actOnMatcherExpression(NameToken.Text, MatcherRange, Args, Error); 242 if (Result == NULL) { 243 Error->pushErrorFrame(NameToken.Range, Error->ET_ParserMatcherFailure) 244 << NameToken.Text; 245 return false; 246 } 247 248 Value->takeMatcher(Result); 249 return true; 250 } 251 252 /// \brief Parse an <Expresssion> 253 bool Parser::parseExpressionImpl(VariantValue *Value) { 254 switch (Tokenizer->nextTokenKind()) { 255 case TokenInfo::TK_Literal: 256 *Value = Tokenizer->consumeNextToken().Value; 257 return true; 258 259 case TokenInfo::TK_Ident: 260 return parseMatcherExpressionImpl(Value); 261 262 case TokenInfo::TK_Eof: 263 Error->pushErrorFrame(Tokenizer->consumeNextToken().Range, 264 Error->ET_ParserNoCode); 265 return false; 266 267 case TokenInfo::TK_Error: 268 // This error was already reported by the tokenizer. 269 return false; 270 271 case TokenInfo::TK_OpenParen: 272 case TokenInfo::TK_CloseParen: 273 case TokenInfo::TK_Comma: 274 case TokenInfo::TK_InvalidChar: 275 const TokenInfo Token = Tokenizer->consumeNextToken(); 276 Error->pushErrorFrame(Token.Range, Error->ET_ParserInvalidToken) 277 << Token.Text; 278 return false; 279 } 280 281 llvm_unreachable("Unknown token kind."); 282 } 283 284 Parser::Parser(CodeTokenizer *Tokenizer, Sema *S, 285 Diagnostics *Error) 286 : Tokenizer(Tokenizer), S(S), Error(Error) {} 287 288 class RegistrySema : public Parser::Sema { 289 public: 290 virtual ~RegistrySema() {} 291 DynTypedMatcher *actOnMatcherExpression(StringRef MatcherName, 292 const SourceRange &NameRange, 293 ArrayRef<ParserValue> Args, 294 Diagnostics *Error) { 295 return Registry::constructMatcher(MatcherName, NameRange, Args, Error); 296 } 297 }; 298 299 bool Parser::parseExpression(StringRef Code, VariantValue *Value, 300 Diagnostics *Error) { 301 RegistrySema S; 302 return parseExpression(Code, &S, Value, Error); 303 } 304 305 bool Parser::parseExpression(StringRef Code, Sema *S, 306 VariantValue *Value, Diagnostics *Error) { 307 CodeTokenizer Tokenizer(Code, Error); 308 return Parser(&Tokenizer, S, Error).parseExpressionImpl(Value); 309 } 310 311 DynTypedMatcher *Parser::parseMatcherExpression(StringRef Code, 312 Diagnostics *Error) { 313 RegistrySema S; 314 return parseMatcherExpression(Code, &S, Error); 315 } 316 317 DynTypedMatcher *Parser::parseMatcherExpression(StringRef Code, 318 Parser::Sema *S, 319 Diagnostics *Error) { 320 VariantValue Value; 321 if (!parseExpression(Code, S, &Value, Error)) 322 return NULL; 323 if (!Value.isMatcher()) { 324 Error->pushErrorFrame(SourceRange(), Error->ET_ParserNotAMatcher); 325 return NULL; 326 } 327 return Value.getMatcher().clone(); 328 } 329 330 } // namespace dynamic 331 } // namespace ast_matchers 332 } // namespace clang 333