1 //===--- CommentParser.cpp - Doxygen comment parser -----------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 10 #include "clang/AST/CommentParser.h" 11 #include "clang/AST/CommentSema.h" 12 #include "clang/AST/CommentDiagnostic.h" 13 #include "clang/Basic/SourceManager.h" 14 #include "llvm/Support/ErrorHandling.h" 15 16 namespace clang { 17 namespace comments { 18 19 /// Re-lexes a sequence of tok::text tokens. 20 class TextTokenRetokenizer { 21 llvm::BumpPtrAllocator &Allocator; 22 Parser &P; 23 SmallVector<Token, 16> Toks; 24 25 struct Position { 26 unsigned CurToken; 27 const char *BufferStart; 28 const char *BufferEnd; 29 const char *BufferPtr; 30 SourceLocation BufferStartLoc; 31 }; 32 33 /// Current position in Toks. 34 Position Pos; 35 36 bool isEnd() const { 37 return Pos.CurToken >= Toks.size(); 38 } 39 40 /// Sets up the buffer pointers to point to current token. 41 void setupBuffer() { 42 assert(!isEnd()); 43 const Token &Tok = Toks[Pos.CurToken]; 44 45 Pos.BufferStart = Tok.getText().begin(); 46 Pos.BufferEnd = Tok.getText().end(); 47 Pos.BufferPtr = Pos.BufferStart; 48 Pos.BufferStartLoc = Tok.getLocation(); 49 } 50 51 SourceLocation getSourceLocation() const { 52 const unsigned CharNo = Pos.BufferPtr - Pos.BufferStart; 53 return Pos.BufferStartLoc.getLocWithOffset(CharNo); 54 } 55 56 char peek() const { 57 assert(!isEnd()); 58 assert(Pos.BufferPtr != Pos.BufferEnd); 59 return *Pos.BufferPtr; 60 } 61 62 void consumeChar() { 63 assert(!isEnd()); 64 assert(Pos.BufferPtr != Pos.BufferEnd); 65 Pos.BufferPtr++; 66 if (Pos.BufferPtr == Pos.BufferEnd) { 67 Pos.CurToken++; 68 if (isEnd() && addToken()) { 69 assert(!isEnd()); 70 setupBuffer(); 71 } 72 } 73 } 74 75 /// Add a token. 76 /// Returns true on success, false if there are no interesting tokens to 77 /// fetch from lexer. 78 bool addToken() { 79 if (P.Tok.isNot(tok::text)) 80 return false; 81 82 Toks.push_back(P.Tok); 83 P.consumeToken(); 84 if (Toks.size() == 1) 85 setupBuffer(); 86 return true; 87 } 88 89 static bool isWhitespace(char C) { 90 return C == ' ' || C == '\n' || C == '\r' || 91 C == '\t' || C == '\f' || C == '\v'; 92 } 93 94 void consumeWhitespace() { 95 while (!isEnd()) { 96 if (isWhitespace(peek())) 97 consumeChar(); 98 else 99 break; 100 } 101 } 102 103 void formTokenWithChars(Token &Result, 104 SourceLocation Loc, 105 const char *TokBegin, 106 unsigned TokLength, 107 StringRef Text) { 108 Result.setLocation(Loc); 109 Result.setKind(tok::text); 110 Result.setLength(TokLength); 111 #ifndef NDEBUG 112 Result.TextPtr1 = "<UNSET>"; 113 Result.TextLen1 = 7; 114 #endif 115 Result.setText(Text); 116 } 117 118 public: 119 TextTokenRetokenizer(llvm::BumpPtrAllocator &Allocator, Parser &P): 120 Allocator(Allocator), P(P) { 121 Pos.CurToken = 0; 122 addToken(); 123 } 124 125 /// Extract a word -- sequence of non-whitespace characters. 126 bool lexWord(Token &Tok) { 127 if (isEnd()) 128 return false; 129 130 Position SavedPos = Pos; 131 132 consumeWhitespace(); 133 SmallString<32> WordText; 134 const char *WordBegin = Pos.BufferPtr; 135 SourceLocation Loc = getSourceLocation(); 136 while (!isEnd()) { 137 const char C = peek(); 138 if (!isWhitespace(C)) { 139 WordText.push_back(C); 140 consumeChar(); 141 } else 142 break; 143 } 144 const unsigned Length = WordText.size(); 145 if (Length == 0) { 146 Pos = SavedPos; 147 return false; 148 } 149 150 char *TextPtr = Allocator.Allocate<char>(Length + 1); 151 152 memcpy(TextPtr, WordText.c_str(), Length + 1); 153 StringRef Text = StringRef(TextPtr, Length); 154 155 formTokenWithChars(Tok, Loc, WordBegin, 156 Pos.BufferPtr - WordBegin, Text); 157 return true; 158 } 159 160 bool lexDelimitedSeq(Token &Tok, char OpenDelim, char CloseDelim) { 161 if (isEnd()) 162 return false; 163 164 Position SavedPos = Pos; 165 166 consumeWhitespace(); 167 SmallString<32> WordText; 168 const char *WordBegin = Pos.BufferPtr; 169 SourceLocation Loc = getSourceLocation(); 170 bool Error = false; 171 if (!isEnd()) { 172 const char C = peek(); 173 if (C == OpenDelim) { 174 WordText.push_back(C); 175 consumeChar(); 176 } else 177 Error = true; 178 } 179 char C = '\0'; 180 while (!Error && !isEnd()) { 181 C = peek(); 182 WordText.push_back(C); 183 consumeChar(); 184 if (C == CloseDelim) 185 break; 186 } 187 if (!Error && C != CloseDelim) 188 Error = true; 189 190 if (Error) { 191 Pos = SavedPos; 192 return false; 193 } 194 195 const unsigned Length = WordText.size(); 196 char *TextPtr = Allocator.Allocate<char>(Length + 1); 197 198 memcpy(TextPtr, WordText.c_str(), Length + 1); 199 StringRef Text = StringRef(TextPtr, Length); 200 201 formTokenWithChars(Tok, Loc, WordBegin, 202 Pos.BufferPtr - WordBegin, Text); 203 return true; 204 } 205 206 /// Put back tokens that we didn't consume. 207 void putBackLeftoverTokens() { 208 if (isEnd()) 209 return; 210 211 bool HavePartialTok = false; 212 Token PartialTok; 213 if (Pos.BufferPtr != Pos.BufferStart) { 214 formTokenWithChars(PartialTok, getSourceLocation(), 215 Pos.BufferPtr, Pos.BufferEnd - Pos.BufferPtr, 216 StringRef(Pos.BufferPtr, 217 Pos.BufferEnd - Pos.BufferPtr)); 218 HavePartialTok = true; 219 Pos.CurToken++; 220 } 221 222 P.putBack(llvm::makeArrayRef(Toks.begin() + Pos.CurToken, Toks.end())); 223 Pos.CurToken = Toks.size(); 224 225 if (HavePartialTok) 226 P.putBack(PartialTok); 227 } 228 }; 229 230 Parser::Parser(Lexer &L, Sema &S, llvm::BumpPtrAllocator &Allocator, 231 const SourceManager &SourceMgr, DiagnosticsEngine &Diags): 232 L(L), S(S), Allocator(Allocator), SourceMgr(SourceMgr), Diags(Diags) { 233 consumeToken(); 234 } 235 236 ParamCommandComment *Parser::parseParamCommandArgs( 237 ParamCommandComment *PC, 238 TextTokenRetokenizer &Retokenizer) { 239 Token Arg; 240 // Check if argument looks like direction specification: [dir] 241 // e.g., [in], [out], [in,out] 242 if (Retokenizer.lexDelimitedSeq(Arg, '[', ']')) 243 PC = S.actOnParamCommandDirectionArg(PC, 244 Arg.getLocation(), 245 Arg.getEndLocation(), 246 Arg.getText()); 247 248 if (Retokenizer.lexWord(Arg)) 249 PC = S.actOnParamCommandParamNameArg(PC, 250 Arg.getLocation(), 251 Arg.getEndLocation(), 252 Arg.getText()); 253 254 return PC; 255 } 256 257 BlockCommandComment *Parser::parseBlockCommandArgs( 258 BlockCommandComment *BC, 259 TextTokenRetokenizer &Retokenizer, 260 unsigned NumArgs) { 261 typedef BlockCommandComment::Argument Argument; 262 Argument *Args = 263 new (Allocator.Allocate<Argument>(NumArgs)) Argument[NumArgs]; 264 unsigned ParsedArgs = 0; 265 Token Arg; 266 while (ParsedArgs < NumArgs && Retokenizer.lexWord(Arg)) { 267 Args[ParsedArgs] = Argument(SourceRange(Arg.getLocation(), 268 Arg.getEndLocation()), 269 Arg.getText()); 270 ParsedArgs++; 271 } 272 273 return S.actOnBlockCommandArgs(BC, llvm::makeArrayRef(Args, ParsedArgs)); 274 } 275 276 BlockCommandComment *Parser::parseBlockCommand() { 277 assert(Tok.is(tok::command)); 278 279 ParamCommandComment *PC; 280 BlockCommandComment *BC; 281 bool IsParam = false; 282 unsigned NumArgs = 0; 283 if (S.isParamCommand(Tok.getCommandName())) { 284 IsParam = true; 285 PC = S.actOnParamCommandStart(Tok.getLocation(), 286 Tok.getEndLocation(), 287 Tok.getCommandName()); 288 } else { 289 NumArgs = S.getBlockCommandNumArgs(Tok.getCommandName()); 290 BC = S.actOnBlockCommandStart(Tok.getLocation(), 291 Tok.getEndLocation(), 292 Tok.getCommandName()); 293 } 294 consumeToken(); 295 296 if (Tok.is(tok::command) && S.isBlockCommand(Tok.getCommandName())) { 297 // Block command ahead. We can't nest block commands, so pretend that this 298 // command has an empty argument. 299 ParagraphComment *PC = S.actOnParagraphComment( 300 ArrayRef<InlineContentComment *>()); 301 return S.actOnBlockCommandFinish(BC, PC); 302 } 303 304 if (IsParam || NumArgs > 0) { 305 // In order to parse command arguments we need to retokenize a few 306 // following text tokens. 307 TextTokenRetokenizer Retokenizer(Allocator, *this); 308 309 if (IsParam) 310 PC = parseParamCommandArgs(PC, Retokenizer); 311 else 312 BC = parseBlockCommandArgs(BC, Retokenizer, NumArgs); 313 314 Retokenizer.putBackLeftoverTokens(); 315 } 316 317 BlockContentComment *Block = parseParagraphOrBlockCommand(); 318 // Since we have checked for a block command, we should have parsed a 319 // paragraph. 320 if (IsParam) 321 return S.actOnParamCommandFinish(PC, cast<ParagraphComment>(Block)); 322 else 323 return S.actOnBlockCommandFinish(BC, cast<ParagraphComment>(Block)); 324 } 325 326 InlineCommandComment *Parser::parseInlineCommand() { 327 assert(Tok.is(tok::command)); 328 329 const Token CommandTok = Tok; 330 consumeToken(); 331 332 TextTokenRetokenizer Retokenizer(Allocator, *this); 333 334 Token ArgTok; 335 bool ArgTokValid = Retokenizer.lexWord(ArgTok); 336 337 InlineCommandComment *IC; 338 if (ArgTokValid) { 339 IC = S.actOnInlineCommand(CommandTok.getLocation(), 340 CommandTok.getEndLocation(), 341 CommandTok.getCommandName(), 342 ArgTok.getLocation(), 343 ArgTok.getEndLocation(), 344 ArgTok.getText()); 345 } else { 346 IC = S.actOnInlineCommand(CommandTok.getLocation(), 347 CommandTok.getEndLocation(), 348 CommandTok.getCommandName()); 349 } 350 351 Retokenizer.putBackLeftoverTokens(); 352 353 return IC; 354 } 355 356 HTMLStartTagComment *Parser::parseHTMLStartTag() { 357 assert(Tok.is(tok::html_start_tag)); 358 HTMLStartTagComment *HST = 359 S.actOnHTMLStartTagStart(Tok.getLocation(), 360 Tok.getHTMLTagStartName()); 361 consumeToken(); 362 363 SmallVector<HTMLStartTagComment::Attribute, 2> Attrs; 364 while (true) { 365 switch (Tok.getKind()) { 366 case tok::html_ident: { 367 Token Ident = Tok; 368 consumeToken(); 369 if (Tok.isNot(tok::html_equals)) { 370 Attrs.push_back(HTMLStartTagComment::Attribute(Ident.getLocation(), 371 Ident.getHTMLIdent())); 372 continue; 373 } 374 Token Equals = Tok; 375 consumeToken(); 376 if (Tok.isNot(tok::html_quoted_string)) { 377 Diag(Tok.getLocation(), 378 diag::warn_doc_html_start_tag_expected_quoted_string) 379 << SourceRange(Equals.getLocation()); 380 Attrs.push_back(HTMLStartTagComment::Attribute(Ident.getLocation(), 381 Ident.getHTMLIdent())); 382 while (Tok.is(tok::html_equals) || 383 Tok.is(tok::html_quoted_string)) 384 consumeToken(); 385 continue; 386 } 387 Attrs.push_back(HTMLStartTagComment::Attribute( 388 Ident.getLocation(), 389 Ident.getHTMLIdent(), 390 Equals.getLocation(), 391 SourceRange(Tok.getLocation(), 392 Tok.getEndLocation()), 393 Tok.getHTMLQuotedString())); 394 consumeToken(); 395 continue; 396 } 397 398 case tok::html_greater: 399 HST = S.actOnHTMLStartTagFinish(HST, 400 copyArray(llvm::makeArrayRef(Attrs)), 401 Tok.getLocation(), 402 /* IsSelfClosing = */ false); 403 consumeToken(); 404 return HST; 405 406 case tok::html_slash_greater: 407 HST = S.actOnHTMLStartTagFinish(HST, 408 copyArray(llvm::makeArrayRef(Attrs)), 409 Tok.getLocation(), 410 /* IsSelfClosing = */ true); 411 consumeToken(); 412 return HST; 413 414 case tok::html_equals: 415 case tok::html_quoted_string: 416 Diag(Tok.getLocation(), 417 diag::warn_doc_html_start_tag_expected_ident_or_greater); 418 while (Tok.is(tok::html_equals) || 419 Tok.is(tok::html_quoted_string)) 420 consumeToken(); 421 if (Tok.is(tok::html_ident) || 422 Tok.is(tok::html_greater) || 423 Tok.is(tok::html_slash_greater)) 424 continue; 425 426 return S.actOnHTMLStartTagFinish(HST, 427 copyArray(llvm::makeArrayRef(Attrs)), 428 SourceLocation(), 429 /* IsSelfClosing = */ false); 430 431 default: 432 // Not a token from an HTML start tag. Thus HTML tag prematurely ended. 433 HST = S.actOnHTMLStartTagFinish(HST, 434 copyArray(llvm::makeArrayRef(Attrs)), 435 SourceLocation(), 436 /* IsSelfClosing = */ false); 437 bool StartLineInvalid; 438 const unsigned StartLine = SourceMgr.getPresumedLineNumber( 439 HST->getLocation(), 440 &StartLineInvalid); 441 bool EndLineInvalid; 442 const unsigned EndLine = SourceMgr.getPresumedLineNumber( 443 Tok.getLocation(), 444 &EndLineInvalid); 445 if (StartLineInvalid || EndLineInvalid || StartLine == EndLine) 446 Diag(Tok.getLocation(), 447 diag::warn_doc_html_start_tag_expected_ident_or_greater) 448 << HST->getSourceRange(); 449 else { 450 Diag(Tok.getLocation(), 451 diag::warn_doc_html_start_tag_expected_ident_or_greater); 452 Diag(HST->getLocation(), diag::note_doc_html_tag_started_here) 453 << HST->getSourceRange(); 454 } 455 return HST; 456 } 457 } 458 } 459 460 HTMLEndTagComment *Parser::parseHTMLEndTag() { 461 assert(Tok.is(tok::html_end_tag)); 462 Token TokEndTag = Tok; 463 consumeToken(); 464 SourceLocation Loc; 465 if (Tok.is(tok::html_greater)) { 466 Loc = Tok.getLocation(); 467 consumeToken(); 468 } 469 470 return S.actOnHTMLEndTag(TokEndTag.getLocation(), 471 Loc, 472 TokEndTag.getHTMLTagEndName()); 473 } 474 475 BlockContentComment *Parser::parseParagraphOrBlockCommand() { 476 SmallVector<InlineContentComment *, 8> Content; 477 478 while (true) { 479 switch (Tok.getKind()) { 480 case tok::verbatim_block_begin: 481 case tok::verbatim_line_name: 482 case tok::eof: 483 assert(Content.size() != 0); 484 break; // Block content or EOF ahead, finish this parapgaph. 485 486 case tok::command: 487 if (S.isBlockCommand(Tok.getCommandName())) { 488 if (Content.size() == 0) 489 return parseBlockCommand(); 490 break; // Block command ahead, finish this parapgaph. 491 } 492 if (S.isInlineCommand(Tok.getCommandName())) { 493 Content.push_back(parseInlineCommand()); 494 continue; 495 } 496 497 // Not a block command, not an inline command ==> an unknown command. 498 Content.push_back(S.actOnUnknownCommand(Tok.getLocation(), 499 Tok.getEndLocation(), 500 Tok.getCommandName())); 501 consumeToken(); 502 continue; 503 504 case tok::newline: { 505 consumeToken(); 506 if (Tok.is(tok::newline) || Tok.is(tok::eof)) { 507 consumeToken(); 508 break; // Two newlines -- end of paragraph. 509 } 510 if (Content.size() > 0) 511 Content.back()->addTrailingNewline(); 512 continue; 513 } 514 515 // Don't deal with HTML tag soup now. 516 case tok::html_start_tag: 517 Content.push_back(parseHTMLStartTag()); 518 continue; 519 520 case tok::html_end_tag: 521 Content.push_back(parseHTMLEndTag()); 522 continue; 523 524 case tok::text: 525 Content.push_back(S.actOnText(Tok.getLocation(), 526 Tok.getEndLocation(), 527 Tok.getText())); 528 consumeToken(); 529 continue; 530 531 case tok::verbatim_block_line: 532 case tok::verbatim_block_end: 533 case tok::verbatim_line_text: 534 case tok::html_ident: 535 case tok::html_equals: 536 case tok::html_quoted_string: 537 case tok::html_greater: 538 case tok::html_slash_greater: 539 llvm_unreachable("should not see this token"); 540 } 541 break; 542 } 543 544 return S.actOnParagraphComment(copyArray(llvm::makeArrayRef(Content))); 545 } 546 547 VerbatimBlockComment *Parser::parseVerbatimBlock() { 548 assert(Tok.is(tok::verbatim_block_begin)); 549 550 VerbatimBlockComment *VB = 551 S.actOnVerbatimBlockStart(Tok.getLocation(), 552 Tok.getVerbatimBlockName()); 553 consumeToken(); 554 555 // Don't create an empty line if verbatim opening command is followed 556 // by a newline. 557 if (Tok.is(tok::newline)) 558 consumeToken(); 559 560 SmallVector<VerbatimBlockLineComment *, 8> Lines; 561 while (Tok.is(tok::verbatim_block_line) || 562 Tok.is(tok::newline)) { 563 VerbatimBlockLineComment *Line; 564 if (Tok.is(tok::verbatim_block_line)) { 565 Line = S.actOnVerbatimBlockLine(Tok.getLocation(), 566 Tok.getVerbatimBlockText()); 567 consumeToken(); 568 if (Tok.is(tok::newline)) { 569 consumeToken(); 570 } 571 } else { 572 // Empty line, just a tok::newline. 573 Line = S.actOnVerbatimBlockLine(Tok.getLocation(), ""); 574 consumeToken(); 575 } 576 Lines.push_back(Line); 577 } 578 579 if (Tok.is(tok::verbatim_block_end)) { 580 VB = S.actOnVerbatimBlockFinish(VB, Tok.getLocation(), 581 Tok.getVerbatimBlockName(), 582 copyArray(llvm::makeArrayRef(Lines))); 583 consumeToken(); 584 } else { 585 // Unterminated \\verbatim block 586 VB = S.actOnVerbatimBlockFinish(VB, SourceLocation(), "", 587 copyArray(llvm::makeArrayRef(Lines))); 588 } 589 590 return VB; 591 } 592 593 VerbatimLineComment *Parser::parseVerbatimLine() { 594 assert(Tok.is(tok::verbatim_line_name)); 595 596 Token NameTok = Tok; 597 consumeToken(); 598 599 SourceLocation TextBegin; 600 StringRef Text; 601 // Next token might not be a tok::verbatim_line_text if verbatim line 602 // starting command comes just before a newline or comment end. 603 if (Tok.is(tok::verbatim_line_text)) { 604 TextBegin = Tok.getLocation(); 605 Text = Tok.getVerbatimLineText(); 606 } else { 607 TextBegin = NameTok.getEndLocation(); 608 Text = ""; 609 } 610 611 VerbatimLineComment *VL = S.actOnVerbatimLine(NameTok.getLocation(), 612 NameTok.getVerbatimLineName(), 613 TextBegin, 614 Text); 615 consumeToken(); 616 return VL; 617 } 618 619 BlockContentComment *Parser::parseBlockContent() { 620 switch (Tok.getKind()) { 621 case tok::text: 622 case tok::command: 623 case tok::html_start_tag: 624 case tok::html_end_tag: 625 return parseParagraphOrBlockCommand(); 626 627 case tok::verbatim_block_begin: 628 return parseVerbatimBlock(); 629 630 case tok::verbatim_line_name: 631 return parseVerbatimLine(); 632 633 case tok::eof: 634 case tok::newline: 635 case tok::verbatim_block_line: 636 case tok::verbatim_block_end: 637 case tok::verbatim_line_text: 638 case tok::html_ident: 639 case tok::html_equals: 640 case tok::html_quoted_string: 641 case tok::html_greater: 642 case tok::html_slash_greater: 643 llvm_unreachable("should not see this token"); 644 } 645 llvm_unreachable("bogus token kind"); 646 } 647 648 FullComment *Parser::parseFullComment() { 649 // Skip newlines at the beginning of the comment. 650 while (Tok.is(tok::newline)) 651 consumeToken(); 652 653 SmallVector<BlockContentComment *, 8> Blocks; 654 while (Tok.isNot(tok::eof)) { 655 Blocks.push_back(parseBlockContent()); 656 657 // Skip extra newlines after paragraph end. 658 while (Tok.is(tok::newline)) 659 consumeToken(); 660 } 661 return S.actOnFullComment(copyArray(llvm::makeArrayRef(Blocks))); 662 } 663 664 } // end namespace comments 665 } // end namespace clang 666