1 //===--- CommentParser.cpp - Doxygen comment parser -----------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 10 #include "clang/AST/CommentParser.h" 11 #include "clang/AST/CommentSema.h" 12 #include "clang/AST/CommentDiagnostic.h" 13 #include "clang/Basic/SourceManager.h" 14 #include "llvm/Support/ErrorHandling.h" 15 16 namespace clang { 17 namespace comments { 18 19 /// Re-lexes a sequence of tok::text tokens. 20 class TextTokenRetokenizer { 21 llvm::BumpPtrAllocator &Allocator; 22 Parser &P; 23 24 /// This flag is set when there are no more tokens we can fetch from lexer. 25 bool NoMoreInterestingTokens; 26 27 /// Token buffer: tokens we have processed and lookahead. 28 SmallVector<Token, 16> Toks; 29 30 /// A position in \c Toks. 31 struct Position { 32 unsigned CurToken; 33 const char *BufferStart; 34 const char *BufferEnd; 35 const char *BufferPtr; 36 SourceLocation BufferStartLoc; 37 }; 38 39 /// Current position in Toks. 40 Position Pos; 41 42 bool isEnd() const { 43 return Pos.CurToken >= Toks.size(); 44 } 45 46 /// Sets up the buffer pointers to point to current token. 47 void setupBuffer() { 48 assert(!isEnd()); 49 const Token &Tok = Toks[Pos.CurToken]; 50 51 Pos.BufferStart = Tok.getText().begin(); 52 Pos.BufferEnd = Tok.getText().end(); 53 Pos.BufferPtr = Pos.BufferStart; 54 Pos.BufferStartLoc = Tok.getLocation(); 55 } 56 57 SourceLocation getSourceLocation() const { 58 const unsigned CharNo = Pos.BufferPtr - Pos.BufferStart; 59 return Pos.BufferStartLoc.getLocWithOffset(CharNo); 60 } 61 62 char peek() const { 63 assert(!isEnd()); 64 assert(Pos.BufferPtr != Pos.BufferEnd); 65 return *Pos.BufferPtr; 66 } 67 68 void consumeChar() { 69 assert(!isEnd()); 70 assert(Pos.BufferPtr != Pos.BufferEnd); 71 Pos.BufferPtr++; 72 if (Pos.BufferPtr == Pos.BufferEnd) { 73 Pos.CurToken++; 74 if (isEnd() && !addToken()) 75 return; 76 77 assert(!isEnd()); 78 setupBuffer(); 79 } 80 } 81 82 /// Add a token. 83 /// Returns true on success, false if there are no interesting tokens to 84 /// fetch from lexer. 85 bool addToken() { 86 if (NoMoreInterestingTokens) 87 return false; 88 89 if (P.Tok.is(tok::newline)) { 90 // If we see a single newline token between text tokens, skip it. 91 Token Newline = P.Tok; 92 P.consumeToken(); 93 if (P.Tok.isNot(tok::text)) { 94 P.putBack(Newline); 95 NoMoreInterestingTokens = true; 96 return false; 97 } 98 } 99 if (P.Tok.isNot(tok::text)) { 100 NoMoreInterestingTokens = true; 101 return false; 102 } 103 104 Toks.push_back(P.Tok); 105 P.consumeToken(); 106 if (Toks.size() == 1) 107 setupBuffer(); 108 return true; 109 } 110 111 static bool isWhitespace(char C) { 112 return C == ' ' || C == '\n' || C == '\r' || 113 C == '\t' || C == '\f' || C == '\v'; 114 } 115 116 void consumeWhitespace() { 117 while (!isEnd()) { 118 if (isWhitespace(peek())) 119 consumeChar(); 120 else 121 break; 122 } 123 } 124 125 void formTokenWithChars(Token &Result, 126 SourceLocation Loc, 127 const char *TokBegin, 128 unsigned TokLength, 129 StringRef Text) { 130 Result.setLocation(Loc); 131 Result.setKind(tok::text); 132 Result.setLength(TokLength); 133 #ifndef NDEBUG 134 Result.TextPtr1 = "<UNSET>"; 135 Result.TextLen1 = 7; 136 #endif 137 Result.setText(Text); 138 } 139 140 public: 141 TextTokenRetokenizer(llvm::BumpPtrAllocator &Allocator, Parser &P): 142 Allocator(Allocator), P(P), NoMoreInterestingTokens(false) { 143 Pos.CurToken = 0; 144 addToken(); 145 } 146 147 /// Extract a word -- sequence of non-whitespace characters. 148 bool lexWord(Token &Tok) { 149 if (isEnd()) 150 return false; 151 152 Position SavedPos = Pos; 153 154 consumeWhitespace(); 155 SmallString<32> WordText; 156 const char *WordBegin = Pos.BufferPtr; 157 SourceLocation Loc = getSourceLocation(); 158 while (!isEnd()) { 159 const char C = peek(); 160 if (!isWhitespace(C)) { 161 WordText.push_back(C); 162 consumeChar(); 163 } else 164 break; 165 } 166 const unsigned Length = WordText.size(); 167 if (Length == 0) { 168 Pos = SavedPos; 169 return false; 170 } 171 172 char *TextPtr = Allocator.Allocate<char>(Length + 1); 173 174 memcpy(TextPtr, WordText.c_str(), Length + 1); 175 StringRef Text = StringRef(TextPtr, Length); 176 177 formTokenWithChars(Tok, Loc, WordBegin, 178 Pos.BufferPtr - WordBegin, Text); 179 return true; 180 } 181 182 bool lexDelimitedSeq(Token &Tok, char OpenDelim, char CloseDelim) { 183 if (isEnd()) 184 return false; 185 186 Position SavedPos = Pos; 187 188 consumeWhitespace(); 189 SmallString<32> WordText; 190 const char *WordBegin = Pos.BufferPtr; 191 SourceLocation Loc = getSourceLocation(); 192 bool Error = false; 193 if (!isEnd()) { 194 const char C = peek(); 195 if (C == OpenDelim) { 196 WordText.push_back(C); 197 consumeChar(); 198 } else 199 Error = true; 200 } 201 char C = '\0'; 202 while (!Error && !isEnd()) { 203 C = peek(); 204 WordText.push_back(C); 205 consumeChar(); 206 if (C == CloseDelim) 207 break; 208 } 209 if (!Error && C != CloseDelim) 210 Error = true; 211 212 if (Error) { 213 Pos = SavedPos; 214 return false; 215 } 216 217 const unsigned Length = WordText.size(); 218 char *TextPtr = Allocator.Allocate<char>(Length + 1); 219 220 memcpy(TextPtr, WordText.c_str(), Length + 1); 221 StringRef Text = StringRef(TextPtr, Length); 222 223 formTokenWithChars(Tok, Loc, WordBegin, 224 Pos.BufferPtr - WordBegin, Text); 225 return true; 226 } 227 228 /// Put back tokens that we didn't consume. 229 void putBackLeftoverTokens() { 230 if (isEnd()) 231 return; 232 233 bool HavePartialTok = false; 234 Token PartialTok; 235 if (Pos.BufferPtr != Pos.BufferStart) { 236 formTokenWithChars(PartialTok, getSourceLocation(), 237 Pos.BufferPtr, Pos.BufferEnd - Pos.BufferPtr, 238 StringRef(Pos.BufferPtr, 239 Pos.BufferEnd - Pos.BufferPtr)); 240 HavePartialTok = true; 241 Pos.CurToken++; 242 } 243 244 P.putBack(llvm::makeArrayRef(Toks.begin() + Pos.CurToken, Toks.end())); 245 Pos.CurToken = Toks.size(); 246 247 if (HavePartialTok) 248 P.putBack(PartialTok); 249 } 250 }; 251 252 Parser::Parser(Lexer &L, Sema &S, llvm::BumpPtrAllocator &Allocator, 253 const SourceManager &SourceMgr, DiagnosticsEngine &Diags): 254 L(L), S(S), Allocator(Allocator), SourceMgr(SourceMgr), Diags(Diags) { 255 consumeToken(); 256 } 257 258 ParamCommandComment *Parser::parseParamCommandArgs( 259 ParamCommandComment *PC, 260 TextTokenRetokenizer &Retokenizer) { 261 Token Arg; 262 // Check if argument looks like direction specification: [dir] 263 // e.g., [in], [out], [in,out] 264 if (Retokenizer.lexDelimitedSeq(Arg, '[', ']')) 265 PC = S.actOnParamCommandDirectionArg(PC, 266 Arg.getLocation(), 267 Arg.getEndLocation(), 268 Arg.getText()); 269 270 if (Retokenizer.lexWord(Arg)) 271 PC = S.actOnParamCommandParamNameArg(PC, 272 Arg.getLocation(), 273 Arg.getEndLocation(), 274 Arg.getText()); 275 276 return PC; 277 } 278 279 TParamCommandComment *Parser::parseTParamCommandArgs( 280 TParamCommandComment *TPC, 281 TextTokenRetokenizer &Retokenizer) { 282 Token Arg; 283 if (Retokenizer.lexWord(Arg)) 284 TPC = S.actOnTParamCommandParamNameArg(TPC, 285 Arg.getLocation(), 286 Arg.getEndLocation(), 287 Arg.getText()); 288 289 return TPC; 290 } 291 292 BlockCommandComment *Parser::parseBlockCommandArgs( 293 BlockCommandComment *BC, 294 TextTokenRetokenizer &Retokenizer, 295 unsigned NumArgs) { 296 typedef BlockCommandComment::Argument Argument; 297 Argument *Args = 298 new (Allocator.Allocate<Argument>(NumArgs)) Argument[NumArgs]; 299 unsigned ParsedArgs = 0; 300 Token Arg; 301 while (ParsedArgs < NumArgs && Retokenizer.lexWord(Arg)) { 302 Args[ParsedArgs] = Argument(SourceRange(Arg.getLocation(), 303 Arg.getEndLocation()), 304 Arg.getText()); 305 ParsedArgs++; 306 } 307 308 return S.actOnBlockCommandArgs(BC, llvm::makeArrayRef(Args, ParsedArgs)); 309 } 310 311 BlockCommandComment *Parser::parseBlockCommand() { 312 assert(Tok.is(tok::command)); 313 314 ParamCommandComment *PC; 315 TParamCommandComment *TPC; 316 BlockCommandComment *BC; 317 bool IsParam = false; 318 bool IsTParam = false; 319 unsigned NumArgs = 0; 320 if (S.isParamCommand(Tok.getCommandName())) { 321 IsParam = true; 322 PC = S.actOnParamCommandStart(Tok.getLocation(), 323 Tok.getEndLocation(), 324 Tok.getCommandName()); 325 } if (S.isTParamCommand(Tok.getCommandName())) { 326 IsTParam = true; 327 TPC = S.actOnTParamCommandStart(Tok.getLocation(), 328 Tok.getEndLocation(), 329 Tok.getCommandName()); 330 } else { 331 NumArgs = S.getBlockCommandNumArgs(Tok.getCommandName()); 332 BC = S.actOnBlockCommandStart(Tok.getLocation(), 333 Tok.getEndLocation(), 334 Tok.getCommandName()); 335 } 336 consumeToken(); 337 338 if (Tok.is(tok::command) && S.isBlockCommand(Tok.getCommandName())) { 339 // Block command ahead. We can't nest block commands, so pretend that this 340 // command has an empty argument. 341 ParagraphComment *Paragraph = S.actOnParagraphComment( 342 ArrayRef<InlineContentComment *>()); 343 return S.actOnBlockCommandFinish(IsParam ? PC : BC, Paragraph); 344 } 345 346 if (IsParam || IsTParam || NumArgs > 0) { 347 // In order to parse command arguments we need to retokenize a few 348 // following text tokens. 349 TextTokenRetokenizer Retokenizer(Allocator, *this); 350 351 if (IsParam) 352 PC = parseParamCommandArgs(PC, Retokenizer); 353 else if (IsTParam) 354 TPC = parseTParamCommandArgs(TPC, Retokenizer); 355 else 356 BC = parseBlockCommandArgs(BC, Retokenizer, NumArgs); 357 358 Retokenizer.putBackLeftoverTokens(); 359 } 360 361 BlockContentComment *Block = parseParagraphOrBlockCommand(); 362 // Since we have checked for a block command, we should have parsed a 363 // paragraph. 364 if (IsParam) 365 return S.actOnParamCommandFinish(PC, cast<ParagraphComment>(Block)); 366 else if (IsTParam) 367 return S.actOnTParamCommandFinish(TPC, cast<ParagraphComment>(Block)); 368 else 369 return S.actOnBlockCommandFinish(BC, cast<ParagraphComment>(Block)); 370 } 371 372 InlineCommandComment *Parser::parseInlineCommand() { 373 assert(Tok.is(tok::command)); 374 375 const Token CommandTok = Tok; 376 consumeToken(); 377 378 TextTokenRetokenizer Retokenizer(Allocator, *this); 379 380 Token ArgTok; 381 bool ArgTokValid = Retokenizer.lexWord(ArgTok); 382 383 InlineCommandComment *IC; 384 if (ArgTokValid) { 385 IC = S.actOnInlineCommand(CommandTok.getLocation(), 386 CommandTok.getEndLocation(), 387 CommandTok.getCommandName(), 388 ArgTok.getLocation(), 389 ArgTok.getEndLocation(), 390 ArgTok.getText()); 391 } else { 392 IC = S.actOnInlineCommand(CommandTok.getLocation(), 393 CommandTok.getEndLocation(), 394 CommandTok.getCommandName()); 395 } 396 397 Retokenizer.putBackLeftoverTokens(); 398 399 return IC; 400 } 401 402 HTMLStartTagComment *Parser::parseHTMLStartTag() { 403 assert(Tok.is(tok::html_start_tag)); 404 HTMLStartTagComment *HST = 405 S.actOnHTMLStartTagStart(Tok.getLocation(), 406 Tok.getHTMLTagStartName()); 407 consumeToken(); 408 409 SmallVector<HTMLStartTagComment::Attribute, 2> Attrs; 410 while (true) { 411 switch (Tok.getKind()) { 412 case tok::html_ident: { 413 Token Ident = Tok; 414 consumeToken(); 415 if (Tok.isNot(tok::html_equals)) { 416 Attrs.push_back(HTMLStartTagComment::Attribute(Ident.getLocation(), 417 Ident.getHTMLIdent())); 418 continue; 419 } 420 Token Equals = Tok; 421 consumeToken(); 422 if (Tok.isNot(tok::html_quoted_string)) { 423 Diag(Tok.getLocation(), 424 diag::warn_doc_html_start_tag_expected_quoted_string) 425 << SourceRange(Equals.getLocation()); 426 Attrs.push_back(HTMLStartTagComment::Attribute(Ident.getLocation(), 427 Ident.getHTMLIdent())); 428 while (Tok.is(tok::html_equals) || 429 Tok.is(tok::html_quoted_string)) 430 consumeToken(); 431 continue; 432 } 433 Attrs.push_back(HTMLStartTagComment::Attribute( 434 Ident.getLocation(), 435 Ident.getHTMLIdent(), 436 Equals.getLocation(), 437 SourceRange(Tok.getLocation(), 438 Tok.getEndLocation()), 439 Tok.getHTMLQuotedString())); 440 consumeToken(); 441 continue; 442 } 443 444 case tok::html_greater: 445 HST = S.actOnHTMLStartTagFinish(HST, 446 S.copyArray(llvm::makeArrayRef(Attrs)), 447 Tok.getLocation(), 448 /* IsSelfClosing = */ false); 449 consumeToken(); 450 return HST; 451 452 case tok::html_slash_greater: 453 HST = S.actOnHTMLStartTagFinish(HST, 454 S.copyArray(llvm::makeArrayRef(Attrs)), 455 Tok.getLocation(), 456 /* IsSelfClosing = */ true); 457 consumeToken(); 458 return HST; 459 460 case tok::html_equals: 461 case tok::html_quoted_string: 462 Diag(Tok.getLocation(), 463 diag::warn_doc_html_start_tag_expected_ident_or_greater); 464 while (Tok.is(tok::html_equals) || 465 Tok.is(tok::html_quoted_string)) 466 consumeToken(); 467 if (Tok.is(tok::html_ident) || 468 Tok.is(tok::html_greater) || 469 Tok.is(tok::html_slash_greater)) 470 continue; 471 472 return S.actOnHTMLStartTagFinish(HST, 473 S.copyArray(llvm::makeArrayRef(Attrs)), 474 SourceLocation(), 475 /* IsSelfClosing = */ false); 476 477 default: 478 // Not a token from an HTML start tag. Thus HTML tag prematurely ended. 479 HST = S.actOnHTMLStartTagFinish(HST, 480 S.copyArray(llvm::makeArrayRef(Attrs)), 481 SourceLocation(), 482 /* IsSelfClosing = */ false); 483 bool StartLineInvalid; 484 const unsigned StartLine = SourceMgr.getPresumedLineNumber( 485 HST->getLocation(), 486 &StartLineInvalid); 487 bool EndLineInvalid; 488 const unsigned EndLine = SourceMgr.getPresumedLineNumber( 489 Tok.getLocation(), 490 &EndLineInvalid); 491 if (StartLineInvalid || EndLineInvalid || StartLine == EndLine) 492 Diag(Tok.getLocation(), 493 diag::warn_doc_html_start_tag_expected_ident_or_greater) 494 << HST->getSourceRange(); 495 else { 496 Diag(Tok.getLocation(), 497 diag::warn_doc_html_start_tag_expected_ident_or_greater); 498 Diag(HST->getLocation(), diag::note_doc_html_tag_started_here) 499 << HST->getSourceRange(); 500 } 501 return HST; 502 } 503 } 504 } 505 506 HTMLEndTagComment *Parser::parseHTMLEndTag() { 507 assert(Tok.is(tok::html_end_tag)); 508 Token TokEndTag = Tok; 509 consumeToken(); 510 SourceLocation Loc; 511 if (Tok.is(tok::html_greater)) { 512 Loc = Tok.getLocation(); 513 consumeToken(); 514 } 515 516 return S.actOnHTMLEndTag(TokEndTag.getLocation(), 517 Loc, 518 TokEndTag.getHTMLTagEndName()); 519 } 520 521 BlockContentComment *Parser::parseParagraphOrBlockCommand() { 522 SmallVector<InlineContentComment *, 8> Content; 523 524 while (true) { 525 switch (Tok.getKind()) { 526 case tok::verbatim_block_begin: 527 case tok::verbatim_line_name: 528 case tok::eof: 529 assert(Content.size() != 0); 530 break; // Block content or EOF ahead, finish this parapgaph. 531 532 case tok::command: 533 if (S.isBlockCommand(Tok.getCommandName())) { 534 if (Content.size() == 0) 535 return parseBlockCommand(); 536 break; // Block command ahead, finish this parapgaph. 537 } 538 if (S.isInlineCommand(Tok.getCommandName())) { 539 Content.push_back(parseInlineCommand()); 540 continue; 541 } 542 543 // Not a block command, not an inline command ==> an unknown command. 544 Content.push_back(S.actOnUnknownCommand(Tok.getLocation(), 545 Tok.getEndLocation(), 546 Tok.getCommandName())); 547 consumeToken(); 548 continue; 549 550 case tok::newline: { 551 consumeToken(); 552 if (Tok.is(tok::newline) || Tok.is(tok::eof)) { 553 consumeToken(); 554 break; // Two newlines -- end of paragraph. 555 } 556 if (Content.size() > 0) 557 Content.back()->addTrailingNewline(); 558 continue; 559 } 560 561 // Don't deal with HTML tag soup now. 562 case tok::html_start_tag: 563 Content.push_back(parseHTMLStartTag()); 564 continue; 565 566 case tok::html_end_tag: 567 Content.push_back(parseHTMLEndTag()); 568 continue; 569 570 case tok::text: 571 Content.push_back(S.actOnText(Tok.getLocation(), 572 Tok.getEndLocation(), 573 Tok.getText())); 574 consumeToken(); 575 continue; 576 577 case tok::verbatim_block_line: 578 case tok::verbatim_block_end: 579 case tok::verbatim_line_text: 580 case tok::html_ident: 581 case tok::html_equals: 582 case tok::html_quoted_string: 583 case tok::html_greater: 584 case tok::html_slash_greater: 585 llvm_unreachable("should not see this token"); 586 } 587 break; 588 } 589 590 return S.actOnParagraphComment(S.copyArray(llvm::makeArrayRef(Content))); 591 } 592 593 VerbatimBlockComment *Parser::parseVerbatimBlock() { 594 assert(Tok.is(tok::verbatim_block_begin)); 595 596 VerbatimBlockComment *VB = 597 S.actOnVerbatimBlockStart(Tok.getLocation(), 598 Tok.getVerbatimBlockName()); 599 consumeToken(); 600 601 // Don't create an empty line if verbatim opening command is followed 602 // by a newline. 603 if (Tok.is(tok::newline)) 604 consumeToken(); 605 606 SmallVector<VerbatimBlockLineComment *, 8> Lines; 607 while (Tok.is(tok::verbatim_block_line) || 608 Tok.is(tok::newline)) { 609 VerbatimBlockLineComment *Line; 610 if (Tok.is(tok::verbatim_block_line)) { 611 Line = S.actOnVerbatimBlockLine(Tok.getLocation(), 612 Tok.getVerbatimBlockText()); 613 consumeToken(); 614 if (Tok.is(tok::newline)) { 615 consumeToken(); 616 } 617 } else { 618 // Empty line, just a tok::newline. 619 Line = S.actOnVerbatimBlockLine(Tok.getLocation(), ""); 620 consumeToken(); 621 } 622 Lines.push_back(Line); 623 } 624 625 if (Tok.is(tok::verbatim_block_end)) { 626 VB = S.actOnVerbatimBlockFinish(VB, Tok.getLocation(), 627 Tok.getVerbatimBlockName(), 628 S.copyArray(llvm::makeArrayRef(Lines))); 629 consumeToken(); 630 } else { 631 // Unterminated \\verbatim block 632 VB = S.actOnVerbatimBlockFinish(VB, SourceLocation(), "", 633 S.copyArray(llvm::makeArrayRef(Lines))); 634 } 635 636 return VB; 637 } 638 639 VerbatimLineComment *Parser::parseVerbatimLine() { 640 assert(Tok.is(tok::verbatim_line_name)); 641 642 Token NameTok = Tok; 643 consumeToken(); 644 645 SourceLocation TextBegin; 646 StringRef Text; 647 // Next token might not be a tok::verbatim_line_text if verbatim line 648 // starting command comes just before a newline or comment end. 649 if (Tok.is(tok::verbatim_line_text)) { 650 TextBegin = Tok.getLocation(); 651 Text = Tok.getVerbatimLineText(); 652 } else { 653 TextBegin = NameTok.getEndLocation(); 654 Text = ""; 655 } 656 657 VerbatimLineComment *VL = S.actOnVerbatimLine(NameTok.getLocation(), 658 NameTok.getVerbatimLineName(), 659 TextBegin, 660 Text); 661 consumeToken(); 662 return VL; 663 } 664 665 BlockContentComment *Parser::parseBlockContent() { 666 switch (Tok.getKind()) { 667 case tok::text: 668 case tok::command: 669 case tok::html_start_tag: 670 case tok::html_end_tag: 671 return parseParagraphOrBlockCommand(); 672 673 case tok::verbatim_block_begin: 674 return parseVerbatimBlock(); 675 676 case tok::verbatim_line_name: 677 return parseVerbatimLine(); 678 679 case tok::eof: 680 case tok::newline: 681 case tok::verbatim_block_line: 682 case tok::verbatim_block_end: 683 case tok::verbatim_line_text: 684 case tok::html_ident: 685 case tok::html_equals: 686 case tok::html_quoted_string: 687 case tok::html_greater: 688 case tok::html_slash_greater: 689 llvm_unreachable("should not see this token"); 690 } 691 llvm_unreachable("bogus token kind"); 692 } 693 694 FullComment *Parser::parseFullComment() { 695 // Skip newlines at the beginning of the comment. 696 while (Tok.is(tok::newline)) 697 consumeToken(); 698 699 SmallVector<BlockContentComment *, 8> Blocks; 700 while (Tok.isNot(tok::eof)) { 701 Blocks.push_back(parseBlockContent()); 702 703 // Skip extra newlines after paragraph end. 704 while (Tok.is(tok::newline)) 705 consumeToken(); 706 } 707 return S.actOnFullComment(S.copyArray(llvm::makeArrayRef(Blocks))); 708 } 709 710 } // end namespace comments 711 } // end namespace clang 712