1 //===--- CommentParser.cpp - Doxygen comment parser -----------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 10 #include "clang/AST/CommentParser.h" 11 #include "clang/AST/CommentSema.h" 12 #include "clang/AST/CommentDiagnostic.h" 13 #include "clang/Basic/SourceManager.h" 14 #include "llvm/Support/ErrorHandling.h" 15 16 namespace clang { 17 namespace comments { 18 19 /// Re-lexes a sequence of tok::text tokens. 20 class TextTokenRetokenizer { 21 llvm::BumpPtrAllocator &Allocator; 22 Parser &P; 23 24 /// This flag is set when there are no more tokens we can fetch from lexer. 25 bool NoMoreInterestingTokens; 26 27 /// Token buffer: tokens we have processed and lookahead. 28 SmallVector<Token, 16> Toks; 29 30 /// A position in \c Toks. 31 struct Position { 32 unsigned CurToken; 33 const char *BufferStart; 34 const char *BufferEnd; 35 const char *BufferPtr; 36 SourceLocation BufferStartLoc; 37 }; 38 39 /// Current position in Toks. 40 Position Pos; 41 42 bool isEnd() const { 43 return Pos.CurToken >= Toks.size(); 44 } 45 46 /// Sets up the buffer pointers to point to current token. 47 void setupBuffer() { 48 assert(!isEnd()); 49 const Token &Tok = Toks[Pos.CurToken]; 50 51 Pos.BufferStart = Tok.getText().begin(); 52 Pos.BufferEnd = Tok.getText().end(); 53 Pos.BufferPtr = Pos.BufferStart; 54 Pos.BufferStartLoc = Tok.getLocation(); 55 } 56 57 SourceLocation getSourceLocation() const { 58 const unsigned CharNo = Pos.BufferPtr - Pos.BufferStart; 59 return Pos.BufferStartLoc.getLocWithOffset(CharNo); 60 } 61 62 char peek() const { 63 assert(!isEnd()); 64 assert(Pos.BufferPtr != Pos.BufferEnd); 65 return *Pos.BufferPtr; 66 } 67 68 void consumeChar() { 69 assert(!isEnd()); 70 assert(Pos.BufferPtr != Pos.BufferEnd); 71 Pos.BufferPtr++; 72 if (Pos.BufferPtr == Pos.BufferEnd) { 73 Pos.CurToken++; 74 if (isEnd() && !addToken()) 75 return; 76 77 assert(!isEnd()); 78 setupBuffer(); 79 } 80 } 81 82 /// Add a token. 83 /// Returns true on success, false if there are no interesting tokens to 84 /// fetch from lexer. 85 bool addToken() { 86 if (NoMoreInterestingTokens) 87 return false; 88 89 if (P.Tok.is(tok::newline)) { 90 // If we see a single newline token between text tokens, skip it. 91 Token Newline = P.Tok; 92 P.consumeToken(); 93 if (P.Tok.isNot(tok::text)) { 94 P.putBack(Newline); 95 NoMoreInterestingTokens = true; 96 return false; 97 } 98 } 99 if (P.Tok.isNot(tok::text)) { 100 NoMoreInterestingTokens = true; 101 return false; 102 } 103 104 Toks.push_back(P.Tok); 105 P.consumeToken(); 106 if (Toks.size() == 1) 107 setupBuffer(); 108 return true; 109 } 110 111 static bool isWhitespace(char C) { 112 return C == ' ' || C == '\n' || C == '\r' || 113 C == '\t' || C == '\f' || C == '\v'; 114 } 115 116 void consumeWhitespace() { 117 while (!isEnd()) { 118 if (isWhitespace(peek())) 119 consumeChar(); 120 else 121 break; 122 } 123 } 124 125 void formTokenWithChars(Token &Result, 126 SourceLocation Loc, 127 const char *TokBegin, 128 unsigned TokLength, 129 StringRef Text) { 130 Result.setLocation(Loc); 131 Result.setKind(tok::text); 132 Result.setLength(TokLength); 133 #ifndef NDEBUG 134 Result.TextPtr1 = "<UNSET>"; 135 Result.TextLen1 = 7; 136 #endif 137 Result.setText(Text); 138 } 139 140 public: 141 TextTokenRetokenizer(llvm::BumpPtrAllocator &Allocator, Parser &P): 142 Allocator(Allocator), P(P), NoMoreInterestingTokens(false) { 143 Pos.CurToken = 0; 144 addToken(); 145 } 146 147 /// Extract a word -- sequence of non-whitespace characters. 148 bool lexWord(Token &Tok) { 149 if (isEnd()) 150 return false; 151 152 Position SavedPos = Pos; 153 154 consumeWhitespace(); 155 SmallString<32> WordText; 156 const char *WordBegin = Pos.BufferPtr; 157 SourceLocation Loc = getSourceLocation(); 158 while (!isEnd()) { 159 const char C = peek(); 160 if (!isWhitespace(C)) { 161 WordText.push_back(C); 162 consumeChar(); 163 } else 164 break; 165 } 166 const unsigned Length = WordText.size(); 167 if (Length == 0) { 168 Pos = SavedPos; 169 return false; 170 } 171 172 char *TextPtr = Allocator.Allocate<char>(Length + 1); 173 174 memcpy(TextPtr, WordText.c_str(), Length + 1); 175 StringRef Text = StringRef(TextPtr, Length); 176 177 formTokenWithChars(Tok, Loc, WordBegin, 178 Pos.BufferPtr - WordBegin, Text); 179 return true; 180 } 181 182 bool lexDelimitedSeq(Token &Tok, char OpenDelim, char CloseDelim) { 183 if (isEnd()) 184 return false; 185 186 Position SavedPos = Pos; 187 188 consumeWhitespace(); 189 SmallString<32> WordText; 190 const char *WordBegin = Pos.BufferPtr; 191 SourceLocation Loc = getSourceLocation(); 192 bool Error = false; 193 if (!isEnd()) { 194 const char C = peek(); 195 if (C == OpenDelim) { 196 WordText.push_back(C); 197 consumeChar(); 198 } else 199 Error = true; 200 } 201 char C = '\0'; 202 while (!Error && !isEnd()) { 203 C = peek(); 204 WordText.push_back(C); 205 consumeChar(); 206 if (C == CloseDelim) 207 break; 208 } 209 if (!Error && C != CloseDelim) 210 Error = true; 211 212 if (Error) { 213 Pos = SavedPos; 214 return false; 215 } 216 217 const unsigned Length = WordText.size(); 218 char *TextPtr = Allocator.Allocate<char>(Length + 1); 219 220 memcpy(TextPtr, WordText.c_str(), Length + 1); 221 StringRef Text = StringRef(TextPtr, Length); 222 223 formTokenWithChars(Tok, Loc, WordBegin, 224 Pos.BufferPtr - WordBegin, Text); 225 return true; 226 } 227 228 /// Put back tokens that we didn't consume. 229 void putBackLeftoverTokens() { 230 if (isEnd()) 231 return; 232 233 bool HavePartialTok = false; 234 Token PartialTok; 235 if (Pos.BufferPtr != Pos.BufferStart) { 236 formTokenWithChars(PartialTok, getSourceLocation(), 237 Pos.BufferPtr, Pos.BufferEnd - Pos.BufferPtr, 238 StringRef(Pos.BufferPtr, 239 Pos.BufferEnd - Pos.BufferPtr)); 240 HavePartialTok = true; 241 Pos.CurToken++; 242 } 243 244 P.putBack(llvm::makeArrayRef(Toks.begin() + Pos.CurToken, Toks.end())); 245 Pos.CurToken = Toks.size(); 246 247 if (HavePartialTok) 248 P.putBack(PartialTok); 249 } 250 }; 251 252 Parser::Parser(Lexer &L, Sema &S, llvm::BumpPtrAllocator &Allocator, 253 const SourceManager &SourceMgr, DiagnosticsEngine &Diags): 254 L(L), S(S), Allocator(Allocator), SourceMgr(SourceMgr), Diags(Diags) { 255 consumeToken(); 256 } 257 258 ParamCommandComment *Parser::parseParamCommandArgs( 259 ParamCommandComment *PC, 260 TextTokenRetokenizer &Retokenizer) { 261 Token Arg; 262 // Check if argument looks like direction specification: [dir] 263 // e.g., [in], [out], [in,out] 264 if (Retokenizer.lexDelimitedSeq(Arg, '[', ']')) 265 PC = S.actOnParamCommandDirectionArg(PC, 266 Arg.getLocation(), 267 Arg.getEndLocation(), 268 Arg.getText()); 269 270 if (Retokenizer.lexWord(Arg)) 271 PC = S.actOnParamCommandParamNameArg(PC, 272 Arg.getLocation(), 273 Arg.getEndLocation(), 274 Arg.getText()); 275 276 return PC; 277 } 278 279 BlockCommandComment *Parser::parseBlockCommandArgs( 280 BlockCommandComment *BC, 281 TextTokenRetokenizer &Retokenizer, 282 unsigned NumArgs) { 283 typedef BlockCommandComment::Argument Argument; 284 Argument *Args = 285 new (Allocator.Allocate<Argument>(NumArgs)) Argument[NumArgs]; 286 unsigned ParsedArgs = 0; 287 Token Arg; 288 while (ParsedArgs < NumArgs && Retokenizer.lexWord(Arg)) { 289 Args[ParsedArgs] = Argument(SourceRange(Arg.getLocation(), 290 Arg.getEndLocation()), 291 Arg.getText()); 292 ParsedArgs++; 293 } 294 295 return S.actOnBlockCommandArgs(BC, llvm::makeArrayRef(Args, ParsedArgs)); 296 } 297 298 BlockCommandComment *Parser::parseBlockCommand() { 299 assert(Tok.is(tok::command)); 300 301 ParamCommandComment *PC; 302 BlockCommandComment *BC; 303 bool IsParam = false; 304 unsigned NumArgs = 0; 305 if (S.isParamCommand(Tok.getCommandName())) { 306 IsParam = true; 307 PC = S.actOnParamCommandStart(Tok.getLocation(), 308 Tok.getEndLocation(), 309 Tok.getCommandName()); 310 } else { 311 NumArgs = S.getBlockCommandNumArgs(Tok.getCommandName()); 312 BC = S.actOnBlockCommandStart(Tok.getLocation(), 313 Tok.getEndLocation(), 314 Tok.getCommandName()); 315 } 316 consumeToken(); 317 318 if (Tok.is(tok::command) && S.isBlockCommand(Tok.getCommandName())) { 319 // Block command ahead. We can't nest block commands, so pretend that this 320 // command has an empty argument. 321 ParagraphComment *PC = S.actOnParagraphComment( 322 ArrayRef<InlineContentComment *>()); 323 return S.actOnBlockCommandFinish(BC, PC); 324 } 325 326 if (IsParam || NumArgs > 0) { 327 // In order to parse command arguments we need to retokenize a few 328 // following text tokens. 329 TextTokenRetokenizer Retokenizer(Allocator, *this); 330 331 if (IsParam) 332 PC = parseParamCommandArgs(PC, Retokenizer); 333 else 334 BC = parseBlockCommandArgs(BC, Retokenizer, NumArgs); 335 336 Retokenizer.putBackLeftoverTokens(); 337 } 338 339 BlockContentComment *Block = parseParagraphOrBlockCommand(); 340 // Since we have checked for a block command, we should have parsed a 341 // paragraph. 342 if (IsParam) 343 return S.actOnParamCommandFinish(PC, cast<ParagraphComment>(Block)); 344 else 345 return S.actOnBlockCommandFinish(BC, cast<ParagraphComment>(Block)); 346 } 347 348 InlineCommandComment *Parser::parseInlineCommand() { 349 assert(Tok.is(tok::command)); 350 351 const Token CommandTok = Tok; 352 consumeToken(); 353 354 TextTokenRetokenizer Retokenizer(Allocator, *this); 355 356 Token ArgTok; 357 bool ArgTokValid = Retokenizer.lexWord(ArgTok); 358 359 InlineCommandComment *IC; 360 if (ArgTokValid) { 361 IC = S.actOnInlineCommand(CommandTok.getLocation(), 362 CommandTok.getEndLocation(), 363 CommandTok.getCommandName(), 364 ArgTok.getLocation(), 365 ArgTok.getEndLocation(), 366 ArgTok.getText()); 367 } else { 368 IC = S.actOnInlineCommand(CommandTok.getLocation(), 369 CommandTok.getEndLocation(), 370 CommandTok.getCommandName()); 371 } 372 373 Retokenizer.putBackLeftoverTokens(); 374 375 return IC; 376 } 377 378 HTMLStartTagComment *Parser::parseHTMLStartTag() { 379 assert(Tok.is(tok::html_start_tag)); 380 HTMLStartTagComment *HST = 381 S.actOnHTMLStartTagStart(Tok.getLocation(), 382 Tok.getHTMLTagStartName()); 383 consumeToken(); 384 385 SmallVector<HTMLStartTagComment::Attribute, 2> Attrs; 386 while (true) { 387 switch (Tok.getKind()) { 388 case tok::html_ident: { 389 Token Ident = Tok; 390 consumeToken(); 391 if (Tok.isNot(tok::html_equals)) { 392 Attrs.push_back(HTMLStartTagComment::Attribute(Ident.getLocation(), 393 Ident.getHTMLIdent())); 394 continue; 395 } 396 Token Equals = Tok; 397 consumeToken(); 398 if (Tok.isNot(tok::html_quoted_string)) { 399 Diag(Tok.getLocation(), 400 diag::warn_doc_html_start_tag_expected_quoted_string) 401 << SourceRange(Equals.getLocation()); 402 Attrs.push_back(HTMLStartTagComment::Attribute(Ident.getLocation(), 403 Ident.getHTMLIdent())); 404 while (Tok.is(tok::html_equals) || 405 Tok.is(tok::html_quoted_string)) 406 consumeToken(); 407 continue; 408 } 409 Attrs.push_back(HTMLStartTagComment::Attribute( 410 Ident.getLocation(), 411 Ident.getHTMLIdent(), 412 Equals.getLocation(), 413 SourceRange(Tok.getLocation(), 414 Tok.getEndLocation()), 415 Tok.getHTMLQuotedString())); 416 consumeToken(); 417 continue; 418 } 419 420 case tok::html_greater: 421 HST = S.actOnHTMLStartTagFinish(HST, 422 copyArray(llvm::makeArrayRef(Attrs)), 423 Tok.getLocation(), 424 /* IsSelfClosing = */ false); 425 consumeToken(); 426 return HST; 427 428 case tok::html_slash_greater: 429 HST = S.actOnHTMLStartTagFinish(HST, 430 copyArray(llvm::makeArrayRef(Attrs)), 431 Tok.getLocation(), 432 /* IsSelfClosing = */ true); 433 consumeToken(); 434 return HST; 435 436 case tok::html_equals: 437 case tok::html_quoted_string: 438 Diag(Tok.getLocation(), 439 diag::warn_doc_html_start_tag_expected_ident_or_greater); 440 while (Tok.is(tok::html_equals) || 441 Tok.is(tok::html_quoted_string)) 442 consumeToken(); 443 if (Tok.is(tok::html_ident) || 444 Tok.is(tok::html_greater) || 445 Tok.is(tok::html_slash_greater)) 446 continue; 447 448 return S.actOnHTMLStartTagFinish(HST, 449 copyArray(llvm::makeArrayRef(Attrs)), 450 SourceLocation(), 451 /* IsSelfClosing = */ false); 452 453 default: 454 // Not a token from an HTML start tag. Thus HTML tag prematurely ended. 455 HST = S.actOnHTMLStartTagFinish(HST, 456 copyArray(llvm::makeArrayRef(Attrs)), 457 SourceLocation(), 458 /* IsSelfClosing = */ false); 459 bool StartLineInvalid; 460 const unsigned StartLine = SourceMgr.getPresumedLineNumber( 461 HST->getLocation(), 462 &StartLineInvalid); 463 bool EndLineInvalid; 464 const unsigned EndLine = SourceMgr.getPresumedLineNumber( 465 Tok.getLocation(), 466 &EndLineInvalid); 467 if (StartLineInvalid || EndLineInvalid || StartLine == EndLine) 468 Diag(Tok.getLocation(), 469 diag::warn_doc_html_start_tag_expected_ident_or_greater) 470 << HST->getSourceRange(); 471 else { 472 Diag(Tok.getLocation(), 473 diag::warn_doc_html_start_tag_expected_ident_or_greater); 474 Diag(HST->getLocation(), diag::note_doc_html_tag_started_here) 475 << HST->getSourceRange(); 476 } 477 return HST; 478 } 479 } 480 } 481 482 HTMLEndTagComment *Parser::parseHTMLEndTag() { 483 assert(Tok.is(tok::html_end_tag)); 484 Token TokEndTag = Tok; 485 consumeToken(); 486 SourceLocation Loc; 487 if (Tok.is(tok::html_greater)) { 488 Loc = Tok.getLocation(); 489 consumeToken(); 490 } 491 492 return S.actOnHTMLEndTag(TokEndTag.getLocation(), 493 Loc, 494 TokEndTag.getHTMLTagEndName()); 495 } 496 497 BlockContentComment *Parser::parseParagraphOrBlockCommand() { 498 SmallVector<InlineContentComment *, 8> Content; 499 500 while (true) { 501 switch (Tok.getKind()) { 502 case tok::verbatim_block_begin: 503 case tok::verbatim_line_name: 504 case tok::eof: 505 assert(Content.size() != 0); 506 break; // Block content or EOF ahead, finish this parapgaph. 507 508 case tok::command: 509 if (S.isBlockCommand(Tok.getCommandName())) { 510 if (Content.size() == 0) 511 return parseBlockCommand(); 512 break; // Block command ahead, finish this parapgaph. 513 } 514 if (S.isInlineCommand(Tok.getCommandName())) { 515 Content.push_back(parseInlineCommand()); 516 continue; 517 } 518 519 // Not a block command, not an inline command ==> an unknown command. 520 Content.push_back(S.actOnUnknownCommand(Tok.getLocation(), 521 Tok.getEndLocation(), 522 Tok.getCommandName())); 523 consumeToken(); 524 continue; 525 526 case tok::newline: { 527 consumeToken(); 528 if (Tok.is(tok::newline) || Tok.is(tok::eof)) { 529 consumeToken(); 530 break; // Two newlines -- end of paragraph. 531 } 532 if (Content.size() > 0) 533 Content.back()->addTrailingNewline(); 534 continue; 535 } 536 537 // Don't deal with HTML tag soup now. 538 case tok::html_start_tag: 539 Content.push_back(parseHTMLStartTag()); 540 continue; 541 542 case tok::html_end_tag: 543 Content.push_back(parseHTMLEndTag()); 544 continue; 545 546 case tok::text: 547 Content.push_back(S.actOnText(Tok.getLocation(), 548 Tok.getEndLocation(), 549 Tok.getText())); 550 consumeToken(); 551 continue; 552 553 case tok::verbatim_block_line: 554 case tok::verbatim_block_end: 555 case tok::verbatim_line_text: 556 case tok::html_ident: 557 case tok::html_equals: 558 case tok::html_quoted_string: 559 case tok::html_greater: 560 case tok::html_slash_greater: 561 llvm_unreachable("should not see this token"); 562 } 563 break; 564 } 565 566 return S.actOnParagraphComment(copyArray(llvm::makeArrayRef(Content))); 567 } 568 569 VerbatimBlockComment *Parser::parseVerbatimBlock() { 570 assert(Tok.is(tok::verbatim_block_begin)); 571 572 VerbatimBlockComment *VB = 573 S.actOnVerbatimBlockStart(Tok.getLocation(), 574 Tok.getVerbatimBlockName()); 575 consumeToken(); 576 577 // Don't create an empty line if verbatim opening command is followed 578 // by a newline. 579 if (Tok.is(tok::newline)) 580 consumeToken(); 581 582 SmallVector<VerbatimBlockLineComment *, 8> Lines; 583 while (Tok.is(tok::verbatim_block_line) || 584 Tok.is(tok::newline)) { 585 VerbatimBlockLineComment *Line; 586 if (Tok.is(tok::verbatim_block_line)) { 587 Line = S.actOnVerbatimBlockLine(Tok.getLocation(), 588 Tok.getVerbatimBlockText()); 589 consumeToken(); 590 if (Tok.is(tok::newline)) { 591 consumeToken(); 592 } 593 } else { 594 // Empty line, just a tok::newline. 595 Line = S.actOnVerbatimBlockLine(Tok.getLocation(), ""); 596 consumeToken(); 597 } 598 Lines.push_back(Line); 599 } 600 601 if (Tok.is(tok::verbatim_block_end)) { 602 VB = S.actOnVerbatimBlockFinish(VB, Tok.getLocation(), 603 Tok.getVerbatimBlockName(), 604 copyArray(llvm::makeArrayRef(Lines))); 605 consumeToken(); 606 } else { 607 // Unterminated \\verbatim block 608 VB = S.actOnVerbatimBlockFinish(VB, SourceLocation(), "", 609 copyArray(llvm::makeArrayRef(Lines))); 610 } 611 612 return VB; 613 } 614 615 VerbatimLineComment *Parser::parseVerbatimLine() { 616 assert(Tok.is(tok::verbatim_line_name)); 617 618 Token NameTok = Tok; 619 consumeToken(); 620 621 SourceLocation TextBegin; 622 StringRef Text; 623 // Next token might not be a tok::verbatim_line_text if verbatim line 624 // starting command comes just before a newline or comment end. 625 if (Tok.is(tok::verbatim_line_text)) { 626 TextBegin = Tok.getLocation(); 627 Text = Tok.getVerbatimLineText(); 628 } else { 629 TextBegin = NameTok.getEndLocation(); 630 Text = ""; 631 } 632 633 VerbatimLineComment *VL = S.actOnVerbatimLine(NameTok.getLocation(), 634 NameTok.getVerbatimLineName(), 635 TextBegin, 636 Text); 637 consumeToken(); 638 return VL; 639 } 640 641 BlockContentComment *Parser::parseBlockContent() { 642 switch (Tok.getKind()) { 643 case tok::text: 644 case tok::command: 645 case tok::html_start_tag: 646 case tok::html_end_tag: 647 return parseParagraphOrBlockCommand(); 648 649 case tok::verbatim_block_begin: 650 return parseVerbatimBlock(); 651 652 case tok::verbatim_line_name: 653 return parseVerbatimLine(); 654 655 case tok::eof: 656 case tok::newline: 657 case tok::verbatim_block_line: 658 case tok::verbatim_block_end: 659 case tok::verbatim_line_text: 660 case tok::html_ident: 661 case tok::html_equals: 662 case tok::html_quoted_string: 663 case tok::html_greater: 664 case tok::html_slash_greater: 665 llvm_unreachable("should not see this token"); 666 } 667 llvm_unreachable("bogus token kind"); 668 } 669 670 FullComment *Parser::parseFullComment() { 671 // Skip newlines at the beginning of the comment. 672 while (Tok.is(tok::newline)) 673 consumeToken(); 674 675 SmallVector<BlockContentComment *, 8> Blocks; 676 while (Tok.isNot(tok::eof)) { 677 Blocks.push_back(parseBlockContent()); 678 679 // Skip extra newlines after paragraph end. 680 while (Tok.is(tok::newline)) 681 consumeToken(); 682 } 683 return S.actOnFullComment(copyArray(llvm::makeArrayRef(Blocks))); 684 } 685 686 } // end namespace comments 687 } // end namespace clang 688