xref: /llvm-project/clang/lib/AST/CommentParser.cpp (revision 35b0c09b6cd99b5e673a454b424af5ca10f1c045)
1 //===--- CommentParser.cpp - Doxygen comment parser -----------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 
10 #include "clang/AST/CommentParser.h"
11 #include "clang/AST/CommentSema.h"
12 #include "clang/AST/CommentDiagnostic.h"
13 #include "clang/Basic/SourceManager.h"
14 #include "llvm/Support/ErrorHandling.h"
15 
16 namespace clang {
17 namespace comments {
18 
19 /// Re-lexes a sequence of tok::text tokens.
20 class TextTokenRetokenizer {
21   llvm::BumpPtrAllocator &Allocator;
22   Parser &P;
23 
24   /// This flag is set when there are no more tokens we can fetch from lexer.
25   bool NoMoreInterestingTokens;
26 
27   /// Token buffer: tokens we have processed and lookahead.
28   SmallVector<Token, 16> Toks;
29 
30   /// A position in \c Toks.
31   struct Position {
32     unsigned CurToken;
33     const char *BufferStart;
34     const char *BufferEnd;
35     const char *BufferPtr;
36     SourceLocation BufferStartLoc;
37   };
38 
39   /// Current position in Toks.
40   Position Pos;
41 
42   bool isEnd() const {
43     return Pos.CurToken >= Toks.size();
44   }
45 
46   /// Sets up the buffer pointers to point to current token.
47   void setupBuffer() {
48     assert(!isEnd());
49     const Token &Tok = Toks[Pos.CurToken];
50 
51     Pos.BufferStart = Tok.getText().begin();
52     Pos.BufferEnd = Tok.getText().end();
53     Pos.BufferPtr = Pos.BufferStart;
54     Pos.BufferStartLoc = Tok.getLocation();
55   }
56 
57   SourceLocation getSourceLocation() const {
58     const unsigned CharNo = Pos.BufferPtr - Pos.BufferStart;
59     return Pos.BufferStartLoc.getLocWithOffset(CharNo);
60   }
61 
62   char peek() const {
63     assert(!isEnd());
64     assert(Pos.BufferPtr != Pos.BufferEnd);
65     return *Pos.BufferPtr;
66   }
67 
68   void consumeChar() {
69     assert(!isEnd());
70     assert(Pos.BufferPtr != Pos.BufferEnd);
71     Pos.BufferPtr++;
72     if (Pos.BufferPtr == Pos.BufferEnd) {
73       Pos.CurToken++;
74       if (isEnd() && !addToken())
75         return;
76 
77       assert(!isEnd());
78       setupBuffer();
79     }
80   }
81 
82   /// Add a token.
83   /// Returns true on success, false if there are no interesting tokens to
84   /// fetch from lexer.
85   bool addToken() {
86     if (NoMoreInterestingTokens)
87       return false;
88 
89     if (P.Tok.is(tok::newline)) {
90       // If we see a single newline token between text tokens, skip it.
91       Token Newline = P.Tok;
92       P.consumeToken();
93       if (P.Tok.isNot(tok::text)) {
94         P.putBack(Newline);
95         NoMoreInterestingTokens = true;
96         return false;
97       }
98     }
99     if (P.Tok.isNot(tok::text)) {
100       NoMoreInterestingTokens = true;
101       return false;
102     }
103 
104     Toks.push_back(P.Tok);
105     P.consumeToken();
106     if (Toks.size() == 1)
107       setupBuffer();
108     return true;
109   }
110 
111   static bool isWhitespace(char C) {
112     return C == ' ' || C == '\n' || C == '\r' ||
113            C == '\t' || C == '\f' || C == '\v';
114   }
115 
116   void consumeWhitespace() {
117     while (!isEnd()) {
118       if (isWhitespace(peek()))
119         consumeChar();
120       else
121         break;
122     }
123   }
124 
125   void formTokenWithChars(Token &Result,
126                           SourceLocation Loc,
127                           const char *TokBegin,
128                           unsigned TokLength,
129                           StringRef Text) {
130     Result.setLocation(Loc);
131     Result.setKind(tok::text);
132     Result.setLength(TokLength);
133 #ifndef NDEBUG
134     Result.TextPtr1 = "<UNSET>";
135     Result.TextLen1 = 7;
136 #endif
137     Result.setText(Text);
138   }
139 
140 public:
141   TextTokenRetokenizer(llvm::BumpPtrAllocator &Allocator, Parser &P):
142       Allocator(Allocator), P(P), NoMoreInterestingTokens(false) {
143     Pos.CurToken = 0;
144     addToken();
145   }
146 
147   /// Extract a word -- sequence of non-whitespace characters.
148   bool lexWord(Token &Tok) {
149     if (isEnd())
150       return false;
151 
152     Position SavedPos = Pos;
153 
154     consumeWhitespace();
155     SmallString<32> WordText;
156     const char *WordBegin = Pos.BufferPtr;
157     SourceLocation Loc = getSourceLocation();
158     while (!isEnd()) {
159       const char C = peek();
160       if (!isWhitespace(C)) {
161         WordText.push_back(C);
162         consumeChar();
163       } else
164         break;
165     }
166     const unsigned Length = WordText.size();
167     if (Length == 0) {
168       Pos = SavedPos;
169       return false;
170     }
171 
172     char *TextPtr = Allocator.Allocate<char>(Length + 1);
173 
174     memcpy(TextPtr, WordText.c_str(), Length + 1);
175     StringRef Text = StringRef(TextPtr, Length);
176 
177     formTokenWithChars(Tok, Loc, WordBegin,
178                        Pos.BufferPtr - WordBegin, Text);
179     return true;
180   }
181 
182   bool lexDelimitedSeq(Token &Tok, char OpenDelim, char CloseDelim) {
183     if (isEnd())
184       return false;
185 
186     Position SavedPos = Pos;
187 
188     consumeWhitespace();
189     SmallString<32> WordText;
190     const char *WordBegin = Pos.BufferPtr;
191     SourceLocation Loc = getSourceLocation();
192     bool Error = false;
193     if (!isEnd()) {
194       const char C = peek();
195       if (C == OpenDelim) {
196         WordText.push_back(C);
197         consumeChar();
198       } else
199         Error = true;
200     }
201     char C = '\0';
202     while (!Error && !isEnd()) {
203       C = peek();
204       WordText.push_back(C);
205       consumeChar();
206       if (C == CloseDelim)
207         break;
208     }
209     if (!Error && C != CloseDelim)
210       Error = true;
211 
212     if (Error) {
213       Pos = SavedPos;
214       return false;
215     }
216 
217     const unsigned Length = WordText.size();
218     char *TextPtr = Allocator.Allocate<char>(Length + 1);
219 
220     memcpy(TextPtr, WordText.c_str(), Length + 1);
221     StringRef Text = StringRef(TextPtr, Length);
222 
223     formTokenWithChars(Tok, Loc, WordBegin,
224                        Pos.BufferPtr - WordBegin, Text);
225     return true;
226   }
227 
228   /// Put back tokens that we didn't consume.
229   void putBackLeftoverTokens() {
230     if (isEnd())
231       return;
232 
233     bool HavePartialTok = false;
234     Token PartialTok;
235     if (Pos.BufferPtr != Pos.BufferStart) {
236       formTokenWithChars(PartialTok, getSourceLocation(),
237                          Pos.BufferPtr, Pos.BufferEnd - Pos.BufferPtr,
238                          StringRef(Pos.BufferPtr,
239                                    Pos.BufferEnd - Pos.BufferPtr));
240       HavePartialTok = true;
241       Pos.CurToken++;
242     }
243 
244     P.putBack(llvm::makeArrayRef(Toks.begin() + Pos.CurToken, Toks.end()));
245     Pos.CurToken = Toks.size();
246 
247     if (HavePartialTok)
248       P.putBack(PartialTok);
249   }
250 };
251 
252 Parser::Parser(Lexer &L, Sema &S, llvm::BumpPtrAllocator &Allocator,
253                const SourceManager &SourceMgr, DiagnosticsEngine &Diags):
254     L(L), S(S), Allocator(Allocator), SourceMgr(SourceMgr), Diags(Diags) {
255   consumeToken();
256 }
257 
258 ParamCommandComment *Parser::parseParamCommandArgs(
259     ParamCommandComment *PC,
260     TextTokenRetokenizer &Retokenizer) {
261   Token Arg;
262   // Check if argument looks like direction specification: [dir]
263   // e.g., [in], [out], [in,out]
264   if (Retokenizer.lexDelimitedSeq(Arg, '[', ']'))
265     PC = S.actOnParamCommandDirectionArg(PC,
266                                          Arg.getLocation(),
267                                          Arg.getEndLocation(),
268                                          Arg.getText());
269 
270   if (Retokenizer.lexWord(Arg))
271     PC = S.actOnParamCommandParamNameArg(PC,
272                                          Arg.getLocation(),
273                                          Arg.getEndLocation(),
274                                          Arg.getText());
275 
276   return PC;
277 }
278 
279 BlockCommandComment *Parser::parseBlockCommandArgs(
280     BlockCommandComment *BC,
281     TextTokenRetokenizer &Retokenizer,
282     unsigned NumArgs) {
283   typedef BlockCommandComment::Argument Argument;
284   Argument *Args =
285       new (Allocator.Allocate<Argument>(NumArgs)) Argument[NumArgs];
286   unsigned ParsedArgs = 0;
287   Token Arg;
288   while (ParsedArgs < NumArgs && Retokenizer.lexWord(Arg)) {
289     Args[ParsedArgs] = Argument(SourceRange(Arg.getLocation(),
290                                             Arg.getEndLocation()),
291                                 Arg.getText());
292     ParsedArgs++;
293   }
294 
295   return S.actOnBlockCommandArgs(BC, llvm::makeArrayRef(Args, ParsedArgs));
296 }
297 
298 BlockCommandComment *Parser::parseBlockCommand() {
299   assert(Tok.is(tok::command));
300 
301   ParamCommandComment *PC;
302   BlockCommandComment *BC;
303   bool IsParam = false;
304   unsigned NumArgs = 0;
305   if (S.isParamCommand(Tok.getCommandName())) {
306     IsParam = true;
307     PC = S.actOnParamCommandStart(Tok.getLocation(),
308                                   Tok.getEndLocation(),
309                                   Tok.getCommandName());
310   } else {
311     NumArgs = S.getBlockCommandNumArgs(Tok.getCommandName());
312     BC = S.actOnBlockCommandStart(Tok.getLocation(),
313                                   Tok.getEndLocation(),
314                                   Tok.getCommandName());
315   }
316   consumeToken();
317 
318   if (Tok.is(tok::command) && S.isBlockCommand(Tok.getCommandName())) {
319     // Block command ahead.  We can't nest block commands, so pretend that this
320     // command has an empty argument.
321     ParagraphComment *PC = S.actOnParagraphComment(
322                                 ArrayRef<InlineContentComment *>());
323     return S.actOnBlockCommandFinish(BC, PC);
324   }
325 
326   if (IsParam || NumArgs > 0) {
327     // In order to parse command arguments we need to retokenize a few
328     // following text tokens.
329     TextTokenRetokenizer Retokenizer(Allocator, *this);
330 
331     if (IsParam)
332       PC = parseParamCommandArgs(PC, Retokenizer);
333     else
334       BC = parseBlockCommandArgs(BC, Retokenizer, NumArgs);
335 
336     Retokenizer.putBackLeftoverTokens();
337   }
338 
339   BlockContentComment *Block = parseParagraphOrBlockCommand();
340   // Since we have checked for a block command, we should have parsed a
341   // paragraph.
342   if (IsParam)
343     return S.actOnParamCommandFinish(PC, cast<ParagraphComment>(Block));
344   else
345     return S.actOnBlockCommandFinish(BC, cast<ParagraphComment>(Block));
346 }
347 
348 InlineCommandComment *Parser::parseInlineCommand() {
349   assert(Tok.is(tok::command));
350 
351   const Token CommandTok = Tok;
352   consumeToken();
353 
354   TextTokenRetokenizer Retokenizer(Allocator, *this);
355 
356   Token ArgTok;
357   bool ArgTokValid = Retokenizer.lexWord(ArgTok);
358 
359   InlineCommandComment *IC;
360   if (ArgTokValid) {
361     IC = S.actOnInlineCommand(CommandTok.getLocation(),
362                               CommandTok.getEndLocation(),
363                               CommandTok.getCommandName(),
364                               ArgTok.getLocation(),
365                               ArgTok.getEndLocation(),
366                               ArgTok.getText());
367   } else {
368     IC = S.actOnInlineCommand(CommandTok.getLocation(),
369                               CommandTok.getEndLocation(),
370                               CommandTok.getCommandName());
371   }
372 
373   Retokenizer.putBackLeftoverTokens();
374 
375   return IC;
376 }
377 
378 HTMLStartTagComment *Parser::parseHTMLStartTag() {
379   assert(Tok.is(tok::html_start_tag));
380   HTMLStartTagComment *HST =
381       S.actOnHTMLStartTagStart(Tok.getLocation(),
382                                Tok.getHTMLTagStartName());
383   consumeToken();
384 
385   SmallVector<HTMLStartTagComment::Attribute, 2> Attrs;
386   while (true) {
387     switch (Tok.getKind()) {
388     case tok::html_ident: {
389       Token Ident = Tok;
390       consumeToken();
391       if (Tok.isNot(tok::html_equals)) {
392         Attrs.push_back(HTMLStartTagComment::Attribute(Ident.getLocation(),
393                                                        Ident.getHTMLIdent()));
394         continue;
395       }
396       Token Equals = Tok;
397       consumeToken();
398       if (Tok.isNot(tok::html_quoted_string)) {
399         Diag(Tok.getLocation(),
400              diag::warn_doc_html_start_tag_expected_quoted_string)
401           << SourceRange(Equals.getLocation());
402         Attrs.push_back(HTMLStartTagComment::Attribute(Ident.getLocation(),
403                                                        Ident.getHTMLIdent()));
404         while (Tok.is(tok::html_equals) ||
405                Tok.is(tok::html_quoted_string))
406           consumeToken();
407         continue;
408       }
409       Attrs.push_back(HTMLStartTagComment::Attribute(
410                               Ident.getLocation(),
411                               Ident.getHTMLIdent(),
412                               Equals.getLocation(),
413                               SourceRange(Tok.getLocation(),
414                                           Tok.getEndLocation()),
415                               Tok.getHTMLQuotedString()));
416       consumeToken();
417       continue;
418     }
419 
420     case tok::html_greater:
421       HST = S.actOnHTMLStartTagFinish(HST,
422                                       copyArray(llvm::makeArrayRef(Attrs)),
423                                       Tok.getLocation(),
424                                       /* IsSelfClosing = */ false);
425       consumeToken();
426       return HST;
427 
428     case tok::html_slash_greater:
429       HST = S.actOnHTMLStartTagFinish(HST,
430                                       copyArray(llvm::makeArrayRef(Attrs)),
431                                       Tok.getLocation(),
432                                       /* IsSelfClosing = */ true);
433       consumeToken();
434       return HST;
435 
436     case tok::html_equals:
437     case tok::html_quoted_string:
438       Diag(Tok.getLocation(),
439            diag::warn_doc_html_start_tag_expected_ident_or_greater);
440       while (Tok.is(tok::html_equals) ||
441              Tok.is(tok::html_quoted_string))
442         consumeToken();
443       if (Tok.is(tok::html_ident) ||
444           Tok.is(tok::html_greater) ||
445           Tok.is(tok::html_slash_greater))
446         continue;
447 
448       return S.actOnHTMLStartTagFinish(HST,
449                                        copyArray(llvm::makeArrayRef(Attrs)),
450                                        SourceLocation(),
451                                        /* IsSelfClosing = */ false);
452 
453     default:
454       // Not a token from an HTML start tag.  Thus HTML tag prematurely ended.
455       HST = S.actOnHTMLStartTagFinish(HST,
456                                       copyArray(llvm::makeArrayRef(Attrs)),
457                                       SourceLocation(),
458                                       /* IsSelfClosing = */ false);
459       bool StartLineInvalid;
460       const unsigned StartLine = SourceMgr.getPresumedLineNumber(
461                                                   HST->getLocation(),
462                                                   &StartLineInvalid);
463       bool EndLineInvalid;
464       const unsigned EndLine = SourceMgr.getPresumedLineNumber(
465                                                   Tok.getLocation(),
466                                                   &EndLineInvalid);
467       if (StartLineInvalid || EndLineInvalid || StartLine == EndLine)
468         Diag(Tok.getLocation(),
469              diag::warn_doc_html_start_tag_expected_ident_or_greater)
470           << HST->getSourceRange();
471       else {
472         Diag(Tok.getLocation(),
473              diag::warn_doc_html_start_tag_expected_ident_or_greater);
474         Diag(HST->getLocation(), diag::note_doc_html_tag_started_here)
475           << HST->getSourceRange();
476       }
477       return HST;
478     }
479   }
480 }
481 
482 HTMLEndTagComment *Parser::parseHTMLEndTag() {
483   assert(Tok.is(tok::html_end_tag));
484   Token TokEndTag = Tok;
485   consumeToken();
486   SourceLocation Loc;
487   if (Tok.is(tok::html_greater)) {
488     Loc = Tok.getLocation();
489     consumeToken();
490   }
491 
492   return S.actOnHTMLEndTag(TokEndTag.getLocation(),
493                            Loc,
494                            TokEndTag.getHTMLTagEndName());
495 }
496 
497 BlockContentComment *Parser::parseParagraphOrBlockCommand() {
498   SmallVector<InlineContentComment *, 8> Content;
499 
500   while (true) {
501     switch (Tok.getKind()) {
502     case tok::verbatim_block_begin:
503     case tok::verbatim_line_name:
504     case tok::eof:
505       assert(Content.size() != 0);
506       break; // Block content or EOF ahead, finish this parapgaph.
507 
508     case tok::command:
509       if (S.isBlockCommand(Tok.getCommandName())) {
510         if (Content.size() == 0)
511           return parseBlockCommand();
512         break; // Block command ahead, finish this parapgaph.
513       }
514       if (S.isInlineCommand(Tok.getCommandName())) {
515         Content.push_back(parseInlineCommand());
516         continue;
517       }
518 
519       // Not a block command, not an inline command ==> an unknown command.
520       Content.push_back(S.actOnUnknownCommand(Tok.getLocation(),
521                                               Tok.getEndLocation(),
522                                               Tok.getCommandName()));
523       consumeToken();
524       continue;
525 
526     case tok::newline: {
527       consumeToken();
528       if (Tok.is(tok::newline) || Tok.is(tok::eof)) {
529         consumeToken();
530         break; // Two newlines -- end of paragraph.
531       }
532       if (Content.size() > 0)
533         Content.back()->addTrailingNewline();
534       continue;
535     }
536 
537     // Don't deal with HTML tag soup now.
538     case tok::html_start_tag:
539       Content.push_back(parseHTMLStartTag());
540       continue;
541 
542     case tok::html_end_tag:
543       Content.push_back(parseHTMLEndTag());
544       continue;
545 
546     case tok::text:
547       Content.push_back(S.actOnText(Tok.getLocation(),
548                                     Tok.getEndLocation(),
549                                     Tok.getText()));
550       consumeToken();
551       continue;
552 
553     case tok::verbatim_block_line:
554     case tok::verbatim_block_end:
555     case tok::verbatim_line_text:
556     case tok::html_ident:
557     case tok::html_equals:
558     case tok::html_quoted_string:
559     case tok::html_greater:
560     case tok::html_slash_greater:
561       llvm_unreachable("should not see this token");
562     }
563     break;
564   }
565 
566   return S.actOnParagraphComment(copyArray(llvm::makeArrayRef(Content)));
567 }
568 
569 VerbatimBlockComment *Parser::parseVerbatimBlock() {
570   assert(Tok.is(tok::verbatim_block_begin));
571 
572   VerbatimBlockComment *VB =
573       S.actOnVerbatimBlockStart(Tok.getLocation(),
574                                 Tok.getVerbatimBlockName());
575   consumeToken();
576 
577   // Don't create an empty line if verbatim opening command is followed
578   // by a newline.
579   if (Tok.is(tok::newline))
580     consumeToken();
581 
582   SmallVector<VerbatimBlockLineComment *, 8> Lines;
583   while (Tok.is(tok::verbatim_block_line) ||
584          Tok.is(tok::newline)) {
585     VerbatimBlockLineComment *Line;
586     if (Tok.is(tok::verbatim_block_line)) {
587       Line = S.actOnVerbatimBlockLine(Tok.getLocation(),
588                                       Tok.getVerbatimBlockText());
589       consumeToken();
590       if (Tok.is(tok::newline)) {
591         consumeToken();
592       }
593     } else {
594       // Empty line, just a tok::newline.
595       Line = S.actOnVerbatimBlockLine(Tok.getLocation(), "");
596       consumeToken();
597     }
598     Lines.push_back(Line);
599   }
600 
601   if (Tok.is(tok::verbatim_block_end)) {
602     VB = S.actOnVerbatimBlockFinish(VB, Tok.getLocation(),
603                                     Tok.getVerbatimBlockName(),
604                                     copyArray(llvm::makeArrayRef(Lines)));
605     consumeToken();
606   } else {
607     // Unterminated \\verbatim block
608     VB = S.actOnVerbatimBlockFinish(VB, SourceLocation(), "",
609                                     copyArray(llvm::makeArrayRef(Lines)));
610   }
611 
612   return VB;
613 }
614 
615 VerbatimLineComment *Parser::parseVerbatimLine() {
616   assert(Tok.is(tok::verbatim_line_name));
617 
618   Token NameTok = Tok;
619   consumeToken();
620 
621   SourceLocation TextBegin;
622   StringRef Text;
623   // Next token might not be a tok::verbatim_line_text if verbatim line
624   // starting command comes just before a newline or comment end.
625   if (Tok.is(tok::verbatim_line_text)) {
626     TextBegin = Tok.getLocation();
627     Text = Tok.getVerbatimLineText();
628   } else {
629     TextBegin = NameTok.getEndLocation();
630     Text = "";
631   }
632 
633   VerbatimLineComment *VL = S.actOnVerbatimLine(NameTok.getLocation(),
634                                                 NameTok.getVerbatimLineName(),
635                                                 TextBegin,
636                                                 Text);
637   consumeToken();
638   return VL;
639 }
640 
641 BlockContentComment *Parser::parseBlockContent() {
642   switch (Tok.getKind()) {
643   case tok::text:
644   case tok::command:
645   case tok::html_start_tag:
646   case tok::html_end_tag:
647     return parseParagraphOrBlockCommand();
648 
649   case tok::verbatim_block_begin:
650     return parseVerbatimBlock();
651 
652   case tok::verbatim_line_name:
653     return parseVerbatimLine();
654 
655   case tok::eof:
656   case tok::newline:
657   case tok::verbatim_block_line:
658   case tok::verbatim_block_end:
659   case tok::verbatim_line_text:
660   case tok::html_ident:
661   case tok::html_equals:
662   case tok::html_quoted_string:
663   case tok::html_greater:
664   case tok::html_slash_greater:
665     llvm_unreachable("should not see this token");
666   }
667   llvm_unreachable("bogus token kind");
668 }
669 
670 FullComment *Parser::parseFullComment() {
671   // Skip newlines at the beginning of the comment.
672   while (Tok.is(tok::newline))
673     consumeToken();
674 
675   SmallVector<BlockContentComment *, 8> Blocks;
676   while (Tok.isNot(tok::eof)) {
677     Blocks.push_back(parseBlockContent());
678 
679     // Skip extra newlines after paragraph end.
680     while (Tok.is(tok::newline))
681       consumeToken();
682   }
683   return S.actOnFullComment(copyArray(llvm::makeArrayRef(Blocks)));
684 }
685 
686 } // end namespace comments
687 } // end namespace clang
688