xref: /llvm-project/clang/lib/AST/CommentParser.cpp (revision 1bfd9dadda07b77119e8804bd6a3dc798f98f3f7)
1 //===--- CommentParser.cpp - Doxygen comment parser -----------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 
10 #include "clang/AST/CommentParser.h"
11 #include "clang/AST/CommentSema.h"
12 #include "clang/AST/CommentDiagnostic.h"
13 #include "clang/Basic/SourceManager.h"
14 #include "llvm/Support/ErrorHandling.h"
15 
16 namespace clang {
17 namespace comments {
18 
19 /// Re-lexes a sequence of tok::text tokens.
20 class TextTokenRetokenizer {
21   llvm::BumpPtrAllocator &Allocator;
22   static const unsigned MaxTokens = 16;
23   SmallVector<Token, MaxTokens> Toks;
24 
25   struct Position {
26     unsigned CurToken;
27     const char *BufferStart;
28     const char *BufferEnd;
29     const char *BufferPtr;
30     SourceLocation BufferStartLoc;
31   };
32 
33   /// Current position in Toks.
34   Position Pos;
35 
36   bool isEnd() const {
37     return Pos.CurToken >= Toks.size();
38   }
39 
40   /// Sets up the buffer pointers to point to current token.
41   void setupBuffer() {
42     assert(Pos.CurToken < Toks.size());
43     const Token &Tok = Toks[Pos.CurToken];
44 
45     Pos.BufferStart = Tok.getText().begin();
46     Pos.BufferEnd = Tok.getText().end();
47     Pos.BufferPtr = Pos.BufferStart;
48     Pos.BufferStartLoc = Tok.getLocation();
49   }
50 
51   SourceLocation getSourceLocation() const {
52     const unsigned CharNo = Pos.BufferPtr - Pos.BufferStart;
53     return Pos.BufferStartLoc.getLocWithOffset(CharNo);
54   }
55 
56   char peek() const {
57     assert(!isEnd());
58     assert(Pos.BufferPtr != Pos.BufferEnd);
59     return *Pos.BufferPtr;
60   }
61 
62   void consumeChar() {
63     assert(!isEnd());
64     assert(Pos.BufferPtr != Pos.BufferEnd);
65     Pos.BufferPtr++;
66     if (Pos.BufferPtr == Pos.BufferEnd) {
67       Pos.CurToken++;
68       if (Pos.CurToken < Toks.size())
69         setupBuffer();
70     }
71   }
72 
73   static bool isWhitespace(char C) {
74     return C == ' ' || C == '\n' || C == '\r' ||
75            C == '\t' || C == '\f' || C == '\v';
76   }
77 
78   void consumeWhitespace() {
79     while (!isEnd()) {
80       if (isWhitespace(peek()))
81         consumeChar();
82       else
83         break;
84     }
85   }
86 
87   void formTokenWithChars(Token &Result,
88                           SourceLocation Loc,
89                           const char *TokBegin,
90                           unsigned TokLength,
91                           StringRef Text) {
92     Result.setLocation(Loc);
93     Result.setKind(tok::text);
94     Result.setLength(TokLength);
95 #ifndef NDEBUG
96     Result.TextPtr1 = "<UNSET>";
97     Result.TextLen1 = 7;
98 #endif
99     Result.setText(Text);
100   }
101 
102 public:
103   TextTokenRetokenizer(llvm::BumpPtrAllocator &Allocator):
104       Allocator(Allocator) {
105     Pos.CurToken = 0;
106   }
107 
108   /// Add a token.
109   /// Returns true on success, false if it seems like we have enough tokens.
110   bool addToken(const Token &Tok) {
111     assert(Tok.is(tok::text));
112     if (Toks.size() >= MaxTokens)
113       return false;
114 
115     Toks.push_back(Tok);
116     if (Toks.size() == 1)
117       setupBuffer();
118     return true;
119   }
120 
121   /// Extract a word -- sequence of non-whitespace characters.
122   bool lexWord(Token &Tok) {
123     if (isEnd())
124       return false;
125 
126     Position SavedPos = Pos;
127 
128     consumeWhitespace();
129     SmallString<32> WordText;
130     const char *WordBegin = Pos.BufferPtr;
131     SourceLocation Loc = getSourceLocation();
132     while (!isEnd()) {
133       const char C = peek();
134       if (!isWhitespace(C)) {
135         WordText.push_back(C);
136         consumeChar();
137       } else
138         break;
139     }
140     const unsigned Length = WordText.size();
141     if (Length == 0) {
142       Pos = SavedPos;
143       return false;
144     }
145 
146     char *TextPtr = Allocator.Allocate<char>(Length + 1);
147 
148     memcpy(TextPtr, WordText.c_str(), Length + 1);
149     StringRef Text = StringRef(TextPtr, Length);
150 
151     formTokenWithChars(Tok, Loc, WordBegin,
152                        Pos.BufferPtr - WordBegin, Text);
153     return true;
154   }
155 
156   bool lexDelimitedSeq(Token &Tok, char OpenDelim, char CloseDelim) {
157     if (isEnd())
158       return false;
159 
160     Position SavedPos = Pos;
161 
162     consumeWhitespace();
163     SmallString<32> WordText;
164     const char *WordBegin = Pos.BufferPtr;
165     SourceLocation Loc = getSourceLocation();
166     bool Error = false;
167     if (!isEnd()) {
168       const char C = peek();
169       if (C == OpenDelim) {
170         WordText.push_back(C);
171         consumeChar();
172       } else
173         Error = true;
174     }
175     char C = '\0';
176     while (!Error && !isEnd()) {
177       C = peek();
178       WordText.push_back(C);
179       consumeChar();
180       if (C == CloseDelim)
181         break;
182     }
183     if (!Error && C != CloseDelim)
184       Error = true;
185 
186     if (Error) {
187       Pos = SavedPos;
188       return false;
189     }
190 
191     const unsigned Length = WordText.size();
192     char *TextPtr = Allocator.Allocate<char>(Length + 1);
193 
194     memcpy(TextPtr, WordText.c_str(), Length + 1);
195     StringRef Text = StringRef(TextPtr, Length);
196 
197     formTokenWithChars(Tok, Loc, WordBegin,
198                        Pos.BufferPtr - WordBegin, Text);
199     return true;
200   }
201 
202   /// Return a text token.  Useful to take tokens back.
203   bool lexText(Token &Tok) {
204     if (isEnd())
205       return false;
206 
207     if (Pos.BufferPtr != Pos.BufferStart)
208       formTokenWithChars(Tok, getSourceLocation(),
209                          Pos.BufferPtr, Pos.BufferEnd - Pos.BufferPtr,
210                          StringRef(Pos.BufferPtr,
211                                    Pos.BufferEnd - Pos.BufferPtr));
212     else
213       Tok = Toks[Pos.CurToken];
214 
215     Pos.CurToken++;
216     if (Pos.CurToken < Toks.size())
217       setupBuffer();
218     return true;
219   }
220 };
221 
222 Parser::Parser(Lexer &L, Sema &S, llvm::BumpPtrAllocator &Allocator,
223                const SourceManager &SourceMgr, DiagnosticsEngine &Diags):
224     L(L), S(S), Allocator(Allocator), SourceMgr(SourceMgr), Diags(Diags) {
225   consumeToken();
226 }
227 
228 ParamCommandComment *Parser::parseParamCommandArgs(
229     ParamCommandComment *PC,
230     TextTokenRetokenizer &Retokenizer) {
231   Token Arg;
232   // Check if argument looks like direction specification: [dir]
233   // e.g., [in], [out], [in,out]
234   if (Retokenizer.lexDelimitedSeq(Arg, '[', ']'))
235     PC = S.actOnParamCommandDirectionArg(PC,
236                                          Arg.getLocation(),
237                                          Arg.getEndLocation(),
238                                          Arg.getText());
239 
240   if (Retokenizer.lexWord(Arg))
241     PC = S.actOnParamCommandParamNameArg(PC,
242                                          Arg.getLocation(),
243                                          Arg.getEndLocation(),
244                                          Arg.getText());
245 
246   return PC;
247 }
248 
249 BlockCommandComment *Parser::parseBlockCommandArgs(
250     BlockCommandComment *BC,
251     TextTokenRetokenizer &Retokenizer,
252     unsigned NumArgs) {
253   typedef BlockCommandComment::Argument Argument;
254   Argument *Args =
255       new (Allocator.Allocate<Argument>(NumArgs)) Argument[NumArgs];
256   unsigned ParsedArgs = 0;
257   Token Arg;
258   while (ParsedArgs < NumArgs && Retokenizer.lexWord(Arg)) {
259     Args[ParsedArgs] = Argument(SourceRange(Arg.getLocation(),
260                                             Arg.getEndLocation()),
261                                 Arg.getText());
262     ParsedArgs++;
263   }
264 
265   return S.actOnBlockCommandArgs(BC, llvm::makeArrayRef(Args, ParsedArgs));
266 }
267 
268 BlockCommandComment *Parser::parseBlockCommand() {
269   assert(Tok.is(tok::command));
270 
271   ParamCommandComment *PC;
272   BlockCommandComment *BC;
273   bool IsParam = false;
274   unsigned NumArgs = 0;
275   if (S.isParamCommand(Tok.getCommandName())) {
276     IsParam = true;
277     PC = S.actOnParamCommandStart(Tok.getLocation(),
278                                   Tok.getEndLocation(),
279                                   Tok.getCommandName());
280   } else {
281     NumArgs = S.getBlockCommandNumArgs(Tok.getCommandName());
282     BC = S.actOnBlockCommandStart(Tok.getLocation(),
283                                   Tok.getEndLocation(),
284                                   Tok.getCommandName());
285   }
286   consumeToken();
287 
288   if (Tok.is(tok::command) && S.isBlockCommand(Tok.getCommandName())) {
289     // Block command ahead.  We can't nest block commands, so pretend that this
290     // command has an empty argument.
291     ParagraphComment *PC = S.actOnParagraphComment(
292                                 ArrayRef<InlineContentComment *>());
293     return S.actOnBlockCommandFinish(BC, PC);
294   }
295 
296   if (IsParam || NumArgs > 0) {
297     // In order to parse command arguments we need to retokenize a few
298     // following text tokens.
299     TextTokenRetokenizer Retokenizer(Allocator);
300     while (Tok.is(tok::text)) {
301       if (Retokenizer.addToken(Tok))
302         consumeToken();
303     }
304 
305     if (IsParam)
306       PC = parseParamCommandArgs(PC, Retokenizer);
307     else
308       BC = parseBlockCommandArgs(BC, Retokenizer, NumArgs);
309 
310     // Put back tokens we didn't use.
311     SmallVector<Token, 16> TextToks;
312     Token Text;
313     while (Retokenizer.lexText(Text)) {
314       TextToks.push_back(Text);
315     }
316     putBack(TextToks);
317   }
318 
319   BlockContentComment *Block = parseParagraphOrBlockCommand();
320   // Since we have checked for a block command, we should have parsed a
321   // paragraph.
322   if (IsParam)
323     return S.actOnParamCommandFinish(PC, cast<ParagraphComment>(Block));
324   else
325     return S.actOnBlockCommandFinish(BC, cast<ParagraphComment>(Block));
326 }
327 
328 InlineCommandComment *Parser::parseInlineCommand() {
329   assert(Tok.is(tok::command));
330 
331   const Token CommandTok = Tok;
332   consumeToken();
333 
334   TextTokenRetokenizer Retokenizer(Allocator);
335   while (Tok.is(tok::text)) {
336     if (Retokenizer.addToken(Tok))
337       consumeToken();
338   }
339 
340   Token ArgTok;
341   bool ArgTokValid = Retokenizer.lexWord(ArgTok);
342 
343   InlineCommandComment *IC;
344   if (ArgTokValid) {
345     IC = S.actOnInlineCommand(CommandTok.getLocation(),
346                               CommandTok.getEndLocation(),
347                               CommandTok.getCommandName(),
348                               ArgTok.getLocation(),
349                               ArgTok.getEndLocation(),
350                               ArgTok.getText());
351   } else {
352     IC = S.actOnInlineCommand(CommandTok.getLocation(),
353                               CommandTok.getEndLocation(),
354                               CommandTok.getCommandName());
355   }
356 
357   Token Text;
358   while (Retokenizer.lexText(Text))
359     putBack(Text);
360 
361   return IC;
362 }
363 
364 HTMLStartTagComment *Parser::parseHTMLStartTag() {
365   assert(Tok.is(tok::html_start_tag));
366   HTMLStartTagComment *HST =
367       S.actOnHTMLStartTagStart(Tok.getLocation(),
368                                Tok.getHTMLTagStartName());
369   consumeToken();
370 
371   SmallVector<HTMLStartTagComment::Attribute, 2> Attrs;
372   while (true) {
373     switch (Tok.getKind()) {
374     case tok::html_ident: {
375       Token Ident = Tok;
376       consumeToken();
377       if (Tok.isNot(tok::html_equals)) {
378         Attrs.push_back(HTMLStartTagComment::Attribute(Ident.getLocation(),
379                                                        Ident.getHTMLIdent()));
380         continue;
381       }
382       Token Equals = Tok;
383       consumeToken();
384       if (Tok.isNot(tok::html_quoted_string)) {
385         Diag(Tok.getLocation(),
386              diag::warn_doc_html_start_tag_expected_quoted_string)
387           << SourceRange(Equals.getLocation());
388         Attrs.push_back(HTMLStartTagComment::Attribute(Ident.getLocation(),
389                                                        Ident.getHTMLIdent()));
390         while (Tok.is(tok::html_equals) ||
391                Tok.is(tok::html_quoted_string))
392           consumeToken();
393         continue;
394       }
395       Attrs.push_back(HTMLStartTagComment::Attribute(
396                               Ident.getLocation(),
397                               Ident.getHTMLIdent(),
398                               Equals.getLocation(),
399                               SourceRange(Tok.getLocation(),
400                                           Tok.getEndLocation()),
401                               Tok.getHTMLQuotedString()));
402       consumeToken();
403       continue;
404     }
405 
406     case tok::html_greater:
407       HST = S.actOnHTMLStartTagFinish(HST,
408                                       copyArray(llvm::makeArrayRef(Attrs)),
409                                       Tok.getLocation(),
410                                       /* IsSelfClosing = */ false);
411       consumeToken();
412       return HST;
413 
414     case tok::html_slash_greater:
415       HST = S.actOnHTMLStartTagFinish(HST,
416                                       copyArray(llvm::makeArrayRef(Attrs)),
417                                       Tok.getLocation(),
418                                       /* IsSelfClosing = */ true);
419       consumeToken();
420       return HST;
421 
422     case tok::html_equals:
423     case tok::html_quoted_string:
424       Diag(Tok.getLocation(),
425            diag::warn_doc_html_start_tag_expected_ident_or_greater);
426       while (Tok.is(tok::html_equals) ||
427              Tok.is(tok::html_quoted_string))
428         consumeToken();
429       if (Tok.is(tok::html_ident) ||
430           Tok.is(tok::html_greater) ||
431           Tok.is(tok::html_slash_greater))
432         continue;
433 
434       return S.actOnHTMLStartTagFinish(HST,
435                                        copyArray(llvm::makeArrayRef(Attrs)),
436                                        SourceLocation(),
437                                        /* IsSelfClosing = */ false);
438 
439     default:
440       // Not a token from an HTML start tag.  Thus HTML tag prematurely ended.
441       HST = S.actOnHTMLStartTagFinish(HST,
442                                       copyArray(llvm::makeArrayRef(Attrs)),
443                                       SourceLocation(),
444                                       /* IsSelfClosing = */ false);
445       bool StartLineInvalid;
446       const unsigned StartLine = SourceMgr.getPresumedLineNumber(
447                                                   HST->getLocation(),
448                                                   &StartLineInvalid);
449       bool EndLineInvalid;
450       const unsigned EndLine = SourceMgr.getPresumedLineNumber(
451                                                   Tok.getLocation(),
452                                                   &EndLineInvalid);
453       if (StartLineInvalid || EndLineInvalid || StartLine == EndLine)
454         Diag(Tok.getLocation(),
455              diag::warn_doc_html_start_tag_expected_ident_or_greater)
456           << HST->getSourceRange();
457       else {
458         Diag(Tok.getLocation(),
459              diag::warn_doc_html_start_tag_expected_ident_or_greater);
460         Diag(HST->getLocation(), diag::note_doc_html_tag_started_here)
461           << HST->getSourceRange();
462       }
463       return HST;
464     }
465   }
466 }
467 
468 HTMLEndTagComment *Parser::parseHTMLEndTag() {
469   assert(Tok.is(tok::html_end_tag));
470   Token TokEndTag = Tok;
471   consumeToken();
472   SourceLocation Loc;
473   if (Tok.is(tok::html_greater)) {
474     Loc = Tok.getLocation();
475     consumeToken();
476   }
477 
478   return S.actOnHTMLEndTag(TokEndTag.getLocation(),
479                            Loc,
480                            TokEndTag.getHTMLTagEndName());
481 }
482 
483 BlockContentComment *Parser::parseParagraphOrBlockCommand() {
484   SmallVector<InlineContentComment *, 8> Content;
485 
486   while (true) {
487     switch (Tok.getKind()) {
488     case tok::verbatim_block_begin:
489     case tok::verbatim_line_name:
490     case tok::eof:
491       assert(Content.size() != 0);
492       break; // Block content or EOF ahead, finish this parapgaph.
493 
494     case tok::command:
495       if (S.isBlockCommand(Tok.getCommandName())) {
496         if (Content.size() == 0)
497           return parseBlockCommand();
498         break; // Block command ahead, finish this parapgaph.
499       }
500       if (S.isInlineCommand(Tok.getCommandName())) {
501         Content.push_back(parseInlineCommand());
502         continue;
503       }
504 
505       // Not a block command, not an inline command ==> an unknown command.
506       Content.push_back(S.actOnUnknownCommand(Tok.getLocation(),
507                                               Tok.getEndLocation(),
508                                               Tok.getCommandName()));
509       consumeToken();
510       continue;
511 
512     case tok::newline: {
513       consumeToken();
514       if (Tok.is(tok::newline) || Tok.is(tok::eof)) {
515         consumeToken();
516         break; // Two newlines -- end of paragraph.
517       }
518       if (Content.size() > 0)
519         Content.back()->addTrailingNewline();
520       continue;
521     }
522 
523     // Don't deal with HTML tag soup now.
524     case tok::html_start_tag:
525       Content.push_back(parseHTMLStartTag());
526       continue;
527 
528     case tok::html_end_tag:
529       Content.push_back(parseHTMLEndTag());
530       continue;
531 
532     case tok::text:
533       Content.push_back(S.actOnText(Tok.getLocation(),
534                                     Tok.getEndLocation(),
535                                     Tok.getText()));
536       consumeToken();
537       continue;
538 
539     case tok::verbatim_block_line:
540     case tok::verbatim_block_end:
541     case tok::verbatim_line_text:
542     case tok::html_ident:
543     case tok::html_equals:
544     case tok::html_quoted_string:
545     case tok::html_greater:
546     case tok::html_slash_greater:
547       llvm_unreachable("should not see this token");
548     }
549     break;
550   }
551 
552   return S.actOnParagraphComment(copyArray(llvm::makeArrayRef(Content)));
553 }
554 
555 VerbatimBlockComment *Parser::parseVerbatimBlock() {
556   assert(Tok.is(tok::verbatim_block_begin));
557 
558   VerbatimBlockComment *VB =
559       S.actOnVerbatimBlockStart(Tok.getLocation(),
560                                 Tok.getVerbatimBlockName());
561   consumeToken();
562 
563   // Don't create an empty line if verbatim opening command is followed
564   // by a newline.
565   if (Tok.is(tok::newline))
566     consumeToken();
567 
568   SmallVector<VerbatimBlockLineComment *, 8> Lines;
569   while (Tok.is(tok::verbatim_block_line) ||
570          Tok.is(tok::newline)) {
571     VerbatimBlockLineComment *Line;
572     if (Tok.is(tok::verbatim_block_line)) {
573       Line = S.actOnVerbatimBlockLine(Tok.getLocation(),
574                                       Tok.getVerbatimBlockText());
575       consumeToken();
576       if (Tok.is(tok::newline)) {
577         consumeToken();
578       }
579     } else {
580       // Empty line, just a tok::newline.
581       Line = S.actOnVerbatimBlockLine(Tok.getLocation(), "");
582       consumeToken();
583     }
584     Lines.push_back(Line);
585   }
586 
587   if (Tok.is(tok::verbatim_block_end)) {
588     VB = S.actOnVerbatimBlockFinish(VB, Tok.getLocation(),
589                                     Tok.getVerbatimBlockName(),
590                                     copyArray(llvm::makeArrayRef(Lines)));
591     consumeToken();
592   } else {
593     // Unterminated \\verbatim block
594     VB = S.actOnVerbatimBlockFinish(VB, SourceLocation(), "",
595                                     copyArray(llvm::makeArrayRef(Lines)));
596   }
597 
598   return VB;
599 }
600 
601 VerbatimLineComment *Parser::parseVerbatimLine() {
602   assert(Tok.is(tok::verbatim_line_name));
603 
604   Token NameTok = Tok;
605   consumeToken();
606 
607   SourceLocation TextBegin;
608   StringRef Text;
609   // Next token might not be a tok::verbatim_line_text if verbatim line
610   // starting command comes just before a newline or comment end.
611   if (Tok.is(tok::verbatim_line_text)) {
612     TextBegin = Tok.getLocation();
613     Text = Tok.getVerbatimLineText();
614   } else {
615     TextBegin = NameTok.getEndLocation();
616     Text = "";
617   }
618 
619   VerbatimLineComment *VL = S.actOnVerbatimLine(NameTok.getLocation(),
620                                                 NameTok.getVerbatimLineName(),
621                                                 TextBegin,
622                                                 Text);
623   consumeToken();
624   return VL;
625 }
626 
627 BlockContentComment *Parser::parseBlockContent() {
628   switch (Tok.getKind()) {
629   case tok::text:
630   case tok::command:
631   case tok::html_start_tag:
632   case tok::html_end_tag:
633     return parseParagraphOrBlockCommand();
634 
635   case tok::verbatim_block_begin:
636     return parseVerbatimBlock();
637 
638   case tok::verbatim_line_name:
639     return parseVerbatimLine();
640 
641   case tok::eof:
642   case tok::newline:
643   case tok::verbatim_block_line:
644   case tok::verbatim_block_end:
645   case tok::verbatim_line_text:
646   case tok::html_ident:
647   case tok::html_equals:
648   case tok::html_quoted_string:
649   case tok::html_greater:
650   case tok::html_slash_greater:
651     llvm_unreachable("should not see this token");
652   }
653   llvm_unreachable("bogus token kind");
654 }
655 
656 FullComment *Parser::parseFullComment() {
657   // Skip newlines at the beginning of the comment.
658   while (Tok.is(tok::newline))
659     consumeToken();
660 
661   SmallVector<BlockContentComment *, 8> Blocks;
662   while (Tok.isNot(tok::eof)) {
663     Blocks.push_back(parseBlockContent());
664 
665     // Skip extra newlines after paragraph end.
666     while (Tok.is(tok::newline))
667       consumeToken();
668   }
669   return S.actOnFullComment(copyArray(llvm::makeArrayRef(Blocks)));
670 }
671 
672 } // end namespace comments
673 } // end namespace clang
674