xref: /llvm-project/clang/lib/AST/CommentParser.cpp (revision 0a36302ae032d924d876a8d686547f2118c26901)
1 //===--- CommentParser.cpp - Doxygen comment parser -----------------------===//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //===----------------------------------------------------------------------===//
9 
10 #include "clang/AST/CommentParser.h"
11 #include "clang/AST/CommentSema.h"
12 #include "clang/AST/CommentDiagnostic.h"
13 #include "clang/Basic/SourceManager.h"
14 #include "llvm/Support/ErrorHandling.h"
15 
16 namespace clang {
17 namespace comments {
18 
19 /// Re-lexes a sequence of tok::text tokens.
20 class TextTokenRetokenizer {
21   llvm::BumpPtrAllocator &Allocator;
22   Parser &P;
23   SmallVector<Token, 16> Toks;
24 
25   struct Position {
26     unsigned CurToken;
27     const char *BufferStart;
28     const char *BufferEnd;
29     const char *BufferPtr;
30     SourceLocation BufferStartLoc;
31   };
32 
33   /// Current position in Toks.
34   Position Pos;
35 
36   bool isEnd() const {
37     return Pos.CurToken >= Toks.size();
38   }
39 
40   /// Sets up the buffer pointers to point to current token.
41   void setupBuffer() {
42     assert(!isEnd());
43     const Token &Tok = Toks[Pos.CurToken];
44 
45     Pos.BufferStart = Tok.getText().begin();
46     Pos.BufferEnd = Tok.getText().end();
47     Pos.BufferPtr = Pos.BufferStart;
48     Pos.BufferStartLoc = Tok.getLocation();
49   }
50 
51   SourceLocation getSourceLocation() const {
52     const unsigned CharNo = Pos.BufferPtr - Pos.BufferStart;
53     return Pos.BufferStartLoc.getLocWithOffset(CharNo);
54   }
55 
56   char peek() const {
57     assert(!isEnd());
58     assert(Pos.BufferPtr != Pos.BufferEnd);
59     return *Pos.BufferPtr;
60   }
61 
62   void consumeChar() {
63     assert(!isEnd());
64     assert(Pos.BufferPtr != Pos.BufferEnd);
65     Pos.BufferPtr++;
66     if (Pos.BufferPtr == Pos.BufferEnd) {
67       Pos.CurToken++;
68       if (isEnd() && addToken()) {
69         assert(!isEnd());
70         setupBuffer();
71       }
72     }
73   }
74 
75   /// Add a token.
76   /// Returns true on success, false if there are no interesting tokens to
77   /// fetch from lexer.
78   bool addToken() {
79     if (P.Tok.isNot(tok::text))
80       return false;
81 
82     Toks.push_back(P.Tok);
83     P.consumeToken();
84     if (Toks.size() == 1)
85       setupBuffer();
86     return true;
87   }
88 
89   static bool isWhitespace(char C) {
90     return C == ' ' || C == '\n' || C == '\r' ||
91            C == '\t' || C == '\f' || C == '\v';
92   }
93 
94   void consumeWhitespace() {
95     while (!isEnd()) {
96       if (isWhitespace(peek()))
97         consumeChar();
98       else
99         break;
100     }
101   }
102 
103   void formTokenWithChars(Token &Result,
104                           SourceLocation Loc,
105                           const char *TokBegin,
106                           unsigned TokLength,
107                           StringRef Text) {
108     Result.setLocation(Loc);
109     Result.setKind(tok::text);
110     Result.setLength(TokLength);
111 #ifndef NDEBUG
112     Result.TextPtr1 = "<UNSET>";
113     Result.TextLen1 = 7;
114 #endif
115     Result.setText(Text);
116   }
117 
118 public:
119   TextTokenRetokenizer(llvm::BumpPtrAllocator &Allocator, Parser &P):
120       Allocator(Allocator), P(P) {
121     Pos.CurToken = 0;
122     addToken();
123   }
124 
125   /// Extract a word -- sequence of non-whitespace characters.
126   bool lexWord(Token &Tok) {
127     if (isEnd())
128       return false;
129 
130     Position SavedPos = Pos;
131 
132     consumeWhitespace();
133     SmallString<32> WordText;
134     const char *WordBegin = Pos.BufferPtr;
135     SourceLocation Loc = getSourceLocation();
136     while (!isEnd()) {
137       const char C = peek();
138       if (!isWhitespace(C)) {
139         WordText.push_back(C);
140         consumeChar();
141       } else
142         break;
143     }
144     const unsigned Length = WordText.size();
145     if (Length == 0) {
146       Pos = SavedPos;
147       return false;
148     }
149 
150     char *TextPtr = Allocator.Allocate<char>(Length + 1);
151 
152     memcpy(TextPtr, WordText.c_str(), Length + 1);
153     StringRef Text = StringRef(TextPtr, Length);
154 
155     formTokenWithChars(Tok, Loc, WordBegin,
156                        Pos.BufferPtr - WordBegin, Text);
157     return true;
158   }
159 
160   bool lexDelimitedSeq(Token &Tok, char OpenDelim, char CloseDelim) {
161     if (isEnd())
162       return false;
163 
164     Position SavedPos = Pos;
165 
166     consumeWhitespace();
167     SmallString<32> WordText;
168     const char *WordBegin = Pos.BufferPtr;
169     SourceLocation Loc = getSourceLocation();
170     bool Error = false;
171     if (!isEnd()) {
172       const char C = peek();
173       if (C == OpenDelim) {
174         WordText.push_back(C);
175         consumeChar();
176       } else
177         Error = true;
178     }
179     char C = '\0';
180     while (!Error && !isEnd()) {
181       C = peek();
182       WordText.push_back(C);
183       consumeChar();
184       if (C == CloseDelim)
185         break;
186     }
187     if (!Error && C != CloseDelim)
188       Error = true;
189 
190     if (Error) {
191       Pos = SavedPos;
192       return false;
193     }
194 
195     const unsigned Length = WordText.size();
196     char *TextPtr = Allocator.Allocate<char>(Length + 1);
197 
198     memcpy(TextPtr, WordText.c_str(), Length + 1);
199     StringRef Text = StringRef(TextPtr, Length);
200 
201     formTokenWithChars(Tok, Loc, WordBegin,
202                        Pos.BufferPtr - WordBegin, Text);
203     return true;
204   }
205 
206   /// Put back tokens that we didn't consume.
207   void putBackLeftoverTokens() {
208     if (isEnd())
209       return;
210 
211     bool HavePartialTok = false;
212     Token PartialTok;
213     if (Pos.BufferPtr != Pos.BufferStart) {
214       formTokenWithChars(PartialTok, getSourceLocation(),
215                          Pos.BufferPtr, Pos.BufferEnd - Pos.BufferPtr,
216                          StringRef(Pos.BufferPtr,
217                                    Pos.BufferEnd - Pos.BufferPtr));
218       HavePartialTok = true;
219       Pos.CurToken++;
220     }
221 
222     P.putBack(llvm::makeArrayRef(Toks.begin() + Pos.CurToken, Toks.end()));
223     Pos.CurToken = Toks.size();
224 
225     if (HavePartialTok)
226       P.putBack(PartialTok);
227   }
228 };
229 
230 Parser::Parser(Lexer &L, Sema &S, llvm::BumpPtrAllocator &Allocator,
231                const SourceManager &SourceMgr, DiagnosticsEngine &Diags):
232     L(L), S(S), Allocator(Allocator), SourceMgr(SourceMgr), Diags(Diags) {
233   consumeToken();
234 }
235 
236 ParamCommandComment *Parser::parseParamCommandArgs(
237     ParamCommandComment *PC,
238     TextTokenRetokenizer &Retokenizer) {
239   Token Arg;
240   // Check if argument looks like direction specification: [dir]
241   // e.g., [in], [out], [in,out]
242   if (Retokenizer.lexDelimitedSeq(Arg, '[', ']'))
243     PC = S.actOnParamCommandDirectionArg(PC,
244                                          Arg.getLocation(),
245                                          Arg.getEndLocation(),
246                                          Arg.getText());
247 
248   if (Retokenizer.lexWord(Arg))
249     PC = S.actOnParamCommandParamNameArg(PC,
250                                          Arg.getLocation(),
251                                          Arg.getEndLocation(),
252                                          Arg.getText());
253 
254   return PC;
255 }
256 
257 BlockCommandComment *Parser::parseBlockCommandArgs(
258     BlockCommandComment *BC,
259     TextTokenRetokenizer &Retokenizer,
260     unsigned NumArgs) {
261   typedef BlockCommandComment::Argument Argument;
262   Argument *Args =
263       new (Allocator.Allocate<Argument>(NumArgs)) Argument[NumArgs];
264   unsigned ParsedArgs = 0;
265   Token Arg;
266   while (ParsedArgs < NumArgs && Retokenizer.lexWord(Arg)) {
267     Args[ParsedArgs] = Argument(SourceRange(Arg.getLocation(),
268                                             Arg.getEndLocation()),
269                                 Arg.getText());
270     ParsedArgs++;
271   }
272 
273   return S.actOnBlockCommandArgs(BC, llvm::makeArrayRef(Args, ParsedArgs));
274 }
275 
276 BlockCommandComment *Parser::parseBlockCommand() {
277   assert(Tok.is(tok::command));
278 
279   ParamCommandComment *PC;
280   BlockCommandComment *BC;
281   bool IsParam = false;
282   unsigned NumArgs = 0;
283   if (S.isParamCommand(Tok.getCommandName())) {
284     IsParam = true;
285     PC = S.actOnParamCommandStart(Tok.getLocation(),
286                                   Tok.getEndLocation(),
287                                   Tok.getCommandName());
288   } else {
289     NumArgs = S.getBlockCommandNumArgs(Tok.getCommandName());
290     BC = S.actOnBlockCommandStart(Tok.getLocation(),
291                                   Tok.getEndLocation(),
292                                   Tok.getCommandName());
293   }
294   consumeToken();
295 
296   if (Tok.is(tok::command) && S.isBlockCommand(Tok.getCommandName())) {
297     // Block command ahead.  We can't nest block commands, so pretend that this
298     // command has an empty argument.
299     ParagraphComment *PC = S.actOnParagraphComment(
300                                 ArrayRef<InlineContentComment *>());
301     return S.actOnBlockCommandFinish(BC, PC);
302   }
303 
304   if (IsParam || NumArgs > 0) {
305     // In order to parse command arguments we need to retokenize a few
306     // following text tokens.
307     TextTokenRetokenizer Retokenizer(Allocator, *this);
308 
309     if (IsParam)
310       PC = parseParamCommandArgs(PC, Retokenizer);
311     else
312       BC = parseBlockCommandArgs(BC, Retokenizer, NumArgs);
313 
314     Retokenizer.putBackLeftoverTokens();
315   }
316 
317   BlockContentComment *Block = parseParagraphOrBlockCommand();
318   // Since we have checked for a block command, we should have parsed a
319   // paragraph.
320   if (IsParam)
321     return S.actOnParamCommandFinish(PC, cast<ParagraphComment>(Block));
322   else
323     return S.actOnBlockCommandFinish(BC, cast<ParagraphComment>(Block));
324 }
325 
326 InlineCommandComment *Parser::parseInlineCommand() {
327   assert(Tok.is(tok::command));
328 
329   const Token CommandTok = Tok;
330   consumeToken();
331 
332   TextTokenRetokenizer Retokenizer(Allocator, *this);
333 
334   Token ArgTok;
335   bool ArgTokValid = Retokenizer.lexWord(ArgTok);
336 
337   InlineCommandComment *IC;
338   if (ArgTokValid) {
339     IC = S.actOnInlineCommand(CommandTok.getLocation(),
340                               CommandTok.getEndLocation(),
341                               CommandTok.getCommandName(),
342                               ArgTok.getLocation(),
343                               ArgTok.getEndLocation(),
344                               ArgTok.getText());
345   } else {
346     IC = S.actOnInlineCommand(CommandTok.getLocation(),
347                               CommandTok.getEndLocation(),
348                               CommandTok.getCommandName());
349   }
350 
351   Retokenizer.putBackLeftoverTokens();
352 
353   return IC;
354 }
355 
356 HTMLStartTagComment *Parser::parseHTMLStartTag() {
357   assert(Tok.is(tok::html_start_tag));
358   HTMLStartTagComment *HST =
359       S.actOnHTMLStartTagStart(Tok.getLocation(),
360                                Tok.getHTMLTagStartName());
361   consumeToken();
362 
363   SmallVector<HTMLStartTagComment::Attribute, 2> Attrs;
364   while (true) {
365     switch (Tok.getKind()) {
366     case tok::html_ident: {
367       Token Ident = Tok;
368       consumeToken();
369       if (Tok.isNot(tok::html_equals)) {
370         Attrs.push_back(HTMLStartTagComment::Attribute(Ident.getLocation(),
371                                                        Ident.getHTMLIdent()));
372         continue;
373       }
374       Token Equals = Tok;
375       consumeToken();
376       if (Tok.isNot(tok::html_quoted_string)) {
377         Diag(Tok.getLocation(),
378              diag::warn_doc_html_start_tag_expected_quoted_string)
379           << SourceRange(Equals.getLocation());
380         Attrs.push_back(HTMLStartTagComment::Attribute(Ident.getLocation(),
381                                                        Ident.getHTMLIdent()));
382         while (Tok.is(tok::html_equals) ||
383                Tok.is(tok::html_quoted_string))
384           consumeToken();
385         continue;
386       }
387       Attrs.push_back(HTMLStartTagComment::Attribute(
388                               Ident.getLocation(),
389                               Ident.getHTMLIdent(),
390                               Equals.getLocation(),
391                               SourceRange(Tok.getLocation(),
392                                           Tok.getEndLocation()),
393                               Tok.getHTMLQuotedString()));
394       consumeToken();
395       continue;
396     }
397 
398     case tok::html_greater:
399       HST = S.actOnHTMLStartTagFinish(HST,
400                                       copyArray(llvm::makeArrayRef(Attrs)),
401                                       Tok.getLocation(),
402                                       /* IsSelfClosing = */ false);
403       consumeToken();
404       return HST;
405 
406     case tok::html_slash_greater:
407       HST = S.actOnHTMLStartTagFinish(HST,
408                                       copyArray(llvm::makeArrayRef(Attrs)),
409                                       Tok.getLocation(),
410                                       /* IsSelfClosing = */ true);
411       consumeToken();
412       return HST;
413 
414     case tok::html_equals:
415     case tok::html_quoted_string:
416       Diag(Tok.getLocation(),
417            diag::warn_doc_html_start_tag_expected_ident_or_greater);
418       while (Tok.is(tok::html_equals) ||
419              Tok.is(tok::html_quoted_string))
420         consumeToken();
421       if (Tok.is(tok::html_ident) ||
422           Tok.is(tok::html_greater) ||
423           Tok.is(tok::html_slash_greater))
424         continue;
425 
426       return S.actOnHTMLStartTagFinish(HST,
427                                        copyArray(llvm::makeArrayRef(Attrs)),
428                                        SourceLocation(),
429                                        /* IsSelfClosing = */ false);
430 
431     default:
432       // Not a token from an HTML start tag.  Thus HTML tag prematurely ended.
433       HST = S.actOnHTMLStartTagFinish(HST,
434                                       copyArray(llvm::makeArrayRef(Attrs)),
435                                       SourceLocation(),
436                                       /* IsSelfClosing = */ false);
437       bool StartLineInvalid;
438       const unsigned StartLine = SourceMgr.getPresumedLineNumber(
439                                                   HST->getLocation(),
440                                                   &StartLineInvalid);
441       bool EndLineInvalid;
442       const unsigned EndLine = SourceMgr.getPresumedLineNumber(
443                                                   Tok.getLocation(),
444                                                   &EndLineInvalid);
445       if (StartLineInvalid || EndLineInvalid || StartLine == EndLine)
446         Diag(Tok.getLocation(),
447              diag::warn_doc_html_start_tag_expected_ident_or_greater)
448           << HST->getSourceRange();
449       else {
450         Diag(Tok.getLocation(),
451              diag::warn_doc_html_start_tag_expected_ident_or_greater);
452         Diag(HST->getLocation(), diag::note_doc_html_tag_started_here)
453           << HST->getSourceRange();
454       }
455       return HST;
456     }
457   }
458 }
459 
460 HTMLEndTagComment *Parser::parseHTMLEndTag() {
461   assert(Tok.is(tok::html_end_tag));
462   Token TokEndTag = Tok;
463   consumeToken();
464   SourceLocation Loc;
465   if (Tok.is(tok::html_greater)) {
466     Loc = Tok.getLocation();
467     consumeToken();
468   }
469 
470   return S.actOnHTMLEndTag(TokEndTag.getLocation(),
471                            Loc,
472                            TokEndTag.getHTMLTagEndName());
473 }
474 
475 BlockContentComment *Parser::parseParagraphOrBlockCommand() {
476   SmallVector<InlineContentComment *, 8> Content;
477 
478   while (true) {
479     switch (Tok.getKind()) {
480     case tok::verbatim_block_begin:
481     case tok::verbatim_line_name:
482     case tok::eof:
483       assert(Content.size() != 0);
484       break; // Block content or EOF ahead, finish this parapgaph.
485 
486     case tok::command:
487       if (S.isBlockCommand(Tok.getCommandName())) {
488         if (Content.size() == 0)
489           return parseBlockCommand();
490         break; // Block command ahead, finish this parapgaph.
491       }
492       if (S.isInlineCommand(Tok.getCommandName())) {
493         Content.push_back(parseInlineCommand());
494         continue;
495       }
496 
497       // Not a block command, not an inline command ==> an unknown command.
498       Content.push_back(S.actOnUnknownCommand(Tok.getLocation(),
499                                               Tok.getEndLocation(),
500                                               Tok.getCommandName()));
501       consumeToken();
502       continue;
503 
504     case tok::newline: {
505       consumeToken();
506       if (Tok.is(tok::newline) || Tok.is(tok::eof)) {
507         consumeToken();
508         break; // Two newlines -- end of paragraph.
509       }
510       if (Content.size() > 0)
511         Content.back()->addTrailingNewline();
512       continue;
513     }
514 
515     // Don't deal with HTML tag soup now.
516     case tok::html_start_tag:
517       Content.push_back(parseHTMLStartTag());
518       continue;
519 
520     case tok::html_end_tag:
521       Content.push_back(parseHTMLEndTag());
522       continue;
523 
524     case tok::text:
525       Content.push_back(S.actOnText(Tok.getLocation(),
526                                     Tok.getEndLocation(),
527                                     Tok.getText()));
528       consumeToken();
529       continue;
530 
531     case tok::verbatim_block_line:
532     case tok::verbatim_block_end:
533     case tok::verbatim_line_text:
534     case tok::html_ident:
535     case tok::html_equals:
536     case tok::html_quoted_string:
537     case tok::html_greater:
538     case tok::html_slash_greater:
539       llvm_unreachable("should not see this token");
540     }
541     break;
542   }
543 
544   return S.actOnParagraphComment(copyArray(llvm::makeArrayRef(Content)));
545 }
546 
547 VerbatimBlockComment *Parser::parseVerbatimBlock() {
548   assert(Tok.is(tok::verbatim_block_begin));
549 
550   VerbatimBlockComment *VB =
551       S.actOnVerbatimBlockStart(Tok.getLocation(),
552                                 Tok.getVerbatimBlockName());
553   consumeToken();
554 
555   // Don't create an empty line if verbatim opening command is followed
556   // by a newline.
557   if (Tok.is(tok::newline))
558     consumeToken();
559 
560   SmallVector<VerbatimBlockLineComment *, 8> Lines;
561   while (Tok.is(tok::verbatim_block_line) ||
562          Tok.is(tok::newline)) {
563     VerbatimBlockLineComment *Line;
564     if (Tok.is(tok::verbatim_block_line)) {
565       Line = S.actOnVerbatimBlockLine(Tok.getLocation(),
566                                       Tok.getVerbatimBlockText());
567       consumeToken();
568       if (Tok.is(tok::newline)) {
569         consumeToken();
570       }
571     } else {
572       // Empty line, just a tok::newline.
573       Line = S.actOnVerbatimBlockLine(Tok.getLocation(), "");
574       consumeToken();
575     }
576     Lines.push_back(Line);
577   }
578 
579   if (Tok.is(tok::verbatim_block_end)) {
580     VB = S.actOnVerbatimBlockFinish(VB, Tok.getLocation(),
581                                     Tok.getVerbatimBlockName(),
582                                     copyArray(llvm::makeArrayRef(Lines)));
583     consumeToken();
584   } else {
585     // Unterminated \\verbatim block
586     VB = S.actOnVerbatimBlockFinish(VB, SourceLocation(), "",
587                                     copyArray(llvm::makeArrayRef(Lines)));
588   }
589 
590   return VB;
591 }
592 
593 VerbatimLineComment *Parser::parseVerbatimLine() {
594   assert(Tok.is(tok::verbatim_line_name));
595 
596   Token NameTok = Tok;
597   consumeToken();
598 
599   SourceLocation TextBegin;
600   StringRef Text;
601   // Next token might not be a tok::verbatim_line_text if verbatim line
602   // starting command comes just before a newline or comment end.
603   if (Tok.is(tok::verbatim_line_text)) {
604     TextBegin = Tok.getLocation();
605     Text = Tok.getVerbatimLineText();
606   } else {
607     TextBegin = NameTok.getEndLocation();
608     Text = "";
609   }
610 
611   VerbatimLineComment *VL = S.actOnVerbatimLine(NameTok.getLocation(),
612                                                 NameTok.getVerbatimLineName(),
613                                                 TextBegin,
614                                                 Text);
615   consumeToken();
616   return VL;
617 }
618 
619 BlockContentComment *Parser::parseBlockContent() {
620   switch (Tok.getKind()) {
621   case tok::text:
622   case tok::command:
623   case tok::html_start_tag:
624   case tok::html_end_tag:
625     return parseParagraphOrBlockCommand();
626 
627   case tok::verbatim_block_begin:
628     return parseVerbatimBlock();
629 
630   case tok::verbatim_line_name:
631     return parseVerbatimLine();
632 
633   case tok::eof:
634   case tok::newline:
635   case tok::verbatim_block_line:
636   case tok::verbatim_block_end:
637   case tok::verbatim_line_text:
638   case tok::html_ident:
639   case tok::html_equals:
640   case tok::html_quoted_string:
641   case tok::html_greater:
642   case tok::html_slash_greater:
643     llvm_unreachable("should not see this token");
644   }
645   llvm_unreachable("bogus token kind");
646 }
647 
648 FullComment *Parser::parseFullComment() {
649   // Skip newlines at the beginning of the comment.
650   while (Tok.is(tok::newline))
651     consumeToken();
652 
653   SmallVector<BlockContentComment *, 8> Blocks;
654   while (Tok.isNot(tok::eof)) {
655     Blocks.push_back(parseBlockContent());
656 
657     // Skip extra newlines after paragraph end.
658     while (Tok.is(tok::newline))
659       consumeToken();
660   }
661   return S.actOnFullComment(copyArray(llvm::makeArrayRef(Blocks)));
662 }
663 
664 } // end namespace comments
665 } // end namespace clang
666