1e5dd7070Spatrick //===--- CommentLexer.cpp -------------------------------------------------===//
2e5dd7070Spatrick //
3e5dd7070Spatrick // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4e5dd7070Spatrick // See https://llvm.org/LICENSE.txt for license information.
5e5dd7070Spatrick // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6e5dd7070Spatrick //
7e5dd7070Spatrick //===----------------------------------------------------------------------===//
8e5dd7070Spatrick
9e5dd7070Spatrick #include "clang/AST/CommentLexer.h"
10e5dd7070Spatrick #include "clang/AST/CommentCommandTraits.h"
11e5dd7070Spatrick #include "clang/AST/CommentDiagnostic.h"
12e5dd7070Spatrick #include "clang/Basic/CharInfo.h"
13e5dd7070Spatrick #include "llvm/ADT/StringExtras.h"
14e5dd7070Spatrick #include "llvm/ADT/StringSwitch.h"
15e5dd7070Spatrick #include "llvm/Support/ConvertUTF.h"
16e5dd7070Spatrick #include "llvm/Support/ErrorHandling.h"
17e5dd7070Spatrick
18e5dd7070Spatrick namespace clang {
19e5dd7070Spatrick namespace comments {
20e5dd7070Spatrick
dump(const Lexer & L,const SourceManager & SM) const21e5dd7070Spatrick void Token::dump(const Lexer &L, const SourceManager &SM) const {
22e5dd7070Spatrick llvm::errs() << "comments::Token Kind=" << Kind << " ";
23e5dd7070Spatrick Loc.print(llvm::errs(), SM);
24e5dd7070Spatrick llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
25e5dd7070Spatrick }
26e5dd7070Spatrick
isHTMLNamedCharacterReferenceCharacter(char C)27e5dd7070Spatrick static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
28e5dd7070Spatrick return isLetter(C);
29e5dd7070Spatrick }
30e5dd7070Spatrick
isHTMLDecimalCharacterReferenceCharacter(char C)31e5dd7070Spatrick static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
32e5dd7070Spatrick return isDigit(C);
33e5dd7070Spatrick }
34e5dd7070Spatrick
isHTMLHexCharacterReferenceCharacter(char C)35e5dd7070Spatrick static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
36e5dd7070Spatrick return isHexDigit(C);
37e5dd7070Spatrick }
38e5dd7070Spatrick
convertCodePointToUTF8(llvm::BumpPtrAllocator & Allocator,unsigned CodePoint)39e5dd7070Spatrick static inline StringRef convertCodePointToUTF8(
40e5dd7070Spatrick llvm::BumpPtrAllocator &Allocator,
41e5dd7070Spatrick unsigned CodePoint) {
42e5dd7070Spatrick char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
43e5dd7070Spatrick char *ResolvedPtr = Resolved;
44e5dd7070Spatrick if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
45e5dd7070Spatrick return StringRef(Resolved, ResolvedPtr - Resolved);
46e5dd7070Spatrick else
47e5dd7070Spatrick return StringRef();
48e5dd7070Spatrick }
49e5dd7070Spatrick
50e5dd7070Spatrick namespace {
51e5dd7070Spatrick
52e5dd7070Spatrick #include "clang/AST/CommentHTMLTags.inc"
53e5dd7070Spatrick #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
54e5dd7070Spatrick
55e5dd7070Spatrick } // end anonymous namespace
56e5dd7070Spatrick
resolveHTMLNamedCharacterReference(StringRef Name) const57e5dd7070Spatrick StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
58e5dd7070Spatrick // Fast path, first check a few most widely used named character references.
59e5dd7070Spatrick return llvm::StringSwitch<StringRef>(Name)
60e5dd7070Spatrick .Case("amp", "&")
61e5dd7070Spatrick .Case("lt", "<")
62e5dd7070Spatrick .Case("gt", ">")
63e5dd7070Spatrick .Case("quot", "\"")
64e5dd7070Spatrick .Case("apos", "\'")
65e5dd7070Spatrick // Slow path.
66e5dd7070Spatrick .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
67e5dd7070Spatrick }
68e5dd7070Spatrick
resolveHTMLDecimalCharacterReference(StringRef Name) const69e5dd7070Spatrick StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
70e5dd7070Spatrick unsigned CodePoint = 0;
71e5dd7070Spatrick for (unsigned i = 0, e = Name.size(); i != e; ++i) {
72e5dd7070Spatrick assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
73e5dd7070Spatrick CodePoint *= 10;
74e5dd7070Spatrick CodePoint += Name[i] - '0';
75e5dd7070Spatrick }
76e5dd7070Spatrick return convertCodePointToUTF8(Allocator, CodePoint);
77e5dd7070Spatrick }
78e5dd7070Spatrick
resolveHTMLHexCharacterReference(StringRef Name) const79e5dd7070Spatrick StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
80e5dd7070Spatrick unsigned CodePoint = 0;
81e5dd7070Spatrick for (unsigned i = 0, e = Name.size(); i != e; ++i) {
82e5dd7070Spatrick CodePoint *= 16;
83e5dd7070Spatrick const char C = Name[i];
84e5dd7070Spatrick assert(isHTMLHexCharacterReferenceCharacter(C));
85e5dd7070Spatrick CodePoint += llvm::hexDigitValue(C);
86e5dd7070Spatrick }
87e5dd7070Spatrick return convertCodePointToUTF8(Allocator, CodePoint);
88e5dd7070Spatrick }
89e5dd7070Spatrick
skipLineStartingDecorations()90e5dd7070Spatrick void Lexer::skipLineStartingDecorations() {
91e5dd7070Spatrick // This function should be called only for C comments
92e5dd7070Spatrick assert(CommentState == LCS_InsideCComment);
93e5dd7070Spatrick
94e5dd7070Spatrick if (BufferPtr == CommentEnd)
95e5dd7070Spatrick return;
96e5dd7070Spatrick
97e5dd7070Spatrick const char *NewBufferPtr = BufferPtr;
98*12c85518Srobert while (isHorizontalWhitespace(*NewBufferPtr))
99*12c85518Srobert if (++NewBufferPtr == CommentEnd)
100e5dd7070Spatrick return;
101*12c85518Srobert if (*NewBufferPtr == '*')
102e5dd7070Spatrick BufferPtr = NewBufferPtr + 1;
103e5dd7070Spatrick }
104e5dd7070Spatrick
105e5dd7070Spatrick namespace {
106e5dd7070Spatrick /// Returns pointer to the first newline character in the string.
findNewline(const char * BufferPtr,const char * BufferEnd)107e5dd7070Spatrick const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
108e5dd7070Spatrick for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
109e5dd7070Spatrick if (isVerticalWhitespace(*BufferPtr))
110e5dd7070Spatrick return BufferPtr;
111e5dd7070Spatrick }
112e5dd7070Spatrick return BufferEnd;
113e5dd7070Spatrick }
114e5dd7070Spatrick
/// Steps over one newline sequence ("\n", "\r" or "\r\n") starting at
/// \p BufferPtr and returns the position after it.
///
/// Returns \p BufferPtr unchanged when it already equals \p BufferEnd.
/// Otherwise the caller must point at '\n' or '\r' (asserted).
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  ++BufferPtr;
  if (First == '\n')
    return BufferPtr;

  assert(First == '\r');
  // A '\n' directly after '\r' belongs to the same newline.
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
129e5dd7070Spatrick
skipNamedCharacterReference(const char * BufferPtr,const char * BufferEnd)130e5dd7070Spatrick const char *skipNamedCharacterReference(const char *BufferPtr,
131e5dd7070Spatrick const char *BufferEnd) {
132e5dd7070Spatrick for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
133e5dd7070Spatrick if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
134e5dd7070Spatrick return BufferPtr;
135e5dd7070Spatrick }
136e5dd7070Spatrick return BufferEnd;
137e5dd7070Spatrick }
138e5dd7070Spatrick
skipDecimalCharacterReference(const char * BufferPtr,const char * BufferEnd)139e5dd7070Spatrick const char *skipDecimalCharacterReference(const char *BufferPtr,
140e5dd7070Spatrick const char *BufferEnd) {
141e5dd7070Spatrick for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
142e5dd7070Spatrick if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
143e5dd7070Spatrick return BufferPtr;
144e5dd7070Spatrick }
145e5dd7070Spatrick return BufferEnd;
146e5dd7070Spatrick }
147e5dd7070Spatrick
skipHexCharacterReference(const char * BufferPtr,const char * BufferEnd)148e5dd7070Spatrick const char *skipHexCharacterReference(const char *BufferPtr,
149e5dd7070Spatrick const char *BufferEnd) {
150e5dd7070Spatrick for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
151e5dd7070Spatrick if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
152e5dd7070Spatrick return BufferPtr;
153e5dd7070Spatrick }
154e5dd7070Spatrick return BufferEnd;
155e5dd7070Spatrick }
156e5dd7070Spatrick
isHTMLIdentifierStartingCharacter(char C)157e5dd7070Spatrick bool isHTMLIdentifierStartingCharacter(char C) {
158e5dd7070Spatrick return isLetter(C);
159e5dd7070Spatrick }
160e5dd7070Spatrick
isHTMLIdentifierCharacter(char C)161e5dd7070Spatrick bool isHTMLIdentifierCharacter(char C) {
162e5dd7070Spatrick return isAlphanumeric(C);
163e5dd7070Spatrick }
164e5dd7070Spatrick
skipHTMLIdentifier(const char * BufferPtr,const char * BufferEnd)165e5dd7070Spatrick const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
166e5dd7070Spatrick for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
167e5dd7070Spatrick if (!isHTMLIdentifierCharacter(*BufferPtr))
168e5dd7070Spatrick return BufferPtr;
169e5dd7070Spatrick }
170e5dd7070Spatrick return BufferEnd;
171e5dd7070Spatrick }
172e5dd7070Spatrick
/// Scans an HTML string delimited by single or double quotes.
///
/// \p BufferPtr must point at the opening quote (asserted).  A quote that is
/// immediately preceded by a backslash does not terminate the string.
///
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated before the end of the buffer.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Delimiter = *BufferPtr;
  assert(Delimiter == '\"' || Delimiter == '\'');

  const char *Cursor = BufferPtr + 1;
  while (Cursor != BufferEnd) {
    if (*Cursor == Delimiter && Cursor[-1] != '\\')
      return Cursor;
    ++Cursor;
  }
  return BufferEnd;
}
190e5dd7070Spatrick
skipWhitespace(const char * BufferPtr,const char * BufferEnd)191e5dd7070Spatrick const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
192e5dd7070Spatrick for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
193e5dd7070Spatrick if (!isWhitespace(*BufferPtr))
194e5dd7070Spatrick return BufferPtr;
195e5dd7070Spatrick }
196e5dd7070Spatrick return BufferEnd;
197e5dd7070Spatrick }
198e5dd7070Spatrick
isWhitespace(const char * BufferPtr,const char * BufferEnd)199e5dd7070Spatrick bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
200e5dd7070Spatrick return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
201e5dd7070Spatrick }
202e5dd7070Spatrick
isCommandNameStartCharacter(char C)203e5dd7070Spatrick bool isCommandNameStartCharacter(char C) {
204e5dd7070Spatrick return isLetter(C);
205e5dd7070Spatrick }
206e5dd7070Spatrick
isCommandNameCharacter(char C)207e5dd7070Spatrick bool isCommandNameCharacter(char C) {
208e5dd7070Spatrick return isAlphanumeric(C);
209e5dd7070Spatrick }
210e5dd7070Spatrick
skipCommandName(const char * BufferPtr,const char * BufferEnd)211e5dd7070Spatrick const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
212e5dd7070Spatrick for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
213e5dd7070Spatrick if (!isCommandNameCharacter(*BufferPtr))
214e5dd7070Spatrick return BufferPtr;
215e5dd7070Spatrick }
216e5dd7070Spatrick return BufferEnd;
217e5dd7070Spatrick }
218e5dd7070Spatrick
219e5dd7070Spatrick /// Return the one past end pointer for BCPL comments.
220e5dd7070Spatrick /// Handles newlines escaped with backslash or trigraph for backslahs.
findBCPLCommentEnd(const char * BufferPtr,const char * BufferEnd)221e5dd7070Spatrick const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
222e5dd7070Spatrick const char *CurPtr = BufferPtr;
223e5dd7070Spatrick while (CurPtr != BufferEnd) {
224e5dd7070Spatrick while (!isVerticalWhitespace(*CurPtr)) {
225e5dd7070Spatrick CurPtr++;
226e5dd7070Spatrick if (CurPtr == BufferEnd)
227e5dd7070Spatrick return BufferEnd;
228e5dd7070Spatrick }
229e5dd7070Spatrick // We found a newline, check if it is escaped.
230e5dd7070Spatrick const char *EscapePtr = CurPtr - 1;
231e5dd7070Spatrick while(isHorizontalWhitespace(*EscapePtr))
232e5dd7070Spatrick EscapePtr--;
233e5dd7070Spatrick
234e5dd7070Spatrick if (*EscapePtr == '\\' ||
235e5dd7070Spatrick (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
236e5dd7070Spatrick EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
237e5dd7070Spatrick // We found an escaped newline.
238e5dd7070Spatrick CurPtr = skipNewline(CurPtr, BufferEnd);
239e5dd7070Spatrick } else
240e5dd7070Spatrick return CurPtr; // Not an escaped newline.
241e5dd7070Spatrick }
242e5dd7070Spatrick return BufferEnd;
243e5dd7070Spatrick }
244e5dd7070Spatrick
245e5dd7070Spatrick /// Return the one past end pointer for C comments.
246e5dd7070Spatrick /// Very dumb, does not handle escaped newlines or trigraphs.
findCCommentEnd(const char * BufferPtr,const char * BufferEnd)247e5dd7070Spatrick const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
248e5dd7070Spatrick for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
249e5dd7070Spatrick if (*BufferPtr == '*') {
250e5dd7070Spatrick assert(BufferPtr + 1 != BufferEnd);
251e5dd7070Spatrick if (*(BufferPtr + 1) == '/')
252e5dd7070Spatrick return BufferPtr;
253e5dd7070Spatrick }
254e5dd7070Spatrick }
255e5dd7070Spatrick llvm_unreachable("buffer end hit before '*/' was seen");
256e5dd7070Spatrick }
257e5dd7070Spatrick
258e5dd7070Spatrick } // end anonymous namespace
259e5dd7070Spatrick
formTokenWithChars(Token & Result,const char * TokEnd,tok::TokenKind Kind)260e5dd7070Spatrick void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
261e5dd7070Spatrick tok::TokenKind Kind) {
262e5dd7070Spatrick const unsigned TokLen = TokEnd - BufferPtr;
263e5dd7070Spatrick Result.setLocation(getSourceLocation(BufferPtr));
264e5dd7070Spatrick Result.setKind(Kind);
265e5dd7070Spatrick Result.setLength(TokLen);
266e5dd7070Spatrick #ifndef NDEBUG
267e5dd7070Spatrick Result.TextPtr = "<UNSET>";
268e5dd7070Spatrick Result.IntVal = 7;
269e5dd7070Spatrick #endif
270e5dd7070Spatrick BufferPtr = TokEnd;
271e5dd7070Spatrick }
272e5dd7070Spatrick
skipTextToken()273*12c85518Srobert const char *Lexer::skipTextToken() {
274*12c85518Srobert const char *TokenPtr = BufferPtr;
275*12c85518Srobert assert(TokenPtr < CommentEnd);
276*12c85518Srobert StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<" : "\n\r";
277*12c85518Srobert
278*12c85518Srobert again:
279*12c85518Srobert size_t End =
280*12c85518Srobert StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(TokStartSymbols);
281*12c85518Srobert if (End == StringRef::npos)
282*12c85518Srobert return CommentEnd;
283*12c85518Srobert
284*12c85518Srobert // Doxygen doesn't recognize any commands in a one-line double quotation.
285*12c85518Srobert // If we don't find an ending quotation mark, we pretend it never began.
286*12c85518Srobert if (*(TokenPtr + End) == '\"') {
287*12c85518Srobert TokenPtr += End + 1;
288*12c85518Srobert End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r\"");
289*12c85518Srobert if (End != StringRef::npos && *(TokenPtr + End) == '\"')
290*12c85518Srobert TokenPtr += End + 1;
291*12c85518Srobert goto again;
292*12c85518Srobert }
293*12c85518Srobert return TokenPtr + End;
294*12c85518Srobert }
295*12c85518Srobert
lexCommentText(Token & T)296e5dd7070Spatrick void Lexer::lexCommentText(Token &T) {
297e5dd7070Spatrick assert(CommentState == LCS_InsideBCPLComment ||
298e5dd7070Spatrick CommentState == LCS_InsideCComment);
299e5dd7070Spatrick
300e5dd7070Spatrick // Handles lexing non-command text, i.e. text and newline.
301e5dd7070Spatrick auto HandleNonCommandToken = [&]() -> void {
302e5dd7070Spatrick assert(State == LS_Normal);
303e5dd7070Spatrick
304e5dd7070Spatrick const char *TokenPtr = BufferPtr;
305e5dd7070Spatrick assert(TokenPtr < CommentEnd);
306e5dd7070Spatrick switch (*TokenPtr) {
307e5dd7070Spatrick case '\n':
308e5dd7070Spatrick case '\r':
309e5dd7070Spatrick TokenPtr = skipNewline(TokenPtr, CommentEnd);
310e5dd7070Spatrick formTokenWithChars(T, TokenPtr, tok::newline);
311e5dd7070Spatrick
312e5dd7070Spatrick if (CommentState == LCS_InsideCComment)
313e5dd7070Spatrick skipLineStartingDecorations();
314e5dd7070Spatrick return;
315e5dd7070Spatrick
316*12c85518Srobert default:
317*12c85518Srobert return formTextToken(T, skipTextToken());
318e5dd7070Spatrick }
319e5dd7070Spatrick };
320e5dd7070Spatrick
321e5dd7070Spatrick if (!ParseCommands)
322e5dd7070Spatrick return HandleNonCommandToken();
323e5dd7070Spatrick
324e5dd7070Spatrick switch (State) {
325e5dd7070Spatrick case LS_Normal:
326e5dd7070Spatrick break;
327e5dd7070Spatrick case LS_VerbatimBlockFirstLine:
328e5dd7070Spatrick lexVerbatimBlockFirstLine(T);
329e5dd7070Spatrick return;
330e5dd7070Spatrick case LS_VerbatimBlockBody:
331e5dd7070Spatrick lexVerbatimBlockBody(T);
332e5dd7070Spatrick return;
333e5dd7070Spatrick case LS_VerbatimLineText:
334e5dd7070Spatrick lexVerbatimLineText(T);
335e5dd7070Spatrick return;
336e5dd7070Spatrick case LS_HTMLStartTag:
337e5dd7070Spatrick lexHTMLStartTag(T);
338e5dd7070Spatrick return;
339e5dd7070Spatrick case LS_HTMLEndTag:
340e5dd7070Spatrick lexHTMLEndTag(T);
341e5dd7070Spatrick return;
342e5dd7070Spatrick }
343e5dd7070Spatrick
344e5dd7070Spatrick assert(State == LS_Normal);
345e5dd7070Spatrick const char *TokenPtr = BufferPtr;
346e5dd7070Spatrick assert(TokenPtr < CommentEnd);
347e5dd7070Spatrick switch(*TokenPtr) {
348e5dd7070Spatrick case '\\':
349e5dd7070Spatrick case '@': {
350e5dd7070Spatrick // Commands that start with a backslash and commands that start with
351e5dd7070Spatrick // 'at' have equivalent semantics. But we keep information about the
352e5dd7070Spatrick // exact syntax in AST for comments.
353e5dd7070Spatrick tok::TokenKind CommandKind =
354e5dd7070Spatrick (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
355e5dd7070Spatrick TokenPtr++;
356e5dd7070Spatrick if (TokenPtr == CommentEnd) {
357e5dd7070Spatrick formTextToken(T, TokenPtr);
358e5dd7070Spatrick return;
359e5dd7070Spatrick }
360e5dd7070Spatrick char C = *TokenPtr;
361e5dd7070Spatrick switch (C) {
362e5dd7070Spatrick default:
363e5dd7070Spatrick break;
364e5dd7070Spatrick
365e5dd7070Spatrick case '\\': case '@': case '&': case '$':
366e5dd7070Spatrick case '#': case '<': case '>': case '%':
367e5dd7070Spatrick case '\"': case '.': case ':':
368e5dd7070Spatrick // This is one of \\ \@ \& \$ etc escape sequences.
369e5dd7070Spatrick TokenPtr++;
370e5dd7070Spatrick if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
371e5dd7070Spatrick // This is the \:: escape sequence.
372e5dd7070Spatrick TokenPtr++;
373e5dd7070Spatrick }
374e5dd7070Spatrick StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
375e5dd7070Spatrick formTokenWithChars(T, TokenPtr, tok::text);
376e5dd7070Spatrick T.setText(UnescapedText);
377e5dd7070Spatrick return;
378e5dd7070Spatrick }
379e5dd7070Spatrick
380e5dd7070Spatrick // Don't make zero-length commands.
381e5dd7070Spatrick if (!isCommandNameStartCharacter(*TokenPtr)) {
382e5dd7070Spatrick formTextToken(T, TokenPtr);
383e5dd7070Spatrick return;
384e5dd7070Spatrick }
385e5dd7070Spatrick
386e5dd7070Spatrick TokenPtr = skipCommandName(TokenPtr, CommentEnd);
387e5dd7070Spatrick unsigned Length = TokenPtr - (BufferPtr + 1);
388e5dd7070Spatrick
389e5dd7070Spatrick // Hardcoded support for lexing LaTeX formula commands
390*12c85518Srobert // \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
391e5dd7070Spatrick if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
392e5dd7070Spatrick C = *TokenPtr;
393*12c85518Srobert if (C == '$' || C == '(' || C == ')' || C == '[' || C == ']' ||
394*12c85518Srobert C == '{' || C == '}') {
395e5dd7070Spatrick TokenPtr++;
396e5dd7070Spatrick Length++;
397e5dd7070Spatrick }
398e5dd7070Spatrick }
399e5dd7070Spatrick
400e5dd7070Spatrick StringRef CommandName(BufferPtr + 1, Length);
401e5dd7070Spatrick
402e5dd7070Spatrick const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
403e5dd7070Spatrick if (!Info) {
404e5dd7070Spatrick if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
405e5dd7070Spatrick StringRef CorrectedName = Info->Name;
406e5dd7070Spatrick SourceLocation Loc = getSourceLocation(BufferPtr);
407e5dd7070Spatrick SourceLocation EndLoc = getSourceLocation(TokenPtr);
408e5dd7070Spatrick SourceRange FullRange = SourceRange(Loc, EndLoc);
409e5dd7070Spatrick SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
410e5dd7070Spatrick Diag(Loc, diag::warn_correct_comment_command_name)
411e5dd7070Spatrick << FullRange << CommandName << CorrectedName
412e5dd7070Spatrick << FixItHint::CreateReplacement(CommandRange, CorrectedName);
413e5dd7070Spatrick } else {
414e5dd7070Spatrick formTokenWithChars(T, TokenPtr, tok::unknown_command);
415e5dd7070Spatrick T.setUnknownCommandName(CommandName);
416e5dd7070Spatrick Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
417e5dd7070Spatrick << SourceRange(T.getLocation(), T.getEndLocation());
418e5dd7070Spatrick return;
419e5dd7070Spatrick }
420e5dd7070Spatrick }
421e5dd7070Spatrick if (Info->IsVerbatimBlockCommand) {
422e5dd7070Spatrick setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
423e5dd7070Spatrick return;
424e5dd7070Spatrick }
425e5dd7070Spatrick if (Info->IsVerbatimLineCommand) {
426e5dd7070Spatrick setupAndLexVerbatimLine(T, TokenPtr, Info);
427e5dd7070Spatrick return;
428e5dd7070Spatrick }
429e5dd7070Spatrick formTokenWithChars(T, TokenPtr, CommandKind);
430e5dd7070Spatrick T.setCommandID(Info->getID());
431e5dd7070Spatrick return;
432e5dd7070Spatrick }
433e5dd7070Spatrick
434e5dd7070Spatrick case '&':
435e5dd7070Spatrick lexHTMLCharacterReference(T);
436e5dd7070Spatrick return;
437e5dd7070Spatrick
438e5dd7070Spatrick case '<': {
439e5dd7070Spatrick TokenPtr++;
440e5dd7070Spatrick if (TokenPtr == CommentEnd) {
441e5dd7070Spatrick formTextToken(T, TokenPtr);
442e5dd7070Spatrick return;
443e5dd7070Spatrick }
444e5dd7070Spatrick const char C = *TokenPtr;
445e5dd7070Spatrick if (isHTMLIdentifierStartingCharacter(C))
446e5dd7070Spatrick setupAndLexHTMLStartTag(T);
447e5dd7070Spatrick else if (C == '/')
448e5dd7070Spatrick setupAndLexHTMLEndTag(T);
449e5dd7070Spatrick else
450e5dd7070Spatrick formTextToken(T, TokenPtr);
451e5dd7070Spatrick return;
452e5dd7070Spatrick }
453e5dd7070Spatrick
454e5dd7070Spatrick default:
455e5dd7070Spatrick return HandleNonCommandToken();
456e5dd7070Spatrick }
457e5dd7070Spatrick }
458e5dd7070Spatrick
setupAndLexVerbatimBlock(Token & T,const char * TextBegin,char Marker,const CommandInfo * Info)459e5dd7070Spatrick void Lexer::setupAndLexVerbatimBlock(Token &T,
460e5dd7070Spatrick const char *TextBegin,
461e5dd7070Spatrick char Marker, const CommandInfo *Info) {
462e5dd7070Spatrick assert(Info->IsVerbatimBlockCommand);
463e5dd7070Spatrick
464e5dd7070Spatrick VerbatimBlockEndCommandName.clear();
465e5dd7070Spatrick VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
466e5dd7070Spatrick VerbatimBlockEndCommandName.append(Info->EndCommandName);
467e5dd7070Spatrick
468e5dd7070Spatrick formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
469e5dd7070Spatrick T.setVerbatimBlockID(Info->getID());
470e5dd7070Spatrick
471e5dd7070Spatrick // If there is a newline following the verbatim opening command, skip the
472e5dd7070Spatrick // newline so that we don't create an tok::verbatim_block_line with empty
473e5dd7070Spatrick // text content.
474e5dd7070Spatrick if (BufferPtr != CommentEnd &&
475e5dd7070Spatrick isVerticalWhitespace(*BufferPtr)) {
476e5dd7070Spatrick BufferPtr = skipNewline(BufferPtr, CommentEnd);
477e5dd7070Spatrick State = LS_VerbatimBlockBody;
478e5dd7070Spatrick return;
479e5dd7070Spatrick }
480e5dd7070Spatrick
481e5dd7070Spatrick State = LS_VerbatimBlockFirstLine;
482e5dd7070Spatrick }
483e5dd7070Spatrick
lexVerbatimBlockFirstLine(Token & T)484e5dd7070Spatrick void Lexer::lexVerbatimBlockFirstLine(Token &T) {
485e5dd7070Spatrick again:
486e5dd7070Spatrick assert(BufferPtr < CommentEnd);
487e5dd7070Spatrick
488e5dd7070Spatrick // FIXME: It would be better to scan the text once, finding either the block
489e5dd7070Spatrick // end command or newline.
490e5dd7070Spatrick //
491e5dd7070Spatrick // Extract current line.
492e5dd7070Spatrick const char *Newline = findNewline(BufferPtr, CommentEnd);
493e5dd7070Spatrick StringRef Line(BufferPtr, Newline - BufferPtr);
494e5dd7070Spatrick
495e5dd7070Spatrick // Look for end command in current line.
496e5dd7070Spatrick size_t Pos = Line.find(VerbatimBlockEndCommandName);
497e5dd7070Spatrick const char *TextEnd;
498e5dd7070Spatrick const char *NextLine;
499e5dd7070Spatrick if (Pos == StringRef::npos) {
500e5dd7070Spatrick // Current line is completely verbatim.
501e5dd7070Spatrick TextEnd = Newline;
502e5dd7070Spatrick NextLine = skipNewline(Newline, CommentEnd);
503e5dd7070Spatrick } else if (Pos == 0) {
504e5dd7070Spatrick // Current line contains just an end command.
505e5dd7070Spatrick const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
506e5dd7070Spatrick StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
507e5dd7070Spatrick formTokenWithChars(T, End, tok::verbatim_block_end);
508e5dd7070Spatrick T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
509e5dd7070Spatrick State = LS_Normal;
510e5dd7070Spatrick return;
511e5dd7070Spatrick } else {
512e5dd7070Spatrick // There is some text, followed by end command. Extract text first.
513e5dd7070Spatrick TextEnd = BufferPtr + Pos;
514e5dd7070Spatrick NextLine = TextEnd;
515e5dd7070Spatrick // If there is only whitespace before end command, skip whitespace.
516e5dd7070Spatrick if (isWhitespace(BufferPtr, TextEnd)) {
517e5dd7070Spatrick BufferPtr = TextEnd;
518e5dd7070Spatrick goto again;
519e5dd7070Spatrick }
520e5dd7070Spatrick }
521e5dd7070Spatrick
522e5dd7070Spatrick StringRef Text(BufferPtr, TextEnd - BufferPtr);
523e5dd7070Spatrick formTokenWithChars(T, NextLine, tok::verbatim_block_line);
524e5dd7070Spatrick T.setVerbatimBlockText(Text);
525e5dd7070Spatrick
526e5dd7070Spatrick State = LS_VerbatimBlockBody;
527e5dd7070Spatrick }
528e5dd7070Spatrick
lexVerbatimBlockBody(Token & T)529e5dd7070Spatrick void Lexer::lexVerbatimBlockBody(Token &T) {
530e5dd7070Spatrick assert(State == LS_VerbatimBlockBody);
531e5dd7070Spatrick
532e5dd7070Spatrick if (CommentState == LCS_InsideCComment)
533e5dd7070Spatrick skipLineStartingDecorations();
534e5dd7070Spatrick
535e5dd7070Spatrick if (BufferPtr == CommentEnd) {
536e5dd7070Spatrick formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
537e5dd7070Spatrick T.setVerbatimBlockText("");
538e5dd7070Spatrick return;
539e5dd7070Spatrick }
540e5dd7070Spatrick
541e5dd7070Spatrick lexVerbatimBlockFirstLine(T);
542e5dd7070Spatrick }
543e5dd7070Spatrick
setupAndLexVerbatimLine(Token & T,const char * TextBegin,const CommandInfo * Info)544e5dd7070Spatrick void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
545e5dd7070Spatrick const CommandInfo *Info) {
546e5dd7070Spatrick assert(Info->IsVerbatimLineCommand);
547e5dd7070Spatrick formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
548e5dd7070Spatrick T.setVerbatimLineID(Info->getID());
549e5dd7070Spatrick
550e5dd7070Spatrick State = LS_VerbatimLineText;
551e5dd7070Spatrick }
552e5dd7070Spatrick
lexVerbatimLineText(Token & T)553e5dd7070Spatrick void Lexer::lexVerbatimLineText(Token &T) {
554e5dd7070Spatrick assert(State == LS_VerbatimLineText);
555e5dd7070Spatrick
556e5dd7070Spatrick // Extract current line.
557e5dd7070Spatrick const char *Newline = findNewline(BufferPtr, CommentEnd);
558e5dd7070Spatrick StringRef Text(BufferPtr, Newline - BufferPtr);
559e5dd7070Spatrick formTokenWithChars(T, Newline, tok::verbatim_line_text);
560e5dd7070Spatrick T.setVerbatimLineText(Text);
561e5dd7070Spatrick
562e5dd7070Spatrick State = LS_Normal;
563e5dd7070Spatrick }
564e5dd7070Spatrick
lexHTMLCharacterReference(Token & T)565e5dd7070Spatrick void Lexer::lexHTMLCharacterReference(Token &T) {
566e5dd7070Spatrick const char *TokenPtr = BufferPtr;
567e5dd7070Spatrick assert(*TokenPtr == '&');
568e5dd7070Spatrick TokenPtr++;
569e5dd7070Spatrick if (TokenPtr == CommentEnd) {
570e5dd7070Spatrick formTextToken(T, TokenPtr);
571e5dd7070Spatrick return;
572e5dd7070Spatrick }
573e5dd7070Spatrick const char *NamePtr;
574e5dd7070Spatrick bool isNamed = false;
575e5dd7070Spatrick bool isDecimal = false;
576e5dd7070Spatrick char C = *TokenPtr;
577e5dd7070Spatrick if (isHTMLNamedCharacterReferenceCharacter(C)) {
578e5dd7070Spatrick NamePtr = TokenPtr;
579e5dd7070Spatrick TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
580e5dd7070Spatrick isNamed = true;
581e5dd7070Spatrick } else if (C == '#') {
582e5dd7070Spatrick TokenPtr++;
583e5dd7070Spatrick if (TokenPtr == CommentEnd) {
584e5dd7070Spatrick formTextToken(T, TokenPtr);
585e5dd7070Spatrick return;
586e5dd7070Spatrick }
587e5dd7070Spatrick C = *TokenPtr;
588e5dd7070Spatrick if (isHTMLDecimalCharacterReferenceCharacter(C)) {
589e5dd7070Spatrick NamePtr = TokenPtr;
590e5dd7070Spatrick TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
591e5dd7070Spatrick isDecimal = true;
592e5dd7070Spatrick } else if (C == 'x' || C == 'X') {
593e5dd7070Spatrick TokenPtr++;
594e5dd7070Spatrick NamePtr = TokenPtr;
595e5dd7070Spatrick TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
596e5dd7070Spatrick } else {
597e5dd7070Spatrick formTextToken(T, TokenPtr);
598e5dd7070Spatrick return;
599e5dd7070Spatrick }
600e5dd7070Spatrick } else {
601e5dd7070Spatrick formTextToken(T, TokenPtr);
602e5dd7070Spatrick return;
603e5dd7070Spatrick }
604e5dd7070Spatrick if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
605e5dd7070Spatrick *TokenPtr != ';') {
606e5dd7070Spatrick formTextToken(T, TokenPtr);
607e5dd7070Spatrick return;
608e5dd7070Spatrick }
609e5dd7070Spatrick StringRef Name(NamePtr, TokenPtr - NamePtr);
610e5dd7070Spatrick TokenPtr++; // Skip semicolon.
611e5dd7070Spatrick StringRef Resolved;
612e5dd7070Spatrick if (isNamed)
613e5dd7070Spatrick Resolved = resolveHTMLNamedCharacterReference(Name);
614e5dd7070Spatrick else if (isDecimal)
615e5dd7070Spatrick Resolved = resolveHTMLDecimalCharacterReference(Name);
616e5dd7070Spatrick else
617e5dd7070Spatrick Resolved = resolveHTMLHexCharacterReference(Name);
618e5dd7070Spatrick
619e5dd7070Spatrick if (Resolved.empty()) {
620e5dd7070Spatrick formTextToken(T, TokenPtr);
621e5dd7070Spatrick return;
622e5dd7070Spatrick }
623e5dd7070Spatrick formTokenWithChars(T, TokenPtr, tok::text);
624e5dd7070Spatrick T.setText(Resolved);
625e5dd7070Spatrick }
626e5dd7070Spatrick
setupAndLexHTMLStartTag(Token & T)627e5dd7070Spatrick void Lexer::setupAndLexHTMLStartTag(Token &T) {
628e5dd7070Spatrick assert(BufferPtr[0] == '<' &&
629e5dd7070Spatrick isHTMLIdentifierStartingCharacter(BufferPtr[1]));
630e5dd7070Spatrick const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
631e5dd7070Spatrick StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
632e5dd7070Spatrick if (!isHTMLTagName(Name)) {
633e5dd7070Spatrick formTextToken(T, TagNameEnd);
634e5dd7070Spatrick return;
635e5dd7070Spatrick }
636e5dd7070Spatrick
637e5dd7070Spatrick formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
638e5dd7070Spatrick T.setHTMLTagStartName(Name);
639e5dd7070Spatrick
640e5dd7070Spatrick BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
641e5dd7070Spatrick
642e5dd7070Spatrick const char C = *BufferPtr;
643e5dd7070Spatrick if (BufferPtr != CommentEnd &&
644e5dd7070Spatrick (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
645e5dd7070Spatrick State = LS_HTMLStartTag;
646e5dd7070Spatrick }
647e5dd7070Spatrick
lexHTMLStartTag(Token & T)648e5dd7070Spatrick void Lexer::lexHTMLStartTag(Token &T) {
649e5dd7070Spatrick assert(State == LS_HTMLStartTag);
650e5dd7070Spatrick
651e5dd7070Spatrick const char *TokenPtr = BufferPtr;
652e5dd7070Spatrick char C = *TokenPtr;
653e5dd7070Spatrick if (isHTMLIdentifierCharacter(C)) {
654e5dd7070Spatrick TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
655e5dd7070Spatrick StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
656e5dd7070Spatrick formTokenWithChars(T, TokenPtr, tok::html_ident);
657e5dd7070Spatrick T.setHTMLIdent(Ident);
658e5dd7070Spatrick } else {
659e5dd7070Spatrick switch (C) {
660e5dd7070Spatrick case '=':
661e5dd7070Spatrick TokenPtr++;
662e5dd7070Spatrick formTokenWithChars(T, TokenPtr, tok::html_equals);
663e5dd7070Spatrick break;
664e5dd7070Spatrick case '\"':
665e5dd7070Spatrick case '\'': {
666e5dd7070Spatrick const char *OpenQuote = TokenPtr;
667e5dd7070Spatrick TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
668e5dd7070Spatrick const char *ClosingQuote = TokenPtr;
669e5dd7070Spatrick if (TokenPtr != CommentEnd) // Skip closing quote.
670e5dd7070Spatrick TokenPtr++;
671e5dd7070Spatrick formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
672e5dd7070Spatrick T.setHTMLQuotedString(StringRef(OpenQuote + 1,
673e5dd7070Spatrick ClosingQuote - (OpenQuote + 1)));
674e5dd7070Spatrick break;
675e5dd7070Spatrick }
676e5dd7070Spatrick case '>':
677e5dd7070Spatrick TokenPtr++;
678e5dd7070Spatrick formTokenWithChars(T, TokenPtr, tok::html_greater);
679e5dd7070Spatrick State = LS_Normal;
680e5dd7070Spatrick return;
681e5dd7070Spatrick case '/':
682e5dd7070Spatrick TokenPtr++;
683e5dd7070Spatrick if (TokenPtr != CommentEnd && *TokenPtr == '>') {
684e5dd7070Spatrick TokenPtr++;
685e5dd7070Spatrick formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
686e5dd7070Spatrick } else
687e5dd7070Spatrick formTextToken(T, TokenPtr);
688e5dd7070Spatrick
689e5dd7070Spatrick State = LS_Normal;
690e5dd7070Spatrick return;
691e5dd7070Spatrick }
692e5dd7070Spatrick }
693e5dd7070Spatrick
694e5dd7070Spatrick // Now look ahead and return to normal state if we don't see any HTML tokens
695e5dd7070Spatrick // ahead.
696e5dd7070Spatrick BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
697e5dd7070Spatrick if (BufferPtr == CommentEnd) {
698e5dd7070Spatrick State = LS_Normal;
699e5dd7070Spatrick return;
700e5dd7070Spatrick }
701e5dd7070Spatrick
702e5dd7070Spatrick C = *BufferPtr;
703e5dd7070Spatrick if (!isHTMLIdentifierStartingCharacter(C) &&
704*12c85518Srobert C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') {
705e5dd7070Spatrick State = LS_Normal;
706e5dd7070Spatrick return;
707e5dd7070Spatrick }
708e5dd7070Spatrick }
709e5dd7070Spatrick
setupAndLexHTMLEndTag(Token & T)710e5dd7070Spatrick void Lexer::setupAndLexHTMLEndTag(Token &T) {
711e5dd7070Spatrick assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
712e5dd7070Spatrick
713e5dd7070Spatrick const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
714e5dd7070Spatrick const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
715e5dd7070Spatrick StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
716e5dd7070Spatrick if (!isHTMLTagName(Name)) {
717e5dd7070Spatrick formTextToken(T, TagNameEnd);
718e5dd7070Spatrick return;
719e5dd7070Spatrick }
720e5dd7070Spatrick
721e5dd7070Spatrick const char *End = skipWhitespace(TagNameEnd, CommentEnd);
722e5dd7070Spatrick
723e5dd7070Spatrick formTokenWithChars(T, End, tok::html_end_tag);
724e5dd7070Spatrick T.setHTMLTagEndName(Name);
725e5dd7070Spatrick
726e5dd7070Spatrick if (BufferPtr != CommentEnd && *BufferPtr == '>')
727e5dd7070Spatrick State = LS_HTMLEndTag;
728e5dd7070Spatrick }
729e5dd7070Spatrick
lexHTMLEndTag(Token & T)730e5dd7070Spatrick void Lexer::lexHTMLEndTag(Token &T) {
731e5dd7070Spatrick assert(BufferPtr != CommentEnd && *BufferPtr == '>');
732e5dd7070Spatrick
733e5dd7070Spatrick formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
734e5dd7070Spatrick State = LS_Normal;
735e5dd7070Spatrick }
736e5dd7070Spatrick
Lexer(llvm::BumpPtrAllocator & Allocator,DiagnosticsEngine & Diags,const CommandTraits & Traits,SourceLocation FileLoc,const char * BufferStart,const char * BufferEnd,bool ParseCommands)737e5dd7070Spatrick Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
738e5dd7070Spatrick const CommandTraits &Traits, SourceLocation FileLoc,
739a9ac8606Spatrick const char *BufferStart, const char *BufferEnd, bool ParseCommands)
740e5dd7070Spatrick : Allocator(Allocator), Diags(Diags), Traits(Traits),
741a9ac8606Spatrick BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
742a9ac8606Spatrick FileLoc(FileLoc), ParseCommands(ParseCommands),
743a9ac8606Spatrick CommentState(LCS_BeforeComment), State(LS_Normal) {}
744e5dd7070Spatrick
lex(Token & T)745e5dd7070Spatrick void Lexer::lex(Token &T) {
746e5dd7070Spatrick again:
747e5dd7070Spatrick switch (CommentState) {
748e5dd7070Spatrick case LCS_BeforeComment:
749e5dd7070Spatrick if (BufferPtr == BufferEnd) {
750e5dd7070Spatrick formTokenWithChars(T, BufferPtr, tok::eof);
751e5dd7070Spatrick return;
752e5dd7070Spatrick }
753e5dd7070Spatrick
754e5dd7070Spatrick assert(*BufferPtr == '/');
755e5dd7070Spatrick BufferPtr++; // Skip first slash.
756e5dd7070Spatrick switch(*BufferPtr) {
757e5dd7070Spatrick case '/': { // BCPL comment.
758e5dd7070Spatrick BufferPtr++; // Skip second slash.
759e5dd7070Spatrick
760e5dd7070Spatrick if (BufferPtr != BufferEnd) {
761e5dd7070Spatrick // Skip Doxygen magic marker, if it is present.
762e5dd7070Spatrick // It might be missing because of a typo //< or /*<, or because we
763e5dd7070Spatrick // merged this non-Doxygen comment into a bunch of Doxygen comments
764e5dd7070Spatrick // around it: /** ... */ /* ... */ /** ... */
765e5dd7070Spatrick const char C = *BufferPtr;
766e5dd7070Spatrick if (C == '/' || C == '!')
767e5dd7070Spatrick BufferPtr++;
768e5dd7070Spatrick }
769e5dd7070Spatrick
770e5dd7070Spatrick // Skip less-than symbol that marks trailing comments.
771e5dd7070Spatrick // Skip it even if the comment is not a Doxygen one, because //< and /*<
772e5dd7070Spatrick // are frequent typos.
773e5dd7070Spatrick if (BufferPtr != BufferEnd && *BufferPtr == '<')
774e5dd7070Spatrick BufferPtr++;
775e5dd7070Spatrick
776e5dd7070Spatrick CommentState = LCS_InsideBCPLComment;
777e5dd7070Spatrick if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
778e5dd7070Spatrick State = LS_Normal;
779e5dd7070Spatrick CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
780e5dd7070Spatrick goto again;
781e5dd7070Spatrick }
782e5dd7070Spatrick case '*': { // C comment.
783e5dd7070Spatrick BufferPtr++; // Skip star.
784e5dd7070Spatrick
785e5dd7070Spatrick // Skip Doxygen magic marker.
786e5dd7070Spatrick const char C = *BufferPtr;
787e5dd7070Spatrick if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
788e5dd7070Spatrick BufferPtr++;
789e5dd7070Spatrick
790e5dd7070Spatrick // Skip less-than symbol that marks trailing comments.
791e5dd7070Spatrick if (BufferPtr != BufferEnd && *BufferPtr == '<')
792e5dd7070Spatrick BufferPtr++;
793e5dd7070Spatrick
794e5dd7070Spatrick CommentState = LCS_InsideCComment;
795e5dd7070Spatrick State = LS_Normal;
796e5dd7070Spatrick CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
797e5dd7070Spatrick goto again;
798e5dd7070Spatrick }
799e5dd7070Spatrick default:
800e5dd7070Spatrick llvm_unreachable("second character of comment should be '/' or '*'");
801e5dd7070Spatrick }
802e5dd7070Spatrick
803e5dd7070Spatrick case LCS_BetweenComments: {
804e5dd7070Spatrick // Consecutive comments are extracted only if there is only whitespace
805e5dd7070Spatrick // between them. So we can search for the start of the next comment.
806e5dd7070Spatrick const char *EndWhitespace = BufferPtr;
807e5dd7070Spatrick while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
808e5dd7070Spatrick EndWhitespace++;
809e5dd7070Spatrick
810e5dd7070Spatrick // Turn any whitespace between comments (and there is only whitespace
811e5dd7070Spatrick // between them -- guaranteed by comment extraction) into a newline. We
812e5dd7070Spatrick // have two newlines between C comments in total (first one was synthesized
813e5dd7070Spatrick // after a comment).
814e5dd7070Spatrick formTokenWithChars(T, EndWhitespace, tok::newline);
815e5dd7070Spatrick
816e5dd7070Spatrick CommentState = LCS_BeforeComment;
817e5dd7070Spatrick break;
818e5dd7070Spatrick }
819e5dd7070Spatrick
820e5dd7070Spatrick case LCS_InsideBCPLComment:
821e5dd7070Spatrick case LCS_InsideCComment:
822e5dd7070Spatrick if (BufferPtr != CommentEnd) {
823e5dd7070Spatrick lexCommentText(T);
824e5dd7070Spatrick break;
825e5dd7070Spatrick } else {
826e5dd7070Spatrick // Skip C comment closing sequence.
827e5dd7070Spatrick if (CommentState == LCS_InsideCComment) {
828e5dd7070Spatrick assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
829e5dd7070Spatrick BufferPtr += 2;
830e5dd7070Spatrick assert(BufferPtr <= BufferEnd);
831e5dd7070Spatrick
832e5dd7070Spatrick // Synthenize newline just after the C comment, regardless if there is
833e5dd7070Spatrick // actually a newline.
834e5dd7070Spatrick formTokenWithChars(T, BufferPtr, tok::newline);
835e5dd7070Spatrick
836e5dd7070Spatrick CommentState = LCS_BetweenComments;
837e5dd7070Spatrick break;
838e5dd7070Spatrick } else {
839e5dd7070Spatrick // Don't synthesized a newline after BCPL comment.
840e5dd7070Spatrick CommentState = LCS_BetweenComments;
841e5dd7070Spatrick goto again;
842e5dd7070Spatrick }
843e5dd7070Spatrick }
844e5dd7070Spatrick }
845e5dd7070Spatrick }
846e5dd7070Spatrick
getSpelling(const Token & Tok,const SourceManager & SourceMgr) const847e5dd7070Spatrick StringRef Lexer::getSpelling(const Token &Tok,
848e5dd7070Spatrick const SourceManager &SourceMgr) const {
849e5dd7070Spatrick SourceLocation Loc = Tok.getLocation();
850e5dd7070Spatrick std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
851e5dd7070Spatrick
852e5dd7070Spatrick bool InvalidTemp = false;
853e5dd7070Spatrick StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
854e5dd7070Spatrick if (InvalidTemp)
855e5dd7070Spatrick return StringRef();
856e5dd7070Spatrick
857e5dd7070Spatrick const char *Begin = File.data() + LocInfo.second;
858e5dd7070Spatrick return StringRef(Begin, Tok.getLength());
859e5dd7070Spatrick }
860e5dd7070Spatrick
861e5dd7070Spatrick } // end namespace comments
862e5dd7070Spatrick } // end namespace clang
863