xref: /openbsd-src/gnu/llvm/clang/lib/AST/CommentLexer.cpp (revision 12c855180aad702bbcca06e0398d774beeafb155)
1e5dd7070Spatrick //===--- CommentLexer.cpp -------------------------------------------------===//
2e5dd7070Spatrick //
3e5dd7070Spatrick // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4e5dd7070Spatrick // See https://llvm.org/LICENSE.txt for license information.
5e5dd7070Spatrick // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6e5dd7070Spatrick //
7e5dd7070Spatrick //===----------------------------------------------------------------------===//
8e5dd7070Spatrick 
9e5dd7070Spatrick #include "clang/AST/CommentLexer.h"
10e5dd7070Spatrick #include "clang/AST/CommentCommandTraits.h"
11e5dd7070Spatrick #include "clang/AST/CommentDiagnostic.h"
12e5dd7070Spatrick #include "clang/Basic/CharInfo.h"
13e5dd7070Spatrick #include "llvm/ADT/StringExtras.h"
14e5dd7070Spatrick #include "llvm/ADT/StringSwitch.h"
15e5dd7070Spatrick #include "llvm/Support/ConvertUTF.h"
16e5dd7070Spatrick #include "llvm/Support/ErrorHandling.h"
17e5dd7070Spatrick 
18e5dd7070Spatrick namespace clang {
19e5dd7070Spatrick namespace comments {
20e5dd7070Spatrick 
dump(const Lexer & L,const SourceManager & SM) const21e5dd7070Spatrick void Token::dump(const Lexer &L, const SourceManager &SM) const {
22e5dd7070Spatrick   llvm::errs() << "comments::Token Kind=" << Kind << " ";
23e5dd7070Spatrick   Loc.print(llvm::errs(), SM);
24e5dd7070Spatrick   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
25e5dd7070Spatrick }
26e5dd7070Spatrick 
isHTMLNamedCharacterReferenceCharacter(char C)27e5dd7070Spatrick static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
28e5dd7070Spatrick   return isLetter(C);
29e5dd7070Spatrick }
30e5dd7070Spatrick 
isHTMLDecimalCharacterReferenceCharacter(char C)31e5dd7070Spatrick static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
32e5dd7070Spatrick   return isDigit(C);
33e5dd7070Spatrick }
34e5dd7070Spatrick 
isHTMLHexCharacterReferenceCharacter(char C)35e5dd7070Spatrick static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
36e5dd7070Spatrick   return isHexDigit(C);
37e5dd7070Spatrick }
38e5dd7070Spatrick 
convertCodePointToUTF8(llvm::BumpPtrAllocator & Allocator,unsigned CodePoint)39e5dd7070Spatrick static inline StringRef convertCodePointToUTF8(
40e5dd7070Spatrick                                       llvm::BumpPtrAllocator &Allocator,
41e5dd7070Spatrick                                       unsigned CodePoint) {
42e5dd7070Spatrick   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
43e5dd7070Spatrick   char *ResolvedPtr = Resolved;
44e5dd7070Spatrick   if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
45e5dd7070Spatrick     return StringRef(Resolved, ResolvedPtr - Resolved);
46e5dd7070Spatrick   else
47e5dd7070Spatrick     return StringRef();
48e5dd7070Spatrick }
49e5dd7070Spatrick 
50e5dd7070Spatrick namespace {
51e5dd7070Spatrick 
52e5dd7070Spatrick #include "clang/AST/CommentHTMLTags.inc"
53e5dd7070Spatrick #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
54e5dd7070Spatrick 
55e5dd7070Spatrick } // end anonymous namespace
56e5dd7070Spatrick 
resolveHTMLNamedCharacterReference(StringRef Name) const57e5dd7070Spatrick StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
58e5dd7070Spatrick   // Fast path, first check a few most widely used named character references.
59e5dd7070Spatrick   return llvm::StringSwitch<StringRef>(Name)
60e5dd7070Spatrick       .Case("amp", "&")
61e5dd7070Spatrick       .Case("lt", "<")
62e5dd7070Spatrick       .Case("gt", ">")
63e5dd7070Spatrick       .Case("quot", "\"")
64e5dd7070Spatrick       .Case("apos", "\'")
65e5dd7070Spatrick       // Slow path.
66e5dd7070Spatrick       .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
67e5dd7070Spatrick }
68e5dd7070Spatrick 
resolveHTMLDecimalCharacterReference(StringRef Name) const69e5dd7070Spatrick StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
70e5dd7070Spatrick   unsigned CodePoint = 0;
71e5dd7070Spatrick   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
72e5dd7070Spatrick     assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
73e5dd7070Spatrick     CodePoint *= 10;
74e5dd7070Spatrick     CodePoint += Name[i] - '0';
75e5dd7070Spatrick   }
76e5dd7070Spatrick   return convertCodePointToUTF8(Allocator, CodePoint);
77e5dd7070Spatrick }
78e5dd7070Spatrick 
resolveHTMLHexCharacterReference(StringRef Name) const79e5dd7070Spatrick StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
80e5dd7070Spatrick   unsigned CodePoint = 0;
81e5dd7070Spatrick   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
82e5dd7070Spatrick     CodePoint *= 16;
83e5dd7070Spatrick     const char C = Name[i];
84e5dd7070Spatrick     assert(isHTMLHexCharacterReferenceCharacter(C));
85e5dd7070Spatrick     CodePoint += llvm::hexDigitValue(C);
86e5dd7070Spatrick   }
87e5dd7070Spatrick   return convertCodePointToUTF8(Allocator, CodePoint);
88e5dd7070Spatrick }
89e5dd7070Spatrick 
skipLineStartingDecorations()90e5dd7070Spatrick void Lexer::skipLineStartingDecorations() {
91e5dd7070Spatrick   // This function should be called only for C comments
92e5dd7070Spatrick   assert(CommentState == LCS_InsideCComment);
93e5dd7070Spatrick 
94e5dd7070Spatrick   if (BufferPtr == CommentEnd)
95e5dd7070Spatrick     return;
96e5dd7070Spatrick 
97e5dd7070Spatrick   const char *NewBufferPtr = BufferPtr;
98*12c85518Srobert   while (isHorizontalWhitespace(*NewBufferPtr))
99*12c85518Srobert     if (++NewBufferPtr == CommentEnd)
100e5dd7070Spatrick       return;
101*12c85518Srobert   if (*NewBufferPtr == '*')
102e5dd7070Spatrick     BufferPtr = NewBufferPtr + 1;
103e5dd7070Spatrick }
104e5dd7070Spatrick 
105e5dd7070Spatrick namespace {
106e5dd7070Spatrick /// Returns pointer to the first newline character in the string.
findNewline(const char * BufferPtr,const char * BufferEnd)107e5dd7070Spatrick const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
108e5dd7070Spatrick   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
109e5dd7070Spatrick     if (isVerticalWhitespace(*BufferPtr))
110e5dd7070Spatrick       return BufferPtr;
111e5dd7070Spatrick   }
112e5dd7070Spatrick   return BufferEnd;
113e5dd7070Spatrick }
114e5dd7070Spatrick 
/// Skips one line terminator ("\n", "\r" or "\r\n") at \p BufferPtr.
///
/// \returns the position just past the terminator; \p BufferPtr itself when
/// the range is empty.  The character at \p BufferPtr is expected to be '\n'
/// or '\r' (asserted).
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char *Next = BufferPtr + 1;
  if (*BufferPtr == '\n')
    return Next;

  assert(*BufferPtr == '\r');
  // A '\n' directly after '\r' belongs to the same line terminator.
  if (Next != BufferEnd && *Next == '\n')
    ++Next;
  return Next;
}
129e5dd7070Spatrick 
skipNamedCharacterReference(const char * BufferPtr,const char * BufferEnd)130e5dd7070Spatrick const char *skipNamedCharacterReference(const char *BufferPtr,
131e5dd7070Spatrick                                         const char *BufferEnd) {
132e5dd7070Spatrick   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
133e5dd7070Spatrick     if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
134e5dd7070Spatrick       return BufferPtr;
135e5dd7070Spatrick   }
136e5dd7070Spatrick   return BufferEnd;
137e5dd7070Spatrick }
138e5dd7070Spatrick 
skipDecimalCharacterReference(const char * BufferPtr,const char * BufferEnd)139e5dd7070Spatrick const char *skipDecimalCharacterReference(const char *BufferPtr,
140e5dd7070Spatrick                                           const char *BufferEnd) {
141e5dd7070Spatrick   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
142e5dd7070Spatrick     if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
143e5dd7070Spatrick       return BufferPtr;
144e5dd7070Spatrick   }
145e5dd7070Spatrick   return BufferEnd;
146e5dd7070Spatrick }
147e5dd7070Spatrick 
skipHexCharacterReference(const char * BufferPtr,const char * BufferEnd)148e5dd7070Spatrick const char *skipHexCharacterReference(const char *BufferPtr,
149e5dd7070Spatrick                                       const char *BufferEnd) {
150e5dd7070Spatrick   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
151e5dd7070Spatrick     if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
152e5dd7070Spatrick       return BufferPtr;
153e5dd7070Spatrick   }
154e5dd7070Spatrick   return BufferEnd;
155e5dd7070Spatrick }
156e5dd7070Spatrick 
isHTMLIdentifierStartingCharacter(char C)157e5dd7070Spatrick bool isHTMLIdentifierStartingCharacter(char C) {
158e5dd7070Spatrick   return isLetter(C);
159e5dd7070Spatrick }
160e5dd7070Spatrick 
isHTMLIdentifierCharacter(char C)161e5dd7070Spatrick bool isHTMLIdentifierCharacter(char C) {
162e5dd7070Spatrick   return isAlphanumeric(C);
163e5dd7070Spatrick }
164e5dd7070Spatrick 
skipHTMLIdentifier(const char * BufferPtr,const char * BufferEnd)165e5dd7070Spatrick const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
166e5dd7070Spatrick   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
167e5dd7070Spatrick     if (!isHTMLIdentifierCharacter(*BufferPtr))
168e5dd7070Spatrick       return BufferPtr;
169e5dd7070Spatrick   }
170e5dd7070Spatrick   return BufferEnd;
171e5dd7070Spatrick }
172e5dd7070Spatrick 
/// Skip an HTML string quoted in single or double quotes.  A quote character
/// directly preceded by a backslash does not terminate the string.
///
/// \p BufferPtr must point at the opening quote (asserted).
///
/// Returns pointer to closing quote, or \p BufferEnd if the string is not
/// terminated within the range.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cur = BufferPtr + 1;
  while (Cur != BufferEnd) {
    if (*Cur == Quote && *(Cur - 1) != '\\')
      return Cur;
    ++Cur;
  }
  return BufferEnd;
}
190e5dd7070Spatrick 
skipWhitespace(const char * BufferPtr,const char * BufferEnd)191e5dd7070Spatrick const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
192e5dd7070Spatrick   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
193e5dd7070Spatrick     if (!isWhitespace(*BufferPtr))
194e5dd7070Spatrick       return BufferPtr;
195e5dd7070Spatrick   }
196e5dd7070Spatrick   return BufferEnd;
197e5dd7070Spatrick }
198e5dd7070Spatrick 
isWhitespace(const char * BufferPtr,const char * BufferEnd)199e5dd7070Spatrick bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
200e5dd7070Spatrick   return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
201e5dd7070Spatrick }
202e5dd7070Spatrick 
isCommandNameStartCharacter(char C)203e5dd7070Spatrick bool isCommandNameStartCharacter(char C) {
204e5dd7070Spatrick   return isLetter(C);
205e5dd7070Spatrick }
206e5dd7070Spatrick 
isCommandNameCharacter(char C)207e5dd7070Spatrick bool isCommandNameCharacter(char C) {
208e5dd7070Spatrick   return isAlphanumeric(C);
209e5dd7070Spatrick }
210e5dd7070Spatrick 
skipCommandName(const char * BufferPtr,const char * BufferEnd)211e5dd7070Spatrick const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
212e5dd7070Spatrick   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
213e5dd7070Spatrick     if (!isCommandNameCharacter(*BufferPtr))
214e5dd7070Spatrick       return BufferPtr;
215e5dd7070Spatrick   }
216e5dd7070Spatrick   return BufferEnd;
217e5dd7070Spatrick }
218e5dd7070Spatrick 
219e5dd7070Spatrick /// Return the one past end pointer for BCPL comments.
220e5dd7070Spatrick /// Handles newlines escaped with backslash or trigraph for backslahs.
findBCPLCommentEnd(const char * BufferPtr,const char * BufferEnd)221e5dd7070Spatrick const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
222e5dd7070Spatrick   const char *CurPtr = BufferPtr;
223e5dd7070Spatrick   while (CurPtr != BufferEnd) {
224e5dd7070Spatrick     while (!isVerticalWhitespace(*CurPtr)) {
225e5dd7070Spatrick       CurPtr++;
226e5dd7070Spatrick       if (CurPtr == BufferEnd)
227e5dd7070Spatrick         return BufferEnd;
228e5dd7070Spatrick     }
229e5dd7070Spatrick     // We found a newline, check if it is escaped.
230e5dd7070Spatrick     const char *EscapePtr = CurPtr - 1;
231e5dd7070Spatrick     while(isHorizontalWhitespace(*EscapePtr))
232e5dd7070Spatrick       EscapePtr--;
233e5dd7070Spatrick 
234e5dd7070Spatrick     if (*EscapePtr == '\\' ||
235e5dd7070Spatrick         (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
236e5dd7070Spatrick          EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
237e5dd7070Spatrick       // We found an escaped newline.
238e5dd7070Spatrick       CurPtr = skipNewline(CurPtr, BufferEnd);
239e5dd7070Spatrick     } else
240e5dd7070Spatrick       return CurPtr; // Not an escaped newline.
241e5dd7070Spatrick   }
242e5dd7070Spatrick   return BufferEnd;
243e5dd7070Spatrick }
244e5dd7070Spatrick 
245e5dd7070Spatrick /// Return the one past end pointer for C comments.
246e5dd7070Spatrick /// Very dumb, does not handle escaped newlines or trigraphs.
findCCommentEnd(const char * BufferPtr,const char * BufferEnd)247e5dd7070Spatrick const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
248e5dd7070Spatrick   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
249e5dd7070Spatrick     if (*BufferPtr == '*') {
250e5dd7070Spatrick       assert(BufferPtr + 1 != BufferEnd);
251e5dd7070Spatrick       if (*(BufferPtr + 1) == '/')
252e5dd7070Spatrick         return BufferPtr;
253e5dd7070Spatrick     }
254e5dd7070Spatrick   }
255e5dd7070Spatrick   llvm_unreachable("buffer end hit before '*/' was seen");
256e5dd7070Spatrick }
257e5dd7070Spatrick 
258e5dd7070Spatrick } // end anonymous namespace
259e5dd7070Spatrick 
formTokenWithChars(Token & Result,const char * TokEnd,tok::TokenKind Kind)260e5dd7070Spatrick void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
261e5dd7070Spatrick                                tok::TokenKind Kind) {
262e5dd7070Spatrick   const unsigned TokLen = TokEnd - BufferPtr;
263e5dd7070Spatrick   Result.setLocation(getSourceLocation(BufferPtr));
264e5dd7070Spatrick   Result.setKind(Kind);
265e5dd7070Spatrick   Result.setLength(TokLen);
266e5dd7070Spatrick #ifndef NDEBUG
267e5dd7070Spatrick   Result.TextPtr = "<UNSET>";
268e5dd7070Spatrick   Result.IntVal = 7;
269e5dd7070Spatrick #endif
270e5dd7070Spatrick   BufferPtr = TokEnd;
271e5dd7070Spatrick }
272e5dd7070Spatrick 
skipTextToken()273*12c85518Srobert const char *Lexer::skipTextToken() {
274*12c85518Srobert   const char *TokenPtr = BufferPtr;
275*12c85518Srobert   assert(TokenPtr < CommentEnd);
276*12c85518Srobert   StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<" : "\n\r";
277*12c85518Srobert 
278*12c85518Srobert again:
279*12c85518Srobert   size_t End =
280*12c85518Srobert       StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(TokStartSymbols);
281*12c85518Srobert   if (End == StringRef::npos)
282*12c85518Srobert     return CommentEnd;
283*12c85518Srobert 
284*12c85518Srobert   // Doxygen doesn't recognize any commands in a one-line double quotation.
285*12c85518Srobert   // If we don't find an ending quotation mark, we pretend it never began.
286*12c85518Srobert   if (*(TokenPtr + End) == '\"') {
287*12c85518Srobert     TokenPtr += End + 1;
288*12c85518Srobert     End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r\"");
289*12c85518Srobert     if (End != StringRef::npos && *(TokenPtr + End) == '\"')
290*12c85518Srobert       TokenPtr += End + 1;
291*12c85518Srobert     goto again;
292*12c85518Srobert   }
293*12c85518Srobert   return TokenPtr + End;
294*12c85518Srobert }
295*12c85518Srobert 
lexCommentText(Token & T)296e5dd7070Spatrick void Lexer::lexCommentText(Token &T) {
297e5dd7070Spatrick   assert(CommentState == LCS_InsideBCPLComment ||
298e5dd7070Spatrick          CommentState == LCS_InsideCComment);
299e5dd7070Spatrick 
300e5dd7070Spatrick   // Handles lexing non-command text, i.e. text and newline.
301e5dd7070Spatrick   auto HandleNonCommandToken = [&]() -> void {
302e5dd7070Spatrick     assert(State == LS_Normal);
303e5dd7070Spatrick 
304e5dd7070Spatrick     const char *TokenPtr = BufferPtr;
305e5dd7070Spatrick     assert(TokenPtr < CommentEnd);
306e5dd7070Spatrick     switch (*TokenPtr) {
307e5dd7070Spatrick       case '\n':
308e5dd7070Spatrick       case '\r':
309e5dd7070Spatrick           TokenPtr = skipNewline(TokenPtr, CommentEnd);
310e5dd7070Spatrick           formTokenWithChars(T, TokenPtr, tok::newline);
311e5dd7070Spatrick 
312e5dd7070Spatrick           if (CommentState == LCS_InsideCComment)
313e5dd7070Spatrick             skipLineStartingDecorations();
314e5dd7070Spatrick           return;
315e5dd7070Spatrick 
316*12c85518Srobert       default:
317*12c85518Srobert         return formTextToken(T, skipTextToken());
318e5dd7070Spatrick     }
319e5dd7070Spatrick   };
320e5dd7070Spatrick 
321e5dd7070Spatrick   if (!ParseCommands)
322e5dd7070Spatrick     return HandleNonCommandToken();
323e5dd7070Spatrick 
324e5dd7070Spatrick   switch (State) {
325e5dd7070Spatrick   case LS_Normal:
326e5dd7070Spatrick     break;
327e5dd7070Spatrick   case LS_VerbatimBlockFirstLine:
328e5dd7070Spatrick     lexVerbatimBlockFirstLine(T);
329e5dd7070Spatrick     return;
330e5dd7070Spatrick   case LS_VerbatimBlockBody:
331e5dd7070Spatrick     lexVerbatimBlockBody(T);
332e5dd7070Spatrick     return;
333e5dd7070Spatrick   case LS_VerbatimLineText:
334e5dd7070Spatrick     lexVerbatimLineText(T);
335e5dd7070Spatrick     return;
336e5dd7070Spatrick   case LS_HTMLStartTag:
337e5dd7070Spatrick     lexHTMLStartTag(T);
338e5dd7070Spatrick     return;
339e5dd7070Spatrick   case LS_HTMLEndTag:
340e5dd7070Spatrick     lexHTMLEndTag(T);
341e5dd7070Spatrick     return;
342e5dd7070Spatrick   }
343e5dd7070Spatrick 
344e5dd7070Spatrick   assert(State == LS_Normal);
345e5dd7070Spatrick   const char *TokenPtr = BufferPtr;
346e5dd7070Spatrick   assert(TokenPtr < CommentEnd);
347e5dd7070Spatrick   switch(*TokenPtr) {
348e5dd7070Spatrick     case '\\':
349e5dd7070Spatrick     case '@': {
350e5dd7070Spatrick       // Commands that start with a backslash and commands that start with
351e5dd7070Spatrick       // 'at' have equivalent semantics.  But we keep information about the
352e5dd7070Spatrick       // exact syntax in AST for comments.
353e5dd7070Spatrick       tok::TokenKind CommandKind =
354e5dd7070Spatrick           (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
355e5dd7070Spatrick       TokenPtr++;
356e5dd7070Spatrick       if (TokenPtr == CommentEnd) {
357e5dd7070Spatrick         formTextToken(T, TokenPtr);
358e5dd7070Spatrick         return;
359e5dd7070Spatrick       }
360e5dd7070Spatrick       char C = *TokenPtr;
361e5dd7070Spatrick       switch (C) {
362e5dd7070Spatrick       default:
363e5dd7070Spatrick         break;
364e5dd7070Spatrick 
365e5dd7070Spatrick       case '\\': case '@': case '&': case '$':
366e5dd7070Spatrick       case '#':  case '<': case '>': case '%':
367e5dd7070Spatrick       case '\"': case '.': case ':':
368e5dd7070Spatrick         // This is one of \\ \@ \& \$ etc escape sequences.
369e5dd7070Spatrick         TokenPtr++;
370e5dd7070Spatrick         if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
371e5dd7070Spatrick           // This is the \:: escape sequence.
372e5dd7070Spatrick           TokenPtr++;
373e5dd7070Spatrick         }
374e5dd7070Spatrick         StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
375e5dd7070Spatrick         formTokenWithChars(T, TokenPtr, tok::text);
376e5dd7070Spatrick         T.setText(UnescapedText);
377e5dd7070Spatrick         return;
378e5dd7070Spatrick       }
379e5dd7070Spatrick 
380e5dd7070Spatrick       // Don't make zero-length commands.
381e5dd7070Spatrick       if (!isCommandNameStartCharacter(*TokenPtr)) {
382e5dd7070Spatrick         formTextToken(T, TokenPtr);
383e5dd7070Spatrick         return;
384e5dd7070Spatrick       }
385e5dd7070Spatrick 
386e5dd7070Spatrick       TokenPtr = skipCommandName(TokenPtr, CommentEnd);
387e5dd7070Spatrick       unsigned Length = TokenPtr - (BufferPtr + 1);
388e5dd7070Spatrick 
389e5dd7070Spatrick       // Hardcoded support for lexing LaTeX formula commands
390*12c85518Srobert       // \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
391e5dd7070Spatrick       if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
392e5dd7070Spatrick         C = *TokenPtr;
393*12c85518Srobert         if (C == '$' || C == '(' || C == ')' || C == '[' || C == ']' ||
394*12c85518Srobert             C == '{' || C == '}') {
395e5dd7070Spatrick           TokenPtr++;
396e5dd7070Spatrick           Length++;
397e5dd7070Spatrick         }
398e5dd7070Spatrick       }
399e5dd7070Spatrick 
400e5dd7070Spatrick       StringRef CommandName(BufferPtr + 1, Length);
401e5dd7070Spatrick 
402e5dd7070Spatrick       const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
403e5dd7070Spatrick       if (!Info) {
404e5dd7070Spatrick         if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
405e5dd7070Spatrick           StringRef CorrectedName = Info->Name;
406e5dd7070Spatrick           SourceLocation Loc = getSourceLocation(BufferPtr);
407e5dd7070Spatrick           SourceLocation EndLoc = getSourceLocation(TokenPtr);
408e5dd7070Spatrick           SourceRange FullRange = SourceRange(Loc, EndLoc);
409e5dd7070Spatrick           SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
410e5dd7070Spatrick           Diag(Loc, diag::warn_correct_comment_command_name)
411e5dd7070Spatrick             << FullRange << CommandName << CorrectedName
412e5dd7070Spatrick             << FixItHint::CreateReplacement(CommandRange, CorrectedName);
413e5dd7070Spatrick         } else {
414e5dd7070Spatrick           formTokenWithChars(T, TokenPtr, tok::unknown_command);
415e5dd7070Spatrick           T.setUnknownCommandName(CommandName);
416e5dd7070Spatrick           Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
417e5dd7070Spatrick               << SourceRange(T.getLocation(), T.getEndLocation());
418e5dd7070Spatrick           return;
419e5dd7070Spatrick         }
420e5dd7070Spatrick       }
421e5dd7070Spatrick       if (Info->IsVerbatimBlockCommand) {
422e5dd7070Spatrick         setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
423e5dd7070Spatrick         return;
424e5dd7070Spatrick       }
425e5dd7070Spatrick       if (Info->IsVerbatimLineCommand) {
426e5dd7070Spatrick         setupAndLexVerbatimLine(T, TokenPtr, Info);
427e5dd7070Spatrick         return;
428e5dd7070Spatrick       }
429e5dd7070Spatrick       formTokenWithChars(T, TokenPtr, CommandKind);
430e5dd7070Spatrick       T.setCommandID(Info->getID());
431e5dd7070Spatrick       return;
432e5dd7070Spatrick     }
433e5dd7070Spatrick 
434e5dd7070Spatrick     case '&':
435e5dd7070Spatrick       lexHTMLCharacterReference(T);
436e5dd7070Spatrick       return;
437e5dd7070Spatrick 
438e5dd7070Spatrick     case '<': {
439e5dd7070Spatrick       TokenPtr++;
440e5dd7070Spatrick       if (TokenPtr == CommentEnd) {
441e5dd7070Spatrick         formTextToken(T, TokenPtr);
442e5dd7070Spatrick         return;
443e5dd7070Spatrick       }
444e5dd7070Spatrick       const char C = *TokenPtr;
445e5dd7070Spatrick       if (isHTMLIdentifierStartingCharacter(C))
446e5dd7070Spatrick         setupAndLexHTMLStartTag(T);
447e5dd7070Spatrick       else if (C == '/')
448e5dd7070Spatrick         setupAndLexHTMLEndTag(T);
449e5dd7070Spatrick       else
450e5dd7070Spatrick         formTextToken(T, TokenPtr);
451e5dd7070Spatrick       return;
452e5dd7070Spatrick     }
453e5dd7070Spatrick 
454e5dd7070Spatrick     default:
455e5dd7070Spatrick       return HandleNonCommandToken();
456e5dd7070Spatrick   }
457e5dd7070Spatrick }
458e5dd7070Spatrick 
setupAndLexVerbatimBlock(Token & T,const char * TextBegin,char Marker,const CommandInfo * Info)459e5dd7070Spatrick void Lexer::setupAndLexVerbatimBlock(Token &T,
460e5dd7070Spatrick                                      const char *TextBegin,
461e5dd7070Spatrick                                      char Marker, const CommandInfo *Info) {
462e5dd7070Spatrick   assert(Info->IsVerbatimBlockCommand);
463e5dd7070Spatrick 
464e5dd7070Spatrick   VerbatimBlockEndCommandName.clear();
465e5dd7070Spatrick   VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
466e5dd7070Spatrick   VerbatimBlockEndCommandName.append(Info->EndCommandName);
467e5dd7070Spatrick 
468e5dd7070Spatrick   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
469e5dd7070Spatrick   T.setVerbatimBlockID(Info->getID());
470e5dd7070Spatrick 
471e5dd7070Spatrick   // If there is a newline following the verbatim opening command, skip the
472e5dd7070Spatrick   // newline so that we don't create an tok::verbatim_block_line with empty
473e5dd7070Spatrick   // text content.
474e5dd7070Spatrick   if (BufferPtr != CommentEnd &&
475e5dd7070Spatrick       isVerticalWhitespace(*BufferPtr)) {
476e5dd7070Spatrick     BufferPtr = skipNewline(BufferPtr, CommentEnd);
477e5dd7070Spatrick     State = LS_VerbatimBlockBody;
478e5dd7070Spatrick     return;
479e5dd7070Spatrick   }
480e5dd7070Spatrick 
481e5dd7070Spatrick   State = LS_VerbatimBlockFirstLine;
482e5dd7070Spatrick }
483e5dd7070Spatrick 
lexVerbatimBlockFirstLine(Token & T)484e5dd7070Spatrick void Lexer::lexVerbatimBlockFirstLine(Token &T) {
485e5dd7070Spatrick again:
486e5dd7070Spatrick   assert(BufferPtr < CommentEnd);
487e5dd7070Spatrick 
488e5dd7070Spatrick   // FIXME: It would be better to scan the text once, finding either the block
489e5dd7070Spatrick   // end command or newline.
490e5dd7070Spatrick   //
491e5dd7070Spatrick   // Extract current line.
492e5dd7070Spatrick   const char *Newline = findNewline(BufferPtr, CommentEnd);
493e5dd7070Spatrick   StringRef Line(BufferPtr, Newline - BufferPtr);
494e5dd7070Spatrick 
495e5dd7070Spatrick   // Look for end command in current line.
496e5dd7070Spatrick   size_t Pos = Line.find(VerbatimBlockEndCommandName);
497e5dd7070Spatrick   const char *TextEnd;
498e5dd7070Spatrick   const char *NextLine;
499e5dd7070Spatrick   if (Pos == StringRef::npos) {
500e5dd7070Spatrick     // Current line is completely verbatim.
501e5dd7070Spatrick     TextEnd = Newline;
502e5dd7070Spatrick     NextLine = skipNewline(Newline, CommentEnd);
503e5dd7070Spatrick   } else if (Pos == 0) {
504e5dd7070Spatrick     // Current line contains just an end command.
505e5dd7070Spatrick     const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
506e5dd7070Spatrick     StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
507e5dd7070Spatrick     formTokenWithChars(T, End, tok::verbatim_block_end);
508e5dd7070Spatrick     T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
509e5dd7070Spatrick     State = LS_Normal;
510e5dd7070Spatrick     return;
511e5dd7070Spatrick   } else {
512e5dd7070Spatrick     // There is some text, followed by end command.  Extract text first.
513e5dd7070Spatrick     TextEnd = BufferPtr + Pos;
514e5dd7070Spatrick     NextLine = TextEnd;
515e5dd7070Spatrick     // If there is only whitespace before end command, skip whitespace.
516e5dd7070Spatrick     if (isWhitespace(BufferPtr, TextEnd)) {
517e5dd7070Spatrick       BufferPtr = TextEnd;
518e5dd7070Spatrick       goto again;
519e5dd7070Spatrick     }
520e5dd7070Spatrick   }
521e5dd7070Spatrick 
522e5dd7070Spatrick   StringRef Text(BufferPtr, TextEnd - BufferPtr);
523e5dd7070Spatrick   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
524e5dd7070Spatrick   T.setVerbatimBlockText(Text);
525e5dd7070Spatrick 
526e5dd7070Spatrick   State = LS_VerbatimBlockBody;
527e5dd7070Spatrick }
528e5dd7070Spatrick 
lexVerbatimBlockBody(Token & T)529e5dd7070Spatrick void Lexer::lexVerbatimBlockBody(Token &T) {
530e5dd7070Spatrick   assert(State == LS_VerbatimBlockBody);
531e5dd7070Spatrick 
532e5dd7070Spatrick   if (CommentState == LCS_InsideCComment)
533e5dd7070Spatrick     skipLineStartingDecorations();
534e5dd7070Spatrick 
535e5dd7070Spatrick   if (BufferPtr == CommentEnd) {
536e5dd7070Spatrick     formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
537e5dd7070Spatrick     T.setVerbatimBlockText("");
538e5dd7070Spatrick     return;
539e5dd7070Spatrick   }
540e5dd7070Spatrick 
541e5dd7070Spatrick   lexVerbatimBlockFirstLine(T);
542e5dd7070Spatrick }
543e5dd7070Spatrick 
setupAndLexVerbatimLine(Token & T,const char * TextBegin,const CommandInfo * Info)544e5dd7070Spatrick void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
545e5dd7070Spatrick                                     const CommandInfo *Info) {
546e5dd7070Spatrick   assert(Info->IsVerbatimLineCommand);
547e5dd7070Spatrick   formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
548e5dd7070Spatrick   T.setVerbatimLineID(Info->getID());
549e5dd7070Spatrick 
550e5dd7070Spatrick   State = LS_VerbatimLineText;
551e5dd7070Spatrick }
552e5dd7070Spatrick 
lexVerbatimLineText(Token & T)553e5dd7070Spatrick void Lexer::lexVerbatimLineText(Token &T) {
554e5dd7070Spatrick   assert(State == LS_VerbatimLineText);
555e5dd7070Spatrick 
556e5dd7070Spatrick   // Extract current line.
557e5dd7070Spatrick   const char *Newline = findNewline(BufferPtr, CommentEnd);
558e5dd7070Spatrick   StringRef Text(BufferPtr, Newline - BufferPtr);
559e5dd7070Spatrick   formTokenWithChars(T, Newline, tok::verbatim_line_text);
560e5dd7070Spatrick   T.setVerbatimLineText(Text);
561e5dd7070Spatrick 
562e5dd7070Spatrick   State = LS_Normal;
563e5dd7070Spatrick }
564e5dd7070Spatrick 
lexHTMLCharacterReference(Token & T)565e5dd7070Spatrick void Lexer::lexHTMLCharacterReference(Token &T) {
566e5dd7070Spatrick   const char *TokenPtr = BufferPtr;
567e5dd7070Spatrick   assert(*TokenPtr == '&');
568e5dd7070Spatrick   TokenPtr++;
569e5dd7070Spatrick   if (TokenPtr == CommentEnd) {
570e5dd7070Spatrick     formTextToken(T, TokenPtr);
571e5dd7070Spatrick     return;
572e5dd7070Spatrick   }
573e5dd7070Spatrick   const char *NamePtr;
574e5dd7070Spatrick   bool isNamed = false;
575e5dd7070Spatrick   bool isDecimal = false;
576e5dd7070Spatrick   char C = *TokenPtr;
577e5dd7070Spatrick   if (isHTMLNamedCharacterReferenceCharacter(C)) {
578e5dd7070Spatrick     NamePtr = TokenPtr;
579e5dd7070Spatrick     TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
580e5dd7070Spatrick     isNamed = true;
581e5dd7070Spatrick   } else if (C == '#') {
582e5dd7070Spatrick     TokenPtr++;
583e5dd7070Spatrick     if (TokenPtr == CommentEnd) {
584e5dd7070Spatrick       formTextToken(T, TokenPtr);
585e5dd7070Spatrick       return;
586e5dd7070Spatrick     }
587e5dd7070Spatrick     C = *TokenPtr;
588e5dd7070Spatrick     if (isHTMLDecimalCharacterReferenceCharacter(C)) {
589e5dd7070Spatrick       NamePtr = TokenPtr;
590e5dd7070Spatrick       TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
591e5dd7070Spatrick       isDecimal = true;
592e5dd7070Spatrick     } else if (C == 'x' || C == 'X') {
593e5dd7070Spatrick       TokenPtr++;
594e5dd7070Spatrick       NamePtr = TokenPtr;
595e5dd7070Spatrick       TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
596e5dd7070Spatrick     } else {
597e5dd7070Spatrick       formTextToken(T, TokenPtr);
598e5dd7070Spatrick       return;
599e5dd7070Spatrick     }
600e5dd7070Spatrick   } else {
601e5dd7070Spatrick     formTextToken(T, TokenPtr);
602e5dd7070Spatrick     return;
603e5dd7070Spatrick   }
604e5dd7070Spatrick   if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
605e5dd7070Spatrick       *TokenPtr != ';') {
606e5dd7070Spatrick     formTextToken(T, TokenPtr);
607e5dd7070Spatrick     return;
608e5dd7070Spatrick   }
609e5dd7070Spatrick   StringRef Name(NamePtr, TokenPtr - NamePtr);
610e5dd7070Spatrick   TokenPtr++; // Skip semicolon.
611e5dd7070Spatrick   StringRef Resolved;
612e5dd7070Spatrick   if (isNamed)
613e5dd7070Spatrick     Resolved = resolveHTMLNamedCharacterReference(Name);
614e5dd7070Spatrick   else if (isDecimal)
615e5dd7070Spatrick     Resolved = resolveHTMLDecimalCharacterReference(Name);
616e5dd7070Spatrick   else
617e5dd7070Spatrick     Resolved = resolveHTMLHexCharacterReference(Name);
618e5dd7070Spatrick 
619e5dd7070Spatrick   if (Resolved.empty()) {
620e5dd7070Spatrick     formTextToken(T, TokenPtr);
621e5dd7070Spatrick     return;
622e5dd7070Spatrick   }
623e5dd7070Spatrick   formTokenWithChars(T, TokenPtr, tok::text);
624e5dd7070Spatrick   T.setText(Resolved);
625e5dd7070Spatrick }
626e5dd7070Spatrick 
setupAndLexHTMLStartTag(Token & T)627e5dd7070Spatrick void Lexer::setupAndLexHTMLStartTag(Token &T) {
628e5dd7070Spatrick   assert(BufferPtr[0] == '<' &&
629e5dd7070Spatrick          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
630e5dd7070Spatrick   const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
631e5dd7070Spatrick   StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
632e5dd7070Spatrick   if (!isHTMLTagName(Name)) {
633e5dd7070Spatrick     formTextToken(T, TagNameEnd);
634e5dd7070Spatrick     return;
635e5dd7070Spatrick   }
636e5dd7070Spatrick 
637e5dd7070Spatrick   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
638e5dd7070Spatrick   T.setHTMLTagStartName(Name);
639e5dd7070Spatrick 
640e5dd7070Spatrick   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
641e5dd7070Spatrick 
642e5dd7070Spatrick   const char C = *BufferPtr;
643e5dd7070Spatrick   if (BufferPtr != CommentEnd &&
644e5dd7070Spatrick       (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
645e5dd7070Spatrick     State = LS_HTMLStartTag;
646e5dd7070Spatrick }
647e5dd7070Spatrick 
lexHTMLStartTag(Token & T)648e5dd7070Spatrick void Lexer::lexHTMLStartTag(Token &T) {
649e5dd7070Spatrick   assert(State == LS_HTMLStartTag);
650e5dd7070Spatrick 
651e5dd7070Spatrick   const char *TokenPtr = BufferPtr;
652e5dd7070Spatrick   char C = *TokenPtr;
653e5dd7070Spatrick   if (isHTMLIdentifierCharacter(C)) {
654e5dd7070Spatrick     TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
655e5dd7070Spatrick     StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
656e5dd7070Spatrick     formTokenWithChars(T, TokenPtr, tok::html_ident);
657e5dd7070Spatrick     T.setHTMLIdent(Ident);
658e5dd7070Spatrick   } else {
659e5dd7070Spatrick     switch (C) {
660e5dd7070Spatrick     case '=':
661e5dd7070Spatrick       TokenPtr++;
662e5dd7070Spatrick       formTokenWithChars(T, TokenPtr, tok::html_equals);
663e5dd7070Spatrick       break;
664e5dd7070Spatrick     case '\"':
665e5dd7070Spatrick     case '\'': {
666e5dd7070Spatrick       const char *OpenQuote = TokenPtr;
667e5dd7070Spatrick       TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
668e5dd7070Spatrick       const char *ClosingQuote = TokenPtr;
669e5dd7070Spatrick       if (TokenPtr != CommentEnd) // Skip closing quote.
670e5dd7070Spatrick         TokenPtr++;
671e5dd7070Spatrick       formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
672e5dd7070Spatrick       T.setHTMLQuotedString(StringRef(OpenQuote + 1,
673e5dd7070Spatrick                                       ClosingQuote - (OpenQuote + 1)));
674e5dd7070Spatrick       break;
675e5dd7070Spatrick     }
676e5dd7070Spatrick     case '>':
677e5dd7070Spatrick       TokenPtr++;
678e5dd7070Spatrick       formTokenWithChars(T, TokenPtr, tok::html_greater);
679e5dd7070Spatrick       State = LS_Normal;
680e5dd7070Spatrick       return;
681e5dd7070Spatrick     case '/':
682e5dd7070Spatrick       TokenPtr++;
683e5dd7070Spatrick       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
684e5dd7070Spatrick         TokenPtr++;
685e5dd7070Spatrick         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
686e5dd7070Spatrick       } else
687e5dd7070Spatrick         formTextToken(T, TokenPtr);
688e5dd7070Spatrick 
689e5dd7070Spatrick       State = LS_Normal;
690e5dd7070Spatrick       return;
691e5dd7070Spatrick     }
692e5dd7070Spatrick   }
693e5dd7070Spatrick 
694e5dd7070Spatrick   // Now look ahead and return to normal state if we don't see any HTML tokens
695e5dd7070Spatrick   // ahead.
696e5dd7070Spatrick   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
697e5dd7070Spatrick   if (BufferPtr == CommentEnd) {
698e5dd7070Spatrick     State = LS_Normal;
699e5dd7070Spatrick     return;
700e5dd7070Spatrick   }
701e5dd7070Spatrick 
702e5dd7070Spatrick   C = *BufferPtr;
703e5dd7070Spatrick   if (!isHTMLIdentifierStartingCharacter(C) &&
704*12c85518Srobert       C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') {
705e5dd7070Spatrick     State = LS_Normal;
706e5dd7070Spatrick     return;
707e5dd7070Spatrick   }
708e5dd7070Spatrick }
709e5dd7070Spatrick 
setupAndLexHTMLEndTag(Token & T)710e5dd7070Spatrick void Lexer::setupAndLexHTMLEndTag(Token &T) {
711e5dd7070Spatrick   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
712e5dd7070Spatrick 
713e5dd7070Spatrick   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
714e5dd7070Spatrick   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
715e5dd7070Spatrick   StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
716e5dd7070Spatrick   if (!isHTMLTagName(Name)) {
717e5dd7070Spatrick     formTextToken(T, TagNameEnd);
718e5dd7070Spatrick     return;
719e5dd7070Spatrick   }
720e5dd7070Spatrick 
721e5dd7070Spatrick   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
722e5dd7070Spatrick 
723e5dd7070Spatrick   formTokenWithChars(T, End, tok::html_end_tag);
724e5dd7070Spatrick   T.setHTMLTagEndName(Name);
725e5dd7070Spatrick 
726e5dd7070Spatrick   if (BufferPtr != CommentEnd && *BufferPtr == '>')
727e5dd7070Spatrick     State = LS_HTMLEndTag;
728e5dd7070Spatrick }
729e5dd7070Spatrick 
lexHTMLEndTag(Token & T)730e5dd7070Spatrick void Lexer::lexHTMLEndTag(Token &T) {
731e5dd7070Spatrick   assert(BufferPtr != CommentEnd && *BufferPtr == '>');
732e5dd7070Spatrick 
733e5dd7070Spatrick   formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
734e5dd7070Spatrick   State = LS_Normal;
735e5dd7070Spatrick }
736e5dd7070Spatrick 
Lexer(llvm::BumpPtrAllocator & Allocator,DiagnosticsEngine & Diags,const CommandTraits & Traits,SourceLocation FileLoc,const char * BufferStart,const char * BufferEnd,bool ParseCommands)737e5dd7070Spatrick Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
738e5dd7070Spatrick              const CommandTraits &Traits, SourceLocation FileLoc,
739a9ac8606Spatrick              const char *BufferStart, const char *BufferEnd, bool ParseCommands)
740e5dd7070Spatrick     : Allocator(Allocator), Diags(Diags), Traits(Traits),
741a9ac8606Spatrick       BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
742a9ac8606Spatrick       FileLoc(FileLoc), ParseCommands(ParseCommands),
743a9ac8606Spatrick       CommentState(LCS_BeforeComment), State(LS_Normal) {}
744e5dd7070Spatrick 
lex(Token & T)745e5dd7070Spatrick void Lexer::lex(Token &T) {
746e5dd7070Spatrick again:
747e5dd7070Spatrick   switch (CommentState) {
748e5dd7070Spatrick   case LCS_BeforeComment:
749e5dd7070Spatrick     if (BufferPtr == BufferEnd) {
750e5dd7070Spatrick       formTokenWithChars(T, BufferPtr, tok::eof);
751e5dd7070Spatrick       return;
752e5dd7070Spatrick     }
753e5dd7070Spatrick 
754e5dd7070Spatrick     assert(*BufferPtr == '/');
755e5dd7070Spatrick     BufferPtr++; // Skip first slash.
756e5dd7070Spatrick     switch(*BufferPtr) {
757e5dd7070Spatrick     case '/': { // BCPL comment.
758e5dd7070Spatrick       BufferPtr++; // Skip second slash.
759e5dd7070Spatrick 
760e5dd7070Spatrick       if (BufferPtr != BufferEnd) {
761e5dd7070Spatrick         // Skip Doxygen magic marker, if it is present.
762e5dd7070Spatrick         // It might be missing because of a typo //< or /*<, or because we
763e5dd7070Spatrick         // merged this non-Doxygen comment into a bunch of Doxygen comments
764e5dd7070Spatrick         // around it: /** ... */ /* ... */ /** ... */
765e5dd7070Spatrick         const char C = *BufferPtr;
766e5dd7070Spatrick         if (C == '/' || C == '!')
767e5dd7070Spatrick           BufferPtr++;
768e5dd7070Spatrick       }
769e5dd7070Spatrick 
770e5dd7070Spatrick       // Skip less-than symbol that marks trailing comments.
771e5dd7070Spatrick       // Skip it even if the comment is not a Doxygen one, because //< and /*<
772e5dd7070Spatrick       // are frequent typos.
773e5dd7070Spatrick       if (BufferPtr != BufferEnd && *BufferPtr == '<')
774e5dd7070Spatrick         BufferPtr++;
775e5dd7070Spatrick 
776e5dd7070Spatrick       CommentState = LCS_InsideBCPLComment;
777e5dd7070Spatrick       if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
778e5dd7070Spatrick         State = LS_Normal;
779e5dd7070Spatrick       CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
780e5dd7070Spatrick       goto again;
781e5dd7070Spatrick     }
782e5dd7070Spatrick     case '*': { // C comment.
783e5dd7070Spatrick       BufferPtr++; // Skip star.
784e5dd7070Spatrick 
785e5dd7070Spatrick       // Skip Doxygen magic marker.
786e5dd7070Spatrick       const char C = *BufferPtr;
787e5dd7070Spatrick       if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
788e5dd7070Spatrick         BufferPtr++;
789e5dd7070Spatrick 
790e5dd7070Spatrick       // Skip less-than symbol that marks trailing comments.
791e5dd7070Spatrick       if (BufferPtr != BufferEnd && *BufferPtr == '<')
792e5dd7070Spatrick         BufferPtr++;
793e5dd7070Spatrick 
794e5dd7070Spatrick       CommentState = LCS_InsideCComment;
795e5dd7070Spatrick       State = LS_Normal;
796e5dd7070Spatrick       CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
797e5dd7070Spatrick       goto again;
798e5dd7070Spatrick     }
799e5dd7070Spatrick     default:
800e5dd7070Spatrick       llvm_unreachable("second character of comment should be '/' or '*'");
801e5dd7070Spatrick     }
802e5dd7070Spatrick 
803e5dd7070Spatrick   case LCS_BetweenComments: {
804e5dd7070Spatrick     // Consecutive comments are extracted only if there is only whitespace
805e5dd7070Spatrick     // between them.  So we can search for the start of the next comment.
806e5dd7070Spatrick     const char *EndWhitespace = BufferPtr;
807e5dd7070Spatrick     while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
808e5dd7070Spatrick       EndWhitespace++;
809e5dd7070Spatrick 
810e5dd7070Spatrick     // Turn any whitespace between comments (and there is only whitespace
811e5dd7070Spatrick     // between them -- guaranteed by comment extraction) into a newline.  We
812e5dd7070Spatrick     // have two newlines between C comments in total (first one was synthesized
813e5dd7070Spatrick     // after a comment).
814e5dd7070Spatrick     formTokenWithChars(T, EndWhitespace, tok::newline);
815e5dd7070Spatrick 
816e5dd7070Spatrick     CommentState = LCS_BeforeComment;
817e5dd7070Spatrick     break;
818e5dd7070Spatrick   }
819e5dd7070Spatrick 
820e5dd7070Spatrick   case LCS_InsideBCPLComment:
821e5dd7070Spatrick   case LCS_InsideCComment:
822e5dd7070Spatrick     if (BufferPtr != CommentEnd) {
823e5dd7070Spatrick       lexCommentText(T);
824e5dd7070Spatrick       break;
825e5dd7070Spatrick     } else {
826e5dd7070Spatrick       // Skip C comment closing sequence.
827e5dd7070Spatrick       if (CommentState == LCS_InsideCComment) {
828e5dd7070Spatrick         assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
829e5dd7070Spatrick         BufferPtr += 2;
830e5dd7070Spatrick         assert(BufferPtr <= BufferEnd);
831e5dd7070Spatrick 
832e5dd7070Spatrick         // Synthenize newline just after the C comment, regardless if there is
833e5dd7070Spatrick         // actually a newline.
834e5dd7070Spatrick         formTokenWithChars(T, BufferPtr, tok::newline);
835e5dd7070Spatrick 
836e5dd7070Spatrick         CommentState = LCS_BetweenComments;
837e5dd7070Spatrick         break;
838e5dd7070Spatrick       } else {
839e5dd7070Spatrick         // Don't synthesized a newline after BCPL comment.
840e5dd7070Spatrick         CommentState = LCS_BetweenComments;
841e5dd7070Spatrick         goto again;
842e5dd7070Spatrick       }
843e5dd7070Spatrick     }
844e5dd7070Spatrick   }
845e5dd7070Spatrick }
846e5dd7070Spatrick 
getSpelling(const Token & Tok,const SourceManager & SourceMgr) const847e5dd7070Spatrick StringRef Lexer::getSpelling(const Token &Tok,
848e5dd7070Spatrick                              const SourceManager &SourceMgr) const {
849e5dd7070Spatrick   SourceLocation Loc = Tok.getLocation();
850e5dd7070Spatrick   std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
851e5dd7070Spatrick 
852e5dd7070Spatrick   bool InvalidTemp = false;
853e5dd7070Spatrick   StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
854e5dd7070Spatrick   if (InvalidTemp)
855e5dd7070Spatrick     return StringRef();
856e5dd7070Spatrick 
857e5dd7070Spatrick   const char *Begin = File.data() + LocInfo.second;
858e5dd7070Spatrick   return StringRef(Begin, Tok.getLength());
859e5dd7070Spatrick }
860e5dd7070Spatrick 
861e5dd7070Spatrick } // end namespace comments
862e5dd7070Spatrick } // end namespace clang
863