xref: /freebsd-src/contrib/llvm-project/clang/lib/AST/CommentLexer.cpp (revision bdd1243df58e60e85101c09001d9812a789b6bc4)
10b57cec5SDimitry Andric //===--- CommentLexer.cpp -------------------------------------------------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric 
90b57cec5SDimitry Andric #include "clang/AST/CommentLexer.h"
100b57cec5SDimitry Andric #include "clang/AST/CommentCommandTraits.h"
110b57cec5SDimitry Andric #include "clang/AST/CommentDiagnostic.h"
120b57cec5SDimitry Andric #include "clang/Basic/CharInfo.h"
130b57cec5SDimitry Andric #include "llvm/ADT/StringExtras.h"
140b57cec5SDimitry Andric #include "llvm/ADT/StringSwitch.h"
150b57cec5SDimitry Andric #include "llvm/Support/ConvertUTF.h"
160b57cec5SDimitry Andric #include "llvm/Support/ErrorHandling.h"
170b57cec5SDimitry Andric 
180b57cec5SDimitry Andric namespace clang {
190b57cec5SDimitry Andric namespace comments {
200b57cec5SDimitry Andric 
dump(const Lexer & L,const SourceManager & SM) const210b57cec5SDimitry Andric void Token::dump(const Lexer &L, const SourceManager &SM) const {
220b57cec5SDimitry Andric   llvm::errs() << "comments::Token Kind=" << Kind << " ";
230b57cec5SDimitry Andric   Loc.print(llvm::errs(), SM);
240b57cec5SDimitry Andric   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
250b57cec5SDimitry Andric }
260b57cec5SDimitry Andric 
isHTMLNamedCharacterReferenceCharacter(char C)270b57cec5SDimitry Andric static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
280b57cec5SDimitry Andric   return isLetter(C);
290b57cec5SDimitry Andric }
300b57cec5SDimitry Andric 
isHTMLDecimalCharacterReferenceCharacter(char C)310b57cec5SDimitry Andric static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
320b57cec5SDimitry Andric   return isDigit(C);
330b57cec5SDimitry Andric }
340b57cec5SDimitry Andric 
isHTMLHexCharacterReferenceCharacter(char C)350b57cec5SDimitry Andric static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
360b57cec5SDimitry Andric   return isHexDigit(C);
370b57cec5SDimitry Andric }
380b57cec5SDimitry Andric 
convertCodePointToUTF8(llvm::BumpPtrAllocator & Allocator,unsigned CodePoint)390b57cec5SDimitry Andric static inline StringRef convertCodePointToUTF8(
400b57cec5SDimitry Andric                                       llvm::BumpPtrAllocator &Allocator,
410b57cec5SDimitry Andric                                       unsigned CodePoint) {
420b57cec5SDimitry Andric   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
430b57cec5SDimitry Andric   char *ResolvedPtr = Resolved;
440b57cec5SDimitry Andric   if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
450b57cec5SDimitry Andric     return StringRef(Resolved, ResolvedPtr - Resolved);
460b57cec5SDimitry Andric   else
470b57cec5SDimitry Andric     return StringRef();
480b57cec5SDimitry Andric }
490b57cec5SDimitry Andric 
500b57cec5SDimitry Andric namespace {
510b57cec5SDimitry Andric 
520b57cec5SDimitry Andric #include "clang/AST/CommentHTMLTags.inc"
530b57cec5SDimitry Andric #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
540b57cec5SDimitry Andric 
550b57cec5SDimitry Andric } // end anonymous namespace
560b57cec5SDimitry Andric 
resolveHTMLNamedCharacterReference(StringRef Name) const570b57cec5SDimitry Andric StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
580b57cec5SDimitry Andric   // Fast path, first check a few most widely used named character references.
590b57cec5SDimitry Andric   return llvm::StringSwitch<StringRef>(Name)
600b57cec5SDimitry Andric       .Case("amp", "&")
610b57cec5SDimitry Andric       .Case("lt", "<")
620b57cec5SDimitry Andric       .Case("gt", ">")
630b57cec5SDimitry Andric       .Case("quot", "\"")
640b57cec5SDimitry Andric       .Case("apos", "\'")
650b57cec5SDimitry Andric       // Slow path.
660b57cec5SDimitry Andric       .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
670b57cec5SDimitry Andric }
680b57cec5SDimitry Andric 
resolveHTMLDecimalCharacterReference(StringRef Name) const690b57cec5SDimitry Andric StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
700b57cec5SDimitry Andric   unsigned CodePoint = 0;
710b57cec5SDimitry Andric   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
720b57cec5SDimitry Andric     assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
730b57cec5SDimitry Andric     CodePoint *= 10;
740b57cec5SDimitry Andric     CodePoint += Name[i] - '0';
750b57cec5SDimitry Andric   }
760b57cec5SDimitry Andric   return convertCodePointToUTF8(Allocator, CodePoint);
770b57cec5SDimitry Andric }
780b57cec5SDimitry Andric 
resolveHTMLHexCharacterReference(StringRef Name) const790b57cec5SDimitry Andric StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
800b57cec5SDimitry Andric   unsigned CodePoint = 0;
810b57cec5SDimitry Andric   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
820b57cec5SDimitry Andric     CodePoint *= 16;
830b57cec5SDimitry Andric     const char C = Name[i];
840b57cec5SDimitry Andric     assert(isHTMLHexCharacterReferenceCharacter(C));
850b57cec5SDimitry Andric     CodePoint += llvm::hexDigitValue(C);
860b57cec5SDimitry Andric   }
870b57cec5SDimitry Andric   return convertCodePointToUTF8(Allocator, CodePoint);
880b57cec5SDimitry Andric }
890b57cec5SDimitry Andric 
skipLineStartingDecorations()900b57cec5SDimitry Andric void Lexer::skipLineStartingDecorations() {
910b57cec5SDimitry Andric   // This function should be called only for C comments
920b57cec5SDimitry Andric   assert(CommentState == LCS_InsideCComment);
930b57cec5SDimitry Andric 
940b57cec5SDimitry Andric   if (BufferPtr == CommentEnd)
950b57cec5SDimitry Andric     return;
960b57cec5SDimitry Andric 
970b57cec5SDimitry Andric   const char *NewBufferPtr = BufferPtr;
9804eeddc0SDimitry Andric   while (isHorizontalWhitespace(*NewBufferPtr))
9904eeddc0SDimitry Andric     if (++NewBufferPtr == CommentEnd)
1000b57cec5SDimitry Andric       return;
10104eeddc0SDimitry Andric   if (*NewBufferPtr == '*')
1020b57cec5SDimitry Andric     BufferPtr = NewBufferPtr + 1;
1030b57cec5SDimitry Andric }
1040b57cec5SDimitry Andric 
1050b57cec5SDimitry Andric namespace {
1060b57cec5SDimitry Andric /// Returns pointer to the first newline character in the string.
findNewline(const char * BufferPtr,const char * BufferEnd)1070b57cec5SDimitry Andric const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
1080b57cec5SDimitry Andric   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
1090b57cec5SDimitry Andric     if (isVerticalWhitespace(*BufferPtr))
1100b57cec5SDimitry Andric       return BufferPtr;
1110b57cec5SDimitry Andric   }
1120b57cec5SDimitry Andric   return BufferEnd;
1130b57cec5SDimitry Andric }
1140b57cec5SDimitry Andric 
/// Steps over a single line terminator ("\n", "\r" or "\r\n") starting at
/// \p BufferPtr.
///
/// \returns pointer to the character after the terminator; \p BufferPtr
/// itself when the range is empty.  A non-empty range must start with '\n' or
/// '\r'.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  if (*BufferPtr == '\n')
    return BufferPtr + 1;

  assert(*BufferPtr == '\r');
  const char *AfterCR = BufferPtr + 1;
  // A line feed directly after the carriage return belongs to the same
  // terminator.
  const bool HasLF = AfterCR != BufferEnd && *AfterCR == '\n';
  return HasLF ? AfterCR + 1 : AfterCR;
}
1290b57cec5SDimitry Andric 
skipNamedCharacterReference(const char * BufferPtr,const char * BufferEnd)1300b57cec5SDimitry Andric const char *skipNamedCharacterReference(const char *BufferPtr,
1310b57cec5SDimitry Andric                                         const char *BufferEnd) {
1320b57cec5SDimitry Andric   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
1330b57cec5SDimitry Andric     if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
1340b57cec5SDimitry Andric       return BufferPtr;
1350b57cec5SDimitry Andric   }
1360b57cec5SDimitry Andric   return BufferEnd;
1370b57cec5SDimitry Andric }
1380b57cec5SDimitry Andric 
skipDecimalCharacterReference(const char * BufferPtr,const char * BufferEnd)1390b57cec5SDimitry Andric const char *skipDecimalCharacterReference(const char *BufferPtr,
1400b57cec5SDimitry Andric                                           const char *BufferEnd) {
1410b57cec5SDimitry Andric   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
1420b57cec5SDimitry Andric     if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
1430b57cec5SDimitry Andric       return BufferPtr;
1440b57cec5SDimitry Andric   }
1450b57cec5SDimitry Andric   return BufferEnd;
1460b57cec5SDimitry Andric }
1470b57cec5SDimitry Andric 
skipHexCharacterReference(const char * BufferPtr,const char * BufferEnd)1480b57cec5SDimitry Andric const char *skipHexCharacterReference(const char *BufferPtr,
1490b57cec5SDimitry Andric                                       const char *BufferEnd) {
1500b57cec5SDimitry Andric   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
1510b57cec5SDimitry Andric     if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
1520b57cec5SDimitry Andric       return BufferPtr;
1530b57cec5SDimitry Andric   }
1540b57cec5SDimitry Andric   return BufferEnd;
1550b57cec5SDimitry Andric }
1560b57cec5SDimitry Andric 
isHTMLIdentifierStartingCharacter(char C)1570b57cec5SDimitry Andric bool isHTMLIdentifierStartingCharacter(char C) {
1580b57cec5SDimitry Andric   return isLetter(C);
1590b57cec5SDimitry Andric }
1600b57cec5SDimitry Andric 
isHTMLIdentifierCharacter(char C)1610b57cec5SDimitry Andric bool isHTMLIdentifierCharacter(char C) {
1620b57cec5SDimitry Andric   return isAlphanumeric(C);
1630b57cec5SDimitry Andric }
1640b57cec5SDimitry Andric 
skipHTMLIdentifier(const char * BufferPtr,const char * BufferEnd)1650b57cec5SDimitry Andric const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
1660b57cec5SDimitry Andric   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
1670b57cec5SDimitry Andric     if (!isHTMLIdentifierCharacter(*BufferPtr))
1680b57cec5SDimitry Andric       return BufferPtr;
1690b57cec5SDimitry Andric   }
1700b57cec5SDimitry Andric   return BufferEnd;
1710b57cec5SDimitry Andric }
1720b57cec5SDimitry Andric 
/// Skip HTML string quoted in single or double quotes.  A quote character
/// that directly follows a backslash does not terminate the string.
///
/// \p BufferPtr must point at the opening quote.
///
/// Returns pointer to closing quote, or BufferEnd if the string is not
/// terminated within the buffer.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cursor = BufferPtr + 1;
  while (Cursor != BufferEnd) {
    const bool IsQuote = *Cursor == Quote;
    // Cursor[-1] is always inside the buffer: the scan starts one past the
    // opening quote.
    if (IsQuote && Cursor[-1] != '\\')
      return Cursor;
    ++Cursor;
  }
  return BufferEnd;
}
1900b57cec5SDimitry Andric 
skipWhitespace(const char * BufferPtr,const char * BufferEnd)1910b57cec5SDimitry Andric const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
1920b57cec5SDimitry Andric   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
1930b57cec5SDimitry Andric     if (!isWhitespace(*BufferPtr))
1940b57cec5SDimitry Andric       return BufferPtr;
1950b57cec5SDimitry Andric   }
1960b57cec5SDimitry Andric   return BufferEnd;
1970b57cec5SDimitry Andric }
1980b57cec5SDimitry Andric 
isWhitespace(const char * BufferPtr,const char * BufferEnd)1990b57cec5SDimitry Andric bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
2000b57cec5SDimitry Andric   return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
2010b57cec5SDimitry Andric }
2020b57cec5SDimitry Andric 
isCommandNameStartCharacter(char C)2030b57cec5SDimitry Andric bool isCommandNameStartCharacter(char C) {
2040b57cec5SDimitry Andric   return isLetter(C);
2050b57cec5SDimitry Andric }
2060b57cec5SDimitry Andric 
isCommandNameCharacter(char C)2070b57cec5SDimitry Andric bool isCommandNameCharacter(char C) {
2080b57cec5SDimitry Andric   return isAlphanumeric(C);
2090b57cec5SDimitry Andric }
2100b57cec5SDimitry Andric 
skipCommandName(const char * BufferPtr,const char * BufferEnd)2110b57cec5SDimitry Andric const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
2120b57cec5SDimitry Andric   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
2130b57cec5SDimitry Andric     if (!isCommandNameCharacter(*BufferPtr))
2140b57cec5SDimitry Andric       return BufferPtr;
2150b57cec5SDimitry Andric   }
2160b57cec5SDimitry Andric   return BufferEnd;
2170b57cec5SDimitry Andric }
2180b57cec5SDimitry Andric 
2190b57cec5SDimitry Andric /// Return the one past end pointer for BCPL comments.
2200b57cec5SDimitry Andric /// Handles newlines escaped with backslash or trigraph for backslahs.
findBCPLCommentEnd(const char * BufferPtr,const char * BufferEnd)2210b57cec5SDimitry Andric const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
2220b57cec5SDimitry Andric   const char *CurPtr = BufferPtr;
2230b57cec5SDimitry Andric   while (CurPtr != BufferEnd) {
2240b57cec5SDimitry Andric     while (!isVerticalWhitespace(*CurPtr)) {
2250b57cec5SDimitry Andric       CurPtr++;
2260b57cec5SDimitry Andric       if (CurPtr == BufferEnd)
2270b57cec5SDimitry Andric         return BufferEnd;
2280b57cec5SDimitry Andric     }
2290b57cec5SDimitry Andric     // We found a newline, check if it is escaped.
2300b57cec5SDimitry Andric     const char *EscapePtr = CurPtr - 1;
2310b57cec5SDimitry Andric     while(isHorizontalWhitespace(*EscapePtr))
2320b57cec5SDimitry Andric       EscapePtr--;
2330b57cec5SDimitry Andric 
2340b57cec5SDimitry Andric     if (*EscapePtr == '\\' ||
2350b57cec5SDimitry Andric         (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
2360b57cec5SDimitry Andric          EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
2370b57cec5SDimitry Andric       // We found an escaped newline.
2380b57cec5SDimitry Andric       CurPtr = skipNewline(CurPtr, BufferEnd);
2390b57cec5SDimitry Andric     } else
2400b57cec5SDimitry Andric       return CurPtr; // Not an escaped newline.
2410b57cec5SDimitry Andric   }
2420b57cec5SDimitry Andric   return BufferEnd;
2430b57cec5SDimitry Andric }
2440b57cec5SDimitry Andric 
2450b57cec5SDimitry Andric /// Return the one past end pointer for C comments.
2460b57cec5SDimitry Andric /// Very dumb, does not handle escaped newlines or trigraphs.
findCCommentEnd(const char * BufferPtr,const char * BufferEnd)2470b57cec5SDimitry Andric const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
2480b57cec5SDimitry Andric   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
2490b57cec5SDimitry Andric     if (*BufferPtr == '*') {
2500b57cec5SDimitry Andric       assert(BufferPtr + 1 != BufferEnd);
2510b57cec5SDimitry Andric       if (*(BufferPtr + 1) == '/')
2520b57cec5SDimitry Andric         return BufferPtr;
2530b57cec5SDimitry Andric     }
2540b57cec5SDimitry Andric   }
2550b57cec5SDimitry Andric   llvm_unreachable("buffer end hit before '*/' was seen");
2560b57cec5SDimitry Andric }
2570b57cec5SDimitry Andric 
2580b57cec5SDimitry Andric } // end anonymous namespace
2590b57cec5SDimitry Andric 
formTokenWithChars(Token & Result,const char * TokEnd,tok::TokenKind Kind)2600b57cec5SDimitry Andric void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
2610b57cec5SDimitry Andric                                tok::TokenKind Kind) {
2620b57cec5SDimitry Andric   const unsigned TokLen = TokEnd - BufferPtr;
2630b57cec5SDimitry Andric   Result.setLocation(getSourceLocation(BufferPtr));
2640b57cec5SDimitry Andric   Result.setKind(Kind);
2650b57cec5SDimitry Andric   Result.setLength(TokLen);
2660b57cec5SDimitry Andric #ifndef NDEBUG
2670b57cec5SDimitry Andric   Result.TextPtr = "<UNSET>";
2680b57cec5SDimitry Andric   Result.IntVal = 7;
2690b57cec5SDimitry Andric #endif
2700b57cec5SDimitry Andric   BufferPtr = TokEnd;
2710b57cec5SDimitry Andric }
2720b57cec5SDimitry Andric 
skipTextToken()27304eeddc0SDimitry Andric const char *Lexer::skipTextToken() {
27404eeddc0SDimitry Andric   const char *TokenPtr = BufferPtr;
27504eeddc0SDimitry Andric   assert(TokenPtr < CommentEnd);
27604eeddc0SDimitry Andric   StringRef TokStartSymbols = ParseCommands ? "\n\r\\@\"&<" : "\n\r";
27704eeddc0SDimitry Andric 
27804eeddc0SDimitry Andric again:
27904eeddc0SDimitry Andric   size_t End =
28004eeddc0SDimitry Andric       StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of(TokStartSymbols);
28104eeddc0SDimitry Andric   if (End == StringRef::npos)
28204eeddc0SDimitry Andric     return CommentEnd;
28304eeddc0SDimitry Andric 
28404eeddc0SDimitry Andric   // Doxygen doesn't recognize any commands in a one-line double quotation.
28504eeddc0SDimitry Andric   // If we don't find an ending quotation mark, we pretend it never began.
28604eeddc0SDimitry Andric   if (*(TokenPtr + End) == '\"') {
28704eeddc0SDimitry Andric     TokenPtr += End + 1;
28804eeddc0SDimitry Andric     End = StringRef(TokenPtr, CommentEnd - TokenPtr).find_first_of("\n\r\"");
28904eeddc0SDimitry Andric     if (End != StringRef::npos && *(TokenPtr + End) == '\"')
29004eeddc0SDimitry Andric       TokenPtr += End + 1;
29104eeddc0SDimitry Andric     goto again;
29204eeddc0SDimitry Andric   }
29304eeddc0SDimitry Andric   return TokenPtr + End;
29404eeddc0SDimitry Andric }
29504eeddc0SDimitry Andric 
lexCommentText(Token & T)2960b57cec5SDimitry Andric void Lexer::lexCommentText(Token &T) {
2970b57cec5SDimitry Andric   assert(CommentState == LCS_InsideBCPLComment ||
2980b57cec5SDimitry Andric          CommentState == LCS_InsideCComment);
2990b57cec5SDimitry Andric 
3000b57cec5SDimitry Andric   // Handles lexing non-command text, i.e. text and newline.
3010b57cec5SDimitry Andric   auto HandleNonCommandToken = [&]() -> void {
3020b57cec5SDimitry Andric     assert(State == LS_Normal);
3030b57cec5SDimitry Andric 
3040b57cec5SDimitry Andric     const char *TokenPtr = BufferPtr;
3050b57cec5SDimitry Andric     assert(TokenPtr < CommentEnd);
3060b57cec5SDimitry Andric     switch (*TokenPtr) {
3070b57cec5SDimitry Andric       case '\n':
3080b57cec5SDimitry Andric       case '\r':
3090b57cec5SDimitry Andric           TokenPtr = skipNewline(TokenPtr, CommentEnd);
3100b57cec5SDimitry Andric           formTokenWithChars(T, TokenPtr, tok::newline);
3110b57cec5SDimitry Andric 
3120b57cec5SDimitry Andric           if (CommentState == LCS_InsideCComment)
3130b57cec5SDimitry Andric             skipLineStartingDecorations();
3140b57cec5SDimitry Andric           return;
3150b57cec5SDimitry Andric 
31604eeddc0SDimitry Andric       default:
31704eeddc0SDimitry Andric         return formTextToken(T, skipTextToken());
3180b57cec5SDimitry Andric     }
3190b57cec5SDimitry Andric   };
3200b57cec5SDimitry Andric 
3210b57cec5SDimitry Andric   if (!ParseCommands)
3220b57cec5SDimitry Andric     return HandleNonCommandToken();
3230b57cec5SDimitry Andric 
3240b57cec5SDimitry Andric   switch (State) {
3250b57cec5SDimitry Andric   case LS_Normal:
3260b57cec5SDimitry Andric     break;
3270b57cec5SDimitry Andric   case LS_VerbatimBlockFirstLine:
3280b57cec5SDimitry Andric     lexVerbatimBlockFirstLine(T);
3290b57cec5SDimitry Andric     return;
3300b57cec5SDimitry Andric   case LS_VerbatimBlockBody:
3310b57cec5SDimitry Andric     lexVerbatimBlockBody(T);
3320b57cec5SDimitry Andric     return;
3330b57cec5SDimitry Andric   case LS_VerbatimLineText:
3340b57cec5SDimitry Andric     lexVerbatimLineText(T);
3350b57cec5SDimitry Andric     return;
3360b57cec5SDimitry Andric   case LS_HTMLStartTag:
3370b57cec5SDimitry Andric     lexHTMLStartTag(T);
3380b57cec5SDimitry Andric     return;
3390b57cec5SDimitry Andric   case LS_HTMLEndTag:
3400b57cec5SDimitry Andric     lexHTMLEndTag(T);
3410b57cec5SDimitry Andric     return;
3420b57cec5SDimitry Andric   }
3430b57cec5SDimitry Andric 
3440b57cec5SDimitry Andric   assert(State == LS_Normal);
3450b57cec5SDimitry Andric   const char *TokenPtr = BufferPtr;
3460b57cec5SDimitry Andric   assert(TokenPtr < CommentEnd);
3470b57cec5SDimitry Andric   switch(*TokenPtr) {
3480b57cec5SDimitry Andric     case '\\':
3490b57cec5SDimitry Andric     case '@': {
3500b57cec5SDimitry Andric       // Commands that start with a backslash and commands that start with
3510b57cec5SDimitry Andric       // 'at' have equivalent semantics.  But we keep information about the
3520b57cec5SDimitry Andric       // exact syntax in AST for comments.
3530b57cec5SDimitry Andric       tok::TokenKind CommandKind =
3540b57cec5SDimitry Andric           (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
3550b57cec5SDimitry Andric       TokenPtr++;
3560b57cec5SDimitry Andric       if (TokenPtr == CommentEnd) {
3570b57cec5SDimitry Andric         formTextToken(T, TokenPtr);
3580b57cec5SDimitry Andric         return;
3590b57cec5SDimitry Andric       }
3600b57cec5SDimitry Andric       char C = *TokenPtr;
3610b57cec5SDimitry Andric       switch (C) {
3620b57cec5SDimitry Andric       default:
3630b57cec5SDimitry Andric         break;
3640b57cec5SDimitry Andric 
3650b57cec5SDimitry Andric       case '\\': case '@': case '&': case '$':
3660b57cec5SDimitry Andric       case '#':  case '<': case '>': case '%':
3670b57cec5SDimitry Andric       case '\"': case '.': case ':':
3680b57cec5SDimitry Andric         // This is one of \\ \@ \& \$ etc escape sequences.
3690b57cec5SDimitry Andric         TokenPtr++;
3700b57cec5SDimitry Andric         if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
3710b57cec5SDimitry Andric           // This is the \:: escape sequence.
3720b57cec5SDimitry Andric           TokenPtr++;
3730b57cec5SDimitry Andric         }
3740b57cec5SDimitry Andric         StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
3750b57cec5SDimitry Andric         formTokenWithChars(T, TokenPtr, tok::text);
3760b57cec5SDimitry Andric         T.setText(UnescapedText);
3770b57cec5SDimitry Andric         return;
3780b57cec5SDimitry Andric       }
3790b57cec5SDimitry Andric 
3800b57cec5SDimitry Andric       // Don't make zero-length commands.
3810b57cec5SDimitry Andric       if (!isCommandNameStartCharacter(*TokenPtr)) {
3820b57cec5SDimitry Andric         formTextToken(T, TokenPtr);
3830b57cec5SDimitry Andric         return;
3840b57cec5SDimitry Andric       }
3850b57cec5SDimitry Andric 
3860b57cec5SDimitry Andric       TokenPtr = skipCommandName(TokenPtr, CommentEnd);
3870b57cec5SDimitry Andric       unsigned Length = TokenPtr - (BufferPtr + 1);
3880b57cec5SDimitry Andric 
3890b57cec5SDimitry Andric       // Hardcoded support for lexing LaTeX formula commands
390349cc55cSDimitry Andric       // \f$ \f( \f) \f[ \f] \f{ \f} as a single command.
3910b57cec5SDimitry Andric       if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
3920b57cec5SDimitry Andric         C = *TokenPtr;
393349cc55cSDimitry Andric         if (C == '$' || C == '(' || C == ')' || C == '[' || C == ']' ||
394349cc55cSDimitry Andric             C == '{' || C == '}') {
3950b57cec5SDimitry Andric           TokenPtr++;
3960b57cec5SDimitry Andric           Length++;
3970b57cec5SDimitry Andric         }
3980b57cec5SDimitry Andric       }
3990b57cec5SDimitry Andric 
4000b57cec5SDimitry Andric       StringRef CommandName(BufferPtr + 1, Length);
4010b57cec5SDimitry Andric 
4020b57cec5SDimitry Andric       const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
4030b57cec5SDimitry Andric       if (!Info) {
4040b57cec5SDimitry Andric         if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
4050b57cec5SDimitry Andric           StringRef CorrectedName = Info->Name;
4060b57cec5SDimitry Andric           SourceLocation Loc = getSourceLocation(BufferPtr);
4070b57cec5SDimitry Andric           SourceLocation EndLoc = getSourceLocation(TokenPtr);
4080b57cec5SDimitry Andric           SourceRange FullRange = SourceRange(Loc, EndLoc);
4090b57cec5SDimitry Andric           SourceRange CommandRange(Loc.getLocWithOffset(1), EndLoc);
4100b57cec5SDimitry Andric           Diag(Loc, diag::warn_correct_comment_command_name)
4110b57cec5SDimitry Andric             << FullRange << CommandName << CorrectedName
4120b57cec5SDimitry Andric             << FixItHint::CreateReplacement(CommandRange, CorrectedName);
4130b57cec5SDimitry Andric         } else {
4140b57cec5SDimitry Andric           formTokenWithChars(T, TokenPtr, tok::unknown_command);
4150b57cec5SDimitry Andric           T.setUnknownCommandName(CommandName);
4160b57cec5SDimitry Andric           Diag(T.getLocation(), diag::warn_unknown_comment_command_name)
4170b57cec5SDimitry Andric               << SourceRange(T.getLocation(), T.getEndLocation());
4180b57cec5SDimitry Andric           return;
4190b57cec5SDimitry Andric         }
4200b57cec5SDimitry Andric       }
4210b57cec5SDimitry Andric       if (Info->IsVerbatimBlockCommand) {
4220b57cec5SDimitry Andric         setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
4230b57cec5SDimitry Andric         return;
4240b57cec5SDimitry Andric       }
4250b57cec5SDimitry Andric       if (Info->IsVerbatimLineCommand) {
4260b57cec5SDimitry Andric         setupAndLexVerbatimLine(T, TokenPtr, Info);
4270b57cec5SDimitry Andric         return;
4280b57cec5SDimitry Andric       }
4290b57cec5SDimitry Andric       formTokenWithChars(T, TokenPtr, CommandKind);
4300b57cec5SDimitry Andric       T.setCommandID(Info->getID());
4310b57cec5SDimitry Andric       return;
4320b57cec5SDimitry Andric     }
4330b57cec5SDimitry Andric 
4340b57cec5SDimitry Andric     case '&':
4350b57cec5SDimitry Andric       lexHTMLCharacterReference(T);
4360b57cec5SDimitry Andric       return;
4370b57cec5SDimitry Andric 
4380b57cec5SDimitry Andric     case '<': {
4390b57cec5SDimitry Andric       TokenPtr++;
4400b57cec5SDimitry Andric       if (TokenPtr == CommentEnd) {
4410b57cec5SDimitry Andric         formTextToken(T, TokenPtr);
4420b57cec5SDimitry Andric         return;
4430b57cec5SDimitry Andric       }
4440b57cec5SDimitry Andric       const char C = *TokenPtr;
4450b57cec5SDimitry Andric       if (isHTMLIdentifierStartingCharacter(C))
4460b57cec5SDimitry Andric         setupAndLexHTMLStartTag(T);
4470b57cec5SDimitry Andric       else if (C == '/')
4480b57cec5SDimitry Andric         setupAndLexHTMLEndTag(T);
4490b57cec5SDimitry Andric       else
4500b57cec5SDimitry Andric         formTextToken(T, TokenPtr);
4510b57cec5SDimitry Andric       return;
4520b57cec5SDimitry Andric     }
4530b57cec5SDimitry Andric 
4540b57cec5SDimitry Andric     default:
4550b57cec5SDimitry Andric       return HandleNonCommandToken();
4560b57cec5SDimitry Andric   }
4570b57cec5SDimitry Andric }
4580b57cec5SDimitry Andric 
setupAndLexVerbatimBlock(Token & T,const char * TextBegin,char Marker,const CommandInfo * Info)4590b57cec5SDimitry Andric void Lexer::setupAndLexVerbatimBlock(Token &T,
4600b57cec5SDimitry Andric                                      const char *TextBegin,
4610b57cec5SDimitry Andric                                      char Marker, const CommandInfo *Info) {
4620b57cec5SDimitry Andric   assert(Info->IsVerbatimBlockCommand);
4630b57cec5SDimitry Andric 
4640b57cec5SDimitry Andric   VerbatimBlockEndCommandName.clear();
4650b57cec5SDimitry Andric   VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
4660b57cec5SDimitry Andric   VerbatimBlockEndCommandName.append(Info->EndCommandName);
4670b57cec5SDimitry Andric 
4680b57cec5SDimitry Andric   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
4690b57cec5SDimitry Andric   T.setVerbatimBlockID(Info->getID());
4700b57cec5SDimitry Andric 
4710b57cec5SDimitry Andric   // If there is a newline following the verbatim opening command, skip the
4720b57cec5SDimitry Andric   // newline so that we don't create an tok::verbatim_block_line with empty
4730b57cec5SDimitry Andric   // text content.
4740b57cec5SDimitry Andric   if (BufferPtr != CommentEnd &&
4750b57cec5SDimitry Andric       isVerticalWhitespace(*BufferPtr)) {
4760b57cec5SDimitry Andric     BufferPtr = skipNewline(BufferPtr, CommentEnd);
4770b57cec5SDimitry Andric     State = LS_VerbatimBlockBody;
4780b57cec5SDimitry Andric     return;
4790b57cec5SDimitry Andric   }
4800b57cec5SDimitry Andric 
4810b57cec5SDimitry Andric   State = LS_VerbatimBlockFirstLine;
4820b57cec5SDimitry Andric }
4830b57cec5SDimitry Andric 
lexVerbatimBlockFirstLine(Token & T)4840b57cec5SDimitry Andric void Lexer::lexVerbatimBlockFirstLine(Token &T) {
4850b57cec5SDimitry Andric again:
4860b57cec5SDimitry Andric   assert(BufferPtr < CommentEnd);
4870b57cec5SDimitry Andric 
4880b57cec5SDimitry Andric   // FIXME: It would be better to scan the text once, finding either the block
4890b57cec5SDimitry Andric   // end command or newline.
4900b57cec5SDimitry Andric   //
4910b57cec5SDimitry Andric   // Extract current line.
4920b57cec5SDimitry Andric   const char *Newline = findNewline(BufferPtr, CommentEnd);
4930b57cec5SDimitry Andric   StringRef Line(BufferPtr, Newline - BufferPtr);
4940b57cec5SDimitry Andric 
4950b57cec5SDimitry Andric   // Look for end command in current line.
4960b57cec5SDimitry Andric   size_t Pos = Line.find(VerbatimBlockEndCommandName);
4970b57cec5SDimitry Andric   const char *TextEnd;
4980b57cec5SDimitry Andric   const char *NextLine;
4990b57cec5SDimitry Andric   if (Pos == StringRef::npos) {
5000b57cec5SDimitry Andric     // Current line is completely verbatim.
5010b57cec5SDimitry Andric     TextEnd = Newline;
5020b57cec5SDimitry Andric     NextLine = skipNewline(Newline, CommentEnd);
5030b57cec5SDimitry Andric   } else if (Pos == 0) {
5040b57cec5SDimitry Andric     // Current line contains just an end command.
5050b57cec5SDimitry Andric     const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
5060b57cec5SDimitry Andric     StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
5070b57cec5SDimitry Andric     formTokenWithChars(T, End, tok::verbatim_block_end);
5080b57cec5SDimitry Andric     T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
5090b57cec5SDimitry Andric     State = LS_Normal;
5100b57cec5SDimitry Andric     return;
5110b57cec5SDimitry Andric   } else {
5120b57cec5SDimitry Andric     // There is some text, followed by end command.  Extract text first.
5130b57cec5SDimitry Andric     TextEnd = BufferPtr + Pos;
5140b57cec5SDimitry Andric     NextLine = TextEnd;
5150b57cec5SDimitry Andric     // If there is only whitespace before end command, skip whitespace.
5160b57cec5SDimitry Andric     if (isWhitespace(BufferPtr, TextEnd)) {
5170b57cec5SDimitry Andric       BufferPtr = TextEnd;
5180b57cec5SDimitry Andric       goto again;
5190b57cec5SDimitry Andric     }
5200b57cec5SDimitry Andric   }
5210b57cec5SDimitry Andric 
5220b57cec5SDimitry Andric   StringRef Text(BufferPtr, TextEnd - BufferPtr);
5230b57cec5SDimitry Andric   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
5240b57cec5SDimitry Andric   T.setVerbatimBlockText(Text);
5250b57cec5SDimitry Andric 
5260b57cec5SDimitry Andric   State = LS_VerbatimBlockBody;
5270b57cec5SDimitry Andric }
5280b57cec5SDimitry Andric 
lexVerbatimBlockBody(Token & T)5290b57cec5SDimitry Andric void Lexer::lexVerbatimBlockBody(Token &T) {
5300b57cec5SDimitry Andric   assert(State == LS_VerbatimBlockBody);
5310b57cec5SDimitry Andric 
5320b57cec5SDimitry Andric   if (CommentState == LCS_InsideCComment)
5330b57cec5SDimitry Andric     skipLineStartingDecorations();
5340b57cec5SDimitry Andric 
5350b57cec5SDimitry Andric   if (BufferPtr == CommentEnd) {
5360b57cec5SDimitry Andric     formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
5370b57cec5SDimitry Andric     T.setVerbatimBlockText("");
5380b57cec5SDimitry Andric     return;
5390b57cec5SDimitry Andric   }
5400b57cec5SDimitry Andric 
5410b57cec5SDimitry Andric   lexVerbatimBlockFirstLine(T);
5420b57cec5SDimitry Andric }
5430b57cec5SDimitry Andric 
setupAndLexVerbatimLine(Token & T,const char * TextBegin,const CommandInfo * Info)5440b57cec5SDimitry Andric void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
5450b57cec5SDimitry Andric                                     const CommandInfo *Info) {
5460b57cec5SDimitry Andric   assert(Info->IsVerbatimLineCommand);
5470b57cec5SDimitry Andric   formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
5480b57cec5SDimitry Andric   T.setVerbatimLineID(Info->getID());
5490b57cec5SDimitry Andric 
5500b57cec5SDimitry Andric   State = LS_VerbatimLineText;
5510b57cec5SDimitry Andric }
5520b57cec5SDimitry Andric 
lexVerbatimLineText(Token & T)5530b57cec5SDimitry Andric void Lexer::lexVerbatimLineText(Token &T) {
5540b57cec5SDimitry Andric   assert(State == LS_VerbatimLineText);
5550b57cec5SDimitry Andric 
5560b57cec5SDimitry Andric   // Extract current line.
5570b57cec5SDimitry Andric   const char *Newline = findNewline(BufferPtr, CommentEnd);
5580b57cec5SDimitry Andric   StringRef Text(BufferPtr, Newline - BufferPtr);
5590b57cec5SDimitry Andric   formTokenWithChars(T, Newline, tok::verbatim_line_text);
5600b57cec5SDimitry Andric   T.setVerbatimLineText(Text);
5610b57cec5SDimitry Andric 
5620b57cec5SDimitry Andric   State = LS_Normal;
5630b57cec5SDimitry Andric }
5640b57cec5SDimitry Andric 
lexHTMLCharacterReference(Token & T)5650b57cec5SDimitry Andric void Lexer::lexHTMLCharacterReference(Token &T) {
5660b57cec5SDimitry Andric   const char *TokenPtr = BufferPtr;
5670b57cec5SDimitry Andric   assert(*TokenPtr == '&');
5680b57cec5SDimitry Andric   TokenPtr++;
5690b57cec5SDimitry Andric   if (TokenPtr == CommentEnd) {
5700b57cec5SDimitry Andric     formTextToken(T, TokenPtr);
5710b57cec5SDimitry Andric     return;
5720b57cec5SDimitry Andric   }
5730b57cec5SDimitry Andric   const char *NamePtr;
5740b57cec5SDimitry Andric   bool isNamed = false;
5750b57cec5SDimitry Andric   bool isDecimal = false;
5760b57cec5SDimitry Andric   char C = *TokenPtr;
5770b57cec5SDimitry Andric   if (isHTMLNamedCharacterReferenceCharacter(C)) {
5780b57cec5SDimitry Andric     NamePtr = TokenPtr;
5790b57cec5SDimitry Andric     TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
5800b57cec5SDimitry Andric     isNamed = true;
5810b57cec5SDimitry Andric   } else if (C == '#') {
5820b57cec5SDimitry Andric     TokenPtr++;
5830b57cec5SDimitry Andric     if (TokenPtr == CommentEnd) {
5840b57cec5SDimitry Andric       formTextToken(T, TokenPtr);
5850b57cec5SDimitry Andric       return;
5860b57cec5SDimitry Andric     }
5870b57cec5SDimitry Andric     C = *TokenPtr;
5880b57cec5SDimitry Andric     if (isHTMLDecimalCharacterReferenceCharacter(C)) {
5890b57cec5SDimitry Andric       NamePtr = TokenPtr;
5900b57cec5SDimitry Andric       TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
5910b57cec5SDimitry Andric       isDecimal = true;
5920b57cec5SDimitry Andric     } else if (C == 'x' || C == 'X') {
5930b57cec5SDimitry Andric       TokenPtr++;
5940b57cec5SDimitry Andric       NamePtr = TokenPtr;
5950b57cec5SDimitry Andric       TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
5960b57cec5SDimitry Andric     } else {
5970b57cec5SDimitry Andric       formTextToken(T, TokenPtr);
5980b57cec5SDimitry Andric       return;
5990b57cec5SDimitry Andric     }
6000b57cec5SDimitry Andric   } else {
6010b57cec5SDimitry Andric     formTextToken(T, TokenPtr);
6020b57cec5SDimitry Andric     return;
6030b57cec5SDimitry Andric   }
6040b57cec5SDimitry Andric   if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
6050b57cec5SDimitry Andric       *TokenPtr != ';') {
6060b57cec5SDimitry Andric     formTextToken(T, TokenPtr);
6070b57cec5SDimitry Andric     return;
6080b57cec5SDimitry Andric   }
6090b57cec5SDimitry Andric   StringRef Name(NamePtr, TokenPtr - NamePtr);
6100b57cec5SDimitry Andric   TokenPtr++; // Skip semicolon.
6110b57cec5SDimitry Andric   StringRef Resolved;
6120b57cec5SDimitry Andric   if (isNamed)
6130b57cec5SDimitry Andric     Resolved = resolveHTMLNamedCharacterReference(Name);
6140b57cec5SDimitry Andric   else if (isDecimal)
6150b57cec5SDimitry Andric     Resolved = resolveHTMLDecimalCharacterReference(Name);
6160b57cec5SDimitry Andric   else
6170b57cec5SDimitry Andric     Resolved = resolveHTMLHexCharacterReference(Name);
6180b57cec5SDimitry Andric 
6190b57cec5SDimitry Andric   if (Resolved.empty()) {
6200b57cec5SDimitry Andric     formTextToken(T, TokenPtr);
6210b57cec5SDimitry Andric     return;
6220b57cec5SDimitry Andric   }
6230b57cec5SDimitry Andric   formTokenWithChars(T, TokenPtr, tok::text);
6240b57cec5SDimitry Andric   T.setText(Resolved);
6250b57cec5SDimitry Andric }
6260b57cec5SDimitry Andric 
setupAndLexHTMLStartTag(Token & T)6270b57cec5SDimitry Andric void Lexer::setupAndLexHTMLStartTag(Token &T) {
6280b57cec5SDimitry Andric   assert(BufferPtr[0] == '<' &&
6290b57cec5SDimitry Andric          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
6300b57cec5SDimitry Andric   const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
6310b57cec5SDimitry Andric   StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
6320b57cec5SDimitry Andric   if (!isHTMLTagName(Name)) {
6330b57cec5SDimitry Andric     formTextToken(T, TagNameEnd);
6340b57cec5SDimitry Andric     return;
6350b57cec5SDimitry Andric   }
6360b57cec5SDimitry Andric 
6370b57cec5SDimitry Andric   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
6380b57cec5SDimitry Andric   T.setHTMLTagStartName(Name);
6390b57cec5SDimitry Andric 
6400b57cec5SDimitry Andric   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
6410b57cec5SDimitry Andric 
6420b57cec5SDimitry Andric   const char C = *BufferPtr;
6430b57cec5SDimitry Andric   if (BufferPtr != CommentEnd &&
6440b57cec5SDimitry Andric       (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
6450b57cec5SDimitry Andric     State = LS_HTMLStartTag;
6460b57cec5SDimitry Andric }
6470b57cec5SDimitry Andric 
lexHTMLStartTag(Token & T)6480b57cec5SDimitry Andric void Lexer::lexHTMLStartTag(Token &T) {
6490b57cec5SDimitry Andric   assert(State == LS_HTMLStartTag);
6500b57cec5SDimitry Andric 
6510b57cec5SDimitry Andric   const char *TokenPtr = BufferPtr;
6520b57cec5SDimitry Andric   char C = *TokenPtr;
6530b57cec5SDimitry Andric   if (isHTMLIdentifierCharacter(C)) {
6540b57cec5SDimitry Andric     TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
6550b57cec5SDimitry Andric     StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
6560b57cec5SDimitry Andric     formTokenWithChars(T, TokenPtr, tok::html_ident);
6570b57cec5SDimitry Andric     T.setHTMLIdent(Ident);
6580b57cec5SDimitry Andric   } else {
6590b57cec5SDimitry Andric     switch (C) {
6600b57cec5SDimitry Andric     case '=':
6610b57cec5SDimitry Andric       TokenPtr++;
6620b57cec5SDimitry Andric       formTokenWithChars(T, TokenPtr, tok::html_equals);
6630b57cec5SDimitry Andric       break;
6640b57cec5SDimitry Andric     case '\"':
6650b57cec5SDimitry Andric     case '\'': {
6660b57cec5SDimitry Andric       const char *OpenQuote = TokenPtr;
6670b57cec5SDimitry Andric       TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
6680b57cec5SDimitry Andric       const char *ClosingQuote = TokenPtr;
6690b57cec5SDimitry Andric       if (TokenPtr != CommentEnd) // Skip closing quote.
6700b57cec5SDimitry Andric         TokenPtr++;
6710b57cec5SDimitry Andric       formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
6720b57cec5SDimitry Andric       T.setHTMLQuotedString(StringRef(OpenQuote + 1,
6730b57cec5SDimitry Andric                                       ClosingQuote - (OpenQuote + 1)));
6740b57cec5SDimitry Andric       break;
6750b57cec5SDimitry Andric     }
6760b57cec5SDimitry Andric     case '>':
6770b57cec5SDimitry Andric       TokenPtr++;
6780b57cec5SDimitry Andric       formTokenWithChars(T, TokenPtr, tok::html_greater);
6790b57cec5SDimitry Andric       State = LS_Normal;
6800b57cec5SDimitry Andric       return;
6810b57cec5SDimitry Andric     case '/':
6820b57cec5SDimitry Andric       TokenPtr++;
6830b57cec5SDimitry Andric       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
6840b57cec5SDimitry Andric         TokenPtr++;
6850b57cec5SDimitry Andric         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
6860b57cec5SDimitry Andric       } else
6870b57cec5SDimitry Andric         formTextToken(T, TokenPtr);
6880b57cec5SDimitry Andric 
6890b57cec5SDimitry Andric       State = LS_Normal;
6900b57cec5SDimitry Andric       return;
6910b57cec5SDimitry Andric     }
6920b57cec5SDimitry Andric   }
6930b57cec5SDimitry Andric 
6940b57cec5SDimitry Andric   // Now look ahead and return to normal state if we don't see any HTML tokens
6950b57cec5SDimitry Andric   // ahead.
6960b57cec5SDimitry Andric   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
6970b57cec5SDimitry Andric   if (BufferPtr == CommentEnd) {
6980b57cec5SDimitry Andric     State = LS_Normal;
6990b57cec5SDimitry Andric     return;
7000b57cec5SDimitry Andric   }
7010b57cec5SDimitry Andric 
7020b57cec5SDimitry Andric   C = *BufferPtr;
7030b57cec5SDimitry Andric   if (!isHTMLIdentifierStartingCharacter(C) &&
704*bdd1243dSDimitry Andric       C != '=' && C != '\"' && C != '\'' && C != '>' && C != '/') {
7050b57cec5SDimitry Andric     State = LS_Normal;
7060b57cec5SDimitry Andric     return;
7070b57cec5SDimitry Andric   }
7080b57cec5SDimitry Andric }
7090b57cec5SDimitry Andric 
setupAndLexHTMLEndTag(Token & T)7100b57cec5SDimitry Andric void Lexer::setupAndLexHTMLEndTag(Token &T) {
7110b57cec5SDimitry Andric   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
7120b57cec5SDimitry Andric 
7130b57cec5SDimitry Andric   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
7140b57cec5SDimitry Andric   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
7150b57cec5SDimitry Andric   StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
7160b57cec5SDimitry Andric   if (!isHTMLTagName(Name)) {
7170b57cec5SDimitry Andric     formTextToken(T, TagNameEnd);
7180b57cec5SDimitry Andric     return;
7190b57cec5SDimitry Andric   }
7200b57cec5SDimitry Andric 
7210b57cec5SDimitry Andric   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
7220b57cec5SDimitry Andric 
7230b57cec5SDimitry Andric   formTokenWithChars(T, End, tok::html_end_tag);
7240b57cec5SDimitry Andric   T.setHTMLTagEndName(Name);
7250b57cec5SDimitry Andric 
7260b57cec5SDimitry Andric   if (BufferPtr != CommentEnd && *BufferPtr == '>')
7270b57cec5SDimitry Andric     State = LS_HTMLEndTag;
7280b57cec5SDimitry Andric }
7290b57cec5SDimitry Andric 
lexHTMLEndTag(Token & T)7300b57cec5SDimitry Andric void Lexer::lexHTMLEndTag(Token &T) {
7310b57cec5SDimitry Andric   assert(BufferPtr != CommentEnd && *BufferPtr == '>');
7320b57cec5SDimitry Andric 
7330b57cec5SDimitry Andric   formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
7340b57cec5SDimitry Andric   State = LS_Normal;
7350b57cec5SDimitry Andric }
7360b57cec5SDimitry Andric 
Lexer(llvm::BumpPtrAllocator & Allocator,DiagnosticsEngine & Diags,const CommandTraits & Traits,SourceLocation FileLoc,const char * BufferStart,const char * BufferEnd,bool ParseCommands)7370b57cec5SDimitry Andric Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
7380b57cec5SDimitry Andric              const CommandTraits &Traits, SourceLocation FileLoc,
739e8d8bef9SDimitry Andric              const char *BufferStart, const char *BufferEnd, bool ParseCommands)
7400b57cec5SDimitry Andric     : Allocator(Allocator), Diags(Diags), Traits(Traits),
741e8d8bef9SDimitry Andric       BufferStart(BufferStart), BufferEnd(BufferEnd), BufferPtr(BufferStart),
742e8d8bef9SDimitry Andric       FileLoc(FileLoc), ParseCommands(ParseCommands),
743e8d8bef9SDimitry Andric       CommentState(LCS_BeforeComment), State(LS_Normal) {}
7440b57cec5SDimitry Andric 
lex(Token & T)7450b57cec5SDimitry Andric void Lexer::lex(Token &T) {
7460b57cec5SDimitry Andric again:
7470b57cec5SDimitry Andric   switch (CommentState) {
7480b57cec5SDimitry Andric   case LCS_BeforeComment:
7490b57cec5SDimitry Andric     if (BufferPtr == BufferEnd) {
7500b57cec5SDimitry Andric       formTokenWithChars(T, BufferPtr, tok::eof);
7510b57cec5SDimitry Andric       return;
7520b57cec5SDimitry Andric     }
7530b57cec5SDimitry Andric 
7540b57cec5SDimitry Andric     assert(*BufferPtr == '/');
7550b57cec5SDimitry Andric     BufferPtr++; // Skip first slash.
7560b57cec5SDimitry Andric     switch(*BufferPtr) {
7570b57cec5SDimitry Andric     case '/': { // BCPL comment.
7580b57cec5SDimitry Andric       BufferPtr++; // Skip second slash.
7590b57cec5SDimitry Andric 
7600b57cec5SDimitry Andric       if (BufferPtr != BufferEnd) {
7610b57cec5SDimitry Andric         // Skip Doxygen magic marker, if it is present.
7620b57cec5SDimitry Andric         // It might be missing because of a typo //< or /*<, or because we
7630b57cec5SDimitry Andric         // merged this non-Doxygen comment into a bunch of Doxygen comments
7640b57cec5SDimitry Andric         // around it: /** ... */ /* ... */ /** ... */
7650b57cec5SDimitry Andric         const char C = *BufferPtr;
7660b57cec5SDimitry Andric         if (C == '/' || C == '!')
7670b57cec5SDimitry Andric           BufferPtr++;
7680b57cec5SDimitry Andric       }
7690b57cec5SDimitry Andric 
7700b57cec5SDimitry Andric       // Skip less-than symbol that marks trailing comments.
7710b57cec5SDimitry Andric       // Skip it even if the comment is not a Doxygen one, because //< and /*<
7720b57cec5SDimitry Andric       // are frequent typos.
7730b57cec5SDimitry Andric       if (BufferPtr != BufferEnd && *BufferPtr == '<')
7740b57cec5SDimitry Andric         BufferPtr++;
7750b57cec5SDimitry Andric 
7760b57cec5SDimitry Andric       CommentState = LCS_InsideBCPLComment;
7770b57cec5SDimitry Andric       if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
7780b57cec5SDimitry Andric         State = LS_Normal;
7790b57cec5SDimitry Andric       CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
7800b57cec5SDimitry Andric       goto again;
7810b57cec5SDimitry Andric     }
7820b57cec5SDimitry Andric     case '*': { // C comment.
7830b57cec5SDimitry Andric       BufferPtr++; // Skip star.
7840b57cec5SDimitry Andric 
7850b57cec5SDimitry Andric       // Skip Doxygen magic marker.
7860b57cec5SDimitry Andric       const char C = *BufferPtr;
7870b57cec5SDimitry Andric       if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
7880b57cec5SDimitry Andric         BufferPtr++;
7890b57cec5SDimitry Andric 
7900b57cec5SDimitry Andric       // Skip less-than symbol that marks trailing comments.
7910b57cec5SDimitry Andric       if (BufferPtr != BufferEnd && *BufferPtr == '<')
7920b57cec5SDimitry Andric         BufferPtr++;
7930b57cec5SDimitry Andric 
7940b57cec5SDimitry Andric       CommentState = LCS_InsideCComment;
7950b57cec5SDimitry Andric       State = LS_Normal;
7960b57cec5SDimitry Andric       CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
7970b57cec5SDimitry Andric       goto again;
7980b57cec5SDimitry Andric     }
7990b57cec5SDimitry Andric     default:
8000b57cec5SDimitry Andric       llvm_unreachable("second character of comment should be '/' or '*'");
8010b57cec5SDimitry Andric     }
8020b57cec5SDimitry Andric 
8030b57cec5SDimitry Andric   case LCS_BetweenComments: {
8040b57cec5SDimitry Andric     // Consecutive comments are extracted only if there is only whitespace
8050b57cec5SDimitry Andric     // between them.  So we can search for the start of the next comment.
8060b57cec5SDimitry Andric     const char *EndWhitespace = BufferPtr;
8070b57cec5SDimitry Andric     while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
8080b57cec5SDimitry Andric       EndWhitespace++;
8090b57cec5SDimitry Andric 
8100b57cec5SDimitry Andric     // Turn any whitespace between comments (and there is only whitespace
8110b57cec5SDimitry Andric     // between them -- guaranteed by comment extraction) into a newline.  We
8120b57cec5SDimitry Andric     // have two newlines between C comments in total (first one was synthesized
8130b57cec5SDimitry Andric     // after a comment).
8140b57cec5SDimitry Andric     formTokenWithChars(T, EndWhitespace, tok::newline);
8150b57cec5SDimitry Andric 
8160b57cec5SDimitry Andric     CommentState = LCS_BeforeComment;
8170b57cec5SDimitry Andric     break;
8180b57cec5SDimitry Andric   }
8190b57cec5SDimitry Andric 
8200b57cec5SDimitry Andric   case LCS_InsideBCPLComment:
8210b57cec5SDimitry Andric   case LCS_InsideCComment:
8220b57cec5SDimitry Andric     if (BufferPtr != CommentEnd) {
8230b57cec5SDimitry Andric       lexCommentText(T);
8240b57cec5SDimitry Andric       break;
8250b57cec5SDimitry Andric     } else {
8260b57cec5SDimitry Andric       // Skip C comment closing sequence.
8270b57cec5SDimitry Andric       if (CommentState == LCS_InsideCComment) {
8280b57cec5SDimitry Andric         assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
8290b57cec5SDimitry Andric         BufferPtr += 2;
8300b57cec5SDimitry Andric         assert(BufferPtr <= BufferEnd);
8310b57cec5SDimitry Andric 
8320b57cec5SDimitry Andric         // Synthenize newline just after the C comment, regardless if there is
8330b57cec5SDimitry Andric         // actually a newline.
8340b57cec5SDimitry Andric         formTokenWithChars(T, BufferPtr, tok::newline);
8350b57cec5SDimitry Andric 
8360b57cec5SDimitry Andric         CommentState = LCS_BetweenComments;
8370b57cec5SDimitry Andric         break;
8380b57cec5SDimitry Andric       } else {
8390b57cec5SDimitry Andric         // Don't synthesized a newline after BCPL comment.
8400b57cec5SDimitry Andric         CommentState = LCS_BetweenComments;
8410b57cec5SDimitry Andric         goto again;
8420b57cec5SDimitry Andric       }
8430b57cec5SDimitry Andric     }
8440b57cec5SDimitry Andric   }
8450b57cec5SDimitry Andric }
8460b57cec5SDimitry Andric 
getSpelling(const Token & Tok,const SourceManager & SourceMgr) const8470b57cec5SDimitry Andric StringRef Lexer::getSpelling(const Token &Tok,
848a7dea167SDimitry Andric                              const SourceManager &SourceMgr) const {
8490b57cec5SDimitry Andric   SourceLocation Loc = Tok.getLocation();
8500b57cec5SDimitry Andric   std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
8510b57cec5SDimitry Andric 
8520b57cec5SDimitry Andric   bool InvalidTemp = false;
8530b57cec5SDimitry Andric   StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
854a7dea167SDimitry Andric   if (InvalidTemp)
8550b57cec5SDimitry Andric     return StringRef();
8560b57cec5SDimitry Andric 
8570b57cec5SDimitry Andric   const char *Begin = File.data() + LocInfo.second;
8580b57cec5SDimitry Andric   return StringRef(Begin, Tok.getLength());
8590b57cec5SDimitry Andric }
8600b57cec5SDimitry Andric 
8610b57cec5SDimitry Andric } // end namespace comments
8620b57cec5SDimitry Andric } // end namespace clang
863