1*f4a2713aSLionel Sambuc #include "clang/AST/CommentLexer.h" 2*f4a2713aSLionel Sambuc #include "clang/AST/CommentCommandTraits.h" 3*f4a2713aSLionel Sambuc #include "clang/AST/CommentDiagnostic.h" 4*f4a2713aSLionel Sambuc #include "clang/Basic/CharInfo.h" 5*f4a2713aSLionel Sambuc #include "llvm/ADT/StringExtras.h" 6*f4a2713aSLionel Sambuc #include "llvm/ADT/StringSwitch.h" 7*f4a2713aSLionel Sambuc #include "llvm/Support/ConvertUTF.h" 8*f4a2713aSLionel Sambuc #include "llvm/Support/ErrorHandling.h" 9*f4a2713aSLionel Sambuc 10*f4a2713aSLionel Sambuc namespace clang { 11*f4a2713aSLionel Sambuc namespace comments { 12*f4a2713aSLionel Sambuc 13*f4a2713aSLionel Sambuc void Token::dump(const Lexer &L, const SourceManager &SM) const { 14*f4a2713aSLionel Sambuc llvm::errs() << "comments::Token Kind=" << Kind << " "; 15*f4a2713aSLionel Sambuc Loc.dump(SM); 16*f4a2713aSLionel Sambuc llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n"; 17*f4a2713aSLionel Sambuc } 18*f4a2713aSLionel Sambuc 19*f4a2713aSLionel Sambuc static inline bool isHTMLNamedCharacterReferenceCharacter(char C) { 20*f4a2713aSLionel Sambuc return isLetter(C); 21*f4a2713aSLionel Sambuc } 22*f4a2713aSLionel Sambuc 23*f4a2713aSLionel Sambuc static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) { 24*f4a2713aSLionel Sambuc return isDigit(C); 25*f4a2713aSLionel Sambuc } 26*f4a2713aSLionel Sambuc 27*f4a2713aSLionel Sambuc static inline bool isHTMLHexCharacterReferenceCharacter(char C) { 28*f4a2713aSLionel Sambuc return isHexDigit(C); 29*f4a2713aSLionel Sambuc } 30*f4a2713aSLionel Sambuc 31*f4a2713aSLionel Sambuc static inline StringRef convertCodePointToUTF8( 32*f4a2713aSLionel Sambuc llvm::BumpPtrAllocator &Allocator, 33*f4a2713aSLionel Sambuc unsigned CodePoint) { 34*f4a2713aSLionel Sambuc char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); 35*f4a2713aSLionel Sambuc char *ResolvedPtr = Resolved; 36*f4a2713aSLionel Sambuc if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) 37*f4a2713aSLionel Sambuc return StringRef(Resolved, ResolvedPtr - Resolved); 38*f4a2713aSLionel Sambuc else 39*f4a2713aSLionel Sambuc return StringRef(); 40*f4a2713aSLionel Sambuc } 41*f4a2713aSLionel Sambuc 42*f4a2713aSLionel Sambuc namespace { 43*f4a2713aSLionel Sambuc 44*f4a2713aSLionel Sambuc #include "clang/AST/CommentHTMLTags.inc" 45*f4a2713aSLionel Sambuc #include "clang/AST/CommentHTMLNamedCharacterReferences.inc" 46*f4a2713aSLionel Sambuc 47*f4a2713aSLionel Sambuc } // unnamed namespace 48*f4a2713aSLionel Sambuc 49*f4a2713aSLionel Sambuc StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { 50*f4a2713aSLionel Sambuc // Fast path, first check a few most widely used named character references. 51*f4a2713aSLionel Sambuc return llvm::StringSwitch<StringRef>(Name) 52*f4a2713aSLionel Sambuc .Case("amp", "&") 53*f4a2713aSLionel Sambuc .Case("lt", "<") 54*f4a2713aSLionel Sambuc .Case("gt", ">") 55*f4a2713aSLionel Sambuc .Case("quot", "\"") 56*f4a2713aSLionel Sambuc .Case("apos", "\'") 57*f4a2713aSLionel Sambuc // Slow path. 58*f4a2713aSLionel Sambuc .Default(translateHTMLNamedCharacterReferenceToUTF8(Name)); 59*f4a2713aSLionel Sambuc } 60*f4a2713aSLionel Sambuc 61*f4a2713aSLionel Sambuc StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const { 62*f4a2713aSLionel Sambuc unsigned CodePoint = 0; 63*f4a2713aSLionel Sambuc for (unsigned i = 0, e = Name.size(); i != e; ++i) { 64*f4a2713aSLionel Sambuc assert(isHTMLDecimalCharacterReferenceCharacter(Name[i])); 65*f4a2713aSLionel Sambuc CodePoint *= 10; 66*f4a2713aSLionel Sambuc CodePoint += Name[i] - '0'; 67*f4a2713aSLionel Sambuc } 68*f4a2713aSLionel Sambuc return convertCodePointToUTF8(Allocator, CodePoint); 69*f4a2713aSLionel Sambuc } 70*f4a2713aSLionel Sambuc 71*f4a2713aSLionel Sambuc StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const { 72*f4a2713aSLionel Sambuc unsigned CodePoint = 0; 73*f4a2713aSLionel Sambuc for (unsigned i = 0, e = Name.size(); i != e; ++i) { 74*f4a2713aSLionel Sambuc CodePoint *= 16; 75*f4a2713aSLionel Sambuc const char C = Name[i]; 76*f4a2713aSLionel Sambuc assert(isHTMLHexCharacterReferenceCharacter(C)); 77*f4a2713aSLionel Sambuc CodePoint += llvm::hexDigitValue(C); 78*f4a2713aSLionel Sambuc } 79*f4a2713aSLionel Sambuc return convertCodePointToUTF8(Allocator, CodePoint); 80*f4a2713aSLionel Sambuc } 81*f4a2713aSLionel Sambuc 82*f4a2713aSLionel Sambuc void Lexer::skipLineStartingDecorations() { 83*f4a2713aSLionel Sambuc // This function should be called only for C comments 84*f4a2713aSLionel Sambuc assert(CommentState == LCS_InsideCComment); 85*f4a2713aSLionel Sambuc 86*f4a2713aSLionel Sambuc if (BufferPtr == CommentEnd) 87*f4a2713aSLionel Sambuc return; 88*f4a2713aSLionel Sambuc 89*f4a2713aSLionel Sambuc switch (*BufferPtr) { 90*f4a2713aSLionel Sambuc case ' ': 91*f4a2713aSLionel Sambuc case '\t': 92*f4a2713aSLionel Sambuc case '\f': 93*f4a2713aSLionel Sambuc case '\v': { 94*f4a2713aSLionel Sambuc const char *NewBufferPtr = BufferPtr; 95*f4a2713aSLionel Sambuc NewBufferPtr++; 96*f4a2713aSLionel Sambuc if (NewBufferPtr == CommentEnd) 97*f4a2713aSLionel Sambuc return; 98*f4a2713aSLionel Sambuc 99*f4a2713aSLionel Sambuc char C = *NewBufferPtr; 100*f4a2713aSLionel Sambuc while (isHorizontalWhitespace(C)) { 101*f4a2713aSLionel Sambuc NewBufferPtr++; 102*f4a2713aSLionel Sambuc if (NewBufferPtr == CommentEnd) 103*f4a2713aSLionel Sambuc return; 104*f4a2713aSLionel Sambuc C = *NewBufferPtr; 105*f4a2713aSLionel Sambuc } 106*f4a2713aSLionel Sambuc if (C == '*') 107*f4a2713aSLionel Sambuc BufferPtr = NewBufferPtr + 1; 108*f4a2713aSLionel Sambuc break; 109*f4a2713aSLionel Sambuc } 110*f4a2713aSLionel Sambuc case '*': 111*f4a2713aSLionel Sambuc BufferPtr++; 112*f4a2713aSLionel Sambuc break; 113*f4a2713aSLionel Sambuc } 114*f4a2713aSLionel Sambuc } 115*f4a2713aSLionel Sambuc 116*f4a2713aSLionel Sambuc namespace { 117*f4a2713aSLionel Sambuc /// Returns pointer to the first newline character in the string. 118*f4a2713aSLionel Sambuc const char *findNewline(const char *BufferPtr, const char *BufferEnd) { 119*f4a2713aSLionel Sambuc for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 120*f4a2713aSLionel Sambuc if (isVerticalWhitespace(*BufferPtr)) 121*f4a2713aSLionel Sambuc return BufferPtr; 122*f4a2713aSLionel Sambuc } 123*f4a2713aSLionel Sambuc return BufferEnd; 124*f4a2713aSLionel Sambuc } 125*f4a2713aSLionel Sambuc 126*f4a2713aSLionel Sambuc const char *skipNewline(const char *BufferPtr, const char *BufferEnd) { 127*f4a2713aSLionel Sambuc if (BufferPtr == BufferEnd) 128*f4a2713aSLionel Sambuc return BufferPtr; 129*f4a2713aSLionel Sambuc 130*f4a2713aSLionel Sambuc if (*BufferPtr == '\n') 131*f4a2713aSLionel Sambuc BufferPtr++; 132*f4a2713aSLionel Sambuc else { 133*f4a2713aSLionel Sambuc assert(*BufferPtr == '\r'); 134*f4a2713aSLionel Sambuc BufferPtr++; 135*f4a2713aSLionel Sambuc if (BufferPtr != BufferEnd && *BufferPtr == '\n') 136*f4a2713aSLionel Sambuc BufferPtr++; 137*f4a2713aSLionel Sambuc } 138*f4a2713aSLionel Sambuc return BufferPtr; 139*f4a2713aSLionel Sambuc } 140*f4a2713aSLionel Sambuc 141*f4a2713aSLionel Sambuc const char *skipNamedCharacterReference(const char *BufferPtr, 142*f4a2713aSLionel Sambuc const char *BufferEnd) { 143*f4a2713aSLionel Sambuc for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 144*f4a2713aSLionel Sambuc if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr)) 145*f4a2713aSLionel Sambuc return BufferPtr; 146*f4a2713aSLionel Sambuc } 147*f4a2713aSLionel Sambuc return BufferEnd; 148*f4a2713aSLionel Sambuc } 149*f4a2713aSLionel Sambuc 150*f4a2713aSLionel Sambuc const char *skipDecimalCharacterReference(const char *BufferPtr, 151*f4a2713aSLionel Sambuc const char *BufferEnd) { 152*f4a2713aSLionel Sambuc for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 153*f4a2713aSLionel Sambuc if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr)) 154*f4a2713aSLionel Sambuc return BufferPtr; 155*f4a2713aSLionel Sambuc } 156*f4a2713aSLionel Sambuc return BufferEnd; 157*f4a2713aSLionel Sambuc } 158*f4a2713aSLionel Sambuc 159*f4a2713aSLionel Sambuc const char *skipHexCharacterReference(const char *BufferPtr, 160*f4a2713aSLionel Sambuc const char *BufferEnd) { 161*f4a2713aSLionel Sambuc for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 162*f4a2713aSLionel Sambuc if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr)) 163*f4a2713aSLionel Sambuc return BufferPtr; 164*f4a2713aSLionel Sambuc } 165*f4a2713aSLionel Sambuc return BufferEnd; 166*f4a2713aSLionel Sambuc } 167*f4a2713aSLionel Sambuc 168*f4a2713aSLionel Sambuc bool isHTMLIdentifierStartingCharacter(char C) { 169*f4a2713aSLionel Sambuc return isLetter(C); 170*f4a2713aSLionel Sambuc } 171*f4a2713aSLionel Sambuc 172*f4a2713aSLionel Sambuc bool isHTMLIdentifierCharacter(char C) { 173*f4a2713aSLionel Sambuc return isAlphanumeric(C); 174*f4a2713aSLionel Sambuc } 175*f4a2713aSLionel Sambuc 176*f4a2713aSLionel Sambuc const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) { 177*f4a2713aSLionel Sambuc for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 178*f4a2713aSLionel Sambuc if (!isHTMLIdentifierCharacter(*BufferPtr)) 179*f4a2713aSLionel Sambuc return BufferPtr; 180*f4a2713aSLionel Sambuc } 181*f4a2713aSLionel Sambuc return BufferEnd; 182*f4a2713aSLionel Sambuc } 183*f4a2713aSLionel Sambuc 184*f4a2713aSLionel Sambuc /// Skip HTML string quoted in single or double quotes. Escaping quotes inside 185*f4a2713aSLionel Sambuc /// string allowed. 186*f4a2713aSLionel Sambuc /// 187*f4a2713aSLionel Sambuc /// Returns pointer to closing quote. 188*f4a2713aSLionel Sambuc const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd) 189*f4a2713aSLionel Sambuc { 190*f4a2713aSLionel Sambuc const char Quote = *BufferPtr; 191*f4a2713aSLionel Sambuc assert(Quote == '\"' || Quote == '\''); 192*f4a2713aSLionel Sambuc 193*f4a2713aSLionel Sambuc BufferPtr++; 194*f4a2713aSLionel Sambuc for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 195*f4a2713aSLionel Sambuc const char C = *BufferPtr; 196*f4a2713aSLionel Sambuc if (C == Quote && BufferPtr[-1] != '\\') 197*f4a2713aSLionel Sambuc return BufferPtr; 198*f4a2713aSLionel Sambuc } 199*f4a2713aSLionel Sambuc return BufferEnd; 200*f4a2713aSLionel Sambuc } 201*f4a2713aSLionel Sambuc 202*f4a2713aSLionel Sambuc const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) { 203*f4a2713aSLionel Sambuc for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 204*f4a2713aSLionel Sambuc if (!isWhitespace(*BufferPtr)) 205*f4a2713aSLionel Sambuc return BufferPtr; 206*f4a2713aSLionel Sambuc } 207*f4a2713aSLionel Sambuc return BufferEnd; 208*f4a2713aSLionel Sambuc } 209*f4a2713aSLionel Sambuc 210*f4a2713aSLionel Sambuc bool isWhitespace(const char *BufferPtr, const char *BufferEnd) { 211*f4a2713aSLionel Sambuc return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd; 212*f4a2713aSLionel Sambuc } 213*f4a2713aSLionel Sambuc 214*f4a2713aSLionel Sambuc bool isCommandNameStartCharacter(char C) { 215*f4a2713aSLionel Sambuc return isLetter(C); 216*f4a2713aSLionel Sambuc } 217*f4a2713aSLionel Sambuc 218*f4a2713aSLionel Sambuc bool isCommandNameCharacter(char C) { 219*f4a2713aSLionel Sambuc return isAlphanumeric(C); 220*f4a2713aSLionel Sambuc } 221*f4a2713aSLionel Sambuc 222*f4a2713aSLionel Sambuc const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) { 223*f4a2713aSLionel Sambuc for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 224*f4a2713aSLionel Sambuc if (!isCommandNameCharacter(*BufferPtr)) 225*f4a2713aSLionel Sambuc return BufferPtr; 226*f4a2713aSLionel Sambuc } 227*f4a2713aSLionel Sambuc return BufferEnd; 228*f4a2713aSLionel Sambuc } 229*f4a2713aSLionel Sambuc 230*f4a2713aSLionel Sambuc /// Return the one past end pointer for BCPL comments. 231*f4a2713aSLionel Sambuc /// Handles newlines escaped with backslash or trigraph for backslahs. 232*f4a2713aSLionel Sambuc const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) { 233*f4a2713aSLionel Sambuc const char *CurPtr = BufferPtr; 234*f4a2713aSLionel Sambuc while (CurPtr != BufferEnd) { 235*f4a2713aSLionel Sambuc while (!isVerticalWhitespace(*CurPtr)) { 236*f4a2713aSLionel Sambuc CurPtr++; 237*f4a2713aSLionel Sambuc if (CurPtr == BufferEnd) 238*f4a2713aSLionel Sambuc return BufferEnd; 239*f4a2713aSLionel Sambuc } 240*f4a2713aSLionel Sambuc // We found a newline, check if it is escaped. 241*f4a2713aSLionel Sambuc const char *EscapePtr = CurPtr - 1; 242*f4a2713aSLionel Sambuc while(isHorizontalWhitespace(*EscapePtr)) 243*f4a2713aSLionel Sambuc EscapePtr--; 244*f4a2713aSLionel Sambuc 245*f4a2713aSLionel Sambuc if (*EscapePtr == '\\' || 246*f4a2713aSLionel Sambuc (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' && 247*f4a2713aSLionel Sambuc EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) { 248*f4a2713aSLionel Sambuc // We found an escaped newline. 249*f4a2713aSLionel Sambuc CurPtr = skipNewline(CurPtr, BufferEnd); 250*f4a2713aSLionel Sambuc } else 251*f4a2713aSLionel Sambuc return CurPtr; // Not an escaped newline. 252*f4a2713aSLionel Sambuc } 253*f4a2713aSLionel Sambuc return BufferEnd; 254*f4a2713aSLionel Sambuc } 255*f4a2713aSLionel Sambuc 256*f4a2713aSLionel Sambuc /// Return the one past end pointer for C comments. 257*f4a2713aSLionel Sambuc /// Very dumb, does not handle escaped newlines or trigraphs. 258*f4a2713aSLionel Sambuc const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) { 259*f4a2713aSLionel Sambuc for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 260*f4a2713aSLionel Sambuc if (*BufferPtr == '*') { 261*f4a2713aSLionel Sambuc assert(BufferPtr + 1 != BufferEnd); 262*f4a2713aSLionel Sambuc if (*(BufferPtr + 1) == '/') 263*f4a2713aSLionel Sambuc return BufferPtr; 264*f4a2713aSLionel Sambuc } 265*f4a2713aSLionel Sambuc } 266*f4a2713aSLionel Sambuc llvm_unreachable("buffer end hit before '*/' was seen"); 267*f4a2713aSLionel Sambuc } 268*f4a2713aSLionel Sambuc 269*f4a2713aSLionel Sambuc } // unnamed namespace 270*f4a2713aSLionel Sambuc 271*f4a2713aSLionel Sambuc void Lexer::lexCommentText(Token &T) { 272*f4a2713aSLionel Sambuc assert(CommentState == LCS_InsideBCPLComment || 273*f4a2713aSLionel Sambuc CommentState == LCS_InsideCComment); 274*f4a2713aSLionel Sambuc 275*f4a2713aSLionel Sambuc switch (State) { 276*f4a2713aSLionel Sambuc case LS_Normal: 277*f4a2713aSLionel Sambuc break; 278*f4a2713aSLionel Sambuc case LS_VerbatimBlockFirstLine: 279*f4a2713aSLionel Sambuc lexVerbatimBlockFirstLine(T); 280*f4a2713aSLionel Sambuc return; 281*f4a2713aSLionel Sambuc case LS_VerbatimBlockBody: 282*f4a2713aSLionel Sambuc lexVerbatimBlockBody(T); 283*f4a2713aSLionel Sambuc return; 284*f4a2713aSLionel Sambuc case LS_VerbatimLineText: 285*f4a2713aSLionel Sambuc lexVerbatimLineText(T); 286*f4a2713aSLionel Sambuc return; 287*f4a2713aSLionel Sambuc case LS_HTMLStartTag: 288*f4a2713aSLionel Sambuc lexHTMLStartTag(T); 289*f4a2713aSLionel Sambuc return; 290*f4a2713aSLionel Sambuc case LS_HTMLEndTag: 291*f4a2713aSLionel Sambuc lexHTMLEndTag(T); 292*f4a2713aSLionel Sambuc return; 293*f4a2713aSLionel Sambuc } 294*f4a2713aSLionel Sambuc 295*f4a2713aSLionel Sambuc assert(State == LS_Normal); 296*f4a2713aSLionel Sambuc 297*f4a2713aSLionel Sambuc const char *TokenPtr = BufferPtr; 298*f4a2713aSLionel Sambuc assert(TokenPtr < CommentEnd); 299*f4a2713aSLionel Sambuc while (TokenPtr != CommentEnd) { 300*f4a2713aSLionel Sambuc switch(*TokenPtr) { 301*f4a2713aSLionel Sambuc case '\\': 302*f4a2713aSLionel Sambuc case '@': { 303*f4a2713aSLionel Sambuc // Commands that start with a backslash and commands that start with 304*f4a2713aSLionel Sambuc // 'at' have equivalent semantics. But we keep information about the 305*f4a2713aSLionel Sambuc // exact syntax in AST for comments. 306*f4a2713aSLionel Sambuc tok::TokenKind CommandKind = 307*f4a2713aSLionel Sambuc (*TokenPtr == '@') ? tok::at_command : tok::backslash_command; 308*f4a2713aSLionel Sambuc TokenPtr++; 309*f4a2713aSLionel Sambuc if (TokenPtr == CommentEnd) { 310*f4a2713aSLionel Sambuc formTextToken(T, TokenPtr); 311*f4a2713aSLionel Sambuc return; 312*f4a2713aSLionel Sambuc } 313*f4a2713aSLionel Sambuc char C = *TokenPtr; 314*f4a2713aSLionel Sambuc switch (C) { 315*f4a2713aSLionel Sambuc default: 316*f4a2713aSLionel Sambuc break; 317*f4a2713aSLionel Sambuc 318*f4a2713aSLionel Sambuc case '\\': case '@': case '&': case '$': 319*f4a2713aSLionel Sambuc case '#': case '<': case '>': case '%': 320*f4a2713aSLionel Sambuc case '\"': case '.': case ':': 321*f4a2713aSLionel Sambuc // This is one of \\ \@ \& \$ etc escape sequences. 322*f4a2713aSLionel Sambuc TokenPtr++; 323*f4a2713aSLionel Sambuc if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { 324*f4a2713aSLionel Sambuc // This is the \:: escape sequence. 325*f4a2713aSLionel Sambuc TokenPtr++; 326*f4a2713aSLionel Sambuc } 327*f4a2713aSLionel Sambuc StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1)); 328*f4a2713aSLionel Sambuc formTokenWithChars(T, TokenPtr, tok::text); 329*f4a2713aSLionel Sambuc T.setText(UnescapedText); 330*f4a2713aSLionel Sambuc return; 331*f4a2713aSLionel Sambuc } 332*f4a2713aSLionel Sambuc 333*f4a2713aSLionel Sambuc // Don't make zero-length commands. 334*f4a2713aSLionel Sambuc if (!isCommandNameStartCharacter(*TokenPtr)) { 335*f4a2713aSLionel Sambuc formTextToken(T, TokenPtr); 336*f4a2713aSLionel Sambuc return; 337*f4a2713aSLionel Sambuc } 338*f4a2713aSLionel Sambuc 339*f4a2713aSLionel Sambuc TokenPtr = skipCommandName(TokenPtr, CommentEnd); 340*f4a2713aSLionel Sambuc unsigned Length = TokenPtr - (BufferPtr + 1); 341*f4a2713aSLionel Sambuc 342*f4a2713aSLionel Sambuc // Hardcoded support for lexing LaTeX formula commands 343*f4a2713aSLionel Sambuc // \f$ \f[ \f] \f{ \f} as a single command. 344*f4a2713aSLionel Sambuc if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { 345*f4a2713aSLionel Sambuc C = *TokenPtr; 346*f4a2713aSLionel Sambuc if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') { 347*f4a2713aSLionel Sambuc TokenPtr++; 348*f4a2713aSLionel Sambuc Length++; 349*f4a2713aSLionel Sambuc } 350*f4a2713aSLionel Sambuc } 351*f4a2713aSLionel Sambuc 352*f4a2713aSLionel Sambuc const StringRef CommandName(BufferPtr + 1, Length); 353*f4a2713aSLionel Sambuc 354*f4a2713aSLionel Sambuc const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName); 355*f4a2713aSLionel Sambuc if (!Info) { 356*f4a2713aSLionel Sambuc formTokenWithChars(T, TokenPtr, tok::unknown_command); 357*f4a2713aSLionel Sambuc T.setUnknownCommandName(CommandName); 358*f4a2713aSLionel Sambuc if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) { 359*f4a2713aSLionel Sambuc StringRef CorrectedName = Info->Name; 360*f4a2713aSLionel Sambuc SourceRange CommandRange(T.getLocation().getLocWithOffset(1), 361*f4a2713aSLionel Sambuc T.getEndLocation()); 362*f4a2713aSLionel Sambuc Diag(T.getLocation(), diag::warn_correct_comment_command_name) 363*f4a2713aSLionel Sambuc << CommandName << CorrectedName 364*f4a2713aSLionel Sambuc << FixItHint::CreateReplacement(CommandRange, CorrectedName); 365*f4a2713aSLionel Sambuc } else { 366*f4a2713aSLionel Sambuc Diag(T.getLocation(), diag::warn_unknown_comment_command_name); 367*f4a2713aSLionel Sambuc return; 368*f4a2713aSLionel Sambuc } 369*f4a2713aSLionel Sambuc } 370*f4a2713aSLionel Sambuc if (Info->IsVerbatimBlockCommand) { 371*f4a2713aSLionel Sambuc setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info); 372*f4a2713aSLionel Sambuc return; 373*f4a2713aSLionel Sambuc } 374*f4a2713aSLionel Sambuc if (Info->IsVerbatimLineCommand) { 375*f4a2713aSLionel Sambuc setupAndLexVerbatimLine(T, TokenPtr, Info); 376*f4a2713aSLionel Sambuc return; 377*f4a2713aSLionel Sambuc } 378*f4a2713aSLionel Sambuc formTokenWithChars(T, TokenPtr, CommandKind); 379*f4a2713aSLionel Sambuc T.setCommandID(Info->getID()); 380*f4a2713aSLionel Sambuc return; 381*f4a2713aSLionel Sambuc } 382*f4a2713aSLionel Sambuc 383*f4a2713aSLionel Sambuc case '&': 384*f4a2713aSLionel Sambuc lexHTMLCharacterReference(T); 385*f4a2713aSLionel Sambuc return; 386*f4a2713aSLionel Sambuc 387*f4a2713aSLionel Sambuc case '<': { 388*f4a2713aSLionel Sambuc TokenPtr++; 389*f4a2713aSLionel Sambuc if (TokenPtr == CommentEnd) { 390*f4a2713aSLionel Sambuc formTextToken(T, TokenPtr); 391*f4a2713aSLionel Sambuc return; 392*f4a2713aSLionel Sambuc } 393*f4a2713aSLionel Sambuc const char C = *TokenPtr; 394*f4a2713aSLionel Sambuc if (isHTMLIdentifierStartingCharacter(C)) 395*f4a2713aSLionel Sambuc setupAndLexHTMLStartTag(T); 396*f4a2713aSLionel Sambuc else if (C == '/') 397*f4a2713aSLionel Sambuc setupAndLexHTMLEndTag(T); 398*f4a2713aSLionel Sambuc else 399*f4a2713aSLionel Sambuc formTextToken(T, TokenPtr); 400*f4a2713aSLionel Sambuc 401*f4a2713aSLionel Sambuc return; 402*f4a2713aSLionel Sambuc } 403*f4a2713aSLionel Sambuc 404*f4a2713aSLionel Sambuc case '\n': 405*f4a2713aSLionel Sambuc case '\r': 406*f4a2713aSLionel Sambuc TokenPtr = skipNewline(TokenPtr, CommentEnd); 407*f4a2713aSLionel Sambuc formTokenWithChars(T, TokenPtr, tok::newline); 408*f4a2713aSLionel Sambuc 409*f4a2713aSLionel Sambuc if (CommentState == LCS_InsideCComment) 410*f4a2713aSLionel Sambuc skipLineStartingDecorations(); 411*f4a2713aSLionel Sambuc return; 412*f4a2713aSLionel Sambuc 413*f4a2713aSLionel Sambuc default: { 414*f4a2713aSLionel Sambuc size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr). 415*f4a2713aSLionel Sambuc find_first_of("\n\r\\@&<"); 416*f4a2713aSLionel Sambuc if (End != StringRef::npos) 417*f4a2713aSLionel Sambuc TokenPtr += End; 418*f4a2713aSLionel Sambuc else 419*f4a2713aSLionel Sambuc TokenPtr = CommentEnd; 420*f4a2713aSLionel Sambuc formTextToken(T, TokenPtr); 421*f4a2713aSLionel Sambuc return; 422*f4a2713aSLionel Sambuc } 423*f4a2713aSLionel Sambuc } 424*f4a2713aSLionel Sambuc } 425*f4a2713aSLionel Sambuc } 426*f4a2713aSLionel Sambuc 427*f4a2713aSLionel Sambuc void Lexer::setupAndLexVerbatimBlock(Token &T, 428*f4a2713aSLionel Sambuc const char *TextBegin, 429*f4a2713aSLionel Sambuc char Marker, const CommandInfo *Info) { 430*f4a2713aSLionel Sambuc assert(Info->IsVerbatimBlockCommand); 431*f4a2713aSLionel Sambuc 432*f4a2713aSLionel Sambuc VerbatimBlockEndCommandName.clear(); 433*f4a2713aSLionel Sambuc VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@"); 434*f4a2713aSLionel Sambuc VerbatimBlockEndCommandName.append(Info->EndCommandName); 435*f4a2713aSLionel Sambuc 436*f4a2713aSLionel Sambuc formTokenWithChars(T, TextBegin, tok::verbatim_block_begin); 437*f4a2713aSLionel Sambuc T.setVerbatimBlockID(Info->getID()); 438*f4a2713aSLionel Sambuc 439*f4a2713aSLionel Sambuc // If there is a newline following the verbatim opening command, skip the 440*f4a2713aSLionel Sambuc // newline so that we don't create an tok::verbatim_block_line with empty 441*f4a2713aSLionel Sambuc // text content. 442*f4a2713aSLionel Sambuc if (BufferPtr != CommentEnd && 443*f4a2713aSLionel Sambuc isVerticalWhitespace(*BufferPtr)) { 444*f4a2713aSLionel Sambuc BufferPtr = skipNewline(BufferPtr, CommentEnd); 445*f4a2713aSLionel Sambuc State = LS_VerbatimBlockBody; 446*f4a2713aSLionel Sambuc return; 447*f4a2713aSLionel Sambuc } 448*f4a2713aSLionel Sambuc 449*f4a2713aSLionel Sambuc State = LS_VerbatimBlockFirstLine; 450*f4a2713aSLionel Sambuc } 451*f4a2713aSLionel Sambuc 452*f4a2713aSLionel Sambuc void Lexer::lexVerbatimBlockFirstLine(Token &T) { 453*f4a2713aSLionel Sambuc again: 454*f4a2713aSLionel Sambuc assert(BufferPtr < CommentEnd); 455*f4a2713aSLionel Sambuc 456*f4a2713aSLionel Sambuc // FIXME: It would be better to scan the text once, finding either the block 457*f4a2713aSLionel Sambuc // end command or newline. 458*f4a2713aSLionel Sambuc // 459*f4a2713aSLionel Sambuc // Extract current line. 460*f4a2713aSLionel Sambuc const char *Newline = findNewline(BufferPtr, CommentEnd); 461*f4a2713aSLionel Sambuc StringRef Line(BufferPtr, Newline - BufferPtr); 462*f4a2713aSLionel Sambuc 463*f4a2713aSLionel Sambuc // Look for end command in current line. 464*f4a2713aSLionel Sambuc size_t Pos = Line.find(VerbatimBlockEndCommandName); 465*f4a2713aSLionel Sambuc const char *TextEnd; 466*f4a2713aSLionel Sambuc const char *NextLine; 467*f4a2713aSLionel Sambuc if (Pos == StringRef::npos) { 468*f4a2713aSLionel Sambuc // Current line is completely verbatim. 469*f4a2713aSLionel Sambuc TextEnd = Newline; 470*f4a2713aSLionel Sambuc NextLine = skipNewline(Newline, CommentEnd); 471*f4a2713aSLionel Sambuc } else if (Pos == 0) { 472*f4a2713aSLionel Sambuc // Current line contains just an end command. 473*f4a2713aSLionel Sambuc const char *End = BufferPtr + VerbatimBlockEndCommandName.size(); 474*f4a2713aSLionel Sambuc StringRef Name(BufferPtr + 1, End - (BufferPtr + 1)); 475*f4a2713aSLionel Sambuc formTokenWithChars(T, End, tok::verbatim_block_end); 476*f4a2713aSLionel Sambuc T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID()); 477*f4a2713aSLionel Sambuc State = LS_Normal; 478*f4a2713aSLionel Sambuc return; 479*f4a2713aSLionel Sambuc } else { 480*f4a2713aSLionel Sambuc // There is some text, followed by end command. Extract text first. 481*f4a2713aSLionel Sambuc TextEnd = BufferPtr + Pos; 482*f4a2713aSLionel Sambuc NextLine = TextEnd; 483*f4a2713aSLionel Sambuc // If there is only whitespace before end command, skip whitespace. 484*f4a2713aSLionel Sambuc if (isWhitespace(BufferPtr, TextEnd)) { 485*f4a2713aSLionel Sambuc BufferPtr = TextEnd; 486*f4a2713aSLionel Sambuc goto again; 487*f4a2713aSLionel Sambuc } 488*f4a2713aSLionel Sambuc } 489*f4a2713aSLionel Sambuc 490*f4a2713aSLionel Sambuc StringRef Text(BufferPtr, TextEnd - BufferPtr); 491*f4a2713aSLionel Sambuc formTokenWithChars(T, NextLine, tok::verbatim_block_line); 492*f4a2713aSLionel Sambuc T.setVerbatimBlockText(Text); 493*f4a2713aSLionel Sambuc 494*f4a2713aSLionel Sambuc State = LS_VerbatimBlockBody; 495*f4a2713aSLionel Sambuc } 496*f4a2713aSLionel Sambuc 497*f4a2713aSLionel Sambuc void Lexer::lexVerbatimBlockBody(Token &T) { 498*f4a2713aSLionel Sambuc assert(State == LS_VerbatimBlockBody); 499*f4a2713aSLionel Sambuc 500*f4a2713aSLionel Sambuc if (CommentState == LCS_InsideCComment) 501*f4a2713aSLionel Sambuc skipLineStartingDecorations(); 502*f4a2713aSLionel Sambuc 503*f4a2713aSLionel Sambuc lexVerbatimBlockFirstLine(T); 504*f4a2713aSLionel Sambuc } 505*f4a2713aSLionel Sambuc 506*f4a2713aSLionel Sambuc void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin, 507*f4a2713aSLionel Sambuc const CommandInfo *Info) { 508*f4a2713aSLionel Sambuc assert(Info->IsVerbatimLineCommand); 509*f4a2713aSLionel Sambuc formTokenWithChars(T, TextBegin, tok::verbatim_line_name); 510*f4a2713aSLionel Sambuc T.setVerbatimLineID(Info->getID()); 511*f4a2713aSLionel Sambuc 512*f4a2713aSLionel Sambuc State = LS_VerbatimLineText; 513*f4a2713aSLionel Sambuc } 514*f4a2713aSLionel Sambuc 515*f4a2713aSLionel Sambuc void Lexer::lexVerbatimLineText(Token &T) { 516*f4a2713aSLionel Sambuc assert(State == LS_VerbatimLineText); 517*f4a2713aSLionel Sambuc 518*f4a2713aSLionel Sambuc // Extract current line. 519*f4a2713aSLionel Sambuc const char *Newline = findNewline(BufferPtr, CommentEnd); 520*f4a2713aSLionel Sambuc const StringRef Text(BufferPtr, Newline - BufferPtr); 521*f4a2713aSLionel Sambuc formTokenWithChars(T, Newline, tok::verbatim_line_text); 522*f4a2713aSLionel Sambuc T.setVerbatimLineText(Text); 523*f4a2713aSLionel Sambuc 524*f4a2713aSLionel Sambuc State = LS_Normal; 525*f4a2713aSLionel Sambuc } 526*f4a2713aSLionel Sambuc 527*f4a2713aSLionel Sambuc void Lexer::lexHTMLCharacterReference(Token &T) { 528*f4a2713aSLionel Sambuc const char *TokenPtr = BufferPtr; 529*f4a2713aSLionel Sambuc assert(*TokenPtr == '&'); 530*f4a2713aSLionel Sambuc TokenPtr++; 531*f4a2713aSLionel Sambuc if (TokenPtr == CommentEnd) { 532*f4a2713aSLionel Sambuc formTextToken(T, TokenPtr); 533*f4a2713aSLionel Sambuc return; 534*f4a2713aSLionel Sambuc } 535*f4a2713aSLionel Sambuc const char *NamePtr; 536*f4a2713aSLionel Sambuc bool isNamed = false; 537*f4a2713aSLionel Sambuc bool isDecimal = false; 538*f4a2713aSLionel Sambuc char C = *TokenPtr; 539*f4a2713aSLionel Sambuc if (isHTMLNamedCharacterReferenceCharacter(C)) { 540*f4a2713aSLionel Sambuc NamePtr = TokenPtr; 541*f4a2713aSLionel Sambuc TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd); 542*f4a2713aSLionel Sambuc isNamed = true; 543*f4a2713aSLionel Sambuc } else if (C == '#') { 544*f4a2713aSLionel Sambuc TokenPtr++; 545*f4a2713aSLionel Sambuc if (TokenPtr == CommentEnd) { 546*f4a2713aSLionel Sambuc formTextToken(T, TokenPtr); 547*f4a2713aSLionel Sambuc return; 548*f4a2713aSLionel Sambuc } 549*f4a2713aSLionel Sambuc C = *TokenPtr; 550*f4a2713aSLionel Sambuc if (isHTMLDecimalCharacterReferenceCharacter(C)) { 551*f4a2713aSLionel Sambuc NamePtr = TokenPtr; 552*f4a2713aSLionel Sambuc TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd); 553*f4a2713aSLionel Sambuc isDecimal = true; 554*f4a2713aSLionel Sambuc } else if (C == 'x' || C == 'X') { 555*f4a2713aSLionel Sambuc TokenPtr++; 556*f4a2713aSLionel Sambuc NamePtr = TokenPtr; 557*f4a2713aSLionel Sambuc TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd); 558*f4a2713aSLionel Sambuc } else { 559*f4a2713aSLionel Sambuc formTextToken(T, TokenPtr); 560*f4a2713aSLionel Sambuc return; 561*f4a2713aSLionel Sambuc } 562*f4a2713aSLionel Sambuc } else { 563*f4a2713aSLionel Sambuc formTextToken(T, TokenPtr); 564*f4a2713aSLionel Sambuc return; 565*f4a2713aSLionel Sambuc } 566*f4a2713aSLionel Sambuc if (NamePtr == TokenPtr || TokenPtr == CommentEnd || 567*f4a2713aSLionel Sambuc *TokenPtr != ';') { 568*f4a2713aSLionel Sambuc formTextToken(T, TokenPtr); 569*f4a2713aSLionel Sambuc return; 570*f4a2713aSLionel Sambuc } 571*f4a2713aSLionel Sambuc StringRef Name(NamePtr, TokenPtr - NamePtr); 572*f4a2713aSLionel Sambuc TokenPtr++; // Skip semicolon. 573*f4a2713aSLionel Sambuc StringRef Resolved; 574*f4a2713aSLionel Sambuc if (isNamed) 575*f4a2713aSLionel Sambuc Resolved = resolveHTMLNamedCharacterReference(Name); 576*f4a2713aSLionel Sambuc else if (isDecimal) 577*f4a2713aSLionel Sambuc Resolved = resolveHTMLDecimalCharacterReference(Name); 578*f4a2713aSLionel Sambuc else 579*f4a2713aSLionel Sambuc Resolved = resolveHTMLHexCharacterReference(Name); 580*f4a2713aSLionel Sambuc 581*f4a2713aSLionel Sambuc if (Resolved.empty()) { 582*f4a2713aSLionel Sambuc formTextToken(T, TokenPtr); 583*f4a2713aSLionel Sambuc return; 584*f4a2713aSLionel Sambuc } 585*f4a2713aSLionel Sambuc formTokenWithChars(T, TokenPtr, tok::text); 586*f4a2713aSLionel Sambuc T.setText(Resolved); 587*f4a2713aSLionel Sambuc return; 588*f4a2713aSLionel Sambuc } 589*f4a2713aSLionel Sambuc 590*f4a2713aSLionel Sambuc void Lexer::setupAndLexHTMLStartTag(Token &T) { 591*f4a2713aSLionel Sambuc assert(BufferPtr[0] == '<' && 592*f4a2713aSLionel Sambuc isHTMLIdentifierStartingCharacter(BufferPtr[1])); 593*f4a2713aSLionel Sambuc const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd); 594*f4a2713aSLionel Sambuc StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1)); 595*f4a2713aSLionel Sambuc if (!isHTMLTagName(Name)) { 596*f4a2713aSLionel Sambuc formTextToken(T, TagNameEnd); 597*f4a2713aSLionel Sambuc return; 598*f4a2713aSLionel Sambuc } 599*f4a2713aSLionel Sambuc 600*f4a2713aSLionel Sambuc formTokenWithChars(T, TagNameEnd, tok::html_start_tag); 601*f4a2713aSLionel Sambuc T.setHTMLTagStartName(Name); 602*f4a2713aSLionel Sambuc 603*f4a2713aSLionel Sambuc BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 604*f4a2713aSLionel Sambuc 605*f4a2713aSLionel Sambuc const char C = *BufferPtr; 606*f4a2713aSLionel Sambuc if (BufferPtr != CommentEnd && 607*f4a2713aSLionel Sambuc (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C))) 608*f4a2713aSLionel Sambuc State = LS_HTMLStartTag; 609*f4a2713aSLionel Sambuc } 610*f4a2713aSLionel Sambuc 611*f4a2713aSLionel Sambuc void Lexer::lexHTMLStartTag(Token &T) { 612*f4a2713aSLionel Sambuc assert(State == LS_HTMLStartTag); 613*f4a2713aSLionel Sambuc 614*f4a2713aSLionel Sambuc const char *TokenPtr = BufferPtr; 615*f4a2713aSLionel Sambuc char C = *TokenPtr; 616*f4a2713aSLionel Sambuc if (isHTMLIdentifierCharacter(C)) { 617*f4a2713aSLionel Sambuc TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd); 618*f4a2713aSLionel Sambuc StringRef Ident(BufferPtr, TokenPtr - BufferPtr); 619*f4a2713aSLionel Sambuc formTokenWithChars(T, TokenPtr, tok::html_ident); 620*f4a2713aSLionel Sambuc T.setHTMLIdent(Ident); 621*f4a2713aSLionel Sambuc } else { 622*f4a2713aSLionel Sambuc switch (C) { 623*f4a2713aSLionel Sambuc case '=': 624*f4a2713aSLionel Sambuc TokenPtr++; 625*f4a2713aSLionel Sambuc formTokenWithChars(T, TokenPtr, tok::html_equals); 626*f4a2713aSLionel Sambuc break; 627*f4a2713aSLionel Sambuc case '\"': 628*f4a2713aSLionel Sambuc case '\'': { 629*f4a2713aSLionel Sambuc const char *OpenQuote = TokenPtr; 630*f4a2713aSLionel Sambuc TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd); 631*f4a2713aSLionel Sambuc const char *ClosingQuote = TokenPtr; 632*f4a2713aSLionel Sambuc if (TokenPtr != CommentEnd) // Skip closing quote. 633*f4a2713aSLionel Sambuc TokenPtr++; 634*f4a2713aSLionel Sambuc formTokenWithChars(T, TokenPtr, tok::html_quoted_string); 635*f4a2713aSLionel Sambuc T.setHTMLQuotedString(StringRef(OpenQuote + 1, 636*f4a2713aSLionel Sambuc ClosingQuote - (OpenQuote + 1))); 637*f4a2713aSLionel Sambuc break; 638*f4a2713aSLionel Sambuc } 639*f4a2713aSLionel Sambuc case '>': 640*f4a2713aSLionel Sambuc TokenPtr++; 641*f4a2713aSLionel Sambuc formTokenWithChars(T, TokenPtr, tok::html_greater); 642*f4a2713aSLionel Sambuc State = LS_Normal; 643*f4a2713aSLionel Sambuc return; 644*f4a2713aSLionel Sambuc case '/': 645*f4a2713aSLionel Sambuc TokenPtr++; 646*f4a2713aSLionel Sambuc if (TokenPtr != CommentEnd && *TokenPtr == '>') { 647*f4a2713aSLionel Sambuc TokenPtr++; 648*f4a2713aSLionel Sambuc formTokenWithChars(T, TokenPtr, tok::html_slash_greater); 649*f4a2713aSLionel Sambuc } else 650*f4a2713aSLionel Sambuc formTextToken(T, TokenPtr); 651*f4a2713aSLionel Sambuc 652*f4a2713aSLionel Sambuc State = LS_Normal; 653*f4a2713aSLionel Sambuc return; 654*f4a2713aSLionel Sambuc } 655*f4a2713aSLionel Sambuc } 656*f4a2713aSLionel Sambuc 657*f4a2713aSLionel Sambuc // Now look ahead and return to normal state if we don't see any HTML tokens 658*f4a2713aSLionel Sambuc // ahead. 659*f4a2713aSLionel Sambuc BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 660*f4a2713aSLionel Sambuc if (BufferPtr == CommentEnd) { 661*f4a2713aSLionel Sambuc State = LS_Normal; 662*f4a2713aSLionel Sambuc return; 663*f4a2713aSLionel Sambuc } 664*f4a2713aSLionel Sambuc 665*f4a2713aSLionel Sambuc C = *BufferPtr; 666*f4a2713aSLionel Sambuc if (!isHTMLIdentifierStartingCharacter(C) && 667*f4a2713aSLionel Sambuc C != '=' && C != '\"' && C != '\'' && C != '>') { 668*f4a2713aSLionel Sambuc State = LS_Normal; 669*f4a2713aSLionel Sambuc return; 670*f4a2713aSLionel Sambuc } 671*f4a2713aSLionel Sambuc } 672*f4a2713aSLionel Sambuc 673*f4a2713aSLionel Sambuc void Lexer::setupAndLexHTMLEndTag(Token &T) { 674*f4a2713aSLionel Sambuc assert(BufferPtr[0] == '<' && BufferPtr[1] == '/'); 675*f4a2713aSLionel Sambuc 676*f4a2713aSLionel Sambuc const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd); 677*f4a2713aSLionel Sambuc const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd); 678*f4a2713aSLionel Sambuc StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin); 679*f4a2713aSLionel Sambuc if (!isHTMLTagName(Name)) { 680*f4a2713aSLionel Sambuc formTextToken(T, TagNameEnd); 681*f4a2713aSLionel Sambuc return; 682*f4a2713aSLionel Sambuc } 683*f4a2713aSLionel Sambuc 684*f4a2713aSLionel Sambuc const char *End = skipWhitespace(TagNameEnd, CommentEnd); 685*f4a2713aSLionel Sambuc 686*f4a2713aSLionel Sambuc formTokenWithChars(T, End, tok::html_end_tag); 687*f4a2713aSLionel Sambuc T.setHTMLTagEndName(Name); 688*f4a2713aSLionel Sambuc 689*f4a2713aSLionel Sambuc if (BufferPtr != CommentEnd && *BufferPtr == '>') 690*f4a2713aSLionel Sambuc State = LS_HTMLEndTag; 691*f4a2713aSLionel Sambuc } 692*f4a2713aSLionel Sambuc 693*f4a2713aSLionel Sambuc void Lexer::lexHTMLEndTag(Token &T) { 694*f4a2713aSLionel Sambuc assert(BufferPtr != CommentEnd && *BufferPtr == '>'); 695*f4a2713aSLionel Sambuc 696*f4a2713aSLionel Sambuc formTokenWithChars(T, BufferPtr + 1, tok::html_greater); 697*f4a2713aSLionel Sambuc State = LS_Normal; 698*f4a2713aSLionel Sambuc } 699*f4a2713aSLionel Sambuc 700*f4a2713aSLionel Sambuc Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, 701*f4a2713aSLionel Sambuc const CommandTraits &Traits, 702*f4a2713aSLionel Sambuc SourceLocation FileLoc, 703*f4a2713aSLionel Sambuc const char *BufferStart, const char *BufferEnd): 704*f4a2713aSLionel Sambuc Allocator(Allocator), Diags(Diags), Traits(Traits), 705*f4a2713aSLionel Sambuc BufferStart(BufferStart), BufferEnd(BufferEnd), 706*f4a2713aSLionel Sambuc FileLoc(FileLoc), BufferPtr(BufferStart), 707*f4a2713aSLionel Sambuc CommentState(LCS_BeforeComment), State(LS_Normal) { 708*f4a2713aSLionel Sambuc } 709*f4a2713aSLionel Sambuc 710*f4a2713aSLionel Sambuc void Lexer::lex(Token &T) { 711*f4a2713aSLionel Sambuc again: 712*f4a2713aSLionel Sambuc switch (CommentState) { 713*f4a2713aSLionel Sambuc case LCS_BeforeComment: 714*f4a2713aSLionel Sambuc if (BufferPtr == BufferEnd) { 715*f4a2713aSLionel Sambuc formTokenWithChars(T, BufferPtr, tok::eof); 716*f4a2713aSLionel Sambuc return; 717*f4a2713aSLionel Sambuc } 718*f4a2713aSLionel Sambuc 719*f4a2713aSLionel Sambuc assert(*BufferPtr == '/'); 720*f4a2713aSLionel Sambuc BufferPtr++; // Skip first slash. 721*f4a2713aSLionel Sambuc switch(*BufferPtr) { 722*f4a2713aSLionel Sambuc case '/': { // BCPL comment. 723*f4a2713aSLionel Sambuc BufferPtr++; // Skip second slash. 724*f4a2713aSLionel Sambuc 725*f4a2713aSLionel Sambuc if (BufferPtr != BufferEnd) { 726*f4a2713aSLionel Sambuc // Skip Doxygen magic marker, if it is present. 727*f4a2713aSLionel Sambuc // It might be missing because of a typo //< or /*<, or because we 728*f4a2713aSLionel Sambuc // merged this non-Doxygen comment into a bunch of Doxygen comments 729*f4a2713aSLionel Sambuc // around it: /** ... */ /* ... */ /** ... */ 730*f4a2713aSLionel Sambuc const char C = *BufferPtr; 731*f4a2713aSLionel Sambuc if (C == '/' || C == '!') 732*f4a2713aSLionel Sambuc BufferPtr++; 733*f4a2713aSLionel Sambuc } 734*f4a2713aSLionel Sambuc 735*f4a2713aSLionel Sambuc // Skip less-than symbol that marks trailing comments. 736*f4a2713aSLionel Sambuc // Skip it even if the comment is not a Doxygen one, because //< and /*< 737*f4a2713aSLionel Sambuc // are frequent typos. 738*f4a2713aSLionel Sambuc if (BufferPtr != BufferEnd && *BufferPtr == '<') 739*f4a2713aSLionel Sambuc BufferPtr++; 740*f4a2713aSLionel Sambuc 741*f4a2713aSLionel Sambuc CommentState = LCS_InsideBCPLComment; 742*f4a2713aSLionel Sambuc if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine) 743*f4a2713aSLionel Sambuc State = LS_Normal; 744*f4a2713aSLionel Sambuc CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd); 745*f4a2713aSLionel Sambuc goto again; 746*f4a2713aSLionel Sambuc } 747*f4a2713aSLionel Sambuc case '*': { // C comment. 748*f4a2713aSLionel Sambuc BufferPtr++; // Skip star. 749*f4a2713aSLionel Sambuc 750*f4a2713aSLionel Sambuc // Skip Doxygen magic marker. 751*f4a2713aSLionel Sambuc const char C = *BufferPtr; 752*f4a2713aSLionel Sambuc if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!') 753*f4a2713aSLionel Sambuc BufferPtr++; 754*f4a2713aSLionel Sambuc 755*f4a2713aSLionel Sambuc // Skip less-than symbol that marks trailing comments. 756*f4a2713aSLionel Sambuc if (BufferPtr != BufferEnd && *BufferPtr == '<') 757*f4a2713aSLionel Sambuc BufferPtr++; 758*f4a2713aSLionel Sambuc 759*f4a2713aSLionel Sambuc CommentState = LCS_InsideCComment; 760*f4a2713aSLionel Sambuc State = LS_Normal; 761*f4a2713aSLionel Sambuc CommentEnd = findCCommentEnd(BufferPtr, BufferEnd); 762*f4a2713aSLionel Sambuc goto again; 763*f4a2713aSLionel Sambuc } 764*f4a2713aSLionel Sambuc default: 765*f4a2713aSLionel Sambuc llvm_unreachable("second character of comment should be '/' or '*'"); 766*f4a2713aSLionel Sambuc } 767*f4a2713aSLionel Sambuc 768*f4a2713aSLionel Sambuc case LCS_BetweenComments: { 769*f4a2713aSLionel Sambuc // Consecutive comments are extracted only if there is only whitespace 770*f4a2713aSLionel Sambuc // between them. So we can search for the start of the next comment. 771*f4a2713aSLionel Sambuc const char *EndWhitespace = BufferPtr; 772*f4a2713aSLionel Sambuc while(EndWhitespace != BufferEnd && *EndWhitespace != '/') 773*f4a2713aSLionel Sambuc EndWhitespace++; 774*f4a2713aSLionel Sambuc 775*f4a2713aSLionel Sambuc // Turn any whitespace between comments (and there is only whitespace 776*f4a2713aSLionel Sambuc // between them -- guaranteed by comment extraction) into a newline. We 777*f4a2713aSLionel Sambuc // have two newlines between C comments in total (first one was synthesized 778*f4a2713aSLionel Sambuc // after a comment). 779*f4a2713aSLionel Sambuc formTokenWithChars(T, EndWhitespace, tok::newline); 780*f4a2713aSLionel Sambuc 781*f4a2713aSLionel Sambuc CommentState = LCS_BeforeComment; 782*f4a2713aSLionel Sambuc break; 783*f4a2713aSLionel Sambuc } 784*f4a2713aSLionel Sambuc 785*f4a2713aSLionel Sambuc case LCS_InsideBCPLComment: 786*f4a2713aSLionel Sambuc case LCS_InsideCComment: 787*f4a2713aSLionel Sambuc if (BufferPtr != CommentEnd) { 788*f4a2713aSLionel Sambuc lexCommentText(T); 789*f4a2713aSLionel Sambuc break; 790*f4a2713aSLionel Sambuc } else { 791*f4a2713aSLionel Sambuc // Skip C comment closing sequence. 792*f4a2713aSLionel Sambuc if (CommentState == LCS_InsideCComment) { 793*f4a2713aSLionel Sambuc assert(BufferPtr[0] == '*' && BufferPtr[1] == '/'); 794*f4a2713aSLionel Sambuc BufferPtr += 2; 795*f4a2713aSLionel Sambuc assert(BufferPtr <= BufferEnd); 796*f4a2713aSLionel Sambuc 797*f4a2713aSLionel Sambuc // Synthenize newline just after the C comment, regardless if there is 798*f4a2713aSLionel Sambuc // actually a newline. 799*f4a2713aSLionel Sambuc formTokenWithChars(T, BufferPtr, tok::newline); 800*f4a2713aSLionel Sambuc 801*f4a2713aSLionel Sambuc CommentState = LCS_BetweenComments; 802*f4a2713aSLionel Sambuc break; 803*f4a2713aSLionel Sambuc } else { 804*f4a2713aSLionel Sambuc // Don't synthesized a newline after BCPL comment. 805*f4a2713aSLionel Sambuc CommentState = LCS_BetweenComments; 806*f4a2713aSLionel Sambuc goto again; 807*f4a2713aSLionel Sambuc } 808*f4a2713aSLionel Sambuc } 809*f4a2713aSLionel Sambuc } 810*f4a2713aSLionel Sambuc } 811*f4a2713aSLionel Sambuc 812*f4a2713aSLionel Sambuc StringRef Lexer::getSpelling(const Token &Tok, 813*f4a2713aSLionel Sambuc const SourceManager &SourceMgr, 814*f4a2713aSLionel Sambuc bool *Invalid) const { 815*f4a2713aSLionel Sambuc SourceLocation Loc = Tok.getLocation(); 816*f4a2713aSLionel Sambuc std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc); 817*f4a2713aSLionel Sambuc 818*f4a2713aSLionel Sambuc bool InvalidTemp = false; 819*f4a2713aSLionel Sambuc StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp); 820*f4a2713aSLionel Sambuc if (InvalidTemp) { 821*f4a2713aSLionel Sambuc *Invalid = true; 822*f4a2713aSLionel Sambuc return StringRef(); 823*f4a2713aSLionel Sambuc } 824*f4a2713aSLionel Sambuc 825*f4a2713aSLionel Sambuc const char *Begin = File.data() + LocInfo.second; 826*f4a2713aSLionel Sambuc return StringRef(Begin, Tok.getLength()); 827*f4a2713aSLionel Sambuc } 828*f4a2713aSLionel Sambuc 829*f4a2713aSLionel Sambuc } // end namespace comments 830*f4a2713aSLionel Sambuc } // end namespace clang 831*f4a2713aSLionel Sambuc 832