xref: /minix3/external/bsd/llvm/dist/clang/lib/AST/CommentLexer.cpp (revision f4a2713ac843a11c696ec80c0a5e3e5d80b4d338)
1*f4a2713aSLionel Sambuc #include "clang/AST/CommentLexer.h"
2*f4a2713aSLionel Sambuc #include "clang/AST/CommentCommandTraits.h"
3*f4a2713aSLionel Sambuc #include "clang/AST/CommentDiagnostic.h"
4*f4a2713aSLionel Sambuc #include "clang/Basic/CharInfo.h"
5*f4a2713aSLionel Sambuc #include "llvm/ADT/StringExtras.h"
6*f4a2713aSLionel Sambuc #include "llvm/ADT/StringSwitch.h"
7*f4a2713aSLionel Sambuc #include "llvm/Support/ConvertUTF.h"
8*f4a2713aSLionel Sambuc #include "llvm/Support/ErrorHandling.h"
9*f4a2713aSLionel Sambuc 
10*f4a2713aSLionel Sambuc namespace clang {
11*f4a2713aSLionel Sambuc namespace comments {
12*f4a2713aSLionel Sambuc 
13*f4a2713aSLionel Sambuc void Token::dump(const Lexer &L, const SourceManager &SM) const {
14*f4a2713aSLionel Sambuc   llvm::errs() << "comments::Token Kind=" << Kind << " ";
15*f4a2713aSLionel Sambuc   Loc.dump(SM);
16*f4a2713aSLionel Sambuc   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
17*f4a2713aSLionel Sambuc }
18*f4a2713aSLionel Sambuc 
19*f4a2713aSLionel Sambuc static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
20*f4a2713aSLionel Sambuc   return isLetter(C);
21*f4a2713aSLionel Sambuc }
22*f4a2713aSLionel Sambuc 
23*f4a2713aSLionel Sambuc static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
24*f4a2713aSLionel Sambuc   return isDigit(C);
25*f4a2713aSLionel Sambuc }
26*f4a2713aSLionel Sambuc 
27*f4a2713aSLionel Sambuc static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
28*f4a2713aSLionel Sambuc   return isHexDigit(C);
29*f4a2713aSLionel Sambuc }
30*f4a2713aSLionel Sambuc 
31*f4a2713aSLionel Sambuc static inline StringRef convertCodePointToUTF8(
32*f4a2713aSLionel Sambuc                                       llvm::BumpPtrAllocator &Allocator,
33*f4a2713aSLionel Sambuc                                       unsigned CodePoint) {
34*f4a2713aSLionel Sambuc   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
35*f4a2713aSLionel Sambuc   char *ResolvedPtr = Resolved;
36*f4a2713aSLionel Sambuc   if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
37*f4a2713aSLionel Sambuc     return StringRef(Resolved, ResolvedPtr - Resolved);
38*f4a2713aSLionel Sambuc   else
39*f4a2713aSLionel Sambuc     return StringRef();
40*f4a2713aSLionel Sambuc }
41*f4a2713aSLionel Sambuc 
42*f4a2713aSLionel Sambuc namespace {
43*f4a2713aSLionel Sambuc 
44*f4a2713aSLionel Sambuc #include "clang/AST/CommentHTMLTags.inc"
45*f4a2713aSLionel Sambuc #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
46*f4a2713aSLionel Sambuc 
47*f4a2713aSLionel Sambuc } // unnamed namespace
48*f4a2713aSLionel Sambuc 
49*f4a2713aSLionel Sambuc StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
50*f4a2713aSLionel Sambuc   // Fast path, first check a few most widely used named character references.
51*f4a2713aSLionel Sambuc   return llvm::StringSwitch<StringRef>(Name)
52*f4a2713aSLionel Sambuc       .Case("amp", "&")
53*f4a2713aSLionel Sambuc       .Case("lt", "<")
54*f4a2713aSLionel Sambuc       .Case("gt", ">")
55*f4a2713aSLionel Sambuc       .Case("quot", "\"")
56*f4a2713aSLionel Sambuc       .Case("apos", "\'")
57*f4a2713aSLionel Sambuc       // Slow path.
58*f4a2713aSLionel Sambuc       .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
59*f4a2713aSLionel Sambuc }
60*f4a2713aSLionel Sambuc 
61*f4a2713aSLionel Sambuc StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
62*f4a2713aSLionel Sambuc   unsigned CodePoint = 0;
63*f4a2713aSLionel Sambuc   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
64*f4a2713aSLionel Sambuc     assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
65*f4a2713aSLionel Sambuc     CodePoint *= 10;
66*f4a2713aSLionel Sambuc     CodePoint += Name[i] - '0';
67*f4a2713aSLionel Sambuc   }
68*f4a2713aSLionel Sambuc   return convertCodePointToUTF8(Allocator, CodePoint);
69*f4a2713aSLionel Sambuc }
70*f4a2713aSLionel Sambuc 
71*f4a2713aSLionel Sambuc StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
72*f4a2713aSLionel Sambuc   unsigned CodePoint = 0;
73*f4a2713aSLionel Sambuc   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
74*f4a2713aSLionel Sambuc     CodePoint *= 16;
75*f4a2713aSLionel Sambuc     const char C = Name[i];
76*f4a2713aSLionel Sambuc     assert(isHTMLHexCharacterReferenceCharacter(C));
77*f4a2713aSLionel Sambuc     CodePoint += llvm::hexDigitValue(C);
78*f4a2713aSLionel Sambuc   }
79*f4a2713aSLionel Sambuc   return convertCodePointToUTF8(Allocator, CodePoint);
80*f4a2713aSLionel Sambuc }
81*f4a2713aSLionel Sambuc 
82*f4a2713aSLionel Sambuc void Lexer::skipLineStartingDecorations() {
83*f4a2713aSLionel Sambuc   // This function should be called only for C comments
84*f4a2713aSLionel Sambuc   assert(CommentState == LCS_InsideCComment);
85*f4a2713aSLionel Sambuc 
86*f4a2713aSLionel Sambuc   if (BufferPtr == CommentEnd)
87*f4a2713aSLionel Sambuc     return;
88*f4a2713aSLionel Sambuc 
89*f4a2713aSLionel Sambuc   switch (*BufferPtr) {
90*f4a2713aSLionel Sambuc   case ' ':
91*f4a2713aSLionel Sambuc   case '\t':
92*f4a2713aSLionel Sambuc   case '\f':
93*f4a2713aSLionel Sambuc   case '\v': {
94*f4a2713aSLionel Sambuc     const char *NewBufferPtr = BufferPtr;
95*f4a2713aSLionel Sambuc     NewBufferPtr++;
96*f4a2713aSLionel Sambuc     if (NewBufferPtr == CommentEnd)
97*f4a2713aSLionel Sambuc       return;
98*f4a2713aSLionel Sambuc 
99*f4a2713aSLionel Sambuc     char C = *NewBufferPtr;
100*f4a2713aSLionel Sambuc     while (isHorizontalWhitespace(C)) {
101*f4a2713aSLionel Sambuc       NewBufferPtr++;
102*f4a2713aSLionel Sambuc       if (NewBufferPtr == CommentEnd)
103*f4a2713aSLionel Sambuc         return;
104*f4a2713aSLionel Sambuc       C = *NewBufferPtr;
105*f4a2713aSLionel Sambuc     }
106*f4a2713aSLionel Sambuc     if (C == '*')
107*f4a2713aSLionel Sambuc       BufferPtr = NewBufferPtr + 1;
108*f4a2713aSLionel Sambuc     break;
109*f4a2713aSLionel Sambuc   }
110*f4a2713aSLionel Sambuc   case '*':
111*f4a2713aSLionel Sambuc     BufferPtr++;
112*f4a2713aSLionel Sambuc     break;
113*f4a2713aSLionel Sambuc   }
114*f4a2713aSLionel Sambuc }
115*f4a2713aSLionel Sambuc 
116*f4a2713aSLionel Sambuc namespace {
117*f4a2713aSLionel Sambuc /// Returns pointer to the first newline character in the string.
118*f4a2713aSLionel Sambuc const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
119*f4a2713aSLionel Sambuc   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
120*f4a2713aSLionel Sambuc     if (isVerticalWhitespace(*BufferPtr))
121*f4a2713aSLionel Sambuc       return BufferPtr;
122*f4a2713aSLionel Sambuc   }
123*f4a2713aSLionel Sambuc   return BufferEnd;
124*f4a2713aSLionel Sambuc }
125*f4a2713aSLionel Sambuc 
/// Step over one line terminator at \p BufferPtr: "\n", "\r" or "\r\n".
///
/// \returns the position after the terminator, or \p BufferPtr unchanged when
/// it already equals \p BufferEnd.  The character at \p BufferPtr is asserted
/// to be '\n' or '\r'.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  ++BufferPtr;
  if (First == '\n')
    return BufferPtr;

  assert(First == '\r');
  // Treat "\r\n" as a single terminator.
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
140*f4a2713aSLionel Sambuc 
141*f4a2713aSLionel Sambuc const char *skipNamedCharacterReference(const char *BufferPtr,
142*f4a2713aSLionel Sambuc                                         const char *BufferEnd) {
143*f4a2713aSLionel Sambuc   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
144*f4a2713aSLionel Sambuc     if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
145*f4a2713aSLionel Sambuc       return BufferPtr;
146*f4a2713aSLionel Sambuc   }
147*f4a2713aSLionel Sambuc   return BufferEnd;
148*f4a2713aSLionel Sambuc }
149*f4a2713aSLionel Sambuc 
150*f4a2713aSLionel Sambuc const char *skipDecimalCharacterReference(const char *BufferPtr,
151*f4a2713aSLionel Sambuc                                           const char *BufferEnd) {
152*f4a2713aSLionel Sambuc   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
153*f4a2713aSLionel Sambuc     if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
154*f4a2713aSLionel Sambuc       return BufferPtr;
155*f4a2713aSLionel Sambuc   }
156*f4a2713aSLionel Sambuc   return BufferEnd;
157*f4a2713aSLionel Sambuc }
158*f4a2713aSLionel Sambuc 
159*f4a2713aSLionel Sambuc const char *skipHexCharacterReference(const char *BufferPtr,
160*f4a2713aSLionel Sambuc                                       const char *BufferEnd) {
161*f4a2713aSLionel Sambuc   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
162*f4a2713aSLionel Sambuc     if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
163*f4a2713aSLionel Sambuc       return BufferPtr;
164*f4a2713aSLionel Sambuc   }
165*f4a2713aSLionel Sambuc   return BufferEnd;
166*f4a2713aSLionel Sambuc }
167*f4a2713aSLionel Sambuc 
168*f4a2713aSLionel Sambuc bool isHTMLIdentifierStartingCharacter(char C) {
169*f4a2713aSLionel Sambuc   return isLetter(C);
170*f4a2713aSLionel Sambuc }
171*f4a2713aSLionel Sambuc 
172*f4a2713aSLionel Sambuc bool isHTMLIdentifierCharacter(char C) {
173*f4a2713aSLionel Sambuc   return isAlphanumeric(C);
174*f4a2713aSLionel Sambuc }
175*f4a2713aSLionel Sambuc 
176*f4a2713aSLionel Sambuc const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
177*f4a2713aSLionel Sambuc   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
178*f4a2713aSLionel Sambuc     if (!isHTMLIdentifierCharacter(*BufferPtr))
179*f4a2713aSLionel Sambuc       return BufferPtr;
180*f4a2713aSLionel Sambuc   }
181*f4a2713aSLionel Sambuc   return BufferEnd;
182*f4a2713aSLionel Sambuc }
183*f4a2713aSLionel Sambuc 
/// Skip an HTML string delimited by single or double quotes.  \p BufferPtr
/// must point at the opening quote.  A quote character directly preceded by a
/// backslash does not terminate the string.
///
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cursor = BufferPtr + 1;
  while (Cursor != BufferEnd) {
    // Only the immediately preceding character is inspected for a backslash.
    if (*Cursor == Quote && Cursor[-1] != '\\')
      return Cursor;
    ++Cursor;
  }
  return BufferEnd;
}
201*f4a2713aSLionel Sambuc 
202*f4a2713aSLionel Sambuc const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
203*f4a2713aSLionel Sambuc   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
204*f4a2713aSLionel Sambuc     if (!isWhitespace(*BufferPtr))
205*f4a2713aSLionel Sambuc       return BufferPtr;
206*f4a2713aSLionel Sambuc   }
207*f4a2713aSLionel Sambuc   return BufferEnd;
208*f4a2713aSLionel Sambuc }
209*f4a2713aSLionel Sambuc 
210*f4a2713aSLionel Sambuc bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
211*f4a2713aSLionel Sambuc   return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
212*f4a2713aSLionel Sambuc }
213*f4a2713aSLionel Sambuc 
214*f4a2713aSLionel Sambuc bool isCommandNameStartCharacter(char C) {
215*f4a2713aSLionel Sambuc   return isLetter(C);
216*f4a2713aSLionel Sambuc }
217*f4a2713aSLionel Sambuc 
218*f4a2713aSLionel Sambuc bool isCommandNameCharacter(char C) {
219*f4a2713aSLionel Sambuc   return isAlphanumeric(C);
220*f4a2713aSLionel Sambuc }
221*f4a2713aSLionel Sambuc 
222*f4a2713aSLionel Sambuc const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
223*f4a2713aSLionel Sambuc   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
224*f4a2713aSLionel Sambuc     if (!isCommandNameCharacter(*BufferPtr))
225*f4a2713aSLionel Sambuc       return BufferPtr;
226*f4a2713aSLionel Sambuc   }
227*f4a2713aSLionel Sambuc   return BufferEnd;
228*f4a2713aSLionel Sambuc }
229*f4a2713aSLionel Sambuc 
230*f4a2713aSLionel Sambuc /// Return the one past end pointer for BCPL comments.
231*f4a2713aSLionel Sambuc /// Handles newlines escaped with backslash or trigraph for backslahs.
232*f4a2713aSLionel Sambuc const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
233*f4a2713aSLionel Sambuc   const char *CurPtr = BufferPtr;
234*f4a2713aSLionel Sambuc   while (CurPtr != BufferEnd) {
235*f4a2713aSLionel Sambuc     while (!isVerticalWhitespace(*CurPtr)) {
236*f4a2713aSLionel Sambuc       CurPtr++;
237*f4a2713aSLionel Sambuc       if (CurPtr == BufferEnd)
238*f4a2713aSLionel Sambuc         return BufferEnd;
239*f4a2713aSLionel Sambuc     }
240*f4a2713aSLionel Sambuc     // We found a newline, check if it is escaped.
241*f4a2713aSLionel Sambuc     const char *EscapePtr = CurPtr - 1;
242*f4a2713aSLionel Sambuc     while(isHorizontalWhitespace(*EscapePtr))
243*f4a2713aSLionel Sambuc       EscapePtr--;
244*f4a2713aSLionel Sambuc 
245*f4a2713aSLionel Sambuc     if (*EscapePtr == '\\' ||
246*f4a2713aSLionel Sambuc         (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
247*f4a2713aSLionel Sambuc          EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
248*f4a2713aSLionel Sambuc       // We found an escaped newline.
249*f4a2713aSLionel Sambuc       CurPtr = skipNewline(CurPtr, BufferEnd);
250*f4a2713aSLionel Sambuc     } else
251*f4a2713aSLionel Sambuc       return CurPtr; // Not an escaped newline.
252*f4a2713aSLionel Sambuc   }
253*f4a2713aSLionel Sambuc   return BufferEnd;
254*f4a2713aSLionel Sambuc }
255*f4a2713aSLionel Sambuc 
256*f4a2713aSLionel Sambuc /// Return the one past end pointer for C comments.
257*f4a2713aSLionel Sambuc /// Very dumb, does not handle escaped newlines or trigraphs.
258*f4a2713aSLionel Sambuc const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
259*f4a2713aSLionel Sambuc   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
260*f4a2713aSLionel Sambuc     if (*BufferPtr == '*') {
261*f4a2713aSLionel Sambuc       assert(BufferPtr + 1 != BufferEnd);
262*f4a2713aSLionel Sambuc       if (*(BufferPtr + 1) == '/')
263*f4a2713aSLionel Sambuc         return BufferPtr;
264*f4a2713aSLionel Sambuc     }
265*f4a2713aSLionel Sambuc   }
266*f4a2713aSLionel Sambuc   llvm_unreachable("buffer end hit before '*/' was seen");
267*f4a2713aSLionel Sambuc }
268*f4a2713aSLionel Sambuc 
269*f4a2713aSLionel Sambuc } // unnamed namespace
270*f4a2713aSLionel Sambuc 
271*f4a2713aSLionel Sambuc void Lexer::lexCommentText(Token &T) {
272*f4a2713aSLionel Sambuc   assert(CommentState == LCS_InsideBCPLComment ||
273*f4a2713aSLionel Sambuc          CommentState == LCS_InsideCComment);
274*f4a2713aSLionel Sambuc 
275*f4a2713aSLionel Sambuc   switch (State) {
276*f4a2713aSLionel Sambuc   case LS_Normal:
277*f4a2713aSLionel Sambuc     break;
278*f4a2713aSLionel Sambuc   case LS_VerbatimBlockFirstLine:
279*f4a2713aSLionel Sambuc     lexVerbatimBlockFirstLine(T);
280*f4a2713aSLionel Sambuc     return;
281*f4a2713aSLionel Sambuc   case LS_VerbatimBlockBody:
282*f4a2713aSLionel Sambuc     lexVerbatimBlockBody(T);
283*f4a2713aSLionel Sambuc     return;
284*f4a2713aSLionel Sambuc   case LS_VerbatimLineText:
285*f4a2713aSLionel Sambuc     lexVerbatimLineText(T);
286*f4a2713aSLionel Sambuc     return;
287*f4a2713aSLionel Sambuc   case LS_HTMLStartTag:
288*f4a2713aSLionel Sambuc     lexHTMLStartTag(T);
289*f4a2713aSLionel Sambuc     return;
290*f4a2713aSLionel Sambuc   case LS_HTMLEndTag:
291*f4a2713aSLionel Sambuc     lexHTMLEndTag(T);
292*f4a2713aSLionel Sambuc     return;
293*f4a2713aSLionel Sambuc   }
294*f4a2713aSLionel Sambuc 
295*f4a2713aSLionel Sambuc   assert(State == LS_Normal);
296*f4a2713aSLionel Sambuc 
297*f4a2713aSLionel Sambuc   const char *TokenPtr = BufferPtr;
298*f4a2713aSLionel Sambuc   assert(TokenPtr < CommentEnd);
299*f4a2713aSLionel Sambuc   while (TokenPtr != CommentEnd) {
300*f4a2713aSLionel Sambuc     switch(*TokenPtr) {
301*f4a2713aSLionel Sambuc       case '\\':
302*f4a2713aSLionel Sambuc       case '@': {
303*f4a2713aSLionel Sambuc         // Commands that start with a backslash and commands that start with
304*f4a2713aSLionel Sambuc         // 'at' have equivalent semantics.  But we keep information about the
305*f4a2713aSLionel Sambuc         // exact syntax in AST for comments.
306*f4a2713aSLionel Sambuc         tok::TokenKind CommandKind =
307*f4a2713aSLionel Sambuc             (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
308*f4a2713aSLionel Sambuc         TokenPtr++;
309*f4a2713aSLionel Sambuc         if (TokenPtr == CommentEnd) {
310*f4a2713aSLionel Sambuc           formTextToken(T, TokenPtr);
311*f4a2713aSLionel Sambuc           return;
312*f4a2713aSLionel Sambuc         }
313*f4a2713aSLionel Sambuc         char C = *TokenPtr;
314*f4a2713aSLionel Sambuc         switch (C) {
315*f4a2713aSLionel Sambuc         default:
316*f4a2713aSLionel Sambuc           break;
317*f4a2713aSLionel Sambuc 
318*f4a2713aSLionel Sambuc         case '\\': case '@': case '&': case '$':
319*f4a2713aSLionel Sambuc         case '#':  case '<': case '>': case '%':
320*f4a2713aSLionel Sambuc         case '\"': case '.': case ':':
321*f4a2713aSLionel Sambuc           // This is one of \\ \@ \& \$ etc escape sequences.
322*f4a2713aSLionel Sambuc           TokenPtr++;
323*f4a2713aSLionel Sambuc           if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
324*f4a2713aSLionel Sambuc             // This is the \:: escape sequence.
325*f4a2713aSLionel Sambuc             TokenPtr++;
326*f4a2713aSLionel Sambuc           }
327*f4a2713aSLionel Sambuc           StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
328*f4a2713aSLionel Sambuc           formTokenWithChars(T, TokenPtr, tok::text);
329*f4a2713aSLionel Sambuc           T.setText(UnescapedText);
330*f4a2713aSLionel Sambuc           return;
331*f4a2713aSLionel Sambuc         }
332*f4a2713aSLionel Sambuc 
333*f4a2713aSLionel Sambuc         // Don't make zero-length commands.
334*f4a2713aSLionel Sambuc         if (!isCommandNameStartCharacter(*TokenPtr)) {
335*f4a2713aSLionel Sambuc           formTextToken(T, TokenPtr);
336*f4a2713aSLionel Sambuc           return;
337*f4a2713aSLionel Sambuc         }
338*f4a2713aSLionel Sambuc 
339*f4a2713aSLionel Sambuc         TokenPtr = skipCommandName(TokenPtr, CommentEnd);
340*f4a2713aSLionel Sambuc         unsigned Length = TokenPtr - (BufferPtr + 1);
341*f4a2713aSLionel Sambuc 
342*f4a2713aSLionel Sambuc         // Hardcoded support for lexing LaTeX formula commands
343*f4a2713aSLionel Sambuc         // \f$ \f[ \f] \f{ \f} as a single command.
344*f4a2713aSLionel Sambuc         if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
345*f4a2713aSLionel Sambuc           C = *TokenPtr;
346*f4a2713aSLionel Sambuc           if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
347*f4a2713aSLionel Sambuc             TokenPtr++;
348*f4a2713aSLionel Sambuc             Length++;
349*f4a2713aSLionel Sambuc           }
350*f4a2713aSLionel Sambuc         }
351*f4a2713aSLionel Sambuc 
352*f4a2713aSLionel Sambuc         const StringRef CommandName(BufferPtr + 1, Length);
353*f4a2713aSLionel Sambuc 
354*f4a2713aSLionel Sambuc         const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
355*f4a2713aSLionel Sambuc         if (!Info) {
356*f4a2713aSLionel Sambuc           formTokenWithChars(T, TokenPtr, tok::unknown_command);
357*f4a2713aSLionel Sambuc           T.setUnknownCommandName(CommandName);
358*f4a2713aSLionel Sambuc           if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
359*f4a2713aSLionel Sambuc             StringRef CorrectedName = Info->Name;
360*f4a2713aSLionel Sambuc             SourceRange CommandRange(T.getLocation().getLocWithOffset(1),
361*f4a2713aSLionel Sambuc                                      T.getEndLocation());
362*f4a2713aSLionel Sambuc             Diag(T.getLocation(), diag::warn_correct_comment_command_name)
363*f4a2713aSLionel Sambuc               << CommandName << CorrectedName
364*f4a2713aSLionel Sambuc               << FixItHint::CreateReplacement(CommandRange, CorrectedName);
365*f4a2713aSLionel Sambuc           } else {
366*f4a2713aSLionel Sambuc             Diag(T.getLocation(), diag::warn_unknown_comment_command_name);
367*f4a2713aSLionel Sambuc             return;
368*f4a2713aSLionel Sambuc           }
369*f4a2713aSLionel Sambuc         }
370*f4a2713aSLionel Sambuc         if (Info->IsVerbatimBlockCommand) {
371*f4a2713aSLionel Sambuc           setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
372*f4a2713aSLionel Sambuc           return;
373*f4a2713aSLionel Sambuc         }
374*f4a2713aSLionel Sambuc         if (Info->IsVerbatimLineCommand) {
375*f4a2713aSLionel Sambuc           setupAndLexVerbatimLine(T, TokenPtr, Info);
376*f4a2713aSLionel Sambuc           return;
377*f4a2713aSLionel Sambuc         }
378*f4a2713aSLionel Sambuc         formTokenWithChars(T, TokenPtr, CommandKind);
379*f4a2713aSLionel Sambuc         T.setCommandID(Info->getID());
380*f4a2713aSLionel Sambuc         return;
381*f4a2713aSLionel Sambuc       }
382*f4a2713aSLionel Sambuc 
383*f4a2713aSLionel Sambuc       case '&':
384*f4a2713aSLionel Sambuc         lexHTMLCharacterReference(T);
385*f4a2713aSLionel Sambuc         return;
386*f4a2713aSLionel Sambuc 
387*f4a2713aSLionel Sambuc       case '<': {
388*f4a2713aSLionel Sambuc         TokenPtr++;
389*f4a2713aSLionel Sambuc         if (TokenPtr == CommentEnd) {
390*f4a2713aSLionel Sambuc           formTextToken(T, TokenPtr);
391*f4a2713aSLionel Sambuc           return;
392*f4a2713aSLionel Sambuc         }
393*f4a2713aSLionel Sambuc         const char C = *TokenPtr;
394*f4a2713aSLionel Sambuc         if (isHTMLIdentifierStartingCharacter(C))
395*f4a2713aSLionel Sambuc           setupAndLexHTMLStartTag(T);
396*f4a2713aSLionel Sambuc         else if (C == '/')
397*f4a2713aSLionel Sambuc           setupAndLexHTMLEndTag(T);
398*f4a2713aSLionel Sambuc         else
399*f4a2713aSLionel Sambuc           formTextToken(T, TokenPtr);
400*f4a2713aSLionel Sambuc 
401*f4a2713aSLionel Sambuc         return;
402*f4a2713aSLionel Sambuc       }
403*f4a2713aSLionel Sambuc 
404*f4a2713aSLionel Sambuc       case '\n':
405*f4a2713aSLionel Sambuc       case '\r':
406*f4a2713aSLionel Sambuc         TokenPtr = skipNewline(TokenPtr, CommentEnd);
407*f4a2713aSLionel Sambuc         formTokenWithChars(T, TokenPtr, tok::newline);
408*f4a2713aSLionel Sambuc 
409*f4a2713aSLionel Sambuc         if (CommentState == LCS_InsideCComment)
410*f4a2713aSLionel Sambuc           skipLineStartingDecorations();
411*f4a2713aSLionel Sambuc         return;
412*f4a2713aSLionel Sambuc 
413*f4a2713aSLionel Sambuc       default: {
414*f4a2713aSLionel Sambuc         size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
415*f4a2713aSLionel Sambuc                          find_first_of("\n\r\\@&<");
416*f4a2713aSLionel Sambuc         if (End != StringRef::npos)
417*f4a2713aSLionel Sambuc           TokenPtr += End;
418*f4a2713aSLionel Sambuc         else
419*f4a2713aSLionel Sambuc           TokenPtr = CommentEnd;
420*f4a2713aSLionel Sambuc         formTextToken(T, TokenPtr);
421*f4a2713aSLionel Sambuc         return;
422*f4a2713aSLionel Sambuc       }
423*f4a2713aSLionel Sambuc     }
424*f4a2713aSLionel Sambuc   }
425*f4a2713aSLionel Sambuc }
426*f4a2713aSLionel Sambuc 
427*f4a2713aSLionel Sambuc void Lexer::setupAndLexVerbatimBlock(Token &T,
428*f4a2713aSLionel Sambuc                                      const char *TextBegin,
429*f4a2713aSLionel Sambuc                                      char Marker, const CommandInfo *Info) {
430*f4a2713aSLionel Sambuc   assert(Info->IsVerbatimBlockCommand);
431*f4a2713aSLionel Sambuc 
432*f4a2713aSLionel Sambuc   VerbatimBlockEndCommandName.clear();
433*f4a2713aSLionel Sambuc   VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
434*f4a2713aSLionel Sambuc   VerbatimBlockEndCommandName.append(Info->EndCommandName);
435*f4a2713aSLionel Sambuc 
436*f4a2713aSLionel Sambuc   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
437*f4a2713aSLionel Sambuc   T.setVerbatimBlockID(Info->getID());
438*f4a2713aSLionel Sambuc 
439*f4a2713aSLionel Sambuc   // If there is a newline following the verbatim opening command, skip the
440*f4a2713aSLionel Sambuc   // newline so that we don't create an tok::verbatim_block_line with empty
441*f4a2713aSLionel Sambuc   // text content.
442*f4a2713aSLionel Sambuc   if (BufferPtr != CommentEnd &&
443*f4a2713aSLionel Sambuc       isVerticalWhitespace(*BufferPtr)) {
444*f4a2713aSLionel Sambuc     BufferPtr = skipNewline(BufferPtr, CommentEnd);
445*f4a2713aSLionel Sambuc     State = LS_VerbatimBlockBody;
446*f4a2713aSLionel Sambuc     return;
447*f4a2713aSLionel Sambuc   }
448*f4a2713aSLionel Sambuc 
449*f4a2713aSLionel Sambuc   State = LS_VerbatimBlockFirstLine;
450*f4a2713aSLionel Sambuc }
451*f4a2713aSLionel Sambuc 
452*f4a2713aSLionel Sambuc void Lexer::lexVerbatimBlockFirstLine(Token &T) {
453*f4a2713aSLionel Sambuc again:
454*f4a2713aSLionel Sambuc   assert(BufferPtr < CommentEnd);
455*f4a2713aSLionel Sambuc 
456*f4a2713aSLionel Sambuc   // FIXME: It would be better to scan the text once, finding either the block
457*f4a2713aSLionel Sambuc   // end command or newline.
458*f4a2713aSLionel Sambuc   //
459*f4a2713aSLionel Sambuc   // Extract current line.
460*f4a2713aSLionel Sambuc   const char *Newline = findNewline(BufferPtr, CommentEnd);
461*f4a2713aSLionel Sambuc   StringRef Line(BufferPtr, Newline - BufferPtr);
462*f4a2713aSLionel Sambuc 
463*f4a2713aSLionel Sambuc   // Look for end command in current line.
464*f4a2713aSLionel Sambuc   size_t Pos = Line.find(VerbatimBlockEndCommandName);
465*f4a2713aSLionel Sambuc   const char *TextEnd;
466*f4a2713aSLionel Sambuc   const char *NextLine;
467*f4a2713aSLionel Sambuc   if (Pos == StringRef::npos) {
468*f4a2713aSLionel Sambuc     // Current line is completely verbatim.
469*f4a2713aSLionel Sambuc     TextEnd = Newline;
470*f4a2713aSLionel Sambuc     NextLine = skipNewline(Newline, CommentEnd);
471*f4a2713aSLionel Sambuc   } else if (Pos == 0) {
472*f4a2713aSLionel Sambuc     // Current line contains just an end command.
473*f4a2713aSLionel Sambuc     const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
474*f4a2713aSLionel Sambuc     StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
475*f4a2713aSLionel Sambuc     formTokenWithChars(T, End, tok::verbatim_block_end);
476*f4a2713aSLionel Sambuc     T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
477*f4a2713aSLionel Sambuc     State = LS_Normal;
478*f4a2713aSLionel Sambuc     return;
479*f4a2713aSLionel Sambuc   } else {
480*f4a2713aSLionel Sambuc     // There is some text, followed by end command.  Extract text first.
481*f4a2713aSLionel Sambuc     TextEnd = BufferPtr + Pos;
482*f4a2713aSLionel Sambuc     NextLine = TextEnd;
483*f4a2713aSLionel Sambuc     // If there is only whitespace before end command, skip whitespace.
484*f4a2713aSLionel Sambuc     if (isWhitespace(BufferPtr, TextEnd)) {
485*f4a2713aSLionel Sambuc       BufferPtr = TextEnd;
486*f4a2713aSLionel Sambuc       goto again;
487*f4a2713aSLionel Sambuc     }
488*f4a2713aSLionel Sambuc   }
489*f4a2713aSLionel Sambuc 
490*f4a2713aSLionel Sambuc   StringRef Text(BufferPtr, TextEnd - BufferPtr);
491*f4a2713aSLionel Sambuc   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
492*f4a2713aSLionel Sambuc   T.setVerbatimBlockText(Text);
493*f4a2713aSLionel Sambuc 
494*f4a2713aSLionel Sambuc   State = LS_VerbatimBlockBody;
495*f4a2713aSLionel Sambuc }
496*f4a2713aSLionel Sambuc 
497*f4a2713aSLionel Sambuc void Lexer::lexVerbatimBlockBody(Token &T) {
498*f4a2713aSLionel Sambuc   assert(State == LS_VerbatimBlockBody);
499*f4a2713aSLionel Sambuc 
500*f4a2713aSLionel Sambuc   if (CommentState == LCS_InsideCComment)
501*f4a2713aSLionel Sambuc     skipLineStartingDecorations();
502*f4a2713aSLionel Sambuc 
503*f4a2713aSLionel Sambuc   lexVerbatimBlockFirstLine(T);
504*f4a2713aSLionel Sambuc }
505*f4a2713aSLionel Sambuc 
506*f4a2713aSLionel Sambuc void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
507*f4a2713aSLionel Sambuc                                     const CommandInfo *Info) {
508*f4a2713aSLionel Sambuc   assert(Info->IsVerbatimLineCommand);
509*f4a2713aSLionel Sambuc   formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
510*f4a2713aSLionel Sambuc   T.setVerbatimLineID(Info->getID());
511*f4a2713aSLionel Sambuc 
512*f4a2713aSLionel Sambuc   State = LS_VerbatimLineText;
513*f4a2713aSLionel Sambuc }
514*f4a2713aSLionel Sambuc 
515*f4a2713aSLionel Sambuc void Lexer::lexVerbatimLineText(Token &T) {
516*f4a2713aSLionel Sambuc   assert(State == LS_VerbatimLineText);
517*f4a2713aSLionel Sambuc 
518*f4a2713aSLionel Sambuc   // Extract current line.
519*f4a2713aSLionel Sambuc   const char *Newline = findNewline(BufferPtr, CommentEnd);
520*f4a2713aSLionel Sambuc   const StringRef Text(BufferPtr, Newline - BufferPtr);
521*f4a2713aSLionel Sambuc   formTokenWithChars(T, Newline, tok::verbatim_line_text);
522*f4a2713aSLionel Sambuc   T.setVerbatimLineText(Text);
523*f4a2713aSLionel Sambuc 
524*f4a2713aSLionel Sambuc   State = LS_Normal;
525*f4a2713aSLionel Sambuc }
526*f4a2713aSLionel Sambuc 
527*f4a2713aSLionel Sambuc void Lexer::lexHTMLCharacterReference(Token &T) {
528*f4a2713aSLionel Sambuc   const char *TokenPtr = BufferPtr;
529*f4a2713aSLionel Sambuc   assert(*TokenPtr == '&');
530*f4a2713aSLionel Sambuc   TokenPtr++;
531*f4a2713aSLionel Sambuc   if (TokenPtr == CommentEnd) {
532*f4a2713aSLionel Sambuc     formTextToken(T, TokenPtr);
533*f4a2713aSLionel Sambuc     return;
534*f4a2713aSLionel Sambuc   }
535*f4a2713aSLionel Sambuc   const char *NamePtr;
536*f4a2713aSLionel Sambuc   bool isNamed = false;
537*f4a2713aSLionel Sambuc   bool isDecimal = false;
538*f4a2713aSLionel Sambuc   char C = *TokenPtr;
539*f4a2713aSLionel Sambuc   if (isHTMLNamedCharacterReferenceCharacter(C)) {
540*f4a2713aSLionel Sambuc     NamePtr = TokenPtr;
541*f4a2713aSLionel Sambuc     TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
542*f4a2713aSLionel Sambuc     isNamed = true;
543*f4a2713aSLionel Sambuc   } else if (C == '#') {
544*f4a2713aSLionel Sambuc     TokenPtr++;
545*f4a2713aSLionel Sambuc     if (TokenPtr == CommentEnd) {
546*f4a2713aSLionel Sambuc       formTextToken(T, TokenPtr);
547*f4a2713aSLionel Sambuc       return;
548*f4a2713aSLionel Sambuc     }
549*f4a2713aSLionel Sambuc     C = *TokenPtr;
550*f4a2713aSLionel Sambuc     if (isHTMLDecimalCharacterReferenceCharacter(C)) {
551*f4a2713aSLionel Sambuc       NamePtr = TokenPtr;
552*f4a2713aSLionel Sambuc       TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
553*f4a2713aSLionel Sambuc       isDecimal = true;
554*f4a2713aSLionel Sambuc     } else if (C == 'x' || C == 'X') {
555*f4a2713aSLionel Sambuc       TokenPtr++;
556*f4a2713aSLionel Sambuc       NamePtr = TokenPtr;
557*f4a2713aSLionel Sambuc       TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
558*f4a2713aSLionel Sambuc     } else {
559*f4a2713aSLionel Sambuc       formTextToken(T, TokenPtr);
560*f4a2713aSLionel Sambuc       return;
561*f4a2713aSLionel Sambuc     }
562*f4a2713aSLionel Sambuc   } else {
563*f4a2713aSLionel Sambuc     formTextToken(T, TokenPtr);
564*f4a2713aSLionel Sambuc     return;
565*f4a2713aSLionel Sambuc   }
566*f4a2713aSLionel Sambuc   if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
567*f4a2713aSLionel Sambuc       *TokenPtr != ';') {
568*f4a2713aSLionel Sambuc     formTextToken(T, TokenPtr);
569*f4a2713aSLionel Sambuc     return;
570*f4a2713aSLionel Sambuc   }
571*f4a2713aSLionel Sambuc   StringRef Name(NamePtr, TokenPtr - NamePtr);
572*f4a2713aSLionel Sambuc   TokenPtr++; // Skip semicolon.
573*f4a2713aSLionel Sambuc   StringRef Resolved;
574*f4a2713aSLionel Sambuc   if (isNamed)
575*f4a2713aSLionel Sambuc     Resolved = resolveHTMLNamedCharacterReference(Name);
576*f4a2713aSLionel Sambuc   else if (isDecimal)
577*f4a2713aSLionel Sambuc     Resolved = resolveHTMLDecimalCharacterReference(Name);
578*f4a2713aSLionel Sambuc   else
579*f4a2713aSLionel Sambuc     Resolved = resolveHTMLHexCharacterReference(Name);
580*f4a2713aSLionel Sambuc 
581*f4a2713aSLionel Sambuc   if (Resolved.empty()) {
582*f4a2713aSLionel Sambuc     formTextToken(T, TokenPtr);
583*f4a2713aSLionel Sambuc     return;
584*f4a2713aSLionel Sambuc   }
585*f4a2713aSLionel Sambuc   formTokenWithChars(T, TokenPtr, tok::text);
586*f4a2713aSLionel Sambuc   T.setText(Resolved);
587*f4a2713aSLionel Sambuc   return;
588*f4a2713aSLionel Sambuc }
589*f4a2713aSLionel Sambuc 
590*f4a2713aSLionel Sambuc void Lexer::setupAndLexHTMLStartTag(Token &T) {
591*f4a2713aSLionel Sambuc   assert(BufferPtr[0] == '<' &&
592*f4a2713aSLionel Sambuc          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
593*f4a2713aSLionel Sambuc   const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
594*f4a2713aSLionel Sambuc   StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
595*f4a2713aSLionel Sambuc   if (!isHTMLTagName(Name)) {
596*f4a2713aSLionel Sambuc     formTextToken(T, TagNameEnd);
597*f4a2713aSLionel Sambuc     return;
598*f4a2713aSLionel Sambuc   }
599*f4a2713aSLionel Sambuc 
600*f4a2713aSLionel Sambuc   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
601*f4a2713aSLionel Sambuc   T.setHTMLTagStartName(Name);
602*f4a2713aSLionel Sambuc 
603*f4a2713aSLionel Sambuc   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
604*f4a2713aSLionel Sambuc 
605*f4a2713aSLionel Sambuc   const char C = *BufferPtr;
606*f4a2713aSLionel Sambuc   if (BufferPtr != CommentEnd &&
607*f4a2713aSLionel Sambuc       (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
608*f4a2713aSLionel Sambuc     State = LS_HTMLStartTag;
609*f4a2713aSLionel Sambuc }
610*f4a2713aSLionel Sambuc 
611*f4a2713aSLionel Sambuc void Lexer::lexHTMLStartTag(Token &T) {
612*f4a2713aSLionel Sambuc   assert(State == LS_HTMLStartTag);
613*f4a2713aSLionel Sambuc 
614*f4a2713aSLionel Sambuc   const char *TokenPtr = BufferPtr;
615*f4a2713aSLionel Sambuc   char C = *TokenPtr;
616*f4a2713aSLionel Sambuc   if (isHTMLIdentifierCharacter(C)) {
617*f4a2713aSLionel Sambuc     TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
618*f4a2713aSLionel Sambuc     StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
619*f4a2713aSLionel Sambuc     formTokenWithChars(T, TokenPtr, tok::html_ident);
620*f4a2713aSLionel Sambuc     T.setHTMLIdent(Ident);
621*f4a2713aSLionel Sambuc   } else {
622*f4a2713aSLionel Sambuc     switch (C) {
623*f4a2713aSLionel Sambuc     case '=':
624*f4a2713aSLionel Sambuc       TokenPtr++;
625*f4a2713aSLionel Sambuc       formTokenWithChars(T, TokenPtr, tok::html_equals);
626*f4a2713aSLionel Sambuc       break;
627*f4a2713aSLionel Sambuc     case '\"':
628*f4a2713aSLionel Sambuc     case '\'': {
629*f4a2713aSLionel Sambuc       const char *OpenQuote = TokenPtr;
630*f4a2713aSLionel Sambuc       TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
631*f4a2713aSLionel Sambuc       const char *ClosingQuote = TokenPtr;
632*f4a2713aSLionel Sambuc       if (TokenPtr != CommentEnd) // Skip closing quote.
633*f4a2713aSLionel Sambuc         TokenPtr++;
634*f4a2713aSLionel Sambuc       formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
635*f4a2713aSLionel Sambuc       T.setHTMLQuotedString(StringRef(OpenQuote + 1,
636*f4a2713aSLionel Sambuc                                       ClosingQuote - (OpenQuote + 1)));
637*f4a2713aSLionel Sambuc       break;
638*f4a2713aSLionel Sambuc     }
639*f4a2713aSLionel Sambuc     case '>':
640*f4a2713aSLionel Sambuc       TokenPtr++;
641*f4a2713aSLionel Sambuc       formTokenWithChars(T, TokenPtr, tok::html_greater);
642*f4a2713aSLionel Sambuc       State = LS_Normal;
643*f4a2713aSLionel Sambuc       return;
644*f4a2713aSLionel Sambuc     case '/':
645*f4a2713aSLionel Sambuc       TokenPtr++;
646*f4a2713aSLionel Sambuc       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
647*f4a2713aSLionel Sambuc         TokenPtr++;
648*f4a2713aSLionel Sambuc         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
649*f4a2713aSLionel Sambuc       } else
650*f4a2713aSLionel Sambuc         formTextToken(T, TokenPtr);
651*f4a2713aSLionel Sambuc 
652*f4a2713aSLionel Sambuc       State = LS_Normal;
653*f4a2713aSLionel Sambuc       return;
654*f4a2713aSLionel Sambuc     }
655*f4a2713aSLionel Sambuc   }
656*f4a2713aSLionel Sambuc 
657*f4a2713aSLionel Sambuc   // Now look ahead and return to normal state if we don't see any HTML tokens
658*f4a2713aSLionel Sambuc   // ahead.
659*f4a2713aSLionel Sambuc   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
660*f4a2713aSLionel Sambuc   if (BufferPtr == CommentEnd) {
661*f4a2713aSLionel Sambuc     State = LS_Normal;
662*f4a2713aSLionel Sambuc     return;
663*f4a2713aSLionel Sambuc   }
664*f4a2713aSLionel Sambuc 
665*f4a2713aSLionel Sambuc   C = *BufferPtr;
666*f4a2713aSLionel Sambuc   if (!isHTMLIdentifierStartingCharacter(C) &&
667*f4a2713aSLionel Sambuc       C != '=' && C != '\"' && C != '\'' && C != '>') {
668*f4a2713aSLionel Sambuc     State = LS_Normal;
669*f4a2713aSLionel Sambuc     return;
670*f4a2713aSLionel Sambuc   }
671*f4a2713aSLionel Sambuc }
672*f4a2713aSLionel Sambuc 
673*f4a2713aSLionel Sambuc void Lexer::setupAndLexHTMLEndTag(Token &T) {
674*f4a2713aSLionel Sambuc   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
675*f4a2713aSLionel Sambuc 
676*f4a2713aSLionel Sambuc   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
677*f4a2713aSLionel Sambuc   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
678*f4a2713aSLionel Sambuc   StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
679*f4a2713aSLionel Sambuc   if (!isHTMLTagName(Name)) {
680*f4a2713aSLionel Sambuc     formTextToken(T, TagNameEnd);
681*f4a2713aSLionel Sambuc     return;
682*f4a2713aSLionel Sambuc   }
683*f4a2713aSLionel Sambuc 
684*f4a2713aSLionel Sambuc   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
685*f4a2713aSLionel Sambuc 
686*f4a2713aSLionel Sambuc   formTokenWithChars(T, End, tok::html_end_tag);
687*f4a2713aSLionel Sambuc   T.setHTMLTagEndName(Name);
688*f4a2713aSLionel Sambuc 
689*f4a2713aSLionel Sambuc   if (BufferPtr != CommentEnd && *BufferPtr == '>')
690*f4a2713aSLionel Sambuc     State = LS_HTMLEndTag;
691*f4a2713aSLionel Sambuc }
692*f4a2713aSLionel Sambuc 
693*f4a2713aSLionel Sambuc void Lexer::lexHTMLEndTag(Token &T) {
694*f4a2713aSLionel Sambuc   assert(BufferPtr != CommentEnd && *BufferPtr == '>');
695*f4a2713aSLionel Sambuc 
696*f4a2713aSLionel Sambuc   formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
697*f4a2713aSLionel Sambuc   State = LS_Normal;
698*f4a2713aSLionel Sambuc }
699*f4a2713aSLionel Sambuc 
700*f4a2713aSLionel Sambuc Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
701*f4a2713aSLionel Sambuc              const CommandTraits &Traits,
702*f4a2713aSLionel Sambuc              SourceLocation FileLoc,
703*f4a2713aSLionel Sambuc              const char *BufferStart, const char *BufferEnd):
704*f4a2713aSLionel Sambuc     Allocator(Allocator), Diags(Diags), Traits(Traits),
705*f4a2713aSLionel Sambuc     BufferStart(BufferStart), BufferEnd(BufferEnd),
706*f4a2713aSLionel Sambuc     FileLoc(FileLoc), BufferPtr(BufferStart),
707*f4a2713aSLionel Sambuc     CommentState(LCS_BeforeComment), State(LS_Normal) {
708*f4a2713aSLionel Sambuc }
709*f4a2713aSLionel Sambuc 
710*f4a2713aSLionel Sambuc void Lexer::lex(Token &T) {
711*f4a2713aSLionel Sambuc again:
712*f4a2713aSLionel Sambuc   switch (CommentState) {
713*f4a2713aSLionel Sambuc   case LCS_BeforeComment:
714*f4a2713aSLionel Sambuc     if (BufferPtr == BufferEnd) {
715*f4a2713aSLionel Sambuc       formTokenWithChars(T, BufferPtr, tok::eof);
716*f4a2713aSLionel Sambuc       return;
717*f4a2713aSLionel Sambuc     }
718*f4a2713aSLionel Sambuc 
719*f4a2713aSLionel Sambuc     assert(*BufferPtr == '/');
720*f4a2713aSLionel Sambuc     BufferPtr++; // Skip first slash.
721*f4a2713aSLionel Sambuc     switch(*BufferPtr) {
722*f4a2713aSLionel Sambuc     case '/': { // BCPL comment.
723*f4a2713aSLionel Sambuc       BufferPtr++; // Skip second slash.
724*f4a2713aSLionel Sambuc 
725*f4a2713aSLionel Sambuc       if (BufferPtr != BufferEnd) {
726*f4a2713aSLionel Sambuc         // Skip Doxygen magic marker, if it is present.
727*f4a2713aSLionel Sambuc         // It might be missing because of a typo //< or /*<, or because we
728*f4a2713aSLionel Sambuc         // merged this non-Doxygen comment into a bunch of Doxygen comments
729*f4a2713aSLionel Sambuc         // around it: /** ... */ /* ... */ /** ... */
730*f4a2713aSLionel Sambuc         const char C = *BufferPtr;
731*f4a2713aSLionel Sambuc         if (C == '/' || C == '!')
732*f4a2713aSLionel Sambuc           BufferPtr++;
733*f4a2713aSLionel Sambuc       }
734*f4a2713aSLionel Sambuc 
735*f4a2713aSLionel Sambuc       // Skip less-than symbol that marks trailing comments.
736*f4a2713aSLionel Sambuc       // Skip it even if the comment is not a Doxygen one, because //< and /*<
737*f4a2713aSLionel Sambuc       // are frequent typos.
738*f4a2713aSLionel Sambuc       if (BufferPtr != BufferEnd && *BufferPtr == '<')
739*f4a2713aSLionel Sambuc         BufferPtr++;
740*f4a2713aSLionel Sambuc 
741*f4a2713aSLionel Sambuc       CommentState = LCS_InsideBCPLComment;
742*f4a2713aSLionel Sambuc       if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
743*f4a2713aSLionel Sambuc         State = LS_Normal;
744*f4a2713aSLionel Sambuc       CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
745*f4a2713aSLionel Sambuc       goto again;
746*f4a2713aSLionel Sambuc     }
747*f4a2713aSLionel Sambuc     case '*': { // C comment.
748*f4a2713aSLionel Sambuc       BufferPtr++; // Skip star.
749*f4a2713aSLionel Sambuc 
750*f4a2713aSLionel Sambuc       // Skip Doxygen magic marker.
751*f4a2713aSLionel Sambuc       const char C = *BufferPtr;
752*f4a2713aSLionel Sambuc       if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
753*f4a2713aSLionel Sambuc         BufferPtr++;
754*f4a2713aSLionel Sambuc 
755*f4a2713aSLionel Sambuc       // Skip less-than symbol that marks trailing comments.
756*f4a2713aSLionel Sambuc       if (BufferPtr != BufferEnd && *BufferPtr == '<')
757*f4a2713aSLionel Sambuc         BufferPtr++;
758*f4a2713aSLionel Sambuc 
759*f4a2713aSLionel Sambuc       CommentState = LCS_InsideCComment;
760*f4a2713aSLionel Sambuc       State = LS_Normal;
761*f4a2713aSLionel Sambuc       CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
762*f4a2713aSLionel Sambuc       goto again;
763*f4a2713aSLionel Sambuc     }
764*f4a2713aSLionel Sambuc     default:
765*f4a2713aSLionel Sambuc       llvm_unreachable("second character of comment should be '/' or '*'");
766*f4a2713aSLionel Sambuc     }
767*f4a2713aSLionel Sambuc 
768*f4a2713aSLionel Sambuc   case LCS_BetweenComments: {
769*f4a2713aSLionel Sambuc     // Consecutive comments are extracted only if there is only whitespace
770*f4a2713aSLionel Sambuc     // between them.  So we can search for the start of the next comment.
771*f4a2713aSLionel Sambuc     const char *EndWhitespace = BufferPtr;
772*f4a2713aSLionel Sambuc     while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
773*f4a2713aSLionel Sambuc       EndWhitespace++;
774*f4a2713aSLionel Sambuc 
775*f4a2713aSLionel Sambuc     // Turn any whitespace between comments (and there is only whitespace
776*f4a2713aSLionel Sambuc     // between them -- guaranteed by comment extraction) into a newline.  We
777*f4a2713aSLionel Sambuc     // have two newlines between C comments in total (first one was synthesized
778*f4a2713aSLionel Sambuc     // after a comment).
779*f4a2713aSLionel Sambuc     formTokenWithChars(T, EndWhitespace, tok::newline);
780*f4a2713aSLionel Sambuc 
781*f4a2713aSLionel Sambuc     CommentState = LCS_BeforeComment;
782*f4a2713aSLionel Sambuc     break;
783*f4a2713aSLionel Sambuc   }
784*f4a2713aSLionel Sambuc 
785*f4a2713aSLionel Sambuc   case LCS_InsideBCPLComment:
786*f4a2713aSLionel Sambuc   case LCS_InsideCComment:
787*f4a2713aSLionel Sambuc     if (BufferPtr != CommentEnd) {
788*f4a2713aSLionel Sambuc       lexCommentText(T);
789*f4a2713aSLionel Sambuc       break;
790*f4a2713aSLionel Sambuc     } else {
791*f4a2713aSLionel Sambuc       // Skip C comment closing sequence.
792*f4a2713aSLionel Sambuc       if (CommentState == LCS_InsideCComment) {
793*f4a2713aSLionel Sambuc         assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
794*f4a2713aSLionel Sambuc         BufferPtr += 2;
795*f4a2713aSLionel Sambuc         assert(BufferPtr <= BufferEnd);
796*f4a2713aSLionel Sambuc 
797*f4a2713aSLionel Sambuc         // Synthenize newline just after the C comment, regardless if there is
798*f4a2713aSLionel Sambuc         // actually a newline.
799*f4a2713aSLionel Sambuc         formTokenWithChars(T, BufferPtr, tok::newline);
800*f4a2713aSLionel Sambuc 
801*f4a2713aSLionel Sambuc         CommentState = LCS_BetweenComments;
802*f4a2713aSLionel Sambuc         break;
803*f4a2713aSLionel Sambuc       } else {
804*f4a2713aSLionel Sambuc         // Don't synthesized a newline after BCPL comment.
805*f4a2713aSLionel Sambuc         CommentState = LCS_BetweenComments;
806*f4a2713aSLionel Sambuc         goto again;
807*f4a2713aSLionel Sambuc       }
808*f4a2713aSLionel Sambuc     }
809*f4a2713aSLionel Sambuc   }
810*f4a2713aSLionel Sambuc }
811*f4a2713aSLionel Sambuc 
812*f4a2713aSLionel Sambuc StringRef Lexer::getSpelling(const Token &Tok,
813*f4a2713aSLionel Sambuc                              const SourceManager &SourceMgr,
814*f4a2713aSLionel Sambuc                              bool *Invalid) const {
815*f4a2713aSLionel Sambuc   SourceLocation Loc = Tok.getLocation();
816*f4a2713aSLionel Sambuc   std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
817*f4a2713aSLionel Sambuc 
818*f4a2713aSLionel Sambuc   bool InvalidTemp = false;
819*f4a2713aSLionel Sambuc   StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
820*f4a2713aSLionel Sambuc   if (InvalidTemp) {
821*f4a2713aSLionel Sambuc     *Invalid = true;
822*f4a2713aSLionel Sambuc     return StringRef();
823*f4a2713aSLionel Sambuc   }
824*f4a2713aSLionel Sambuc 
825*f4a2713aSLionel Sambuc   const char *Begin = File.data() + LocInfo.second;
826*f4a2713aSLionel Sambuc   return StringRef(Begin, Tok.getLength());
827*f4a2713aSLionel Sambuc }
828*f4a2713aSLionel Sambuc 
829*f4a2713aSLionel Sambuc } // end namespace comments
830*f4a2713aSLionel Sambuc } // end namespace clang
831*f4a2713aSLionel Sambuc 
832