xref: /minix3/external/bsd/llvm/dist/clang/lib/AST/CommentLexer.cpp (revision 0a6a1f1d05b60e214de2f05a7310ddd1f0e590e7)
1f4a2713aSLionel Sambuc #include "clang/AST/CommentLexer.h"
2f4a2713aSLionel Sambuc #include "clang/AST/CommentCommandTraits.h"
3f4a2713aSLionel Sambuc #include "clang/AST/CommentDiagnostic.h"
4f4a2713aSLionel Sambuc #include "clang/Basic/CharInfo.h"
5f4a2713aSLionel Sambuc #include "llvm/ADT/StringExtras.h"
6f4a2713aSLionel Sambuc #include "llvm/ADT/StringSwitch.h"
7f4a2713aSLionel Sambuc #include "llvm/Support/ConvertUTF.h"
8f4a2713aSLionel Sambuc #include "llvm/Support/ErrorHandling.h"
9f4a2713aSLionel Sambuc 
10f4a2713aSLionel Sambuc namespace clang {
11f4a2713aSLionel Sambuc namespace comments {
12f4a2713aSLionel Sambuc 
dump(const Lexer & L,const SourceManager & SM) const13f4a2713aSLionel Sambuc void Token::dump(const Lexer &L, const SourceManager &SM) const {
14f4a2713aSLionel Sambuc   llvm::errs() << "comments::Token Kind=" << Kind << " ";
15f4a2713aSLionel Sambuc   Loc.dump(SM);
16f4a2713aSLionel Sambuc   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
17f4a2713aSLionel Sambuc }
18f4a2713aSLionel Sambuc 
isHTMLNamedCharacterReferenceCharacter(char C)19f4a2713aSLionel Sambuc static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
20f4a2713aSLionel Sambuc   return isLetter(C);
21f4a2713aSLionel Sambuc }
22f4a2713aSLionel Sambuc 
isHTMLDecimalCharacterReferenceCharacter(char C)23f4a2713aSLionel Sambuc static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
24f4a2713aSLionel Sambuc   return isDigit(C);
25f4a2713aSLionel Sambuc }
26f4a2713aSLionel Sambuc 
isHTMLHexCharacterReferenceCharacter(char C)27f4a2713aSLionel Sambuc static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
28f4a2713aSLionel Sambuc   return isHexDigit(C);
29f4a2713aSLionel Sambuc }
30f4a2713aSLionel Sambuc 
convertCodePointToUTF8(llvm::BumpPtrAllocator & Allocator,unsigned CodePoint)31f4a2713aSLionel Sambuc static inline StringRef convertCodePointToUTF8(
32f4a2713aSLionel Sambuc                                       llvm::BumpPtrAllocator &Allocator,
33f4a2713aSLionel Sambuc                                       unsigned CodePoint) {
34f4a2713aSLionel Sambuc   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
35f4a2713aSLionel Sambuc   char *ResolvedPtr = Resolved;
36f4a2713aSLionel Sambuc   if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
37f4a2713aSLionel Sambuc     return StringRef(Resolved, ResolvedPtr - Resolved);
38f4a2713aSLionel Sambuc   else
39f4a2713aSLionel Sambuc     return StringRef();
40f4a2713aSLionel Sambuc }
41f4a2713aSLionel Sambuc 
42f4a2713aSLionel Sambuc namespace {
43f4a2713aSLionel Sambuc 
44f4a2713aSLionel Sambuc #include "clang/AST/CommentHTMLTags.inc"
45f4a2713aSLionel Sambuc #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
46f4a2713aSLionel Sambuc 
47f4a2713aSLionel Sambuc } // unnamed namespace
48f4a2713aSLionel Sambuc 
resolveHTMLNamedCharacterReference(StringRef Name) const49f4a2713aSLionel Sambuc StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
50f4a2713aSLionel Sambuc   // Fast path, first check a few most widely used named character references.
51f4a2713aSLionel Sambuc   return llvm::StringSwitch<StringRef>(Name)
52f4a2713aSLionel Sambuc       .Case("amp", "&")
53f4a2713aSLionel Sambuc       .Case("lt", "<")
54f4a2713aSLionel Sambuc       .Case("gt", ">")
55f4a2713aSLionel Sambuc       .Case("quot", "\"")
56f4a2713aSLionel Sambuc       .Case("apos", "\'")
57f4a2713aSLionel Sambuc       // Slow path.
58f4a2713aSLionel Sambuc       .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
59f4a2713aSLionel Sambuc }
60f4a2713aSLionel Sambuc 
resolveHTMLDecimalCharacterReference(StringRef Name) const61f4a2713aSLionel Sambuc StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
62f4a2713aSLionel Sambuc   unsigned CodePoint = 0;
63f4a2713aSLionel Sambuc   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
64f4a2713aSLionel Sambuc     assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
65f4a2713aSLionel Sambuc     CodePoint *= 10;
66f4a2713aSLionel Sambuc     CodePoint += Name[i] - '0';
67f4a2713aSLionel Sambuc   }
68f4a2713aSLionel Sambuc   return convertCodePointToUTF8(Allocator, CodePoint);
69f4a2713aSLionel Sambuc }
70f4a2713aSLionel Sambuc 
resolveHTMLHexCharacterReference(StringRef Name) const71f4a2713aSLionel Sambuc StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
72f4a2713aSLionel Sambuc   unsigned CodePoint = 0;
73f4a2713aSLionel Sambuc   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
74f4a2713aSLionel Sambuc     CodePoint *= 16;
75f4a2713aSLionel Sambuc     const char C = Name[i];
76f4a2713aSLionel Sambuc     assert(isHTMLHexCharacterReferenceCharacter(C));
77f4a2713aSLionel Sambuc     CodePoint += llvm::hexDigitValue(C);
78f4a2713aSLionel Sambuc   }
79f4a2713aSLionel Sambuc   return convertCodePointToUTF8(Allocator, CodePoint);
80f4a2713aSLionel Sambuc }
81f4a2713aSLionel Sambuc 
skipLineStartingDecorations()82f4a2713aSLionel Sambuc void Lexer::skipLineStartingDecorations() {
83f4a2713aSLionel Sambuc   // This function should be called only for C comments
84f4a2713aSLionel Sambuc   assert(CommentState == LCS_InsideCComment);
85f4a2713aSLionel Sambuc 
86f4a2713aSLionel Sambuc   if (BufferPtr == CommentEnd)
87f4a2713aSLionel Sambuc     return;
88f4a2713aSLionel Sambuc 
89f4a2713aSLionel Sambuc   switch (*BufferPtr) {
90f4a2713aSLionel Sambuc   case ' ':
91f4a2713aSLionel Sambuc   case '\t':
92f4a2713aSLionel Sambuc   case '\f':
93f4a2713aSLionel Sambuc   case '\v': {
94f4a2713aSLionel Sambuc     const char *NewBufferPtr = BufferPtr;
95f4a2713aSLionel Sambuc     NewBufferPtr++;
96f4a2713aSLionel Sambuc     if (NewBufferPtr == CommentEnd)
97f4a2713aSLionel Sambuc       return;
98f4a2713aSLionel Sambuc 
99f4a2713aSLionel Sambuc     char C = *NewBufferPtr;
100f4a2713aSLionel Sambuc     while (isHorizontalWhitespace(C)) {
101f4a2713aSLionel Sambuc       NewBufferPtr++;
102f4a2713aSLionel Sambuc       if (NewBufferPtr == CommentEnd)
103f4a2713aSLionel Sambuc         return;
104f4a2713aSLionel Sambuc       C = *NewBufferPtr;
105f4a2713aSLionel Sambuc     }
106f4a2713aSLionel Sambuc     if (C == '*')
107f4a2713aSLionel Sambuc       BufferPtr = NewBufferPtr + 1;
108f4a2713aSLionel Sambuc     break;
109f4a2713aSLionel Sambuc   }
110f4a2713aSLionel Sambuc   case '*':
111f4a2713aSLionel Sambuc     BufferPtr++;
112f4a2713aSLionel Sambuc     break;
113f4a2713aSLionel Sambuc   }
114f4a2713aSLionel Sambuc }
115f4a2713aSLionel Sambuc 
116f4a2713aSLionel Sambuc namespace {
117f4a2713aSLionel Sambuc /// Returns pointer to the first newline character in the string.
findNewline(const char * BufferPtr,const char * BufferEnd)118f4a2713aSLionel Sambuc const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
119f4a2713aSLionel Sambuc   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
120f4a2713aSLionel Sambuc     if (isVerticalWhitespace(*BufferPtr))
121f4a2713aSLionel Sambuc       return BufferPtr;
122f4a2713aSLionel Sambuc   }
123f4a2713aSLionel Sambuc   return BufferEnd;
124f4a2713aSLionel Sambuc }
125f4a2713aSLionel Sambuc 
/// Steps over one line terminator at \p BufferPtr: "\n", "\r" or "\r\n".
///
/// \p BufferPtr must either equal \p BufferEnd (in which case it is returned
/// unchanged) or point at '\n' or '\r'.
///
/// \returns a pointer to the character after the line terminator.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  ++BufferPtr;
  if (First == '\n')
    return BufferPtr;

  assert(First == '\r');
  // A carriage return may be followed by a line feed belonging to the same
  // line terminator.
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
140f4a2713aSLionel Sambuc 
skipNamedCharacterReference(const char * BufferPtr,const char * BufferEnd)141f4a2713aSLionel Sambuc const char *skipNamedCharacterReference(const char *BufferPtr,
142f4a2713aSLionel Sambuc                                         const char *BufferEnd) {
143f4a2713aSLionel Sambuc   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
144f4a2713aSLionel Sambuc     if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
145f4a2713aSLionel Sambuc       return BufferPtr;
146f4a2713aSLionel Sambuc   }
147f4a2713aSLionel Sambuc   return BufferEnd;
148f4a2713aSLionel Sambuc }
149f4a2713aSLionel Sambuc 
skipDecimalCharacterReference(const char * BufferPtr,const char * BufferEnd)150f4a2713aSLionel Sambuc const char *skipDecimalCharacterReference(const char *BufferPtr,
151f4a2713aSLionel Sambuc                                           const char *BufferEnd) {
152f4a2713aSLionel Sambuc   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
153f4a2713aSLionel Sambuc     if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
154f4a2713aSLionel Sambuc       return BufferPtr;
155f4a2713aSLionel Sambuc   }
156f4a2713aSLionel Sambuc   return BufferEnd;
157f4a2713aSLionel Sambuc }
158f4a2713aSLionel Sambuc 
skipHexCharacterReference(const char * BufferPtr,const char * BufferEnd)159f4a2713aSLionel Sambuc const char *skipHexCharacterReference(const char *BufferPtr,
160f4a2713aSLionel Sambuc                                       const char *BufferEnd) {
161f4a2713aSLionel Sambuc   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
162f4a2713aSLionel Sambuc     if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
163f4a2713aSLionel Sambuc       return BufferPtr;
164f4a2713aSLionel Sambuc   }
165f4a2713aSLionel Sambuc   return BufferEnd;
166f4a2713aSLionel Sambuc }
167f4a2713aSLionel Sambuc 
isHTMLIdentifierStartingCharacter(char C)168f4a2713aSLionel Sambuc bool isHTMLIdentifierStartingCharacter(char C) {
169f4a2713aSLionel Sambuc   return isLetter(C);
170f4a2713aSLionel Sambuc }
171f4a2713aSLionel Sambuc 
isHTMLIdentifierCharacter(char C)172f4a2713aSLionel Sambuc bool isHTMLIdentifierCharacter(char C) {
173f4a2713aSLionel Sambuc   return isAlphanumeric(C);
174f4a2713aSLionel Sambuc }
175f4a2713aSLionel Sambuc 
skipHTMLIdentifier(const char * BufferPtr,const char * BufferEnd)176f4a2713aSLionel Sambuc const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
177f4a2713aSLionel Sambuc   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
178f4a2713aSLionel Sambuc     if (!isHTMLIdentifierCharacter(*BufferPtr))
179f4a2713aSLionel Sambuc       return BufferPtr;
180f4a2713aSLionel Sambuc   }
181f4a2713aSLionel Sambuc   return BufferEnd;
182f4a2713aSLionel Sambuc }
183f4a2713aSLionel Sambuc 
/// Skip an HTML string quoted in single or double quotes.  A quote character
/// that is immediately preceded by a backslash does not terminate the string.
///
/// \p BufferPtr must point at the opening quote.
///
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  // Begin scanning right after the opening quote.
  const char *Cur = BufferPtr + 1;
  while (Cur != BufferEnd) {
    const bool IsQuote = *Cur == Quote;
    if (IsQuote && Cur[-1] != '\\')
      return Cur;
    ++Cur;
  }
  return BufferEnd;
}
201f4a2713aSLionel Sambuc 
skipWhitespace(const char * BufferPtr,const char * BufferEnd)202f4a2713aSLionel Sambuc const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
203f4a2713aSLionel Sambuc   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
204f4a2713aSLionel Sambuc     if (!isWhitespace(*BufferPtr))
205f4a2713aSLionel Sambuc       return BufferPtr;
206f4a2713aSLionel Sambuc   }
207f4a2713aSLionel Sambuc   return BufferEnd;
208f4a2713aSLionel Sambuc }
209f4a2713aSLionel Sambuc 
isWhitespace(const char * BufferPtr,const char * BufferEnd)210f4a2713aSLionel Sambuc bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
211f4a2713aSLionel Sambuc   return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
212f4a2713aSLionel Sambuc }
213f4a2713aSLionel Sambuc 
isCommandNameStartCharacter(char C)214f4a2713aSLionel Sambuc bool isCommandNameStartCharacter(char C) {
215f4a2713aSLionel Sambuc   return isLetter(C);
216f4a2713aSLionel Sambuc }
217f4a2713aSLionel Sambuc 
isCommandNameCharacter(char C)218f4a2713aSLionel Sambuc bool isCommandNameCharacter(char C) {
219f4a2713aSLionel Sambuc   return isAlphanumeric(C);
220f4a2713aSLionel Sambuc }
221f4a2713aSLionel Sambuc 
skipCommandName(const char * BufferPtr,const char * BufferEnd)222f4a2713aSLionel Sambuc const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
223f4a2713aSLionel Sambuc   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
224f4a2713aSLionel Sambuc     if (!isCommandNameCharacter(*BufferPtr))
225f4a2713aSLionel Sambuc       return BufferPtr;
226f4a2713aSLionel Sambuc   }
227f4a2713aSLionel Sambuc   return BufferEnd;
228f4a2713aSLionel Sambuc }
229f4a2713aSLionel Sambuc 
230f4a2713aSLionel Sambuc /// Return the one past end pointer for BCPL comments.
231f4a2713aSLionel Sambuc /// Handles newlines escaped with backslash or trigraph for backslahs.
findBCPLCommentEnd(const char * BufferPtr,const char * BufferEnd)232f4a2713aSLionel Sambuc const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
233f4a2713aSLionel Sambuc   const char *CurPtr = BufferPtr;
234f4a2713aSLionel Sambuc   while (CurPtr != BufferEnd) {
235f4a2713aSLionel Sambuc     while (!isVerticalWhitespace(*CurPtr)) {
236f4a2713aSLionel Sambuc       CurPtr++;
237f4a2713aSLionel Sambuc       if (CurPtr == BufferEnd)
238f4a2713aSLionel Sambuc         return BufferEnd;
239f4a2713aSLionel Sambuc     }
240f4a2713aSLionel Sambuc     // We found a newline, check if it is escaped.
241f4a2713aSLionel Sambuc     const char *EscapePtr = CurPtr - 1;
242f4a2713aSLionel Sambuc     while(isHorizontalWhitespace(*EscapePtr))
243f4a2713aSLionel Sambuc       EscapePtr--;
244f4a2713aSLionel Sambuc 
245f4a2713aSLionel Sambuc     if (*EscapePtr == '\\' ||
246f4a2713aSLionel Sambuc         (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
247f4a2713aSLionel Sambuc          EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
248f4a2713aSLionel Sambuc       // We found an escaped newline.
249f4a2713aSLionel Sambuc       CurPtr = skipNewline(CurPtr, BufferEnd);
250f4a2713aSLionel Sambuc     } else
251f4a2713aSLionel Sambuc       return CurPtr; // Not an escaped newline.
252f4a2713aSLionel Sambuc   }
253f4a2713aSLionel Sambuc   return BufferEnd;
254f4a2713aSLionel Sambuc }
255f4a2713aSLionel Sambuc 
256f4a2713aSLionel Sambuc /// Return the one past end pointer for C comments.
257f4a2713aSLionel Sambuc /// Very dumb, does not handle escaped newlines or trigraphs.
findCCommentEnd(const char * BufferPtr,const char * BufferEnd)258f4a2713aSLionel Sambuc const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
259f4a2713aSLionel Sambuc   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
260f4a2713aSLionel Sambuc     if (*BufferPtr == '*') {
261f4a2713aSLionel Sambuc       assert(BufferPtr + 1 != BufferEnd);
262f4a2713aSLionel Sambuc       if (*(BufferPtr + 1) == '/')
263f4a2713aSLionel Sambuc         return BufferPtr;
264f4a2713aSLionel Sambuc     }
265f4a2713aSLionel Sambuc   }
266f4a2713aSLionel Sambuc   llvm_unreachable("buffer end hit before '*/' was seen");
267f4a2713aSLionel Sambuc }
268f4a2713aSLionel Sambuc 
269f4a2713aSLionel Sambuc } // unnamed namespace
270f4a2713aSLionel Sambuc 
formTokenWithChars(Token & Result,const char * TokEnd,tok::TokenKind Kind)271*0a6a1f1dSLionel Sambuc void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
272*0a6a1f1dSLionel Sambuc                                tok::TokenKind Kind) {
273*0a6a1f1dSLionel Sambuc   const unsigned TokLen = TokEnd - BufferPtr;
274*0a6a1f1dSLionel Sambuc   Result.setLocation(getSourceLocation(BufferPtr));
275*0a6a1f1dSLionel Sambuc   Result.setKind(Kind);
276*0a6a1f1dSLionel Sambuc   Result.setLength(TokLen);
277*0a6a1f1dSLionel Sambuc #ifndef NDEBUG
278*0a6a1f1dSLionel Sambuc   Result.TextPtr = "<UNSET>";
279*0a6a1f1dSLionel Sambuc   Result.IntVal = 7;
280*0a6a1f1dSLionel Sambuc #endif
281*0a6a1f1dSLionel Sambuc   BufferPtr = TokEnd;
282*0a6a1f1dSLionel Sambuc }
283*0a6a1f1dSLionel Sambuc 
lexCommentText(Token & T)284f4a2713aSLionel Sambuc void Lexer::lexCommentText(Token &T) {
285f4a2713aSLionel Sambuc   assert(CommentState == LCS_InsideBCPLComment ||
286f4a2713aSLionel Sambuc          CommentState == LCS_InsideCComment);
287f4a2713aSLionel Sambuc 
288f4a2713aSLionel Sambuc   switch (State) {
289f4a2713aSLionel Sambuc   case LS_Normal:
290f4a2713aSLionel Sambuc     break;
291f4a2713aSLionel Sambuc   case LS_VerbatimBlockFirstLine:
292f4a2713aSLionel Sambuc     lexVerbatimBlockFirstLine(T);
293f4a2713aSLionel Sambuc     return;
294f4a2713aSLionel Sambuc   case LS_VerbatimBlockBody:
295f4a2713aSLionel Sambuc     lexVerbatimBlockBody(T);
296f4a2713aSLionel Sambuc     return;
297f4a2713aSLionel Sambuc   case LS_VerbatimLineText:
298f4a2713aSLionel Sambuc     lexVerbatimLineText(T);
299f4a2713aSLionel Sambuc     return;
300f4a2713aSLionel Sambuc   case LS_HTMLStartTag:
301f4a2713aSLionel Sambuc     lexHTMLStartTag(T);
302f4a2713aSLionel Sambuc     return;
303f4a2713aSLionel Sambuc   case LS_HTMLEndTag:
304f4a2713aSLionel Sambuc     lexHTMLEndTag(T);
305f4a2713aSLionel Sambuc     return;
306f4a2713aSLionel Sambuc   }
307f4a2713aSLionel Sambuc 
308f4a2713aSLionel Sambuc   assert(State == LS_Normal);
309f4a2713aSLionel Sambuc 
310f4a2713aSLionel Sambuc   const char *TokenPtr = BufferPtr;
311f4a2713aSLionel Sambuc   assert(TokenPtr < CommentEnd);
312f4a2713aSLionel Sambuc   while (TokenPtr != CommentEnd) {
313f4a2713aSLionel Sambuc     switch(*TokenPtr) {
314f4a2713aSLionel Sambuc       case '\\':
315f4a2713aSLionel Sambuc       case '@': {
316f4a2713aSLionel Sambuc         // Commands that start with a backslash and commands that start with
317f4a2713aSLionel Sambuc         // 'at' have equivalent semantics.  But we keep information about the
318f4a2713aSLionel Sambuc         // exact syntax in AST for comments.
319f4a2713aSLionel Sambuc         tok::TokenKind CommandKind =
320f4a2713aSLionel Sambuc             (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
321f4a2713aSLionel Sambuc         TokenPtr++;
322f4a2713aSLionel Sambuc         if (TokenPtr == CommentEnd) {
323f4a2713aSLionel Sambuc           formTextToken(T, TokenPtr);
324f4a2713aSLionel Sambuc           return;
325f4a2713aSLionel Sambuc         }
326f4a2713aSLionel Sambuc         char C = *TokenPtr;
327f4a2713aSLionel Sambuc         switch (C) {
328f4a2713aSLionel Sambuc         default:
329f4a2713aSLionel Sambuc           break;
330f4a2713aSLionel Sambuc 
331f4a2713aSLionel Sambuc         case '\\': case '@': case '&': case '$':
332f4a2713aSLionel Sambuc         case '#':  case '<': case '>': case '%':
333f4a2713aSLionel Sambuc         case '\"': case '.': case ':':
334f4a2713aSLionel Sambuc           // This is one of \\ \@ \& \$ etc escape sequences.
335f4a2713aSLionel Sambuc           TokenPtr++;
336f4a2713aSLionel Sambuc           if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
337f4a2713aSLionel Sambuc             // This is the \:: escape sequence.
338f4a2713aSLionel Sambuc             TokenPtr++;
339f4a2713aSLionel Sambuc           }
340f4a2713aSLionel Sambuc           StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
341f4a2713aSLionel Sambuc           formTokenWithChars(T, TokenPtr, tok::text);
342f4a2713aSLionel Sambuc           T.setText(UnescapedText);
343f4a2713aSLionel Sambuc           return;
344f4a2713aSLionel Sambuc         }
345f4a2713aSLionel Sambuc 
346f4a2713aSLionel Sambuc         // Don't make zero-length commands.
347f4a2713aSLionel Sambuc         if (!isCommandNameStartCharacter(*TokenPtr)) {
348f4a2713aSLionel Sambuc           formTextToken(T, TokenPtr);
349f4a2713aSLionel Sambuc           return;
350f4a2713aSLionel Sambuc         }
351f4a2713aSLionel Sambuc 
352f4a2713aSLionel Sambuc         TokenPtr = skipCommandName(TokenPtr, CommentEnd);
353f4a2713aSLionel Sambuc         unsigned Length = TokenPtr - (BufferPtr + 1);
354f4a2713aSLionel Sambuc 
355f4a2713aSLionel Sambuc         // Hardcoded support for lexing LaTeX formula commands
356f4a2713aSLionel Sambuc         // \f$ \f[ \f] \f{ \f} as a single command.
357f4a2713aSLionel Sambuc         if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
358f4a2713aSLionel Sambuc           C = *TokenPtr;
359f4a2713aSLionel Sambuc           if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
360f4a2713aSLionel Sambuc             TokenPtr++;
361f4a2713aSLionel Sambuc             Length++;
362f4a2713aSLionel Sambuc           }
363f4a2713aSLionel Sambuc         }
364f4a2713aSLionel Sambuc 
365*0a6a1f1dSLionel Sambuc         StringRef CommandName(BufferPtr + 1, Length);
366f4a2713aSLionel Sambuc 
367f4a2713aSLionel Sambuc         const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
368f4a2713aSLionel Sambuc         if (!Info) {
369f4a2713aSLionel Sambuc           if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
370f4a2713aSLionel Sambuc             StringRef CorrectedName = Info->Name;
371*0a6a1f1dSLionel Sambuc             SourceLocation Loc = getSourceLocation(BufferPtr);
372*0a6a1f1dSLionel Sambuc             SourceRange CommandRange(Loc.getLocWithOffset(1),
373*0a6a1f1dSLionel Sambuc                                      getSourceLocation(TokenPtr));
374*0a6a1f1dSLionel Sambuc             Diag(Loc, diag::warn_correct_comment_command_name)
375f4a2713aSLionel Sambuc               << CommandName << CorrectedName
376f4a2713aSLionel Sambuc               << FixItHint::CreateReplacement(CommandRange, CorrectedName);
377f4a2713aSLionel Sambuc           } else {
378*0a6a1f1dSLionel Sambuc             formTokenWithChars(T, TokenPtr, tok::unknown_command);
379*0a6a1f1dSLionel Sambuc             T.setUnknownCommandName(CommandName);
380f4a2713aSLionel Sambuc             Diag(T.getLocation(), diag::warn_unknown_comment_command_name);
381f4a2713aSLionel Sambuc             return;
382f4a2713aSLionel Sambuc           }
383f4a2713aSLionel Sambuc         }
384f4a2713aSLionel Sambuc         if (Info->IsVerbatimBlockCommand) {
385f4a2713aSLionel Sambuc           setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
386f4a2713aSLionel Sambuc           return;
387f4a2713aSLionel Sambuc         }
388f4a2713aSLionel Sambuc         if (Info->IsVerbatimLineCommand) {
389f4a2713aSLionel Sambuc           setupAndLexVerbatimLine(T, TokenPtr, Info);
390f4a2713aSLionel Sambuc           return;
391f4a2713aSLionel Sambuc         }
392f4a2713aSLionel Sambuc         formTokenWithChars(T, TokenPtr, CommandKind);
393f4a2713aSLionel Sambuc         T.setCommandID(Info->getID());
394f4a2713aSLionel Sambuc         return;
395f4a2713aSLionel Sambuc       }
396f4a2713aSLionel Sambuc 
397f4a2713aSLionel Sambuc       case '&':
398f4a2713aSLionel Sambuc         lexHTMLCharacterReference(T);
399f4a2713aSLionel Sambuc         return;
400f4a2713aSLionel Sambuc 
401f4a2713aSLionel Sambuc       case '<': {
402f4a2713aSLionel Sambuc         TokenPtr++;
403f4a2713aSLionel Sambuc         if (TokenPtr == CommentEnd) {
404f4a2713aSLionel Sambuc           formTextToken(T, TokenPtr);
405f4a2713aSLionel Sambuc           return;
406f4a2713aSLionel Sambuc         }
407f4a2713aSLionel Sambuc         const char C = *TokenPtr;
408f4a2713aSLionel Sambuc         if (isHTMLIdentifierStartingCharacter(C))
409f4a2713aSLionel Sambuc           setupAndLexHTMLStartTag(T);
410f4a2713aSLionel Sambuc         else if (C == '/')
411f4a2713aSLionel Sambuc           setupAndLexHTMLEndTag(T);
412f4a2713aSLionel Sambuc         else
413f4a2713aSLionel Sambuc           formTextToken(T, TokenPtr);
414f4a2713aSLionel Sambuc 
415f4a2713aSLionel Sambuc         return;
416f4a2713aSLionel Sambuc       }
417f4a2713aSLionel Sambuc 
418f4a2713aSLionel Sambuc       case '\n':
419f4a2713aSLionel Sambuc       case '\r':
420f4a2713aSLionel Sambuc         TokenPtr = skipNewline(TokenPtr, CommentEnd);
421f4a2713aSLionel Sambuc         formTokenWithChars(T, TokenPtr, tok::newline);
422f4a2713aSLionel Sambuc 
423f4a2713aSLionel Sambuc         if (CommentState == LCS_InsideCComment)
424f4a2713aSLionel Sambuc           skipLineStartingDecorations();
425f4a2713aSLionel Sambuc         return;
426f4a2713aSLionel Sambuc 
427f4a2713aSLionel Sambuc       default: {
428f4a2713aSLionel Sambuc         size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
429f4a2713aSLionel Sambuc                          find_first_of("\n\r\\@&<");
430f4a2713aSLionel Sambuc         if (End != StringRef::npos)
431f4a2713aSLionel Sambuc           TokenPtr += End;
432f4a2713aSLionel Sambuc         else
433f4a2713aSLionel Sambuc           TokenPtr = CommentEnd;
434f4a2713aSLionel Sambuc         formTextToken(T, TokenPtr);
435f4a2713aSLionel Sambuc         return;
436f4a2713aSLionel Sambuc       }
437f4a2713aSLionel Sambuc     }
438f4a2713aSLionel Sambuc   }
439f4a2713aSLionel Sambuc }
440f4a2713aSLionel Sambuc 
setupAndLexVerbatimBlock(Token & T,const char * TextBegin,char Marker,const CommandInfo * Info)441f4a2713aSLionel Sambuc void Lexer::setupAndLexVerbatimBlock(Token &T,
442f4a2713aSLionel Sambuc                                      const char *TextBegin,
443f4a2713aSLionel Sambuc                                      char Marker, const CommandInfo *Info) {
444f4a2713aSLionel Sambuc   assert(Info->IsVerbatimBlockCommand);
445f4a2713aSLionel Sambuc 
446f4a2713aSLionel Sambuc   VerbatimBlockEndCommandName.clear();
447f4a2713aSLionel Sambuc   VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
448f4a2713aSLionel Sambuc   VerbatimBlockEndCommandName.append(Info->EndCommandName);
449f4a2713aSLionel Sambuc 
450f4a2713aSLionel Sambuc   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
451f4a2713aSLionel Sambuc   T.setVerbatimBlockID(Info->getID());
452f4a2713aSLionel Sambuc 
453f4a2713aSLionel Sambuc   // If there is a newline following the verbatim opening command, skip the
454f4a2713aSLionel Sambuc   // newline so that we don't create an tok::verbatim_block_line with empty
455f4a2713aSLionel Sambuc   // text content.
456f4a2713aSLionel Sambuc   if (BufferPtr != CommentEnd &&
457f4a2713aSLionel Sambuc       isVerticalWhitespace(*BufferPtr)) {
458f4a2713aSLionel Sambuc     BufferPtr = skipNewline(BufferPtr, CommentEnd);
459f4a2713aSLionel Sambuc     State = LS_VerbatimBlockBody;
460f4a2713aSLionel Sambuc     return;
461f4a2713aSLionel Sambuc   }
462f4a2713aSLionel Sambuc 
463f4a2713aSLionel Sambuc   State = LS_VerbatimBlockFirstLine;
464f4a2713aSLionel Sambuc }
465f4a2713aSLionel Sambuc 
lexVerbatimBlockFirstLine(Token & T)466f4a2713aSLionel Sambuc void Lexer::lexVerbatimBlockFirstLine(Token &T) {
467f4a2713aSLionel Sambuc again:
468f4a2713aSLionel Sambuc   assert(BufferPtr < CommentEnd);
469f4a2713aSLionel Sambuc 
470f4a2713aSLionel Sambuc   // FIXME: It would be better to scan the text once, finding either the block
471f4a2713aSLionel Sambuc   // end command or newline.
472f4a2713aSLionel Sambuc   //
473f4a2713aSLionel Sambuc   // Extract current line.
474f4a2713aSLionel Sambuc   const char *Newline = findNewline(BufferPtr, CommentEnd);
475f4a2713aSLionel Sambuc   StringRef Line(BufferPtr, Newline - BufferPtr);
476f4a2713aSLionel Sambuc 
477f4a2713aSLionel Sambuc   // Look for end command in current line.
478f4a2713aSLionel Sambuc   size_t Pos = Line.find(VerbatimBlockEndCommandName);
479f4a2713aSLionel Sambuc   const char *TextEnd;
480f4a2713aSLionel Sambuc   const char *NextLine;
481f4a2713aSLionel Sambuc   if (Pos == StringRef::npos) {
482f4a2713aSLionel Sambuc     // Current line is completely verbatim.
483f4a2713aSLionel Sambuc     TextEnd = Newline;
484f4a2713aSLionel Sambuc     NextLine = skipNewline(Newline, CommentEnd);
485f4a2713aSLionel Sambuc   } else if (Pos == 0) {
486f4a2713aSLionel Sambuc     // Current line contains just an end command.
487f4a2713aSLionel Sambuc     const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
488f4a2713aSLionel Sambuc     StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
489f4a2713aSLionel Sambuc     formTokenWithChars(T, End, tok::verbatim_block_end);
490f4a2713aSLionel Sambuc     T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
491f4a2713aSLionel Sambuc     State = LS_Normal;
492f4a2713aSLionel Sambuc     return;
493f4a2713aSLionel Sambuc   } else {
494f4a2713aSLionel Sambuc     // There is some text, followed by end command.  Extract text first.
495f4a2713aSLionel Sambuc     TextEnd = BufferPtr + Pos;
496f4a2713aSLionel Sambuc     NextLine = TextEnd;
497f4a2713aSLionel Sambuc     // If there is only whitespace before end command, skip whitespace.
498f4a2713aSLionel Sambuc     if (isWhitespace(BufferPtr, TextEnd)) {
499f4a2713aSLionel Sambuc       BufferPtr = TextEnd;
500f4a2713aSLionel Sambuc       goto again;
501f4a2713aSLionel Sambuc     }
502f4a2713aSLionel Sambuc   }
503f4a2713aSLionel Sambuc 
504f4a2713aSLionel Sambuc   StringRef Text(BufferPtr, TextEnd - BufferPtr);
505f4a2713aSLionel Sambuc   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
506f4a2713aSLionel Sambuc   T.setVerbatimBlockText(Text);
507f4a2713aSLionel Sambuc 
508f4a2713aSLionel Sambuc   State = LS_VerbatimBlockBody;
509f4a2713aSLionel Sambuc }
510f4a2713aSLionel Sambuc 
lexVerbatimBlockBody(Token & T)511f4a2713aSLionel Sambuc void Lexer::lexVerbatimBlockBody(Token &T) {
512f4a2713aSLionel Sambuc   assert(State == LS_VerbatimBlockBody);
513f4a2713aSLionel Sambuc 
514f4a2713aSLionel Sambuc   if (CommentState == LCS_InsideCComment)
515f4a2713aSLionel Sambuc     skipLineStartingDecorations();
516f4a2713aSLionel Sambuc 
517f4a2713aSLionel Sambuc   lexVerbatimBlockFirstLine(T);
518f4a2713aSLionel Sambuc }
519f4a2713aSLionel Sambuc 
setupAndLexVerbatimLine(Token & T,const char * TextBegin,const CommandInfo * Info)520f4a2713aSLionel Sambuc void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
521f4a2713aSLionel Sambuc                                     const CommandInfo *Info) {
522f4a2713aSLionel Sambuc   assert(Info->IsVerbatimLineCommand);
523f4a2713aSLionel Sambuc   formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
524f4a2713aSLionel Sambuc   T.setVerbatimLineID(Info->getID());
525f4a2713aSLionel Sambuc 
526f4a2713aSLionel Sambuc   State = LS_VerbatimLineText;
527f4a2713aSLionel Sambuc }
528f4a2713aSLionel Sambuc 
lexVerbatimLineText(Token & T)529f4a2713aSLionel Sambuc void Lexer::lexVerbatimLineText(Token &T) {
530f4a2713aSLionel Sambuc   assert(State == LS_VerbatimLineText);
531f4a2713aSLionel Sambuc 
532f4a2713aSLionel Sambuc   // Extract current line.
533f4a2713aSLionel Sambuc   const char *Newline = findNewline(BufferPtr, CommentEnd);
534*0a6a1f1dSLionel Sambuc   StringRef Text(BufferPtr, Newline - BufferPtr);
535f4a2713aSLionel Sambuc   formTokenWithChars(T, Newline, tok::verbatim_line_text);
536f4a2713aSLionel Sambuc   T.setVerbatimLineText(Text);
537f4a2713aSLionel Sambuc 
538f4a2713aSLionel Sambuc   State = LS_Normal;
539f4a2713aSLionel Sambuc }
540f4a2713aSLionel Sambuc 
lexHTMLCharacterReference(Token & T)541f4a2713aSLionel Sambuc void Lexer::lexHTMLCharacterReference(Token &T) {
542f4a2713aSLionel Sambuc   const char *TokenPtr = BufferPtr;
543f4a2713aSLionel Sambuc   assert(*TokenPtr == '&');
544f4a2713aSLionel Sambuc   TokenPtr++;
545f4a2713aSLionel Sambuc   if (TokenPtr == CommentEnd) {
546f4a2713aSLionel Sambuc     formTextToken(T, TokenPtr);
547f4a2713aSLionel Sambuc     return;
548f4a2713aSLionel Sambuc   }
549f4a2713aSLionel Sambuc   const char *NamePtr;
550f4a2713aSLionel Sambuc   bool isNamed = false;
551f4a2713aSLionel Sambuc   bool isDecimal = false;
552f4a2713aSLionel Sambuc   char C = *TokenPtr;
553f4a2713aSLionel Sambuc   if (isHTMLNamedCharacterReferenceCharacter(C)) {
554f4a2713aSLionel Sambuc     NamePtr = TokenPtr;
555f4a2713aSLionel Sambuc     TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
556f4a2713aSLionel Sambuc     isNamed = true;
557f4a2713aSLionel Sambuc   } else if (C == '#') {
558f4a2713aSLionel Sambuc     TokenPtr++;
559f4a2713aSLionel Sambuc     if (TokenPtr == CommentEnd) {
560f4a2713aSLionel Sambuc       formTextToken(T, TokenPtr);
561f4a2713aSLionel Sambuc       return;
562f4a2713aSLionel Sambuc     }
563f4a2713aSLionel Sambuc     C = *TokenPtr;
564f4a2713aSLionel Sambuc     if (isHTMLDecimalCharacterReferenceCharacter(C)) {
565f4a2713aSLionel Sambuc       NamePtr = TokenPtr;
566f4a2713aSLionel Sambuc       TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
567f4a2713aSLionel Sambuc       isDecimal = true;
568f4a2713aSLionel Sambuc     } else if (C == 'x' || C == 'X') {
569f4a2713aSLionel Sambuc       TokenPtr++;
570f4a2713aSLionel Sambuc       NamePtr = TokenPtr;
571f4a2713aSLionel Sambuc       TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
572f4a2713aSLionel Sambuc     } else {
573f4a2713aSLionel Sambuc       formTextToken(T, TokenPtr);
574f4a2713aSLionel Sambuc       return;
575f4a2713aSLionel Sambuc     }
576f4a2713aSLionel Sambuc   } else {
577f4a2713aSLionel Sambuc     formTextToken(T, TokenPtr);
578f4a2713aSLionel Sambuc     return;
579f4a2713aSLionel Sambuc   }
580f4a2713aSLionel Sambuc   if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
581f4a2713aSLionel Sambuc       *TokenPtr != ';') {
582f4a2713aSLionel Sambuc     formTextToken(T, TokenPtr);
583f4a2713aSLionel Sambuc     return;
584f4a2713aSLionel Sambuc   }
585f4a2713aSLionel Sambuc   StringRef Name(NamePtr, TokenPtr - NamePtr);
586f4a2713aSLionel Sambuc   TokenPtr++; // Skip semicolon.
587f4a2713aSLionel Sambuc   StringRef Resolved;
588f4a2713aSLionel Sambuc   if (isNamed)
589f4a2713aSLionel Sambuc     Resolved = resolveHTMLNamedCharacterReference(Name);
590f4a2713aSLionel Sambuc   else if (isDecimal)
591f4a2713aSLionel Sambuc     Resolved = resolveHTMLDecimalCharacterReference(Name);
592f4a2713aSLionel Sambuc   else
593f4a2713aSLionel Sambuc     Resolved = resolveHTMLHexCharacterReference(Name);
594f4a2713aSLionel Sambuc 
595f4a2713aSLionel Sambuc   if (Resolved.empty()) {
596f4a2713aSLionel Sambuc     formTextToken(T, TokenPtr);
597f4a2713aSLionel Sambuc     return;
598f4a2713aSLionel Sambuc   }
599f4a2713aSLionel Sambuc   formTokenWithChars(T, TokenPtr, tok::text);
600f4a2713aSLionel Sambuc   T.setText(Resolved);
601f4a2713aSLionel Sambuc   return;
602f4a2713aSLionel Sambuc }
603f4a2713aSLionel Sambuc 
setupAndLexHTMLStartTag(Token & T)604f4a2713aSLionel Sambuc void Lexer::setupAndLexHTMLStartTag(Token &T) {
605f4a2713aSLionel Sambuc   assert(BufferPtr[0] == '<' &&
606f4a2713aSLionel Sambuc          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
607f4a2713aSLionel Sambuc   const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
608f4a2713aSLionel Sambuc   StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
609f4a2713aSLionel Sambuc   if (!isHTMLTagName(Name)) {
610f4a2713aSLionel Sambuc     formTextToken(T, TagNameEnd);
611f4a2713aSLionel Sambuc     return;
612f4a2713aSLionel Sambuc   }
613f4a2713aSLionel Sambuc 
614f4a2713aSLionel Sambuc   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
615f4a2713aSLionel Sambuc   T.setHTMLTagStartName(Name);
616f4a2713aSLionel Sambuc 
617f4a2713aSLionel Sambuc   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
618f4a2713aSLionel Sambuc 
619f4a2713aSLionel Sambuc   const char C = *BufferPtr;
620f4a2713aSLionel Sambuc   if (BufferPtr != CommentEnd &&
621f4a2713aSLionel Sambuc       (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
622f4a2713aSLionel Sambuc     State = LS_HTMLStartTag;
623f4a2713aSLionel Sambuc }
624f4a2713aSLionel Sambuc 
lexHTMLStartTag(Token & T)625f4a2713aSLionel Sambuc void Lexer::lexHTMLStartTag(Token &T) {
626f4a2713aSLionel Sambuc   assert(State == LS_HTMLStartTag);
627f4a2713aSLionel Sambuc 
628f4a2713aSLionel Sambuc   const char *TokenPtr = BufferPtr;
629f4a2713aSLionel Sambuc   char C = *TokenPtr;
630f4a2713aSLionel Sambuc   if (isHTMLIdentifierCharacter(C)) {
631f4a2713aSLionel Sambuc     TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
632f4a2713aSLionel Sambuc     StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
633f4a2713aSLionel Sambuc     formTokenWithChars(T, TokenPtr, tok::html_ident);
634f4a2713aSLionel Sambuc     T.setHTMLIdent(Ident);
635f4a2713aSLionel Sambuc   } else {
636f4a2713aSLionel Sambuc     switch (C) {
637f4a2713aSLionel Sambuc     case '=':
638f4a2713aSLionel Sambuc       TokenPtr++;
639f4a2713aSLionel Sambuc       formTokenWithChars(T, TokenPtr, tok::html_equals);
640f4a2713aSLionel Sambuc       break;
641f4a2713aSLionel Sambuc     case '\"':
642f4a2713aSLionel Sambuc     case '\'': {
643f4a2713aSLionel Sambuc       const char *OpenQuote = TokenPtr;
644f4a2713aSLionel Sambuc       TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
645f4a2713aSLionel Sambuc       const char *ClosingQuote = TokenPtr;
646f4a2713aSLionel Sambuc       if (TokenPtr != CommentEnd) // Skip closing quote.
647f4a2713aSLionel Sambuc         TokenPtr++;
648f4a2713aSLionel Sambuc       formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
649f4a2713aSLionel Sambuc       T.setHTMLQuotedString(StringRef(OpenQuote + 1,
650f4a2713aSLionel Sambuc                                       ClosingQuote - (OpenQuote + 1)));
651f4a2713aSLionel Sambuc       break;
652f4a2713aSLionel Sambuc     }
653f4a2713aSLionel Sambuc     case '>':
654f4a2713aSLionel Sambuc       TokenPtr++;
655f4a2713aSLionel Sambuc       formTokenWithChars(T, TokenPtr, tok::html_greater);
656f4a2713aSLionel Sambuc       State = LS_Normal;
657f4a2713aSLionel Sambuc       return;
658f4a2713aSLionel Sambuc     case '/':
659f4a2713aSLionel Sambuc       TokenPtr++;
660f4a2713aSLionel Sambuc       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
661f4a2713aSLionel Sambuc         TokenPtr++;
662f4a2713aSLionel Sambuc         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
663f4a2713aSLionel Sambuc       } else
664f4a2713aSLionel Sambuc         formTextToken(T, TokenPtr);
665f4a2713aSLionel Sambuc 
666f4a2713aSLionel Sambuc       State = LS_Normal;
667f4a2713aSLionel Sambuc       return;
668f4a2713aSLionel Sambuc     }
669f4a2713aSLionel Sambuc   }
670f4a2713aSLionel Sambuc 
671f4a2713aSLionel Sambuc   // Now look ahead and return to normal state if we don't see any HTML tokens
672f4a2713aSLionel Sambuc   // ahead.
673f4a2713aSLionel Sambuc   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
674f4a2713aSLionel Sambuc   if (BufferPtr == CommentEnd) {
675f4a2713aSLionel Sambuc     State = LS_Normal;
676f4a2713aSLionel Sambuc     return;
677f4a2713aSLionel Sambuc   }
678f4a2713aSLionel Sambuc 
679f4a2713aSLionel Sambuc   C = *BufferPtr;
680f4a2713aSLionel Sambuc   if (!isHTMLIdentifierStartingCharacter(C) &&
681f4a2713aSLionel Sambuc       C != '=' && C != '\"' && C != '\'' && C != '>') {
682f4a2713aSLionel Sambuc     State = LS_Normal;
683f4a2713aSLionel Sambuc     return;
684f4a2713aSLionel Sambuc   }
685f4a2713aSLionel Sambuc }
686f4a2713aSLionel Sambuc 
setupAndLexHTMLEndTag(Token & T)687f4a2713aSLionel Sambuc void Lexer::setupAndLexHTMLEndTag(Token &T) {
688f4a2713aSLionel Sambuc   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
689f4a2713aSLionel Sambuc 
690f4a2713aSLionel Sambuc   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
691f4a2713aSLionel Sambuc   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
692f4a2713aSLionel Sambuc   StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
693f4a2713aSLionel Sambuc   if (!isHTMLTagName(Name)) {
694f4a2713aSLionel Sambuc     formTextToken(T, TagNameEnd);
695f4a2713aSLionel Sambuc     return;
696f4a2713aSLionel Sambuc   }
697f4a2713aSLionel Sambuc 
698f4a2713aSLionel Sambuc   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
699f4a2713aSLionel Sambuc 
700f4a2713aSLionel Sambuc   formTokenWithChars(T, End, tok::html_end_tag);
701f4a2713aSLionel Sambuc   T.setHTMLTagEndName(Name);
702f4a2713aSLionel Sambuc 
703f4a2713aSLionel Sambuc   if (BufferPtr != CommentEnd && *BufferPtr == '>')
704f4a2713aSLionel Sambuc     State = LS_HTMLEndTag;
705f4a2713aSLionel Sambuc }
706f4a2713aSLionel Sambuc 
lexHTMLEndTag(Token & T)707f4a2713aSLionel Sambuc void Lexer::lexHTMLEndTag(Token &T) {
708f4a2713aSLionel Sambuc   assert(BufferPtr != CommentEnd && *BufferPtr == '>');
709f4a2713aSLionel Sambuc 
710f4a2713aSLionel Sambuc   formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
711f4a2713aSLionel Sambuc   State = LS_Normal;
712f4a2713aSLionel Sambuc }
713f4a2713aSLionel Sambuc 
Lexer(llvm::BumpPtrAllocator & Allocator,DiagnosticsEngine & Diags,const CommandTraits & Traits,SourceLocation FileLoc,const char * BufferStart,const char * BufferEnd)714f4a2713aSLionel Sambuc Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
715f4a2713aSLionel Sambuc              const CommandTraits &Traits,
716f4a2713aSLionel Sambuc              SourceLocation FileLoc,
717f4a2713aSLionel Sambuc              const char *BufferStart, const char *BufferEnd):
718f4a2713aSLionel Sambuc     Allocator(Allocator), Diags(Diags), Traits(Traits),
719f4a2713aSLionel Sambuc     BufferStart(BufferStart), BufferEnd(BufferEnd),
720f4a2713aSLionel Sambuc     FileLoc(FileLoc), BufferPtr(BufferStart),
721f4a2713aSLionel Sambuc     CommentState(LCS_BeforeComment), State(LS_Normal) {
722f4a2713aSLionel Sambuc }
723f4a2713aSLionel Sambuc 
lex(Token & T)724f4a2713aSLionel Sambuc void Lexer::lex(Token &T) {
725f4a2713aSLionel Sambuc again:
726f4a2713aSLionel Sambuc   switch (CommentState) {
727f4a2713aSLionel Sambuc   case LCS_BeforeComment:
728f4a2713aSLionel Sambuc     if (BufferPtr == BufferEnd) {
729f4a2713aSLionel Sambuc       formTokenWithChars(T, BufferPtr, tok::eof);
730f4a2713aSLionel Sambuc       return;
731f4a2713aSLionel Sambuc     }
732f4a2713aSLionel Sambuc 
733f4a2713aSLionel Sambuc     assert(*BufferPtr == '/');
734f4a2713aSLionel Sambuc     BufferPtr++; // Skip first slash.
735f4a2713aSLionel Sambuc     switch(*BufferPtr) {
736f4a2713aSLionel Sambuc     case '/': { // BCPL comment.
737f4a2713aSLionel Sambuc       BufferPtr++; // Skip second slash.
738f4a2713aSLionel Sambuc 
739f4a2713aSLionel Sambuc       if (BufferPtr != BufferEnd) {
740f4a2713aSLionel Sambuc         // Skip Doxygen magic marker, if it is present.
741f4a2713aSLionel Sambuc         // It might be missing because of a typo //< or /*<, or because we
742f4a2713aSLionel Sambuc         // merged this non-Doxygen comment into a bunch of Doxygen comments
743f4a2713aSLionel Sambuc         // around it: /** ... */ /* ... */ /** ... */
744f4a2713aSLionel Sambuc         const char C = *BufferPtr;
745f4a2713aSLionel Sambuc         if (C == '/' || C == '!')
746f4a2713aSLionel Sambuc           BufferPtr++;
747f4a2713aSLionel Sambuc       }
748f4a2713aSLionel Sambuc 
749f4a2713aSLionel Sambuc       // Skip less-than symbol that marks trailing comments.
750f4a2713aSLionel Sambuc       // Skip it even if the comment is not a Doxygen one, because //< and /*<
751f4a2713aSLionel Sambuc       // are frequent typos.
752f4a2713aSLionel Sambuc       if (BufferPtr != BufferEnd && *BufferPtr == '<')
753f4a2713aSLionel Sambuc         BufferPtr++;
754f4a2713aSLionel Sambuc 
755f4a2713aSLionel Sambuc       CommentState = LCS_InsideBCPLComment;
756f4a2713aSLionel Sambuc       if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
757f4a2713aSLionel Sambuc         State = LS_Normal;
758f4a2713aSLionel Sambuc       CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
759f4a2713aSLionel Sambuc       goto again;
760f4a2713aSLionel Sambuc     }
761f4a2713aSLionel Sambuc     case '*': { // C comment.
762f4a2713aSLionel Sambuc       BufferPtr++; // Skip star.
763f4a2713aSLionel Sambuc 
764f4a2713aSLionel Sambuc       // Skip Doxygen magic marker.
765f4a2713aSLionel Sambuc       const char C = *BufferPtr;
766f4a2713aSLionel Sambuc       if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
767f4a2713aSLionel Sambuc         BufferPtr++;
768f4a2713aSLionel Sambuc 
769f4a2713aSLionel Sambuc       // Skip less-than symbol that marks trailing comments.
770f4a2713aSLionel Sambuc       if (BufferPtr != BufferEnd && *BufferPtr == '<')
771f4a2713aSLionel Sambuc         BufferPtr++;
772f4a2713aSLionel Sambuc 
773f4a2713aSLionel Sambuc       CommentState = LCS_InsideCComment;
774f4a2713aSLionel Sambuc       State = LS_Normal;
775f4a2713aSLionel Sambuc       CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
776f4a2713aSLionel Sambuc       goto again;
777f4a2713aSLionel Sambuc     }
778f4a2713aSLionel Sambuc     default:
779f4a2713aSLionel Sambuc       llvm_unreachable("second character of comment should be '/' or '*'");
780f4a2713aSLionel Sambuc     }
781f4a2713aSLionel Sambuc 
782f4a2713aSLionel Sambuc   case LCS_BetweenComments: {
783f4a2713aSLionel Sambuc     // Consecutive comments are extracted only if there is only whitespace
784f4a2713aSLionel Sambuc     // between them.  So we can search for the start of the next comment.
785f4a2713aSLionel Sambuc     const char *EndWhitespace = BufferPtr;
786f4a2713aSLionel Sambuc     while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
787f4a2713aSLionel Sambuc       EndWhitespace++;
788f4a2713aSLionel Sambuc 
789f4a2713aSLionel Sambuc     // Turn any whitespace between comments (and there is only whitespace
790f4a2713aSLionel Sambuc     // between them -- guaranteed by comment extraction) into a newline.  We
791f4a2713aSLionel Sambuc     // have two newlines between C comments in total (first one was synthesized
792f4a2713aSLionel Sambuc     // after a comment).
793f4a2713aSLionel Sambuc     formTokenWithChars(T, EndWhitespace, tok::newline);
794f4a2713aSLionel Sambuc 
795f4a2713aSLionel Sambuc     CommentState = LCS_BeforeComment;
796f4a2713aSLionel Sambuc     break;
797f4a2713aSLionel Sambuc   }
798f4a2713aSLionel Sambuc 
799f4a2713aSLionel Sambuc   case LCS_InsideBCPLComment:
800f4a2713aSLionel Sambuc   case LCS_InsideCComment:
801f4a2713aSLionel Sambuc     if (BufferPtr != CommentEnd) {
802f4a2713aSLionel Sambuc       lexCommentText(T);
803f4a2713aSLionel Sambuc       break;
804f4a2713aSLionel Sambuc     } else {
805f4a2713aSLionel Sambuc       // Skip C comment closing sequence.
806f4a2713aSLionel Sambuc       if (CommentState == LCS_InsideCComment) {
807f4a2713aSLionel Sambuc         assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
808f4a2713aSLionel Sambuc         BufferPtr += 2;
809f4a2713aSLionel Sambuc         assert(BufferPtr <= BufferEnd);
810f4a2713aSLionel Sambuc 
811f4a2713aSLionel Sambuc         // Synthenize newline just after the C comment, regardless if there is
812f4a2713aSLionel Sambuc         // actually a newline.
813f4a2713aSLionel Sambuc         formTokenWithChars(T, BufferPtr, tok::newline);
814f4a2713aSLionel Sambuc 
815f4a2713aSLionel Sambuc         CommentState = LCS_BetweenComments;
816f4a2713aSLionel Sambuc         break;
817f4a2713aSLionel Sambuc       } else {
818f4a2713aSLionel Sambuc         // Don't synthesized a newline after BCPL comment.
819f4a2713aSLionel Sambuc         CommentState = LCS_BetweenComments;
820f4a2713aSLionel Sambuc         goto again;
821f4a2713aSLionel Sambuc       }
822f4a2713aSLionel Sambuc     }
823f4a2713aSLionel Sambuc   }
824f4a2713aSLionel Sambuc }
825f4a2713aSLionel Sambuc 
getSpelling(const Token & Tok,const SourceManager & SourceMgr,bool * Invalid) const826f4a2713aSLionel Sambuc StringRef Lexer::getSpelling(const Token &Tok,
827f4a2713aSLionel Sambuc                              const SourceManager &SourceMgr,
828f4a2713aSLionel Sambuc                              bool *Invalid) const {
829f4a2713aSLionel Sambuc   SourceLocation Loc = Tok.getLocation();
830f4a2713aSLionel Sambuc   std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
831f4a2713aSLionel Sambuc 
832f4a2713aSLionel Sambuc   bool InvalidTemp = false;
833f4a2713aSLionel Sambuc   StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
834f4a2713aSLionel Sambuc   if (InvalidTemp) {
835f4a2713aSLionel Sambuc     *Invalid = true;
836f4a2713aSLionel Sambuc     return StringRef();
837f4a2713aSLionel Sambuc   }
838f4a2713aSLionel Sambuc 
839f4a2713aSLionel Sambuc   const char *Begin = File.data() + LocInfo.second;
840f4a2713aSLionel Sambuc   return StringRef(Begin, Tok.getLength());
841f4a2713aSLionel Sambuc }
842f4a2713aSLionel Sambuc 
843f4a2713aSLionel Sambuc } // end namespace comments
844f4a2713aSLionel Sambuc } // end namespace clang
845f4a2713aSLionel Sambuc 
846