1f4a2713aSLionel Sambuc #include "clang/AST/CommentLexer.h"
2f4a2713aSLionel Sambuc #include "clang/AST/CommentCommandTraits.h"
3f4a2713aSLionel Sambuc #include "clang/AST/CommentDiagnostic.h"
4f4a2713aSLionel Sambuc #include "clang/Basic/CharInfo.h"
5f4a2713aSLionel Sambuc #include "llvm/ADT/StringExtras.h"
6f4a2713aSLionel Sambuc #include "llvm/ADT/StringSwitch.h"
7f4a2713aSLionel Sambuc #include "llvm/Support/ConvertUTF.h"
8f4a2713aSLionel Sambuc #include "llvm/Support/ErrorHandling.h"
9f4a2713aSLionel Sambuc
10f4a2713aSLionel Sambuc namespace clang {
11f4a2713aSLionel Sambuc namespace comments {
12f4a2713aSLionel Sambuc
dump(const Lexer & L,const SourceManager & SM) const13f4a2713aSLionel Sambuc void Token::dump(const Lexer &L, const SourceManager &SM) const {
14f4a2713aSLionel Sambuc llvm::errs() << "comments::Token Kind=" << Kind << " ";
15f4a2713aSLionel Sambuc Loc.dump(SM);
16f4a2713aSLionel Sambuc llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
17f4a2713aSLionel Sambuc }
18f4a2713aSLionel Sambuc
isHTMLNamedCharacterReferenceCharacter(char C)19f4a2713aSLionel Sambuc static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
20f4a2713aSLionel Sambuc return isLetter(C);
21f4a2713aSLionel Sambuc }
22f4a2713aSLionel Sambuc
isHTMLDecimalCharacterReferenceCharacter(char C)23f4a2713aSLionel Sambuc static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
24f4a2713aSLionel Sambuc return isDigit(C);
25f4a2713aSLionel Sambuc }
26f4a2713aSLionel Sambuc
isHTMLHexCharacterReferenceCharacter(char C)27f4a2713aSLionel Sambuc static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
28f4a2713aSLionel Sambuc return isHexDigit(C);
29f4a2713aSLionel Sambuc }
30f4a2713aSLionel Sambuc
convertCodePointToUTF8(llvm::BumpPtrAllocator & Allocator,unsigned CodePoint)31f4a2713aSLionel Sambuc static inline StringRef convertCodePointToUTF8(
32f4a2713aSLionel Sambuc llvm::BumpPtrAllocator &Allocator,
33f4a2713aSLionel Sambuc unsigned CodePoint) {
34f4a2713aSLionel Sambuc char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
35f4a2713aSLionel Sambuc char *ResolvedPtr = Resolved;
36f4a2713aSLionel Sambuc if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
37f4a2713aSLionel Sambuc return StringRef(Resolved, ResolvedPtr - Resolved);
38f4a2713aSLionel Sambuc else
39f4a2713aSLionel Sambuc return StringRef();
40f4a2713aSLionel Sambuc }
41f4a2713aSLionel Sambuc
42f4a2713aSLionel Sambuc namespace {
43f4a2713aSLionel Sambuc
44f4a2713aSLionel Sambuc #include "clang/AST/CommentHTMLTags.inc"
45f4a2713aSLionel Sambuc #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
46f4a2713aSLionel Sambuc
47f4a2713aSLionel Sambuc } // unnamed namespace
48f4a2713aSLionel Sambuc
resolveHTMLNamedCharacterReference(StringRef Name) const49f4a2713aSLionel Sambuc StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
50f4a2713aSLionel Sambuc // Fast path, first check a few most widely used named character references.
51f4a2713aSLionel Sambuc return llvm::StringSwitch<StringRef>(Name)
52f4a2713aSLionel Sambuc .Case("amp", "&")
53f4a2713aSLionel Sambuc .Case("lt", "<")
54f4a2713aSLionel Sambuc .Case("gt", ">")
55f4a2713aSLionel Sambuc .Case("quot", "\"")
56f4a2713aSLionel Sambuc .Case("apos", "\'")
57f4a2713aSLionel Sambuc // Slow path.
58f4a2713aSLionel Sambuc .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
59f4a2713aSLionel Sambuc }
60f4a2713aSLionel Sambuc
resolveHTMLDecimalCharacterReference(StringRef Name) const61f4a2713aSLionel Sambuc StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
62f4a2713aSLionel Sambuc unsigned CodePoint = 0;
63f4a2713aSLionel Sambuc for (unsigned i = 0, e = Name.size(); i != e; ++i) {
64f4a2713aSLionel Sambuc assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
65f4a2713aSLionel Sambuc CodePoint *= 10;
66f4a2713aSLionel Sambuc CodePoint += Name[i] - '0';
67f4a2713aSLionel Sambuc }
68f4a2713aSLionel Sambuc return convertCodePointToUTF8(Allocator, CodePoint);
69f4a2713aSLionel Sambuc }
70f4a2713aSLionel Sambuc
resolveHTMLHexCharacterReference(StringRef Name) const71f4a2713aSLionel Sambuc StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
72f4a2713aSLionel Sambuc unsigned CodePoint = 0;
73f4a2713aSLionel Sambuc for (unsigned i = 0, e = Name.size(); i != e; ++i) {
74f4a2713aSLionel Sambuc CodePoint *= 16;
75f4a2713aSLionel Sambuc const char C = Name[i];
76f4a2713aSLionel Sambuc assert(isHTMLHexCharacterReferenceCharacter(C));
77f4a2713aSLionel Sambuc CodePoint += llvm::hexDigitValue(C);
78f4a2713aSLionel Sambuc }
79f4a2713aSLionel Sambuc return convertCodePointToUTF8(Allocator, CodePoint);
80f4a2713aSLionel Sambuc }
81f4a2713aSLionel Sambuc
skipLineStartingDecorations()82f4a2713aSLionel Sambuc void Lexer::skipLineStartingDecorations() {
83f4a2713aSLionel Sambuc // This function should be called only for C comments
84f4a2713aSLionel Sambuc assert(CommentState == LCS_InsideCComment);
85f4a2713aSLionel Sambuc
86f4a2713aSLionel Sambuc if (BufferPtr == CommentEnd)
87f4a2713aSLionel Sambuc return;
88f4a2713aSLionel Sambuc
89f4a2713aSLionel Sambuc switch (*BufferPtr) {
90f4a2713aSLionel Sambuc case ' ':
91f4a2713aSLionel Sambuc case '\t':
92f4a2713aSLionel Sambuc case '\f':
93f4a2713aSLionel Sambuc case '\v': {
94f4a2713aSLionel Sambuc const char *NewBufferPtr = BufferPtr;
95f4a2713aSLionel Sambuc NewBufferPtr++;
96f4a2713aSLionel Sambuc if (NewBufferPtr == CommentEnd)
97f4a2713aSLionel Sambuc return;
98f4a2713aSLionel Sambuc
99f4a2713aSLionel Sambuc char C = *NewBufferPtr;
100f4a2713aSLionel Sambuc while (isHorizontalWhitespace(C)) {
101f4a2713aSLionel Sambuc NewBufferPtr++;
102f4a2713aSLionel Sambuc if (NewBufferPtr == CommentEnd)
103f4a2713aSLionel Sambuc return;
104f4a2713aSLionel Sambuc C = *NewBufferPtr;
105f4a2713aSLionel Sambuc }
106f4a2713aSLionel Sambuc if (C == '*')
107f4a2713aSLionel Sambuc BufferPtr = NewBufferPtr + 1;
108f4a2713aSLionel Sambuc break;
109f4a2713aSLionel Sambuc }
110f4a2713aSLionel Sambuc case '*':
111f4a2713aSLionel Sambuc BufferPtr++;
112f4a2713aSLionel Sambuc break;
113f4a2713aSLionel Sambuc }
114f4a2713aSLionel Sambuc }
115f4a2713aSLionel Sambuc
116f4a2713aSLionel Sambuc namespace {
117f4a2713aSLionel Sambuc /// Returns pointer to the first newline character in the string.
findNewline(const char * BufferPtr,const char * BufferEnd)118f4a2713aSLionel Sambuc const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
119f4a2713aSLionel Sambuc for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
120f4a2713aSLionel Sambuc if (isVerticalWhitespace(*BufferPtr))
121f4a2713aSLionel Sambuc return BufferPtr;
122f4a2713aSLionel Sambuc }
123f4a2713aSLionel Sambuc return BufferEnd;
124f4a2713aSLionel Sambuc }
125f4a2713aSLionel Sambuc
/// Steps over one line terminator at \p BufferPtr: "\n", "\r" or "\r\n".
///
/// The caller must pass a pointer to '\n' or '\r' (asserted), or an empty
/// range, in which case \p BufferPtr is returned unchanged.
///
/// \returns a pointer to the first character after the terminator.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char *Next = BufferPtr + 1;
  if (*BufferPtr == '\n')
    return Next;

  assert(*BufferPtr == '\r');
  // Treat "\r\n" as a single terminator.
  if (Next != BufferEnd && *Next == '\n')
    ++Next;
  return Next;
}
140f4a2713aSLionel Sambuc
skipNamedCharacterReference(const char * BufferPtr,const char * BufferEnd)141f4a2713aSLionel Sambuc const char *skipNamedCharacterReference(const char *BufferPtr,
142f4a2713aSLionel Sambuc const char *BufferEnd) {
143f4a2713aSLionel Sambuc for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
144f4a2713aSLionel Sambuc if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
145f4a2713aSLionel Sambuc return BufferPtr;
146f4a2713aSLionel Sambuc }
147f4a2713aSLionel Sambuc return BufferEnd;
148f4a2713aSLionel Sambuc }
149f4a2713aSLionel Sambuc
skipDecimalCharacterReference(const char * BufferPtr,const char * BufferEnd)150f4a2713aSLionel Sambuc const char *skipDecimalCharacterReference(const char *BufferPtr,
151f4a2713aSLionel Sambuc const char *BufferEnd) {
152f4a2713aSLionel Sambuc for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
153f4a2713aSLionel Sambuc if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
154f4a2713aSLionel Sambuc return BufferPtr;
155f4a2713aSLionel Sambuc }
156f4a2713aSLionel Sambuc return BufferEnd;
157f4a2713aSLionel Sambuc }
158f4a2713aSLionel Sambuc
skipHexCharacterReference(const char * BufferPtr,const char * BufferEnd)159f4a2713aSLionel Sambuc const char *skipHexCharacterReference(const char *BufferPtr,
160f4a2713aSLionel Sambuc const char *BufferEnd) {
161f4a2713aSLionel Sambuc for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
162f4a2713aSLionel Sambuc if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
163f4a2713aSLionel Sambuc return BufferPtr;
164f4a2713aSLionel Sambuc }
165f4a2713aSLionel Sambuc return BufferEnd;
166f4a2713aSLionel Sambuc }
167f4a2713aSLionel Sambuc
isHTMLIdentifierStartingCharacter(char C)168f4a2713aSLionel Sambuc bool isHTMLIdentifierStartingCharacter(char C) {
169f4a2713aSLionel Sambuc return isLetter(C);
170f4a2713aSLionel Sambuc }
171f4a2713aSLionel Sambuc
isHTMLIdentifierCharacter(char C)172f4a2713aSLionel Sambuc bool isHTMLIdentifierCharacter(char C) {
173f4a2713aSLionel Sambuc return isAlphanumeric(C);
174f4a2713aSLionel Sambuc }
175f4a2713aSLionel Sambuc
skipHTMLIdentifier(const char * BufferPtr,const char * BufferEnd)176f4a2713aSLionel Sambuc const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
177f4a2713aSLionel Sambuc for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
178f4a2713aSLionel Sambuc if (!isHTMLIdentifierCharacter(*BufferPtr))
179f4a2713aSLionel Sambuc return BufferPtr;
180f4a2713aSLionel Sambuc }
181f4a2713aSLionel Sambuc return BufferEnd;
182f4a2713aSLionel Sambuc }
183f4a2713aSLionel Sambuc
/// Skip an HTML string quoted in single or double quotes.  A quote character
/// that is immediately preceded by a backslash does not terminate the string.
///
/// \p BufferPtr must point at the opening quote (asserted).
///
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated inside the range.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cur = BufferPtr + 1;
  while (Cur != BufferEnd) {
    // Cur[-1] is always valid here: at worst it is the opening quote.
    if (*Cur == Quote && Cur[-1] != '\\')
      return Cur;
    ++Cur;
  }
  return BufferEnd;
}
201f4a2713aSLionel Sambuc
skipWhitespace(const char * BufferPtr,const char * BufferEnd)202f4a2713aSLionel Sambuc const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
203f4a2713aSLionel Sambuc for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
204f4a2713aSLionel Sambuc if (!isWhitespace(*BufferPtr))
205f4a2713aSLionel Sambuc return BufferPtr;
206f4a2713aSLionel Sambuc }
207f4a2713aSLionel Sambuc return BufferEnd;
208f4a2713aSLionel Sambuc }
209f4a2713aSLionel Sambuc
isWhitespace(const char * BufferPtr,const char * BufferEnd)210f4a2713aSLionel Sambuc bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
211f4a2713aSLionel Sambuc return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
212f4a2713aSLionel Sambuc }
213f4a2713aSLionel Sambuc
isCommandNameStartCharacter(char C)214f4a2713aSLionel Sambuc bool isCommandNameStartCharacter(char C) {
215f4a2713aSLionel Sambuc return isLetter(C);
216f4a2713aSLionel Sambuc }
217f4a2713aSLionel Sambuc
isCommandNameCharacter(char C)218f4a2713aSLionel Sambuc bool isCommandNameCharacter(char C) {
219f4a2713aSLionel Sambuc return isAlphanumeric(C);
220f4a2713aSLionel Sambuc }
221f4a2713aSLionel Sambuc
skipCommandName(const char * BufferPtr,const char * BufferEnd)222f4a2713aSLionel Sambuc const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
223f4a2713aSLionel Sambuc for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
224f4a2713aSLionel Sambuc if (!isCommandNameCharacter(*BufferPtr))
225f4a2713aSLionel Sambuc return BufferPtr;
226f4a2713aSLionel Sambuc }
227f4a2713aSLionel Sambuc return BufferEnd;
228f4a2713aSLionel Sambuc }
229f4a2713aSLionel Sambuc
230f4a2713aSLionel Sambuc /// Return the one past end pointer for BCPL comments.
231f4a2713aSLionel Sambuc /// Handles newlines escaped with backslash or trigraph for backslahs.
findBCPLCommentEnd(const char * BufferPtr,const char * BufferEnd)232f4a2713aSLionel Sambuc const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
233f4a2713aSLionel Sambuc const char *CurPtr = BufferPtr;
234f4a2713aSLionel Sambuc while (CurPtr != BufferEnd) {
235f4a2713aSLionel Sambuc while (!isVerticalWhitespace(*CurPtr)) {
236f4a2713aSLionel Sambuc CurPtr++;
237f4a2713aSLionel Sambuc if (CurPtr == BufferEnd)
238f4a2713aSLionel Sambuc return BufferEnd;
239f4a2713aSLionel Sambuc }
240f4a2713aSLionel Sambuc // We found a newline, check if it is escaped.
241f4a2713aSLionel Sambuc const char *EscapePtr = CurPtr - 1;
242f4a2713aSLionel Sambuc while(isHorizontalWhitespace(*EscapePtr))
243f4a2713aSLionel Sambuc EscapePtr--;
244f4a2713aSLionel Sambuc
245f4a2713aSLionel Sambuc if (*EscapePtr == '\\' ||
246f4a2713aSLionel Sambuc (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
247f4a2713aSLionel Sambuc EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
248f4a2713aSLionel Sambuc // We found an escaped newline.
249f4a2713aSLionel Sambuc CurPtr = skipNewline(CurPtr, BufferEnd);
250f4a2713aSLionel Sambuc } else
251f4a2713aSLionel Sambuc return CurPtr; // Not an escaped newline.
252f4a2713aSLionel Sambuc }
253f4a2713aSLionel Sambuc return BufferEnd;
254f4a2713aSLionel Sambuc }
255f4a2713aSLionel Sambuc
256f4a2713aSLionel Sambuc /// Return the one past end pointer for C comments.
257f4a2713aSLionel Sambuc /// Very dumb, does not handle escaped newlines or trigraphs.
findCCommentEnd(const char * BufferPtr,const char * BufferEnd)258f4a2713aSLionel Sambuc const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
259f4a2713aSLionel Sambuc for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
260f4a2713aSLionel Sambuc if (*BufferPtr == '*') {
261f4a2713aSLionel Sambuc assert(BufferPtr + 1 != BufferEnd);
262f4a2713aSLionel Sambuc if (*(BufferPtr + 1) == '/')
263f4a2713aSLionel Sambuc return BufferPtr;
264f4a2713aSLionel Sambuc }
265f4a2713aSLionel Sambuc }
266f4a2713aSLionel Sambuc llvm_unreachable("buffer end hit before '*/' was seen");
267f4a2713aSLionel Sambuc }
268f4a2713aSLionel Sambuc
269f4a2713aSLionel Sambuc } // unnamed namespace
270f4a2713aSLionel Sambuc
formTokenWithChars(Token & Result,const char * TokEnd,tok::TokenKind Kind)271*0a6a1f1dSLionel Sambuc void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
272*0a6a1f1dSLionel Sambuc tok::TokenKind Kind) {
273*0a6a1f1dSLionel Sambuc const unsigned TokLen = TokEnd - BufferPtr;
274*0a6a1f1dSLionel Sambuc Result.setLocation(getSourceLocation(BufferPtr));
275*0a6a1f1dSLionel Sambuc Result.setKind(Kind);
276*0a6a1f1dSLionel Sambuc Result.setLength(TokLen);
277*0a6a1f1dSLionel Sambuc #ifndef NDEBUG
278*0a6a1f1dSLionel Sambuc Result.TextPtr = "<UNSET>";
279*0a6a1f1dSLionel Sambuc Result.IntVal = 7;
280*0a6a1f1dSLionel Sambuc #endif
281*0a6a1f1dSLionel Sambuc BufferPtr = TokEnd;
282*0a6a1f1dSLionel Sambuc }
283*0a6a1f1dSLionel Sambuc
lexCommentText(Token & T)284f4a2713aSLionel Sambuc void Lexer::lexCommentText(Token &T) {
285f4a2713aSLionel Sambuc assert(CommentState == LCS_InsideBCPLComment ||
286f4a2713aSLionel Sambuc CommentState == LCS_InsideCComment);
287f4a2713aSLionel Sambuc
288f4a2713aSLionel Sambuc switch (State) {
289f4a2713aSLionel Sambuc case LS_Normal:
290f4a2713aSLionel Sambuc break;
291f4a2713aSLionel Sambuc case LS_VerbatimBlockFirstLine:
292f4a2713aSLionel Sambuc lexVerbatimBlockFirstLine(T);
293f4a2713aSLionel Sambuc return;
294f4a2713aSLionel Sambuc case LS_VerbatimBlockBody:
295f4a2713aSLionel Sambuc lexVerbatimBlockBody(T);
296f4a2713aSLionel Sambuc return;
297f4a2713aSLionel Sambuc case LS_VerbatimLineText:
298f4a2713aSLionel Sambuc lexVerbatimLineText(T);
299f4a2713aSLionel Sambuc return;
300f4a2713aSLionel Sambuc case LS_HTMLStartTag:
301f4a2713aSLionel Sambuc lexHTMLStartTag(T);
302f4a2713aSLionel Sambuc return;
303f4a2713aSLionel Sambuc case LS_HTMLEndTag:
304f4a2713aSLionel Sambuc lexHTMLEndTag(T);
305f4a2713aSLionel Sambuc return;
306f4a2713aSLionel Sambuc }
307f4a2713aSLionel Sambuc
308f4a2713aSLionel Sambuc assert(State == LS_Normal);
309f4a2713aSLionel Sambuc
310f4a2713aSLionel Sambuc const char *TokenPtr = BufferPtr;
311f4a2713aSLionel Sambuc assert(TokenPtr < CommentEnd);
312f4a2713aSLionel Sambuc while (TokenPtr != CommentEnd) {
313f4a2713aSLionel Sambuc switch(*TokenPtr) {
314f4a2713aSLionel Sambuc case '\\':
315f4a2713aSLionel Sambuc case '@': {
316f4a2713aSLionel Sambuc // Commands that start with a backslash and commands that start with
317f4a2713aSLionel Sambuc // 'at' have equivalent semantics. But we keep information about the
318f4a2713aSLionel Sambuc // exact syntax in AST for comments.
319f4a2713aSLionel Sambuc tok::TokenKind CommandKind =
320f4a2713aSLionel Sambuc (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
321f4a2713aSLionel Sambuc TokenPtr++;
322f4a2713aSLionel Sambuc if (TokenPtr == CommentEnd) {
323f4a2713aSLionel Sambuc formTextToken(T, TokenPtr);
324f4a2713aSLionel Sambuc return;
325f4a2713aSLionel Sambuc }
326f4a2713aSLionel Sambuc char C = *TokenPtr;
327f4a2713aSLionel Sambuc switch (C) {
328f4a2713aSLionel Sambuc default:
329f4a2713aSLionel Sambuc break;
330f4a2713aSLionel Sambuc
331f4a2713aSLionel Sambuc case '\\': case '@': case '&': case '$':
332f4a2713aSLionel Sambuc case '#': case '<': case '>': case '%':
333f4a2713aSLionel Sambuc case '\"': case '.': case ':':
334f4a2713aSLionel Sambuc // This is one of \\ \@ \& \$ etc escape sequences.
335f4a2713aSLionel Sambuc TokenPtr++;
336f4a2713aSLionel Sambuc if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
337f4a2713aSLionel Sambuc // This is the \:: escape sequence.
338f4a2713aSLionel Sambuc TokenPtr++;
339f4a2713aSLionel Sambuc }
340f4a2713aSLionel Sambuc StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
341f4a2713aSLionel Sambuc formTokenWithChars(T, TokenPtr, tok::text);
342f4a2713aSLionel Sambuc T.setText(UnescapedText);
343f4a2713aSLionel Sambuc return;
344f4a2713aSLionel Sambuc }
345f4a2713aSLionel Sambuc
346f4a2713aSLionel Sambuc // Don't make zero-length commands.
347f4a2713aSLionel Sambuc if (!isCommandNameStartCharacter(*TokenPtr)) {
348f4a2713aSLionel Sambuc formTextToken(T, TokenPtr);
349f4a2713aSLionel Sambuc return;
350f4a2713aSLionel Sambuc }
351f4a2713aSLionel Sambuc
352f4a2713aSLionel Sambuc TokenPtr = skipCommandName(TokenPtr, CommentEnd);
353f4a2713aSLionel Sambuc unsigned Length = TokenPtr - (BufferPtr + 1);
354f4a2713aSLionel Sambuc
355f4a2713aSLionel Sambuc // Hardcoded support for lexing LaTeX formula commands
356f4a2713aSLionel Sambuc // \f$ \f[ \f] \f{ \f} as a single command.
357f4a2713aSLionel Sambuc if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
358f4a2713aSLionel Sambuc C = *TokenPtr;
359f4a2713aSLionel Sambuc if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
360f4a2713aSLionel Sambuc TokenPtr++;
361f4a2713aSLionel Sambuc Length++;
362f4a2713aSLionel Sambuc }
363f4a2713aSLionel Sambuc }
364f4a2713aSLionel Sambuc
365*0a6a1f1dSLionel Sambuc StringRef CommandName(BufferPtr + 1, Length);
366f4a2713aSLionel Sambuc
367f4a2713aSLionel Sambuc const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
368f4a2713aSLionel Sambuc if (!Info) {
369f4a2713aSLionel Sambuc if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
370f4a2713aSLionel Sambuc StringRef CorrectedName = Info->Name;
371*0a6a1f1dSLionel Sambuc SourceLocation Loc = getSourceLocation(BufferPtr);
372*0a6a1f1dSLionel Sambuc SourceRange CommandRange(Loc.getLocWithOffset(1),
373*0a6a1f1dSLionel Sambuc getSourceLocation(TokenPtr));
374*0a6a1f1dSLionel Sambuc Diag(Loc, diag::warn_correct_comment_command_name)
375f4a2713aSLionel Sambuc << CommandName << CorrectedName
376f4a2713aSLionel Sambuc << FixItHint::CreateReplacement(CommandRange, CorrectedName);
377f4a2713aSLionel Sambuc } else {
378*0a6a1f1dSLionel Sambuc formTokenWithChars(T, TokenPtr, tok::unknown_command);
379*0a6a1f1dSLionel Sambuc T.setUnknownCommandName(CommandName);
380f4a2713aSLionel Sambuc Diag(T.getLocation(), diag::warn_unknown_comment_command_name);
381f4a2713aSLionel Sambuc return;
382f4a2713aSLionel Sambuc }
383f4a2713aSLionel Sambuc }
384f4a2713aSLionel Sambuc if (Info->IsVerbatimBlockCommand) {
385f4a2713aSLionel Sambuc setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
386f4a2713aSLionel Sambuc return;
387f4a2713aSLionel Sambuc }
388f4a2713aSLionel Sambuc if (Info->IsVerbatimLineCommand) {
389f4a2713aSLionel Sambuc setupAndLexVerbatimLine(T, TokenPtr, Info);
390f4a2713aSLionel Sambuc return;
391f4a2713aSLionel Sambuc }
392f4a2713aSLionel Sambuc formTokenWithChars(T, TokenPtr, CommandKind);
393f4a2713aSLionel Sambuc T.setCommandID(Info->getID());
394f4a2713aSLionel Sambuc return;
395f4a2713aSLionel Sambuc }
396f4a2713aSLionel Sambuc
397f4a2713aSLionel Sambuc case '&':
398f4a2713aSLionel Sambuc lexHTMLCharacterReference(T);
399f4a2713aSLionel Sambuc return;
400f4a2713aSLionel Sambuc
401f4a2713aSLionel Sambuc case '<': {
402f4a2713aSLionel Sambuc TokenPtr++;
403f4a2713aSLionel Sambuc if (TokenPtr == CommentEnd) {
404f4a2713aSLionel Sambuc formTextToken(T, TokenPtr);
405f4a2713aSLionel Sambuc return;
406f4a2713aSLionel Sambuc }
407f4a2713aSLionel Sambuc const char C = *TokenPtr;
408f4a2713aSLionel Sambuc if (isHTMLIdentifierStartingCharacter(C))
409f4a2713aSLionel Sambuc setupAndLexHTMLStartTag(T);
410f4a2713aSLionel Sambuc else if (C == '/')
411f4a2713aSLionel Sambuc setupAndLexHTMLEndTag(T);
412f4a2713aSLionel Sambuc else
413f4a2713aSLionel Sambuc formTextToken(T, TokenPtr);
414f4a2713aSLionel Sambuc
415f4a2713aSLionel Sambuc return;
416f4a2713aSLionel Sambuc }
417f4a2713aSLionel Sambuc
418f4a2713aSLionel Sambuc case '\n':
419f4a2713aSLionel Sambuc case '\r':
420f4a2713aSLionel Sambuc TokenPtr = skipNewline(TokenPtr, CommentEnd);
421f4a2713aSLionel Sambuc formTokenWithChars(T, TokenPtr, tok::newline);
422f4a2713aSLionel Sambuc
423f4a2713aSLionel Sambuc if (CommentState == LCS_InsideCComment)
424f4a2713aSLionel Sambuc skipLineStartingDecorations();
425f4a2713aSLionel Sambuc return;
426f4a2713aSLionel Sambuc
427f4a2713aSLionel Sambuc default: {
428f4a2713aSLionel Sambuc size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
429f4a2713aSLionel Sambuc find_first_of("\n\r\\@&<");
430f4a2713aSLionel Sambuc if (End != StringRef::npos)
431f4a2713aSLionel Sambuc TokenPtr += End;
432f4a2713aSLionel Sambuc else
433f4a2713aSLionel Sambuc TokenPtr = CommentEnd;
434f4a2713aSLionel Sambuc formTextToken(T, TokenPtr);
435f4a2713aSLionel Sambuc return;
436f4a2713aSLionel Sambuc }
437f4a2713aSLionel Sambuc }
438f4a2713aSLionel Sambuc }
439f4a2713aSLionel Sambuc }
440f4a2713aSLionel Sambuc
setupAndLexVerbatimBlock(Token & T,const char * TextBegin,char Marker,const CommandInfo * Info)441f4a2713aSLionel Sambuc void Lexer::setupAndLexVerbatimBlock(Token &T,
442f4a2713aSLionel Sambuc const char *TextBegin,
443f4a2713aSLionel Sambuc char Marker, const CommandInfo *Info) {
444f4a2713aSLionel Sambuc assert(Info->IsVerbatimBlockCommand);
445f4a2713aSLionel Sambuc
446f4a2713aSLionel Sambuc VerbatimBlockEndCommandName.clear();
447f4a2713aSLionel Sambuc VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
448f4a2713aSLionel Sambuc VerbatimBlockEndCommandName.append(Info->EndCommandName);
449f4a2713aSLionel Sambuc
450f4a2713aSLionel Sambuc formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
451f4a2713aSLionel Sambuc T.setVerbatimBlockID(Info->getID());
452f4a2713aSLionel Sambuc
453f4a2713aSLionel Sambuc // If there is a newline following the verbatim opening command, skip the
454f4a2713aSLionel Sambuc // newline so that we don't create an tok::verbatim_block_line with empty
455f4a2713aSLionel Sambuc // text content.
456f4a2713aSLionel Sambuc if (BufferPtr != CommentEnd &&
457f4a2713aSLionel Sambuc isVerticalWhitespace(*BufferPtr)) {
458f4a2713aSLionel Sambuc BufferPtr = skipNewline(BufferPtr, CommentEnd);
459f4a2713aSLionel Sambuc State = LS_VerbatimBlockBody;
460f4a2713aSLionel Sambuc return;
461f4a2713aSLionel Sambuc }
462f4a2713aSLionel Sambuc
463f4a2713aSLionel Sambuc State = LS_VerbatimBlockFirstLine;
464f4a2713aSLionel Sambuc }
465f4a2713aSLionel Sambuc
lexVerbatimBlockFirstLine(Token & T)466f4a2713aSLionel Sambuc void Lexer::lexVerbatimBlockFirstLine(Token &T) {
467f4a2713aSLionel Sambuc again:
468f4a2713aSLionel Sambuc assert(BufferPtr < CommentEnd);
469f4a2713aSLionel Sambuc
470f4a2713aSLionel Sambuc // FIXME: It would be better to scan the text once, finding either the block
471f4a2713aSLionel Sambuc // end command or newline.
472f4a2713aSLionel Sambuc //
473f4a2713aSLionel Sambuc // Extract current line.
474f4a2713aSLionel Sambuc const char *Newline = findNewline(BufferPtr, CommentEnd);
475f4a2713aSLionel Sambuc StringRef Line(BufferPtr, Newline - BufferPtr);
476f4a2713aSLionel Sambuc
477f4a2713aSLionel Sambuc // Look for end command in current line.
478f4a2713aSLionel Sambuc size_t Pos = Line.find(VerbatimBlockEndCommandName);
479f4a2713aSLionel Sambuc const char *TextEnd;
480f4a2713aSLionel Sambuc const char *NextLine;
481f4a2713aSLionel Sambuc if (Pos == StringRef::npos) {
482f4a2713aSLionel Sambuc // Current line is completely verbatim.
483f4a2713aSLionel Sambuc TextEnd = Newline;
484f4a2713aSLionel Sambuc NextLine = skipNewline(Newline, CommentEnd);
485f4a2713aSLionel Sambuc } else if (Pos == 0) {
486f4a2713aSLionel Sambuc // Current line contains just an end command.
487f4a2713aSLionel Sambuc const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
488f4a2713aSLionel Sambuc StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
489f4a2713aSLionel Sambuc formTokenWithChars(T, End, tok::verbatim_block_end);
490f4a2713aSLionel Sambuc T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
491f4a2713aSLionel Sambuc State = LS_Normal;
492f4a2713aSLionel Sambuc return;
493f4a2713aSLionel Sambuc } else {
494f4a2713aSLionel Sambuc // There is some text, followed by end command. Extract text first.
495f4a2713aSLionel Sambuc TextEnd = BufferPtr + Pos;
496f4a2713aSLionel Sambuc NextLine = TextEnd;
497f4a2713aSLionel Sambuc // If there is only whitespace before end command, skip whitespace.
498f4a2713aSLionel Sambuc if (isWhitespace(BufferPtr, TextEnd)) {
499f4a2713aSLionel Sambuc BufferPtr = TextEnd;
500f4a2713aSLionel Sambuc goto again;
501f4a2713aSLionel Sambuc }
502f4a2713aSLionel Sambuc }
503f4a2713aSLionel Sambuc
504f4a2713aSLionel Sambuc StringRef Text(BufferPtr, TextEnd - BufferPtr);
505f4a2713aSLionel Sambuc formTokenWithChars(T, NextLine, tok::verbatim_block_line);
506f4a2713aSLionel Sambuc T.setVerbatimBlockText(Text);
507f4a2713aSLionel Sambuc
508f4a2713aSLionel Sambuc State = LS_VerbatimBlockBody;
509f4a2713aSLionel Sambuc }
510f4a2713aSLionel Sambuc
lexVerbatimBlockBody(Token & T)511f4a2713aSLionel Sambuc void Lexer::lexVerbatimBlockBody(Token &T) {
512f4a2713aSLionel Sambuc assert(State == LS_VerbatimBlockBody);
513f4a2713aSLionel Sambuc
514f4a2713aSLionel Sambuc if (CommentState == LCS_InsideCComment)
515f4a2713aSLionel Sambuc skipLineStartingDecorations();
516f4a2713aSLionel Sambuc
517f4a2713aSLionel Sambuc lexVerbatimBlockFirstLine(T);
518f4a2713aSLionel Sambuc }
519f4a2713aSLionel Sambuc
setupAndLexVerbatimLine(Token & T,const char * TextBegin,const CommandInfo * Info)520f4a2713aSLionel Sambuc void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
521f4a2713aSLionel Sambuc const CommandInfo *Info) {
522f4a2713aSLionel Sambuc assert(Info->IsVerbatimLineCommand);
523f4a2713aSLionel Sambuc formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
524f4a2713aSLionel Sambuc T.setVerbatimLineID(Info->getID());
525f4a2713aSLionel Sambuc
526f4a2713aSLionel Sambuc State = LS_VerbatimLineText;
527f4a2713aSLionel Sambuc }
528f4a2713aSLionel Sambuc
lexVerbatimLineText(Token & T)529f4a2713aSLionel Sambuc void Lexer::lexVerbatimLineText(Token &T) {
530f4a2713aSLionel Sambuc assert(State == LS_VerbatimLineText);
531f4a2713aSLionel Sambuc
532f4a2713aSLionel Sambuc // Extract current line.
533f4a2713aSLionel Sambuc const char *Newline = findNewline(BufferPtr, CommentEnd);
534*0a6a1f1dSLionel Sambuc StringRef Text(BufferPtr, Newline - BufferPtr);
535f4a2713aSLionel Sambuc formTokenWithChars(T, Newline, tok::verbatim_line_text);
536f4a2713aSLionel Sambuc T.setVerbatimLineText(Text);
537f4a2713aSLionel Sambuc
538f4a2713aSLionel Sambuc State = LS_Normal;
539f4a2713aSLionel Sambuc }
540f4a2713aSLionel Sambuc
lexHTMLCharacterReference(Token & T)541f4a2713aSLionel Sambuc void Lexer::lexHTMLCharacterReference(Token &T) {
542f4a2713aSLionel Sambuc const char *TokenPtr = BufferPtr;
543f4a2713aSLionel Sambuc assert(*TokenPtr == '&');
544f4a2713aSLionel Sambuc TokenPtr++;
545f4a2713aSLionel Sambuc if (TokenPtr == CommentEnd) {
546f4a2713aSLionel Sambuc formTextToken(T, TokenPtr);
547f4a2713aSLionel Sambuc return;
548f4a2713aSLionel Sambuc }
549f4a2713aSLionel Sambuc const char *NamePtr;
550f4a2713aSLionel Sambuc bool isNamed = false;
551f4a2713aSLionel Sambuc bool isDecimal = false;
552f4a2713aSLionel Sambuc char C = *TokenPtr;
553f4a2713aSLionel Sambuc if (isHTMLNamedCharacterReferenceCharacter(C)) {
554f4a2713aSLionel Sambuc NamePtr = TokenPtr;
555f4a2713aSLionel Sambuc TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
556f4a2713aSLionel Sambuc isNamed = true;
557f4a2713aSLionel Sambuc } else if (C == '#') {
558f4a2713aSLionel Sambuc TokenPtr++;
559f4a2713aSLionel Sambuc if (TokenPtr == CommentEnd) {
560f4a2713aSLionel Sambuc formTextToken(T, TokenPtr);
561f4a2713aSLionel Sambuc return;
562f4a2713aSLionel Sambuc }
563f4a2713aSLionel Sambuc C = *TokenPtr;
564f4a2713aSLionel Sambuc if (isHTMLDecimalCharacterReferenceCharacter(C)) {
565f4a2713aSLionel Sambuc NamePtr = TokenPtr;
566f4a2713aSLionel Sambuc TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
567f4a2713aSLionel Sambuc isDecimal = true;
568f4a2713aSLionel Sambuc } else if (C == 'x' || C == 'X') {
569f4a2713aSLionel Sambuc TokenPtr++;
570f4a2713aSLionel Sambuc NamePtr = TokenPtr;
571f4a2713aSLionel Sambuc TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
572f4a2713aSLionel Sambuc } else {
573f4a2713aSLionel Sambuc formTextToken(T, TokenPtr);
574f4a2713aSLionel Sambuc return;
575f4a2713aSLionel Sambuc }
576f4a2713aSLionel Sambuc } else {
577f4a2713aSLionel Sambuc formTextToken(T, TokenPtr);
578f4a2713aSLionel Sambuc return;
579f4a2713aSLionel Sambuc }
580f4a2713aSLionel Sambuc if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
581f4a2713aSLionel Sambuc *TokenPtr != ';') {
582f4a2713aSLionel Sambuc formTextToken(T, TokenPtr);
583f4a2713aSLionel Sambuc return;
584f4a2713aSLionel Sambuc }
585f4a2713aSLionel Sambuc StringRef Name(NamePtr, TokenPtr - NamePtr);
586f4a2713aSLionel Sambuc TokenPtr++; // Skip semicolon.
587f4a2713aSLionel Sambuc StringRef Resolved;
588f4a2713aSLionel Sambuc if (isNamed)
589f4a2713aSLionel Sambuc Resolved = resolveHTMLNamedCharacterReference(Name);
590f4a2713aSLionel Sambuc else if (isDecimal)
591f4a2713aSLionel Sambuc Resolved = resolveHTMLDecimalCharacterReference(Name);
592f4a2713aSLionel Sambuc else
593f4a2713aSLionel Sambuc Resolved = resolveHTMLHexCharacterReference(Name);
594f4a2713aSLionel Sambuc
595f4a2713aSLionel Sambuc if (Resolved.empty()) {
596f4a2713aSLionel Sambuc formTextToken(T, TokenPtr);
597f4a2713aSLionel Sambuc return;
598f4a2713aSLionel Sambuc }
599f4a2713aSLionel Sambuc formTokenWithChars(T, TokenPtr, tok::text);
600f4a2713aSLionel Sambuc T.setText(Resolved);
601f4a2713aSLionel Sambuc return;
602f4a2713aSLionel Sambuc }
603f4a2713aSLionel Sambuc
setupAndLexHTMLStartTag(Token & T)604f4a2713aSLionel Sambuc void Lexer::setupAndLexHTMLStartTag(Token &T) {
605f4a2713aSLionel Sambuc assert(BufferPtr[0] == '<' &&
606f4a2713aSLionel Sambuc isHTMLIdentifierStartingCharacter(BufferPtr[1]));
607f4a2713aSLionel Sambuc const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
608f4a2713aSLionel Sambuc StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
609f4a2713aSLionel Sambuc if (!isHTMLTagName(Name)) {
610f4a2713aSLionel Sambuc formTextToken(T, TagNameEnd);
611f4a2713aSLionel Sambuc return;
612f4a2713aSLionel Sambuc }
613f4a2713aSLionel Sambuc
614f4a2713aSLionel Sambuc formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
615f4a2713aSLionel Sambuc T.setHTMLTagStartName(Name);
616f4a2713aSLionel Sambuc
617f4a2713aSLionel Sambuc BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
618f4a2713aSLionel Sambuc
619f4a2713aSLionel Sambuc const char C = *BufferPtr;
620f4a2713aSLionel Sambuc if (BufferPtr != CommentEnd &&
621f4a2713aSLionel Sambuc (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
622f4a2713aSLionel Sambuc State = LS_HTMLStartTag;
623f4a2713aSLionel Sambuc }
624f4a2713aSLionel Sambuc
lexHTMLStartTag(Token & T)625f4a2713aSLionel Sambuc void Lexer::lexHTMLStartTag(Token &T) {
626f4a2713aSLionel Sambuc assert(State == LS_HTMLStartTag);
627f4a2713aSLionel Sambuc
628f4a2713aSLionel Sambuc const char *TokenPtr = BufferPtr;
629f4a2713aSLionel Sambuc char C = *TokenPtr;
630f4a2713aSLionel Sambuc if (isHTMLIdentifierCharacter(C)) {
631f4a2713aSLionel Sambuc TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
632f4a2713aSLionel Sambuc StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
633f4a2713aSLionel Sambuc formTokenWithChars(T, TokenPtr, tok::html_ident);
634f4a2713aSLionel Sambuc T.setHTMLIdent(Ident);
635f4a2713aSLionel Sambuc } else {
636f4a2713aSLionel Sambuc switch (C) {
637f4a2713aSLionel Sambuc case '=':
638f4a2713aSLionel Sambuc TokenPtr++;
639f4a2713aSLionel Sambuc formTokenWithChars(T, TokenPtr, tok::html_equals);
640f4a2713aSLionel Sambuc break;
641f4a2713aSLionel Sambuc case '\"':
642f4a2713aSLionel Sambuc case '\'': {
643f4a2713aSLionel Sambuc const char *OpenQuote = TokenPtr;
644f4a2713aSLionel Sambuc TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
645f4a2713aSLionel Sambuc const char *ClosingQuote = TokenPtr;
646f4a2713aSLionel Sambuc if (TokenPtr != CommentEnd) // Skip closing quote.
647f4a2713aSLionel Sambuc TokenPtr++;
648f4a2713aSLionel Sambuc formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
649f4a2713aSLionel Sambuc T.setHTMLQuotedString(StringRef(OpenQuote + 1,
650f4a2713aSLionel Sambuc ClosingQuote - (OpenQuote + 1)));
651f4a2713aSLionel Sambuc break;
652f4a2713aSLionel Sambuc }
653f4a2713aSLionel Sambuc case '>':
654f4a2713aSLionel Sambuc TokenPtr++;
655f4a2713aSLionel Sambuc formTokenWithChars(T, TokenPtr, tok::html_greater);
656f4a2713aSLionel Sambuc State = LS_Normal;
657f4a2713aSLionel Sambuc return;
658f4a2713aSLionel Sambuc case '/':
659f4a2713aSLionel Sambuc TokenPtr++;
660f4a2713aSLionel Sambuc if (TokenPtr != CommentEnd && *TokenPtr == '>') {
661f4a2713aSLionel Sambuc TokenPtr++;
662f4a2713aSLionel Sambuc formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
663f4a2713aSLionel Sambuc } else
664f4a2713aSLionel Sambuc formTextToken(T, TokenPtr);
665f4a2713aSLionel Sambuc
666f4a2713aSLionel Sambuc State = LS_Normal;
667f4a2713aSLionel Sambuc return;
668f4a2713aSLionel Sambuc }
669f4a2713aSLionel Sambuc }
670f4a2713aSLionel Sambuc
671f4a2713aSLionel Sambuc // Now look ahead and return to normal state if we don't see any HTML tokens
672f4a2713aSLionel Sambuc // ahead.
673f4a2713aSLionel Sambuc BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
674f4a2713aSLionel Sambuc if (BufferPtr == CommentEnd) {
675f4a2713aSLionel Sambuc State = LS_Normal;
676f4a2713aSLionel Sambuc return;
677f4a2713aSLionel Sambuc }
678f4a2713aSLionel Sambuc
679f4a2713aSLionel Sambuc C = *BufferPtr;
680f4a2713aSLionel Sambuc if (!isHTMLIdentifierStartingCharacter(C) &&
681f4a2713aSLionel Sambuc C != '=' && C != '\"' && C != '\'' && C != '>') {
682f4a2713aSLionel Sambuc State = LS_Normal;
683f4a2713aSLionel Sambuc return;
684f4a2713aSLionel Sambuc }
685f4a2713aSLionel Sambuc }
686f4a2713aSLionel Sambuc
setupAndLexHTMLEndTag(Token & T)687f4a2713aSLionel Sambuc void Lexer::setupAndLexHTMLEndTag(Token &T) {
688f4a2713aSLionel Sambuc assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
689f4a2713aSLionel Sambuc
690f4a2713aSLionel Sambuc const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
691f4a2713aSLionel Sambuc const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
692f4a2713aSLionel Sambuc StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
693f4a2713aSLionel Sambuc if (!isHTMLTagName(Name)) {
694f4a2713aSLionel Sambuc formTextToken(T, TagNameEnd);
695f4a2713aSLionel Sambuc return;
696f4a2713aSLionel Sambuc }
697f4a2713aSLionel Sambuc
698f4a2713aSLionel Sambuc const char *End = skipWhitespace(TagNameEnd, CommentEnd);
699f4a2713aSLionel Sambuc
700f4a2713aSLionel Sambuc formTokenWithChars(T, End, tok::html_end_tag);
701f4a2713aSLionel Sambuc T.setHTMLTagEndName(Name);
702f4a2713aSLionel Sambuc
703f4a2713aSLionel Sambuc if (BufferPtr != CommentEnd && *BufferPtr == '>')
704f4a2713aSLionel Sambuc State = LS_HTMLEndTag;
705f4a2713aSLionel Sambuc }
706f4a2713aSLionel Sambuc
lexHTMLEndTag(Token & T)707f4a2713aSLionel Sambuc void Lexer::lexHTMLEndTag(Token &T) {
708f4a2713aSLionel Sambuc assert(BufferPtr != CommentEnd && *BufferPtr == '>');
709f4a2713aSLionel Sambuc
710f4a2713aSLionel Sambuc formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
711f4a2713aSLionel Sambuc State = LS_Normal;
712f4a2713aSLionel Sambuc }
713f4a2713aSLionel Sambuc
Lexer(llvm::BumpPtrAllocator & Allocator,DiagnosticsEngine & Diags,const CommandTraits & Traits,SourceLocation FileLoc,const char * BufferStart,const char * BufferEnd)714f4a2713aSLionel Sambuc Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
715f4a2713aSLionel Sambuc const CommandTraits &Traits,
716f4a2713aSLionel Sambuc SourceLocation FileLoc,
717f4a2713aSLionel Sambuc const char *BufferStart, const char *BufferEnd):
718f4a2713aSLionel Sambuc Allocator(Allocator), Diags(Diags), Traits(Traits),
719f4a2713aSLionel Sambuc BufferStart(BufferStart), BufferEnd(BufferEnd),
720f4a2713aSLionel Sambuc FileLoc(FileLoc), BufferPtr(BufferStart),
721f4a2713aSLionel Sambuc CommentState(LCS_BeforeComment), State(LS_Normal) {
722f4a2713aSLionel Sambuc }
723f4a2713aSLionel Sambuc
lex(Token & T)724f4a2713aSLionel Sambuc void Lexer::lex(Token &T) {
725f4a2713aSLionel Sambuc again:
726f4a2713aSLionel Sambuc switch (CommentState) {
727f4a2713aSLionel Sambuc case LCS_BeforeComment:
728f4a2713aSLionel Sambuc if (BufferPtr == BufferEnd) {
729f4a2713aSLionel Sambuc formTokenWithChars(T, BufferPtr, tok::eof);
730f4a2713aSLionel Sambuc return;
731f4a2713aSLionel Sambuc }
732f4a2713aSLionel Sambuc
733f4a2713aSLionel Sambuc assert(*BufferPtr == '/');
734f4a2713aSLionel Sambuc BufferPtr++; // Skip first slash.
735f4a2713aSLionel Sambuc switch(*BufferPtr) {
736f4a2713aSLionel Sambuc case '/': { // BCPL comment.
737f4a2713aSLionel Sambuc BufferPtr++; // Skip second slash.
738f4a2713aSLionel Sambuc
739f4a2713aSLionel Sambuc if (BufferPtr != BufferEnd) {
740f4a2713aSLionel Sambuc // Skip Doxygen magic marker, if it is present.
741f4a2713aSLionel Sambuc // It might be missing because of a typo //< or /*<, or because we
742f4a2713aSLionel Sambuc // merged this non-Doxygen comment into a bunch of Doxygen comments
743f4a2713aSLionel Sambuc // around it: /** ... */ /* ... */ /** ... */
744f4a2713aSLionel Sambuc const char C = *BufferPtr;
745f4a2713aSLionel Sambuc if (C == '/' || C == '!')
746f4a2713aSLionel Sambuc BufferPtr++;
747f4a2713aSLionel Sambuc }
748f4a2713aSLionel Sambuc
749f4a2713aSLionel Sambuc // Skip less-than symbol that marks trailing comments.
750f4a2713aSLionel Sambuc // Skip it even if the comment is not a Doxygen one, because //< and /*<
751f4a2713aSLionel Sambuc // are frequent typos.
752f4a2713aSLionel Sambuc if (BufferPtr != BufferEnd && *BufferPtr == '<')
753f4a2713aSLionel Sambuc BufferPtr++;
754f4a2713aSLionel Sambuc
755f4a2713aSLionel Sambuc CommentState = LCS_InsideBCPLComment;
756f4a2713aSLionel Sambuc if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
757f4a2713aSLionel Sambuc State = LS_Normal;
758f4a2713aSLionel Sambuc CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
759f4a2713aSLionel Sambuc goto again;
760f4a2713aSLionel Sambuc }
761f4a2713aSLionel Sambuc case '*': { // C comment.
762f4a2713aSLionel Sambuc BufferPtr++; // Skip star.
763f4a2713aSLionel Sambuc
764f4a2713aSLionel Sambuc // Skip Doxygen magic marker.
765f4a2713aSLionel Sambuc const char C = *BufferPtr;
766f4a2713aSLionel Sambuc if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
767f4a2713aSLionel Sambuc BufferPtr++;
768f4a2713aSLionel Sambuc
769f4a2713aSLionel Sambuc // Skip less-than symbol that marks trailing comments.
770f4a2713aSLionel Sambuc if (BufferPtr != BufferEnd && *BufferPtr == '<')
771f4a2713aSLionel Sambuc BufferPtr++;
772f4a2713aSLionel Sambuc
773f4a2713aSLionel Sambuc CommentState = LCS_InsideCComment;
774f4a2713aSLionel Sambuc State = LS_Normal;
775f4a2713aSLionel Sambuc CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
776f4a2713aSLionel Sambuc goto again;
777f4a2713aSLionel Sambuc }
778f4a2713aSLionel Sambuc default:
779f4a2713aSLionel Sambuc llvm_unreachable("second character of comment should be '/' or '*'");
780f4a2713aSLionel Sambuc }
781f4a2713aSLionel Sambuc
782f4a2713aSLionel Sambuc case LCS_BetweenComments: {
783f4a2713aSLionel Sambuc // Consecutive comments are extracted only if there is only whitespace
784f4a2713aSLionel Sambuc // between them. So we can search for the start of the next comment.
785f4a2713aSLionel Sambuc const char *EndWhitespace = BufferPtr;
786f4a2713aSLionel Sambuc while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
787f4a2713aSLionel Sambuc EndWhitespace++;
788f4a2713aSLionel Sambuc
789f4a2713aSLionel Sambuc // Turn any whitespace between comments (and there is only whitespace
790f4a2713aSLionel Sambuc // between them -- guaranteed by comment extraction) into a newline. We
791f4a2713aSLionel Sambuc // have two newlines between C comments in total (first one was synthesized
792f4a2713aSLionel Sambuc // after a comment).
793f4a2713aSLionel Sambuc formTokenWithChars(T, EndWhitespace, tok::newline);
794f4a2713aSLionel Sambuc
795f4a2713aSLionel Sambuc CommentState = LCS_BeforeComment;
796f4a2713aSLionel Sambuc break;
797f4a2713aSLionel Sambuc }
798f4a2713aSLionel Sambuc
799f4a2713aSLionel Sambuc case LCS_InsideBCPLComment:
800f4a2713aSLionel Sambuc case LCS_InsideCComment:
801f4a2713aSLionel Sambuc if (BufferPtr != CommentEnd) {
802f4a2713aSLionel Sambuc lexCommentText(T);
803f4a2713aSLionel Sambuc break;
804f4a2713aSLionel Sambuc } else {
805f4a2713aSLionel Sambuc // Skip C comment closing sequence.
806f4a2713aSLionel Sambuc if (CommentState == LCS_InsideCComment) {
807f4a2713aSLionel Sambuc assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
808f4a2713aSLionel Sambuc BufferPtr += 2;
809f4a2713aSLionel Sambuc assert(BufferPtr <= BufferEnd);
810f4a2713aSLionel Sambuc
811f4a2713aSLionel Sambuc // Synthenize newline just after the C comment, regardless if there is
812f4a2713aSLionel Sambuc // actually a newline.
813f4a2713aSLionel Sambuc formTokenWithChars(T, BufferPtr, tok::newline);
814f4a2713aSLionel Sambuc
815f4a2713aSLionel Sambuc CommentState = LCS_BetweenComments;
816f4a2713aSLionel Sambuc break;
817f4a2713aSLionel Sambuc } else {
818f4a2713aSLionel Sambuc // Don't synthesized a newline after BCPL comment.
819f4a2713aSLionel Sambuc CommentState = LCS_BetweenComments;
820f4a2713aSLionel Sambuc goto again;
821f4a2713aSLionel Sambuc }
822f4a2713aSLionel Sambuc }
823f4a2713aSLionel Sambuc }
824f4a2713aSLionel Sambuc }
825f4a2713aSLionel Sambuc
getSpelling(const Token & Tok,const SourceManager & SourceMgr,bool * Invalid) const826f4a2713aSLionel Sambuc StringRef Lexer::getSpelling(const Token &Tok,
827f4a2713aSLionel Sambuc const SourceManager &SourceMgr,
828f4a2713aSLionel Sambuc bool *Invalid) const {
829f4a2713aSLionel Sambuc SourceLocation Loc = Tok.getLocation();
830f4a2713aSLionel Sambuc std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
831f4a2713aSLionel Sambuc
832f4a2713aSLionel Sambuc bool InvalidTemp = false;
833f4a2713aSLionel Sambuc StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
834f4a2713aSLionel Sambuc if (InvalidTemp) {
835f4a2713aSLionel Sambuc *Invalid = true;
836f4a2713aSLionel Sambuc return StringRef();
837f4a2713aSLionel Sambuc }
838f4a2713aSLionel Sambuc
839f4a2713aSLionel Sambuc const char *Begin = File.data() + LocInfo.second;
840f4a2713aSLionel Sambuc return StringRef(Begin, Tok.getLength());
841f4a2713aSLionel Sambuc }
842f4a2713aSLionel Sambuc
843f4a2713aSLionel Sambuc } // end namespace comments
844f4a2713aSLionel Sambuc } // end namespace clang
845f4a2713aSLionel Sambuc
846