xref: /openbsd-src/gnu/llvm/clang/lib/Lex/LiteralSupport.cpp (revision e5dd70708596ae51455a0ffa086a00c5b29f8583)
1*e5dd7070Spatrick //===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
2*e5dd7070Spatrick //
3*e5dd7070Spatrick // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4*e5dd7070Spatrick // See https://llvm.org/LICENSE.txt for license information.
5*e5dd7070Spatrick // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6*e5dd7070Spatrick //
7*e5dd7070Spatrick //===----------------------------------------------------------------------===//
8*e5dd7070Spatrick //
9*e5dd7070Spatrick // This file implements the NumericLiteralParser, CharLiteralParser, and
10*e5dd7070Spatrick // StringLiteralParser interfaces.
11*e5dd7070Spatrick //
12*e5dd7070Spatrick //===----------------------------------------------------------------------===//
13*e5dd7070Spatrick 
14*e5dd7070Spatrick #include "clang/Lex/LiteralSupport.h"
15*e5dd7070Spatrick #include "clang/Basic/CharInfo.h"
16*e5dd7070Spatrick #include "clang/Basic/LangOptions.h"
17*e5dd7070Spatrick #include "clang/Basic/SourceLocation.h"
18*e5dd7070Spatrick #include "clang/Basic/TargetInfo.h"
19*e5dd7070Spatrick #include "clang/Lex/LexDiagnostic.h"
20*e5dd7070Spatrick #include "clang/Lex/Lexer.h"
21*e5dd7070Spatrick #include "clang/Lex/Preprocessor.h"
22*e5dd7070Spatrick #include "clang/Lex/Token.h"
23*e5dd7070Spatrick #include "llvm/ADT/APInt.h"
24*e5dd7070Spatrick #include "llvm/ADT/SmallVector.h"
25*e5dd7070Spatrick #include "llvm/ADT/StringExtras.h"
26*e5dd7070Spatrick #include "llvm/ADT/StringSwitch.h"
27*e5dd7070Spatrick #include "llvm/Support/ConvertUTF.h"
28*e5dd7070Spatrick #include "llvm/Support/ErrorHandling.h"
29*e5dd7070Spatrick #include <algorithm>
30*e5dd7070Spatrick #include <cassert>
31*e5dd7070Spatrick #include <cstddef>
32*e5dd7070Spatrick #include <cstdint>
33*e5dd7070Spatrick #include <cstring>
34*e5dd7070Spatrick #include <string>
35*e5dd7070Spatrick 
36*e5dd7070Spatrick using namespace clang;
37*e5dd7070Spatrick 
38*e5dd7070Spatrick static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
39*e5dd7070Spatrick   switch (kind) {
40*e5dd7070Spatrick   default: llvm_unreachable("Unknown token type!");
41*e5dd7070Spatrick   case tok::char_constant:
42*e5dd7070Spatrick   case tok::string_literal:
43*e5dd7070Spatrick   case tok::utf8_char_constant:
44*e5dd7070Spatrick   case tok::utf8_string_literal:
45*e5dd7070Spatrick     return Target.getCharWidth();
46*e5dd7070Spatrick   case tok::wide_char_constant:
47*e5dd7070Spatrick   case tok::wide_string_literal:
48*e5dd7070Spatrick     return Target.getWCharWidth();
49*e5dd7070Spatrick   case tok::utf16_char_constant:
50*e5dd7070Spatrick   case tok::utf16_string_literal:
51*e5dd7070Spatrick     return Target.getChar16Width();
52*e5dd7070Spatrick   case tok::utf32_char_constant:
53*e5dd7070Spatrick   case tok::utf32_string_literal:
54*e5dd7070Spatrick     return Target.getChar32Width();
55*e5dd7070Spatrick   }
56*e5dd7070Spatrick }
57*e5dd7070Spatrick 
58*e5dd7070Spatrick static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
59*e5dd7070Spatrick                                            FullSourceLoc TokLoc,
60*e5dd7070Spatrick                                            const char *TokBegin,
61*e5dd7070Spatrick                                            const char *TokRangeBegin,
62*e5dd7070Spatrick                                            const char *TokRangeEnd) {
63*e5dd7070Spatrick   SourceLocation Begin =
64*e5dd7070Spatrick     Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
65*e5dd7070Spatrick                                    TokLoc.getManager(), Features);
66*e5dd7070Spatrick   SourceLocation End =
67*e5dd7070Spatrick     Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
68*e5dd7070Spatrick                                    TokLoc.getManager(), Features);
69*e5dd7070Spatrick   return CharSourceRange::getCharRange(Begin, End);
70*e5dd7070Spatrick }
71*e5dd7070Spatrick 
72*e5dd7070Spatrick /// Produce a diagnostic highlighting some portion of a literal.
73*e5dd7070Spatrick ///
74*e5dd7070Spatrick /// Emits the diagnostic \p DiagID, highlighting the range of characters from
75*e5dd7070Spatrick /// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
76*e5dd7070Spatrick /// a substring of a spelling buffer for the token beginning at \p TokBegin.
77*e5dd7070Spatrick static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
78*e5dd7070Spatrick                               const LangOptions &Features, FullSourceLoc TokLoc,
79*e5dd7070Spatrick                               const char *TokBegin, const char *TokRangeBegin,
80*e5dd7070Spatrick                               const char *TokRangeEnd, unsigned DiagID) {
81*e5dd7070Spatrick   SourceLocation Begin =
82*e5dd7070Spatrick     Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
83*e5dd7070Spatrick                                    TokLoc.getManager(), Features);
84*e5dd7070Spatrick   return Diags->Report(Begin, DiagID) <<
85*e5dd7070Spatrick     MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
86*e5dd7070Spatrick }
87*e5dd7070Spatrick 
88*e5dd7070Spatrick /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
89*e5dd7070Spatrick /// either a character or a string literal.
90*e5dd7070Spatrick static unsigned ProcessCharEscape(const char *ThisTokBegin,
91*e5dd7070Spatrick                                   const char *&ThisTokBuf,
92*e5dd7070Spatrick                                   const char *ThisTokEnd, bool &HadError,
93*e5dd7070Spatrick                                   FullSourceLoc Loc, unsigned CharWidth,
94*e5dd7070Spatrick                                   DiagnosticsEngine *Diags,
95*e5dd7070Spatrick                                   const LangOptions &Features) {
96*e5dd7070Spatrick   const char *EscapeBegin = ThisTokBuf;
97*e5dd7070Spatrick 
98*e5dd7070Spatrick   // Skip the '\' char.
99*e5dd7070Spatrick   ++ThisTokBuf;
100*e5dd7070Spatrick 
101*e5dd7070Spatrick   // We know that this character can't be off the end of the buffer, because
102*e5dd7070Spatrick   // that would have been \", which would not have been the end of string.
103*e5dd7070Spatrick   unsigned ResultChar = *ThisTokBuf++;
104*e5dd7070Spatrick   switch (ResultChar) {
105*e5dd7070Spatrick   // These map to themselves.
106*e5dd7070Spatrick   case '\\': case '\'': case '"': case '?': break;
107*e5dd7070Spatrick 
108*e5dd7070Spatrick     // These have fixed mappings.
109*e5dd7070Spatrick   case 'a':
110*e5dd7070Spatrick     // TODO: K&R: the meaning of '\\a' is different in traditional C
111*e5dd7070Spatrick     ResultChar = 7;
112*e5dd7070Spatrick     break;
113*e5dd7070Spatrick   case 'b':
114*e5dd7070Spatrick     ResultChar = 8;
115*e5dd7070Spatrick     break;
116*e5dd7070Spatrick   case 'e':
117*e5dd7070Spatrick     if (Diags)
118*e5dd7070Spatrick       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
119*e5dd7070Spatrick            diag::ext_nonstandard_escape) << "e";
120*e5dd7070Spatrick     ResultChar = 27;
121*e5dd7070Spatrick     break;
122*e5dd7070Spatrick   case 'E':
123*e5dd7070Spatrick     if (Diags)
124*e5dd7070Spatrick       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
125*e5dd7070Spatrick            diag::ext_nonstandard_escape) << "E";
126*e5dd7070Spatrick     ResultChar = 27;
127*e5dd7070Spatrick     break;
128*e5dd7070Spatrick   case 'f':
129*e5dd7070Spatrick     ResultChar = 12;
130*e5dd7070Spatrick     break;
131*e5dd7070Spatrick   case 'n':
132*e5dd7070Spatrick     ResultChar = 10;
133*e5dd7070Spatrick     break;
134*e5dd7070Spatrick   case 'r':
135*e5dd7070Spatrick     ResultChar = 13;
136*e5dd7070Spatrick     break;
137*e5dd7070Spatrick   case 't':
138*e5dd7070Spatrick     ResultChar = 9;
139*e5dd7070Spatrick     break;
140*e5dd7070Spatrick   case 'v':
141*e5dd7070Spatrick     ResultChar = 11;
142*e5dd7070Spatrick     break;
143*e5dd7070Spatrick   case 'x': { // Hex escape.
144*e5dd7070Spatrick     ResultChar = 0;
145*e5dd7070Spatrick     if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
146*e5dd7070Spatrick       if (Diags)
147*e5dd7070Spatrick         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
148*e5dd7070Spatrick              diag::err_hex_escape_no_digits) << "x";
149*e5dd7070Spatrick       HadError = true;
150*e5dd7070Spatrick       break;
151*e5dd7070Spatrick     }
152*e5dd7070Spatrick 
153*e5dd7070Spatrick     // Hex escapes are a maximal series of hex digits.
154*e5dd7070Spatrick     bool Overflow = false;
155*e5dd7070Spatrick     for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
156*e5dd7070Spatrick       int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
157*e5dd7070Spatrick       if (CharVal == -1) break;
158*e5dd7070Spatrick       // About to shift out a digit?
159*e5dd7070Spatrick       if (ResultChar & 0xF0000000)
160*e5dd7070Spatrick         Overflow = true;
161*e5dd7070Spatrick       ResultChar <<= 4;
162*e5dd7070Spatrick       ResultChar |= CharVal;
163*e5dd7070Spatrick     }
164*e5dd7070Spatrick 
165*e5dd7070Spatrick     // See if any bits will be truncated when evaluated as a character.
166*e5dd7070Spatrick     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
167*e5dd7070Spatrick       Overflow = true;
168*e5dd7070Spatrick       ResultChar &= ~0U >> (32-CharWidth);
169*e5dd7070Spatrick     }
170*e5dd7070Spatrick 
171*e5dd7070Spatrick     // Check for overflow.
172*e5dd7070Spatrick     if (Overflow && Diags)   // Too many digits to fit in
173*e5dd7070Spatrick       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
174*e5dd7070Spatrick            diag::err_escape_too_large) << 0;
175*e5dd7070Spatrick     break;
176*e5dd7070Spatrick   }
177*e5dd7070Spatrick   case '0': case '1': case '2': case '3':
178*e5dd7070Spatrick   case '4': case '5': case '6': case '7': {
179*e5dd7070Spatrick     // Octal escapes.
180*e5dd7070Spatrick     --ThisTokBuf;
181*e5dd7070Spatrick     ResultChar = 0;
182*e5dd7070Spatrick 
183*e5dd7070Spatrick     // Octal escapes are a series of octal digits with maximum length 3.
184*e5dd7070Spatrick     // "\0123" is a two digit sequence equal to "\012" "3".
185*e5dd7070Spatrick     unsigned NumDigits = 0;
186*e5dd7070Spatrick     do {
187*e5dd7070Spatrick       ResultChar <<= 3;
188*e5dd7070Spatrick       ResultChar |= *ThisTokBuf++ - '0';
189*e5dd7070Spatrick       ++NumDigits;
190*e5dd7070Spatrick     } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
191*e5dd7070Spatrick              ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
192*e5dd7070Spatrick 
193*e5dd7070Spatrick     // Check for overflow.  Reject '\777', but not L'\777'.
194*e5dd7070Spatrick     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
195*e5dd7070Spatrick       if (Diags)
196*e5dd7070Spatrick         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
197*e5dd7070Spatrick              diag::err_escape_too_large) << 1;
198*e5dd7070Spatrick       ResultChar &= ~0U >> (32-CharWidth);
199*e5dd7070Spatrick     }
200*e5dd7070Spatrick     break;
201*e5dd7070Spatrick   }
202*e5dd7070Spatrick 
203*e5dd7070Spatrick     // Otherwise, these are not valid escapes.
204*e5dd7070Spatrick   case '(': case '{': case '[': case '%':
205*e5dd7070Spatrick     // GCC accepts these as extensions.  We warn about them as such though.
206*e5dd7070Spatrick     if (Diags)
207*e5dd7070Spatrick       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
208*e5dd7070Spatrick            diag::ext_nonstandard_escape)
209*e5dd7070Spatrick         << std::string(1, ResultChar);
210*e5dd7070Spatrick     break;
211*e5dd7070Spatrick   default:
212*e5dd7070Spatrick     if (!Diags)
213*e5dd7070Spatrick       break;
214*e5dd7070Spatrick 
215*e5dd7070Spatrick     if (isPrintable(ResultChar))
216*e5dd7070Spatrick       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
217*e5dd7070Spatrick            diag::ext_unknown_escape)
218*e5dd7070Spatrick         << std::string(1, ResultChar);
219*e5dd7070Spatrick     else
220*e5dd7070Spatrick       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
221*e5dd7070Spatrick            diag::ext_unknown_escape)
222*e5dd7070Spatrick         << "x" + llvm::utohexstr(ResultChar);
223*e5dd7070Spatrick     break;
224*e5dd7070Spatrick   }
225*e5dd7070Spatrick 
226*e5dd7070Spatrick   return ResultChar;
227*e5dd7070Spatrick }
228*e5dd7070Spatrick 
229*e5dd7070Spatrick static void appendCodePoint(unsigned Codepoint,
230*e5dd7070Spatrick                             llvm::SmallVectorImpl<char> &Str) {
231*e5dd7070Spatrick   char ResultBuf[4];
232*e5dd7070Spatrick   char *ResultPtr = ResultBuf;
233*e5dd7070Spatrick   bool Res = llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr);
234*e5dd7070Spatrick   (void)Res;
235*e5dd7070Spatrick   assert(Res && "Unexpected conversion failure");
236*e5dd7070Spatrick   Str.append(ResultBuf, ResultPtr);
237*e5dd7070Spatrick }
238*e5dd7070Spatrick 
239*e5dd7070Spatrick void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
240*e5dd7070Spatrick   for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
241*e5dd7070Spatrick     if (*I != '\\') {
242*e5dd7070Spatrick       Buf.push_back(*I);
243*e5dd7070Spatrick       continue;
244*e5dd7070Spatrick     }
245*e5dd7070Spatrick 
246*e5dd7070Spatrick     ++I;
247*e5dd7070Spatrick     assert(*I == 'u' || *I == 'U');
248*e5dd7070Spatrick 
249*e5dd7070Spatrick     unsigned NumHexDigits;
250*e5dd7070Spatrick     if (*I == 'u')
251*e5dd7070Spatrick       NumHexDigits = 4;
252*e5dd7070Spatrick     else
253*e5dd7070Spatrick       NumHexDigits = 8;
254*e5dd7070Spatrick 
255*e5dd7070Spatrick     assert(I + NumHexDigits <= E);
256*e5dd7070Spatrick 
257*e5dd7070Spatrick     uint32_t CodePoint = 0;
258*e5dd7070Spatrick     for (++I; NumHexDigits != 0; ++I, --NumHexDigits) {
259*e5dd7070Spatrick       unsigned Value = llvm::hexDigitValue(*I);
260*e5dd7070Spatrick       assert(Value != -1U);
261*e5dd7070Spatrick 
262*e5dd7070Spatrick       CodePoint <<= 4;
263*e5dd7070Spatrick       CodePoint += Value;
264*e5dd7070Spatrick     }
265*e5dd7070Spatrick 
266*e5dd7070Spatrick     appendCodePoint(CodePoint, Buf);
267*e5dd7070Spatrick     --I;
268*e5dd7070Spatrick   }
269*e5dd7070Spatrick }
270*e5dd7070Spatrick 
271*e5dd7070Spatrick /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
272*e5dd7070Spatrick /// return the UTF32.
273*e5dd7070Spatrick static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
274*e5dd7070Spatrick                              const char *ThisTokEnd,
275*e5dd7070Spatrick                              uint32_t &UcnVal, unsigned short &UcnLen,
276*e5dd7070Spatrick                              FullSourceLoc Loc, DiagnosticsEngine *Diags,
277*e5dd7070Spatrick                              const LangOptions &Features,
278*e5dd7070Spatrick                              bool in_char_string_literal = false) {
279*e5dd7070Spatrick   const char *UcnBegin = ThisTokBuf;
280*e5dd7070Spatrick 
281*e5dd7070Spatrick   // Skip the '\u' char's.
282*e5dd7070Spatrick   ThisTokBuf += 2;
283*e5dd7070Spatrick 
284*e5dd7070Spatrick   if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
285*e5dd7070Spatrick     if (Diags)
286*e5dd7070Spatrick       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
287*e5dd7070Spatrick            diag::err_hex_escape_no_digits) << StringRef(&ThisTokBuf[-1], 1);
288*e5dd7070Spatrick     return false;
289*e5dd7070Spatrick   }
290*e5dd7070Spatrick   UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
291*e5dd7070Spatrick   unsigned short UcnLenSave = UcnLen;
292*e5dd7070Spatrick   for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
293*e5dd7070Spatrick     int CharVal = llvm::hexDigitValue(ThisTokBuf[0]);
294*e5dd7070Spatrick     if (CharVal == -1) break;
295*e5dd7070Spatrick     UcnVal <<= 4;
296*e5dd7070Spatrick     UcnVal |= CharVal;
297*e5dd7070Spatrick   }
298*e5dd7070Spatrick   // If we didn't consume the proper number of digits, there is a problem.
299*e5dd7070Spatrick   if (UcnLenSave) {
300*e5dd7070Spatrick     if (Diags)
301*e5dd7070Spatrick       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
302*e5dd7070Spatrick            diag::err_ucn_escape_incomplete);
303*e5dd7070Spatrick     return false;
304*e5dd7070Spatrick   }
305*e5dd7070Spatrick 
306*e5dd7070Spatrick   // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
307*e5dd7070Spatrick   if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
308*e5dd7070Spatrick       UcnVal > 0x10FFFF) {                      // maximum legal UTF32 value
309*e5dd7070Spatrick     if (Diags)
310*e5dd7070Spatrick       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
311*e5dd7070Spatrick            diag::err_ucn_escape_invalid);
312*e5dd7070Spatrick     return false;
313*e5dd7070Spatrick   }
314*e5dd7070Spatrick 
315*e5dd7070Spatrick   // C++11 allows UCNs that refer to control characters and basic source
316*e5dd7070Spatrick   // characters inside character and string literals
317*e5dd7070Spatrick   if (UcnVal < 0xa0 &&
318*e5dd7070Spatrick       (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {  // $, @, `
319*e5dd7070Spatrick     bool IsError = (!Features.CPlusPlus11 || !in_char_string_literal);
320*e5dd7070Spatrick     if (Diags) {
321*e5dd7070Spatrick       char BasicSCSChar = UcnVal;
322*e5dd7070Spatrick       if (UcnVal >= 0x20 && UcnVal < 0x7f)
323*e5dd7070Spatrick         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
324*e5dd7070Spatrick              IsError ? diag::err_ucn_escape_basic_scs :
325*e5dd7070Spatrick                        diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
326*e5dd7070Spatrick             << StringRef(&BasicSCSChar, 1);
327*e5dd7070Spatrick       else
328*e5dd7070Spatrick         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
329*e5dd7070Spatrick              IsError ? diag::err_ucn_control_character :
330*e5dd7070Spatrick                        diag::warn_cxx98_compat_literal_ucn_control_character);
331*e5dd7070Spatrick     }
332*e5dd7070Spatrick     if (IsError)
333*e5dd7070Spatrick       return false;
334*e5dd7070Spatrick   }
335*e5dd7070Spatrick 
336*e5dd7070Spatrick   if (!Features.CPlusPlus && !Features.C99 && Diags)
337*e5dd7070Spatrick     Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
338*e5dd7070Spatrick          diag::warn_ucn_not_valid_in_c89_literal);
339*e5dd7070Spatrick 
340*e5dd7070Spatrick   return true;
341*e5dd7070Spatrick }
342*e5dd7070Spatrick 
343*e5dd7070Spatrick /// MeasureUCNEscape - Determine the number of bytes within the resulting string
344*e5dd7070Spatrick /// which this UCN will occupy.
345*e5dd7070Spatrick static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
346*e5dd7070Spatrick                             const char *ThisTokEnd, unsigned CharByteWidth,
347*e5dd7070Spatrick                             const LangOptions &Features, bool &HadError) {
348*e5dd7070Spatrick   // UTF-32: 4 bytes per escape.
349*e5dd7070Spatrick   if (CharByteWidth == 4)
350*e5dd7070Spatrick     return 4;
351*e5dd7070Spatrick 
352*e5dd7070Spatrick   uint32_t UcnVal = 0;
353*e5dd7070Spatrick   unsigned short UcnLen = 0;
354*e5dd7070Spatrick   FullSourceLoc Loc;
355*e5dd7070Spatrick 
356*e5dd7070Spatrick   if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
357*e5dd7070Spatrick                         UcnLen, Loc, nullptr, Features, true)) {
358*e5dd7070Spatrick     HadError = true;
359*e5dd7070Spatrick     return 0;
360*e5dd7070Spatrick   }
361*e5dd7070Spatrick 
362*e5dd7070Spatrick   // UTF-16: 2 bytes for BMP, 4 bytes otherwise.
363*e5dd7070Spatrick   if (CharByteWidth == 2)
364*e5dd7070Spatrick     return UcnVal <= 0xFFFF ? 2 : 4;
365*e5dd7070Spatrick 
366*e5dd7070Spatrick   // UTF-8.
367*e5dd7070Spatrick   if (UcnVal < 0x80)
368*e5dd7070Spatrick     return 1;
369*e5dd7070Spatrick   if (UcnVal < 0x800)
370*e5dd7070Spatrick     return 2;
371*e5dd7070Spatrick   if (UcnVal < 0x10000)
372*e5dd7070Spatrick     return 3;
373*e5dd7070Spatrick   return 4;
374*e5dd7070Spatrick }
375*e5dd7070Spatrick 
376*e5dd7070Spatrick /// EncodeUCNEscape - Read the Universal Character Name, check constraints and
377*e5dd7070Spatrick /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
378*e5dd7070Spatrick /// StringLiteralParser. When we decide to implement UCN's for identifiers,
379*e5dd7070Spatrick /// we will likely rework our support for UCN's.
380*e5dd7070Spatrick static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
381*e5dd7070Spatrick                             const char *ThisTokEnd,
382*e5dd7070Spatrick                             char *&ResultBuf, bool &HadError,
383*e5dd7070Spatrick                             FullSourceLoc Loc, unsigned CharByteWidth,
384*e5dd7070Spatrick                             DiagnosticsEngine *Diags,
385*e5dd7070Spatrick                             const LangOptions &Features) {
386*e5dd7070Spatrick   typedef uint32_t UTF32;
387*e5dd7070Spatrick   UTF32 UcnVal = 0;
388*e5dd7070Spatrick   unsigned short UcnLen = 0;
389*e5dd7070Spatrick   if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
390*e5dd7070Spatrick                         Loc, Diags, Features, true)) {
391*e5dd7070Spatrick     HadError = true;
392*e5dd7070Spatrick     return;
393*e5dd7070Spatrick   }
394*e5dd7070Spatrick 
395*e5dd7070Spatrick   assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
396*e5dd7070Spatrick          "only character widths of 1, 2, or 4 bytes supported");
397*e5dd7070Spatrick 
398*e5dd7070Spatrick   (void)UcnLen;
399*e5dd7070Spatrick   assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
400*e5dd7070Spatrick 
401*e5dd7070Spatrick   if (CharByteWidth == 4) {
402*e5dd7070Spatrick     // FIXME: Make the type of the result buffer correct instead of
403*e5dd7070Spatrick     // using reinterpret_cast.
404*e5dd7070Spatrick     llvm::UTF32 *ResultPtr = reinterpret_cast<llvm::UTF32*>(ResultBuf);
405*e5dd7070Spatrick     *ResultPtr = UcnVal;
406*e5dd7070Spatrick     ResultBuf += 4;
407*e5dd7070Spatrick     return;
408*e5dd7070Spatrick   }
409*e5dd7070Spatrick 
410*e5dd7070Spatrick   if (CharByteWidth == 2) {
411*e5dd7070Spatrick     // FIXME: Make the type of the result buffer correct instead of
412*e5dd7070Spatrick     // using reinterpret_cast.
413*e5dd7070Spatrick     llvm::UTF16 *ResultPtr = reinterpret_cast<llvm::UTF16*>(ResultBuf);
414*e5dd7070Spatrick 
415*e5dd7070Spatrick     if (UcnVal <= (UTF32)0xFFFF) {
416*e5dd7070Spatrick       *ResultPtr = UcnVal;
417*e5dd7070Spatrick       ResultBuf += 2;
418*e5dd7070Spatrick       return;
419*e5dd7070Spatrick     }
420*e5dd7070Spatrick 
421*e5dd7070Spatrick     // Convert to UTF16.
422*e5dd7070Spatrick     UcnVal -= 0x10000;
423*e5dd7070Spatrick     *ResultPtr     = 0xD800 + (UcnVal >> 10);
424*e5dd7070Spatrick     *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
425*e5dd7070Spatrick     ResultBuf += 4;
426*e5dd7070Spatrick     return;
427*e5dd7070Spatrick   }
428*e5dd7070Spatrick 
429*e5dd7070Spatrick   assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
430*e5dd7070Spatrick 
431*e5dd7070Spatrick   // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
432*e5dd7070Spatrick   // The conversion below was inspired by:
433*e5dd7070Spatrick   //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
434*e5dd7070Spatrick   // First, we determine how many bytes the result will require.
435*e5dd7070Spatrick   typedef uint8_t UTF8;
436*e5dd7070Spatrick 
437*e5dd7070Spatrick   unsigned short bytesToWrite = 0;
438*e5dd7070Spatrick   if (UcnVal < (UTF32)0x80)
439*e5dd7070Spatrick     bytesToWrite = 1;
440*e5dd7070Spatrick   else if (UcnVal < (UTF32)0x800)
441*e5dd7070Spatrick     bytesToWrite = 2;
442*e5dd7070Spatrick   else if (UcnVal < (UTF32)0x10000)
443*e5dd7070Spatrick     bytesToWrite = 3;
444*e5dd7070Spatrick   else
445*e5dd7070Spatrick     bytesToWrite = 4;
446*e5dd7070Spatrick 
447*e5dd7070Spatrick   const unsigned byteMask = 0xBF;
448*e5dd7070Spatrick   const unsigned byteMark = 0x80;
449*e5dd7070Spatrick 
450*e5dd7070Spatrick   // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
451*e5dd7070Spatrick   // into the first byte, depending on how many bytes follow.
452*e5dd7070Spatrick   static const UTF8 firstByteMark[5] = {
453*e5dd7070Spatrick     0x00, 0x00, 0xC0, 0xE0, 0xF0
454*e5dd7070Spatrick   };
455*e5dd7070Spatrick   // Finally, we write the bytes into ResultBuf.
456*e5dd7070Spatrick   ResultBuf += bytesToWrite;
457*e5dd7070Spatrick   switch (bytesToWrite) { // note: everything falls through.
458*e5dd7070Spatrick   case 4:
459*e5dd7070Spatrick     *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
460*e5dd7070Spatrick     LLVM_FALLTHROUGH;
461*e5dd7070Spatrick   case 3:
462*e5dd7070Spatrick     *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
463*e5dd7070Spatrick     LLVM_FALLTHROUGH;
464*e5dd7070Spatrick   case 2:
465*e5dd7070Spatrick     *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
466*e5dd7070Spatrick     LLVM_FALLTHROUGH;
467*e5dd7070Spatrick   case 1:
468*e5dd7070Spatrick     *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
469*e5dd7070Spatrick   }
470*e5dd7070Spatrick   // Update the buffer.
471*e5dd7070Spatrick   ResultBuf += bytesToWrite;
472*e5dd7070Spatrick }
473*e5dd7070Spatrick 
474*e5dd7070Spatrick ///       integer-constant: [C99 6.4.4.1]
475*e5dd7070Spatrick ///         decimal-constant integer-suffix
476*e5dd7070Spatrick ///         octal-constant integer-suffix
477*e5dd7070Spatrick ///         hexadecimal-constant integer-suffix
478*e5dd7070Spatrick ///         binary-literal integer-suffix [GNU, C++1y]
479*e5dd7070Spatrick ///       user-defined-integer-literal: [C++11 lex.ext]
480*e5dd7070Spatrick ///         decimal-literal ud-suffix
481*e5dd7070Spatrick ///         octal-literal ud-suffix
482*e5dd7070Spatrick ///         hexadecimal-literal ud-suffix
483*e5dd7070Spatrick ///         binary-literal ud-suffix [GNU, C++1y]
484*e5dd7070Spatrick ///       decimal-constant:
485*e5dd7070Spatrick ///         nonzero-digit
486*e5dd7070Spatrick ///         decimal-constant digit
487*e5dd7070Spatrick ///       octal-constant:
488*e5dd7070Spatrick ///         0
489*e5dd7070Spatrick ///         octal-constant octal-digit
490*e5dd7070Spatrick ///       hexadecimal-constant:
491*e5dd7070Spatrick ///         hexadecimal-prefix hexadecimal-digit
492*e5dd7070Spatrick ///         hexadecimal-constant hexadecimal-digit
493*e5dd7070Spatrick ///       hexadecimal-prefix: one of
494*e5dd7070Spatrick ///         0x 0X
495*e5dd7070Spatrick ///       binary-literal:
496*e5dd7070Spatrick ///         0b binary-digit
497*e5dd7070Spatrick ///         0B binary-digit
498*e5dd7070Spatrick ///         binary-literal binary-digit
499*e5dd7070Spatrick ///       integer-suffix:
500*e5dd7070Spatrick ///         unsigned-suffix [long-suffix]
501*e5dd7070Spatrick ///         unsigned-suffix [long-long-suffix]
502*e5dd7070Spatrick ///         long-suffix [unsigned-suffix]
503*e5dd7070Spatrick ///         long-long-suffix [unsigned-suffix]
504*e5dd7070Spatrick ///       nonzero-digit:
505*e5dd7070Spatrick ///         1 2 3 4 5 6 7 8 9
506*e5dd7070Spatrick ///       octal-digit:
507*e5dd7070Spatrick ///         0 1 2 3 4 5 6 7
508*e5dd7070Spatrick ///       hexadecimal-digit:
509*e5dd7070Spatrick ///         0 1 2 3 4 5 6 7 8 9
510*e5dd7070Spatrick ///         a b c d e f
511*e5dd7070Spatrick ///         A B C D E F
512*e5dd7070Spatrick ///       binary-digit:
513*e5dd7070Spatrick ///         0
514*e5dd7070Spatrick ///         1
515*e5dd7070Spatrick ///       unsigned-suffix: one of
516*e5dd7070Spatrick ///         u U
517*e5dd7070Spatrick ///       long-suffix: one of
518*e5dd7070Spatrick ///         l L
519*e5dd7070Spatrick ///       long-long-suffix: one of
520*e5dd7070Spatrick ///         ll LL
521*e5dd7070Spatrick ///
522*e5dd7070Spatrick ///       floating-constant: [C99 6.4.4.2]
523*e5dd7070Spatrick ///         TODO: add rules...
524*e5dd7070Spatrick ///
525*e5dd7070Spatrick NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
526*e5dd7070Spatrick                                            SourceLocation TokLoc,
527*e5dd7070Spatrick                                            Preprocessor &PP)
528*e5dd7070Spatrick   : PP(PP), ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {
529*e5dd7070Spatrick 
530*e5dd7070Spatrick   // This routine assumes that the range begin/end matches the regex for integer
531*e5dd7070Spatrick   // and FP constants (specifically, the 'pp-number' regex), and assumes that
532*e5dd7070Spatrick   // the byte at "*end" is both valid and not part of the regex.  Because of
533*e5dd7070Spatrick   // this, it doesn't have to check for 'overscan' in various places.
534*e5dd7070Spatrick   assert(!isPreprocessingNumberBody(*ThisTokEnd) && "didn't maximally munch?");
535*e5dd7070Spatrick 
536*e5dd7070Spatrick   s = DigitsBegin = ThisTokBegin;
537*e5dd7070Spatrick   saw_exponent = false;
538*e5dd7070Spatrick   saw_period = false;
539*e5dd7070Spatrick   saw_ud_suffix = false;
540*e5dd7070Spatrick   saw_fixed_point_suffix = false;
541*e5dd7070Spatrick   isLong = false;
542*e5dd7070Spatrick   isUnsigned = false;
543*e5dd7070Spatrick   isLongLong = false;
544*e5dd7070Spatrick   isHalf = false;
545*e5dd7070Spatrick   isFloat = false;
546*e5dd7070Spatrick   isImaginary = false;
547*e5dd7070Spatrick   isFloat16 = false;
548*e5dd7070Spatrick   isFloat128 = false;
549*e5dd7070Spatrick   MicrosoftInteger = 0;
550*e5dd7070Spatrick   isFract = false;
551*e5dd7070Spatrick   isAccum = false;
552*e5dd7070Spatrick   hadError = false;
553*e5dd7070Spatrick 
554*e5dd7070Spatrick   if (*s == '0') { // parse radix
555*e5dd7070Spatrick     ParseNumberStartingWithZero(TokLoc);
556*e5dd7070Spatrick     if (hadError)
557*e5dd7070Spatrick       return;
558*e5dd7070Spatrick   } else { // the first digit is non-zero
559*e5dd7070Spatrick     radix = 10;
560*e5dd7070Spatrick     s = SkipDigits(s);
561*e5dd7070Spatrick     if (s == ThisTokEnd) {
562*e5dd7070Spatrick       // Done.
563*e5dd7070Spatrick     } else {
564*e5dd7070Spatrick       ParseDecimalOrOctalCommon(TokLoc);
565*e5dd7070Spatrick       if (hadError)
566*e5dd7070Spatrick         return;
567*e5dd7070Spatrick     }
568*e5dd7070Spatrick   }
569*e5dd7070Spatrick 
570*e5dd7070Spatrick   SuffixBegin = s;
571*e5dd7070Spatrick   checkSeparator(TokLoc, s, CSK_AfterDigits);
572*e5dd7070Spatrick 
573*e5dd7070Spatrick   // Initial scan to lookahead for fixed point suffix.
574*e5dd7070Spatrick   if (PP.getLangOpts().FixedPoint) {
575*e5dd7070Spatrick     for (const char *c = s; c != ThisTokEnd; ++c) {
576*e5dd7070Spatrick       if (*c == 'r' || *c == 'k' || *c == 'R' || *c == 'K') {
577*e5dd7070Spatrick         saw_fixed_point_suffix = true;
578*e5dd7070Spatrick         break;
579*e5dd7070Spatrick       }
580*e5dd7070Spatrick     }
581*e5dd7070Spatrick   }
582*e5dd7070Spatrick 
583*e5dd7070Spatrick   // Parse the suffix.  At this point we can classify whether we have an FP or
584*e5dd7070Spatrick   // integer constant.
585*e5dd7070Spatrick   bool isFPConstant = isFloatingLiteral();
586*e5dd7070Spatrick 
587*e5dd7070Spatrick   // Loop over all of the characters of the suffix.  If we see something bad,
588*e5dd7070Spatrick   // we break out of the loop.
589*e5dd7070Spatrick   for (; s != ThisTokEnd; ++s) {
590*e5dd7070Spatrick     switch (*s) {
591*e5dd7070Spatrick     case 'R':
592*e5dd7070Spatrick     case 'r':
593*e5dd7070Spatrick       if (!PP.getLangOpts().FixedPoint) break;
594*e5dd7070Spatrick       if (isFract || isAccum) break;
595*e5dd7070Spatrick       if (!(saw_period || saw_exponent)) break;
596*e5dd7070Spatrick       isFract = true;
597*e5dd7070Spatrick       continue;
598*e5dd7070Spatrick     case 'K':
599*e5dd7070Spatrick     case 'k':
600*e5dd7070Spatrick       if (!PP.getLangOpts().FixedPoint) break;
601*e5dd7070Spatrick       if (isFract || isAccum) break;
602*e5dd7070Spatrick       if (!(saw_period || saw_exponent)) break;
603*e5dd7070Spatrick       isAccum = true;
604*e5dd7070Spatrick       continue;
605*e5dd7070Spatrick     case 'h':      // FP Suffix for "half".
606*e5dd7070Spatrick     case 'H':
607*e5dd7070Spatrick       // OpenCL Extension v1.2 s9.5 - h or H suffix for half type.
608*e5dd7070Spatrick       if (!(PP.getLangOpts().Half || PP.getLangOpts().FixedPoint)) break;
609*e5dd7070Spatrick       if (isIntegerLiteral()) break;  // Error for integer constant.
610*e5dd7070Spatrick       if (isHalf || isFloat || isLong) break; // HH, FH, LH invalid.
611*e5dd7070Spatrick       isHalf = true;
612*e5dd7070Spatrick       continue;  // Success.
613*e5dd7070Spatrick     case 'f':      // FP Suffix for "float"
614*e5dd7070Spatrick     case 'F':
615*e5dd7070Spatrick       if (!isFPConstant) break;  // Error for integer constant.
616*e5dd7070Spatrick       if (isHalf || isFloat || isLong || isFloat128)
617*e5dd7070Spatrick         break; // HF, FF, LF, QF invalid.
618*e5dd7070Spatrick 
619*e5dd7070Spatrick       // CUDA host and device may have different _Float16 support, therefore
620*e5dd7070Spatrick       // allows f16 literals to avoid false alarm.
621*e5dd7070Spatrick       // ToDo: more precise check for CUDA.
622*e5dd7070Spatrick       if ((PP.getTargetInfo().hasFloat16Type() || PP.getLangOpts().CUDA) &&
623*e5dd7070Spatrick           s + 2 < ThisTokEnd && s[1] == '1' && s[2] == '6') {
624*e5dd7070Spatrick         s += 2; // success, eat up 2 characters.
625*e5dd7070Spatrick         isFloat16 = true;
626*e5dd7070Spatrick         continue;
627*e5dd7070Spatrick       }
628*e5dd7070Spatrick 
629*e5dd7070Spatrick       isFloat = true;
630*e5dd7070Spatrick       continue;  // Success.
631*e5dd7070Spatrick     case 'q':    // FP Suffix for "__float128"
632*e5dd7070Spatrick     case 'Q':
633*e5dd7070Spatrick       if (!isFPConstant) break;  // Error for integer constant.
634*e5dd7070Spatrick       if (isHalf || isFloat || isLong || isFloat128)
635*e5dd7070Spatrick         break; // HQ, FQ, LQ, QQ invalid.
636*e5dd7070Spatrick       isFloat128 = true;
637*e5dd7070Spatrick       continue;  // Success.
638*e5dd7070Spatrick     case 'u':
639*e5dd7070Spatrick     case 'U':
640*e5dd7070Spatrick       if (isFPConstant) break;  // Error for floating constant.
641*e5dd7070Spatrick       if (isUnsigned) break;    // Cannot be repeated.
642*e5dd7070Spatrick       isUnsigned = true;
643*e5dd7070Spatrick       continue;  // Success.
644*e5dd7070Spatrick     case 'l':
645*e5dd7070Spatrick     case 'L':
646*e5dd7070Spatrick       if (isLong || isLongLong) break;  // Cannot be repeated.
647*e5dd7070Spatrick       if (isHalf || isFloat || isFloat128) break;     // LH, LF, LQ invalid.
648*e5dd7070Spatrick 
649*e5dd7070Spatrick       // Check for long long.  The L's need to be adjacent and the same case.
650*e5dd7070Spatrick       if (s[1] == s[0]) {
651*e5dd7070Spatrick         assert(s + 1 < ThisTokEnd && "didn't maximally munch?");
652*e5dd7070Spatrick         if (isFPConstant) break;        // long long invalid for floats.
653*e5dd7070Spatrick         isLongLong = true;
654*e5dd7070Spatrick         ++s;  // Eat both of them.
655*e5dd7070Spatrick       } else {
656*e5dd7070Spatrick         isLong = true;
657*e5dd7070Spatrick       }
658*e5dd7070Spatrick       continue;  // Success.
659*e5dd7070Spatrick     case 'i':
660*e5dd7070Spatrick     case 'I':
661*e5dd7070Spatrick       if (PP.getLangOpts().MicrosoftExt) {
662*e5dd7070Spatrick         if (isLong || isLongLong || MicrosoftInteger)
663*e5dd7070Spatrick           break;
664*e5dd7070Spatrick 
665*e5dd7070Spatrick         if (!isFPConstant) {
666*e5dd7070Spatrick           // Allow i8, i16, i32, and i64.
667*e5dd7070Spatrick           switch (s[1]) {
668*e5dd7070Spatrick           case '8':
669*e5dd7070Spatrick             s += 2; // i8 suffix
670*e5dd7070Spatrick             MicrosoftInteger = 8;
671*e5dd7070Spatrick             break;
672*e5dd7070Spatrick           case '1':
673*e5dd7070Spatrick             if (s[2] == '6') {
674*e5dd7070Spatrick               s += 3; // i16 suffix
675*e5dd7070Spatrick               MicrosoftInteger = 16;
676*e5dd7070Spatrick             }
677*e5dd7070Spatrick             break;
678*e5dd7070Spatrick           case '3':
679*e5dd7070Spatrick             if (s[2] == '2') {
680*e5dd7070Spatrick               s += 3; // i32 suffix
681*e5dd7070Spatrick               MicrosoftInteger = 32;
682*e5dd7070Spatrick             }
683*e5dd7070Spatrick             break;
684*e5dd7070Spatrick           case '6':
685*e5dd7070Spatrick             if (s[2] == '4') {
686*e5dd7070Spatrick               s += 3; // i64 suffix
687*e5dd7070Spatrick               MicrosoftInteger = 64;
688*e5dd7070Spatrick             }
689*e5dd7070Spatrick             break;
690*e5dd7070Spatrick           default:
691*e5dd7070Spatrick             break;
692*e5dd7070Spatrick           }
693*e5dd7070Spatrick         }
694*e5dd7070Spatrick         if (MicrosoftInteger) {
695*e5dd7070Spatrick           assert(s <= ThisTokEnd && "didn't maximally munch?");
696*e5dd7070Spatrick           break;
697*e5dd7070Spatrick         }
698*e5dd7070Spatrick       }
699*e5dd7070Spatrick       LLVM_FALLTHROUGH;
700*e5dd7070Spatrick     case 'j':
701*e5dd7070Spatrick     case 'J':
702*e5dd7070Spatrick       if (isImaginary) break;   // Cannot be repeated.
703*e5dd7070Spatrick       isImaginary = true;
704*e5dd7070Spatrick       continue;  // Success.
705*e5dd7070Spatrick     }
706*e5dd7070Spatrick     // If we reached here, there was an error or a ud-suffix.
707*e5dd7070Spatrick     break;
708*e5dd7070Spatrick   }
709*e5dd7070Spatrick 
710*e5dd7070Spatrick   // "i", "if", and "il" are user-defined suffixes in C++1y.
711*e5dd7070Spatrick   if (s != ThisTokEnd || isImaginary) {
712*e5dd7070Spatrick     // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
713*e5dd7070Spatrick     expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
714*e5dd7070Spatrick     if (isValidUDSuffix(PP.getLangOpts(), UDSuffixBuf)) {
715*e5dd7070Spatrick       if (!isImaginary) {
716*e5dd7070Spatrick         // Any suffix pieces we might have parsed are actually part of the
717*e5dd7070Spatrick         // ud-suffix.
718*e5dd7070Spatrick         isLong = false;
719*e5dd7070Spatrick         isUnsigned = false;
720*e5dd7070Spatrick         isLongLong = false;
721*e5dd7070Spatrick         isFloat = false;
722*e5dd7070Spatrick         isFloat16 = false;
723*e5dd7070Spatrick         isHalf = false;
724*e5dd7070Spatrick         isImaginary = false;
725*e5dd7070Spatrick         MicrosoftInteger = 0;
726*e5dd7070Spatrick         saw_fixed_point_suffix = false;
727*e5dd7070Spatrick         isFract = false;
728*e5dd7070Spatrick         isAccum = false;
729*e5dd7070Spatrick       }
730*e5dd7070Spatrick 
731*e5dd7070Spatrick       saw_ud_suffix = true;
732*e5dd7070Spatrick       return;
733*e5dd7070Spatrick     }
734*e5dd7070Spatrick 
735*e5dd7070Spatrick     if (s != ThisTokEnd) {
736*e5dd7070Spatrick       // Report an error if there are any.
737*e5dd7070Spatrick       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, SuffixBegin - ThisTokBegin),
738*e5dd7070Spatrick               diag::err_invalid_suffix_constant)
739*e5dd7070Spatrick           << StringRef(SuffixBegin, ThisTokEnd - SuffixBegin) << isFPConstant;
740*e5dd7070Spatrick       hadError = true;
741*e5dd7070Spatrick     }
742*e5dd7070Spatrick   }
743*e5dd7070Spatrick 
744*e5dd7070Spatrick   if (!hadError && saw_fixed_point_suffix) {
745*e5dd7070Spatrick     assert(isFract || isAccum);
746*e5dd7070Spatrick   }
747*e5dd7070Spatrick }
748*e5dd7070Spatrick 
749*e5dd7070Spatrick /// ParseDecimalOrOctalCommon - This method is called for decimal or octal
750*e5dd7070Spatrick /// numbers. It issues an error for illegal digits, and handles floating point
751*e5dd7070Spatrick /// parsing. If it detects a floating point number, the radix is set to 10.
752*e5dd7070Spatrick void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
753*e5dd7070Spatrick   assert((radix == 8 || radix == 10) && "Unexpected radix");
754*e5dd7070Spatrick 
755*e5dd7070Spatrick   // If we have a hex digit other than 'e' (which denotes a FP exponent) then
756*e5dd7070Spatrick   // the code is using an incorrect base.
757*e5dd7070Spatrick   if (isHexDigit(*s) && *s != 'e' && *s != 'E' &&
758*e5dd7070Spatrick       !isValidUDSuffix(PP.getLangOpts(), StringRef(s, ThisTokEnd - s))) {
759*e5dd7070Spatrick     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
760*e5dd7070Spatrick             diag::err_invalid_digit) << StringRef(s, 1) << (radix == 8 ? 1 : 0);
761*e5dd7070Spatrick     hadError = true;
762*e5dd7070Spatrick     return;
763*e5dd7070Spatrick   }
764*e5dd7070Spatrick 
765*e5dd7070Spatrick   if (*s == '.') {
766*e5dd7070Spatrick     checkSeparator(TokLoc, s, CSK_AfterDigits);
767*e5dd7070Spatrick     s++;
768*e5dd7070Spatrick     radix = 10;
769*e5dd7070Spatrick     saw_period = true;
770*e5dd7070Spatrick     checkSeparator(TokLoc, s, CSK_BeforeDigits);
771*e5dd7070Spatrick     s = SkipDigits(s); // Skip suffix.
772*e5dd7070Spatrick   }
773*e5dd7070Spatrick   if (*s == 'e' || *s == 'E') { // exponent
774*e5dd7070Spatrick     checkSeparator(TokLoc, s, CSK_AfterDigits);
775*e5dd7070Spatrick     const char *Exponent = s;
776*e5dd7070Spatrick     s++;
777*e5dd7070Spatrick     radix = 10;
778*e5dd7070Spatrick     saw_exponent = true;
779*e5dd7070Spatrick     if (s != ThisTokEnd && (*s == '+' || *s == '-'))  s++; // sign
780*e5dd7070Spatrick     const char *first_non_digit = SkipDigits(s);
781*e5dd7070Spatrick     if (containsDigits(s, first_non_digit)) {
782*e5dd7070Spatrick       checkSeparator(TokLoc, s, CSK_BeforeDigits);
783*e5dd7070Spatrick       s = first_non_digit;
784*e5dd7070Spatrick     } else {
785*e5dd7070Spatrick       if (!hadError) {
786*e5dd7070Spatrick         PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
787*e5dd7070Spatrick                 diag::err_exponent_has_no_digits);
788*e5dd7070Spatrick         hadError = true;
789*e5dd7070Spatrick       }
790*e5dd7070Spatrick       return;
791*e5dd7070Spatrick     }
792*e5dd7070Spatrick   }
793*e5dd7070Spatrick }
794*e5dd7070Spatrick 
795*e5dd7070Spatrick /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
796*e5dd7070Spatrick /// suffixes as ud-suffixes, because the diagnostic experience is better if we
797*e5dd7070Spatrick /// treat it as an invalid suffix.
798*e5dd7070Spatrick bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
799*e5dd7070Spatrick                                            StringRef Suffix) {
800*e5dd7070Spatrick   if (!LangOpts.CPlusPlus11 || Suffix.empty())
801*e5dd7070Spatrick     return false;
802*e5dd7070Spatrick 
803*e5dd7070Spatrick   // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
804*e5dd7070Spatrick   if (Suffix[0] == '_')
805*e5dd7070Spatrick     return true;
806*e5dd7070Spatrick 
807*e5dd7070Spatrick   // In C++11, there are no library suffixes.
808*e5dd7070Spatrick   if (!LangOpts.CPlusPlus14)
809*e5dd7070Spatrick     return false;
810*e5dd7070Spatrick 
811*e5dd7070Spatrick   // In C++14, "s", "h", "min", "ms", "us", and "ns" are used in the library.
812*e5dd7070Spatrick   // Per tweaked N3660, "il", "i", and "if" are also used in the library.
813*e5dd7070Spatrick   // In C++2a "d" and "y" are used in the library.
814*e5dd7070Spatrick   return llvm::StringSwitch<bool>(Suffix)
815*e5dd7070Spatrick       .Cases("h", "min", "s", true)
816*e5dd7070Spatrick       .Cases("ms", "us", "ns", true)
817*e5dd7070Spatrick       .Cases("il", "i", "if", true)
818*e5dd7070Spatrick       .Cases("d", "y", LangOpts.CPlusPlus2a)
819*e5dd7070Spatrick       .Default(false);
820*e5dd7070Spatrick }
821*e5dd7070Spatrick 
822*e5dd7070Spatrick void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
823*e5dd7070Spatrick                                           const char *Pos,
824*e5dd7070Spatrick                                           CheckSeparatorKind IsAfterDigits) {
825*e5dd7070Spatrick   if (IsAfterDigits == CSK_AfterDigits) {
826*e5dd7070Spatrick     if (Pos == ThisTokBegin)
827*e5dd7070Spatrick       return;
828*e5dd7070Spatrick     --Pos;
829*e5dd7070Spatrick   } else if (Pos == ThisTokEnd)
830*e5dd7070Spatrick     return;
831*e5dd7070Spatrick 
832*e5dd7070Spatrick   if (isDigitSeparator(*Pos)) {
833*e5dd7070Spatrick     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin),
834*e5dd7070Spatrick             diag::err_digit_separator_not_between_digits)
835*e5dd7070Spatrick       << IsAfterDigits;
836*e5dd7070Spatrick     hadError = true;
837*e5dd7070Spatrick   }
838*e5dd7070Spatrick }
839*e5dd7070Spatrick 
840*e5dd7070Spatrick /// ParseNumberStartingWithZero - This method is called when the first character
841*e5dd7070Spatrick /// of the number is found to be a zero.  This means it is either an octal
842*e5dd7070Spatrick /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
843*e5dd7070Spatrick /// a floating point number (01239.123e4).  Eat the prefix, determining the
844*e5dd7070Spatrick /// radix etc.
845*e5dd7070Spatrick void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
846*e5dd7070Spatrick   assert(s[0] == '0' && "Invalid method call");
847*e5dd7070Spatrick   s++;
848*e5dd7070Spatrick 
849*e5dd7070Spatrick   int c1 = s[0];
850*e5dd7070Spatrick 
851*e5dd7070Spatrick   // Handle a hex number like 0x1234.
852*e5dd7070Spatrick   if ((c1 == 'x' || c1 == 'X') && (isHexDigit(s[1]) || s[1] == '.')) {
853*e5dd7070Spatrick     s++;
854*e5dd7070Spatrick     assert(s < ThisTokEnd && "didn't maximally munch?");
855*e5dd7070Spatrick     radix = 16;
856*e5dd7070Spatrick     DigitsBegin = s;
857*e5dd7070Spatrick     s = SkipHexDigits(s);
858*e5dd7070Spatrick     bool HasSignificandDigits = containsDigits(DigitsBegin, s);
859*e5dd7070Spatrick     if (s == ThisTokEnd) {
860*e5dd7070Spatrick       // Done.
861*e5dd7070Spatrick     } else if (*s == '.') {
862*e5dd7070Spatrick       s++;
863*e5dd7070Spatrick       saw_period = true;
864*e5dd7070Spatrick       const char *floatDigitsBegin = s;
865*e5dd7070Spatrick       s = SkipHexDigits(s);
866*e5dd7070Spatrick       if (containsDigits(floatDigitsBegin, s))
867*e5dd7070Spatrick         HasSignificandDigits = true;
868*e5dd7070Spatrick       if (HasSignificandDigits)
869*e5dd7070Spatrick         checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits);
870*e5dd7070Spatrick     }
871*e5dd7070Spatrick 
872*e5dd7070Spatrick     if (!HasSignificandDigits) {
873*e5dd7070Spatrick       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin),
874*e5dd7070Spatrick               diag::err_hex_constant_requires)
875*e5dd7070Spatrick           << PP.getLangOpts().CPlusPlus << 1;
876*e5dd7070Spatrick       hadError = true;
877*e5dd7070Spatrick       return;
878*e5dd7070Spatrick     }
879*e5dd7070Spatrick 
880*e5dd7070Spatrick     // A binary exponent can appear with or with a '.'. If dotted, the
881*e5dd7070Spatrick     // binary exponent is required.
882*e5dd7070Spatrick     if (*s == 'p' || *s == 'P') {
883*e5dd7070Spatrick       checkSeparator(TokLoc, s, CSK_AfterDigits);
884*e5dd7070Spatrick       const char *Exponent = s;
885*e5dd7070Spatrick       s++;
886*e5dd7070Spatrick       saw_exponent = true;
887*e5dd7070Spatrick       if (s != ThisTokEnd && (*s == '+' || *s == '-'))  s++; // sign
888*e5dd7070Spatrick       const char *first_non_digit = SkipDigits(s);
889*e5dd7070Spatrick       if (!containsDigits(s, first_non_digit)) {
890*e5dd7070Spatrick         if (!hadError) {
891*e5dd7070Spatrick           PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
892*e5dd7070Spatrick                   diag::err_exponent_has_no_digits);
893*e5dd7070Spatrick           hadError = true;
894*e5dd7070Spatrick         }
895*e5dd7070Spatrick         return;
896*e5dd7070Spatrick       }
897*e5dd7070Spatrick       checkSeparator(TokLoc, s, CSK_BeforeDigits);
898*e5dd7070Spatrick       s = first_non_digit;
899*e5dd7070Spatrick 
900*e5dd7070Spatrick       if (!PP.getLangOpts().HexFloats)
901*e5dd7070Spatrick         PP.Diag(TokLoc, PP.getLangOpts().CPlusPlus
902*e5dd7070Spatrick                             ? diag::ext_hex_literal_invalid
903*e5dd7070Spatrick                             : diag::ext_hex_constant_invalid);
904*e5dd7070Spatrick       else if (PP.getLangOpts().CPlusPlus17)
905*e5dd7070Spatrick         PP.Diag(TokLoc, diag::warn_cxx17_hex_literal);
906*e5dd7070Spatrick     } else if (saw_period) {
907*e5dd7070Spatrick       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin),
908*e5dd7070Spatrick               diag::err_hex_constant_requires)
909*e5dd7070Spatrick           << PP.getLangOpts().CPlusPlus << 0;
910*e5dd7070Spatrick       hadError = true;
911*e5dd7070Spatrick     }
912*e5dd7070Spatrick     return;
913*e5dd7070Spatrick   }
914*e5dd7070Spatrick 
915*e5dd7070Spatrick   // Handle simple binary numbers 0b01010
916*e5dd7070Spatrick   if ((c1 == 'b' || c1 == 'B') && (s[1] == '0' || s[1] == '1')) {
917*e5dd7070Spatrick     // 0b101010 is a C++1y / GCC extension.
918*e5dd7070Spatrick     PP.Diag(TokLoc,
919*e5dd7070Spatrick             PP.getLangOpts().CPlusPlus14
920*e5dd7070Spatrick               ? diag::warn_cxx11_compat_binary_literal
921*e5dd7070Spatrick               : PP.getLangOpts().CPlusPlus
922*e5dd7070Spatrick                 ? diag::ext_binary_literal_cxx14
923*e5dd7070Spatrick                 : diag::ext_binary_literal);
924*e5dd7070Spatrick     ++s;
925*e5dd7070Spatrick     assert(s < ThisTokEnd && "didn't maximally munch?");
926*e5dd7070Spatrick     radix = 2;
927*e5dd7070Spatrick     DigitsBegin = s;
928*e5dd7070Spatrick     s = SkipBinaryDigits(s);
929*e5dd7070Spatrick     if (s == ThisTokEnd) {
930*e5dd7070Spatrick       // Done.
931*e5dd7070Spatrick     } else if (isHexDigit(*s) &&
932*e5dd7070Spatrick                !isValidUDSuffix(PP.getLangOpts(),
933*e5dd7070Spatrick                                 StringRef(s, ThisTokEnd - s))) {
934*e5dd7070Spatrick       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
935*e5dd7070Spatrick               diag::err_invalid_digit) << StringRef(s, 1) << 2;
936*e5dd7070Spatrick       hadError = true;
937*e5dd7070Spatrick     }
938*e5dd7070Spatrick     // Other suffixes will be diagnosed by the caller.
939*e5dd7070Spatrick     return;
940*e5dd7070Spatrick   }
941*e5dd7070Spatrick 
942*e5dd7070Spatrick   // For now, the radix is set to 8. If we discover that we have a
943*e5dd7070Spatrick   // floating point constant, the radix will change to 10. Octal floating
944*e5dd7070Spatrick   // point constants are not permitted (only decimal and hexadecimal).
945*e5dd7070Spatrick   radix = 8;
946*e5dd7070Spatrick   DigitsBegin = s;
947*e5dd7070Spatrick   s = SkipOctalDigits(s);
948*e5dd7070Spatrick   if (s == ThisTokEnd)
949*e5dd7070Spatrick     return; // Done, simple octal number like 01234
950*e5dd7070Spatrick 
951*e5dd7070Spatrick   // If we have some other non-octal digit that *is* a decimal digit, see if
952*e5dd7070Spatrick   // this is part of a floating point number like 094.123 or 09e1.
953*e5dd7070Spatrick   if (isDigit(*s)) {
954*e5dd7070Spatrick     const char *EndDecimal = SkipDigits(s);
955*e5dd7070Spatrick     if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
956*e5dd7070Spatrick       s = EndDecimal;
957*e5dd7070Spatrick       radix = 10;
958*e5dd7070Spatrick     }
959*e5dd7070Spatrick   }
960*e5dd7070Spatrick 
961*e5dd7070Spatrick   ParseDecimalOrOctalCommon(TokLoc);
962*e5dd7070Spatrick }
963*e5dd7070Spatrick 
964*e5dd7070Spatrick static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
965*e5dd7070Spatrick   switch (Radix) {
966*e5dd7070Spatrick   case 2:
967*e5dd7070Spatrick     return NumDigits <= 64;
968*e5dd7070Spatrick   case 8:
969*e5dd7070Spatrick     return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
970*e5dd7070Spatrick   case 10:
971*e5dd7070Spatrick     return NumDigits <= 19; // floor(log10(2^64))
972*e5dd7070Spatrick   case 16:
973*e5dd7070Spatrick     return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
974*e5dd7070Spatrick   default:
975*e5dd7070Spatrick     llvm_unreachable("impossible Radix");
976*e5dd7070Spatrick   }
977*e5dd7070Spatrick }
978*e5dd7070Spatrick 
979*e5dd7070Spatrick /// GetIntegerValue - Convert this numeric literal value to an APInt that
980*e5dd7070Spatrick /// matches Val's input width.  If there is an overflow, set Val to the low bits
981*e5dd7070Spatrick /// of the result and return true.  Otherwise, return false.
982*e5dd7070Spatrick bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
983*e5dd7070Spatrick   // Fast path: Compute a conservative bound on the maximum number of
984*e5dd7070Spatrick   // bits per digit in this radix. If we can't possibly overflow a
985*e5dd7070Spatrick   // uint64 based on that bound then do the simple conversion to
986*e5dd7070Spatrick   // integer. This avoids the expensive overflow checking below, and
987*e5dd7070Spatrick   // handles the common cases that matter (small decimal integers and
988*e5dd7070Spatrick   // hex/octal values which don't overflow).
989*e5dd7070Spatrick   const unsigned NumDigits = SuffixBegin - DigitsBegin;
990*e5dd7070Spatrick   if (alwaysFitsInto64Bits(radix, NumDigits)) {
991*e5dd7070Spatrick     uint64_t N = 0;
992*e5dd7070Spatrick     for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)
993*e5dd7070Spatrick       if (!isDigitSeparator(*Ptr))
994*e5dd7070Spatrick         N = N * radix + llvm::hexDigitValue(*Ptr);
995*e5dd7070Spatrick 
996*e5dd7070Spatrick     // This will truncate the value to Val's input width. Simply check
997*e5dd7070Spatrick     // for overflow by comparing.
998*e5dd7070Spatrick     Val = N;
999*e5dd7070Spatrick     return Val.getZExtValue() != N;
1000*e5dd7070Spatrick   }
1001*e5dd7070Spatrick 
1002*e5dd7070Spatrick   Val = 0;
1003*e5dd7070Spatrick   const char *Ptr = DigitsBegin;
1004*e5dd7070Spatrick 
1005*e5dd7070Spatrick   llvm::APInt RadixVal(Val.getBitWidth(), radix);
1006*e5dd7070Spatrick   llvm::APInt CharVal(Val.getBitWidth(), 0);
1007*e5dd7070Spatrick   llvm::APInt OldVal = Val;
1008*e5dd7070Spatrick 
1009*e5dd7070Spatrick   bool OverflowOccurred = false;
1010*e5dd7070Spatrick   while (Ptr < SuffixBegin) {
1011*e5dd7070Spatrick     if (isDigitSeparator(*Ptr)) {
1012*e5dd7070Spatrick       ++Ptr;
1013*e5dd7070Spatrick       continue;
1014*e5dd7070Spatrick     }
1015*e5dd7070Spatrick 
1016*e5dd7070Spatrick     unsigned C = llvm::hexDigitValue(*Ptr++);
1017*e5dd7070Spatrick 
1018*e5dd7070Spatrick     // If this letter is out of bound for this radix, reject it.
1019*e5dd7070Spatrick     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
1020*e5dd7070Spatrick 
1021*e5dd7070Spatrick     CharVal = C;
1022*e5dd7070Spatrick 
1023*e5dd7070Spatrick     // Add the digit to the value in the appropriate radix.  If adding in digits
1024*e5dd7070Spatrick     // made the value smaller, then this overflowed.
1025*e5dd7070Spatrick     OldVal = Val;
1026*e5dd7070Spatrick 
1027*e5dd7070Spatrick     // Multiply by radix, did overflow occur on the multiply?
1028*e5dd7070Spatrick     Val *= RadixVal;
1029*e5dd7070Spatrick     OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
1030*e5dd7070Spatrick 
1031*e5dd7070Spatrick     // Add value, did overflow occur on the value?
1032*e5dd7070Spatrick     //   (a + b) ult b  <=> overflow
1033*e5dd7070Spatrick     Val += CharVal;
1034*e5dd7070Spatrick     OverflowOccurred |= Val.ult(CharVal);
1035*e5dd7070Spatrick   }
1036*e5dd7070Spatrick   return OverflowOccurred;
1037*e5dd7070Spatrick }
1038*e5dd7070Spatrick 
1039*e5dd7070Spatrick llvm::APFloat::opStatus
1040*e5dd7070Spatrick NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
1041*e5dd7070Spatrick   using llvm::APFloat;
1042*e5dd7070Spatrick 
1043*e5dd7070Spatrick   unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
1044*e5dd7070Spatrick 
1045*e5dd7070Spatrick   llvm::SmallString<16> Buffer;
1046*e5dd7070Spatrick   StringRef Str(ThisTokBegin, n);
1047*e5dd7070Spatrick   if (Str.find('\'') != StringRef::npos) {
1048*e5dd7070Spatrick     Buffer.reserve(n);
1049*e5dd7070Spatrick     std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer),
1050*e5dd7070Spatrick                         &isDigitSeparator);
1051*e5dd7070Spatrick     Str = Buffer;
1052*e5dd7070Spatrick   }
1053*e5dd7070Spatrick 
1054*e5dd7070Spatrick   auto StatusOrErr =
1055*e5dd7070Spatrick       Result.convertFromString(Str, APFloat::rmNearestTiesToEven);
1056*e5dd7070Spatrick   assert(StatusOrErr && "Invalid floating point representation");
1057*e5dd7070Spatrick   return !errorToBool(StatusOrErr.takeError()) ? *StatusOrErr
1058*e5dd7070Spatrick                                                : APFloat::opInvalidOp;
1059*e5dd7070Spatrick }
1060*e5dd7070Spatrick 
/// Return true if \p c is one of the exponent markers 'e', 'E', 'p' or 'P'.
static inline bool IsExponentPart(char c) {
  switch (c) {
  case 'e':
  case 'E':
  case 'p':
  case 'P':
    return true;
  default:
    return false;
  }
}
1064*e5dd7070Spatrick 
1065*e5dd7070Spatrick bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) {
1066*e5dd7070Spatrick   assert(radix == 16 || radix == 10);
1067*e5dd7070Spatrick 
1068*e5dd7070Spatrick   // Find how many digits are needed to store the whole literal.
1069*e5dd7070Spatrick   unsigned NumDigits = SuffixBegin - DigitsBegin;
1070*e5dd7070Spatrick   if (saw_period) --NumDigits;
1071*e5dd7070Spatrick 
1072*e5dd7070Spatrick   // Initial scan of the exponent if it exists
1073*e5dd7070Spatrick   bool ExpOverflowOccurred = false;
1074*e5dd7070Spatrick   bool NegativeExponent = false;
1075*e5dd7070Spatrick   const char *ExponentBegin;
1076*e5dd7070Spatrick   uint64_t Exponent = 0;
1077*e5dd7070Spatrick   int64_t BaseShift = 0;
1078*e5dd7070Spatrick   if (saw_exponent) {
1079*e5dd7070Spatrick     const char *Ptr = DigitsBegin;
1080*e5dd7070Spatrick 
1081*e5dd7070Spatrick     while (!IsExponentPart(*Ptr)) ++Ptr;
1082*e5dd7070Spatrick     ExponentBegin = Ptr;
1083*e5dd7070Spatrick     ++Ptr;
1084*e5dd7070Spatrick     NegativeExponent = *Ptr == '-';
1085*e5dd7070Spatrick     if (NegativeExponent) ++Ptr;
1086*e5dd7070Spatrick 
1087*e5dd7070Spatrick     unsigned NumExpDigits = SuffixBegin - Ptr;
1088*e5dd7070Spatrick     if (alwaysFitsInto64Bits(radix, NumExpDigits)) {
1089*e5dd7070Spatrick       llvm::StringRef ExpStr(Ptr, NumExpDigits);
1090*e5dd7070Spatrick       llvm::APInt ExpInt(/*numBits=*/64, ExpStr, /*radix=*/10);
1091*e5dd7070Spatrick       Exponent = ExpInt.getZExtValue();
1092*e5dd7070Spatrick     } else {
1093*e5dd7070Spatrick       ExpOverflowOccurred = true;
1094*e5dd7070Spatrick     }
1095*e5dd7070Spatrick 
1096*e5dd7070Spatrick     if (NegativeExponent) BaseShift -= Exponent;
1097*e5dd7070Spatrick     else BaseShift += Exponent;
1098*e5dd7070Spatrick   }
1099*e5dd7070Spatrick 
1100*e5dd7070Spatrick   // Number of bits needed for decimal literal is
1101*e5dd7070Spatrick   //   ceil(NumDigits * log2(10))       Integral part
1102*e5dd7070Spatrick   // + Scale                            Fractional part
1103*e5dd7070Spatrick   // + ceil(Exponent * log2(10))        Exponent
1104*e5dd7070Spatrick   // --------------------------------------------------
1105*e5dd7070Spatrick   //   ceil((NumDigits + Exponent) * log2(10)) + Scale
1106*e5dd7070Spatrick   //
1107*e5dd7070Spatrick   // But for simplicity in handling integers, we can round up log2(10) to 4,
1108*e5dd7070Spatrick   // making:
1109*e5dd7070Spatrick   // 4 * (NumDigits + Exponent) + Scale
1110*e5dd7070Spatrick   //
1111*e5dd7070Spatrick   // Number of digits needed for hexadecimal literal is
1112*e5dd7070Spatrick   //   4 * NumDigits                    Integral part
1113*e5dd7070Spatrick   // + Scale                            Fractional part
1114*e5dd7070Spatrick   // + Exponent                         Exponent
1115*e5dd7070Spatrick   // --------------------------------------------------
1116*e5dd7070Spatrick   //   (4 * NumDigits) + Scale + Exponent
1117*e5dd7070Spatrick   uint64_t NumBitsNeeded;
1118*e5dd7070Spatrick   if (radix == 10)
1119*e5dd7070Spatrick     NumBitsNeeded = 4 * (NumDigits + Exponent) + Scale;
1120*e5dd7070Spatrick   else
1121*e5dd7070Spatrick     NumBitsNeeded = 4 * NumDigits + Exponent + Scale;
1122*e5dd7070Spatrick 
1123*e5dd7070Spatrick   if (NumBitsNeeded > std::numeric_limits<unsigned>::max())
1124*e5dd7070Spatrick     ExpOverflowOccurred = true;
1125*e5dd7070Spatrick   llvm::APInt Val(static_cast<unsigned>(NumBitsNeeded), 0, /*isSigned=*/false);
1126*e5dd7070Spatrick 
1127*e5dd7070Spatrick   bool FoundDecimal = false;
1128*e5dd7070Spatrick 
1129*e5dd7070Spatrick   int64_t FractBaseShift = 0;
1130*e5dd7070Spatrick   const char *End = saw_exponent ? ExponentBegin : SuffixBegin;
1131*e5dd7070Spatrick   for (const char *Ptr = DigitsBegin; Ptr < End; ++Ptr) {
1132*e5dd7070Spatrick     if (*Ptr == '.') {
1133*e5dd7070Spatrick       FoundDecimal = true;
1134*e5dd7070Spatrick       continue;
1135*e5dd7070Spatrick     }
1136*e5dd7070Spatrick 
1137*e5dd7070Spatrick     // Normal reading of an integer
1138*e5dd7070Spatrick     unsigned C = llvm::hexDigitValue(*Ptr);
1139*e5dd7070Spatrick     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
1140*e5dd7070Spatrick 
1141*e5dd7070Spatrick     Val *= radix;
1142*e5dd7070Spatrick     Val += C;
1143*e5dd7070Spatrick 
1144*e5dd7070Spatrick     if (FoundDecimal)
1145*e5dd7070Spatrick       // Keep track of how much we will need to adjust this value by from the
1146*e5dd7070Spatrick       // number of digits past the radix point.
1147*e5dd7070Spatrick       --FractBaseShift;
1148*e5dd7070Spatrick   }
1149*e5dd7070Spatrick 
1150*e5dd7070Spatrick   // For a radix of 16, we will be multiplying by 2 instead of 16.
1151*e5dd7070Spatrick   if (radix == 16) FractBaseShift *= 4;
1152*e5dd7070Spatrick   BaseShift += FractBaseShift;
1153*e5dd7070Spatrick 
1154*e5dd7070Spatrick   Val <<= Scale;
1155*e5dd7070Spatrick 
1156*e5dd7070Spatrick   uint64_t Base = (radix == 16) ? 2 : 10;
1157*e5dd7070Spatrick   if (BaseShift > 0) {
1158*e5dd7070Spatrick     for (int64_t i = 0; i < BaseShift; ++i) {
1159*e5dd7070Spatrick       Val *= Base;
1160*e5dd7070Spatrick     }
1161*e5dd7070Spatrick   } else if (BaseShift < 0) {
1162*e5dd7070Spatrick     for (int64_t i = BaseShift; i < 0 && !Val.isNullValue(); ++i)
1163*e5dd7070Spatrick       Val = Val.udiv(Base);
1164*e5dd7070Spatrick   }
1165*e5dd7070Spatrick 
1166*e5dd7070Spatrick   bool IntOverflowOccurred = false;
1167*e5dd7070Spatrick   auto MaxVal = llvm::APInt::getMaxValue(StoreVal.getBitWidth());
1168*e5dd7070Spatrick   if (Val.getBitWidth() > StoreVal.getBitWidth()) {
1169*e5dd7070Spatrick     IntOverflowOccurred |= Val.ugt(MaxVal.zext(Val.getBitWidth()));
1170*e5dd7070Spatrick     StoreVal = Val.trunc(StoreVal.getBitWidth());
1171*e5dd7070Spatrick   } else if (Val.getBitWidth() < StoreVal.getBitWidth()) {
1172*e5dd7070Spatrick     IntOverflowOccurred |= Val.zext(MaxVal.getBitWidth()).ugt(MaxVal);
1173*e5dd7070Spatrick     StoreVal = Val.zext(StoreVal.getBitWidth());
1174*e5dd7070Spatrick   } else {
1175*e5dd7070Spatrick     StoreVal = Val;
1176*e5dd7070Spatrick   }
1177*e5dd7070Spatrick 
1178*e5dd7070Spatrick   return IntOverflowOccurred || ExpOverflowOccurred;
1179*e5dd7070Spatrick }
1180*e5dd7070Spatrick 
1181*e5dd7070Spatrick /// \verbatim
1182*e5dd7070Spatrick ///       user-defined-character-literal: [C++11 lex.ext]
1183*e5dd7070Spatrick ///         character-literal ud-suffix
1184*e5dd7070Spatrick ///       ud-suffix:
1185*e5dd7070Spatrick ///         identifier
1186*e5dd7070Spatrick ///       character-literal: [C++11 lex.ccon]
1187*e5dd7070Spatrick ///         ' c-char-sequence '
1188*e5dd7070Spatrick ///         u' c-char-sequence '
1189*e5dd7070Spatrick ///         U' c-char-sequence '
1190*e5dd7070Spatrick ///         L' c-char-sequence '
1191*e5dd7070Spatrick ///         u8' c-char-sequence ' [C++1z lex.ccon]
1192*e5dd7070Spatrick ///       c-char-sequence:
1193*e5dd7070Spatrick ///         c-char
1194*e5dd7070Spatrick ///         c-char-sequence c-char
1195*e5dd7070Spatrick ///       c-char:
1196*e5dd7070Spatrick ///         any member of the source character set except the single-quote ',
1197*e5dd7070Spatrick ///           backslash \, or new-line character
1198*e5dd7070Spatrick ///         escape-sequence
1199*e5dd7070Spatrick ///         universal-character-name
1200*e5dd7070Spatrick ///       escape-sequence:
1201*e5dd7070Spatrick ///         simple-escape-sequence
1202*e5dd7070Spatrick ///         octal-escape-sequence
1203*e5dd7070Spatrick ///         hexadecimal-escape-sequence
1204*e5dd7070Spatrick ///       simple-escape-sequence:
1205*e5dd7070Spatrick ///         one of \' \" \? \\ \a \b \f \n \r \t \v
1206*e5dd7070Spatrick ///       octal-escape-sequence:
1207*e5dd7070Spatrick ///         \ octal-digit
1208*e5dd7070Spatrick ///         \ octal-digit octal-digit
1209*e5dd7070Spatrick ///         \ octal-digit octal-digit octal-digit
1210*e5dd7070Spatrick ///       hexadecimal-escape-sequence:
1211*e5dd7070Spatrick ///         \x hexadecimal-digit
1212*e5dd7070Spatrick ///         hexadecimal-escape-sequence hexadecimal-digit
1213*e5dd7070Spatrick ///       universal-character-name: [C++11 lex.charset]
1214*e5dd7070Spatrick ///         \u hex-quad
1215*e5dd7070Spatrick ///         \U hex-quad hex-quad
1216*e5dd7070Spatrick ///       hex-quad:
1217*e5dd7070Spatrick ///         hex-digit hex-digit hex-digit hex-digit
1218*e5dd7070Spatrick /// \endverbatim
1219*e5dd7070Spatrick ///
/// Parse the spelling of a single character-literal token.
///
/// \param begin  First character of the token spelling (the encoding prefix if
///               there is one, otherwise the opening quote).
/// \param end    One past the last character of the token spelling.
/// \param Loc    Source location attached to the diagnostics reported here and
///               handed to the escape-processing helpers.
/// \param PP     Supplies target info, language options and diagnostics.
/// \param kind   Token kind; selects the prefix length and the value limits.
///
/// On return HadError, IsMultiChar and Value are set; UDSuffixBuf and
/// UDSuffixOffset are written only when the token carries a ud-suffix.
1220*e5dd7070Spatrick CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
1221*e5dd7070Spatrick                                      SourceLocation Loc, Preprocessor &PP,
1222*e5dd7070Spatrick                                      tok::TokenKind kind) {
1223*e5dd7070Spatrick   // At this point we know that the character matches the regex "(L|u|u8|U)?'.*'"
  // (optionally followed by a ud-suffix, which is stripped below).
1224*e5dd7070Spatrick   HadError = false;
1225*e5dd7070Spatrick 
1226*e5dd7070Spatrick   Kind = kind;
1227*e5dd7070Spatrick 
  // Remember the start of the whole token: the ud-suffix offset is measured
  // from it and the escape helpers receive it alongside the moving cursor.
1228*e5dd7070Spatrick   const char *TokBegin = begin;
1229*e5dd7070Spatrick 
1230*e5dd7070Spatrick   // Skip over wide character determinant.
  // Every kind other than a plain char constant has one prefix character;
  // the u8 prefix has a second one.
1231*e5dd7070Spatrick   if (Kind != tok::char_constant)
1232*e5dd7070Spatrick     ++begin;
1233*e5dd7070Spatrick   if (Kind == tok::utf8_char_constant)
1234*e5dd7070Spatrick     ++begin;
1235*e5dd7070Spatrick 
1236*e5dd7070Spatrick   // Skip over the entry quote.
1237*e5dd7070Spatrick   assert(begin[0] == '\'' && "Invalid token lexed");
1238*e5dd7070Spatrick   ++begin;
1239*e5dd7070Spatrick 
1240*e5dd7070Spatrick   // Remove an optional ud-suffix.
  // Walk 'end' backwards until it sits just past the closing quote; the
  // suffix is then [end, UDSuffixEnd).  The loop relies on the token actually
  // containing a quote.
1241*e5dd7070Spatrick   if (end[-1] != '\'') {
1242*e5dd7070Spatrick     const char *UDSuffixEnd = end;
1243*e5dd7070Spatrick     do {
1244*e5dd7070Spatrick       --end;
1245*e5dd7070Spatrick     } while (end[-1] != '\'');
1246*e5dd7070Spatrick     // FIXME: Don't bother with this if !tok.hasUCN().
1247*e5dd7070Spatrick     expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
1248*e5dd7070Spatrick     UDSuffixOffset = end - TokBegin;
1249*e5dd7070Spatrick   }
1250*e5dd7070Spatrick 
1251*e5dd7070Spatrick   // Trim the ending quote.
1252*e5dd7070Spatrick   assert(end != begin && "Invalid token lexed");
1253*e5dd7070Spatrick   --end;
1254*e5dd7070Spatrick 
1255*e5dd7070Spatrick   // FIXME: The "Value" is an uint64_t so we can handle char literals of
1256*e5dd7070Spatrick   // up to 64-bits.
1257*e5dd7070Spatrick   // FIXME: This extensively assumes that 'char' is 8-bits.
1258*e5dd7070Spatrick   assert(PP.getTargetInfo().getCharWidth() == 8 &&
1259*e5dd7070Spatrick          "Assumes char is 8 bits");
1260*e5dd7070Spatrick   assert(PP.getTargetInfo().getIntWidth() <= 64 &&
1261*e5dd7070Spatrick          (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
1262*e5dd7070Spatrick          "Assumes sizeof(int) on target is <= 64 and a multiple of char");
1263*e5dd7070Spatrick   assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
1264*e5dd7070Spatrick          "Assumes sizeof(wchar) on target is <= 64");
1265*e5dd7070Spatrick 
  // Scratch buffer with one 32-bit slot per remaining source byte.
  // buffer_begin is the write cursor and buffer_end the limit given to the
  // UTF-8 converter.
  // NOTE(review): front() is taken unconditionally, so this presumably relies
  // on the lexer never forwarding an empty literal ('') -- confirm.
1266*e5dd7070Spatrick   SmallVector<uint32_t, 4> codepoint_buffer;
1267*e5dd7070Spatrick   codepoint_buffer.resize(end - begin);
1268*e5dd7070Spatrick   uint32_t *buffer_begin = &codepoint_buffer.front();
1269*e5dd7070Spatrick   uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
1270*e5dd7070Spatrick 
1271*e5dd7070Spatrick   // Unicode escapes representing characters that cannot be correctly
1272*e5dd7070Spatrick   // represented in a single code unit are disallowed in character literals
1273*e5dd7070Spatrick   // by this implementation.
  // NOTE(review): for wide literals the shift count is 32 - wchar width; the
  // assert above admits widths up to 64, for which this unsigned subtraction
  // would wrap and the shift would be out of range.  Presumably no supported
  // target has wchar_t wider than 32 bits -- confirm.
1274*e5dd7070Spatrick   uint32_t largest_character_for_kind;
1275*e5dd7070Spatrick   if (tok::wide_char_constant == Kind) {
1276*e5dd7070Spatrick     largest_character_for_kind =
1277*e5dd7070Spatrick         0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
1278*e5dd7070Spatrick   } else if (tok::utf8_char_constant == Kind) {
1279*e5dd7070Spatrick     largest_character_for_kind = 0x7F;
1280*e5dd7070Spatrick   } else if (tok::utf16_char_constant == Kind) {
1281*e5dd7070Spatrick     largest_character_for_kind = 0xFFFF;
1282*e5dd7070Spatrick   } else if (tok::utf32_char_constant == Kind) {
1283*e5dd7070Spatrick     largest_character_for_kind = 0x10FFFF;
1284*e5dd7070Spatrick   } else {
1285*e5dd7070Spatrick     largest_character_for_kind = 0x7Fu;
1286*e5dd7070Spatrick   }
1287*e5dd7070Spatrick 
  // Decode the literal body.  Each iteration consumes either a run of
  // non-escape bytes, one universal-character-name, or one other escape.
1288*e5dd7070Spatrick   while (begin != end) {
1289*e5dd7070Spatrick     // Is this a span of non-escape characters?
1290*e5dd7070Spatrick     if (begin[0] != '\\') {
1291*e5dd7070Spatrick       char const *start = begin;
1292*e5dd7070Spatrick       do {
1293*e5dd7070Spatrick         ++begin;
1294*e5dd7070Spatrick       } while (begin != end && *begin != '\\');
1295*e5dd7070Spatrick 
      // Save both cursors: the converter is handed pointers to 'start' and
      // 'buffer_begin', and they are rewound from these copies on the
      // byte-copy fallback below.
1296*e5dd7070Spatrick       char const *tmp_in_start = start;
1297*e5dd7070Spatrick       uint32_t *tmp_out_start = buffer_begin;
1298*e5dd7070Spatrick       llvm::ConversionResult res =
1299*e5dd7070Spatrick           llvm::ConvertUTF8toUTF32(reinterpret_cast<llvm::UTF8 const **>(&start),
1300*e5dd7070Spatrick                              reinterpret_cast<llvm::UTF8 const *>(begin),
1301*e5dd7070Spatrick                              &buffer_begin, buffer_end, llvm::strictConversion);
1302*e5dd7070Spatrick       if (res != llvm::conversionOK) {
1303*e5dd7070Spatrick         // If we see bad encoding for unprefixed character literals, warn and
1304*e5dd7070Spatrick         // simply copy the byte values, for compatibility with gcc and
1305*e5dd7070Spatrick         // older versions of clang.
1306*e5dd7070Spatrick         bool NoErrorOnBadEncoding = isAscii();
1307*e5dd7070Spatrick         unsigned Msg = diag::err_bad_character_encoding;
1308*e5dd7070Spatrick         if (NoErrorOnBadEncoding)
1309*e5dd7070Spatrick           Msg = diag::warn_bad_character_encoding;
1310*e5dd7070Spatrick         PP.Diag(Loc, Msg);
1311*e5dd7070Spatrick         if (NoErrorOnBadEncoding) {
          // Rewind to the start of the span and store each source byte,
          // zero-extended, as its own element.
1312*e5dd7070Spatrick           start = tmp_in_start;
1313*e5dd7070Spatrick           buffer_begin = tmp_out_start;
1314*e5dd7070Spatrick           for (; start != begin; ++start, ++buffer_begin)
1315*e5dd7070Spatrick             *buffer_begin = static_cast<uint8_t>(*start);
1316*e5dd7070Spatrick         } else {
1317*e5dd7070Spatrick           HadError = true;
1318*e5dd7070Spatrick         }
1319*e5dd7070Spatrick       } else {
        // Conversion succeeded: reject every newly decoded code point that
        // exceeds the limit for this literal kind (one diagnostic each).
1320*e5dd7070Spatrick         for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
1321*e5dd7070Spatrick           if (*tmp_out_start > largest_character_for_kind) {
1322*e5dd7070Spatrick             HadError = true;
1323*e5dd7070Spatrick             PP.Diag(Loc, diag::err_character_too_large);
1324*e5dd7070Spatrick           }
1325*e5dd7070Spatrick         }
1326*e5dd7070Spatrick       }
1327*e5dd7070Spatrick 
1328*e5dd7070Spatrick       continue;
1329*e5dd7070Spatrick     }
1330*e5dd7070Spatrick     // Is this a Universal Character Name escape?
    // ProcessUCNEscape writes the code point into *buffer_begin; presumably
    // it also advances 'begin' past the escape, since nothing else on this
    // path moves the cursor.  The output slot is consumed even on failure.
1331*e5dd7070Spatrick     if (begin[1] == 'u' || begin[1] == 'U') {
1332*e5dd7070Spatrick       unsigned short UcnLen = 0;
1333*e5dd7070Spatrick       if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
1334*e5dd7070Spatrick                             FullSourceLoc(Loc, PP.getSourceManager()),
1335*e5dd7070Spatrick                             &PP.getDiagnostics(), PP.getLangOpts(), true)) {
1336*e5dd7070Spatrick         HadError = true;
1337*e5dd7070Spatrick       } else if (*buffer_begin > largest_character_for_kind) {
1338*e5dd7070Spatrick         HadError = true;
1339*e5dd7070Spatrick         PP.Diag(Loc, diag::err_character_too_large);
1340*e5dd7070Spatrick       }
1341*e5dd7070Spatrick 
1342*e5dd7070Spatrick       ++buffer_begin;
1343*e5dd7070Spatrick       continue;
1344*e5dd7070Spatrick     }
    // Any other escape (simple, octal or hexadecimal).  ProcessCharEscape may
    // set HadError; its 64-bit result is stored into the 32-bit slot.
1345*e5dd7070Spatrick     unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
1346*e5dd7070Spatrick     uint64_t result =
1347*e5dd7070Spatrick       ProcessCharEscape(TokBegin, begin, end, HadError,
1348*e5dd7070Spatrick                         FullSourceLoc(Loc,PP.getSourceManager()),
1349*e5dd7070Spatrick                         CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
1350*e5dd7070Spatrick     *buffer_begin++ = result;
1351*e5dd7070Spatrick   }
1352*e5dd7070Spatrick 
  // Number of elements written to the scratch buffer.
1353*e5dd7070Spatrick   unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();
1354*e5dd7070Spatrick 
  // More than one element makes this a multi-character literal; which
  // diagnostic is issued depends on the literal kind and, for unprefixed
  // literals, on whether there are exactly four characters.
1355*e5dd7070Spatrick   if (NumCharsSoFar > 1) {
1356*e5dd7070Spatrick     if (isWide())
1357*e5dd7070Spatrick       PP.Diag(Loc, diag::warn_extraneous_char_constant);
1358*e5dd7070Spatrick     else if (isAscii() && NumCharsSoFar == 4)
1359*e5dd7070Spatrick       PP.Diag(Loc, diag::ext_four_char_character_literal);
1360*e5dd7070Spatrick     else if (isAscii())
1361*e5dd7070Spatrick       PP.Diag(Loc, diag::ext_multichar_character_literal);
1362*e5dd7070Spatrick     else
1363*e5dd7070Spatrick       PP.Diag(Loc, diag::err_multichar_utf_character_literal);
1364*e5dd7070Spatrick     IsMultiChar = true;
1365*e5dd7070Spatrick   } else {
1366*e5dd7070Spatrick     IsMultiChar = false;
1367*e5dd7070Spatrick   }
1368*e5dd7070Spatrick 
  // Accumulator as wide as the target's 'int'.
1369*e5dd7070Spatrick   llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
1370*e5dd7070Spatrick 
1371*e5dd7070Spatrick   // Narrow character literals act as though their value is concatenated
1372*e5dd7070Spatrick   // in this implementation, but warn on overflow.
1373*e5dd7070Spatrick   bool multi_char_too_long = false;
1374*e5dd7070Spatrick   if (isAscii() && isMultiChar()) {
1375*e5dd7070Spatrick     LitVal = 0;
1376*e5dd7070Spatrick     for (size_t i = 0; i < NumCharsSoFar; ++i) {
1377*e5dd7070Spatrick       // check for enough leading zeros to shift into
1378*e5dd7070Spatrick       multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
1379*e5dd7070Spatrick       LitVal <<= 8;
1380*e5dd7070Spatrick       LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
1381*e5dd7070Spatrick     }
1382*e5dd7070Spatrick   } else if (NumCharsSoFar > 0) {
1383*e5dd7070Spatrick     // otherwise just take the last character
1384*e5dd7070Spatrick     LitVal = buffer_begin[-1];
1385*e5dd7070Spatrick   }
1386*e5dd7070Spatrick 
  // The overflow warning is suppressed when an error was already recorded.
1387*e5dd7070Spatrick   if (!HadError && multi_char_too_long) {
1388*e5dd7070Spatrick     PP.Diag(Loc, diag::warn_char_constant_too_large);
1389*e5dd7070Spatrick   }
1390*e5dd7070Spatrick 
1391*e5dd7070Spatrick   // Transfer the value from APInt to uint64_t
1392*e5dd7070Spatrick   Value = LitVal.getZExtValue();
1393*e5dd7070Spatrick 
1394*e5dd7070Spatrick   // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
1395*e5dd7070Spatrick   // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
1396*e5dd7070Spatrick   // character constants are not sign extended in this implementation:
1397*e5dd7070Spatrick   // '\xFF\xFF' = 65535 and '\x0\xFF' = 255, which matches GCC.
1398*e5dd7070Spatrick   if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
1399*e5dd7070Spatrick       PP.getLangOpts().CharIsSigned)
1400*e5dd7070Spatrick     Value = (signed char)Value;
1401*e5dd7070Spatrick }
1402*e5dd7070Spatrick 
1403*e5dd7070Spatrick /// \verbatim
1404*e5dd7070Spatrick ///       string-literal: [C++0x lex.string]
1405*e5dd7070Spatrick ///         encoding-prefix " [s-char-sequence] "
1406*e5dd7070Spatrick ///         encoding-prefix R raw-string
1407*e5dd7070Spatrick ///       encoding-prefix:
1408*e5dd7070Spatrick ///         u8
1409*e5dd7070Spatrick ///         u
1410*e5dd7070Spatrick ///         U
1411*e5dd7070Spatrick ///         L
1412*e5dd7070Spatrick ///       s-char-sequence:
1413*e5dd7070Spatrick ///         s-char
1414*e5dd7070Spatrick ///         s-char-sequence s-char
1415*e5dd7070Spatrick ///       s-char:
1416*e5dd7070Spatrick ///         any member of the source character set except the double-quote ",
1417*e5dd7070Spatrick ///           backslash \, or new-line character
1418*e5dd7070Spatrick ///         escape-sequence
1419*e5dd7070Spatrick ///         universal-character-name
1420*e5dd7070Spatrick ///       raw-string:
1421*e5dd7070Spatrick ///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
1422*e5dd7070Spatrick ///       r-char-sequence:
1423*e5dd7070Spatrick ///         r-char
1424*e5dd7070Spatrick ///         r-char-sequence r-char
1425*e5dd7070Spatrick ///       r-char:
1426*e5dd7070Spatrick ///         any member of the source character set, except a right parenthesis )
1427*e5dd7070Spatrick ///           followed by the initial d-char-sequence (which may be empty)
1428*e5dd7070Spatrick ///           followed by a double quote ".
1429*e5dd7070Spatrick ///       d-char-sequence:
1430*e5dd7070Spatrick ///         d-char
1431*e5dd7070Spatrick ///         d-char-sequence d-char
1432*e5dd7070Spatrick ///       d-char:
1433*e5dd7070Spatrick ///         any member of the basic source character set except:
1434*e5dd7070Spatrick ///           space, the left parenthesis (, the right parenthesis ),
1435*e5dd7070Spatrick ///           the backslash \, and the control characters representing horizontal
1436*e5dd7070Spatrick ///           tab, vertical tab, form feed, and newline.
1437*e5dd7070Spatrick ///       escape-sequence: [C++0x lex.ccon]
1438*e5dd7070Spatrick ///         simple-escape-sequence
1439*e5dd7070Spatrick ///         octal-escape-sequence
1440*e5dd7070Spatrick ///         hexadecimal-escape-sequence
1441*e5dd7070Spatrick ///       simple-escape-sequence:
1442*e5dd7070Spatrick ///         one of \' \" \? \\ \a \b \f \n \r \t \v
1443*e5dd7070Spatrick ///       octal-escape-sequence:
1444*e5dd7070Spatrick ///         \ octal-digit
1445*e5dd7070Spatrick ///         \ octal-digit octal-digit
1446*e5dd7070Spatrick ///         \ octal-digit octal-digit octal-digit
1447*e5dd7070Spatrick ///       hexadecimal-escape-sequence:
1448*e5dd7070Spatrick ///         \x hexadecimal-digit
1449*e5dd7070Spatrick ///         hexadecimal-escape-sequence hexadecimal-digit
1450*e5dd7070Spatrick ///       universal-character-name:
1451*e5dd7070Spatrick ///         \u hex-quad
1452*e5dd7070Spatrick ///         \U hex-quad hex-quad
1453*e5dd7070Spatrick ///       hex-quad:
1454*e5dd7070Spatrick ///         hex-digit hex-digit hex-digit hex-digit
1455*e5dd7070Spatrick /// \endverbatim
1456*e5dd7070Spatrick ///
/// Construct a parser over the tokens of one (possibly concatenated) string
/// literal.  The source manager, language options and target info are taken
/// from \p PP.  Diagnostics are reported only when \p Complain is true;
/// otherwise Diags is null.  All parsing is delegated to init().
1457*e5dd7070Spatrick StringLiteralParser::
1458*e5dd7070Spatrick StringLiteralParser(ArrayRef<Token> StringToks,
1459*e5dd7070Spatrick                     Preprocessor &PP, bool Complain)
1460*e5dd7070Spatrick   : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
1461*e5dd7070Spatrick     Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() :nullptr),
1462*e5dd7070Spatrick     MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
1463*e5dd7070Spatrick     ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
1464*e5dd7070Spatrick   init(StringToks);
1465*e5dd7070Spatrick }
1466*e5dd7070Spatrick 
/// Concatenate and decode the string-literal tokens in \p StringToks.
///
/// A first pass over the tokens determines the literal kind, the longest token
/// and an upper bound on the output size.  A second pass gets each token's
/// spelling, strips prefix, quotes and any ud-suffix, and appends the decoded
/// characters to ResultBuf through ResultPtr.  Finally pascal strings get
/// their length stored in element 0 and over-long literals are diagnosed.
///
/// Problems are recorded in hadError; diagnostics are emitted only when Diags
/// is non-null.  Several malformed-token cases return early through
/// DiagnoseLexingError().
1467*e5dd7070Spatrick void StringLiteralParser::init(ArrayRef<Token> StringToks){
1468*e5dd7070Spatrick   // The literal token may have come from an invalid source location (e.g. due
1469*e5dd7070Spatrick   // to a PCH error), in which case the token length will be 0.
1470*e5dd7070Spatrick   if (StringToks.empty() || StringToks[0].getLength() < 2)
1471*e5dd7070Spatrick     return DiagnoseLexingError(SourceLocation());
1472*e5dd7070Spatrick 
1473*e5dd7070Spatrick   // Scan all of the string portions, remember the max individual token length,
1474*e5dd7070Spatrick   // computing a bound on the concatenated string length, and see whether any
1475*e5dd7070Spatrick   // piece is a wide-string.  If any of the string portions is a wide-string
1476*e5dd7070Spatrick   // literal, the result is a wide-string literal [C99 6.4.5p4].
  // (Both asserts below restate conditions already guaranteed by the early
  // return above.)
1477*e5dd7070Spatrick   assert(!StringToks.empty() && "expected at least one token");
1478*e5dd7070Spatrick   MaxTokenLength = StringToks[0].getLength();
1479*e5dd7070Spatrick   assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
1480*e5dd7070Spatrick   SizeBound = StringToks[0].getLength()-2;  // -2 for "".
1481*e5dd7070Spatrick   Kind = StringToks[0].getKind();
1482*e5dd7070Spatrick 
1483*e5dd7070Spatrick   hadError = false;
1484*e5dd7070Spatrick 
1485*e5dd7070Spatrick   // Implement Translation Phase #6: concatenation of string literals
1486*e5dd7070Spatrick   // (C99 5.1.1.2p1).  The common case is only one string fragment.
1487*e5dd7070Spatrick   for (unsigned i = 1; i != StringToks.size(); ++i) {
1488*e5dd7070Spatrick     if (StringToks[i].getLength() < 2)
1489*e5dd7070Spatrick       return DiagnoseLexingError(StringToks[i].getLocation());
1490*e5dd7070Spatrick 
1491*e5dd7070Spatrick     // The string could be shorter than this if it needs cleaning, but this is a
1492*e5dd7070Spatrick     // reasonable bound, which is all we need.
1493*e5dd7070Spatrick     assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
1494*e5dd7070Spatrick     SizeBound += StringToks[i].getLength()-2;  // -2 for "".
1495*e5dd7070Spatrick 
1496*e5dd7070Spatrick     // Remember maximum string piece length.
1497*e5dd7070Spatrick     if (StringToks[i].getLength() > MaxTokenLength)
1498*e5dd7070Spatrick       MaxTokenLength = StringToks[i].getLength();
1499*e5dd7070Spatrick 
1500*e5dd7070Spatrick     // Remember if we see any wide or utf-8/16/32 strings.
1501*e5dd7070Spatrick     // Also check for illegal concatenations.
    // A piece that matches the current Kind, or that is a plain
    // string_literal, changes nothing.  Otherwise the piece's kind is adopted
    // while isAscii() still holds; if it no longer does, the mix is an error.
1502*e5dd7070Spatrick     if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
1503*e5dd7070Spatrick       if (isAscii()) {
1504*e5dd7070Spatrick         Kind = StringToks[i].getKind();
1505*e5dd7070Spatrick       } else {
1506*e5dd7070Spatrick         if (Diags)
1507*e5dd7070Spatrick           Diags->Report(StringToks[i].getLocation(),
1508*e5dd7070Spatrick                         diag::err_unsupported_string_concat);
1509*e5dd7070Spatrick         hadError = true;
1510*e5dd7070Spatrick       }
1511*e5dd7070Spatrick     }
1512*e5dd7070Spatrick   }
1513*e5dd7070Spatrick 
1514*e5dd7070Spatrick   // Include space for the null terminator.
1515*e5dd7070Spatrick   ++SizeBound;
1516*e5dd7070Spatrick 
1517*e5dd7070Spatrick   // TODO: K&R warning: "traditional C rejects string constant concatenation"
1518*e5dd7070Spatrick 
1519*e5dd7070Spatrick   // Get the width in bytes of char/wchar_t/char16_t/char32_t
  // getCharWidth() is treated as a width in bits here: it is checked to be a
  // multiple of 8 and then divided by 8.
1520*e5dd7070Spatrick   CharByteWidth = getCharWidth(Kind, Target);
1521*e5dd7070Spatrick   assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
1522*e5dd7070Spatrick   CharByteWidth /= 8;
1523*e5dd7070Spatrick 
1524*e5dd7070Spatrick   // The output buffer size needs to be large enough to hold wide characters.
1525*e5dd7070Spatrick   // This is a worst-case assumption which basically corresponds to L"" "long".
1526*e5dd7070Spatrick   SizeBound *= CharByteWidth;
1527*e5dd7070Spatrick 
1528*e5dd7070Spatrick   // Size the temporary buffer to hold the result string data.
1529*e5dd7070Spatrick   ResultBuf.resize(SizeBound);
1530*e5dd7070Spatrick 
1531*e5dd7070Spatrick   // Likewise, but for each string piece.
1532*e5dd7070Spatrick   SmallString<512> TokenBuf;
1533*e5dd7070Spatrick   TokenBuf.resize(MaxTokenLength);
1534*e5dd7070Spatrick 
1535*e5dd7070Spatrick   // Loop over all the strings, getting their spelling, and expanding them to
1536*e5dd7070Spatrick   // wide strings as appropriate.
1537*e5dd7070Spatrick   ResultPtr = &ResultBuf[0];   // Next byte to fill in.
1538*e5dd7070Spatrick 
1539*e5dd7070Spatrick   Pascal = false;
1540*e5dd7070Spatrick 
  // Location of the first token that carried a ud-suffix; used as the
  // reference range when a later token's suffix differs.
1541*e5dd7070Spatrick   SourceLocation UDSuffixTokLoc;
1542*e5dd7070Spatrick 
1543*e5dd7070Spatrick   for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
1544*e5dd7070Spatrick     const char *ThisTokBuf = &TokenBuf[0];
1545*e5dd7070Spatrick     // Get the spelling of the token, which eliminates trigraphs, etc.  We know
1546*e5dd7070Spatrick     // that ThisTokBuf points to a buffer that is big enough for the whole token
1547*e5dd7070Spatrick     // and 'spelled' tokens can only shrink.
1548*e5dd7070Spatrick     bool StringInvalid = false;
1549*e5dd7070Spatrick     unsigned ThisTokLen =
1550*e5dd7070Spatrick       Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
1551*e5dd7070Spatrick                          &StringInvalid);
1552*e5dd7070Spatrick     if (StringInvalid)
1553*e5dd7070Spatrick       return DiagnoseLexingError(StringToks[i].getLocation());
1554*e5dd7070Spatrick 
    // ThisTokBegin stays at the start of the spelling (it is what the helpers
    // below are given as the token start); ThisTokBuf is the read cursor and
    // ThisTokEnd the moving end of the content still to be processed.
1555*e5dd7070Spatrick     const char *ThisTokBegin = ThisTokBuf;
1556*e5dd7070Spatrick     const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
1557*e5dd7070Spatrick 
1558*e5dd7070Spatrick     // Remove an optional ud-suffix.
    // NOTE(review): this backwards scan runs before the "changed after PCH"
    // sanity check further down and stops only at a '"'; it presumably relies
    // on the spelling containing one -- confirm.
1559*e5dd7070Spatrick     if (ThisTokEnd[-1] != '"') {
1560*e5dd7070Spatrick       const char *UDSuffixEnd = ThisTokEnd;
1561*e5dd7070Spatrick       do {
1562*e5dd7070Spatrick         --ThisTokEnd;
1563*e5dd7070Spatrick       } while (ThisTokEnd[-1] != '"');
1564*e5dd7070Spatrick 
1565*e5dd7070Spatrick       StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
1566*e5dd7070Spatrick 
      // First suffix seen: record it together with the index of its token and
      // its offset within that token's spelling.  Later suffixes are only
      // compared against the recorded one.
1567*e5dd7070Spatrick       if (UDSuffixBuf.empty()) {
1568*e5dd7070Spatrick         if (StringToks[i].hasUCN())
1569*e5dd7070Spatrick           expandUCNs(UDSuffixBuf, UDSuffix);
1570*e5dd7070Spatrick         else
1571*e5dd7070Spatrick           UDSuffixBuf.assign(UDSuffix);
1572*e5dd7070Spatrick         UDSuffixToken = i;
1573*e5dd7070Spatrick         UDSuffixOffset = ThisTokEnd - ThisTokBuf;
1574*e5dd7070Spatrick         UDSuffixTokLoc = StringToks[i].getLocation();
1575*e5dd7070Spatrick       } else {
1576*e5dd7070Spatrick         SmallString<32> ExpandedUDSuffix;
1577*e5dd7070Spatrick         if (StringToks[i].hasUCN()) {
1578*e5dd7070Spatrick           expandUCNs(ExpandedUDSuffix, UDSuffix);
1579*e5dd7070Spatrick           UDSuffix = ExpandedUDSuffix;
1580*e5dd7070Spatrick         }
1581*e5dd7070Spatrick 
1582*e5dd7070Spatrick         // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
1583*e5dd7070Spatrick         // result of a concatenation involving at least one user-defined-string-
1584*e5dd7070Spatrick         // literal, all the participating user-defined-string-literals shall
1585*e5dd7070Spatrick         // have the same ud-suffix.
1586*e5dd7070Spatrick         if (UDSuffixBuf != UDSuffix) {
1587*e5dd7070Spatrick           if (Diags) {
1588*e5dd7070Spatrick             SourceLocation TokLoc = StringToks[i].getLocation();
1589*e5dd7070Spatrick             Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
1590*e5dd7070Spatrick               << UDSuffixBuf << UDSuffix
1591*e5dd7070Spatrick               << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
1592*e5dd7070Spatrick               << SourceRange(TokLoc, TokLoc);
1593*e5dd7070Spatrick           }
1594*e5dd7070Spatrick           hadError = true;
1595*e5dd7070Spatrick         }
1596*e5dd7070Spatrick       }
1597*e5dd7070Spatrick     }
1598*e5dd7070Spatrick 
1599*e5dd7070Spatrick     // Strip the end quote.
1600*e5dd7070Spatrick     --ThisTokEnd;
1601*e5dd7070Spatrick 
1602*e5dd7070Spatrick     // TODO: Input character set mapping support.
1603*e5dd7070Spatrick 
1604*e5dd7070Spatrick     // Skip marker for wide or unicode strings.
1605*e5dd7070Spatrick     if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
1606*e5dd7070Spatrick       ++ThisTokBuf;
1607*e5dd7070Spatrick       // Skip 8 of u8 marker for utf8 strings.
1608*e5dd7070Spatrick       if (ThisTokBuf[0] == '8')
1609*e5dd7070Spatrick         ++ThisTokBuf;
1610*e5dd7070Spatrick     }
1611*e5dd7070Spatrick 
1612*e5dd7070Spatrick     // Check for raw string
1613*e5dd7070Spatrick     if (ThisTokBuf[0] == 'R') {
1614*e5dd7070Spatrick       ThisTokBuf += 2; // skip R"
1615*e5dd7070Spatrick 
      // Prefix marks the start of the delimiter.  After the scan and the
      // extra increment, ThisTokBuf - Prefix is the delimiter length plus one
      // for the '('.
1616*e5dd7070Spatrick       const char *Prefix = ThisTokBuf;
1617*e5dd7070Spatrick       while (ThisTokBuf[0] != '(')
1618*e5dd7070Spatrick         ++ThisTokBuf;
1619*e5dd7070Spatrick       ++ThisTokBuf; // skip '('
1620*e5dd7070Spatrick 
1621*e5dd7070Spatrick       // Remove same number of characters from the end
      // (the closing ')' followed by the same delimiter).
1622*e5dd7070Spatrick       ThisTokEnd -= ThisTokBuf - Prefix;
1623*e5dd7070Spatrick       assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");
1624*e5dd7070Spatrick 
1625*e5dd7070Spatrick       // C++14 [lex.string]p4: A source-file new-line in a raw string literal
1626*e5dd7070Spatrick       // results in a new-line in the resulting execution string-literal.
      // Each pass copies the text up to the next "\r\n" and then resumes at
      // the "\n", so the "\r" of every CRLF pair is dropped.  When no "\r\n"
      // remains the loop is expected to copy the rest and finish with an empty
      // span; that relies on StringRef::substr clamping an npos start
      // (TODO confirm).
1627*e5dd7070Spatrick       StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
1628*e5dd7070Spatrick       while (!RemainingTokenSpan.empty()) {
1629*e5dd7070Spatrick         // Split the string literal on \r\n boundaries.
1630*e5dd7070Spatrick         size_t CRLFPos = RemainingTokenSpan.find("\r\n");
1631*e5dd7070Spatrick         StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
1632*e5dd7070Spatrick         StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);
1633*e5dd7070Spatrick 
1634*e5dd7070Spatrick         // Copy everything before the \r\n sequence into the string literal.
1635*e5dd7070Spatrick         if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
1636*e5dd7070Spatrick           hadError = true;
1637*e5dd7070Spatrick 
1638*e5dd7070Spatrick         // Point into the \n inside the \r\n sequence and operate on the
1639*e5dd7070Spatrick         // remaining portion of the literal.
1640*e5dd7070Spatrick         RemainingTokenSpan = AfterCRLF.substr(1);
1641*e5dd7070Spatrick       }
1642*e5dd7070Spatrick     } else {
1643*e5dd7070Spatrick       if (ThisTokBuf[0] != '"') {
1644*e5dd7070Spatrick         // The file may have come from PCH and then changed after loading the
1645*e5dd7070Spatrick         // PCH; Fail gracefully.
1646*e5dd7070Spatrick         return DiagnoseLexingError(StringToks[i].getLocation());
1647*e5dd7070Spatrick       }
1648*e5dd7070Spatrick       ++ThisTokBuf; // skip "
1649*e5dd7070Spatrick 
1650*e5dd7070Spatrick       // Check if this is a pascal string
      // i.e. pascal strings are enabled, the content is not exactly one
      // character long, and it starts with "\p".
1651*e5dd7070Spatrick       if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
1652*e5dd7070Spatrick           ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
1653*e5dd7070Spatrick 
1654*e5dd7070Spatrick         // If the \p sequence is found in the first token, we have a pascal string
1655*e5dd7070Spatrick         // Otherwise, if we already have a pascal string, ignore the first \p
        // In the first token only the backslash is skipped: the 'p' is left
        // to be copied like any other character and occupies element 0, which
        // is overwritten with the length after the main loop.
1656*e5dd7070Spatrick         if (i == 0) {
1657*e5dd7070Spatrick           ++ThisTokBuf;
1658*e5dd7070Spatrick           Pascal = true;
1659*e5dd7070Spatrick         } else if (Pascal)
1660*e5dd7070Spatrick           ThisTokBuf += 2;
1661*e5dd7070Spatrick       }
1662*e5dd7070Spatrick 
1663*e5dd7070Spatrick       while (ThisTokBuf != ThisTokEnd) {
1664*e5dd7070Spatrick         // Is this a span of non-escape characters?
1665*e5dd7070Spatrick         if (ThisTokBuf[0] != '\\') {
1666*e5dd7070Spatrick           const char *InStart = ThisTokBuf;
1667*e5dd7070Spatrick           do {
1668*e5dd7070Spatrick             ++ThisTokBuf;
1669*e5dd7070Spatrick           } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
1670*e5dd7070Spatrick 
1671*e5dd7070Spatrick           // Copy the character span over.
1672*e5dd7070Spatrick           if (CopyStringFragment(StringToks[i], ThisTokBegin,
1673*e5dd7070Spatrick                                  StringRef(InStart, ThisTokBuf - InStart)))
1674*e5dd7070Spatrick             hadError = true;
1675*e5dd7070Spatrick           continue;
1676*e5dd7070Spatrick         }
1677*e5dd7070Spatrick         // Is this a Universal Character Name escape?
        // EncodeUCNEscape is handed both cursors (ThisTokBuf and ResultPtr)
        // and presumably advances them itself; nothing else on this path
        // does.
1678*e5dd7070Spatrick         if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
1679*e5dd7070Spatrick           EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
1680*e5dd7070Spatrick                           ResultPtr, hadError,
1681*e5dd7070Spatrick                           FullSourceLoc(StringToks[i].getLocation(), SM),
1682*e5dd7070Spatrick                           CharByteWidth, Diags, Features);
1683*e5dd7070Spatrick           continue;
1684*e5dd7070Spatrick         }
1685*e5dd7070Spatrick         // Otherwise, this is a non-UCN escape character.  Process it.
1686*e5dd7070Spatrick         unsigned ResultChar =
1687*e5dd7070Spatrick           ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
1688*e5dd7070Spatrick                             FullSourceLoc(StringToks[i].getLocation(), SM),
1689*e5dd7070Spatrick                             CharByteWidth*8, Diags, Features);
1690*e5dd7070Spatrick 
        // Store the escape's value as one code unit of CharByteWidth bytes
        // and advance the output cursor by that many bytes.
1691*e5dd7070Spatrick         if (CharByteWidth == 4) {
1692*e5dd7070Spatrick           // FIXME: Make the type of the result buffer correct instead of
1693*e5dd7070Spatrick           // using reinterpret_cast.
1694*e5dd7070Spatrick           llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultPtr);
1695*e5dd7070Spatrick           *ResultWidePtr = ResultChar;
1696*e5dd7070Spatrick           ResultPtr += 4;
1697*e5dd7070Spatrick         } else if (CharByteWidth == 2) {
1698*e5dd7070Spatrick           // FIXME: Make the type of the result buffer correct instead of
1699*e5dd7070Spatrick           // using reinterpret_cast.
1700*e5dd7070Spatrick           llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultPtr);
1701*e5dd7070Spatrick           *ResultWidePtr = ResultChar & 0xFFFF;
1702*e5dd7070Spatrick           ResultPtr += 2;
1703*e5dd7070Spatrick         } else {
1704*e5dd7070Spatrick           assert(CharByteWidth == 1 && "Unexpected char width");
1705*e5dd7070Spatrick           *ResultPtr++ = ResultChar & 0xFF;
1706*e5dd7070Spatrick         }
1707*e5dd7070Spatrick       }
1708*e5dd7070Spatrick     }
1709*e5dd7070Spatrick   }
1710*e5dd7070Spatrick 
  // For a pascal string, overwrite element 0 with GetNumStringChars() - 1,
  // written at the literal's code-unit width.
1711*e5dd7070Spatrick   if (Pascal) {
1712*e5dd7070Spatrick     if (CharByteWidth == 4) {
1713*e5dd7070Spatrick       // FIXME: Make the type of the result buffer correct instead of
1714*e5dd7070Spatrick       // using reinterpret_cast.
1715*e5dd7070Spatrick       llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultBuf.data());
1716*e5dd7070Spatrick       ResultWidePtr[0] = GetNumStringChars() - 1;
1717*e5dd7070Spatrick     } else if (CharByteWidth == 2) {
1718*e5dd7070Spatrick       // FIXME: Make the type of the result buffer correct instead of
1719*e5dd7070Spatrick       // using reinterpret_cast.
1720*e5dd7070Spatrick       llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultBuf.data());
1721*e5dd7070Spatrick       ResultWidePtr[0] = GetNumStringChars() - 1;
1722*e5dd7070Spatrick     } else {
1723*e5dd7070Spatrick       assert(CharByteWidth == 1 && "Unexpected char width");
1724*e5dd7070Spatrick       ResultBuf[0] = GetNumStringChars() - 1;
1725*e5dd7070Spatrick     }
1726*e5dd7070Spatrick 
1727*e5dd7070Spatrick     // Verify that pascal strings aren't too large.
1728*e5dd7070Spatrick     if (GetStringLength() > 256) {
1729*e5dd7070Spatrick       if (Diags)
1730*e5dd7070Spatrick         Diags->Report(StringToks.front().getLocation(),
1731*e5dd7070Spatrick                       diag::err_pascal_string_too_long)
1732*e5dd7070Spatrick           << SourceRange(StringToks.front().getLocation(),
1733*e5dd7070Spatrick                          StringToks.back().getLocation());
1734*e5dd7070Spatrick       hadError = true;
1735*e5dd7070Spatrick       return;
1736*e5dd7070Spatrick     }
1737*e5dd7070Spatrick   } else if (Diags) {
1738*e5dd7070Spatrick     // Complain if this string literal has too many characters.
    // Limit by language mode: 65536 for C++, 4095 for C99, 509 otherwise.
    // The 2/1/0 value streamed below follows the same C++/C99/other split
    // (presumably a selector in the diagnostic text).
1739*e5dd7070Spatrick     unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
1740*e5dd7070Spatrick 
1741*e5dd7070Spatrick     if (GetNumStringChars() > MaxChars)
1742*e5dd7070Spatrick       Diags->Report(StringToks.front().getLocation(),
1743*e5dd7070Spatrick                     diag::ext_string_too_long)
1744*e5dd7070Spatrick         << GetNumStringChars() << MaxChars
1745*e5dd7070Spatrick         << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
1746*e5dd7070Spatrick         << SourceRange(StringToks.front().getLocation(),
1747*e5dd7070Spatrick                        StringToks.back().getLocation());
1748*e5dd7070Spatrick   }
1749*e5dd7070Spatrick }
1750*e5dd7070Spatrick 
1751*e5dd7070Spatrick static const char *resyncUTF8(const char *Err, const char *End) {
1752*e5dd7070Spatrick   if (Err == End)
1753*e5dd7070Spatrick     return End;
1754*e5dd7070Spatrick   End = Err + std::min<unsigned>(llvm::getNumBytesForUTF8(*Err), End-Err);
1755*e5dd7070Spatrick   while (++Err != End && (*Err & 0xC0) == 0x80)
1756*e5dd7070Spatrick     ;
1757*e5dd7070Spatrick   return Err;
1758*e5dd7070Spatrick }
1759*e5dd7070Spatrick 
1760*e5dd7070Spatrick /// This function copies from Fragment, which is a sequence of bytes
1761*e5dd7070Spatrick /// within Tok's contents (which begin at TokBegin) into ResultPtr.
1762*e5dd7070Spatrick /// Performs widening for multi-byte characters.
1763*e5dd7070Spatrick bool StringLiteralParser::CopyStringFragment(const Token &Tok,
1764*e5dd7070Spatrick                                              const char *TokBegin,
1765*e5dd7070Spatrick                                              StringRef Fragment) {
1766*e5dd7070Spatrick   const llvm::UTF8 *ErrorPtrTmp;
1767*e5dd7070Spatrick   if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
1768*e5dd7070Spatrick     return false;
1769*e5dd7070Spatrick 
1770*e5dd7070Spatrick   // If we see bad encoding for unprefixed string literals, warn and
1771*e5dd7070Spatrick   // simply copy the byte values, for compatibility with gcc and older
1772*e5dd7070Spatrick   // versions of clang.
1773*e5dd7070Spatrick   bool NoErrorOnBadEncoding = isAscii();
1774*e5dd7070Spatrick   if (NoErrorOnBadEncoding) {
1775*e5dd7070Spatrick     memcpy(ResultPtr, Fragment.data(), Fragment.size());
1776*e5dd7070Spatrick     ResultPtr += Fragment.size();
1777*e5dd7070Spatrick   }
1778*e5dd7070Spatrick 
1779*e5dd7070Spatrick   if (Diags) {
1780*e5dd7070Spatrick     const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
1781*e5dd7070Spatrick 
1782*e5dd7070Spatrick     FullSourceLoc SourceLoc(Tok.getLocation(), SM);
1783*e5dd7070Spatrick     const DiagnosticBuilder &Builder =
1784*e5dd7070Spatrick       Diag(Diags, Features, SourceLoc, TokBegin,
1785*e5dd7070Spatrick            ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
1786*e5dd7070Spatrick            NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
1787*e5dd7070Spatrick                                 : diag::err_bad_string_encoding);
1788*e5dd7070Spatrick 
1789*e5dd7070Spatrick     const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
1790*e5dd7070Spatrick     StringRef NextFragment(NextStart, Fragment.end()-NextStart);
1791*e5dd7070Spatrick 
1792*e5dd7070Spatrick     // Decode into a dummy buffer.
1793*e5dd7070Spatrick     SmallString<512> Dummy;
1794*e5dd7070Spatrick     Dummy.reserve(Fragment.size() * CharByteWidth);
1795*e5dd7070Spatrick     char *Ptr = Dummy.data();
1796*e5dd7070Spatrick 
1797*e5dd7070Spatrick     while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
1798*e5dd7070Spatrick       const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
1799*e5dd7070Spatrick       NextStart = resyncUTF8(ErrorPtr, Fragment.end());
1800*e5dd7070Spatrick       Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
1801*e5dd7070Spatrick                                      ErrorPtr, NextStart);
1802*e5dd7070Spatrick       NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
1803*e5dd7070Spatrick     }
1804*e5dd7070Spatrick   }
1805*e5dd7070Spatrick   return !NoErrorOnBadEncoding;
1806*e5dd7070Spatrick }
1807*e5dd7070Spatrick 
1808*e5dd7070Spatrick void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
1809*e5dd7070Spatrick   hadError = true;
1810*e5dd7070Spatrick   if (Diags)
1811*e5dd7070Spatrick     Diags->Report(Loc, diag::err_lexing_string);
1812*e5dd7070Spatrick }
1813*e5dd7070Spatrick 
1814*e5dd7070Spatrick /// getOffsetOfStringByte - This function returns the offset of the
1815*e5dd7070Spatrick /// specified byte of the string data represented by Token.  This handles
1816*e5dd7070Spatrick /// advancing over escape sequences in the string.
1817*e5dd7070Spatrick unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
1818*e5dd7070Spatrick                                                     unsigned ByteNo) const {
1819*e5dd7070Spatrick   // Get the spelling of the token.
1820*e5dd7070Spatrick   SmallString<32> SpellingBuffer;
1821*e5dd7070Spatrick   SpellingBuffer.resize(Tok.getLength());
1822*e5dd7070Spatrick 
1823*e5dd7070Spatrick   bool StringInvalid = false;
1824*e5dd7070Spatrick   const char *SpellingPtr = &SpellingBuffer[0];
1825*e5dd7070Spatrick   unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
1826*e5dd7070Spatrick                                        &StringInvalid);
1827*e5dd7070Spatrick   if (StringInvalid)
1828*e5dd7070Spatrick     return 0;
1829*e5dd7070Spatrick 
1830*e5dd7070Spatrick   const char *SpellingStart = SpellingPtr;
1831*e5dd7070Spatrick   const char *SpellingEnd = SpellingPtr+TokLen;
1832*e5dd7070Spatrick 
1833*e5dd7070Spatrick   // Handle UTF-8 strings just like narrow strings.
1834*e5dd7070Spatrick   if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
1835*e5dd7070Spatrick     SpellingPtr += 2;
1836*e5dd7070Spatrick 
1837*e5dd7070Spatrick   assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
1838*e5dd7070Spatrick          SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
1839*e5dd7070Spatrick 
1840*e5dd7070Spatrick   // For raw string literals, this is easy.
1841*e5dd7070Spatrick   if (SpellingPtr[0] == 'R') {
1842*e5dd7070Spatrick     assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
1843*e5dd7070Spatrick     // Skip 'R"'.
1844*e5dd7070Spatrick     SpellingPtr += 2;
1845*e5dd7070Spatrick     while (*SpellingPtr != '(') {
1846*e5dd7070Spatrick       ++SpellingPtr;
1847*e5dd7070Spatrick       assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
1848*e5dd7070Spatrick     }
1849*e5dd7070Spatrick     // Skip '('.
1850*e5dd7070Spatrick     ++SpellingPtr;
1851*e5dd7070Spatrick     return SpellingPtr - SpellingStart + ByteNo;
1852*e5dd7070Spatrick   }
1853*e5dd7070Spatrick 
1854*e5dd7070Spatrick   // Skip over the leading quote
1855*e5dd7070Spatrick   assert(SpellingPtr[0] == '"' && "Should be a string literal!");
1856*e5dd7070Spatrick   ++SpellingPtr;
1857*e5dd7070Spatrick 
1858*e5dd7070Spatrick   // Skip over bytes until we find the offset we're looking for.
1859*e5dd7070Spatrick   while (ByteNo) {
1860*e5dd7070Spatrick     assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
1861*e5dd7070Spatrick 
1862*e5dd7070Spatrick     // Step over non-escapes simply.
1863*e5dd7070Spatrick     if (*SpellingPtr != '\\') {
1864*e5dd7070Spatrick       ++SpellingPtr;
1865*e5dd7070Spatrick       --ByteNo;
1866*e5dd7070Spatrick       continue;
1867*e5dd7070Spatrick     }
1868*e5dd7070Spatrick 
1869*e5dd7070Spatrick     // Otherwise, this is an escape character.  Advance over it.
1870*e5dd7070Spatrick     bool HadError = false;
1871*e5dd7070Spatrick     if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U') {
1872*e5dd7070Spatrick       const char *EscapePtr = SpellingPtr;
1873*e5dd7070Spatrick       unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
1874*e5dd7070Spatrick                                       1, Features, HadError);
1875*e5dd7070Spatrick       if (Len > ByteNo) {
1876*e5dd7070Spatrick         // ByteNo is somewhere within the escape sequence.
1877*e5dd7070Spatrick         SpellingPtr = EscapePtr;
1878*e5dd7070Spatrick         break;
1879*e5dd7070Spatrick       }
1880*e5dd7070Spatrick       ByteNo -= Len;
1881*e5dd7070Spatrick     } else {
1882*e5dd7070Spatrick       ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
1883*e5dd7070Spatrick                         FullSourceLoc(Tok.getLocation(), SM),
1884*e5dd7070Spatrick                         CharByteWidth*8, Diags, Features);
1885*e5dd7070Spatrick       --ByteNo;
1886*e5dd7070Spatrick     }
1887*e5dd7070Spatrick     assert(!HadError && "This method isn't valid on erroneous strings");
1888*e5dd7070Spatrick   }
1889*e5dd7070Spatrick 
1890*e5dd7070Spatrick   return SpellingPtr-SpellingStart;
1891*e5dd7070Spatrick }
1892*e5dd7070Spatrick 
1893*e5dd7070Spatrick /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
1894*e5dd7070Spatrick /// suffixes as ud-suffixes, because the diagnostic experience is better if we
1895*e5dd7070Spatrick /// treat it as an invalid suffix.
1896*e5dd7070Spatrick bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
1897*e5dd7070Spatrick                                           StringRef Suffix) {
1898*e5dd7070Spatrick   return NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix) ||
1899*e5dd7070Spatrick          Suffix == "sv";
1900*e5dd7070Spatrick }
1901