xref: /llvm-project/flang/include/flang/Parser/characters.h (revision 70e96dc3fb895e95dc659f87c2ed188507831801)
1 //===-- include/flang/Parser/characters.h -----------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #ifndef FORTRAN_PARSER_CHARACTERS_H_
10 #define FORTRAN_PARSER_CHARACTERS_H_
11 
12 // Define some character classification predicates and
13 // conversions here to avoid dependences upon <cctype> and
// also to accommodate Fortran tokenization.
15 
16 #include <cstddef>
17 #include <cstdint>
18 #include <optional>
19 #include <string>
20 
21 namespace Fortran::parser {
22 
// Selects the notation that EmitQuotedChar() uses for escape sequences
// that have no single-letter form: hexadecimal (\xNN, \u...) when true,
// three-digit octal when false.  Declared here; defined elsewhere.
extern bool useHexadecimalEscapeSequences;
24 
// Fortran program source can be supported in any character set whose
// first 128 code points coincide with ASCII codes 0-127 (ISO/IEC 646).
// The specific encodings that are handled:
enum class Encoding {
  LATIN_1, // ISO 8859-1 Latin-1
  UTF_8 // Multi-byte encoding of Unicode (ISO/IEC 10646)
};
31 
// True only for the capital letters 'A' through 'Z'.
inline constexpr bool IsUpperCaseLetter(char ch) {
  return 'A' <= ch && ch <= 'Z';
}
35 
// True only for the small letters 'a' through 'z'.
inline constexpr bool IsLowerCaseLetter(char ch) {
  return !(ch < 'a' || ch > 'z');
}
39 
40 inline constexpr bool IsLetter(char ch) {
41   return IsUpperCaseLetter(ch) || IsLowerCaseLetter(ch);
42 }
43 
// True for the digits '0' through '9'.
inline constexpr bool IsDecimalDigit(char ch) {
  return '0' <= ch && ch <= '9';
}
45 
// True for '0'-'9' and for the letters 'A'-'F' in either case.
inline constexpr bool IsHexadecimalDigit(char ch) {
  if ('0' <= ch && ch <= '9') {
    return true;
  }
  if ('A' <= ch && ch <= 'F') {
    return true;
  }
  return 'a' <= ch && ch <= 'f';
}
50 
// True for the digits '0' through '7'.
inline constexpr bool IsOctalDigit(char ch) {
  return '0' <= ch && ch <= '7';
}
52 
53 inline constexpr bool IsLegalIdentifierStart(char ch) {
54   return IsLetter(ch) || ch == '_' || ch == '@' || ch == '$';
55 }
56 
57 inline constexpr bool IsLegalInIdentifier(char ch) {
58   return IsLegalIdentifierStart(ch) || IsDecimalDigit(ch);
59 }
60 
// True for characters from space (' ') through tilde ('~'), inclusive.
inline constexpr bool IsPrintable(char ch) {
  return !(ch < ' ' || ch > '~');
}
62 
// True for space, horizontal and vertical tab, newline, form feed, and
// carriage return.
inline constexpr bool IsWhiteSpace(char ch) {
  switch (ch) {
  case ' ':
  case '\t':
  case '\n':
  case '\v':
  case '\f':
  case '\r':
    return true;
  default:
    return false;
  }
}
67 
68 inline constexpr char ToLowerCaseLetter(char ch) {
69   return IsUpperCaseLetter(ch) ? ch - 'A' + 'a' : ch;
70 }
71 
72 inline constexpr char ToLowerCaseLetter(char &&ch) {
73   return IsUpperCaseLetter(ch) ? ch - 'A' + 'a' : ch;
74 }
75 
76 inline std::string ToLowerCaseLetters(std::string_view str) {
77   std::string lowered{str};
78   for (char &ch : lowered) {
79     ch = ToLowerCaseLetter(ch);
80   }
81   return lowered;
82 }
83 
84 inline constexpr char ToUpperCaseLetter(char ch) {
85   return IsLowerCaseLetter(ch) ? ch - 'a' + 'A' : ch;
86 }
87 
88 inline constexpr char ToUpperCaseLetter(char &&ch) {
89   return IsLowerCaseLetter(ch) ? ch - 'a' + 'A' : ch;
90 }
91 
92 inline std::string ToUpperCaseLetters(std::string_view str) {
93   std::string raised{str};
94   for (char &ch : raised) {
95     ch = ToUpperCaseLetter(ch);
96   }
97   return raised;
98 }
99 
100 inline constexpr bool IsSameApartFromCase(char x, char y) {
101   return ToLowerCaseLetter(x) == ToLowerCaseLetter(y);
102 }
103 
// Returns the distance of ch from '0'.  The argument is not checked to
// be a digit.
inline constexpr char DecimalDigitValue(char ch) {
  return static_cast<char>(ch - '0');
}
105 
106 inline constexpr char HexadecimalDigitValue(char ch) {
107   return IsUpperCaseLetter(ch) ? ch - 'A' + 10
108       : IsLowerCaseLetter(ch)  ? ch - 'a' + 10
109                                : DecimalDigitValue(ch);
110 }
111 
// Given the character that follows a backslash, returns the character
// that the escape sequence denotes, or nothing when the sequence is not
// recognized here.
inline constexpr std::optional<char> BackslashEscapeValue(char ch) {
  // The quotation marks and the backslash stand for themselves.
  if (ch == '"' || ch == '\'' || ch == '\\') {
    return ch;
  }
  // Escape letters and the control characters that they denote, in
  // matching order.  There is no entry for 'a': PGF90 doesn't know \a.
  constexpr char letters[]{'b', 'f', 'n', 'r', 't', 'v'};
  constexpr char controls[]{'\b', '\f', '\n', '\r', '\t', '\v'};
  for (std::size_t j{0}; j < sizeof letters; ++j) {
    if (ch == letters[j]) {
      return controls[j];
    }
  }
  return std::nullopt;
}
136 
// Given a character value, returns the character to write after a
// backslash in order to denote it, or nothing when the value has no
// single-character escape here.
inline constexpr std::optional<char> BackslashEscapeChar(char ch) {
  // The quotation marks and the backslash stand for themselves.
  if (ch == '"' || ch == '\'' || ch == '\\') {
    return ch;
  }
  // Control characters and the escape letters that denote them, in
  // matching order.  There is no entry for '\a': PGF90 doesn't know \a.
  constexpr char controls[]{'\b', '\f', '\n', '\r', '\t', '\v'};
  constexpr char letters[]{'b', 'f', 'n', 'r', 't', 'v'};
  for (std::size_t j{0}; j < sizeof controls; ++j) {
    if (ch == controls[j]) {
      return letters[j];
    }
  }
  return std::nullopt;
}
161 
162 // Does not include spaces or line ending characters.
163 inline constexpr bool IsValidFortranTokenCharacter(char ch) {
164   switch (ch) {
165   case '"':
166   case '%':
167   case '\'':
168   case '(':
169   case ')':
170   case '*':
171   case '+':
172   case ',':
173   case '-':
174   case '.':
175   case '/':
176   case ':':
177   case ';':
178   case '<':
179   case '=':
180   case '>':
181   case '[':
182   case ']':
183   case '{': // Used in OpenMP context selector specification
184   case '}': //
185     return true;
186   default:
187     return IsLegalIdentifierStart(ch) || IsDecimalDigit(ch);
188   }
189 }
190 
// The bytes of one encoded character: the first "bytes" elements of
// "buffer" are meaningful.
struct EncodedCharacter {
  static constexpr int maxEncodingBytes = 6;
  char buffer[maxEncodingBytes];
  int bytes = 0;
};
196 
// EncodeCharacter is only declared in this header.  The template form
// takes the encoding as a compile-time argument and is explicitly
// specialized for both Encoding values; the overload after it takes the
// encoding as a run-time argument.
// EmitQuotedChar() in this header reads the first "bytes" elements of the
// result's "buffer" as the encoded form of the code point.
template <Encoding ENCODING> EncodedCharacter EncodeCharacter(char32_t ucs);
template <> EncodedCharacter EncodeCharacter<Encoding::LATIN_1>(char32_t);
template <> EncodedCharacter EncodeCharacter<Encoding::UTF_8>(char32_t);

EncodedCharacter EncodeCharacter(Encoding, char32_t ucs);
202 
// EncodeString is only declared in this header.  The "extern template"
// lines name the two instantiations that are provided elsewhere:
// LATIN_1 with std::string, and UTF_8 with std::u32string.
// NOTE(review): presumably encodes each element of the argument into a
// byte string -- confirm against the definition.
template <Encoding ENCODING, typename STRING>
std::string EncodeString(const STRING &);
extern template std::string EncodeString<Encoding::LATIN_1, std::string>(
    const std::string &);
extern template std::string EncodeString<Encoding::UTF_8, std::u32string>(
    const std::u32string &);
209 
210 // EmitQuotedChar drives callbacks "emit" and "insert" to output the
211 // bytes of an encoding for a codepoint.
212 template <typename NORMAL, typename INSERTED>
213 void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert,
214     bool backslashEscapes = true, Encoding encoding = Encoding::UTF_8) {
215   auto emitOneByte{[&](std::uint8_t ch) {
216     if (backslashEscapes && (ch < ' ' || ch >= 0x7f || ch == '\\')) {
217       if (std::optional<char> escape{BackslashEscapeChar(ch)}) {
218         insert('\\');
219         emit(*escape);
220       } else if (useHexadecimalEscapeSequences) {
221         insert('\\');
222         insert('x');
223         int top{ch >> 4}, bottom{ch & 0xf};
224         insert(top > 9 ? 'a' + top - 10 : '0' + top);
225         insert(bottom > 9 ? 'a' + bottom - 10 : '0' + bottom);
226       } else {
227         // octal escape sequence; always emit 3 digits to avoid ambiguity
228         insert('\\');
229         insert('0' + (ch >> 6));
230         insert('0' + ((ch >> 3) & 7));
231         insert('0' + (ch & 7));
232       }
233     } else if (ch == '\n') { // always escape newlines
234       insert('\\');
235       insert('n');
236     } else {
237       emit(ch);
238     }
239   }};
240   if (ch <= 0x7f) {
241     emitOneByte(ch);
242   } else if (backslashEscapes && useHexadecimalEscapeSequences) {
243     insert('\\');
244     insert('u');
245     if (ch > 0xffff) {
246       unsigned c1{(ch >> 28) & 0xf}, c2{(ch >> 24) & 0xf}, c3{(ch >> 20) & 0xf},
247           c4{(ch >> 16) & 0xf};
248       insert(c1 > 9 ? 'a' + c1 - 10 : '0' + c1);
249       insert(c2 > 9 ? 'a' + c2 - 10 : '0' + c2);
250       insert(c3 > 9 ? 'a' + c3 - 10 : '0' + c3);
251       insert(c4 > 9 ? 'a' + c4 - 10 : '0' + c4);
252     }
253     unsigned c1{(ch >> 12) & 0xf}, c2{(ch >> 8) & 0xf}, c3{(ch >> 4) & 0xf},
254         c4{ch & 0xf};
255     insert(c1 > 9 ? 'a' + c1 - 10 : '0' + c1);
256     insert(c2 > 9 ? 'a' + c2 - 10 : '0' + c2);
257     insert(c3 > 9 ? 'a' + c3 - 10 : '0' + c3);
258     insert(c4 > 9 ? 'a' + c4 - 10 : '0' + c4);
259   } else {
260     EncodedCharacter encoded{EncodeCharacter(encoding, ch)};
261     for (int j{0}; j < encoded.bytes; ++j) {
262       emitOneByte(encoded.buffer[j]);
263     }
264   }
265 }
266 
// QuoteCharacterLiteral is only declared in this header, with overloads
// for std::string, std::u16string, and std::u32string.  Note that the
// default encoding is LATIN_1 for std::string but UTF_8 for the others.
// NOTE(review): presumably returns the argument in the form of a quoted
// character literal -- confirm against the definitions.
std::string QuoteCharacterLiteral(const std::string &,
    bool backslashEscapes = true, Encoding = Encoding::LATIN_1);
std::string QuoteCharacterLiteral(const std::u16string &,
    bool backslashEscapes = true, Encoding = Encoding::UTF_8);
std::string QuoteCharacterLiteral(const std::u32string &,
    bool backslashEscapes = true, Encoding = Encoding::UTF_8);
273 
// Declared here; defined elsewhere.
// NOTE(review): presumably the length in bytes of the UTF-8 sequence that
// begins at the given pointer -- confirm against the definition.
int UTF_8CharacterBytes(const char *);
275 
// Result type of the decoding functions declared in this header.
struct DecodedCharacter {
  char32_t codepoint = 0;
  int bytes = 0; // zero signifies failure
};
280 
// DecodeRawCharacter is only declared in this header; it is explicitly
// specialized for both Encoding values.  A result whose "bytes" member
// is zero signifies failure (see DecodedCharacter).
// NOTE(review): the arguments are presumably a pointer to the bytes to
// decode and the number of bytes available -- confirm against the
// definitions.
template <Encoding ENCODING>
DecodedCharacter DecodeRawCharacter(const char *, std::size_t);
template <>
DecodedCharacter DecodeRawCharacter<Encoding::LATIN_1>(
    const char *, std::size_t);

template <>
DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>(const char *, std::size_t);
289 
// DecodeCharacter optionally handles backslash escape sequences, too.
// It is only declared in this header.  The template form takes the
// encoding as a compile-time argument, with instantiations for both
// Encoding values provided elsewhere (extern template); the overload
// after it takes the encoding as a run-time argument.
template <Encoding ENCODING>
DecodedCharacter DecodeCharacter(
    const char *, std::size_t, bool backslashEscapes);
extern template DecodedCharacter DecodeCharacter<Encoding::LATIN_1>(
    const char *, std::size_t, bool);
extern template DecodedCharacter DecodeCharacter<Encoding::UTF_8>(
    const char *, std::size_t, bool);

DecodedCharacter DecodeCharacter(
    Encoding, const char *, std::size_t, bool backslashEscapes);
301 
// DecodeString is only declared in this header.  The "extern template"
// lines name the three instantiations that are provided elsewhere:
// std::string from LATIN_1, and std::u16string and std::u32string from
// UTF_8.
// NOTE(review): presumably decodes the whole argument, honoring backslash
// escape sequences when backslashEscapes is true -- confirm against the
// definition.
template <typename RESULT, Encoding ENCODING>
RESULT DecodeString(const std::string &, bool backslashEscapes);
extern template std::string DecodeString<std::string, Encoding::LATIN_1>(
    const std::string &, bool);
extern template std::u16string DecodeString<std::u16string, Encoding::UTF_8>(
    const std::string &, bool);
extern template std::u32string DecodeString<std::u32string, Encoding::UTF_8>(
    const std::string &, bool);
310 } // namespace Fortran::parser
311 #endif // FORTRAN_PARSER_CHARACTERS_H_
312