//===-- include/flang/Parser/characters.h -----------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef FORTRAN_PARSER_CHARACTERS_H_
#define FORTRAN_PARSER_CHARACTERS_H_

// Define some character classification predicates and
// conversions here to avoid dependences upon <cctype> and
// also to accommodate Fortran tokenization.

#include <cstddef>
#include <cstdint>
#include <optional>
#include <string>
#include <string_view>

namespace Fortran::parser {

// Selects the form of numeric escape that EmitQuotedChar produces:
// hexadecimal (\xNN, \uNNNN) when true, three-digit octal otherwise.
// Declared here; the definition lives in another translation unit.
extern bool useHexadecimalEscapeSequences;

// We can easily support Fortran program source in any character
// set whose first 128 code points correspond to ASCII codes 0-127 (ISO/IEC646).
// The specific encodings that we can handle include:
//   LATIN_1: ISO 8859-1 Latin-1
//   UTF_8: Multi-byte encoding of Unicode (ISO/IEC 10646)
enum class Encoding { LATIN_1, UTF_8 };

// True for the ASCII letters 'A' through 'Z'.
inline constexpr bool IsUpperCaseLetter(char ch) {
  return ch >= 'A' && ch <= 'Z';
}

// True for the ASCII letters 'a' through 'z'.
inline constexpr bool IsLowerCaseLetter(char ch) {
  return ch >= 'a' && ch <= 'z';
}

// True for an ASCII letter of either case.
inline constexpr bool IsLetter(char ch) {
  return IsUpperCaseLetter(ch) || IsLowerCaseLetter(ch);
}

// True for '0' through '9'.
inline constexpr bool IsDecimalDigit(char ch) { return ch >= '0' && ch <= '9'; }

// True for '0'-'9', 'A'-'F', and 'a'-'f'.
inline constexpr bool IsHexadecimalDigit(char ch) {
  return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') ||
      (ch >= 'a' && ch <= 'f');
}

// True for '0' through '7'.
inline constexpr bool IsOctalDigit(char ch) { return ch >= '0' && ch <= '7'; }

// True for characters accepted at the start of an identifier: a letter,
// or one of '_', '@', '$'.
inline constexpr bool IsLegalIdentifierStart(char ch) {
  return IsLetter(ch) || ch == '_' || ch == '@' || ch == '$';
}

// True for characters accepted anywhere in an identifier: anything that
// may start one, plus the decimal digits.
inline constexpr bool IsLegalInIdentifier(char ch) {
  return IsLegalIdentifierStart(ch) || IsDecimalDigit(ch);
}

// True for the characters from ' ' through '~', inclusive.
inline constexpr bool IsPrintable(char ch) { return ch >= ' ' && ch <= '~'; }

// True for space, horizontal tab, newline, vertical tab, form feed, and
// carriage return.
inline constexpr bool IsWhiteSpace(char ch) {
  return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\v' || ch == '\f' ||
      ch == '\r';
}

// Maps 'A'-'Z' to 'a'-'z'; every other character is returned unchanged.
inline constexpr char ToLowerCaseLetter(char ch) {
  return IsUpperCaseLetter(ch) ? ch - 'A' + 'a' : ch;
}

// Rvalue-reference overload with the same result as the by-value form.
// NOTE(review): a call whose argument is an rvalue (e.g. a character
// literal) looks ambiguous between this overload and the by-value one;
// confirm whether this overload is still needed.
inline constexpr char ToLowerCaseLetter(char &&ch) {
  return IsUpperCaseLetter(ch) ? ch - 'A' + 'a' : ch;
}

// Returns a copy of str with each character passed through
// ToLowerCaseLetter.
inline std::string ToLowerCaseLetters(std::string_view str) {
  std::string lowered{str};
  for (char &ch : lowered) {
    ch = ToLowerCaseLetter(ch);
  }
  return lowered;
}

// Maps 'a'-'z' to 'A'-'Z'; every other character is returned unchanged.
inline constexpr char ToUpperCaseLetter(char ch) {
  return IsLowerCaseLetter(ch) ? ch - 'a' + 'A' : ch;
}

// Rvalue-reference overload with the same result as the by-value form.
// NOTE(review): a call whose argument is an rvalue (e.g. a character
// literal) looks ambiguous between this overload and the by-value one;
// confirm whether this overload is still needed.
inline constexpr char ToUpperCaseLetter(char &&ch) {
  return IsLowerCaseLetter(ch) ? ch - 'a' + 'A' : ch;
}

// Returns a copy of str with each character passed through
// ToUpperCaseLetter.
inline std::string ToUpperCaseLetters(std::string_view str) {
  std::string raised{str};
  for (char &ch : raised) {
    ch = ToUpperCaseLetter(ch);
  }
  return raised;
}

// Compares two characters after folding both to lower case.
inline constexpr bool IsSameApartFromCase(char x, char y) {
  return ToLowerCaseLetter(x) == ToLowerCaseLetter(y);
}

// Numeric value of a decimal digit character.  The argument is not
// validated; the result is simply ch - '0'.
inline constexpr char DecimalDigitValue(char ch) { return ch - '0'; }

// Numeric value of a hexadecimal digit character.  The argument is not
// validated: any upper or lower case letter is treated as a digit counted
// from 'A'/'a' == 10, and anything else goes through DecimalDigitValue.
inline constexpr char HexadecimalDigitValue(char ch) {
  return IsUpperCaseLetter(ch) ? ch - 'A' + 10
      : IsLowerCaseLetter(ch)  ? ch - 'a' + 10
                               : DecimalDigitValue(ch);
}

// Given the character that follows a backslash, returns the character
// that the escape sequence denotes, or std::nullopt when the character
// is not one of the recognized single-character escapes.
inline constexpr std::optional<char> BackslashEscapeValue(char ch) {
  switch (ch) {
  case 'a':
    return std::nullopt; // '\a'; PGF90 doesn't know \a
  case 'b':
    return '\b';
  case 'f':
    return '\f';
  case 'n':
    return '\n';
  case 'r':
    return '\r';
  case 't':
    return '\t';
  case 'v':
    return '\v';
  case '"':
  case '\'':
  case '\\':
    return ch;
  default:
    return std::nullopt;
  }
}

// Inverse of BackslashEscapeValue: given a character value, returns the
// character to write after a backslash to denote it, or std::nullopt when
// there is no single-character escape for it.
inline constexpr std::optional<char> BackslashEscapeChar(char ch) {
  switch (ch) {
  case '\a':
    return std::nullopt; // 'a'; PGF90 doesn't know \a
  case '\b':
    return 'b';
  case '\f':
    return 'f';
  case '\n':
    return 'n';
  case '\r':
    return 'r';
  case '\t':
    return 't';
  case '\v':
    return 'v';
  case '"':
  case '\'':
  case '\\':
    return ch;
  default:
    return std::nullopt;
  }
}

// Does not include spaces or line ending characters.
// Accepts the punctuation listed below plus every character that is
// legal in an identifier.
inline constexpr bool IsValidFortranTokenCharacter(char ch) {
  switch (ch) {
  case '"':
  case '%':
  case '\'':
  case '(':
  case ')':
  case '*':
  case '+':
  case ',':
  case '-':
  case '.':
  case '/':
  case ':':
  case ';':
  case '<':
  case '=':
  case '>':
  case '[':
  case ']':
  case '{': // Used in OpenMP context selector specification
  case '}': //
    return true;
  default:
    return IsLegalIdentifierStart(ch) || IsDecimalDigit(ch);
  }
}

// The bytes of one encoded character.  Only the first "bytes" entries of
// "buffer" are meaningful.
struct EncodedCharacter {
  static constexpr int maxEncodingBytes{6};
  // Value-initialized so that a default-constructed object never carries
  // indeterminate bytes when it is copied.
  char buffer[maxEncodingBytes]{};
  int bytes{0};
};

// Encodes one code point in the encoding selected at compile time.
// Only declared here; the specializations are defined elsewhere.
template <Encoding ENCODING> EncodedCharacter EncodeCharacter(char32_t ucs);
template <> EncodedCharacter EncodeCharacter<Encoding::LATIN_1>(char32_t);
template <> EncodedCharacter EncodeCharacter<Encoding::UTF_8>(char32_t);

// Same operation with the encoding chosen at run time.
EncodedCharacter EncodeCharacter(Encoding, char32_t ucs);

// Encodes a whole string; explicit instantiations are provided elsewhere
// for the two combinations named below.
template <Encoding ENCODING, typename STRING>
std::string EncodeString(const STRING &);
extern template std::string EncodeString<Encoding::LATIN_1, std::string>(
    const std::string &);
extern template std::string EncodeString<Encoding::UTF_8, std::u32string>(
    const std::u32string &);

// EmitQuotedChar drives callbacks "emit" and "insert" to output the
// bytes of an encoding for a codepoint.
//   ch                the code point to output
//   emit              called with bytes that are passed through, and with
//                     the letter of a single-character backslash escape
//   insert            called with the backslash and with the characters
//                     of numeric escape sequences
//   backslashEscapes  when true, control characters, bytes >= 0x7f, and
//                     backslash itself are written as escape sequences
//   encoding          used to encode code points above 0x7f that are not
//                     written as a \u escape
template <typename NORMAL, typename INSERTED>
void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert,
    bool backslashEscapes = true, Encoding encoding = Encoding::UTF_8) {
  auto emitOneByte{[&](std::uint8_t byte) {
    if (backslashEscapes && (byte < ' ' || byte >= 0x7f || byte == '\\')) {
      if (std::optional<char> escape{BackslashEscapeChar(byte)}) {
        insert('\\');
        emit(*escape);
      } else if (useHexadecimalEscapeSequences) {
        insert('\\');
        insert('x');
        int top{byte >> 4}, bottom{byte & 0xf};
        insert(top > 9 ? 'a' + top - 10 : '0' + top);
        insert(bottom > 9 ? 'a' + bottom - 10 : '0' + bottom);
      } else {
        // octal escape sequence; always emit 3 digits to avoid ambiguity
        insert('\\');
        insert('0' + (byte >> 6));
        insert('0' + ((byte >> 3) & 7));
        insert('0' + (byte & 7));
      }
    } else if (byte == '\n') { // always escape newlines
      // Only reachable when backslashEscapes is false; otherwise the
      // newline was already handled by the branch above.
      insert('\\');
      insert('n');
    } else {
      emit(byte);
    }
  }};
  if (ch <= 0x7f) {
    emitOneByte(ch);
  } else if (backslashEscapes && useHexadecimalEscapeSequences) {
    insert('\\');
    insert('u');
    // Eight hexadecimal digits for code points above 0xffff, four
    // otherwise, most significant nibble first.
    for (int shift{ch > 0xffff ? 28 : 12}; shift >= 0; shift -= 4) {
      unsigned nibble{(ch >> shift) & 0xf};
      insert(nibble > 9 ? 'a' + nibble - 10 : '0' + nibble);
    }
  } else {
    // Encode the code point and treat each resulting byte individually.
    EncodedCharacter encoded{EncodeCharacter(encoding, ch)};
    for (int j{0}; j < encoded.bytes; ++j) {
      emitOneByte(encoded.buffer[j]);
    }
  }
}

// Declared here, defined elsewhere.  Note the differing default
// encodings: LATIN_1 for std::string, UTF_8 for the wider string types.
std::string QuoteCharacterLiteral(const std::string &,
    bool backslashEscapes = true, Encoding = Encoding::LATIN_1);
std::string QuoteCharacterLiteral(const std::u16string &,
    bool backslashEscapes = true, Encoding = Encoding::UTF_8);
std::string QuoteCharacterLiteral(const std::u32string &,
    bool backslashEscapes = true, Encoding = Encoding::UTF_8);

// Declared here, defined elsewhere.
// NOTE(review): presumably the byte length of the UTF-8 sequence that
// begins at the given pointer -- confirm against the definition.
int UTF_8CharacterBytes(const char *);

// Result of decoding one character from a byte sequence.
struct DecodedCharacter {
  char32_t codepoint{0};
  int bytes{0}; // signifying failure
};

// Decodes one character without any backslash escape processing.
// Only declared here; the specializations are defined elsewhere.
template <Encoding ENCODING>
DecodedCharacter DecodeRawCharacter(const char *, std::size_t);
template <>
DecodedCharacter DecodeRawCharacter<Encoding::LATIN_1>(
    const char *, std::size_t);

template <>
DecodedCharacter DecodeRawCharacter<Encoding::UTF_8>(const char *, std::size_t);

// DecodeCharacter optionally handles backslash escape sequences, too.
template <Encoding ENCODING>
DecodedCharacter DecodeCharacter(
    const char *, std::size_t, bool backslashEscapes);
extern template DecodedCharacter DecodeCharacter<Encoding::LATIN_1>(
    const char *, std::size_t, bool);
extern template DecodedCharacter DecodeCharacter<Encoding::UTF_8>(
    const char *, std::size_t, bool);

// Same operation with the encoding chosen at run time.
DecodedCharacter DecodeCharacter(
    Encoding, const char *, std::size_t, bool backslashEscapes);

// Decodes a whole string into RESULT; explicit instantiations are
// provided elsewhere for the three combinations named below.
template <typename RESULT, Encoding ENCODING>
RESULT DecodeString(const std::string &, bool backslashEscapes);
extern template std::string DecodeString<std::string, Encoding::LATIN_1>(
    const std::string &, bool);
extern template std::u16string DecodeString<std::u16string, Encoding::UTF_8>(
    const std::string &, bool);
extern template std::u32string DecodeString<std::u32string, Encoding::UTF_8>(
    const std::string &, bool);
} // namespace Fortran::parser
#endif // FORTRAN_PARSER_CHARACTERS_H_