1 //===-- runtime/utf.cpp ---------------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "utf.h" 10 11 namespace Fortran::runtime { 12 13 #ifndef FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS 14 // clang-format off 15 RT_OFFLOAD_VAR_GROUP_BEGIN 16 const RT_CONST_VAR_ATTRS std::uint8_t UTF8FirstByteTable[256]{ 17 /* 00 - 7F: 7 bit payload in single byte */ 18 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 19 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 20 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 21 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 22 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 23 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 24 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 25 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 26 /* 80 - BF: invalid first byte, valid later byte */ 27 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 29 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 30 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 31 /* C0 - DF: 11 bit payload */ 32 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 33 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 34 /* E0 - EF: 16 bit payload */ 35 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 36 /* F0 - F7: 21 bit payload */ 4, 4, 4, 4, 4, 4, 4, 4, 37 /* F8 - FB: 26 bit payload */ 5, 5, 5, 5, 38 /* FC - FD: 31 bit payload */ 6, 6, 39 /* FE: 32 bit payload */ 7, 40 /* FF: invalid */ 0 41 }; 42 RT_OFFLOAD_VAR_GROUP_END 43 // clang-format on 44 #endif // FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS 45 46 RT_OFFLOAD_API_GROUP_BEGIN 47 48 std::size_t MeasurePreviousUTF8Bytes(const char *end, std::size_t limit) { 49 // Scan back over UTF-8 continuation bytes, if any 50 for (std::size_t n{1}; n <= limit; ++n) { 51 if ((end[-n] & 0xc0) != 0x80) { 52 return n; 53 } 54 } 55 return limit; 56 } 57 58 // Non-minimal encodings are accepted. 59 Fortran::common::optional<char32_t> DecodeUTF8(const char *p0) { 60 const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(p0)}; 61 std::size_t bytes{MeasureUTF8Bytes(*p0)}; 62 if (bytes == 1) { 63 return char32_t{*p}; 64 } else if (bytes > 1) { 65 std::uint64_t result{char32_t{*p} & (0x7f >> bytes)}; 66 for (std::size_t j{1}; j < bytes; ++j) { 67 std::uint8_t next{p[j]}; 68 if (next < 0x80 || next > 0xbf) { 69 return Fortran::common::nullopt; 70 } 71 result = (result << 6) | (next & 0x3f); 72 } 73 if (result <= 0xffffffff) { 74 return static_cast<char32_t>(result); 75 } 76 } 77 return Fortran::common::nullopt; 78 } 79 80 std::size_t EncodeUTF8(char *p0, char32_t ucs) { 81 std::uint8_t *p{reinterpret_cast<std::uint8_t *>(p0)}; 82 if (ucs <= 0x7f) { 83 p[0] = ucs; 84 return 1; 85 } else if (ucs <= 0x7ff) { 86 p[0] = 0xc0 | (ucs >> 6); 87 p[1] = 0x80 | (ucs & 0x3f); 88 return 2; 89 } else if (ucs <= 0xffff) { 90 p[0] = 0xe0 | (ucs >> 12); 91 p[1] = 0x80 | ((ucs >> 6) & 0x3f); 92 p[2] = 0x80 | (ucs & 0x3f); 93 return 3; 94 } else if (ucs <= 0x1fffff) { 95 p[0] = 0xf0 | (ucs >> 18); 96 p[1] = 0x80 | ((ucs >> 12) & 0x3f); 97 p[2] = 0x80 | ((ucs >> 6) & 0x3f); 98 p[3] = 0x80 | (ucs & 0x3f); 99 return 4; 100 } else if (ucs <= 0x3ffffff) { 101 p[0] = 0xf8 | (ucs >> 24); 102 p[1] = 0x80 | ((ucs >> 18) & 0x3f); 103 p[2] = 0x80 | ((ucs >> 12) & 0x3f); 104 p[3] = 0x80 | ((ucs >> 6) & 0x3f); 105 p[4] = 0x80 | (ucs & 0x3f); 106 return 5; 107 } else if (ucs <= 0x7ffffff) { 108 p[0] = 0xf8 | (ucs >> 30); 109 p[1] = 0x80 | ((ucs >> 24) & 0x3f); 110 p[2] = 0x80 | ((ucs >> 18) & 0x3f); 111 p[3] = 0x80 | ((ucs >> 12) & 0x3f); 112 p[4] = 0x80 | ((ucs >> 6) & 0x3f); 113 p[5] = 0x80 | (ucs & 0x3f); 114 return 6; 115 } else { 116 p[0] = 0xfe; 117 p[1] = 0x80 | ((ucs >> 30) & 0x3f); 118 p[2] = 0x80 | ((ucs >> 24) & 0x3f); 119 p[3] = 0x80 | ((ucs >> 18) & 0x3f); 120 p[4] = 0x80 | ((ucs >> 12) & 0x3f); 121 p[5] = 0x80 | ((ucs >> 6) & 0x3f); 122 p[6] = 0x80 | (ucs & 0x3f); 123 return 7; 124 } 125 } 126 RT_OFFLOAD_API_GROUP_END 127 128 } // namespace Fortran::runtime 129