xref: /llvm-project/flang/runtime/utf.cpp (revision c2a95ad25c65acede2492ac83039150f9522c3ae)
1 //===-- runtime/utf.cpp ---------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "utf.h"
10 
11 namespace Fortran::runtime {
12 
13 #ifndef FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
14 // clang-format off
15 RT_OFFLOAD_VAR_GROUP_BEGIN
16 const RT_CONST_VAR_ATTRS std::uint8_t UTF8FirstByteTable[256]{
17   /* 00 - 7F:  7 bit payload in single byte */
18     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
19     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
20     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
21     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
22     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
23     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
24     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
25     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
26   /* 80 - BF: invalid first byte, valid later byte */
27     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
28     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
29     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
30     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
31   /* C0 - DF: 11 bit payload */
32     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
33     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
34   /* E0 - EF: 16 bit payload */
35     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
36   /* F0 - F7: 21 bit payload */ 4, 4, 4, 4, 4, 4, 4, 4,
37   /* F8 - FB: 26 bit payload */ 5, 5, 5, 5,
38   /* FC - FD: 31 bit payload */ 6, 6,
39   /* FE:      32 bit payload */ 7,
40   /* FF:      invalid */ 0
41 };
42 RT_OFFLOAD_VAR_GROUP_END
43 // clang-format on
44 #endif // FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
45 
46 RT_OFFLOAD_API_GROUP_BEGIN
47 
// Scans backward from 'end' over UTF-8 continuation bytes (bit pattern
// 10xxxxxx) and returns the distance in bytes from 'end' to the nearest
// preceding byte that is not a continuation byte.  At most 'limit' bytes
// before 'end' are examined; when every examined byte is a continuation byte,
// or 'limit' is zero, 'limit' itself is returned.
std::size_t MeasurePreviousUTF8Bytes(const char *end, std::size_t limit) {
  for (std::size_t n{1}; n <= limit; ++n) {
    // Step back with pointer subtraction.  Indexing with end[-n] would negate
    // an unsigned std::size_t, wrapping it to a huge positive offset and
    // making the pointer arithmetic overflow.
    if ((*(end - n) & 0xc0) != 0x80) {
      return n;
    }
  }
  return limit;
}
57 
58 // Non-minimal encodings are accepted.
59 Fortran::common::optional<char32_t> DecodeUTF8(const char *p0) {
60   const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(p0)};
61   std::size_t bytes{MeasureUTF8Bytes(*p0)};
62   if (bytes == 1) {
63     return char32_t{*p};
64   } else if (bytes > 1) {
65     std::uint64_t result{char32_t{*p} & (0x7f >> bytes)};
66     for (std::size_t j{1}; j < bytes; ++j) {
67       std::uint8_t next{p[j]};
68       if (next < 0x80 || next > 0xbf) {
69         return Fortran::common::nullopt;
70       }
71       result = (result << 6) | (next & 0x3f);
72     }
73     if (result <= 0xffffffff) {
74       return static_cast<char32_t>(result);
75     }
76   }
77   return Fortran::common::nullopt;
78 }
79 
// Encodes the 32-bit code 'ucs' as a UTF-8 style byte sequence written to p0
// and returns the number of bytes stored (1 through 7).  The shortest form
// that can hold the value is used:
//   up to 0x7f       -> 1 byte
//   up to 0x7ff      -> 2 bytes, first byte 0xC0 - 0xDF
//   up to 0xffff     -> 3 bytes, first byte 0xE0 - 0xEF
//   up to 0x1fffff   -> 4 bytes, first byte 0xF0 - 0xF7
//   up to 0x3ffffff  -> 5 bytes, first byte 0xF8 - 0xFB
//   up to 0x7fffffff -> 6 bytes, first byte 0xFC - 0xFD
//   anything larger  -> 7 bytes, first byte 0xFE
// Every byte after the first has the form 10xxxxxx and carries six payload
// bits.  No bounds checking is done; the caller must supply room for up to
// seven bytes at p0.
std::size_t EncodeUTF8(char *p0, char32_t ucs) {
  std::uint8_t *p{reinterpret_cast<std::uint8_t *>(p0)};
  if (ucs <= 0x7f) {
    p[0] = ucs;
    return 1;
  } else if (ucs <= 0x7ff) {
    p[0] = 0xc0 | (ucs >> 6);
    p[1] = 0x80 | (ucs & 0x3f);
    return 2;
  } else if (ucs <= 0xffff) {
    p[0] = 0xe0 | (ucs >> 12);
    p[1] = 0x80 | ((ucs >> 6) & 0x3f);
    p[2] = 0x80 | (ucs & 0x3f);
    return 3;
  } else if (ucs <= 0x1fffff) {
    p[0] = 0xf0 | (ucs >> 18);
    p[1] = 0x80 | ((ucs >> 12) & 0x3f);
    p[2] = 0x80 | ((ucs >> 6) & 0x3f);
    p[3] = 0x80 | (ucs & 0x3f);
    return 4;
  } else if (ucs <= 0x3ffffff) {
    p[0] = 0xf8 | (ucs >> 24);
    p[1] = 0x80 | ((ucs >> 18) & 0x3f);
    p[2] = 0x80 | ((ucs >> 12) & 0x3f);
    p[3] = 0x80 | ((ucs >> 6) & 0x3f);
    p[4] = 0x80 | (ucs & 0x3f);
    return 5;
  } else if (ucs <= 0x7fffffff) {
    // Six-byte sequences hold 31 bits and must start with 0xFC or 0xFD so
    // that the first byte announces the same length that is written here.
    p[0] = 0xfc | (ucs >> 30);
    p[1] = 0x80 | ((ucs >> 24) & 0x3f);
    p[2] = 0x80 | ((ucs >> 18) & 0x3f);
    p[3] = 0x80 | ((ucs >> 12) & 0x3f);
    p[4] = 0x80 | ((ucs >> 6) & 0x3f);
    p[5] = 0x80 | (ucs & 0x3f);
    return 6;
  } else {
    // Only values with bit 31 set reach this point; the first byte carries
    // no payload and the six bytes that follow hold all 32 bits.
    p[0] = 0xfe;
    p[1] = 0x80 | ((ucs >> 30) & 0x3f);
    p[2] = 0x80 | ((ucs >> 24) & 0x3f);
    p[3] = 0x80 | ((ucs >> 18) & 0x3f);
    p[4] = 0x80 | ((ucs >> 12) & 0x3f);
    p[5] = 0x80 | ((ucs >> 6) & 0x3f);
    p[6] = 0x80 | (ucs & 0x3f);
    return 7;
  }
}
126 RT_OFFLOAD_API_GROUP_END
127 
128 } // namespace Fortran::runtime
129