xref: /llvm-project/flang/runtime/utf.cpp (revision c2a95ad25c65acede2492ac83039150f9522c3ae)
1bafbae23SPeter Klausler //===-- runtime/utf.cpp ---------------------------------------------------===//
2bafbae23SPeter Klausler //
3bafbae23SPeter Klausler // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4bafbae23SPeter Klausler // See https://llvm.org/LICENSE.txt for license information.
5bafbae23SPeter Klausler // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6bafbae23SPeter Klausler //
7bafbae23SPeter Klausler //===----------------------------------------------------------------------===//
8bafbae23SPeter Klausler 
9bafbae23SPeter Klausler #include "utf.h"
10bafbae23SPeter Klausler 
11bafbae23SPeter Klausler namespace Fortran::runtime {
12bafbae23SPeter Klausler 
131563a875SSlava Zakharin #ifndef FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
14bafbae23SPeter Klausler // clang-format off
158ebf7411SSlava Zakharin RT_OFFLOAD_VAR_GROUP_BEGIN
168ebf7411SSlava Zakharin const RT_CONST_VAR_ATTRS std::uint8_t UTF8FirstByteTable[256]{
17bafbae23SPeter Klausler   /* 00 - 7F:  7 bit payload in single byte */
18bafbae23SPeter Klausler     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
19bafbae23SPeter Klausler     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
20bafbae23SPeter Klausler     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
21bafbae23SPeter Klausler     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
22bafbae23SPeter Klausler     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
23bafbae23SPeter Klausler     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
24bafbae23SPeter Klausler     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
25bafbae23SPeter Klausler     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
26bafbae23SPeter Klausler   /* 80 - BF: invalid first byte, valid later byte */
27bafbae23SPeter Klausler     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
28bafbae23SPeter Klausler     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
29bafbae23SPeter Klausler     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
30bafbae23SPeter Klausler     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
31bafbae23SPeter Klausler   /* C0 - DF: 11 bit payload */
32bafbae23SPeter Klausler     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
33bafbae23SPeter Klausler     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
34bafbae23SPeter Klausler   /* E0 - EF: 16 bit payload */
35bafbae23SPeter Klausler     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
36bafbae23SPeter Klausler   /* F0 - F7: 21 bit payload */ 4, 4, 4, 4, 4, 4, 4, 4,
37bafbae23SPeter Klausler   /* F8 - FB: 26 bit payload */ 5, 5, 5, 5,
38bafbae23SPeter Klausler   /* FC - FD: 31 bit payload */ 6, 6,
39bafbae23SPeter Klausler   /* FE:      32 bit payload */ 7,
40bafbae23SPeter Klausler   /* FF:      invalid */ 0
41bafbae23SPeter Klausler };
428ebf7411SSlava Zakharin RT_OFFLOAD_VAR_GROUP_END
43bafbae23SPeter Klausler // clang-format on
441563a875SSlava Zakharin #endif // FLANG_RUNTIME_NO_GLOBAL_VAR_DEFS
45bafbae23SPeter Klausler 
468ebf7411SSlava Zakharin RT_OFFLOAD_API_GROUP_BEGIN
47*c2a95ad2SPeter Klausler 
// Measures, in bytes, the UTF-8 character that ends immediately before 'end',
// examining at most 'limit' bytes.
//
// Walks backward from end[-1] and returns the 1-based distance to the nearest
// byte that is not a continuation byte (i.e. whose top two bits are not 10).
// If every one of the 'limit' bytes examined is a continuation byte, or if
// 'limit' is zero, 'limit' itself is returned.  The caller must guarantee
// that the 'limit' bytes preceding 'end' are readable.
std::size_t MeasurePreviousUTF8Bytes(const char *end, std::size_t limit) {
  // Scan back over UTF-8 continuation bytes, if any
  for (std::size_t n{1}; n <= limit; ++n) {
    // Subtract from the pointer instead of subscripting with -n: n is
    // unsigned, so -n would wrap around to an enormous positive offset and
    // reach the intended byte only through pointer overflow.
    if ((*(end - n) & 0xc0) != 0x80) {
      return n;
    }
  }
  return limit;
}
57*c2a95ad2SPeter Klausler 
58bafbae23SPeter Klausler // Non-minimal encodings are accepted.
5971e0261fSSlava Zakharin Fortran::common::optional<char32_t> DecodeUTF8(const char *p0) {
60bafbae23SPeter Klausler   const std::uint8_t *p{reinterpret_cast<const std::uint8_t *>(p0)};
61bafbae23SPeter Klausler   std::size_t bytes{MeasureUTF8Bytes(*p0)};
62bafbae23SPeter Klausler   if (bytes == 1) {
63bafbae23SPeter Klausler     return char32_t{*p};
64bafbae23SPeter Klausler   } else if (bytes > 1) {
65bafbae23SPeter Klausler     std::uint64_t result{char32_t{*p} & (0x7f >> bytes)};
66bafbae23SPeter Klausler     for (std::size_t j{1}; j < bytes; ++j) {
67bafbae23SPeter Klausler       std::uint8_t next{p[j]};
68bafbae23SPeter Klausler       if (next < 0x80 || next > 0xbf) {
6971e0261fSSlava Zakharin         return Fortran::common::nullopt;
70bafbae23SPeter Klausler       }
71bafbae23SPeter Klausler       result = (result << 6) | (next & 0x3f);
72bafbae23SPeter Klausler     }
73bafbae23SPeter Klausler     if (result <= 0xffffffff) {
74bafbae23SPeter Klausler       return static_cast<char32_t>(result);
75bafbae23SPeter Klausler     }
76bafbae23SPeter Klausler   }
7771e0261fSSlava Zakharin   return Fortran::common::nullopt;
78bafbae23SPeter Klausler }
79bafbae23SPeter Klausler 
// Encodes 'ucs' as a UTF-8 style byte sequence written to p0 and returns the
// number of bytes stored (1 through 7).  The shortest form that can hold the
// value is used:
//   up to 0x7f        1 byte   0xxxxxxx
//   up to 0x7ff       2 bytes  110xxxxx + 1 continuation byte
//   up to 0xffff      3 bytes  1110xxxx + 2 continuation bytes
//   up to 0x1fffff    4 bytes  11110xxx + 3 continuation bytes
//   up to 0x3ffffff   5 bytes  111110xx + 4 continuation bytes
//   up to 0x7fffffff  6 bytes  1111110x + 5 continuation bytes
//   otherwise         7 bytes  0xfe     + 6 continuation bytes
// Every continuation byte is 10xxxxxx and carries six payload bits.  The
// caller must provide room for as many as 7 bytes; no terminator is written.
std::size_t EncodeUTF8(char *p0, char32_t ucs) {
  std::uint8_t *p{reinterpret_cast<std::uint8_t *>(p0)};
  if (ucs <= 0x7f) {
    p[0] = ucs;
    return 1;
  } else if (ucs <= 0x7ff) {
    p[0] = 0xc0 | (ucs >> 6);
    p[1] = 0x80 | (ucs & 0x3f);
    return 2;
  } else if (ucs <= 0xffff) {
    p[0] = 0xe0 | (ucs >> 12);
    p[1] = 0x80 | ((ucs >> 6) & 0x3f);
    p[2] = 0x80 | (ucs & 0x3f);
    return 3;
  } else if (ucs <= 0x1fffff) {
    p[0] = 0xf0 | (ucs >> 18);
    p[1] = 0x80 | ((ucs >> 12) & 0x3f);
    p[2] = 0x80 | ((ucs >> 6) & 0x3f);
    p[3] = 0x80 | (ucs & 0x3f);
    return 4;
  } else if (ucs <= 0x3ffffff) {
    p[0] = 0xf8 | (ucs >> 24);
    p[1] = 0x80 | ((ucs >> 18) & 0x3f);
    p[2] = 0x80 | ((ucs >> 12) & 0x3f);
    p[3] = 0x80 | ((ucs >> 6) & 0x3f);
    p[4] = 0x80 | (ucs & 0x3f);
    return 5;
  } else if (ucs <= 0x7fffffff) {
    // The six-byte form holds 31 bits and is introduced by 0xfc/0xfd.
    // Using the five-byte prefix 0xf8 here would make the result decode
    // as a five-byte sequence with a different value.
    p[0] = 0xfc | (ucs >> 30);
    p[1] = 0x80 | ((ucs >> 24) & 0x3f);
    p[2] = 0x80 | ((ucs >> 18) & 0x3f);
    p[3] = 0x80 | ((ucs >> 12) & 0x3f);
    p[4] = 0x80 | ((ucs >> 6) & 0x3f);
    p[5] = 0x80 | (ucs & 0x3f);
    return 6;
  } else {
    // Only values with bit 31 set get here; 0xfe itself carries no payload.
    p[0] = 0xfe;
    p[1] = 0x80 | ((ucs >> 30) & 0x3f);
    p[2] = 0x80 | ((ucs >> 24) & 0x3f);
    p[3] = 0x80 | ((ucs >> 18) & 0x3f);
    p[4] = 0x80 | ((ucs >> 12) & 0x3f);
    p[5] = 0x80 | ((ucs >> 6) & 0x3f);
    p[6] = 0x80 | (ucs & 0x3f);
    return 7;
  }
}
1268ebf7411SSlava Zakharin RT_OFFLOAD_API_GROUP_END
127bafbae23SPeter Klausler 
128bafbae23SPeter Klausler } // namespace Fortran::runtime
129