//===-- runtime/utf.h -----------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

// UTF-8 is the variant-width standard encoding of Unicode (ISO 10646)
// code points.
//
// 7-bit values in [00 .. 7F] represent themselves as single bytes, so true
// 7-bit ASCII is also valid UTF-8.
//
// Larger values are encoded with a start byte in [C0 .. FE] that carries
// the length of the encoding and some of the upper bits of the value, followed
// by one or more bytes in the range [80 .. BF].
//
// Specifically, the first byte holds two or more uppermost set bits,
// a zero bit, and some payload; the second and later bytes each start with
// their uppermost bit set, the next bit clear, and six bits of payload.
// Payload parcels are in big-endian order.  All bytes must be present in a
// valid sequence; i.e., low-order zero bits must be explicit.  UTF-8 is
// self-synchronizing on input, as no byte value can be both a valid
// first byte and a valid trailing byte.
//
//   0xxxxxxx - 7 bit ASCII
//   110xxxxx 10xxxxxx - 11-bit value
//   1110xxxx 10xxxxxx 10xxxxxx - 16-bit value
//   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - 21-bit value
//   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 26-bit value
//   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 31-bit value
//   11111110 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - 36-bit value
//
// Canonical UTF-8 sequences should be minimal, and our output is so, but
// we do not reject non-minimal sequences on input.  Unicode only defines
// code points up to 0x10FFFF, so 21-bit (4-byte) UTF-8 is the actual
// standard maximum.
However, we support extended forms up to 32 bits so that 39 // CHARACTER(KIND=4) can be abused to hold arbitrary 32-bit data. 40 41 #ifndef FORTRAN_RUNTIME_UTF_H_ 42 #define FORTRAN_RUNTIME_UTF_H_ 43 44 #include "flang/Common/optional.h" 45 #include <cstddef> 46 #include <cstdint> 47 48 namespace Fortran::runtime { 49 50 // Derive the length of a UTF-8 character encoding from its first byte. 51 // A zero result signifies an invalid encoding. 52 RT_OFFLOAD_VAR_GROUP_BEGIN 53 extern const RT_CONST_VAR_ATTRS std::uint8_t UTF8FirstByteTable[256]; 54 static constexpr std::size_t maxUTF8Bytes{7}; 55 RT_OFFLOAD_VAR_GROUP_END 56 57 static inline RT_API_ATTRS std::size_t MeasureUTF8Bytes(char first) { 58 return UTF8FirstByteTable[static_cast<std::uint8_t>(first)]; 59 } 60 61 RT_API_ATTRS std::size_t MeasurePreviousUTF8Bytes( 62 const char *end, std::size_t limit); 63 64 // Ensure that all bytes are present in sequence in the input buffer 65 // before calling; use MeasureUTF8Bytes(first byte) to count them. 66 RT_API_ATTRS Fortran::common::optional<char32_t> DecodeUTF8(const char *); 67 68 // Ensure that at least maxUTF8Bytes remain in the output 69 // buffer before calling. 70 RT_API_ATTRS std::size_t EncodeUTF8(char *, char32_t); 71 72 } // namespace Fortran::runtime 73 #endif // FORTRAN_RUNTIME_UTF_H_ 74