1fcaf7f86SDimitry Andric // -*- C++ -*- 2fcaf7f86SDimitry Andric //===----------------------------------------------------------------------===// 3fcaf7f86SDimitry Andric // 4fcaf7f86SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 5fcaf7f86SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 6fcaf7f86SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7fcaf7f86SDimitry Andric // 8fcaf7f86SDimitry Andric //===----------------------------------------------------------------------===// 9fcaf7f86SDimitry Andric 10fcaf7f86SDimitry Andric #ifndef _LIBCPP___FORMAT_UNICODE_H 11fcaf7f86SDimitry Andric #define _LIBCPP___FORMAT_UNICODE_H 12fcaf7f86SDimitry Andric 13fcaf7f86SDimitry Andric #include <__assert> 14*06c3fb27SDimitry Andric #include <__bit/countl.h> 15*06c3fb27SDimitry Andric #include <__concepts/same_as.h> 16fcaf7f86SDimitry Andric #include <__config> 17fcaf7f86SDimitry Andric #include <__format/extended_grapheme_cluster_table.h> 18*06c3fb27SDimitry Andric #include <__iterator/concepts.h> 19*06c3fb27SDimitry Andric #include <__iterator/readable_traits.h> // iter_value_t 20*06c3fb27SDimitry Andric #include <string_view> 21fcaf7f86SDimitry Andric 22fcaf7f86SDimitry Andric #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) 23fcaf7f86SDimitry Andric # pragma GCC system_header 24fcaf7f86SDimitry Andric #endif 25fcaf7f86SDimitry Andric 26fcaf7f86SDimitry Andric _LIBCPP_BEGIN_NAMESPACE_STD 27fcaf7f86SDimitry Andric 28*06c3fb27SDimitry Andric #if _LIBCPP_STD_VER >= 20 29fcaf7f86SDimitry Andric 30bdd1243dSDimitry Andric namespace __unicode { 31bdd1243dSDimitry Andric 32*06c3fb27SDimitry Andric // Helper struct for the result of a consume operation. 33*06c3fb27SDimitry Andric // 34*06c3fb27SDimitry Andric // The status value for a correct code point is 0. This allows a valid value to 35*06c3fb27SDimitry Andric // be used without masking. 36*06c3fb27SDimitry Andric // When the decoding fails it know the number of code units affected. For the 37*06c3fb27SDimitry Andric // current use-cases that value is not needed, therefore it is not stored. 38*06c3fb27SDimitry Andric // The escape routine needs the number of code units for both a valid and 39*06c3fb27SDimitry Andric // invalid character and keeps track of it itself. Doing it in this result 40*06c3fb27SDimitry Andric // unconditionally would give some overhead when the value is unneeded. 41*06c3fb27SDimitry Andric struct __consume_result { 42*06c3fb27SDimitry Andric // When __status == __ok it contains the decoded code point. 43*06c3fb27SDimitry Andric // Else it contains the replacement character U+FFFD 44*06c3fb27SDimitry Andric char32_t __code_point : 31; 45bdd1243dSDimitry Andric 46*06c3fb27SDimitry Andric enum : char32_t { 47*06c3fb27SDimitry Andric // Consumed a well-formed code point. 48*06c3fb27SDimitry Andric __ok = 0, 49*06c3fb27SDimitry Andric // Encountered invalid UTF-8 50*06c3fb27SDimitry Andric __error = 1 51*06c3fb27SDimitry Andric } __status : 1 {__ok}; 52bdd1243dSDimitry Andric }; 53*06c3fb27SDimitry Andric static_assert(sizeof(__consume_result) == sizeof(char32_t)); 54bdd1243dSDimitry Andric 55fcaf7f86SDimitry Andric # ifndef _LIBCPP_HAS_NO_UNICODE 56fcaf7f86SDimitry Andric 57fcaf7f86SDimitry Andric /// Implements the grapheme cluster boundary rules 58fcaf7f86SDimitry Andric /// 59fcaf7f86SDimitry Andric /// These rules are used to implement format's width estimation as stated in 60fcaf7f86SDimitry Andric /// [format.string.std]/11 61fcaf7f86SDimitry Andric /// 62fcaf7f86SDimitry Andric /// The Standard refers to UAX \#29 for Unicode 12.0.0 63fcaf7f86SDimitry Andric /// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules 64fcaf7f86SDimitry Andric /// 65fcaf7f86SDimitry Andric /// The data tables used are 66fcaf7f86SDimitry Andric /// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt 67fcaf7f86SDimitry Andric /// https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt 68fcaf7f86SDimitry Andric /// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt (for testing only) 69fcaf7f86SDimitry Andric 70fcaf7f86SDimitry Andric inline constexpr char32_t __replacement_character = U'\ufffd'; 71fcaf7f86SDimitry Andric 72*06c3fb27SDimitry Andric // The error of a consume operation. 73*06c3fb27SDimitry Andric // 74*06c3fb27SDimitry Andric // This sets the code point to the replacement character. This code point does 75*06c3fb27SDimitry Andric // not participate in the grapheme clustering, so grapheme clustering code can 76*06c3fb27SDimitry Andric // ignore the error status and always use the code point. 77*06c3fb27SDimitry Andric inline constexpr __consume_result __consume_result_error{__replacement_character, __consume_result::__error}; 78*06c3fb27SDimitry Andric 79*06c3fb27SDimitry Andric [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_high_surrogate(char32_t __value) { 80*06c3fb27SDimitry Andric return __value >= 0xd800 && __value <= 0xdbff; 81*06c3fb27SDimitry Andric } 82*06c3fb27SDimitry Andric 83*06c3fb27SDimitry Andric [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_low_surrogate(char32_t __value) { 84*06c3fb27SDimitry Andric return __value >= 0xdc00 && __value <= 0xdfff; 85*06c3fb27SDimitry Andric } 86*06c3fb27SDimitry Andric 87*06c3fb27SDimitry Andric // https://www.unicode.org/glossary/#surrogate_code_point 88*06c3fb27SDimitry Andric [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_surrogate(char32_t __value) { 89*06c3fb27SDimitry Andric return __value >= 0xd800 && __value <= 0xdfff; 90*06c3fb27SDimitry Andric } 91*06c3fb27SDimitry Andric 92*06c3fb27SDimitry Andric // https://www.unicode.org/glossary/#code_point 93*06c3fb27SDimitry Andric [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_code_point(char32_t __value) { 94*06c3fb27SDimitry Andric return __value <= 0x10ffff; 95*06c3fb27SDimitry Andric } 96*06c3fb27SDimitry Andric 97*06c3fb27SDimitry Andric // https://www.unicode.org/glossary/#unicode_scalar_value 98*06c3fb27SDimitry Andric [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_scalar_value(char32_t __value) { 99*06c3fb27SDimitry Andric return __unicode::__is_code_point(__value) && !__unicode::__is_surrogate(__value); 100*06c3fb27SDimitry Andric } 101*06c3fb27SDimitry Andric 102*06c3fb27SDimitry Andric template <contiguous_iterator _Iterator> 103*06c3fb27SDimitry Andric requires same_as<iter_value_t<_Iterator>, char> 104*06c3fb27SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr bool __is_continuation(_Iterator __char, int __count) { 105fcaf7f86SDimitry Andric do { 106*06c3fb27SDimitry Andric if ((*__char & 0b1100'0000) != 0b1000'0000) 107fcaf7f86SDimitry Andric return false; 108fcaf7f86SDimitry Andric --__count; 109fcaf7f86SDimitry Andric ++__char; 110fcaf7f86SDimitry Andric } while (__count); 111fcaf7f86SDimitry Andric return true; 112fcaf7f86SDimitry Andric } 113fcaf7f86SDimitry Andric 114fcaf7f86SDimitry Andric /// Helper class to extract a code unit from a Unicode character range. 115fcaf7f86SDimitry Andric /// 116fcaf7f86SDimitry Andric /// The stored range is a view. There are multiple specialization for different 117fcaf7f86SDimitry Andric /// character types. 118fcaf7f86SDimitry Andric template <class _CharT> 119fcaf7f86SDimitry Andric class __code_point_view; 120fcaf7f86SDimitry Andric 121fcaf7f86SDimitry Andric /// UTF-8 specialization. 122fcaf7f86SDimitry Andric template <> 123fcaf7f86SDimitry Andric class __code_point_view<char> { 124*06c3fb27SDimitry Andric using _Iterator = basic_string_view<char>::const_iterator; 125*06c3fb27SDimitry Andric 126fcaf7f86SDimitry Andric public: 127*06c3fb27SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(_Iterator __first, _Iterator __last) 128fcaf7f86SDimitry Andric : __first_(__first), __last_(__last) {} 129fcaf7f86SDimitry Andric 130fcaf7f86SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; } 131*06c3fb27SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; } 132fcaf7f86SDimitry Andric 133*06c3fb27SDimitry Andric // https://www.unicode.org/versions/latest/ch03.pdf#G7404 134*06c3fb27SDimitry Andric // Based on Table 3-7, Well-Formed UTF-8 Byte Sequences 135*06c3fb27SDimitry Andric // 136*06c3fb27SDimitry Andric // Code Points First Byte Second Byte Third Byte Fourth Byte Remarks 137*06c3fb27SDimitry Andric // U+0000..U+007F 00..7F U+0000..U+007F 1 code unit range 138*06c3fb27SDimitry Andric // C0..C1 80..BF invalid overlong encoding 139*06c3fb27SDimitry Andric // U+0080..U+07FF C2..DF 80..BF U+0080..U+07FF 2 code unit range 140*06c3fb27SDimitry Andric // E0 80..9F 80..BF invalid overlong encoding 141*06c3fb27SDimitry Andric // U+0800..U+0FFF E0 A0..BF 80..BF U+0800..U+FFFF 3 code unit range 142*06c3fb27SDimitry Andric // U+1000..U+CFFF E1..EC 80..BF 80..BF 143*06c3fb27SDimitry Andric // U+D000..U+D7FF ED 80..9F 80..BF 144*06c3fb27SDimitry Andric // U+D800..U+DFFF ED A0..BF 80..BF invalid encoding of surrogate code point 145*06c3fb27SDimitry Andric // U+E000..U+FFFF EE..EF 80..BF 80..BF 146*06c3fb27SDimitry Andric // F0 80..8F 80..BF 80..BF invalid overlong encoding 147*06c3fb27SDimitry Andric // U+10000..U+3FFFF F0 90..BF 80..BF 80..BF U+10000..U+10FFFF 4 code unit range 148*06c3fb27SDimitry Andric // U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF 149*06c3fb27SDimitry Andric // U+100000..U+10FFFF F4 80..8F 80..BF 80..BF 150*06c3fb27SDimitry Andric // F4 90..BF 80..BF 80..BF U+110000.. invalid code point range 151*06c3fb27SDimitry Andric // 152*06c3fb27SDimitry Andric // Unlike other parsers, these invalid entries are tested after decoding. 153*06c3fb27SDimitry Andric // - The parser always needs to consume these code units 154*06c3fb27SDimitry Andric // - The code is optimized for well-formed UTF-8 155*06c3fb27SDimitry Andric [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept { 156*06c3fb27SDimitry Andric _LIBCPP_ASSERT_UNCATEGORIZED(__first_ != __last_, "can't move beyond the end of input"); 157bdd1243dSDimitry Andric 158bdd1243dSDimitry Andric // Based on the number of leading 1 bits the number of code units in the 159bdd1243dSDimitry Andric // code point can be determined. See 160bdd1243dSDimitry Andric // https://en.wikipedia.org/wiki/UTF-8#Encoding 161bdd1243dSDimitry Andric switch (std::countl_one(static_cast<unsigned char>(*__first_))) { 162bdd1243dSDimitry Andric case 0: 163*06c3fb27SDimitry Andric return {static_cast<unsigned char>(*__first_++)}; 164bdd1243dSDimitry Andric 165*06c3fb27SDimitry Andric case 2: { 166*06c3fb27SDimitry Andric if (__last_ - __first_ < 2 || !__unicode::__is_continuation(__first_ + 1, 1)) [[unlikely]] 167bdd1243dSDimitry Andric break; 168bdd1243dSDimitry Andric 169bdd1243dSDimitry Andric char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f; 170bdd1243dSDimitry Andric __value <<= 6; 171bdd1243dSDimitry Andric __value |= static_cast<unsigned char>(*__first_++) & 0x3f; 172*06c3fb27SDimitry Andric 173*06c3fb27SDimitry Andric // These values should be encoded in 1 UTF-8 code unit. 174*06c3fb27SDimitry Andric if (__value < 0x0080) [[unlikely]] 175*06c3fb27SDimitry Andric return __consume_result_error; 176*06c3fb27SDimitry Andric 177*06c3fb27SDimitry Andric return {__value}; 178bdd1243dSDimitry Andric } 179*06c3fb27SDimitry Andric 180*06c3fb27SDimitry Andric case 3: { 181*06c3fb27SDimitry Andric if (__last_ - __first_ < 3 || !__unicode::__is_continuation(__first_ + 1, 2)) [[unlikely]] 182bdd1243dSDimitry Andric break; 183bdd1243dSDimitry Andric 184bdd1243dSDimitry Andric char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f; 185bdd1243dSDimitry Andric __value <<= 6; 186bdd1243dSDimitry Andric __value |= static_cast<unsigned char>(*__first_++) & 0x3f; 187bdd1243dSDimitry Andric __value <<= 6; 188bdd1243dSDimitry Andric __value |= static_cast<unsigned char>(*__first_++) & 0x3f; 189*06c3fb27SDimitry Andric 190*06c3fb27SDimitry Andric // These values should be encoded in 1 or 2 UTF-8 code units. 191*06c3fb27SDimitry Andric if (__value < 0x0800) [[unlikely]] 192*06c3fb27SDimitry Andric return __consume_result_error; 193*06c3fb27SDimitry Andric 194*06c3fb27SDimitry Andric // A surrogate value is always encoded in 3 UTF-8 code units. 195*06c3fb27SDimitry Andric if (__unicode::__is_surrogate(__value)) [[unlikely]] 196*06c3fb27SDimitry Andric return __consume_result_error; 197*06c3fb27SDimitry Andric 198*06c3fb27SDimitry Andric return {__value}; 199bdd1243dSDimitry Andric } 200*06c3fb27SDimitry Andric 201*06c3fb27SDimitry Andric case 4: { 202*06c3fb27SDimitry Andric if (__last_ - __first_ < 4 || !__unicode::__is_continuation(__first_ + 1, 3)) [[unlikely]] 203bdd1243dSDimitry Andric break; 204bdd1243dSDimitry Andric 205bdd1243dSDimitry Andric char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07; 206bdd1243dSDimitry Andric __value <<= 6; 207bdd1243dSDimitry Andric __value |= static_cast<unsigned char>(*__first_++) & 0x3f; 208bdd1243dSDimitry Andric __value <<= 6; 209bdd1243dSDimitry Andric __value |= static_cast<unsigned char>(*__first_++) & 0x3f; 210bdd1243dSDimitry Andric __value <<= 6; 211bdd1243dSDimitry Andric __value |= static_cast<unsigned char>(*__first_++) & 0x3f; 212bdd1243dSDimitry Andric 213*06c3fb27SDimitry Andric // These values should be encoded in 1, 2, or 3 UTF-8 code units. 214*06c3fb27SDimitry Andric if (__value < 0x10000) [[unlikely]] 215*06c3fb27SDimitry Andric return __consume_result_error; 216bdd1243dSDimitry Andric 217*06c3fb27SDimitry Andric // A value too large is always encoded in 4 UTF-8 code units. 218*06c3fb27SDimitry Andric if (!__unicode::__is_code_point(__value)) [[unlikely]] 219*06c3fb27SDimitry Andric return __consume_result_error; 220*06c3fb27SDimitry Andric 221*06c3fb27SDimitry Andric return {__value}; 222bdd1243dSDimitry Andric } 223bdd1243dSDimitry Andric } 224bdd1243dSDimitry Andric // An invalid number of leading ones can be garbage or a code unit in the 225bdd1243dSDimitry Andric // middle of a code point. By consuming one code unit the parser may get 226bdd1243dSDimitry Andric // "in sync" after a few code units. 227*06c3fb27SDimitry Andric ++__first_; 228*06c3fb27SDimitry Andric return __consume_result_error; 229bdd1243dSDimitry Andric } 230bdd1243dSDimitry Andric 231fcaf7f86SDimitry Andric private: 232*06c3fb27SDimitry Andric _Iterator __first_; 233*06c3fb27SDimitry Andric _Iterator __last_; 234fcaf7f86SDimitry Andric }; 235fcaf7f86SDimitry Andric 236bdd1243dSDimitry Andric # ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS 237bdd1243dSDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_high(wchar_t __value) { 238bdd1243dSDimitry Andric return __value >= 0xd800 && __value <= 0xdbff; 239bdd1243dSDimitry Andric } 240bdd1243dSDimitry Andric 241bdd1243dSDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_low(wchar_t __value) { 242bdd1243dSDimitry Andric return __value >= 0xdc00 && __value <= 0xdfff; 243bdd1243dSDimitry Andric } 244bdd1243dSDimitry Andric 245fcaf7f86SDimitry Andric /// This specialization depends on the size of wchar_t 246fcaf7f86SDimitry Andric /// - 2 UTF-16 (for example Windows and AIX) 247fcaf7f86SDimitry Andric /// - 4 UTF-32 (for example Linux) 248fcaf7f86SDimitry Andric template <> 249fcaf7f86SDimitry Andric class __code_point_view<wchar_t> { 250*06c3fb27SDimitry Andric using _Iterator = typename basic_string_view<wchar_t>::const_iterator; 251*06c3fb27SDimitry Andric 252fcaf7f86SDimitry Andric public: 253bdd1243dSDimitry Andric static_assert(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4, "sizeof(wchar_t) has a not implemented value"); 254bdd1243dSDimitry Andric 255*06c3fb27SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(_Iterator __first, _Iterator __last) 256fcaf7f86SDimitry Andric : __first_(__first), __last_(__last) {} 257fcaf7f86SDimitry Andric 258*06c3fb27SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; } 259fcaf7f86SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; } 260fcaf7f86SDimitry Andric 261*06c3fb27SDimitry Andric [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept { 262*06c3fb27SDimitry Andric _LIBCPP_ASSERT_UNCATEGORIZED(__first_ != __last_, "can't move beyond the end of input"); 263fcaf7f86SDimitry Andric 264*06c3fb27SDimitry Andric char32_t __value = static_cast<char32_t>(*__first_++); 265fcaf7f86SDimitry Andric if constexpr (sizeof(wchar_t) == 2) { 266*06c3fb27SDimitry Andric if (__unicode::__is_low_surrogate(__value)) [[unlikely]] 267*06c3fb27SDimitry Andric return __consume_result_error; 268fcaf7f86SDimitry Andric 269*06c3fb27SDimitry Andric if (__unicode::__is_high_surrogate(__value)) { 270*06c3fb27SDimitry Andric if (__first_ == __last_ || !__unicode::__is_low_surrogate(static_cast<char32_t>(*__first_))) [[unlikely]] 271*06c3fb27SDimitry Andric return __consume_result_error; 272*06c3fb27SDimitry Andric 273*06c3fb27SDimitry Andric __value -= 0xd800; 274*06c3fb27SDimitry Andric __value <<= 10; 275*06c3fb27SDimitry Andric __value += static_cast<char32_t>(*__first_++) - 0xdc00; 276*06c3fb27SDimitry Andric __value += 0x10000; 277*06c3fb27SDimitry Andric 278*06c3fb27SDimitry Andric if (!__unicode::__is_code_point(__value)) [[unlikely]] 279*06c3fb27SDimitry Andric return __consume_result_error; 280fcaf7f86SDimitry Andric } 281fcaf7f86SDimitry Andric } else { 282*06c3fb27SDimitry Andric if (!__unicode::__is_scalar_value(__value)) [[unlikely]] 283*06c3fb27SDimitry Andric return __consume_result_error; 284fcaf7f86SDimitry Andric } 285fcaf7f86SDimitry Andric 286*06c3fb27SDimitry Andric return {__value}; 287bdd1243dSDimitry Andric } 288bdd1243dSDimitry Andric 289fcaf7f86SDimitry Andric private: 290*06c3fb27SDimitry Andric _Iterator __first_; 291*06c3fb27SDimitry Andric _Iterator __last_; 292fcaf7f86SDimitry Andric }; 293bdd1243dSDimitry Andric # endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS 294fcaf7f86SDimitry Andric 295fcaf7f86SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr bool __at_extended_grapheme_cluster_break( 296fcaf7f86SDimitry Andric bool& __ri_break_allowed, 297fcaf7f86SDimitry Andric bool __has_extened_pictographic, 298fcaf7f86SDimitry Andric __extended_grapheme_custer_property_boundary::__property __prev, 299fcaf7f86SDimitry Andric __extended_grapheme_custer_property_boundary::__property __next) { 300fcaf7f86SDimitry Andric using __extended_grapheme_custer_property_boundary::__property; 301fcaf7f86SDimitry Andric 302fcaf7f86SDimitry Andric __has_extened_pictographic |= __prev == __property::__Extended_Pictographic; 303fcaf7f86SDimitry Andric 304fcaf7f86SDimitry Andric // https://www.unicode.org/reports/tr29/tr29-39.html#Grapheme_Cluster_Boundary_Rules 305fcaf7f86SDimitry Andric 306fcaf7f86SDimitry Andric // *** Break at the start and end of text, unless the text is empty. *** 307fcaf7f86SDimitry Andric 308*06c3fb27SDimitry Andric _LIBCPP_ASSERT_UNCATEGORIZED(__prev != __property::__sot, "should be handled in the constructor"); // GB1 309*06c3fb27SDimitry Andric _LIBCPP_ASSERT_UNCATEGORIZED(__prev != __property::__eot, "should be handled by our caller"); // GB2 310fcaf7f86SDimitry Andric 311fcaf7f86SDimitry Andric // *** Do not break between a CR and LF. Otherwise, break before and after controls. *** 312fcaf7f86SDimitry Andric if (__prev == __property::__CR && __next == __property::__LF) // GB3 313fcaf7f86SDimitry Andric return false; 314fcaf7f86SDimitry Andric 315fcaf7f86SDimitry Andric if (__prev == __property::__Control || __prev == __property::__CR || __prev == __property::__LF) // GB4 316fcaf7f86SDimitry Andric return true; 317fcaf7f86SDimitry Andric 318fcaf7f86SDimitry Andric if (__next == __property::__Control || __next == __property::__CR || __next == __property::__LF) // GB5 319fcaf7f86SDimitry Andric return true; 320fcaf7f86SDimitry Andric 321fcaf7f86SDimitry Andric // *** Do not break Hangul syllable sequences. *** 322fcaf7f86SDimitry Andric if (__prev == __property::__L && 323fcaf7f86SDimitry Andric (__next == __property::__L || __next == __property::__V || __next == __property::__LV || 324fcaf7f86SDimitry Andric __next == __property::__LVT)) // GB6 325fcaf7f86SDimitry Andric return false; 326fcaf7f86SDimitry Andric 327fcaf7f86SDimitry Andric if ((__prev == __property::__LV || __prev == __property::__V) && 328fcaf7f86SDimitry Andric (__next == __property::__V || __next == __property::__T)) // GB7 329fcaf7f86SDimitry Andric return false; 330fcaf7f86SDimitry Andric 331fcaf7f86SDimitry Andric if ((__prev == __property::__LVT || __prev == __property::__T) && __next == __property::__T) // GB8 332fcaf7f86SDimitry Andric return false; 333fcaf7f86SDimitry Andric 334fcaf7f86SDimitry Andric // *** Do not break before extending characters or ZWJ. *** 335fcaf7f86SDimitry Andric if (__next == __property::__Extend || __next == __property::__ZWJ) 336fcaf7f86SDimitry Andric return false; // GB9 337fcaf7f86SDimitry Andric 338fcaf7f86SDimitry Andric // *** Do not break before SpacingMarks, or after Prepend characters. *** 339fcaf7f86SDimitry Andric if (__next == __property::__SpacingMark) // GB9a 340fcaf7f86SDimitry Andric return false; 341fcaf7f86SDimitry Andric 342fcaf7f86SDimitry Andric if (__prev == __property::__Prepend) // GB9b 343fcaf7f86SDimitry Andric return false; 344fcaf7f86SDimitry Andric 345fcaf7f86SDimitry Andric // *** Do not break within emoji modifier sequences or emoji zwj sequences. *** 346fcaf7f86SDimitry Andric 347fcaf7f86SDimitry Andric // GB11 \p{Extended_Pictographic} Extend* ZWJ x \p{Extended_Pictographic} 348fcaf7f86SDimitry Andric // 349fcaf7f86SDimitry Andric // Note that several parts of this rule are matched by GB9: Any x (Extend | ZWJ) 350fcaf7f86SDimitry Andric // - \p{Extended_Pictographic} x Extend 351fcaf7f86SDimitry Andric // - Extend x Extend 352fcaf7f86SDimitry Andric // - \p{Extended_Pictographic} x ZWJ 353fcaf7f86SDimitry Andric // - Extend x ZWJ 354fcaf7f86SDimitry Andric // 355fcaf7f86SDimitry Andric // So the only case left to test is 356fcaf7f86SDimitry Andric // - \p{Extended_Pictographic}' x ZWJ x \p{Extended_Pictographic} 357fcaf7f86SDimitry Andric // where \p{Extended_Pictographic}' is stored in __has_extened_pictographic 358fcaf7f86SDimitry Andric if (__has_extened_pictographic && __prev == __property::__ZWJ && __next == __property::__Extended_Pictographic) 359fcaf7f86SDimitry Andric return false; 360fcaf7f86SDimitry Andric 361fcaf7f86SDimitry Andric // *** Do not break within emoji flag sequences *** 362fcaf7f86SDimitry Andric 363fcaf7f86SDimitry Andric // That is, do not break between regional indicator (RI) symbols if there 364fcaf7f86SDimitry Andric // is an odd number of RI characters before the break point. 365fcaf7f86SDimitry Andric 366fcaf7f86SDimitry Andric if (__prev == __property::__Regional_Indicator && __next == __property::__Regional_Indicator) { // GB12 + GB13 367fcaf7f86SDimitry Andric __ri_break_allowed = !__ri_break_allowed; 368bdd1243dSDimitry Andric return __ri_break_allowed; 369fcaf7f86SDimitry Andric } 370fcaf7f86SDimitry Andric 371fcaf7f86SDimitry Andric // *** Otherwise, break everywhere. *** 372fcaf7f86SDimitry Andric return true; // GB999 373fcaf7f86SDimitry Andric } 374fcaf7f86SDimitry Andric 375fcaf7f86SDimitry Andric /// Helper class to extract an extended grapheme cluster from a Unicode character range. 376fcaf7f86SDimitry Andric /// 377fcaf7f86SDimitry Andric /// This function is used to determine the column width of an extended grapheme 378fcaf7f86SDimitry Andric /// cluster. In order to do that only the first code point is evaluated. 379fcaf7f86SDimitry Andric /// Therefore only this code point is extracted. 380fcaf7f86SDimitry Andric template <class _CharT> 381fcaf7f86SDimitry Andric class __extended_grapheme_cluster_view { 382*06c3fb27SDimitry Andric using _Iterator = typename basic_string_view<_CharT>::const_iterator; 383*06c3fb27SDimitry Andric 384fcaf7f86SDimitry Andric public: 385*06c3fb27SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_view(_Iterator __first, _Iterator __last) 386fcaf7f86SDimitry Andric : __code_point_view_(__first, __last), 387*06c3fb27SDimitry Andric __next_code_point_(__code_point_view_.__consume().__code_point), 388fcaf7f86SDimitry Andric __next_prop_(__extended_grapheme_custer_property_boundary::__get_property(__next_code_point_)) {} 389fcaf7f86SDimitry Andric 390fcaf7f86SDimitry Andric struct __cluster { 391fcaf7f86SDimitry Andric /// The first code point of the extended grapheme cluster. 392fcaf7f86SDimitry Andric /// 393fcaf7f86SDimitry Andric /// The first code point is used to estimate the width of the extended 394fcaf7f86SDimitry Andric /// grapheme cluster. 395fcaf7f86SDimitry Andric char32_t __code_point_; 396fcaf7f86SDimitry Andric 397fcaf7f86SDimitry Andric /// Points one beyond the last code unit in the extended grapheme cluster. 398fcaf7f86SDimitry Andric /// 399fcaf7f86SDimitry Andric /// It's expected the caller has the start position and thus can determine 400fcaf7f86SDimitry Andric /// the code unit range of the extended grapheme cluster. 401*06c3fb27SDimitry Andric _Iterator __last_; 402fcaf7f86SDimitry Andric }; 403fcaf7f86SDimitry Andric 404fcaf7f86SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr __cluster __consume() { 405*06c3fb27SDimitry Andric _LIBCPP_ASSERT_UNCATEGORIZED( 406fcaf7f86SDimitry Andric __next_prop_ != __extended_grapheme_custer_property_boundary::__property::__eot, 407fcaf7f86SDimitry Andric "can't move beyond the end of input"); 408*06c3fb27SDimitry Andric 409fcaf7f86SDimitry Andric char32_t __code_point = __next_code_point_; 410fcaf7f86SDimitry Andric if (!__code_point_view_.__at_end()) 411fcaf7f86SDimitry Andric return {__code_point, __get_break()}; 412fcaf7f86SDimitry Andric 413fcaf7f86SDimitry Andric __next_prop_ = __extended_grapheme_custer_property_boundary::__property::__eot; 414fcaf7f86SDimitry Andric return {__code_point, __code_point_view_.__position()}; 415fcaf7f86SDimitry Andric } 416fcaf7f86SDimitry Andric 417fcaf7f86SDimitry Andric private: 418fcaf7f86SDimitry Andric __code_point_view<_CharT> __code_point_view_; 419fcaf7f86SDimitry Andric 420fcaf7f86SDimitry Andric char32_t __next_code_point_; 421fcaf7f86SDimitry Andric __extended_grapheme_custer_property_boundary::__property __next_prop_; 422fcaf7f86SDimitry Andric 423*06c3fb27SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr _Iterator __get_break() { 424fcaf7f86SDimitry Andric bool __ri_break_allowed = true; 425fcaf7f86SDimitry Andric bool __has_extened_pictographic = false; 426fcaf7f86SDimitry Andric while (true) { 427*06c3fb27SDimitry Andric _Iterator __result = __code_point_view_.__position(); 428fcaf7f86SDimitry Andric __extended_grapheme_custer_property_boundary::__property __prev = __next_prop_; 429fcaf7f86SDimitry Andric if (__code_point_view_.__at_end()) { 430fcaf7f86SDimitry Andric __next_prop_ = __extended_grapheme_custer_property_boundary::__property::__eot; 431fcaf7f86SDimitry Andric return __result; 432fcaf7f86SDimitry Andric } 433*06c3fb27SDimitry Andric __next_code_point_ = __code_point_view_.__consume().__code_point; 434fcaf7f86SDimitry Andric __next_prop_ = __extended_grapheme_custer_property_boundary::__get_property(__next_code_point_); 435fcaf7f86SDimitry Andric 436fcaf7f86SDimitry Andric __has_extened_pictographic |= 437fcaf7f86SDimitry Andric __prev == __extended_grapheme_custer_property_boundary::__property::__Extended_Pictographic; 438fcaf7f86SDimitry Andric 439fcaf7f86SDimitry Andric if (__at_extended_grapheme_cluster_break(__ri_break_allowed, __has_extened_pictographic, __prev, __next_prop_)) 440fcaf7f86SDimitry Andric return __result; 441fcaf7f86SDimitry Andric } 442fcaf7f86SDimitry Andric } 443fcaf7f86SDimitry Andric }; 444fcaf7f86SDimitry Andric 445*06c3fb27SDimitry Andric template <contiguous_iterator _Iterator> 446*06c3fb27SDimitry Andric __extended_grapheme_cluster_view(_Iterator, _Iterator) -> __extended_grapheme_cluster_view<iter_value_t<_Iterator>>; 447bdd1243dSDimitry Andric 448bdd1243dSDimitry Andric # else // _LIBCPP_HAS_NO_UNICODE 449bdd1243dSDimitry Andric 450bdd1243dSDimitry Andric // For ASCII every character is a "code point". 451bdd1243dSDimitry Andric // This makes it easier to write code agnostic of the _LIBCPP_HAS_NO_UNICODE define. 452bdd1243dSDimitry Andric template <class _CharT> 453bdd1243dSDimitry Andric class __code_point_view { 454*06c3fb27SDimitry Andric using _Iterator = typename basic_string_view<_CharT>::const_iterator; 455*06c3fb27SDimitry Andric 456bdd1243dSDimitry Andric public: 457*06c3fb27SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(_Iterator __first, _Iterator __last) 458bdd1243dSDimitry Andric : __first_(__first), __last_(__last) {} 459bdd1243dSDimitry Andric 460bdd1243dSDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; } 461*06c3fb27SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; } 462bdd1243dSDimitry Andric 463*06c3fb27SDimitry Andric [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept { 464*06c3fb27SDimitry Andric _LIBCPP_ASSERT_UNCATEGORIZED(__first_ != __last_, "can't move beyond the end of input"); 465*06c3fb27SDimitry Andric return {static_cast<char32_t>(*__first_++)}; 466bdd1243dSDimitry Andric } 467bdd1243dSDimitry Andric 468bdd1243dSDimitry Andric private: 469*06c3fb27SDimitry Andric _Iterator __first_; 470*06c3fb27SDimitry Andric _Iterator __last_; 471bdd1243dSDimitry Andric }; 472fcaf7f86SDimitry Andric 473fcaf7f86SDimitry Andric # endif // _LIBCPP_HAS_NO_UNICODE 474fcaf7f86SDimitry Andric 475bdd1243dSDimitry Andric } // namespace __unicode 476bdd1243dSDimitry Andric 477*06c3fb27SDimitry Andric #endif //_LIBCPP_STD_VER >= 20 478fcaf7f86SDimitry Andric 479fcaf7f86SDimitry Andric _LIBCPP_END_NAMESPACE_STD 480fcaf7f86SDimitry Andric 481fcaf7f86SDimitry Andric #endif // _LIBCPP___FORMAT_UNICODE_H 482