1fcaf7f86SDimitry Andric // -*- C++ -*- 2fcaf7f86SDimitry Andric //===----------------------------------------------------------------------===// 3fcaf7f86SDimitry Andric // 4fcaf7f86SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 5fcaf7f86SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 6fcaf7f86SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 7fcaf7f86SDimitry Andric // 8fcaf7f86SDimitry Andric //===----------------------------------------------------------------------===// 9fcaf7f86SDimitry Andric 10fcaf7f86SDimitry Andric #ifndef _LIBCPP___FORMAT_UNICODE_H 11fcaf7f86SDimitry Andric #define _LIBCPP___FORMAT_UNICODE_H 12fcaf7f86SDimitry Andric 13fcaf7f86SDimitry Andric #include <__assert> 14fcaf7f86SDimitry Andric #include <__config> 15fcaf7f86SDimitry Andric #include <__format/extended_grapheme_cluster_table.h> 16*bdd1243dSDimitry Andric #include <__type_traits/make_unsigned.h> 17fcaf7f86SDimitry Andric #include <__utility/unreachable.h> 18fcaf7f86SDimitry Andric #include <bit> 19fcaf7f86SDimitry Andric 20fcaf7f86SDimitry Andric #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) 21fcaf7f86SDimitry Andric # pragma GCC system_header 22fcaf7f86SDimitry Andric #endif 23fcaf7f86SDimitry Andric 24fcaf7f86SDimitry Andric _LIBCPP_BEGIN_NAMESPACE_STD 25fcaf7f86SDimitry Andric 26fcaf7f86SDimitry Andric #if _LIBCPP_STD_VER > 17 27fcaf7f86SDimitry Andric 28*bdd1243dSDimitry Andric namespace __unicode { 29*bdd1243dSDimitry Andric 30*bdd1243dSDimitry Andric # if _LIBCPP_STD_VER > 20 31*bdd1243dSDimitry Andric 32*bdd1243dSDimitry Andric /// The result of consuming a code point using P2286' semantics 33*bdd1243dSDimitry Andric /// 34*bdd1243dSDimitry Andric /// TODO FMT Combine __consume and __consume_p2286 in one function. 35*bdd1243dSDimitry Andric struct __consume_p2286_result { 36*bdd1243dSDimitry Andric // A size of 0 means well formed. This to differenciate between 37*bdd1243dSDimitry Andric // a valid code point and a code unit that's invalid like 0b11111xxx. 38*bdd1243dSDimitry Andric int __ill_formed_size; 39*bdd1243dSDimitry Andric 40*bdd1243dSDimitry Andric // If well formed the consumed code point. 41*bdd1243dSDimitry Andric // Otherwise the ill-formed code units as unsigned 8-bit values. They are 42*bdd1243dSDimitry Andric // stored in reverse order, to make it easier to extract the values. 43*bdd1243dSDimitry Andric char32_t __value; 44*bdd1243dSDimitry Andric }; 45*bdd1243dSDimitry Andric 46*bdd1243dSDimitry Andric # endif // _LIBCPP_STD_VER > 20 47*bdd1243dSDimitry Andric 48fcaf7f86SDimitry Andric # ifndef _LIBCPP_HAS_NO_UNICODE 49fcaf7f86SDimitry Andric 50fcaf7f86SDimitry Andric /// Implements the grapheme cluster boundary rules 51fcaf7f86SDimitry Andric /// 52fcaf7f86SDimitry Andric /// These rules are used to implement format's width estimation as stated in 53fcaf7f86SDimitry Andric /// [format.string.std]/11 54fcaf7f86SDimitry Andric /// 55fcaf7f86SDimitry Andric /// The Standard refers to UAX \#29 for Unicode 12.0.0 56fcaf7f86SDimitry Andric /// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules 57fcaf7f86SDimitry Andric /// 58fcaf7f86SDimitry Andric /// The data tables used are 59fcaf7f86SDimitry Andric /// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt 60fcaf7f86SDimitry Andric /// https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt 61fcaf7f86SDimitry Andric /// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt (for testing only) 62fcaf7f86SDimitry Andric 63fcaf7f86SDimitry Andric inline constexpr char32_t __replacement_character = U'\ufffd'; 64fcaf7f86SDimitry Andric 65fcaf7f86SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr bool __is_continuation(const char* __char, int __count) { 66fcaf7f86SDimitry Andric do { 67fcaf7f86SDimitry Andric if ((*__char & 0b1000'0000) != 0b1000'0000) 68fcaf7f86SDimitry Andric return false; 69fcaf7f86SDimitry Andric --__count; 70fcaf7f86SDimitry Andric ++__char; 71fcaf7f86SDimitry Andric } while (__count); 72fcaf7f86SDimitry Andric return true; 73fcaf7f86SDimitry Andric } 74fcaf7f86SDimitry Andric 75fcaf7f86SDimitry Andric /// Helper class to extract a code unit from a Unicode character range. 76fcaf7f86SDimitry Andric /// 77fcaf7f86SDimitry Andric /// The stored range is a view. There are multiple specialization for different 78fcaf7f86SDimitry Andric /// character types. 79fcaf7f86SDimitry Andric template <class _CharT> 80fcaf7f86SDimitry Andric class __code_point_view; 81fcaf7f86SDimitry Andric 82fcaf7f86SDimitry Andric /// UTF-8 specialization. 83fcaf7f86SDimitry Andric template <> 84fcaf7f86SDimitry Andric class __code_point_view<char> { 85fcaf7f86SDimitry Andric public: 86fcaf7f86SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(const char* __first, const char* __last) 87fcaf7f86SDimitry Andric : __first_(__first), __last_(__last) {} 88fcaf7f86SDimitry Andric 89fcaf7f86SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; } 90fcaf7f86SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr const char* __position() const noexcept { return __first_; } 91fcaf7f86SDimitry Andric 92fcaf7f86SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept { 93fcaf7f86SDimitry Andric _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input"); 94fcaf7f86SDimitry Andric 95fcaf7f86SDimitry Andric // Based on the number of leading 1 bits the number of code units in the 96fcaf7f86SDimitry Andric // code point can be determined. See 97fcaf7f86SDimitry Andric // https://en.wikipedia.org/wiki/UTF-8#Encoding 98fcaf7f86SDimitry Andric switch (_VSTD::countl_one(static_cast<unsigned char>(*__first_))) { 99fcaf7f86SDimitry Andric case 0: 100fcaf7f86SDimitry Andric return *__first_++; 101fcaf7f86SDimitry Andric 102fcaf7f86SDimitry Andric case 2: 103fcaf7f86SDimitry Andric if (__last_ - __first_ < 2 || !__unicode::__is_continuation(__first_ + 1, 1)) [[unlikely]] 104fcaf7f86SDimitry Andric break; 105fcaf7f86SDimitry Andric else { 106fcaf7f86SDimitry Andric char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f; 107fcaf7f86SDimitry Andric __value <<= 6; 108fcaf7f86SDimitry Andric __value |= static_cast<unsigned char>(*__first_++) & 0x3f; 109fcaf7f86SDimitry Andric return __value; 110fcaf7f86SDimitry Andric } 111fcaf7f86SDimitry Andric 112fcaf7f86SDimitry Andric case 3: 113fcaf7f86SDimitry Andric if (__last_ - __first_ < 3 || !__unicode::__is_continuation(__first_ + 1, 2)) [[unlikely]] 114fcaf7f86SDimitry Andric break; 115fcaf7f86SDimitry Andric else { 116fcaf7f86SDimitry Andric char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f; 117fcaf7f86SDimitry Andric __value <<= 6; 118fcaf7f86SDimitry Andric __value |= static_cast<unsigned char>(*__first_++) & 0x3f; 119fcaf7f86SDimitry Andric __value <<= 6; 120fcaf7f86SDimitry Andric __value |= static_cast<unsigned char>(*__first_++) & 0x3f; 121fcaf7f86SDimitry Andric return __value; 122fcaf7f86SDimitry Andric } 123fcaf7f86SDimitry Andric 124fcaf7f86SDimitry Andric case 4: 125fcaf7f86SDimitry Andric if (__last_ - __first_ < 4 || !__unicode::__is_continuation(__first_ + 1, 3)) [[unlikely]] 126fcaf7f86SDimitry Andric break; 127fcaf7f86SDimitry Andric else { 128fcaf7f86SDimitry Andric char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07; 129fcaf7f86SDimitry Andric __value <<= 6; 130fcaf7f86SDimitry Andric __value |= static_cast<unsigned char>(*__first_++) & 0x3f; 131fcaf7f86SDimitry Andric __value <<= 6; 132fcaf7f86SDimitry Andric __value |= static_cast<unsigned char>(*__first_++) & 0x3f; 133fcaf7f86SDimitry Andric __value <<= 6; 134fcaf7f86SDimitry Andric __value |= static_cast<unsigned char>(*__first_++) & 0x3f; 135fcaf7f86SDimitry Andric return __value; 136fcaf7f86SDimitry Andric } 137fcaf7f86SDimitry Andric } 138fcaf7f86SDimitry Andric // An invalid number of leading ones can be garbage or a code unit in the 139fcaf7f86SDimitry Andric // middle of a code point. By consuming one code unit the parser may get 140fcaf7f86SDimitry Andric // "in sync" after a few code units. 141fcaf7f86SDimitry Andric ++__first_; 142fcaf7f86SDimitry Andric return __replacement_character; 143fcaf7f86SDimitry Andric } 144fcaf7f86SDimitry Andric 145*bdd1243dSDimitry Andric # if _LIBCPP_STD_VER > 20 146*bdd1243dSDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept { 147*bdd1243dSDimitry Andric _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input"); 148*bdd1243dSDimitry Andric 149*bdd1243dSDimitry Andric // Based on the number of leading 1 bits the number of code units in the 150*bdd1243dSDimitry Andric // code point can be determined. See 151*bdd1243dSDimitry Andric // https://en.wikipedia.org/wiki/UTF-8#Encoding 152*bdd1243dSDimitry Andric switch (std::countl_one(static_cast<unsigned char>(*__first_))) { 153*bdd1243dSDimitry Andric case 0: 154*bdd1243dSDimitry Andric return {0, static_cast<unsigned char>(*__first_++)}; 155*bdd1243dSDimitry Andric 156*bdd1243dSDimitry Andric case 2: 157*bdd1243dSDimitry Andric if (__last_ - __first_ < 2) [[unlikely]] 158*bdd1243dSDimitry Andric break; 159*bdd1243dSDimitry Andric 160*bdd1243dSDimitry Andric if (__unicode::__is_continuation(__first_ + 1, 1)) { 161*bdd1243dSDimitry Andric char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f; 162*bdd1243dSDimitry Andric __value <<= 6; 163*bdd1243dSDimitry Andric __value |= static_cast<unsigned char>(*__first_++) & 0x3f; 164*bdd1243dSDimitry Andric return {0, __value}; 165*bdd1243dSDimitry Andric } 166*bdd1243dSDimitry Andric break; 167*bdd1243dSDimitry Andric 168*bdd1243dSDimitry Andric case 3: 169*bdd1243dSDimitry Andric if (__last_ - __first_ < 3) [[unlikely]] 170*bdd1243dSDimitry Andric break; 171*bdd1243dSDimitry Andric 172*bdd1243dSDimitry Andric if (__unicode::__is_continuation(__first_ + 1, 2)) { 173*bdd1243dSDimitry Andric char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f; 174*bdd1243dSDimitry Andric __value <<= 6; 175*bdd1243dSDimitry Andric __value |= static_cast<unsigned char>(*__first_++) & 0x3f; 176*bdd1243dSDimitry Andric __value <<= 6; 177*bdd1243dSDimitry Andric __value |= static_cast<unsigned char>(*__first_++) & 0x3f; 178*bdd1243dSDimitry Andric return {0, __value}; 179*bdd1243dSDimitry Andric } 180*bdd1243dSDimitry Andric break; 181*bdd1243dSDimitry Andric 182*bdd1243dSDimitry Andric case 4: 183*bdd1243dSDimitry Andric if (__last_ - __first_ < 4) [[unlikely]] 184*bdd1243dSDimitry Andric break; 185*bdd1243dSDimitry Andric 186*bdd1243dSDimitry Andric if (__unicode::__is_continuation(__first_ + 1, 3)) { 187*bdd1243dSDimitry Andric char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07; 188*bdd1243dSDimitry Andric __value <<= 6; 189*bdd1243dSDimitry Andric __value |= static_cast<unsigned char>(*__first_++) & 0x3f; 190*bdd1243dSDimitry Andric __value <<= 6; 191*bdd1243dSDimitry Andric __value |= static_cast<unsigned char>(*__first_++) & 0x3f; 192*bdd1243dSDimitry Andric __value <<= 6; 193*bdd1243dSDimitry Andric __value |= static_cast<unsigned char>(*__first_++) & 0x3f; 194*bdd1243dSDimitry Andric 195*bdd1243dSDimitry Andric if (__value > 0x10FFFF) // Outside the valid Unicode range? 196*bdd1243dSDimitry Andric return {4, __value}; 197*bdd1243dSDimitry Andric 198*bdd1243dSDimitry Andric return {0, __value}; 199*bdd1243dSDimitry Andric } 200*bdd1243dSDimitry Andric break; 201*bdd1243dSDimitry Andric } 202*bdd1243dSDimitry Andric // An invalid number of leading ones can be garbage or a code unit in the 203*bdd1243dSDimitry Andric // middle of a code point. By consuming one code unit the parser may get 204*bdd1243dSDimitry Andric // "in sync" after a few code units. 205*bdd1243dSDimitry Andric return {1, static_cast<unsigned char>(*__first_++)}; 206*bdd1243dSDimitry Andric } 207*bdd1243dSDimitry Andric # endif // _LIBCPP_STD_VER > 20 208*bdd1243dSDimitry Andric 209fcaf7f86SDimitry Andric private: 210fcaf7f86SDimitry Andric const char* __first_; 211fcaf7f86SDimitry Andric const char* __last_; 212fcaf7f86SDimitry Andric }; 213fcaf7f86SDimitry Andric 214*bdd1243dSDimitry Andric # ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS 215*bdd1243dSDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_high(wchar_t __value) { 216*bdd1243dSDimitry Andric return __value >= 0xd800 && __value <= 0xdbff; 217*bdd1243dSDimitry Andric } 218*bdd1243dSDimitry Andric 219*bdd1243dSDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_low(wchar_t __value) { 220*bdd1243dSDimitry Andric return __value >= 0xdc00 && __value <= 0xdfff; 221*bdd1243dSDimitry Andric } 222*bdd1243dSDimitry Andric 223fcaf7f86SDimitry Andric /// This specialization depends on the size of wchar_t 224fcaf7f86SDimitry Andric /// - 2 UTF-16 (for example Windows and AIX) 225fcaf7f86SDimitry Andric /// - 4 UTF-32 (for example Linux) 226fcaf7f86SDimitry Andric template <> 227fcaf7f86SDimitry Andric class __code_point_view<wchar_t> { 228fcaf7f86SDimitry Andric public: 229*bdd1243dSDimitry Andric static_assert(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4, "sizeof(wchar_t) has a not implemented value"); 230*bdd1243dSDimitry Andric 231fcaf7f86SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(const wchar_t* __first, const wchar_t* __last) 232fcaf7f86SDimitry Andric : __first_(__first), __last_(__last) {} 233fcaf7f86SDimitry Andric 234fcaf7f86SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr const wchar_t* __position() const noexcept { return __first_; } 235fcaf7f86SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; } 236fcaf7f86SDimitry Andric 237fcaf7f86SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept { 238fcaf7f86SDimitry Andric _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input"); 239fcaf7f86SDimitry Andric 240fcaf7f86SDimitry Andric if constexpr (sizeof(wchar_t) == 2) { 241fcaf7f86SDimitry Andric char32_t __result = *__first_++; 242fcaf7f86SDimitry Andric // Is the code unit part of a surrogate pair? See 243fcaf7f86SDimitry Andric // https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF 244fcaf7f86SDimitry Andric if (__result >= 0xd800 && __result <= 0xDfff) { 245fcaf7f86SDimitry Andric // Malformed Unicode. 246fcaf7f86SDimitry Andric if (__first_ == __last_) [[unlikely]] 247fcaf7f86SDimitry Andric return __replacement_character; 248fcaf7f86SDimitry Andric 249fcaf7f86SDimitry Andric __result -= 0xd800; 250fcaf7f86SDimitry Andric __result <<= 10; 251fcaf7f86SDimitry Andric __result += *__first_++ - 0xdc00; 252fcaf7f86SDimitry Andric __result += 0x10000; 253fcaf7f86SDimitry Andric } 254fcaf7f86SDimitry Andric return __result; 255fcaf7f86SDimitry Andric 256fcaf7f86SDimitry Andric } else if constexpr (sizeof(wchar_t) == 4) { 257fcaf7f86SDimitry Andric char32_t __result = *__first_++; 258fcaf7f86SDimitry Andric if (__result > 0x10FFFF) [[unlikely]] 259fcaf7f86SDimitry Andric return __replacement_character; 260fcaf7f86SDimitry Andric return __result; 261fcaf7f86SDimitry Andric } else { 262fcaf7f86SDimitry Andric __libcpp_unreachable(); 263fcaf7f86SDimitry Andric } 264fcaf7f86SDimitry Andric } 265fcaf7f86SDimitry Andric 266*bdd1243dSDimitry Andric # if _LIBCPP_STD_VER > 20 267*bdd1243dSDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept { 268*bdd1243dSDimitry Andric _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input"); 269*bdd1243dSDimitry Andric 270*bdd1243dSDimitry Andric char32_t __result = *__first_++; 271*bdd1243dSDimitry Andric if constexpr (sizeof(wchar_t) == 2) { 272*bdd1243dSDimitry Andric // https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF 273*bdd1243dSDimitry Andric if (__is_surrogate_pair_high(__result)) { 274*bdd1243dSDimitry Andric // Malformed Unicode. 275*bdd1243dSDimitry Andric if (__first_ == __last_ || !__is_surrogate_pair_low(*(__first_ + 1))) [[unlikely]] 276*bdd1243dSDimitry Andric return {1, __result}; 277*bdd1243dSDimitry Andric 278*bdd1243dSDimitry Andric __result -= 0xd800; 279*bdd1243dSDimitry Andric __result <<= 10; 280*bdd1243dSDimitry Andric __result += *__first_++ - 0xdc00; 281*bdd1243dSDimitry Andric __result += 0x10000; 282*bdd1243dSDimitry Andric } else if (__is_surrogate_pair_low(__result)) 283*bdd1243dSDimitry Andric // A code point shouldn't start with the low surrogate pair 284*bdd1243dSDimitry Andric return {1, __result}; 285*bdd1243dSDimitry Andric } else { 286*bdd1243dSDimitry Andric if (__result > 0x10FFFF) [[unlikely]] 287*bdd1243dSDimitry Andric return {1, __result}; 288*bdd1243dSDimitry Andric } 289*bdd1243dSDimitry Andric 290*bdd1243dSDimitry Andric return {0, __result}; 291*bdd1243dSDimitry Andric } 292*bdd1243dSDimitry Andric # endif // _LIBCPP_STD_VER > 20 293*bdd1243dSDimitry Andric 294fcaf7f86SDimitry Andric private: 295fcaf7f86SDimitry Andric const wchar_t* __first_; 296fcaf7f86SDimitry Andric const wchar_t* __last_; 297fcaf7f86SDimitry Andric }; 298*bdd1243dSDimitry Andric # endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS 299fcaf7f86SDimitry Andric 300fcaf7f86SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr bool __at_extended_grapheme_cluster_break( 301fcaf7f86SDimitry Andric bool& __ri_break_allowed, 302fcaf7f86SDimitry Andric bool __has_extened_pictographic, 303fcaf7f86SDimitry Andric __extended_grapheme_custer_property_boundary::__property __prev, 304fcaf7f86SDimitry Andric __extended_grapheme_custer_property_boundary::__property __next) { 305fcaf7f86SDimitry Andric using __extended_grapheme_custer_property_boundary::__property; 306fcaf7f86SDimitry Andric 307fcaf7f86SDimitry Andric __has_extened_pictographic |= __prev == __property::__Extended_Pictographic; 308fcaf7f86SDimitry Andric 309fcaf7f86SDimitry Andric // https://www.unicode.org/reports/tr29/tr29-39.html#Grapheme_Cluster_Boundary_Rules 310fcaf7f86SDimitry Andric 311fcaf7f86SDimitry Andric // *** Break at the start and end of text, unless the text is empty. *** 312fcaf7f86SDimitry Andric 313fcaf7f86SDimitry Andric _LIBCPP_ASSERT(__prev != __property::__sot, "should be handled in the constructor"); // GB1 314fcaf7f86SDimitry Andric _LIBCPP_ASSERT(__prev != __property::__eot, "should be handled by our caller"); // GB2 315fcaf7f86SDimitry Andric 316fcaf7f86SDimitry Andric // *** Do not break between a CR and LF. Otherwise, break before and after controls. *** 317fcaf7f86SDimitry Andric if (__prev == __property::__CR && __next == __property::__LF) // GB3 318fcaf7f86SDimitry Andric return false; 319fcaf7f86SDimitry Andric 320fcaf7f86SDimitry Andric if (__prev == __property::__Control || __prev == __property::__CR || __prev == __property::__LF) // GB4 321fcaf7f86SDimitry Andric return true; 322fcaf7f86SDimitry Andric 323fcaf7f86SDimitry Andric if (__next == __property::__Control || __next == __property::__CR || __next == __property::__LF) // GB5 324fcaf7f86SDimitry Andric return true; 325fcaf7f86SDimitry Andric 326fcaf7f86SDimitry Andric // *** Do not break Hangul syllable sequences. *** 327fcaf7f86SDimitry Andric if (__prev == __property::__L && 328fcaf7f86SDimitry Andric (__next == __property::__L || __next == __property::__V || __next == __property::__LV || 329fcaf7f86SDimitry Andric __next == __property::__LVT)) // GB6 330fcaf7f86SDimitry Andric return false; 331fcaf7f86SDimitry Andric 332fcaf7f86SDimitry Andric if ((__prev == __property::__LV || __prev == __property::__V) && 333fcaf7f86SDimitry Andric (__next == __property::__V || __next == __property::__T)) // GB7 334fcaf7f86SDimitry Andric return false; 335fcaf7f86SDimitry Andric 336fcaf7f86SDimitry Andric if ((__prev == __property::__LVT || __prev == __property::__T) && __next == __property::__T) // GB8 337fcaf7f86SDimitry Andric return false; 338fcaf7f86SDimitry Andric 339fcaf7f86SDimitry Andric // *** Do not break before extending characters or ZWJ. *** 340fcaf7f86SDimitry Andric if (__next == __property::__Extend || __next == __property::__ZWJ) 341fcaf7f86SDimitry Andric return false; // GB9 342fcaf7f86SDimitry Andric 343fcaf7f86SDimitry Andric // *** Do not break before SpacingMarks, or after Prepend characters. *** 344fcaf7f86SDimitry Andric if (__next == __property::__SpacingMark) // GB9a 345fcaf7f86SDimitry Andric return false; 346fcaf7f86SDimitry Andric 347fcaf7f86SDimitry Andric if (__prev == __property::__Prepend) // GB9b 348fcaf7f86SDimitry Andric return false; 349fcaf7f86SDimitry Andric 350fcaf7f86SDimitry Andric // *** Do not break within emoji modifier sequences or emoji zwj sequences. *** 351fcaf7f86SDimitry Andric 352fcaf7f86SDimitry Andric // GB11 \p{Extended_Pictographic} Extend* ZWJ x \p{Extended_Pictographic} 353fcaf7f86SDimitry Andric // 354fcaf7f86SDimitry Andric // Note that several parts of this rule are matched by GB9: Any x (Extend | ZWJ) 355fcaf7f86SDimitry Andric // - \p{Extended_Pictographic} x Extend 356fcaf7f86SDimitry Andric // - Extend x Extend 357fcaf7f86SDimitry Andric // - \p{Extended_Pictographic} x ZWJ 358fcaf7f86SDimitry Andric // - Extend x ZWJ 359fcaf7f86SDimitry Andric // 360fcaf7f86SDimitry Andric // So the only case left to test is 361fcaf7f86SDimitry Andric // - \p{Extended_Pictographic}' x ZWJ x \p{Extended_Pictographic} 362fcaf7f86SDimitry Andric // where \p{Extended_Pictographic}' is stored in __has_extened_pictographic 363fcaf7f86SDimitry Andric if (__has_extened_pictographic && __prev == __property::__ZWJ && __next == __property::__Extended_Pictographic) 364fcaf7f86SDimitry Andric return false; 365fcaf7f86SDimitry Andric 366fcaf7f86SDimitry Andric // *** Do not break within emoji flag sequences *** 367fcaf7f86SDimitry Andric 368fcaf7f86SDimitry Andric // That is, do not break between regional indicator (RI) symbols if there 369fcaf7f86SDimitry Andric // is an odd number of RI characters before the break point. 370fcaf7f86SDimitry Andric 371fcaf7f86SDimitry Andric if (__prev == __property::__Regional_Indicator && __next == __property::__Regional_Indicator) { // GB12 + GB13 372fcaf7f86SDimitry Andric __ri_break_allowed = !__ri_break_allowed; 373*bdd1243dSDimitry Andric return __ri_break_allowed; 374fcaf7f86SDimitry Andric } 375fcaf7f86SDimitry Andric 376fcaf7f86SDimitry Andric // *** Otherwise, break everywhere. *** 377fcaf7f86SDimitry Andric return true; // GB999 378fcaf7f86SDimitry Andric } 379fcaf7f86SDimitry Andric 380fcaf7f86SDimitry Andric /// Helper class to extract an extended grapheme cluster from a Unicode character range. 381fcaf7f86SDimitry Andric /// 382fcaf7f86SDimitry Andric /// This function is used to determine the column width of an extended grapheme 383fcaf7f86SDimitry Andric /// cluster. In order to do that only the first code point is evaluated. 384fcaf7f86SDimitry Andric /// Therefore only this code point is extracted. 385fcaf7f86SDimitry Andric template <class _CharT> 386fcaf7f86SDimitry Andric class __extended_grapheme_cluster_view { 387fcaf7f86SDimitry Andric public: 388fcaf7f86SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_view(const _CharT* __first, const _CharT* __last) 389fcaf7f86SDimitry Andric : __code_point_view_(__first, __last), 390fcaf7f86SDimitry Andric __next_code_point_(__code_point_view_.__consume()), 391fcaf7f86SDimitry Andric __next_prop_(__extended_grapheme_custer_property_boundary::__get_property(__next_code_point_)) {} 392fcaf7f86SDimitry Andric 393fcaf7f86SDimitry Andric struct __cluster { 394fcaf7f86SDimitry Andric /// The first code point of the extended grapheme cluster. 395fcaf7f86SDimitry Andric /// 396fcaf7f86SDimitry Andric /// The first code point is used to estimate the width of the extended 397fcaf7f86SDimitry Andric /// grapheme cluster. 398fcaf7f86SDimitry Andric char32_t __code_point_; 399fcaf7f86SDimitry Andric 400fcaf7f86SDimitry Andric /// Points one beyond the last code unit in the extended grapheme cluster. 401fcaf7f86SDimitry Andric /// 402fcaf7f86SDimitry Andric /// It's expected the caller has the start position and thus can determine 403fcaf7f86SDimitry Andric /// the code unit range of the extended grapheme cluster. 404fcaf7f86SDimitry Andric const _CharT* __last_; 405fcaf7f86SDimitry Andric }; 406fcaf7f86SDimitry Andric 407fcaf7f86SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr __cluster __consume() { 408fcaf7f86SDimitry Andric _LIBCPP_ASSERT( 409fcaf7f86SDimitry Andric __next_prop_ != __extended_grapheme_custer_property_boundary::__property::__eot, 410fcaf7f86SDimitry Andric "can't move beyond the end of input"); 411fcaf7f86SDimitry Andric char32_t __code_point = __next_code_point_; 412fcaf7f86SDimitry Andric if (!__code_point_view_.__at_end()) 413fcaf7f86SDimitry Andric return {__code_point, __get_break()}; 414fcaf7f86SDimitry Andric 415fcaf7f86SDimitry Andric __next_prop_ = __extended_grapheme_custer_property_boundary::__property::__eot; 416fcaf7f86SDimitry Andric return {__code_point, __code_point_view_.__position()}; 417fcaf7f86SDimitry Andric } 418fcaf7f86SDimitry Andric 419fcaf7f86SDimitry Andric private: 420fcaf7f86SDimitry Andric __code_point_view<_CharT> __code_point_view_; 421fcaf7f86SDimitry Andric 422fcaf7f86SDimitry Andric char32_t __next_code_point_; 423fcaf7f86SDimitry Andric __extended_grapheme_custer_property_boundary::__property __next_prop_; 424fcaf7f86SDimitry Andric 425fcaf7f86SDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr const _CharT* __get_break() { 426fcaf7f86SDimitry Andric bool __ri_break_allowed = true; 427fcaf7f86SDimitry Andric bool __has_extened_pictographic = false; 428fcaf7f86SDimitry Andric while (true) { 429fcaf7f86SDimitry Andric const _CharT* __result = __code_point_view_.__position(); 430fcaf7f86SDimitry Andric __extended_grapheme_custer_property_boundary::__property __prev = __next_prop_; 431fcaf7f86SDimitry Andric if (__code_point_view_.__at_end()) { 432fcaf7f86SDimitry Andric __next_prop_ = __extended_grapheme_custer_property_boundary::__property::__eot; 433fcaf7f86SDimitry Andric return __result; 434fcaf7f86SDimitry Andric } 435fcaf7f86SDimitry Andric __next_code_point_ = __code_point_view_.__consume(); 436fcaf7f86SDimitry Andric __next_prop_ = __extended_grapheme_custer_property_boundary::__get_property(__next_code_point_); 437fcaf7f86SDimitry Andric 438fcaf7f86SDimitry Andric __has_extened_pictographic |= 439fcaf7f86SDimitry Andric __prev == __extended_grapheme_custer_property_boundary::__property::__Extended_Pictographic; 440fcaf7f86SDimitry Andric 441fcaf7f86SDimitry Andric if (__at_extended_grapheme_cluster_break(__ri_break_allowed, __has_extened_pictographic, __prev, __next_prop_)) 442fcaf7f86SDimitry Andric return __result; 443fcaf7f86SDimitry Andric } 444fcaf7f86SDimitry Andric } 445fcaf7f86SDimitry Andric }; 446fcaf7f86SDimitry Andric 447*bdd1243dSDimitry Andric template <class _CharT> 448*bdd1243dSDimitry Andric __extended_grapheme_cluster_view(const _CharT*, const _CharT*) -> __extended_grapheme_cluster_view<_CharT>; 449*bdd1243dSDimitry Andric 450*bdd1243dSDimitry Andric # else // _LIBCPP_HAS_NO_UNICODE 451*bdd1243dSDimitry Andric 452*bdd1243dSDimitry Andric // For ASCII every character is a "code point". 453*bdd1243dSDimitry Andric // This makes it easier to write code agnostic of the _LIBCPP_HAS_NO_UNICODE define. 454*bdd1243dSDimitry Andric template <class _CharT> 455*bdd1243dSDimitry Andric class __code_point_view { 456*bdd1243dSDimitry Andric public: 457*bdd1243dSDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(const _CharT* __first, const _CharT* __last) 458*bdd1243dSDimitry Andric : __first_(__first), __last_(__last) {} 459*bdd1243dSDimitry Andric 460*bdd1243dSDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; } 461*bdd1243dSDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr const _CharT* __position() const noexcept { return __first_; } 462*bdd1243dSDimitry Andric 463*bdd1243dSDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept { 464*bdd1243dSDimitry Andric _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input"); 465*bdd1243dSDimitry Andric return *__first_++; 466*bdd1243dSDimitry Andric } 467*bdd1243dSDimitry Andric 468*bdd1243dSDimitry Andric # if _LIBCPP_STD_VER > 20 469*bdd1243dSDimitry Andric _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept { 470*bdd1243dSDimitry Andric _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input"); 471*bdd1243dSDimitry Andric 472*bdd1243dSDimitry Andric return {0, std::make_unsigned_t<_CharT>(*__first_++)}; 473*bdd1243dSDimitry Andric } 474*bdd1243dSDimitry Andric # endif // _LIBCPP_STD_VER > 20 475*bdd1243dSDimitry Andric 476*bdd1243dSDimitry Andric private: 477*bdd1243dSDimitry Andric const _CharT* __first_; 478*bdd1243dSDimitry Andric const _CharT* __last_; 479*bdd1243dSDimitry Andric }; 480fcaf7f86SDimitry Andric 481fcaf7f86SDimitry Andric # endif // _LIBCPP_HAS_NO_UNICODE 482fcaf7f86SDimitry Andric 483*bdd1243dSDimitry Andric } // namespace __unicode 484*bdd1243dSDimitry Andric 485fcaf7f86SDimitry Andric #endif //_LIBCPP_STD_VER > 17 486fcaf7f86SDimitry Andric 487fcaf7f86SDimitry Andric _LIBCPP_END_NAMESPACE_STD 488fcaf7f86SDimitry Andric 489fcaf7f86SDimitry Andric #endif // _LIBCPP___FORMAT_UNICODE_H 490