xref: /openbsd-src/gnu/llvm/libcxx/include/__format/unicode.h (revision 4bdff4bed0e3d54e55670334c7d0077db4170f86)
1*4bdff4beSrobert // -*- C++ -*-
2*4bdff4beSrobert //===----------------------------------------------------------------------===//
3*4bdff4beSrobert //
4*4bdff4beSrobert // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5*4bdff4beSrobert // See https://llvm.org/LICENSE.txt for license information.
6*4bdff4beSrobert // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7*4bdff4beSrobert //
8*4bdff4beSrobert //===----------------------------------------------------------------------===//
9*4bdff4beSrobert 
10*4bdff4beSrobert #ifndef _LIBCPP___FORMAT_UNICODE_H
11*4bdff4beSrobert #define _LIBCPP___FORMAT_UNICODE_H
12*4bdff4beSrobert 
13*4bdff4beSrobert #include <__assert>
14*4bdff4beSrobert #include <__config>
15*4bdff4beSrobert #include <__format/extended_grapheme_cluster_table.h>
16*4bdff4beSrobert #include <__type_traits/make_unsigned.h>
17*4bdff4beSrobert #include <__utility/unreachable.h>
18*4bdff4beSrobert #include <bit>
19*4bdff4beSrobert 
20*4bdff4beSrobert #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
21*4bdff4beSrobert #  pragma GCC system_header
22*4bdff4beSrobert #endif
23*4bdff4beSrobert 
24*4bdff4beSrobert _LIBCPP_BEGIN_NAMESPACE_STD
25*4bdff4beSrobert 
26*4bdff4beSrobert #if _LIBCPP_STD_VER > 17
27*4bdff4beSrobert 
28*4bdff4beSrobert namespace __unicode {
29*4bdff4beSrobert 
30*4bdff4beSrobert #  if _LIBCPP_STD_VER > 20
31*4bdff4beSrobert 
/// The outcome of consuming one code point using the semantics of P2286.
///
/// TODO FMT Combine __consume and  __consume_p2286 in one function.
struct __consume_p2286_result {
  // Zero when the consumed input is well formed. The size is stored
  // separately to differentiate between a valid code point and a code unit
  // that's invalid, like 0b11111xxx.
  int __ill_formed_size;

  // The consumed code point when the input is well formed.
  // Otherwise the ill-formed code units as unsigned 8-bit values. They are
  // stored in reverse order, to make it easier to extract the values.
  //
  // NOTE(review): the UTF-8 decoder in this file stores the decoded value
  // itself when it reports four ill-formed code units (a value beyond
  // U+10FFFF) - confirm against the users of this struct.
  char32_t __value;
};
45*4bdff4beSrobert 
46*4bdff4beSrobert #  endif // _LIBCPP_STD_VER > 20
47*4bdff4beSrobert 
48*4bdff4beSrobert #  ifndef _LIBCPP_HAS_NO_UNICODE
49*4bdff4beSrobert 
50*4bdff4beSrobert /// Implements the grapheme cluster boundary rules
51*4bdff4beSrobert ///
52*4bdff4beSrobert /// These rules are used to implement format's width estimation as stated in
53*4bdff4beSrobert /// [format.string.std]/11
54*4bdff4beSrobert ///
55*4bdff4beSrobert /// The Standard refers to UAX \#29 for Unicode 12.0.0
56*4bdff4beSrobert /// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
57*4bdff4beSrobert ///
58*4bdff4beSrobert /// The data tables used are
59*4bdff4beSrobert /// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
60*4bdff4beSrobert /// https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
61*4bdff4beSrobert /// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt (for testing only)
62*4bdff4beSrobert 
/// U+FFFD REPLACEMENT CHARACTER.
///
/// The __consume() functions of the code point views in this file return this
/// value when the input can't be decoded to a code point.
inline constexpr char32_t __replacement_character = U'\uFFFD';
64*4bdff4beSrobert 
/// Tests whether the \a __count code units starting at \a __char are all
/// UTF-8 continuation code units.
///
/// A continuation code unit has the bit pattern 0b10xx'xxxx. Both of the two
/// most significant bits need to be tested; testing only the most significant
/// bit would also accept the lead code units of a multi code unit sequence
/// (0b11xx'xxxx).
///
/// \pre [__char, __char + __count) is a readable range.
/// \returns true when all \a __count code units are continuation code units,
///          which is vacuously true when \a __count isn't positive.
_LIBCPP_HIDE_FROM_ABI constexpr bool __is_continuation(const char* __char, int __count) {
  for (; __count > 0; --__count, ++__char) {
    if ((*__char & 0b1100'0000) != 0b1000'0000)
      return false;
  }
  return true;
}
74*4bdff4beSrobert 
/// Helper class to extract a code point from a Unicode character range.
///
/// The stored range is a view. There are multiple specializations for different
/// character types.
79*4bdff4beSrobert template <class _CharT>
80*4bdff4beSrobert class __code_point_view;
81*4bdff4beSrobert 
82*4bdff4beSrobert /// UTF-8 specialization.
83*4bdff4beSrobert template <>
84*4bdff4beSrobert class __code_point_view<char> {
85*4bdff4beSrobert public:
__code_point_view(const char * __first,const char * __last)86*4bdff4beSrobert   _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(const char* __first, const char* __last)
87*4bdff4beSrobert       : __first_(__first), __last_(__last) {}
88*4bdff4beSrobert 
__at_end()89*4bdff4beSrobert   _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
__position()90*4bdff4beSrobert   _LIBCPP_HIDE_FROM_ABI constexpr const char* __position() const noexcept { return __first_; }
91*4bdff4beSrobert 
__consume()92*4bdff4beSrobert   _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
93*4bdff4beSrobert     _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
94*4bdff4beSrobert 
95*4bdff4beSrobert     // Based on the number of leading 1 bits the number of code units in the
96*4bdff4beSrobert     // code point can be determined. See
97*4bdff4beSrobert     // https://en.wikipedia.org/wiki/UTF-8#Encoding
98*4bdff4beSrobert     switch (_VSTD::countl_one(static_cast<unsigned char>(*__first_))) {
99*4bdff4beSrobert     case 0:
100*4bdff4beSrobert       return *__first_++;
101*4bdff4beSrobert 
102*4bdff4beSrobert     case 2:
103*4bdff4beSrobert       if (__last_ - __first_ < 2 || !__unicode::__is_continuation(__first_ + 1, 1)) [[unlikely]]
104*4bdff4beSrobert         break;
105*4bdff4beSrobert       else {
106*4bdff4beSrobert         char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
107*4bdff4beSrobert         __value <<= 6;
108*4bdff4beSrobert         __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
109*4bdff4beSrobert         return __value;
110*4bdff4beSrobert       }
111*4bdff4beSrobert 
112*4bdff4beSrobert     case 3:
113*4bdff4beSrobert       if (__last_ - __first_ < 3 || !__unicode::__is_continuation(__first_ + 1, 2)) [[unlikely]]
114*4bdff4beSrobert         break;
115*4bdff4beSrobert       else {
116*4bdff4beSrobert         char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
117*4bdff4beSrobert         __value <<= 6;
118*4bdff4beSrobert         __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
119*4bdff4beSrobert         __value <<= 6;
120*4bdff4beSrobert         __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
121*4bdff4beSrobert         return __value;
122*4bdff4beSrobert       }
123*4bdff4beSrobert 
124*4bdff4beSrobert     case 4:
125*4bdff4beSrobert       if (__last_ - __first_ < 4 || !__unicode::__is_continuation(__first_ + 1, 3)) [[unlikely]]
126*4bdff4beSrobert         break;
127*4bdff4beSrobert       else {
128*4bdff4beSrobert         char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
129*4bdff4beSrobert         __value <<= 6;
130*4bdff4beSrobert         __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
131*4bdff4beSrobert         __value <<= 6;
132*4bdff4beSrobert         __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
133*4bdff4beSrobert         __value <<= 6;
134*4bdff4beSrobert         __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
135*4bdff4beSrobert         return __value;
136*4bdff4beSrobert       }
137*4bdff4beSrobert     }
138*4bdff4beSrobert     // An invalid number of leading ones can be garbage or a code unit in the
139*4bdff4beSrobert     // middle of a code point. By consuming one code unit the parser may get
140*4bdff4beSrobert     // "in sync" after a few code units.
141*4bdff4beSrobert     ++__first_;
142*4bdff4beSrobert     return __replacement_character;
143*4bdff4beSrobert   }
144*4bdff4beSrobert 
145*4bdff4beSrobert #    if _LIBCPP_STD_VER > 20
__consume_p2286()146*4bdff4beSrobert   _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
147*4bdff4beSrobert     _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
148*4bdff4beSrobert 
149*4bdff4beSrobert     // Based on the number of leading 1 bits the number of code units in the
150*4bdff4beSrobert     // code point can be determined. See
151*4bdff4beSrobert     // https://en.wikipedia.org/wiki/UTF-8#Encoding
152*4bdff4beSrobert     switch (std::countl_one(static_cast<unsigned char>(*__first_))) {
153*4bdff4beSrobert     case 0:
154*4bdff4beSrobert       return {0, static_cast<unsigned char>(*__first_++)};
155*4bdff4beSrobert 
156*4bdff4beSrobert     case 2:
157*4bdff4beSrobert       if (__last_ - __first_ < 2) [[unlikely]]
158*4bdff4beSrobert         break;
159*4bdff4beSrobert 
160*4bdff4beSrobert       if (__unicode::__is_continuation(__first_ + 1, 1)) {
161*4bdff4beSrobert         char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
162*4bdff4beSrobert         __value <<= 6;
163*4bdff4beSrobert         __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
164*4bdff4beSrobert         return {0, __value};
165*4bdff4beSrobert       }
166*4bdff4beSrobert       break;
167*4bdff4beSrobert 
168*4bdff4beSrobert     case 3:
169*4bdff4beSrobert       if (__last_ - __first_ < 3) [[unlikely]]
170*4bdff4beSrobert         break;
171*4bdff4beSrobert 
172*4bdff4beSrobert       if (__unicode::__is_continuation(__first_ + 1, 2)) {
173*4bdff4beSrobert         char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
174*4bdff4beSrobert         __value <<= 6;
175*4bdff4beSrobert         __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
176*4bdff4beSrobert         __value <<= 6;
177*4bdff4beSrobert         __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
178*4bdff4beSrobert         return {0, __value};
179*4bdff4beSrobert       }
180*4bdff4beSrobert       break;
181*4bdff4beSrobert 
182*4bdff4beSrobert     case 4:
183*4bdff4beSrobert       if (__last_ - __first_ < 4) [[unlikely]]
184*4bdff4beSrobert         break;
185*4bdff4beSrobert 
186*4bdff4beSrobert       if (__unicode::__is_continuation(__first_ + 1, 3)) {
187*4bdff4beSrobert         char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
188*4bdff4beSrobert         __value <<= 6;
189*4bdff4beSrobert         __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
190*4bdff4beSrobert         __value <<= 6;
191*4bdff4beSrobert         __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
192*4bdff4beSrobert         __value <<= 6;
193*4bdff4beSrobert         __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
194*4bdff4beSrobert 
195*4bdff4beSrobert         if (__value > 0x10FFFF) // Outside the valid Unicode range?
196*4bdff4beSrobert           return {4, __value};
197*4bdff4beSrobert 
198*4bdff4beSrobert         return {0, __value};
199*4bdff4beSrobert       }
200*4bdff4beSrobert       break;
201*4bdff4beSrobert     }
202*4bdff4beSrobert     // An invalid number of leading ones can be garbage or a code unit in the
203*4bdff4beSrobert     // middle of a code point. By consuming one code unit the parser may get
204*4bdff4beSrobert     // "in sync" after a few code units.
205*4bdff4beSrobert     return {1, static_cast<unsigned char>(*__first_++)};
206*4bdff4beSrobert   }
207*4bdff4beSrobert #    endif // _LIBCPP_STD_VER > 20
208*4bdff4beSrobert 
209*4bdff4beSrobert private:
210*4bdff4beSrobert   const char* __first_;
211*4bdff4beSrobert   const char* __last_;
212*4bdff4beSrobert };
213*4bdff4beSrobert 
214*4bdff4beSrobert #    ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
/// Returns whether \a __value is in the range [0xd800, 0xdbff]; the range of
/// the first (high) code unit of a UTF-16 surrogate pair.
_LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_high(wchar_t __value) {
  // The range contains 1024 values and starts at a multiple of 1024. So
  // clearing the ten least significant bits maps exactly this range to 0xd800.
  return (__value & ~0x3ff) == 0xd800;
}
218*4bdff4beSrobert 
/// Returns whether \a __value is in the range [0xdc00, 0xdfff]; the range of
/// the second (low) code unit of a UTF-16 surrogate pair.
_LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_low(wchar_t __value) {
  // The range contains 1024 values and starts at a multiple of 1024. So
  // clearing the ten least significant bits maps exactly this range to 0xdc00.
  return (__value & ~0x3ff) == 0xdc00;
}
222*4bdff4beSrobert 
223*4bdff4beSrobert /// This specialization depends on the size of wchar_t
224*4bdff4beSrobert /// - 2 UTF-16 (for example Windows and AIX)
225*4bdff4beSrobert /// - 4 UTF-32 (for example Linux)
226*4bdff4beSrobert template <>
227*4bdff4beSrobert class __code_point_view<wchar_t> {
228*4bdff4beSrobert public:
229*4bdff4beSrobert   static_assert(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4, "sizeof(wchar_t) has a not implemented value");
230*4bdff4beSrobert 
__code_point_view(const wchar_t * __first,const wchar_t * __last)231*4bdff4beSrobert   _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(const wchar_t* __first, const wchar_t* __last)
232*4bdff4beSrobert       : __first_(__first), __last_(__last) {}
233*4bdff4beSrobert 
__position()234*4bdff4beSrobert   _LIBCPP_HIDE_FROM_ABI constexpr const wchar_t* __position() const noexcept { return __first_; }
__at_end()235*4bdff4beSrobert   _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
236*4bdff4beSrobert 
__consume()237*4bdff4beSrobert   _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
238*4bdff4beSrobert     _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
239*4bdff4beSrobert 
240*4bdff4beSrobert     if constexpr (sizeof(wchar_t) == 2) {
241*4bdff4beSrobert       char32_t __result = *__first_++;
242*4bdff4beSrobert       // Is the code unit part of a surrogate pair? See
243*4bdff4beSrobert       // https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
244*4bdff4beSrobert       if (__result >= 0xd800 && __result <= 0xDfff) {
245*4bdff4beSrobert         // Malformed Unicode.
246*4bdff4beSrobert         if (__first_ == __last_) [[unlikely]]
247*4bdff4beSrobert           return __replacement_character;
248*4bdff4beSrobert 
249*4bdff4beSrobert         __result -= 0xd800;
250*4bdff4beSrobert         __result <<= 10;
251*4bdff4beSrobert         __result += *__first_++ - 0xdc00;
252*4bdff4beSrobert         __result += 0x10000;
253*4bdff4beSrobert       }
254*4bdff4beSrobert       return __result;
255*4bdff4beSrobert 
256*4bdff4beSrobert     } else if constexpr (sizeof(wchar_t) == 4) {
257*4bdff4beSrobert       char32_t __result = *__first_++;
258*4bdff4beSrobert       if (__result > 0x10FFFF) [[unlikely]]
259*4bdff4beSrobert         return __replacement_character;
260*4bdff4beSrobert       return __result;
261*4bdff4beSrobert     } else {
262*4bdff4beSrobert       __libcpp_unreachable();
263*4bdff4beSrobert     }
264*4bdff4beSrobert   }
265*4bdff4beSrobert 
266*4bdff4beSrobert #      if _LIBCPP_STD_VER > 20
__consume_p2286()267*4bdff4beSrobert   _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
268*4bdff4beSrobert     _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
269*4bdff4beSrobert 
270*4bdff4beSrobert     char32_t __result = *__first_++;
271*4bdff4beSrobert     if constexpr (sizeof(wchar_t) == 2) {
272*4bdff4beSrobert       // https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
273*4bdff4beSrobert       if (__is_surrogate_pair_high(__result)) {
274*4bdff4beSrobert         // Malformed Unicode.
275*4bdff4beSrobert         if (__first_ == __last_ || !__is_surrogate_pair_low(*(__first_ + 1))) [[unlikely]]
276*4bdff4beSrobert           return {1, __result};
277*4bdff4beSrobert 
278*4bdff4beSrobert         __result -= 0xd800;
279*4bdff4beSrobert         __result <<= 10;
280*4bdff4beSrobert         __result += *__first_++ - 0xdc00;
281*4bdff4beSrobert         __result += 0x10000;
282*4bdff4beSrobert       } else if (__is_surrogate_pair_low(__result))
283*4bdff4beSrobert         // A code point shouldn't start with the low surrogate pair
284*4bdff4beSrobert         return {1, __result};
285*4bdff4beSrobert     } else {
286*4bdff4beSrobert       if (__result > 0x10FFFF) [[unlikely]]
287*4bdff4beSrobert         return {1, __result};
288*4bdff4beSrobert     }
289*4bdff4beSrobert 
290*4bdff4beSrobert     return {0, __result};
291*4bdff4beSrobert   }
292*4bdff4beSrobert #      endif // _LIBCPP_STD_VER > 20
293*4bdff4beSrobert 
294*4bdff4beSrobert private:
295*4bdff4beSrobert   const wchar_t* __first_;
296*4bdff4beSrobert   const wchar_t* __last_;
297*4bdff4beSrobert };
298*4bdff4beSrobert #    endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
299*4bdff4beSrobert 
__at_extended_grapheme_cluster_break(bool & __ri_break_allowed,bool __has_extened_pictographic,__extended_grapheme_custer_property_boundary::__property __prev,__extended_grapheme_custer_property_boundary::__property __next)300*4bdff4beSrobert _LIBCPP_HIDE_FROM_ABI constexpr bool __at_extended_grapheme_cluster_break(
301*4bdff4beSrobert     bool& __ri_break_allowed,
302*4bdff4beSrobert     bool __has_extened_pictographic,
303*4bdff4beSrobert     __extended_grapheme_custer_property_boundary::__property __prev,
304*4bdff4beSrobert     __extended_grapheme_custer_property_boundary::__property __next) {
305*4bdff4beSrobert   using __extended_grapheme_custer_property_boundary::__property;
306*4bdff4beSrobert 
307*4bdff4beSrobert   __has_extened_pictographic |= __prev == __property::__Extended_Pictographic;
308*4bdff4beSrobert 
309*4bdff4beSrobert   // https://www.unicode.org/reports/tr29/tr29-39.html#Grapheme_Cluster_Boundary_Rules
310*4bdff4beSrobert 
311*4bdff4beSrobert   // *** Break at the start and end of text, unless the text is empty. ***
312*4bdff4beSrobert 
313*4bdff4beSrobert   _LIBCPP_ASSERT(__prev != __property::__sot, "should be handled in the constructor"); // GB1
314*4bdff4beSrobert   _LIBCPP_ASSERT(__prev != __property::__eot, "should be handled by our caller");      // GB2
315*4bdff4beSrobert 
316*4bdff4beSrobert   // *** Do not break between a CR and LF. Otherwise, break before and after controls. ***
317*4bdff4beSrobert   if (__prev == __property::__CR && __next == __property::__LF) // GB3
318*4bdff4beSrobert     return false;
319*4bdff4beSrobert 
320*4bdff4beSrobert   if (__prev == __property::__Control || __prev == __property::__CR || __prev == __property::__LF) // GB4
321*4bdff4beSrobert     return true;
322*4bdff4beSrobert 
323*4bdff4beSrobert   if (__next == __property::__Control || __next == __property::__CR || __next == __property::__LF) // GB5
324*4bdff4beSrobert     return true;
325*4bdff4beSrobert 
326*4bdff4beSrobert   // *** Do not break Hangul syllable sequences. ***
327*4bdff4beSrobert   if (__prev == __property::__L &&
328*4bdff4beSrobert       (__next == __property::__L || __next == __property::__V || __next == __property::__LV ||
329*4bdff4beSrobert        __next == __property::__LVT)) // GB6
330*4bdff4beSrobert     return false;
331*4bdff4beSrobert 
332*4bdff4beSrobert   if ((__prev == __property::__LV || __prev == __property::__V) &&
333*4bdff4beSrobert       (__next == __property::__V || __next == __property::__T)) // GB7
334*4bdff4beSrobert     return false;
335*4bdff4beSrobert 
336*4bdff4beSrobert   if ((__prev == __property::__LVT || __prev == __property::__T) && __next == __property::__T) // GB8
337*4bdff4beSrobert     return false;
338*4bdff4beSrobert 
339*4bdff4beSrobert   // *** Do not break before extending characters or ZWJ. ***
340*4bdff4beSrobert   if (__next == __property::__Extend || __next == __property::__ZWJ)
341*4bdff4beSrobert     return false; // GB9
342*4bdff4beSrobert 
343*4bdff4beSrobert   // *** Do not break before SpacingMarks, or after Prepend characters. ***
344*4bdff4beSrobert   if (__next == __property::__SpacingMark) // GB9a
345*4bdff4beSrobert     return false;
346*4bdff4beSrobert 
347*4bdff4beSrobert   if (__prev == __property::__Prepend) // GB9b
348*4bdff4beSrobert     return false;
349*4bdff4beSrobert 
350*4bdff4beSrobert   // *** Do not break within emoji modifier sequences or emoji zwj sequences. ***
351*4bdff4beSrobert 
352*4bdff4beSrobert   // GB11 \p{Extended_Pictographic} Extend* ZWJ x \p{Extended_Pictographic}
353*4bdff4beSrobert   //
354*4bdff4beSrobert   // Note that several parts of this rule are matched by GB9: Any x (Extend | ZWJ)
355*4bdff4beSrobert   // - \p{Extended_Pictographic} x Extend
356*4bdff4beSrobert   // - Extend x Extend
357*4bdff4beSrobert   // - \p{Extended_Pictographic} x ZWJ
358*4bdff4beSrobert   // - Extend x ZWJ
359*4bdff4beSrobert   //
360*4bdff4beSrobert   // So the only case left to test is
361*4bdff4beSrobert   // - \p{Extended_Pictographic}' x ZWJ x \p{Extended_Pictographic}
362*4bdff4beSrobert   //   where  \p{Extended_Pictographic}' is stored in __has_extened_pictographic
363*4bdff4beSrobert   if (__has_extened_pictographic && __prev == __property::__ZWJ && __next == __property::__Extended_Pictographic)
364*4bdff4beSrobert     return false;
365*4bdff4beSrobert 
366*4bdff4beSrobert   // *** Do not break within emoji flag sequences ***
367*4bdff4beSrobert 
368*4bdff4beSrobert   // That is, do not break between regional indicator (RI) symbols if there
369*4bdff4beSrobert   // is an odd number of RI characters before the break point.
370*4bdff4beSrobert 
371*4bdff4beSrobert   if (__prev == __property::__Regional_Indicator && __next == __property::__Regional_Indicator) { // GB12 + GB13
372*4bdff4beSrobert     __ri_break_allowed = !__ri_break_allowed;
373*4bdff4beSrobert     return __ri_break_allowed;
374*4bdff4beSrobert   }
375*4bdff4beSrobert 
376*4bdff4beSrobert   // *** Otherwise, break everywhere. ***
377*4bdff4beSrobert   return true; // GB999
378*4bdff4beSrobert }
379*4bdff4beSrobert 
380*4bdff4beSrobert /// Helper class to extract an extended grapheme cluster from a Unicode character range.
381*4bdff4beSrobert ///
382*4bdff4beSrobert /// This function is used to determine the column width of an extended grapheme
383*4bdff4beSrobert /// cluster. In order to do that only the first code point is evaluated.
384*4bdff4beSrobert /// Therefore only this code point is extracted.
385*4bdff4beSrobert template <class _CharT>
386*4bdff4beSrobert class __extended_grapheme_cluster_view {
387*4bdff4beSrobert public:
__extended_grapheme_cluster_view(const _CharT * __first,const _CharT * __last)388*4bdff4beSrobert   _LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_view(const _CharT* __first, const _CharT* __last)
389*4bdff4beSrobert       : __code_point_view_(__first, __last),
390*4bdff4beSrobert         __next_code_point_(__code_point_view_.__consume()),
391*4bdff4beSrobert         __next_prop_(__extended_grapheme_custer_property_boundary::__get_property(__next_code_point_)) {}
392*4bdff4beSrobert 
393*4bdff4beSrobert   struct __cluster {
394*4bdff4beSrobert     /// The first code point of the extended grapheme cluster.
395*4bdff4beSrobert     ///
396*4bdff4beSrobert     /// The first code point is used to estimate the width of the extended
397*4bdff4beSrobert     /// grapheme cluster.
398*4bdff4beSrobert     char32_t __code_point_;
399*4bdff4beSrobert 
400*4bdff4beSrobert     /// Points one beyond the last code unit in the extended grapheme cluster.
401*4bdff4beSrobert     ///
402*4bdff4beSrobert     /// It's expected the caller has the start position and thus can determine
403*4bdff4beSrobert     /// the code unit range of the extended grapheme cluster.
404*4bdff4beSrobert     const _CharT* __last_;
405*4bdff4beSrobert   };
406*4bdff4beSrobert 
__consume()407*4bdff4beSrobert   _LIBCPP_HIDE_FROM_ABI constexpr __cluster __consume() {
408*4bdff4beSrobert     _LIBCPP_ASSERT(
409*4bdff4beSrobert         __next_prop_ != __extended_grapheme_custer_property_boundary::__property::__eot,
410*4bdff4beSrobert         "can't move beyond the end of input");
411*4bdff4beSrobert     char32_t __code_point = __next_code_point_;
412*4bdff4beSrobert     if (!__code_point_view_.__at_end())
413*4bdff4beSrobert       return {__code_point, __get_break()};
414*4bdff4beSrobert 
415*4bdff4beSrobert     __next_prop_ = __extended_grapheme_custer_property_boundary::__property::__eot;
416*4bdff4beSrobert     return {__code_point, __code_point_view_.__position()};
417*4bdff4beSrobert   }
418*4bdff4beSrobert 
419*4bdff4beSrobert private:
420*4bdff4beSrobert   __code_point_view<_CharT> __code_point_view_;
421*4bdff4beSrobert 
422*4bdff4beSrobert   char32_t __next_code_point_;
423*4bdff4beSrobert   __extended_grapheme_custer_property_boundary::__property __next_prop_;
424*4bdff4beSrobert 
__get_break()425*4bdff4beSrobert   _LIBCPP_HIDE_FROM_ABI constexpr const _CharT* __get_break() {
426*4bdff4beSrobert     bool __ri_break_allowed         = true;
427*4bdff4beSrobert     bool __has_extened_pictographic = false;
428*4bdff4beSrobert     while (true) {
429*4bdff4beSrobert       const _CharT* __result                                          = __code_point_view_.__position();
430*4bdff4beSrobert       __extended_grapheme_custer_property_boundary::__property __prev = __next_prop_;
431*4bdff4beSrobert       if (__code_point_view_.__at_end()) {
432*4bdff4beSrobert         __next_prop_ = __extended_grapheme_custer_property_boundary::__property::__eot;
433*4bdff4beSrobert         return __result;
434*4bdff4beSrobert       }
435*4bdff4beSrobert       __next_code_point_ = __code_point_view_.__consume();
436*4bdff4beSrobert       __next_prop_       = __extended_grapheme_custer_property_boundary::__get_property(__next_code_point_);
437*4bdff4beSrobert 
438*4bdff4beSrobert       __has_extened_pictographic |=
439*4bdff4beSrobert           __prev == __extended_grapheme_custer_property_boundary::__property::__Extended_Pictographic;
440*4bdff4beSrobert 
441*4bdff4beSrobert       if (__at_extended_grapheme_cluster_break(__ri_break_allowed, __has_extened_pictographic, __prev, __next_prop_))
442*4bdff4beSrobert         return __result;
443*4bdff4beSrobert     }
444*4bdff4beSrobert   }
445*4bdff4beSrobert };
446*4bdff4beSrobert 
447*4bdff4beSrobert template <class _CharT>
448*4bdff4beSrobert __extended_grapheme_cluster_view(const _CharT*, const _CharT*) -> __extended_grapheme_cluster_view<_CharT>;
449*4bdff4beSrobert 
450*4bdff4beSrobert #  else //  _LIBCPP_HAS_NO_UNICODE
451*4bdff4beSrobert 
// For ASCII every character is a "code point".
// This makes it easier to write code agnostic of the _LIBCPP_HAS_NO_UNICODE define.
//
// The view only stores the two pointers of the range [__first, __last).
template <class _CharT>
class __code_point_view {
public:
  _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(const _CharT* __first, const _CharT* __last)
      : __first_(__first), __last_(__last) {}

  /// Returns whether all code units have been consumed.
  _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
  /// Returns the position of the first code unit that hasn't been consumed.
  _LIBCPP_HIDE_FROM_ABI constexpr const _CharT* __position() const noexcept { return __first_; }

  /// Consumes one code unit and returns it as a code point.
  ///
  /// The code unit is converted to its unsigned type first, like
  /// __consume_p2286 does. Without that conversion a negative code unit would
  /// be sign extended to a value far beyond the Unicode range.
  ///
  /// \pre !__at_end()
  _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
    _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
    return std::make_unsigned_t<_CharT>(*__first_++);
  }

#    if _LIBCPP_STD_VER > 20
  /// Consumes one code unit using the semantics of P2286.
  ///
  /// Every code unit is reported as well formed.
  ///
  /// \pre !__at_end()
  _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
    _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");

    return {0, std::make_unsigned_t<_CharT>(*__first_++)};
  }
#    endif // _LIBCPP_STD_VER > 20

private:
  const _CharT* __first_;
  const _CharT* __last_;
};
480*4bdff4beSrobert 
481*4bdff4beSrobert #  endif //  _LIBCPP_HAS_NO_UNICODE
482*4bdff4beSrobert 
483*4bdff4beSrobert } // namespace __unicode
484*4bdff4beSrobert 
485*4bdff4beSrobert #endif //_LIBCPP_STD_VER > 17
486*4bdff4beSrobert 
487*4bdff4beSrobert _LIBCPP_END_NAMESPACE_STD
488*4bdff4beSrobert 
489*4bdff4beSrobert #endif // _LIBCPP___FORMAT_UNICODE_H
490