1*4bdff4beSrobert // -*- C++ -*-
2*4bdff4beSrobert //===----------------------------------------------------------------------===//
3*4bdff4beSrobert //
4*4bdff4beSrobert // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5*4bdff4beSrobert // See https://llvm.org/LICENSE.txt for license information.
6*4bdff4beSrobert // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7*4bdff4beSrobert //
8*4bdff4beSrobert //===----------------------------------------------------------------------===//
9*4bdff4beSrobert
10*4bdff4beSrobert #ifndef _LIBCPP___FORMAT_UNICODE_H
11*4bdff4beSrobert #define _LIBCPP___FORMAT_UNICODE_H
12*4bdff4beSrobert
13*4bdff4beSrobert #include <__assert>
14*4bdff4beSrobert #include <__config>
15*4bdff4beSrobert #include <__format/extended_grapheme_cluster_table.h>
16*4bdff4beSrobert #include <__type_traits/make_unsigned.h>
17*4bdff4beSrobert #include <__utility/unreachable.h>
18*4bdff4beSrobert #include <bit>
19*4bdff4beSrobert
20*4bdff4beSrobert #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
21*4bdff4beSrobert # pragma GCC system_header
22*4bdff4beSrobert #endif
23*4bdff4beSrobert
24*4bdff4beSrobert _LIBCPP_BEGIN_NAMESPACE_STD
25*4bdff4beSrobert
26*4bdff4beSrobert #if _LIBCPP_STD_VER > 17
27*4bdff4beSrobert
28*4bdff4beSrobert namespace __unicode {
29*4bdff4beSrobert
30*4bdff4beSrobert # if _LIBCPP_STD_VER > 20
31*4bdff4beSrobert
/// Outcome of consuming one code point with the semantics of P2286.
///
/// TODO FMT Combine __consume and __consume_p2286 in one function.
struct __consume_p2286_result {
  // Zero when the consumed input was well formed, non-zero otherwise. The
  // separate field is needed to differentiate between a valid code point and
  // an invalid code unit such as 0b11111xxx.
  int __ill_formed_size;

  // For well-formed input: the consumed code point.
  // For ill-formed input: the ill-formed code units as unsigned 8-bit values.
  // They are stored in reverse order, to make it easier to extract the
  // values.
  char32_t __value;
};
45*4bdff4beSrobert
46*4bdff4beSrobert # endif // _LIBCPP_STD_VER > 20
47*4bdff4beSrobert
48*4bdff4beSrobert # ifndef _LIBCPP_HAS_NO_UNICODE
49*4bdff4beSrobert
50*4bdff4beSrobert /// Implements the grapheme cluster boundary rules
51*4bdff4beSrobert ///
52*4bdff4beSrobert /// These rules are used to implement format's width estimation as stated in
53*4bdff4beSrobert /// [format.string.std]/11
54*4bdff4beSrobert ///
55*4bdff4beSrobert /// The Standard refers to UAX \#29 for Unicode 12.0.0
56*4bdff4beSrobert /// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
57*4bdff4beSrobert ///
58*4bdff4beSrobert /// The data tables used are
59*4bdff4beSrobert /// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
60*4bdff4beSrobert /// https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
61*4bdff4beSrobert /// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt (for testing only)
62*4bdff4beSrobert
/// U+FFFD REPLACEMENT CHARACTER, the value the __consume functions in this
/// header return when the input is malformed.
inline constexpr char32_t __replacement_character = U'\uFFFD';
64*4bdff4beSrobert
/// Tests whether a run of code units consists of UTF-8 continuation code units.
///
/// A continuation code unit has the bit pattern 0b10xx'xxxx. Both high bits
/// are inspected; testing only the most significant bit would also accept the
/// lead code units 0b11xx'xxxx.
///
/// \param __char  The first code unit to inspect.
/// \param __count The number of code units to inspect.
///
/// \pre [__char, __char + __count) is a readable range.
///
/// \returns true when every inspected code unit is a continuation code unit.
///          When \a __count isn't positive nothing is inspected and the
///          result is true.
_LIBCPP_HIDE_FROM_ABI constexpr bool __is_continuation(const char* __char, int __count) {
  for (; __count > 0; --__count, ++__char) {
    if ((*__char & 0b1100'0000) != 0b1000'0000)
      return false;
  }
  return true;
}
74*4bdff4beSrobert
/// Helper class to extract a code point from a Unicode character range.
///
/// The stored range is a view. There are multiple specializations for
/// different character types.
79*4bdff4beSrobert template <class _CharT>
80*4bdff4beSrobert class __code_point_view;
81*4bdff4beSrobert
82*4bdff4beSrobert /// UTF-8 specialization.
83*4bdff4beSrobert template <>
84*4bdff4beSrobert class __code_point_view<char> {
85*4bdff4beSrobert public:
__code_point_view(const char * __first,const char * __last)86*4bdff4beSrobert _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(const char* __first, const char* __last)
87*4bdff4beSrobert : __first_(__first), __last_(__last) {}
88*4bdff4beSrobert
__at_end()89*4bdff4beSrobert _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
__position()90*4bdff4beSrobert _LIBCPP_HIDE_FROM_ABI constexpr const char* __position() const noexcept { return __first_; }
91*4bdff4beSrobert
__consume()92*4bdff4beSrobert _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
93*4bdff4beSrobert _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
94*4bdff4beSrobert
95*4bdff4beSrobert // Based on the number of leading 1 bits the number of code units in the
96*4bdff4beSrobert // code point can be determined. See
97*4bdff4beSrobert // https://en.wikipedia.org/wiki/UTF-8#Encoding
98*4bdff4beSrobert switch (_VSTD::countl_one(static_cast<unsigned char>(*__first_))) {
99*4bdff4beSrobert case 0:
100*4bdff4beSrobert return *__first_++;
101*4bdff4beSrobert
102*4bdff4beSrobert case 2:
103*4bdff4beSrobert if (__last_ - __first_ < 2 || !__unicode::__is_continuation(__first_ + 1, 1)) [[unlikely]]
104*4bdff4beSrobert break;
105*4bdff4beSrobert else {
106*4bdff4beSrobert char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
107*4bdff4beSrobert __value <<= 6;
108*4bdff4beSrobert __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
109*4bdff4beSrobert return __value;
110*4bdff4beSrobert }
111*4bdff4beSrobert
112*4bdff4beSrobert case 3:
113*4bdff4beSrobert if (__last_ - __first_ < 3 || !__unicode::__is_continuation(__first_ + 1, 2)) [[unlikely]]
114*4bdff4beSrobert break;
115*4bdff4beSrobert else {
116*4bdff4beSrobert char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
117*4bdff4beSrobert __value <<= 6;
118*4bdff4beSrobert __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
119*4bdff4beSrobert __value <<= 6;
120*4bdff4beSrobert __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
121*4bdff4beSrobert return __value;
122*4bdff4beSrobert }
123*4bdff4beSrobert
124*4bdff4beSrobert case 4:
125*4bdff4beSrobert if (__last_ - __first_ < 4 || !__unicode::__is_continuation(__first_ + 1, 3)) [[unlikely]]
126*4bdff4beSrobert break;
127*4bdff4beSrobert else {
128*4bdff4beSrobert char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
129*4bdff4beSrobert __value <<= 6;
130*4bdff4beSrobert __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
131*4bdff4beSrobert __value <<= 6;
132*4bdff4beSrobert __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
133*4bdff4beSrobert __value <<= 6;
134*4bdff4beSrobert __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
135*4bdff4beSrobert return __value;
136*4bdff4beSrobert }
137*4bdff4beSrobert }
138*4bdff4beSrobert // An invalid number of leading ones can be garbage or a code unit in the
139*4bdff4beSrobert // middle of a code point. By consuming one code unit the parser may get
140*4bdff4beSrobert // "in sync" after a few code units.
141*4bdff4beSrobert ++__first_;
142*4bdff4beSrobert return __replacement_character;
143*4bdff4beSrobert }
144*4bdff4beSrobert
145*4bdff4beSrobert # if _LIBCPP_STD_VER > 20
__consume_p2286()146*4bdff4beSrobert _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
147*4bdff4beSrobert _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
148*4bdff4beSrobert
149*4bdff4beSrobert // Based on the number of leading 1 bits the number of code units in the
150*4bdff4beSrobert // code point can be determined. See
151*4bdff4beSrobert // https://en.wikipedia.org/wiki/UTF-8#Encoding
152*4bdff4beSrobert switch (std::countl_one(static_cast<unsigned char>(*__first_))) {
153*4bdff4beSrobert case 0:
154*4bdff4beSrobert return {0, static_cast<unsigned char>(*__first_++)};
155*4bdff4beSrobert
156*4bdff4beSrobert case 2:
157*4bdff4beSrobert if (__last_ - __first_ < 2) [[unlikely]]
158*4bdff4beSrobert break;
159*4bdff4beSrobert
160*4bdff4beSrobert if (__unicode::__is_continuation(__first_ + 1, 1)) {
161*4bdff4beSrobert char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
162*4bdff4beSrobert __value <<= 6;
163*4bdff4beSrobert __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
164*4bdff4beSrobert return {0, __value};
165*4bdff4beSrobert }
166*4bdff4beSrobert break;
167*4bdff4beSrobert
168*4bdff4beSrobert case 3:
169*4bdff4beSrobert if (__last_ - __first_ < 3) [[unlikely]]
170*4bdff4beSrobert break;
171*4bdff4beSrobert
172*4bdff4beSrobert if (__unicode::__is_continuation(__first_ + 1, 2)) {
173*4bdff4beSrobert char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
174*4bdff4beSrobert __value <<= 6;
175*4bdff4beSrobert __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
176*4bdff4beSrobert __value <<= 6;
177*4bdff4beSrobert __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
178*4bdff4beSrobert return {0, __value};
179*4bdff4beSrobert }
180*4bdff4beSrobert break;
181*4bdff4beSrobert
182*4bdff4beSrobert case 4:
183*4bdff4beSrobert if (__last_ - __first_ < 4) [[unlikely]]
184*4bdff4beSrobert break;
185*4bdff4beSrobert
186*4bdff4beSrobert if (__unicode::__is_continuation(__first_ + 1, 3)) {
187*4bdff4beSrobert char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
188*4bdff4beSrobert __value <<= 6;
189*4bdff4beSrobert __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
190*4bdff4beSrobert __value <<= 6;
191*4bdff4beSrobert __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
192*4bdff4beSrobert __value <<= 6;
193*4bdff4beSrobert __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
194*4bdff4beSrobert
195*4bdff4beSrobert if (__value > 0x10FFFF) // Outside the valid Unicode range?
196*4bdff4beSrobert return {4, __value};
197*4bdff4beSrobert
198*4bdff4beSrobert return {0, __value};
199*4bdff4beSrobert }
200*4bdff4beSrobert break;
201*4bdff4beSrobert }
202*4bdff4beSrobert // An invalid number of leading ones can be garbage or a code unit in the
203*4bdff4beSrobert // middle of a code point. By consuming one code unit the parser may get
204*4bdff4beSrobert // "in sync" after a few code units.
205*4bdff4beSrobert return {1, static_cast<unsigned char>(*__first_++)};
206*4bdff4beSrobert }
207*4bdff4beSrobert # endif // _LIBCPP_STD_VER > 20
208*4bdff4beSrobert
209*4bdff4beSrobert private:
210*4bdff4beSrobert const char* __first_;
211*4bdff4beSrobert const char* __last_;
212*4bdff4beSrobert };
213*4bdff4beSrobert
214*4bdff4beSrobert # ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
/// \returns whether \a __value is in the range [0xd800, 0xdbff]; the code
/// units that start a UTF-16 surrogate pair.
_LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_high(wchar_t __value) {
  return !(__value < 0xd800 || __value > 0xdbff);
}
218*4bdff4beSrobert
/// \returns whether \a __value is in the range [0xdc00, 0xdfff]; the code
/// units that end a UTF-16 surrogate pair.
_LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_low(wchar_t __value) {
  return !(__value < 0xdc00 || __value > 0xdfff);
}
222*4bdff4beSrobert
223*4bdff4beSrobert /// This specialization depends on the size of wchar_t
224*4bdff4beSrobert /// - 2 UTF-16 (for example Windows and AIX)
225*4bdff4beSrobert /// - 4 UTF-32 (for example Linux)
226*4bdff4beSrobert template <>
227*4bdff4beSrobert class __code_point_view<wchar_t> {
228*4bdff4beSrobert public:
229*4bdff4beSrobert static_assert(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4, "sizeof(wchar_t) has a not implemented value");
230*4bdff4beSrobert
__code_point_view(const wchar_t * __first,const wchar_t * __last)231*4bdff4beSrobert _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(const wchar_t* __first, const wchar_t* __last)
232*4bdff4beSrobert : __first_(__first), __last_(__last) {}
233*4bdff4beSrobert
__position()234*4bdff4beSrobert _LIBCPP_HIDE_FROM_ABI constexpr const wchar_t* __position() const noexcept { return __first_; }
__at_end()235*4bdff4beSrobert _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
236*4bdff4beSrobert
__consume()237*4bdff4beSrobert _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
238*4bdff4beSrobert _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
239*4bdff4beSrobert
240*4bdff4beSrobert if constexpr (sizeof(wchar_t) == 2) {
241*4bdff4beSrobert char32_t __result = *__first_++;
242*4bdff4beSrobert // Is the code unit part of a surrogate pair? See
243*4bdff4beSrobert // https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
244*4bdff4beSrobert if (__result >= 0xd800 && __result <= 0xDfff) {
245*4bdff4beSrobert // Malformed Unicode.
246*4bdff4beSrobert if (__first_ == __last_) [[unlikely]]
247*4bdff4beSrobert return __replacement_character;
248*4bdff4beSrobert
249*4bdff4beSrobert __result -= 0xd800;
250*4bdff4beSrobert __result <<= 10;
251*4bdff4beSrobert __result += *__first_++ - 0xdc00;
252*4bdff4beSrobert __result += 0x10000;
253*4bdff4beSrobert }
254*4bdff4beSrobert return __result;
255*4bdff4beSrobert
256*4bdff4beSrobert } else if constexpr (sizeof(wchar_t) == 4) {
257*4bdff4beSrobert char32_t __result = *__first_++;
258*4bdff4beSrobert if (__result > 0x10FFFF) [[unlikely]]
259*4bdff4beSrobert return __replacement_character;
260*4bdff4beSrobert return __result;
261*4bdff4beSrobert } else {
262*4bdff4beSrobert __libcpp_unreachable();
263*4bdff4beSrobert }
264*4bdff4beSrobert }
265*4bdff4beSrobert
266*4bdff4beSrobert # if _LIBCPP_STD_VER > 20
__consume_p2286()267*4bdff4beSrobert _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
268*4bdff4beSrobert _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
269*4bdff4beSrobert
270*4bdff4beSrobert char32_t __result = *__first_++;
271*4bdff4beSrobert if constexpr (sizeof(wchar_t) == 2) {
272*4bdff4beSrobert // https://en.wikipedia.org/wiki/UTF-16#U+D800_to_U+DFFF
273*4bdff4beSrobert if (__is_surrogate_pair_high(__result)) {
274*4bdff4beSrobert // Malformed Unicode.
275*4bdff4beSrobert if (__first_ == __last_ || !__is_surrogate_pair_low(*(__first_ + 1))) [[unlikely]]
276*4bdff4beSrobert return {1, __result};
277*4bdff4beSrobert
278*4bdff4beSrobert __result -= 0xd800;
279*4bdff4beSrobert __result <<= 10;
280*4bdff4beSrobert __result += *__first_++ - 0xdc00;
281*4bdff4beSrobert __result += 0x10000;
282*4bdff4beSrobert } else if (__is_surrogate_pair_low(__result))
283*4bdff4beSrobert // A code point shouldn't start with the low surrogate pair
284*4bdff4beSrobert return {1, __result};
285*4bdff4beSrobert } else {
286*4bdff4beSrobert if (__result > 0x10FFFF) [[unlikely]]
287*4bdff4beSrobert return {1, __result};
288*4bdff4beSrobert }
289*4bdff4beSrobert
290*4bdff4beSrobert return {0, __result};
291*4bdff4beSrobert }
292*4bdff4beSrobert # endif // _LIBCPP_STD_VER > 20
293*4bdff4beSrobert
294*4bdff4beSrobert private:
295*4bdff4beSrobert const wchar_t* __first_;
296*4bdff4beSrobert const wchar_t* __last_;
297*4bdff4beSrobert };
298*4bdff4beSrobert # endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
299*4bdff4beSrobert
__at_extended_grapheme_cluster_break(bool & __ri_break_allowed,bool __has_extened_pictographic,__extended_grapheme_custer_property_boundary::__property __prev,__extended_grapheme_custer_property_boundary::__property __next)300*4bdff4beSrobert _LIBCPP_HIDE_FROM_ABI constexpr bool __at_extended_grapheme_cluster_break(
301*4bdff4beSrobert bool& __ri_break_allowed,
302*4bdff4beSrobert bool __has_extened_pictographic,
303*4bdff4beSrobert __extended_grapheme_custer_property_boundary::__property __prev,
304*4bdff4beSrobert __extended_grapheme_custer_property_boundary::__property __next) {
305*4bdff4beSrobert using __extended_grapheme_custer_property_boundary::__property;
306*4bdff4beSrobert
307*4bdff4beSrobert __has_extened_pictographic |= __prev == __property::__Extended_Pictographic;
308*4bdff4beSrobert
309*4bdff4beSrobert // https://www.unicode.org/reports/tr29/tr29-39.html#Grapheme_Cluster_Boundary_Rules
310*4bdff4beSrobert
311*4bdff4beSrobert // *** Break at the start and end of text, unless the text is empty. ***
312*4bdff4beSrobert
313*4bdff4beSrobert _LIBCPP_ASSERT(__prev != __property::__sot, "should be handled in the constructor"); // GB1
314*4bdff4beSrobert _LIBCPP_ASSERT(__prev != __property::__eot, "should be handled by our caller"); // GB2
315*4bdff4beSrobert
316*4bdff4beSrobert // *** Do not break between a CR and LF. Otherwise, break before and after controls. ***
317*4bdff4beSrobert if (__prev == __property::__CR && __next == __property::__LF) // GB3
318*4bdff4beSrobert return false;
319*4bdff4beSrobert
320*4bdff4beSrobert if (__prev == __property::__Control || __prev == __property::__CR || __prev == __property::__LF) // GB4
321*4bdff4beSrobert return true;
322*4bdff4beSrobert
323*4bdff4beSrobert if (__next == __property::__Control || __next == __property::__CR || __next == __property::__LF) // GB5
324*4bdff4beSrobert return true;
325*4bdff4beSrobert
326*4bdff4beSrobert // *** Do not break Hangul syllable sequences. ***
327*4bdff4beSrobert if (__prev == __property::__L &&
328*4bdff4beSrobert (__next == __property::__L || __next == __property::__V || __next == __property::__LV ||
329*4bdff4beSrobert __next == __property::__LVT)) // GB6
330*4bdff4beSrobert return false;
331*4bdff4beSrobert
332*4bdff4beSrobert if ((__prev == __property::__LV || __prev == __property::__V) &&
333*4bdff4beSrobert (__next == __property::__V || __next == __property::__T)) // GB7
334*4bdff4beSrobert return false;
335*4bdff4beSrobert
336*4bdff4beSrobert if ((__prev == __property::__LVT || __prev == __property::__T) && __next == __property::__T) // GB8
337*4bdff4beSrobert return false;
338*4bdff4beSrobert
339*4bdff4beSrobert // *** Do not break before extending characters or ZWJ. ***
340*4bdff4beSrobert if (__next == __property::__Extend || __next == __property::__ZWJ)
341*4bdff4beSrobert return false; // GB9
342*4bdff4beSrobert
343*4bdff4beSrobert // *** Do not break before SpacingMarks, or after Prepend characters. ***
344*4bdff4beSrobert if (__next == __property::__SpacingMark) // GB9a
345*4bdff4beSrobert return false;
346*4bdff4beSrobert
347*4bdff4beSrobert if (__prev == __property::__Prepend) // GB9b
348*4bdff4beSrobert return false;
349*4bdff4beSrobert
350*4bdff4beSrobert // *** Do not break within emoji modifier sequences or emoji zwj sequences. ***
351*4bdff4beSrobert
352*4bdff4beSrobert // GB11 \p{Extended_Pictographic} Extend* ZWJ x \p{Extended_Pictographic}
353*4bdff4beSrobert //
354*4bdff4beSrobert // Note that several parts of this rule are matched by GB9: Any x (Extend | ZWJ)
355*4bdff4beSrobert // - \p{Extended_Pictographic} x Extend
356*4bdff4beSrobert // - Extend x Extend
357*4bdff4beSrobert // - \p{Extended_Pictographic} x ZWJ
358*4bdff4beSrobert // - Extend x ZWJ
359*4bdff4beSrobert //
360*4bdff4beSrobert // So the only case left to test is
361*4bdff4beSrobert // - \p{Extended_Pictographic}' x ZWJ x \p{Extended_Pictographic}
362*4bdff4beSrobert // where \p{Extended_Pictographic}' is stored in __has_extened_pictographic
363*4bdff4beSrobert if (__has_extened_pictographic && __prev == __property::__ZWJ && __next == __property::__Extended_Pictographic)
364*4bdff4beSrobert return false;
365*4bdff4beSrobert
366*4bdff4beSrobert // *** Do not break within emoji flag sequences ***
367*4bdff4beSrobert
368*4bdff4beSrobert // That is, do not break between regional indicator (RI) symbols if there
369*4bdff4beSrobert // is an odd number of RI characters before the break point.
370*4bdff4beSrobert
371*4bdff4beSrobert if (__prev == __property::__Regional_Indicator && __next == __property::__Regional_Indicator) { // GB12 + GB13
372*4bdff4beSrobert __ri_break_allowed = !__ri_break_allowed;
373*4bdff4beSrobert return __ri_break_allowed;
374*4bdff4beSrobert }
375*4bdff4beSrobert
376*4bdff4beSrobert // *** Otherwise, break everywhere. ***
377*4bdff4beSrobert return true; // GB999
378*4bdff4beSrobert }
379*4bdff4beSrobert
380*4bdff4beSrobert /// Helper class to extract an extended grapheme cluster from a Unicode character range.
381*4bdff4beSrobert ///
382*4bdff4beSrobert /// This function is used to determine the column width of an extended grapheme
383*4bdff4beSrobert /// cluster. In order to do that only the first code point is evaluated.
384*4bdff4beSrobert /// Therefore only this code point is extracted.
385*4bdff4beSrobert template <class _CharT>
386*4bdff4beSrobert class __extended_grapheme_cluster_view {
387*4bdff4beSrobert public:
__extended_grapheme_cluster_view(const _CharT * __first,const _CharT * __last)388*4bdff4beSrobert _LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_view(const _CharT* __first, const _CharT* __last)
389*4bdff4beSrobert : __code_point_view_(__first, __last),
390*4bdff4beSrobert __next_code_point_(__code_point_view_.__consume()),
391*4bdff4beSrobert __next_prop_(__extended_grapheme_custer_property_boundary::__get_property(__next_code_point_)) {}
392*4bdff4beSrobert
393*4bdff4beSrobert struct __cluster {
394*4bdff4beSrobert /// The first code point of the extended grapheme cluster.
395*4bdff4beSrobert ///
396*4bdff4beSrobert /// The first code point is used to estimate the width of the extended
397*4bdff4beSrobert /// grapheme cluster.
398*4bdff4beSrobert char32_t __code_point_;
399*4bdff4beSrobert
400*4bdff4beSrobert /// Points one beyond the last code unit in the extended grapheme cluster.
401*4bdff4beSrobert ///
402*4bdff4beSrobert /// It's expected the caller has the start position and thus can determine
403*4bdff4beSrobert /// the code unit range of the extended grapheme cluster.
404*4bdff4beSrobert const _CharT* __last_;
405*4bdff4beSrobert };
406*4bdff4beSrobert
__consume()407*4bdff4beSrobert _LIBCPP_HIDE_FROM_ABI constexpr __cluster __consume() {
408*4bdff4beSrobert _LIBCPP_ASSERT(
409*4bdff4beSrobert __next_prop_ != __extended_grapheme_custer_property_boundary::__property::__eot,
410*4bdff4beSrobert "can't move beyond the end of input");
411*4bdff4beSrobert char32_t __code_point = __next_code_point_;
412*4bdff4beSrobert if (!__code_point_view_.__at_end())
413*4bdff4beSrobert return {__code_point, __get_break()};
414*4bdff4beSrobert
415*4bdff4beSrobert __next_prop_ = __extended_grapheme_custer_property_boundary::__property::__eot;
416*4bdff4beSrobert return {__code_point, __code_point_view_.__position()};
417*4bdff4beSrobert }
418*4bdff4beSrobert
419*4bdff4beSrobert private:
420*4bdff4beSrobert __code_point_view<_CharT> __code_point_view_;
421*4bdff4beSrobert
422*4bdff4beSrobert char32_t __next_code_point_;
423*4bdff4beSrobert __extended_grapheme_custer_property_boundary::__property __next_prop_;
424*4bdff4beSrobert
__get_break()425*4bdff4beSrobert _LIBCPP_HIDE_FROM_ABI constexpr const _CharT* __get_break() {
426*4bdff4beSrobert bool __ri_break_allowed = true;
427*4bdff4beSrobert bool __has_extened_pictographic = false;
428*4bdff4beSrobert while (true) {
429*4bdff4beSrobert const _CharT* __result = __code_point_view_.__position();
430*4bdff4beSrobert __extended_grapheme_custer_property_boundary::__property __prev = __next_prop_;
431*4bdff4beSrobert if (__code_point_view_.__at_end()) {
432*4bdff4beSrobert __next_prop_ = __extended_grapheme_custer_property_boundary::__property::__eot;
433*4bdff4beSrobert return __result;
434*4bdff4beSrobert }
435*4bdff4beSrobert __next_code_point_ = __code_point_view_.__consume();
436*4bdff4beSrobert __next_prop_ = __extended_grapheme_custer_property_boundary::__get_property(__next_code_point_);
437*4bdff4beSrobert
438*4bdff4beSrobert __has_extened_pictographic |=
439*4bdff4beSrobert __prev == __extended_grapheme_custer_property_boundary::__property::__Extended_Pictographic;
440*4bdff4beSrobert
441*4bdff4beSrobert if (__at_extended_grapheme_cluster_break(__ri_break_allowed, __has_extened_pictographic, __prev, __next_prop_))
442*4bdff4beSrobert return __result;
443*4bdff4beSrobert }
444*4bdff4beSrobert }
445*4bdff4beSrobert };
446*4bdff4beSrobert
447*4bdff4beSrobert template <class _CharT>
448*4bdff4beSrobert __extended_grapheme_cluster_view(const _CharT*, const _CharT*) -> __extended_grapheme_cluster_view<_CharT>;
449*4bdff4beSrobert
450*4bdff4beSrobert # else // _LIBCPP_HAS_NO_UNICODE
451*4bdff4beSrobert
// Without Unicode support every character is treated as a "code point".
// Offering the same interface makes it easier to write code agnostic of the
// _LIBCPP_HAS_NO_UNICODE define.
template <class _CharT>
class __code_point_view {
public:
  _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(const _CharT* __first, const _CharT* __last)
      : __first_(__first), __last_(__last) {}

  // Returns whether all characters have been consumed.
  _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
  // Returns the position of the next character that will be consumed.
  _LIBCPP_HIDE_FROM_ABI constexpr const _CharT* __position() const noexcept { return __first_; }

  // Consumes one character and returns it converted to char32_t.
  _LIBCPP_HIDE_FROM_ABI constexpr char32_t __consume() noexcept {
    _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");
    const _CharT __unit = *__first_;
    ++__first_;
    return __unit;
  }

#  if _LIBCPP_STD_VER > 20
  // Consumes one character. The result is always well formed; its value is
  // the character converted to its unsigned type.
  _LIBCPP_HIDE_FROM_ABI constexpr __consume_p2286_result __consume_p2286() noexcept {
    _LIBCPP_ASSERT(__first_ != __last_, "can't move beyond the end of input");

    const auto __unit = std::make_unsigned_t<_CharT>(*__first_);
    ++__first_;
    return {0, __unit};
  }
#  endif // _LIBCPP_STD_VER > 20

private:
  const _CharT* __first_;
  const _CharT* __last_;
};
480*4bdff4beSrobert
481*4bdff4beSrobert # endif // _LIBCPP_HAS_NO_UNICODE
482*4bdff4beSrobert
483*4bdff4beSrobert } // namespace __unicode
484*4bdff4beSrobert
485*4bdff4beSrobert #endif //_LIBCPP_STD_VER > 17
486*4bdff4beSrobert
487*4bdff4beSrobert _LIBCPP_END_NAMESPACE_STD
488*4bdff4beSrobert
489*4bdff4beSrobert #endif // _LIBCPP___FORMAT_UNICODE_H
490