xref: /llvm-project/libcxx/include/__cxx03/__format/unicode.h (revision ce7771902dc50d900de639d499a60486b83f70e0)
1e78f53d1SNikolas Klauser // -*- C++ -*-
2e78f53d1SNikolas Klauser //===----------------------------------------------------------------------===//
3e78f53d1SNikolas Klauser //
4e78f53d1SNikolas Klauser // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5e78f53d1SNikolas Klauser // See https://llvm.org/LICENSE.txt for license information.
6e78f53d1SNikolas Klauser // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7e78f53d1SNikolas Klauser //
8e78f53d1SNikolas Klauser //===----------------------------------------------------------------------===//
9e78f53d1SNikolas Klauser 
10*ce777190SNikolas Klauser #ifndef _LIBCPP___CXX03___FORMAT_UNICODE_H
11*ce777190SNikolas Klauser #define _LIBCPP___CXX03___FORMAT_UNICODE_H
12e78f53d1SNikolas Klauser 
1373fbae83SNikolas Klauser #include <__cxx03/__assert>
1473fbae83SNikolas Klauser #include <__cxx03/__bit/countl.h>
1573fbae83SNikolas Klauser #include <__cxx03/__concepts/same_as.h>
1673fbae83SNikolas Klauser #include <__cxx03/__config>
1773fbae83SNikolas Klauser #include <__cxx03/__format/extended_grapheme_cluster_table.h>
1873fbae83SNikolas Klauser #include <__cxx03/__format/indic_conjunct_break_table.h>
1973fbae83SNikolas Klauser #include <__cxx03/__iterator/concepts.h>
2073fbae83SNikolas Klauser #include <__cxx03/__iterator/readable_traits.h> // iter_value_t
2173fbae83SNikolas Klauser #include <__cxx03/__utility/unreachable.h>
2273fbae83SNikolas Klauser #include <__cxx03/string_view>
23e78f53d1SNikolas Klauser 
24e78f53d1SNikolas Klauser #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
25e78f53d1SNikolas Klauser #  pragma GCC system_header
26e78f53d1SNikolas Klauser #endif
27e78f53d1SNikolas Klauser 
28e78f53d1SNikolas Klauser _LIBCPP_BEGIN_NAMESPACE_STD
29e78f53d1SNikolas Klauser 
30e78f53d1SNikolas Klauser #if _LIBCPP_STD_VER >= 20
31e78f53d1SNikolas Klauser 
32e78f53d1SNikolas Klauser namespace __unicode {
33e78f53d1SNikolas Klauser 
34e78f53d1SNikolas Klauser // Helper struct for the result of a consume operation.
35e78f53d1SNikolas Klauser //
36e78f53d1SNikolas Klauser // The status value for a correct code point is 0. This allows a valid value to
37e78f53d1SNikolas Klauser // be used without masking.
// When the decoding fails it knows the number of code units affected. For the
// current use-cases that value is not needed, therefore it is not stored.
40e78f53d1SNikolas Klauser // The escape routine needs the number of code units for both a valid and
41e78f53d1SNikolas Klauser // invalid character and keeps track of it itself. Doing it in this result
42e78f53d1SNikolas Klauser // unconditionally would give some overhead when the value is unneeded.
struct __consume_result {
  // Holds the decoded code point when __status == __ok.
  // Otherwise it holds the replacement character U+FFFD.
  char32_t __code_point : 31;

  // Outcome of the consume operation. __ok is 0 and is the default, so the
  // struct can be created from just a code point.
  enum : char32_t {
    __ok    = 0, // Consumed a well-formed code point.
    __error = 1  // Encountered invalid UTF-8
  } __status : 1 {__ok};
};
// Both bit-fields together must occupy exactly one char32_t.
static_assert(sizeof(__consume_result) == sizeof(char32_t));
56e78f53d1SNikolas Klauser 
57e78f53d1SNikolas Klauser #  ifndef _LIBCPP_HAS_NO_UNICODE
58e78f53d1SNikolas Klauser 
59e78f53d1SNikolas Klauser /// Implements the grapheme cluster boundary rules
60e78f53d1SNikolas Klauser ///
61e78f53d1SNikolas Klauser /// These rules are used to implement format's width estimation as stated in
62e78f53d1SNikolas Klauser /// [format.string.std]/11
63e78f53d1SNikolas Klauser ///
64e78f53d1SNikolas Klauser /// The Standard refers to UAX \#29 for Unicode 12.0.0
65e78f53d1SNikolas Klauser /// https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
66e78f53d1SNikolas Klauser ///
67e78f53d1SNikolas Klauser /// The data tables used are
68e78f53d1SNikolas Klauser /// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakProperty.txt
69e78f53d1SNikolas Klauser /// https://www.unicode.org/Public/UCD/latest/ucd/emoji/emoji-data.txt
70e78f53d1SNikolas Klauser /// https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt (for testing only)
71e78f53d1SNikolas Klauser 
72e78f53d1SNikolas Klauser inline constexpr char32_t __replacement_character = U'\ufffd';
73e78f53d1SNikolas Klauser 
74e78f53d1SNikolas Klauser // The error of a consume operation.
75e78f53d1SNikolas Klauser //
76e78f53d1SNikolas Klauser // This sets the code point to the replacement character. This code point does
77e78f53d1SNikolas Klauser // not participate in the grapheme clustering, so grapheme clustering code can
78e78f53d1SNikolas Klauser // ignore the error status and always use the code point.
79e78f53d1SNikolas Klauser inline constexpr __consume_result __consume_result_error{__replacement_character, __consume_result::__error};
80e78f53d1SNikolas Klauser 
// Returns whether __value is in the range [U+D800, U+DBFF].
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_high_surrogate(char32_t __value) {
  // The range is a 1024-aligned block of 1024 values, so dropping the low 10
  // bits leaves a single value to compare against.
  return (__value >> 10) == (0xd800 >> 10);
}
84e78f53d1SNikolas Klauser 
// Returns whether __value is in the range [U+DC00, U+DFFF].
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool __is_low_surrogate(char32_t __value) {
  // The range is a 1024-aligned block of 1024 values, so dropping the low 10
  // bits leaves a single value to compare against.
  return (__value >> 10) == (0xdc00 >> 10);
}
88e78f53d1SNikolas Klauser 
89e78f53d1SNikolas Klauser // https://www.unicode.org/glossary/#surrogate_code_point
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_surrogate(char32_t __value) {
  // [U+D800, U+DFFF] is a 2048-aligned block of 2048 values, so dropping the
  // low 11 bits leaves a single value to compare against.
  return (__value >> 11) == (0xd800 >> 11);
}
93e78f53d1SNikolas Klauser 
94e78f53d1SNikolas Klauser // https://www.unicode.org/glossary/#code_point
[[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_code_point(char32_t __value) {
  // Accepts every value up to and including U+10FFFF.
  return __value < 0x110000;
}
98e78f53d1SNikolas Klauser 
99e78f53d1SNikolas Klauser // https://www.unicode.org/glossary/#unicode_scalar_value
100e78f53d1SNikolas Klauser [[nodiscard]] _LIBCPP_HIDE_FROM_ABI inline constexpr bool __is_scalar_value(char32_t __value) {
101e78f53d1SNikolas Klauser   return __unicode::__is_code_point(__value) && !__unicode::__is_surrogate(__value);
102e78f53d1SNikolas Klauser }
103e78f53d1SNikolas Klauser 
104e78f53d1SNikolas Klauser template <contiguous_iterator _Iterator>
105e78f53d1SNikolas Klauser   requires same_as<iter_value_t<_Iterator>, char>
106e78f53d1SNikolas Klauser _LIBCPP_HIDE_FROM_ABI constexpr bool __is_continuation(_Iterator __char, int __count) {
107e78f53d1SNikolas Klauser   do {
108e78f53d1SNikolas Klauser     if ((*__char & 0b1100'0000) != 0b1000'0000)
109e78f53d1SNikolas Klauser       return false;
110e78f53d1SNikolas Klauser     --__count;
111e78f53d1SNikolas Klauser     ++__char;
112e78f53d1SNikolas Klauser   } while (__count);
113e78f53d1SNikolas Klauser   return true;
114e78f53d1SNikolas Klauser }
115e78f53d1SNikolas Klauser 
/// Helper class to extract a code point from a range of Unicode code units.
///
/// The stored range is a view. There are multiple specializations for different
/// character types.
120e78f53d1SNikolas Klauser template <class _CharT>
121e78f53d1SNikolas Klauser class __code_point_view;
122e78f53d1SNikolas Klauser 
123e78f53d1SNikolas Klauser /// UTF-8 specialization.
124e78f53d1SNikolas Klauser template <>
125e78f53d1SNikolas Klauser class __code_point_view<char> {
126e78f53d1SNikolas Klauser   using _Iterator = basic_string_view<char>::const_iterator;
127e78f53d1SNikolas Klauser 
128e78f53d1SNikolas Klauser public:
129e78f53d1SNikolas Klauser   _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(_Iterator __first, _Iterator __last)
130e78f53d1SNikolas Klauser       : __first_(__first), __last_(__last) {}
131e78f53d1SNikolas Klauser 
132e78f53d1SNikolas Klauser   _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
133e78f53d1SNikolas Klauser   _LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
134e78f53d1SNikolas Klauser 
135e78f53d1SNikolas Klauser   // https://www.unicode.org/versions/latest/ch03.pdf#G7404
136e78f53d1SNikolas Klauser   // Based on Table 3-7, Well-Formed UTF-8 Byte Sequences
137e78f53d1SNikolas Klauser   //
138e78f53d1SNikolas Klauser   // Code Points        First Byte Second Byte Third Byte Fourth Byte  Remarks
139e78f53d1SNikolas Klauser   // U+0000..U+007F     00..7F                                         U+0000..U+007F 1 code unit range
140e78f53d1SNikolas Klauser   //                    C0..C1     80..BF                              invalid overlong encoding
141e78f53d1SNikolas Klauser   // U+0080..U+07FF     C2..DF     80..BF                              U+0080..U+07FF 2 code unit range
142e78f53d1SNikolas Klauser   //                    E0         80..9F      80..BF                  invalid overlong encoding
143e78f53d1SNikolas Klauser   // U+0800..U+0FFF     E0         A0..BF      80..BF                  U+0800..U+FFFF 3 code unit range
144e78f53d1SNikolas Klauser   // U+1000..U+CFFF     E1..EC     80..BF      80..BF
145e78f53d1SNikolas Klauser   // U+D000..U+D7FF     ED         80..9F      80..BF
146e78f53d1SNikolas Klauser   // U+D800..U+DFFF     ED         A0..BF      80..BF                  invalid encoding of surrogate code point
147e78f53d1SNikolas Klauser   // U+E000..U+FFFF     EE..EF     80..BF      80..BF
148e78f53d1SNikolas Klauser   //                    F0         80..8F      80..BF     80..BF       invalid overlong encoding
149e78f53d1SNikolas Klauser   // U+10000..U+3FFFF   F0         90..BF      80..BF     80..BF       U+10000..U+10FFFF 4 code unit range
150e78f53d1SNikolas Klauser   // U+40000..U+FFFFF   F1..F3     80..BF      80..BF     80..BF
151e78f53d1SNikolas Klauser   // U+100000..U+10FFFF F4         80..8F      80..BF     80..BF
152e78f53d1SNikolas Klauser   //                    F4         90..BF      80..BF     80..BF       U+110000.. invalid code point range
153e78f53d1SNikolas Klauser   //
154e78f53d1SNikolas Klauser   // Unlike other parsers, these invalid entries are tested after decoding.
155e78f53d1SNikolas Klauser   // - The parser always needs to consume these code units
156e78f53d1SNikolas Klauser   // - The code is optimized for well-formed UTF-8
157e78f53d1SNikolas Klauser   [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
158e78f53d1SNikolas Klauser     _LIBCPP_ASSERT_INTERNAL(__first_ != __last_, "can't move beyond the end of input");
159e78f53d1SNikolas Klauser 
160e78f53d1SNikolas Klauser     // Based on the number of leading 1 bits the number of code units in the
161e78f53d1SNikolas Klauser     // code point can be determined. See
162e78f53d1SNikolas Klauser     // https://en.wikipedia.org/wiki/UTF-8#Encoding
163e78f53d1SNikolas Klauser     switch (std::countl_one(static_cast<unsigned char>(*__first_))) {
164e78f53d1SNikolas Klauser     case 0:
165e78f53d1SNikolas Klauser       return {static_cast<unsigned char>(*__first_++)};
166e78f53d1SNikolas Klauser 
167e78f53d1SNikolas Klauser     case 2: {
168e78f53d1SNikolas Klauser       if (__last_ - __first_ < 2 || !__unicode::__is_continuation(__first_ + 1, 1)) [[unlikely]]
169e78f53d1SNikolas Klauser         break;
170e78f53d1SNikolas Klauser 
171e78f53d1SNikolas Klauser       char32_t __value = static_cast<unsigned char>(*__first_++) & 0x1f;
172e78f53d1SNikolas Klauser       __value <<= 6;
173e78f53d1SNikolas Klauser       __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
174e78f53d1SNikolas Klauser 
175e78f53d1SNikolas Klauser       // These values should be encoded in 1 UTF-8 code unit.
176e78f53d1SNikolas Klauser       if (__value < 0x0080) [[unlikely]]
177e78f53d1SNikolas Klauser         return __consume_result_error;
178e78f53d1SNikolas Klauser 
179e78f53d1SNikolas Klauser       return {__value};
180e78f53d1SNikolas Klauser     }
181e78f53d1SNikolas Klauser 
182e78f53d1SNikolas Klauser     case 3: {
183e78f53d1SNikolas Klauser       if (__last_ - __first_ < 3 || !__unicode::__is_continuation(__first_ + 1, 2)) [[unlikely]]
184e78f53d1SNikolas Klauser         break;
185e78f53d1SNikolas Klauser 
186e78f53d1SNikolas Klauser       char32_t __value = static_cast<unsigned char>(*__first_++) & 0x0f;
187e78f53d1SNikolas Klauser       __value <<= 6;
188e78f53d1SNikolas Klauser       __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
189e78f53d1SNikolas Klauser       __value <<= 6;
190e78f53d1SNikolas Klauser       __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
191e78f53d1SNikolas Klauser 
192e78f53d1SNikolas Klauser       // These values should be encoded in 1 or 2 UTF-8 code units.
193e78f53d1SNikolas Klauser       if (__value < 0x0800) [[unlikely]]
194e78f53d1SNikolas Klauser         return __consume_result_error;
195e78f53d1SNikolas Klauser 
196e78f53d1SNikolas Klauser       // A surrogate value is always encoded in 3 UTF-8 code units.
197e78f53d1SNikolas Klauser       if (__unicode::__is_surrogate(__value)) [[unlikely]]
198e78f53d1SNikolas Klauser         return __consume_result_error;
199e78f53d1SNikolas Klauser 
200e78f53d1SNikolas Klauser       return {__value};
201e78f53d1SNikolas Klauser     }
202e78f53d1SNikolas Klauser 
203e78f53d1SNikolas Klauser     case 4: {
204e78f53d1SNikolas Klauser       if (__last_ - __first_ < 4 || !__unicode::__is_continuation(__first_ + 1, 3)) [[unlikely]]
205e78f53d1SNikolas Klauser         break;
206e78f53d1SNikolas Klauser 
207e78f53d1SNikolas Klauser       char32_t __value = static_cast<unsigned char>(*__first_++) & 0x07;
208e78f53d1SNikolas Klauser       __value <<= 6;
209e78f53d1SNikolas Klauser       __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
210e78f53d1SNikolas Klauser       __value <<= 6;
211e78f53d1SNikolas Klauser       __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
212e78f53d1SNikolas Klauser       __value <<= 6;
213e78f53d1SNikolas Klauser       __value |= static_cast<unsigned char>(*__first_++) & 0x3f;
214e78f53d1SNikolas Klauser 
215e78f53d1SNikolas Klauser       // These values should be encoded in 1, 2, or 3 UTF-8 code units.
216e78f53d1SNikolas Klauser       if (__value < 0x10000) [[unlikely]]
217e78f53d1SNikolas Klauser         return __consume_result_error;
218e78f53d1SNikolas Klauser 
219e78f53d1SNikolas Klauser       // A value too large is always encoded in 4 UTF-8 code units.
220e78f53d1SNikolas Klauser       if (!__unicode::__is_code_point(__value)) [[unlikely]]
221e78f53d1SNikolas Klauser         return __consume_result_error;
222e78f53d1SNikolas Klauser 
223e78f53d1SNikolas Klauser       return {__value};
224e78f53d1SNikolas Klauser     }
225e78f53d1SNikolas Klauser     }
226e78f53d1SNikolas Klauser     // An invalid number of leading ones can be garbage or a code unit in the
227e78f53d1SNikolas Klauser     // middle of a code point. By consuming one code unit the parser may get
228e78f53d1SNikolas Klauser     // "in sync" after a few code units.
229e78f53d1SNikolas Klauser     ++__first_;
230e78f53d1SNikolas Klauser     return __consume_result_error;
231e78f53d1SNikolas Klauser   }
232e78f53d1SNikolas Klauser 
233e78f53d1SNikolas Klauser private:
234e78f53d1SNikolas Klauser   _Iterator __first_;
235e78f53d1SNikolas Klauser   _Iterator __last_;
236e78f53d1SNikolas Klauser };
237e78f53d1SNikolas Klauser 
238e78f53d1SNikolas Klauser #    ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
// Returns whether __value is in the range [0xd800, 0xdbff].
_LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_high(wchar_t __value) {
  return !(__value < 0xd800 || __value > 0xdbff);
}
242e78f53d1SNikolas Klauser 
// Returns whether __value is in the range [0xdc00, 0xdfff].
_LIBCPP_HIDE_FROM_ABI constexpr bool __is_surrogate_pair_low(wchar_t __value) {
  return !(__value < 0xdc00 || __value > 0xdfff);
}
246e78f53d1SNikolas Klauser 
247e78f53d1SNikolas Klauser /// This specialization depends on the size of wchar_t
248e78f53d1SNikolas Klauser /// - 2 UTF-16 (for example Windows and AIX)
249e78f53d1SNikolas Klauser /// - 4 UTF-32 (for example Linux)
250e78f53d1SNikolas Klauser template <>
251e78f53d1SNikolas Klauser class __code_point_view<wchar_t> {
252e78f53d1SNikolas Klauser   using _Iterator = typename basic_string_view<wchar_t>::const_iterator;
253e78f53d1SNikolas Klauser 
254e78f53d1SNikolas Klauser public:
255e78f53d1SNikolas Klauser   static_assert(sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4, "sizeof(wchar_t) has a not implemented value");
256e78f53d1SNikolas Klauser 
257e78f53d1SNikolas Klauser   _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(_Iterator __first, _Iterator __last)
258e78f53d1SNikolas Klauser       : __first_(__first), __last_(__last) {}
259e78f53d1SNikolas Klauser 
260e78f53d1SNikolas Klauser   _LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
261e78f53d1SNikolas Klauser   _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
262e78f53d1SNikolas Klauser 
263e78f53d1SNikolas Klauser   [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
264e78f53d1SNikolas Klauser     _LIBCPP_ASSERT_INTERNAL(__first_ != __last_, "can't move beyond the end of input");
265e78f53d1SNikolas Klauser 
266e78f53d1SNikolas Klauser     char32_t __value = static_cast<char32_t>(*__first_++);
267e78f53d1SNikolas Klauser     if constexpr (sizeof(wchar_t) == 2) {
268e78f53d1SNikolas Klauser       if (__unicode::__is_low_surrogate(__value)) [[unlikely]]
269e78f53d1SNikolas Klauser         return __consume_result_error;
270e78f53d1SNikolas Klauser 
271e78f53d1SNikolas Klauser       if (__unicode::__is_high_surrogate(__value)) {
272e78f53d1SNikolas Klauser         if (__first_ == __last_ || !__unicode::__is_low_surrogate(static_cast<char32_t>(*__first_))) [[unlikely]]
273e78f53d1SNikolas Klauser           return __consume_result_error;
274e78f53d1SNikolas Klauser 
275e78f53d1SNikolas Klauser         __value -= 0xd800;
276e78f53d1SNikolas Klauser         __value <<= 10;
277e78f53d1SNikolas Klauser         __value += static_cast<char32_t>(*__first_++) - 0xdc00;
278e78f53d1SNikolas Klauser         __value += 0x10000;
279e78f53d1SNikolas Klauser 
280e78f53d1SNikolas Klauser         if (!__unicode::__is_code_point(__value)) [[unlikely]]
281e78f53d1SNikolas Klauser           return __consume_result_error;
282e78f53d1SNikolas Klauser       }
283e78f53d1SNikolas Klauser     } else {
284e78f53d1SNikolas Klauser       if (!__unicode::__is_scalar_value(__value)) [[unlikely]]
285e78f53d1SNikolas Klauser         return __consume_result_error;
286e78f53d1SNikolas Klauser     }
287e78f53d1SNikolas Klauser 
288e78f53d1SNikolas Klauser     return {__value};
289e78f53d1SNikolas Klauser   }
290e78f53d1SNikolas Klauser 
291e78f53d1SNikolas Klauser private:
292e78f53d1SNikolas Klauser   _Iterator __first_;
293e78f53d1SNikolas Klauser   _Iterator __last_;
294e78f53d1SNikolas Klauser };
295e78f53d1SNikolas Klauser #    endif // _LIBCPP_HAS_NO_WIDE_CHARACTERS
296e78f53d1SNikolas Klauser 
297e78f53d1SNikolas Klauser // State machine to implement the Extended Grapheme Cluster Boundary
298e78f53d1SNikolas Klauser //
299e78f53d1SNikolas Klauser // The exact rules may change between Unicode versions.
300e78f53d1SNikolas Klauser // This implements the extended rules see
301e78f53d1SNikolas Klauser // https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
302e78f53d1SNikolas Klauser class __extended_grapheme_cluster_break {
303e78f53d1SNikolas Klauser   using __EGC_property  = __extended_grapheme_custer_property_boundary::__property;
304e78f53d1SNikolas Klauser   using __inCB_property = __indic_conjunct_break::__property;
305e78f53d1SNikolas Klauser 
306e78f53d1SNikolas Klauser public:
307e78f53d1SNikolas Klauser   _LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_break(char32_t __first_code_point)
308e78f53d1SNikolas Klauser       : __prev_code_point_(__first_code_point),
309e78f53d1SNikolas Klauser         __prev_property_(__extended_grapheme_custer_property_boundary::__get_property(__first_code_point)) {
310e78f53d1SNikolas Klauser     // Initializes the active rule.
311e78f53d1SNikolas Klauser     if (__prev_property_ == __EGC_property::__Extended_Pictographic)
312e78f53d1SNikolas Klauser       __active_rule_ = __rule::__GB11_emoji;
313e78f53d1SNikolas Klauser     else if (__prev_property_ == __EGC_property::__Regional_Indicator)
314e78f53d1SNikolas Klauser       __active_rule_ = __rule::__GB12_GB13_regional_indicator;
315e78f53d1SNikolas Klauser     else if (__indic_conjunct_break::__get_property(__first_code_point) == __inCB_property::__Consonant)
316e78f53d1SNikolas Klauser       __active_rule_ = __rule::__GB9c_indic_conjunct_break;
317e78f53d1SNikolas Klauser   }
318e78f53d1SNikolas Klauser 
319e78f53d1SNikolas Klauser   [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool operator()(char32_t __next_code_point) {
320e78f53d1SNikolas Klauser     __EGC_property __next_property = __extended_grapheme_custer_property_boundary::__get_property(__next_code_point);
321e78f53d1SNikolas Klauser     bool __result                  = __evaluate(__next_code_point, __next_property);
322e78f53d1SNikolas Klauser     __prev_code_point_             = __next_code_point;
323e78f53d1SNikolas Klauser     __prev_property_               = __next_property;
324e78f53d1SNikolas Klauser     return __result;
325e78f53d1SNikolas Klauser   }
326e78f53d1SNikolas Klauser 
327e78f53d1SNikolas Klauser   // The code point whose break propery are considered during the next
328e78f53d1SNikolas Klauser   // evaluation cyle.
329e78f53d1SNikolas Klauser   [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr char32_t __current_code_point() const { return __prev_code_point_; }
330e78f53d1SNikolas Klauser 
331e78f53d1SNikolas Klauser private:
332e78f53d1SNikolas Klauser   // The naming of the identifiers matches the Unicode standard.
333e78f53d1SNikolas Klauser   // NOLINTBEGIN(readability-identifier-naming)
334e78f53d1SNikolas Klauser 
335e78f53d1SNikolas Klauser   [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool
336e78f53d1SNikolas Klauser   __evaluate(char32_t __next_code_point, __EGC_property __next_property) {
337e78f53d1SNikolas Klauser     switch (__active_rule_) {
338e78f53d1SNikolas Klauser     case __rule::__none:
339e78f53d1SNikolas Klauser       return __evaluate_none(__next_code_point, __next_property);
340e78f53d1SNikolas Klauser     case __rule::__GB9c_indic_conjunct_break:
341e78f53d1SNikolas Klauser       return __evaluate_GB9c_indic_conjunct_break(__next_code_point, __next_property);
342e78f53d1SNikolas Klauser     case __rule::__GB11_emoji:
343e78f53d1SNikolas Klauser       return __evaluate_GB11_emoji(__next_code_point, __next_property);
344e78f53d1SNikolas Klauser     case __rule::__GB12_GB13_regional_indicator:
345e78f53d1SNikolas Klauser       return __evaluate_GB12_GB13_regional_indicator(__next_code_point, __next_property);
346e78f53d1SNikolas Klauser     }
347e78f53d1SNikolas Klauser     __libcpp_unreachable();
348e78f53d1SNikolas Klauser   }
349e78f53d1SNikolas Klauser 
350e78f53d1SNikolas Klauser   _LIBCPP_HIDE_FROM_ABI constexpr bool __evaluate_none(char32_t __next_code_point, __EGC_property __next_property) {
351e78f53d1SNikolas Klauser     // *** Break at the start and end of text, unless the text is empty. ***
352e78f53d1SNikolas Klauser 
353e78f53d1SNikolas Klauser     _LIBCPP_ASSERT_INTERNAL(__prev_property_ != __EGC_property::__sot, "should be handled in the constructor"); // GB1
354e78f53d1SNikolas Klauser     _LIBCPP_ASSERT_INTERNAL(__prev_property_ != __EGC_property::__eot, "should be handled by our caller");      // GB2
355e78f53d1SNikolas Klauser 
356e78f53d1SNikolas Klauser     // *** Do not break between a CR and LF. Otherwise, break before and after controls. ***
357e78f53d1SNikolas Klauser     if (__prev_property_ == __EGC_property::__CR && __next_property == __EGC_property::__LF) // GB3
358e78f53d1SNikolas Klauser       return false;
359e78f53d1SNikolas Klauser 
360e78f53d1SNikolas Klauser     if (__prev_property_ == __EGC_property::__Control || __prev_property_ == __EGC_property::__CR ||
361e78f53d1SNikolas Klauser         __prev_property_ == __EGC_property::__LF) // GB4
362e78f53d1SNikolas Klauser       return true;
363e78f53d1SNikolas Klauser 
364e78f53d1SNikolas Klauser     if (__next_property == __EGC_property::__Control || __next_property == __EGC_property::__CR ||
365e78f53d1SNikolas Klauser         __next_property == __EGC_property::__LF) // GB5
366e78f53d1SNikolas Klauser       return true;
367e78f53d1SNikolas Klauser 
368e78f53d1SNikolas Klauser     // *** Do not break Hangul syllable sequences. ***
369e78f53d1SNikolas Klauser     if (__prev_property_ == __EGC_property::__L &&
370e78f53d1SNikolas Klauser         (__next_property == __EGC_property::__L || __next_property == __EGC_property::__V ||
371e78f53d1SNikolas Klauser          __next_property == __EGC_property::__LV || __next_property == __EGC_property::__LVT)) // GB6
372e78f53d1SNikolas Klauser       return false;
373e78f53d1SNikolas Klauser 
374e78f53d1SNikolas Klauser     if ((__prev_property_ == __EGC_property::__LV || __prev_property_ == __EGC_property::__V) &&
375e78f53d1SNikolas Klauser         (__next_property == __EGC_property::__V || __next_property == __EGC_property::__T)) // GB7
376e78f53d1SNikolas Klauser       return false;
377e78f53d1SNikolas Klauser 
378e78f53d1SNikolas Klauser     if ((__prev_property_ == __EGC_property::__LVT || __prev_property_ == __EGC_property::__T) &&
379e78f53d1SNikolas Klauser         __next_property == __EGC_property::__T) // GB8
380e78f53d1SNikolas Klauser       return false;
381e78f53d1SNikolas Klauser 
382e78f53d1SNikolas Klauser     // *** Do not break before extending characters or ZWJ. ***
383e78f53d1SNikolas Klauser     if (__next_property == __EGC_property::__Extend || __next_property == __EGC_property::__ZWJ)
384e78f53d1SNikolas Klauser       return false; // GB9
385e78f53d1SNikolas Klauser 
386e78f53d1SNikolas Klauser     // *** Do not break before SpacingMarks, or after Prepend characters. ***
387e78f53d1SNikolas Klauser     if (__next_property == __EGC_property::__SpacingMark) // GB9a
388e78f53d1SNikolas Klauser       return false;
389e78f53d1SNikolas Klauser 
390e78f53d1SNikolas Klauser     if (__prev_property_ == __EGC_property::__Prepend) // GB9b
391e78f53d1SNikolas Klauser       return false;
392e78f53d1SNikolas Klauser 
393e78f53d1SNikolas Klauser     // *** Do not break within certain combinations with Indic_Conjunct_Break (InCB)=Linker. ***
394e78f53d1SNikolas Klauser     if (__indic_conjunct_break::__get_property(__next_code_point) == __inCB_property::__Consonant) {
395e78f53d1SNikolas Klauser       __active_rule_                     = __rule::__GB9c_indic_conjunct_break;
396e78f53d1SNikolas Klauser       __GB9c_indic_conjunct_break_state_ = __GB9c_indic_conjunct_break_state::__Consonant;
397e78f53d1SNikolas Klauser       return true;
398e78f53d1SNikolas Klauser     }
399e78f53d1SNikolas Klauser 
400e78f53d1SNikolas Klauser     // *** Do not break within emoji modifier sequences or emoji zwj sequences. ***
401e78f53d1SNikolas Klauser     if (__next_property == __EGC_property::__Extended_Pictographic) {
402e78f53d1SNikolas Klauser       __active_rule_      = __rule::__GB11_emoji;
403e78f53d1SNikolas Klauser       __GB11_emoji_state_ = __GB11_emoji_state::__Extended_Pictographic;
404e78f53d1SNikolas Klauser       return true;
405e78f53d1SNikolas Klauser     }
406e78f53d1SNikolas Klauser 
407e78f53d1SNikolas Klauser     // *** Do not break within emoji flag sequences ***
408e78f53d1SNikolas Klauser 
409e78f53d1SNikolas Klauser     // That is, do not break between regional indicator (RI) symbols if there
410e78f53d1SNikolas Klauser     // is an odd number of RI characters before the break point.
411e78f53d1SNikolas Klauser     if (__next_property == __EGC_property::__Regional_Indicator) { // GB12 + GB13
412e78f53d1SNikolas Klauser       __active_rule_ = __rule::__GB12_GB13_regional_indicator;
413e78f53d1SNikolas Klauser       return true;
414e78f53d1SNikolas Klauser     }
415e78f53d1SNikolas Klauser 
416e78f53d1SNikolas Klauser     // *** Otherwise, break everywhere. ***
417e78f53d1SNikolas Klauser     return true; // GB999
418e78f53d1SNikolas Klauser   }
419e78f53d1SNikolas Klauser 
420e78f53d1SNikolas Klauser   [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool
421e78f53d1SNikolas Klauser   __evaluate_GB9c_indic_conjunct_break(char32_t __next_code_point, __EGC_property __next_property) {
422e78f53d1SNikolas Klauser     __inCB_property __break = __indic_conjunct_break::__get_property(__next_code_point);
423e78f53d1SNikolas Klauser     if (__break == __inCB_property::__none) {
424e78f53d1SNikolas Klauser       __active_rule_ = __rule::__none;
425e78f53d1SNikolas Klauser       return __evaluate_none(__next_code_point, __next_property);
426e78f53d1SNikolas Klauser     }
427e78f53d1SNikolas Klauser 
428e78f53d1SNikolas Klauser     switch (__GB9c_indic_conjunct_break_state_) {
429e78f53d1SNikolas Klauser     case __GB9c_indic_conjunct_break_state::__Consonant:
430e78f53d1SNikolas Klauser       if (__break == __inCB_property::__Extend) {
431e78f53d1SNikolas Klauser         return false;
432e78f53d1SNikolas Klauser       }
433e78f53d1SNikolas Klauser       if (__break == __inCB_property::__Linker) {
434e78f53d1SNikolas Klauser         __GB9c_indic_conjunct_break_state_ = __GB9c_indic_conjunct_break_state::__Linker;
435e78f53d1SNikolas Klauser         return false;
436e78f53d1SNikolas Klauser       }
437e78f53d1SNikolas Klauser       __active_rule_ = __rule::__none;
438e78f53d1SNikolas Klauser       return __evaluate_none(__next_code_point, __next_property);
439e78f53d1SNikolas Klauser 
440e78f53d1SNikolas Klauser     case __GB9c_indic_conjunct_break_state::__Linker:
441e78f53d1SNikolas Klauser       if (__break == __inCB_property::__Extend) {
442e78f53d1SNikolas Klauser         return false;
443e78f53d1SNikolas Klauser       }
444e78f53d1SNikolas Klauser       if (__break == __inCB_property::__Linker) {
445e78f53d1SNikolas Klauser         return false;
446e78f53d1SNikolas Klauser       }
447e78f53d1SNikolas Klauser       if (__break == __inCB_property::__Consonant) {
448e78f53d1SNikolas Klauser         __GB9c_indic_conjunct_break_state_ = __GB9c_indic_conjunct_break_state::__Consonant;
449e78f53d1SNikolas Klauser         return false;
450e78f53d1SNikolas Klauser       }
451e78f53d1SNikolas Klauser       __active_rule_ = __rule::__none;
452e78f53d1SNikolas Klauser       return __evaluate_none(__next_code_point, __next_property);
453e78f53d1SNikolas Klauser     }
454e78f53d1SNikolas Klauser     __libcpp_unreachable();
455e78f53d1SNikolas Klauser   }
456e78f53d1SNikolas Klauser 
457e78f53d1SNikolas Klauser   [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool
458e78f53d1SNikolas Klauser   __evaluate_GB11_emoji(char32_t __next_code_point, __EGC_property __next_property) {
459e78f53d1SNikolas Klauser     switch (__GB11_emoji_state_) {
460e78f53d1SNikolas Klauser     case __GB11_emoji_state::__Extended_Pictographic:
461e78f53d1SNikolas Klauser       if (__next_property == __EGC_property::__Extend) {
462e78f53d1SNikolas Klauser         __GB11_emoji_state_ = __GB11_emoji_state::__Extend;
463e78f53d1SNikolas Klauser         return false;
464e78f53d1SNikolas Klauser       }
465e78f53d1SNikolas Klauser       [[fallthrough]];
466e78f53d1SNikolas Klauser     case __GB11_emoji_state::__Extend:
467e78f53d1SNikolas Klauser       if (__next_property == __EGC_property::__ZWJ) {
468e78f53d1SNikolas Klauser         __GB11_emoji_state_ = __GB11_emoji_state::__ZWJ;
469e78f53d1SNikolas Klauser         return false;
470e78f53d1SNikolas Klauser       }
471e78f53d1SNikolas Klauser       if (__next_property == __EGC_property::__Extend)
472e78f53d1SNikolas Klauser         return false;
473e78f53d1SNikolas Klauser       __active_rule_ = __rule::__none;
474e78f53d1SNikolas Klauser       return __evaluate_none(__next_code_point, __next_property);
475e78f53d1SNikolas Klauser 
476e78f53d1SNikolas Klauser     case __GB11_emoji_state::__ZWJ:
477e78f53d1SNikolas Klauser       if (__next_property == __EGC_property::__Extended_Pictographic) {
478e78f53d1SNikolas Klauser         __GB11_emoji_state_ = __GB11_emoji_state::__Extended_Pictographic;
479e78f53d1SNikolas Klauser         return false;
480e78f53d1SNikolas Klauser       }
481e78f53d1SNikolas Klauser       __active_rule_ = __rule::__none;
482e78f53d1SNikolas Klauser       return __evaluate_none(__next_code_point, __next_property);
483e78f53d1SNikolas Klauser     }
484e78f53d1SNikolas Klauser     __libcpp_unreachable();
485e78f53d1SNikolas Klauser   }
486e78f53d1SNikolas Klauser 
487e78f53d1SNikolas Klauser   [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool
488e78f53d1SNikolas Klauser   __evaluate_GB12_GB13_regional_indicator(char32_t __next_code_point, __EGC_property __next_property) {
489e78f53d1SNikolas Klauser     __active_rule_ = __rule::__none;
490e78f53d1SNikolas Klauser     if (__next_property == __EGC_property::__Regional_Indicator)
491e78f53d1SNikolas Klauser       return false;
492e78f53d1SNikolas Klauser     return __evaluate_none(__next_code_point, __next_property);
493e78f53d1SNikolas Klauser   }
494e78f53d1SNikolas Klauser 
495e78f53d1SNikolas Klauser   char32_t __prev_code_point_; // Presumably the most recently evaluated code point; it is assigned outside this excerpt (TODO confirm).
496e78f53d1SNikolas Klauser   __EGC_property __prev_property_; // Presumably the grapheme cluster break property of __prev_code_point_ (TODO confirm).
497e78f53d1SNikolas Klauser 
498e78f53d1SNikolas Klauser   enum class __rule { // The rules that span more than two code points and therefore need state.
499e78f53d1SNikolas Klauser     __none, // No such rule is in progress.
500e78f53d1SNikolas Klauser     __GB9c_indic_conjunct_break, // State is kept in __GB9c_indic_conjunct_break_state_.
501e78f53d1SNikolas Klauser     __GB11_emoji, // State is kept in __GB11_emoji_state_.
502e78f53d1SNikolas Klauser     __GB12_GB13_regional_indicator, // Needs no extra state; its evaluator always resets the rule to __none.
503e78f53d1SNikolas Klauser   };
504e78f53d1SNikolas Klauser   __rule __active_rule_ = __rule::__none; // The __evaluate_* functions reset this to __none before delegating to __evaluate_none.
505e78f53d1SNikolas Klauser 
506e78f53d1SNikolas Klauser   enum class __GB11_emoji_state { // States of the matcher in __evaluate_GB11_emoji.
507e78f53d1SNikolas Klauser     __Extended_Pictographic, // Initial value; re-entered when an Extended_Pictographic follows in the __ZWJ state.
508e78f53d1SNikolas Klauser     __Extend, // Entered when an Extend follows in the __Extended_Pictographic state; further Extends stay here.
509e78f53d1SNikolas Klauser     __ZWJ, // Entered when a ZWJ follows in either of the other two states.
510e78f53d1SNikolas Klauser   };
511e78f53d1SNikolas Klauser   __GB11_emoji_state __GB11_emoji_state_ = __GB11_emoji_state::__Extended_Pictographic;
512e78f53d1SNikolas Klauser 
513e78f53d1SNikolas Klauser   enum class __GB9c_indic_conjunct_break_state { // States of the matcher in __evaluate_GB9c_indic_conjunct_break.
514e78f53d1SNikolas Klauser     __Consonant, // Initial value; re-entered when a Consonant follows in the __Linker state.
515e78f53d1SNikolas Klauser     __Linker, // Entered when a Linker is seen in the other state (presumably __Consonant; that case label is outside this excerpt).
516e78f53d1SNikolas Klauser   };
517e78f53d1SNikolas Klauser 
518e78f53d1SNikolas Klauser   __GB9c_indic_conjunct_break_state __GB9c_indic_conjunct_break_state_ = __GB9c_indic_conjunct_break_state::__Consonant;
519e78f53d1SNikolas Klauser 
520e78f53d1SNikolas Klauser   // NOLINTEND(readability-identifier-naming)
521e78f53d1SNikolas Klauser };
522e78f53d1SNikolas Klauser 
523e78f53d1SNikolas Klauser /// Helper class to extract an extended grapheme cluster from a Unicode character range.
524e78f53d1SNikolas Klauser ///
525e78f53d1SNikolas Klauser /// This function is used to determine the column width of an extended grapheme
526e78f53d1SNikolas Klauser /// cluster. In order to do that only the first code point is evaluated.
527e78f53d1SNikolas Klauser /// Therefore only this code point is extracted.
528e78f53d1SNikolas Klauser template <class _CharT>
529e78f53d1SNikolas Klauser class __extended_grapheme_cluster_view {
530e78f53d1SNikolas Klauser   using _Iterator = typename basic_string_view<_CharT>::const_iterator;
531e78f53d1SNikolas Klauser 
532e78f53d1SNikolas Klauser public:
533e78f53d1SNikolas Klauser   _LIBCPP_HIDE_FROM_ABI constexpr explicit __extended_grapheme_cluster_view(_Iterator __first, _Iterator __last)
534e78f53d1SNikolas Klauser       : __code_point_view_(__first, __last), __at_break_(__code_point_view_.__consume().__code_point) {}
535e78f53d1SNikolas Klauser 
536e78f53d1SNikolas Klauser   struct __cluster {
537e78f53d1SNikolas Klauser     /// The first code point of the extended grapheme cluster.
538e78f53d1SNikolas Klauser     ///
539e78f53d1SNikolas Klauser     /// The first code point is used to estimate the width of the extended
540e78f53d1SNikolas Klauser     /// grapheme cluster.
541e78f53d1SNikolas Klauser     char32_t __code_point_;
542e78f53d1SNikolas Klauser 
543e78f53d1SNikolas Klauser     /// Points one beyond the last code unit in the extended grapheme cluster.
544e78f53d1SNikolas Klauser     ///
545e78f53d1SNikolas Klauser     /// It's expected the caller has the start position and thus can determine
546e78f53d1SNikolas Klauser     /// the code unit range of the extended grapheme cluster.
547e78f53d1SNikolas Klauser     _Iterator __last_;
548e78f53d1SNikolas Klauser   };
549e78f53d1SNikolas Klauser 
550e78f53d1SNikolas Klauser   [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __cluster __consume() {
551e78f53d1SNikolas Klauser     char32_t __code_point = __at_break_.__current_code_point();
552e78f53d1SNikolas Klauser     _Iterator __position  = __code_point_view_.__position();
553e78f53d1SNikolas Klauser     while (!__code_point_view_.__at_end()) {
554e78f53d1SNikolas Klauser       if (__at_break_(__code_point_view_.__consume().__code_point))
555e78f53d1SNikolas Klauser         break;
556e78f53d1SNikolas Klauser       __position = __code_point_view_.__position();
557e78f53d1SNikolas Klauser     }
558e78f53d1SNikolas Klauser     return {__code_point, __position};
559e78f53d1SNikolas Klauser   }
560e78f53d1SNikolas Klauser 
561e78f53d1SNikolas Klauser private:
562e78f53d1SNikolas Klauser   __code_point_view<_CharT> __code_point_view_;
563e78f53d1SNikolas Klauser   __extended_grapheme_cluster_break __at_break_;
564e78f53d1SNikolas Klauser };
565e78f53d1SNikolas Klauser 
566e78f53d1SNikolas Klauser template <contiguous_iterator _Iterator>
567e78f53d1SNikolas Klauser __extended_grapheme_cluster_view(_Iterator, _Iterator) -> __extended_grapheme_cluster_view<iter_value_t<_Iterator>>;
568e78f53d1SNikolas Klauser 
569e78f53d1SNikolas Klauser #  else //  _LIBCPP_HAS_NO_UNICODE
570e78f53d1SNikolas Klauser 
571e78f53d1SNikolas Klauser // For ASCII every character is a "code point".
572e78f53d1SNikolas Klauser // This makes it easier to write code agnostic of the _LIBCPP_HAS_NO_UNICODE define.
573e78f53d1SNikolas Klauser template <class _CharT>
574e78f53d1SNikolas Klauser class __code_point_view {
575e78f53d1SNikolas Klauser   using _Iterator = typename basic_string_view<_CharT>::const_iterator;
576e78f53d1SNikolas Klauser 
577e78f53d1SNikolas Klauser public:
578e78f53d1SNikolas Klauser   _LIBCPP_HIDE_FROM_ABI constexpr explicit __code_point_view(_Iterator __first, _Iterator __last)
579e78f53d1SNikolas Klauser       : __first_(__first), __last_(__last) {}
580e78f53d1SNikolas Klauser 
581e78f53d1SNikolas Klauser   _LIBCPP_HIDE_FROM_ABI constexpr bool __at_end() const noexcept { return __first_ == __last_; }
582e78f53d1SNikolas Klauser   _LIBCPP_HIDE_FROM_ABI constexpr _Iterator __position() const noexcept { return __first_; }
583e78f53d1SNikolas Klauser 
584e78f53d1SNikolas Klauser   [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr __consume_result __consume() noexcept {
585e78f53d1SNikolas Klauser     _LIBCPP_ASSERT_INTERNAL(__first_ != __last_, "can't move beyond the end of input");
586e78f53d1SNikolas Klauser     return {static_cast<char32_t>(*__first_++)};
587e78f53d1SNikolas Klauser   }
588e78f53d1SNikolas Klauser 
589e78f53d1SNikolas Klauser private:
590e78f53d1SNikolas Klauser   _Iterator __first_;
591e78f53d1SNikolas Klauser   _Iterator __last_;
592e78f53d1SNikolas Klauser };
593e78f53d1SNikolas Klauser 
594e78f53d1SNikolas Klauser #  endif //  _LIBCPP_HAS_NO_UNICODE
595e78f53d1SNikolas Klauser 
596e78f53d1SNikolas Klauser } // namespace __unicode
597e78f53d1SNikolas Klauser 
598e78f53d1SNikolas Klauser #endif //_LIBCPP_STD_VER >= 20
599e78f53d1SNikolas Klauser 
600e78f53d1SNikolas Klauser _LIBCPP_END_NAMESPACE_STD
601e78f53d1SNikolas Klauser 
602*ce777190SNikolas Klauser #endif // _LIBCPP___CXX03___FORMAT_UNICODE_H
603