xref: /llvm-project/libcxx/test/std/utilities/format/format.functions/unicode.pass.cpp (revision 6195bdb9f13be70e5f6791cb2cf4c33c52bc800e)
1 //===----------------------------------------------------------------------===//
2 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
3 // See https://llvm.org/LICENSE.txt for license information.
4 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5 //
6 //===----------------------------------------------------------------------===//
7 
8 // UNSUPPORTED: c++03, c++11, c++14, c++17
9 // UNSUPPORTED: libcpp-has-no-incomplete-format
10 
11 // This version runs the test when the platform has Unicode support.
12 // UNSUPPORTED: libcpp-has-no-unicode
13 
14 // <format>
15 
16 // Tests the Unicode width support of the standard format specifiers.
17 // It tests [format.string.std]/8 - 11:
18 // - Properly determining the estimated with of a unicode string.
19 // - Properly truncating to the wanted maximum width.
20 
21 // More specific extended grapheme cluster boundary rules are tested in
22 // test/libcxx/utilities/format/format.string/format.string.std/extended_grapheme_cluster.pass.cpp
23 // this test is based on test data provided by the Unicode Consortium.
24 
25 #include <format>
26 #include <cassert>
27 #include <vector>
28 
29 #include "make_string.h"
30 #include "test_macros.h"
31 #include "string_literal.h"
32 #include "test_format_string.h"
33 
34 #ifndef TEST_HAS_NO_LOCALIZATION
35 #  include <iostream>
36 #  include <type_traits>
37 #endif
38 
39 #define SV(S) MAKE_STRING_VIEW(CharT, S)
40 
41 template < class CharT, class... Args>
42 void check(std::basic_string_view<CharT> expected, test_format_string<CharT, Args...> fmt, Args&&... args) {
43   std::basic_string<CharT> out = std::format(fmt, std::forward<Args>(args)...);
44 #ifndef TEST_HAS_NO_LOCALIZATION
45   if constexpr (std::same_as<CharT, char>)
46     if (out != expected)
47       std::cerr << "\nFormat string   " << fmt.get() << "\nExpected output " << expected << "\nActual output   " << out
48                 << '\n';
49 #endif
50   assert(out == expected);
51 };
52 
53 template <class CharT>
54 static void test_single_code_point_fill() {
55   //*** 1-byte code points ***
56   check(SV("* *"), SV("{:*^3}"), SV(" "));
57   check(SV("*~*"), SV("{:*^3}"), SV("~"));
58 
59   //*** 2-byte code points ***
60   check(SV("*\u00a1*"), SV("{:*^3}"), SV("\u00a1")); // INVERTED EXCLAMATION MARK
61   check(SV("*\u07ff*"), SV("{:*^3}"), SV("\u07ff")); // NKO TAMAN SIGN
62 
63   //*** 3-byte code points ***
64   check(SV("*\u0800*"), SV("{:*^3}"), SV("\u0800")); // SAMARITAN LETTER ALAF
65   check(SV("*\ufffd*"), SV("{:*^3}"), SV("\ufffd")); // REPLACEMENT CHARACTER
66 
67   // 2 column ranges
68   check(SV("*\u1100*"), SV("{:*^4}"), SV("\u1100")); // HANGUL CHOSEONG KIYEOK
69   check(SV("*\u115f*"), SV("{:*^4}"), SV("\u115f")); // HANGUL CHOSEONG FILLER
70 
71   check(SV("*\u2329*"), SV("{:*^4}"), SV("\u2329")); // LEFT-POINTING ANGLE BRACKET
72   check(SV("*\u232a*"), SV("{:*^4}"), SV("\u232a")); // RIGHT-POINTING ANGLE BRACKET
73 
74   check(SV("*\u2e80*"), SV("{:*^4}"), SV("\u2e80")); // CJK RADICAL REPEAT
75   check(SV("*\u303e*"), SV("{:*^4}"), SV("\u303e")); // IDEOGRAPHIC VARIATION INDICATOR
76 
77   check(SV("*\u3040*"), SV("{:*^4}"), SV("\u3040")); // U+3041 HIRAGANA LETTER SMALL A
78   check(SV("*\ua4cf*"), SV("{:*^4}"), SV("\ua4cf")); // U+A4D0 LISU LETTER BA
79 
80   check(SV("*\uac00*"), SV("{:*^4}"), SV("\uac00")); // <Hangul Syllable, First>
81   check(SV("*\ud7a3*"), SV("{:*^4}"), SV("\ud7a3")); // Hangul Syllable Hih
82 
83   check(SV("*\uf900*"), SV("{:*^4}"), SV("\uf900")); // CJK COMPATIBILITY IDEOGRAPH-F900
84   check(SV("*\ufaff*"), SV("{:*^4}"), SV("\ufaff")); // U+FB00 LATIN SMALL LIGATURE FF
85 
86   check(SV("*\ufe10*"), SV("{:*^4}"), SV("\ufe10")); // PRESENTATION FORM FOR VERTICAL COMMA
87   check(SV("*\ufe19*"), SV("{:*^4}"), SV("\ufe19")); // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS
88 
89   check(SV("*\ufe30*"), SV("{:*^4}"), SV("\ufe30")); // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
90   check(SV("*\ufe6f*"), SV("{:*^4}"), SV("\ufe6f")); // U+FE70 ARABIC FATHATAN ISOLATED FORM
91 
92   check(SV("*\uff00*"), SV("{:*^4}"), SV("\uff00")); // U+FF01 FULLWIDTH EXCLAMATION MARK
93   check(SV("*\uff60*"), SV("{:*^4}"), SV("\uff60")); // FULLWIDTH RIGHT WHITE PARENTHESIS
94 
95   check(SV("*\uffe0*"), SV("{:*^4}"), SV("\uffe0")); // FULLWIDTH CENT SIGN
96   check(SV("*\uffe6*"), SV("{:*^4}"), SV("\uffe6")); // FULLWIDTH WON SIGN
97 
98   //*** 4-byte code points ***
99   check(SV("*\U00010000*"), SV("{:*^3}"), SV("\U00010000")); // LINEAR B SYLLABLE B008 A
100   check(SV("*\U0010FFFF*"), SV("{:*^3}"), SV("\U0010FFFF")); // Undefined Character
101 
102   // 2 column ranges
103   check(SV("*\U0001f300*"), SV("{:*^4}"), SV("\U0001f300")); // CYCLONE
104   check(SV("*\U0001f64f*"), SV("{:*^4}"), SV("\U0001f64f")); // PERSON WITH FOLDED HANDS
105   check(SV("*\U0001f900*"), SV("{:*^4}"), SV("\U0001f900")); // CIRCLED CROSS FORMEE WITH FOUR DOTS
106   check(SV("*\U0001f9ff*"), SV("{:*^4}"), SV("\U0001f9ff")); // NAZAR AMULET
107   check(SV("*\U00020000*"), SV("{:*^4}"), SV("\U00020000")); // <CJK Ideograph Extension B, First>
108   check(SV("*\U0002fffd*"), SV("{:*^4}"), SV("\U0002fffd")); // Undefined Character
109   check(SV("*\U00030000*"), SV("{:*^4}"), SV("\U00030000")); // <CJK Ideograph Extension G, First>
110   check(SV("*\U0003fffd*"), SV("{:*^4}"), SV("\U0003fffd")); // Undefined Character
111 }
112 
113 // One column output is unaffected.
114 // Two column output is removed, thus the result is only the fill character.
115 template <class CharT>
116 static void test_single_code_point_truncate() {
117   //*** 1-byte code points ***
118   check(SV("* *"), SV("{:*^3.1}"), SV(" "));
119   check(SV("*~*"), SV("{:*^3.1}"), SV("~"));
120 
121   //*** 2-byte code points ***
122   check(SV("*\u00a1*"), SV("{:*^3.1}"), SV("\u00a1")); // INVERTED EXCLAMATION MARK
123   check(SV("*\u07ff*"), SV("{:*^3.1}"), SV("\u07ff")); // NKO TAMAN SIGN
124 
125   //*** 3.1-byte code points ***
126   check(SV("*\u0800*"), SV("{:*^3.1}"), SV("\u0800")); // SAMARITAN LETTER ALAF
127   check(SV("*\ufffd*"), SV("{:*^3.1}"), SV("\ufffd")); // REPLACEMENT CHARACTER
128 
129   // 2 column ranges
130   check(SV("***"), SV("{:*^3.1}"), SV("\u1100")); // HANGUL CHOSEONG KIYEOK
131   check(SV("***"), SV("{:*^3.1}"), SV("\u115f")); // HANGUL CHOSEONG FILLER
132 
133   check(SV("***"), SV("{:*^3.1}"), SV("\u2329")); // LEFT-POINTING ANGLE BRACKET
134   check(SV("***"), SV("{:*^3.1}"), SV("\u232a")); // RIGHT-POINTING ANGLE BRACKET
135 
136   check(SV("***"), SV("{:*^3.1}"), SV("\u2e80")); // CJK RADICAL REPEAT
137   check(SV("***"), SV("{:*^3.1}"), SV("\u303e")); // IDEOGRAPHIC VARIATION INDICATOR
138 
139   check(SV("***"), SV("{:*^3.1}"), SV("\u3040")); // U+3041 HIRAGANA LETTER SMALL A
140   check(SV("***"), SV("{:*^3.1}"), SV("\ua4cf")); // U+A4D0 LISU LETTER BA
141 
142   check(SV("***"), SV("{:*^3.1}"), SV("\uac00")); // <Hangul Syllable, First>
143   check(SV("***"), SV("{:*^3.1}"), SV("\ud7a3")); // Hangul Syllable Hih
144 
145   check(SV("***"), SV("{:*^3.1}"), SV("\uf900")); // CJK COMPATIBILITY IDEOGRAPH-F900
146   check(SV("***"), SV("{:*^3.1}"), SV("\ufaff")); // U+FB00 LATIN SMALL LIGATURE FF
147 
148   check(SV("***"), SV("{:*^3.1}"), SV("\ufe10")); // PRESENTATION FORM FOR VERTICAL COMMA
149   check(SV("***"), SV("{:*^3.1}"), SV("\ufe19")); // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS
150 
151   check(SV("***"), SV("{:*^3.1}"), SV("\ufe30")); // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
152   check(SV("***"), SV("{:*^3.1}"), SV("\ufe6f")); // U+FE70 ARABIC FATHATAN ISOLATED FORM
153 
154   check(SV("***"), SV("{:*^3.1}"), SV("\uff00")); // U+FF01 FULLWIDTH EXCLAMATION MARK
155   check(SV("***"), SV("{:*^3.1}"), SV("\uff60")); // FULLWIDTH RIGHT WHITE PARENTHESIS
156 
157   check(SV("***"), SV("{:*^3.1}"), SV("\uffe0")); // FULLWIDTH CENT SIGN
158   check(SV("***"), SV("{:*^3.1}"), SV("\uffe6")); // FULLWIDTH WON SIGN
159 
160   //*** 3.1-byte code points ***
161   check(SV("*\U00010000*"), SV("{:*^3.1}"), SV("\U00010000")); // LINEAR B SYLLABLE B008 A
162   check(SV("*\U0010FFFF*"), SV("{:*^3.1}"), SV("\U0010FFFF")); // Undefined Character
163 
164   // 2 column ranges
165   check(SV("***"), SV("{:*^3.1}"), SV("\U0001f300")); // CYCLONE
166   check(SV("***"), SV("{:*^3.1}"), SV("\U0001f64f")); // PERSON WITH FOLDED HANDS
167   check(SV("***"), SV("{:*^3.1}"), SV("\U0001f900")); // CIRCLED CROSS FORMEE WITH FOUR DOTS
168   check(SV("***"), SV("{:*^3.1}"), SV("\U0001f9ff")); // NAZAR AMULET
169   check(SV("***"), SV("{:*^3.1}"), SV("\U00020000")); // <CJK Ideograph Extension B, First>
170   check(SV("***"), SV("{:*^3.1}"), SV("\U0002fffd")); // Undefined Character
171   check(SV("***"), SV("{:*^3.1}"), SV("\U00030000")); // <CJK Ideograph Extension G, First>
172   check(SV("***"), SV("{:*^3.1}"), SV("\U0003fffd")); // Undefined Character
173 }
174 
175 // The examples used in that paper.
176 template <class CharT>
177 static void test_P1868() {
178   // Fill
179   check(SV("*\u0041*"), SV("{:*^3}"), SV("\u0041")); // { LATIN CAPITAL LETTER A }
180   check(SV("*\u00c1*"), SV("{:*^3}"), SV("\u00c1")); // { LATIN CAPITAL LETTER A WITH ACUTE }
181   check(SV("*\u0041\u0301*"),
182         SV("{:*^3}"),
183         SV("\u0041\u0301"));                         // { LATIN CAPITAL LETTER A } { COMBINING ACUTE ACCENT }
184   check(SV("*\u0132*"), SV("{:*^3}"), SV("\u0132")); // { LATIN CAPITAL LIGATURE IJ }
185   check(SV("*\u0394*"), SV("{:*^3}"), SV("\u0394")); // { GREEK CAPITAL LETTER DELTA }
186 
187   check(SV("*\u0429*"), SV("{:*^3}"), SV("\u0429"));         // { CYRILLIC CAPITAL LETTER SHCHA }
188   check(SV("*\u05d0*"), SV("{:*^3}"), SV("\u05d0"));         // { HEBREW LETTER ALEF }
189   check(SV("*\u0634*"), SV("{:*^3}"), SV("\u0634"));         // { ARABIC LETTER SHEEN }
190   check(SV("*\u3009*"), SV("{:*^4}"), SV("\u3009"));         // { RIGHT-POINTING ANGLE BRACKET }
191   check(SV("*\u754c*"), SV("{:*^4}"), SV("\u754c"));         // { CJK Unified Ideograph-754C }
192   check(SV("*\U0001f921*"), SV("{:*^4}"), SV("\U0001f921")); // { UNICORN FACE }
193   check(SV("*\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466*"),
194         SV("{:*^4}"),
195         SV("\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466")); // { Family: Man, Woman, Girl, Boy }
196 
197   // Truncate to 1 column: 1 column grapheme clusters are kept together.
198   check(SV("*\u0041*"), SV("{:*^3.1}"), SV("\u0041")); // { LATIN CAPITAL LETTER A }
199   check(SV("*\u00c1*"), SV("{:*^3.1}"), SV("\u00c1")); // { LATIN CAPITAL LETTER A WITH ACUTE }
200   check(SV("*\u0041\u0301*"),
201         SV("{:*^3.1}"),
202         SV("\u0041\u0301"));                           // { LATIN CAPITAL LETTER A } { COMBINING ACUTE ACCENT }
203   check(SV("*\u0132*"), SV("{:*^3.1}"), SV("\u0132")); // { LATIN CAPITAL LIGATURE IJ }
204   check(SV("*\u0394*"), SV("{:*^3.1}"), SV("\u0394")); // { GREEK CAPITAL LETTER DELTA }
205 
206   check(SV("*\u0429*"), SV("{:*^3.1}"), SV("\u0429")); // { CYRILLIC CAPITAL LETTER SHCHA }
207   check(SV("*\u05d0*"), SV("{:*^3.1}"), SV("\u05d0")); // { HEBREW LETTER ALEF }
208   check(SV("*\u0634*"), SV("{:*^3.1}"), SV("\u0634")); // { ARABIC LETTER SHEEN }
209   check(SV("***"), SV("{:*^3.1}"), SV("\u3009"));      // { RIGHT-POINTING ANGLE BRACKET }
210   check(SV("***"), SV("{:*^3.1}"), SV("\u754c"));      // { CJK Unified Ideograph-754C }
211   check(SV("***"), SV("{:*^3.1}"), SV("\U0001f921"));  // { UNICORN FACE }
212   check(SV("***"),
213         SV("{:*^3.1}"),
214         SV("\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466")); // { Family: Man, Woman, Girl, Boy }
215 
216   // Truncate to 2 column: 2 column grapheme clusters are kept together.
217   check(SV("*\u0041*"), SV("{:*^3.2}"), SV("\u0041")); // { LATIN CAPITAL LETTER A }
218   check(SV("*\u00c1*"), SV("{:*^3.2}"), SV("\u00c1")); // { LATIN CAPITAL LETTER A WITH ACUTE }
219   check(SV("*\u0041\u0301*"),
220         SV("{:*^3.2}"),
221         SV("\u0041\u0301"));                           // { LATIN CAPITAL LETTER A } { COMBINING ACUTE ACCENT }
222   check(SV("*\u0132*"), SV("{:*^3.2}"), SV("\u0132")); // { LATIN CAPITAL LIGATURE IJ }
223   check(SV("*\u0394*"), SV("{:*^3.2}"), SV("\u0394")); // { GREEK CAPITAL LETTER DELTA }
224 
225   check(SV("*\u0429*"), SV("{:*^3.2}"), SV("\u0429"));         // { CYRILLIC CAPITAL LETTER SHCHA }
226   check(SV("*\u05d0*"), SV("{:*^3.2}"), SV("\u05d0"));         // { HEBREW LETTER ALEF }
227   check(SV("*\u0634*"), SV("{:*^3.2}"), SV("\u0634"));         // { ARABIC LETTER SHEEN }
228   check(SV("*\u3009*"), SV("{:*^4.2}"), SV("\u3009"));         // { RIGHT-POINTING ANGLE BRACKET }
229   check(SV("*\u754c*"), SV("{:*^4.2}"), SV("\u754c"));         // { CJK Unified Ideograph-754C }
230   check(SV("*\U0001f921*"), SV("{:*^4.2}"), SV("\U0001f921")); // { UNICORN FACE }
231   check(SV("*\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466*"),
232         SV("{:*^4.2}"),
233         SV("\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466")); // { Family: Man, Woman, Girl, Boy }
234 }
235 
236 #ifdef _LIBCPP_VERSION
237 // Tests the libc++ specific behaviour for malformed UTF-sequences. The
238 // Standard doesn't specify how to handle this.
239 template <class CharT>
240 static void test_malformed_code_point() {
241   if constexpr (sizeof(CharT) == 1) {
242     // Malformed at end.
243     check(SV("*ZZZZ\x8f*"), SV("{:*^7}"), SV("ZZZZ\x8f"));
244     check(SV("*ZZZZ\xcf*"), SV("{:*^7}"), SV("ZZZZ\xcf"));
245     check(SV("*ZZZZ\xef*"), SV("{:*^7}"), SV("ZZZZ\xef"));
246     check(SV("*ZZZZ\xff*"), SV("{:*^7}"), SV("ZZZZ\xff"));
247 
248     // Malformed in middle, no continuation
249     check(SV("*ZZZZ\x8fZ*"), SV("{:*^8}"), SV("ZZZZ\x8fZ"));
250     check(SV("*ZZZZ\xcfZ*"), SV("{:*^8}"), SV("ZZZZ\xcfZ"));
251     check(SV("*ZZZZ\xefZ*"), SV("{:*^8}"), SV("ZZZZ\xefZ"));
252     check(SV("*ZZZZ\xffZ*"), SV("{:*^8}"), SV("ZZZZ\xffZ"));
253 
254     check(SV("*ZZZZ\x8fZZ*"), SV("{:*^9}"), SV("ZZZZ\x8fZZ"));
255     check(SV("*ZZZZ\xcfZZ*"), SV("{:*^9}"), SV("ZZZZ\xcfZZ"));
256     check(SV("*ZZZZ\xefZZ*"), SV("{:*^9}"), SV("ZZZZ\xefZZ"));
257     check(SV("*ZZZZ\xffZZ*"), SV("{:*^9}"), SV("ZZZZ\xffZZ"));
258 
259     check(SV("*ZZZZ\x8fZZZ*"), SV("{:*^10}"), SV("ZZZZ\x8fZZZ"));
260     check(SV("*ZZZZ\xcfZZZ*"), SV("{:*^10}"), SV("ZZZZ\xcfZZZ"));
261     check(SV("*ZZZZ\xefZZZ*"), SV("{:*^10}"), SV("ZZZZ\xefZZZ"));
262     check(SV("*ZZZZ\xffZZZ*"), SV("{:*^10}"), SV("ZZZZ\xffZZZ"));
263 
264     check(SV("*ZZZZ\x8fZZZZ*"), SV("{:*^11}"), SV("ZZZZ\x8fZZZZ"));
265     check(SV("*ZZZZ\xcfZZZZ*"), SV("{:*^11}"), SV("ZZZZ\xcfZZZZ"));
266     check(SV("*ZZZZ\xefZZZZ*"), SV("{:*^11}"), SV("ZZZZ\xefZZZZ"));
267     check(SV("*ZZZZ\xffZZZZ*"), SV("{:*^11}"), SV("ZZZZ\xffZZZZ"));
268 
269     // Premature end.
270     check(SV("*ZZZZ\xef\xf5*"), SV("{:*^8}"), SV("ZZZZ\xef\xf5"));
271     check(SV("*ZZZZ\xef\xf5ZZZZ*"), SV("{:*^12}"), SV("ZZZZ\xef\xf5ZZZZ"));
272     check(SV("*ZZZZ\xff\xf5\xf5*"), SV("{:*^9}"), SV("ZZZZ\xff\xf5\xf5"));
273     check(SV("*ZZZZ\xff\xf5\xf5ZZZZ*"), SV("{:*^13}"), SV("ZZZZ\xff\xf5\xf5ZZZZ"));
274 
275   } else if constexpr (sizeof(CharT) == 2) {
276     // TODO FMT Add these tests.
277   }
278   // UTF-32 doesn't combine characters, thus no corruption tests.
279 }
280 #endif
281 
282 template <class CharT>
283 static void test() {
284   test_single_code_point_fill<CharT>();
285   test_single_code_point_truncate<CharT>();
286   test_P1868<CharT>();
287 
288 #ifdef _LIBCPP_VERSION
289   test_malformed_code_point<CharT>();
290 #endif
291 }
292 
293 int main(int, char**) {
294   test<char>();
295 
296 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
297   test<wchar_t>();
298 #endif
299 
300   return 0;
301 }
302