xref: /llvm-project/libcxx/test/std/utilities/format/format.functions/unicode.pass.cpp (revision 3d334df58742ff53fb00aa3caeb7eb5da3436348)
1 //===----------------------------------------------------------------------===//
2 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
3 // See https://llvm.org/LICENSE.txt for license information.
4 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
5 //
6 //===----------------------------------------------------------------------===//
7 
8 // UNSUPPORTED: c++03, c++11, c++14, c++17
9 // UNSUPPORTED: libcpp-has-no-incomplete-format
10 
11 // This version runs the test when the platform has Unicode support.
12 // UNSUPPORTED: libcpp-has-no-unicode
13 
14 // TODO FMT Investigate Windows and AIX issues.
15 // UNSUPPORTED: msvc, target={{.+}}-windows-gnu
16 // UNSUPPORTED: LIBCXX-AIX-FIXME
17 
18 // TODO FMT This test should not require std::to_chars(floating-point)
19 // This test requires std::to_chars(floating-point), which is in the dylib
20 // XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx{{10.9|10.10|10.11|10.12|10.13|10.14|10.15|11.0}}
21 
22 // <format>
23 
24 // Tests the Unicode width support of the standard format specifiers.
25 // It tests [format.string.std]/8 - 11:
26 // - Properly determining the estimated width of a Unicode string.
27 // - Properly truncating to the wanted maximum width.
28 
29 // More specific extended grapheme cluster boundary rules are tested in
30 // test/libcxx/utilities/format/format.string/format.string.std/extended_grapheme_cluster.pass.cpp
31 // That test is based on test data provided by the Unicode Consortium.
32 
33 #include <format>
34 #include <cassert>
35 #include <vector>
36 
37 #include "make_string.h"
38 #include "test_macros.h"
39 #include "string_literal.h"
40 #include "test_format_string.h"
41 
42 #ifndef TEST_HAS_NO_LOCALIZATION
43 #  include <iostream>
44 #  include <type_traits>
45 #endif
46 
// Shorthand for MAKE_STRING_VIEW using whatever CharT is in scope where the
// macro is expanded, so the same literal can be used for each character type
// under test.
// NOTE(review): MAKE_STRING_VIEW comes from make_string.h; presumably it yields
// a std::basic_string_view<CharT> -- confirm there.
47 #define SV(S) MAKE_STRING_VIEW(CharT, S)
48 
// Formats `args` according to `fmt` and asserts that the result equals
// `expected`.
//
// CharT is deduced from `expected`; Args are deduced from `args`.
// When localization is available and CharT is char, a mismatch is first
// reported on std::cerr (format string, expected output, actual output) so the
// test log shows what went wrong before the assertion aborts.
template <class CharT, class... Args>
void check(std::basic_string_view<CharT> expected, test_format_string<CharT, Args...> fmt, Args&&... args) {
  const std::basic_string<CharT> out = std::format(fmt, std::forward<Args>(args)...);
#ifndef TEST_HAS_NO_LOCALIZATION
  // std::cerr is a narrow stream, so the diagnostic is only emitted for char.
  // <type_traits> is the header this file includes for this check.
  if constexpr (std::is_same_v<CharT, char>) {
    if (out != expected)
      std::cerr << "\nFormat string   " << fmt.get() << "\nExpected output " << expected << "\nActual output   " << out
                << '\n';
  }
#endif
  assert(out == expected);
}
60 
// Centers a single code point in a field filled with '*'.
//
// Code points with an estimated width of one column are formatted with a field
// width of 3, code points with an estimated width of two columns with a field
// width of 4. Every expected output therefore has exactly one fill character
// on each side of the code point.
template <class CharT>
static void test_single_code_point_fill() {
  using view_t = std::basic_string_view<CharT>;

  // One-column input centered in a field of three columns.
  const auto narrow = [](view_t expected, view_t input) { check(expected, SV("{:*^3}"), view_t(input)); };
  // Two-column input centered in a field of four columns.
  const auto wide = [](view_t expected, view_t input) { check(expected, SV("{:*^4}"), view_t(input)); };

  //*** 1-byte code points ***
  narrow(SV("* *"), SV(" "));
  narrow(SV("*~*"), SV("~"));

  //*** 2-byte code points ***
  narrow(SV("*\u00a1*"), SV("\u00a1")); // INVERTED EXCLAMATION MARK
  narrow(SV("*\u07ff*"), SV("\u07ff")); // NKO TAMAN SIGN

  //*** 3-byte code points ***
  narrow(SV("*\u0800*"), SV("\u0800")); // SAMARITAN LETTER ALAF
  narrow(SV("*\ufffd*"), SV("\ufffd")); // REPLACEMENT CHARACTER

  // 2 column ranges, first and last code point of each range
  wide(SV("*\u1100*"), SV("\u1100")); // HANGUL CHOSEONG KIYEOK
  wide(SV("*\u115f*"), SV("\u115f")); // HANGUL CHOSEONG FILLER

  wide(SV("*\u2329*"), SV("\u2329")); // LEFT-POINTING ANGLE BRACKET
  wide(SV("*\u232a*"), SV("\u232a")); // RIGHT-POINTING ANGLE BRACKET

  wide(SV("*\u2e80*"), SV("\u2e80")); // CJK RADICAL REPEAT
  wide(SV("*\u303e*"), SV("\u303e")); // IDEOGRAPHIC VARIATION INDICATOR

  wide(SV("*\u3040*"), SV("\u3040")); // code point preceding U+3041 HIRAGANA LETTER SMALL A
  wide(SV("*\ua4cf*"), SV("\ua4cf")); // code point preceding U+A4D0 LISU LETTER BA

  wide(SV("*\uac00*"), SV("\uac00")); // <Hangul Syllable, First>
  wide(SV("*\ud7a3*"), SV("\ud7a3")); // Hangul Syllable Hih

  wide(SV("*\uf900*"), SV("\uf900")); // CJK COMPATIBILITY IDEOGRAPH-F900
  wide(SV("*\ufaff*"), SV("\ufaff")); // code point preceding U+FB00 LATIN SMALL LIGATURE FF

  wide(SV("*\ufe10*"), SV("\ufe10")); // PRESENTATION FORM FOR VERTICAL COMMA
  wide(SV("*\ufe19*"), SV("\ufe19")); // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS

  wide(SV("*\ufe30*"), SV("\ufe30")); // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
  wide(SV("*\ufe6f*"), SV("\ufe6f")); // code point preceding U+FE70 ARABIC FATHATAN ISOLATED FORM

  wide(SV("*\uff00*"), SV("\uff00")); // code point preceding U+FF01 FULLWIDTH EXCLAMATION MARK
  wide(SV("*\uff60*"), SV("\uff60")); // FULLWIDTH RIGHT WHITE PARENTHESIS

  wide(SV("*\uffe0*"), SV("\uffe0")); // FULLWIDTH CENT SIGN
  wide(SV("*\uffe6*"), SV("\uffe6")); // FULLWIDTH WON SIGN

  //*** 4-byte code points ***
  narrow(SV("*\U00010000*"), SV("\U00010000")); // LINEAR B SYLLABLE B008 A
  narrow(SV("*\U0010FFFF*"), SV("\U0010FFFF")); // Undefined Character

  // 2 column ranges, first and last code point of each range
  wide(SV("*\U0001f300*"), SV("\U0001f300")); // CYCLONE
  wide(SV("*\U0001f64f*"), SV("\U0001f64f")); // PERSON WITH FOLDED HANDS
  wide(SV("*\U0001f900*"), SV("\U0001f900")); // CIRCLED CROSS FORMEE WITH FOUR DOTS
  wide(SV("*\U0001f9ff*"), SV("\U0001f9ff")); // NAZAR AMULET
  wide(SV("*\U00020000*"), SV("\U00020000")); // <CJK Ideograph Extension B, First>
  wide(SV("*\U0002fffd*"), SV("\U0002fffd")); // Undefined Character
  wide(SV("*\U00030000*"), SV("\U00030000")); // <CJK Ideograph Extension G, First>
  wide(SV("*\U0003fffd*"), SV("\U0003fffd")); // Undefined Character
}
120 
// Formats a single code point with a field width of 3 and a precision of 1.
//
// A code point with an estimated width of one column fits in the precision and
// is written unchanged between the fill characters. A code point with an
// estimated width of two columns does not fit and is removed, so the output
// consists of fill characters only.
template <class CharT>
static void test_single_code_point_truncate() {
  using view_t = std::basic_string_view<CharT>;

  // One-column input: survives the truncation.
  const auto kept = [](view_t expected, view_t input) { check(expected, SV("{:*^3.1}"), view_t(input)); };
  // Two-column input: truncated away, only the fill remains.
  const auto dropped = [](view_t input) { check(SV("***"), SV("{:*^3.1}"), view_t(input)); };

  //*** 1-byte code points ***
  kept(SV("* *"), SV(" "));
  kept(SV("*~*"), SV("~"));

  //*** 2-byte code points ***
  kept(SV("*\u00a1*"), SV("\u00a1")); // INVERTED EXCLAMATION MARK
  kept(SV("*\u07ff*"), SV("\u07ff")); // NKO TAMAN SIGN

  //*** 3-byte code points ***
  kept(SV("*\u0800*"), SV("\u0800")); // SAMARITAN LETTER ALAF
  kept(SV("*\ufffd*"), SV("\ufffd")); // REPLACEMENT CHARACTER

  // 2 column ranges, first and last code point of each range
  dropped(SV("\u1100")); // HANGUL CHOSEONG KIYEOK
  dropped(SV("\u115f")); // HANGUL CHOSEONG FILLER

  dropped(SV("\u2329")); // LEFT-POINTING ANGLE BRACKET
  dropped(SV("\u232a")); // RIGHT-POINTING ANGLE BRACKET

  dropped(SV("\u2e80")); // CJK RADICAL REPEAT
  dropped(SV("\u303e")); // IDEOGRAPHIC VARIATION INDICATOR

  dropped(SV("\u3040")); // code point preceding U+3041 HIRAGANA LETTER SMALL A
  dropped(SV("\ua4cf")); // code point preceding U+A4D0 LISU LETTER BA

  dropped(SV("\uac00")); // <Hangul Syllable, First>
  dropped(SV("\ud7a3")); // Hangul Syllable Hih

  dropped(SV("\uf900")); // CJK COMPATIBILITY IDEOGRAPH-F900
  dropped(SV("\ufaff")); // code point preceding U+FB00 LATIN SMALL LIGATURE FF

  dropped(SV("\ufe10")); // PRESENTATION FORM FOR VERTICAL COMMA
  dropped(SV("\ufe19")); // PRESENTATION FORM FOR VERTICAL HORIZONTAL ELLIPSIS

  dropped(SV("\ufe30")); // PRESENTATION FORM FOR VERTICAL TWO DOT LEADER
  dropped(SV("\ufe6f")); // code point preceding U+FE70 ARABIC FATHATAN ISOLATED FORM

  dropped(SV("\uff00")); // code point preceding U+FF01 FULLWIDTH EXCLAMATION MARK
  dropped(SV("\uff60")); // FULLWIDTH RIGHT WHITE PARENTHESIS

  dropped(SV("\uffe0")); // FULLWIDTH CENT SIGN
  dropped(SV("\uffe6")); // FULLWIDTH WON SIGN

  //*** 4-byte code points ***
  kept(SV("*\U00010000*"), SV("\U00010000")); // LINEAR B SYLLABLE B008 A
  kept(SV("*\U0010FFFF*"), SV("\U0010FFFF")); // Undefined Character

  // 2 column ranges, first and last code point of each range
  dropped(SV("\U0001f300")); // CYCLONE
  dropped(SV("\U0001f64f")); // PERSON WITH FOLDED HANDS
  dropped(SV("\U0001f900")); // CIRCLED CROSS FORMEE WITH FOUR DOTS
  dropped(SV("\U0001f9ff")); // NAZAR AMULET
  dropped(SV("\U00020000")); // <CJK Ideograph Extension B, First>
  dropped(SV("\U0002fffd")); // Undefined Character
  dropped(SV("\U00030000")); // <CJK Ideograph Extension G, First>
  dropped(SV("\U0003fffd")); // Undefined Character
}
182 
// Exercises the example strings used in paper P1868 in three ways: plain fill,
// truncation to one column and truncation to two columns.
//
// The expected outputs treat a grapheme cluster as a unit: it is either written
// completely or, when it does not fit in the precision, not at all.
template <class CharT>
static void test_P1868() {
  using view_t = std::basic_string_view<CharT>;

  // Fill: one-column clusters use a field width of 3, two-column clusters 4.
  const auto fill_3 = [](view_t expected, view_t input) { check(expected, SV("{:*^3}"), view_t(input)); };
  const auto fill_4 = [](view_t expected, view_t input) { check(expected, SV("{:*^4}"), view_t(input)); };

  fill_3(SV("*\u0041*"), SV("\u0041"));             // { LATIN CAPITAL LETTER A }
  fill_3(SV("*\u00c1*"), SV("\u00c1"));             // { LATIN CAPITAL LETTER A WITH ACUTE }
  fill_3(SV("*\u0041\u0301*"), SV("\u0041\u0301")); // { LATIN CAPITAL LETTER A } { COMBINING ACUTE ACCENT }
  fill_3(SV("*\u0132*"), SV("\u0132"));             // { LATIN CAPITAL LIGATURE IJ }
  fill_3(SV("*\u0394*"), SV("\u0394"));             // { GREEK CAPITAL LETTER DELTA }

  fill_3(SV("*\u0429*"), SV("\u0429")); // { CYRILLIC CAPITAL LETTER SHCHA }
  fill_3(SV("*\u05d0*"), SV("\u05d0")); // { HEBREW LETTER ALEF }
  fill_3(SV("*\u0634*"), SV("\u0634")); // { ARABIC LETTER SHEEN }
  // NOTE(review): the Unicode charts name U+3009 "RIGHT ANGLE BRACKET" and
  // U+1F921 "CLOWN FACE"; the labels below are kept from the original test.
  fill_4(SV("*\u3009*"), SV("\u3009"));         // { RIGHT-POINTING ANGLE BRACKET }
  fill_4(SV("*\u754c*"), SV("\u754c"));         // { CJK Unified Ideograph-754C }
  fill_4(SV("*\U0001f921*"), SV("\U0001f921")); // { UNICORN FACE }
  fill_4(SV("*\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466*"),
         SV("\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466")); // { Family: Man, Woman, Girl, Boy }

  // Truncate to 1 column: 1 column grapheme clusters are kept together,
  // 2 column grapheme clusters are removed so only the fill remains.
  const auto cut_1_kept    = [](view_t expected, view_t input) { check(expected, SV("{:*^3.1}"), view_t(input)); };
  const auto cut_1_dropped = [](view_t input) { check(SV("***"), SV("{:*^3.1}"), view_t(input)); };

  cut_1_kept(SV("*\u0041*"), SV("\u0041"));             // { LATIN CAPITAL LETTER A }
  cut_1_kept(SV("*\u00c1*"), SV("\u00c1"));             // { LATIN CAPITAL LETTER A WITH ACUTE }
  cut_1_kept(SV("*\u0041\u0301*"), SV("\u0041\u0301")); // { LATIN CAPITAL LETTER A } { COMBINING ACUTE ACCENT }
  cut_1_kept(SV("*\u0132*"), SV("\u0132"));             // { LATIN CAPITAL LIGATURE IJ }
  cut_1_kept(SV("*\u0394*"), SV("\u0394"));             // { GREEK CAPITAL LETTER DELTA }

  cut_1_kept(SV("*\u0429*"), SV("\u0429")); // { CYRILLIC CAPITAL LETTER SHCHA }
  cut_1_kept(SV("*\u05d0*"), SV("\u05d0")); // { HEBREW LETTER ALEF }
  cut_1_kept(SV("*\u0634*"), SV("\u0634")); // { ARABIC LETTER SHEEN }
  cut_1_dropped(SV("\u3009"));              // { RIGHT-POINTING ANGLE BRACKET }
  cut_1_dropped(SV("\u754c"));              // { CJK Unified Ideograph-754C }
  cut_1_dropped(SV("\U0001f921"));          // { UNICORN FACE }
  cut_1_dropped(SV("\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466")); // { Family: Man, Woman, Girl, Boy }

  // Truncate to 2 column: 2 column grapheme clusters are kept together.
  const auto cut_2_in_3 = [](view_t expected, view_t input) { check(expected, SV("{:*^3.2}"), view_t(input)); };
  const auto cut_2_in_4 = [](view_t expected, view_t input) { check(expected, SV("{:*^4.2}"), view_t(input)); };

  cut_2_in_3(SV("*\u0041*"), SV("\u0041"));             // { LATIN CAPITAL LETTER A }
  cut_2_in_3(SV("*\u00c1*"), SV("\u00c1"));             // { LATIN CAPITAL LETTER A WITH ACUTE }
  cut_2_in_3(SV("*\u0041\u0301*"), SV("\u0041\u0301")); // { LATIN CAPITAL LETTER A } { COMBINING ACUTE ACCENT }
  cut_2_in_3(SV("*\u0132*"), SV("\u0132"));             // { LATIN CAPITAL LIGATURE IJ }
  cut_2_in_3(SV("*\u0394*"), SV("\u0394"));             // { GREEK CAPITAL LETTER DELTA }

  cut_2_in_3(SV("*\u0429*"), SV("\u0429"));         // { CYRILLIC CAPITAL LETTER SHCHA }
  cut_2_in_3(SV("*\u05d0*"), SV("\u05d0"));         // { HEBREW LETTER ALEF }
  cut_2_in_3(SV("*\u0634*"), SV("\u0634"));         // { ARABIC LETTER SHEEN }
  cut_2_in_4(SV("*\u3009*"), SV("\u3009"));         // { RIGHT-POINTING ANGLE BRACKET }
  cut_2_in_4(SV("*\u754c*"), SV("\u754c"));         // { CJK Unified Ideograph-754C }
  cut_2_in_4(SV("*\U0001f921*"), SV("\U0001f921")); // { UNICORN FACE }
  cut_2_in_4(SV("*\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466*"),
             SV("\U0001f468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466")); // { Family: Man, Woman, Girl, Boy }
}
243 
#ifdef _LIBCPP_VERSION
// Tests the libc++ specific behaviour for malformed UTF sequences; the
// Standard doesn't specify how these are handled.
//
// Only character types with sizeof(CharT) == 1 are exercised. Every input is
// centered in a field that is two columns wider than its number of code units,
// and every expected output has one fill character on each side. In other
// words, each ill-formed code unit is expected to occupy one column.
template <class CharT>
static void test_malformed_code_point() {
  if constexpr (sizeof(CharT) == 1) {
    using view_t = std::basic_string_view<CharT>;

    // An input together with the output expected for it.
    struct sample {
      view_t expected;
      view_t input;
    };

    // Malformed at end.
    const sample at_end[] = {
        {SV("*ZZZZ\x8f*"), SV("ZZZZ\x8f")},
        {SV("*ZZZZ\xcf*"), SV("ZZZZ\xcf")},
        {SV("*ZZZZ\xef*"), SV("ZZZZ\xef")},
        {SV("*ZZZZ\xff*"), SV("ZZZZ\xff")},
    };
    for (const sample& s : at_end)
      check(s.expected, SV("{:*^7}"), view_t(s.input));

    // Malformed in middle, no continuation; followed by one to four valid
    // code units.
    const sample one_after[] = {
        {SV("*ZZZZ\x8fZ*"), SV("ZZZZ\x8fZ")},
        {SV("*ZZZZ\xcfZ*"), SV("ZZZZ\xcfZ")},
        {SV("*ZZZZ\xefZ*"), SV("ZZZZ\xefZ")},
        {SV("*ZZZZ\xffZ*"), SV("ZZZZ\xffZ")},
    };
    for (const sample& s : one_after)
      check(s.expected, SV("{:*^8}"), view_t(s.input));

    const sample two_after[] = {
        {SV("*ZZZZ\x8fZZ*"), SV("ZZZZ\x8fZZ")},
        {SV("*ZZZZ\xcfZZ*"), SV("ZZZZ\xcfZZ")},
        {SV("*ZZZZ\xefZZ*"), SV("ZZZZ\xefZZ")},
        {SV("*ZZZZ\xffZZ*"), SV("ZZZZ\xffZZ")},
    };
    for (const sample& s : two_after)
      check(s.expected, SV("{:*^9}"), view_t(s.input));

    const sample three_after[] = {
        {SV("*ZZZZ\x8fZZZ*"), SV("ZZZZ\x8fZZZ")},
        {SV("*ZZZZ\xcfZZZ*"), SV("ZZZZ\xcfZZZ")},
        {SV("*ZZZZ\xefZZZ*"), SV("ZZZZ\xefZZZ")},
        {SV("*ZZZZ\xffZZZ*"), SV("ZZZZ\xffZZZ")},
    };
    for (const sample& s : three_after)
      check(s.expected, SV("{:*^10}"), view_t(s.input));

    const sample four_after[] = {
        {SV("*ZZZZ\x8fZZZZ*"), SV("ZZZZ\x8fZZZZ")},
        {SV("*ZZZZ\xcfZZZZ*"), SV("ZZZZ\xcfZZZZ")},
        {SV("*ZZZZ\xefZZZZ*"), SV("ZZZZ\xefZZZZ")},
        {SV("*ZZZZ\xffZZZZ*"), SV("ZZZZ\xffZZZZ")},
    };
    for (const sample& s : four_after)
      check(s.expected, SV("{:*^11}"), view_t(s.input));

    // Premature end. Each case needs its own field width, so these are
    // written out individually.
    check(SV("*ZZZZ\xef\xf5*"), SV("{:*^8}"), SV("ZZZZ\xef\xf5"));
    check(SV("*ZZZZ\xef\xf5ZZZZ*"), SV("{:*^12}"), SV("ZZZZ\xef\xf5ZZZZ"));
    check(SV("*ZZZZ\xff\xf5\xf5*"), SV("{:*^9}"), SV("ZZZZ\xff\xf5\xf5"));
    check(SV("*ZZZZ\xff\xf5\xf5ZZZZ*"), SV("{:*^13}"), SV("ZZZZ\xff\xf5\xf5ZZZZ"));

  } else if constexpr (sizeof(CharT) == 2) {
    // TODO FMT Add these tests.
  }
  // UTF-32 doesn't combine characters, thus no corruption tests.
}
#endif
289 
// Runs every test in this file for the character type CharT.
290 template <class CharT>
291 static void test() {
292   test_single_code_point_fill<CharT>();
293   test_single_code_point_truncate<CharT>();
294   test_P1868<CharT>();
295 
296 #ifdef _LIBCPP_VERSION
  // The malformed-input test is only declared when building against libc++.
297   test_malformed_code_point<CharT>();
298 #endif
299 }
300 
// Entry point: runs the tests for char and, unless
// TEST_HAS_NO_WIDE_CHARACTERS is defined, for wchar_t as well.
301 int main(int, char**) {
302   test<char>();
303 
304 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
305   test<wchar_t>();
306 #endif
307 
308   return 0;
309 }
310