xref: /llvm-project/libcxx/test/std/input.output/filesystems/class.path/path.member/path.charconv.pass.cpp (revision ac8c9f1e39e1a773fd81ce23dbf1c80ea186f226)
1 //===----------------------------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 // UNSUPPORTED: no-localization
10 // UNSUPPORTED: c++03, c++11, c++14
11 // UNSUPPORTED: availability-filesystem-missing
12 // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
13 
14 // <filesystem>
15 
16 // class path
17 
18 // Test constructors, accessors and modifiers that convert from/to various
19 // character encodings. Constructors and modifiers (append, concat,
20 // operator/=, operator+=) accept inputs with various character encodings,
21 // and accessors (*string(), string<>(), u8string()) export the string with
22 // various encodings.
23 //
24 // Some encodings are standardized; char16_t, char32_t and the u8string
25 // accessor and u8path constructor (and normal functions taking char8_t in
26 // C++20) convert from/to UTF-16, UTF-32 and UTF-8. wchar_t can be either
27 // UTF-16 or UTF-32 depending on the size of the wchar_t type, or can be
28 // left unimplemented.
29 //
30 // Plain char is implicitly UTF-8 on POSIX systems. On Windows, plain char
31 // is supposed to be in the encoding that the platform's native file
32 // system APIs expect in the functions that take narrow strings as path
33 // names.
34 
35 #include <filesystem>
36 #include <type_traits>
37 #include <cassert>
38 
39 #include "test_macros.h"
40 
41 #ifdef _WIN32
42 #  include <windows.h> // SetFileApisToANSI & friends
43 #endif
44 namespace fs = std::filesystem;
45 
46 // Test conversion with strings that fit within the latin1 charset, that fit
47 // within one code point in UTF-16, and that can be expressible in certain
48 // one-byte code pages.
test_latin_unicode()49 static void test_latin_unicode()
50 {
51   const char16_t u16str[] = { 0xe5, 0xe4, 0xf6, 0x00 };
52   const char32_t u32str[] = { 0xe5, 0xe4, 0xf6, 0x00 };
53   const char str[] = { char(0xc3), char(0xa5), char(0xc3), char(0xa4), char(0xc3), char(0xb6), 0x00 }; // UTF8, in a regular char string
54 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
55   const char8_t u8str[] = { 0xc3, 0xa5, 0xc3, 0xa4, 0xc3, 0xb6, 0x00 };
56 #else
57   const char u8str[] = { char(0xc3), char(0xa5), char(0xc3), char(0xa4), char(0xc3), char(0xb6), 0x00 };
58 #endif
59 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
60   const wchar_t wstr[] = { 0xe5, 0xe4, 0xf6, 0x00 };
61 #endif
62 
63   // Test well-defined conversion between UTF-8, UTF-16 and UTF-32
64   {
65     const fs::path p(u16str);
66     assert(p.u8string() == u8str);
67     assert(p.u16string() == u16str);
68     assert(p.u32string() == u32str);
69     assert(p.string<char16_t>() == u16str);
70     assert(p.string<char32_t>() == u32str);
71   }
72   {
73     const fs::path p(u32str);
74     assert(p.u8string() == u8str);
75     assert(p.u16string() == u16str);
76     assert(p.u32string() == u32str);
77     assert(p.string<char16_t>() == u16str);
78     assert(p.string<char32_t>() == u32str);
79   }
80   {
81     const fs::path p = fs::u8path(str);
82     assert(p.u8string() == u8str);
83     assert(p.u16string() == u16str);
84     assert(p.u32string() == u32str);
85     assert(p.string<char16_t>() == u16str);
86     assert(p.string<char32_t>() == u32str);
87   }
88 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
89   {
90     // In C++20, the path constructor can unambiguously handle UTF-8 input,
91     // even if the plain char constructor would treat it as something else.
92     const fs::path p(u8str);
93     assert(p.u8string() == u8str);
94     assert(p.u16string() == u16str);
95     assert(p.u32string() == u32str);
96     assert(p.string<char8_t>() == u8str);
97     assert(p.string<char16_t>() == u16str);
98     assert(p.string<char32_t>() == u32str);
99   }
100   // Check reading various inputs with string<char8_t>()
101   {
102     const fs::path p(u16str);
103     assert(p.string<char8_t>() == u8str);
104   }
105   {
106     const fs::path p(u32str);
107     assert(p.string<char8_t>() == u8str);
108   }
109   {
110     const fs::path p = fs::u8path(str);
111     assert(p.string<char8_t>() == u8str);
112   }
113 #endif
114 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
115   // Test conversion to/from wchar_t.
116   {
117     const fs::path p(u16str);
118     assert(p.wstring() == wstr);
119     assert(p.string<wchar_t>() == wstr);
120   }
121   {
122     const fs::path p = fs::u8path(str);
123     assert(p.wstring() == wstr);
124     assert(p.string<wchar_t>() == wstr);
125   }
126   {
127     const fs::path p(wstr);
128     assert(p.wstring() == wstr);
129     assert(p.u8string() == u8str);
130     assert(p.u16string() == u16str);
131     assert(p.u32string() == u32str);
132     assert(p.string<wchar_t>() == wstr);
133   }
134 #endif // TEST_HAS_NO_WIDE_CHARACTERS
135 #ifndef _WIN32
136   // Test conversion to/from regular char-based string. On POSIX, this
137   // is implied to convert to/from UTF-8.
138   {
139     const fs::path p(str);
140     assert(p.string() == str);
141     assert(p.u16string() == u16str);
142     assert(p.string<char>() == str);
143   }
144   {
145     const fs::path p(u16str);
146     assert(p.string() == str);
147     assert(p.string<char>() == str);
148   }
149 #else
150   // On windows, the narrow char-based input/output is supposed to be
151   // in the charset that narrow file IO APIs use. This can either be the
152   // current active code page (ACP) or the OEM code page, exposed by
153   // the AreFileApisANSI() function, and settable with SetFileApisToANSI() and
154   // SetFileApisToOEM(). We can't set which codepage is active within
155   // the process, but for some specific known ones, we can check if they
156   // behave as expected.
157   SetFileApisToANSI();
158   if (GetACP() == 1252) {
159     const char latin1[] = { char(0xe5), char(0xe4), char(0xf6), 0x00 };
160     {
161       const fs::path p(wstr);
162       assert(p.string() == latin1);
163       assert(p.string<char>() == latin1);
164     }
165     {
166       const fs::path p(latin1);
167       assert(p.string() == latin1);
168       assert(p.wstring() == wstr);
169       assert(p.u8string() == u8str);
170       assert(p.u16string() == u16str);
171       assert(p.string<char>() == latin1);
172       assert(p.string<wchar_t>() == wstr);
173     }
174   }
175   SetFileApisToOEM();
176   if (GetOEMCP() == 850 || GetOEMCP() == 437) {
177     // These chars are identical in both CP 850 and 437
178     const char cp850[] = { char(0x86), char(0x84), char(0x94), 0x00 };
179     {
180       const fs::path p(wstr);
181       assert(p.string() == cp850);
182       assert(p.string<char>() == cp850);
183     }
184     {
185       const fs::path p(cp850);
186       assert(p.string() == cp850);
187       assert(p.wstring() == wstr);
188       assert(p.u8string() == u8str);
189       assert(p.u16string() == u16str);
190       assert(p.string<char>() == cp850);
191       assert(p.string<wchar_t>() == wstr);
192     }
193   }
194 #endif
195 }
196 
197 // Test conversion with strings that don't fit within one UTF-16 code point.
198 // Here, wchar_t can be either UTF-16 or UTF-32 depending on the size on the
199 // particular platform.
test_wide_unicode()200 static void test_wide_unicode()
201 {
202   const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 };
203   const char32_t u32str[] = { 0x10437, 0x00 };
204 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
205   const char8_t u8str[] = { 0xf0, 0x90, 0x90, 0xb7, 0x00 };
206 #else
207   const char u8str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
208 #endif
209   const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
210   {
211     const fs::path p = fs::u8path(str);
212     assert(p.u8string() == u8str);
213     assert(p.u16string() == u16str);
214     assert(p.u32string() == u32str);
215   }
216   {
217     const fs::path p(u16str);
218     assert(p.u8string() == u8str);
219     assert(p.u16string() == u16str);
220     assert(p.u32string() == u32str);
221   }
222   {
223     const fs::path p(u32str);
224     assert(p.u8string() == u8str);
225     assert(p.u16string() == u16str);
226     assert(p.u32string() == u32str);
227   }
228 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
229 # if __SIZEOF_WCHAR_T__ == 2
230   const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 };
231 # else
232   const wchar_t wstr[] = { 0x10437, 0x00 };
233 # endif
234   // Test conversion to/from wchar_t.
235   {
236     const fs::path p = fs::u8path(str);
237     assert(p.wstring() == wstr);
238   }
239   {
240     const fs::path p(u16str);
241     assert(p.wstring() == wstr);
242   }
243   {
244     const fs::path p(u32str);
245     assert(p.wstring() == wstr);
246   }
247   {
248     const fs::path p(wstr);
249     assert(p.u8string() == u8str);
250     assert(p.u16string() == u16str);
251     assert(p.u32string() == u32str);
252     assert(p.wstring() == wstr);
253   }
254 #endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
255 }
256 
257 // Test appending paths in different encodings.
test_append()258 static void test_append()
259 {
260   const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 };
261   const char32_t u32str[] = { 0x10437, 0x00 };
262   const char32_t u32ref[] = { 0x10437, fs::path::preferred_separator, 0x10437, fs::path::preferred_separator, 0x10437, 0x00 };
263   const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
264   {
265     fs::path p = fs::u8path(str) / u16str / u32str;
266     assert(p.u32string() == u32ref);
267     p = fs::u8path(str).append(u16str).append(u32str);
268     assert(p.u32string() == u32ref);
269     p = fs::u8path(str);
270     p /= u16str;
271     p /= u32str;
272     assert(p.u32string() == u32ref);
273   }
274 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
275 # if __SIZEOF_WCHAR_T__ == 2
276   const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 };
277 # else
278   const wchar_t wstr[] = { 0x10437, 0x00 };
279 # endif
280   // Test conversion from wchar_t.
281   {
282     fs::path p = fs::path(u16str) / wstr / u32str;
283     assert(p.u32string() == u32ref);
284     p = fs::path(u16str).append(wstr).append(u32str);
285     assert(p.u32string() == u32ref);
286     p = fs::path(u16str);
287     p /= wstr;
288     p /= u32str;
289     assert(p.u32string() == u32ref);
290   }
291 #endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
292 }
293 
test_concat()294 static void test_concat()
295 {
296   const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 };
297   const char32_t u32str[] = { 0x10437, 0x00 };
298   const char32_t u32ref[] = { 0x10437, 0x10437, 0x10437, 0x00 };
299   const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
300   {
301     fs::path p = fs::u8path(str);
302     p += u16str;
303     p += u32str;
304     assert(p.u32string() == u32ref);
305     p = fs::u8path(str).concat(u16str).concat(u32str);
306     assert(p.u32string() == u32ref);
307   }
308 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
309 # if __SIZEOF_WCHAR_T__ == 2
310   const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 };
311 # else
312   const wchar_t wstr[] = { 0x10437, 0x00 };
313 # endif
314   // Test conversion from wchar_t.
315   {
316     fs::path p = fs::path(u16str);
317     p += wstr;
318     p += u32str;
319     assert(p.u32string() == u32ref);
320     p = fs::path(u16str).concat(wstr).concat(u32str);
321     assert(p.u32string() == u32ref);
322   }
323 #endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
324 }
325 
test_append_concat_narrow()326 static void test_append_concat_narrow()
327 {
328   const char16_t u16str[] = { 0xe5, 0x00 };
329   const char32_t u32ref_append[] = { 0xe5, fs::path::preferred_separator, 0xe5, 0x00 };
330   const char32_t u32ref_concat[] = { 0xe5, 0xe5, 0x00 };
331 
332 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
333   {
334     const char8_t u8str[] = { 0xc3, 0xa5, 0x00 };
335     // In C++20, appends of a char8_t string is unambiguously treated as
336     // UTF-8.
337     fs::path p = fs::path(u16str) / u8str;
338     assert(p.u32string() == u32ref_append);
339     p = fs::path(u16str).append(u8str);
340     assert(p.u32string() == u32ref_append);
341     p = fs::path(u16str);
342     p /= u8str;
343     assert(p.u32string() == u32ref_append);
344     p = fs::path(u16str).concat(u8str);
345     assert(p.u32string() == u32ref_concat);
346     p = fs::path(u16str);
347     p += u8str;
348     assert(p.u32string() == u32ref_concat);
349   }
350 #endif
351 #ifndef _WIN32
352   // Test appending a regular char-based string. On POSIX, this
353   // is implied to convert to/from UTF-8.
354   {
355     const char str[] = { char(0xc3), char(0xa5), 0x00 }; // UTF8, in a regular char string
356     fs::path p = fs::path(u16str) / str;
357     assert(p.u32string() == u32ref_append);
358     p = fs::path(u16str).append(str);
359     assert(p.u32string() == u32ref_append);
360     p = fs::path(u16str);
361     p /= str;
362     assert(p.u32string() == u32ref_append);
363     p = fs::path(u16str).concat(str);
364     assert(p.u32string() == u32ref_concat);
365     p = fs::path(u16str);
366     p += str;
367     assert(p.u32string() == u32ref_concat);
368   }
369 #else
370   SetFileApisToANSI();
371   if (GetACP() == 1252) {
372     const char latin1[] = { char(0xe5), 0x00 };
373     fs::path p = fs::path(u16str) / latin1;
374     assert(p.u32string() == u32ref_append);
375     p = fs::path(u16str).append(latin1);
376     assert(p.u32string() == u32ref_append);
377     p = fs::path(u16str);
378     p /= latin1;
379     assert(p.u32string() == u32ref_append);
380     p = fs::path(u16str).concat(latin1);
381     assert(p.u32string() == u32ref_concat);
382     p = fs::path(u16str);
383     p += latin1;
384     assert(p.u32string() == u32ref_concat);
385   }
386   SetFileApisToOEM();
387   if (GetOEMCP() == 850 || GetOEMCP() == 437) {
388     // This chars is identical in both CP 850 and 437
389     const char cp850[] = { char(0x86), 0x00 };
390     fs::path p = fs::path(u16str) / cp850;
391     assert(p.u32string() == u32ref_append);
392     p = fs::path(u16str).append(cp850);
393     assert(p.u32string() == u32ref_append);
394     p = fs::path(u16str);
395     p /= cp850;
396     assert(p.u32string() == u32ref_append);
397     p = fs::path(u16str).concat(cp850);
398     assert(p.u32string() == u32ref_concat);
399     p = fs::path(u16str);
400     p += cp850;
401     assert(p.u32string() == u32ref_concat);
402   }
403 #endif
404 }
405 
main(int,char **)406 int main(int, char**)
407 {
408   test_latin_unicode();
409   test_wide_unicode();
410   test_append();
411   test_concat();
412   test_append_concat_narrow();
413 
414   return 0;
415 }
416