1 //===----------------------------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 // UNSUPPORTED: no-localization
10 // UNSUPPORTED: c++03, c++11, c++14
11 // UNSUPPORTED: availability-filesystem-missing
12 // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DISABLE_DEPRECATION_WARNINGS
13
14 // <filesystem>
15
16 // class path
17
18 // Test constructors, accessors and modifiers that convert from/to various
19 // character encodings. Constructors and modifiers (append, concat,
20 // operator/=, operator+=) accept inputs with various character encodings,
21 // and accessors (*string(), string<>(), u8string()) export the string with
22 // various encodings.
23 //
24 // Some encodings are standardized; char16_t, char32_t and the u8string
25 // accessor and u8path constructor (and normal functions taking char8_t in
26 // C++20) convert from/to UTF-16, UTF-32 and UTF-8. wchar_t can be either
27 // UTF-16 or UTF-32 depending on the size of the wchar_t type, or can be
28 // left unimplemented.
29 //
// Plain char is implicitly UTF-8 on POSIX systems. On Windows, plain char
// is supposed to be in the same encoding that the platform's native file
// system APIs consume in the functions that take narrow strings as path
// names.
34
35 #include <filesystem>
36 #include <type_traits>
37 #include <cassert>
38
39 #include "test_macros.h"
40
41 #ifdef _WIN32
42 # include <windows.h> // SetFileApisToANSI & friends
43 #endif
44 namespace fs = std::filesystem;
45
// Test conversion with strings that fit within the latin1 charset, that need
// only one UTF-16 code unit per character, and that can be expressed in
// certain one-byte code pages.
test_latin_unicode()49 static void test_latin_unicode()
50 {
51 const char16_t u16str[] = { 0xe5, 0xe4, 0xf6, 0x00 };
52 const char32_t u32str[] = { 0xe5, 0xe4, 0xf6, 0x00 };
53 const char str[] = { char(0xc3), char(0xa5), char(0xc3), char(0xa4), char(0xc3), char(0xb6), 0x00 }; // UTF8, in a regular char string
54 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
55 const char8_t u8str[] = { 0xc3, 0xa5, 0xc3, 0xa4, 0xc3, 0xb6, 0x00 };
56 #else
57 const char u8str[] = { char(0xc3), char(0xa5), char(0xc3), char(0xa4), char(0xc3), char(0xb6), 0x00 };
58 #endif
59 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
60 const wchar_t wstr[] = { 0xe5, 0xe4, 0xf6, 0x00 };
61 #endif
62
63 // Test well-defined conversion between UTF-8, UTF-16 and UTF-32
64 {
65 const fs::path p(u16str);
66 assert(p.u8string() == u8str);
67 assert(p.u16string() == u16str);
68 assert(p.u32string() == u32str);
69 assert(p.string<char16_t>() == u16str);
70 assert(p.string<char32_t>() == u32str);
71 }
72 {
73 const fs::path p(u32str);
74 assert(p.u8string() == u8str);
75 assert(p.u16string() == u16str);
76 assert(p.u32string() == u32str);
77 assert(p.string<char16_t>() == u16str);
78 assert(p.string<char32_t>() == u32str);
79 }
80 {
81 const fs::path p = fs::u8path(str);
82 assert(p.u8string() == u8str);
83 assert(p.u16string() == u16str);
84 assert(p.u32string() == u32str);
85 assert(p.string<char16_t>() == u16str);
86 assert(p.string<char32_t>() == u32str);
87 }
88 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
89 {
90 // In C++20, the path constructor can unambiguously handle UTF-8 input,
91 // even if the plain char constructor would treat it as something else.
92 const fs::path p(u8str);
93 assert(p.u8string() == u8str);
94 assert(p.u16string() == u16str);
95 assert(p.u32string() == u32str);
96 assert(p.string<char8_t>() == u8str);
97 assert(p.string<char16_t>() == u16str);
98 assert(p.string<char32_t>() == u32str);
99 }
100 // Check reading various inputs with string<char8_t>()
101 {
102 const fs::path p(u16str);
103 assert(p.string<char8_t>() == u8str);
104 }
105 {
106 const fs::path p(u32str);
107 assert(p.string<char8_t>() == u8str);
108 }
109 {
110 const fs::path p = fs::u8path(str);
111 assert(p.string<char8_t>() == u8str);
112 }
113 #endif
114 #ifndef TEST_HAS_NO_WIDE_CHARACTERS
115 // Test conversion to/from wchar_t.
116 {
117 const fs::path p(u16str);
118 assert(p.wstring() == wstr);
119 assert(p.string<wchar_t>() == wstr);
120 }
121 {
122 const fs::path p = fs::u8path(str);
123 assert(p.wstring() == wstr);
124 assert(p.string<wchar_t>() == wstr);
125 }
126 {
127 const fs::path p(wstr);
128 assert(p.wstring() == wstr);
129 assert(p.u8string() == u8str);
130 assert(p.u16string() == u16str);
131 assert(p.u32string() == u32str);
132 assert(p.string<wchar_t>() == wstr);
133 }
134 #endif // TEST_HAS_NO_WIDE_CHARACTERS
135 #ifndef _WIN32
136 // Test conversion to/from regular char-based string. On POSIX, this
137 // is implied to convert to/from UTF-8.
138 {
139 const fs::path p(str);
140 assert(p.string() == str);
141 assert(p.u16string() == u16str);
142 assert(p.string<char>() == str);
143 }
144 {
145 const fs::path p(u16str);
146 assert(p.string() == str);
147 assert(p.string<char>() == str);
148 }
149 #else
150 // On windows, the narrow char-based input/output is supposed to be
151 // in the charset that narrow file IO APIs use. This can either be the
152 // current active code page (ACP) or the OEM code page, exposed by
153 // the AreFileApisANSI() function, and settable with SetFileApisToANSI() and
154 // SetFileApisToOEM(). We can't set which codepage is active within
155 // the process, but for some specific known ones, we can check if they
156 // behave as expected.
157 SetFileApisToANSI();
158 if (GetACP() == 1252) {
159 const char latin1[] = { char(0xe5), char(0xe4), char(0xf6), 0x00 };
160 {
161 const fs::path p(wstr);
162 assert(p.string() == latin1);
163 assert(p.string<char>() == latin1);
164 }
165 {
166 const fs::path p(latin1);
167 assert(p.string() == latin1);
168 assert(p.wstring() == wstr);
169 assert(p.u8string() == u8str);
170 assert(p.u16string() == u16str);
171 assert(p.string<char>() == latin1);
172 assert(p.string<wchar_t>() == wstr);
173 }
174 }
175 SetFileApisToOEM();
176 if (GetOEMCP() == 850 || GetOEMCP() == 437) {
177 // These chars are identical in both CP 850 and 437
178 const char cp850[] = { char(0x86), char(0x84), char(0x94), 0x00 };
179 {
180 const fs::path p(wstr);
181 assert(p.string() == cp850);
182 assert(p.string<char>() == cp850);
183 }
184 {
185 const fs::path p(cp850);
186 assert(p.string() == cp850);
187 assert(p.wstring() == wstr);
188 assert(p.u8string() == u8str);
189 assert(p.u16string() == u16str);
190 assert(p.string<char>() == cp850);
191 assert(p.string<wchar_t>() == wstr);
192 }
193 }
194 #endif
195 }
196
// Test conversion with strings that don't fit within one UTF-16 code unit.
// Here, wchar_t can be either UTF-16 or UTF-32 depending on its size on the
// particular platform.
test_wide_unicode()200 static void test_wide_unicode()
201 {
202 const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 };
203 const char32_t u32str[] = { 0x10437, 0x00 };
204 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
205 const char8_t u8str[] = { 0xf0, 0x90, 0x90, 0xb7, 0x00 };
206 #else
207 const char u8str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
208 #endif
209 const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
210 {
211 const fs::path p = fs::u8path(str);
212 assert(p.u8string() == u8str);
213 assert(p.u16string() == u16str);
214 assert(p.u32string() == u32str);
215 }
216 {
217 const fs::path p(u16str);
218 assert(p.u8string() == u8str);
219 assert(p.u16string() == u16str);
220 assert(p.u32string() == u32str);
221 }
222 {
223 const fs::path p(u32str);
224 assert(p.u8string() == u8str);
225 assert(p.u16string() == u16str);
226 assert(p.u32string() == u32str);
227 }
228 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
229 # if __SIZEOF_WCHAR_T__ == 2
230 const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 };
231 # else
232 const wchar_t wstr[] = { 0x10437, 0x00 };
233 # endif
234 // Test conversion to/from wchar_t.
235 {
236 const fs::path p = fs::u8path(str);
237 assert(p.wstring() == wstr);
238 }
239 {
240 const fs::path p(u16str);
241 assert(p.wstring() == wstr);
242 }
243 {
244 const fs::path p(u32str);
245 assert(p.wstring() == wstr);
246 }
247 {
248 const fs::path p(wstr);
249 assert(p.u8string() == u8str);
250 assert(p.u16string() == u16str);
251 assert(p.u32string() == u32str);
252 assert(p.wstring() == wstr);
253 }
254 #endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
255 }
256
257 // Test appending paths in different encodings.
test_append()258 static void test_append()
259 {
260 const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 };
261 const char32_t u32str[] = { 0x10437, 0x00 };
262 const char32_t u32ref[] = { 0x10437, fs::path::preferred_separator, 0x10437, fs::path::preferred_separator, 0x10437, 0x00 };
263 const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
264 {
265 fs::path p = fs::u8path(str) / u16str / u32str;
266 assert(p.u32string() == u32ref);
267 p = fs::u8path(str).append(u16str).append(u32str);
268 assert(p.u32string() == u32ref);
269 p = fs::u8path(str);
270 p /= u16str;
271 p /= u32str;
272 assert(p.u32string() == u32ref);
273 }
274 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
275 # if __SIZEOF_WCHAR_T__ == 2
276 const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 };
277 # else
278 const wchar_t wstr[] = { 0x10437, 0x00 };
279 # endif
280 // Test conversion from wchar_t.
281 {
282 fs::path p = fs::path(u16str) / wstr / u32str;
283 assert(p.u32string() == u32ref);
284 p = fs::path(u16str).append(wstr).append(u32str);
285 assert(p.u32string() == u32ref);
286 p = fs::path(u16str);
287 p /= wstr;
288 p /= u32str;
289 assert(p.u32string() == u32ref);
290 }
291 #endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
292 }
293
test_concat()294 static void test_concat()
295 {
296 const char16_t u16str[] = { 0xd801, 0xdc37, 0x00 };
297 const char32_t u32str[] = { 0x10437, 0x00 };
298 const char32_t u32ref[] = { 0x10437, 0x10437, 0x10437, 0x00 };
299 const char str[] = { char(0xf0), char(0x90), char(0x90), char(0xb7), 0x00 };
300 {
301 fs::path p = fs::u8path(str);
302 p += u16str;
303 p += u32str;
304 assert(p.u32string() == u32ref);
305 p = fs::u8path(str).concat(u16str).concat(u32str);
306 assert(p.u32string() == u32ref);
307 }
308 #if !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
309 # if __SIZEOF_WCHAR_T__ == 2
310 const wchar_t wstr[] = { 0xd801, 0xdc37, 0x00 };
311 # else
312 const wchar_t wstr[] = { 0x10437, 0x00 };
313 # endif
314 // Test conversion from wchar_t.
315 {
316 fs::path p = fs::path(u16str);
317 p += wstr;
318 p += u32str;
319 assert(p.u32string() == u32ref);
320 p = fs::path(u16str).concat(wstr).concat(u32str);
321 assert(p.u32string() == u32ref);
322 }
323 #endif // !defined(TEST_HAS_NO_WIDE_CHARACTERS) && defined(__SIZEOF_WCHAR_T__)
324 }
325
test_append_concat_narrow()326 static void test_append_concat_narrow()
327 {
328 const char16_t u16str[] = { 0xe5, 0x00 };
329 const char32_t u32ref_append[] = { 0xe5, fs::path::preferred_separator, 0xe5, 0x00 };
330 const char32_t u32ref_concat[] = { 0xe5, 0xe5, 0x00 };
331
332 #if TEST_STD_VER > 17 && defined(__cpp_lib_char8_t)
333 {
334 const char8_t u8str[] = { 0xc3, 0xa5, 0x00 };
335 // In C++20, appends of a char8_t string is unambiguously treated as
336 // UTF-8.
337 fs::path p = fs::path(u16str) / u8str;
338 assert(p.u32string() == u32ref_append);
339 p = fs::path(u16str).append(u8str);
340 assert(p.u32string() == u32ref_append);
341 p = fs::path(u16str);
342 p /= u8str;
343 assert(p.u32string() == u32ref_append);
344 p = fs::path(u16str).concat(u8str);
345 assert(p.u32string() == u32ref_concat);
346 p = fs::path(u16str);
347 p += u8str;
348 assert(p.u32string() == u32ref_concat);
349 }
350 #endif
351 #ifndef _WIN32
352 // Test appending a regular char-based string. On POSIX, this
353 // is implied to convert to/from UTF-8.
354 {
355 const char str[] = { char(0xc3), char(0xa5), 0x00 }; // UTF8, in a regular char string
356 fs::path p = fs::path(u16str) / str;
357 assert(p.u32string() == u32ref_append);
358 p = fs::path(u16str).append(str);
359 assert(p.u32string() == u32ref_append);
360 p = fs::path(u16str);
361 p /= str;
362 assert(p.u32string() == u32ref_append);
363 p = fs::path(u16str).concat(str);
364 assert(p.u32string() == u32ref_concat);
365 p = fs::path(u16str);
366 p += str;
367 assert(p.u32string() == u32ref_concat);
368 }
369 #else
370 SetFileApisToANSI();
371 if (GetACP() == 1252) {
372 const char latin1[] = { char(0xe5), 0x00 };
373 fs::path p = fs::path(u16str) / latin1;
374 assert(p.u32string() == u32ref_append);
375 p = fs::path(u16str).append(latin1);
376 assert(p.u32string() == u32ref_append);
377 p = fs::path(u16str);
378 p /= latin1;
379 assert(p.u32string() == u32ref_append);
380 p = fs::path(u16str).concat(latin1);
381 assert(p.u32string() == u32ref_concat);
382 p = fs::path(u16str);
383 p += latin1;
384 assert(p.u32string() == u32ref_concat);
385 }
386 SetFileApisToOEM();
387 if (GetOEMCP() == 850 || GetOEMCP() == 437) {
388 // This chars is identical in both CP 850 and 437
389 const char cp850[] = { char(0x86), 0x00 };
390 fs::path p = fs::path(u16str) / cp850;
391 assert(p.u32string() == u32ref_append);
392 p = fs::path(u16str).append(cp850);
393 assert(p.u32string() == u32ref_append);
394 p = fs::path(u16str);
395 p /= cp850;
396 assert(p.u32string() == u32ref_append);
397 p = fs::path(u16str).concat(cp850);
398 assert(p.u32string() == u32ref_concat);
399 p = fs::path(u16str);
400 p += cp850;
401 assert(p.u32string() == u32ref_concat);
402 }
403 #endif
404 }
405
main(int,char **)406 int main(int, char**)
407 {
408 test_latin_unicode();
409 test_wide_unicode();
410 test_append();
411 test_concat();
412 test_append_concat_narrow();
413
414 return 0;
415 }
416