xref: /freebsd-src/contrib/libarchive/libarchive/test/test_pax_filename_encoding.c (revision bd66c1b43e33540205dbc1187c2f2a15c58b57ba)
1 /*-
2  * Copyright (c) 2003-2007 Tim Kientzle
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17  * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 #include "test.h"
26 
27 #include <locale.h>
28 
29 /*
30  * Pax interchange is supposed to encode filenames into
31  * UTF-8.  Of course, that's not always possible.  This
32  * test is intended to verify that filenames always get
33  * stored and restored correctly, regardless of the encodings.
34  */
35 
36 /*
37  * Read a manually-created archive that has filenames that are
38  * stored in binary instead of UTF-8 and verify that we get
39  * the right filename returned and that we get a warning only
40  * if the header isn't marked as binary.
41  */
42 static void
43 test_pax_filename_encoding_1(void)
44 {
45 	static const char testname[] = "test_pax_filename_encoding.tar";
46 	/*
47 	 * \314\214 is a valid 2-byte UTF-8 sequence.
48 	 * \374 is invalid in UTF-8.
49 	 */
50 	char filename[] = "abc\314\214mno\374xyz";
51 	struct archive *a;
52 	struct archive_entry *entry;
53 
54 	/*
55 	 * Read an archive that has non-UTF8 pax filenames in it.
56 	 */
57 	extract_reference_file(testname);
58 	a = archive_read_new();
59 	assertEqualInt(ARCHIVE_OK, archive_read_support_format_tar(a));
60 	assertEqualInt(ARCHIVE_OK, archive_read_support_filter_all(a));
61 	assertEqualInt(ARCHIVE_OK,
62 	    archive_read_open_filename(a, testname, 10240));
63 	/*
64 	 * First entry in this test archive has an invalid UTF-8 sequence
65 	 * in it, but the header is not marked as hdrcharset=BINARY, so that
66 	 * requires a warning.
67 	 */
68 	failure("Invalid UTF8 in a pax archive pathname should cause a warning");
69 	assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
70 	assertEqualString(filename, archive_entry_pathname(entry));
71 	/*
72 	 * Second entry is identical except that it does have
73 	 * hdrcharset=BINARY, so no warning should be generated.
74 	 */
75 	failure("A pathname with hdrcharset=BINARY can have invalid UTF8\n"
76 	    " characters in it without generating a warning");
77 	assertEqualInt(ARCHIVE_OK, archive_read_next_header(a, &entry));
78 	assertEqualString(filename, archive_entry_pathname(entry));
79 	archive_read_free(a);
80 }
81 
82 /*
83  * Set the locale and write a pathname containing invalid characters.
84  * This should work; the underlying implementation should automatically
85  * fall back to storing the pathname in binary.
86  */
87 static void
88 test_pax_filename_encoding_2(void)
89 {
90 	char filename[] = "abc\314\214mno\374xyz";
91 	struct archive *a;
92 	struct archive_entry *entry;
93 	char buff[65536];
94 	char longname[] = "abc\314\214mno\374xyz"
95 	    "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
96 	    "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
97 	    "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
98 	    "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
99 	    "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
100 	    "/abc\314\214mno\374xyz/abcdefghijklmnopqrstuvwxyz"
101 	    ;
102 	size_t used;
103 
104 	/*
105 	 * We need a starting locale which has invalid sequences.
106 	 * en_US.UTF-8 seems to be commonly supported.
107 	 */
108 	/* If it doesn't exist, just warn and return. */
109 	if (NULL == setlocale(LC_ALL, "en_US.UTF-8")) {
110 		skipping("invalid encoding tests require a suitable locale;"
111 		    " en_US.UTF-8 not available on this system");
112 		return;
113 	}
114 
115 	assert((a = archive_write_new()) != NULL);
116 	assertEqualIntA(a, 0, archive_write_set_format_pax(a));
117 	assertEqualIntA(a, 0, archive_write_add_filter_none(a));
118 	assertEqualIntA(a, 0, archive_write_set_bytes_per_block(a, 0));
119 	assertEqualInt(0,
120 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
121 
122 	assert((entry = archive_entry_new()) != NULL);
123 	/* Set pathname, gname, uname, hardlink to nonconvertible values. */
124 	archive_entry_copy_pathname(entry, filename);
125 	archive_entry_copy_gname(entry, filename);
126 	archive_entry_copy_uname(entry, filename);
127 	archive_entry_copy_hardlink(entry, filename);
128 	archive_entry_set_filetype(entry, AE_IFREG);
129 	failure("This should generate a warning for nonconvertible names.");
130 	assertEqualInt(ARCHIVE_WARN, archive_write_header(a, entry));
131 	archive_entry_free(entry);
132 
133 	assert((entry = archive_entry_new()) != NULL);
134 	/* Set path, gname, uname, and symlink to nonconvertible values. */
135 	archive_entry_copy_pathname(entry, filename);
136 	archive_entry_copy_gname(entry, filename);
137 	archive_entry_copy_uname(entry, filename);
138 	archive_entry_copy_symlink(entry, filename);
139 	archive_entry_set_filetype(entry, AE_IFLNK);
140 	failure("This should generate a warning for nonconvertible names.");
141 	assertEqualInt(ARCHIVE_WARN, archive_write_header(a, entry));
142 	archive_entry_free(entry);
143 
144 	assert((entry = archive_entry_new()) != NULL);
145 	/* Set pathname to a very long nonconvertible value. */
146 	archive_entry_copy_pathname(entry, longname);
147 	archive_entry_set_filetype(entry, AE_IFREG);
148 	failure("This should generate a warning for nonconvertible names.");
149 	assertEqualInt(ARCHIVE_WARN, archive_write_header(a, entry));
150 	archive_entry_free(entry);
151 
152 	assertEqualIntA(a, ARCHIVE_OK, archive_write_close(a));
153 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
154 
155 	/*
156 	 * Now read the entries back.
157 	 */
158 
159 	assert((a = archive_read_new()) != NULL);
160 	assertEqualInt(0, archive_read_support_format_tar(a));
161 	assertEqualInt(0, archive_read_open_memory(a, buff, used));
162 
163 	assertEqualInt(0, archive_read_next_header(a, &entry));
164 	assertEqualString(filename, archive_entry_pathname(entry));
165 	assertEqualString(filename, archive_entry_gname(entry));
166 	assertEqualString(filename, archive_entry_uname(entry));
167 	assertEqualString(filename, archive_entry_hardlink(entry));
168 
169 	assertEqualInt(0, archive_read_next_header(a, &entry));
170 	assertEqualString(filename, archive_entry_pathname(entry));
171 	assertEqualString(filename, archive_entry_gname(entry));
172 	assertEqualString(filename, archive_entry_uname(entry));
173 	assertEqualString(filename, archive_entry_symlink(entry));
174 
175 	assertEqualInt(0, archive_read_next_header(a, &entry));
176 	assertEqualString(longname, archive_entry_pathname(entry));
177 
178 	assertEqualIntA(a, ARCHIVE_OK, archive_read_close(a));
179 	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
180 }
181 
#if 0 /* Disabled until Tim has reviewed it. */

/*
 * Create an entry starting from a wide-character Unicode pathname,
 * read it back into "C" locale, which doesn't support the name.
 * TODO: Figure out the "right" behavior here.
 */
static void
test_pax_filename_encoding_3(void)
{
	wchar_t badname[] = L"xxxAyyyBzzz";
	const char badname_utf8[] = "xxx\xE1\x88\xB4yyy\xE5\x99\xB8zzz";
	struct archive *a;
	struct archive_entry *entry;
	char buff[65536];
	size_t used;

	/* Patch in the two code points that match badname_utf8. */
	badname[3] = 0x1234;
	badname[7] = 0x5678;

	/* If it doesn't exist, just warn and return. */
	if (NULL == setlocale(LC_ALL, "C")) {
		skipping("Can't set \"C\" locale, so can't exercise "
		    "certain character-conversion failures");
		return;
	}

	/* If wctomb is broken, warn and return. */
	if (wctomb(buff, 0x1234) > 0) {
		skipping("Cannot test conversion failures because \"C\" "
		    "locale on this system has no invalid characters.");
		return;
	}

	/* Skip test if archive_entry_update_pathname_utf8() is broken. */
	/* In particular, this is currently broken on Win32 because
	 * setlocale() does not set the default encoding for CP_ACP. */
	entry = archive_entry_new();
	if (archive_entry_update_pathname_utf8(entry, badname_utf8)) {
		archive_entry_free(entry);
		skipping("Cannot test conversion failures.");
		return;
	}
	archive_entry_free(entry);

	assert((a = archive_write_new()) != NULL);
	assertEqualIntA(a, 0, archive_write_set_format_pax(a));
	assertEqualIntA(a, 0, archive_write_add_filter_none(a));
	assertEqualIntA(a, 0, archive_write_set_bytes_per_block(a, 0));
	assertEqualInt(0,
	    archive_write_open_memory(a, buff, sizeof(buff), &used));

	assert((entry = archive_entry_new()) != NULL);
	/* Set pathname to non-convertible wide value. */
	archive_entry_copy_pathname_w(entry, badname);
	archive_entry_set_filetype(entry, AE_IFREG);
	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
	archive_entry_free(entry);

	assert((entry = archive_entry_new()) != NULL);
	archive_entry_copy_pathname_w(entry, L"abc");
	/* Set gname to non-convertible wide value. */
	archive_entry_copy_gname_w(entry, badname);
	archive_entry_set_filetype(entry, AE_IFREG);
	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
	archive_entry_free(entry);

	assert((entry = archive_entry_new()) != NULL);
	archive_entry_copy_pathname_w(entry, L"abc");
	/* Set uname to non-convertible wide value. */
	archive_entry_copy_uname_w(entry, badname);
	archive_entry_set_filetype(entry, AE_IFREG);
	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
	archive_entry_free(entry);

	assert((entry = archive_entry_new()) != NULL);
	archive_entry_copy_pathname_w(entry, L"abc");
	/* Set hardlink to non-convertible wide value. */
	archive_entry_copy_hardlink_w(entry, badname);
	archive_entry_set_filetype(entry, AE_IFREG);
	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
	archive_entry_free(entry);

	assert((entry = archive_entry_new()) != NULL);
	archive_entry_copy_pathname_w(entry, L"abc");
	/* Set symlink to non-convertible wide value. */
	archive_entry_copy_symlink_w(entry, badname);
	archive_entry_set_filetype(entry, AE_IFLNK);
	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
	archive_entry_free(entry);

	assertEqualIntA(a, ARCHIVE_OK, archive_write_close(a));
	assertEqualInt(ARCHIVE_OK, archive_write_free(a));

	/*
	 * Now read the entries back.
	 */

	assert((a = archive_read_new()) != NULL);
	assertEqualInt(0, archive_read_support_format_tar(a));
	assertEqualInt(0, archive_read_open_memory(a, buff, used));

	failure("A non-convertible pathname should cause a warning.");
	assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
	assertEqualWString(badname, archive_entry_pathname_w(entry));
	failure("If native locale can't convert, we should get UTF-8 back.");
	assertEqualString(badname_utf8, archive_entry_pathname(entry));

	failure("A non-convertible gname should cause a warning.");
	assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
	assertEqualWString(badname, archive_entry_gname_w(entry));
	failure("If native locale can't convert, we should get UTF-8 back.");
	assertEqualString(badname_utf8, archive_entry_gname(entry));

	failure("A non-convertible uname should cause a warning.");
	assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
	assertEqualWString(badname, archive_entry_uname_w(entry));
	failure("If native locale can't convert, we should get UTF-8 back.");
	assertEqualString(badname_utf8, archive_entry_uname(entry));

	failure("A non-convertible hardlink should cause a warning.");
	assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
	assertEqualWString(badname, archive_entry_hardlink_w(entry));
	failure("If native locale can't convert, we should get UTF-8 back.");
	assertEqualString(badname_utf8, archive_entry_hardlink(entry));

	failure("A non-convertible symlink should cause a warning.");
	assertEqualInt(ARCHIVE_WARN, archive_read_next_header(a, &entry));
	assertEqualWString(badname, archive_entry_symlink_w(entry));
	assertEqualWString(NULL, archive_entry_hardlink_w(entry));
	failure("If native locale can't convert, we should get UTF-8 back.");
	assertEqualString(badname_utf8, archive_entry_symlink(entry));

	assertEqualInt(ARCHIVE_EOF, archive_read_next_header(a, &entry));

	assertEqualIntA(a, ARCHIVE_OK, archive_read_close(a));
	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
}
#else
/*
 * Placeholder while the real test above is disabled: does nothing, but
 * keeps the symbol available for the aggregate test to call.
 */
static void
test_pax_filename_encoding_3(void)
{
}
#endif
333 
334 /*
335  * Verify that KOI8-R filenames are correctly translated to Unicode and UTF-8.
336  */
337 DEFINE_TEST(test_pax_filename_encoding_KOI8R)
338 {
339   	struct archive *a;
340   	struct archive_entry *entry;
341 	char buff[4096];
342 	size_t used;
343 
344 	if (NULL == setlocale(LC_ALL, "ru_RU.KOI8-R")) {
345 		skipping("KOI8-R locale not available on this system.");
346 		return;
347 	}
348 
349 	/* Check if the platform completely supports the string conversion. */
350 	a = archive_write_new();
351 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
352 	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
353 		skipping("This system cannot convert character-set"
354 		    " from KOI8-R to UTF-8.");
355 		archive_write_free(a);
356 		return;
357 	}
358 	archive_write_free(a);
359 
360 	/* Re-create a write archive object since filenames should be written
361 	 * in UTF-8 by default. */
362 	a = archive_write_new();
363 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
364 	assertEqualInt(ARCHIVE_OK,
365 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
366 
367 	entry = archive_entry_new2(a);
368 	archive_entry_set_pathname(entry, "\xD0\xD2\xC9");
369 	archive_entry_set_filetype(entry, AE_IFREG);
370 	archive_entry_set_size(entry, 0);
371 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
372 	archive_entry_free(entry);
373 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
374 
375 	/* Above three characters in KOI8-R should translate to the following
376 	 * three characters (two bytes each) in UTF-8. */
377 	assertEqualMem(buff + 512, "15 path=\xD0\xBF\xD1\x80\xD0\xB8\x0A", 15);
378 }
379 
380 /*
381  * Verify that CP1251 filenames are correctly translated to Unicode and UTF-8.
382  */
383 DEFINE_TEST(test_pax_filename_encoding_CP1251)
384 {
385   	struct archive *a;
386   	struct archive_entry *entry;
387 	char buff[4096];
388 	size_t used;
389 
390 	if (NULL == setlocale(LC_ALL, "Russian_Russia") &&
391 	    NULL == setlocale(LC_ALL, "ru_RU.CP1251")) {
392 		skipping("KOI8-R locale not available on this system.");
393 		return;
394 	}
395 
396 	/* Check if the platform completely supports the string conversion. */
397 	a = archive_write_new();
398 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
399 	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
400 		skipping("This system cannot convert character-set"
401 		    " from KOI8-R to UTF-8.");
402 		archive_write_free(a);
403 		return;
404 	}
405 	archive_write_free(a);
406 
407 	/* Re-create a write archive object since filenames should be written
408 	 * in UTF-8 by default. */
409 	a = archive_write_new();
410 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
411 	assertEqualInt(ARCHIVE_OK,
412 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
413 
414 	entry = archive_entry_new2(a);
415 	archive_entry_set_pathname(entry, "\xef\xf0\xe8");
416 	archive_entry_set_filetype(entry, AE_IFREG);
417 	archive_entry_set_size(entry, 0);
418 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
419 	archive_entry_free(entry);
420 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
421 
422 	/* Above three characters in KOI8-R should translate to the following
423 	 * three characters (two bytes each) in UTF-8. */
424 	assertEqualMem(buff + 512, "15 path=\xD0\xBF\xD1\x80\xD0\xB8\x0A", 15);
425 }
426 
427 /*
428  * Verify that EUC-JP filenames are correctly translated to Unicode and UTF-8.
429  */
430 DEFINE_TEST(test_pax_filename_encoding_EUCJP)
431 {
432   	struct archive *a;
433   	struct archive_entry *entry;
434 	char buff[4096];
435 	size_t used;
436 
437 	if (NULL == setlocale(LC_ALL, "ja_JP.eucJP")) {
438 		skipping("eucJP locale not available on this system.");
439 		return;
440 	}
441 
442 	/* Check if the platform completely supports the string conversion. */
443 	a = archive_write_new();
444 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
445 	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
446 		skipping("This system cannot convert character-set"
447 		    " from eucJP to UTF-8.");
448 		archive_write_free(a);
449 		return;
450 	}
451 	archive_write_free(a);
452 
453 	/* Re-create a write archive object since filenames should be written
454 	 * in UTF-8 by default. */
455 	a = archive_write_new();
456 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
457 	assertEqualInt(ARCHIVE_OK,
458 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
459 
460 	entry = archive_entry_new2(a);
461 	archive_entry_set_pathname(entry, "\xC9\xBD.txt");
462 	/* Check the Unicode version. */
463 	archive_entry_set_filetype(entry, AE_IFREG);
464 	archive_entry_set_size(entry, 0);
465 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
466 	archive_entry_free(entry);
467 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
468 
469 	/* Check UTF-8 version. */
470 	assertEqualMem(buff + 512, "16 path=\xE8\xA1\xA8.txt\x0A", 16);
471 
472 }
473 
474 /*
475  * Verify that CP932/SJIS filenames are correctly translated to Unicode and UTF-8.
476  */
477 DEFINE_TEST(test_pax_filename_encoding_CP932)
478 {
479   	struct archive *a;
480   	struct archive_entry *entry;
481 	char buff[4096];
482 	size_t used;
483 
484 	if (NULL == setlocale(LC_ALL, "Japanese_Japan") &&
485 	    NULL == setlocale(LC_ALL, "ja_JP.SJIS")) {
486 		skipping("eucJP locale not available on this system.");
487 		return;
488 	}
489 
490 	/* Check if the platform completely supports the string conversion. */
491 	a = archive_write_new();
492 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
493 	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
494 		skipping("This system cannot convert character-set"
495 		    " from CP932/SJIS to UTF-8.");
496 		archive_write_free(a);
497 		return;
498 	}
499 	archive_write_free(a);
500 
501 	/* Re-create a write archive object since filenames should be written
502 	 * in UTF-8 by default. */
503 	a = archive_write_new();
504 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
505 	assertEqualInt(ARCHIVE_OK,
506 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
507 
508 	entry = archive_entry_new2(a);
509 	archive_entry_set_pathname(entry, "\x95\x5C.txt");
510 	/* Check the Unicode version. */
511 	archive_entry_set_filetype(entry, AE_IFREG);
512 	archive_entry_set_size(entry, 0);
513 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
514 	archive_entry_free(entry);
515 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
516 
517 	/* Check UTF-8 version. */
518 	assertEqualMem(buff + 512, "16 path=\xE8\xA1\xA8.txt\x0A", 16);
519 
520 }
521 
522 /*
523  * Verify that KOI8-R filenames are not translated to Unicode and UTF-8
524  * when using hdrcharset=BINARY option.
525  */
526 DEFINE_TEST(test_pax_filename_encoding_KOI8R_BINARY)
527 {
528   	struct archive *a;
529   	struct archive_entry *entry;
530 	char buff[4096];
531 	size_t used;
532 
533 	if (NULL == setlocale(LC_ALL, "ru_RU.KOI8-R")) {
534 		skipping("KOI8-R locale not available on this system.");
535 		return;
536 	}
537 
538 	a = archive_write_new();
539 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
540 	/* BINARY mode should be accepted. */
541 	assertEqualInt(ARCHIVE_OK,
542 	    archive_write_set_options(a, "hdrcharset=BINARY"));
543 	assertEqualInt(ARCHIVE_OK,
544 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
545 
546 	entry = archive_entry_new2(a);
547 	archive_entry_set_pathname(entry, "\xD0\xD2\xC9");
548 	archive_entry_set_filetype(entry, AE_IFREG);
549 	archive_entry_set_size(entry, 0);
550 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
551 	archive_entry_free(entry);
552 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
553 
554 	/* "hdrcharset=BINARY" pax attribute should be written. */
555 	assertEqualMem(buff + 512, "21 hdrcharset=BINARY\x0A", 21);
556 	/* Above three characters in KOI8-R should not translate to any
557 	 * character-set. */
558 	assertEqualMem(buff + 512+21, "12 path=\xD0\xD2\xC9\x0A", 12);
559 }
560 
561 /*
562  * Pax format writer only accepts both BINARY and UTF-8.
563  * If other character-set name is specified, you will get ARCHIVE_FAILED.
564  */
565 DEFINE_TEST(test_pax_filename_encoding_KOI8R_CP1251)
566 {
567   	struct archive *a;
568 
569 	if (NULL == setlocale(LC_ALL, "ru_RU.KOI8-R")) {
570 		skipping("KOI8-R locale not available on this system.");
571 		return;
572 	}
573 
574 	a = archive_write_new();
575 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
576 	/* pax format writer only accepts both BINARY and UTF-8. */
577 	assertEqualInt(ARCHIVE_FAILED,
578 	    archive_write_set_options(a, "hdrcharset=CP1251"));
579 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
580 }
581 
/*
 * Verify that unicode filenames are correctly preserved on Windows.
 *
 * Four headers (file, directory, symlink, hardlink) are written from
 * wide-character names through a single reused archive_entry.  The raw
 * archive bytes are then checked block by block (each block is 512
 * bytes), and finally the archive is read back to compare the
 * wide-character names.  Skipped everywhere except native Windows.
 */
DEFINE_TEST(test_pax_filename_encoding_UTF16_win)
{
#if !defined(_WIN32) || defined(__CYGWIN__)
	skipping("This test is meant to verify unicode string handling"
		" on Windows with UTF-16 names");
	return;
#else
	struct archive *a;
	struct archive_entry *entry;
	char buff[0x2000];
	char *p;
	size_t used;

	/*
	 * Don't call setlocale because we're verifying that the '_w' functions
	 * work as expected when 'hdrcharset' is UTF-8
	 */

	/* Check if the platform completely supports the string conversion. */
	a = archive_write_new();
	assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
		skipping("This system cannot convert character-set"
		    " from UTF-16 to UTF-8.");
		archive_write_free(a);
		return;
	}
	archive_write_free(a);

	/*
	 * Create a new archive handle with default charset handling
	 */
	a = archive_write_new();
	assertEqualInt(ARCHIVE_OK, archive_write_set_format_pax(a));
	assertEqualInt(ARCHIVE_OK,
	    archive_write_open_memory(a, buff, sizeof(buff), &used));

	/*
	 * NOTE: the same entry object is reused for all four headers
	 * without being cleared, so fields set for one part remain set
	 * for the following parts unless overwritten.
	 */

	/* Part 1: file */
	entry = archive_entry_new2(a);
	archive_entry_copy_pathname_w(entry, L"\u4f60\u597d.txt");
	archive_entry_set_filetype(entry, AE_IFREG);
	archive_entry_set_size(entry, 0);
	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));

	/* Part 2: directory */
	/* NOTE: Explicitly not adding trailing slash to test that code path */
	archive_entry_copy_pathname_w(entry, L"\u043f\u0440\u0438");
	archive_entry_set_filetype(entry, AE_IFDIR);
	archive_entry_set_size(entry, 0);
	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));

	/* Part 3: symlink */
	archive_entry_copy_pathname_w(entry, L"\u518d\u89c1.txt");
	archive_entry_copy_symlink_w(entry, L"\u4f60\u597d.txt");
	archive_entry_set_filetype(entry, AE_IFLNK);
	archive_entry_set_symlink_type(entry, AE_SYMLINK_TYPE_FILE);
	archive_entry_set_size(entry, 0);
	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));

	/* Part 4: hardlink */
	archive_entry_copy_pathname_w(entry, L"\u665a\u5b89.txt");
	archive_entry_copy_hardlink_w(entry, L"\u4f60\u597d.txt");
	archive_entry_set_filetype(entry, AE_IFREG);
	archive_entry_set_size(entry, 0);
	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));

	archive_entry_free(entry);
	assertEqualInt(ARCHIVE_OK, archive_write_free(a));

	/*
	 * Examine the bytes to ensure the filenames ended up UTF-8
	 * encoded as we expect.  Each part occupies three 512-byte
	 * blocks: pax extension header, pax extension body, ustar header.
	 * Offset 156 in a header block is the type flag; offset 157 is
	 * where the link name is checked.
	 */

	/* Part 1: file */
	p = buff + 0;
	assertEqualString(p + 0, "PaxHeader/\xE4\xBD\xA0\xE5\xA5\xBD.txt"); /* File name */
	assertEqualInt(p[156], 'x'); /* Pax extension header */
	p += 512; /* Pax extension body */
	assertEqualString(p + 0, "19 path=\xE4\xBD\xA0\xE5\xA5\xBD.txt\n");
	p += 512; /* Ustar header */
	assertEqualString(p + 0, "\xE4\xBD\xA0\xE5\xA5\xBD.txt"); /* File name */
	assertEqualInt(p[156], '0');

	/* Part 2: directory */
	p += 512; /* Pax extension header */
	assertEqualString(p + 0, "PaxHeader/\xD0\xBF\xD1\x80\xD0\xB8"); /* File name */
	assertEqualInt(p[156], 'x');
	p += 512; /* Pax extension body */
	assertEqualString(p + 0, "16 path=\xD0\xBF\xD1\x80\xD0\xB8/\n");
	p += 512; /* Ustar header */
	assertEqualString(p + 0, "\xD0\xBF\xD1\x80\xD0\xB8/"); /* File name */
	assertEqualInt(p[156], '5'); /* directory */

	/* Part 3: symlink */
	p += 512; /* Pax Extension Header */
	assertEqualString(p + 0, "PaxHeader/\xE5\x86\x8D\xE8\xA7\x81.txt"); /* File name */
	p += 512; /* Pax extension body */
	assertEqualString(p + 0,
			  "19 path=\xE5\x86\x8D\xE8\xA7\x81.txt\n"
			  "23 linkpath=\xE4\xBD\xA0\xE5\xA5\xBD.txt\n"
			  "31 LIBARCHIVE.symlinktype=file\n");
	p += 512; /* Ustar header */
	assertEqualString(p + 0, "\xE5\x86\x8D\xE8\xA7\x81.txt"); /* File name */
	assertEqualInt(p[156], '2'); /* symlink */
	assertEqualString(p + 157, "\xE4\xBD\xA0\xE5\xA5\xBD.txt"); /* link name */

	/* Part 4: hardlink */
	/* The symlinktype record is still expected here, presumably because
	 * the reused entry keeps the symlink type set in Part 3. */
	p += 512; /* Pax extension header */
	assertEqualString(p + 0, "PaxHeader/\xE6\x99\x9A\xE5\xAE\x89.txt"); /* File name */
	p += 512; /* Pax extension body */
	assertEqualString(p + 0,
			  "19 path=\xE6\x99\x9A\xE5\xAE\x89.txt\n"
			  "23 linkpath=\xE4\xBD\xA0\xE5\xA5\xBD.txt\n"
			  "31 LIBARCHIVE.symlinktype=file\n");
	p += 512; /* Ustar header */
	assertEqualString(p + 0, "\xE6\x99\x9A\xE5\xAE\x89.txt"); /* File name */
	assertEqualInt(p[156], '1'); /* hard link */
	assertEqualString(p + 157, "\xE4\xBD\xA0\xE5\xA5\xBD.txt"); /* link name */

	/*
	 * Read back the archive to see if we get the original names.
	 * Reader setup is asserted so that a setup failure is reported
	 * at its source rather than as a confusing header mismatch.
	 */
	assert((a = archive_read_new()) != NULL);
	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_all(a));
	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_filter_all(a));
	assertEqualInt(0, archive_read_open_memory(a, buff, used));

	/* Read part 1: file */
	assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &entry));
	assertEqualWString(L"\u4f60\u597d.txt", archive_entry_pathname_w(entry));

	/* Read part 2: directory */
	assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &entry));
	assertEqualWString(L"\u043f\u0440\u0438/", archive_entry_pathname_w(entry));

	/* Read part 3: symlink */
	assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &entry));
	assertEqualWString(L"\u518d\u89c1.txt", archive_entry_pathname_w(entry));
	assertEqualWString(L"\u4f60\u597d.txt", archive_entry_symlink_w(entry));

	/* Read part 4: hardlink */
	assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &entry));
	assertEqualWString(L"\u665a\u5b89.txt", archive_entry_pathname_w(entry));
	assertEqualWString(L"\u4f60\u597d.txt", archive_entry_hardlink_w(entry));

	/* Tear down the reader and verify it shuts down cleanly. */
	assertEqualIntA(a, ARCHIVE_OK, archive_read_close(a));
	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
#endif
}
734 
/*
 * Aggregate test: runs the three static sub-tests of this file in
 * order.  Sub-test 2 may change the process locale via setlocale().
 */
DEFINE_TEST(test_pax_filename_encoding)
{
	/* Reading binary (non-UTF-8) names from a reference archive. */
	test_pax_filename_encoding_1();
	/* Writing and re-reading names that cannot be converted. */
	test_pax_filename_encoding_2();
	/* Wide-character names under the "C" locale. */
	test_pax_filename_encoding_3();
}
741