xref: /freebsd-src/contrib/libarchive/libarchive/test/test_ustar_filename_encoding.c (revision f0bd5302dd9e20355beadd0f260ffb926b6ac164)
1 /*-
2  * Copyright (c) 2011 Michihiro NAKAJIMA
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
15  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
16  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
17  * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
18  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
19  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
23  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24  */
25 #include "test.h"
26 __FBSDID("$FreeBSD$");
27 
28 #include <locale.h>
29 
30 static void
31 test_ustar_filename_encoding_UTF8_CP866(void)
32 {
33   	struct archive *a;
34   	struct archive_entry *entry;
35 	char buff[4096];
36 	size_t used;
37 
38 	if (NULL == setlocale(LC_ALL, "en_US.UTF-8")) {
39 		skipping("en_US.UTF-8 locale not available on this system.");
40 		return;
41 	}
42 
43 	/*
44 	 * Verify that UTF-8 filenames are correctly translated into CP866
45 	 * and stored with hdrcharset=CP866 option.
46 	 */
47 	a = archive_write_new();
48 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_ustar(a));
49 	if (archive_write_set_options(a, "hdrcharset=CP866") != ARCHIVE_OK) {
50 		skipping("This system cannot convert character-set"
51 		    " from UTF-8 to CP866.");
52 		archive_write_free(a);
53 		return;
54 	}
55 	assertEqualInt(ARCHIVE_OK,
56 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
57 
58 	entry = archive_entry_new2(a);
59 	/* Set a UTF-8 filename. */
60 	archive_entry_set_pathname(entry, "\xD0\xBF\xD1\x80\xD0\xB8");
61 	archive_entry_set_filetype(entry, AE_IFREG);
62 	archive_entry_set_size(entry, 0);
63 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
64 	archive_entry_free(entry);
65 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
66 
67 	/* Above three characters in UTF-8 should translate to the following
68 	 * three characters in CP866. */
69 	assertEqualMem(buff, "\xAF\xE0\xA8", 3);
70 }
71 
72 static void
73 test_ustar_filename_encoding_KOI8R_UTF8(void)
74 {
75   	struct archive *a;
76   	struct archive_entry *entry;
77 	char buff[4096];
78 	size_t used;
79 
80 	if (NULL == setlocale(LC_ALL, "ru_RU.KOI8-R")) {
81 		skipping("KOI8-R locale not available on this system.");
82 		return;
83 	}
84 
85 	/*
86 	 * Verify that KOI8-R filenames are correctly translated into UTF-8
87 	 * and stored with hdrcharset=UTF-8 option.
88 	 */
89 	a = archive_write_new();
90 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_ustar(a));
91 	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
92 		skipping("This system cannot convert character-set"
93 		    " from KOI8-R to UTF-8.");
94 		archive_write_free(a);
95 		return;
96 	}
97 	assertEqualInt(ARCHIVE_OK,
98 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
99 
100 	entry = archive_entry_new2(a);
101 	/* Set a KOI8-R filename. */
102 	archive_entry_set_pathname(entry, "\xD0\xD2\xC9");
103 	archive_entry_set_filetype(entry, AE_IFREG);
104 	archive_entry_set_size(entry, 0);
105 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
106 	archive_entry_free(entry);
107 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
108 
109 	/* Above three characters in KOI8-R should translate to the following
110 	 * three characters (two bytes each) in UTF-8. */
111 	assertEqualMem(buff, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
112 }
113 
114 static void
115 test_ustar_filename_encoding_KOI8R_CP866(void)
116 {
117   	struct archive *a;
118   	struct archive_entry *entry;
119 	char buff[4096];
120 	size_t used;
121 
122 	if (NULL == setlocale(LC_ALL, "ru_RU.KOI8-R")) {
123 		skipping("KOI8-R locale not available on this system.");
124 		return;
125 	}
126 
127 	/*
128 	 * Verify that KOI8-R filenames are correctly translated into CP866
129 	 * and stored with hdrcharset=CP866 option.
130 	 */
131 	a = archive_write_new();
132 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_ustar(a));
133 	if (archive_write_set_options(a, "hdrcharset=CP866") != ARCHIVE_OK) {
134 		skipping("This system cannot convert character-set"
135 		    " from KOI8-R to CP866.");
136 		archive_write_free(a);
137 		return;
138 	}
139 	assertEqualInt(ARCHIVE_OK,
140 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
141 
142 	entry = archive_entry_new2(a);
143 	/* Set a KOI8-R filename. */
144 	archive_entry_set_pathname(entry, "\xD0\xD2\xC9");
145 	archive_entry_set_filetype(entry, AE_IFREG);
146 	archive_entry_set_size(entry, 0);
147 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
148 	archive_entry_free(entry);
149 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
150 
151 	/* Above three characters in KOI8-R should translate to the following
152 	 * three characters in CP866. */
153 	assertEqualMem(buff, "\xAF\xE0\xA8", 3);
154 }
155 
156 static void
157 test_ustar_filename_encoding_CP1251_UTF8(void)
158 {
159   	struct archive *a;
160   	struct archive_entry *entry;
161 	char buff[4096];
162 	size_t used;
163 
164 	if (NULL == setlocale(LC_ALL, "Russian_Russia") &&
165 	    NULL == setlocale(LC_ALL, "ru_RU.CP1251")) {
166 		skipping("KOI8-R locale not available on this system.");
167 		return;
168 	}
169 
170 	/*
171 	 * Verify that CP1251 filenames are correctly translated into UTF-8
172 	 * and stored with hdrcharset=UTF-8 option.
173 	 */
174 	a = archive_write_new();
175 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_ustar(a));
176 	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
177 		skipping("This system cannot convert character-set"
178 		    " from KOI8-R to UTF-8.");
179 		archive_write_free(a);
180 		return;
181 	}
182 	assertEqualInt(ARCHIVE_OK,
183 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
184 
185 	entry = archive_entry_new2(a);
186 	/* Set a KOI8-R filename. */
187 	archive_entry_set_pathname(entry, "\xEF\xF0\xE8");
188 	archive_entry_set_filetype(entry, AE_IFREG);
189 	archive_entry_set_size(entry, 0);
190 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
191 	archive_entry_free(entry);
192 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
193 
194 	/* Above three characters in CP1251 should translate to the following
195 	 * three characters (two bytes each) in UTF-8. */
196 	assertEqualMem(buff, "\xD0\xBF\xD1\x80\xD0\xB8", 6);
197 }
198 
199 /*
200  * Do not translate CP1251 into CP866 if non Windows platform.
201  */
202 static void
203 test_ustar_filename_encoding_ru_RU_CP1251(void)
204 {
205   	struct archive *a;
206   	struct archive_entry *entry;
207 	char buff[4096];
208 	size_t used;
209 
210 	if (NULL == setlocale(LC_ALL, "ru_RU.CP1251")) {
211 		skipping("KOI8-R locale not available on this system.");
212 		return;
213 	}
214 
215 	/*
216 	 * Verify that CP1251 filenames are not translated into any
217 	 * other character-set, in particular, CP866.
218 	 */
219 	a = archive_write_new();
220 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_ustar(a));
221 	assertEqualInt(ARCHIVE_OK,
222 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
223 
224 	entry = archive_entry_new2(a);
225 	/* Set a KOI8-R filename. */
226 	archive_entry_set_pathname(entry, "\xEF\xF0\xE8");
227 	archive_entry_set_filetype(entry, AE_IFREG);
228 	archive_entry_set_size(entry, 0);
229 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
230 	archive_entry_free(entry);
231 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
232 
233 	/* Above three characters in CP1251 should not translate to
234 	 * any other character-set. */
235 	assertEqualMem(buff, "\xEF\xF0\xE8", 3);
236 }
237 
238 /*
239  * Other archiver applications on Windows translate CP1251 filenames
240  * into CP866 filenames and store it in the ustar file.
241  * Test above behavior works well.
242  */
243 static void
244 test_ustar_filename_encoding_Russian_Russia(void)
245 {
246   	struct archive *a;
247   	struct archive_entry *entry;
248 	char buff[4096];
249 	size_t used;
250 
251 	if (NULL == setlocale(LC_ALL, "Russian_Russia")) {
252 		skipping("Russian_Russia locale not available on this system.");
253 		return;
254 	}
255 
256 	/*
257 	 * Verify that Russian_Russia(CP1251) filenames are correctly translated
258 	 * to CP866.
259 	 */
260 	a = archive_write_new();
261 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_ustar(a));
262 	assertEqualInt(ARCHIVE_OK,
263 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
264 
265 	entry = archive_entry_new2(a);
266 	/* Set a CP1251 filename. */
267 	archive_entry_set_pathname(entry, "\xEF\xF0\xE8");
268 	archive_entry_set_filetype(entry, AE_IFREG);
269 	archive_entry_set_size(entry, 0);
270 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
271 	archive_entry_free(entry);
272 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
273 
274 	/* Above three characters in CP1251 should translate to the following
275 	 * three characters in CP866. */
276 	assertEqualMem(buff, "\xAF\xE0\xA8", 3);
277 }
278 
279 static void
280 test_ustar_filename_encoding_EUCJP_UTF8(void)
281 {
282   	struct archive *a;
283   	struct archive_entry *entry;
284 	char buff[4096];
285 	size_t used;
286 
287 	if (NULL == setlocale(LC_ALL, "ja_JP.eucJP")) {
288 		skipping("eucJP locale not available on this system.");
289 		return;
290 	}
291 
292 	/*
293 	 * Verify that EUC-JP filenames are correctly translated to UTF-8.
294 	 */
295 	a = archive_write_new();
296 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_ustar(a));
297 	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
298 		skipping("This system cannot convert character-set"
299 		    " from eucJP to UTF-8.");
300 		archive_write_free(a);
301 		return;
302 	}
303 	assertEqualInt(ARCHIVE_OK,
304 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
305 
306 	entry = archive_entry_new2(a);
307 	/* Set an EUC-JP filename. */
308 	archive_entry_set_pathname(entry, "\xC9\xBD.txt");
309 	/* Check the Unicode version. */
310 	archive_entry_set_filetype(entry, AE_IFREG);
311 	archive_entry_set_size(entry, 0);
312 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
313 	archive_entry_free(entry);
314 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
315 
316 	/* Check UTF-8 version. */
317 	assertEqualMem(buff, "\xE8\xA1\xA8.txt", 7);
318 }
319 
320 static void
321 test_ustar_filename_encoding_EUCJP_CP932(void)
322 {
323   	struct archive *a;
324   	struct archive_entry *entry;
325 	char buff[4096];
326 	size_t used;
327 
328 	if (NULL == setlocale(LC_ALL, "ja_JP.eucJP")) {
329 		skipping("eucJP locale not available on this system.");
330 		return;
331 	}
332 
333 	/*
334 	 * Verify that EUC-JP filenames are correctly translated to CP932.
335 	 */
336 	a = archive_write_new();
337 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_ustar(a));
338 	if (archive_write_set_options(a, "hdrcharset=CP932") != ARCHIVE_OK) {
339 		skipping("This system cannot convert character-set"
340 		    " from eucJP to CP932.");
341 		archive_write_free(a);
342 		return;
343 	}
344 	assertEqualInt(ARCHIVE_OK,
345 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
346 
347 	entry = archive_entry_new2(a);
348 	/* Set an EUC-JP filename. */
349 	archive_entry_set_pathname(entry, "\xC9\xBD.txt");
350 	/* Check the Unicode version. */
351 	archive_entry_set_filetype(entry, AE_IFREG);
352 	archive_entry_set_size(entry, 0);
353 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
354 	archive_entry_free(entry);
355 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
356 
357 	/* Check CP932 version. */
358 	assertEqualMem(buff, "\x95\x5C.txt", 6);
359 }
360 
361 static void
362 test_ustar_filename_encoding_CP932_UTF8(void)
363 {
364   	struct archive *a;
365   	struct archive_entry *entry;
366 	char buff[4096];
367 	size_t used;
368 
369 	if (NULL == setlocale(LC_ALL, "Japanese_Japan") &&
370 	    NULL == setlocale(LC_ALL, "ja_JP.SJIS")) {
371 		skipping("CP932/SJIS locale not available on this system.");
372 		return;
373 	}
374 
375 	/*
376 	 * Verify that CP932/SJIS filenames are correctly translated to UTF-8.
377 	 */
378 	a = archive_write_new();
379 	assertEqualInt(ARCHIVE_OK, archive_write_set_format_ustar(a));
380 	if (archive_write_set_options(a, "hdrcharset=UTF-8") != ARCHIVE_OK) {
381 		skipping("This system cannot convert character-set"
382 		    " from CP932/SJIS to UTF-8.");
383 		archive_write_free(a);
384 		return;
385 	}
386 	assertEqualInt(ARCHIVE_OK,
387 	    archive_write_open_memory(a, buff, sizeof(buff), &used));
388 
389 	entry = archive_entry_new2(a);
390 	/* Set a CP932/SJIS filename. */
391 	archive_entry_set_pathname(entry, "\x95\x5C.txt");
392 	/* Check the Unicode version. */
393 	archive_entry_set_filetype(entry, AE_IFREG);
394 	archive_entry_set_size(entry, 0);
395 	assertEqualInt(ARCHIVE_OK, archive_write_header(a, entry));
396 	archive_entry_free(entry);
397 	assertEqualInt(ARCHIVE_OK, archive_write_free(a));
398 
399 	/* Check UTF-8 version. */
400 	assertEqualMem(buff, "\xE8\xA1\xA8.txt", 7);
401 }
402 
403 DEFINE_TEST(test_ustar_filename_encoding)
404 {
405 	test_ustar_filename_encoding_UTF8_CP866();
406 	test_ustar_filename_encoding_KOI8R_UTF8();
407 	test_ustar_filename_encoding_KOI8R_CP866();
408 	test_ustar_filename_encoding_CP1251_UTF8();
409 	test_ustar_filename_encoding_ru_RU_CP1251();
410 	test_ustar_filename_encoding_Russian_Russia();
411 	test_ustar_filename_encoding_EUCJP_UTF8();
412 	test_ustar_filename_encoding_EUCJP_CP932();
413 	test_ustar_filename_encoding_CP932_UTF8();
414 }
415