xref: /netbsd-src/usr.bin/gzip/unxz.c (revision 82d56013d7b633d116a93943de88e08335357a7c)
1 /*	$NetBSD: unxz.c,v 1.8 2018/10/06 16:36:45 martin Exp $	*/
2 
3 /*-
4  * Copyright (c) 2011 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Christos Zoulas.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 #include <sys/cdefs.h>
32 __RCSID("$NetBSD: unxz.c,v 1.8 2018/10/06 16:36:45 martin Exp $");
33 
34 #include <stdarg.h>
35 #include <errno.h>
36 #include <stdio.h>
37 #include <unistd.h>
38 #include <lzma.h>
39 
40 static off_t
41 unxz(int i, int o, char *pre, size_t prelen, off_t *bytes_in)
42 {
43 	lzma_stream strm = LZMA_STREAM_INIT;
44 	static const int flags = LZMA_TELL_UNSUPPORTED_CHECK|LZMA_CONCATENATED;
45 	lzma_ret ret;
46 	lzma_action action = LZMA_RUN;
47 	off_t bytes_out, bp;
48 	uint8_t ibuf[BUFSIZ];
49 	uint8_t obuf[BUFSIZ];
50 
51 	if (bytes_in == NULL)
52 		bytes_in = &bp;
53 
54 	strm.next_in = ibuf;
55 	memcpy(ibuf, pre, prelen);
56 	strm.avail_in = read(i, ibuf + prelen, sizeof(ibuf) - prelen);
57 	if (strm.avail_in == (size_t)-1)
58 		maybe_err("read failed");
59 	infile_newdata(strm.avail_in);
60 	strm.avail_in += prelen;
61 	*bytes_in = strm.avail_in;
62 
63 	if ((ret = lzma_stream_decoder(&strm, UINT64_MAX, flags)) != LZMA_OK)
64 		maybe_errx("Can't initialize decoder (%d)", ret);
65 
66 	strm.next_out = NULL;
67 	strm.avail_out = 0;
68 	if ((ret = lzma_code(&strm, LZMA_RUN)) != LZMA_OK)
69 		maybe_errx("Can't read headers (%d)", ret);
70 
71 	bytes_out = 0;
72 	strm.next_out = obuf;
73 	strm.avail_out = sizeof(obuf);
74 
75 	for (;;) {
76 		check_siginfo();
77 		if (strm.avail_in == 0) {
78 			strm.next_in = ibuf;
79 			strm.avail_in = read(i, ibuf, sizeof(ibuf));
80 			switch (strm.avail_in) {
81 			case (size_t)-1:
82 				maybe_err("read failed");
83 				/*NOTREACHED*/
84 			case 0:
85 				action = LZMA_FINISH;
86 				break;
87 			default:
88 				infile_newdata(strm.avail_in);
89 				*bytes_in += strm.avail_in;
90 				break;
91 			}
92 		}
93 
94 		ret = lzma_code(&strm, action);
95 
96 		// Write and check write error before checking decoder error.
97 		// This way as much data as possible gets written to output
98 		// even if decoder detected an error.
99 		if (strm.avail_out == 0 || ret != LZMA_OK) {
100 			const size_t write_size = sizeof(obuf) - strm.avail_out;
101 
102 			if (write(o, obuf, write_size) != (ssize_t)write_size)
103 				maybe_err("write failed");
104 
105 			strm.next_out = obuf;
106 			strm.avail_out = sizeof(obuf);
107 			bytes_out += write_size;
108 		}
109 
110 		if (ret != LZMA_OK) {
111 			if (ret == LZMA_STREAM_END) {
112 				// Check that there's no trailing garbage.
113 				if (strm.avail_in != 0 || read(i, ibuf, 1))
114 					ret = LZMA_DATA_ERROR;
115 				else {
116 					lzma_end(&strm);
117 					return bytes_out;
118 				}
119 			}
120 
121 			const char *msg;
122 			switch (ret) {
123 			case LZMA_MEM_ERROR:
124 				msg = strerror(ENOMEM);
125 				break;
126 
127 			case LZMA_FORMAT_ERROR:
128 				msg = "File format not recognized";
129 				break;
130 
131 			case LZMA_OPTIONS_ERROR:
132 				// FIXME: Better message?
133 				msg = "Unsupported compression options";
134 				break;
135 
136 			case LZMA_DATA_ERROR:
137 				msg = "File is corrupt";
138 				break;
139 
140 			case LZMA_BUF_ERROR:
141 				msg = "Unexpected end of input";
142 				break;
143 
144 			case LZMA_MEMLIMIT_ERROR:
145 				msg = "Reached memory limit";
146 				break;
147 
148 			default:
149 				maybe_errx("Unknown error (%d)", ret);
150 				break;
151 			}
152 			maybe_errx("%s", msg);
153 
154 		}
155 	}
156 }
157 
158 #include <stdbool.h>
159 
160 /*
161  * Copied various bits and pieces from xz support code or brute force
162  * replacements.
163  */
164 
/* Smaller of two values.  This is a macro: an argument may be evaluated twice. */
#define	my_min(A,B)	(((A) < (B)) ? (A) : (B))

/*
 * Size in bytes of an io_buf.  A BUFSIZ of 1024 or less is considered too
 * small and replaced by 8192; otherwise BUFSIZ is rounded down to a
 * multiple of 8 (sizeof(uint64_t)) so that the 64-bit view of the buffer
 * covers it exactly.
 */
#if BUFSIZ > 1024
#	define IO_BUFFER_SIZE (BUFSIZ & ~7U)
#else
#	define IO_BUFFER_SIZE 8192
#endif

/*
 * I/O buffer that can be addressed as bytes, 32-bit words or 64-bit words.
 * Putting the three arrays in a union gives the byte array the alignment
 * required by the widest view.
 */
typedef union {
	uint8_t u8[IO_BUFFER_SIZE];
	uint32_t u32[IO_BUFFER_SIZE / sizeof(uint32_t)];
	uint64_t u64[IO_BUFFER_SIZE / sizeof(uint64_t)];
} io_buf;
182 
183 
184 static bool
185 io_pread(int fd, io_buf *buf, size_t size, off_t pos)
186 {
187 	// Using lseek() and read() is more portable than pread() and
188 	// for us it is as good as real pread().
189 	if (lseek(fd, pos, SEEK_SET) != pos) {
190 		return true;
191 	}
192 
193 	const size_t amount = read(fd, buf, size);
194 	if (amount == SIZE_MAX)
195 		return true;
196 
197 	if (amount != size) {
198 		return true;
199 	}
200 
201 	return false;
202 }
203 
204 /*
205  * Most of the following is copied (mostly verbatim) from the xz
206  * distribution, from file src/xz/list.c
207  */
208 
209 ///////////////////////////////////////////////////////////////////////////////
210 //
211 /// \file       list.c
212 /// \brief      Listing information about .xz files
213 //
214 //  Author:     Lasse Collin
215 //
216 //  This file has been put into the public domain.
217 //  You can do whatever you want with this file.
218 //
219 ///////////////////////////////////////////////////////////////////////////////
220 
221 
222 /// Information about a .xz file
223 typedef struct {
224 	/// Combined Index of all Streams in the file
225 	lzma_index *idx;
226 
227 	/// Total amount of Stream Padding
228 	uint64_t stream_padding;
229 
230 	/// Highest memory usage so far
231 	uint64_t memusage_max;
232 
233 	/// True if all Blocks so far have Compressed Size and
234 	/// Uncompressed Size fields
235 	bool all_have_sizes;
236 
237 	/// Oldest XZ Utils version that will decompress the file
238 	uint32_t min_version;
239 
240 } xz_file_info;
241 
242 #define XZ_FILE_INFO_INIT { NULL, 0, 0, true, 50000002 }
243 
244 
245 /// \brief      Parse the Index(es) from the given .xz file
246 ///
247 /// \param      xfi     Pointer to structure where the decoded information
248 ///                     is stored.
249 /// \param      pair    Input file
250 ///
251 /// \return     On success, false is returned. On error, true is returned.
252 ///
253 // TODO: This function is pretty big. liblzma should have a function that
254 // takes a callback function to parse the Index(es) from a .xz file to make
255 // it easy for applications.
256 static bool
257 parse_indexes(xz_file_info *xfi, int src_fd)
258 {
259 	struct stat st;
260 
261 	fstat(src_fd, &st);
262 	if (st.st_size <= 0) {
263 		return true;
264 	}
265 
266 	if (st.st_size < 2 * LZMA_STREAM_HEADER_SIZE) {
267 		return true;
268 	}
269 
270 	io_buf buf;
271 	lzma_stream_flags header_flags;
272 	lzma_stream_flags footer_flags;
273 	lzma_ret ret;
274 
275 	// lzma_stream for the Index decoder
276 	lzma_stream strm = LZMA_STREAM_INIT;
277 
278 	// All Indexes decoded so far
279 	lzma_index *combined_index = NULL;
280 
281 	// The Index currently being decoded
282 	lzma_index *this_index = NULL;
283 
284 	// Current position in the file. We parse the file backwards so
285 	// initialize it to point to the end of the file.
286 	off_t pos = st.st_size;
287 
288 	// Each loop iteration decodes one Index.
289 	do {
290 		// Check that there is enough data left to contain at least
291 		// the Stream Header and Stream Footer. This check cannot
292 		// fail in the first pass of this loop.
293 		if (pos < 2 * LZMA_STREAM_HEADER_SIZE) {
294 			goto error;
295 		}
296 
297 		pos -= LZMA_STREAM_HEADER_SIZE;
298 		lzma_vli stream_padding = 0;
299 
300 		// Locate the Stream Footer. There may be Stream Padding which
301 		// we must skip when reading backwards.
302 		while (true) {
303 			if (pos < LZMA_STREAM_HEADER_SIZE) {
304 				goto error;
305 			}
306 
307 			if (io_pread(src_fd, &buf,
308 					LZMA_STREAM_HEADER_SIZE, pos))
309 				goto error;
310 
311 			// Stream Padding is always a multiple of four bytes.
312 			int i = 2;
313 			if (buf.u32[i] != 0)
314 				break;
315 
316 			// To avoid calling io_pread() for every four bytes
317 			// of Stream Padding, take advantage that we read
318 			// 12 bytes (LZMA_STREAM_HEADER_SIZE) already and
319 			// check them too before calling io_pread() again.
320 			do {
321 				stream_padding += 4;
322 				pos -= 4;
323 				--i;
324 			} while (i >= 0 && buf.u32[i] == 0);
325 		}
326 
327 		// Decode the Stream Footer.
328 		ret = lzma_stream_footer_decode(&footer_flags, buf.u8);
329 		if (ret != LZMA_OK) {
330 			goto error;
331 		}
332 
333 		// Check that the Stream Footer doesn't specify something
334 		// that we don't support. This can only happen if the xz
335 		// version is older than liblzma and liblzma supports
336 		// something new.
337 		//
338 		// It is enough to check Stream Footer. Stream Header must
339 		// match when it is compared against Stream Footer with
340 		// lzma_stream_flags_compare().
341 		if (footer_flags.version != 0) {
342 			goto error;
343 		}
344 
345 		// Check that the size of the Index field looks sane.
346 		lzma_vli index_size = footer_flags.backward_size;
347 		if ((lzma_vli)(pos) < index_size + LZMA_STREAM_HEADER_SIZE) {
348 			goto error;
349 		}
350 
351 		// Set pos to the beginning of the Index.
352 		pos -= index_size;
353 
354 		// Decode the Index.
355 		ret = lzma_index_decoder(&strm, &this_index, UINT64_MAX);
356 		if (ret != LZMA_OK) {
357 			goto error;
358 		}
359 
360 		do {
361 			// Don't give the decoder more input than the
362 			// Index size.
363 			strm.avail_in = my_min(IO_BUFFER_SIZE, index_size);
364 			if (io_pread(src_fd, &buf, strm.avail_in, pos))
365 				goto error;
366 
367 			pos += strm.avail_in;
368 			index_size -= strm.avail_in;
369 
370 			strm.next_in = buf.u8;
371 			ret = lzma_code(&strm, LZMA_RUN);
372 
373 		} while (ret == LZMA_OK);
374 
375 		// If the decoding seems to be successful, check also that
376 		// the Index decoder consumed as much input as indicated
377 		// by the Backward Size field.
378 		if (ret == LZMA_STREAM_END)
379 			if (index_size != 0 || strm.avail_in != 0)
380 				ret = LZMA_DATA_ERROR;
381 
382 		if (ret != LZMA_STREAM_END) {
383 			// LZMA_BUFFER_ERROR means that the Index decoder
384 			// would have liked more input than what the Index
385 			// size should be according to Stream Footer.
386 			// The message for LZMA_DATA_ERROR makes more
387 			// sense in that case.
388 			if (ret == LZMA_BUF_ERROR)
389 				ret = LZMA_DATA_ERROR;
390 
391 			goto error;
392 		}
393 
394 		// Decode the Stream Header and check that its Stream Flags
395 		// match the Stream Footer.
396 		pos -= footer_flags.backward_size + LZMA_STREAM_HEADER_SIZE;
397 		if ((lzma_vli)(pos) < lzma_index_total_size(this_index)) {
398 			goto error;
399 		}
400 
401 		pos -= lzma_index_total_size(this_index);
402 		if (io_pread(src_fd, &buf, LZMA_STREAM_HEADER_SIZE, pos))
403 			goto error;
404 
405 		ret = lzma_stream_header_decode(&header_flags, buf.u8);
406 		if (ret != LZMA_OK) {
407 			goto error;
408 		}
409 
410 		ret = lzma_stream_flags_compare(&header_flags, &footer_flags);
411 		if (ret != LZMA_OK) {
412 			goto error;
413 		}
414 
415 		// Store the decoded Stream Flags into this_index. This is
416 		// needed so that we can print which Check is used in each
417 		// Stream.
418 		ret = lzma_index_stream_flags(this_index, &footer_flags);
419 		if (ret != LZMA_OK)
420 			goto error;
421 
422 		// Store also the size of the Stream Padding field. It is
423 		// needed to show the offsets of the Streams correctly.
424 		ret = lzma_index_stream_padding(this_index, stream_padding);
425 		if (ret != LZMA_OK)
426 			goto error;
427 
428 		if (combined_index != NULL) {
429 			// Append the earlier decoded Indexes
430 			// after this_index.
431 			ret = lzma_index_cat(
432 					this_index, combined_index, NULL);
433 			if (ret != LZMA_OK) {
434 				goto error;
435 			}
436 		}
437 
438 		combined_index = this_index;
439 		this_index = NULL;
440 
441 		xfi->stream_padding += stream_padding;
442 
443 	} while (pos > 0);
444 
445 	lzma_end(&strm);
446 
447 	// All OK. Make combined_index available to the caller.
448 	xfi->idx = combined_index;
449 	return false;
450 
451 error:
452 	// Something went wrong, free the allocated memory.
453 	lzma_end(&strm);
454 	lzma_index_end(combined_index, NULL);
455 	lzma_index_end(this_index, NULL);
456 	return true;
457 }
458 
459 /***************** end of copy from list.c *************************/
460 
461 /*
462  * Small wrapper to extract total length of a file
463  */
464 off_t
465 unxz_len(int fd)
466 {
467 	xz_file_info xfi = XZ_FILE_INFO_INIT;
468 	if (!parse_indexes(&xfi, fd)) {
469 		off_t res = lzma_index_uncompressed_size(xfi.idx);
470 		lzma_index_end(xfi.idx, NULL);
471 		return res;
472 	}
473 	return 0;
474 }
475 
476