xref: /netbsd-src/usr.bin/vndcompress/vndcompress.c (revision d909946ca08dceb44d7d0f22ec9488679695d976)
1 /*	$NetBSD: vndcompress.c,v 1.25 2014/11/18 03:48:17 riastradh Exp $	*/
2 
3 /*-
4  * Copyright (c) 2013 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Taylor R. Campbell.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 __RCSID("$NetBSD: vndcompress.c,v 1.25 2014/11/18 03:48:17 riastradh Exp $");
34 
35 #include <sys/endian.h>
36 
37 #include <assert.h>
38 #include <err.h>
39 #include <errno.h>
40 #include <fcntl.h>
41 #include <inttypes.h>
42 #include <limits.h>
43 #include <signal.h>
44 #include <stdbool.h>
45 #include <stdint.h>
46 #include <stdio.h>
47 #include <stdlib.h>
48 #include <string.h>
49 #include <unistd.h>
50 #include <zlib.h>
51 
52 #include "common.h"
53 #include "offtab.h"
54 #include "utils.h"
55 
56 /*
57  * XXX Switch to control bug-for-bug byte-for-byte compatibility with
58  * NetBSD's vndcompress.
59  */
60 #define	VNDCOMPRESS_COMPAT	0
61 
62 __CTASSERT(sizeof(struct cloop2_header) == CLOOP2_OFFSET_TABLE_OFFSET);
63 
/*
 * State shared between the main compression loop and the signal
 * handlers (via global_state).  Signal handlers consult these fields
 * only after `initialized' has been set; updates that must appear
 * atomic to handlers are done with signals blocked.
 */
struct compress_state {
	uint64_t	size;		/* uncompressed size */
	uint64_t	offset;		/* output byte offset */
	uint32_t	blocksize;	/* bytes per block */
	uint32_t	blkno;		/* input block number */
	uint32_t	n_full_blocks;	/* floor(size/blocksize) */
	uint32_t	n_blocks;	/* ceiling(size/blocksize) */
	uint32_t	n_offsets;	/* n_blocks + 1 */
	uint32_t	end_block;	/* last block to transfer */
	uint32_t	checkpoint_blocks;	/* blocks before checkpoint */
	int		image_fd;	/* input image file descriptor */
	int		cloop2_fd;	/* output cloop2 file descriptor */
	struct offtab	offtab;		/* compressed-block offset table */
	uint32_t	n_checkpointed_blocks;	/* blocks checkpointed so far */
	volatile sig_atomic_t
			initialized;	/* everything above initialized?  */
};
81 
/* Global compression state for SIGINFO handler.  */
static struct compress_state	global_state;

/* Signal number/name pair so handlers can be reported by name.  */
struct sigdesc {
	int sd_signo;		/* signal number */
	const char *sd_name;	/* human-readable name for messages */
};

/* Signals that trigger a progress report on stderr.  */
static const struct sigdesc info_signals[] = {
	{ SIGINFO, "SIGINFO" },
	{ SIGUSR1, "SIGUSR1" },
};

/* Signals that trigger a checkpoint of the offset table.  */
static const struct sigdesc checkpoint_signals[] = {
	{ SIGUSR2, "SIGUSR2" },
};
98 
99 static void	init_signals(void);
100 static void	init_signal_handler(int, const struct sigdesc *, size_t,
101 		    void (*)(int));
102 static void	info_signal_handler(int);
103 static void	checkpoint_signal_handler(int);
104 static void	compress_progress(struct compress_state *);
105 static void	compress_init(int, char **, const struct options *,
106 		    struct compress_state *);
107 static bool	compress_restart(struct compress_state *);
108 static uint32_t	compress_block(int, int, uint32_t, uint32_t, uint32_t, void *,
109 		    void *);
110 static void	compress_maybe_checkpoint(struct compress_state *);
111 static void	compress_checkpoint(struct compress_state *);
112 static void	compress_exit(struct compress_state *);
113 
114 /*
115  * Compression entry point.
116  */
/*
 * Compression entry point: initialize state from the command line,
 * then copy the input image into the output cloop2 file one block at
 * a time, recording each compressed block's end offset in the offset
 * table and checkpointing as requested.  Returns 0 on success; exits
 * via err/errx on failure.
 */
int
vndcompress(int argc, char **argv, const struct options *O)
{
	struct compress_state *const S = &global_state;

	/* Paranoia.  The other fields either have no sentinel or use zero.  */
	S->image_fd = -1;
	S->cloop2_fd = -1;

	/* Set up signal handlers so we can handle SIGINFO ASAP.  */
	init_signals();

	/*
	 * Parse the arguments to initialize our state.
	 */
	compress_init(argc, argv, O, S);
	assert(MIN_BLOCKSIZE <= S->blocksize);
	assert(S->blocksize <= MAX_BLOCKSIZE);

	/*
	 * Allocate compression buffers.
	 *
	 * Compression may actually expand.  From an overabundance of
	 * caution, assume it can expand by at most double.
	 *
	 * XXX Check and consider tightening this assumption.
	 */
	__CTASSERT(MAX_BLOCKSIZE <= SIZE_MAX);
	void *const uncompbuf = malloc(S->blocksize);
	if (uncompbuf == NULL)
		err(1, "malloc uncompressed buffer");

	/* XXX compression ratio bound */
	__CTASSERT(MAX_BLOCKSIZE <= (SIZE_MAX / 2));
	void *const compbuf = malloc(2 * (size_t)S->blocksize);
	if (compbuf == NULL)
		err(1, "malloc compressed buffer");

	/*
	 * Compress the blocks.  S->blkno specifies the input block
	 * we're about to transfer.  S->offset is the current output
	 * offset.
	 */
	while (S->blkno < S->n_blocks) {
		/* Report any progress.  */
		compress_progress(S);

		/* Stop if we've done the requested partial transfer.  */
		if ((0 < S->end_block) && (S->end_block <= S->blkno))
			goto out;

		/* Checkpoint if appropriate.  */
		compress_maybe_checkpoint(S);
		offtab_prepare_put(&S->offtab, (S->blkno + 1));

		/* Choose read size: partial if last block, full if not.  */
		const uint32_t readsize = (S->blkno == S->n_full_blocks?
		    (S->size % S->blocksize) : S->blocksize);
		assert(readsize > 0);
		assert(readsize <= S->blocksize);

		/* Fail noisily if we might be about to overflow.  */
		/* XXX compression ratio bound */
		__CTASSERT(MAX_BLOCKSIZE <= (UINTMAX_MAX / 2));
		assert(S->offset <= MIN(UINT64_MAX, OFF_MAX));
		if ((2 * (uintmax_t)readsize) >
		    (MIN(UINT64_MAX, OFF_MAX) - S->offset))
			errx(1, "blkno %"PRIu32" may overflow: %ju + 2*%ju",
			    S->blkno, (uintmax_t)S->offset,
			    (uintmax_t)readsize);

		/* Process the block.  */
		const uint32_t complen =
		    compress_block(S->image_fd, S->cloop2_fd, S->blkno,
			S->blocksize, readsize, uncompbuf, compbuf);

		/*
		 * Signal-atomically update the state to reflect
		 * (a) what block number we are now at,
		 * (b) how far we are now in the output file, and
		 * (c) where the last block ended.
		 */
		assert(S->blkno <= (UINT32_MAX - 1));
		assert(complen <= (MIN(UINT64_MAX, OFF_MAX) - S->offset));
		assert((S->blkno + 1) < S->n_offsets);
	    {
		sigset_t old_sigmask;
		block_signals(&old_sigmask);
		S->blkno += 1;					/* (a) */
		S->offset += complen;				/* (b) */
		offtab_put(&S->offtab, S->blkno, S->offset);	/* (c) */
		restore_sigmask(&old_sigmask);
	    }
	}

	/* Make sure we're all done. */
	assert(S->blkno == S->n_blocks);
	assert((S->blkno + 1) == S->n_offsets);

	/* Pad to the disk block size.  */
	const uint32_t n_extra = (S->offset % DEV_BSIZE);
	if (n_extra != 0) {
		const uint32_t n_padding = (DEV_BSIZE - n_extra);
		/* Reuse compbuf -- guaranteed to be large enough.  */
		(void)memset(compbuf, 0, n_padding);
		const ssize_t n_written = write(S->cloop2_fd, compbuf,
		    n_padding);
		if (n_written == -1)
			err(1, "write final padding failed");
		assert(n_written >= 0);
		if ((size_t)n_written != n_padding)
			errx(1, "partial write of final padding bytes"
			    ": %zu != %"PRIu32,
			    (size_t)n_written, n_padding);

		/* Account for the extra bytes in the output file.  */
		assert(n_padding <= (MIN(UINT64_MAX, OFF_MAX) - S->offset));
	    {
		sigset_t old_sigmask;
		block_signals(&old_sigmask);
		S->offset += n_padding;
		restore_sigmask(&old_sigmask);
	    }
	}

out:
	/* One last checkpoint to commit the offset table.  */
	assert(S->offset <= OFF_MAX);
	assert((off_t)S->offset == lseek(S->cloop2_fd, 0, SEEK_CUR));
	compress_checkpoint(S);

	/*
	 * Free the compression buffers and finalize the compression.
	 */
	free(compbuf);
	free(uncompbuf);
	compress_exit(S);

	return 0;
}
257 
258 /*
259  * Signal cruft.
260  */
261 
262 static void
263 init_signals(void)
264 {
265 
266 	init_signal_handler(SA_RESTART, info_signals,
267 	    __arraycount(info_signals), &info_signal_handler);
268 	init_signal_handler(SA_RESTART, checkpoint_signals,
269 	    __arraycount(checkpoint_signals), &checkpoint_signal_handler);
270 }
271 
272 static void
273 init_signal_handler(int flags, const struct sigdesc *signals, size_t n,
274     void (*handler)(int))
275 {
276 	static const struct sigaction zero_sa;
277 	struct sigaction sa = zero_sa;
278 	size_t i;
279 
280 	(void)sigemptyset(&sa.sa_mask);
281 	for (i = 0; i < n; i++)
282 		(void)sigaddset(&sa.sa_mask, signals[i].sd_signo);
283 	sa.sa_flags = flags;
284 	sa.sa_handler = handler;
285 	for (i = 0; i < n; i++)
286 		if (sigaction(signals[i].sd_signo, &sa, NULL) == -1)
287 			err(1, "sigaction(%s)", signals[i].sd_name);
288 }
289 
290 static void
291 info_signal_handler(int signo __unused)
292 {
293 	/* Save errno.  */
294 	const int error = errno;
295 	struct compress_state *const S = &global_state;
296 	char buf[128];
297 
298 	/* Bail if the state is not yet initialized.  */
299 	if (!S->initialized) {
300 		warnx_ss("initializing");
301 		goto out;
302 	}
303 
304 	/* Carefully calculate our I/O position.  */
305 	assert(S->blocksize > 0);
306 	__CTASSERT(MAX_N_BLOCKS <= (UINT64_MAX / MAX_BLOCKSIZE));
307 	const uint64_t nread = ((uint64_t)S->blkno * (uint64_t)S->blocksize);
308 
309 	assert(S->n_blocks > 0);
310 	__CTASSERT(CLOOP2_OFFSET_TABLE_OFFSET <=
311 	    (UINT64_MAX / sizeof(uint64_t)));
312 	__CTASSERT(MAX_N_BLOCKS <= ((UINT64_MAX / sizeof(uint64_t)) -
313 		CLOOP2_OFFSET_TABLE_OFFSET));
314 	const uint64_t nwritten = (S->offset <= (CLOOP2_OFFSET_TABLE_OFFSET +
315 		((uint64_t)S->n_blocks * sizeof(uint64_t)))?
316 	    0 : S->offset);
317 
318 	/* snprintf_ss can't do floating-point, so do fixed-point instead.  */
319 	const uint64_t ratio_percent =
320 	    (nread > 0?
321 		((nwritten >= (UINT64_MAX / 100)) ?
322 		    ((nwritten / nread) * 100) : ((nwritten * 100) / nread))
323 		: 0);
324 
325 	/* Format the status.  */
326 	assert(S->n_checkpointed_blocks <= (UINT64_MAX / S->blocksize));
327 	const int n = snprintf_ss(buf, sizeof(buf),
328 	    "vndcompress: read %"PRIu64" bytes, wrote %"PRIu64" bytes, "
329 	    "compression ratio %"PRIu64"%% (checkpointed %"PRIu64" bytes)\n",
330 	    nread, nwritten, ratio_percent,
331 	    ((uint64_t)S->n_checkpointed_blocks * (uint64_t)S->blocksize));
332 	if (n < 0) {
333 		const char msg[] = "vndcompress: can't format info\n";
334 		(void)write(STDERR_FILENO, msg, __arraycount(msg));
335 	} else {
336 		__CTASSERT(INT_MAX <= SIZE_MAX);
337 		(void)write(STDERR_FILENO, buf, (size_t)n);
338 	}
339 
340 out:
341 	/* Restore errno.  */
342 	errno = error;
343 }
344 
345 static void
346 checkpoint_signal_handler(int signo __unused)
347 {
348 	/* Save errno.  */
349 	const int error = errno;
350 	struct compress_state *const S = &global_state;
351 
352 	/* Bail if the state is not yet initialized.  */
353 	if (!S->initialized) {
354 		warnx_ss("nothing to checkpoint yet");
355 		goto out;
356 	}
357 
358 	assert(S->image_fd >= 0);
359 	assert(S->cloop2_fd >= 0);
360 
361 	/* Take a checkpoint.  */
362 	assert(S->blocksize > 0);
363 	assert(S->blkno <= (UINT64_MAX / S->blocksize));
364 	warnx_ss("checkpointing %"PRIu64" bytes",
365 	    ((uint64_t)S->blkno * (uint64_t)S->blocksize));
366 	compress_checkpoint(S);
367 
368 out:
369 	/* Restore errno.  */
370 	errno = error;
371 }
372 
373 /*
374  * Report progress.
375  *
376  * XXX Should do a progress bar here.
377  */
static void
compress_progress(struct compress_state *S __unused)
{
	/* Intentionally empty -- see the XXX above about a progress bar.  */
}
382 
383 /*
384  * Parse arguments, open the files, and initialize the state.
385  */
/*
 * Initialize S from argv (image path, cloop2 path, optional block
 * size) and the option flags in O: open both files, compute the block
 * counts, set up the offset table, and -- if -r was given -- attempt
 * to resume a previous partial transfer.  On return S is fully set up
 * and S->initialized is set.  Exits on any unrecoverable error.
 */
static void
compress_init(int argc, char **argv, const struct options *O,
    struct compress_state *S)
{

	if (!((argc == 2) || (argc == 3)))
		usage();

	const char *const image_pathname = argv[0];
	const char *const cloop2_pathname = argv[1];

	/* Grab the block size either from `-b' or from the last argument.  */
	__CTASSERT(0 < DEV_BSIZE);
	__CTASSERT((MIN_BLOCKSIZE % DEV_BSIZE) == 0);
	__CTASSERT(MIN_BLOCKSIZE <= DEF_BLOCKSIZE);
	__CTASSERT((DEF_BLOCKSIZE % DEV_BSIZE) == 0);
	__CTASSERT(DEF_BLOCKSIZE <= MAX_BLOCKSIZE);
	__CTASSERT((MAX_BLOCKSIZE % DEV_BSIZE) == 0);
	if (ISSET(O->flags, FLAG_b)) {
		if (argc == 3) {
			warnx("use -b or the extra argument, not both");
			usage();
		}
		S->blocksize = O->blocksize;
	} else {
		S->blocksize = (argc == 2? DEF_BLOCKSIZE :
		    strsuftoll("block size", argv[2], MIN_BLOCKSIZE,
			MAX_BLOCKSIZE));
	}

	/* Sanity-check the blocksize.  (strsuftoll guarantees bounds.)  */
	__CTASSERT(DEV_BSIZE <= UINT32_MAX);
	if ((S->blocksize % DEV_BSIZE) != 0)
		errx(1, "bad blocksize: %"PRIu32
		    " (not a multiple of %"PRIu32")",
		    S->blocksize, (uint32_t)DEV_BSIZE);
	assert(MIN_BLOCKSIZE <= S->blocksize);
	assert((S->blocksize % DEV_BSIZE) == 0);
	assert(S->blocksize <= MAX_BLOCKSIZE);

	/* Grab the end block number if we have one.  */
	S->end_block = (ISSET(O->flags, FLAG_p)? O->end_block : 0);

	/* Grab the checkpoint block count, if we have one.  */
	S->checkpoint_blocks =
	    (ISSET(O->flags, FLAG_k)? O->checkpoint_blocks : 0);

	/* Open the input image file and the output cloop2 file.  */
	S->image_fd = open(image_pathname, O_RDONLY);
	if (S->image_fd == -1)
		err(1, "open(%s)", image_pathname);

	/*
	 * Output open mode:
	 *   no -r:    start fresh (truncate, create);
	 *   -r alone: may restart or start over (create if missing);
	 *   -r -R:    must restart, so the file must already exist.
	 */
	int oflags;
	if (!ISSET(O->flags, FLAG_r))
		oflags = (O_WRONLY | O_TRUNC | O_CREAT); /* XXX O_EXCL?  */
	else if (!ISSET(O->flags, FLAG_R))
		oflags = (O_RDWR | O_CREAT);
	else
		oflags = O_RDWR;
	S->cloop2_fd = open(cloop2_pathname, oflags, 0777);
	if (S->cloop2_fd == -1)
		err(1, "open(%s)", cloop2_pathname);

	/* Find the size of the input image.  */
	if (ISSET(O->flags, FLAG_l)) {
		S->size = O->length;
	} else {
		static const struct stat zero_st;
		struct stat st = zero_st;
		if (fstat(S->image_fd, &st) == -1)
			err(1, "stat(%s)", image_pathname);
		if (st.st_size <= 0)
			errx(1, "unknown image size");
		assert(st.st_size >= 0);
		__CTASSERT(OFF_MAX <= UINT64_MAX);
		assert(__type_fit(uint64_t, st.st_size));
		S->size = st.st_size;
	}
	assert(S->size <= OFF_MAX);

	/* Find number of full blocks and whether there's a partial block.  */
	S->n_full_blocks = (S->size / S->blocksize);
	assert(S->n_full_blocks <=
	    (UINT32_MAX - ((S->size % S->blocksize) > 0)));
	S->n_blocks = (S->n_full_blocks + ((S->size % S->blocksize) > 0));
	assert(S->n_full_blocks <= S->n_blocks);

	if (S->n_blocks > MAX_N_BLOCKS)
		errx(1, "image too large for block size %"PRIu32": %"PRIu64,
		    S->blocksize, S->size);
	assert(S->n_blocks <= MAX_N_BLOCKS);

	/* Choose a window size.  */
	const uint32_t window_size = (ISSET(O->flags, FLAG_w)? O->window_size :
	    DEF_WINDOW_SIZE);

	/* Create an offset table for the blocks; one extra for the end.  */
	__CTASSERT(MAX_N_BLOCKS <= (UINT32_MAX - 1));
	S->n_offsets = (S->n_blocks + 1);
	__CTASSERT(MAX_N_OFFSETS == (MAX_N_BLOCKS + 1));
	__CTASSERT(MAX_N_OFFSETS <= (SIZE_MAX / sizeof(uint64_t)));
	offtab_init(&S->offtab, S->n_offsets, window_size, S->cloop2_fd,
	    CLOOP2_OFFSET_TABLE_OFFSET);

	/* Attempt to restart a partial transfer if requested.  */
	if (ISSET(O->flags, FLAG_r)) {
		if (compress_restart(S)) {
			/*
			 * Restart succeeded.  Truncate the output
			 * here, in case any garbage got appended.  We
			 * are committed to making progress at this
			 * point.  If the ftruncate fails, we don't
			 * lose anything valuable -- this is the last
			 * point at which we can restart anyway.
			 */
			if (ftruncate(S->cloop2_fd, S->offset) == -1)
				err(1, "ftruncate failed");

			/* All set!  No more initialization to do.  */
			return;
		} else {
			/* Restart failed.  Barf now if requested.  */
			if (ISSET(O->flags, FLAG_R))
				errx(1, "restart failed, aborting");

			/* Otherwise, truncate and start at the top.  */
			if (ftruncate(S->cloop2_fd, 0) == -1)
				err(1, "truncate failed");
			if (lseek(S->cloop2_fd, 0, SEEK_SET) == -1)
				err(1, "lseek to cloop2 beginning failed");

			/* If we seeked in the input, rewind.  */
			if (S->blkno != 0) {
				if (lseek(S->image_fd, 0, SEEK_SET) == -1)
					err(1,
					    "lseek to image beginning failed");
			}
		}
	}

	/* Write a bogus (zero) header for now, until we checkpoint.  */
	static const struct cloop2_header zero_header;
	const ssize_t h_written = write(S->cloop2_fd, &zero_header,
	    sizeof(zero_header));
	if (h_written == -1)
		err(1, "write header");
	assert(h_written >= 0);
	if ((size_t)h_written != sizeof(zero_header))
		errx(1, "partial write of header: %zu != %zu",
		    (size_t)h_written, sizeof(zero_header));

	/* Reset the offset table to be empty and write it.  */
	offtab_reset_write(&S->offtab);

	/* Start at the beginning of the image.  */
	S->blkno = 0;
	S->offset = (sizeof(struct cloop2_header) +
	    ((uint64_t)S->n_offsets * sizeof(uint64_t)));
	S->n_checkpointed_blocks = 0;

	/* Good to go and ready for interruption by a signal.  */
	S->initialized = 1;
}
549 
550 /*
551  * Try to recover state from an existing output file.
552  *
553  * On success, fill the offset table with what's in the file, set
554  * S->blkno and S->offset to reflect our position, and seek to the
555  * respective positions in the input and output files.
556  *
557  * On failure, return false.  May clobber the offset table, S->blkno,
558  * S->offset, and the file pointers.
559  */
static bool
compress_restart(struct compress_state *S)
{

	/* Read in the header.  */
	static const struct cloop2_header zero_header;
	struct cloop2_header header = zero_header;

	const ssize_t h_read = read_block(S->cloop2_fd, &header,
	    sizeof(header));
	if (h_read == -1) {
		warn("failed to read header");
		return false;
	}
	assert(h_read >= 0);
	if ((size_t)h_read != sizeof(header)) {
		warnx("partial read of header");
		return false;
	}

	/* Check that the header looks like a header.  */
	__CTASSERT(sizeof(cloop2_magic) <= sizeof(header.cl2h_magic));
	if (memcmp(header.cl2h_magic, cloop2_magic, sizeof(cloop2_magic))
	    != 0) {
		warnx("bad cloop2 shell script magic");
		return false;
	}

	/* Check the header parameters.  */
	if (be32toh(header.cl2h_blocksize) != S->blocksize) {
		warnx("mismatched block size: %"PRIu32
		    " (expected %"PRIu32")",
		    be32toh(header.cl2h_blocksize), S->blocksize);
		return false;
	}
	if (be32toh(header.cl2h_n_blocks) != S->n_blocks) {
		warnx("mismatched number of blocks: %"PRIu32
		    " (expected %"PRIu32")",
		    be32toh(header.cl2h_n_blocks), S->n_blocks);
		return false;
	}

	/* Read in the partial offset table.  */
	if (!offtab_reset_read(&S->offtab, &warn, &warnx))
		return false;
	if (!offtab_prepare_get(&S->offtab, 0))
		return false;
	const uint64_t first_offset = offtab_get(&S->offtab, 0);
	const uint64_t expected = sizeof(struct cloop2_header) +
	    ((uint64_t)S->n_offsets * sizeof(uint64_t));
	if (first_offset != expected) {
		warnx("first offset is not 0x%"PRIx64": 0x%"PRIx64,
		    expected, first_offset);
		return false;
	}

	/*
	 * Find where we left off.  An all-ones (~0) entry marks a
	 * block whose offset was never written.
	 */
	__CTASSERT(MAX_N_OFFSETS <= UINT32_MAX);
	uint32_t blkno = 0;
	uint64_t last_offset = first_offset;
	for (blkno = 0; blkno < S->n_blocks; blkno++) {
		if (!offtab_prepare_get(&S->offtab, blkno))
			return false;
		const uint64_t offset = offtab_get(&S->offtab, blkno);
		if (offset == ~(uint64_t)0)
			break;

		/* Sanity-check each consecutive pair of offsets.  */
		if (0 < blkno) {
			const uint64_t start = last_offset;
			const uint64_t end = offset;
			if (end <= start) {
				warnx("bad offset table: 0x%"PRIx64
				    ", 0x%"PRIx64, start, end);
				return false;
			}
			/* XXX compression ratio bound */
			__CTASSERT(MAX_BLOCKSIZE <= (SIZE_MAX / 2));
			if ((2 * (size_t)S->blocksize) <= (end - start)) {
				warnx("block %"PRIu32" too large:"
				    " %"PRIu64" bytes"
				    " from 0x%"PRIx64" to 0x%"PRIx64,
				    blkno, (end - start), start, end);
				return false;
			}
		}

		last_offset = offset;
	}

	if (blkno == 0) {
		warnx("no blocks were written; nothing to restart");
		return false;
	}

	/* Make sure the rest of the offset table is all ones.  */
	if (blkno < S->n_blocks) {
		uint32_t nblkno;

		for (nblkno = blkno; nblkno < S->n_blocks; nblkno++) {
			if (!offtab_prepare_get(&S->offtab, nblkno))
				return false;
			const uint64_t offset = offtab_get(&S->offtab, nblkno);
			if (offset != ~(uint64_t)0) {
				warnx("bad partial offset table entry"
				    " at %"PRIu32": 0x%"PRIx64,
				    nblkno, offset);
				return false;
			}
		}
	}

	/*
	 * XXX Consider decompressing some number of blocks to make
	 * sure they match.
	 */

	/* Back up by one.  */
	assert(1 <= blkno);
	blkno -= 1;

	/* Seek to the output position.  */
	assert(last_offset <= OFF_MAX);
	if (lseek(S->cloop2_fd, last_offset, SEEK_SET) == -1) {
		warn("lseek output cloop2 to %"PRIx64" failed", last_offset);
		return false;
	}

	/* Switch from reading to writing the offset table.  */
	if (!offtab_transmogrify_read_to_write(&S->offtab, blkno))
		return false;

	/*
	 * Seek to the input position last, after all other possible
	 * failures, because if the input is a pipe, we can't change
	 * our mind, rewind, and start at the beginning instead of
	 * restarting.
	 */
	assert(S->size <= OFF_MAX);
	assert(blkno <= (S->size / S->blocksize));
	const off_t restart_position = ((off_t)blkno * (off_t)S->blocksize);
	assert(0 <= restart_position);
	assert(restart_position <= (off_t)S->size);
	if (lseek(S->image_fd, restart_position, SEEK_SET) == -1) {
		if (errno != ESPIPE) {
			warn("lseek input image failed");
			return false;
		}

		/* Try read instead of lseek for a pipe/socket/fifo.  */
		void *const buffer = malloc(0x10000);
		if (buffer == NULL)
			err(1, "malloc temporary buffer");
		off_t left = restart_position;
		while (left > 0) {
			const size_t size = MIN(0x10000, left);
			const ssize_t n_read = read_block(S->image_fd, buffer,
			    size);
			if (n_read == -1) {
				free(buffer);
				warn("read of input image failed");
				return false;
			}
			assert(n_read >= 0);
			if ((size_t)n_read != size) {
				free(buffer);
				warnx("partial read of input image");
				return false;
			}
			assert((off_t)size <= left);
			left -= size;
		}
		free(buffer);
	}

	/* Start where we left off.  */
	S->blkno = blkno;
	S->offset = last_offset;
	S->n_checkpointed_blocks = blkno;

	/* Good to go and ready for interruption by a signal.  */
	S->initialized = 1;

	/* Success!  */
	return true;
}
745 
746 /*
747  * Read a single block, compress it, and write the compressed block.
748  * Return the size of the compressed block.
749  */
/*
 * Read one block of `readsize' bytes from in_fd, compress it with
 * zlib at maximum compression, and write the result to out_fd at the
 * current file position.  Returns the number of compressed bytes
 * written.  Exits on any read, compression, or write failure.
 */
static uint32_t
compress_block(int in_fd, int out_fd, uint32_t blkno, uint32_t blocksize,
    uint32_t readsize, void *uncompbuf, void *compbuf)
{

	assert(readsize <= blocksize);
	assert(blocksize <= MAX_BLOCKSIZE);

	/* Read the uncompressed block.  */
	const ssize_t n_read = read_block(in_fd, uncompbuf, readsize);
	if (n_read == -1)
		err(1, "read block %"PRIu32, blkno);
	assert(n_read >= 0);
	if ((size_t)n_read != readsize)
		errx(1, "partial read of block %"PRIu32": %zu != %"PRIu32,
		    blkno, (size_t)n_read, readsize);

	/* Compress the block.  */
	/* XXX compression ratio bound */
	__CTASSERT(MAX_BLOCKSIZE <= (ULONG_MAX / 2));
	const unsigned long uncomplen =
	    (VNDCOMPRESS_COMPAT? blocksize : readsize); /* XXX */
	unsigned long complen = (uncomplen * 2);
	const int zerror = compress2(compbuf, &complen, uncompbuf, uncomplen,
	    Z_BEST_COMPRESSION);
	if (zerror != Z_OK)
		errx(1, "compression failed at block %"PRIu32" (%d): %s",
		    blkno, zerror, zError(zerror));
	assert(complen <= (uncomplen * 2));

	/* Write the compressed block.  */
	const ssize_t n_written = write(out_fd, compbuf, complen);
	if (n_written == -1)
		err(1, "write block %"PRIu32, blkno);
	assert(n_written >= 0);
	if ((size_t)n_written != complen)
		errx(1, "partial write of block %"PRIu32": %zu != %lu",
		    blkno, (size_t)n_written, complen);

	/* Fits in uint32_t: complen <= 2*MAX_BLOCKSIZE (asserted above).  */
	return (uint32_t)n_written;
}
791 
792 /*
793  * Checkpoint if appropriate.
794  */
795 static void
796 compress_maybe_checkpoint(struct compress_state *S)
797 {
798 
799 	if ((0 < S->checkpoint_blocks) && (0 < S->blkno) &&
800 	    ((S->blkno % S->checkpoint_blocks) == 0)) {
801 		assert(S->offset <= OFF_MAX);
802 		assert((off_t)S->offset == lseek(S->cloop2_fd, 0, SEEK_CUR));
803 		compress_checkpoint(S);
804 	}
805 }
806 
807 /*
808  * Write the prefix of the offset table that we have filled so far.
809  *
810  * We fsync the data blocks we have written, and then write the offset
811  * table, and then fsync the offset table and file metadata.  This
812  * should help to avoid offset tables that point at garbage data.
813  *
814  * This may be called from a signal handler, so it must not use stdio,
815  * malloc, &c. -- it may only (a) handle signal-safe state in S, and
816  * (b) do file descriptor I/O / fsync.
817  *
818  * XXX This requires further thought and heavy testing to be sure.
819  *
820  * XXX Should have an option to suppress fsync.
821  *
822  * XXX Should have an option to fail on fsync failures.
823  *
824  * XXX Would be nice if we could just do a barrier rather than an
825  * fsync.
826  *
827  * XXX How might we automatically test the fsyncs?
828  */
static void
compress_checkpoint(struct compress_state *S)
{

	/* Entries 0..S->blkno of the offset table are valid now.  */
	assert(S->blkno < S->n_offsets);
	const uint32_t n_offsets = (S->blkno + 1);
	assert(n_offsets <= S->n_offsets);

	assert(S->offset <= OFF_MAX);
	assert((off_t)S->offset <= lseek(S->cloop2_fd, 0, SEEK_CUR));

	/* Make sure the data hits the disk before we say it's ready.  */
	if (fsync_range(S->cloop2_fd, (FFILESYNC | FDISKSYNC), 0, S->offset)
	    == -1)
		warn_ss("fsync of output failed");

	/* Say the data blocks are ready.  */
	offtab_checkpoint(&S->offtab, n_offsets,
	    (S->n_checkpointed_blocks == 0? OFFTAB_CHECKPOINT_SYNC : 0));

	/*
	 * If this is the first checkpoint, initialize the header.
	 * Signal handler can race with main code here, but it is
	 * harmless -- just an extra fsync and write of the header,
	 * which are both idempotent.
	 *
	 * Once we have synchronously checkpointed the offset table,
	 * subsequent writes will preserve a valid state.
	 */
	if (S->n_checkpointed_blocks == 0) {
		static const struct cloop2_header zero_header;
		struct cloop2_header header = zero_header;

		/* Format the header.  */
		__CTASSERT(sizeof(cloop2_magic) <= sizeof(header.cl2h_magic));
		(void)memcpy(header.cl2h_magic, cloop2_magic,
		    sizeof(cloop2_magic));
		header.cl2h_blocksize = htobe32(S->blocksize);
		header.cl2h_n_blocks = htobe32(S->n_blocks);

		/* Write the header.  (pwrite at 0: doesn't move the
		 * file pointer, so the main loop's position is safe.)  */
		const ssize_t h_written = pwrite(S->cloop2_fd, &header,
		    sizeof(header), 0);
		if (h_written == -1)
			err_ss(1, "write header");
		assert(h_written >= 0);
		if ((size_t)h_written != sizeof(header))
			errx_ss(1, "partial write of header: %zu != %zu",
			    (size_t)h_written, sizeof(header));
	}

	/* Record how many blocks we've checkpointed.  */
    {
	sigset_t old_sigmask;
	block_signals(&old_sigmask);
	S->n_checkpointed_blocks = S->blkno;
	restore_sigmask(&old_sigmask);
    }
}
888 
889 /*
890  * Release everything we allocated in compress_init.
891  */
892 static void
893 compress_exit(struct compress_state *S)
894 {
895 
896 	/* Done with the offset table.  Destroy it.  */
897 	offtab_destroy(&S->offtab);
898 
899 	/* Done with the files.  Close them.  */
900 	if (close(S->cloop2_fd) == -1)
901 		warn("close(cloop2 fd)");
902 	if (close(S->image_fd) == -1)
903 		warn("close(image fd)");
904 }
905