xref: /netbsd-src/usr.bin/vndcompress/vndcompress.c (revision e89934bbf778a6d6d6894877c4da59d0c7835b0f)
1 /*	$NetBSD: vndcompress.c,v 1.26 2017/01/10 21:15:54 christos Exp $	*/
2 
3 /*-
4  * Copyright (c) 2013 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Taylor R. Campbell.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 __RCSID("$NetBSD: vndcompress.c,v 1.26 2017/01/10 21:15:54 christos Exp $");
34 
35 #include <sys/endian.h>
36 #include <sys/stat.h>
37 
38 #include <assert.h>
39 #include <err.h>
40 #include <errno.h>
41 #include <fcntl.h>
42 #include <inttypes.h>
43 #include <limits.h>
44 #include <signal.h>
45 #include <stdbool.h>
46 #include <stdint.h>
47 #include <stdio.h>
48 #include <stdlib.h>
49 #include <string.h>
50 #include <unistd.h>
51 #include <zlib.h>
52 
53 #include "common.h"
54 #include "offtab.h"
55 #include "utils.h"
56 
57 /*
58  * XXX Switch to control bug-for-bug byte-for-byte compatibility with
59  * NetBSD's vndcompress.
60  */
61 #define	VNDCOMPRESS_COMPAT	0
62 
63 __CTASSERT(sizeof(struct cloop2_header) == CLOOP2_OFFSET_TABLE_OFFSET);
64 
/*
 * All state for one compression run.  Shared with the signal handlers
 * through `global_state'; the main loop mutates the handler-visible
 * fields only between block_signals()/restore_sigmask() pairs.
 */
struct compress_state {
	uint64_t	size;		/* uncompressed size */
	uint64_t	offset;		/* output byte offset */
	uint32_t	blocksize;	/* bytes per block */
	uint32_t	blkno;		/* input block number */
	uint32_t	n_full_blocks;	/* floor(size/blocksize) */
	uint32_t	n_blocks;	/* ceiling(size/blocksize) */
	uint32_t	n_offsets;	/* n_blocks + 1 */
	uint32_t	end_block;	/* last block to transfer */
	uint32_t	checkpoint_blocks;	/* blocks before checkpoint */
	int		image_fd;	/* input image file descriptor */
	int		cloop2_fd;	/* output cloop2 file descriptor */
	struct offtab	offtab;		/* output block offset table */
	uint32_t	n_checkpointed_blocks;	/* blocks checkpointed so far */
	volatile sig_atomic_t
			initialized;	/* everything above initialized?  */
};
82 
/* Global compression state for SIGINFO handler.  */
static struct compress_state	global_state;

/* Signal number paired with its name, for sigaction error messages.  */
struct sigdesc {
	int sd_signo;
	const char *sd_name;
};

/* Signals that trigger a progress report on stderr.  */
static const struct sigdesc info_signals[] = {
	{ SIGINFO, "SIGINFO" },
	{ SIGUSR1, "SIGUSR1" },
};

/* Signals that trigger an on-demand checkpoint of the offset table.  */
static const struct sigdesc checkpoint_signals[] = {
	{ SIGUSR2, "SIGUSR2" },
};
99 
/* Forward declarations for the compression pipeline.  */
static void	init_signals(void);
static void	init_signal_handler(int, const struct sigdesc *, size_t,
		    void (*)(int));
static void	info_signal_handler(int);
static void	checkpoint_signal_handler(int);
static void	compress_progress(struct compress_state *);
static void	compress_init(int, char **, const struct options *,
		    struct compress_state *);
static bool	compress_restart(struct compress_state *);
static uint32_t	compress_block(int, int, uint32_t, uint32_t, uint32_t, void *,
		    void *);
static void	compress_maybe_checkpoint(struct compress_state *);
static void	compress_checkpoint(struct compress_state *);
static void	compress_exit(struct compress_state *);
114 
115 /*
116  * Compression entry point.
117  */
118 int
119 vndcompress(int argc, char **argv, const struct options *O)
120 {
121 	struct compress_state *const S = &global_state;
122 
123 	/* Paranoia.  The other fields either have no sentinel or use zero.  */
124 	S->image_fd = -1;
125 	S->cloop2_fd = -1;
126 
127 	/* Set up signal handlers so we can handle SIGINFO ASAP.  */
128 	init_signals();
129 
130 	/*
131 	 * Parse the arguments to initialize our state.
132 	 */
133 	compress_init(argc, argv, O, S);
134 	assert(MIN_BLOCKSIZE <= S->blocksize);
135 	assert(S->blocksize <= MAX_BLOCKSIZE);
136 
137 	/*
138 	 * Allocate compression buffers.
139 	 *
140 	 * Compression may actually expand.  From an overabundance of
141 	 * caution, assume it can expand by at most double.
142 	 *
143 	 * XXX Check and consider tightening this assumption.
144 	 */
145 	__CTASSERT(MAX_BLOCKSIZE <= SIZE_MAX);
146 	void *const uncompbuf = malloc(S->blocksize);
147 	if (uncompbuf == NULL)
148 		err(1, "malloc uncompressed buffer");
149 
150 	/* XXX compression ratio bound */
151 	__CTASSERT(MAX_BLOCKSIZE <= (SIZE_MAX / 2));
152 	void *const compbuf = malloc(2 * (size_t)S->blocksize);
153 	if (compbuf == NULL)
154 		err(1, "malloc compressed buffer");
155 
156 	/*
157 	 * Compress the blocks.  S->blkno specifies the input block
158 	 * we're about to transfer.  S->offset is the current output
159 	 * offset.
160 	 */
161 	while (S->blkno < S->n_blocks) {
162 		/* Report any progress.  */
163 		compress_progress(S);
164 
165 		/* Stop if we've done the requested partial transfer.  */
166 		if ((0 < S->end_block) && (S->end_block <= S->blkno))
167 			goto out;
168 
169 		/* Checkpoint if appropriate.  */
170 		compress_maybe_checkpoint(S);
171 		offtab_prepare_put(&S->offtab, (S->blkno + 1));
172 
173 		/* Choose read size: partial if last block, full if not.  */
174 		const uint32_t readsize = (S->blkno == S->n_full_blocks?
175 		    (S->size % S->blocksize) : S->blocksize);
176 		assert(readsize > 0);
177 		assert(readsize <= S->blocksize);
178 
179 		/* Fail noisily if we might be about to overflow.  */
180 		/* XXX compression ratio bound */
181 		__CTASSERT(MAX_BLOCKSIZE <= (UINTMAX_MAX / 2));
182 		assert(S->offset <= MIN(UINT64_MAX, OFF_MAX));
183 		if ((2 * (uintmax_t)readsize) >
184 		    (MIN(UINT64_MAX, OFF_MAX) - S->offset))
185 			errx(1, "blkno %"PRIu32" may overflow: %ju + 2*%ju",
186 			    S->blkno, (uintmax_t)S->offset,
187 			    (uintmax_t)readsize);
188 
189 		/* Process the block.  */
190 		const uint32_t complen =
191 		    compress_block(S->image_fd, S->cloop2_fd, S->blkno,
192 			S->blocksize, readsize, uncompbuf, compbuf);
193 
194 		/*
195 		 * Signal-atomically update the state to reflect
196 		 * (a) what block number we are now at,
197 		 * (b) how far we are now in the output file, and
198 		 * (c) where the last block ended.
199 		 */
200 		assert(S->blkno <= (UINT32_MAX - 1));
201 		assert(complen <= (MIN(UINT64_MAX, OFF_MAX) - S->offset));
202 		assert((S->blkno + 1) < S->n_offsets);
203 	    {
204 		sigset_t old_sigmask;
205 		block_signals(&old_sigmask);
206 		S->blkno += 1;					/* (a) */
207 		S->offset += complen;				/* (b) */
208 		offtab_put(&S->offtab, S->blkno, S->offset);	/* (c) */
209 		restore_sigmask(&old_sigmask);
210 	    }
211 	}
212 
213 	/* Make sure we're all done. */
214 	assert(S->blkno == S->n_blocks);
215 	assert((S->blkno + 1) == S->n_offsets);
216 
217 	/* Pad to the disk block size.  */
218 	const uint32_t n_extra = (S->offset % DEV_BSIZE);
219 	if (n_extra != 0) {
220 		const uint32_t n_padding = (DEV_BSIZE - n_extra);
221 		/* Reuse compbuf -- guaranteed to be large enough.  */
222 		(void)memset(compbuf, 0, n_padding);
223 		const ssize_t n_written = write(S->cloop2_fd, compbuf,
224 		    n_padding);
225 		if (n_written == -1)
226 			err(1, "write final padding failed");
227 		assert(n_written >= 0);
228 		if ((size_t)n_written != n_padding)
229 			errx(1, "partial write of final padding bytes"
230 			    ": %zu != %"PRIu32,
231 			    (size_t)n_written, n_padding);
232 
233 		/* Account for the extra bytes in the output file.  */
234 		assert(n_padding <= (MIN(UINT64_MAX, OFF_MAX) - S->offset));
235 	    {
236 		sigset_t old_sigmask;
237 		block_signals(&old_sigmask);
238 		S->offset += n_padding;
239 		restore_sigmask(&old_sigmask);
240 	    }
241 	}
242 
243 out:
244 	/* One last checkpoint to commit the offset table.  */
245 	assert(S->offset <= OFF_MAX);
246 	assert((off_t)S->offset == lseek(S->cloop2_fd, 0, SEEK_CUR));
247 	compress_checkpoint(S);
248 
249 	/*
250 	 * Free the compression buffers and finalize the compression.
251 	 */
252 	free(compbuf);
253 	free(uncompbuf);
254 	compress_exit(S);
255 
256 	return 0;
257 }
258 
259 /*
260  * Signal cruft.
261  */
262 
263 static void
264 init_signals(void)
265 {
266 
267 	init_signal_handler(SA_RESTART, info_signals,
268 	    __arraycount(info_signals), &info_signal_handler);
269 	init_signal_handler(SA_RESTART, checkpoint_signals,
270 	    __arraycount(checkpoint_signals), &checkpoint_signal_handler);
271 }
272 
273 static void
274 init_signal_handler(int flags, const struct sigdesc *signals, size_t n,
275     void (*handler)(int))
276 {
277 	static const struct sigaction zero_sa;
278 	struct sigaction sa = zero_sa;
279 	size_t i;
280 
281 	(void)sigemptyset(&sa.sa_mask);
282 	for (i = 0; i < n; i++)
283 		(void)sigaddset(&sa.sa_mask, signals[i].sd_signo);
284 	sa.sa_flags = flags;
285 	sa.sa_handler = handler;
286 	for (i = 0; i < n; i++)
287 		if (sigaction(signals[i].sd_signo, &sa, NULL) == -1)
288 			err(1, "sigaction(%s)", signals[i].sd_name);
289 }
290 
291 static void
292 info_signal_handler(int signo __unused)
293 {
294 	/* Save errno.  */
295 	const int error = errno;
296 	struct compress_state *const S = &global_state;
297 	char buf[128];
298 
299 	/* Bail if the state is not yet initialized.  */
300 	if (!S->initialized) {
301 		warnx_ss("initializing");
302 		goto out;
303 	}
304 
305 	/* Carefully calculate our I/O position.  */
306 	assert(S->blocksize > 0);
307 	__CTASSERT(MAX_N_BLOCKS <= (UINT64_MAX / MAX_BLOCKSIZE));
308 	const uint64_t nread = ((uint64_t)S->blkno * (uint64_t)S->blocksize);
309 
310 	assert(S->n_blocks > 0);
311 	__CTASSERT(CLOOP2_OFFSET_TABLE_OFFSET <=
312 	    (UINT64_MAX / sizeof(uint64_t)));
313 	__CTASSERT(MAX_N_BLOCKS <= ((UINT64_MAX / sizeof(uint64_t)) -
314 		CLOOP2_OFFSET_TABLE_OFFSET));
315 	const uint64_t nwritten = (S->offset <= (CLOOP2_OFFSET_TABLE_OFFSET +
316 		((uint64_t)S->n_blocks * sizeof(uint64_t)))?
317 	    0 : S->offset);
318 
319 	/* snprintf_ss can't do floating-point, so do fixed-point instead.  */
320 	const uint64_t ratio_percent =
321 	    (nread > 0?
322 		((nwritten >= (UINT64_MAX / 100)) ?
323 		    ((nwritten / nread) * 100) : ((nwritten * 100) / nread))
324 		: 0);
325 
326 	/* Format the status.  */
327 	assert(S->n_checkpointed_blocks <= (UINT64_MAX / S->blocksize));
328 	const int n = snprintf_ss(buf, sizeof(buf),
329 	    "vndcompress: read %"PRIu64" bytes, wrote %"PRIu64" bytes, "
330 	    "compression ratio %"PRIu64"%% (checkpointed %"PRIu64" bytes)\n",
331 	    nread, nwritten, ratio_percent,
332 	    ((uint64_t)S->n_checkpointed_blocks * (uint64_t)S->blocksize));
333 	if (n < 0) {
334 		const char msg[] = "vndcompress: can't format info\n";
335 		(void)write(STDERR_FILENO, msg, __arraycount(msg));
336 	} else {
337 		__CTASSERT(INT_MAX <= SIZE_MAX);
338 		(void)write(STDERR_FILENO, buf, (size_t)n);
339 	}
340 
341 out:
342 	/* Restore errno.  */
343 	errno = error;
344 }
345 
346 static void
347 checkpoint_signal_handler(int signo __unused)
348 {
349 	/* Save errno.  */
350 	const int error = errno;
351 	struct compress_state *const S = &global_state;
352 
353 	/* Bail if the state is not yet initialized.  */
354 	if (!S->initialized) {
355 		warnx_ss("nothing to checkpoint yet");
356 		goto out;
357 	}
358 
359 	assert(S->image_fd >= 0);
360 	assert(S->cloop2_fd >= 0);
361 
362 	/* Take a checkpoint.  */
363 	assert(S->blocksize > 0);
364 	assert(S->blkno <= (UINT64_MAX / S->blocksize));
365 	warnx_ss("checkpointing %"PRIu64" bytes",
366 	    ((uint64_t)S->blkno * (uint64_t)S->blocksize));
367 	compress_checkpoint(S);
368 
369 out:
370 	/* Restore errno.  */
371 	errno = error;
372 }
373 
374 /*
375  * Report progress.
376  *
377  * XXX Should do a progress bar here.
378  */
379 static void
380 compress_progress(struct compress_state *S __unused)
381 {
382 }
383 
384 /*
385  * Parse arguments, open the files, and initialize the state.
386  */
387 static void
388 compress_init(int argc, char **argv, const struct options *O,
389     struct compress_state *S)
390 {
391 
392 	if (!((argc == 2) || (argc == 3)))
393 		usage();
394 
395 	const char *const image_pathname = argv[0];
396 	const char *const cloop2_pathname = argv[1];
397 
398 	/* Grab the block size either from `-b' or from the last argument.  */
399 	__CTASSERT(0 < DEV_BSIZE);
400 	__CTASSERT((MIN_BLOCKSIZE % DEV_BSIZE) == 0);
401 	__CTASSERT(MIN_BLOCKSIZE <= DEF_BLOCKSIZE);
402 	__CTASSERT((DEF_BLOCKSIZE % DEV_BSIZE) == 0);
403 	__CTASSERT(DEF_BLOCKSIZE <= MAX_BLOCKSIZE);
404 	__CTASSERT((MAX_BLOCKSIZE % DEV_BSIZE) == 0);
405 	if (ISSET(O->flags, FLAG_b)) {
406 		if (argc == 3) {
407 			warnx("use -b or the extra argument, not both");
408 			usage();
409 		}
410 		S->blocksize = O->blocksize;
411 	} else {
412 		S->blocksize = (argc == 2? DEF_BLOCKSIZE :
413 		    strsuftoll("block size", argv[2], MIN_BLOCKSIZE,
414 			MAX_BLOCKSIZE));
415 	}
416 
417 	/* Sanity-check the blocksize.  (strsuftoll guarantees bounds.)  */
418 	__CTASSERT(DEV_BSIZE <= UINT32_MAX);
419 	if ((S->blocksize % DEV_BSIZE) != 0)
420 		errx(1, "bad blocksize: %"PRIu32
421 		    " (not a multiple of %"PRIu32")",
422 		    S->blocksize, (uint32_t)DEV_BSIZE);
423 	assert(MIN_BLOCKSIZE <= S->blocksize);
424 	assert((S->blocksize % DEV_BSIZE) == 0);
425 	assert(S->blocksize <= MAX_BLOCKSIZE);
426 
427 	/* Grab the end block number if we have one.  */
428 	S->end_block = (ISSET(O->flags, FLAG_p)? O->end_block : 0);
429 
430 	/* Grab the checkpoint block count, if we have one.  */
431 	S->checkpoint_blocks =
432 	    (ISSET(O->flags, FLAG_k)? O->checkpoint_blocks : 0);
433 
434 	/* Open the input image file and the output cloop2 file.  */
435 	S->image_fd = open(image_pathname, O_RDONLY);
436 	if (S->image_fd == -1)
437 		err(1, "open(%s)", image_pathname);
438 
439 	int oflags;
440 	if (!ISSET(O->flags, FLAG_r))
441 		oflags = (O_WRONLY | O_TRUNC | O_CREAT); /* XXX O_EXCL?  */
442 	else if (!ISSET(O->flags, FLAG_R))
443 		oflags = (O_RDWR | O_CREAT);
444 	else
445 		oflags = O_RDWR;
446 	S->cloop2_fd = open(cloop2_pathname, oflags, 0777);
447 	if (S->cloop2_fd == -1)
448 		err(1, "open(%s)", cloop2_pathname);
449 
450 	/* Find the size of the input image.  */
451 	if (ISSET(O->flags, FLAG_l)) {
452 		S->size = O->length;
453 	} else {
454 		static const struct stat zero_st;
455 		struct stat st = zero_st;
456 		if (fstat(S->image_fd, &st) == -1)
457 			err(1, "stat(%s)", image_pathname);
458 		if (st.st_size <= 0)
459 			errx(1, "unknown image size");
460 		assert(st.st_size >= 0);
461 		__CTASSERT(OFF_MAX <= UINT64_MAX);
462 		assert(__type_fit(uint64_t, st.st_size));
463 		S->size = st.st_size;
464 	}
465 	assert(S->size <= OFF_MAX);
466 
467 	/* Find number of full blocks and whether there's a partial block.  */
468 	S->n_full_blocks = (S->size / S->blocksize);
469 	assert(S->n_full_blocks <=
470 	    (UINT32_MAX - ((S->size % S->blocksize) > 0)));
471 	S->n_blocks = (S->n_full_blocks + ((S->size % S->blocksize) > 0));
472 	assert(S->n_full_blocks <= S->n_blocks);
473 
474 	if (S->n_blocks > MAX_N_BLOCKS)
475 		errx(1, "image too large for block size %"PRIu32": %"PRIu64,
476 		    S->blocksize, S->size);
477 	assert(S->n_blocks <= MAX_N_BLOCKS);
478 
479 	/* Choose a window size.  */
480 	const uint32_t window_size = (ISSET(O->flags, FLAG_w)? O->window_size :
481 	    DEF_WINDOW_SIZE);
482 
483 	/* Create an offset table for the blocks; one extra for the end.  */
484 	__CTASSERT(MAX_N_BLOCKS <= (UINT32_MAX - 1));
485 	S->n_offsets = (S->n_blocks + 1);
486 	__CTASSERT(MAX_N_OFFSETS == (MAX_N_BLOCKS + 1));
487 	__CTASSERT(MAX_N_OFFSETS <= (SIZE_MAX / sizeof(uint64_t)));
488 	offtab_init(&S->offtab, S->n_offsets, window_size, S->cloop2_fd,
489 	    CLOOP2_OFFSET_TABLE_OFFSET);
490 
491 	/* Attempt to restart a partial transfer if requested.  */
492 	if (ISSET(O->flags, FLAG_r)) {
493 		if (compress_restart(S)) {
494 			/*
495 			 * Restart succeeded.  Truncate the output
496 			 * here, in case any garbage got appended.  We
497 			 * are committed to making progress at this
498 			 * point.  If the ftruncate fails, we don't
499 			 * lose anything valuable -- this is the last
500 			 * point at which we can restart anyway.
501 			 */
502 			if (ftruncate(S->cloop2_fd, S->offset) == -1)
503 				err(1, "ftruncate failed");
504 
505 			/* All set!  No more initialization to do.  */
506 			return;
507 		} else {
508 			/* Restart failed.  Barf now if requested.  */
509 			if (ISSET(O->flags, FLAG_R))
510 				errx(1, "restart failed, aborting");
511 
512 			/* Otherwise, truncate and start at the top.  */
513 			if (ftruncate(S->cloop2_fd, 0) == -1)
514 				err(1, "truncate failed");
515 			if (lseek(S->cloop2_fd, 0, SEEK_SET) == -1)
516 				err(1, "lseek to cloop2 beginning failed");
517 
518 			/* If we seeked in the input, rewind.  */
519 			if (S->blkno != 0) {
520 				if (lseek(S->image_fd, 0, SEEK_SET) == -1)
521 					err(1,
522 					    "lseek to image beginning failed");
523 			}
524 		}
525 	}
526 
527 	/* Write a bogus (zero) header for now, until we checkpoint.  */
528 	static const struct cloop2_header zero_header;
529 	const ssize_t h_written = write(S->cloop2_fd, &zero_header,
530 	    sizeof(zero_header));
531 	if (h_written == -1)
532 		err(1, "write header");
533 	assert(h_written >= 0);
534 	if ((size_t)h_written != sizeof(zero_header))
535 		errx(1, "partial write of header: %zu != %zu",
536 		    (size_t)h_written, sizeof(zero_header));
537 
538 	/* Reset the offset table to be empty and write it.  */
539 	offtab_reset_write(&S->offtab);
540 
541 	/* Start at the beginning of the image.  */
542 	S->blkno = 0;
543 	S->offset = (sizeof(struct cloop2_header) +
544 	    ((uint64_t)S->n_offsets * sizeof(uint64_t)));
545 	S->n_checkpointed_blocks = 0;
546 
547 	/* Good to go and ready for interruption by a signal.  */
548 	S->initialized = 1;
549 }
550 
551 /*
552  * Try to recover state from an existing output file.
553  *
554  * On success, fill the offset table with what's in the file, set
555  * S->blkno and S->offset to reflect our position, and seek to the
556  * respective positions in the input and output files.
557  *
558  * On failure, return false.  May clobber the offset table, S->blkno,
559  * S->offset, and the file pointers.
560  */
561 static bool
562 compress_restart(struct compress_state *S)
563 {
564 
565 	/* Read in the header.  */
566 	static const struct cloop2_header zero_header;
567 	struct cloop2_header header = zero_header;
568 
569 	const ssize_t h_read = read_block(S->cloop2_fd, &header,
570 	    sizeof(header));
571 	if (h_read == -1) {
572 		warn("failed to read header");
573 		return false;
574 	}
575 	assert(h_read >= 0);
576 	if ((size_t)h_read != sizeof(header)) {
577 		warnx("partial read of header");
578 		return false;
579 	}
580 
581 	/* Check that the header looks like a header.  */
582 	__CTASSERT(sizeof(cloop2_magic) <= sizeof(header.cl2h_magic));
583 	if (memcmp(header.cl2h_magic, cloop2_magic, sizeof(cloop2_magic))
584 	    != 0) {
585 		warnx("bad cloop2 shell script magic");
586 		return false;
587 	}
588 
589 	/* Check the header parameters.  */
590 	if (be32toh(header.cl2h_blocksize) != S->blocksize) {
591 		warnx("mismatched block size: %"PRIu32
592 		    " (expected %"PRIu32")",
593 		    be32toh(header.cl2h_blocksize), S->blocksize);
594 		return false;
595 	}
596 	if (be32toh(header.cl2h_n_blocks) != S->n_blocks) {
597 		warnx("mismatched number of blocks: %"PRIu32
598 		    " (expected %"PRIu32")",
599 		    be32toh(header.cl2h_n_blocks), S->n_blocks);
600 		return false;
601 	}
602 
603 	/* Read in the partial offset table.  */
604 	if (!offtab_reset_read(&S->offtab, &warn, &warnx))
605 		return false;
606 	if (!offtab_prepare_get(&S->offtab, 0))
607 		return false;
608 	const uint64_t first_offset = offtab_get(&S->offtab, 0);
609 	const uint64_t expected = sizeof(struct cloop2_header) +
610 	    ((uint64_t)S->n_offsets * sizeof(uint64_t));
611 	if (first_offset != expected) {
612 		warnx("first offset is not 0x%"PRIx64": 0x%"PRIx64,
613 		    expected, first_offset);
614 		return false;
615 	}
616 
617 	/* Find where we left off.  */
618 	__CTASSERT(MAX_N_OFFSETS <= UINT32_MAX);
619 	uint32_t blkno = 0;
620 	uint64_t last_offset = first_offset;
621 	for (blkno = 0; blkno < S->n_blocks; blkno++) {
622 		if (!offtab_prepare_get(&S->offtab, blkno))
623 			return false;
624 		const uint64_t offset = offtab_get(&S->offtab, blkno);
625 		if (offset == ~(uint64_t)0)
626 			break;
627 
628 		if (0 < blkno) {
629 			const uint64_t start = last_offset;
630 			const uint64_t end = offset;
631 			if (end <= start) {
632 				warnx("bad offset table: 0x%"PRIx64
633 				    ", 0x%"PRIx64, start, end);
634 				return false;
635 			}
636 			/* XXX compression ratio bound */
637 			__CTASSERT(MAX_BLOCKSIZE <= (SIZE_MAX / 2));
638 			if ((2 * (size_t)S->blocksize) <= (end - start)) {
639 				warnx("block %"PRIu32" too large:"
640 				    " %"PRIu64" bytes"
641 				    " from 0x%"PRIx64" to 0x%"PRIx64,
642 				    blkno, (end - start), start, end);
643 				return false;
644 			}
645 		}
646 
647 		last_offset = offset;
648 	}
649 
650 	if (blkno == 0) {
651 		warnx("no blocks were written; nothing to restart");
652 		return false;
653 	}
654 
655 	/* Make sure the rest of the offset table is all ones.  */
656 	if (blkno < S->n_blocks) {
657 		uint32_t nblkno;
658 
659 		for (nblkno = blkno; nblkno < S->n_blocks; nblkno++) {
660 			if (!offtab_prepare_get(&S->offtab, nblkno))
661 				return false;
662 			const uint64_t offset = offtab_get(&S->offtab, nblkno);
663 			if (offset != ~(uint64_t)0) {
664 				warnx("bad partial offset table entry"
665 				    " at %"PRIu32": 0x%"PRIx64,
666 				    nblkno, offset);
667 				return false;
668 			}
669 		}
670 	}
671 
672 	/*
673 	 * XXX Consider decompressing some number of blocks to make
674 	 * sure they match.
675 	 */
676 
677 	/* Back up by one.  */
678 	assert(1 <= blkno);
679 	blkno -= 1;
680 
681 	/* Seek to the output position.  */
682 	assert(last_offset <= OFF_MAX);
683 	if (lseek(S->cloop2_fd, last_offset, SEEK_SET) == -1) {
684 		warn("lseek output cloop2 to %"PRIx64" failed", last_offset);
685 		return false;
686 	}
687 
688 	/* Switch from reading to writing the offset table.  */
689 	if (!offtab_transmogrify_read_to_write(&S->offtab, blkno))
690 		return false;
691 
692 	/*
693 	 * Seek to the input position last, after all other possible
694 	 * failures, because if the input is a pipe, we can't change
695 	 * our mind, rewind, and start at the beginning instead of
696 	 * restarting.
697 	 */
698 	assert(S->size <= OFF_MAX);
699 	assert(blkno <= (S->size / S->blocksize));
700 	const off_t restart_position = ((off_t)blkno * (off_t)S->blocksize);
701 	assert(0 <= restart_position);
702 	assert(restart_position <= (off_t)S->size);
703 	if (lseek(S->image_fd, restart_position, SEEK_SET) == -1) {
704 		if (errno != ESPIPE) {
705 			warn("lseek input image failed");
706 			return false;
707 		}
708 
709 		/* Try read instead of lseek for a pipe/socket/fifo.  */
710 		void *const buffer = malloc(0x10000);
711 		if (buffer == NULL)
712 			err(1, "malloc temporary buffer");
713 		off_t left = restart_position;
714 		while (left > 0) {
715 			const size_t size = MIN(0x10000, left);
716 			const ssize_t n_read = read_block(S->image_fd, buffer,
717 			    size);
718 			if (n_read == -1) {
719 				free(buffer);
720 				warn("read of input image failed");
721 				return false;
722 			}
723 			assert(n_read >= 0);
724 			if ((size_t)n_read != size) {
725 				free(buffer);
726 				warnx("partial read of input image");
727 				return false;
728 			}
729 			assert((off_t)size <= left);
730 			left -= size;
731 		}
732 		free(buffer);
733 	}
734 
735 	/* Start where we left off.  */
736 	S->blkno = blkno;
737 	S->offset = last_offset;
738 	S->n_checkpointed_blocks = blkno;
739 
740 	/* Good to go and ready for interruption by a signal.  */
741 	S->initialized = 1;
742 
743 	/* Success!  */
744 	return true;
745 }
746 
747 /*
748  * Read a single block, compress it, and write the compressed block.
749  * Return the size of the compressed block.
750  */
751 static uint32_t
752 compress_block(int in_fd, int out_fd, uint32_t blkno, uint32_t blocksize,
753     uint32_t readsize, void *uncompbuf, void *compbuf)
754 {
755 
756 	assert(readsize <= blocksize);
757 	assert(blocksize <= MAX_BLOCKSIZE);
758 
759 	/* Read the uncompressed block.  */
760 	const ssize_t n_read = read_block(in_fd, uncompbuf, readsize);
761 	if (n_read == -1)
762 		err(1, "read block %"PRIu32, blkno);
763 	assert(n_read >= 0);
764 	if ((size_t)n_read != readsize)
765 		errx(1, "partial read of block %"PRIu32": %zu != %"PRIu32,
766 		    blkno, (size_t)n_read, readsize);
767 
768 	/* Compress the block.  */
769 	/* XXX compression ratio bound */
770 	__CTASSERT(MAX_BLOCKSIZE <= (ULONG_MAX / 2));
771 	const unsigned long uncomplen =
772 	    (VNDCOMPRESS_COMPAT? blocksize : readsize); /* XXX */
773 	unsigned long complen = (uncomplen * 2);
774 	const int zerror = compress2(compbuf, &complen, uncompbuf, uncomplen,
775 	    Z_BEST_COMPRESSION);
776 	if (zerror != Z_OK)
777 		errx(1, "compressed failed at block %"PRIu32" (%d): %s", blkno,
778 		    zerror, zError(zerror));
779 	assert(complen <= (uncomplen * 2));
780 
781 	/* Write the compressed block.  */
782 	const ssize_t n_written = write(out_fd, compbuf, complen);
783 	if (n_written == -1)
784 		err(1, "write block %"PRIu32, blkno);
785 	assert(n_written >= 0);
786 	if ((size_t)n_written != complen)
787 		errx(1, "partial write of block %"PRIu32": %zu != %lu",
788 		    blkno, (size_t)n_written, complen);
789 
790 	return (size_t)n_written;
791 }
792 
793 /*
794  * Checkpoint if appropriate.
795  */
796 static void
797 compress_maybe_checkpoint(struct compress_state *S)
798 {
799 
800 	if ((0 < S->checkpoint_blocks) && (0 < S->blkno) &&
801 	    ((S->blkno % S->checkpoint_blocks) == 0)) {
802 		assert(S->offset <= OFF_MAX);
803 		assert((off_t)S->offset == lseek(S->cloop2_fd, 0, SEEK_CUR));
804 		compress_checkpoint(S);
805 	}
806 }
807 
808 /*
809  * Write the prefix of the offset table that we have filled so far.
810  *
811  * We fsync the data blocks we have written, and then write the offset
812  * table, and then fsync the offset table and file metadata.  This
813  * should help to avoid offset tables that point at garbage data.
814  *
815  * This may be called from a signal handler, so it must not use stdio,
816  * malloc, &c. -- it may only (a) handle signal-safe state in S, and
817  * (b) do file descriptor I/O / fsync.
818  *
819  * XXX This requires further thought and heavy testing to be sure.
820  *
821  * XXX Should have an option to suppress fsync.
822  *
823  * XXX Should have an option to fail on fsync failures.
824  *
825  * XXX Would be nice if we could just do a barrier rather than an
826  * fsync.
827  *
828  * XXX How might we automatically test the fsyncs?
829  */
830 static void
831 compress_checkpoint(struct compress_state *S)
832 {
833 
834 	assert(S->blkno < S->n_offsets);
835 	const uint32_t n_offsets = (S->blkno + 1);
836 	assert(n_offsets <= S->n_offsets);
837 
838 	assert(S->offset <= OFF_MAX);
839 	assert((off_t)S->offset <= lseek(S->cloop2_fd, 0, SEEK_CUR));
840 
841 	/* Make sure the data hits the disk before we say it's ready.  */
842 	if (fsync_range(S->cloop2_fd, (FFILESYNC | FDISKSYNC), 0, S->offset)
843 	    == -1)
844 		warn_ss("fsync of output failed");
845 
846 	/* Say the data blocks are ready.  */
847 	offtab_checkpoint(&S->offtab, n_offsets,
848 	    (S->n_checkpointed_blocks == 0? OFFTAB_CHECKPOINT_SYNC : 0));
849 
850 	/*
851 	 * If this is the first checkpoint, initialize the header.
852 	 * Signal handler can race with main code here, but it is
853 	 * harmless -- just an extra fsync and write of the header,
854 	 * which are both idempotent.
855 	 *
856 	 * Once we have synchronously checkpointed the offset table,
857 	 * subsequent writes will preserve a valid state.
858 	 */
859 	if (S->n_checkpointed_blocks == 0) {
860 		static const struct cloop2_header zero_header;
861 		struct cloop2_header header = zero_header;
862 
863 		/* Format the header.  */
864 		__CTASSERT(sizeof(cloop2_magic) <= sizeof(header.cl2h_magic));
865 		(void)memcpy(header.cl2h_magic, cloop2_magic,
866 		    sizeof(cloop2_magic));
867 		header.cl2h_blocksize = htobe32(S->blocksize);
868 		header.cl2h_n_blocks = htobe32(S->n_blocks);
869 
870 		/* Write the header.  */
871 		const ssize_t h_written = pwrite(S->cloop2_fd, &header,
872 		    sizeof(header), 0);
873 		if (h_written == -1)
874 			err_ss(1, "write header");
875 		assert(h_written >= 0);
876 		if ((size_t)h_written != sizeof(header))
877 			errx_ss(1, "partial write of header: %zu != %zu",
878 			    (size_t)h_written, sizeof(header));
879 	}
880 
881 	/* Record how many blocks we've checkpointed.  */
882     {
883 	sigset_t old_sigmask;
884 	block_signals(&old_sigmask);
885 	S->n_checkpointed_blocks = S->blkno;
886 	restore_sigmask(&old_sigmask);
887     }
888 }
889 
890 /*
891  * Release everything we allocated in compress_init.
892  */
893 static void
894 compress_exit(struct compress_state *S)
895 {
896 
897 	/* Done with the offset table.  Destroy it.  */
898 	offtab_destroy(&S->offtab);
899 
900 	/* Done with the files.  Close them.  */
901 	if (close(S->cloop2_fd) == -1)
902 		warn("close(cloop2 fd)");
903 	if (close(S->image_fd) == -1)
904 		warn("close(image fd)");
905 }
906