xref: /dflybsd-src/sys/vfs/hammer2/hammer2_strategy.c (revision aa6ac96e01825b3efcab953441f85adbf9815e0f)
1 /*
2  * Copyright (c) 2011-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression)
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in
17  *    the documentation and/or other materials provided with the
18  *    distribution.
19  * 3. Neither the name of The DragonFly Project nor the names of its
20  *    contributors may be used to endorse or promote products derived
21  *    from this software without specific, prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
27  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
29  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
31  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
32  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
33  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  */
36 /*
37  * This module handles low level logical file I/O (strategy) which backs
38  * the logical buffer cache.
39  */
40 
41 #include <sys/param.h>
42 #include <sys/systm.h>
43 #include <sys/kernel.h>
44 #include <sys/fcntl.h>
45 #include <sys/buf.h>
46 #include <sys/proc.h>
47 #include <sys/namei.h>
48 #include <sys/mount.h>
49 #include <sys/vnode.h>
50 #include <sys/mountctl.h>
51 #include <sys/dirent.h>
52 #include <sys/uio.h>
53 #include <sys/objcache.h>
54 #include <sys/event.h>
55 #include <sys/file.h>
56 #include <vfs/fifofs/fifo.h>
57 
58 #include "hammer2.h"
59 #include "hammer2_lz4.h"
60 
61 #include "zlib/hammer2_zlib.h"
62 
63 struct objcache *cache_buffer_read;
64 struct objcache *cache_buffer_write;
65 
66 /*
67  * Strategy code (async logical file buffer I/O from system)
68  *
69  * WARNING: The strategy code cannot safely use hammer2 transactions
70  *	    as this can deadlock against vfs_sync's vfsync() call
71  *	    if multiple flushes are queued.  All H2 structures must
72  *	    already be present and ready for the DIO.
73  *
74  *	    Reads can be initiated asynchronously, writes have to be
75  *	    spooled to a separate thread for action to avoid deadlocks.
76  */
77 static void hammer2_strategy_xop_read(hammer2_xop_t *arg, int clindex);
78 static void hammer2_strategy_xop_write(hammer2_xop_t *arg, int clindex);
79 static int hammer2_strategy_read(struct vop_strategy_args *ap);
80 static int hammer2_strategy_write(struct vop_strategy_args *ap);
81 static void hammer2_strategy_read_completion(hammer2_chain_t *chain,
82 				char *data, struct bio *bio);
83 
84 int
85 hammer2_vop_strategy(struct vop_strategy_args *ap)
86 {
87 	struct bio *biop;
88 	struct buf *bp;
89 	int error;
90 
91 	biop = ap->a_bio;
92 	bp = biop->bio_buf;
93 
94 	switch(bp->b_cmd) {
95 	case BUF_CMD_READ:
96 		error = hammer2_strategy_read(ap);
97 		++hammer2_iod_file_read;
98 		break;
99 	case BUF_CMD_WRITE:
100 		error = hammer2_strategy_write(ap);
101 		++hammer2_iod_file_write;
102 		break;
103 	default:
104 		bp->b_error = error = EINVAL;
105 		bp->b_flags |= B_ERROR;
106 		biodone(biop);
107 		break;
108 	}
109 	return (error);
110 }
111 
112 /*
113  * Return the largest contiguous physical disk range for the logical
114  * request, in bytes.
115  *
116  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
117  *
118  * Basically disabled, the logical buffer write thread has to deal with
119  * buffers one-at-a-time.
120  */
121 int
122 hammer2_vop_bmap(struct vop_bmap_args *ap)
123 {
124 	*ap->a_doffsetp = NOOFFSET;
125 	if (ap->a_runp)
126 		*ap->a_runp = 0;
127 	if (ap->a_runb)
128 		*ap->a_runb = 0;
129 	return (EOPNOTSUPP);
130 }
131 
132 /****************************************************************************
133  *				READ SUPPORT				    *
134  ****************************************************************************/
135 /*
136  * Callback used in read path in case that a block is compressed with LZ4.
137  */
138 static
139 void
140 hammer2_decompress_LZ4_callback(const char *data, u_int bytes, struct bio *bio)
141 {
142 	struct buf *bp;
143 	char *compressed_buffer;
144 	int compressed_size;
145 	int result;
146 
147 	bp = bio->bio_buf;
148 
149 #if 0
150 	if bio->bio_caller_info2.index &&
151 	      bio->bio_caller_info1.uvalue32 !=
152 	      crc32(bp->b_data, bp->b_bufsize) --- return error
153 #endif
154 
155 	KKASSERT(bp->b_bufsize <= HAMMER2_PBUFSIZE);
156 	compressed_size = *(const int *)data;
157 	KKASSERT(compressed_size <= bytes - sizeof(int));
158 
159 	compressed_buffer = objcache_get(cache_buffer_read, M_INTWAIT);
160 	result = LZ4_decompress_safe(__DECONST(char *, &data[sizeof(int)]),
161 				     compressed_buffer,
162 				     compressed_size,
163 				     bp->b_bufsize);
164 	if (result < 0) {
165 		kprintf("READ PATH: Error during decompression."
166 			"bio %016jx/%d\n",
167 			(intmax_t)bio->bio_offset, bytes);
168 		/* make sure it isn't random garbage */
169 		bzero(compressed_buffer, bp->b_bufsize);
170 	}
171 	KKASSERT(result <= bp->b_bufsize);
172 	bcopy(compressed_buffer, bp->b_data, bp->b_bufsize);
173 	if (result < bp->b_bufsize)
174 		bzero(bp->b_data + result, bp->b_bufsize - result);
175 	objcache_put(cache_buffer_read, compressed_buffer);
176 	bp->b_resid = 0;
177 	bp->b_flags |= B_AGE;
178 }
179 
180 /*
181  * Callback used in read path in case that a block is compressed with ZLIB.
182  * It is almost identical to LZ4 callback, so in theory they can be unified,
183  * but we didn't want to make changes in bio structure for that.
184  */
185 static
186 void
187 hammer2_decompress_ZLIB_callback(const char *data, u_int bytes, struct bio *bio)
188 {
189 	struct buf *bp;
190 	char *compressed_buffer;
191 	z_stream strm_decompress;
192 	int result;
193 	int ret;
194 
195 	bp = bio->bio_buf;
196 
197 	KKASSERT(bp->b_bufsize <= HAMMER2_PBUFSIZE);
198 	strm_decompress.avail_in = 0;
199 	strm_decompress.next_in = Z_NULL;
200 
201 	ret = inflateInit(&strm_decompress);
202 
203 	if (ret != Z_OK)
204 		kprintf("HAMMER2 ZLIB: Fatal error in inflateInit.\n");
205 
206 	compressed_buffer = objcache_get(cache_buffer_read, M_INTWAIT);
207 	strm_decompress.next_in = __DECONST(char *, data);
208 
209 	/* XXX supply proper size, subset of device bp */
210 	strm_decompress.avail_in = bytes;
211 	strm_decompress.next_out = compressed_buffer;
212 	strm_decompress.avail_out = bp->b_bufsize;
213 
214 	ret = inflate(&strm_decompress, Z_FINISH);
215 	if (ret != Z_STREAM_END) {
216 		kprintf("HAMMER2 ZLIB: Fatar error during decompression.\n");
217 		bzero(compressed_buffer, bp->b_bufsize);
218 	}
219 	bcopy(compressed_buffer, bp->b_data, bp->b_bufsize);
220 	result = bp->b_bufsize - strm_decompress.avail_out;
221 	if (result < bp->b_bufsize)
222 		bzero(bp->b_data + result, strm_decompress.avail_out);
223 	objcache_put(cache_buffer_read, compressed_buffer);
224 	ret = inflateEnd(&strm_decompress);
225 
226 	bp->b_resid = 0;
227 	bp->b_flags |= B_AGE;
228 }
229 
230 /*
231  * Logical buffer I/O, async read.
232  */
233 static
234 int
235 hammer2_strategy_read(struct vop_strategy_args *ap)
236 {
237 	hammer2_xop_strategy_t *xop;
238 	struct buf *bp;
239 	struct bio *bio;
240 	struct bio *nbio;
241 	hammer2_inode_t *ip;
242 	hammer2_key_t lbase;
243 
244 	bio = ap->a_bio;
245 	bp = bio->bio_buf;
246 	ip = VTOI(ap->a_vp);
247 	nbio = push_bio(bio);
248 
249 	lbase = bio->bio_offset;
250 	KKASSERT(((int)lbase & HAMMER2_PBUFMASK) == 0);
251 
252 	xop = hammer2_xop_alloc(ip, 0);
253 	xop->finished = 0;
254 	xop->bio = bio;
255 	xop->lbase = lbase;
256 	hammer2_mtx_init(&xop->lock, "h2bio");
257 	hammer2_xop_start(&xop->head, hammer2_strategy_xop_read);
258 
259 	return(0);
260 }
261 
262 /*
263  * Per-node XOP (threaded), do a synchronous lookup of the chain and
264  * its data.  The frontend is asynchronous, so we are also responsible
265  * for racing to terminate the frontend.
266  */
static
void
hammer2_strategy_xop_read(hammer2_xop_t *arg, int clindex)
{
	hammer2_xop_strategy_t *xop = &arg->xop_strategy;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_key_t key_dummy;
	hammer2_key_t lbase;
	struct bio *bio;
	struct buf *bp;
	int cache_index = -1;
	int error;

	lbase = xop->lbase;
	bio = xop->bio;
	bp = bio->bio_buf;

	/*
	 * Look up the chain backing the logical offset on this cluster
	 * node.  Shared locks are sufficient for the read path.
	 */
	parent = hammer2_inode_chain(xop->head.ip1, clindex,
				     HAMMER2_RESOLVE_ALWAYS |
				     HAMMER2_RESOLVE_SHARED);
	if (parent) {
		chain = hammer2_chain_lookup(&parent, &key_dummy,
					     lbase, lbase,
					     &cache_index,
					     HAMMER2_LOOKUP_ALWAYS |
					     HAMMER2_LOOKUP_SHARED);
		error = chain ? chain->error : 0;
	} else {
		error = EIO;
		chain = NULL;
	}
	/*
	 * Feed the result (or error) to the collector, then release
	 * our local chain/parent references.
	 */
	error = hammer2_xop_feed(&xop->head, chain, clindex, error);
	if (chain)
		hammer2_chain_drop(chain);
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
	}
	chain = NULL;	/* safety */
	parent = NULL;	/* safety */

	/*
	 * Race to finish the frontend.  The first backend thread to
	 * see a collectable result completes the bio; xop->finished
	 * (checked first without the lock as a fast path, then again
	 * under xop->lock) keeps the others out.
	 */
	if (xop->finished)
		return;
	hammer2_mtx_ex(&xop->lock);
	if (xop->finished) {
		hammer2_mtx_unlock(&xop->lock);
		return;
	}

	/*
	 * Async operation has not completed and we now own the lock.
	 * Determine if we can complete the operation by issuing the
	 * frontend collection non-blocking.
	 */
	error = hammer2_xop_collect(&xop->head, HAMMER2_XOP_COLLECT_NOWAIT);

	switch(error) {
	case 0:
		/*
		 * Collection succeeded: copy or decompress the focus
		 * chain's data into the logical buffer and complete
		 * the bio.
		 */
		xop->finished = 1;
		hammer2_mtx_unlock(&xop->lock);
		chain = xop->head.cluster.focus;
		hammer2_strategy_read_completion(chain, (char *)chain->data,
						 xop->bio);
		hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
		biodone(bio);
		break;
	case ENOENT:
		/*
		 * No chain exists at this offset: the block is a hole,
		 * return zeros.
		 */
		xop->finished = 1;
		hammer2_mtx_unlock(&xop->lock);
		bp->b_resid = 0;
		bp->b_error = 0;
		bzero(bp->b_data, bp->b_bcount);
		biodone(bio);
		hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
		break;
	case EINPROGRESS:
		/*
		 * Not enough nodes have responded yet; a later backend
		 * thread will race to finish the frontend.
		 */
		hammer2_mtx_unlock(&xop->lock);
		break;
	default:
		/*
		 * Hard error, fail the bio with EIO.
		 */
		xop->finished = 1;
		hammer2_mtx_unlock(&xop->lock);
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
		biodone(bio);
		hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
		break;
	}
}
359 
360 static
361 void
362 hammer2_strategy_read_completion(hammer2_chain_t *chain, char *data,
363 				 struct bio *bio)
364 {
365 	struct buf *bp = bio->bio_buf;
366 
367 	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
368 		/*
369 		 * Data is embedded in the inode (copy from inode).
370 		 */
371 		bcopy(((hammer2_inode_data_t *)data)->u.data,
372 		      bp->b_data, HAMMER2_EMBEDDED_BYTES);
373 		bzero(bp->b_data + HAMMER2_EMBEDDED_BYTES,
374 		      bp->b_bcount - HAMMER2_EMBEDDED_BYTES);
375 		bp->b_resid = 0;
376 		bp->b_error = 0;
377 	} else if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
378 		/*
379 		 * Data is on-media, issue device I/O and copy.
380 		 *
381 		 * XXX direct-IO shortcut could go here XXX.
382 		 */
383 		switch (HAMMER2_DEC_COMP(chain->bref.methods)) {
384 		case HAMMER2_COMP_LZ4:
385 			hammer2_decompress_LZ4_callback(data, chain->bytes,
386 							bio);
387 			break;
388 		case HAMMER2_COMP_ZLIB:
389 			hammer2_decompress_ZLIB_callback(data, chain->bytes,
390 							 bio);
391 			break;
392 		case HAMMER2_COMP_NONE:
393 			KKASSERT(chain->bytes <= bp->b_bcount);
394 			bcopy(data, bp->b_data, chain->bytes);
395 			if (chain->bytes < bp->b_bcount) {
396 				bzero(bp->b_data + chain->bytes,
397 				      bp->b_bcount - chain->bytes);
398 			}
399 			bp->b_flags |= B_NOTMETA;
400 			bp->b_resid = 0;
401 			bp->b_error = 0;
402 			break;
403 		default:
404 			panic("hammer2_strategy_read: "
405 			      "unknown compression type");
406 		}
407 	} else {
408 		panic("hammer2_strategy_read: unknown bref type");
409 	}
410 }
411 
412 /****************************************************************************
413  *				WRITE SUPPORT				    *
414  ****************************************************************************/
415 
416 /*
417  * Functions for compression in threads,
418  * from hammer2_vnops.c
419  */
420 static void hammer2_write_file_core(struct buf *bp, hammer2_inode_t *ip,
421 				hammer2_chain_t **parentp,
422 				hammer2_key_t lbase, int ioflag, int pblksize,
423 				hammer2_tid_t mtid, int *errorp);
424 static void hammer2_compress_and_write(struct buf *bp, hammer2_inode_t *ip,
425 				hammer2_chain_t **parentp,
426 				hammer2_key_t lbase, int ioflag, int pblksize,
427 				hammer2_tid_t mtid, int *errorp,
428 				int comp_algo, int check_algo);
429 static void hammer2_zero_check_and_write(struct buf *bp, hammer2_inode_t *ip,
430 				hammer2_chain_t **parentp,
431 				hammer2_key_t lbase, int ioflag, int pblksize,
432 				hammer2_tid_t mtid, int *errorp,
433 				int check_algo);
434 static int test_block_zeros(const char *buf, size_t bytes);
435 static void zero_write(struct buf *bp, hammer2_inode_t *ip,
436 				hammer2_chain_t **parentp,
437 				hammer2_key_t lbase,
438 				hammer2_tid_t mtid, int *errorp);
439 static void hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp,
440 				int ioflag, int pblksize,
441 				hammer2_tid_t mtid, int *errorp,
442 				int check_algo);
443 
444 static
445 int
446 hammer2_strategy_write(struct vop_strategy_args *ap)
447 {
448 	hammer2_xop_strategy_t *xop;
449 	hammer2_pfs_t *pmp;
450 	struct bio *bio;
451 	struct buf *bp;
452 	hammer2_inode_t *ip;
453 
454 	bio = ap->a_bio;
455 	bp = bio->bio_buf;
456 	ip = VTOI(ap->a_vp);
457 	pmp = ip->pmp;
458 
459 	hammer2_lwinprog_ref(pmp);
460 	hammer2_trans_assert_strategy(pmp);
461 
462 	xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
463 	xop->finished = 0;
464 	xop->bio = bio;
465 	xop->lbase = bio->bio_offset;
466 	hammer2_xop_start(&xop->head, hammer2_strategy_xop_write);
467 	/* asynchronous completion */
468 
469 	hammer2_lwinprog_wait(pmp, hammer2_flush_pipe);
470 
471 	return(0);
472 }
473 
474 /*
475  * Per-node XOP (threaded).  Write the logical buffer to the media.
476  */
static
void
hammer2_strategy_xop_write(hammer2_xop_t *arg, int clindex)
{
	hammer2_xop_strategy_t *xop = &arg->xop_strategy;
	hammer2_chain_t *parent;
	hammer2_key_t lbase;
	hammer2_inode_t *ip;
	struct bio *bio;
	struct buf *bp;
	int error;
	int lblksize;
	int pblksize;

	lbase = xop->lbase;
	bio = xop->bio;
	bp = bio->bio_buf;
	ip = xop->head.ip1;

	/* hammer2_trans_init(parent->hmp->spmp, HAMMER2_TRANS_BUFCACHE); */

	/*
	 * Compute logical/physical block sizes for this offset and
	 * hand the buffer to the core write path against this node's
	 * inode chain.  (lblksize is currently computed but unused.)
	 *
	 * NOTE(review): zero_write() declares its errorp __unused, so
	 * on the all-zeros path hammer2_write_file_core() may leave
	 * 'error' uninitialized here -- verify.
	 */
	lblksize = hammer2_calc_logical(ip, bio->bio_offset, &lbase, NULL);
	pblksize = hammer2_calc_physical(ip, lbase);
	parent = hammer2_inode_chain(ip, clindex, HAMMER2_RESOLVE_ALWAYS);
	hammer2_write_file_core(bp, ip, &parent,
				lbase, IO_ASYNC, pblksize,
				xop->head.mtid, &error);
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		parent = NULL;	/* safety */
	}
	error = hammer2_xop_feed(&xop->head, NULL, clindex, error);

	/*
	 * Race to finish the frontend.  The first backend thread to
	 * see a collectable result completes the bio; xop->finished
	 * (checked without the lock as a fast path, then again under
	 * xop->lock) keeps the others out.
	 */
	if (xop->finished)
		return;
	hammer2_mtx_ex(&xop->lock);
	if (xop->finished) {
		hammer2_mtx_unlock(&xop->lock);
		return;
	}

	/*
	 * Async operation has not completed and we now own the lock.
	 * Determine if we can complete the operation by issuing the
	 * frontend collection non-blocking.
	 */
	error = hammer2_xop_collect(&xop->head, HAMMER2_XOP_COLLECT_NOWAIT);

	switch(error) {
	case ENOENT:
	case 0:
		/*
		 * Write (or hole) resolved successfully, complete the
		 * bio and release the write-in-progress count.
		 */
		xop->finished = 1;
		hammer2_mtx_unlock(&xop->lock);
		bp->b_resid = 0;
		bp->b_error = 0;
		biodone(bio);
		hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
		hammer2_lwinprog_drop(ip->pmp);
		break;
	case EINPROGRESS:
		/*
		 * Not enough nodes have responded yet; a later backend
		 * thread will race to finish the frontend.
		 */
		hammer2_mtx_unlock(&xop->lock);
		break;
	default:
		/*
		 * Hard error, fail the bio with EIO.
		 */
		xop->finished = 1;
		hammer2_mtx_unlock(&xop->lock);
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
		biodone(bio);
		hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
		hammer2_lwinprog_drop(ip->pmp);
		break;
	}
}
554 
555 /*
556  * Wait for pending I/O to complete
557  */
void
hammer2_bioq_sync(hammer2_pfs_t *pmp)
{
	/*
	 * Wait for the logical-write-in-progress count to drain to
	 * zero, i.e. all queued strategy writes have completed.
	 */
	hammer2_lwinprog_wait(pmp, 0);
}
563 
564 /*
565  * Create a new cluster at (cparent, lbase) and assign physical storage,
566  * returning a cluster suitable for I/O.  The cluster will be in a modified
567  * state.
568  *
569  * cparent can wind up being anything.
570  *
571  * NOTE: Special case for data embedded in inode.
572  */
static
hammer2_chain_t *
hammer2_assign_physical(hammer2_inode_t *ip, hammer2_chain_t **parentp,
			hammer2_key_t lbase, int pblksize,
			hammer2_tid_t mtid, int *errorp)
{
	hammer2_chain_t *chain;
	hammer2_key_t key_dummy;
	int pradix = hammer2_getradix(pblksize);
	int cache_index = -1;

	/*
	 * Locate the chain associated with lbase, return a locked chain.
	 * However, do not instantiate any data reference (which utilizes a
	 * device buffer) because we will be using direct IO via the
	 * logical buffer cache buffer.
	 */
	*errorp = 0;
	KKASSERT(pblksize >= HAMMER2_ALLOC_MIN);
retry:
	chain = hammer2_chain_lookup(parentp, &key_dummy,
				     lbase, lbase,
				     &cache_index,
				     HAMMER2_LOOKUP_NODATA);
	if (chain == NULL) {
		/*
		 * We found a hole, create a new chain entry.
		 *
		 * NOTE: DATA chains are created without device backing
		 *	 store (nor do we want any).
		 */
		*errorp = hammer2_chain_create(parentp, &chain, ip->pmp,
					       lbase, HAMMER2_PBUFRADIX,
					       HAMMER2_BREF_TYPE_DATA,
					       pblksize,
					       mtid, 0);
		if (chain == NULL) {
			panic("hammer2_chain_create: par=%p error=%d\n",
			      *parentp, *errorp);
			/* NOT REACHED: panic() does not return, so this
			 * retry is dead code. */
			goto retry;
		}
		/*ip->delta_dcount += pblksize;*/
	} else {
		switch (chain->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * The data is embedded in the inode, which requires
			 * a bit more finess.
			 */
			hammer2_chain_modify_ip(ip, chain, mtid, 0);
			break;
		case HAMMER2_BREF_TYPE_DATA:
			/*
			 * Existing block with the wrong physical size
			 * must be resized to the requested radix.
			 */
			if (chain->bytes != pblksize) {
				hammer2_chain_resize(ip, *parentp, chain,
						     mtid, pradix,
						     HAMMER2_MODIFY_OPTDATA);
			}

			/*
			 * DATA buffers must be marked modified whether the
			 * data is in a logical buffer or not.  We also have
			 * to make this call to fixup the chain data pointers
			 * after resizing in case this is an encrypted or
			 * compressed buffer.
			 */
			hammer2_chain_modify(chain, mtid,
					     HAMMER2_MODIFY_OPTDATA);
			break;
		default:
			panic("hammer2_assign_physical: bad type");
			/* NOT REACHED */
			break;
		}
	}
	return (chain);
}
649 
650 /*
651  * hammer2_write_file_core() - hammer2_write_thread() helper
652  *
653  * The core write function which determines which path to take
654  * depending on compression settings.  We also have to locate the
655  * related chains so we can calculate and set the check data for
656  * the blockref.
657  */
static
void
hammer2_write_file_core(struct buf *bp, hammer2_inode_t *ip,
			hammer2_chain_t **parentp,
			hammer2_key_t lbase, int ioflag, int pblksize,
			hammer2_tid_t mtid, int *errorp)
{
	hammer2_chain_t *chain;

	/* Dispatch based on the inode's compression setting. */
	switch(HAMMER2_DEC_ALGO(ip->meta.comp_algo)) {
	case HAMMER2_COMP_NONE:
		/*
		 * We have to assign physical storage to the buffer
		 * we intend to dirty or write now to avoid deadlocks
		 * in the strategy code later.
		 *
		 * This can return NOOFFSET for inode-embedded data.
		 * The strategy code will take care of it in that case.
		 *
		 * NOTE(review): unlike the compression path below,
		 * *errorp is not checked before dereferencing chain --
		 * verify hammer2_assign_physical() cannot return NULL
		 * on this path.
		 */
		chain = hammer2_assign_physical(ip, parentp, lbase, pblksize,
						mtid, errorp);
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
			hammer2_inode_data_t *wipdata;

			/* embedded data: copy directly into the inode */
			wipdata = &chain->data->ipdata;
			KKASSERT(wipdata->meta.op_flags &
				 HAMMER2_OPFLAG_DIRECTDATA);
			KKASSERT(bp->b_loffset == 0);
			bcopy(bp->b_data, wipdata->u.data,
			      HAMMER2_EMBEDDED_BYTES);
		} else {
			hammer2_write_bp(chain, bp, ioflag, pblksize,
					 mtid, errorp, ip->meta.check_algo);
		}
		if (chain) {
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
		}
		break;
	case HAMMER2_COMP_AUTOZERO:
		/*
		 * Check for zero-fill only
		 */
		hammer2_zero_check_and_write(bp, ip, parentp,
					     lbase, ioflag, pblksize,
					     mtid, errorp,
					     ip->meta.check_algo);
		break;
	case HAMMER2_COMP_LZ4:
	case HAMMER2_COMP_ZLIB:
	default:
		/*
		 * Check for zero-fill and attempt compression.
		 */
		hammer2_compress_and_write(bp, ip, parentp,
					   lbase, ioflag, pblksize,
					   mtid, errorp,
					   ip->meta.comp_algo,
					   ip->meta.check_algo);
		break;
	}
}
720 
721 /*
722  * Helper
723  *
724  * Generic function that will perform the compression in compression
725  * write path. The compression algorithm is determined by the settings
726  * obtained from inode.
727  */
static
void
hammer2_compress_and_write(struct buf *bp, hammer2_inode_t *ip,
	hammer2_chain_t **parentp,
	hammer2_key_t lbase, int ioflag, int pblksize,
	hammer2_tid_t mtid, int *errorp, int comp_algo, int check_algo)
{
	hammer2_chain_t *chain;
	int comp_size;
	int comp_block_size;
	char *comp_buffer;

	/*
	 * All-zero blocks are recorded as holes instead of being
	 * written out.
	 */
	if (test_block_zeros(bp->b_data, pblksize)) {
		zero_write(bp, ip, parentp, lbase, mtid, errorp);
		return;
	}

	comp_size = 0;
	comp_buffer = NULL;

	/* compressed output is capped at pblksize / 2, max 32KB */
	KKASSERT(pblksize / 2 <= 32768);

	/*
	 * Only attempt compression while the heuristic indicates it
	 * has been paying off (comp_heuristic < 8), or periodically
	 * (every 8th block) to re-test after repeated failures.
	 */
	if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0) {
		z_stream strm_compress;
		int comp_level;
		int ret;

		switch(HAMMER2_DEC_ALGO(comp_algo)) {
		case HAMMER2_COMP_LZ4:
			comp_buffer = objcache_get(cache_buffer_write,
						   M_INTWAIT);
			comp_size = LZ4_compress_limitedOutput(
					bp->b_data,
					&comp_buffer[sizeof(int)],
					pblksize,
					pblksize / 2 - sizeof(int));
			/*
			 * We need to prefix with the size, LZ4
			 * doesn't do it for us.  Add the related
			 * overhead.
			 */
			*(int *)comp_buffer = comp_size;
			if (comp_size)
				comp_size += sizeof(int);
			break;
		case HAMMER2_COMP_ZLIB:
			/* clamp the requested level into [6, 9] */
			comp_level = HAMMER2_DEC_LEVEL(comp_algo);
			if (comp_level == 0)
				comp_level = 6;	/* default zlib compression */
			else if (comp_level < 6)
				comp_level = 6;
			else if (comp_level > 9)
				comp_level = 9;
			/*
			 * NOTE(review): strm_compress fields (zalloc,
			 * zfree, opaque) are not initialized before
			 * deflateInit() -- presumably the in-kernel
			 * zlib port tolerates this; verify.
			 */
			ret = deflateInit(&strm_compress, comp_level);
			if (ret != Z_OK) {
				kprintf("HAMMER2 ZLIB: fatal error "
					"on deflateInit.\n");
			}

			comp_buffer = objcache_get(cache_buffer_write,
						   M_INTWAIT);
			strm_compress.next_in = bp->b_data;
			strm_compress.avail_in = pblksize;
			strm_compress.next_out = comp_buffer;
			strm_compress.avail_out = pblksize / 2;
			ret = deflate(&strm_compress, Z_FINISH);
			if (ret == Z_STREAM_END) {
				comp_size = pblksize / 2 -
					    strm_compress.avail_out;
			} else {
				/* did not fit in half the block */
				comp_size = 0;
			}
			ret = deflateEnd(&strm_compress);
			break;
		default:
			kprintf("Error: Unknown compression method.\n");
			kprintf("Comp_method = %d.\n", comp_algo);
			break;
		}
	}

	if (comp_size == 0) {
		/*
		 * compression failed or turned off
		 */
		comp_block_size = pblksize;	/* safety */
		if (++ip->comp_heuristic > 128)
			ip->comp_heuristic = 8;
	} else {
		/*
		 * compression succeeded
		 */
		ip->comp_heuristic = 0;
		/* round the result up to a supported block size */
		if (comp_size <= 1024) {
			comp_block_size = 1024;
		} else if (comp_size <= 2048) {
			comp_block_size = 2048;
		} else if (comp_size <= 4096) {
			comp_block_size = 4096;
		} else if (comp_size <= 8192) {
			comp_block_size = 8192;
		} else if (comp_size <= 16384) {
			comp_block_size = 16384;
		} else if (comp_size <= 32768) {
			comp_block_size = 32768;
		} else {
			panic("hammer2: WRITE PATH: "
			      "Weird comp_size value.");
			/* NOT REACHED */
			comp_block_size = pblksize;
		}
	}

	/*
	 * Assign physical storage at the (possibly reduced) block
	 * size and copy out, either into the inode (embedded) or via
	 * a device buffer.
	 */
	chain = hammer2_assign_physical(ip, parentp, lbase, comp_block_size,
					mtid, errorp);
	if (*errorp) {
		kprintf("WRITE PATH: An error occurred while "
			"assigning physical space.\n");
		KKASSERT(chain == NULL);
		goto done;
	}

	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
		hammer2_inode_data_t *wipdata;

		/* embedded data: copy the uncompressed buffer */
		hammer2_chain_modify_ip(ip, chain, mtid, 0);
		wipdata = &chain->data->ipdata;
		KKASSERT(wipdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA);
		KKASSERT(bp->b_loffset == 0);
		bcopy(bp->b_data, wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
	} else {
		hammer2_io_t *dio;
		char *bdata;

		KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);

		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			panic("hammer2_write_bp: unexpected inode\n");
			break;
		case HAMMER2_BREF_TYPE_DATA:
			/*
			 * Optimize out the read-before-write
			 * if possible.
			 */
			*errorp = hammer2_io_newnz(chain->hmp,
						   chain->bref.data_off,
						   chain->bytes,
						   &dio);
			if (*errorp) {
				hammer2_io_brelse(&dio);
				kprintf("hammer2: WRITE PATH: "
					"dbp bread error\n");
				break;
			}
			bdata = hammer2_io_data(dio, chain->bref.data_off);

			/*
			 * When loading the block make sure we don't
			 * leave garbage after the compressed data.
			 */
			if (comp_size) {
				chain->bref.methods =
					HAMMER2_ENC_COMP(comp_algo) +
					HAMMER2_ENC_CHECK(check_algo);
				bcopy(comp_buffer, bdata, comp_size);
				if (comp_size != comp_block_size) {
					bzero(bdata + comp_size,
					      comp_block_size - comp_size);
				}
			} else {
				chain->bref.methods =
					HAMMER2_ENC_COMP(
						HAMMER2_COMP_NONE) +
					HAMMER2_ENC_CHECK(check_algo);
				bcopy(bp->b_data, bdata, pblksize);
			}

			/*
			 * The flush code doesn't calculate check codes for
			 * file data (doing so can result in excessive I/O),
			 * so we do it here.
			 */
			hammer2_chain_setcheck(chain, bdata);

			/*
			 * Device buffer is now valid, chain is no longer in
			 * the initial state.
			 *
			 * (No blockref table worries with file data)
			 */
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);

			/* Now write the related bdp. */
			if (ioflag & IO_SYNC) {
				/*
				 * Synchronous I/O requested.
				 */
				hammer2_io_bwrite(&dio);
			/*
			} else if ((ioflag & IO_DIRECT) &&
				   loff + n == pblksize) {
				hammer2_io_bdwrite(&dio);
			*/
			} else if (ioflag & IO_ASYNC) {
				hammer2_io_bawrite(&dio);
			} else {
				hammer2_io_bdwrite(&dio);
			}
			break;
		default:
			panic("hammer2_write_bp: bad chain type %d\n",
				chain->bref.type);
			/* NOT REACHED */
			break;
		}
	}
done:
	if (chain) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
	}
	if (comp_buffer)
		objcache_put(cache_buffer_write, comp_buffer);
}
953 
954 /*
955  * Helper
956  *
957  * Function that performs zero-checking and writing without compression,
958  * it corresponds to default zero-checking path.
959  */
960 static
961 void
962 hammer2_zero_check_and_write(struct buf *bp, hammer2_inode_t *ip,
963 	hammer2_chain_t **parentp,
964 	hammer2_key_t lbase, int ioflag, int pblksize,
965 	hammer2_tid_t mtid, int *errorp,
966 	int check_algo)
967 {
968 	hammer2_chain_t *chain;
969 
970 	if (test_block_zeros(bp->b_data, pblksize)) {
971 		zero_write(bp, ip, parentp, lbase, mtid, errorp);
972 	} else {
973 		chain = hammer2_assign_physical(ip, parentp, lbase, pblksize,
974 						mtid, errorp);
975 		hammer2_write_bp(chain, bp, ioflag, pblksize,
976 				 mtid, errorp, check_algo);
977 		if (chain) {
978 			hammer2_chain_unlock(chain);
979 			hammer2_chain_drop(chain);
980 		}
981 	}
982 }
983 
984 /*
985  * Helper
986  *
987  * A function to test whether a block of data contains only zeros,
988  * returns TRUE (non-zero) if the block is all zeros.
989  */
static
int
test_block_zeros(const char *buf, size_t bytes)
{
	const long *scan;
	size_t nwords;

	/*
	 * Scan in long-sized strides.  The word count is rounded up
	 * to match the original index-based loop exactly (callers
	 * pass power-of-two block sizes, so bytes is normally a
	 * multiple of sizeof(long) anyway).
	 */
	scan = (const long *)buf;
	for (nwords = (bytes + sizeof(long) - 1) / sizeof(long);
	     nwords != 0; --nwords) {
		if (*scan++ != 0)
			return (0);
	}
	return (1);
}
1002 
1003 /*
1004  * Helper
1005  *
1006  * Function to "write" a block that contains only zeros.
1007  */
1008 static
1009 void
1010 zero_write(struct buf *bp, hammer2_inode_t *ip,
1011 	   hammer2_chain_t **parentp,
1012 	   hammer2_key_t lbase, hammer2_tid_t mtid, int *errorp __unused)
1013 {
1014 	hammer2_chain_t *chain;
1015 	hammer2_key_t key_dummy;
1016 	int cache_index = -1;
1017 
1018 	chain = hammer2_chain_lookup(parentp, &key_dummy,
1019 				     lbase, lbase,
1020 				     &cache_index,
1021 				     HAMMER2_LOOKUP_NODATA);
1022 	if (chain) {
1023 		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1024 			hammer2_inode_data_t *wipdata;
1025 
1026 			hammer2_chain_modify_ip(ip, chain, mtid, 0);
1027 			wipdata = &chain->data->ipdata;
1028 			KKASSERT(wipdata->meta.op_flags &
1029 				 HAMMER2_OPFLAG_DIRECTDATA);
1030 			KKASSERT(bp->b_loffset == 0);
1031 			bzero(wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
1032 		} else {
1033 			hammer2_chain_delete(*parentp, chain,
1034 					     mtid, HAMMER2_DELETE_PERMANENT);
1035 		}
1036 		hammer2_chain_unlock(chain);
1037 		hammer2_chain_drop(chain);
1038 	}
1039 }
1040 
1041 /*
1042  * Helper
1043  *
1044  * Function to write the data as it is, without performing any sort of
1045  * compression. This function is used in path without compression and
1046  * default zero-checking path.
1047  */
static
void
hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp, int ioflag,
		 int pblksize,
		 hammer2_tid_t mtid, int *errorp, int check_algo)
{
	hammer2_inode_data_t *wipdata;
	hammer2_io_t *dio;
	char *bdata;
	int error;

	error = 0;	/* XXX TODO below */

	KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);

	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/*
		 * Embedded data: copy straight into the inode's data
		 * area; no device I/O is needed.
		 */
		wipdata = &chain->data->ipdata;
		KKASSERT(wipdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA);
		KKASSERT(bp->b_loffset == 0);
		bcopy(bp->b_data, wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
		error = 0;
		break;
	case HAMMER2_BREF_TYPE_DATA:
		/*
		 * Acquire the device buffer, optimizing out the
		 * read-before-write where possible (newnz).
		 */
		error = hammer2_io_newnz(chain->hmp,
					 chain->bref.data_off,
					 chain->bytes, &dio);
		if (error) {
			hammer2_io_bqrelse(&dio);
			kprintf("hammer2: WRITE PATH: "
				"dbp bread error\n");
			break;
		}
		bdata = hammer2_io_data(dio, chain->bref.data_off);

		/* record "no compression" plus the check method */
		chain->bref.methods = HAMMER2_ENC_COMP(
						HAMMER2_COMP_NONE) +
				      HAMMER2_ENC_CHECK(check_algo);
		bcopy(bp->b_data, bdata, chain->bytes);

		/*
		 * The flush code doesn't calculate check codes for
		 * file data (doing so can result in excessive I/O),
		 * so we do it here.
		 */
		hammer2_chain_setcheck(chain, bdata);

		/*
		 * Device buffer is now valid, chain is no longer in
		 * the initial state.
		 *
		 * (No blockref table worries with file data)
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);

		if (ioflag & IO_SYNC) {
			/*
			 * Synchronous I/O requested.
			 */
			hammer2_io_bwrite(&dio);
		/*
		} else if ((ioflag & IO_DIRECT) &&
			   loff + n == pblksize) {
			hammer2_io_bdwrite(&dio);
		*/
		} else if (ioflag & IO_ASYNC) {
			hammer2_io_bawrite(&dio);
		} else {
			hammer2_io_bdwrite(&dio);
		}
		break;
	default:
		panic("hammer2_write_bp: bad chain type %d\n",
		      chain->bref.type);
		/* NOT REACHED */
		error = 0;
		break;
	}
	KKASSERT(error == 0);	/* XXX TODO */
	*errorp = error;
}
1129