xref: /dflybsd-src/sys/vfs/hammer2/hammer2_strategy.c (revision f1324544d77eb13a4e38abb5cdfabd4fb67a7b57)
/*
 * Copyright (c) 2011-2015 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This module handles low-level logical file I/O (strategy) which backs
 * the logical buffer cache.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/mountctl.h>
#include <sys/dirent.h>
#include <sys/uio.h>
#include <sys/objcache.h>
#include <sys/event.h>
#include <sys/file.h>
#include <vfs/fifofs/fifo.h>

#include "hammer2.h"
#include "hammer2_lz4.h"

#include "zlib/hammer2_zlib.h"

struct objcache *cache_buffer_read;
struct objcache *cache_buffer_write;

/*
 * Strategy code (async logical file buffer I/O from system)
 *
 * WARNING: The strategy code cannot safely use hammer2 transactions
 *	    as this can deadlock against vfs_sync's vfsync() call
 *	    if multiple flushes are queued.  All H2 structures must
 *	    already be present and ready for the DIO.
 *
 *	    Reads can be initiated asynchronously; writes have to be
 *	    spooled to a separate thread for action to avoid deadlocks.
 */
static void hammer2_strategy_xop_read(hammer2_xop_t *arg, int clindex);
static void hammer2_strategy_xop_write(hammer2_xop_t *arg, int clindex);
static int hammer2_strategy_read(struct vop_strategy_args *ap);
static int hammer2_strategy_write(struct vop_strategy_args *ap);
static void hammer2_strategy_read_completion(hammer2_chain_t *chain,
				char *data, struct bio *bio);

int
hammer2_vop_strategy(struct vop_strategy_args *ap)
{
	struct bio *biop;
	struct buf *bp;
	int error;

	biop = ap->a_bio;
	bp = biop->bio_buf;

	switch(bp->b_cmd) {
	case BUF_CMD_READ:
		error = hammer2_strategy_read(ap);
		++hammer2_iod_file_read;
		break;
	case BUF_CMD_WRITE:
		error = hammer2_strategy_write(ap);
		++hammer2_iod_file_write;
		break;
	default:
		bp->b_error = error = EINVAL;
		bp->b_flags |= B_ERROR;
		biodone(biop);
		break;
	}
	return (error);
}

/*
 * Return the largest contiguous physical disk range for the logical
 * request, in bytes.
 *
 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
 *
 * Basically disabled; the logical buffer write thread has to deal with
 * buffers one-at-a-time.
 */
int
hammer2_vop_bmap(struct vop_bmap_args *ap)
{
	*ap->a_doffsetp = NOOFFSET;
	if (ap->a_runp)
		*ap->a_runp = 0;
	if (ap->a_runb)
		*ap->a_runb = 0;
	return (EOPNOTSUPP);
}

/****************************************************************************
 *				READ SUPPORT				    *
 ****************************************************************************/
/*
 * Callback used in the read path when a block is compressed with LZ4.
 */
static
void
hammer2_decompress_LZ4_callback(const char *data, u_int bytes, struct bio *bio)
{
	struct buf *bp;
	char *compressed_buffer;
	int compressed_size;
	int result;

	bp = bio->bio_buf;

#if 0
	if (bio->bio_caller_info2.index &&
	    bio->bio_caller_info1.uvalue32 !=
	    crc32(bp->b_data, bp->b_bufsize)) {
		/* return error */
	}
#endif

	KKASSERT(bp->b_bufsize <= HAMMER2_PBUFSIZE);
	compressed_size = *(const int *)data;
	KKASSERT(compressed_size <= bytes - sizeof(int));

	compressed_buffer = objcache_get(cache_buffer_read, M_INTWAIT);
	result = LZ4_decompress_safe(__DECONST(char *, &data[sizeof(int)]),
				     compressed_buffer,
				     compressed_size,
				     bp->b_bufsize);
	if (result < 0) {
		kprintf("READ PATH: Error during decompression. "
			"bio %016jx/%d\n",
			(intmax_t)bio->bio_offset, bytes);
		/* make sure it isn't random garbage */
		bzero(compressed_buffer, bp->b_bufsize);
	}
	KKASSERT(result <= bp->b_bufsize);
	bcopy(compressed_buffer, bp->b_data, bp->b_bufsize);
	if (result < bp->b_bufsize)
		bzero(bp->b_data + result, bp->b_bufsize - result);
	objcache_put(cache_buffer_read, compressed_buffer);
	bp->b_resid = 0;
	bp->b_flags |= B_AGE;
}
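
/*
 * Illustrative sketch (not compiled): the on-media layout consumed by
 * the LZ4 callback above is a native-endian int payload length followed
 * by the LZ4 stream itself.  A minimal validity check for such a block
 * might look like this; the helper name is hypothetical and "data"/
 * "bytes" mirror the callback arguments.
 */
#if 0
static int
hammer2_lz4_payload_size(const char *data, u_int bytes)
{
	int csize = *(const int *)data;		/* length prefix */

	if (csize <= 0 || csize > (int)(bytes - sizeof(int)))
		return (-1);			/* corrupt or truncated */
	return (csize);
}
#endif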

/*
 * Callback used in the read path when a block is compressed with ZLIB.
 * It is almost identical to the LZ4 callback, so in theory the two could
 * be unified, but we did not want to change the bio structure for that.
 */
static
void
hammer2_decompress_ZLIB_callback(const char *data, u_int bytes, struct bio *bio)
{
	struct buf *bp;
	char *compressed_buffer;
	z_stream strm_decompress;
	int result;
	int ret;

	bp = bio->bio_buf;

	KKASSERT(bp->b_bufsize <= HAMMER2_PBUFSIZE);
	strm_decompress.avail_in = 0;
	strm_decompress.next_in = Z_NULL;

	ret = inflateInit(&strm_decompress);

	if (ret != Z_OK)
		kprintf("HAMMER2 ZLIB: Fatal error in inflateInit.\n");

	compressed_buffer = objcache_get(cache_buffer_read, M_INTWAIT);
	strm_decompress.next_in = __DECONST(char *, data);

	/* XXX supply proper size, subset of device bp */
	strm_decompress.avail_in = bytes;
	strm_decompress.next_out = compressed_buffer;
	strm_decompress.avail_out = bp->b_bufsize;

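	/*
	 * Single-shot decompression: the whole input is available, so
	 * with Z_FINISH a successful inflate() must return Z_STREAM_END
	 * (a plain Z_OK here would mean the output buffer was too small).
	 */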
	ret = inflate(&strm_decompress, Z_FINISH);
	if (ret != Z_STREAM_END) {
		kprintf("HAMMER2 ZLIB: Fatal error during decompression.\n");
		bzero(compressed_buffer, bp->b_bufsize);
	}
	bcopy(compressed_buffer, bp->b_data, bp->b_bufsize);
	result = bp->b_bufsize - strm_decompress.avail_out;
	if (result < bp->b_bufsize)
		bzero(bp->b_data + result, strm_decompress.avail_out);
	objcache_put(cache_buffer_read, compressed_buffer);
	ret = inflateEnd(&strm_decompress);

	bp->b_resid = 0;
	bp->b_flags |= B_AGE;
}

/*
 * Logical buffer I/O, async read.
 */
static
int
hammer2_strategy_read(struct vop_strategy_args *ap)
{
	hammer2_xop_strategy_t *xop;
	struct buf *bp;
	struct bio *bio;
	struct bio *nbio;
	hammer2_inode_t *ip;
	hammer2_key_t lbase;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = VTOI(ap->a_vp);
	nbio = push_bio(bio);

	lbase = bio->bio_offset;
	KKASSERT(((int)lbase & HAMMER2_PBUFMASK) == 0);

	xop = hammer2_xop_alloc(ip, 0);
	xop->finished = 0;
	xop->bio = bio;
	xop->lbase = lbase;
	hammer2_xop_start(&xop->head, hammer2_strategy_xop_read);

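	/*
	 * The frontend returns immediately; the bio is completed
	 * (biodone) asynchronously by hammer2_strategy_xop_read().
	 */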
	return(0);
}

/*
 * Per-node XOP (threaded).  Do a synchronous lookup of the chain and
 * its data.  The frontend is asynchronous, so we are also responsible
 * for racing to terminate it.
 */
static
void
hammer2_strategy_xop_read(hammer2_xop_t *arg, int clindex)
{
	hammer2_xop_strategy_t *xop = &arg->xop_strategy;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_key_t key_dummy;
	hammer2_key_t lbase;
	struct bio *bio;
	struct buf *bp;
	int cache_index = -1;
	int error;

	lbase = xop->lbase;
	bio = xop->bio;
	bp = bio->bio_buf;

	parent = hammer2_inode_chain(xop->head.ip, clindex,
				     HAMMER2_RESOLVE_ALWAYS |
				     HAMMER2_RESOLVE_SHARED);
	if (parent) {
		chain = hammer2_chain_lookup(&parent, &key_dummy,
					     lbase, lbase,
					     &cache_index,
					     HAMMER2_LOOKUP_ALWAYS |
					     HAMMER2_LOOKUP_SHARED);
		error = chain ? chain->error : 0;
	} else {
		error = EIO;
		chain = NULL;
	}
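	/*
	 * Feed the result (or NULL on error) to the frontend collector;
	 * each cluster node reports its outcome this way.
	 */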
	error = hammer2_xop_feed(&xop->head, chain, clindex, error);
	if (chain)
		hammer2_chain_drop(chain);
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
	}
	chain = NULL;	/* safety */
	parent = NULL;	/* safety */

	/*
	 * Race to finish the frontend
	 */
	if (xop->finished)
		return;
	hammer2_mtx_ex(&xop->head.xgrp->mtx2);
	if (xop->finished) {
		hammer2_mtx_unlock(&xop->head.xgrp->mtx2);
		return;
	}

	/*
	 * Async operation has not completed and we now own the lock.
	 * Determine if we can complete the operation by issuing the
	 * frontend collection non-blocking.
	 */
	error = hammer2_xop_collect(&xop->head, HAMMER2_XOP_COLLECT_NOWAIT);

	switch(error) {
	case 0:
		xop->finished = 1;
		hammer2_mtx_unlock(&xop->head.xgrp->mtx2);
		chain = xop->head.cluster.focus;
		hammer2_strategy_read_completion(chain, (char *)chain->data,
						 xop->bio);
		hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
		biodone(bio);
		break;
	case ENOENT:
		xop->finished = 1;
		hammer2_mtx_unlock(&xop->head.xgrp->mtx2);
		bp->b_resid = 0;
		bp->b_error = 0;
		bzero(bp->b_data, bp->b_bcount);
		biodone(bio);
		hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
		break;
	case EINPROGRESS:
		hammer2_mtx_unlock(&xop->head.xgrp->mtx2);
		break;
	default:
		xop->finished = 1;
		hammer2_mtx_unlock(&xop->head.xgrp->mtx2);
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
		biodone(bio);
		hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
		break;
	}
}

static
void
hammer2_strategy_read_completion(hammer2_chain_t *chain, char *data,
				 struct bio *bio)
{
	struct buf *bp = bio->bio_buf;

	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
		/*
		 * Data is embedded in the inode (copy from inode).
		 */
		bcopy(((hammer2_inode_data_t *)data)->u.data,
		      bp->b_data, HAMMER2_EMBEDDED_BYTES);
		bzero(bp->b_data + HAMMER2_EMBEDDED_BYTES,
		      bp->b_bcount - HAMMER2_EMBEDDED_BYTES);
		bp->b_resid = 0;
		bp->b_error = 0;
	} else if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
		/*
		 * Data is on-media, issue device I/O and copy.
		 *
		 * XXX direct-IO shortcut could go here XXX.
		 */
		switch (HAMMER2_DEC_COMP(chain->bref.methods)) {
		case HAMMER2_COMP_LZ4:
			hammer2_decompress_LZ4_callback(data, chain->bytes,
							bio);
			break;
		case HAMMER2_COMP_ZLIB:
			hammer2_decompress_ZLIB_callback(data, chain->bytes,
							 bio);
			break;
		case HAMMER2_COMP_NONE:
			KKASSERT(chain->bytes <= bp->b_bcount);
			bcopy(data, bp->b_data, chain->bytes);
			if (chain->bytes < bp->b_bcount) {
				bzero(bp->b_data + chain->bytes,
				      bp->b_bcount - chain->bytes);
			}
			bp->b_flags |= B_NOTMETA;
			bp->b_resid = 0;
			bp->b_error = 0;
			break;
		default:
			panic("hammer2_strategy_read: "
			      "unknown compression type");
		}
	} else {
		panic("hammer2_strategy_read: unknown bref type");
	}
}

/****************************************************************************
 *				WRITE SUPPORT				    *
 ****************************************************************************/

/*
 * Functions for compression in threads, moved here from hammer2_vnops.c.
 */
static void hammer2_write_file_core(struct buf *bp, hammer2_inode_t *ip,
				hammer2_chain_t **parentp,
				hammer2_key_t lbase, int ioflag, int pblksize,
				hammer2_tid_t mtid, int *errorp);
static void hammer2_compress_and_write(struct buf *bp, hammer2_inode_t *ip,
				hammer2_chain_t **parentp,
				hammer2_key_t lbase, int ioflag, int pblksize,
				hammer2_tid_t mtid, int *errorp,
				int comp_algo, int check_algo);
static void hammer2_zero_check_and_write(struct buf *bp, hammer2_inode_t *ip,
				hammer2_chain_t **parentp,
				hammer2_key_t lbase, int ioflag, int pblksize,
				hammer2_tid_t mtid, int *errorp,
				int check_algo);
static int test_block_zeros(const char *buf, size_t bytes);
static void zero_write(struct buf *bp, hammer2_inode_t *ip,
				hammer2_chain_t **parentp,
				hammer2_key_t lbase,
				hammer2_tid_t mtid, int *errorp);
static void hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp,
				int ioflag, int pblksize,
				hammer2_tid_t mtid, int *errorp,
				int check_algo);

static
int
hammer2_strategy_write(struct vop_strategy_args *ap)
{
	hammer2_xop_strategy_t *xop;
	hammer2_pfs_t *pmp;
	struct bio *bio;
	struct buf *bp;
	hammer2_inode_t *ip;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = VTOI(ap->a_vp);
	pmp = ip->pmp;

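	/*
	 * Account for a logical write in progress.  The matching
	 * hammer2_lwinprog_wait() below throttles the frontend once
	 * hammer2_flush_pipe buffers are queued to the backend.
	 */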
	hammer2_lwinprog_ref(pmp);
	hammer2_trans_assert_strategy(pmp);

	xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
	xop->finished = 0;
	xop->bio = bio;
	xop->lbase = bio->bio_offset;
	hammer2_xop_start(&xop->head, hammer2_strategy_xop_write);
	/* asynchronous completion */

	hammer2_lwinprog_wait(pmp, hammer2_flush_pipe);

	return(0);
}

/*
 * Per-node XOP (threaded).  Write the logical buffer to the media.
 */
static
void
hammer2_strategy_xop_write(hammer2_xop_t *arg, int clindex)
{
	hammer2_xop_strategy_t *xop = &arg->xop_strategy;
	hammer2_chain_t *parent;
	hammer2_key_t lbase;
	hammer2_inode_t *ip;
	struct bio *bio;
	struct buf *bp;
	int error;
	int lblksize;
	int pblksize;

	lbase = xop->lbase;
	bio = xop->bio;
	bp = bio->bio_buf;
	ip = xop->head.ip;

	/* hammer2_trans_init(parent->hmp->spmp, HAMMER2_TRANS_BUFCACHE); */

	lblksize = hammer2_calc_logical(ip, bio->bio_offset, &lbase, NULL);
	pblksize = hammer2_calc_physical(ip, lbase);
	parent = hammer2_inode_chain(ip, clindex, HAMMER2_RESOLVE_ALWAYS);
	hammer2_write_file_core(bp, ip, &parent,
				lbase, IO_ASYNC, pblksize,
				xop->head.mtid, &error);
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		parent = NULL;	/* safety */
	}
	error = hammer2_xop_feed(&xop->head, NULL, clindex, error);

	/*
	 * Race to finish the frontend
	 */
	if (xop->finished)
		return;
	hammer2_mtx_ex(&xop->head.xgrp->mtx2);
	if (xop->finished) {
		hammer2_mtx_unlock(&xop->head.xgrp->mtx2);
		return;
	}

	/*
	 * Async operation has not completed and we now own the lock.
	 * Determine if we can complete the operation by issuing the
	 * frontend collection non-blocking.
	 */
	error = hammer2_xop_collect(&xop->head, HAMMER2_XOP_COLLECT_NOWAIT);

	switch(error) {
	case ENOENT:
	case 0:
		xop->finished = 1;
		hammer2_mtx_unlock(&xop->head.xgrp->mtx2);
		bp->b_resid = 0;
		bp->b_error = 0;
		biodone(bio);
		hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
		hammer2_lwinprog_drop(ip->pmp);
		break;
	case EINPROGRESS:
		hammer2_mtx_unlock(&xop->head.xgrp->mtx2);
		break;
	default:
		xop->finished = 1;
		hammer2_mtx_unlock(&xop->head.xgrp->mtx2);
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
		biodone(bio);
		hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
		hammer2_lwinprog_drop(ip->pmp);
		break;
	}
}

/*
 * Wait for pending I/O to complete
 */
void
hammer2_bioq_sync(hammer2_pfs_t *pmp)
{
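	/*
	 * A threshold of 0 means: return only when every logical write
	 * accounted via hammer2_lwinprog_ref() has been retired.
	 */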
	hammer2_lwinprog_wait(pmp, 0);
}

/*
 * Create a new chain at (*parentp, lbase) and assign physical storage,
 * returning a chain suitable for I/O.  The chain will be in a modified
 * state.
 *
 * *parentp can wind up being anything.
 *
 * NOTE: Special case for data embedded in inode.
 */
static
hammer2_chain_t *
hammer2_assign_physical(hammer2_inode_t *ip, hammer2_chain_t **parentp,
			hammer2_key_t lbase, int pblksize,
			hammer2_tid_t mtid, int *errorp)
{
	hammer2_chain_t *chain;
	hammer2_key_t key_dummy;
	int pradix = hammer2_getradix(pblksize);
	int cache_index = -1;

	/*
	 * Locate the chain associated with lbase, return a locked chain.
	 * However, do not instantiate any data reference (which utilizes a
	 * device buffer) because we will be using direct IO via the
	 * logical buffer cache buffer.
	 */
	*errorp = 0;
	KKASSERT(pblksize >= HAMMER2_ALLOC_MIN);
retry:
	chain = hammer2_chain_lookup(parentp, &key_dummy,
				     lbase, lbase,
				     &cache_index,
				     HAMMER2_LOOKUP_NODATA);
	if (chain == NULL) {
		/*
		 * We found a hole, create a new chain entry.
		 *
		 * NOTE: DATA chains are created without device backing
		 *	 store (nor do we want any).
		 */
		*errorp = hammer2_chain_create(parentp, &chain, ip->pmp,
					       lbase, HAMMER2_PBUFRADIX,
					       HAMMER2_BREF_TYPE_DATA,
					       pblksize,
					       mtid, 0);
		if (chain == NULL) {
			panic("hammer2_chain_create: par=%p error=%d\n",
			      *parentp, *errorp);
			goto retry;
		}
		/*ip->delta_dcount += pblksize;*/
	} else {
		switch (chain->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * The data is embedded in the inode, which requires
			 * a bit more finesse.
			 */
			hammer2_chain_modify_ip(ip, chain, mtid, 0);
			break;
		case HAMMER2_BREF_TYPE_DATA:
			if (chain->bytes != pblksize) {
				hammer2_chain_resize(ip, *parentp, chain,
						     mtid, pradix,
						     HAMMER2_MODIFY_OPTDATA);
			}

			/*
			 * DATA buffers must be marked modified whether the
			 * data is in a logical buffer or not.  We also have
			 * to make this call to fixup the chain data pointers
			 * after resizing in case this is an encrypted or
			 * compressed buffer.
			 */
			hammer2_chain_modify(chain, mtid,
					     HAMMER2_MODIFY_OPTDATA);
			break;
		default:
			panic("hammer2_assign_physical: bad type");
			/* NOT REACHED */
			break;
		}
	}
	return (chain);
}

/*
 * hammer2_write_file_core() - hammer2_strategy_xop_write() helper
 *
 * The core write function which determines which path to take
 * depending on compression settings.  We also have to locate the
 * related chains so we can calculate and set the check data for
 * the blockref.
 */
static
void
hammer2_write_file_core(struct buf *bp, hammer2_inode_t *ip,
			hammer2_chain_t **parentp,
			hammer2_key_t lbase, int ioflag, int pblksize,
			hammer2_tid_t mtid, int *errorp)
{
	hammer2_chain_t *chain;

	switch(HAMMER2_DEC_ALGO(ip->meta.comp_algo)) {
	case HAMMER2_COMP_NONE:
		/*
		 * We have to assign physical storage to the buffer
		 * we intend to dirty or write now to avoid deadlocks
		 * in the strategy code later.
		 *
		 * For inode-embedded data this returns the inode chain
		 * itself (DIRECTDATA), which is handled below.
		 */
		chain = hammer2_assign_physical(ip, parentp, lbase, pblksize,
						mtid, errorp);
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
			hammer2_inode_data_t *wipdata;

			wipdata = &chain->data->ipdata;
			KKASSERT(wipdata->meta.op_flags &
				 HAMMER2_OPFLAG_DIRECTDATA);
			KKASSERT(bp->b_loffset == 0);
			bcopy(bp->b_data, wipdata->u.data,
			      HAMMER2_EMBEDDED_BYTES);
		} else {
			hammer2_write_bp(chain, bp, ioflag, pblksize,
					 mtid, errorp, ip->meta.check_algo);
		}
		if (chain) {
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
		}
		break;
	case HAMMER2_COMP_AUTOZERO:
		/*
		 * Check for zero-fill only
		 */
		hammer2_zero_check_and_write(bp, ip, parentp,
					     lbase, ioflag, pblksize,
					     mtid, errorp,
					     ip->meta.check_algo);
		break;
	case HAMMER2_COMP_LZ4:
	case HAMMER2_COMP_ZLIB:
	default:
		/*
		 * Check for zero-fill and attempt compression.
		 */
		hammer2_compress_and_write(bp, ip, parentp,
					   lbase, ioflag, pblksize,
					   mtid, errorp,
					   ip->meta.comp_algo,
					   ip->meta.check_algo);
		break;
	}
}

/*
 * Helper
 *
 * Generic function that performs compression in the compressed write
 * path.  The compression algorithm is determined by settings obtained
 * from the inode.
 */
static
void
hammer2_compress_and_write(struct buf *bp, hammer2_inode_t *ip,
	hammer2_chain_t **parentp,
	hammer2_key_t lbase, int ioflag, int pblksize,
	hammer2_tid_t mtid, int *errorp, int comp_algo, int check_algo)
{
	hammer2_chain_t *chain;
	int comp_size;
	int comp_block_size;
	char *comp_buffer;

	if (test_block_zeros(bp->b_data, pblksize)) {
		zero_write(bp, ip, parentp, lbase, mtid, errorp);
		return;
	}

	comp_size = 0;
	comp_buffer = NULL;

	KKASSERT(pblksize / 2 <= 32768);

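	/*
	 * comp_heuristic == 0 means compression recently succeeded.  After
	 * eight consecutive failures (incremented below) compression is
	 * only re-attempted on every 8th write (low three bits zero).
	 */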
	if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0) {
		z_stream strm_compress;
		int comp_level;
		int ret;

		switch(HAMMER2_DEC_ALGO(comp_algo)) {
		case HAMMER2_COMP_LZ4:
			comp_buffer = objcache_get(cache_buffer_write,
						   M_INTWAIT);
			comp_size = LZ4_compress_limitedOutput(
					bp->b_data,
					&comp_buffer[sizeof(int)],
					pblksize,
					pblksize / 2 - sizeof(int));
			/*
			 * We need to prefix the compressed data with its
			 * size; LZ4 doesn't do that for us.  Add the
			 * related overhead.
			 */
			*(int *)comp_buffer = comp_size;
			if (comp_size)
				comp_size += sizeof(int);
		case HAMMER2_COMP_ZLIB:
			comp_level = HAMMER2_DEC_LEVEL(comp_algo);
			if (comp_level == 0)
				comp_level = 6;	/* default zlib compression */
			else if (comp_level < 6)
				comp_level = 6;
			else if (comp_level > 9)
				comp_level = 9;
			ret = deflateInit(&strm_compress, comp_level);
			if (ret != Z_OK) {
				kprintf("HAMMER2 ZLIB: fatal error "
					"on deflateInit.\n");
			}

			comp_buffer = objcache_get(cache_buffer_write,
						   M_INTWAIT);
			strm_compress.next_in = bp->b_data;
			strm_compress.avail_in = pblksize;
			strm_compress.next_out = comp_buffer;
			strm_compress.avail_out = pblksize / 2;
			ret = deflate(&strm_compress, Z_FINISH);
			if (ret == Z_STREAM_END) {
				comp_size = pblksize / 2 -
					    strm_compress.avail_out;
			} else {
				comp_size = 0;
			}
			ret = deflateEnd(&strm_compress);
			break;
		default:
			kprintf("Error: Unknown compression method.\n");
			kprintf("Comp_method = %d.\n", comp_algo);
			break;
		}
	}

	if (comp_size == 0) {
		/*
		 * compression failed or turned off
		 */
		comp_block_size = pblksize;	/* safety */
		if (++ip->comp_heuristic > 128)
			ip->comp_heuristic = 8;
	} else {
		/*
		 * compression succeeded
		 */
		ip->comp_heuristic = 0;
		if (comp_size <= 1024) {
			comp_block_size = 1024;
		} else if (comp_size <= 2048) {
			comp_block_size = 2048;
		} else if (comp_size <= 4096) {
			comp_block_size = 4096;
		} else if (comp_size <= 8192) {
			comp_block_size = 8192;
		} else if (comp_size <= 16384) {
			comp_block_size = 16384;
		} else if (comp_size <= 32768) {
			comp_block_size = 32768;
		} else {
			panic("hammer2: WRITE PATH: "
			      "Weird comp_size value.");
			/* NOT REACHED */
			comp_block_size = pblksize;
		}
	}
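
	/*
	 * Sketch (not compiled): the ladder above is equivalent to rounding
	 * comp_size up to the next power of two, clamped to a 1KB minimum;
	 * the KKASSERT earlier guarantees comp_size <= 32768 here.
	 */
#if 0
	comp_block_size = 1024;
	while (comp_block_size < comp_size)
		comp_block_size <<= 1;
#endif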

	chain = hammer2_assign_physical(ip, parentp, lbase, comp_block_size,
					mtid, errorp);
	if (*errorp) {
		kprintf("WRITE PATH: An error occurred while "
			"assigning physical space.\n");
		KKASSERT(chain == NULL);
		goto done;
	}

	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
		hammer2_inode_data_t *wipdata;

		hammer2_chain_modify_ip(ip, chain, mtid, 0);
		wipdata = &chain->data->ipdata;
		KKASSERT(wipdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA);
		KKASSERT(bp->b_loffset == 0);
		bcopy(bp->b_data, wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
	} else {
		hammer2_io_t *dio;
		char *bdata;

		KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);

		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			panic("hammer2_compress_and_write: "
			      "unexpected inode\n");
			break;
		case HAMMER2_BREF_TYPE_DATA:
			/*
			 * Optimize out the read-before-write
			 * if possible.
			 */
			*errorp = hammer2_io_newnz(chain->hmp,
						   chain->bref.data_off,
						   chain->bytes,
						   &dio);
			if (*errorp) {
				hammer2_io_brelse(&dio);
				kprintf("hammer2: WRITE PATH: "
					"dbp bread error\n");
				break;
			}
			bdata = hammer2_io_data(dio, chain->bref.data_off);

			/*
			 * When loading the block make sure we don't
			 * leave garbage after the compressed data.
			 */
			if (comp_size) {
				chain->bref.methods =
					HAMMER2_ENC_COMP(comp_algo) +
					HAMMER2_ENC_CHECK(check_algo);
				bcopy(comp_buffer, bdata, comp_size);
				if (comp_size != comp_block_size) {
					bzero(bdata + comp_size,
					      comp_block_size - comp_size);
				}
			} else {
				chain->bref.methods =
					HAMMER2_ENC_COMP(
						HAMMER2_COMP_NONE) +
					HAMMER2_ENC_CHECK(check_algo);
				bcopy(bp->b_data, bdata, pblksize);
			}

			/*
			 * The flush code doesn't calculate check codes for
			 * file data (doing so can result in excessive I/O),
			 * so we do it here.
			 */
			hammer2_chain_setcheck(chain, bdata);

			/*
			 * Device buffer is now valid, chain is no longer in
			 * the initial state.
			 *
			 * (No blockref table worries with file data)
			 */
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);

			/* Now write out the related device buffer. */
			if (ioflag & IO_SYNC) {
				/*
				 * Synchronous I/O requested.
				 */
				hammer2_io_bwrite(&dio);
			/*
			} else if ((ioflag & IO_DIRECT) &&
				   loff + n == pblksize) {
				hammer2_io_bdwrite(&dio);
			*/
			} else if (ioflag & IO_ASYNC) {
				hammer2_io_bawrite(&dio);
			} else {
				hammer2_io_bdwrite(&dio);
			}
			break;
		default:
			panic("hammer2_compress_and_write: "
			      "bad chain type %d\n",
			      chain->bref.type);
			/* NOT REACHED */
			break;
		}
	}
done:
	if (chain) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
	}
	if (comp_buffer)
		objcache_put(cache_buffer_write, comp_buffer);
}

/*
 * Helper
 *
 * Function that performs zero-checking and writing without compression;
 * it corresponds to the default zero-checking path.
 */
static
void
hammer2_zero_check_and_write(struct buf *bp, hammer2_inode_t *ip,
	hammer2_chain_t **parentp,
	hammer2_key_t lbase, int ioflag, int pblksize,
	hammer2_tid_t mtid, int *errorp,
	int check_algo)
{
	hammer2_chain_t *chain;

	if (test_block_zeros(bp->b_data, pblksize)) {
		zero_write(bp, ip, parentp, lbase, mtid, errorp);
	} else {
		chain = hammer2_assign_physical(ip, parentp, lbase, pblksize,
						mtid, errorp);
		hammer2_write_bp(chain, bp, ioflag, pblksize,
				 mtid, errorp, check_algo);
		if (chain) {
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
		}
	}
}

/*
 * Helper
 *
 * Test whether a block of data contains only zeros; returns TRUE
 * (non-zero) if so.  Assumes buf is long-aligned and bytes is a
 * multiple of sizeof(long), which holds for the block sizes used here.
 */
static
int
test_block_zeros(const char *buf, size_t bytes)
{
	size_t i;

	for (i = 0; i < bytes; i += sizeof(long)) {
		if (*(const long *)(buf + i) != 0)
			return (0);
	}
	return (1);
}

/*
 * Helper
 *
 * "Write" a block that contains only zeros: embedded inode data is
 * zeroed in place, while a normal data block is simply deleted,
 * leaving a hole (sparse-file semantics).
 */
static
void
zero_write(struct buf *bp, hammer2_inode_t *ip,
	   hammer2_chain_t **parentp,
	   hammer2_key_t lbase, hammer2_tid_t mtid, int *errorp __unused)
{
	hammer2_chain_t *chain;
	hammer2_key_t key_dummy;
	int cache_index = -1;

	chain = hammer2_chain_lookup(parentp, &key_dummy,
				     lbase, lbase,
				     &cache_index,
				     HAMMER2_LOOKUP_NODATA);
	if (chain) {
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
			hammer2_inode_data_t *wipdata;

			hammer2_chain_modify_ip(ip, chain, mtid, 0);
			wipdata = &chain->data->ipdata;
			KKASSERT(wipdata->meta.op_flags &
				 HAMMER2_OPFLAG_DIRECTDATA);
			KKASSERT(bp->b_loffset == 0);
			bzero(wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
		} else {
			hammer2_chain_delete(*parentp, chain,
					     mtid, HAMMER2_DELETE_PERMANENT);
		}
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
	}
}

/*
 * Helper
 *
 * Write the data as-is, without performing any sort of compression.
 * This function is used in the no-compression path and in the default
 * zero-checking path.
 */
static
void
hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp, int ioflag,
		 int pblksize,
		 hammer2_tid_t mtid, int *errorp, int check_algo)
{
	hammer2_inode_data_t *wipdata;
	hammer2_io_t *dio;
	char *bdata;
	int error;

	error = 0;	/* XXX TODO below */

	KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);

	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		wipdata = &chain->data->ipdata;
		KKASSERT(wipdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA);
		KKASSERT(bp->b_loffset == 0);
		bcopy(bp->b_data, wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
		error = 0;
		break;
	case HAMMER2_BREF_TYPE_DATA:
		error = hammer2_io_newnz(chain->hmp,
					 chain->bref.data_off,
					 chain->bytes, &dio);
		if (error) {
			hammer2_io_bqrelse(&dio);
			kprintf("hammer2: WRITE PATH: "
				"dbp bread error\n");
			break;
		}
		bdata = hammer2_io_data(dio, chain->bref.data_off);

		chain->bref.methods = HAMMER2_ENC_COMP(
						HAMMER2_COMP_NONE) +
				      HAMMER2_ENC_CHECK(check_algo);
		bcopy(bp->b_data, bdata, chain->bytes);

		/*
		 * The flush code doesn't calculate check codes for
		 * file data (doing so can result in excessive I/O),
		 * so we do it here.
		 */
		hammer2_chain_setcheck(chain, bdata);

		/*
		 * Device buffer is now valid, chain is no longer in
		 * the initial state.
		 *
		 * (No blockref table worries with file data)
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);

		if (ioflag & IO_SYNC) {
			/*
			 * Synchronous I/O requested.
			 */
			hammer2_io_bwrite(&dio);
		/*
		} else if ((ioflag & IO_DIRECT) &&
			   loff + n == pblksize) {
			hammer2_io_bdwrite(&dio);
		*/
		} else if (ioflag & IO_ASYNC) {
			hammer2_io_bawrite(&dio);
		} else {
			hammer2_io_bdwrite(&dio);
		}
		break;
	default:
		panic("hammer2_write_bp: bad chain type %d\n",
		      chain->bref.type);
		/* NOT REACHED */
		error = 0;
		break;
	}
	KKASSERT(error == 0);	/* XXX TODO */
	*errorp = error;
}