/*
 * Copyright (c) 2011-2015 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * This module handles low level logical file I/O (strategy) which backs
 * the logical buffer cache.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/mountctl.h>
#include <sys/dirent.h>
#include <sys/uio.h>
#include <sys/objcache.h>
#include <sys/event.h>
#include <sys/file.h>
#include <vfs/fifofs/fifo.h>

#include "hammer2.h"
#include "hammer2_lz4.h"

#include "zlib/hammer2_zlib.h"

struct objcache *cache_buffer_read;
struct objcache *cache_buffer_write;

/*
 * Strategy code (async logical file buffer I/O from system)
 *
 * WARNING: The strategy code cannot safely use hammer2 transactions
 *	    as this can deadlock against vfs_sync's vfsync() call
 *	    if multiple flushes are queued.  All H2 structures must
 *	    already be present and ready for the DIO.
 *
 *	    Reads can be initiated asynchronously; writes have to be
 *	    spooled to a separate thread for action to avoid deadlocks.
 */
static void hammer2_strategy_xop_read(hammer2_xop_t *arg, int clindex);
static void hammer2_strategy_xop_write(hammer2_xop_t *arg, int clindex);
static int hammer2_strategy_read(struct vop_strategy_args *ap);
static int hammer2_strategy_write(struct vop_strategy_args *ap);
static void hammer2_strategy_read_completion(hammer2_chain_t *chain,
				char *data, struct bio *bio);

int
hammer2_vop_strategy(struct vop_strategy_args *ap)
{
	struct bio *biop;
	struct buf *bp;
	int error;

	biop = ap->a_bio;
	bp = biop->bio_buf;

	switch(bp->b_cmd) {
	case BUF_CMD_READ:
		error = hammer2_strategy_read(ap);
		++hammer2_iod_file_read;
		break;
	case BUF_CMD_WRITE:
		error = hammer2_strategy_write(ap);
		++hammer2_iod_file_write;
		break;
	default:
		bp->b_error = error = EINVAL;
		bp->b_flags |= B_ERROR;
		biodone(biop);
		break;
	}
	return (error);
}

/*
 * Return the largest contiguous physical disk range for the logical
 * request, in bytes.
 *
 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
 *
 * Basically disabled; the logical buffer write thread has to deal with
 * buffers one at a time.
 */
int
hammer2_vop_bmap(struct vop_bmap_args *ap)
{
	*ap->a_doffsetp = NOOFFSET;
	if (ap->a_runp)
		*ap->a_runp = 0;
	if (ap->a_runb)
		*ap->a_runb = 0;
	return (EOPNOTSUPP);
}

/****************************************************************************
 *				READ SUPPORT				    *
 ****************************************************************************/
/*
 * Callback used in the read path when a block is compressed with LZ4.
 */
static
void
hammer2_decompress_LZ4_callback(const char *data, u_int bytes, struct bio *bio)
{
	struct buf *bp;
	char *compressed_buffer;
	int compressed_size;
	int result;

	bp = bio->bio_buf;

#if 0
	if bio->bio_caller_info2.index &&
	      bio->bio_caller_info1.uvalue32 !=
	      crc32(bp->b_data, bp->b_bufsize) --- return error
#endif

	KKASSERT(bp->b_bufsize <= HAMMER2_PBUFSIZE);
	compressed_size = *(const int *)data;
	KKASSERT(compressed_size <= bytes - sizeof(int));

	compressed_buffer = objcache_get(cache_buffer_read, M_INTWAIT);
	result = LZ4_decompress_safe(__DECONST(char *, &data[sizeof(int)]),
				     compressed_buffer,
				     compressed_size,
				     bp->b_bufsize);
	if (result < 0) {
		kprintf("READ PATH: Error during decompression, "
			"bio %016jx/%d\n",
			(intmax_t)bio->bio_offset, bytes);
		/* make sure it isn't random garbage */
		bzero(compressed_buffer, bp->b_bufsize);
	}
	KKASSERT(result <= bp->b_bufsize);
	bcopy(compressed_buffer, bp->b_data, bp->b_bufsize);
	if (result < bp->b_bufsize)
		bzero(bp->b_data + result, bp->b_bufsize - result);
	objcache_put(cache_buffer_read, compressed_buffer);
	bp->b_resid = 0;
	bp->b_flags |= B_AGE;
}

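/*
 * Illustrative sketch (disabled): the on-media framing the LZ4 callback
 * above expects.  hammer2_compress_and_write() stores a native-endian
 * int holding the compressed payload length, followed by the LZ4 stream
 * itself; the callback reads the prefix back before decompressing.  The
 * struct and the encode helper here are hypothetical, for documentation
 * only -- the real code works on raw char buffers as shown above.
 */
#if 0
struct h2_lz4_frame {
	int	payload_bytes;	/* compressed size, excludes this prefix */
	char	payload[];	/* LZ4 stream, inflates to bp->b_bufsize */
};

static int
h2_lz4_frame_encode(const char *src, int srcbytes, char *dst, int dstbytes)
{
	/* reserve room for the size prefix, then compress behind it */
	int clen = LZ4_compress_limitedOutput(__DECONST(char *, src),
					      dst + sizeof(int), srcbytes,
					      dstbytes - sizeof(int));

	*(int *)dst = clen;		/* 0 means compression failed */
	return (clen ? clen + (int)sizeof(int) : 0);
}
#endif
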
/*
 * Callback used in the read path when a block is compressed with ZLIB.
 * It is almost identical to the LZ4 callback, so in theory the two could
 * be unified, but we did not want to change the bio structure for that.
 */
static
void
hammer2_decompress_ZLIB_callback(const char *data, u_int bytes, struct bio *bio)
{
	struct buf *bp;
	char *compressed_buffer;
	z_stream strm_decompress;
	int result;
	int ret;

	bp = bio->bio_buf;

	KKASSERT(bp->b_bufsize <= HAMMER2_PBUFSIZE);
	strm_decompress.avail_in = 0;
	strm_decompress.next_in = Z_NULL;

	ret = inflateInit(&strm_decompress);

	if (ret != Z_OK)
		kprintf("HAMMER2 ZLIB: Fatal error in inflateInit.\n");

	compressed_buffer = objcache_get(cache_buffer_read, M_INTWAIT);
	strm_decompress.next_in = __DECONST(char *, data);

	/* XXX supply proper size, subset of device bp */
	strm_decompress.avail_in = bytes;
	strm_decompress.next_out = compressed_buffer;
	strm_decompress.avail_out = bp->b_bufsize;

	ret = inflate(&strm_decompress, Z_FINISH);
	if (ret != Z_STREAM_END) {
		kprintf("HAMMER2 ZLIB: Fatal error during decompression.\n");
		bzero(compressed_buffer, bp->b_bufsize);
	}
	bcopy(compressed_buffer, bp->b_data, bp->b_bufsize);
	result = bp->b_bufsize - strm_decompress.avail_out;
	if (result < bp->b_bufsize)
		bzero(bp->b_data + result, strm_decompress.avail_out);
	objcache_put(cache_buffer_read, compressed_buffer);
	ret = inflateEnd(&strm_decompress);

	bp->b_resid = 0;
	bp->b_flags |= B_AGE;
}

/*
 * Logical buffer I/O, async read.
 */
static
int
hammer2_strategy_read(struct vop_strategy_args *ap)
{
	hammer2_xop_strategy_t *xop;
	struct buf *bp;
	struct bio *bio;
	struct bio *nbio;
	hammer2_inode_t *ip;
	hammer2_key_t lbase;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = VTOI(ap->a_vp);
	nbio = push_bio(bio);

	lbase = bio->bio_offset;
	KKASSERT(((int)lbase & HAMMER2_PBUFMASK) == 0);

	xop = &hammer2_xop_alloc(ip)->xop_strategy;
	xop->finished = 0;
	xop->bio = bio;
	xop->lbase = lbase;
	hammer2_xop_start(&xop->head, hammer2_strategy_xop_read);

	return(0);
}

/*
 * Per-node XOP (threaded), do a synchronous lookup of the chain and
 * its data.  The frontend is asynchronous, so we are also responsible
 * for racing to terminate the frontend.
 */
static
void
hammer2_strategy_xop_read(hammer2_xop_t *arg, int clindex)
{
	hammer2_xop_strategy_t *xop = &arg->xop_strategy;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_key_t key_dummy;
	hammer2_key_t lbase;
	struct bio *bio;
	struct buf *bp;
	int cache_index = -1;
	int error;

	lbase = xop->lbase;
	bio = xop->bio;
	bp = bio->bio_buf;

	parent = hammer2_inode_chain(xop->head.ip, clindex,
				     HAMMER2_RESOLVE_ALWAYS |
				     HAMMER2_RESOLVE_SHARED);
	if (parent) {
		chain = hammer2_chain_lookup(&parent, &key_dummy,
					     lbase, lbase,
					     &cache_index,
					     HAMMER2_LOOKUP_ALWAYS |
					     HAMMER2_LOOKUP_SHARED);
		error = chain ? chain->error : 0;
	} else {
		error = EIO;
		chain = NULL;
	}
	error = hammer2_xop_feed(&xop->head, chain, clindex, error);
	if (chain)
		hammer2_chain_drop(chain);
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
	}
	chain = NULL;	/* safety */
	parent = NULL;	/* safety */

	/*
	 * Race to finish the frontend
	 */
	if (xop->finished)
		return;
	hammer2_mtx_ex(&xop->head.xgrp->mtx2);
	if (xop->finished) {
		hammer2_mtx_unlock(&xop->head.xgrp->mtx2);
		return;
	}

	/*
	 * Async operation has not completed and we now own the lock.
	 * Determine if we can complete the operation by issuing the
	 * frontend collection non-blocking.
	 */
	error = hammer2_xop_collect(&xop->head, HAMMER2_XOP_COLLECT_NOWAIT);

	switch(error) {
	case 0:
		xop->finished = 1;
		hammer2_mtx_unlock(&xop->head.xgrp->mtx2);
		chain = xop->head.cluster.focus;
		hammer2_strategy_read_completion(chain, (char *)chain->data,
						 xop->bio);
		hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
		biodone(bio);
		break;
	case ENOENT:
		xop->finished = 1;
		hammer2_mtx_unlock(&xop->head.xgrp->mtx2);
		bp->b_resid = 0;
		bp->b_error = 0;
		bzero(bp->b_data, bp->b_bcount);
		biodone(bio);
		hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
		break;
	case EINPROGRESS:
		hammer2_mtx_unlock(&xop->head.xgrp->mtx2);
		break;
	default:
		xop->finished = 1;
		hammer2_mtx_unlock(&xop->head.xgrp->mtx2);
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
		biodone(bio);
		hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
		break;
	}
}

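/*
 * The "race to finish the frontend" above is a double-checked flag under
 * xgrp->mtx2: every per-node backend thread runs the same sequence,
 * exactly one wins the non-blocking collect and completes the bio, and
 * the rest either bail out early or see EINPROGRESS and leave completion
 * to a later feed.  A distilled paraphrase of the pattern (disabled, for
 * clarity only -- the live code is in the function above):
 */
#if 0
	if (xop->finished)		/* cheap unlocked pre-check */
		return;
	hammer2_mtx_ex(&xop->head.xgrp->mtx2);
	if (xop->finished) {		/* recheck now that we hold mtx2 */
		hammer2_mtx_unlock(&xop->head.xgrp->mtx2);
		return;
	}
	error = hammer2_xop_collect(&xop->head, HAMMER2_XOP_COLLECT_NOWAIT);
	/*
	 * 0 or ENOENT: we won, mark finished and biodone() the frontend.
	 * EINPROGRESS: not enough feeds yet, another thread will finish.
	 * other:       we won, fail the bio with B_ERROR/EIO.
	 */
#endif
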
static
void
hammer2_strategy_read_completion(hammer2_chain_t *chain, char *data,
				 struct bio *bio)
{
	struct buf *bp = bio->bio_buf;

	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
		/*
		 * Data is embedded in the inode (copy from inode).
		 */
		bcopy(((hammer2_inode_data_t *)data)->u.data,
		      bp->b_data, HAMMER2_EMBEDDED_BYTES);
		bzero(bp->b_data + HAMMER2_EMBEDDED_BYTES,
		      bp->b_bcount - HAMMER2_EMBEDDED_BYTES);
		bp->b_resid = 0;
		bp->b_error = 0;
	} else if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
		/*
		 * Data is on-media, issue device I/O and copy.
		 *
		 * XXX direct-IO shortcut could go here XXX.
		 */
		switch (HAMMER2_DEC_COMP(chain->bref.methods)) {
		case HAMMER2_COMP_LZ4:
			hammer2_decompress_LZ4_callback(data, chain->bytes,
							bio);
			break;
		case HAMMER2_COMP_ZLIB:
			hammer2_decompress_ZLIB_callback(data, chain->bytes,
							 bio);
			break;
		case HAMMER2_COMP_NONE:
			KKASSERT(chain->bytes <= bp->b_bcount);
			bcopy(data, bp->b_data, chain->bytes);
			if (chain->bytes < bp->b_bcount) {
				bzero(bp->b_data + chain->bytes,
				      bp->b_bcount - chain->bytes);
			}
			bp->b_flags |= B_NOTMETA;
			bp->b_resid = 0;
			bp->b_error = 0;
			break;
		default:
			panic("hammer2_strategy_read: "
			      "unknown compression type");
		}
	} else {
		panic("hammer2_strategy_read: unknown bref type");
	}
}

/****************************************************************************
 *				WRITE SUPPORT				    *
 ****************************************************************************/

/*
 * Functions for compression in threads,
 * moved here from hammer2_vnops.c.
 */
static void hammer2_write_file_core(struct buf *bp, hammer2_inode_t *ip,
				hammer2_chain_t **parentp,
				hammer2_key_t lbase, int ioflag, int pblksize,
				int *errorp);
static void hammer2_compress_and_write(struct buf *bp, hammer2_inode_t *ip,
				hammer2_chain_t **parentp,
				hammer2_key_t lbase, int ioflag,
				int pblksize, int *errorp,
				int comp_algo, int check_algo);
static void hammer2_zero_check_and_write(struct buf *bp, hammer2_inode_t *ip,
				hammer2_chain_t **parentp,
				hammer2_key_t lbase,
				int ioflag, int pblksize, int *errorp,
				int check_algo);
static int test_block_zeros(const char *buf, size_t bytes);
static void zero_write(struct buf *bp, hammer2_inode_t *ip,
				hammer2_chain_t **parentp,
				hammer2_key_t lbase,
				int *errorp);
static void hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp,
				int ioflag, int pblksize, int *errorp,
				int check_algo);

static
int
hammer2_strategy_write(struct vop_strategy_args *ap)
{
	hammer2_xop_strategy_t *xop;
	hammer2_pfs_t *pmp;
	struct bio *bio;
	struct buf *bp;
	hammer2_inode_t *ip;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = VTOI(ap->a_vp);
	pmp = ip->pmp;

	hammer2_lwinprog_ref(pmp);
	hammer2_trans_assert_strategy(pmp);

	xop = &hammer2_xop_alloc(ip)->xop_strategy;
	xop->finished = 0;
	xop->bio = bio;
	xop->lbase = bio->bio_offset;
	hammer2_xop_start(&xop->head, hammer2_strategy_xop_write);
	/* asynchronous completion */

	hammer2_lwinprog_wait(pmp, hammer2_flush_pipe);

	return(0);
}

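/*
 * Write throttling sketch (disabled): hammer2_strategy_write() pairs a
 * hammer2_lwinprog_ref() in the frontend with a hammer2_lwinprog_drop()
 * in the backend completion, then blocks in hammer2_lwinprog_wait()
 * while more than hammer2_flush_pipe logical writes are still in flight.
 * A minimal model of that accounting, assuming a simple counter plus
 * wakeup; the field name lwinprog_count is hypothetical (the real state
 * is private to the lwinprog helpers):
 */
#if 0
	atomic_add_int(&pmp->lwinprog_count, 1);	/* lwinprog_ref()  */
	/* ... backend thread writes the buffer and completes the bio ... */
	atomic_add_int(&pmp->lwinprog_count, -1);	/* lwinprog_drop() */
	wakeup(&pmp->lwinprog_count);
	/* lwinprog_wait(pmp, pipe): block while too many are in flight */
	while (pmp->lwinprog_count > hammer2_flush_pipe)
		tsleep(&pmp->lwinprog_count, 0, "h2wpipe", hz);
#endif
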
/*
 * Per-node XOP (threaded).  Write the logical buffer to the media.
 */
static
void
hammer2_strategy_xop_write(hammer2_xop_t *arg, int clindex)
{
	hammer2_xop_strategy_t *xop = &arg->xop_strategy;
	hammer2_chain_t *parent;
	hammer2_key_t lbase;
	hammer2_inode_t *ip;
	struct bio *bio;
	struct buf *bp;
	int error;
	int lblksize;
	int pblksize;

	lbase = xop->lbase;
	bio = xop->bio;
	bp = bio->bio_buf;
	ip = xop->head.ip;

	/* hammer2_trans_init(parent->hmp->spmp, HAMMER2_TRANS_BUFCACHE); */

	lblksize = hammer2_calc_logical(ip, bio->bio_offset, &lbase, NULL);
	pblksize = hammer2_calc_physical(ip, lbase);
	parent = hammer2_inode_chain(ip, clindex, HAMMER2_RESOLVE_ALWAYS);
	hammer2_write_file_core(bp, ip, &parent,
				lbase, IO_ASYNC,
				pblksize, &error);
	if (parent) {
		hammer2_chain_unlock(parent);
		hammer2_chain_drop(parent);
		parent = NULL;	/* safety */
	}
	error = hammer2_xop_feed(&xop->head, NULL, clindex, error);

	/*
	 * Race to finish the frontend
	 */
	if (xop->finished)
		return;
	hammer2_mtx_ex(&xop->head.xgrp->mtx2);
	if (xop->finished) {
		hammer2_mtx_unlock(&xop->head.xgrp->mtx2);
		return;
	}

	/*
	 * Async operation has not completed and we now own the lock.
	 * Determine if we can complete the operation by issuing the
	 * frontend collection non-blocking.
	 */
	error = hammer2_xop_collect(&xop->head, HAMMER2_XOP_COLLECT_NOWAIT);

	switch(error) {
	case ENOENT:
	case 0:
		xop->finished = 1;
		hammer2_mtx_unlock(&xop->head.xgrp->mtx2);
		bp->b_resid = 0;
		bp->b_error = 0;
		biodone(bio);
		hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
		hammer2_lwinprog_drop(ip->pmp);
		break;
	case EINPROGRESS:
		hammer2_mtx_unlock(&xop->head.xgrp->mtx2);
		break;
	default:
		xop->finished = 1;
		hammer2_mtx_unlock(&xop->head.xgrp->mtx2);
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
		biodone(bio);
		hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
		hammer2_lwinprog_drop(ip->pmp);
		break;
	}
}

/*
 * Wait for pending I/O to complete
 */
void
hammer2_bioq_sync(hammer2_pfs_t *pmp)
{
	hammer2_lwinprog_wait(pmp, 0);
}

/*
 * Create a new chain at (*parentp, lbase) and assign physical storage,
 * returning a chain suitable for I/O.  The chain will be in a modified
 * state.
 *
 * *parentp can wind up being anything.
 *
 * NOTE: Special case for data embedded in inode.
 */
static
hammer2_chain_t *
hammer2_assign_physical(hammer2_inode_t *ip, hammer2_chain_t **parentp,
			hammer2_key_t lbase, int pblksize, int *errorp)
{
	hammer2_chain_t *chain;
	hammer2_key_t key_dummy;
	int pradix = hammer2_getradix(pblksize);
	int cache_index = -1;

	/*
	 * Locate the chain associated with lbase, return a locked chain.
	 * However, do not instantiate any data reference (which utilizes a
	 * device buffer) because we will be using direct IO via the
	 * logical buffer cache buffer.
	 */
	*errorp = 0;
	KKASSERT(pblksize >= HAMMER2_ALLOC_MIN);
retry:
	chain = hammer2_chain_lookup(parentp, &key_dummy,
				     lbase, lbase,
				     &cache_index,
				     HAMMER2_LOOKUP_NODATA);
	if (chain == NULL) {
		/*
		 * We found a hole, create a new chain entry.
		 *
		 * NOTE: DATA chains are created without device backing
		 *	 store (nor do we want any).
		 */
		*errorp = hammer2_chain_create(parentp, &chain, ip->pmp,
					       lbase, HAMMER2_PBUFRADIX,
					       HAMMER2_BREF_TYPE_DATA,
					       pblksize, 0);
		if (chain == NULL) {
			panic("hammer2_chain_create: par=%p error=%d\n",
			      *parentp, *errorp);
			goto retry;
		}
		/*ip->delta_dcount += pblksize;*/
	} else {
		switch (chain->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * The data is embedded in the inode, which requires
			 * a bit more finesse.
			 */
			hammer2_chain_modify_ip(ip, chain, 0);
			break;
		case HAMMER2_BREF_TYPE_DATA:
			if (chain->bytes != pblksize) {
				hammer2_chain_resize(ip, *parentp, chain,
						     pradix,
						     HAMMER2_MODIFY_OPTDATA);
			}

			/*
			 * DATA buffers must be marked modified whether the
			 * data is in a logical buffer or not.  We also have
			 * to make this call to fixup the chain data pointers
			 * after resizing in case this is an encrypted or
			 * compressed buffer.
			 */
			hammer2_chain_modify(chain, HAMMER2_MODIFY_OPTDATA);
			break;
		default:
			panic("hammer2_assign_physical: bad type");
			/* NOT REACHED */
			break;
		}
	}
	return (chain);
}

/*
 * hammer2_write_file_core() - hammer2_write_thread() helper
 *
 * The core write function which determines which path to take
 * depending on compression settings.  We also have to locate the
 * related chains so we can calculate and set the check data for
 * the blockref.
 */
static
void
hammer2_write_file_core(struct buf *bp, hammer2_inode_t *ip,
			hammer2_chain_t **parentp,
			hammer2_key_t lbase, int ioflag, int pblksize,
			int *errorp)
{
	hammer2_chain_t *chain;

	switch(HAMMER2_DEC_ALGO(ip->meta.comp_algo)) {
	case HAMMER2_COMP_NONE:
		/*
		 * We have to assign physical storage to the buffer
		 * we intend to dirty or write now to avoid deadlocks
		 * in the strategy code later.
		 *
		 * This can return NOOFFSET for inode-embedded data.
		 * The strategy code will take care of it in that case.
		 */
		chain = hammer2_assign_physical(ip, parentp,
					        lbase, pblksize, errorp);
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
			hammer2_inode_data_t *wipdata;

			wipdata = &chain->data->ipdata;
			KKASSERT(wipdata->meta.op_flags &
				 HAMMER2_OPFLAG_DIRECTDATA);
			KKASSERT(bp->b_loffset == 0);
			bcopy(bp->b_data, wipdata->u.data,
			      HAMMER2_EMBEDDED_BYTES);
		} else {
			hammer2_write_bp(chain, bp, ioflag, pblksize,
					 errorp, ip->meta.check_algo);
		}
		if (chain) {
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
		}
		break;
	case HAMMER2_COMP_AUTOZERO:
		/*
		 * Check for zero-fill only
		 */
		hammer2_zero_check_and_write(bp, ip, parentp,
					     lbase, ioflag,
					     pblksize, errorp,
					     ip->meta.check_algo);
		break;
	case HAMMER2_COMP_LZ4:
	case HAMMER2_COMP_ZLIB:
	default:
		/*
		 * Check for zero-fill and attempt compression.
		 */
		hammer2_compress_and_write(bp, ip, parentp,
					   lbase, ioflag,
					   pblksize, errorp,
					   ip->meta.comp_algo,
					   ip->meta.check_algo);
		break;
	}
}

/*
 * Helper
 *
 * Generic function that performs the compression in the compression
 * write path.  The compression algorithm is determined by the settings
 * obtained from the inode.
 */
static
void
hammer2_compress_and_write(struct buf *bp, hammer2_inode_t *ip,
	hammer2_chain_t **parentp,
	hammer2_key_t lbase, int ioflag, int pblksize,
	int *errorp, int comp_algo, int check_algo)
{
	hammer2_chain_t *chain;
	int comp_size;
	int comp_block_size;
	char *comp_buffer;

	if (test_block_zeros(bp->b_data, pblksize)) {
		zero_write(bp, ip, parentp, lbase, errorp);
		return;
	}

	comp_size = 0;
	comp_buffer = NULL;

	KKASSERT(pblksize / 2 <= 32768);

	if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0) {
		z_stream strm_compress;
		int comp_level;
		int ret;

		switch(HAMMER2_DEC_ALGO(comp_algo)) {
		case HAMMER2_COMP_LZ4:
			comp_buffer = objcache_get(cache_buffer_write,
						   M_INTWAIT);
			comp_size = LZ4_compress_limitedOutput(
					bp->b_data,
					&comp_buffer[sizeof(int)],
					pblksize,
					pblksize / 2 - sizeof(int));
			/*
			 * We need to prefix with the size, LZ4
			 * doesn't do it for us.  Add the related
			 * overhead.
			 */
			*(int *)comp_buffer = comp_size;
			if (comp_size)
				comp_size += sizeof(int);
			break;
		case HAMMER2_COMP_ZLIB:
			comp_level = HAMMER2_DEC_LEVEL(comp_algo);
			if (comp_level == 0)
				comp_level = 6;	/* default zlib compression */
			else if (comp_level < 6)
				comp_level = 6;
			else if (comp_level > 9)
				comp_level = 9;
			ret = deflateInit(&strm_compress, comp_level);
			if (ret != Z_OK) {
				kprintf("HAMMER2 ZLIB: fatal error "
					"on deflateInit.\n");
			}

			comp_buffer = objcache_get(cache_buffer_write,
						   M_INTWAIT);
			strm_compress.next_in = bp->b_data;
			strm_compress.avail_in = pblksize;
			strm_compress.next_out = comp_buffer;
			strm_compress.avail_out = pblksize / 2;
			ret = deflate(&strm_compress, Z_FINISH);
			if (ret == Z_STREAM_END) {
				comp_size = pblksize / 2 -
					    strm_compress.avail_out;
			} else {
				comp_size = 0;
			}
			ret = deflateEnd(&strm_compress);
			break;
		default:
			kprintf("Error: Unknown compression method.\n");
			kprintf("Comp_method = %d.\n", comp_algo);
			break;
		}
	}

	if (comp_size == 0) {
		/*
		 * compression failed or turned off
		 */
		comp_block_size = pblksize;	/* safety */
		if (++ip->comp_heuristic > 128)
			ip->comp_heuristic = 8;
	} else {
		/*
		 * compression succeeded
		 */
		ip->comp_heuristic = 0;
		if (comp_size <= 1024) {
			comp_block_size = 1024;
		} else if (comp_size <= 2048) {
			comp_block_size = 2048;
		} else if (comp_size <= 4096) {
			comp_block_size = 4096;
		} else if (comp_size <= 8192) {
			comp_block_size = 8192;
		} else if (comp_size <= 16384) {
			comp_block_size = 16384;
		} else if (comp_size <= 32768) {
			comp_block_size = 32768;
		} else {
			panic("hammer2: WRITE PATH: "
			      "Weird comp_size value.");
			/* NOT REACHED */
			comp_block_size = pblksize;
		}
	}

	chain = hammer2_assign_physical(ip, parentp, lbase,
					comp_block_size, errorp);
	if (*errorp) {
		kprintf("WRITE PATH: An error occurred while "
			"assigning physical space.\n");
		KKASSERT(chain == NULL);
		goto done;
	}

	if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
		hammer2_inode_data_t *wipdata;

		hammer2_chain_modify_ip(ip, chain, 0);
		wipdata = &chain->data->ipdata;
		KKASSERT(wipdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA);
		KKASSERT(bp->b_loffset == 0);
		bcopy(bp->b_data, wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
	} else {
		hammer2_io_t *dio;
		char *bdata;

		KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);

		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			panic("hammer2_compress_and_write: "
			      "unexpected inode\n");
			break;
		case HAMMER2_BREF_TYPE_DATA:
			/*
			 * Optimize out the read-before-write
			 * if possible.
			 */
			*errorp = hammer2_io_newnz(chain->hmp,
						   chain->bref.data_off,
						   chain->bytes,
						   &dio);
			if (*errorp) {
				hammer2_io_brelse(&dio);
				kprintf("hammer2: WRITE PATH: "
					"dbp bread error\n");
				break;
			}
			bdata = hammer2_io_data(dio, chain->bref.data_off);

			/*
			 * When loading the block make sure we don't
			 * leave garbage after the compressed data.
			 */
			if (comp_size) {
				chain->bref.methods =
					HAMMER2_ENC_COMP(comp_algo) +
					HAMMER2_ENC_CHECK(check_algo);
				bcopy(comp_buffer, bdata, comp_size);
				if (comp_size != comp_block_size) {
					bzero(bdata + comp_size,
					      comp_block_size - comp_size);
				}
			} else {
				chain->bref.methods =
					HAMMER2_ENC_COMP(
						HAMMER2_COMP_NONE) +
					HAMMER2_ENC_CHECK(check_algo);
				bcopy(bp->b_data, bdata, pblksize);
			}

			/*
			 * The flush code doesn't calculate check codes for
			 * file data (doing so can result in excessive I/O),
			 * so we do it here.
			 */
			hammer2_chain_setcheck(chain, bdata);

			/*
			 * Device buffer is now valid, chain is no longer in
			 * the initial state.
			 *
			 * (No blockref table worries with file data)
			 */
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);

			/* Now write the related bdp. */
			if (ioflag & IO_SYNC) {
				/*
				 * Synchronous I/O requested.
				 */
				hammer2_io_bwrite(&dio);
			/*
			} else if ((ioflag & IO_DIRECT) &&
				   loff + n == pblksize) {
				hammer2_io_bdwrite(&dio);
			*/
			} else if (ioflag & IO_ASYNC) {
				hammer2_io_bawrite(&dio);
			} else {
				hammer2_io_bdwrite(&dio);
			}
			break;
		default:
			panic("hammer2_compress_and_write: "
			      "bad chain type %d\n",
				chain->bref.type);
			/* NOT REACHED */
			break;
		}
	}
done:
	if (chain) {
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
	}
	if (comp_buffer)
		objcache_put(cache_buffer_write, comp_buffer);
}

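/*
 * The if/else ladder above rounds comp_size up to the next power-of-two
 * media block, 1KB minimum and 32KB maximum (the compressor output is
 * capped at pblksize / 2).  A compact equivalent, shown for clarity only
 * and assuming the 1024-byte floor matches HAMMER2_ALLOC_MIN; the
 * explicit ladder is what the code above actually uses:
 */
#if 0
	if (comp_size <= 32768) {
		comp_block_size = 1024;		/* minimum media block */
		while (comp_block_size < comp_size)
			comp_block_size <<= 1;	/* next power of two */
	} else {
		panic("hammer2: WRITE PATH: Weird comp_size value.");
	}
#endif
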
/*
 * Helper
 *
 * Function that performs zero-checking and writing without compression;
 * it corresponds to the default zero-checking path.
 */
static
void
hammer2_zero_check_and_write(struct buf *bp, hammer2_inode_t *ip,
	hammer2_chain_t **parentp,
	hammer2_key_t lbase, int ioflag, int pblksize, int *errorp,
	int check_algo)
{
	hammer2_chain_t *chain;

	if (test_block_zeros(bp->b_data, pblksize)) {
		zero_write(bp, ip, parentp, lbase, errorp);
	} else {
		chain = hammer2_assign_physical(ip, parentp, lbase,
						pblksize, errorp);
		hammer2_write_bp(chain, bp, ioflag, pblksize,
				 errorp, check_algo);
		if (chain) {
			hammer2_chain_unlock(chain);
			hammer2_chain_drop(chain);
		}
	}
}

/*
 * Helper
 *
 * A function to test whether a block of data contains only zeros;
 * returns TRUE (non-zero) if the block is all zeros.
 */
static
int
test_block_zeros(const char *buf, size_t bytes)
{
	size_t i;

	for (i = 0; i < bytes; i += sizeof(long)) {
		if (*(const long *)(buf + i) != 0)
			return (0);
	}
	return (1);
}

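/*
 * Note that test_block_zeros() scans in sizeof(long) strides, so it
 * implicitly assumes the buffer is long-aligned and that bytes is a
 * multiple of sizeof(long).  Both hold for the logical buffers passed
 * in here, since pblksize is a power of two >= HAMMER2_ALLOC_MIN (see
 * hammer2_assign_physical()).  A hypothetical caller-side sanity check
 * and typical use, disabled:
 */
#if 0
	KKASSERT(((uintptr_t)bp->b_data & (sizeof(long) - 1)) == 0);
	KKASSERT((pblksize & (sizeof(long) - 1)) == 0);
	if (test_block_zeros(bp->b_data, pblksize))
		zero_write(bp, ip, parentp, lbase, errorp);
#endif
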
/*
 * Helper
 *
 * Function to "write" a block that contains only zeros.
 */
static
void
zero_write(struct buf *bp, hammer2_inode_t *ip,
	   hammer2_chain_t **parentp,
	   hammer2_key_t lbase, int *errorp __unused)
{
	hammer2_chain_t *chain;
	hammer2_key_t key_dummy;
	int cache_index = -1;

	chain = hammer2_chain_lookup(parentp, &key_dummy,
				     lbase, lbase,
				     &cache_index,
				     HAMMER2_LOOKUP_NODATA);
	if (chain) {
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
			hammer2_inode_data_t *wipdata;

			hammer2_chain_modify_ip(ip, chain, 0);
			wipdata = &chain->data->ipdata;
			KKASSERT(wipdata->meta.op_flags &
				 HAMMER2_OPFLAG_DIRECTDATA);
			KKASSERT(bp->b_loffset == 0);
			bzero(wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
		} else {
			hammer2_chain_delete(*parentp, chain,
					     HAMMER2_DELETE_PERMANENT);
		}
		hammer2_chain_unlock(chain);
		hammer2_chain_drop(chain);
	}
}

/*
 * Helper
 *
 * Function to write the data as it is, without performing any sort of
 * compression.  This function is used in the no-compression path and in
 * the default zero-checking path.
 */
static
void
hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp, int ioflag,
		 int pblksize, int *errorp, int check_algo)
{
	hammer2_inode_data_t *wipdata;
	hammer2_io_t *dio;
	char *bdata;
	int error;

	error = 0;	/* XXX TODO below */

	KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);

	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		wipdata = &chain->data->ipdata;
		KKASSERT(wipdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA);
		KKASSERT(bp->b_loffset == 0);
		bcopy(bp->b_data, wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
		error = 0;
		break;
	case HAMMER2_BREF_TYPE_DATA:
		error = hammer2_io_newnz(chain->hmp,
					 chain->bref.data_off,
					 chain->bytes, &dio);
		if (error) {
			hammer2_io_bqrelse(&dio);
			kprintf("hammer2: WRITE PATH: "
				"dbp bread error\n");
			break;
		}
		bdata = hammer2_io_data(dio, chain->bref.data_off);

		chain->bref.methods = HAMMER2_ENC_COMP(
						HAMMER2_COMP_NONE) +
				      HAMMER2_ENC_CHECK(check_algo);
		bcopy(bp->b_data, bdata, chain->bytes);

		/*
		 * The flush code doesn't calculate check codes for
		 * file data (doing so can result in excessive I/O),
		 * so we do it here.
		 */
		hammer2_chain_setcheck(chain, bdata);

		/*
		 * Device buffer is now valid, chain is no longer in
		 * the initial state.
		 *
		 * (No blockref table worries with file data)
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);

		if (ioflag & IO_SYNC) {
			/*
			 * Synchronous I/O requested.
			 */
			hammer2_io_bwrite(&dio);
		/*
		} else if ((ioflag & IO_DIRECT) &&
			   loff + n == pblksize) {
			hammer2_io_bdwrite(&dio);
		*/
		} else if (ioflag & IO_ASYNC) {
			hammer2_io_bawrite(&dio);
		} else {
			hammer2_io_bdwrite(&dio);
		}
		break;
	default:
		panic("hammer2_write_bp: bad chain type %d\n",
		      chain->bref.type);
		/* NOT REACHED */
		error = 0;
		break;
	}
	KKASSERT(error == 0);	/* XXX TODO */
	*errorp = error;
}