xref: /dflybsd-src/sys/vfs/hammer/hammer_io.c (revision e7d75765a12690c7d3bbf05bc3680156bbf2e03c)
166325755SMatthew Dillon /*
2b84de5afSMatthew Dillon  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
366325755SMatthew Dillon  *
466325755SMatthew Dillon  * This code is derived from software contributed to The DragonFly Project
566325755SMatthew Dillon  * by Matthew Dillon <dillon@backplane.com>
666325755SMatthew Dillon  *
766325755SMatthew Dillon  * Redistribution and use in source and binary forms, with or without
866325755SMatthew Dillon  * modification, are permitted provided that the following conditions
966325755SMatthew Dillon  * are met:
1066325755SMatthew Dillon  *
1166325755SMatthew Dillon  * 1. Redistributions of source code must retain the above copyright
1266325755SMatthew Dillon  *    notice, this list of conditions and the following disclaimer.
1366325755SMatthew Dillon  * 2. Redistributions in binary form must reproduce the above copyright
1466325755SMatthew Dillon  *    notice, this list of conditions and the following disclaimer in
1566325755SMatthew Dillon  *    the documentation and/or other materials provided with the
1666325755SMatthew Dillon  *    distribution.
1766325755SMatthew Dillon  * 3. Neither the name of The DragonFly Project nor the names of its
1866325755SMatthew Dillon  *    contributors may be used to endorse or promote products derived
1966325755SMatthew Dillon  *    from this software without specific, prior written permission.
2066325755SMatthew Dillon  *
2166325755SMatthew Dillon  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
2266325755SMatthew Dillon  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
2366325755SMatthew Dillon  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
2466325755SMatthew Dillon  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
2566325755SMatthew Dillon  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
2666325755SMatthew Dillon  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
2766325755SMatthew Dillon  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
2866325755SMatthew Dillon  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
2966325755SMatthew Dillon  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
3066325755SMatthew Dillon  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
3166325755SMatthew Dillon  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
3266325755SMatthew Dillon  * SUCH DAMAGE.
3366325755SMatthew Dillon  */
3466325755SMatthew Dillon /*
3566325755SMatthew Dillon  * IO Primitives and buffer cache management
3666325755SMatthew Dillon  *
3766325755SMatthew Dillon  * All major data-tracking structures in HAMMER contain a struct hammer_io
3866325755SMatthew Dillon  * which is used to manage their backing store.  We use filesystem buffers
3966325755SMatthew Dillon  * for backing store and we leave them passively associated with their
4066325755SMatthew Dillon  * HAMMER structures.
4166325755SMatthew Dillon  *
429f5097dcSMatthew Dillon  * If the kernel tries to destroy a passively associated buf which we cannot
4366325755SMatthew Dillon  * yet let go we set B_LOCKED in the buffer and then actively released it
4466325755SMatthew Dillon  * later when we can.
4577912481SMatthew Dillon  *
4677912481SMatthew Dillon  * The io_token is required for anything which might race bioops and bio_done
4777912481SMatthew Dillon  * callbacks, with one exception: A successful hammer_try_interlock_norefs().
4877912481SMatthew Dillon  * the fs_token will be held in all other cases.
4966325755SMatthew Dillon  */
5066325755SMatthew Dillon 
5197fb61c0STomohiro Kusumi #include <sys/buf2.h>
5254341a3bSMatthew Dillon 
53b45803e3STomohiro Kusumi #include "hammer.h"
5466325755SMatthew Dillon 
5510a5d1baSMatthew Dillon static void hammer_io_modify(hammer_io_t io, int count);
56055f5ff8SMatthew Dillon static void hammer_io_deallocate(struct buf *bp);
579a98f3ccSMatthew Dillon static void hammer_indirect_callback(struct bio *bio);
581b0ab2c3SMatthew Dillon static void hammer_io_direct_write_complete(struct bio *nbio);
5943c665aeSMatthew Dillon static int hammer_io_direct_uncache_callback(hammer_inode_t ip, void *data);
60cdb6e4e6SMatthew Dillon static void hammer_io_set_modlist(struct hammer_io *io);
619e6939a5STomohiro Kusumi static __inline void hammer_io_flush_mark(hammer_volume_t volume);
62e397030bSTomohiro Kusumi static struct bio_ops hammer_bioops;
63748efb59SMatthew Dillon 
641afb73cfSMatthew Dillon static int
651afb73cfSMatthew Dillon hammer_mod_rb_compare(hammer_io_t io1, hammer_io_t io2)
661afb73cfSMatthew Dillon {
671afb73cfSMatthew Dillon 	hammer_off_t io1_offset;
681afb73cfSMatthew Dillon 	hammer_off_t io2_offset;
691afb73cfSMatthew Dillon 
70b49481fbSTomohiro Kusumi 	/*
71b49481fbSTomohiro Kusumi 	 * Encoded offsets are neither valid block device offsets
72b49481fbSTomohiro Kusumi 	 * nor valid zone-X offsets.
73b49481fbSTomohiro Kusumi 	 */
74b49481fbSTomohiro Kusumi 	io1_offset = HAMMER_ENCODE(0, io1->volume->vol_no, io1->offset);
75b49481fbSTomohiro Kusumi 	io2_offset = HAMMER_ENCODE(0, io2->volume->vol_no, io2->offset);
761afb73cfSMatthew Dillon 
771afb73cfSMatthew Dillon 	if (io1_offset < io2_offset)
781afb73cfSMatthew Dillon 		return(-1);
791afb73cfSMatthew Dillon 	if (io1_offset > io2_offset)
801afb73cfSMatthew Dillon 		return(1);
811afb73cfSMatthew Dillon 	return(0);
821afb73cfSMatthew Dillon }
831afb73cfSMatthew Dillon 
841afb73cfSMatthew Dillon RB_GENERATE(hammer_mod_rb_tree, hammer_io, rb_node, hammer_mod_rb_compare);
851afb73cfSMatthew Dillon 
86055f5ff8SMatthew Dillon /*
8710a5d1baSMatthew Dillon  * Initialize a new, already-zero'd hammer_io structure, or reinitialize
8810a5d1baSMatthew Dillon  * an existing hammer_io structure which may have switched to another type.
89055f5ff8SMatthew Dillon  */
90055f5ff8SMatthew Dillon void
91748efb59SMatthew Dillon hammer_io_init(hammer_io_t io, hammer_volume_t volume, enum hammer_io_type type)
92055f5ff8SMatthew Dillon {
93748efb59SMatthew Dillon 	io->volume = volume;
94748efb59SMatthew Dillon 	io->hmp = volume->io.hmp;
95055f5ff8SMatthew Dillon 	io->type = type;
96055f5ff8SMatthew Dillon }
97055f5ff8SMatthew Dillon 
9866325755SMatthew Dillon /*
99fbc6e32aSMatthew Dillon  * Helper routine to disassociate a buffer cache buffer from an I/O
10077912481SMatthew Dillon  * structure.  The io must be interlocked and marked appropriately for
101b0aab9b9SMatthew Dillon  * reclamation.
102055f5ff8SMatthew Dillon  *
103b0aab9b9SMatthew Dillon  * The io must be in a released state with the io->bp owned and
104b0aab9b9SMatthew Dillon  * locked by the caller of this function.  When not called from an
105b0aab9b9SMatthew Dillon  * io_deallocate() this cannot race an io_deallocate() since the
106b0aab9b9SMatthew Dillon  * kernel would be unable to get the buffer lock in that case.
10777912481SMatthew Dillon  * (The released state in this case means we own the bp, not the
10877912481SMatthew Dillon  * hammer_io structure).
10977912481SMatthew Dillon  *
11077912481SMatthew Dillon  * The io may have 0 or 1 references depending on who called us.  The
11177912481SMatthew Dillon  * caller is responsible for dealing with the refs.
112b0aab9b9SMatthew Dillon  *
113055f5ff8SMatthew Dillon  * This call can only be made when no action is required on the buffer.
114ecca949aSMatthew Dillon  *
11577912481SMatthew Dillon  * This function is guaranteed not to race against anything because we
11677912481SMatthew Dillon  * own both the io lock and the bp lock and are interlocked with no
11777912481SMatthew Dillon  * references.
11866325755SMatthew Dillon  */
11966325755SMatthew Dillon static void
120ff66f880STomohiro Kusumi hammer_io_disassociate(hammer_io_t io)
12166325755SMatthew Dillon {
122ff66f880STomohiro Kusumi 	struct buf *bp = io->bp;
12366325755SMatthew Dillon 
124ff66f880STomohiro Kusumi 	KKASSERT(io->released);
125ff66f880STomohiro Kusumi 	KKASSERT(io->modified == 0);
1260ae73f43STomohiro Kusumi 	KKASSERT(hammer_buf_peek_io(bp) == io);
1274d75d829SMatthew Dillon 	buf_dep_init(bp);
128ff66f880STomohiro Kusumi 	io->bp = NULL;
1299f5097dcSMatthew Dillon 
1309f5097dcSMatthew Dillon 	/*
1319f5097dcSMatthew Dillon 	 * If the buffer was locked someone wanted to get rid of it.
1329f5097dcSMatthew Dillon 	 */
133a99b9ea2SMatthew Dillon 	if (bp->b_flags & B_LOCKED) {
134b0aab9b9SMatthew Dillon 		atomic_add_int(&hammer_count_io_locked, -1);
135d8971d2bSMatthew Dillon 		bp->b_flags &= ~B_LOCKED;
136a99b9ea2SMatthew Dillon 	}
137ff66f880STomohiro Kusumi 	if (io->reclaim) {
138cebe9493SMatthew Dillon 		bp->b_flags |= B_NOCACHE|B_RELBUF;
139ff66f880STomohiro Kusumi 		io->reclaim = 0;
140ecca949aSMatthew Dillon 	}
14166325755SMatthew Dillon 
142ff66f880STomohiro Kusumi 	switch(io->type) {
14366325755SMatthew Dillon 	case HAMMER_STRUCTURE_VOLUME:
144ff66f880STomohiro Kusumi 		HAMMER_ITOV(io)->ondisk = NULL;
14566325755SMatthew Dillon 		break;
14610a5d1baSMatthew Dillon 	case HAMMER_STRUCTURE_DATA_BUFFER:
14710a5d1baSMatthew Dillon 	case HAMMER_STRUCTURE_META_BUFFER:
14810a5d1baSMatthew Dillon 	case HAMMER_STRUCTURE_UNDO_BUFFER:
149ff66f880STomohiro Kusumi 		HAMMER_ITOB(io)->ondisk = NULL;
15066325755SMatthew Dillon 		break;
151eddadaeeSMatthew Dillon 	case HAMMER_STRUCTURE_DUMMY:
152903fdd05STomohiro Kusumi 		hpanic("bad io type");
153eddadaeeSMatthew Dillon 		break;
15466325755SMatthew Dillon 	}
15566325755SMatthew Dillon }
156fbc6e32aSMatthew Dillon 
157fbc6e32aSMatthew Dillon /*
158055f5ff8SMatthew Dillon  * Wait for any physical IO to complete
159ae8e83e6SMatthew Dillon  *
160ae8e83e6SMatthew Dillon  * XXX we aren't interlocked against a spinlock or anything so there
161ae8e83e6SMatthew Dillon  *     is a small window in the interlock / io->running == 0 test.
162fbc6e32aSMatthew Dillon  */
1631b0ab2c3SMatthew Dillon void
164055f5ff8SMatthew Dillon hammer_io_wait(hammer_io_t io)
165fbc6e32aSMatthew Dillon {
166055f5ff8SMatthew Dillon 	if (io->running) {
167b0aab9b9SMatthew Dillon 		hammer_mount_t hmp = io->hmp;
168b0aab9b9SMatthew Dillon 
169b0aab9b9SMatthew Dillon 		lwkt_gettoken(&hmp->io_token);
170b0aab9b9SMatthew Dillon 		while (io->running) {
171ae8e83e6SMatthew Dillon 			io->waiting = 1;
172ae8e83e6SMatthew Dillon 			tsleep_interlock(io, 0);
173b0aab9b9SMatthew Dillon 			if (io->running)
174ae8e83e6SMatthew Dillon 				tsleep(io, PINTERLOCKED, "hmrflw", hz);
175055f5ff8SMatthew Dillon 		}
176b0aab9b9SMatthew Dillon 		lwkt_reltoken(&hmp->io_token);
177055f5ff8SMatthew Dillon 	}
178055f5ff8SMatthew Dillon }
179055f5ff8SMatthew Dillon 
180af209b0fSMatthew Dillon /*
181eddadaeeSMatthew Dillon  * Wait for all currently queued HAMMER-initiated I/Os to complete.
182eddadaeeSMatthew Dillon  *
183eddadaeeSMatthew Dillon  * This is not supposed to count direct I/O's but some can leak
184eddadaeeSMatthew Dillon  * through (for non-full-sized direct I/Os).
185af209b0fSMatthew Dillon  */
186af209b0fSMatthew Dillon void
187eddadaeeSMatthew Dillon hammer_io_wait_all(hammer_mount_t hmp, const char *ident, int doflush)
188af209b0fSMatthew Dillon {
189eddadaeeSMatthew Dillon 	struct hammer_io iodummy;
190eddadaeeSMatthew Dillon 	hammer_io_t io;
191eddadaeeSMatthew Dillon 
192eddadaeeSMatthew Dillon 	/*
193eddadaeeSMatthew Dillon 	 * Degenerate case, no I/O is running
194eddadaeeSMatthew Dillon 	 */
195b0aab9b9SMatthew Dillon 	lwkt_gettoken(&hmp->io_token);
196eddadaeeSMatthew Dillon 	if (TAILQ_EMPTY(&hmp->iorun_list)) {
197b0aab9b9SMatthew Dillon 		lwkt_reltoken(&hmp->io_token);
198eddadaeeSMatthew Dillon 		if (doflush)
199eddadaeeSMatthew Dillon 			hammer_io_flush_sync(hmp);
200eddadaeeSMatthew Dillon 		return;
201eddadaeeSMatthew Dillon 	}
202eddadaeeSMatthew Dillon 	bzero(&iodummy, sizeof(iodummy));
203eddadaeeSMatthew Dillon 	iodummy.type = HAMMER_STRUCTURE_DUMMY;
204eddadaeeSMatthew Dillon 
205eddadaeeSMatthew Dillon 	/*
206eddadaeeSMatthew Dillon 	 * Add placemarker and then wait until it becomes the head of
207eddadaeeSMatthew Dillon 	 * the list.
208eddadaeeSMatthew Dillon 	 */
209eddadaeeSMatthew Dillon 	TAILQ_INSERT_TAIL(&hmp->iorun_list, &iodummy, iorun_entry);
210eddadaeeSMatthew Dillon 	while (TAILQ_FIRST(&hmp->iorun_list) != &iodummy) {
211eddadaeeSMatthew Dillon 		tsleep(&iodummy, 0, ident, 0);
212eddadaeeSMatthew Dillon 	}
213eddadaeeSMatthew Dillon 
214eddadaeeSMatthew Dillon 	/*
215eddadaeeSMatthew Dillon 	 * Chain in case several placemarkers are present.
216eddadaeeSMatthew Dillon 	 */
217eddadaeeSMatthew Dillon 	TAILQ_REMOVE(&hmp->iorun_list, &iodummy, iorun_entry);
218eddadaeeSMatthew Dillon 	io = TAILQ_FIRST(&hmp->iorun_list);
219eddadaeeSMatthew Dillon 	if (io && io->type == HAMMER_STRUCTURE_DUMMY)
220eddadaeeSMatthew Dillon 		wakeup(io);
221b0aab9b9SMatthew Dillon 	lwkt_reltoken(&hmp->io_token);
222eddadaeeSMatthew Dillon 
223eddadaeeSMatthew Dillon 	if (doflush)
224eddadaeeSMatthew Dillon 		hammer_io_flush_sync(hmp);
225af209b0fSMatthew Dillon }
226af209b0fSMatthew Dillon 
2272faf0737SMatthew Dillon /*
2282faf0737SMatthew Dillon  * Clear a flagged error condition on a I/O buffer.  The caller must hold
2292faf0737SMatthew Dillon  * its own ref on the buffer.
2302faf0737SMatthew Dillon  */
2312faf0737SMatthew Dillon void
2322faf0737SMatthew Dillon hammer_io_clear_error(struct hammer_io *io)
2332faf0737SMatthew Dillon {
23477912481SMatthew Dillon 	hammer_mount_t hmp = io->hmp;
23577912481SMatthew Dillon 
23677912481SMatthew Dillon 	lwkt_gettoken(&hmp->io_token);
2372faf0737SMatthew Dillon 	if (io->ioerror) {
2382faf0737SMatthew Dillon 		io->ioerror = 0;
239250aec18SMatthew Dillon 		hammer_rel(&io->lock);
240250aec18SMatthew Dillon 		KKASSERT(hammer_isactive(&io->lock));
2412faf0737SMatthew Dillon 	}
24277912481SMatthew Dillon 	lwkt_reltoken(&hmp->io_token);
24377912481SMatthew Dillon }
24477912481SMatthew Dillon 
24577912481SMatthew Dillon void
24677912481SMatthew Dillon hammer_io_clear_error_noassert(struct hammer_io *io)
24777912481SMatthew Dillon {
24877912481SMatthew Dillon 	hammer_mount_t hmp = io->hmp;
24977912481SMatthew Dillon 
25077912481SMatthew Dillon 	lwkt_gettoken(&hmp->io_token);
25177912481SMatthew Dillon 	if (io->ioerror) {
25277912481SMatthew Dillon 		io->ioerror = 0;
25377912481SMatthew Dillon 		hammer_rel(&io->lock);
25477912481SMatthew Dillon 	}
25577912481SMatthew Dillon 	lwkt_reltoken(&hmp->io_token);
2562faf0737SMatthew Dillon }
2572faf0737SMatthew Dillon 
258b8a41159SMatthew Dillon /*
259b8a41159SMatthew Dillon  * This is an advisory function only which tells the buffer cache
260b8a41159SMatthew Dillon  * the bp is not a meta-data buffer, even though it is backed by
261b8a41159SMatthew Dillon  * a block device.
262b8a41159SMatthew Dillon  *
263b8a41159SMatthew Dillon  * This is used by HAMMER's reblocking code to avoid trying to
264b8a41159SMatthew Dillon  * swapcache the filesystem's data when it is read or written
265b8a41159SMatthew Dillon  * by the reblocking code.
266b0aab9b9SMatthew Dillon  *
267b0aab9b9SMatthew Dillon  * The caller has a ref on the buffer preventing the bp from
268b0aab9b9SMatthew Dillon  * being disassociated from it.
269b8a41159SMatthew Dillon  */
270b8a41159SMatthew Dillon void
271b8a41159SMatthew Dillon hammer_io_notmeta(hammer_buffer_t buffer)
272b8a41159SMatthew Dillon {
273b0aab9b9SMatthew Dillon 	if ((buffer->io.bp->b_flags & B_NOTMETA) == 0) {
274b0aab9b9SMatthew Dillon 		hammer_mount_t hmp = buffer->io.hmp;
275b0aab9b9SMatthew Dillon 
276b0aab9b9SMatthew Dillon 		lwkt_gettoken(&hmp->io_token);
277b8a41159SMatthew Dillon 		buffer->io.bp->b_flags |= B_NOTMETA;
278b0aab9b9SMatthew Dillon 		lwkt_reltoken(&hmp->io_token);
279b0aab9b9SMatthew Dillon 	}
280b8a41159SMatthew Dillon }
281b8a41159SMatthew Dillon 
28261aeeb33SMatthew Dillon /*
28310a5d1baSMatthew Dillon  * Load bp for a HAMMER structure.  The io must be exclusively locked by
28410a5d1baSMatthew Dillon  * the caller.
2852f85fa4dSMatthew Dillon  *
286a99b9ea2SMatthew Dillon  * This routine is mostly used on meta-data and small-data blocks.  Generally
287b7de8aa5SMatthew Dillon  * speaking HAMMER assumes some locality of reference and will cluster.
288af209b0fSMatthew Dillon  *
289b7de8aa5SMatthew Dillon  * Note that the caller (hammer_ondisk.c) may place further restrictions
290b7de8aa5SMatthew Dillon  * on clusterability via the limit (in bytes).  Typically large-data
291b7de8aa5SMatthew Dillon  * zones cannot be clustered due to their mixed buffer sizes.  This is
292b7de8aa5SMatthew Dillon  * not an issue since such clustering occurs in hammer_vnops at the
293b7de8aa5SMatthew Dillon  * regular file layer, whereas this is the buffered block device layer.
294b0aab9b9SMatthew Dillon  *
295b0aab9b9SMatthew Dillon  * No I/O callbacks can occur while we hold the buffer locked.
29666325755SMatthew Dillon  */
29766325755SMatthew Dillon int
298b7de8aa5SMatthew Dillon hammer_io_read(struct vnode *devvp, struct hammer_io *io, int limit)
29966325755SMatthew Dillon {
30066325755SMatthew Dillon 	struct buf *bp;
30166325755SMatthew Dillon 	int   error;
30266325755SMatthew Dillon 
30366325755SMatthew Dillon 	if ((bp = io->bp) == NULL) {
304*e7d75765SMatthew Dillon 		int hce = hammer_cluster_enable;
305*e7d75765SMatthew Dillon 
3063583bbb4SMatthew Dillon 		atomic_add_long(&hammer_count_io_running_read, io->bytes);
307*e7d75765SMatthew Dillon 		if (hce && limit > io->bytes) {
308b7de8aa5SMatthew Dillon 			error = cluster_read(devvp, io->offset + limit,
309ce0138a6SMatthew Dillon 					     io->offset, io->bytes,
310af209b0fSMatthew Dillon 					     HAMMER_CLUSTER_SIZE,
311*e7d75765SMatthew Dillon 					     HAMMER_CLUSTER_SIZE * hce,
312364c022cSMatthew Dillon 					     &io->bp);
313ce0138a6SMatthew Dillon 		} else {
3144a2796f3SMatthew Dillon 			error = bread(devvp, io->offset, io->bytes, &io->bp);
315ce0138a6SMatthew Dillon 		}
316ce0138a6SMatthew Dillon 		hammer_stats_disk_read += io->bytes;
3173583bbb4SMatthew Dillon 		atomic_add_long(&hammer_count_io_running_read, -io->bytes);
318cdb6e4e6SMatthew Dillon 
319cdb6e4e6SMatthew Dillon 		/*
320cdb6e4e6SMatthew Dillon 		 * The code generally assumes b_ops/b_dep has been set-up,
321cdb6e4e6SMatthew Dillon 		 * even if we error out here.
322cdb6e4e6SMatthew Dillon 		 */
32366325755SMatthew Dillon 		bp = io->bp;
32424c8374aSMatthew Dillon 		if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
32524c8374aSMatthew Dillon 			const char *metatype;
32624c8374aSMatthew Dillon 
32724c8374aSMatthew Dillon 			switch(io->type) {
32824c8374aSMatthew Dillon 			case HAMMER_STRUCTURE_VOLUME:
32924c8374aSMatthew Dillon 				metatype = "volume";
33024c8374aSMatthew Dillon 				break;
33124c8374aSMatthew Dillon 			case HAMMER_STRUCTURE_META_BUFFER:
3325bce7157STomohiro Kusumi 				switch(HAMMER_ZONE(HAMMER_ITOB(io)->zoneX_offset)) {
33324c8374aSMatthew Dillon 				case HAMMER_ZONE_BTREE:
33424c8374aSMatthew Dillon 					metatype = "btree";
33524c8374aSMatthew Dillon 					break;
33624c8374aSMatthew Dillon 				case HAMMER_ZONE_META:
33724c8374aSMatthew Dillon 					metatype = "meta";
33824c8374aSMatthew Dillon 					break;
33924c8374aSMatthew Dillon 				case HAMMER_ZONE_FREEMAP:
34024c8374aSMatthew Dillon 					metatype = "freemap";
34124c8374aSMatthew Dillon 					break;
34224c8374aSMatthew Dillon 				default:
34324c8374aSMatthew Dillon 					metatype = "meta?";
34424c8374aSMatthew Dillon 					break;
34524c8374aSMatthew Dillon 				}
34624c8374aSMatthew Dillon 				break;
34724c8374aSMatthew Dillon 			case HAMMER_STRUCTURE_DATA_BUFFER:
34824c8374aSMatthew Dillon 				metatype = "data";
34924c8374aSMatthew Dillon 				break;
35024c8374aSMatthew Dillon 			case HAMMER_STRUCTURE_UNDO_BUFFER:
35124c8374aSMatthew Dillon 				metatype = "undo";
35224c8374aSMatthew Dillon 				break;
35324c8374aSMatthew Dillon 			default:
35424c8374aSMatthew Dillon 				metatype = "unknown";
35524c8374aSMatthew Dillon 				break;
35624c8374aSMatthew Dillon 			}
35765d9d14fSTomohiro Kusumi 			hdkprintf("zone2_offset %016jx %s\n",
35824c8374aSMatthew Dillon 				(intmax_t)bp->b_bio2.bio_offset,
35924c8374aSMatthew Dillon 				metatype);
36024c8374aSMatthew Dillon 		}
36124c8374aSMatthew Dillon 		bp->b_flags &= ~B_IODEBUG;
36266325755SMatthew Dillon 		bp->b_ops = &hammer_bioops;
363b0aab9b9SMatthew Dillon 
3640ae73f43STomohiro Kusumi 		hammer_buf_attach_io(bp, io); /* locked by the io lock */
36566325755SMatthew Dillon 		BUF_KERNPROC(bp);
36610a5d1baSMatthew Dillon 		KKASSERT(io->modified == 0);
36710a5d1baSMatthew Dillon 		KKASSERT(io->running == 0);
36810a5d1baSMatthew Dillon 		KKASSERT(io->waiting == 0);
36966325755SMatthew Dillon 		io->released = 0;	/* we hold an active lock on bp */
37066325755SMatthew Dillon 	} else {
37166325755SMatthew Dillon 		error = 0;
37266325755SMatthew Dillon 	}
37366325755SMatthew Dillon 	return(error);
37466325755SMatthew Dillon }
37566325755SMatthew Dillon 
37666325755SMatthew Dillon /*
37766325755SMatthew Dillon  * Similar to hammer_io_read() but returns a zero'd out buffer instead.
37810a5d1baSMatthew Dillon  * Must be called with the IO exclusively locked.
379055f5ff8SMatthew Dillon  *
38010a5d1baSMatthew Dillon  * vfs_bio_clrbuf() is kinda nasty, enforce serialization against background
38110a5d1baSMatthew Dillon  * I/O by forcing the buffer to not be in a released state before calling
38210a5d1baSMatthew Dillon  * it.
38310a5d1baSMatthew Dillon  *
38410a5d1baSMatthew Dillon  * This function will also mark the IO as modified but it will not
38510a5d1baSMatthew Dillon  * increment the modify_refs count.
386b0aab9b9SMatthew Dillon  *
387b0aab9b9SMatthew Dillon  * No I/O callbacks can occur while we hold the buffer locked.
38866325755SMatthew Dillon  */
38966325755SMatthew Dillon int
39066325755SMatthew Dillon hammer_io_new(struct vnode *devvp, struct hammer_io *io)
39166325755SMatthew Dillon {
39266325755SMatthew Dillon 	struct buf *bp;
39366325755SMatthew Dillon 
39466325755SMatthew Dillon 	if ((bp = io->bp) == NULL) {
3954a2796f3SMatthew Dillon 		io->bp = getblk(devvp, io->offset, io->bytes, 0, 0);
39666325755SMatthew Dillon 		bp = io->bp;
39766325755SMatthew Dillon 		bp->b_ops = &hammer_bioops;
398b0aab9b9SMatthew Dillon 
3990ae73f43STomohiro Kusumi 		hammer_buf_attach_io(bp, io); /* locked by the io lock */
400055f5ff8SMatthew Dillon 		io->released = 0;
40110a5d1baSMatthew Dillon 		KKASSERT(io->running == 0);
402055f5ff8SMatthew Dillon 		io->waiting = 0;
40366325755SMatthew Dillon 		BUF_KERNPROC(bp);
40466325755SMatthew Dillon 	} else {
40566325755SMatthew Dillon 		if (io->released) {
40666325755SMatthew Dillon 			regetblk(bp);
40766325755SMatthew Dillon 			BUF_KERNPROC(bp);
408d113fda1SMatthew Dillon 			io->released = 0;
40966325755SMatthew Dillon 		}
41066325755SMatthew Dillon 	}
41110a5d1baSMatthew Dillon 	hammer_io_modify(io, 0);
41266325755SMatthew Dillon 	vfs_bio_clrbuf(bp);
41366325755SMatthew Dillon 	return(0);
41466325755SMatthew Dillon }
41566325755SMatthew Dillon 
41666325755SMatthew Dillon /*
4170e8bd897SMatthew Dillon  * Advance the activity count on the underlying buffer because
4180e8bd897SMatthew Dillon  * HAMMER does not getblk/brelse on every access.
419b0aab9b9SMatthew Dillon  *
420b0aab9b9SMatthew Dillon  * The io->bp cannot go away while the buffer is referenced.
4210e8bd897SMatthew Dillon  */
4220e8bd897SMatthew Dillon void
4230e8bd897SMatthew Dillon hammer_io_advance(struct hammer_io *io)
4240e8bd897SMatthew Dillon {
4250e8bd897SMatthew Dillon 	if (io->bp)
4260e8bd897SMatthew Dillon 		buf_act_advance(io->bp);
4270e8bd897SMatthew Dillon }
4280e8bd897SMatthew Dillon 
4290e8bd897SMatthew Dillon /*
43047637bffSMatthew Dillon  * Remove potential device level aliases against buffers managed by high level
431362ec2dcSMatthew Dillon  * vnodes.  Aliases can also be created due to mixed buffer sizes or via
432362ec2dcSMatthew Dillon  * direct access to the backing store device.
433e469566bSMatthew Dillon  *
434e469566bSMatthew Dillon  * This is nasty because the buffers are also VMIO-backed.  Even if a buffer
435e469566bSMatthew Dillon  * does not exist its backing VM pages might, and we have to invalidate
436e469566bSMatthew Dillon  * those as well or a getblk() will reinstate them.
437362ec2dcSMatthew Dillon  *
438362ec2dcSMatthew Dillon  * Buffer cache buffers associated with hammer_buffers cannot be
439362ec2dcSMatthew Dillon  * invalidated.
44047637bffSMatthew Dillon  */
441362ec2dcSMatthew Dillon int
44247637bffSMatthew Dillon hammer_io_inval(hammer_volume_t volume, hammer_off_t zone2_offset)
44347637bffSMatthew Dillon {
444ff66f880STomohiro Kusumi 	hammer_io_t io;
445b0aab9b9SMatthew Dillon 	hammer_mount_t hmp;
44647637bffSMatthew Dillon 	hammer_off_t phys_offset;
44747637bffSMatthew Dillon 	struct buf *bp;
448362ec2dcSMatthew Dillon 	int error;
44947637bffSMatthew Dillon 
450b0aab9b9SMatthew Dillon 	hmp = volume->io.hmp;
451b0aab9b9SMatthew Dillon 	lwkt_gettoken(&hmp->io_token);
452b0aab9b9SMatthew Dillon 
4539c90dba2SMatthew Dillon 	/*
4543b98d912SMatthew Dillon 	 * If a device buffer already exists for the specified physical
4553b98d912SMatthew Dillon 	 * offset use that, otherwise instantiate a buffer to cover any
4563b98d912SMatthew Dillon 	 * related VM pages, set BNOCACHE, and brelse().
4579c90dba2SMatthew Dillon 	 */
458516655e8STomohiro Kusumi 	phys_offset = hammer_xlate_to_phys(volume->ondisk, zone2_offset);
4593b98d912SMatthew Dillon 	if ((bp = findblk(volume->devvp, phys_offset, 0)) != NULL)
4603b98d912SMatthew Dillon 		bremfree(bp);
461e469566bSMatthew Dillon 	else
462e469566bSMatthew Dillon 		bp = getblk(volume->devvp, phys_offset, HAMMER_BUFSIZE, 0, 0);
463b0aab9b9SMatthew Dillon 
4640ae73f43STomohiro Kusumi 	if ((io = hammer_buf_peek_io(bp)) != NULL) {
465362ec2dcSMatthew Dillon #if 0
466ff66f880STomohiro Kusumi 		hammer_ref(&io->lock);
467ff66f880STomohiro Kusumi 		hammer_io_clear_modify(io, 1);
468cebe9493SMatthew Dillon 		bundirty(bp);
469ff66f880STomohiro Kusumi 		io->released = 0;
470e83ca595SMatthew Dillon 		BUF_KERNPROC(bp);
471ff66f880STomohiro Kusumi 		io->reclaim = 1;
472ff66f880STomohiro Kusumi 		io->waitdep = 1;	/* XXX this is a fs_token field */
473ff66f880STomohiro Kusumi 		KKASSERT(hammer_isactive(&io->lock) == 1);
474ff66f880STomohiro Kusumi 		hammer_rel_buffer(HAMMER_ITOB(io), 0);
4755c8d05e2SMatthew Dillon 		/*hammer_io_deallocate(bp);*/
476362ec2dcSMatthew Dillon #endif
47704b04ca6SMatthew Dillon 		bqrelse(bp);
478362ec2dcSMatthew Dillon 		error = EAGAIN;
4790832c9bbSMatthew Dillon 	} else {
480cebe9493SMatthew Dillon 		KKASSERT((bp->b_flags & B_LOCKED) == 0);
481cebe9493SMatthew Dillon 		bundirty(bp);
482cebe9493SMatthew Dillon 		bp->b_flags |= B_NOCACHE|B_RELBUF;
483ecca949aSMatthew Dillon 		brelse(bp);
484362ec2dcSMatthew Dillon 		error = 0;
485e83ca595SMatthew Dillon 	}
486b0aab9b9SMatthew Dillon 	lwkt_reltoken(&hmp->io_token);
487362ec2dcSMatthew Dillon 	return(error);
4880832c9bbSMatthew Dillon }
48947637bffSMatthew Dillon 
49047637bffSMatthew Dillon /*
491b3deaf57SMatthew Dillon  * This routine is called on the last reference to a hammer structure.
492250aec18SMatthew Dillon  * The io must be interlocked with a refcount of zero.  The hammer structure
493250aec18SMatthew Dillon  * will remain interlocked on return.
494b3deaf57SMatthew Dillon  *
495250aec18SMatthew Dillon  * This routine may return a non-NULL bp to the caller for dispoal.
496250aec18SMatthew Dillon  * The caller typically brelse()'s the bp.
497250aec18SMatthew Dillon  *
498250aec18SMatthew Dillon  * The bp may or may not still be passively associated with the IO.  It
499250aec18SMatthew Dillon  * will remain passively associated if it is unreleasable (e.g. a modified
500250aec18SMatthew Dillon  * meta-data buffer).
501ecca949aSMatthew Dillon  *
502ecca949aSMatthew Dillon  * The only requirement here is that modified meta-data and volume-header
503ecca949aSMatthew Dillon  * buffer may NOT be disassociated from the IO structure, and consequently
504ecca949aSMatthew Dillon  * we also leave such buffers actively associated with the IO if they already
505ecca949aSMatthew Dillon  * are (since the kernel can't do anything with them anyway).  Only the
506ecca949aSMatthew Dillon  * flusher is allowed to write such buffers out.  Modified pure-data and
507ecca949aSMatthew Dillon  * undo buffers are returned to the kernel but left passively associated
508ecca949aSMatthew Dillon  * so we can track when the kernel writes the bp out.
50966325755SMatthew Dillon  */
510ecca949aSMatthew Dillon struct buf *
51109ac686bSMatthew Dillon hammer_io_release(struct hammer_io *io, int flush)
51266325755SMatthew Dillon {
51366325755SMatthew Dillon 	struct buf *bp;
51466325755SMatthew Dillon 
515fbc6e32aSMatthew Dillon 	if ((bp = io->bp) == NULL)
516ecca949aSMatthew Dillon 		return(NULL);
517fbc6e32aSMatthew Dillon 
5180b075555SMatthew Dillon 	/*
51910a5d1baSMatthew Dillon 	 * Try to flush a dirty IO to disk if asked to by the
52010a5d1baSMatthew Dillon 	 * caller or if the kernel tried to flush the buffer in the past.
5210b075555SMatthew Dillon 	 *
52210a5d1baSMatthew Dillon 	 * Kernel-initiated flushes are only allowed for pure-data buffers.
52310a5d1baSMatthew Dillon 	 * meta-data and volume buffers can only be flushed explicitly
52410a5d1baSMatthew Dillon 	 * by HAMMER.
525055f5ff8SMatthew Dillon 	 */
52610a5d1baSMatthew Dillon 	if (io->modified) {
52709ac686bSMatthew Dillon 		if (flush) {
528710733a6SMatthew Dillon 			hammer_io_flush(io, 0);
52910a5d1baSMatthew Dillon 		} else if (bp->b_flags & B_LOCKED) {
53010a5d1baSMatthew Dillon 			switch(io->type) {
53110a5d1baSMatthew Dillon 			case HAMMER_STRUCTURE_DATA_BUFFER:
532710733a6SMatthew Dillon 				hammer_io_flush(io, 0);
533710733a6SMatthew Dillon 				break;
53410a5d1baSMatthew Dillon 			case HAMMER_STRUCTURE_UNDO_BUFFER:
535710733a6SMatthew Dillon 				hammer_io_flush(io, hammer_undo_reclaim(io));
53610a5d1baSMatthew Dillon 				break;
53710a5d1baSMatthew Dillon 			default:
53810a5d1baSMatthew Dillon 				break;
53910a5d1baSMatthew Dillon 			}
54010a5d1baSMatthew Dillon 		} /* else no explicit request to flush the buffer */
54110a5d1baSMatthew Dillon 	}
542055f5ff8SMatthew Dillon 
543055f5ff8SMatthew Dillon 	/*
5445c8d05e2SMatthew Dillon 	 * Wait for the IO to complete if asked to.  This occurs when
5455c8d05e2SMatthew Dillon 	 * the buffer must be disposed of definitively during an umount
5465c8d05e2SMatthew Dillon 	 * or buffer invalidation.
547055f5ff8SMatthew Dillon 	 */
548b58c6388SMatthew Dillon 	if (io->waitdep && io->running) {
549055f5ff8SMatthew Dillon 		hammer_io_wait(io);
550055f5ff8SMatthew Dillon 	}
551055f5ff8SMatthew Dillon 
552055f5ff8SMatthew Dillon 	/*
55310a5d1baSMatthew Dillon 	 * Return control of the buffer to the kernel (with the provisio
55410a5d1baSMatthew Dillon 	 * that our bioops can override kernel decisions with regards to
55510a5d1baSMatthew Dillon 	 * the buffer).
556055f5ff8SMatthew Dillon 	 */
557cebe9493SMatthew Dillon 	if ((flush || io->reclaim) && io->modified == 0 && io->running == 0) {
55810a5d1baSMatthew Dillon 		/*
55910a5d1baSMatthew Dillon 		 * Always disassociate the bp if an explicit flush
56010a5d1baSMatthew Dillon 		 * was requested and the IO completed with no error
56110a5d1baSMatthew Dillon 		 * (so unmount can really clean up the structure).
56210a5d1baSMatthew Dillon 		 */
563055f5ff8SMatthew Dillon 		if (io->released) {
564055f5ff8SMatthew Dillon 			regetblk(bp);
56546fe7ae1SMatthew Dillon 			BUF_KERNPROC(bp);
566ecca949aSMatthew Dillon 		} else {
567ecca949aSMatthew Dillon 			io->released = 1;
568055f5ff8SMatthew Dillon 		}
569ff66f880STomohiro Kusumi 		hammer_io_disassociate(io);
570ecca949aSMatthew Dillon 		/* return the bp */
571055f5ff8SMatthew Dillon 	} else if (io->modified) {
57210a5d1baSMatthew Dillon 		/*
573ecca949aSMatthew Dillon 		 * Only certain IO types can be released to the kernel if
574ecca949aSMatthew Dillon 		 * the buffer has been modified.
575ecca949aSMatthew Dillon 		 *
576ecca949aSMatthew Dillon 		 * volume and meta-data IO types may only be explicitly
577ecca949aSMatthew Dillon 		 * flushed by HAMMER.
57810a5d1baSMatthew Dillon 		 */
57910a5d1baSMatthew Dillon 		switch(io->type) {
58010a5d1baSMatthew Dillon 		case HAMMER_STRUCTURE_DATA_BUFFER:
58110a5d1baSMatthew Dillon 		case HAMMER_STRUCTURE_UNDO_BUFFER:
582b58c6388SMatthew Dillon 			if (io->released == 0) {
583055f5ff8SMatthew Dillon 				io->released = 1;
5849de13b88SMatthew Dillon 				bp->b_flags |= B_CLUSTEROK;
585055f5ff8SMatthew Dillon 				bdwrite(bp);
586055f5ff8SMatthew Dillon 			}
58710a5d1baSMatthew Dillon 			break;
58810a5d1baSMatthew Dillon 		default:
58910a5d1baSMatthew Dillon 			break;
59010a5d1baSMatthew Dillon 		}
591ecca949aSMatthew Dillon 		bp = NULL;	/* bp left associated */
592055f5ff8SMatthew Dillon 	} else if (io->released == 0) {
59310a5d1baSMatthew Dillon 		/*
59410a5d1baSMatthew Dillon 		 * Clean buffers can be generally released to the kernel.
59510a5d1baSMatthew Dillon 		 * We leave the bp passively associated with the HAMMER
59610a5d1baSMatthew Dillon 		 * structure and use bioops to disconnect it later on
59710a5d1baSMatthew Dillon 		 * if the kernel wants to discard the buffer.
598ecca949aSMatthew Dillon 		 *
599ecca949aSMatthew Dillon 		 * We can steal the structure's ownership of the bp.
60010a5d1baSMatthew Dillon 		 */
601ecca949aSMatthew Dillon 		io->released = 1;
6029f5097dcSMatthew Dillon 		if (bp->b_flags & B_LOCKED) {
603ff66f880STomohiro Kusumi 			hammer_io_disassociate(io);
604ecca949aSMatthew Dillon 			/* return the bp */
6059f5097dcSMatthew Dillon 		} else {
606cebe9493SMatthew Dillon 			if (io->reclaim) {
607ff66f880STomohiro Kusumi 				hammer_io_disassociate(io);
608ecca949aSMatthew Dillon 				/* return the bp */
609cebe9493SMatthew Dillon 			} else {
610ecca949aSMatthew Dillon 				/* return the bp (bp passively associated) */
6119f5097dcSMatthew Dillon 			}
612cebe9493SMatthew Dillon 		}
61319b97e01SMatthew Dillon 	} else {
61419b97e01SMatthew Dillon 		/*
615af209b0fSMatthew Dillon 		 * A released buffer is passively associate with our
616af209b0fSMatthew Dillon 		 * hammer_io structure.  The kernel cannot destroy it
617af209b0fSMatthew Dillon 		 * without making a bioops call.  If the kernel (B_LOCKED)
618af209b0fSMatthew Dillon 		 * or we (reclaim) requested that the buffer be destroyed
619af209b0fSMatthew Dillon 		 * we destroy it, otherwise we do a quick get/release to
620af209b0fSMatthew Dillon 		 * reset its position in the kernel's LRU list.
621af209b0fSMatthew Dillon 		 *
622af209b0fSMatthew Dillon 		 * Leaving the buffer passively associated allows us to
623af209b0fSMatthew Dillon 		 * use the kernel's LRU buffer flushing mechanisms rather
624af209b0fSMatthew Dillon 		 * then rolling our own.
625cb51be26SMatthew Dillon 		 *
626cb51be26SMatthew Dillon 		 * XXX there are two ways of doing this.  We can re-acquire
627cb51be26SMatthew Dillon 		 * and passively release to reset the LRU, or not.
62819b97e01SMatthew Dillon 		 */
629af209b0fSMatthew Dillon 		if (io->running == 0) {
63019b97e01SMatthew Dillon 			regetblk(bp);
631cebe9493SMatthew Dillon 			if ((bp->b_flags & B_LOCKED) || io->reclaim) {
632ff66f880STomohiro Kusumi 				hammer_io_disassociate(io);
633ecca949aSMatthew Dillon 				/* return the bp */
6349f5097dcSMatthew Dillon 			} else {
635ecca949aSMatthew Dillon 				/* return the bp (bp passively associated) */
636ecca949aSMatthew Dillon 			}
637ecca949aSMatthew Dillon 		} else {
638ecca949aSMatthew Dillon 			/*
639ecca949aSMatthew Dillon 			 * bp is left passively associated but we do not
640ecca949aSMatthew Dillon 			 * try to reacquire it.  Interactions with the io
641ecca949aSMatthew Dillon 			 * structure will occur on completion of the bp's
642ecca949aSMatthew Dillon 			 * I/O.
643ecca949aSMatthew Dillon 			 */
644ecca949aSMatthew Dillon 			bp = NULL;
64519b97e01SMatthew Dillon 		}
6469f5097dcSMatthew Dillon 	}
647ecca949aSMatthew Dillon 	return(bp);
648055f5ff8SMatthew Dillon }
649055f5ff8SMatthew Dillon 
650055f5ff8SMatthew Dillon /*
651b33e2cc0SMatthew Dillon  * This routine is called with a locked IO when a flush is desired and
652b33e2cc0SMatthew Dillon  * no other references to the structure exists other then ours.  This
653b33e2cc0SMatthew Dillon  * routine is ONLY called when HAMMER believes it is safe to flush a
654b33e2cc0SMatthew Dillon  * potentially modified buffer out.
65577912481SMatthew Dillon  *
65677912481SMatthew Dillon  * The locked io or io reference prevents a flush from being initiated
65777912481SMatthew Dillon  * by the kernel.
6580b075555SMatthew Dillon  */
6590b075555SMatthew Dillon void
660710733a6SMatthew Dillon hammer_io_flush(struct hammer_io *io, int reclaim)
6610b075555SMatthew Dillon {
662055f5ff8SMatthew Dillon 	struct buf *bp;
66377912481SMatthew Dillon 	hammer_mount_t hmp;
664055f5ff8SMatthew Dillon 
665055f5ff8SMatthew Dillon 	/*
66610a5d1baSMatthew Dillon 	 * Degenerate case - nothing to flush if nothing is dirty.
667055f5ff8SMatthew Dillon 	 */
668b0aab9b9SMatthew Dillon 	if (io->modified == 0)
669055f5ff8SMatthew Dillon 		return;
670055f5ff8SMatthew Dillon 
671055f5ff8SMatthew Dillon 	KKASSERT(io->bp);
6729f5097dcSMatthew Dillon 	KKASSERT(io->modify_refs <= 0);
673055f5ff8SMatthew Dillon 
674b33e2cc0SMatthew Dillon 	/*
67577062c8aSMatthew Dillon 	 * Acquire ownership of the bp, particularly before we clear our
67677062c8aSMatthew Dillon 	 * modified flag.
67777062c8aSMatthew Dillon 	 *
67877062c8aSMatthew Dillon 	 * We are going to bawrite() this bp.  Don't leave a window where
67977062c8aSMatthew Dillon 	 * io->released is set, we actually own the bp rather then our
68077062c8aSMatthew Dillon 	 * buffer.
681b0aab9b9SMatthew Dillon 	 *
682b0aab9b9SMatthew Dillon 	 * The io_token should not be required here as only
68377062c8aSMatthew Dillon 	 */
68477912481SMatthew Dillon 	hmp = io->hmp;
68577062c8aSMatthew Dillon 	bp = io->bp;
68677062c8aSMatthew Dillon 	if (io->released) {
68777062c8aSMatthew Dillon 		regetblk(bp);
68877062c8aSMatthew Dillon 		/* BUF_KERNPROC(io->bp); */
68977062c8aSMatthew Dillon 		/* io->released = 0; */
69077062c8aSMatthew Dillon 		KKASSERT(io->released);
69177062c8aSMatthew Dillon 		KKASSERT(io->bp == bp);
692b0aab9b9SMatthew Dillon 	} else {
69377062c8aSMatthew Dillon 		io->released = 1;
694b0aab9b9SMatthew Dillon 	}
69577062c8aSMatthew Dillon 
696710733a6SMatthew Dillon 	if (reclaim) {
697710733a6SMatthew Dillon 		io->reclaim = 1;
698710733a6SMatthew Dillon 		if ((bp->b_flags & B_LOCKED) == 0) {
699710733a6SMatthew Dillon 			bp->b_flags |= B_LOCKED;
700b0aab9b9SMatthew Dillon 			atomic_add_int(&hammer_count_io_locked, 1);
701710733a6SMatthew Dillon 		}
702710733a6SMatthew Dillon 	}
703710733a6SMatthew Dillon 
70477062c8aSMatthew Dillon 	/*
70510a5d1baSMatthew Dillon 	 * Acquire exclusive access to the bp and then clear the modified
70610a5d1baSMatthew Dillon 	 * state of the buffer prior to issuing I/O to interlock any
70710a5d1baSMatthew Dillon 	 * modifications made while the I/O is in progress.  This shouldn't
70810a5d1baSMatthew Dillon 	 * happen anyway but losing data would be worse.  The modified bit
70910a5d1baSMatthew Dillon 	 * will be rechecked after the IO completes.
71010a5d1baSMatthew Dillon 	 *
7114a2796f3SMatthew Dillon 	 * NOTE: This call also finalizes the buffer's content (inval == 0).
7124a2796f3SMatthew Dillon 	 *
713b33e2cc0SMatthew Dillon 	 * This is only legal when lock.refs == 1 (otherwise we might clear
714b33e2cc0SMatthew Dillon 	 * the modified bit while there are still users of the cluster
715b33e2cc0SMatthew Dillon 	 * modifying the data).
716b33e2cc0SMatthew Dillon 	 *
717b33e2cc0SMatthew Dillon 	 * Do this before potentially blocking so any attempt to modify the
718b33e2cc0SMatthew Dillon 	 * ondisk while we are blocked blocks waiting for us.
719b33e2cc0SMatthew Dillon 	 */
7205c8d05e2SMatthew Dillon 	hammer_ref(&io->lock);
7214a2796f3SMatthew Dillon 	hammer_io_clear_modify(io, 0);
722250aec18SMatthew Dillon 	hammer_rel(&io->lock);
723bcac4bbbSMatthew Dillon 
7246367d0f9SMatthew Dillon 	if (hammer_debug_io & 0x0002)
72511605a5cSTomohiro Kusumi 		hdkprintf("%016jx\n", bp->b_bio1.bio_offset);
7266367d0f9SMatthew Dillon 
727bcac4bbbSMatthew Dillon 	/*
72810a5d1baSMatthew Dillon 	 * Transfer ownership to the kernel and initiate I/O.
729b0aab9b9SMatthew Dillon 	 *
730b0aab9b9SMatthew Dillon 	 * NOTE: We do not hold io_token so an atomic op is required to
731b0aab9b9SMatthew Dillon 	 *	 update io_running_space.
73210a5d1baSMatthew Dillon 	 */
733055f5ff8SMatthew Dillon 	io->running = 1;
7343583bbb4SMatthew Dillon 	atomic_add_long(&hmp->io_running_space, io->bytes);
7353583bbb4SMatthew Dillon 	atomic_add_long(&hammer_count_io_running_write, io->bytes);
73677912481SMatthew Dillon 	lwkt_gettoken(&hmp->io_token);
73777912481SMatthew Dillon 	TAILQ_INSERT_TAIL(&hmp->iorun_list, io, iorun_entry);
73877912481SMatthew Dillon 	lwkt_reltoken(&hmp->io_token);
7399de13b88SMatthew Dillon 	cluster_awrite(bp);
740748efb59SMatthew Dillon 	hammer_io_flush_mark(io->volume);
741055f5ff8SMatthew Dillon }
742055f5ff8SMatthew Dillon 
743055f5ff8SMatthew Dillon /************************************************************************
744055f5ff8SMatthew Dillon  *				BUFFER DIRTYING				*
745055f5ff8SMatthew Dillon  ************************************************************************
746055f5ff8SMatthew Dillon  *
747055f5ff8SMatthew Dillon  * These routines deal with dependancies created when IO buffers get
748055f5ff8SMatthew Dillon  * modified.  The caller must call hammer_modify_*() on a referenced
749055f5ff8SMatthew Dillon  * HAMMER structure prior to modifying its on-disk data.
750055f5ff8SMatthew Dillon  *
751055f5ff8SMatthew Dillon  * Any intent to modify an IO buffer acquires the related bp and imposes
752055f5ff8SMatthew Dillon  * various write ordering dependancies.
753055f5ff8SMatthew Dillon  */
754055f5ff8SMatthew Dillon 
755055f5ff8SMatthew Dillon /*
75610a5d1baSMatthew Dillon  * Mark a HAMMER structure as undergoing modification.  Meta-data buffers
75710a5d1baSMatthew Dillon  * are locked until the flusher can deal with them, pure data buffers
75810a5d1baSMatthew Dillon  * can be written out.
75977912481SMatthew Dillon  *
76077912481SMatthew Dillon  * The referenced io prevents races.
761055f5ff8SMatthew Dillon  */
76210a5d1baSMatthew Dillon static
763b58c6388SMatthew Dillon void
76410a5d1baSMatthew Dillon hammer_io_modify(hammer_io_t io, int count)
765055f5ff8SMatthew Dillon {
76646fe7ae1SMatthew Dillon 	/*
7679f5097dcSMatthew Dillon 	 * io->modify_refs must be >= 0
7689f5097dcSMatthew Dillon 	 */
7699f5097dcSMatthew Dillon 	while (io->modify_refs < 0) {
7709f5097dcSMatthew Dillon 		io->waitmod = 1;
7719f5097dcSMatthew Dillon 		tsleep(io, 0, "hmrmod", 0);
7729f5097dcSMatthew Dillon 	}
7739f5097dcSMatthew Dillon 
7749f5097dcSMatthew Dillon 	/*
77546fe7ae1SMatthew Dillon 	 * Shortcut if nothing to do.
77646fe7ae1SMatthew Dillon 	 */
777250aec18SMatthew Dillon 	KKASSERT(hammer_isactive(&io->lock) && io->bp != NULL);
77810a5d1baSMatthew Dillon 	io->modify_refs += count;
779b58c6388SMatthew Dillon 	if (io->modified && io->released == 0)
780b58c6388SMatthew Dillon 		return;
78146fe7ae1SMatthew Dillon 
78277912481SMatthew Dillon 	/*
78377912481SMatthew Dillon 	 * NOTE: It is important not to set the modified bit
78477912481SMatthew Dillon 	 *	 until after we have acquired the bp or we risk
78577912481SMatthew Dillon 	 *	 racing against checkwrite.
78677912481SMatthew Dillon 	 */
787055f5ff8SMatthew Dillon 	hammer_lock_ex(&io->lock);
788055f5ff8SMatthew Dillon 	if (io->released) {
789055f5ff8SMatthew Dillon 		regetblk(io->bp);
790055f5ff8SMatthew Dillon 		BUF_KERNPROC(io->bp);
791055f5ff8SMatthew Dillon 		io->released = 0;
79277912481SMatthew Dillon 	}
79377912481SMatthew Dillon 	if (io->modified == 0) {
79477912481SMatthew Dillon 		hammer_io_set_modlist(io);
79577912481SMatthew Dillon 		io->modified = 1;
796055f5ff8SMatthew Dillon 	}
797055f5ff8SMatthew Dillon 	hammer_unlock(&io->lock);
7980b075555SMatthew Dillon }
7990b075555SMatthew Dillon 
80010a5d1baSMatthew Dillon static __inline
80110a5d1baSMatthew Dillon void
80210a5d1baSMatthew Dillon hammer_io_modify_done(hammer_io_t io)
80310a5d1baSMatthew Dillon {
80410a5d1baSMatthew Dillon 	KKASSERT(io->modify_refs > 0);
80510a5d1baSMatthew Dillon 	--io->modify_refs;
8069f5097dcSMatthew Dillon 	if (io->modify_refs == 0 && io->waitmod) {
8079f5097dcSMatthew Dillon 		io->waitmod = 0;
8089f5097dcSMatthew Dillon 		wakeup(io);
8099f5097dcSMatthew Dillon 	}
8109f5097dcSMatthew Dillon }
8119f5097dcSMatthew Dillon 
81277912481SMatthew Dillon /*
81377912481SMatthew Dillon  * The write interlock blocks other threads trying to modify a buffer
81477912481SMatthew Dillon  * (they block in hammer_io_modify()) after us, or blocks us while other
81577912481SMatthew Dillon  * threads are in the middle of modifying a buffer.
81677912481SMatthew Dillon  *
81777912481SMatthew Dillon  * The caller also has a ref on the io, however if we are not careful
81877912481SMatthew Dillon  * we will race bioops callbacks (checkwrite).  To deal with this
81977912481SMatthew Dillon  * we must at least acquire and release the io_token, and it is probably
82077912481SMatthew Dillon  * better to hold it through the setting of modify_refs.
82177912481SMatthew Dillon  */
8229f5097dcSMatthew Dillon void
8239f5097dcSMatthew Dillon hammer_io_write_interlock(hammer_io_t io)
8249f5097dcSMatthew Dillon {
82577912481SMatthew Dillon 	hammer_mount_t hmp = io->hmp;
82677912481SMatthew Dillon 
82777912481SMatthew Dillon 	lwkt_gettoken(&hmp->io_token);
8289f5097dcSMatthew Dillon 	while (io->modify_refs != 0) {
8299f5097dcSMatthew Dillon 		io->waitmod = 1;
8309f5097dcSMatthew Dillon 		tsleep(io, 0, "hmrmod", 0);
8319f5097dcSMatthew Dillon 	}
8329f5097dcSMatthew Dillon 	io->modify_refs = -1;
83377912481SMatthew Dillon 	lwkt_reltoken(&hmp->io_token);
8349f5097dcSMatthew Dillon }
8359f5097dcSMatthew Dillon 
8369f5097dcSMatthew Dillon void
8379f5097dcSMatthew Dillon hammer_io_done_interlock(hammer_io_t io)
8389f5097dcSMatthew Dillon {
8399f5097dcSMatthew Dillon 	KKASSERT(io->modify_refs == -1);
8409f5097dcSMatthew Dillon 	io->modify_refs = 0;
8419f5097dcSMatthew Dillon 	if (io->waitmod) {
8429f5097dcSMatthew Dillon 		io->waitmod = 0;
8439f5097dcSMatthew Dillon 		wakeup(io);
8449f5097dcSMatthew Dillon 	}
84510a5d1baSMatthew Dillon }
84610a5d1baSMatthew Dillon 
8472f85fa4dSMatthew Dillon /*
8482f85fa4dSMatthew Dillon  * Caller intends to modify a volume's ondisk structure.
8492f85fa4dSMatthew Dillon  *
8502f85fa4dSMatthew Dillon  * This is only allowed if we are the flusher or we have a ref on the
8512f85fa4dSMatthew Dillon  * sync_lock.
8522f85fa4dSMatthew Dillon  */
8530b075555SMatthew Dillon void
85436f82b23SMatthew Dillon hammer_modify_volume(hammer_transaction_t trans, hammer_volume_t volume,
85536f82b23SMatthew Dillon 		     void *base, int len)
8560b075555SMatthew Dillon {
8572f85fa4dSMatthew Dillon 	KKASSERT (trans == NULL || trans->sync_lock_refs > 0);
858055f5ff8SMatthew Dillon 
8592f85fa4dSMatthew Dillon 	hammer_io_modify(&volume->io, 1);
86047197d71SMatthew Dillon 	if (len) {
86147197d71SMatthew Dillon 		intptr_t rel_offset = (intptr_t)base - (intptr_t)volume->ondisk;
86247197d71SMatthew Dillon 		KKASSERT((rel_offset & ~(intptr_t)HAMMER_BUFMASK) == 0);
86302428fb6SMatthew Dillon 		hammer_generate_undo(trans,
86447197d71SMatthew Dillon 			 HAMMER_ENCODE_RAW_VOLUME(volume->vol_no, rel_offset),
86547197d71SMatthew Dillon 			 base, len);
866055f5ff8SMatthew Dillon 	}
867055f5ff8SMatthew Dillon }
868055f5ff8SMatthew Dillon 
869055f5ff8SMatthew Dillon /*
8702f85fa4dSMatthew Dillon  * Caller intends to modify a buffer's ondisk structure.
8712f85fa4dSMatthew Dillon  *
8722f85fa4dSMatthew Dillon  * This is only allowed if we are the flusher or we have a ref on the
8732f85fa4dSMatthew Dillon  * sync_lock.
874055f5ff8SMatthew Dillon  */
875055f5ff8SMatthew Dillon void
87636f82b23SMatthew Dillon hammer_modify_buffer(hammer_transaction_t trans, hammer_buffer_t buffer,
87736f82b23SMatthew Dillon 		     void *base, int len)
87846fe7ae1SMatthew Dillon {
8792f85fa4dSMatthew Dillon 	KKASSERT (trans == NULL || trans->sync_lock_refs > 0);
8802f85fa4dSMatthew Dillon 
88110a5d1baSMatthew Dillon 	hammer_io_modify(&buffer->io, 1);
88247197d71SMatthew Dillon 	if (len) {
88347197d71SMatthew Dillon 		intptr_t rel_offset = (intptr_t)base - (intptr_t)buffer->ondisk;
88447197d71SMatthew Dillon 		KKASSERT((rel_offset & ~(intptr_t)HAMMER_BUFMASK) == 0);
88502428fb6SMatthew Dillon 		hammer_generate_undo(trans,
88634d829f7SMatthew Dillon 				     buffer->zone2_offset + rel_offset,
88747197d71SMatthew Dillon 				     base, len);
88847197d71SMatthew Dillon 	}
88946fe7ae1SMatthew Dillon }
89046fe7ae1SMatthew Dillon 
89110a5d1baSMatthew Dillon void
89210a5d1baSMatthew Dillon hammer_modify_volume_done(hammer_volume_t volume)
89310a5d1baSMatthew Dillon {
89410a5d1baSMatthew Dillon 	hammer_io_modify_done(&volume->io);
89510a5d1baSMatthew Dillon }
89610a5d1baSMatthew Dillon 
89710a5d1baSMatthew Dillon void
89810a5d1baSMatthew Dillon hammer_modify_buffer_done(hammer_buffer_t buffer)
89910a5d1baSMatthew Dillon {
90010a5d1baSMatthew Dillon 	hammer_io_modify_done(&buffer->io);
90110a5d1baSMatthew Dillon }
90210a5d1baSMatthew Dillon 
90346fe7ae1SMatthew Dillon /*
9044a2796f3SMatthew Dillon  * Mark an entity as not being dirty any more and finalize any
9054a2796f3SMatthew Dillon  * delayed adjustments to the buffer.
9064a2796f3SMatthew Dillon  *
9074a2796f3SMatthew Dillon  * Delayed adjustments are an important performance enhancement, allowing
9084a2796f3SMatthew Dillon  * us to avoid recalculating B-Tree node CRCs over and over again when
9094a2796f3SMatthew Dillon  * making bulk-modifications to the B-Tree.
9104a2796f3SMatthew Dillon  *
9114a2796f3SMatthew Dillon  * If inval is non-zero delayed adjustments are ignored.
9125c8d05e2SMatthew Dillon  *
9135c8d05e2SMatthew Dillon  * This routine may dereference related btree nodes and cause the
9145c8d05e2SMatthew Dillon  * buffer to be dereferenced.  The caller must own a reference on io.
91561aeeb33SMatthew Dillon  */
91661aeeb33SMatthew Dillon void
9174a2796f3SMatthew Dillon hammer_io_clear_modify(struct hammer_io *io, int inval)
91861aeeb33SMatthew Dillon {
91977912481SMatthew Dillon 	hammer_mount_t hmp;
92077912481SMatthew Dillon 
92177912481SMatthew Dillon 	/*
9221afb73cfSMatthew Dillon 	 * io_token is needed to avoid races on mod_root
92377912481SMatthew Dillon 	 */
9244a2796f3SMatthew Dillon 	if (io->modified == 0)
9254a2796f3SMatthew Dillon 		return;
92677912481SMatthew Dillon 	hmp = io->hmp;
92777912481SMatthew Dillon 	lwkt_gettoken(&hmp->io_token);
92877912481SMatthew Dillon 	if (io->modified == 0) {
92977912481SMatthew Dillon 		lwkt_reltoken(&hmp->io_token);
93077912481SMatthew Dillon 		return;
93177912481SMatthew Dillon 	}
9324a2796f3SMatthew Dillon 
9334a2796f3SMatthew Dillon 	/*
9344a2796f3SMatthew Dillon 	 * Take us off the mod-list and clear the modified bit.
9354a2796f3SMatthew Dillon 	 */
9361afb73cfSMatthew Dillon 	KKASSERT(io->mod_root != NULL);
9371afb73cfSMatthew Dillon 	if (io->mod_root == &io->hmp->volu_root ||
9381afb73cfSMatthew Dillon 	    io->mod_root == &io->hmp->meta_root) {
939f5a07a7aSMatthew Dillon 		io->hmp->locked_dirty_space -= io->bytes;
9403583bbb4SMatthew Dillon 		atomic_add_long(&hammer_count_dirtybufspace, -io->bytes);
941cebe9493SMatthew Dillon 	}
9421afb73cfSMatthew Dillon 	RB_REMOVE(hammer_mod_rb_tree, io->mod_root, io);
9431afb73cfSMatthew Dillon 	io->mod_root = NULL;
94461aeeb33SMatthew Dillon 	io->modified = 0;
9454a2796f3SMatthew Dillon 
94677912481SMatthew Dillon 	lwkt_reltoken(&hmp->io_token);
94777912481SMatthew Dillon 
9484a2796f3SMatthew Dillon 	/*
9494a2796f3SMatthew Dillon 	 * If this bit is not set there are no delayed adjustments.
9504a2796f3SMatthew Dillon 	 */
9514a2796f3SMatthew Dillon 	if (io->gencrc == 0)
9524a2796f3SMatthew Dillon 		return;
9534a2796f3SMatthew Dillon 	io->gencrc = 0;
9544a2796f3SMatthew Dillon 
9554a2796f3SMatthew Dillon 	/*
9564a2796f3SMatthew Dillon 	 * Finalize requested CRCs.  The NEEDSCRC flag also holds a reference
9574a2796f3SMatthew Dillon 	 * on the node (& underlying buffer).  Release the node after clearing
9584a2796f3SMatthew Dillon 	 * the flag.
9594a2796f3SMatthew Dillon 	 */
9604a2796f3SMatthew Dillon 	if (io->type == HAMMER_STRUCTURE_META_BUFFER) {
961195f6076STomohiro Kusumi 		hammer_buffer_t buffer = HAMMER_ITOB(io);
9624a2796f3SMatthew Dillon 		hammer_node_t node;
9634a2796f3SMatthew Dillon 
9644a2796f3SMatthew Dillon restart:
965c242ffecSTomohiro Kusumi 		TAILQ_FOREACH(node, &buffer->node_list, entry) {
9664a2796f3SMatthew Dillon 			if ((node->flags & HAMMER_NODE_NEEDSCRC) == 0)
9674a2796f3SMatthew Dillon 				continue;
9684a2796f3SMatthew Dillon 			node->flags &= ~HAMMER_NODE_NEEDSCRC;
9694a2796f3SMatthew Dillon 			KKASSERT(node->ondisk);
9704a2796f3SMatthew Dillon 			if (inval == 0)
971af3e4d3eSTomohiro Kusumi 				hammer_crc_set_btree(node->ondisk);
9724a2796f3SMatthew Dillon 			hammer_rel_node(node);
9734a2796f3SMatthew Dillon 			goto restart;
97461aeeb33SMatthew Dillon 		}
97561aeeb33SMatthew Dillon 	}
9765c8d05e2SMatthew Dillon 	/* caller must still have ref on io */
977250aec18SMatthew Dillon 	KKASSERT(hammer_isactive(&io->lock));
9784a2796f3SMatthew Dillon }
9794a2796f3SMatthew Dillon 
980cebe9493SMatthew Dillon /*
981cebe9493SMatthew Dillon  * Clear the IO's modify list.  Even though the IO is no longer modified
9821afb73cfSMatthew Dillon  * it may still be on the lose_root.  This routine is called just before
983cebe9493SMatthew Dillon  * the governing hammer_buffer is destroyed.
984b0aab9b9SMatthew Dillon  *
9851afb73cfSMatthew Dillon  * mod_root requires io_token protection.
986cebe9493SMatthew Dillon  */
987cebe9493SMatthew Dillon void
988cebe9493SMatthew Dillon hammer_io_clear_modlist(struct hammer_io *io)
989cebe9493SMatthew Dillon {
990b0aab9b9SMatthew Dillon 	hammer_mount_t hmp = io->hmp;
991b0aab9b9SMatthew Dillon 
9924a2796f3SMatthew Dillon 	KKASSERT(io->modified == 0);
9931afb73cfSMatthew Dillon 	if (io->mod_root) {
994b0aab9b9SMatthew Dillon 		lwkt_gettoken(&hmp->io_token);
9951afb73cfSMatthew Dillon 		if (io->mod_root) {
9961afb73cfSMatthew Dillon 			KKASSERT(io->mod_root == &io->hmp->lose_root);
9971afb73cfSMatthew Dillon 			RB_REMOVE(hammer_mod_rb_tree, io->mod_root, io);
9981afb73cfSMatthew Dillon 			io->mod_root = NULL;
999b0aab9b9SMatthew Dillon 		}
1000b0aab9b9SMatthew Dillon 		lwkt_reltoken(&hmp->io_token);
1001cebe9493SMatthew Dillon 	}
100266325755SMatthew Dillon }
100366325755SMatthew Dillon 
1004cdb6e4e6SMatthew Dillon static void
1005cdb6e4e6SMatthew Dillon hammer_io_set_modlist(struct hammer_io *io)
1006cdb6e4e6SMatthew Dillon {
1007cdb6e4e6SMatthew Dillon 	struct hammer_mount *hmp = io->hmp;
1008cdb6e4e6SMatthew Dillon 
100977912481SMatthew Dillon 	lwkt_gettoken(&hmp->io_token);
10101afb73cfSMatthew Dillon 	KKASSERT(io->mod_root == NULL);
1011cdb6e4e6SMatthew Dillon 
1012cdb6e4e6SMatthew Dillon 	switch(io->type) {
1013cdb6e4e6SMatthew Dillon 	case HAMMER_STRUCTURE_VOLUME:
10141afb73cfSMatthew Dillon 		io->mod_root = &hmp->volu_root;
1015cdb6e4e6SMatthew Dillon 		hmp->locked_dirty_space += io->bytes;
10163583bbb4SMatthew Dillon 		atomic_add_long(&hammer_count_dirtybufspace, io->bytes);
1017cdb6e4e6SMatthew Dillon 		break;
1018cdb6e4e6SMatthew Dillon 	case HAMMER_STRUCTURE_META_BUFFER:
10191afb73cfSMatthew Dillon 		io->mod_root = &hmp->meta_root;
1020cdb6e4e6SMatthew Dillon 		hmp->locked_dirty_space += io->bytes;
10213583bbb4SMatthew Dillon 		atomic_add_long(&hammer_count_dirtybufspace, io->bytes);
1022cdb6e4e6SMatthew Dillon 		break;
1023cdb6e4e6SMatthew Dillon 	case HAMMER_STRUCTURE_UNDO_BUFFER:
10241afb73cfSMatthew Dillon 		io->mod_root = &hmp->undo_root;
1025cdb6e4e6SMatthew Dillon 		break;
1026cdb6e4e6SMatthew Dillon 	case HAMMER_STRUCTURE_DATA_BUFFER:
10271afb73cfSMatthew Dillon 		io->mod_root = &hmp->data_root;
1028cdb6e4e6SMatthew Dillon 		break;
1029eddadaeeSMatthew Dillon 	case HAMMER_STRUCTURE_DUMMY:
1030903fdd05STomohiro Kusumi 		hpanic("bad io type");
10311afb73cfSMatthew Dillon 		break; /* NOT REACHED */
1032cdb6e4e6SMatthew Dillon 	}
10331afb73cfSMatthew Dillon 	if (RB_INSERT(hammer_mod_rb_tree, io->mod_root, io)) {
1034b49481fbSTomohiro Kusumi 		hpanic("duplicate entry @ %d:%015jx",
1035b49481fbSTomohiro Kusumi 			io->volume->vol_no, io->offset);
10361afb73cfSMatthew Dillon 		/* NOT REACHED */
10371afb73cfSMatthew Dillon 	}
103877912481SMatthew Dillon 	lwkt_reltoken(&hmp->io_token);
1039cdb6e4e6SMatthew Dillon }
1040cdb6e4e6SMatthew Dillon 
1041055f5ff8SMatthew Dillon /************************************************************************
1042055f5ff8SMatthew Dillon  *				HAMMER_BIOOPS				*
1043055f5ff8SMatthew Dillon  ************************************************************************
1044055f5ff8SMatthew Dillon  *
1045055f5ff8SMatthew Dillon  */
1046055f5ff8SMatthew Dillon 
1047055f5ff8SMatthew Dillon /*
1048055f5ff8SMatthew Dillon  * Pre-IO initiation kernel callback - cluster build only
1049b0aab9b9SMatthew Dillon  *
1050b0aab9b9SMatthew Dillon  * bioops callback - hold io_token
1051055f5ff8SMatthew Dillon  */
1052055f5ff8SMatthew Dillon static void
1053055f5ff8SMatthew Dillon hammer_io_start(struct buf *bp)
1054055f5ff8SMatthew Dillon {
1055b0aab9b9SMatthew Dillon 	/* nothing to do, so io_token not needed */
1056055f5ff8SMatthew Dillon }
1057055f5ff8SMatthew Dillon 
1058055f5ff8SMatthew Dillon /*
10597bc5b8c2SMatthew Dillon  * Post-IO completion kernel callback - MAY BE CALLED FROM INTERRUPT!
1060b33e2cc0SMatthew Dillon  *
106177912481SMatthew Dillon  * NOTE: HAMMER may modify a data buffer after we have initiated write
106277912481SMatthew Dillon  *	 I/O.
106377912481SMatthew Dillon  *
106477912481SMatthew Dillon  * NOTE: MPSAFE callback
1065b0aab9b9SMatthew Dillon  *
1066b0aab9b9SMatthew Dillon  * bioops callback - hold io_token
1067055f5ff8SMatthew Dillon  */
106866325755SMatthew Dillon static void
106966325755SMatthew Dillon hammer_io_complete(struct buf *bp)
107066325755SMatthew Dillon {
10710ae73f43STomohiro Kusumi 	hammer_io_t io = hammer_buf_peek_io(bp);
1072ff66f880STomohiro Kusumi 	struct hammer_mount *hmp = io->hmp;
1073eddadaeeSMatthew Dillon 	struct hammer_io *ionext;
1074fbc6e32aSMatthew Dillon 
1075b0aab9b9SMatthew Dillon 	lwkt_gettoken(&hmp->io_token);
1076b0aab9b9SMatthew Dillon 
1077ff66f880STomohiro Kusumi 	KKASSERT(io->released == 1);
1078055f5ff8SMatthew Dillon 
1079bf3b416bSMatthew Dillon 	/*
1080bf3b416bSMatthew Dillon 	 * Deal with people waiting for I/O to drain
1081bf3b416bSMatthew Dillon 	 */
1082ff66f880STomohiro Kusumi 	if (io->running) {
1083cdb6e4e6SMatthew Dillon 		/*
1084cdb6e4e6SMatthew Dillon 		 * Deal with critical write errors.  Once a critical error
1085cdb6e4e6SMatthew Dillon 		 * has been flagged in hmp the UNDO FIFO will not be updated.
1086cdb6e4e6SMatthew Dillon 		 * That way crash recover will give us a consistent
1087cdb6e4e6SMatthew Dillon 		 * filesystem.
1088cdb6e4e6SMatthew Dillon 		 *
1089cdb6e4e6SMatthew Dillon 		 * Because of this we can throw away failed UNDO buffers.  If
1090cdb6e4e6SMatthew Dillon 		 * we throw away META or DATA buffers we risk corrupting
1091cdb6e4e6SMatthew Dillon 		 * the now read-only version of the filesystem visible to
1092cdb6e4e6SMatthew Dillon 		 * the user.  Clear B_ERROR so the buffer is not re-dirtied
1093cdb6e4e6SMatthew Dillon 		 * by the kernel and ref the io so it doesn't get thrown
1094cdb6e4e6SMatthew Dillon 		 * away.
1095cdb6e4e6SMatthew Dillon 		 */
1096cdb6e4e6SMatthew Dillon 		if (bp->b_flags & B_ERROR) {
109777912481SMatthew Dillon 			lwkt_gettoken(&hmp->fs_token);
1098ba298df1SMatthew Dillon 			hammer_critical_error(hmp, NULL, bp->b_error,
1099cdb6e4e6SMatthew Dillon 					      "while flushing meta-data");
110077912481SMatthew Dillon 			lwkt_reltoken(&hmp->fs_token);
110177912481SMatthew Dillon 
1102ff66f880STomohiro Kusumi 			switch(io->type) {
1103cdb6e4e6SMatthew Dillon 			case HAMMER_STRUCTURE_UNDO_BUFFER:
1104cdb6e4e6SMatthew Dillon 				break;
1105cdb6e4e6SMatthew Dillon 			default:
1106ff66f880STomohiro Kusumi 				if (io->ioerror == 0) {
1107ff66f880STomohiro Kusumi 					io->ioerror = 1;
1108ff66f880STomohiro Kusumi 					hammer_ref(&io->lock);
1109cdb6e4e6SMatthew Dillon 				}
1110cdb6e4e6SMatthew Dillon 				break;
1111cdb6e4e6SMatthew Dillon 			}
1112cdb6e4e6SMatthew Dillon 			bp->b_flags &= ~B_ERROR;
1113cdb6e4e6SMatthew Dillon 			bundirty(bp);
1114cdb6e4e6SMatthew Dillon #if 0
1115ff66f880STomohiro Kusumi 			hammer_io_set_modlist(io);
1116ff66f880STomohiro Kusumi 			io->modified = 1;
1117cdb6e4e6SMatthew Dillon #endif
1118cdb6e4e6SMatthew Dillon 		}
1119ff66f880STomohiro Kusumi 		hammer_stats_disk_write += io->bytes;
1120ff66f880STomohiro Kusumi 		atomic_add_long(&hammer_count_io_running_write, -io->bytes);
1121ff66f880STomohiro Kusumi 		atomic_add_long(&hmp->io_running_space, -io->bytes);
1122ba298df1SMatthew Dillon 		KKASSERT(hmp->io_running_space >= 0);
1123ff66f880STomohiro Kusumi 		io->running = 0;
1124eddadaeeSMatthew Dillon 
1125eddadaeeSMatthew Dillon 		/*
1126eddadaeeSMatthew Dillon 		 * Remove from iorun list and wakeup any multi-io waiter(s).
1127eddadaeeSMatthew Dillon 		 */
1128ff66f880STomohiro Kusumi 		if (TAILQ_FIRST(&hmp->iorun_list) == io) {
1129ff66f880STomohiro Kusumi 			ionext = TAILQ_NEXT(io, iorun_entry);
1130eddadaeeSMatthew Dillon 			if (ionext && ionext->type == HAMMER_STRUCTURE_DUMMY)
1131eddadaeeSMatthew Dillon 				wakeup(ionext);
1132eddadaeeSMatthew Dillon 		}
1133ff66f880STomohiro Kusumi 		TAILQ_REMOVE(&hmp->iorun_list, io, iorun_entry);
1134ce0138a6SMatthew Dillon 	} else {
1135ff66f880STomohiro Kusumi 		hammer_stats_disk_read += io->bytes;
1136f90dde4cSMatthew Dillon 	}
1137f90dde4cSMatthew Dillon 
1138ff66f880STomohiro Kusumi 	if (io->waiting) {
1139ff66f880STomohiro Kusumi 		io->waiting = 0;
1140ff66f880STomohiro Kusumi 		wakeup(io);
1141055f5ff8SMatthew Dillon 	}
1142055f5ff8SMatthew Dillon 
1143055f5ff8SMatthew Dillon 	/*
1144bf3b416bSMatthew Dillon 	 * If B_LOCKED is set someone wanted to deallocate the bp at some
1145250aec18SMatthew Dillon 	 * point, try to do it now.  The operation will fail if there are
1146250aec18SMatthew Dillon 	 * refs or if hammer_io_deallocate() is unable to gain the
1147250aec18SMatthew Dillon 	 * interlock.
1148055f5ff8SMatthew Dillon 	 */
1149250aec18SMatthew Dillon 	if (bp->b_flags & B_LOCKED) {
1150b0aab9b9SMatthew Dillon 		atomic_add_int(&hammer_count_io_locked, -1);
1151d5ef456eSMatthew Dillon 		bp->b_flags &= ~B_LOCKED;
1152055f5ff8SMatthew Dillon 		hammer_io_deallocate(bp);
1153055f5ff8SMatthew Dillon 		/* structure may be dead now */
1154fbc6e32aSMatthew Dillon 	}
1155b0aab9b9SMatthew Dillon 	lwkt_reltoken(&hmp->io_token);
115666325755SMatthew Dillon }
115766325755SMatthew Dillon 
115866325755SMatthew Dillon /*
115966325755SMatthew Dillon  * Callback from kernel when it wishes to deallocate a passively
116010a5d1baSMatthew Dillon  * associated structure.  This mostly occurs with clean buffers
116110a5d1baSMatthew Dillon  * but it may be possible for a holding structure to be marked dirty
11627bc5b8c2SMatthew Dillon  * while its buffer is passively associated.  The caller owns the bp.
116366325755SMatthew Dillon  *
116466325755SMatthew Dillon  * If we cannot disassociate we set B_LOCKED to prevent the buffer
116566325755SMatthew Dillon  * from getting reused.
116646fe7ae1SMatthew Dillon  *
116746fe7ae1SMatthew Dillon  * WARNING: Because this can be called directly by getnewbuf we cannot
116846fe7ae1SMatthew Dillon  * recurse into the tree.  If a bp cannot be immediately disassociated
116946fe7ae1SMatthew Dillon  * our only recourse is to set B_LOCKED.
11707bc5b8c2SMatthew Dillon  *
11717bc5b8c2SMatthew Dillon  * WARNING: This may be called from an interrupt via hammer_io_complete()
1172b0aab9b9SMatthew Dillon  *
1173b0aab9b9SMatthew Dillon  * bioops callback - hold io_token
117466325755SMatthew Dillon  */
117566325755SMatthew Dillon static void
117666325755SMatthew Dillon hammer_io_deallocate(struct buf *bp)
117766325755SMatthew Dillon {
11780ae73f43STomohiro Kusumi 	hammer_io_t io = hammer_buf_peek_io(bp);
1179b0aab9b9SMatthew Dillon 	hammer_mount_t hmp;
1180b0aab9b9SMatthew Dillon 
1181ff66f880STomohiro Kusumi 	hmp = io->hmp;
1182b0aab9b9SMatthew Dillon 
1183b0aab9b9SMatthew Dillon 	lwkt_gettoken(&hmp->io_token);
118466325755SMatthew Dillon 
1185ff66f880STomohiro Kusumi 	KKASSERT((bp->b_flags & B_LOCKED) == 0 && io->running == 0);
1186ff66f880STomohiro Kusumi 	if (hammer_try_interlock_norefs(&io->lock) == 0) {
1187250aec18SMatthew Dillon 		/*
1188250aec18SMatthew Dillon 		 * We cannot safely disassociate a bp from a referenced
1189250aec18SMatthew Dillon 		 * or interlocked HAMMER structure.
1190250aec18SMatthew Dillon 		 */
1191250aec18SMatthew Dillon 		bp->b_flags |= B_LOCKED;
1192b0aab9b9SMatthew Dillon 		atomic_add_int(&hammer_count_io_locked, 1);
1193ff66f880STomohiro Kusumi 	} else if (io->modified) {
119410a5d1baSMatthew Dillon 		/*
119510a5d1baSMatthew Dillon 		 * It is not legal to disassociate a modified buffer.  This
119610a5d1baSMatthew Dillon 		 * case really shouldn't ever occur.
119710a5d1baSMatthew Dillon 		 */
1198055f5ff8SMatthew Dillon 		bp->b_flags |= B_LOCKED;
1199b0aab9b9SMatthew Dillon 		atomic_add_int(&hammer_count_io_locked, 1);
1200ff66f880STomohiro Kusumi 		hammer_put_interlock(&io->lock, 0);
1201055f5ff8SMatthew Dillon 	} else {
120210a5d1baSMatthew Dillon 		/*
120310a5d1baSMatthew Dillon 		 * Disassociate the BP.  If the io has no refs left we
1204b0aab9b9SMatthew Dillon 		 * have to add it to the loose list.  The kernel has
1205b0aab9b9SMatthew Dillon 		 * locked the buffer and therefore our io must be
1206b0aab9b9SMatthew Dillon 		 * in a released state.
120710a5d1baSMatthew Dillon 		 */
1208ff66f880STomohiro Kusumi 		hammer_io_disassociate(io);
1209ff66f880STomohiro Kusumi 		if (io->type != HAMMER_STRUCTURE_VOLUME) {
1210ff66f880STomohiro Kusumi 			KKASSERT(io->bp == NULL);
1211ff66f880STomohiro Kusumi 			KKASSERT(io->mod_root == NULL);
1212ff66f880STomohiro Kusumi 			io->mod_root = &hmp->lose_root;
1213b49481fbSTomohiro Kusumi 			if (RB_INSERT(hammer_mod_rb_tree, io->mod_root, io)) {
1214b49481fbSTomohiro Kusumi 				hpanic("duplicate entry @ %d:%015jx",
1215b49481fbSTomohiro Kusumi 					io->volume->vol_no, io->offset);
1216b49481fbSTomohiro Kusumi 				/* NOT REACHED */
1217b49481fbSTomohiro Kusumi 			}
12181afb73cfSMatthew Dillon 		}
1219ff66f880STomohiro Kusumi 		hammer_put_interlock(&io->lock, 1);
122066325755SMatthew Dillon 	}
1221b0aab9b9SMatthew Dillon 	lwkt_reltoken(&hmp->io_token);
122266325755SMatthew Dillon }
122366325755SMatthew Dillon 
1224b0aab9b9SMatthew Dillon /*
1225b0aab9b9SMatthew Dillon  * bioops callback - hold io_token
1226b0aab9b9SMatthew Dillon  */
122766325755SMatthew Dillon static int
122866325755SMatthew Dillon hammer_io_fsync(struct vnode *vp)
122966325755SMatthew Dillon {
1230b0aab9b9SMatthew Dillon 	/* nothing to do, so io_token not needed */
123166325755SMatthew Dillon 	return(0);
123266325755SMatthew Dillon }
123366325755SMatthew Dillon 
123466325755SMatthew Dillon /*
123566325755SMatthew Dillon  * NOTE: will not be called unless we tell the kernel about the
123666325755SMatthew Dillon  * bioops.  Unused... we use the mount's VFS_SYNC instead.
1237b0aab9b9SMatthew Dillon  *
1238b0aab9b9SMatthew Dillon  * bioops callback - hold io_token
123966325755SMatthew Dillon  */
124066325755SMatthew Dillon static int
124166325755SMatthew Dillon hammer_io_sync(struct mount *mp)
124266325755SMatthew Dillon {
1243b0aab9b9SMatthew Dillon 	/* nothing to do, so io_token not needed */
124466325755SMatthew Dillon 	return(0);
124566325755SMatthew Dillon }
124666325755SMatthew Dillon 
1247b0aab9b9SMatthew Dillon /*
1248b0aab9b9SMatthew Dillon  * bioops callback - hold io_token
1249b0aab9b9SMatthew Dillon  */
125066325755SMatthew Dillon static void
125166325755SMatthew Dillon hammer_io_movedeps(struct buf *bp1, struct buf *bp2)
125266325755SMatthew Dillon {
1253b0aab9b9SMatthew Dillon 	/* nothing to do, so io_token not needed */
125466325755SMatthew Dillon }
125566325755SMatthew Dillon 
125666325755SMatthew Dillon /*
125766325755SMatthew Dillon  * I/O pre-check for reading and writing.  HAMMER only uses this for
125866325755SMatthew Dillon  * B_CACHE buffers so checkread just shouldn't happen, but if it does
125966325755SMatthew Dillon  * allow it.
126066325755SMatthew Dillon  *
1261fbc6e32aSMatthew Dillon  * Writing is a different case.  We don't want the kernel to try to write
1262fbc6e32aSMatthew Dillon  * out a buffer that HAMMER may be modifying passively or which has a
126310a5d1baSMatthew Dillon  * dependancy.  In addition, kernel-demanded writes can only proceed for
126410a5d1baSMatthew Dillon  * certain types of buffers (i.e. UNDO and DATA types).  Other dirty
126510a5d1baSMatthew Dillon  * buffer types can only be explicitly written by the flusher.
1266fbc6e32aSMatthew Dillon  *
126710a5d1baSMatthew Dillon  * checkwrite will only be called for bdwrite()n buffers.  If we return
126810a5d1baSMatthew Dillon  * success the kernel is guaranteed to initiate the buffer write.
1269b0aab9b9SMatthew Dillon  *
1270b0aab9b9SMatthew Dillon  * bioops callback - hold io_token
127166325755SMatthew Dillon  */
127266325755SMatthew Dillon static int
127366325755SMatthew Dillon hammer_io_checkread(struct buf *bp)
127466325755SMatthew Dillon {
1275b0aab9b9SMatthew Dillon 	/* nothing to do, so io_token not needed */
127666325755SMatthew Dillon 	return(0);
127766325755SMatthew Dillon }
127866325755SMatthew Dillon 
1279b0aab9b9SMatthew Dillon /*
128077912481SMatthew Dillon  * The kernel is asking us whether it can write out a dirty buffer or not.
128177912481SMatthew Dillon  *
1282b0aab9b9SMatthew Dillon  * bioops callback - hold io_token
1283b0aab9b9SMatthew Dillon  */
128466325755SMatthew Dillon static int
128566325755SMatthew Dillon hammer_io_checkwrite(struct buf *bp)
128666325755SMatthew Dillon {
12870ae73f43STomohiro Kusumi 	hammer_io_t io = hammer_buf_peek_io(bp);
1288b0aab9b9SMatthew Dillon 	hammer_mount_t hmp = io->hmp;
128966325755SMatthew Dillon 
129077062c8aSMatthew Dillon 	/*
129177062c8aSMatthew Dillon 	 * This shouldn't happen under normal operation.
129277062c8aSMatthew Dillon 	 */
1293b0aab9b9SMatthew Dillon 	lwkt_gettoken(&hmp->io_token);
129477062c8aSMatthew Dillon 	if (io->type == HAMMER_STRUCTURE_VOLUME ||
129577062c8aSMatthew Dillon 	    io->type == HAMMER_STRUCTURE_META_BUFFER) {
129677062c8aSMatthew Dillon 		if (!panicstr)
1297903fdd05STomohiro Kusumi 			hpanic("illegal buffer");
1298a99b9ea2SMatthew Dillon 		if ((bp->b_flags & B_LOCKED) == 0) {
129977062c8aSMatthew Dillon 			bp->b_flags |= B_LOCKED;
1300b0aab9b9SMatthew Dillon 			atomic_add_int(&hammer_count_io_locked, 1);
1301a99b9ea2SMatthew Dillon 		}
1302b0aab9b9SMatthew Dillon 		lwkt_reltoken(&hmp->io_token);
130377062c8aSMatthew Dillon 		return(1);
130477062c8aSMatthew Dillon 	}
1305c9b9e29dSMatthew Dillon 
1306fbc6e32aSMatthew Dillon 	/*
130777912481SMatthew Dillon 	 * We have to be able to interlock the IO to safely modify any
130877912481SMatthew Dillon 	 * of its fields without holding the fs_token.  If we can't lock
130977912481SMatthew Dillon 	 * it then we are racing someone.
131077912481SMatthew Dillon 	 *
131177912481SMatthew Dillon 	 * Our ownership of the bp lock prevents the io from being ripped
131277912481SMatthew Dillon 	 * out from under us.
131377912481SMatthew Dillon 	 */
131477912481SMatthew Dillon 	if (hammer_try_interlock_norefs(&io->lock) == 0) {
131577912481SMatthew Dillon 		bp->b_flags |= B_LOCKED;
131677912481SMatthew Dillon 		atomic_add_int(&hammer_count_io_locked, 1);
131777912481SMatthew Dillon 		lwkt_reltoken(&hmp->io_token);
131877912481SMatthew Dillon 		return(1);
131977912481SMatthew Dillon 	}
132077912481SMatthew Dillon 
132177912481SMatthew Dillon 	/*
132277912481SMatthew Dillon 	 * The modified bit must be cleared prior to the initiation of
132377912481SMatthew Dillon 	 * any IO (returning 0 initiates the IO).  Because this is a
132477912481SMatthew Dillon 	 * normal data buffer hammer_io_clear_modify() runs through a
132577912481SMatthew Dillon 	 * simple degenerate case.
132677912481SMatthew Dillon 	 *
132777912481SMatthew Dillon 	 * Return 0 will cause the kernel to initiate the IO, and we
132877912481SMatthew Dillon 	 * must normally clear the modified bit before we begin.  If
132977912481SMatthew Dillon 	 * the io has modify_refs we do not clear the modified bit,
133077912481SMatthew Dillon 	 * otherwise we may miss changes.
13315c8d05e2SMatthew Dillon 	 *
13325c8d05e2SMatthew Dillon 	 * Only data and undo buffers can reach here.  These buffers do
13335c8d05e2SMatthew Dillon 	 * not have terminal crc functions but we temporarily reference
13345c8d05e2SMatthew Dillon 	 * the IO anyway, just in case.
1335b33e2cc0SMatthew Dillon 	 */
13365c8d05e2SMatthew Dillon 	if (io->modify_refs == 0 && io->modified) {
13375c8d05e2SMatthew Dillon 		hammer_ref(&io->lock);
13384a2796f3SMatthew Dillon 		hammer_io_clear_modify(io, 0);
1339250aec18SMatthew Dillon 		hammer_rel(&io->lock);
13405c8d05e2SMatthew Dillon 	} else if (io->modified) {
13415c8d05e2SMatthew Dillon 		KKASSERT(io->type == HAMMER_STRUCTURE_DATA_BUFFER);
13425c8d05e2SMatthew Dillon 	}
1343f90dde4cSMatthew Dillon 
1344f90dde4cSMatthew Dillon 	/*
1345f90dde4cSMatthew Dillon 	 * The kernel is going to start the IO, set io->running.
1346f90dde4cSMatthew Dillon 	 */
1347f90dde4cSMatthew Dillon 	KKASSERT(io->running == 0);
1348f90dde4cSMatthew Dillon 	io->running = 1;
13493583bbb4SMatthew Dillon 	atomic_add_long(&io->hmp->io_running_space, io->bytes);
13503583bbb4SMatthew Dillon 	atomic_add_long(&hammer_count_io_running_write, io->bytes);
1351eddadaeeSMatthew Dillon 	TAILQ_INSERT_TAIL(&io->hmp->iorun_list, io, iorun_entry);
1352b0aab9b9SMatthew Dillon 
135377912481SMatthew Dillon 	hammer_put_interlock(&io->lock, 1);
1354b0aab9b9SMatthew Dillon 	lwkt_reltoken(&hmp->io_token);
1355b0aab9b9SMatthew Dillon 
1356055f5ff8SMatthew Dillon 	return(0);
1357055f5ff8SMatthew Dillon }
135866325755SMatthew Dillon 
13598cd0a023SMatthew Dillon /*
136066325755SMatthew Dillon  * Return non-zero if we wish to delay the kernel's attempt to flush
136166325755SMatthew Dillon  * this buffer to disk.
1362b0aab9b9SMatthew Dillon  *
1363b0aab9b9SMatthew Dillon  * bioops callback - hold io_token
136466325755SMatthew Dillon  */
136566325755SMatthew Dillon static int
136666325755SMatthew Dillon hammer_io_countdeps(struct buf *bp, int n)
136766325755SMatthew Dillon {
1368b0aab9b9SMatthew Dillon 	/* nothing to do, so io_token not needed */
136966325755SMatthew Dillon 	return(0);
137066325755SMatthew Dillon }
137166325755SMatthew Dillon 
1372e397030bSTomohiro Kusumi static struct bio_ops hammer_bioops = {
137366325755SMatthew Dillon 	.io_start	= hammer_io_start,
137466325755SMatthew Dillon 	.io_complete	= hammer_io_complete,
137566325755SMatthew Dillon 	.io_deallocate	= hammer_io_deallocate,
137666325755SMatthew Dillon 	.io_fsync	= hammer_io_fsync,
137766325755SMatthew Dillon 	.io_sync	= hammer_io_sync,
137866325755SMatthew Dillon 	.io_movedeps	= hammer_io_movedeps,
137966325755SMatthew Dillon 	.io_countdeps	= hammer_io_countdeps,
138066325755SMatthew Dillon 	.io_checkread	= hammer_io_checkread,
138166325755SMatthew Dillon 	.io_checkwrite	= hammer_io_checkwrite,
138266325755SMatthew Dillon };
138366325755SMatthew Dillon 
138447637bffSMatthew Dillon /************************************************************************
138547637bffSMatthew Dillon  *				DIRECT IO OPS 				*
138647637bffSMatthew Dillon  ************************************************************************
138747637bffSMatthew Dillon  *
138847637bffSMatthew Dillon  * These functions operate directly on the buffer cache buffer associated
138947637bffSMatthew Dillon  * with a front-end vnode rather then a back-end device vnode.
139047637bffSMatthew Dillon  */
139147637bffSMatthew Dillon 
139247637bffSMatthew Dillon /*
139347637bffSMatthew Dillon  * Read a buffer associated with a front-end vnode directly from the
13941b0ab2c3SMatthew Dillon  * disk media.  The bio may be issued asynchronously.  If leaf is non-NULL
13951b0ab2c3SMatthew Dillon  * we validate the CRC.
1396a99b9ea2SMatthew Dillon  *
13971b0ab2c3SMatthew Dillon  * We must check for the presence of a HAMMER buffer to handle the case
13981b0ab2c3SMatthew Dillon  * where the reblocker has rewritten the data (which it does via the HAMMER
13991b0ab2c3SMatthew Dillon  * buffer system, not via the high-level vnode buffer cache), but not yet
14001b0ab2c3SMatthew Dillon  * committed the buffer to the media.
140147637bffSMatthew Dillon  */
140247637bffSMatthew Dillon int
14031b0ab2c3SMatthew Dillon hammer_io_direct_read(hammer_mount_t hmp, struct bio *bio,
14041b0ab2c3SMatthew Dillon 		      hammer_btree_leaf_elm_t leaf)
140547637bffSMatthew Dillon {
14061b0ab2c3SMatthew Dillon 	hammer_off_t buf_offset;
140747637bffSMatthew Dillon 	hammer_off_t zone2_offset;
140847637bffSMatthew Dillon 	hammer_volume_t volume;
140947637bffSMatthew Dillon 	struct buf *bp;
141047637bffSMatthew Dillon 	struct bio *nbio;
141147637bffSMatthew Dillon 	int vol_no;
141247637bffSMatthew Dillon 	int error;
141347637bffSMatthew Dillon 
14141b0ab2c3SMatthew Dillon 	buf_offset = bio->bio_offset;
1415e1545c47STomohiro Kusumi 	KKASSERT(hammer_is_zone_large_data(buf_offset));
14164a2796f3SMatthew Dillon 
14171b0ab2c3SMatthew Dillon 	/*
14181b0ab2c3SMatthew Dillon 	 * The buffer cache may have an aliased buffer (the reblocker can
14191b0ab2c3SMatthew Dillon 	 * write them).  If it does we have to sync any dirty data before
14201b0ab2c3SMatthew Dillon 	 * we can build our direct-read.  This is a non-critical code path.
14211b0ab2c3SMatthew Dillon 	 */
14221b0ab2c3SMatthew Dillon 	bp = bio->bio_buf;
14231b0ab2c3SMatthew Dillon 	hammer_sync_buffers(hmp, buf_offset, bp->b_bufsize);
14241b0ab2c3SMatthew Dillon 
14251b0ab2c3SMatthew Dillon 	/*
14261b0ab2c3SMatthew Dillon 	 * Resolve to a zone-2 offset.  The conversion just requires
14271b0ab2c3SMatthew Dillon 	 * munging the top 4 bits but we want to abstract it anyway
14281b0ab2c3SMatthew Dillon 	 * so the blockmap code can verify the zone assignment.
14291b0ab2c3SMatthew Dillon 	 */
14301b0ab2c3SMatthew Dillon 	zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error);
14311b0ab2c3SMatthew Dillon 	if (error)
14321b0ab2c3SMatthew Dillon 		goto done;
1433e1545c47STomohiro Kusumi 	KKASSERT(hammer_is_zone_raw_buffer(zone2_offset));
143443c665aeSMatthew Dillon 
14351b0ab2c3SMatthew Dillon 	/*
14361b0ab2c3SMatthew Dillon 	 * Resolve volume and raw-offset for 3rd level bio.  The
14371b0ab2c3SMatthew Dillon 	 * offset will be specific to the volume.
14381b0ab2c3SMatthew Dillon 	 */
143947637bffSMatthew Dillon 	vol_no = HAMMER_VOL_DECODE(zone2_offset);
144047637bffSMatthew Dillon 	volume = hammer_get_volume(hmp, vol_no, &error);
144147637bffSMatthew Dillon 	if (error == 0 && zone2_offset >= volume->maxbuf_off)
144247637bffSMatthew Dillon 		error = EIO;
144343c665aeSMatthew Dillon 
144447637bffSMatthew Dillon 	if (error == 0) {
1445e469566bSMatthew Dillon 		/*
144665d9d14fSTomohiro Kusumi 		 * 3rd level bio (the caller has already pushed once)
1447e469566bSMatthew Dillon 		 */
144847637bffSMatthew Dillon 		nbio = push_bio(bio);
1449516655e8STomohiro Kusumi 		nbio->bio_offset = hammer_xlate_to_phys(volume->ondisk,
1450516655e8STomohiro Kusumi 							zone2_offset);
1451ce0138a6SMatthew Dillon 		hammer_stats_disk_read += bp->b_bufsize;
145247637bffSMatthew Dillon 		vn_strategy(volume->devvp, nbio);
145347637bffSMatthew Dillon 	}
145447637bffSMatthew Dillon 	hammer_rel_volume(volume, 0);
14551b0ab2c3SMatthew Dillon done:
145647637bffSMatthew Dillon 	if (error) {
145735a5249bSTomohiro Kusumi 		hdkprintf("failed @ %016jx\n", (intmax_t)zone2_offset);
145847637bffSMatthew Dillon 		bp->b_error = error;
145947637bffSMatthew Dillon 		bp->b_flags |= B_ERROR;
146047637bffSMatthew Dillon 		biodone(bio);
146147637bffSMatthew Dillon 	}
146247637bffSMatthew Dillon 	return(error);
146347637bffSMatthew Dillon }
146447637bffSMatthew Dillon 
14659a98f3ccSMatthew Dillon /*
14669a98f3ccSMatthew Dillon  * This works similarly to hammer_io_direct_read() except instead of
14679a98f3ccSMatthew Dillon  * directly reading from the device into the bio we instead indirectly
14689a98f3ccSMatthew Dillon  * read through the device's buffer cache and then copy the data into
14699a98f3ccSMatthew Dillon  * the bio.
14709a98f3ccSMatthew Dillon  *
14719a98f3ccSMatthew Dillon  * If leaf is non-NULL and validation is enabled, the CRC will be checked.
14729a98f3ccSMatthew Dillon  *
14739a98f3ccSMatthew Dillon  * This routine also executes asynchronously.  It allows hammer strategy
14749a98f3ccSMatthew Dillon  * calls to operate asynchronously when in double_buffer mode (in addition
14759a98f3ccSMatthew Dillon  * to operating asynchronously when in normal mode).
14769a98f3ccSMatthew Dillon  */
14779a98f3ccSMatthew Dillon int
14789a98f3ccSMatthew Dillon hammer_io_indirect_read(hammer_mount_t hmp, struct bio *bio,
14799a98f3ccSMatthew Dillon 			hammer_btree_leaf_elm_t leaf)
14809a98f3ccSMatthew Dillon {
14819a98f3ccSMatthew Dillon 	hammer_off_t buf_offset;
14829a98f3ccSMatthew Dillon 	hammer_off_t zone2_offset;
14839a98f3ccSMatthew Dillon 	hammer_volume_t volume;
14849a98f3ccSMatthew Dillon 	struct buf *bp;
14859a98f3ccSMatthew Dillon 	int vol_no;
14869a98f3ccSMatthew Dillon 	int error;
14879a98f3ccSMatthew Dillon 
14889a98f3ccSMatthew Dillon 	buf_offset = bio->bio_offset;
1489e1545c47STomohiro Kusumi 	KKASSERT(hammer_is_zone_large_data(buf_offset));
14909a98f3ccSMatthew Dillon 
14919a98f3ccSMatthew Dillon 	/*
14929a98f3ccSMatthew Dillon 	 * The buffer cache may have an aliased buffer (the reblocker can
14939a98f3ccSMatthew Dillon 	 * write them).  If it does we have to sync any dirty data before
14949a98f3ccSMatthew Dillon 	 * we can build our direct-read.  This is a non-critical code path.
14959a98f3ccSMatthew Dillon 	 */
14969a98f3ccSMatthew Dillon 	bp = bio->bio_buf;
14979a98f3ccSMatthew Dillon 	hammer_sync_buffers(hmp, buf_offset, bp->b_bufsize);
14989a98f3ccSMatthew Dillon 
14999a98f3ccSMatthew Dillon 	/*
15009a98f3ccSMatthew Dillon 	 * Resolve to a zone-2 offset.  The conversion just requires
15019a98f3ccSMatthew Dillon 	 * munging the top 4 bits but we want to abstract it anyway
15029a98f3ccSMatthew Dillon 	 * so the blockmap code can verify the zone assignment.
15039a98f3ccSMatthew Dillon 	 */
15049a98f3ccSMatthew Dillon 	zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error);
15059a98f3ccSMatthew Dillon 	if (error)
15069a98f3ccSMatthew Dillon 		goto done;
1507e1545c47STomohiro Kusumi 	KKASSERT(hammer_is_zone_raw_buffer(zone2_offset));
15089a98f3ccSMatthew Dillon 
15099a98f3ccSMatthew Dillon 	/*
15109a98f3ccSMatthew Dillon 	 * Resolve volume and raw-offset for 3rd level bio.  The
15119a98f3ccSMatthew Dillon 	 * offset will be specific to the volume.
15129a98f3ccSMatthew Dillon 	 */
15139a98f3ccSMatthew Dillon 	vol_no = HAMMER_VOL_DECODE(zone2_offset);
15149a98f3ccSMatthew Dillon 	volume = hammer_get_volume(hmp, vol_no, &error);
15159a98f3ccSMatthew Dillon 	if (error == 0 && zone2_offset >= volume->maxbuf_off)
15169a98f3ccSMatthew Dillon 		error = EIO;
15179a98f3ccSMatthew Dillon 
15189a98f3ccSMatthew Dillon 	if (error == 0) {
15199a98f3ccSMatthew Dillon 		/*
15209a98f3ccSMatthew Dillon 		 * Convert to the raw volume->devvp offset and acquire
15219a98f3ccSMatthew Dillon 		 * the buf, issuing async I/O if necessary.
15229a98f3ccSMatthew Dillon 		 */
1523*e7d75765SMatthew Dillon 		hammer_off_t limit;
1524*e7d75765SMatthew Dillon 		int hce;
1525*e7d75765SMatthew Dillon 
1526516655e8STomohiro Kusumi 		buf_offset = hammer_xlate_to_phys(volume->ondisk, zone2_offset);
15279a98f3ccSMatthew Dillon 
15289a98f3ccSMatthew Dillon 		if (leaf && hammer_verify_data) {
15299a98f3ccSMatthew Dillon 			bio->bio_caller_info1.uvalue32 = leaf->data_crc;
15309a98f3ccSMatthew Dillon 			bio->bio_caller_info2.index = 1;
15319a98f3ccSMatthew Dillon 		} else {
15329a98f3ccSMatthew Dillon 			bio->bio_caller_info2.index = 0;
15339a98f3ccSMatthew Dillon 		}
1534*e7d75765SMatthew Dillon 
1535*e7d75765SMatthew Dillon 		hce = hammer_cluster_enable;
1536*e7d75765SMatthew Dillon 		if (hce > 0) {
1537*e7d75765SMatthew Dillon 			limit = (zone2_offset + HAMMER_BIGBLOCK_MASK64) &
1538*e7d75765SMatthew Dillon 				~HAMMER_BIGBLOCK_MASK64;
1539*e7d75765SMatthew Dillon 			limit -= zone2_offset;
1540*e7d75765SMatthew Dillon 			cluster_readcb(volume->devvp, limit, buf_offset,
1541*e7d75765SMatthew Dillon 				       bp->b_bufsize,
1542*e7d75765SMatthew Dillon 				       HAMMER_CLUSTER_SIZE,
1543*e7d75765SMatthew Dillon 				       HAMMER_CLUSTER_SIZE * hce,
1544*e7d75765SMatthew Dillon 				       hammer_indirect_callback,
1545*e7d75765SMatthew Dillon 				       bio);
1546*e7d75765SMatthew Dillon 		} else {
15479a98f3ccSMatthew Dillon 			breadcb(volume->devvp, buf_offset, bp->b_bufsize,
15489a98f3ccSMatthew Dillon 				hammer_indirect_callback, bio);
15499a98f3ccSMatthew Dillon 		}
1550*e7d75765SMatthew Dillon 	}
15519a98f3ccSMatthew Dillon 	hammer_rel_volume(volume, 0);
15529a98f3ccSMatthew Dillon done:
15539a98f3ccSMatthew Dillon 	if (error) {
155435a5249bSTomohiro Kusumi 		hdkprintf("failed @ %016jx\n", (intmax_t)zone2_offset);
15559a98f3ccSMatthew Dillon 		bp->b_error = error;
15569a98f3ccSMatthew Dillon 		bp->b_flags |= B_ERROR;
15579a98f3ccSMatthew Dillon 		biodone(bio);
15589a98f3ccSMatthew Dillon 	}
15599a98f3ccSMatthew Dillon 	return(error);
15609a98f3ccSMatthew Dillon }
15619a98f3ccSMatthew Dillon 
15629a98f3ccSMatthew Dillon /*
15639a98f3ccSMatthew Dillon  * Indirect callback on completion.  bio/bp specify the device-backed
15649a98f3ccSMatthew Dillon  * buffer.  bio->bio_caller_info1.ptr holds obio.
15659a98f3ccSMatthew Dillon  *
15669a98f3ccSMatthew Dillon  * obio/obp is the original regular file buffer.  obio->bio_caller_info*
15679a98f3ccSMatthew Dillon  * contains the crc specification.
15689a98f3ccSMatthew Dillon  *
15699a98f3ccSMatthew Dillon  * We are responsible for calling bpdone() and bqrelse() on bio/bp, and
15709a98f3ccSMatthew Dillon  * for calling biodone() on obio.
15719a98f3ccSMatthew Dillon  */
15729a98f3ccSMatthew Dillon static void
15739a98f3ccSMatthew Dillon hammer_indirect_callback(struct bio *bio)
15749a98f3ccSMatthew Dillon {
15759a98f3ccSMatthew Dillon 	struct buf *bp = bio->bio_buf;
15769a98f3ccSMatthew Dillon 	struct buf *obp;
15779a98f3ccSMatthew Dillon 	struct bio *obio;
15789a98f3ccSMatthew Dillon 
15799a98f3ccSMatthew Dillon 	/*
15809a98f3ccSMatthew Dillon 	 * If BIO_DONE is already set the device buffer was already
15819a98f3ccSMatthew Dillon 	 * fully valid (B_CACHE).  If it is not set then I/O was issued
15829a98f3ccSMatthew Dillon 	 * and we have to run I/O completion as the last bio.
15839a98f3ccSMatthew Dillon 	 *
15849a98f3ccSMatthew Dillon 	 * Nobody is waiting for our device I/O to complete, we are
15859a98f3ccSMatthew Dillon 	 * responsible for bqrelse()ing it which means we also have to do
15869a98f3ccSMatthew Dillon 	 * the equivalent of biowait() and clear BIO_DONE (which breadcb()
15879a98f3ccSMatthew Dillon 	 * may have set).
15889a98f3ccSMatthew Dillon 	 *
15899a98f3ccSMatthew Dillon 	 * Any preexisting device buffer should match the requested size,
1590a981af19STomohiro Kusumi 	 * but due to big-block recycling and other factors there is some
15919a98f3ccSMatthew Dillon 	 * fragility there, so we assert that the device buffer covers
15929a98f3ccSMatthew Dillon 	 * the request.
15939a98f3ccSMatthew Dillon 	 */
15949a98f3ccSMatthew Dillon 	if ((bio->bio_flags & BIO_DONE) == 0)
15959a98f3ccSMatthew Dillon 		bpdone(bp, 0);
15969a98f3ccSMatthew Dillon 	bio->bio_flags &= ~(BIO_DONE | BIO_SYNC);
15979a98f3ccSMatthew Dillon 
15989a98f3ccSMatthew Dillon 	obio = bio->bio_caller_info1.ptr;
15999a98f3ccSMatthew Dillon 	obp = obio->bio_buf;
16009a98f3ccSMatthew Dillon 
16019a98f3ccSMatthew Dillon 	if (bp->b_flags & B_ERROR) {
16029a98f3ccSMatthew Dillon 		obp->b_flags |= B_ERROR;
16039a98f3ccSMatthew Dillon 		obp->b_error = bp->b_error;
16049a98f3ccSMatthew Dillon 	} else if (obio->bio_caller_info2.index &&
16059a98f3ccSMatthew Dillon 		   obio->bio_caller_info1.uvalue32 !=
16069a98f3ccSMatthew Dillon 		    crc32(bp->b_data, bp->b_bufsize)) {
16079a98f3ccSMatthew Dillon 		obp->b_flags |= B_ERROR;
16089a98f3ccSMatthew Dillon 		obp->b_error = EIO;
16099a98f3ccSMatthew Dillon 	} else {
16109a98f3ccSMatthew Dillon 		KKASSERT(bp->b_bufsize >= obp->b_bufsize);
16119a98f3ccSMatthew Dillon 		bcopy(bp->b_data, obp->b_data, obp->b_bufsize);
16129a98f3ccSMatthew Dillon 		obp->b_resid = 0;
16139a98f3ccSMatthew Dillon 		obp->b_flags |= B_AGE;
16149a98f3ccSMatthew Dillon 	}
16159a98f3ccSMatthew Dillon 	biodone(obio);
16169a98f3ccSMatthew Dillon 	bqrelse(bp);
16179a98f3ccSMatthew Dillon }
16189a98f3ccSMatthew Dillon 
161947637bffSMatthew Dillon /*
162047637bffSMatthew Dillon  * Write a buffer associated with a front-end vnode directly to the
162147637bffSMatthew Dillon  * disk media.  The bio may be issued asynchronously.
16221b0ab2c3SMatthew Dillon  *
162377912481SMatthew Dillon  * The BIO is associated with the specified record and RECG_DIRECT_IO
1624e469566bSMatthew Dillon  * is set.  The recorded is added to its object.
162547637bffSMatthew Dillon  */
162647637bffSMatthew Dillon int
16276362a262SMatthew Dillon hammer_io_direct_write(hammer_mount_t hmp, struct bio *bio,
16286362a262SMatthew Dillon 		       hammer_record_t record)
162947637bffSMatthew Dillon {
16301b0ab2c3SMatthew Dillon 	hammer_btree_leaf_elm_t leaf = &record->leaf;
16310832c9bbSMatthew Dillon 	hammer_off_t buf_offset;
163247637bffSMatthew Dillon 	hammer_off_t zone2_offset;
163347637bffSMatthew Dillon 	hammer_volume_t volume;
16340832c9bbSMatthew Dillon 	hammer_buffer_t buffer;
163547637bffSMatthew Dillon 	struct buf *bp;
163647637bffSMatthew Dillon 	struct bio *nbio;
16370832c9bbSMatthew Dillon 	char *ptr;
163847637bffSMatthew Dillon 	int vol_no;
163947637bffSMatthew Dillon 	int error;
164047637bffSMatthew Dillon 
16410832c9bbSMatthew Dillon 	buf_offset = leaf->data_offset;
16420832c9bbSMatthew Dillon 
1643b0cce327STomohiro Kusumi 	KKASSERT(hammer_is_zone2_mapped_index(
1644b0cce327STomohiro Kusumi 		HAMMER_ZONE_DECODE(buf_offset)));
164547637bffSMatthew Dillon 	KKASSERT(bio->bio_buf->b_cmd == BUF_CMD_WRITE);
164647637bffSMatthew Dillon 
16476362a262SMatthew Dillon 	/*
16486362a262SMatthew Dillon 	 * Issue or execute the I/O.  The new memory record must replace
16496362a262SMatthew Dillon 	 * the old one before the I/O completes, otherwise a reaquisition of
16506362a262SMatthew Dillon 	 * the buffer will load the old media data instead of the new.
16516362a262SMatthew Dillon 	 */
16520832c9bbSMatthew Dillon 	if ((buf_offset & HAMMER_BUFMASK) == 0 &&
16534a2796f3SMatthew Dillon 	    leaf->data_len >= HAMMER_BUFSIZE) {
16540832c9bbSMatthew Dillon 		/*
16550832c9bbSMatthew Dillon 		 * We are using the vnode's bio to write directly to the
16560832c9bbSMatthew Dillon 		 * media, any hammer_buffer at the same zone-X offset will
16570832c9bbSMatthew Dillon 		 * now have stale data.
16580832c9bbSMatthew Dillon 		 */
16590832c9bbSMatthew Dillon 		zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error);
166047637bffSMatthew Dillon 		vol_no = HAMMER_VOL_DECODE(zone2_offset);
166147637bffSMatthew Dillon 		volume = hammer_get_volume(hmp, vol_no, &error);
166247637bffSMatthew Dillon 
166347637bffSMatthew Dillon 		if (error == 0 && zone2_offset >= volume->maxbuf_off)
166447637bffSMatthew Dillon 			error = EIO;
166547637bffSMatthew Dillon 		if (error == 0) {
16660832c9bbSMatthew Dillon 			bp = bio->bio_buf;
16674a2796f3SMatthew Dillon 			KKASSERT((bp->b_bufsize & HAMMER_BUFMASK) == 0);
1668e469566bSMatthew Dillon 			/*
16694a2796f3SMatthew Dillon 			hammer_del_buffers(hmp, buf_offset,
16704a2796f3SMatthew Dillon 					   zone2_offset, bp->b_bufsize);
1671e469566bSMatthew Dillon 			*/
16721b0ab2c3SMatthew Dillon 
167343c665aeSMatthew Dillon 			/*
167443c665aeSMatthew Dillon 			 * Second level bio - cached zone2 offset.
16751b0ab2c3SMatthew Dillon 			 *
16761b0ab2c3SMatthew Dillon 			 * (We can put our bio_done function in either the
16771b0ab2c3SMatthew Dillon 			 *  2nd or 3rd level).
167843c665aeSMatthew Dillon 			 */
167947637bffSMatthew Dillon 			nbio = push_bio(bio);
168043c665aeSMatthew Dillon 			nbio->bio_offset = zone2_offset;
16811b0ab2c3SMatthew Dillon 			nbio->bio_done = hammer_io_direct_write_complete;
16821b0ab2c3SMatthew Dillon 			nbio->bio_caller_info1.ptr = record;
1683e469566bSMatthew Dillon 			record->zone2_offset = zone2_offset;
168477912481SMatthew Dillon 			record->gflags |= HAMMER_RECG_DIRECT_IO |
168577912481SMatthew Dillon 					 HAMMER_RECG_DIRECT_INVAL;
168643c665aeSMatthew Dillon 
168743c665aeSMatthew Dillon 			/*
168843c665aeSMatthew Dillon 			 * Third level bio - raw offset specific to the
168943c665aeSMatthew Dillon 			 * correct volume.
169043c665aeSMatthew Dillon 			 */
169143c665aeSMatthew Dillon 			nbio = push_bio(nbio);
1692516655e8STomohiro Kusumi 			nbio->bio_offset = hammer_xlate_to_phys(volume->ondisk,
1693516655e8STomohiro Kusumi 								zone2_offset);
1694ce0138a6SMatthew Dillon 			hammer_stats_disk_write += bp->b_bufsize;
16956362a262SMatthew Dillon 			hammer_ip_replace_bulk(hmp, record);
169647637bffSMatthew Dillon 			vn_strategy(volume->devvp, nbio);
1697748efb59SMatthew Dillon 			hammer_io_flush_mark(volume);
169847637bffSMatthew Dillon 		}
169947637bffSMatthew Dillon 		hammer_rel_volume(volume, 0);
17000832c9bbSMatthew Dillon 	} else {
17011b0ab2c3SMatthew Dillon 		/*
17021b0ab2c3SMatthew Dillon 		 * Must fit in a standard HAMMER buffer.  In this case all
170377912481SMatthew Dillon 		 * consumers use the HAMMER buffer system and RECG_DIRECT_IO
17041b0ab2c3SMatthew Dillon 		 * does not need to be set-up.
17051b0ab2c3SMatthew Dillon 		 */
17060832c9bbSMatthew Dillon 		KKASSERT(((buf_offset ^ (buf_offset + leaf->data_len - 1)) & ~HAMMER_BUFMASK64) == 0);
17070832c9bbSMatthew Dillon 		buffer = NULL;
17080832c9bbSMatthew Dillon 		ptr = hammer_bread(hmp, buf_offset, &error, &buffer);
17090832c9bbSMatthew Dillon 		if (error == 0) {
17100832c9bbSMatthew Dillon 			bp = bio->bio_buf;
17117bc5b8c2SMatthew Dillon 			bp->b_flags |= B_AGE;
17120832c9bbSMatthew Dillon 			hammer_io_modify(&buffer->io, 1);
17130832c9bbSMatthew Dillon 			bcopy(bp->b_data, ptr, leaf->data_len);
17140832c9bbSMatthew Dillon 			hammer_io_modify_done(&buffer->io);
17157bc5b8c2SMatthew Dillon 			hammer_rel_buffer(buffer, 0);
17160832c9bbSMatthew Dillon 			bp->b_resid = 0;
17176362a262SMatthew Dillon 			hammer_ip_replace_bulk(hmp, record);
17180832c9bbSMatthew Dillon 			biodone(bio);
17190832c9bbSMatthew Dillon 		}
172047637bffSMatthew Dillon 	}
17216362a262SMatthew Dillon 	if (error) {
1722e469566bSMatthew Dillon 		/*
17236362a262SMatthew Dillon 		 * Major suckage occured.  Also note:  The record was
17246362a262SMatthew Dillon 		 * never added to the tree so we do not have to worry
17256362a262SMatthew Dillon 		 * about the backend.
1726e469566bSMatthew Dillon 		 */
172735a5249bSTomohiro Kusumi 		hdkprintf("failed @ %016jx\n", (intmax_t)leaf->data_offset);
172847637bffSMatthew Dillon 		bp = bio->bio_buf;
172947637bffSMatthew Dillon 		bp->b_resid = 0;
173047637bffSMatthew Dillon 		bp->b_error = EIO;
173147637bffSMatthew Dillon 		bp->b_flags |= B_ERROR;
173247637bffSMatthew Dillon 		biodone(bio);
1733e469566bSMatthew Dillon 		record->flags |= HAMMER_RECF_DELETED_FE;
1734e469566bSMatthew Dillon 		hammer_rel_mem_record(record);
173547637bffSMatthew Dillon 	}
173647637bffSMatthew Dillon 	return(error);
173747637bffSMatthew Dillon }
173847637bffSMatthew Dillon 
173943c665aeSMatthew Dillon /*
17401b0ab2c3SMatthew Dillon  * On completion of the BIO this callback must disconnect
17411b0ab2c3SMatthew Dillon  * it from the hammer_record and chain to the previous bio.
1742cdb6e4e6SMatthew Dillon  *
1743cdb6e4e6SMatthew Dillon  * An I/O error forces the mount to read-only.  Data buffers
1744cdb6e4e6SMatthew Dillon  * are not B_LOCKED like meta-data buffers are, so we have to
1745cdb6e4e6SMatthew Dillon  * throw the buffer away to prevent the kernel from retrying.
174677912481SMatthew Dillon  *
174777912481SMatthew Dillon  * NOTE: MPSAFE callback, only modify fields we have explicit
174877912481SMatthew Dillon  *	 access to (the bp and the record->gflags).
17491b0ab2c3SMatthew Dillon  */
17501b0ab2c3SMatthew Dillon static
17511b0ab2c3SMatthew Dillon void
17521b0ab2c3SMatthew Dillon hammer_io_direct_write_complete(struct bio *nbio)
17531b0ab2c3SMatthew Dillon {
17541b0ab2c3SMatthew Dillon 	struct bio *obio;
1755e469566bSMatthew Dillon 	struct buf *bp;
1756b0aab9b9SMatthew Dillon 	hammer_record_t record;
1757b0aab9b9SMatthew Dillon 	hammer_mount_t hmp;
1758b0aab9b9SMatthew Dillon 
1759b0aab9b9SMatthew Dillon 	record = nbio->bio_caller_info1.ptr;
1760b0aab9b9SMatthew Dillon 	KKASSERT(record != NULL);
1761b0aab9b9SMatthew Dillon 	hmp = record->ip->hmp;
1762b0aab9b9SMatthew Dillon 
1763b0aab9b9SMatthew Dillon 	lwkt_gettoken(&hmp->io_token);
17641b0ab2c3SMatthew Dillon 
1765e469566bSMatthew Dillon 	bp = nbio->bio_buf;
17661b0ab2c3SMatthew Dillon 	obio = pop_bio(nbio);
1767e469566bSMatthew Dillon 	if (bp->b_flags & B_ERROR) {
176877912481SMatthew Dillon 		lwkt_gettoken(&hmp->fs_token);
1769653fa4cdSTomohiro Kusumi 		hammer_critical_error(hmp, record->ip, bp->b_error,
1770cdb6e4e6SMatthew Dillon 				      "while writing bulk data");
177177912481SMatthew Dillon 		lwkt_reltoken(&hmp->fs_token);
1772e469566bSMatthew Dillon 		bp->b_flags |= B_INVAL;
1773cdb6e4e6SMatthew Dillon 	}
17741b0ab2c3SMatthew Dillon 	biodone(obio);
1775e469566bSMatthew Dillon 
177677912481SMatthew Dillon 	KKASSERT(record->gflags & HAMMER_RECG_DIRECT_IO);
177777912481SMatthew Dillon 	if (record->gflags & HAMMER_RECG_DIRECT_WAIT) {
177877912481SMatthew Dillon 		record->gflags &= ~(HAMMER_RECG_DIRECT_IO |
177977912481SMatthew Dillon 				    HAMMER_RECG_DIRECT_WAIT);
1780de996e86SMatthew Dillon 		/* record can disappear once DIRECT_IO flag is cleared */
17811b0ab2c3SMatthew Dillon 		wakeup(&record->flags);
1782de996e86SMatthew Dillon 	} else {
178377912481SMatthew Dillon 		record->gflags &= ~HAMMER_RECG_DIRECT_IO;
1784de996e86SMatthew Dillon 		/* record can disappear once DIRECT_IO flag is cleared */
17851b0ab2c3SMatthew Dillon 	}
1786b0aab9b9SMatthew Dillon 	lwkt_reltoken(&hmp->io_token);
17871b0ab2c3SMatthew Dillon }
17881b0ab2c3SMatthew Dillon 
17891b0ab2c3SMatthew Dillon 
17901b0ab2c3SMatthew Dillon /*
17911b0ab2c3SMatthew Dillon  * This is called before a record is either committed to the B-Tree
1792e469566bSMatthew Dillon  * or destroyed, to resolve any associated direct-IO.
17931b0ab2c3SMatthew Dillon  *
1794e469566bSMatthew Dillon  * (1) We must wait for any direct-IO related to the record to complete.
1795e469566bSMatthew Dillon  *
1796e469566bSMatthew Dillon  * (2) We must remove any buffer cache aliases for data accessed via
1797e469566bSMatthew Dillon  *     leaf->data_offset or zone2_offset so non-direct-IO consumers
1798e469566bSMatthew Dillon  *     (the mirroring and reblocking code) do not see stale data.
17991b0ab2c3SMatthew Dillon  */
18001b0ab2c3SMatthew Dillon void
18011b0ab2c3SMatthew Dillon hammer_io_direct_wait(hammer_record_t record)
18021b0ab2c3SMatthew Dillon {
1803b0aab9b9SMatthew Dillon 	hammer_mount_t hmp = record->ip->hmp;
1804b0aab9b9SMatthew Dillon 
1805e469566bSMatthew Dillon 	/*
1806e469566bSMatthew Dillon 	 * Wait for I/O to complete
1807e469566bSMatthew Dillon 	 */
180877912481SMatthew Dillon 	if (record->gflags & HAMMER_RECG_DIRECT_IO) {
1809b0aab9b9SMatthew Dillon 		lwkt_gettoken(&hmp->io_token);
181077912481SMatthew Dillon 		while (record->gflags & HAMMER_RECG_DIRECT_IO) {
181177912481SMatthew Dillon 			record->gflags |= HAMMER_RECG_DIRECT_WAIT;
18121b0ab2c3SMatthew Dillon 			tsleep(&record->flags, 0, "hmdiow", 0);
18131b0ab2c3SMatthew Dillon 		}
1814b0aab9b9SMatthew Dillon 		lwkt_reltoken(&hmp->io_token);
18151b0ab2c3SMatthew Dillon 	}
18161b0ab2c3SMatthew Dillon 
18171b0ab2c3SMatthew Dillon 	/*
1818362ec2dcSMatthew Dillon 	 * Invalidate any related buffer cache aliases associated with the
1819362ec2dcSMatthew Dillon 	 * backing device.  This is needed because the buffer cache buffer
1820362ec2dcSMatthew Dillon 	 * for file data is associated with the file vnode, not the backing
1821362ec2dcSMatthew Dillon 	 * device vnode.
1822362ec2dcSMatthew Dillon 	 *
1823362ec2dcSMatthew Dillon 	 * XXX I do not think this case can occur any more now that
1824362ec2dcSMatthew Dillon 	 * reservations ensure that all such buffers are removed before
1825362ec2dcSMatthew Dillon 	 * an area can be reused.
1826e469566bSMatthew Dillon 	 */
182777912481SMatthew Dillon 	if (record->gflags & HAMMER_RECG_DIRECT_INVAL) {
1828e469566bSMatthew Dillon 		KKASSERT(record->leaf.data_offset);
1829b0aab9b9SMatthew Dillon 		hammer_del_buffers(hmp, record->leaf.data_offset,
1830362ec2dcSMatthew Dillon 				   record->zone2_offset, record->leaf.data_len,
1831362ec2dcSMatthew Dillon 				   1);
183277912481SMatthew Dillon 		record->gflags &= ~HAMMER_RECG_DIRECT_INVAL;
1833e469566bSMatthew Dillon 	}
1834e469566bSMatthew Dillon }
1835e469566bSMatthew Dillon 
1836e469566bSMatthew Dillon /*
183743c665aeSMatthew Dillon  * This is called to remove the second-level cached zone-2 offset from
183843c665aeSMatthew Dillon  * frontend buffer cache buffers, now stale due to a data relocation.
183943c665aeSMatthew Dillon  * These offsets are generated by cluster_read() via VOP_BMAP, or directly
184043c665aeSMatthew Dillon  * by hammer_vop_strategy_read().
184143c665aeSMatthew Dillon  *
184243c665aeSMatthew Dillon  * This is rather nasty because here we have something like the reblocker
184343c665aeSMatthew Dillon  * scanning the raw B-Tree with no held references on anything, really,
184443c665aeSMatthew Dillon  * other then a shared lock on the B-Tree node, and we have to access the
184543c665aeSMatthew Dillon  * frontend's buffer cache to check for and clean out the association.
184643c665aeSMatthew Dillon  * Specifically, if the reblocker is moving data on the disk, these cached
184743c665aeSMatthew Dillon  * offsets will become invalid.
184843c665aeSMatthew Dillon  *
184943c665aeSMatthew Dillon  * Only data record types associated with the large-data zone are subject
185043c665aeSMatthew Dillon  * to direct-io and need to be checked.
185143c665aeSMatthew Dillon  *
185243c665aeSMatthew Dillon  */
185343c665aeSMatthew Dillon void
185443c665aeSMatthew Dillon hammer_io_direct_uncache(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf)
185543c665aeSMatthew Dillon {
185643c665aeSMatthew Dillon 	struct hammer_inode_info iinfo;
185743c665aeSMatthew Dillon 	int zone;
185843c665aeSMatthew Dillon 
185943c665aeSMatthew Dillon 	if (leaf->base.rec_type != HAMMER_RECTYPE_DATA)
186043c665aeSMatthew Dillon 		return;
186143c665aeSMatthew Dillon 	zone = HAMMER_ZONE_DECODE(leaf->data_offset);
186243c665aeSMatthew Dillon 	if (zone != HAMMER_ZONE_LARGE_DATA_INDEX)
186343c665aeSMatthew Dillon 		return;
186443c665aeSMatthew Dillon 	iinfo.obj_id = leaf->base.obj_id;
186543c665aeSMatthew Dillon 	iinfo.obj_asof = 0;	/* unused */
186643c665aeSMatthew Dillon 	iinfo.obj_localization = leaf->base.localization &
18675a930e66SMatthew Dillon 				 HAMMER_LOCALIZE_PSEUDOFS_MASK;
186843c665aeSMatthew Dillon 	iinfo.u.leaf = leaf;
186943c665aeSMatthew Dillon 	hammer_scan_inode_snapshots(hmp, &iinfo,
187043c665aeSMatthew Dillon 				    hammer_io_direct_uncache_callback,
187143c665aeSMatthew Dillon 				    leaf);
187243c665aeSMatthew Dillon }
187343c665aeSMatthew Dillon 
187443c665aeSMatthew Dillon static int
187543c665aeSMatthew Dillon hammer_io_direct_uncache_callback(hammer_inode_t ip, void *data)
187643c665aeSMatthew Dillon {
187743c665aeSMatthew Dillon 	hammer_inode_info_t iinfo = data;
187843c665aeSMatthew Dillon 	hammer_off_t file_offset;
187943c665aeSMatthew Dillon 	struct vnode *vp;
188043c665aeSMatthew Dillon 	struct buf *bp;
188143c665aeSMatthew Dillon 	int blksize;
188243c665aeSMatthew Dillon 
188343c665aeSMatthew Dillon 	if (ip->vp == NULL)
188443c665aeSMatthew Dillon 		return(0);
188543c665aeSMatthew Dillon 	file_offset = iinfo->u.leaf->base.key - iinfo->u.leaf->data_len;
188643c665aeSMatthew Dillon 	blksize = iinfo->u.leaf->data_len;
188743c665aeSMatthew Dillon 	KKASSERT((blksize & HAMMER_BUFMASK) == 0);
188843c665aeSMatthew Dillon 
18899c90dba2SMatthew Dillon 	/*
18909c90dba2SMatthew Dillon 	 * Warning: FINDBLK_TEST return stable storage but not stable
18919c90dba2SMatthew Dillon 	 *	    contents.  It happens to be ok in this case.
18929c90dba2SMatthew Dillon 	 */
189343c665aeSMatthew Dillon 	hammer_ref(&ip->lock);
189443c665aeSMatthew Dillon 	if (hammer_get_vnode(ip, &vp) == 0) {
1895b1c20cfaSMatthew Dillon 		if ((bp = findblk(ip->vp, file_offset, FINDBLK_TEST)) != NULL &&
189643c665aeSMatthew Dillon 		    bp->b_bio2.bio_offset != NOOFFSET) {
189743c665aeSMatthew Dillon 			bp = getblk(ip->vp, file_offset, blksize, 0, 0);
189843c665aeSMatthew Dillon 			bp->b_bio2.bio_offset = NOOFFSET;
189943c665aeSMatthew Dillon 			brelse(bp);
190043c665aeSMatthew Dillon 		}
190143c665aeSMatthew Dillon 		vput(vp);
190243c665aeSMatthew Dillon 	}
190343c665aeSMatthew Dillon 	hammer_rel_inode(ip, 0);
190443c665aeSMatthew Dillon 	return(0);
190543c665aeSMatthew Dillon }
190647637bffSMatthew Dillon 
1907748efb59SMatthew Dillon 
1908748efb59SMatthew Dillon /*
1909748efb59SMatthew Dillon  * This function is called when writes may have occured on the volume,
1910748efb59SMatthew Dillon  * indicating that the device may be holding cached writes.
1911748efb59SMatthew Dillon  */
19129e6939a5STomohiro Kusumi static __inline void
1913748efb59SMatthew Dillon hammer_io_flush_mark(hammer_volume_t volume)
1914748efb59SMatthew Dillon {
191577912481SMatthew Dillon 	atomic_set_int(&volume->vol_flags, HAMMER_VOLF_NEEDFLUSH);
1916748efb59SMatthew Dillon }
1917748efb59SMatthew Dillon 
1918748efb59SMatthew Dillon /*
1919748efb59SMatthew Dillon  * This function ensures that the device has flushed any cached writes out.
1920748efb59SMatthew Dillon  */
1921748efb59SMatthew Dillon void
1922748efb59SMatthew Dillon hammer_io_flush_sync(hammer_mount_t hmp)
1923748efb59SMatthew Dillon {
1924748efb59SMatthew Dillon 	hammer_volume_t volume;
1925748efb59SMatthew Dillon 	struct buf *bp_base = NULL;
1926748efb59SMatthew Dillon 	struct buf *bp;
1927748efb59SMatthew Dillon 
1928748efb59SMatthew Dillon 	RB_FOREACH(volume, hammer_vol_rb_tree, &hmp->rb_vols_root) {
1929748efb59SMatthew Dillon 		if (volume->vol_flags & HAMMER_VOLF_NEEDFLUSH) {
193077912481SMatthew Dillon 			atomic_clear_int(&volume->vol_flags,
193177912481SMatthew Dillon 					 HAMMER_VOLF_NEEDFLUSH);
1932748efb59SMatthew Dillon 			bp = getpbuf(NULL);
1933748efb59SMatthew Dillon 			bp->b_bio1.bio_offset = 0;
1934748efb59SMatthew Dillon 			bp->b_bufsize = 0;
1935748efb59SMatthew Dillon 			bp->b_bcount = 0;
1936748efb59SMatthew Dillon 			bp->b_cmd = BUF_CMD_FLUSH;
1937748efb59SMatthew Dillon 			bp->b_bio1.bio_caller_info1.cluster_head = bp_base;
1938ae8e83e6SMatthew Dillon 			bp->b_bio1.bio_done = biodone_sync;
1939ae8e83e6SMatthew Dillon 			bp->b_bio1.bio_flags |= BIO_SYNC;
1940748efb59SMatthew Dillon 			bp_base = bp;
1941748efb59SMatthew Dillon 			vn_strategy(volume->devvp, &bp->b_bio1);
1942748efb59SMatthew Dillon 		}
1943748efb59SMatthew Dillon 	}
1944748efb59SMatthew Dillon 	while ((bp = bp_base) != NULL) {
1945748efb59SMatthew Dillon 		bp_base = bp->b_bio1.bio_caller_info1.cluster_head;
1946ae8e83e6SMatthew Dillon 		biowait(&bp->b_bio1, "hmrFLS");
1947748efb59SMatthew Dillon 		relpbuf(bp, NULL);
1948748efb59SMatthew Dillon 	}
1949748efb59SMatthew Dillon }
1950ba298df1SMatthew Dillon 
1951ba298df1SMatthew Dillon /*
1952ba298df1SMatthew Dillon  * Limit the amount of backlog which we allow to build up
1953ba298df1SMatthew Dillon  */
1954ba298df1SMatthew Dillon void
1955ba298df1SMatthew Dillon hammer_io_limit_backlog(hammer_mount_t hmp)
1956ba298df1SMatthew Dillon {
19573038a8caSMatthew Dillon 	waitrunningbufspace();
1958ba298df1SMatthew Dillon }
1959