xref: /dflybsd-src/sys/vfs/hammer/hammer_io.c (revision f5a07a7a074b65b763bd61fd65e0b29f12032fdd)
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/vfs/hammer/hammer_io.c,v 1.47 2008/06/28 23:50:37 dillon Exp $
35  */
36 /*
37  * IO Primitives and buffer cache management
38  *
39  * All major data-tracking structures in HAMMER contain a struct hammer_io
40  * which is used to manage their backing store.  We use filesystem buffers
41  * for backing store and we leave them passively associated with their
42  * HAMMER structures.
43  *
44  * If the kernel tries to destroy a passively associated buf which we cannot
45  * yet let go we set B_LOCKED in the buffer and then actively release it
46  * later when we can.
47  */
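/*
 * Informal summary of the hammer_io state bits manipulated in this file
 * (see hammer.h for the authoritative structure definition):
 *
 *	modified - the backing buffer contains dirty HAMMER data
 *	released - we do not currently own the bp (it has been bqrelse()'d
 *		   or handed to the kernel for I/O)
 *	running	 - write I/O on the bp is in progress
 *	waiting	 - a thread is sleeping in hammer_io_wait()
 */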
48 
49 #include "hammer.h"
50 #include <sys/fcntl.h>
51 #include <sys/nlookup.h>
52 #include <sys/buf.h>
53 #include <sys/buf2.h>
54 
55 static void hammer_io_modify(hammer_io_t io, int count);
56 static void hammer_io_deallocate(struct buf *bp);
57 static int hammer_io_direct_uncache_callback(hammer_inode_t ip, void *data);
58 
59 /*
60  * Initialize a new, already-zero'd hammer_io structure, or reinitialize
61  * an existing hammer_io structure which may have switched to another type.
62  */
63 void
64 hammer_io_init(hammer_io_t io, hammer_mount_t hmp, enum hammer_io_type type)
65 {
66 	io->hmp = hmp;
67 	io->type = type;
68 }
69 
70 /*
71  * Helper routine to disassociate a buffer cache buffer from an I/O
72  * structure.
73  *
74  * The io may have 0 or 1 references depending on who called us.  The
75  * caller is responsible for dealing with the refs.
76  *
77  * This call can only be made when no action is required on the buffer.
78  * HAMMER must own the buffer (released == 0) since we mess around with it.
79  */
80 static void
81 hammer_io_disassociate(hammer_io_structure_t iou, int elseit)
82 {
83 	struct buf *bp = iou->io.bp;
84 
85 	KKASSERT(iou->io.modified == 0);
86 	KKASSERT(LIST_FIRST(&bp->b_dep) == (void *)iou);
87 	buf_dep_init(bp);
88 	iou->io.bp = NULL;
89 
90 	/*
91 	 * If the buffer was locked someone wanted to get rid of it.
92 	 */
93 	if (bp->b_flags & B_LOCKED) {
94 		--hammer_count_io_locked;
95 		bp->b_flags &= ~B_LOCKED;
96 	}
97 
98 	/*
99 	 * elseit is 0 when called from the kernel path, in which case the io
100 	 * might have no references.
101 	 */
102 	if (elseit) {
103 		KKASSERT(iou->io.released == 0);
104 		iou->io.released = 1;
105 		if (iou->io.reclaim)
106 			bp->b_flags |= B_NOCACHE|B_RELBUF;
107 		bqrelse(bp);
108 	} else {
109 		KKASSERT(iou->io.released);
110 	}
111 	iou->io.reclaim = 0;
112 
113 	switch(iou->io.type) {
114 	case HAMMER_STRUCTURE_VOLUME:
115 		iou->volume.ondisk = NULL;
116 		break;
117 	case HAMMER_STRUCTURE_DATA_BUFFER:
118 	case HAMMER_STRUCTURE_META_BUFFER:
119 	case HAMMER_STRUCTURE_UNDO_BUFFER:
120 		iou->buffer.ondisk = NULL;
121 		break;
122 	}
123 }
124 
125 /*
126  * Wait for any physical IO to complete
127  */
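/*
 * Note: the tsleep interlock is re-armed and io->waiting is re-set before
 * io->running is re-tested, so a wakeup() issued from the I/O completion
 * path cannot be lost between the test and the tsleep().
 */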
128 static void
129 hammer_io_wait(hammer_io_t io)
130 {
131 	if (io->running) {
132 		crit_enter();
133 		tsleep_interlock(io);
134 		io->waiting = 1;
135 		for (;;) {
136 			tsleep(io, 0, "hmrflw", 0);
137 			if (io->running == 0)
138 				break;
139 			tsleep_interlock(io);
140 			io->waiting = 1;
141 			if (io->running == 0)
142 				break;
143 		}
144 		crit_exit();
145 	}
146 }
147 
148 /*
149  * Wait for all hammer_io-initiated write I/O's to complete.  This is not
150  * supposed to count direct I/O's but some can leak through (for
151  * non-full-sized direct I/Os).
152  */
153 void
154 hammer_io_wait_all(hammer_mount_t hmp, const char *ident)
155 {
156 	crit_enter();
157 	while (hmp->io_running_space)
158 		tsleep(&hmp->io_running_space, 0, ident, 0);
159 	crit_exit();
160 }
161 
162 #define HAMMER_MAXRA	4
163 
164 /*
165  * Load bp for a HAMMER structure.  The io must be exclusively locked by
166  * the caller.
167  *
168  * This routine is mostly used on meta-data and small-data blocks.  Generally
169  * speaking HAMMER assumes some locality of reference and will cluster
170  * a 64K read.
171  *
172  * Note that clustering occurs at the device layer, not the logical layer.
173  * If the buffers do not apply to the current operation, they may apply to
174  * some other operation.
175  */
176 int
177 hammer_io_read(struct vnode *devvp, struct hammer_io *io, hammer_off_t limit)
178 {
179 	struct buf *bp;
180 	int   error;
181 
182 	if ((bp = io->bp) == NULL) {
183 		hammer_count_io_running_read += io->bytes;
184 #if 1
185 		error = cluster_read(devvp, limit, io->offset, io->bytes,
186 				     HAMMER_CLUSTER_SIZE,
187 				     HAMMER_CLUSTER_BUFS, &io->bp);
188 #else
189 		error = bread(devvp, io->offset, io->bytes, &io->bp);
190 #endif
191 		hammer_count_io_running_read -= io->bytes;
192 		if (error == 0) {
193 			bp = io->bp;
194 			bp->b_ops = &hammer_bioops;
195 			KKASSERT(LIST_FIRST(&bp->b_dep) == NULL);
196 			LIST_INSERT_HEAD(&bp->b_dep, &io->worklist, node);
197 			BUF_KERNPROC(bp);
198 		}
199 		KKASSERT(io->modified == 0);
200 		KKASSERT(io->running == 0);
201 		KKASSERT(io->waiting == 0);
202 		io->released = 0;	/* we hold an active lock on bp */
203 	} else {
204 		error = 0;
205 	}
206 	return(error);
207 }
208 
209 /*
210  * Similar to hammer_io_read() but returns a zero'd out buffer instead.
211  * Must be called with the IO exclusively locked.
212  *
213  * vfs_bio_clrbuf() is kinda nasty; enforce serialization against background
214  * I/O by forcing the buffer to not be in a released state before calling
215  * it.
216  *
217  * This function will also mark the IO as modified but it will not
218  * increment the modify_refs count.
219  */
220 int
221 hammer_io_new(struct vnode *devvp, struct hammer_io *io)
222 {
223 	struct buf *bp;
224 
225 	if ((bp = io->bp) == NULL) {
226 		io->bp = getblk(devvp, io->offset, io->bytes, 0, 0);
227 		bp = io->bp;
228 		bp->b_ops = &hammer_bioops;
229 		KKASSERT(LIST_FIRST(&bp->b_dep) == NULL);
230 		LIST_INSERT_HEAD(&bp->b_dep, &io->worklist, node);
231 		io->released = 0;
232 		KKASSERT(io->running == 0);
233 		io->waiting = 0;
234 		BUF_KERNPROC(bp);
235 	} else {
236 		if (io->released) {
237 			regetblk(bp);
238 			BUF_KERNPROC(bp);
239 			io->released = 0;
240 		}
241 	}
242 	hammer_io_modify(io, 0);
243 	vfs_bio_clrbuf(bp);
244 	return(0);
245 }
246 
247 /*
248  * Remove potential device level aliases against buffers managed by high level
249  * vnodes.
250  */
251 void
252 hammer_io_inval(hammer_volume_t volume, hammer_off_t zone2_offset)
253 {
254 	hammer_io_structure_t iou;
255 	hammer_off_t phys_offset;
256 	struct buf *bp;
257 
258 	phys_offset = volume->ondisk->vol_buf_beg +
259 		      (zone2_offset & HAMMER_OFF_SHORT_MASK);
260 	crit_enter();
261 	if ((bp = findblk(volume->devvp, phys_offset)) != NULL) {
262 		bp = getblk(volume->devvp, phys_offset, bp->b_bufsize, 0, 0);
263 		if ((iou = (void *)LIST_FIRST(&bp->b_dep)) != NULL) {
264 			hammer_io_clear_modify(&iou->io, 1);
265 			bundirty(bp);
266 			iou->io.reclaim = 1;
267 			hammer_io_deallocate(bp);
268 		} else {
269 			KKASSERT((bp->b_flags & B_LOCKED) == 0);
270 			bundirty(bp);
271 			bp->b_flags |= B_NOCACHE|B_RELBUF;
272 			brelse(bp);
273 		}
274 	}
275 	crit_exit();
276 }
277 
278 /*
279  * This routine is called on the last reference to a hammer structure.
280  * The io is usually locked exclusively (but may not be during unmount).
281  *
282  * This routine is responsible for the disposition of the buffer cache
283  * buffer backing the IO.  Only pure-data and undo buffers can be handed
284  * back to the kernel.  Volume and meta-data buffers must be retained
285  * by HAMMER until explicitly flushed by the backend.
286  */
287 void
288 hammer_io_release(struct hammer_io *io, int flush)
289 {
290 	union hammer_io_structure *iou = (void *)io;
291 	struct buf *bp;
292 
293 	if ((bp = io->bp) == NULL)
294 		return;
295 
296 	/*
297 	 * Try to flush a dirty IO to disk if asked to by the
298 	 * caller or if the kernel tried to flush the buffer in the past.
299 	 *
300 	 * Kernel-initiated flushes are only allowed for pure-data buffers.
301  * Meta-data and volume buffers can only be flushed explicitly
302 	 * by HAMMER.
303 	 */
304 	if (io->modified) {
305 		if (flush) {
306 			hammer_io_flush(io);
307 		} else if (bp->b_flags & B_LOCKED) {
308 			switch(io->type) {
309 			case HAMMER_STRUCTURE_DATA_BUFFER:
310 			case HAMMER_STRUCTURE_UNDO_BUFFER:
311 				hammer_io_flush(io);
312 				break;
313 			default:
314 				break;
315 			}
316 		} /* else no explicit request to flush the buffer */
317 	}
318 
319 	/*
320 	 * Wait for the IO to complete if asked to.
321 	 */
322 	if (io->waitdep && io->running) {
323 		hammer_io_wait(io);
324 	}
325 
326 	/*
327 	 * Return control of the buffer to the kernel (with the proviso
328 	 * that our bioops can override kernel decisions with regard to
329 	 * the buffer).
330 	 */
331 	if ((flush || io->reclaim) && io->modified == 0 && io->running == 0) {
332 		/*
333 		 * Always disassociate the bp if an explicit flush
334 		 * was requested and the IO completed with no error
335 		 * (so unmount can really clean up the structure).
336 		 */
337 		if (io->released) {
338 			regetblk(bp);
339 			BUF_KERNPROC(bp);
340 			io->released = 0;
341 		}
342 		hammer_io_disassociate((hammer_io_structure_t)io, 1);
343 	} else if (io->modified) {
344 		/*
345 		 * Only certain IO types can be released to the kernel.
346 		 * Volume and meta-data IO types must be explicitly flushed
347 		 * by HAMMER.
348 		 */
349 		switch(io->type) {
350 		case HAMMER_STRUCTURE_DATA_BUFFER:
351 		case HAMMER_STRUCTURE_UNDO_BUFFER:
352 			if (io->released == 0) {
353 				io->released = 1;
354 				bdwrite(bp);
355 			}
356 			break;
357 		default:
358 			break;
359 		}
360 	} else if (io->released == 0) {
361 		/*
362 		 * Clean buffers can generally be released to the kernel.
363 		 * We leave the bp passively associated with the HAMMER
364 		 * structure and use bioops to disconnect it later on
365 		 * if the kernel wants to discard the buffer.
366 		 */
367 		if (bp->b_flags & B_LOCKED) {
368 			hammer_io_disassociate(iou, 1);
369 		} else {
370 			if (io->reclaim) {
371 				hammer_io_disassociate(iou, 1);
372 			} else {
373 				io->released = 1;
374 				bqrelse(bp);
375 			}
376 		}
377 	} else {
378 		/*
379 		 * A released buffer is passively associated with our
380 		 * hammer_io structure.  The kernel cannot destroy it
381 		 * without making a bioops call.  If the kernel (B_LOCKED)
382 		 * or we (reclaim) requested that the buffer be destroyed
383 		 * we destroy it, otherwise we do a quick get/release to
384 		 * reset its position in the kernel's LRU list.
385 		 *
386 		 * Leaving the buffer passively associated allows us to
387 		 * use the kernel's LRU buffer flushing mechanisms rather
388 		 * than rolling our own.
389 		 *
390 		 * XXX there are two ways of doing this.  We can re-acquire
391 		 * and passively release to reset the LRU, or not.
392 		 */
393 		crit_enter();
394 		if (io->running == 0) {
395 			regetblk(bp);
396 			if ((bp->b_flags & B_LOCKED) || io->reclaim) {
397 				/*regetblk(bp);*/
398 				io->released = 0;
399 				hammer_io_disassociate(iou, 1);
400 			} else {
401 				bqrelse(bp);
402 			}
403 		}
404 		crit_exit();
405 	}
406 }
407 
408 /*
409  * This routine is called with a locked IO when a flush is desired and
410  * no other references to the structure exist other than ours.  This
411  * routine is ONLY called when HAMMER believes it is safe to flush a
412  * potentially modified buffer out.
413  */
414 void
415 hammer_io_flush(struct hammer_io *io)
416 {
417 	struct buf *bp;
418 
419 	/*
420 	 * Degenerate case - nothing to flush if nothing is dirty.
421 	 */
422 	if (io->modified == 0) {
423 		return;
424 	}
425 
426 	KKASSERT(io->bp);
427 	KKASSERT(io->modify_refs <= 0);
428 
429 	/*
430 	 * Acquire ownership of the bp, particularly before we clear our
431 	 * modified flag.
432 	 *
433 	 * We are going to bawrite() this bp.  Don't leave a window where
434 	 * io->released is set; we actually own the bp rather than our
435 	 * buffer.
436 	 */
437 	bp = io->bp;
438 	if (io->released) {
439 		regetblk(bp);
440 		/* BUF_KERNPROC(io->bp); */
441 		/* io->released = 0; */
442 		KKASSERT(io->released);
443 		KKASSERT(io->bp == bp);
444 	}
445 	io->released = 1;
446 
447 	/*
448 	 * Acquire exclusive access to the bp and then clear the modified
449 	 * state of the buffer prior to issuing I/O to interlock any
450 	 * modifications made while the I/O is in progress.  This shouldn't
451 	 * happen anyway but losing data would be worse.  The modified bit
452 	 * will be rechecked after the IO completes.
453 	 *
454 	 * NOTE: This call also finalizes the buffer's content (inval == 0).
455 	 *
456 	 * This is only legal when lock.refs == 1 (otherwise we might clear
457 	 * the modified bit while there are still users of the cluster
458 	 * modifying the data).
459 	 *
460 	 * Do this before potentially blocking so any attempt to modify the
461 	 * ondisk while we are blocked blocks waiting for us.
462 	 */
463 	hammer_io_clear_modify(io, 0);
464 
465 	/*
466 	 * Transfer ownership to the kernel and initiate I/O.
467 	 */
468 	io->running = 1;
469 	io->hmp->io_running_space += io->bytes;
470 	hammer_count_io_running_write += io->bytes;
471 	bawrite(bp);
472 }
473 
474 /************************************************************************
475  *				BUFFER DIRTYING				*
476  ************************************************************************
477  *
478  * These routines deal with dependencies created when IO buffers get
479  * modified.  The caller must call hammer_modify_*() on a referenced
480  * HAMMER structure prior to modifying its on-disk data.
481  *
482  * Any intent to modify an IO buffer acquires the related bp and imposes
483  * various write ordering dependencies.
484  */
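/*
 * Illustrative caller pattern (a sketch only; the element being modified
 * and its name are hypothetical):
 *
 *	hammer_modify_buffer(trans, buffer, &ondisk_elm, sizeof(ondisk_elm));
 *	ondisk_elm = new_value;		(mutate the on-disk image)
 *	hammer_modify_buffer_done(buffer);
 *
 * The modify call records UNDO for the specified range and interlocks
 * against hammer_io_write_interlock(); the _done call drops the
 * modify_refs reference taken by the modify call.
 */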
485 
486 /*
487  * Mark a HAMMER structure as undergoing modification.  Meta-data buffers
488  * are locked until the flusher can deal with them; pure-data buffers
489  * can be written out.
490  */
491 static
492 void
493 hammer_io_modify(hammer_io_t io, int count)
494 {
495 	struct hammer_mount *hmp = io->hmp;
496 
497 	/*
498 	 * io->modify_refs must be >= 0
499 	 */
500 	while (io->modify_refs < 0) {
501 		io->waitmod = 1;
502 		tsleep(io, 0, "hmrmod", 0);
503 	}
504 
505 	/*
506 	 * Shortcut if nothing to do.
507 	 */
508 	KKASSERT(io->lock.refs != 0 && io->bp != NULL);
509 	io->modify_refs += count;
510 	if (io->modified && io->released == 0)
511 		return;
512 
513 	hammer_lock_ex(&io->lock);
514 	if (io->modified == 0) {
515 		KKASSERT(io->mod_list == NULL);
516 		switch(io->type) {
517 		case HAMMER_STRUCTURE_VOLUME:
518 			io->mod_list = &hmp->volu_list;
519 			hmp->locked_dirty_space += io->bytes;
520 			hammer_count_dirtybufspace += io->bytes;
521 			break;
522 		case HAMMER_STRUCTURE_META_BUFFER:
523 			io->mod_list = &hmp->meta_list;
524 			hmp->locked_dirty_space += io->bytes;
525 			hammer_count_dirtybufspace += io->bytes;
526 			break;
527 		case HAMMER_STRUCTURE_UNDO_BUFFER:
528 			io->mod_list = &hmp->undo_list;
529 			break;
530 		case HAMMER_STRUCTURE_DATA_BUFFER:
531 			io->mod_list = &hmp->data_list;
532 			break;
533 		}
534 		TAILQ_INSERT_TAIL(io->mod_list, io, mod_entry);
535 		io->modified = 1;
536 	}
537 	if (io->released) {
538 		regetblk(io->bp);
539 		BUF_KERNPROC(io->bp);
540 		io->released = 0;
541 		KKASSERT(io->modified != 0);
542 	}
543 	hammer_unlock(&io->lock);
544 }
545 
546 static __inline
547 void
548 hammer_io_modify_done(hammer_io_t io)
549 {
550 	KKASSERT(io->modify_refs > 0);
551 	--io->modify_refs;
552 	if (io->modify_refs == 0 && io->waitmod) {
553 		io->waitmod = 0;
554 		wakeup(io);
555 	}
556 }
557 
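/*
 * Acquire the write interlock on an io.  While held, modify_refs is set
 * to -1, causing hammer_io_modify() callers to sleep until
 * hammer_io_done_interlock() restores it to 0.
 */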
558 void
559 hammer_io_write_interlock(hammer_io_t io)
560 {
561 	while (io->modify_refs != 0) {
562 		io->waitmod = 1;
563 		tsleep(io, 0, "hmrmod", 0);
564 	}
565 	io->modify_refs = -1;
566 }
567 
568 void
569 hammer_io_done_interlock(hammer_io_t io)
570 {
571 	KKASSERT(io->modify_refs == -1);
572 	io->modify_refs = 0;
573 	if (io->waitmod) {
574 		io->waitmod = 0;
575 		wakeup(io);
576 	}
577 }
578 
579 /*
580  * Caller intends to modify a volume's ondisk structure.
581  *
582  * This is only allowed if we are the flusher or we have a ref on the
583  * sync_lock.
584  */
585 void
586 hammer_modify_volume(hammer_transaction_t trans, hammer_volume_t volume,
587 		     void *base, int len)
588 {
589 	KKASSERT (trans == NULL || trans->sync_lock_refs > 0);
590 
591 	hammer_io_modify(&volume->io, 1);
592 	if (len) {
593 		intptr_t rel_offset = (intptr_t)base - (intptr_t)volume->ondisk;
594 		KKASSERT((rel_offset & ~(intptr_t)HAMMER_BUFMASK) == 0);
595 		hammer_generate_undo(trans, &volume->io,
596 			 HAMMER_ENCODE_RAW_VOLUME(volume->vol_no, rel_offset),
597 			 base, len);
598 	}
599 }
600 
601 /*
602  * Caller intends to modify a buffer's ondisk structure.
603  *
604  * This is only allowed if we are the flusher or we have a ref on the
605  * sync_lock.
606  */
607 void
608 hammer_modify_buffer(hammer_transaction_t trans, hammer_buffer_t buffer,
609 		     void *base, int len)
610 {
611 	KKASSERT (trans == NULL || trans->sync_lock_refs > 0);
612 
613 	hammer_io_modify(&buffer->io, 1);
614 	if (len) {
615 		intptr_t rel_offset = (intptr_t)base - (intptr_t)buffer->ondisk;
616 		KKASSERT((rel_offset & ~(intptr_t)HAMMER_BUFMASK) == 0);
617 		hammer_generate_undo(trans, &buffer->io,
618 				     buffer->zone2_offset + rel_offset,
619 				     base, len);
620 	}
621 }
622 
623 void
624 hammer_modify_volume_done(hammer_volume_t volume)
625 {
626 	hammer_io_modify_done(&volume->io);
627 }
628 
629 void
630 hammer_modify_buffer_done(hammer_buffer_t buffer)
631 {
632 	hammer_io_modify_done(&buffer->io);
633 }
634 
635 /*
636  * Mark an entity as not being dirty any more and finalize any
637  * delayed adjustments to the buffer.
638  *
639  * Delayed adjustments are an important performance enhancement, allowing
640  * us to avoid recalculating B-Tree node CRCs over and over again when
641  * making bulk-modifications to the B-Tree.
642  *
643  * If inval is non-zero delayed adjustments are ignored.
644  */
645 void
646 hammer_io_clear_modify(struct hammer_io *io, int inval)
647 {
648 	if (io->modified == 0)
649 		return;
650 
651 	/*
652 	 * Take us off the mod-list and clear the modified bit.
653 	 */
654 	KKASSERT(io->mod_list != NULL);
655 	if (io->mod_list == &io->hmp->volu_list ||
656 	    io->mod_list == &io->hmp->meta_list) {
657 		io->hmp->locked_dirty_space -= io->bytes;
658 		hammer_count_dirtybufspace -= io->bytes;
659 	}
660 	TAILQ_REMOVE(io->mod_list, io, mod_entry);
661 	io->mod_list = NULL;
662 	io->modified = 0;
663 
664 	/*
665 	 * If this bit is not set there are no delayed adjustments.
666 	 */
667 	if (io->gencrc == 0)
668 		return;
669 	io->gencrc = 0;
670 
671 	/*
672 	 * Finalize requested CRCs.  The NEEDSCRC flag also holds a reference
673 	 * on the node (& underlying buffer).  Release the node after clearing
674 	 * the flag.
675 	 */
676 	if (io->type == HAMMER_STRUCTURE_META_BUFFER) {
677 		hammer_buffer_t buffer = (void *)io;
678 		hammer_node_t node;
679 
680 restart:
681 		TAILQ_FOREACH(node, &buffer->clist, entry) {
682 			if ((node->flags & HAMMER_NODE_NEEDSCRC) == 0)
683 				continue;
684 			node->flags &= ~HAMMER_NODE_NEEDSCRC;
685 			KKASSERT(node->ondisk);
686 			if (inval == 0)
687 				node->ondisk->crc = crc32(&node->ondisk->crc + 1, HAMMER_BTREE_CRCSIZE);
688 			hammer_rel_node(node);
689 			goto restart;
690 		}
691 	}
692 
693 }
694 
695 /*
696  * Clear the IO's modify list.  Even though the IO is no longer modified
697  * it may still be on the lose_list.  This routine is called just before
698  * the governing hammer_buffer is destroyed.
699  */
700 void
701 hammer_io_clear_modlist(struct hammer_io *io)
702 {
703 	KKASSERT(io->modified == 0);
704 	if (io->mod_list) {
705 		crit_enter();	/* biodone race against list */
706 		KKASSERT(io->mod_list == &io->hmp->lose_list);
707 		TAILQ_REMOVE(io->mod_list, io, mod_entry);
708 		io->mod_list = NULL;
709 		crit_exit();
710 	}
711 }
712 
713 /************************************************************************
714  *				HAMMER_BIOOPS				*
715  ************************************************************************
716  *
717  */
718 
719 /*
720  * Pre-IO initiation kernel callback - cluster build only
721  */
722 static void
723 hammer_io_start(struct buf *bp)
724 {
725 }
726 
727 /*
728  * Post-IO completion kernel callback - MAY BE CALLED FROM INTERRUPT!
729  *
730  * NOTE: HAMMER may modify a buffer after initiating I/O.  The modified bit
731  * may also be set if we were marking a cluster header open.  Only remove
732  * our dependency if the modified bit is clear.
733  */
734 static void
735 hammer_io_complete(struct buf *bp)
736 {
737 	union hammer_io_structure *iou = (void *)LIST_FIRST(&bp->b_dep);
738 
739 	KKASSERT(iou->io.released == 1);
740 
741 	/*
742 	 * Deal with people waiting for I/O to drain
743 	 */
744 	if (iou->io.running) {
745 		hammer_count_io_running_write -= iou->io.bytes;
746 		iou->io.hmp->io_running_space -= iou->io.bytes;
747 		if (iou->io.hmp->io_running_space == 0)
748 			wakeup(&iou->io.hmp->io_running_space);
749 		KKASSERT(iou->io.hmp->io_running_space >= 0);
750 		iou->io.running = 0;
751 	}
752 
753 	if (iou->io.waiting) {
754 		iou->io.waiting = 0;
755 		wakeup(iou);
756 	}
757 
758 	/*
759 	 * If B_LOCKED is set someone wanted to deallocate the bp at some
760 	 * point; do it now if refs has become zero.
761 	 */
762 	if ((bp->b_flags & B_LOCKED) && iou->io.lock.refs == 0) {
763 		KKASSERT(iou->io.modified == 0);
764 		--hammer_count_io_locked;
765 		bp->b_flags &= ~B_LOCKED;
766 		hammer_io_deallocate(bp);
767 		/* structure may be dead now */
768 	}
769 }
770 
771 /*
772  * Callback from kernel when it wishes to deallocate a passively
773  * associated structure.  This mostly occurs with clean buffers
774  * but it may be possible for a holding structure to be marked dirty
775  * while its buffer is passively associated.  The caller owns the bp.
776  *
777  * If we cannot disassociate we set B_LOCKED to prevent the buffer
778  * from getting reused.
779  *
780  * WARNING: Because this can be called directly by getnewbuf we cannot
781  * recurse into the tree.  If a bp cannot be immediately disassociated
782  * our only recourse is to set B_LOCKED.
783  *
784  * WARNING: This may be called from an interrupt via hammer_io_complete()
785  */
786 static void
787 hammer_io_deallocate(struct buf *bp)
788 {
789 	hammer_io_structure_t iou = (void *)LIST_FIRST(&bp->b_dep);
790 
791 	KKASSERT((bp->b_flags & B_LOCKED) == 0 && iou->io.running == 0);
792 	if (iou->io.lock.refs > 0 || iou->io.modified) {
793 		/*
794 		 * It is not legal to disassociate a modified buffer.  This
795 		 * case really shouldn't ever occur.
796 		 */
797 		bp->b_flags |= B_LOCKED;
798 		++hammer_count_io_locked;
799 	} else {
800 		/*
801 		 * Disassociate the BP.  If the io has no refs left we
802 		 * have to add it to the loose list.
803 		 */
804 		hammer_io_disassociate(iou, 0);
805 		if (iou->io.bp == NULL &&
806 		    iou->io.type != HAMMER_STRUCTURE_VOLUME) {
807 			KKASSERT(iou->io.mod_list == NULL);
808 			crit_enter();	/* biodone race against list */
809 			iou->io.mod_list = &iou->io.hmp->lose_list;
810 			TAILQ_INSERT_TAIL(iou->io.mod_list, &iou->io, mod_entry);
811 			crit_exit();
812 		}
813 	}
814 }
815 
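/*
 * Per-buffer fsync bioops hook.  Not used by HAMMER; always succeeds.
 */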
816 static int
817 hammer_io_fsync(struct vnode *vp)
818 {
819 	return(0);
820 }
821 
822 /*
823  * NOTE: will not be called unless we tell the kernel about the
824  * bioops.  Unused... we use the mount's VFS_SYNC instead.
825  */
826 static int
827 hammer_io_sync(struct mount *mp)
828 {
829 	return(0);
830 }
831 
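/*
 * Dependency-migration bioops hook.  HAMMER does not migrate its worklist
 * dependencies between buffers, so this is a no-op.
 */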
832 static void
833 hammer_io_movedeps(struct buf *bp1, struct buf *bp2)
834 {
835 }
836 
837 /*
838  * I/O pre-check for reading and writing.  HAMMER only uses this for
839  * B_CACHE buffers so checkread just shouldn't happen, but if it does,
840  * allow it.
841  *
842  * Writing is a different case.  We don't want the kernel to try to write
843  * out a buffer that HAMMER may be modifying passively or which has a
844  * dependency.  In addition, kernel-demanded writes can only proceed for
845  * certain types of buffers (i.e. UNDO and DATA types).  Other dirty
846  * buffer types can only be explicitly written by the flusher.
847  *
848  * checkwrite will only be called for bdwrite()n buffers.  If we return
849  * success the kernel is guaranteed to initiate the buffer write.
850  */
851 static int
852 hammer_io_checkread(struct buf *bp)
853 {
854 	return(0);
855 }
856 
857 static int
858 hammer_io_checkwrite(struct buf *bp)
859 {
860 	hammer_io_t io = (void *)LIST_FIRST(&bp->b_dep);
861 
862 	/*
863 	 * This shouldn't happen under normal operation.
864 	 */
865 	if (io->type == HAMMER_STRUCTURE_VOLUME ||
866 	    io->type == HAMMER_STRUCTURE_META_BUFFER) {
867 		if (!panicstr)
868 			panic("hammer_io_checkwrite: illegal buffer");
869 		if ((bp->b_flags & B_LOCKED) == 0) {
870 			bp->b_flags |= B_LOCKED;
871 			++hammer_count_io_locked;
872 		}
873 		return(1);
874 	}
875 
876 	/*
877 	 * We can only clear the modified bit if the IO is not currently
878 	 * undergoing modification.  Otherwise we may miss changes.
879 	 */
880 	if (io->modify_refs == 0 && io->modified)
881 		hammer_io_clear_modify(io, 0);
882 
883 	/*
884 	 * The kernel is going to start the IO, set io->running.
885 	 */
886 	KKASSERT(io->running == 0);
887 	io->running = 1;
888 	io->hmp->io_running_space += io->bytes;
889 	hammer_count_io_running_write += io->bytes;
890 	return(0);
891 }
892 
893 /*
894  * Return non-zero if we wish to delay the kernel's attempt to flush
895  * this buffer to disk.
896  */
897 static int
898 hammer_io_countdeps(struct buf *bp, int n)
899 {
900 	return(0);
901 }
902 
903 struct bio_ops hammer_bioops = {
904 	.io_start	= hammer_io_start,
905 	.io_complete	= hammer_io_complete,
906 	.io_deallocate	= hammer_io_deallocate,
907 	.io_fsync	= hammer_io_fsync,
908 	.io_sync	= hammer_io_sync,
909 	.io_movedeps	= hammer_io_movedeps,
910 	.io_countdeps	= hammer_io_countdeps,
911 	.io_checkread	= hammer_io_checkread,
912 	.io_checkwrite	= hammer_io_checkwrite,
913 };
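/*
 * These ops take effect on any device buffer whose b_ops points at
 * hammer_bioops and whose b_dep list heads the io's worklist; the
 * association is established in hammer_io_read() and hammer_io_new().
 */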
914 
915 /************************************************************************
916  *				DIRECT IO OPS 				*
917  ************************************************************************
918  *
919  * These functions operate directly on the buffer cache buffer associated
920  * with a front-end vnode rather than a back-end device vnode.
921  */
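/*
 * Offset layering used below: hammer_io_direct_read() receives a bio
 * already resolved to a zone-2 offset and pushes one bio level to obtain
 * the raw per-volume offset (vol_buf_beg + short offset), while
 * hammer_io_direct_write(), for full-sized writes, starts from the leaf's
 * zone-X data_offset and pushes two levels (zone-2, then raw) before
 * calling vn_strategy().
 */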
922 
923 /*
924  * Read a buffer associated with a front-end vnode directly from the
925  * disk media.  The bio may be issued asynchronously.
926  *
927  * A second-level bio, already resolved to a zone-2 offset (typically by
928  * the BMAP code or by a previous hammer_io_direct_write()), is passed.
929  */
930 int
931 hammer_io_direct_read(hammer_mount_t hmp, struct bio *bio)
932 {
933 	hammer_off_t zone2_offset;
934 	hammer_volume_t volume;
935 	struct buf *bp;
936 	struct bio *nbio;
937 	int vol_no;
938 	int error;
939 
940 	zone2_offset = bio->bio_offset;
941 
942 	KKASSERT((zone2_offset & HAMMER_OFF_ZONE_MASK) ==
943 		 HAMMER_ZONE_RAW_BUFFER);
944 
945 	vol_no = HAMMER_VOL_DECODE(zone2_offset);
946 	volume = hammer_get_volume(hmp, vol_no, &error);
947 	if (error == 0 && zone2_offset >= volume->maxbuf_off)
948 		error = EIO;
949 
950 	/*
951 	 * Third level bio - raw offset specific to the
952 	 * correct volume.
953 	 */
954 	if (error == 0) {
955 		zone2_offset &= HAMMER_OFF_SHORT_MASK;
956 
957 		nbio = push_bio(bio);
958 		nbio->bio_offset = volume->ondisk->vol_buf_beg +
959 				   zone2_offset;
960 		vn_strategy(volume->devvp, nbio);
961 	}
962 	hammer_rel_volume(volume, 0);
963 
964 	if (error) {
965 		kprintf("hammer_direct_read: failed @ %016llx\n",
966 			zone2_offset);
967 		bp = bio->bio_buf;
968 		bp->b_error = error;
969 		bp->b_flags |= B_ERROR;
970 		biodone(bio);
971 	}
972 	return(error);
973 }
974 
975 /*
976  * Write a buffer associated with a front-end vnode directly to the
977  * disk media.  The bio may be issued asynchronously.
978  */
979 int
980 hammer_io_direct_write(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf,
981 		       struct bio *bio)
982 {
983 	hammer_off_t buf_offset;
984 	hammer_off_t zone2_offset;
985 	hammer_volume_t volume;
986 	hammer_buffer_t buffer;
987 	struct buf *bp;
988 	struct bio *nbio;
989 	char *ptr;
990 	int vol_no;
991 	int error;
992 
993 	buf_offset = leaf->data_offset;
994 
995 	KKASSERT(buf_offset > HAMMER_ZONE_BTREE);
996 	KKASSERT(bio->bio_buf->b_cmd == BUF_CMD_WRITE);
997 
998 	if ((buf_offset & HAMMER_BUFMASK) == 0 &&
999 	    leaf->data_len >= HAMMER_BUFSIZE) {
1000 		/*
1001 		 * We are using the vnode's bio to write directly to the
1002 		 * media; any hammer_buffer at the same zone-X offset will
1003 		 * now have stale data.
1004 		 */
1005 		zone2_offset = hammer_blockmap_lookup(hmp, buf_offset, &error);
1006 		vol_no = HAMMER_VOL_DECODE(zone2_offset);
1007 		volume = hammer_get_volume(hmp, vol_no, &error);
1008 
1009 		if (error == 0 && zone2_offset >= volume->maxbuf_off)
1010 			error = EIO;
1011 		if (error == 0) {
1012 			bp = bio->bio_buf;
1013 			KKASSERT((bp->b_bufsize & HAMMER_BUFMASK) == 0);
1014 			hammer_del_buffers(hmp, buf_offset,
1015 					   zone2_offset, bp->b_bufsize);
1016 			/*
1017 			 * Second level bio - cached zone2 offset.
1018 			 */
1019 			nbio = push_bio(bio);
1020 			nbio->bio_offset = zone2_offset;
1021 
1022 			/*
1023 			 * Third level bio - raw offset specific to the
1024 			 * correct volume.
1025 			 */
1026 			zone2_offset &= HAMMER_OFF_SHORT_MASK;
1027 			nbio = push_bio(nbio);
1028 			nbio->bio_offset = volume->ondisk->vol_buf_beg +
1029 					   zone2_offset;
1030 			vn_strategy(volume->devvp, nbio);
1031 		}
1032 		hammer_rel_volume(volume, 0);
1033 	} else {
1034 		/* must fit in a standard HAMMER buffer */
1035 		KKASSERT(((buf_offset ^ (buf_offset + leaf->data_len - 1)) & ~HAMMER_BUFMASK64) == 0);
1036 		buffer = NULL;
1037 		ptr = hammer_bread(hmp, buf_offset, &error, &buffer);
1038 		if (error == 0) {
1039 			bp = bio->bio_buf;
1040 			bp->b_flags |= B_AGE;
1041 			hammer_io_modify(&buffer->io, 1);
1042 			bcopy(bp->b_data, ptr, leaf->data_len);
1043 			hammer_io_modify_done(&buffer->io);
1044 			hammer_rel_buffer(buffer, 0);
1045 			bp->b_resid = 0;
1046 			biodone(bio);
1047 		}
1048 	}
1049 	if (error) {
1050 		kprintf("hammer_direct_write: failed @ %016llx\n",
1051 			leaf->data_offset);
1052 		bp = bio->bio_buf;
1053 		bp->b_resid = 0;
1054 		bp->b_error = EIO;
1055 		bp->b_flags |= B_ERROR;
1056 		biodone(bio);
1057 	}
1058 	return(error);
1059 }
1060 
1061 /*
1062  * This is called to remove the second-level cached zone-2 offset from
1063  * frontend buffer cache buffers, now stale due to a data relocation.
1064  * These offsets are generated by cluster_read() via VOP_BMAP, or directly
1065  * by hammer_vop_strategy_read().
1066  *
1067  * This is rather nasty because here we have something like the reblocker
1068  * scanning the raw B-Tree with no held references on anything, really,
1069  * other than a shared lock on the B-Tree node, and we have to access the
1070  * frontend's buffer cache to check for and clean out the association.
1071  * Specifically, if the reblocker is moving data on the disk, these cached
1072  * offsets will become invalid.
1073  *
1074  * Only data record types associated with the large-data zone are subject
1075  * to direct-io and need to be checked.
1076  *
1077  */
1078 void
1079 hammer_io_direct_uncache(hammer_mount_t hmp, hammer_btree_leaf_elm_t leaf)
1080 {
1081 	struct hammer_inode_info iinfo;
1082 	int zone;
1083 
1084 	if (leaf->base.rec_type != HAMMER_RECTYPE_DATA)
1085 		return;
1086 	zone = HAMMER_ZONE_DECODE(leaf->data_offset);
1087 	if (zone != HAMMER_ZONE_LARGE_DATA_INDEX)
1088 		return;
1089 	iinfo.obj_id = leaf->base.obj_id;
1090 	iinfo.obj_asof = 0;	/* unused */
1091 	iinfo.obj_localization = leaf->base.localization &
1092 				 HAMMER_LOCALIZE_PSEUDOFS_MASK;
1093 	iinfo.u.leaf = leaf;
1094 	hammer_scan_inode_snapshots(hmp, &iinfo,
1095 				    hammer_io_direct_uncache_callback,
1096 				    leaf);
1097 }
1098 
1099 static int
1100 hammer_io_direct_uncache_callback(hammer_inode_t ip, void *data)
1101 {
1102 	hammer_inode_info_t iinfo = data;
1103 	hammer_off_t data_offset;
1104 	hammer_off_t file_offset;
1105 	struct vnode *vp;
1106 	struct buf *bp;
1107 	int blksize;
1108 
1109 	if (ip->vp == NULL)
1110 		return(0);
1111 	data_offset = iinfo->u.leaf->data_offset;
1112 	file_offset = iinfo->u.leaf->base.key - iinfo->u.leaf->data_len;
1113 	blksize = iinfo->u.leaf->data_len;
1114 	KKASSERT((blksize & HAMMER_BUFMASK) == 0);
1115 
1116 	hammer_ref(&ip->lock);
1117 	if (hammer_get_vnode(ip, &vp) == 0) {
1118 		if ((bp = findblk(ip->vp, file_offset)) != NULL &&
1119 		    bp->b_bio2.bio_offset != NOOFFSET) {
1120 			bp = getblk(ip->vp, file_offset, blksize, 0, 0);
1121 			bp->b_bio2.bio_offset = NOOFFSET;
1122 			brelse(bp);
1123 		}
1124 		vput(vp);
1125 	}
1126 	hammer_rel_inode(ip, 0);
1127 	return(0);
1128 }
1129 
1130