/*	$NetBSD: lfs_bio.c,v 1.109 2008/02/15 13:30:56 ad Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2001, 2002, 2003, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Konrad E. Schroder <perseant@hhhh.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
/*
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)lfs_bio.c	8.10 (Berkeley) 6/10/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: lfs_bio.c,v 1.109 2008/02/15 13:30:56 ad Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/resourcevar.h>
#include <sys/mount.h>
#include <sys/kernel.h>
#include <sys/kauth.h>

#include <ufs/ufs/inode.h>
#include <ufs/ufs/ufsmount.h>
#include <ufs/ufs/ufs_extern.h>

#include <ufs/lfs/lfs.h>
#include <ufs/lfs/lfs_extern.h>

#include <uvm/uvm.h>

/*
 * LFS block write function.
 *
 * XXX
 * No write cost accounting is done.
 * This is almost certainly wrong for synchronous operations and NFS.
 *
 * protected by lfs_lock.
 */
int	locked_queue_count   = 0;	/* Count of locked-down buffers. */
long	locked_queue_bytes   = 0L;	/* Total size of locked buffers. */
int	lfs_subsys_pages     = 0L;	/* Total number of LFS-written pages */
int	lfs_fs_pagetrip	     = 0;	/* # of pages to trip per-fs write */
int	lfs_writing	     = 0;	/* Set if already kicked off a writer
					   because of buffer space */

/* Lock and condition variables for above. */
kcondvar_t	locked_queue_cv;
kcondvar_t	lfs_writing_cv;
kmutex_t	lfs_lock;

extern int lfs_dostats;

/*
 * reserved number/bytes of locked buffers
 */
int locked_queue_rcount = 0;
long locked_queue_rbytes = 0L;

int lfs_fits_buf(struct lfs *, int, int);
int lfs_reservebuf(struct lfs *, struct vnode *vp, struct vnode *vp2,
    int, int);
int lfs_reserveavail(struct lfs *, struct vnode *vp, struct vnode *vp2, int);

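/*
 * Return true if n more locked-down buffers totaling "bytes" bytes would
 * still fit under the LFS_WAIT_BUFS / LFS_WAIT_BYTES thresholds, counting
 * both the buffers already locked and those merely reserved.
 *
 * Called with lfs_lock held.
 */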
int
lfs_fits_buf(struct lfs *fs, int n, int bytes)
{
	int count_fit, bytes_fit;

	ASSERT_NO_SEGLOCK(fs);
	KASSERT(mutex_owned(&lfs_lock));

	count_fit =
	    (locked_queue_count + locked_queue_rcount + n < LFS_WAIT_BUFS);
	bytes_fit =
	    (locked_queue_bytes + locked_queue_rbytes + bytes < LFS_WAIT_BYTES);

#ifdef DEBUG
	if (!count_fit) {
		DLOG((DLOG_AVAIL, "lfs_fits_buf: no fit count: %d + %d + %d >= %d\n",
		      locked_queue_count, locked_queue_rcount,
		      n, LFS_WAIT_BUFS));
	}
	if (!bytes_fit) {
		DLOG((DLOG_AVAIL, "lfs_fits_buf: no fit bytes: %ld + %ld + %d >= %ld\n",
		      locked_queue_bytes, locked_queue_rbytes,
		      bytes, LFS_WAIT_BYTES));
	}
#endif /* DEBUG */

	return (count_fit && bytes_fit);
}

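/*
 * Reserve (n > 0) or release (n < 0) room for n buffer headers and
 * "bytes" bytes in the locked-buffer accounting.  If the reservation
 * does not fit, kick off a flush and wait for locked buffers to drain.
 */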
/* ARGSUSED */
int
lfs_reservebuf(struct lfs *fs, struct vnode *vp,
    struct vnode *vp2, int n, int bytes)
{
	ASSERT_MAYBE_SEGLOCK(fs);
	KASSERT(locked_queue_rcount >= 0);
	KASSERT(locked_queue_rbytes >= 0);

	mutex_enter(&lfs_lock);
	while (n > 0 && !lfs_fits_buf(fs, n, bytes)) {
		int error;

		lfs_flush(fs, 0, 0);

		error = cv_timedwait_sig(&locked_queue_cv, &lfs_lock,
		    hz * LFS_BUFWAIT);
		if (error && error != EWOULDBLOCK) {
			mutex_exit(&lfs_lock);
			return error;
		}
	}

	locked_queue_rcount += n;
	locked_queue_rbytes += bytes;

	mutex_exit(&lfs_lock);

	KASSERT(locked_queue_rcount >= 0);
	KASSERT(locked_queue_rbytes >= 0);

	return 0;
}

/*
 * Try to reserve some blocks, prior to performing a sensitive operation that
 * requires the vnode lock to be honored.  If there is not enough space, give
 * up the vnode lock temporarily and wait for the space to become available.
 *
 * Called with vp locked.  (Note however that if fsb < 0, vp is ignored.)
 *
 * XXX YAMT - it isn't safe to unlock vp here
 * because the node might be modified while we sleep.
 * (eg. cached states like i_offset might be stale,
 *  the vnode might be truncated, etc..)
 * maybe we should have a way to restart the vnodeop (EVOPRESTART?)
 * or rearrange the vnodeop interface to leave vnode locking to file system
 * specific code so that each file system can have its own vnode locking and
 * vnode re-using strategies.
 */
int
lfs_reserveavail(struct lfs *fs, struct vnode *vp,
    struct vnode *vp2, int fsb)
{
	CLEANERINFO *cip;
	struct buf *bp;
	int error, slept;

	ASSERT_MAYBE_SEGLOCK(fs);
	slept = 0;
	mutex_enter(&lfs_lock);
	while (fsb > 0 && !lfs_fits(fs, fsb + fs->lfs_ravail + fs->lfs_favail)) {
		mutex_exit(&lfs_lock);
#if 0
		/*
		 * XXX ideally, we should unlock the vnodes here
		 * because we might sleep for a very long time.
		 */
		VOP_UNLOCK(vp, 0);
		if (vp2 != NULL) {
			VOP_UNLOCK(vp2, 0);
		}
#else
		/*
		 * XXX since we'll sleep waiting for the cleaner while holding
		 * the vnode lock, a deadlock will occur if the cleaner tries
		 * to lock the vnode.
		 * (eg. lfs_markv -> lfs_fastvget -> getnewvnode -> vclean)
		 */
#endif

		if (!slept) {
			DLOG((DLOG_AVAIL, "lfs_reserve: waiting for %ld (bfree = %d,"
			      " est_bfree = %d)\n",
			      fsb + fs->lfs_ravail + fs->lfs_favail,
			      fs->lfs_bfree, LFS_EST_BFREE(fs)));
		}
		++slept;

		/* Wake up the cleaner */
		LFS_CLEANERINFO(cip, fs, bp);
		LFS_SYNC_CLEANERINFO(cip, fs, bp, 0);
		lfs_wakeup_cleaner(fs);

		mutex_enter(&lfs_lock);
		/* Cleaner might have run while we were reading, check again */
		if (lfs_fits(fs, fsb + fs->lfs_ravail + fs->lfs_favail))
			break;

		error = mtsleep(&fs->lfs_avail, PCATCH | PUSER, "lfs_reserve",
				0, &lfs_lock);
#if 0
		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* XXX use lockstatus */
		vn_lock(vp2, LK_EXCLUSIVE | LK_RETRY); /* XXX use lockstatus */
#endif
		if (error) {
			mutex_exit(&lfs_lock);
			return error;
		}
	}
#ifdef DEBUG
	if (slept) {
		DLOG((DLOG_AVAIL, "lfs_reserve: woke up\n"));
	}
#endif
	fs->lfs_ravail += fsb;
	mutex_exit(&lfs_lock);

	return 0;
}

#ifdef DIAGNOSTIC
int lfs_rescount;
int lfs_rescountdirop;
#endif

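/*
 * Reserve (fsb > 0) or release (fsb < 0) both segment space and buffer
 * space for an operation on vp and, optionally, vp2.  Vnodes involved in
 * a dirop, and the vnode currently being unlocked by the segment writer,
 * are exempt from the accounting.
 */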
int
lfs_reserve(struct lfs *fs, struct vnode *vp, struct vnode *vp2, int fsb)
{
	int error;
	int cantwait;

	ASSERT_MAYBE_SEGLOCK(fs);
	if (vp2) {
		/* Make sure we're not in the process of reclaiming vp2 */
		mutex_enter(&lfs_lock);
		while (fs->lfs_flags & LFS_UNDIROP) {
			mtsleep(&fs->lfs_flags, PRIBIO + 1, "lfsrundirop", 0,
			    &lfs_lock);
		}
		mutex_exit(&lfs_lock);
	}

	KASSERT(fsb < 0 || VOP_ISLOCKED(vp));
	KASSERT(vp2 == NULL || fsb < 0 || VOP_ISLOCKED(vp2));
	KASSERT(vp2 == NULL || !(VTOI(vp2)->i_flag & IN_ADIROP));
	KASSERT(vp2 == NULL || vp2 != fs->lfs_unlockvp);

	cantwait = (VTOI(vp)->i_flag & IN_ADIROP) || fs->lfs_unlockvp == vp;
#ifdef DIAGNOSTIC
	if (cantwait) {
		if (fsb > 0)
			lfs_rescountdirop++;
		else if (fsb < 0)
			lfs_rescountdirop--;
		if (lfs_rescountdirop < 0)
			panic("lfs_rescountdirop");
	} else {
		if (fsb > 0)
			lfs_rescount++;
		else if (fsb < 0)
			lfs_rescount--;
		if (lfs_rescount < 0)
			panic("lfs_rescount");
	}
#endif
	if (cantwait)
		return 0;

	/*
	 * XXX
	 * vref vnodes here so that cleaner doesn't try to reuse them.
	 * (see XXX comment in lfs_reserveavail)
	 */
	mutex_enter(&vp->v_interlock);
	lfs_vref(vp);
	if (vp2 != NULL) {
		mutex_enter(&vp2->v_interlock);
		lfs_vref(vp2);
	}

	error = lfs_reserveavail(fs, vp, vp2, fsb);
	if (error)
		goto done;

	/*
	 * XXX just a guess. should be more precise.
	 */
	error = lfs_reservebuf(fs, vp, vp2,
	    fragstoblks(fs, fsb), fsbtob(fs, fsb));
	if (error)
		lfs_reserveavail(fs, vp, vp2, -fsb);

done:
	lfs_vunref(vp);
	if (vp2 != NULL) {
		lfs_vunref(vp2);
	}

	return error;
}

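/*
 * VOP_BWRITE entry point for LFS.  Asynchronous writes of LFS buffers
 * are not allowed on a read-write filesystem, so this sanity-checks and
 * hands the buffer to lfs_bwrite_ext().
 */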
int
lfs_bwrite(void *v)
{
	struct vop_bwrite_args /* {
		struct buf *a_bp;
	} */ *ap = v;
	struct buf *bp = ap->a_bp;

#ifdef DIAGNOSTIC
	if (VTOI(bp->b_vp)->i_lfs->lfs_ronly == 0 && (bp->b_flags & B_ASYNC)) {
		panic("bawrite LFS buffer");
	}
#endif /* DIAGNOSTIC */
	return lfs_bwrite_ext(bp, 0);
}

/*
 * Determine if there is enough room currently available to write fsb
 * blocks.  We need enough blocks for the new blocks, the current
 * inode blocks (including potentially the ifile inode), a summary block,
 * and the segment usage table, plus an ifile block.
 */
int
lfs_fits(struct lfs *fs, int fsb)
{
	int needed;

	ASSERT_NO_SEGLOCK(fs);
	needed = fsb + btofsb(fs, fs->lfs_sumsize) +
		 ((howmany(fs->lfs_uinodes + 1, INOPB(fs)) + fs->lfs_segtabsz +
		   1) << (fs->lfs_blktodb - fs->lfs_fsbtodb));

	if (needed >= fs->lfs_avail) {
#ifdef DEBUG
		DLOG((DLOG_AVAIL, "lfs_fits: no fit: fsb = %ld, uinodes = %ld, "
		      "needed = %ld, avail = %ld\n",
		      (long)fsb, (long)fs->lfs_uinodes, (long)needed,
		      (long)fs->lfs_avail));
#endif
		return 0;
	}
	return 1;
}

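/*
 * Wait until fsb blocks fit in the space currently available on disk,
 * waking the cleaner as necessary.  Writes done on behalf of the cleaner
 * or a forced checkpoint are pushed through regardless, since the cleaner
 * is what makes more space available.
 */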
int
lfs_availwait(struct lfs *fs, int fsb)
{
	int error;
	CLEANERINFO *cip;
	struct buf *cbp;

	ASSERT_NO_SEGLOCK(fs);
	/* Push cleaner blocks through regardless */
	mutex_enter(&lfs_lock);
	if (LFS_SEGLOCK_HELD(fs) &&
	    fs->lfs_sp->seg_flags & (SEGM_CLEAN | SEGM_FORCE_CKP)) {
		mutex_exit(&lfs_lock);
		return 0;
	}
	mutex_exit(&lfs_lock);

	while (!lfs_fits(fs, fsb)) {
		/*
		 * Out of space, need cleaner to run.
		 * Update the cleaner info, then wake it up.
		 * Note the cleanerinfo block is on the ifile
		 * so it CANT_WAIT.
		 */
		LFS_CLEANERINFO(cip, fs, cbp);
		LFS_SYNC_CLEANERINFO(cip, fs, cbp, 0);

#ifdef DEBUG
		DLOG((DLOG_AVAIL, "lfs_availwait: out of available space, "
		      "waiting on cleaner\n"));
#endif

		lfs_wakeup_cleaner(fs);
#ifdef DIAGNOSTIC
		if (LFS_SEGLOCK_HELD(fs))
			panic("lfs_availwait: deadlock");
#endif
		error = tsleep(&fs->lfs_avail, PCATCH | PUSER, "cleaner", 0);
		if (error)
			return (error);
	}
	return 0;
}

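/*
 * Write a buffer the LFS way: charge its space against lfs_avail, mark
 * the inode and buffer dirty, and leave the buffer on the locked queue
 * for the segment writer rather than writing it immediately.  On a
 * read-only or "already unmounted" filesystem the buffer is released
 * without being written.
 */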
int
lfs_bwrite_ext(struct buf *bp, int flags)
{
	struct lfs *fs;
	struct inode *ip;
	struct vnode *vp;
	int fsb;

	vp = bp->b_vp;
	fs = VFSTOUFS(vp->v_mount)->um_lfs;

	ASSERT_MAYBE_SEGLOCK(fs);
	KASSERT(bp->b_cflags & BC_BUSY);
	KASSERT(flags & BW_CLEAN || !LFS_IS_MALLOC_BUF(bp));
	KASSERT(((bp->b_oflags | bp->b_flags) & (BO_DELWRI|B_LOCKED))
	    != BO_DELWRI);

	/*
	 * Don't write *any* blocks if we're mounted read-only, or
	 * if we are "already unmounted".
	 *
	 * In particular the cleaner can't write blocks either.
	 */
	if (fs->lfs_ronly || (fs->lfs_pflags & LFS_PF_CLEAN)) {
		bp->b_oflags &= ~BO_DELWRI;
		bp->b_flags |= B_READ;
		bp->b_error = 0;
		mutex_enter(&bufcache_lock);
		LFS_UNLOCK_BUF(bp);
		if (LFS_IS_MALLOC_BUF(bp))
			bp->b_cflags &= ~BC_BUSY;
		else
			brelsel(bp, 0);
		mutex_exit(&bufcache_lock);
		return (fs->lfs_ronly ? EROFS : 0);
	}

	/*
	 * Set the delayed write flag and use reassignbuf to move the buffer
	 * from the clean list to the dirty one.
	 *
	 * Set the B_LOCKED flag and unlock the buffer, causing brelse to move
	 * the buffer onto the LOCKED free list.  This is necessary, otherwise
	 * getnewbuf() would try to reclaim the buffers using bawrite, which
	 * isn't going to work.
	 *
	 * XXX we don't let meta-data writes run out of space because they can
	 * come from the segment writer.  We need to make sure that there is
	 * enough space reserved so that there's room to write meta-data
	 * blocks.
	 */
	if ((bp->b_flags & B_LOCKED) == 0) {
		fsb = fragstofsb(fs, numfrags(fs, bp->b_bcount));

		ip = VTOI(vp);
		mutex_enter(&lfs_lock);
		if (flags & BW_CLEAN) {
			LFS_SET_UINO(ip, IN_CLEANING);
		} else {
			LFS_SET_UINO(ip, IN_MODIFIED);
		}
		mutex_exit(&lfs_lock);
		fs->lfs_avail -= fsb;

		mutex_enter(&bufcache_lock);
		mutex_enter(&vp->v_interlock);
		bp->b_oflags = (bp->b_oflags | BO_DELWRI) & ~BO_DONE;
		LFS_LOCK_BUF(bp);
		bp->b_flags &= ~B_READ;
		bp->b_error = 0;
		reassignbuf(bp, bp->b_vp);
		mutex_exit(&vp->v_interlock);
	} else {
		mutex_enter(&bufcache_lock);
	}

	if (bp->b_iodone != NULL)
		bp->b_cflags &= ~BC_BUSY;
	else
		brelsel(bp, 0);
	mutex_exit(&bufcache_lock);

	return (0);
}

/*
 * Called and returns with the lfs_lock held.
 */
void
lfs_flush_fs(struct lfs *fs, int flags)
{
	ASSERT_NO_SEGLOCK(fs);
	KASSERT(mutex_owned(&lfs_lock));
	if (fs->lfs_ronly)
		return;

	if (lfs_dostats)
		++lfs_stats.flush_invoked;

	mutex_exit(&lfs_lock);
	lfs_writer_enter(fs, "fldirop");
	lfs_segwrite(fs->lfs_ivnode->v_mount, flags);
	lfs_writer_leave(fs);
	mutex_enter(&lfs_lock);
	fs->lfs_favail = 0; /* XXX */
}

/*
 * This routine initiates segment writes when LFS is consuming too many
 * resources.  Ideally the pageout daemon would be able to direct LFS
 * more subtly.
 * XXX We have one static count of locked buffers;
 * XXX need to think more about the multiple filesystem case.
 *
 * Called and returns with lfs_lock held.
 * If fs != NULL, the caller must not hold the segment lock for fs.
 */
void
lfs_flush(struct lfs *fs, int flags, int only_onefs)
{
	extern u_int64_t locked_fakequeue_count;
	struct mount *mp, *nmp;
	struct lfs *tfs;

	KASSERT(mutex_owned(&lfs_lock));
	KDASSERT(fs == NULL || !LFS_SEGLOCK_HELD(fs));

	if (lfs_dostats)
		++lfs_stats.write_exceeded;
	/* XXX should we include SEGM_CKP here? */
	if (lfs_writing && !(flags & SEGM_SYNC)) {
		DLOG((DLOG_FLUSH, "lfs_flush: not flushing because another flush is active\n"));
		return;
	}
	while (lfs_writing)
		cv_wait(&lfs_writing_cv, &lfs_lock);
	lfs_writing = 1;

	mutex_exit(&lfs_lock);

	if (only_onefs) {
		KASSERT(fs != NULL);
		if (vfs_trybusy(fs->lfs_ivnode->v_mount, RW_READER,
		    &mountlist_lock))
			goto errout;
		mutex_enter(&lfs_lock);
		lfs_flush_fs(fs, flags);
		mutex_exit(&lfs_lock);
		vfs_unbusy(fs->lfs_ivnode->v_mount, false);
	} else {
		locked_fakequeue_count = 0;
		mutex_enter(&mountlist_lock);
		for (mp = CIRCLEQ_FIRST(&mountlist); mp != (void *)&mountlist;
		     mp = nmp) {
			if (vfs_trybusy(mp, RW_READER, &mountlist_lock)) {
				DLOG((DLOG_FLUSH, "lfs_flush: fs vfs_busy\n"));
				nmp = CIRCLEQ_NEXT(mp, mnt_list);
				continue;
			}
			if (strncmp(&mp->mnt_stat.f_fstypename[0], MOUNT_LFS,
			    sizeof(mp->mnt_stat.f_fstypename)) == 0) {
				tfs = VFSTOUFS(mp)->um_lfs;
				mutex_enter(&lfs_lock);
				lfs_flush_fs(tfs, flags);
				mutex_exit(&lfs_lock);
			}
			mutex_enter(&mountlist_lock);
			nmp = CIRCLEQ_NEXT(mp, mnt_list);
			vfs_unbusy(mp, false);
		}
		mutex_exit(&mountlist_lock);
	}
	LFS_DEBUG_COUNTLOCKED("flush");
	wakeup(&lfs_subsys_pages);

    errout:
	mutex_enter(&lfs_lock);
	KASSERT(lfs_writing);
	lfs_writing = 0;
	wakeup(&lfs_writing);
}

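/*
 * Number of inode blocks, and bytes of inode data, that writing the
 * currently dirty inodes would add to the locked-buffer totals.
 */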
#define INOCOUNT(fs) howmany((fs)->lfs_uinodes, INOPB(fs))
#define INOBYTES(fs) ((fs)->lfs_uinodes * sizeof (struct ufs1_dinode))

/*
 * Make sure that we don't have too many locked buffers.
 * Flush buffers if needed.
 */
int
lfs_check(struct vnode *vp, daddr_t blkno, int flags)
{
	int error;
	struct lfs *fs;
	struct inode *ip;
	extern pid_t lfs_writer_daemon;

	error = 0;
	ip = VTOI(vp);

	/* If out of buffers, wait on writer */
	/* XXX KS - if it's the Ifile, we're probably the cleaner! */
	if (ip->i_number == LFS_IFILE_INUM)
		return 0;
	/* If we're being called from inside a dirop, don't sleep */
	if (ip->i_flag & IN_ADIROP)
		return 0;

	fs = ip->i_lfs;

	ASSERT_NO_SEGLOCK(fs);

	/*
	 * If we would flush below, but dirops are active, sleep.
	 * Note that a dirop cannot ever reach this code!
	 */
	mutex_enter(&lfs_lock);
	while (fs->lfs_dirops > 0 &&
	       (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS ||
		locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES ||
		lfs_subsys_pages > LFS_MAX_PAGES ||
		fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
		lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0))
	{
		++fs->lfs_diropwait;
		mtsleep(&fs->lfs_writer, PRIBIO+1, "bufdirop", 0,
			&lfs_lock);
		--fs->lfs_diropwait;
	}

#ifdef DEBUG
	if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS)
		DLOG((DLOG_FLUSH, "lfs_check: lqc = %d, max %d\n",
		      locked_queue_count + INOCOUNT(fs), LFS_MAX_BUFS));
	if (locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES)
		DLOG((DLOG_FLUSH, "lfs_check: lqb = %ld, max %ld\n",
		      locked_queue_bytes + INOBYTES(fs), LFS_MAX_BYTES));
	if (lfs_subsys_pages > LFS_MAX_PAGES)
		DLOG((DLOG_FLUSH, "lfs_check: lssp = %d, max %d\n",
		      lfs_subsys_pages, LFS_MAX_PAGES));
	if (lfs_fs_pagetrip && fs->lfs_pages > lfs_fs_pagetrip)
		DLOG((DLOG_FLUSH, "lfs_check: fssp = %d, trip at %d\n",
		      fs->lfs_pages, lfs_fs_pagetrip));
	if (lfs_dirvcount > LFS_MAX_DIROP)
		DLOG((DLOG_FLUSH, "lfs_check: ldvc = %d, max %d\n",
		      lfs_dirvcount, LFS_MAX_DIROP));
	if (fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs))
		DLOG((DLOG_FLUSH, "lfs_check: lfdvc = %d, max %d\n",
		      fs->lfs_dirvcount, LFS_MAX_FSDIROP(fs)));
	if (fs->lfs_diropwait > 0)
		DLOG((DLOG_FLUSH, "lfs_check: ldvw = %d\n",
		      fs->lfs_diropwait));
#endif

	/* If there are too many pending dirops, we have to flush them. */
	if (fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
	    lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0) {
		flags |= SEGM_CKP;
	}

	if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS ||
	    locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES ||
	    lfs_subsys_pages > LFS_MAX_PAGES ||
	    fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
	    lfs_dirvcount > LFS_MAX_DIROP || fs->lfs_diropwait > 0) {
		lfs_flush(fs, flags, 0);
	} else if (lfs_fs_pagetrip && fs->lfs_pages > lfs_fs_pagetrip) {
		/*
		 * If we didn't flush the whole thing, some filesystems
		 * still might want to be flushed.
		 */
		++fs->lfs_pdflush;
		wakeup(&lfs_writer_daemon);
	}

	while (locked_queue_count + INOCOUNT(fs) > LFS_WAIT_BUFS ||
		locked_queue_bytes + INOBYTES(fs) > LFS_WAIT_BYTES ||
		lfs_subsys_pages > LFS_WAIT_PAGES ||
		fs->lfs_dirvcount > LFS_MAX_FSDIROP(fs) ||
		lfs_dirvcount > LFS_MAX_DIROP) {

		if (lfs_dostats)
			++lfs_stats.wait_exceeded;
		DLOG((DLOG_AVAIL, "lfs_check: waiting: count=%d, bytes=%ld\n",
		      locked_queue_count, locked_queue_bytes));
		error = cv_timedwait_sig(&locked_queue_cv, &lfs_lock,
		    hz * LFS_BUFWAIT);
		if (error != EWOULDBLOCK)
			break;

		/*
		 * lfs_flush might not flush all the buffers, if some of the
		 * inodes were locked or if most of them were Ifile blocks
		 * and we weren't asked to checkpoint.  Try flushing again
		 * to keep us from blocking indefinitely.
		 */
		if (locked_queue_count + INOCOUNT(fs) > LFS_MAX_BUFS ||
		    locked_queue_bytes + INOBYTES(fs) > LFS_MAX_BYTES) {
			lfs_flush(fs, flags | SEGM_CKP, 0);
		}
	}
	mutex_exit(&lfs_lock);
	return (error);
}

/*
 * Allocate a new buffer header.
 */
struct buf *
lfs_newbuf(struct lfs *fs, struct vnode *vp, daddr_t daddr, size_t size, int type)
{
	struct buf *bp;
	size_t nbytes;

	ASSERT_MAYBE_SEGLOCK(fs);
	nbytes = roundup(size, fsbtob(fs, 1));

	bp = getiobuf(NULL, true);
	if (nbytes) {
		bp->b_data = lfs_malloc(fs, nbytes, type);
		/* memset(bp->b_data, 0, nbytes); */
	}
#ifdef DIAGNOSTIC
	if (vp == NULL)
		panic("vp is NULL in lfs_newbuf");
	if (bp == NULL)
		panic("bp is NULL after malloc in lfs_newbuf");
#endif

	bp->b_bufsize = size;
	bp->b_bcount = size;
	bp->b_lblkno = daddr;
	bp->b_blkno = daddr;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_iodone = lfs_callback;
	bp->b_cflags = BC_BUSY | BC_NOCACHE;
	bp->b_private = fs;

	mutex_enter(&bufcache_lock);
	mutex_enter(&vp->v_interlock);
	bgetvp(vp, bp);
	mutex_exit(&vp->v_interlock);
	mutex_exit(&bufcache_lock);

	return (bp);
}

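/*
 * Free a buffer obtained from lfs_newbuf(): detach it from its vnode,
 * release its data unless it is a "fake" (BC_INVAL) buffer, and return
 * the header to the I/O buffer pool.
 */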
void
lfs_freebuf(struct lfs *fs, struct buf *bp)
{
	struct vnode *vp;

	if ((vp = bp->b_vp) != NULL) {
		mutex_enter(&bufcache_lock);
		mutex_enter(&vp->v_interlock);
		brelvp(bp);
		mutex_exit(&vp->v_interlock);
		mutex_exit(&bufcache_lock);
	}
	if (!(bp->b_cflags & BC_INVAL)) { /* BC_INVAL indicates a "fake" buffer */
		lfs_free(fs, bp->b_data, LFS_NB_UNKNOWN);
		bp->b_data = NULL;
	}
	putiobuf(bp);
}

/*
 * Definitions for the buffer free lists.
 */
#define BQUEUES		4		/* number of free buffer queues */

#define BQ_LOCKED	0		/* super-blocks &c */
#define BQ_LRU		1		/* lru, useful buffers */
#define BQ_AGE		2		/* rubbish */
#define BQ_EMPTY	3		/* buffer headers with no memory */

extern TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];

/*
 * Count buffers on the "locked" queue, and compare it to a pro-forma count.
 * Don't count malloced buffers, since they don't detract from the total.
 */
void
lfs_countlocked(int *count, long *bytes, const char *msg)
{
	struct buf *bp;
	int n = 0;
	long int size = 0L;

	mutex_enter(&bufcache_lock);
	TAILQ_FOREACH(bp, &bufqueues[BQ_LOCKED], b_freelist) {
		KASSERT(bp->b_iodone == NULL);
		n++;
		size += bp->b_bufsize;
#ifdef DIAGNOSTIC
		if (n > nbuf)
			panic("lfs_countlocked: this can't happen: more"
			      " buffers locked than exist");
#endif
	}
	/*
	 * Theoretically this function never really does anything.
	 * Give a warning if we have to fix the accounting.
	 */
	if (n != *count) {
		DLOG((DLOG_LLIST, "lfs_countlocked: %s: adjusted buf count"
		      " from %d to %d\n", msg, *count, n));
	}
	if (size != *bytes) {
		DLOG((DLOG_LLIST, "lfs_countlocked: %s: adjusted byte count"
		      " from %ld to %ld\n", msg, *bytes, size));
	}
	*count = n;
	*bytes = size;
	mutex_exit(&bufcache_lock);
	return;
}

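/*
 * Compute the LFS_WAIT_RESOURCE threshold from the number of pages
 * currently active, inactive, or free.
 */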
int
lfs_wait_pages(void)
{
	int active, inactive;

	uvm_estimatepageable(&active, &inactive);
	return LFS_WAIT_RESOURCE(active + inactive + uvmexp.free, 1);
}

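/*
 * Compute the LFS_MAX_RESOURCE threshold from the number of pages
 * currently active, inactive, or free.
 */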
int
lfs_max_pages(void)
{
	int active, inactive;

	uvm_estimatepageable(&active, &inactive);
	return LFS_MAX_RESOURCE(active + inactive + uvmexp.free, 1);
}