/*	$OpenBSD: vfs_bio.c,v 1.55 2001/12/19 08:58:06 art Exp $	*/
/*	$NetBSD: vfs_bio.c,v 1.44 1996/06/11 11:15:36 pk Exp $	*/

/*-
 * Copyright (c) 1994 Christopher G. Demetriou
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_bio.c	8.6 (Berkeley) 1/11/94
 */

/*
 * Some references:
 *	Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
 *	Leffler, et al.: The Design and Implementation of the 4.3BSD
 *		UNIX Operating System (Addison Wesley, 1989)
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>
#include <sys/kernel.h>

#include <uvm/uvm_extern.h>

#include <miscfs/specfs/specdev.h>

/* Macros to clear/set/test flags. */
#define	SET(t, f)	(t) |= (f)
#define	CLR(t, f)	(t) &= ~(f)
#define	ISSET(t, f)	((t) & (f))
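
/*
 * Illustrative use of the flag macros above (sketch only; these exact
 * patterns appear throughout this file):
 *
 *	SET(bp->b_flags, B_READ | B_ASYNC);
 *	if (ISSET(bp->b_flags, B_DELWRI))
 *		CLR(bp->b_flags, B_DELWRI);
 */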

/*
 * Definitions for the buffer hash lists.
 */
#define	BUFHASH(dvp, lbn)	\
	(&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
u_long	bufhash;
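
/*
 * Example (illustrative sketch): a lookup such as incore() or getblk()
 * below evaluates
 *
 *	struct bufhashhdr *bh = BUFHASH(vp, blkno);
 *
 * which mixes the vnode pointer and the logical block number and masks
 * the result with "bufhash" (a power-of-two-minus-one mask filled in by
 * hashinit() in bufinit()), yielding the one chain that the buffer, if
 * cached, must be on.
 */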

/*
 * Insq/Remq for the buffer hash lists.
 */
#define	binshash(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_hash)
#define	bremhash(bp)		LIST_REMOVE(bp, b_hash)

/*
 * Definitions for the buffer free lists.
 */
#define	BQUEUES		4		/* number of free buffer queues */

#define	BQ_LOCKED	0		/* super-blocks &c */
#define	BQ_CLEAN	1		/* LRU queue with clean buffers */
#define	BQ_DIRTY	2		/* LRU queue with dirty buffers */
#define	BQ_EMPTY	3		/* buffer headers with no memory */

TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
int needbuffer;
int nobuffers;
struct bio_ops bioops;

/*
 * Buffer pool for I/O buffers.
 */
struct pool bufpool;

/*
 * Insq/Remq for the buffer free lists.
 */
#define	binsheadfree(bp, dp)	TAILQ_INSERT_HEAD(dp, bp, b_freelist)
#define	binstailfree(bp, dp)	TAILQ_INSERT_TAIL(dp, bp, b_freelist)

static __inline struct buf *bio_doread __P((struct vnode *, daddr_t, int, int));
int getnewbuf __P((int slpflag, int slptimeo, struct buf **));

/*
 * We keep a few counters to monitor the utilization of the buffer cache
 *
 *  numdirtypages - number of pages on BQ_DIRTY queue.
 *  lodirtypages  - low water mark for buffer cleaning daemon.
 *  hidirtypages  - high water mark for buffer cleaning daemon.
 *  numfreepages  - number of pages on BQ_CLEAN and BQ_DIRTY queues. unused.
 *  numcleanpages - number of pages on BQ_CLEAN queue.
 *		    Used to track the need to speed up the cleaner and
 *		    as a reserve for special processes like syncer.
 *  mincleanpages - the lowest page count seen on BQ_CLEAN.
 *  numemptybufs  - number of buffers on BQ_EMPTY. unused.
 */
long numdirtypages;
long lodirtypages;
long hidirtypages;
long numfreepages;
long numcleanpages;
long locleanpages;
int numemptybufs;
#ifdef DEBUG
long mincleanpages;
#endif

struct proc *cleanerproc;
int bd_req;			/* Sleep point for cleaner daemon. */

void
bremfree(bp)
	struct buf *bp;
{
	struct bqueues *dp = NULL;

	/*
	 * We only calculate the head of the freelist when removing
	 * the last element of the list as that is the only time that
	 * it is needed (e.g. to reset the tail pointer).
	 *
	 * NB: This makes an assumption about how tailq's are implemented.
	 */
	if (bp->b_freelist.tqe_next == NULL) {
		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
			if (dp->tqh_last == &bp->b_freelist.tqe_next)
				break;
		if (dp == &bufqueues[BQUEUES])
			panic("bremfree: lost tail");
	}
	if (bp->b_bufsize <= 0) {
		numemptybufs--;
	} else if (!ISSET(bp->b_flags, B_LOCKED)) {
		numfreepages -= btoc(bp->b_bufsize);
		if (!ISSET(bp->b_flags, B_DELWRI)) {
			numcleanpages -= btoc(bp->b_bufsize);
#ifdef DEBUG
			if (mincleanpages > numcleanpages)
				mincleanpages = numcleanpages;
#endif
		} else {
			numdirtypages -= btoc(bp->b_bufsize);
		}
	}
	TAILQ_REMOVE(dp, bp, b_freelist);
}

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
	register struct buf *bp;
	struct bqueues *dp;
	register int i;
	int base, residual;

	pool_init(&bufpool, sizeof(struct buf), 0, 0, 0, "bufpl", 0,
	    NULL, NULL, M_DEVBUF);
	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
		TAILQ_INIT(dp);
	bufhashtbl = hashinit(nbuf, M_CACHE, M_WAITOK, &bufhash);
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero((char *)bp, sizeof *bp);
		bp->b_dev = NODEV;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = buffers + i * MAXBSIZE;
		LIST_INIT(&bp->b_dep);
		if (i < residual)
			bp->b_bufsize = (base + 1) * PAGE_SIZE;
		else
			bp->b_bufsize = base * PAGE_SIZE;
		bp->b_flags = B_INVAL;
		if (bp->b_bufsize) {
			dp = &bufqueues[BQ_CLEAN];
			numfreepages += btoc(bp->b_bufsize);
			numcleanpages += btoc(bp->b_bufsize);
		} else {
			dp = &bufqueues[BQ_EMPTY];
			numemptybufs++;
		}
		binsheadfree(bp, dp);
		binshash(bp, &invalhash);
	}

	hidirtypages = bufpages / 4;
	lodirtypages = hidirtypages / 2;

	/*
	 * Reserve 5% of bufpages for the syncer's needs,
	 * but not more than 25% and if possible
	 * not less than 2 * MAXBSIZE. The locleanpages
	 * value must not be too small, but there is
	 * probably no reason to set it to more than 1-2 MB.
	 */
	locleanpages = bufpages / 20;
	if (locleanpages < btoc(2 * MAXBSIZE))
		locleanpages = btoc(2 * MAXBSIZE);
	if (locleanpages > bufpages / 4)
		locleanpages = bufpages / 4;
	if (locleanpages > btoc(2 * 1024 * 1024))
		locleanpages = btoc(2 * 1024 * 1024);
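
	/*
	 * Illustrative arithmetic (assumed figures, not a measured
	 * configuration): with bufpages == 1000, PAGE_SIZE == 4096 and
	 * MAXBSIZE == 64kB, the code above yields hidirtypages == 250,
	 * lodirtypages == 125 and locleanpages == 50 pages, since
	 * bufpages / 20 == 50 already exceeds btoc(2 * MAXBSIZE) == 32
	 * and falls below both caps (bufpages / 4 == 250 and
	 * btoc(2 MB) == 512).
	 */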

#ifdef DEBUG
	mincleanpages = locleanpages;
#endif
}

static __inline struct buf *
bio_doread(vp, blkno, size, async)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	int async;
{
	register struct buf *bp;

	bp = getblk(vp, blkno, size, 0, 0);

	/*
	 * If buffer does not have data valid, start a read.
	 * Note that if buffer is B_INVAL, getblk() won't return it.
	 * Therefore, it's valid if its I/O has completed or been delayed.
	 */
	if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
		SET(bp->b_flags, B_READ | async);
		VOP_STRATEGY(bp);

		/* Pay for the read. */
		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
	} else if (async) {
		brelse(bp);
	}

	return (bp);
}

/*
 * Read a disk block.
 * This algorithm is described in Bach (p.54).
 */
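/*
 * Typical caller pattern (sketch only, not taken from this file; the
 * block number and size are placeholders supplied by the file system):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, lblkno, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... examine or copy bp->b_data ...
 *	brelse(bp);
 */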
int
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	register struct buf *bp;

	/* Get buffer for block. */
	bp = *bpp = bio_doread(vp, blkno, size, 0);

	/* Wait for the read to complete, and return result. */
	return (biowait(bp));
}

/*
 * Read-ahead multiple disk blocks. The first is sync, the rest async.
 * Trivial modification to the breada algorithm presented in Bach (p.55).
 */
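/*
 * Sketch of a read-ahead call (illustrative only; the caller supplies
 * parallel arrays of read-ahead block numbers and sizes):
 *
 *	daddr_t rablk = lblkno + 1;
 *	int rasize = bsize;
 *
 *	error = breadn(vp, lblkno, bsize, &rablk, &rasize, 1, NOCRED, &bp);
 *
 * Only the first block is waited for; the read-ahead block is started
 * asynchronously and, if wanted later, is picked up by a subsequent
 * bread() that finds it in the cache.
 */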
int
breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablks[]; int rasizes[];
	int nrablks;
	struct ucred *cred;
	struct buf **bpp;
{
	register struct buf *bp;
	int i;

	bp = *bpp = bio_doread(vp, blkno, size, 0);

	/*
	 * For each of the read-ahead blocks, start a read, if necessary.
	 */
	for (i = 0; i < nrablks; i++) {
		/* If it's in the cache, just go on to next one. */
		if (incore(vp, rablks[i]))
			continue;

		/* Get a buffer for the read-ahead block */
		(void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC);
	}

	/* Otherwise, we had to start a read for it; wait until it's valid. */
	return (biowait(bp));
}

/*
 * Read with single-block read-ahead.  Defined in Bach (p.55), but
 * implemented as a call to breadn().
 * XXX for compatibility with old file systems.
 */
int
breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno; int rabsize;
	struct ucred *cred;
	struct buf **bpp;
{

	return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
}

/*
 * Block write.  Described in Bach (p.56)
 */
int
bwrite(bp)
	struct buf *bp;
{
	int rv, async, wasdelayed, s;
	struct vnode *vp;
	struct mount *mp;

	/*
	 * Remember buffer type, to switch on it later.  If the write was
	 * synchronous, but the file system was mounted with MNT_ASYNC,
	 * convert it to a delayed write.
	 * XXX note that this relies on delayed tape writes being converted
	 * to async, not sync writes (which is safe, but ugly).
	 */
	async = ISSET(bp->b_flags, B_ASYNC);
	if (!async && bp->b_vp && bp->b_vp->v_mount &&
	    ISSET(bp->b_vp->v_mount->mnt_flag, MNT_ASYNC)) {
		bdwrite(bp);
		return (0);
	}

	/*
	 * Collect statistics on synchronous and asynchronous writes.
	 * Writes to block devices are charged to their associated
	 * filesystem (if any).
	 */
	if ((vp = bp->b_vp) != NULL) {
		if (vp->v_type == VBLK)
			mp = vp->v_specmountpoint;
		else
			mp = vp->v_mount;
		if (mp != NULL) {
			if (async)
				mp->mnt_stat.f_asyncwrites++;
			else
				mp->mnt_stat.f_syncwrites++;
		}
	}

	wasdelayed = ISSET(bp->b_flags, B_DELWRI);
	CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));

	s = splbio();

	/*
	 * If not synchronous, pay for the I/O operation and make
	 * sure the buf is on the correct vnode queue.  We have
	 * to do this now, because if we don't, the vnode may not
	 * be properly notified that its I/O has completed.
	 */
	if (wasdelayed) {
		reassignbuf(bp);
	} else
		curproc->p_stats->p_ru.ru_oublock++;


	/* Initiate disk write.  Make sure the appropriate party is charged. */
	bp->b_vp->v_numoutput++;
	splx(s);
	SET(bp->b_flags, B_WRITEINPROG);
	VOP_STRATEGY(bp);

	if (async)
		return (0);

	/*
	 * If I/O was synchronous, wait for it to complete.
	 */
	rv = biowait(bp);

	/* Release the buffer. */
	brelse(bp);

	return (rv);
}


/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 *
 * Described in Leffler, et al. (pp. 208-213).
 */
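/*
 * Typical delayed-write sequence (illustrative sketch, not taken from
 * this file): read the block, modify part of it, and let the cache
 * write it back later.
 *
 *	error = bread(vp, lblkno, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... modify a portion of bp->b_data ...
 *	bdwrite(bp);
 */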
void
bdwrite(bp)
	struct buf *bp;
{
	int s;

	/*
	 * If the block hasn't been seen before:
	 *	(1) Mark it as having been seen,
	 *	(2) Charge for the write.
	 *	(3) Make sure it's on its vnode's correct block list,
	 *	(4) If a buffer is rewritten, move it to end of dirty list
	 */
	if (!ISSET(bp->b_flags, B_DELWRI)) {
		SET(bp->b_flags, B_DELWRI);
		s = splbio();
		reassignbuf(bp);
		splx(s);
		curproc->p_stats->p_ru.ru_oublock++;	/* XXX */
	}

	/* If this is a tape block, write the block now. */
	if (major(bp->b_dev) < nblkdev &&
	    bdevsw[major(bp->b_dev)].d_type == D_TAPE) {
		bawrite(bp);
		return;
	}

	/* Otherwise, the "write" is done, so mark and release the buffer. */
	CLR(bp->b_flags, B_NEEDCOMMIT);
	SET(bp->b_flags, B_DONE);
	brelse(bp);
}

/*
 * Asynchronous block write; just an asynchronous bwrite().
 */
void
bawrite(bp)
	struct buf *bp;
{

	SET(bp->b_flags, B_ASYNC);
	VOP_BWRITE(bp);
}

/*
 * Must be called at splbio()
 */
void
buf_dirty(bp)
	struct buf *bp;
{
	if (ISSET(bp->b_flags, B_DELWRI) == 0) {
		SET(bp->b_flags, B_DELWRI);
		reassignbuf(bp);
	}
}

/*
 * Must be called at splbio()
 */
void
buf_undirty(bp)
	struct buf *bp;
{
	if (ISSET(bp->b_flags, B_DELWRI)) {
		CLR(bp->b_flags, B_DELWRI);
		reassignbuf(bp);
	}
}

/*
 * Release a buffer on to the free lists.
 * Described in Bach (p. 46).
 */
void
brelse(bp)
	struct buf *bp;
{
	struct bqueues *bufq;
	int s;

	/* Block disk interrupts. */
	s = splbio();

	/*
	 * Determine which queue the buffer should be on, then put it there.
	 */

	/* If it's locked, don't report an error; try again later. */
	if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
		CLR(bp->b_flags, B_ERROR);

	/* If it's not cacheable, or an error, mark it invalid. */
	if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
		SET(bp->b_flags, B_INVAL);

	if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
		/*
		 * If it's invalid or empty, dissociate it from its vnode
		 * and put on the head of the appropriate queue.
		 */
		if (LIST_FIRST(&bp->b_dep) != NULL)
			buf_deallocate(bp);

		if (ISSET(bp->b_flags, B_DELWRI)) {
			CLR(bp->b_flags, B_DELWRI);
		}

		if (bp->b_vp) {
			reassignbuf(bp);
			brelvp(bp);
		}
		if (bp->b_bufsize <= 0) {
			/* no data */
			bufq = &bufqueues[BQ_EMPTY];
			numemptybufs++;
		} else {
			/* invalid data */
			bufq = &bufqueues[BQ_CLEAN];
			numfreepages += btoc(bp->b_bufsize);
			numcleanpages += btoc(bp->b_bufsize);
		}
		binsheadfree(bp, bufq);
	} else {
		/*
		 * It has valid data.  Put it on the end of the appropriate
		 * queue, so that it'll stick around for as long as possible.
		 */
		if (ISSET(bp->b_flags, B_LOCKED))
			/* locked in core */
			bufq = &bufqueues[BQ_LOCKED];
		else {
			numfreepages += btoc(bp->b_bufsize);
			if (!ISSET(bp->b_flags, B_DELWRI)) {
				numcleanpages += btoc(bp->b_bufsize);
				bufq = &bufqueues[BQ_CLEAN];
			} else {
				numdirtypages += btoc(bp->b_bufsize);
				bufq = &bufqueues[BQ_DIRTY];
			}
		}
		if (ISSET(bp->b_flags, B_AGE))
			binsheadfree(bp, bufq);
		else
			binstailfree(bp, bufq);
	}

	/* Unlock the buffer. */
	CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE | B_DEFERRED));

	/* Allow disk interrupts. */
	splx(s);

	/* Wake up syncer and cleaner processes waiting for buffers */
	if (nobuffers) {
		wakeup(&nobuffers);
		nobuffers = 0;
	}

	/* Wake up any processes waiting for any buffer to become free. */
	if (needbuffer && (numcleanpages > locleanpages)) {
		needbuffer--;
		wakeup_one(&needbuffer);
	}

	/* Wake up any processes waiting for _this_ buffer to become free. */
	if (ISSET(bp->b_flags, B_WANTED)) {
		CLR(bp->b_flags, B_WANTED);
		wakeup(bp);
	}
}

/*
 * Determine if a block is in the cache.
 * Just look on what would be its hash chain.  If it's there, return
 * a pointer to it, unless it's marked invalid.  If it's marked invalid,
 * we normally don't return the buffer, unless the caller explicitly
 * wants us to.
 */
struct buf *
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	struct buf *bp;

	bp = BUFHASH(vp, blkno)->lh_first;

	/* Search hash chain */
	for (; bp != NULL; bp = bp->b_hash.le_next) {
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    !ISSET(bp->b_flags, B_INVAL))
			return (bp);
	}

	return (0);
}

/*
 * Get a block of requested size that is associated with
 * a given vnode and block offset. If it is found in the
 * block cache, mark it as having been found, make it busy
 * and return it. Otherwise, return an empty block of the
 * correct size. It is up to the caller to ensure that the
 * cached blocks are of the correct size.
 */
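/*
 * Sketch of how a file system typically obtains a buffer for a block it
 * is about to overwrite completely (illustrative only; clrbuf() just
 * zeroes the buffer contents):
 *
 *	bp = getblk(vp, lblkno, bsize, 0, 0);
 *	clrbuf(bp);
 *	... fill in bp->b_data ...
 *	bwrite(bp);
 *
 * Because the caller will write the whole block, no read is issued first.
 */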
struct buf *
getblk(vp, blkno, size, slpflag, slptimeo)
	register struct vnode *vp;
	daddr_t blkno;
	int size, slpflag, slptimeo;
{
	struct bufhashhdr *bh;
	struct buf *bp, *nbp = NULL;
	int s, err;

	/*
	 * XXX
	 * The following is an inlined version of 'incore()', but with
	 * the 'invalid' test moved to after the 'busy' test.  It's
	 * necessary because there are some cases in which the NFS
	 * code sets B_INVAL prior to writing data to the server, but
	 * in which the buffers actually contain valid data.  In this
	 * case, we can't allow the system to allocate a new buffer for
	 * the block until the write is finished.
	 */
	bh = BUFHASH(vp, blkno);
start:
	bp = bh->lh_first;
	for (; bp != NULL; bp = bp->b_hash.le_next) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp)
			continue;

		s = splbio();
		if (ISSET(bp->b_flags, B_BUSY)) {
			SET(bp->b_flags, B_WANTED);
			err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
			    slptimeo);
			splx(s);
			if (err)
				return (NULL);
			goto start;
		}

		if (!ISSET(bp->b_flags, B_INVAL)) {
			SET(bp->b_flags, (B_BUSY | B_CACHE));
			bremfree(bp);
			splx(s);
			break;
		}
		splx(s);
	}

	if (bp == NULL) {
		if (nbp == NULL && getnewbuf(slpflag, slptimeo, &nbp) != 0) {
			goto start;
		}
		bp = nbp;
		binshash(bp, bh);
		bp->b_blkno = bp->b_lblkno = blkno;
		s = splbio();
		bgetvp(vp, bp);
		splx(s);
	} else if (nbp != NULL) {
		/*
		 * Set B_AGE so that the buffer appears at the head of
		 * BQ_CLEAN and gets reused ASAP.
		 */
		SET(nbp->b_flags, B_AGE);
		brelse(nbp);
	}
	allocbuf(bp, size);

	return (bp);
}

/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(size)
	int size;
{
	struct buf *bp;

	getnewbuf(0, 0, &bp);
	SET(bp->b_flags, B_INVAL);
	binshash(bp, &invalhash);
	allocbuf(bp, size);

	return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 *
 * If the buffer shrinks, data is lost, so it's up to the
 * caller to have written it out *first*; this routine will not
 * start a write.  If the buffer grows, it's the caller's
 * responsibility to fill out the buffer's additional contents.
 */
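/*
 * Example of the resize logic below (assumed sizes): if bp->b_bufsize is
 * one page (4096) and the caller asks for 8192, desired_size becomes 8192
 * and up to one page worth of memory is stolen per iteration from a donor
 * buffer obtained via getnewbuf() and moved onto bp with pagemove().
 * Shrinking works the other way around, handing the surplus pages to a
 * spare header taken from BQ_EMPTY.
 */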
void
allocbuf(bp, size)
	struct buf *bp;
	int size;
{
	struct buf	*nbp;
	vsize_t		desired_size;
	int		s;

	desired_size = round_page(size);
	if (desired_size > MAXBSIZE)
		panic("allocbuf: buffer larger than MAXBSIZE requested");

	if (bp->b_bufsize == desired_size)
		goto out;

	/*
	 * If the buffer is smaller than the desired size, we need to snarf
	 * it from other buffers.  Get buffers (via getnewbuf()), and
	 * steal their pages.
	 */
	while (bp->b_bufsize < desired_size) {
		int amt;

		/* find a buffer */
		getnewbuf(0, 0, &nbp);
		SET(nbp->b_flags, B_INVAL);
		binshash(nbp, &invalhash);

		/* and steal its pages, up to the amount we need */
		amt = MIN(nbp->b_bufsize, (desired_size - bp->b_bufsize));
		pagemove((nbp->b_data + nbp->b_bufsize - amt),
			 bp->b_data + bp->b_bufsize, amt);
		bp->b_bufsize += amt;
		nbp->b_bufsize -= amt;

		/* reduce transfer count if we stole some data */
		if (nbp->b_bcount > nbp->b_bufsize)
			nbp->b_bcount = nbp->b_bufsize;

#ifdef DIAGNOSTIC
		if (nbp->b_bufsize < 0)
			panic("allocbuf: negative bufsize");
#endif

		brelse(nbp);
	}

	/*
	 * If we want a buffer smaller than the current size,
	 * shrink this buffer.  Grab a buf head from the EMPTY queue,
	 * move a page onto it, and put it on front of the AGE queue.
	 * If there are no free buffer headers, leave the buffer alone.
	 */
	if (bp->b_bufsize > desired_size) {
		s = splbio();
		if ((nbp = bufqueues[BQ_EMPTY].tqh_first) == NULL) {
			/* No free buffer head */
			splx(s);
			goto out;
		}
		bremfree(nbp);
		SET(nbp->b_flags, B_BUSY);
		splx(s);

		/* move the page to it and note this change */
		pagemove(bp->b_data + desired_size,
		    nbp->b_data, bp->b_bufsize - desired_size);
		nbp->b_bufsize = bp->b_bufsize - desired_size;
		bp->b_bufsize = desired_size;
		nbp->b_bcount = 0;
		SET(nbp->b_flags, B_INVAL);

		/* release the newly-filled buffer and leave */
		brelse(nbp);
	}

out:
	bp->b_bcount = size;
}

/*
 * Find a buffer which is available for use.
 *
 * We must notify getblk if we slept during the buffer allocation. When
 * that happens, we allocate a buffer anyway (unless tsleep is interrupted
 * or times out) and return !0.
 */
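/*
 * Caller pattern (see getblk() above): a non-zero return means we slept,
 * so the hash chain must be rescanned before the new buffer is used.
 *
 *	if (nbp == NULL && getnewbuf(slpflag, slptimeo, &nbp) != 0)
 *		goto start;
 */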
int
getnewbuf(slpflag, slptimeo, bpp)
	int slpflag, slptimeo;
	struct buf **bpp;
{
	struct buf *bp;
	int s, ret, error;

	*bpp = NULL;
	ret = 0;

start:
	s = splbio();
	/*
	 * Wake up cleaner if we're getting low on buffers.
	 */
	if (numdirtypages >= hidirtypages)
		wakeup(&bd_req);

	if ((numcleanpages <= locleanpages) &&
	    curproc != syncerproc && curproc != cleanerproc) {
		needbuffer++;
		error = tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf",
				slptimeo);
		splx(s);
		if (error)
			return (1);
		ret = 1;
		goto start;
	}
	if ((bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN])) == NULL) {
		/* wait for a free buffer of any kind */
		nobuffers = 1;
		error = tsleep(&nobuffers, slpflag|(PRIBIO-3),
				"getnewbuf", slptimeo);
		splx(s);
		if (error)
			return (1);
		ret = 1;
		goto start;
	}

	bremfree(bp);

	/* Buffer is no longer on free lists. */
	SET(bp->b_flags, B_BUSY);

#ifdef DIAGNOSTIC
	if (ISSET(bp->b_flags, B_DELWRI))
		panic("Dirty buffer on BQ_CLEAN");
#endif

	/* disassociate us from our vnode, if we had one... */
	if (bp->b_vp)
		brelvp(bp);

	splx(s);

#ifdef DIAGNOSTIC
	/* CLEAN buffers must have no dependencies */
	if (LIST_FIRST(&bp->b_dep) != NULL)
		panic("BQ_CLEAN has buffer with dependencies");
#endif

	/* clear out various other fields */
	bp->b_flags = B_BUSY;
	bp->b_dev = NODEV;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;

	bremhash(bp);
	*bpp = bp;
	return (ret);
}

/*
 * Buffer cleaning daemon.
 */
void
buf_daemon(struct proc *p)
{
	int s;
	struct buf *bp;
	struct timeval starttime, timediff;

	cleanerproc = curproc;

	for (;;) {
		if (numdirtypages < hidirtypages) {
			tsleep(&bd_req, PRIBIO - 7, "cleaner", 0);
		}

		starttime = time;
		s = splbio();
		while ((bp = TAILQ_FIRST(&bufqueues[BQ_DIRTY]))) {
			bremfree(bp);
			SET(bp->b_flags, B_BUSY);
			splx(s);

			if (ISSET(bp->b_flags, B_INVAL)) {
				brelse(bp);
				s = splbio();
				continue;
			}
#ifdef DIAGNOSTIC
			if (!ISSET(bp->b_flags, B_DELWRI))
				panic("Clean buffer on BQ_DIRTY");
#endif
			if (LIST_FIRST(&bp->b_dep) != NULL &&
			    !ISSET(bp->b_flags, B_DEFERRED) &&
			    buf_countdeps(bp, 0, 1)) {
				SET(bp->b_flags, B_DEFERRED);
				s = splbio();
				numfreepages += btoc(bp->b_bufsize);
				numdirtypages += btoc(bp->b_bufsize);
				binstailfree(bp, &bufqueues[BQ_DIRTY]);
				CLR(bp->b_flags, B_BUSY);
				continue;
			}

			bawrite(bp);

			if (numdirtypages < lodirtypages)
				break;
			/* Never allow processing to run for more than 1 sec */
			timersub(&time, &starttime, &timediff);
			if (timediff.tv_sec)
				break;

			s = splbio();
		}
	}
}

/*
 * Wait for operations on the buffer to complete.
 * When they do, extract and return the I/O's error value.
 */
int
biowait(bp)
	struct buf *bp;
{
	int s;

	s = splbio();
	while (!ISSET(bp->b_flags, B_DONE))
		tsleep(bp, PRIBIO + 1, "biowait", 0);
	splx(s);

	/* check for interruption of I/O (e.g. via NFS), then errors. */
	if (ISSET(bp->b_flags, B_EINTR)) {
		CLR(bp->b_flags, B_EINTR);
		return (EINTR);
	}

	if (ISSET(bp->b_flags, B_ERROR))
		return (bp->b_error ? bp->b_error : EIO);
	else
		return (0);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 *
 * [ Leffler, et al., says on p.247:
 *	"This routine wakes up the blocked process, frees the buffer
 *	for an asynchronous write, or, for a request by the pagedaemon
 *	process, invokes a procedure specified in the buffer structure" ]
 *
 * In real life, the pagedaemon (or other system processes) wants
 * to do async stuff too, and doesn't want the buffer brelse()'d.
 * (for swap pager, that puts swap buffers on the free lists (!!!),
 * for the vn device, that puts malloc'd buffers on the free lists!)
 */
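/*
 * Sketch of the callback convention handled below (illustrative only;
 * "mydone" is a hypothetical callback): a caller that wants notification
 * instead of a wakeup sets up the buffer roughly as
 *
 *	bp->b_iodone = mydone;
 *	SET(bp->b_flags, B_CALL | B_ASYNC);
 *	VOP_STRATEGY(bp);
 *
 * and biodone() then clears B_CALL and invokes mydone(bp) exactly once.
 */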
void
biodone(bp)
	struct buf *bp;
{
	if (ISSET(bp->b_flags, B_DONE))
		panic("biodone already");
	SET(bp->b_flags, B_DONE);		/* note that it's done */

	if (LIST_FIRST(&bp->b_dep) != NULL)
		buf_complete(bp);

	if (!ISSET(bp->b_flags, B_READ)) {
		CLR(bp->b_flags, B_WRITEINPROG);
		vwakeup(bp->b_vp);
	}

	if (ISSET(bp->b_flags, B_CALL)) {	/* if necessary, call out */
		CLR(bp->b_flags, B_CALL);	/* but note callout done */
		(*bp->b_iodone)(bp);
	} else {
		if (ISSET(bp->b_flags, B_ASYNC)) {	/* if async, release it */
			brelse(bp);
		} else {			/* or just wakeup the buffer */
			CLR(bp->b_flags, B_WANTED);
			wakeup(bp);
		}
	}
}

#ifdef DEBUG
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * in vfs_syscalls.c using sysctl.
 */
void
vfs_bufstats()
{
	int s, i, j, count;
	register struct buf *bp;
	register struct bqueues *dp;
	int counts[MAXBSIZE/PAGE_SIZE+1];
	int totals[BQUEUES];
	long ptotals[BQUEUES];
	long pages;
	static char *bname[BQUEUES] = { "LOCKED", "CLEAN", "DIRTY", "EMPTY" };

	s = splbio();
	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
		count = 0;
		pages = 0;
		for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
			counts[j] = 0;
		for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
			counts[bp->b_bufsize/PAGE_SIZE]++;
			count++;
			pages += btoc(bp->b_bufsize);
		}
		totals[i] = count;
		ptotals[i] = pages;
		printf("%s: total-%d(%ld pages)", bname[i], count, pages);
		for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
			if (counts[j] != 0)
				printf(", %d-%d", j * PAGE_SIZE, counts[j]);
		printf("\n");
	}
	if (totals[BQ_EMPTY] != numemptybufs)
		printf("numemptybufs counter wrong: %d != %d\n",
			numemptybufs, totals[BQ_EMPTY]);
	if ((ptotals[BQ_CLEAN] + ptotals[BQ_DIRTY]) != numfreepages)
		printf("numfreepages counter wrong: %ld != %ld\n",
			numfreepages, ptotals[BQ_CLEAN] + ptotals[BQ_DIRTY]);
	if (ptotals[BQ_CLEAN] != numcleanpages)
		printf("numcleanpages counter wrong: %ld != %ld\n",
			numcleanpages, ptotals[BQ_CLEAN]);
	else
		printf("numcleanpages: %ld\n", numcleanpages);
	if (numdirtypages != ptotals[BQ_DIRTY])
		printf("numdirtypages counter wrong: %ld != %ld\n",
			numdirtypages, ptotals[BQ_DIRTY]);
	else
		printf("numdirtypages: %ld\n", numdirtypages);

	printf("syncer eating up to %ld pages from %ld reserved\n",
			locleanpages - mincleanpages, locleanpages);
	splx(s);
}
#endif /* DEBUG */
1101