xref: /openbsd-src/sys/kern/vfs_bio.c (revision 8500990981f885cbe5e6a4958549cacc238b5ae6)
1 /*	$OpenBSD: vfs_bio.c,v 1.68 2003/06/02 23:28:07 millert Exp $	*/
2 /*	$NetBSD: vfs_bio.c,v 1.44 1996/06/11 11:15:36 pk Exp $	*/
3 
4 /*-
5  * Copyright (c) 1994 Christopher G. Demetriou
6  * Copyright (c) 1982, 1986, 1989, 1993
7  *	The Regents of the University of California.  All rights reserved.
8  * (c) UNIX System Laboratories, Inc.
9  * All or some portions of this file are derived from material licensed
10  * to the University of California by American Telephone and Telegraph
11  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
12  * the permission of UNIX System Laboratories, Inc.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  *	@(#)vfs_bio.c	8.6 (Berkeley) 1/11/94
39  */
40 
41 /*
42  * Some references:
43  *	Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
44  *	Leffler, et al.: The Design and Implementation of the 4.3BSD
45  *		UNIX Operating System (Addison-Wesley, 1989)
46  */
47 
48 #include <sys/param.h>
49 #include <sys/systm.h>
50 #include <sys/proc.h>
51 #include <sys/buf.h>
52 #include <sys/vnode.h>
53 #include <sys/mount.h>
54 #include <sys/malloc.h>
55 #include <sys/pool.h>
56 #include <sys/resourcevar.h>
57 #include <sys/conf.h>
58 #include <sys/kernel.h>
59 
60 #include <uvm/uvm_extern.h>
61 
62 #include <miscfs/specfs/specdev.h>
63 
64 /*
65  * Definitions for the buffer hash lists.
66  */
67 #define	BUFHASH(dvp, lbn)	\
68 	(&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
69 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
70 u_long	bufhash;
71 
72 /*
73  * Insq/Remq for the buffer hash lists.
74  */
75 #define	binshash(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_hash)
76 #define	bremhash(bp)		LIST_REMOVE(bp, b_hash)
77 
78 /*
79  * Definitions for the buffer free lists.
80  */
81 #define	BQUEUES		4		/* number of free buffer queues */
82 
83 #define	BQ_LOCKED	0		/* super-blocks &c */
84 #define	BQ_CLEAN	1		/* LRU queue with clean buffers */
85 #define	BQ_DIRTY	2		/* LRU queue with dirty buffers */
86 #define	BQ_EMPTY	3		/* buffer headers with no memory */
87 
88 TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
89 int needbuffer;			/* # processes sleeping in getnewbuf() for clean pages */
90 int nobuffers;			/* set while waiting for any buffer to be released */
91 struct bio_ops bioops;
92 
93 /*
94  * Buffer pool for I/O buffers.
95  */
96 struct pool bufpool;
97 
98 /*
99  * Insq/Remq for the buffer free lists.
100  */
101 #define	binsheadfree(bp, dp)	TAILQ_INSERT_HEAD(dp, bp, b_freelist)
102 #define	binstailfree(bp, dp)	TAILQ_INSERT_TAIL(dp, bp, b_freelist)
103 
104 static __inline struct buf *bio_doread(struct vnode *, daddr_t, int, int);
105 int getnewbuf(int slpflag, int slptimeo, struct buf **);
106 
107 /*
108  * We keep a few counters to monitor the utilization of the buffer cache
109  *
110  *  numdirtypages - number of pages on BQ_DIRTY queue.
111  *  lodirtypages  - low water mark for buffer cleaning daemon.
112  *  hidirtypages  - high water mark for buffer cleaning daemon.
113  *  numfreepages  - number of pages on BQ_CLEAN and BQ_DIRTY queues. unused.
114  *  numcleanpages - number of pages on BQ_CLEAN queue.
115  *		    Used to track the need to speed up the cleaner and
116  *		    as a reserve for special processes like syncer.
117  *  mincleanpages - the lowest page count seen on BQ_CLEAN (DEBUG only).
118  *  numemptybufs  - number of buffers on BQ_EMPTY. unused.
119  */
120 long numdirtypages;
121 long lodirtypages;
122 long hidirtypages;
123 long numfreepages;
124 long numcleanpages;
125 long locleanpages;
126 int numemptybufs;
127 #ifdef DEBUG
128 long mincleanpages;
129 #endif
130 
131 struct proc *cleanerproc;
132 int bd_req;			/* Sleep point for cleaner daemon. */
133 
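/*
 * Remove a buffer from whichever free queue it is on and adjust the
 * empty-buffer and clean/dirty page counters to match.  All callers
 * in this file hold splbio() while manipulating the free queues.
 */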
134 void
135 bremfree(struct buf *bp)
136 {
137 	struct bqueues *dp = NULL;
138 
139 	/*
140 	 * We only calculate the head of the freelist when removing
141 	 * the last element of the list as that is the only time that
142 	 * it is needed (e.g. to reset the tail pointer).
143 	 *
144 	 * NB: This makes an assumption about how tailq's are implemented.
145 	 */
146 	if (bp->b_freelist.tqe_next == NULL) {
147 		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
148 			if (dp->tqh_last == &bp->b_freelist.tqe_next)
149 				break;
150 		if (dp == &bufqueues[BQUEUES])
151 			panic("bremfree: lost tail");
152 	}
153 	if (bp->b_bufsize <= 0) {
154 		numemptybufs--;
155 	} else if (!ISSET(bp->b_flags, B_LOCKED)) {
156 		numfreepages -= btoc(bp->b_bufsize);
157 		if (!ISSET(bp->b_flags, B_DELWRI)) {
158 			numcleanpages -= btoc(bp->b_bufsize);
159 #ifdef DEBUG
160 			if (mincleanpages > numcleanpages)
161 				mincleanpages = numcleanpages;
162 #endif
163 		} else {
164 			numdirtypages -= btoc(bp->b_bufsize);
165 		}
166 	}
167 	TAILQ_REMOVE(dp, bp, b_freelist);
168 }
169 
170 /*
171  * Initialize buffers and hash links for buffers.
172  */
173 void
174 bufinit(void)
175 {
176 	struct buf *bp;
177 	struct bqueues *dp;
178 	int i;
179 	int base, residual;
180 
181 	pool_init(&bufpool, sizeof(struct buf), 0, 0, 0, "bufpl", NULL);
182 	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
183 		TAILQ_INIT(dp);
184 	bufhashtbl = hashinit(nbuf, M_CACHE, M_WAITOK, &bufhash);
185 	base = bufpages / nbuf;
186 	residual = bufpages % nbuf;
187 	for (i = 0; i < nbuf; i++) {
188 		bp = &buf[i];
189 		bzero((char *)bp, sizeof *bp);
190 		bp->b_dev = NODEV;
191 		bp->b_vnbufs.le_next = NOLIST;
192 		bp->b_data = buffers + i * MAXBSIZE;
193 		LIST_INIT(&bp->b_dep);
194 		if (i < residual)
195 			bp->b_bufsize = (base + 1) * PAGE_SIZE;
196 		else
197 			bp->b_bufsize = base * PAGE_SIZE;
198 		bp->b_flags = B_INVAL;
199 		if (bp->b_bufsize) {
200 			dp = &bufqueues[BQ_CLEAN];
201 			numfreepages += btoc(bp->b_bufsize);
202 			numcleanpages += btoc(bp->b_bufsize);
203 		} else {
204 			dp = &bufqueues[BQ_EMPTY];
205 			numemptybufs++;
206 		}
207 		binsheadfree(bp, dp);
208 		binshash(bp, &invalhash);
209 	}
210 
211 	hidirtypages = bufpages / 4;
212 	lodirtypages = hidirtypages / 2;
213 
214 	/*
215 	 * Reserve 5% of bufpages for syncer's needs,
216 	 * but not more than 25% and if possible
217 	 * not less than 2 * MAXBSIZE.  The locleanpages
218 	 * value must not be too small, but there is
219 	 * probably no reason to set it to more than 1-2 MB.
220 	 */
221 	locleanpages = bufpages / 20;
222 	if (locleanpages < btoc(2 * MAXBSIZE))
223 		locleanpages = btoc(2 * MAXBSIZE);
224 	if (locleanpages > bufpages / 4)
225 		locleanpages = bufpages / 4;
226 	if (locleanpages > btoc(2 * 1024 * 1024))
227 		locleanpages = btoc(2 * 1024 * 1024);
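	/*
	 * Illustrative numbers only (assuming 4 KB pages and the usual
	 * 64 KB MAXBSIZE): with bufpages == 2048 (8 MB of buffer cache),
	 * bufpages / 20 == 102 pages, which is above btoc(2 * MAXBSIZE)
	 * == 32 and below both bufpages / 4 == 512 and btoc(2 MB) == 512,
	 * so locleanpages ends up at 102 pages, roughly 400 KB.
	 */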
228 
229 #ifdef DEBUG
230 	mincleanpages = locleanpages;
231 #endif
232 }
233 
234 static __inline struct buf *
235 bio_doread(struct vnode *vp, daddr_t blkno, int size, int async)
236 {
237 	struct buf *bp;
238 
239 	bp = getblk(vp, blkno, size, 0, 0);
240 
241 	/*
242 	 * If buffer does not have data valid, start a read.
243 	 * Note that if buffer is B_INVAL, getblk() won't return it.
244  * Therefore, it's valid if its I/O has completed or been delayed.
245 	 */
246 	if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
247 		SET(bp->b_flags, B_READ | async);
248 		VOP_STRATEGY(bp);
249 
250 		/* Pay for the read. */
251 		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
252 	} else if (async) {
253 		brelse(bp);
254 	}
255 
256 	return (bp);
257 }
258 
259 /*
260  * Read a disk block.
261  * This algorithm described in Bach (p.54).
262  * This algorithm is described in Bach (p.54).
263 int
264 bread(struct vnode *vp, daddr_t blkno, int size, struct ucred *cred,
265     struct buf **bpp)
266 {
267 	struct buf *bp;
268 
269 	/* Get buffer for block. */
270 	bp = *bpp = bio_doread(vp, blkno, size, 0);
271 
272 	/* Wait for the read to complete, and return result. */
273 	return (biowait(bp));
274 }
275 
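/*
 * Sketch of typical bread() usage by a file system; the block number,
 * size and error handling here are illustrative, not taken from any
 * particular caller in the tree.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	if ((error = bread(vp, lblkno, bsize, NOCRED, &bp)) != 0) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	...inspect or copy out bp->b_data...
 *	brelse(bp);
 */
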
276 /*
277  * Read-ahead multiple disk blocks. The first is sync, the rest async.
278  * Trivial modification to the breada algorithm presented in Bach (p.55).
279  */
280 int
281 breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t rablks[],
282     int rasizes[], int nrablks, struct ucred *cred, struct buf **bpp)
283 {
284 	struct buf *bp;
285 	int i;
286 
287 	bp = *bpp = bio_doread(vp, blkno, size, 0);
288 
289 	/*
290 	 * For each of the read-ahead blocks, start a read, if necessary.
291 	 */
292 	for (i = 0; i < nrablks; i++) {
293 		/* If it's in the cache, just go on to next one. */
294 		if (incore(vp, rablks[i]))
295 			continue;
296 
297 		/* Get a buffer for the read-ahead block */
298 		(void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC);
299 	}
300 
301 	/* Wait for the read of the first block to complete, and return result. */
302 	return (biowait(bp));
303 }
304 
305 /*
306  * Read with single-block read-ahead.  Defined in Bach (p.55), but
307  * implemented as a call to breadn().
308  * XXX for compatibility with old file systems.
309  */
310 int
311 breada(struct vnode *vp, daddr_t blkno, int size, daddr_t rablkno, int rabsize,
312     struct ucred *cred, struct buf **bpp)
313 {
314 
315 	return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
316 }
317 
318 /*
319  * Block write.  Described in Bach (p.56)
320  */
321 int
322 bwrite(struct buf *bp)
323 {
324 	int rv, async, wasdelayed, s;
325 	struct vnode *vp;
326 	struct mount *mp;
327 
328 	/*
329 	 * Remember buffer type, to switch on it later.  If the write was
330 	 * synchronous, but the file system was mounted with MNT_ASYNC,
331 	 * convert it to a delayed write.
332 	 * XXX note that this relies on delayed tape writes being converted
333 	 * to async, not sync writes (which is safe, but ugly).
334 	 */
335 	async = ISSET(bp->b_flags, B_ASYNC);
336 	if (!async && bp->b_vp && bp->b_vp->v_mount &&
337 	    ISSET(bp->b_vp->v_mount->mnt_flag, MNT_ASYNC)) {
338 		bdwrite(bp);
339 		return (0);
340 	}
341 
342 	/*
343 	 * Collect statistics on synchronous and asynchronous writes.
344 	 * Writes to block devices are charged to their associated
345 	 * filesystem (if any).
346 	 */
347 	if ((vp = bp->b_vp) != NULL) {
348 		if (vp->v_type == VBLK)
349 			mp = vp->v_specmountpoint;
350 		else
351 			mp = vp->v_mount;
352 		if (mp != NULL) {
353 			if (async)
354 				mp->mnt_stat.f_asyncwrites++;
355 			else
356 				mp->mnt_stat.f_syncwrites++;
357 		}
358 	}
359 
360 	wasdelayed = ISSET(bp->b_flags, B_DELWRI);
361 	CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));
362 
363 	s = splbio();
364 
365 	/*
366 	 * If the write was delayed, it was already charged when first
367 	 * dirtied; put the buf back on the correct vnode queue now, so
368 	 * that the vnode is properly notified when its I/O completes.
369 	 * Otherwise, charge the write to the process now.
370 	 */
371 	if (wasdelayed) {
372 		reassignbuf(bp);
373 	} else
374 		curproc->p_stats->p_ru.ru_oublock++;
375 
376 
377 	/* Initiate disk write.  Make sure the appropriate party is charged. */
378 	bp->b_vp->v_numoutput++;
379 	splx(s);
380 	SET(bp->b_flags, B_WRITEINPROG);
381 	VOP_STRATEGY(bp);
382 
383 	if (async)
384 		return (0);
385 
386 	/*
387 	 * If I/O was synchronous, wait for it to complete.
388 	 */
389 	rv = biowait(bp);
390 
391 	/* Release the buffer. */
392 	brelse(bp);
393 
394 	return (rv);
395 }
396 
397 
398 /*
399  * Delayed write.
400  *
401  * The buffer is marked dirty, but is not queued for I/O.
402  * This routine should be used when the buffer is expected
403  * to be modified again soon, typically a small write that
404  * partially fills a buffer.
405  *
406  * NB: magnetic tapes cannot be delayed; they must be
407  * written in the order that the writes are requested.
408  *
409  * Described in Leffler, et al. (pp. 208-213).
410  */
411 void
412 bdwrite(struct buf *bp)
413 {
414 	int s;
415 
416 	/*
417 	 * If the block hasn't been seen before:
418 	 *	(1) Mark it as having been seen,
419 	 *	(2) Charge for the write.
420 	 *	(3) Make sure it's on its vnode's correct block list,
421 	 *	(4) If a buffer is rewritten, move it to end of dirty list
422 	 */
423 	if (!ISSET(bp->b_flags, B_DELWRI)) {
424 		SET(bp->b_flags, B_DELWRI);
425 		s = splbio();
426 		reassignbuf(bp);
427 		splx(s);
428 		curproc->p_stats->p_ru.ru_oublock++;	/* XXX */
429 	}
430 
431 	/* If this is a tape block, write the block now. */
432 	if (major(bp->b_dev) < nblkdev &&
433 	    bdevsw[major(bp->b_dev)].d_type == D_TAPE) {
434 		bawrite(bp);
435 		return;
436 	}
437 
438 	/* Otherwise, the "write" is done, so mark and release the buffer. */
439 	CLR(bp->b_flags, B_NEEDCOMMIT);
440 	SET(bp->b_flags, B_DONE);
441 	brelse(bp);
442 }
443 
444 /*
445  * Asynchronous block write; just an asynchronous bwrite().
446  */
447 void
448 bawrite(struct buf *bp)
449 {
450 
451 	SET(bp->b_flags, B_ASYNC);
452 	VOP_BWRITE(bp);
453 }
454 
455 /*
456  * Must be called at splbio()
457  */
458 void
459 buf_dirty(struct buf *bp)
460 {
461 	splassert(IPL_BIO);
462 
463 	if (ISSET(bp->b_flags, B_DELWRI) == 0) {
464 		SET(bp->b_flags, B_DELWRI);
465 		reassignbuf(bp);
466 	}
467 }
468 
469 /*
470  * Must be called at splbio()
471  */
472 void
473 buf_undirty(struct buf *bp)
474 {
475 	splassert(IPL_BIO);
476 
477 	if (ISSET(bp->b_flags, B_DELWRI)) {
478 		CLR(bp->b_flags, B_DELWRI);
479 		reassignbuf(bp);
480 	}
481 }
482 
483 /*
484  * Release a buffer on to the free lists.
485  * Described in Bach (p. 46).
486  */
487 void
488 brelse(struct buf *bp)
489 {
490 	struct bqueues *bufq;
491 	int s;
492 
493 	/* Block disk interrupts. */
494 	s = splbio();
495 
496 	/*
497 	 * Determine which queue the buffer should be on, then put it there.
498 	 */
499 
500 	/* If it's locked, don't report an error; try again later. */
501 	if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
502 		CLR(bp->b_flags, B_ERROR);
503 
504 	/* If it's not cacheable, or an error, mark it invalid. */
505 	if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
506 		SET(bp->b_flags, B_INVAL);
507 
508 	if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
509 		/*
510 		 * If it's invalid or empty, dissociate it from its vnode
511 		 * and put on the head of the appropriate queue.
512 		 */
513 		if (LIST_FIRST(&bp->b_dep) != NULL)
514 			buf_deallocate(bp);
515 
516 		if (ISSET(bp->b_flags, B_DELWRI)) {
517 			CLR(bp->b_flags, B_DELWRI);
518 		}
519 
520 		if (bp->b_vp) {
521 			reassignbuf(bp);
522 			brelvp(bp);
523 		}
524 		if (bp->b_bufsize <= 0) {
525 			/* no data */
526 			bufq = &bufqueues[BQ_EMPTY];
527 			numemptybufs++;
528 		} else {
529 			/* invalid data */
530 			bufq = &bufqueues[BQ_CLEAN];
531 			numfreepages += btoc(bp->b_bufsize);
532 			numcleanpages += btoc(bp->b_bufsize);
533 		}
534 		binsheadfree(bp, bufq);
535 	} else {
536 		/*
537 		 * It has valid data.  Put it on the end of the appropriate
538 		 * queue, so that it'll stick around for as long as possible.
539 		 */
540 		if (ISSET(bp->b_flags, B_LOCKED))
541 			/* locked in core */
542 			bufq = &bufqueues[BQ_LOCKED];
543 		else {
544 			numfreepages += btoc(bp->b_bufsize);
545 			if (!ISSET(bp->b_flags, B_DELWRI)) {
546 				numcleanpages += btoc(bp->b_bufsize);
547 				bufq = &bufqueues[BQ_CLEAN];
548 			} else {
549 				numdirtypages += btoc(bp->b_bufsize);
550 				bufq = &bufqueues[BQ_DIRTY];
551 			}
552 		}
553 		if (ISSET(bp->b_flags, B_AGE))
554 			binsheadfree(bp, bufq);
555 		else
556 			binstailfree(bp, bufq);
557 	}
558 
559 	/* Unlock the buffer. */
560 	CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE | B_DEFERRED));
561 
562 
563 	/* Wake up syncer and cleaner processes waiting for buffers */
564 	if (nobuffers) {
565 		wakeup(&nobuffers);
566 		nobuffers = 0;
567 	}
568 
569 	/* Wake up any processes waiting for any buffer to become free. */
570 	if (needbuffer && (numcleanpages > locleanpages)) {
571 		needbuffer--;
572 		wakeup_one(&needbuffer);
573 	}
574 
575 	splx(s);
576 
577 	/* Wake up any processes waiting for _this_ buffer to become free. */
578 	if (ISSET(bp->b_flags, B_WANTED)) {
579 		CLR(bp->b_flags, B_WANTED);
580 		wakeup(bp);
581 	}
582 }
583 
584 /*
585  * Determine if a block is in the cache.
586  * Just look on what would be its hash chain.  If it's there, return
587  * a pointer to it, unless it's marked invalid.  Buffers marked
588  * invalid are never returned here; the caller sees NULL, the same
589  * as for a block that is not in the cache at all.
590  */
591 struct buf *
592 incore(struct vnode *vp, daddr_t blkno)
593 {
594 	struct buf *bp;
595 
596 	/* Search hash chain */
597 	LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) {
598 		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
599 		    !ISSET(bp->b_flags, B_INVAL))
600 			return (bp);
601 	}
602 
603 	return (0);
604 }
605 
606 /*
607  * Get a block of requested size that is associated with
608  * a given vnode and block offset. If it is found in the
609  * block cache, mark it as having been found, make it busy
610  * and return it. Otherwise, return an empty block of the
611  * correct size. It is up to the caller to ensure that the
612  * cached blocks are of the correct size.
613  */
614 struct buf *
615 getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo)
616 {
617 	struct bufhashhdr *bh;
618 	struct buf *bp, *nbp = NULL;
619 	int s, err;
620 
621 	/*
622 	 * XXX
623 	 * The following is an inlined version of 'incore()', but with
624 	 * the 'invalid' test moved to after the 'busy' test.  It's
625 	 * necessary because there are some cases in which the NFS
626 	 * code sets B_INVAL prior to writing data to the server, but
627 	 * in which the buffers actually contain valid data.  In this
628 	 * case, we can't allow the system to allocate a new buffer for
629 	 * the block until the write is finished.
630 	 */
631 	bh = BUFHASH(vp, blkno);
632 start:
633 	LIST_FOREACH(bp, bh, b_hash) {
634 		if (bp->b_lblkno != blkno || bp->b_vp != vp)
635 			continue;
636 
637 		s = splbio();
638 		if (ISSET(bp->b_flags, B_BUSY)) {
639 			SET(bp->b_flags, B_WANTED);
640 			err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
641 			    slptimeo);
642 			splx(s);
643 			if (err)
644 				return (NULL);
645 			goto start;
646 		}
647 
648 		if (!ISSET(bp->b_flags, B_INVAL)) {
649 			SET(bp->b_flags, (B_BUSY | B_CACHE));
650 			bremfree(bp);
651 			splx(s);
652 			break;
653 		}
654 		splx(s);
655 	}
656 
657 	if (bp == NULL) {
658 		if (nbp == NULL && getnewbuf(slpflag, slptimeo, &nbp) != 0) {
659 			goto start;
660 		}
661 		bp = nbp;
662 		binshash(bp, bh);
663 		bp->b_blkno = bp->b_lblkno = blkno;
664 		s = splbio();
665 		bgetvp(vp, bp);
666 		splx(s);
667 	} else if (nbp != NULL) {
668 		/*
669 		 * Set B_AGE so that the buffer appears at the head of
670 		 * BQ_CLEAN and gets reused ASAP.
671 		 */
672 		SET(nbp->b_flags, B_AGE);
673 		brelse(nbp);
674 	}
675 	allocbuf(bp, size);
676 
677 	return (bp);
678 }
679 
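/*
 * Sketch of how getblk() is typically paired with the write routines;
 * lblkno, bsize and the data being written are illustrative only.
 *
 *	struct buf *bp;
 *
 *	bp = getblk(vp, lblkno, bsize, 0, 0);
 *	...fill in or update bp->b_data...
 *	bdwrite(bp);	(or bwrite(bp) for a synchronous write)
 */
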
680 /*
681  * Get an empty, disassociated buffer of given size.
682  */
683 struct buf *
684 geteblk(int size)
685 {
686 	struct buf *bp;
687 
688 	getnewbuf(0, 0, &bp);
689 	SET(bp->b_flags, B_INVAL);
690 	binshash(bp, &invalhash);
691 	allocbuf(bp, size);
692 
693 	return (bp);
694 }
695 
696 /*
697  * Expand or contract the actual memory allocated to a buffer.
698  *
699  * If the buffer shrinks, data is lost, so it's up to the
700  * caller to have written it out *first*; this routine will not
701  * start a write.  If the buffer grows, it's the caller's
702  * responsibility to fill out the buffer's additional contents.
703  */
704 void
705 allocbuf(struct buf *bp, int size)
706 {
707 	struct buf	*nbp;
708 	vsize_t		desired_size;
709 	int		s;
710 
711 	desired_size = round_page(size);
712 	if (desired_size > MAXBSIZE)
713 		panic("allocbuf: buffer larger than MAXBSIZE requested");
714 
715 	if (bp->b_bufsize == desired_size)
716 		goto out;
717 
718 	/*
719 	 * If the buffer is smaller than the desired size, we need to snarf
720 	 * it from other buffers.  Get buffers (via getnewbuf()), and
721 	 * steal their pages.
722 	 */
723 	while (bp->b_bufsize < desired_size) {
724 		int amt;
725 
726 		/* find a buffer */
727 		getnewbuf(0, 0, &nbp);
728  		SET(nbp->b_flags, B_INVAL);
729 		binshash(nbp, &invalhash);
730 
731 		/* and steal its pages, up to the amount we need */
732 		amt = MIN(nbp->b_bufsize, (desired_size - bp->b_bufsize));
733 		pagemove((nbp->b_data + nbp->b_bufsize - amt),
734 			 bp->b_data + bp->b_bufsize, amt);
735 		bp->b_bufsize += amt;
736 		nbp->b_bufsize -= amt;
737 
738 		/* reduce transfer count if we stole some data */
739 		if (nbp->b_bcount > nbp->b_bufsize)
740 			nbp->b_bcount = nbp->b_bufsize;
741 
742 #ifdef DIAGNOSTIC
743 		if (nbp->b_bufsize < 0)
744 			panic("allocbuf: negative bufsize");
745 #endif
746 
747 		brelse(nbp);
748 	}
749 
750 	/*
751 	 * If we want a buffer smaller than the current size,
752 	 * shrink this buffer.  Grab a buf head from the EMPTY queue,
753  * move the excess onto it, mark it invalid and release it to BQ_CLEAN.
754 	 * If there are no free buffer headers, leave the buffer alone.
755 	 */
756 	if (bp->b_bufsize > desired_size) {
757 		s = splbio();
758 		if ((nbp = bufqueues[BQ_EMPTY].tqh_first) == NULL) {
759 			/* No free buffer head */
760 			splx(s);
761 			goto out;
762 		}
763 		bremfree(nbp);
764 		SET(nbp->b_flags, B_BUSY);
765 		splx(s);
766 
767 		/* move the excess pages to it and note this change */
768 		pagemove(bp->b_data + desired_size,
769 		    nbp->b_data, bp->b_bufsize - desired_size);
770 		nbp->b_bufsize = bp->b_bufsize - desired_size;
771 		bp->b_bufsize = desired_size;
772 		nbp->b_bcount = 0;
773 		SET(nbp->b_flags, B_INVAL);
774 
775 		/* release the newly-filled buffer and leave */
776 		brelse(nbp);
777 	}
778 
779 out:
780 	bp->b_bcount = size;
781 }
782 
783 /*
784  * Find a buffer which is available for use.
785  *
786  * We must notify getblk if we slept during the buffer allocation. When
787  * that happens, we allocate a buffer anyway (unless tsleep is interrupted
788  * or times out) and return !0.
789  */
790 int
791 getnewbuf(int slpflag, int slptimeo, struct buf **bpp)
792 {
793 	struct buf *bp;
794 	int s, ret, error;
795 
796 	*bpp = NULL;
797 	ret = 0;
798 
799 start:
800 	s = splbio();
801 	/*
802 	 * Wake up cleaner if we're getting low on buffers.
803 	 */
804 	if (numdirtypages >= hidirtypages)
805 		wakeup(&bd_req);
806 
807 	if ((numcleanpages <= locleanpages) &&
808 	    curproc != syncerproc && curproc != cleanerproc) {
809 		needbuffer++;
810 		error = tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf",
811 				slptimeo);
812 		splx(s);
813 		if (error)
814 			return (1);
815 		ret = 1;
816 		goto start;
817 	}
818 	if ((bp = TAILQ_FIRST(&bufqueues[BQ_CLEAN])) == NULL) {
819 		/* wait for a free buffer of any kind */
820 		nobuffers = 1;
821 		error = tsleep(&nobuffers, slpflag|(PRIBIO-3),
822 				"getnewbuf", slptimeo);
823 		splx(s);
824 		if (error)
825 			return (1);
826 		ret = 1;
827 		goto start;
828 	}
829 
830 	bremfree(bp);
831 
832 	/* Buffer is no longer on free lists. */
833 	SET(bp->b_flags, B_BUSY);
834 
835 #ifdef DIAGNOSTIC
836 	if (ISSET(bp->b_flags, B_DELWRI))
837 		panic("Dirty buffer on BQ_CLEAN");
838 #endif
839 
840 	/* disassociate us from our vnode, if we had one... */
841 	if (bp->b_vp)
842 		brelvp(bp);
843 
844 	splx(s);
845 
846 #ifdef DIAGNOSTIC
847 	/* CLEAN buffers must have no dependencies */
848 	if (LIST_FIRST(&bp->b_dep) != NULL)
849 		panic("BQ_CLEAN has buffer with dependencies");
850 #endif
851 
852 	/* clear out various other fields */
853 	bp->b_flags = B_BUSY;
854 	bp->b_dev = NODEV;
855 	bp->b_blkno = bp->b_lblkno = 0;
856 	bp->b_iodone = 0;
857 	bp->b_error = 0;
858 	bp->b_resid = 0;
859 	bp->b_bcount = 0;
860 	bp->b_dirtyoff = bp->b_dirtyend = 0;
861 	bp->b_validoff = bp->b_validend = 0;
862 
863 	bremhash(bp);
864 	*bpp = bp;
865 	return (ret);
866 }
867 
868 /*
869  * Buffer cleaning daemon.
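 *
 * Sleeps on bd_req until getnewbuf() notices numdirtypages has reached
 * hidirtypages, then flushes buffers from BQ_DIRTY until the count drops
 * below lodirtypages, the queue empties, or roughly one second has passed.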
870  */
871 void
872 buf_daemon(struct proc *p)
873 {
874 	int s;
875 	struct buf *bp;
876 	struct timeval starttime, timediff;
877 
878 	cleanerproc = curproc;
879 
880 	for (;;) {
881 		if (numdirtypages < hidirtypages) {
882 			tsleep(&bd_req, PRIBIO - 7, "cleaner", 0);
883 		}
884 
885 		starttime = time;
886 		s = splbio();
887 		while ((bp = TAILQ_FIRST(&bufqueues[BQ_DIRTY]))) {
888 			bremfree(bp);
889 			SET(bp->b_flags, B_BUSY);
890 			splx(s);
891 
892 			if (ISSET(bp->b_flags, B_INVAL)) {
893 				brelse(bp);
894 				s = splbio();
895 				continue;
896 			}
897 #ifdef DIAGNOSTIC
898 			if (!ISSET(bp->b_flags, B_DELWRI))
899 				panic("Clean buffer on BQ_DIRTY");
900 #endif
901 			if (LIST_FIRST(&bp->b_dep) != NULL &&
902 			    !ISSET(bp->b_flags, B_DEFERRED) &&
903 			    buf_countdeps(bp, 0, 1)) {
904 				SET(bp->b_flags, B_DEFERRED);
905 				s = splbio();
906 				numfreepages += btoc(bp->b_bufsize);
907 				numdirtypages += btoc(bp->b_bufsize);
908 				binstailfree(bp, &bufqueues[BQ_DIRTY]);
909 				CLR(bp->b_flags, B_BUSY);
910 				continue;
911 			}
912 
913 			bawrite(bp);
914 
915 			if (numdirtypages < lodirtypages)
916 				break;
917 			/* Never allow processing to run for more than 1 sec */
918 			timersub(&time, &starttime, &timediff);
919 			if (timediff.tv_sec)
920 				break;
921 
922 			s = splbio();
923 		}
924 	}
925 }
926 
927 /*
928  * Wait for operations on the buffer to complete.
929  * When they do, extract and return the I/O's error value.
930  */
931 int
932 biowait(struct buf *bp)
933 {
934 	int s;
935 
936 	s = splbio();
937 	while (!ISSET(bp->b_flags, B_DONE))
938 		tsleep(bp, PRIBIO + 1, "biowait", 0);
939 	splx(s);
940 
941 	/* check for interruption of I/O (e.g. via NFS), then errors. */
942 	if (ISSET(bp->b_flags, B_EINTR)) {
943 		CLR(bp->b_flags, B_EINTR);
944 		return (EINTR);
945 	}
946 
947 	if (ISSET(bp->b_flags, B_ERROR))
948 		return (bp->b_error ? bp->b_error : EIO);
949 	else
950 		return (0);
951 }
952 
953 /*
954  * Mark I/O complete on a buffer.
955  *
956  * If a callback has been requested, e.g. the pageout
957  * daemon, do so. Otherwise, awaken waiting processes.
958  *
959  * [ Leffler, et al., says on p.247:
960  *	"This routine wakes up the blocked process, frees the buffer
961  *	for an asynchronous write, or, for a request by the pagedaemon
962  *	process, invokes a procedure specified in the buffer structure" ]
963  *
964  * In real life, the pagedaemon (or other system processes) wants
965  * to do async stuff too, and doesn't want the buffer brelse()'d.
966  * (for swap pager, that puts swap buffers on the free lists (!!!),
967  * for the vn device, that puts malloc'd buffers on the free lists!)
968  *
969  * Must be called at splbio().
970  */
971 void
972 biodone(struct buf *bp)
973 {
974 	splassert(IPL_BIO);
975 
976 	if (ISSET(bp->b_flags, B_DONE))
977 		panic("biodone already");
978 	SET(bp->b_flags, B_DONE);		/* note that it's done */
979 
980 	if (LIST_FIRST(&bp->b_dep) != NULL)
981 		buf_complete(bp);
982 
983 	if (!ISSET(bp->b_flags, B_READ)) {
984 		CLR(bp->b_flags, B_WRITEINPROG);
985 		vwakeup(bp->b_vp);
986 	}
987 
988 	if (ISSET(bp->b_flags, B_CALL)) {	/* if necessary, call out */
989 		CLR(bp->b_flags, B_CALL);	/* but note callout done */
990 		(*bp->b_iodone)(bp);
991 	} else {
992 		if (ISSET(bp->b_flags, B_ASYNC)) {/* if async, release it */
993 			brelse(bp);
994 		} else {			/* or just wakeup the buffer */
995 			CLR(bp->b_flags, B_WANTED);
996 			wakeup(bp);
997 		}
998 	}
999 }
1000 
1001 #ifdef DEBUG
1002 /*
1003  * Print out statistics on the current allocation of the buffer pool.
1004  * Can be enabled to print out on every ``sync'' by setting "syncprt"
1005  * in vfs_syscalls.c using sysctl.
1006  */
1007 void
1008 vfs_bufstats(void)
1009 {
1010 	int s, i, j, count;
1011 	register struct buf *bp;
1012 	register struct bqueues *dp;
1013 	int counts[MAXBSIZE/PAGE_SIZE+1];
1014 	int totals[BQUEUES];
1015 	long ptotals[BQUEUES];
1016 	long pages;
1017 	static char *bname[BQUEUES] = { "LOCKED", "CLEAN", "DIRTY", "EMPTY" };
1018 
1019 	s = splbio();
1020 	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
1021 		count = 0;
1022 		pages = 0;
1023 		for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
1024 			counts[j] = 0;
1025 		for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
1026 			counts[bp->b_bufsize/PAGE_SIZE]++;
1027 			count++;
1028 			pages += btoc(bp->b_bufsize);
1029 		}
1030 		totals[i] = count;
1031 		ptotals[i] = pages;
1032 		printf("%s: total-%d(%ld pages)", bname[i], count, pages);
1033 		for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
1034 			if (counts[j] != 0)
1035 				printf(", %d-%d", j * PAGE_SIZE, counts[j]);
1036 		printf("\n");
1037 	}
1038 	if (totals[BQ_EMPTY] != numemptybufs)
1039 		printf("numemptybufs counter wrong: %d != %d\n",
1040 			numemptybufs, totals[BQ_EMPTY]);
1041 	if ((ptotals[BQ_CLEAN] + ptotals[BQ_DIRTY]) != numfreepages)
1042 		printf("numfreepages counter wrong: %ld != %ld\n",
1043 			numfreepages, ptotals[BQ_CLEAN] + ptotals[BQ_DIRTY]);
1044 	if (ptotals[BQ_CLEAN] != numcleanpages)
1045 		printf("numcleanpages counter wrong: %ld != %ld\n",
1046 			numcleanpages, ptotals[BQ_CLEAN]);
1047 	else
1048 		printf("numcleanpages: %ld\n", numcleanpages);
1049 	if (numdirtypages != ptotals[BQ_DIRTY])
1050 		printf("numdirtypages counter wrong: %ld != %ld\n",
1051 			numdirtypages, ptotals[BQ_DIRTY]);
1052 	else
1053 		printf("numdirtypages: %ld\n", numdirtypages);
1054 
1055 	printf("syncer eating up to %ld pages from %ld reserved\n",
1056 			locleanpages - mincleanpages, locleanpages);
1057 	splx(s);
1058 }
1059 #endif /* DEBUG */
1060