xref: /csrg-svn/sys/kern/vfs_bio.c (revision 57797)
149589Sbostic /*-
249589Sbostic  * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
337736Smckusick  * All rights reserved.
423395Smckusick  *
549618Smckusick  * This module is believed to contain source code proprietary to AT&T.
649618Smckusick  * Use and redistribution is subject to the Berkeley Software License
749618Smckusick  * Agreement and your Software Agreement with AT&T (Western Electric).
837736Smckusick  *
9*57797Smckusick  *	@(#)vfs_bio.c	7.58 (Berkeley) 02/02/93
1023395Smckusick  */
118Sbill 
1251455Sbostic #include <sys/param.h>
1351455Sbostic #include <sys/proc.h>
1451455Sbostic #include <sys/buf.h>
1551455Sbostic #include <sys/vnode.h>
1651455Sbostic #include <sys/mount.h>
1751455Sbostic #include <sys/trace.h>
1851455Sbostic #include <sys/resourcevar.h>
1956395Smckusick #include <sys/malloc.h>
2056395Smckusick #include <libkern/libkern.h>
218Sbill 
2291Sbill /*
2356395Smckusick  * Definitions for the buffer hash lists.
2456395Smckusick  */
2556395Smckusick #define	BUFHASH(dvp, lbn)	\
2656395Smckusick 	(&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
2756607Smckusick struct	list_entry *bufhashtbl, invalhash;
2856395Smckusick u_long	bufhash;
2956395Smckusick 
3056395Smckusick /*
3156395Smckusick  * Insq/Remq for the buffer hash lists.
3256395Smckusick  */
3356607Smckusick #define	binshash(bp, dp)	list_enter_head(dp, bp, struct buf *, b_hash)
3456607Smckusick #define	bremhash(bp)		list_remove(bp, struct buf *, b_hash)
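
/*
 * For example, getblk() below hashes a newly assigned buffer for
 * logical block "blkno" of vnode "vp" onto its chain with
 *
 *	binshash(bp, BUFHASH(vp, blkno));
 *
 * and incore() searches that same chain to find the block later.
 */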
3556395Smckusick 
3656395Smckusick /*
3756395Smckusick  * Definitions for the buffer free lists.
3856395Smckusick  */
3956395Smckusick #define	BQUEUES		4		/* number of free buffer queues */
4056395Smckusick 
4156395Smckusick #define	BQ_LOCKED	0		/* super-blocks &c */
4256395Smckusick #define	BQ_LRU		1		/* lru, useful buffers */
4356395Smckusick #define	BQ_AGE		2		/* rubbish */
4456395Smckusick #define	BQ_EMPTY	3		/* buffer headers with no memory */
4556395Smckusick 
4656607Smckusick struct queue_entry bufqueues[BQUEUES];
4756395Smckusick int needbuffer;
4856395Smckusick 
4956395Smckusick /*
5056395Smckusick  * Insq/Remq for the buffer free lists.
5156395Smckusick  */
5256607Smckusick #define	binsheadfree(bp, dp) \
5356607Smckusick 	queue_enter_head(dp, bp, struct buf *, b_freelist)
5456607Smckusick #define	binstailfree(bp, dp) \
5556607Smckusick 	queue_enter_tail(dp, bp, struct buf *, b_freelist)
5656607Smckusick 
5757045Smargo /*
5857045Smargo  * Local declarations
5957045Smargo  */
6057045Smargo struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
6157045Smargo 	    daddr_t, long, int));
6257045Smargo struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
6357045Smargo 	    daddr_t, daddr_t, long, int, long));
6457045Smargo void	    cluster_wbuild __P((struct vnode *, struct buf *, long size,
6557045Smargo 	    daddr_t start_lbn, int len, daddr_t lbn));
6657045Smargo 
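/*
 * Remove a buffer from the free list that it is on.
 */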
6756395Smckusick void
6856395Smckusick bremfree(bp)
6956395Smckusick 	struct buf *bp;
7056395Smckusick {
7156607Smckusick 	struct queue_entry *dp;
7256395Smckusick 
7356607Smckusick 	/*
7456607Smckusick 	 * We only calculate the head of the freelist when removing
7556607Smckusick 	 * the last element of the list as that is the only time that
7656607Smckusick 	 * it is needed (e.g. to reset the tail pointer).
7756607Smckusick 	 */
7856607Smckusick 	if (bp->b_freelist.qe_next == NULL) {
7956395Smckusick 		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
8056607Smckusick 			if (dp->qe_prev == &bp->b_freelist.qe_next)
8156395Smckusick 				break;
8256395Smckusick 		if (dp == &bufqueues[BQUEUES])
8356395Smckusick 			panic("bremfree: lost tail");
8456395Smckusick 	}
8556607Smckusick 	queue_remove(dp, bp, struct buf *, b_freelist);
8656395Smckusick }
8756395Smckusick 
8856395Smckusick /*
8949280Skarels  * Initialize buffers and hash links for buffers.
9049280Skarels  */
9151455Sbostic void
9249280Skarels bufinit()
9349280Skarels {
9456395Smckusick 	register struct buf *bp;
9556607Smckusick 	struct queue_entry *dp;
9649280Skarels 	register int i;
9749280Skarels 	int base, residual;
9849280Skarels 
9956395Smckusick 	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
10056607Smckusick 		queue_init(dp);
10156607Smckusick 	bufhashtbl = (struct list_entry *)hashinit(nbuf, M_CACHE, &bufhash);
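	/*
	 * Divide the bufpages clusters of buffer memory evenly among
	 * the nbuf buffer headers; the first "residual" headers each
	 * get one extra cluster.  For example, bufpages = 23 and
	 * nbuf = 10 gives base = 2 and residual = 3, so the first three
	 * buffers get 3 * CLBYTES of memory and the rest get 2 * CLBYTES.
	 */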
10249280Skarels 	base = bufpages / nbuf;
10349280Skarels 	residual = bufpages % nbuf;
10449280Skarels 	for (i = 0; i < nbuf; i++) {
10549280Skarels 		bp = &buf[i];
10656395Smckusick 		bzero((char *)bp, sizeof *bp);
10749280Skarels 		bp->b_dev = NODEV;
10849280Skarels 		bp->b_rcred = NOCRED;
10949280Skarels 		bp->b_wcred = NOCRED;
11049280Skarels 		bp->b_un.b_addr = buffers + i * MAXBSIZE;
11149280Skarels 		if (i < residual)
11249280Skarels 			bp->b_bufsize = (base + 1) * CLBYTES;
11349280Skarels 		else
11449280Skarels 			bp->b_bufsize = base * CLBYTES;
11552413Storek 		bp->b_flags = B_INVAL;
11656395Smckusick 		dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
11752413Storek 		binsheadfree(bp, dp);
11856395Smckusick 		binshash(bp, &invalhash);
11949280Skarels 	}
12049280Skarels }
12149280Skarels 
12249280Skarels /*
12346151Smckusick  * Find the block in the buffer pool.
12446151Smckusick  * If the buffer is not present, allocate a new buffer and load
12546151Smckusick  * its contents according to the filesystem fill routine.
1268Sbill  */
12738776Smckusick bread(vp, blkno, size, cred, bpp)
12837736Smckusick 	struct vnode *vp;
1296563Smckusic 	daddr_t blkno;
1306563Smckusic 	int size;
13138776Smckusick 	struct ucred *cred;
13237736Smckusick 	struct buf **bpp;
1338Sbill {
13447545Skarels 	struct proc *p = curproc;		/* XXX */
1358Sbill 	register struct buf *bp;
1368Sbill 
1378670S 	if (size == 0)
1388670S 		panic("bread: size 0");
139*57797Smckusick 	*bpp = bp = getblk(vp, blkno, size, 0, 0);
14046151Smckusick 	if (bp->b_flags & (B_DONE | B_DELWRI)) {
14140341Smckusick 		trace(TR_BREADHIT, pack(vp, size), blkno);
14237736Smckusick 		return (0);
1438Sbill 	}
1448Sbill 	bp->b_flags |= B_READ;
1458670S 	if (bp->b_bcount > bp->b_bufsize)
1468670S 		panic("bread");
14738776Smckusick 	if (bp->b_rcred == NOCRED && cred != NOCRED) {
14838776Smckusick 		crhold(cred);
14938776Smckusick 		bp->b_rcred = cred;
15038776Smckusick 	}
15137736Smckusick 	VOP_STRATEGY(bp);
15240341Smckusick 	trace(TR_BREADMISS, pack(vp, size), blkno);
15347545Skarels 	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
15437736Smckusick 	return (biowait(bp));
1558Sbill }
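
/*
 * A typical caller reads a logical block, copies the data out, and
 * releases the buffer.  A sketch (fs_bsize, n, and uio are the
 * caller's; they are illustrative here):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	if (error = bread(vp, lbn, fs_bsize, NOCRED, &bp))
 *		return (error);
 *	error = uiomove(bp->b_un.b_addr, n, uio);
 *	brelse(bp);
 */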
1568Sbill 
1578Sbill /*
15852189Smckusick  * Operates like bread, but also starts I/O on the N specified
15952189Smckusick  * read-ahead blocks.
1608Sbill  */
16152189Smckusick breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
16237736Smckusick 	struct vnode *vp;
1637114Smckusick 	daddr_t blkno; int size;
16452189Smckusick 	daddr_t rablkno[]; int rabsize[];
16552189Smckusick 	int num;
16638776Smckusick 	struct ucred *cred;
16737736Smckusick 	struct buf **bpp;
1688Sbill {
16947545Skarels 	struct proc *p = curproc;		/* XXX */
1708Sbill 	register struct buf *bp, *rabp;
17152189Smckusick 	register int i;
1728Sbill 
1738Sbill 	bp = NULL;
1747015Smckusick 	/*
17546151Smckusick 	 * If the block is not memory resident,
17646151Smckusick 	 * allocate a buffer and start I/O.
1777015Smckusick 	 */
17837736Smckusick 	if (!incore(vp, blkno)) {
179*57797Smckusick 		*bpp = bp = getblk(vp, blkno, size, 0, 0);
18046151Smckusick 		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
1818Sbill 			bp->b_flags |= B_READ;
1828670S 			if (bp->b_bcount > bp->b_bufsize)
18352189Smckusick 				panic("breadn");
18438776Smckusick 			if (bp->b_rcred == NOCRED && cred != NOCRED) {
18538776Smckusick 				crhold(cred);
18638776Smckusick 				bp->b_rcred = cred;
18738776Smckusick 			}
18837736Smckusick 			VOP_STRATEGY(bp);
18940341Smckusick 			trace(TR_BREADMISS, pack(vp, size), blkno);
19047545Skarels 			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
19154342Smckusick 		} else {
19240341Smckusick 			trace(TR_BREADHIT, pack(vp, size), blkno);
19354342Smckusick 		}
1948Sbill 	}
1957015Smckusick 
1967015Smckusick 	/*
19752189Smckusick 	 * If there are read-ahead blocks, start I/O
19852189Smckusick 	 * on them also (as above).
1997015Smckusick 	 */
20052189Smckusick 	for (i = 0; i < num; i++) {
20152189Smckusick 		if (incore(vp, rablkno[i]))
20252189Smckusick 			continue;
203*57797Smckusick 		rabp = getblk(vp, rablkno[i], rabsize[i], 0, 0);
20446151Smckusick 		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
2058Sbill 			brelse(rabp);
20652189Smckusick 			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
2072045Swnj 		} else {
20846151Smckusick 			rabp->b_flags |= B_ASYNC | B_READ;
2098670S 			if (rabp->b_bcount > rabp->b_bufsize)
2108670S 				panic("breadrabp");
21138880Smckusick 			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
21238776Smckusick 				crhold(cred);
21338880Smckusick 				rabp->b_rcred = cred;
21438776Smckusick 			}
21537736Smckusick 			VOP_STRATEGY(rabp);
21652189Smckusick 			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
21747545Skarels 			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
2188Sbill 		}
2198Sbill 	}
2207015Smckusick 
2217015Smckusick 	/*
22246151Smckusick 	 * If block was memory resident, let bread get it.
22346151Smckusick 	 * If block was not memory resident, the read was
22446151Smckusick 	 * started above, so just wait for the read to complete.
2257015Smckusick 	 */
2267114Smckusick 	if (bp == NULL)
22738776Smckusick 		return (bread(vp, blkno, size, cred, bpp));
22837736Smckusick 	return (biowait(bp));
2298Sbill }
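
/*
 * For sequential reads, a filesystem usually requests a single
 * read-ahead block, e.g. (sketch; nextlbn and nextsize are the
 * caller's next logical block and its size):
 *
 *	error = breadn(vp, lbn, size, &nextlbn, &nextsize, 1, NOCRED, &bp);
 */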
2308Sbill 
2318Sbill /*
23257045Smargo  * We could optimize this by keeping track of where the last read-ahead
23357045Smargo  * was, but it would involve adding fields to the vnode.  For now, let's
23457045Smargo  * just get it working.
23557045Smargo  *
23657045Smargo  * This replaces bread.  If this is a bread at the beginning of a file and
23757045Smargo  * lastr is 0, we assume this is the first read and we'll read up to two
23857045Smargo  * blocks if they are sequential.  After that, we'll do regular read ahead
23957045Smargo  * in clustered chunks.
24057045Smargo  *
24157045Smargo  * There are 4 or 5 cases depending on how you count:
24257045Smargo  *	Desired block is in the cache:
24357045Smargo  *	    1 Not sequential access (0 I/Os).
24457045Smargo  *	    2 Access is sequential, do read-ahead (1 ASYNC).
24557045Smargo  *	Desired block is not in cache:
24657045Smargo  *	    3 Not sequential access (1 SYNC).
24757045Smargo  *	    4 Sequential access, next block is contiguous (1 SYNC).
24857045Smargo  *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
24957045Smargo  *
25057045Smargo  * There are potentially two buffers that require I/O.
25157045Smargo  * 	bp is the block requested.
25257045Smargo  *	rbp is the read-ahead block.
25357045Smargo  *	If either is NULL, then you don't have to do the I/O.
25457045Smargo  */
25557045Smargo cluster_read(vp, filesize, lblkno, size, cred, bpp)
25657045Smargo 	struct vnode *vp;
25757045Smargo 	u_quad_t filesize;
25857045Smargo 	daddr_t lblkno;
25957045Smargo 	long size;
26057045Smargo 	struct ucred *cred;
26157045Smargo 	struct buf **bpp;
26257045Smargo {
26357045Smargo 	struct buf *bp, *rbp;
26457045Smargo 	daddr_t blkno, ioblkno;
26557045Smargo 	long flags;
26657045Smargo 	int error, num_ra, alreadyincore;
26757045Smargo 
26857045Smargo #ifdef DIAGNOSTIC
26957045Smargo 	if (size == 0)
27057045Smargo 		panic("cluster_read: size = 0");
27157045Smargo #endif
27257045Smargo 
27357045Smargo 	error = 0;
27457045Smargo 	flags = B_READ;
275*57797Smckusick 	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
27657045Smargo 	if (bp->b_flags & (B_CACHE | B_DONE | B_DELWRI)) {
27757045Smargo 		/*
27857045Smargo 		 * Desired block is in cache; do any readahead ASYNC.
27957045Smargo 		 * Case 1, 2.
28057045Smargo 		 */
28157045Smargo 		trace(TR_BREADHIT, pack(vp, size), lblkno);
28257045Smargo 		flags |= B_ASYNC;
28357045Smargo 		ioblkno = lblkno +
28457045Smargo 		    (lblkno < vp->v_ralen ? vp->v_ralen >> 1 : vp->v_ralen);
285*57797Smckusick 		alreadyincore = (int)incore(vp, ioblkno);
28657045Smargo 		bp = NULL;
28757045Smargo 	} else {
28857045Smargo 		/* Block wasn't in cache, case 3, 4, 5. */
28957045Smargo 		trace(TR_BREADMISS, pack(vp, size), lblkno);
29057045Smargo 		ioblkno = lblkno;
29157045Smargo 		bp->b_flags |= flags;
29257045Smargo 		alreadyincore = 0;
29357045Smargo 		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
29457045Smargo 	}
29557045Smargo 	/*
29657045Smargo 	 * XXX
29757045Smargo 	 * Replace 1 with a window size based on some permutation of
29857045Smargo 	 * maxcontig and rot_delay.  This will let you figure out how
29957045Smargo 	 * many blocks you should read-ahead (case 2, 4, 5).
30057045Smargo 	 *
30157045Smargo 	 * If the access isn't sequential, cut the window size in half.
30257045Smargo 	 */
30357045Smargo 	rbp = NULL;
30457045Smargo 	if (lblkno != vp->v_lastr + 1 && lblkno != 0)
30557045Smargo 		vp->v_ralen = max(vp->v_ralen >> 1, 1);
30657045Smargo 	else if ((ioblkno + 1) * size < filesize && !alreadyincore &&
30757045Smargo 	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra))) {
30857045Smargo 		/*
30957045Smargo 		 * Reading sequentially, and the next block is not in the
31057045Smargo 		 * cache.  We are going to try reading ahead. If this is
31157045Smargo 		 * the first read of a file, then limit read-ahead to a
31257045Smargo 		 * single block, else read as much as we're allowed.
31357045Smargo 		 */
31457045Smargo 		if (num_ra > vp->v_ralen) {
31557045Smargo 			num_ra = vp->v_ralen;
31657045Smargo 			vp->v_ralen = min(MAXPHYS / size, vp->v_ralen << 1);
31757045Smargo 		} else
31857045Smargo 			vp->v_ralen = num_ra + 1;
31957045Smargo 
32157045Smargo 		if (num_ra)				/* case 2, 4 */
32257045Smargo 			rbp = cluster_rbuild(vp, filesize,
32357045Smargo 			    bp, ioblkno, blkno, size, num_ra, flags);
32457045Smargo 		else if (lblkno != 0 && ioblkno == lblkno) {
32557045Smargo 			/* Case 5: check how many blocks to read ahead */
32657045Smargo 			++ioblkno;
32757045Smargo 			if ((ioblkno + 1) * size > filesize ||
32857045Smargo 			    (error = VOP_BMAP(vp,
32957045Smargo 			    ioblkno, NULL, &blkno, &num_ra)))
33057045Smargo 				goto skip_readahead;
33157045Smargo 			flags |= B_ASYNC;
33257045Smargo 			if (num_ra)
33357045Smargo 				rbp = cluster_rbuild(vp, filesize,
33457045Smargo 				    NULL, ioblkno, blkno, size, num_ra, flags);
33557045Smargo 			else {
336*57797Smckusick 				rbp = getblk(vp, ioblkno, size, 0, 0);
33757045Smargo 				rbp->b_flags |= flags;
33857045Smargo 				rbp->b_blkno = blkno;
33957045Smargo 			}
34057045Smargo 		} else if (lblkno != 0) {
34157045Smargo 			/* case 2; read ahead single block */
342*57797Smckusick 			rbp = getblk(vp, ioblkno, size, 0, 0);
34357045Smargo 			rbp->b_flags |= flags;
34457045Smargo 			rbp->b_blkno = blkno;
34557045Smargo 		} else if (bp)				/* case 1, 3, block 0 */
34657045Smargo 			bp->b_blkno = blkno;
34757045Smargo 		/* Case 1 on block 0; not really doing sequential I/O */
34857045Smargo 
34957045Smargo 		if (rbp == bp)		/* case 4 */
35057045Smargo 			rbp = NULL;
35157045Smargo 		else if (rbp) {			/* case 2, 5 */
35257045Smargo 			trace(TR_BREADMISSRA,
35357045Smargo 			    pack(vp, (num_ra + 1) * size), ioblkno);
35457045Smargo 			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
35557045Smargo 		}
35657045Smargo 	}
35757045Smargo 
35857045Smargo 	/* XXX Kirk, do we need to make sure the bp has creds? */
35957045Smargo skip_readahead:
36057045Smargo 	if (bp)
36157045Smargo 		if (bp->b_flags & (B_DONE | B_DELWRI))
36257045Smargo 			panic("cluster_read: DONE bp");
36357045Smargo 		else
36457045Smargo 			error = VOP_STRATEGY(bp);
36557045Smargo 
36657045Smargo 	if (rbp)
36757045Smargo 		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
36857045Smargo 			rbp->b_flags &= ~(B_ASYNC | B_READ);
36957045Smargo 			brelse(rbp);
37057045Smargo 		} else
37157045Smargo 			(void) VOP_STRATEGY(rbp);
37257045Smargo 
37357045Smargo 	if (bp)
37457045Smargo 		return(biowait(bp));
37557045Smargo 	return(error);
37657045Smargo }
37757045Smargo 
37857045Smargo /*
37957045Smargo  * If blocks are contiguous on disk, use this to provide clustered
38057045Smargo  * read ahead.  We will read as many blocks as possible sequentially
38157045Smargo  * and then parcel them up into logical blocks in the buffer hash table.
38257045Smargo  */
38357045Smargo struct buf *
38457045Smargo cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
38557045Smargo 	struct vnode *vp;
38657045Smargo 	u_quad_t filesize;
38757045Smargo 	struct buf *bp;
38857045Smargo 	daddr_t lbn;
38957045Smargo 	daddr_t blkno;
39057045Smargo 	long size;
39157045Smargo 	int run;
39257045Smargo 	long flags;
39357045Smargo {
39457045Smargo 	struct cluster_save *b_save;
39557045Smargo 	struct buf *tbp;
39657045Smargo 	daddr_t bn;
39757045Smargo 	int i, inc;
39857045Smargo 
39957045Smargo 	if (size * (lbn + run + 1) > filesize)
40057045Smargo 		--run;
40157045Smargo 	if (run == 0) {
40257045Smargo 		if (!bp) {
403*57797Smckusick 			bp = getblk(vp, lbn, size, 0, 0);
40457045Smargo 			bp->b_blkno = blkno;
40557045Smargo 			bp->b_flags |= flags;
40657045Smargo 		}
40757045Smargo 		return(bp);
40857045Smargo 	}
40957045Smargo 
41057045Smargo 	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
41157045Smargo 	if (bp->b_flags & (B_DONE | B_DELWRI))
41257045Smargo 		return (bp);
41357045Smargo 
41457045Smargo 	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
41557045Smargo 	    M_SEGMENT, M_WAITOK);
41657045Smargo 	b_save->bs_bufsize = b_save->bs_bcount = size;
41757045Smargo 	b_save->bs_nchildren = 0;
41857045Smargo 	b_save->bs_children = (struct buf **)(b_save + 1);
41957045Smargo 	b_save->bs_saveaddr = bp->b_saveaddr;
42057045Smargo 	bp->b_saveaddr = (caddr_t) b_save;
42157045Smargo 
42257045Smargo 	inc = size / DEV_BSIZE;
42357045Smargo 	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
42457045Smargo 		if (incore(vp, lbn + i)) {
42557045Smargo 			if (i == 1) {
42657045Smargo 				bp->b_saveaddr = b_save->bs_saveaddr;
42757045Smargo 				bp->b_flags &= ~B_CALL;
42857045Smargo 				bp->b_iodone = NULL;
42957045Smargo 				allocbuf(bp, size);
43057045Smargo 				free(b_save, M_SEGMENT);
43157045Smargo 			} else
43257045Smargo 				allocbuf(bp, size * i);
43357045Smargo 			break;
43457045Smargo 		}
435*57797Smckusick 		tbp = getblk(vp, lbn + i, 0, 0, 0);
43657045Smargo 		tbp->b_bcount = tbp->b_bufsize = size;
43757045Smargo 		tbp->b_blkno = bn;
43857045Smargo 		tbp->b_flags |= flags | B_READ | B_ASYNC;
43957045Smargo 		++b_save->bs_nchildren;
44057045Smargo 		b_save->bs_children[i - 1] = tbp;
44157045Smargo 	}
44257045Smargo 	if (!(bp->b_flags & B_ASYNC))
44357045Smargo 		vp->v_ralen = max(vp->v_ralen - 1, 1);
44457045Smargo 	return(bp);
44557045Smargo }
44657045Smargo 
44757045Smargo /*
44857045Smargo  * Either get a new buffer or grow the existing one.
44957045Smargo  */
45057045Smargo struct buf *
45157045Smargo cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
45257045Smargo 	struct vnode *vp;
45357045Smargo 	struct buf *bp;
45457045Smargo 	long flags;
45557045Smargo 	daddr_t blkno;
45657045Smargo 	daddr_t lblkno;
45757045Smargo 	long size;
45857045Smargo 	int run;
45957045Smargo {
46057045Smargo 	if (!bp) {
461*57797Smckusick 		bp = getblk(vp, lblkno, size, 0, 0);
46257045Smargo 		if (bp->b_flags & (B_DONE | B_DELWRI)) {
46357045Smargo 			bp->b_blkno = blkno;
46457045Smargo 			return(bp);
46557045Smargo 		}
46657045Smargo 	}
46757045Smargo 	allocbuf(bp, run * size);
46857045Smargo 	bp->b_blkno = blkno;
46957045Smargo 	bp->b_iodone = cluster_callback;
47057045Smargo 	bp->b_flags |= flags | B_CALL;
47157045Smargo 	return(bp);
47257045Smargo }
47357045Smargo 
47457045Smargo /*
47557045Smargo  * Cleanup after a clustered read or write.
47657045Smargo  */
47757045Smargo void
47857045Smargo cluster_callback(bp)
47957045Smargo 	struct buf *bp;
48057045Smargo {
48157045Smargo 	struct cluster_save *b_save;
48257045Smargo 	struct buf **tbp;
48357045Smargo 	long bsize;
48457045Smargo 	caddr_t cp;
48557045Smargo 	b_save = (struct cluster_save *)(bp->b_saveaddr);
48657045Smargo 	bp->b_saveaddr = b_save->bs_saveaddr;
48757045Smargo 
48857045Smargo 	cp = bp->b_un.b_addr + b_save->bs_bufsize;
48957045Smargo 	for (tbp = b_save->bs_children; b_save->bs_nchildren--; ++tbp) {
49057045Smargo 		pagemove(cp, (*tbp)->b_un.b_addr, (*tbp)->b_bufsize);
49157045Smargo 		cp += (*tbp)->b_bufsize;
49257045Smargo 		bp->b_bufsize -= (*tbp)->b_bufsize;
49357045Smargo 		biodone(*tbp);
49457045Smargo 	}
49557045Smargo #ifdef DIAGNOSTIC
49657045Smargo 	if (bp->b_bufsize != b_save->bs_bufsize)
49757045Smargo 		panic("cluster_callback: more space to reclaim");
49857045Smargo #endif
49957045Smargo 	bp->b_bcount = bp->b_bufsize;
50057045Smargo 	bp->b_iodone = NULL;
50157045Smargo 	free(b_save, M_SEGMENT);
50257045Smargo 	if (bp->b_flags & B_ASYNC)
50357045Smargo 		brelse(bp);
50457045Smargo 	else
50557045Smargo 		wakeup((caddr_t)bp);
50657045Smargo }
50757045Smargo 
50857045Smargo /*
50946151Smckusick  * Synchronous write.
51046151Smckusick  * Release buffer on completion.
5118Sbill  */
5128Sbill bwrite(bp)
5137015Smckusick 	register struct buf *bp;
5148Sbill {
51547545Skarels 	struct proc *p = curproc;		/* XXX */
51637736Smckusick 	register int flag;
51752413Storek 	int s, error = 0;
5188Sbill 
5198Sbill 	flag = bp->b_flags;
5209857Ssam 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
52149459Smckusick 	if (flag & B_ASYNC) {
52249459Smckusick 		if ((flag & B_DELWRI) == 0)
52349459Smckusick 			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
52449459Smckusick 		else
52549459Smckusick 			reassignbuf(bp, bp->b_vp);
52649459Smckusick 	}
52740341Smckusick 	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
5288670S 	if (bp->b_bcount > bp->b_bufsize)
5298670S 		panic("bwrite");
53040226Smckusick 	s = splbio();
53139882Smckusick 	bp->b_vp->v_numoutput++;
532*57797Smckusick 	bp->b_flags |= B_WRITEINPROG;
53340226Smckusick 	splx(s);
53437736Smckusick 	VOP_STRATEGY(bp);
5357015Smckusick 
5367015Smckusick 	/*
53746151Smckusick 	 * If the write was synchronous, then await I/O completion.
5387015Smckusick 	 * If the write was asynchronous but the buffer was marked for delayed
53946151Smckusick 	 * write, age it so that it is reused soon after the write completes.
5407015Smckusick 	 */
54146151Smckusick 	if ((flag & B_ASYNC) == 0) {
54237736Smckusick 		error = biowait(bp);
54349459Smckusick 		if ((flag&B_DELWRI) == 0)
54449459Smckusick 			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
54549459Smckusick 		else
54649459Smckusick 			reassignbuf(bp, bp->b_vp);
547*57797Smckusick 		if (bp->b_flags & B_EINTR) {
548*57797Smckusick 			bp->b_flags &= ~B_EINTR;
549*57797Smckusick 			error = EINTR;
550*57797Smckusick 		}
5518Sbill 		brelse(bp);
55237736Smckusick 	} else if (flag & B_DELWRI) {
55352413Storek 		s = splbio();
5548Sbill 		bp->b_flags |= B_AGE;
55552413Storek 		splx(s);
55637736Smckusick 	}
55737736Smckusick 	return (error);
5588Sbill }
5598Sbill 
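/*
 * The vnode "bwrite" operation for filesystems that use the common
 * buffer cache; it simply hands the buffer to bwrite() above.
 */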
56053578Sheideman int
56153578Sheideman vn_bwrite(ap)
56253578Sheideman 	struct vop_bwrite_args *ap;
56353578Sheideman {
56456395Smckusick 	return (bwrite(ap->a_bp));
56553578Sheideman }
56653578Sheideman 
5688Sbill /*
56946151Smckusick  * Delayed write.
57046151Smckusick  *
57146151Smckusick  * The buffer is marked dirty, but is not queued for I/O.
57246151Smckusick  * This routine should be used when the buffer is expected
57346151Smckusick  * to be modified again soon, typically a small write that
57446151Smckusick  * partially fills a buffer.
57546151Smckusick  *
57646151Smckusick  * NB: magnetic tapes cannot be delayed; they must be
57746151Smckusick  * written in the order that the writes are requested.
5788Sbill  */
5798Sbill bdwrite(bp)
5807015Smckusick 	register struct buf *bp;
5818Sbill {
58247545Skarels 	struct proc *p = curproc;		/* XXX */
5838Sbill 
58439882Smckusick 	if ((bp->b_flags & B_DELWRI) == 0) {
58539882Smckusick 		bp->b_flags |= B_DELWRI;
58639882Smckusick 		reassignbuf(bp, bp->b_vp);
58747545Skarels 		p->p_stats->p_ru.ru_oublock++;		/* no one paid yet */
58839882Smckusick 	}
58937736Smckusick 	/*
59039668Smckusick 	 * If this is a tape drive, the write must be initiated.
59137736Smckusick 	 */
59248360Smckusick 	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
5938Sbill 		bawrite(bp);
59439668Smckusick 	} else {
59546151Smckusick 		bp->b_flags |= (B_DONE | B_DELWRI);
5968Sbill 		brelse(bp);
5978Sbill 	}
5988Sbill }
5998Sbill 
6008Sbill /*
60146151Smckusick  * Asynchronous write.
60246151Smckusick  * Start I/O on a buffer, but do not wait for it to complete.
60346151Smckusick  * The buffer is released when the I/O completes.
6048Sbill  */
6058Sbill bawrite(bp)
6067015Smckusick 	register struct buf *bp;
6078Sbill {
6088Sbill 
60946151Smckusick 	/*
61046151Smckusick 	 * Setting the ASYNC flag causes bwrite to return
61146151Smckusick 	 * after starting the I/O.
61246151Smckusick 	 */
6138Sbill 	bp->b_flags |= B_ASYNC;
614*57797Smckusick 	(void) VOP_BWRITE(bp);
6158Sbill }
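
/*
 * A filesystem write path typically chooses among the three flavors
 * above.  A rough sketch (IO_SYNC is the ioflag bit that requests
 * synchronous writes; the rest is illustrative):
 *
 *	if (ioflag & IO_SYNC)
 *		(void) bwrite(bp);	(wait for the write)
 *	else if (a full block was written)
 *		bawrite(bp);		(start it, do not wait)
 *	else
 *		bdwrite(bp);		(expect more data soon)
 */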
6168Sbill 
6178Sbill /*
61857045Smargo  * Do clustered write for FFS.
61957045Smargo  *
62057045Smargo  * Four cases:
62157045Smargo  *	1. Write is not sequential (write asynchronously)
62257045Smargo  *	Write is sequential:
62357045Smargo  *	2.	beginning of cluster - begin cluster
62457045Smargo  *	3.	middle of a cluster - add to cluster
62557045Smargo  *	4.	end of a cluster - asynchronously write cluster
62657045Smargo  */
62757045Smargo void
62857045Smargo cluster_write(bp, filesize)
62957045Smargo         struct buf *bp;
63057045Smargo 	u_quad_t filesize;
63157045Smargo {
63257045Smargo         struct vnode *vp;
63357045Smargo         daddr_t lbn;
63457045Smargo         int clen, error, maxrun;
63557045Smargo 
63657045Smargo         vp = bp->b_vp;
63757045Smargo         lbn = bp->b_lblkno;
63857045Smargo 	clen = 0;
63957045Smargo 
64057045Smargo 	/*
64157045Smargo 	 * Handle end of file first.  If we are appending, we need to check
64257045Smargo 	 * if the current block was allocated contiguously.  If it wasn't,
64357045Smargo 	 * then we need to fire off a previous cluster if it existed.
64457045Smargo 	 * Additionally, when we're appending, we need to figure out how
64557045Smargo 	 * to initialize vp->v_clen.
64657045Smargo 	 */
64757045Smargo 	if ((lbn + 1) * bp->b_bcount == filesize) {
64857045Smargo 		if (bp->b_blkno != vp->v_lasta + bp->b_bcount / DEV_BSIZE) {
64957045Smargo 			/* This block was not allocated contiguously */
65057045Smargo 			if (vp->v_clen)
65157045Smargo 			    cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
65257045Smargo 				vp->v_lastw - vp->v_cstart + 1, lbn);
65357045Smargo 			vp->v_cstart = lbn;
65457045Smargo 			clen = vp->v_clen =
65557045Smargo 			    MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
65657045Smargo 			/*
65757045Smargo 			 * Next cluster started. Write this buffer and return.
65857045Smargo 			 */
65957045Smargo 			vp->v_lastw = lbn;
66057045Smargo 			vp->v_lasta = bp->b_blkno;
66157045Smargo 			bdwrite(bp);
66257045Smargo 			return;
66357045Smargo 		}
66457045Smargo 		vp->v_lasta = bp->b_blkno;
66557045Smargo 	} else if (lbn == 0) {
66657045Smargo 		vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
66757045Smargo 	}
66857045Smargo         if (vp->v_clen == 0 || lbn != vp->v_lastw + 1) {
66957045Smargo 		if (vp->v_clen != 0)
67057045Smargo 			/*
67157045Smargo 			 * Write is not sequential.
67257045Smargo 			 */
67357045Smargo 			cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
67457045Smargo 			    vp->v_lastw - vp->v_cstart + 1, lbn);
67557045Smargo 		/*
67657045Smargo 		 * Consider beginning a cluster.
67757045Smargo 		 */
67857045Smargo 		if (error = VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &clen)) {
67957045Smargo 			bawrite(bp);
68057045Smargo 			vp->v_cstart = lbn + 1;
68157045Smargo 			vp->v_lastw = lbn;
68257045Smargo 			return;
68357045Smargo 		}
68457045Smargo                 vp->v_clen = clen;
68557045Smargo                 if (clen == 0) {		/* I/O not contiguous */
68657045Smargo 			vp->v_cstart = lbn + 1;
68757045Smargo                         bawrite(bp);
68857045Smargo                 } else {			/* Wait for rest of cluster */
68957045Smargo 			vp->v_cstart = lbn;
69057045Smargo                         bdwrite(bp);
69157045Smargo 		}
69257045Smargo         } else if (lbn == vp->v_cstart + vp->v_clen) {
69357045Smargo 		/*
69457045Smargo 		 * At end of cluster, write it out.
69557045Smargo 		 */
69657045Smargo 		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
69757045Smargo 		    vp->v_clen + 1, lbn);
69857045Smargo 		vp->v_clen = 0;
69957045Smargo 		vp->v_cstart = lbn + 1;
70057045Smargo         } else
70157045Smargo 		/*
70257045Smargo 		 * In the middle of a cluster, so just delay the
70357045Smargo 		 * I/O for now.
70457045Smargo 		 */
70557045Smargo                 bdwrite(bp);
70657045Smargo         vp->v_lastw = lbn;
70757045Smargo }
70857045Smargo 
70957045Smargo 
71057045Smargo /*
71157045Smargo  * This is an awful lot like cluster_rbuild...wish they could be combined.
71257045Smargo  * The last lbn argument is the current block on which I/O is being
71357045Smargo  * performed.  Check to see that it doesn't fall in the middle of
71457045Smargo  * the cluster being written.
71557045Smargo  */
71657045Smargo void
71757045Smargo cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
71857045Smargo 	struct vnode *vp;
71957045Smargo 	struct buf *last_bp;
72057045Smargo 	long size;
72157045Smargo 	daddr_t start_lbn;
72257045Smargo 	int len;
72357045Smargo 	daddr_t	lbn;
72457045Smargo {
72557045Smargo 	struct cluster_save *b_save;
72657045Smargo 	struct buf *bp, *tbp;
72757045Smargo 	caddr_t	cp;
72857045Smargo 	int i, s;
72957045Smargo 
73057045Smargo redo:
73157045Smargo 	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
73257045Smargo 		++start_lbn;
73357045Smargo 		--len;
73457045Smargo 	}
73557045Smargo 
73657045Smargo 	/* Nothing left to cluster with; write out the last buffer, if any */
73757045Smargo 	if (len <= 1) {
73857045Smargo 		if (last_bp)
73957045Smargo 			bawrite(last_bp);
74057045Smargo 		return;
74157045Smargo 	}
74257045Smargo 
743*57797Smckusick 	bp = getblk(vp, start_lbn, size, 0, 0);
74457045Smargo 	if (!(bp->b_flags & B_DELWRI)) {
74557045Smargo 		++start_lbn;
74657045Smargo 		--len;
74757045Smargo 		brelse(bp);
74857045Smargo 		goto redo;
74957045Smargo 	}
75057045Smargo 
75157045Smargo 	--len;
75257045Smargo 	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
75357045Smargo 	    M_SEGMENT, M_WAITOK);
75457045Smargo 	b_save->bs_bcount = bp->b_bcount;
75557045Smargo 	b_save->bs_bufsize = bp->b_bufsize;
75657045Smargo 	b_save->bs_nchildren = 0;
75757045Smargo 	b_save->bs_children = (struct buf **)(b_save + 1);
75857045Smargo 	b_save->bs_saveaddr = bp->b_saveaddr;
75957045Smargo 	bp->b_saveaddr = (caddr_t) b_save;
76057045Smargo 
76257045Smargo 	bp->b_flags |= B_CALL;
76357045Smargo 	bp->b_iodone = cluster_callback;
76457045Smargo 	cp = bp->b_un.b_addr + bp->b_bufsize;
76557045Smargo 	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
76657045Smargo 		if (!incore(vp, start_lbn) || start_lbn == lbn)
76757045Smargo 			break;
76857045Smargo 
76957045Smargo 		if (last_bp == NULL || start_lbn != last_bp->b_lblkno) {
770*57797Smckusick 			tbp = getblk(vp, start_lbn, size, 0, 0);
77157045Smargo #ifdef DIAGNOSTIC
77257045Smargo 			if (tbp->b_bcount != tbp->b_bufsize)
77357045Smargo 				panic("cluster_wbuild: Buffer too big");
77457045Smargo #endif
77557045Smargo 			if (!(tbp->b_flags & B_DELWRI)) {
77657045Smargo 				brelse(tbp);
77757045Smargo 				break;
77857045Smargo 			}
77957045Smargo 		} else
78057045Smargo 			tbp = last_bp;
78157045Smargo 
78257045Smargo 		++b_save->bs_nchildren;
78357045Smargo 
78457045Smargo 		/* Move memory from children to parent */
78557045Smargo 		pagemove(tbp->b_un.b_daddr, cp, size);
78657045Smargo 		bp->b_bcount += size;
78757045Smargo 		bp->b_bufsize += size;
78857045Smargo 
78957045Smargo 		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
79057045Smargo 		tbp->b_flags |= B_ASYNC;
79157045Smargo 		s = splbio();
79257045Smargo 		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
79357045Smargo 		++tbp->b_vp->v_numoutput;
79457045Smargo 		splx(s);
79557045Smargo 		b_save->bs_children[i] = tbp;
79657045Smargo 
79757045Smargo 		cp += tbp->b_bufsize;
79857045Smargo 	}
79957045Smargo 
80057045Smargo 	if (i == 0) {
80157045Smargo 		/* None to cluster */
80257045Smargo 		bp->b_saveaddr = b_save->bs_saveaddr;
80357045Smargo 		bp->b_flags &= ~B_CALL;
80457045Smargo 		bp->b_iodone = NULL;
80557045Smargo 		free(b_save, M_SEGMENT);
80657045Smargo 	}
80757045Smargo 	bawrite(bp);
80857045Smargo 	if (i < len) {
80957045Smargo 		len -= i + 1;
81057045Smargo 		start_lbn += 1;
81157045Smargo 		goto redo;
81257045Smargo 	}
81357045Smargo }
81457045Smargo 
81557045Smargo /*
81646151Smckusick  * Release a buffer.
81746151Smckusick  * Even if the buffer is dirty, no I/O is started.
8188Sbill  */
8198Sbill brelse(bp)
8207015Smckusick 	register struct buf *bp;
8218Sbill {
82256607Smckusick 	register struct queue_entry *flist;
82346151Smckusick 	int s;
8248Sbill 
82540341Smckusick 	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
8267015Smckusick 	/*
82739668Smckusick 	 * If a process is waiting for the buffer, or
82839668Smckusick 	 * is waiting for a free buffer, awaken it.
8297015Smckusick 	 */
83046151Smckusick 	if (bp->b_flags & B_WANTED)
8318Sbill 		wakeup((caddr_t)bp);
83256395Smckusick 	if (needbuffer) {
83356395Smckusick 		needbuffer = 0;
83456395Smckusick 		wakeup((caddr_t)&needbuffer);
8358Sbill 	}
83639668Smckusick 	/*
83739668Smckusick 	 * Retry I/O for locked buffers rather than invalidating them.
83839668Smckusick 	 */
83952413Storek 	s = splbio();
84039668Smckusick 	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
84139668Smckusick 		bp->b_flags &= ~B_ERROR;
84239668Smckusick 	/*
84339668Smckusick 	 * Disassociate buffers that are no longer valid.
84439668Smckusick 	 */
84546151Smckusick 	if (bp->b_flags & (B_NOCACHE | B_ERROR))
84637736Smckusick 		bp->b_flags |= B_INVAL;
84746151Smckusick 	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
84839668Smckusick 		if (bp->b_vp)
84939668Smckusick 			brelvp(bp);
85039668Smckusick 		bp->b_flags &= ~B_DELWRI;
85137736Smckusick 	}
8527015Smckusick 	/*
8537015Smckusick 	 * Stick the buffer back on a free list.
8547015Smckusick 	 */
8558670S 	if (bp->b_bufsize <= 0) {
8568670S 		/* block has no buffer ... put at front of unused buffer list */
85756395Smckusick 		flist = &bufqueues[BQ_EMPTY];
8588670S 		binsheadfree(bp, flist);
85946151Smckusick 	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
8602325Swnj 		/* block has no info ... put at front of most free list */
86156395Smckusick 		flist = &bufqueues[BQ_AGE];
8627015Smckusick 		binsheadfree(bp, flist);
8638Sbill 	} else {
8642325Swnj 		if (bp->b_flags & B_LOCKED)
86556395Smckusick 			flist = &bufqueues[BQ_LOCKED];
8662325Swnj 		else if (bp->b_flags & B_AGE)
86756395Smckusick 			flist = &bufqueues[BQ_AGE];
8682325Swnj 		else
86956395Smckusick 			flist = &bufqueues[BQ_LRU];
8707015Smckusick 		binstailfree(bp, flist);
8718Sbill 	}
87246151Smckusick 	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
8738Sbill 	splx(s);
8748Sbill }
8758Sbill 
8768Sbill /*
87746151Smckusick  * Check to see if a block is currently memory resident.
8788Sbill  */
879*57797Smckusick struct buf *
88037736Smckusick incore(vp, blkno)
88137736Smckusick 	struct vnode *vp;
8827015Smckusick 	daddr_t blkno;
8838Sbill {
8848Sbill 	register struct buf *bp;
8858Sbill 
88656607Smckusick 	for (bp = BUFHASH(vp, blkno)->le_next; bp; bp = bp->b_hash.qe_next)
88739668Smckusick 		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
8887015Smckusick 		    (bp->b_flags & B_INVAL) == 0)
889*57797Smckusick 			return (bp);
890*57797Smckusick 	return (NULL);
8918Sbill }
8928Sbill 
89339668Smckusick /*
89446151Smckusick  * Check to see if a block is currently memory resident.
89546151Smckusick  * If it is resident, return it. If it is not resident,
89646151Smckusick  * allocate a new buffer and assign it to the block.
89739668Smckusick  */
8988Sbill struct buf *
899*57797Smckusick getblk(vp, blkno, size, slpflag, slptimeo)
90037736Smckusick 	register struct vnode *vp;
9016563Smckusic 	daddr_t blkno;
902*57797Smckusick 	int size, slpflag, slptimeo;
9038Sbill {
90456607Smckusick 	register struct buf *bp;
90556607Smckusick 	struct list_entry *dp;
906*57797Smckusick 	int s, error;
9078Sbill 
90825255Smckusick 	if (size > MAXBSIZE)
90925255Smckusick 		panic("getblk: size too big");
9107015Smckusick 	/*
91146151Smckusick 	 * Search the cache for the block. If the buffer is found,
91246151Smckusick 	 * but it is currently busy, then we must wait for it to
91346151Smckusick 	 * become available.
9147015Smckusick 	 */
91537736Smckusick 	dp = BUFHASH(vp, blkno);
9167015Smckusick loop:
91756607Smckusick 	for (bp = dp->le_next; bp; bp = bp->b_hash.qe_next) {
918*57797Smckusick 		if (bp->b_lblkno != blkno || bp->b_vp != vp)
9198Sbill 			continue;
92026271Skarels 		s = splbio();
92146151Smckusick 		if (bp->b_flags & B_BUSY) {
9228Sbill 			bp->b_flags |= B_WANTED;
923*57797Smckusick 			error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1),
924*57797Smckusick 				"getblk", slptimeo);
9255424Swnj 			splx(s);
926*57797Smckusick 			if (error)
927*57797Smckusick 				return (NULL);
9288Sbill 			goto loop;
9298Sbill 		}
930*57797Smckusick 		/*
931*57797Smckusick 		 * The test for B_INVAL is moved down here, since there
932*57797Smckusick 		 * are cases where B_INVAL is set before VOP_BWRITE() is
933*57797Smckusick 		 * called and for NFS, the process cannot be allowed to
934*57797Smckusick 		 * allocate a new buffer for the same block until the write
935*57797Smckusick 		 * back to the server has been completed. (ie. B_BUSY clears)
936*57797Smckusick 		 */
937*57797Smckusick 		if (bp->b_flags & B_INVAL) {
938*57797Smckusick 			splx(s);
939*57797Smckusick 			continue;
940*57797Smckusick 		}
94139882Smckusick 		bremfree(bp);
94239882Smckusick 		bp->b_flags |= B_BUSY;
9435424Swnj 		splx(s);
94432608Smckusick 		if (bp->b_bcount != size) {
94539668Smckusick 			printf("getblk: stray size");
94639668Smckusick 			bp->b_flags |= B_INVAL;
947*57797Smckusick 			VOP_BWRITE(bp);
94839668Smckusick 			goto loop;
94932608Smckusick 		}
9508Sbill 		bp->b_flags |= B_CACHE;
95126271Skarels 		return (bp);
9528Sbill 	}
953*57797Smckusick 	/*
954*57797Smckusick 	 * The loop back to the top when getnewbuf() fails is because
955*57797Smckusick 	 * stateless filesystems like NFS have no node locks. Thus,
956*57797Smckusick 	 * there is a slight chance that more than one process will
957*57797Smckusick 	 * try and getnewbuf() for the same block concurrently when
958*57797Smckusick 	 * the first sleeps in getnewbuf(). So after a sleep, go back
959*57797Smckusick 	 * up to the top to check the hash lists again.
960*57797Smckusick 	 */
961*57797Smckusick 	if ((bp = getnewbuf(slpflag, slptimeo)) == 0)
962*57797Smckusick 		goto loop;
9637015Smckusick 	bremhash(bp);
96439668Smckusick 	bgetvp(vp, bp);
96545116Smckusick 	bp->b_bcount = 0;
96639668Smckusick 	bp->b_lblkno = blkno;
9676563Smckusic 	bp->b_blkno = blkno;
9688670S 	bp->b_error = 0;
96937736Smckusick 	bp->b_resid = 0;
97037736Smckusick 	binshash(bp, dp);
97145116Smckusick 	allocbuf(bp, size);
97226271Skarels 	return (bp);
9738Sbill }
9748Sbill 
9758Sbill /*
97646151Smckusick  * Allocate a buffer.
97746151Smckusick  * The caller will assign it to a block.
9788Sbill  */
9798Sbill struct buf *
9806563Smckusic geteblk(size)
9816563Smckusic 	int size;
9828Sbill {
98356395Smckusick 	register struct buf *bp;
9848Sbill 
98525255Smckusick 	if (size > MAXBSIZE)
98625255Smckusick 		panic("geteblk: size too big");
987*57797Smckusick 	while ((bp = getnewbuf(0, 0)) == NULL)
988*57797Smckusick 		/* void */;
9898670S 	bp->b_flags |= B_INVAL;
9907015Smckusick 	bremhash(bp);
99156395Smckusick 	binshash(bp, &invalhash);
99245116Smckusick 	bp->b_bcount = 0;
99337736Smckusick 	bp->b_error = 0;
99437736Smckusick 	bp->b_resid = 0;
99545116Smckusick 	allocbuf(bp, size);
99626271Skarels 	return (bp);
9978Sbill }
9988Sbill 
9998Sbill /*
100045116Smckusick  * Expand or contract the actual memory allocated to a buffer.
100146151Smckusick  * If no memory is available, release buffer and take error exit.
10026563Smckusic  */
100345116Smckusick allocbuf(tp, size)
100445116Smckusick 	register struct buf *tp;
10056563Smckusic 	int size;
10066563Smckusic {
100745116Smckusick 	register struct buf *bp, *ep;
100845116Smckusick 	int sizealloc, take, s;
10096563Smckusic 
101045116Smckusick 	sizealloc = roundup(size, CLBYTES);
101145116Smckusick 	/*
101245116Smckusick 	 * Buffer size does not change
101345116Smckusick 	 */
101445116Smckusick 	if (sizealloc == tp->b_bufsize)
101545116Smckusick 		goto out;
101645116Smckusick 	/*
101745116Smckusick 	 * Buffer size is shrinking.
101845116Smckusick 	 * Place excess space in a buffer header taken from the
101945116Smckusick 	 * BQ_EMPTY buffer list and placed on the "most free" list.
102045116Smckusick 	 * If no extra buffer headers are available, leave the
102145116Smckusick 	 * extra space in the present buffer.
102245116Smckusick 	 */
102345116Smckusick 	if (sizealloc < tp->b_bufsize) {
102456607Smckusick 		if ((ep = bufqueues[BQ_EMPTY].qe_next) == NULL)
102545116Smckusick 			goto out;
102645116Smckusick 		s = splbio();
102745116Smckusick 		bremfree(ep);
102845116Smckusick 		ep->b_flags |= B_BUSY;
102945116Smckusick 		splx(s);
103045116Smckusick 		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
103145116Smckusick 		    (int)tp->b_bufsize - sizealloc);
103245116Smckusick 		ep->b_bufsize = tp->b_bufsize - sizealloc;
103345116Smckusick 		tp->b_bufsize = sizealloc;
103445116Smckusick 		ep->b_flags |= B_INVAL;
103545116Smckusick 		ep->b_bcount = 0;
103645116Smckusick 		brelse(ep);
103745116Smckusick 		goto out;
103845116Smckusick 	}
103945116Smckusick 	/*
104045116Smckusick 	 * More buffer space is needed. Get it out of buffers on
104145116Smckusick 	 * the "most free" list, placing the empty headers on the
104245116Smckusick 	 * BQ_EMPTY buffer header list.
104345116Smckusick 	 */
104445116Smckusick 	while (tp->b_bufsize < sizealloc) {
104545116Smckusick 		take = sizealloc - tp->b_bufsize;
1046*57797Smckusick 		while ((bp = getnewbuf(0, 0)) == NULL)
1047*57797Smckusick 			/* void */;
104845116Smckusick 		if (take >= bp->b_bufsize)
104945116Smckusick 			take = bp->b_bufsize;
105045116Smckusick 		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
105145116Smckusick 		    &tp->b_un.b_addr[tp->b_bufsize], take);
105245116Smckusick 		tp->b_bufsize += take;
105345116Smckusick 		bp->b_bufsize = bp->b_bufsize - take;
105445116Smckusick 		if (bp->b_bcount > bp->b_bufsize)
105545116Smckusick 			bp->b_bcount = bp->b_bufsize;
105645116Smckusick 		if (bp->b_bufsize <= 0) {
105745116Smckusick 			bremhash(bp);
105856395Smckusick 			binshash(bp, &invalhash);
105946151Smckusick 			bp->b_dev = NODEV;
106045116Smckusick 			bp->b_error = 0;
106145116Smckusick 			bp->b_flags |= B_INVAL;
106245116Smckusick 		}
106345116Smckusick 		brelse(bp);
106445116Smckusick 	}
106545116Smckusick out:
106645116Smckusick 	tp->b_bcount = size;
106745116Smckusick 	return (1);
10688670S }
10698670S 
10708670S /*
10718670S  * Find a buffer which is available for use.
10728670S  * Select something from a free list.
10738670S  * Preference is to AGE list, then LRU list.
10748670S  */
10758670S struct buf *
1076*57797Smckusick getnewbuf(slpflag, slptimeo)
1077*57797Smckusick 	int slpflag, slptimeo;
10788670S {
107956395Smckusick 	register struct buf *bp;
108056607Smckusick 	register struct queue_entry *dp;
108138776Smckusick 	register struct ucred *cred;
10828670S 	int s;
10838670S 
10848670S loop:
108526271Skarels 	s = splbio();
108656395Smckusick 	for (dp = &bufqueues[BQ_AGE]; dp > bufqueues; dp--)
108756607Smckusick 		if (dp->qe_next)
10888670S 			break;
108956395Smckusick 	if (dp == bufqueues) {		/* no free blocks */
109056395Smckusick 		needbuffer = 1;
1091*57797Smckusick 		(void) tsleep((caddr_t)&needbuffer, slpflag | (PRIBIO + 1),
1092*57797Smckusick 			"getnewbuf", slptimeo);
109312170Ssam 		splx(s);
1094*57797Smckusick 		return (NULL);
10958670S 	}
109656607Smckusick 	bp = dp->qe_next;
109739882Smckusick 	bremfree(bp);
109839882Smckusick 	bp->b_flags |= B_BUSY;
10998670S 	splx(s);
11008670S 	if (bp->b_flags & B_DELWRI) {
110138614Smckusick 		(void) bawrite(bp);
11028670S 		goto loop;
11038670S 	}
110440341Smckusick 	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
110539668Smckusick 	if (bp->b_vp)
110639668Smckusick 		brelvp(bp);
110738776Smckusick 	if (bp->b_rcred != NOCRED) {
110838776Smckusick 		cred = bp->b_rcred;
110938776Smckusick 		bp->b_rcred = NOCRED;
111038776Smckusick 		crfree(cred);
111138776Smckusick 	}
111238776Smckusick 	if (bp->b_wcred != NOCRED) {
111338776Smckusick 		cred = bp->b_wcred;
111438776Smckusick 		bp->b_wcred = NOCRED;
111538776Smckusick 		crfree(cred);
111638776Smckusick 	}
11178670S 	bp->b_flags = B_BUSY;
111846989Smckusick 	bp->b_dirtyoff = bp->b_dirtyend = 0;
111952189Smckusick 	bp->b_validoff = bp->b_validend = 0;
11208670S 	return (bp);
11218670S }
11228670S 
11238670S /*
112446151Smckusick  * Wait for I/O to complete.
112546151Smckusick  *
112646151Smckusick  * Extract and return any errors associated with the I/O.
112746151Smckusick  * If the error flag is set, but no specific error is
112846151Smckusick  * given, return EIO.
11298Sbill  */
11307015Smckusick biowait(bp)
11316563Smckusic 	register struct buf *bp;
11328Sbill {
11335431Sroot 	int s;
11348Sbill 
113526271Skarels 	s = splbio();
113638776Smckusick 	while ((bp->b_flags & B_DONE) == 0)
11378Sbill 		sleep((caddr_t)bp, PRIBIO);
11385431Sroot 	splx(s);
113937736Smckusick 	if ((bp->b_flags & B_ERROR) == 0)
114037736Smckusick 		return (0);
114137736Smckusick 	if (bp->b_error)
114237736Smckusick 		return (bp->b_error);
114337736Smckusick 	return (EIO);
11448Sbill }
11458Sbill 
11468Sbill /*
114713128Ssam  * Mark I/O complete on a buffer.
114846151Smckusick  *
114946151Smckusick  * If a callback has been requested, e.g. the pageout
115046151Smckusick  * daemon, do so. Otherwise, awaken waiting processes.
11518Sbill  */
115251455Sbostic void
11537015Smckusick biodone(bp)
11547015Smckusick 	register struct buf *bp;
11558Sbill {
11568Sbill 
1157420Sbill 	if (bp->b_flags & B_DONE)
11587015Smckusick 		panic("dup biodone");
11598Sbill 	bp->b_flags |= B_DONE;
116049232Smckusick 	if ((bp->b_flags & B_READ) == 0)
116149232Smckusick 		vwakeup(bp);
11629763Ssam 	if (bp->b_flags & B_CALL) {
11639763Ssam 		bp->b_flags &= ~B_CALL;
11649763Ssam 		(*bp->b_iodone)(bp);
11659763Ssam 		return;
11669763Ssam 	}
116746151Smckusick 	if (bp->b_flags & B_ASYNC)
11688Sbill 		brelse(bp);
11698Sbill 	else {
11708Sbill 		bp->b_flags &= ~B_WANTED;
11718Sbill 		wakeup((caddr_t)bp);
11728Sbill 	}
11738Sbill }
117456356Smckusick 
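/*
 * Return the number of buffers on the LOCKED free list, i.e. buffers
 * such as super-blocks that may not be discarded.
 */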
117557035Smargo int
117657035Smargo count_lock_queue()
117757035Smargo {
117857035Smargo 	register struct buf *bp;
117957035Smargo 	register int ret;
118057035Smargo 
118157035Smargo 	for (ret = 0, bp = (struct buf *)bufqueues[BQ_LOCKED].qe_next;
118257035Smargo 	    bp; bp = (struct buf *)bp->b_freelist.qe_next)
118357035Smargo 		++ret;
118457035Smargo 	return(ret);
118557035Smargo }
118657035Smargo 
118756356Smckusick #ifdef DIAGNOSTIC
118856356Smckusick /*
118956356Smckusick  * Print out statistics on the current allocation of the buffer pool.
119056356Smckusick  * Can be enabled to print out on every ``sync'' by setting "syncprt"
119156356Smckusick  * above.
119256356Smckusick  */
119356356Smckusick void
119456356Smckusick vfs_bufstats()
119556356Smckusick {
119656356Smckusick 	int s, i, j, count;
119756395Smckusick 	register struct buf *bp;
119856607Smckusick 	register struct queue_entry *dp;
119956356Smckusick 	int counts[MAXBSIZE/CLBYTES+1];
120056356Smckusick 	static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" };
120156356Smckusick 
120256395Smckusick 	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
120356356Smckusick 		count = 0;
120456356Smckusick 		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
120556356Smckusick 			counts[j] = 0;
120656356Smckusick 		s = splbio();
120756607Smckusick 		for (bp = dp->qe_next; bp; bp = bp->b_freelist.qe_next) {
120856356Smckusick 			counts[bp->b_bufsize/CLBYTES]++;
120956356Smckusick 			count++;
121056356Smckusick 		}
121156356Smckusick 		splx(s);
121256356Smckusick 		printf("%s: total-%d", bname[i], count);
121356356Smckusick 		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
121456356Smckusick 			if (counts[j] != 0)
121556356Smckusick 				printf(", %d-%d", j * CLBYTES, counts[j]);
121656356Smckusick 		printf("\n");
121756356Smckusick 	}
121856356Smckusick }
121956356Smckusick #endif /* DIAGNOSTIC */