/* xref: /csrg-svn/sys/kern/vfs_cluster.c (revision 59872) */
/*-
 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 *
 *	@(#)vfs_cluster.c	7.59 (Berkeley) 05/10/93
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/resourcevar.h>
#include <sys/malloc.h>
#include <libkern/libkern.h>

/*
 * Definitions for the buffer hash lists.
 */
#define	BUFHASH(dvp, lbn)	\
	(&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
struct	list_entry *bufhashtbl, invalhash;
u_long	bufhash;

/*
 * Insq/Remq for the buffer hash lists.
 */
#define	binshash(bp, dp)	list_enter_head(dp, bp, struct buf *, b_hash)
#define	bremhash(bp)		list_remove(bp, struct buf *, b_hash)

/*
 * Definitions for the buffer free lists.
 */
#define	BQUEUES		4		/* number of free buffer queues */

#define	BQ_LOCKED	0		/* super-blocks &c */
#define	BQ_LRU		1		/* lru, useful buffers */
#define	BQ_AGE		2		/* rubbish */
#define	BQ_EMPTY	3		/* buffer headers with no memory */

struct queue_entry bufqueues[BQUEUES];
int needbuffer;

/*
 * Insq/Remq for the buffer free lists.
 */
#define	binsheadfree(bp, dp) \
	queue_enter_head(dp, bp, struct buf *, b_freelist)
#define	binstailfree(bp, dp) \
	queue_enter_tail(dp, bp, struct buf *, b_freelist)

/*
 * Local declarations
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
	    daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
	    daddr_t, daddr_t, long, int, long));
void	    cluster_wbuild __P((struct vnode *, struct buf *, long size,
	    daddr_t start_lbn, int len, daddr_t lbn));

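/*
 * Remove a buffer from the free list it is on.
 */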
void
bremfree(bp)
	struct buf *bp;
{
	struct queue_entry *dp;

	/*
	 * We only calculate the head of the freelist when removing
	 * the last element of the list as that is the only time that
	 * it is needed (e.g. to reset the tail pointer).
	 */
	if (bp->b_freelist.qe_next == NULL) {
		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
			if (dp->qe_prev == &bp->b_freelist.qe_next)
				break;
		if (dp == &bufqueues[BQUEUES])
			panic("bremfree: lost tail");
	}
	queue_remove(dp, bp, struct buf *, b_freelist);
}

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
	register struct buf *bp;
	struct queue_entry *dp;
	register int i;
	int base, residual;

	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
		queue_init(dp);
	bufhashtbl = (struct list_entry *)hashinit(nbuf, M_CACHE, &bufhash);
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero((char *)bp, sizeof *bp);
		bp->b_dev = NODEV;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_un.b_addr = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		bp->b_flags = B_INVAL;
		dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
		binsheadfree(bp, dp);
		binshash(bp, &invalhash);
	}
}

/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size, 0, 0);
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp, size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp, size), blkno);
	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}

/*
 * Operates like bread, but also starts I/O on the N specified
 * read-ahead blocks.
 */
breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno[]; int rabsize[];
	int num;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp, *rabp;
	register int i;

	bp = NULL;
	/*
	 * If the block is not memory resident,
	 * allocate a buffer and start I/O.
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size, 0, 0);
		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breadn");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp, size), blkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
		} else {
			trace(TR_BREADHIT, pack(vp, size), blkno);
		}
	}

	/*
	 * If there's read-ahead block(s), start I/O
	 * on them also (as above).
	 */
	for (i = 0; i < num; i++) {
		if (incore(vp, rablkno[i]))
			continue;
		rabp = getblk(vp, rablkno[i], rabsize[i], 0, 0);
		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
		} else {
			rabp->b_flags |= B_ASYNC | B_READ;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was memory resident, let bread get it.
	 * If block was not memory resident, the read was
	 * started above, so just wait for the read to complete.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}

/*
 * We could optimize this by keeping track of where the last read-ahead
 * was, but it would involve adding fields to the vnode.  For now, let's
 * just get it working.
 *
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *
 * There are 4 or 5 cases depending on how you count:
 *	Desired block is in the cache:
 *	    1 Not sequential access (0 I/Os).
 *	    2 Access is sequential, do read-ahead (1 ASYNC).
 *	Desired block is not in cache:
 *	    3 Not sequential access (1 SYNC).
 *	    4 Sequential access, next block is contiguous (1 SYNC).
 *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 * 	bp is the block requested.
 *	rbp is the read-ahead block.
 *	If either is NULL, then you don't have to do the I/O.
 */
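/*
 * A rough worked example (assuming, say, 8K blocks and a 64K MAXPHYS):
 * the first non-sequential access forces the window v_ralen to 1; each
 * successful sequential read-ahead may double it, so the window grows
 * 1, 2, 4, 8 blocks up to MAXPHYS / size, and is halved again whenever
 * an access is found not to be sequential.
 */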
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, ioblkno;
	long flags;
	int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
	if (size == 0)
		panic("cluster_read: size = 0");
#endif

	error = 0;
	flags = B_READ;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	if (bp->b_flags & (B_CACHE | B_DONE | B_DELWRI)) {
		/*
		 * Desired block is in cache; do any readahead ASYNC.
		 * Case 1, 2.
		 */
		trace(TR_BREADHIT, pack(vp, size), lblkno);
		flags |= B_ASYNC;
		ioblkno = lblkno +
		    (lblkno < vp->v_ralen ? vp->v_ralen >> 1 : vp->v_ralen);
		alreadyincore = (int)incore(vp, ioblkno);
		bp = NULL;
	} else {
		/* Block wasn't in cache, case 3, 4, 5. */
		trace(TR_BREADMISS, pack(vp, size), lblkno);
		ioblkno = lblkno;
		bp->b_flags |= flags;
		alreadyincore = 0;
		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
	}
	/*
	 * XXX
	 * Replace 1 with a window size based on some permutation of
	 * maxcontig and rot_delay.  This will let you figure out how
	 * many blocks you should read-ahead (case 2, 4, 5).
	 *
	 * If the access isn't sequential, cut the window size in half.
	 */
	rbp = NULL;
	if (lblkno != vp->v_lastr + 1 && lblkno != 0)
		vp->v_ralen = max(vp->v_ralen >> 1, 1);
	else if ((ioblkno + 1) * size < filesize && !alreadyincore &&
	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra))) {
		/*
		 * Reading sequentially, and the next block is not in the
		 * cache.  We are going to try reading ahead. If this is
		 * the first read of a file, then limit read-ahead to a
		 * single block, else read as much as we're allowed.
		 */
		if (num_ra > vp->v_ralen) {
			num_ra = vp->v_ralen;
			vp->v_ralen = min(MAXPHYS / size, vp->v_ralen << 1);
		} else
			vp->v_ralen = num_ra + 1;


		if (num_ra)				/* case 2, 4 */
			rbp = cluster_rbuild(vp, filesize,
			    bp, ioblkno, blkno, size, num_ra, flags);
		else if (lblkno != 0 && ioblkno == lblkno) {
			/* Case 5: check how many blocks to read ahead */
			++ioblkno;
			if ((ioblkno + 1) * size > filesize ||
			    (error = VOP_BMAP(vp,
			    ioblkno, NULL, &blkno, &num_ra)))
				goto skip_readahead;
			flags |= B_ASYNC;
			if (num_ra)
				rbp = cluster_rbuild(vp, filesize,
				    NULL, ioblkno, blkno, size, num_ra, flags);
			else {
				rbp = getblk(vp, ioblkno, size, 0, 0);
				rbp->b_flags |= flags;
				rbp->b_blkno = blkno;
			}
		} else if (lblkno != 0) {
			/* case 2; read ahead single block */
			rbp = getblk(vp, ioblkno, size, 0, 0);
			rbp->b_flags |= flags;
			rbp->b_blkno = blkno;
		} else if (bp)				/* case 1, 3, block 0 */
			bp->b_blkno = blkno;
		/* Case 1 on block 0; not really doing sequential I/O */

		if (rbp == bp)		/* case 4 */
			rbp = NULL;
		else if (rbp) {			/* case 2, 5 */
			trace(TR_BREADMISSRA,
			    pack(vp, (num_ra + 1) * size), ioblkno);
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}

	/* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
	if (bp)
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else
			error = VOP_STRATEGY(bp);

	if (rbp)
		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else
			(void) VOP_STRATEGY(rbp);

	if (bp)
		return(biowait(bp));
	return(error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;
	u_quad_t filesize;
	struct buf *bp;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn;
	int i, inc;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
	if (size * (lbn + run + 1) > filesize)
		--run;
	if (run == 0) {
		if (!bp) {
			bp = getblk(vp, lbn, size, 0, 0);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return(bp);
	}

	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
	if (bp->b_flags & (B_DONE | B_DELWRI))
		return (bp);

	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bufsize = b_save->bs_bcount = size;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	inc = size / DEV_BSIZE;
	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
		if (incore(vp, lbn + i)) {
			if (i == 1) {
				bp->b_saveaddr = b_save->bs_saveaddr;
				bp->b_flags &= ~B_CALL;
				bp->b_iodone = NULL;
				allocbuf(bp, size);
				free(b_save, M_SEGMENT);
			} else
				allocbuf(bp, size * i);
			break;
		}
		tbp = getblk(vp, lbn + i, 0, 0, 0);
		tbp->b_bcount = tbp->b_bufsize = size;
		tbp->b_blkno = bn;
		tbp->b_flags |= flags | B_READ | B_ASYNC;
		++b_save->bs_nchildren;
		b_save->bs_children[i - 1] = tbp;
	}
	if (!(bp->b_flags & B_ASYNC))
		vp->v_ralen = max(vp->v_ralen - 1, 1);
	return(bp);
}

/*
 * Either get a new buffer or grow the existing one.
 */
struct buf *
cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
	struct vnode *vp;
	struct buf *bp;
	long flags;
	daddr_t blkno;
	daddr_t lblkno;
	long size;
	int run;
{
	if (!bp) {
		bp = getblk(vp, lblkno, size, 0, 0);
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			bp->b_blkno = blkno;
			return(bp);
		}
	}
	allocbuf(bp, run * size);
	bp->b_blkno = blkno;
	bp->b_iodone = cluster_callback;
	bp->b_flags |= flags | B_CALL;
	return(bp);
}

/*
 * Cleanup after a clustered read or write.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct cluster_save *b_save;
	struct buf **tbp;
	long bsize;
	caddr_t cp;
	b_save = (struct cluster_save *)(bp->b_saveaddr);
	bp->b_saveaddr = b_save->bs_saveaddr;

	cp = bp->b_un.b_addr + b_save->bs_bufsize;
	for (tbp = b_save->bs_children; b_save->bs_nchildren--; ++tbp) {
		pagemove(cp, (*tbp)->b_un.b_addr, (*tbp)->b_bufsize);
		cp += (*tbp)->b_bufsize;
		bp->b_bufsize -= (*tbp)->b_bufsize;
		biodone(*tbp);
	}
#ifdef DIAGNOSTIC
	if (bp->b_bufsize != b_save->bs_bufsize)
		panic("cluster_callback: more space to reclaim");
#endif
	bp->b_bcount = bp->b_bufsize;
	bp->b_iodone = NULL;
	free(b_save, M_SEGMENT);
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else
		wakeup((caddr_t)bp);
}

/*
 * Synchronous write.
 * Release buffer on completion.
 */
bwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */
	register int flag;
	int s, error = 0;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if (flag & B_ASYNC) {
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
	}
	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	s = splbio();
	bp->b_vp->v_numoutput++;
	bp->b_flags |= B_WRITEINPROG;
	splx(s);
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await I/O completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting I/O completion status.
	 */
	if ((flag & B_ASYNC) == 0) {
		error = biowait(bp);
		if ((flag&B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
		if (bp->b_flags & B_EINTR) {
			bp->b_flags &= ~B_EINTR;
			error = EINTR;
		}
		brelse(bp);
	} else if (flag & B_DELWRI) {
		s = splbio();
		bp->b_flags |= B_AGE;
		splx(s);
	}
	return (error);
}

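/*
 * The generic vnode bwrite operation: simply hand the buffer to bwrite().
 */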
int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}


/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		p->p_stats->p_ru.ru_oublock++;		/* no one paid yet */
	}
	/*
	 * If this is a tape drive, the write must be initiated.
	 */
	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
		bawrite(bp);
	} else {
		bp->b_flags |= (B_DONE | B_DELWRI);
		brelse(bp);
	}
}

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
bawrite(bp)
	register struct buf *bp;
{

	/*
	 * Setting the ASYNC flag causes bwrite to return
	 * after starting the I/O.
	 */
	bp->b_flags |= B_ASYNC;
	(void) VOP_BWRITE(bp);
}

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
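/*
 * A sketch of the per-vnode state used below: v_cstart is the logical
 * block beginning the cluster being collected, v_lastw the last logical
 * block written, v_lasta its disk address, and v_clen the number of
 * additional contiguous blocks that may be gathered before the cluster
 * must be pushed.
 */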
void
cluster_write(bp, filesize)
        struct buf *bp;
	u_quad_t filesize;
{
        struct vnode *vp;
        daddr_t lbn;
        int clen;

        vp = bp->b_vp;
        lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

        if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + bp->b_bcount / DEV_BSIZE)) {
		if (vp->v_clen != 0)
			/*
			 * Write is not sequential.
			 */
			cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
			    vp->v_lastw - vp->v_cstart + 1, lbn);
		/*
		 * Consider beginning a cluster.
		 */
		if ((lbn + 1) * bp->b_bcount == filesize)
			/* End of file, make cluster as large as possible */
			clen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
		else if (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &clen)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		} else
			clen = 0;
                vp->v_clen = clen;
                if (clen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
                        bawrite(bp);
                } else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
                        bdwrite(bp);
		}
        } else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
        } else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.
		 */
                bdwrite(bp);
        vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}


/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block.
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t	lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	caddr_t	cp;
	int i, s;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_wbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
redo:
	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
		++start_lbn;
		--len;
	}

	/* Get more memory for current buffer */
	if (len <= 1) {
		if (last_bp) {
			bawrite(last_bp);
		} else if (len) {
			bp = getblk(vp, start_lbn, size, 0, 0);
			bawrite(bp);
		}
		return;
	}

	bp = getblk(vp, start_lbn, size, 0, 0);
	if (!(bp->b_flags & B_DELWRI)) {
		++start_lbn;
		--len;
		brelse(bp);
		goto redo;
	}

	--len;
	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bcount = bp->b_bcount;
	b_save->bs_bufsize = bp->b_bufsize;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;


	bp->b_flags |= B_CALL;
	bp->b_iodone = cluster_callback;
	cp = bp->b_un.b_addr + bp->b_bufsize;
	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
		if (!incore(vp, start_lbn) || start_lbn == lbn)
			break;

		if (last_bp == NULL || start_lbn != last_bp->b_lblkno) {
			tbp = getblk(vp, start_lbn, size, 0, 0);
#ifdef DIAGNOSTIC
			if (tbp->b_bcount != tbp->b_bufsize)
				panic("cluster_wbuild: Buffer too big");
#endif
			if (!(tbp->b_flags & B_DELWRI)) {
				brelse(tbp);
				break;
			}
		} else
			tbp = last_bp;

		++b_save->bs_nchildren;

		/* Move memory from children to parent */
		if (tbp->b_blkno != (bp->b_blkno + bp->b_bufsize / DEV_BSIZE)) {
			printf("Clustered Block: %d addr %x bufsize: %d\n",
			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
			    tbp->b_blkno);
			panic("Clustered write to wrong blocks");
		}

		pagemove(tbp->b_un.b_daddr, cp, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;

		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= B_ASYNC;
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;

		cp += tbp->b_bufsize;
	}

	if (i == 0) {
		/* None to cluster */
		bp->b_saveaddr = b_save->bs_saveaddr;
		bp->b_flags &= ~B_CALL;
		bp->b_iodone = NULL;
		free(b_save, M_SEGMENT);
	}
	bawrite(bp);
	if (i < len) {
		len -= i + 1;
		start_lbn += 1;
		goto redo;
	}
}

/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct queue_entry *flist;
	int s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags & B_WANTED)
		wakeup((caddr_t)bp);
	if (needbuffer) {
		needbuffer = 0;
		wakeup((caddr_t)&needbuffer);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	s = splbio();
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;
	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE | B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bufqueues[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bufqueues[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bufqueues[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bufqueues[BQ_AGE];
		else
			flist = &bufqueues[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
	splx(s);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;

	for (bp = BUFHASH(vp, blkno)->le_next; bp; bp = bp->b_hash.qe_next)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (bp);
	return (NULL);
}

/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it. If it is not resident,
 * allocate a new buffer and assign it to the block.
 */
struct buf *
getblk(vp, blkno, size, slpflag, slptimeo)
	register struct vnode *vp;
	daddr_t blkno;
	int size, slpflag, slptimeo;
{
	register struct buf *bp;
	struct list_entry *dp;
	int s, error;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block. If the buffer is found,
	 * but it is currently locked, then we must wait for it to
	 * become available.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->le_next; bp; bp = bp->b_hash.qe_next) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp)
			continue;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1),
				"getblk", slptimeo);
			splx(s);
			if (error)
				return (NULL);
			goto loop;
		}
		/*
		 * The test for B_INVAL is moved down here, since there
		 * are cases where B_INVAL is set before VOP_BWRITE() is
		 * called and for NFS, the process cannot be allowed to
		 * allocate a new buffer for the same block until the write
		 * back to the server has been completed. (ie. B_BUSY clears)
		 */
		if (bp->b_flags & B_INVAL) {
			splx(s);
			continue;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
			printf("getblk: stray size");
			bp->b_flags |= B_INVAL;
			VOP_BWRITE(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	/*
	 * The loop back to the top when getnewbuf() fails is because
	 * stateless filesystems like NFS have no node locks. Thus,
	 * there is a slight chance that more than one process will
	 * try and getnewbuf() for the same block concurrently when
	 * the first sleeps in getnewbuf(). So after a sleep, go back
	 * up to the top to check the hash lists again.
	 */
	if ((bp = getnewbuf(slpflag, slptimeo)) == 0)
		goto loop;
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Allocate a buffer.
 * The caller will assign it to a block.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	while ((bp = getnewbuf(0, 0)) == NULL)
		/* void */;
	bp->b_flags |= B_INVAL;
	bremhash(bp);
	binshash(bp, &invalhash);
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	allocbuf(bp, size);
	return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		if ((ep = bufqueues[BQ_EMPTY].qe_next) == NULL)
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed. Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		while ((bp = getnewbuf(0, 0)) == NULL)
			/* void */;
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			bremhash(bp);
			binshash(bp, &invalhash);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf(slpflag, slptimeo)
	int slpflag, slptimeo;
{
	register struct buf *bp;
	register struct queue_entry *dp;
	register struct ucred *cred;
	int s;

loop:
	s = splbio();
	for (dp = &bufqueues[BQ_AGE]; dp > bufqueues; dp--)
		if (dp->qe_next)
			break;
	if (dp == bufqueues) {		/* no free blocks */
		needbuffer = 1;
		(void) tsleep((caddr_t)&needbuffer, slpflag | (PRIBIO + 1),
			"getnewbuf", slptimeo);
		splx(s);
		return (NULL);
	}
	bp = dp->qe_next;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 */
void
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

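/*
 * Return the number of buffers on the locked (BQ_LOCKED) free list.
 */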
int
count_lock_queue()
{
	register struct buf *bp;
	register int ret;

	for (ret = 0, bp = (struct buf *)bufqueues[BQ_LOCKED].qe_next;
	    bp; bp = (struct buf *)bp->b_freelist.qe_next)
		++ret;
	return(ret);
}

#ifdef DIAGNOSTIC
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * above.
 */
void
vfs_bufstats()
{
	int s, i, j, count;
	register struct buf *bp;
	register struct queue_entry *dp;
	int counts[MAXBSIZE/CLBYTES+1];
	static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" };

	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
		count = 0;
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			counts[j] = 0;
		s = splbio();
		for (bp = dp->qe_next; bp; bp = bp->b_freelist.qe_next) {
			counts[bp->b_bufsize/CLBYTES]++;
			count++;
		}
		splx(s);
		printf("%s: total-%d", bname[i], count);
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			if (counts[j] != 0)
				printf(", %d-%d", j * CLBYTES, counts[j]);
		printf("\n");
	}
}
#endif /* DIAGNOSTIC */