xref: /csrg-svn/sys/kern/vfs_cluster.c (revision 57045)
149589Sbostic /*-
249589Sbostic  * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
337736Smckusick  * All rights reserved.
423395Smckusick  *
549618Smckusick  * This module is believed to contain source code proprietary to AT&T.
649618Smckusick  * Use and redistribution is subject to the Berkeley Software License
749618Smckusick  * Agreement and your Software Agreement with AT&T (Western Electric).
837736Smckusick  *
9*57045Smargo  *	@(#)vfs_cluster.c	7.57 (Berkeley) 12/09/92
1023395Smckusick  */
118Sbill 
1251455Sbostic #include <sys/param.h>
1351455Sbostic #include <sys/proc.h>
1451455Sbostic #include <sys/buf.h>
1551455Sbostic #include <sys/vnode.h>
1651455Sbostic #include <sys/mount.h>
1751455Sbostic #include <sys/trace.h>
1851455Sbostic #include <sys/resourcevar.h>
1956395Smckusick #include <sys/malloc.h>
2056395Smckusick #include <libkern/libkern.h>
218Sbill 
2291Sbill /*
2356395Smckusick  * Definitions for the buffer hash lists.
2456395Smckusick  */
2556395Smckusick #define	BUFHASH(dvp, lbn)	\
2656395Smckusick 	(&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
2756607Smckusick struct	list_entry *bufhashtbl, invalhash;
2856395Smckusick u_long	bufhash;
2956395Smckusick 
3056395Smckusick /*
3156395Smckusick  * Insq/Remq for the buffer hash lists.
3256395Smckusick  */
3356607Smckusick #define	binshash(bp, dp)	list_enter_head(dp, bp, struct buf *, b_hash)
3456607Smckusick #define	bremhash(bp)		list_remove(bp, struct buf *, b_hash)
3556395Smckusick 
3656395Smckusick /*
3756395Smckusick  * Definitions for the buffer free lists.
3856395Smckusick  */
3956395Smckusick #define	BQUEUES		4		/* number of free buffer queues */
4056395Smckusick 
4156395Smckusick #define	BQ_LOCKED	0		/* super-blocks &c */
4256395Smckusick #define	BQ_LRU		1		/* lru, useful buffers */
4356395Smckusick #define	BQ_AGE		2		/* rubbish */
4456395Smckusick #define	BQ_EMPTY	3		/* buffer headers with no memory */
4556395Smckusick 
4656607Smckusick struct queue_entry bufqueues[BQUEUES];
4756395Smckusick int needbuffer;
4856395Smckusick 
4956395Smckusick /*
5056395Smckusick  * Insq/Remq for the buffer free lists.
5156395Smckusick  */
5256607Smckusick #define	binsheadfree(bp, dp) \
5356607Smckusick 	queue_enter_head(dp, bp, struct buf *, b_freelist)
5456607Smckusick #define	binstailfree(bp, dp) \
5556607Smckusick 	queue_enter_tail(dp, bp, struct buf *, b_freelist)
5656607Smckusick 
57*57045Smargo /*
58*57045Smargo  * Local declarations
59*57045Smargo  */
60*57045Smargo struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
61*57045Smargo 	    daddr_t, long, int));
62*57045Smargo struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
63*57045Smargo 	    daddr_t, daddr_t, long, int, long));
64*57045Smargo void	    cluster_wbuild __P((struct vnode *, struct buf *, long size,
65*57045Smargo 	    daddr_t start_lbn, int len, daddr_t lbn));
66*57045Smargo 
6756395Smckusick void
6856395Smckusick bremfree(bp)
6956395Smckusick 	struct buf *bp;
7056395Smckusick {
7156607Smckusick 	struct queue_entry *dp;
7256395Smckusick 
7356607Smckusick 	/*
7456607Smckusick 	 * We only calculate the head of the freelist when removing
7556607Smckusick 	 * the last element of the list as that is the only time that
7656607Smckusick 	 * it is needed (e.g. to reset the tail pointer).
7756607Smckusick 	 */
7856607Smckusick 	if (bp->b_freelist.qe_next == NULL) {
7956395Smckusick 		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
8056607Smckusick 			if (dp->qe_prev == &bp->b_freelist.qe_next)
8156395Smckusick 				break;
8256395Smckusick 		if (dp == &bufqueues[BQUEUES])
8356395Smckusick 			panic("bremfree: lost tail");
8456395Smckusick 	}
8556607Smckusick 	queue_remove(dp, bp, struct buf *, b_freelist);
8656395Smckusick }
8756395Smckusick 
8856395Smckusick /*
8949280Skarels  * Initialize buffers and hash links for buffers.
9049280Skarels  */
9151455Sbostic void
9249280Skarels bufinit()
9349280Skarels {
9456395Smckusick 	register struct buf *bp;
9556607Smckusick 	struct queue_entry *dp;
9649280Skarels 	register int i;
9749280Skarels 	int base, residual;
9849280Skarels 
9956395Smckusick 	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
10056607Smckusick 		queue_init(dp);
10156607Smckusick 	bufhashtbl = (struct list_entry *)hashinit(nbuf, M_CACHE, &bufhash);
10249280Skarels 	base = bufpages / nbuf;
10349280Skarels 	residual = bufpages % nbuf;
10449280Skarels 	for (i = 0; i < nbuf; i++) {
10549280Skarels 		bp = &buf[i];
10656395Smckusick 		bzero((char *)bp, sizeof *bp);
10749280Skarels 		bp->b_dev = NODEV;
10849280Skarels 		bp->b_rcred = NOCRED;
10949280Skarels 		bp->b_wcred = NOCRED;
11049280Skarels 		bp->b_un.b_addr = buffers + i * MAXBSIZE;
11149280Skarels 		if (i < residual)
11249280Skarels 			bp->b_bufsize = (base + 1) * CLBYTES;
11349280Skarels 		else
11449280Skarels 			bp->b_bufsize = base * CLBYTES;
11552413Storek 		bp->b_flags = B_INVAL;
11656395Smckusick 		dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
11752413Storek 		binsheadfree(bp, dp);
11856395Smckusick 		binshash(bp, &invalhash);
11949280Skarels 	}
12049280Skarels }
12149280Skarels 
12249280Skarels /*
12346151Smckusick  * Find the block in the buffer pool.
12446151Smckusick  * If the buffer is not present, allocate a new buffer and load
12546151Smckusick  * its contents according to the filesystem fill routine.
1268Sbill  */
12738776Smckusick bread(vp, blkno, size, cred, bpp)
12837736Smckusick 	struct vnode *vp;
1296563Smckusic 	daddr_t blkno;
1306563Smckusic 	int size;
13138776Smckusick 	struct ucred *cred;
13237736Smckusick 	struct buf **bpp;
1338Sbill {
13447545Skarels 	struct proc *p = curproc;		/* XXX */
1358Sbill 	register struct buf *bp;
1368Sbill 
1378670S 	if (size == 0)
1388670S 		panic("bread: size 0");
13937736Smckusick 	*bpp = bp = getblk(vp, blkno, size);
14046151Smckusick 	if (bp->b_flags & (B_DONE | B_DELWRI)) {
14140341Smckusick 		trace(TR_BREADHIT, pack(vp, size), blkno);
14237736Smckusick 		return (0);
1438Sbill 	}
1448Sbill 	bp->b_flags |= B_READ;
1458670S 	if (bp->b_bcount > bp->b_bufsize)
1468670S 		panic("bread");
14738776Smckusick 	if (bp->b_rcred == NOCRED && cred != NOCRED) {
14838776Smckusick 		crhold(cred);
14938776Smckusick 		bp->b_rcred = cred;
15038776Smckusick 	}
15137736Smckusick 	VOP_STRATEGY(bp);
15240341Smckusick 	trace(TR_BREADMISS, pack(vp, size), blkno);
15347545Skarels 	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
15437736Smckusick 	return (biowait(bp));
1558Sbill }
1568Sbill 
1578Sbill /*
15852189Smckusick  * Operates like bread, but also starts I/O on the N specified
15952189Smckusick  * read-ahead blocks.
1608Sbill  */
16152189Smckusick breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
16237736Smckusick 	struct vnode *vp;
1637114Smckusick 	daddr_t blkno; int size;
16452189Smckusick 	daddr_t rablkno[]; int rabsize[];
16552189Smckusick 	int num;
16638776Smckusick 	struct ucred *cred;
16737736Smckusick 	struct buf **bpp;
1688Sbill {
16947545Skarels 	struct proc *p = curproc;		/* XXX */
1708Sbill 	register struct buf *bp, *rabp;
17152189Smckusick 	register int i;
1728Sbill 
1738Sbill 	bp = NULL;
1747015Smckusick 	/*
17546151Smckusick 	 * If the block is not memory resident,
17646151Smckusick 	 * allocate a buffer and start I/O.
1777015Smckusick 	 */
17837736Smckusick 	if (!incore(vp, blkno)) {
17937736Smckusick 		*bpp = bp = getblk(vp, blkno, size);
18046151Smckusick 		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
1818Sbill 			bp->b_flags |= B_READ;
1828670S 			if (bp->b_bcount > bp->b_bufsize)
18352189Smckusick 				panic("breadn");
18438776Smckusick 			if (bp->b_rcred == NOCRED && cred != NOCRED) {
18538776Smckusick 				crhold(cred);
18638776Smckusick 				bp->b_rcred = cred;
18738776Smckusick 			}
18837736Smckusick 			VOP_STRATEGY(bp);
18940341Smckusick 			trace(TR_BREADMISS, pack(vp, size), blkno);
19047545Skarels 			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
19154342Smckusick 		} else {
19240341Smckusick 			trace(TR_BREADHIT, pack(vp, size), blkno);
19354342Smckusick 		}
1948Sbill 	}
1957015Smckusick 
1967015Smckusick 	/*
19752189Smckusick 	 * If there's read-ahead block(s), start I/O
19852189Smckusick 	 * on them also (as above).
1997015Smckusick 	 */
20052189Smckusick 	for (i = 0; i < num; i++) {
20152189Smckusick 		if (incore(vp, rablkno[i]))
20252189Smckusick 			continue;
20352189Smckusick 		rabp = getblk(vp, rablkno[i], rabsize[i]);
20446151Smckusick 		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
2058Sbill 			brelse(rabp);
20652189Smckusick 			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
2072045Swnj 		} else {
20846151Smckusick 			rabp->b_flags |= B_ASYNC | B_READ;
2098670S 			if (rabp->b_bcount > rabp->b_bufsize)
2108670S 				panic("breadrabp");
21138880Smckusick 			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
21238776Smckusick 				crhold(cred);
21338880Smckusick 				rabp->b_rcred = cred;
21438776Smckusick 			}
21537736Smckusick 			VOP_STRATEGY(rabp);
21652189Smckusick 			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
21747545Skarels 			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
2188Sbill 		}
2198Sbill 	}
2207015Smckusick 
2217015Smckusick 	/*
22246151Smckusick 	 * If block was memory resident, let bread get it.
22346151Smckusick 	 * If block was not memory resident, the read was
22446151Smckusick 	 * started above, so just wait for the read to complete.
2257015Smckusick 	 */
2267114Smckusick 	if (bp == NULL)
22738776Smckusick 		return (bread(vp, blkno, size, cred, bpp));
22837736Smckusick 	return (biowait(bp));
2298Sbill }
2308Sbill 
2318Sbill /*
232*57045Smargo  * We could optimize this by keeping track of where the last read-ahead
233*57045Smargo  * was, but it would involve adding fields to the vnode.  For now, let's
234*57045Smargo  * just get it working.
235*57045Smargo  *
236*57045Smargo  * This replaces bread.  If this is a bread at the beginning of a file and
237*57045Smargo  * lastr is 0, we assume this is the first read and we'll read up to two
238*57045Smargo  * blocks if they are sequential.  After that, we'll do regular read ahead
239*57045Smargo  * in clustered chunks.
240*57045Smargo  *
241*57045Smargo  * There are 4 or 5 cases depending on how you count:
242*57045Smargo  *	Desired block is in the cache:
243*57045Smargo  *	    1 Not sequential access (0 I/Os).
244*57045Smargo  *	    2 Access is sequential, do read-ahead (1 ASYNC).
245*57045Smargo  *	Desired block is not in cache:
246*57045Smargo  *	    3 Not sequential access (1 SYNC).
247*57045Smargo  *	    4 Sequential access, next block is contiguous (1 SYNC).
248*57045Smargo  *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
249*57045Smargo  *
250*57045Smargo  * There are potentially two buffers that require I/O.
251*57045Smargo  * 	bp is the block requested.
252*57045Smargo  *	rbp is the read-ahead block.
253*57045Smargo  *	If either is NULL, then you don't have to do the I/O.
254*57045Smargo  */
255*57045Smargo cluster_read(vp, filesize, lblkno, size, cred, bpp)
256*57045Smargo 	struct vnode *vp;
257*57045Smargo 	u_quad_t filesize;
258*57045Smargo 	daddr_t lblkno;
259*57045Smargo 	long size;
260*57045Smargo 	struct ucred *cred;
261*57045Smargo 	struct buf **bpp;
262*57045Smargo {
263*57045Smargo 	struct buf *bp, *rbp;
264*57045Smargo 	daddr_t blkno, ioblkno;
265*57045Smargo 	long flags;
266*57045Smargo 	int error, num_ra, alreadyincore;
267*57045Smargo 
268*57045Smargo #ifdef DIAGNOSTIC
269*57045Smargo 	if (size == 0)
270*57045Smargo 		panic("cluster_read: size = 0");
271*57045Smargo #endif
272*57045Smargo 
273*57045Smargo 	error = 0;
274*57045Smargo 	flags = B_READ;
275*57045Smargo 	*bpp = bp = getblk(vp, lblkno, size);
276*57045Smargo 	if (bp->b_flags & (B_CACHE | B_DONE | B_DELWRI)) {
277*57045Smargo 		/*
278*57045Smargo 		 * Desired block is in cache; do any readahead ASYNC.
279*57045Smargo 		 * Case 1, 2.
280*57045Smargo 		 */
281*57045Smargo 		trace(TR_BREADHIT, pack(vp, size), lblkno);
282*57045Smargo 		flags |= B_ASYNC;
283*57045Smargo 		ioblkno = lblkno +
284*57045Smargo 		    (lblkno < vp->v_ralen ? vp->v_ralen >> 1 : vp->v_ralen);
285*57045Smargo 		alreadyincore = incore(vp, ioblkno);
286*57045Smargo 		bp = NULL;
287*57045Smargo 	} else {
288*57045Smargo 		/* Block wasn't in cache, case 3, 4, 5. */
289*57045Smargo 		trace(TR_BREADMISS, pack(vp, size), lblkno);
290*57045Smargo 		ioblkno = lblkno;
291*57045Smargo 		bp->b_flags |= flags;
292*57045Smargo 		alreadyincore = 0;
293*57045Smargo 		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
294*57045Smargo 	}
295*57045Smargo 	/*
296*57045Smargo 	 * XXX
297*57045Smargo 	 * Replace 1 with a window size based on some permutation of
298*57045Smargo 	 * maxcontig and rot_delay.  This will let you figure out how
299*57045Smargo 	 * many blocks you should read-ahead (case 2, 4, 5).
300*57045Smargo 	 *
301*57045Smargo 	 * If the access isn't sequential, cut the window size in half.
302*57045Smargo 	 */
303*57045Smargo 	rbp = NULL;
304*57045Smargo 	if (lblkno != vp->v_lastr + 1 && lblkno != 0)
305*57045Smargo 		vp->v_ralen = max(vp->v_ralen >> 1, 1);
306*57045Smargo 	else if ((ioblkno + 1) * size < filesize && !alreadyincore &&
307*57045Smargo 	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra))) {
308*57045Smargo 		/*
309*57045Smargo 		 * Reading sequentially, and the next block is not in the
310*57045Smargo 		 * cache.  We are going to try reading ahead. If this is
311*57045Smargo 		 * the first read of a file, then limit read-ahead to a
312*57045Smargo 		 * single block, else read as much as we're allowed.
313*57045Smargo 		 */
314*57045Smargo 		if (num_ra > vp->v_ralen) {
315*57045Smargo 			num_ra = vp->v_ralen;
316*57045Smargo 			vp->v_ralen = min(MAXPHYS / size, vp->v_ralen << 1);
317*57045Smargo 		} else
318*57045Smargo 			vp->v_ralen = num_ra + 1;
319*57045Smargo 
320*57045Smargo 
321*57045Smargo 		if (num_ra)				/* case 2, 4 */
322*57045Smargo 			rbp = cluster_rbuild(vp, filesize,
323*57045Smargo 			    bp, ioblkno, blkno, size, num_ra, flags);
324*57045Smargo 		else if (lblkno != 0 && ioblkno == lblkno) {
325*57045Smargo 			/* Case 5: check how many blocks to read ahead */
326*57045Smargo 			++ioblkno;
327*57045Smargo 			if ((ioblkno + 1) * size > filesize ||
328*57045Smargo 			    (error = VOP_BMAP(vp,
329*57045Smargo 			    ioblkno, NULL, &blkno, &num_ra)))
330*57045Smargo 				goto skip_readahead;
331*57045Smargo 			flags |= B_ASYNC;
332*57045Smargo 			if (num_ra)
333*57045Smargo 				rbp = cluster_rbuild(vp, filesize,
334*57045Smargo 				    NULL, ioblkno, blkno, size, num_ra, flags);
335*57045Smargo 			else {
336*57045Smargo 				rbp = getblk(vp, ioblkno, size);
337*57045Smargo 				rbp->b_flags |= flags;
338*57045Smargo 				rbp->b_blkno = blkno;
339*57045Smargo 			}
340*57045Smargo 		} else if (lblkno != 0) {
341*57045Smargo 			/* case 2; read ahead single block */
342*57045Smargo 			rbp = getblk(vp, ioblkno, size);
343*57045Smargo 			rbp->b_flags |= flags;
344*57045Smargo 			rbp->b_blkno = blkno;
345*57045Smargo 		} else if (bp)				/* case 1, 3, block 0 */
346*57045Smargo 			bp->b_blkno = blkno;
347*57045Smargo 		/* Case 1 on block 0; not really doing sequential I/O */
348*57045Smargo 
349*57045Smargo 		if (rbp == bp)		/* case 4 */
350*57045Smargo 			rbp = NULL;
351*57045Smargo 		else if (rbp) {			/* case 2, 5 */
352*57045Smargo 			trace(TR_BREADMISSRA,
353*57045Smargo 			    pack(vp, (num_ra + 1) * size), ioblkno);
354*57045Smargo 			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
355*57045Smargo 		}
356*57045Smargo 	}
357*57045Smargo 
358*57045Smargo 	/* XXX Kirk, do we need to make sure the bp has creds? */
359*57045Smargo skip_readahead:
360*57045Smargo 	if (bp)
361*57045Smargo 		if (bp->b_flags & (B_DONE | B_DELWRI))
362*57045Smargo 			panic("cluster_read: DONE bp");
363*57045Smargo 		else
364*57045Smargo 			error = VOP_STRATEGY(bp);
365*57045Smargo 
366*57045Smargo 	if (rbp)
367*57045Smargo 		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
368*57045Smargo 			rbp->b_flags &= ~(B_ASYNC | B_READ);
369*57045Smargo 			brelse(rbp);
370*57045Smargo 		} else
371*57045Smargo 			(void) VOP_STRATEGY(rbp);
372*57045Smargo 
373*57045Smargo 	if (bp)
374*57045Smargo 		return(biowait(bp));
375*57045Smargo 	return(error);
376*57045Smargo }
377*57045Smargo 
378*57045Smargo /*
379*57045Smargo  * If blocks are contiguous on disk, use this to provide clustered
380*57045Smargo  * read ahead.  We will read as many blocks as possible sequentially
381*57045Smargo  * and then parcel them up into logical blocks in the buffer hash table.
382*57045Smargo  */
383*57045Smargo struct buf *
384*57045Smargo cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
385*57045Smargo 	struct vnode *vp;
386*57045Smargo 	u_quad_t filesize;
387*57045Smargo 	struct buf *bp;
388*57045Smargo 	daddr_t lbn;
389*57045Smargo 	daddr_t blkno;
390*57045Smargo 	long size;
391*57045Smargo 	int run;
392*57045Smargo 	long flags;
393*57045Smargo {
394*57045Smargo 	struct cluster_save *b_save;
395*57045Smargo 	struct buf *tbp;
396*57045Smargo 	daddr_t bn;
397*57045Smargo 	int i, inc;
398*57045Smargo 
399*57045Smargo 	if (size * (lbn + run + 1) > filesize)
400*57045Smargo 		--run;
401*57045Smargo 	if (run == 0) {
402*57045Smargo 		if (!bp) {
403*57045Smargo 			bp = getblk(vp, lbn, size);
404*57045Smargo 			bp->b_blkno = blkno;
405*57045Smargo 			bp->b_flags |= flags;
406*57045Smargo 		}
407*57045Smargo 		return(bp);
408*57045Smargo 	}
409*57045Smargo 
410*57045Smargo 	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
411*57045Smargo 	if (bp->b_flags & (B_DONE | B_DELWRI))
412*57045Smargo 		return (bp);
413*57045Smargo 
414*57045Smargo 	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
415*57045Smargo 	    M_SEGMENT, M_WAITOK);
416*57045Smargo 	b_save->bs_bufsize = b_save->bs_bcount = size;
417*57045Smargo 	b_save->bs_nchildren = 0;
418*57045Smargo 	b_save->bs_children = (struct buf **)(b_save + 1);
419*57045Smargo 	b_save->bs_saveaddr = bp->b_saveaddr;
420*57045Smargo 	bp->b_saveaddr = (caddr_t) b_save;
421*57045Smargo 
422*57045Smargo 	inc = size / DEV_BSIZE;
423*57045Smargo 	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
424*57045Smargo 		if (incore(vp, lbn + i)) {
425*57045Smargo 			if (i == 1) {
426*57045Smargo 				bp->b_saveaddr = b_save->bs_saveaddr;
427*57045Smargo 				bp->b_flags &= ~B_CALL;
428*57045Smargo 				bp->b_iodone = NULL;
429*57045Smargo 				allocbuf(bp, size);
430*57045Smargo 				free(b_save, M_SEGMENT);
431*57045Smargo 			} else
432*57045Smargo 				allocbuf(bp, size * i);
433*57045Smargo 			break;
434*57045Smargo 		}
435*57045Smargo 		tbp = getblk(vp, lbn + i, 0);
436*57045Smargo 		tbp->b_bcount = tbp->b_bufsize = size;
437*57045Smargo 		tbp->b_blkno = bn;
438*57045Smargo 		tbp->b_flags |= flags | B_READ | B_ASYNC;
439*57045Smargo 		++b_save->bs_nchildren;
440*57045Smargo 		b_save->bs_children[i - 1] = tbp;
441*57045Smargo 	}
442*57045Smargo 	if (!(bp->b_flags & B_ASYNC))
443*57045Smargo 		vp->v_ralen = max(vp->v_ralen - 1, 1);
444*57045Smargo 	return(bp);
445*57045Smargo }
446*57045Smargo 
447*57045Smargo /*
448*57045Smargo  * Either get a new buffer or grow the existing one.
449*57045Smargo  */
450*57045Smargo struct buf *
451*57045Smargo cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
452*57045Smargo 	struct vnode *vp;
453*57045Smargo 	struct buf *bp;
454*57045Smargo 	long flags;
455*57045Smargo 	daddr_t blkno;
456*57045Smargo 	daddr_t lblkno;
457*57045Smargo 	long size;
458*57045Smargo 	int run;
459*57045Smargo {
460*57045Smargo 	if (!bp) {
461*57045Smargo 		bp = getblk(vp, lblkno, size);
462*57045Smargo 		if (bp->b_flags & (B_DONE | B_DELWRI)) {
463*57045Smargo 			bp->b_blkno = blkno;
464*57045Smargo 			return(bp);
465*57045Smargo 		}
466*57045Smargo 	}
467*57045Smargo 	allocbuf(bp, run * size);
468*57045Smargo 	bp->b_blkno = blkno;
469*57045Smargo 	bp->b_iodone = cluster_callback;
470*57045Smargo 	bp->b_flags |= flags | B_CALL;
471*57045Smargo 	return(bp);
472*57045Smargo }
473*57045Smargo 
474*57045Smargo /*
475*57045Smargo  * Cleanup after a clustered read or write.
476*57045Smargo  */
477*57045Smargo void
478*57045Smargo cluster_callback(bp)
479*57045Smargo 	struct buf *bp;
480*57045Smargo {
481*57045Smargo 	struct cluster_save *b_save;
482*57045Smargo 	struct buf **tbp;
483*57045Smargo 	long bsize;
484*57045Smargo 	caddr_t cp;
485*57045Smargo 
486*57045Smargo 	b_save = (struct cluster_save *)(bp->b_saveaddr);
487*57045Smargo 	bp->b_saveaddr = b_save->bs_saveaddr;
488*57045Smargo 
489*57045Smargo 	cp = bp->b_un.b_addr + b_save->bs_bufsize;
490*57045Smargo 	for (tbp = b_save->bs_children; b_save->bs_nchildren--; ++tbp) {
491*57045Smargo 		pagemove(cp, (*tbp)->b_un.b_addr, (*tbp)->b_bufsize);
492*57045Smargo 		cp += (*tbp)->b_bufsize;
493*57045Smargo 		bp->b_bufsize -= (*tbp)->b_bufsize;
494*57045Smargo 		biodone(*tbp);
495*57045Smargo 	}
496*57045Smargo #ifdef DIAGNOSTIC
497*57045Smargo 	if (bp->b_bufsize != b_save->bs_bufsize)
498*57045Smargo 		panic ("cluster_callback: more space to reclaim");
499*57045Smargo #endif
500*57045Smargo 	bp->b_bcount = bp->b_bufsize;
501*57045Smargo 	bp->b_iodone = NULL;
502*57045Smargo 	free(b_save, M_SEGMENT);
503*57045Smargo 	if (bp->b_flags & B_ASYNC)
504*57045Smargo 		brelse(bp);
505*57045Smargo 	else
506*57045Smargo 		wakeup((caddr_t)bp);
507*57045Smargo }
508*57045Smargo 
509*57045Smargo /*
51046151Smckusick  * Synchronous write.
51146151Smckusick  * Release buffer on completion.
5128Sbill  */
5138Sbill bwrite(bp)
5147015Smckusick 	register struct buf *bp;
5158Sbill {
51647545Skarels 	struct proc *p = curproc;		/* XXX */
51737736Smckusick 	register int flag;
51852413Storek 	int s, error = 0;
5198Sbill 
5208Sbill 	flag = bp->b_flags;
5219857Ssam 	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
52249459Smckusick 	if (flag & B_ASYNC) {
52349459Smckusick 		if ((flag & B_DELWRI) == 0)
52449459Smckusick 			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
52549459Smckusick 		else
52649459Smckusick 			reassignbuf(bp, bp->b_vp);
52749459Smckusick 	}
52840341Smckusick 	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
5298670S 	if (bp->b_bcount > bp->b_bufsize)
5308670S 		panic("bwrite");
53140226Smckusick 	s = splbio();
53239882Smckusick 	bp->b_vp->v_numoutput++;
53340226Smckusick 	splx(s);
53437736Smckusick 	VOP_STRATEGY(bp);
5357015Smckusick 
5367015Smckusick 	/*
53746151Smckusick 	 * If the write was synchronous, then await I/O completion.
5387015Smckusick 	 * If the write was "delayed", then we put the buffer on
53946151Smckusick 	 * the queue of blocks awaiting I/O completion status.
5407015Smckusick 	 */
54146151Smckusick 	if ((flag & B_ASYNC) == 0) {
54237736Smckusick 		error = biowait(bp);
54349459Smckusick 		if ((flag&B_DELWRI) == 0)
54449459Smckusick 			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
54549459Smckusick 		else
54649459Smckusick 			reassignbuf(bp, bp->b_vp);
5478Sbill 		brelse(bp);
54837736Smckusick 	} else if (flag & B_DELWRI) {
54952413Storek 		s = splbio();
5508Sbill 		bp->b_flags |= B_AGE;
55152413Storek 		splx(s);
55237736Smckusick 	}
55337736Smckusick 	return (error);
5548Sbill }
5558Sbill 
55653578Sheideman int
55753578Sheideman vn_bwrite(ap)
55853578Sheideman 	struct vop_bwrite_args *ap;
55953578Sheideman {
56056395Smckusick 	return (bwrite(ap->a_bp));
56153578Sheideman }
56253578Sheideman 
56353578Sheideman 
5648Sbill /*
56546151Smckusick  * Delayed write.
56646151Smckusick  *
56746151Smckusick  * The buffer is marked dirty, but is not queued for I/O.
56846151Smckusick  * This routine should be used when the buffer is expected
56946151Smckusick  * to be modified again soon, typically a small write that
57046151Smckusick  * partially fills a buffer.
57146151Smckusick  *
57246151Smckusick  * NB: magnetic tapes cannot be delayed; they must be
57346151Smckusick  * written in the order that the writes are requested.
5748Sbill  */
5758Sbill bdwrite(bp)
5767015Smckusick 	register struct buf *bp;
5778Sbill {
57847545Skarels 	struct proc *p = curproc;		/* XXX */
5798Sbill 
58039882Smckusick 	if ((bp->b_flags & B_DELWRI) == 0) {
58139882Smckusick 		bp->b_flags |= B_DELWRI;
58239882Smckusick 		reassignbuf(bp, bp->b_vp);
58347545Skarels 		p->p_stats->p_ru.ru_oublock++;		/* no one paid yet */
58439882Smckusick 	}
58537736Smckusick 	/*
58639668Smckusick 	 * If this is a tape drive, the write must be initiated.
58737736Smckusick 	 */
58848360Smckusick 	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
5898Sbill 		bawrite(bp);
59039668Smckusick 	} else {
59146151Smckusick 		bp->b_flags |= (B_DONE | B_DELWRI);
5928Sbill 		brelse(bp);
5938Sbill 	}
5948Sbill }
5958Sbill 
5968Sbill /*
59746151Smckusick  * Asynchronous write.
59846151Smckusick  * Start I/O on a buffer, but do not wait for it to complete.
59946151Smckusick  * The buffer is released when the I/O completes.
6008Sbill  */
6018Sbill bawrite(bp)
6027015Smckusick 	register struct buf *bp;
6038Sbill {
6048Sbill 
60546151Smckusick 	/*
60646151Smckusick 	 * Setting the ASYNC flag causes bwrite to return
60746151Smckusick 	 * after starting the I/O.
60846151Smckusick 	 */
6098Sbill 	bp->b_flags |= B_ASYNC;
61037736Smckusick 	(void) bwrite(bp);
6118Sbill }
6128Sbill 
6138Sbill /*
614*57045Smargo  * Do clustered write for FFS.
615*57045Smargo  *
616*57045Smargo  * Three cases:
617*57045Smargo  *	1. Write is not sequential (write asynchronously)
618*57045Smargo  *	Write is sequential:
619*57045Smargo  *	2.	beginning of cluster - begin cluster
620*57045Smargo  *	3.	middle of a cluster - add to cluster
621*57045Smargo  *	4.	end of a cluster - asynchronously write cluster
622*57045Smargo  */
623*57045Smargo void
624*57045Smargo cluster_write(bp, filesize)
625*57045Smargo         struct buf *bp;
626*57045Smargo 	u_quad_t filesize;
627*57045Smargo {
628*57045Smargo         struct vnode *vp;
629*57045Smargo         daddr_t lbn;
630*57045Smargo         int clen, error, maxrun;
631*57045Smargo 
632*57045Smargo         vp = bp->b_vp;
633*57045Smargo         lbn = bp->b_lblkno;
634*57045Smargo 	clen = 0;
635*57045Smargo 
636*57045Smargo 	/*
637*57045Smargo 	 * Handle end of file first.  If we are appending, we need to check
638*57045Smargo 	 * if the current block was allocated contiguously.  If it wasn't,
639*57045Smargo 	 * then we need to fire off a previous cluster if it existed.
640*57045Smargo 	 * Additionally, when we're appending, we need to figure out how
641*57045Smargo 	 * to initialize vp->v_clen.
642*57045Smargo 	 */
643*57045Smargo 	if ((lbn + 1) * bp->b_bcount == filesize) {
644*57045Smargo 		if (bp->b_blkno != vp->v_lasta + bp->b_bcount / DEV_BSIZE) {
645*57045Smargo 			/* This block was not allocated contiguously */
646*57045Smargo 			if (vp->v_clen)
647*57045Smargo 			    cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
648*57045Smargo 				vp->v_lastw - vp->v_cstart + 1, lbn);
649*57045Smargo 			vp->v_cstart = lbn;
650*57045Smargo 			clen = vp->v_clen =
651*57045Smargo 			    MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
652*57045Smargo 			/*
653*57045Smargo 			 * Next cluster started. Write this buffer and return.
654*57045Smargo 			 */
655*57045Smargo 			vp->v_lastw = lbn;
656*57045Smargo 			vp->v_lasta = bp->b_blkno;
657*57045Smargo 			bdwrite(bp);
658*57045Smargo 			return;
659*57045Smargo 		}
660*57045Smargo 		vp->v_lasta = bp->b_blkno;
661*57045Smargo 	} else if (lbn == 0) {
662*57045Smargo 		vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
663*57045Smargo 	}
664*57045Smargo         if (vp->v_clen == 0 || lbn != vp->v_lastw + 1) {
665*57045Smargo 		if (vp->v_clen != 0)
666*57045Smargo 			/*
667*57045Smargo 			 * Write is not sequential.
668*57045Smargo 			 */
669*57045Smargo 			cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
670*57045Smargo 			    vp->v_lastw - vp->v_cstart + 1, lbn);
671*57045Smargo 		/*
672*57045Smargo 		 * Consider beginning a cluster.
673*57045Smargo 		 */
674*57045Smargo 		if (error = VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &clen)) {
675*57045Smargo 			bawrite(bp);
676*57045Smargo 			vp->v_cstart = lbn + 1;
677*57045Smargo 			vp->v_lastw = lbn;
678*57045Smargo 			return;
679*57045Smargo 		}
680*57045Smargo                 vp->v_clen = clen;
681*57045Smargo                 if (clen == 0) {		/* I/O not contiguous */
682*57045Smargo 			vp->v_cstart = lbn + 1;
683*57045Smargo                         bawrite(bp);
684*57045Smargo                 } else {			/* Wait for rest of cluster */
685*57045Smargo 			vp->v_cstart = lbn;
686*57045Smargo                         bdwrite(bp);
687*57045Smargo 		}
688*57045Smargo         } else if (lbn == vp->v_cstart + vp->v_clen) {
689*57045Smargo 		/*
690*57045Smargo 		 * At end of cluster, write it out.
691*57045Smargo 		 */
692*57045Smargo 		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
693*57045Smargo 		    vp->v_clen + 1, lbn);
694*57045Smargo 		vp->v_clen = 0;
695*57045Smargo 		vp->v_cstart = lbn + 1;
696*57045Smargo         } else
697*57045Smargo 		/*
698*57045Smargo 		 * In the middle of a cluster, so just delay the
699*57045Smargo 		 * I/O for now.
700*57045Smargo 		 */
701*57045Smargo                 bdwrite(bp);
702*57045Smargo         vp->v_lastw = lbn;
703*57045Smargo }
704*57045Smargo 
705*57045Smargo 
706*57045Smargo /*
707*57045Smargo  * This is an awful lot like cluster_rbuild...wish they could be combined.
708*57045Smargo  * The last lbn argument is the current block on which I/O is being
709*57045Smargo  * performed.  Check to see that it doesn't fall in the middle of
710*57045Smargo  * the current block.
711*57045Smargo  */
712*57045Smargo void
713*57045Smargo cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
714*57045Smargo 	struct vnode *vp;
715*57045Smargo 	struct buf *last_bp;
716*57045Smargo 	long size;
717*57045Smargo 	daddr_t start_lbn;
718*57045Smargo 	int len;
719*57045Smargo 	daddr_t	lbn;
720*57045Smargo {
721*57045Smargo 	struct cluster_save *b_save;
722*57045Smargo 	struct buf *bp, *tbp;
723*57045Smargo 	caddr_t	cp;
724*57045Smargo 	int i, s;
725*57045Smargo 
726*57045Smargo redo:
727*57045Smargo 	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
728*57045Smargo 		++start_lbn;
729*57045Smargo 		--len;
730*57045Smargo 	}
731*57045Smargo 
732*57045Smargo 	/* Get more memory for current buffer */
733*57045Smargo 	if (len <= 1) {
734*57045Smargo 		if (last_bp)
735*57045Smargo 			bawrite(last_bp);
736*57045Smargo 		return;
737*57045Smargo 	}
738*57045Smargo 
739*57045Smargo 	bp = getblk(vp, start_lbn, size);
740*57045Smargo 	if (!(bp->b_flags & B_DELWRI)) {
741*57045Smargo 		++start_lbn;
742*57045Smargo 		--len;
743*57045Smargo 		brelse(bp);
744*57045Smargo 		goto redo;
745*57045Smargo 	}
746*57045Smargo 
747*57045Smargo 	--len;
748*57045Smargo 	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
749*57045Smargo 	    M_SEGMENT, M_WAITOK);
750*57045Smargo 	b_save->bs_bcount = bp->b_bcount;
751*57045Smargo 	b_save->bs_bufsize = bp->b_bufsize;
752*57045Smargo 	b_save->bs_nchildren = 0;
753*57045Smargo 	b_save->bs_children = (struct buf **)(b_save + 1);
754*57045Smargo 	b_save->bs_saveaddr = bp->b_saveaddr;
755*57045Smargo 	bp->b_saveaddr = (caddr_t) b_save;
756*57045Smargo 
757*57045Smargo 
758*57045Smargo 	bp->b_flags |= B_CALL;
759*57045Smargo 	bp->b_iodone = cluster_callback;
760*57045Smargo 	cp = bp->b_un.b_addr + bp->b_bufsize;
761*57045Smargo 	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
762*57045Smargo 		if (!incore(vp, start_lbn) || start_lbn == lbn)
763*57045Smargo 			break;
764*57045Smargo 
765*57045Smargo 		if (last_bp == NULL || start_lbn != last_bp->b_lblkno) {
766*57045Smargo 			tbp = getblk(vp, start_lbn, size);
767*57045Smargo #ifdef DIAGNOSTIC
768*57045Smargo 			if (tbp->b_bcount != tbp->b_bufsize)
769*57045Smargo 				panic("cluster_wbuild: Buffer too big");
770*57045Smargo #endif
771*57045Smargo 			if (!(tbp->b_flags & B_DELWRI)) {
772*57045Smargo 				brelse(tbp);
773*57045Smargo 				break;
774*57045Smargo 			}
775*57045Smargo 		} else
776*57045Smargo 			tbp = last_bp;
777*57045Smargo 
778*57045Smargo 		++b_save->bs_nchildren;
779*57045Smargo 
780*57045Smargo 		/* Move memory from children to parent */
781*57045Smargo 		pagemove(tbp->b_un.b_daddr, cp, size);
782*57045Smargo 		bp->b_bcount += size;
783*57045Smargo 		bp->b_bufsize += size;
784*57045Smargo 
785*57045Smargo 		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
786*57045Smargo 		tbp->b_flags |= B_ASYNC;
787*57045Smargo 		s = splbio();
788*57045Smargo 		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
789*57045Smargo 		++tbp->b_vp->v_numoutput;
790*57045Smargo 		splx(s);
791*57045Smargo 		b_save->bs_children[i] = tbp;
792*57045Smargo 
793*57045Smargo 		cp += tbp->b_bufsize;
794*57045Smargo 	}
795*57045Smargo 
796*57045Smargo 	if (i == 0) {
797*57045Smargo 		/* None to cluster */
798*57045Smargo 		bp->b_saveaddr = b_save->bs_saveaddr;
799*57045Smargo 		bp->b_flags &= ~B_CALL;
800*57045Smargo 		bp->b_iodone = NULL;
801*57045Smargo 		free(b_save, M_SEGMENT);
802*57045Smargo 	}
803*57045Smargo 	bawrite(bp);
804*57045Smargo 	if (i < len) {
805*57045Smargo 		len -= i + 1;
806*57045Smargo 		start_lbn += 1;
807*57045Smargo 		goto redo;
808*57045Smargo 	}
809*57045Smargo }
810*57045Smargo 
811*57045Smargo /*
81246151Smckusick  * Release a buffer.
81346151Smckusick  * Even if the buffer is dirty, no I/O is started.
8148Sbill  */
8158Sbill brelse(bp)
8167015Smckusick 	register struct buf *bp;
8178Sbill {
81856607Smckusick 	register struct queue_entry *flist;
81946151Smckusick 	int s;
8208Sbill 
82140341Smckusick 	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
8227015Smckusick 	/*
82339668Smckusick 	 * If a process is waiting for the buffer, or
82439668Smckusick 	 * is waiting for a free buffer, awaken it.
8257015Smckusick 	 */
82646151Smckusick 	if (bp->b_flags & B_WANTED)
8278Sbill 		wakeup((caddr_t)bp);
82856395Smckusick 	if (needbuffer) {
82956395Smckusick 		needbuffer = 0;
83056395Smckusick 		wakeup((caddr_t)&needbuffer);
8318Sbill 	}
83239668Smckusick 	/*
83339668Smckusick 	 * Retry I/O for locked buffers rather than invalidating them.
83439668Smckusick 	 */
83552413Storek 	s = splbio();
83639668Smckusick 	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
83739668Smckusick 		bp->b_flags &= ~B_ERROR;
83839668Smckusick 	/*
83939668Smckusick 	 * Disassociate buffers that are no longer valid.
84039668Smckusick 	 */
84146151Smckusick 	if (bp->b_flags & (B_NOCACHE | B_ERROR))
84237736Smckusick 		bp->b_flags |= B_INVAL;
84346151Smckusick 	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
84439668Smckusick 		if (bp->b_vp)
84539668Smckusick 			brelvp(bp);
84639668Smckusick 		bp->b_flags &= ~B_DELWRI;
84737736Smckusick 	}
8487015Smckusick 	/*
8497015Smckusick 	 * Stick the buffer back on a free list.
8507015Smckusick 	 */
8518670S 	if (bp->b_bufsize <= 0) {
8528670S 		/* block has no buffer ... put at front of unused buffer list */
85356395Smckusick 		flist = &bufqueues[BQ_EMPTY];
8548670S 		binsheadfree(bp, flist);
85546151Smckusick 	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
8562325Swnj 		/* block has no info ... put at front of most free list */
85756395Smckusick 		flist = &bufqueues[BQ_AGE];
8587015Smckusick 		binsheadfree(bp, flist);
8598Sbill 	} else {
8602325Swnj 		if (bp->b_flags & B_LOCKED)
86156395Smckusick 			flist = &bufqueues[BQ_LOCKED];
8622325Swnj 		else if (bp->b_flags & B_AGE)
86356395Smckusick 			flist = &bufqueues[BQ_AGE];
8642325Swnj 		else
86556395Smckusick 			flist = &bufqueues[BQ_LRU];
8667015Smckusick 		binstailfree(bp, flist);
8678Sbill 	}
86846151Smckusick 	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
8698Sbill 	splx(s);
8708Sbill }
8718Sbill 
8728Sbill /*
87346151Smckusick  * Check to see if a block is currently memory resident.
8748Sbill  */
87537736Smckusick incore(vp, blkno)
87637736Smckusick 	struct vnode *vp;
8777015Smckusick 	daddr_t blkno;
8788Sbill {
8798Sbill 	register struct buf *bp;
8808Sbill 
88156607Smckusick 	for (bp = BUFHASH(vp, blkno)->le_next; bp; bp = bp->b_hash.qe_next)
88239668Smckusick 		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
8837015Smckusick 		    (bp->b_flags & B_INVAL) == 0)
88491Sbill 			return (1);
88591Sbill 	return (0);
8868Sbill }
8878Sbill 
88839668Smckusick /*
88946151Smckusick  * Check to see if a block is currently memory resident.
89046151Smckusick  * If it is resident, return it. If it is not resident,
89146151Smckusick  * allocate a new buffer and assign it to the block.
89239668Smckusick  */
8938Sbill struct buf *
89437736Smckusick getblk(vp, blkno, size)
89537736Smckusick 	register struct vnode *vp;
8966563Smckusic 	daddr_t blkno;
8976563Smckusic 	int size;
8988Sbill {
89956607Smckusick 	register struct buf *bp;
90056607Smckusick 	struct list_entry *dp;
9015424Swnj 	int s;
9028Sbill 
90325255Smckusick 	if (size > MAXBSIZE)
90425255Smckusick 		panic("getblk: size too big");
9057015Smckusick 	/*
90646151Smckusick 	 * Search the cache for the block. If the buffer is found,
90746151Smckusick 	 * but it is currently locked, the we must wait for it to
90846151Smckusick 	 * become available.
9097015Smckusick 	 */
91037736Smckusick 	dp = BUFHASH(vp, blkno);
9117015Smckusick loop:
91256607Smckusick 	for (bp = dp->le_next; bp; bp = bp->b_hash.qe_next) {
91339668Smckusick 		if (bp->b_lblkno != blkno || bp->b_vp != vp ||
91446151Smckusick 		    (bp->b_flags & B_INVAL))
9158Sbill 			continue;
91626271Skarels 		s = splbio();
91746151Smckusick 		if (bp->b_flags & B_BUSY) {
9188Sbill 			bp->b_flags |= B_WANTED;
91946151Smckusick 			sleep((caddr_t)bp, PRIBIO + 1);
9205424Swnj 			splx(s);
9218Sbill 			goto loop;
9228Sbill 		}
92339882Smckusick 		bremfree(bp);
92439882Smckusick 		bp->b_flags |= B_BUSY;
9255424Swnj 		splx(s);
92632608Smckusick 		if (bp->b_bcount != size) {
92739668Smckusick 			printf("getblk: stray size");
92839668Smckusick 			bp->b_flags |= B_INVAL;
92939668Smckusick 			bwrite(bp);
93039668Smckusick 			goto loop;
93132608Smckusick 		}
9328Sbill 		bp->b_flags |= B_CACHE;
93326271Skarels 		return (bp);
9348Sbill 	}
9358670S 	bp = getnewbuf();
9367015Smckusick 	bremhash(bp);
93739668Smckusick 	bgetvp(vp, bp);
93845116Smckusick 	bp->b_bcount = 0;
93939668Smckusick 	bp->b_lblkno = blkno;
9406563Smckusic 	bp->b_blkno = blkno;
9418670S 	bp->b_error = 0;
94237736Smckusick 	bp->b_resid = 0;
94337736Smckusick 	binshash(bp, dp);
94445116Smckusick 	allocbuf(bp, size);
94526271Skarels 	return (bp);
9468Sbill }
9478Sbill 
9488Sbill /*
94946151Smckusick  * Allocate a buffer.
95046151Smckusick  * The caller will assign it to a block.
9518Sbill  */
9528Sbill struct buf *
9536563Smckusic geteblk(size)
9546563Smckusic 	int size;
9558Sbill {
95656395Smckusick 	register struct buf *bp;
9578Sbill 
95825255Smckusick 	if (size > MAXBSIZE)
95925255Smckusick 		panic("geteblk: size too big");
9608670S 	bp = getnewbuf();
9618670S 	bp->b_flags |= B_INVAL;
9627015Smckusick 	bremhash(bp);
96356395Smckusick 	binshash(bp, &invalhash);
96445116Smckusick 	bp->b_bcount = 0;
96537736Smckusick 	bp->b_error = 0;
96637736Smckusick 	bp->b_resid = 0;
96745116Smckusick 	allocbuf(bp, size);
96826271Skarels 	return (bp);
9698Sbill }
9708Sbill 
9718Sbill /*
97245116Smckusick  * Expand or contract the actual memory allocated to a buffer.
97346151Smckusick  * If no memory is available, release buffer and take error exit.
9746563Smckusic  */
97545116Smckusick allocbuf(tp, size)
97645116Smckusick 	register struct buf *tp;
9776563Smckusic 	int size;
9786563Smckusic {
97945116Smckusick 	register struct buf *bp, *ep;
98045116Smckusick 	int sizealloc, take, s;
9816563Smckusic 
98245116Smckusick 	sizealloc = roundup(size, CLBYTES);
98345116Smckusick 	/*
98445116Smckusick 	 * Buffer size does not change
98545116Smckusick 	 */
98645116Smckusick 	if (sizealloc == tp->b_bufsize)
98745116Smckusick 		goto out;
98845116Smckusick 	/*
98945116Smckusick 	 * Buffer size is shrinking.
99045116Smckusick 	 * Place excess space in a buffer header taken from the
99145116Smckusick 	 * BQ_EMPTY buffer list and placed on the "most free" list.
99245116Smckusick 	 * If no extra buffer headers are available, leave the
99345116Smckusick 	 * extra space in the present buffer.
99445116Smckusick 	 */
99545116Smckusick 	if (sizealloc < tp->b_bufsize) {
99656607Smckusick 		if ((ep = bufqueues[BQ_EMPTY].qe_next) == NULL)
99745116Smckusick 			goto out;
99845116Smckusick 		s = splbio();
99945116Smckusick 		bremfree(ep);
100045116Smckusick 		ep->b_flags |= B_BUSY;
100145116Smckusick 		splx(s);
100245116Smckusick 		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
100345116Smckusick 		    (int)tp->b_bufsize - sizealloc);
100445116Smckusick 		ep->b_bufsize = tp->b_bufsize - sizealloc;
100545116Smckusick 		tp->b_bufsize = sizealloc;
100645116Smckusick 		ep->b_flags |= B_INVAL;
100745116Smckusick 		ep->b_bcount = 0;
100845116Smckusick 		brelse(ep);
100945116Smckusick 		goto out;
101045116Smckusick 	}
101145116Smckusick 	/*
101245116Smckusick 	 * More buffer space is needed. Get it out of buffers on
101345116Smckusick 	 * the "most free" list, placing the empty headers on the
101445116Smckusick 	 * BQ_EMPTY buffer header list.
101545116Smckusick 	 */
101645116Smckusick 	while (tp->b_bufsize < sizealloc) {
101745116Smckusick 		take = sizealloc - tp->b_bufsize;
101845116Smckusick 		bp = getnewbuf();
101945116Smckusick 		if (take >= bp->b_bufsize)
102045116Smckusick 			take = bp->b_bufsize;
102145116Smckusick 		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
102245116Smckusick 		    &tp->b_un.b_addr[tp->b_bufsize], take);
102345116Smckusick 		tp->b_bufsize += take;
102445116Smckusick 		bp->b_bufsize = bp->b_bufsize - take;
102545116Smckusick 		if (bp->b_bcount > bp->b_bufsize)
102645116Smckusick 			bp->b_bcount = bp->b_bufsize;
102745116Smckusick 		if (bp->b_bufsize <= 0) {
102845116Smckusick 			bremhash(bp);
102956395Smckusick 			binshash(bp, &invalhash);
103046151Smckusick 			bp->b_dev = NODEV;
103145116Smckusick 			bp->b_error = 0;
103245116Smckusick 			bp->b_flags |= B_INVAL;
103345116Smckusick 		}
103445116Smckusick 		brelse(bp);
103545116Smckusick 	}
103645116Smckusick out:
103745116Smckusick 	tp->b_bcount = size;
103845116Smckusick 	return (1);
10398670S }
10408670S 
10418670S /*
10428670S  * Find a buffer which is available for use.
10438670S  * Select something from a free list.
10448670S  * Preference is to AGE list, then LRU list.
10458670S  */
10468670S struct buf *
10478670S getnewbuf()
10488670S {
104956395Smckusick 	register struct buf *bp;
105056607Smckusick 	register struct queue_entry *dp;
105138776Smckusick 	register struct ucred *cred;
10528670S 	int s;
10538670S 
10548670S loop:
105526271Skarels 	s = splbio();
105656395Smckusick 	for (dp = &bufqueues[BQ_AGE]; dp > bufqueues; dp--)
105756607Smckusick 		if (dp->qe_next)
10588670S 			break;
105956395Smckusick 	if (dp == bufqueues) {		/* no free blocks */
106056395Smckusick 		needbuffer = 1;
106156395Smckusick 		sleep((caddr_t)&needbuffer, PRIBIO + 1);
106212170Ssam 		splx(s);
10638670S 		goto loop;
10648670S 	}
106556607Smckusick 	bp = dp->qe_next;
106639882Smckusick 	bremfree(bp);
106739882Smckusick 	bp->b_flags |= B_BUSY;
10688670S 	splx(s);
10698670S 	if (bp->b_flags & B_DELWRI) {
107038614Smckusick 		(void) bawrite(bp);
10718670S 		goto loop;
10728670S 	}
107340341Smckusick 	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
107439668Smckusick 	if (bp->b_vp)
107539668Smckusick 		brelvp(bp);
107638776Smckusick 	if (bp->b_rcred != NOCRED) {
107738776Smckusick 		cred = bp->b_rcred;
107838776Smckusick 		bp->b_rcred = NOCRED;
107938776Smckusick 		crfree(cred);
108038776Smckusick 	}
108138776Smckusick 	if (bp->b_wcred != NOCRED) {
108238776Smckusick 		cred = bp->b_wcred;
108338776Smckusick 		bp->b_wcred = NOCRED;
108438776Smckusick 		crfree(cred);
108538776Smckusick 	}
10868670S 	bp->b_flags = B_BUSY;
108746989Smckusick 	bp->b_dirtyoff = bp->b_dirtyend = 0;
108852189Smckusick 	bp->b_validoff = bp->b_validend = 0;
10898670S 	return (bp);
10908670S }
10918670S 
10928670S /*
109346151Smckusick  * Wait for I/O to complete.
109446151Smckusick  *
109546151Smckusick  * Extract and return any errors associated with the I/O.
109646151Smckusick  * If the error flag is set, but no specific error is
109746151Smckusick  * given, return EIO.
10988Sbill  */
10997015Smckusick biowait(bp)
11006563Smckusic 	register struct buf *bp;
11018Sbill {
11025431Sroot 	int s;
11038Sbill 
110426271Skarels 	s = splbio();
110538776Smckusick 	while ((bp->b_flags & B_DONE) == 0)
11068Sbill 		sleep((caddr_t)bp, PRIBIO);
11075431Sroot 	splx(s);
110837736Smckusick 	if ((bp->b_flags & B_ERROR) == 0)
110937736Smckusick 		return (0);
111037736Smckusick 	if (bp->b_error)
111137736Smckusick 		return (bp->b_error);
111237736Smckusick 	return (EIO);
11138Sbill }
11148Sbill 
11158Sbill /*
111613128Ssam  * Mark I/O complete on a buffer.
111746151Smckusick  *
111846151Smckusick  * If a callback has been requested, e.g. the pageout
111946151Smckusick  * daemon, do so. Otherwise, awaken waiting processes.
11208Sbill  */
112151455Sbostic void
11227015Smckusick biodone(bp)
11237015Smckusick 	register struct buf *bp;
11248Sbill {
11258Sbill 
1126420Sbill 	if (bp->b_flags & B_DONE)
11277015Smckusick 		panic("dup biodone");
11288Sbill 	bp->b_flags |= B_DONE;
112949232Smckusick 	if ((bp->b_flags & B_READ) == 0)
113049232Smckusick 		vwakeup(bp);
11319763Ssam 	if (bp->b_flags & B_CALL) {
11329763Ssam 		bp->b_flags &= ~B_CALL;
11339763Ssam 		(*bp->b_iodone)(bp);
11349763Ssam 		return;
11359763Ssam 	}
113646151Smckusick 	if (bp->b_flags & B_ASYNC)
11378Sbill 		brelse(bp);
11388Sbill 	else {
11398Sbill 		bp->b_flags &= ~B_WANTED;
11408Sbill 		wakeup((caddr_t)bp);
11418Sbill 	}
11428Sbill }
114356356Smckusick 
114457035Smargo int
114557035Smargo count_lock_queue()
114657035Smargo {
114757035Smargo 	register struct buf *bp;
114857035Smargo 	register int ret;
114957035Smargo 
115057035Smargo 	for (ret = 0, bp = (struct buf *)bufqueues[BQ_LOCKED].qe_next;
115157035Smargo 	    bp; bp = (struct buf *)bp->b_freelist.qe_next)
115257035Smargo 		++ret;
115357035Smargo 	return(ret);
115457035Smargo }
115557035Smargo 
115656356Smckusick #ifdef DIAGNOSTIC
115756356Smckusick /*
115856356Smckusick  * Print out statistics on the current allocation of the buffer pool.
115956356Smckusick  * Can be enabled to print out on every ``sync'' by setting "syncprt"
116056356Smckusick  * above.
116156356Smckusick  */
116256356Smckusick void
116356356Smckusick vfs_bufstats()
116456356Smckusick {
116556356Smckusick 	int s, i, j, count;
116656395Smckusick 	register struct buf *bp;
116756607Smckusick 	register struct queue_entry *dp;
116856356Smckusick 	int counts[MAXBSIZE/CLBYTES+1];
116956356Smckusick 	static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" };
117056356Smckusick 
117156395Smckusick 	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
117256356Smckusick 		count = 0;
117356356Smckusick 		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
117456356Smckusick 			counts[j] = 0;
117556356Smckusick 		s = splbio();
117656607Smckusick 		for (bp = dp->qe_next; bp; bp = bp->b_freelist.qe_next) {
117756356Smckusick 			counts[bp->b_bufsize/CLBYTES]++;
117856356Smckusick 			count++;
117956356Smckusick 		}
118056356Smckusick 		splx(s);
118156356Smckusick 		printf("%s: total-%d", bname[i], count);
118256356Smckusick 		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
118356356Smckusick 			if (counts[j] != 0)
118456356Smckusick 				printf(", %d-%d", j * CLBYTES, counts[j]);
118556356Smckusick 		printf("\n");
118656356Smckusick 	}
118756356Smckusick }
118856356Smckusick #endif /* DIAGNOSTIC */
1189