xref: /csrg-svn/sys/kern/vfs_cluster.c (revision 66080)
149589Sbostic /*-
263180Sbostic  * Copyright (c) 1993
363180Sbostic  *	The Regents of the University of California.  All rights reserved.
423395Smckusick  *
559878Smckusick  * %sccs.include.redist.c%
637736Smckusick  *
7*66080Shibler  *	@(#)vfs_cluster.c	8.7 (Berkeley) 02/13/94
823395Smckusick  */
98Sbill 
1051455Sbostic #include <sys/param.h>
1151455Sbostic #include <sys/proc.h>
1251455Sbostic #include <sys/buf.h>
1351455Sbostic #include <sys/vnode.h>
1451455Sbostic #include <sys/mount.h>
1551455Sbostic #include <sys/trace.h>
1659878Smckusick #include <sys/malloc.h>
1751455Sbostic #include <sys/resourcevar.h>
1856395Smckusick #include <libkern/libkern.h>
198Sbill 
20*66080Shibler #ifdef DEBUG
21*66080Shibler #include <vm/vm.h>
22*66080Shibler #include <sys/sysctl.h>
23*66080Shibler int doreallocblks = 1;
24*66080Shibler struct ctldebug debug13 = { "doreallocblks", &doreallocblks };
25*66080Shibler #else
26*66080Shibler /* XXX for cluster_write */
27*66080Shibler #define doreallocblks 1
28*66080Shibler #endif
29*66080Shibler 
3091Sbill /*
3157045Smargo  * Local declarations
3257045Smargo  */
3357045Smargo struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
3457045Smargo 	    daddr_t, long, int));
3557045Smargo struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
3657045Smargo 	    daddr_t, daddr_t, long, int, long));
3764717Smckusick void	    cluster_wbuild __P((struct vnode *, struct buf *, long,
3864717Smckusick 	    daddr_t, int, daddr_t));
3965998Smckusick struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *));
4057045Smargo 
4165670Shibler #ifdef DIAGNOSTIC
4256395Smckusick /*
4365670Shibler  * Set to 1 if reads of block zero should cause readahead to be done;
4465670Shibler  * set to 0 to treat a read of block zero as a non-sequential read.
4557045Smargo  *
4665670Shibler  * Setting to one assumes that most reads of block zero of files are due to
4765670Shibler  * sequential passes over the files (e.g. cat, sum) where additional blocks
4865670Shibler  * will soon be needed.  Setting to zero assumes that the majority are
4965670Shibler  * surgical strikes to get particular info (e.g. size, file) where readahead
5065670Shibler  * blocks will not be used and, in fact, push out other potentially useful
5165670Shibler  * blocks from the cache.  The former seems intuitive, but some quick tests
5265670Shibler  * showed that the latter performed better from a system-wide point of view.
5365670Shibler  */
5465670Shibler int	doclusterraz = 0;
5565670Shibler #define ISSEQREAD(vp, blk) \
5665670Shibler 	(((blk) != 0 || doclusterraz) && \
5765670Shibler 	 ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
5865670Shibler #else
5965670Shibler #define ISSEQREAD(vp, blk) \
6065670Shibler 	((blk) != 0 && ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
6165670Shibler #endif
6265670Shibler 
6365670Shibler /*
6457045Smargo  * This replaces bread.  If this is a bread at the beginning of a file and
6557045Smargo  * lastr is 0, we assume this is the first read and we'll read up to two
6657045Smargo  * blocks if they are sequential.  After that, we'll do regular read ahead
6757045Smargo  * in clustered chunks.
6857045Smargo  *
6957045Smargo  * There are 4 or 5 cases depending on how you count:
7057045Smargo  *	Desired block is in the cache:
7157045Smargo  *	    1 Not sequential access (0 I/Os).
7257045Smargo  *	    2 Access is sequential, do read-ahead (1 ASYNC).
7357045Smargo  *	Desired block is not in cache:
7457045Smargo  *	    3 Not sequential access (1 SYNC).
7557045Smargo  *	    4 Sequential access, next block is contiguous (1 SYNC).
7657045Smargo  *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
7757045Smargo  *
7857045Smargo  * There are potentially two buffers that require I/O.
7957045Smargo  * 	bp is the block requested.
8057045Smargo  *	rbp is the read-ahead block.
8157045Smargo  *	If either is NULL, then you don't have to do the I/O.
8257045Smargo  */
8357045Smargo cluster_read(vp, filesize, lblkno, size, cred, bpp)
8457045Smargo 	struct vnode *vp;
8557045Smargo 	u_quad_t filesize;
8657045Smargo 	daddr_t lblkno;
8757045Smargo 	long size;
8857045Smargo 	struct ucred *cred;
8957045Smargo 	struct buf **bpp;
9057045Smargo {
9157045Smargo 	struct buf *bp, *rbp;
9257045Smargo 	daddr_t blkno, ioblkno;
9357045Smargo 	long flags;
9457045Smargo 	int error, num_ra, alreadyincore;
9557045Smargo 
9657045Smargo #ifdef DIAGNOSTIC
9757045Smargo 	if (size == 0)
9857045Smargo 		panic("cluster_read: size = 0");
9957045Smargo #endif
10057045Smargo 
10157045Smargo 	error = 0;
10257045Smargo 	flags = B_READ;
10357797Smckusick 	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
10465670Shibler 	if (bp->b_flags & B_CACHE) {
10557045Smargo 		/*
10657045Smargo 		 * Desired block is in cache; do any readahead ASYNC.
10757045Smargo 		 * Case 1, 2.
10857045Smargo 		 */
10957045Smargo 		trace(TR_BREADHIT, pack(vp, size), lblkno);
11057045Smargo 		flags |= B_ASYNC;
11165670Shibler 		ioblkno = lblkno + (vp->v_ralen ? vp->v_ralen : 1);
11257797Smckusick 		alreadyincore = (int)incore(vp, ioblkno);
11357045Smargo 		bp = NULL;
11457045Smargo 	} else {
11557045Smargo 		/* Block wasn't in cache, case 3, 4, 5. */
11657045Smargo 		trace(TR_BREADMISS, pack(vp, size), lblkno);
11765670Shibler 		bp->b_flags |= B_READ;
11857045Smargo 		ioblkno = lblkno;
11957045Smargo 		alreadyincore = 0;
12057045Smargo 		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
12157045Smargo 	}
12257045Smargo 	/*
12357045Smargo 	 * XXX
12457045Smargo 	 * Replace 1 with a window size based on some combination of
12557045Smargo 	 * maxcontig and rot_delay.  This will let you figure out how
12657045Smargo 	 * many blocks you should read ahead (case 2, 4, 5).
12757045Smargo 	 *
12865670Shibler 	 * If the access isn't sequential, reset the window to 1.
12965670Shibler 	 * Note that a read to the same block is considered sequential.
13065670Shibler 	 * This catches the case where the file is being read sequentially,
13165670Shibler 	 * but at smaller than the filesystem block size.
13257045Smargo 	 */
13357045Smargo 	rbp = NULL;
13465670Shibler 	if (!ISSEQREAD(vp, lblkno)) {
13565670Shibler 		vp->v_ralen = 0;
13665670Shibler 		vp->v_maxra = lblkno;
13765670Shibler 	} else if ((ioblkno + 1) * size <= filesize && !alreadyincore &&
13864717Smckusick 	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) &&
13964717Smckusick 	    blkno != -1) {
14057045Smargo 		/*
14157045Smargo 		 * Reading sequentially, and the next block is not in the
14265670Shibler 		 * cache.  We are going to try reading ahead.
14357045Smargo 		 */
14465670Shibler 		if (num_ra) {
14565670Shibler 			/*
14665670Shibler 			 * If our desired readahead block had been read
14765670Shibler 			 * in a previous readahead but is no longer in
14865670Shibler 			 * core, then we may be reading ahead too far
14965670Shibler 			 * or are not using our readahead very rapidly.
15065670Shibler 			 * In this case we scale back the window.
15165670Shibler 			 */
15265670Shibler 			if (!alreadyincore && ioblkno <= vp->v_maxra)
15365670Shibler 				vp->v_ralen = max(vp->v_ralen >> 1, 1);
15465670Shibler 			/*
15565670Shibler 			 * There are more sequential blocks than our current
15665670Shibler 			 * window allows, scale up.  Ideally we want to get
15765670Shibler 			 * in sync with the filesystem maxcontig value.
15865670Shibler 			 */
15965670Shibler 			else if (num_ra > vp->v_ralen && lblkno != vp->v_lastr)
16065670Shibler 				vp->v_ralen = vp->v_ralen ?
16165670Shibler 					min(num_ra, vp->v_ralen << 1) : 1;
16257045Smargo 
16365670Shibler 			if (num_ra > vp->v_ralen)
16465670Shibler 				num_ra = vp->v_ralen;
16565670Shibler 		}
16657045Smargo 
16757045Smargo 		if (num_ra)				/* case 2, 4 */
16857045Smargo 			rbp = cluster_rbuild(vp, filesize,
16957045Smargo 			    bp, ioblkno, blkno, size, num_ra, flags);
17065670Shibler 		else if (ioblkno == lblkno) {
17165670Shibler 			bp->b_blkno = blkno;
17257045Smargo 			/* Case 5: check how many blocks to read ahead */
17357045Smargo 			++ioblkno;
17457045Smargo 			if ((ioblkno + 1) * size > filesize ||
17565670Shibler 			    incore(vp, ioblkno) || (error = VOP_BMAP(vp,
17665670Shibler 			     ioblkno, NULL, &blkno, &num_ra)) || blkno == -1)
17757045Smargo 				goto skip_readahead;
17865670Shibler 			/*
17965670Shibler 			 * Adjust readahead as above
18065670Shibler 			 */
18165670Shibler 			if (num_ra) {
18265670Shibler 				if (!alreadyincore && ioblkno <= vp->v_maxra)
18365670Shibler 					vp->v_ralen = max(vp->v_ralen >> 1, 1);
18465670Shibler 				else if (num_ra > vp->v_ralen &&
18565670Shibler 					 lblkno != vp->v_lastr)
18665670Shibler 					vp->v_ralen = vp->v_ralen ?
18765670Shibler 						min(num_ra,vp->v_ralen<<1) : 1;
18865670Shibler 				if (num_ra > vp->v_ralen)
18965670Shibler 					num_ra = vp->v_ralen;
19065670Shibler 			}
19157045Smargo 			flags |= B_ASYNC;
19257045Smargo 			if (num_ra)
19357045Smargo 				rbp = cluster_rbuild(vp, filesize,
19457045Smargo 				    NULL, ioblkno, blkno, size, num_ra, flags);
19557045Smargo 			else {
19657797Smckusick 				rbp = getblk(vp, ioblkno, size, 0, 0);
19757045Smargo 				rbp->b_flags |= flags;
19857045Smargo 				rbp->b_blkno = blkno;
19957045Smargo 			}
20065670Shibler 		} else {
20157045Smargo 			/* case 2; read ahead single block */
20257797Smckusick 			rbp = getblk(vp, ioblkno, size, 0, 0);
20357045Smargo 			rbp->b_flags |= flags;
20457045Smargo 			rbp->b_blkno = blkno;
20565670Shibler 		}
20657045Smargo 
20765670Shibler 		if (rbp == bp)			/* case 4 */
20857045Smargo 			rbp = NULL;
20957045Smargo 		else if (rbp) {			/* case 2, 5 */
21057045Smargo 			trace(TR_BREADMISSRA,
21157045Smargo 			    pack(vp, (num_ra + 1) * size), ioblkno);
21257045Smargo 			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
21357045Smargo 		}
21457045Smargo 	}
21557045Smargo 
21657045Smargo 	/* XXX Kirk, do we need to make sure the bp has creds? */
21757045Smargo skip_readahead:
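	/*
	 * Issue the I/O on the requested block (if it was not found in
	 * the cache), and on the readahead buffer unless the main request
	 * failed or the readahead buffer already holds valid data, in
	 * which case it is simply released.
	 */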
21857045Smargo 	if (bp)
21957045Smargo 		if (bp->b_flags & (B_DONE | B_DELWRI))
22057045Smargo 			panic("cluster_read: DONE bp");
22157045Smargo 		else
22257045Smargo 			error = VOP_STRATEGY(bp);
22357045Smargo 
22457045Smargo 	if (rbp)
22557045Smargo 		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
22657045Smargo 			rbp->b_flags &= ~(B_ASYNC | B_READ);
22757045Smargo 			brelse(rbp);
22857045Smargo 		} else
22957045Smargo 			(void) VOP_STRATEGY(rbp);
23057045Smargo 
23165670Shibler 	/*
23265670Shibler 	 * Recalculate our maximum readahead
23365670Shibler 	 */
23465670Shibler 	if (rbp == NULL)
23565670Shibler 		rbp = bp;
23665670Shibler 	if (rbp)
23765670Shibler 		vp->v_maxra = rbp->b_lblkno + (rbp->b_bufsize / size) - 1;
23865670Shibler 
23957045Smargo 	if (bp)
24057045Smargo 		return(biowait(bp));
24157045Smargo 	return(error);
24257045Smargo }
24357045Smargo 
24457045Smargo /*
24557045Smargo  * If blocks are contiguous on disk, use this to provide clustered
24657045Smargo  * read ahead.  We will read as many blocks as possible sequentially
24757045Smargo  * and then parcel them up into logical blocks in the buffer hash table.
24857045Smargo  */
24957045Smargo struct buf *
25057045Smargo cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
25157045Smargo 	struct vnode *vp;
25257045Smargo 	u_quad_t filesize;
25357045Smargo 	struct buf *bp;
25457045Smargo 	daddr_t lbn;
25557045Smargo 	daddr_t blkno;
25657045Smargo 	long size;
25757045Smargo 	int run;
25857045Smargo 	long flags;
25957045Smargo {
26057045Smargo 	struct cluster_save *b_save;
26157045Smargo 	struct buf *tbp;
26257045Smargo 	daddr_t bn;
26357045Smargo 	int i, inc;
26457045Smargo 
26559872Smargo #ifdef DIAGNOSTIC
26659872Smargo 	if (size != vp->v_mount->mnt_stat.f_iosize)
26759872Smargo 		panic("cluster_rbuild: size %d != f_iosize %d\n",
26859872Smargo 			size, vp->v_mount->mnt_stat.f_iosize);
26959872Smargo #endif
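	/*
	 * If the last block of the run would extend past the end of the
	 * file, drop it from the run.
	 */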
27057045Smargo 	if (size * (lbn + run + 1) > filesize)
27157045Smargo 		--run;
27257045Smargo 	if (run == 0) {
27357045Smargo 		if (!bp) {
27457797Smckusick 			bp = getblk(vp, lbn, size, 0, 0);
27557045Smargo 			bp->b_blkno = blkno;
27657045Smargo 			bp->b_flags |= flags;
27757045Smargo 		}
27857045Smargo 		return(bp);
27957045Smargo 	}
28057045Smargo 
28157045Smargo 	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
28257045Smargo 	if (bp->b_flags & (B_DONE | B_DELWRI))
28357045Smargo 		return (bp);
28457045Smargo 
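	/*
	 * Record the component buffers in a cluster_save, hung off
	 * b_saveaddr, so that cluster_callback can parcel the data back
	 * out to them when the large I/O completes.
	 */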
28557045Smargo 	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
28657045Smargo 	    M_SEGMENT, M_WAITOK);
28757045Smargo 	b_save->bs_bufsize = b_save->bs_bcount = size;
28857045Smargo 	b_save->bs_nchildren = 0;
28957045Smargo 	b_save->bs_children = (struct buf **)(b_save + 1);
29057045Smargo 	b_save->bs_saveaddr = bp->b_saveaddr;
29157045Smargo 	bp->b_saveaddr = (caddr_t) b_save;
29257045Smargo 
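	/*
	 * Walk the remaining blocks of the run, setting up a child buffer
	 * for each.  If one of them is already in core, stop there and
	 * shrink the cluster buffer to cover only the blocks that will
	 * actually be read.
	 */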
29365670Shibler 	inc = btodb(size);
29457045Smargo 	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
29557045Smargo 		if (incore(vp, lbn + i)) {
29657045Smargo 			if (i == 1) {
29757045Smargo 				bp->b_saveaddr = b_save->bs_saveaddr;
29857045Smargo 				bp->b_flags &= ~B_CALL;
29957045Smargo 				bp->b_iodone = NULL;
30057045Smargo 				allocbuf(bp, size);
30157045Smargo 				free(b_save, M_SEGMENT);
30257045Smargo 			} else
30357045Smargo 				allocbuf(bp, size * i);
30457045Smargo 			break;
30557045Smargo 		}
30657797Smckusick 		tbp = getblk(vp, lbn + i, 0, 0, 0);
30765670Shibler 		/*
30865670Shibler 		 * getblk may return some memory in the buffer if there were
30965670Shibler 		 * no empty buffers to shed it to.  If there is currently
31065670Shibler 		 * memory in the buffer, we move it down size bytes to make
31165670Shibler 		 * room for the valid pages that cluster_callback will insert.
31265670Shibler 		 * We do this now so we don't have to do it at interrupt time
31365670Shibler 		 * in the callback routine.
31465670Shibler 		 */
31565670Shibler 		if (tbp->b_bufsize != 0) {
31665670Shibler 			caddr_t bdata = (char *)tbp->b_data;
31765670Shibler 
31865670Shibler 			if (tbp->b_bufsize + size > MAXBSIZE)
31965670Shibler 				panic("cluster_rbuild: too much memory");
32065670Shibler 			if (tbp->b_bufsize > size) {
32165670Shibler 				/*
32265670Shibler 				 * XXX if the source and destination regions
32365670Shibler 				 * overlap we have to copy backward to avoid
32465670Shibler 				 * clobbering any valid pages (i.e. pagemove
32565670Shibler 				 * implementations typically can't handle
32665670Shibler 				 * overlap).
32765670Shibler 				 */
32865670Shibler 				bdata += tbp->b_bufsize;
32965670Shibler 				while (bdata > (char *)tbp->b_data) {
33065670Shibler 					bdata -= CLBYTES;
33165670Shibler 					pagemove(bdata, bdata + size, CLBYTES);
33265670Shibler 				}
33365670Shibler 			} else
33465670Shibler 				pagemove(bdata, bdata + size, tbp->b_bufsize);
33565670Shibler 		}
33657045Smargo 		tbp->b_blkno = bn;
33757045Smargo 		tbp->b_flags |= flags | B_READ | B_ASYNC;
33857045Smargo 		++b_save->bs_nchildren;
33957045Smargo 		b_save->bs_children[i - 1] = tbp;
34057045Smargo 	}
34157045Smargo 	return(bp);
34257045Smargo }
34357045Smargo 
34457045Smargo /*
34557045Smargo  * Either get a new buffer or grow the existing one.
34657045Smargo  */
34757045Smargo struct buf *
34857045Smargo cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
34957045Smargo 	struct vnode *vp;
35057045Smargo 	struct buf *bp;
35157045Smargo 	long flags;
35257045Smargo 	daddr_t blkno;
35357045Smargo 	daddr_t lblkno;
35457045Smargo 	long size;
35557045Smargo 	int run;
35657045Smargo {
35757045Smargo 	if (!bp) {
35857797Smckusick 		bp = getblk(vp, lblkno, size, 0, 0);
35957045Smargo 		if (bp->b_flags & (B_DONE | B_DELWRI)) {
36057045Smargo 			bp->b_blkno = blkno;
36157045Smargo 			return(bp);
36257045Smargo 		}
36357045Smargo 	}
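	/*
	 * Grow the buffer to span the entire run and arrange for
	 * cluster_callback to break it back up into its component
	 * buffers when the I/O completes.
	 */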
36457045Smargo 	allocbuf(bp, run * size);
36557045Smargo 	bp->b_blkno = blkno;
36657045Smargo 	bp->b_iodone = cluster_callback;
36757045Smargo 	bp->b_flags |= flags | B_CALL;
36857045Smargo 	return(bp);
36957045Smargo }
37057045Smargo 
37157045Smargo /*
37257045Smargo  * Cleanup after a clustered read or write.
37365670Shibler  * This is complicated by the fact that any of the buffers might have
37465670Shibler  * extra memory (if there were no empty buffer headers at allocbuf time)
37565670Shibler  * that we will need to shift around.
37657045Smargo  */
37757045Smargo void
37857045Smargo cluster_callback(bp)
37957045Smargo 	struct buf *bp;
38057045Smargo {
38157045Smargo 	struct cluster_save *b_save;
38265670Shibler 	struct buf **bpp, *tbp;
38365670Shibler 	long bsize;
38457045Smargo 	caddr_t cp;
38565670Shibler 	int error = 0;
38664717Smckusick 
38765670Shibler 	/*
38865670Shibler 	 * Must propagate errors to all the components.
38965670Shibler 	 */
39065670Shibler 	if (bp->b_flags & B_ERROR)
39165670Shibler 		error = bp->b_error;
39265670Shibler 
39357045Smargo 	b_save = (struct cluster_save *)(bp->b_saveaddr);
39457045Smargo 	bp->b_saveaddr = b_save->bs_saveaddr;
39557045Smargo 
39665670Shibler 	bsize = b_save->bs_bufsize;
39765670Shibler 	cp = (char *)bp->b_data + bsize;
39865670Shibler 	/*
39965670Shibler 	 * Move memory from the large cluster buffer into the component
40065670Shibler 	 * buffers and mark IO as done on these.
40165670Shibler 	 */
40265670Shibler 	for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) {
40365670Shibler 		tbp = *bpp;
40465670Shibler 		pagemove(cp, tbp->b_data, bsize);
40565670Shibler 		tbp->b_bufsize += bsize;
40665670Shibler 		tbp->b_bcount = bsize;
40765670Shibler 		if (error) {
40865670Shibler 			tbp->b_flags |= B_ERROR;
40965670Shibler 			tbp->b_error = error;
41065670Shibler 		}
41165670Shibler 		biodone(tbp);
41265670Shibler 		bp->b_bufsize -= bsize;
41365670Shibler 		cp += bsize;
41457045Smargo 	}
41565670Shibler 	/*
41665670Shibler 	 * If there was excess memory in the cluster buffer,
41765670Shibler 	 * slide it up adjacent to the remaining valid data.
41865670Shibler 	 */
41965670Shibler 	if (bp->b_bufsize != bsize) {
42065670Shibler 		if (bp->b_bufsize < bsize)
42165670Shibler 			panic("cluster_callback: too little memory");
42265670Shibler 		pagemove(cp, (char *)bp->b_data + bsize, bp->b_bufsize - bsize);
42365670Shibler 	}
42465670Shibler 	bp->b_bcount = bsize;
42557045Smargo 	bp->b_iodone = NULL;
42657045Smargo 	free(b_save, M_SEGMENT);
42757045Smargo 	if (bp->b_flags & B_ASYNC)
42857045Smargo 		brelse(bp);
42965670Shibler 	else {
43065670Shibler 		bp->b_flags &= ~B_WANTED;
43157045Smargo 		wakeup((caddr_t)bp);
43265670Shibler 	}
43357045Smargo }
43457045Smargo 
43557045Smargo /*
43657045Smargo  * Do clustered write for FFS.
43757045Smargo  *
43857045Smargo  * Three cases:
43957045Smargo  *	1. Write is not sequential (write asynchronously)
44057045Smargo  *	Write is sequential:
44157045Smargo  *	2.	beginning of cluster - begin cluster
44257045Smargo  *	3.	middle of a cluster - add to cluster
44357045Smargo  *	4.	end of a cluster - asynchronously write cluster
44457045Smargo  */
44557045Smargo void
44657045Smargo cluster_write(bp, filesize)
44757045Smargo         struct buf *bp;
44857045Smargo 	u_quad_t filesize;
44957045Smargo {
45057045Smargo         struct vnode *vp;
45157045Smargo         daddr_t lbn;
45265998Smckusick         int maxclen, cursize;
45357045Smargo 
45457045Smargo         vp = bp->b_vp;
45557045Smargo         lbn = bp->b_lblkno;
45657045Smargo 
45759872Smargo 	/* Initialize vnode to beginning of file. */
45859872Smargo 	if (lbn == 0)
45959872Smargo 		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
46059872Smargo 
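	/*
	 * This path is taken when no cluster is being built (v_clen is 0),
	 * when the write does not logically follow the last one, or when
	 * the new block is not physically contiguous with the last one.
	 * Any cluster in progress is pushed (or reallocated) and a new
	 * one is considered.
	 */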
46159872Smargo         if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
46265670Shibler 	    (bp->b_blkno != vp->v_lasta + btodb(bp->b_bcount))) {
46365998Smckusick 		maxclen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
46465998Smckusick 		if (vp->v_clen != 0) {
46557045Smargo 			/*
46665998Smckusick 			 * Next block is not sequential.
46765998Smckusick 			 *
46865998Smckusick 			 * If we are not writing at end of file, if the process
46965998Smckusick 			 * has seeked to another point in the file since its last
47065998Smckusick 			 * write, or if we have reached our maximum cluster size,
47165998Smckusick 			 * then push the previous cluster.  Otherwise try
47265998Smckusick 			 * reallocating to make it sequential.
47357045Smargo 			 */
47465998Smckusick 			cursize = vp->v_lastw - vp->v_cstart + 1;
475*66080Shibler 			if (!doreallocblks ||
476*66080Shibler 			    (lbn + 1) * bp->b_bcount != filesize ||
47765998Smckusick 			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
47865998Smckusick 				cluster_wbuild(vp, NULL, bp->b_bcount,
47965998Smckusick 				    vp->v_cstart, cursize, lbn);
48065998Smckusick 			} else {
48165998Smckusick 				struct buf **bpp, **endbp;
48265998Smckusick 				struct cluster_save *buflist;
48365998Smckusick 
48465998Smckusick 				buflist = cluster_collectbufs(vp, bp);
48565998Smckusick 				endbp = &buflist->bs_children
48665998Smckusick 				    [buflist->bs_nchildren - 1];
48765998Smckusick 				if (VOP_REALLOCBLKS(vp, buflist)) {
48865998Smckusick 					/*
48965998Smckusick 					 * Failed, push the previous cluster.
49065998Smckusick 					 */
49165998Smckusick 					for (bpp = buflist->bs_children;
49265998Smckusick 					     bpp < endbp; bpp++)
49365998Smckusick 						brelse(*bpp);
49465998Smckusick 					free(buflist, M_SEGMENT);
49565998Smckusick 					cluster_wbuild(vp, NULL, bp->b_bcount,
49665998Smckusick 					    vp->v_cstart, cursize, lbn);
49765998Smckusick 				} else {
49865998Smckusick 					/*
49965998Smckusick 					 * Succeeded, keep building cluster.
50065998Smckusick 					 */
50165998Smckusick 					for (bpp = buflist->bs_children;
50265998Smckusick 					     bpp <= endbp; bpp++)
50365998Smckusick 						bdwrite(*bpp);
50465998Smckusick 					free(buflist, M_SEGMENT);
50565998Smckusick 					vp->v_lastw = lbn;
50665998Smckusick 					vp->v_lasta = bp->b_blkno;
50765998Smckusick 					return;
50865998Smckusick 				}
50965998Smckusick 			}
51065998Smckusick 		}
51157045Smargo 		/*
51257045Smargo 		 * Consider beginning a cluster.
51365998Smckusick 		 * If at end of file, make cluster as large as possible,
51465998Smckusick 		 * otherwise find size of existing cluster.
51557045Smargo 		 */
51665998Smckusick 		if ((lbn + 1) * bp->b_bcount != filesize &&
51765998Smckusick 		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) ||
51865998Smckusick 		     bp->b_blkno == -1)) {
51957045Smargo 			bawrite(bp);
52059872Smargo 			vp->v_clen = 0;
52159872Smargo 			vp->v_lasta = bp->b_blkno;
52257045Smargo 			vp->v_cstart = lbn + 1;
52357045Smargo 			vp->v_lastw = lbn;
52457045Smargo 			return;
52564717Smckusick 		}
52665998Smckusick                 vp->v_clen = maxclen;
52765998Smckusick                 if (maxclen == 0) {		/* I/O not contiguous */
52857045Smargo 			vp->v_cstart = lbn + 1;
52957045Smargo                         bawrite(bp);
53057045Smargo                 } else {			/* Wait for rest of cluster */
53157045Smargo 			vp->v_cstart = lbn;
53257045Smargo                         bdwrite(bp);
53357045Smargo 		}
53465670Shibler 	} else if (lbn == vp->v_cstart + vp->v_clen) {
53557045Smargo 		/*
53657045Smargo 		 * At end of cluster, write it out.
53757045Smargo 		 */
53857045Smargo 		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
53957045Smargo 		    vp->v_clen + 1, lbn);
54057045Smargo 		vp->v_clen = 0;
54157045Smargo 		vp->v_cstart = lbn + 1;
54265670Shibler 	} else
54357045Smargo 		/*
54457045Smargo 		 * In the middle of a cluster, so just delay the
54557045Smargo 		 * I/O for now.
54657045Smargo 		 */
54765670Shibler 		bdwrite(bp);
54865670Shibler 	vp->v_lastw = lbn;
54959872Smargo 	vp->v_lasta = bp->b_blkno;
55057045Smargo }
55157045Smargo 
55257045Smargo 
55357045Smargo /*
55457045Smargo  * This is an awful lot like cluster_rbuild...wish they could be combined.
55557045Smargo  * The last lbn argument is the current block on which I/O is being
55657045Smargo  * performed.  Check to see that it doesn't fall in the middle of
55765670Shibler  * the cluster being built (if last_bp == NULL).
55857045Smargo  */
55957045Smargo void
56057045Smargo cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
56157045Smargo 	struct vnode *vp;
56257045Smargo 	struct buf *last_bp;
56357045Smargo 	long size;
56457045Smargo 	daddr_t start_lbn;
56557045Smargo 	int len;
56657045Smargo 	daddr_t	lbn;
56757045Smargo {
56857045Smargo 	struct cluster_save *b_save;
56957045Smargo 	struct buf *bp, *tbp;
57057045Smargo 	caddr_t	cp;
57157045Smargo 	int i, s;
57257045Smargo 
57359872Smargo #ifdef DIAGNOSTIC
57459872Smargo 	if (size != vp->v_mount->mnt_stat.f_iosize)
57559872Smargo 		panic("cluster_wbuild: size %d != f_iosize %d\n",
57659872Smargo 			size, vp->v_mount->mnt_stat.f_iosize);
57759872Smargo #endif
57857045Smargo redo:
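	/*
	 * Skip over any leading blocks that are not in core, as well as
	 * the block on which I/O is currently being done (lbn); neither
	 * can begin the cluster.
	 */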
57957045Smargo 	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
58057045Smargo 		++start_lbn;
58157045Smargo 		--len;
58257045Smargo 	}
58357045Smargo 
58457045Smargo 	/* One or zero blocks left: nothing to cluster; write out the last block. */
58557045Smargo 	if (len <= 1) {
58659872Smargo 		if (last_bp) {
58757045Smargo 			bawrite(last_bp);
58859872Smargo 		} else if (len) {
58959872Smargo 			bp = getblk(vp, start_lbn, size, 0, 0);
59059872Smargo 			bawrite(bp);
59159872Smargo 		}
59257045Smargo 		return;
59357045Smargo 	}
59457045Smargo 
59557797Smckusick 	bp = getblk(vp, start_lbn, size, 0, 0);
59657045Smargo 	if (!(bp->b_flags & B_DELWRI)) {
59757045Smargo 		++start_lbn;
59857045Smargo 		--len;
59957045Smargo 		brelse(bp);
60057045Smargo 		goto redo;
60157045Smargo 	}
60257045Smargo 
60365670Shibler 	/*
60465670Shibler 	 * Extra memory in the buffer, punt on this buffer.
60565670Shibler 	 * XXX we could handle this in most cases, but we would have to
60665670Shibler 	 * push the extra memory down to after our max possible cluster
60765670Shibler 	 * size and then potentially pull it back up if the cluster was
60865670Shibler 	 * terminated prematurely--too much hassle.
60965670Shibler 	 */
61065670Shibler 	if (bp->b_bcount != bp->b_bufsize) {
61165670Shibler 		++start_lbn;
61265670Shibler 		--len;
61365670Shibler 		bawrite(bp);
61465670Shibler 		goto redo;
61565670Shibler 	}
61665670Shibler 
61757045Smargo 	--len;
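	/*
	 * As in cluster_rbuild, record the component buffers in a
	 * cluster_save so that cluster_callback can hand their memory
	 * back and mark them done when the clustered write completes.
	 */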
61857045Smargo 	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
61957045Smargo 	    M_SEGMENT, M_WAITOK);
62057045Smargo 	b_save->bs_bcount = bp->b_bcount;
62157045Smargo 	b_save->bs_bufsize = bp->b_bufsize;
62257045Smargo 	b_save->bs_nchildren = 0;
62357045Smargo 	b_save->bs_children = (struct buf **)(b_save + 1);
62457045Smargo 	b_save->bs_saveaddr = bp->b_saveaddr;
62557045Smargo 	bp->b_saveaddr = (caddr_t) b_save;
62657045Smargo 
62757045Smargo 	bp->b_flags |= B_CALL;
62857045Smargo 	bp->b_iodone = cluster_callback;
62965670Shibler 	cp = (char *)bp->b_data + size;
63057045Smargo 	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
63165670Shibler 		/*
63265670Shibler 		 * Stop if the block is not in core, or if it is the
63365670Shibler 		 * non-sequential block that ended our cluster and was
63465670Shibler 		 * not passed in as last_bp (we don't want to write it twice).
63565670Shibler 		 */
63665670Shibler 		if (!incore(vp, start_lbn) ||
63765670Shibler 		    last_bp == NULL && start_lbn == lbn)
63857045Smargo 			break;
63957045Smargo 
64065670Shibler 		/*
64165670Shibler 		 * Get the desired block buffer (unless it is the final
64265670Shibler 		 * sequential block whose buffer was passed in explicitly
64365670Shibler 		 * as last_bp).
64465670Shibler 		 */
64565670Shibler 		if (last_bp == NULL || start_lbn != lbn) {
64657797Smckusick 			tbp = getblk(vp, start_lbn, size, 0, 0);
64757045Smargo 			if (!(tbp->b_flags & B_DELWRI)) {
64857045Smargo 				brelse(tbp);
64957045Smargo 				break;
65057045Smargo 			}
65157045Smargo 		} else
65257045Smargo 			tbp = last_bp;
65357045Smargo 
65457045Smargo 		++b_save->bs_nchildren;
65557045Smargo 
65657045Smargo 		/* Move memory from children to parent */
65765670Shibler 		if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize))) {
65859872Smargo 			printf("Clustered Block: %d addr %x bufsize: %d\n",
65959872Smargo 			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
66059872Smargo 			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
66159872Smargo 			    tbp->b_blkno);
66259872Smargo 			panic("Clustered write to wrong blocks");
66359872Smargo 		}
66459872Smargo 
66564528Sbostic 		pagemove(tbp->b_data, cp, size);
66657045Smargo 		bp->b_bcount += size;
66757045Smargo 		bp->b_bufsize += size;
66857045Smargo 
66965670Shibler 		tbp->b_bufsize -= size;
67057045Smargo 		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
67165670Shibler 		tbp->b_flags |= (B_ASYNC | B_AGE);
67257045Smargo 		s = splbio();
67357045Smargo 		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
67457045Smargo 		++tbp->b_vp->v_numoutput;
67557045Smargo 		splx(s);
67657045Smargo 		b_save->bs_children[i] = tbp;
67757045Smargo 
67865670Shibler 		cp += size;
67957045Smargo 	}
68057045Smargo 
68157045Smargo 	if (i == 0) {
68257045Smargo 		/* None to cluster */
68357045Smargo 		bp->b_saveaddr = b_save->bs_saveaddr;
68457045Smargo 		bp->b_flags &= ~B_CALL;
68557045Smargo 		bp->b_iodone = NULL;
68657045Smargo 		free(b_save, M_SEGMENT);
68757045Smargo 	}
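	/*
	 * Start the clustered write.  If the run was cut short, skip the
	 * block that ended it and go back to cluster the remainder.
	 */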
68857045Smargo 	bawrite(bp);
68957045Smargo 	if (i < len) {
69057045Smargo 		len -= i + 1;
69157045Smargo 		start_lbn += 1;
69257045Smargo 		goto redo;
69357045Smargo 	}
69457045Smargo }
69565998Smckusick 
69665998Smckusick /*
69765998Smckusick  * Collect together all the buffers in a cluster,
69865998Smckusick  * plus one additional buffer (last_bp) supplied by the caller.
69965998Smckusick  */
70065998Smckusick struct cluster_save *
70165998Smckusick cluster_collectbufs(vp, last_bp)
70265998Smckusick 	struct vnode *vp;
70365998Smckusick 	struct buf *last_bp;
70465998Smckusick {
70565998Smckusick 	struct cluster_save *buflist;
70665998Smckusick 	daddr_t	lbn;
70765998Smckusick 	int i, len;
70865998Smckusick 
70965998Smckusick 	len = vp->v_lastw - vp->v_cstart + 1;
71065998Smckusick 	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
71165998Smckusick 	    M_SEGMENT, M_WAITOK);
71265998Smckusick 	buflist->bs_nchildren = 0;
71365998Smckusick 	buflist->bs_children = (struct buf **)(buflist + 1);
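	/*
	 * The blocks of the cluster are delayed writes and so should
	 * still be in the buffer cache; bread normally just returns the
	 * cached buffers without doing any I/O.
	 */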
71465998Smckusick 	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
71565998Smckusick 		    (void)bread(vp, lbn, last_bp->b_bcount, NOCRED,
71665998Smckusick 			&buflist->bs_children[i]);
71765998Smckusick 	buflist->bs_children[i] = last_bp;
71865998Smckusick 	buflist->bs_nchildren = i + 1;
71965998Smckusick 	return (buflist);
72065998Smckusick }
721