xref: /csrg-svn/sys/kern/vfs_cluster.c (revision 68320)
149589Sbostic /*-
263180Sbostic  * Copyright (c) 1993
363180Sbostic  *	The Regents of the University of California.  All rights reserved.
423395Smckusick  *
559878Smckusick  * %sccs.include.redist.c%
637736Smckusick  *
7*68320Scgd  *	@(#)vfs_cluster.c	8.9 (Berkeley) 02/14/95
823395Smckusick  */
98Sbill 
1051455Sbostic #include <sys/param.h>
1151455Sbostic #include <sys/proc.h>
1251455Sbostic #include <sys/buf.h>
1351455Sbostic #include <sys/vnode.h>
1451455Sbostic #include <sys/mount.h>
1551455Sbostic #include <sys/trace.h>
1659878Smckusick #include <sys/malloc.h>
1751455Sbostic #include <sys/resourcevar.h>
1856395Smckusick #include <libkern/libkern.h>
198Sbill 
2066080Shibler #ifdef DEBUG
2166080Shibler #include <vm/vm.h>
2266080Shibler #include <sys/sysctl.h>
2366080Shibler int doreallocblks = 1;
2466080Shibler struct ctldebug debug13 = { "doreallocblks", &doreallocblks };
2566080Shibler #else
2666080Shibler /* XXX for cluster_write */
2766080Shibler #define doreallocblks 1
2866080Shibler #endif
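/*
 * doreallocblks controls whether cluster_write below may call
 * VOP_REALLOCBLKS to try to lay a logically sequential but physically
 * fragmented cluster out contiguously on disk, instead of simply
 * pushing the previous cluster.
 */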
2966080Shibler 
3091Sbill /*
3157045Smargo  * Local declarations
3257045Smargo  */
3357045Smargo struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
3457045Smargo 	    daddr_t, long, int));
3557045Smargo struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
3657045Smargo 	    daddr_t, daddr_t, long, int, long));
3764717Smckusick void	    cluster_wbuild __P((struct vnode *, struct buf *, long,
3864717Smckusick 	    daddr_t, int, daddr_t));
3965998Smckusick struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *));
4057045Smargo 
4165670Shibler #ifdef DIAGNOSTIC
4256395Smckusick /*
4365670Shibler  * Set to 1 if reads of block zero should cause readahead to be done.
4465670Shibler  * Set to 0 treats a read of block zero as a non-sequential read.
4557045Smargo  *
4665670Shibler  * Setting to one assumes that most reads of block zero of files are due to
4765670Shibler  * sequential passes over the files (e.g. cat, sum) where additional blocks
4865670Shibler  * will soon be needed.  Setting to zero assumes that the majority are
4965670Shibler  * surgical strikes to get particular info (e.g. size, file) where readahead
5065670Shibler  * blocks will not be used and, in fact, push out other potentially useful
5165670Shibler  * blocks from the cache.  The former seems intuitive, but some quick tests
5265670Shibler  * showed that the latter performed better from a system-wide point of view.
5365670Shibler  */
5465670Shibler int	doclusterraz = 0;
5565670Shibler #define ISSEQREAD(vp, blk) \
5665670Shibler 	(((blk) != 0 || doclusterraz) && \
5765670Shibler 	 ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
5865670Shibler #else
5965670Shibler #define ISSEQREAD(vp, blk) \
6065670Shibler 	((blk) != 0 && ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
6165670Shibler #endif
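/*
 * Example: with vp->v_lastr == 4, ISSEQREAD is true for a read of block
 * 4 (a re-read) or block 5, and false for any other block.  A read of
 * block 0 is only ever treated as sequential when doclusterraz is set
 * (DIAGNOSTIC kernels).
 */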
6265670Shibler 
6365670Shibler /*
6457045Smargo  * This replaces bread.  If this is a bread at the beginning of a file and
6557045Smargo  * lastr is 0, we assume this is the first read and we'll read up to two
6657045Smargo  * blocks if they are sequential.  After that, we'll do regular read ahead
6757045Smargo  * in clustered chunks.
6857045Smargo  *
6957045Smargo  * There are 4 or 5 cases depending on how you count:
7057045Smargo  *	Desired block is in the cache:
7157045Smargo  *	    1 Not sequential access (0 I/Os).
7257045Smargo  *	    2 Access is sequential, do read-ahead (1 ASYNC).
7357045Smargo  *	Desired block is not in cache:
7457045Smargo  *	    3 Not sequential access (1 SYNC).
7557045Smargo  *	    4 Sequential access, next block is contiguous (1 SYNC).
7657045Smargo  *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
7757045Smargo  *
7857045Smargo  * There are potentially two buffers that require I/O.
7957045Smargo  * 	bp is the block requested.
8057045Smargo  *	rbp is the read-ahead block.
8157045Smargo  *	If either is NULL, then you don't have to do the I/O.
8257045Smargo  */
8357045Smargo cluster_read(vp, filesize, lblkno, size, cred, bpp)
8457045Smargo 	struct vnode *vp;
8557045Smargo 	u_quad_t filesize;
8657045Smargo 	daddr_t lblkno;
8757045Smargo 	long size;
8857045Smargo 	struct ucred *cred;
8957045Smargo 	struct buf **bpp;
9057045Smargo {
9157045Smargo 	struct buf *bp, *rbp;
9257045Smargo 	daddr_t blkno, ioblkno;
9357045Smargo 	long flags;
9457045Smargo 	int error, num_ra, alreadyincore;
9557045Smargo 
9657045Smargo #ifdef DIAGNOSTIC
9757045Smargo 	if (size == 0)
9857045Smargo 		panic("cluster_read: size = 0");
9957045Smargo #endif
10057045Smargo 
10157045Smargo 	error = 0;
10257045Smargo 	flags = B_READ;
10357797Smckusick 	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
10465670Shibler 	if (bp->b_flags & B_CACHE) {
10557045Smargo 		/*
10657045Smargo 		 * Desired block is in cache; do any readahead ASYNC.
10757045Smargo 		 * Case 1, 2.
10857045Smargo 		 */
10957045Smargo 		trace(TR_BREADHIT, pack(vp, size), lblkno);
11057045Smargo 		flags |= B_ASYNC;
11165670Shibler 		ioblkno = lblkno + (vp->v_ralen ? vp->v_ralen : 1);
112*68320Scgd 		alreadyincore = incore(vp, ioblkno) != NULL;
11357045Smargo 		bp = NULL;
11457045Smargo 	} else {
11557045Smargo 		/* Block wasn't in cache, case 3, 4, 5. */
11657045Smargo 		trace(TR_BREADMISS, pack(vp, size), lblkno);
11765670Shibler 		bp->b_flags |= B_READ;
11857045Smargo 		ioblkno = lblkno;
11957045Smargo 		alreadyincore = 0;
12057045Smargo 		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
12157045Smargo 	}
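	/*
	 * At this point ioblkno is the candidate readahead block: one
	 * window beyond the requested block on a cache hit, or the
	 * requested block itself on a miss (in which case alreadyincore
	 * is simply 0).
	 */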
12257045Smargo 	/*
12357045Smargo 	 * XXX
12457045Smargo 	 * Replace 1 with a window size based on some permutation of
12557045Smargo 	 * maxcontig and rot_delay.  This will let you figure out how
12657045Smargo 	 * many blocks you should read-ahead (case 2, 4, 5).
12757045Smargo 	 *
12865670Shibler 	 * If the access isn't sequential, reset the window to 1.
12965670Shibler 	 * Note that a read to the same block is considered sequential.
13065670Shibler 	 * This catches the case where the file is being read sequentially,
13165670Shibler 	 * but in chunks smaller than the filesystem block size.
13257045Smargo 	 */
13357045Smargo 	rbp = NULL;
13465670Shibler 	if (!ISSEQREAD(vp, lblkno)) {
13565670Shibler 		vp->v_ralen = 0;
13665670Shibler 		vp->v_maxra = lblkno;
13765670Shibler 	} else if ((ioblkno + 1) * size <= filesize && !alreadyincore &&
13864717Smckusick 	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) &&
13964717Smckusick 	    blkno != -1) {
14057045Smargo 		/*
14157045Smargo 		 * Reading sequentially, and the next block is not in the
14265670Shibler 		 * cache.  We are going to try reading ahead.
14357045Smargo 		 */
14465670Shibler 		if (num_ra) {
14565670Shibler 			/*
14665670Shibler 			 * If our desired readahead block had been read
14765670Shibler 			 * in a previous readahead but is no longer in
14865670Shibler 			 * core, then we may be reading ahead too far
14965670Shibler 			 * or are not using our readahead very rapidly.
15065670Shibler 			 * In this case we scale back the window.
15165670Shibler 			 */
15265670Shibler 			if (!alreadyincore && ioblkno <= vp->v_maxra)
15365670Shibler 				vp->v_ralen = max(vp->v_ralen >> 1, 1);
15465670Shibler 			/*
15565670Shibler 			 * There are more sequential blocks than our current
15665670Shibler 			 * window allows, scale up.  Ideally we want to get
15765670Shibler 			 * in sync with the filesystem maxcontig value.
15865670Shibler 			 */
15965670Shibler 			else if (num_ra > vp->v_ralen && lblkno != vp->v_lastr)
16065670Shibler 				vp->v_ralen = vp->v_ralen ?
16165670Shibler 					min(num_ra, vp->v_ralen << 1) : 1;
16257045Smargo 
16365670Shibler 			if (num_ra > vp->v_ralen)
16465670Shibler 				num_ra = vp->v_ralen;
16565670Shibler 		}
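		/*
		 * Example: with v_ralen == 2, num_ra == 7 from VOP_BMAP,
		 * readahead fully consumed (ioblkno > v_maxra), and
		 * lblkno != v_lastr, the window doubles to 4 and num_ra is
		 * clipped to 4, so the cluster built below covers ioblkno
		 * plus up to 4 following blocks.
		 */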
16657045Smargo 
16757045Smargo 		if (num_ra)				/* case 2, 4 */
16857045Smargo 			rbp = cluster_rbuild(vp, filesize,
16957045Smargo 			    bp, ioblkno, blkno, size, num_ra, flags);
17065670Shibler 		else if (ioblkno == lblkno) {
17165670Shibler 			bp->b_blkno = blkno;
17257045Smargo 			/* Case 5: check how many blocks to read ahead */
17357045Smargo 			++ioblkno;
17457045Smargo 			if ((ioblkno + 1) * size > filesize ||
17565670Shibler 			    incore(vp, ioblkno) || (error = VOP_BMAP(vp,
17665670Shibler 			     ioblkno, NULL, &blkno, &num_ra)) || blkno == -1)
17757045Smargo 				goto skip_readahead;
17865670Shibler 			/*
17967578Shibler 			 * Adjust readahead as above.
18067578Shibler 			 * Don't check alreadyincore, we know it is 0 from
18167578Shibler 			 * the previous conditional.
18265670Shibler 			 */
18365670Shibler 			if (num_ra) {
18467578Shibler 				if (ioblkno <= vp->v_maxra)
18565670Shibler 					vp->v_ralen = max(vp->v_ralen >> 1, 1);
18665670Shibler 				else if (num_ra > vp->v_ralen &&
18765670Shibler 					 lblkno != vp->v_lastr)
18865670Shibler 					vp->v_ralen = vp->v_ralen ?
18965670Shibler 						min(num_ra,vp->v_ralen<<1) : 1;
19065670Shibler 				if (num_ra > vp->v_ralen)
19165670Shibler 					num_ra = vp->v_ralen;
19265670Shibler 			}
19357045Smargo 			flags |= B_ASYNC;
19457045Smargo 			if (num_ra)
19557045Smargo 				rbp = cluster_rbuild(vp, filesize,
19657045Smargo 				    NULL, ioblkno, blkno, size, num_ra, flags);
19757045Smargo 			else {
19857797Smckusick 				rbp = getblk(vp, ioblkno, size, 0, 0);
19957045Smargo 				rbp->b_flags |= flags;
20057045Smargo 				rbp->b_blkno = blkno;
20157045Smargo 			}
20265670Shibler 		} else {
20357045Smargo 			/* case 2; read ahead single block */
20457797Smckusick 			rbp = getblk(vp, ioblkno, size, 0, 0);
20557045Smargo 			rbp->b_flags |= flags;
20657045Smargo 			rbp->b_blkno = blkno;
20765670Shibler 		}
20857045Smargo 
20965670Shibler 		if (rbp == bp)			/* case 4 */
21057045Smargo 			rbp = NULL;
21157045Smargo 		else if (rbp) {			/* case 2, 5 */
21257045Smargo 			trace(TR_BREADMISSRA,
21357045Smargo 			    pack(vp, (num_ra + 1) * size), ioblkno);
21457045Smargo 			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
21557045Smargo 		}
21657045Smargo 	}
21757045Smargo 
21857045Smargo 	/* XXX Kirk, do we need to make sure the bp has creds? */
21957045Smargo skip_readahead:
22057045Smargo 	if (bp)
22157045Smargo 		if (bp->b_flags & (B_DONE | B_DELWRI))
22257045Smargo 			panic("cluster_read: DONE bp");
22357045Smargo 		else
22457045Smargo 			error = VOP_STRATEGY(bp);
22557045Smargo 
22657045Smargo 	if (rbp)
22757045Smargo 		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
22857045Smargo 			rbp->b_flags &= ~(B_ASYNC | B_READ);
22957045Smargo 			brelse(rbp);
23057045Smargo 		} else
23157045Smargo 			(void) VOP_STRATEGY(rbp);
23257045Smargo 
23365670Shibler 	/*
23465670Shibler 	 * Recalculate our maximum readahead
23565670Shibler 	 */
23665670Shibler 	if (rbp == NULL)
23765670Shibler 		rbp = bp;
23865670Shibler 	if (rbp)
23965670Shibler 		vp->v_maxra = rbp->b_lblkno + (rbp->b_bufsize / size) - 1;
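	/*
	 * v_maxra now names the last logical block for which a read has
	 * been issued; the window-scaling code above consults it to notice
	 * when readahead is getting ahead of the reader.
	 */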
24065670Shibler 
24157045Smargo 	if (bp)
24257045Smargo 		return(biowait(bp));
24357045Smargo 	return(error);
24457045Smargo }
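/*
 * Typical use (sketch, not part of this file): a filesystem read routine
 * passes the file size and the current logical block and then waits for
 * the returned buffer, roughly
 *
 *	error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, &bp);
 *
 * where ip->i_size stands for however the filesystem tracks file length.
 */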
24557045Smargo 
24657045Smargo /*
24757045Smargo  * If blocks are contiguous on disk, use this to provide clustered
24857045Smargo  * read ahead.  We will read as many blocks as possible sequentially
24957045Smargo  * and then parcel them up into logical blocks in the buffer hash table.
25057045Smargo  */
25157045Smargo struct buf *
25257045Smargo cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
25357045Smargo 	struct vnode *vp;
25457045Smargo 	u_quad_t filesize;
25557045Smargo 	struct buf *bp;
25657045Smargo 	daddr_t lbn;
25757045Smargo 	daddr_t blkno;
25857045Smargo 	long size;
25957045Smargo 	int run;
26057045Smargo 	long flags;
26157045Smargo {
26257045Smargo 	struct cluster_save *b_save;
26357045Smargo 	struct buf *tbp;
26457045Smargo 	daddr_t bn;
26557045Smargo 	int i, inc;
26657045Smargo 
26759872Smargo #ifdef DIAGNOSTIC
26859872Smargo 	if (size != vp->v_mount->mnt_stat.f_iosize)
26959872Smargo 		panic("cluster_rbuild: size %d != fs iosize %d\n",
27059872Smargo 			size, vp->v_mount->mnt_stat.f_iosize);
27159872Smargo #endif
27257045Smargo 	if (size * (lbn + run + 1) > filesize)
27357045Smargo 		--run;
27457045Smargo 	if (run == 0) {
27557045Smargo 		if (!bp) {
27657797Smckusick 			bp = getblk(vp, lbn, size, 0, 0);
27757045Smargo 			bp->b_blkno = blkno;
27857045Smargo 			bp->b_flags |= flags;
27957045Smargo 		}
28057045Smargo 		return(bp);
28157045Smargo 	}
28257045Smargo 
28357045Smargo 	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
28457045Smargo 	if (bp->b_flags & (B_DONE | B_DELWRI))
28557045Smargo 		return (bp);
28657045Smargo 
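	/*
	 * The cluster_save header and its bs_children pointer array come
	 * from a single allocation; the array sits immediately after the
	 * header, which is why bs_children is pointed at b_save + 1 below.
	 */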
28757045Smargo 	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
28857045Smargo 	    M_SEGMENT, M_WAITOK);
28957045Smargo 	b_save->bs_bufsize = b_save->bs_bcount = size;
29057045Smargo 	b_save->bs_nchildren = 0;
29157045Smargo 	b_save->bs_children = (struct buf **)(b_save + 1);
29257045Smargo 	b_save->bs_saveaddr = bp->b_saveaddr;
29357045Smargo 	bp->b_saveaddr = (caddr_t) b_save;
29457045Smargo 
29565670Shibler 	inc = btodb(size);
29657045Smargo 	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
29767578Shibler 		/*
29867578Shibler 		 * A component of the cluster is already in core,
29967578Shibler 		 * terminate the cluster early.
30067578Shibler 		 */
30167578Shibler 		if (incore(vp, lbn + i))
30257045Smargo 			break;
30357797Smckusick 		tbp = getblk(vp, lbn + i, 0, 0, 0);
30465670Shibler 		/*
30565670Shibler 		 * getblk may return some memory in the buffer if there were
30665670Shibler 		 * no empty buffers to shed it to.  If there is currently
30765670Shibler 		 * memory in the buffer, we move it down size bytes to make
30865670Shibler 		 * room for the valid pages that cluster_callback will insert.
30965670Shibler 		 * We do this now so we don't have to do it at interrupt time
31065670Shibler 		 * in the callback routine.
31165670Shibler 		 */
31265670Shibler 		if (tbp->b_bufsize != 0) {
31365670Shibler 			caddr_t bdata = (char *)tbp->b_data;
31465670Shibler 
31567578Shibler 			/*
31667578Shibler 			 * No room in the buffer to add another page,
31767578Shibler 			 * terminate the cluster early.
31867578Shibler 			 */
31967578Shibler 			if (tbp->b_bufsize + size > MAXBSIZE) {
32067578Shibler #ifdef DIAGNOSTIC
32167578Shibler 				if (tbp->b_bufsize != MAXBSIZE)
32267578Shibler 					panic("cluster_rbuild: too much memory");
32367578Shibler #endif
32467578Shibler 				brelse(tbp);
32567578Shibler 				break;
32667578Shibler 			}
32765670Shibler 			if (tbp->b_bufsize > size) {
32865670Shibler 				/*
32965670Shibler 				 * XXX if the source and destination regions
33065670Shibler 				 * overlap we have to copy backward to avoid
33165670Shibler 				 * clobbering any valid pages (i.e. pagemove
33265670Shibler 				 * implementations typically can't handle
33365670Shibler 				 * overlap).
33465670Shibler 				 */
33565670Shibler 				bdata += tbp->b_bufsize;
33665670Shibler 				while (bdata > (char *)tbp->b_data) {
33765670Shibler 					bdata -= CLBYTES;
33865670Shibler 					pagemove(bdata, bdata + size, CLBYTES);
33965670Shibler 				}
34065670Shibler 			} else
34165670Shibler 				pagemove(bdata, bdata + size, tbp->b_bufsize);
34265670Shibler 		}
34357045Smargo 		tbp->b_blkno = bn;
34457045Smargo 		tbp->b_flags |= flags | B_READ | B_ASYNC;
34557045Smargo 		++b_save->bs_nchildren;
34657045Smargo 		b_save->bs_children[i - 1] = tbp;
34757045Smargo 	}
34867578Shibler 	/*
34967578Shibler 	 * The cluster may have been terminated early, adjust the cluster
35067578Shibler 	 * buffer size accordingly.  If no cluster could be formed,
35167578Shibler 	 * deallocate the cluster save info.
35267578Shibler 	 */
35367578Shibler 	if (i <= run) {
35467578Shibler 		if (i == 1) {
35567578Shibler 			bp->b_saveaddr = b_save->bs_saveaddr;
35667578Shibler 			bp->b_flags &= ~B_CALL;
35767578Shibler 			bp->b_iodone = NULL;
35867578Shibler 			free(b_save, M_SEGMENT);
35967578Shibler 		}
36067578Shibler 		allocbuf(bp, size * i);
36167578Shibler 	}
36257045Smargo 	return(bp);
36357045Smargo }
36457045Smargo 
36557045Smargo /*
36657045Smargo  * Either get a new buffer or grow the existing one.
36757045Smargo  */
36857045Smargo struct buf *
36957045Smargo cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
37057045Smargo 	struct vnode *vp;
37157045Smargo 	struct buf *bp;
37257045Smargo 	long flags;
37357045Smargo 	daddr_t blkno;
37457045Smargo 	daddr_t lblkno;
37557045Smargo 	long size;
37657045Smargo 	int run;
37757045Smargo {
37857045Smargo 	if (!bp) {
37957797Smckusick 		bp = getblk(vp, lblkno, size, 0, 0);
38057045Smargo 		if (bp->b_flags & (B_DONE | B_DELWRI)) {
38157045Smargo 			bp->b_blkno = blkno;
38257045Smargo 			return(bp);
38357045Smargo 		}
38457045Smargo 	}
38557045Smargo 	allocbuf(bp, run * size);
38657045Smargo 	bp->b_blkno = blkno;
38757045Smargo 	bp->b_iodone = cluster_callback;
38857045Smargo 	bp->b_flags |= flags | B_CALL;
38957045Smargo 	return(bp);
39057045Smargo }
39157045Smargo 
39257045Smargo /*
39357045Smargo  * Cleanup after a clustered read or write.
39465670Shibler  * This is complicated by the fact that any of the buffers might have
39565670Shibler  * extra memory (if there were no empty buffer headers at allocbuf time)
39665670Shibler  * that we will need to shift around.
39757045Smargo  */
39857045Smargo void
39957045Smargo cluster_callback(bp)
40057045Smargo 	struct buf *bp;
40157045Smargo {
40257045Smargo 	struct cluster_save *b_save;
40365670Shibler 	struct buf **bpp, *tbp;
40465670Shibler 	long bsize;
40557045Smargo 	caddr_t cp;
40665670Shibler 	int error = 0;
40764717Smckusick 
40865670Shibler 	/*
40965670Shibler 	 * Must propagate errors to all the components.
41065670Shibler 	 */
41165670Shibler 	if (bp->b_flags & B_ERROR)
41265670Shibler 		error = bp->b_error;
41365670Shibler 
41457045Smargo 	b_save = (struct cluster_save *)(bp->b_saveaddr);
41557045Smargo 	bp->b_saveaddr = b_save->bs_saveaddr;
41657045Smargo 
41765670Shibler 	bsize = b_save->bs_bufsize;
41865670Shibler 	cp = (char *)bp->b_data + bsize;
41965670Shibler 	/*
42065670Shibler 	 * Move memory from the large cluster buffer into the component
42165670Shibler 	 * buffers and mark IO as done on these.
42265670Shibler 	 */
42365670Shibler 	for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) {
42465670Shibler 		tbp = *bpp;
42565670Shibler 		pagemove(cp, tbp->b_data, bsize);
42665670Shibler 		tbp->b_bufsize += bsize;
42765670Shibler 		tbp->b_bcount = bsize;
42865670Shibler 		if (error) {
42965670Shibler 			tbp->b_flags |= B_ERROR;
43065670Shibler 			tbp->b_error = error;
43165670Shibler 		}
43265670Shibler 		biodone(tbp);
43365670Shibler 		bp->b_bufsize -= bsize;
43465670Shibler 		cp += bsize;
43557045Smargo 	}
43665670Shibler 	/*
43765670Shibler 	 * If there was excess memory in the cluster buffer,
43865670Shibler 	 * slide it up adjacent to the remaining valid data.
43965670Shibler 	 */
44065670Shibler 	if (bp->b_bufsize != bsize) {
44165670Shibler 		if (bp->b_bufsize < bsize)
44265670Shibler 			panic("cluster_callback: too little memory");
44365670Shibler 		pagemove(cp, (char *)bp->b_data + bsize, bp->b_bufsize - bsize);
44465670Shibler 	}
44565670Shibler 	bp->b_bcount = bsize;
44657045Smargo 	bp->b_iodone = NULL;
44757045Smargo 	free(b_save, M_SEGMENT);
44857045Smargo 	if (bp->b_flags & B_ASYNC)
44957045Smargo 		brelse(bp);
45065670Shibler 	else {
45165670Shibler 		bp->b_flags &= ~B_WANTED;
45257045Smargo 		wakeup((caddr_t)bp);
45365670Shibler 	}
45457045Smargo }
45557045Smargo 
45657045Smargo /*
45757045Smargo  * Do clustered write for FFS.
45857045Smargo  *
45957045Smargo  * Four cases:
46057045Smargo  *	1. Write is not sequential (write asynchronously)
46157045Smargo  *	Write is sequential:
46257045Smargo  *	2.	beginning of cluster - begin cluster
46357045Smargo  *	3.	middle of a cluster - add to cluster
46457045Smargo  *	4.	end of a cluster - asynchronously write cluster
46557045Smargo  */
46657045Smargo void
46757045Smargo cluster_write(bp, filesize)
46857045Smargo         struct buf *bp;
46957045Smargo 	u_quad_t filesize;
47057045Smargo {
47157045Smargo         struct vnode *vp;
47257045Smargo         daddr_t lbn;
47365998Smckusick         int maxclen, cursize;
47457045Smargo 
47557045Smargo         vp = bp->b_vp;
47657045Smargo         lbn = bp->b_lblkno;
47757045Smargo 
47859872Smargo 	/* Initialize vnode to beginning of file. */
47959872Smargo 	if (lbn == 0)
48059872Smargo 		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
48159872Smargo 
48259872Smargo         if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
48365670Shibler 	    (bp->b_blkno != vp->v_lasta + btodb(bp->b_bcount))) {
48465998Smckusick 		maxclen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
48565998Smckusick 		if (vp->v_clen != 0) {
48657045Smargo 			/*
48765998Smckusick 			 * Next block is not sequential.
48865998Smckusick 			 *
48965998Smckusick 			 * If we are not writing at end of file, the process
49065998Smckusick 			 * seeked to another point in the file since its
49165998Smckusick 			 * last write, or we have reached our maximum
49265998Smckusick 			 * cluster size, then push the previous cluster.
49365998Smckusick 			 * Otherwise try reallocating to make it sequential.
49457045Smargo 			 */
49565998Smckusick 			cursize = vp->v_lastw - vp->v_cstart + 1;
49666080Shibler 			if (!doreallocblks ||
49766080Shibler 			    (lbn + 1) * bp->b_bcount != filesize ||
49865998Smckusick 			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
49965998Smckusick 				cluster_wbuild(vp, NULL, bp->b_bcount,
50065998Smckusick 				    vp->v_cstart, cursize, lbn);
50165998Smckusick 			} else {
50265998Smckusick 				struct buf **bpp, **endbp;
50365998Smckusick 				struct cluster_save *buflist;
50465998Smckusick 
50565998Smckusick 				buflist = cluster_collectbufs(vp, bp);
50665998Smckusick 				endbp = &buflist->bs_children
50765998Smckusick 				    [buflist->bs_nchildren - 1];
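				/*
				 * Note the loop bounds below: on failure,
				 * every child except the last (the caller's
				 * bp) is released and the old cluster is
				 * pushed, leaving bp to start a new cluster
				 * further down; on success every buffer,
				 * bp included, goes back out as a delayed
				 * write and we are done.
				 */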
50865998Smckusick 				if (VOP_REALLOCBLKS(vp, buflist)) {
50965998Smckusick 					/*
51065998Smckusick 					 * Failed, push the previous cluster.
51165998Smckusick 					 */
51265998Smckusick 					for (bpp = buflist->bs_children;
51365998Smckusick 					     bpp < endbp; bpp++)
51465998Smckusick 						brelse(*bpp);
51565998Smckusick 					free(buflist, M_SEGMENT);
51665998Smckusick 					cluster_wbuild(vp, NULL, bp->b_bcount,
51765998Smckusick 					    vp->v_cstart, cursize, lbn);
51865998Smckusick 				} else {
51965998Smckusick 					/*
52065998Smckusick 					 * Succeeded, keep building cluster.
52165998Smckusick 					 */
52265998Smckusick 					for (bpp = buflist->bs_children;
52365998Smckusick 					     bpp <= endbp; bpp++)
52465998Smckusick 						bdwrite(*bpp);
52565998Smckusick 					free(buflist, M_SEGMENT);
52665998Smckusick 					vp->v_lastw = lbn;
52765998Smckusick 					vp->v_lasta = bp->b_blkno;
52865998Smckusick 					return;
52965998Smckusick 				}
53065998Smckusick 			}
53165998Smckusick 		}
53257045Smargo 		/*
53357045Smargo 		 * Consider beginning a cluster.
53465998Smckusick 		 * If at end of file, make cluster as large as possible,
53565998Smckusick 		 * otherwise find size of existing cluster.
53657045Smargo 		 */
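		/*
		 * VOP_BMAP returns in maxclen the number of blocks that
		 * follow lbn contiguously on disk, which bounds how large
		 * the new cluster may grow.
		 */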
53765998Smckusick 		if ((lbn + 1) * bp->b_bcount != filesize &&
53865998Smckusick 		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) ||
53965998Smckusick 		     bp->b_blkno == -1)) {
54057045Smargo 			bawrite(bp);
54159872Smargo 			vp->v_clen = 0;
54259872Smargo 			vp->v_lasta = bp->b_blkno;
54357045Smargo 			vp->v_cstart = lbn + 1;
54457045Smargo 			vp->v_lastw = lbn;
54557045Smargo 			return;
54664717Smckusick 		}
54765998Smckusick                 vp->v_clen = maxclen;
54865998Smckusick                 if (maxclen == 0) {		/* I/O not contiguous */
54957045Smargo 			vp->v_cstart = lbn + 1;
55057045Smargo                         bawrite(bp);
55157045Smargo                 } else {			/* Wait for rest of cluster */
55257045Smargo 			vp->v_cstart = lbn;
55357045Smargo                         bdwrite(bp);
55457045Smargo 		}
55565670Shibler 	} else if (lbn == vp->v_cstart + vp->v_clen) {
55657045Smargo 		/*
55757045Smargo 		 * At end of cluster, write it out.
55857045Smargo 		 */
55957045Smargo 		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
56057045Smargo 		    vp->v_clen + 1, lbn);
56157045Smargo 		vp->v_clen = 0;
56257045Smargo 		vp->v_cstart = lbn + 1;
56365670Shibler 	} else
56457045Smargo 		/*
56557045Smargo 		 * In the middle of a cluster, so just delay the
56657045Smargo 		 * I/O for now.
56757045Smargo 		 */
56865670Shibler 		bdwrite(bp);
56965670Shibler 	vp->v_lastw = lbn;
57059872Smargo 	vp->v_lasta = bp->b_blkno;
57157045Smargo }
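/*
 * Typical use (sketch, not part of this file): the filesystem write path
 * hands each newly filled full block to cluster_write along with the
 * current file size, roughly
 *
 *	cluster_write(bp, ip->i_size);
 *
 * in place of a plain bawrite/bdwrite (illustrative only).
 */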
57257045Smargo 
57357045Smargo 
57457045Smargo /*
57557045Smargo  * This is an awful lot like cluster_rbuild...wish they could be combined.
57657045Smargo  * The last lbn argument is the current block on which I/O is being
57757045Smargo  * performed.  Check to see that it doesn't fall in the middle of
57865670Shibler  * the current cluster (if last_bp == NULL).
57957045Smargo  */
58057045Smargo void
58157045Smargo cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
58257045Smargo 	struct vnode *vp;
58357045Smargo 	struct buf *last_bp;
58457045Smargo 	long size;
58557045Smargo 	daddr_t start_lbn;
58657045Smargo 	int len;
58757045Smargo 	daddr_t	lbn;
58857045Smargo {
58957045Smargo 	struct cluster_save *b_save;
59057045Smargo 	struct buf *bp, *tbp;
59157045Smargo 	caddr_t	cp;
59257045Smargo 	int i, s;
59357045Smargo 
59459872Smargo #ifdef DIAGNOSTIC
59559872Smargo 	if (size != vp->v_mount->mnt_stat.f_iosize)
59659872Smargo 		panic("cluster_wbuild: size %d != fs iosize %d\n",
59759872Smargo 			size, vp->v_mount->mnt_stat.f_iosize);
59859872Smargo #endif
59957045Smargo redo:
60057045Smargo 	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
60157045Smargo 		++start_lbn;
60257045Smargo 		--len;
60357045Smargo 	}
60457045Smargo 
60557045Smargo 	/* Nothing left to cluster with; just write what we have. */
60657045Smargo 	if (len <= 1) {
60759872Smargo 		if (last_bp) {
60857045Smargo 			bawrite(last_bp);
60959872Smargo 		} else if (len) {
61059872Smargo 			bp = getblk(vp, start_lbn, size, 0, 0);
61159872Smargo 			bawrite(bp);
61259872Smargo 		}
61357045Smargo 		return;
61457045Smargo 	}
61557045Smargo 
61657797Smckusick 	bp = getblk(vp, start_lbn, size, 0, 0);
61757045Smargo 	if (!(bp->b_flags & B_DELWRI)) {
61857045Smargo 		++start_lbn;
61957045Smargo 		--len;
62057045Smargo 		brelse(bp);
62157045Smargo 		goto redo;
62257045Smargo 	}
62357045Smargo 
62465670Shibler 	/*
62565670Shibler 	 * Extra memory in the buffer, punt on this buffer.
62665670Shibler 	 * XXX we could handle this in most cases, but we would have to
62765670Shibler 	 * push the extra memory down to after our max possible cluster
62865670Shibler 	 * size and then potentially pull it back up if the cluster was
62965670Shibler 	 * terminated prematurely--too much hassle.
63065670Shibler 	 */
63165670Shibler 	if (bp->b_bcount != bp->b_bufsize) {
63265670Shibler 		++start_lbn;
63365670Shibler 		--len;
63465670Shibler 		bawrite(bp);
63565670Shibler 		goto redo;
63665670Shibler 	}
63765670Shibler 
63857045Smargo 	--len;
63957045Smargo 	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
64057045Smargo 	    M_SEGMENT, M_WAITOK);
64157045Smargo 	b_save->bs_bcount = bp->b_bcount;
64257045Smargo 	b_save->bs_bufsize = bp->b_bufsize;
64357045Smargo 	b_save->bs_nchildren = 0;
64457045Smargo 	b_save->bs_children = (struct buf **)(b_save + 1);
64557045Smargo 	b_save->bs_saveaddr = bp->b_saveaddr;
64657045Smargo 	bp->b_saveaddr = (caddr_t) b_save;
64757045Smargo 
64857045Smargo 	bp->b_flags |= B_CALL;
64957045Smargo 	bp->b_iodone = cluster_callback;
65065670Shibler 	cp = (char *)bp->b_data + size;
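	/*
	 * cp tracks where the next child's pages will land in the growing
	 * cluster buffer; it advances by size for every block added below.
	 */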
65157045Smargo 	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
65265670Shibler 		/*
65365670Shibler 		 * Block is not in core or the non-sequential block
65465670Shibler 		 * ending our cluster was part of the cluster (in which
65565670Shibler 		 * case we don't want to write it twice).
65665670Shibler 		 */
65765670Shibler 		if (!incore(vp, start_lbn) ||
65865670Shibler 		    last_bp == NULL && start_lbn == lbn)
65957045Smargo 			break;
66057045Smargo 
66165670Shibler 		/*
66265670Shibler 		 * Get the desired block buffer (unless it is the final
66365670Shibler 		 * sequential block whose buffer was passed in explicitly
66465670Shibler 		 * as last_bp).
66565670Shibler 		 */
66665670Shibler 		if (last_bp == NULL || start_lbn != lbn) {
66757797Smckusick 			tbp = getblk(vp, start_lbn, size, 0, 0);
66857045Smargo 			if (!(tbp->b_flags & B_DELWRI)) {
66957045Smargo 				brelse(tbp);
67057045Smargo 				break;
67157045Smargo 			}
67257045Smargo 		} else
67357045Smargo 			tbp = last_bp;
67457045Smargo 
67557045Smargo 		++b_save->bs_nchildren;
67657045Smargo 
67757045Smargo 		/* Move memory from children to parent */
67865670Shibler 		if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize))) {
67959872Smargo 			printf("Clustered Block: %d addr %x bufsize: %d\n",
68059872Smargo 			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
68159872Smargo 			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
68259872Smargo 			    tbp->b_blkno);
68359872Smargo 			panic("Clustered write to wrong blocks");
68459872Smargo 		}
68559872Smargo 
68664528Sbostic 		pagemove(tbp->b_data, cp, size);
68757045Smargo 		bp->b_bcount += size;
68857045Smargo 		bp->b_bufsize += size;
68957045Smargo 
69065670Shibler 		tbp->b_bufsize -= size;
69157045Smargo 		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
69265670Shibler 		tbp->b_flags |= (B_ASYNC | B_AGE);
69357045Smargo 		s = splbio();
69457045Smargo 		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
69557045Smargo 		++tbp->b_vp->v_numoutput;
69657045Smargo 		splx(s);
69757045Smargo 		b_save->bs_children[i] = tbp;
69857045Smargo 
69965670Shibler 		cp += size;
70057045Smargo 	}
70157045Smargo 
70257045Smargo 	if (i == 0) {
70357045Smargo 		/* None to cluster */
70457045Smargo 		bp->b_saveaddr = b_save->bs_saveaddr;
70557045Smargo 		bp->b_flags &= ~B_CALL;
70657045Smargo 		bp->b_iodone = NULL;
70757045Smargo 		free(b_save, M_SEGMENT);
70857045Smargo 	}
70957045Smargo 	bawrite(bp);
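	/*
	 * If the scan above stopped before consuming len blocks, skip the
	 * block that ended this cluster and try to build another cluster
	 * from the remaining delayed-write blocks.
	 */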
71057045Smargo 	if (i < len) {
71157045Smargo 		len -= i + 1;
71257045Smargo 		start_lbn += 1;
71357045Smargo 		goto redo;
71457045Smargo 	}
71557045Smargo }
71665998Smckusick 
71765998Smckusick /*
71865998Smckusick  * Collect together all the buffers in a cluster.
71965998Smckusick  * Plus add one additional buffer (last_bp, the caller's current block).
72065998Smckusick  */
72165998Smckusick struct cluster_save *
72265998Smckusick cluster_collectbufs(vp, last_bp)
72365998Smckusick 	struct vnode *vp;
72465998Smckusick 	struct buf *last_bp;
72565998Smckusick {
72665998Smckusick 	struct cluster_save *buflist;
72765998Smckusick 	daddr_t	lbn;
72865998Smckusick 	int i, len;
72965998Smckusick 
73065998Smckusick 	len = vp->v_lastw - vp->v_cstart + 1;
73165998Smckusick 	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
73265998Smckusick 	    M_SEGMENT, M_WAITOK);
73365998Smckusick 	buflist->bs_nchildren = 0;
73465998Smckusick 	buflist->bs_children = (struct buf **)(buflist + 1);
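	/*
	 * The blocks being collected are the delayed-write buffers of the
	 * cluster in progress, so these breads are normally satisfied from
	 * the cache and just return each component buffer locked.
	 */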
73565998Smckusick 	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
73665998Smckusick 		    (void)bread(vp, lbn, last_bp->b_bcount, NOCRED,
73765998Smckusick 			&buflist->bs_children[i]);
73865998Smckusick 	buflist->bs_children[i] = last_bp;
73965998Smckusick 	buflist->bs_nchildren = i + 1;
74065998Smckusick 	return (buflist);
74165998Smckusick }
742