/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * %sccs.include.redist.c%
 *
 *	@(#)vfs_cluster.c	8.9 (Berkeley) 02/14/95
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <libkern/libkern.h>

#ifdef DEBUG
#include <vm/vm.h>
#include <sys/sysctl.h>
int doreallocblks = 1;
struct ctldebug debug13 = { "doreallocblks", &doreallocblks };
#else
/* XXX for cluster_write */
#define doreallocblks 1
#endif

/*
 * Local declarations
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
	    daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
	    daddr_t, daddr_t, long, int, long));
void	    cluster_wbuild __P((struct vnode *, struct buf *, long,
	    daddr_t, int, daddr_t));
struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *));

#ifdef DIAGNOSTIC
/*
 * Set to 1 if reads of block zero should cause readahead to be done.
 * Set to 0 treats a read of block zero as a non-sequential read.
 *
 * Setting to one assumes that most reads of block zero of files are due to
 * sequential passes over the files (e.g. cat, sum) where additional blocks
 * will soon be needed.  Setting to zero assumes that the majority are
 * surgical strikes to get particular info (e.g. size, file) where readahead
 * blocks will not be used and, in fact, push out other potentially useful
 * blocks from the cache.  The former seems intuitive, but some quick tests
 * showed that the latter performed better from a system-wide point of view.
 */
int	doclusterraz = 0;
#define ISSEQREAD(vp, blk) \
	(((blk) != 0 || doclusterraz) && \
	 ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#else
#define ISSEQREAD(vp, blk) \
	((blk) != 0 && ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#endif

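/*
 * Per-vnode clustering state used below: v_lastr is the last logical
 * block read (consulted by ISSEQREAD), v_ralen is the current read-ahead
 * window in blocks, and v_maxra is the last block that has been read
 * ahead.  The write side tracks v_cstart (first block of the cluster
 * being built), v_lastw (last block written), v_clen (how many blocks
 * beyond v_cstart the cluster may still grow), and v_lasta (disk address
 * of the last block, used to check contiguity).
 */
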
/*
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *
 * There are 4 or 5 cases depending on how you count:
 *	Desired block is in the cache:
 *	    1 Not sequential access (0 I/Os).
 *	    2 Access is sequential, do read-ahead (1 ASYNC).
 *	Desired block is not in cache:
 *	    3 Not sequential access (1 SYNC).
 *	    4 Sequential access, next block is contiguous (1 SYNC).
 *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 *	bp is the block requested.
 *	rbp is the read-ahead block.
 *	If either is NULL, then you don't have to do the I/O.
 */
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, ioblkno;
	long flags;
	int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
	if (size == 0)
		panic("cluster_read: size = 0");
#endif

	error = 0;
	flags = B_READ;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	if (bp->b_flags & B_CACHE) {
		/*
		 * Desired block is in cache; do any readahead ASYNC.
		 * Case 1, 2.
		 */
		trace(TR_BREADHIT, pack(vp, size), lblkno);
		flags |= B_ASYNC;
		ioblkno = lblkno + (vp->v_ralen ? vp->v_ralen : 1);
		alreadyincore = incore(vp, ioblkno) != NULL;
		bp = NULL;
	} else {
		/* Block wasn't in cache, case 3, 4, 5. */
		trace(TR_BREADMISS, pack(vp, size), lblkno);
		bp->b_flags |= B_READ;
		ioblkno = lblkno;
		alreadyincore = 0;
		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
	}
	/*
	 * XXX
	 * Replace 1 with a window size based on some permutation of
	 * maxcontig and rot_delay.  This will let you figure out how
	 * many blocks you should read-ahead (case 2, 4, 5).
	 *
	 * If the access isn't sequential, reset the window to 1.
	 * Note that a read to the same block is considered sequential.
	 * This catches the case where the file is being read sequentially,
	 * but at smaller than the filesystem block size.
	 */
	rbp = NULL;
	if (!ISSEQREAD(vp, lblkno)) {
		vp->v_ralen = 0;
		vp->v_maxra = lblkno;
	} else if ((ioblkno + 1) * size <= filesize && !alreadyincore &&
	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) &&
	    blkno != -1) {
		/*
		 * Reading sequentially, and the next block is not in the
		 * cache.  We are going to try reading ahead.
		 */
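		/*
		 * For example, if VOP_BMAP reports eight more contiguous
		 * blocks (num_ra == 8) and the current window is four
		 * blocks, the window doubles to eight; if instead the
		 * desired block had already been read ahead (ioblkno <=
		 * v_maxra) but has since been recycled, the window is
		 * halved.  num_ra is then clipped to the window.
		 */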
		if (num_ra) {
			/*
			 * If our desired readahead block had been read
			 * in a previous readahead but is no longer in
			 * core, then we may be reading ahead too far
			 * or are not using our readahead very rapidly.
			 * In this case we scale back the window.
			 */
			if (!alreadyincore && ioblkno <= vp->v_maxra)
				vp->v_ralen = max(vp->v_ralen >> 1, 1);
			/*
			 * There are more sequential blocks than our current
			 * window allows, scale up.  Ideally we want to get
			 * in sync with the filesystem maxcontig value.
			 */
			else if (num_ra > vp->v_ralen && lblkno != vp->v_lastr)
				vp->v_ralen = vp->v_ralen ?
					min(num_ra, vp->v_ralen << 1) : 1;

			if (num_ra > vp->v_ralen)
				num_ra = vp->v_ralen;
		}

		if (num_ra)				/* case 2, 4 */
			rbp = cluster_rbuild(vp, filesize,
			    bp, ioblkno, blkno, size, num_ra, flags);
		else if (ioblkno == lblkno) {
			bp->b_blkno = blkno;
			/* Case 5: check how many blocks to read ahead */
			++ioblkno;
			if ((ioblkno + 1) * size > filesize ||
			    incore(vp, ioblkno) || (error = VOP_BMAP(vp,
			     ioblkno, NULL, &blkno, &num_ra)) || blkno == -1)
				goto skip_readahead;
			/*
			 * Adjust readahead as above.
			 * Don't check alreadyincore, we know it is 0 from
			 * the previous conditional.
			 */
			if (num_ra) {
				if (ioblkno <= vp->v_maxra)
					vp->v_ralen = max(vp->v_ralen >> 1, 1);
				else if (num_ra > vp->v_ralen &&
					 lblkno != vp->v_lastr)
					vp->v_ralen = vp->v_ralen ?
						min(num_ra, vp->v_ralen << 1) : 1;
				if (num_ra > vp->v_ralen)
					num_ra = vp->v_ralen;
			}
			flags |= B_ASYNC;
			if (num_ra)
				rbp = cluster_rbuild(vp, filesize,
				    NULL, ioblkno, blkno, size, num_ra, flags);
			else {
				rbp = getblk(vp, ioblkno, size, 0, 0);
				rbp->b_flags |= flags;
				rbp->b_blkno = blkno;
			}
		} else {
			/* case 2; read ahead single block */
			rbp = getblk(vp, ioblkno, size, 0, 0);
			rbp->b_flags |= flags;
			rbp->b_blkno = blkno;
		}

		if (rbp == bp)			/* case 4 */
			rbp = NULL;
		else if (rbp) {			/* case 2, 5 */
			trace(TR_BREADMISSRA,
			    pack(vp, (num_ra + 1) * size), ioblkno);
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}

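	/*
	 * Start the I/O: bp is issued here and waited for below; rbp is
	 * issued asynchronously, or released if an error occurred or it
	 * somehow already has valid data.
	 */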
	/* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
	if (bp)
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else
			error = VOP_STRATEGY(bp);

	if (rbp)
		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else
			(void) VOP_STRATEGY(rbp);

	/*
	 * Recalculate our maximum readahead
	 */
	if (rbp == NULL)
		rbp = bp;
	if (rbp)
		vp->v_maxra = rbp->b_lblkno + (rbp->b_bufsize / size) - 1;

	if (bp)
		return(biowait(bp));
	return(error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;
	u_quad_t filesize;
	struct buf *bp;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn;
	int i, inc;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
	if (size * (lbn + run + 1) > filesize)
		--run;
	if (run == 0) {
		if (!bp) {
			bp = getblk(vp, lbn, size, 0, 0);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return(bp);
	}

	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
	if (bp->b_flags & (B_DONE | B_DELWRI))
		return (bp);

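	/*
	 * The cluster_save header and its array of child buffer pointers
	 * are carved from a single allocation; bs_children points just
	 * past the header.  The buffer's original b_saveaddr is stashed
	 * in bs_saveaddr so cluster_callback can restore it.
	 */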
	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bufsize = b_save->bs_bcount = size;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	inc = btodb(size);
	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
		/*
		 * A component of the cluster is already in core,
		 * terminate the cluster early.
		 */
		if (incore(vp, lbn + i))
			break;
		tbp = getblk(vp, lbn + i, 0, 0, 0);
		/*
		 * getblk may return some memory in the buffer if there were
		 * no empty buffers to shed it to.  If there is currently
		 * memory in the buffer, we move it down size bytes to make
		 * room for the valid pages that cluster_callback will insert.
		 * We do this now so we don't have to do it at interrupt time
		 * in the callback routine.
		 */
		if (tbp->b_bufsize != 0) {
			caddr_t bdata = (char *)tbp->b_data;

			/*
			 * No room in the buffer to add another page,
			 * terminate the cluster early.
			 */
			if (tbp->b_bufsize + size > MAXBSIZE) {
#ifdef DIAGNOSTIC
				if (tbp->b_bufsize != MAXBSIZE)
					panic("cluster_rbuild: too much memory");
#endif
				brelse(tbp);
				break;
			}
			if (tbp->b_bufsize > size) {
				/*
				 * XXX if the source and destination regions
				 * overlap we have to copy backward to avoid
				 * clobbering any valid pages (i.e. pagemove
				 * implementations typically can't handle
				 * overlap).
				 */
				bdata += tbp->b_bufsize;
				while (bdata > (char *)tbp->b_data) {
					bdata -= CLBYTES;
					pagemove(bdata, bdata + size, CLBYTES);
				}
			} else
				pagemove(bdata, bdata + size, tbp->b_bufsize);
		}
		tbp->b_blkno = bn;
		tbp->b_flags |= flags | B_READ | B_ASYNC;
		++b_save->bs_nchildren;
		b_save->bs_children[i - 1] = tbp;
	}
	/*
	 * The cluster may have been terminated early, adjust the cluster
	 * buffer size accordingly.  If no cluster could be formed,
	 * deallocate the cluster save info.
	 */
	if (i <= run) {
		if (i == 1) {
			bp->b_saveaddr = b_save->bs_saveaddr;
			bp->b_flags &= ~B_CALL;
			bp->b_iodone = NULL;
			free(b_save, M_SEGMENT);
		}
		allocbuf(bp, size * i);
	}
	return(bp);
}

/*
 * Either get a new buffer or grow the existing one.
 */
struct buf *
cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
	struct vnode *vp;
	struct buf *bp;
	long flags;
	daddr_t blkno;
	daddr_t lblkno;
	long size;
	int run;
{
	if (!bp) {
		bp = getblk(vp, lblkno, size, 0, 0);
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			bp->b_blkno = blkno;
			return(bp);
		}
	}
	allocbuf(bp, run * size);
	bp->b_blkno = blkno;
	bp->b_iodone = cluster_callback;
	bp->b_flags |= flags | B_CALL;
	return(bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct cluster_save *b_save;
	struct buf **bpp, *tbp;
	long bsize;
	caddr_t cp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

	b_save = (struct cluster_save *)(bp->b_saveaddr);
	bp->b_saveaddr = b_save->bs_saveaddr;

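	/*
	 * The first bs_bufsize bytes of the cluster buffer hold bp's own
	 * block; each child's data follows, so cp starts one block in.
	 */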
	bsize = b_save->bs_bufsize;
	cp = (char *)bp->b_data + bsize;
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) {
		tbp = *bpp;
		pagemove(cp, tbp->b_data, bsize);
		tbp->b_bufsize += bsize;
		tbp->b_bcount = bsize;
		if (error) {
			tbp->b_flags |= B_ERROR;
			tbp->b_error = error;
		}
		biodone(tbp);
		bp->b_bufsize -= bsize;
		cp += bsize;
	}
	/*
	 * If there was excess memory in the cluster buffer,
	 * slide it up adjacent to the remaining valid data.
	 */
	if (bp->b_bufsize != bsize) {
		if (bp->b_bufsize < bsize)
			panic("cluster_callback: too little memory");
		pagemove(cp, (char *)bp->b_data + bsize, bp->b_bufsize - bsize);
	}
	bp->b_bcount = bsize;
	bp->b_iodone = NULL;
	free(b_save, M_SEGMENT);
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

/*
 * Do clustered write for FFS.
 *
 * Four cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int maxclen, cursize;

	vp = bp->b_vp;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(bp->b_bcount))) {
		maxclen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
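		/*
		 * For example, with 8K filesystem blocks and a 64K
		 * MAXBSIZE, maxclen is 7, so at most eight blocks are
		 * accumulated before the cluster is pushed.
		 */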
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * has seeked to another point in the file since its
			 * last write, or we have reached our maximum
			 * cluster size, then push the previous cluster.
			 * Otherwise try reallocating to make it sequential.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
			if (!doreallocblks ||
			    (lbn + 1) * bp->b_bcount != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				cluster_wbuild(vp, NULL, bp->b_bcount,
				    vp->v_cstart, cursize, lbn);
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					cluster_wbuild(vp, NULL, bp->b_bcount,
					    vp->v_cstart, cursize, lbn);
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
		}
		/*
		 * Consider beginning a cluster.
		 * If at end of file, make cluster as large as possible,
		 * otherwise find size of existing cluster.
		 */
		if ((lbn + 1) * bp->b_bcount != filesize &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) ||
		     bp->b_blkno == -1)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (maxclen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}

/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current cluster (if last_bp == NULL).
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	caddr_t	cp;
	int i, s;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_wbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
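	/*
	 * Scan forward from start_lbn, skipping blocks that are not in
	 * core (and the block currently being written by the caller).
	 * Each pass below gathers one run of in-core delayed-write
	 * buffers into a cluster buffer, issues it, and jumps back here
	 * to handle any remainder of the range.
	 */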
redo:
	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
		++start_lbn;
		--len;
	}

	/* Get more memory for current buffer */
	if (len <= 1) {
		if (last_bp) {
			bawrite(last_bp);
		} else if (len) {
			bp = getblk(vp, start_lbn, size, 0, 0);
			bawrite(bp);
		}
		return;
	}

	bp = getblk(vp, start_lbn, size, 0, 0);
	if (!(bp->b_flags & B_DELWRI)) {
		++start_lbn;
		--len;
		brelse(bp);
		goto redo;
	}

	/*
	 * Extra memory in the buffer, punt on this buffer.
	 * XXX we could handle this in most cases, but we would have to
	 * push the extra memory down to after our max possible cluster
	 * size and then potentially pull it back up if the cluster was
	 * terminated prematurely--too much hassle.
	 */
	if (bp->b_bcount != bp->b_bufsize) {
		++start_lbn;
		--len;
		bawrite(bp);
		goto redo;
	}

	--len;
	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bcount = bp->b_bcount;
	b_save->bs_bufsize = bp->b_bufsize;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	bp->b_flags |= B_CALL;
	bp->b_iodone = cluster_callback;
	cp = (char *)bp->b_data + size;
	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
		/*
		 * Block is not in core or the non-sequential block
		 * ending our cluster was part of the cluster (in which
		 * case we don't want to write it twice).
		 */
		if (!incore(vp, start_lbn) ||
		    last_bp == NULL && start_lbn == lbn)
			break;

		/*
		 * Get the desired block buffer (unless it is the final
		 * sequential block whose buffer was passed in explicitly
		 * as last_bp).
		 */
		if (last_bp == NULL || start_lbn != lbn) {
			tbp = getblk(vp, start_lbn, size, 0, 0);
			if (!(tbp->b_flags & B_DELWRI)) {
				brelse(tbp);
				break;
			}
		} else
			tbp = last_bp;

		++b_save->bs_nchildren;

		/* Move memory from children to parent */
		if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize))) {
			printf("Clustered Block: %d addr %x bufsize: %d\n",
			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
			    tbp->b_blkno);
			panic("Clustered write to wrong blocks");
		}

		pagemove(tbp->b_data, cp, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;

		tbp->b_bufsize -= size;
		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= (B_ASYNC | B_AGE);
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;

		cp += size;
	}

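	/*
	 * If no neighboring delayed-write block could be gathered, undo
	 * the cluster_callback setup; bawrite() then issues either the
	 * single block or the assembled cluster.  Any blocks left in the
	 * range are handled by another pass.
	 */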
	if (i == 0) {
		/* None to cluster */
		bp->b_saveaddr = b_save->bs_saveaddr;
		bp->b_flags &= ~B_CALL;
		bp->b_iodone = NULL;
		free(b_save, M_SEGMENT);
	}
	bawrite(bp);
	if (i < len) {
		len -= i + 1;
		start_lbn += 1;
		goto redo;
	}
}

/*
 * Collect together all the buffers in a cluster, plus add one
 * additional buffer.
 */
struct cluster_save *
cluster_collectbufs(vp, last_bp)
	struct vnode *vp;
	struct buf *last_bp;
{
	struct cluster_save *buflist;
	daddr_t lbn;
	int i, len;

	len = vp->v_lastw - vp->v_cstart + 1;
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **)(buflist + 1);
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
		(void)bread(vp, lbn, last_bp->b_bcount, NOCRED,
		    &buflist->bs_children[i]);
	buflist->bs_children[i] = last_bp;
	buflist->bs_nchildren = i + 1;
	return (buflist);
}