/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * %sccs.include.redist.c%
 *
 *	@(#)vfs_cluster.c	8.6 (Berkeley) 02/05/94
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <libkern/libkern.h>

/*
 * Local declarations
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
	    daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
	    daddr_t, daddr_t, long, int, long));
void	    cluster_wbuild __P((struct vnode *, struct buf *, long,
	    daddr_t, int, daddr_t));
struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *));

#ifdef DIAGNOSTIC
/*
 * Set to 1 if reads of block zero should cause readahead to be done.
 * Set to 0 treats a read of block zero as a non-sequential read.
 *
 * Setting to one assumes that most reads of block zero of files are due to
 * sequential passes over the files (e.g. cat, sum) where additional blocks
 * will soon be needed.  Setting to zero assumes that the majority are
 * surgical strikes to get particular info (e.g. size, file) where readahead
 * blocks will not be used and, in fact, push out other potentially useful
 * blocks from the cache.  The former seems intuitive, but some quick tests
 * showed that the latter performed better from a system-wide point of view.
 */
int	doclusterraz = 0;
#define ISSEQREAD(vp, blk) \
	(((blk) != 0 || doclusterraz) && \
	 ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#else
#define ISSEQREAD(vp, blk) \
	((blk) != 0 && ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#endif

/*
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *
 * There are 4 or 5 cases depending on how you count:
 *	Desired block is in the cache:
 *	    1 Not sequential access (0 I/Os).
 *	    2 Access is sequential, do read-ahead (1 ASYNC).
 *	Desired block is not in cache:
 *	    3 Not sequential access (1 SYNC).
 *	    4 Sequential access, next block is contiguous (1 SYNC).
 *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 *	bp is the block requested.
 *	rbp is the read-ahead block.
 *	If either is NULL, then you don't have to do the I/O.
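 *
 * The read-ahead state is kept in the vnode: roughly, v_ralen is the
 * current read-ahead window in filesystem blocks, v_maxra is the last
 * block brought in by the most recent read or read-ahead (recomputed at
 * the bottom of this routine), and v_lastr, tested by ISSEQREAD above,
 * records the last block read so that sequential access can be detected.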
 */
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, ioblkno;
	long flags;
	int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
	if (size == 0)
		panic("cluster_read: size = 0");
#endif

	error = 0;
	flags = B_READ;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	if (bp->b_flags & B_CACHE) {
		/*
		 * Desired block is in cache; do any readahead ASYNC.
		 * Case 1, 2.
		 */
		trace(TR_BREADHIT, pack(vp, size), lblkno);
		flags |= B_ASYNC;
		ioblkno = lblkno + (vp->v_ralen ? vp->v_ralen : 1);
		alreadyincore = (int)incore(vp, ioblkno);
		bp = NULL;
	} else {
		/* Block wasn't in cache, case 3, 4, 5. */
		trace(TR_BREADMISS, pack(vp, size), lblkno);
		bp->b_flags |= B_READ;
		ioblkno = lblkno;
		alreadyincore = 0;
		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
	}
	/*
	 * XXX
	 * Replace 1 with a window size based on some permutation of
	 * maxcontig and rot_delay.  This will let you figure out how
	 * many blocks you should read-ahead (case 2, 4, 5).
	 *
	 * If the access isn't sequential, reset the window to 1.
	 * Note that a read to the same block is considered sequential.
	 * This catches the case where the file is being read sequentially,
	 * but at smaller than the filesystem block size.
	 */
	rbp = NULL;
	if (!ISSEQREAD(vp, lblkno)) {
		vp->v_ralen = 0;
		vp->v_maxra = lblkno;
	} else if ((ioblkno + 1) * size <= filesize && !alreadyincore &&
	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) &&
	    blkno != -1) {
		/*
		 * Reading sequentially, and the next block is not in the
		 * cache.  We are going to try reading ahead.
		 */
		if (num_ra) {
			/*
			 * If our desired readahead block had been read
			 * in a previous readahead but is no longer in
			 * core, then we may be reading ahead too far
			 * or are not using our readahead very rapidly.
			 * In this case we scale back the window.
			 */
			if (!alreadyincore && ioblkno <= vp->v_maxra)
				vp->v_ralen = max(vp->v_ralen >> 1, 1);
			/*
			 * There are more sequential blocks than our current
			 * window allows, scale up.  Ideally we want to get
			 * in sync with the filesystem maxcontig value.
			 */
			else if (num_ra > vp->v_ralen && lblkno != vp->v_lastr)
				vp->v_ralen = vp->v_ralen ?
					min(num_ra, vp->v_ralen << 1) : 1;

			if (num_ra > vp->v_ralen)
				num_ra = vp->v_ralen;
		}

		if (num_ra)				/* case 2, 4 */
			rbp = cluster_rbuild(vp, filesize,
			    bp, ioblkno, blkno, size, num_ra, flags);
		else if (ioblkno == lblkno) {
			bp->b_blkno = blkno;
			/* Case 5: check how many blocks to read ahead */
			++ioblkno;
			if ((ioblkno + 1) * size > filesize ||
			    incore(vp, ioblkno) || (error = VOP_BMAP(vp,
			     ioblkno, NULL, &blkno, &num_ra)) || blkno == -1)
				goto skip_readahead;
			/*
			 * Adjust readahead as above
			 */
			if (num_ra) {
				if (!alreadyincore && ioblkno <= vp->v_maxra)
					vp->v_ralen = max(vp->v_ralen >> 1, 1);
				else if (num_ra > vp->v_ralen &&
					 lblkno != vp->v_lastr)
					vp->v_ralen = vp->v_ralen ?
						min(num_ra,vp->v_ralen<<1) : 1;
				if (num_ra > vp->v_ralen)
					num_ra = vp->v_ralen;
			}
			flags |= B_ASYNC;
			if (num_ra)
				rbp = cluster_rbuild(vp, filesize,
				    NULL, ioblkno, blkno, size, num_ra, flags);
			else {
				rbp = getblk(vp, ioblkno, size, 0, 0);
				rbp->b_flags |= flags;
				rbp->b_blkno = blkno;
			}
		} else {
			/* case 2; read ahead single block */
			rbp = getblk(vp, ioblkno, size, 0, 0);
			rbp->b_flags |= flags;
			rbp->b_blkno = blkno;
		}

		if (rbp == bp)			/* case 4 */
			rbp = NULL;
		else if (rbp) {			/* case 2, 5 */
			trace(TR_BREADMISSRA,
			    pack(vp, (num_ra + 1) * size), ioblkno);
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}

	/* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
	if (bp)
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else
			error = VOP_STRATEGY(bp);

	if (rbp)
		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else
			(void) VOP_STRATEGY(rbp);

	/*
	 * Recalculate our maximum readahead
	 */
	if (rbp == NULL)
		rbp = bp;
	if (rbp)
		vp->v_maxra = rbp->b_lblkno + (rbp->b_bufsize / size) - 1;

	if (bp)
		return(biowait(bp));
	return(error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
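 *
 * A cluster_save structure, hung off b_saveaddr of the cluster buffer,
 * remembers the component buffers so that cluster_callback can hand the
 * data back to them and mark each one done when the single large I/O
 * completes.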
 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;
	u_quad_t filesize;
	struct buf *bp;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn;
	int i, inc;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
	if (size * (lbn + run + 1) > filesize)
		--run;
	if (run == 0) {
		if (!bp) {
			bp = getblk(vp, lbn, size, 0, 0);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return(bp);
	}

	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
	if (bp->b_flags & (B_DONE | B_DELWRI))
		return (bp);

	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bufsize = b_save->bs_bcount = size;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	inc = btodb(size);
	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
		if (incore(vp, lbn + i)) {
			if (i == 1) {
				bp->b_saveaddr = b_save->bs_saveaddr;
				bp->b_flags &= ~B_CALL;
				bp->b_iodone = NULL;
				allocbuf(bp, size);
				free(b_save, M_SEGMENT);
			} else
				allocbuf(bp, size * i);
			break;
		}
		tbp = getblk(vp, lbn + i, 0, 0, 0);
		/*
		 * getblk may return some memory in the buffer if there were
		 * no empty buffers to shed it to.  If there is currently
		 * memory in the buffer, we move it down size bytes to make
		 * room for the valid pages that cluster_callback will insert.
		 * We do this now so we don't have to do it at interrupt time
		 * in the callback routine.
		 */
		if (tbp->b_bufsize != 0) {
			caddr_t bdata = (char *)tbp->b_data;

			if (tbp->b_bufsize + size > MAXBSIZE)
				panic("cluster_rbuild: too much memory");
			if (tbp->b_bufsize > size) {
				/*
				 * XXX if the source and destination regions
				 * overlap we have to copy backward to avoid
				 * clobbering any valid pages (i.e. pagemove
				 * implementations typically can't handle
				 * overlap).
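				 *
				 * For example, with two CLBYTES pages of
				 * existing memory and a destination offset
				 * of size bytes, moving the low page first
				 * could clobber the high page before it is
				 * saved; moving from the top down, as the
				 * loop below does, avoids this.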
				 */
				bdata += tbp->b_bufsize;
				while (bdata > (char *)tbp->b_data) {
					bdata -= CLBYTES;
					pagemove(bdata, bdata + size, CLBYTES);
				}
			} else
				pagemove(bdata, bdata + size, tbp->b_bufsize);
		}
		tbp->b_blkno = bn;
		tbp->b_flags |= flags | B_READ | B_ASYNC;
		++b_save->bs_nchildren;
		b_save->bs_children[i - 1] = tbp;
	}
	return(bp);
}

/*
 * Either get a new buffer or grow the existing one.
 */
struct buf *
cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
	struct vnode *vp;
	struct buf *bp;
	long flags;
	daddr_t blkno;
	daddr_t lblkno;
	long size;
	int run;
{
	if (!bp) {
		bp = getblk(vp, lblkno, size, 0, 0);
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			bp->b_blkno = blkno;
			return(bp);
		}
	}
	allocbuf(bp, run * size);
	bp->b_blkno = blkno;
	bp->b_iodone = cluster_callback;
	bp->b_flags |= flags | B_CALL;
	return(bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct cluster_save *b_save;
	struct buf **bpp, *tbp;
	long bsize;
	caddr_t cp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

	b_save = (struct cluster_save *)(bp->b_saveaddr);
	bp->b_saveaddr = b_save->bs_saveaddr;

	bsize = b_save->bs_bufsize;
	cp = (char *)bp->b_data + bsize;
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) {
		tbp = *bpp;
		pagemove(cp, tbp->b_data, bsize);
		tbp->b_bufsize += bsize;
		tbp->b_bcount = bsize;
		if (error) {
			tbp->b_flags |= B_ERROR;
			tbp->b_error = error;
		}
		biodone(tbp);
		bp->b_bufsize -= bsize;
		cp += bsize;
	}
	/*
	 * If there was excess memory in the cluster buffer,
	 * slide it up adjacent to the remaining valid data.
	 */
	if (bp->b_bufsize != bsize) {
		if (bp->b_bufsize < bsize)
			panic("cluster_callback: too little memory");
		pagemove(cp, (char *)bp->b_data + bsize, bp->b_bufsize - bsize);
	}
	bp->b_bcount = bsize;
	bp->b_iodone = NULL;
	free(b_save, M_SEGMENT);
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

/*
 * Do clustered write for FFS.
 *
 * Three cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int maxclen, cursize;

	vp = bp->b_vp;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(bp->b_bcount))) {
		maxclen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its
			 * last write, or we have reached our maximum
			 * cluster size, then push the previous cluster.
			 * Otherwise try reallocating to make it sequential.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
			if ((lbn + 1) * bp->b_bcount != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				cluster_wbuild(vp, NULL, bp->b_bcount,
				    vp->v_cstart, cursize, lbn);
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					cluster_wbuild(vp, NULL, bp->b_bcount,
					    vp->v_cstart, cursize, lbn);
				} else {
					/*
					 * Succeeded, keep building cluster.
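					 * The reallocated blocks should now
					 * be contiguous on disk, so the
					 * buffers are simply redirtied with
					 * bdwrite and the cluster continues
					 * to grow.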
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
		}
		/*
		 * Consider beginning a cluster.
		 * If at end of file, make cluster as large as possible,
		 * otherwise find size of existing cluster.
		 */
		if ((lbn + 1) * bp->b_bcount != filesize &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) ||
		     bp->b_blkno == -1)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (maxclen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}


/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
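 *
 * Roughly, size is the filesystem block size, start_lbn is the first
 * logical block of the candidate cluster, len is the number of blocks to
 * consider, and last_bp, when non-NULL, is the caller's already-locked
 * buffer for lbn.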
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	caddr_t	cp;
	int i, s;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_wbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
redo:
	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
		++start_lbn;
		--len;
	}

	/* Get more memory for current buffer */
	if (len <= 1) {
		if (last_bp) {
			bawrite(last_bp);
		} else if (len) {
			bp = getblk(vp, start_lbn, size, 0, 0);
			bawrite(bp);
		}
		return;
	}

	bp = getblk(vp, start_lbn, size, 0, 0);
	if (!(bp->b_flags & B_DELWRI)) {
		++start_lbn;
		--len;
		brelse(bp);
		goto redo;
	}

	/*
	 * Extra memory in the buffer, punt on this buffer.
	 * XXX we could handle this in most cases, but we would have to
	 * push the extra memory down to after our max possible cluster
	 * size and then potentially pull it back up if the cluster was
	 * terminated prematurely--too much hassle.
	 */
	if (bp->b_bcount != bp->b_bufsize) {
		++start_lbn;
		--len;
		bawrite(bp);
		goto redo;
	}

	--len;
	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bcount = bp->b_bcount;
	b_save->bs_bufsize = bp->b_bufsize;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	bp->b_flags |= B_CALL;
	bp->b_iodone = cluster_callback;
	cp = (char *)bp->b_data + size;
	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
		/*
		 * Block is not in core or the non-sequential block
		 * ending our cluster was part of the cluster (in which
		 * case we don't want to write it twice).
		 */
		if (!incore(vp, start_lbn) ||
		    last_bp == NULL && start_lbn == lbn)
			break;

		/*
		 * Get the desired block buffer (unless it is the final
		 * sequential block whose buffer was passed in explicitly
		 * as last_bp).
		 */
		if (last_bp == NULL || start_lbn != lbn) {
			tbp = getblk(vp, start_lbn, size, 0, 0);
			if (!(tbp->b_flags & B_DELWRI)) {
				brelse(tbp);
				break;
			}
		} else
			tbp = last_bp;

		++b_save->bs_nchildren;

		/* Move memory from children to parent */
		if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize))) {
			printf("Clustered Block: %d addr %x bufsize: %d\n",
			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
			    tbp->b_blkno);
			panic("Clustered write to wrong blocks");
		}

		pagemove(tbp->b_data, cp, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;

		tbp->b_bufsize -= size;
		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= (B_ASYNC | B_AGE);
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;

		cp += size;
	}

	if (i == 0) {
		/* None to cluster */
		bp->b_saveaddr = b_save->bs_saveaddr;
		bp->b_flags &= ~B_CALL;
		bp->b_iodone = NULL;
		free(b_save, M_SEGMENT);
	}
	bawrite(bp);
	if (i < len) {
		len -= i + 1;
		start_lbn += 1;
		goto redo;
	}
}

/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 */
struct cluster_save *
cluster_collectbufs(vp, last_bp)
	struct vnode *vp;
	struct buf *last_bp;
{
	struct cluster_save *buflist;
	daddr_t lbn;
	int i, len;

	len = vp->v_lastw - vp->v_cstart + 1;
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **)(buflist + 1);
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
		(void)bread(vp, lbn, last_bp->b_bcount, NOCRED,
		    &buflist->bs_children[i]);
	buflist->bs_children[i] = last_bp;
	buflist->bs_nchildren = i + 1;
	return (buflist);
}