/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * %sccs.include.redist.c%
 *
 *	@(#)vfs_cluster.c	8.5 (Berkeley) 01/12/94
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <libkern/libkern.h>

/*
 * Local declarations
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
	    daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
	    daddr_t, daddr_t, long, int, long));
void	    cluster_wbuild __P((struct vnode *, struct buf *, long,
	    daddr_t, int, daddr_t));

#ifdef DIAGNOSTIC
/*
 * Set to 1 if reads of block zero should cause readahead to be done.
 * Set to 0 treats a read of block zero as a non-sequential read.
 *
 * Setting to one assumes that most reads of block zero of files are due to
 * sequential passes over the files (e.g. cat, sum) where additional blocks
 * will soon be needed.  Setting to zero assumes that the majority are
 * surgical strikes to get particular info (e.g. size, file) where readahead
 * blocks will not be used and, in fact, push out other potentially useful
 * blocks from the cache.  The former seems intuitive, but some quick tests
 * showed that the latter performed better from a system-wide point of view.
 */
int	doclusterraz = 0;
#define ISSEQREAD(vp, blk) \
	(((blk) != 0 || doclusterraz) && \
	 ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#else
#define ISSEQREAD(vp, blk) \
	((blk) != 0 && ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#endif

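/*
 * For illustration, with the default doclusterraz = 0 (or without
 * DIAGNOSTIC) and a nonzero v_lastr:
 *
 *	ISSEQREAD(vp, 0)		-> false (block zero, no readahead)
 *	ISSEQREAD(vp, vp->v_lastr)	-> true  (re-read of the last block)
 *	ISSEQREAD(vp, vp->v_lastr + 1)	-> true  (next block in order)
 *
 * Any other block number is treated as non-sequential and resets the
 * readahead window in cluster_read() below.
 */
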
/*
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *
 * There are 4 or 5 cases depending on how you count:
 *	Desired block is in the cache:
 *	    1 Not sequential access (0 I/Os).
 *	    2 Access is sequential, do read-ahead (1 ASYNC).
 *	Desired block is not in cache:
 *	    3 Not sequential access (1 SYNC).
 *	    4 Sequential access, next block is contiguous (1 SYNC).
 *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 *	bp is the block requested.
 *	rbp is the read-ahead block.
 *	If either is NULL, then you don't have to do the I/O.
 */
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, ioblkno;
	long flags;
	int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
	if (size == 0)
		panic("cluster_read: size = 0");
#endif

	error = 0;
	flags = B_READ;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	if (bp->b_flags & B_CACHE) {
		/*
		 * Desired block is in cache; do any readahead ASYNC.
		 * Case 1, 2.
		 */
		trace(TR_BREADHIT, pack(vp, size), lblkno);
		flags |= B_ASYNC;
		ioblkno = lblkno + (vp->v_ralen ? vp->v_ralen : 1);
		alreadyincore = (int)incore(vp, ioblkno);
		bp = NULL;
	} else {
		/* Block wasn't in cache, case 3, 4, 5. */
		trace(TR_BREADMISS, pack(vp, size), lblkno);
		bp->b_flags |= B_READ;
		ioblkno = lblkno;
		alreadyincore = 0;
		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
	}
	/*
	 * XXX
	 * Replace 1 with a window size based on some permutation of
	 * maxcontig and rot_delay.  This will let you figure out how
	 * many blocks you should read-ahead (case 2, 4, 5).
	 *
	 * If the access isn't sequential, reset the window to 1.
	 * Note that a read to the same block is considered sequential.
	 * This catches the case where the file is being read sequentially,
	 * but at smaller than the filesystem block size.
	 */
	rbp = NULL;
	if (!ISSEQREAD(vp, lblkno)) {
		vp->v_ralen = 0;
		vp->v_maxra = lblkno;
	} else if ((ioblkno + 1) * size <= filesize && !alreadyincore &&
	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) &&
	    blkno != -1) {
		/*
		 * Reading sequentially, and the next block is not in the
		 * cache.  We are going to try reading ahead.
		 */
		if (num_ra) {
			/*
			 * If our desired readahead block had been read
			 * in a previous readahead but is no longer in
			 * core, then we may be reading ahead too far
			 * or are not using our readahead very rapidly.
			 * In this case we scale back the window.
			 */
			if (!alreadyincore && ioblkno <= vp->v_maxra)
				vp->v_ralen = max(vp->v_ralen >> 1, 1);
			/*
			 * There are more sequential blocks than our current
			 * window allows, scale up.  Ideally we want to get
			 * in sync with the filesystem maxcontig value.
			 */
			else if (num_ra > vp->v_ralen && lblkno != vp->v_lastr)
				vp->v_ralen = vp->v_ralen ?
					min(num_ra, vp->v_ralen << 1) : 1;

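			/*
			 * VOP_BMAP may report more contiguous blocks than
			 * the current window allows; clamp the readahead
			 * to v_ralen.
			 */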
			if (num_ra > vp->v_ralen)
				num_ra = vp->v_ralen;
		}

		if (num_ra)				/* case 2, 4 */
			rbp = cluster_rbuild(vp, filesize,
			    bp, ioblkno, blkno, size, num_ra, flags);
		else if (ioblkno == lblkno) {
			bp->b_blkno = blkno;
			/* Case 5: check how many blocks to read ahead */
			++ioblkno;
			if ((ioblkno + 1) * size > filesize ||
			    incore(vp, ioblkno) || (error = VOP_BMAP(vp,
			    ioblkno, NULL, &blkno, &num_ra)) || blkno == -1)
				goto skip_readahead;
			/*
			 * Adjust readahead as above
			 */
			if (num_ra) {
				if (!alreadyincore && ioblkno <= vp->v_maxra)
					vp->v_ralen = max(vp->v_ralen >> 1, 1);
				else if (num_ra > vp->v_ralen &&
				    lblkno != vp->v_lastr)
					vp->v_ralen = vp->v_ralen ?
					    min(num_ra, vp->v_ralen << 1) : 1;
				if (num_ra > vp->v_ralen)
					num_ra = vp->v_ralen;
			}
			flags |= B_ASYNC;
			if (num_ra)
				rbp = cluster_rbuild(vp, filesize,
				    NULL, ioblkno, blkno, size, num_ra, flags);
			else {
				rbp = getblk(vp, ioblkno, size, 0, 0);
				rbp->b_flags |= flags;
				rbp->b_blkno = blkno;
			}
		} else {
			/* case 2; read ahead single block */
			rbp = getblk(vp, ioblkno, size, 0, 0);
			rbp->b_flags |= flags;
			rbp->b_blkno = blkno;
		}

		if (rbp == bp)			/* case 4 */
			rbp = NULL;
		else if (rbp) {			/* case 2, 5 */
			trace(TR_BREADMISSRA,
			    pack(vp, (num_ra + 1) * size), ioblkno);
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}

	/* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
	if (bp)
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else
			error = VOP_STRATEGY(bp);

	if (rbp)
		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else
			(void) VOP_STRATEGY(rbp);

	/*
	 * Recalculate our maximum readahead
	 */
	if (rbp == NULL)
		rbp = bp;
	if (rbp)
		vp->v_maxra = rbp->b_lblkno + (rbp->b_bufsize / size) - 1;

	if (bp)
		return(biowait(bp));
	return(error);
}

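/*
 * Illustrative sketch (not code from this file): a filesystem read routine
 * would call cluster_read() in place of bread() when clustering is in use,
 * along the lines of
 *
 *	if (doclusterread)
 *		error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, &bp);
 *	else
 *		error = bread(vp, lbn, size, NOCRED, &bp);
 *
 * where ip->i_size supplies the filesize argument that bounds readahead.
 * The doclusterread flag, inode pointer ip, and lbn are assumed names used
 * only for this example.
 */
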
/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;
	u_quad_t filesize;
	struct buf *bp;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn;
	int i, inc;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != f_iosize %d\n",
		    size, vp->v_mount->mnt_stat.f_iosize);
#endif
	if (size * (lbn + run + 1) > filesize)
		--run;
	if (run == 0) {
		if (!bp) {
			bp = getblk(vp, lbn, size, 0, 0);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return(bp);
	}

	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
	if (bp->b_flags & (B_DONE | B_DELWRI))
		return (bp);

	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bufsize = b_save->bs_bcount = size;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	inc = btodb(size);
	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
		if (incore(vp, lbn + i)) {
			if (i == 1) {
				bp->b_saveaddr = b_save->bs_saveaddr;
				bp->b_flags &= ~B_CALL;
				bp->b_iodone = NULL;
				allocbuf(bp, size);
				free(b_save, M_SEGMENT);
			} else
				allocbuf(bp, size * i);
			break;
		}
		tbp = getblk(vp, lbn + i, 0, 0, 0);
		/*
		 * getblk may return some memory in the buffer if there were
		 * no empty buffers to shed it to.  If there is currently
		 * memory in the buffer, we move it down size bytes to make
		 * room for the valid pages that cluster_callback will insert.
		 * We do this now so we don't have to do it at interrupt time
		 * in the callback routine.
		 */
		if (tbp->b_bufsize != 0) {
			caddr_t bdata = (char *)tbp->b_data;

			if (tbp->b_bufsize + size > MAXBSIZE)
				panic("cluster_rbuild: too much memory");
			if (tbp->b_bufsize > size) {
				/*
				 * XXX if the source and destination regions
				 * overlap we have to copy backward to avoid
				 * clobbering any valid pages (i.e. pagemove
				 * implementations typically can't handle
				 * overlap).
				 */
				bdata += tbp->b_bufsize;
				while (bdata > (char *)tbp->b_data) {
					bdata -= CLBYTES;
					pagemove(bdata, bdata + size, CLBYTES);
				}
			} else
				pagemove(bdata, bdata + size, tbp->b_bufsize);
		}
		tbp->b_blkno = bn;
		tbp->b_flags |= flags | B_READ | B_ASYNC;
		++b_save->bs_nchildren;
		b_save->bs_children[i - 1] = tbp;
	}
	return(bp);
}

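/*
 * Layout note for the cluster_save bookkeeping used in cluster_rbuild above
 * (and again in cluster_wbuild below): the header and the array of child
 * buffer pointers come from a single allocation, with bs_children pointing
 * just past the header:
 *
 *	+---------------------+--------------------------------+
 *	| struct cluster_save | struct buf *bs_children[]      |
 *	+---------------------+--------------------------------+
 *
 * The cluster head's previous b_saveaddr is stashed in bs_saveaddr so that
 * cluster_callback can restore it before freeing the whole block.
 */
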
/*
 * Either get a new buffer or grow the existing one.
 */
struct buf *
cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
	struct vnode *vp;
	struct buf *bp;
	long flags;
	daddr_t blkno;
	daddr_t lblkno;
	long size;
	int run;
{
	if (!bp) {
		bp = getblk(vp, lblkno, size, 0, 0);
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			bp->b_blkno = blkno;
			return(bp);
		}
	}
	allocbuf(bp, run * size);
	bp->b_blkno = blkno;
	bp->b_iodone = cluster_callback;
	bp->b_flags |= flags | B_CALL;
	return(bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct cluster_save *b_save;
	struct buf **bpp, *tbp;
	long bsize;
	caddr_t cp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

	b_save = (struct cluster_save *)(bp->b_saveaddr);
	bp->b_saveaddr = b_save->bs_saveaddr;

	bsize = b_save->bs_bufsize;
	cp = (char *)bp->b_data + bsize;
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
	for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) {
		tbp = *bpp;
		pagemove(cp, tbp->b_data, bsize);
		tbp->b_bufsize += bsize;
		tbp->b_bcount = bsize;
		if (error) {
			tbp->b_flags |= B_ERROR;
			tbp->b_error = error;
		}
		biodone(tbp);
		bp->b_bufsize -= bsize;
		cp += bsize;
	}
	/*
	 * If there was excess memory in the cluster buffer,
	 * slide it up adjacent to the remaining valid data.
	 */
	if (bp->b_bufsize != bsize) {
		if (bp->b_bufsize < bsize)
			panic("cluster_callback: too little memory");
		pagemove(cp, (char *)bp->b_data + bsize, bp->b_bufsize - bsize);
	}
	bp->b_bcount = bsize;
	bp->b_iodone = NULL;
	free(b_save, M_SEGMENT);
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

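/*
 * How the callback is reached (a sketch of the biodone() convention in
 * vfs_bio.c, not code from this file): the cluster head is marked B_CALL
 * with b_iodone set to cluster_callback, so when the transfer completes
 * biodone() does roughly
 *
 *	if (bp->b_flags & B_CALL) {
 *		bp->b_flags &= ~B_CALL;
 *		(*bp->b_iodone)(bp);
 *	}
 *
 * which is why cluster_rbuild does its buffer shuffling up front rather
 * than leaving it for interrupt time here.
 */
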
/*
 * Do clustered write for FFS.
 *
 * Three cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int clen;

	vp = bp->b_vp;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(bp->b_bcount))) {
		if (vp->v_clen != 0)
			/*
			 * Write is not sequential.
			 */
			cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
			    vp->v_lastw - vp->v_cstart + 1, lbn);
		/*
		 * Consider beginning a cluster.
		 */
		if ((lbn + 1) * bp->b_bcount == filesize)
			/* End of file, make cluster as large as possible */
			clen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
		else if (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &clen) ||
		    bp->b_blkno == -1) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = clen;
		if (clen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}

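/*
 * Worked example of the bookkeeping above (illustrative numbers): writing
 * a file sequentially in filesystem-sized blocks, the write of lbn 0 finds
 * v_clen == 0, asks VOP_BMAP for the number of contiguous blocks (clen),
 * records v_cstart = 0, v_clen = clen, and delays the buffer with bdwrite().
 * Writes of lbn 1 through clen - 1 fall into the "middle of a cluster" case
 * and are delayed as well, provided the physical blocks remain contiguous.
 * When lbn reaches v_cstart + v_clen, cluster_wbuild() gathers the delayed
 * buffers into one large asynchronous transfer and a new cluster begins at
 * lbn + 1.
 */
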
/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the cluster being written (if last_bp == NULL).
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	caddr_t	cp;
	int i, s;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_wbuild: size %d != f_iosize %d\n",
		    size, vp->v_mount->mnt_stat.f_iosize);
#endif
redo:
	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
		++start_lbn;
		--len;
	}

	/* Too few blocks left to build a cluster; just write them out. */
	if (len <= 1) {
		if (last_bp) {
			bawrite(last_bp);
		} else if (len) {
			bp = getblk(vp, start_lbn, size, 0, 0);
			bawrite(bp);
		}
		return;
	}

	bp = getblk(vp, start_lbn, size, 0, 0);
	if (!(bp->b_flags & B_DELWRI)) {
		++start_lbn;
		--len;
		brelse(bp);
		goto redo;
	}

	/*
	 * Extra memory in the buffer, punt on this buffer.
	 * XXX we could handle this in most cases, but we would have to
	 * push the extra memory down to after our max possible cluster
	 * size and then potentially pull it back up if the cluster was
	 * terminated prematurely--too much hassle.
	 */
	if (bp->b_bcount != bp->b_bufsize) {
		++start_lbn;
		--len;
		bawrite(bp);
		goto redo;
	}

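	/*
	 * The buffer at start_lbn becomes the head of the cluster; after
	 * the decrement below, len counts only the following blocks that
	 * are candidates to join it as children (and sizes bs_children).
	 */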
	--len;
	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bcount = bp->b_bcount;
	b_save->bs_bufsize = bp->b_bufsize;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	bp->b_flags |= B_CALL;
	bp->b_iodone = cluster_callback;
	cp = (char *)bp->b_data + size;
	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
		/*
		 * Stop if the block is not in core, or if it is the
		 * non-sequential block ending our cluster (in which case
		 * the caller writes it itself and we don't want to write
		 * it twice).
		 */
		if (!incore(vp, start_lbn) ||
		    (last_bp == NULL && start_lbn == lbn))
			break;

		/*
		 * Get the desired block buffer (unless it is the final
		 * sequential block whose buffer was passed in explicitly
		 * as last_bp).
		 */
		if (last_bp == NULL || start_lbn != lbn) {
			tbp = getblk(vp, start_lbn, size, 0, 0);
			if (!(tbp->b_flags & B_DELWRI)) {
				brelse(tbp);
				break;
			}
		} else
			tbp = last_bp;

		++b_save->bs_nchildren;

		/* Move memory from children to parent */
		if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize))) {
			printf("Clustered Block: %d addr %x bufsize: %d\n",
			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
			    tbp->b_blkno);
			panic("Clustered write to wrong blocks");
		}

		pagemove(tbp->b_data, cp, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;

		tbp->b_bufsize -= size;
		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= (B_ASYNC | B_AGE);
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;

		cp += size;
	}

	if (i == 0) {
		/* None to cluster */
		bp->b_saveaddr = b_save->bs_saveaddr;
		bp->b_flags &= ~B_CALL;
		bp->b_iodone = NULL;
		free(b_save, M_SEGMENT);
	}
	bawrite(bp);
	if (i < len) {
		len -= i + 1;
		start_lbn += 1;
		goto redo;
	}
}