/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * %sccs.include.redist.c%
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 02/13/94
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <libkern/libkern.h>

#ifdef DEBUG
#include <vm/vm.h>
#include <sys/sysctl.h>
int doreallocblks = 1;
struct ctldebug debug13 = { "doreallocblks", &doreallocblks };
#else
/* XXX for cluster_write */
#define doreallocblks 1
#endif

/*
 * Local declarations
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
	    daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
	    daddr_t, daddr_t, long, int, long));
void	    cluster_wbuild __P((struct vnode *, struct buf *, long,
	    daddr_t, int, daddr_t));
struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *));

#ifdef DIAGNOSTIC
/*
 * Set to 1 if reads of block zero should cause readahead to be done.
 * Set to 0 treats a read of block zero as a non-sequential read.
 *
 * Setting to one assumes that most reads of block zero of files are due to
 * sequential passes over the files (e.g. cat, sum) where additional blocks
 * will soon be needed.  Setting to zero assumes that the majority are
 * surgical strikes to get particular info (e.g. size, file) where readahead
 * blocks will not be used and, in fact, push out other potentially useful
 * blocks from the cache.  The former seems intuitive, but some quick tests
 * showed that the latter performed better from a system-wide point of view.
 */
int	doclusterraz = 0;
#define ISSEQREAD(vp, blk) \
	(((blk) != 0 || doclusterraz) && \
	 ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#else
#define ISSEQREAD(vp, blk) \
	((blk) != 0 && ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
#endif
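/*
 * The clustering code keeps its bookkeeping in the vnode (struct vnode,
 * sys/vnode.h).  Roughly:
 *	v_lastr		last logical block read; used to detect sequential reads
 *	v_ralen		current read-ahead window, in filesystem blocks
 *	v_maxra		highest logical block read ahead so far
 *	v_cstart	logical block at which the current write cluster starts
 *	v_lastw		last logical block written
 *	v_clen		length of the current write cluster, in blocks, less one
 *	v_lasta		disk address of the last block written
 */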
/*
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *
 * There are 4 or 5 cases depending on how you count:
 *	Desired block is in the cache:
 *	    1 Not sequential access (0 I/Os).
 *	    2 Access is sequential, do read-ahead (1 ASYNC).
 *	Desired block is not in cache:
 *	    3 Not sequential access (1 SYNC).
 *	    4 Sequential access, next block is contiguous (1 SYNC).
 *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 *	bp is the block requested.
 *	rbp is the read-ahead block.
 *	If either is NULL, then you don't have to do the I/O.
 */
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, ioblkno;
	long flags;
	int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
	if (size == 0)
		panic("cluster_read: size = 0");
#endif

	error = 0;
	flags = B_READ;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	if (bp->b_flags & B_CACHE) {
		/*
		 * Desired block is in cache; do any readahead ASYNC.
		 * Case 1, 2.
		 */
		trace(TR_BREADHIT, pack(vp, size), lblkno);
		flags |= B_ASYNC;
		ioblkno = lblkno + (vp->v_ralen ? vp->v_ralen : 1);
		alreadyincore = (int)incore(vp, ioblkno);
		bp = NULL;
	} else {
		/* Block wasn't in cache, case 3, 4, 5. */
		trace(TR_BREADMISS, pack(vp, size), lblkno);
		bp->b_flags |= B_READ;
		ioblkno = lblkno;
		alreadyincore = 0;
		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
	}
	/*
	 * XXX
	 * Replace 1 with a window size based on some permutation of
	 * maxcontig and rot_delay.  This will let you figure out how
	 * many blocks you should read-ahead (case 2, 4, 5).
	 *
	 * If the access isn't sequential, reset the window to 1.
	 * Note that a read to the same block is considered sequential.
	 * This catches the case where the file is being read sequentially,
	 * but at smaller than the filesystem block size.
	 */
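	/*
	 * Illustrative example: on a steady sequential read the window
	 * starts at one block and doubles on each pass (1, 2, 4, ...),
	 * bounded by the run of contiguous blocks reported by VOP_BMAP
	 * (num_ra).  If a block we previously read ahead has already been
	 * recycled out of the cache, the window is halved instead.
	 */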
	rbp = NULL;
	if (!ISSEQREAD(vp, lblkno)) {
		vp->v_ralen = 0;
		vp->v_maxra = lblkno;
	} else if ((ioblkno + 1) * size <= filesize && !alreadyincore &&
	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) &&
	    blkno != -1) {
		/*
		 * Reading sequentially, and the next block is not in the
		 * cache.  We are going to try reading ahead.
		 */
		if (num_ra) {
			/*
			 * If our desired readahead block had been read
			 * in a previous readahead but is no longer in
			 * core, then we may be reading ahead too far
			 * or are not using our readahead very rapidly.
			 * In this case we scale back the window.
			 */
			if (!alreadyincore && ioblkno <= vp->v_maxra)
				vp->v_ralen = max(vp->v_ralen >> 1, 1);
			/*
			 * There are more sequential blocks than our current
			 * window allows, scale up.  Ideally we want to get
			 * in sync with the filesystem maxcontig value.
			 */
			else if (num_ra > vp->v_ralen && lblkno != vp->v_lastr)
				vp->v_ralen = vp->v_ralen ?
					min(num_ra, vp->v_ralen << 1) : 1;

			if (num_ra > vp->v_ralen)
				num_ra = vp->v_ralen;
		}

		if (num_ra)				/* case 2, 4 */
			rbp = cluster_rbuild(vp, filesize,
			    bp, ioblkno, blkno, size, num_ra, flags);
		else if (ioblkno == lblkno) {
			bp->b_blkno = blkno;
			/* Case 5: check how many blocks to read ahead */
			++ioblkno;
			if ((ioblkno + 1) * size > filesize ||
			    incore(vp, ioblkno) || (error = VOP_BMAP(vp,
			     ioblkno, NULL, &blkno, &num_ra)) || blkno == -1)
				goto skip_readahead;
			/*
			 * Adjust readahead as above
			 */
			if (num_ra) {
				if (!alreadyincore && ioblkno <= vp->v_maxra)
					vp->v_ralen = max(vp->v_ralen >> 1, 1);
				else if (num_ra > vp->v_ralen &&
					 lblkno != vp->v_lastr)
					vp->v_ralen = vp->v_ralen ?
						min(num_ra, vp->v_ralen << 1) : 1;
				if (num_ra > vp->v_ralen)
					num_ra = vp->v_ralen;
			}
			flags |= B_ASYNC;
			if (num_ra)
				rbp = cluster_rbuild(vp, filesize,
				    NULL, ioblkno, blkno, size, num_ra, flags);
			else {
				rbp = getblk(vp, ioblkno, size, 0, 0);
				rbp->b_flags |= flags;
				rbp->b_blkno = blkno;
			}
		} else {
			/* case 2; read ahead single block */
			rbp = getblk(vp, ioblkno, size, 0, 0);
			rbp->b_flags |= flags;
			rbp->b_blkno = blkno;
		}

		if (rbp == bp)			/* case 4 */
			rbp = NULL;
		else if (rbp) {			/* case 2, 5 */
			trace(TR_BREADMISSRA,
			    pack(vp, (num_ra + 1) * size), ioblkno);
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}

	/* XXX Kirk, do we need to make sure the bp has creds? */
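	/*
	 * Issue the I/O: bp, if any, is started here and waited for below
	 * with biowait(), so the caller sees a synchronous read; rbp, if
	 * any, was marked B_ASYNC above and completes on its own.  A
	 * buffer that is already B_DONE or B_DELWRI must not be handed to
	 * the driver again.
	 */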
skip_readahead:
	if (bp)
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else
			error = VOP_STRATEGY(bp);

	if (rbp)
		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else
			(void) VOP_STRATEGY(rbp);

	/*
	 * Recalculate our maximum readahead
	 */
	if (rbp == NULL)
		rbp = bp;
	if (rbp)
		vp->v_maxra = rbp->b_lblkno + (rbp->b_bufsize / size) - 1;

	if (bp)
		return(biowait(bp));
	return(error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;
	u_quad_t filesize;
	struct buf *bp;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn;
	int i, inc;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
	if (size * (lbn + run + 1) > filesize)
		--run;
	if (run == 0) {
		if (!bp) {
			bp = getblk(vp, lbn, size, 0, 0);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return(bp);
	}

	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
	if (bp->b_flags & (B_DONE | B_DELWRI))
		return (bp);

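	/*
	 * Allocate the cluster_save and its child pointer array in one
	 * chunk: the structure is immediately followed in memory by room
	 * for `run' buf pointers, and bs_children points just past it.
	 * The original b_saveaddr is stashed in bs_saveaddr so that
	 * cluster_callback() can restore it when the transfer completes.
	 */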
	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bufsize = b_save->bs_bcount = size;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	inc = btodb(size);
	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
		if (incore(vp, lbn + i)) {
			if (i == 1) {
				bp->b_saveaddr = b_save->bs_saveaddr;
				bp->b_flags &= ~B_CALL;
				bp->b_iodone = NULL;
				allocbuf(bp, size);
				free(b_save, M_SEGMENT);
			} else
				allocbuf(bp, size * i);
			break;
		}
		tbp = getblk(vp, lbn + i, 0, 0, 0);
		/*
		 * getblk may return some memory in the buffer if there were
		 * no empty buffers to shed it to.  If there is currently
		 * memory in the buffer, we move it down size bytes to make
		 * room for the valid pages that cluster_callback will insert.
		 * We do this now so we don't have to do it at interrupt time
		 * in the callback routine.
		 */
		if (tbp->b_bufsize != 0) {
			caddr_t bdata = (char *)tbp->b_data;

			if (tbp->b_bufsize + size > MAXBSIZE)
				panic("cluster_rbuild: too much memory");
			if (tbp->b_bufsize > size) {
				/*
				 * XXX if the source and destination regions
				 * overlap we have to copy backward to avoid
				 * clobbering any valid pages (i.e. pagemove
				 * implementations typically can't handle
				 * overlap).
				 */
				bdata += tbp->b_bufsize;
				while (bdata > (char *)tbp->b_data) {
					bdata -= CLBYTES;
					pagemove(bdata, bdata + size, CLBYTES);
				}
			} else
				pagemove(bdata, bdata + size, tbp->b_bufsize);
		}
		tbp->b_blkno = bn;
		tbp->b_flags |= flags | B_READ | B_ASYNC;
		++b_save->bs_nchildren;
		b_save->bs_children[i - 1] = tbp;
	}
	return(bp);
}

/*
 * Either get a new buffer or grow the existing one.
 */
struct buf *
cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
	struct vnode *vp;
	struct buf *bp;
	long flags;
	daddr_t blkno;
	daddr_t lblkno;
	long size;
	int run;
{
	if (!bp) {
		bp = getblk(vp, lblkno, size, 0, 0);
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			bp->b_blkno = blkno;
			return(bp);
		}
	}
	allocbuf(bp, run * size);
	bp->b_blkno = blkno;
	bp->b_iodone = cluster_callback;
	bp->b_flags |= flags | B_CALL;
	return(bp);
}

/*
 * Cleanup after a clustered read or write.
 * This is complicated by the fact that any of the buffers might have
 * extra memory (if there were no empty buffer headers at allocbuf time)
 * that we will need to shift around.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct cluster_save *b_save;
	struct buf **bpp, *tbp;
	long bsize;
	caddr_t cp;
	int error = 0;

	/*
	 * Must propagate errors to all the components.
	 */
	if (bp->b_flags & B_ERROR)
		error = bp->b_error;

	b_save = (struct cluster_save *)(bp->b_saveaddr);
	bp->b_saveaddr = b_save->bs_saveaddr;

	bsize = b_save->bs_bufsize;
	cp = (char *)bp->b_data + bsize;
	/*
	 * Move memory from the large cluster buffer into the component
	 * buffers and mark IO as done on these.
	 */
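	/*
	 * pagemove() typically remaps the underlying kernel pages rather
	 * than copying the data, so handing each component its share of
	 * the cluster is cheap even for large transfers.
	 */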
	for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) {
		tbp = *bpp;
		pagemove(cp, tbp->b_data, bsize);
		tbp->b_bufsize += bsize;
		tbp->b_bcount = bsize;
		if (error) {
			tbp->b_flags |= B_ERROR;
			tbp->b_error = error;
		}
		biodone(tbp);
		bp->b_bufsize -= bsize;
		cp += bsize;
	}
	/*
	 * If there was excess memory in the cluster buffer,
	 * slide it up adjacent to the remaining valid data.
	 */
	if (bp->b_bufsize != bsize) {
		if (bp->b_bufsize < bsize)
			panic("cluster_callback: too little memory");
		pagemove(cp, (char *)bp->b_data + bsize, bp->b_bufsize - bsize);
	}
	bp->b_bcount = bsize;
	bp->b_iodone = NULL;
	free(b_save, M_SEGMENT);
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

/*
 * Do clustered write for FFS.
 *
 * Three cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
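/*
 * Illustrative example (assuming an 8K-block filesystem and the usual
 * 64K MAXBSIZE, so maxclen is 7): a process writing the file sequentially
 * causes block 0 to start a cluster (v_cstart = 0, v_clen = 7), blocks
 * 1-6 to be delayed with bdwrite(), and block 7 to trigger
 * cluster_wbuild(), which pushes blocks 0-7 to disk as one 64K transfer,
 * provided VOP_BMAP reported the blocks as contiguous.
 */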
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int maxclen, cursize;

	vp = bp->b_vp;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(bp->b_bcount))) {
		maxclen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its
			 * last write, or we have reached our maximum
			 * cluster size, then push the previous cluster.
			 * Otherwise try reallocating to make it sequential.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
			if (!doreallocblks ||
			    (lbn + 1) * bp->b_bcount != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				cluster_wbuild(vp, NULL, bp->b_bcount,
				    vp->v_cstart, cursize, lbn);
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
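				/*
				 * Ask the filesystem to reallocate the
				 * gathered delayed-write blocks so they
				 * become contiguous on disk; if that
				 * succeeds they can keep accumulating
				 * into a single cluster instead of being
				 * pushed out now.
				 */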
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					cluster_wbuild(vp, NULL, bp->b_bcount,
					    vp->v_cstart, cursize, lbn);
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
		}
		/*
		 * Consider beginning a cluster.
		 * If at end of file, make cluster as large as possible,
		 * otherwise find size of existing cluster.
		 */
		if ((lbn + 1) * bp->b_bcount != filesize &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) ||
		     bp->b_blkno == -1)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = maxclen;
		if (maxclen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}


/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block (if last_bp == NULL).
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	caddr_t	cp;
	int i, s;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_wbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
redo:
	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
		++start_lbn;
		--len;
	}

	/* Get more memory for current buffer */
	if (len <= 1) {
		if (last_bp) {
			bawrite(last_bp);
		} else if (len) {
			bp = getblk(vp, start_lbn, size, 0, 0);
			bawrite(bp);
		}
		return;
	}

	bp = getblk(vp, start_lbn, size, 0, 0);
	if (!(bp->b_flags & B_DELWRI)) {
		++start_lbn;
		--len;
		brelse(bp);
		goto redo;
	}

	/*
	 * Extra memory in the buffer, punt on this buffer.
	 * XXX we could handle this in most cases, but we would have to
	 * push the extra memory down to after our max possible cluster
	 * size and then potentially pull it back up if the cluster was
	 * terminated prematurely--too much hassle.
	 */
	if (bp->b_bcount != bp->b_bufsize) {
		++start_lbn;
		--len;
		bawrite(bp);
		goto redo;
	}

	--len;
	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bcount = bp->b_bcount;
	b_save->bs_bufsize = bp->b_bufsize;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	bp->b_flags |= B_CALL;
	bp->b_iodone = cluster_callback;
	cp = (char *)bp->b_data + size;
	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
		/*
		 * Block is not in core or the non-sequential block
		 * ending our cluster was part of the cluster (in which
		 * case we don't want to write it twice).
		 */
		if (!incore(vp, start_lbn) ||
		    last_bp == NULL && start_lbn == lbn)
			break;

		/*
		 * Get the desired block buffer (unless it is the final
		 * sequential block whose buffer was passed in explicitly
		 * as last_bp).
		 */
		if (last_bp == NULL || start_lbn != lbn) {
			tbp = getblk(vp, start_lbn, size, 0, 0);
			if (!(tbp->b_flags & B_DELWRI)) {
				brelse(tbp);
				break;
			}
		} else
			tbp = last_bp;

		++b_save->bs_nchildren;

		/* Move memory from children to parent */
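		/*
		 * Sanity check: the child must be physically adjacent to
		 * the data gathered so far, i.e. its disk address must be
		 * the parent's starting address plus the number of device
		 * (DEV_BSIZE, normally 512-byte) blocks accumulated so far.
		 */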
		if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize))) {
			printf("Clustered Block: %d addr %x bufsize: %d\n",
			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
			    tbp->b_blkno);
			panic("Clustered write to wrong blocks");
		}

		pagemove(tbp->b_data, cp, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;

		tbp->b_bufsize -= size;
		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= (B_ASYNC | B_AGE);
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;

		cp += size;
	}

	if (i == 0) {
		/* None to cluster */
		bp->b_saveaddr = b_save->bs_saveaddr;
		bp->b_flags &= ~B_CALL;
		bp->b_iodone = NULL;
		free(b_save, M_SEGMENT);
	}
	bawrite(bp);
	if (i < len) {
		len -= i + 1;
		start_lbn += 1;
		goto redo;
	}
}

/*
 * Collect together all the buffers in a cluster.
 * Plus add one additional buffer.
 */
struct cluster_save *
cluster_collectbufs(vp, last_bp)
	struct vnode *vp;
	struct buf *last_bp;
{
	struct cluster_save *buflist;
	daddr_t lbn;
	int i, len;

	len = vp->v_lastw - vp->v_cstart + 1;
	buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
	    M_SEGMENT, M_WAITOK);
	buflist->bs_nchildren = 0;
	buflist->bs_children = (struct buf **)(buflist + 1);
	for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
		(void)bread(vp, lbn, last_bp->b_bcount, NOCRED,
		    &buflist->bs_children[i]);
	buflist->bs_children[i] = last_bp;
	buflist->bs_nchildren = i + 1;
	return (buflist);
}