/*-
 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 *
 *	@(#)vfs_cluster.c	7.59 (Berkeley) 05/10/93
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/resourcevar.h>
#include <sys/malloc.h>
#include <libkern/libkern.h>

/*
 * Definitions for the buffer hash lists.
 */
#define	BUFHASH(dvp, lbn)	\
	(&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
struct	list_entry *bufhashtbl, invalhash;
u_long	bufhash;

/*
 * Insq/Remq for the buffer hash lists.
 */
#define	binshash(bp, dp)	list_enter_head(dp, bp, struct buf *, b_hash)
#define	bremhash(bp)		list_remove(bp, struct buf *, b_hash)

/*
 * Definitions for the buffer free lists.
 */
#define	BQUEUES		4		/* number of free buffer queues */

#define	BQ_LOCKED	0		/* super-blocks &c */
#define	BQ_LRU		1		/* lru, useful buffers */
#define	BQ_AGE		2		/* rubbish */
#define	BQ_EMPTY	3		/* buffer headers with no memory */

struct queue_entry bufqueues[BQUEUES];
int needbuffer;

/*
 * Insq/Remq for the buffer free lists.
 */
#define	binsheadfree(bp, dp) \
	queue_enter_head(dp, bp, struct buf *, b_freelist)
#define	binstailfree(bp, dp) \
	queue_enter_tail(dp, bp, struct buf *, b_freelist)

/*
 * Local declarations
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
	    daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
	    daddr_t, daddr_t, long, int, long));
void	    cluster_wbuild __P((struct vnode *, struct buf *, long size,
	    daddr_t start_lbn, int len, daddr_t lbn));

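/*
 * Remove a buffer from the free list it is on.
 */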
void
bremfree(bp)
	struct buf *bp;
{
	struct queue_entry *dp;

	/*
	 * We only calculate the head of the freelist when removing
	 * the last element of the list as that is the only time that
	 * it is needed (e.g. to reset the tail pointer).
	 */
	if (bp->b_freelist.qe_next == NULL) {
		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
			if (dp->qe_prev == &bp->b_freelist.qe_next)
				break;
		if (dp == &bufqueues[BQUEUES])
			panic("bremfree: lost tail");
	}
	queue_remove(dp, bp, struct buf *, b_freelist);
}

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
	register struct buf *bp;
	struct queue_entry *dp;
	register int i;
	int base, residual;

	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
		queue_init(dp);
	bufhashtbl = (struct list_entry *)hashinit(nbuf, M_CACHE, &bufhash);
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero((char *)bp, sizeof *bp);
		bp->b_dev = NODEV;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_un.b_addr = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		bp->b_flags = B_INVAL;
		dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
		binsheadfree(bp, dp);
		binshash(bp, &invalhash);
	}
}

/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size, 0, 0);
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp, size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp, size), blkno);
	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}

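/*
 * Typical usage (illustrative sketch only; "ip" and "fs" stand for a
 * caller's inode and filesystem pointers and are not defined here):
 *
 *	struct buf *bp;
 *	int error;
 *
 *	if (error = bread(ITOV(ip), lbn, (int)fs->fs_bsize, NOCRED, &bp)) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	... examine bp->b_un.b_addr ...
 *	brelse(bp);
 */
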
/*
 * Operates like bread, but also starts I/O on the N specified
 * read-ahead blocks.
 */
breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno[]; int rabsize[];
	int num;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp, *rabp;
	register int i;

	bp = NULL;
	/*
	 * If the block is not memory resident,
	 * allocate a buffer and start I/O.
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size, 0, 0);
		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breadn");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp, size), blkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
		} else {
			trace(TR_BREADHIT, pack(vp, size), blkno);
		}
	}

	/*
	 * If there's read-ahead block(s), start I/O
	 * on them also (as above).
	 */
	for (i = 0; i < num; i++) {
		if (incore(vp, rablkno[i]))
			continue;
		rabp = getblk(vp, rablkno[i], rabsize[i], 0, 0);
		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
		} else {
			rabp->b_flags |= B_ASYNC | B_READ;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was memory resident, let bread get it.
	 * If block was not memory resident, the read was
	 * started above, so just wait for the read to complete.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}

/*
 * We could optimize this by keeping track of where the last read-ahead
 * was, but it would involve adding fields to the vnode.  For now, let's
 * just get it working.
 *
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *
 * There are 4 or 5 cases depending on how you count:
 *	Desired block is in the cache:
 *	    1 Not sequential access (0 I/Os).
 *	    2 Access is sequential, do read-ahead (1 ASYNC).
 *	Desired block is not in cache:
 *	    3 Not sequential access (1 SYNC).
 *	    4 Sequential access, next block is contiguous (1 SYNC).
 *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 *	bp is the block requested.
 *	rbp is the read-ahead block.
 *	If either is NULL, then you don't have to do the I/O.
 */
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, ioblkno;
	long flags;
	int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
	if (size == 0)
		panic("cluster_read: size = 0");
#endif

	error = 0;
	flags = B_READ;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	if (bp->b_flags & (B_CACHE | B_DONE | B_DELWRI)) {
		/*
		 * Desired block is in cache; do any readahead ASYNC.
		 * Case 1, 2.
		 */
		trace(TR_BREADHIT, pack(vp, size), lblkno);
		flags |= B_ASYNC;
		ioblkno = lblkno +
		    (lblkno < vp->v_ralen ? vp->v_ralen >> 1 : vp->v_ralen);
		alreadyincore = (int)incore(vp, ioblkno);
		bp = NULL;
	} else {
		/* Block wasn't in cache, case 3, 4, 5. */
		trace(TR_BREADMISS, pack(vp, size), lblkno);
		ioblkno = lblkno;
		bp->b_flags |= flags;
		alreadyincore = 0;
		curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
	}
	/*
	 * XXX
	 * Replace 1 with a window size based on some permutation of
	 * maxcontig and rot_delay.  This will let you figure out how
	 * many blocks you should read-ahead (case 2, 4, 5).
	 *
	 * If the access isn't sequential, cut the window size in half.
	 */
	rbp = NULL;
	if (lblkno != vp->v_lastr + 1 && lblkno != 0)
		vp->v_ralen = max(vp->v_ralen >> 1, 1);
	else if ((ioblkno + 1) * size < filesize && !alreadyincore &&
	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra))) {
		/*
		 * Reading sequentially, and the next block is not in the
		 * cache.  We are going to try reading ahead.  If this is
		 * the first read of a file, then limit read-ahead to a
		 * single block, else read as much as we're allowed.
		 */
		if (num_ra > vp->v_ralen) {
			num_ra = vp->v_ralen;
			vp->v_ralen = min(MAXPHYS / size, vp->v_ralen << 1);
		} else
			vp->v_ralen = num_ra + 1;

		if (num_ra)			/* case 2, 4 */
			rbp = cluster_rbuild(vp, filesize,
			    bp, ioblkno, blkno, size, num_ra, flags);
		else if (lblkno != 0 && ioblkno == lblkno) {
			/* Case 5: check how many blocks to read ahead */
			++ioblkno;
			if ((ioblkno + 1) * size > filesize ||
			    (error = VOP_BMAP(vp,
			    ioblkno, NULL, &blkno, &num_ra)))
				goto skip_readahead;
			flags |= B_ASYNC;
			if (num_ra)
				rbp = cluster_rbuild(vp, filesize,
				    NULL, ioblkno, blkno, size, num_ra, flags);
			else {
				rbp = getblk(vp, ioblkno, size, 0, 0);
				rbp->b_flags |= flags;
				rbp->b_blkno = blkno;
			}
		} else if (lblkno != 0) {
			/* case 2; read ahead single block */
			rbp = getblk(vp, ioblkno, size, 0, 0);
			rbp->b_flags |= flags;
			rbp->b_blkno = blkno;
		} else if (bp)			/* case 1, 3, block 0 */
			bp->b_blkno = blkno;
		/* Case 1 on block 0; not really doing sequential I/O */

		if (rbp == bp)			/* case 4 */
			rbp = NULL;
		else if (rbp) {			/* case 2, 5 */
			trace(TR_BREADMISSRA,
			    pack(vp, (num_ra + 1) * size), ioblkno);
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}

/* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
	if (bp)
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else
			error = VOP_STRATEGY(bp);

	if (rbp)
		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else
			(void) VOP_STRATEGY(rbp);

	if (bp)
		return(biowait(bp));
	return(error);
}

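/*
 * Note on the read-ahead window (v_ralen) used above: when VOP_BMAP reports
 * at least v_ralen contiguous blocks, the window is doubled but clamped to
 * MAXPHYS / size so a cluster never exceeds one physical transfer; otherwise
 * it is set to num_ra + 1.  A non-sequential access halves it (never below
 * one block).  For example, with 8K blocks and a 64K MAXPHYS the window can
 * grow 1, 2, 4, 8 and then stays at 8 blocks.
 */
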
/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;
	u_quad_t filesize;
	struct buf *bp;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn;
	int i, inc;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
	if (size * (lbn + run + 1) > filesize)
		--run;
	if (run == 0) {
		if (!bp) {
			bp = getblk(vp, lbn, size, 0, 0);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return(bp);
	}

	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
	if (bp->b_flags & (B_DONE | B_DELWRI))
		return (bp);

	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bufsize = b_save->bs_bcount = size;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	inc = size / DEV_BSIZE;
	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
		if (incore(vp, lbn + i)) {
			if (i == 1) {
				bp->b_saveaddr = b_save->bs_saveaddr;
				bp->b_flags &= ~B_CALL;
				bp->b_iodone = NULL;
				allocbuf(bp, size);
				free(b_save, M_SEGMENT);
			} else
				allocbuf(bp, size * i);
			break;
		}
		tbp = getblk(vp, lbn + i, 0, 0, 0);
		tbp->b_bcount = tbp->b_bufsize = size;
		tbp->b_blkno = bn;
		tbp->b_flags |= flags | B_READ | B_ASYNC;
		++b_save->bs_nchildren;
		b_save->bs_children[i - 1] = tbp;
	}
	if (!(bp->b_flags & B_ASYNC))
		vp->v_ralen = max(vp->v_ralen - 1, 1);
	return(bp);
}

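/*
 * Note on the scheme shared by cluster_rbuild() above and cluster_callback()
 * below: the first buffer (the "parent") is grown to cover the entire
 * contiguous range, and a cluster_save structure listing the per-block
 * "child" buffers is hung off b_saveaddr.  One large I/O completes into the
 * parent; cluster_callback() then uses pagemove() to hand each child its
 * portion of the data and calls biodone() on the children individually.
 */
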
/*
 * Either get a new buffer or grow the existing one.
 */
struct buf *
cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
	struct vnode *vp;
	struct buf *bp;
	long flags;
	daddr_t blkno;
	daddr_t lblkno;
	long size;
	int run;
{
	if (!bp) {
		bp = getblk(vp, lblkno, size, 0, 0);
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			bp->b_blkno = blkno;
			return(bp);
		}
	}
	allocbuf(bp, run * size);
	bp->b_blkno = blkno;
	bp->b_iodone = cluster_callback;
	bp->b_flags |= flags | B_CALL;
	return(bp);
}

/*
 * Cleanup after a clustered read or write.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct cluster_save *b_save;
	struct buf **tbp;
	long bsize;
	caddr_t cp;

	b_save = (struct cluster_save *)(bp->b_saveaddr);
	bp->b_saveaddr = b_save->bs_saveaddr;

	cp = bp->b_un.b_addr + b_save->bs_bufsize;
	for (tbp = b_save->bs_children; b_save->bs_nchildren--; ++tbp) {
		pagemove(cp, (*tbp)->b_un.b_addr, (*tbp)->b_bufsize);
		cp += (*tbp)->b_bufsize;
		bp->b_bufsize -= (*tbp)->b_bufsize;
		biodone(*tbp);
	}
#ifdef DIAGNOSTIC
	if (bp->b_bufsize != b_save->bs_bufsize)
		panic("cluster_callback: more space to reclaim");
#endif
	bp->b_bcount = bp->b_bufsize;
	bp->b_iodone = NULL;
	free(b_save, M_SEGMENT);
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else
		wakeup((caddr_t)bp);
}

/*
 * Synchronous write.
 * Release buffer on completion.
 */
bwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */
	register int flag;
	int s, error = 0;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if (flag & B_ASYNC) {
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
	}
	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	s = splbio();
	bp->b_vp->v_numoutput++;
	bp->b_flags |= B_WRITEINPROG;
	splx(s);
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await I/O completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting I/O completion status.
	 */
	if ((flag & B_ASYNC) == 0) {
		error = biowait(bp);
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
		if (bp->b_flags & B_EINTR) {
			bp->b_flags &= ~B_EINTR;
			error = EINTR;
		}
		brelse(bp);
	} else if (flag & B_DELWRI) {
		s = splbio();
		bp->b_flags |= B_AGE;
		splx(s);
	}
	return (error);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}

/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		p->p_stats->p_ru.ru_oublock++;		/* no one paid yet */
	}
	/*
	 * If this is a tape drive, the write must be initiated.
	 */
	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
		bawrite(bp);
	} else {
		bp->b_flags |= (B_DONE | B_DELWRI);
		brelse(bp);
	}
}

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
bawrite(bp)
	register struct buf *bp;
{

	/*
	 * Setting the ASYNC flag causes bwrite to return
	 * after starting the I/O.
	 */
	bp->b_flags |= B_ASYNC;
	(void) VOP_BWRITE(bp);
}

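/*
 * Illustrative caller (sketch only; "ip", "fs", "ioflag", and "xfersize"
 * belong to a hypothetical filesystem write path, not to this file): once a
 * logical block has been filled, a filesystem typically picks one of the
 * routines above or cluster_write() below, roughly as
 *
 *	if (ioflag & IO_SYNC)
 *		error = bwrite(bp);
 *	else if (xfersize == fs->fs_bsize)
 *		cluster_write(bp, ip->i_size);
 *	else
 *		bdwrite(bp);
 */
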
/*
 * Do clustered write for FFS.
 *
 * Three cases:
 *	1.	Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int clen;

	vp = bp->b_vp;
	lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + bp->b_bcount / DEV_BSIZE)) {
		if (vp->v_clen != 0)
			/*
			 * Write is not sequential.
			 */
			cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
			    vp->v_lastw - vp->v_cstart + 1, lbn);
		/*
		 * Consider beginning a cluster.
		 */
		if ((lbn + 1) * bp->b_bcount == filesize)
			/* End of file, make cluster as large as possible */
			clen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
		else if (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &clen)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		} else
			clen = 0;
		vp->v_clen = clen;
		if (clen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}

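/*
 * Note on the sequentiality test in cluster_write() above: a write extends
 * the current cluster only if its logical block follows the previous one
 * (lbn == v_lastw + 1) and its disk address immediately follows the last
 * physical block written (b_blkno == v_lasta + b_bcount / DEV_BSIZE).  With
 * 8K blocks and 512-byte device sectors, for instance, consecutive members
 * of a cluster must be exactly 16 sectors apart on disk.
 */
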
/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block.
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	caddr_t cp;
	int i, s;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_wbuild: size %d != filesize %d\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
redo:
	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
		++start_lbn;
		--len;
	}

	/* Get more memory for current buffer */
	if (len <= 1) {
		if (last_bp) {
			bawrite(last_bp);
		} else if (len) {
			bp = getblk(vp, start_lbn, size, 0, 0);
			bawrite(bp);
		}
		return;
	}

	bp = getblk(vp, start_lbn, size, 0, 0);
	if (!(bp->b_flags & B_DELWRI)) {
		++start_lbn;
		--len;
		brelse(bp);
		goto redo;
	}

	--len;
	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bcount = bp->b_bcount;
	b_save->bs_bufsize = bp->b_bufsize;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	bp->b_flags |= B_CALL;
	bp->b_iodone = cluster_callback;
	cp = bp->b_un.b_addr + bp->b_bufsize;
	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
		if (!incore(vp, start_lbn) || start_lbn == lbn)
			break;

		if (last_bp == NULL || start_lbn != last_bp->b_lblkno) {
			tbp = getblk(vp, start_lbn, size, 0, 0);
#ifdef DIAGNOSTIC
			if (tbp->b_bcount != tbp->b_bufsize)
				panic("cluster_wbuild: Buffer too big");
#endif
			if (!(tbp->b_flags & B_DELWRI)) {
				brelse(tbp);
				break;
			}
		} else
			tbp = last_bp;

		++b_save->bs_nchildren;

		/* Move memory from children to parent */
		if (tbp->b_blkno != (bp->b_blkno + bp->b_bufsize / DEV_BSIZE)) {
			printf("Clustered Block: %d addr %x bufsize: %d\n",
			    bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
			printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
			    tbp->b_blkno);
			panic("Clustered write to wrong blocks");
		}

		pagemove(tbp->b_un.b_daddr, cp, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;

		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= B_ASYNC;
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;

		cp += tbp->b_bufsize;
	}

	if (i == 0) {
		/* None to cluster */
		bp->b_saveaddr = b_save->bs_saveaddr;
		bp->b_flags &= ~B_CALL;
		bp->b_iodone = NULL;
		free(b_save, M_SEGMENT);
	}
	bawrite(bp);
	if (i < len) {
		len -= i + 1;
		start_lbn += 1;
		goto redo;
	}
}

/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct queue_entry *flist;
	int s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags & B_WANTED)
		wakeup((caddr_t)bp);
	if (needbuffer) {
		needbuffer = 0;
		wakeup((caddr_t)&needbuffer);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	s = splbio();
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;
	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE | B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bufqueues[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bufqueues[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bufqueues[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bufqueues[BQ_AGE];
		else
			flist = &bufqueues[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
	splx(s);
}

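/*
 * Note on the cache lookup used by incore() and getblk() below: buffers are
 * hashed on the (vnode, logical block) pair via the BUFHASH() macro defined
 * at the top of this file, so a lookup scans only the single b_hash chain
 * whose index is ((int)vp / sizeof(*vp) + (int)lbn) & bufhash.
 */
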
/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;

	for (bp = BUFHASH(vp, blkno)->le_next; bp; bp = bp->b_hash.qe_next)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (bp);
	return (NULL);
}

/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it. If it is not resident,
 * allocate a new buffer and assign it to the block.
 */
struct buf *
getblk(vp, blkno, size, slpflag, slptimeo)
	register struct vnode *vp;
	daddr_t blkno;
	int size, slpflag, slptimeo;
{
	register struct buf *bp;
	struct list_entry *dp;
	int s, error;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block.  If the buffer is found,
	 * but it is currently locked, then we must wait for it to
	 * become available.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->le_next; bp; bp = bp->b_hash.qe_next) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp)
			continue;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1),
				"getblk", slptimeo);
			splx(s);
			if (error)
				return (NULL);
			goto loop;
		}
		/*
		 * The test for B_INVAL is moved down here, since there
		 * are cases where B_INVAL is set before VOP_BWRITE() is
		 * called and for NFS, the process cannot be allowed to
		 * allocate a new buffer for the same block until the write
		 * back to the server has been completed (i.e. B_BUSY clears).
		 */
		if (bp->b_flags & B_INVAL) {
			splx(s);
			continue;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
			printf("getblk: stray size");
			bp->b_flags |= B_INVAL;
			VOP_BWRITE(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	/*
	 * The loop back to the top when getnewbuf() fails is because
	 * stateless filesystems like NFS have no node locks.  Thus,
	 * there is a slight chance that more than one process will
	 * try and getnewbuf() for the same block concurrently when
	 * the first sleeps in getnewbuf().  So after a sleep, go back
	 * up to the top to check the hash lists again.
	 */
	if ((bp = getnewbuf(slpflag, slptimeo)) == 0)
		goto loop;
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Allocate a buffer.
 * The caller will assign it to a block.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	while ((bp = getnewbuf(0, 0)) == NULL)
		/* void */;
	bp->b_flags |= B_INVAL;
	bremhash(bp);
	binshash(bp, &invalhash);
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	allocbuf(bp, size);
	return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		if ((ep = bufqueues[BQ_EMPTY].qe_next) == NULL)
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed.  Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		while ((bp = getnewbuf(0, 0)) == NULL)
			/* void */;
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			bremhash(bp);
			binshash(bp, &invalhash);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf(slpflag, slptimeo)
	int slpflag, slptimeo;
{
	register struct buf *bp;
	register struct queue_entry *dp;
	register struct ucred *cred;
	int s;

loop:
	s = splbio();
	for (dp = &bufqueues[BQ_AGE]; dp > bufqueues; dp--)
		if (dp->qe_next)
			break;
	if (dp == bufqueues) {		/* no free blocks */
		needbuffer = 1;
		(void) tsleep((caddr_t)&needbuffer, slpflag | (PRIBIO + 1),
			"getnewbuf", slptimeo);
		splx(s);
		return (NULL);
	}
	bp = dp->qe_next;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

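/*
 * Note on I/O completion: when a driver finishes a transfer it calls
 * biodone() (below).  For a B_CALL buffer, such as a cluster parent set up
 * by cluster_newbuf() or cluster_wbuild(), biodone() runs the b_iodone
 * callback (cluster_callback); otherwise it releases an async buffer or
 * wakes the process sleeping in biowait().
 */
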
/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 */
void
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

int
count_lock_queue()
{
	register struct buf *bp;
	register int ret;

	for (ret = 0, bp = (struct buf *)bufqueues[BQ_LOCKED].qe_next;
	    bp; bp = (struct buf *)bp->b_freelist.qe_next)
		++ret;
	return(ret);
}

#ifdef DIAGNOSTIC
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * above.
 */
void
vfs_bufstats()
{
	int s, i, j, count;
	register struct buf *bp;
	register struct queue_entry *dp;
	int counts[MAXBSIZE/CLBYTES+1];
	static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" };

	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
		count = 0;
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			counts[j] = 0;
		s = splbio();
		for (bp = dp->qe_next; bp; bp = bp->b_freelist.qe_next) {
			counts[bp->b_bufsize/CLBYTES]++;
			count++;
		}
		splx(s);
		printf("%s: total-%d", bname[i], count);
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			if (counts[j] != 0)
				printf(", %d-%d", j * CLBYTES, counts[j]);
		printf("\n");
	}
}
#endif /* DIAGNOSTIC */