/*-
 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 *
 *	@(#)vfs_bio.c	7.58 (Berkeley) 02/02/93
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/resourcevar.h>
#include <sys/malloc.h>
#include <libkern/libkern.h>

/*
 * Definitions for the buffer hash lists.
 */
#define	BUFHASH(dvp, lbn)	\
	(&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
struct	list_entry *bufhashtbl, invalhash;
u_long	bufhash;

/*
 * Insq/Remq for the buffer hash lists.
 */
#define	binshash(bp, dp)	list_enter_head(dp, bp, struct buf *, b_hash)
#define	bremhash(bp)		list_remove(bp, struct buf *, b_hash)

/*
 * Definitions for the buffer free lists.
 */
#define	BQUEUES		4		/* number of free buffer queues */

#define	BQ_LOCKED	0		/* super-blocks &c */
#define	BQ_LRU		1		/* lru, useful buffers */
#define	BQ_AGE		2		/* rubbish */
#define	BQ_EMPTY	3		/* buffer headers with no memory */

struct queue_entry bufqueues[BQUEUES];
int needbuffer;

/*
 * Insq/Remq for the buffer free lists.
 */
#define	binsheadfree(bp, dp) \
	queue_enter_head(dp, bp, struct buf *, b_freelist)
#define	binstailfree(bp, dp) \
	queue_enter_tail(dp, bp, struct buf *, b_freelist)

/*
 * Local declarations
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
	    daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
	    daddr_t, daddr_t, long, int, long));
void	    cluster_wbuild __P((struct vnode *, struct buf *, long size,
	    daddr_t start_lbn, int len, daddr_t lbn));

void
bremfree(bp)
	struct buf *bp;
{
	struct queue_entry *dp;

	/*
	 * We only calculate the head of the freelist when removing
	 * the last element of the list as that is the only time that
	 * it is needed (e.g. to reset the tail pointer).
	 */
	if (bp->b_freelist.qe_next == NULL) {
		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
			if (dp->qe_prev == &bp->b_freelist.qe_next)
				break;
		if (dp == &bufqueues[BQUEUES])
			panic("bremfree: lost tail");
	}
	queue_remove(dp, bp, struct buf *, b_freelist);
}

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
	register struct buf *bp;
	struct queue_entry *dp;
	register int i;
	int base, residual;

	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
		queue_init(dp);
	bufhashtbl = (struct list_entry *)hashinit(nbuf, M_CACHE, &bufhash);
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero((char *)bp, sizeof *bp);
		bp->b_dev = NODEV;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_un.b_addr = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		bp->b_flags = B_INVAL;
		dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
		binsheadfree(bp, dp);
		binshash(bp, &invalhash);
	}
}

/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size, 0, 0);
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp, size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp, size), blkno);
	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}

/*
 * Operates like bread, but also starts I/O on the N specified
 * read-ahead blocks.
 */
breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno[]; int rabsize[];
	int num;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp, *rabp;
	register int i;

	bp = NULL;
	/*
	 * If the block is not memory resident,
	 * allocate a buffer and start I/O.
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size, 0, 0);
		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breadn");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp, size), blkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
		} else {
			trace(TR_BREADHIT, pack(vp, size), blkno);
		}
	}

	/*
	 * If there's read-ahead block(s), start I/O
	 * on them also (as above).
	 */
	for (i = 0; i < num; i++) {
		if (incore(vp, rablkno[i]))
			continue;
		rabp = getblk(vp, rablkno[i], rabsize[i], 0, 0);
		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
		} else {
			rabp->b_flags |= B_ASYNC | B_READ;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was memory resident, let bread get it.
	 * If block was not memory resident, the read was
	 * started above, so just wait for the read to complete.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}

/*
 * We could optimize this by keeping track of where the last read-ahead
 * was, but it would involve adding fields to the vnode. For now, let's
 * just get it working.
 *
 * This replaces bread. If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential. After that, we'll do regular read ahead
 * in clustered chunks.
 *
 * There are 4 or 5 cases depending on how you count:
 *	Desired block is in the cache:
 *	    1 Not sequential access (0 I/Os).
 *	    2 Access is sequential, do read-ahead (1 ASYNC).
 *	Desired block is not in cache:
 *	    3 Not sequential access (1 SYNC).
 *	    4 Sequential access, next block is contiguous (1 SYNC).
 *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 *	bp is the block requested.
 *	rbp is the read-ahead block.
 *	If either is NULL, then you don't have to do the I/O.
 */
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, ioblkno;
	long flags;
	int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
	if (size == 0)
		panic("cluster_read: size = 0");
#endif

	error = 0;
	flags = B_READ;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	if (bp->b_flags & (B_CACHE | B_DONE | B_DELWRI)) {
		/*
		 * Desired block is in cache; do any readahead ASYNC.
		 * Case 1, 2.
		 */
		trace(TR_BREADHIT, pack(vp, size), lblkno);
		flags |= B_ASYNC;
		ioblkno = lblkno +
		    (lblkno < vp->v_ralen ? vp->v_ralen >> 1 : vp->v_ralen);
		alreadyincore = (int)incore(vp, ioblkno);
		bp = NULL;
	} else {
		/* Block wasn't in cache, case 3, 4, 5. */
		trace(TR_BREADMISS, pack(vp, size), lblkno);
		ioblkno = lblkno;
		bp->b_flags |= flags;
		alreadyincore = 0;
		curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
	}
	/*
	 * XXX
	 * Replace 1 with a window size based on some permutation of
	 * maxcontig and rot_delay. This will let you figure out how
	 * many blocks you should read-ahead (case 2, 4, 5).
	 *
	 * If the access isn't sequential, cut the window size in half.
	 */
	rbp = NULL;
	if (lblkno != vp->v_lastr + 1 && lblkno != 0)
		vp->v_ralen = max(vp->v_ralen >> 1, 1);
	else if ((ioblkno + 1) * size < filesize && !alreadyincore &&
	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra))) {
		/*
		 * Reading sequentially, and the next block is not in the
		 * cache. We are going to try reading ahead. If this is
		 * the first read of a file, then limit read-ahead to a
		 * single block, else read as much as we're allowed.
		 */
		if (num_ra > vp->v_ralen) {
			num_ra = vp->v_ralen;
			vp->v_ralen = min(MAXPHYS / size, vp->v_ralen << 1);
		} else
			vp->v_ralen = num_ra + 1;

		if (num_ra)			/* case 2, 4 */
			rbp = cluster_rbuild(vp, filesize,
			    bp, ioblkno, blkno, size, num_ra, flags);
		else if (lblkno != 0 && ioblkno == lblkno) {
			/* Case 5: check how many blocks to read ahead */
			++ioblkno;
			if ((ioblkno + 1) * size > filesize ||
			    (error = VOP_BMAP(vp,
			    ioblkno, NULL, &blkno, &num_ra)))
				goto skip_readahead;
			flags |= B_ASYNC;
			if (num_ra)
				rbp = cluster_rbuild(vp, filesize,
				    NULL, ioblkno, blkno, size, num_ra, flags);
			else {
				rbp = getblk(vp, ioblkno, size, 0, 0);
				rbp->b_flags |= flags;
				rbp->b_blkno = blkno;
			}
		} else if (lblkno != 0) {
			/* case 2; read ahead single block */
			rbp = getblk(vp, ioblkno, size, 0, 0);
			rbp->b_flags |= flags;
			rbp->b_blkno = blkno;
		} else if (bp)			/* case 1, 3, block 0 */
			bp->b_blkno = blkno;
		/* Case 1 on block 0; not really doing sequential I/O */

		if (rbp == bp)			/* case 4 */
			rbp = NULL;
		else if (rbp) {			/* case 2, 5 */
			trace(TR_BREADMISSRA,
			    pack(vp, (num_ra + 1) * size), ioblkno);
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}

	/* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
	if (bp)
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else
			error = VOP_STRATEGY(bp);

	if (rbp)
		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else
			(void) VOP_STRATEGY(rbp);

	if (bp)
		return(biowait(bp));
	return(error);
}

/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead. We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;
	u_quad_t filesize;
	struct buf *bp;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn;
	int i, inc;

	if (size * (lbn + run + 1) > filesize)
		--run;
	if (run == 0) {
		if (!bp) {
			bp = getblk(vp, lbn, size, 0, 0);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return(bp);
	}

	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
	if (bp->b_flags & (B_DONE | B_DELWRI))
		return (bp);

	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bufsize = b_save->bs_bcount = size;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	inc = size / DEV_BSIZE;
	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
		if (incore(vp, lbn + i)) {
			if (i == 1) {
				bp->b_saveaddr = b_save->bs_saveaddr;
				bp->b_flags &= ~B_CALL;
				bp->b_iodone = NULL;
				allocbuf(bp, size);
				free(b_save, M_SEGMENT);
			} else
				allocbuf(bp, size * i);
			break;
		}
		tbp = getblk(vp, lbn + i, 0, 0, 0);
		tbp->b_bcount = tbp->b_bufsize = size;
		tbp->b_blkno = bn;
		tbp->b_flags |= flags | B_READ | B_ASYNC;
		++b_save->bs_nchildren;
		b_save->bs_children[i - 1] = tbp;
	}
	if (!(bp->b_flags & B_ASYNC))
		vp->v_ralen = max(vp->v_ralen - 1, 1);
	return(bp);
}

/*
 * Either get a new buffer or grow the existing one.
 */
struct buf *
cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
	struct vnode *vp;
	struct buf *bp;
	long flags;
	daddr_t blkno;
	daddr_t lblkno;
	long size;
	int run;
{
	if (!bp) {
		bp = getblk(vp, lblkno, size, 0, 0);
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			bp->b_blkno = blkno;
			return(bp);
		}
	}
	allocbuf(bp, run * size);
	bp->b_blkno = blkno;
	bp->b_iodone = cluster_callback;
	bp->b_flags |= flags | B_CALL;
	return(bp);
}

/*
 * Cleanup after a clustered read or write.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct cluster_save *b_save;
	struct buf **tbp;
	long bsize;
	caddr_t cp;

	b_save = (struct cluster_save *)(bp->b_saveaddr);
	bp->b_saveaddr = b_save->bs_saveaddr;

	cp = bp->b_un.b_addr + b_save->bs_bufsize;
	for (tbp = b_save->bs_children; b_save->bs_nchildren--; ++tbp) {
		pagemove(cp, (*tbp)->b_un.b_addr, (*tbp)->b_bufsize);
		cp += (*tbp)->b_bufsize;
		bp->b_bufsize -= (*tbp)->b_bufsize;
		biodone(*tbp);
	}
#ifdef DIAGNOSTIC
	if (bp->b_bufsize != b_save->bs_bufsize)
		panic("cluster_callback: more space to reclaim");
#endif
	bp->b_bcount = bp->b_bufsize;
	bp->b_iodone = NULL;
	free(b_save, M_SEGMENT);
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else
		wakeup((caddr_t)bp);
}

/*
 * Synchronous write.
 * Release buffer on completion.
 */
bwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */
	register int flag;
	int s, error = 0;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if (flag & B_ASYNC) {
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
	}
	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	s = splbio();
	bp->b_vp->v_numoutput++;
	bp->b_flags |= B_WRITEINPROG;
	splx(s);
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await I/O completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting I/O completion status.
	 */
	if ((flag & B_ASYNC) == 0) {
		error = biowait(bp);
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
		if (bp->b_flags & B_EINTR) {
			bp->b_flags &= ~B_EINTR;
			error = EINTR;
		}
		brelse(bp);
	} else if (flag & B_DELWRI) {
		s = splbio();
		bp->b_flags |= B_AGE;
		splx(s);
	}
	return (error);
}

/*
 * Common vnode bwrite operation: simply pass the buffer to bwrite().
 */
int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}

/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
	}
	/*
	 * If this is a tape drive, the write must be initiated.
	 */
	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
		bawrite(bp);
	} else {
		bp->b_flags |= (B_DONE | B_DELWRI);
		brelse(bp);
	}
}

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
bawrite(bp)
	register struct buf *bp;
{

	/*
	 * Setting the ASYNC flag causes bwrite to return
	 * after starting the I/O.
	 */
	bp->b_flags |= B_ASYNC;
	(void) VOP_BWRITE(bp);
}

/*
 * Do clustered write for FFS.
 *
 * Three cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2. beginning of cluster - begin cluster
 *	3. middle of a cluster - add to cluster
 *	4. end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int clen, error, maxrun;

	vp = bp->b_vp;
	lbn = bp->b_lblkno;
	clen = 0;

	/*
	 * Handle end of file first. If we are appending, we need to check
	 * if the current block was allocated contiguously. If it wasn't,
	 * then we need to fire off a previous cluster if it existed.
	 * Additionally, when we're appending, we need to figure out how
	 * to initialize vp->v_clen.
	 */
	if ((lbn + 1) * bp->b_bcount == filesize) {
		if (bp->b_blkno != vp->v_lasta + bp->b_bcount / DEV_BSIZE) {
			/* This block was not allocated contiguously */
			if (vp->v_clen)
				cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
				    vp->v_lastw - vp->v_cstart + 1, lbn);
			vp->v_cstart = lbn;
			clen = vp->v_clen =
			    MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
			/*
			 * Next cluster started. Write this buffer and return.
			 */
			vp->v_lastw = lbn;
			vp->v_lasta = bp->b_blkno;
			bdwrite(bp);
			return;
		}
		vp->v_lasta = bp->b_blkno;
	} else if (lbn == 0) {
		vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
	}
	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1) {
		if (vp->v_clen != 0)
			/*
			 * Write is not sequential.
			 */
			cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
			    vp->v_lastw - vp->v_cstart + 1, lbn);
		/*
		 * Consider beginning a cluster.
		 */
		if (error = VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &clen)) {
			bawrite(bp);
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = clen;
		if (clen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
}


/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed. Check to see that it doesn't fall in the middle of
 * the current block.
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	caddr_t cp;
	int i, s;

redo:
	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
		++start_lbn;
		--len;
	}

	/* Get more memory for current buffer */
	if (len <= 1) {
		if (last_bp)
			bawrite(last_bp);
		return;
	}

	bp = getblk(vp, start_lbn, size, 0, 0);
	if (!(bp->b_flags & B_DELWRI)) {
		++start_lbn;
		--len;
		brelse(bp);
		goto redo;
	}

	--len;
	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bcount = bp->b_bcount;
	b_save->bs_bufsize = bp->b_bufsize;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	bp->b_flags |= B_CALL;
	bp->b_iodone = cluster_callback;
	cp = bp->b_un.b_addr + bp->b_bufsize;
	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
		if (!incore(vp, start_lbn) || start_lbn == lbn)
			break;

		if (last_bp == NULL || start_lbn != last_bp->b_lblkno) {
			tbp = getblk(vp, start_lbn, size, 0, 0);
#ifdef DIAGNOSTIC
			if (tbp->b_bcount != tbp->b_bufsize)
				panic("cluster_wbuild: Buffer too big");
#endif
			if (!(tbp->b_flags & B_DELWRI)) {
				brelse(tbp);
				break;
			}
		} else
			tbp = last_bp;

		++b_save->bs_nchildren;

		/* Move memory from children to parent */
		pagemove(tbp->b_un.b_daddr, cp, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;

		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= B_ASYNC;
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);	/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;

		cp += tbp->b_bufsize;
	}

	if (i == 0) {
		/* None to cluster */
		bp->b_saveaddr = b_save->bs_saveaddr;
		bp->b_flags &= ~B_CALL;
		bp->b_iodone = NULL;
		free(b_save, M_SEGMENT);
	}
	bawrite(bp);
	if (i < len) {
		len -= i + 1;
		start_lbn += 1;
		goto redo;
	}
}

/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct queue_entry *flist;
	int s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags & B_WANTED)
		wakeup((caddr_t)bp);
	if (needbuffer) {
		needbuffer = 0;
		wakeup((caddr_t)&needbuffer);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	s = splbio();
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;
	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE | B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bufqueues[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bufqueues[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bufqueues[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bufqueues[BQ_AGE];
		else
			flist = &bufqueues[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
	splx(s);
}

/*
 * Check to see if a block is currently memory resident.
 */
struct buf *
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;

	for (bp = BUFHASH(vp, blkno)->le_next; bp; bp = bp->b_hash.qe_next)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (bp);
	return (NULL);
}

/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it. If it is not resident,
 * allocate a new buffer and assign it to the block.
 */
struct buf *
getblk(vp, blkno, size, slpflag, slptimeo)
	register struct vnode *vp;
	daddr_t blkno;
	int size, slpflag, slptimeo;
{
	register struct buf *bp;
	struct list_entry *dp;
	int s, error;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block. If the buffer is found,
	 * but it is currently locked, then we must wait for it to
	 * become available.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->le_next; bp; bp = bp->b_hash.qe_next) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp)
			continue;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			error = tsleep((caddr_t)bp, slpflag | (PRIBIO + 1),
				"getblk", slptimeo);
			splx(s);
			if (error)
				return (NULL);
			goto loop;
		}
		/*
		 * The test for B_INVAL is moved down here, since there
		 * are cases where B_INVAL is set before VOP_BWRITE() is
		 * called and for NFS, the process cannot be allowed to
		 * allocate a new buffer for the same block until the write
		 * back to the server has been completed. (ie. B_BUSY clears)
		 */
		if (bp->b_flags & B_INVAL) {
			splx(s);
			continue;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
			printf("getblk: stray size");
			bp->b_flags |= B_INVAL;
			VOP_BWRITE(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	/*
	 * The loop back to the top when getnewbuf() fails is because
	 * stateless filesystems like NFS have no node locks. Thus,
	 * there is a slight chance that more than one process will
	 * try and getnewbuf() for the same block concurrently when
	 * the first sleeps in getnewbuf(). So after a sleep, go back
	 * up to the top to check the hash lists again.
	 */
	if ((bp = getnewbuf(slpflag, slptimeo)) == 0)
		goto loop;
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Allocate a buffer.
 * The caller will assign it to a block.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	while ((bp = getnewbuf(0, 0)) == NULL)
		/* void */;
	bp->b_flags |= B_INVAL;
	bremhash(bp);
	binshash(bp, &invalhash);
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	allocbuf(bp, size);
	return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		if ((ep = bufqueues[BQ_EMPTY].qe_next) == NULL)
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed. Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		while ((bp = getnewbuf(0, 0)) == NULL)
			/* void */;
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			bremhash(bp);
			binshash(bp, &invalhash);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf(slpflag, slptimeo)
	int slpflag, slptimeo;
{
	register struct buf *bp;
	register struct queue_entry *dp;
	register struct ucred *cred;
	int s;

loop:
	s = splbio();
	for (dp = &bufqueues[BQ_AGE]; dp > bufqueues; dp--)
		if (dp->qe_next)
			break;
	if (dp == bufqueues) {		/* no free blocks */
		needbuffer = 1;
		(void) tsleep((caddr_t)&needbuffer, slpflag | (PRIBIO + 1),
			"getnewbuf", slptimeo);
		splx(s);
		return (NULL);
	}
	bp = dp->qe_next;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so. Otherwise, awaken waiting processes.
 */
void
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

/*
 * Count the number of buffers on the LOCKED free list.
 */
int
count_lock_queue()
{
	register struct buf *bp;
	register int ret;

	for (ret = 0, bp = (struct buf *)bufqueues[BQ_LOCKED].qe_next;
	    bp; bp = (struct buf *)bp->b_freelist.qe_next)
		++ret;
	return(ret);
}

#ifdef DIAGNOSTIC
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * above.
 */
void
vfs_bufstats()
{
	int s, i, j, count;
	register struct buf *bp;
	register struct queue_entry *dp;
	int counts[MAXBSIZE/CLBYTES+1];
	static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" };

	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
		count = 0;
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			counts[j] = 0;
		s = splbio();
		for (bp = dp->qe_next; bp; bp = bp->b_freelist.qe_next) {
			counts[bp->b_bufsize/CLBYTES]++;
			count++;
		}
		splx(s);
		printf("%s: total-%d", bname[i], count);
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			if (counts[j] != 0)
				printf(", %d-%d", j * CLBYTES, counts[j]);
		printf("\n");
	}
}
#endif /* DIAGNOSTIC */