/*-
 * Copyright (c) 1982, 1986, 1989 The Regents of the University of California.
 * All rights reserved.
 *
 * This module is believed to contain source code proprietary to AT&T.
 * Use and redistribution is subject to the Berkeley Software License
 * Agreement and your Software Agreement with AT&T (Western Electric).
 *
 *	@(#)vfs_cluster.c	7.57 (Berkeley) 12/09/92
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/resourcevar.h>
#include <sys/malloc.h>
#include <libkern/libkern.h>

/*
 * Definitions for the buffer hash lists.
 */
#define	BUFHASH(dvp, lbn)	\
	(&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
struct	list_entry *bufhashtbl, invalhash;
u_long	bufhash;

/*
 * Insq/Remq for the buffer hash lists.
 */
#define	binshash(bp, dp)	list_enter_head(dp, bp, struct buf *, b_hash)
#define	bremhash(bp)		list_remove(bp, struct buf *, b_hash)

/*
 * Definitions for the buffer free lists.
 */
#define	BQUEUES		4		/* number of free buffer queues */

#define	BQ_LOCKED	0		/* super-blocks &c */
#define	BQ_LRU		1		/* lru, useful buffers */
#define	BQ_AGE		2		/* rubbish */
#define	BQ_EMPTY	3		/* buffer headers with no memory */

struct queue_entry bufqueues[BQUEUES];
int needbuffer;

/*
 * Insq/Remq for the buffer free lists.
 */
#define	binsheadfree(bp, dp) \
	queue_enter_head(dp, bp, struct buf *, b_freelist)
#define	binstailfree(bp, dp) \
	queue_enter_tail(dp, bp, struct buf *, b_freelist)

/*
 * Local declarations
 */
struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
	    daddr_t, long, int));
struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
	    daddr_t, daddr_t, long, int, long));
void	    cluster_wbuild __P((struct vnode *, struct buf *, long size,
	    daddr_t start_lbn, int len, daddr_t lbn));

void
bremfree(bp)
	struct buf *bp;
{
	struct queue_entry *dp;

	/*
	 * We only calculate the head of the freelist when removing
	 * the last element of the list as that is the only time that
	 * it is needed (e.g. to reset the tail pointer).
	 */
	if (bp->b_freelist.qe_next == NULL) {
		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
			if (dp->qe_prev == &bp->b_freelist.qe_next)
				break;
		if (dp == &bufqueues[BQUEUES])
			panic("bremfree: lost tail");
	}
	queue_remove(dp, bp, struct buf *, b_freelist);
}

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
	register struct buf *bp;
	struct queue_entry *dp;
	register int i;
	int base, residual;

	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
		queue_init(dp);
	bufhashtbl = (struct list_entry *)hashinit(nbuf, M_CACHE, &bufhash);
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero((char *)bp, sizeof *bp);
		bp->b_dev = NODEV;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_un.b_addr = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		bp->b_flags = B_INVAL;
		dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
		binsheadfree(bp, dp);
		binshash(bp, &invalhash);
	}
}

/*
 * Find the block in the buffer pool.
 * If the buffer is not present, allocate a new buffer and load
 * its contents according to the filesystem fill routine.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size);
	if (bp->b_flags & (B_DONE | B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp, size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp, size), blkno);
	p->p_stats->p_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}

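/*
 * Illustrative sketch (not part of the original source): a typical
 * caller of bread() in a filesystem read path.  "lbn" and "bsize" are
 * hypothetical caller-side values; on error the buffer is still
 * returned and must be released, per the convention used in this file.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	if (error = bread(vp, lbn, bsize, NOCRED, &bp)) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	(copy data out of bp->b_un.b_addr, then brelse(bp))
 */
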
/*
 * Operates like bread, but also starts I/O on the N specified
 * read-ahead blocks.
 */
breadn(vp, blkno, size, rablkno, rabsize, num, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno[]; int rabsize[];
	int num;
	struct ucred *cred;
	struct buf **bpp;
{
	struct proc *p = curproc;		/* XXX */
	register struct buf *bp, *rabp;
	register int i;

	bp = NULL;
	/*
	 * If the block is not memory resident,
	 * allocate a buffer and start I/O.
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size);
		if ((bp->b_flags & (B_DONE | B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breadn");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp, size), blkno);
			p->p_stats->p_ru.ru_inblock++;	/* pay for read */
		} else {
			trace(TR_BREADHIT, pack(vp, size), blkno);
		}
	}

	/*
	 * If there's read-ahead block(s), start I/O
	 * on them also (as above).
	 */
	for (i = 0; i < num; i++) {
		if (incore(vp, rablkno[i]))
			continue;
		rabp = getblk(vp, rablkno[i], rabsize[i]);
		if (rabp->b_flags & (B_DONE | B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(vp, rabsize[i]), rablkno[i]);
		} else {
			rabp->b_flags |= B_ASYNC | B_READ;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA, pack(vp, rabsize[i]), rablkno[i]);
			p->p_stats->p_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was memory resident, let bread get it.
	 * If block was not memory resident, the read was
	 * started above, so just wait for the read to complete.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}

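/*
 * Illustrative sketch (an assumption about the caller, not part of this
 * file): a filesystem read routine can pick among bread(), breadn() and
 * cluster_read() below, depending on whether clustering is enabled and
 * whether the access looks sequential.  "doclusterread", "ip", "lbn",
 * "nextlbn", "size" and "nextsize" are hypothetical caller-side names.
 *
 *	if (doclusterread)
 *		error = cluster_read(vp, ip->i_size, lbn, size, NOCRED, &bp);
 *	else if (lbn - 1 == vp->v_lastr)
 *		error = breadn(vp, lbn, size, &nextlbn, &nextsize, 1,
 *		    NOCRED, &bp);
 *	else
 *		error = bread(vp, lbn, size, NOCRED, &bp);
 */
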
/*
 * We could optimize this by keeping track of where the last read-ahead
 * was, but it would involve adding fields to the vnode.  For now, let's
 * just get it working.
 *
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *
 * There are 4 or 5 cases depending on how you count:
 *	Desired block is in the cache:
 *	    1 Not sequential access (0 I/Os).
 *	    2 Access is sequential, do read-ahead (1 ASYNC).
 *	Desired block is not in cache:
 *	    3 Not sequential access (1 SYNC).
 *	    4 Sequential access, next block is contiguous (1 SYNC).
 *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 *	bp is the block requested.
 *	rbp is the read-ahead block.
 *	If either is NULL, then you don't have to do the I/O.
 */
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	struct buf *bp, *rbp;
	daddr_t blkno, ioblkno;
	long flags;
	int error, num_ra, alreadyincore;

#ifdef DIAGNOSTIC
	if (size == 0)
		panic("cluster_read: size = 0");
#endif

	error = 0;
	flags = B_READ;
	*bpp = bp = getblk(vp, lblkno, size);
	if (bp->b_flags & (B_CACHE | B_DONE | B_DELWRI)) {
		/*
		 * Desired block is in cache; do any readahead ASYNC.
		 * Case 1, 2.
		 */
		trace(TR_BREADHIT, pack(vp, size), lblkno);
		flags |= B_ASYNC;
		ioblkno = lblkno +
		    (lblkno < vp->v_ralen ? vp->v_ralen >> 1 : vp->v_ralen);
		alreadyincore = incore(vp, ioblkno);
		bp = NULL;
	} else {
		/* Block wasn't in cache, case 3, 4, 5. */
		trace(TR_BREADMISS, pack(vp, size), lblkno);
		ioblkno = lblkno;
		bp->b_flags |= flags;
		alreadyincore = 0;
		curproc->p_stats->p_ru.ru_inblock++;		/* XXX */
	}
	/*
	 * XXX
	 * Replace 1 with a window size based on some permutation of
	 * maxcontig and rot_delay.  This will let you figure out how
	 * many blocks you should read-ahead (case 2, 4, 5).
	 *
	 * If the access isn't sequential, cut the window size in half.
	 */
	rbp = NULL;
	if (lblkno != vp->v_lastr + 1 && lblkno != 0)
		vp->v_ralen = max(vp->v_ralen >> 1, 1);
	else if ((ioblkno + 1) * size < filesize && !alreadyincore &&
	    !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra))) {
		/*
		 * Reading sequentially, and the next block is not in the
		 * cache.  We are going to try reading ahead.  If this is
		 * the first read of a file, then limit read-ahead to a
		 * single block, else read as much as we're allowed.
		 */
		if (num_ra > vp->v_ralen) {
			num_ra = vp->v_ralen;
			vp->v_ralen = min(MAXPHYS / size, vp->v_ralen << 1);
		} else
			vp->v_ralen = num_ra + 1;


		if (num_ra)				/* case 2, 4 */
			rbp = cluster_rbuild(vp, filesize,
			    bp, ioblkno, blkno, size, num_ra, flags);
		else if (lblkno != 0 && ioblkno == lblkno) {
			/* Case 5: check how many blocks to read ahead */
			++ioblkno;
			if ((ioblkno + 1) * size > filesize ||
			    (error = VOP_BMAP(vp,
			    ioblkno, NULL, &blkno, &num_ra)))
				goto skip_readahead;
			flags |= B_ASYNC;
			if (num_ra)
				rbp = cluster_rbuild(vp, filesize,
				    NULL, ioblkno, blkno, size, num_ra, flags);
			else {
				rbp = getblk(vp, ioblkno, size);
				rbp->b_flags |= flags;
				rbp->b_blkno = blkno;
			}
		} else if (lblkno != 0) {
			/* case 2; read ahead single block */
			rbp = getblk(vp, ioblkno, size);
			rbp->b_flags |= flags;
			rbp->b_blkno = blkno;
		} else if (bp)				/* case 1, 3, block 0 */
			bp->b_blkno = blkno;
		/* Case 1 on block 0; not really doing sequential I/O */

		if (rbp == bp)			/* case 4 */
			rbp = NULL;
		else if (rbp) {			/* case 2, 5 */
			trace(TR_BREADMISSRA,
			    pack(vp, (num_ra + 1) * size), ioblkno);
			curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
		}
	}

	/* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
	if (bp)
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE bp");
		else
			error = VOP_STRATEGY(bp);

	if (rbp)
		if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
			rbp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(rbp);
		} else
			(void) VOP_STRATEGY(rbp);

	if (bp)
		return(biowait(bp));
	return(error);
}

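/*
 * The cluster routines below hang their per-transfer state off
 * bp->b_saveaddr in a struct cluster_save (declared in <sys/buf.h>).
 * As a reading aid only, the layout assumed by this file is roughly:
 *
 *	struct cluster_save {
 *		long	bs_bcount;		transfer count
 *		long	bs_bufsize;		buffer size
 *		int	bs_nchildren;		number of component buffers
 *		struct buf **bs_children;	array of component buffers
 *		caddr_t	bs_saveaddr;		saved b_saveaddr of the parent
 *	};
 *
 * cluster_rbuild() and cluster_wbuild() allocate one of these (plus the
 * bs_children array) with malloc(M_SEGMENT); cluster_callback() frees it
 * when the clustered transfer completes.
 */
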
/*
 * If blocks are contiguous on disk, use this to provide clustered
 * read ahead.  We will read as many blocks as possible sequentially
 * and then parcel them up into logical blocks in the buffer hash table.
 */
struct buf *
cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
	struct vnode *vp;
	u_quad_t filesize;
	struct buf *bp;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	struct cluster_save *b_save;
	struct buf *tbp;
	daddr_t bn;
	int i, inc;

	if (size * (lbn + run + 1) > filesize)
		--run;
	if (run == 0) {
		if (!bp) {
			bp = getblk(vp, lbn, size);
			bp->b_blkno = blkno;
			bp->b_flags |= flags;
		}
		return(bp);
	}

	bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
	if (bp->b_flags & (B_DONE | B_DELWRI))
		return (bp);

	b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bufsize = b_save->bs_bcount = size;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;

	inc = size / DEV_BSIZE;
	for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
		if (incore(vp, lbn + i)) {
			if (i == 1) {
				bp->b_saveaddr = b_save->bs_saveaddr;
				bp->b_flags &= ~B_CALL;
				bp->b_iodone = NULL;
				allocbuf(bp, size);
				free(b_save, M_SEGMENT);
			} else
				allocbuf(bp, size * i);
			break;
		}
		tbp = getblk(vp, lbn + i, 0);
		tbp->b_bcount = tbp->b_bufsize = size;
		tbp->b_blkno = bn;
		tbp->b_flags |= flags | B_READ | B_ASYNC;
		++b_save->bs_nchildren;
		b_save->bs_children[i - 1] = tbp;
	}
	if (!(bp->b_flags & B_ASYNC))
		vp->v_ralen = max(vp->v_ralen - 1, 1);
	return(bp);
}

/*
 * Either get a new buffer or grow the existing one.
 */
struct buf *
cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
	struct vnode *vp;
	struct buf *bp;
	long flags;
	daddr_t blkno;
	daddr_t lblkno;
	long size;
	int run;
{
	if (!bp) {
		bp = getblk(vp, lblkno, size);
		if (bp->b_flags & (B_DONE | B_DELWRI)) {
			bp->b_blkno = blkno;
			return(bp);
		}
	}
	allocbuf(bp, run * size);
	bp->b_blkno = blkno;
	bp->b_iodone = cluster_callback;
	bp->b_flags |= flags | B_CALL;
	return(bp);
}

/*
 * Cleanup after a clustered read or write.
 */
void
cluster_callback(bp)
	struct buf *bp;
{
	struct cluster_save *b_save;
	struct buf **tbp;
	long bsize;
	caddr_t cp;

	b_save = (struct cluster_save *)(bp->b_saveaddr);
	bp->b_saveaddr = b_save->bs_saveaddr;

	cp = bp->b_un.b_addr + b_save->bs_bufsize;
	for (tbp = b_save->bs_children; b_save->bs_nchildren--; ++tbp) {
		pagemove(cp, (*tbp)->b_un.b_addr, (*tbp)->b_bufsize);
		cp += (*tbp)->b_bufsize;
		bp->b_bufsize -= (*tbp)->b_bufsize;
		biodone(*tbp);
	}
#ifdef DIAGNOSTIC
	if (bp->b_bufsize != b_save->bs_bufsize)
		panic ("cluster_callback: more space to reclaim");
#endif
	bp->b_bcount = bp->b_bufsize;
	bp->b_iodone = NULL;
	free(b_save, M_SEGMENT);
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else
		wakeup((caddr_t)bp);
}

/*
 * Synchronous write.
 * Release buffer on completion.
 */
bwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */
	register int flag;
	int s, error = 0;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if (flag & B_ASYNC) {
		if ((flag & B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
	}
	trace(TR_BWRITE, pack(bp->b_vp, bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	s = splbio();
	bp->b_vp->v_numoutput++;
	splx(s);
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await I/O completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting I/O completion status.
	 */
	if ((flag & B_ASYNC) == 0) {
		error = biowait(bp);
		if ((flag&B_DELWRI) == 0)
			p->p_stats->p_ru.ru_oublock++;	/* no one paid yet */
		else
			reassignbuf(bp, bp->b_vp);
		brelse(bp);
	} else if (flag & B_DELWRI) {
		s = splbio();
		bp->b_flags |= B_AGE;
		splx(s);
	}
	return (error);
}

int
vn_bwrite(ap)
	struct vop_bwrite_args *ap;
{
	return (bwrite(ap->a_bp));
}


/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	struct proc *p = curproc;		/* XXX */

	if ((bp->b_flags & B_DELWRI) == 0) {
		bp->b_flags |= B_DELWRI;
		reassignbuf(bp, bp->b_vp);
		p->p_stats->p_ru.ru_oublock++;		/* no one paid yet */
	}
	/*
	 * If this is a tape drive, the write must be initiated.
	 */
	if (VOP_IOCTL(bp->b_vp, 0, (caddr_t)B_TAPE, 0, NOCRED, p) == 0) {
		bawrite(bp);
	} else {
		bp->b_flags |= (B_DONE | B_DELWRI);
		brelse(bp);
	}
}

/*
 * Asynchronous write.
 * Start I/O on a buffer, but do not wait for it to complete.
 * The buffer is released when the I/O completes.
 */
bawrite(bp)
	register struct buf *bp;
{

	/*
	 * Setting the ASYNC flag causes bwrite to return
	 * after starting the I/O.
	 */
	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Do clustered write for FFS.
 *
 * Three cases:
 *	1.	Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
	struct buf *bp;
	u_quad_t filesize;
{
	struct vnode *vp;
	daddr_t lbn;
	int clen, error, maxrun;

	vp = bp->b_vp;
	lbn = bp->b_lblkno;
	clen = 0;

	/*
	 * Handle end of file first.  If we are appending, we need to check
	 * if the current block was allocated contiguously.  If it wasn't,
	 * then we need to fire off a previous cluster if it existed.
	 * Additionally, when we're appending, we need to figure out how
	 * to initialize vp->v_clen.
	 */
	if ((lbn + 1) * bp->b_bcount == filesize) {
		if (bp->b_blkno != vp->v_lasta + bp->b_bcount / DEV_BSIZE) {
			/* This block was not allocated contiguously */
			if (vp->v_clen)
			    cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
				vp->v_lastw - vp->v_cstart + 1, lbn);
			vp->v_cstart = lbn;
			clen = vp->v_clen =
			    MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
			/*
			 * Next cluster started.  Write this buffer and return.
			 */
			vp->v_lastw = lbn;
			vp->v_lasta = bp->b_blkno;
			bdwrite(bp);
			return;
		}
		vp->v_lasta = bp->b_blkno;
	} else if (lbn == 0) {
		vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
	}
	if (vp->v_clen == 0 || lbn != vp->v_lastw + 1) {
		if (vp->v_clen != 0)
			/*
			 * Write is not sequential.
			 */
			cluster_wbuild(vp, NULL, bp->b_bcount, vp->v_cstart,
			    vp->v_lastw - vp->v_cstart + 1, lbn);
		/*
		 * Consider beginning a cluster.
		 */
		if (error = VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &clen)) {
			bawrite(bp);
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
		vp->v_clen = clen;
		if (clen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
			bawrite(bp);
		} else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
			bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
}


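/*
 * Illustrative sketch (an assumption about the caller, not part of this
 * file): a filesystem write routine hands each logical block to
 * cluster_write() instead of choosing between bwrite()/bdwrite() itself,
 * and cluster_write() decides whether to delay, start, or flush a
 * cluster.  "doclusterwrite", "ip" and "ioflag" are hypothetical
 * caller-side names.
 *
 *	if (doclusterwrite)
 *		cluster_write(bp, ip->i_size);
 *	else if (ioflag & IO_SYNC)
 *		error = bwrite(bp);
 *	else
 *		bdwrite(bp);
 *
 * As a worked example of the append-time initialization above: with an
 * 8K filesystem block size and a 64K MAXBSIZE, v_clen is set to
 * 64K/8K - 1 = 7, so up to eight contiguous blocks are gathered into a
 * single transfer.
 */
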
/*
 * This is an awful lot like cluster_rbuild...wish they could be combined.
 * The last lbn argument is the current block on which I/O is being
 * performed.  Check to see that it doesn't fall in the middle of
 * the current block.
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t lbn;
{
	struct cluster_save *b_save;
	struct buf *bp, *tbp;
	caddr_t cp;
	int i, s;

redo:
	while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
		++start_lbn;
		--len;
	}

	/* Get more memory for current buffer */
	if (len <= 1) {
		if (last_bp)
			bawrite(last_bp);
		return;
	}

	bp = getblk(vp, start_lbn, size);
	if (!(bp->b_flags & B_DELWRI)) {
		++start_lbn;
		--len;
		brelse(bp);
		goto redo;
	}

	--len;
	b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
	    M_SEGMENT, M_WAITOK);
	b_save->bs_bcount = bp->b_bcount;
	b_save->bs_bufsize = bp->b_bufsize;
	b_save->bs_nchildren = 0;
	b_save->bs_children = (struct buf **)(b_save + 1);
	b_save->bs_saveaddr = bp->b_saveaddr;
	bp->b_saveaddr = (caddr_t) b_save;


	bp->b_flags |= B_CALL;
	bp->b_iodone = cluster_callback;
	cp = bp->b_un.b_addr + bp->b_bufsize;
	for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
		if (!incore(vp, start_lbn) || start_lbn == lbn)
			break;

		if (last_bp == NULL || start_lbn != last_bp->b_lblkno) {
			tbp = getblk(vp, start_lbn, size);
#ifdef DIAGNOSTIC
			if (tbp->b_bcount != tbp->b_bufsize)
				panic("cluster_wbuild: Buffer too big");
#endif
			if (!(tbp->b_flags & B_DELWRI)) {
				brelse(tbp);
				break;
			}
		} else
			tbp = last_bp;

		++b_save->bs_nchildren;

		/* Move memory from children to parent */
		pagemove(tbp->b_un.b_daddr, cp, size);
		bp->b_bcount += size;
		bp->b_bufsize += size;

		tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
		tbp->b_flags |= B_ASYNC;
		s = splbio();
		reassignbuf(tbp, tbp->b_vp);		/* put on clean list */
		++tbp->b_vp->v_numoutput;
		splx(s);
		b_save->bs_children[i] = tbp;

		cp += tbp->b_bufsize;
	}

	if (i == 0) {
		/* None to cluster */
		bp->b_saveaddr = b_save->bs_saveaddr;
		bp->b_flags &= ~B_CALL;
		bp->b_iodone = NULL;
		free(b_save, M_SEGMENT);
	}
	bawrite(bp);
	if (i < len) {
		len -= i + 1;
		start_lbn += 1;
		goto redo;
	}
}

/*
 * Release a buffer.
 * Even if the buffer is dirty, no I/O is started.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct queue_entry *flist;
	int s;

	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags & B_WANTED)
		wakeup((caddr_t)bp);
	if (needbuffer) {
		needbuffer = 0;
		wakeup((caddr_t)&needbuffer);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	s = splbio();
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;
	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE | B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR | B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bufqueues[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR | B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bufqueues[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bufqueues[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bufqueues[BQ_AGE];
		else
			flist = &bufqueues[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED | B_BUSY | B_ASYNC | B_AGE | B_NOCACHE);
	splx(s);
}

/*
 * Check to see if a block is currently memory resident.
 */
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;

	for (bp = BUFHASH(vp, blkno)->le_next; bp; bp = bp->b_hash.qe_next)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

/*
 * Check to see if a block is currently memory resident.
 * If it is resident, return it.  If it is not resident,
 * allocate a new buffer and assign it to the block.
 */
struct buf *
getblk(vp, blkno, size)
	register struct vnode *vp;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;
	struct list_entry *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * Search the cache for the block.  If the buffer is found,
	 * but it is currently locked, then we must wait for it to
	 * become available.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->le_next; bp; bp = bp->b_hash.qe_next) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp ||
		    (bp->b_flags & B_INVAL))
			continue;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO + 1);
			splx(s);
			goto loop;
		}
		bremfree(bp);
		bp->b_flags |= B_BUSY;
		splx(s);
		if (bp->b_bcount != size) {
			printf("getblk: stray size");
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	bp = getnewbuf();
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_bcount = 0;
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	allocbuf(bp, size);
	return (bp);
}

/*
 * Allocate a buffer.
 * The caller will assign it to a block.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bremhash(bp);
	binshash(bp, &invalhash);
	bp->b_bcount = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	allocbuf(bp, size);
	return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 * If no memory is available, release buffer and take error exit.
 */
allocbuf(tp, size)
	register struct buf *tp;
	int size;
{
	register struct buf *bp, *ep;
	int sizealloc, take, s;

	sizealloc = roundup(size, CLBYTES);
	/*
	 * Buffer size does not change
	 */
	if (sizealloc == tp->b_bufsize)
		goto out;
	/*
	 * Buffer size is shrinking.
	 * Place excess space in a buffer header taken from the
	 * BQ_EMPTY buffer list and placed on the "most free" list.
	 * If no extra buffer headers are available, leave the
	 * extra space in the present buffer.
	 */
	if (sizealloc < tp->b_bufsize) {
		if ((ep = bufqueues[BQ_EMPTY].qe_next) == NULL)
			goto out;
		s = splbio();
		bremfree(ep);
		ep->b_flags |= B_BUSY;
		splx(s);
		pagemove(tp->b_un.b_addr + sizealloc, ep->b_un.b_addr,
		    (int)tp->b_bufsize - sizealloc);
		ep->b_bufsize = tp->b_bufsize - sizealloc;
		tp->b_bufsize = sizealloc;
		ep->b_flags |= B_INVAL;
		ep->b_bcount = 0;
		brelse(ep);
		goto out;
	}
	/*
	 * More buffer space is needed.  Get it out of buffers on
	 * the "most free" list, placing the empty headers on the
	 * BQ_EMPTY buffer header list.
	 */
	while (tp->b_bufsize < sizealloc) {
		take = sizealloc - tp->b_bufsize;
		bp = getnewbuf();
		if (take >= bp->b_bufsize)
			take = bp->b_bufsize;
		pagemove(&bp->b_un.b_addr[bp->b_bufsize - take],
		    &tp->b_un.b_addr[tp->b_bufsize], take);
		tp->b_bufsize += take;
		bp->b_bufsize = bp->b_bufsize - take;
		if (bp->b_bcount > bp->b_bufsize)
			bp->b_bcount = bp->b_bufsize;
		if (bp->b_bufsize <= 0) {
			bremhash(bp);
			binshash(bp, &invalhash);
			bp->b_dev = NODEV;
			bp->b_error = 0;
			bp->b_flags |= B_INVAL;
		}
		brelse(bp);
	}
out:
	tp->b_bcount = size;
	return (1);
}

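/*
 * Illustrative sketch (not part of the original source): geteblk() above
 * is the traditional way for a driver to borrow a buffer purely as
 * temporary storage.  The buffer has no block identity, so it is simply
 * released when the caller is done with it:
 *
 *	bp = geteblk(len);
 *	(use bp->b_un.b_addr as scratch space)
 *	brelse(bp);
 */
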
/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp;
	register struct queue_entry *dp;
	register struct ucred *cred;
	int s;

loop:
	s = splbio();
	for (dp = &bufqueues[BQ_AGE]; dp > bufqueues; dp--)
		if (dp->qe_next)
			break;
	if (dp == bufqueues) {		/* no free blocks */
		needbuffer = 1;
		sleep((caddr_t)&needbuffer, PRIBIO + 1);
		splx(s);
		goto loop;
	}
	bp = dp->qe_next;
	bremfree(bp);
	bp->b_flags |= B_BUSY;
	splx(s);
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_vp, bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;
	return (bp);
}

/*
 * Wait for I/O to complete.
 *
 * Extract and return any errors associated with the I/O.
 * If the error flag is set, but no specific error is
 * given, return EIO.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

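/*
 * Illustrative sketch (not part of the original source): the two I/O
 * completion disciplines used throughout this file.  A synchronous
 * caller starts the transfer and sleeps in biowait(); an asynchronous
 * caller sets B_ASYNC, optionally with B_CALL and a completion routine
 * as the cluster code does, and lets biodone() below finish the job.
 * "mydone" is a hypothetical callback.
 *
 *	Synchronous:
 *		bp->b_flags |= B_READ;
 *		VOP_STRATEGY(bp);
 *		error = biowait(bp);
 *
 *	Asynchronous with callback:
 *		bp->b_flags |= B_READ | B_ASYNC | B_CALL;
 *		bp->b_iodone = mydone;
 *		VOP_STRATEGY(bp);
 */
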
/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so.  Otherwise, awaken waiting processes.
 */
void
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if ((bp->b_flags & B_READ) == 0)
		vwakeup(bp);
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags & B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

/*
 * Return the number of buffers currently on the BQ_LOCKED free list.
 */
int
count_lock_queue()
{
	register struct buf *bp;
	register int ret;

	for (ret = 0, bp = (struct buf *)bufqueues[BQ_LOCKED].qe_next;
	    bp; bp = (struct buf *)bp->b_freelist.qe_next)
		++ret;
	return(ret);
}

#ifdef DIAGNOSTIC
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * above.
 */
void
vfs_bufstats()
{
	int s, i, j, count;
	register struct buf *bp;
	register struct queue_entry *dp;
	int counts[MAXBSIZE/CLBYTES+1];
	static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" };

	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
		count = 0;
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			counts[j] = 0;
		s = splbio();
		for (bp = dp->qe_next; bp; bp = bp->b_freelist.qe_next) {
			counts[bp->b_bufsize/CLBYTES]++;
			count++;
		}
		splx(s);
		printf("%s: total-%d", bname[i], count);
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			if (counts[j] != 0)
				printf(", %d-%d", j * CLBYTES, counts[j]);
		printf("\n");
	}
}
#endif /* DIAGNOSTIC */