/*	vfs_cluster.c	4.31	82/05/31	*/

#include "../h/param.h"
#include "../h/systm.h"
#include "../h/dir.h"
#include "../h/user.h"
#include "../h/buf.h"
#include "../h/conf.h"
#include "../h/proc.h"
#include "../h/seg.h"
#include "../h/pte.h"
#include "../h/vm.h"
#include "../h/trace.h"

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	bp = getblk(dev, blkno, size);
	if (bp->b_flags&B_DONE) {
		trace(TR_BREADHIT, dev, blkno);
		return(bp);
	}
	bp->b_flags |= B_READ;
	(*bdevsw[major(dev)].d_strategy)(bp);
	trace(TR_BREADMISS, dev, blkno);
	u.u_vm.vm_inblk++;		/* pay for read */
	biowait(bp);
	return(bp);
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev, blkno, rablkno, size)
	dev_t dev;
	daddr_t blkno, rablkno;
	int size;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	/*
	 * If the block isn't in core, then allocate
	 * a buffer and initiate i/o (getblk checks
	 * for a cache hit).
	 */
	if (!incore(dev, blkno)) {
		bp = getblk(dev, blkno, size);
		if ((bp->b_flags&B_DONE) == 0) {
			bp->b_flags |= B_READ;
			(*bdevsw[major(dev)].d_strategy)(bp);
			trace(TR_BREADMISS, dev, blkno);
			u.u_vm.vm_inblk++;		/* pay for read */
		} else
			trace(TR_BREADHIT, dev, blkno);
	}

	/*
	 * If there's a read-ahead block, start i/o
	 * on it also (as above).
	 */
	if (rablkno && !incore(dev, rablkno)) {
		rabp = getblk(dev, rablkno, size);
		if (rabp->b_flags & B_DONE) {
			brelse(rabp);
			trace(TR_BREADHITRA, dev, rablkno);
		} else {
			rabp->b_flags |= B_READ|B_ASYNC;
			(*bdevsw[major(dev)].d_strategy)(rabp);
			trace(TR_BREADMISSRA, dev, rablkno);
			u.u_vm.vm_inblk++;		/* pay in advance */
		}
	}

	/*
	 * If we get here with bp NULL, then the block
	 * must've been in core and bread will find it for us.
	 */
	if (bp == NULL)
		return(bread(dev, blkno, size));
	biowait(bp);
	return(bp);
}
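
/*
 * Illustrative sketch only (not part of the original file): how a
 * file system read path might drive bread/breada.  The mapping
 * routine fsbmap() and the names lbn/bn/rabn are hypothetical; real
 * callers live in the file system code.  Guarded so it is never
 * compiled.
 */
#ifdef notdef
exampleread(dev, lbn, size)
	dev_t dev;
	daddr_t lbn;
	int size;
{
	register struct buf *bp;
	daddr_t bn, rabn;
	daddr_t fsbmap();

	bn = fsbmap(lbn);		/* hypothetical logical-to-disk map */
	rabn = fsbmap(lbn + 1);
	if (rabn)
		bp = breada(dev, bn, rabn, size);	/* read + read-ahead */
	else
		bp = bread(dev, bn, size);		/* plain read */
	/* ... copy data out of bp->b_un.b_addr ... */
	brelse(bp);		/* buffer stays cached on a free list */
}
#endif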

/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
	register struct buf *bp;
{
	register flag;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI | B_AGE);
	if ((flag&B_DELWRI) == 0)
		u.u_vm.vm_oublk++;		/* no one paid yet */
	trace(TR_BWRITE, bp->b_dev, bp->b_blkno);
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);

	/*
	 * If the write was synchronous, then await i/o completion.
	 * If the write was "delayed", then we put the buffer on
	 * the q of blocks awaiting i/o completion status.
	 * Otherwise, the i/o must be finished and we check for
	 * an error.
	 */
	if ((flag&B_ASYNC) == 0) {
		biowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		bp->b_flags |= B_AGE;
	else
		geterror(bp);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	register int flags;

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_vm.vm_oublk++;		/* no one paid yet */
	flags = bdevsw[major(bp->b_dev)].d_flags;
	if (flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	bwrite(bp);
}

/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	register s;

	/*
	 * If someone's waiting for the buffer, or
	 * is waiting for a buffer, wake 'em up.
	 */
	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags&B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	if (bp->b_flags&B_ERROR)
		if (bp->b_flags & B_LOCKED)
			bp->b_flags &= ~B_ERROR;	/* try again later */
		else
			bp->b_dev = NODEV;		/* no assoc */

	/*
	 * Stick the buffer back on a free list.
	 */
	s = spl6();
	if (bp->b_flags & (B_ERROR|B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQUEUES-1];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
	splx(s);
}
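
/*
 * Illustrative sketch only (an addition): the choice among the three
 * write flavors described above.  Whether a block is complete or
 * needs to survive a crash is the caller's knowledge; the parameter
 * names iswholeblock/critical are hypothetical.
 */
#ifdef notdef
examplewrite(bp, iswholeblock, critical)
	register struct buf *bp;
	int iswholeblock, critical;
{

	if (critical)
		bwrite(bp);	/* synchronous: wait and check errors */
	else if (iswholeblock)
		bawrite(bp);	/* start i/o now, don't wait */
	else
		bdwrite(bp);	/* delay; another write will soon follow */
}
#endif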

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
incore(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(dev, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_blkno == blkno && bp->b_dev == dev &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

/*
 * Return the block if it is already in core, otherwise 0;
 * no read is started for a missing block.
 */
struct buf *
baddr(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{

	if (incore(dev, blkno))
		return (bread(dev, blkno, size));
	return (0);
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * We use splx here because this routine may be called
 * on the interrupt stack during a dump, and we don't
 * want to lower the ipl back to 0.
 */
struct buf *
getblk(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp, *ep;
	int s;

	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-PGSHIFT))
		blkno = 1 << ((sizeof(int)*NBBY-PGSHIFT) + 1);
	/*
	 * Search the cache for the block.  If we hit, but
	 * the buffer is in use for i/o, then we wait until
	 * the i/o has completed.
	 */
	dp = BUFHASH(dev, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_dev != dev ||
		    bp->b_flags&B_INVAL)
			continue;
		s = spl6();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(bp);
		brealloc(bp, size);
		bp->b_flags |= B_CACHE;
		return(bp);
	}
	if (major(dev) >= nblkdev)
		panic("blkdev");
	/*
	 * Not found in the cache, select something from
	 * a free list.  Preference is to LRU list, then AGE list.
	 */
	s = spl6();
	for (ep = &bfreelist[BQUEUES-1]; ep > bfreelist; ep--)
		if (ep->av_forw != ep)
			break;
	if (ep == bfreelist) {		/* no free blocks at all */
		ep->b_flags |= B_WANTED;
		sleep((caddr_t)ep, PRIBIO+1);
		splx(s);
		goto loop;
	}
	splx(s);
	bp = ep->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
	bp->b_flags = B_BUSY;
	bfree(bp);
	bremhash(bp);
	binshash(bp, dp);
	bp->b_dev = dev;
	bp->b_blkno = blkno;
	brealloc(bp, size);
	return(bp);
}
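
/*
 * Illustrative sketch only (hypothetical caller): a block whose
 * contents will be built entirely in core is fetched with getblk()
 * rather than bread() -- there is no point reading data that is
 * about to be overwritten.
 */
#ifdef notdef
examplenewblk(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	bp = getblk(dev, blkno, size);	/* may hit cache; no read i/o */
	/* ... fill bp->b_un.b_addr with the new contents ... */
	bdwrite(bp);		/* expect more writes to this block soon */
}
#endif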

/*
 * Get an empty block,
 * not assigned to any particular device.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *dp;
	int s;

loop:
	s = spl6();
	for (dp = &bfreelist[BQUEUES-1]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO+1);
		splx(s);
		goto loop;
	}
	splx(s);
	bp = dp->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
	bp->b_flags = B_BUSY|B_INVAL;
	bfree(bp);
	bremhash(bp);
	binshash(bp, dp);
	bp->b_dev = (dev_t)NODEV;
	bp->b_bcount = size;
	return(bp);
}

/*
 * Allocate space associated with a buffer.
 */
brealloc(bp, size)
	register struct buf *bp;
	int size;
{
	daddr_t start, last;
	register struct buf *ep;
	struct buf *dp;
	int s;

	/*
	 * First need to make sure that all overlapping previous I/O
	 * has been dispatched.
	 */
	if (size == bp->b_bcount)
		return;
	if (size < bp->b_bcount) {
		bp->b_bcount = size;
		return;
	}
	start = bp->b_blkno + (bp->b_bcount / DEV_BSIZE);
	last = bp->b_blkno + (size / DEV_BSIZE) - 1;
	if (bp->b_bcount == 0) {
		start++;
		if (start == last)
			goto allocit;
	}
	dp = BUFHASH(bp->b_dev, bp->b_blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep->b_blkno < start || ep->b_blkno > last ||
		    ep->b_dev != bp->b_dev || ep->b_flags&B_INVAL)
			continue;
		s = spl6();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			(void) splx(s);
			goto loop;
		}
		(void) splx(s);
		/*
		 * What we would really like to do is kill this
		 * I/O since it is now useless.  We cannot do that
		 * so we force it to complete, so that it cannot
		 * over-write our useful data later.
		 */
		if (ep->b_flags & B_DELWRI) {
			notavail(ep);
			ep->b_flags |= B_ASYNC;
			bwrite(ep);
			goto loop;
		}
	}
allocit:
	/*
	 * Here the buffer is already available, so all we
	 * need to do is set the size.  Someday a better memory
	 * management scheme will be implemented.
	 */
	bp->b_bcount = size;
}
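
/*
 * Worked example of the overlap window above (an illustration, not
 * original text): with DEV_BSIZE at its usual 512, a buffer at
 * b_blkno 100 growing from b_bcount 1024 to size 2048 gives
 *	start = 100 + 1024/512     = 102
 *	last  = 100 + 2048/512 - 1 = 103
 * so only buffers claiming blocks 102..103 must be written out or
 * waited for before the grow is safe.
 */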

/*
 * Release space associated with a buffer.
 */
bfree(bp)
	struct buf *bp;
{
	/*
	 * Here the buffer does not change, so all we
	 * need to do is set the size.  Someday a better memory
	 * management scheme will be implemented.
	 */
	bp->b_bcount = 0;
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = spl6();
	while ((bp->b_flags&B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	geterror(bp);
}

/*
 * Mark I/O complete on a buffer.  If the header
 * indicates a dirty page push completion, the
 * header is inserted into the ``cleaned'' list
 * to be processed by the pageout daemon.  Otherwise
 * release it if I/O is asynchronous, and wake
 * up anyone waiting for it.
 */
biodone(bp)
	register struct buf *bp;
{
	register int s;

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_DIRTY) {
		if (bp->b_flags & B_ERROR)
			panic("IO err in push");
		s = spl6();
		bp->av_forw = bclnlist;
		bp->b_bcount = swsize[bp - swbuf];
		bp->b_pfcent = swpf[bp - swbuf];
		cnt.v_pgout++;
		cnt.v_pgpgout += bp->b_bcount / NBPG;
		bclnlist = bp;
		if (bswlist.b_flags & B_WANTED)
			wakeup((caddr_t)&proc[2]);
		splx(s);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
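
/*
 * Illustrative sketch only (hypothetical driver code): the other half
 * of the biowait/biodone handshake.  A block driver's interrupt side
 * marks the transfer complete; biodone() then either releases an
 * async buffer or wakes the sleeper in biowait().  The hard_error
 * parameter stands in for a real device status check.
 */
#ifdef notdef
exampleintr(bp, hard_error)
	register struct buf *bp;
	int hard_error;
{

	if (hard_error)
		bp->b_flags |= B_ERROR;	/* geterror() will report this */
	bp->b_resid = 0;		/* transfer fully completed */
	biodone(bp);	/* brelse async buffer or wake biowait sleeper */
}
#endif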

/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 * (and temporarily pagein)
 */
bflush(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct buf *flist;
	int s;

loop:
	s = spl6();
	for (flist = bfreelist; flist < &bfreelist[BQUEUES]; flist++)
		for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
			if ((bp->b_flags & B_DELWRI) == 0)
				continue;
			if (dev == NODEV || dev == bp->b_dev) {
				bp->b_flags |= B_ASYNC;
				notavail(bp);
				bwrite(bp);
				goto loop;
			}
		}
	splx(s);
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0 set a generalized
 * code.  Actually the latter is always true because devices
 * don't yet return specific errors.
 */
geterror(bp)
	register struct buf *bp;
{

	if (bp->b_flags&B_ERROR)
		if ((u.u_error = bp->b_error) == 0)
			u.u_error = EIO;
}

/*
 * Invalidate in core blocks belonging to closed or umounted filesystem.
 *
 * This is not nicely done at all - the buffer ought to be removed from the
 * hash chains & have its dev/blkno fields clobbered, but unfortunately we
 * can't do that here, as it is quite possible that the block is still
 * being used for i/o.  Eventually, all disc drivers should be forced to
 * have a close routine, which ought to ensure that the queue is empty, then
 * properly flush the queues.  Until that happy day, this suffices for
 * correctness. ... kre
 */
binval(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct bufhd *hp;
#define	dp	((struct buf *)hp)

	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
			if (bp->b_dev == dev)
				bp->b_flags |= B_INVAL;
}
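
/*
 * Illustrative sketch only (an addition): the order in which a
 * umount-style caller would use the two routines above -- push out
 * the delayed writes first, then invalidate whatever remains in core
 * for the device.
 */
#ifdef notdef
exampleumount(dev)
	dev_t dev;
{

	bflush(dev);		/* write out B_DELWRI blocks for dev */
	binval(dev);		/* then mark remaining blocks B_INVAL */
}
#endif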