/*	vfs_cluster.c	4.36	82/09/04	*/

#include "../h/param.h"
#include "../h/systm.h"
#include "../h/dir.h"
#include "../h/user.h"
#include "../h/buf.h"
#include "../h/conf.h"
#include "../h/proc.h"
#include "../h/seg.h"
#include "../h/pte.h"
#include "../h/vm.h"
#include "../h/trace.h"

int bioprintfs = 0;

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
bread(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	bp = getblk(dev, blkno, size);
	if (bp->b_flags&B_DONE) {
		trace(TR_BREADHIT, dev, blkno);
		return(bp);
	}
	bp->b_flags |= B_READ;
	(*bdevsw[major(dev)].d_strategy)(bp);
	trace(TR_BREADMISS, dev, blkno);
	u.u_ru.ru_inblock++;		/* pay for read */
	biowait(bp);
	return(bp);
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
breada(dev, blkno, size, rablkno, rasize)
	dev_t dev;
	daddr_t blkno; int size;
	daddr_t rablkno; int rasize;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	/*
	 * If the block isn't in core, then allocate
	 * a buffer and initiate i/o (getblk checks
	 * for a cache hit).
	 */
	if (!incore(dev, blkno)) {
		bp = getblk(dev, blkno, size);
		if ((bp->b_flags&B_DONE) == 0) {
			bp->b_flags |= B_READ;
			(*bdevsw[major(dev)].d_strategy)(bp);
			trace(TR_BREADMISS, dev, blkno);
			u.u_ru.ru_inblock++;	/* pay for read */
		} else
			trace(TR_BREADHIT, dev, blkno);
	}

	/*
	 * If there's a read-ahead block, start i/o
	 * on it also (as above).
	 */
	if (rablkno && !incore(dev, rablkno)) {
		rabp = getblk(dev, rablkno, rasize);
		if (rabp->b_flags & B_DONE) {
			brelse(rabp);
			trace(TR_BREADHITRA, dev, blkno);
		} else {
			rabp->b_flags |= B_READ|B_ASYNC;
			(*bdevsw[major(dev)].d_strategy)(rabp);
			trace(TR_BREADMISSRA, dev, rablkno);
			u.u_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was in core, let bread get it.
	 * If block wasn't in core, then the read was started
	 * above, and just wait for it.
	 */
	if (bp == NULL)
		return (bread(dev, blkno, size));
	biowait(bp);
	return (bp);
}

/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
	register struct buf *bp;
{
	register flag;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI | B_AGE);
	if ((flag&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	trace(TR_BWRITE, bp->b_dev, bp->b_blkno);
	if (bioprintfs)
		printf("write %x blk %d count %d\n",
		    bp->b_dev, bp->b_blkno, bp->b_bcount);
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);

	/*
	 * If the write was synchronous, then await i/o completion.
	 * If the write was "delayed", then we put the buffer on
	 * the q of blocks awaiting i/o completion status.
	 * Otherwise, the i/o must be finished and we check for
	 * an error.
	 */
	if ((flag&B_ASYNC) == 0) {
		biowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		bp->b_flags |= B_AGE;
	else
		u.u_error = geterror(bp);
}
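
/*
 * Illustrative sketch (not part of the original source): how a caller
 * such as a filesystem might use the routines above for a simple
 * read-modify-write of one block.  The routine name, device, block
 * number and size are hypothetical; errors are reported through
 * u.u_error, which biowait() sets via geterror() below.
 */
#ifdef notdef
example_rmw(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	bp = bread(dev, blkno, size);	/* read, or find the cached block */
	if (u.u_error) {
		brelse(bp);		/* give the buffer back on error */
		return;
	}
	/* ... modify the block's data (bp->b_un.b_addr) here ... */
	bdwrite(bp);		/* delayed write; the buffer is released for us */
}
#endif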

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
	register struct buf *bp;
{
	register int flags;

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	flags = bdevsw[major(bp->b_dev)].d_flags;
	if (flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	bwrite(bp);
}

/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	register s;

	/*
	 * If someone's waiting for the buffer, or
	 * is waiting for a buffer, wake 'em up.
	 */
	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags&B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	if (bp->b_flags&B_ERROR)
		if (bp->b_flags & B_LOCKED)
			bp->b_flags &= ~B_ERROR;	/* try again later */
		else
			bp->b_dev = NODEV;		/* no assoc */

	/*
	 * Stick the buffer back on a free list.
	 */
	s = spl6();
	if (bp->b_flags & (B_ERROR|B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQUEUES-1];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
	splx(s);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
incore(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(dev, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_blkno == blkno && bp->b_dev == dev &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

struct buf *
baddr(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{

	if (incore(dev, blkno))
		return (bread(dev, blkno, size));
	return (0);
}
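
/*
 * Illustrative sketch (not part of the original source): baddr() above
 * reads the block only if it is already in core, and returns 0 otherwise.
 * One place this behavior is useful is when a caller is about to overwrite
 * an entire block and would rather skip the disk read when nothing is
 * cached.  The routine name and parameters below are hypothetical.
 */
#ifdef notdef
example_overwrite(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	bp = baddr(dev, blkno, size);		/* reread only if cached */
	if (bp == NULL)
		bp = getblk(dev, blkno, size);	/* no read needed */
	/* ... fill in the block's new contents here ... */
	bdwrite(bp);
}
#endif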

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * We use splx here because this routine may be called
 * on the interrupt stack during a dump, and we don't
 * want to lower the ipl back to 0.
 */
struct buf *
getblk(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp, *ep;
	int s;

	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-PGSHIFT))
		blkno = 1 << ((sizeof(int)*NBBY-PGSHIFT) + 1);
	/*
	 * Search the cache for the block.  If we hit, but
	 * the buffer is in use for i/o, then we wait until
	 * the i/o has completed.
	 */
	dp = BUFHASH(dev, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_dev != dev ||
		    bp->b_flags&B_INVAL)
			continue;
		s = spl6();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(bp);
		if (brealloc(bp, size) == 0)
			goto loop;
		bp->b_flags |= B_CACHE;
		return(bp);
	}
	if (major(dev) >= nblkdev)
		panic("blkdev");
	/*
	 * Not found in the cache, select something from
	 * a free list.  Preference is to LRU list, then AGE list.
	 */
	s = spl6();
	for (ep = &bfreelist[BQUEUES-1]; ep > bfreelist; ep--)
		if (ep->av_forw != ep)
			break;
	if (ep == bfreelist) {		/* no free blocks at all */
		ep->b_flags |= B_WANTED;
		sleep((caddr_t)ep, PRIBIO+1);
		splx(s);
		goto loop;
	}
	splx(s);
	bp = ep->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
	bp->b_flags = B_BUSY;
	bfree(bp);
	bremhash(bp);
	binshash(bp, dp);
	bp->b_dev = dev;
	bp->b_blkno = blkno;
	if (brealloc(bp, size) == 0)
		goto loop;
	return(bp);
}

/*
 * get an empty block,
 * not assigned to any particular device
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *dp;
	int s;

loop:
	s = spl6();
	for (dp = &bfreelist[BQUEUES-1]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO+1);
		goto loop;
	}
	splx(s);
	bp = dp->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, bp->b_dev, bp->b_blkno);
	bp->b_flags = B_BUSY|B_INVAL;
	bfree(bp);
	bremhash(bp);
	binshash(bp, dp);
	bp->b_dev = (dev_t)NODEV;
	if (brealloc(bp, size) == 0)
		goto loop;
	return(bp);
}
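
/*
 * Illustrative sketch (not part of the original source): geteblk() above
 * hands out a busy buffer that is not associated with any device block,
 * which makes it usable as scratch space (for example in a driver ioctl
 * routine).  The routine name and the 1024-byte size are hypothetical.
 */
#ifdef notdef
example_scratch()
{
	register struct buf *bp;

	bp = geteblk(1024);	/* B_BUSY|B_INVAL, device set to NODEV */
	/* ... use bp->b_un.b_addr as temporary storage ... */
	brelse(bp);		/* B_INVAL keeps it from looking like cached data */
}
#endif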

/*
 * Allocate space associated with a buffer.
 */
brealloc(bp, size)
	register struct buf *bp;
	int size;
{
	daddr_t start, last;
	register struct buf *ep;
	struct buf *dp;
	int s;

	/*
	 * First we need to make sure that all overlapping previous I/O
	 * is dispensed with.
	 */
	if (size == bp->b_bcount)
		return (1);
	if (size < bp->b_bcount) {
		if (bp->b_flags & B_DELWRI) {
			bwrite(bp);
			return (0);
		}
		if (bp->b_flags & B_LOCKED)
			panic("brealloc");
		goto allocit;
	}
	bp->b_flags &= ~B_DONE;
	if (bp->b_dev == NODEV)
		goto allocit;

	/*
	 * Search the cache for any buffers that overlap the one that we
	 * are trying to allocate.  Overlapping buffers must be marked
	 * invalid, after being written out if they are dirty (indicated
	 * by B_DELWRI).  A disk block must be mapped by at most one buffer
	 * at any point in time.  Care must be taken to avoid deadlocking
	 * when two buffers are trying to get the same set of disk blocks.
	 */
	start = bp->b_blkno;
	last = start + (size / DEV_BSIZE) - 1;
	dp = BUFHASH(bp->b_dev, bp->b_blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep == bp || ep->b_dev != bp->b_dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + (ep->b_bcount / DEV_BSIZE) <= start)
			continue;
		if (bioprintfs)
			if (ep->b_flags&B_BUSY)
				printf("sleeping on:dev 0x%x, blks %d-%d, flg 0%o allocing dev 0x%x, blks %d-%d, flg 0%o\n",
				    ep->b_dev, ep->b_blkno,
				    ep->b_blkno + (ep->b_bcount / DEV_BSIZE) - 1,
				    ep->b_flags, bp->b_dev, start, last, bp->b_flags);
		s = spl6();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			(void) splx(s);
			goto loop;
		}
		(void) splx(s);
		notavail(ep);
		if (ep->b_flags & B_DELWRI) {
			if (bioprintfs)
				printf("DELWRI:dev 0x%x, blks %d-%d, flg 0%o allocing dev 0x%x, blks %d-%d, flg 0%o\n",
				    ep->b_dev, ep->b_blkno,
				    ep->b_blkno + (ep->b_bcount / DEV_BSIZE) - 1,
				    ep->b_flags, bp->b_dev, start, last, bp->b_flags);
			bwrite(ep);
			goto loop;
		}
		ep->b_flags |= B_INVAL;
		brelse(ep);
	}
allocit:
	/*
	 * Here the buffer is already available, so all we
	 * need to do is set the size.  Someday a better memory
	 * management scheme will be implemented.
	 */
	bp->b_bcount = size;
	return (1);
}

/*
 * Release space associated with a buffer.
 */
bfree(bp)
	struct buf *bp;
{
	/*
	 * Here the buffer does not change, so all we
	 * need to do is set the size.  Someday a better memory
	 * management scheme will be implemented.
	 */
	bp->b_bcount = 0;
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = spl6();
	while ((bp->b_flags&B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	u.u_error = geterror(bp);
}

/*
 * Mark I/O complete on a buffer.  If the header
 * indicates a dirty page push completion, the
 * header is inserted into the ``cleaned'' list
 * to be processed by the pageout daemon.  Otherwise
 * release it if I/O is asynchronous, and wake
 * up anyone waiting for it.
 */
biodone(bp)
	register struct buf *bp;
{
	register int s;

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_DIRTY) {
		if (bp->b_flags & B_ERROR)
			panic("IO err in push");
		s = spl6();
		bp->av_forw = bclnlist;
		bp->b_bcount = swsize[bp - swbuf];
		bp->b_pfcent = swpf[bp - swbuf];
		cnt.v_pgout++;
		cnt.v_pgpgout += bp->b_bcount / NBPG;
		bclnlist = bp;
		if (bswlist.b_flags & B_WANTED)
			wakeup((caddr_t)&proc[2]);
		splx(s);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
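
/*
 * Illustrative sketch (not part of the original source): the producer
 * side of the biowait()/biodone() pair above.  A block driver would
 * typically call biodone() from its interrupt routine when a transfer
 * finishes, setting B_ERROR and b_error first if it failed; that is
 * what wakes biowait(), or releases a B_ASYNC buffer.  The driver name,
 * its queue header exdtab, and the status check below are hypothetical.
 */
#ifdef notdef
exdintr()
{
	register struct buf *bp;

	bp = exdtab.b_actf;		/* hypothetical: transfer at head of queue */
	if (exd_hard_error()) {		/* hypothetical controller status check */
		bp->b_flags |= B_ERROR;
		bp->b_error = EIO;
	}
	biodone(bp);		/* wake biowait(), or brelse an async buffer */
}
#endif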

/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 * (and temporarily pagein)
 */
bflush(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct buf *flist;
	int s;

loop:
	s = spl6();
	for (flist = bfreelist; flist < &bfreelist[BQUEUES]; flist++)
		for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
			if ((bp->b_flags & B_DELWRI) == 0)
				continue;
			if (dev == NODEV || dev == bp->b_dev) {
				bp->b_flags |= B_ASYNC;
				notavail(bp);
				bwrite(bp);
				goto loop;
			}
		}
	splx(s);
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0, set a generalized
 * code.  Actually the latter is always true because devices
 * don't yet return specific errors.
 */
geterror(bp)
	register struct buf *bp;
{
	int error = 0;

	if (bp->b_flags&B_ERROR)
		if ((error = bp->b_error) == 0)
			return (EIO);
	return (error);
}

/*
 * Invalidate in-core blocks belonging to a closed or unmounted filesystem.
 *
 * This is not nicely done at all - the buffer ought to be removed from the
 * hash chains & have its dev/blkno fields clobbered, but unfortunately we
 * can't do that here, as it is quite possible that the block is still
 * being used for i/o.  Eventually, all disc drivers should be forced to
 * have a close routine, which ought to ensure that the queue is empty, then
 * properly flush the queues.  Until that happy day, this suffices for
 * correctness.						... kre
 */
binval(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct bufhd *hp;
#define	dp ((struct buf *)hp)

	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
			if (bp->b_dev == dev)
				bp->b_flags |= B_INVAL;
}
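
/*
 * Illustrative sketch (not part of the original source): how bflush()
 * and binval() above fit together when a filesystem goes away, as the
 * comments above suggest ("from umount and update", "closed or unmounted
 * filesystem").  update() periodically calls bflush(NODEV) to push every
 * delayed write; an unmount pushes the dying device's delayed writes and
 * then invalidates its cached blocks.  The routine name is hypothetical,
 * and a real unmount path does considerably more checking than this.
 */
#ifdef notdef
example_unmount_cache(dev)
	dev_t dev;
{

	bflush(dev);		/* start async writes of delayed-write blocks */
	binval(dev);		/* mark remaining cached blocks invalid */
}
#endif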