/*
 * Copyright (c) 1982, 1986 Regents of the University of California.
 * All rights reserved.  The Berkeley software License Agreement
 * specifies the terms and conditions for redistribution.
 *
 *	@(#)vfs_bio.c	7.1.1.1 (Berkeley) 04/02/87
 */

#include "../machine/pte.h"

#include "param.h"
#include "systm.h"
#include "dir.h"
#include "user.h"
#include "buf.h"
#include "conf.h"
#include "proc.h"
#include "seg.h"
#include "vm.h"
#include "trace.h"

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
struct buf *
#ifdef SECSIZE
bread(dev, blkno, size, secsize)
#else SECSIZE
bread(dev, blkno, size)
#endif SECSIZE
	dev_t dev;
	daddr_t blkno;
	int size;
#ifdef SECSIZE
	long secsize;
#endif SECSIZE
{
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
#ifdef SECSIZE
	bp = getblk(dev, blkno, size, secsize);
#else SECSIZE
	bp = getblk(dev, blkno, size);
#endif SECSIZE
	if (bp->b_flags&B_DONE) {
		trace(TR_BREADHIT, pack(dev, size), blkno);
		return (bp);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	(*bdevsw[major(dev)].d_strategy)(bp);
	trace(TR_BREADMISS, pack(dev, size), blkno);
	u.u_ru.ru_inblock++;		/* pay for read */
	biowait(bp);
	return (bp);
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller).
 */
struct buf *
#ifdef SECSIZE
breada(dev, blkno, size, secsize, rablkno, rabsize)
#else SECSIZE
breada(dev, blkno, size, rablkno, rabsize)
#endif SECSIZE
	dev_t dev;
	daddr_t blkno; int size;
#ifdef SECSIZE
	long secsize;
#endif SECSIZE
	daddr_t rablkno; int rabsize;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	/*
	 * If the block isn't in core, then allocate
	 * a buffer and initiate i/o (getblk checks
	 * for a cache hit).
	 */
	if (!incore(dev, blkno)) {
#ifdef SECSIZE
		bp = getblk(dev, blkno, size, secsize);
#else SECSIZE
		bp = getblk(dev, blkno, size);
#endif SECSIZE
		if ((bp->b_flags&B_DONE) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breada");
			(*bdevsw[major(dev)].d_strategy)(bp);
			trace(TR_BREADMISS, pack(dev, size), blkno);
			u.u_ru.ru_inblock++;	/* pay for read */
		} else
			trace(TR_BREADHIT, pack(dev, size), blkno);
	}

	/*
	 * If there's a read-ahead block, start i/o
	 * on it also (as above).
	 */
	if (rablkno && !incore(dev, rablkno)) {
#ifdef SECSIZE
		rabp = getblk(dev, rablkno, rabsize, secsize);
#else SECSIZE
		rabp = getblk(dev, rablkno, rabsize);
#endif SECSIZE
		if (rabp->b_flags & B_DONE) {
			brelse(rabp);
			trace(TR_BREADHITRA, pack(dev, rabsize), blkno);
		} else {
			rabp->b_flags |= B_READ|B_ASYNC;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			(*bdevsw[major(dev)].d_strategy)(rabp);
			trace(TR_BREADMISSRA, pack(dev, rabsize), rablkno);
			u.u_ru.ru_inblock++;	/* pay in advance */
		}
	}

	/*
	 * If block was in core, let bread get it.
	 * If block wasn't in core, then the read was started
	 * above, and just wait for it.
	 */
	if (bp == NULL)
#ifdef SECSIZE
		return (bread(dev, blkno, size, secsize));
#else SECSIZE
		return (bread(dev, blkno, size));
#endif SECSIZE
	biowait(bp);
	return (bp);
}

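#ifdef notdef
/*
 * Example (sketch only): a typical synchronous read by a caller such
 * as the file system read path.  The next block is used as the
 * read-ahead candidate purely for illustration, as are the destination
 * "dst" and the routine name.  The non-SECSIZE calling form is shown,
 * and the copy out of b_un.b_addr assumes the caller wants the whole
 * block.
 */
readsketch(dev, blkno, size, dst)
	dev_t dev;
	daddr_t blkno;
	int size;
	caddr_t dst;
{
	register struct buf *bp;

	bp = breada(dev, blkno, size, blkno + btodb(size), size);
	if ((bp->b_flags & B_ERROR) == 0)
		bcopy(bp->b_un.b_addr, dst, (unsigned)size);
	brelse(bp);			/* give the buffer back to the cache */
}
#endif notdef
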
/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
	register struct buf *bp;
{
	register flag;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if ((flag&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;	/* no one paid yet */
	trace(TR_BWRITE, pack(bp->b_dev, bp->b_bcount), bp->b_blkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	(*bdevsw[major(bp->b_dev)].d_strategy)(bp);

	/*
	 * If the write was synchronous, then await i/o completion.
	 * If the write was "delayed", then we put the buffer on
	 * the q of blocks awaiting i/o completion status.
	 */
	if ((flag&B_ASYNC) == 0) {
		biowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI)
		bp->b_flags |= B_AGE;
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
	register struct buf *bp;
{

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;	/* no one paid yet */
	if (bdevsw[major(bp->b_dev)].d_flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	bwrite(bp);
}

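#ifdef notdef
/*
 * Example (sketch only): the three ways a dirty buffer is normally
 * disposed of.  A caller that must know the data is on disk before
 * proceeding uses bwrite; one that expects to dirty the same block
 * again soon (e.g. a partial-block write) uses bdwrite; one that only
 * needs the write started uses bawrite.  "mustwait" and "again" are
 * illustrative flags supplied by the caller.
 */
writesketch(bp, mustwait, again)
	register struct buf *bp;
	int mustwait, again;
{

	if (mustwait)
		bwrite(bp);		/* synchronous; waits, then releases */
	else if (again)
		bdwrite(bp);		/* delayed write; stays in the cache */
	else
		bawrite(bp);		/* asynchronous write */
}
#endif notdef
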
/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	register s;

	trace(TR_BRELSE, pack(bp->b_dev, bp->b_bufsize), bp->b_blkno);
	/*
	 * If someone's waiting for the buffer, or
	 * is waiting for a buffer, wake 'em up.
	 */
	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags&B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	if (bp->b_flags&B_ERROR)
		if (bp->b_flags & B_LOCKED)
			bp->b_flags &= ~B_ERROR;	/* try again later */
		else
			bp->b_dev = NODEV;		/* no assoc */

	/*
	 * Stick the buffer back on a free list.
	 */
	s = splbio();
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR|B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE);
	splx(s);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada).
 */
incore(dev, blkno)
	dev_t dev;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(dev, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_blkno == blkno && bp->b_dev == dev &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

struct buf *
#ifdef SECSIZE
baddr(dev, blkno, size, secsize)
#else SECSIZE
baddr(dev, blkno, size)
#endif SECSIZE
	dev_t dev;
	daddr_t blkno;
	int size;
#ifdef SECSIZE
	long secsize;
#endif SECSIZE
{

	if (incore(dev, blkno))
#ifdef SECSIZE
		return (bread(dev, blkno, size, secsize));
#else SECSIZE
		return (bread(dev, blkno, size));
#endif SECSIZE
	return (0);
}

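#ifdef notdef
/*
 * Example (sketch only): fetch a block only if it is already resident,
 * so that purely optional data never costs a disk read.  A return of 0
 * from baddr means the block was not in core and the caller does
 * without it.  The non-SECSIZE calling form is shown and the routine
 * name is illustrative.
 */
peeksketch(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	bp = baddr(dev, blkno, size);
	if (bp == NULL)
		return;			/* not cached; skip the optimization */
	/* ... examine bp->b_un.b_addr ... */
	brelse(bp);
}
#endif notdef
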
/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * We use splx here because this routine may be called
 * on the interrupt stack during a dump, and we don't
 * want to lower the ipl back to 0.
 */
struct buf *
#ifdef SECSIZE
getblk(dev, blkno, size, secsize)
#else SECSIZE
getblk(dev, blkno, size)
#endif SECSIZE
	dev_t dev;
	daddr_t blkno;
	int size;
#ifdef SECSIZE
	long secsize;
#endif SECSIZE
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * To prevent overflow of 32-bit ints when converting block
	 * numbers to byte offsets, blknos > 2^32 / DEV_BSIZE are set
	 * to the maximum number that can be converted to a byte offset
	 * without overflow.  This is historic code; what bug it fixed,
	 * or whether it is still a reasonable thing to do is open to
	 * dispute.  mkm 9/85
	 */
	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-DEV_BSHIFT))
		blkno = 1 << ((sizeof(int)*NBBY-DEV_BSHIFT) + 1);
	/*
	 * Search the cache for the block.  If we hit, but
	 * the buffer is in use for i/o, then we wait until
	 * the i/o has completed.
	 */
	dp = BUFHASH(dev, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_dev != dev ||
		    bp->b_flags&B_INVAL)
			continue;
		s = splbio();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(bp);
		if (bp->b_bcount != size && brealloc(bp, size) == 0)
			goto loop;
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	if (major(dev) >= nblkdev)
		panic("blkdev");
	bp = getnewbuf();
	bfree(bp);
	bremhash(bp);
	binshash(bp, dp);
	bp->b_dev = dev;
#ifdef SECSIZE
	bp->b_blksize = secsize;
#endif SECSIZE
	bp->b_blkno = blkno;
	bp->b_error = 0;
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}

/*
 * get an empty block,
 * not assigned to any particular device
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
loop:
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bfree(bp);
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	binshash(bp, flist);
	bp->b_dev = (dev_t)NODEV;
#ifdef SECSIZE
	bp->b_blksize = DEV_BSIZE;
#endif SECSIZE
	bp->b_error = 0;
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}

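#ifdef notdef
/*
 * Example (sketch only): allocate a buffer for a block that is about
 * to be written in its entirety, so there is no need to read the old
 * contents first.  getblk returns either the cached copy or a freshly
 * assigned buffer; the caller clears it, fills it in, and schedules a
 * delayed write.  The non-SECSIZE calling form is shown and the
 * routine name is illustrative.
 */
newblksketch(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	bp = getblk(dev, blkno, size);
	clrbuf(bp);
	/* ... copy the new contents into bp->b_un.b_addr ... */
	bdwrite(bp);
}
#endif notdef
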
/*
 * Allocate space associated with a buffer.
 * If can't get space, buffer is released.
 */
brealloc(bp, size)
	register struct buf *bp;
	int size;
{
	daddr_t start, last;
	register struct buf *ep;
	struct buf *dp;
	int s;

	/*
	 * First need to make sure that all overlapping previous I/O
	 * is dispensed with.
	 */
	if (size == bp->b_bcount)
		return (1);
	if (size < bp->b_bcount) {
		if (bp->b_flags & B_DELWRI) {
			bwrite(bp);
			return (0);
		}
		if (bp->b_flags & B_LOCKED)
			panic("brealloc");
		return (allocbuf(bp, size));
	}
	bp->b_flags &= ~B_DONE;
	if (bp->b_dev == NODEV)
		return (allocbuf(bp, size));

	trace(TR_BREALLOC, pack(bp->b_dev, size), bp->b_blkno);
	/*
	 * Search cache for any buffers that overlap the one that we
	 * are trying to allocate.  Overlapping buffers must be marked
	 * invalid, after being written out if they are dirty (indicated
	 * by B_DELWRI).  A disk block must be mapped by at most one
	 * buffer at any point in time.  Care must be taken to avoid
	 * deadlocking when two buffers are trying to get the same set
	 * of disk blocks.
	 */
	start = bp->b_blkno;
#ifdef SECSIZE
	last = start + size/bp->b_blksize - 1;
#else SECSIZE
	last = start + btodb(size) - 1;
#endif SECSIZE
	dp = BUFHASH(bp->b_dev, bp->b_blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep == bp || ep->b_dev != bp->b_dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
#ifdef SECSIZE
		    ep->b_blkno + ep->b_bcount/ep->b_blksize <= start)
#else SECSIZE
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
#endif SECSIZE
			continue;
		s = splbio();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(ep);
		if (ep->b_flags & B_DELWRI) {
			bwrite(ep);
			goto loop;
		}
		ep->b_flags |= B_INVAL;
		brelse(ep);
	}
	return (allocbuf(bp, size));
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	int s;

loop:
	s = splbio();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO+1);
		splx(s);
		goto loop;
	}
	splx(s);
	bp = dp->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		bp->b_flags |= B_ASYNC;
		bwrite(bp);
		goto loop;
	}
	trace(TR_BRELSE, pack(bp->b_dev, bp->b_bufsize), bp->b_blkno);
	bp->b_flags = B_BUSY;
	return (bp);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags&B_DONE)==0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	if (u.u_error == 0)			/* XXX */
		u.u_error = geterror(bp);
}

/*
 * Mark I/O complete on a buffer.
 * If someone should be called, e.g. the pageout
 * daemon, do so.  Otherwise, wake up anyone
 * waiting for it.
 */
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

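#ifdef notdef
/*
 * Example (sketch only): start an asynchronous read and have biodone
 * call a completion routine at interrupt time instead of a process
 * sleeping in biowait.  "rdone" is an illustrative routine supplied by
 * the caller; since biodone does not release B_CALL buffers itself,
 * rdone is responsible for the brelse.  The non-SECSIZE calling form
 * of getblk is shown.
 */
asyncsketch(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;
	extern int rdone();

	bp = getblk(dev, blkno, size);
	if (bp->b_flags & B_DONE) {
		rdone(bp);			/* cache hit; already valid */
		return;
	}
	bp->b_flags |= B_READ|B_ASYNC|B_CALL;
	bp->b_iodone = rdone;
	(*bdevsw[major(dev)].d_strategy)(bp);
	u.u_ru.ru_inblock++;			/* pay for the read */
}
#endif notdef
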
/*
 * Insure that no part of a specified block is in an incore buffer.
#ifdef SECSIZE
 * "size" is given in device blocks (the units of b_blkno).
#endif SECSIZE
 */
blkflush(dev, blkno, size)
	dev_t dev;
	daddr_t blkno;
#ifdef SECSIZE
	int size;
#else SECSIZE
	long size;
#endif SECSIZE
{
	register struct buf *ep;
	struct buf *dp;
	daddr_t start, last;
	int s;

	start = blkno;
#ifdef SECSIZE
	last = start + size - 1;
#else SECSIZE
	last = start + btodb(size) - 1;
#endif SECSIZE
	dp = BUFHASH(dev, blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep->b_dev != dev || (ep->b_flags&B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
#ifdef SECSIZE
		    ep->b_blkno + ep->b_bcount / ep->b_blksize <= start)
#else SECSIZE
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
#endif SECSIZE
			continue;
		s = splbio();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		if (ep->b_flags & B_DELWRI) {
			splx(s);
			notavail(ep);
			bwrite(ep);
			goto loop;
		}
		splx(s);
	}
}

/*
 * Make sure all write-behind blocks
 * on dev (or NODEV for all)
 * are flushed out.
 * (from umount and update)
 */
bflush(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct buf *flist;
	int s;

loop:
	s = splbio();
	for (flist = bfreelist; flist < &bfreelist[BQ_EMPTY]; flist++)
		for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
			if ((bp->b_flags & B_DELWRI) == 0)
				continue;
			if (dev == NODEV || dev == bp->b_dev) {
				bp->b_flags |= B_ASYNC;
				notavail(bp);
				bwrite(bp);
				splx(s);
				goto loop;
			}
		}
	splx(s);
}

/*
 * Pick up the device's error number and pass it to the user;
 * if there is an error but the number is 0, set a generalized code.
 */
geterror(bp)
	register struct buf *bp;
{
	int error = 0;

	if (bp->b_flags&B_ERROR)
		if ((error = bp->b_error)==0)
			return (EIO);
	return (error);
}

/*
 * Invalidate in core blocks belonging to closed or umounted filesystem.
 *
 * This is not nicely done at all - the buffer ought to be removed from the
 * hash chains & have its dev/blkno fields clobbered, but unfortunately we
 * can't do that here, as it is quite possible that the block is still
 * being used for i/o.  Eventually, all disc drivers should be forced to
 * have a close routine, which ought to ensure that the queue is empty,
 * then properly flush the queues.  Until that happy day, this suffices
 * for correctness.  ... kre
 */
binval(dev)
	dev_t dev;
{
	register struct buf *bp;
	register struct bufhd *hp;
#define dp ((struct buf *)hp)

	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++)
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
			if (bp->b_dev == dev)
				bp->b_flags |= B_INVAL;
}
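
#ifdef notdef
/*
 * Example (sketch only): the order of operations a caller such as
 * umount would use to get a device's blocks out of the cache entirely:
 * push any delayed writes to disk, then invalidate whatever is left.
 * The routine name is illustrative.
 */
cleansketch(dev)
	dev_t dev;
{

	bflush(dev);
	binval(dev);
}
#endif notdef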