/*
 * Copyright (c) 1982, 1986, 1989 Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms are permitted
 * provided that the above copyright notice and this paragraph are
 * duplicated in all such forms and that any documentation,
 * advertising materials, and other materials related to such
 * distribution and use acknowledge that the software was developed
 * by the University of California, Berkeley.  The name of the
 * University may not be used to endorse or promote products derived
 * from this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 *
 *	@(#)vfs_bio.c	7.11 (Berkeley) 08/28/89
 */

#include "param.h"
#include "user.h"
#include "buf.h"
#include "vnode.h"
#include "trace.h"
#include "ucred.h"

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size);
	if (bp->b_flags&(B_DONE|B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp->v_mount->m_fsid[0], size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp->v_mount->m_fsid[0], size), blkno);
	u.u_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}
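/*
 * Illustrative sketch, not part of the original file: a typical caller
 * reads a block with bread(), uses the data, and hands the buffer back
 * with brelse().  The names example_read, lbn and bsize are hypothetical
 * caller state, not anything defined in this source.
 */
#ifdef notdef
example_read(vp, lbn, bsize, cred)
	struct vnode *vp;
	daddr_t lbn;
	int bsize;
	struct ucred *cred;
{
	struct buf *bp;
	int error;

	if (error = bread(vp, lbn, bsize, cred, &bp)) {
		brelse(bp);		/* *bpp is filled in even on error */
		return (error);
	}
	/* ... use the bsize bytes at bp->b_un.b_addr ... */
	brelse(bp);			/* release; data stays in the cache */
	return (0);
}
#endif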
/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller)
 */
breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno; int rabsize;
	struct ucred *cred;
	struct buf **bpp;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	/*
	 * If the block isn't in core, then allocate
	 * a buffer and initiate i/o (getblk checks
	 * for a cache hit).
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size);
		if ((bp->b_flags&(B_DONE|B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breada");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp->v_mount->m_fsid[0], size),
			    blkno);
			u.u_ru.ru_inblock++;		/* pay for read */
		} else
			trace(TR_BREADHIT, pack(vp->v_mount->m_fsid[0], size),
			    blkno);
	}

	/*
	 * If there's a read-ahead block, start i/o
	 * on it also (as above).
	 */
	if (rablkno && !incore(vp, rablkno)) {
		rabp = getblk(vp, rablkno, rabsize);
		if (rabp->b_flags & (B_DONE|B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA,
			    pack(vp->v_mount->m_fsid[0], rabsize), blkno);
		} else {
			rabp->b_flags |= B_READ|B_ASYNC;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA,
			    pack(vp->v_mount->m_fsid[0], rabsize), rablkno);
			u.u_ru.ru_inblock++;		/* pay in advance */
		}
	}

	/*
	 * If block was in core, let bread get it.
	 * If block wasn't in core, then the read was started
	 * above, and just wait for it.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}

/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
	register struct buf *bp;
{
	register int flag;
	int error;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if ((flag&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	trace(TR_BWRITE,
	    pack(bp->b_vp->v_mount->m_fsid[0], bp->b_bcount), bp->b_blkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await i/o completion.
	 * If the write was "delayed", then we put the buffer on
	 * the q of blocks awaiting i/o completion status.
	 * Otherwise (an asynchronous write that was not delayed)
	 * there is nothing to wait for and no error to report yet.
	 */
	if ((flag&B_ASYNC) == 0) {
		error = biowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI) {
		bp->b_flags |= B_AGE;
		error = 0;
	} else
		error = 0;
	return (error);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
	register struct buf *bp;
{

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
#ifdef notdef
	/*
	 * This does not work for buffers associated with
	 * vnodes that are remote - they have no dev.
	 * Besides, we don't use bio with tapes, so rather
	 * than develop a fix, we just ifdef this out for now.
	 */
	if (bdevsw[major(bp->b_dev)].d_flags & B_TAPE)
		bawrite(bp);
	else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
#endif
	bp->b_flags |= B_DELWRI | B_DONE;
	brelse(bp);
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}
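/*
 * Illustrative sketch, not part of the original file: choosing among the
 * three write entry points above.  example_write, "synchronous" and
 * "whole_block" are hypothetical caller state.
 */
#ifdef notdef
example_write(bp, synchronous, whole_block)
	register struct buf *bp;
	int synchronous, whole_block;
{

	if (synchronous)
		return (bwrite(bp));	/* start i/o, wait for it, release */
	if (whole_block)
		bawrite(bp);		/* start i/o, release, don't wait */
	else
		bdwrite(bp);		/* mark B_DELWRI, defer the write */
	return (0);
}
#endif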
/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	register s;

	trace(TR_BRELSE,
	    pack(bp->b_vp->v_mount->m_fsid[0], bp->b_bufsize), bp->b_blkno);
	/*
	 * If someone's waiting for the buffer, or
	 * is waiting for a free buffer, wake 'em up.
	 */
	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags&B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	if (bp->b_flags & B_NOCACHE) {
		bp->b_flags |= B_INVAL;
	}
	if (bp->b_flags&B_ERROR)
		if (bp->b_flags & B_LOCKED)
			bp->b_flags &= ~B_ERROR;	/* try again later */
		else
			brelvp(bp);			/* no assoc */

	/*
	 * Stick the buffer back on a free list.
	 */
	s = splbio();
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR|B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE|B_NOCACHE);
	splx(s);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada)
 */
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(vp, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_blkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

/*
 * Read the block via bread if it is already in core;
 * otherwise set *bpp to 0 without doing any I/O.
 */
baddr(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{

	if (incore(vp, blkno))
		return (bread(vp, blkno, size, cred, bpp));
	*bpp = 0;
	return (0);
}
/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * If we find the buffer, but it is dirty (marked DELWRI) and
 * its size is changing, we must write it out first.  When the
 * buffer is shrinking, the write is done by brealloc to avoid
 * losing the unwritten data.  When the buffer is growing, the
 * write is done by getblk, so that bread will not read stale
 * disk data over the modified data in the buffer.
 *
 * We use splx here because this routine may be called
 * on the interrupt stack during a dump, and we don't
 * want to lower the ipl back to 0.
 */
struct buf *
getblk(vp, blkno, size)
	register struct vnode *vp;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * To prevent overflow of 32-bit ints when converting block
	 * numbers to byte offsets, blknos > 2^32 / DEV_BSIZE are set
	 * to the maximum number that can be converted to a byte offset
	 * without overflow.  This is historic code; what bug it fixed,
	 * or whether it is still a reasonable thing to do is open to
	 * dispute.  mkm 9/85
	 */
	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-DEV_BSHIFT))
		blkno = 1 << ((sizeof(int)*NBBY-DEV_BSHIFT) + 1);
	/*
	 * Search the cache for the block.  If we hit, but
	 * the buffer is in use for i/o, then we wait until
	 * the i/o has completed.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_blkno != blkno || bp->b_vp != vp ||
		    bp->b_flags&B_INVAL)
			continue;
		s = splbio();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(bp);
		if (bp->b_bcount != size) {
			if (bp->b_bcount < size && (bp->b_flags&B_DELWRI)) {
				bp->b_flags &= ~B_ASYNC;
				(void) bwrite(bp);
				goto loop;
			}
			if (brealloc(bp, size) == 0)
				goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	bp = getnewbuf();
	bfree(bp);
	bremhash(bp);
	if (bp->b_vp)
		brelvp(bp);
	VREF(vp);
	bp->b_vp = vp;
	bp->b_dev = vp->v_rdev;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}

/*
 * Get an empty block,
 * not assigned to any particular device.
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
loop:
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bfree(bp);
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	brelvp(bp);
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, flist);
	if (brealloc(bp, size) == 0)
		goto loop;
	return (bp);
}
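/*
 * Illustrative sketch, not part of the original file: unlike getblk(),
 * which names a (vnode, block) pair and leaves the buffer findable on
 * the hash chains, geteblk() returns an anonymous scratch buffer that
 * only the caller holds.  example_scratch and its use are hypothetical.
 */
#ifdef notdef
example_scratch(bsize)
	int bsize;
{
	register struct buf *bp;

	bp = geteblk(bsize);		/* private buffer, marked B_INVAL */
	/* ... build temporary data at bp->b_un.b_addr ... */
	brelse(bp);			/* goes to the BQ_AGE list for reuse */
}
#endif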
/*
 * Allocate space associated with a buffer.
 * If we can't get space, the buffer is released.
 */
brealloc(bp, size)
	register struct buf *bp;
	int size;
{
	daddr_t start, last;
	register struct buf *ep;
	struct buf *dp;
	int s;

	/*
	 * First need to make sure that all overlapping previous I/O
	 * is dispatched with.
	 */
	if (size == bp->b_bcount)
		return (1);
	if (size < bp->b_bcount) {
		if (bp->b_flags & B_DELWRI) {
			(void) bwrite(bp);
			return (0);
		}
		if (bp->b_flags & B_LOCKED)
			panic("brealloc");
		return (allocbuf(bp, size));
	}
	bp->b_flags &= ~B_DONE;
	if (bp->b_vp == (struct vnode *)0)
		return (allocbuf(bp, size));

	trace(TR_BREALLOC,
	    pack(bp->b_vp->v_mount->m_fsid[0], size), bp->b_blkno);
	/*
	 * Search cache for any buffers that overlap the one that we
	 * are trying to allocate.  Overlapping buffers must be marked
	 * invalid, after being written out if they are dirty (indicated
	 * by B_DELWRI).  A disk block must be mapped by at most one buffer
	 * at any point in time.  Care must be taken to avoid deadlocking
	 * when two buffers are trying to get the same set of disk blocks.
	 */
	start = bp->b_blkno;
	last = start + btodb(size) - 1;
	dp = BUFHASH(bp->b_vp, bp->b_blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep == bp || ep->b_vp != bp->b_vp ||
		    (ep->b_flags & B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
			continue;
		s = splbio();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(ep);
		if (ep->b_flags & B_DELWRI) {
			(void) bwrite(ep);
			goto loop;
		}
		ep->b_flags |= B_INVAL;
		brelse(ep);
	}
	return (allocbuf(bp, size));
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	register struct ucred *cred;
	int s;

loop:
	s = splbio();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO+1);
		splx(s);
		goto loop;
	}
	splx(s);
	bp = dp->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE,
	    pack(bp->b_vp->v_mount->m_fsid[0], bp->b_bufsize), bp->b_blkno);
	brelvp(bp);
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	return (bp);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	/*
	 * Pick up the device's error number and pass it to the user;
	 * if there is an error but the number is 0 set a generalized code.
	 */
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 * If someone should be called, e.g. the pageout
 * daemon, do so.  Otherwise, wake up anyone
 * waiting for it.
 */
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if ((bp->b_flags & B_READ) == 0)
		bp->b_dirtyoff = bp->b_dirtyend = 0;
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}
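/*
 * Illustrative sketch, not part of the original file: an asynchronous
 * consumer can have biodone() call it back at i/o completion instead of
 * waking a sleeper, by setting B_CALL and b_iodone before the strategy
 * call.  example_async_read and mydone are hypothetical, and the
 * cache-hit case is omitted for brevity.
 */
#ifdef notdef
int mydone();

example_async_read(vp, blkno, size)
	struct vnode *vp;
	daddr_t blkno;
	int size;
{
	register struct buf *bp;

	bp = getblk(vp, blkno, size);
	bp->b_flags |= B_READ|B_ASYNC|B_CALL;
	bp->b_iodone = mydone;		/* called from biodone() */
	VOP_STRATEGY(bp);		/* returns without waiting */
}
#endif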
/*
 * Ensure that no part of a specified block is in an incore buffer.
#ifdef SECSIZE
 * "size" is given in device blocks (the units of b_blkno).
#endif SECSIZE
 */
blkflush(vp, blkno, size)
	struct vnode *vp;
	daddr_t blkno;
	long size;
{
	register struct buf *ep;
	struct buf *dp;
	daddr_t start, last;
	int s, error, allerrors = 0;

	start = blkno;
	last = start + btodb(size) - 1;
	dp = BUFHASH(vp, blkno);
loop:
	for (ep = dp->b_forw; ep != dp; ep = ep->b_forw) {
		if (ep->b_vp != vp || (ep->b_flags & B_INVAL))
			continue;
		/* look for overlap */
		if (ep->b_bcount == 0 || ep->b_blkno > last ||
		    ep->b_blkno + btodb(ep->b_bcount) <= start)
			continue;
		s = splbio();
		if (ep->b_flags&B_BUSY) {
			ep->b_flags |= B_WANTED;
			sleep((caddr_t)ep, PRIBIO+1);
			splx(s);
			goto loop;
		}
		if (ep->b_flags & B_DELWRI) {
			splx(s);
			notavail(ep);
			if (error = bwrite(ep))
				allerrors = error;
			goto loop;
		}
		splx(s);
	}
	return (allerrors);
}

/*
 * Make sure all write-behind blocks associated
 * with mount point are flushed out (from sync).
 */
bflush(mountp)
	struct mount *mountp;
{
	register struct buf *bp;
	register struct buf *flist;
	int s;

loop:
	s = splbio();
	for (flist = bfreelist; flist < &bfreelist[BQ_EMPTY]; flist++) {
		for (bp = flist->av_forw; bp != flist; bp = bp->av_forw) {
			if ((bp->b_flags & B_BUSY))
				continue;
			if ((bp->b_flags & B_DELWRI) == 0)
				continue;
			if (bp->b_vp && bp->b_vp->v_mount == mountp) {
				splx(s);
				notavail(bp);
				(void) bawrite(bp);
				goto loop;
			}
		}
	}
	splx(s);
}

/*
 * Invalidate in core blocks belonging to closed or unmounted filesystem
 *
 * We walk through the buffer pool and invalidate any buffers for the
 * indicated mount point.  Normally this routine is preceded by a bflush
 * call, so that on a quiescent filesystem there will be no dirty
 * buffers when we are done.  We return the count of dirty buffers when
 * we are finished.
 */
binval(mountp)
	struct mount *mountp;
{
	register struct buf *bp;
	register struct bufhd *hp;
	int s, dirty = 0;
#define dp ((struct buf *)hp)

loop:
	for (hp = bufhash; hp < &bufhash[BUFHSZ]; hp++) {
		for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
			if (bp->b_vp == NULL || bp->b_vp->v_mount != mountp)
				continue;
			s = splbio();
			if (bp->b_flags & B_BUSY) {
				bp->b_flags |= B_WANTED;
				sleep((caddr_t)bp, PRIBIO+1);
				splx(s);
				goto loop;
			}
			splx(s);
			notavail(bp);
			if (bp->b_flags & B_DELWRI) {
				(void) bawrite(bp);
				dirty++;
				continue;
			}
			bp->b_flags |= B_INVAL;
			brelvp(bp);
			brelse(bp);
		}
	}
	return (dirty);
}

/*
 * Disassociate the buffer from its vnode, releasing the vnode reference.
 */
brelvp(bp)
	struct buf *bp;
{
	struct vnode *vp;

	if (bp->b_vp == (struct vnode *) 0)
		return;
	vp = bp->b_vp;
	bp->b_vp = (struct vnode *) 0;
	vrele(vp);
}