/*
 * Copyright (c) 1982, 1986, 1989 Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms are permitted
 * provided that the above copyright notice and this paragraph are
 * duplicated in all such forms and that any documentation,
 * advertising materials, and other materials related to such
 * distribution and use acknowledge that the software was developed
 * by the University of California, Berkeley.  The name of the
 * University may not be used to endorse or promote products derived
 * from this software without specific prior written permission.
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
 *
 *	@(#)vfs_cluster.c	7.18 (Berkeley) 12/30/89
 */

#include "param.h"
#include "user.h"
#include "buf.h"
#include "vnode.h"
#include "mount.h"
#include "trace.h"
#include "ucred.h"

/*
 * Read in (if necessary) the block and return a buffer pointer.
 */
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	register struct buf *bp;

	if (size == 0)
		panic("bread: size 0");
	*bpp = bp = getblk(vp, blkno, size);
	if (bp->b_flags&(B_DONE|B_DELWRI)) {
		trace(TR_BREADHIT, pack(vp->v_mount->m_fsid[0], size), blkno);
		return (0);
	}
	bp->b_flags |= B_READ;
	if (bp->b_bcount > bp->b_bufsize)
		panic("bread");
	if (bp->b_rcred == NOCRED && cred != NOCRED) {
		crhold(cred);
		bp->b_rcred = cred;
	}
	VOP_STRATEGY(bp);
	trace(TR_BREADMISS, pack(vp->v_mount->m_fsid[0], size), blkno);
	u.u_ru.ru_inblock++;		/* pay for read */
	return (biowait(bp));
}

/*
 * Read in the block, like bread, but also start I/O on the
 * read-ahead block (which is not allocated to the caller)
 */
breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno; int rabsize;
	struct ucred *cred;
	struct buf **bpp;
{
	register struct buf *bp, *rabp;

	bp = NULL;
	/*
	 * If the block isn't in core, then allocate
	 * a buffer and initiate i/o (getblk checks
	 * for a cache hit).
	 */
	if (!incore(vp, blkno)) {
		*bpp = bp = getblk(vp, blkno, size);
		if ((bp->b_flags&(B_DONE|B_DELWRI)) == 0) {
			bp->b_flags |= B_READ;
			if (bp->b_bcount > bp->b_bufsize)
				panic("breada");
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
			VOP_STRATEGY(bp);
			trace(TR_BREADMISS, pack(vp->v_mount->m_fsid[0], size),
			    blkno);
			u.u_ru.ru_inblock++;		/* pay for read */
		} else
			trace(TR_BREADHIT, pack(vp->v_mount->m_fsid[0], size),
			    blkno);
	}

	/*
	 * If there's a read-ahead block, start i/o
	 * on it also (as above).
	 */
	if (rablkno && !incore(vp, rablkno)) {
		rabp = getblk(vp, rablkno, rabsize);
		if (rabp->b_flags & (B_DONE|B_DELWRI)) {
			brelse(rabp);
			trace(TR_BREADHITRA,
			    pack(vp->v_mount->m_fsid[0], rabsize), rablkno);
		} else {
			rabp->b_flags |= B_READ|B_ASYNC;
			if (rabp->b_bcount > rabp->b_bufsize)
				panic("breadrabp");
			if (rabp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				rabp->b_rcred = cred;
			}
			VOP_STRATEGY(rabp);
			trace(TR_BREADMISSRA,
			    pack(vp->v_mount->m_fsid[0], rabsize), rablkno);
			u.u_ru.ru_inblock++;		/* pay in advance */
		}
	}

	/*
	 * If block was in core, let bread get it.
	 * If block wasn't in core, then the read was started
	 * above, and just wait for it.
	 */
	if (bp == NULL)
		return (bread(vp, blkno, size, cred, bpp));
	return (biowait(bp));
}

/*
 * Write the buffer, waiting for completion.
 * Then release the buffer.
 */
bwrite(bp)
	register struct buf *bp;
{
	register int flag;
	int error;

	flag = bp->b_flags;
	bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
	if ((flag&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	trace(TR_BWRITE,
	    pack(bp->b_vp->v_mount->m_fsid[0], bp->b_bcount), bp->b_lblkno);
	if (bp->b_bcount > bp->b_bufsize)
		panic("bwrite");
	VOP_STRATEGY(bp);

	/*
	 * If the write was synchronous, then await i/o completion.
	 * If the write was "delayed", then we put the buffer on
	 * the queue of blocks awaiting i/o completion status.
	 */
	if ((flag&B_ASYNC) == 0) {
		error = biowait(bp);
		brelse(bp);
	} else if (flag & B_DELWRI) {
		bp->b_flags |= B_AGE;
		error = 0;
	} else
		error = 0;	/* plain asynchronous write, no status yet */
	return (error);
}

/*
 * Release the buffer, marking it so that if it is grabbed
 * for another purpose it will be written out before being
 * given up (e.g. when writing a partial block where it is
 * assumed that another write for the same block will soon follow).
 * This can't be done for magtape, since writes must be done
 * in the same order as requested.
 */
bdwrite(bp)
	register struct buf *bp;
{

	if ((bp->b_flags&B_DELWRI) == 0)
		u.u_ru.ru_oublock++;		/* no one paid yet */
	/*
	 * If this is a tape drive, the write must be initiated.
	 */
	if (VOP_IOCTL(bp->b_vp, 0, B_TAPE, 0, NOCRED) == 0) {
		bawrite(bp);
	} else {
		bp->b_flags |= B_DELWRI | B_DONE;
		brelse(bp);
	}
}

/*
 * Release the buffer, start I/O on it, but don't wait for completion.
 */
bawrite(bp)
	register struct buf *bp;
{

	bp->b_flags |= B_ASYNC;
	(void) bwrite(bp);
}

/*
 * Release the buffer, with no I/O implied.
 */
brelse(bp)
	register struct buf *bp;
{
	register struct buf *flist;
	register s;

	trace(TR_BRELSE,
	    pack(bp->b_vp->v_mount->m_fsid[0], bp->b_bufsize), bp->b_lblkno);
	/*
	 * If a process is waiting for the buffer, or
	 * is waiting for a free buffer, awaken it.
	 */
	if (bp->b_flags&B_WANTED)
		wakeup((caddr_t)bp);
	if (bfreelist[0].b_flags&B_WANTED) {
		bfreelist[0].b_flags &= ~B_WANTED;
		wakeup((caddr_t)bfreelist);
	}
	/*
	 * Retry I/O for locked buffers rather than invalidating them.
	 */
	if ((bp->b_flags & B_ERROR) && (bp->b_flags & B_LOCKED))
		bp->b_flags &= ~B_ERROR;

	/*
	 * Disassociate buffers that are no longer valid.
	 */
	if (bp->b_flags & (B_NOCACHE|B_ERROR))
		bp->b_flags |= B_INVAL;
	if ((bp->b_bufsize <= 0) || (bp->b_flags & (B_ERROR|B_INVAL))) {
		if (bp->b_vp)
			brelvp(bp);
		bp->b_flags &= ~B_DELWRI;
	}
	/*
	 * Stick the buffer back on a free list.
	 */
	s = splbio();
	if (bp->b_bufsize <= 0) {
		/* block has no buffer ... put at front of unused buffer list */
		flist = &bfreelist[BQ_EMPTY];
		binsheadfree(bp, flist);
	} else if (bp->b_flags & (B_ERROR|B_INVAL)) {
		/* block has no info ... put at front of most free list */
		flist = &bfreelist[BQ_AGE];
		binsheadfree(bp, flist);
	} else {
		if (bp->b_flags & B_LOCKED)
			flist = &bfreelist[BQ_LOCKED];
		else if (bp->b_flags & B_AGE)
			flist = &bfreelist[BQ_AGE];
		else
			flist = &bfreelist[BQ_LRU];
		binstailfree(bp, flist);
	}
	bp->b_flags &= ~(B_WANTED|B_BUSY|B_ASYNC|B_AGE|B_NOCACHE);
	splx(s);
}

/*
 * See if the block is associated with some buffer
 * (mainly to avoid getting hung up on a wait in breada)
 */
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	register struct buf *bp;
	register struct buf *dp;

	dp = BUFHASH(vp, blkno);
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw)
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    (bp->b_flags & B_INVAL) == 0)
			return (1);
	return (0);
}

/*
 * Return a block if it is in memory.
 */
baddr(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{

	if (incore(vp, blkno))
		return (bread(vp, blkno, size, cred, bpp));
	*bpp = 0;
	return (0);
}

/*
 * Assign a buffer for the given block.  If the appropriate
 * block is already associated, return it; otherwise search
 * for the oldest non-busy buffer and reassign it.
 *
 * We use splx here because this routine may be called
 * on the interrupt stack during a dump, and we don't
 * want to lower the ipl back to 0.
 */
struct buf *
getblk(vp, blkno, size)
	register struct vnode *vp;
	daddr_t blkno;
	int size;
{
	register struct buf *bp, *dp;
	int s;

	if (size > MAXBSIZE)
		panic("getblk: size too big");
	/*
	 * To prevent overflow of 32-bit ints when converting block
	 * numbers to byte offsets, blknos > 2^32 / DEV_BSIZE are set
	 * to the maximum number that can be converted to a byte offset
	 * without overflow.  This is historic code; what bug it fixed,
	 * or whether it is still a reasonable thing to do is open to
	 * dispute.  mkm 9/85
	 *
	 * Make it a panic to see if it ever really happens.  mkm 11/89
	 */
	if ((unsigned)blkno >= 1 << (sizeof(int)*NBBY-DEV_BSHIFT)) {
		panic("getblk: blkno too big");
		blkno = 1 << ((sizeof(int)*NBBY-DEV_BSHIFT) + 1);
	}
	/*
	 * Search the cache for the block.  If we hit, but
	 * the buffer is in use for i/o, then we wait until
	 * the i/o has completed.
	 */
	dp = BUFHASH(vp, blkno);
loop:
	for (bp = dp->b_forw; bp != dp; bp = bp->b_forw) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp ||
		    bp->b_flags&B_INVAL)
			continue;
		s = splbio();
		if (bp->b_flags&B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(bp);
		if (bp->b_bcount != size) {
			printf("getblk: stray size");
			bp->b_flags |= B_INVAL;
			bwrite(bp);
			goto loop;
		}
		bp->b_flags |= B_CACHE;
		return (bp);
	}
	bp = getnewbuf();
	bfree(bp);
	bremhash(bp);
	bgetvp(vp, bp);
	bp->b_lblkno = blkno;
	bp->b_blkno = blkno;
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, dp);
	brealloc(bp, size);
	return (bp);
}

/*
 * get an empty block,
 * not assigned to any particular device
 */
struct buf *
geteblk(size)
	int size;
{
	register struct buf *bp, *flist;

	if (size > MAXBSIZE)
		panic("geteblk: size too big");
	bp = getnewbuf();
	bp->b_flags |= B_INVAL;
	bfree(bp);
	bremhash(bp);
	flist = &bfreelist[BQ_AGE];
	bp->b_error = 0;
	bp->b_resid = 0;
	binshash(bp, flist);
	brealloc(bp, size);
	return (bp);
}

/*
 * Allocate space associated with a buffer.
 */
brealloc(bp, size)
	register struct buf *bp;
	int size;
{
	daddr_t start, last;
	register struct buf *ep;
	struct buf *dp;
	int s;

	if (size == bp->b_bcount)
		return;
	allocbuf(bp, size);
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf()
{
	register struct buf *bp, *dp;
	register struct ucred *cred;
	int s;

loop:
	s = splbio();
	for (dp = &bfreelist[BQ_AGE]; dp > bfreelist; dp--)
		if (dp->av_forw != dp)
			break;
	if (dp == bfreelist) {		/* no free blocks */
		dp->b_flags |= B_WANTED;
		sleep((caddr_t)dp, PRIBIO+1);
		splx(s);
		goto loop;
	}
	splx(s);
	bp = dp->av_forw;
	notavail(bp);
	if (bp->b_flags & B_DELWRI) {
		(void) bawrite(bp);
		goto loop;
	}
	trace(TR_BRELSE,
	    pack(bp->b_vp->v_mount->m_fsid[0], bp->b_bufsize), bp->b_lblkno);
	if (bp->b_vp)
		brelvp(bp);
	if (bp->b_rcred != NOCRED) {
		cred = bp->b_rcred;
		bp->b_rcred = NOCRED;
		crfree(cred);
	}
	if (bp->b_wcred != NOCRED) {
		cred = bp->b_wcred;
		bp->b_wcred = NOCRED;
		crfree(cred);
	}
	bp->b_flags = B_BUSY;
	return (bp);
}

/*
 * Wait for I/O completion on the buffer; return errors
 * to the user.
 */
biowait(bp)
	register struct buf *bp;
{
	int s;

	s = splbio();
	while ((bp->b_flags & B_DONE) == 0)
		sleep((caddr_t)bp, PRIBIO);
	splx(s);
	/*
	 * Pick up the device's error number and pass it to the user;
	 * if there is an error but the number is 0 set a generalized code.
	 */
	if ((bp->b_flags & B_ERROR) == 0)
		return (0);
	if (bp->b_error)
		return (bp->b_error);
	return (EIO);
}

/*
 * Mark I/O complete on a buffer.
 * If someone should be called, e.g. the pageout
 * daemon, do so.  Otherwise, wake up anyone
 * waiting for it.
 */
biodone(bp)
	register struct buf *bp;
{

	if (bp->b_flags & B_DONE)
		panic("dup biodone");
	bp->b_flags |= B_DONE;
	if ((bp->b_flags & B_READ) == 0)
		bp->b_dirtyoff = bp->b_dirtyend = 0;
	if (bp->b_flags & B_CALL) {
		bp->b_flags &= ~B_CALL;
		(*bp->b_iodone)(bp);
		return;
	}
	if (bp->b_flags&B_ASYNC)
		brelse(bp);
	else {
		bp->b_flags &= ~B_WANTED;
		wakeup((caddr_t)bp);
	}
}

/*
 * Make sure all write-behind blocks associated
 * with the mount point are flushed out (from sync).
 */
mntflushbuf(mountp, flags)
	struct mount *mountp;
	int flags;
{
	register struct vnode *vp;
	struct vnode *nvp;

loop:
	for (vp = mountp->m_mounth; vp; vp = nvp) {
		nvp = vp->v_mountf;
		if (vget(vp))
			goto loop;
		vflushbuf(vp, flags);
		vput(vp);
	}
}

/*
 * Flush all dirty buffers associated with a vnode.
 */
vflushbuf(vp, flags)
	register struct vnode *vp;
	int flags;
{
	register struct buf *bp;
	struct buf *nbp;
	int s;

loop:
	s = splbio();
	for (bp = vp->v_blockh; bp; bp = nbp) {
		nbp = bp->b_blockf;
		if ((bp->b_flags & B_BUSY))
			continue;
		if ((bp->b_flags & B_DELWRI) == 0)
			continue;
		splx(s);
		notavail(bp);
		(void) bawrite(bp);
		goto loop;
	}
	splx(s);
	if ((flags & B_SYNC) == 0)
		return;
wloop:
	s = splbio();
	for (bp = vp->v_blockh; bp; bp = nbp) {
		nbp = bp->b_blockf;
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto wloop;
		}
		if ((bp->b_flags & B_DELWRI)) {
			splx(s);
			goto loop;
		}
	}
	splx(s);
}

/*
 * Invalidate in-core blocks belonging to a closed or unmounted filesystem.
 *
 * Go through the list of vnodes associated with the file system;
 * for each vnode invalidate any buffers that it holds.  Normally
 * this routine is preceded by a bflush call, so that on a quiescent
 * filesystem there will be no dirty buffers when we are done.
 * Mntinvalbuf returns the count of dirty buffers when it is finished.
 */
mntinvalbuf(mountp)
	struct mount *mountp;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int dirty = 0;

loop:
	for (vp = mountp->m_mounth; vp; vp = nvp) {
		nvp = vp->v_mountf;
		if (vget(vp))
			goto loop;
		dirty += vinvalbuf(vp, 1);
		vput(vp);
	}
	return (dirty);
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 */
vinvalbuf(vp, save)
	register struct vnode *vp;
	int save;
{
	register struct buf *bp;
	struct buf *nbp;
	int s, dirty = 0;

loop:
	for (bp = vp->v_blockh; bp; bp = nbp) {
		nbp = bp->b_blockf;
		s = splbio();
		if (bp->b_flags & B_BUSY) {
			bp->b_flags |= B_WANTED;
			sleep((caddr_t)bp, PRIBIO+1);
			splx(s);
			goto loop;
		}
		splx(s);
		notavail(bp);
		if (save) {
			if (bp->b_flags & B_DELWRI) {
				dirty++;
				(void) bwrite(bp);
				goto loop;
			}
		}
		bp->b_flags |= B_INVAL;
		brelse(bp);
	}
	if (vp->v_blockh != 0)
		panic("vinvalbuf: flush failed");
	return (dirty);
}

/*
 * Associate a buffer with a vnode.
 */
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{

	if (bp->b_vp)
		panic("bgetvp: not free");
	VHOLD(vp);
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	if (vp->v_blockh) {
		bp->b_blockf = vp->v_blockh;
		bp->b_blockb = &vp->v_blockh;
		vp->v_blockh->b_blockb = &bp->b_blockf;
		vp->v_blockh = bp;
	} else {
		vp->v_blockh = bp;
		bp->b_blockb = &vp->v_blockh;
		bp->b_blockf = NULL;
	}
}

/*
 * Disassociate a buffer from a vnode.
 */
brelvp(bp)
	register struct buf *bp;
{
	struct buf *bq;
	struct vnode *vp;

	if (bp->b_vp == (struct vnode *) 0)
		panic("brelvp: NULL");
	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_blockb) {
		if (bq = bp->b_blockf)
			bq->b_blockb = bp->b_blockb;
		*bp->b_blockb = bq;
		bp->b_blockf = NULL;
		bp->b_blockb = NULL;
	}
	vp = bp->b_vp;
	bp->b_vp = (struct vnode *) 0;
	HOLDRELE(vp);
}

/*
 * Reassign a buffer from one vnode to another.
 * Used to assign file specific control information
 * (indirect blocks) to the vnode to which they belong.
 */
reassignbuf(bp, newvp)
	register struct buf *bp;
	register struct vnode *newvp;
{
	register struct buf *bq;

	/*
	 * Delete from old vnode list, if on one.
	 */
	if (bp->b_blockb) {
		if (bq = bp->b_blockf)
			bq->b_blockb = bp->b_blockb;
		*bp->b_blockb = bq;
	}
	/*
	 * Insert onto list for new vnode.
	 */
	if (newvp->v_blockh) {
		bp->b_blockf = newvp->v_blockh;
		bp->b_blockb = &newvp->v_blockh;
		newvp->v_blockh->b_blockb = &bp->b_blockf;
		newvp->v_blockh = bp;
	} else {
		newvp->v_blockh = bp;
		bp->b_blockb = &newvp->v_blockh;
		bp->b_blockf = NULL;
	}
}
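
/*
 * Illustrative sketch only: a file system doing a read-modify-write of
 * one logical block might use the routines above roughly as follows.
 * The vnode ``vp'', logical block ``lbn'', block size ``bsize'',
 * credentials ``cred'' and the source ``data''/``off''/``len'' are
 * hypothetical names supplied by the caller, not identifiers defined
 * in this file.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	if (error = bread(vp, lbn, bsize, cred, &bp)) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	bcopy(data, bp->b_un.b_addr + off, len);
 *	bdwrite(bp);		(delayed write; buffer is released)
 *
 * A synchronous update would call bwrite(bp) instead, which waits for
 * the i/o via biowait and releases the buffer itself.
 */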