/*	$NetBSD: vfs_bio.c,v 1.46 1996/06/18 20:50:23 mycroft Exp $	*/

/*-
 * Copyright (c) 1994 Christopher G. Demetriou
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_bio.c	8.6 (Berkeley) 1/11/94
 */

/*
 * Some references:
 *	Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
 *	Leffler, et al.: The Design and Implementation of the 4.3BSD
 *		UNIX Operating System (Addison Wesley, 1989)
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>

#include <vm/vm.h>

/* Macros to clear/set/test flags. */
#define	SET(t, f)	(t) |= (f)
#define	CLR(t, f)	(t) &= ~(f)
#define	ISSET(t, f)	((t) & (f))

/*
 * Definitions for the buffer hash lists.
 */
#define	BUFHASH(dvp, lbn)	\
	(&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
u_long	bufhash;

/*
 * Insq/Remq for the buffer hash lists.
 */
#define	binshash(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_hash)
#define	bremhash(bp)		LIST_REMOVE(bp, b_hash)

/*
 * Definitions for the buffer free lists.
 */
#define	BQUEUES		4		/* number of free buffer queues */

#define	BQ_LOCKED	0		/* super-blocks &c */
#define	BQ_LRU		1		/* lru, useful buffers */
#define	BQ_AGE		2		/* rubbish */
#define	BQ_EMPTY	3		/* buffer headers with no memory */

TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
int needbuffer;

/*
 * Insq/Remq for the buffer free lists.
 */
#define	binsheadfree(bp, dp)	TAILQ_INSERT_HEAD(dp, bp, b_freelist)
#define	binstailfree(bp, dp)	TAILQ_INSERT_TAIL(dp, bp, b_freelist)

static __inline struct buf *bio_doread __P((struct vnode *, daddr_t, int,
	    struct ucred *, int));
int count_lock_queue __P((void));

void
bremfree(bp)
	struct buf *bp;
{
	struct bqueues *dp = NULL;

	/*
	 * We only calculate the head of the freelist when removing
	 * the last element of the list as that is the only time that
	 * it is needed (e.g. to reset the tail pointer).
	 *
	 * NB: This makes an assumption about how tailq's are implemented.
	 */
	if (bp->b_freelist.tqe_next == NULL) {
		for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
			if (dp->tqh_last == &bp->b_freelist.tqe_next)
				break;
		if (dp == &bufqueues[BQUEUES])
			panic("bremfree: lost tail");
	}
	TAILQ_REMOVE(dp, bp, b_freelist);
}

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
	register struct buf *bp;
	struct bqueues *dp;
	register int i;
	int base, residual;

	for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
		TAILQ_INIT(dp);
	bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
	base = bufpages / nbuf;
	residual = bufpages % nbuf;
	for (i = 0; i < nbuf; i++) {
		bp = &buf[i];
		bzero((char *)bp, sizeof *bp);
		bp->b_dev = NODEV;
		bp->b_rcred = NOCRED;
		bp->b_wcred = NOCRED;
		bp->b_vnbufs.le_next = NOLIST;
		bp->b_data = buffers + i * MAXBSIZE;
		if (i < residual)
			bp->b_bufsize = (base + 1) * CLBYTES;
		else
			bp->b_bufsize = base * CLBYTES;
		bp->b_flags = B_INVAL;
		dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
		binsheadfree(bp, dp);
		binshash(bp, &invalhash);
	}
}

static __inline struct buf *
bio_doread(vp, blkno, size, cred, async)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	int async;
{
	register struct buf *bp;

	bp = getblk(vp, blkno, size, 0, 0);

	/*
	 * If the buffer does not have valid data, start a read.
	 * Note that if the buffer is B_INVAL, getblk() won't return it.
	 * Therefore, it's valid if its I/O has completed or been delayed.
	 */
	if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
		/* Start I/O for the buffer (keeping credentials). */
		SET(bp->b_flags, B_READ | async);
		if (cred != NOCRED && bp->b_rcred == NOCRED) {
			crhold(cred);
			bp->b_rcred = cred;
		}
		VOP_STRATEGY(bp);

		/* Pay for the read. */
		curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
	} else if (async) {
		brelse(bp);
	}

	return (bp);
}

/*
 * Read a disk block.
 * This algorithm is described in Bach (p.54).
 */
int
bread(vp, blkno, size, cred, bpp)
	struct vnode *vp;
	daddr_t blkno;
	int size;
	struct ucred *cred;
	struct buf **bpp;
{
	register struct buf *bp;

	/* Get buffer for block. */
	bp = *bpp = bio_doread(vp, blkno, size, cred, 0);

	/* Wait for the read to complete, and return result. */
	return (biowait(bp));
}
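/*
 * Illustrative sketch of a typical bread() caller: the buffer is handed
 * back through bpp even when the read fails, so the caller is expected to
 * brelse() it in both cases.  The identifiers "devvp", "lbn" and "bsize"
 * below are hypothetical.
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(devvp, lbn, bsize, NOCRED, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *	...use bp->b_data...
 *	brelse(bp);
 */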
/*
 * Read-ahead multiple disk blocks.  The first is sync, the rest async.
 * Trivial modification to the breada algorithm presented in Bach (p.55).
 */
int
breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablks[]; int rasizes[];
	int nrablks;
	struct ucred *cred;
	struct buf **bpp;
{
	register struct buf *bp;
	int i;

	bp = *bpp = bio_doread(vp, blkno, size, cred, 0);

	/*
	 * For each of the read-ahead blocks, start a read, if necessary.
	 */
	for (i = 0; i < nrablks; i++) {
		/* If it's in the cache, just go on to next one. */
		if (incore(vp, rablks[i]))
			continue;

		/* Get a buffer for the read-ahead block */
		(void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC);
	}

	/* Otherwise, we had to start a read for it; wait until it's valid. */
	return (biowait(bp));
}

/*
 * Read with single-block read-ahead.  Defined in Bach (p.55), but
 * implemented as a call to breadn().
 * XXX for compatibility with old file systems.
 */
int
breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
	struct vnode *vp;
	daddr_t blkno; int size;
	daddr_t rablkno; int rabsize;
	struct ucred *cred;
	struct buf **bpp;
{

	return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
}

/*
 * Block write.  Described in Bach (p.56).
 */
int
bwrite(bp)
	struct buf *bp;
{
	int rv, sync, wasdelayed, s;

	/*
	 * Remember buffer type, to switch on it later.  If the write was
	 * synchronous, but the file system was mounted with MNT_ASYNC,
	 * convert it to a delayed write.
	 * XXX note that this relies on delayed tape writes being converted
	 * to async, not sync writes (which is safe, but ugly).
	 */
	sync = !ISSET(bp->b_flags, B_ASYNC);
	if (sync && bp->b_vp && bp->b_vp->v_mount &&
	    ISSET(bp->b_vp->v_mount->mnt_flag, MNT_ASYNC)) {
		bdwrite(bp);
		return (0);
	}

	wasdelayed = ISSET(bp->b_flags, B_DELWRI);
	CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));

	s = splbio();

	/*
	 * Pay for the I/O operation and make sure the buf is on the correct
	 * vnode queue.
	 */
	if (wasdelayed)
		reassignbuf(bp, bp->b_vp);
	else
		curproc->p_stats->p_ru.ru_oublock++;

	/* Initiate disk write.  Make sure the appropriate party is charged. */
	bp->b_vp->v_numoutput++;
	splx(s);

	SET(bp->b_flags, B_WRITEINPROG);
	VOP_STRATEGY(bp);

	if (sync) {
		/* If I/O was synchronous, wait for it to complete. */
		rv = biowait(bp);

		/* Release the buffer. */
		brelse(bp);

		return (rv);
	} else {
		return (0);
	}
}

int
vn_bwrite(v)
	void *v;
{
	struct vop_bwrite_args *ap = v;

	return (bwrite(ap->a_bp));
}

/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 *
 * Described in Leffler, et al. (pp. 208-213).
 */
void
bdwrite(bp)
	struct buf *bp;
{
	int s;

	/* If this is a tape block, write the block now. */
	if (bdevsw[major(bp->b_dev)].d_type == D_TAPE) {
		bawrite(bp);
		return;
	}

	/*
	 * If the block hasn't been seen before:
	 *	(1) Mark it as having been seen,
	 *	(2) Charge for the write,
	 *	(3) Make sure it's on its vnode's correct block list.
	 */
	if (!ISSET(bp->b_flags, B_DELWRI)) {
		SET(bp->b_flags, B_DELWRI);
		curproc->p_stats->p_ru.ru_oublock++;
		s = splbio();
		reassignbuf(bp, bp->b_vp);
		splx(s);
	}

	/* Otherwise, the "write" is done, so mark and release the buffer. */
	CLR(bp->b_flags, B_NEEDCOMMIT);
	SET(bp->b_flags, B_DONE);
	brelse(bp);
}
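/*
 * Illustrative sketch of how a caller chooses between the write routines
 * above: a buffer obtained earlier (e.g. with bread()) that has been
 * partially updated and is likely to be modified again soon is marked for
 * delayed write with bdwrite(), while a caller that must know the outcome
 * uses the synchronous bwrite().  The identifiers "ioflag", "data", "off"
 * and "len" are hypothetical.
 *
 *	bcopy(data, (char *)bp->b_data + off, len);
 *	if (ioflag & IO_SYNC)
 *		error = bwrite(bp);
 *	else
 *		bdwrite(bp);
 */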
/*
 * Asynchronous block write; just an asynchronous bwrite().
 */
void
bawrite(bp)
	struct buf *bp;
{

	SET(bp->b_flags, B_ASYNC);
	VOP_BWRITE(bp);
}

/*
 * Release a buffer onto the free lists.
 * Described in Bach (p. 46).
 */
void
brelse(bp)
	struct buf *bp;
{
	struct bqueues *bufq;
	int s;

	/* Wake up any processes waiting for any buffer to become free. */
	if (needbuffer) {
		needbuffer = 0;
		wakeup(&needbuffer);
	}

	/* Wake up any processes waiting for _this_ buffer to become free. */
	if (ISSET(bp->b_flags, B_WANTED)) {
		CLR(bp->b_flags, B_WANTED);
		wakeup(bp);
	}

	/* Block disk interrupts. */
	s = splbio();

	/*
	 * Determine which queue the buffer should be on, then put it there.
	 */

	/* If it's locked, don't report an error; try again later. */
	if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
		CLR(bp->b_flags, B_ERROR);

	/* If it's not cacheable, or an error, mark it invalid. */
	if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
		SET(bp->b_flags, B_INVAL);

	if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
		/*
		 * If it's invalid or empty, dissociate it from its vnode
		 * and put it on the head of the appropriate queue.
		 */
		if (bp->b_vp)
			brelvp(bp);
		CLR(bp->b_flags, B_DELWRI);
		if (bp->b_bufsize <= 0)
			/* no data */
			bufq = &bufqueues[BQ_EMPTY];
		else
			/* invalid data */
			bufq = &bufqueues[BQ_AGE];
		binsheadfree(bp, bufq);
	} else {
		/*
		 * It has valid data.  Put it on the end of the appropriate
		 * queue, so that it'll stick around for as long as possible.
		 */
		if (ISSET(bp->b_flags, B_LOCKED))
			/* locked in core */
			bufq = &bufqueues[BQ_LOCKED];
		else if (ISSET(bp->b_flags, B_AGE))
			/* stale but valid data */
			bufq = &bufqueues[BQ_AGE];
		else
			/* valid data */
			bufq = &bufqueues[BQ_LRU];
		binstailfree(bp, bufq);
	}

	/* Unlock the buffer. */
	CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE));

	/* Allow disk interrupts. */
	splx(s);
}
/*
 * Determine if a block is in the cache.
 * Just look at what would be its hash chain.  If it's there, return
 * a pointer to it, unless it's marked invalid.  If it's marked invalid,
 * we normally don't return the buffer, unless the caller explicitly
 * wants us to.
 */
struct buf *
incore(vp, blkno)
	struct vnode *vp;
	daddr_t blkno;
{
	struct buf *bp;

	bp = BUFHASH(vp, blkno)->lh_first;

	/* Search hash chain */
	for (; bp != NULL; bp = bp->b_hash.le_next) {
		if (bp->b_lblkno == blkno && bp->b_vp == vp &&
		    !ISSET(bp->b_flags, B_INVAL))
			return (bp);
	}

	return (0);
}

/*
 * Get a block of requested size that is associated with
 * a given vnode and block offset.  If it is found in the
 * block cache, mark it as having been found, make it busy
 * and return it.  Otherwise, return an empty block of the
 * correct size.  It is up to the caller to ensure that the
 * cached blocks are of the correct size.
 */
struct buf *
getblk(vp, blkno, size, slpflag, slptimeo)
	register struct vnode *vp;
	daddr_t blkno;
	int size, slpflag, slptimeo;
{
	struct bufhashhdr *bh;
	struct buf *bp;
	int s, err;

	/*
	 * XXX
	 * The following is an inlined version of 'incore()', but with
	 * the 'invalid' test moved to after the 'busy' test.  It's
	 * necessary because there are some cases in which the NFS
	 * code sets B_INVAL prior to writing data to the server, but
	 * in which the buffers actually contain valid data.  In this
	 * case, we can't allow the system to allocate a new buffer for
	 * the block until the write is finished.
	 */
	bh = BUFHASH(vp, blkno);
start:
	bp = bh->lh_first;
	for (; bp != NULL; bp = bp->b_hash.le_next) {
		if (bp->b_lblkno != blkno || bp->b_vp != vp)
			continue;

		s = splbio();
		if (ISSET(bp->b_flags, B_BUSY)) {
			SET(bp->b_flags, B_WANTED);
			err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
			    slptimeo);
			splx(s);
			if (err)
				return (NULL);
			goto start;
		}

		if (!ISSET(bp->b_flags, B_INVAL)) {
			SET(bp->b_flags, (B_BUSY | B_CACHE));
			bremfree(bp);
			splx(s);
			break;
		}
		splx(s);
	}

	if (bp == NULL) {
		if ((bp = getnewbuf(slpflag, slptimeo)) == NULL)
			goto start;
		binshash(bp, bh);
		bp->b_blkno = bp->b_lblkno = blkno;
		s = splbio();
		bgetvp(vp, bp);
		splx(s);
	}
	allocbuf(bp, size);
	return (bp);
}
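/*
 * Illustrative sketch of a getblk() caller: when an entire block will be
 * overwritten there is no need to read it first, so the caller obtains the
 * buffer directly, fills it, and schedules the write.  The identifiers
 * "lbn", "bsize" and "data" are hypothetical.
 *
 *	bp = getblk(vp, lbn, bsize, 0, 0);
 *	bcopy(data, bp->b_data, bsize);
 *	bawrite(bp);
 */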
/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(size)
	int size;
{
	struct buf *bp;

	while ((bp = getnewbuf(0, 0)) == 0)
		;
	SET(bp->b_flags, B_INVAL);
	binshash(bp, &invalhash);
	allocbuf(bp, size);

	return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 *
 * If the buffer shrinks, data is lost, so it's up to the
 * caller to have written it out *first*; this routine will not
 * start a write.  If the buffer grows, it's the caller's
 * responsibility to fill out the buffer's additional contents.
 */
void
allocbuf(bp, size)
	struct buf *bp;
	int size;
{
	struct buf *nbp;
	vm_size_t desired_size;
	int s;

	desired_size = roundup(size, CLBYTES);
	if (desired_size > MAXBSIZE)
		panic("allocbuf: buffer larger than MAXBSIZE requested");

	if (bp->b_bufsize == desired_size)
		goto out;

	/*
	 * If the buffer is smaller than the desired size, we need to snarf
	 * it from other buffers.  Get buffers (via getnewbuf()), and
	 * steal their pages.
	 */
	while (bp->b_bufsize < desired_size) {
		int amt;

		/* find a buffer */
		while ((nbp = getnewbuf(0, 0)) == NULL)
			;
		SET(nbp->b_flags, B_INVAL);
		binshash(nbp, &invalhash);

		/* and steal its pages, up to the amount we need */
		amt = min(nbp->b_bufsize, (desired_size - bp->b_bufsize));
		pagemove((nbp->b_data + nbp->b_bufsize - amt),
		    bp->b_data + bp->b_bufsize, amt);
		bp->b_bufsize += amt;
		nbp->b_bufsize -= amt;

		/* reduce transfer count if we stole some data */
		if (nbp->b_bcount > nbp->b_bufsize)
			nbp->b_bcount = nbp->b_bufsize;

#ifdef DIAGNOSTIC
		if (nbp->b_bufsize < 0)
			panic("allocbuf: negative bufsize");
#endif

		brelse(nbp);
	}

	/*
	 * If we want a buffer smaller than the current size,
	 * shrink this buffer.  Grab a buf head from the EMPTY queue,
	 * move a page onto it, and put it on the front of the AGE queue.
	 * If there are no free buffer headers, leave the buffer alone.
	 */
	if (bp->b_bufsize > desired_size) {
		s = splbio();
		if ((nbp = bufqueues[BQ_EMPTY].tqh_first) == NULL) {
			/* No free buffer head */
			splx(s);
			goto out;
		}
		bremfree(nbp);
		SET(nbp->b_flags, B_BUSY);
		splx(s);

		/* move the page to it and note this change */
		pagemove(bp->b_data + desired_size,
		    nbp->b_data, bp->b_bufsize - desired_size);
		nbp->b_bufsize = bp->b_bufsize - desired_size;
		bp->b_bufsize = desired_size;
		nbp->b_bcount = 0;
		SET(nbp->b_flags, B_INVAL);

		/* release the newly-filled buffer and leave */
		brelse(nbp);
	}

out:
	bp->b_bcount = size;
}

/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf(slpflag, slptimeo)
	int slpflag, slptimeo;
{
	register struct buf *bp;
	int s;

start:
	s = splbio();
	if ((bp = bufqueues[BQ_AGE].tqh_first) != NULL ||
	    (bp = bufqueues[BQ_LRU].tqh_first) != NULL) {
		bremfree(bp);
	} else {
		/* wait for a free buffer of any kind */
		needbuffer = 1;
		tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
		splx(s);
		return (0);
	}

	/* Buffer is no longer on free lists. */
	SET(bp->b_flags, B_BUSY);

	/* If buffer was a delayed write, start it, and go back to the top. */
	if (ISSET(bp->b_flags, B_DELWRI)) {
		splx(s);
		bawrite(bp);
		goto start;
	}

	/* disassociate us from our vnode, if we had one... */
	if (bp->b_vp)
		brelvp(bp);
	splx(s);

	/* clear out various other fields */
	bp->b_flags = B_BUSY;
	bp->b_dev = NODEV;
	bp->b_blkno = bp->b_lblkno = 0;
	bp->b_iodone = 0;
	bp->b_error = 0;
	bp->b_resid = 0;
	bp->b_bcount = 0;
	bp->b_dirtyoff = bp->b_dirtyend = 0;
	bp->b_validoff = bp->b_validend = 0;

	/* nuke any credentials we were holding */
	if (bp->b_rcred != NOCRED) {
		crfree(bp->b_rcred);
		bp->b_rcred = NOCRED;
	}
	if (bp->b_wcred != NOCRED) {
		crfree(bp->b_wcred);
		bp->b_wcred = NOCRED;
	}

	bremhash(bp);
	return (bp);
}
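/*
 * Illustrative sketch for the two completion routines below: a synchronous
 * caller starts the transfer and sleeps in biowait(), while a caller that
 * cannot sleep (e.g. the pageout daemon) asks biodone() to invoke a
 * callback at completion instead.  The handler name "mysub_iodone" is
 * hypothetical.
 *
 *	bp->b_flags |= B_CALL | B_ASYNC;
 *	bp->b_iodone = mysub_iodone;
 *	VOP_STRATEGY(bp);
 */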
/*
 * Wait for operations on the buffer to complete.
 * When they do, extract and return the I/O's error value.
 */
int
biowait(bp)
	struct buf *bp;
{
	int s;

	s = splbio();
	while (!ISSET(bp->b_flags, B_DONE))
		tsleep(bp, PRIBIO + 1, "biowait", 0);
	splx(s);

	/* check for interruption of I/O (e.g. via NFS), then errors. */
	if (ISSET(bp->b_flags, B_EINTR)) {
		CLR(bp->b_flags, B_EINTR);
		return (EINTR);
	} else if (ISSET(bp->b_flags, B_ERROR))
		return (bp->b_error ? bp->b_error : EIO);
	else
		return (0);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. by the pageout
 * daemon, do so.  Otherwise, awaken waiting processes.
 *
 * [ Leffler, et al., say on p.247:
 *	"This routine wakes up the blocked process, frees the buffer
 *	for an asynchronous write, or, for a request by the pagedaemon
 *	process, invokes a procedure specified in the buffer structure" ]
 *
 * In real life, the pagedaemon (or other system processes) wants
 * to do async stuff too, and doesn't want the buffer brelse()'d.
 * (for swap pager, that puts swap buffers on the free lists (!!!),
 * for the vn device, that puts malloc'd buffers on the free lists!)
 */
void
biodone(bp)
	struct buf *bp;
{
	if (ISSET(bp->b_flags, B_DONE))
		panic("biodone already");
	SET(bp->b_flags, B_DONE);		/* note that it's done */

	if (!ISSET(bp->b_flags, B_READ))	/* wake up reader */
		vwakeup(bp);

	if (ISSET(bp->b_flags, B_CALL)) {	/* if necessary, call out */
		CLR(bp->b_flags, B_CALL);	/* but note callout done */
		(*bp->b_iodone)(bp);
	} else if (ISSET(bp->b_flags, B_ASYNC))	/* if async, release it */
		brelse(bp);
	else {					/* or just wakeup the buffer */
		CLR(bp->b_flags, B_WANTED);
		wakeup(bp);
	}
}

/*
 * Return a count of buffers on the "locked" queue.
 */
int
count_lock_queue()
{
	register struct buf *bp;
	register int n = 0;

	for (bp = bufqueues[BQ_LOCKED].tqh_first; bp;
	    bp = bp->b_freelist.tqe_next)
		n++;
	return (n);
}

#ifdef DEBUG
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * in vfs_syscalls.c using sysctl.
 */
void
vfs_bufstats()
{
	int s, i, j, count;
	register struct buf *bp;
	register struct bqueues *dp;
	int counts[MAXBSIZE/CLBYTES+1];
	static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" };

	for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
		count = 0;
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			counts[j] = 0;
		s = splbio();
		for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
			counts[bp->b_bufsize/CLBYTES]++;
			count++;
		}
		splx(s);
		printf("%s: total-%d", bname[i], count);
		for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
			if (counts[j] != 0)
				printf(", %d-%d", j * CLBYTES, counts[j]);
		printf("\n");
	}
}
#endif /* DEBUG */