/*	$NetBSD: vfs_bio.c,v 1.86 2003/01/18 10:06:37 thorpej Exp $	*/

/*-
 * Copyright (c) 1994 Christopher G. Demetriou
 * Copyright (c) 1982, 1986, 1989, 1993
 *      The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the University of
 *      California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *      @(#)vfs_bio.c   8.6 (Berkeley) 1/11/94
 */

/*
 * Some references:
 *      Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
 *      Leffler, et al.: The Design and Implementation of the 4.3BSD
 *              UNIX Operating System (Addison Wesley, 1989)
 */

#include "opt_softdep.h"

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.86 2003/01/18 10:06:37 thorpej Exp $");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>

#include <uvm/uvm.h>

#include <miscfs/specfs/specdev.h>

/* Macros to clear/set/test flags. */
#define SET(t, f)       (t) |= (f)
#define CLR(t, f)       (t) &= ~(f)
#define ISSET(t, f)     ((t) & (f))

/*
 * Definitions for the buffer hash lists.
 */
#define BUFHASH(dvp, lbn)       \
        (&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash])
LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
u_long  bufhash;
#ifndef SOFTDEP
struct bio_ops bioops;  /* I/O operation notification */
#endif

/*
 * Insq/Remq for the buffer hash lists.
 */
#define binshash(bp, dp)        LIST_INSERT_HEAD(dp, bp, b_hash)
#define bremhash(bp)            LIST_REMOVE(bp, b_hash)

/*
 * Definitions for the buffer free lists.
 */
#define BQUEUES         4               /* number of free buffer queues */

#define BQ_LOCKED       0               /* super-blocks &c */
#define BQ_LRU          1               /* lru, useful buffers */
#define BQ_AGE          2               /* rubbish */
#define BQ_EMPTY        3               /* buffer headers with no memory */

TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
int needbuffer;

/*
 * Buffer pool for I/O buffers.
 */
struct pool bufpool;

/*
 * Insq/Remq for the buffer free lists.
 */
#define binsheadfree(bp, dp)    TAILQ_INSERT_HEAD(dp, bp, b_freelist)
#define binstailfree(bp, dp)    TAILQ_INSERT_TAIL(dp, bp, b_freelist)

static __inline struct buf *bio_doread __P((struct vnode *, daddr_t, int,
                                            struct ucred *, int));
int count_lock_queue __P((void));

/*
 * Remove a buffer from the free list it is currently on.
 */
void
bremfree(bp)
        struct buf *bp;
{
        int s = splbio();

        struct bqueues *dp = NULL;

        /*
         * We only calculate the head of the freelist when removing
         * the last element of the list as that is the only time that
         * it is needed (e.g. to reset the tail pointer).
         *
         * NB: This makes an assumption about how tailq's are implemented.
         */
        if (TAILQ_NEXT(bp, b_freelist) == NULL) {
                for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
                        if (dp->tqh_last == &bp->b_freelist.tqe_next)
                                break;
                if (dp == &bufqueues[BQUEUES])
                        panic("bremfree: lost tail");
        }
        TAILQ_REMOVE(dp, bp, b_freelist);
        splx(s);
}

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit()
{
        struct buf *bp;
        struct bqueues *dp;
        u_int i, base, residual;

        /*
         * Initialize the buffer pool.  This pool is used for buffers
         * which are strictly I/O control blocks, not buffer cache
         * buffers.
         */
        pool_init(&bufpool, sizeof(struct buf), 0, 0, 0, "bufpl", NULL);

        for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
                TAILQ_INIT(dp);
        bufhashtbl = hashinit(nbuf, HASH_LIST, M_CACHE, M_WAITOK, &bufhash);
        base = bufpages / nbuf;
        residual = bufpages % nbuf;
        for (i = 0; i < nbuf; i++) {
                bp = &buf[i];
                memset((char *)bp, 0, sizeof(*bp));
                bp->b_dev = NODEV;
                bp->b_vnbufs.le_next = NOLIST;
                LIST_INIT(&bp->b_dep);
                bp->b_data = buffers + i * MAXBSIZE;
                if (i < residual)
                        bp->b_bufsize = (base + 1) * PAGE_SIZE;
                else
                        bp->b_bufsize = base * PAGE_SIZE;
                bp->b_flags = B_INVAL;
                dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
                binsheadfree(bp, dp);
                binshash(bp, &invalhash);
        }
}

static __inline struct buf *
bio_doread(vp, blkno, size, cred, async)
        struct vnode *vp;
        daddr_t blkno;
        int size;
        struct ucred *cred;
        int async;
{
        struct buf *bp;
        struct lwp *l = (curlwp != NULL ? curlwp : &lwp0);      /* XXX */
        struct proc *p = l->l_proc;

        bp = getblk(vp, blkno, size, 0, 0);

#ifdef DIAGNOSTIC
        if (bp == NULL) {
                panic("bio_doread: no such buf");
        }
#endif

        /*
         * If the buffer does not have valid data, start a read.
         * Note that if the buffer is B_INVAL, getblk() won't return it.
         * Therefore, it's valid if its I/O has completed or been delayed.
         */
        if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
                /* Start I/O for the buffer. */
                SET(bp->b_flags, B_READ | async);
                VOP_STRATEGY(bp);

                /* Pay for the read. */
                p->p_stats->p_ru.ru_inblock++;
        } else if (async) {
                brelse(bp);
        }

        return (bp);
}

/*
 * Read a disk block.
 * This algorithm is described in Bach (p.54).
 */
int
bread(vp, blkno, size, cred, bpp)
        struct vnode *vp;
        daddr_t blkno;
        int size;
        struct ucred *cred;
        struct buf **bpp;
{
        struct buf *bp;

        /* Get buffer for block. */
        bp = *bpp = bio_doread(vp, blkno, size, cred, 0);

        /* Wait for the read to complete, and return result. */
        return (biowait(bp));
}
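
/*
 * Example (sketch, not from the original source): a typical caller,
 * such as a file system reading a metadata block, uses bread() like
 * this.  Note that the buffer must be released with brelse() even when
 * the read fails.  "devvp", "lbn" and "bsize" are hypothetical names:
 *
 *      struct buf *bp;
 *      int error;
 *
 *      error = bread(devvp, lbn, bsize, NOCRED, &bp);
 *      if (error) {
 *              brelse(bp);
 *              return (error);
 *      }
 *      ... examine bp->b_data ...
 *      brelse(bp);
 */
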
/*
 * Read-ahead multiple disk blocks.  The first is sync, the rest async.
 * Trivial modification to the breada algorithm presented in Bach (p.55).
 */
int
breadn(vp, blkno, size, rablks, rasizes, nrablks, cred, bpp)
        struct vnode *vp;
        daddr_t blkno; int size;
        daddr_t rablks[]; int rasizes[];
        int nrablks;
        struct ucred *cred;
        struct buf **bpp;
{
        struct buf *bp;
        int i;

        bp = *bpp = bio_doread(vp, blkno, size, cred, 0);

        /*
         * For each of the read-ahead blocks, start a read, if necessary.
         */
        for (i = 0; i < nrablks; i++) {
                /* If it's in the cache, just go on to next one. */
                if (incore(vp, rablks[i]))
                        continue;

                /* Get a buffer for the read-ahead block. */
                (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC);
        }

        /* Otherwise, we had to start a read for it; wait until it's valid. */
        return (biowait(bp));
}

/*
 * Read with single-block read-ahead.  Defined in Bach (p.55), but
 * implemented as a call to breadn().
 * XXX for compatibility with old file systems.
 */
int
breada(vp, blkno, size, rablkno, rabsize, cred, bpp)
        struct vnode *vp;
        daddr_t blkno; int size;
        daddr_t rablkno; int rabsize;
        struct ucred *cred;
        struct buf **bpp;
{

        return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, cred, bpp));
}

/*
 * Block write.  Described in Bach (p.56).
 */
int
bwrite(bp)
        struct buf *bp;
{
        int rv, sync, wasdelayed, s;
        struct lwp *l = (curlwp != NULL ? curlwp : &lwp0);      /* XXX */
        struct proc *p = l->l_proc;
        struct vnode *vp;
        struct mount *mp;

        vp = bp->b_vp;
        if (vp != NULL) {
                if (vp->v_type == VBLK)
                        mp = vp->v_specmountpoint;
                else
                        mp = vp->v_mount;
        } else {
                mp = NULL;
        }

        /*
         * Remember buffer type, to switch on it later.  If the write was
         * synchronous, but the file system was mounted with MNT_ASYNC,
         * convert it to a delayed write.
         * XXX note that this relies on delayed tape writes being converted
         * to async, not sync writes (which is safe, but ugly).
         */
        sync = !ISSET(bp->b_flags, B_ASYNC);
        if (sync && mp != NULL && ISSET(mp->mnt_flag, MNT_ASYNC)) {
                bdwrite(bp);
                return (0);
        }

        /*
         * Collect statistics on synchronous and asynchronous writes.
         * Writes to block devices are charged to their associated
         * filesystem (if any).
         */
        if (mp != NULL) {
                if (sync)
                        mp->mnt_stat.f_syncwrites++;
                else
                        mp->mnt_stat.f_asyncwrites++;
        }

        wasdelayed = ISSET(bp->b_flags, B_DELWRI);

        s = splbio();

        CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));

        /*
         * Pay for the I/O operation and make sure the buf is on the correct
         * vnode queue.
         */
        if (wasdelayed)
                reassignbuf(bp, bp->b_vp);
        else
                p->p_stats->p_ru.ru_oublock++;

        /* Initiate disk write.  Make sure the appropriate party is charged. */
        bp->b_vp->v_numoutput++;
        splx(s);

        VOP_STRATEGY(bp);

        if (sync) {
                /* If I/O was synchronous, wait for it to complete. */
                rv = biowait(bp);

                /* Release the buffer. */
                brelse(bp);

                return (rv);
        } else {
                return (0);
        }
}
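
/*
 * Summary note (not from the original comments): a file system normally
 * pushes a modified buffer out in one of three ways, and in each case
 * the buffer ends up back in the cache's hands:
 *
 *      bwrite(bp)      synchronous; starts the I/O, biowait()s for it,
 *                      then brelse()s the buffer and returns the error.
 *      bawrite(bp)     asynchronous; starts the I/O and lets biodone()
 *                      release the buffer when it completes.
 *      bdwrite(bp)     delayed; only marks the buffer B_DELWRI and
 *                      brelse()s it, so the actual write happens later
 *                      (e.g. when the buffer is reclaimed or synced).
 */
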
int
vn_bwrite(v)
        void *v;
{
        struct vop_bwrite_args *ap = v;

        return (bwrite(ap->a_bp));
}

/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 *
 * Described in Leffler, et al. (pp. 208-213).
 */
void
bdwrite(bp)
        struct buf *bp;
{
        struct lwp *l = (curlwp != NULL ? curlwp : &lwp0);      /* XXX */
        struct proc *p = l->l_proc;
        const struct bdevsw *bdev;
        int s;

        /* If this is a tape block, write the block now. */
        /* XXX NOTE: the memory filesystem usurps major device */
        /* XXX number 4095, which is a bad idea. */
        if (bp->b_dev != NODEV && major(bp->b_dev) != 4095) {
                bdev = bdevsw_lookup(bp->b_dev);
                if (bdev != NULL && bdev->d_type == D_TAPE) {
                        bawrite(bp);
                        return;
                }
        }

        /*
         * If the block hasn't been seen before:
         *      (1) Mark it as having been seen,
         *      (2) Charge for the write,
         *      (3) Make sure it's on its vnode's correct block list.
         */
        s = splbio();

        if (!ISSET(bp->b_flags, B_DELWRI)) {
                SET(bp->b_flags, B_DELWRI);
                p->p_stats->p_ru.ru_oublock++;
                reassignbuf(bp, bp->b_vp);
        }

        /* Otherwise, the "write" is done, so mark and release the buffer. */
        CLR(bp->b_flags, B_NEEDCOMMIT|B_DONE);
        splx(s);

        brelse(bp);
}
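
/*
 * Example (sketch, not from the original source): the classic
 * read-modify-write of a partially filled block is where a delayed
 * write pays off.  "vp", "lbn", "bsize", "off", "newdata" and "len"
 * are hypothetical names:
 *
 *      error = bread(vp, lbn, bsize, NOCRED, &bp);
 *      if (error) {
 *              brelse(bp);
 *              return (error);
 *      }
 *      memcpy((char *)bp->b_data + off, newdata, len);
 *      bdwrite(bp);
 *
 * A later write to the same block can find the buffer in the cache and
 * modify it again before any disk I/O is issued.
 */
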
/*
 * Asynchronous block write; just an asynchronous bwrite().
 */
void
bawrite(bp)
        struct buf *bp;
{

        SET(bp->b_flags, B_ASYNC);
        VOP_BWRITE(bp);
}

/*
 * Same as the first half of bdwrite: mark the buffer dirty, but do not
 * release it.
 */
void
bdirty(bp)
        struct buf *bp;
{
        struct lwp *l = (curlwp != NULL ? curlwp : &lwp0);      /* XXX */
        struct proc *p = l->l_proc;
        int s;

        s = splbio();

        CLR(bp->b_flags, B_AGE);

        if (!ISSET(bp->b_flags, B_DELWRI)) {
                SET(bp->b_flags, B_DELWRI);
                p->p_stats->p_ru.ru_oublock++;
                reassignbuf(bp, bp->b_vp);
        }

        splx(s);
}

/*
 * Release a buffer on to the free lists.
 * Described in Bach (p. 46).
 */
void
brelse(bp)
        struct buf *bp;
{
        struct bqueues *bufq;
        int s;

        KASSERT(ISSET(bp->b_flags, B_BUSY));

        /* Wake up any processes waiting for any buffer to become free. */
        if (needbuffer) {
                needbuffer = 0;
                wakeup(&needbuffer);
        }

        /* Block disk interrupts. */
        s = splbio();

        /* Wake up any processes waiting for _this_ buffer to become free. */
        if (ISSET(bp->b_flags, B_WANTED)) {
                CLR(bp->b_flags, B_WANTED|B_AGE);
                wakeup(bp);
        }

        /*
         * Determine which queue the buffer should be on, then put it there.
         */

        /* If it's locked, don't report an error; try again later. */
        if (ISSET(bp->b_flags, (B_LOCKED|B_ERROR)) == (B_LOCKED|B_ERROR))
                CLR(bp->b_flags, B_ERROR);

        /* If it's not cacheable, or an error, mark it invalid. */
        if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
                SET(bp->b_flags, B_INVAL);

        if (ISSET(bp->b_flags, B_VFLUSH)) {
                /*
                 * This is a delayed write buffer that was just flushed to
                 * disk.  It is still on the LRU queue.  If it's become
                 * invalid, then we need to move it to a different queue;
                 * otherwise leave it in its current position.
                 */
                CLR(bp->b_flags, B_VFLUSH);
                if (!ISSET(bp->b_flags, B_ERROR|B_INVAL|B_LOCKED|B_AGE))
                        goto already_queued;
                else
                        bremfree(bp);
        }

        if ((bp->b_bufsize <= 0) || ISSET(bp->b_flags, B_INVAL)) {
                /*
                 * If it's invalid or empty, dissociate it from its vnode
                 * and put on the head of the appropriate queue.
                 */
                if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
                        (*bioops.io_deallocate)(bp);
                CLR(bp->b_flags, B_DONE|B_DELWRI);
                if (bp->b_vp) {
                        reassignbuf(bp, bp->b_vp);
                        brelvp(bp);
                }
                if (bp->b_bufsize <= 0)
                        /* no data */
                        bufq = &bufqueues[BQ_EMPTY];
                else
                        /* invalid data */
                        bufq = &bufqueues[BQ_AGE];
                binsheadfree(bp, bufq);
        } else {
                /*
                 * It has valid data.  Put it on the end of the appropriate
                 * queue, so that it'll stick around for as long as possible.
                 * If buf is AGE, but has dependencies, must put it on last
                 * bufqueue to be scanned, ie LRU.  This protects against the
                 * livelock where BQ_AGE only has buffers with dependencies,
                 * and we thus never get to the dependent buffers in BQ_LRU.
                 */
                if (ISSET(bp->b_flags, B_LOCKED))
                        /* locked in core */
                        bufq = &bufqueues[BQ_LOCKED];
                else if (!ISSET(bp->b_flags, B_AGE))
                        /* valid data */
                        bufq = &bufqueues[BQ_LRU];
                else {
                        /* stale but valid data */
                        int has_deps;

                        if (LIST_FIRST(&bp->b_dep) != NULL &&
                            bioops.io_countdeps)
                                has_deps = (*bioops.io_countdeps)(bp, 0);
                        else
                                has_deps = 0;
                        bufq = has_deps ? &bufqueues[BQ_LRU] :
                            &bufqueues[BQ_AGE];
                }
                binstailfree(bp, bufq);
        }

already_queued:
        /* Unlock the buffer. */
        CLR(bp->b_flags, B_AGE|B_ASYNC|B_BUSY|B_NOCACHE);
        SET(bp->b_flags, B_CACHE);

        /* Allow disk interrupts. */
        splx(s);
}

/*
 * Determine if a block is in the cache.
 * Just look on what would be its hash chain.  If it's there, return
 * a pointer to it, unless it's marked invalid.  If it's marked invalid,
 * we normally don't return the buffer, unless the caller explicitly
 * wants us to.
 */
struct buf *
incore(vp, blkno)
        struct vnode *vp;
        daddr_t blkno;
{
        struct buf *bp;

        /* Search hash chain */
        LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) {
                if (bp->b_lblkno == blkno && bp->b_vp == vp &&
                    !ISSET(bp->b_flags, B_INVAL))
                        return (bp);
        }

        return (NULL);
}

/*
 * Get a block of requested size that is associated with
 * a given vnode and block offset.  If it is found in the
 * block cache, mark it as having been found, make it busy
 * and return it.  Otherwise, return an empty block of the
 * correct size.  It is up to the caller to ensure that the
 * cached blocks be of the correct size.
 */
struct buf *
getblk(vp, blkno, size, slpflag, slptimeo)
        struct vnode *vp;
        daddr_t blkno;
        int size, slpflag, slptimeo;
{
        struct buf *bp;
        int s, err;

start:
        bp = incore(vp, blkno);
        if (bp != NULL) {
                s = splbio();
                if (ISSET(bp->b_flags, B_BUSY)) {
                        if (curproc == uvm.pagedaemon_proc) {
                                splx(s);
                                return NULL;
                        }
                        SET(bp->b_flags, B_WANTED);
                        err = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
                            slptimeo);
                        splx(s);
                        if (err)
                                return (NULL);
                        goto start;
                }
#ifdef DIAGNOSTIC
                if (ISSET(bp->b_flags, B_DONE|B_DELWRI) &&
                    bp->b_bcount < size && vp->v_type != VBLK)
                        panic("getblk: block size invariant failed");
#endif
                SET(bp->b_flags, B_BUSY);
                bremfree(bp);
                splx(s);
        } else {
                if ((bp = getnewbuf(slpflag, slptimeo)) == NULL)
                        goto start;

                binshash(bp, BUFHASH(vp, blkno));
                bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno;
                s = splbio();
                bgetvp(vp, bp);
                splx(s);
        }
        allocbuf(bp, size);
        return (bp);
}
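
/*
 * Example (sketch, not from the original source): a caller that is
 * about to overwrite an entire block (e.g. one it has just allocated)
 * does not need the old contents and can skip the read that bread()
 * would do.  "vp", "lbn" and "bsize" are hypothetical names:
 *
 *      bp = getblk(vp, lbn, bsize, 0, 0);
 *      clrbuf(bp);
 *      ... fill in bp->b_data ...
 *      bdwrite(bp);
 */
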
/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(size)
        int size;
{
        struct buf *bp;

        while ((bp = getnewbuf(0, 0)) == 0)
                ;
        SET(bp->b_flags, B_INVAL);
        binshash(bp, &invalhash);
        allocbuf(bp, size);
        return (bp);
}

/*
 * Expand or contract the actual memory allocated to a buffer.
 *
 * If the buffer shrinks, data is lost, so it's up to the
 * caller to have written it out *first*; this routine will not
 * start a write.  If the buffer grows, it's the caller's
 * responsibility to fill out the buffer's additional contents.
 */
void
allocbuf(bp, size)
        struct buf *bp;
        int size;
{
        struct buf *nbp;
        vsize_t desired_size;
        int s;

        desired_size = round_page((vsize_t)size);
        if (desired_size > MAXBSIZE)
                panic("allocbuf: buffer larger than MAXBSIZE requested");

        if (bp->b_bufsize == desired_size)
                goto out;

        /*
         * If the buffer is smaller than the desired size, we need to snarf
         * it from other buffers.  Get buffers (via getnewbuf()), and
         * steal their pages.
         */
        while (bp->b_bufsize < desired_size) {
                int amt;

                /* find a buffer */
                while ((nbp = getnewbuf(0, 0)) == NULL)
                        ;

                SET(nbp->b_flags, B_INVAL);
                binshash(nbp, &invalhash);

                /* and steal its pages, up to the amount we need */
                amt = min(nbp->b_bufsize, (desired_size - bp->b_bufsize));
                pagemove((nbp->b_data + nbp->b_bufsize - amt),
                    bp->b_data + bp->b_bufsize, amt);
                bp->b_bufsize += amt;
                nbp->b_bufsize -= amt;

                /* reduce transfer count if we stole some data */
                if (nbp->b_bcount > nbp->b_bufsize)
                        nbp->b_bcount = nbp->b_bufsize;

#ifdef DIAGNOSTIC
                if (nbp->b_bufsize < 0)
                        panic("allocbuf: negative bufsize");
#endif

                brelse(nbp);
        }

        /*
         * If we want a buffer smaller than the current size,
         * shrink this buffer.  Grab a buf head from the EMPTY queue,
         * move a page onto it, and put it on front of the AGE queue.
         * If there are no free buffer headers, leave the buffer alone.
         */
        if (bp->b_bufsize > desired_size) {
                s = splbio();
                if ((nbp = TAILQ_FIRST(&bufqueues[BQ_EMPTY])) == NULL) {
                        /* No free buffer head */
                        splx(s);
                        goto out;
                }
                bremfree(nbp);
                SET(nbp->b_flags, B_BUSY);
                splx(s);

                /* move the page to it and note this change */
                pagemove(bp->b_data + desired_size,
                    nbp->b_data, bp->b_bufsize - desired_size);
                nbp->b_bufsize = bp->b_bufsize - desired_size;
                bp->b_bufsize = desired_size;
                nbp->b_bcount = 0;
                SET(nbp->b_flags, B_INVAL);

                /* release the newly-filled buffer and leave */
                brelse(nbp);
        }

out:
        bp->b_bcount = size;
}
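
/*
 * Worked example (sketch, not from the original source), assuming
 * PAGE_SIZE is 4096: if bp currently maps 4096 bytes and
 * allocbuf(bp, 12288) is called, the loop above steals two pages'
 * worth of memory from other buffers (invalidating the victims) until
 * b_bufsize reaches 12288.  Conversely, allocbuf(bp, 4096) on a
 * 12288-byte buffer moves the trailing 8192 bytes onto a spare header
 * from BQ_EMPTY, marks that header B_INVAL, and releases it onto the
 * AGE queue.  In both cases b_bcount ends up equal to the requested
 * size.
 */
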
/*
 * Find a buffer which is available for use.
 * Select something from a free list.
 * Preference is to AGE list, then LRU list.
 */
struct buf *
getnewbuf(slpflag, slptimeo)
        int slpflag, slptimeo;
{
        struct buf *bp;
        int s;

start:
        s = splbio();
        if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE])) != NULL ||
            (bp = TAILQ_FIRST(&bufqueues[BQ_LRU])) != NULL) {
                bremfree(bp);
        } else {
                /* wait for a free buffer of any kind */
                needbuffer = 1;
                tsleep(&needbuffer, slpflag|(PRIBIO+1), "getnewbuf", slptimeo);
                splx(s);
                return (NULL);
        }

        if (ISSET(bp->b_flags, B_VFLUSH)) {
                /*
                 * This is a delayed write buffer being flushed to disk.  Make
                 * sure it gets aged out of the queue when it's finished, and
                 * leave it off the LRU queue.
                 */
                CLR(bp->b_flags, B_VFLUSH);
                SET(bp->b_flags, B_AGE);
                splx(s);
                goto start;
        }

        /* Buffer is no longer on free lists. */
        SET(bp->b_flags, B_BUSY);

        /*
         * If buffer was a delayed write, start it and return NULL
         * (since we might sleep while starting the write).
         */
        if (ISSET(bp->b_flags, B_DELWRI)) {
                splx(s);
                /*
                 * This buffer has gone through the LRU, so make sure it gets
                 * reused ASAP.
                 */
                SET(bp->b_flags, B_AGE);
                bawrite(bp);
                return (NULL);
        }

        /* disassociate us from our vnode, if we had one... */
        if (bp->b_vp)
                brelvp(bp);
        splx(s);

        if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate)
                (*bioops.io_deallocate)(bp);

        /* clear out various other fields */
        bp->b_flags = B_BUSY;
        bp->b_dev = NODEV;
        bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = 0;
        bp->b_iodone = 0;
        bp->b_error = 0;
        bp->b_resid = 0;
        bp->b_bcount = 0;

        bremhash(bp);
        return (bp);
}

/*
 * Wait for operations on the buffer to complete.
 * When they do, extract and return the I/O's error value.
 */
int
biowait(bp)
        struct buf *bp;
{
        int s;

        s = splbio();
        while (!ISSET(bp->b_flags, B_DONE | B_DELWRI))
                tsleep(bp, PRIBIO + 1, "biowait", 0);
        splx(s);

        /* check for interruption of I/O (e.g. via NFS), then errors. */
        if (ISSET(bp->b_flags, B_EINTR)) {
                CLR(bp->b_flags, B_EINTR);
                return (EINTR);
        } else if (ISSET(bp->b_flags, B_ERROR))
                return (bp->b_error ? bp->b_error : EIO);
        else
                return (0);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. by the pageout
 * daemon, do so.  Otherwise, awaken waiting processes.
 *
 * [ Leffler, et al., says on p.247:
 *      "This routine wakes up the blocked process, frees the buffer
 *      for an asynchronous write, or, for a request by the pagedaemon
 *      process, invokes a procedure specified in the buffer structure" ]
 *
 * In real life, the pagedaemon (or other system processes) wants
 * to do async stuff too, and doesn't want the buffer brelse()'d.
 * (for swap pager, that puts swap buffers on the free lists (!!!),
 * for the vn device, that puts malloc'd buffers on the free lists!)
 */
void
biodone(bp)
        struct buf *bp;
{
        int s = splbio();

        if (ISSET(bp->b_flags, B_DONE))
                panic("biodone already");
        SET(bp->b_flags, B_DONE);               /* note that it's done */

        if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete)
                (*bioops.io_complete)(bp);

        if (!ISSET(bp->b_flags, B_READ))        /* wake up reader */
                vwakeup(bp);

        if (ISSET(bp->b_flags, B_CALL)) {       /* if necessary, call out */
                CLR(bp->b_flags, B_CALL);       /* but note callout done */
                (*bp->b_iodone)(bp);
        } else {
                if (ISSET(bp->b_flags, B_ASYNC))        /* if async, release */
                        brelse(bp);
                else {                          /* or just wakeup the buffer */
                        CLR(bp->b_flags, B_WANTED);
                        wakeup(bp);
                }
        }

        splx(s);
}
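
/*
 * Example (sketch, not from the original source): a disk driver
 * typically finishes a transfer from its interrupt handler by filling
 * in the residual count and any error before handing the buffer back
 * here.  "xfer_failed" is a hypothetical condition:
 *
 *      if (xfer_failed) {
 *              bp->b_error = EIO;
 *              bp->b_flags |= B_ERROR;
 *      }
 *      bp->b_resid = 0;
 *      biodone(bp);
 *
 * For an asynchronous write, biodone() then brelse()s the buffer; for a
 * synchronous request it wakes the thread sleeping in biowait().
 */
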
/*
 * Return a count of buffers on the "locked" queue.
 */
int
count_lock_queue()
{
        struct buf *bp;
        int n = 0;

        TAILQ_FOREACH(bp, &bufqueues[BQ_LOCKED], b_freelist)
                n++;
        return (n);
}

#ifdef DEBUG
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * in vfs_syscalls.c using sysctl.
 */
void
vfs_bufstats()
{
        int s, i, j, count;
        struct buf *bp;
        struct bqueues *dp;
        int counts[(MAXBSIZE / PAGE_SIZE) + 1];
        static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" };

        for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
                count = 0;
                for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
                        counts[j] = 0;
                s = splbio();
                TAILQ_FOREACH(bp, dp, b_freelist) {
                        counts[bp->b_bufsize/PAGE_SIZE]++;
                        count++;
                }
                splx(s);
                printf("%s: total-%d", bname[i], count);
                for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
                        if (counts[j] != 0)
                                printf(", %d-%d", j * PAGE_SIZE, counts[j]);
                printf("\n");
        }
}
#endif /* DEBUG */