1 /* $NetBSD: vfs_bio.c,v 1.208 2008/07/31 05:38:05 simonb Exp $ */ 2 3 /*- 4 * Copyright (c) 2007, 2008 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Andrew Doran. 9 * This code is derived from software contributed to The NetBSD Foundation 10 * by Wasabi Systems, Inc. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 25 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 /*- 35 * Copyright (c) 1982, 1986, 1989, 1993 36 * The Regents of the University of California. All rights reserved. 37 * (c) UNIX System Laboratories, Inc. 38 * All or some portions of this file are derived from material licensed 39 * to the University of California by American Telephone and Telegraph 40 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 41 * the permission of UNIX System Laboratories, Inc. 42 * 43 * Redistribution and use in source and binary forms, with or without 44 * modification, are permitted provided that the following conditions 45 * are met: 46 * 1. Redistributions of source code must retain the above copyright 47 * notice, this list of conditions and the following disclaimer. 48 * 2. Redistributions in binary form must reproduce the above copyright 49 * notice, this list of conditions and the following disclaimer in the 50 * documentation and/or other materials provided with the distribution. 51 * 3. Neither the name of the University nor the names of its contributors 52 * may be used to endorse or promote products derived from this software 53 * without specific prior written permission. 54 * 55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 58 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 65 * SUCH DAMAGE. 66 * 67 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94 68 */ 69 70 /*- 71 * Copyright (c) 1994 Christopher G. Demetriou 72 * 73 * Redistribution and use in source and binary forms, with or without 74 * modification, are permitted provided that the following conditions 75 * are met: 76 * 1. Redistributions of source code must retain the above copyright 77 * notice, this list of conditions and the following disclaimer. 78 * 2. Redistributions in binary form must reproduce the above copyright 79 * notice, this list of conditions and the following disclaimer in the 80 * documentation and/or other materials provided with the distribution. 81 * 3. All advertising materials mentioning features or use of this software 82 * must display the following acknowledgement: 83 * This product includes software developed by the University of 84 * California, Berkeley and its contributors. 85 * 4. Neither the name of the University nor the names of its contributors 86 * may be used to endorse or promote products derived from this software 87 * without specific prior written permission. 88 * 89 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 90 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 91 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 92 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 93 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 94 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 95 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 96 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 97 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 98 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 99 * SUCH DAMAGE. 100 * 101 * @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94 102 */ 103 104 /* 105 * Some references: 106 * Bach: The Design of the UNIX Operating System (Prentice Hall, 1986) 107 * Leffler, et al.: The Design and Implementation of the 4.3BSD 108 * UNIX Operating System (Addison Welley, 1989) 109 */ 110 111 #include <sys/cdefs.h> 112 __KERNEL_RCSID(0, "$NetBSD: vfs_bio.c,v 1.208 2008/07/31 05:38:05 simonb Exp $"); 113 114 #include "fs_ffs.h" 115 #include "opt_bufcache.h" 116 117 #include <sys/param.h> 118 #include <sys/systm.h> 119 #include <sys/kernel.h> 120 #include <sys/proc.h> 121 #include <sys/buf.h> 122 #include <sys/vnode.h> 123 #include <sys/mount.h> 124 #include <sys/resourcevar.h> 125 #include <sys/sysctl.h> 126 #include <sys/conf.h> 127 #include <sys/kauth.h> 128 #include <sys/fstrans.h> 129 #include <sys/intr.h> 130 #include <sys/cpu.h> 131 #include <sys/wapbl.h> 132 133 #include <uvm/uvm.h> 134 135 #include <miscfs/specfs/specdev.h> 136 137 #ifndef BUFPAGES 138 # define BUFPAGES 0 139 #endif 140 141 #ifdef BUFCACHE 142 # if (BUFCACHE < 5) || (BUFCACHE > 95) 143 # error BUFCACHE is not between 5 and 95 144 # endif 145 #else 146 # define BUFCACHE 15 147 #endif 148 149 u_int nbuf; /* XXX - for softdep_lockedbufs */ 150 u_int bufpages = BUFPAGES; /* optional hardwired count */ 151 u_int bufcache = BUFCACHE; /* max % of RAM to use for buffer cache */ 152 153 /* Function prototypes */ 154 struct bqueue; 155 156 static void buf_setwm(void); 157 static int buf_trim(void); 158 static void *bufpool_page_alloc(struct pool *, int); 159 static void bufpool_page_free(struct pool *, void *); 160 static buf_t *bio_doread(struct vnode *, daddr_t, int, 161 kauth_cred_t, int); 162 static buf_t *getnewbuf(int, int, int); 163 static int buf_lotsfree(void); 164 static int buf_canrelease(void); 165 static u_long buf_mempoolidx(u_long); 166 static u_long buf_roundsize(u_long); 167 static void *buf_malloc(size_t); 168 static void buf_mrelease(void *, size_t); 169 static void binsheadfree(buf_t *, struct bqueue *); 170 static void binstailfree(buf_t *, struct bqueue *); 171 int count_lock_queue(void); /* XXX */ 172 #ifdef DEBUG 173 static int checkfreelist(buf_t *, struct bqueue *, int); 174 #endif 175 static void biointr(void *); 176 static void biodone2(buf_t *); 177 static void bref(buf_t *); 178 static void brele(buf_t *); 179 180 /* 181 * Definitions for the buffer hash lists. 182 */ 183 #define BUFHASH(dvp, lbn) \ 184 (&bufhashtbl[(((long)(dvp) >> 8) + (int)(lbn)) & bufhash]) 185 LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash; 186 u_long bufhash; 187 struct bqueue bufqueues[BQUEUES]; 188 const struct bio_ops *bioopsp; /* I/O operation notification */ 189 190 static kcondvar_t needbuffer_cv; 191 192 /* 193 * Buffer queue lock. 194 */ 195 kmutex_t bufcache_lock; 196 kmutex_t buffer_lock; 197 198 /* Software ISR for completed transfers. */ 199 static void *biodone_sih; 200 201 /* Buffer pool for I/O buffers. */ 202 static pool_cache_t buf_cache; 203 static pool_cache_t bufio_cache; 204 205 /* XXX - somewhat gross.. */ 206 #if MAXBSIZE == 0x2000 207 #define NMEMPOOLS 5 208 #elif MAXBSIZE == 0x4000 209 #define NMEMPOOLS 6 210 #elif MAXBSIZE == 0x8000 211 #define NMEMPOOLS 7 212 #else 213 #define NMEMPOOLS 8 214 #endif 215 216 #define MEMPOOL_INDEX_OFFSET 9 /* smallest pool is 512 bytes */ 217 #if (1 << (NMEMPOOLS + MEMPOOL_INDEX_OFFSET - 1)) != MAXBSIZE 218 #error update vfs_bio buffer memory parameters 219 #endif 220 221 /* Buffer memory pools */ 222 static struct pool bmempools[NMEMPOOLS]; 223 224 static struct vm_map *buf_map; 225 226 /* 227 * Buffer memory pool allocator. 228 */ 229 static void * 230 bufpool_page_alloc(struct pool *pp, int flags) 231 { 232 233 return (void *)uvm_km_alloc(buf_map, 234 MAXBSIZE, MAXBSIZE, 235 ((flags & PR_WAITOK) ? 0 : UVM_KMF_NOWAIT | UVM_KMF_TRYLOCK) 236 | UVM_KMF_WIRED); 237 } 238 239 static void 240 bufpool_page_free(struct pool *pp, void *v) 241 { 242 243 uvm_km_free(buf_map, (vaddr_t)v, MAXBSIZE, UVM_KMF_WIRED); 244 } 245 246 static struct pool_allocator bufmempool_allocator = { 247 .pa_alloc = bufpool_page_alloc, 248 .pa_free = bufpool_page_free, 249 .pa_pagesz = MAXBSIZE, 250 }; 251 252 /* Buffer memory management variables */ 253 u_long bufmem_valimit; 254 u_long bufmem_hiwater; 255 u_long bufmem_lowater; 256 u_long bufmem; 257 258 /* 259 * MD code can call this to set a hard limit on the amount 260 * of virtual memory used by the buffer cache. 261 */ 262 int 263 buf_setvalimit(vsize_t sz) 264 { 265 266 /* We need to accommodate at least NMEMPOOLS of MAXBSIZE each */ 267 if (sz < NMEMPOOLS * MAXBSIZE) 268 return EINVAL; 269 270 bufmem_valimit = sz; 271 return 0; 272 } 273 274 static void 275 buf_setwm(void) 276 { 277 278 bufmem_hiwater = buf_memcalc(); 279 /* lowater is approx. 2% of memory (with bufcache = 15) */ 280 #define BUFMEM_WMSHIFT 3 281 #define BUFMEM_HIWMMIN (64 * 1024 << BUFMEM_WMSHIFT) 282 if (bufmem_hiwater < BUFMEM_HIWMMIN) 283 /* Ensure a reasonable minimum value */ 284 bufmem_hiwater = BUFMEM_HIWMMIN; 285 bufmem_lowater = bufmem_hiwater >> BUFMEM_WMSHIFT; 286 } 287 288 #ifdef DEBUG 289 int debug_verify_freelist = 0; 290 static int 291 checkfreelist(buf_t *bp, struct bqueue *dp, int ison) 292 { 293 buf_t *b; 294 295 if (!debug_verify_freelist) 296 return 1; 297 298 TAILQ_FOREACH(b, &dp->bq_queue, b_freelist) { 299 if (b == bp) 300 return ison ? 1 : 0; 301 } 302 303 return ison ? 0 : 1; 304 } 305 #endif 306 307 /* 308 * Insq/Remq for the buffer hash lists. 309 * Call with buffer queue locked. 310 */ 311 static void 312 binsheadfree(buf_t *bp, struct bqueue *dp) 313 { 314 315 KASSERT(mutex_owned(&bufcache_lock)); 316 KASSERT(bp->b_freelistindex == -1); 317 TAILQ_INSERT_HEAD(&dp->bq_queue, bp, b_freelist); 318 dp->bq_bytes += bp->b_bufsize; 319 bp->b_freelistindex = dp - bufqueues; 320 } 321 322 static void 323 binstailfree(buf_t *bp, struct bqueue *dp) 324 { 325 326 KASSERT(mutex_owned(&bufcache_lock)); 327 KASSERT(bp->b_freelistindex == -1); 328 TAILQ_INSERT_TAIL(&dp->bq_queue, bp, b_freelist); 329 dp->bq_bytes += bp->b_bufsize; 330 bp->b_freelistindex = dp - bufqueues; 331 } 332 333 void 334 bremfree(buf_t *bp) 335 { 336 struct bqueue *dp; 337 int bqidx = bp->b_freelistindex; 338 339 KASSERT(mutex_owned(&bufcache_lock)); 340 341 KASSERT(bqidx != -1); 342 dp = &bufqueues[bqidx]; 343 KDASSERT(checkfreelist(bp, dp, 1)); 344 KASSERT(dp->bq_bytes >= bp->b_bufsize); 345 TAILQ_REMOVE(&dp->bq_queue, bp, b_freelist); 346 dp->bq_bytes -= bp->b_bufsize; 347 348 /* For the sysctl helper. */ 349 if (bp == dp->bq_marker) 350 dp->bq_marker = NULL; 351 352 #if defined(DIAGNOSTIC) 353 bp->b_freelistindex = -1; 354 #endif /* defined(DIAGNOSTIC) */ 355 } 356 357 /* 358 * Add a reference to an buffer structure that came from buf_cache. 359 */ 360 static inline void 361 bref(buf_t *bp) 362 { 363 364 KASSERT(mutex_owned(&bufcache_lock)); 365 KASSERT(bp->b_refcnt > 0); 366 367 bp->b_refcnt++; 368 } 369 370 /* 371 * Free an unused buffer structure that came from buf_cache. 372 */ 373 static inline void 374 brele(buf_t *bp) 375 { 376 377 KASSERT(mutex_owned(&bufcache_lock)); 378 KASSERT(bp->b_refcnt > 0); 379 380 if (bp->b_refcnt-- == 1) { 381 buf_destroy(bp); 382 #ifdef DEBUG 383 memset((char *)bp, 0, sizeof(*bp)); 384 #endif 385 pool_cache_put(buf_cache, bp); 386 } 387 } 388 389 /* 390 * note that for some ports this is used by pmap bootstrap code to 391 * determine kva size. 392 */ 393 u_long 394 buf_memcalc(void) 395 { 396 u_long n; 397 398 /* 399 * Determine the upper bound of memory to use for buffers. 400 * 401 * - If bufpages is specified, use that as the number 402 * pages. 403 * 404 * - Otherwise, use bufcache as the percentage of 405 * physical memory. 406 */ 407 if (bufpages != 0) { 408 n = bufpages; 409 } else { 410 if (bufcache < 5) { 411 printf("forcing bufcache %d -> 5", bufcache); 412 bufcache = 5; 413 } 414 if (bufcache > 95) { 415 printf("forcing bufcache %d -> 95", bufcache); 416 bufcache = 95; 417 } 418 n = calc_cache_size(buf_map, bufcache, 419 (buf_map != kernel_map) ? 100 : BUFCACHE_VA_MAXPCT) 420 / PAGE_SIZE; 421 } 422 423 n <<= PAGE_SHIFT; 424 if (bufmem_valimit != 0 && n > bufmem_valimit) 425 n = bufmem_valimit; 426 427 return (n); 428 } 429 430 /* 431 * Initialize buffers and hash links for buffers. 432 */ 433 void 434 bufinit(void) 435 { 436 struct bqueue *dp; 437 int use_std; 438 u_int i; 439 440 mutex_init(&bufcache_lock, MUTEX_DEFAULT, IPL_NONE); 441 mutex_init(&buffer_lock, MUTEX_DEFAULT, IPL_NONE); 442 cv_init(&needbuffer_cv, "needbuf"); 443 444 if (bufmem_valimit != 0) { 445 vaddr_t minaddr = 0, maxaddr; 446 buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr, 447 bufmem_valimit, 0, false, 0); 448 if (buf_map == NULL) 449 panic("bufinit: cannot allocate submap"); 450 } else 451 buf_map = kernel_map; 452 453 /* 454 * Initialize buffer cache memory parameters. 455 */ 456 bufmem = 0; 457 buf_setwm(); 458 459 /* On "small" machines use small pool page sizes where possible */ 460 use_std = (physmem < atop(16*1024*1024)); 461 462 /* 463 * Also use them on systems that can map the pool pages using 464 * a direct-mapped segment. 465 */ 466 #ifdef PMAP_MAP_POOLPAGE 467 use_std = 1; 468 #endif 469 470 buf_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0, 471 "bufpl", NULL, IPL_SOFTBIO, NULL, NULL, NULL); 472 bufio_cache = pool_cache_init(sizeof(buf_t), 0, 0, 0, 473 "biopl", NULL, IPL_BIO, NULL, NULL, NULL); 474 475 bufmempool_allocator.pa_backingmap = buf_map; 476 for (i = 0; i < NMEMPOOLS; i++) { 477 struct pool_allocator *pa; 478 struct pool *pp = &bmempools[i]; 479 u_int size = 1 << (i + MEMPOOL_INDEX_OFFSET); 480 char *name = kmem_alloc(8, KM_SLEEP); 481 if (__predict_true(size >= 1024)) 482 (void)snprintf(name, 8, "buf%dk", size / 1024); 483 else 484 (void)snprintf(name, 8, "buf%db", size); 485 pa = (size <= PAGE_SIZE && use_std) 486 ? &pool_allocator_nointr 487 : &bufmempool_allocator; 488 pool_init(pp, size, 0, 0, 0, name, pa, IPL_NONE); 489 pool_setlowat(pp, 1); 490 pool_sethiwat(pp, 1); 491 } 492 493 /* Initialize the buffer queues */ 494 for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) { 495 TAILQ_INIT(&dp->bq_queue); 496 dp->bq_bytes = 0; 497 } 498 499 /* 500 * Estimate hash table size based on the amount of memory we 501 * intend to use for the buffer cache. The average buffer 502 * size is dependent on our clients (i.e. filesystems). 503 * 504 * For now, use an empirical 3K per buffer. 505 */ 506 nbuf = (bufmem_hiwater / 1024) / 3; 507 bufhashtbl = hashinit(nbuf, HASH_LIST, true, &bufhash); 508 } 509 510 void 511 bufinit2(void) 512 { 513 514 biodone_sih = softint_establish(SOFTINT_BIO | SOFTINT_MPSAFE, biointr, 515 NULL); 516 if (biodone_sih == NULL) 517 panic("bufinit2: can't establish soft interrupt"); 518 } 519 520 static int 521 buf_lotsfree(void) 522 { 523 int try, thresh; 524 525 /* Always allocate if less than the low water mark. */ 526 if (bufmem < bufmem_lowater) 527 return 1; 528 529 /* Never allocate if greater than the high water mark. */ 530 if (bufmem > bufmem_hiwater) 531 return 0; 532 533 /* If there's anything on the AGE list, it should be eaten. */ 534 if (TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue) != NULL) 535 return 0; 536 537 /* 538 * The probabily of getting a new allocation is inversely 539 * proportional to the current size of the cache, using 540 * a granularity of 16 steps. 541 */ 542 try = random() & 0x0000000fL; 543 544 /* Don't use "16 * bufmem" here to avoid a 32-bit overflow. */ 545 thresh = (bufmem - bufmem_lowater) / 546 ((bufmem_hiwater - bufmem_lowater) / 16); 547 548 if (try >= thresh) 549 return 1; 550 551 /* Otherwise don't allocate. */ 552 return 0; 553 } 554 555 /* 556 * Return estimate of bytes we think need to be 557 * released to help resolve low memory conditions. 558 * 559 * => called with bufcache_lock held. 560 */ 561 static int 562 buf_canrelease(void) 563 { 564 int pagedemand, ninvalid = 0; 565 566 KASSERT(mutex_owned(&bufcache_lock)); 567 568 if (bufmem < bufmem_lowater) 569 return 0; 570 571 if (bufmem > bufmem_hiwater) 572 return bufmem - bufmem_hiwater; 573 574 ninvalid += bufqueues[BQ_AGE].bq_bytes; 575 576 pagedemand = uvmexp.freetarg - uvmexp.free; 577 if (pagedemand < 0) 578 return ninvalid; 579 return MAX(ninvalid, MIN(2 * MAXBSIZE, 580 MIN((bufmem - bufmem_lowater) / 16, pagedemand * PAGE_SIZE))); 581 } 582 583 /* 584 * Buffer memory allocation helper functions 585 */ 586 static u_long 587 buf_mempoolidx(u_long size) 588 { 589 u_int n = 0; 590 591 size -= 1; 592 size >>= MEMPOOL_INDEX_OFFSET; 593 while (size) { 594 size >>= 1; 595 n += 1; 596 } 597 if (n >= NMEMPOOLS) 598 panic("buf mem pool index %d", n); 599 return n; 600 } 601 602 static u_long 603 buf_roundsize(u_long size) 604 { 605 /* Round up to nearest power of 2 */ 606 return (1 << (buf_mempoolidx(size) + MEMPOOL_INDEX_OFFSET)); 607 } 608 609 static void * 610 buf_malloc(size_t size) 611 { 612 u_int n = buf_mempoolidx(size); 613 void *addr; 614 615 while (1) { 616 addr = pool_get(&bmempools[n], PR_NOWAIT); 617 if (addr != NULL) 618 break; 619 620 /* No memory, see if we can free some. If so, try again */ 621 mutex_enter(&bufcache_lock); 622 if (buf_drain(1) > 0) { 623 mutex_exit(&bufcache_lock); 624 continue; 625 } 626 627 if (curlwp == uvm.pagedaemon_lwp) { 628 mutex_exit(&bufcache_lock); 629 return NULL; 630 } 631 632 /* Wait for buffers to arrive on the LRU queue */ 633 cv_timedwait(&needbuffer_cv, &bufcache_lock, hz / 4); 634 mutex_exit(&bufcache_lock); 635 } 636 637 return addr; 638 } 639 640 static void 641 buf_mrelease(void *addr, size_t size) 642 { 643 644 pool_put(&bmempools[buf_mempoolidx(size)], addr); 645 } 646 647 /* 648 * bread()/breadn() helper. 649 */ 650 static buf_t * 651 bio_doread(struct vnode *vp, daddr_t blkno, int size, kauth_cred_t cred, 652 int async) 653 { 654 buf_t *bp; 655 struct mount *mp; 656 657 bp = getblk(vp, blkno, size, 0, 0); 658 659 #ifdef DIAGNOSTIC 660 if (bp == NULL) { 661 panic("bio_doread: no such buf"); 662 } 663 #endif 664 665 /* 666 * If buffer does not have data valid, start a read. 667 * Note that if buffer is BC_INVAL, getblk() won't return it. 668 * Therefore, it's valid if its I/O has completed or been delayed. 669 */ 670 if (!ISSET(bp->b_oflags, (BO_DONE | BO_DELWRI))) { 671 /* Start I/O for the buffer. */ 672 SET(bp->b_flags, B_READ | async); 673 if (async) 674 BIO_SETPRIO(bp, BPRIO_TIMELIMITED); 675 else 676 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 677 VOP_STRATEGY(vp, bp); 678 679 /* Pay for the read. */ 680 curlwp->l_ru.ru_inblock++; 681 } else if (async) 682 brelse(bp, 0); 683 684 if (vp->v_type == VBLK) 685 mp = vp->v_specmountpoint; 686 else 687 mp = vp->v_mount; 688 689 /* 690 * Collect statistics on synchronous and asynchronous reads. 691 * Reads from block devices are charged to their associated 692 * filesystem (if any). 693 */ 694 if (mp != NULL) { 695 if (async == 0) 696 mp->mnt_stat.f_syncreads++; 697 else 698 mp->mnt_stat.f_asyncreads++; 699 } 700 701 return (bp); 702 } 703 704 /* 705 * Read a disk block. 706 * This algorithm described in Bach (p.54). 707 */ 708 int 709 bread(struct vnode *vp, daddr_t blkno, int size, kauth_cred_t cred, 710 int flags, buf_t **bpp) 711 { 712 buf_t *bp; 713 int error; 714 715 /* Get buffer for block. */ 716 bp = *bpp = bio_doread(vp, blkno, size, cred, 0); 717 718 /* Wait for the read to complete, and return result. */ 719 error = biowait(bp); 720 if (error == 0 && (flags & B_MODIFY) != 0) /* XXXX before the next code block or after? */ 721 error = fscow_run(bp, true); 722 723 if (!error) { 724 struct mount *mp = wapbl_vptomp(vp); 725 726 if (mp && mp->mnt_wapbl_replay && 727 WAPBL_REPLAY_ISOPEN(mp)) { 728 error = WAPBL_REPLAY_READ(mp, bp->b_data, bp->b_blkno, 729 bp->b_bcount); 730 if (error) { 731 mutex_enter(&bufcache_lock); 732 SET(bp->b_cflags, BC_INVAL); 733 mutex_exit(&bufcache_lock); 734 } 735 } 736 } 737 return error; 738 } 739 740 /* 741 * Read-ahead multiple disk blocks. The first is sync, the rest async. 742 * Trivial modification to the breada algorithm presented in Bach (p.55). 743 */ 744 int 745 breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t *rablks, 746 int *rasizes, int nrablks, kauth_cred_t cred, int flags, buf_t **bpp) 747 { 748 buf_t *bp; 749 int error, i; 750 751 bp = *bpp = bio_doread(vp, blkno, size, cred, 0); 752 753 /* 754 * For each of the read-ahead blocks, start a read, if necessary. 755 */ 756 mutex_enter(&bufcache_lock); 757 for (i = 0; i < nrablks; i++) { 758 /* If it's in the cache, just go on to next one. */ 759 if (incore(vp, rablks[i])) 760 continue; 761 762 /* Get a buffer for the read-ahead block */ 763 mutex_exit(&bufcache_lock); 764 (void) bio_doread(vp, rablks[i], rasizes[i], cred, B_ASYNC); 765 mutex_enter(&bufcache_lock); 766 } 767 mutex_exit(&bufcache_lock); 768 769 /* Otherwise, we had to start a read for it; wait until it's valid. */ 770 error = biowait(bp); 771 if (error == 0 && (flags & B_MODIFY) != 0) 772 error = fscow_run(bp, true); 773 return error; 774 } 775 776 /* 777 * Read with single-block read-ahead. Defined in Bach (p.55), but 778 * implemented as a call to breadn(). 779 * XXX for compatibility with old file systems. 780 */ 781 int 782 breada(struct vnode *vp, daddr_t blkno, int size, daddr_t rablkno, 783 int rabsize, kauth_cred_t cred, int flags, buf_t **bpp) 784 { 785 786 return (breadn(vp, blkno, size, &rablkno, &rabsize, 1, 787 cred, flags, bpp)); 788 } 789 790 /* 791 * Block write. Described in Bach (p.56) 792 */ 793 int 794 bwrite(buf_t *bp) 795 { 796 int rv, sync, wasdelayed; 797 struct vnode *vp; 798 struct mount *mp; 799 800 KASSERT(ISSET(bp->b_cflags, BC_BUSY)); 801 KASSERT(!cv_has_waiters(&bp->b_done)); 802 803 vp = bp->b_vp; 804 if (vp != NULL) { 805 KASSERT(bp->b_objlock == &vp->v_interlock); 806 if (vp->v_type == VBLK) 807 mp = vp->v_specmountpoint; 808 else 809 mp = vp->v_mount; 810 } else { 811 mp = NULL; 812 } 813 814 if (mp && mp->mnt_wapbl) { 815 if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) { 816 bdwrite(bp); 817 return 0; 818 } 819 } 820 821 /* 822 * Remember buffer type, to switch on it later. If the write was 823 * synchronous, but the file system was mounted with MNT_ASYNC, 824 * convert it to a delayed write. 825 * XXX note that this relies on delayed tape writes being converted 826 * to async, not sync writes (which is safe, but ugly). 827 */ 828 sync = !ISSET(bp->b_flags, B_ASYNC); 829 if (sync && mp != NULL && ISSET(mp->mnt_flag, MNT_ASYNC)) { 830 bdwrite(bp); 831 return (0); 832 } 833 834 /* 835 * Collect statistics on synchronous and asynchronous writes. 836 * Writes to block devices are charged to their associated 837 * filesystem (if any). 838 */ 839 if (mp != NULL) { 840 if (sync) 841 mp->mnt_stat.f_syncwrites++; 842 else 843 mp->mnt_stat.f_asyncwrites++; 844 } 845 846 /* 847 * Pay for the I/O operation and make sure the buf is on the correct 848 * vnode queue. 849 */ 850 bp->b_error = 0; 851 wasdelayed = ISSET(bp->b_oflags, BO_DELWRI); 852 CLR(bp->b_flags, B_READ); 853 if (wasdelayed) { 854 mutex_enter(&bufcache_lock); 855 mutex_enter(bp->b_objlock); 856 CLR(bp->b_oflags, BO_DONE | BO_DELWRI); 857 reassignbuf(bp, bp->b_vp); 858 mutex_exit(&bufcache_lock); 859 } else { 860 curlwp->l_ru.ru_oublock++; 861 mutex_enter(bp->b_objlock); 862 CLR(bp->b_oflags, BO_DONE | BO_DELWRI); 863 } 864 if (vp != NULL) 865 vp->v_numoutput++; 866 mutex_exit(bp->b_objlock); 867 868 /* Initiate disk write. */ 869 if (sync) 870 BIO_SETPRIO(bp, BPRIO_TIMECRITICAL); 871 else 872 BIO_SETPRIO(bp, BPRIO_TIMELIMITED); 873 874 VOP_STRATEGY(vp, bp); 875 876 if (sync) { 877 /* If I/O was synchronous, wait for it to complete. */ 878 rv = biowait(bp); 879 880 /* Release the buffer. */ 881 brelse(bp, 0); 882 883 return (rv); 884 } else { 885 return (0); 886 } 887 } 888 889 int 890 vn_bwrite(void *v) 891 { 892 struct vop_bwrite_args *ap = v; 893 894 return (bwrite(ap->a_bp)); 895 } 896 897 /* 898 * Delayed write. 899 * 900 * The buffer is marked dirty, but is not queued for I/O. 901 * This routine should be used when the buffer is expected 902 * to be modified again soon, typically a small write that 903 * partially fills a buffer. 904 * 905 * NB: magnetic tapes cannot be delayed; they must be 906 * written in the order that the writes are requested. 907 * 908 * Described in Leffler, et al. (pp. 208-213). 909 */ 910 void 911 bdwrite(buf_t *bp) 912 { 913 914 KASSERT(bp->b_vp == NULL || bp->b_vp->v_tag != VT_UFS || 915 bp->b_vp->v_type == VBLK || ISSET(bp->b_flags, B_COWDONE)); 916 KASSERT(ISSET(bp->b_cflags, BC_BUSY)); 917 KASSERT(!cv_has_waiters(&bp->b_done)); 918 919 /* If this is a tape block, write the block now. */ 920 if (bdev_type(bp->b_dev) == D_TAPE) { 921 bawrite(bp); 922 return; 923 } 924 925 if (wapbl_vphaswapbl(bp->b_vp)) { 926 struct mount *mp = wapbl_vptomp(bp->b_vp); 927 928 if (bp->b_iodone != mp->mnt_wapbl_op->wo_wapbl_biodone) { 929 WAPBL_ADD_BUF(mp, bp); 930 } 931 } 932 933 /* 934 * If the block hasn't been seen before: 935 * (1) Mark it as having been seen, 936 * (2) Charge for the write, 937 * (3) Make sure it's on its vnode's correct block list. 938 */ 939 KASSERT(bp->b_vp == NULL || bp->b_objlock == &bp->b_vp->v_interlock); 940 941 if (!ISSET(bp->b_oflags, BO_DELWRI)) { 942 mutex_enter(&bufcache_lock); 943 mutex_enter(bp->b_objlock); 944 SET(bp->b_oflags, BO_DELWRI); 945 curlwp->l_ru.ru_oublock++; 946 reassignbuf(bp, bp->b_vp); 947 mutex_exit(&bufcache_lock); 948 } else { 949 mutex_enter(bp->b_objlock); 950 } 951 /* Otherwise, the "write" is done, so mark and release the buffer. */ 952 CLR(bp->b_oflags, BO_DONE); 953 mutex_exit(bp->b_objlock); 954 955 brelse(bp, 0); 956 } 957 958 /* 959 * Asynchronous block write; just an asynchronous bwrite(). 960 */ 961 void 962 bawrite(buf_t *bp) 963 { 964 965 KASSERT(ISSET(bp->b_cflags, BC_BUSY)); 966 967 SET(bp->b_flags, B_ASYNC); 968 VOP_BWRITE(bp); 969 } 970 971 /* 972 * Same as first half of bdwrite, mark buffer dirty, but do not release it. 973 * Call with the buffer interlock held. 974 * 975 * Note: called only from biodone() through ffs softdep's io_complete() 976 * Note2: smbfs also learned about bdirty(). 977 */ 978 void 979 bdirty(buf_t *bp) 980 { 981 982 KASSERT(mutex_owned(&bufcache_lock)); 983 KASSERT(bp->b_objlock == &bp->b_vp->v_interlock); 984 KASSERT(mutex_owned(bp->b_objlock)); 985 KASSERT(ISSET(bp->b_cflags, BC_BUSY)); 986 987 CLR(bp->b_cflags, BC_AGE); 988 989 if (!ISSET(bp->b_oflags, BO_DELWRI)) { 990 SET(bp->b_oflags, BO_DELWRI); 991 curlwp->l_ru.ru_oublock++; 992 reassignbuf(bp, bp->b_vp); 993 } 994 } 995 996 997 /* 998 * Release a buffer on to the free lists. 999 * Described in Bach (p. 46). 1000 */ 1001 void 1002 brelsel(buf_t *bp, int set) 1003 { 1004 struct bqueue *bufq; 1005 struct vnode *vp; 1006 1007 KASSERT(mutex_owned(&bufcache_lock)); 1008 KASSERT(!cv_has_waiters(&bp->b_done)); 1009 KASSERT(bp->b_refcnt > 0); 1010 1011 SET(bp->b_cflags, set); 1012 1013 KASSERT(ISSET(bp->b_cflags, BC_BUSY)); 1014 KASSERT(bp->b_iodone == NULL); 1015 1016 /* Wake up any processes waiting for any buffer to become free. */ 1017 cv_signal(&needbuffer_cv); 1018 1019 /* Wake up any proceeses waiting for _this_ buffer to become */ 1020 if (ISSET(bp->b_cflags, BC_WANTED)) 1021 CLR(bp->b_cflags, BC_WANTED|BC_AGE); 1022 1023 /* 1024 * Determine which queue the buffer should be on, then put it there. 1025 */ 1026 1027 /* If it's locked, don't report an error; try again later. */ 1028 if (ISSET(bp->b_flags, B_LOCKED)) 1029 bp->b_error = 0; 1030 1031 /* If it's not cacheable, or an error, mark it invalid. */ 1032 if (ISSET(bp->b_cflags, BC_NOCACHE) || bp->b_error != 0) 1033 SET(bp->b_cflags, BC_INVAL); 1034 1035 if (ISSET(bp->b_cflags, BC_VFLUSH)) { 1036 /* 1037 * This is a delayed write buffer that was just flushed to 1038 * disk. It is still on the LRU queue. If it's become 1039 * invalid, then we need to move it to a different queue; 1040 * otherwise leave it in its current position. 1041 */ 1042 CLR(bp->b_cflags, BC_VFLUSH); 1043 if (!ISSET(bp->b_cflags, BC_INVAL|BC_AGE) && 1044 !ISSET(bp->b_flags, B_LOCKED) && bp->b_error == 0) { 1045 KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 1)); 1046 goto already_queued; 1047 } else { 1048 bremfree(bp); 1049 } 1050 } 1051 1052 KDASSERT(checkfreelist(bp, &bufqueues[BQ_AGE], 0)); 1053 KDASSERT(checkfreelist(bp, &bufqueues[BQ_LRU], 0)); 1054 KDASSERT(checkfreelist(bp, &bufqueues[BQ_LOCKED], 0)); 1055 1056 if ((bp->b_bufsize <= 0) || ISSET(bp->b_cflags, BC_INVAL)) { 1057 /* 1058 * If it's invalid or empty, dissociate it from its vnode 1059 * and put on the head of the appropriate queue. 1060 */ 1061 if (bioopsp != NULL) 1062 (*bioopsp->io_deallocate)(bp); 1063 1064 if (ISSET(bp->b_flags, B_LOCKED)) { 1065 if (wapbl_vphaswapbl(vp = bp->b_vp)) { 1066 struct mount *mp = wapbl_vptomp(vp); 1067 1068 KASSERT(bp->b_iodone 1069 != mp->mnt_wapbl_op->wo_wapbl_biodone); 1070 WAPBL_REMOVE_BUF(mp, bp); 1071 } 1072 } 1073 1074 mutex_enter(bp->b_objlock); 1075 CLR(bp->b_oflags, BO_DONE|BO_DELWRI); 1076 if ((vp = bp->b_vp) != NULL) { 1077 KASSERT(bp->b_objlock == &vp->v_interlock); 1078 reassignbuf(bp, bp->b_vp); 1079 brelvp(bp); 1080 mutex_exit(&vp->v_interlock); 1081 } else { 1082 KASSERT(bp->b_objlock == &buffer_lock); 1083 mutex_exit(bp->b_objlock); 1084 } 1085 1086 if (bp->b_bufsize <= 0) 1087 /* no data */ 1088 goto already_queued; 1089 else 1090 /* invalid data */ 1091 bufq = &bufqueues[BQ_AGE]; 1092 binsheadfree(bp, bufq); 1093 } else { 1094 /* 1095 * It has valid data. Put it on the end of the appropriate 1096 * queue, so that it'll stick around for as long as possible. 1097 * If buf is AGE, but has dependencies, must put it on last 1098 * bufqueue to be scanned, ie LRU. This protects against the 1099 * livelock where BQ_AGE only has buffers with dependencies, 1100 * and we thus never get to the dependent buffers in BQ_LRU. 1101 */ 1102 if (ISSET(bp->b_flags, B_LOCKED)) { 1103 /* locked in core */ 1104 bufq = &bufqueues[BQ_LOCKED]; 1105 } else if (!ISSET(bp->b_cflags, BC_AGE)) { 1106 /* valid data */ 1107 bufq = &bufqueues[BQ_LRU]; 1108 } else { 1109 /* stale but valid data */ 1110 int has_deps; 1111 1112 if (bioopsp != NULL) 1113 has_deps = (*bioopsp->io_countdeps)(bp, 0); 1114 else 1115 has_deps = 0; 1116 bufq = has_deps ? &bufqueues[BQ_LRU] : 1117 &bufqueues[BQ_AGE]; 1118 } 1119 binstailfree(bp, bufq); 1120 } 1121 already_queued: 1122 /* Unlock the buffer. */ 1123 CLR(bp->b_cflags, BC_AGE|BC_BUSY|BC_NOCACHE); 1124 CLR(bp->b_flags, B_ASYNC); 1125 cv_broadcast(&bp->b_busy); 1126 1127 if (bp->b_bufsize <= 0) 1128 brele(bp); 1129 } 1130 1131 void 1132 brelse(buf_t *bp, int set) 1133 { 1134 1135 mutex_enter(&bufcache_lock); 1136 brelsel(bp, set); 1137 mutex_exit(&bufcache_lock); 1138 } 1139 1140 /* 1141 * Determine if a block is in the cache. 1142 * Just look on what would be its hash chain. If it's there, return 1143 * a pointer to it, unless it's marked invalid. If it's marked invalid, 1144 * we normally don't return the buffer, unless the caller explicitly 1145 * wants us to. 1146 */ 1147 buf_t * 1148 incore(struct vnode *vp, daddr_t blkno) 1149 { 1150 buf_t *bp; 1151 1152 KASSERT(mutex_owned(&bufcache_lock)); 1153 1154 /* Search hash chain */ 1155 LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) { 1156 if (bp->b_lblkno == blkno && bp->b_vp == vp && 1157 !ISSET(bp->b_cflags, BC_INVAL)) { 1158 KASSERT(bp->b_objlock == &vp->v_interlock); 1159 return (bp); 1160 } 1161 } 1162 1163 return (NULL); 1164 } 1165 1166 /* 1167 * Get a block of requested size that is associated with 1168 * a given vnode and block offset. If it is found in the 1169 * block cache, mark it as having been found, make it busy 1170 * and return it. Otherwise, return an empty block of the 1171 * correct size. It is up to the caller to insure that the 1172 * cached blocks be of the correct size. 1173 */ 1174 buf_t * 1175 getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, int slptimeo) 1176 { 1177 int err, preserve; 1178 buf_t *bp; 1179 1180 mutex_enter(&bufcache_lock); 1181 loop: 1182 bp = incore(vp, blkno); 1183 if (bp != NULL) { 1184 err = bbusy(bp, ((slpflag & PCATCH) != 0), slptimeo, NULL); 1185 if (err != 0) { 1186 if (err == EPASSTHROUGH) 1187 goto loop; 1188 mutex_exit(&bufcache_lock); 1189 return (NULL); 1190 } 1191 KASSERT(!cv_has_waiters(&bp->b_done)); 1192 #ifdef DIAGNOSTIC 1193 if (ISSET(bp->b_oflags, BO_DONE|BO_DELWRI) && 1194 bp->b_bcount < size && vp->v_type != VBLK) 1195 panic("getblk: block size invariant failed"); 1196 #endif 1197 bremfree(bp); 1198 preserve = 1; 1199 } else { 1200 if ((bp = getnewbuf(slpflag, slptimeo, 0)) == NULL) 1201 goto loop; 1202 1203 if (incore(vp, blkno) != NULL) { 1204 /* The block has come into memory in the meantime. */ 1205 brelsel(bp, 0); 1206 goto loop; 1207 } 1208 1209 LIST_INSERT_HEAD(BUFHASH(vp, blkno), bp, b_hash); 1210 bp->b_blkno = bp->b_lblkno = bp->b_rawblkno = blkno; 1211 mutex_enter(&vp->v_interlock); 1212 bgetvp(vp, bp); 1213 mutex_exit(&vp->v_interlock); 1214 preserve = 0; 1215 } 1216 mutex_exit(&bufcache_lock); 1217 1218 /* 1219 * LFS can't track total size of B_LOCKED buffer (locked_queue_bytes) 1220 * if we re-size buffers here. 1221 */ 1222 if (ISSET(bp->b_flags, B_LOCKED)) { 1223 KASSERT(bp->b_bufsize >= size); 1224 } else { 1225 if (allocbuf(bp, size, preserve)) { 1226 mutex_enter(&bufcache_lock); 1227 LIST_REMOVE(bp, b_hash); 1228 mutex_exit(&bufcache_lock); 1229 brelse(bp, BC_INVAL); 1230 return NULL; 1231 } 1232 } 1233 BIO_SETPRIO(bp, BPRIO_DEFAULT); 1234 return (bp); 1235 } 1236 1237 /* 1238 * Get an empty, disassociated buffer of given size. 1239 */ 1240 buf_t * 1241 geteblk(int size) 1242 { 1243 buf_t *bp; 1244 int error; 1245 1246 mutex_enter(&bufcache_lock); 1247 while ((bp = getnewbuf(0, 0, 0)) == NULL) 1248 ; 1249 1250 SET(bp->b_cflags, BC_INVAL); 1251 LIST_INSERT_HEAD(&invalhash, bp, b_hash); 1252 mutex_exit(&bufcache_lock); 1253 BIO_SETPRIO(bp, BPRIO_DEFAULT); 1254 error = allocbuf(bp, size, 0); 1255 KASSERT(error == 0); 1256 return (bp); 1257 } 1258 1259 /* 1260 * Expand or contract the actual memory allocated to a buffer. 1261 * 1262 * If the buffer shrinks, data is lost, so it's up to the 1263 * caller to have written it out *first*; this routine will not 1264 * start a write. If the buffer grows, it's the callers 1265 * responsibility to fill out the buffer's additional contents. 1266 */ 1267 int 1268 allocbuf(buf_t *bp, int size, int preserve) 1269 { 1270 void *addr; 1271 vsize_t oldsize, desired_size; 1272 int oldcount; 1273 int delta; 1274 1275 desired_size = buf_roundsize(size); 1276 if (desired_size > MAXBSIZE) 1277 printf("allocbuf: buffer larger than MAXBSIZE requested"); 1278 1279 oldcount = bp->b_bcount; 1280 1281 bp->b_bcount = size; 1282 1283 oldsize = bp->b_bufsize; 1284 if (oldsize == desired_size) 1285 goto out; 1286 1287 /* 1288 * If we want a buffer of a different size, re-allocate the 1289 * buffer's memory; copy old content only if needed. 1290 */ 1291 addr = buf_malloc(desired_size); 1292 if (addr == NULL) 1293 return ENOMEM; 1294 if (preserve) 1295 memcpy(addr, bp->b_data, MIN(oldsize,desired_size)); 1296 if (bp->b_data != NULL) 1297 buf_mrelease(bp->b_data, oldsize); 1298 bp->b_data = addr; 1299 bp->b_bufsize = desired_size; 1300 1301 /* 1302 * Update overall buffer memory counter (protected by bufcache_lock) 1303 */ 1304 delta = (long)desired_size - (long)oldsize; 1305 1306 mutex_enter(&bufcache_lock); 1307 if ((bufmem += delta) > bufmem_hiwater) { 1308 /* 1309 * Need to trim overall memory usage. 1310 */ 1311 while (buf_canrelease()) { 1312 if (curcpu()->ci_schedstate.spc_flags & 1313 SPCF_SHOULDYIELD) { 1314 mutex_exit(&bufcache_lock); 1315 preempt(); 1316 mutex_enter(&bufcache_lock); 1317 } 1318 if (buf_trim() == 0) 1319 break; 1320 } 1321 } 1322 mutex_exit(&bufcache_lock); 1323 1324 out: 1325 if (wapbl_vphaswapbl(bp->b_vp)) 1326 WAPBL_RESIZE_BUF(wapbl_vptomp(bp->b_vp), bp, oldsize, oldcount); 1327 1328 return 0; 1329 } 1330 1331 /* 1332 * Find a buffer which is available for use. 1333 * Select something from a free list. 1334 * Preference is to AGE list, then LRU list. 1335 * 1336 * Called with the buffer queues locked. 1337 * Return buffer locked. 1338 */ 1339 buf_t * 1340 getnewbuf(int slpflag, int slptimeo, int from_bufq) 1341 { 1342 buf_t *bp; 1343 struct vnode *vp; 1344 1345 start: 1346 KASSERT(mutex_owned(&bufcache_lock)); 1347 1348 /* 1349 * Get a new buffer from the pool. 1350 */ 1351 if (!from_bufq && buf_lotsfree()) { 1352 mutex_exit(&bufcache_lock); 1353 bp = pool_cache_get(buf_cache, PR_NOWAIT); 1354 if (bp != NULL) { 1355 memset((char *)bp, 0, sizeof(*bp)); 1356 buf_init(bp); 1357 SET(bp->b_cflags, BC_BUSY); /* mark buffer busy */ 1358 mutex_enter(&bufcache_lock); 1359 #if defined(DIAGNOSTIC) 1360 bp->b_freelistindex = -1; 1361 #endif /* defined(DIAGNOSTIC) */ 1362 return (bp); 1363 } 1364 mutex_enter(&bufcache_lock); 1365 } 1366 1367 if ((bp = TAILQ_FIRST(&bufqueues[BQ_AGE].bq_queue)) != NULL || 1368 (bp = TAILQ_FIRST(&bufqueues[BQ_LRU].bq_queue)) != NULL) { 1369 KASSERT(!ISSET(bp->b_cflags, BC_BUSY) || ISSET(bp->b_cflags, BC_VFLUSH)); 1370 bremfree(bp); 1371 1372 /* Buffer is no longer on free lists. */ 1373 SET(bp->b_cflags, BC_BUSY); 1374 } else { 1375 /* 1376 * XXX: !from_bufq should be removed. 1377 */ 1378 if (!from_bufq || curlwp != uvm.pagedaemon_lwp) { 1379 /* wait for a free buffer of any kind */ 1380 if ((slpflag & PCATCH) != 0) 1381 (void)cv_timedwait_sig(&needbuffer_cv, 1382 &bufcache_lock, slptimeo); 1383 else 1384 (void)cv_timedwait(&needbuffer_cv, 1385 &bufcache_lock, slptimeo); 1386 } 1387 return (NULL); 1388 } 1389 1390 #ifdef DIAGNOSTIC 1391 if (bp->b_bufsize <= 0) 1392 panic("buffer %p: on queue but empty", bp); 1393 #endif 1394 1395 if (ISSET(bp->b_cflags, BC_VFLUSH)) { 1396 /* 1397 * This is a delayed write buffer being flushed to disk. Make 1398 * sure it gets aged out of the queue when it's finished, and 1399 * leave it off the LRU queue. 1400 */ 1401 CLR(bp->b_cflags, BC_VFLUSH); 1402 SET(bp->b_cflags, BC_AGE); 1403 goto start; 1404 } 1405 1406 KASSERT(ISSET(bp->b_cflags, BC_BUSY)); 1407 KASSERT(bp->b_refcnt > 0); 1408 KASSERT(!cv_has_waiters(&bp->b_done)); 1409 1410 /* 1411 * If buffer was a delayed write, start it and return NULL 1412 * (since we might sleep while starting the write). 1413 */ 1414 if (ISSET(bp->b_oflags, BO_DELWRI)) { 1415 /* 1416 * This buffer has gone through the LRU, so make sure it gets 1417 * reused ASAP. 1418 */ 1419 SET(bp->b_cflags, BC_AGE); 1420 mutex_exit(&bufcache_lock); 1421 bawrite(bp); 1422 mutex_enter(&bufcache_lock); 1423 return (NULL); 1424 } 1425 1426 vp = bp->b_vp; 1427 if (bioopsp != NULL) 1428 (*bioopsp->io_deallocate)(bp); 1429 1430 /* clear out various other fields */ 1431 bp->b_cflags = BC_BUSY; 1432 bp->b_oflags = 0; 1433 bp->b_flags = 0; 1434 bp->b_dev = NODEV; 1435 bp->b_blkno = 0; 1436 bp->b_lblkno = 0; 1437 bp->b_rawblkno = 0; 1438 bp->b_iodone = 0; 1439 bp->b_error = 0; 1440 bp->b_resid = 0; 1441 bp->b_bcount = 0; 1442 1443 LIST_REMOVE(bp, b_hash); 1444 1445 /* Disassociate us from our vnode, if we had one... */ 1446 if (vp != NULL) { 1447 mutex_enter(&vp->v_interlock); 1448 brelvp(bp); 1449 mutex_exit(&vp->v_interlock); 1450 } 1451 1452 return (bp); 1453 } 1454 1455 /* 1456 * Attempt to free an aged buffer off the queues. 1457 * Called with queue lock held. 1458 * Returns the amount of buffer memory freed. 1459 */ 1460 static int 1461 buf_trim(void) 1462 { 1463 buf_t *bp; 1464 long size = 0; 1465 1466 KASSERT(mutex_owned(&bufcache_lock)); 1467 1468 /* Instruct getnewbuf() to get buffers off the queues */ 1469 if ((bp = getnewbuf(PCATCH, 1, 1)) == NULL) 1470 return 0; 1471 1472 KASSERT((bp->b_cflags & BC_WANTED) == 0); 1473 size = bp->b_bufsize; 1474 bufmem -= size; 1475 if (size > 0) { 1476 buf_mrelease(bp->b_data, size); 1477 bp->b_bcount = bp->b_bufsize = 0; 1478 } 1479 /* brelse() will return the buffer to the global buffer pool */ 1480 brelsel(bp, 0); 1481 return size; 1482 } 1483 1484 int 1485 buf_drain(int n) 1486 { 1487 int size = 0, sz; 1488 1489 KASSERT(mutex_owned(&bufcache_lock)); 1490 1491 while (size < n && bufmem > bufmem_lowater) { 1492 sz = buf_trim(); 1493 if (sz <= 0) 1494 break; 1495 size += sz; 1496 } 1497 1498 return size; 1499 } 1500 1501 /* 1502 * Wait for operations on the buffer to complete. 1503 * When they do, extract and return the I/O's error value. 1504 */ 1505 int 1506 biowait(buf_t *bp) 1507 { 1508 1509 KASSERT(ISSET(bp->b_cflags, BC_BUSY)); 1510 KASSERT(bp->b_refcnt > 0); 1511 1512 mutex_enter(bp->b_objlock); 1513 while (!ISSET(bp->b_oflags, BO_DONE | BO_DELWRI)) 1514 cv_wait(&bp->b_done, bp->b_objlock); 1515 mutex_exit(bp->b_objlock); 1516 1517 return bp->b_error; 1518 } 1519 1520 /* 1521 * Mark I/O complete on a buffer. 1522 * 1523 * If a callback has been requested, e.g. the pageout 1524 * daemon, do so. Otherwise, awaken waiting processes. 1525 * 1526 * [ Leffler, et al., says on p.247: 1527 * "This routine wakes up the blocked process, frees the buffer 1528 * for an asynchronous write, or, for a request by the pagedaemon 1529 * process, invokes a procedure specified in the buffer structure" ] 1530 * 1531 * In real life, the pagedaemon (or other system processes) wants 1532 * to do async stuff to, and doesn't want the buffer brelse()'d. 1533 * (for swap pager, that puts swap buffers on the free lists (!!!), 1534 * for the vn device, that puts malloc'd buffers on the free lists!) 1535 */ 1536 void 1537 biodone(buf_t *bp) 1538 { 1539 int s; 1540 1541 KASSERT(!ISSET(bp->b_oflags, BO_DONE)); 1542 1543 if (cpu_intr_p()) { 1544 /* From interrupt mode: defer to a soft interrupt. */ 1545 s = splvm(); 1546 TAILQ_INSERT_TAIL(&curcpu()->ci_data.cpu_biodone, bp, b_actq); 1547 softint_schedule(biodone_sih); 1548 splx(s); 1549 } else { 1550 /* Process now - the buffer may be freed soon. */ 1551 biodone2(bp); 1552 } 1553 } 1554 1555 static void 1556 biodone2(buf_t *bp) 1557 { 1558 void (*callout)(buf_t *); 1559 1560 if (bioopsp != NULL) 1561 (*bioopsp->io_complete)(bp); 1562 1563 mutex_enter(bp->b_objlock); 1564 /* Note that the transfer is done. */ 1565 if (ISSET(bp->b_oflags, BO_DONE)) 1566 panic("biodone2 already"); 1567 CLR(bp->b_flags, B_COWDONE); 1568 SET(bp->b_oflags, BO_DONE); 1569 BIO_SETPRIO(bp, BPRIO_DEFAULT); 1570 1571 /* Wake up waiting writers. */ 1572 if (!ISSET(bp->b_flags, B_READ)) 1573 vwakeup(bp); 1574 1575 if ((callout = bp->b_iodone) != NULL) { 1576 /* Note callout done, then call out. */ 1577 KASSERT(!cv_has_waiters(&bp->b_done)); 1578 KERNEL_LOCK(1, NULL); /* XXXSMP */ 1579 bp->b_iodone = NULL; 1580 mutex_exit(bp->b_objlock); 1581 (*callout)(bp); 1582 KERNEL_UNLOCK_ONE(NULL); /* XXXSMP */ 1583 } else if (ISSET(bp->b_flags, B_ASYNC)) { 1584 /* If async, release. */ 1585 KASSERT(!cv_has_waiters(&bp->b_done)); 1586 mutex_exit(bp->b_objlock); 1587 brelse(bp, 0); 1588 } else { 1589 /* Otherwise just wake up waiters in biowait(). */ 1590 cv_broadcast(&bp->b_done); 1591 mutex_exit(bp->b_objlock); 1592 } 1593 } 1594 1595 static void 1596 biointr(void *cookie) 1597 { 1598 struct cpu_info *ci; 1599 buf_t *bp; 1600 int s; 1601 1602 ci = curcpu(); 1603 1604 while (!TAILQ_EMPTY(&ci->ci_data.cpu_biodone)) { 1605 KASSERT(curcpu() == ci); 1606 1607 s = splvm(); 1608 bp = TAILQ_FIRST(&ci->ci_data.cpu_biodone); 1609 TAILQ_REMOVE(&ci->ci_data.cpu_biodone, bp, b_actq); 1610 splx(s); 1611 1612 biodone2(bp); 1613 } 1614 } 1615 1616 /* 1617 * Return a count of buffers on the "locked" queue. 1618 */ 1619 int 1620 count_lock_queue(void) 1621 { 1622 buf_t *bp; 1623 int n = 0; 1624 1625 mutex_enter(&bufcache_lock); 1626 TAILQ_FOREACH(bp, &bufqueues[BQ_LOCKED].bq_queue, b_freelist) 1627 n++; 1628 mutex_exit(&bufcache_lock); 1629 return (n); 1630 } 1631 1632 /* 1633 * Wait for all buffers to complete I/O 1634 * Return the number of "stuck" buffers. 1635 */ 1636 int 1637 buf_syncwait(void) 1638 { 1639 buf_t *bp; 1640 int iter, nbusy, nbusy_prev = 0, dcount, ihash; 1641 1642 dcount = 10000; 1643 for (iter = 0; iter < 20;) { 1644 mutex_enter(&bufcache_lock); 1645 nbusy = 0; 1646 for (ihash = 0; ihash < bufhash+1; ihash++) { 1647 LIST_FOREACH(bp, &bufhashtbl[ihash], b_hash) { 1648 if ((bp->b_cflags & (BC_BUSY|BC_INVAL)) == BC_BUSY) 1649 nbusy += ((bp->b_flags & B_READ) == 0); 1650 /* 1651 * With soft updates, some buffers that are 1652 * written will be remarked as dirty until other 1653 * buffers are written. 1654 */ 1655 if (bp->b_vp && bp->b_vp->v_mount 1656 && (bp->b_vp->v_mount->mnt_flag & MNT_SOFTDEP) 1657 && (bp->b_oflags & BO_DELWRI)) { 1658 bremfree(bp); 1659 bp->b_cflags |= BC_BUSY; 1660 nbusy++; 1661 mutex_exit(&bufcache_lock); 1662 bawrite(bp); 1663 if (dcount-- <= 0) { 1664 printf("softdep "); 1665 goto fail; 1666 } 1667 mutex_enter(&bufcache_lock); 1668 } 1669 } 1670 } 1671 mutex_exit(&bufcache_lock); 1672 1673 if (nbusy == 0) 1674 break; 1675 if (nbusy_prev == 0) 1676 nbusy_prev = nbusy; 1677 printf("%d ", nbusy); 1678 kpause("bflush", false, (iter == 0) ? 1 : hz / 25 * iter, NULL); 1679 if (nbusy >= nbusy_prev) /* we didn't flush anything */ 1680 iter++; 1681 else 1682 nbusy_prev = nbusy; 1683 } 1684 1685 if (nbusy) { 1686 fail:; 1687 #if defined(DEBUG) || defined(DEBUG_HALT_BUSY) 1688 printf("giving up\nPrinting vnodes for busy buffers\n"); 1689 for (ihash = 0; ihash < bufhash+1; ihash++) { 1690 LIST_FOREACH(bp, &bufhashtbl[ihash], b_hash) { 1691 if ((bp->b_cflags & (BC_BUSY|BC_INVAL)) == BC_BUSY && 1692 (bp->b_flags & B_READ) == 0) 1693 vprint(NULL, bp->b_vp); 1694 } 1695 } 1696 #endif 1697 } 1698 1699 return nbusy; 1700 } 1701 1702 static void 1703 sysctl_fillbuf(buf_t *i, struct buf_sysctl *o) 1704 { 1705 1706 o->b_flags = i->b_flags | i->b_cflags | i->b_oflags; 1707 o->b_error = i->b_error; 1708 o->b_prio = i->b_prio; 1709 o->b_dev = i->b_dev; 1710 o->b_bufsize = i->b_bufsize; 1711 o->b_bcount = i->b_bcount; 1712 o->b_resid = i->b_resid; 1713 o->b_addr = PTRTOUINT64(i->b_data); 1714 o->b_blkno = i->b_blkno; 1715 o->b_rawblkno = i->b_rawblkno; 1716 o->b_iodone = PTRTOUINT64(i->b_iodone); 1717 o->b_proc = PTRTOUINT64(i->b_proc); 1718 o->b_vp = PTRTOUINT64(i->b_vp); 1719 o->b_saveaddr = PTRTOUINT64(i->b_saveaddr); 1720 o->b_lblkno = i->b_lblkno; 1721 } 1722 1723 #define KERN_BUFSLOP 20 1724 static int 1725 sysctl_dobuf(SYSCTLFN_ARGS) 1726 { 1727 buf_t *bp; 1728 struct buf_sysctl bs; 1729 struct bqueue *bq; 1730 char *dp; 1731 u_int i, op, arg; 1732 size_t len, needed, elem_size, out_size; 1733 int error, elem_count, retries; 1734 1735 if (namelen == 1 && name[0] == CTL_QUERY) 1736 return (sysctl_query(SYSCTLFN_CALL(rnode))); 1737 1738 if (namelen != 4) 1739 return (EINVAL); 1740 1741 retries = 100; 1742 retry: 1743 dp = oldp; 1744 len = (oldp != NULL) ? *oldlenp : 0; 1745 op = name[0]; 1746 arg = name[1]; 1747 elem_size = name[2]; 1748 elem_count = name[3]; 1749 out_size = MIN(sizeof(bs), elem_size); 1750 1751 /* 1752 * at the moment, these are just "placeholders" to make the 1753 * API for retrieving kern.buf data more extensible in the 1754 * future. 1755 * 1756 * XXX kern.buf currently has "netbsd32" issues. hopefully 1757 * these will be resolved at a later point. 1758 */ 1759 if (op != KERN_BUF_ALL || arg != KERN_BUF_ALL || 1760 elem_size < 1 || elem_count < 0) 1761 return (EINVAL); 1762 1763 error = 0; 1764 needed = 0; 1765 sysctl_unlock(); 1766 mutex_enter(&bufcache_lock); 1767 for (i = 0; i < BQUEUES; i++) { 1768 bq = &bufqueues[i]; 1769 TAILQ_FOREACH(bp, &bq->bq_queue, b_freelist) { 1770 bq->bq_marker = bp; 1771 if (len >= elem_size && elem_count > 0) { 1772 sysctl_fillbuf(bp, &bs); 1773 mutex_exit(&bufcache_lock); 1774 error = copyout(&bs, dp, out_size); 1775 mutex_enter(&bufcache_lock); 1776 if (error) 1777 break; 1778 if (bq->bq_marker != bp) { 1779 /* 1780 * This sysctl node is only for 1781 * statistics. Retry; if the 1782 * queue keeps changing, then 1783 * bail out. 1784 */ 1785 if (retries-- == 0) { 1786 error = EAGAIN; 1787 break; 1788 } 1789 mutex_exit(&bufcache_lock); 1790 goto retry; 1791 } 1792 dp += elem_size; 1793 len -= elem_size; 1794 } 1795 if (elem_count > 0) { 1796 needed += elem_size; 1797 if (elem_count != INT_MAX) 1798 elem_count--; 1799 } 1800 } 1801 if (error != 0) 1802 break; 1803 } 1804 mutex_exit(&bufcache_lock); 1805 sysctl_relock(); 1806 1807 *oldlenp = needed; 1808 if (oldp == NULL) 1809 *oldlenp += KERN_BUFSLOP * sizeof(buf_t); 1810 1811 return (error); 1812 } 1813 1814 static int 1815 sysctl_bufvm_update(SYSCTLFN_ARGS) 1816 { 1817 int t, error, rv; 1818 struct sysctlnode node; 1819 1820 node = *rnode; 1821 node.sysctl_data = &t; 1822 t = *(int *)rnode->sysctl_data; 1823 error = sysctl_lookup(SYSCTLFN_CALL(&node)); 1824 if (error || newp == NULL) 1825 return (error); 1826 1827 if (t < 0) 1828 return EINVAL; 1829 if (rnode->sysctl_data == &bufcache) { 1830 if (t > 100) 1831 return (EINVAL); 1832 bufcache = t; 1833 buf_setwm(); 1834 } else if (rnode->sysctl_data == &bufmem_lowater) { 1835 if (bufmem_hiwater - t < 16) 1836 return (EINVAL); 1837 bufmem_lowater = t; 1838 } else if (rnode->sysctl_data == &bufmem_hiwater) { 1839 if (t - bufmem_lowater < 16) 1840 return (EINVAL); 1841 bufmem_hiwater = t; 1842 } else 1843 return (EINVAL); 1844 1845 /* Drain until below new high water mark */ 1846 sysctl_unlock(); 1847 mutex_enter(&bufcache_lock); 1848 while ((t = bufmem - bufmem_hiwater) >= 0) { 1849 rv = buf_drain(t / (2 * 1024)); 1850 if (rv <= 0) 1851 break; 1852 } 1853 mutex_exit(&bufcache_lock); 1854 sysctl_relock(); 1855 1856 return 0; 1857 } 1858 1859 SYSCTL_SETUP(sysctl_kern_buf_setup, "sysctl kern.buf subtree setup") 1860 { 1861 1862 sysctl_createv(clog, 0, NULL, NULL, 1863 CTLFLAG_PERMANENT, 1864 CTLTYPE_NODE, "kern", NULL, 1865 NULL, 0, NULL, 0, 1866 CTL_KERN, CTL_EOL); 1867 sysctl_createv(clog, 0, NULL, NULL, 1868 CTLFLAG_PERMANENT, 1869 CTLTYPE_NODE, "buf", 1870 SYSCTL_DESCR("Kernel buffer cache information"), 1871 sysctl_dobuf, 0, NULL, 0, 1872 CTL_KERN, KERN_BUF, CTL_EOL); 1873 } 1874 1875 SYSCTL_SETUP(sysctl_vm_buf_setup, "sysctl vm.buf* subtree setup") 1876 { 1877 1878 sysctl_createv(clog, 0, NULL, NULL, 1879 CTLFLAG_PERMANENT, 1880 CTLTYPE_NODE, "vm", NULL, 1881 NULL, 0, NULL, 0, 1882 CTL_VM, CTL_EOL); 1883 1884 sysctl_createv(clog, 0, NULL, NULL, 1885 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1886 CTLTYPE_INT, "bufcache", 1887 SYSCTL_DESCR("Percentage of physical memory to use for " 1888 "buffer cache"), 1889 sysctl_bufvm_update, 0, &bufcache, 0, 1890 CTL_VM, CTL_CREATE, CTL_EOL); 1891 sysctl_createv(clog, 0, NULL, NULL, 1892 CTLFLAG_PERMANENT|CTLFLAG_READONLY, 1893 CTLTYPE_INT, "bufmem", 1894 SYSCTL_DESCR("Amount of kernel memory used by buffer " 1895 "cache"), 1896 NULL, 0, &bufmem, 0, 1897 CTL_VM, CTL_CREATE, CTL_EOL); 1898 sysctl_createv(clog, 0, NULL, NULL, 1899 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1900 CTLTYPE_INT, "bufmem_lowater", 1901 SYSCTL_DESCR("Minimum amount of kernel memory to " 1902 "reserve for buffer cache"), 1903 sysctl_bufvm_update, 0, &bufmem_lowater, 0, 1904 CTL_VM, CTL_CREATE, CTL_EOL); 1905 sysctl_createv(clog, 0, NULL, NULL, 1906 CTLFLAG_PERMANENT|CTLFLAG_READWRITE, 1907 CTLTYPE_INT, "bufmem_hiwater", 1908 SYSCTL_DESCR("Maximum amount of kernel memory to use " 1909 "for buffer cache"), 1910 sysctl_bufvm_update, 0, &bufmem_hiwater, 0, 1911 CTL_VM, CTL_CREATE, CTL_EOL); 1912 } 1913 1914 #ifdef DEBUG 1915 /* 1916 * Print out statistics on the current allocation of the buffer pool. 1917 * Can be enabled to print out on every ``sync'' by setting "syncprt" 1918 * in vfs_syscalls.c using sysctl. 1919 */ 1920 void 1921 vfs_bufstats(void) 1922 { 1923 int i, j, count; 1924 buf_t *bp; 1925 struct bqueue *dp; 1926 int counts[(MAXBSIZE / PAGE_SIZE) + 1]; 1927 static const char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE" }; 1928 1929 for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) { 1930 count = 0; 1931 for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++) 1932 counts[j] = 0; 1933 TAILQ_FOREACH(bp, &dp->bq_queue, b_freelist) { 1934 counts[bp->b_bufsize/PAGE_SIZE]++; 1935 count++; 1936 } 1937 printf("%s: total-%d", bname[i], count); 1938 for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++) 1939 if (counts[j] != 0) 1940 printf(", %d-%d", j * PAGE_SIZE, counts[j]); 1941 printf("\n"); 1942 } 1943 } 1944 #endif /* DEBUG */ 1945 1946 /* ------------------------------ */ 1947 1948 buf_t * 1949 getiobuf(struct vnode *vp, bool waitok) 1950 { 1951 buf_t *bp; 1952 1953 bp = pool_cache_get(bufio_cache, (waitok ? PR_WAITOK : PR_NOWAIT)); 1954 if (bp == NULL) 1955 return bp; 1956 1957 buf_init(bp); 1958 1959 if ((bp->b_vp = vp) == NULL) 1960 bp->b_objlock = &buffer_lock; 1961 else 1962 bp->b_objlock = &vp->v_interlock; 1963 1964 return bp; 1965 } 1966 1967 void 1968 putiobuf(buf_t *bp) 1969 { 1970 1971 buf_destroy(bp); 1972 pool_cache_put(bufio_cache, bp); 1973 } 1974 1975 /* 1976 * nestiobuf_iodone: b_iodone callback for nested buffers. 1977 */ 1978 1979 void 1980 nestiobuf_iodone(buf_t *bp) 1981 { 1982 buf_t *mbp = bp->b_private; 1983 int error; 1984 int donebytes; 1985 1986 KASSERT(bp->b_bcount <= bp->b_bufsize); 1987 KASSERT(mbp != bp); 1988 1989 error = bp->b_error; 1990 if (bp->b_error == 0 && 1991 (bp->b_bcount < bp->b_bufsize || bp->b_resid > 0)) { 1992 /* 1993 * Not all got transfered, raise an error. We have no way to 1994 * propagate these conditions to mbp. 1995 */ 1996 error = EIO; 1997 } 1998 1999 donebytes = bp->b_bufsize; 2000 2001 putiobuf(bp); 2002 nestiobuf_done(mbp, donebytes, error); 2003 } 2004 2005 /* 2006 * nestiobuf_setup: setup a "nested" buffer. 2007 * 2008 * => 'mbp' is a "master" buffer which is being divided into sub pieces. 2009 * => 'bp' should be a buffer allocated by getiobuf. 2010 * => 'offset' is a byte offset in the master buffer. 2011 * => 'size' is a size in bytes of this nested buffer. 2012 */ 2013 2014 void 2015 nestiobuf_setup(buf_t *mbp, buf_t *bp, int offset, size_t size) 2016 { 2017 const int b_read = mbp->b_flags & B_READ; 2018 struct vnode *vp = mbp->b_vp; 2019 2020 KASSERT(mbp->b_bcount >= offset + size); 2021 bp->b_vp = vp; 2022 bp->b_objlock = mbp->b_objlock; 2023 bp->b_cflags = BC_BUSY; 2024 bp->b_flags = B_ASYNC | b_read; 2025 bp->b_iodone = nestiobuf_iodone; 2026 bp->b_data = (char *)mbp->b_data + offset; 2027 bp->b_resid = bp->b_bcount = size; 2028 bp->b_bufsize = bp->b_bcount; 2029 bp->b_private = mbp; 2030 BIO_COPYPRIO(bp, mbp); 2031 if (!b_read && vp != NULL) { 2032 mutex_enter(&vp->v_interlock); 2033 vp->v_numoutput++; 2034 mutex_exit(&vp->v_interlock); 2035 } 2036 } 2037 2038 /* 2039 * nestiobuf_done: propagate completion to the master buffer. 2040 * 2041 * => 'donebytes' specifies how many bytes in the 'mbp' is completed. 2042 * => 'error' is an errno(2) that 'donebytes' has been completed with. 2043 */ 2044 2045 void 2046 nestiobuf_done(buf_t *mbp, int donebytes, int error) 2047 { 2048 2049 if (donebytes == 0) { 2050 return; 2051 } 2052 mutex_enter(mbp->b_objlock); 2053 KASSERT(mbp->b_resid >= donebytes); 2054 mbp->b_resid -= donebytes; 2055 if (error) 2056 mbp->b_error = error; 2057 if (mbp->b_resid == 0) { 2058 mutex_exit(mbp->b_objlock); 2059 biodone(mbp); 2060 } else 2061 mutex_exit(mbp->b_objlock); 2062 } 2063 2064 void 2065 buf_init(buf_t *bp) 2066 { 2067 2068 LIST_INIT(&bp->b_dep); 2069 cv_init(&bp->b_busy, "biolock"); 2070 cv_init(&bp->b_done, "biowait"); 2071 bp->b_dev = NODEV; 2072 bp->b_error = 0; 2073 bp->b_flags = 0; 2074 bp->b_cflags = 0; 2075 bp->b_oflags = 0; 2076 bp->b_objlock = &buffer_lock; 2077 bp->b_iodone = NULL; 2078 bp->b_refcnt = 1; 2079 bp->b_dev = NODEV; 2080 bp->b_vnbufs.le_next = NOLIST; 2081 BIO_SETPRIO(bp, BPRIO_DEFAULT); 2082 } 2083 2084 void 2085 buf_destroy(buf_t *bp) 2086 { 2087 2088 cv_destroy(&bp->b_done); 2089 cv_destroy(&bp->b_busy); 2090 } 2091 2092 int 2093 bbusy(buf_t *bp, bool intr, int timo, kmutex_t *interlock) 2094 { 2095 int error; 2096 2097 KASSERT(mutex_owned(&bufcache_lock)); 2098 2099 if ((bp->b_cflags & BC_BUSY) != 0) { 2100 if (curlwp == uvm.pagedaemon_lwp) 2101 return EDEADLK; 2102 bp->b_cflags |= BC_WANTED; 2103 bref(bp); 2104 if (interlock != NULL) 2105 mutex_exit(interlock); 2106 if (intr) { 2107 error = cv_timedwait_sig(&bp->b_busy, &bufcache_lock, 2108 timo); 2109 } else { 2110 error = cv_timedwait(&bp->b_busy, &bufcache_lock, 2111 timo); 2112 } 2113 brele(bp); 2114 if (interlock != NULL) 2115 mutex_enter(interlock); 2116 if (error != 0) 2117 return error; 2118 return EPASSTHROUGH; 2119 } 2120 bp->b_cflags |= BC_BUSY; 2121 2122 return 0; 2123 } 2124