/*	$OpenBSD: vfs_bio.c,v 1.214 2024/11/05 17:28:31 mpi Exp $	*/
/*	$NetBSD: vfs_bio.c,v 1.44 1996/06/11 11:15:36 pk Exp $	*/

/*
 * Copyright (c) 1994 Christopher G. Demetriou
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_bio.c	8.6 (Berkeley) 1/11/94
 */

/*
 * Some references:
 *	Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
 *	Leffler, et al.: The Design and Implementation of the 4.3BSD
 *		UNIX Operating System (Addison-Wesley, 1989)
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/specdev.h>
#include <sys/tracepoint.h>
#include <uvm/uvm_extern.h>

/* XXX Should really be in buf.h, but for uvm_constraint_range.. */
int	buf_realloc_pages(struct buf *, struct uvm_constraint_range *, int);

struct uvm_constraint_range high_constraint;
int fliphigh;

int nobuffers;
int needbuffer;

/* private bufcache functions */
void bufcache_init(void);
void bufcache_adjust(void);
struct buf *bufcache_gethighcleanbuf(void);
struct buf *bufcache_getdmacleanbuf(void);

/*
 * Buffer pool for I/O buffers.
 */
struct pool bufpool;
struct bufhead bufhead = LIST_HEAD_INITIALIZER(bufhead);
void buf_put(struct buf *);

struct buf *bio_doread(struct vnode *, daddr_t, int, int);
struct buf *buf_get(struct vnode *, daddr_t, size_t);
void bread_cluster_callback(struct buf *);
int64_t bufcache_recover_dmapages(int discard, int64_t howmany);
static struct buf *incore_locked(struct vnode *vp, daddr_t blkno);

struct bcachestats bcstats;	/* counters */
long lodirtypages;	/* dirty page count low water mark */
long hidirtypages;	/* dirty page count high water mark */
long targetpages;	/* target number of pages for cache size */
long buflowpages;	/* smallest size cache allowed */
long bufhighpages;	/* largest size cache allowed */
long bufbackpages;	/* minimum number of pages we shrink when asked to */

vsize_t bufkvm;

struct proc *cleanerproc;
int bd_req;		/* Sleep point for cleaner daemon. */

#define NUM_CACHES 2
#define DMA_CACHE 0
struct bufcache cleancache[NUM_CACHES];
struct bufqueue dirtyqueue;

/*
 * Remove a buffer from the global buffer list and, once its memory has
 * been released, return it to the buffer pool.  Must be called at splbio().
 */
void
buf_put(struct buf *bp)
{
	splassert(IPL_BIO);

#ifdef DIAGNOSTIC
	if (bp->b_pobj != NULL)
		KASSERT(bp->b_bufsize > 0);
	if (ISSET(bp->b_flags, B_DELWRI))
		panic("buf_put: releasing dirty buffer");
	if (bp->b_freelist.tqe_next != NOLIST &&
	    bp->b_freelist.tqe_next != (void *)-1)
		panic("buf_put: still on the free list");
	if (bp->b_vnbufs.le_next != NOLIST &&
	    bp->b_vnbufs.le_next != (void *)-1)
		panic("buf_put: still on the vnode list");
#endif

	LIST_REMOVE(bp, b_list);
	bcstats.numbufs--;

	if (buf_dealloc_mem(bp) != 0)
		return;
	pool_put(&bufpool, bp);
}

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit(void)
{
	u_int64_t dmapages;
	u_int64_t highpages;

	dmapages = uvm_pagecount(&dma_constraint);
	/* take away a guess at how much of this the kernel will consume */
	dmapages -= (atop(physmem) - atop(uvmexp.free));

	/* See if we have memory above the dma accessible region. */
	high_constraint.ucr_low = dma_constraint.ucr_high;
	high_constraint.ucr_high = no_constraint.ucr_high;
	if (high_constraint.ucr_low != high_constraint.ucr_high)
		high_constraint.ucr_low++;
	highpages = uvm_pagecount(&high_constraint);

	/*
	 * Do we have any significant amount of high memory above
	 * the DMA region?  If so, enable moving buffers there; if not,
	 * don't bother.
	 */
	if (highpages > dmapages / 4)
		fliphigh = 1;
	else
		fliphigh = 0;

	/*
	 * If MD code doesn't say otherwise, use up to 10% of DMA'able
	 * memory for buffers.
	 */
	if (bufcachepercent == 0)
		bufcachepercent = 10;

	/*
	 * XXX these values and their same use in kern_sysctl
	 * need to move into buf.h
	 */
	KASSERT(bufcachepercent <= 90);
	KASSERT(bufcachepercent >= 5);
	if (bufpages == 0)
		bufpages = dmapages * bufcachepercent / 100;
	if (bufpages < BCACHE_MIN)
		bufpages = BCACHE_MIN;
	KASSERT(bufpages < dmapages);

	bufhighpages = bufpages;

	/*
	 * Set the base backoff level for the buffer cache.  We will
	 * not allow uvm to steal back more than this number of pages.
	 */
	buflowpages = dmapages * 5 / 100;
	if (buflowpages < BCACHE_MIN)
		buflowpages = BCACHE_MIN;

	/*
	 * Set bufbackpages to 100 pages, or 10 percent of the low water mark
	 * if we don't have that many pages.
	 */

	bufbackpages = buflowpages * 10 / 100;
	if (bufbackpages > 100)
		bufbackpages = 100;

	/*
	 * If the MD code does not say otherwise, reserve 10% of kva
	 * space for mapping buffers.
	 */
	if (bufkvm == 0)
		bufkvm = VM_KERNEL_SPACE_SIZE / 10;

	/*
	 * Don't use more than twice the amount of bufpages for mappings.
	 * It's twice since we map things sparsely.
	 */
	if (bufkvm > bufpages * PAGE_SIZE)
		bufkvm = bufpages * PAGE_SIZE;
	/*
	 * Round bufkvm to MAXPHYS because we allocate chunks of va space
	 * in MAXPHYS chunks.
	 */
	bufkvm &= ~(MAXPHYS - 1);

	pool_init(&bufpool, sizeof(struct buf), 0, IPL_BIO, 0, "bufpl", NULL);

	bufcache_init();

	/*
	 * hmm - bufkvm is an argument because it's static, while
	 * bufpages is global because it can change while running.
	 */
	buf_mem_init(bufkvm);

	/*
	 * Set the dirty page high water mark to be less than the low
	 * water mark for pages in the buffer cache.  This ensures we
	 * can always back off by throwing away clean pages, and give
	 * ourselves a chance to write out the dirty pages eventually.
	 */
	hidirtypages = (buflowpages / 4) * 3;
	lodirtypages = buflowpages / 2;

	/*
	 * We are allowed to use up to the reserve.
	 */
	targetpages = bufpages - RESERVE_PAGES;
}

/*
 * Change the size of the buffer cache: adjust bufpages and the page
 * target, and shed DMA pages immediately if we are now over target.
 */
void
bufadjust(int newbufpages)
{
	int s;
	int64_t npages;

	if (newbufpages < buflowpages)
		newbufpages = buflowpages;

	s = splbio();
	bufpages = newbufpages;

	/*
	 * We are allowed to use up to the reserve
	 */
	targetpages = bufpages - RESERVE_PAGES;

	npages = bcstats.dmapages - targetpages;

	/*
	 * Shrinking the cache happens here only if someone has manually
	 * adjusted bufcachepercent - or the pagedaemon has told us
	 * to give back memory *now* - so we give it all back.
	 */
	if (bcstats.dmapages > targetpages)
		(void) bufcache_recover_dmapages(0,
		    bcstats.dmapages - targetpages);
	bufcache_adjust();

	/*
	 * Wake up the cleaner if we have lots of dirty pages,
	 * or if we are getting low on buffer cache kva.
	 */
	if ((UNCLEAN_PAGES >= hidirtypages) ||
	    bcstats.kvaslots_avail <= 2 * RESERVE_SLOTS)
		wakeup(&bd_req);

	splx(s);
}

/*
 * Back off "size" buffer cache pages.  Called by the page
 * daemon to consume buffer cache pages rather than scanning.
 *
 * It returns the number of freed pages.
 */
unsigned long
bufbackoff(struct uvm_constraint_range *range, long size)
{
	long pdelta, oldbufpages;
	int64_t dmarecovered, recovered = 0;

	/*
	 * If we will accept high memory for this backoff
	 * try to steal it from the high memory buffer cache.
	 */
	if (range != NULL && range->ucr_high > dma_constraint.ucr_high) {
		struct buf *bp;
		int64_t start;
		int s;

		start = bcstats.numbufpages;

		s = splbio();
		while (recovered < size && (bp = bufcache_gethighcleanbuf())) {
			bufcache_take(bp);
			if (bp->b_vp) {
				RBT_REMOVE(buf_rb_bufs,
				    &bp->b_vp->v_bufs_tree, bp);
				brelvp(bp);
			}
			buf_put(bp);
			recovered = start - bcstats.numbufpages;
		}
		bufcache_adjust();
		splx(s);

		/* We got enough. */
		if (recovered >= size)
			return recovered;

		/* If we needed only memory above DMA, we're done. */
		if (range->ucr_low > dma_constraint.ucr_high)
			return recovered;

		/* Otherwise get the rest from DMA */
		size -= recovered;
	}

	/*
	 * XXX Otherwise do the dma memory cache dance.  This needs
	 * refactoring later to get rid of 'bufpages'.
	 */

	/*
	 * Back off by at least bufbackpages.  If the page daemon gave us
	 * a larger size, back off by that much.
	 */
	pdelta = (size > bufbackpages) ? size : bufbackpages;

	if (bufpages <= buflowpages)
		return recovered;
	if (bufpages - pdelta < buflowpages)
		pdelta = bufpages - buflowpages;
	oldbufpages = bufpages;
	bufadjust(bufpages - pdelta);
	dmarecovered = oldbufpages - bufpages;

	return recovered + dmarecovered;
}


/*
 * Opportunistically flip a buffer into high memory.  Will move the buffer
 * if memory is available without sleeping, and return 0, otherwise will
 * fail and return -1 with the buffer unchanged.
 */

int
buf_flip_high(struct buf *bp)
{
	int ret = -1;

	KASSERT(ISSET(bp->b_flags, B_BC));
	KASSERT(ISSET(bp->b_flags, B_DMA));
	KASSERT(bp->cache == DMA_CACHE);
	KASSERT(fliphigh);

	splassert(IPL_BIO);

	/* Attempt to move the buffer to high memory if we can */
	if (buf_realloc_pages(bp, &high_constraint, UVM_PLA_NOWAIT) == 0) {
		KASSERT(!ISSET(bp->b_flags, B_DMA));
		bcstats.highflips++;
		ret = 0;
	} else
		bcstats.highflops++;

	return ret;
}

/*
 * Flip a buffer to dma reachable memory, when we need it there for
 * I/O.  This can sleep since it will wait for memory allocation in the
 * DMA reachable area since we have to have the buffer there to proceed.
 */
void
buf_flip_dma(struct buf *bp)
{
	KASSERT(ISSET(bp->b_flags, B_BC));
	KASSERT(ISSET(bp->b_flags, B_BUSY));
	KASSERT(bp->cache < NUM_CACHES);

	splassert(IPL_BIO);

	if (!ISSET(bp->b_flags, B_DMA)) {

		/* move buf to dma reachable memory */
		(void) buf_realloc_pages(bp, &dma_constraint, UVM_PLA_WAITOK);
		KASSERT(ISSET(bp->b_flags, B_DMA));
		bcstats.dmaflips++;
	}

	if (bp->cache > DMA_CACHE) {
		CLR(bp->b_flags, B_COLD);
		CLR(bp->b_flags, B_WARM);
		bp->cache = DMA_CACHE;
	}
}

/*
 * Get a buffer for the given block and start a read into it if it does
 * not already contain valid data.
 */
struct buf *
bio_doread(struct vnode *vp, daddr_t blkno, int size, int async)
{
	struct buf *bp;
	struct mount *mp;

	bp = getblk(vp, blkno, size, 0, INFSLP);

	/*
	 * If buffer does not have valid data, start a read.
	 * Note that if buffer is B_INVAL, getblk() won't return it.
	 * Therefore, it's valid if its I/O has completed or been delayed.
	 */
	if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
		SET(bp->b_flags, B_READ | async);
		bcstats.pendingreads++;
		bcstats.numreads++;
		VOP_STRATEGY(bp->b_vp, bp);
		/* Pay for the read. */
		curproc->p_ru.ru_inblock++;		/* XXX */
	} else if (async) {
		brelse(bp);
	}

	mp = vp->v_type == VBLK ? vp->v_specmountpoint : vp->v_mount;

	/*
	 * Collect statistics on synchronous and asynchronous reads.
	 * Reads from block devices are charged to their associated
	 * filesystem (if any).
	 */
	if (mp != NULL) {
		if (async == 0)
			mp->mnt_stat.f_syncreads++;
		else
			mp->mnt_stat.f_asyncreads++;
	}

	return (bp);
}

/*
 * Read a disk block.
 * This algorithm described in Bach (p.54).
 */
int
bread(struct vnode *vp, daddr_t blkno, int size, struct buf **bpp)
{
	struct buf *bp;

	/* Get buffer for block. */
	bp = *bpp = bio_doread(vp, blkno, size, 0);

	/* Wait for the read to complete, and return result. */
	return (biowait(bp));
}

/*
 * Read-ahead multiple disk blocks.  The first is sync, the rest async.
 * Trivial modification to the breada algorithm presented in Bach (p.55).
 */
int
breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t rablks[],
    int rasizes[], int nrablks, struct buf **bpp)
{
	struct buf *bp;
	int i;

	bp = *bpp = bio_doread(vp, blkno, size, 0);

	/*
	 * For each of the read-ahead blocks, start a read, if necessary.
	 */
	for (i = 0; i < nrablks; i++) {
		/* If it's in the cache, just go on to next one. */
		if (incore(vp, rablks[i]))
			continue;

		/* Get a buffer for the read-ahead block */
		(void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC);
	}

	/* Otherwise, we had to start a read for it; wait until it's valid. */
	return (biowait(bp));
}

/*
 * Called from interrupt context.
 */
void
bread_cluster_callback(struct buf *bp)
{
	struct buf **xbpp = bp->b_saveaddr;
	int i;

	if (xbpp[1] != NULL) {
		size_t newsize = xbpp[1]->b_bufsize;

		/*
		 * Shrink this buffer's mapping to only cover its part of
		 * the total I/O.
		 */
		buf_fix_mapping(bp, newsize);
		bp->b_bcount = newsize;
	}

	/* Invalidate read-ahead buffers if read short */
	if (bp->b_resid > 0) {
		for (i = 1; xbpp[i] != NULL; i++)
			continue;
		for (i = i - 1; i != 0; i--) {
			if (xbpp[i]->b_bufsize <= bp->b_resid) {
				bp->b_resid -= xbpp[i]->b_bufsize;
				SET(xbpp[i]->b_flags, B_INVAL);
			} else if (bp->b_resid > 0) {
				bp->b_resid = 0;
				SET(xbpp[i]->b_flags, B_INVAL);
			} else
				break;
		}
	}

	for (i = 1; xbpp[i] != NULL; i++) {
		if (ISSET(bp->b_flags, B_ERROR))
			SET(xbpp[i]->b_flags, B_INVAL | B_ERROR);
		/*
		 * Move the pages from the master buffer's uvm object
		 * into the individual buffer's uvm objects.
		 */
		struct uvm_object *newobj = &xbpp[i]->b_uobj;
		struct uvm_object *oldobj = &bp->b_uobj;
		int page;

		uvm_obj_init(newobj, &bufcache_pager, 1);
		for (page = 0; page < atop(xbpp[i]->b_bufsize); page++) {
			struct vm_page *pg = uvm_pagelookup(oldobj,
			    xbpp[i]->b_poffs + ptoa(page));
			KASSERT(pg != NULL);
			KASSERT(pg->wire_count == 1);
			uvm_pagerealloc(pg, newobj,
			    xbpp[i]->b_poffs + ptoa(page));
		}
		xbpp[i]->b_pobj = newobj;

		biodone(xbpp[i]);
	}

	free(xbpp, M_TEMP, (i + 1) * sizeof(*xbpp));

	if (ISSET(bp->b_flags, B_ASYNC)) {
		brelse(bp);
	} else {
		CLR(bp->b_flags, B_WANTED);
		wakeup(bp);
	}
}

/*
 * Read-ahead multiple disk blocks, but make sure only one (big) I/O
 * request is sent to the disk.
 * XXX This should probably be dropped and breadn should instead be optimized
 * XXX to do fewer I/O requests.
 */
int
bread_cluster(struct vnode *vp, daddr_t blkno, int size, struct buf **rbpp)
{
	struct buf *bp, **xbpp;
	int howmany, maxra, i, inc;
	daddr_t sblkno;

	*rbpp = bio_doread(vp, blkno, size, 0);

	/*
	 * If the buffer is in the cache skip any I/O operation.
	 */
	if (ISSET((*rbpp)->b_flags, B_CACHE))
		goto out;

	if (size != round_page(size))
		goto out;

	if (VOP_BMAP(vp, blkno + 1, NULL, &sblkno, &maxra))
		goto out;

	maxra++;
	if (sblkno == -1 || maxra < 2)
		goto out;

	howmany = MAXPHYS / size;
	if (howmany > maxra)
		howmany = maxra;

	xbpp = mallocarray(howmany + 1, sizeof(*xbpp), M_TEMP, M_NOWAIT);
	if (xbpp == NULL)
		goto out;

	for (i = howmany - 1; i >= 0; i--) {
		size_t sz;

		/*
		 * First buffer allocates big enough size to cover what
		 * all the other buffers need.
		 */
		sz = i == 0 ? howmany * size : 0;

		xbpp[i] = buf_get(vp, blkno + i + 1, sz);
		if (xbpp[i] == NULL) {
			for (++i; i < howmany; i++) {
				SET(xbpp[i]->b_flags, B_INVAL);
				brelse(xbpp[i]);
			}
			free(xbpp, M_TEMP, (howmany + 1) * sizeof(*xbpp));
			goto out;
		}
	}

	bp = xbpp[0];

	xbpp[howmany] = NULL;

	inc = btodb(size);

	for (i = 1; i < howmany; i++) {
		bcstats.pendingreads++;
		bcstats.numreads++;
		/*
		 * We set B_DMA here because bp above will be B_DMA,
		 * and we are playing buffer slice-n-dice games from
		 * the memory allocated in bp.
		 */
		SET(xbpp[i]->b_flags, B_DMA | B_READ | B_ASYNC);
		xbpp[i]->b_blkno = sblkno + (i * inc);
		xbpp[i]->b_bufsize = xbpp[i]->b_bcount = size;
		xbpp[i]->b_data = NULL;
		xbpp[i]->b_pobj = bp->b_pobj;
		xbpp[i]->b_poffs = bp->b_poffs + (i * size);
	}

	KASSERT(bp->b_lblkno == blkno + 1);
	KASSERT(bp->b_vp == vp);

	bp->b_blkno = sblkno;
	SET(bp->b_flags, B_READ | B_ASYNC | B_CALL);

	bp->b_saveaddr = (void *)xbpp;
	bp->b_iodone = bread_cluster_callback;

	bcstats.pendingreads++;
	bcstats.numreads++;
	VOP_STRATEGY(bp->b_vp, bp);
	curproc->p_ru.ru_inblock++;

out:
	return (biowait(*rbpp));
}

/*
 * Block write.  Described in Bach (p.56)
 */
int
bwrite(struct buf *bp)
{
	int rv, async, wasdelayed, s;
	struct vnode *vp;
	struct mount *mp;

	vp = bp->b_vp;
	if (vp != NULL)
		mp = vp->v_type == VBLK ? vp->v_specmountpoint : vp->v_mount;
	else
		mp = NULL;

	/*
	 * Remember buffer type, to switch on it later.  If the write was
	 * synchronous, but the file system was mounted with MNT_ASYNC,
	 * convert it to a delayed write.
	 * XXX note that this relies on delayed tape writes being converted
	 * to async, not sync writes (which is safe, but ugly).
	 */
	async = ISSET(bp->b_flags, B_ASYNC);
	if (!async && mp && ISSET(mp->mnt_flag, MNT_ASYNC)) {
		/*
		 * Don't convert writes from VND on async filesystems
		 * that already have delayed writes in the upper layer.
		 */
		if (!ISSET(bp->b_flags, B_NOCACHE)) {
			bdwrite(bp);
			return (0);
		}
	}

	/*
	 * Collect statistics on synchronous and asynchronous writes.
	 * Writes to block devices are charged to their associated
	 * filesystem (if any).
	 */
	if (mp != NULL) {
		if (async)
			mp->mnt_stat.f_asyncwrites++;
		else
			mp->mnt_stat.f_syncwrites++;
	}
	bcstats.pendingwrites++;
	bcstats.numwrites++;

	wasdelayed = ISSET(bp->b_flags, B_DELWRI);
	CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));

	s = splbio();

	/*
	 * If not synchronous, pay for the I/O operation and make
	 * sure the buf is on the correct vnode queue.  We have
	 * to do this now, because if we don't, the vnode may not
	 * be properly notified that its I/O has completed.
	 */
	if (wasdelayed) {
		reassignbuf(bp);
	} else
		curproc->p_ru.ru_oublock++;


	/* Initiate disk write.  Make sure the appropriate party is charged. */
	bp->b_vp->v_numoutput++;
	buf_flip_dma(bp);
	SET(bp->b_flags, B_WRITEINPROG);
	splx(s);
	VOP_STRATEGY(bp->b_vp, bp);

	/*
	 * If the queue is above the high water mark, wait till
	 * the number of outstanding write bufs drops below the low
	 * water mark.
	 */
	if (bp->b_bq)
		bufq_wait(bp->b_bq);

	if (async)
		return (0);

	/*
	 * If I/O was synchronous, wait for it to complete.
	 */
	rv = biowait(bp);

	/* Release the buffer. */
	brelse(bp);

	return (rv);
}


/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 *
 * Described in Leffler, et al. (pp. 208-213).
 */
void
bdwrite(struct buf *bp)
{
	int s;

	/*
	 * If the block hasn't been seen before:
	 *	(1) Mark it as having been seen,
	 *	(2) Charge for the write.
	 *	(3) Make sure it's on its vnode's correct block list,
	 *	(4) If a buffer is rewritten, move it to end of dirty list
	 */
	if (!ISSET(bp->b_flags, B_DELWRI)) {
		SET(bp->b_flags, B_DELWRI);
		s = splbio();
		buf_flip_dma(bp);
		reassignbuf(bp);
		splx(s);
		curproc->p_ru.ru_oublock++;		/* XXX */
	}

	/* The "write" is done, so mark and release the buffer. */
	CLR(bp->b_flags, B_NEEDCOMMIT);
	CLR(bp->b_flags, B_NOCACHE); /* Must cache delayed writes */
	SET(bp->b_flags, B_DONE);
	brelse(bp);
}

/*
 * Asynchronous block write; just an asynchronous bwrite().
 */
void
bawrite(struct buf *bp)
{

	SET(bp->b_flags, B_ASYNC);
	VOP_BWRITE(bp);
}

/*
 * Must be called at splbio()
 */
void
buf_dirty(struct buf *bp)
{
	splassert(IPL_BIO);

#ifdef DIAGNOSTIC
	if (!ISSET(bp->b_flags, B_BUSY))
		panic("Trying to dirty buffer on freelist!");
#endif

	if (ISSET(bp->b_flags, B_DELWRI) == 0) {
		SET(bp->b_flags, B_DELWRI);
		buf_flip_dma(bp);
		reassignbuf(bp);
	}
}

/*
 * Must be called at splbio()
 */
void
buf_undirty(struct buf *bp)
{
	splassert(IPL_BIO);

#ifdef DIAGNOSTIC
	if (!ISSET(bp->b_flags, B_BUSY))
		panic("Trying to undirty buffer on freelist!");
#endif
	if (ISSET(bp->b_flags, B_DELWRI)) {
		CLR(bp->b_flags, B_DELWRI);
		reassignbuf(bp);
	}
}

/*
 * Release a buffer on to the free lists.
 * Described in Bach (p. 46).
 */
void
brelse(struct buf *bp)
{
	int s;

	s = splbio();

	if (bp->b_data != NULL)
		KASSERT(bp->b_bufsize > 0);

	/*
	 * Determine which queue the buffer should be on, then put it there.
	 */

	/* If it's not cacheable, or an error, mark it invalid. */
	if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
		SET(bp->b_flags, B_INVAL);
	/* If it's a write error, also mark the vnode as damaged. */
	if (ISSET(bp->b_flags, B_ERROR) && !ISSET(bp->b_flags, B_READ)) {
		if (bp->b_vp && bp->b_vp->v_type == VREG)
			SET(bp->b_vp->v_bioflag, VBIOERROR);
	}

	if (ISSET(bp->b_flags, B_INVAL)) {
		/*
		 * If the buffer is invalid, free it now rather than leaving
		 * it in a queue and wasting memory.
		 */
		if (ISSET(bp->b_flags, B_DELWRI)) {
			CLR(bp->b_flags, B_DELWRI);
		}

		if (bp->b_vp) {
			RBT_REMOVE(buf_rb_bufs, &bp->b_vp->v_bufs_tree, bp);
			brelvp(bp);
		}
		bp->b_vp = NULL;

		/*
		 * Wake up any processes waiting for _this_ buffer to
		 * become free.  They are not allowed to grab it
		 * since it will be freed.  But the only sleeper is
		 * getblk and it will restart the operation after
		 * sleep.
		 */
		if (ISSET(bp->b_flags, B_WANTED)) {
			CLR(bp->b_flags, B_WANTED);
			wakeup(bp);
		}
		buf_put(bp);
	} else {
		/*
		 * It has valid data.  Put it on the end of the appropriate
		 * queue, so that it'll stick around for as long as possible.
		 */
		bufcache_release(bp);

		/* Unlock the buffer. */
		CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE | B_DEFERRED));
		buf_release(bp);

		/* Wake up any processes waiting for _this_ buffer to
		 * become free. */
		if (ISSET(bp->b_flags, B_WANTED)) {
			CLR(bp->b_flags, B_WANTED);
			wakeup(bp);
		}

		if (bcstats.dmapages > targetpages)
			(void) bufcache_recover_dmapages(0,
			    bcstats.dmapages - targetpages);
		bufcache_adjust();
	}

	/* Wake up syncer and cleaner processes waiting for buffers. */
	if (nobuffers) {
		nobuffers = 0;
		wakeup(&nobuffers);
	}

	/* Wake up any processes waiting for any buffer to become free. */
	if (needbuffer && bcstats.dmapages < targetpages &&
	    bcstats.kvaslots_avail > RESERVE_SLOTS) {
		needbuffer = 0;
		wakeup(&needbuffer);
	}

	splx(s);
}

/*
 * Determine if a block is in the cache.  Just look on what would be its hash
 * chain.  If it's there, return a pointer to it, unless it's marked invalid.
 */
static struct buf *
incore_locked(struct vnode *vp, daddr_t blkno)
{
	struct buf *bp;
	struct buf b;

	splassert(IPL_BIO);

	/* Search buf lookup tree */
	b.b_lblkno = blkno;
	bp = RBT_FIND(buf_rb_bufs, &vp->v_bufs_tree, &b);
	if (bp != NULL && ISSET(bp->b_flags, B_INVAL))
		bp = NULL;

	return (bp);
}

/* As above, but may be called without already being at splbio(). */
struct buf *
incore(struct vnode *vp, daddr_t blkno)
{
	struct buf *bp;
	int s;

	s = splbio();
	bp = incore_locked(vp, blkno);
	splx(s);

	return (bp);
}

/*
 * Get a block of requested size that is associated with
 * a given vnode and block offset.  If it is found in the
 * block cache, mark it as having been found, make it busy
 * and return it.  Otherwise, return an empty block of the
 * correct size.  It is up to the caller to ensure that the
 * cached blocks are of the correct size.
 */
struct buf *
getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag,
    uint64_t slptimeo)
{
	struct buf *bp;
	struct buf b;
	int s, error;

	/*
	 * XXX
	 * The following is an inlined version of 'incore()', but with
	 * the 'invalid' test moved to after the 'busy' test.  It's
	 * necessary because there are some cases in which the NFS
	 * code sets B_INVAL prior to writing data to the server, but
	 * in which the buffers actually contain valid data.  In this
	 * case, we can't allow the system to allocate a new buffer for
	 * the block until the write is finished.
	 */
start:
	s = splbio();
	b.b_lblkno = blkno;
	bp = RBT_FIND(buf_rb_bufs, &vp->v_bufs_tree, &b);
	if (bp != NULL) {
		if (ISSET(bp->b_flags, B_BUSY)) {
			SET(bp->b_flags, B_WANTED);
			error = tsleep_nsec(bp, slpflag | (PRIBIO + 1),
			    "getblk", slptimeo);
			splx(s);
			if (error)
				return (NULL);
			goto start;
		}

		if (!ISSET(bp->b_flags, B_INVAL)) {
			bcstats.cachehits++;
			SET(bp->b_flags, B_CACHE);
			bufcache_take(bp);
			buf_acquire(bp);
			splx(s);
			return (bp);
		}
	}
	splx(s);

	if ((bp = buf_get(vp, blkno, size)) == NULL)
		goto start;

	return (bp);
}

/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(size_t size)
{
	struct buf *bp;

	while ((bp = buf_get(NULL, 0, size)) == NULL)
		continue;

	return (bp);
}

/*
 * Allocate a buffer.
 * If vp is given, put it into the buffer cache for that vnode.
 * If size != 0, allocate memory and call buf_map().
 * If there is already a buffer for the given vnode/blkno, return NULL.
 */
struct buf *
buf_get(struct vnode *vp, daddr_t blkno, size_t size)
{
	struct buf *bp;
	int poolwait = size == 0 ? PR_NOWAIT : PR_WAITOK;
	int npages;
	int s;

	s = splbio();
	if (size) {
		/*
		 * Wake up the cleaner if we have lots of dirty pages,
		 * or if we are getting low on buffer cache kva.
		 */
		if (UNCLEAN_PAGES >= hidirtypages ||
		    bcstats.kvaslots_avail <= 2 * RESERVE_SLOTS)
			wakeup(&bd_req);

		npages = atop(round_page(size));

		/*
		 * if our cache has been previously shrunk,
		 * allow it to grow again with use up to
		 * bufhighpages (cachepercent)
		 */
		if (bufpages < bufhighpages)
			bufadjust(bufhighpages);

		/*
		 * If we would go over the page target with our
		 * new allocation, free enough buffers first
		 * to stay at the target with our new allocation.
		 */
		if (bcstats.dmapages + npages > targetpages) {
			(void) bufcache_recover_dmapages(0, npages);
			bufcache_adjust();
		}

		/*
		 * If we get here, we tried to free the world down
		 * above, and couldn't get down - Wake the cleaner
		 * and wait for it to push some buffers out.
		 */
		if ((bcstats.dmapages + npages > targetpages ||
		    bcstats.kvaslots_avail <= RESERVE_SLOTS) &&
		    curproc != syncerproc && curproc != cleanerproc) {
			wakeup(&bd_req);
			needbuffer++;
			tsleep_nsec(&needbuffer, PRIBIO, "needbuffer", INFSLP);
			splx(s);
			return (NULL);
		}
		if (bcstats.dmapages + npages > bufpages) {
			/* cleaner or syncer */
			nobuffers = 1;
			tsleep_nsec(&nobuffers, PRIBIO, "nobuffers", INFSLP);
			splx(s);
			return (NULL);
		}
	}

	bp = pool_get(&bufpool, poolwait|PR_ZERO);

	if (bp == NULL) {
		splx(s);
		return (NULL);
	}

	bp->b_freelist.tqe_next = NOLIST;
	bp->b_dev = NODEV;
	bp->b_bcount = size;

	buf_acquire_nomap(bp);

	if (vp != NULL) {
		/*
		 * We insert the buffer into the hash with B_BUSY set
		 * while we allocate pages for it.  This way any getblk
		 * that happens while we allocate pages will wait for
		 * this buffer instead of starting its own buf_get.
		 *
		 * But first, we check if someone beat us to it.
		 */
		if (incore_locked(vp, blkno)) {
			pool_put(&bufpool, bp);
			splx(s);
			return (NULL);
		}

		bp->b_blkno = bp->b_lblkno = blkno;
		bgetvp(vp, bp);
		if (RBT_INSERT(buf_rb_bufs, &vp->v_bufs_tree, bp))
			panic("buf_get: dup lblk vp %p bp %p", vp, bp);
	} else {
		bp->b_vnbufs.le_next = NOLIST;
		SET(bp->b_flags, B_INVAL);
		bp->b_vp = NULL;
	}

	LIST_INSERT_HEAD(&bufhead, bp, b_list);
	bcstats.numbufs++;

	if (size) {
		buf_alloc_pages(bp, round_page(size));
		KASSERT(ISSET(bp->b_flags, B_DMA));
		buf_map(bp);
	}

	SET(bp->b_flags, B_BC);
	splx(s);

	return (bp);
}

/*
 * Buffer cleaning daemon.
 */
void
buf_daemon(void *arg)
{
	struct buf *bp = NULL;
	int s, pushed = 0;

	s = splbio();
	for (;;) {
		if (bp == NULL || (pushed >= 16 &&
		    UNCLEAN_PAGES < hidirtypages &&
		    bcstats.kvaslots_avail > 2 * RESERVE_SLOTS)) {
			pushed = 0;
			/*
			 * Wake up anyone who was waiting for buffers
			 * to be released.
			 */
			if (needbuffer) {
				needbuffer = 0;
				wakeup(&needbuffer);
			}
			tsleep_nsec(&bd_req, PRIBIO - 7, "cleaner", INFSLP);
		}

		while ((bp = bufcache_getdirtybuf())) {
			TRACEPOINT(vfs, cleaner, bp->b_flags, pushed,
			    lodirtypages, hidirtypages);

			if (UNCLEAN_PAGES < lodirtypages &&
			    bcstats.kvaslots_avail > 2 * RESERVE_SLOTS &&
			    pushed >= 16)
				break;

			bufcache_take(bp);
			buf_acquire(bp);
			splx(s);

			if (ISSET(bp->b_flags, B_INVAL)) {
				brelse(bp);
				s = splbio();
				continue;
			}
#ifdef DIAGNOSTIC
			if (!ISSET(bp->b_flags, B_DELWRI))
				panic("Clean buffer on dirty queue");
#endif
			bawrite(bp);
			pushed++;

			sched_pause(yield);

			s = splbio();
		}
	}
}

/*
 * Wait for operations on the buffer to complete.
 * When they do, extract and return the I/O's error value.
 */
int
biowait(struct buf *bp)
{
	int s;

	KASSERT(!(bp->b_flags & B_ASYNC));

	s = splbio();
	while (!ISSET(bp->b_flags, B_DONE))
		tsleep_nsec(bp, PRIBIO + 1, "biowait", INFSLP);
	splx(s);

	/* check for interruption of I/O (e.g. via NFS), then errors. */
	if (ISSET(bp->b_flags, B_EINTR)) {
		CLR(bp->b_flags, B_EINTR);
		return (EINTR);
	}

	if (ISSET(bp->b_flags, B_ERROR))
		return (bp->b_error ? bp->b_error : EIO);
	else
		return (0);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so.  Otherwise, awaken waiting processes.
 *
 * [ Leffler, et al., says on p.247:
 *	"This routine wakes up the blocked process, frees the buffer
 *	for an asynchronous write, or, for a request by the pagedaemon
 *	process, invokes a procedure specified in the buffer structure" ]
 *
 * In real life, the pagedaemon (or other system processes) wants
 * to do async stuff too, and doesn't want the buffer brelse()'d.
 * (for swap pager, that puts swap buffers on the free lists (!!!),
 * for the vn device, that puts malloc'd buffers on the free lists!)
 *
 * Must be called at splbio().
 */
void
biodone(struct buf *bp)
{
	splassert(IPL_BIO);

	if (ISSET(bp->b_flags, B_DONE))
		panic("biodone already");
	SET(bp->b_flags, B_DONE);		/* note that it's done */

	if (bp->b_bq)
		bufq_done(bp->b_bq, bp);

	if (!ISSET(bp->b_flags, B_READ)) {
		CLR(bp->b_flags, B_WRITEINPROG);
		vwakeup(bp->b_vp);
	}
	if (bcstats.numbufs &&
	    (!(ISSET(bp->b_flags, B_RAW) || ISSET(bp->b_flags, B_PHYS)))) {
		if (!ISSET(bp->b_flags, B_READ)) {
			bcstats.pendingwrites--;
		} else
			bcstats.pendingreads--;
	}
	if (ISSET(bp->b_flags, B_CALL)) {	/* if necessary, call out */
		CLR(bp->b_flags, B_CALL);	/* but note callout done */
		(*bp->b_iodone)(bp);
	} else {
		if (ISSET(bp->b_flags, B_ASYNC)) {/* if async, release it */
			brelse(bp);
		} else {			/* or just wakeup the buffer */
			CLR(bp->b_flags, B_WANTED);
			wakeup(bp);
		}
	}
}

#ifdef DDB
void	bcstats_print(int (*)(const char *, ...)
    __attribute__((__format__(__kprintf__,1,2))));
/*
 * bcstats_print: ddb hook to print interesting buffer cache counters
 */
void
bcstats_print(
    int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2))))
{
	(*pr)("Current Buffer Cache status:\n");
	(*pr)("numbufs %lld busymapped %lld, delwri %lld\n",
	    bcstats.numbufs, bcstats.busymapped, bcstats.delwribufs);
	(*pr)("kvaslots %lld avail kva slots %lld\n",
	    bcstats.kvaslots, bcstats.kvaslots_avail);
	(*pr)("bufpages %lld, dmapages %lld, dirtypages %lld\n",
	    bcstats.numbufpages, bcstats.dmapages, bcstats.numdirtypages);
	(*pr)("pendingreads %lld, pendingwrites %lld\n",
	    bcstats.pendingreads, bcstats.pendingwrites);
	(*pr)("highflips %lld, highflops %lld, dmaflips %lld\n",
	    bcstats.highflips, bcstats.highflops, bcstats.dmaflips);
}
#endif

/*
 * Adjust a buffer's valid byte count; the new count may not exceed the
 * space actually allocated to the buffer.
 */
void
buf_adjcnt(struct buf *bp, long ncount)
{
	KASSERT(ncount <= bp->b_bufsize);
	bp->b_bcount = ncount;
}

/* bufcache freelist code below */
/*
 * Copyright (c) 2014 Ted Unangst <tedu@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * The code below implements a variant of the 2Q buffer cache algorithm by
 * Johnson and Shasha.
 *
 * General Outline
 * We divide the buffer cache into three working sets: current, previous,
 * and long term.  Each list is itself LRU and buffers get promoted and moved
 * around between them.  A buffer starts its life in the current working set.
 * As time passes and newer buffers push it out, it will turn into the previous
 * working set and is subject to recycling.  But if it's accessed again from
 * the previous working set, that's an indication that it's actually in the
 * long term working set, so we promote it there.  The separation of current
 * and previous working sets prevents us from promoting a buffer that's only
 * temporarily hot to the long term cache.
 *
 * The objective is to provide scan resistance by making the long term
 * working set ineligible for immediate recycling, even as the current
 * working set is rapidly turned over.
 *
 * Implementation
 * The code below identifies the current, previous, and long term sets as
 * hotqueue, coldqueue, and warmqueue.  The hot and warm queues are capped at
 * 1/3 of the total clean pages, after which point they start pushing their
 * oldest buffers into coldqueue.
 * A buf always starts out with neither WARM nor COLD flags set (implying HOT).
 * When released, it will be returned to the tail of the hotqueue list.
 * When the hotqueue gets too large, the oldest hot buf will be moved to the
 * coldqueue, with the B_COLD flag set.  When a cold buf is released, we set
 * the B_WARM flag and put it onto the warmqueue.  Warm bufs are also
 * directly returned to the end of the warmqueue.  As with the hotqueue, when
 * the warmqueue grows too large, B_WARM bufs are moved onto the coldqueue.
 *
 * Note that this design does still support large working sets, greater
 * than the cap of hotqueue or warmqueue would imply.  The coldqueue is still
 * cached and has no maximum length.  The hot and warm queues form a Y feeding
 * into the coldqueue.  Moving bufs between queues is constant time, so this
 * design decays to one long warm->cold queue.
 *
 * In the 2Q paper, hotqueue and coldqueue are A1in and A1out.  The warmqueue
 * is Am.  We always cache pages, as opposed to pointers to pages for A1.
 *
 * This implementation adds support for multiple 2q caches.
 *
 * If we have more than one 2q cache, as bufs fall off the cold queue
 * for recycling, bufs that have been warm before (which retain the
 * B_WARM flag in addition to B_COLD) can be put into the hot queue of
 * a second level 2Q cache.  Buffers which are only B_COLD are
 * recycled.  Bufs falling off the last cache's cold queue are always
 * recycled.
 *
 */

/*
 * This function is called when a hot or warm queue may have exceeded its
 * size limit.  It will move a buf to the coldqueue.
 */
int chillbufs(struct bufcache *cache, struct bufqueue *queue,
    int64_t *queuepages);

void
bufcache_init(void)
{
	int i;

	for (i = 0; i < NUM_CACHES; i++) {
		TAILQ_INIT(&cleancache[i].hotqueue);
		TAILQ_INIT(&cleancache[i].coldqueue);
		TAILQ_INIT(&cleancache[i].warmqueue);
	}
	TAILQ_INIT(&dirtyqueue);
}

/*
 * If the buffer caches have shrunk, we may need to rebalance our queues.
 */
void
bufcache_adjust(void)
{
	int i;

	for (i = 0; i < NUM_CACHES; i++) {
		while (chillbufs(&cleancache[i], &cleancache[i].warmqueue,
		    &cleancache[i].warmbufpages) ||
		    chillbufs(&cleancache[i], &cleancache[i].hotqueue,
		    &cleancache[i].hotbufpages))
			continue;
	}
}

/*
 * Get a clean buffer from the cache.  If "discard" is set do not promote
if "discard" is set do not promote 1455 * previously warm buffers as normal, because we are tossing everything 1456 * away such as in a hibernation 1457 */ 1458 struct buf * 1459 bufcache_getcleanbuf(int cachenum, int discard) 1460 { 1461 struct buf *bp = NULL; 1462 struct bufcache *cache = &cleancache[cachenum]; 1463 struct bufqueue * queue; 1464 1465 splassert(IPL_BIO); 1466 1467 /* try cold queue */ 1468 while ((bp = TAILQ_FIRST(&cache->coldqueue)) || 1469 (bp = TAILQ_FIRST(&cache->warmqueue)) || 1470 (bp = TAILQ_FIRST(&cache->hotqueue))) { 1471 int64_t pages = atop(bp->b_bufsize); 1472 struct bufcache *newcache; 1473 1474 if (discard || cachenum >= NUM_CACHES - 1) { 1475 /* Victim selected, give it up */ 1476 return bp; 1477 } 1478 KASSERT(bp->cache == cachenum); 1479 1480 /* 1481 * If this buffer was warm before, move it to 1482 * the hot queue in the next cache 1483 */ 1484 1485 if (fliphigh) { 1486 /* 1487 * If we are in the DMA cache, try to flip the 1488 * buffer up high to move it on to the other 1489 * caches. if we can't move the buffer to high 1490 * memory without sleeping, we give it up and 1491 * return it rather than fight for more memory 1492 * against non buffer cache competitors. 1493 */ 1494 SET(bp->b_flags, B_BUSY); 1495 if (bp->cache == 0 && buf_flip_high(bp) == -1) { 1496 CLR(bp->b_flags, B_BUSY); 1497 return bp; 1498 } 1499 CLR(bp->b_flags, B_BUSY); 1500 } 1501 1502 /* Move the buffer to the hot queue in the next cache */ 1503 if (ISSET(bp->b_flags, B_COLD)) { 1504 queue = &cache->coldqueue; 1505 } else if (ISSET(bp->b_flags, B_WARM)) { 1506 queue = &cache->warmqueue; 1507 cache->warmbufpages -= pages; 1508 } else { 1509 queue = &cache->hotqueue; 1510 cache->hotbufpages -= pages; 1511 } 1512 TAILQ_REMOVE(queue, bp, b_freelist); 1513 cache->cachepages -= pages; 1514 CLR(bp->b_flags, B_WARM); 1515 CLR(bp->b_flags, B_COLD); 1516 bp->cache++; 1517 newcache= &cleancache[bp->cache]; 1518 newcache->cachepages += pages; 1519 newcache->hotbufpages += pages; 1520 chillbufs(newcache, &newcache->hotqueue, 1521 &newcache->hotbufpages); 1522 TAILQ_INSERT_TAIL(&newcache->hotqueue, bp, b_freelist); 1523 } 1524 return bp; 1525 } 1526 1527 1528 void 1529 discard_buffer(struct buf *bp) 1530 { 1531 splassert(IPL_BIO); 1532 1533 bufcache_take(bp); 1534 if (bp->b_vp) { 1535 RBT_REMOVE(buf_rb_bufs, 1536 &bp->b_vp->v_bufs_tree, bp); 1537 brelvp(bp); 1538 } 1539 buf_put(bp); 1540 } 1541 1542 int64_t 1543 bufcache_recover_dmapages(int discard, int64_t howmany) 1544 { 1545 struct buf *bp = NULL; 1546 struct bufcache *cache = &cleancache[DMA_CACHE]; 1547 struct bufqueue * queue; 1548 int64_t recovered = 0; 1549 1550 splassert(IPL_BIO); 1551 1552 while ((recovered < howmany) && 1553 ((bp = TAILQ_FIRST(&cache->coldqueue)) || 1554 (bp = TAILQ_FIRST(&cache->warmqueue)) || 1555 (bp = TAILQ_FIRST(&cache->hotqueue)))) { 1556 int64_t pages = atop(bp->b_bufsize); 1557 struct bufcache *newcache; 1558 1559 if (discard || DMA_CACHE >= NUM_CACHES - 1) { 1560 discard_buffer(bp); 1561 continue; 1562 } 1563 KASSERT(bp->cache == DMA_CACHE); 1564 1565 /* 1566 * If this buffer was warm before, move it to 1567 * the hot queue in the next cache 1568 */ 1569 1570 /* 1571 * One way or another, the pages for this 1572 * buffer are leaving DMA memory 1573 */ 1574 recovered += pages; 1575 1576 if (!fliphigh) { 1577 discard_buffer(bp); 1578 continue; 1579 } 1580 1581 /* 1582 * If we are in the DMA cache, try to flip the 1583 * buffer up high to move it on to the other 1584 * caches. 
		 * memory without sleeping, we give it up
		 * now rather than fight for more memory
		 * against non buffer cache competitors.
		 */
		SET(bp->b_flags, B_BUSY);
		if (bp->cache == 0 && buf_flip_high(bp) == -1) {
			CLR(bp->b_flags, B_BUSY);
			discard_buffer(bp);
			continue;
		}
		CLR(bp->b_flags, B_BUSY);

		/*
		 * Move the buffer to the hot queue in the next cache
		 */
		if (ISSET(bp->b_flags, B_COLD)) {
			queue = &cache->coldqueue;
		} else if (ISSET(bp->b_flags, B_WARM)) {
			queue = &cache->warmqueue;
			cache->warmbufpages -= pages;
		} else {
			queue = &cache->hotqueue;
			cache->hotbufpages -= pages;
		}
		TAILQ_REMOVE(queue, bp, b_freelist);
		cache->cachepages -= pages;
		CLR(bp->b_flags, B_WARM);
		CLR(bp->b_flags, B_COLD);
		bp->cache++;
		newcache = &cleancache[bp->cache];
		newcache->cachepages += pages;
		newcache->hotbufpages += pages;
		chillbufs(newcache, &newcache->hotqueue,
		    &newcache->hotbufpages);
		TAILQ_INSERT_TAIL(&newcache->hotqueue, bp, b_freelist);
	}
	return recovered;
}

/*
 * Get a clean buffer from caches "start" through "end", trying the
 * higher numbered caches first.
 */
struct buf *
bufcache_getcleanbuf_range(int start, int end, int discard)
{
	int i, j = start, q = end;
	struct buf *bp = NULL;

	/*
	 * XXX in theory we could promote warm buffers into a previous queue
	 * so in the pathological case of where we go through all the caches
	 * without getting a buffer we have to start at the beginning again.
	 */
	while (j <= q) {
		for (i = q; i >= j; i--)
			if ((bp = bufcache_getcleanbuf(i, discard)))
				return (bp);
		j++;
	}
	return bp;
}

/* Get a clean buffer from the caches above the DMA cache, if in use. */
struct buf *
bufcache_gethighcleanbuf(void)
{
	if (!fliphigh)
		return NULL;
	return bufcache_getcleanbuf_range(DMA_CACHE + 1, NUM_CACHES - 1, 0);
}

/* Get a clean buffer whose pages are in DMA reachable memory. */
struct buf *
bufcache_getdmacleanbuf(void)
{
	if (fliphigh)
		return bufcache_getcleanbuf_range(DMA_CACHE, DMA_CACHE, 0);
	return bufcache_getcleanbuf_range(DMA_CACHE, NUM_CACHES - 1, 0);
}

/* Get the next buffer from the dirty queue. */
struct buf *
bufcache_getdirtybuf(void)
{
	return TAILQ_FIRST(&dirtyqueue);
}

/*
 * Remove a buffer from its clean or dirty queue and adjust the page
 * accounting.
 */
void
bufcache_take(struct buf *bp)
{
	struct bufqueue *queue;
	int64_t pages;

	splassert(IPL_BIO);
	KASSERT(ISSET(bp->b_flags, B_BC));
	KASSERT(bp->cache >= DMA_CACHE);
	KASSERT((bp->cache < NUM_CACHES));

	pages = atop(bp->b_bufsize);

	TRACEPOINT(vfs, bufcache_take, bp->b_flags, bp->cache, pages);

	struct bufcache *cache = &cleancache[bp->cache];
	if (!ISSET(bp->b_flags, B_DELWRI)) {
		if (ISSET(bp->b_flags, B_COLD)) {
			queue = &cache->coldqueue;
		} else if (ISSET(bp->b_flags, B_WARM)) {
			queue = &cache->warmqueue;
			cache->warmbufpages -= pages;
		} else {
			queue = &cache->hotqueue;
			cache->hotbufpages -= pages;
		}
		bcstats.numcleanpages -= pages;
		cache->cachepages -= pages;
	} else {
		queue = &dirtyqueue;
		bcstats.numdirtypages -= pages;
		bcstats.delwribufs--;
	}
	TAILQ_REMOVE(queue, bp, b_freelist);
}

/* move buffers from a hot or warm queue to a cold queue in a cache */
int
chillbufs(struct bufcache *cache, struct bufqueue *queue, int64_t *queuepages)
{
	struct buf *bp;
	int64_t limit, pages;

	/*
	 * We limit the hot queue to be small, with a max of 4096 pages.
	 * We limit the warm queue to half the cache size.
	 *
	 * We impose a minimum size of 96 to prevent too much "wobbling".
	 */
	if (queue == &cache->hotqueue)
		limit = min(cache->cachepages / 20, 4096);
	else if (queue == &cache->warmqueue)
		limit = (cache->cachepages / 2);
	else
		panic("chillbufs: invalid queue");

	if (*queuepages > 96 && *queuepages > limit) {
		bp = TAILQ_FIRST(queue);
		if (!bp)
			panic("inconsistent bufpage counts");
		pages = atop(bp->b_bufsize);
		*queuepages -= pages;
		TAILQ_REMOVE(queue, bp, b_freelist);
		/* we do not clear B_WARM */
		SET(bp->b_flags, B_COLD);
		TAILQ_INSERT_TAIL(&cache->coldqueue, bp, b_freelist);
		return 1;
	}
	return 0;
}

/*
 * Return a buffer to the tail of its clean or dirty queue and update
 * the page accounting.
 */
void
bufcache_release(struct buf *bp)
{
	struct bufqueue *queue;
	int64_t pages;
	struct bufcache *cache = &cleancache[bp->cache];

	KASSERT(ISSET(bp->b_flags, B_BC));
	pages = atop(bp->b_bufsize);

	TRACEPOINT(vfs, bufcache_rel, bp->b_flags, bp->cache, pages);

	if (fliphigh) {
		if (ISSET(bp->b_flags, B_DMA) && bp->cache > 0)
			panic("B_DMA buffer release from cache %d",
			    bp->cache);
		else if ((!ISSET(bp->b_flags, B_DMA)) && bp->cache == 0)
			panic("Non B_DMA buffer release from cache %d",
			    bp->cache);
	}

	if (!ISSET(bp->b_flags, B_DELWRI)) {
		int64_t *queuepages;
		if (ISSET(bp->b_flags, B_WARM | B_COLD)) {
			SET(bp->b_flags, B_WARM);
			CLR(bp->b_flags, B_COLD);
			queue = &cache->warmqueue;
			queuepages = &cache->warmbufpages;
		} else {
			queue = &cache->hotqueue;
			queuepages = &cache->hotbufpages;
		}
		*queuepages += pages;
		bcstats.numcleanpages += pages;
		cache->cachepages += pages;
		chillbufs(cache, queue, queuepages);
	} else {
		queue = &dirtyqueue;
		bcstats.numdirtypages += pages;
		bcstats.delwribufs++;
	}
	TAILQ_INSERT_TAIL(queue, bp, b_freelist);
}

#ifdef HIBERNATE
/*
 * Nuke the buffer cache from orbit when hibernating.  We do not want to save
 * any clean cache pages to swap and read them back; the original disk files
 * are just as good.
 */
void
hibernate_suspend_bufcache(void)
{
	struct buf *bp;
	int s;

	s = splbio();
	/* Chuck away all the cache pages.. discard bufs, do not promote */
	while ((bp = bufcache_getcleanbuf_range(DMA_CACHE, NUM_CACHES - 1, 1))) {
		bufcache_take(bp);
		if (bp->b_vp) {
			RBT_REMOVE(buf_rb_bufs, &bp->b_vp->v_bufs_tree, bp);
			brelvp(bp);
		}
		buf_put(bp);
	}
	splx(s);
}

void
hibernate_resume_bufcache(void)
{
	/* XXX Nothing needed here for now */
}
#endif	/* HIBERNATE */