/*	$OpenBSD: vfs_bio.c,v 1.212 2023/04/26 15:13:52 beck Exp $	*/
/*	$NetBSD: vfs_bio.c,v 1.44 1996/06/11 11:15:36 pk Exp $	*/

/*
 * Copyright (c) 1994 Christopher G. Demetriou
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_bio.c	8.6 (Berkeley) 1/11/94
 */

/*
 * Some references:
 *	Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
 *	Leffler, et al.: The Design and Implementation of the 4.3BSD
 *		UNIX Operating System (Addison Wesley, 1989)
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/specdev.h>
#include <sys/tracepoint.h>
#include <uvm/uvm_extern.h>

/* XXX Should really be in buf.h, but for uvm_constraint_range.. */
int	buf_realloc_pages(struct buf *, struct uvm_constraint_range *, int);

struct uvm_constraint_range high_constraint;
int fliphigh;

int nobuffers;
int needbuffer;
struct bio_ops bioops;

/* private bufcache functions */
void bufcache_init(void);
void bufcache_adjust(void);
struct buf *bufcache_gethighcleanbuf(void);
struct buf *bufcache_getdmacleanbuf(void);

/*
 * Buffer pool for I/O buffers.
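 * Buffer headers (struct buf) are allocated from this pool by buf_get()
 * and handed back to it by buf_put() once the buffer's memory has been
 * released.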
78 */ 79 struct pool bufpool; 80 struct bufhead bufhead = LIST_HEAD_INITIALIZER(bufhead); 81 void buf_put(struct buf *); 82 83 struct buf *bio_doread(struct vnode *, daddr_t, int, int); 84 struct buf *buf_get(struct vnode *, daddr_t, size_t); 85 void bread_cluster_callback(struct buf *); 86 int64_t bufcache_recover_dmapages(int discard, int64_t howmany); 87 static struct buf *incore_locked(struct vnode *vp, daddr_t blkno); 88 89 struct bcachestats bcstats; /* counters */ 90 long lodirtypages; /* dirty page count low water mark */ 91 long hidirtypages; /* dirty page count high water mark */ 92 long targetpages; /* target number of pages for cache size */ 93 long buflowpages; /* smallest size cache allowed */ 94 long bufhighpages; /* largest size cache allowed */ 95 long bufbackpages; /* minimum number of pages we shrink when asked to */ 96 97 vsize_t bufkvm; 98 99 struct proc *cleanerproc; 100 int bd_req; /* Sleep point for cleaner daemon. */ 101 102 #define NUM_CACHES 2 103 #define DMA_CACHE 0 104 struct bufcache cleancache[NUM_CACHES]; 105 struct bufqueue dirtyqueue; 106 107 void 108 buf_put(struct buf *bp) 109 { 110 splassert(IPL_BIO); 111 112 #ifdef DIAGNOSTIC 113 if (bp->b_pobj != NULL) 114 KASSERT(bp->b_bufsize > 0); 115 if (ISSET(bp->b_flags, B_DELWRI)) 116 panic("buf_put: releasing dirty buffer"); 117 if (bp->b_freelist.tqe_next != NOLIST && 118 bp->b_freelist.tqe_next != (void *)-1) 119 panic("buf_put: still on the free list"); 120 if (bp->b_vnbufs.le_next != NOLIST && 121 bp->b_vnbufs.le_next != (void *)-1) 122 panic("buf_put: still on the vnode list"); 123 if (!LIST_EMPTY(&bp->b_dep)) 124 panic("buf_put: b_dep is not empty"); 125 #endif 126 127 LIST_REMOVE(bp, b_list); 128 bcstats.numbufs--; 129 130 if (buf_dealloc_mem(bp) != 0) 131 return; 132 pool_put(&bufpool, bp); 133 } 134 135 /* 136 * Initialize buffers and hash links for buffers. 137 */ 138 void 139 bufinit(void) 140 { 141 u_int64_t dmapages; 142 u_int64_t highpages; 143 144 dmapages = uvm_pagecount(&dma_constraint); 145 /* take away a guess at how much of this the kernel will consume */ 146 dmapages -= (atop(physmem) - atop(uvmexp.free)); 147 148 /* See if we have memory above the dma accessible region. */ 149 high_constraint.ucr_low = dma_constraint.ucr_high; 150 high_constraint.ucr_high = no_constraint.ucr_high; 151 if (high_constraint.ucr_low != high_constraint.ucr_high) 152 high_constraint.ucr_low++; 153 highpages = uvm_pagecount(&high_constraint); 154 155 /* 156 * Do we have any significant amount of high memory above 157 * the DMA region? if so enable moving buffers there, if not, 158 * don't bother. 159 */ 160 if (highpages > dmapages / 4) 161 fliphigh = 1; 162 else 163 fliphigh = 0; 164 165 /* 166 * If MD code doesn't say otherwise, use up to 10% of DMA'able 167 * memory for buffers. 168 */ 169 if (bufcachepercent == 0) 170 bufcachepercent = 10; 171 172 /* 173 * XXX these values and their same use in kern_sysctl 174 * need to move into buf.h 175 */ 176 KASSERT(bufcachepercent <= 90); 177 KASSERT(bufcachepercent >= 5); 178 if (bufpages == 0) 179 bufpages = dmapages * bufcachepercent / 100; 180 if (bufpages < BCACHE_MIN) 181 bufpages = BCACHE_MIN; 182 KASSERT(bufpages < dmapages); 183 184 bufhighpages = bufpages; 185 186 /* 187 * Set the base backoff level for the buffer cache. We will 188 * not allow uvm to steal back more than this number of pages. 
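 * For example (illustrative numbers only): with roughly 1,000,000
 * DMA'able pages this works out to about 50,000 pages, and it is never
 * allowed to drop below BCACHE_MIN.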
189 */ 190 buflowpages = dmapages * 5 / 100; 191 if (buflowpages < BCACHE_MIN) 192 buflowpages = BCACHE_MIN; 193 194 /* 195 * set bufbackpages to 100 pages, or 10 percent of the low water mark 196 * if we don't have that many pages. 197 */ 198 199 bufbackpages = buflowpages * 10 / 100; 200 if (bufbackpages > 100) 201 bufbackpages = 100; 202 203 /* 204 * If the MD code does not say otherwise, reserve 10% of kva 205 * space for mapping buffers. 206 */ 207 if (bufkvm == 0) 208 bufkvm = VM_KERNEL_SPACE_SIZE / 10; 209 210 /* 211 * Don't use more than twice the amount of bufpages for mappings. 212 * It's twice since we map things sparsely. 213 */ 214 if (bufkvm > bufpages * PAGE_SIZE) 215 bufkvm = bufpages * PAGE_SIZE; 216 /* 217 * Round bufkvm to MAXPHYS because we allocate chunks of va space 218 * in MAXPHYS chunks. 219 */ 220 bufkvm &= ~(MAXPHYS - 1); 221 222 pool_init(&bufpool, sizeof(struct buf), 0, IPL_BIO, 0, "bufpl", NULL); 223 224 bufcache_init(); 225 226 /* 227 * hmm - bufkvm is an argument because it's static, while 228 * bufpages is global because it can change while running. 229 */ 230 buf_mem_init(bufkvm); 231 232 /* 233 * Set the dirty page high water mark to be less than the low 234 * water mark for pages in the buffer cache. This ensures we 235 * can always back off by throwing away clean pages, and give 236 * ourselves a chance to write out the dirty pages eventually. 237 */ 238 hidirtypages = (buflowpages / 4) * 3; 239 lodirtypages = buflowpages / 2; 240 241 /* 242 * We are allowed to use up to the reserve. 243 */ 244 targetpages = bufpages - RESERVE_PAGES; 245 } 246 247 /* 248 * Change cachepct 249 */ 250 void 251 bufadjust(int newbufpages) 252 { 253 int s; 254 int64_t npages; 255 256 if (newbufpages < buflowpages) 257 newbufpages = buflowpages; 258 259 s = splbio(); 260 bufpages = newbufpages; 261 262 /* 263 * We are allowed to use up to the reserve 264 */ 265 targetpages = bufpages - RESERVE_PAGES; 266 267 npages = bcstats.dmapages - targetpages; 268 269 /* 270 * Shrinking the cache happens here only if someone has manually 271 * adjusted bufcachepercent - or the pagedaemon has told us 272 * to give back memory *now* - so we give it all back. 273 */ 274 if (bcstats.dmapages > targetpages) 275 (void) bufcache_recover_dmapages(0, bcstats.dmapages - targetpages); 276 bufcache_adjust(); 277 278 /* 279 * Wake up the cleaner if we have lots of dirty pages, 280 * or if we are getting low on buffer cache kva. 281 */ 282 if ((UNCLEAN_PAGES >= hidirtypages) || 283 bcstats.kvaslots_avail <= 2 * RESERVE_SLOTS) 284 wakeup(&bd_req); 285 286 splx(s); 287 } 288 289 /* 290 * Make the buffer cache back off from cachepct. 291 */ 292 int 293 bufbackoff(struct uvm_constraint_range *range, long size) 294 { 295 /* 296 * Back off "size" buffer cache pages. Called by the page 297 * daemon to consume buffer cache pages rather than scanning. 298 * 299 * It returns 0 to the pagedaemon to indicate that it has 300 * succeeded in freeing enough pages. It returns -1 to 301 * indicate that it could not and the pagedaemon should take 302 * other measures. 303 * 304 */ 305 long pdelta, oldbufpages; 306 307 /* 308 * If we will accept high memory for this backoff 309 * try to steal it from the high memory buffer cache. 
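 *
 * This happens in at most two steps: clean buffers that were previously
 * flipped into high memory are freed first, and only if the caller's
 * range also covers DMA'able memory do we fall through and shrink
 * bufpages (via bufadjust()) to recover DMA'able pages as well.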
310 */ 311 if (range != NULL && range->ucr_high > dma_constraint.ucr_high) { 312 struct buf *bp; 313 int64_t start = bcstats.numbufpages, recovered = 0; 314 int s = splbio(); 315 316 while ((recovered < size) && 317 (bp = bufcache_gethighcleanbuf())) { 318 bufcache_take(bp); 319 if (bp->b_vp) { 320 RBT_REMOVE(buf_rb_bufs, 321 &bp->b_vp->v_bufs_tree, bp); 322 brelvp(bp); 323 } 324 buf_put(bp); 325 recovered = start - bcstats.numbufpages; 326 } 327 bufcache_adjust(); 328 splx(s); 329 330 /* If we got enough, return success */ 331 if (recovered >= size) 332 return 0; 333 334 /* 335 * If we needed only memory above DMA, 336 * return failure 337 */ 338 if (range->ucr_low > dma_constraint.ucr_high) 339 return -1; 340 341 /* Otherwise get the rest from DMA */ 342 size -= recovered; 343 } 344 345 /* 346 * XXX Otherwise do the dma memory cache dance. this needs 347 * refactoring later to get rid of 'bufpages' 348 */ 349 350 /* 351 * Back off by at least bufbackpages. If the page daemon gave us 352 * a larger size, back off by that much. 353 */ 354 pdelta = (size > bufbackpages) ? size : bufbackpages; 355 356 if (bufpages <= buflowpages) 357 return(-1); 358 if (bufpages - pdelta < buflowpages) 359 pdelta = bufpages - buflowpages; 360 oldbufpages = bufpages; 361 bufadjust(bufpages - pdelta); 362 if (oldbufpages - bufpages < size) 363 return (-1); /* we did not free what we were asked */ 364 else 365 return(0); 366 } 367 368 369 /* 370 * Opportunistically flip a buffer into high memory. Will move the buffer 371 * if memory is available without sleeping, and return 0, otherwise will 372 * fail and return -1 with the buffer unchanged. 373 */ 374 375 int 376 buf_flip_high(struct buf *bp) 377 { 378 int ret = -1; 379 380 KASSERT(ISSET(bp->b_flags, B_BC)); 381 KASSERT(ISSET(bp->b_flags, B_DMA)); 382 KASSERT(bp->cache == DMA_CACHE); 383 KASSERT(fliphigh); 384 385 splassert(IPL_BIO); 386 387 /* Attempt to move the buffer to high memory if we can */ 388 if (buf_realloc_pages(bp, &high_constraint, UVM_PLA_NOWAIT) == 0) { 389 KASSERT(!ISSET(bp->b_flags, B_DMA)); 390 bcstats.highflips++; 391 ret = 0; 392 } else 393 bcstats.highflops++; 394 395 return ret; 396 } 397 398 /* 399 * Flip a buffer to dma reachable memory, when we need it there for 400 * I/O. This can sleep since it will wait for memory allocation in the 401 * DMA reachable area since we have to have the buffer there to proceed. 402 */ 403 void 404 buf_flip_dma(struct buf *bp) 405 { 406 KASSERT(ISSET(bp->b_flags, B_BC)); 407 KASSERT(ISSET(bp->b_flags, B_BUSY)); 408 KASSERT(bp->cache < NUM_CACHES); 409 410 splassert(IPL_BIO); 411 412 if (!ISSET(bp->b_flags, B_DMA)) { 413 414 /* move buf to dma reachable memory */ 415 (void) buf_realloc_pages(bp, &dma_constraint, UVM_PLA_WAITOK); 416 KASSERT(ISSET(bp->b_flags, B_DMA)); 417 bcstats.dmaflips++; 418 } 419 420 if (bp->cache > DMA_CACHE) { 421 CLR(bp->b_flags, B_COLD); 422 CLR(bp->b_flags, B_WARM); 423 bp->cache = DMA_CACHE; 424 } 425 } 426 427 struct buf * 428 bio_doread(struct vnode *vp, daddr_t blkno, int size, int async) 429 { 430 struct buf *bp; 431 struct mount *mp; 432 433 bp = getblk(vp, blkno, size, 0, INFSLP); 434 435 /* 436 * If buffer does not have valid data, start a read. 437 * Note that if buffer is B_INVAL, getblk() won't return it. 438 * Therefore, it's valid if its I/O has completed or been delayed. 
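 *
 * Purely as an illustration of how this path is consumed (the callers
 * live in the filesystems, not in this file), a synchronous read via
 * bread() amounts to roughly:
 *
 *	struct buf *bp;
 *	int error;
 *
 *	error = bread(vp, blkno, size, &bp);
 *	if (error == 0) {
 *		...examine bp->b_data...
 *	}
 *	brelse(bp);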
439 */ 440 if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) { 441 SET(bp->b_flags, B_READ | async); 442 bcstats.pendingreads++; 443 bcstats.numreads++; 444 VOP_STRATEGY(bp->b_vp, bp); 445 /* Pay for the read. */ 446 curproc->p_ru.ru_inblock++; /* XXX */ 447 } else if (async) { 448 brelse(bp); 449 } 450 451 mp = vp->v_type == VBLK ? vp->v_specmountpoint : vp->v_mount; 452 453 /* 454 * Collect statistics on synchronous and asynchronous reads. 455 * Reads from block devices are charged to their associated 456 * filesystem (if any). 457 */ 458 if (mp != NULL) { 459 if (async == 0) 460 mp->mnt_stat.f_syncreads++; 461 else 462 mp->mnt_stat.f_asyncreads++; 463 } 464 465 return (bp); 466 } 467 468 /* 469 * Read a disk block. 470 * This algorithm described in Bach (p.54). 471 */ 472 int 473 bread(struct vnode *vp, daddr_t blkno, int size, struct buf **bpp) 474 { 475 struct buf *bp; 476 477 /* Get buffer for block. */ 478 bp = *bpp = bio_doread(vp, blkno, size, 0); 479 480 /* Wait for the read to complete, and return result. */ 481 return (biowait(bp)); 482 } 483 484 /* 485 * Read-ahead multiple disk blocks. The first is sync, the rest async. 486 * Trivial modification to the breada algorithm presented in Bach (p.55). 487 */ 488 int 489 breadn(struct vnode *vp, daddr_t blkno, int size, daddr_t rablks[], 490 int rasizes[], int nrablks, struct buf **bpp) 491 { 492 struct buf *bp; 493 int i; 494 495 bp = *bpp = bio_doread(vp, blkno, size, 0); 496 497 /* 498 * For each of the read-ahead blocks, start a read, if necessary. 499 */ 500 for (i = 0; i < nrablks; i++) { 501 /* If it's in the cache, just go on to next one. */ 502 if (incore(vp, rablks[i])) 503 continue; 504 505 /* Get a buffer for the read-ahead block */ 506 (void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC); 507 } 508 509 /* Otherwise, we had to start a read for it; wait until it's valid. */ 510 return (biowait(bp)); 511 } 512 513 /* 514 * Called from interrupt context. 515 */ 516 void 517 bread_cluster_callback(struct buf *bp) 518 { 519 struct buf **xbpp = bp->b_saveaddr; 520 int i; 521 522 if (xbpp[1] != NULL) { 523 size_t newsize = xbpp[1]->b_bufsize; 524 525 /* 526 * Shrink this buffer's mapping to only cover its part of 527 * the total I/O. 528 */ 529 buf_fix_mapping(bp, newsize); 530 bp->b_bcount = newsize; 531 } 532 533 /* Invalidate read-ahead buffers if read short */ 534 if (bp->b_resid > 0) { 535 for (i = 1; xbpp[i] != NULL; i++) 536 continue; 537 for (i = i - 1; i != 0; i--) { 538 if (xbpp[i]->b_bufsize <= bp->b_resid) { 539 bp->b_resid -= xbpp[i]->b_bufsize; 540 SET(xbpp[i]->b_flags, B_INVAL); 541 } else if (bp->b_resid > 0) { 542 bp->b_resid = 0; 543 SET(xbpp[i]->b_flags, B_INVAL); 544 } else 545 break; 546 } 547 } 548 549 for (i = 1; xbpp[i] != NULL; i++) { 550 if (ISSET(bp->b_flags, B_ERROR)) 551 SET(xbpp[i]->b_flags, B_INVAL | B_ERROR); 552 /* 553 * Move the pages from the master buffer's uvm object 554 * into the individual buffer's uvm objects. 
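 * Each page is looked up at its offset within the master buffer's
 * object and re-hung, at that same offset, off the read-ahead buffer's
 * own object, so each read-ahead buffer can be biodone'd and recycled
 * independently of the master buffer.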
555 */ 556 struct uvm_object *newobj = &xbpp[i]->b_uobj; 557 struct uvm_object *oldobj = &bp->b_uobj; 558 int page; 559 560 uvm_obj_init(newobj, &bufcache_pager, 1); 561 for (page = 0; page < atop(xbpp[i]->b_bufsize); page++) { 562 struct vm_page *pg = uvm_pagelookup(oldobj, 563 xbpp[i]->b_poffs + ptoa(page)); 564 KASSERT(pg != NULL); 565 KASSERT(pg->wire_count == 1); 566 uvm_pagerealloc(pg, newobj, xbpp[i]->b_poffs + ptoa(page)); 567 } 568 xbpp[i]->b_pobj = newobj; 569 570 biodone(xbpp[i]); 571 } 572 573 free(xbpp, M_TEMP, (i + 1) * sizeof(*xbpp)); 574 575 if (ISSET(bp->b_flags, B_ASYNC)) { 576 brelse(bp); 577 } else { 578 CLR(bp->b_flags, B_WANTED); 579 wakeup(bp); 580 } 581 } 582 583 /* 584 * Read-ahead multiple disk blocks, but make sure only one (big) I/O 585 * request is sent to the disk. 586 * XXX This should probably be dropped and breadn should instead be optimized 587 * XXX to do fewer I/O requests. 588 */ 589 int 590 bread_cluster(struct vnode *vp, daddr_t blkno, int size, struct buf **rbpp) 591 { 592 struct buf *bp, **xbpp; 593 int howmany, maxra, i, inc; 594 daddr_t sblkno; 595 596 *rbpp = bio_doread(vp, blkno, size, 0); 597 598 /* 599 * If the buffer is in the cache skip any I/O operation. 600 */ 601 if (ISSET((*rbpp)->b_flags, B_CACHE)) 602 goto out; 603 604 if (size != round_page(size)) 605 goto out; 606 607 if (VOP_BMAP(vp, blkno + 1, NULL, &sblkno, &maxra)) 608 goto out; 609 610 maxra++; 611 if (sblkno == -1 || maxra < 2) 612 goto out; 613 614 howmany = MAXPHYS / size; 615 if (howmany > maxra) 616 howmany = maxra; 617 618 xbpp = mallocarray(howmany + 1, sizeof(*xbpp), M_TEMP, M_NOWAIT); 619 if (xbpp == NULL) 620 goto out; 621 622 for (i = howmany - 1; i >= 0; i--) { 623 size_t sz; 624 625 /* 626 * First buffer allocates big enough size to cover what 627 * all the other buffers need. 628 */ 629 sz = i == 0 ? howmany * size : 0; 630 631 xbpp[i] = buf_get(vp, blkno + i + 1, sz); 632 if (xbpp[i] == NULL) { 633 for (++i; i < howmany; i++) { 634 SET(xbpp[i]->b_flags, B_INVAL); 635 brelse(xbpp[i]); 636 } 637 free(xbpp, M_TEMP, (howmany + 1) * sizeof(*xbpp)); 638 goto out; 639 } 640 } 641 642 bp = xbpp[0]; 643 644 xbpp[howmany] = NULL; 645 646 inc = btodb(size); 647 648 for (i = 1; i < howmany; i++) { 649 bcstats.pendingreads++; 650 bcstats.numreads++; 651 /* 652 * We set B_DMA here because bp above will be B_DMA, 653 * and we are playing buffer slice-n-dice games from 654 * the memory allocated in bp. 655 */ 656 SET(xbpp[i]->b_flags, B_DMA | B_READ | B_ASYNC); 657 xbpp[i]->b_blkno = sblkno + (i * inc); 658 xbpp[i]->b_bufsize = xbpp[i]->b_bcount = size; 659 xbpp[i]->b_data = NULL; 660 xbpp[i]->b_pobj = bp->b_pobj; 661 xbpp[i]->b_poffs = bp->b_poffs + (i * size); 662 } 663 664 KASSERT(bp->b_lblkno == blkno + 1); 665 KASSERT(bp->b_vp == vp); 666 667 bp->b_blkno = sblkno; 668 SET(bp->b_flags, B_READ | B_ASYNC | B_CALL); 669 670 bp->b_saveaddr = (void *)xbpp; 671 bp->b_iodone = bread_cluster_callback; 672 673 bcstats.pendingreads++; 674 bcstats.numreads++; 675 VOP_STRATEGY(bp->b_vp, bp); 676 curproc->p_ru.ru_inblock++; 677 678 out: 679 return (biowait(*rbpp)); 680 } 681 682 /* 683 * Block write. Described in Bach (p.56) 684 */ 685 int 686 bwrite(struct buf *bp) 687 { 688 int rv, async, wasdelayed, s; 689 struct vnode *vp; 690 struct mount *mp; 691 692 vp = bp->b_vp; 693 if (vp != NULL) 694 mp = vp->v_type == VBLK? vp->v_specmountpoint : vp->v_mount; 695 else 696 mp = NULL; 697 698 /* 699 * Remember buffer type, to switch on it later. 
If the write was 700 * synchronous, but the file system was mounted with MNT_ASYNC, 701 * convert it to a delayed write. 702 * XXX note that this relies on delayed tape writes being converted 703 * to async, not sync writes (which is safe, but ugly). 704 */ 705 async = ISSET(bp->b_flags, B_ASYNC); 706 if (!async && mp && ISSET(mp->mnt_flag, MNT_ASYNC)) { 707 /* 708 * Don't convert writes from VND on async filesystems 709 * that already have delayed writes in the upper layer. 710 */ 711 if (!ISSET(bp->b_flags, B_NOCACHE)) { 712 bdwrite(bp); 713 return (0); 714 } 715 } 716 717 /* 718 * Collect statistics on synchronous and asynchronous writes. 719 * Writes to block devices are charged to their associated 720 * filesystem (if any). 721 */ 722 if (mp != NULL) { 723 if (async) 724 mp->mnt_stat.f_asyncwrites++; 725 else 726 mp->mnt_stat.f_syncwrites++; 727 } 728 bcstats.pendingwrites++; 729 bcstats.numwrites++; 730 731 wasdelayed = ISSET(bp->b_flags, B_DELWRI); 732 CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI)); 733 734 s = splbio(); 735 736 /* 737 * If not synchronous, pay for the I/O operation and make 738 * sure the buf is on the correct vnode queue. We have 739 * to do this now, because if we don't, the vnode may not 740 * be properly notified that its I/O has completed. 741 */ 742 if (wasdelayed) { 743 reassignbuf(bp); 744 } else 745 curproc->p_ru.ru_oublock++; 746 747 748 /* Initiate disk write. Make sure the appropriate party is charged. */ 749 bp->b_vp->v_numoutput++; 750 buf_flip_dma(bp); 751 SET(bp->b_flags, B_WRITEINPROG); 752 splx(s); 753 VOP_STRATEGY(bp->b_vp, bp); 754 755 /* 756 * If the queue is above the high water mark, wait till 757 * the number of outstanding write bufs drops below the low 758 * water mark. 759 */ 760 if (bp->b_bq) 761 bufq_wait(bp->b_bq); 762 763 if (async) 764 return (0); 765 766 /* 767 * If I/O was synchronous, wait for it to complete. 768 */ 769 rv = biowait(bp); 770 771 /* Release the buffer. */ 772 brelse(bp); 773 774 return (rv); 775 } 776 777 778 /* 779 * Delayed write. 780 * 781 * The buffer is marked dirty, but is not queued for I/O. 782 * This routine should be used when the buffer is expected 783 * to be modified again soon, typically a small write that 784 * partially fills a buffer. 785 * 786 * NB: magnetic tapes cannot be delayed; they must be 787 * written in the order that the writes are requested. 788 * 789 * Described in Leffler, et al. (pp. 208-213). 790 */ 791 void 792 bdwrite(struct buf *bp) 793 { 794 int s; 795 796 /* 797 * If the block hasn't been seen before: 798 * (1) Mark it as having been seen, 799 * (2) Charge for the write. 800 * (3) Make sure it's on its vnode's correct block list, 801 * (4) If a buffer is rewritten, move it to end of dirty list 802 */ 803 if (!ISSET(bp->b_flags, B_DELWRI)) { 804 SET(bp->b_flags, B_DELWRI); 805 s = splbio(); 806 buf_flip_dma(bp); 807 reassignbuf(bp); 808 splx(s); 809 curproc->p_ru.ru_oublock++; /* XXX */ 810 } 811 812 /* The "write" is done, so mark and release the buffer. */ 813 CLR(bp->b_flags, B_NEEDCOMMIT); 814 CLR(bp->b_flags, B_NOCACHE); /* Must cache delayed writes */ 815 SET(bp->b_flags, B_DONE); 816 brelse(bp); 817 } 818 819 /* 820 * Asynchronous block write; just an asynchronous bwrite(). 
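 *
 * Of the three write paths in this file, bwrite() starts the I/O and,
 * unless B_ASYNC is set, waits for it with biowait(); bdwrite() only
 * marks the buffer B_DELWRI and releases it without starting any I/O;
 * bawrite() sets B_ASYNC and hands the buffer to VOP_BWRITE(), so it
 * returns as soon as the write has been queued. The cleaner daemon
 * below uses bawrite() to push dirty buffers without stalling on each
 * one.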
821 */ 822 void 823 bawrite(struct buf *bp) 824 { 825 826 SET(bp->b_flags, B_ASYNC); 827 VOP_BWRITE(bp); 828 } 829 830 /* 831 * Must be called at splbio() 832 */ 833 void 834 buf_dirty(struct buf *bp) 835 { 836 splassert(IPL_BIO); 837 838 #ifdef DIAGNOSTIC 839 if (!ISSET(bp->b_flags, B_BUSY)) 840 panic("Trying to dirty buffer on freelist!"); 841 #endif 842 843 if (ISSET(bp->b_flags, B_DELWRI) == 0) { 844 SET(bp->b_flags, B_DELWRI); 845 buf_flip_dma(bp); 846 reassignbuf(bp); 847 } 848 } 849 850 /* 851 * Must be called at splbio() 852 */ 853 void 854 buf_undirty(struct buf *bp) 855 { 856 splassert(IPL_BIO); 857 858 #ifdef DIAGNOSTIC 859 if (!ISSET(bp->b_flags, B_BUSY)) 860 panic("Trying to undirty buffer on freelist!"); 861 #endif 862 if (ISSET(bp->b_flags, B_DELWRI)) { 863 CLR(bp->b_flags, B_DELWRI); 864 reassignbuf(bp); 865 } 866 } 867 868 /* 869 * Release a buffer on to the free lists. 870 * Described in Bach (p. 46). 871 */ 872 void 873 brelse(struct buf *bp) 874 { 875 int s; 876 877 s = splbio(); 878 879 if (bp->b_data != NULL) 880 KASSERT(bp->b_bufsize > 0); 881 882 /* 883 * softdep is basically incompatible with not caching buffers 884 * that have dependencies, so this buffer must be cached 885 */ 886 if (LIST_FIRST(&bp->b_dep) != NULL) 887 CLR(bp->b_flags, B_NOCACHE); 888 889 /* 890 * Determine which queue the buffer should be on, then put it there. 891 */ 892 893 /* If it's not cacheable, or an error, mark it invalid. */ 894 if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR))) 895 SET(bp->b_flags, B_INVAL); 896 /* If it's a write error, also mark the vnode as damaged. */ 897 if (ISSET(bp->b_flags, B_ERROR) && !ISSET(bp->b_flags, B_READ)) { 898 if (bp->b_vp && bp->b_vp->v_type == VREG) 899 SET(bp->b_vp->v_bioflag, VBIOERROR); 900 } 901 902 if (ISSET(bp->b_flags, B_INVAL)) { 903 /* 904 * If the buffer is invalid, free it now rather than leaving 905 * it in a queue and wasting memory. 906 */ 907 if (LIST_FIRST(&bp->b_dep) != NULL) 908 buf_deallocate(bp); 909 910 if (ISSET(bp->b_flags, B_DELWRI)) { 911 CLR(bp->b_flags, B_DELWRI); 912 } 913 914 if (bp->b_vp) { 915 RBT_REMOVE(buf_rb_bufs, &bp->b_vp->v_bufs_tree, bp); 916 brelvp(bp); 917 } 918 bp->b_vp = NULL; 919 920 /* 921 * Wake up any processes waiting for _this_ buffer to 922 * become free. They are not allowed to grab it 923 * since it will be freed. But the only sleeper is 924 * getblk and it will restart the operation after 925 * sleep. 926 */ 927 if (ISSET(bp->b_flags, B_WANTED)) { 928 CLR(bp->b_flags, B_WANTED); 929 wakeup(bp); 930 } 931 buf_put(bp); 932 } else { 933 /* 934 * It has valid data. Put it on the end of the appropriate 935 * queue, so that it'll stick around for as long as possible. 936 */ 937 bufcache_release(bp); 938 939 /* Unlock the buffer. */ 940 CLR(bp->b_flags, (B_AGE | B_ASYNC | B_NOCACHE | B_DEFERRED)); 941 buf_release(bp); 942 943 /* Wake up any processes waiting for _this_ buffer to 944 * become free. */ 945 if (ISSET(bp->b_flags, B_WANTED)) { 946 CLR(bp->b_flags, B_WANTED); 947 wakeup(bp); 948 } 949 950 if (bcstats.dmapages > targetpages) 951 (void) bufcache_recover_dmapages(0, 952 bcstats.dmapages - targetpages); 953 bufcache_adjust(); 954 } 955 956 /* Wake up syncer and cleaner processes waiting for buffers. */ 957 if (nobuffers) { 958 nobuffers = 0; 959 wakeup(&nobuffers); 960 } 961 962 /* Wake up any processes waiting for any buffer to become free. 
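	 * Such waiters are buf_get() callers that bumped "needbuffer" and
	 * went to sleep because the cache was over its target; only wake
	 * them once we are back under targetpages with kva slots to spare.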
*/ 963 if (needbuffer && bcstats.dmapages < targetpages && 964 bcstats.kvaslots_avail > RESERVE_SLOTS) { 965 needbuffer = 0; 966 wakeup(&needbuffer); 967 } 968 969 splx(s); 970 } 971 972 /* 973 * Determine if a block is in the cache. Just look on what would be its hash 974 * chain. If it's there, return a pointer to it, unless it's marked invalid. 975 */ 976 static struct buf * 977 incore_locked(struct vnode *vp, daddr_t blkno) 978 { 979 struct buf *bp; 980 struct buf b; 981 982 splassert(IPL_BIO); 983 984 /* Search buf lookup tree */ 985 b.b_lblkno = blkno; 986 bp = RBT_FIND(buf_rb_bufs, &vp->v_bufs_tree, &b); 987 if (bp != NULL && ISSET(bp->b_flags, B_INVAL)) 988 bp = NULL; 989 990 return (bp); 991 } 992 struct buf * 993 incore(struct vnode *vp, daddr_t blkno) 994 { 995 struct buf *bp; 996 int s; 997 998 s = splbio(); 999 bp = incore_locked(vp, blkno); 1000 splx(s); 1001 1002 return (bp); 1003 } 1004 1005 /* 1006 * Get a block of requested size that is associated with 1007 * a given vnode and block offset. If it is found in the 1008 * block cache, mark it as having been found, make it busy 1009 * and return it. Otherwise, return an empty block of the 1010 * correct size. It is up to the caller to ensure that the 1011 * cached blocks be of the correct size. 1012 */ 1013 struct buf * 1014 getblk(struct vnode *vp, daddr_t blkno, int size, int slpflag, 1015 uint64_t slptimeo) 1016 { 1017 struct buf *bp; 1018 struct buf b; 1019 int s, error; 1020 1021 /* 1022 * XXX 1023 * The following is an inlined version of 'incore()', but with 1024 * the 'invalid' test moved to after the 'busy' test. It's 1025 * necessary because there are some cases in which the NFS 1026 * code sets B_INVAL prior to writing data to the server, but 1027 * in which the buffers actually contain valid data. In this 1028 * case, we can't allow the system to allocate a new buffer for 1029 * the block until the write is finished. 1030 */ 1031 start: 1032 s = splbio(); 1033 b.b_lblkno = blkno; 1034 bp = RBT_FIND(buf_rb_bufs, &vp->v_bufs_tree, &b); 1035 if (bp != NULL) { 1036 if (ISSET(bp->b_flags, B_BUSY)) { 1037 SET(bp->b_flags, B_WANTED); 1038 error = tsleep_nsec(bp, slpflag | (PRIBIO + 1), 1039 "getblk", slptimeo); 1040 splx(s); 1041 if (error) 1042 return (NULL); 1043 goto start; 1044 } 1045 1046 if (!ISSET(bp->b_flags, B_INVAL)) { 1047 bcstats.cachehits++; 1048 SET(bp->b_flags, B_CACHE); 1049 bufcache_take(bp); 1050 buf_acquire(bp); 1051 splx(s); 1052 return (bp); 1053 } 1054 } 1055 splx(s); 1056 1057 if ((bp = buf_get(vp, blkno, size)) == NULL) 1058 goto start; 1059 1060 return (bp); 1061 } 1062 1063 /* 1064 * Get an empty, disassociated buffer of given size. 1065 */ 1066 struct buf * 1067 geteblk(size_t size) 1068 { 1069 struct buf *bp; 1070 1071 while ((bp = buf_get(NULL, 0, size)) == NULL) 1072 continue; 1073 1074 return (bp); 1075 } 1076 1077 /* 1078 * Allocate a buffer. 1079 * If vp is given, put it into the buffer cache for that vnode. 1080 * If size != 0, allocate memory and call buf_map(). 1081 * If there is already a buffer for the given vnode/blkno, return NULL. 1082 */ 1083 struct buf * 1084 buf_get(struct vnode *vp, daddr_t blkno, size_t size) 1085 { 1086 struct buf *bp; 1087 int poolwait = size == 0 ? PR_NOWAIT : PR_WAITOK; 1088 int npages; 1089 int s; 1090 1091 s = splbio(); 1092 if (size) { 1093 /* 1094 * Wake up the cleaner if we have lots of dirty pages, 1095 * or if we are getting low on buffer cache kva. 
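		 * (The cleaner, buf_daemon() below, sleeps on bd_req; this
		 * wakeup is what prods it into pushing delayed writes back
		 * to disk.)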
1096 */ 1097 if (UNCLEAN_PAGES >= hidirtypages || 1098 bcstats.kvaslots_avail <= 2 * RESERVE_SLOTS) 1099 wakeup(&bd_req); 1100 1101 npages = atop(round_page(size)); 1102 1103 /* 1104 * if our cache has been previously shrunk, 1105 * allow it to grow again with use up to 1106 * bufhighpages (cachepercent) 1107 */ 1108 if (bufpages < bufhighpages) 1109 bufadjust(bufhighpages); 1110 1111 /* 1112 * If we would go over the page target with our 1113 * new allocation, free enough buffers first 1114 * to stay at the target with our new allocation. 1115 */ 1116 if (bcstats.dmapages + npages > targetpages) { 1117 (void) bufcache_recover_dmapages(0, npages); 1118 bufcache_adjust(); 1119 } 1120 1121 /* 1122 * If we get here, we tried to free the world down 1123 * above, and couldn't get down - Wake the cleaner 1124 * and wait for it to push some buffers out. 1125 */ 1126 if ((bcstats.dmapages + npages > targetpages || 1127 bcstats.kvaslots_avail <= RESERVE_SLOTS) && 1128 curproc != syncerproc && curproc != cleanerproc) { 1129 wakeup(&bd_req); 1130 needbuffer++; 1131 tsleep_nsec(&needbuffer, PRIBIO, "needbuffer", INFSLP); 1132 splx(s); 1133 return (NULL); 1134 } 1135 if (bcstats.dmapages + npages > bufpages) { 1136 /* cleaner or syncer */ 1137 nobuffers = 1; 1138 tsleep_nsec(&nobuffers, PRIBIO, "nobuffers", INFSLP); 1139 splx(s); 1140 return (NULL); 1141 } 1142 } 1143 1144 bp = pool_get(&bufpool, poolwait|PR_ZERO); 1145 1146 if (bp == NULL) { 1147 splx(s); 1148 return (NULL); 1149 } 1150 1151 bp->b_freelist.tqe_next = NOLIST; 1152 bp->b_dev = NODEV; 1153 LIST_INIT(&bp->b_dep); 1154 bp->b_bcount = size; 1155 1156 buf_acquire_nomap(bp); 1157 1158 if (vp != NULL) { 1159 /* 1160 * We insert the buffer into the hash with B_BUSY set 1161 * while we allocate pages for it. This way any getblk 1162 * that happens while we allocate pages will wait for 1163 * this buffer instead of starting its own buf_get. 1164 * 1165 * But first, we check if someone beat us to it. 1166 */ 1167 if (incore_locked(vp, blkno)) { 1168 pool_put(&bufpool, bp); 1169 splx(s); 1170 return (NULL); 1171 } 1172 1173 bp->b_blkno = bp->b_lblkno = blkno; 1174 bgetvp(vp, bp); 1175 if (RBT_INSERT(buf_rb_bufs, &vp->v_bufs_tree, bp)) 1176 panic("buf_get: dup lblk vp %p bp %p", vp, bp); 1177 } else { 1178 bp->b_vnbufs.le_next = NOLIST; 1179 SET(bp->b_flags, B_INVAL); 1180 bp->b_vp = NULL; 1181 } 1182 1183 LIST_INSERT_HEAD(&bufhead, bp, b_list); 1184 bcstats.numbufs++; 1185 1186 if (size) { 1187 buf_alloc_pages(bp, round_page(size)); 1188 KASSERT(ISSET(bp->b_flags, B_DMA)); 1189 buf_map(bp); 1190 } 1191 1192 SET(bp->b_flags, B_BC); 1193 splx(s); 1194 1195 return (bp); 1196 } 1197 1198 /* 1199 * Buffer cleaning daemon. 1200 */ 1201 void 1202 buf_daemon(void *arg) 1203 { 1204 struct buf *bp = NULL; 1205 int s, pushed = 0; 1206 1207 s = splbio(); 1208 for (;;) { 1209 if (bp == NULL || (pushed >= 16 && 1210 UNCLEAN_PAGES < hidirtypages && 1211 bcstats.kvaslots_avail > 2 * RESERVE_SLOTS)){ 1212 pushed = 0; 1213 /* 1214 * Wake up anyone who was waiting for buffers 1215 * to be released. 
1216 */ 1217 if (needbuffer) { 1218 needbuffer = 0; 1219 wakeup(&needbuffer); 1220 } 1221 tsleep_nsec(&bd_req, PRIBIO - 7, "cleaner", INFSLP); 1222 } 1223 1224 while ((bp = bufcache_getdirtybuf())) { 1225 TRACEPOINT(vfs, cleaner, bp->b_flags, pushed, 1226 lodirtypages, hidirtypages); 1227 1228 if (UNCLEAN_PAGES < lodirtypages && 1229 bcstats.kvaslots_avail > 2 * RESERVE_SLOTS && 1230 pushed >= 16) 1231 break; 1232 1233 bufcache_take(bp); 1234 buf_acquire(bp); 1235 splx(s); 1236 1237 if (ISSET(bp->b_flags, B_INVAL)) { 1238 brelse(bp); 1239 s = splbio(); 1240 continue; 1241 } 1242 #ifdef DIAGNOSTIC 1243 if (!ISSET(bp->b_flags, B_DELWRI)) 1244 panic("Clean buffer on dirty queue"); 1245 #endif 1246 if (LIST_FIRST(&bp->b_dep) != NULL && 1247 !ISSET(bp->b_flags, B_DEFERRED) && 1248 buf_countdeps(bp, 0, 0)) { 1249 SET(bp->b_flags, B_DEFERRED); 1250 s = splbio(); 1251 bufcache_release(bp); 1252 buf_release(bp); 1253 continue; 1254 } 1255 1256 bawrite(bp); 1257 pushed++; 1258 1259 sched_pause(yield); 1260 1261 s = splbio(); 1262 } 1263 } 1264 } 1265 1266 /* 1267 * Wait for operations on the buffer to complete. 1268 * When they do, extract and return the I/O's error value. 1269 */ 1270 int 1271 biowait(struct buf *bp) 1272 { 1273 int s; 1274 1275 KASSERT(!(bp->b_flags & B_ASYNC)); 1276 1277 s = splbio(); 1278 while (!ISSET(bp->b_flags, B_DONE)) 1279 tsleep_nsec(bp, PRIBIO + 1, "biowait", INFSLP); 1280 splx(s); 1281 1282 /* check for interruption of I/O (e.g. via NFS), then errors. */ 1283 if (ISSET(bp->b_flags, B_EINTR)) { 1284 CLR(bp->b_flags, B_EINTR); 1285 return (EINTR); 1286 } 1287 1288 if (ISSET(bp->b_flags, B_ERROR)) 1289 return (bp->b_error ? bp->b_error : EIO); 1290 else 1291 return (0); 1292 } 1293 1294 /* 1295 * Mark I/O complete on a buffer. 1296 * 1297 * If a callback has been requested, e.g. the pageout 1298 * daemon, do so. Otherwise, awaken waiting processes. 1299 * 1300 * [ Leffler, et al., says on p.247: 1301 * "This routine wakes up the blocked process, frees the buffer 1302 * for an asynchronous write, or, for a request by the pagedaemon 1303 * process, invokes a procedure specified in the buffer structure" ] 1304 * 1305 * In real life, the pagedaemon (or other system processes) wants 1306 * to do async stuff to, and doesn't want the buffer brelse()'d. 1307 * (for swap pager, that puts swap buffers on the free lists (!!!), 1308 * for the vn device, that puts malloc'd buffers on the free lists!) 1309 * 1310 * Must be called at splbio(). 
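 *
 * A minimal sketch of a completion path (illustrative only; the real
 * callers are the drivers' strategy/interrupt code, not this file):
 *
 *	s = splbio();
 *	if (error) {
 *		bp->b_error = error;
 *		SET(bp->b_flags, B_ERROR);
 *	}
 *	bp->b_resid = 0;	(or the number of bytes not transferred)
 *	biodone(bp);
 *	splx(s);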
1311 */ 1312 void 1313 biodone(struct buf *bp) 1314 { 1315 splassert(IPL_BIO); 1316 1317 if (ISSET(bp->b_flags, B_DONE)) 1318 panic("biodone already"); 1319 SET(bp->b_flags, B_DONE); /* note that it's done */ 1320 1321 if (bp->b_bq) 1322 bufq_done(bp->b_bq, bp); 1323 1324 if (LIST_FIRST(&bp->b_dep) != NULL) 1325 buf_complete(bp); 1326 1327 if (!ISSET(bp->b_flags, B_READ)) { 1328 CLR(bp->b_flags, B_WRITEINPROG); 1329 vwakeup(bp->b_vp); 1330 } 1331 if (bcstats.numbufs && 1332 (!(ISSET(bp->b_flags, B_RAW) || ISSET(bp->b_flags, B_PHYS)))) { 1333 if (!ISSET(bp->b_flags, B_READ)) { 1334 bcstats.pendingwrites--; 1335 } else 1336 bcstats.pendingreads--; 1337 } 1338 if (ISSET(bp->b_flags, B_CALL)) { /* if necessary, call out */ 1339 CLR(bp->b_flags, B_CALL); /* but note callout done */ 1340 (*bp->b_iodone)(bp); 1341 } else { 1342 if (ISSET(bp->b_flags, B_ASYNC)) {/* if async, release it */ 1343 brelse(bp); 1344 } else { /* or just wakeup the buffer */ 1345 CLR(bp->b_flags, B_WANTED); 1346 wakeup(bp); 1347 } 1348 } 1349 } 1350 1351 #ifdef DDB 1352 void bcstats_print(int (*)(const char *, ...) 1353 __attribute__((__format__(__kprintf__,1,2)))); 1354 /* 1355 * bcstats_print: ddb hook to print interesting buffer cache counters 1356 */ 1357 void 1358 bcstats_print( 1359 int (*pr)(const char *, ...) __attribute__((__format__(__kprintf__,1,2)))) 1360 { 1361 (*pr)("Current Buffer Cache status:\n"); 1362 (*pr)("numbufs %lld busymapped %lld, delwri %lld\n", 1363 bcstats.numbufs, bcstats.busymapped, bcstats.delwribufs); 1364 (*pr)("kvaslots %lld avail kva slots %lld\n", 1365 bcstats.kvaslots, bcstats.kvaslots_avail); 1366 (*pr)("bufpages %lld, dmapages %lld, dirtypages %lld\n", 1367 bcstats.numbufpages, bcstats.dmapages, bcstats.numdirtypages); 1368 (*pr)("pendingreads %lld, pendingwrites %lld\n", 1369 bcstats.pendingreads, bcstats.pendingwrites); 1370 (*pr)("highflips %lld, highflops %lld, dmaflips %lld\n", 1371 bcstats.highflips, bcstats.highflops, bcstats.dmaflips); 1372 } 1373 #endif 1374 1375 void 1376 buf_adjcnt(struct buf *bp, long ncount) 1377 { 1378 KASSERT(ncount <= bp->b_bufsize); 1379 bp->b_bcount = ncount; 1380 } 1381 1382 /* bufcache freelist code below */ 1383 /* 1384 * Copyright (c) 2014 Ted Unangst <tedu@openbsd.org> 1385 * 1386 * Permission to use, copy, modify, and distribute this software for any 1387 * purpose with or without fee is hereby granted, provided that the above 1388 * copyright notice and this permission notice appear in all copies. 1389 * 1390 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 1391 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 1392 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 1393 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 1394 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 1395 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 1396 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 1397 */ 1398 1399 /* 1400 * The code below implements a variant of the 2Q buffer cache algorithm by 1401 * Johnson and Shasha. 1402 * 1403 * General Outline 1404 * We divide the buffer cache into three working sets: current, previous, 1405 * and long term. Each list is itself LRU and buffers get promoted and moved 1406 * around between them. A buffer starts its life in the current working set. 
1407 * As time passes and newer buffers push it out, it will turn into the previous 1408 * working set and is subject to recycling. But if it's accessed again from 1409 * the previous working set, that's an indication that it's actually in the 1410 * long term working set, so we promote it there. The separation of current 1411 * and previous working sets prevents us from promoting a buffer that's only 1412 * temporarily hot to the long term cache. 1413 * 1414 * The objective is to provide scan resistance by making the long term 1415 * working set ineligible for immediate recycling, even as the current 1416 * working set is rapidly turned over. 1417 * 1418 * Implementation 1419 * The code below identifies the current, previous, and long term sets as 1420 * hotqueue, coldqueue, and warmqueue. The hot and warm queues are capped at 1421 * 1/3 of the total clean pages, after which point they start pushing their 1422 * oldest buffers into coldqueue. 1423 * A buf always starts out with neither WARM or COLD flags set (implying HOT). 1424 * When released, it will be returned to the tail of the hotqueue list. 1425 * When the hotqueue gets too large, the oldest hot buf will be moved to the 1426 * coldqueue, with the B_COLD flag set. When a cold buf is released, we set 1427 * the B_WARM flag and put it onto the warmqueue. Warm bufs are also 1428 * directly returned to the end of the warmqueue. As with the hotqueue, when 1429 * the warmqueue grows too large, B_WARM bufs are moved onto the coldqueue. 1430 * 1431 * Note that this design does still support large working sets, greater 1432 * than the cap of hotqueue or warmqueue would imply. The coldqueue is still 1433 * cached and has no maximum length. The hot and warm queues form a Y feeding 1434 * into the coldqueue. Moving bufs between queues is constant time, so this 1435 * design decays to one long warm->cold queue. 1436 * 1437 * In the 2Q paper, hotqueue and coldqueue are A1in and A1out. The warmqueue 1438 * is Am. We always cache pages, as opposed to pointers to pages for A1. 1439 * 1440 * This implementation adds support for multiple 2q caches. 1441 * 1442 * If we have more than one 2q cache, as bufs fall off the cold queue 1443 * for recycling, bufs that have been warm before (which retain the 1444 * B_WARM flag in addition to B_COLD) can be put into the hot queue of 1445 * a second level 2Q cache. buffers which are only B_COLD are 1446 * recycled. Bufs falling off the last cache's cold queue are always 1447 * recycled. 1448 * 1449 */ 1450 1451 /* 1452 * this function is called when a hot or warm queue may have exceeded its 1453 * size limit. it will move a buf to the coldqueue. 1454 */ 1455 int chillbufs(struct 1456 bufcache *cache, struct bufqueue *queue, int64_t *queuepages); 1457 1458 void 1459 bufcache_init(void) 1460 { 1461 int i; 1462 1463 for (i = 0; i < NUM_CACHES; i++) { 1464 TAILQ_INIT(&cleancache[i].hotqueue); 1465 TAILQ_INIT(&cleancache[i].coldqueue); 1466 TAILQ_INIT(&cleancache[i].warmqueue); 1467 } 1468 TAILQ_INIT(&dirtyqueue); 1469 } 1470 1471 /* 1472 * if the buffer caches have shrunk, we may need to rebalance our queues. 1473 */ 1474 void 1475 bufcache_adjust(void) 1476 { 1477 int i; 1478 1479 for (i = 0; i < NUM_CACHES; i++) { 1480 while (chillbufs(&cleancache[i], &cleancache[i].warmqueue, 1481 &cleancache[i].warmbufpages) || 1482 chillbufs(&cleancache[i], &cleancache[i].hotqueue, 1483 &cleancache[i].hotbufpages)) 1484 continue; 1485 } 1486 } 1487 1488 /* 1489 * Get a clean buffer from the cache. 
if "discard" is set do not promote 1490 * previously warm buffers as normal, because we are tossing everything 1491 * away such as in a hibernation 1492 */ 1493 struct buf * 1494 bufcache_getcleanbuf(int cachenum, int discard) 1495 { 1496 struct buf *bp = NULL; 1497 struct bufcache *cache = &cleancache[cachenum]; 1498 struct bufqueue * queue; 1499 1500 splassert(IPL_BIO); 1501 1502 /* try cold queue */ 1503 while ((bp = TAILQ_FIRST(&cache->coldqueue)) || 1504 (bp = TAILQ_FIRST(&cache->warmqueue)) || 1505 (bp = TAILQ_FIRST(&cache->hotqueue))) { 1506 int64_t pages = atop(bp->b_bufsize); 1507 struct bufcache *newcache; 1508 1509 if (discard || cachenum >= NUM_CACHES - 1) { 1510 /* Victim selected, give it up */ 1511 return bp; 1512 } 1513 KASSERT(bp->cache == cachenum); 1514 1515 /* 1516 * If this buffer was warm before, move it to 1517 * the hot queue in the next cache 1518 */ 1519 1520 if (fliphigh) { 1521 /* 1522 * If we are in the DMA cache, try to flip the 1523 * buffer up high to move it on to the other 1524 * caches. if we can't move the buffer to high 1525 * memory without sleeping, we give it up and 1526 * return it rather than fight for more memory 1527 * against non buffer cache competitors. 1528 */ 1529 SET(bp->b_flags, B_BUSY); 1530 if (bp->cache == 0 && buf_flip_high(bp) == -1) { 1531 CLR(bp->b_flags, B_BUSY); 1532 return bp; 1533 } 1534 CLR(bp->b_flags, B_BUSY); 1535 } 1536 1537 /* Move the buffer to the hot queue in the next cache */ 1538 if (ISSET(bp->b_flags, B_COLD)) { 1539 queue = &cache->coldqueue; 1540 } else if (ISSET(bp->b_flags, B_WARM)) { 1541 queue = &cache->warmqueue; 1542 cache->warmbufpages -= pages; 1543 } else { 1544 queue = &cache->hotqueue; 1545 cache->hotbufpages -= pages; 1546 } 1547 TAILQ_REMOVE(queue, bp, b_freelist); 1548 cache->cachepages -= pages; 1549 CLR(bp->b_flags, B_WARM); 1550 CLR(bp->b_flags, B_COLD); 1551 bp->cache++; 1552 newcache= &cleancache[bp->cache]; 1553 newcache->cachepages += pages; 1554 newcache->hotbufpages += pages; 1555 chillbufs(newcache, &newcache->hotqueue, 1556 &newcache->hotbufpages); 1557 TAILQ_INSERT_TAIL(&newcache->hotqueue, bp, b_freelist); 1558 } 1559 return bp; 1560 } 1561 1562 1563 void 1564 discard_buffer(struct buf *bp) 1565 { 1566 splassert(IPL_BIO); 1567 1568 bufcache_take(bp); 1569 if (bp->b_vp) { 1570 RBT_REMOVE(buf_rb_bufs, 1571 &bp->b_vp->v_bufs_tree, bp); 1572 brelvp(bp); 1573 } 1574 buf_put(bp); 1575 } 1576 1577 int64_t 1578 bufcache_recover_dmapages(int discard, int64_t howmany) 1579 { 1580 struct buf *bp = NULL; 1581 struct bufcache *cache = &cleancache[DMA_CACHE]; 1582 struct bufqueue * queue; 1583 int64_t recovered = 0; 1584 1585 splassert(IPL_BIO); 1586 1587 while ((recovered < howmany) && 1588 ((bp = TAILQ_FIRST(&cache->coldqueue)) || 1589 (bp = TAILQ_FIRST(&cache->warmqueue)) || 1590 (bp = TAILQ_FIRST(&cache->hotqueue)))) { 1591 int64_t pages = atop(bp->b_bufsize); 1592 struct bufcache *newcache; 1593 1594 if (discard || DMA_CACHE >= NUM_CACHES - 1) { 1595 discard_buffer(bp); 1596 continue; 1597 } 1598 KASSERT(bp->cache == DMA_CACHE); 1599 1600 /* 1601 * If this buffer was warm before, move it to 1602 * the hot queue in the next cache 1603 */ 1604 1605 /* 1606 * One way or another, the pages for this 1607 * buffer are leaving DMA memory 1608 */ 1609 recovered += pages; 1610 1611 if (!fliphigh) { 1612 discard_buffer(bp); 1613 continue; 1614 } 1615 1616 /* 1617 * If we are in the DMA cache, try to flip the 1618 * buffer up high to move it on to the other 1619 * caches. 
if we can't move the buffer to high 1620 * memory without sleeping, we give it up 1621 * now rather than fight for more memory 1622 * against non buffer cache competitors. 1623 */ 1624 SET(bp->b_flags, B_BUSY); 1625 if (bp->cache == 0 && buf_flip_high(bp) == -1) { 1626 CLR(bp->b_flags, B_BUSY); 1627 discard_buffer(bp); 1628 continue; 1629 } 1630 CLR(bp->b_flags, B_BUSY); 1631 1632 /* 1633 * Move the buffer to the hot queue in the next cache 1634 */ 1635 if (ISSET(bp->b_flags, B_COLD)) { 1636 queue = &cache->coldqueue; 1637 } else if (ISSET(bp->b_flags, B_WARM)) { 1638 queue = &cache->warmqueue; 1639 cache->warmbufpages -= pages; 1640 } else { 1641 queue = &cache->hotqueue; 1642 cache->hotbufpages -= pages; 1643 } 1644 TAILQ_REMOVE(queue, bp, b_freelist); 1645 cache->cachepages -= pages; 1646 CLR(bp->b_flags, B_WARM); 1647 CLR(bp->b_flags, B_COLD); 1648 bp->cache++; 1649 newcache= &cleancache[bp->cache]; 1650 newcache->cachepages += pages; 1651 newcache->hotbufpages += pages; 1652 chillbufs(newcache, &newcache->hotqueue, 1653 &newcache->hotbufpages); 1654 TAILQ_INSERT_TAIL(&newcache->hotqueue, bp, b_freelist); 1655 } 1656 return recovered; 1657 } 1658 1659 struct buf * 1660 bufcache_getcleanbuf_range(int start, int end, int discard) 1661 { 1662 int i, j = start, q = end; 1663 struct buf *bp = NULL; 1664 1665 /* 1666 * XXX in theory we could promote warm buffers into a previous queue 1667 * so in the pathological case of where we go through all the caches 1668 * without getting a buffer we have to start at the beginning again. 1669 */ 1670 while (j <= q) { 1671 for (i = q; i >= j; i--) 1672 if ((bp = bufcache_getcleanbuf(i, discard))) 1673 return (bp); 1674 j++; 1675 } 1676 return bp; 1677 } 1678 1679 struct buf * 1680 bufcache_gethighcleanbuf(void) 1681 { 1682 if (!fliphigh) 1683 return NULL; 1684 return bufcache_getcleanbuf_range(DMA_CACHE + 1, NUM_CACHES - 1, 0); 1685 } 1686 1687 1688 struct buf * 1689 bufcache_getdmacleanbuf(void) 1690 { 1691 if (fliphigh) 1692 return bufcache_getcleanbuf_range(DMA_CACHE, DMA_CACHE, 0); 1693 return bufcache_getcleanbuf_range(DMA_CACHE, NUM_CACHES - 1, 0); 1694 } 1695 1696 1697 struct buf * 1698 bufcache_getdirtybuf(void) 1699 { 1700 return TAILQ_FIRST(&dirtyqueue); 1701 } 1702 1703 void 1704 bufcache_take(struct buf *bp) 1705 { 1706 struct bufqueue *queue; 1707 int64_t pages; 1708 1709 splassert(IPL_BIO); 1710 KASSERT(ISSET(bp->b_flags, B_BC)); 1711 KASSERT(bp->cache >= DMA_CACHE); 1712 KASSERT((bp->cache < NUM_CACHES)); 1713 1714 pages = atop(bp->b_bufsize); 1715 1716 TRACEPOINT(vfs, bufcache_take, bp->b_flags, bp->cache, pages); 1717 1718 struct bufcache *cache = &cleancache[bp->cache]; 1719 if (!ISSET(bp->b_flags, B_DELWRI)) { 1720 if (ISSET(bp->b_flags, B_COLD)) { 1721 queue = &cache->coldqueue; 1722 } else if (ISSET(bp->b_flags, B_WARM)) { 1723 queue = &cache->warmqueue; 1724 cache->warmbufpages -= pages; 1725 } else { 1726 queue = &cache->hotqueue; 1727 cache->hotbufpages -= pages; 1728 } 1729 bcstats.numcleanpages -= pages; 1730 cache->cachepages -= pages; 1731 } else { 1732 queue = &dirtyqueue; 1733 bcstats.numdirtypages -= pages; 1734 bcstats.delwribufs--; 1735 } 1736 TAILQ_REMOVE(queue, bp, b_freelist); 1737 } 1738 1739 /* move buffers from a hot or warm queue to a cold queue in a cache */ 1740 int 1741 chillbufs(struct bufcache *cache, struct bufqueue *queue, int64_t *queuepages) 1742 { 1743 struct buf *bp; 1744 int64_t limit, pages; 1745 1746 /* 1747 * We limit the hot queue to be small, with a max of 4096 pages. 
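 * (For example, with 40,000 clean pages in a cache the hot queue is
 * capped at min(40000 / 20, 4096) = 2,000 pages.)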
1748 * We limit the warm queue to half the cache size. 1749 * 1750 * We impose a minimum size of 96 to prevent too much "wobbling". 1751 */ 1752 if (queue == &cache->hotqueue) 1753 limit = min(cache->cachepages / 20, 4096); 1754 else if (queue == &cache->warmqueue) 1755 limit = (cache->cachepages / 2); 1756 else 1757 panic("chillbufs: invalid queue"); 1758 1759 if (*queuepages > 96 && *queuepages > limit) { 1760 bp = TAILQ_FIRST(queue); 1761 if (!bp) 1762 panic("inconsistent bufpage counts"); 1763 pages = atop(bp->b_bufsize); 1764 *queuepages -= pages; 1765 TAILQ_REMOVE(queue, bp, b_freelist); 1766 /* we do not clear B_WARM */ 1767 SET(bp->b_flags, B_COLD); 1768 TAILQ_INSERT_TAIL(&cache->coldqueue, bp, b_freelist); 1769 return 1; 1770 } 1771 return 0; 1772 } 1773 1774 void 1775 bufcache_release(struct buf *bp) 1776 { 1777 struct bufqueue *queue; 1778 int64_t pages; 1779 struct bufcache *cache = &cleancache[bp->cache]; 1780 1781 KASSERT(ISSET(bp->b_flags, B_BC)); 1782 pages = atop(bp->b_bufsize); 1783 1784 TRACEPOINT(vfs, bufcache_rel, bp->b_flags, bp->cache, pages); 1785 1786 if (fliphigh) { 1787 if (ISSET(bp->b_flags, B_DMA) && bp->cache > 0) 1788 panic("B_DMA buffer release from cache %d", 1789 bp->cache); 1790 else if ((!ISSET(bp->b_flags, B_DMA)) && bp->cache == 0) 1791 panic("Non B_DMA buffer release from cache %d", 1792 bp->cache); 1793 } 1794 1795 if (!ISSET(bp->b_flags, B_DELWRI)) { 1796 int64_t *queuepages; 1797 if (ISSET(bp->b_flags, B_WARM | B_COLD)) { 1798 SET(bp->b_flags, B_WARM); 1799 CLR(bp->b_flags, B_COLD); 1800 queue = &cache->warmqueue; 1801 queuepages = &cache->warmbufpages; 1802 } else { 1803 queue = &cache->hotqueue; 1804 queuepages = &cache->hotbufpages; 1805 } 1806 *queuepages += pages; 1807 bcstats.numcleanpages += pages; 1808 cache->cachepages += pages; 1809 chillbufs(cache, queue, queuepages); 1810 } else { 1811 queue = &dirtyqueue; 1812 bcstats.numdirtypages += pages; 1813 bcstats.delwribufs++; 1814 } 1815 TAILQ_INSERT_TAIL(queue, bp, b_freelist); 1816 } 1817 1818 #ifdef HIBERNATE 1819 /* 1820 * Nuke the buffer cache from orbit when hibernating. We do not want to save 1821 * any clean cache pages to swap and read them back. the original disk files 1822 * are just as good. 1823 */ 1824 void 1825 hibernate_suspend_bufcache(void) 1826 { 1827 struct buf *bp; 1828 int s; 1829 1830 s = splbio(); 1831 /* Chuck away all the cache pages.. discard bufs, do not promote */ 1832 while ((bp = bufcache_getcleanbuf_range(DMA_CACHE, NUM_CACHES - 1, 1))) { 1833 bufcache_take(bp); 1834 if (bp->b_vp) { 1835 RBT_REMOVE(buf_rb_bufs, &bp->b_vp->v_bufs_tree, bp); 1836 brelvp(bp); 1837 } 1838 buf_put(bp); 1839 } 1840 splx(s); 1841 } 1842 1843 void 1844 hibernate_resume_bufcache(void) 1845 { 1846 /* XXX Nothing needed here for now */ 1847 } 1848 #endif /* HIBERNATE */ 1849