1 /* 2 * Copyright (c) 1994,1997 John S. Dyson 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice immediately at the beginning of the file, without modification, 10 * this list of conditions, and the following disclaimer. 11 * 2. Absolutely no warranty of function or purpose is made by the author 12 * John S. Dyson. 13 * 14 * $FreeBSD: src/sys/kern/vfs_bio.c,v 1.242.2.20 2003/05/28 18:38:10 alc Exp $ 15 * $DragonFly: src/sys/kern/vfs_bio.c,v 1.115 2008/08/13 11:02:31 swildner Exp $ 16 */ 17 18 /* 19 * this file contains a new buffer I/O scheme implementing a coherent 20 * VM object and buffer cache scheme. Pains have been taken to make 21 * sure that the performance degradation associated with schemes such 22 * as this is not realized. 23 * 24 * Author: John S. Dyson 25 * Significant help during the development and debugging phases 26 * had been provided by David Greenman, also of the FreeBSD core team. 27 * 28 * see man buf(9) for more info. 29 */ 30 31 #include <sys/param.h> 32 #include <sys/systm.h> 33 #include <sys/buf.h> 34 #include <sys/conf.h> 35 #include <sys/eventhandler.h> 36 #include <sys/lock.h> 37 #include <sys/malloc.h> 38 #include <sys/mount.h> 39 #include <sys/kernel.h> 40 #include <sys/kthread.h> 41 #include <sys/proc.h> 42 #include <sys/reboot.h> 43 #include <sys/resourcevar.h> 44 #include <sys/sysctl.h> 45 #include <sys/vmmeter.h> 46 #include <sys/vnode.h> 47 #include <sys/dsched.h> 48 #include <sys/proc.h> 49 #include <vm/vm.h> 50 #include <vm/vm_param.h> 51 #include <vm/vm_kern.h> 52 #include <vm/vm_pageout.h> 53 #include <vm/vm_page.h> 54 #include <vm/vm_object.h> 55 #include <vm/vm_extern.h> 56 #include <vm/vm_map.h> 57 #include <vm/vm_pager.h> 58 #include <vm/swap_pager.h> 59 60 #include <sys/buf2.h> 61 #include <sys/thread2.h> 62 #include <sys/spinlock2.h> 63 #include <sys/mplock2.h> 64 #include <vm/vm_page2.h> 65 66 #include "opt_ddb.h" 67 #ifdef DDB 68 #include <ddb/ddb.h> 69 #endif 70 71 /* 72 * Buffer queues. 
73 */ 74 enum bufq_type { 75 BQUEUE_NONE, /* not on any queue */ 76 BQUEUE_LOCKED, /* locked buffers */ 77 BQUEUE_CLEAN, /* non-B_DELWRI buffers */ 78 BQUEUE_DIRTY, /* B_DELWRI buffers */ 79 BQUEUE_DIRTY_HW, /* B_DELWRI buffers - heavy weight */ 80 BQUEUE_EMPTYKVA, /* empty buffer headers with KVA assignment */ 81 BQUEUE_EMPTY, /* empty buffer headers */ 82 83 BUFFER_QUEUES /* number of buffer queues */ 84 }; 85 86 typedef enum bufq_type bufq_type_t; 87 88 #define BD_WAKE_SIZE 16384 89 #define BD_WAKE_MASK (BD_WAKE_SIZE - 1) 90 91 TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES]; 92 static struct spinlock bufqspin = SPINLOCK_INITIALIZER(&bufqspin); 93 static struct spinlock bufcspin = SPINLOCK_INITIALIZER(&bufcspin); 94 95 static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer"); 96 97 struct buf *buf; /* buffer header pool */ 98 99 static void vfs_clean_pages(struct buf *bp); 100 static void vfs_clean_one_page(struct buf *bp, int pageno, vm_page_t m); 101 static void vfs_dirty_one_page(struct buf *bp, int pageno, vm_page_t m); 102 static void vfs_vmio_release(struct buf *bp); 103 static int flushbufqueues(bufq_type_t q); 104 static vm_page_t bio_page_alloc(vm_object_t obj, vm_pindex_t pg, int deficit); 105 106 static void bd_signal(int totalspace); 107 static void buf_daemon(void); 108 static void buf_daemon_hw(void); 109 110 /* 111 * bogus page -- for I/O to/from partially complete buffers 112 * this is a temporary solution to the problem, but it is not 113 * really that bad. it would be better to split the buffer 114 * for input in the case of buffers partially already in memory, 115 * but the code is intricate enough already. 116 */ 117 vm_page_t bogus_page; 118 119 /* 120 * These are all static, but make the ones we export globals so we do 121 * not need to use compiler magic. 122 */ 123 int bufspace; /* locked by buffer_map */ 124 int maxbufspace; 125 static int bufmallocspace; /* atomic ops */ 126 int maxbufmallocspace, lobufspace, hibufspace; 127 static int bufreusecnt, bufdefragcnt, buffreekvacnt; 128 static int lorunningspace; 129 static int hirunningspace; 130 static int runningbufreq; /* locked by bufcspin */ 131 static int dirtybufspace; /* locked by bufcspin */ 132 static int dirtybufcount; /* locked by bufcspin */ 133 static int dirtybufspacehw; /* locked by bufcspin */ 134 static int dirtybufcounthw; /* locked by bufcspin */ 135 static int runningbufspace; /* locked by bufcspin */ 136 static int runningbufcount; /* locked by bufcspin */ 137 int lodirtybufspace; 138 int hidirtybufspace; 139 static int getnewbufcalls; 140 static int getnewbufrestarts; 141 static int recoverbufcalls; 142 static int needsbuffer; /* locked by bufcspin */ 143 static int bd_request; /* locked by bufcspin */ 144 static int bd_request_hw; /* locked by bufcspin */ 145 static u_int bd_wake_ary[BD_WAKE_SIZE]; 146 static u_int bd_wake_index; 147 static u_int vm_cycle_point = 40; /* 23-36 will migrate more act->inact */ 148 static int debug_commit; 149 150 static struct thread *bufdaemon_td; 151 static struct thread *bufdaemonhw_td; 152 static u_int lowmempgallocs; 153 static u_int lowmempgfails; 154 155 /* 156 * Sysctls for operational control of the buffer cache. 
157 */ 158 SYSCTL_INT(_vfs, OID_AUTO, lodirtybufspace, CTLFLAG_RW, &lodirtybufspace, 0, 159 "Number of dirty buffers to flush before bufdaemon becomes inactive"); 160 SYSCTL_INT(_vfs, OID_AUTO, hidirtybufspace, CTLFLAG_RW, &hidirtybufspace, 0, 161 "High watermark used to trigger explicit flushing of dirty buffers"); 162 SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0, 163 "Minimum amount of buffer space required for active I/O"); 164 SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0, 165 "Maximum amount of buffer space to usable for active I/O"); 166 SYSCTL_UINT(_vfs, OID_AUTO, lowmempgallocs, CTLFLAG_RW, &lowmempgallocs, 0, 167 "Page allocations done during periods of very low free memory"); 168 SYSCTL_UINT(_vfs, OID_AUTO, lowmempgfails, CTLFLAG_RW, &lowmempgfails, 0, 169 "Page allocations which failed during periods of very low free memory"); 170 SYSCTL_UINT(_vfs, OID_AUTO, vm_cycle_point, CTLFLAG_RW, &vm_cycle_point, 0, 171 "Recycle pages to active or inactive queue transition pt 0-64"); 172 /* 173 * Sysctls determining current state of the buffer cache. 174 */ 175 SYSCTL_INT(_vfs, OID_AUTO, nbuf, CTLFLAG_RD, &nbuf, 0, 176 "Total number of buffers in buffer cache"); 177 SYSCTL_INT(_vfs, OID_AUTO, dirtybufspace, CTLFLAG_RD, &dirtybufspace, 0, 178 "Pending bytes of dirty buffers (all)"); 179 SYSCTL_INT(_vfs, OID_AUTO, dirtybufspacehw, CTLFLAG_RD, &dirtybufspacehw, 0, 180 "Pending bytes of dirty buffers (heavy weight)"); 181 SYSCTL_INT(_vfs, OID_AUTO, dirtybufcount, CTLFLAG_RD, &dirtybufcount, 0, 182 "Pending number of dirty buffers"); 183 SYSCTL_INT(_vfs, OID_AUTO, dirtybufcounthw, CTLFLAG_RD, &dirtybufcounthw, 0, 184 "Pending number of dirty buffers (heavy weight)"); 185 SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, 186 "I/O bytes currently in progress due to asynchronous writes"); 187 SYSCTL_INT(_vfs, OID_AUTO, runningbufcount, CTLFLAG_RD, &runningbufcount, 0, 188 "I/O buffers currently in progress due to asynchronous writes"); 189 SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, 190 "Hard limit on maximum amount of memory usable for buffer space"); 191 SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0, 192 "Soft limit on maximum amount of memory usable for buffer space"); 193 SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0, 194 "Minimum amount of memory to reserve for system buffer space"); 195 SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, 196 "Amount of memory available for buffers"); 197 SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RD, &maxbufmallocspace, 198 0, "Maximum amount of memory reserved for buffers using malloc"); 199 SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, 200 "Amount of memory left for buffers using malloc-scheme"); 201 SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD, &getnewbufcalls, 0, 202 "New buffer header acquisition requests"); 203 SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD, &getnewbufrestarts, 204 0, "New buffer header acquisition restarts"); 205 SYSCTL_INT(_vfs, OID_AUTO, recoverbufcalls, CTLFLAG_RD, &recoverbufcalls, 0, 206 "Recover VM space in an emergency"); 207 SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RD, &bufdefragcnt, 0, 208 "Buffer acquisition restarts due to fragmented buffer map"); 209 SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RD, &buffreekvacnt, 0, 210 "Amount of time KVA space was deallocated in an arbitrary buffer"); 211 
SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RD, &bufreusecnt, 0, 212 "Amount of time buffer re-use operations were successful"); 213 SYSCTL_INT(_vfs, OID_AUTO, debug_commit, CTLFLAG_RW, &debug_commit, 0, ""); 214 SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD, 0, sizeof(struct buf), 215 "sizeof(struct buf)"); 216 217 char *buf_wmesg = BUF_WMESG; 218 219 #define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ 220 #define VFS_BIO_NEED_UNUSED02 0x02 221 #define VFS_BIO_NEED_UNUSED04 0x04 222 #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ 223 224 /* 225 * bufspacewakeup: 226 * 227 * Called when buffer space is potentially available for recovery. 228 * getnewbuf() will block on this flag when it is unable to free 229 * sufficient buffer space. Buffer space becomes recoverable when 230 * bp's get placed back in the queues. 231 */ 232 static __inline void 233 bufspacewakeup(void) 234 { 235 /* 236 * If someone is waiting for BUF space, wake them up. Even 237 * though we haven't freed the kva space yet, the waiting 238 * process will be able to now. 239 */ 240 spin_lock(&bufcspin); 241 if (needsbuffer & VFS_BIO_NEED_BUFSPACE) { 242 needsbuffer &= ~VFS_BIO_NEED_BUFSPACE; 243 spin_unlock(&bufcspin); 244 wakeup(&needsbuffer); 245 } else { 246 spin_unlock(&bufcspin); 247 } 248 } 249 250 /* 251 * runningbufwakeup: 252 * 253 * Accounting for I/O in progress. 254 * 255 */ 256 static __inline void 257 runningbufwakeup(struct buf *bp) 258 { 259 int totalspace; 260 int limit; 261 262 if ((totalspace = bp->b_runningbufspace) != 0) { 263 spin_lock(&bufcspin); 264 runningbufspace -= totalspace; 265 --runningbufcount; 266 bp->b_runningbufspace = 0; 267 268 /* 269 * see waitrunningbufspace() for limit test. 270 */ 271 limit = hirunningspace * 4 / 6; 272 if (runningbufreq && runningbufspace <= limit) { 273 runningbufreq = 0; 274 spin_unlock(&bufcspin); 275 wakeup(&runningbufreq); 276 } else { 277 spin_unlock(&bufcspin); 278 } 279 bd_signal(totalspace); 280 } 281 } 282 283 /* 284 * bufcountwakeup: 285 * 286 * Called when a buffer has been added to one of the free queues to 287 * account for the buffer and to wakeup anyone waiting for free buffers. 288 * This typically occurs when large amounts of metadata are being handled 289 * by the buffer cache ( else buffer space runs out first, usually ). 290 * 291 * MPSAFE 292 */ 293 static __inline void 294 bufcountwakeup(void) 295 { 296 spin_lock(&bufcspin); 297 if (needsbuffer) { 298 needsbuffer &= ~VFS_BIO_NEED_ANY; 299 spin_unlock(&bufcspin); 300 wakeup(&needsbuffer); 301 } else { 302 spin_unlock(&bufcspin); 303 } 304 } 305 306 /* 307 * waitrunningbufspace() 308 * 309 * Wait for the amount of running I/O to drop to hirunningspace * 4 / 6. 310 * This is the point where write bursting stops so we don't want to wait 311 * for the running amount to drop below it (at least if we still want bioq 312 * to burst writes). 313 * 314 * The caller may be using this function to block in a tight loop, we 315 * must block while runningbufspace is greater then or equal to 316 * hirunningspace * 4 / 6. 317 * 318 * And even with that it may not be enough, due to the presence of 319 * B_LOCKED dirty buffers, so also wait for at least one running buffer 320 * to complete. 
321 */ 322 void 323 waitrunningbufspace(void) 324 { 325 int limit = hirunningspace * 4 / 6; 326 int dummy; 327 328 spin_lock(&bufcspin); 329 if (runningbufspace > limit) { 330 while (runningbufspace > limit) { 331 ++runningbufreq; 332 ssleep(&runningbufreq, &bufcspin, 0, "wdrn1", 0); 333 } 334 spin_unlock(&bufcspin); 335 } else if (runningbufspace > limit / 2) { 336 ++runningbufreq; 337 spin_unlock(&bufcspin); 338 tsleep(&dummy, 0, "wdrn2", 1); 339 } else { 340 spin_unlock(&bufcspin); 341 } 342 } 343 344 /* 345 * buf_dirty_count_severe: 346 * 347 * Return true if we have too many dirty buffers. 348 */ 349 int 350 buf_dirty_count_severe(void) 351 { 352 return (runningbufspace + dirtybufspace >= hidirtybufspace || 353 dirtybufcount >= nbuf / 2); 354 } 355 356 /* 357 * Return true if the amount of running I/O is severe and BIOQ should 358 * start bursting. 359 */ 360 int 361 buf_runningbufspace_severe(void) 362 { 363 return (runningbufspace >= hirunningspace * 4 / 6); 364 } 365 366 /* 367 * vfs_buf_test_cache: 368 * 369 * Called when a buffer is extended. This function clears the B_CACHE 370 * bit if the newly extended portion of the buffer does not contain 371 * valid data. 372 * 373 * NOTE! Dirty VM pages are not processed into dirty (B_DELWRI) buffer 374 * cache buffers. The VM pages remain dirty, as someone had mmap()'d 375 * them while a clean buffer was present. 376 */ 377 static __inline__ 378 void 379 vfs_buf_test_cache(struct buf *bp, 380 vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, 381 vm_page_t m) 382 { 383 if (bp->b_flags & B_CACHE) { 384 int base = (foff + off) & PAGE_MASK; 385 if (vm_page_is_valid(m, base, size) == 0) 386 bp->b_flags &= ~B_CACHE; 387 } 388 } 389 390 /* 391 * bd_speedup() 392 * 393 * Spank the buf_daemon[_hw] if the total dirty buffer space exceeds the 394 * low water mark. 395 * 396 * MPSAFE 397 */ 398 static __inline__ 399 void 400 bd_speedup(void) 401 { 402 if (dirtybufspace < lodirtybufspace && dirtybufcount < nbuf / 2) 403 return; 404 405 if (bd_request == 0 && 406 (dirtybufspace - dirtybufspacehw > lodirtybufspace / 2 || 407 dirtybufcount - dirtybufcounthw >= nbuf / 2)) { 408 spin_lock(&bufcspin); 409 bd_request = 1; 410 spin_unlock(&bufcspin); 411 wakeup(&bd_request); 412 } 413 if (bd_request_hw == 0 && 414 (dirtybufspacehw > lodirtybufspace / 2 || 415 dirtybufcounthw >= nbuf / 2)) { 416 spin_lock(&bufcspin); 417 bd_request_hw = 1; 418 spin_unlock(&bufcspin); 419 wakeup(&bd_request_hw); 420 } 421 } 422 423 /* 424 * bd_heatup() 425 * 426 * Get the buf_daemon heated up when the number of running and dirty 427 * buffers exceeds the mid-point. 428 * 429 * Return the total number of dirty bytes past the second mid point 430 * as a measure of how much excess dirty data there is in the system. 431 * 432 * MPSAFE 433 */ 434 int 435 bd_heatup(void) 436 { 437 int mid1; 438 int mid2; 439 int totalspace; 440 441 mid1 = lodirtybufspace + (hidirtybufspace - lodirtybufspace) / 2; 442 443 totalspace = runningbufspace + dirtybufspace; 444 if (totalspace >= mid1 || dirtybufcount >= nbuf / 2) { 445 bd_speedup(); 446 mid2 = mid1 + (hidirtybufspace - mid1) / 2; 447 if (totalspace >= mid2) 448 return(totalspace - mid2); 449 } 450 return(0); 451 } 452 453 /* 454 * bd_wait() 455 * 456 * Wait for the buffer cache to flush (totalspace) bytes worth of 457 * buffers, then return. 458 * 459 * Regardless this function blocks while the number of dirty buffers 460 * exceeds hidirtybufspace. 
461 * 462 * MPSAFE 463 */ 464 void 465 bd_wait(int totalspace) 466 { 467 u_int i; 468 int count; 469 470 if (curthread == bufdaemonhw_td || curthread == bufdaemon_td) 471 return; 472 473 while (totalspace > 0) { 474 bd_heatup(); 475 if (totalspace > runningbufspace + dirtybufspace) 476 totalspace = runningbufspace + dirtybufspace; 477 count = totalspace / BKVASIZE; 478 if (count >= BD_WAKE_SIZE) 479 count = BD_WAKE_SIZE - 1; 480 481 spin_lock(&bufcspin); 482 i = (bd_wake_index + count) & BD_WAKE_MASK; 483 ++bd_wake_ary[i]; 484 485 /* 486 * This is not a strict interlock, so we play a bit loose 487 * with locking access to dirtybufspace* 488 */ 489 tsleep_interlock(&bd_wake_ary[i], 0); 490 spin_unlock(&bufcspin); 491 tsleep(&bd_wake_ary[i], PINTERLOCKED, "flstik", hz); 492 493 totalspace = runningbufspace + dirtybufspace - hidirtybufspace; 494 } 495 } 496 497 /* 498 * bd_signal() 499 * 500 * This function is called whenever runningbufspace or dirtybufspace 501 * is reduced. Track threads waiting for run+dirty buffer I/O 502 * complete. 503 * 504 * MPSAFE 505 */ 506 static void 507 bd_signal(int totalspace) 508 { 509 u_int i; 510 511 if (totalspace > 0) { 512 if (totalspace > BKVASIZE * BD_WAKE_SIZE) 513 totalspace = BKVASIZE * BD_WAKE_SIZE; 514 spin_lock(&bufcspin); 515 while (totalspace > 0) { 516 i = bd_wake_index++; 517 i &= BD_WAKE_MASK; 518 if (bd_wake_ary[i]) { 519 bd_wake_ary[i] = 0; 520 spin_unlock(&bufcspin); 521 wakeup(&bd_wake_ary[i]); 522 spin_lock(&bufcspin); 523 } 524 totalspace -= BKVASIZE; 525 } 526 spin_unlock(&bufcspin); 527 } 528 } 529 530 /* 531 * BIO tracking support routines. 532 * 533 * Release a ref on a bio_track. Wakeup requests are atomically released 534 * along with the last reference so bk_active will never wind up set to 535 * only 0x80000000. 536 * 537 * MPSAFE 538 */ 539 static 540 void 541 bio_track_rel(struct bio_track *track) 542 { 543 int active; 544 int desired; 545 546 /* 547 * Shortcut 548 */ 549 active = track->bk_active; 550 if (active == 1 && atomic_cmpset_int(&track->bk_active, 1, 0)) 551 return; 552 553 /* 554 * Full-on. Note that the wait flag is only atomically released on 555 * the 1->0 count transition. 556 * 557 * We check for a negative count transition using bit 30 since bit 31 558 * has a different meaning. 559 */ 560 for (;;) { 561 desired = (active & 0x7FFFFFFF) - 1; 562 if (desired) 563 desired |= active & 0x80000000; 564 if (atomic_cmpset_int(&track->bk_active, active, desired)) { 565 if (desired & 0x40000000) 566 panic("bio_track_rel: bad count: %p\n", track); 567 if (active & 0x80000000) 568 wakeup(track); 569 break; 570 } 571 active = track->bk_active; 572 } 573 } 574 575 /* 576 * Wait for the tracking count to reach 0. 577 * 578 * Use atomic ops such that the wait flag is only set atomically when 579 * bk_active is non-zero. 580 * 581 * MPSAFE 582 */ 583 int 584 bio_track_wait(struct bio_track *track, int slp_flags, int slp_timo) 585 { 586 int active; 587 int desired; 588 int error; 589 590 /* 591 * Shortcut 592 */ 593 if (track->bk_active == 0) 594 return(0); 595 596 /* 597 * Full-on. Note that the wait flag may only be atomically set if 598 * the active count is non-zero. 599 * 600 * NOTE: We cannot optimize active == desired since a wakeup could 601 * clear active prior to our tsleep_interlock(). 
602 */ 603 error = 0; 604 while ((active = track->bk_active) != 0) { 605 desired = active | 0x80000000; 606 tsleep_interlock(track, slp_flags); 607 if (atomic_cmpset_int(&track->bk_active, active, desired)) { 608 error = tsleep(track, slp_flags | PINTERLOCKED, 609 "trwait", slp_timo); 610 if (error) 611 break; 612 } 613 } 614 return (error); 615 } 616 617 /* 618 * bufinit: 619 * 620 * Load time initialisation of the buffer cache, called from machine 621 * dependant initialization code. 622 */ 623 void 624 bufinit(void) 625 { 626 struct buf *bp; 627 vm_offset_t bogus_offset; 628 int i; 629 630 /* next, make a null set of free lists */ 631 for (i = 0; i < BUFFER_QUEUES; i++) 632 TAILQ_INIT(&bufqueues[i]); 633 634 /* finally, initialize each buffer header and stick on empty q */ 635 for (i = 0; i < nbuf; i++) { 636 bp = &buf[i]; 637 bzero(bp, sizeof *bp); 638 bp->b_flags = B_INVAL; /* we're just an empty header */ 639 bp->b_cmd = BUF_CMD_DONE; 640 bp->b_qindex = BQUEUE_EMPTY; 641 initbufbio(bp); 642 xio_init(&bp->b_xio); 643 buf_dep_init(bp); 644 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_EMPTY], bp, b_freelist); 645 } 646 647 /* 648 * maxbufspace is the absolute maximum amount of buffer space we are 649 * allowed to reserve in KVM and in real terms. The absolute maximum 650 * is nominally used by buf_daemon. hibufspace is the nominal maximum 651 * used by most other processes. The differential is required to 652 * ensure that buf_daemon is able to run when other processes might 653 * be blocked waiting for buffer space. 654 * 655 * maxbufspace is based on BKVASIZE. Allocating buffers larger then 656 * this may result in KVM fragmentation which is not handled optimally 657 * by the system. 658 */ 659 maxbufspace = nbuf * BKVASIZE; 660 hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10); 661 lobufspace = hibufspace - MAXBSIZE; 662 663 lorunningspace = 512 * 1024; 664 /* hirunningspace -- see below */ 665 666 /* 667 * Limit the amount of malloc memory since it is wired permanently 668 * into the kernel space. Even though this is accounted for in 669 * the buffer allocation, we don't want the malloced region to grow 670 * uncontrolled. The malloc scheme improves memory utilization 671 * significantly on average (small) directories. 672 */ 673 maxbufmallocspace = hibufspace / 20; 674 675 /* 676 * Reduce the chance of a deadlock occuring by limiting the number 677 * of delayed-write dirty buffers we allow to stack up. 678 * 679 * We don't want too much actually queued to the device at once 680 * (XXX this needs to be per-mount!), because the buffers will 681 * wind up locked for a very long period of time while the I/O 682 * drains. 683 */ 684 hidirtybufspace = hibufspace / 2; /* dirty + running */ 685 hirunningspace = hibufspace / 16; /* locked & queued to device */ 686 if (hirunningspace < 1024 * 1024) 687 hirunningspace = 1024 * 1024; 688 689 dirtybufspace = 0; 690 dirtybufspacehw = 0; 691 692 lodirtybufspace = hidirtybufspace / 2; 693 694 /* 695 * Maximum number of async ops initiated per buf_daemon loop. This is 696 * somewhat of a hack at the moment, we really need to limit ourselves 697 * based on the number of bytes of I/O in-transit that were initiated 698 * from buf_daemon. 
699 */ 700 701 bogus_offset = kmem_alloc_pageable(&kernel_map, PAGE_SIZE); 702 bogus_page = vm_page_alloc(&kernel_object, 703 (bogus_offset >> PAGE_SHIFT), 704 VM_ALLOC_NORMAL); 705 vmstats.v_wire_count++; 706 707 } 708 709 /* 710 * Initialize the embedded bio structures, typically used by 711 * deprecated code which tries to allocate its own struct bufs. 712 */ 713 void 714 initbufbio(struct buf *bp) 715 { 716 bp->b_bio1.bio_buf = bp; 717 bp->b_bio1.bio_prev = NULL; 718 bp->b_bio1.bio_offset = NOOFFSET; 719 bp->b_bio1.bio_next = &bp->b_bio2; 720 bp->b_bio1.bio_done = NULL; 721 bp->b_bio1.bio_flags = 0; 722 723 bp->b_bio2.bio_buf = bp; 724 bp->b_bio2.bio_prev = &bp->b_bio1; 725 bp->b_bio2.bio_offset = NOOFFSET; 726 bp->b_bio2.bio_next = NULL; 727 bp->b_bio2.bio_done = NULL; 728 bp->b_bio2.bio_flags = 0; 729 730 BUF_LOCKINIT(bp); 731 } 732 733 /* 734 * Reinitialize the embedded bio structures as well as any additional 735 * translation cache layers. 736 */ 737 void 738 reinitbufbio(struct buf *bp) 739 { 740 struct bio *bio; 741 742 for (bio = &bp->b_bio1; bio; bio = bio->bio_next) { 743 bio->bio_done = NULL; 744 bio->bio_offset = NOOFFSET; 745 } 746 } 747 748 /* 749 * Undo the effects of an initbufbio(). 750 */ 751 void 752 uninitbufbio(struct buf *bp) 753 { 754 dsched_exit_buf(bp); 755 BUF_LOCKFREE(bp); 756 } 757 758 /* 759 * Push another BIO layer onto an existing BIO and return it. The new 760 * BIO layer may already exist, holding cached translation data. 761 */ 762 struct bio * 763 push_bio(struct bio *bio) 764 { 765 struct bio *nbio; 766 767 if ((nbio = bio->bio_next) == NULL) { 768 int index = bio - &bio->bio_buf->b_bio_array[0]; 769 if (index >= NBUF_BIO - 1) { 770 panic("push_bio: too many layers bp %p\n", 771 bio->bio_buf); 772 } 773 nbio = &bio->bio_buf->b_bio_array[index + 1]; 774 bio->bio_next = nbio; 775 nbio->bio_prev = bio; 776 nbio->bio_buf = bio->bio_buf; 777 nbio->bio_offset = NOOFFSET; 778 nbio->bio_done = NULL; 779 nbio->bio_next = NULL; 780 } 781 KKASSERT(nbio->bio_done == NULL); 782 return(nbio); 783 } 784 785 /* 786 * Pop a BIO translation layer, returning the previous layer. The 787 * must have been previously pushed. 788 */ 789 struct bio * 790 pop_bio(struct bio *bio) 791 { 792 return(bio->bio_prev); 793 } 794 795 void 796 clearbiocache(struct bio *bio) 797 { 798 while (bio) { 799 bio->bio_offset = NOOFFSET; 800 bio = bio->bio_next; 801 } 802 } 803 804 /* 805 * bfreekva: 806 * 807 * Free the KVA allocation for buffer 'bp'. 808 * 809 * Must be called from a critical section as this is the only locking for 810 * buffer_map. 811 * 812 * Since this call frees up buffer space, we call bufspacewakeup(). 813 * 814 * MPALMOSTSAFE 815 */ 816 static void 817 bfreekva(struct buf *bp) 818 { 819 int count; 820 821 if (bp->b_kvasize) { 822 ++buffreekvacnt; 823 count = vm_map_entry_reserve(MAP_RESERVE_COUNT); 824 vm_map_lock(&buffer_map); 825 bufspace -= bp->b_kvasize; 826 vm_map_delete(&buffer_map, 827 (vm_offset_t) bp->b_kvabase, 828 (vm_offset_t) bp->b_kvabase + bp->b_kvasize, 829 &count 830 ); 831 vm_map_unlock(&buffer_map); 832 vm_map_entry_release(count); 833 bp->b_kvasize = 0; 834 bp->b_kvabase = NULL; 835 bufspacewakeup(); 836 } 837 } 838 839 /* 840 * bremfree: 841 * 842 * Remove the buffer from the appropriate free list. 
843 */ 844 static __inline void 845 _bremfree(struct buf *bp) 846 { 847 if (bp->b_qindex != BQUEUE_NONE) { 848 KASSERT(BUF_REFCNTNB(bp) == 1, 849 ("bremfree: bp %p not locked",bp)); 850 TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); 851 bp->b_qindex = BQUEUE_NONE; 852 } else { 853 if (BUF_REFCNTNB(bp) <= 1) 854 panic("bremfree: removing a buffer not on a queue"); 855 } 856 } 857 858 void 859 bremfree(struct buf *bp) 860 { 861 spin_lock(&bufqspin); 862 _bremfree(bp); 863 spin_unlock(&bufqspin); 864 } 865 866 static void 867 bremfree_locked(struct buf *bp) 868 { 869 _bremfree(bp); 870 } 871 872 /* 873 * bread: 874 * 875 * Get a buffer with the specified data. Look in the cache first. We 876 * must clear B_ERROR and B_INVAL prior to initiating I/O. If B_CACHE 877 * is set, the buffer is valid and we do not have to do anything ( see 878 * getblk() ). 879 * 880 * MPALMOSTSAFE 881 */ 882 int 883 bread(struct vnode *vp, off_t loffset, int size, struct buf **bpp) 884 { 885 struct buf *bp; 886 887 bp = getblk(vp, loffset, size, 0, 0); 888 *bpp = bp; 889 890 /* if not found in cache, do some I/O */ 891 if ((bp->b_flags & B_CACHE) == 0) { 892 bp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL); 893 bp->b_cmd = BUF_CMD_READ; 894 bp->b_bio1.bio_done = biodone_sync; 895 bp->b_bio1.bio_flags |= BIO_SYNC; 896 vfs_busy_pages(vp, bp); 897 vn_strategy(vp, &bp->b_bio1); 898 return (biowait(&bp->b_bio1, "biord")); 899 } 900 return (0); 901 } 902 903 /* 904 * breadn: 905 * 906 * Operates like bread, but also starts asynchronous I/O on 907 * read-ahead blocks. We must clear B_ERROR and B_INVAL prior 908 * to initiating I/O . If B_CACHE is set, the buffer is valid 909 * and we do not have to do anything. 910 * 911 * MPALMOSTSAFE 912 */ 913 int 914 breadn(struct vnode *vp, off_t loffset, int size, off_t *raoffset, 915 int *rabsize, int cnt, struct buf **bpp) 916 { 917 struct buf *bp, *rabp; 918 int i; 919 int rv = 0, readwait = 0; 920 921 *bpp = bp = getblk(vp, loffset, size, 0, 0); 922 923 /* if not found in cache, do some I/O */ 924 if ((bp->b_flags & B_CACHE) == 0) { 925 bp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL); 926 bp->b_cmd = BUF_CMD_READ; 927 bp->b_bio1.bio_done = biodone_sync; 928 bp->b_bio1.bio_flags |= BIO_SYNC; 929 vfs_busy_pages(vp, bp); 930 vn_strategy(vp, &bp->b_bio1); 931 ++readwait; 932 } 933 934 for (i = 0; i < cnt; i++, raoffset++, rabsize++) { 935 if (inmem(vp, *raoffset)) 936 continue; 937 rabp = getblk(vp, *raoffset, *rabsize, 0, 0); 938 939 if ((rabp->b_flags & B_CACHE) == 0) { 940 rabp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL); 941 rabp->b_cmd = BUF_CMD_READ; 942 vfs_busy_pages(vp, rabp); 943 BUF_KERNPROC(rabp); 944 vn_strategy(vp, &rabp->b_bio1); 945 } else { 946 brelse(rabp); 947 } 948 } 949 if (readwait) 950 rv = biowait(&bp->b_bio1, "biord"); 951 return (rv); 952 } 953 954 /* 955 * bwrite: 956 * 957 * Synchronous write, waits for completion. 958 * 959 * Write, release buffer on completion. (Done by iodone 960 * if async). Do not bother writing anything if the buffer 961 * is invalid. 962 * 963 * Note that we set B_CACHE here, indicating that buffer is 964 * fully valid and thus cacheable. This is true even of NFS 965 * now so we set it generally. This could be set either here 966 * or in biodone() since the I/O is synchronous. We put it 967 * here. 
968 */ 969 int 970 bwrite(struct buf *bp) 971 { 972 int error; 973 974 if (bp->b_flags & B_INVAL) { 975 brelse(bp); 976 return (0); 977 } 978 if (BUF_REFCNTNB(bp) == 0) 979 panic("bwrite: buffer is not busy???"); 980 981 /* Mark the buffer clean */ 982 bundirty(bp); 983 984 bp->b_flags &= ~(B_ERROR | B_EINTR); 985 bp->b_flags |= B_CACHE; 986 bp->b_cmd = BUF_CMD_WRITE; 987 bp->b_bio1.bio_done = biodone_sync; 988 bp->b_bio1.bio_flags |= BIO_SYNC; 989 vfs_busy_pages(bp->b_vp, bp); 990 991 /* 992 * Normal bwrites pipeline writes. NOTE: b_bufsize is only 993 * valid for vnode-backed buffers. 994 */ 995 bsetrunningbufspace(bp, bp->b_bufsize); 996 vn_strategy(bp->b_vp, &bp->b_bio1); 997 error = biowait(&bp->b_bio1, "biows"); 998 brelse(bp); 999 1000 return (error); 1001 } 1002 1003 /* 1004 * bawrite: 1005 * 1006 * Asynchronous write. Start output on a buffer, but do not wait for 1007 * it to complete. The buffer is released when the output completes. 1008 * 1009 * bwrite() ( or the VOP routine anyway ) is responsible for handling 1010 * B_INVAL buffers. Not us. 1011 */ 1012 void 1013 bawrite(struct buf *bp) 1014 { 1015 if (bp->b_flags & B_INVAL) { 1016 brelse(bp); 1017 return; 1018 } 1019 if (BUF_REFCNTNB(bp) == 0) 1020 panic("bwrite: buffer is not busy???"); 1021 1022 /* Mark the buffer clean */ 1023 bundirty(bp); 1024 1025 bp->b_flags &= ~(B_ERROR | B_EINTR); 1026 bp->b_flags |= B_CACHE; 1027 bp->b_cmd = BUF_CMD_WRITE; 1028 KKASSERT(bp->b_bio1.bio_done == NULL); 1029 vfs_busy_pages(bp->b_vp, bp); 1030 1031 /* 1032 * Normal bwrites pipeline writes. NOTE: b_bufsize is only 1033 * valid for vnode-backed buffers. 1034 */ 1035 bsetrunningbufspace(bp, bp->b_bufsize); 1036 BUF_KERNPROC(bp); 1037 vn_strategy(bp->b_vp, &bp->b_bio1); 1038 } 1039 1040 /* 1041 * bowrite: 1042 * 1043 * Ordered write. Start output on a buffer, and flag it so that the 1044 * device will write it in the order it was queued. The buffer is 1045 * released when the output completes. bwrite() ( or the VOP routine 1046 * anyway ) is responsible for handling B_INVAL buffers. 1047 */ 1048 int 1049 bowrite(struct buf *bp) 1050 { 1051 bp->b_flags |= B_ORDERED; 1052 bawrite(bp); 1053 return (0); 1054 } 1055 1056 /* 1057 * bdwrite: 1058 * 1059 * Delayed write. (Buffer is marked dirty). Do not bother writing 1060 * anything if the buffer is marked invalid. 1061 * 1062 * Note that since the buffer must be completely valid, we can safely 1063 * set B_CACHE. In fact, we have to set B_CACHE here rather then in 1064 * biodone() in order to prevent getblk from writing the buffer 1065 * out synchronously. 1066 */ 1067 void 1068 bdwrite(struct buf *bp) 1069 { 1070 if (BUF_REFCNTNB(bp) == 0) 1071 panic("bdwrite: buffer is not busy"); 1072 1073 if (bp->b_flags & B_INVAL) { 1074 brelse(bp); 1075 return; 1076 } 1077 bdirty(bp); 1078 1079 if (dsched_is_clear_buf_priv(bp)) 1080 dsched_new_buf(bp); 1081 1082 /* 1083 * Set B_CACHE, indicating that the buffer is fully valid. This is 1084 * true even of NFS now. 1085 */ 1086 bp->b_flags |= B_CACHE; 1087 1088 /* 1089 * This bmap keeps the system from needing to do the bmap later, 1090 * perhaps when the system is attempting to do a sync. Since it 1091 * is likely that the indirect block -- or whatever other datastructure 1092 * that the filesystem needs is still in memory now, it is a good 1093 * thing to do this. Note also, that if the pageout daemon is 1094 * requesting a sync -- there might not be enough memory to do 1095 * the bmap then... So, this is important to do. 
1096 */ 1097 if (bp->b_bio2.bio_offset == NOOFFSET) { 1098 VOP_BMAP(bp->b_vp, bp->b_loffset, &bp->b_bio2.bio_offset, 1099 NULL, NULL, BUF_CMD_WRITE); 1100 } 1101 1102 /* 1103 * Because the underlying pages may still be mapped and 1104 * writable trying to set the dirty buffer (b_dirtyoff/end) 1105 * range here will be inaccurate. 1106 * 1107 * However, we must still clean the pages to satisfy the 1108 * vnode_pager and pageout daemon, so theythink the pages 1109 * have been "cleaned". What has really occured is that 1110 * they've been earmarked for later writing by the buffer 1111 * cache. 1112 * 1113 * So we get the b_dirtyoff/end update but will not actually 1114 * depend on it (NFS that is) until the pages are busied for 1115 * writing later on. 1116 */ 1117 vfs_clean_pages(bp); 1118 bqrelse(bp); 1119 1120 /* 1121 * note: we cannot initiate I/O from a bdwrite even if we wanted to, 1122 * due to the softdep code. 1123 */ 1124 } 1125 1126 /* 1127 * Fake write - return pages to VM system as dirty, leave the buffer clean. 1128 * This is used by tmpfs. 1129 * 1130 * It is important for any VFS using this routine to NOT use it for 1131 * IO_SYNC or IO_ASYNC operations which occur when the system really 1132 * wants to flush VM pages to backing store. 1133 */ 1134 void 1135 buwrite(struct buf *bp) 1136 { 1137 vm_page_t m; 1138 int i; 1139 1140 /* 1141 * Only works for VMIO buffers. If the buffer is already 1142 * marked for delayed-write we can't avoid the bdwrite(). 1143 */ 1144 if ((bp->b_flags & B_VMIO) == 0 || (bp->b_flags & B_DELWRI)) { 1145 bdwrite(bp); 1146 return; 1147 } 1148 1149 /* 1150 * Set valid & dirty. 1151 */ 1152 for (i = 0; i < bp->b_xio.xio_npages; i++) { 1153 m = bp->b_xio.xio_pages[i]; 1154 vfs_dirty_one_page(bp, i, m); 1155 } 1156 bqrelse(bp); 1157 } 1158 1159 /* 1160 * bdirty: 1161 * 1162 * Turn buffer into delayed write request by marking it B_DELWRI. 1163 * B_RELBUF and B_NOCACHE must be cleared. 1164 * 1165 * We reassign the buffer to itself to properly update it in the 1166 * dirty/clean lists. 1167 * 1168 * Must be called from a critical section. 1169 * The buffer must be on BQUEUE_NONE. 1170 */ 1171 void 1172 bdirty(struct buf *bp) 1173 { 1174 KASSERT(bp->b_qindex == BQUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); 1175 if (bp->b_flags & B_NOCACHE) { 1176 kprintf("bdirty: clearing B_NOCACHE on buf %p\n", bp); 1177 bp->b_flags &= ~B_NOCACHE; 1178 } 1179 if (bp->b_flags & B_INVAL) { 1180 kprintf("bdirty: warning, dirtying invalid buffer %p\n", bp); 1181 } 1182 bp->b_flags &= ~B_RELBUF; 1183 1184 if ((bp->b_flags & B_DELWRI) == 0) { 1185 lwkt_gettoken(&bp->b_vp->v_token); 1186 bp->b_flags |= B_DELWRI; 1187 reassignbuf(bp); 1188 lwkt_reltoken(&bp->b_vp->v_token); 1189 1190 spin_lock(&bufcspin); 1191 ++dirtybufcount; 1192 dirtybufspace += bp->b_bufsize; 1193 if (bp->b_flags & B_HEAVY) { 1194 ++dirtybufcounthw; 1195 dirtybufspacehw += bp->b_bufsize; 1196 } 1197 spin_unlock(&bufcspin); 1198 1199 bd_heatup(); 1200 } 1201 } 1202 1203 /* 1204 * Set B_HEAVY, indicating that this is a heavy-weight buffer that 1205 * needs to be flushed with a different buf_daemon thread to avoid 1206 * deadlocks. B_HEAVY also imposes restrictions in getnewbuf(). 
1207 */ 1208 void 1209 bheavy(struct buf *bp) 1210 { 1211 if ((bp->b_flags & B_HEAVY) == 0) { 1212 bp->b_flags |= B_HEAVY; 1213 if (bp->b_flags & B_DELWRI) { 1214 spin_lock(&bufcspin); 1215 ++dirtybufcounthw; 1216 dirtybufspacehw += bp->b_bufsize; 1217 spin_unlock(&bufcspin); 1218 } 1219 } 1220 } 1221 1222 /* 1223 * bundirty: 1224 * 1225 * Clear B_DELWRI for buffer. 1226 * 1227 * Must be called from a critical section. 1228 * 1229 * The buffer is typically on BQUEUE_NONE but there is one case in 1230 * brelse() that calls this function after placing the buffer on 1231 * a different queue. 1232 * 1233 * MPSAFE 1234 */ 1235 void 1236 bundirty(struct buf *bp) 1237 { 1238 if (bp->b_flags & B_DELWRI) { 1239 lwkt_gettoken(&bp->b_vp->v_token); 1240 bp->b_flags &= ~B_DELWRI; 1241 reassignbuf(bp); 1242 lwkt_reltoken(&bp->b_vp->v_token); 1243 1244 spin_lock(&bufcspin); 1245 --dirtybufcount; 1246 dirtybufspace -= bp->b_bufsize; 1247 if (bp->b_flags & B_HEAVY) { 1248 --dirtybufcounthw; 1249 dirtybufspacehw -= bp->b_bufsize; 1250 } 1251 spin_unlock(&bufcspin); 1252 1253 bd_signal(bp->b_bufsize); 1254 } 1255 /* 1256 * Since it is now being written, we can clear its deferred write flag. 1257 */ 1258 bp->b_flags &= ~B_DEFERRED; 1259 } 1260 1261 /* 1262 * Set the b_runningbufspace field, used to track how much I/O is 1263 * in progress at any given moment. 1264 */ 1265 void 1266 bsetrunningbufspace(struct buf *bp, int bytes) 1267 { 1268 bp->b_runningbufspace = bytes; 1269 if (bytes) { 1270 spin_lock(&bufcspin); 1271 runningbufspace += bytes; 1272 ++runningbufcount; 1273 spin_unlock(&bufcspin); 1274 } 1275 } 1276 1277 /* 1278 * brelse: 1279 * 1280 * Release a busy buffer and, if requested, free its resources. The 1281 * buffer will be stashed in the appropriate bufqueue[] allowing it 1282 * to be accessed later as a cache entity or reused for other purposes. 1283 * 1284 * MPALMOSTSAFE 1285 */ 1286 void 1287 brelse(struct buf *bp) 1288 { 1289 #ifdef INVARIANTS 1290 int saved_flags = bp->b_flags; 1291 #endif 1292 1293 KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); 1294 1295 /* 1296 * If B_NOCACHE is set we are being asked to destroy the buffer and 1297 * its backing store. Clear B_DELWRI. 1298 * 1299 * B_NOCACHE is set in two cases: (1) when the caller really wants 1300 * to destroy the buffer and backing store and (2) when the caller 1301 * wants to destroy the buffer and backing store after a write 1302 * completes. 1303 */ 1304 if ((bp->b_flags & (B_NOCACHE|B_DELWRI)) == (B_NOCACHE|B_DELWRI)) { 1305 bundirty(bp); 1306 } 1307 1308 if ((bp->b_flags & (B_INVAL | B_DELWRI)) == B_DELWRI) { 1309 /* 1310 * A re-dirtied buffer is only subject to destruction 1311 * by B_INVAL. B_ERROR and B_NOCACHE are ignored. 1312 */ 1313 /* leave buffer intact */ 1314 } else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) || 1315 (bp->b_bufsize <= 0)) { 1316 /* 1317 * Either a failed read or we were asked to free or not 1318 * cache the buffer. This path is reached with B_DELWRI 1319 * set only if B_INVAL is already set. B_NOCACHE governs 1320 * backing store destruction. 1321 * 1322 * NOTE: HAMMER will set B_LOCKED in buf_deallocate if the 1323 * buffer cannot be immediately freed. 
1324 */ 1325 bp->b_flags |= B_INVAL; 1326 if (LIST_FIRST(&bp->b_dep) != NULL) 1327 buf_deallocate(bp); 1328 if (bp->b_flags & B_DELWRI) { 1329 spin_lock(&bufcspin); 1330 --dirtybufcount; 1331 dirtybufspace -= bp->b_bufsize; 1332 if (bp->b_flags & B_HEAVY) { 1333 --dirtybufcounthw; 1334 dirtybufspacehw -= bp->b_bufsize; 1335 } 1336 spin_unlock(&bufcspin); 1337 1338 bd_signal(bp->b_bufsize); 1339 } 1340 bp->b_flags &= ~(B_DELWRI | B_CACHE); 1341 } 1342 1343 /* 1344 * We must clear B_RELBUF if B_DELWRI or B_LOCKED is set. 1345 * If vfs_vmio_release() is called with either bit set, the 1346 * underlying pages may wind up getting freed causing a previous 1347 * write (bdwrite()) to get 'lost' because pages associated with 1348 * a B_DELWRI bp are marked clean. Pages associated with a 1349 * B_LOCKED buffer may be mapped by the filesystem. 1350 * 1351 * If we want to release the buffer ourselves (rather then the 1352 * originator asking us to release it), give the originator a 1353 * chance to countermand the release by setting B_LOCKED. 1354 * 1355 * We still allow the B_INVAL case to call vfs_vmio_release(), even 1356 * if B_DELWRI is set. 1357 * 1358 * If B_DELWRI is not set we may have to set B_RELBUF if we are low 1359 * on pages to return pages to the VM page queues. 1360 */ 1361 if (bp->b_flags & (B_DELWRI | B_LOCKED)) { 1362 bp->b_flags &= ~B_RELBUF; 1363 } else if (vm_page_count_severe()) { 1364 if (LIST_FIRST(&bp->b_dep) != NULL) 1365 buf_deallocate(bp); /* can set B_LOCKED */ 1366 if (bp->b_flags & (B_DELWRI | B_LOCKED)) 1367 bp->b_flags &= ~B_RELBUF; 1368 else 1369 bp->b_flags |= B_RELBUF; 1370 } 1371 1372 /* 1373 * Make sure b_cmd is clear. It may have already been cleared by 1374 * biodone(). 1375 * 1376 * At this point destroying the buffer is governed by the B_INVAL 1377 * or B_RELBUF flags. 1378 */ 1379 bp->b_cmd = BUF_CMD_DONE; 1380 dsched_exit_buf(bp); 1381 1382 /* 1383 * VMIO buffer rundown. Make sure the VM page array is restored 1384 * after an I/O may have replaces some of the pages with bogus pages 1385 * in order to not destroy dirty pages in a fill-in read. 1386 * 1387 * Note that due to the code above, if a buffer is marked B_DELWRI 1388 * then the B_RELBUF and B_NOCACHE bits will always be clear. 1389 * B_INVAL may still be set, however. 1390 * 1391 * For clean buffers, B_INVAL or B_RELBUF will destroy the buffer 1392 * but not the backing store. B_NOCACHE will destroy the backing 1393 * store. 1394 * 1395 * Note that dirty NFS buffers contain byte-granular write ranges 1396 * and should not be destroyed w/ B_INVAL even if the backing store 1397 * is left intact. 1398 */ 1399 if (bp->b_flags & B_VMIO) { 1400 /* 1401 * Rundown for VMIO buffers which are not dirty NFS buffers. 1402 */ 1403 int i, j, resid; 1404 vm_page_t m; 1405 off_t foff; 1406 vm_pindex_t poff; 1407 vm_object_t obj; 1408 struct vnode *vp; 1409 1410 vp = bp->b_vp; 1411 1412 /* 1413 * Get the base offset and length of the buffer. Note that 1414 * in the VMIO case if the buffer block size is not 1415 * page-aligned then b_data pointer may not be page-aligned. 1416 * But our b_xio.xio_pages array *IS* page aligned. 1417 * 1418 * block sizes less then DEV_BSIZE (usually 512) are not 1419 * supported due to the page granularity bits (m->valid, 1420 * m->dirty, etc...). 
1421 * 1422 * See man buf(9) for more information 1423 */ 1424 1425 resid = bp->b_bufsize; 1426 foff = bp->b_loffset; 1427 1428 lwkt_gettoken(&vm_token); 1429 for (i = 0; i < bp->b_xio.xio_npages; i++) { 1430 m = bp->b_xio.xio_pages[i]; 1431 vm_page_flag_clear(m, PG_ZERO); 1432 /* 1433 * If we hit a bogus page, fixup *all* of them 1434 * now. Note that we left these pages wired 1435 * when we removed them so they had better exist, 1436 * and they cannot be ripped out from under us so 1437 * no critical section protection is necessary. 1438 */ 1439 if (m == bogus_page) { 1440 obj = vp->v_object; 1441 poff = OFF_TO_IDX(bp->b_loffset); 1442 1443 for (j = i; j < bp->b_xio.xio_npages; j++) { 1444 vm_page_t mtmp; 1445 1446 mtmp = bp->b_xio.xio_pages[j]; 1447 if (mtmp == bogus_page) { 1448 mtmp = vm_page_lookup(obj, poff + j); 1449 if (!mtmp) { 1450 panic("brelse: page missing"); 1451 } 1452 bp->b_xio.xio_pages[j] = mtmp; 1453 } 1454 } 1455 bp->b_flags &= ~B_HASBOGUS; 1456 1457 if ((bp->b_flags & B_INVAL) == 0) { 1458 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 1459 bp->b_xio.xio_pages, bp->b_xio.xio_npages); 1460 } 1461 m = bp->b_xio.xio_pages[i]; 1462 } 1463 1464 /* 1465 * Invalidate the backing store if B_NOCACHE is set 1466 * (e.g. used with vinvalbuf()). If this is NFS 1467 * we impose a requirement that the block size be 1468 * a multiple of PAGE_SIZE and create a temporary 1469 * hack to basically invalidate the whole page. The 1470 * problem is that NFS uses really odd buffer sizes 1471 * especially when tracking piecemeal writes and 1472 * it also vinvalbuf()'s a lot, which would result 1473 * in only partial page validation and invalidation 1474 * here. If the file page is mmap()'d, however, 1475 * all the valid bits get set so after we invalidate 1476 * here we would end up with weird m->valid values 1477 * like 0xfc. nfs_getpages() can't handle this so 1478 * we clear all the valid bits for the NFS case 1479 * instead of just some of them. 1480 * 1481 * The real bug is the VM system having to set m->valid 1482 * to VM_PAGE_BITS_ALL for faulted-in pages, which 1483 * itself is an artifact of the whole 512-byte 1484 * granular mess that exists to support odd block 1485 * sizes and UFS meta-data block sizes (e.g. 6144). 1486 * A complete rewrite is required. 1487 * 1488 * XXX 1489 */ 1490 if (bp->b_flags & (B_NOCACHE|B_ERROR)) { 1491 int poffset = foff & PAGE_MASK; 1492 int presid; 1493 1494 presid = PAGE_SIZE - poffset; 1495 if (bp->b_vp->v_tag == VT_NFS && 1496 bp->b_vp->v_type == VREG) { 1497 ; /* entire page */ 1498 } else if (presid > resid) { 1499 presid = resid; 1500 } 1501 KASSERT(presid >= 0, ("brelse: extra page")); 1502 vm_page_set_invalid(m, poffset, presid); 1503 1504 /* 1505 * Also make sure any swap cache is removed 1506 * as it is now stale (HAMMER in particular 1507 * uses B_NOCACHE to deal with buffer 1508 * aliasing). 1509 */ 1510 swap_pager_unswapped(m); 1511 } 1512 resid -= PAGE_SIZE - (foff & PAGE_MASK); 1513 foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; 1514 } 1515 if (bp->b_flags & (B_INVAL | B_RELBUF)) 1516 vfs_vmio_release(bp); 1517 lwkt_reltoken(&vm_token); 1518 } else { 1519 /* 1520 * Rundown for non-VMIO buffers. 
1521 */ 1522 if (bp->b_flags & (B_INVAL | B_RELBUF)) { 1523 if (bp->b_bufsize) 1524 allocbuf(bp, 0); 1525 KKASSERT (LIST_FIRST(&bp->b_dep) == NULL); 1526 if (bp->b_vp) 1527 brelvp(bp); 1528 } 1529 } 1530 1531 if (bp->b_qindex != BQUEUE_NONE) 1532 panic("brelse: free buffer onto another queue???"); 1533 if (BUF_REFCNTNB(bp) > 1) { 1534 /* Temporary panic to verify exclusive locking */ 1535 /* This panic goes away when we allow shared refs */ 1536 panic("brelse: multiple refs"); 1537 /* NOT REACHED */ 1538 return; 1539 } 1540 1541 /* 1542 * Figure out the correct queue to place the cleaned up buffer on. 1543 * Buffers placed in the EMPTY or EMPTYKVA had better already be 1544 * disassociated from their vnode. 1545 */ 1546 spin_lock(&bufqspin); 1547 if (bp->b_flags & B_LOCKED) { 1548 /* 1549 * Buffers that are locked are placed in the locked queue 1550 * immediately, regardless of their state. 1551 */ 1552 bp->b_qindex = BQUEUE_LOCKED; 1553 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_LOCKED], bp, b_freelist); 1554 } else if (bp->b_bufsize == 0) { 1555 /* 1556 * Buffers with no memory. Due to conditionals near the top 1557 * of brelse() such buffers should probably already be 1558 * marked B_INVAL and disassociated from their vnode. 1559 */ 1560 bp->b_flags |= B_INVAL; 1561 KASSERT(bp->b_vp == NULL, ("bp1 %p flags %08x/%08x vnode %p unexpectededly still associated!", bp, saved_flags, bp->b_flags, bp->b_vp)); 1562 KKASSERT((bp->b_flags & B_HASHED) == 0); 1563 if (bp->b_kvasize) { 1564 bp->b_qindex = BQUEUE_EMPTYKVA; 1565 } else { 1566 bp->b_qindex = BQUEUE_EMPTY; 1567 } 1568 TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); 1569 } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) { 1570 /* 1571 * Buffers with junk contents. Again these buffers had better 1572 * already be disassociated from their vnode. 1573 */ 1574 KASSERT(bp->b_vp == NULL, ("bp2 %p flags %08x/%08x vnode %p unexpectededly still associated!", bp, saved_flags, bp->b_flags, bp->b_vp)); 1575 KKASSERT((bp->b_flags & B_HASHED) == 0); 1576 bp->b_flags |= B_INVAL; 1577 bp->b_qindex = BQUEUE_CLEAN; 1578 TAILQ_INSERT_HEAD(&bufqueues[BQUEUE_CLEAN], bp, b_freelist); 1579 } else { 1580 /* 1581 * Remaining buffers. These buffers are still associated with 1582 * their vnode. 1583 */ 1584 switch(bp->b_flags & (B_DELWRI|B_HEAVY)) { 1585 case B_DELWRI: 1586 bp->b_qindex = BQUEUE_DIRTY; 1587 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_DIRTY], bp, b_freelist); 1588 break; 1589 case B_DELWRI | B_HEAVY: 1590 bp->b_qindex = BQUEUE_DIRTY_HW; 1591 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_DIRTY_HW], bp, 1592 b_freelist); 1593 break; 1594 default: 1595 /* 1596 * NOTE: Buffers are always placed at the end of the 1597 * queue. If B_AGE is not set the buffer will cycle 1598 * through the queue twice. 1599 */ 1600 bp->b_qindex = BQUEUE_CLEAN; 1601 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_CLEAN], bp, b_freelist); 1602 break; 1603 } 1604 } 1605 spin_unlock(&bufqspin); 1606 1607 /* 1608 * If B_INVAL, clear B_DELWRI. We've already placed the buffer 1609 * on the correct queue. 1610 */ 1611 if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) 1612 bundirty(bp); 1613 1614 /* 1615 * The bp is on an appropriate queue unless locked. If it is not 1616 * locked or dirty we can wakeup threads waiting for buffer space. 1617 * 1618 * We've already handled the B_INVAL case ( B_DELWRI will be clear 1619 * if B_INVAL is set ). 
1620 */ 1621 if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) 1622 bufcountwakeup(); 1623 1624 /* 1625 * Something we can maybe free or reuse 1626 */ 1627 if (bp->b_bufsize || bp->b_kvasize) 1628 bufspacewakeup(); 1629 1630 /* 1631 * Clean up temporary flags and unlock the buffer. 1632 */ 1633 bp->b_flags &= ~(B_ORDERED | B_NOCACHE | B_RELBUF | B_DIRECT); 1634 BUF_UNLOCK(bp); 1635 } 1636 1637 /* 1638 * bqrelse: 1639 * 1640 * Release a buffer back to the appropriate queue but do not try to free 1641 * it. The buffer is expected to be used again soon. 1642 * 1643 * bqrelse() is used by bdwrite() to requeue a delayed write, and used by 1644 * biodone() to requeue an async I/O on completion. It is also used when 1645 * known good buffers need to be requeued but we think we may need the data 1646 * again soon. 1647 * 1648 * XXX we should be able to leave the B_RELBUF hint set on completion. 1649 * 1650 * MPSAFE 1651 */ 1652 void 1653 bqrelse(struct buf *bp) 1654 { 1655 KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); 1656 1657 if (bp->b_qindex != BQUEUE_NONE) 1658 panic("bqrelse: free buffer onto another queue???"); 1659 if (BUF_REFCNTNB(bp) > 1) { 1660 /* do not release to free list */ 1661 panic("bqrelse: multiple refs"); 1662 return; 1663 } 1664 1665 buf_act_advance(bp); 1666 1667 spin_lock(&bufqspin); 1668 if (bp->b_flags & B_LOCKED) { 1669 /* 1670 * Locked buffers are released to the locked queue. However, 1671 * if the buffer is dirty it will first go into the dirty 1672 * queue and later on after the I/O completes successfully it 1673 * will be released to the locked queue. 1674 */ 1675 bp->b_qindex = BQUEUE_LOCKED; 1676 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_LOCKED], bp, b_freelist); 1677 } else if (bp->b_flags & B_DELWRI) { 1678 bp->b_qindex = (bp->b_flags & B_HEAVY) ? 1679 BQUEUE_DIRTY_HW : BQUEUE_DIRTY; 1680 TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist); 1681 } else if (vm_page_count_severe()) { 1682 /* 1683 * We are too low on memory, we have to try to free the 1684 * buffer (most importantly: the wired pages making up its 1685 * backing store) *now*. 1686 */ 1687 spin_unlock(&bufqspin); 1688 brelse(bp); 1689 return; 1690 } else { 1691 bp->b_qindex = BQUEUE_CLEAN; 1692 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_CLEAN], bp, b_freelist); 1693 } 1694 spin_unlock(&bufqspin); 1695 1696 if ((bp->b_flags & B_LOCKED) == 0 && 1697 ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0)) { 1698 bufcountwakeup(); 1699 } 1700 1701 /* 1702 * Something we can maybe free or reuse. 1703 */ 1704 if (bp->b_bufsize && !(bp->b_flags & B_DELWRI)) 1705 bufspacewakeup(); 1706 1707 /* 1708 * Final cleanup and unlock. Clear bits that are only used while a 1709 * buffer is actively locked. 1710 */ 1711 bp->b_flags &= ~(B_ORDERED | B_NOCACHE | B_RELBUF); 1712 dsched_exit_buf(bp); 1713 BUF_UNLOCK(bp); 1714 } 1715 1716 /* 1717 * vfs_vmio_release: 1718 * 1719 * Return backing pages held by the buffer 'bp' back to the VM system 1720 * if possible. The pages are freed if they are no longer valid or 1721 * attempt to free if it was used for direct I/O otherwise they are 1722 * sent to the page cache. 1723 * 1724 * Pages that were marked busy are left alone and skipped. 1725 * 1726 * The KVA mapping (b_data) for the underlying pages is removed by 1727 * this function. 
1728 */ 1729 static void 1730 vfs_vmio_release(struct buf *bp) 1731 { 1732 int i; 1733 vm_page_t m; 1734 1735 lwkt_gettoken(&vm_token); 1736 for (i = 0; i < bp->b_xio.xio_npages; i++) { 1737 m = bp->b_xio.xio_pages[i]; 1738 bp->b_xio.xio_pages[i] = NULL; 1739 1740 /* 1741 * The VFS is telling us this is not a meta-data buffer 1742 * even if it is backed by a block device. 1743 */ 1744 if (bp->b_flags & B_NOTMETA) 1745 vm_page_flag_set(m, PG_NOTMETA); 1746 1747 /* 1748 * This is a very important bit of code. We try to track 1749 * VM page use whether the pages are wired into the buffer 1750 * cache or not. While wired into the buffer cache the 1751 * bp tracks the act_count. 1752 * 1753 * We can choose to place unwired pages on the inactive 1754 * queue (0) or active queue (1). If we place too many 1755 * on the active queue the queue will cycle the act_count 1756 * on pages we'd like to keep, just from single-use pages 1757 * (such as when doing a tar-up or file scan). 1758 */ 1759 if (bp->b_act_count < vm_cycle_point) 1760 vm_page_unwire(m, 0); 1761 else 1762 vm_page_unwire(m, 1); 1763 1764 /* 1765 * We don't mess with busy pages, it is 1766 * the responsibility of the process that 1767 * busied the pages to deal with them. 1768 */ 1769 if ((m->flags & PG_BUSY) || (m->busy != 0)) 1770 continue; 1771 1772 if (m->wire_count == 0) { 1773 vm_page_flag_clear(m, PG_ZERO); 1774 /* 1775 * Might as well free the page if we can and it has 1776 * no valid data. We also free the page if the 1777 * buffer was used for direct I/O. 1778 */ 1779 #if 0 1780 if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && 1781 m->hold_count == 0) { 1782 vm_page_busy(m); 1783 vm_page_protect(m, VM_PROT_NONE); 1784 vm_page_free(m); 1785 } else 1786 #endif 1787 if (bp->b_flags & B_DIRECT) { 1788 vm_page_try_to_free(m); 1789 } else if (vm_page_count_severe()) { 1790 m->act_count = bp->b_act_count; 1791 vm_page_try_to_cache(m); 1792 } else { 1793 m->act_count = bp->b_act_count; 1794 } 1795 } 1796 } 1797 lwkt_reltoken(&vm_token); 1798 1799 pmap_qremove(trunc_page((vm_offset_t) bp->b_data), 1800 bp->b_xio.xio_npages); 1801 if (bp->b_bufsize) { 1802 bufspacewakeup(); 1803 bp->b_bufsize = 0; 1804 } 1805 bp->b_xio.xio_npages = 0; 1806 bp->b_flags &= ~B_VMIO; 1807 KKASSERT (LIST_FIRST(&bp->b_dep) == NULL); 1808 if (bp->b_vp) 1809 brelvp(bp); 1810 } 1811 1812 /* 1813 * vfs_bio_awrite: 1814 * 1815 * Implement clustered async writes for clearing out B_DELWRI buffers. 1816 * This is much better then the old way of writing only one buffer at 1817 * a time. Note that we may not be presented with the buffers in the 1818 * correct order, so we search for the cluster in both directions. 1819 * 1820 * The buffer is locked on call. 1821 */ 1822 int 1823 vfs_bio_awrite(struct buf *bp) 1824 { 1825 int i; 1826 int j; 1827 off_t loffset = bp->b_loffset; 1828 struct vnode *vp = bp->b_vp; 1829 int nbytes; 1830 struct buf *bpa; 1831 int nwritten; 1832 int size; 1833 1834 /* 1835 * right now we support clustered writing only to regular files. If 1836 * we find a clusterable block we could be in the middle of a cluster 1837 * rather then at the beginning. 1838 * 1839 * NOTE: b_bio1 contains the logical loffset and is aliased 1840 * to b_loffset. b_bio2 contains the translated block number. 
1841 */ 1842 if ((vp->v_type == VREG) && 1843 (vp->v_mount != 0) && /* Only on nodes that have the size info */ 1844 (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { 1845 1846 size = vp->v_mount->mnt_stat.f_iosize; 1847 1848 for (i = size; i < MAXPHYS; i += size) { 1849 if ((bpa = findblk(vp, loffset + i, FINDBLK_TEST)) && 1850 BUF_REFCNT(bpa) == 0 && 1851 ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == 1852 (B_DELWRI | B_CLUSTEROK)) && 1853 (bpa->b_bufsize == size)) { 1854 if ((bpa->b_bio2.bio_offset == NOOFFSET) || 1855 (bpa->b_bio2.bio_offset != 1856 bp->b_bio2.bio_offset + i)) 1857 break; 1858 } else { 1859 break; 1860 } 1861 } 1862 for (j = size; i + j <= MAXPHYS && j <= loffset; j += size) { 1863 if ((bpa = findblk(vp, loffset - j, FINDBLK_TEST)) && 1864 BUF_REFCNT(bpa) == 0 && 1865 ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == 1866 (B_DELWRI | B_CLUSTEROK)) && 1867 (bpa->b_bufsize == size)) { 1868 if ((bpa->b_bio2.bio_offset == NOOFFSET) || 1869 (bpa->b_bio2.bio_offset != 1870 bp->b_bio2.bio_offset - j)) 1871 break; 1872 } else { 1873 break; 1874 } 1875 } 1876 j -= size; 1877 nbytes = (i + j); 1878 1879 /* 1880 * this is a possible cluster write 1881 */ 1882 if (nbytes != size) { 1883 BUF_UNLOCK(bp); 1884 nwritten = cluster_wbuild(vp, size, 1885 loffset - j, nbytes); 1886 return nwritten; 1887 } 1888 } 1889 1890 /* 1891 * default (old) behavior, writing out only one block 1892 * 1893 * XXX returns b_bufsize instead of b_bcount for nwritten? 1894 */ 1895 nwritten = bp->b_bufsize; 1896 bremfree(bp); 1897 bawrite(bp); 1898 1899 return nwritten; 1900 } 1901 1902 /* 1903 * getnewbuf: 1904 * 1905 * Find and initialize a new buffer header, freeing up existing buffers 1906 * in the bufqueues as necessary. The new buffer is returned locked. 1907 * 1908 * Important: B_INVAL is not set. If the caller wishes to throw the 1909 * buffer away, the caller must set B_INVAL prior to calling brelse(). 1910 * 1911 * We block if: 1912 * We have insufficient buffer headers 1913 * We have insufficient buffer space 1914 * buffer_map is too fragmented ( space reservation fails ) 1915 * If we have to flush dirty buffers ( but we try to avoid this ) 1916 * 1917 * To avoid VFS layer recursion we do not flush dirty buffers ourselves. 1918 * Instead we ask the buf daemon to do it for us. We attempt to 1919 * avoid piecemeal wakeups of the pageout daemon. 1920 * 1921 * MPALMOSTSAFE 1922 */ 1923 static struct buf * 1924 getnewbuf(int blkflags, int slptimeo, int size, int maxsize) 1925 { 1926 struct buf *bp; 1927 struct buf *nbp; 1928 int defrag = 0; 1929 int nqindex; 1930 int slpflags = (blkflags & GETBLK_PCATCH) ? PCATCH : 0; 1931 static int flushingbufs; 1932 1933 /* 1934 * We can't afford to block since we might be holding a vnode lock, 1935 * which may prevent system daemons from running. We deal with 1936 * low-memory situations by proactively returning memory and running 1937 * async I/O rather then sync I/O. 1938 */ 1939 1940 ++getnewbufcalls; 1941 --getnewbufrestarts; 1942 restart: 1943 ++getnewbufrestarts; 1944 1945 /* 1946 * Setup for scan. If we do not have enough free buffers, 1947 * we setup a degenerate case that immediately fails. Note 1948 * that if we are specially marked process, we are allowed to 1949 * dip into our reserves. 1950 * 1951 * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN 1952 * 1953 * We start with EMPTYKVA. If the list is empty we backup to EMPTY. 1954 * However, there are a number of cases (defragging, reusing, ...) 
1955 * where we cannot backup. 1956 */ 1957 nqindex = BQUEUE_EMPTYKVA; 1958 spin_lock(&bufqspin); 1959 nbp = TAILQ_FIRST(&bufqueues[BQUEUE_EMPTYKVA]); 1960 1961 if (nbp == NULL) { 1962 /* 1963 * If no EMPTYKVA buffers and we are either 1964 * defragging or reusing, locate a CLEAN buffer 1965 * to free or reuse. If bufspace useage is low 1966 * skip this step so we can allocate a new buffer. 1967 */ 1968 if (defrag || bufspace >= lobufspace) { 1969 nqindex = BQUEUE_CLEAN; 1970 nbp = TAILQ_FIRST(&bufqueues[BQUEUE_CLEAN]); 1971 } 1972 1973 /* 1974 * If we could not find or were not allowed to reuse a 1975 * CLEAN buffer, check to see if it is ok to use an EMPTY 1976 * buffer. We can only use an EMPTY buffer if allocating 1977 * its KVA would not otherwise run us out of buffer space. 1978 */ 1979 if (nbp == NULL && defrag == 0 && 1980 bufspace + maxsize < hibufspace) { 1981 nqindex = BQUEUE_EMPTY; 1982 nbp = TAILQ_FIRST(&bufqueues[BQUEUE_EMPTY]); 1983 } 1984 } 1985 1986 /* 1987 * Run scan, possibly freeing data and/or kva mappings on the fly 1988 * depending. 1989 * 1990 * WARNING! bufqspin is held! 1991 */ 1992 while ((bp = nbp) != NULL) { 1993 int qindex = nqindex; 1994 1995 nbp = TAILQ_NEXT(bp, b_freelist); 1996 1997 /* 1998 * BQUEUE_CLEAN - B_AGE special case. If not set the bp 1999 * cycles through the queue twice before being selected. 2000 */ 2001 if (qindex == BQUEUE_CLEAN && 2002 (bp->b_flags & B_AGE) == 0 && nbp) { 2003 bp->b_flags |= B_AGE; 2004 TAILQ_REMOVE(&bufqueues[qindex], bp, b_freelist); 2005 TAILQ_INSERT_TAIL(&bufqueues[qindex], bp, b_freelist); 2006 continue; 2007 } 2008 2009 /* 2010 * Calculate next bp ( we can only use it if we do not block 2011 * or do other fancy things ). 2012 */ 2013 if (nbp == NULL) { 2014 switch(qindex) { 2015 case BQUEUE_EMPTY: 2016 nqindex = BQUEUE_EMPTYKVA; 2017 if ((nbp = TAILQ_FIRST(&bufqueues[BQUEUE_EMPTYKVA]))) 2018 break; 2019 /* fall through */ 2020 case BQUEUE_EMPTYKVA: 2021 nqindex = BQUEUE_CLEAN; 2022 if ((nbp = TAILQ_FIRST(&bufqueues[BQUEUE_CLEAN]))) 2023 break; 2024 /* fall through */ 2025 case BQUEUE_CLEAN: 2026 /* 2027 * nbp is NULL. 2028 */ 2029 break; 2030 } 2031 } 2032 2033 /* 2034 * Sanity Checks 2035 */ 2036 KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); 2037 2038 /* 2039 * Note: we no longer distinguish between VMIO and non-VMIO 2040 * buffers. 2041 */ 2042 KASSERT((bp->b_flags & B_DELWRI) == 0, 2043 ("delwri buffer %p found in queue %d", bp, qindex)); 2044 2045 /* 2046 * Do not try to reuse a buffer with a non-zero b_refs. 2047 * This is an unsynchronized test. A synchronized test 2048 * is also performed after we lock the buffer. 2049 */ 2050 if (bp->b_refs) 2051 continue; 2052 2053 /* 2054 * If we are defragging then we need a buffer with 2055 * b_kvasize != 0. XXX this situation should no longer 2056 * occur, if defrag is non-zero the buffer's b_kvasize 2057 * should also be non-zero at this point. XXX 2058 */ 2059 if (defrag && bp->b_kvasize == 0) { 2060 kprintf("Warning: defrag empty buffer %p\n", bp); 2061 continue; 2062 } 2063 2064 /* 2065 * Start freeing the bp. This is somewhat involved. nbp 2066 * remains valid only for BQUEUE_EMPTY[KVA] bp's. Buffers 2067 * on the clean list must be disassociated from their 2068 * current vnode. Buffers on the empty[kva] lists have 2069 * already been disassociated. 
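 *
 * The sequence that follows is, in order: acquire the buffer lock
 * non-blocking while bufqspin is still held, re-check b_qindex to
 * catch a queue change that raced the lock, pull the buffer off its
 * queue with bremfree_locked(), and only then release bufqspin.
 * Any failure along the way drops the spinlock and restarts the
 * whole scan.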
2070 */ 2071 2072 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 2073 spin_unlock(&bufqspin); 2074 tsleep(&bd_request, 0, "gnbxxx", hz / 100); 2075 goto restart; 2076 } 2077 if (bp->b_qindex != qindex) { 2078 spin_unlock(&bufqspin); 2079 kprintf("getnewbuf: warning, BUF_LOCK blocked " 2080 "unexpectedly on buf %p index %d->%d, " 2081 "race corrected\n", 2082 bp, qindex, bp->b_qindex); 2083 BUF_UNLOCK(bp); 2084 goto restart; 2085 } 2086 bremfree_locked(bp); 2087 spin_unlock(&bufqspin); 2088 2089 /* 2090 * Dependancies must be handled before we disassociate the 2091 * vnode. 2092 * 2093 * NOTE: HAMMER will set B_LOCKED if the buffer cannot 2094 * be immediately disassociated. HAMMER then becomes 2095 * responsible for releasing the buffer. 2096 * 2097 * NOTE: bufqspin is UNLOCKED now. 2098 */ 2099 if (LIST_FIRST(&bp->b_dep) != NULL) { 2100 buf_deallocate(bp); 2101 if (bp->b_flags & B_LOCKED) { 2102 bqrelse(bp); 2103 goto restart; 2104 } 2105 KKASSERT(LIST_FIRST(&bp->b_dep) == NULL); 2106 } 2107 2108 if (qindex == BQUEUE_CLEAN) { 2109 if (bp->b_flags & B_VMIO) 2110 vfs_vmio_release(bp); 2111 if (bp->b_vp) 2112 brelvp(bp); 2113 } 2114 2115 /* 2116 * NOTE: nbp is now entirely invalid. We can only restart 2117 * the scan from this point on. 2118 * 2119 * Get the rest of the buffer freed up. b_kva* is still 2120 * valid after this operation. 2121 */ 2122 2123 KASSERT(bp->b_vp == NULL, ("bp3 %p flags %08x vnode %p qindex %d unexpectededly still associated!", bp, bp->b_flags, bp->b_vp, qindex)); 2124 KKASSERT((bp->b_flags & B_HASHED) == 0); 2125 2126 /* 2127 * critical section protection is not required when 2128 * scrapping a buffer's contents because it is already 2129 * wired. 2130 */ 2131 if (bp->b_bufsize) 2132 allocbuf(bp, 0); 2133 2134 bp->b_flags = B_BNOCLIP; 2135 bp->b_cmd = BUF_CMD_DONE; 2136 bp->b_vp = NULL; 2137 bp->b_error = 0; 2138 bp->b_resid = 0; 2139 bp->b_bcount = 0; 2140 bp->b_xio.xio_npages = 0; 2141 bp->b_dirtyoff = bp->b_dirtyend = 0; 2142 bp->b_act_count = ACT_INIT; 2143 reinitbufbio(bp); 2144 KKASSERT(LIST_FIRST(&bp->b_dep) == NULL); 2145 buf_dep_init(bp); 2146 if (blkflags & GETBLK_BHEAVY) 2147 bp->b_flags |= B_HEAVY; 2148 2149 /* 2150 * If we are defragging then free the buffer. 2151 */ 2152 if (defrag) { 2153 bp->b_flags |= B_INVAL; 2154 bfreekva(bp); 2155 brelse(bp); 2156 defrag = 0; 2157 goto restart; 2158 } 2159 2160 /* 2161 * If we are overcomitted then recover the buffer and its 2162 * KVM space. This occurs in rare situations when multiple 2163 * processes are blocked in getnewbuf() or allocbuf(). 2164 */ 2165 if (bufspace >= hibufspace) 2166 flushingbufs = 1; 2167 if (flushingbufs && bp->b_kvasize != 0) { 2168 bp->b_flags |= B_INVAL; 2169 bfreekva(bp); 2170 brelse(bp); 2171 goto restart; 2172 } 2173 if (bufspace < lobufspace) 2174 flushingbufs = 0; 2175 2176 /* 2177 * The brelvp() above interlocked the buffer, test b_refs 2178 * to determine if the buffer can be reused. b_refs 2179 * interlocks lookup/blocking-lock operations and allowing 2180 * buffer reuse can create deadlocks depending on what 2181 * (vp,loffset) is assigned to the reused buffer (see getblk). 2182 */ 2183 if (bp->b_refs) { 2184 bp->b_flags |= B_INVAL; 2185 bfreekva(bp); 2186 brelse(bp); 2187 goto restart; 2188 } 2189 2190 break; 2191 /* NOT REACHED, bufqspin not held */ 2192 } 2193 2194 /* 2195 * If we exhausted our list, sleep as appropriate. We may have to 2196 * wakeup various daemons and write out some dirty buffers. 
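 *
 * The handshake used below is: pick a wait reason, kick the buffer
 * daemons via bd_speedup(), record the reason in needsbuffer while
 * holding bufcspin, and ssleep() until another thread clears those
 * bits again (bufspacewakeup() and the related wakeup paths do this
 * as buffer space and headers are returned).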
2197 * 2198 * Generally we are sleeping due to insufficient buffer space. 2199 * 2200 * NOTE: bufqspin is held if bp is NULL, else it is not held. 2201 */ 2202 if (bp == NULL) { 2203 int flags; 2204 char *waitmsg; 2205 2206 spin_unlock(&bufqspin); 2207 if (defrag) { 2208 flags = VFS_BIO_NEED_BUFSPACE; 2209 waitmsg = "nbufkv"; 2210 } else if (bufspace >= hibufspace) { 2211 waitmsg = "nbufbs"; 2212 flags = VFS_BIO_NEED_BUFSPACE; 2213 } else { 2214 waitmsg = "newbuf"; 2215 flags = VFS_BIO_NEED_ANY; 2216 } 2217 2218 bd_speedup(); /* heeeelp */ 2219 spin_lock(&bufcspin); 2220 needsbuffer |= flags; 2221 while (needsbuffer & flags) { 2222 if (ssleep(&needsbuffer, &bufcspin, 2223 slpflags, waitmsg, slptimeo)) { 2224 spin_unlock(&bufcspin); 2225 return (NULL); 2226 } 2227 } 2228 spin_unlock(&bufcspin); 2229 } else { 2230 /* 2231 * We finally have a valid bp. We aren't quite out of the 2232 * woods, we still have to reserve kva space. In order 2233 * to keep fragmentation sane we only allocate kva in 2234 * BKVASIZE chunks. 2235 * 2236 * (bufqspin is not held) 2237 */ 2238 maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; 2239 2240 if (maxsize != bp->b_kvasize) { 2241 vm_offset_t addr = 0; 2242 int count; 2243 2244 bfreekva(bp); 2245 2246 count = vm_map_entry_reserve(MAP_RESERVE_COUNT); 2247 vm_map_lock(&buffer_map); 2248 2249 if (vm_map_findspace(&buffer_map, 2250 vm_map_min(&buffer_map), maxsize, 2251 maxsize, 0, &addr)) { 2252 /* 2253 * Uh oh. Buffer map is too fragmented. We 2254 * must defragment the map. 2255 */ 2256 vm_map_unlock(&buffer_map); 2257 vm_map_entry_release(count); 2258 ++bufdefragcnt; 2259 defrag = 1; 2260 bp->b_flags |= B_INVAL; 2261 brelse(bp); 2262 goto restart; 2263 } 2264 if (addr) { 2265 vm_map_insert(&buffer_map, &count, 2266 NULL, 0, 2267 addr, addr + maxsize, 2268 VM_MAPTYPE_NORMAL, 2269 VM_PROT_ALL, VM_PROT_ALL, 2270 MAP_NOFAULT); 2271 2272 bp->b_kvabase = (caddr_t) addr; 2273 bp->b_kvasize = maxsize; 2274 bufspace += bp->b_kvasize; 2275 ++bufreusecnt; 2276 } 2277 vm_map_unlock(&buffer_map); 2278 vm_map_entry_release(count); 2279 } 2280 bp->b_data = bp->b_kvabase; 2281 } 2282 return(bp); 2283 } 2284 2285 /* 2286 * This routine is called in an emergency to recover VM pages from the 2287 * buffer cache by cashing in clean buffers. The idea is to recover 2288 * enough pages to be able to satisfy a stuck bio_page_alloc(). 2289 * 2290 * MPSAFE 2291 */ 2292 static int 2293 recoverbufpages(void) 2294 { 2295 struct buf *bp; 2296 int bytes = 0; 2297 2298 ++recoverbufcalls; 2299 2300 spin_lock(&bufqspin); 2301 while (bytes < MAXBSIZE) { 2302 bp = TAILQ_FIRST(&bufqueues[BQUEUE_CLEAN]); 2303 if (bp == NULL) 2304 break; 2305 2306 /* 2307 * BQUEUE_CLEAN - B_AGE special case. If not set the bp 2308 * cycles through the queue twice before being selected. 2309 */ 2310 if ((bp->b_flags & B_AGE) == 0 && TAILQ_NEXT(bp, b_freelist)) { 2311 bp->b_flags |= B_AGE; 2312 TAILQ_REMOVE(&bufqueues[BQUEUE_CLEAN], bp, b_freelist); 2313 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_CLEAN], 2314 bp, b_freelist); 2315 continue; 2316 } 2317 2318 /* 2319 * Sanity Checks 2320 */ 2321 KKASSERT(bp->b_qindex == BQUEUE_CLEAN); 2322 KKASSERT((bp->b_flags & B_DELWRI) == 0); 2323 2324 /* 2325 * Start freeing the bp. This is somewhat involved. 
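 *
 * The loop below mirrors the reclaim path in getnewbuf(), except
 * that B_DIRECT is forced so vfs_vmio_release() tries to free the
 * underlying pages outright, and the buffer is finished off with
 * B_INVAL + brelse() rather than being handed back to a caller.
 * The return value is the sum of b_bufsize over everything recycled.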
2326 * 2327 * Buffers on the clean list must be disassociated from 2328 * their current vnode 2329 */ 2330 2331 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 2332 kprintf("recoverbufpages: warning, locked buf %p, " 2333 "race corrected\n", 2334 bp); 2335 ssleep(&bd_request, &bufqspin, 0, "gnbxxx", hz / 100); 2336 continue; 2337 } 2338 if (bp->b_qindex != BQUEUE_CLEAN) { 2339 kprintf("recoverbufpages: warning, BUF_LOCK blocked " 2340 "unexpectedly on buf %p index %d, race " 2341 "corrected\n", 2342 bp, bp->b_qindex); 2343 BUF_UNLOCK(bp); 2344 continue; 2345 } 2346 bremfree_locked(bp); 2347 spin_unlock(&bufqspin); 2348 2349 /* 2350 * Dependancies must be handled before we disassociate the 2351 * vnode. 2352 * 2353 * NOTE: HAMMER will set B_LOCKED if the buffer cannot 2354 * be immediately disassociated. HAMMER then becomes 2355 * responsible for releasing the buffer. 2356 */ 2357 if (LIST_FIRST(&bp->b_dep) != NULL) { 2358 buf_deallocate(bp); 2359 if (bp->b_flags & B_LOCKED) { 2360 bqrelse(bp); 2361 spin_lock(&bufqspin); 2362 continue; 2363 } 2364 KKASSERT(LIST_FIRST(&bp->b_dep) == NULL); 2365 } 2366 2367 bytes += bp->b_bufsize; 2368 2369 if (bp->b_flags & B_VMIO) { 2370 bp->b_flags |= B_DIRECT; /* try to free pages */ 2371 vfs_vmio_release(bp); 2372 } 2373 if (bp->b_vp) 2374 brelvp(bp); 2375 2376 KKASSERT(bp->b_vp == NULL); 2377 KKASSERT((bp->b_flags & B_HASHED) == 0); 2378 2379 /* 2380 * critical section protection is not required when 2381 * scrapping a buffer's contents because it is already 2382 * wired. 2383 */ 2384 if (bp->b_bufsize) 2385 allocbuf(bp, 0); 2386 2387 bp->b_flags = B_BNOCLIP; 2388 bp->b_cmd = BUF_CMD_DONE; 2389 bp->b_vp = NULL; 2390 bp->b_error = 0; 2391 bp->b_resid = 0; 2392 bp->b_bcount = 0; 2393 bp->b_xio.xio_npages = 0; 2394 bp->b_dirtyoff = bp->b_dirtyend = 0; 2395 reinitbufbio(bp); 2396 KKASSERT(LIST_FIRST(&bp->b_dep) == NULL); 2397 buf_dep_init(bp); 2398 bp->b_flags |= B_INVAL; 2399 /* bfreekva(bp); */ 2400 brelse(bp); 2401 spin_lock(&bufqspin); 2402 } 2403 spin_unlock(&bufqspin); 2404 return(bytes); 2405 } 2406 2407 /* 2408 * buf_daemon: 2409 * 2410 * Buffer flushing daemon. Buffers are normally flushed by the 2411 * update daemon but if it cannot keep up this process starts to 2412 * take the load in an attempt to prevent getnewbuf() from blocking. 2413 * 2414 * Once a flush is initiated it does not stop until the number 2415 * of buffers falls below lodirtybuffers, but we will wake up anyone 2416 * waiting at the mid-point. 2417 */ 2418 2419 static struct kproc_desc buf_kp = { 2420 "bufdaemon", 2421 buf_daemon, 2422 &bufdaemon_td 2423 }; 2424 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, 2425 kproc_start, &buf_kp) 2426 2427 static struct kproc_desc bufhw_kp = { 2428 "bufdaemon_hw", 2429 buf_daemon_hw, 2430 &bufdaemonhw_td 2431 }; 2432 SYSINIT(bufdaemon_hw, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, 2433 kproc_start, &bufhw_kp) 2434 2435 /* 2436 * MPSAFE thread 2437 */ 2438 static void 2439 buf_daemon(void) 2440 { 2441 int limit; 2442 2443 /* 2444 * This process needs to be suspended prior to shutdown sync. 2445 */ 2446 EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, 2447 bufdaemon_td, SHUTDOWN_PRI_LAST); 2448 curthread->td_flags |= TDF_SYSTHREAD; 2449 2450 /* 2451 * This process is allowed to take the buffer cache to the limit 2452 */ 2453 for (;;) { 2454 kproc_suspend_loop(); 2455 2456 /* 2457 * Do the flush as long as the number of dirty buffers 2458 * (including those running) exceeds lodirtybufspace. 
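 *
 * In code terms the loop keeps calling flushbufqueues() while
 *
 *	runningbufspace + dirtybufspace > lodirtybufspace / 2
 *
 * or while the non-heavy-weight dirty buffer count exceeds nbuf / 2,
 * pausing in waitrunningbufspace() whenever the in-transit I/O
 * reaches hirunningspace.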
2459 * 2460 * When flushing limit running I/O to hirunningspace 2461 * Do the flush. Limit the amount of in-transit I/O we 2462 * allow to build up, otherwise we would completely saturate 2463 * the I/O system. Wakeup any waiting processes before we 2464 * normally would so they can run in parallel with our drain. 2465 * 2466 * Our aggregate normal+HW lo water mark is lodirtybufspace, 2467 * but because we split the operation into two threads we 2468 * have to cut it in half for each thread. 2469 */ 2470 waitrunningbufspace(); 2471 limit = lodirtybufspace / 2; 2472 while (runningbufspace + dirtybufspace > limit || 2473 dirtybufcount - dirtybufcounthw >= nbuf / 2) { 2474 if (flushbufqueues(BQUEUE_DIRTY) == 0) 2475 break; 2476 if (runningbufspace < hirunningspace) 2477 continue; 2478 waitrunningbufspace(); 2479 } 2480 2481 /* 2482 * We reached our low water mark, reset the 2483 * request and sleep until we are needed again. 2484 * The sleep is just so the suspend code works. 2485 */ 2486 spin_lock(&bufcspin); 2487 if (bd_request == 0) 2488 ssleep(&bd_request, &bufcspin, 0, "psleep", hz); 2489 bd_request = 0; 2490 spin_unlock(&bufcspin); 2491 } 2492 } 2493 2494 /* 2495 * MPSAFE thread 2496 */ 2497 static void 2498 buf_daemon_hw(void) 2499 { 2500 int limit; 2501 2502 /* 2503 * This process needs to be suspended prior to shutdown sync. 2504 */ 2505 EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, 2506 bufdaemonhw_td, SHUTDOWN_PRI_LAST); 2507 curthread->td_flags |= TDF_SYSTHREAD; 2508 2509 /* 2510 * This process is allowed to take the buffer cache to the limit 2511 */ 2512 for (;;) { 2513 kproc_suspend_loop(); 2514 2515 /* 2516 * Do the flush. Limit the amount of in-transit I/O we 2517 * allow to build up, otherwise we would completely saturate 2518 * the I/O system. Wakeup any waiting processes before we 2519 * normally would so they can run in parallel with our drain. 2520 * 2521 * Once we decide to flush push the queued I/O up to 2522 * hirunningspace in order to trigger bursting by the bioq 2523 * subsystem. 2524 * 2525 * Our aggregate normal+HW lo water mark is lodirtybufspace, 2526 * but because we split the operation into two threads we 2527 * have to cut it in half for each thread. 2528 */ 2529 waitrunningbufspace(); 2530 limit = lodirtybufspace / 2; 2531 while (runningbufspace + dirtybufspacehw > limit || 2532 dirtybufcounthw >= nbuf / 2) { 2533 if (flushbufqueues(BQUEUE_DIRTY_HW) == 0) 2534 break; 2535 if (runningbufspace < hirunningspace) 2536 continue; 2537 waitrunningbufspace(); 2538 } 2539 2540 /* 2541 * We reached our low water mark, reset the 2542 * request and sleep until we are needed again. 2543 * The sleep is just so the suspend code works. 2544 */ 2545 spin_lock(&bufcspin); 2546 if (bd_request_hw == 0) 2547 ssleep(&bd_request_hw, &bufcspin, 0, "psleep", hz); 2548 bd_request_hw = 0; 2549 spin_unlock(&bufcspin); 2550 } 2551 } 2552 2553 /* 2554 * flushbufqueues: 2555 * 2556 * Try to flush a buffer in the dirty queue. We must be careful to 2557 * free up B_INVAL buffers instead of write them, which NFS is 2558 * particularly sensitive to. 2559 * 2560 * B_RELBUF may only be set by VFSs. We do set B_AGE to indicate 2561 * that we really want to try to get the buffer out and reuse it 2562 * due to the write load on the machine. 2563 * 2564 * We must lock the buffer in order to check its validity before we 2565 * can mess with its contents. bufqspin isn't enough. 
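 *
 * Returns non-zero if a buffer was disposed of (written
 * asynchronously, or released because it was B_INVAL or failed
 * buf_checkwrite()), and zero if the queue had nothing flushable,
 * which the buffer daemons use as their signal to stop iterating.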
2566 */ 2567 static int 2568 flushbufqueues(bufq_type_t q) 2569 { 2570 struct buf *bp; 2571 int r = 0; 2572 int spun; 2573 2574 spin_lock(&bufqspin); 2575 spun = 1; 2576 2577 bp = TAILQ_FIRST(&bufqueues[q]); 2578 while (bp) { 2579 if ((bp->b_flags & B_DELWRI) == 0) { 2580 kprintf("Unexpected clean buffer %p\n", bp); 2581 bp = TAILQ_NEXT(bp, b_freelist); 2582 continue; 2583 } 2584 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 2585 bp = TAILQ_NEXT(bp, b_freelist); 2586 continue; 2587 } 2588 KKASSERT(bp->b_qindex == q); 2589 2590 /* 2591 * Must recheck B_DELWRI after successfully locking 2592 * the buffer. 2593 */ 2594 if ((bp->b_flags & B_DELWRI) == 0) { 2595 BUF_UNLOCK(bp); 2596 bp = TAILQ_NEXT(bp, b_freelist); 2597 continue; 2598 } 2599 2600 if (bp->b_flags & B_INVAL) { 2601 _bremfree(bp); 2602 spin_unlock(&bufqspin); 2603 spun = 0; 2604 brelse(bp); 2605 ++r; 2606 break; 2607 } 2608 2609 if (LIST_FIRST(&bp->b_dep) != NULL && 2610 (bp->b_flags & B_DEFERRED) == 0 && 2611 buf_countdeps(bp, 0)) { 2612 TAILQ_REMOVE(&bufqueues[q], bp, b_freelist); 2613 TAILQ_INSERT_TAIL(&bufqueues[q], bp, b_freelist); 2614 bp->b_flags |= B_DEFERRED; 2615 BUF_UNLOCK(bp); 2616 bp = TAILQ_FIRST(&bufqueues[q]); 2617 continue; 2618 } 2619 2620 /* 2621 * If the buffer has a dependancy, buf_checkwrite() must 2622 * also return 0 for us to be able to initate the write. 2623 * 2624 * If the buffer is flagged B_ERROR it may be requeued 2625 * over and over again, we try to avoid a live lock. 2626 * 2627 * NOTE: buf_checkwrite is MPSAFE. 2628 */ 2629 spin_unlock(&bufqspin); 2630 spun = 0; 2631 2632 if (LIST_FIRST(&bp->b_dep) != NULL && buf_checkwrite(bp)) { 2633 bremfree(bp); 2634 brelse(bp); 2635 } else if (bp->b_flags & B_ERROR) { 2636 tsleep(bp, 0, "bioer", 1); 2637 bp->b_flags &= ~B_AGE; 2638 vfs_bio_awrite(bp); 2639 } else { 2640 bp->b_flags |= B_AGE; 2641 vfs_bio_awrite(bp); 2642 } 2643 ++r; 2644 break; 2645 } 2646 if (spun) 2647 spin_unlock(&bufqspin); 2648 return (r); 2649 } 2650 2651 /* 2652 * inmem: 2653 * 2654 * Returns true if no I/O is needed to access the associated VM object. 2655 * This is like findblk except it also hunts around in the VM system for 2656 * the data. 2657 * 2658 * Note that we ignore vm_page_free() races from interrupts against our 2659 * lookup, since if the caller is not protected our return value will not 2660 * be any more valid then otherwise once we exit the critical section. 2661 */ 2662 int 2663 inmem(struct vnode *vp, off_t loffset) 2664 { 2665 vm_object_t obj; 2666 vm_offset_t toff, tinc, size; 2667 vm_page_t m; 2668 2669 if (findblk(vp, loffset, FINDBLK_TEST)) 2670 return 1; 2671 if (vp->v_mount == NULL) 2672 return 0; 2673 if ((obj = vp->v_object) == NULL) 2674 return 0; 2675 2676 size = PAGE_SIZE; 2677 if (size > vp->v_mount->mnt_stat.f_iosize) 2678 size = vp->v_mount->mnt_stat.f_iosize; 2679 2680 for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { 2681 lwkt_gettoken(&vm_token); 2682 m = vm_page_lookup(obj, OFF_TO_IDX(loffset + toff)); 2683 lwkt_reltoken(&vm_token); 2684 if (m == NULL) 2685 return 0; 2686 tinc = size; 2687 if (tinc > PAGE_SIZE - ((toff + loffset) & PAGE_MASK)) 2688 tinc = PAGE_SIZE - ((toff + loffset) & PAGE_MASK); 2689 if (vm_page_is_valid(m, 2690 (vm_offset_t) ((toff + loffset) & PAGE_MASK), tinc) == 0) 2691 return 0; 2692 } 2693 return 1; 2694 } 2695 2696 /* 2697 * findblk: 2698 * 2699 * Locate and return the specified buffer. Unless flagged otherwise, 2700 * a locked buffer will be returned if it exists or NULL if it does not. 
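 *
 * A minimal caller sketch (hypothetical, error handling omitted;
 * see the notes below on bremfree() and the FINDBLK_* flags):
 *
 *	bp = findblk(vp, loffset, 0);		(blocking lock)
 *	if (bp) {
 *		bremfree(bp);
 *		... use the buffer ...
 *		brelse(bp);			(or bqrelse(bp))
 *	}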
2701 * 2702 * findblk()'d buffers are still on the bufqueues and if you intend 2703 * to use your (locked NON-TEST) buffer you need to bremfree(bp) 2704 * and possibly do other stuff to it. 2705 * 2706 * FINDBLK_TEST - Do not lock the buffer. The caller is responsible 2707 * for locking the buffer and ensuring that it remains 2708 * the desired buffer after locking. 2709 * 2710 * FINDBLK_NBLOCK - Lock the buffer non-blocking. If we are unable 2711 * to acquire the lock we return NULL, even if the 2712 * buffer exists. 2713 * 2714 * FINDBLK_REF - Returns the buffer ref'd, which prevents reuse 2715 * by getnewbuf() but does not prevent disassociation 2716 * while we are locked. Used to avoid deadlocks 2717 * against random (vp,loffset)s due to reassignment. 2718 * 2719 * (0) - Lock the buffer blocking. 2720 * 2721 * MPSAFE 2722 */ 2723 struct buf * 2724 findblk(struct vnode *vp, off_t loffset, int flags) 2725 { 2726 struct buf *bp; 2727 int lkflags; 2728 2729 lkflags = LK_EXCLUSIVE; 2730 if (flags & FINDBLK_NBLOCK) 2731 lkflags |= LK_NOWAIT; 2732 2733 for (;;) { 2734 /* 2735 * Lookup. Ref the buf while holding v_token to prevent 2736 * reuse (but does not prevent diassociation). 2737 */ 2738 lwkt_gettoken(&vp->v_token); 2739 bp = buf_rb_hash_RB_LOOKUP(&vp->v_rbhash_tree, loffset); 2740 if (bp == NULL) { 2741 lwkt_reltoken(&vp->v_token); 2742 return(NULL); 2743 } 2744 atomic_add_int(&bp->b_refs, 1); 2745 lwkt_reltoken(&vp->v_token); 2746 2747 /* 2748 * If testing only break and return bp, do not lock. 2749 */ 2750 if (flags & FINDBLK_TEST) 2751 break; 2752 2753 /* 2754 * Lock the buffer, return an error if the lock fails. 2755 * (only FINDBLK_NBLOCK can cause the lock to fail). 2756 */ 2757 if (BUF_LOCK(bp, lkflags)) { 2758 atomic_subtract_int(&bp->b_refs, 1); 2759 /* bp = NULL; not needed */ 2760 return(NULL); 2761 } 2762 2763 /* 2764 * Revalidate the locked buf before allowing it to be 2765 * returned. 2766 */ 2767 if (bp->b_vp == vp && bp->b_loffset == loffset) 2768 break; 2769 atomic_subtract_int(&bp->b_refs, 1); 2770 BUF_UNLOCK(bp); 2771 } 2772 2773 /* 2774 * Success 2775 */ 2776 if ((flags & FINDBLK_REF) == 0) 2777 atomic_subtract_int(&bp->b_refs, 1); 2778 return(bp); 2779 } 2780 2781 void 2782 unrefblk(struct buf *bp) 2783 { 2784 atomic_subtract_int(&bp->b_refs, 1); 2785 } 2786 2787 /* 2788 * getcacheblk: 2789 * 2790 * Similar to getblk() except only returns the buffer if it is 2791 * B_CACHE and requires no other manipulation. Otherwise NULL 2792 * is returned. 2793 * 2794 * If B_RAM is set the buffer might be just fine, but we return 2795 * NULL anyway because we want the code to fall through to the 2796 * cluster read. Otherwise read-ahead breaks. 2797 */ 2798 struct buf * 2799 getcacheblk(struct vnode *vp, off_t loffset) 2800 { 2801 struct buf *bp; 2802 2803 bp = findblk(vp, loffset, 0); 2804 if (bp) { 2805 if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) { 2806 bp->b_flags &= ~B_AGE; 2807 bremfree(bp); 2808 } else { 2809 BUF_UNLOCK(bp); 2810 bp = NULL; 2811 } 2812 } 2813 return (bp); 2814 } 2815 2816 /* 2817 * getblk: 2818 * 2819 * Get a block given a specified block and offset into a file/device. 2820 * B_INVAL may or may not be set on return. The caller should clear 2821 * B_INVAL prior to initiating a READ. 2822 * 2823 * IT IS IMPORTANT TO UNDERSTAND THAT IF YOU CALL GETBLK() AND B_CACHE 2824 * IS NOT SET, YOU MUST INITIALIZE THE RETURNED BUFFER, ISSUE A READ, 2825 * OR SET B_INVAL BEFORE RETIRING IT. 
If you retire a getblk'd buffer 2826 * without doing any of those things the system will likely believe 2827 * the buffer to be valid (especially if it is not B_VMIO), and the 2828 * next getblk() will return the buffer with B_CACHE set. 2829 * 2830 * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for 2831 * an existing buffer. 2832 * 2833 * For a VMIO buffer, B_CACHE is modified according to the backing VM. 2834 * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set 2835 * and then cleared based on the backing VM. If the previous buffer is 2836 * non-0-sized but invalid, B_CACHE will be cleared. 2837 * 2838 * If getblk() must create a new buffer, the new buffer is returned with 2839 * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which 2840 * case it is returned with B_INVAL clear and B_CACHE set based on the 2841 * backing VM. 2842 * 2843 * getblk() also forces a bwrite() for any B_DELWRI buffer whos 2844 * B_CACHE bit is clear. 2845 * 2846 * What this means, basically, is that the caller should use B_CACHE to 2847 * determine whether the buffer is fully valid or not and should clear 2848 * B_INVAL prior to issuing a read. If the caller intends to validate 2849 * the buffer by loading its data area with something, the caller needs 2850 * to clear B_INVAL. If the caller does this without issuing an I/O, 2851 * the caller should set B_CACHE ( as an optimization ), else the caller 2852 * should issue the I/O and biodone() will set B_CACHE if the I/O was 2853 * a write attempt or if it was a successfull read. If the caller 2854 * intends to issue a READ, the caller must clear B_INVAL and B_ERROR 2855 * prior to issuing the READ. biodone() will *not* clear B_INVAL. 2856 * 2857 * getblk flags: 2858 * 2859 * GETBLK_PCATCH - catch signal if blocked, can cause NULL return 2860 * GETBLK_BHEAVY - heavy-weight buffer cache buffer 2861 * 2862 * MPALMOSTSAFE 2863 */ 2864 struct buf * 2865 getblk(struct vnode *vp, off_t loffset, int size, int blkflags, int slptimeo) 2866 { 2867 struct buf *bp; 2868 int slpflags = (blkflags & GETBLK_PCATCH) ? PCATCH : 0; 2869 int error; 2870 int lkflags; 2871 2872 if (size > MAXBSIZE) 2873 panic("getblk: size(%d) > MAXBSIZE(%d)", size, MAXBSIZE); 2874 if (vp->v_object == NULL) 2875 panic("getblk: vnode %p has no object!", vp); 2876 2877 loop: 2878 if ((bp = findblk(vp, loffset, FINDBLK_REF | FINDBLK_TEST)) != NULL) { 2879 /* 2880 * The buffer was found in the cache, but we need to lock it. 2881 * We must acquire a ref on the bp to prevent reuse, but 2882 * this will not prevent disassociation (brelvp()) so we 2883 * must recheck (vp,loffset) after acquiring the lock. 2884 * 2885 * Without the ref the buffer could potentially be reused 2886 * before we acquire the lock and create a deadlock 2887 * situation between the thread trying to reuse the buffer 2888 * and us due to the fact that we would wind up blocking 2889 * on a random (vp,loffset). 2890 */ 2891 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 2892 if (blkflags & GETBLK_NOWAIT) { 2893 unrefblk(bp); 2894 return(NULL); 2895 } 2896 lkflags = LK_EXCLUSIVE | LK_SLEEPFAIL; 2897 if (blkflags & GETBLK_PCATCH) 2898 lkflags |= LK_PCATCH; 2899 error = BUF_TIMELOCK(bp, lkflags, "getblk", slptimeo); 2900 if (error) { 2901 unrefblk(bp); 2902 if (error == ENOLCK) 2903 goto loop; 2904 return (NULL); 2905 } 2906 /* buffer may have changed on us */ 2907 } 2908 unrefblk(bp); 2909 2910 /* 2911 * Once the buffer has been locked, make sure we didn't race 2912 * a buffer recyclement. 
Buffers that are no longer hashed 2913 * will have b_vp == NULL, so this takes care of that check 2914 * as well. 2915 */ 2916 if (bp->b_vp != vp || bp->b_loffset != loffset) { 2917 kprintf("Warning buffer %p (vp %p loffset %lld) " 2918 "was recycled\n", 2919 bp, vp, (long long)loffset); 2920 BUF_UNLOCK(bp); 2921 goto loop; 2922 } 2923 2924 /* 2925 * If SZMATCH any pre-existing buffer must be of the requested 2926 * size or NULL is returned. The caller absolutely does not 2927 * want getblk() to bwrite() the buffer on a size mismatch. 2928 */ 2929 if ((blkflags & GETBLK_SZMATCH) && size != bp->b_bcount) { 2930 BUF_UNLOCK(bp); 2931 return(NULL); 2932 } 2933 2934 /* 2935 * All vnode-based buffers must be backed by a VM object. 2936 */ 2937 KKASSERT(bp->b_flags & B_VMIO); 2938 KKASSERT(bp->b_cmd == BUF_CMD_DONE); 2939 bp->b_flags &= ~B_AGE; 2940 2941 /* 2942 * Make sure that B_INVAL buffers do not have a cached 2943 * block number translation. 2944 */ 2945 if ((bp->b_flags & B_INVAL) && (bp->b_bio2.bio_offset != NOOFFSET)) { 2946 kprintf("Warning invalid buffer %p (vp %p loffset %lld)" 2947 " did not have cleared bio_offset cache\n", 2948 bp, vp, (long long)loffset); 2949 clearbiocache(&bp->b_bio2); 2950 } 2951 2952 /* 2953 * The buffer is locked. B_CACHE is cleared if the buffer is 2954 * invalid. 2955 */ 2956 if (bp->b_flags & B_INVAL) 2957 bp->b_flags &= ~B_CACHE; 2958 bremfree(bp); 2959 2960 /* 2961 * Any size inconsistancy with a dirty buffer or a buffer 2962 * with a softupdates dependancy must be resolved. Resizing 2963 * the buffer in such circumstances can lead to problems. 2964 * 2965 * Dirty or dependant buffers are written synchronously. 2966 * Other types of buffers are simply released and 2967 * reconstituted as they may be backed by valid, dirty VM 2968 * pages (but not marked B_DELWRI). 2969 * 2970 * NFS NOTE: NFS buffers which straddle EOF are oddly-sized 2971 * and may be left over from a prior truncation (and thus 2972 * no longer represent the actual EOF point), so we 2973 * definitely do not want to B_NOCACHE the backing store. 2974 */ 2975 if (size != bp->b_bcount) { 2976 if (bp->b_flags & B_DELWRI) { 2977 bp->b_flags |= B_RELBUF; 2978 bwrite(bp); 2979 } else if (LIST_FIRST(&bp->b_dep)) { 2980 bp->b_flags |= B_RELBUF; 2981 bwrite(bp); 2982 } else { 2983 bp->b_flags |= B_RELBUF; 2984 brelse(bp); 2985 } 2986 goto loop; 2987 } 2988 KKASSERT(size <= bp->b_kvasize); 2989 KASSERT(bp->b_loffset != NOOFFSET, 2990 ("getblk: no buffer offset")); 2991 2992 /* 2993 * A buffer with B_DELWRI set and B_CACHE clear must 2994 * be committed before we can return the buffer in 2995 * order to prevent the caller from issuing a read 2996 * ( due to B_CACHE not being set ) and overwriting 2997 * it. 2998 * 2999 * Most callers, including NFS and FFS, need this to 3000 * operate properly either because they assume they 3001 * can issue a read if B_CACHE is not set, or because 3002 * ( for example ) an uncached B_DELWRI might loop due 3003 * to softupdates re-dirtying the buffer. In the latter 3004 * case, B_CACHE is set after the first write completes, 3005 * preventing further loops. 3006 * 3007 * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE 3008 * above while extending the buffer, we cannot allow the 3009 * buffer to remain with B_CACHE set after the write 3010 * completes or it will represent a corrupt state. To 3011 * deal with this we set B_NOCACHE to scrap the buffer 3012 * after the write. 3013 * 3014 * XXX Should this be B_RELBUF instead of B_NOCACHE? 
3015 * I'm not even sure this state is still possible 3016 * now that getblk() writes out any dirty buffers 3017 * on size changes. 3018 * 3019 * We might be able to do something fancy, like setting 3020 * B_CACHE in bwrite() except if B_DELWRI is already set, 3021 * so the below call doesn't set B_CACHE, but that gets real 3022 * confusing. This is much easier. 3023 */ 3024 3025 if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { 3026 kprintf("getblk: Warning, bp %p loff=%jx DELWRI set " 3027 "and CACHE clear, b_flags %08x\n", 3028 bp, (intmax_t)bp->b_loffset, bp->b_flags); 3029 bp->b_flags |= B_NOCACHE; 3030 bwrite(bp); 3031 goto loop; 3032 } 3033 } else { 3034 /* 3035 * Buffer is not in-core, create new buffer. The buffer 3036 * returned by getnewbuf() is locked. Note that the returned 3037 * buffer is also considered valid (not marked B_INVAL). 3038 * 3039 * Calculating the offset for the I/O requires figuring out 3040 * the block size. We use DEV_BSIZE for VBLK or VCHR and 3041 * the mount's f_iosize otherwise. If the vnode does not 3042 * have an associated mount we assume that the passed size is 3043 * the block size. 3044 * 3045 * Note that vn_isdisk() cannot be used here since it may 3046 * return a failure for numerous reasons. Note that the 3047 * buffer size may be larger then the block size (the caller 3048 * will use block numbers with the proper multiple). Beware 3049 * of using any v_* fields which are part of unions. In 3050 * particular, in DragonFly the mount point overloading 3051 * mechanism uses the namecache only and the underlying 3052 * directory vnode is not a special case. 3053 */ 3054 int bsize, maxsize; 3055 3056 if (vp->v_type == VBLK || vp->v_type == VCHR) 3057 bsize = DEV_BSIZE; 3058 else if (vp->v_mount) 3059 bsize = vp->v_mount->mnt_stat.f_iosize; 3060 else 3061 bsize = size; 3062 3063 maxsize = size + (loffset & PAGE_MASK); 3064 maxsize = imax(maxsize, bsize); 3065 3066 bp = getnewbuf(blkflags, slptimeo, size, maxsize); 3067 if (bp == NULL) { 3068 if (slpflags || slptimeo) 3069 return NULL; 3070 goto loop; 3071 } 3072 3073 /* 3074 * Atomically insert the buffer into the hash, so that it can 3075 * be found by findblk(). 3076 * 3077 * If bgetvp() returns non-zero a collision occured, and the 3078 * bp will not be associated with the vnode. 3079 * 3080 * Make sure the translation layer has been cleared. 3081 */ 3082 bp->b_loffset = loffset; 3083 bp->b_bio2.bio_offset = NOOFFSET; 3084 /* bp->b_bio2.bio_next = NULL; */ 3085 3086 if (bgetvp(vp, bp, size)) { 3087 bp->b_flags |= B_INVAL; 3088 brelse(bp); 3089 goto loop; 3090 } 3091 3092 /* 3093 * All vnode-based buffers must be backed by a VM object. 3094 */ 3095 KKASSERT(vp->v_object != NULL); 3096 bp->b_flags |= B_VMIO; 3097 KKASSERT(bp->b_cmd == BUF_CMD_DONE); 3098 3099 allocbuf(bp, size); 3100 } 3101 KKASSERT(dsched_is_clear_buf_priv(bp)); 3102 return (bp); 3103 } 3104 3105 /* 3106 * regetblk(bp) 3107 * 3108 * Reacquire a buffer that was previously released to the locked queue, 3109 * or reacquire a buffer which is interlocked by having bioops->io_deallocate 3110 * set B_LOCKED (which handles the acquisition race). 3111 * 3112 * To this end, either B_LOCKED must be set or the dependancy list must be 3113 * non-empty. 
3114 * 3115 * MPSAFE 3116 */ 3117 void 3118 regetblk(struct buf *bp) 3119 { 3120 KKASSERT((bp->b_flags & B_LOCKED) || LIST_FIRST(&bp->b_dep) != NULL); 3121 BUF_LOCK(bp, LK_EXCLUSIVE | LK_RETRY); 3122 bremfree(bp); 3123 } 3124 3125 /* 3126 * geteblk: 3127 * 3128 * Get an empty, disassociated buffer of given size. The buffer is 3129 * initially set to B_INVAL. 3130 * 3131 * critical section protection is not required for the allocbuf() 3132 * call because races are impossible here. 3133 * 3134 * MPALMOSTSAFE 3135 */ 3136 struct buf * 3137 geteblk(int size) 3138 { 3139 struct buf *bp; 3140 int maxsize; 3141 3142 maxsize = (size + BKVAMASK) & ~BKVAMASK; 3143 3144 while ((bp = getnewbuf(0, 0, size, maxsize)) == 0) 3145 ; 3146 allocbuf(bp, size); 3147 bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ 3148 KKASSERT(dsched_is_clear_buf_priv(bp)); 3149 return (bp); 3150 } 3151 3152 3153 /* 3154 * allocbuf: 3155 * 3156 * This code constitutes the buffer memory from either anonymous system 3157 * memory (in the case of non-VMIO operations) or from an associated 3158 * VM object (in the case of VMIO operations). This code is able to 3159 * resize a buffer up or down. 3160 * 3161 * Note that this code is tricky, and has many complications to resolve 3162 * deadlock or inconsistant data situations. Tread lightly!!! 3163 * There are B_CACHE and B_DELWRI interactions that must be dealt with by 3164 * the caller. Calling this code willy nilly can result in the loss of 3165 * data. 3166 * 3167 * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with 3168 * B_CACHE for the non-VMIO case. 3169 * 3170 * This routine does not need to be called from a critical section but you 3171 * must own the buffer. 3172 * 3173 * MPSAFE 3174 */ 3175 int 3176 allocbuf(struct buf *bp, int size) 3177 { 3178 int newbsize, mbsize; 3179 int i; 3180 3181 if (BUF_REFCNT(bp) == 0) 3182 panic("allocbuf: buffer not busy"); 3183 3184 if (bp->b_kvasize < size) 3185 panic("allocbuf: buffer too small"); 3186 3187 if ((bp->b_flags & B_VMIO) == 0) { 3188 caddr_t origbuf; 3189 int origbufsize; 3190 /* 3191 * Just get anonymous memory from the kernel. Don't 3192 * mess with B_CACHE. 3193 */ 3194 mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); 3195 if (bp->b_flags & B_MALLOC) 3196 newbsize = mbsize; 3197 else 3198 newbsize = round_page(size); 3199 3200 if (newbsize < bp->b_bufsize) { 3201 /* 3202 * Malloced buffers are not shrunk 3203 */ 3204 if (bp->b_flags & B_MALLOC) { 3205 if (newbsize) { 3206 bp->b_bcount = size; 3207 } else { 3208 kfree(bp->b_data, M_BIOBUF); 3209 if (bp->b_bufsize) { 3210 atomic_subtract_int(&bufmallocspace, bp->b_bufsize); 3211 bufspacewakeup(); 3212 bp->b_bufsize = 0; 3213 } 3214 bp->b_data = bp->b_kvabase; 3215 bp->b_bcount = 0; 3216 bp->b_flags &= ~B_MALLOC; 3217 } 3218 return 1; 3219 } 3220 vm_hold_free_pages( 3221 bp, 3222 (vm_offset_t) bp->b_data + newbsize, 3223 (vm_offset_t) bp->b_data + bp->b_bufsize); 3224 } else if (newbsize > bp->b_bufsize) { 3225 /* 3226 * We only use malloced memory on the first allocation. 3227 * and revert to page-allocated memory when the buffer 3228 * grows. 
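 *
 * Concretely (a sketch of the policy below): a buffer whose first
 * allocation is at most PAGE_SIZE/2 is kmalloc()'d as long as
 * bufmallocspace stays under maxbufmallocspace; once it grows
 * beyond that, or was page-backed to begin with, the data is
 * copied into page-backed memory at b_kvabase and B_MALLOC is
 * cleared.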
3229 */ 3230 if ((bufmallocspace < maxbufmallocspace) && 3231 (bp->b_bufsize == 0) && 3232 (mbsize <= PAGE_SIZE/2)) { 3233 3234 bp->b_data = kmalloc(mbsize, M_BIOBUF, M_WAITOK); 3235 bp->b_bufsize = mbsize; 3236 bp->b_bcount = size; 3237 bp->b_flags |= B_MALLOC; 3238 atomic_add_int(&bufmallocspace, mbsize); 3239 return 1; 3240 } 3241 origbuf = NULL; 3242 origbufsize = 0; 3243 /* 3244 * If the buffer is growing on its other-than-first 3245 * allocation, then we revert to the page-allocation 3246 * scheme. 3247 */ 3248 if (bp->b_flags & B_MALLOC) { 3249 origbuf = bp->b_data; 3250 origbufsize = bp->b_bufsize; 3251 bp->b_data = bp->b_kvabase; 3252 if (bp->b_bufsize) { 3253 atomic_subtract_int(&bufmallocspace, 3254 bp->b_bufsize); 3255 bufspacewakeup(); 3256 bp->b_bufsize = 0; 3257 } 3258 bp->b_flags &= ~B_MALLOC; 3259 newbsize = round_page(newbsize); 3260 } 3261 vm_hold_load_pages( 3262 bp, 3263 (vm_offset_t) bp->b_data + bp->b_bufsize, 3264 (vm_offset_t) bp->b_data + newbsize); 3265 if (origbuf) { 3266 bcopy(origbuf, bp->b_data, origbufsize); 3267 kfree(origbuf, M_BIOBUF); 3268 } 3269 } 3270 } else { 3271 vm_page_t m; 3272 int desiredpages; 3273 3274 newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); 3275 desiredpages = ((int)(bp->b_loffset & PAGE_MASK) + 3276 newbsize + PAGE_MASK) >> PAGE_SHIFT; 3277 KKASSERT(desiredpages <= XIO_INTERNAL_PAGES); 3278 3279 if (bp->b_flags & B_MALLOC) 3280 panic("allocbuf: VMIO buffer can't be malloced"); 3281 /* 3282 * Set B_CACHE initially if buffer is 0 length or will become 3283 * 0-length. 3284 */ 3285 if (size == 0 || bp->b_bufsize == 0) 3286 bp->b_flags |= B_CACHE; 3287 3288 if (newbsize < bp->b_bufsize) { 3289 /* 3290 * DEV_BSIZE aligned new buffer size is less then the 3291 * DEV_BSIZE aligned existing buffer size. Figure out 3292 * if we have to remove any pages. 3293 */ 3294 if (desiredpages < bp->b_xio.xio_npages) { 3295 for (i = desiredpages; i < bp->b_xio.xio_npages; i++) { 3296 /* 3297 * the page is not freed here -- it 3298 * is the responsibility of 3299 * vnode_pager_setsize 3300 */ 3301 m = bp->b_xio.xio_pages[i]; 3302 KASSERT(m != bogus_page, 3303 ("allocbuf: bogus page found")); 3304 while (vm_page_sleep_busy(m, TRUE, "biodep")) 3305 ; 3306 3307 bp->b_xio.xio_pages[i] = NULL; 3308 vm_page_unwire(m, 0); 3309 } 3310 pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) + 3311 (desiredpages << PAGE_SHIFT), (bp->b_xio.xio_npages - desiredpages)); 3312 bp->b_xio.xio_npages = desiredpages; 3313 } 3314 } else if (size > bp->b_bcount) { 3315 /* 3316 * We are growing the buffer, possibly in a 3317 * byte-granular fashion. 3318 */ 3319 struct vnode *vp; 3320 vm_object_t obj; 3321 vm_offset_t toff; 3322 vm_offset_t tinc; 3323 3324 /* 3325 * Step 1, bring in the VM pages from the object, 3326 * allocating them if necessary. We must clear 3327 * B_CACHE if these pages are not valid for the 3328 * range covered by the buffer. 3329 * 3330 * critical section protection is required to protect 3331 * against interrupts unbusying and freeing pages 3332 * between our vm_page_lookup() and our 3333 * busycheck/wiring call. 
3334 */ 3335 vp = bp->b_vp; 3336 obj = vp->v_object; 3337 3338 lwkt_gettoken(&vm_token); 3339 while (bp->b_xio.xio_npages < desiredpages) { 3340 vm_page_t m; 3341 vm_pindex_t pi; 3342 3343 pi = OFF_TO_IDX(bp->b_loffset) + bp->b_xio.xio_npages; 3344 if ((m = vm_page_lookup(obj, pi)) == NULL) { 3345 /* 3346 * note: must allocate system pages 3347 * since blocking here could intefere 3348 * with paging I/O, no matter which 3349 * process we are. 3350 */ 3351 m = bio_page_alloc(obj, pi, desiredpages - bp->b_xio.xio_npages); 3352 if (m) { 3353 vm_page_wire(m); 3354 vm_page_wakeup(m); 3355 vm_page_flag_clear(m, PG_ZERO); 3356 bp->b_flags &= ~B_CACHE; 3357 bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m; 3358 ++bp->b_xio.xio_npages; 3359 } 3360 continue; 3361 } 3362 3363 /* 3364 * We found a page. If we have to sleep on it, 3365 * retry because it might have gotten freed out 3366 * from under us. 3367 * 3368 * We can only test PG_BUSY here. Blocking on 3369 * m->busy might lead to a deadlock: 3370 * 3371 * vm_fault->getpages->cluster_read->allocbuf 3372 * 3373 */ 3374 3375 if (vm_page_sleep_busy(m, FALSE, "pgtblk")) 3376 continue; 3377 vm_page_flag_clear(m, PG_ZERO); 3378 vm_page_wire(m); 3379 bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m; 3380 ++bp->b_xio.xio_npages; 3381 if (bp->b_act_count < m->act_count) 3382 bp->b_act_count = m->act_count; 3383 } 3384 lwkt_reltoken(&vm_token); 3385 3386 /* 3387 * Step 2. We've loaded the pages into the buffer, 3388 * we have to figure out if we can still have B_CACHE 3389 * set. Note that B_CACHE is set according to the 3390 * byte-granular range ( bcount and size ), not the 3391 * aligned range ( newbsize ). 3392 * 3393 * The VM test is against m->valid, which is DEV_BSIZE 3394 * aligned. Needless to say, the validity of the data 3395 * needs to also be DEV_BSIZE aligned. Note that this 3396 * fails with NFS if the server or some other client 3397 * extends the file's EOF. If our buffer is resized, 3398 * B_CACHE may remain set! XXX 3399 */ 3400 3401 toff = bp->b_bcount; 3402 tinc = PAGE_SIZE - ((bp->b_loffset + toff) & PAGE_MASK); 3403 3404 while ((bp->b_flags & B_CACHE) && toff < size) { 3405 vm_pindex_t pi; 3406 3407 if (tinc > (size - toff)) 3408 tinc = size - toff; 3409 3410 pi = ((bp->b_loffset & PAGE_MASK) + toff) >> 3411 PAGE_SHIFT; 3412 3413 vfs_buf_test_cache( 3414 bp, 3415 bp->b_loffset, 3416 toff, 3417 tinc, 3418 bp->b_xio.xio_pages[pi] 3419 ); 3420 toff += tinc; 3421 tinc = PAGE_SIZE; 3422 } 3423 3424 /* 3425 * Step 3, fixup the KVM pmap. Remember that 3426 * bp->b_data is relative to bp->b_loffset, but 3427 * bp->b_loffset may be offset into the first page. 3428 */ 3429 3430 bp->b_data = (caddr_t) 3431 trunc_page((vm_offset_t)bp->b_data); 3432 pmap_qenter( 3433 (vm_offset_t)bp->b_data, 3434 bp->b_xio.xio_pages, 3435 bp->b_xio.xio_npages 3436 ); 3437 bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | 3438 (vm_offset_t)(bp->b_loffset & PAGE_MASK)); 3439 } 3440 } 3441 3442 /* adjust space use on already-dirty buffer */ 3443 if (bp->b_flags & B_DELWRI) { 3444 spin_lock(&bufcspin); 3445 dirtybufspace += newbsize - bp->b_bufsize; 3446 if (bp->b_flags & B_HEAVY) 3447 dirtybufspacehw += newbsize - bp->b_bufsize; 3448 spin_unlock(&bufcspin); 3449 } 3450 if (newbsize < bp->b_bufsize) 3451 bufspacewakeup(); 3452 bp->b_bufsize = newbsize; /* actual buffer allocation */ 3453 bp->b_bcount = size; /* requested buffer size */ 3454 return 1; 3455 } 3456 3457 /* 3458 * biowait: 3459 * 3460 * Wait for buffer I/O completion, returning error status. 
B_EINTR 3461 * is converted into an EINTR error but not cleared (since a chain 3462 * of biowait() calls may occur). 3463 * 3464 * On return bpdone() will have been called but the buffer will remain 3465 * locked and will not have been brelse()'d. 3466 * 3467 * NOTE! If a timeout is specified and ETIMEDOUT occurs the I/O is 3468 * likely still in progress on return. 3469 * 3470 * NOTE! This operation is on a BIO, not a BUF. 3471 * 3472 * NOTE! BIO_DONE is cleared by vn_strategy() 3473 * 3474 * MPSAFE 3475 */ 3476 static __inline int 3477 _biowait(struct bio *bio, const char *wmesg, int to) 3478 { 3479 struct buf *bp = bio->bio_buf; 3480 u_int32_t flags; 3481 u_int32_t nflags; 3482 int error; 3483 3484 KKASSERT(bio == &bp->b_bio1); 3485 for (;;) { 3486 flags = bio->bio_flags; 3487 if (flags & BIO_DONE) 3488 break; 3489 tsleep_interlock(bio, 0); 3490 nflags = flags | BIO_WANT; 3491 tsleep_interlock(bio, 0); 3492 if (atomic_cmpset_int(&bio->bio_flags, flags, nflags)) { 3493 if (wmesg) 3494 error = tsleep(bio, PINTERLOCKED, wmesg, to); 3495 else if (bp->b_cmd == BUF_CMD_READ) 3496 error = tsleep(bio, PINTERLOCKED, "biord", to); 3497 else 3498 error = tsleep(bio, PINTERLOCKED, "biowr", to); 3499 if (error) { 3500 kprintf("tsleep error biowait %d\n", error); 3501 return (error); 3502 } 3503 } 3504 } 3505 3506 /* 3507 * Finish up. 3508 */ 3509 KKASSERT(bp->b_cmd == BUF_CMD_DONE); 3510 bio->bio_flags &= ~(BIO_DONE | BIO_SYNC); 3511 if (bp->b_flags & B_EINTR) 3512 return (EINTR); 3513 if (bp->b_flags & B_ERROR) 3514 return (bp->b_error ? bp->b_error : EIO); 3515 return (0); 3516 } 3517 3518 int 3519 biowait(struct bio *bio, const char *wmesg) 3520 { 3521 return(_biowait(bio, wmesg, 0)); 3522 } 3523 3524 int 3525 biowait_timeout(struct bio *bio, const char *wmesg, int to) 3526 { 3527 return(_biowait(bio, wmesg, to)); 3528 } 3529 3530 /* 3531 * This associates a tracking count with an I/O. vn_strategy() and 3532 * dev_dstrategy() do this automatically but there are a few cases 3533 * where a vnode or device layer is bypassed when a block translation 3534 * is cached. In such cases bio_start_transaction() may be called on 3535 * the bypassed layers so the system gets an I/O in progress indication 3536 * for those higher layers. 3537 */ 3538 void 3539 bio_start_transaction(struct bio *bio, struct bio_track *track) 3540 { 3541 bio->bio_track = track; 3542 if (dsched_is_clear_buf_priv(bio->bio_buf)) 3543 dsched_new_buf(bio->bio_buf); 3544 bio_track_ref(track); 3545 } 3546 3547 /* 3548 * Initiate I/O on a vnode. 3549 * 3550 * SWAPCACHE OPERATION: 3551 * 3552 * Real buffer cache buffers have a non-NULL bp->b_vp. Unfortunately 3553 * devfs also uses b_vp for fake buffers so we also have to check 3554 * that B_PAGING is 0. In this case the passed 'vp' is probably the 3555 * underlying block device. The swap assignments are related to the 3556 * buffer cache buffer's b_vp, not the passed vp. 3557 * 3558 * The passed vp == bp->b_vp only in the case where the strategy call 3559 * is made on the vp itself for its own buffers (a regular file or 3560 * block device vp). The filesystem usually then re-calls vn_strategy() 3561 * after translating the request to an underlying device. 3562 * 3563 * Cluster buffers set B_CLUSTER and the passed vp is the vp of the 3564 * underlying buffer cache buffers. 3565 * 3566 * We can only deal with page-aligned buffers at the moment, because 3567 * we can't tell what the real dirty state for pages straddling a buffer 3568 * are. 
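 *
 * A typical synchronous read through this path looks roughly like
 * the following sketch (error handling omitted; bread() and friends
 * wrap this up for normal callers):
 *
 *	bp->b_cmd = BUF_CMD_READ;
 *	bp->b_bio1.bio_done = biodone_sync;
 *	bp->b_bio1.bio_flags |= BIO_SYNC;
 *	vfs_busy_pages(vp, bp);
 *	vn_strategy(vp, &bp->b_bio1);
 *	error = biowait(&bp->b_bio1, "biord");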
3569 * 3570 * In order to call swap_pager_strategy() we must provide the VM object 3571 * and base offset for the underlying buffer cache pages so it can find 3572 * the swap blocks. 3573 */ 3574 void 3575 vn_strategy(struct vnode *vp, struct bio *bio) 3576 { 3577 struct bio_track *track; 3578 struct buf *bp = bio->bio_buf; 3579 3580 KKASSERT(bp->b_cmd != BUF_CMD_DONE); 3581 3582 /* 3583 * Set when an I/O is issued on the bp. Cleared by consumers 3584 * (aka HAMMER), allowing the consumer to determine if I/O had 3585 * actually occurred. 3586 */ 3587 bp->b_flags |= B_IODEBUG; 3588 3589 /* 3590 * Handle the swap cache intercept. 3591 */ 3592 if (vn_cache_strategy(vp, bio)) 3593 return; 3594 3595 /* 3596 * Otherwise do the operation through the filesystem 3597 */ 3598 if (bp->b_cmd == BUF_CMD_READ) 3599 track = &vp->v_track_read; 3600 else 3601 track = &vp->v_track_write; 3602 KKASSERT((bio->bio_flags & BIO_DONE) == 0); 3603 bio->bio_track = track; 3604 if (dsched_is_clear_buf_priv(bio->bio_buf)) 3605 dsched_new_buf(bio->bio_buf); 3606 bio_track_ref(track); 3607 vop_strategy(*vp->v_ops, vp, bio); 3608 } 3609 3610 int 3611 vn_cache_strategy(struct vnode *vp, struct bio *bio) 3612 { 3613 struct buf *bp = bio->bio_buf; 3614 struct bio *nbio; 3615 vm_object_t object; 3616 vm_page_t m; 3617 int i; 3618 3619 /* 3620 * Is this buffer cache buffer suitable for reading from 3621 * the swap cache? 3622 */ 3623 if (vm_swapcache_read_enable == 0 || 3624 bp->b_cmd != BUF_CMD_READ || 3625 ((bp->b_flags & B_CLUSTER) == 0 && 3626 (bp->b_vp == NULL || (bp->b_flags & B_PAGING))) || 3627 ((int)bp->b_loffset & PAGE_MASK) != 0 || 3628 (bp->b_bcount & PAGE_MASK) != 0) { 3629 return(0); 3630 } 3631 3632 /* 3633 * Figure out the original VM object (it will match the underlying 3634 * VM pages). Note that swap cached data uses page indices relative 3635 * to that object, not relative to bio->bio_offset. 3636 */ 3637 if (bp->b_flags & B_CLUSTER) 3638 object = vp->v_object; 3639 else 3640 object = bp->b_vp->v_object; 3641 3642 /* 3643 * In order to be able to use the swap cache all underlying VM 3644 * pages must be marked as such, and we can't have any bogus pages. 3645 */ 3646 for (i = 0; i < bp->b_xio.xio_npages; ++i) { 3647 m = bp->b_xio.xio_pages[i]; 3648 if ((m->flags & PG_SWAPPED) == 0) 3649 break; 3650 if (m == bogus_page) 3651 break; 3652 } 3653 3654 /* 3655 * If we are good then issue the I/O using swap_pager_strategy() 3656 */ 3657 if (i == bp->b_xio.xio_npages) { 3658 m = bp->b_xio.xio_pages[0]; 3659 nbio = push_bio(bio); 3660 nbio->bio_offset = ptoa(m->pindex); 3661 KKASSERT(m->object == object); 3662 swap_pager_strategy(object, nbio); 3663 return(1); 3664 } 3665 return(0); 3666 } 3667 3668 /* 3669 * bpdone: 3670 * 3671 * Finish I/O on a buffer after all BIOs have been processed. 3672 * Called when the bio chain is exhausted or by biowait. If called 3673 * by biowait, elseit is typically 0. 3674 * 3675 * bpdone is also responsible for setting B_CACHE in a B_VMIO bp. 3676 * In a non-VMIO bp, B_CACHE will be set on the next getblk() 3677 * assuming B_INVAL is clear. 3678 * 3679 * For the VMIO case, we set B_CACHE if the op was a read and no 3680 * read error occured, or if the op was a write. B_CACHE is never 3681 * set if the buffer is invalid or otherwise uncacheable. 3682 * 3683 * bpdone does not mess with B_INVAL, allowing the I/O routine or the 3684 * initiator to leave B_INVAL set to brelse the buffer out of existance 3685 * in the biodone routine. 
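 *
 * The elseit argument controls final disposition: when non-zero
 * (the normal biodone() path) bpdone() brelse()s or bqrelse()s the
 * buffer itself; when zero (the biowait()/biodone_sync() path) the
 * buffer is left completed but still locked for the waiter to
 * release.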
3686 */ 3687 void 3688 bpdone(struct buf *bp, int elseit) 3689 { 3690 buf_cmd_t cmd; 3691 3692 KASSERT(BUF_REFCNTNB(bp) > 0, 3693 ("biodone: bp %p not busy %d", bp, BUF_REFCNTNB(bp))); 3694 KASSERT(bp->b_cmd != BUF_CMD_DONE, 3695 ("biodone: bp %p already done!", bp)); 3696 3697 /* 3698 * No more BIOs are left. All completion functions have been dealt 3699 * with, now we clean up the buffer. 3700 */ 3701 cmd = bp->b_cmd; 3702 bp->b_cmd = BUF_CMD_DONE; 3703 3704 /* 3705 * Only reads and writes are processed past this point. 3706 */ 3707 if (cmd != BUF_CMD_READ && cmd != BUF_CMD_WRITE) { 3708 if (cmd == BUF_CMD_FREEBLKS) 3709 bp->b_flags |= B_NOCACHE; 3710 if (elseit) 3711 brelse(bp); 3712 return; 3713 } 3714 3715 /* 3716 * Warning: softupdates may re-dirty the buffer, and HAMMER can do 3717 * a lot worse. XXX - move this above the clearing of b_cmd 3718 */ 3719 if (LIST_FIRST(&bp->b_dep) != NULL) 3720 buf_complete(bp); /* MPSAFE */ 3721 3722 /* 3723 * A failed write must re-dirty the buffer unless B_INVAL 3724 * was set. Only applicable to normal buffers (with VPs). 3725 * vinum buffers may not have a vp. 3726 */ 3727 if (cmd == BUF_CMD_WRITE && 3728 (bp->b_flags & (B_ERROR | B_INVAL)) == B_ERROR) { 3729 bp->b_flags &= ~B_NOCACHE; 3730 if (bp->b_vp) 3731 bdirty(bp); 3732 } 3733 3734 if (bp->b_flags & B_VMIO) { 3735 int i; 3736 vm_ooffset_t foff; 3737 vm_page_t m; 3738 vm_object_t obj; 3739 int iosize; 3740 struct vnode *vp = bp->b_vp; 3741 3742 obj = vp->v_object; 3743 3744 #if defined(VFS_BIO_DEBUG) 3745 if (vp->v_auxrefs == 0) 3746 panic("biodone: zero vnode hold count"); 3747 if ((vp->v_flag & VOBJBUF) == 0) 3748 panic("biodone: vnode is not setup for merged cache"); 3749 #endif 3750 3751 foff = bp->b_loffset; 3752 KASSERT(foff != NOOFFSET, ("biodone: no buffer offset")); 3753 KASSERT(obj != NULL, ("biodone: missing VM object")); 3754 3755 #if defined(VFS_BIO_DEBUG) 3756 if (obj->paging_in_progress < bp->b_xio.xio_npages) { 3757 kprintf("biodone: paging in progress(%d) < bp->b_xio.xio_npages(%d)\n", 3758 obj->paging_in_progress, bp->b_xio.xio_npages); 3759 } 3760 #endif 3761 3762 /* 3763 * Set B_CACHE if the op was a normal read and no error 3764 * occured. B_CACHE is set for writes in the b*write() 3765 * routines. 3766 */ 3767 iosize = bp->b_bcount - bp->b_resid; 3768 if (cmd == BUF_CMD_READ && 3769 (bp->b_flags & (B_INVAL|B_NOCACHE|B_ERROR)) == 0) { 3770 bp->b_flags |= B_CACHE; 3771 } 3772 3773 lwkt_gettoken(&vm_token); 3774 for (i = 0; i < bp->b_xio.xio_npages; i++) { 3775 int bogusflag = 0; 3776 int resid; 3777 3778 resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; 3779 if (resid > iosize) 3780 resid = iosize; 3781 3782 /* 3783 * cleanup bogus pages, restoring the originals. Since 3784 * the originals should still be wired, we don't have 3785 * to worry about interrupt/freeing races destroying 3786 * the VM object association. 
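 *
 * The pmap_qenter() below re-establishes the buffer's KVA mapping
 * for the whole page array once the original page has been looked
 * up again, since vfs_busy_pages() had mapped bogus_page in its
 * place for the duration of the I/O.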
3787 */ 3788 m = bp->b_xio.xio_pages[i]; 3789 if (m == bogus_page) { 3790 bogusflag = 1; 3791 m = vm_page_lookup(obj, OFF_TO_IDX(foff)); 3792 if (m == NULL) 3793 panic("biodone: page disappeared"); 3794 bp->b_xio.xio_pages[i] = m; 3795 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 3796 bp->b_xio.xio_pages, bp->b_xio.xio_npages); 3797 } 3798 #if defined(VFS_BIO_DEBUG) 3799 if (OFF_TO_IDX(foff) != m->pindex) { 3800 kprintf("biodone: foff(%lu)/m->pindex(%ld) " 3801 "mismatch\n", 3802 (unsigned long)foff, (long)m->pindex); 3803 } 3804 #endif 3805 3806 /* 3807 * In the write case, the valid and clean bits are 3808 * already changed correctly (see bdwrite()), so we 3809 * only need to do this here in the read case. 3810 */ 3811 if (cmd == BUF_CMD_READ && !bogusflag && resid > 0) { 3812 vfs_clean_one_page(bp, i, m); 3813 } 3814 vm_page_flag_clear(m, PG_ZERO); 3815 3816 /* 3817 * when debugging new filesystems or buffer I/O 3818 * methods, this is the most common error that pops 3819 * up. if you see this, you have not set the page 3820 * busy flag correctly!!! 3821 */ 3822 if (m->busy == 0) { 3823 kprintf("biodone: page busy < 0, " 3824 "pindex: %d, foff: 0x(%x,%x), " 3825 "resid: %d, index: %d\n", 3826 (int) m->pindex, (int)(foff >> 32), 3827 (int) foff & 0xffffffff, resid, i); 3828 if (!vn_isdisk(vp, NULL)) 3829 kprintf(" iosize: %ld, loffset: %lld, " 3830 "flags: 0x%08x, npages: %d\n", 3831 bp->b_vp->v_mount->mnt_stat.f_iosize, 3832 (long long)bp->b_loffset, 3833 bp->b_flags, bp->b_xio.xio_npages); 3834 else 3835 kprintf(" VDEV, loffset: %lld, flags: 0x%08x, npages: %d\n", 3836 (long long)bp->b_loffset, 3837 bp->b_flags, bp->b_xio.xio_npages); 3838 kprintf(" valid: 0x%x, dirty: 0x%x, wired: %d\n", 3839 m->valid, m->dirty, m->wire_count); 3840 panic("biodone: page busy < 0"); 3841 } 3842 vm_page_io_finish(m); 3843 vm_object_pip_subtract(obj, 1); 3844 foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; 3845 iosize -= resid; 3846 } 3847 bp->b_flags &= ~B_HASBOGUS; 3848 if (obj) 3849 vm_object_pip_wakeupn(obj, 0); 3850 lwkt_reltoken(&vm_token); 3851 } 3852 3853 /* 3854 * Finish up by releasing the buffer. There are no more synchronous 3855 * or asynchronous completions, those were handled by bio_done 3856 * callbacks. 3857 */ 3858 if (elseit) { 3859 if (bp->b_flags & (B_NOCACHE|B_INVAL|B_ERROR|B_RELBUF)) 3860 brelse(bp); 3861 else 3862 bqrelse(bp); 3863 } 3864 } 3865 3866 /* 3867 * Normal biodone. 3868 */ 3869 void 3870 biodone(struct bio *bio) 3871 { 3872 struct buf *bp = bio->bio_buf; 3873 3874 runningbufwakeup(bp); 3875 3876 /* 3877 * Run up the chain of BIO's. Leave b_cmd intact for the duration. 3878 */ 3879 while (bio) { 3880 biodone_t *done_func; 3881 struct bio_track *track; 3882 3883 /* 3884 * BIO tracking. Most but not all BIOs are tracked. 3885 */ 3886 if ((track = bio->bio_track) != NULL) { 3887 bio_track_rel(track); 3888 bio->bio_track = NULL; 3889 } 3890 3891 /* 3892 * A bio_done function terminates the loop. The function 3893 * will be responsible for any further chaining and/or 3894 * buffer management. 3895 * 3896 * WARNING! The done function can deallocate the buffer! 3897 */ 3898 if ((done_func = bio->bio_done) != NULL) { 3899 bio->bio_done = NULL; 3900 done_func(bio); 3901 return; 3902 } 3903 bio = bio->bio_prev; 3904 } 3905 3906 /* 3907 * If we've run out of bio's do normal [a]synchronous completion. 3908 */ 3909 bpdone(bp, 1); 3910 } 3911 3912 /* 3913 * Synchronous biodone - this terminates a synchronous BIO. 
3914 * 3915 * bpdone() is called with elseit=FALSE, leaving the buffer completed 3916 * but still locked. The caller must brelse() the buffer after waiting 3917 * for completion. 3918 */ 3919 void 3920 biodone_sync(struct bio *bio) 3921 { 3922 struct buf *bp = bio->bio_buf; 3923 int flags; 3924 int nflags; 3925 3926 KKASSERT(bio == &bp->b_bio1); 3927 bpdone(bp, 0); 3928 3929 for (;;) { 3930 flags = bio->bio_flags; 3931 nflags = (flags | BIO_DONE) & ~BIO_WANT; 3932 3933 if (atomic_cmpset_int(&bio->bio_flags, flags, nflags)) { 3934 if (flags & BIO_WANT) 3935 wakeup(bio); 3936 break; 3937 } 3938 } 3939 } 3940 3941 /* 3942 * vfs_unbusy_pages: 3943 * 3944 * This routine is called in lieu of iodone in the case of 3945 * incomplete I/O. This keeps the busy status for pages 3946 * consistant. 3947 */ 3948 void 3949 vfs_unbusy_pages(struct buf *bp) 3950 { 3951 int i; 3952 3953 runningbufwakeup(bp); 3954 3955 lwkt_gettoken(&vm_token); 3956 if (bp->b_flags & B_VMIO) { 3957 struct vnode *vp = bp->b_vp; 3958 vm_object_t obj; 3959 3960 obj = vp->v_object; 3961 3962 for (i = 0; i < bp->b_xio.xio_npages; i++) { 3963 vm_page_t m = bp->b_xio.xio_pages[i]; 3964 3965 /* 3966 * When restoring bogus changes the original pages 3967 * should still be wired, so we are in no danger of 3968 * losing the object association and do not need 3969 * critical section protection particularly. 3970 */ 3971 if (m == bogus_page) { 3972 m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_loffset) + i); 3973 if (!m) { 3974 panic("vfs_unbusy_pages: page missing"); 3975 } 3976 bp->b_xio.xio_pages[i] = m; 3977 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 3978 bp->b_xio.xio_pages, bp->b_xio.xio_npages); 3979 } 3980 vm_object_pip_subtract(obj, 1); 3981 vm_page_flag_clear(m, PG_ZERO); 3982 vm_page_io_finish(m); 3983 } 3984 bp->b_flags &= ~B_HASBOGUS; 3985 vm_object_pip_wakeupn(obj, 0); 3986 } 3987 lwkt_reltoken(&vm_token); 3988 } 3989 3990 /* 3991 * vfs_busy_pages: 3992 * 3993 * This routine is called before a device strategy routine. 3994 * It is used to tell the VM system that paging I/O is in 3995 * progress, and treat the pages associated with the buffer 3996 * almost as being PG_BUSY. Also the object 'paging_in_progress' 3997 * flag is handled to make sure that the object doesn't become 3998 * inconsistant. 3999 * 4000 * Since I/O has not been initiated yet, certain buffer flags 4001 * such as B_ERROR or B_INVAL may be in an inconsistant state 4002 * and should be ignored. 4003 * 4004 * MPSAFE 4005 */ 4006 void 4007 vfs_busy_pages(struct vnode *vp, struct buf *bp) 4008 { 4009 int i, bogus; 4010 struct lwp *lp = curthread->td_lwp; 4011 4012 /* 4013 * The buffer's I/O command must already be set. If reading, 4014 * B_CACHE must be 0 (double check against callers only doing 4015 * I/O when B_CACHE is 0). 4016 */ 4017 KKASSERT(bp->b_cmd != BUF_CMD_DONE); 4018 KKASSERT(bp->b_cmd == BUF_CMD_WRITE || (bp->b_flags & B_CACHE) == 0); 4019 4020 if (bp->b_flags & B_VMIO) { 4021 vm_object_t obj; 4022 4023 lwkt_gettoken(&vm_token); 4024 4025 obj = vp->v_object; 4026 KASSERT(bp->b_loffset != NOOFFSET, 4027 ("vfs_busy_pages: no buffer offset")); 4028 4029 /* 4030 * Loop until none of the pages are busy. 4031 */ 4032 retry: 4033 for (i = 0; i < bp->b_xio.xio_npages; i++) { 4034 vm_page_t m = bp->b_xio.xio_pages[i]; 4035 4036 if (vm_page_sleep_busy(m, FALSE, "vbpage")) 4037 goto retry; 4038 } 4039 4040 /* 4041 * Setup for I/O, soft-busy the page right now because 4042 * the next loop may block. 
4043 */ 4044 for (i = 0; i < bp->b_xio.xio_npages; i++) { 4045 vm_page_t m = bp->b_xio.xio_pages[i]; 4046 4047 vm_page_flag_clear(m, PG_ZERO); 4048 if ((bp->b_flags & B_CLUSTER) == 0) { 4049 vm_object_pip_add(obj, 1); 4050 vm_page_io_start(m); 4051 } 4052 } 4053 4054 /* 4055 * Adjust protections for I/O and do bogus-page mapping. 4056 * Assume that vm_page_protect() can block (it can block 4057 * if VM_PROT_NONE, don't take any chances regardless). 4058 * 4059 * In particular note that for writes we must incorporate 4060 * page dirtyness from the VM system into the buffer's 4061 * dirty range. 4062 * 4063 * For reads we theoretically must incorporate page dirtyness 4064 * from the VM system to determine if the page needs bogus 4065 * replacement, but we shortcut the test by simply checking 4066 * that all m->valid bits are set, indicating that the page 4067 * is fully valid and does not need to be re-read. For any 4068 * VM system dirtyness the page will also be fully valid 4069 * since it was mapped at one point. 4070 */ 4071 bogus = 0; 4072 for (i = 0; i < bp->b_xio.xio_npages; i++) { 4073 vm_page_t m = bp->b_xio.xio_pages[i]; 4074 4075 vm_page_flag_clear(m, PG_ZERO); /* XXX */ 4076 if (bp->b_cmd == BUF_CMD_WRITE) { 4077 /* 4078 * When readying a vnode-backed buffer for 4079 * a write we must zero-fill any invalid 4080 * portions of the backing VM pages, mark 4081 * it valid and clear related dirty bits. 4082 * 4083 * vfs_clean_one_page() incorporates any 4084 * VM dirtyness and updates the b_dirtyoff 4085 * range (after we've made the page RO). 4086 * 4087 * It is also expected that the pmap modified 4088 * bit has already been cleared by the 4089 * vm_page_protect(). We may not be able 4090 * to clear all dirty bits for a page if it 4091 * was also memory mapped (NFS). 4092 * 4093 * Finally be sure to unassign any swap-cache 4094 * backing store as it is now stale. 4095 */ 4096 vm_page_protect(m, VM_PROT_READ); 4097 vfs_clean_one_page(bp, i, m); 4098 swap_pager_unswapped(m); 4099 } else if (m->valid == VM_PAGE_BITS_ALL) { 4100 /* 4101 * When readying a vnode-backed buffer for 4102 * read we must replace any dirty pages with 4103 * a bogus page so dirty data is not destroyed 4104 * when filling gaps. 4105 * 4106 * To avoid testing whether the page is 4107 * dirty we instead test that the page was 4108 * at some point mapped (m->valid fully 4109 * valid) with the understanding that 4110 * this also covers the dirty case. 4111 */ 4112 bp->b_xio.xio_pages[i] = bogus_page; 4113 bp->b_flags |= B_HASBOGUS; 4114 bogus++; 4115 } else if (m->valid & m->dirty) { 4116 /* 4117 * This case should not occur as partial 4118 * dirtyment can only happen if the buffer 4119 * is B_CACHE, and this code is not entered 4120 * if the buffer is B_CACHE. 4121 */ 4122 kprintf("Warning: vfs_busy_pages - page not " 4123 "fully valid! loff=%jx bpf=%08x " 4124 "idx=%d val=%02x dir=%02x\n", 4125 (intmax_t)bp->b_loffset, bp->b_flags, 4126 i, m->valid, m->dirty); 4127 vm_page_protect(m, VM_PROT_NONE); 4128 } else { 4129 /* 4130 * The page is not valid and can be made 4131 * part of the read. 4132 */ 4133 vm_page_protect(m, VM_PROT_NONE); 4134 } 4135 } 4136 if (bogus) { 4137 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 4138 bp->b_xio.xio_pages, bp->b_xio.xio_npages); 4139 } 4140 lwkt_reltoken(&vm_token); 4141 } 4142 4143 /* 4144 * This is the easiest place to put the process accounting for the I/O 4145 * for now. 
 */
	if (lp != NULL) {
		if (bp->b_cmd == BUF_CMD_READ)
			lp->lwp_ru.ru_inblock++;
		else
			lp->lwp_ru.ru_oublock++;
	}
}

/*
 * vfs_clean_pages:
 *
 *	Tell the VM system that the pages associated with this buffer
 *	are clean.  This is used for delayed writes where the data is
 *	going to go to disk eventually without additional VM intervention.
 *
 *	Note that while we only really need to clean through to b_bcount, we
 *	just go ahead and clean through to b_bufsize.
 */
static void
vfs_clean_pages(struct buf *bp)
{
	vm_page_t m;
	int i;

	if ((bp->b_flags & B_VMIO) == 0)
		return;

	KASSERT(bp->b_loffset != NOOFFSET,
		("vfs_clean_pages: no buffer offset"));

	for (i = 0; i < bp->b_xio.xio_npages; i++) {
		m = bp->b_xio.xio_pages[i];
		vfs_clean_one_page(bp, i, m);
	}
}

/*
 * vfs_clean_one_page:
 *
 *	Set the valid bits and clear the dirty bits in a page within a
 *	buffer.  The range is restricted to the buffer's size and the
 *	buffer's logical offset might index into the first page.
 *
 *	The caller has busied or soft-busied the page and it is not mapped;
 *	we test and incorporate the dirty bits into b_dirtyoff/end before
 *	clearing them.  Note that we need to clear the pmap modified bits
 *	after determining that the page was dirty, vm_page_set_validclean()
 *	does not do it for us.
 *
 *	This routine is typically called after a read completes (dirty should
 *	be zero in that case as we are not called on bogus-replace pages),
 *	or before a write is initiated.
 */
static void
vfs_clean_one_page(struct buf *bp, int pageno, vm_page_t m)
{
	int bcount;
	int xoff;
	int soff;
	int eoff;

	/*
	 * Calculate offset range within the page but relative to buffer's
	 * loffset.  loffset might be offset into the first page.
	 */
	xoff = (int)bp->b_loffset & PAGE_MASK; /* loffset offset into pg 0 */
	bcount = bp->b_bcount + xoff;	       /* offset adjusted */

	if (pageno == 0) {
		soff = xoff;
		eoff = PAGE_SIZE;
	} else {
		soff = (pageno << PAGE_SHIFT);
		eoff = soff + PAGE_SIZE;
	}
	if (eoff > bcount)
		eoff = bcount;
	if (soff >= eoff)
		return;

	/*
	 * Test dirty bits and adjust b_dirtyoff/end.
	 *
	 * If dirty pages are incorporated into the bp any prior
	 * B_NEEDCOMMIT state (NFS) must be cleared because the
	 * caller has not taken into account the new dirty data.
	 *
	 * If the page was memory mapped the dirty bits might go beyond the
	 * end of the buffer, but we can't really make the assumption that
	 * a file EOF straddles the buffer (even though this is the case for
	 * NFS if B_NEEDCOMMIT is also set).  So for the purposes of clearing
	 * B_NEEDCOMMIT we only test the dirty bits covered by the buffer.
	 * This also saves some console spam.
	 *
	 * When clearing B_NEEDCOMMIT we must also clear B_CLUSTEROK,
	 * NFS can handle huge commits but not huge writes.
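	 *
	 * Worked example (illustrative only, assuming PAGE_SIZE = 4096):
	 * for a buffer with b_loffset = 0x1800 and b_bcount = 8192 the
	 * calculation above yields xoff = 0x800 and bcount = 10240, giving
	 * per-page windows of
	 *
	 *	pageno 0: soff = 2048, eoff = 4096
	 *	pageno 1: soff = 4096, eoff = 8192
	 *	pageno 2: soff = 8192, eoff = 10240 (clipped to bcount)
	 *
	 * so only the dirty bits returned by vm_page_bits(soff & PAGE_MASK,
	 * eoff - soff) for each window are considered when deciding whether
	 * to clear B_NEEDCOMMIT.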
4243 */ 4244 vm_page_test_dirty(m); 4245 if (m->dirty) { 4246 if ((bp->b_flags & B_NEEDCOMMIT) && 4247 (m->dirty & vm_page_bits(soff & PAGE_MASK, eoff - soff))) { 4248 if (debug_commit) 4249 kprintf("Warning: vfs_clean_one_page: bp %p " 4250 "loff=%jx,%d flgs=%08x clr B_NEEDCOMMIT" 4251 " cmd %d vd %02x/%02x x/s/e %d %d %d " 4252 "doff/end %d %d\n", 4253 bp, (intmax_t)bp->b_loffset, bp->b_bcount, 4254 bp->b_flags, bp->b_cmd, 4255 m->valid, m->dirty, xoff, soff, eoff, 4256 bp->b_dirtyoff, bp->b_dirtyend); 4257 bp->b_flags &= ~(B_NEEDCOMMIT | B_CLUSTEROK); 4258 if (debug_commit) 4259 print_backtrace(-1); 4260 } 4261 /* 4262 * Only clear the pmap modified bits if ALL the dirty bits 4263 * are set, otherwise the system might mis-clear portions 4264 * of a page. 4265 */ 4266 if (m->dirty == VM_PAGE_BITS_ALL && 4267 (bp->b_flags & B_NEEDCOMMIT) == 0) { 4268 pmap_clear_modify(m); 4269 } 4270 if (bp->b_dirtyoff > soff - xoff) 4271 bp->b_dirtyoff = soff - xoff; 4272 if (bp->b_dirtyend < eoff - xoff) 4273 bp->b_dirtyend = eoff - xoff; 4274 } 4275 4276 /* 4277 * Set related valid bits, clear related dirty bits. 4278 * Does not mess with the pmap modified bit. 4279 * 4280 * WARNING! We cannot just clear all of m->dirty here as the 4281 * buffer cache buffers may use a DEV_BSIZE'd aligned 4282 * block size, or have an odd size (e.g. NFS at file EOF). 4283 * The putpages code can clear m->dirty to 0. 4284 * 4285 * If a VOP_WRITE generates a buffer cache buffer which 4286 * covers the same space as mapped writable pages the 4287 * buffer flush might not be able to clear all the dirty 4288 * bits and still require a putpages from the VM system 4289 * to finish it off. 4290 */ 4291 vm_page_set_validclean(m, soff & PAGE_MASK, eoff - soff); 4292 } 4293 4294 /* 4295 * Similar to vfs_clean_one_page() but sets the bits to valid and dirty. 4296 * The page data is assumed to be valid (there is no zeroing here). 4297 */ 4298 static void 4299 vfs_dirty_one_page(struct buf *bp, int pageno, vm_page_t m) 4300 { 4301 int bcount; 4302 int xoff; 4303 int soff; 4304 int eoff; 4305 4306 /* 4307 * Calculate offset range within the page but relative to buffer's 4308 * loffset. loffset might be offset into the first page. 4309 */ 4310 xoff = (int)bp->b_loffset & PAGE_MASK; /* loffset offset into pg 0 */ 4311 bcount = bp->b_bcount + xoff; /* offset adjusted */ 4312 4313 if (pageno == 0) { 4314 soff = xoff; 4315 eoff = PAGE_SIZE; 4316 } else { 4317 soff = (pageno << PAGE_SHIFT); 4318 eoff = soff + PAGE_SIZE; 4319 } 4320 if (eoff > bcount) 4321 eoff = bcount; 4322 if (soff >= eoff) 4323 return; 4324 vm_page_set_validdirty(m, soff & PAGE_MASK, eoff - soff); 4325 } 4326 4327 /* 4328 * vfs_bio_clrbuf: 4329 * 4330 * Clear a buffer. This routine essentially fakes an I/O, so we need 4331 * to clear B_ERROR and B_INVAL. 4332 * 4333 * Note that while we only theoretically need to clear through b_bcount, 4334 * we go ahead and clear through b_bufsize. 
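 *
 *	Illustrative example (assuming DEV_BSIZE = 512): for a 2048 byte,
 *	page-aligned buffer the single-page shortcut below computes
 *
 *		mask = (1 << (2048 / 512)) - 1 = 0x0f
 *
 *	and returns without zeroing anything when the first page's valid
 *	bits already cover that mask, i.e. when
 *	(b_xio.xio_pages[0]->valid & 0x0f) == 0x0f.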
4335 */ 4336 4337 void 4338 vfs_bio_clrbuf(struct buf *bp) 4339 { 4340 int i, mask = 0; 4341 caddr_t sa, ea; 4342 if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) { 4343 bp->b_flags &= ~(B_INVAL | B_EINTR | B_ERROR); 4344 if ((bp->b_xio.xio_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && 4345 (bp->b_loffset & PAGE_MASK) == 0) { 4346 mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; 4347 if ((bp->b_xio.xio_pages[0]->valid & mask) == mask) { 4348 bp->b_resid = 0; 4349 return; 4350 } 4351 if (((bp->b_xio.xio_pages[0]->flags & PG_ZERO) == 0) && 4352 ((bp->b_xio.xio_pages[0]->valid & mask) == 0)) { 4353 bzero(bp->b_data, bp->b_bufsize); 4354 bp->b_xio.xio_pages[0]->valid |= mask; 4355 bp->b_resid = 0; 4356 return; 4357 } 4358 } 4359 sa = bp->b_data; 4360 for(i=0;i<bp->b_xio.xio_npages;i++,sa=ea) { 4361 int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE; 4362 ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE); 4363 ea = (caddr_t)(vm_offset_t)ulmin( 4364 (u_long)(vm_offset_t)ea, 4365 (u_long)(vm_offset_t)bp->b_data + bp->b_bufsize); 4366 mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; 4367 if ((bp->b_xio.xio_pages[i]->valid & mask) == mask) 4368 continue; 4369 if ((bp->b_xio.xio_pages[i]->valid & mask) == 0) { 4370 if ((bp->b_xio.xio_pages[i]->flags & PG_ZERO) == 0) { 4371 bzero(sa, ea - sa); 4372 } 4373 } else { 4374 for (; sa < ea; sa += DEV_BSIZE, j++) { 4375 if (((bp->b_xio.xio_pages[i]->flags & PG_ZERO) == 0) && 4376 (bp->b_xio.xio_pages[i]->valid & (1<<j)) == 0) 4377 bzero(sa, DEV_BSIZE); 4378 } 4379 } 4380 bp->b_xio.xio_pages[i]->valid |= mask; 4381 vm_page_flag_clear(bp->b_xio.xio_pages[i], PG_ZERO); 4382 } 4383 bp->b_resid = 0; 4384 } else { 4385 clrbuf(bp); 4386 } 4387 } 4388 4389 /* 4390 * vm_hold_load_pages: 4391 * 4392 * Load pages into the buffer's address space. The pages are 4393 * allocated from the kernel object in order to reduce interference 4394 * with the any VM paging I/O activity. The range of loaded 4395 * pages will be wired. 4396 * 4397 * If a page cannot be allocated, the 'pagedaemon' is woken up to 4398 * retrieve the full range (to - from) of pages. 4399 * 4400 * MPSAFE 4401 */ 4402 void 4403 vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) 4404 { 4405 vm_offset_t pg; 4406 vm_page_t p; 4407 int index; 4408 4409 to = round_page(to); 4410 from = round_page(from); 4411 index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; 4412 4413 pg = from; 4414 while (pg < to) { 4415 /* 4416 * Note: must allocate system pages since blocking here 4417 * could intefere with paging I/O, no matter which 4418 * process we are. 4419 */ 4420 p = bio_page_alloc(&kernel_object, pg >> PAGE_SHIFT, 4421 (vm_pindex_t)((to - pg) >> PAGE_SHIFT)); 4422 if (p) { 4423 vm_page_wire(p); 4424 p->valid = VM_PAGE_BITS_ALL; 4425 vm_page_flag_clear(p, PG_ZERO); 4426 pmap_kenter(pg, VM_PAGE_TO_PHYS(p)); 4427 bp->b_xio.xio_pages[index] = p; 4428 vm_page_wakeup(p); 4429 4430 pg += PAGE_SIZE; 4431 ++index; 4432 } 4433 } 4434 bp->b_xio.xio_npages = index; 4435 } 4436 4437 /* 4438 * Allocate pages for a buffer cache buffer. 4439 * 4440 * Under extremely severe memory conditions even allocating out of the 4441 * system reserve can fail. If this occurs we must allocate out of the 4442 * interrupt reserve to avoid a deadlock with the pageout daemon. 4443 * 4444 * The pageout daemon can run (putpages -> VOP_WRITE -> getblk -> allocbuf). 
4445 * If the buffer cache's vm_page_alloc() fails a vm_wait() can deadlock 4446 * against the pageout daemon if pages are not freed from other sources. 4447 * 4448 * MPSAFE 4449 */ 4450 static 4451 vm_page_t 4452 bio_page_alloc(vm_object_t obj, vm_pindex_t pg, int deficit) 4453 { 4454 vm_page_t p; 4455 4456 /* 4457 * Try a normal allocation, allow use of system reserve. 4458 */ 4459 lwkt_gettoken(&vm_token); 4460 p = vm_page_alloc(obj, pg, VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM); 4461 if (p) { 4462 lwkt_reltoken(&vm_token); 4463 return(p); 4464 } 4465 4466 /* 4467 * The normal allocation failed and we clearly have a page 4468 * deficit. Try to reclaim some clean VM pages directly 4469 * from the buffer cache. 4470 */ 4471 vm_pageout_deficit += deficit; 4472 recoverbufpages(); 4473 4474 /* 4475 * We may have blocked, the caller will know what to do if the 4476 * page now exists. 4477 */ 4478 if (vm_page_lookup(obj, pg)) { 4479 lwkt_reltoken(&vm_token); 4480 return(NULL); 4481 } 4482 4483 /* 4484 * Allocate and allow use of the interrupt reserve. 4485 * 4486 * If after all that we still can't allocate a VM page we are 4487 * in real trouble, but we slog on anyway hoping that the system 4488 * won't deadlock. 4489 */ 4490 p = vm_page_alloc(obj, pg, VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM | 4491 VM_ALLOC_INTERRUPT); 4492 if (p) { 4493 if (vm_page_count_severe()) { 4494 ++lowmempgallocs; 4495 vm_wait(hz / 20 + 1); 4496 } 4497 } else { 4498 kprintf("bio_page_alloc: Memory exhausted during bufcache " 4499 "page allocation\n"); 4500 ++lowmempgfails; 4501 vm_wait(hz); 4502 } 4503 lwkt_reltoken(&vm_token); 4504 return(p); 4505 } 4506 4507 /* 4508 * vm_hold_free_pages: 4509 * 4510 * Return pages associated with the buffer back to the VM system. 4511 * 4512 * The range of pages underlying the buffer's address space will 4513 * be unmapped and un-wired. 4514 * 4515 * MPSAFE 4516 */ 4517 void 4518 vm_hold_free_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) 4519 { 4520 vm_offset_t pg; 4521 vm_page_t p; 4522 int index, newnpages; 4523 4524 from = round_page(from); 4525 to = round_page(to); 4526 index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; 4527 newnpages = index; 4528 4529 lwkt_gettoken(&vm_token); 4530 for (pg = from; pg < to; pg += PAGE_SIZE, index++) { 4531 p = bp->b_xio.xio_pages[index]; 4532 if (p && (index < bp->b_xio.xio_npages)) { 4533 if (p->busy) { 4534 kprintf("vm_hold_free_pages: doffset: %lld, " 4535 "loffset: %lld\n", 4536 (long long)bp->b_bio2.bio_offset, 4537 (long long)bp->b_loffset); 4538 } 4539 bp->b_xio.xio_pages[index] = NULL; 4540 pmap_kremove(pg); 4541 vm_page_busy(p); 4542 vm_page_unwire(p, 0); 4543 vm_page_free(p); 4544 } 4545 } 4546 bp->b_xio.xio_npages = newnpages; 4547 lwkt_reltoken(&vm_token); 4548 } 4549 4550 /* 4551 * vmapbuf: 4552 * 4553 * Map a user buffer into KVM via a pbuf. On return the buffer's 4554 * b_data, b_bufsize, and b_bcount will be set, and its XIO page array 4555 * initialized. 4556 */ 4557 int 4558 vmapbuf(struct buf *bp, caddr_t udata, int bytes) 4559 { 4560 caddr_t addr; 4561 vm_offset_t va; 4562 vm_page_t m; 4563 int vmprot; 4564 int error; 4565 int pidx; 4566 int i; 4567 4568 /* 4569 * bp had better have a command and it better be a pbuf. 4570 */ 4571 KKASSERT(bp->b_cmd != BUF_CMD_DONE); 4572 KKASSERT(bp->b_flags & B_PAGING); 4573 KKASSERT(bp->b_kvabase); 4574 4575 if (bytes < 0) 4576 return (-1); 4577 4578 /* 4579 * Map the user data into KVM. Mappings have to be page-aligned. 
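	 *
	 * Illustrative example (hypothetical addresses): for udata = 0x1234
	 * and bytes = 8192, addr starts at trunc_page(0x1234) = 0x1000 and
	 * the loop below holds the three user pages backing 0x1000, 0x2000
	 * and 0x3000 (addr stops once it reaches udata + bytes = 0x3234),
	 * after which b_data points at b_kvabase + 0x234.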
4580 */ 4581 addr = (caddr_t)trunc_page((vm_offset_t)udata); 4582 pidx = 0; 4583 4584 vmprot = VM_PROT_READ; 4585 if (bp->b_cmd == BUF_CMD_READ) 4586 vmprot |= VM_PROT_WRITE; 4587 4588 while (addr < udata + bytes) { 4589 /* 4590 * Do the vm_fault if needed; do the copy-on-write thing 4591 * when reading stuff off device into memory. 4592 * 4593 * vm_fault_page*() returns a held VM page. 4594 */ 4595 va = (addr >= udata) ? (vm_offset_t)addr : (vm_offset_t)udata; 4596 va = trunc_page(va); 4597 4598 m = vm_fault_page_quick(va, vmprot, &error); 4599 if (m == NULL) { 4600 for (i = 0; i < pidx; ++i) { 4601 vm_page_unhold(bp->b_xio.xio_pages[i]); 4602 bp->b_xio.xio_pages[i] = NULL; 4603 } 4604 return(-1); 4605 } 4606 bp->b_xio.xio_pages[pidx] = m; 4607 addr += PAGE_SIZE; 4608 ++pidx; 4609 } 4610 4611 /* 4612 * Map the page array and set the buffer fields to point to 4613 * the mapped data buffer. 4614 */ 4615 if (pidx > btoc(MAXPHYS)) 4616 panic("vmapbuf: mapped more than MAXPHYS"); 4617 pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_xio.xio_pages, pidx); 4618 4619 bp->b_xio.xio_npages = pidx; 4620 bp->b_data = bp->b_kvabase + ((int)(intptr_t)udata & PAGE_MASK); 4621 bp->b_bcount = bytes; 4622 bp->b_bufsize = bytes; 4623 return(0); 4624 } 4625 4626 /* 4627 * vunmapbuf: 4628 * 4629 * Free the io map PTEs associated with this IO operation. 4630 * We also invalidate the TLB entries and restore the original b_addr. 4631 */ 4632 void 4633 vunmapbuf(struct buf *bp) 4634 { 4635 int pidx; 4636 int npages; 4637 4638 KKASSERT(bp->b_flags & B_PAGING); 4639 4640 npages = bp->b_xio.xio_npages; 4641 pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages); 4642 for (pidx = 0; pidx < npages; ++pidx) { 4643 vm_page_unhold(bp->b_xio.xio_pages[pidx]); 4644 bp->b_xio.xio_pages[pidx] = NULL; 4645 } 4646 bp->b_xio.xio_npages = 0; 4647 bp->b_data = bp->b_kvabase; 4648 } 4649 4650 /* 4651 * Scan all buffers in the system and issue the callback. 4652 */ 4653 int 4654 scan_all_buffers(int (*callback)(struct buf *, void *), void *info) 4655 { 4656 int count = 0; 4657 int error; 4658 int n; 4659 4660 for (n = 0; n < nbuf; ++n) { 4661 if ((error = callback(&buf[n], info)) < 0) { 4662 count = error; 4663 break; 4664 } 4665 count += error; 4666 } 4667 return (count); 4668 } 4669 4670 /* 4671 * nestiobuf_iodone: biodone callback for nested buffers and propagate 4672 * completion to the master buffer. 4673 */ 4674 static void 4675 nestiobuf_iodone(struct bio *bio) 4676 { 4677 struct bio *mbio; 4678 struct buf *mbp, *bp; 4679 int error; 4680 int donebytes; 4681 4682 bp = bio->bio_buf; 4683 mbio = bio->bio_caller_info1.ptr; 4684 mbp = mbio->bio_buf; 4685 4686 KKASSERT(bp->b_bcount <= bp->b_bufsize); 4687 KKASSERT(mbp != bp); 4688 4689 error = bp->b_error; 4690 if (bp->b_error == 0 && 4691 (bp->b_bcount < bp->b_bufsize || bp->b_resid > 0)) { 4692 /* 4693 * Not all got transfered, raise an error. We have no way to 4694 * propagate these conditions to mbp. 4695 */ 4696 error = EIO; 4697 } 4698 4699 donebytes = bp->b_bufsize; 4700 4701 relpbuf(bp, NULL); 4702 nestiobuf_done(mbio, donebytes, error); 4703 } 4704 4705 void 4706 nestiobuf_done(struct bio *mbio, int donebytes, int error) 4707 { 4708 struct buf *mbp; 4709 4710 mbp = mbio->bio_buf; 4711 4712 KKASSERT((int)(intptr_t)mbio->bio_driver_info > 0); 4713 4714 /* 4715 * If an error occured, propagate it to the master buffer. 4716 * 4717 * Several biodone()s may wind up running concurrently so 4718 * use an atomic op to adjust b_flags. 
4719 */ 4720 if (error) { 4721 mbp->b_error = error; 4722 atomic_set_int(&mbp->b_flags, B_ERROR); 4723 } 4724 4725 /* 4726 * Decrement the operations in progress counter and terminate the 4727 * I/O if this was the last bit. 4728 */ 4729 if (atomic_fetchadd_int((int *)&mbio->bio_driver_info, -1) == 1) { 4730 mbp->b_resid = 0; 4731 biodone(mbio); 4732 } 4733 } 4734 4735 /* 4736 * Initialize a nestiobuf for use. Set an initial count of 1 to prevent 4737 * the mbio from being biodone()'d while we are still adding sub-bios to 4738 * it. 4739 */ 4740 void 4741 nestiobuf_init(struct bio *bio) 4742 { 4743 bio->bio_driver_info = (void *)1; 4744 } 4745 4746 /* 4747 * The BIOs added to the nestedio have already been started, remove the 4748 * count that placeheld our mbio and biodone() it if the count would 4749 * transition to 0. 4750 */ 4751 void 4752 nestiobuf_start(struct bio *mbio) 4753 { 4754 struct buf *mbp = mbio->bio_buf; 4755 4756 /* 4757 * Decrement the operations in progress counter and terminate the 4758 * I/O if this was the last bit. 4759 */ 4760 if (atomic_fetchadd_int((int *)&mbio->bio_driver_info, -1) == 1) { 4761 if (mbp->b_flags & B_ERROR) 4762 mbp->b_resid = mbp->b_bcount; 4763 else 4764 mbp->b_resid = 0; 4765 biodone(mbio); 4766 } 4767 } 4768 4769 /* 4770 * Set an intermediate error prior to calling nestiobuf_start() 4771 */ 4772 void 4773 nestiobuf_error(struct bio *mbio, int error) 4774 { 4775 struct buf *mbp = mbio->bio_buf; 4776 4777 if (error) { 4778 mbp->b_error = error; 4779 atomic_set_int(&mbp->b_flags, B_ERROR); 4780 } 4781 } 4782 4783 /* 4784 * nestiobuf_add: setup a "nested" buffer. 4785 * 4786 * => 'mbp' is a "master" buffer which is being divided into sub pieces. 4787 * => 'bp' should be a buffer allocated by getiobuf. 4788 * => 'offset' is a byte offset in the master buffer. 4789 * => 'size' is a size in bytes of this nested buffer. 
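 *
 * Illustrative usage sketch (not taken from this file; the sub-buffer
 * allocation and the chunk size below are placeholders):
 *
 *	nestiobuf_init(mbio);
 *	for (offset = 0; offset < mbp->b_bcount; offset += chunk) {
 *		chunk = ...;			(placeholder, <= b_bcount - offset)
 *		bp = ...;			(allocate the sub-buffer, see 'bp' above)
 *		nestiobuf_add(mbio, bp, offset, chunk);
 *		vn_strategy(vp, &bp->b_bio1);	(issue the sub-I/O)
 *	}
 *	nestiobuf_start(mbio);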
 */
void
nestiobuf_add(struct bio *mbio, struct buf *bp, int offset, size_t size)
{
	struct buf *mbp = mbio->bio_buf;
	struct vnode *vp = mbp->b_vp;

	KKASSERT(mbp->b_bcount >= offset + size);

	atomic_add_int((int *)&mbio->bio_driver_info, 1);

	/* kernel needs to own the lock for it to be released in biodone */
	BUF_KERNPROC(bp);
	bp->b_vp = vp;
	bp->b_cmd = mbp->b_cmd;
	bp->b_bio1.bio_done = nestiobuf_iodone;
	bp->b_data = (char *)mbp->b_data + offset;
	bp->b_resid = bp->b_bcount = size;
	bp->b_bufsize = bp->b_bcount;

	bp->b_bio1.bio_track = NULL;
	bp->b_bio1.bio_caller_info1.ptr = mbio;
}

/*
 * Print out statistics from the current status of the buffer pool.
 * This can be toggled by the system control option debug.syncprt.
 */
#ifdef DEBUG
void
vfs_bufstats(void)
{
	int i, j, count;
	struct buf *bp;
	struct bqueues *dp;
	int counts[(MAXBSIZE / PAGE_SIZE) + 1];
	static char *bname[3] = { "LOCKED", "LRU", "AGE" };

	for (dp = bufqueues, i = 0; dp < &bufqueues[3]; dp++, i++) {
		count = 0;
		for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
			counts[j] = 0;

		spin_lock(&bufqspin);
		TAILQ_FOREACH(bp, dp, b_freelist) {
			counts[bp->b_bufsize/PAGE_SIZE]++;
			count++;
		}
		spin_unlock(&bufqspin);

		kprintf("%s: total-%d", bname[i], count);
		for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
			if (counts[j] != 0)
				kprintf(", %d-%d", j * PAGE_SIZE, counts[j]);
		kprintf("\n");
	}
}
#endif

#ifdef DDB

DB_SHOW_COMMAND(buffer, db_show_buffer)
{
	/* get args */
	struct buf *bp = (struct buf *)addr;

	if (!have_addr) {
		db_printf("usage: show buffer <addr>\n");
		return;
	}

	db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS);
	db_printf("b_cmd = %d\n", bp->b_cmd);
	db_printf("b_error = %d, b_bufsize = %d, b_bcount = %d, "
		  "b_resid = %d\nb_data = %p, "
		  "bio_offset(disk) = %lld, bio_offset(phys) = %lld\n",
		  bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
		  bp->b_data,
		  (long long)bp->b_bio2.bio_offset,
		  (long long)(bp->b_bio2.bio_next ?
			      bp->b_bio2.bio_next->bio_offset : (off_t)-1));
	if (bp->b_xio.xio_npages) {
		int i;
		db_printf("b_xio.xio_npages = %d, pages(OBJ, IDX, PA): ",
			  bp->b_xio.xio_npages);
		for (i = 0; i < bp->b_xio.xio_npages; i++) {
			vm_page_t m;
			m = bp->b_xio.xio_pages[i];
			db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object,
				  (u_long)m->pindex,
				  (u_long)VM_PAGE_TO_PHYS(m));
			if ((i + 1) < bp->b_xio.xio_npages)
				db_printf(",");
		}
		db_printf("\n");
	}
}
#endif /* DDB */
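
/*
 * Illustrative ddb usage for the command above (the address is a
 * placeholder; any struct buf pointer will do):
 *
 *	db> show buffer 0xffffffe0012a8000
 */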