1 /* 2 * Copyright (c) 1994,1997 John S. Dyson 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice immediately at the beginning of the file, without modification, 10 * this list of conditions, and the following disclaimer. 11 * 2. Absolutely no warranty of function or purpose is made by the author 12 * John S. Dyson. 13 * 14 * $FreeBSD: src/sys/kern/vfs_bio.c,v 1.242.2.20 2003/05/28 18:38:10 alc Exp $ 15 * $DragonFly: src/sys/kern/vfs_bio.c,v 1.115 2008/08/13 11:02:31 swildner Exp $ 16 */ 17 18 /* 19 * this file contains a new buffer I/O scheme implementing a coherent 20 * VM object and buffer cache scheme. Pains have been taken to make 21 * sure that the performance degradation associated with schemes such 22 * as this is not realized. 23 * 24 * Author: John S. Dyson 25 * Significant help during the development and debugging phases 26 * had been provided by David Greenman, also of the FreeBSD core team. 27 * 28 * see man buf(9) for more info. 29 */ 30 31 #include <sys/param.h> 32 #include <sys/systm.h> 33 #include <sys/buf.h> 34 #include <sys/conf.h> 35 #include <sys/eventhandler.h> 36 #include <sys/lock.h> 37 #include <sys/malloc.h> 38 #include <sys/mount.h> 39 #include <sys/kernel.h> 40 #include <sys/kthread.h> 41 #include <sys/proc.h> 42 #include <sys/reboot.h> 43 #include <sys/resourcevar.h> 44 #include <sys/sysctl.h> 45 #include <sys/vmmeter.h> 46 #include <sys/vnode.h> 47 #include <sys/proc.h> 48 #include <vm/vm.h> 49 #include <vm/vm_param.h> 50 #include <vm/vm_kern.h> 51 #include <vm/vm_pageout.h> 52 #include <vm/vm_page.h> 53 #include <vm/vm_object.h> 54 #include <vm/vm_extern.h> 55 #include <vm/vm_map.h> 56 57 #include <sys/buf2.h> 58 #include <sys/thread2.h> 59 #include <sys/spinlock2.h> 60 #include <vm/vm_page2.h> 61 62 #include "opt_ddb.h" 63 #ifdef DDB 64 #include <ddb/ddb.h> 65 #endif 66 67 /* 68 * Buffer queues. 69 */ 70 enum bufq_type { 71 BQUEUE_NONE, /* not on any queue */ 72 BQUEUE_LOCKED, /* locked buffers */ 73 BQUEUE_CLEAN, /* non-B_DELWRI buffers */ 74 BQUEUE_DIRTY, /* B_DELWRI buffers */ 75 BQUEUE_DIRTY_HW, /* B_DELWRI buffers - heavy weight */ 76 BQUEUE_EMPTYKVA, /* empty buffer headers with KVA assignment */ 77 BQUEUE_EMPTY, /* empty buffer headers */ 78 79 BUFFER_QUEUES /* number of buffer queues */ 80 }; 81 82 typedef enum bufq_type bufq_type_t; 83 84 #define BD_WAKE_SIZE 16384 85 #define BD_WAKE_MASK (BD_WAKE_SIZE - 1) 86 87 TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES]; 88 struct spinlock bufspin = SPINLOCK_INITIALIZER(&bufspin); 89 90 static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer"); 91 92 struct buf *buf; /* buffer header pool */ 93 94 static void vfs_clean_pages(struct buf *bp); 95 static void vfs_clean_one_page(struct buf *bp, int pageno, vm_page_t m); 96 static void vfs_vmio_release(struct buf *bp); 97 static int flushbufqueues(bufq_type_t q); 98 static vm_page_t bio_page_alloc(vm_object_t obj, vm_pindex_t pg, int deficit); 99 100 static void bd_signal(int totalspace); 101 static void buf_daemon(void); 102 static void buf_daemon_hw(void); 103 104 /* 105 * bogus page -- for I/O to/from partially complete buffers 106 * this is a temporary solution to the problem, but it is not 107 * really that bad. 
 * It would be better to split the buffer
 * for input in the case of buffers partially already in memory,
 * but the code is intricate enough already.
 */
vm_page_t bogus_page;

/*
 * These are all static, but make the ones we export globals so we do
 * not need to use compiler magic.
 */
int bufspace, maxbufspace,
	bufmallocspace, maxbufmallocspace, lobufspace, hibufspace;
static int bufreusecnt, bufdefragcnt, buffreekvacnt;
static int lorunningspace, hirunningspace, runningbufreq;
int dirtybufspace, dirtybufspacehw, lodirtybufspace, hidirtybufspace;
int dirtybufcount, dirtybufcounthw;
int runningbufspace, runningbufcount;
static int getnewbufcalls;
static int getnewbufrestarts;
static int recoverbufcalls;
static int needsbuffer;		/* locked by needsbuffer_spin */
static int bd_request;		/* locked by needsbuffer_spin */
static int bd_request_hw;	/* locked by needsbuffer_spin */
static u_int bd_wake_ary[BD_WAKE_SIZE];
static u_int bd_wake_index;
static u_int vm_cycle_point = ACT_INIT + ACT_ADVANCE * 6;
static struct spinlock needsbuffer_spin;

static struct thread *bufdaemon_td;
static struct thread *bufdaemonhw_td;

/*
 * Sysctls for operational control of the buffer cache.
 */
SYSCTL_INT(_vfs, OID_AUTO, lodirtybufspace, CTLFLAG_RW, &lodirtybufspace, 0,
	"Number of dirty buffers to flush before bufdaemon becomes inactive");
SYSCTL_INT(_vfs, OID_AUTO, hidirtybufspace, CTLFLAG_RW, &hidirtybufspace, 0,
	"High watermark used to trigger explicit flushing of dirty buffers");
SYSCTL_INT(_vfs, OID_AUTO, lorunningspace, CTLFLAG_RW, &lorunningspace, 0,
	"Minimum amount of buffer space required for active I/O");
SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
	"Maximum amount of buffer space usable for active I/O");
SYSCTL_UINT(_vfs, OID_AUTO, vm_cycle_point, CTLFLAG_RW, &vm_cycle_point, 0,
	"Recycle pages to active or inactive queue transition pt 0-64");
/*
 * Sysctls determining current state of the buffer cache.
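 *
 * These nodes all live under the vfs tree.  For example (illustrative
 * sysctl(8) invocations, not part of this file), the current and
 * tunable values can be inspected or changed with:
 *
 *	sysctl vfs.dirtybufspace vfs.runningbufspace
 *	sysctl vfs.lodirtybufspace=4194304
 *
 * The CTLFLAG_RW nodes above are run-time tunables; the CTLFLAG_RD
 * nodes below only report state.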
154 */ 155 SYSCTL_INT(_vfs, OID_AUTO, nbuf, CTLFLAG_RD, &nbuf, 0, 156 "Total number of buffers in buffer cache"); 157 SYSCTL_INT(_vfs, OID_AUTO, dirtybufspace, CTLFLAG_RD, &dirtybufspace, 0, 158 "Pending bytes of dirty buffers (all)"); 159 SYSCTL_INT(_vfs, OID_AUTO, dirtybufspacehw, CTLFLAG_RD, &dirtybufspacehw, 0, 160 "Pending bytes of dirty buffers (heavy weight)"); 161 SYSCTL_INT(_vfs, OID_AUTO, dirtybufcount, CTLFLAG_RD, &dirtybufcount, 0, 162 "Pending number of dirty buffers"); 163 SYSCTL_INT(_vfs, OID_AUTO, dirtybufcounthw, CTLFLAG_RD, &dirtybufcounthw, 0, 164 "Pending number of dirty buffers (heavy weight)"); 165 SYSCTL_INT(_vfs, OID_AUTO, runningbufspace, CTLFLAG_RD, &runningbufspace, 0, 166 "I/O bytes currently in progress due to asynchronous writes"); 167 SYSCTL_INT(_vfs, OID_AUTO, runningbufcount, CTLFLAG_RD, &runningbufcount, 0, 168 "I/O buffers currently in progress due to asynchronous writes"); 169 SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0, 170 "Hard limit on maximum amount of memory usable for buffer space"); 171 SYSCTL_INT(_vfs, OID_AUTO, hibufspace, CTLFLAG_RD, &hibufspace, 0, 172 "Soft limit on maximum amount of memory usable for buffer space"); 173 SYSCTL_INT(_vfs, OID_AUTO, lobufspace, CTLFLAG_RD, &lobufspace, 0, 174 "Minimum amount of memory to reserve for system buffer space"); 175 SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0, 176 "Amount of memory available for buffers"); 177 SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RD, &maxbufmallocspace, 178 0, "Maximum amount of memory reserved for buffers using malloc"); 179 SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, &bufmallocspace, 0, 180 "Amount of memory left for buffers using malloc-scheme"); 181 SYSCTL_INT(_vfs, OID_AUTO, getnewbufcalls, CTLFLAG_RD, &getnewbufcalls, 0, 182 "New buffer header acquisition requests"); 183 SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RD, &getnewbufrestarts, 184 0, "New buffer header acquisition restarts"); 185 SYSCTL_INT(_vfs, OID_AUTO, recoverbufcalls, CTLFLAG_RD, &recoverbufcalls, 0, 186 "Recover VM space in an emergency"); 187 SYSCTL_INT(_vfs, OID_AUTO, bufdefragcnt, CTLFLAG_RD, &bufdefragcnt, 0, 188 "Buffer acquisition restarts due to fragmented buffer map"); 189 SYSCTL_INT(_vfs, OID_AUTO, buffreekvacnt, CTLFLAG_RD, &buffreekvacnt, 0, 190 "Amount of time KVA space was deallocated in an arbitrary buffer"); 191 SYSCTL_INT(_vfs, OID_AUTO, bufreusecnt, CTLFLAG_RD, &bufreusecnt, 0, 192 "Amount of time buffer re-use operations were successful"); 193 SYSCTL_INT(_debug_sizeof, OID_AUTO, buf, CTLFLAG_RD, 0, sizeof(struct buf), 194 "sizeof(struct buf)"); 195 196 char *buf_wmesg = BUF_WMESG; 197 198 extern int vm_swap_size; 199 200 #define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */ 201 #define VFS_BIO_NEED_UNUSED02 0x02 202 #define VFS_BIO_NEED_UNUSED04 0x04 203 #define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */ 204 205 /* 206 * bufspacewakeup: 207 * 208 * Called when buffer space is potentially available for recovery. 209 * getnewbuf() will block on this flag when it is unable to free 210 * sufficient buffer space. Buffer space becomes recoverable when 211 * bp's get placed back in the queues. 212 */ 213 214 static __inline void 215 bufspacewakeup(void) 216 { 217 /* 218 * If someone is waiting for BUF space, wake them up. Even 219 * though we haven't freed the kva space yet, the waiting 220 * process will be able to now. 
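	 *
	 * A sketch of the other side of this handshake (see getnewbuf()
	 * later in this file): the waiter does, roughly,
	 *
	 *	needsbuffer |= VFS_BIO_NEED_BUFSPACE;
	 *	tsleep(&needsbuffer, slpflags, "nbufkv", slptimeo);
	 *
	 * so clearing the flag under needsbuffer_spin and then calling
	 * wakeup(&needsbuffer) here is what lets it continue.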
221 */ 222 if (needsbuffer & VFS_BIO_NEED_BUFSPACE) { 223 spin_lock_wr(&needsbuffer_spin); 224 needsbuffer &= ~VFS_BIO_NEED_BUFSPACE; 225 spin_unlock_wr(&needsbuffer_spin); 226 wakeup(&needsbuffer); 227 } 228 } 229 230 /* 231 * runningbufwakeup: 232 * 233 * Accounting for I/O in progress. 234 * 235 */ 236 static __inline void 237 runningbufwakeup(struct buf *bp) 238 { 239 int totalspace; 240 int limit; 241 242 if ((totalspace = bp->b_runningbufspace) != 0) { 243 atomic_subtract_int(&runningbufspace, totalspace); 244 atomic_subtract_int(&runningbufcount, 1); 245 bp->b_runningbufspace = 0; 246 247 /* 248 * see waitrunningbufspace() for limit test. 249 */ 250 limit = hirunningspace * 2 / 3; 251 if (runningbufreq && runningbufspace <= limit) { 252 runningbufreq = 0; 253 wakeup(&runningbufreq); 254 } 255 bd_signal(totalspace); 256 } 257 } 258 259 /* 260 * bufcountwakeup: 261 * 262 * Called when a buffer has been added to one of the free queues to 263 * account for the buffer and to wakeup anyone waiting for free buffers. 264 * This typically occurs when large amounts of metadata are being handled 265 * by the buffer cache ( else buffer space runs out first, usually ). 266 * 267 * MPSAFE 268 */ 269 static __inline void 270 bufcountwakeup(void) 271 { 272 if (needsbuffer) { 273 spin_lock_wr(&needsbuffer_spin); 274 needsbuffer &= ~VFS_BIO_NEED_ANY; 275 spin_unlock_wr(&needsbuffer_spin); 276 wakeup(&needsbuffer); 277 } 278 } 279 280 /* 281 * waitrunningbufspace() 282 * 283 * Wait for the amount of running I/O to drop to hirunningspace * 2 / 3. 284 * This is the point where write bursting stops so we don't want to wait 285 * for the running amount to drop below it (at least if we still want bioq 286 * to burst writes). 287 * 288 * The caller may be using this function to block in a tight loop, we 289 * must block while runningbufspace is greater then or equal to 290 * hirunningspace * 2 / 3. 291 * 292 * And even with that it may not be enough, due to the presence of 293 * B_LOCKED dirty buffers, so also wait for at least one running buffer 294 * to complete. 295 */ 296 static __inline void 297 waitrunningbufspace(void) 298 { 299 int limit = hirunningspace * 2 / 3; 300 301 crit_enter(); 302 if (runningbufspace > limit) { 303 while (runningbufspace > limit) { 304 ++runningbufreq; 305 tsleep(&runningbufreq, 0, "wdrn1", 0); 306 } 307 } else if (runningbufspace) { 308 ++runningbufreq; 309 tsleep(&runningbufreq, 0, "wdrn2", 1); 310 } 311 crit_exit(); 312 } 313 314 /* 315 * buf_dirty_count_severe: 316 * 317 * Return true if we have too many dirty buffers. 318 */ 319 int 320 buf_dirty_count_severe(void) 321 { 322 return (runningbufspace + dirtybufspace >= hidirtybufspace || 323 dirtybufcount >= nbuf / 2); 324 } 325 326 /* 327 * Return true if the amount of running I/O is severe and BIOQ should 328 * start bursting. 329 */ 330 int 331 buf_runningbufspace_severe(void) 332 { 333 return (runningbufspace >= hirunningspace * 2 / 3); 334 } 335 336 /* 337 * vfs_buf_test_cache: 338 * 339 * Called when a buffer is extended. This function clears the B_CACHE 340 * bit if the newly extended portion of the buffer does not contain 341 * valid data. 342 * 343 * NOTE! Dirty VM pages are not processed into dirty (B_DELWRI) buffer 344 * cache buffers. The VM pages remain dirty, as someone had mmap()'d 345 * them while a clean buffer was present. 
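 *
 * Illustrative case (not taken from a particular caller): a buffer
 * with B_CACHE set is grown from 4096 to 8192 bytes.  For the newly
 * added range the caller passes each page with the corresponding
 * (foff + off, size) window; if vm_page_is_valid() reports that
 * window invalid, B_CACHE is cleared and the caller knows the
 * extension must be read or zeroed rather than trusted.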
346 */ 347 static __inline__ 348 void 349 vfs_buf_test_cache(struct buf *bp, 350 vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, 351 vm_page_t m) 352 { 353 if (bp->b_flags & B_CACHE) { 354 int base = (foff + off) & PAGE_MASK; 355 if (vm_page_is_valid(m, base, size) == 0) 356 bp->b_flags &= ~B_CACHE; 357 } 358 } 359 360 /* 361 * bd_speedup() 362 * 363 * Spank the buf_daemon[_hw] if the total dirty buffer space exceeds the 364 * low water mark. 365 * 366 * MPSAFE 367 */ 368 static __inline__ 369 void 370 bd_speedup(void) 371 { 372 if (dirtybufspace < lodirtybufspace && dirtybufcount < nbuf / 2) 373 return; 374 375 if (bd_request == 0 && 376 (dirtybufspace - dirtybufspacehw > lodirtybufspace / 2 || 377 dirtybufcount - dirtybufcounthw >= nbuf / 2)) { 378 spin_lock_wr(&needsbuffer_spin); 379 bd_request = 1; 380 spin_unlock_wr(&needsbuffer_spin); 381 wakeup(&bd_request); 382 } 383 if (bd_request_hw == 0 && 384 (dirtybufspacehw > lodirtybufspace / 2 || 385 dirtybufcounthw >= nbuf / 2)) { 386 spin_lock_wr(&needsbuffer_spin); 387 bd_request_hw = 1; 388 spin_unlock_wr(&needsbuffer_spin); 389 wakeup(&bd_request_hw); 390 } 391 } 392 393 /* 394 * bd_heatup() 395 * 396 * Get the buf_daemon heated up when the number of running and dirty 397 * buffers exceeds the mid-point. 398 * 399 * Return the total number of dirty bytes past the second mid point 400 * as a measure of how much excess dirty data there is in the system. 401 * 402 * MPSAFE 403 */ 404 int 405 bd_heatup(void) 406 { 407 int mid1; 408 int mid2; 409 int totalspace; 410 411 mid1 = lodirtybufspace + (hidirtybufspace - lodirtybufspace) / 2; 412 413 totalspace = runningbufspace + dirtybufspace; 414 if (totalspace >= mid1 || dirtybufcount >= nbuf / 2) { 415 bd_speedup(); 416 mid2 = mid1 + (hidirtybufspace - mid1) / 2; 417 if (totalspace >= mid2) 418 return(totalspace - mid2); 419 } 420 return(0); 421 } 422 423 /* 424 * bd_wait() 425 * 426 * Wait for the buffer cache to flush (totalspace) bytes worth of 427 * buffers, then return. 428 * 429 * Regardless this function blocks while the number of dirty buffers 430 * exceeds hidirtybufspace. 431 * 432 * MPSAFE 433 */ 434 void 435 bd_wait(int totalspace) 436 { 437 u_int i; 438 int count; 439 440 if (curthread == bufdaemonhw_td || curthread == bufdaemon_td) 441 return; 442 443 while (totalspace > 0) { 444 bd_heatup(); 445 if (totalspace > runningbufspace + dirtybufspace) 446 totalspace = runningbufspace + dirtybufspace; 447 count = totalspace / BKVASIZE; 448 if (count >= BD_WAKE_SIZE) 449 count = BD_WAKE_SIZE - 1; 450 451 spin_lock_wr(&needsbuffer_spin); 452 i = (bd_wake_index + count) & BD_WAKE_MASK; 453 ++bd_wake_ary[i]; 454 tsleep_interlock(&bd_wake_ary[i], 0); 455 spin_unlock_wr(&needsbuffer_spin); 456 tsleep(&bd_wake_ary[i], PINTERLOCKED, "flstik", hz); 457 458 totalspace = runningbufspace + dirtybufspace - hidirtybufspace; 459 } 460 } 461 462 /* 463 * bd_signal() 464 * 465 * This function is called whenever runningbufspace or dirtybufspace 466 * is reduced. Track threads waiting for run+dirty buffer I/O 467 * complete. 
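 *
 * Rough sketch of the mechanism (see bd_wait() above and the loop
 * below): a waiter in bd_wait() parks itself (totalspace / BKVASIZE)
 * slots ahead of bd_wake_index in bd_wake_ary[] and sleeps on that
 * slot.  Each bd_signal() call then advances bd_wake_index one slot
 * per BKVASIZE bytes completed, waking any sleepers it passes, so a
 * waiter is only woken once roughly the amount of I/O it asked for
 * has actually been retired.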
468 * 469 * MPSAFE 470 */ 471 static void 472 bd_signal(int totalspace) 473 { 474 u_int i; 475 476 if (totalspace > 0) { 477 if (totalspace > BKVASIZE * BD_WAKE_SIZE) 478 totalspace = BKVASIZE * BD_WAKE_SIZE; 479 spin_lock_wr(&needsbuffer_spin); 480 while (totalspace > 0) { 481 i = bd_wake_index++; 482 i &= BD_WAKE_MASK; 483 if (bd_wake_ary[i]) { 484 bd_wake_ary[i] = 0; 485 spin_unlock_wr(&needsbuffer_spin); 486 wakeup(&bd_wake_ary[i]); 487 spin_lock_wr(&needsbuffer_spin); 488 } 489 totalspace -= BKVASIZE; 490 } 491 spin_unlock_wr(&needsbuffer_spin); 492 } 493 } 494 495 /* 496 * BIO tracking support routines. 497 * 498 * Release a ref on a bio_track. Wakeup requests are atomically released 499 * along with the last reference so bk_active will never wind up set to 500 * only 0x80000000. 501 * 502 * MPSAFE 503 */ 504 static 505 void 506 bio_track_rel(struct bio_track *track) 507 { 508 int active; 509 int desired; 510 511 /* 512 * Shortcut 513 */ 514 active = track->bk_active; 515 if (active == 1 && atomic_cmpset_int(&track->bk_active, 1, 0)) 516 return; 517 518 /* 519 * Full-on. Note that the wait flag is only atomically released on 520 * the 1->0 count transition. 521 * 522 * We check for a negative count transition using bit 30 since bit 31 523 * has a different meaning. 524 */ 525 for (;;) { 526 desired = (active & 0x7FFFFFFF) - 1; 527 if (desired) 528 desired |= active & 0x80000000; 529 if (atomic_cmpset_int(&track->bk_active, active, desired)) { 530 if (desired & 0x40000000) 531 panic("bio_track_rel: bad count: %p\n", track); 532 if (active & 0x80000000) 533 wakeup(track); 534 break; 535 } 536 active = track->bk_active; 537 } 538 } 539 540 /* 541 * Wait for the tracking count to reach 0. 542 * 543 * Use atomic ops such that the wait flag is only set atomically when 544 * bk_active is non-zero. 545 * 546 * MPSAFE 547 */ 548 int 549 bio_track_wait(struct bio_track *track, int slp_flags, int slp_timo) 550 { 551 int active; 552 int desired; 553 int error; 554 555 /* 556 * Shortcut 557 */ 558 if (track->bk_active == 0) 559 return(0); 560 561 /* 562 * Full-on. Note that the wait flag may only be atomically set if 563 * the active count is non-zero. 564 */ 565 error = 0; 566 while ((active = track->bk_active) != 0) { 567 desired = active | 0x80000000; 568 tsleep_interlock(track, slp_flags); 569 if (active == desired || 570 atomic_cmpset_int(&track->bk_active, active, desired)) { 571 error = tsleep(track, slp_flags | PINTERLOCKED, 572 "iowait", slp_timo); 573 if (error) 574 break; 575 } 576 } 577 return (error); 578 } 579 580 /* 581 * bufinit: 582 * 583 * Load time initialisation of the buffer cache, called from machine 584 * dependant initialization code. 585 */ 586 void 587 bufinit(void) 588 { 589 struct buf *bp; 590 vm_offset_t bogus_offset; 591 int i; 592 593 spin_init(&needsbuffer_spin); 594 595 /* next, make a null set of free lists */ 596 for (i = 0; i < BUFFER_QUEUES; i++) 597 TAILQ_INIT(&bufqueues[i]); 598 599 /* finally, initialize each buffer header and stick on empty q */ 600 for (i = 0; i < nbuf; i++) { 601 bp = &buf[i]; 602 bzero(bp, sizeof *bp); 603 bp->b_flags = B_INVAL; /* we're just an empty header */ 604 bp->b_cmd = BUF_CMD_DONE; 605 bp->b_qindex = BQUEUE_EMPTY; 606 initbufbio(bp); 607 xio_init(&bp->b_xio); 608 buf_dep_init(bp); 609 BUF_LOCKINIT(bp); 610 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_EMPTY], bp, b_freelist); 611 } 612 613 /* 614 * maxbufspace is the absolute maximum amount of buffer space we are 615 * allowed to reserve in KVM and in real terms. 
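	 *
	 * (For orientation, the watermarks computed below relate roughly
	 * as follows: maxbufspace = nbuf * BKVASIZE; hibufspace sits a
	 * little below it; lobufspace is one MAXBSIZE below hibufspace;
	 * hidirtybufspace is half of hibufspace, lodirtybufspace half of
	 * that again; and hirunningspace is hibufspace / 16 with a 1MB
	 * floor.)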
The absolute maximum 616 * is nominally used by buf_daemon. hibufspace is the nominal maximum 617 * used by most other processes. The differential is required to 618 * ensure that buf_daemon is able to run when other processes might 619 * be blocked waiting for buffer space. 620 * 621 * maxbufspace is based on BKVASIZE. Allocating buffers larger then 622 * this may result in KVM fragmentation which is not handled optimally 623 * by the system. 624 */ 625 maxbufspace = nbuf * BKVASIZE; 626 hibufspace = imax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10); 627 lobufspace = hibufspace - MAXBSIZE; 628 629 lorunningspace = 512 * 1024; 630 /* hirunningspace -- see below */ 631 632 /* 633 * Limit the amount of malloc memory since it is wired permanently 634 * into the kernel space. Even though this is accounted for in 635 * the buffer allocation, we don't want the malloced region to grow 636 * uncontrolled. The malloc scheme improves memory utilization 637 * significantly on average (small) directories. 638 */ 639 maxbufmallocspace = hibufspace / 20; 640 641 /* 642 * Reduce the chance of a deadlock occuring by limiting the number 643 * of delayed-write dirty buffers we allow to stack up. 644 * 645 * We don't want too much actually queued to the device at once 646 * (XXX this needs to be per-mount!), because the buffers will 647 * wind up locked for a very long period of time while the I/O 648 * drains. 649 */ 650 hidirtybufspace = hibufspace / 2; /* dirty + running */ 651 hirunningspace = hibufspace / 16; /* locked & queued to device */ 652 if (hirunningspace < 1024 * 1024) 653 hirunningspace = 1024 * 1024; 654 655 dirtybufspace = 0; 656 dirtybufspacehw = 0; 657 658 lodirtybufspace = hidirtybufspace / 2; 659 660 /* 661 * Maximum number of async ops initiated per buf_daemon loop. This is 662 * somewhat of a hack at the moment, we really need to limit ourselves 663 * based on the number of bytes of I/O in-transit that were initiated 664 * from buf_daemon. 665 */ 666 667 bogus_offset = kmem_alloc_pageable(&kernel_map, PAGE_SIZE); 668 bogus_page = vm_page_alloc(&kernel_object, 669 (bogus_offset >> PAGE_SHIFT), 670 VM_ALLOC_NORMAL); 671 vmstats.v_wire_count++; 672 673 } 674 675 /* 676 * Initialize the embedded bio structures 677 */ 678 void 679 initbufbio(struct buf *bp) 680 { 681 bp->b_bio1.bio_buf = bp; 682 bp->b_bio1.bio_prev = NULL; 683 bp->b_bio1.bio_offset = NOOFFSET; 684 bp->b_bio1.bio_next = &bp->b_bio2; 685 bp->b_bio1.bio_done = NULL; 686 bp->b_bio1.bio_flags = 0; 687 688 bp->b_bio2.bio_buf = bp; 689 bp->b_bio2.bio_prev = &bp->b_bio1; 690 bp->b_bio2.bio_offset = NOOFFSET; 691 bp->b_bio2.bio_next = NULL; 692 bp->b_bio2.bio_done = NULL; 693 bp->b_bio2.bio_flags = 0; 694 } 695 696 /* 697 * Reinitialize the embedded bio structures as well as any additional 698 * translation cache layers. 699 */ 700 void 701 reinitbufbio(struct buf *bp) 702 { 703 struct bio *bio; 704 705 for (bio = &bp->b_bio1; bio; bio = bio->bio_next) { 706 bio->bio_done = NULL; 707 bio->bio_offset = NOOFFSET; 708 } 709 } 710 711 /* 712 * Push another BIO layer onto an existing BIO and return it. The new 713 * BIO layer may already exist, holding cached translation data. 
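 *
 * A minimal usage sketch (illustrative only; 'translated_offset',
 * 'mylayer_done' and 'devvp' are made-up names): a layer that remaps
 * offsets before forwarding to an underlying device might do
 *
 *	nbio = push_bio(bio);
 *	nbio->bio_offset = translated_offset;
 *	nbio->bio_done = mylayer_done;
 *	vn_strategy(devvp, nbio);
 *
 * and use pop_bio() (or bio_prev) from its done routine to hand
 * completion back to the caller's BIO.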
714 */ 715 struct bio * 716 push_bio(struct bio *bio) 717 { 718 struct bio *nbio; 719 720 if ((nbio = bio->bio_next) == NULL) { 721 int index = bio - &bio->bio_buf->b_bio_array[0]; 722 if (index >= NBUF_BIO - 1) { 723 panic("push_bio: too many layers bp %p\n", 724 bio->bio_buf); 725 } 726 nbio = &bio->bio_buf->b_bio_array[index + 1]; 727 bio->bio_next = nbio; 728 nbio->bio_prev = bio; 729 nbio->bio_buf = bio->bio_buf; 730 nbio->bio_offset = NOOFFSET; 731 nbio->bio_done = NULL; 732 nbio->bio_next = NULL; 733 } 734 KKASSERT(nbio->bio_done == NULL); 735 return(nbio); 736 } 737 738 /* 739 * Pop a BIO translation layer, returning the previous layer. The 740 * must have been previously pushed. 741 */ 742 struct bio * 743 pop_bio(struct bio *bio) 744 { 745 return(bio->bio_prev); 746 } 747 748 void 749 clearbiocache(struct bio *bio) 750 { 751 while (bio) { 752 bio->bio_offset = NOOFFSET; 753 bio = bio->bio_next; 754 } 755 } 756 757 /* 758 * bfreekva: 759 * 760 * Free the KVA allocation for buffer 'bp'. 761 * 762 * Must be called from a critical section as this is the only locking for 763 * buffer_map. 764 * 765 * Since this call frees up buffer space, we call bufspacewakeup(). 766 * 767 * MPALMOSTSAFE 768 */ 769 static void 770 bfreekva(struct buf *bp) 771 { 772 int count; 773 774 if (bp->b_kvasize) { 775 get_mplock(); 776 ++buffreekvacnt; 777 count = vm_map_entry_reserve(MAP_RESERVE_COUNT); 778 vm_map_lock(&buffer_map); 779 bufspace -= bp->b_kvasize; 780 vm_map_delete(&buffer_map, 781 (vm_offset_t) bp->b_kvabase, 782 (vm_offset_t) bp->b_kvabase + bp->b_kvasize, 783 &count 784 ); 785 vm_map_unlock(&buffer_map); 786 vm_map_entry_release(count); 787 bp->b_kvasize = 0; 788 bufspacewakeup(); 789 rel_mplock(); 790 } 791 } 792 793 /* 794 * bremfree: 795 * 796 * Remove the buffer from the appropriate free list. 797 */ 798 static __inline void 799 _bremfree(struct buf *bp) 800 { 801 if (bp->b_qindex != BQUEUE_NONE) { 802 KASSERT(BUF_REFCNTNB(bp) == 1, 803 ("bremfree: bp %p not locked",bp)); 804 TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); 805 bp->b_qindex = BQUEUE_NONE; 806 } else { 807 if (BUF_REFCNTNB(bp) <= 1) 808 panic("bremfree: removing a buffer not on a queue"); 809 } 810 } 811 812 void 813 bremfree(struct buf *bp) 814 { 815 spin_lock_wr(&bufspin); 816 _bremfree(bp); 817 spin_unlock_wr(&bufspin); 818 } 819 820 static void 821 bremfree_locked(struct buf *bp) 822 { 823 _bremfree(bp); 824 } 825 826 /* 827 * bread: 828 * 829 * Get a buffer with the specified data. Look in the cache first. We 830 * must clear B_ERROR and B_INVAL prior to initiating I/O. If B_CACHE 831 * is set, the buffer is valid and we do not have to do anything ( see 832 * getblk() ). 833 * 834 * MPALMOSTSAFE 835 */ 836 int 837 bread(struct vnode *vp, off_t loffset, int size, struct buf **bpp) 838 { 839 struct buf *bp; 840 841 bp = getblk(vp, loffset, size, 0, 0); 842 *bpp = bp; 843 844 /* if not found in cache, do some I/O */ 845 if ((bp->b_flags & B_CACHE) == 0) { 846 get_mplock(); 847 bp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL); 848 bp->b_cmd = BUF_CMD_READ; 849 bp->b_bio1.bio_done = biodone_sync; 850 bp->b_bio1.bio_flags |= BIO_SYNC; 851 vfs_busy_pages(vp, bp); 852 vn_strategy(vp, &bp->b_bio1); 853 rel_mplock(); 854 return (biowait(&bp->b_bio1, "biord")); 855 } 856 return (0); 857 } 858 859 /* 860 * breadn: 861 * 862 * Operates like bread, but also starts asynchronous I/O on 863 * read-ahead blocks. We must clear B_ERROR and B_INVAL prior 864 * to initiating I/O . 
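 *
 * (Typical calling pattern, shown for orientation rather than taken
 * from this file: the caller always gets a buffer back and must
 * release it on error, e.g.
 *
 *	error = bread(vp, loffset, size, &bp);
 *	if (error) {
 *		brelse(bp);
 *		return (error);
 *	}
 *
 * and on success uses bp->b_data, finishing with bqrelse(), brelse()
 * or one of the write paths below.)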
If B_CACHE is set, the buffer is valid 865 * and we do not have to do anything. 866 * 867 * MPALMOSTSAFE 868 */ 869 int 870 breadn(struct vnode *vp, off_t loffset, int size, off_t *raoffset, 871 int *rabsize, int cnt, struct buf **bpp) 872 { 873 struct buf *bp, *rabp; 874 int i; 875 int rv = 0, readwait = 0; 876 877 *bpp = bp = getblk(vp, loffset, size, 0, 0); 878 879 /* if not found in cache, do some I/O */ 880 if ((bp->b_flags & B_CACHE) == 0) { 881 get_mplock(); 882 bp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL); 883 bp->b_cmd = BUF_CMD_READ; 884 bp->b_bio1.bio_done = biodone_sync; 885 bp->b_bio1.bio_flags |= BIO_SYNC; 886 vfs_busy_pages(vp, bp); 887 vn_strategy(vp, &bp->b_bio1); 888 ++readwait; 889 rel_mplock(); 890 } 891 892 for (i = 0; i < cnt; i++, raoffset++, rabsize++) { 893 if (inmem(vp, *raoffset)) 894 continue; 895 rabp = getblk(vp, *raoffset, *rabsize, 0, 0); 896 897 if ((rabp->b_flags & B_CACHE) == 0) { 898 get_mplock(); 899 rabp->b_flags &= ~(B_ERROR | B_EINTR | B_INVAL); 900 rabp->b_cmd = BUF_CMD_READ; 901 vfs_busy_pages(vp, rabp); 902 BUF_KERNPROC(rabp); 903 vn_strategy(vp, &rabp->b_bio1); 904 rel_mplock(); 905 } else { 906 brelse(rabp); 907 } 908 } 909 if (readwait) 910 rv = biowait(&bp->b_bio1, "biord"); 911 return (rv); 912 } 913 914 /* 915 * bwrite: 916 * 917 * Synchronous write, waits for completion. 918 * 919 * Write, release buffer on completion. (Done by iodone 920 * if async). Do not bother writing anything if the buffer 921 * is invalid. 922 * 923 * Note that we set B_CACHE here, indicating that buffer is 924 * fully valid and thus cacheable. This is true even of NFS 925 * now so we set it generally. This could be set either here 926 * or in biodone() since the I/O is synchronous. We put it 927 * here. 928 */ 929 int 930 bwrite(struct buf *bp) 931 { 932 int error; 933 934 if (bp->b_flags & B_INVAL) { 935 brelse(bp); 936 return (0); 937 } 938 if (BUF_REFCNTNB(bp) == 0) 939 panic("bwrite: buffer is not busy???"); 940 941 /* Mark the buffer clean */ 942 bundirty(bp); 943 944 bp->b_flags &= ~(B_ERROR | B_EINTR); 945 bp->b_flags |= B_CACHE; 946 bp->b_cmd = BUF_CMD_WRITE; 947 bp->b_bio1.bio_done = biodone_sync; 948 bp->b_bio1.bio_flags |= BIO_SYNC; 949 vfs_busy_pages(bp->b_vp, bp); 950 951 /* 952 * Normal bwrites pipeline writes. NOTE: b_bufsize is only 953 * valid for vnode-backed buffers. 954 */ 955 bp->b_runningbufspace = bp->b_bufsize; 956 if (bp->b_runningbufspace) { 957 runningbufspace += bp->b_runningbufspace; 958 ++runningbufcount; 959 } 960 961 vn_strategy(bp->b_vp, &bp->b_bio1); 962 error = biowait(&bp->b_bio1, "biows"); 963 brelse(bp); 964 return (error); 965 } 966 967 /* 968 * bawrite: 969 * 970 * Asynchronous write. Start output on a buffer, but do not wait for 971 * it to complete. The buffer is released when the output completes. 972 * 973 * bwrite() ( or the VOP routine anyway ) is responsible for handling 974 * B_INVAL buffers. Not us. 975 */ 976 void 977 bawrite(struct buf *bp) 978 { 979 if (bp->b_flags & B_INVAL) { 980 brelse(bp); 981 return; 982 } 983 if (BUF_REFCNTNB(bp) == 0) 984 panic("bwrite: buffer is not busy???"); 985 986 /* Mark the buffer clean */ 987 bundirty(bp); 988 989 bp->b_flags &= ~(B_ERROR | B_EINTR); 990 bp->b_flags |= B_CACHE; 991 bp->b_cmd = BUF_CMD_WRITE; 992 KKASSERT(bp->b_bio1.bio_done == NULL); 993 vfs_busy_pages(bp->b_vp, bp); 994 995 /* 996 * Normal bwrites pipeline writes. NOTE: b_bufsize is only 997 * valid for vnode-backed buffers. 
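	 *
	 * (For orientation: b_bufsize is charged to runningbufspace here
	 * and credited back by runningbufwakeup() when the write
	 * completes; that is the quantity waitrunningbufspace() and
	 * bd_wait() throttle on.)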
	 */
	bp->b_runningbufspace = bp->b_bufsize;
	if (bp->b_runningbufspace) {
		runningbufspace += bp->b_runningbufspace;
		++runningbufcount;
	}

	BUF_KERNPROC(bp);
	vn_strategy(bp->b_vp, &bp->b_bio1);
}

/*
 * bowrite:
 *
 *	Ordered write.  Start output on a buffer, and flag it so that the
 *	device will write it in the order it was queued.  The buffer is
 *	released when the output completes.  bwrite() ( or the VOP routine
 *	anyway ) is responsible for handling B_INVAL buffers.
 */
int
bowrite(struct buf *bp)
{
	bp->b_flags |= B_ORDERED;
	bawrite(bp);
	return (0);
}

/*
 * bdwrite:
 *
 *	Delayed write. (Buffer is marked dirty).  Do not bother writing
 *	anything if the buffer is marked invalid.
 *
 *	Note that since the buffer must be completely valid, we can safely
 *	set B_CACHE.  In fact, we have to set B_CACHE here rather than in
 *	biodone() in order to prevent getblk from writing the buffer
 *	out synchronously.
 */
void
bdwrite(struct buf *bp)
{
	if (BUF_REFCNTNB(bp) == 0)
		panic("bdwrite: buffer is not busy");

	if (bp->b_flags & B_INVAL) {
		brelse(bp);
		return;
	}
	bdirty(bp);

	/*
	 * Set B_CACHE, indicating that the buffer is fully valid.  This is
	 * true even of NFS now.
	 */
	bp->b_flags |= B_CACHE;

	/*
	 * This bmap keeps the system from needing to do the bmap later,
	 * perhaps when the system is attempting to do a sync.  Since it
	 * is likely that the indirect block -- or whatever other data
	 * structure the filesystem needs -- is still in memory now, it is
	 * a good thing to do this.  Note also, that if the pageout daemon
	 * is requesting a sync -- there might not be enough memory to do
	 * the bmap then...  So, this is important to do.
	 */
	if (bp->b_bio2.bio_offset == NOOFFSET) {
		VOP_BMAP(bp->b_vp, bp->b_loffset, &bp->b_bio2.bio_offset,
			 NULL, NULL, BUF_CMD_WRITE);
	}

	/*
	 * Because the underlying pages may still be mapped and
	 * writable, trying to set the dirty buffer (b_dirtyoff/end)
	 * range here will be inaccurate.
	 *
	 * However, we must still clean the pages to satisfy the
	 * vnode_pager and pageout daemon, so they think the pages
	 * have been "cleaned".  What has really occurred is that
	 * they've been earmarked for later writing by the buffer
	 * cache.
	 *
	 * So we get the b_dirtyoff/end update but will not actually
	 * depend on it (NFS that is) until the pages are busied for
	 * writing later on.
	 */
	vfs_clean_pages(bp);
	bqrelse(bp);

	/*
	 * note: we cannot initiate I/O from a bdwrite even if we wanted to,
	 * due to the softdep code.
	 */
}

/*
 * bdirty:
 *
 *	Turn buffer into delayed write request by marking it B_DELWRI.
 *	B_RELBUF and B_NOCACHE must be cleared.
 *
 *	We reassign the buffer to itself to properly update it in the
 *	dirty/clean lists.
 *
 *	Must be called from a critical section.
 *	The buffer must be on BQUEUE_NONE.
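 *
 *	(Typical path, for orientation: a filesystem dirties a buffer it
 *	obtained via getblk()/bread() and calls bdwrite(), which runs
 *	bdirty() and then bqrelse(); the data reaches the media later
 *	when the buf_daemon flushes the dirty queues or when an explicit
 *	bwrite()/fsync-style flush picks the buffer up.)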
1103 */ 1104 void 1105 bdirty(struct buf *bp) 1106 { 1107 KASSERT(bp->b_qindex == BQUEUE_NONE, ("bdirty: buffer %p still on queue %d", bp, bp->b_qindex)); 1108 if (bp->b_flags & B_NOCACHE) { 1109 kprintf("bdirty: clearing B_NOCACHE on buf %p\n", bp); 1110 bp->b_flags &= ~B_NOCACHE; 1111 } 1112 if (bp->b_flags & B_INVAL) { 1113 kprintf("bdirty: warning, dirtying invalid buffer %p\n", bp); 1114 } 1115 bp->b_flags &= ~B_RELBUF; 1116 1117 if ((bp->b_flags & B_DELWRI) == 0) { 1118 bp->b_flags |= B_DELWRI; 1119 reassignbuf(bp); 1120 atomic_add_int(&dirtybufcount, 1); 1121 dirtybufspace += bp->b_bufsize; 1122 if (bp->b_flags & B_HEAVY) { 1123 atomic_add_int(&dirtybufcounthw, 1); 1124 atomic_add_int(&dirtybufspacehw, bp->b_bufsize); 1125 } 1126 bd_heatup(); 1127 } 1128 } 1129 1130 /* 1131 * Set B_HEAVY, indicating that this is a heavy-weight buffer that 1132 * needs to be flushed with a different buf_daemon thread to avoid 1133 * deadlocks. B_HEAVY also imposes restrictions in getnewbuf(). 1134 */ 1135 void 1136 bheavy(struct buf *bp) 1137 { 1138 if ((bp->b_flags & B_HEAVY) == 0) { 1139 bp->b_flags |= B_HEAVY; 1140 if (bp->b_flags & B_DELWRI) { 1141 atomic_add_int(&dirtybufcounthw, 1); 1142 atomic_add_int(&dirtybufspacehw, bp->b_bufsize); 1143 } 1144 } 1145 } 1146 1147 /* 1148 * bundirty: 1149 * 1150 * Clear B_DELWRI for buffer. 1151 * 1152 * Must be called from a critical section. 1153 * 1154 * The buffer is typically on BQUEUE_NONE but there is one case in 1155 * brelse() that calls this function after placing the buffer on 1156 * a different queue. 1157 * 1158 * MPSAFE 1159 */ 1160 void 1161 bundirty(struct buf *bp) 1162 { 1163 if (bp->b_flags & B_DELWRI) { 1164 bp->b_flags &= ~B_DELWRI; 1165 reassignbuf(bp); 1166 atomic_subtract_int(&dirtybufcount, 1); 1167 atomic_subtract_int(&dirtybufspace, bp->b_bufsize); 1168 if (bp->b_flags & B_HEAVY) { 1169 atomic_subtract_int(&dirtybufcounthw, 1); 1170 atomic_subtract_int(&dirtybufspacehw, bp->b_bufsize); 1171 } 1172 bd_signal(bp->b_bufsize); 1173 } 1174 /* 1175 * Since it is now being written, we can clear its deferred write flag. 1176 */ 1177 bp->b_flags &= ~B_DEFERRED; 1178 } 1179 1180 /* 1181 * brelse: 1182 * 1183 * Release a busy buffer and, if requested, free its resources. The 1184 * buffer will be stashed in the appropriate bufqueue[] allowing it 1185 * to be accessed later as a cache entity or reused for other purposes. 1186 * 1187 * MPALMOSTSAFE 1188 */ 1189 void 1190 brelse(struct buf *bp) 1191 { 1192 #ifdef INVARIANTS 1193 int saved_flags = bp->b_flags; 1194 #endif 1195 1196 KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)), ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp)); 1197 1198 /* 1199 * If B_NOCACHE is set we are being asked to destroy the buffer and 1200 * its backing store. Clear B_DELWRI. 1201 * 1202 * B_NOCACHE is set in two cases: (1) when the caller really wants 1203 * to destroy the buffer and backing store and (2) when the caller 1204 * wants to destroy the buffer and backing store after a write 1205 * completes. 1206 */ 1207 if ((bp->b_flags & (B_NOCACHE|B_DELWRI)) == (B_NOCACHE|B_DELWRI)) { 1208 bundirty(bp); 1209 } 1210 1211 if ((bp->b_flags & (B_INVAL | B_DELWRI)) == B_DELWRI) { 1212 /* 1213 * A re-dirtied buffer is only subject to destruction 1214 * by B_INVAL. B_ERROR and B_NOCACHE are ignored. 
1215 */ 1216 /* leave buffer intact */ 1217 } else if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) || 1218 (bp->b_bufsize <= 0)) { 1219 /* 1220 * Either a failed read or we were asked to free or not 1221 * cache the buffer. This path is reached with B_DELWRI 1222 * set only if B_INVAL is already set. B_NOCACHE governs 1223 * backing store destruction. 1224 * 1225 * NOTE: HAMMER will set B_LOCKED in buf_deallocate if the 1226 * buffer cannot be immediately freed. 1227 */ 1228 bp->b_flags |= B_INVAL; 1229 if (LIST_FIRST(&bp->b_dep) != NULL) { 1230 get_mplock(); 1231 buf_deallocate(bp); 1232 rel_mplock(); 1233 } 1234 if (bp->b_flags & B_DELWRI) { 1235 atomic_subtract_int(&dirtybufcount, 1); 1236 atomic_subtract_int(&dirtybufspace, bp->b_bufsize); 1237 if (bp->b_flags & B_HEAVY) { 1238 atomic_subtract_int(&dirtybufcounthw, 1); 1239 atomic_subtract_int(&dirtybufspacehw, bp->b_bufsize); 1240 } 1241 bd_signal(bp->b_bufsize); 1242 } 1243 bp->b_flags &= ~(B_DELWRI | B_CACHE); 1244 } 1245 1246 /* 1247 * We must clear B_RELBUF if B_DELWRI or B_LOCKED is set. 1248 * If vfs_vmio_release() is called with either bit set, the 1249 * underlying pages may wind up getting freed causing a previous 1250 * write (bdwrite()) to get 'lost' because pages associated with 1251 * a B_DELWRI bp are marked clean. Pages associated with a 1252 * B_LOCKED buffer may be mapped by the filesystem. 1253 * 1254 * If we want to release the buffer ourselves (rather then the 1255 * originator asking us to release it), give the originator a 1256 * chance to countermand the release by setting B_LOCKED. 1257 * 1258 * We still allow the B_INVAL case to call vfs_vmio_release(), even 1259 * if B_DELWRI is set. 1260 * 1261 * If B_DELWRI is not set we may have to set B_RELBUF if we are low 1262 * on pages to return pages to the VM page queues. 1263 */ 1264 if (bp->b_flags & (B_DELWRI | B_LOCKED)) { 1265 bp->b_flags &= ~B_RELBUF; 1266 } else if (vm_page_count_severe()) { 1267 if (LIST_FIRST(&bp->b_dep) != NULL) { 1268 get_mplock(); 1269 buf_deallocate(bp); /* can set B_LOCKED */ 1270 rel_mplock(); 1271 } 1272 if (bp->b_flags & (B_DELWRI | B_LOCKED)) 1273 bp->b_flags &= ~B_RELBUF; 1274 else 1275 bp->b_flags |= B_RELBUF; 1276 } 1277 1278 /* 1279 * Make sure b_cmd is clear. It may have already been cleared by 1280 * biodone(). 1281 * 1282 * At this point destroying the buffer is governed by the B_INVAL 1283 * or B_RELBUF flags. 1284 */ 1285 bp->b_cmd = BUF_CMD_DONE; 1286 1287 /* 1288 * VMIO buffer rundown. Make sure the VM page array is restored 1289 * after an I/O may have replaces some of the pages with bogus pages 1290 * in order to not destroy dirty pages in a fill-in read. 1291 * 1292 * Note that due to the code above, if a buffer is marked B_DELWRI 1293 * then the B_RELBUF and B_NOCACHE bits will always be clear. 1294 * B_INVAL may still be set, however. 1295 * 1296 * For clean buffers, B_INVAL or B_RELBUF will destroy the buffer 1297 * but not the backing store. B_NOCACHE will destroy the backing 1298 * store. 1299 * 1300 * Note that dirty NFS buffers contain byte-granular write ranges 1301 * and should not be destroyed w/ B_INVAL even if the backing store 1302 * is left intact. 1303 */ 1304 if (bp->b_flags & B_VMIO) { 1305 /* 1306 * Rundown for VMIO buffers which are not dirty NFS buffers. 1307 */ 1308 int i, j, resid; 1309 vm_page_t m; 1310 off_t foff; 1311 vm_pindex_t poff; 1312 vm_object_t obj; 1313 struct vnode *vp; 1314 1315 vp = bp->b_vp; 1316 1317 /* 1318 * Get the base offset and length of the buffer. 
Note that 1319 * in the VMIO case if the buffer block size is not 1320 * page-aligned then b_data pointer may not be page-aligned. 1321 * But our b_xio.xio_pages array *IS* page aligned. 1322 * 1323 * block sizes less then DEV_BSIZE (usually 512) are not 1324 * supported due to the page granularity bits (m->valid, 1325 * m->dirty, etc...). 1326 * 1327 * See man buf(9) for more information 1328 */ 1329 1330 resid = bp->b_bufsize; 1331 foff = bp->b_loffset; 1332 1333 get_mplock(); 1334 for (i = 0; i < bp->b_xio.xio_npages; i++) { 1335 m = bp->b_xio.xio_pages[i]; 1336 vm_page_flag_clear(m, PG_ZERO); 1337 /* 1338 * If we hit a bogus page, fixup *all* of them 1339 * now. Note that we left these pages wired 1340 * when we removed them so they had better exist, 1341 * and they cannot be ripped out from under us so 1342 * no critical section protection is necessary. 1343 */ 1344 if (m == bogus_page) { 1345 obj = vp->v_object; 1346 poff = OFF_TO_IDX(bp->b_loffset); 1347 1348 for (j = i; j < bp->b_xio.xio_npages; j++) { 1349 vm_page_t mtmp; 1350 1351 mtmp = bp->b_xio.xio_pages[j]; 1352 if (mtmp == bogus_page) { 1353 mtmp = vm_page_lookup(obj, poff + j); 1354 if (!mtmp) { 1355 panic("brelse: page missing"); 1356 } 1357 bp->b_xio.xio_pages[j] = mtmp; 1358 } 1359 } 1360 1361 if ((bp->b_flags & B_INVAL) == 0) { 1362 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 1363 bp->b_xio.xio_pages, bp->b_xio.xio_npages); 1364 } 1365 m = bp->b_xio.xio_pages[i]; 1366 } 1367 1368 /* 1369 * Invalidate the backing store if B_NOCACHE is set 1370 * (e.g. used with vinvalbuf()). If this is NFS 1371 * we impose a requirement that the block size be 1372 * a multiple of PAGE_SIZE and create a temporary 1373 * hack to basically invalidate the whole page. The 1374 * problem is that NFS uses really odd buffer sizes 1375 * especially when tracking piecemeal writes and 1376 * it also vinvalbuf()'s a lot, which would result 1377 * in only partial page validation and invalidation 1378 * here. If the file page is mmap()'d, however, 1379 * all the valid bits get set so after we invalidate 1380 * here we would end up with weird m->valid values 1381 * like 0xfc. nfs_getpages() can't handle this so 1382 * we clear all the valid bits for the NFS case 1383 * instead of just some of them. 1384 * 1385 * The real bug is the VM system having to set m->valid 1386 * to VM_PAGE_BITS_ALL for faulted-in pages, which 1387 * itself is an artifact of the whole 512-byte 1388 * granular mess that exists to support odd block 1389 * sizes and UFS meta-data block sizes (e.g. 6144). 1390 * A complete rewrite is required. 1391 * 1392 * XXX 1393 */ 1394 if (bp->b_flags & (B_NOCACHE|B_ERROR)) { 1395 int poffset = foff & PAGE_MASK; 1396 int presid; 1397 1398 presid = PAGE_SIZE - poffset; 1399 if (bp->b_vp->v_tag == VT_NFS && 1400 bp->b_vp->v_type == VREG) { 1401 ; /* entire page */ 1402 } else if (presid > resid) { 1403 presid = resid; 1404 } 1405 KASSERT(presid >= 0, ("brelse: extra page")); 1406 vm_page_set_invalid(m, poffset, presid); 1407 } 1408 resid -= PAGE_SIZE - (foff & PAGE_MASK); 1409 foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; 1410 } 1411 if (bp->b_flags & (B_INVAL | B_RELBUF)) 1412 vfs_vmio_release(bp); 1413 rel_mplock(); 1414 } else { 1415 /* 1416 * Rundown for non-VMIO buffers. 
1417 */ 1418 if (bp->b_flags & (B_INVAL | B_RELBUF)) { 1419 get_mplock(); 1420 if (bp->b_bufsize) 1421 allocbuf(bp, 0); 1422 KKASSERT (LIST_FIRST(&bp->b_dep) == NULL); 1423 if (bp->b_vp) 1424 brelvp(bp); 1425 rel_mplock(); 1426 } 1427 } 1428 1429 if (bp->b_qindex != BQUEUE_NONE) 1430 panic("brelse: free buffer onto another queue???"); 1431 if (BUF_REFCNTNB(bp) > 1) { 1432 /* Temporary panic to verify exclusive locking */ 1433 /* This panic goes away when we allow shared refs */ 1434 panic("brelse: multiple refs"); 1435 /* NOT REACHED */ 1436 return; 1437 } 1438 1439 /* 1440 * Figure out the correct queue to place the cleaned up buffer on. 1441 * Buffers placed in the EMPTY or EMPTYKVA had better already be 1442 * disassociated from their vnode. 1443 */ 1444 spin_lock_wr(&bufspin); 1445 if (bp->b_flags & B_LOCKED) { 1446 /* 1447 * Buffers that are locked are placed in the locked queue 1448 * immediately, regardless of their state. 1449 */ 1450 bp->b_qindex = BQUEUE_LOCKED; 1451 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_LOCKED], bp, b_freelist); 1452 } else if (bp->b_bufsize == 0) { 1453 /* 1454 * Buffers with no memory. Due to conditionals near the top 1455 * of brelse() such buffers should probably already be 1456 * marked B_INVAL and disassociated from their vnode. 1457 */ 1458 bp->b_flags |= B_INVAL; 1459 KASSERT(bp->b_vp == NULL, ("bp1 %p flags %08x/%08x vnode %p unexpectededly still associated!", bp, saved_flags, bp->b_flags, bp->b_vp)); 1460 KKASSERT((bp->b_flags & B_HASHED) == 0); 1461 if (bp->b_kvasize) { 1462 bp->b_qindex = BQUEUE_EMPTYKVA; 1463 } else { 1464 bp->b_qindex = BQUEUE_EMPTY; 1465 } 1466 TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist); 1467 } else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF)) { 1468 /* 1469 * Buffers with junk contents. Again these buffers had better 1470 * already be disassociated from their vnode. 1471 */ 1472 KASSERT(bp->b_vp == NULL, ("bp2 %p flags %08x/%08x vnode %p unexpectededly still associated!", bp, saved_flags, bp->b_flags, bp->b_vp)); 1473 KKASSERT((bp->b_flags & B_HASHED) == 0); 1474 bp->b_flags |= B_INVAL; 1475 bp->b_qindex = BQUEUE_CLEAN; 1476 TAILQ_INSERT_HEAD(&bufqueues[BQUEUE_CLEAN], bp, b_freelist); 1477 } else { 1478 /* 1479 * Remaining buffers. These buffers are still associated with 1480 * their vnode. 1481 */ 1482 switch(bp->b_flags & (B_DELWRI|B_HEAVY)) { 1483 case B_DELWRI: 1484 bp->b_qindex = BQUEUE_DIRTY; 1485 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_DIRTY], bp, b_freelist); 1486 break; 1487 case B_DELWRI | B_HEAVY: 1488 bp->b_qindex = BQUEUE_DIRTY_HW; 1489 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_DIRTY_HW], bp, 1490 b_freelist); 1491 break; 1492 default: 1493 /* 1494 * NOTE: Buffers are always placed at the end of the 1495 * queue. If B_AGE is not set the buffer will cycle 1496 * through the queue twice. 1497 */ 1498 bp->b_qindex = BQUEUE_CLEAN; 1499 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_CLEAN], bp, b_freelist); 1500 break; 1501 } 1502 } 1503 spin_unlock_wr(&bufspin); 1504 1505 /* 1506 * If B_INVAL, clear B_DELWRI. We've already placed the buffer 1507 * on the correct queue. 1508 */ 1509 if ((bp->b_flags & (B_INVAL|B_DELWRI)) == (B_INVAL|B_DELWRI)) 1510 bundirty(bp); 1511 1512 /* 1513 * The bp is on an appropriate queue unless locked. If it is not 1514 * locked or dirty we can wakeup threads waiting for buffer space. 1515 * 1516 * We've already handled the B_INVAL case ( B_DELWRI will be clear 1517 * if B_INVAL is set ). 
	 */
	if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0)
		bufcountwakeup();

	/*
	 * Something we can maybe free or reuse
	 */
	if (bp->b_bufsize || bp->b_kvasize)
		bufspacewakeup();

	/*
	 * Clean up temporary flags and unlock the buffer.
	 */
	bp->b_flags &= ~(B_ORDERED | B_NOCACHE | B_RELBUF | B_DIRECT);
	BUF_UNLOCK(bp);
}

/*
 * bqrelse:
 *
 *	Release a buffer back to the appropriate queue but do not try to free
 *	it.  The buffer is expected to be used again soon.
 *
 *	bqrelse() is used by bdwrite() to requeue a delayed write, and used by
 *	biodone() to requeue an async I/O on completion.  It is also used when
 *	known good buffers need to be requeued but we think we may need the
 *	data again soon.
 *
 *	XXX we should be able to leave the B_RELBUF hint set on completion.
 *
 * MPSAFE
 */
void
bqrelse(struct buf *bp)
{
	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
		("bqrelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));

	if (bp->b_qindex != BQUEUE_NONE)
		panic("bqrelse: free buffer onto another queue???");
	if (BUF_REFCNTNB(bp) > 1) {
		/* do not release to free list */
		panic("bqrelse: multiple refs");
		return;
	}

	buf_act_advance(bp);

	spin_lock_wr(&bufspin);
	if (bp->b_flags & B_LOCKED) {
		/*
		 * Locked buffers are released to the locked queue.  However,
		 * if the buffer is dirty it will first go into the dirty
		 * queue and later on after the I/O completes successfully it
		 * will be released to the locked queue.
		 */
		bp->b_qindex = BQUEUE_LOCKED;
		TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_LOCKED], bp, b_freelist);
	} else if (bp->b_flags & B_DELWRI) {
		bp->b_qindex = (bp->b_flags & B_HEAVY) ?
			       BQUEUE_DIRTY_HW : BQUEUE_DIRTY;
		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
	} else if (vm_page_count_severe()) {
		/*
		 * We are too low on memory, we have to try to free the
		 * buffer (most importantly: the wired pages making up its
		 * backing store) *now*.
		 */
		spin_unlock_wr(&bufspin);
		brelse(bp);
		return;
	} else {
		bp->b_qindex = BQUEUE_CLEAN;
		TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_CLEAN], bp, b_freelist);
	}
	spin_unlock_wr(&bufspin);

	if ((bp->b_flags & B_LOCKED) == 0 &&
	    ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0)) {
		bufcountwakeup();
	}

	/*
	 * Something we can maybe free or reuse.
	 */
	if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
		bufspacewakeup();

	/*
	 * Final cleanup and unlock.  Clear bits that are only used while a
	 * buffer is actively locked.
	 */
	bp->b_flags &= ~(B_ORDERED | B_NOCACHE | B_RELBUF);
	BUF_UNLOCK(bp);
}

/*
 * vfs_vmio_release:
 *
 *	Return backing pages held by the buffer 'bp' back to the VM system
 *	if possible.  The pages are freed if they are no longer valid, and
 *	we attempt to free them if the buffer was used for direct I/O;
 *	otherwise they are sent to the page cache.
 *
 *	Pages that were marked busy are left alone and skipped.
 *
 *	The KVA mapping (b_data) for the underlying pages is removed by
 *	this function.
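 *
 *	(For orientation: whether an unwired page goes to the active or
 *	inactive queue is decided by comparing bp->b_act_count against
 *	the vfs.vm_cycle_point sysctl; single-use data, e.g. from a
 *	large sequential scan, tends to fall below the threshold and is
 *	recycled first.)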
1625 */ 1626 static void 1627 vfs_vmio_release(struct buf *bp) 1628 { 1629 int i; 1630 vm_page_t m; 1631 1632 crit_enter(); 1633 for (i = 0; i < bp->b_xio.xio_npages; i++) { 1634 m = bp->b_xio.xio_pages[i]; 1635 bp->b_xio.xio_pages[i] = NULL; 1636 1637 /* 1638 * This is a very important bit of code. We try to track 1639 * VM page use whether the pages are wired into the buffer 1640 * cache or not. While wired into the buffer cache the 1641 * bp tracks the act_count. 1642 * 1643 * We can choose to place unwired pages on the inactive 1644 * queue (0) or active queue (1). If we place too many 1645 * on the active queue the queue will cycle the act_count 1646 * on pages we'd like to keep, just from single-use pages 1647 * (such as when doing a tar-up or file scan). 1648 */ 1649 if (bp->b_act_count < vm_cycle_point) 1650 vm_page_unwire(m, 0); 1651 else 1652 vm_page_unwire(m, 1); 1653 1654 /* 1655 * We don't mess with busy pages, it is 1656 * the responsibility of the process that 1657 * busied the pages to deal with them. 1658 */ 1659 if ((m->flags & PG_BUSY) || (m->busy != 0)) 1660 continue; 1661 1662 if (m->wire_count == 0) { 1663 vm_page_flag_clear(m, PG_ZERO); 1664 /* 1665 * Might as well free the page if we can and it has 1666 * no valid data. We also free the page if the 1667 * buffer was used for direct I/O. 1668 */ 1669 #if 0 1670 if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && 1671 m->hold_count == 0) { 1672 vm_page_busy(m); 1673 vm_page_protect(m, VM_PROT_NONE); 1674 vm_page_free(m); 1675 } else 1676 #endif 1677 if (bp->b_flags & B_DIRECT) { 1678 vm_page_try_to_free(m); 1679 } else if (vm_page_count_severe()) { 1680 m->act_count = bp->b_act_count; 1681 vm_page_try_to_cache(m); 1682 } else { 1683 m->act_count = bp->b_act_count; 1684 } 1685 } 1686 } 1687 crit_exit(); 1688 pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_xio.xio_npages); 1689 if (bp->b_bufsize) { 1690 bufspacewakeup(); 1691 bp->b_bufsize = 0; 1692 } 1693 bp->b_xio.xio_npages = 0; 1694 bp->b_flags &= ~B_VMIO; 1695 KKASSERT (LIST_FIRST(&bp->b_dep) == NULL); 1696 if (bp->b_vp) { 1697 get_mplock(); 1698 brelvp(bp); 1699 rel_mplock(); 1700 } 1701 } 1702 1703 /* 1704 * vfs_bio_awrite: 1705 * 1706 * Implement clustered async writes for clearing out B_DELWRI buffers. 1707 * This is much better then the old way of writing only one buffer at 1708 * a time. Note that we may not be presented with the buffers in the 1709 * correct order, so we search for the cluster in both directions. 1710 * 1711 * The buffer is locked on call. 1712 */ 1713 int 1714 vfs_bio_awrite(struct buf *bp) 1715 { 1716 int i; 1717 int j; 1718 off_t loffset = bp->b_loffset; 1719 struct vnode *vp = bp->b_vp; 1720 int nbytes; 1721 struct buf *bpa; 1722 int nwritten; 1723 int size; 1724 1725 /* 1726 * right now we support clustered writing only to regular files. If 1727 * we find a clusterable block we could be in the middle of a cluster 1728 * rather then at the beginning. 1729 * 1730 * NOTE: b_bio1 contains the logical loffset and is aliased 1731 * to b_loffset. b_bio2 contains the translated block number. 
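	 *
	 * (Worked example with illustrative numbers: with f_iosize = 8192
	 * the forward loop probes loffset + 8192, loffset + 16384, ...
	 * and the backward loop loffset - 8192, ... for as long as each
	 * neighbour is a same-sized B_DELWRI|B_CLUSTEROK buffer whose
	 * bio2 offset is physically contiguous; if the combined span
	 * exceeds a single block the whole range is handed to
	 * cluster_wbuild() as one large write.)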
1732 */ 1733 if ((vp->v_type == VREG) && 1734 (vp->v_mount != 0) && /* Only on nodes that have the size info */ 1735 (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { 1736 1737 size = vp->v_mount->mnt_stat.f_iosize; 1738 1739 for (i = size; i < MAXPHYS; i += size) { 1740 if ((bpa = findblk(vp, loffset + i, FINDBLK_TEST)) && 1741 BUF_REFCNT(bpa) == 0 && 1742 ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == 1743 (B_DELWRI | B_CLUSTEROK)) && 1744 (bpa->b_bufsize == size)) { 1745 if ((bpa->b_bio2.bio_offset == NOOFFSET) || 1746 (bpa->b_bio2.bio_offset != 1747 bp->b_bio2.bio_offset + i)) 1748 break; 1749 } else { 1750 break; 1751 } 1752 } 1753 for (j = size; i + j <= MAXPHYS && j <= loffset; j += size) { 1754 if ((bpa = findblk(vp, loffset - j, FINDBLK_TEST)) && 1755 BUF_REFCNT(bpa) == 0 && 1756 ((bpa->b_flags & (B_DELWRI | B_CLUSTEROK | B_INVAL)) == 1757 (B_DELWRI | B_CLUSTEROK)) && 1758 (bpa->b_bufsize == size)) { 1759 if ((bpa->b_bio2.bio_offset == NOOFFSET) || 1760 (bpa->b_bio2.bio_offset != 1761 bp->b_bio2.bio_offset - j)) 1762 break; 1763 } else { 1764 break; 1765 } 1766 } 1767 j -= size; 1768 nbytes = (i + j); 1769 1770 /* 1771 * this is a possible cluster write 1772 */ 1773 if (nbytes != size) { 1774 BUF_UNLOCK(bp); 1775 nwritten = cluster_wbuild(vp, size, 1776 loffset - j, nbytes); 1777 return nwritten; 1778 } 1779 } 1780 1781 /* 1782 * default (old) behavior, writing out only one block 1783 * 1784 * XXX returns b_bufsize instead of b_bcount for nwritten? 1785 */ 1786 nwritten = bp->b_bufsize; 1787 bremfree(bp); 1788 bawrite(bp); 1789 1790 return nwritten; 1791 } 1792 1793 /* 1794 * getnewbuf: 1795 * 1796 * Find and initialize a new buffer header, freeing up existing buffers 1797 * in the bufqueues as necessary. The new buffer is returned locked. 1798 * 1799 * Important: B_INVAL is not set. If the caller wishes to throw the 1800 * buffer away, the caller must set B_INVAL prior to calling brelse(). 1801 * 1802 * We block if: 1803 * We have insufficient buffer headers 1804 * We have insufficient buffer space 1805 * buffer_map is too fragmented ( space reservation fails ) 1806 * If we have to flush dirty buffers ( but we try to avoid this ) 1807 * 1808 * To avoid VFS layer recursion we do not flush dirty buffers ourselves. 1809 * Instead we ask the buf daemon to do it for us. We attempt to 1810 * avoid piecemeal wakeups of the pageout daemon. 1811 * 1812 * MPALMOSTSAFE 1813 */ 1814 static struct buf * 1815 getnewbuf(int blkflags, int slptimeo, int size, int maxsize) 1816 { 1817 struct buf *bp; 1818 struct buf *nbp; 1819 int defrag = 0; 1820 int nqindex; 1821 int slpflags = (blkflags & GETBLK_PCATCH) ? PCATCH : 0; 1822 static int flushingbufs; 1823 1824 /* 1825 * We can't afford to block since we might be holding a vnode lock, 1826 * which may prevent system daemons from running. We deal with 1827 * low-memory situations by proactively returning memory and running 1828 * async I/O rather then sync I/O. 1829 */ 1830 1831 ++getnewbufcalls; 1832 --getnewbufrestarts; 1833 restart: 1834 ++getnewbufrestarts; 1835 1836 /* 1837 * Setup for scan. If we do not have enough free buffers, 1838 * we setup a degenerate case that immediately fails. Note 1839 * that if we are specially marked process, we are allowed to 1840 * dip into our reserves. 1841 * 1842 * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN 1843 * 1844 * We start with EMPTYKVA. If the list is empty we backup to EMPTY. 1845 * However, there are a number of cases (defragging, reusing, ...) 
1846 * where we cannot backup. 1847 */ 1848 nqindex = BQUEUE_EMPTYKVA; 1849 spin_lock_wr(&bufspin); 1850 nbp = TAILQ_FIRST(&bufqueues[BQUEUE_EMPTYKVA]); 1851 1852 if (nbp == NULL) { 1853 /* 1854 * If no EMPTYKVA buffers and we are either 1855 * defragging or reusing, locate a CLEAN buffer 1856 * to free or reuse. If bufspace useage is low 1857 * skip this step so we can allocate a new buffer. 1858 */ 1859 if (defrag || bufspace >= lobufspace) { 1860 nqindex = BQUEUE_CLEAN; 1861 nbp = TAILQ_FIRST(&bufqueues[BQUEUE_CLEAN]); 1862 } 1863 1864 /* 1865 * If we could not find or were not allowed to reuse a 1866 * CLEAN buffer, check to see if it is ok to use an EMPTY 1867 * buffer. We can only use an EMPTY buffer if allocating 1868 * its KVA would not otherwise run us out of buffer space. 1869 */ 1870 if (nbp == NULL && defrag == 0 && 1871 bufspace + maxsize < hibufspace) { 1872 nqindex = BQUEUE_EMPTY; 1873 nbp = TAILQ_FIRST(&bufqueues[BQUEUE_EMPTY]); 1874 } 1875 } 1876 1877 /* 1878 * Run scan, possibly freeing data and/or kva mappings on the fly 1879 * depending. 1880 * 1881 * WARNING! bufspin is held! 1882 */ 1883 while ((bp = nbp) != NULL) { 1884 int qindex = nqindex; 1885 1886 nbp = TAILQ_NEXT(bp, b_freelist); 1887 1888 /* 1889 * BQUEUE_CLEAN - B_AGE special case. If not set the bp 1890 * cycles through the queue twice before being selected. 1891 */ 1892 if (qindex == BQUEUE_CLEAN && 1893 (bp->b_flags & B_AGE) == 0 && nbp) { 1894 bp->b_flags |= B_AGE; 1895 TAILQ_REMOVE(&bufqueues[qindex], bp, b_freelist); 1896 TAILQ_INSERT_TAIL(&bufqueues[qindex], bp, b_freelist); 1897 continue; 1898 } 1899 1900 /* 1901 * Calculate next bp ( we can only use it if we do not block 1902 * or do other fancy things ). 1903 */ 1904 if (nbp == NULL) { 1905 switch(qindex) { 1906 case BQUEUE_EMPTY: 1907 nqindex = BQUEUE_EMPTYKVA; 1908 if ((nbp = TAILQ_FIRST(&bufqueues[BQUEUE_EMPTYKVA]))) 1909 break; 1910 /* fall through */ 1911 case BQUEUE_EMPTYKVA: 1912 nqindex = BQUEUE_CLEAN; 1913 if ((nbp = TAILQ_FIRST(&bufqueues[BQUEUE_CLEAN]))) 1914 break; 1915 /* fall through */ 1916 case BQUEUE_CLEAN: 1917 /* 1918 * nbp is NULL. 1919 */ 1920 break; 1921 } 1922 } 1923 1924 /* 1925 * Sanity Checks 1926 */ 1927 KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistent queue %d bp %p", qindex, bp)); 1928 1929 /* 1930 * Note: we no longer distinguish between VMIO and non-VMIO 1931 * buffers. 1932 */ 1933 1934 KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex)); 1935 1936 /* 1937 * If we are defragging then we need a buffer with 1938 * b_kvasize != 0. XXX this situation should no longer 1939 * occur, if defrag is non-zero the buffer's b_kvasize 1940 * should also be non-zero at this point. XXX 1941 */ 1942 if (defrag && bp->b_kvasize == 0) { 1943 kprintf("Warning: defrag empty buffer %p\n", bp); 1944 continue; 1945 } 1946 1947 /* 1948 * Start freeing the bp. This is somewhat involved. nbp 1949 * remains valid only for BQUEUE_EMPTY[KVA] bp's. Buffers 1950 * on the clean list must be disassociated from their 1951 * current vnode. Buffers on the empty[kva] lists have 1952 * already been disassociated. 
1953 */ 1954 1955 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 1956 spin_unlock_wr(&bufspin); 1957 kprintf("getnewbuf: warning, locked buf %p, race corrected\n", bp); 1958 tsleep(&bd_request, 0, "gnbxxx", hz / 100); 1959 goto restart; 1960 } 1961 if (bp->b_qindex != qindex) { 1962 spin_unlock_wr(&bufspin); 1963 kprintf("getnewbuf: warning, BUF_LOCK blocked unexpectedly on buf %p index %d->%d, race corrected\n", bp, qindex, bp->b_qindex); 1964 BUF_UNLOCK(bp); 1965 goto restart; 1966 } 1967 bremfree_locked(bp); 1968 spin_unlock_wr(&bufspin); 1969 1970 /* 1971 * Dependancies must be handled before we disassociate the 1972 * vnode. 1973 * 1974 * NOTE: HAMMER will set B_LOCKED if the buffer cannot 1975 * be immediately disassociated. HAMMER then becomes 1976 * responsible for releasing the buffer. 1977 * 1978 * NOTE: bufspin is UNLOCKED now. 1979 */ 1980 if (LIST_FIRST(&bp->b_dep) != NULL) { 1981 get_mplock(); 1982 buf_deallocate(bp); 1983 rel_mplock(); 1984 if (bp->b_flags & B_LOCKED) { 1985 bqrelse(bp); 1986 goto restart; 1987 } 1988 KKASSERT(LIST_FIRST(&bp->b_dep) == NULL); 1989 } 1990 1991 if (qindex == BQUEUE_CLEAN) { 1992 get_mplock(); 1993 if (bp->b_flags & B_VMIO) { 1994 get_mplock(); 1995 vfs_vmio_release(bp); 1996 rel_mplock(); 1997 } 1998 if (bp->b_vp) 1999 brelvp(bp); 2000 rel_mplock(); 2001 } 2002 2003 /* 2004 * NOTE: nbp is now entirely invalid. We can only restart 2005 * the scan from this point on. 2006 * 2007 * Get the rest of the buffer freed up. b_kva* is still 2008 * valid after this operation. 2009 */ 2010 2011 KASSERT(bp->b_vp == NULL, ("bp3 %p flags %08x vnode %p qindex %d unexpectededly still associated!", bp, bp->b_flags, bp->b_vp, qindex)); 2012 KKASSERT((bp->b_flags & B_HASHED) == 0); 2013 2014 /* 2015 * critical section protection is not required when 2016 * scrapping a buffer's contents because it is already 2017 * wired. 2018 */ 2019 if (bp->b_bufsize) { 2020 get_mplock(); 2021 allocbuf(bp, 0); 2022 rel_mplock(); 2023 } 2024 2025 bp->b_flags = B_BNOCLIP; 2026 bp->b_cmd = BUF_CMD_DONE; 2027 bp->b_vp = NULL; 2028 bp->b_error = 0; 2029 bp->b_resid = 0; 2030 bp->b_bcount = 0; 2031 bp->b_xio.xio_npages = 0; 2032 bp->b_dirtyoff = bp->b_dirtyend = 0; 2033 bp->b_act_count = ACT_INIT; 2034 reinitbufbio(bp); 2035 KKASSERT(LIST_FIRST(&bp->b_dep) == NULL); 2036 buf_dep_init(bp); 2037 if (blkflags & GETBLK_BHEAVY) 2038 bp->b_flags |= B_HEAVY; 2039 2040 /* 2041 * If we are defragging then free the buffer. 2042 */ 2043 if (defrag) { 2044 bp->b_flags |= B_INVAL; 2045 bfreekva(bp); 2046 brelse(bp); 2047 defrag = 0; 2048 goto restart; 2049 } 2050 2051 /* 2052 * If we are overcomitted then recover the buffer and its 2053 * KVM space. This occurs in rare situations when multiple 2054 * processes are blocked in getnewbuf() or allocbuf(). 2055 */ 2056 if (bufspace >= hibufspace) 2057 flushingbufs = 1; 2058 if (flushingbufs && bp->b_kvasize != 0) { 2059 bp->b_flags |= B_INVAL; 2060 bfreekva(bp); 2061 brelse(bp); 2062 goto restart; 2063 } 2064 if (bufspace < lobufspace) 2065 flushingbufs = 0; 2066 break; 2067 /* NOT REACHED, bufspin not held */ 2068 } 2069 2070 /* 2071 * If we exhausted our list, sleep as appropriate. We may have to 2072 * wakeup various daemons and write out some dirty buffers. 2073 * 2074 * Generally we are sleeping due to insufficient buffer space. 2075 * 2076 * NOTE: bufspin is held if bp is NULL, else it is not held. 
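 *
 * Two outcomes follow.  With bp == NULL we record what we are
 * short of in needsbuffer, poke bd_speedup() and tsleep() until
 * another thread clears the flag for us.  With a valid bp we
 * still have to reserve KVA for it, rounded up to a BKVASIZE
 * boundary; as a purely illustrative example, if BKVASIZE were
 * 16K then a 9000 byte request would reserve a single 16K chunk
 * so buffer_map does not fragment into odd-sized pieces.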
2077 */ 2078 if (bp == NULL) { 2079 int flags; 2080 char *waitmsg; 2081 2082 spin_unlock_wr(&bufspin); 2083 if (defrag) { 2084 flags = VFS_BIO_NEED_BUFSPACE; 2085 waitmsg = "nbufkv"; 2086 } else if (bufspace >= hibufspace) { 2087 waitmsg = "nbufbs"; 2088 flags = VFS_BIO_NEED_BUFSPACE; 2089 } else { 2090 waitmsg = "newbuf"; 2091 flags = VFS_BIO_NEED_ANY; 2092 } 2093 2094 needsbuffer |= flags; 2095 bd_speedup(); /* heeeelp */ 2096 while (needsbuffer & flags) { 2097 if (tsleep(&needsbuffer, slpflags, waitmsg, slptimeo)) 2098 return (NULL); 2099 } 2100 } else { 2101 /* 2102 * We finally have a valid bp. We aren't quite out of the 2103 * woods, we still have to reserve kva space. In order 2104 * to keep fragmentation sane we only allocate kva in 2105 * BKVASIZE chunks. 2106 * 2107 * (bufspin is not held) 2108 */ 2109 maxsize = (maxsize + BKVAMASK) & ~BKVAMASK; 2110 2111 if (maxsize != bp->b_kvasize) { 2112 vm_offset_t addr = 0; 2113 int count; 2114 2115 bfreekva(bp); 2116 2117 get_mplock(); 2118 count = vm_map_entry_reserve(MAP_RESERVE_COUNT); 2119 vm_map_lock(&buffer_map); 2120 2121 if (vm_map_findspace(&buffer_map, 2122 vm_map_min(&buffer_map), maxsize, 2123 maxsize, 0, &addr)) { 2124 /* 2125 * Uh oh. Buffer map is too fragmented. We 2126 * must defragment the map. 2127 */ 2128 vm_map_unlock(&buffer_map); 2129 vm_map_entry_release(count); 2130 ++bufdefragcnt; 2131 defrag = 1; 2132 bp->b_flags |= B_INVAL; 2133 rel_mplock(); 2134 brelse(bp); 2135 goto restart; 2136 } 2137 if (addr) { 2138 vm_map_insert(&buffer_map, &count, 2139 NULL, 0, 2140 addr, addr + maxsize, 2141 VM_MAPTYPE_NORMAL, 2142 VM_PROT_ALL, VM_PROT_ALL, 2143 MAP_NOFAULT); 2144 2145 bp->b_kvabase = (caddr_t) addr; 2146 bp->b_kvasize = maxsize; 2147 bufspace += bp->b_kvasize; 2148 ++bufreusecnt; 2149 } 2150 vm_map_unlock(&buffer_map); 2151 vm_map_entry_release(count); 2152 rel_mplock(); 2153 } 2154 bp->b_data = bp->b_kvabase; 2155 } 2156 return(bp); 2157 } 2158 2159 /* 2160 * This routine is called in an emergency to recover VM pages from the 2161 * buffer cache by cashing in clean buffers. The idea is to recover 2162 * enough pages to be able to satisfy a stuck bio_page_alloc(). 2163 */ 2164 static int 2165 recoverbufpages(void) 2166 { 2167 struct buf *bp; 2168 int bytes = 0; 2169 2170 ++recoverbufcalls; 2171 2172 spin_lock_wr(&bufspin); 2173 while (bytes < MAXBSIZE) { 2174 bp = TAILQ_FIRST(&bufqueues[BQUEUE_CLEAN]); 2175 if (bp == NULL) 2176 break; 2177 2178 /* 2179 * BQUEUE_CLEAN - B_AGE special case. If not set the bp 2180 * cycles through the queue twice before being selected. 2181 */ 2182 if ((bp->b_flags & B_AGE) == 0 && TAILQ_NEXT(bp, b_freelist)) { 2183 bp->b_flags |= B_AGE; 2184 TAILQ_REMOVE(&bufqueues[BQUEUE_CLEAN], bp, b_freelist); 2185 TAILQ_INSERT_TAIL(&bufqueues[BQUEUE_CLEAN], 2186 bp, b_freelist); 2187 continue; 2188 } 2189 2190 /* 2191 * Sanity Checks 2192 */ 2193 KKASSERT(bp->b_qindex == BQUEUE_CLEAN); 2194 KKASSERT((bp->b_flags & B_DELWRI) == 0); 2195 2196 /* 2197 * Start freeing the bp. This is somewhat involved. 
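 * The sequence below intentionally mirrors the teardown in
 * getnewbuf() above (non-blocking BUF_LOCK, re-check b_qindex,
 * bremfree, buf_deallocate, vfs_vmio_release), except that
 * B_DIRECT is set first so vfs_vmio_release() tries to free the
 * underlying pages outright instead of caching them.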
2198 * 2199 * Buffers on the clean list must be disassociated from 2200 * their current vnode 2201 */ 2202 2203 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) { 2204 kprintf("recoverbufpages: warning, locked buf %p, race corrected\n", bp); 2205 tsleep(&bd_request, 0, "gnbxxx", hz / 100); 2206 continue; 2207 } 2208 if (bp->b_qindex != BQUEUE_CLEAN) { 2209 kprintf("recoverbufpages: warning, BUF_LOCK blocked unexpectedly on buf %p index %d, race corrected\n", bp, bp->b_qindex); 2210 BUF_UNLOCK(bp); 2211 continue; 2212 } 2213 bremfree_locked(bp); 2214 spin_unlock_wr(&bufspin); 2215 2216 /* 2217 * Dependancies must be handled before we disassociate the 2218 * vnode. 2219 * 2220 * NOTE: HAMMER will set B_LOCKED if the buffer cannot 2221 * be immediately disassociated. HAMMER then becomes 2222 * responsible for releasing the buffer. 2223 */ 2224 if (LIST_FIRST(&bp->b_dep) != NULL) { 2225 buf_deallocate(bp); 2226 if (bp->b_flags & B_LOCKED) { 2227 bqrelse(bp); 2228 spin_lock_wr(&bufspin); 2229 continue; 2230 } 2231 KKASSERT(LIST_FIRST(&bp->b_dep) == NULL); 2232 } 2233 2234 bytes += bp->b_bufsize; 2235 2236 get_mplock(); 2237 if (bp->b_flags & B_VMIO) { 2238 bp->b_flags |= B_DIRECT; /* try to free pages */ 2239 vfs_vmio_release(bp); 2240 } 2241 if (bp->b_vp) 2242 brelvp(bp); 2243 2244 KKASSERT(bp->b_vp == NULL); 2245 KKASSERT((bp->b_flags & B_HASHED) == 0); 2246 2247 /* 2248 * critical section protection is not required when 2249 * scrapping a buffer's contents because it is already 2250 * wired. 2251 */ 2252 if (bp->b_bufsize) 2253 allocbuf(bp, 0); 2254 rel_mplock(); 2255 2256 bp->b_flags = B_BNOCLIP; 2257 bp->b_cmd = BUF_CMD_DONE; 2258 bp->b_vp = NULL; 2259 bp->b_error = 0; 2260 bp->b_resid = 0; 2261 bp->b_bcount = 0; 2262 bp->b_xio.xio_npages = 0; 2263 bp->b_dirtyoff = bp->b_dirtyend = 0; 2264 reinitbufbio(bp); 2265 KKASSERT(LIST_FIRST(&bp->b_dep) == NULL); 2266 buf_dep_init(bp); 2267 bp->b_flags |= B_INVAL; 2268 /* bfreekva(bp); */ 2269 brelse(bp); 2270 spin_lock_wr(&bufspin); 2271 } 2272 spin_unlock_wr(&bufspin); 2273 return(bytes); 2274 } 2275 2276 /* 2277 * buf_daemon: 2278 * 2279 * Buffer flushing daemon. Buffers are normally flushed by the 2280 * update daemon but if it cannot keep up this process starts to 2281 * take the load in an attempt to prevent getnewbuf() from blocking. 2282 * 2283 * Once a flush is initiated it does not stop until the number 2284 * of buffers falls below lodirtybuffers, but we will wake up anyone 2285 * waiting at the mid-point. 2286 */ 2287 2288 static struct kproc_desc buf_kp = { 2289 "bufdaemon", 2290 buf_daemon, 2291 &bufdaemon_td 2292 }; 2293 SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, 2294 kproc_start, &buf_kp) 2295 2296 static struct kproc_desc bufhw_kp = { 2297 "bufdaemon_hw", 2298 buf_daemon_hw, 2299 &bufdaemonhw_td 2300 }; 2301 SYSINIT(bufdaemon_hw, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, 2302 kproc_start, &bufhw_kp) 2303 2304 static void 2305 buf_daemon(void) 2306 { 2307 int limit; 2308 2309 /* 2310 * This process needs to be suspended prior to shutdown sync. 2311 */ 2312 EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, 2313 bufdaemon_td, SHUTDOWN_PRI_LAST); 2314 curthread->td_flags |= TDF_SYSTHREAD; 2315 2316 /* 2317 * This process is allowed to take the buffer cache to the limit 2318 */ 2319 crit_enter(); 2320 2321 for (;;) { 2322 kproc_suspend_loop(); 2323 2324 /* 2325 * Do the flush as long as the number of dirty buffers 2326 * (including those running) exceeds lodirtybufspace. 
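 *
 * As a purely illustrative example (the real thresholds are
 * tunable and system dependent): if lodirtybufspace were 4MB the
 * per-daemon limit below works out to 2MB, so with 1.5MB of
 * writes already in flight and 1MB of queued dirty data this
 * loop keeps flushing (2.5MB > 2MB) and goes back to sleep once
 * the sum drops under the 2MB mark.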
2327 * 2328 * When flushing limit running I/O to hirunningspace 2329 * Do the flush. Limit the amount of in-transit I/O we 2330 * allow to build up, otherwise we would completely saturate 2331 * the I/O system. Wakeup any waiting processes before we 2332 * normally would so they can run in parallel with our drain. 2333 * 2334 * Our aggregate normal+HW lo water mark is lodirtybufspace, 2335 * but because we split the operation into two threads we 2336 * have to cut it in half for each thread. 2337 */ 2338 waitrunningbufspace(); 2339 limit = lodirtybufspace / 2; 2340 while (runningbufspace + dirtybufspace > limit || 2341 dirtybufcount - dirtybufcounthw >= nbuf / 2) { 2342 if (flushbufqueues(BQUEUE_DIRTY) == 0) 2343 break; 2344 if (runningbufspace < hirunningspace) 2345 continue; 2346 waitrunningbufspace(); 2347 } 2348 2349 /* 2350 * We reached our low water mark, reset the 2351 * request and sleep until we are needed again. 2352 * The sleep is just so the suspend code works. 2353 */ 2354 spin_lock_wr(&needsbuffer_spin); 2355 if (bd_request == 0) { 2356 ssleep(&bd_request, &needsbuffer_spin, 0, 2357 "psleep", hz); 2358 } 2359 bd_request = 0; 2360 spin_unlock_wr(&needsbuffer_spin); 2361 } 2362 } 2363 2364 static void 2365 buf_daemon_hw(void) 2366 { 2367 int limit; 2368 2369 /* 2370 * This process needs to be suspended prior to shutdown sync. 2371 */ 2372 EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc, 2373 bufdaemonhw_td, SHUTDOWN_PRI_LAST); 2374 curthread->td_flags |= TDF_SYSTHREAD; 2375 2376 /* 2377 * This process is allowed to take the buffer cache to the limit 2378 */ 2379 crit_enter(); 2380 2381 for (;;) { 2382 kproc_suspend_loop(); 2383 2384 /* 2385 * Do the flush. Limit the amount of in-transit I/O we 2386 * allow to build up, otherwise we would completely saturate 2387 * the I/O system. Wakeup any waiting processes before we 2388 * normally would so they can run in parallel with our drain. 2389 * 2390 * Once we decide to flush push the queued I/O up to 2391 * hirunningspace in order to trigger bursting by the bioq 2392 * subsystem. 2393 * 2394 * Our aggregate normal+HW lo water mark is lodirtybufspace, 2395 * but because we split the operation into two threads we 2396 * have to cut it in half for each thread. 2397 */ 2398 waitrunningbufspace(); 2399 limit = lodirtybufspace / 2; 2400 while (runningbufspace + dirtybufspacehw > limit || 2401 dirtybufcounthw >= nbuf / 2) { 2402 if (flushbufqueues(BQUEUE_DIRTY_HW) == 0) 2403 break; 2404 if (runningbufspace < hirunningspace) 2405 continue; 2406 waitrunningbufspace(); 2407 } 2408 2409 /* 2410 * We reached our low water mark, reset the 2411 * request and sleep until we are needed again. 2412 * The sleep is just so the suspend code works. 2413 */ 2414 spin_lock_wr(&needsbuffer_spin); 2415 if (bd_request_hw == 0) { 2416 ssleep(&bd_request_hw, &needsbuffer_spin, 0, 2417 "psleep", hz); 2418 } 2419 bd_request_hw = 0; 2420 spin_unlock_wr(&needsbuffer_spin); 2421 } 2422 } 2423 2424 /* 2425 * flushbufqueues: 2426 * 2427 * Try to flush a buffer in the dirty queue. We must be careful to 2428 * free up B_INVAL buffers instead of write them, which NFS is 2429 * particularly sensitive to. 2430 * 2431 * B_RELBUF may only be set by VFSs. We do set B_AGE to indicate 2432 * that we really want to try to get the buffer out and reuse it 2433 * due to the write load on the machine. 
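 *
 * Each call tries to dispose of exactly one buffer and returns
 * non-zero on success, which is why the daemons above simply call
 * it in a loop: B_INVAL buffers are freed via brelse(), buffers
 * whose dependencies cannot be flushed yet are requeued with
 * B_DEFERRED, and everything else is pushed out asynchronously
 * with vfs_bio_awrite().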
2434 */ 2435 static int 2436 flushbufqueues(bufq_type_t q) 2437 { 2438 struct buf *bp; 2439 int r = 0; 2440 int spun; 2441 2442 spin_lock_wr(&bufspin); 2443 spun = 1; 2444 2445 bp = TAILQ_FIRST(&bufqueues[q]); 2446 while (bp) { 2447 KASSERT((bp->b_flags & B_DELWRI), 2448 ("unexpected clean buffer %p", bp)); 2449 2450 if (bp->b_flags & B_DELWRI) { 2451 if (bp->b_flags & B_INVAL) { 2452 spin_unlock_wr(&bufspin); 2453 spun = 0; 2454 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) != 0) 2455 panic("flushbufqueues: locked buf"); 2456 bremfree(bp); 2457 brelse(bp); 2458 ++r; 2459 break; 2460 } 2461 if (LIST_FIRST(&bp->b_dep) != NULL && 2462 (bp->b_flags & B_DEFERRED) == 0 && 2463 buf_countdeps(bp, 0)) { 2464 TAILQ_REMOVE(&bufqueues[q], bp, b_freelist); 2465 TAILQ_INSERT_TAIL(&bufqueues[q], bp, 2466 b_freelist); 2467 bp->b_flags |= B_DEFERRED; 2468 bp = TAILQ_FIRST(&bufqueues[q]); 2469 continue; 2470 } 2471 2472 /* 2473 * Only write it out if we can successfully lock 2474 * it. If the buffer has a dependancy, 2475 * buf_checkwrite must also return 0 for us to 2476 * be able to initate the write. 2477 * 2478 * If the buffer is flagged B_ERROR it may be 2479 * requeued over and over again, we try to 2480 * avoid a live lock. 2481 */ 2482 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT) == 0) { 2483 spin_unlock_wr(&bufspin); 2484 spun = 0; 2485 if (LIST_FIRST(&bp->b_dep) != NULL && 2486 buf_checkwrite(bp)) { 2487 bremfree(bp); 2488 brelse(bp); 2489 } else if (bp->b_flags & B_ERROR) { 2490 tsleep(bp, 0, "bioer", 1); 2491 bp->b_flags &= ~B_AGE; 2492 vfs_bio_awrite(bp); 2493 } else { 2494 bp->b_flags |= B_AGE; 2495 vfs_bio_awrite(bp); 2496 } 2497 ++r; 2498 break; 2499 } 2500 } 2501 bp = TAILQ_NEXT(bp, b_freelist); 2502 } 2503 if (spun) 2504 spin_unlock_wr(&bufspin); 2505 return (r); 2506 } 2507 2508 /* 2509 * inmem: 2510 * 2511 * Returns true if no I/O is needed to access the associated VM object. 2512 * This is like findblk except it also hunts around in the VM system for 2513 * the data. 2514 * 2515 * Note that we ignore vm_page_free() races from interrupts against our 2516 * lookup, since if the caller is not protected our return value will not 2517 * be any more valid then otherwise once we exit the critical section. 2518 */ 2519 int 2520 inmem(struct vnode *vp, off_t loffset) 2521 { 2522 vm_object_t obj; 2523 vm_offset_t toff, tinc, size; 2524 vm_page_t m; 2525 2526 if (findblk(vp, loffset, FINDBLK_TEST)) 2527 return 1; 2528 if (vp->v_mount == NULL) 2529 return 0; 2530 if ((obj = vp->v_object) == NULL) 2531 return 0; 2532 2533 size = PAGE_SIZE; 2534 if (size > vp->v_mount->mnt_stat.f_iosize) 2535 size = vp->v_mount->mnt_stat.f_iosize; 2536 2537 for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { 2538 m = vm_page_lookup(obj, OFF_TO_IDX(loffset + toff)); 2539 if (m == NULL) 2540 return 0; 2541 tinc = size; 2542 if (tinc > PAGE_SIZE - ((toff + loffset) & PAGE_MASK)) 2543 tinc = PAGE_SIZE - ((toff + loffset) & PAGE_MASK); 2544 if (vm_page_is_valid(m, 2545 (vm_offset_t) ((toff + loffset) & PAGE_MASK), tinc) == 0) 2546 return 0; 2547 } 2548 return 1; 2549 } 2550 2551 /* 2552 * findblk: 2553 * 2554 * Locate and return the specified buffer. Unless flagged otherwise, 2555 * a locked buffer will be returned if it exists or NULL if it does not. 2556 * 2557 * findblk()'d buffers are still on the bufqueues and if you intend 2558 * to use your (locked NON-TEST) buffer you need to bremfree(bp) 2559 * and possibly do other stuff to it. 2560 * 2561 * FINDBLK_TEST - Do not lock the buffer. 
The caller is responsible 2562 * for locking the buffer and ensuring that it remains 2563 * the desired buffer after locking. 2564 * 2565 * FINDBLK_NBLOCK - Lock the buffer non-blocking. If we are unable 2566 * to acquire the lock we return NULL, even if the 2567 * buffer exists. 2568 * 2569 * (0) - Lock the buffer blocking. 2570 * 2571 * MPSAFE 2572 */ 2573 struct buf * 2574 findblk(struct vnode *vp, off_t loffset, int flags) 2575 { 2576 lwkt_tokref vlock; 2577 struct buf *bp; 2578 int lkflags; 2579 2580 lkflags = LK_EXCLUSIVE; 2581 if (flags & FINDBLK_NBLOCK) 2582 lkflags |= LK_NOWAIT; 2583 2584 for (;;) { 2585 lwkt_gettoken(&vlock, &vp->v_token); 2586 bp = buf_rb_hash_RB_LOOKUP(&vp->v_rbhash_tree, loffset); 2587 lwkt_reltoken(&vlock); 2588 if (bp == NULL || (flags & FINDBLK_TEST)) 2589 break; 2590 if (BUF_LOCK(bp, lkflags)) { 2591 bp = NULL; 2592 break; 2593 } 2594 if (bp->b_vp == vp && bp->b_loffset == loffset) 2595 break; 2596 BUF_UNLOCK(bp); 2597 } 2598 return(bp); 2599 } 2600 2601 /* 2602 * getcacheblk: 2603 * 2604 * Similar to getblk() except only returns the buffer if it is 2605 * B_CACHE and requires no other manipulation. Otherwise NULL 2606 * is returned. 2607 * 2608 * If B_RAM is set the buffer might be just fine, but we return 2609 * NULL anyway because we want the code to fall through to the 2610 * cluster read. Otherwise read-ahead breaks. 2611 */ 2612 struct buf * 2613 getcacheblk(struct vnode *vp, off_t loffset) 2614 { 2615 struct buf *bp; 2616 2617 bp = findblk(vp, loffset, 0); 2618 if (bp) { 2619 if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) { 2620 bp->b_flags &= ~B_AGE; 2621 bremfree(bp); 2622 } else { 2623 BUF_UNLOCK(bp); 2624 bp = NULL; 2625 } 2626 } 2627 return (bp); 2628 } 2629 2630 /* 2631 * getblk: 2632 * 2633 * Get a block given a specified block and offset into a file/device. 2634 * B_INVAL may or may not be set on return. The caller should clear 2635 * B_INVAL prior to initiating a READ. 2636 * 2637 * IT IS IMPORTANT TO UNDERSTAND THAT IF YOU CALL GETBLK() AND B_CACHE 2638 * IS NOT SET, YOU MUST INITIALIZE THE RETURNED BUFFER, ISSUE A READ, 2639 * OR SET B_INVAL BEFORE RETIRING IT. If you retire a getblk'd buffer 2640 * without doing any of those things the system will likely believe 2641 * the buffer to be valid (especially if it is not B_VMIO), and the 2642 * next getblk() will return the buffer with B_CACHE set. 2643 * 2644 * For a non-VMIO buffer, B_CACHE is set to the opposite of B_INVAL for 2645 * an existing buffer. 2646 * 2647 * For a VMIO buffer, B_CACHE is modified according to the backing VM. 2648 * If getblk()ing a previously 0-sized invalid buffer, B_CACHE is set 2649 * and then cleared based on the backing VM. If the previous buffer is 2650 * non-0-sized but invalid, B_CACHE will be cleared. 2651 * 2652 * If getblk() must create a new buffer, the new buffer is returned with 2653 * both B_INVAL and B_CACHE clear unless it is a VMIO buffer, in which 2654 * case it is returned with B_INVAL clear and B_CACHE set based on the 2655 * backing VM. 2656 * 2657 * getblk() also forces a bwrite() for any B_DELWRI buffer whos 2658 * B_CACHE bit is clear. 2659 * 2660 * What this means, basically, is that the caller should use B_CACHE to 2661 * determine whether the buffer is fully valid or not and should clear 2662 * B_INVAL prior to issuing a read. If the caller intends to validate 2663 * the buffer by loading its data area with something, the caller needs 2664 * to clear B_INVAL. 
If the caller does this without issuing an I/O, 2665 * the caller should set B_CACHE ( as an optimization ), else the caller 2666 * should issue the I/O and biodone() will set B_CACHE if the I/O was 2667 * a write attempt or if it was a successfull read. If the caller 2668 * intends to issue a READ, the caller must clear B_INVAL and B_ERROR 2669 * prior to issuing the READ. biodone() will *not* clear B_INVAL. 2670 * 2671 * getblk flags: 2672 * 2673 * GETBLK_PCATCH - catch signal if blocked, can cause NULL return 2674 * GETBLK_BHEAVY - heavy-weight buffer cache buffer 2675 * 2676 * MPALMOSTSAFE 2677 */ 2678 struct buf * 2679 getblk(struct vnode *vp, off_t loffset, int size, int blkflags, int slptimeo) 2680 { 2681 struct buf *bp; 2682 int slpflags = (blkflags & GETBLK_PCATCH) ? PCATCH : 0; 2683 int error; 2684 int lkflags; 2685 2686 if (size > MAXBSIZE) 2687 panic("getblk: size(%d) > MAXBSIZE(%d)", size, MAXBSIZE); 2688 if (vp->v_object == NULL) 2689 panic("getblk: vnode %p has no object!", vp); 2690 2691 loop: 2692 if ((bp = findblk(vp, loffset, FINDBLK_TEST)) != NULL) { 2693 /* 2694 * The buffer was found in the cache, but we need to lock it. 2695 * Even with LK_NOWAIT the lockmgr may break our critical 2696 * section, so double-check the validity of the buffer 2697 * once the lock has been obtained. 2698 */ 2699 if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) { 2700 if (blkflags & GETBLK_NOWAIT) 2701 return(NULL); 2702 lkflags = LK_EXCLUSIVE | LK_SLEEPFAIL; 2703 if (blkflags & GETBLK_PCATCH) 2704 lkflags |= LK_PCATCH; 2705 error = BUF_TIMELOCK(bp, lkflags, "getblk", slptimeo); 2706 if (error) { 2707 if (error == ENOLCK) 2708 goto loop; 2709 return (NULL); 2710 } 2711 /* buffer may have changed on us */ 2712 } 2713 2714 /* 2715 * Once the buffer has been locked, make sure we didn't race 2716 * a buffer recyclement. Buffers that are no longer hashed 2717 * will have b_vp == NULL, so this takes care of that check 2718 * as well. 2719 */ 2720 if (bp->b_vp != vp || bp->b_loffset != loffset) { 2721 kprintf("Warning buffer %p (vp %p loffset %lld) " 2722 "was recycled\n", 2723 bp, vp, (long long)loffset); 2724 BUF_UNLOCK(bp); 2725 goto loop; 2726 } 2727 2728 /* 2729 * If SZMATCH any pre-existing buffer must be of the requested 2730 * size or NULL is returned. The caller absolutely does not 2731 * want getblk() to bwrite() the buffer on a size mismatch. 2732 */ 2733 if ((blkflags & GETBLK_SZMATCH) && size != bp->b_bcount) { 2734 BUF_UNLOCK(bp); 2735 return(NULL); 2736 } 2737 2738 /* 2739 * All vnode-based buffers must be backed by a VM object. 2740 */ 2741 KKASSERT(bp->b_flags & B_VMIO); 2742 KKASSERT(bp->b_cmd == BUF_CMD_DONE); 2743 bp->b_flags &= ~B_AGE; 2744 2745 /* 2746 * Make sure that B_INVAL buffers do not have a cached 2747 * block number translation. 2748 */ 2749 if ((bp->b_flags & B_INVAL) && (bp->b_bio2.bio_offset != NOOFFSET)) { 2750 kprintf("Warning invalid buffer %p (vp %p loffset %lld)" 2751 " did not have cleared bio_offset cache\n", 2752 bp, vp, (long long)loffset); 2753 clearbiocache(&bp->b_bio2); 2754 } 2755 2756 /* 2757 * The buffer is locked. B_CACHE is cleared if the buffer is 2758 * invalid. 2759 */ 2760 if (bp->b_flags & B_INVAL) 2761 bp->b_flags &= ~B_CACHE; 2762 bremfree(bp); 2763 2764 /* 2765 * Any size inconsistancy with a dirty buffer or a buffer 2766 * with a softupdates dependancy must be resolved. Resizing 2767 * the buffer in such circumstances can lead to problems. 2768 * 2769 * Dirty or dependant buffers are written synchronously. 
2770 * Other types of buffers are simply released and 2771 * reconstituted as they may be backed by valid, dirty VM 2772 * pages (but not marked B_DELWRI). 2773 * 2774 * NFS NOTE: NFS buffers which straddle EOF are oddly-sized 2775 * and may be left over from a prior truncation (and thus 2776 * no longer represent the actual EOF point), so we 2777 * definitely do not want to B_NOCACHE the backing store. 2778 */ 2779 if (size != bp->b_bcount) { 2780 get_mplock(); 2781 if (bp->b_flags & B_DELWRI) { 2782 bp->b_flags |= B_RELBUF; 2783 bwrite(bp); 2784 } else if (LIST_FIRST(&bp->b_dep)) { 2785 bp->b_flags |= B_RELBUF; 2786 bwrite(bp); 2787 } else { 2788 bp->b_flags |= B_RELBUF; 2789 brelse(bp); 2790 } 2791 rel_mplock(); 2792 goto loop; 2793 } 2794 KKASSERT(size <= bp->b_kvasize); 2795 KASSERT(bp->b_loffset != NOOFFSET, 2796 ("getblk: no buffer offset")); 2797 2798 /* 2799 * A buffer with B_DELWRI set and B_CACHE clear must 2800 * be committed before we can return the buffer in 2801 * order to prevent the caller from issuing a read 2802 * ( due to B_CACHE not being set ) and overwriting 2803 * it. 2804 * 2805 * Most callers, including NFS and FFS, need this to 2806 * operate properly either because they assume they 2807 * can issue a read if B_CACHE is not set, or because 2808 * ( for example ) an uncached B_DELWRI might loop due 2809 * to softupdates re-dirtying the buffer. In the latter 2810 * case, B_CACHE is set after the first write completes, 2811 * preventing further loops. 2812 * 2813 * NOTE! b*write() sets B_CACHE. If we cleared B_CACHE 2814 * above while extending the buffer, we cannot allow the 2815 * buffer to remain with B_CACHE set after the write 2816 * completes or it will represent a corrupt state. To 2817 * deal with this we set B_NOCACHE to scrap the buffer 2818 * after the write. 2819 * 2820 * XXX Should this be B_RELBUF instead of B_NOCACHE? 2821 * I'm not even sure this state is still possible 2822 * now that getblk() writes out any dirty buffers 2823 * on size changes. 2824 * 2825 * We might be able to do something fancy, like setting 2826 * B_CACHE in bwrite() except if B_DELWRI is already set, 2827 * so the below call doesn't set B_CACHE, but that gets real 2828 * confusing. This is much easier. 2829 */ 2830 2831 if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { 2832 get_mplock(); 2833 kprintf("getblk: Warning, bp %p loff=%jx DELWRI set " 2834 "and CACHE clear, b_flags %08x\n", 2835 bp, (intmax_t)bp->b_loffset, bp->b_flags); 2836 bp->b_flags |= B_NOCACHE; 2837 bwrite(bp); 2838 rel_mplock(); 2839 goto loop; 2840 } 2841 } else { 2842 /* 2843 * Buffer is not in-core, create new buffer. The buffer 2844 * returned by getnewbuf() is locked. Note that the returned 2845 * buffer is also considered valid (not marked B_INVAL). 2846 * 2847 * Calculating the offset for the I/O requires figuring out 2848 * the block size. We use DEV_BSIZE for VBLK or VCHR and 2849 * the mount's f_iosize otherwise. If the vnode does not 2850 * have an associated mount we assume that the passed size is 2851 * the block size. 2852 * 2853 * Note that vn_isdisk() cannot be used here since it may 2854 * return a failure for numerous reasons. Note that the 2855 * buffer size may be larger then the block size (the caller 2856 * will use block numbers with the proper multiple). Beware 2857 * of using any v_* fields which are part of unions. 
In 2858 * particular, in DragonFly the mount point overloading 2859 * mechanism uses the namecache only and the underlying 2860 * directory vnode is not a special case. 2861 */ 2862 int bsize, maxsize; 2863 2864 if (vp->v_type == VBLK || vp->v_type == VCHR) 2865 bsize = DEV_BSIZE; 2866 else if (vp->v_mount) 2867 bsize = vp->v_mount->mnt_stat.f_iosize; 2868 else 2869 bsize = size; 2870 2871 maxsize = size + (loffset & PAGE_MASK); 2872 maxsize = imax(maxsize, bsize); 2873 2874 bp = getnewbuf(blkflags, slptimeo, size, maxsize); 2875 if (bp == NULL) { 2876 if (slpflags || slptimeo) 2877 return NULL; 2878 goto loop; 2879 } 2880 2881 /* 2882 * Atomically insert the buffer into the hash, so that it can 2883 * be found by findblk(). 2884 * 2885 * If bgetvp() returns non-zero a collision occured, and the 2886 * bp will not be associated with the vnode. 2887 * 2888 * Make sure the translation layer has been cleared. 2889 */ 2890 bp->b_loffset = loffset; 2891 bp->b_bio2.bio_offset = NOOFFSET; 2892 /* bp->b_bio2.bio_next = NULL; */ 2893 2894 if (bgetvp(vp, bp)) { 2895 bp->b_flags |= B_INVAL; 2896 brelse(bp); 2897 goto loop; 2898 } 2899 2900 /* 2901 * All vnode-based buffers must be backed by a VM object. 2902 */ 2903 KKASSERT(vp->v_object != NULL); 2904 bp->b_flags |= B_VMIO; 2905 KKASSERT(bp->b_cmd == BUF_CMD_DONE); 2906 2907 get_mplock(); 2908 allocbuf(bp, size); 2909 rel_mplock(); 2910 } 2911 return (bp); 2912 } 2913 2914 /* 2915 * regetblk(bp) 2916 * 2917 * Reacquire a buffer that was previously released to the locked queue, 2918 * or reacquire a buffer which is interlocked by having bioops->io_deallocate 2919 * set B_LOCKED (which handles the acquisition race). 2920 * 2921 * To this end, either B_LOCKED must be set or the dependancy list must be 2922 * non-empty. 2923 * 2924 * MPSAFE 2925 */ 2926 void 2927 regetblk(struct buf *bp) 2928 { 2929 KKASSERT((bp->b_flags & B_LOCKED) || LIST_FIRST(&bp->b_dep) != NULL); 2930 BUF_LOCK(bp, LK_EXCLUSIVE | LK_RETRY); 2931 bremfree(bp); 2932 } 2933 2934 /* 2935 * geteblk: 2936 * 2937 * Get an empty, disassociated buffer of given size. The buffer is 2938 * initially set to B_INVAL. 2939 * 2940 * critical section protection is not required for the allocbuf() 2941 * call because races are impossible here. 2942 * 2943 * MPALMOSTSAFE 2944 */ 2945 struct buf * 2946 geteblk(int size) 2947 { 2948 struct buf *bp; 2949 int maxsize; 2950 2951 maxsize = (size + BKVAMASK) & ~BKVAMASK; 2952 2953 while ((bp = getnewbuf(0, 0, size, maxsize)) == 0) 2954 ; 2955 get_mplock(); 2956 allocbuf(bp, size); 2957 rel_mplock(); 2958 bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ 2959 return (bp); 2960 } 2961 2962 2963 /* 2964 * allocbuf: 2965 * 2966 * This code constitutes the buffer memory from either anonymous system 2967 * memory (in the case of non-VMIO operations) or from an associated 2968 * VM object (in the case of VMIO operations). This code is able to 2969 * resize a buffer up or down. 2970 * 2971 * Note that this code is tricky, and has many complications to resolve 2972 * deadlock or inconsistant data situations. Tread lightly!!! 2973 * There are B_CACHE and B_DELWRI interactions that must be dealt with by 2974 * the caller. Calling this code willy nilly can result in the loss of data. 2975 * 2976 * allocbuf() only adjusts B_CACHE for VMIO buffers. getblk() deals with 2977 * B_CACHE for the non-VMIO case. 2978 * 2979 * This routine does not need to be called from a critical section but you 2980 * must own the buffer. 
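 *
 * A minimal, hypothetical caller sketch (sizes are arbitrary and
 * error handling is omitted); the buffer must already be locked,
 * e.g. by getblk(), before it may be resized in place:
 *
 *	bp = getblk(vp, loffset, 8192, 0, 0);
 *	allocbuf(bp, 4096);	shrink in place; any B_CACHE or
 *				B_DELWRI interaction is the
 *				caller's problem
 *	brelse(bp);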
2981 * 2982 * NOTMPSAFE 2983 */ 2984 int 2985 allocbuf(struct buf *bp, int size) 2986 { 2987 int newbsize, mbsize; 2988 int i; 2989 2990 if (BUF_REFCNT(bp) == 0) 2991 panic("allocbuf: buffer not busy"); 2992 2993 if (bp->b_kvasize < size) 2994 panic("allocbuf: buffer too small"); 2995 2996 if ((bp->b_flags & B_VMIO) == 0) { 2997 caddr_t origbuf; 2998 int origbufsize; 2999 /* 3000 * Just get anonymous memory from the kernel. Don't 3001 * mess with B_CACHE. 3002 */ 3003 mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); 3004 if (bp->b_flags & B_MALLOC) 3005 newbsize = mbsize; 3006 else 3007 newbsize = round_page(size); 3008 3009 if (newbsize < bp->b_bufsize) { 3010 /* 3011 * Malloced buffers are not shrunk 3012 */ 3013 if (bp->b_flags & B_MALLOC) { 3014 if (newbsize) { 3015 bp->b_bcount = size; 3016 } else { 3017 kfree(bp->b_data, M_BIOBUF); 3018 if (bp->b_bufsize) { 3019 bufmallocspace -= bp->b_bufsize; 3020 bufspacewakeup(); 3021 bp->b_bufsize = 0; 3022 } 3023 bp->b_data = bp->b_kvabase; 3024 bp->b_bcount = 0; 3025 bp->b_flags &= ~B_MALLOC; 3026 } 3027 return 1; 3028 } 3029 vm_hold_free_pages( 3030 bp, 3031 (vm_offset_t) bp->b_data + newbsize, 3032 (vm_offset_t) bp->b_data + bp->b_bufsize); 3033 } else if (newbsize > bp->b_bufsize) { 3034 /* 3035 * We only use malloced memory on the first allocation. 3036 * and revert to page-allocated memory when the buffer 3037 * grows. 3038 */ 3039 if ((bufmallocspace < maxbufmallocspace) && 3040 (bp->b_bufsize == 0) && 3041 (mbsize <= PAGE_SIZE/2)) { 3042 3043 bp->b_data = kmalloc(mbsize, M_BIOBUF, M_WAITOK); 3044 bp->b_bufsize = mbsize; 3045 bp->b_bcount = size; 3046 bp->b_flags |= B_MALLOC; 3047 bufmallocspace += mbsize; 3048 return 1; 3049 } 3050 origbuf = NULL; 3051 origbufsize = 0; 3052 /* 3053 * If the buffer is growing on its other-than-first 3054 * allocation, then we revert to the page-allocation 3055 * scheme. 3056 */ 3057 if (bp->b_flags & B_MALLOC) { 3058 origbuf = bp->b_data; 3059 origbufsize = bp->b_bufsize; 3060 bp->b_data = bp->b_kvabase; 3061 if (bp->b_bufsize) { 3062 bufmallocspace -= bp->b_bufsize; 3063 bufspacewakeup(); 3064 bp->b_bufsize = 0; 3065 } 3066 bp->b_flags &= ~B_MALLOC; 3067 newbsize = round_page(newbsize); 3068 } 3069 vm_hold_load_pages( 3070 bp, 3071 (vm_offset_t) bp->b_data + bp->b_bufsize, 3072 (vm_offset_t) bp->b_data + newbsize); 3073 if (origbuf) { 3074 bcopy(origbuf, bp->b_data, origbufsize); 3075 kfree(origbuf, M_BIOBUF); 3076 } 3077 } 3078 } else { 3079 vm_page_t m; 3080 int desiredpages; 3081 3082 newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); 3083 desiredpages = ((int)(bp->b_loffset & PAGE_MASK) + 3084 newbsize + PAGE_MASK) >> PAGE_SHIFT; 3085 KKASSERT(desiredpages <= XIO_INTERNAL_PAGES); 3086 3087 if (bp->b_flags & B_MALLOC) 3088 panic("allocbuf: VMIO buffer can't be malloced"); 3089 /* 3090 * Set B_CACHE initially if buffer is 0 length or will become 3091 * 0-length. 3092 */ 3093 if (size == 0 || bp->b_bufsize == 0) 3094 bp->b_flags |= B_CACHE; 3095 3096 if (newbsize < bp->b_bufsize) { 3097 /* 3098 * DEV_BSIZE aligned new buffer size is less then the 3099 * DEV_BSIZE aligned existing buffer size. Figure out 3100 * if we have to remove any pages. 
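 *
 * For reference, desiredpages computed above is just the number
 * of pages the byte range [b_loffset, b_loffset + newbsize)
 * touches.  Assuming a 4K PAGE_SIZE, a buffer starting 512 bytes
 * into its first page with newbsize = 8192 needs
 * (512 + 8192 + 4095) >> PAGE_SHIFT = 3 pages; anything the
 * buffer holds beyond that count is unwired below.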
3101 */ 3102 if (desiredpages < bp->b_xio.xio_npages) { 3103 for (i = desiredpages; i < bp->b_xio.xio_npages; i++) { 3104 /* 3105 * the page is not freed here -- it 3106 * is the responsibility of 3107 * vnode_pager_setsize 3108 */ 3109 m = bp->b_xio.xio_pages[i]; 3110 KASSERT(m != bogus_page, 3111 ("allocbuf: bogus page found")); 3112 while (vm_page_sleep_busy(m, TRUE, "biodep")) 3113 ; 3114 3115 bp->b_xio.xio_pages[i] = NULL; 3116 vm_page_unwire(m, 0); 3117 } 3118 pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) + 3119 (desiredpages << PAGE_SHIFT), (bp->b_xio.xio_npages - desiredpages)); 3120 bp->b_xio.xio_npages = desiredpages; 3121 } 3122 } else if (size > bp->b_bcount) { 3123 /* 3124 * We are growing the buffer, possibly in a 3125 * byte-granular fashion. 3126 */ 3127 struct vnode *vp; 3128 vm_object_t obj; 3129 vm_offset_t toff; 3130 vm_offset_t tinc; 3131 3132 /* 3133 * Step 1, bring in the VM pages from the object, 3134 * allocating them if necessary. We must clear 3135 * B_CACHE if these pages are not valid for the 3136 * range covered by the buffer. 3137 * 3138 * critical section protection is required to protect 3139 * against interrupts unbusying and freeing pages 3140 * between our vm_page_lookup() and our 3141 * busycheck/wiring call. 3142 */ 3143 vp = bp->b_vp; 3144 obj = vp->v_object; 3145 3146 crit_enter(); 3147 while (bp->b_xio.xio_npages < desiredpages) { 3148 vm_page_t m; 3149 vm_pindex_t pi; 3150 3151 pi = OFF_TO_IDX(bp->b_loffset) + bp->b_xio.xio_npages; 3152 if ((m = vm_page_lookup(obj, pi)) == NULL) { 3153 /* 3154 * note: must allocate system pages 3155 * since blocking here could intefere 3156 * with paging I/O, no matter which 3157 * process we are. 3158 */ 3159 m = bio_page_alloc(obj, pi, desiredpages - bp->b_xio.xio_npages); 3160 if (m) { 3161 vm_page_wire(m); 3162 vm_page_wakeup(m); 3163 bp->b_flags &= ~B_CACHE; 3164 bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m; 3165 ++bp->b_xio.xio_npages; 3166 } 3167 continue; 3168 } 3169 3170 /* 3171 * We found a page. If we have to sleep on it, 3172 * retry because it might have gotten freed out 3173 * from under us. 3174 * 3175 * We can only test PG_BUSY here. Blocking on 3176 * m->busy might lead to a deadlock: 3177 * 3178 * vm_fault->getpages->cluster_read->allocbuf 3179 * 3180 */ 3181 3182 if (vm_page_sleep_busy(m, FALSE, "pgtblk")) 3183 continue; 3184 vm_page_flag_clear(m, PG_ZERO); 3185 vm_page_wire(m); 3186 bp->b_xio.xio_pages[bp->b_xio.xio_npages] = m; 3187 ++bp->b_xio.xio_npages; 3188 if (bp->b_act_count < m->act_count) 3189 bp->b_act_count = m->act_count; 3190 } 3191 crit_exit(); 3192 3193 /* 3194 * Step 2. We've loaded the pages into the buffer, 3195 * we have to figure out if we can still have B_CACHE 3196 * set. Note that B_CACHE is set according to the 3197 * byte-granular range ( bcount and size ), not the 3198 * aligned range ( newbsize ). 3199 * 3200 * The VM test is against m->valid, which is DEV_BSIZE 3201 * aligned. Needless to say, the validity of the data 3202 * needs to also be DEV_BSIZE aligned. Note that this 3203 * fails with NFS if the server or some other client 3204 * extends the file's EOF. If our buffer is resized, 3205 * B_CACHE may remain set! 
XXX 3206 */ 3207 3208 toff = bp->b_bcount; 3209 tinc = PAGE_SIZE - ((bp->b_loffset + toff) & PAGE_MASK); 3210 3211 while ((bp->b_flags & B_CACHE) && toff < size) { 3212 vm_pindex_t pi; 3213 3214 if (tinc > (size - toff)) 3215 tinc = size - toff; 3216 3217 pi = ((bp->b_loffset & PAGE_MASK) + toff) >> 3218 PAGE_SHIFT; 3219 3220 vfs_buf_test_cache( 3221 bp, 3222 bp->b_loffset, 3223 toff, 3224 tinc, 3225 bp->b_xio.xio_pages[pi] 3226 ); 3227 toff += tinc; 3228 tinc = PAGE_SIZE; 3229 } 3230 3231 /* 3232 * Step 3, fixup the KVM pmap. Remember that 3233 * bp->b_data is relative to bp->b_loffset, but 3234 * bp->b_loffset may be offset into the first page. 3235 */ 3236 3237 bp->b_data = (caddr_t) 3238 trunc_page((vm_offset_t)bp->b_data); 3239 pmap_qenter( 3240 (vm_offset_t)bp->b_data, 3241 bp->b_xio.xio_pages, 3242 bp->b_xio.xio_npages 3243 ); 3244 bp->b_data = (caddr_t)((vm_offset_t)bp->b_data | 3245 (vm_offset_t)(bp->b_loffset & PAGE_MASK)); 3246 } 3247 } 3248 3249 /* adjust space use on already-dirty buffer */ 3250 if (bp->b_flags & B_DELWRI) { 3251 dirtybufspace += newbsize - bp->b_bufsize; 3252 if (bp->b_flags & B_HEAVY) 3253 dirtybufspacehw += newbsize - bp->b_bufsize; 3254 } 3255 if (newbsize < bp->b_bufsize) 3256 bufspacewakeup(); 3257 bp->b_bufsize = newbsize; /* actual buffer allocation */ 3258 bp->b_bcount = size; /* requested buffer size */ 3259 return 1; 3260 } 3261 3262 /* 3263 * biowait: 3264 * 3265 * Wait for buffer I/O completion, returning error status. B_EINTR 3266 * is converted into an EINTR error but not cleared (since a chain 3267 * of biowait() calls may occur). 3268 * 3269 * On return bpdone() will have been called but the buffer will remain 3270 * locked and will not have been brelse()'d. 3271 * 3272 * NOTE! If a timeout is specified and ETIMEDOUT occurs the I/O is 3273 * likely still in progress on return. 3274 * 3275 * NOTE! This operation is on a BIO, not a BUF. 3276 * 3277 * NOTE! BIO_DONE is cleared by vn_strategy() 3278 * 3279 * MPSAFE 3280 */ 3281 static __inline int 3282 _biowait(struct bio *bio, const char *wmesg, int to) 3283 { 3284 struct buf *bp = bio->bio_buf; 3285 u_int32_t flags; 3286 u_int32_t nflags; 3287 int error; 3288 3289 KKASSERT(bio == &bp->b_bio1); 3290 for (;;) { 3291 flags = bio->bio_flags; 3292 if (flags & BIO_DONE) 3293 break; 3294 tsleep_interlock(bio, 0); 3295 nflags = flags | BIO_WANT; 3296 tsleep_interlock(bio, 0); 3297 if (atomic_cmpset_int(&bio->bio_flags, flags, nflags)) { 3298 if (wmesg) 3299 error = tsleep(bio, PINTERLOCKED, wmesg, to); 3300 else if (bp->b_cmd == BUF_CMD_READ) 3301 error = tsleep(bio, PINTERLOCKED, "biord", to); 3302 else 3303 error = tsleep(bio, PINTERLOCKED, "biowr", to); 3304 if (error) { 3305 kprintf("tsleep error biowait %d\n", error); 3306 return (error); 3307 } 3308 break; 3309 } 3310 } 3311 3312 /* 3313 * Finish up. 3314 */ 3315 KKASSERT(bp->b_cmd == BUF_CMD_DONE); 3316 bio->bio_flags &= ~(BIO_DONE | BIO_SYNC); 3317 if (bp->b_flags & B_EINTR) 3318 return (EINTR); 3319 if (bp->b_flags & B_ERROR) 3320 return (bp->b_error ? bp->b_error : EIO); 3321 return (0); 3322 } 3323 3324 int 3325 biowait(struct bio *bio, const char *wmesg) 3326 { 3327 return(_biowait(bio, wmesg, 0)); 3328 } 3329 3330 int 3331 biowait_timeout(struct bio *bio, const char *wmesg, int to) 3332 { 3333 return(_biowait(bio, wmesg, to)); 3334 } 3335 3336 /* 3337 * This associates a tracking count with an I/O. 
vn_strategy() and 3338 * dev_dstrategy() do this automatically but there are a few cases 3339 * where a vnode or device layer is bypassed when a block translation 3340 * is cached. In such cases bio_start_transaction() may be called on 3341 * the bypassed layers so the system gets an I/O in progress indication 3342 * for those higher layers. 3343 */ 3344 void 3345 bio_start_transaction(struct bio *bio, struct bio_track *track) 3346 { 3347 bio->bio_track = track; 3348 bio_track_ref(track); 3349 } 3350 3351 /* 3352 * Initiate I/O on a vnode. 3353 */ 3354 void 3355 vn_strategy(struct vnode *vp, struct bio *bio) 3356 { 3357 struct bio_track *track; 3358 3359 KKASSERT(bio->bio_buf->b_cmd != BUF_CMD_DONE); 3360 if (bio->bio_buf->b_cmd == BUF_CMD_READ) 3361 track = &vp->v_track_read; 3362 else 3363 track = &vp->v_track_write; 3364 KKASSERT((bio->bio_flags & BIO_DONE) == 0); 3365 bio->bio_track = track; 3366 bio_track_ref(track); 3367 vop_strategy(*vp->v_ops, vp, bio); 3368 } 3369 3370 /* 3371 * bpdone: 3372 * 3373 * Finish I/O on a buffer after all BIOs have been processed. 3374 * Called when the bio chain is exhausted or by biowait. If called 3375 * by biowait, elseit is typically 0. 3376 * 3377 * bpdone is also responsible for setting B_CACHE in a B_VMIO bp. 3378 * In a non-VMIO bp, B_CACHE will be set on the next getblk() 3379 * assuming B_INVAL is clear. 3380 * 3381 * For the VMIO case, we set B_CACHE if the op was a read and no 3382 * read error occured, or if the op was a write. B_CACHE is never 3383 * set if the buffer is invalid or otherwise uncacheable. 3384 * 3385 * bpdone does not mess with B_INVAL, allowing the I/O routine or the 3386 * initiator to leave B_INVAL set to brelse the buffer out of existance 3387 * in the biodone routine. 3388 */ 3389 void 3390 bpdone(struct buf *bp, int elseit) 3391 { 3392 buf_cmd_t cmd; 3393 3394 KASSERT(BUF_REFCNTNB(bp) > 0, 3395 ("biodone: bp %p not busy %d", bp, BUF_REFCNTNB(bp))); 3396 KASSERT(bp->b_cmd != BUF_CMD_DONE, 3397 ("biodone: bp %p already done!", bp)); 3398 3399 /* 3400 * No more BIOs are left. All completion functions have been dealt 3401 * with, now we clean up the buffer. 3402 */ 3403 cmd = bp->b_cmd; 3404 bp->b_cmd = BUF_CMD_DONE; 3405 3406 /* 3407 * Only reads and writes are processed past this point. 3408 */ 3409 if (cmd != BUF_CMD_READ && cmd != BUF_CMD_WRITE) { 3410 if (cmd == BUF_CMD_FREEBLKS) 3411 bp->b_flags |= B_NOCACHE; 3412 if (elseit) 3413 brelse(bp); 3414 return; 3415 } 3416 3417 /* 3418 * Warning: softupdates may re-dirty the buffer, and HAMMER can do 3419 * a lot worse. XXX - move this above the clearing of b_cmd 3420 */ 3421 if (LIST_FIRST(&bp->b_dep) != NULL) 3422 buf_complete(bp); 3423 3424 /* 3425 * A failed write must re-dirty the buffer unless B_INVAL 3426 * was set. Only applicable to normal buffers (with VPs). 3427 * vinum buffers may not have a vp. 
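 *
 * Expressed as a rule: B_ERROR set with B_INVAL clear means the
 * data is still wanted, so B_NOCACHE is stripped and bdirty() is
 * called; B_ERROR together with B_INVAL means the contents were
 * already given up on and no re-dirtying is done.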
3428 */ 3429 if (cmd == BUF_CMD_WRITE && 3430 (bp->b_flags & (B_ERROR | B_INVAL)) == B_ERROR) { 3431 bp->b_flags &= ~B_NOCACHE; 3432 if (bp->b_vp) 3433 bdirty(bp); 3434 } 3435 3436 if (bp->b_flags & B_VMIO) { 3437 int i; 3438 vm_ooffset_t foff; 3439 vm_page_t m; 3440 vm_object_t obj; 3441 int iosize; 3442 struct vnode *vp = bp->b_vp; 3443 3444 obj = vp->v_object; 3445 3446 #if defined(VFS_BIO_DEBUG) 3447 if (vp->v_auxrefs == 0) 3448 panic("biodone: zero vnode hold count"); 3449 if ((vp->v_flag & VOBJBUF) == 0) 3450 panic("biodone: vnode is not setup for merged cache"); 3451 #endif 3452 3453 foff = bp->b_loffset; 3454 KASSERT(foff != NOOFFSET, ("biodone: no buffer offset")); 3455 KASSERT(obj != NULL, ("biodone: missing VM object")); 3456 3457 #if defined(VFS_BIO_DEBUG) 3458 if (obj->paging_in_progress < bp->b_xio.xio_npages) { 3459 kprintf("biodone: paging in progress(%d) < bp->b_xio.xio_npages(%d)\n", 3460 obj->paging_in_progress, bp->b_xio.xio_npages); 3461 } 3462 #endif 3463 3464 /* 3465 * Set B_CACHE if the op was a normal read and no error 3466 * occured. B_CACHE is set for writes in the b*write() 3467 * routines. 3468 */ 3469 iosize = bp->b_bcount - bp->b_resid; 3470 if (cmd == BUF_CMD_READ && 3471 (bp->b_flags & (B_INVAL|B_NOCACHE|B_ERROR)) == 0) { 3472 bp->b_flags |= B_CACHE; 3473 } 3474 3475 crit_enter(); 3476 get_mplock(); 3477 for (i = 0; i < bp->b_xio.xio_npages; i++) { 3478 int bogusflag = 0; 3479 int resid; 3480 3481 resid = ((foff + PAGE_SIZE) & ~(off_t)PAGE_MASK) - foff; 3482 if (resid > iosize) 3483 resid = iosize; 3484 3485 /* 3486 * cleanup bogus pages, restoring the originals. Since 3487 * the originals should still be wired, we don't have 3488 * to worry about interrupt/freeing races destroying 3489 * the VM object association. 3490 */ 3491 m = bp->b_xio.xio_pages[i]; 3492 if (m == bogus_page) { 3493 bogusflag = 1; 3494 m = vm_page_lookup(obj, OFF_TO_IDX(foff)); 3495 if (m == NULL) 3496 panic("biodone: page disappeared"); 3497 bp->b_xio.xio_pages[i] = m; 3498 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 3499 bp->b_xio.xio_pages, bp->b_xio.xio_npages); 3500 } 3501 #if defined(VFS_BIO_DEBUG) 3502 if (OFF_TO_IDX(foff) != m->pindex) { 3503 kprintf("biodone: foff(%lu)/m->pindex(%ld) " 3504 "mismatch\n", 3505 (unsigned long)foff, (long)m->pindex); 3506 } 3507 #endif 3508 3509 /* 3510 * In the write case, the valid and clean bits are 3511 * already changed correctly (see bdwrite()), so we 3512 * only need to do this here in the read case. 3513 */ 3514 if (cmd == BUF_CMD_READ && !bogusflag && resid > 0) { 3515 vfs_clean_one_page(bp, i, m); 3516 } 3517 vm_page_flag_clear(m, PG_ZERO); 3518 3519 /* 3520 * when debugging new filesystems or buffer I/O 3521 * methods, this is the most common error that pops 3522 * up. if you see this, you have not set the page 3523 * busy flag correctly!!! 
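 *
 * (The soft-busy count is normally raised by vfs_busy_pages(),
 * which runs vm_page_io_start() on each page before the strategy
 * routine is called; the vm_page_io_finish() below is its
 * counterpart, so a zero count here means that pairing was
 * broken somewhere along the way.)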
3524 */ 3525 if (m->busy == 0) { 3526 kprintf("biodone: page busy < 0, " 3527 "pindex: %d, foff: 0x(%x,%x), " 3528 "resid: %d, index: %d\n", 3529 (int) m->pindex, (int)(foff >> 32), 3530 (int) foff & 0xffffffff, resid, i); 3531 if (!vn_isdisk(vp, NULL)) 3532 kprintf(" iosize: %ld, loffset: %lld, " 3533 "flags: 0x%08x, npages: %d\n", 3534 bp->b_vp->v_mount->mnt_stat.f_iosize, 3535 (long long)bp->b_loffset, 3536 bp->b_flags, bp->b_xio.xio_npages); 3537 else 3538 kprintf(" VDEV, loffset: %lld, flags: 0x%08x, npages: %d\n", 3539 (long long)bp->b_loffset, 3540 bp->b_flags, bp->b_xio.xio_npages); 3541 kprintf(" valid: 0x%x, dirty: 0x%x, wired: %d\n", 3542 m->valid, m->dirty, m->wire_count); 3543 panic("biodone: page busy < 0"); 3544 } 3545 vm_page_io_finish(m); 3546 vm_object_pip_subtract(obj, 1); 3547 foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK; 3548 iosize -= resid; 3549 } 3550 if (obj) 3551 vm_object_pip_wakeupn(obj, 0); 3552 rel_mplock(); 3553 crit_exit(); 3554 } 3555 3556 /* 3557 * Finish up by releasing the buffer. There are no more synchronous 3558 * or asynchronous completions, those were handled by bio_done 3559 * callbacks. 3560 */ 3561 if (elseit) { 3562 if (bp->b_flags & (B_NOCACHE|B_INVAL|B_ERROR|B_RELBUF)) 3563 brelse(bp); 3564 else 3565 bqrelse(bp); 3566 } 3567 } 3568 3569 /* 3570 * Normal biodone. 3571 */ 3572 void 3573 biodone(struct bio *bio) 3574 { 3575 struct buf *bp = bio->bio_buf; 3576 3577 runningbufwakeup(bp); 3578 3579 /* 3580 * Run up the chain of BIO's. Leave b_cmd intact for the duration. 3581 */ 3582 while (bio) { 3583 biodone_t *done_func; 3584 struct bio_track *track; 3585 3586 /* 3587 * BIO tracking. Most but not all BIOs are tracked. 3588 */ 3589 if ((track = bio->bio_track) != NULL) { 3590 bio_track_rel(track); 3591 bio->bio_track = NULL; 3592 } 3593 3594 /* 3595 * A bio_done function terminates the loop. The function 3596 * will be responsible for any further chaining and/or 3597 * buffer management. 3598 * 3599 * WARNING! The done function can deallocate the buffer! 3600 */ 3601 if ((done_func = bio->bio_done) != NULL) { 3602 bio->bio_done = NULL; 3603 done_func(bio); 3604 return; 3605 } 3606 bio = bio->bio_prev; 3607 } 3608 3609 /* 3610 * If we've run out of bio's do normal [a]synchronous completion. 3611 */ 3612 bpdone(bp, 1); 3613 } 3614 3615 /* 3616 * Synchronous biodone - this terminates a synchronous BIO. 3617 * 3618 * bpdone() is called with elseit=FALSE, leaving the buffer completed 3619 * but still locked. The caller must brelse() the buffer after waiting 3620 * for completion. 3621 */ 3622 void 3623 biodone_sync(struct bio *bio) 3624 { 3625 struct buf *bp = bio->bio_buf; 3626 int flags; 3627 int nflags; 3628 3629 KKASSERT(bio == &bp->b_bio1); 3630 bpdone(bp, 0); 3631 3632 for (;;) { 3633 flags = bio->bio_flags; 3634 nflags = (flags | BIO_DONE) & ~BIO_WANT; 3635 3636 if (atomic_cmpset_int(&bio->bio_flags, flags, nflags)) { 3637 if (flags & BIO_WANT) 3638 wakeup(bio); 3639 break; 3640 } 3641 } 3642 } 3643 3644 /* 3645 * vfs_unbusy_pages: 3646 * 3647 * This routine is called in lieu of iodone in the case of 3648 * incomplete I/O. This keeps the busy status for pages 3649 * consistant. 
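 *
 * A hypothetical caller sketch (the condition name is made up,
 * error handling elided): a path that busies the pages but then
 * cannot issue the transfer unwinds with this routine instead of
 * letting bpdone() do it:
 *
 *	vfs_busy_pages(vp, bp);
 *	if (cannot_start_io) {
 *		vfs_unbusy_pages(bp);
 *		... report the error without completing the bio ...
 *	} else {
 *		vn_strategy(vp, &bp->b_bio1);
 *	}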
3650 */ 3651 void 3652 vfs_unbusy_pages(struct buf *bp) 3653 { 3654 int i; 3655 3656 runningbufwakeup(bp); 3657 if (bp->b_flags & B_VMIO) { 3658 struct vnode *vp = bp->b_vp; 3659 vm_object_t obj; 3660 3661 obj = vp->v_object; 3662 3663 for (i = 0; i < bp->b_xio.xio_npages; i++) { 3664 vm_page_t m = bp->b_xio.xio_pages[i]; 3665 3666 /* 3667 * When restoring bogus changes the original pages 3668 * should still be wired, so we are in no danger of 3669 * losing the object association and do not need 3670 * critical section protection particularly. 3671 */ 3672 if (m == bogus_page) { 3673 m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_loffset) + i); 3674 if (!m) { 3675 panic("vfs_unbusy_pages: page missing"); 3676 } 3677 bp->b_xio.xio_pages[i] = m; 3678 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 3679 bp->b_xio.xio_pages, bp->b_xio.xio_npages); 3680 } 3681 vm_object_pip_subtract(obj, 1); 3682 vm_page_flag_clear(m, PG_ZERO); 3683 vm_page_io_finish(m); 3684 } 3685 vm_object_pip_wakeupn(obj, 0); 3686 } 3687 } 3688 3689 /* 3690 * vfs_busy_pages: 3691 * 3692 * This routine is called before a device strategy routine. 3693 * It is used to tell the VM system that paging I/O is in 3694 * progress, and treat the pages associated with the buffer 3695 * almost as being PG_BUSY. Also the object 'paging_in_progress' 3696 * flag is handled to make sure that the object doesn't become 3697 * inconsistant. 3698 * 3699 * Since I/O has not been initiated yet, certain buffer flags 3700 * such as B_ERROR or B_INVAL may be in an inconsistant state 3701 * and should be ignored. 3702 */ 3703 void 3704 vfs_busy_pages(struct vnode *vp, struct buf *bp) 3705 { 3706 int i, bogus; 3707 struct lwp *lp = curthread->td_lwp; 3708 3709 /* 3710 * The buffer's I/O command must already be set. If reading, 3711 * B_CACHE must be 0 (double check against callers only doing 3712 * I/O when B_CACHE is 0). 3713 */ 3714 KKASSERT(bp->b_cmd != BUF_CMD_DONE); 3715 KKASSERT(bp->b_cmd == BUF_CMD_WRITE || (bp->b_flags & B_CACHE) == 0); 3716 3717 if (bp->b_flags & B_VMIO) { 3718 vm_object_t obj; 3719 3720 obj = vp->v_object; 3721 KASSERT(bp->b_loffset != NOOFFSET, 3722 ("vfs_busy_pages: no buffer offset")); 3723 3724 /* 3725 * Loop until none of the pages are busy. 3726 */ 3727 retry: 3728 for (i = 0; i < bp->b_xio.xio_npages; i++) { 3729 vm_page_t m = bp->b_xio.xio_pages[i]; 3730 3731 if (vm_page_sleep_busy(m, FALSE, "vbpage")) 3732 goto retry; 3733 } 3734 3735 /* 3736 * Setup for I/O, soft-busy the page right now because 3737 * the next loop may block. 3738 */ 3739 for (i = 0; i < bp->b_xio.xio_npages; i++) { 3740 vm_page_t m = bp->b_xio.xio_pages[i]; 3741 3742 vm_page_flag_clear(m, PG_ZERO); 3743 if ((bp->b_flags & B_CLUSTER) == 0) { 3744 vm_object_pip_add(obj, 1); 3745 vm_page_io_start(m); 3746 } 3747 } 3748 3749 /* 3750 * Adjust protections for I/O and do bogus-page mapping. 3751 * Assume that vm_page_protect() can block (it can block 3752 * if VM_PROT_NONE, don't take any chances regardless). 3753 * 3754 * In particularly note that for writes we must incorporate 3755 * page dirtyness from the VM system into the buffer's 3756 * dirty range. 3757 * 3758 * For reads we theoretically must incorporate page dirtyness 3759 * from the VM system to determine if the page needs bogus 3760 * replacement, but we shortcut the test by simply checking 3761 * that all m->valid bits are set, indicating that the page 3762 * is fully valid and does not need to be re-read. 
For any 3763 * VM system dirtyness the page will also be fully valid 3764 * since it was mapped at one point. 3765 */ 3766 bogus = 0; 3767 for (i = 0; i < bp->b_xio.xio_npages; i++) { 3768 vm_page_t m = bp->b_xio.xio_pages[i]; 3769 3770 vm_page_flag_clear(m, PG_ZERO); /* XXX */ 3771 if (bp->b_cmd == BUF_CMD_WRITE) { 3772 /* 3773 * When readying a vnode-backed buffer for 3774 * a write we must zero-fill any invalid 3775 * portions of the backing VM pages, mark 3776 * it valid and clear related dirty bits. 3777 * 3778 * vfs_clean_one_page() incorporates any 3779 * VM dirtyness and updates the b_dirtyoff 3780 * range (after we've made the page RO). 3781 * 3782 * It is also expected that the pmap modified 3783 * bit has already been cleared by the 3784 * vm_page_protect(). We may not be able 3785 * to clear all dirty bits for a page if it 3786 * was also memory mapped (NFS). 3787 */ 3788 vm_page_protect(m, VM_PROT_READ); 3789 vfs_clean_one_page(bp, i, m); 3790 } else if (m->valid == VM_PAGE_BITS_ALL) { 3791 /* 3792 * When readying a vnode-backed buffer for 3793 * read we must replace any dirty pages with 3794 * a bogus page so dirty data is not destroyed 3795 * when filling gaps. 3796 * 3797 * To avoid testing whether the page is 3798 * dirty we instead test that the page was 3799 * at some point mapped (m->valid fully 3800 * valid) with the understanding that 3801 * this also covers the dirty case. 3802 */ 3803 bp->b_xio.xio_pages[i] = bogus_page; 3804 bogus++; 3805 } else if (m->valid & m->dirty) { 3806 /* 3807 * This case should not occur as partial 3808 * dirtyment can only happen if the buffer 3809 * is B_CACHE, and this code is not entered 3810 * if the buffer is B_CACHE. 3811 */ 3812 kprintf("Warning: vfs_busy_pages - page not " 3813 "fully valid! loff=%jx bpf=%08x " 3814 "idx=%d val=%02x dir=%02x\n", 3815 (intmax_t)bp->b_loffset, bp->b_flags, 3816 i, m->valid, m->dirty); 3817 vm_page_protect(m, VM_PROT_NONE); 3818 } else { 3819 /* 3820 * The page is not valid and can be made 3821 * part of the read. 3822 */ 3823 vm_page_protect(m, VM_PROT_NONE); 3824 } 3825 } 3826 if (bogus) { 3827 pmap_qenter(trunc_page((vm_offset_t)bp->b_data), 3828 bp->b_xio.xio_pages, bp->b_xio.xio_npages); 3829 } 3830 } 3831 3832 /* 3833 * This is the easiest place to put the process accounting for the I/O 3834 * for now. 3835 */ 3836 if (lp != NULL) { 3837 if (bp->b_cmd == BUF_CMD_READ) 3838 lp->lwp_ru.ru_inblock++; 3839 else 3840 lp->lwp_ru.ru_oublock++; 3841 } 3842 } 3843 3844 /* 3845 * vfs_clean_pages: 3846 * 3847 * Tell the VM system that the pages associated with this buffer 3848 * are clean. This is used for delayed writes where the data is 3849 * going to go to disk eventually without additional VM intevention. 3850 * 3851 * Note that while we only really need to clean through to b_bcount, we 3852 * just go ahead and clean through to b_bufsize. 3853 */ 3854 static void 3855 vfs_clean_pages(struct buf *bp) 3856 { 3857 vm_page_t m; 3858 int i; 3859 3860 if ((bp->b_flags & B_VMIO) == 0) 3861 return; 3862 3863 KASSERT(bp->b_loffset != NOOFFSET, 3864 ("vfs_clean_pages: no buffer offset")); 3865 3866 for (i = 0; i < bp->b_xio.xio_npages; i++) { 3867 m = bp->b_xio.xio_pages[i]; 3868 vfs_clean_one_page(bp, i, m); 3869 } 3870 } 3871 3872 /* 3873 * vfs_clean_one_page: 3874 * 3875 * Set the valid bits and clear the dirty bits in a page within a 3876 * buffer. The range is restricted to the buffer's size and the 3877 * buffer's logical offset might index into the first page. 
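 *
 * As a worked illustration (4K PAGE_SIZE assumed): a buffer whose
 * b_loffset sits 512 bytes into its first page with b_bcount =
 * 6144 cleans [512, 4096) of page 0 and [0, 2560) of page 1,
 * i.e. exactly b_bcount bytes and never the sectors outside the
 * buffer.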
3878 * 3879 * The caller has busied or soft-busied the page and it is not mapped, 3880 * test and incorporate the dirty bits into b_dirtyoff/end before 3881 * clearing them. Note that we need to clear the pmap modified bits 3882 * after determining the the page was dirty, vm_page_set_validclean() 3883 * does not do it for us. 3884 * 3885 * This routine is typically called after a read completes (dirty should 3886 * be zero in that case as we are not called on bogus-replace pages), 3887 * or before a write is initiated. 3888 */ 3889 static void 3890 vfs_clean_one_page(struct buf *bp, int pageno, vm_page_t m) 3891 { 3892 int bcount; 3893 int xoff; 3894 int soff; 3895 int eoff; 3896 3897 /* 3898 * Calculate offset range within the page but relative to buffer's 3899 * loffset. loffset might be offset into the first page. 3900 */ 3901 xoff = (int)bp->b_loffset & PAGE_MASK; /* loffset offset into pg 0 */ 3902 bcount = bp->b_bcount + xoff; /* offset adjusted */ 3903 3904 if (pageno == 0) { 3905 soff = xoff; 3906 eoff = PAGE_SIZE; 3907 } else { 3908 soff = (pageno << PAGE_SHIFT); 3909 eoff = soff + PAGE_SIZE; 3910 } 3911 if (eoff > bcount) 3912 eoff = bcount; 3913 if (soff >= eoff) 3914 return; 3915 3916 /* 3917 * Test dirty bits and adjust b_dirtyoff/end. 3918 * 3919 * If dirty pages are incorporated into the bp any prior 3920 * B_NEEDCOMMIT state (NFS) must be cleared because the 3921 * caller has not taken into account the new dirty data. 3922 * 3923 * If the page was memory mapped the dirty bits might go beyond the 3924 * end of the buffer, but we can't really make the assumption that 3925 * a file EOF straddles the buffer (even though this is the case for 3926 * NFS if B_NEEDCOMMIT is also set). So for the purposes of clearing 3927 * B_NEEDCOMMIT we only test the dirty bits covered by the buffer. 3928 * This also saves some console spam. 3929 */ 3930 vm_page_test_dirty(m); 3931 if (m->dirty) { 3932 pmap_clear_modify(m); 3933 if ((bp->b_flags & B_NEEDCOMMIT) && 3934 (m->dirty & vm_page_bits(soff & PAGE_MASK, eoff - soff))) { 3935 kprintf("Warning: vfs_clean_one_page: bp %p " 3936 "loff=%jx,%d flgs=%08x clr B_NEEDCOMMIT\n", 3937 bp, (intmax_t)bp->b_loffset, bp->b_bcount, 3938 bp->b_flags); 3939 bp->b_flags &= ~B_NEEDCOMMIT; 3940 } 3941 if (bp->b_dirtyoff > soff - xoff) 3942 bp->b_dirtyoff = soff - xoff; 3943 if (bp->b_dirtyend < eoff - xoff) 3944 bp->b_dirtyend = eoff - xoff; 3945 } 3946 3947 /* 3948 * Set related valid bits, clear related dirty bits. 3949 * Does not mess with the pmap modified bit. 3950 * 3951 * WARNING! We cannot just clear all of m->dirty here as the 3952 * buffer cache buffers may use a DEV_BSIZE'd aligned 3953 * block size, or have an odd size (e.g. NFS at file EOF). 3954 * The putpages code can clear m->dirty to 0. 3955 * 3956 * If a VOP_WRITE generates a buffer cache buffer which 3957 * covers the same space as mapped writable pages the 3958 * buffer flush might not be able to clear all the dirty 3959 * bits and still require a putpages from the VM system 3960 * to finish it off. 3961 */ 3962 vm_page_set_validclean(m, soff & PAGE_MASK, eoff - soff); 3963 } 3964 3965 /* 3966 * vfs_bio_clrbuf: 3967 * 3968 * Clear a buffer. This routine essentially fakes an I/O, so we need 3969 * to clear B_ERROR and B_INVAL. 3970 * 3971 * Note that while we only theoretically need to clear through b_bcount, 3972 * we go ahead and clear through b_bufsize. 
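 *
 * The valid-bit masks used below cover DEV_BSIZE chunks of a
 * page.  As an illustration with DEV_BSIZE = 512: a 2048 byte,
 * page-aligned buffer uses mask = (1 << (2048 / 512)) - 1 =
 * 0x000f, i.e. the first four 512 byte chunks of page 0; chunks
 * already marked valid are left alone and only the remainder is
 * bzero()'d.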
3973  */
3974 
3975 void
3976 vfs_bio_clrbuf(struct buf *bp)
3977 {
3978 	int i, mask = 0;
3979 	caddr_t sa, ea;
3980 	if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) {
3981 	bp->b_flags &= ~(B_INVAL | B_EINTR | B_ERROR);
3982 	if ((bp->b_xio.xio_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
3983 	(bp->b_loffset & PAGE_MASK) == 0) {
3984 	mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
3985 	if ((bp->b_xio.xio_pages[0]->valid & mask) == mask) {
3986 	bp->b_resid = 0;
3987 	return;
3988 	}
3989 	if (((bp->b_xio.xio_pages[0]->flags & PG_ZERO) == 0) &&
3990 	((bp->b_xio.xio_pages[0]->valid & mask) == 0)) {
3991 	bzero(bp->b_data, bp->b_bufsize);
3992 	bp->b_xio.xio_pages[0]->valid |= mask;
3993 	bp->b_resid = 0;
3994 	return;
3995 	}
3996 	}
3997 	sa = bp->b_data;
3998 	for (i = 0; i < bp->b_xio.xio_npages; i++, sa = ea) {
3999 	int j = ((vm_offset_t)sa & PAGE_MASK) / DEV_BSIZE;
4000 	ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE);
4001 	ea = (caddr_t)(vm_offset_t)ulmin(
4002 	(u_long)(vm_offset_t)ea,
4003 	(u_long)(vm_offset_t)bp->b_data + bp->b_bufsize);
4004 	mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
4005 	if ((bp->b_xio.xio_pages[i]->valid & mask) == mask)
4006 	continue;
4007 	if ((bp->b_xio.xio_pages[i]->valid & mask) == 0) {
4008 	if ((bp->b_xio.xio_pages[i]->flags & PG_ZERO) == 0) {
4009 	bzero(sa, ea - sa);
4010 	}
4011 	} else {
4012 	for (; sa < ea; sa += DEV_BSIZE, j++) {
4013 	if (((bp->b_xio.xio_pages[i]->flags & PG_ZERO) == 0) &&
4014 	(bp->b_xio.xio_pages[i]->valid & (1<<j)) == 0)
4015 	bzero(sa, DEV_BSIZE);
4016 	}
4017 	}
4018 	bp->b_xio.xio_pages[i]->valid |= mask;
4019 	vm_page_flag_clear(bp->b_xio.xio_pages[i], PG_ZERO);
4020 	}
4021 	bp->b_resid = 0;
4022 	} else {
4023 	clrbuf(bp);
4024 	}
4025 }
4026 
4027 /*
4028  * vm_hold_load_pages:
4029  *
4030  *	Load pages into the buffer's address space.  The pages are
4031  *	allocated from the kernel object in order to reduce interference
4032  *	with any VM paging I/O activity.  The range of loaded
4033  *	pages will be wired.
4034  *
4035  *	If a page cannot be allocated, the 'pagedaemon' is woken up to
4036  *	retrieve the full range (to - from) of pages.
4037  *
4038  */
4039 void
4040 vm_hold_load_pages(struct buf *bp, vm_offset_t from, vm_offset_t to)
4041 {
4042 	vm_offset_t pg;
4043 	vm_page_t p;
4044 	int index;
4045 
4046 	to = round_page(to);
4047 	from = round_page(from);
4048 	index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
4049 
4050 	pg = from;
4051 	while (pg < to) {
4052 	/*
4053 	 * Note: must allocate system pages since blocking here
4054 	 * could interfere with paging I/O, no matter which
4055 	 * process we are.
4056 	 */
4057 	p = bio_page_alloc(&kernel_object, pg >> PAGE_SHIFT,
4058 	(vm_pindex_t)((to - pg) >> PAGE_SHIFT));
4059 	if (p) {
4060 	vm_page_wire(p);
4061 	p->valid = VM_PAGE_BITS_ALL;
4062 	vm_page_flag_clear(p, PG_ZERO);
4063 	pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
4064 	bp->b_xio.xio_pages[index] = p;
4065 	vm_page_wakeup(p);
4066 
4067 	pg += PAGE_SIZE;
4068 	++index;
4069 	}
4070 	}
4071 	bp->b_xio.xio_npages = index;
4072 }
4073 
4074 /*
4075  * Allocate pages for a buffer cache buffer.
4076  *
4077  * Under extremely severe memory conditions even allocating out of the
4078  * system reserve can fail.  If this occurs we must allocate out of the
4079  * interrupt reserve to avoid a deadlock with the pageout daemon.
4080  *
4081  * The pageout daemon can run (putpages -> VOP_WRITE -> getblk -> allocbuf).
4082  * If the buffer cache's vm_page_alloc() fails a vm_wait() can deadlock
4083  * against the pageout daemon if pages are not freed from other sources.
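 *
 * (Editorial outline of the fallback sequence implemented below:
 * (1) vm_page_alloc() with VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM;
 * (2) on failure, charge vm_pageout_deficit, call recoverbufpages(), and
 *     return NULL if the page materialized while we may have blocked so
 *     the caller can look it up again;
 * (3) otherwise retry the allocation with VM_ALLOC_INTERRUPT added,
 *     throttling via vm_wait() when memory is severe or the allocation
 *     still fails.)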
4084 */ 4085 static 4086 vm_page_t 4087 bio_page_alloc(vm_object_t obj, vm_pindex_t pg, int deficit) 4088 { 4089 vm_page_t p; 4090 4091 /* 4092 * Try a normal allocation, allow use of system reserve. 4093 */ 4094 p = vm_page_alloc(obj, pg, VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM); 4095 if (p) 4096 return(p); 4097 4098 /* 4099 * The normal allocation failed and we clearly have a page 4100 * deficit. Try to reclaim some clean VM pages directly 4101 * from the buffer cache. 4102 */ 4103 vm_pageout_deficit += deficit; 4104 recoverbufpages(); 4105 4106 /* 4107 * We may have blocked, the caller will know what to do if the 4108 * page now exists. 4109 */ 4110 if (vm_page_lookup(obj, pg)) 4111 return(NULL); 4112 4113 /* 4114 * Allocate and allow use of the interrupt reserve. 4115 * 4116 * If after all that we still can't allocate a VM page we are 4117 * in real trouble, but we slog on anyway hoping that the system 4118 * won't deadlock. 4119 */ 4120 p = vm_page_alloc(obj, pg, VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM | 4121 VM_ALLOC_INTERRUPT); 4122 if (p) { 4123 if (vm_page_count_severe()) { 4124 kprintf("bio_page_alloc: WARNING emergency page " 4125 "allocation\n"); 4126 vm_wait(hz / 20); 4127 } 4128 } else { 4129 kprintf("bio_page_alloc: WARNING emergency page " 4130 "allocation failed\n"); 4131 vm_wait(hz * 5); 4132 } 4133 return(p); 4134 } 4135 4136 /* 4137 * vm_hold_free_pages: 4138 * 4139 * Return pages associated with the buffer back to the VM system. 4140 * 4141 * The range of pages underlying the buffer's address space will 4142 * be unmapped and un-wired. 4143 */ 4144 void 4145 vm_hold_free_pages(struct buf *bp, vm_offset_t from, vm_offset_t to) 4146 { 4147 vm_offset_t pg; 4148 vm_page_t p; 4149 int index, newnpages; 4150 4151 from = round_page(from); 4152 to = round_page(to); 4153 newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; 4154 4155 for (pg = from; pg < to; pg += PAGE_SIZE, index++) { 4156 p = bp->b_xio.xio_pages[index]; 4157 if (p && (index < bp->b_xio.xio_npages)) { 4158 if (p->busy) { 4159 kprintf("vm_hold_free_pages: doffset: %lld, " 4160 "loffset: %lld\n", 4161 (long long)bp->b_bio2.bio_offset, 4162 (long long)bp->b_loffset); 4163 } 4164 bp->b_xio.xio_pages[index] = NULL; 4165 pmap_kremove(pg); 4166 vm_page_busy(p); 4167 vm_page_unwire(p, 0); 4168 vm_page_free(p); 4169 } 4170 } 4171 bp->b_xio.xio_npages = newnpages; 4172 } 4173 4174 /* 4175 * vmapbuf: 4176 * 4177 * Map a user buffer into KVM via a pbuf. On return the buffer's 4178 * b_data, b_bufsize, and b_bcount will be set, and its XIO page array 4179 * initialized. 4180 */ 4181 int 4182 vmapbuf(struct buf *bp, caddr_t udata, int bytes) 4183 { 4184 caddr_t addr; 4185 vm_offset_t va; 4186 vm_page_t m; 4187 int vmprot; 4188 int error; 4189 int pidx; 4190 int i; 4191 4192 /* 4193 * bp had better have a command and it better be a pbuf. 4194 */ 4195 KKASSERT(bp->b_cmd != BUF_CMD_DONE); 4196 KKASSERT(bp->b_flags & B_PAGING); 4197 4198 if (bytes < 0) 4199 return (-1); 4200 4201 /* 4202 * Map the user data into KVM. Mappings have to be page-aligned. 4203 */ 4204 addr = (caddr_t)trunc_page((vm_offset_t)udata); 4205 pidx = 0; 4206 4207 vmprot = VM_PROT_READ; 4208 if (bp->b_cmd == BUF_CMD_READ) 4209 vmprot |= VM_PROT_WRITE; 4210 4211 while (addr < udata + bytes) { 4212 /* 4213 * Do the vm_fault if needed; do the copy-on-write thing 4214 * when reading stuff off device into memory. 4215 * 4216 * vm_fault_page*() returns a held VM page. 4217 */ 4218 va = (addr >= udata) ? 
(vm_offset_t)addr : (vm_offset_t)udata;
4219 	va = trunc_page(va);
4220 
4221 	m = vm_fault_page_quick(va, vmprot, &error);
4222 	if (m == NULL) {
4223 	for (i = 0; i < pidx; ++i) {
4224 	vm_page_unhold(bp->b_xio.xio_pages[i]);
4225 	bp->b_xio.xio_pages[i] = NULL;
4226 	}
4227 	return(-1);
4228 	}
4229 	bp->b_xio.xio_pages[pidx] = m;
4230 	addr += PAGE_SIZE;
4231 	++pidx;
4232 	}
4233 
4234 	/*
4235 	 * Map the page array and set the buffer fields to point to
4236 	 * the mapped data buffer.
4237 	 */
4238 	if (pidx > btoc(MAXPHYS))
4239 	panic("vmapbuf: mapped more than MAXPHYS");
4240 	pmap_qenter((vm_offset_t)bp->b_kvabase, bp->b_xio.xio_pages, pidx);
4241 
4242 	bp->b_xio.xio_npages = pidx;
4243 	bp->b_data = bp->b_kvabase + ((int)(intptr_t)udata & PAGE_MASK);
4244 	bp->b_bcount = bytes;
4245 	bp->b_bufsize = bytes;
4246 	return(0);
4247 }
4248 
4249 /*
4250  * vunmapbuf:
4251  *
4252  *	Free the io map PTEs associated with this IO operation.
4253  *	We also invalidate the TLB entries and restore the original b_data.
4254  */
4255 void
4256 vunmapbuf(struct buf *bp)
4257 {
4258 	int pidx;
4259 	int npages;
4260 
4261 	KKASSERT(bp->b_flags & B_PAGING);
4262 
4263 	npages = bp->b_xio.xio_npages;
4264 	pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
4265 	for (pidx = 0; pidx < npages; ++pidx) {
4266 	vm_page_unhold(bp->b_xio.xio_pages[pidx]);
4267 	bp->b_xio.xio_pages[pidx] = NULL;
4268 	}
4269 	bp->b_xio.xio_npages = 0;
4270 	bp->b_data = bp->b_kvabase;
4271 }
4272 
4273 /*
4274  * Scan all buffers in the system and issue the callback.
4275  */
4276 int
4277 scan_all_buffers(int (*callback)(struct buf *, void *), void *info)
4278 {
4279 	int count = 0;
4280 	int error;
4281 	int n;
4282 
4283 	for (n = 0; n < nbuf; ++n) {
4284 	if ((error = callback(&buf[n], info)) < 0) {
4285 	count = error;
4286 	break;
4287 	}
4288 	count += error;
4289 	}
4290 	return (count);
4291 }
4292 
4293 /*
4294  * Print out statistics from the current status of the buffer pool.
4295  * This can be toggled by the system control option debug.syncprt.
4296  */
4297 #ifdef DEBUG
4298 void
4299 vfs_bufstats(void)
4300 {
4301 	int i, j, count;
4302 	struct buf *bp;
4303 	struct bqueues *dp;
4304 	int counts[(MAXBSIZE / PAGE_SIZE) + 1];
4305 	static char *bname[3] = { "LOCKED", "LRU", "AGE" };
4306 
4307 	for (dp = bufqueues, i = 0; dp < &bufqueues[3]; dp++, i++) {
4308 	count = 0;
4309 	for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
4310 	counts[j] = 0;
4311 	crit_enter();
4312 	TAILQ_FOREACH(bp, dp, b_freelist) {
4313 	counts[bp->b_bufsize/PAGE_SIZE]++;
4314 	count++;
4315 	}
4316 	crit_exit();
4317 	kprintf("%s: total-%d", bname[i], count);
4318 	for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
4319 	if (counts[j] != 0)
4320 	kprintf(", %d-%d", j * PAGE_SIZE, counts[j]);
4321 	kprintf("\n");
4322 	}
4323 }
4324 #endif
4325 
4326 #ifdef DDB
4327 
4328 DB_SHOW_COMMAND(buffer, db_show_buffer)
4329 {
4330 	/* get args */
4331 	struct buf *bp = (struct buf *)addr;
4332 
4333 	if (!have_addr) {
4334 	db_printf("usage: show buffer <addr>\n");
4335 	return;
4336 	}
4337 
4338 	db_printf("b_flags = 0x%b\n", (u_int)bp->b_flags, PRINT_BUF_FLAGS);
4339 	db_printf("b_cmd = %d\n", bp->b_cmd);
4340 	db_printf("b_error = %d, b_bufsize = %d, b_bcount = %d, "
4341 	"b_resid = %d\nb_data = %p, "
4342 	"bio_offset(disk) = %lld, bio_offset(phys) = %lld\n",
4343 	bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid,
4344 	bp->b_data,
4345 	(long long)bp->b_bio2.bio_offset,
4346 	(long long)(bp->b_bio2.bio_next ?
4347 bp->b_bio2.bio_next->bio_offset : (off_t)-1)); 4348 if (bp->b_xio.xio_npages) { 4349 int i; 4350 db_printf("b_xio.xio_npages = %d, pages(OBJ, IDX, PA): ", 4351 bp->b_xio.xio_npages); 4352 for (i = 0; i < bp->b_xio.xio_npages; i++) { 4353 vm_page_t m; 4354 m = bp->b_xio.xio_pages[i]; 4355 db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, 4356 (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); 4357 if ((i + 1) < bp->b_xio.xio_npages) 4358 db_printf(","); 4359 } 4360 db_printf("\n"); 4361 } 4362 } 4363 #endif /* DDB */ 4364
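
/*
 * Editorial example (not part of the original file, excluded from the
 * build): a minimal sketch of the scan_all_buffers() callback protocol
 * implemented above.  A callback returns a non-negative value which is
 * accumulated into the scan's return value, or a negative error which
 * aborts the scan and is returned as-is.  The function names below are
 * hypothetical.
 */
#if 0
static int
example_count_delwri_cb(struct buf *bp, void *info)
{
	/* count delayed-write buffers; never abort the scan */
	return ((bp->b_flags & B_DELWRI) ? 1 : 0);
}

static int
example_count_delwri(void)
{
	/* returns the number of B_DELWRI buffers currently in the pool */
	return (scan_all_buffers(example_count_delwri_cb, NULL));
}
#endif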