/*
 * (MPSAFE)
 *
 * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Implement the swapcache daemon.  When enabled, swap is assumed to be
 * configured on a fast storage device such as an SSD.  Swap is assigned
 * to clean vnode-backed pages in the inactive queue, clustered by object
 * if possible, and written out.  The swap assignment sticks around even
 * after the underlying pages have been recycled.
 *
 * The daemon manages write bandwidth based on sysctl settings to control
 * wear on the SSD.
 *
 * The vnode strategy code will check for the swap assignments and divert
 * reads to the swap device when the data is present in the swapcache.
 *
 * This operates on both regular files and the block device vnodes used by
 * filesystems to manage meta-data.
 */

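/*
 * Illustrative usage sketch (assumes only the sysctl names exported
 * below): the tunables are exposed under the vm.swapcache node, so
 * enabling caching of file data and filesystem meta-data at runtime
 * would look roughly like:
 *
 *      sysctl vm.swapcache.data_enable=1
 *      sysctl vm.swapcache.meta_enable=1
 *
 * vm.swapcache.read_enable additionally gates whether reads are
 * diverted to the swapcache.
 */
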
#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
#include <sys/eventhandler.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

/* the kernel thread "swapcached" and its support routines */
static int vm_swapcached_flush(vm_page_t m, int isblkdev);
static int vm_swapcache_test(vm_page_t m);
static void vm_swapcache_writing(vm_page_t marker);
static void vm_swapcache_cleaning(vm_object_t marker);
static void vm_swapcache_movemarker(vm_object_t marker, vm_object_t object);
struct thread *swapcached_thread;

SYSCTL_NODE(_vm, OID_AUTO, swapcache, CTLFLAG_RW, NULL, NULL);

int vm_swapcache_read_enable;
int vm_swapcache_inactive_heuristic;
static int vm_swapcache_sleep;
static int vm_swapcache_maxlaunder = 256;
static int vm_swapcache_data_enable = 0;
static int vm_swapcache_meta_enable = 0;
static int vm_swapcache_maxswappct = 75;
static int vm_swapcache_hysteresis;
int vm_swapcache_use_chflags = 1;   /* require chflags cache */
static int64_t vm_swapcache_minburst = 10000000LL;   /* 10MB */
static int64_t vm_swapcache_curburst = 4000000000LL; /* 4G after boot */
static int64_t vm_swapcache_maxburst = 2000000000LL; /* 2G nominal max */
static int64_t vm_swapcache_accrate = 100000LL;      /* 100K/s */
static int64_t vm_swapcache_write_count;
static int64_t vm_swapcache_maxfilesize;
static int64_t vm_swapcache_cleanperobj = 16*1024*1024;

SYSCTL_INT(_vm_swapcache, OID_AUTO, maxlaunder,
        CTLFLAG_RW, &vm_swapcache_maxlaunder, 0, "");

SYSCTL_INT(_vm_swapcache, OID_AUTO, data_enable,
        CTLFLAG_RW, &vm_swapcache_data_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, meta_enable,
        CTLFLAG_RW, &vm_swapcache_meta_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, read_enable,
        CTLFLAG_RW, &vm_swapcache_read_enable, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, maxswappct,
        CTLFLAG_RW, &vm_swapcache_maxswappct, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, hysteresis,
        CTLFLAG_RW, &vm_swapcache_hysteresis, 0, "");
SYSCTL_INT(_vm_swapcache, OID_AUTO, use_chflags,
        CTLFLAG_RW, &vm_swapcache_use_chflags, 0, "");

SYSCTL_QUAD(_vm_swapcache, OID_AUTO, minburst,
        CTLFLAG_RW, &vm_swapcache_minburst, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, curburst,
        CTLFLAG_RW, &vm_swapcache_curburst, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxburst,
        CTLFLAG_RW, &vm_swapcache_maxburst, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxfilesize,
        CTLFLAG_RW, &vm_swapcache_maxfilesize, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, accrate,
        CTLFLAG_RW, &vm_swapcache_accrate, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, write_count,
        CTLFLAG_RW, &vm_swapcache_write_count, 0, "");
SYSCTL_QUAD(_vm_swapcache, OID_AUTO, cleanperobj,
        CTLFLAG_RW, &vm_swapcache_cleanperobj, 0, "");

#define SWAPMAX(adj)    \
        ((int64_t)vm_swap_max * (vm_swapcache_maxswappct + (adj)) / 100)

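/*
 * Worked example: with the default vm_swapcache_maxswappct of 75,
 * SWAPMAX(0) evaluates to 75% of vm_swap_max and SWAPMAX(-10) to 65%.
 * These two values bound the write/clean hysteresis used by
 * vm_swapcached_thread() below.
 */
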
/*
 * When shutting down the machine we want to stop swapcache operation
 * immediately so swap is not accessed after devices have been shuttered.
 */
static void
shutdown_swapcache(void *arg __unused)
{
        vm_swapcache_read_enable = 0;
        vm_swapcache_data_enable = 0;
        vm_swapcache_meta_enable = 0;
        wakeup(&vm_swapcache_sleep);    /* shortcut 5-second wait */
}

/*
 * vm_swapcached is the high level swapcache daemon.
 *
 * No requirements.
 */
static void
vm_swapcached_thread(void)
{
        enum { SWAPC_WRITING, SWAPC_CLEANING } state = SWAPC_WRITING;
        enum { SWAPB_BURSTING, SWAPB_RECOVERING } burst = SWAPB_BURSTING;
        static struct vm_page page_marker[PQ_L2_SIZE];
        static struct vm_object object_marker;
        int q;

        /*
         * Thread setup
         */
        curthread->td_flags |= TDF_SYSTHREAD;
        EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_kproc,
                              swapcached_thread, SHUTDOWN_PRI_FIRST);
        EVENTHANDLER_REGISTER(shutdown_pre_sync, shutdown_swapcache,
                              NULL, SHUTDOWN_PRI_SECOND);

        /*
         * Initialize our marker for the inactive scan (SWAPC_WRITING)
         */
        bzero(&page_marker, sizeof(page_marker));
        for (q = 0; q < PQ_L2_SIZE; ++q) {
                page_marker[q].flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
                page_marker[q].queue = PQ_INACTIVE + q;
                page_marker[q].pc = q;
                page_marker[q].wire_count = 1;
                vm_page_queues_spin_lock(PQ_INACTIVE + q);
                TAILQ_INSERT_HEAD(
                        &vm_page_queues[PQ_INACTIVE + q].pl,
                        &page_marker[q], pageq);
                vm_page_queues_spin_unlock(PQ_INACTIVE + q);
        }

        vm_swapcache_hysteresis = vmstats.v_inactive_target / 2;
        vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis;

        /*
         * Initialize our marker for the vm_object scan (SWAPC_CLEANING)
         */
        bzero(&object_marker, sizeof(object_marker));
        object_marker.type = OBJT_MARKER;
        lwkt_gettoken(&vmobj_token);
        TAILQ_INSERT_HEAD(&vm_object_list, &object_marker, object_list);
        lwkt_reltoken(&vmobj_token);

        for (;;) {
                /*
                 * Handle shutdown
                 */
                kproc_suspend_loop();

                /*
                 * Check every 5 seconds when not enabled or if no swap
                 * is present.
                 */
                if ((vm_swapcache_data_enable == 0 &&
                     vm_swapcache_meta_enable == 0) ||
                    vm_swap_max == 0) {
                        tsleep(&vm_swapcache_sleep, 0, "csleep", hz * 5);
                        continue;
                }

                /*
                 * Polling rate when enabled is approximately 10 Hz.
                 */
                tsleep(&vm_swapcache_sleep, 0, "csleep", hz / 10);

                /*
                 * State hysteresis.  Generate write activity up to 75% of
                 * swap (SWAPMAX(0) with the default maxswappct), then clean
                 * out swap assignments down to 65% (SWAPMAX(-10)), then
                 * repeat.
                 */
                if (state == SWAPC_WRITING) {
                        if (vm_swap_cache_use > SWAPMAX(0))
                                state = SWAPC_CLEANING;
                } else {
                        if (vm_swap_cache_use < SWAPMAX(-10))
                                state = SWAPC_WRITING;
                }

                /*
                 * We are allowed to continue accumulating burst value
                 * in either state.  Allow the user to set curburst > maxburst
                 * for the initial load-in.
                 */
                if (vm_swapcache_curburst < vm_swapcache_maxburst) {
                        vm_swapcache_curburst += vm_swapcache_accrate / 10;
                        if (vm_swapcache_curburst > vm_swapcache_maxburst)
                                vm_swapcache_curburst = vm_swapcache_maxburst;
                }

                /*
                 * We don't want to nickel-and-dime the scan as that will
                 * create unnecessary fragmentation.  The minimum burst
                 * is one second's worth of accumulation.
                 */
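                /*
                 * Burst hysteresis: while SWAPB_BURSTING we write until
                 * curburst is exhausted, then switch to SWAPB_RECOVERING
                 * and do not write again until curburst has climbed back
                 * above minburst.
                 */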
                if (state == SWAPC_WRITING) {
                        if (vm_swapcache_curburst >= vm_swapcache_accrate) {
                                if (burst == SWAPB_BURSTING) {
                                        for (q = 0; q < PQ_L2_SIZE; ++q) {
                                                vm_swapcache_writing(
                                                        &page_marker[q]);
                                        }
                                        if (vm_swapcache_curburst <= 0)
                                                burst = SWAPB_RECOVERING;
                                } else if (vm_swapcache_curburst >
                                           vm_swapcache_minburst) {
                                        for (q = 0; q < PQ_L2_SIZE; ++q) {
                                                vm_swapcache_writing(
                                                        &page_marker[q]);
                                        }
                                        burst = SWAPB_BURSTING;
                                }
                        }
                } else {
                        vm_swapcache_cleaning(&object_marker);
                }
        }

        /*
         * Cleanup (NOT REACHED)
         */
        for (q = 0; q < PQ_L2_SIZE; ++q) {
                vm_page_queues_spin_lock(PQ_INACTIVE + q);
                TAILQ_REMOVE(
                        &vm_page_queues[PQ_INACTIVE + q].pl,
                        &page_marker[q], pageq);
                vm_page_queues_spin_unlock(PQ_INACTIVE + q);
        }

        lwkt_gettoken(&vmobj_token);
        TAILQ_REMOVE(&vm_object_list, &object_marker, object_list);
        lwkt_reltoken(&vmobj_token);
}

static struct kproc_desc swpc_kp = {
        "swapcached",
        vm_swapcached_thread,
        &swapcached_thread
};
SYSINIT(swapcached, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &swpc_kp)

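/*
 * Writing pass: scan the inactive queue from the per-queue marker and
 * push suitable clean vnode-backed pages to the swapcache.  The marker
 * is re-inserted after each page examined so the scan resumes where it
 * left off on the next call.
 */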
static void
vm_swapcache_writing(vm_page_t marker)
{
        vm_object_t object;
        struct vnode *vp;
        vm_page_t m;
        int count;
        int isblkdev;

        /*
         * Deal with an overflow of the heuristic counter or if the user
         * manually changes the hysteresis.
         *
         * Try to avoid small incremental pageouts by waiting for enough
         * pages to build up in the inactive queue to hopefully get a good
         * burst in.  This heuristic is bumped by the VM system and reset
         * when our scan hits the end of the queue.
         */
        if (vm_swapcache_inactive_heuristic < -vm_swapcache_hysteresis)
                vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis;
        if (vm_swapcache_inactive_heuristic < 0)
                return;

        /*
         * Scan the inactive queue from our marker to locate
         * suitable pages to push to the swap cache.
         *
         * We are looking for clean vnode-backed pages.
         *
         * NOTE: PG_SWAPPED pages in particular are not part of
         *       our count because once the cache stabilizes we
         *       can end up with a very high datarate of VM pages
         *       cycling from it.
         */
        count = vm_swapcache_maxlaunder;

        vm_page_queues_spin_lock(marker->queue);
        while ((m = TAILQ_NEXT(marker, pageq)) != NULL && count-- > 0) {
                KKASSERT(m->queue == marker->queue);

                if (vm_swapcache_curburst < 0)
                        break;
                TAILQ_REMOVE(
                        &vm_page_queues[marker->queue].pl, marker, pageq);
                TAILQ_INSERT_AFTER(
                        &vm_page_queues[marker->queue].pl, m, marker, pageq);

                /*
                 * Ignore markers and ignore pages that already have a swap
                 * assignment.
                 */
                if (m->flags & (PG_MARKER | PG_SWAPPED)) {
                        ++count;
                        continue;
                }
                if (vm_page_busy_try(m, TRUE))
                        continue;
                vm_page_queues_spin_unlock(marker->queue);

                if ((object = m->object) == NULL) {
                        vm_page_wakeup(m);
                        vm_page_queues_spin_lock(marker->queue);
                        continue;
                }
                vm_object_hold(object);
                if (m->object != object) {
                        vm_object_drop(object);
                        vm_page_wakeup(m);
                        vm_page_queues_spin_lock(marker->queue);
                        continue;
                }
                if (vm_swapcache_test(m)) {
                        vm_object_drop(object);
                        vm_page_wakeup(m);
                        vm_page_queues_spin_lock(marker->queue);
                        continue;
                }

                vp = object->handle;
                if (vp == NULL) {
                        vm_object_drop(object);
                        vm_page_wakeup(m);
                        vm_page_queues_spin_lock(marker->queue);
                        continue;
                }

                switch (vp->v_type) {
                case VREG:
                        /*
                         * PG_NOTMETA generically means 'don't swapcache this',
                         * and HAMMER will set this for regular data buffers
                         * (and leave it unset for meta-data buffers) as
                         * appropriate when double buffering is enabled.
                         */
                        if (m->flags & PG_NOTMETA) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
                                vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }

                        /*
                         * If data_enable is 0 do not try to swapcache data.
                         * If use_chflags is set then only swapcache data for
                         * VSWAPCACHE marked vnodes, otherwise any vnode.
                         */
                        if (vm_swapcache_data_enable == 0 ||
                            ((vp->v_flag & VSWAPCACHE) == 0 &&
                             vm_swapcache_use_chflags)) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
                                vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }
                        if (vm_swapcache_maxfilesize &&
                            object->size >
                            (vm_swapcache_maxfilesize >> PAGE_SHIFT)) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
                                vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }
                        isblkdev = 0;
                        break;
                case VCHR:
                        /*
                         * PG_NOTMETA generically means 'don't swapcache this',
                         * and HAMMER will set this for regular data buffers
                         * (and leave it unset for meta-data buffers) as
                         * appropriate when double buffering is enabled.
                         */
                        if (m->flags & PG_NOTMETA) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
                                vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }
                        if (vm_swapcache_meta_enable == 0) {
                                vm_object_drop(object);
                                vm_page_wakeup(m);
                                vm_page_queues_spin_lock(marker->queue);
                                continue;
                        }
                        isblkdev = 1;
                        break;
                default:
                        vm_object_drop(object);
                        vm_page_wakeup(m);
                        vm_page_queues_spin_lock(marker->queue);
                        continue;
                }

                /*
                 * Assign swap and initiate I/O.
                 *
                 * (adjust for the --count which also occurs in the loop)
                 */
                count -= vm_swapcached_flush(m, isblkdev) - 1;

                /*
                 * Setup for next loop using marker.
                 */
                vm_object_drop(object);
                vm_page_queues_spin_lock(marker->queue);
        }

        /*
         * The marker could wind up at the end, which is ok.  If we hit the
         * end of the list adjust the heuristic.
         *
         * Earlier inactive pages that were dirty and become clean
         * are typically moved to the end of PQ_INACTIVE by virtue
         * of vfs_vmio_release() when they become unwired from the
         * buffer cache.
         */
        if (m == NULL)
                vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis;
        vm_page_queues_spin_unlock(marker->queue);
}

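/*
 * Clustering example (assuming SWAP_META_PAGES is 16, per the comment
 * below): a page at pindex 37 occupies slot 37 & SWAP_META_MASK == 5,
 * so the flush below will try to gather pindexes 32..47, i.e. the
 * aligned 16-page block containing the page.
 */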
/*
 * Flush the specified page using the swap_pager.  The page
 * must be busied by the caller and its disposition will become
 * the responsibility of this function.
 *
 * Try to collect surrounding pages, including pages which may
 * have already been assigned swap.  Try to cluster within a
 * contiguous aligned SWAP_META_PAGES (typ 16 x PAGE_SIZE) block
 * to match what swap_pager_putpages() can do.
 *
 * We also want to try to match against the buffer cache blocksize
 * but we don't really know what it is here.  Since the buffer cache
 * wires and unwires pages in groups the fact that we skip wired pages
 * should be sufficient.
 *
 * Returns a count of pages we might have flushed (minimum 1)
 */
static
int
vm_swapcached_flush(vm_page_t m, int isblkdev)
{
        vm_object_t object;
        vm_page_t marray[SWAP_META_PAGES];
        vm_pindex_t basei;
        int rtvals[SWAP_META_PAGES];
        int x;
        int i;
        int j;
        int count;
        int error;

        vm_page_io_start(m);
        vm_page_protect(m, VM_PROT_READ);
        object = m->object;
        vm_object_hold(object);

        /*
         * Try to cluster around (m), keeping in mind that the swap pager
         * can only do SWAP_META_PAGES worth of contiguous write.
         */
        x = (int)m->pindex & SWAP_META_MASK;
        marray[x] = m;
        basei = m->pindex;
        vm_page_wakeup(m);

        for (i = x - 1; i >= 0; --i) {
                m = vm_page_lookup_busy_try(object, basei - x + i,
                                            TRUE, &error);
                if (error || m == NULL)
                        break;
                if (vm_swapcache_test(m)) {
                        vm_page_wakeup(m);
                        break;
                }
                if (isblkdev && (m->flags & PG_NOTMETA)) {
                        vm_page_wakeup(m);
                        break;
                }
                vm_page_io_start(m);
                vm_page_protect(m, VM_PROT_READ);
                if (m->queue - m->pc == PQ_CACHE) {
                        vm_page_unqueue_nowakeup(m);
                        vm_page_deactivate(m);
                }
                marray[i] = m;
                vm_page_wakeup(m);
        }
        ++i;

        for (j = x + 1; j < SWAP_META_PAGES; ++j) {
                m = vm_page_lookup_busy_try(object, basei - x + j,
                                            TRUE, &error);
                if (error || m == NULL)
                        break;
                if (vm_swapcache_test(m)) {
                        vm_page_wakeup(m);
                        break;
                }
                if (isblkdev && (m->flags & PG_NOTMETA)) {
                        vm_page_wakeup(m);
                        break;
                }
                vm_page_io_start(m);
                vm_page_protect(m, VM_PROT_READ);
                if (m->queue - m->pc == PQ_CACHE) {
                        vm_page_unqueue_nowakeup(m);
                        vm_page_deactivate(m);
                }
                marray[j] = m;
                vm_page_wakeup(m);
        }

        count = j - i;
        vm_object_pip_add(object, count);
        swap_pager_putpages(object, marray + i, count, FALSE, rtvals + i);
        vm_swapcache_write_count += count * PAGE_SIZE;
        vm_swapcache_curburst -= count * PAGE_SIZE;

        while (i < j) {
                if (rtvals[i] != VM_PAGER_PEND) {
                        vm_page_busy_wait(marray[i], FALSE, "swppgfd");
                        vm_page_io_finish(marray[i]);
                        vm_page_wakeup(marray[i]);
                        vm_object_pip_wakeup(object);
                }
                ++i;
        }
        vm_object_drop(object);
        return(count);
}

/*
 * Test whether a VM page is suitable for writing to the swapcache.
 * Does not test m->queue, PG_MARKER, or PG_SWAPPED.
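 * Both callers in this file hold the page busy across the test.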
 *
 * Returns 0 on success, 1 on failure
 */
static int
vm_swapcache_test(vm_page_t m)
{
        vm_object_t object;

        if (m->flags & PG_UNMANAGED)
                return(1);
        if (m->hold_count || m->wire_count)
                return(1);
        if (m->valid != VM_PAGE_BITS_ALL)
                return(1);
        if (m->dirty & m->valid)
                return(1);
        if ((object = m->object) == NULL)
                return(1);
        if (object->type != OBJT_VNODE ||
            (object->flags & OBJ_DEAD)) {
                return(1);
        }
        vm_page_test_dirty(m);
        if (m->dirty & m->valid)
                return(1);
        return(0);
}

/*
 * Cleaning pass.
 *
 * We clean whole objects up to 16MB
 */
static
void
vm_swapcache_cleaning(vm_object_t marker)
{
        vm_object_t object;
        struct vnode *vp;
        int count;
        int n;

        count = vm_swapcache_maxlaunder;

        /*
         * Look for vnode objects
         */
        lwkt_gettoken(&vmobj_token);

        while ((object = TAILQ_NEXT(marker, object_list)) != NULL) {
                /*
                 * We have to skip markers.  We cannot hold/drop marker
                 * objects!
                 */
                if (object->type == OBJT_MARKER) {
                        vm_swapcache_movemarker(marker, object);
                        continue;
                }

                /*
                 * Safety, or in case there are millions of VM objects
                 * without swapcache backing.
                 */
                if (--count <= 0)
                        break;

                /*
                 * We must hold the object before potentially yielding.
                 */
                vm_object_hold(object);
                lwkt_yield();

                /*
                 * Only operate on live VNODE objects that are either
                 * VREG or VCHR (VCHR for meta-data).
                 */
                if ((object->type != OBJT_VNODE) ||
                    ((object->flags & OBJ_DEAD) ||
                     object->swblock_count == 0) ||
                    ((vp = object->handle) == NULL) ||
                    (vp->v_type != VREG && vp->v_type != VCHR)) {
                        vm_object_drop(object);
                        /* object may be invalid now */
                        vm_swapcache_movemarker(marker, object);
                        continue;
                }

                /*
                 * Reset the object pindex stored in the marker if the
                 * working object has changed.
                 */
                if (marker->backing_object != object) {
                        marker->size = 0;
                        marker->backing_object_offset = 0;
                        marker->backing_object = object;
                }

                /*
                 * Look for swblocks starting at our iterator.
                 *
                 * The swap_pager_condfree() function attempts to free
                 * swap space starting at the specified index.  The index
                 * will be updated on return.  The function will return
                 * a scan factor (NOT the number of blocks freed).
                 *
                 * If it must cut its scan of the object short due to an
                 * excessive number of swblocks, or is able to free the
                 * requested number of blocks, it will return n >= count
                 * and we break and pick it back up on a future attempt.
                 *
                 * Scan the object linearly and try to batch large sets of
                 * blocks that are likely to clean out entire swap radix
                 * tree leafs.
                 */
                lwkt_token_swap();
                lwkt_reltoken(&vmobj_token);

                n = swap_pager_condfree(object, &marker->size,
                                (count + SWAP_META_MASK) & ~SWAP_META_MASK);

                vm_object_drop(object);         /* object may be invalid now */
                lwkt_gettoken(&vmobj_token);

                /*
                 * If we have exhausted the object or reached the per-object
                 * cleaning limit, move on to the next object.  Note that
                 * the current object may no longer be on the vm_object_list.
                 */
                if (n <= 0 ||
                    marker->backing_object_offset > vm_swapcache_cleanperobj) {
                        vm_swapcache_movemarker(marker, object);
                }

                /*
                 * If we have exhausted our max-launder, stop for now.
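                 * (count is reduced by the scan factor returned from
                 * swap_pager_condfree(), not by the number of blocks freed.)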
                 */
                count -= n;
                marker->backing_object_offset += n * PAGE_SIZE;
                if (count < 0)
                        break;
        }

        /*
         * If we wound up at the end of the list this will move the
         * marker back to the beginning.
         */
        if (object == NULL)
                vm_swapcache_movemarker(marker, NULL);

        lwkt_reltoken(&vmobj_token);
}

/*
 * Move the marker past the current object.  Object can be stale, but we
 * still need it to determine if the marker has to be moved.  If the object
 * is still the 'current object' (object after the marker), we hop-scotch
 * the marker past it.
 */
static void
vm_swapcache_movemarker(vm_object_t marker, vm_object_t object)
{
        if (TAILQ_NEXT(marker, object_list) == object) {
                TAILQ_REMOVE(&vm_object_list, marker, object_list);
                if (object) {
                        TAILQ_INSERT_AFTER(&vm_object_list, object,
                                           marker, object_list);
                } else {
                        TAILQ_INSERT_HEAD(&vm_object_list,
                                          marker, object_list);
                }
        }
}