1 /* 2 * (MPSAFE) 3 * 4 * Copyright (c) 2010 The DragonFly Project. All rights reserved. 5 * 6 * This code is derived from software contributed to The DragonFly Project 7 * by Matthew Dillon <dillon@backplane.com> 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in 17 * the documentation and/or other materials provided with the 18 * distribution. 19 * 3. Neither the name of The DragonFly Project nor the names of its 20 * contributors may be used to endorse or promote products derived 21 * from this software without specific, prior written permission. 22 * 23 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 24 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 25 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 26 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 27 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 28 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 29 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 30 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 31 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 32 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 33 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 */ 36 37 /* 38 * Implement the swapcache daemon. When enabled swap is assumed to be 39 * configured on a fast storage device such as a SSD. Swap is assigned 40 * to clean vnode-backed pages in the inactive queue, clustered by object 41 * if possible, and written out. The swap assignment sticks around even 42 * after the underlying pages have been recycled. 43 * 44 * The daemon manages write bandwidth based on sysctl settings to control 45 * wear on the SSD. 46 * 47 * The vnode strategy code will check for the swap assignments and divert 48 * reads to the swap device when the data is present in the swapcache. 49 * 50 * This operates on both regular files and the block device vnodes used by 51 * filesystems to manage meta-data. 52 */ 53 54 #include "opt_vm.h" 55 #include <sys/param.h> 56 #include <sys/systm.h> 57 #include <sys/kernel.h> 58 #include <sys/proc.h> 59 #include <sys/kthread.h> 60 #include <sys/resourcevar.h> 61 #include <sys/signalvar.h> 62 #include <sys/vnode.h> 63 #include <sys/vmmeter.h> 64 #include <sys/sysctl.h> 65 66 #include <vm/vm.h> 67 #include <vm/vm_param.h> 68 #include <sys/lock.h> 69 #include <vm/vm_object.h> 70 #include <vm/vm_page.h> 71 #include <vm/vm_map.h> 72 #include <vm/vm_pageout.h> 73 #include <vm/vm_pager.h> 74 #include <vm/swap_pager.h> 75 #include <vm/vm_extern.h> 76 77 #include <sys/thread2.h> 78 #include <vm/vm_page2.h> 79 80 #define INACTIVE_LIST (&vm_page_queues[PQ_INACTIVE].pl) 81 82 /* the kernel process "vm_pageout"*/ 83 static void vm_swapcached (void); 84 static int vm_swapcached_flush (vm_page_t m, int isblkdev); 85 static int vm_swapcache_test(vm_page_t m); 86 static void vm_swapcache_writing(vm_page_t marker); 87 static void vm_swapcache_cleaning(vm_object_t marker); 88 struct thread *swapcached_thread; 89 90 static struct kproc_desc swpc_kp = { 91 "swapcached", 92 vm_swapcached, 93 &swapcached_thread 94 }; 95 SYSINIT(swapcached, SI_SUB_KTHREAD_PAGE, SI_ORDER_SECOND, kproc_start, &swpc_kp) 96 97 SYSCTL_NODE(_vm, OID_AUTO, swapcache, CTLFLAG_RW, NULL, NULL); 98 99 int vm_swapcache_read_enable; 100 int vm_swapcache_inactive_heuristic; 101 static int vm_swapcache_sleep; 102 static int vm_swapcache_maxlaunder = 256; 103 static int vm_swapcache_data_enable = 0; 104 static int vm_swapcache_meta_enable = 0; 105 static int vm_swapcache_maxswappct = 75; 106 static int vm_swapcache_hysteresis; 107 static int vm_swapcache_use_chflags = 1; /* require chflags cache */ 108 static int64_t vm_swapcache_minburst = 10000000LL; /* 10MB */ 109 static int64_t vm_swapcache_curburst = 4000000000LL; /* 4G after boot */ 110 static int64_t vm_swapcache_maxburst = 2000000000LL; /* 2G nominal max */ 111 static int64_t vm_swapcache_accrate = 100000LL; /* 100K/s */ 112 static int64_t vm_swapcache_write_count; 113 static int64_t vm_swapcache_maxfilesize; 114 115 SYSCTL_INT(_vm_swapcache, OID_AUTO, maxlaunder, 116 CTLFLAG_RW, &vm_swapcache_maxlaunder, 0, ""); 117 118 SYSCTL_INT(_vm_swapcache, OID_AUTO, data_enable, 119 CTLFLAG_RW, &vm_swapcache_data_enable, 0, ""); 120 SYSCTL_INT(_vm_swapcache, OID_AUTO, meta_enable, 121 CTLFLAG_RW, &vm_swapcache_meta_enable, 0, ""); 122 SYSCTL_INT(_vm_swapcache, OID_AUTO, read_enable, 123 CTLFLAG_RW, &vm_swapcache_read_enable, 0, ""); 124 SYSCTL_INT(_vm_swapcache, OID_AUTO, maxswappct, 125 CTLFLAG_RW, &vm_swapcache_maxswappct, 0, ""); 126 SYSCTL_INT(_vm_swapcache, OID_AUTO, hysteresis, 127 CTLFLAG_RW, &vm_swapcache_hysteresis, 0, ""); 128 SYSCTL_INT(_vm_swapcache, OID_AUTO, use_chflags, 129 CTLFLAG_RW, &vm_swapcache_use_chflags, 0, ""); 130 131 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, minburst, 132 CTLFLAG_RW, &vm_swapcache_minburst, 0, ""); 133 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, curburst, 134 CTLFLAG_RW, &vm_swapcache_curburst, 0, ""); 135 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxburst, 136 CTLFLAG_RW, &vm_swapcache_maxburst, 0, ""); 137 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, maxfilesize, 138 CTLFLAG_RW, &vm_swapcache_maxfilesize, 0, ""); 139 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, accrate, 140 CTLFLAG_RW, &vm_swapcache_accrate, 0, ""); 141 SYSCTL_QUAD(_vm_swapcache, OID_AUTO, write_count, 142 CTLFLAG_RW, &vm_swapcache_write_count, 0, ""); 143 144 #define SWAPMAX(adj) \ 145 ((int64_t)vm_swap_max * (vm_swapcache_maxswappct + (adj)) / 100) 146 147 /* 148 * vm_swapcached is the high level pageout daemon. 149 * 150 * No requirements. 151 */ 152 static void 153 vm_swapcached(void) 154 { 155 enum { SWAPC_WRITING, SWAPC_CLEANING } state = SWAPC_WRITING; 156 enum { SWAPB_BURSTING, SWAPB_RECOVERING } burst = SWAPB_BURSTING; 157 struct vm_page page_marker; 158 struct vm_object object_marker; 159 160 /* 161 * Thread setup 162 */ 163 curthread->td_flags |= TDF_SYSTHREAD; 164 crit_enter(); 165 lwkt_gettoken(&vm_token); 166 167 /* 168 * Initialize our marker for the inactive scan (SWAPC_WRITING) 169 */ 170 bzero(&page_marker, sizeof(page_marker)); 171 page_marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER; 172 page_marker.queue = PQ_INACTIVE; 173 page_marker.wire_count = 1; 174 TAILQ_INSERT_HEAD(INACTIVE_LIST, &page_marker, pageq); 175 vm_swapcache_hysteresis = vmstats.v_inactive_target / 2; 176 vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis; 177 178 /* 179 * Initialize our marker for the vm_object scan (SWAPC_CLEANING) 180 */ 181 bzero(&object_marker, sizeof(object_marker)); 182 object_marker.type = OBJT_MARKER; 183 TAILQ_INSERT_HEAD(&vm_object_list, &object_marker, object_list); 184 185 for (;;) { 186 /* 187 * Check every 5 seconds when not enabled or if no swap 188 * is present. 189 */ 190 if ((vm_swapcache_data_enable == 0 && 191 vm_swapcache_meta_enable == 0) || 192 vm_swap_max == 0) { 193 tsleep(&vm_swapcache_sleep, 0, "csleep", hz * 5); 194 continue; 195 } 196 197 /* 198 * Polling rate when enabled is approximately 10 hz. 199 */ 200 tsleep(&vm_swapcache_sleep, 0, "csleep", hz / 10); 201 202 /* 203 * State hysteresis. Generate write activity up to 75% of 204 * swap, then clean out swap assignments down to 70%, then 205 * repeat. 206 */ 207 if (state == SWAPC_WRITING) { 208 if (vm_swap_cache_use > SWAPMAX(0)) 209 state = SWAPC_CLEANING; 210 } else { 211 if (vm_swap_cache_use < SWAPMAX(-5)) 212 state = SWAPC_WRITING; 213 } 214 215 /* 216 * We are allowed to continue accumulating burst value 217 * in either state. Allow the user to set curburst > maxburst 218 * for the initial load-in. 219 */ 220 if (vm_swapcache_curburst < vm_swapcache_maxburst) { 221 vm_swapcache_curburst += vm_swapcache_accrate / 10; 222 if (vm_swapcache_curburst > vm_swapcache_maxburst) 223 vm_swapcache_curburst = vm_swapcache_maxburst; 224 } 225 226 /* 227 * We don't want to nickle-and-dime the scan as that will 228 * create unnecessary fragmentation. The minimum burst 229 * is one-seconds worth of accumulation. 230 */ 231 if (state == SWAPC_WRITING) { 232 if (vm_swapcache_curburst >= vm_swapcache_accrate) { 233 if (burst == SWAPB_BURSTING) { 234 vm_swapcache_writing(&page_marker); 235 if (vm_swapcache_curburst <= 0) 236 burst = SWAPB_RECOVERING; 237 } else if (vm_swapcache_curburst > 238 vm_swapcache_minburst) { 239 vm_swapcache_writing(&page_marker); 240 burst = SWAPB_BURSTING; 241 } 242 } 243 } else { 244 vm_swapcache_cleaning(&object_marker); 245 } 246 } 247 TAILQ_REMOVE(INACTIVE_LIST, &page_marker, pageq); 248 TAILQ_REMOVE(&vm_object_list, &object_marker, object_list); 249 lwkt_reltoken(&vm_token); 250 crit_exit(); 251 } 252 253 /* 254 * The caller must hold vm_token. 255 */ 256 static void 257 vm_swapcache_writing(vm_page_t marker) 258 { 259 vm_object_t object; 260 struct vnode *vp; 261 vm_page_t m; 262 int count; 263 int isblkdev; 264 265 /* 266 * Deal with an overflow of the heuristic counter or if the user 267 * manually changes the hysteresis. 268 * 269 * Try to avoid small incremental pageouts by waiting for enough 270 * pages to buildup in the inactive queue to hopefully get a good 271 * burst in. This heuristic is bumped by the VM system and reset 272 * when our scan hits the end of the queue. 273 */ 274 if (vm_swapcache_inactive_heuristic < -vm_swapcache_hysteresis) 275 vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis; 276 if (vm_swapcache_inactive_heuristic < 0) 277 return; 278 279 /* 280 * Scan the inactive queue from our marker to locate 281 * suitable pages to push to the swap cache. 282 * 283 * We are looking for clean vnode-backed pages. 284 * 285 * NOTE: PG_SWAPPED pages in particular are not part of 286 * our count because once the cache stabilizes we 287 * can end up with a very high datarate of VM pages 288 * cycling from it. 289 */ 290 m = marker; 291 count = vm_swapcache_maxlaunder; 292 293 while ((m = TAILQ_NEXT(m, pageq)) != NULL && count--) { 294 if (m->flags & (PG_MARKER | PG_SWAPPED)) { 295 ++count; 296 continue; 297 } 298 if (vm_swapcache_curburst < 0) 299 break; 300 if (vm_swapcache_test(m)) 301 continue; 302 object = m->object; 303 vp = object->handle; 304 if (vp == NULL) 305 continue; 306 307 switch(vp->v_type) { 308 case VREG: 309 /* 310 * If data_enable is 0 do not try to swapcache data. 311 * If use_chflags is set then only swapcache data for 312 * VSWAPCACHE marked vnodes, otherwise any vnode. 313 */ 314 if (vm_swapcache_data_enable == 0 || 315 ((vp->v_flag & VSWAPCACHE) == 0 && 316 vm_swapcache_use_chflags)) { 317 continue; 318 } 319 if (vm_swapcache_maxfilesize && 320 object->size > 321 (vm_swapcache_maxfilesize >> PAGE_SHIFT)) { 322 continue; 323 } 324 isblkdev = 0; 325 break; 326 case VCHR: 327 /* 328 * The PG_NOTMETA flag only applies to pages 329 * associated with block devices. 330 */ 331 if (m->flags & PG_NOTMETA) 332 continue; 333 if (vm_swapcache_meta_enable == 0) 334 continue; 335 isblkdev = 1; 336 break; 337 default: 338 continue; 339 } 340 341 /* 342 * Ok, move the marker and soft-busy the page. 343 */ 344 TAILQ_REMOVE(INACTIVE_LIST, marker, pageq); 345 TAILQ_INSERT_AFTER(INACTIVE_LIST, m, marker, pageq); 346 347 /* 348 * Assign swap and initiate I/O. 349 * 350 * (adjust for the --count which also occurs in the loop) 351 */ 352 count -= vm_swapcached_flush(m, isblkdev) - 1; 353 354 /* 355 * Setup for next loop using marker. 356 */ 357 m = marker; 358 } 359 360 /* 361 * Cleanup marker position. If we hit the end of the 362 * list the marker is placed at the tail. Newly deactivated 363 * pages will be placed after it. 364 * 365 * Earlier inactive pages that were dirty and become clean 366 * are typically moved to the end of PQ_INACTIVE by virtue 367 * of vfs_vmio_release() when they become unwired from the 368 * buffer cache. 369 */ 370 TAILQ_REMOVE(INACTIVE_LIST, marker, pageq); 371 if (m) { 372 TAILQ_INSERT_BEFORE(m, marker, pageq); 373 } else { 374 TAILQ_INSERT_TAIL(INACTIVE_LIST, marker, pageq); 375 vm_swapcache_inactive_heuristic = -vm_swapcache_hysteresis; 376 } 377 } 378 379 /* 380 * Flush the specified page using the swap_pager. 381 * 382 * Try to collect surrounding pages, including pages which may 383 * have already been assigned swap. Try to cluster within a 384 * contiguous aligned SMAP_META_PAGES (typ 16 x PAGE_SIZE) block 385 * to match what swap_pager_putpages() can do. 386 * 387 * We also want to try to match against the buffer cache blocksize 388 * but we don't really know what it is here. Since the buffer cache 389 * wires and unwires pages in groups the fact that we skip wired pages 390 * should be sufficient. 391 * 392 * Returns a count of pages we might have flushed (minimum 1) 393 * 394 * The caller must hold vm_token. 395 */ 396 static 397 int 398 vm_swapcached_flush(vm_page_t m, int isblkdev) 399 { 400 vm_object_t object; 401 vm_page_t marray[SWAP_META_PAGES]; 402 vm_pindex_t basei; 403 int rtvals[SWAP_META_PAGES]; 404 int x; 405 int i; 406 int j; 407 int count; 408 409 vm_page_io_start(m); 410 vm_page_protect(m, VM_PROT_READ); 411 object = m->object; 412 413 /* 414 * Try to cluster around (m), keeping in mind that the swap pager 415 * can only do SMAP_META_PAGES worth of continguous write. 416 */ 417 x = (int)m->pindex & SWAP_META_MASK; 418 marray[x] = m; 419 basei = m->pindex; 420 421 for (i = x - 1; i >= 0; --i) { 422 m = vm_page_lookup(object, basei - x + i); 423 if (m == NULL) 424 break; 425 if (vm_swapcache_test(m)) 426 break; 427 if (isblkdev && (m->flags & PG_NOTMETA)) 428 break; 429 vm_page_io_start(m); 430 vm_page_protect(m, VM_PROT_READ); 431 if (m->queue - m->pc == PQ_CACHE) { 432 vm_page_unqueue_nowakeup(m); 433 vm_page_deactivate(m); 434 } 435 marray[i] = m; 436 } 437 ++i; 438 439 for (j = x + 1; j < SWAP_META_PAGES; ++j) { 440 m = vm_page_lookup(object, basei - x + j); 441 if (m == NULL) 442 break; 443 if (vm_swapcache_test(m)) 444 break; 445 if (isblkdev && (m->flags & PG_NOTMETA)) 446 break; 447 vm_page_io_start(m); 448 vm_page_protect(m, VM_PROT_READ); 449 if (m->queue - m->pc == PQ_CACHE) { 450 vm_page_unqueue_nowakeup(m); 451 vm_page_deactivate(m); 452 } 453 marray[j] = m; 454 } 455 456 count = j - i; 457 vm_object_pip_add(object, count); 458 swap_pager_putpages(object, marray + i, count, FALSE, rtvals + i); 459 vm_swapcache_write_count += count * PAGE_SIZE; 460 vm_swapcache_curburst -= count * PAGE_SIZE; 461 462 while (i < j) { 463 if (rtvals[i] != VM_PAGER_PEND) { 464 vm_page_io_finish(marray[i]); 465 vm_object_pip_wakeup(object); 466 } 467 ++i; 468 } 469 return(count); 470 } 471 472 /* 473 * Test whether a VM page is suitable for writing to the swapcache. 474 * Does not test m->queue, PG_MARKER, or PG_SWAPPED. 475 * 476 * Returns 0 on success, 1 on failure 477 * 478 * The caller must hold vm_token. 479 */ 480 static int 481 vm_swapcache_test(vm_page_t m) 482 { 483 vm_object_t object; 484 485 if (m->flags & (PG_BUSY | PG_UNMANAGED)) 486 return(1); 487 if (m->busy || m->hold_count || m->wire_count) 488 return(1); 489 if (m->valid != VM_PAGE_BITS_ALL) 490 return(1); 491 if (m->dirty & m->valid) 492 return(1); 493 if ((object = m->object) == NULL) 494 return(1); 495 if (object->type != OBJT_VNODE || 496 (object->flags & OBJ_DEAD)) { 497 return(1); 498 } 499 vm_page_test_dirty(m); 500 if (m->dirty & m->valid) 501 return(1); 502 return(0); 503 } 504 505 /* 506 * Cleaning pass 507 * 508 * The caller must hold vm_token. 509 */ 510 static 511 void 512 vm_swapcache_cleaning(vm_object_t marker) 513 { 514 vm_object_t object; 515 struct vnode *vp; 516 int count; 517 int n; 518 519 object = marker; 520 count = vm_swapcache_maxlaunder; 521 522 /* 523 * Look for vnode objects 524 */ 525 lwkt_gettoken(&vm_token); 526 while ((object = TAILQ_NEXT(object, object_list)) != NULL && count--) { 527 if (object->type != OBJT_VNODE) 528 continue; 529 if ((object->flags & OBJ_DEAD) || object->swblock_count == 0) 530 continue; 531 if ((vp = object->handle) == NULL) 532 continue; 533 if (vp->v_type != VREG && vp->v_type != VCHR) 534 continue; 535 536 /* 537 * Adjust iterator. 538 */ 539 if (marker->backing_object != object) 540 marker->size = 0; 541 542 /* 543 * Move the marker so we can work on the VM object 544 */ 545 TAILQ_REMOVE(&vm_object_list, marker, object_list); 546 TAILQ_INSERT_AFTER(&vm_object_list, object, 547 marker, object_list); 548 549 /* 550 * Look for swblocks starting at our iterator. 551 * 552 * The swap_pager_condfree() function attempts to free 553 * swap space starting at the specified index. The index 554 * will be updated on return. The function will return 555 * a scan factor (NOT the number of blocks freed). 556 * 557 * If it must cut its scan of the object short due to an 558 * excessive number of swblocks, or is able to free the 559 * requested number of blocks, it will return n >= count 560 * and we break and pick it back up on a future attempt. 561 */ 562 n = swap_pager_condfree(object, &marker->size, count); 563 count -= n; 564 if (count < 0) 565 break; 566 567 /* 568 * Setup for loop. 569 */ 570 marker->size = 0; 571 object = marker; 572 } 573 574 /* 575 * Adjust marker so we continue the scan from where we left off. 576 * When we reach the end we start back at the beginning. 577 */ 578 TAILQ_REMOVE(&vm_object_list, marker, object_list); 579 if (object) 580 TAILQ_INSERT_BEFORE(object, marker, object_list); 581 else 582 TAILQ_INSERT_HEAD(&vm_object_list, marker, object_list); 583 marker->backing_object = object; 584 lwkt_reltoken(&vm_token); 585 } 586