/*
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 *
 * $FreeBSD: src/sys/vm/vm_pageout.c,v 1.151.2.15 2002/12/29 18:21:04 dillon Exp $
 */

/*
 * The proverbial page-out daemon.
 */

#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/conf.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>

#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

/*
 * System initialization
 */

/* the kernel process "vm_pageout"*/
static int vm_pageout_page(vm_page_t m, long *max_launderp,
			   long *vnodes_skippedp, struct vnode **vpfailedp,
			   int pass, int vmflush_flags);
static int vm_pageout_clean_helper (vm_page_t, int);
static int vm_pageout_free_page_calc (vm_size_t count);
static void vm_pageout_page_free(vm_page_t m) ;
struct thread *emergpager;
struct thread *pagethread;
static int sequence_emerg_pager;

#if !defined(NO_SWAPPING)
/* the kernel process "vm_daemon"*/
static void vm_daemon (void);
static struct thread *vmthread;

static struct kproc_desc vm_kp = {
	"vmdaemon",
	vm_daemon,
	&vmthread
};
SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
#endif

int vm_pages_needed = 0;	/* Event on which pageout daemon sleeps */
int vm_pageout_deficit = 0;	/* Estimated number of pages deficit */
int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */
int vm_page_free_hysteresis = 16;
static int vm_pagedaemon_time;

#if !defined(NO_SWAPPING)
static int vm_pageout_req_swapout;
static int vm_daemon_needed;
#endif
static int vm_max_launder = 4096;
static int vm_emerg_launder = 100;
static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
static int vm_pageout_full_stats_interval = 0;
static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0;
static int defer_swap_pageouts=0;
static int disable_swap_pageouts=0;
static u_int vm_anonmem_decline = ACT_DECLINE;
static u_int vm_filemem_decline = ACT_DECLINE * 2;

#if defined(NO_SWAPPING)
static int vm_swap_enabled=0;
static int vm_swap_idle_enabled=0;
#else
static int vm_swap_enabled=1;
static int vm_swap_idle_enabled=0;
#endif
int vm_pageout_memuse_mode=1;	/* 0-disable, 1-passive, 2-active swp*/

SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline,
	CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory");

SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline,
	CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache");

SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis,
	CTLFLAG_RW, &vm_page_free_hysteresis, 0,
	"Free more pages than the minimum required");

SYSCTL_INT(_vm, OID_AUTO, max_launder,
	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
SYSCTL_INT(_vm, OID_AUTO, emerg_launder,
	CTLFLAG_RW, &vm_emerg_launder, 0, "Emergency pager minimum");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
	CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");

SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
	CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");
stats scan"); 175 176 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval, 177 CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan"); 178 179 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max, 180 CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented"); 181 SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode, 182 CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode"); 183 184 #if defined(NO_SWAPPING) 185 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 186 CTLFLAG_RD, &vm_swap_enabled, 0, ""); 187 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, 188 CTLFLAG_RD, &vm_swap_idle_enabled, 0, ""); 189 #else 190 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 191 CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); 192 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, 193 CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); 194 #endif 195 196 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, 197 CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); 198 199 SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts, 200 CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); 201 202 static int pageout_lock_miss; 203 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, 204 CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); 205 206 int vm_page_max_wired; /* XXX max # of wired pages system-wide */ 207 208 #if !defined(NO_SWAPPING) 209 static void vm_req_vmdaemon (void); 210 #endif 211 static void vm_pageout_page_stats(int q); 212 213 /* 214 * Calculate approximately how many pages on each queue to try to 215 * clean. An exact calculation creates an edge condition when the 216 * queues are unbalanced so add significant slop. The queue scans 217 * will stop early when targets are reached and will start where they 218 * left off on the next pass. 219 * 220 * We need to be generous here because there are all sorts of loading 221 * conditions that can cause edge cases if try to average over all queues. 222 * In particular, storage subsystems have become so fast that paging 223 * activity can become quite frantic. Eventually we will probably need 224 * two paging threads, one for dirty pages and one for clean, to deal 225 * with the bandwidth requirements. 226 227 * So what we do is calculate a value that can be satisfied nominally by 228 * only having to scan half the queues. 229 */ 230 static __inline long 231 PQAVERAGE(long n) 232 { 233 long avg; 234 235 if (n >= 0) { 236 avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1); 237 } else { 238 avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1); 239 } 240 return avg; 241 } 242 243 /* 244 * vm_pageout_clean_helper: 245 * 246 * Clean the page and remove it from the laundry. The page must be busied 247 * by the caller and will be disposed of (put away, flushed) by this routine. 248 */ 249 static int 250 vm_pageout_clean_helper(vm_page_t m, int vmflush_flags) 251 { 252 vm_object_t object; 253 vm_page_t mc[BLIST_MAX_ALLOC]; 254 int error; 255 int ib, is, page_base; 256 vm_pindex_t pindex = m->pindex; 257 258 object = m->object; 259 260 /* 261 * Don't mess with the page if it's held or special. 262 * 263 * XXX do we really need to check hold_count here? hold_count 264 * isn't supposed to mess with vm_page ops except prevent the 265 * page from being reused. 266 */ 267 if (m->hold_count != 0 || (m->flags & PG_UNMANAGED)) { 268 vm_page_wakeup(m); 269 return 0; 270 } 271 272 /* 273 * Place page in cluster. 
/*
 * vm_pageout_clean_helper:
 *
 * Clean the page and remove it from the laundry.  The page must be busied
 * by the caller and will be disposed of (put away, flushed) by this routine.
 */
static int
vm_pageout_clean_helper(vm_page_t m, int vmflush_flags)
{
	vm_object_t object;
	vm_page_t mc[BLIST_MAX_ALLOC];
	int error;
	int ib, is, page_base;
	vm_pindex_t pindex = m->pindex;

	object = m->object;

	/*
	 * Don't mess with the page if it's held or special.
	 *
	 * XXX do we really need to check hold_count here?  hold_count
	 * isn't supposed to mess with vm_page ops except prevent the
	 * page from being reused.
	 */
	if (m->hold_count != 0 || (m->flags & PG_UNMANAGED)) {
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * Place page in cluster.  Align cluster for optimal swap space
	 * allocation (whether it is swap or not).  This is typically ~16-32
	 * pages, which also tends to align the cluster to multiples of the
	 * filesystem block size if backed by a filesystem.
	 */
	page_base = pindex % BLIST_MAX_ALLOC;
	mc[page_base] = m;
	ib = page_base - 1;
	is = page_base + 1;

	/*
	 * Scan object for clusterable pages.
	 *
	 * We can cluster ONLY if: ->> the page is NOT
	 * clean, wired, busy, held, or mapped into a
	 * buffer, and one of the following:
	 * 1) The page is inactive, or a seldom used
	 *    active page.
	 * -or-
	 * 2) we force the issue.
	 *
	 * During heavy mmap/modification loads the pageout
	 * daemon can really fragment the underlying file
	 * due to flushing pages out of order and not trying
	 * to align the clusters (which leaves sporadic out-of-order
	 * holes).  To solve this problem we do the reverse scan
	 * first and attempt to align our cluster, then do a
	 * forward scan if room remains.
	 */
	vm_object_hold(object);

	while (ib >= 0) {
		vm_page_t p;

		p = vm_page_lookup_busy_try(object, pindex - page_base + ib,
					    TRUE, &error);
		if (error || p == NULL)
			break;
		if ((p->queue - p->pc) == PQ_CACHE ||
		    (p->flags & PG_UNMANAGED)) {
			vm_page_wakeup(p);
			break;
		}
		vm_page_test_dirty(p);
		if (((p->dirty & p->valid) == 0 &&
		     (p->flags & PG_NEED_COMMIT) == 0) ||
		    p->wire_count != 0 ||	/* may be held by buf cache */
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_wakeup(p);
			break;
		}
		if (p->queue - p->pc != PQ_INACTIVE) {
			if (p->queue - p->pc != PQ_ACTIVE ||
			    (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) {
				vm_page_wakeup(p);
				break;
			}
		}

		/*
		 * Try to maintain page groupings in the cluster.
		 */
		if (m->flags & PG_WINATCFLS)
			vm_page_flag_set(p, PG_WINATCFLS);
		else
			vm_page_flag_clear(p, PG_WINATCFLS);
		p->act_count = m->act_count;

		mc[ib] = p;
		--ib;
	}
	++ib;	/* fixup */

	while (is < BLIST_MAX_ALLOC &&
	       pindex - page_base + is < object->size) {
		vm_page_t p;

		p = vm_page_lookup_busy_try(object, pindex - page_base + is,
					    TRUE, &error);
		if (error || p == NULL)
			break;
		if (((p->queue - p->pc) == PQ_CACHE) ||
		    (p->flags & PG_UNMANAGED)) {
			vm_page_wakeup(p);
			break;
		}
		vm_page_test_dirty(p);
		if (((p->dirty & p->valid) == 0 &&
		     (p->flags & PG_NEED_COMMIT) == 0) ||
		    p->wire_count != 0 ||	/* may be held by buf cache */
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_wakeup(p);
			break;
		}
		if (p->queue - p->pc != PQ_INACTIVE) {
			if (p->queue - p->pc != PQ_ACTIVE ||
			    (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) {
				vm_page_wakeup(p);
				break;
			}
		}

		/*
		 * Try to maintain page groupings in the cluster.
		 */
		if (m->flags & PG_WINATCFLS)
			vm_page_flag_set(p, PG_WINATCFLS);
		else
			vm_page_flag_clear(p, PG_WINATCFLS);
		p->act_count = m->act_count;

		mc[is] = p;
		++is;
	}

	vm_object_drop(object);

	/*
	 * we allow reads during pageouts...
	 */
	return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags);
}

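/*
 * Worked example of the cluster alignment above (illustrative numbers):
 * with BLIST_MAX_ALLOC == 16 and a target page at pindex 37,
 * page_base = 37 % 16 = 5, so mc[5] = m.  The reverse scan then tries to
 * fill mc[4]..mc[0] from pindex 36..32 and the forward scan fills mc[6]
 * and up from pindex 38 onward, so the flushed run stays aligned to a
 * 16-page swap allocation boundary.  (The real BLIST_MAX_ALLOC value may
 * differ; only the arithmetic is being illustrated.)
 */
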
/*
 * vm_pageout_flush() - launder the given pages
 *
 *	The given pages are laundered.  Note that we setup for the start of
 *	I/O ( i.e. busy the page ), mark it read-only, and bump the object
 *	reference count all in here rather than in the parent.  If we want
 *	the parent to do more sophisticated things we may have to change
 *	the ordering.
 *
 *	The pages in the array must be busied by the caller and will be
 *	unbusied by this function.
 */
int
vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags)
{
	vm_object_t object;
	int pageout_status[count];
	int numpagedout = 0;
	int i;

	/*
	 * Initiate I/O.  Bump the vm_page_t->busy counter.
	 */
	for (i = 0; i < count; i++) {
		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
			("vm_pageout_flush page %p index %d/%d: partially "
			 "invalid page", mc[i], i, count));
		vm_page_io_start(mc[i]);
	}

	/*
	 * We must make the pages read-only.  This will also force the
	 * modified bit in the related pmaps to be cleared.  The pager
	 * cannot clear the bit for us since the I/O completion code
	 * typically runs from an interrupt.  The act of making the page
	 * read-only handles the case for us.
	 *
	 * Then we can unbusy the pages, we still hold a reference by virtue
	 * of our soft-busy.
	 */
	for (i = 0; i < count; i++) {
		if (vmflush_flags & VM_PAGER_TRY_TO_CACHE)
			vm_page_protect(mc[i], VM_PROT_NONE);
		else
			vm_page_protect(mc[i], VM_PROT_READ);
		vm_page_wakeup(mc[i]);
	}

	object = mc[0]->object;
	vm_object_pip_add(object, count);

	vm_pager_put_pages(object, mc, count,
			   (vmflush_flags |
			    ((object == &kernel_object) ?
				VM_PAGER_PUT_SYNC : 0)),
			   pageout_status);

	for (i = 0; i < count; i++) {
		vm_page_t mt = mc[i];

		switch (pageout_status[i]) {
		case VM_PAGER_OK:
			numpagedout++;
			break;
		case VM_PAGER_PEND:
			numpagedout++;
			break;
		case VM_PAGER_BAD:
			/*
			 * Page outside of range of object.  Right now we
			 * essentially lose the changes by pretending it
			 * worked.
			 */
			vm_page_busy_wait(mt, FALSE, "pgbad");
			pmap_clear_modify(mt);
			vm_page_undirty(mt);
			vm_page_wakeup(mt);
			break;
		case VM_PAGER_ERROR:
		case VM_PAGER_FAIL:
			/*
			 * A page typically cannot be paged out when we
			 * have run out of swap.  We leave the page
			 * marked inactive and will try to page it out
			 * again later.
			 *
			 * Starvation of the active page list is used to
			 * determine when the system is massively memory
			 * starved.
			 */
			break;
		case VM_PAGER_AGAIN:
			break;
		}

		/*
		 * If not PENDing this was a synchronous operation and we
		 * clean up after the I/O.  If it is PENDing the mess is
		 * cleaned up asynchronously.
		 *
		 * Also nominally act on the caller's wishes if the caller
		 * wants to try to really clean (cache or free) the page.
		 *
		 * Also nominally deactivate the page if the system is
		 * memory-stressed.
		 */
		if (pageout_status[i] != VM_PAGER_PEND) {
			vm_page_busy_wait(mt, FALSE, "pgouw");
			vm_page_io_finish(mt);
			if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) {
				vm_page_try_to_cache(mt);
			} else if (vm_page_count_severe()) {
				vm_page_deactivate(mt);
				vm_page_wakeup(mt);
			} else {
				vm_page_wakeup(mt);
			}
			vm_object_pip_wakeup(object);
		}
	}
	return numpagedout;
}

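/*
 * Hypothetical usage sketch (illustration only, compiled out): flushing a
 * single fully-valid page through vm_pageout_flush().  The helper name is
 * made up; the contract it illustrates is the real one documented above:
 * the caller busies the page, vm_pageout_flush() unbusies it, and the
 * return value is the number of pages successfully queued or written.
 */
#if 0
static int
example_flush_one_page(vm_page_t m)
{
	vm_page_t mc[1];

	/* caller guarantees m is busied and m->valid == VM_PAGE_BITS_ALL */
	mc[0] = m;
	return (vm_pageout_flush(mc, 1, VM_PAGER_PUT_SYNC));
}
#endif
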
#if !defined(NO_SWAPPING)

/*
 * Callback function, page busied for us.  We must dispose of the busy
 * condition.  Any related pmap pages may be held but will not be locked.
 */
static
int
vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va,
			vm_page_t p)
{
	int actcount;
	int cleanit = 0;

	/*
	 * Basic tests - There should never be a marker, and we can stop
	 * once the RSS is below the required level.
	 */
	KKASSERT((p->flags & PG_MARKER) == 0);
	if (pmap_resident_tlnw_count(info->pmap) <= info->limit) {
		vm_page_wakeup(p);
		return(-1);
	}

	mycpu->gd_cnt.v_pdpages++;

	if (p->wire_count || p->hold_count || (p->flags & PG_UNMANAGED)) {
		vm_page_wakeup(p);
		goto done;
	}

	++info->actioncount;

	/*
	 * Check if the page has been referenced recently.  If it has,
	 * activate it and skip.
	 */
	actcount = pmap_ts_referenced(p);
	if (actcount) {
		vm_page_flag_set(p, PG_REFERENCED);
	} else if (p->flags & PG_REFERENCED) {
		actcount = 1;
	}

	if (actcount) {
		if (p->queue - p->pc != PQ_ACTIVE) {
			vm_page_and_queue_spin_lock(p);
			if (p->queue - p->pc != PQ_ACTIVE) {
				vm_page_and_queue_spin_unlock(p);
				vm_page_activate(p);
			} else {
				vm_page_and_queue_spin_unlock(p);
			}
		} else {
			p->act_count += actcount;
			if (p->act_count > ACT_MAX)
				p->act_count = ACT_MAX;
		}
		vm_page_flag_clear(p, PG_REFERENCED);
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * Remove the page from this particular pmap.  Once we do this, our
	 * pmap scans will not see it again (unless it gets faulted in), so
	 * we must actively dispose of or deal with the page.
	 */
	pmap_remove_specific(info->pmap, p);

	/*
	 * If the page is not mapped to another process (i.e. as would be
	 * typical if this were a shared page from a library) then deactivate
	 * the page and clean it in two passes only.
	 *
	 * If the page hasn't been referenced since the last check, remove it
	 * from the pmap.  If it is no longer mapped, deactivate it
	 * immediately, accelerating the normal decline.
	 *
	 * Once the page has been removed from the pmap the RSS code no
	 * longer tracks it so we have to make sure that it is staged for
	 * potential flush action.
	 */
	if ((p->flags & PG_MAPPED) == 0) {
		if (p->queue - p->pc == PQ_ACTIVE) {
			vm_page_deactivate(p);
		}
		if (p->queue - p->pc == PQ_INACTIVE) {
			cleanit = 1;
		}
	}

	/*
	 * Ok, try to fully clean the page and any nearby pages such that at
	 * least the requested page is freed or moved to the cache queue.
	 *
	 * We usually do this synchronously to allow us to get the page into
	 * the CACHE queue quickly, which will prevent memory exhaustion if
	 * a process with a memoryuse limit is running away.  However, the
	 * sysadmin may desire to set vm.swap_user_async which relaxes this
	 * and improves write performance.
	 */
	if (cleanit) {
		long max_launder = 0x7FFF;
		long vnodes_skipped = 0;
		int vmflush_flags;
		struct vnode *vpfailed = NULL;

		info->offset = va;

		if (vm_pageout_memuse_mode >= 2) {
			vmflush_flags = VM_PAGER_TRY_TO_CACHE |
					VM_PAGER_ALLOW_ACTIVE;
			if (swap_user_async == 0)
				vmflush_flags |= VM_PAGER_PUT_SYNC;
			vm_page_flag_set(p, PG_WINATCFLS);
			info->cleancount +=
				vm_pageout_page(p, &max_launder,
						&vnodes_skipped,
						&vpfailed, 1, vmflush_flags);
		} else {
			vm_page_wakeup(p);
			++info->cleancount;
		}
	} else {
		vm_page_wakeup(p);
	}

	/*
	 * Must be at end to avoid SMP races.
	 */
done:
	lwkt_user_yield();
	return 0;
}

/*
 * Deactivate some number of pages in a map due to set RLIMIT_RSS limits.
 * This is relatively difficult to do.  We try to keep track of where we
 * left off last time to reduce scan overhead.
 *
 * Called when vm_pageout_memuse_mode is >= 1.
 */
void
vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit)
{
	vm_offset_t pgout_offset;
	struct pmap_pgscan_info info;
	int retries = 3;

	pgout_offset = map->pgout_offset;
again:
#if 0
	kprintf("%016jx ", pgout_offset);
#endif
	if (pgout_offset < VM_MIN_USER_ADDRESS)
		pgout_offset = VM_MIN_USER_ADDRESS;
	if (pgout_offset >= VM_MAX_USER_ADDRESS)
		pgout_offset = 0;
	info.pmap = vm_map_pmap(map);
	info.limit = limit;
	info.beg_addr = pgout_offset;
	info.end_addr = VM_MAX_USER_ADDRESS;
	info.callback = vm_pageout_mdp_callback;
	info.cleancount = 0;
	info.actioncount = 0;
	info.busycount = 0;

	pmap_pgscan(&info);
	pgout_offset = info.offset;
#if 0
	kprintf("%016jx %08lx %08lx\n", pgout_offset,
		info.cleancount, info.actioncount);
#endif

	if (pgout_offset != VM_MAX_USER_ADDRESS &&
	    pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
		goto again;
	} else if (retries &&
		   pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
		--retries;
		goto again;
	}
	map->pgout_offset = pgout_offset;
}
#endif

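/*
 * Hypothetical usage sketch (illustration only, compiled out): how an
 * RSS-limit enforcement path might invoke the deactivation scan above.
 * The helper name and the way 'limit' is derived are assumptions; the
 * real callers live elsewhere in the VM/resource-limit code.
 */
#if 0
static void
example_enforce_rss(struct vmspace *vm, vm_pindex_t limit_in_pages)
{
	if (vm_pageout_memuse_mode >= 1)
		vm_pageout_map_deactivate_pages(&vm->vm_map, limit_in_pages);
}
#endif
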
/*
 * Called when the pageout scan wants to free a page.  We no longer
 * try to cycle the vm_object here with a reference & dealloc, which can
 * cause a non-trivial object collapse in a critical path.
 *
 * It is unclear why we cycled the ref_count in the past, perhaps to try
 * to optimize shadow chain collapses but I don't quite see why it would
 * be necessary.  An OBJ_DEAD object should terminate any and all vm_pages
 * synchronously and not have to be kicked-start.
 */
static void
vm_pageout_page_free(vm_page_t m)
{
	vm_page_protect(m, VM_PROT_NONE);
	vm_page_free(m);
}

/*
 * vm_pageout_scan does the dirty work for the pageout daemon.
 */
struct vm_pageout_scan_info {
	struct proc *bigproc;
	vm_offset_t bigsize;
};

static int vm_pageout_scan_callback(struct proc *p, void *data);

/*
 * Scan inactive queue
 *
 * WARNING! Can be called from two pagedaemon threads simultaneously.
 */
static int
vm_pageout_scan_inactive(int pass, int q, long avail_shortage,
			 long *vnodes_skipped)
{
	vm_page_t m;
	struct vm_page marker;
	struct vnode *vpfailed;		/* warning, allowed to be stale */
	int maxscan;
	long delta = 0;
	long max_launder;
	int isep;

	isep = (curthread == emergpager);

	/*
	 * Start scanning the inactive queue for pages we can move to the
	 * cache or free.  The scan will stop when the target is reached or
	 * we have scanned the entire inactive queue.  Note that m->act_count
	 * is not used to form decisions for the inactive queue, only for the
	 * active queue.
	 *
	 * max_launder limits the number of dirty pages we flush per scan.
	 * For most systems a smaller value (16 or 32) is more robust under
	 * extreme memory and disk pressure because any unnecessary writes
	 * to disk can result in extreme performance degradation.  However,
	 * systems with excessive dirty pages (especially when MAP_NOSYNC is
	 * used) will die horribly with limited laundering.  If the pageout
	 * daemon cannot clean enough pages in the first pass, we let it go
	 * all out in succeeding passes.
	 *
	 * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
	 *	  PAGES.
	 */
	if ((max_launder = vm_max_launder) <= 1)
		max_launder = 1;
	if (pass)
		max_launder = 10000;

	/*
	 * Initialize our marker
	 */
	bzero(&marker, sizeof(marker));
	marker.flags = PG_FICTITIOUS | PG_MARKER;
	marker.busy_count = PBUSY_LOCKED;
	marker.queue = PQ_INACTIVE + q;
	marker.pc = q;
	marker.wire_count = 1;

	/*
	 * Inactive queue scan.
	 *
	 * NOTE: The vm_page must be spinlocked before the queue to avoid
	 *	 deadlocks, so it is easiest to simply iterate the loop
	 *	 with the queue unlocked at the top.
	 */
	vpfailed = NULL;

	vm_page_queues_spin_lock(PQ_INACTIVE + q);
	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
	maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt;

	/*
	 * Queue locked at top of loop to avoid stack marker issues.
	 */
	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
	       maxscan-- > 0 && avail_shortage - delta > 0)
	{
		int count;

		KKASSERT(m->queue == PQ_INACTIVE + q);
		TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl,
			     &marker, pageq);
		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m,
				   &marker, pageq);
		mycpu->gd_cnt.v_pdpages++;

		/*
		 * Skip marker pages (atomic against other markers to avoid
		 * infinite hop-over scans).
		 */
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Try to busy the page.  Don't mess with pages which are
		 * already busy or reorder them in the queue.
		 */
		if (vm_page_busy_try(m, TRUE))
			continue;

		/*
		 * Remaining operations run with the page busy and neither
		 * the page or the queue will be spin-locked.
		 */
		vm_page_queues_spin_unlock(PQ_INACTIVE + q);
		KKASSERT(m->queue == PQ_INACTIVE + q);

		/*
		 * The emergency pager runs when the primary pager gets
		 * stuck, which typically means the primary pager deadlocked
		 * on a vnode-backed page.  Therefore, the emergency pager
		 * must skip any complex objects.
		 *
		 * We disallow VNODEs unless they are VCHR whose device ops
		 * do not flag D_NOEMERGPGR.
		 */
		if (isep && m->object) {
			struct vnode *vp;

			switch(m->object->type) {
			case OBJT_DEFAULT:
			case OBJT_SWAP:
				/*
				 * Allow anonymous memory and assume that
				 * swap devices are not complex, since it's
				 * kinda worthless if we can't swap out dirty
				 * anonymous pages.
				 */
				break;
			case OBJT_VNODE:
				/*
				 * Allow VCHR device if the D_NOEMERGPGR
				 * flag is not set, deny other vnode types
				 * as being too complex.
				 */
				vp = m->object->handle;
				if (vp && vp->v_type == VCHR &&
				    vp->v_rdev && vp->v_rdev->si_ops &&
				    (vp->v_rdev->si_ops->head.flags &
				     D_NOEMERGPGR) == 0) {
					break;
				}
				/* Deny - fall through */
			default:
				/*
				 * Deny
				 */
				vm_page_wakeup(m);
				vm_page_queues_spin_lock(PQ_INACTIVE + q);
				lwkt_yield();
				continue;
			}
		}

		/*
		 * Try to pageout the page and perhaps other nearby pages.
		 */
		count = vm_pageout_page(m, &max_launder, vnodes_skipped,
					&vpfailed, pass, 0);
		delta += count;

		/*
		 * Systems with a ton of memory can wind up with huge
		 * deactivation counts.  Because the inactive scan is
		 * doing a lot of flushing, the combination can result
		 * in excessive paging even in situations where other
		 * unrelated threads free up sufficient VM.
		 *
		 * To deal with this we abort the nominal active->inactive
		 * scan before we hit the inactive target when free+cache
		 * levels have reached a reasonable target.
		 *
		 * When deciding to stop early we need to add some slop to
		 * the test and we need to return full completion to the caller
		 * to prevent the caller from thinking there is something
		 * wrong and issuing a low-memory+swap warning or pkill.
		 *
		 * A deficit forces paging regardless of the state of the
		 * VM page queues (used for RSS enforcement).
		 */
		lwkt_yield();
		vm_page_queues_spin_lock(PQ_INACTIVE + q);
		if (vm_paging_target() < -vm_max_launder) {
			/*
			 * Stopping early, return full completion to caller.
			 */
			if (delta < avail_shortage)
				delta = avail_shortage;
			break;
		}
	}

	/* page queue still spin-locked */
	TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
	vm_page_queues_spin_unlock(PQ_INACTIVE + q);

	return (delta);
}

/*
 * Pageout the specified page, return the total number of pages paged out
 * (this routine may cluster).
 *
 * The page must be busied and soft-busied by the caller and will be disposed
 * of by this function.
 */
static int
vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp,
		struct vnode **vpfailedp, int pass, int vmflush_flags)
{
	vm_object_t object;
	int actcount;
	int count = 0;

	/*
	 * It is possible for a page to be busied ad-hoc (e.g. the
	 * pmap_collect() code) and wired and race against the
	 * allocation of a new page.  vm_page_alloc() may be forced
	 * to deactivate the wired page in which case it winds up
	 * on the inactive queue and must be handled here.  We
	 * correct the problem simply by unqueuing the page.
	 */
	if (m->wire_count) {
		vm_page_unqueue_nowakeup(m);
		vm_page_wakeup(m);
		kprintf("WARNING: pagedaemon: wired page on "
			"inactive queue %p\n", m);
		return 0;
	}

	/*
	 * A held page may be undergoing I/O, so skip it.
	 */
	if (m->hold_count) {
		vm_page_and_queue_spin_lock(m);
		if (m->queue - m->pc == PQ_INACTIVE) {
			TAILQ_REMOVE(
				&vm_page_queues[m->queue].pl, m, pageq);
			TAILQ_INSERT_TAIL(
				&vm_page_queues[m->queue].pl, m, pageq);
			++vm_swapcache_inactive_heuristic;
		}
		vm_page_and_queue_spin_unlock(m);
		vm_page_wakeup(m);
		return 0;
	}

	if (m->object == NULL || m->object->ref_count == 0) {
		/*
		 * If the object is not being used, we ignore previous
		 * references.
		 */
		vm_page_flag_clear(m, PG_REFERENCED);
		pmap_clear_reference(m);
		/* fall through to end */
	} else if (((m->flags & PG_REFERENCED) == 0) &&
		   (actcount = pmap_ts_referenced(m))) {
		/*
		 * Otherwise, if the page has been referenced while
		 * in the inactive queue, we bump the "activation
		 * count" upwards, making it less likely that the
		 * page will be added back to the inactive queue
		 * prematurely again.  Here we check the page tables
		 * (or emulated bits, if any), given the upper level
		 * VM system not knowing anything about existing
		 * references.
		 */
		vm_page_activate(m);
		m->act_count += (actcount + ACT_ADVANCE);
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * (m) is still busied.
	 *
	 * If the upper level VM system knows about any page
	 * references, we activate the page.  We also set the
	 * "activation count" higher than normal so that we will less
	 * likely place pages back onto the inactive queue again.
	 */
	if ((m->flags & PG_REFERENCED) != 0) {
		vm_page_flag_clear(m, PG_REFERENCED);
		actcount = pmap_ts_referenced(m);
		vm_page_activate(m);
		m->act_count += (actcount + ACT_ADVANCE + 1);
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * If the upper level VM system doesn't know anything about
	 * the page being dirty, we have to check for it again.  As
	 * far as the VM code knows, any partially dirty pages are
	 * fully dirty.
	 *
	 * Pages marked PG_WRITEABLE may be mapped into the user
	 * address space of a process running on another cpu.  A
	 * user process (without holding the MP lock) running on
	 * another cpu may be able to touch the page while we are
	 * trying to remove it.  vm_page_cache() will handle this
	 * case for us.
	 */
	if (m->dirty == 0) {
		vm_page_test_dirty(m);
	} else {
		vm_page_dirty(m);
	}

	if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
		/*
		 * Invalid pages can be easily freed
		 */
		vm_pageout_page_free(m);
		mycpu->gd_cnt.v_dfree++;
		++count;
	} else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
		/*
		 * Clean pages can be placed onto the cache queue.
		 * This effectively frees them.
		 */
		vm_page_cache(m);
		++count;
	} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
		/*
		 * Dirty pages need to be paged out, but flushing
		 * a page is extremely expensive versus freeing
		 * a clean page.  Rather than artificially limiting
		 * the number of pages we can flush, we instead give
		 * dirty pages extra priority on the inactive queue
		 * by forcing them to be cycled through the queue
		 * twice before being flushed, after which the
		 * (now clean) page will cycle through once more
		 * before being freed.  This significantly extends
		 * the thrash point for a heavily loaded machine.
		 */
		vm_page_flag_set(m, PG_WINATCFLS);
		vm_page_and_queue_spin_lock(m);
		if (m->queue - m->pc == PQ_INACTIVE) {
			TAILQ_REMOVE(
				&vm_page_queues[m->queue].pl, m, pageq);
			TAILQ_INSERT_TAIL(
				&vm_page_queues[m->queue].pl, m, pageq);
			++vm_swapcache_inactive_heuristic;
		}
		vm_page_and_queue_spin_unlock(m);
		vm_page_wakeup(m);
	} else if (*max_launderp > 0) {
		/*
		 * We always want to try to flush some dirty pages if
		 * we encounter them, to keep the system stable.
		 * Normally this number is small, but under extreme
		 * pressure where there are insufficient clean pages
		 * on the inactive queue, we may have to go all out.
		 */
		int swap_pageouts_ok;
		struct vnode *vp = NULL;

		swap_pageouts_ok = 0;
		object = m->object;
		if (object &&
		    (object->type != OBJT_SWAP) &&
		    (object->type != OBJT_DEFAULT)) {
			swap_pageouts_ok = 1;
		} else {
			swap_pageouts_ok = !(defer_swap_pageouts ||
					     disable_swap_pageouts);
			swap_pageouts_ok |= (!disable_swap_pageouts &&
					     defer_swap_pageouts &&
					     vm_page_count_min(0));
		}

		/*
		 * We don't bother paging objects that are "dead".
		 * Those objects are in a "rundown" state.
		 */
		if (!swap_pageouts_ok ||
		    (object == NULL) ||
		    (object->flags & OBJ_DEAD)) {
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_INACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[m->queue].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[m->queue].pl,
					m, pageq);
				++vm_swapcache_inactive_heuristic;
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
			return 0;
		}

		/*
		 * (m) is still busied.
		 *
		 * The object is already known NOT to be dead.  It
		 * is possible for the vget() to block the whole
		 * pageout daemon, but the new low-memory handling
		 * code should prevent it.
		 *
		 * The previous code skipped locked vnodes and, worse,
		 * reordered pages in the queue.  This results in
		 * completely non-deterministic operation because,
		 * quite often, a vm_fault has initiated an I/O and
		 * is holding a locked vnode at just the point where
		 * the pageout daemon is woken up.
		 *
		 * We can't wait forever for the vnode lock, we might
		 * deadlock due to a vn_read() getting stuck in
		 * vm_wait while holding this vnode.  We skip the
		 * vnode if we can't get it in a reasonable amount
		 * of time.
		 *
		 * vpfailed is used to (try to) avoid the case where
		 * a large number of pages are associated with a
		 * locked vnode, which could cause the pageout daemon
		 * to stall for an excessive amount of time.
		 */
		if (object->type == OBJT_VNODE) {
			int flags;

			vp = object->handle;
			flags = LK_EXCLUSIVE;
			if (vp == *vpfailedp)
				flags |= LK_NOWAIT;
			else
				flags |= LK_TIMELOCK;
			vm_page_hold(m);
			vm_page_wakeup(m);

			/*
			 * We have unbusied (m) temporarily so we can
			 * acquire the vp lock without deadlocking.
			 * (m) is held to prevent destruction.
			 */
			if (vget(vp, flags) != 0) {
				*vpfailedp = vp;
				++pageout_lock_miss;
				if (object->flags & OBJ_MIGHTBEDIRTY)
					++*vnodes_skippedp;
				vm_page_unhold(m);
				return 0;
			}

			/*
			 * The page might have been moved to another
			 * queue during potential blocking in vget()
			 * above.  The page might have been freed and
			 * reused for another vnode.  The object might
			 * have been reused for another vnode.
			 */
			if (m->queue - m->pc != PQ_INACTIVE ||
			    m->object != object ||
			    object->handle != vp) {
				if (object->flags & OBJ_MIGHTBEDIRTY)
					++*vnodes_skippedp;
				vput(vp);
				vm_page_unhold(m);
				return 0;
			}

			/*
			 * The page may have been busied during the
			 * blocking in vput();  We don't move the
			 * page back onto the end of the queue so that
			 * statistics are more correct if we don't.
			 */
			if (vm_page_busy_try(m, TRUE)) {
				vput(vp);
				vm_page_unhold(m);
				return 0;
			}
			vm_page_unhold(m);

			/*
			 * (m) is busied again
			 *
			 * We own the busy bit and remove our hold
			 * bit.  If the page is still held it
			 * might be undergoing I/O, so skip it.
			 */
			if (m->hold_count) {
				vm_page_and_queue_spin_lock(m);
				if (m->queue - m->pc == PQ_INACTIVE) {
					TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq);
					TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq);
					++vm_swapcache_inactive_heuristic;
				}
				vm_page_and_queue_spin_unlock(m);
				if (object->flags & OBJ_MIGHTBEDIRTY)
					++*vnodes_skippedp;
				vm_page_wakeup(m);
				vput(vp);
				return 0;
			}
			/* (m) is left busied as we fall through */
		}

		/*
		 * page is busy and not held here.
		 *
		 * If a page is dirty, then it is either being washed
		 * (but not yet cleaned) or it is still in the
		 * laundry.  If it is still in the laundry, then we
		 * start the cleaning operation.
		 *
		 * decrement inactive_shortage on success to account
		 * for the (future) cleaned page.  Otherwise we
		 * could wind up laundering or cleaning too many
		 * pages.
		 *
		 * NOTE: Cleaning the page here does not cause
		 *	 force_deficit to be adjusted, because the
		 *	 page is not being freed or moved to the
		 *	 cache.
		 */
		count = vm_pageout_clean_helper(m, vmflush_flags);
		*max_launderp -= count;

		/*
		 * Clean ate busy, page no longer accessible
		 */
		if (vp != NULL)
			vput(vp);
	} else {
		vm_page_wakeup(m);
	}
	return count;
}

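/*
 * Illustrative timeline for a dirty, inactive, backed page under the logic
 * above: on its first visit during pass 0, PG_WINATCFLS is set and the page
 * is simply requeued at the tail (no flush).  On a later visit (flag already
 * set, or pass > 0) the laundering branch runs and the page is flushed via
 * vm_pageout_clean_helper(); once clean it can be cached or freed on a
 * subsequent pass.  This is the "cycle twice before flushing" behaviour
 * described in the dirty-page comment.
 */
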
/*
 * Scan active queue
 *
 * WARNING! Can be called from two pagedaemon threads simultaneously.
 */
static int
vm_pageout_scan_active(int pass, int q,
		       long avail_shortage, long inactive_shortage,
		       long *recycle_countp)
{
	struct vm_page marker;
	vm_page_t m;
	int actcount;
	long delta = 0;
	long maxscan;
	int isep;

	isep = (curthread == emergpager);

	/*
	 * We want to move pages from the active queue to the inactive
	 * queue to get the inactive queue to the inactive target.  If
	 * we still have a page shortage from above we try to directly free
	 * clean pages instead of moving them.
	 *
	 * If we do still have a shortage we keep track of the number of
	 * pages we free or cache (recycle_count) as a measure of thrashing
	 * between the active and inactive queues.
	 *
	 * If we were able to completely satisfy the free+cache targets
	 * from the inactive pool we limit the number of pages we move
	 * from the active pool to the inactive pool to 2x the pages we
	 * had removed from the inactive pool (with a minimum of 1/5 the
	 * inactive target).  If we were not able to completely satisfy
	 * the free+cache targets we go for the whole target aggressively.
	 *
	 * NOTE: Both variables can end up negative.
	 * NOTE: We are still in a critical section.
	 *
	 * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
	 *	  PAGES.
	 */

	bzero(&marker, sizeof(marker));
	marker.flags = PG_FICTITIOUS | PG_MARKER;
	marker.busy_count = PBUSY_LOCKED;
	marker.queue = PQ_ACTIVE + q;
	marker.pc = q;
	marker.wire_count = 1;

	vm_page_queues_spin_lock(PQ_ACTIVE + q);
	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
	maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt;

	/*
	 * Queue locked at top of loop to avoid stack marker issues.
	 */
	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
	       maxscan-- > 0 && (avail_shortage - delta > 0 ||
	       inactive_shortage > 0))
	{
		KKASSERT(m->queue == PQ_ACTIVE + q);
		TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
			     &marker, pageq);
		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
				   &marker, pageq);

		/*
		 * Skip marker pages (atomic against other markers to avoid
		 * infinite hop-over scans).
		 */
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Try to busy the page.  Don't mess with pages which are
		 * already busy or reorder them in the queue.
		 */
		if (vm_page_busy_try(m, TRUE))
			continue;

		/*
		 * Remaining operations run with the page busy and neither
		 * the page or the queue will be spin-locked.
		 */
		vm_page_queues_spin_unlock(PQ_ACTIVE + q);
		KKASSERT(m->queue == PQ_ACTIVE + q);

		/*
		 * Don't deactivate pages that are held, even if we can
		 * busy them.  (XXX why not?)
		 */
		if (m->hold_count != 0) {
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
			goto next;
		}

		/*
		 * The emergency pager ignores vnode-backed pages as these
		 * are the pages that probably bricked the main pager.
		 */
		if (isep && m->object && m->object->type == OBJT_VNODE) {
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
			goto next;
		}

		/*
		 * The count for pagedaemon pages is done after checking the
		 * page for eligibility...
		 */
		mycpu->gd_cnt.v_pdpages++;

		/*
		 * Check to see "how much" the page has been used and clear
		 * the tracking access bits.  If the object has no references
		 * don't bother paying the expense.
		 */
		actcount = 0;
		if (m->object && m->object->ref_count != 0) {
			if (m->flags & PG_REFERENCED)
				++actcount;
			actcount += pmap_ts_referenced(m);
			if (actcount) {
				m->act_count += ACT_ADVANCE + actcount;
				if (m->act_count > ACT_MAX)
					m->act_count = ACT_MAX;
			}
		}
		vm_page_flag_clear(m, PG_REFERENCED);

		/*
		 * actcount is only valid if the object ref_count is non-zero.
		 * If the page does not have an object, actcount will be zero.
		 */
		if (actcount && m->object->ref_count != 0) {
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
		} else {
			switch(m->object->type) {
			case OBJT_DEFAULT:
			case OBJT_SWAP:
				m->act_count -= min(m->act_count,
						    vm_anonmem_decline);
				break;
			default:
				m->act_count -= min(m->act_count,
						    vm_filemem_decline);
				break;
			}
			if (vm_pageout_algorithm ||
			    (m->object == NULL) ||
			    (m->object && (m->object->ref_count == 0)) ||
			    m->act_count < pass + 1
			) {
				/*
				 * Deactivate the page.  If we had a
				 * shortage from our inactive scan try to
				 * free (cache) the page instead.
				 *
				 * Don't just blindly cache the page if
				 * we do not have a shortage from the
				 * inactive scan, that could lead to
				 * gigabytes being moved.
				 */
				--inactive_shortage;
				if (avail_shortage - delta > 0 ||
				    (m->object && (m->object->ref_count == 0)))
				{
					if (avail_shortage - delta > 0)
						++*recycle_countp;
					vm_page_protect(m, VM_PROT_NONE);
					if (m->dirty == 0 &&
					    (m->flags & PG_NEED_COMMIT) == 0 &&
					    avail_shortage - delta > 0) {
						vm_page_cache(m);
					} else {
						vm_page_deactivate(m);
						vm_page_wakeup(m);
					}
				} else {
					vm_page_deactivate(m);
					vm_page_wakeup(m);
				}
				++delta;
			} else {
				vm_page_and_queue_spin_lock(m);
				if (m->queue - m->pc == PQ_ACTIVE) {
					TAILQ_REMOVE(
						&vm_page_queues[PQ_ACTIVE + q].pl,
						m, pageq);
					TAILQ_INSERT_TAIL(
						&vm_page_queues[PQ_ACTIVE + q].pl,
						m, pageq);
				}
				vm_page_and_queue_spin_unlock(m);
				vm_page_wakeup(m);
			}
		}
next:
		lwkt_yield();
		vm_page_queues_spin_lock(PQ_ACTIVE + q);
	}

	/*
	 * Clean out our local marker.
	 *
	 * Page queue still spin-locked.
	 */
	TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
	vm_page_queues_spin_unlock(PQ_ACTIVE + q);

	return (delta);
}

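/*
 * Illustrative decay example for the scan above (values are tunables):
 * an unreferenced file-backed page loses vm_filemem_decline from its
 * act_count on every visit, while an anonymous page loses only
 * vm_anonmem_decline (half as much by default), so file cache pages are
 * deactivated roughly twice as fast.  A page is deactivated once its
 * act_count falls below (pass + 1).
 */
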
/*
 * The number of actually free pages can drop down to v_free_reserved,
 * we try to build the free count back above v_free_min.  Note that
 * vm_paging_needed() also returns TRUE if v_free_count is not at
 * least v_free_min so that is the minimum we must build the free
 * count to.
 *
 * We use a slightly higher target to improve hysteresis,
 * ((v_free_target + v_free_min) / 2).  Since v_free_target
 * is usually the same as v_cache_min this maintains about
 * half the pages in the free queue as are in the cache queue,
 * providing pretty good pipelining for pageout operation.
 *
 * The system operator can manipulate vm.v_cache_min and
 * vm.v_free_target to tune the pageout daemon.  Be sure
 * to keep vm.v_free_min < vm.v_free_target.
 *
 * Note that the original paging target is to get at least
 * (free_min + cache_min) into (free + cache).  The slightly
 * higher target will shift additional pages from cache to free
 * without affecting the original paging target in order to
 * maintain better hysteresis and not have the free count always
 * be dead-on v_free_min.
 *
 * NOTE: we are still in a critical section.
 *
 * Pages moved from PQ_CACHE to totally free are not counted in the
 * pages_freed counter.
 *
 * WARNING! Can be called from two pagedaemon threads simultaneously.
 */
static void
vm_pageout_scan_cache(long avail_shortage, int pass,
		      long vnodes_skipped, long recycle_count)
{
	static int lastkillticks;
	struct vm_pageout_scan_info info;
	vm_page_t m;
	int isep;

	isep = (curthread == emergpager);

	while (vmstats.v_free_count <
	       (vmstats.v_free_min + vmstats.v_free_target) / 2) {
		/*
		 * This steals some code from vm/vm_page.c
		 *
		 * Create two rovers and adjust the code to reduce
		 * chances of them winding up at the same index (which
		 * can cause a lot of contention).
		 */
		static int cache_rover[2] = { 0, PQ_L2_MASK / 2 };

		if (((cache_rover[0] ^ cache_rover[1]) & PQ_L2_MASK) == 0)
			goto next_rover;

		m = vm_page_list_find(PQ_CACHE, cache_rover[isep] & PQ_L2_MASK);
		if (m == NULL)
			break;
		/* page is returned removed from its queue and spinlocked */
		if (vm_page_busy_try(m, TRUE)) {
			vm_page_deactivate_locked(m);
			vm_page_spin_unlock(m);
			continue;
		}
		vm_page_spin_unlock(m);
		pagedaemon_wakeup();
		lwkt_yield();

		/*
		 * Remaining operations run with the page busy and neither
		 * the page or the queue will be spin-locked.
		 */
		if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
		    m->hold_count ||
		    m->wire_count) {
			vm_page_deactivate(m);
			vm_page_wakeup(m);
			continue;
		}
		KKASSERT((m->flags & PG_MAPPED) == 0);
		KKASSERT(m->dirty == 0);
		vm_pageout_page_free(m);
		mycpu->gd_cnt.v_dfree++;
next_rover:
		if (isep)
			cache_rover[1] -= PQ_PRIME2;
		else
			cache_rover[0] += PQ_PRIME2;
	}

#if !defined(NO_SWAPPING)
	/*
	 * Idle process swapout -- run once per second.
	 */
	if (vm_swap_idle_enabled) {
		static time_t lsec;
		if (time_uptime != lsec) {
			atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_IDLE);
			vm_req_vmdaemon();
			lsec = time_uptime;
		}
	}
#endif

	/*
	 * If we didn't get enough free pages, and we have skipped a vnode
	 * in a writeable object, wakeup the sync daemon.  And kick swapout
	 * if we did not get enough free pages.
	 */
	if (vm_paging_target() > 0) {
		if (vnodes_skipped && vm_page_count_min(0))
			speedup_syncer(NULL);
#if !defined(NO_SWAPPING)
		if (vm_swap_enabled && vm_page_count_target()) {
			atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_NORMAL);
			vm_req_vmdaemon();
		}
#endif
	}

	/*
	 * Handle catastrophic conditions.  Under good conditions we should
	 * be at the target, well beyond our minimum.  If we could not even
	 * reach our minimum the system is under heavy stress.  But just being
	 * under heavy stress does not trigger process killing.
	 *
	 * We consider ourselves to have run out of memory if the swap pager
	 * is full and avail_shortage is still positive.  The secondary check
	 * ensures that we do not kill processes if the instantaneous
	 * availability is good, even if the pageout daemon pass says it
	 * couldn't get to the target.
	 *
	 * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT HANDLE SWAP FULL
	 *	  SITUATIONS.
	 */
	if (swap_pager_almost_full &&
	    pass > 0 &&
	    isep == 0 &&
	    (vm_page_count_min(recycle_count) || avail_shortage > 0)) {
		kprintf("Warning: system low on memory+swap "
			"shortage %ld for %d ticks!\n",
			avail_shortage, ticks - swap_fail_ticks);
		if (bootverbose)
			kprintf("Metrics: spaf=%d spf=%d pass=%d "
				"avail=%ld target=%ld last=%u\n",
				swap_pager_almost_full,
				swap_pager_full,
				pass,
				avail_shortage,
				vm_paging_target(),
				(unsigned int)(ticks - lastkillticks));
	}
	if (swap_pager_full &&
	    pass > 1 &&
	    isep == 0 &&
	    avail_shortage > 0 &&
	    vm_paging_target() > 0 &&
	    (unsigned int)(ticks - lastkillticks) >= hz) {
		/*
		 * Kill something, maximum rate once per second to give
		 * the process time to free up sufficient memory.
		 */
		lastkillticks = ticks;
		info.bigproc = NULL;
		info.bigsize = 0;
		allproc_scan(vm_pageout_scan_callback, &info, 0);
		if (info.bigproc != NULL) {
			kprintf("Try to kill process %d %s\n",
				info.bigproc->p_pid, info.bigproc->p_comm);
			info.bigproc->p_nice = PRIO_MIN;
			info.bigproc->p_usched->resetpriority(
				FIRST_LWP_IN_PROC(info.bigproc));
			atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL);
			killproc(info.bigproc, "out of swap space");
			wakeup(&vmstats.v_free_count);
			PRELE(info.bigproc);
		}
	}
}

static int
vm_pageout_scan_callback(struct proc *p, void *data)
{
	struct vm_pageout_scan_info *info = data;
	vm_offset_t size;

	/*
	 * Never kill system processes or init.  If we have configured swap
	 * then try to avoid killing low-numbered pids.
	 */
	if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) ||
	    ((p->p_pid < 48) && (vm_swap_size != 0))) {
		return (0);
	}

	lwkt_gettoken(&p->p_token);

	/*
	 * if the process is in a non-running type state,
	 * don't touch it.
	 */
	if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
		lwkt_reltoken(&p->p_token);
		return (0);
	}

	/*
	 * Get the approximate process size.  Note that anonymous pages
	 * with backing swap will be counted twice, but there should not
	 * be too many such pages due to the stress the VM system is
	 * under at this point.
	 */
	size = vmspace_anonymous_count(p->p_vmspace) +
	       vmspace_swap_count(p->p_vmspace);

	/*
	 * If this process is bigger than the biggest one,
	 * remember it.
	 */
	if (info->bigsize < size) {
		if (info->bigproc)
			PRELE(info->bigproc);
		PHOLD(p);
		info->bigproc = p;
		info->bigsize = size;
	}
	lwkt_reltoken(&p->p_token);
	lwkt_yield();

	return(0);
}

/*
 * This routine tries to maintain the pseudo LRU active queue,
 * so that during long periods of time where there is no paging,
 * some statistic accumulation still occurs.  This code
 * helps the situation where paging just starts to occur.
 */
static void
vm_pageout_page_stats(int q)
{
	static int fullintervalcount = 0;
	struct vm_page marker;
	vm_page_t m;
	long pcount, tpcount;		/* Number of pages to check */
	long page_shortage;

	page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max +
			 vmstats.v_free_min) -
			(vmstats.v_free_count + vmstats.v_inactive_count +
			 vmstats.v_cache_count);

	if (page_shortage <= 0)
		return;

	pcount = vm_page_queues[PQ_ACTIVE + q].lcnt;
	fullintervalcount += vm_pageout_stats_interval;
	if (fullintervalcount < vm_pageout_full_stats_interval) {
		tpcount = (vm_pageout_stats_max * pcount) /
			  vmstats.v_page_count + 1;
		if (pcount > tpcount)
			pcount = tpcount;
	} else {
		fullintervalcount = 0;
	}

	bzero(&marker, sizeof(marker));
	marker.flags = PG_FICTITIOUS | PG_MARKER;
	marker.busy_count = PBUSY_LOCKED;
	marker.queue = PQ_ACTIVE + q;
	marker.pc = q;
	marker.wire_count = 1;

	vm_page_queues_spin_lock(PQ_ACTIVE + q);
	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);

	/*
	 * Queue locked at top of loop to avoid stack marker issues.
	 */
	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
	       pcount-- > 0)
	{
		int actcount;

		KKASSERT(m->queue == PQ_ACTIVE + q);
		TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
				   &marker, pageq);

		/*
		 * Skip marker pages (atomic against other markers to avoid
		 * infinite hop-over scans).
		 */
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Ignore pages we can't busy
		 */
		if (vm_page_busy_try(m, TRUE))
			continue;

		/*
		 * Remaining operations run with the page busy and neither
		 * the page or the queue will be spin-locked.
		 */
		vm_page_queues_spin_unlock(PQ_ACTIVE + q);
		KKASSERT(m->queue == PQ_ACTIVE + q);

		/*
		 * We now have a safely busied page, the page and queue
		 * spinlocks have been released.
		 *
		 * Ignore held pages
		 */
		if (m->hold_count) {
			vm_page_wakeup(m);
			goto next;
		}

		/*
		 * Calculate activity
		 */
		actcount = 0;
		if (m->flags & PG_REFERENCED) {
			vm_page_flag_clear(m, PG_REFERENCED);
			actcount += 1;
		}
		actcount += pmap_ts_referenced(m);

		/*
		 * Update act_count and move page to end of queue.
		 */
		if (actcount) {
			m->act_count += ACT_ADVANCE + actcount;
			if (m->act_count > ACT_MAX)
				m->act_count = ACT_MAX;
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
			goto next;
		}

		if (m->act_count == 0) {
			/*
			 * We turn off page access, so that we have
			 * more accurate RSS stats.  We don't do this
			 * in the normal page deactivation when the
			 * system is loaded VM wise, because the
			 * cost of the large number of page protect
			 * operations would be higher than the value
			 * of doing the operation.
			 *
			 * We use the marker to save our place so
			 * we can release the spin lock.  Both (m)
			 * and (next) will be invalid.
			 */
			vm_page_protect(m, VM_PROT_NONE);
			vm_page_deactivate(m);
		} else {
			m->act_count -= min(m->act_count, ACT_DECLINE);
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
		}
		vm_page_wakeup(m);
next:
		vm_page_queues_spin_lock(PQ_ACTIVE + q);
	}

	/*
	 * Remove our local marker
	 *
	 * Page queue still spin-locked.
	 */
	TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
	vm_page_queues_spin_unlock(PQ_ACTIVE + q);
}

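/*
 * Worked example for the partial-scan sizing above (illustrative numbers):
 * with vm_pageout_stats_max == 512, an active queue of pcount == 10000
 * pages and v_page_count == 100000, tpcount = (512 * 10000) / 100000 + 1
 * = 52, so only 52 pages are sampled on a partial pass.  Roughly every
 * vm_pageout_full_stats_interval seconds the accumulated interval resets
 * and the entire queue is scanned instead.
 */
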
                         */
                        vm_page_protect(m, VM_PROT_NONE);
                        vm_page_deactivate(m);
                } else {
                        m->act_count -= min(m->act_count, ACT_DECLINE);
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_ACTIVE) {
                                TAILQ_REMOVE(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                                TAILQ_INSERT_TAIL(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                }
                vm_page_wakeup(m);
next:
                vm_page_queues_spin_lock(PQ_ACTIVE + q);
        }

        /*
         * Remove our local marker.
         *
         * Page queue still spin-locked.
         */
        TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
        vm_page_queues_spin_unlock(PQ_ACTIVE + q);
}

static int
vm_pageout_free_page_calc(vm_size_t count)
{
        if (count < vmstats.v_page_count)
                return 0;
        /*
         * free_reserved needs to include enough for the largest swap pager
         * structures plus enough for any pv_entry structs when paging.
         *
         * v_free_min           normal allocations
         * v_free_reserved      system allocations
         * v_pageout_free_min   allocations by pageout daemon
         * v_interrupt_free_min low level allocations (e.g. swap structures)
         */
        if (vmstats.v_page_count > 1024)
                vmstats.v_free_min = 64 + (vmstats.v_page_count - 1024) / 200;
        else
                vmstats.v_free_min = 64;

        /*
         * Make sure the vmmeter slop can't blow out our global minimums.
         *
         * However, to accommodate weird configurations (vkernels with many
         * cpus and little memory, or artificially reduced hw.physmem), do
         * not allow v_free_min to exceed 1/20 of RAM or the pageout daemon
         * will go out of control.
         */
        if (vmstats.v_free_min < VMMETER_SLOP_COUNT * ncpus * 10)
                vmstats.v_free_min = VMMETER_SLOP_COUNT * ncpus * 10;
        if (vmstats.v_free_min > vmstats.v_page_count / 20)
                vmstats.v_free_min = vmstats.v_page_count / 20;

        vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
        vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
        vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
        vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;

        return 1;
}


/*
 * vm_pageout is the high level pageout daemon.  TWO kernel threads run
 * this daemon, the primary pageout daemon and the emergency pageout daemon.
 *
 * The emergency pageout daemon takes over when the primary pageout daemon
 * deadlocks.  The emergency pageout daemon ONLY pages out to swap, thus
 * avoiding the many low-memory deadlocks which can occur when paging out
 * to VFSs.
 */
static void
vm_pageout_thread(void)
{
        int pass;
        int q;
        int q1iterator = 0;
        int q2iterator = 0;
        int isep;

        curthread->td_flags |= TDF_SYSTHREAD;

        /*
         * We only need to set up once.
         */
        isep = 0;
        if (curthread == emergpager) {
                isep = 1;
                goto skip_setup;
        }

        /*
         * Initialize some paging parameters.
         */
        vm_pageout_free_page_calc(vmstats.v_page_count);

        /*
         * v_free_target and v_cache_min control pageout hysteresis.  Note
         * that these are more a measure of the VM cache queue hysteresis
         * than the VM free queue.  Specifically, v_free_target is the
         * high water mark (free+cache pages).
         *
         * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
         * low water mark, while v_free_min is the stop.
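         *
         * (Illustrative numbers only, assuming 4 KiB pages, roughly 1 GiB
         * of RAM and ignoring the VMMETER_SLOP/ncpus clamps above:
         * v_page_count is about 262144, so vm_pageout_free_page_calc()
         * computes v_free_min = 64 + (262144 - 1024) / 200, i.e. about
         * 1369 pages (~5 MiB), and the assignment below then yields
         * v_free_target = 4 * v_free_min + v_free_reserved, or roughly
         * 6200 pages (~24 MiB).)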
         * v_cache_min must be big enough to handle memory needs while the
         * pageout daemon is signalled and run to free more pages.
         */
        if (vmstats.v_free_count > 6144)
                vmstats.v_free_target = 4 * vmstats.v_free_min +
                                        vmstats.v_free_reserved;
        else
                vmstats.v_free_target = 2 * vmstats.v_free_min +
                                        vmstats.v_free_reserved;

        /*
         * NOTE: With the new buffer cache b_act_count we want the default
         *       inactive target to be a percentage of available memory.
         *
         *       The inactive target essentially determines the minimum
         *       number of 'temporary' pages capable of caching one-time-use
         *       files when the VM system is otherwise full of pages
         *       belonging to multi-time-use files or active program data.
         *
         * NOTE: The inactive target is aggressively pursued only if the
         *       inactive queue becomes too small.  If the inactive queue
         *       is large enough to satisfy page movement to free+cache
         *       then it is repopulated more slowly from the active queue.
         *       This allows a general inactive_target default to be set.
         *
         *       There is an issue here for processes which sit mostly idle
         *       'overnight', such as sshd, tcsh, and X.  Any movement from
         *       the active queue will eventually cause such pages to
         *       recycle, causing a lot of paging in the morning.  To reduce
         *       the incidence of this, pages cycled out of the buffer cache
         *       are moved directly to the inactive queue if they were only
         *       used once or twice.
         *
         *       The vfs.vm_cycle_point sysctl can be used to adjust this.
         *       Increasing the value (up to 64) increases the number of
         *       buffer recyclements which go directly to the inactive queue.
         */
        if (vmstats.v_free_count > 2048) {
                vmstats.v_cache_min = vmstats.v_free_target;
                vmstats.v_cache_max = 2 * vmstats.v_cache_min;
        } else {
                vmstats.v_cache_min = 0;
                vmstats.v_cache_max = 0;
        }
        vmstats.v_inactive_target = vmstats.v_free_count / 4;

        /* XXX does not really belong here */
        if (vm_page_max_wired == 0)
                vm_page_max_wired = vmstats.v_free_count / 3;

        if (vm_pageout_stats_max == 0)
                vm_pageout_stats_max = vmstats.v_free_target;

        /*
         * Set interval in seconds for stats scan.
         */
        if (vm_pageout_stats_interval == 0)
                vm_pageout_stats_interval = 5;
        if (vm_pageout_full_stats_interval == 0)
                vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;

        /*
         * Set maximum free per pass.
         */
        if (vm_pageout_stats_free_max == 0)
                vm_pageout_stats_free_max = 5;

        swap_pager_swap_init();
        pass = 0;

        atomic_swap_int(&sequence_emerg_pager, 1);
        wakeup(&sequence_emerg_pager);

skip_setup:
        /*
         * Sequence emergency pager startup.
         */
        if (isep) {
                while (sequence_emerg_pager == 0)
                        tsleep(&sequence_emerg_pager, 0, "pstartup", hz);
        }

        /*
         * The pageout daemon is never done, so loop forever.
         *
         * WARNING! This code is being executed by two kernel threads
         *          potentially simultaneously.
         */
        while (TRUE) {
                int error;
                long avail_shortage;
                long inactive_shortage;
                long vnodes_skipped = 0;
                long recycle_count = 0;
                long tmp;

                /*
                 * Wait for an action request.  If we time out, check to
                 * see if paging is needed (in case the normal wakeup
                 * code raced us).
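                 *
                 * (Summary of the timing below: the primary daemon sleeps
                 * for at most vm_pageout_stats_interval seconds when idle,
                 * while the emergency daemon polls vm_pagedaemon_time and
                 * only begins scanning if the primary has not updated that
                 * timestamp for roughly two seconds while vm_pages_needed
                 * is set.)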
                 */
                if (isep) {
                        /*
                         * Emergency pagedaemon monitors the primary
                         * pagedaemon while vm_pages_needed != 0.
                         *
                         * The emergency pagedaemon only runs if VM paging
                         * is needed and the primary pagedaemon has not
                         * updated vm_pagedaemon_time for more than 2 seconds.
                         */
                        if (vm_pages_needed)
                                tsleep(&vm_pagedaemon_time, 0, "psleep", hz);
                        else
                                tsleep(&vm_pagedaemon_time, 0, "psleep", hz*10);
                        if (vm_pages_needed == 0) {
                                pass = 0;
                                continue;
                        }
                        if ((int)(ticks - vm_pagedaemon_time) < hz * 2) {
                                pass = 0;
                                continue;
                        }
                } else {
                        /*
                         * Primary pagedaemon
                         */
                        if (vm_pages_needed == 0) {
                                error = tsleep(&vm_pages_needed,
                                               0, "psleep",
                                               vm_pageout_stats_interval * hz);
                                if (error &&
                                    vm_paging_needed() == 0 &&
                                    vm_pages_needed == 0) {
                                        for (q = 0; q < PQ_L2_SIZE; ++q)
                                                vm_pageout_page_stats(q);
                                        continue;
                                }
                                vm_pagedaemon_time = ticks;
                                vm_pages_needed = 1;

                                /*
                                 * Wake the emergency pagedaemon up so it
                                 * can monitor us.  It will automatically
                                 * go back into a long sleep when
                                 * vm_pages_needed returns to 0.
                                 */
                                wakeup(&vm_pagedaemon_time);
                        }
                }

                mycpu->gd_cnt.v_pdwakeups++;

                /*
                 * Scan for INACTIVE->CLEAN/PAGEOUT
                 *
                 * This routine tries to avoid thrashing the system with
                 * unnecessary activity.
                 *
                 * Calculate our target for the number of free+cache pages we
                 * want to get to.  This is higher than the number that causes
                 * allocations to stall (severe) in order to provide
                 * hysteresis, and if we don't make it all the way but get to
                 * the minimum we're happy.  Goose it a bit if there are
                 * multiple requests for memory.
                 *
                 * Don't reduce avail_shortage inside the loop or the
                 * PQAVERAGE() calculation will break.
                 *
                 * NOTE! deficit is differentiated from avail_shortage as
                 *       REQUIRING at least (deficit) pages to be cleaned,
                 *       even if the page queues are in good shape.  This
                 *       is used primarily for handling per-process
                 *       RLIMIT_RSS and may also see small values when
                 *       processes block due to low memory.
                 */
                vmstats_rollup();
                if (isep == 0)
                        vm_pagedaemon_time = ticks;
                avail_shortage = vm_paging_target() + vm_pageout_deficit;
                vm_pageout_deficit = 0;

                if (avail_shortage > 0) {
                        long delta = 0;
                        int qq;

                        qq = q1iterator;
                        for (q = 0; q < PQ_L2_SIZE; ++q) {
                                delta += vm_pageout_scan_inactive(
                                                pass,
                                                qq & PQ_L2_MASK,
                                                PQAVERAGE(avail_shortage),
                                                &vnodes_skipped);
                                if (isep)
                                        --qq;
                                else
                                        ++qq;
                                if (avail_shortage - delta <= 0)
                                        break;
                        }
                        avail_shortage -= delta;
                        q1iterator = qq;
                }

                /*
                 * Figure out how many active pages we must deactivate.  If
                 * we were able to reach our target with just the inactive
                 * scan above we limit the number of active pages we
                 * deactivate to reduce unnecessary work.
                 */
                vmstats_rollup();
                if (isep == 0)
                        vm_pagedaemon_time = ticks;
                inactive_shortage = vmstats.v_inactive_target -
                                    vmstats.v_inactive_count;

                /*
                 * If we were unable to free sufficient inactive pages to
                 * satisfy the free/cache queue requirements then simply
                 * reaching the inactive target may not be good enough.
                 * Try to deactivate pages in excess of the target based
                 * on the shortfall.
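                 *
                 * (For example, with an avail_shortage of 1000 pages and a
                 * v_inactive_target of 50000 pages, the code below would
                 * add min(2 * 1000, 50000 / 10) = 2000 extra pages to
                 * inactive_shortage; illustrative numbers only.)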
                 *
                 * However, to prevent thrashing the VM system, do not
                 * deactivate more than an additional 1/10 of the inactive
                 * target's worth of active pages.
                 */
                if (avail_shortage > 0) {
                        tmp = avail_shortage * 2;
                        if (tmp > vmstats.v_inactive_target / 10)
                                tmp = vmstats.v_inactive_target / 10;
                        inactive_shortage += tmp;
                }

                /*
                 * Only trigger a pmap cleanup on inactive shortage.
                 */
                if (isep == 0 && inactive_shortage > 0) {
                        pmap_collect();
                }

                /*
                 * Scan for ACTIVE->INACTIVE
                 *
                 * Only trigger on inactive shortage.  Triggering on
                 * avail_shortage can starve the active queue with
                 * unnecessary active->inactive transitions and destroy
                 * performance.
                 *
                 * If this is the emergency pager, always try to move
                 * a few pages from active to inactive because the inactive
                 * queue might have enough pages, but not enough anonymous
                 * pages.
                 */
                if (isep && inactive_shortage < vm_emerg_launder)
                        inactive_shortage = vm_emerg_launder;

                if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) {
                        long delta = 0;
                        int qq;

                        qq = q2iterator;
                        for (q = 0; q < PQ_L2_SIZE; ++q) {
                                delta += vm_pageout_scan_active(
                                                pass,
                                                qq & PQ_L2_MASK,
                                                PQAVERAGE(avail_shortage),
                                                PQAVERAGE(inactive_shortage),
                                                &recycle_count);
                                if (isep)
                                        --qq;
                                else
                                        ++qq;
                                if (inactive_shortage - delta <= 0 &&
                                    avail_shortage - delta <= 0) {
                                        break;
                                }
                        }
                        inactive_shortage -= delta;
                        avail_shortage -= delta;
                        q2iterator = qq;
                }

                /*
                 * Scan for CACHE->FREE
                 *
                 * Finally free enough cache pages to meet our free page
                 * requirement and take more drastic measures if we are
                 * still in trouble.
                 */
                vmstats_rollup();
                if (isep == 0)
                        vm_pagedaemon_time = ticks;
                vm_pageout_scan_cache(avail_shortage, pass,
                                      vnodes_skipped, recycle_count);

                /*
                 * Wait for more work.
                 */
                if (avail_shortage > 0) {
                        ++pass;
                        if (pass < 10 && vm_pages_needed > 1) {
                                /*
                                 * Normal operation, additional processes
                                 * have already kicked us.  Retry immediately
                                 * unless swap space is completely full, in
                                 * which case delay a bit.
                                 */
                                if (swap_pager_full) {
                                        tsleep(&vm_pages_needed, 0, "pdelay",
                                               hz / 5);
                                } /* else immediate retry */
                        } else if (pass < 10) {
                                /*
                                 * Normal operation, fewer processes.  Delay
                                 * a bit but allow wakeups.  vm_pages_needed
                                 * is only adjusted against the primary
                                 * pagedaemon here.
                                 */
                                if (isep == 0)
                                        vm_pages_needed = 0;
                                tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
                                if (isep == 0)
                                        vm_pages_needed = 1;
                        } else if (swap_pager_full == 0) {
                                /*
                                 * We've taken too many passes, forced delay.
                                 */
                                tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
                        } else {
                                /*
                                 * Running out of memory, catastrophic
                                 * back-off to one-second intervals.
                                 */
                                tsleep(&vm_pages_needed, 0, "pdelay", hz);
                        }
                } else if (vm_pages_needed) {
                        /*
                         * Interlocked wakeup of waiters (non-optional).
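                         *
                         * (Threads blocked waiting for free pages sleep on
                         * &vmstats.v_free_count; they are only awakened
                         * below once the hysteresis/target checks pass, so
                         * they do not immediately pile back into the
                         * shortage path.)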
                         *
                         * Similar to vm_page_free_wakeup() in vm_page.c,
                         * wake
                         */
                        pass = 0;
                        if (!vm_page_count_min(vm_page_free_hysteresis) ||
                            !vm_page_count_target()) {
                                vm_pages_needed = 0;
                                wakeup(&vmstats.v_free_count);
                        }
                } else {
                        pass = 0;
                }
        }
}

static struct kproc_desc pg1_kp = {
        "pagedaemon",
        vm_pageout_thread,
        &pagethread
};
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &pg1_kp);

static struct kproc_desc pg2_kp = {
        "emergpager",
        vm_pageout_thread,
        &emergpager
};
SYSINIT(emergpager, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &pg2_kp);


/*
 * Called after allocating a page out of the cache or free queue
 * to possibly wake the pagedaemon up to replenish our supply.
 *
 * We try to generate some hysteresis by waking the pagedaemon up
 * when our free+cache pages go below the free_min+cache_min level.
 * The pagedaemon tries to get the count back up to at least the
 * minimum, and through to the target level if possible.
 *
 * If the pagedaemon is already active bump vm_pages_needed as a hint
 * that there are even more requests pending.
 *
 * SMP races ok?
 * No requirements.
 */
void
pagedaemon_wakeup(void)
{
        if (vm_paging_needed() && curthread != pagethread) {
                if (vm_pages_needed == 0) {
                        vm_pages_needed = 1;    /* SMP race ok */
                        wakeup(&vm_pages_needed);
                } else if (vm_page_count_min(0)) {
                        ++vm_pages_needed;      /* SMP race ok */
                }
        }
}

#if !defined(NO_SWAPPING)

/*
 * SMP races ok?
 * No requirements.
 */
static void
vm_req_vmdaemon(void)
{
        static int lastrun = 0;

        if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
                wakeup(&vm_daemon_needed);
                lastrun = ticks;
        }
}

static int vm_daemon_callback(struct proc *p, void *data __unused);

/*
 * No requirements.
 */
static void
vm_daemon(void)
{
        int req_swapout;

        while (TRUE) {
                tsleep(&vm_daemon_needed, 0, "psleep", 0);
                req_swapout = atomic_swap_int(&vm_pageout_req_swapout, 0);

                /*
                 * Forced swapouts.
                 */
                if (req_swapout)
                        swapout_procs(vm_pageout_req_swapout);

                /*
                 * Scan the processes for exceeding their rlimits, or if a
                 * process is swapped out -- deactivate pages.
                 */
                allproc_scan(vm_daemon_callback, NULL, 0);
        }
}

static int
vm_daemon_callback(struct proc *p, void *data __unused)
{
        struct vmspace *vm;
        vm_pindex_t limit, size;

        /*
         * If this is a system process or if we have already
         * looked at this process, skip it.
         */
        lwkt_gettoken(&p->p_token);

        if (p->p_flags & (P_SYSTEM | P_WEXIT)) {
                lwkt_reltoken(&p->p_token);
                return (0);
        }

        /*
         * If the process is in a non-running type state, don't touch it.
         */
        if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
                lwkt_reltoken(&p->p_token);
                return (0);
        }

        /*
         * Get a limit.
         */
        limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
                                p->p_rlimit[RLIMIT_RSS].rlim_max));

        /*
         * Let processes that are swapped out really be
         * swapped out.  Set the limit to nothing to get as
         * many pages out to swap as possible.
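         *
         * (Illustrative numbers, assuming 4 KiB pages: an RLIMIT_RSS of
         * 64 MB yields a limit of 16384 pages, and the check below only
         * starts deactivating pages once the resident count reported by
         * pmap_resident_tlnw_count() exceeds that limit by at least
         * 4096 pages, i.e. roughly 80 MB resident, and only when
         * vm_pageout_memuse_mode is at least 1.)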
         */
        if (p->p_flags & P_SWAPPEDOUT)
                limit = 0;

        vm = p->p_vmspace;
        vmspace_hold(vm);
        size = pmap_resident_tlnw_count(&vm->vm_pmap);
        if (limit >= 0 && size > 4096 &&
            size - 4096 >= limit && vm_pageout_memuse_mode >= 1) {
                vm_pageout_map_deactivate_pages(&vm->vm_map, limit);
        }
        vmspace_drop(vm);

        lwkt_reltoken(&p->p_token);

        return (0);
}

#endif