1 /* 2 * Copyright (c) 1991 Regents of the University of California. 3 * All rights reserved. 4 * Copyright (c) 1994 John S. Dyson 5 * All rights reserved. 6 * Copyright (c) 1994 David Greenman 7 * All rights reserved. 8 * 9 * This code is derived from software contributed to Berkeley by 10 * The Mach Operating System project at Carnegie-Mellon University. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 3. Neither the name of the University nor the names of its contributors 21 * may be used to endorse or promote products derived from this software 22 * without specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 37 * 38 * 39 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 40 * All rights reserved. 41 * 42 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 43 * 44 * Permission to use, copy, modify and distribute this software and 45 * its documentation is hereby granted, provided that both the copyright 46 * notice and this permission notice appear in all copies of the 47 * software, derivative works or modified versions, and any portions 48 * thereof, and that both notices appear in supporting documentation. 49 * 50 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 51 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 52 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 53 * 54 * Carnegie Mellon requests users of this software to return to 55 * 56 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 57 * School of Computer Science 58 * Carnegie Mellon University 59 * Pittsburgh PA 15213-3890 60 * 61 * any improvements or extensions that they make and grant Carnegie the 62 * rights to redistribute these changes. 63 * 64 * $FreeBSD: src/sys/vm/vm_pageout.c,v 1.151.2.15 2002/12/29 18:21:04 dillon Exp $ 65 */ 66 67 /* 68 * The proverbial page-out daemon. 
69 */ 70 71 #include "opt_vm.h" 72 #include <sys/param.h> 73 #include <sys/systm.h> 74 #include <sys/kernel.h> 75 #include <sys/proc.h> 76 #include <sys/kthread.h> 77 #include <sys/resourcevar.h> 78 #include <sys/signalvar.h> 79 #include <sys/vnode.h> 80 #include <sys/vmmeter.h> 81 #include <sys/conf.h> 82 #include <sys/sysctl.h> 83 84 #include <vm/vm.h> 85 #include <vm/vm_param.h> 86 #include <sys/lock.h> 87 #include <vm/vm_object.h> 88 #include <vm/vm_page.h> 89 #include <vm/vm_map.h> 90 #include <vm/vm_pageout.h> 91 #include <vm/vm_pager.h> 92 #include <vm/swap_pager.h> 93 #include <vm/vm_extern.h> 94 95 #include <sys/thread2.h> 96 #include <sys/spinlock2.h> 97 #include <vm/vm_page2.h> 98 99 /* 100 * System initialization 101 */ 102 103 /* the kernel process "vm_pageout"*/ 104 static int vm_pageout_page(vm_page_t m, int *max_launderp, 105 int *vnodes_skippedp, struct vnode **vpfailedp, 106 int pass, int vmflush_flags); 107 static int vm_pageout_clean_helper (vm_page_t, int); 108 static int vm_pageout_free_page_calc (vm_size_t count); 109 static void vm_pageout_page_free(vm_page_t m) ; 110 struct thread *emergpager; 111 struct thread *pagethread; 112 static int sequence_emerg_pager; 113 114 #if !defined(NO_SWAPPING) 115 /* the kernel process "vm_daemon"*/ 116 static void vm_daemon (void); 117 static struct thread *vmthread; 118 119 static struct kproc_desc vm_kp = { 120 "vmdaemon", 121 vm_daemon, 122 &vmthread 123 }; 124 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); 125 #endif 126 127 int vm_pages_needed = 0; /* Event on which pageout daemon sleeps */ 128 int vm_pageout_deficit = 0; /* Estimated number of pages deficit */ 129 int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */ 130 int vm_page_free_hysteresis = 16; 131 static int vm_pagedaemon_time; 132 133 #if !defined(NO_SWAPPING) 134 static int vm_pageout_req_swapout; 135 static int vm_daemon_needed; 136 #endif 137 static int vm_max_launder = 4096; 138 static int vm_emerg_launder = 100; 139 static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0; 140 static int vm_pageout_full_stats_interval = 0; 141 static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0; 142 static int defer_swap_pageouts=0; 143 static int disable_swap_pageouts=0; 144 static u_int vm_anonmem_decline = ACT_DECLINE; 145 static u_int vm_filemem_decline = ACT_DECLINE * 2; 146 147 #if defined(NO_SWAPPING) 148 static int vm_swap_enabled=0; 149 static int vm_swap_idle_enabled=0; 150 #else 151 static int vm_swap_enabled=1; 152 static int vm_swap_idle_enabled=0; 153 #endif 154 int vm_pageout_memuse_mode=1; /* 0-disable, 1-passive, 2-active swp*/ 155 156 SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline, 157 CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory"); 158 159 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline, 160 CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache"); 161 162 SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis, 163 CTLFLAG_RW, &vm_page_free_hysteresis, 0, 164 "Free more pages than the minimum required"); 165 166 SYSCTL_INT(_vm, OID_AUTO, max_launder, 167 CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); 168 SYSCTL_INT(_vm, OID_AUTO, emerg_launder, 169 CTLFLAG_RW, &vm_emerg_launder, 0, "Emergency pager minimum"); 170 171 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max, 172 CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length"); 173 174 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval, 175 CTLFLAG_RW, 
	&vm_pageout_full_stats_interval, 0, "Interval for full stats scan");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
	CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
	CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");
SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode,
	CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode");

#if defined(NO_SWAPPING)
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RD, &vm_swap_enabled, 0, "");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
	CTLFLAG_RD, &vm_swap_idle_enabled, 0, "");
#else
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
#endif

SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");

SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");

static int pageout_lock_miss;
SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");

int vm_page_max_wired;		/* XXX max # of wired pages system-wide */

#if !defined(NO_SWAPPING)
static void vm_req_vmdaemon (void);
#endif
static void vm_pageout_page_stats(int q);

/*
 * Calculate approximately how many pages on each queue to try to
 * clean.  An exact calculation creates an edge condition when the
 * queues are unbalanced so add significant slop.  The queue scans
 * will stop early when targets are reached and will start where they
 * left off on the next pass.
 *
 * We need to be generous here because there are all sorts of loading
 * conditions that can cause edge cases if we try to average over all
 * queues.  In particular, storage subsystems have become so fast that
 * paging activity can become quite frantic.  Eventually we will probably
 * need two paging threads, one for dirty pages and one for clean, to deal
 * with the bandwidth requirements.
 *
 * So what we do is calculate a value that can be satisfied nominally by
 * only having to scan half the queues.
 */
static __inline int
PQAVERAGE(int n)
{
	int avg;

	if (n >= 0) {
		avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1);
	} else {
		avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1);
	}
	return avg;
}

/*
 * vm_pageout_clean_helper:
 *
 * Clean the page and remove it from the laundry.  The page must be busied
 * by the caller and will be disposed of (put away, flushed) by this routine.
 */
static int
vm_pageout_clean_helper(vm_page_t m, int vmflush_flags)
{
	vm_object_t object;
	vm_page_t mc[BLIST_MAX_ALLOC];
	int error;
	int ib, is, page_base;
	vm_pindex_t pindex = m->pindex;

	object = m->object;

	/*
	 * Don't mess with the page if it's held or special.
	 *
	 * XXX do we really need to check hold_count here?  hold_count
	 * isn't supposed to mess with vm_page ops except prevent the
	 * page from being reused.
	 */
	if (m->hold_count != 0 || (m->flags & PG_UNMANAGED)) {
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * Place page in cluster.  Align cluster for optimal swap space
	 * allocation (whether it is swap or not).  This is typically ~16-32
	 * pages, which also tends to align the cluster to multiples of the
	 * filesystem block size if backed by a filesystem.
	 */
	page_base = pindex % BLIST_MAX_ALLOC;
	mc[page_base] = m;
	ib = page_base - 1;
	is = page_base + 1;

	/*
	 * Scan object for clusterable pages.
	 *
	 * We can cluster ONLY if: ->> the page is NOT
	 * clean, wired, busy, held, or mapped into a
	 * buffer, and one of the following:
	 * 1) The page is inactive, or a seldom used
	 *    active page.
	 * -or-
	 * 2) we force the issue.
	 *
	 * During heavy mmap/modification loads the pageout
	 * daemon can really fragment the underlying file
	 * due to flushing pages out of order and not trying
	 * to align the clusters (which leaves sporadic out-of-order
	 * holes).  To solve this problem we do the reverse scan
	 * first and attempt to align our cluster, then do a
	 * forward scan if room remains.
	 */
	vm_object_hold(object);

	while (ib >= 0) {
		vm_page_t p;

		p = vm_page_lookup_busy_try(object, pindex - page_base + ib,
					    TRUE, &error);
		if (error || p == NULL)
			break;
		if ((p->queue - p->pc) == PQ_CACHE ||
		    (p->flags & PG_UNMANAGED)) {
			vm_page_wakeup(p);
			break;
		}
		vm_page_test_dirty(p);
		if (((p->dirty & p->valid) == 0 &&
		     (p->flags & PG_NEED_COMMIT) == 0) ||
		    p->wire_count != 0 ||	/* may be held by buf cache */
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_wakeup(p);
			break;
		}
		if (p->queue - p->pc != PQ_INACTIVE) {
			if (p->queue - p->pc != PQ_ACTIVE ||
			    (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) {
				vm_page_wakeup(p);
				break;
			}
		}

		/*
		 * Try to maintain page groupings in the cluster.
		 */
		if (m->flags & PG_WINATCFLS)
			vm_page_flag_set(p, PG_WINATCFLS);
		else
			vm_page_flag_clear(p, PG_WINATCFLS);
		p->act_count = m->act_count;

		mc[ib] = p;
		--ib;
	}
	++ib;	/* fixup */

	while (is < BLIST_MAX_ALLOC &&
	       pindex - page_base + is < object->size) {
		vm_page_t p;

		p = vm_page_lookup_busy_try(object, pindex - page_base + is,
					    TRUE, &error);
		if (error || p == NULL)
			break;
		if (((p->queue - p->pc) == PQ_CACHE) ||
		    (p->flags & PG_UNMANAGED)) {
			vm_page_wakeup(p);
			break;
		}
		vm_page_test_dirty(p);
		if (((p->dirty & p->valid) == 0 &&
		     (p->flags & PG_NEED_COMMIT) == 0) ||
		    p->wire_count != 0 ||	/* may be held by buf cache */
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_wakeup(p);
			break;
		}
		if (p->queue - p->pc != PQ_INACTIVE) {
			if (p->queue - p->pc != PQ_ACTIVE ||
			    (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) {
				vm_page_wakeup(p);
				break;
			}
		}

		/*
		 * Try to maintain page groupings in the cluster.
		 */
		if (m->flags & PG_WINATCFLS)
			vm_page_flag_set(p, PG_WINATCFLS);
		else
			vm_page_flag_clear(p, PG_WINATCFLS);
		p->act_count = m->act_count;

		mc[is] = p;
		++is;
	}

	vm_object_drop(object);

	/*
	 * we allow reads during pageouts...
	 */
	return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags);
}

/*
 * vm_pageout_flush() - launder the given pages
 *
 *	The given pages are laundered.
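 *
 *	(Editor's note, an inference from the clustering caller above rather
 *	than from the original comments: in the vm_pageout_clean_helper()
 *	path, count is bounded by BLIST_MAX_ALLOC, the size of the caller's
 *	mc[] array, which in turn bounds the pageout_status[] VLA below.)
 *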
 *	Note that we setup for the start of
 *	I/O ( i.e. busy the page ), mark it read-only, and bump the object
 *	reference count all in here rather than in the parent.  If we want
 *	the parent to do more sophisticated things we may have to change
 *	the ordering.
 *
 *	The pages in the array must be busied by the caller and will be
 *	unbusied by this function.
 */
int
vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags)
{
	vm_object_t object;
	int pageout_status[count];
	int numpagedout = 0;
	int i;

	/*
	 * Initiate I/O.  Bump the vm_page_t->busy counter.
	 */
	for (i = 0; i < count; i++) {
		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
			("vm_pageout_flush page %p index %d/%d: partially "
			 "invalid page", mc[i], i, count));
		vm_page_io_start(mc[i]);
	}

	/*
	 * We must make the pages read-only.  This will also force the
	 * modified bit in the related pmaps to be cleared.  The pager
	 * cannot clear the bit for us since the I/O completion code
	 * typically runs from an interrupt.  The act of making the page
	 * read-only handles the case for us.
	 *
	 * Then we can unbusy the pages, we still hold a reference by virtue
	 * of our soft-busy.
	 */
	for (i = 0; i < count; i++) {
		if (vmflush_flags & VM_PAGER_TRY_TO_CACHE)
			vm_page_protect(mc[i], VM_PROT_NONE);
		else
			vm_page_protect(mc[i], VM_PROT_READ);
		vm_page_wakeup(mc[i]);
	}

	object = mc[0]->object;
	vm_object_pip_add(object, count);

	vm_pager_put_pages(object, mc, count,
			   (vmflush_flags |
			    ((object == &kernel_object) ?
				VM_PAGER_PUT_SYNC : 0)),
			   pageout_status);

	for (i = 0; i < count; i++) {
		vm_page_t mt = mc[i];

		switch (pageout_status[i]) {
		case VM_PAGER_OK:
			numpagedout++;
			break;
		case VM_PAGER_PEND:
			numpagedout++;
			break;
		case VM_PAGER_BAD:
			/*
			 * Page outside of range of object.  Right now we
			 * essentially lose the changes by pretending it
			 * worked.
			 */
			vm_page_busy_wait(mt, FALSE, "pgbad");
			pmap_clear_modify(mt);
			vm_page_undirty(mt);
			vm_page_wakeup(mt);
			break;
		case VM_PAGER_ERROR:
		case VM_PAGER_FAIL:
			/*
			 * A page typically cannot be paged out when we
			 * have run out of swap.  We leave the page
			 * marked inactive and will try to page it out
			 * again later.
			 *
			 * Starvation of the active page list is used to
			 * determine when the system is massively memory
			 * starved.
			 */
			break;
		case VM_PAGER_AGAIN:
			break;
		}

		/*
		 * If not PENDing this was a synchronous operation and we
		 * clean up after the I/O.  If it is PENDing the mess is
		 * cleaned up asynchronously.
		 *
		 * Also nominally act on the caller's wishes if the caller
		 * wants to try to really clean (cache or free) the page.
		 *
		 * Also nominally deactivate the page if the system is
		 * memory-stressed.
		 */
		if (pageout_status[i] != VM_PAGER_PEND) {
			vm_page_busy_wait(mt, FALSE, "pgouw");
			vm_page_io_finish(mt);
			if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) {
				vm_page_try_to_cache(mt);
			} else if (vm_page_count_severe()) {
				vm_page_deactivate(mt);
				vm_page_wakeup(mt);
			} else {
				vm_page_wakeup(mt);
			}
			vm_object_pip_wakeup(object);
		}
	}
	return numpagedout;
}

#if !defined(NO_SWAPPING)

/*
 * Callback function, page busied for us.  We must dispose of the busy
 * condition.
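 * (Editor's note, inferred from the basic-tests check below: returning -1
 * from this callback stops the pmap_pgscan() walk early once the pmap's
 * resident count has dropped below the caller's limit; returning 0
 * continues the walk.)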
 * Any related pmap pages may be held but will not be locked.
 */
static
int
vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va,
			vm_page_t p)
{
	int actcount;
	int cleanit = 0;

	/*
	 * Basic tests - There should never be a marker, and we can stop
	 *		 once the RSS is below the required level.
	 */
	KKASSERT((p->flags & PG_MARKER) == 0);
	if (pmap_resident_tlnw_count(info->pmap) <= info->limit) {
		vm_page_wakeup(p);
		return(-1);
	}

	mycpu->gd_cnt.v_pdpages++;

	if (p->wire_count || p->hold_count || (p->flags & PG_UNMANAGED)) {
		vm_page_wakeup(p);
		goto done;
	}

	++info->actioncount;

	/*
	 * Check if the page has been referenced recently.  If it has,
	 * activate it and skip.
	 */
	actcount = pmap_ts_referenced(p);
	if (actcount) {
		vm_page_flag_set(p, PG_REFERENCED);
	} else if (p->flags & PG_REFERENCED) {
		actcount = 1;
	}

	if (actcount) {
		if (p->queue - p->pc != PQ_ACTIVE) {
			vm_page_and_queue_spin_lock(p);
			if (p->queue - p->pc != PQ_ACTIVE) {
				vm_page_and_queue_spin_unlock(p);
				vm_page_activate(p);
			} else {
				vm_page_and_queue_spin_unlock(p);
			}
		} else {
			p->act_count += actcount;
			if (p->act_count > ACT_MAX)
				p->act_count = ACT_MAX;
		}
		vm_page_flag_clear(p, PG_REFERENCED);
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * Remove the page from this particular pmap.  Once we do this, our
	 * pmap scans will not see it again (unless it gets faulted in), so
	 * we must actively dispose of or deal with the page.
	 */
	pmap_remove_specific(info->pmap, p);

	/*
	 * If the page is not mapped to another process (i.e. as would be
	 * typical if this were a shared page from a library) then deactivate
	 * the page and clean it in two passes only.
	 *
	 * If the page hasn't been referenced since the last check, remove it
	 * from the pmap.  If it is no longer mapped, deactivate it
	 * immediately, accelerating the normal decline.
	 *
	 * Once the page has been removed from the pmap the RSS code no
	 * longer tracks it so we have to make sure that it is staged for
	 * potential flush action.
	 */
	if ((p->flags & PG_MAPPED) == 0) {
		if (p->queue - p->pc == PQ_ACTIVE) {
			vm_page_deactivate(p);
		}
		if (p->queue - p->pc == PQ_INACTIVE) {
			cleanit = 1;
		}
	}

	/*
	 * Ok, try to fully clean the page and any nearby pages such that at
	 * least the requested page is freed or moved to the cache queue.
	 *
	 * We usually do this synchronously to allow us to get the page into
	 * the CACHE queue quickly, which will prevent memory exhaustion if
	 * a process with a memoryuse limit is running away.  However, the
	 * sysadmin may desire to set vm.swap_user_async which relaxes this
	 * and improves write performance.
	 */
	if (cleanit) {
		int max_launder = 0x7FFF;
		int vnodes_skipped = 0;
		int vmflush_flags;
		struct vnode *vpfailed = NULL;

		info->offset = va;

		if (vm_pageout_memuse_mode >= 2) {
			vmflush_flags = VM_PAGER_TRY_TO_CACHE |
					VM_PAGER_ALLOW_ACTIVE;
			if (swap_user_async == 0)
				vmflush_flags |= VM_PAGER_PUT_SYNC;
			vm_page_flag_set(p, PG_WINATCFLS);
			info->cleancount +=
				vm_pageout_page(p, &max_launder,
						&vnodes_skipped,
						&vpfailed, 1, vmflush_flags);
		} else {
			vm_page_wakeup(p);
			++info->cleancount;
		}
	} else {
		vm_page_wakeup(p);
	}

	/*
	 * Must be at end to avoid SMP races.
	 */
done:
	lwkt_user_yield();
	return 0;
}

/*
 * Deactivate some number of pages in a map due to set RLIMIT_RSS limits,
 * which is relatively difficult to do.  We try to keep track of where we
 * left off last time to reduce scan overhead.
 *
 * Called when vm_pageout_memuse_mode is >= 1.
 */
void
vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit)
{
	vm_offset_t pgout_offset;
	struct pmap_pgscan_info info;
	int retries = 3;

	pgout_offset = map->pgout_offset;
again:
#if 0
	kprintf("%016jx ", pgout_offset);
#endif
	if (pgout_offset < VM_MIN_USER_ADDRESS)
		pgout_offset = VM_MIN_USER_ADDRESS;
	if (pgout_offset >= VM_MAX_USER_ADDRESS)
		pgout_offset = 0;
	info.pmap = vm_map_pmap(map);
	info.limit = limit;
	info.beg_addr = pgout_offset;
	info.end_addr = VM_MAX_USER_ADDRESS;
	info.callback = vm_pageout_mdp_callback;
	info.cleancount = 0;
	info.actioncount = 0;
	info.busycount = 0;

	pmap_pgscan(&info);
	pgout_offset = info.offset;
#if 0
	kprintf("%016jx %08lx %08lx\n", pgout_offset,
		info.cleancount, info.actioncount);
#endif

	if (pgout_offset != VM_MAX_USER_ADDRESS &&
	    pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
		goto again;
	} else if (retries &&
		   pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
		--retries;
		goto again;
	}
	map->pgout_offset = pgout_offset;
}
#endif

/*
 * Called when the pageout scan wants to free a page.  We no longer
 * try to cycle the vm_object here with a reference & dealloc, which can
 * cause a non-trivial object collapse in a critical path.
 *
 * It is unclear why we cycled the ref_count in the past, perhaps to try
 * to optimize shadow chain collapses but I don't quite see why it would
 * be necessary.  An OBJ_DEAD object should terminate any and all vm_pages
 * synchronously and not have to be kick-started.
 */
static void
vm_pageout_page_free(vm_page_t m)
{
	vm_page_protect(m, VM_PROT_NONE);
	vm_page_free(m);
}

/*
 * vm_pageout_scan does the dirty work for the pageout daemon.
 */
struct vm_pageout_scan_info {
	struct proc *bigproc;
	vm_offset_t bigsize;
};

static int vm_pageout_scan_callback(struct proc *p, void *data);

/*
 * Scan inactive queue
 *
 * WARNING! Can be called from two pagedaemon threads simultaneously.
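 *
 *	    (Editor's note: the two threads are the primary pageout daemon
 *	    and the emergency pager; see the curthread == emergpager tests
 *	    below.)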
 */
static int
vm_pageout_scan_inactive(int pass, int q, int avail_shortage,
			 int *vnodes_skipped)
{
	vm_page_t m;
	struct vm_page marker;
	struct vnode *vpfailed;		/* warning, allowed to be stale */
	int maxscan;
	int delta = 0;
	int max_launder;
	int isep;

	isep = (curthread == emergpager);

	/*
	 * Start scanning the inactive queue for pages we can move to the
	 * cache or free.  The scan will stop when the target is reached or
	 * we have scanned the entire inactive queue.  Note that m->act_count
	 * is not used to form decisions for the inactive queue, only for the
	 * active queue.
	 *
	 * max_launder limits the number of dirty pages we flush per scan.
	 * For most systems a smaller value (16 or 32) is more robust under
	 * extreme memory and disk pressure because any unnecessary writes
	 * to disk can result in extreme performance degradation.  However,
	 * systems with excessive dirty pages (especially when MAP_NOSYNC is
	 * used) will die horribly with limited laundering.  If the pageout
	 * daemon cannot clean enough pages in the first pass, we let it go
	 * all out in succeeding passes.
	 *
	 * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
	 *	  PAGES.
	 */
	if ((max_launder = vm_max_launder) <= 1)
		max_launder = 1;
	if (pass)
		max_launder = 10000;

	/*
	 * Initialize our marker
	 */
	bzero(&marker, sizeof(marker));
	marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
	marker.queue = PQ_INACTIVE + q;
	marker.pc = q;
	marker.wire_count = 1;

	/*
	 * Inactive queue scan.
	 *
	 * NOTE: The vm_page must be spinlocked before the queue to avoid
	 *	 deadlocks, so it is easiest to simply iterate the loop
	 *	 with the queue unlocked at the top.
	 */
	vpfailed = NULL;

	vm_page_queues_spin_lock(PQ_INACTIVE + q);
	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
	maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt;

	/*
	 * Queue locked at top of loop to avoid stack marker issues.
	 */
	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
	       maxscan-- > 0 && avail_shortage - delta > 0)
	{
		int count;

		KKASSERT(m->queue == PQ_INACTIVE + q);
		TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl,
			     &marker, pageq);
		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m,
				   &marker, pageq);
		mycpu->gd_cnt.v_pdpages++;

		/*
		 * Skip marker pages (atomic against other markers to avoid
		 * infinite hop-over scans).
		 */
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Try to busy the page.  Don't mess with pages which are
		 * already busy or reorder them in the queue.
		 */
		if (vm_page_busy_try(m, TRUE))
			continue;

		/*
		 * Remaining operations run with the page busy and neither
		 * the page nor the queue will be spin-locked.
		 */
		vm_page_queues_spin_unlock(PQ_INACTIVE + q);
		KKASSERT(m->queue == PQ_INACTIVE + q);

		/*
		 * The emergency pager runs when the primary pager gets
		 * stuck, which typically means the primary pager deadlocked
		 * on a vnode-backed page.  Therefore, the emergency pager
		 * must skip any complex objects.
		 *
		 * We disallow VNODEs unless they are VCHR whose device ops
		 * does not flag D_NOEMERGPGR.
843 */ 844 if (isep && m->object) { 845 struct vnode *vp; 846 847 switch(m->object->type) { 848 case OBJT_DEFAULT: 849 case OBJT_SWAP: 850 /* 851 * Allow anonymous memory and assume that 852 * swap devices are not complex, since its 853 * kinda worthless if we can't swap out dirty 854 * anonymous pages. 855 */ 856 break; 857 case OBJT_VNODE: 858 /* 859 * Allow VCHR device if the D_NOEMERGPGR 860 * flag is not set, deny other vnode types 861 * as being too complex. 862 */ 863 vp = m->object->handle; 864 if (vp && vp->v_type == VCHR && 865 vp->v_rdev && vp->v_rdev->si_ops && 866 (vp->v_rdev->si_ops->head.flags & 867 D_NOEMERGPGR) == 0) { 868 break; 869 } 870 /* Deny - fall through */ 871 default: 872 /* 873 * Deny 874 */ 875 vm_page_wakeup(m); 876 vm_page_queues_spin_lock(PQ_INACTIVE + q); 877 lwkt_yield(); 878 continue; 879 } 880 } 881 882 /* 883 * Try to pageout the page and perhaps other nearby pages. 884 */ 885 count = vm_pageout_page(m, &max_launder, vnodes_skipped, 886 &vpfailed, pass, 0); 887 delta += count; 888 889 /* 890 * Systems with a ton of memory can wind up with huge 891 * deactivation counts. Because the inactive scan is 892 * doing a lot of flushing, the combination can result 893 * in excessive paging even in situations where other 894 * unrelated threads free up sufficient VM. 895 * 896 * To deal with this we abort the nominal active->inactive 897 * scan before we hit the inactive target when free+cache 898 * levels have reached a reasonable target. 899 * 900 * When deciding to stop early we need to add some slop to 901 * the test and we need to return full completion to the caller 902 * to prevent the caller from thinking there is something 903 * wrong and issuing a low-memory+swap warning or pkill. 904 * 905 * A deficit forces paging regardless of the state of the 906 * VM page queues (used for RSS enforcement). 907 */ 908 lwkt_yield(); 909 vm_page_queues_spin_lock(PQ_INACTIVE + q); 910 if (vm_paging_target() < -vm_max_launder) { 911 /* 912 * Stopping early, return full completion to caller. 913 */ 914 if (delta < avail_shortage) 915 delta = avail_shortage; 916 break; 917 } 918 } 919 920 /* page queue still spin-locked */ 921 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); 922 vm_page_queues_spin_unlock(PQ_INACTIVE + q); 923 924 return (delta); 925 } 926 927 /* 928 * Pageout the specified page, return the total number of pages paged out 929 * (this routine may cluster). 930 * 931 * The page must be busied and soft-busied by the caller and will be disposed 932 * of by this function. 933 */ 934 static int 935 vm_pageout_page(vm_page_t m, int *max_launderp, int *vnodes_skippedp, 936 struct vnode **vpfailedp, int pass, int vmflush_flags) 937 { 938 vm_object_t object; 939 int actcount; 940 int count = 0; 941 942 /* 943 * It is possible for a page to be busied ad-hoc (e.g. the 944 * pmap_collect() code) and wired and race against the 945 * allocation of a new page. vm_page_alloc() may be forced 946 * to deactivate the wired page in which case it winds up 947 * on the inactive queue and must be handled here. We 948 * correct the problem simply by unqueuing the page. 949 */ 950 if (m->wire_count) { 951 vm_page_unqueue_nowakeup(m); 952 vm_page_wakeup(m); 953 kprintf("WARNING: pagedaemon: wired page on " 954 "inactive queue %p\n", m); 955 return 0; 956 } 957 958 /* 959 * A held page may be undergoing I/O, so skip it. 
	 */
	if (m->hold_count) {
		vm_page_and_queue_spin_lock(m);
		if (m->queue - m->pc == PQ_INACTIVE) {
			TAILQ_REMOVE(
				&vm_page_queues[m->queue].pl, m, pageq);
			TAILQ_INSERT_TAIL(
				&vm_page_queues[m->queue].pl, m, pageq);
			++vm_swapcache_inactive_heuristic;
		}
		vm_page_and_queue_spin_unlock(m);
		vm_page_wakeup(m);
		return 0;
	}

	if (m->object == NULL || m->object->ref_count == 0) {
		/*
		 * If the object is not being used, we ignore previous
		 * references.
		 */
		vm_page_flag_clear(m, PG_REFERENCED);
		pmap_clear_reference(m);
		/* fall through to end */
	} else if (((m->flags & PG_REFERENCED) == 0) &&
		   (actcount = pmap_ts_referenced(m))) {
		/*
		 * Otherwise, if the page has been referenced while
		 * in the inactive queue, we bump the "activation
		 * count" upwards, making it less likely that the
		 * page will be added back to the inactive queue
		 * prematurely again.  Here we check the page tables
		 * (or emulated bits, if any), given that the upper
		 * level VM system knows nothing about existing
		 * references.
		 */
		vm_page_activate(m);
		m->act_count += (actcount + ACT_ADVANCE);
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * (m) is still busied.
	 *
	 * If the upper level VM system knows about any page
	 * references, we activate the page.  We also set the
	 * "activation count" higher than normal so that we will be
	 * less likely to place pages back onto the inactive queue
	 * again.
	 */
	if ((m->flags & PG_REFERENCED) != 0) {
		vm_page_flag_clear(m, PG_REFERENCED);
		actcount = pmap_ts_referenced(m);
		vm_page_activate(m);
		m->act_count += (actcount + ACT_ADVANCE + 1);
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * If the upper level VM system doesn't know anything about
	 * the page being dirty, we have to check for it again.  As
	 * far as the VM code knows, any partially dirty pages are
	 * fully dirty.
	 *
	 * Pages marked PG_WRITEABLE may be mapped into the user
	 * address space of a process running on another cpu.  A
	 * user process (without holding the MP lock) running on
	 * another cpu may be able to touch the page while we are
	 * trying to remove it.  vm_page_cache() will handle this
	 * case for us.
	 */
	if (m->dirty == 0) {
		vm_page_test_dirty(m);
	} else {
		vm_page_dirty(m);
	}

	if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
		/*
		 * Invalid pages can be easily freed
		 */
		vm_pageout_page_free(m);
		mycpu->gd_cnt.v_dfree++;
		++count;
	} else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
		/*
		 * Clean pages can be placed onto the cache queue.
		 * This effectively frees them.
		 */
		vm_page_cache(m);
		++count;
	} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
		/*
		 * Dirty pages need to be paged out, but flushing
		 * a page is extremely expensive versus freeing
		 * a clean page.  Rather than artificially limiting
		 * the number of pages we can flush, we instead give
		 * dirty pages extra priority on the inactive queue
		 * by forcing them to be cycled through the queue
		 * twice before being flushed, after which the
		 * (now clean) page will cycle through once more
		 * before being freed.  This significantly extends
		 * the thrash point for a heavily loaded machine.
1063 */ 1064 vm_page_flag_set(m, PG_WINATCFLS); 1065 vm_page_and_queue_spin_lock(m); 1066 if (m->queue - m->pc == PQ_INACTIVE) { 1067 TAILQ_REMOVE( 1068 &vm_page_queues[m->queue].pl, m, pageq); 1069 TAILQ_INSERT_TAIL( 1070 &vm_page_queues[m->queue].pl, m, pageq); 1071 ++vm_swapcache_inactive_heuristic; 1072 } 1073 vm_page_and_queue_spin_unlock(m); 1074 vm_page_wakeup(m); 1075 } else if (*max_launderp > 0) { 1076 /* 1077 * We always want to try to flush some dirty pages if 1078 * we encounter them, to keep the system stable. 1079 * Normally this number is small, but under extreme 1080 * pressure where there are insufficient clean pages 1081 * on the inactive queue, we may have to go all out. 1082 */ 1083 int swap_pageouts_ok; 1084 struct vnode *vp = NULL; 1085 1086 swap_pageouts_ok = 0; 1087 object = m->object; 1088 if (object && 1089 (object->type != OBJT_SWAP) && 1090 (object->type != OBJT_DEFAULT)) { 1091 swap_pageouts_ok = 1; 1092 } else { 1093 swap_pageouts_ok = !(defer_swap_pageouts || 1094 disable_swap_pageouts); 1095 swap_pageouts_ok |= (!disable_swap_pageouts && 1096 defer_swap_pageouts && 1097 vm_page_count_min(0)); 1098 } 1099 1100 /* 1101 * We don't bother paging objects that are "dead". 1102 * Those objects are in a "rundown" state. 1103 */ 1104 if (!swap_pageouts_ok || 1105 (object == NULL) || 1106 (object->flags & OBJ_DEAD)) { 1107 vm_page_and_queue_spin_lock(m); 1108 if (m->queue - m->pc == PQ_INACTIVE) { 1109 TAILQ_REMOVE( 1110 &vm_page_queues[m->queue].pl, 1111 m, pageq); 1112 TAILQ_INSERT_TAIL( 1113 &vm_page_queues[m->queue].pl, 1114 m, pageq); 1115 ++vm_swapcache_inactive_heuristic; 1116 } 1117 vm_page_and_queue_spin_unlock(m); 1118 vm_page_wakeup(m); 1119 return 0; 1120 } 1121 1122 /* 1123 * (m) is still busied. 1124 * 1125 * The object is already known NOT to be dead. It 1126 * is possible for the vget() to block the whole 1127 * pageout daemon, but the new low-memory handling 1128 * code should prevent it. 1129 * 1130 * The previous code skipped locked vnodes and, worse, 1131 * reordered pages in the queue. This results in 1132 * completely non-deterministic operation because, 1133 * quite often, a vm_fault has initiated an I/O and 1134 * is holding a locked vnode at just the point where 1135 * the pageout daemon is woken up. 1136 * 1137 * We can't wait forever for the vnode lock, we might 1138 * deadlock due to a vn_read() getting stuck in 1139 * vm_wait while holding this vnode. We skip the 1140 * vnode if we can't get it in a reasonable amount 1141 * of time. 1142 * 1143 * vpfailed is used to (try to) avoid the case where 1144 * a large number of pages are associated with a 1145 * locked vnode, which could cause the pageout daemon 1146 * to stall for an excessive amount of time. 1147 */ 1148 if (object->type == OBJT_VNODE) { 1149 int flags; 1150 1151 vp = object->handle; 1152 flags = LK_EXCLUSIVE; 1153 if (vp == *vpfailedp) 1154 flags |= LK_NOWAIT; 1155 else 1156 flags |= LK_TIMELOCK; 1157 vm_page_hold(m); 1158 vm_page_wakeup(m); 1159 1160 /* 1161 * We have unbusied (m) temporarily so we can 1162 * acquire the vp lock without deadlocking. 1163 * (m) is held to prevent destruction. 1164 */ 1165 if (vget(vp, flags) != 0) { 1166 *vpfailedp = vp; 1167 ++pageout_lock_miss; 1168 if (object->flags & OBJ_MIGHTBEDIRTY) 1169 ++*vnodes_skippedp; 1170 vm_page_unhold(m); 1171 return 0; 1172 } 1173 1174 /* 1175 * The page might have been moved to another 1176 * queue during potential blocking in vget() 1177 * above. 
The page might have been freed and 1178 * reused for another vnode. The object might 1179 * have been reused for another vnode. 1180 */ 1181 if (m->queue - m->pc != PQ_INACTIVE || 1182 m->object != object || 1183 object->handle != vp) { 1184 if (object->flags & OBJ_MIGHTBEDIRTY) 1185 ++*vnodes_skippedp; 1186 vput(vp); 1187 vm_page_unhold(m); 1188 return 0; 1189 } 1190 1191 /* 1192 * The page may have been busied during the 1193 * blocking in vput(); We don't move the 1194 * page back onto the end of the queue so that 1195 * statistics are more correct if we don't. 1196 */ 1197 if (vm_page_busy_try(m, TRUE)) { 1198 vput(vp); 1199 vm_page_unhold(m); 1200 return 0; 1201 } 1202 vm_page_unhold(m); 1203 1204 /* 1205 * (m) is busied again 1206 * 1207 * We own the busy bit and remove our hold 1208 * bit. If the page is still held it 1209 * might be undergoing I/O, so skip it. 1210 */ 1211 if (m->hold_count) { 1212 vm_page_and_queue_spin_lock(m); 1213 if (m->queue - m->pc == PQ_INACTIVE) { 1214 TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq); 1215 TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq); 1216 ++vm_swapcache_inactive_heuristic; 1217 } 1218 vm_page_and_queue_spin_unlock(m); 1219 if (object->flags & OBJ_MIGHTBEDIRTY) 1220 ++*vnodes_skippedp; 1221 vm_page_wakeup(m); 1222 vput(vp); 1223 return 0; 1224 } 1225 /* (m) is left busied as we fall through */ 1226 } 1227 1228 /* 1229 * page is busy and not held here. 1230 * 1231 * If a page is dirty, then it is either being washed 1232 * (but not yet cleaned) or it is still in the 1233 * laundry. If it is still in the laundry, then we 1234 * start the cleaning operation. 1235 * 1236 * decrement inactive_shortage on success to account 1237 * for the (future) cleaned page. Otherwise we 1238 * could wind up laundering or cleaning too many 1239 * pages. 1240 * 1241 * NOTE: Cleaning the page here does not cause 1242 * force_deficit to be adjusted, because the 1243 * page is not being freed or moved to the 1244 * cache. 1245 */ 1246 count = vm_pageout_clean_helper(m, vmflush_flags); 1247 *max_launderp -= count; 1248 1249 /* 1250 * Clean ate busy, page no longer accessible 1251 */ 1252 if (vp != NULL) 1253 vput(vp); 1254 } else { 1255 vm_page_wakeup(m); 1256 } 1257 return count; 1258 } 1259 1260 /* 1261 * Scan active queue 1262 * 1263 * WARNING! Can be called from two pagedaemon threads simultaneously. 1264 */ 1265 static int 1266 vm_pageout_scan_active(int pass, int q, 1267 int avail_shortage, int inactive_shortage, 1268 int *recycle_countp) 1269 { 1270 struct vm_page marker; 1271 vm_page_t m; 1272 int actcount; 1273 int delta = 0; 1274 int maxscan; 1275 int isep; 1276 1277 isep = (curthread == emergpager); 1278 1279 /* 1280 * We want to move pages from the active queue to the inactive 1281 * queue to get the inactive queue to the inactive target. If 1282 * we still have a page shortage from above we try to directly free 1283 * clean pages instead of moving them. 1284 * 1285 * If we do still have a shortage we keep track of the number of 1286 * pages we free or cache (recycle_count) as a measure of thrashing 1287 * between the active and inactive queues. 1288 * 1289 * If we were able to completely satisfy the free+cache targets 1290 * from the inactive pool we limit the number of pages we move 1291 * from the active pool to the inactive pool to 2x the pages we 1292 * had removed from the inactive pool (with a minimum of 1/5 the 1293 * inactive target). 
If we were not able to completely satisfy 1294 * the free+cache targets we go for the whole target aggressively. 1295 * 1296 * NOTE: Both variables can end up negative. 1297 * NOTE: We are still in a critical section. 1298 * 1299 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED 1300 * PAGES. 1301 */ 1302 1303 bzero(&marker, sizeof(marker)); 1304 marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER; 1305 marker.queue = PQ_ACTIVE + q; 1306 marker.pc = q; 1307 marker.wire_count = 1; 1308 1309 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1310 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1311 maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt; 1312 1313 /* 1314 * Queue locked at top of loop to avoid stack marker issues. 1315 */ 1316 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 1317 maxscan-- > 0 && (avail_shortage - delta > 0 || 1318 inactive_shortage > 0)) 1319 { 1320 KKASSERT(m->queue == PQ_ACTIVE + q); 1321 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, 1322 &marker, pageq); 1323 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m, 1324 &marker, pageq); 1325 1326 /* 1327 * Skip marker pages (atomic against other markers to avoid 1328 * infinite hop-over scans). 1329 */ 1330 if (m->flags & PG_MARKER) 1331 continue; 1332 1333 /* 1334 * Try to busy the page. Don't mess with pages which are 1335 * already busy or reorder them in the queue. 1336 */ 1337 if (vm_page_busy_try(m, TRUE)) 1338 continue; 1339 1340 /* 1341 * Remaining operations run with the page busy and neither 1342 * the page or the queue will be spin-locked. 1343 */ 1344 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1345 KKASSERT(m->queue == PQ_ACTIVE + q); 1346 1347 /* 1348 * Don't deactivate pages that are held, even if we can 1349 * busy them. (XXX why not?) 1350 */ 1351 if (m->hold_count != 0) { 1352 vm_page_and_queue_spin_lock(m); 1353 if (m->queue - m->pc == PQ_ACTIVE) { 1354 TAILQ_REMOVE( 1355 &vm_page_queues[PQ_ACTIVE + q].pl, 1356 m, pageq); 1357 TAILQ_INSERT_TAIL( 1358 &vm_page_queues[PQ_ACTIVE + q].pl, 1359 m, pageq); 1360 } 1361 vm_page_and_queue_spin_unlock(m); 1362 vm_page_wakeup(m); 1363 goto next; 1364 } 1365 1366 /* 1367 * The emergency pager ignores vnode-backed pages as these 1368 * are the pages that probably bricked the main pager. 1369 */ 1370 if (isep && m->object && m->object->type == OBJT_VNODE) { 1371 vm_page_and_queue_spin_lock(m); 1372 if (m->queue - m->pc == PQ_ACTIVE) { 1373 TAILQ_REMOVE( 1374 &vm_page_queues[PQ_ACTIVE + q].pl, 1375 m, pageq); 1376 TAILQ_INSERT_TAIL( 1377 &vm_page_queues[PQ_ACTIVE + q].pl, 1378 m, pageq); 1379 } 1380 vm_page_and_queue_spin_unlock(m); 1381 vm_page_wakeup(m); 1382 goto next; 1383 } 1384 1385 /* 1386 * The count for pagedaemon pages is done after checking the 1387 * page for eligibility... 1388 */ 1389 mycpu->gd_cnt.v_pdpages++; 1390 1391 /* 1392 * Check to see "how much" the page has been used and clear 1393 * the tracking access bits. If the object has no references 1394 * don't bother paying the expense. 1395 */ 1396 actcount = 0; 1397 if (m->object && m->object->ref_count != 0) { 1398 if (m->flags & PG_REFERENCED) 1399 ++actcount; 1400 actcount += pmap_ts_referenced(m); 1401 if (actcount) { 1402 m->act_count += ACT_ADVANCE + actcount; 1403 if (m->act_count > ACT_MAX) 1404 m->act_count = ACT_MAX; 1405 } 1406 } 1407 vm_page_flag_clear(m, PG_REFERENCED); 1408 1409 /* 1410 * actcount is only valid if the object ref_count is non-zero. 1411 * If the page does not have an object, actcount will be zero. 
		 */
		if (actcount && m->object->ref_count != 0) {
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
		} else {
			switch(m->object->type) {
			case OBJT_DEFAULT:
			case OBJT_SWAP:
				m->act_count -= min(m->act_count,
						    vm_anonmem_decline);
				break;
			default:
				m->act_count -= min(m->act_count,
						    vm_filemem_decline);
				break;
			}
			if (vm_pageout_algorithm ||
			    (m->object == NULL) ||
			    (m->object && (m->object->ref_count == 0)) ||
			    m->act_count < pass + 1
			) {
				/*
				 * Deactivate the page.  If we had a
				 * shortage from our inactive scan try to
				 * free (cache) the page instead.
				 *
				 * Don't just blindly cache the page if
				 * we do not have a shortage from the
				 * inactive scan, that could lead to
				 * gigabytes being moved.
				 */
				--inactive_shortage;
				if (avail_shortage - delta > 0 ||
				    (m->object && (m->object->ref_count == 0)))
				{
					if (avail_shortage - delta > 0)
						++*recycle_countp;
					vm_page_protect(m, VM_PROT_NONE);
					if (m->dirty == 0 &&
					    (m->flags & PG_NEED_COMMIT) == 0 &&
					    avail_shortage - delta > 0) {
						vm_page_cache(m);
					} else {
						vm_page_deactivate(m);
						vm_page_wakeup(m);
					}
				} else {
					vm_page_deactivate(m);
					vm_page_wakeup(m);
				}
				++delta;
			} else {
				vm_page_and_queue_spin_lock(m);
				if (m->queue - m->pc == PQ_ACTIVE) {
					TAILQ_REMOVE(
					    &vm_page_queues[PQ_ACTIVE + q].pl,
					    m, pageq);
					TAILQ_INSERT_TAIL(
					    &vm_page_queues[PQ_ACTIVE + q].pl,
					    m, pageq);
				}
				vm_page_and_queue_spin_unlock(m);
				vm_page_wakeup(m);
			}
		}
next:
		lwkt_yield();
		vm_page_queues_spin_lock(PQ_ACTIVE + q);
	}

	/*
	 * Clean out our local marker.
	 *
	 * Page queue still spin-locked.
	 */
	TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
	vm_page_queues_spin_unlock(PQ_ACTIVE + q);

	return (delta);
}

/*
 * The number of actually free pages can drop down to v_free_reserved,
 * we try to build the free count back above v_free_min.  Note that
 * vm_paging_needed() also returns TRUE if v_free_count is not at
 * least v_free_min so that is the minimum we must build the free
 * count to.
 *
 * We use a slightly higher target to improve hysteresis,
 * ((v_free_target + v_free_min) / 2).  Since v_free_target
 * is usually the same as v_cache_min this maintains about
 * half the pages in the free queue as are in the cache queue,
 * providing pretty good pipelining for pageout operation.
 *
 * The system operator can manipulate vm.v_cache_min and
 * vm.v_free_target to tune the pageout daemon.  Be sure
 * to keep vm.v_free_min < vm.v_free_target.
 *
 * Note that the original paging target is to get at least
 * (free_min + cache_min) into (free + cache).  The slightly
 * higher target will shift additional pages from cache to free
 * without affecting the original paging target in order to
 * maintain better hysteresis and not have the free count always
 * be dead-on v_free_min.
 *
 * NOTE: we are still in a critical section.
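 *
 * Illustrative numbers (editor's note, not taken from the source): with
 * v_free_min = 1000 pages and v_free_target = 4000 pages, the PQ_CACHE
 * loop below replenishes the free list until v_free_count reaches
 * (4000 + 1000) / 2 = 2500 pages, comfortably above the minimum without
 * chasing the full target.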
 *
 * Pages moved from PQ_CACHE to totally free are not counted in the
 * pages_freed counter.
 *
 * WARNING! Can be called from two pagedaemon threads simultaneously.
 */
static void
vm_pageout_scan_cache(int avail_shortage, int pass,
		      int vnodes_skipped, int recycle_count)
{
	static int lastkillticks;
	struct vm_pageout_scan_info info;
	vm_page_t m;
	int isep;

	isep = (curthread == emergpager);

	while (vmstats.v_free_count <
	       (vmstats.v_free_min + vmstats.v_free_target) / 2) {
		/*
		 * This steals some code from vm/vm_page.c
		 */
		static int cache_rover = 0;

		m = vm_page_list_find(PQ_CACHE, cache_rover & PQ_L2_MASK);
		if (m == NULL)
			break;
		/* page is returned removed from its queue and spinlocked */
		if (vm_page_busy_try(m, TRUE)) {
			vm_page_deactivate_locked(m);
			vm_page_spin_unlock(m);
			continue;
		}
		vm_page_spin_unlock(m);
		pagedaemon_wakeup();
		lwkt_yield();

		/*
		 * Remaining operations run with the page busy and neither
		 * the page nor the queue will be spin-locked.
		 */
		if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
		    m->hold_count ||
		    m->wire_count) {
			vm_page_deactivate(m);
			vm_page_wakeup(m);
			continue;
		}
		KKASSERT((m->flags & PG_MAPPED) == 0);
		KKASSERT(m->dirty == 0);
		cache_rover += PQ_PRIME2;
		vm_pageout_page_free(m);
		mycpu->gd_cnt.v_dfree++;
	}

#if !defined(NO_SWAPPING)
	/*
	 * Idle process swapout -- run once per second.
	 */
	if (vm_swap_idle_enabled) {
		static time_t lsec;
		if (time_uptime != lsec) {
			atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_IDLE);
			vm_req_vmdaemon();
			lsec = time_uptime;
		}
	}
#endif

	/*
	 * If we didn't get enough free pages, and we have skipped a vnode
	 * in a writeable object, wakeup the sync daemon.  And kick swapout
	 * if we did not get enough free pages.
	 */
	if (vm_paging_target() > 0) {
		if (vnodes_skipped && vm_page_count_min(0))
			speedup_syncer(NULL);
#if !defined(NO_SWAPPING)
		if (vm_swap_enabled && vm_page_count_target()) {
			atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_NORMAL);
			vm_req_vmdaemon();
		}
#endif
	}

	/*
	 * Handle catastrophic conditions.  Under good conditions we should
	 * be at the target, well beyond our minimum.  If we could not even
	 * reach our minimum the system is under heavy stress.  But just being
	 * under heavy stress does not trigger process killing.
	 *
	 * We consider ourselves to have run out of memory if the swap pager
	 * is full and avail_shortage is still positive.  The secondary check
	 * ensures that we do not kill processes if the instantaneous
	 * availability is good, even if the pageout daemon pass says it
	 * couldn't get to the target.
	 *
	 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT HANDLE SWAP FULL
	 *	 SITUATIONS.
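	 *
	 * (Editor's note) When a kill does become necessary, the victim
	 * chosen below is the process with the largest anonymous+swap
	 * footprint as computed by vm_pageout_scan_callback(), and kills
	 * are rate-limited to at most one per second via lastkillticks.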
	 */
	if (swap_pager_almost_full &&
	    pass > 0 &&
	    isep == 0 &&
	    (vm_page_count_min(recycle_count) || avail_shortage > 0)) {
		kprintf("Warning: system low on memory+swap "
			"shortage %d for %d ticks!\n",
			avail_shortage, ticks - swap_fail_ticks);
		if (bootverbose)
			kprintf("Metrics: spaf=%d spf=%d pass=%d avail=%d target=%d last=%u\n",
				swap_pager_almost_full,
				swap_pager_full,
				pass,
				avail_shortage,
				vm_paging_target(),
				(unsigned int)(ticks - lastkillticks));
	}
	if (swap_pager_full &&
	    pass > 1 &&
	    isep == 0 &&
	    avail_shortage > 0 &&
	    vm_paging_target() > 0 &&
	    (unsigned int)(ticks - lastkillticks) >= hz) {
		/*
		 * Kill something, maximum rate once per second to give
		 * the process time to free up sufficient memory.
		 */
		lastkillticks = ticks;
		info.bigproc = NULL;
		info.bigsize = 0;
		allproc_scan(vm_pageout_scan_callback, &info, 0);
		if (info.bigproc != NULL) {
			kprintf("Try to kill process %d %s\n",
				info.bigproc->p_pid, info.bigproc->p_comm);
			info.bigproc->p_nice = PRIO_MIN;
			info.bigproc->p_usched->resetpriority(
				FIRST_LWP_IN_PROC(info.bigproc));
			atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL);
			killproc(info.bigproc, "out of swap space");
			wakeup(&vmstats.v_free_count);
			PRELE(info.bigproc);
		}
	}
}

static int
vm_pageout_scan_callback(struct proc *p, void *data)
{
	struct vm_pageout_scan_info *info = data;
	vm_offset_t size;

	/*
	 * Never kill system processes or init.  If we have configured swap
	 * then try to avoid killing low-numbered pids.
	 */
	if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) ||
	    ((p->p_pid < 48) && (vm_swap_size != 0))) {
		return (0);
	}

	lwkt_gettoken(&p->p_token);

	/*
	 * if the process is in a non-running type state,
	 * don't touch it.
	 */
	if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
		lwkt_reltoken(&p->p_token);
		return (0);
	}

	/*
	 * Get the approximate process size.  Note that anonymous pages
	 * with backing swap will be counted twice, but there should not
	 * be too many such pages due to the stress the VM system is
	 * under at this point.
	 */
	size = vmspace_anonymous_count(p->p_vmspace) +
	       vmspace_swap_count(p->p_vmspace);

	/*
	 * If this process is bigger than the biggest one,
	 * remember it.
	 */
	if (info->bigsize < size) {
		if (info->bigproc)
			PRELE(info->bigproc);
		PHOLD(p);
		info->bigproc = p;
		info->bigsize = size;
	}
	lwkt_reltoken(&p->p_token);
	lwkt_yield();

	return(0);
}

/*
 * This routine tries to maintain the pseudo LRU active queue, so that
 * some statistics accumulation still occurs during long periods of time
 * when there is no paging.  This code helps the situation where paging
 * just starts to occur.
1728 */ 1729 static void 1730 vm_pageout_page_stats(int q) 1731 { 1732 static int fullintervalcount = 0; 1733 struct vm_page marker; 1734 vm_page_t m; 1735 int pcount, tpcount; /* Number of pages to check */ 1736 int page_shortage; 1737 1738 page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max + 1739 vmstats.v_free_min) - 1740 (vmstats.v_free_count + vmstats.v_inactive_count + 1741 vmstats.v_cache_count); 1742 1743 if (page_shortage <= 0) 1744 return; 1745 1746 pcount = vm_page_queues[PQ_ACTIVE + q].lcnt; 1747 fullintervalcount += vm_pageout_stats_interval; 1748 if (fullintervalcount < vm_pageout_full_stats_interval) { 1749 tpcount = (vm_pageout_stats_max * pcount) / 1750 vmstats.v_page_count + 1; 1751 if (pcount > tpcount) 1752 pcount = tpcount; 1753 } else { 1754 fullintervalcount = 0; 1755 } 1756 1757 bzero(&marker, sizeof(marker)); 1758 marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER; 1759 marker.queue = PQ_ACTIVE + q; 1760 marker.pc = q; 1761 marker.wire_count = 1; 1762 1763 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1764 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1765 1766 /* 1767 * Queue locked at top of loop to avoid stack marker issues. 1768 */ 1769 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 1770 pcount-- > 0) 1771 { 1772 int actcount; 1773 1774 KKASSERT(m->queue == PQ_ACTIVE + q); 1775 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1776 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m, 1777 &marker, pageq); 1778 1779 /* 1780 * Skip marker pages (atomic against other markers to avoid 1781 * infinite hop-over scans). 1782 */ 1783 if (m->flags & PG_MARKER) 1784 continue; 1785 1786 /* 1787 * Ignore pages we can't busy 1788 */ 1789 if (vm_page_busy_try(m, TRUE)) 1790 continue; 1791 1792 /* 1793 * Remaining operations run with the page busy and neither 1794 * the page or the queue will be spin-locked. 1795 */ 1796 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1797 KKASSERT(m->queue == PQ_ACTIVE + q); 1798 1799 /* 1800 * We now have a safely busied page, the page and queue 1801 * spinlocks have been released. 1802 * 1803 * Ignore held pages 1804 */ 1805 if (m->hold_count) { 1806 vm_page_wakeup(m); 1807 goto next; 1808 } 1809 1810 /* 1811 * Calculate activity 1812 */ 1813 actcount = 0; 1814 if (m->flags & PG_REFERENCED) { 1815 vm_page_flag_clear(m, PG_REFERENCED); 1816 actcount += 1; 1817 } 1818 actcount += pmap_ts_referenced(m); 1819 1820 /* 1821 * Update act_count and move page to end of queue. 1822 */ 1823 if (actcount) { 1824 m->act_count += ACT_ADVANCE + actcount; 1825 if (m->act_count > ACT_MAX) 1826 m->act_count = ACT_MAX; 1827 vm_page_and_queue_spin_lock(m); 1828 if (m->queue - m->pc == PQ_ACTIVE) { 1829 TAILQ_REMOVE( 1830 &vm_page_queues[PQ_ACTIVE + q].pl, 1831 m, pageq); 1832 TAILQ_INSERT_TAIL( 1833 &vm_page_queues[PQ_ACTIVE + q].pl, 1834 m, pageq); 1835 } 1836 vm_page_and_queue_spin_unlock(m); 1837 vm_page_wakeup(m); 1838 goto next; 1839 } 1840 1841 if (m->act_count == 0) { 1842 /* 1843 * We turn off page access, so that we have 1844 * more accurate RSS stats. We don't do this 1845 * in the normal page deactivation when the 1846 * system is loaded VM wise, because the 1847 * cost of the large number of page protect 1848 * operations would be higher than the value 1849 * of doing the operation. 1850 * 1851 * We use the marker to save our place so 1852 * we can release the spin lock. both (m) 1853 * and (next) will be invalid. 
			 */
			vm_page_protect(m, VM_PROT_NONE);
			vm_page_deactivate(m);
		} else {
			m->act_count -= min(m->act_count, ACT_DECLINE);
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
		}
		vm_page_wakeup(m);
next:
		vm_page_queues_spin_lock(PQ_ACTIVE + q);
	}

	/*
	 * Remove our local marker.
	 *
	 * Page queue still spin-locked.
	 */
	TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
	vm_page_queues_spin_unlock(PQ_ACTIVE + q);
}

static int
vm_pageout_free_page_calc(vm_size_t count)
{
	if (count < vmstats.v_page_count)
		return 0;
	/*
	 * free_reserved needs to include enough for the largest swap pager
	 * structures plus enough for any pv_entry structs when paging.
	 *
	 * v_free_min		normal allocations
	 * v_free_reserved	system allocations
	 * v_pageout_free_min	allocations by pageout daemon
	 * v_interrupt_free_min	low level allocations (e.g. swap structures)
	 */
	if (vmstats.v_page_count > 1024)
		vmstats.v_free_min = 64 + (vmstats.v_page_count - 1024) / 200;
	else
		vmstats.v_free_min = 64;

	/*
	 * Make sure the vmmeter slop can't blow out our global minimums.
	 *
	 * However, to accommodate weird configurations (vkernels with many
	 * cpus and little memory, or artificially reduced hw.physmem), do
	 * not allow v_free_min to exceed 1/20 of ram or the pageout daemon
	 * will go out of control.
	 */
	if (vmstats.v_free_min < VMMETER_SLOP_COUNT * ncpus * 10)
		vmstats.v_free_min = VMMETER_SLOP_COUNT * ncpus * 10;
	if (vmstats.v_free_min > vmstats.v_page_count / 20)
		vmstats.v_free_min = vmstats.v_page_count / 20;

	vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
	vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
	vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
	vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;

	return 1;
}
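/*
 * Worked example (hypothetical machine with 262144 4K pages, i.e. 1GB of
 * ram, and assuming the VMMETER_SLOP_COUNT lower clamp does not apply;
 * the 1/20-of-ram upper clamp, 262144 / 20 = 13107, is not binding here):
 *
 *	v_free_min	     = 64 + (262144 - 1024) / 200	= 1369
 *	v_free_reserved	     = 1369 * 4 / 8 + 7			=  691
 *	v_free_severe	     = 1369 * 4 / 8			=  684
 *	v_pageout_free_min   = 1369 * 2 / 8 + 7			=  349
 *	v_interrupt_free_min = 1369 * 1 / 8 + 7			=  178
 */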
/*
 * vm_pageout is the high level pageout daemon.  TWO kernel threads run
 * this daemon, the primary pageout daemon and the emergency pageout daemon.
 *
 * The emergency pageout daemon takes over when the primary pageout daemon
 * deadlocks.  The emergency pageout daemon ONLY pages out to swap, thus
 * avoiding the many low-memory deadlocks which can occur when paging out
 * to VFS's.
 */
static void
vm_pageout_thread(void)
{
	int pass;
	int q;
	int q1iterator = 0;
	int q2iterator = 0;
	int isep;
	int emrunning;

	curthread->td_flags |= TDF_SYSTHREAD;

	/*
	 * We only need to do the setup once.
	 */
	isep = 0;
	emrunning = 0;
	if (curthread == emergpager) {
		isep = 1;
		goto skip_setup;
	}

	/*
	 * Initialize some paging parameters.
	 */
	vm_pageout_free_page_calc(vmstats.v_page_count);

	/*
	 * v_free_target and v_cache_min control pageout hysteresis.  Note
	 * that these are more a measure of the VM cache queue hysteresis
	 * than the VM free queue.  Specifically, v_free_target is the
	 * high water mark (free+cache pages).
	 *
	 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
	 * low water mark, while v_free_min is the stop.  v_cache_min must
	 * be big enough to handle memory needs while the pageout daemon
	 * is signalled and run to free more pages.
	 */
	if (vmstats.v_free_count > 6144)
		vmstats.v_free_target = 4 * vmstats.v_free_min +
					vmstats.v_free_reserved;
	else
		vmstats.v_free_target = 2 * vmstats.v_free_min +
					vmstats.v_free_reserved;

	/*
	 * NOTE: With the new buffer cache b_act_count we want the default
	 *	 inactive target to be a percentage of available memory.
	 *
	 *	 The inactive target essentially determines the minimum
	 *	 number of 'temporary' pages capable of caching one-time-use
	 *	 files when the VM system is otherwise full of pages
	 *	 belonging to multi-time-use files or active program data.
	 *
	 * NOTE: The inactive target is aggressively pursued only if the
	 *	 inactive queue becomes too small.  If the inactive queue
	 *	 is large enough to satisfy page movement to free+cache
	 *	 then it is repopulated more slowly from the active queue.
	 *	 This allows a general inactive_target default to be set.
	 *
	 *	 There is an issue here for processes which sit mostly idle
	 *	 'overnight', such as sshd, tcsh, and X.  Any movement from
	 *	 the active queue will eventually cause such pages to be
	 *	 recycled, eventually causing a lot of paging in the morning.
	 *	 To reduce the incidence of this, pages cycled out of the
	 *	 buffer cache are moved directly to the inactive queue if
	 *	 they were only used once or twice.
	 *
	 *	 The vfs.vm_cycle_point sysctl can be used to adjust this.
	 *	 Increasing the value (up to 64) increases the number of
	 *	 buffer recyclements which go directly to the inactive queue.
	 */
	if (vmstats.v_free_count > 2048) {
		vmstats.v_cache_min = vmstats.v_free_target;
		vmstats.v_cache_max = 2 * vmstats.v_cache_min;
	} else {
		vmstats.v_cache_min = 0;
		vmstats.v_cache_max = 0;
	}
	vmstats.v_inactive_target = vmstats.v_free_count / 4;

	/* XXX does not really belong here */
	if (vm_page_max_wired == 0)
		vm_page_max_wired = vmstats.v_free_count / 3;

	if (vm_pageout_stats_max == 0)
		vm_pageout_stats_max = vmstats.v_free_target;

	/*
	 * Set the interval in seconds for stats scans.
	 */
	if (vm_pageout_stats_interval == 0)
		vm_pageout_stats_interval = 5;
	if (vm_pageout_full_stats_interval == 0)
		vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;

	/*
	 * Set maximum free per pass.
	 */
	if (vm_pageout_stats_free_max == 0)
		vm_pageout_stats_free_max = 5;

	swap_pager_swap_init();
	pass = 0;

	atomic_swap_int(&sequence_emerg_pager, 1);
	wakeup(&sequence_emerg_pager);

skip_setup:
	/*
	 * Sequence the emergency pager startup.
	 */
	if (isep) {
		while (sequence_emerg_pager == 0)
			tsleep(&sequence_emerg_pager, 0, "pstartup", hz);
	}
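	/*
	 * Illustrative numbers for the primary daemon's setup above,
	 * continuing the hypothetical 1GB example (v_free_min = 1369,
	 * v_free_reserved = 691) and assuming roughly 250000 pages are
	 * free at boot:
	 *
	 *	v_free_target	  = 4 * 1369 + 691	=  6167
	 *	v_cache_min	  = v_free_target	=  6167
	 *	v_cache_max	  = 2 * v_cache_min	= 12334
	 *	v_inactive_target = 250000 / 4		= 62500
	 */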
	/*
	 * The pageout daemon is never done, so loop forever.
	 *
	 * WARNING! This code is being executed by two kernel threads
	 *	    potentially simultaneously.
	 */
	while (TRUE) {
		int error;
		int avail_shortage;
		int inactive_shortage;
		int vnodes_skipped = 0;
		int recycle_count = 0;
		int tmp;

		/*
		 * Wait for an action request.  If we time out, check to
		 * see if paging is needed (in case the normal wakeup
		 * code raced us).
		 */
		if (isep) {
			/*
			 * The emergency pagedaemon monitors the primary
			 * pagedaemon while vm_pages_needed != 0.
			 *
			 * The emergency pagedaemon only runs if VM paging
			 * is needed and the primary pagedaemon has not
			 * updated vm_pagedaemon_time for more than 2 seconds.
			 */
			if (vm_pages_needed)
				tsleep(&vm_pagedaemon_time, 0, "psleep", hz);
			else
				tsleep(&vm_pagedaemon_time, 0, "psleep", hz*10);
			if (vm_pages_needed == 0) {
				pass = 0;
				continue;
			}
			if ((int)(ticks - vm_pagedaemon_time) < hz * 2) {
				if (emrunning) {
					emrunning = 0;
					kprintf("Emergency pager finished\n");
				}
				pass = 0;
				continue;
			}
			if (emrunning == 0) {
				emrunning = 1;
				kprintf("Emergency pager running\n");
			}
		} else {
			/*
			 * Primary pagedaemon
			 */
			if (vm_pages_needed == 0) {
				error = tsleep(&vm_pages_needed,
					       0, "psleep",
					       vm_pageout_stats_interval * hz);
				if (error &&
				    vm_paging_needed() == 0 &&
				    vm_pages_needed == 0) {
					for (q = 0; q < PQ_L2_SIZE; ++q)
						vm_pageout_page_stats(q);
					continue;
				}
				vm_pagedaemon_time = ticks;
				vm_pages_needed = 1;

				/*
				 * Wake the emergency pagedaemon up so it
				 * can monitor us.  It will automatically
				 * go back into a long sleep when
				 * vm_pages_needed returns to 0.
				 */
				wakeup(&vm_pagedaemon_time);
			}
		}

		mycpu->gd_cnt.v_pdwakeups++;

		/*
		 * Scan for INACTIVE->CLEAN/PAGEOUT
		 *
		 * This routine tries to avoid thrashing the system with
		 * unnecessary activity.
		 *
		 * Calculate our target for the number of free+cache pages we
		 * want to get to.  This is higher than the number that causes
		 * allocations to stall (severe) in order to provide
		 * hysteresis, and if we don't make it all the way but get to
		 * the minimum we're happy.  Goose it a bit if there are
		 * multiple requests for memory.
		 *
		 * Don't reduce avail_shortage inside the loop or the
		 * PQAVERAGE() calculation will break.
		 *
		 * NOTE! deficit is differentiated from avail_shortage as
		 *	 REQUIRING at least (deficit) pages to be cleaned,
		 *	 even if the page queues are in good shape.  This
		 *	 is used primarily for handling per-process
		 *	 RLIMIT_RSS and may also see small values when
		 *	 processes block due to low memory.
		 */
		vmstats_rollup();
		if (isep == 0)
			vm_pagedaemon_time = ticks;
		avail_shortage = vm_paging_target() + vm_pageout_deficit;
		vm_pageout_deficit = 0;

		if (avail_shortage > 0) {
			int delta = 0;

			for (q = 0; q < PQ_L2_SIZE; ++q) {
				delta += vm_pageout_scan_inactive(
					pass,
					(q + q1iterator) & PQ_L2_MASK,
					PQAVERAGE(avail_shortage),
					&vnodes_skipped);
				if (avail_shortage - delta <= 0)
					break;
			}
			avail_shortage -= delta;
			q1iterator = q + 1;
		}
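		/*
		 * Illustrative note on the loop above: the shortage is
		 * spread across the PQ_L2_SIZE inactive sub-queues, each
		 * call to vm_pageout_scan_inactive() being handed roughly
		 * an even share (PQAVERAGE(avail_shortage)) as its local
		 * target.  The loop exits early once the accumulated delta
		 * covers the whole shortage, and q1iterator rotates the
		 * starting sub-queue so later queues are not starved
		 * across passes.
		 */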
		/*
		 * Figure out how many active pages we must deactivate.  If
		 * we were able to reach our target with just the inactive
		 * scan above we limit the number of active pages we
		 * deactivate to reduce unnecessary work.
		 */
		vmstats_rollup();
		if (isep == 0)
			vm_pagedaemon_time = ticks;
		inactive_shortage = vmstats.v_inactive_target -
				    vmstats.v_inactive_count;

		/*
		 * If we were unable to free sufficient inactive pages to
		 * satisfy the free/cache queue requirements then simply
		 * reaching the inactive target may not be good enough.
		 * Try to deactivate pages in excess of the target based
		 * on the shortfall.
		 *
		 * However, to prevent thrashing the VM system, do not
		 * deactivate more than an additional 1/10 of the inactive
		 * target's worth of active pages.
		 */
		if (avail_shortage > 0) {
			tmp = avail_shortage * 2;
			if (tmp > vmstats.v_inactive_target / 10)
				tmp = vmstats.v_inactive_target / 10;
			inactive_shortage += tmp;
		}

		/*
		 * Only trigger a pmap cleanup on inactive shortage.
		 */
		if (isep == 0 && inactive_shortage > 0) {
			pmap_collect();
		}

		/*
		 * Scan for ACTIVE->INACTIVE
		 *
		 * Only trigger on inactive shortage.  Triggering on
		 * avail_shortage can starve the active queue with
		 * unnecessary active->inactive transitions and destroy
		 * performance.
		 *
		 * If this is the emergency pager, always try to move
		 * a few pages from active to inactive because the inactive
		 * queue might have enough pages, but not enough anonymous
		 * pages.
		 */
		if (isep && inactive_shortage < vm_emerg_launder)
			inactive_shortage = vm_emerg_launder;

		if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) {
			int delta = 0;

			for (q = 0; q < PQ_L2_SIZE; ++q) {
				delta += vm_pageout_scan_active(
						pass,
						(q + q2iterator) & PQ_L2_MASK,
						PQAVERAGE(avail_shortage),
						PQAVERAGE(inactive_shortage),
						&recycle_count);
				if (inactive_shortage - delta <= 0 &&
				    avail_shortage - delta <= 0) {
					break;
				}
			}
			inactive_shortage -= delta;
			avail_shortage -= delta;
			q2iterator = q + 1;
		}

		/*
		 * Scan for CACHE->FREE
		 *
		 * Finally free enough cache pages to meet our free page
		 * requirement and take more drastic measures if we are
		 * still in trouble.
		 */
		vmstats_rollup();
		if (isep == 0)
			vm_pagedaemon_time = ticks;
		vm_pageout_scan_cache(avail_shortage, pass,
				      vnodes_skipped, recycle_count);
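		/*
		 * Descriptive summary of the back-off below: while a
		 * shortage persists, passes 1-9 retry immediately when
		 * additional processes are waiting (or after hz/5 ticks
		 * if swap is completely full), and otherwise sleep for
		 * hz/10 ticks; from pass 10 onward the daemon always
		 * sleeps at least hz/10 ticks, degrading to one-second
		 * sleeps once swap is completely full.
		 */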
		/*
		 * Wait for more work.
		 */
		if (avail_shortage > 0) {
			++pass;
			if (pass < 10 && vm_pages_needed > 1) {
				/*
				 * Normal operation, additional processes
				 * have already kicked us.  Retry immediately
				 * unless swap space is completely full in
				 * which case delay a bit.
				 */
				if (swap_pager_full) {
					tsleep(&vm_pages_needed, 0, "pdelay",
					       hz / 5);
				} /* else immediate retry */
			} else if (pass < 10) {
				/*
				 * Normal operation, fewer processes.  Delay
				 * a bit but allow wakeups.  vm_pages_needed
				 * is only adjusted against the primary
				 * pagedaemon here.
				 */
				if (isep == 0)
					vm_pages_needed = 0;
				tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
				if (isep == 0)
					vm_pages_needed = 1;
			} else if (swap_pager_full == 0) {
				/*
				 * We've taken too many passes, force a
				 * delay.
				 */
				tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
			} else {
				/*
				 * Running out of memory, catastrophic
				 * back-off to one-second intervals.
				 */
				tsleep(&vm_pages_needed, 0, "pdelay", hz);
			}
		} else if (vm_pages_needed) {
			/*
			 * Interlocked wakeup of waiters (non-optional).
			 *
			 * Similar to vm_page_free_wakeup() in vm_page.c,
			 * wake the waiters up once the appropriate
			 * thresholds have been reached.
			 */
			pass = 0;
			if (!vm_page_count_min(vm_page_free_hysteresis) ||
			    !vm_page_count_target()) {
				vm_pages_needed = 0;
				wakeup(&vmstats.v_free_count);
			}
		} else {
			pass = 0;
		}
	}
}

static struct kproc_desc pg1_kp = {
	"pagedaemon",
	vm_pageout_thread,
	&pagethread
};
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &pg1_kp);

static struct kproc_desc pg2_kp = {
	"emergpager",
	vm_pageout_thread,
	&emergpager
};
SYSINIT(emergpager, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &pg2_kp);

/*
 * Called after allocating a page out of the cache or free queue
 * to possibly wake the pagedaemon up to replenish our supply.
 *
 * We try to generate some hysteresis by waking the pagedaemon up
 * when our free+cache pages go below the free_min+cache_min level.
 * The pagedaemon tries to get the count back up to at least the
 * minimum, and through to the target level if possible.
 *
 * If the pagedaemon is already active bump vm_pages_needed as a hint
 * that there are even more requests pending.
 *
 * SMP races ok?
 * No requirements.
 */
void
pagedaemon_wakeup(void)
{
	if (vm_paging_needed() && curthread != pagethread) {
		if (vm_pages_needed == 0) {
			vm_pages_needed = 1;	/* SMP race ok */
			wakeup(&vm_pages_needed);
		} else if (vm_page_count_min(0)) {
			++vm_pages_needed;	/* SMP race ok */
		}
	}
}
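/*
 * Illustrative note: the wakeup is cheap in the common case.  When
 * vm_paging_needed() is false, or when the caller is the pagedaemon
 * itself, nothing is touched.  Otherwise only the first waker sets
 * vm_pages_needed and issues the wakeup; later callers merely bump the
 * count once the free page count has fallen below the minimum, which
 * the pageout loop uses as a hint that multiple requests are pending.
 */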
#if !defined(NO_SWAPPING)

/*
 * SMP races ok?
 * No requirements.
 */
static void
vm_req_vmdaemon(void)
{
	static int lastrun = 0;

	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
		wakeup(&vm_daemon_needed);
		lastrun = ticks;
	}
}

static int vm_daemon_callback(struct proc *p, void *data __unused);

/*
 * No requirements.
 */
static void
vm_daemon(void)
{
	int req_swapout;

	while (TRUE) {
		tsleep(&vm_daemon_needed, 0, "psleep", 0);
		req_swapout = atomic_swap_int(&vm_pageout_req_swapout, 0);

		/*
		 * Forced swapouts.
		 */
		if (req_swapout)
			swapout_procs(vm_pageout_req_swapout);

		/*
		 * Scan the processes for those exceeding their rlimits or
		 * that are swapped out -- deactivate pages.
		 */
		allproc_scan(vm_daemon_callback, NULL, 0);
	}
}

static int
vm_daemon_callback(struct proc *p, void *data __unused)
{
	struct vmspace *vm;
	vm_pindex_t limit, size;

	/*
	 * If this is a system process or if we have already looked at
	 * this process, skip it.
	 */
	lwkt_gettoken(&p->p_token);

	if (p->p_flags & (P_SYSTEM | P_WEXIT)) {
		lwkt_reltoken(&p->p_token);
		return (0);
	}

	/*
	 * If the process is in a non-running state, don't touch it.
	 */
	if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
		lwkt_reltoken(&p->p_token);
		return (0);
	}

	/*
	 * Get a limit.
	 */
	limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
				p->p_rlimit[RLIMIT_RSS].rlim_max));

	/*
	 * Let processes that are swapped out really be swapped out.
	 * Set the limit to nothing to get as many pages out to swap
	 * as possible.
	 */
	if (p->p_flags & P_SWAPPEDOUT)
		limit = 0;

	vm = p->p_vmspace;
	vmspace_hold(vm);
	size = pmap_resident_tlnw_count(&vm->vm_pmap);
	if (limit >= 0 && size > 4096 &&
	    size - 4096 >= limit && vm_pageout_memuse_mode >= 1) {
		vm_pageout_map_deactivate_pages(&vm->vm_map, limit);
	}
	vmspace_drop(vm);

	lwkt_reltoken(&p->p_token);

	return (0);
}

#endif