/*
 * Copyright (c) 2003-2020 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * The proverbial page-out daemon, rewritten many times over the decades.
 */

#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/conf.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>

#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

/*
 * System initialization
 */

/* the kernel process "vm_pageout" */
static int vm_pageout_page(vm_page_t m, long *max_launderp,
                           long *vnodes_skippedp, struct vnode **vpfailedp,
                           int pass, int vmflush_flags);
static int vm_pageout_clean_helper (vm_page_t, int);
static void vm_pageout_free_page_calc (vm_size_t count);
static void vm_pageout_page_free(vm_page_t m);
struct thread *emergpager;
struct thread *pagethread;
static int sequence_emerg_pager;

#if !defined(NO_SWAPPING)
/* the kernel process "vm_daemon" */
static void vm_daemon (void);
static struct thread *vmthread;

static struct kproc_desc vm_kp = {
        "vmdaemon",
        vm_daemon,
        &vmthread
};
SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
#endif

int vm_pages_needed = 0;        /* Event on which pageout daemon sleeps */
int vm_pageout_deficit = 0;     /* Estimated number of pages deficit */
int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */
int vm_page_free_hysteresis = 16;
static int vm_pagedaemon_time;

#if !defined(NO_SWAPPING)
static int vm_pageout_req_swapout;
static int vm_daemon_needed;
#endif
__read_mostly static int vm_max_launder = 4096;
__read_mostly static int vm_emerg_launder = 100;
__read_mostly static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0;
__read_mostly static int vm_pageout_full_stats_interval = 0;
__read_mostly static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0;
__read_mostly static int defer_swap_pageouts=0;
__read_mostly static int disable_swap_pageouts=0;
__read_mostly static u_int vm_anonmem_decline = ACT_DECLINE;
__read_mostly static u_int vm_filemem_decline = ACT_DECLINE * 2;
__read_mostly static int vm_pageout_debug;

#if defined(NO_SWAPPING)
__read_mostly static int vm_swap_enabled=0;
__read_mostly static int vm_swap_idle_enabled=0;
#else
__read_mostly static int vm_swap_enabled=1;
__read_mostly static int vm_swap_idle_enabled=0;
#endif

/* 0-disable, 1-passive, 2-active swp*/
__read_mostly int vm_pageout_memuse_mode=2;

SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline,
        CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory");

SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline,
        CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache");

SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis,
        CTLFLAG_RW, &vm_page_free_hysteresis, 0,
        "Free more pages than the minimum required");

SYSCTL_INT(_vm, OID_AUTO, max_launder,
        CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
SYSCTL_INT(_vm, OID_AUTO, emerg_launder,
        CTLFLAG_RW, &vm_emerg_launder, 0, "Emergency pager minimum");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
        CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");

SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
        CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
        CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
        CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");
SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode,
        CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode");
SYSCTL_INT(_vm, OID_AUTO, pageout_debug,
        CTLFLAG_RW, &vm_pageout_debug, 0, "debug pageout pages (count)");


#if defined(NO_SWAPPING)
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
        CTLFLAG_RD, &vm_swap_enabled, 0, "");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
        CTLFLAG_RD, &vm_swap_idle_enabled, 0, "");
#else
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
        CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
        CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
#endif

SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
        CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");

SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
        CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");

static int pageout_lock_miss;
SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
        CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");

int vm_page_max_wired;          /* XXX max # of wired pages system-wide */

#if !defined(NO_SWAPPING)
static void vm_req_vmdaemon (void);
#endif
static void vm_pageout_page_stats(int q);

/*
 * Calculate approximately how many pages on each queue to try to
 * clean.  An exact calculation creates an edge condition when the
 * queues are unbalanced so add significant slop.  The queue scans
 * will stop early when targets are reached and will start where they
 * left off on the next pass.
 *
 * We need to be generous here because there are all sorts of loading
 * conditions that can cause edge cases if we try to average over all queues.
 * In particular, storage subsystems have become so fast that paging
 * activity can become quite frantic.  Eventually we will probably need
 * two paging threads, one for dirty pages and one for clean, to deal
 * with the bandwidth requirements.
 *
 * So what we do is calculate a value that can be satisfied nominally by
 * only having to scan half the queues.
 */
static __inline long
PQAVERAGE(long n)
{
        long avg;

        if (n >= 0) {
                avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1);
        } else {
                avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1);
        }
        return avg;
}

/*
 * vm_pageout_clean_helper:
 *
 * Clean the page and remove it from the laundry.  The page must be busied
 * by the caller and will be disposed of (put away, flushed) by this routine.
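 *
 * Illustrative example (added commentary, hypothetical numbers): if
 * BLIST_MAX_ALLOC were 32 and the target page had pindex 37, then
 * page_base = 37 % 32 = 5, so the reverse scan below probes pindexes
 * 36 down to 32 and the forward scan probes 38 up to 63, keeping the
 * flush cluster aligned to the swap allocation granularity.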
 */
static int
vm_pageout_clean_helper(vm_page_t m, int vmflush_flags)
{
        vm_object_t object;
        vm_page_t mc[BLIST_MAX_ALLOC];
        int error;
        int ib, is, page_base;
        vm_pindex_t pindex = m->pindex;

        object = m->object;

        /*
         * Don't mess with the page if it's held or special.  Theoretically
         * we can pageout held pages but there is no real need to press our
         * luck, so don't.
         */
        if (m->hold_count != 0 || (m->flags & PG_UNQUEUED)) {
                vm_page_wakeup(m);
                return 0;
        }

        /*
         * Place page in cluster.  Align cluster for optimal swap space
         * allocation (whether it is swap or not).  This is typically ~16-32
         * pages, which also tends to align the cluster to multiples of the
         * filesystem block size if backed by a filesystem.
         */
        page_base = pindex % BLIST_MAX_ALLOC;
        mc[page_base] = m;
        ib = page_base - 1;
        is = page_base + 1;

        /*
         * Scan object for clusterable pages.
         *
         * We can cluster ONLY if: ->> the page is NOT
         * clean, wired, busy, held, or mapped into a
         * buffer, and one of the following:
         * 1) The page is inactive, or a seldom used
         *    active page.
         * -or-
         * 2) we force the issue.
         *
         * During heavy mmap/modification loads the pageout
         * daemon can really fragment the underlying file
         * due to flushing pages out of order and not trying
         * to align the clusters (which leaves sporadic out-of-order
         * holes).  To solve this problem we do the reverse scan
         * first and attempt to align our cluster, then do a
         * forward scan if room remains.
         */
        vm_object_hold(object);

        while (ib >= 0) {
                vm_page_t p;

                p = vm_page_lookup_busy_try(object, pindex - page_base + ib,
                                            TRUE, &error);
                if (error || p == NULL)
                        break;
                if ((p->queue - p->pc) == PQ_CACHE ||
                    (p->flags & PG_UNQUEUED)) {
                        vm_page_wakeup(p);
                        break;
                }
                vm_page_test_dirty(p);
                if (((p->dirty & p->valid) == 0 &&
                     (p->flags & PG_NEED_COMMIT) == 0) ||
                    p->wire_count != 0 ||       /* may be held by buf cache */
                    p->hold_count != 0) {       /* may be undergoing I/O */
                        vm_page_wakeup(p);
                        break;
                }
                if (p->queue - p->pc != PQ_INACTIVE) {
                        if (p->queue - p->pc != PQ_ACTIVE ||
                            (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) {
                                vm_page_wakeup(p);
                                break;
                        }
                }

                /*
                 * Try to maintain page groupings in the cluster.
                 */
                if (m->flags & PG_WINATCFLS)
                        vm_page_flag_set(p, PG_WINATCFLS);
                else
                        vm_page_flag_clear(p, PG_WINATCFLS);
                p->act_count = m->act_count;

                mc[ib] = p;
                --ib;
        }
        ++ib;   /* fixup */

        while (is < BLIST_MAX_ALLOC &&
               pindex - page_base + is < object->size) {
                vm_page_t p;

                p = vm_page_lookup_busy_try(object, pindex - page_base + is,
                                            TRUE, &error);
                if (error || p == NULL)
                        break;
                if (((p->queue - p->pc) == PQ_CACHE) ||
                    (p->flags & PG_UNQUEUED)) {
                        vm_page_wakeup(p);
                        break;
                }
                vm_page_test_dirty(p);
                if (((p->dirty & p->valid) == 0 &&
                     (p->flags & PG_NEED_COMMIT) == 0) ||
                    p->wire_count != 0 ||       /* may be held by buf cache */
                    p->hold_count != 0) {       /* may be undergoing I/O */
                        vm_page_wakeup(p);
                        break;
                }
                if (p->queue - p->pc != PQ_INACTIVE) {
                        if (p->queue - p->pc != PQ_ACTIVE ||
                            (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) {
                                vm_page_wakeup(p);
                                break;
                        }
                }

                /*
                 * Try to maintain page groupings in the cluster.
                 */
                if (m->flags & PG_WINATCFLS)
                        vm_page_flag_set(p, PG_WINATCFLS);
                else
                        vm_page_flag_clear(p, PG_WINATCFLS);
                p->act_count = m->act_count;

                mc[is] = p;
                ++is;
        }

        vm_object_drop(object);

        /*
         * we allow reads during pageouts...
         */
        return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags);
}

/*
 * vm_pageout_flush() - launder the given pages
 *
 * The given pages are laundered.  Note that we setup for the start of
 * I/O ( i.e. busy the page ), mark it read-only, and bump the object
 * reference count all in here rather than in the parent.  If we want
 * the parent to do more sophisticated things we may have to change
 * the ordering.
 *
 * The pages in the array must be busied by the caller and will be
 * unbusied by this function.
 */
int
vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags)
{
        vm_object_t object;
        int pageout_status[count];
        int numpagedout = 0;
        int i;
        int dodebug;

        if (vm_pageout_debug > 0) {
                --vm_pageout_debug;
                dodebug = 1;
        } else {
                dodebug = 0;
        }

        /*
         * Initiate I/O.  Bump the vm_page_t->busy counter.
         */
        for (i = 0; i < count; i++) {
                KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
                        ("vm_pageout_flush page %p index %d/%d: partially "
                         "invalid page", mc[i], i, count));
                vm_page_io_start(mc[i]);
        }

        /*
         * We must make the pages read-only.  This will also force the
         * modified bit in the related pmaps to be cleared.  The pager
         * cannot clear the bit for us since the I/O completion code
         * typically runs from an interrupt.  The act of making the page
         * read-only handles the case for us.
         *
         * Then we can unbusy the pages, we still hold a reference by virtue
         * of our soft-busy.
         */
        if (dodebug)
                kprintf("pageout: ");
        for (i = 0; i < count; i++) {
                if (vmflush_flags & VM_PAGER_TRY_TO_CACHE)
                        vm_page_protect(mc[i], VM_PROT_NONE);
                else
                        vm_page_protect(mc[i], VM_PROT_READ);
                vm_page_wakeup(mc[i]);
                if (dodebug)
                        kprintf(" %p", mc[i]);
        }
        if (dodebug)
                kprintf("\n");

        object = mc[0]->object;
        vm_object_pip_add(object, count);

        vm_pager_put_pages(object, mc, count,
                           (vmflush_flags |
                            ((object == &kernel_object) ?
                                VM_PAGER_PUT_SYNC : 0)),
                           pageout_status);

        if (dodebug)
                kprintf("result: ");
        for (i = 0; i < count; i++) {
                vm_page_t mt = mc[i];

                if (dodebug)
                        kprintf("  S%d", pageout_status[i]);

                switch (pageout_status[i]) {
                case VM_PAGER_OK:
                        numpagedout++;
                        break;
                case VM_PAGER_PEND:
                        numpagedout++;
                        break;
                case VM_PAGER_BAD:
                        /*
                         * Page outside of range of object.  Right now we
                         * essentially lose the changes by pretending it
                         * worked.
                         */
                        vm_page_busy_wait(mt, FALSE, "pgbad");
                        pmap_clear_modify(mt);
                        vm_page_undirty(mt);
                        vm_page_wakeup(mt);
                        break;
                case VM_PAGER_ERROR:
                case VM_PAGER_FAIL:
                        /*
                         * A page typically cannot be paged out when we
                         * have run out of swap.  We leave the page
                         * marked inactive and will try to page it out
                         * again later.
                         *
                         * Starvation of the active page list is used to
                         * determine when the system is massively memory
                         * starved.
                         */
                        break;
                case VM_PAGER_AGAIN:
                        break;
                }

                /*
                 * If not PENDing this was a synchronous operation and we
                 * clean up after the I/O.  If it is PENDing the mess is
                 * cleaned up asynchronously.
                 *
                 * Also nominally act on the caller's wishes if the caller
                 * wants to try to really clean (cache or free) the page.
                 *
                 * Also nominally deactivate the page if the system is
                 * memory-stressed.
                 */
                if (pageout_status[i] != VM_PAGER_PEND) {
                        vm_page_busy_wait(mt, FALSE, "pgouw");
                        vm_page_io_finish(mt);
                        if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) {
                                vm_page_try_to_cache(mt);
                                if (dodebug)
                                        kprintf("A[pq_cache=%d]",
                                                ((mt->queue - mt->pc) == PQ_CACHE));
                        } else if (vm_page_count_severe()) {
                                vm_page_deactivate(mt);
                                vm_page_wakeup(mt);
                                if (dodebug)
                                        kprintf("B");
                        } else {
                                vm_page_wakeup(mt);
                                if (dodebug)
                                        kprintf("C");
                        }
                        vm_object_pip_wakeup(object);
                }
        }
        if (dodebug)
                kprintf("\n");
        return numpagedout;
}

#if !defined(NO_SWAPPING)

/*
 * Callback function, page busied for us.  We must dispose of the busy
 * condition.  Any related pmap pages may be held but will not be locked.
 */
static
int
vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va,
                        vm_page_t p)
{
        int actcount;
        int cleanit = 0;

        /*
         * Basic tests - There should never be a marker, and we can stop
         * once the RSS is below the required level.
         */
        KKASSERT((p->flags & PG_MARKER) == 0);
        if (pmap_resident_tlnw_count(info->pmap) <= info->limit) {
                vm_page_wakeup(p);
                return(-1);
        }

        mycpu->gd_cnt.v_pdpages++;

        if (p->wire_count || p->hold_count || (p->flags & PG_UNQUEUED)) {
                vm_page_wakeup(p);
                goto done;
        }

        ++info->actioncount;

        /*
         * Check if the page has been referenced recently.  If it has,
         * activate it and skip.
         */
        actcount = pmap_ts_referenced(p);
        if (actcount) {
                vm_page_flag_set(p, PG_REFERENCED);
        } else if (p->flags & PG_REFERENCED) {
                actcount = 1;
        }

        if (actcount) {
                if (p->queue - p->pc != PQ_ACTIVE) {
                        vm_page_and_queue_spin_lock(p);
                        if (p->queue - p->pc != PQ_ACTIVE) {
                                vm_page_and_queue_spin_unlock(p);
                                vm_page_activate(p);
                        } else {
                                vm_page_and_queue_spin_unlock(p);
                        }
                } else {
                        p->act_count += actcount;
                        if (p->act_count > ACT_MAX)
                                p->act_count = ACT_MAX;
                }
                vm_page_flag_clear(p, PG_REFERENCED);
                vm_page_wakeup(p);
                goto done;
        }

        /*
         * Remove the page from this particular pmap.  Once we do this, our
         * pmap scans will not see it again (unless it gets faulted in), so
         * we must actively dispose of or deal with the page.
         */
        pmap_remove_specific(info->pmap, p);

        /*
         * If the page is not mapped to another process (i.e. as would be
         * typical if this were a shared page from a library) then deactivate
         * the page and clean it in two passes only.
         *
         * If the page hasn't been referenced since the last check, remove it
         * from the pmap.  If it is no longer mapped, deactivate it
         * immediately, accelerating the normal decline.
         *
         * Once the page has been removed from the pmap the RSS code no
         * longer tracks it so we have to make sure that it is staged for
         * potential flush action.
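         *
         * Example (added commentary, not from the original comment): a
         * private anonymous page mapped only by this process loses its
         * last mapping here and is deactivated (and possibly cleaned)
         * immediately, while a shared library page still mapped by other
         * processes is left to age through the normal active/inactive
         * scans.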
         */
        if ((p->flags & PG_MAPPED) == 0 ||
            (pmap_mapped_sync(p) & PG_MAPPED) == 0) {
                if (p->queue - p->pc == PQ_ACTIVE) {
                        vm_page_deactivate(p);
                }
                if (p->queue - p->pc == PQ_INACTIVE) {
                        cleanit = 1;
                }
        }

        /*
         * Ok, try to fully clean the page and any nearby pages such that at
         * least the requested page is freed or moved to the cache queue.
         *
         * We usually do this synchronously to allow us to get the page into
         * the CACHE queue quickly, which will prevent memory exhaustion if
         * a process with a memoryuse limit is running away.  However, the
         * sysadmin may desire to set vm.swap_user_async which relaxes this
         * and improves write performance.
         */
        if (cleanit) {
                long max_launder = 0x7FFF;
                long vnodes_skipped = 0;
                int vmflush_flags;
                struct vnode *vpfailed = NULL;

                info->offset = va;

                if (vm_pageout_memuse_mode >= 2) {
                        vmflush_flags = VM_PAGER_TRY_TO_CACHE |
                                        VM_PAGER_ALLOW_ACTIVE;
                        if (swap_user_async == 0)
                                vmflush_flags |= VM_PAGER_PUT_SYNC;
                        vm_page_flag_set(p, PG_WINATCFLS);
                        info->cleancount +=
                                vm_pageout_page(p, &max_launder,
                                                &vnodes_skipped,
                                                &vpfailed, 1, vmflush_flags);
                } else {
                        vm_page_wakeup(p);
                        ++info->cleancount;
                }
        } else {
                vm_page_wakeup(p);
        }

        /*
         * Must be at end to avoid SMP races.
         */
done:
        lwkt_user_yield();
        return 0;
}

/*
 * Deactivate some number of pages in a map due to set RLIMIT_RSS limits,
 * which is relatively difficult to do.  We try to keep track of where we
 * left off last time to reduce scan overhead.
 *
 * Called when vm_pageout_memuse_mode is >= 1.
 */
void
vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit)
{
        vm_offset_t pgout_offset;
        struct pmap_pgscan_info info;
        int retries = 3;

        pgout_offset = map->pgout_offset;
again:
#if 0
        kprintf("%016jx ", pgout_offset);
#endif
        if (pgout_offset < VM_MIN_USER_ADDRESS)
                pgout_offset = VM_MIN_USER_ADDRESS;
        if (pgout_offset >= VM_MAX_USER_ADDRESS)
                pgout_offset = 0;
        info.pmap = vm_map_pmap(map);
        info.limit = limit;
        info.beg_addr = pgout_offset;
        info.end_addr = VM_MAX_USER_ADDRESS;
        info.callback = vm_pageout_mdp_callback;
        info.cleancount = 0;
        info.actioncount = 0;
        info.busycount = 0;

        pmap_pgscan(&info);
        pgout_offset = info.offset;
#if 0
        kprintf("%016jx %08lx %08lx\n", pgout_offset,
                info.cleancount, info.actioncount);
#endif

        if (pgout_offset != VM_MAX_USER_ADDRESS &&
            pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
                goto again;
        } else if (retries &&
                   pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
                --retries;
                goto again;
        }
        map->pgout_offset = pgout_offset;
}
#endif

/*
 * Called when the pageout scan wants to free a page.  We no longer
 * try to cycle the vm_object here with a reference & dealloc, which can
 * cause a non-trivial object collapse in a critical path.
 *
 * It is unclear why we cycled the ref_count in the past, perhaps to try
 * to optimize shadow chain collapses but I don't quite see why it would
 * be necessary.  An OBJ_DEAD object should terminate any and all vm_pages
 * synchronously and not have to be kick-started.
 */
static void
vm_pageout_page_free(vm_page_t m)
{
        vm_page_protect(m, VM_PROT_NONE);
        vm_page_free(m);
}

/*
 * vm_pageout_scan does the dirty work for the pageout daemon.
 */
struct vm_pageout_scan_info {
        struct proc *bigproc;
        vm_offset_t bigsize;
};

static int vm_pageout_scan_callback(struct proc *p, void *data);

/*
 * Scan inactive queue
 *
 * WARNING! Can be called from two pagedaemon threads simultaneously.
 */
static int
vm_pageout_scan_inactive(int pass, int q, long avail_shortage,
                         long *vnodes_skipped)
{
        vm_page_t m;
        struct vm_page marker;
        struct vnode *vpfailed;         /* warning, allowed to be stale */
        long maxscan;
        long delta = 0;
        long max_launder;
        int isep;
        int vmflush_flags;

        isep = (curthread == emergpager);

        /*
         * Start scanning the inactive queue for pages we can move to the
         * cache or free.  The scan will stop when the target is reached or
         * we have scanned the entire inactive queue.  Note that m->act_count
         * is not used to form decisions for the inactive queue, only for the
         * active queue.
         *
         * max_launder limits the number of dirty pages we flush per scan.
         * For most systems a smaller value (16 or 32) is more robust under
         * extreme memory and disk pressure because any unnecessary writes
         * to disk can result in extreme performance degradation.  However,
         * systems with excessive dirty pages (especially when MAP_NOSYNC is
         * used) will die horribly with limited laundering.  If the pageout
         * daemon cannot clean enough pages in the first pass, we let it go
         * all out in succeeding passes.
         *
         * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
         *        PAGES.
         */
        if ((max_launder = vm_max_launder) <= 1)
                max_launder = 1;
        if (pass)
                max_launder = 10000;

        /*
         * Initialize our marker
         */
        bzero(&marker, sizeof(marker));
        marker.flags = PG_FICTITIOUS | PG_MARKER;
        marker.busy_count = PBUSY_LOCKED;
        marker.queue = PQ_INACTIVE + q;
        marker.pc = q;
        marker.wire_count = 1;

        /*
         * Inactive queue scan.
         *
         * We pick off approximately 1/10 of each queue.  Each queue is
         * effectively organized LRU so scanning the entire queue would
         * improperly pick up pages that might still be in regular use.
         *
         * NOTE: The vm_page must be spinlocked before the queue to avoid
         *       deadlocks, so it is easiest to simply iterate the loop
         *       with the queue unlocked at the top.
         */
        vpfailed = NULL;

        vm_page_queues_spin_lock(PQ_INACTIVE + q);
        TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
        maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt / 10 + 1;

        /*
         * Queue locked at top of loop to avoid stack marker issues.
         */
        while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
               maxscan-- > 0 && avail_shortage - delta > 0)
        {
                int count;

                KKASSERT(m->queue == PQ_INACTIVE + q);
                TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl,
                             &marker, pageq);
                TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m,
                                   &marker, pageq);
                mycpu->gd_cnt.v_pdpages++;

                /*
                 * Skip marker pages (atomic against other markers to avoid
                 * infinite hop-over scans).
                 */
                if (m->flags & PG_MARKER)
                        continue;

                /*
                 * Try to busy the page.  Don't mess with pages which are
                 * already busy or reorder them in the queue.
                 */
                if (vm_page_busy_try(m, TRUE))
                        continue;

                /*
                 * Remaining operations run with the page busy and neither
                 * the page nor the queue will be spin-locked.
                 */
                KKASSERT(m->queue == PQ_INACTIVE + q);
                vm_page_queues_spin_unlock(PQ_INACTIVE + q);

                /*
                 * The emergency pager runs when the primary pager gets
                 * stuck, which typically means the primary pager deadlocked
                 * on a vnode-backed page.  Therefore, the emergency pager
                 * must skip any complex objects.
                 *
                 * We disallow VNODEs unless they are VCHR whose device ops
                 * do not flag D_NOEMERGPGR.
                 */
                if (isep && m->object) {
                        struct vnode *vp;

                        switch(m->object->type) {
                        case OBJT_DEFAULT:
                        case OBJT_SWAP:
                                /*
                                 * Allow anonymous memory and assume that
                                 * swap devices are not complex, since it's
                                 * kinda worthless if we can't swap out dirty
                                 * anonymous pages.
                                 */
                                break;
                        case OBJT_VNODE:
                                /*
                                 * Allow VCHR device if the D_NOEMERGPGR
                                 * flag is not set, deny other vnode types
                                 * as being too complex.
                                 */
                                vp = m->object->handle;
                                if (vp && vp->v_type == VCHR &&
                                    vp->v_rdev && vp->v_rdev->si_ops &&
                                    (vp->v_rdev->si_ops->head.flags &
                                     D_NOEMERGPGR) == 0) {
                                        break;
                                }
                                /* Deny - fall through */
                        default:
                                /*
                                 * Deny
                                 */
                                vm_page_wakeup(m);
                                vm_page_queues_spin_lock(PQ_INACTIVE + q);
                                lwkt_yield();
                                continue;
                        }
                }

                /*
                 * Try to pageout the page and perhaps other nearby pages.
                 * We want to get the pages into the cache on the second
                 * pass.  Otherwise the pages can wind up just cycling in
                 * the inactive queue, getting flushed over and over again.
                 */
                if (vm_pageout_memuse_mode >= 2)
                        vm_page_flag_set(m, PG_WINATCFLS);
                if (m->flags & PG_WINATCFLS)
                        vmflush_flags = VM_PAGER_TRY_TO_CACHE;
                else
                        vmflush_flags = 0;
                count = vm_pageout_page(m, &max_launder, vnodes_skipped,
                                        &vpfailed, pass, vmflush_flags);
                delta += count;

                /*
                 * Systems with a ton of memory can wind up with huge
                 * deactivation counts.  Because the inactive scan is
                 * doing a lot of flushing, the combination can result
                 * in excessive paging even in situations where other
                 * unrelated threads free up sufficient VM.
                 *
                 * To deal with this we abort the nominal active->inactive
                 * scan before we hit the inactive target when free+cache
                 * levels have reached a reasonable target.
                 *
                 * When deciding to stop early we need to add some slop to
                 * the test and we need to return full completion to the caller
                 * to prevent the caller from thinking there is something
                 * wrong and issuing a low-memory+swap warning or pkill.
                 *
                 * A deficit forces paging regardless of the state of the
                 * VM page queues (used for RSS enforcement).
                 */
                lwkt_yield();
                vm_page_queues_spin_lock(PQ_INACTIVE + q);
                if (vm_paging_target() < -vm_max_launder) {
                        /*
                         * Stopping early, return full completion to caller.
                         */
                        if (delta < avail_shortage)
                                delta = avail_shortage;
                        break;
                }
        }

        /* page queue still spin-locked */
        TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
        vm_page_queues_spin_unlock(PQ_INACTIVE + q);

        return (delta);
}

/*
 * Pageout the specified page, return the total number of pages paged out
 * (this routine may cluster).
 *
 * The page must be busied and soft-busied by the caller and will be disposed
 * of by this function.
 */
static int
vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp,
                struct vnode **vpfailedp, int pass, int vmflush_flags)
{
        vm_object_t object;
        int actcount;
        int count = 0;

        /*
         * Wiring no longer removes a page from its queue.  The last unwiring
         * will requeue the page.  Obviously wired pages cannot be paged out
         * so unqueue it and return.
         */
        if (m->wire_count) {
                vm_page_unqueue_nowakeup(m);
                vm_page_wakeup(m);
                return 0;
        }

        /*
         * A held page may be undergoing I/O, so skip it.
         */
        if (m->hold_count) {
                vm_page_and_queue_spin_lock(m);
                if (m->queue - m->pc == PQ_INACTIVE) {
                        TAILQ_REMOVE(
                                &vm_page_queues[m->queue].pl, m, pageq);
                        TAILQ_INSERT_TAIL(
                                &vm_page_queues[m->queue].pl, m, pageq);
                }
                vm_page_and_queue_spin_unlock(m);
                vm_page_wakeup(m);
                return 0;
        }

        if (m->object == NULL || m->object->ref_count == 0) {
                /*
                 * If the object is not being used, we ignore previous
                 * references.
                 */
                vm_page_flag_clear(m, PG_REFERENCED);
                pmap_clear_reference(m);
                /* fall through to end */
        } else if (((m->flags & PG_REFERENCED) == 0) &&
                   (actcount = pmap_ts_referenced(m))) {
                /*
                 * Otherwise, if the page has been referenced while
                 * in the inactive queue, we bump the "activation
                 * count" upwards, making it less likely that the
                 * page will be added back to the inactive queue
                 * prematurely again.  Here we check the page tables
                 * (or emulated bits, if any), given the upper level
                 * VM system not knowing anything about existing
                 * references.
                 */
                vm_page_activate(m);
                m->act_count += (actcount + ACT_ADVANCE);
                vm_page_wakeup(m);
                return 0;
        }

        /*
         * (m) is still busied.
         *
         * If the upper level VM system knows about any page
         * references, we activate the page.  We also set the
         * "activation count" higher than normal so that we are
         * less likely to place pages back onto the inactive queue
         * again.
         */
        if ((m->flags & PG_REFERENCED) != 0) {
                vm_page_flag_clear(m, PG_REFERENCED);
                actcount = pmap_ts_referenced(m);
                vm_page_activate(m);
                m->act_count += (actcount + ACT_ADVANCE + 1);
                vm_page_wakeup(m);
                return 0;
        }

        /*
         * If the upper level VM system doesn't know anything about
         * the page being dirty, we have to check for it again.  As
         * far as the VM code knows, any partially dirty pages are
         * fully dirty.
         *
         * Pages marked PG_WRITEABLE may be mapped into the user
         * address space of a process running on another cpu.  A
         * user process (without holding the MP lock) running on
         * another cpu may be able to touch the page while we are
         * trying to remove it.  vm_page_cache() will handle this
         * case for us.
         */
        if (m->dirty == 0) {
                vm_page_test_dirty(m);
        } else {
                vm_page_dirty(m);
        }

        if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
                /*
                 * Invalid pages can be easily freed
                 */
                vm_pageout_page_free(m);
                mycpu->gd_cnt.v_dfree++;
                ++count;
        } else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
                /*
                 * Clean pages can be placed onto the cache queue.
                 * This effectively frees them.
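                 *
                 * Overview of the disposition ladder in this function
                 * (added commentary): invalid pages are freed outright,
                 * clean pages are cached here, dirty pages are cycled
                 * through the inactive queue once (PG_WINATCFLS) before
                 * being laundered on a later pass, and laundering itself
                 * is rationed by *max_launderp.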
                 */
                vm_page_cache(m);
                ++count;
        } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
                /*
                 * Dirty pages need to be paged out, but flushing
                 * a page is extremely expensive versus freeing
                 * a clean page.  Rather than artificially limiting
                 * the number of pages we can flush, we instead give
                 * dirty pages extra priority on the inactive queue
                 * by forcing them to be cycled through the queue
                 * twice before being flushed, after which the
                 * (now clean) page will cycle through once more
                 * before being freed.  This significantly extends
                 * the thrash point for a heavily loaded machine.
                 */
                vm_page_flag_set(m, PG_WINATCFLS);
                vm_page_and_queue_spin_lock(m);
                if (m->queue - m->pc == PQ_INACTIVE) {
                        TAILQ_REMOVE(
                                &vm_page_queues[m->queue].pl, m, pageq);
                        TAILQ_INSERT_TAIL(
                                &vm_page_queues[m->queue].pl, m, pageq);
                }
                vm_page_and_queue_spin_unlock(m);
                vm_page_wakeup(m);
        } else if (*max_launderp > 0) {
                /*
                 * We always want to try to flush some dirty pages if
                 * we encounter them, to keep the system stable.
                 * Normally this number is small, but under extreme
                 * pressure where there are insufficient clean pages
                 * on the inactive queue, we may have to go all out.
                 */
                int swap_pageouts_ok;
                struct vnode *vp = NULL;

                swap_pageouts_ok = 0;
                object = m->object;
                if (object &&
                    (object->type != OBJT_SWAP) &&
                    (object->type != OBJT_DEFAULT)) {
                        swap_pageouts_ok = 1;
                } else {
                        swap_pageouts_ok = !(defer_swap_pageouts ||
                                             disable_swap_pageouts);
                        swap_pageouts_ok |= (!disable_swap_pageouts &&
                                             defer_swap_pageouts &&
                                             vm_page_count_min(0));
                }

                /*
                 * We don't bother paging objects that are "dead".
                 * Those objects are in a "rundown" state.
                 */
                if (!swap_pageouts_ok ||
                    (object == NULL) ||
                    (object->flags & OBJ_DEAD)) {
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_INACTIVE) {
                                TAILQ_REMOVE(
                                        &vm_page_queues[m->queue].pl,
                                        m, pageq);
                                TAILQ_INSERT_TAIL(
                                        &vm_page_queues[m->queue].pl,
                                        m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                        vm_page_wakeup(m);
                        return 0;
                }

                /*
                 * (m) is still busied.
                 *
                 * The object is already known NOT to be dead.  It
                 * is possible for the vget() to block the whole
                 * pageout daemon, but the new low-memory handling
                 * code should prevent it.
                 *
                 * The previous code skipped locked vnodes and, worse,
                 * reordered pages in the queue.  This results in
                 * completely non-deterministic operation because,
                 * quite often, a vm_fault has initiated an I/O and
                 * is holding a locked vnode at just the point where
                 * the pageout daemon is woken up.
                 *
                 * We can't wait forever for the vnode lock, we might
                 * deadlock due to a vn_read() getting stuck in
                 * vm_wait while holding this vnode.  We skip the
                 * vnode if we can't get it in a reasonable amount
                 * of time.
                 *
                 * vpfailed is used to (try to) avoid the case where
                 * a large number of pages are associated with a
                 * locked vnode, which could cause the pageout daemon
                 * to stall for an excessive amount of time.
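                 *
                 * Concretely (added commentary): if this vnode is the one
                 * that failed last time (*vpfailedp), vget() is attempted
                 * with LK_NOWAIT and we give up immediately on contention;
                 * otherwise LK_TIMELOCK bounds how long the pageout daemon
                 * can be held up by a single locked vnode.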
                 */
                if (object->type == OBJT_VNODE) {
                        int flags;

                        vp = object->handle;
                        flags = LK_EXCLUSIVE;
                        if (vp == *vpfailedp)
                                flags |= LK_NOWAIT;
                        else
                                flags |= LK_TIMELOCK;
                        vm_page_hold(m);
                        vm_page_wakeup(m);

                        /*
                         * We have unbusied (m) temporarily so we can
                         * acquire the vp lock without deadlocking.
                         * (m) is held to prevent destruction.
                         */
                        if (vget(vp, flags) != 0) {
                                *vpfailedp = vp;
                                ++pageout_lock_miss;
                                if (object->flags & OBJ_MIGHTBEDIRTY)
                                            ++*vnodes_skippedp;
                                vm_page_unhold(m);
                                return 0;
                        }

                        /*
                         * The page might have been moved to another
                         * queue during potential blocking in vget()
                         * above.  The page might have been freed and
                         * reused for another vnode.  The object might
                         * have been reused for another vnode.
                         */
                        if (m->queue - m->pc != PQ_INACTIVE ||
                            m->object != object ||
                            object->handle != vp) {
                                if (object->flags & OBJ_MIGHTBEDIRTY)
                                        ++*vnodes_skippedp;
                                vput(vp);
                                vm_page_unhold(m);
                                return 0;
                        }

                        /*
                         * The page may have been busied during the
                         * blocking in vget();  We don't move the
                         * page back onto the end of the queue so that
                         * statistics are more correct if we don't.
                         */
                        if (vm_page_busy_try(m, TRUE)) {
                                vput(vp);
                                vm_page_unhold(m);
                                return 0;
                        }
                        vm_page_unhold(m);

                        /*
                         * If it was wired while we didn't own it.
                         */
                        if (m->wire_count) {
                                vm_page_unqueue_nowakeup(m);
                                vput(vp);
                                vm_page_wakeup(m);
                                return 0;
                        }

                        /*
                         * (m) is busied again
                         *
                         * We own the busy bit and remove our hold
                         * bit.  If the page is still held it
                         * might be undergoing I/O, so skip it.
                         */
                        if (m->hold_count) {
                                vm_page_and_queue_spin_lock(m);
                                if (m->queue - m->pc == PQ_INACTIVE) {
                                        TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq);
                                        TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq);
                                }
                                vm_page_and_queue_spin_unlock(m);
                                if (object->flags & OBJ_MIGHTBEDIRTY)
                                        ++*vnodes_skippedp;
                                vm_page_wakeup(m);
                                vput(vp);
                                return 0;
                        }
                        /* (m) is left busied as we fall through */
                }

                /*
                 * page is busy and not held here.
                 *
                 * If a page is dirty, then it is either being washed
                 * (but not yet cleaned) or it is still in the
                 * laundry.  If it is still in the laundry, then we
                 * start the cleaning operation.
                 *
                 * decrement inactive_shortage on success to account
                 * for the (future) cleaned page.  Otherwise we
                 * could wind up laundering or cleaning too many
                 * pages.
                 *
                 * NOTE: Cleaning the page here does not cause
                 *       force_deficit to be adjusted, because the
                 *       page is not being freed or moved to the
                 *       cache.
                 */
                count = vm_pageout_clean_helper(m, vmflush_flags);
                *max_launderp -= count;

                /*
                 * Clean ate busy, page no longer accessible
                 */
                if (vp != NULL)
                        vput(vp);
        } else {
                vm_page_wakeup(m);
        }
        return count;
}

/*
 * Scan active queue
 *
 * WARNING! Can be called from two pagedaemon threads simultaneously.
 */
static int
vm_pageout_scan_active(int pass, int q,
                       long avail_shortage, long inactive_shortage,
                       long *recycle_countp)
{
        struct vm_page marker;
        vm_page_t m;
        int actcount;
        long delta = 0;
        long maxscan;
        int isep;

        isep = (curthread == emergpager);

        /*
         * We want to move pages from the active queue to the inactive
         * queue to get the inactive queue to the inactive target.  If
         * we still have a page shortage from above we try to directly free
         * clean pages instead of moving them.
         *
         * If we do still have a shortage we keep track of the number of
         * pages we free or cache (recycle_count) as a measure of thrashing
         * between the active and inactive queues.
         *
         * If we were able to completely satisfy the free+cache targets
         * from the inactive pool we limit the number of pages we move
         * from the active pool to the inactive pool to 2x the pages we
         * had removed from the inactive pool (with a minimum of 1/5 the
         * inactive target).  If we were not able to completely satisfy
         * the free+cache targets we go for the whole target aggressively.
         *
         * NOTE: Both variables can end up negative.
         * NOTE: We are still in a critical section.
         *
         * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
         *        PAGES.
         */

        bzero(&marker, sizeof(marker));
        marker.flags = PG_FICTITIOUS | PG_MARKER;
        marker.busy_count = PBUSY_LOCKED;
        marker.queue = PQ_ACTIVE + q;
        marker.pc = q;
        marker.wire_count = 1;

        vm_page_queues_spin_lock(PQ_ACTIVE + q);
        TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
        maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt / 10 + 1;

        /*
         * Queue locked at top of loop to avoid stack marker issues.
         */
        while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
               maxscan-- > 0 && (avail_shortage - delta > 0 ||
                                 inactive_shortage > 0))
        {
                KKASSERT(m->queue == PQ_ACTIVE + q);
                TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
                             &marker, pageq);
                TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
                                   &marker, pageq);

                /*
                 * Skip marker pages (atomic against other markers to avoid
                 * infinite hop-over scans).
                 */
                if (m->flags & PG_MARKER)
                        continue;

                /*
                 * Try to busy the page.  Don't mess with pages which are
                 * already busy or reorder them in the queue.
                 */
                if (vm_page_busy_try(m, TRUE))
                        continue;

                /*
                 * Remaining operations run with the page busy and neither
                 * the page nor the queue will be spin-locked.
                 */
                KKASSERT(m->queue == PQ_ACTIVE + q);
                vm_page_queues_spin_unlock(PQ_ACTIVE + q);

#if 0
                /*
                 * Don't deactivate pages that are held, even if we can
                 * busy them.  (XXX why not?)
                 */
                if (m->hold_count) {
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_ACTIVE) {
                                TAILQ_REMOVE(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                                TAILQ_INSERT_TAIL(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                        vm_page_wakeup(m);
                        goto next;
                }
#endif
                /*
                 * We can just remove wired pages from the queue
                 */
                if (m->wire_count) {
                        vm_page_unqueue_nowakeup(m);
                        vm_page_wakeup(m);
                        goto next;
                }

                /*
                 * The emergency pager ignores vnode-backed pages as these
                 * are the pages that probably bricked the main pager.
                 */
                if (isep && m->object && m->object->type == OBJT_VNODE) {
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_ACTIVE) {
                                TAILQ_REMOVE(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                                TAILQ_INSERT_TAIL(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                        vm_page_wakeup(m);
                        goto next;
                }

                /*
                 * The count for pagedaemon pages is done after checking the
                 * page for eligibility...
                 */
                mycpu->gd_cnt.v_pdpages++;

                /*
                 * Check to see "how much" the page has been used and clear
                 * the tracking access bits.  If the object has no references
                 * don't bother paying the expense.
                 */
                actcount = 0;
                if (m->object && m->object->ref_count != 0) {
                        if (m->flags & PG_REFERENCED)
                                ++actcount;
                        actcount += pmap_ts_referenced(m);
                        if (actcount) {
                                m->act_count += ACT_ADVANCE + actcount;
                                if (m->act_count > ACT_MAX)
                                        m->act_count = ACT_MAX;
                        }
                }
                vm_page_flag_clear(m, PG_REFERENCED);

                /*
                 * actcount is only valid if the object ref_count is non-zero.
                 * If the page does not have an object, actcount will be zero.
                 */
                if (actcount && m->object->ref_count != 0) {
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_ACTIVE) {
                                TAILQ_REMOVE(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                                TAILQ_INSERT_TAIL(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                        vm_page_wakeup(m);
                } else {
                        switch(m->object->type) {
                        case OBJT_DEFAULT:
                        case OBJT_SWAP:
                                m->act_count -= min(m->act_count,
                                                    vm_anonmem_decline);
                                break;
                        default:
                                m->act_count -= min(m->act_count,
                                                    vm_filemem_decline);
                                break;
                        }
                        if (vm_pageout_algorithm ||
                            (m->object == NULL) ||
                            (m->object && (m->object->ref_count == 0)) ||
                            m->act_count < pass + 1
                        ) {
                                /*
                                 * Deactivate the page.  If we had a
                                 * shortage from our inactive scan try to
                                 * free (cache) the page instead.
                                 *
                                 * Don't just blindly cache the page if
                                 * we do not have a shortage from the
                                 * inactive scan, that could lead to
                                 * gigabytes being moved.
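                                 *
                                 * In short (added commentary): when a page
                                 * shortage remains, a clean page is cached
                                 * outright and a dirty one is deactivated;
                                 * without a shortage the page is merely
                                 * deactivated.  recycle_count tallies these
                                 * reclaims as a measure of active/inactive
                                 * thrashing.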
                                 */
                                --inactive_shortage;
                                if (avail_shortage - delta > 0 ||
                                    (m->object && (m->object->ref_count == 0)))
                                {
                                        if (avail_shortage - delta > 0)
                                                ++*recycle_countp;
                                        vm_page_protect(m, VM_PROT_NONE);
                                        if (m->dirty == 0 &&
                                            (m->flags & PG_NEED_COMMIT) == 0 &&
                                            avail_shortage - delta > 0) {
                                                vm_page_cache(m);
                                        } else {
                                                vm_page_deactivate(m);
                                                vm_page_wakeup(m);
                                        }
                                } else {
                                        vm_page_deactivate(m);
                                        vm_page_wakeup(m);
                                }
                                ++delta;
                        } else {
                                vm_page_and_queue_spin_lock(m);
                                if (m->queue - m->pc == PQ_ACTIVE) {
                                        TAILQ_REMOVE(
                                            &vm_page_queues[PQ_ACTIVE + q].pl,
                                            m, pageq);
                                        TAILQ_INSERT_TAIL(
                                            &vm_page_queues[PQ_ACTIVE + q].pl,
                                            m, pageq);
                                }
                                vm_page_and_queue_spin_unlock(m);
                                vm_page_wakeup(m);
                        }
                }
next:
                lwkt_yield();
                vm_page_queues_spin_lock(PQ_ACTIVE + q);
        }

        /*
         * Clean out our local marker.
         *
         * Page queue still spin-locked.
         */
        TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
        vm_page_queues_spin_unlock(PQ_ACTIVE + q);

        return (delta);
}

/*
 * The number of actually free pages can drop down to v_free_reserved,
 * we try to build the free count back above v_free_min.  Note that
 * vm_paging_needed() also returns TRUE if v_free_count is not at
 * least v_free_min so that is the minimum we must build the free
 * count to.
 *
 * We use a slightly higher target to improve hysteresis,
 * ((v_free_target + v_free_min) / 2).  Since v_free_target
 * is usually the same as v_cache_min this maintains about
 * half the pages in the free queue as are in the cache queue,
 * providing pretty good pipelining for pageout operation.
 *
 * The system operator can manipulate vm.v_cache_min and
 * vm.v_free_target to tune the pageout daemon.  Be sure
 * to keep vm.v_free_min < vm.v_free_target.
 *
 * Note that the original paging target is to get at least
 * (free_min + cache_min) into (free + cache).  The slightly
 * higher target will shift additional pages from cache to free
 * without affecting the original paging target in order to
 * maintain better hysteresis and not have the free count always
 * be dead-on v_free_min.
 *
 * NOTE: we are still in a critical section.
 *
 * Pages moved from PQ_CACHE to totally free are not counted in the
 * pages_freed counter.
 *
 * WARNING! Can be called from two pagedaemon threads simultaneously.
 */
static void
vm_pageout_scan_cache(long avail_shortage, int pass,
                      long vnodes_skipped, long recycle_count)
{
        static int lastkillticks;
        struct vm_pageout_scan_info info;
        vm_page_t m;
        int isep;

        isep = (curthread == emergpager);

        while (vmstats.v_free_count <
               (vmstats.v_free_min + vmstats.v_free_target) / 2) {
                /*
                 * This steals some code from vm/vm_page.c
                 *
                 * Create two rovers and adjust the code to reduce
                 * chances of them winding up at the same index (which
                 * can cause a lot of contention).
                 */
                static int cache_rover[2] = { 0, PQ_L2_MASK / 2 };

                if (((cache_rover[0] ^ cache_rover[1]) & PQ_L2_MASK) == 0)
                        goto next_rover;

                m = vm_page_list_find(PQ_CACHE, cache_rover[isep] & PQ_L2_MASK);
                if (m == NULL)
                        break;

                /*
                 * If the busy attempt fails we can still deactivate the page.
                 */
                /* page is returned removed from its queue and spinlocked */
                if (vm_page_busy_try(m, TRUE)) {
                        vm_page_deactivate_locked(m);
                        vm_page_spin_unlock(m);
                        continue;
                }
                vm_page_spin_unlock(m);
                pagedaemon_wakeup();
                lwkt_yield();

                /*
                 * Remaining operations run with the page busy and neither
                 * the page nor the queue will be spin-locked.
                 */
                if ((m->flags & (PG_UNQUEUED | PG_NEED_COMMIT)) ||
                    m->hold_count ||
                    m->wire_count) {
                        vm_page_deactivate(m);
                        vm_page_wakeup(m);
                        continue;
                }
                pmap_mapped_sync(m);
                KKASSERT((m->flags & PG_MAPPED) == 0);
                KKASSERT(m->dirty == 0);
                vm_pageout_page_free(m);
                mycpu->gd_cnt.v_dfree++;
next_rover:
                if (isep)
                        cache_rover[1] -= PQ_PRIME2;
                else
                        cache_rover[0] += PQ_PRIME2;
        }

#if !defined(NO_SWAPPING)
        /*
         * Idle process swapout -- run once per second.
         */
        if (vm_swap_idle_enabled) {
                static time_t lsec;
                if (time_uptime != lsec) {
                        atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_IDLE);
                        vm_req_vmdaemon();
                        lsec = time_uptime;
                }
        }
#endif

        /*
         * If we didn't get enough free pages, and we have skipped a vnode
         * in a writeable object, wakeup the sync daemon.  And kick swapout
         * if we did not get enough free pages.
         */
        if (vm_paging_target() > 0) {
                if (vnodes_skipped && vm_page_count_min(0))
                        speedup_syncer(NULL);
#if !defined(NO_SWAPPING)
                if (vm_swap_enabled && vm_page_count_target()) {
                        atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_NORMAL);
                        vm_req_vmdaemon();
                }
#endif
        }

        /*
         * Handle catastrophic conditions.  Under good conditions we should
         * be at the target, well beyond our minimum.  If we could not even
         * reach our minimum the system is under heavy stress.  But just being
         * under heavy stress does not trigger process killing.
         *
         * We consider ourselves to have run out of memory if the swap pager
         * is full and avail_shortage is still positive.  The secondary check
         * ensures that we do not kill processes if the instantaneous
         * availability is good, even if the pageout daemon pass says it
         * couldn't get to the target.
         *
         * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT HANDLE SWAP FULL
         *        SITUATIONS.
         */
        if (swap_pager_almost_full &&
            pass > 0 &&
            isep == 0 &&
            (vm_page_count_min(recycle_count) || avail_shortage > 0)) {
                kprintf("Warning: system low on memory+swap "
                        "shortage %ld for %d ticks!\n",
                        avail_shortage, ticks - swap_fail_ticks);
                if (bootverbose)
                        kprintf("Metrics: spaf=%d spf=%d pass=%d "
                                "avail=%ld target=%ld last=%u\n",
                                swap_pager_almost_full,
                                swap_pager_full,
                                pass,
                                avail_shortage,
                                vm_paging_target(),
                                (unsigned int)(ticks - lastkillticks));
        }
        if (swap_pager_full &&
            pass > 1 &&
            isep == 0 &&
            avail_shortage > 0 &&
            vm_paging_target() > 0 &&
            (unsigned int)(ticks - lastkillticks) >= hz) {
                /*
                 * Kill something, maximum rate once per second to give
                 * the process time to free up sufficient memory.
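                 *
                 * (Added note) The rate limit falls out of the test above:
                 * (ticks - lastkillticks) >= hz holds at most once per
                 * second since ticks advances hz times per second, so at
                 * most one process is killed per second.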
                 */
                lastkillticks = ticks;
                info.bigproc = NULL;
                info.bigsize = 0;
                allproc_scan(vm_pageout_scan_callback, &info, 0);
                if (info.bigproc != NULL) {
                        kprintf("Try to kill process %d %s\n",
                                info.bigproc->p_pid, info.bigproc->p_comm);
                        info.bigproc->p_nice = PRIO_MIN;
                        info.bigproc->p_usched->resetpriority(
                                FIRST_LWP_IN_PROC(info.bigproc));
                        atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL);
                        killproc(info.bigproc, "out of swap space");
                        wakeup(&vmstats.v_free_count);
                        PRELE(info.bigproc);
                }
        }
}

static int
vm_pageout_scan_callback(struct proc *p, void *data)
{
        struct vm_pageout_scan_info *info = data;
        vm_offset_t size;

        /*
         * Never kill system processes or init.  If we have configured swap
         * then try to avoid killing low-numbered pids.
         */
        if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) ||
            ((p->p_pid < 48) && (vm_swap_size != 0))) {
                return (0);
        }

        lwkt_gettoken(&p->p_token);

        /*
         * if the process is in a non-running type state,
         * don't touch it.
         */
        if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
                lwkt_reltoken(&p->p_token);
                return (0);
        }

        /*
         * Get the approximate process size.  Note that anonymous pages
         * with backing swap will be counted twice, but there should not
         * be too many such pages due to the stress the VM system is
         * under at this point.
         */
        size = vmspace_anonymous_count(p->p_vmspace) +
               vmspace_swap_count(p->p_vmspace);

        /*
         * If this process is bigger than the biggest one so far,
         * remember it.
         */
        if (info->bigsize < size) {
                if (info->bigproc)
                        PRELE(info->bigproc);
                PHOLD(p);
                info->bigproc = p;
                info->bigsize = size;
        }
        lwkt_reltoken(&p->p_token);
        lwkt_yield();

        return(0);
}

/*
 * This old guy slowly walks PQ_HOLD looking for pages which need to be
 * moved back to PQ_FREE.  It is possible for pages to accumulate here
 * when vm_page_free() races against vm_page_unhold(), resulting in a
 * page being left on a PQ_HOLD queue with hold_count == 0.
 *
 * It is easier to handle this edge condition here, in non-critical code,
 * rather than enforce a spin-lock for every 1->0 transition in
 * vm_page_unhold().
 *
 * NOTE: TAILQ_FOREACH becomes invalid the instant we unlock the queue.
 */
static void
vm_pageout_scan_hold(int q)
{
        vm_page_t m;

        vm_page_queues_spin_lock(PQ_HOLD + q);
        TAILQ_FOREACH(m, &vm_page_queues[PQ_HOLD + q].pl, pageq) {
                if (m->flags & PG_MARKER)
                        continue;

                /*
                 * Process one page and return
                 */
                if (m->hold_count)
                        break;
                kprintf("DEBUG: pageout HOLD->FREE %p\n", m);
                vm_page_hold(m);
                vm_page_queues_spin_unlock(PQ_HOLD + q);
                vm_page_unhold(m);      /* reprocess */
                return;
        }
        vm_page_queues_spin_unlock(PQ_HOLD + q);
}

/*
 * This routine tries to maintain the pseudo LRU active queue,
 * so that during long periods of time when there is no paging,
 * some statistics accumulation still occurs.  This code
 * helps the situation where paging just starts to occur.
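 *
 * Illustrative numbers (added commentary, hypothetical values): if
 * vm_pageout_stats_max were 1000, this queue held 20000 pages and the
 * system had 100000 pages total, the partial scan below would be
 * clamped to roughly 1000 * 20000 / 100000 + 1 = 201 pages; the full
 * scan (every vm_pageout_full_stats_interval) is not clamped.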
 */
static void
vm_pageout_page_stats(int q)
{
	static int fullintervalcount = 0;
	struct vm_page marker;
	vm_page_t m;
	long pcount, tpcount;		/* Number of pages to check */
	long page_shortage;

	page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max +
			 vmstats.v_free_min) -
			(vmstats.v_free_count + vmstats.v_inactive_count +
			 vmstats.v_cache_count);

	if (page_shortage <= 0)
		return;

	pcount = vm_page_queues[PQ_ACTIVE + q].lcnt;
	fullintervalcount += vm_pageout_stats_interval;
	if (fullintervalcount < vm_pageout_full_stats_interval) {
		tpcount = (vm_pageout_stats_max * pcount) /
			  vmstats.v_page_count + 1;
		if (pcount > tpcount)
			pcount = tpcount;
	} else {
		fullintervalcount = 0;
	}

	bzero(&marker, sizeof(marker));
	marker.flags = PG_FICTITIOUS | PG_MARKER;
	marker.busy_count = PBUSY_LOCKED;
	marker.queue = PQ_ACTIVE + q;
	marker.pc = q;
	marker.wire_count = 1;

	vm_page_queues_spin_lock(PQ_ACTIVE + q);
	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);

	/*
	 * Queue locked at top of loop to avoid stack marker issues.
	 */
	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
	       pcount-- > 0)
	{
		int actcount;

		KKASSERT(m->queue == PQ_ACTIVE + q);
		TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
				   &marker, pageq);

		/*
		 * Skip marker pages (atomic against other markers to avoid
		 * infinite hop-over scans).
		 */
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Ignore pages we can't busy
		 */
		if (vm_page_busy_try(m, TRUE))
			continue;

		/*
		 * Remaining operations run with the page busy and neither
		 * the page nor the queue will be spin-locked.
		 */
		KKASSERT(m->queue == PQ_ACTIVE + q);
		vm_page_queues_spin_unlock(PQ_ACTIVE + q);

		/*
		 * We can just remove wired pages from the queue
		 */
		if (m->wire_count) {
			vm_page_unqueue_nowakeup(m);
			vm_page_wakeup(m);
			goto next;
		}

		/*
		 * We now have a safely busied page; the page and queue
		 * spinlocks have been released.
		 *
		 * Ignore held and wired pages
		 */
		if (m->hold_count || m->wire_count) {
			vm_page_wakeup(m);
			goto next;
		}

		/*
		 * Calculate activity
		 */
		actcount = 0;
		if (m->flags & PG_REFERENCED) {
			vm_page_flag_clear(m, PG_REFERENCED);
			actcount += 1;
		}
		actcount += pmap_ts_referenced(m);

		/*
		 * Update act_count and move page to end of queue.
		 */
		if (actcount) {
			m->act_count += ACT_ADVANCE + actcount;
			if (m->act_count > ACT_MAX)
				m->act_count = ACT_MAX;
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
			goto next;
		}
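
		/*
		 * No references were found this pass.  Either deactivate
		 * the page outright (act_count already zero) or age its
		 * act_count towards zero and requeue it at the tail.
		 */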
		if (m->act_count == 0) {
			/*
			 * We turn off page access, so that we have
			 * more accurate RSS stats.  We don't do this
			 * in the normal page deactivation when the
			 * system is loaded VM wise, because the
			 * cost of the large number of page protect
			 * operations would be higher than the value
			 * of doing the operation.
			 *
			 * We use the marker to save our place so
			 * we can release the spin lock.  Both (m)
			 * and (next) will be invalid.
			 */
			vm_page_protect(m, VM_PROT_NONE);
			vm_page_deactivate(m);
		} else {
			m->act_count -= min(m->act_count, ACT_DECLINE);
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
		}
		vm_page_wakeup(m);
next:
		vm_page_queues_spin_lock(PQ_ACTIVE + q);
	}

	/*
	 * Remove our local marker
	 *
	 * Page queue still spin-locked.
	 */
	TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
	vm_page_queues_spin_unlock(PQ_ACTIVE + q);
}

static void
vm_pageout_free_page_calc(vm_size_t count)
{
	/*
	 * v_free_min		normal allocations
	 * v_free_reserved	system allocations
	 * v_pageout_free_min	allocations by pageout daemon
	 * v_interrupt_free_min	low level allocations (e.g. swap structures)
	 *
	 * v_free_min is used to generate several other baselines, and they
	 * can get pretty silly on systems with a lot of memory.
	 */
	vmstats.v_free_min = 64 + vmstats.v_page_count / 200;
	vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
	vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
	vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
	vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;
}

/*
 * vm_pageout is the high level pageout daemon.  TWO kernel threads run
 * this daemon, the primary pageout daemon and the emergency pageout daemon.
 *
 * The emergency pageout daemon takes over when the primary pageout daemon
 * deadlocks.  The emergency pageout daemon ONLY pages out to swap, thus
 * avoiding the many low-memory deadlocks which can occur when paging out
 * to VFS's.
 */
static void
vm_pageout_thread(void)
{
	int pass;
	int q;
	int q1iterator = 0;
	int q2iterator = 0;
	int q3iterator = 0;
	int isep;

	curthread->td_flags |= TDF_SYSTHREAD;

	/*
	 * We only need to setup once.
	 */
	isep = 0;
	if (curthread == emergpager) {
		isep = 1;
		goto skip_setup;
	}

	/*
	 * Initialize some paging parameters.
	 */
	vm_pageout_free_page_calc(vmstats.v_page_count);

	/*
	 * v_free_target and v_cache_min control pageout hysteresis.  Note
	 * that these are more a measure of the VM cache queue hysteresis
	 * than the VM free queue.  Specifically, v_free_target is the
	 * high water mark (free+cache pages).
	 *
	 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
	 * low water mark, while v_free_min is the stop.  v_cache_min must
	 * be big enough to handle memory needs while the pageout daemon
	 * is signalled and runs to free more pages.
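	 *
	 * As a rough example (hypothetical machine): with about 1M 4KiB
	 * pages (~4GB of RAM), vm_pageout_free_page_calc() above yields a
	 * v_free_min of roughly 5300 pages (~21MB), a v_free_reserved of
	 * roughly 2660 pages, and therefore a v_free_target of roughly
	 * 23900 pages (~93MB).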
	 */
	vmstats.v_free_target = 4 * vmstats.v_free_min +
				vmstats.v_free_reserved;

	/*
	 * NOTE: With the new buffer cache b_act_count we want the default
	 *	 inactive target to be a percentage of available memory.
	 *
	 *	 The inactive target essentially determines the minimum
	 *	 number of 'temporary' pages capable of caching one-time-use
	 *	 files when the VM system is otherwise full of pages
	 *	 belonging to multi-time-use files or active program data.
	 *
	 * NOTE: The inactive target is aggressively pursued only if the
	 *	 inactive queue becomes too small.  If the inactive queue
	 *	 is large enough to satisfy page movement to free+cache
	 *	 then it is repopulated more slowly from the active queue.
	 *	 This allows a general inactive_target default to be set.
	 *
	 *	 There is an issue here for processes which sit mostly idle
	 *	 'overnight', such as sshd, tcsh, and X.  Any movement from
	 *	 the active queue will eventually cause such pages to
	 *	 recycle, causing a lot of paging in the morning.  To reduce
	 *	 the incidence of this, pages cycled out of the buffer cache
	 *	 are moved directly to the inactive queue if they were only
	 *	 used once or twice.
	 *
	 *	 The vfs.vm_cycle_point sysctl can be used to adjust this.
	 *	 Increasing the value (up to 64) increases the number of
	 *	 buffer recyclements which go directly to the inactive queue.
	 */
	if (vmstats.v_free_count > 2048) {
		vmstats.v_cache_min = vmstats.v_free_target;
		vmstats.v_cache_max = 2 * vmstats.v_cache_min;
	} else {
		vmstats.v_cache_min = 0;
		vmstats.v_cache_max = 0;
	}
	vmstats.v_inactive_target = vmstats.v_free_count / 4;

	/* XXX does not really belong here */
	if (vm_page_max_wired == 0)
		vm_page_max_wired = vmstats.v_free_count / 3;

	if (vm_pageout_stats_max == 0)
		vm_pageout_stats_max = vmstats.v_free_target;

	/*
	 * Set interval in seconds for stats scan.
	 */
	if (vm_pageout_stats_interval == 0)
		vm_pageout_stats_interval = 5;
	if (vm_pageout_full_stats_interval == 0)
		vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;

	/*
	 * Set maximum free per pass
	 */
	if (vm_pageout_stats_free_max == 0)
		vm_pageout_stats_free_max = 5;

	swap_pager_swap_init();
	pass = 0;

	atomic_swap_int(&sequence_emerg_pager, 1);
	wakeup(&sequence_emerg_pager);

skip_setup:
	/*
	 * Sequence emergency pager startup
	 */
	if (isep) {
		while (sequence_emerg_pager == 0)
			tsleep(&sequence_emerg_pager, 0, "pstartup", hz);
	}

	/*
	 * The pageout daemon is never done, so loop forever.
	 *
	 * WARNING!  This code is being executed by two kernel threads
	 *	     potentially simultaneously.
	 */
	while (TRUE) {
		int error;
		long avail_shortage;
		long inactive_shortage;
		long vnodes_skipped = 0;
		long recycle_count = 0;
		long tmp;

		/*
		 * Wait for an action request.  If we time out, check to
		 * see if paging is needed (in case the normal wakeup
		 * code raced us).
		 */
		if (isep) {
			/*
			 * The emergency pagedaemon monitors the primary
			 * pagedaemon while vm_pages_needed != 0.
			 *
			 * The emergency pagedaemon only runs if VM paging
			 * is needed and the primary pagedaemon has not
			 * updated vm_pagedaemon_time for more than 2 seconds.
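			 *
			 * In other words the emergency pager polls once a
			 * second (every ten seconds when idle) and only
			 * falls through to a scan when the primary pager
			 * appears to have been stuck for two seconds or
			 * more.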
			 */
			if (vm_pages_needed)
				tsleep(&vm_pagedaemon_time, 0, "psleep", hz);
			else
				tsleep(&vm_pagedaemon_time, 0, "psleep", hz*10);
			if (vm_pages_needed == 0) {
				pass = 0;
				continue;
			}
			if ((int)(ticks - vm_pagedaemon_time) < hz * 2) {
				pass = 0;
				continue;
			}
		} else {
			/*
			 * Primary pagedaemon
			 *
			 * NOTE: We unconditionally clean up PQ_HOLD even
			 *	 when there is no work to do.
			 */
			vm_pageout_scan_hold(q3iterator & PQ_L2_MASK);
			++q3iterator;

			if (vm_pages_needed == 0) {
				error = tsleep(&vm_pages_needed,
					       0, "psleep",
					       vm_pageout_stats_interval * hz);
				if (error &&
				    vm_paging_needed(0) == 0 &&
				    vm_pages_needed == 0) {
					for (q = 0; q < PQ_L2_SIZE; ++q)
						vm_pageout_page_stats(q);
					continue;
				}
				vm_pagedaemon_time = ticks;
				vm_pages_needed = 1;

				/*
				 * Wake the emergency pagedaemon up so it
				 * can monitor us.  It will automatically
				 * go back into a long sleep when
				 * vm_pages_needed returns to 0.
				 */
				wakeup(&vm_pagedaemon_time);
			}
		}

		mycpu->gd_cnt.v_pdwakeups++;

		/*
		 * Scan for INACTIVE->CLEAN/PAGEOUT
		 *
		 * This routine tries to avoid thrashing the system with
		 * unnecessary activity.
		 *
		 * Calculate our target for the number of free+cache pages we
		 * want to get to.  This is higher than the number that causes
		 * allocations to stall (severe) in order to provide
		 * hysteresis, and if we don't make it all the way but get to
		 * the minimum we're happy.  Goose it a bit if there are
		 * multiple requests for memory.
		 *
		 * Don't reduce avail_shortage inside the loop or the
		 * PQAVERAGE() calculation will break.
		 *
		 * NOTE! deficit is differentiated from avail_shortage as
		 *	 REQUIRING at least (deficit) pages to be cleaned,
		 *	 even if the page queues are in good shape.  This
		 *	 is used primarily for handling per-process
		 *	 RLIMIT_RSS and may also see small values when
		 *	 processes block due to low memory.
		 */
		vmstats_rollup();
		if (isep == 0)
			vm_pagedaemon_time = ticks;
		avail_shortage = vm_paging_target() + vm_pageout_deficit;
		vm_pageout_deficit = 0;

		if (avail_shortage > 0) {
			long delta = 0;
			int qq;

			qq = q1iterator;
			for (q = 0; q < PQ_L2_SIZE; ++q) {
				delta += vm_pageout_scan_inactive(
					pass,
					qq & PQ_L2_MASK,
					PQAVERAGE(avail_shortage),
					&vnodes_skipped);
				if (isep)
					--qq;
				else
					++qq;
				if (avail_shortage - delta <= 0)
					break;

				/*
				 * It is possible for avail_shortage to be
				 * very large.  If a large program exits or
				 * frees a ton of memory all at once, we do
				 * not have to continue deactivations.
				 *
				 * (We will still run the active->inactive
				 * target, however).
				 */
				if (!vm_page_count_target() &&
				    !vm_page_count_min(
						vm_page_free_hysteresis)) {
					avail_shortage = 0;
					break;
				}
			}
			avail_shortage -= delta;
			q1iterator = qq;
		}

		/*
		 * Figure out how many active pages we must deactivate.  If
		 * we were able to reach our target with just the inactive
		 * scan above we limit the number of active pages we
		 * deactivate to reduce unnecessary work.
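		 *
		 * If the inactive queue is already at or above its target
		 * the computed shortage comes out zero or negative; it is
		 * only pushed back up by the avail_shortage adjustment
		 * below or, for the emergency pager, by vm_emerg_launder.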
		 */
		vmstats_rollup();
		if (isep == 0)
			vm_pagedaemon_time = ticks;
		inactive_shortage = vmstats.v_inactive_target -
				    vmstats.v_inactive_count;

		/*
		 * If we were unable to free sufficient inactive pages to
		 * satisfy the free/cache queue requirements then simply
		 * reaching the inactive target may not be good enough.
		 * Try to deactivate pages in excess of the target based
		 * on the shortfall.
		 *
		 * However, to prevent thrashing the VM system, do not
		 * deactivate more than an additional 1/10 of the inactive
		 * target's worth of active pages.
		 */
		if (avail_shortage > 0) {
			tmp = avail_shortage * 2;
			if (tmp > vmstats.v_inactive_target / 10)
				tmp = vmstats.v_inactive_target / 10;
			inactive_shortage += tmp;
		}

		/*
		 * Only trigger a pmap cleanup on inactive shortage.
		 */
		if (isep == 0 && inactive_shortage > 0) {
			pmap_collect();
		}

		/*
		 * Scan for ACTIVE->INACTIVE
		 *
		 * Only trigger on inactive shortage.  Triggering on
		 * avail_shortage can starve the active queue with
		 * unnecessary active->inactive transitions and destroy
		 * performance.
		 *
		 * If this is the emergency pager, always try to move
		 * a few pages from active to inactive because the inactive
		 * queue might have enough pages, but not enough anonymous
		 * pages.
		 */
		if (isep && inactive_shortage < vm_emerg_launder)
			inactive_shortage = vm_emerg_launder;

		if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) {
			long delta = 0;
			int qq;

			qq = q2iterator;
			for (q = 0; q < PQ_L2_SIZE; ++q) {
				delta += vm_pageout_scan_active(
						pass,
						qq & PQ_L2_MASK,
						PQAVERAGE(avail_shortage),
						PQAVERAGE(inactive_shortage),
						&recycle_count);
				if (isep)
					--qq;
				else
					++qq;
				if (inactive_shortage - delta <= 0 &&
				    avail_shortage - delta <= 0) {
					break;
				}

				/*
				 * inactive_shortage can be a very large
				 * number.  This is intended to break out
				 * early if our inactive_target has been
				 * reached due to other system activity.
				 */
				if (vmstats.v_inactive_count >
				    vmstats.v_inactive_target) {
					inactive_shortage = 0;
					break;
				}
			}
			inactive_shortage -= delta;
			avail_shortage -= delta;
			q2iterator = qq;
		}

		/*
		 * Scan for CACHE->FREE
		 *
		 * Finally free enough cache pages to meet our free page
		 * requirement and take more drastic measures if we are
		 * still in trouble.
		 */
		vmstats_rollup();
		if (isep == 0)
			vm_pagedaemon_time = ticks;
		vm_pageout_scan_cache(avail_shortage, pass,
				      vnodes_skipped, recycle_count);

		/*
		 * This is a bit sophisticated because we do not necessarily
		 * want to force paging until our targets are reached if we
		 * were able to successfully retire the shortage we calculated.
		 */
		if (avail_shortage > 0) {
			/*
			 * If we did not retire enough pages continue the
			 * pageout operation until we are able to.
			 */
			++pass;

			if (pass < 10 && vm_pages_needed > 1) {
				/*
				 * Normal operation, additional processes
				 * have already kicked us.  Retry immediately
				 * unless swap space is completely full, in
				 * which case delay a bit.
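				 *
				 * (The overall back-off ladder here is:
				 * immediate retry, or hz/5 when swap is
				 * full; a 2-tick nap otherwise for the
				 * first ten passes; hz/10 after that; and
				 * a full second once swap is exhausted.)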
				 */
				if (swap_pager_full) {
					tsleep(&vm_pages_needed, 0, "pdelay",
					       hz / 5);
				} /* else immediate retry */
			} else if (pass < 10) {
				/*
				 * Do a short sleep for the first 10 passes,
				 * allowing the sleep to be woken up by
				 * resetting vm_pages_needed to 1 (NOTE: we
				 * are still actively paging!).
				 */
				if (isep == 0)
					vm_pages_needed = 1;
				tsleep(&vm_pages_needed, 0, "pdelay", 2);
			} else if (swap_pager_full == 0) {
				/*
				 * We've taken too many passes, force a
				 * longer delay.
				 */
				tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
			} else {
				/*
				 * Running out of memory, catastrophic
				 * back-off to one-second intervals.
				 */
				tsleep(&vm_pages_needed, 0, "pdelay", hz);
			}
		} else if (vm_pages_needed) {
			/*
			 * We retired our calculated shortage but we may have
			 * to continue paging if threads drain memory too far
			 * below our target.
			 *
			 * Similar to vm_page_free_wakeup() in vm_page.c.
			 */
			pass = 0;
			if (!vm_paging_needed(0)) {
				/* still more than half-way to our target */
				vm_pages_needed = 0;
				wakeup(&vmstats.v_free_count);
			} else
			if (!vm_page_count_min(vm_page_free_hysteresis)) {
				/*
				 * Continue operations with wakeup
				 * (set variable to avoid overflow)
				 */
				vm_pages_needed = 2;
				wakeup(&vmstats.v_free_count);
			} else {
				/*
				 * No wakeup() needed, continue operations.
				 * (set variable to avoid overflow)
				 */
				vm_pages_needed = 2;
			}
		} else {
			/*
			 * Turn paging back on immediately if we are under
			 * minimum.
			 */
			pass = 0;
		}
	}
}

static struct kproc_desc pg1_kp = {
	"pagedaemon",
	vm_pageout_thread,
	&pagethread
};
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &pg1_kp);

static struct kproc_desc pg2_kp = {
	"emergpager",
	vm_pageout_thread,
	&emergpager
};
SYSINIT(emergpager, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &pg2_kp);

/*
 * Called after allocating a page out of the cache or free queue
 * to possibly wake the pagedaemon up to replenish our supply.
 *
 * We try to generate some hysteresis by waking the pagedaemon up
 * when our free+cache pages go below the free_min+cache_min level.
 * The pagedaemon tries to get the count back up to at least the
 * minimum, and through to the target level if possible.
 *
 * If the pagedaemon is already active bump vm_pages_needed as a hint
 * that there are even more requests pending.
 *
 * SMP races ok?
 * No requirements.
 */
void
pagedaemon_wakeup(void)
{
	if (vm_paging_needed(0) && curthread != pagethread) {
		if (vm_pages_needed <= 1) {
			vm_pages_needed = 1;		/* SMP race ok */
			wakeup(&vm_pages_needed);	/* tickle pageout */
		} else if (vm_page_count_min(0)) {
			++vm_pages_needed;		/* SMP race ok */
			/* a wakeup() would be wasted here */
		}
	}
}

#if !defined(NO_SWAPPING)

/*
 * SMP races ok?
 * No requirements.
 */
static void
vm_req_vmdaemon(void)
{
	static int lastrun = 0;

	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
		wakeup(&vm_daemon_needed);
		lastrun = ticks;
	}
}

static int vm_daemon_callback(struct proc *p, void *data __unused);

/*
 * No requirements.
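 *
 * The vm_daemon thread sleeps until vm_req_vmdaemon() wakes it up, then
 * performs any requested swapouts and walks the process list to enforce
 * RLIMIT_RSS via vm_daemon_callback().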
 */
static void
vm_daemon(void)
{
	int req_swapout;

	while (TRUE) {
		tsleep(&vm_daemon_needed, 0, "psleep", 0);
		req_swapout = atomic_swap_int(&vm_pageout_req_swapout, 0);

		/*
		 * forced swapouts
		 */
		if (req_swapout)
			swapout_procs(vm_pageout_req_swapout);

		/*
		 * scan the processes for exceeding their rlimits or if
		 * process is swapped out -- deactivate pages
		 */
		allproc_scan(vm_daemon_callback, NULL, 0);
	}
}

static int
vm_daemon_callback(struct proc *p, void *data __unused)
{
	struct vmspace *vm;
	vm_pindex_t limit, size;

	/*
	 * if this is a system process or if we have already
	 * looked at this process, skip it.
	 */
	lwkt_gettoken(&p->p_token);

	if (p->p_flags & (P_SYSTEM | P_WEXIT)) {
		lwkt_reltoken(&p->p_token);
		return (0);
	}

	/*
	 * if the process is in a non-running type state,
	 * don't touch it.
	 */
	if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
		lwkt_reltoken(&p->p_token);
		return (0);
	}

	/*
	 * get a limit
	 */
	limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
				p->p_rlimit[RLIMIT_RSS].rlim_max));

	/*
	 * let processes that are swapped out really be
	 * swapped out.  Set the limit to nothing to get as
	 * many pages out to swap as possible.
	 */
	if (p->p_flags & P_SWAPPEDOUT)
		limit = 0;

	vm = p->p_vmspace;
	vmspace_hold(vm);
	size = pmap_resident_tlnw_count(&vm->vm_pmap);
	if (limit >= 0 && size > 4096 &&
	    size - 4096 >= limit && vm_pageout_memuse_mode >= 1) {
		vm_pageout_map_deactivate_pages(&vm->vm_map, limit);
	}
	vmspace_drop(vm);

	lwkt_reltoken(&p->p_token);

	return (0);
}

#endif