1 /* 2 * Copyright (c) 2003-2020 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * Copyright (c) 1991 Regents of the University of California. 35 * All rights reserved. 36 * Copyright (c) 1994 John S. Dyson 37 * All rights reserved. 38 * Copyright (c) 1994 David Greenman 39 * All rights reserved. 40 * 41 * This code is derived from software contributed to Berkeley by 42 * The Mach Operating System project at Carnegie-Mellon University. 43 * 44 * Redistribution and use in source and binary forms, with or without 45 * modification, are permitted provided that the following conditions 46 * are met: 47 * 1. Redistributions of source code must retain the above copyright 48 * notice, this list of conditions and the following disclaimer. 49 * 2. Redistributions in binary form must reproduce the above copyright 50 * notice, this list of conditions and the following disclaimer in the 51 * documentation and/or other materials provided with the distribution. 52 * 3. Neither the name of the University nor the names of its contributors 53 * may be used to endorse or promote products derived from this software 54 * without specific prior written permission. 55 * 56 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 57 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 58 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 59 * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 60 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 61 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 62 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 63 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 64 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 65 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 66 * SUCH DAMAGE. 67 * 68 * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 69 * 70 * 71 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 72 * All rights reserved. 73 * 74 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 75 * 76 * Permission to use, copy, modify and distribute this software and 77 * its documentation is hereby granted, provided that both the copyright 78 * notice and this permission notice appear in all copies of the 79 * software, derivative works or modified versions, and any portions 80 * thereof, and that both notices appear in supporting documentation. 81 * 82 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 83 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 84 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 85 * 86 * Carnegie Mellon requests users of this software to return to 87 * 88 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 89 * School of Computer Science 90 * Carnegie Mellon University 91 * Pittsburgh PA 15213-3890 92 * 93 * any improvements or extensions that they make and grant Carnegie the 94 * rights to redistribute these changes. 95 */ 96 97 /* 98 * The proverbial page-out daemon, rewritten many times over the decades. 
99 */ 100 101 #include "opt_vm.h" 102 #include <sys/param.h> 103 #include <sys/systm.h> 104 #include <sys/kernel.h> 105 #include <sys/proc.h> 106 #include <sys/kthread.h> 107 #include <sys/resourcevar.h> 108 #include <sys/signalvar.h> 109 #include <sys/vnode.h> 110 #include <sys/vmmeter.h> 111 #include <sys/conf.h> 112 #include <sys/sysctl.h> 113 114 #include <vm/vm.h> 115 #include <vm/vm_param.h> 116 #include <sys/lock.h> 117 #include <vm/vm_object.h> 118 #include <vm/vm_page.h> 119 #include <vm/vm_map.h> 120 #include <vm/vm_pageout.h> 121 #include <vm/vm_pager.h> 122 #include <vm/swap_pager.h> 123 #include <vm/vm_extern.h> 124 125 #include <sys/spinlock2.h> 126 #include <vm/vm_page2.h> 127 128 /* 129 * System initialization 130 */ 131 132 /* the kernel process "vm_pageout"*/ 133 static int vm_pageout_page(vm_page_t m, long *max_launderp, 134 long *vnodes_skippedp, struct vnode **vpfailedp, 135 int pass, int vmflush_flags); 136 static int vm_pageout_clean_helper (vm_page_t, int); 137 static void vm_pageout_free_page_calc (vm_size_t count); 138 static void vm_pageout_page_free(vm_page_t m) ; 139 __read_frequently struct thread *emergpager; 140 __read_frequently struct thread *pagethread; 141 static int sequence_emerg_pager; 142 143 #if !defined(NO_SWAPPING) 144 /* the kernel process "vm_daemon"*/ 145 static void vm_daemon (void); 146 static struct thread *vmthread; 147 148 static struct kproc_desc vm_kp = { 149 "vmdaemon", 150 vm_daemon, 151 &vmthread 152 }; 153 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp); 154 #endif 155 156 __read_mostly int vm_pages_needed = 0; /* pageout daemon tsleep event */ 157 __read_mostly int vm_pageout_deficit = 0;/* Estimated number of pages deficit */ 158 __read_mostly int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */ 159 __read_mostly int vm_page_free_hysteresis = 16; 160 __read_mostly static int vm_pagedaemon_time; 161 162 #if !defined(NO_SWAPPING) 163 static int vm_pageout_req_swapout; 164 static int vm_daemon_needed; 165 #endif 166 __read_mostly static int vm_max_launder = 4096; 167 __read_mostly static int vm_emerg_launder = 100; 168 __read_mostly static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0; 169 __read_mostly static int vm_pageout_full_stats_interval = 0; 170 __read_mostly static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0; 171 __read_mostly static int defer_swap_pageouts=0; 172 __read_mostly static int disable_swap_pageouts=0; 173 __read_mostly static u_int vm_anonmem_decline = ACT_DECLINE; 174 __read_mostly static u_int vm_filemem_decline = ACT_DECLINE * 2; 175 __read_mostly static int vm_pageout_debug; 176 177 #if defined(NO_SWAPPING) 178 __read_mostly static int vm_swap_enabled=0; 179 __read_mostly static int vm_swap_idle_enabled=0; 180 #else 181 __read_mostly static int vm_swap_enabled=1; 182 __read_mostly static int vm_swap_idle_enabled=0; 183 #endif 184 185 /* 0-disable, 1-passive, 2-active swp*/ 186 __read_mostly int vm_pageout_memuse_mode=2; 187 __read_mostly int vm_pageout_allow_active=1; 188 189 SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline, 190 CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory"); 191 192 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline, 193 CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache"); 194 195 SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis, 196 CTLFLAG_RW, &vm_page_free_hysteresis, 0, 197 "Free more pages than the minimum required"); 198 199 SYSCTL_INT(_vm, OID_AUTO, max_launder, 200 CTLFLAG_RW, 
	    &vm_max_launder, 0, "Limit dirty flushes in pageout");
SYSCTL_INT(_vm, OID_AUTO, emerg_launder,
	CTLFLAG_RW, &vm_emerg_launder, 0, "Emergency pager minimum");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
	CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");

SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
	CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
	CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
	CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");
SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode,
	CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode");
SYSCTL_INT(_vm, OID_AUTO, pageout_allow_active,
	CTLFLAG_RW, &vm_pageout_allow_active, 0, "allow inactive+active");
SYSCTL_INT(_vm, OID_AUTO, pageout_debug,
	CTLFLAG_RW, &vm_pageout_debug, 0, "debug pageout pages (count)");

#if defined(NO_SWAPPING)
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RD, &vm_swap_enabled, 0, "");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
	CTLFLAG_RD, &vm_swap_idle_enabled, 0, "");
#else
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
#endif

SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");

SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");

static int pageout_lock_miss;
SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");

int vm_page_max_wired;		/* XXX max # of wired pages system-wide */

#if !defined(NO_SWAPPING)
static void vm_req_vmdaemon (void);
#endif
static void vm_pageout_page_stats(int q);

/*
 * Calculate approximately how many pages on each queue to try to
 * clean.  An exact calculation creates an edge condition when the
 * queues are unbalanced so add significant slop.  The queue scans
 * will stop early when targets are reached and will start where they
 * left off on the next pass.
 *
 * We need to be generous here because there are all sorts of loading
 * conditions that can cause edge cases if we try to average over all
 * queues.  In particular, storage subsystems have become so fast that
 * paging activity can become quite frantic.  Eventually we will probably
 * need two paging threads, one for dirty pages and one for clean, to deal
 * with the bandwidth requirements.
 *
 * So what we do is calculate a value that can be satisfied nominally by
 * only having to scan half the queues.
 */
static __inline long
PQAVERAGE(long n)
{
	long avg;

	if (n >= 0) {
		avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1);
	} else {
		avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1);
	}
	return avg;
}
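/*
 * Editorial illustration (added commentary, not from the original author):
 * PQAVERAGE() spreads a global shortage across the per-color queues with
 * slop.  Assuming, purely for the sake of arithmetic, PQ_L2_SIZE were 256,
 * a shortage of n = 1000 pages gives
 *
 *	PQAVERAGE(1000) = (1000 + 255) / 128 + 1 = 10
 *
 * so each queue is asked for ~10 pages and roughly half of the 256 queues
 * (128 * 10 = 1280 >= 1000) nominally suffice to cover the target, which
 * matches the "scan half the queues" intent described above.
 */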
/*
 * vm_pageout_clean_helper:
 *
 * Clean the page and remove it from the laundry.  The page must be busied
 * by the caller and will be disposed of (put away, flushed) by this routine.
 */
static int
vm_pageout_clean_helper(vm_page_t m, int vmflush_flags)
{
	vm_object_t object;
	vm_page_t mc[BLIST_MAX_ALLOC];
	int error;
	int ib, is, page_base;
	vm_pindex_t pindex = m->pindex;

	object = m->object;

	/*
	 * Don't mess with the page if it's held or special.  Theoretically
	 * we can pageout held pages but there is no real need to press our
	 * luck, so don't.
	 */
	if (m->hold_count != 0 || (m->flags & PG_UNQUEUED)) {
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * Place page in cluster.  Align cluster for optimal swap space
	 * allocation (whether it is swap or not).  This is typically ~16-32
	 * pages, which also tends to align the cluster to multiples of the
	 * filesystem block size if backed by a filesystem.
	 */
	page_base = pindex % BLIST_MAX_ALLOC;
	mc[page_base] = m;
	ib = page_base - 1;
	is = page_base + 1;

	/*
	 * Scan object for clusterable pages.
	 *
	 * We can cluster ONLY if: ->> the page is NOT
	 * clean, wired, busy, held, or mapped into a
	 * buffer, and one of the following:
	 * 1) The page is inactive, or a seldom used
	 *    active page.
	 * -or-
	 * 2) we force the issue.
	 *
	 * During heavy mmap/modification loads the pageout
	 * daemon can really fragment the underlying file
	 * due to flushing pages out of order and not trying
	 * to align the clusters (which leaves sporadic
	 * out-of-order holes).  To solve this problem we do
	 * the reverse scan first and attempt to align our
	 * cluster, then do a forward scan if room remains.
	 */
	vm_object_hold(object);

	while (ib >= 0) {
		vm_page_t p;

		p = vm_page_lookup_busy_try(object, pindex - page_base + ib,
					    TRUE, &error);
		if (error || p == NULL)
			break;
		if ((p->queue - p->pc) == PQ_CACHE ||
		    (p->flags & PG_UNQUEUED)) {
			vm_page_wakeup(p);
			break;
		}
		vm_page_test_dirty(p);
		if (((p->dirty & p->valid) == 0 &&
		     (p->flags & PG_NEED_COMMIT) == 0) ||
		    p->wire_count != 0 ||	/* may be held by buf cache */
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_wakeup(p);
			break;
		}
		if (p->queue - p->pc != PQ_INACTIVE) {
			if (p->queue - p->pc != PQ_ACTIVE ||
			    (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) {
				vm_page_wakeup(p);
				break;
			}
		}

		/*
		 * Try to maintain page groupings in the cluster.
		 */
		if (m->flags & PG_WINATCFLS)
			vm_page_flag_set(p, PG_WINATCFLS);
		else
			vm_page_flag_clear(p, PG_WINATCFLS);
		p->act_count = m->act_count;

		mc[ib] = p;
		--ib;
	}
	++ib;	/* fixup */

	while (is < BLIST_MAX_ALLOC &&
	       pindex - page_base + is < object->size) {
		vm_page_t p;

		p = vm_page_lookup_busy_try(object, pindex - page_base + is,
					    TRUE, &error);
		if (error || p == NULL)
			break;
		if (((p->queue - p->pc) == PQ_CACHE) ||
		    (p->flags & PG_UNQUEUED)) {
			vm_page_wakeup(p);
			break;
		}
		vm_page_test_dirty(p);
		if (((p->dirty & p->valid) == 0 &&
		     (p->flags & PG_NEED_COMMIT) == 0) ||
		    p->wire_count != 0 ||	/* may be held by buf cache */
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_wakeup(p);
			break;
		}
		if (p->queue - p->pc != PQ_INACTIVE) {
			if (p->queue - p->pc != PQ_ACTIVE ||
			    (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) {
				vm_page_wakeup(p);
				break;
			}
		}

		/*
		 * Try to maintain page groupings in the cluster.
414 */ 415 if (m->flags & PG_WINATCFLS) 416 vm_page_flag_set(p, PG_WINATCFLS); 417 else 418 vm_page_flag_clear(p, PG_WINATCFLS); 419 p->act_count = m->act_count; 420 421 mc[is] = p; 422 ++is; 423 } 424 425 vm_object_drop(object); 426 427 /* 428 * we allow reads during pageouts... 429 */ 430 return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags); 431 } 432 433 /* 434 * vm_pageout_flush() - launder the given pages 435 * 436 * The given pages are laundered. Note that we setup for the start of 437 * I/O ( i.e. busy the page ), mark it read-only, and bump the object 438 * reference count all in here rather then in the parent. If we want 439 * the parent to do more sophisticated things we may have to change 440 * the ordering. 441 * 442 * The pages in the array must be busied by the caller and will be 443 * unbusied by this function. 444 */ 445 int 446 vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags) 447 { 448 vm_object_t object; 449 int pageout_status[count]; 450 int numpagedout = 0; 451 int i; 452 int dodebug; 453 454 if (vm_pageout_debug > 0) { 455 --vm_pageout_debug; 456 dodebug = 1; 457 } else { 458 dodebug = 0; 459 } 460 461 /* 462 * Initiate I/O. Bump the vm_page_t->busy counter. 463 */ 464 for (i = 0; i < count; i++) { 465 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, 466 ("vm_pageout_flush page %p index %d/%d: partially " 467 "invalid page", mc[i], i, count)); 468 vm_page_io_start(mc[i]); 469 } 470 471 /* 472 * We must make the pages read-only. This will also force the 473 * modified bit in the related pmaps to be cleared. The pager 474 * cannot clear the bit for us since the I/O completion code 475 * typically runs from an interrupt. The act of making the page 476 * read-only handles the case for us. 477 * 478 * Then we can unbusy the pages, we still hold a reference by virtue 479 * of our soft-busy. 480 */ 481 if (dodebug) 482 kprintf("pageout(%d): ", count); 483 for (i = 0; i < count; i++) { 484 if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) 485 vm_page_protect(mc[i], VM_PROT_NONE); 486 else 487 vm_page_protect(mc[i], VM_PROT_READ); 488 vm_page_wakeup(mc[i]); 489 if (dodebug) 490 kprintf(" %p", mc[i]); 491 } 492 if (dodebug) 493 kprintf("\n"); 494 495 object = mc[0]->object; 496 vm_object_pip_add(object, count); 497 498 vm_pager_put_pages(object, mc, count, 499 (vmflush_flags | 500 ((object == &kernel_object) ? 501 VM_PAGER_PUT_SYNC : 0)), 502 pageout_status); 503 504 if (dodebug) 505 kprintf("result: "); 506 for (i = 0; i < count; i++) { 507 vm_page_t mt = mc[i]; 508 509 if (dodebug) 510 kprintf(" S%d", pageout_status[i]); 511 512 switch (pageout_status[i]) { 513 case VM_PAGER_OK: 514 numpagedout++; 515 break; 516 case VM_PAGER_PEND: 517 numpagedout++; 518 break; 519 case VM_PAGER_BAD: 520 /* 521 * Page outside of range of object. Right now we 522 * essentially lose the changes by pretending it 523 * worked. 524 */ 525 vm_page_busy_wait(mt, FALSE, "pgbad"); 526 pmap_clear_modify(mt); 527 vm_page_undirty(mt); 528 vm_page_wakeup(mt); 529 break; 530 case VM_PAGER_ERROR: 531 case VM_PAGER_FAIL: 532 /* 533 * A page typically cannot be paged out when we 534 * have run out of swap. We leave the page 535 * marked inactive and will try to page it out 536 * again later. 537 * 538 * Starvation of the active page list is used to 539 * determine when the system is massively memory 540 * starved. 541 */ 542 break; 543 case VM_PAGER_AGAIN: 544 break; 545 } 546 547 /* 548 * If not PENDing this was a synchronous operation and we 549 * clean up after the I/O. 
If it is PENDing the mess is 550 * cleaned up asynchronously. 551 * 552 * Also nominally act on the caller's wishes if the caller 553 * wants to try to really clean (cache or free) the page. 554 * 555 * Also nominally deactivate the page if the system is 556 * memory-stressed. 557 */ 558 if (pageout_status[i] != VM_PAGER_PEND) { 559 vm_page_busy_wait(mt, FALSE, "pgouw"); 560 vm_page_io_finish(mt); 561 if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) { 562 vm_page_try_to_cache(mt); 563 if (dodebug) 564 kprintf("A[pq_cache=%d]", 565 ((mt->queue - mt->pc) == PQ_CACHE)); 566 } else if (vm_page_count_severe()) { 567 vm_page_deactivate(mt); 568 vm_page_wakeup(mt); 569 if (dodebug) 570 kprintf("B"); 571 } else { 572 vm_page_wakeup(mt); 573 if (dodebug) 574 kprintf("C"); 575 } 576 vm_object_pip_wakeup(object); 577 } 578 } 579 if (dodebug) 580 kprintf("(%d paged out)\n", numpagedout); 581 return numpagedout; 582 } 583 584 #if !defined(NO_SWAPPING) 585 586 /* 587 * Callback function, page busied for us. We must dispose of the busy 588 * condition. Any related pmap pages may be held but will not be locked. 589 */ 590 static 591 int 592 vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va, 593 vm_page_t p) 594 { 595 int actcount; 596 int cleanit = 0; 597 598 /* 599 * Basic tests - There should never be a marker, and we can stop 600 * once the RSS is below the required level. 601 */ 602 KKASSERT((p->flags & PG_MARKER) == 0); 603 if (pmap_resident_tlnw_count(info->pmap) <= info->limit) { 604 vm_page_wakeup(p); 605 return(-1); 606 } 607 608 mycpu->gd_cnt.v_pdpages++; 609 610 if (p->wire_count || p->hold_count || (p->flags & PG_UNQUEUED)) { 611 vm_page_wakeup(p); 612 goto done; 613 } 614 615 ++info->actioncount; 616 617 /* 618 * Check if the page has been referened recently. If it has, 619 * activate it and skip. 620 */ 621 actcount = pmap_ts_referenced(p); 622 if (actcount) { 623 vm_page_flag_set(p, PG_REFERENCED); 624 } else if (p->flags & PG_REFERENCED) { 625 actcount = 1; 626 } 627 628 if (actcount) { 629 if (p->queue - p->pc != PQ_ACTIVE) { 630 vm_page_and_queue_spin_lock(p); 631 if (p->queue - p->pc != PQ_ACTIVE) { 632 vm_page_and_queue_spin_unlock(p); 633 vm_page_activate(p); 634 } else { 635 vm_page_and_queue_spin_unlock(p); 636 } 637 } else { 638 p->act_count += actcount; 639 if (p->act_count > ACT_MAX) 640 p->act_count = ACT_MAX; 641 } 642 vm_page_flag_clear(p, PG_REFERENCED); 643 vm_page_wakeup(p); 644 goto done; 645 } 646 647 /* 648 * Remove the page from this particular pmap. Once we do this, our 649 * pmap scans will not see it again (unless it gets faulted in), so 650 * we must actively dispose of or deal with the page. 651 */ 652 pmap_remove_specific(info->pmap, p); 653 654 /* 655 * If the page is not mapped to another process (i.e. as would be 656 * typical if this were a shared page from a library) then deactivate 657 * the page and clean it in two passes only. 658 * 659 * If the page hasn't been referenced since the last check, remove it 660 * from the pmap. If it is no longer mapped, deactivate it 661 * immediately, accelerating the normal decline. 662 * 663 * Once the page has been removed from the pmap the RSS code no 664 * longer tracks it so we have to make sure that it is staged for 665 * potential flush action. 
	 *
	 * XXX
	 */
	if ((p->flags & PG_MAPPED) == 0 ||
	    (pmap_mapped_sync(p) & PG_MAPPED) == 0) {
		if (p->queue - p->pc == PQ_ACTIVE) {
			vm_page_deactivate(p);
		}
		if (p->queue - p->pc == PQ_INACTIVE) {
			cleanit = 1;
		}
	}

	/*
	 * Ok, try to fully clean the page and any nearby pages such that at
	 * least the requested page is freed or moved to the cache queue.
	 *
	 * We usually do this synchronously to allow us to get the page into
	 * the CACHE queue quickly, which will prevent memory exhaustion if
	 * a process with a memoryuse limit is running away.  However, the
	 * sysadmin may desire to set vm.swap_user_async which relaxes this
	 * and improves write performance.
	 */
	if (cleanit) {
		long max_launder = 0x7FFF;
		long vnodes_skipped = 0;
		int vmflush_flags;
		struct vnode *vpfailed = NULL;

		info->offset = va;

		if (vm_pageout_memuse_mode >= 2) {
			vmflush_flags = VM_PAGER_TRY_TO_CACHE |
					VM_PAGER_ALLOW_ACTIVE;
			if (swap_user_async == 0)
				vmflush_flags |= VM_PAGER_PUT_SYNC;
			vm_page_flag_set(p, PG_WINATCFLS);
			info->cleancount +=
				vm_pageout_page(p, &max_launder,
						&vnodes_skipped,
						&vpfailed, 1, vmflush_flags);
		} else {
			vm_page_wakeup(p);
			++info->cleancount;
		}
	} else {
		vm_page_wakeup(p);
	}

	/*
	 * Must be at end to avoid SMP races.
	 */
done:
	lwkt_user_yield();
	return 0;
}

/*
 * Deactivate some number of pages in a map due to set RLIMIT_RSS limits;
 * this is relatively difficult to do well.  We try to keep track of where
 * we left off last time to reduce scan overhead.
 *
 * Called when vm_pageout_memuse_mode is >= 1.
 */
void
vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit)
{
	vm_offset_t pgout_offset;
	struct pmap_pgscan_info info;
	int retries = 3;

	pgout_offset = map->pgout_offset;
again:
#if 0
	kprintf("%016jx ", pgout_offset);
#endif
	if (pgout_offset < VM_MIN_USER_ADDRESS)
		pgout_offset = VM_MIN_USER_ADDRESS;
	if (pgout_offset >= VM_MAX_USER_ADDRESS)
		pgout_offset = 0;
	info.pmap = vm_map_pmap(map);
	info.limit = limit;
	info.beg_addr = pgout_offset;
	info.end_addr = VM_MAX_USER_ADDRESS;
	info.callback = vm_pageout_mdp_callback;
	info.cleancount = 0;
	info.actioncount = 0;
	info.busycount = 0;

	pmap_pgscan(&info);
	pgout_offset = info.offset;
#if 0
	kprintf("%016jx %08lx %08lx\n", pgout_offset,
		info.cleancount, info.actioncount);
#endif

	if (pgout_offset != VM_MAX_USER_ADDRESS &&
	    pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
		goto again;
	} else if (retries &&
		   pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
		--retries;
		goto again;
	}
	map->pgout_offset = pgout_offset;
}
#endif

/*
 * Called when the pageout scan wants to free a page.  We no longer
 * try to cycle the vm_object here with a reference & dealloc, which can
 * cause a non-trivial object collapse in a critical path.
 *
 * It is unclear why we cycled the ref_count in the past, perhaps to try
 * to optimize shadow chain collapses but I don't quite see why it would
 * be necessary.  An OBJ_DEAD object should terminate any and all vm_pages
 * synchronously and not have to be kick-started.
783 */ 784 static void 785 vm_pageout_page_free(vm_page_t m) 786 { 787 vm_page_protect(m, VM_PROT_NONE); 788 vm_page_free(m); 789 } 790 791 /* 792 * vm_pageout_scan does the dirty work for the pageout daemon. 793 */ 794 struct vm_pageout_scan_info { 795 struct proc *bigproc; 796 vm_offset_t bigsize; 797 }; 798 799 static int vm_pageout_scan_callback(struct proc *p, void *data); 800 801 /* 802 * Scan inactive queue 803 * 804 * WARNING! Can be called from two pagedaemon threads simultaneously. 805 */ 806 static int 807 vm_pageout_scan_inactive(int pass, int q, long avail_shortage, 808 long *vnodes_skipped) 809 { 810 vm_page_t m; 811 struct vm_page marker; 812 struct vnode *vpfailed; /* warning, allowed to be stale */ 813 long maxscan; 814 long delta = 0; 815 long max_launder; 816 int isep; 817 int vmflush_flags; 818 819 isep = (curthread == emergpager); 820 821 /* 822 * Start scanning the inactive queue for pages we can move to the 823 * cache or free. The scan will stop when the target is reached or 824 * we have scanned the entire inactive queue. Note that m->act_count 825 * is not used to form decisions for the inactive queue, only for the 826 * active queue. 827 * 828 * max_launder limits the number of dirty pages we flush per scan. 829 * For most systems a smaller value (16 or 32) is more robust under 830 * extreme memory and disk pressure because any unnecessary writes 831 * to disk can result in extreme performance degredation. However, 832 * systems with excessive dirty pages (especially when MAP_NOSYNC is 833 * used) will die horribly with limited laundering. If the pageout 834 * daemon cannot clean enough pages in the first pass, we let it go 835 * all out in succeeding passes. 836 * 837 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED 838 * PAGES. 839 */ 840 if ((max_launder = vm_max_launder) <= 1) 841 max_launder = 1; 842 if (pass) 843 max_launder = 10000; 844 845 /* 846 * Initialize our marker 847 */ 848 bzero(&marker, sizeof(marker)); 849 marker.flags = PG_FICTITIOUS | PG_MARKER; 850 marker.busy_count = PBUSY_LOCKED; 851 marker.queue = PQ_INACTIVE + q; 852 marker.pc = q; 853 marker.wire_count = 1; 854 855 /* 856 * Inactive queue scan. 857 * 858 * We pick off approximately 1/10 of each queue. Each queue is 859 * effectively organized LRU so scanning the entire queue would 860 * improperly pick up pages that might still be in regular use. 861 * 862 * NOTE: The vm_page must be spinlocked before the queue to avoid 863 * deadlocks, so it is easiest to simply iterate the loop 864 * with the queue unlocked at the top. 865 */ 866 vpfailed = NULL; 867 868 vm_page_queues_spin_lock(PQ_INACTIVE + q); 869 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); 870 maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt / 10 + 1; 871 872 /* 873 * Queue locked at top of loop to avoid stack marker issues. 874 */ 875 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 876 maxscan-- > 0 && avail_shortage - delta > 0) 877 { 878 int count; 879 880 KKASSERT(m->queue == PQ_INACTIVE + q); 881 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, 882 &marker, pageq); 883 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m, 884 &marker, pageq); 885 mycpu->gd_cnt.v_pdpages++; 886 887 /* 888 * Skip marker pages (atomic against other markers to avoid 889 * infinite hop-over scans). 890 */ 891 if (m->flags & PG_MARKER) 892 continue; 893 894 /* 895 * Try to busy the page. Don't mess with pages which are 896 * already busy or reorder them in the queue. 
897 */ 898 if (vm_page_busy_try(m, TRUE)) 899 continue; 900 901 /* 902 * Remaining operations run with the page busy and neither 903 * the page or the queue will be spin-locked. 904 */ 905 KKASSERT(m->queue == PQ_INACTIVE + q); 906 vm_page_queues_spin_unlock(PQ_INACTIVE + q); 907 908 /* 909 * The emergency pager runs when the primary pager gets 910 * stuck, which typically means the primary pager deadlocked 911 * on a vnode-backed page. Therefore, the emergency pager 912 * must skip any complex objects. 913 * 914 * We disallow VNODEs unless they are VCHR whos device ops 915 * does not flag D_NOEMERGPGR. 916 */ 917 if (isep && m->object) { 918 struct vnode *vp; 919 920 switch(m->object->type) { 921 case OBJT_DEFAULT: 922 case OBJT_SWAP: 923 /* 924 * Allow anonymous memory and assume that 925 * swap devices are not complex, since its 926 * kinda worthless if we can't swap out dirty 927 * anonymous pages. 928 */ 929 break; 930 case OBJT_VNODE: 931 /* 932 * Allow VCHR device if the D_NOEMERGPGR 933 * flag is not set, deny other vnode types 934 * as being too complex. 935 */ 936 vp = m->object->handle; 937 if (vp && vp->v_type == VCHR && 938 vp->v_rdev && vp->v_rdev->si_ops && 939 (vp->v_rdev->si_ops->head.flags & 940 D_NOEMERGPGR) == 0) { 941 break; 942 } 943 /* Deny - fall through */ 944 default: 945 /* 946 * Deny 947 */ 948 vm_page_wakeup(m); 949 vm_page_queues_spin_lock(PQ_INACTIVE + q); 950 lwkt_yield(); 951 continue; 952 } 953 } 954 955 /* 956 * Try to pageout the page and perhaps other nearby pages. 957 * We want to get the pages into the cache eventually ( 958 * first or second pass). Otherwise the pages can wind up 959 * just cycling in the inactive queue, getting flushed over 960 * and over again. 961 */ 962 if (vm_pageout_memuse_mode >= 2) 963 vm_page_flag_set(m, PG_WINATCFLS); 964 965 vmflush_flags = 0; 966 if (vm_pageout_allow_active) 967 vmflush_flags |= VM_PAGER_ALLOW_ACTIVE; 968 if (m->flags & PG_WINATCFLS) 969 vmflush_flags |= VM_PAGER_TRY_TO_CACHE; 970 count = vm_pageout_page(m, &max_launder, vnodes_skipped, 971 &vpfailed, pass, vmflush_flags); 972 delta += count; 973 974 /* 975 * Systems with a ton of memory can wind up with huge 976 * deactivation counts. Because the inactive scan is 977 * doing a lot of flushing, the combination can result 978 * in excessive paging even in situations where other 979 * unrelated threads free up sufficient VM. 980 * 981 * To deal with this we abort the nominal active->inactive 982 * scan before we hit the inactive target when free+cache 983 * levels have reached a reasonable target. 984 * 985 * When deciding to stop early we need to add some slop to 986 * the test and we need to return full completion to the caller 987 * to prevent the caller from thinking there is something 988 * wrong and issuing a low-memory+swap warning or pkill. 989 * 990 * A deficit forces paging regardless of the state of the 991 * VM page queues (used for RSS enforcement). 992 */ 993 lwkt_yield(); 994 vm_page_queues_spin_lock(PQ_INACTIVE + q); 995 if (vm_paging_target() < -vm_max_launder) { 996 /* 997 * Stopping early, return full completion to caller. 998 */ 999 if (delta < avail_shortage) 1000 delta = avail_shortage; 1001 break; 1002 } 1003 } 1004 1005 /* page queue still spin-locked */ 1006 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); 1007 vm_page_queues_spin_unlock(PQ_INACTIVE + q); 1008 1009 return (delta); 1010 } 1011 1012 /* 1013 * Pageout the specified page, return the total number of pages paged out 1014 * (this routine may cluster). 
1015 * 1016 * The page must be busied and soft-busied by the caller and will be disposed 1017 * of by this function. 1018 */ 1019 static int 1020 vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp, 1021 struct vnode **vpfailedp, int pass, int vmflush_flags) 1022 { 1023 vm_object_t object; 1024 int actcount; 1025 int count = 0; 1026 1027 /* 1028 * Wiring no longer removes a page from its queue. The last unwiring 1029 * will requeue the page. Obviously wired pages cannot be paged out 1030 * so unqueue it and return. 1031 */ 1032 if (m->wire_count) { 1033 vm_page_unqueue_nowakeup(m); 1034 vm_page_wakeup(m); 1035 return 0; 1036 } 1037 1038 /* 1039 * A held page may be undergoing I/O, so skip it. 1040 */ 1041 if (m->hold_count) { 1042 vm_page_and_queue_spin_lock(m); 1043 if (m->queue - m->pc == PQ_INACTIVE) { 1044 TAILQ_REMOVE( 1045 &vm_page_queues[m->queue].pl, m, pageq); 1046 TAILQ_INSERT_TAIL( 1047 &vm_page_queues[m->queue].pl, m, pageq); 1048 } 1049 vm_page_and_queue_spin_unlock(m); 1050 vm_page_wakeup(m); 1051 return 0; 1052 } 1053 1054 if (m->object == NULL || m->object->ref_count == 0) { 1055 /* 1056 * If the object is not being used, we ignore previous 1057 * references. 1058 */ 1059 vm_page_flag_clear(m, PG_REFERENCED); 1060 pmap_clear_reference(m); 1061 /* fall through to end */ 1062 } else if (((m->flags & PG_REFERENCED) == 0) && 1063 (actcount = pmap_ts_referenced(m))) { 1064 /* 1065 * Otherwise, if the page has been referenced while 1066 * in the inactive queue, we bump the "activation 1067 * count" upwards, making it less likely that the 1068 * page will be added back to the inactive queue 1069 * prematurely again. Here we check the page tables 1070 * (or emulated bits, if any), given the upper level 1071 * VM system not knowing anything about existing 1072 * references. 1073 */ 1074 vm_page_activate(m); 1075 m->act_count += (actcount + ACT_ADVANCE); 1076 vm_page_wakeup(m); 1077 return 0; 1078 } 1079 1080 /* 1081 * (m) is still busied. 1082 * 1083 * If the upper level VM system knows about any page 1084 * references, we activate the page. We also set the 1085 * "activation count" higher than normal so that we will less 1086 * likely place pages back onto the inactive queue again. 1087 */ 1088 if ((m->flags & PG_REFERENCED) != 0) { 1089 vm_page_flag_clear(m, PG_REFERENCED); 1090 actcount = pmap_ts_referenced(m); 1091 vm_page_activate(m); 1092 m->act_count += (actcount + ACT_ADVANCE + 1); 1093 vm_page_wakeup(m); 1094 return 0; 1095 } 1096 1097 /* 1098 * If the upper level VM system doesn't know anything about 1099 * the page being dirty, we have to check for it again. As 1100 * far as the VM code knows, any partially dirty pages are 1101 * fully dirty. 1102 * 1103 * Pages marked PG_WRITEABLE may be mapped into the user 1104 * address space of a process running on another cpu. A 1105 * user process (without holding the MP lock) running on 1106 * another cpu may be able to touch the page while we are 1107 * trying to remove it. vm_page_cache() will handle this 1108 * case for us. 1109 */ 1110 if (m->dirty == 0) { 1111 vm_page_test_dirty(m); 1112 } else { 1113 vm_page_dirty(m); 1114 } 1115 1116 if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 1117 /* 1118 * Invalid pages can be easily freed 1119 */ 1120 vm_pageout_page_free(m); 1121 mycpu->gd_cnt.v_dfree++; 1122 ++count; 1123 } else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 1124 /* 1125 * Clean pages can be placed onto the cache queue. 1126 * This effectively frees them. 
		 */
		vm_page_cache(m);
		++count;
	} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
		/*
		 * Dirty pages need to be paged out, but flushing
		 * a page is extremely expensive versus freeing
		 * a clean page.  Rather than artificially limiting
		 * the number of pages we can flush, we instead give
		 * dirty pages extra priority on the inactive queue
		 * by forcing them to be cycled through the queue
		 * twice before being flushed, after which the
		 * (now clean) page will cycle through once more
		 * before being freed.  This significantly extends
		 * the thrash point for a heavily loaded machine.
		 */
		vm_page_flag_set(m, PG_WINATCFLS);
		vm_page_and_queue_spin_lock(m);
		if (m->queue - m->pc == PQ_INACTIVE) {
			TAILQ_REMOVE(
				&vm_page_queues[m->queue].pl, m, pageq);
			TAILQ_INSERT_TAIL(
				&vm_page_queues[m->queue].pl, m, pageq);
		}
		vm_page_and_queue_spin_unlock(m);
		vm_page_wakeup(m);
	} else if (*max_launderp > 0) {
		/*
		 * We always want to try to flush some dirty pages if
		 * we encounter them, to keep the system stable.
		 * Normally this number is small, but under extreme
		 * pressure where there are insufficient clean pages
		 * on the inactive queue, we may have to go all out.
		 */
		int swap_pageouts_ok;
		struct vnode *vp = NULL;

		swap_pageouts_ok = 0;
		object = m->object;
		if (object &&
		    (object->type != OBJT_SWAP) &&
		    (object->type != OBJT_DEFAULT)) {
			swap_pageouts_ok = 1;
		} else {
			swap_pageouts_ok = !(defer_swap_pageouts ||
					     disable_swap_pageouts);
			swap_pageouts_ok |= (!disable_swap_pageouts &&
					     defer_swap_pageouts &&
					     vm_page_count_min(0));
		}

		/*
		 * We don't bother paging objects that are "dead".
		 * Those objects are in a "rundown" state.
		 */
		if (!swap_pageouts_ok ||
		    (object == NULL) ||
		    (object->flags & OBJ_DEAD)) {
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_INACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[m->queue].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[m->queue].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
			return 0;
		}

		/*
		 * (m) is still busied.
		 *
		 * The object is already known NOT to be dead.  It
		 * is possible for the vget() to block the whole
		 * pageout daemon, but the new low-memory handling
		 * code should prevent it.
		 *
		 * The previous code skipped locked vnodes and, worse,
		 * reordered pages in the queue.  This results in
		 * completely non-deterministic operation because,
		 * quite often, a vm_fault has initiated an I/O and
		 * is holding a locked vnode at just the point where
		 * the pageout daemon is woken up.
		 *
		 * We can't wait forever for the vnode lock, we might
		 * deadlock due to a vn_read() getting stuck in
		 * vm_wait while holding this vnode.  We skip the
		 * vnode if we can't get it in a reasonable amount
		 * of time.
		 *
		 * vpfailed is used to (try to) avoid the case where
		 * a large number of pages are associated with a
		 * locked vnode, which could cause the pageout daemon
		 * to stall for an excessive amount of time.
1224 */ 1225 if (object->type == OBJT_VNODE) { 1226 int flags; 1227 1228 vp = object->handle; 1229 flags = LK_EXCLUSIVE; 1230 if (vp == *vpfailedp) 1231 flags |= LK_NOWAIT; 1232 else 1233 flags |= LK_TIMELOCK; 1234 vm_page_hold(m); 1235 vm_page_wakeup(m); 1236 1237 /* 1238 * We have unbusied (m) temporarily so we can 1239 * acquire the vp lock without deadlocking. 1240 * (m) is held to prevent destruction. 1241 */ 1242 if (vget(vp, flags) != 0) { 1243 *vpfailedp = vp; 1244 ++pageout_lock_miss; 1245 if (object->flags & OBJ_MIGHTBEDIRTY) 1246 ++*vnodes_skippedp; 1247 vm_page_unhold(m); 1248 return 0; 1249 } 1250 1251 /* 1252 * The page might have been moved to another 1253 * queue during potential blocking in vget() 1254 * above. The page might have been freed and 1255 * reused for another vnode. The object might 1256 * have been reused for another vnode. 1257 */ 1258 if (m->queue - m->pc != PQ_INACTIVE || 1259 m->object != object || 1260 object->handle != vp) { 1261 if (object->flags & OBJ_MIGHTBEDIRTY) 1262 ++*vnodes_skippedp; 1263 vput(vp); 1264 vm_page_unhold(m); 1265 return 0; 1266 } 1267 1268 /* 1269 * The page may have been busied during the 1270 * blocking in vput(); We don't move the 1271 * page back onto the end of the queue so that 1272 * statistics are more correct if we don't. 1273 */ 1274 if (vm_page_busy_try(m, TRUE)) { 1275 vput(vp); 1276 vm_page_unhold(m); 1277 return 0; 1278 } 1279 vm_page_unhold(m); 1280 1281 /* 1282 * If it was wired while we didn't own it. 1283 */ 1284 if (m->wire_count) { 1285 vm_page_unqueue_nowakeup(m); 1286 vput(vp); 1287 vm_page_wakeup(m); 1288 return 0; 1289 } 1290 1291 /* 1292 * (m) is busied again 1293 * 1294 * We own the busy bit and remove our hold 1295 * bit. If the page is still held it 1296 * might be undergoing I/O, so skip it. 1297 */ 1298 if (m->hold_count) { 1299 vm_page_and_queue_spin_lock(m); 1300 if (m->queue - m->pc == PQ_INACTIVE) { 1301 TAILQ_REMOVE(&vm_page_queues[m->queue].pl, m, pageq); 1302 TAILQ_INSERT_TAIL(&vm_page_queues[m->queue].pl, m, pageq); 1303 } 1304 vm_page_and_queue_spin_unlock(m); 1305 if (object->flags & OBJ_MIGHTBEDIRTY) 1306 ++*vnodes_skippedp; 1307 vm_page_wakeup(m); 1308 vput(vp); 1309 return 0; 1310 } 1311 /* (m) is left busied as we fall through */ 1312 } 1313 1314 /* 1315 * page is busy and not held here. 1316 * 1317 * If a page is dirty, then it is either being washed 1318 * (but not yet cleaned) or it is still in the 1319 * laundry. If it is still in the laundry, then we 1320 * start the cleaning operation. 1321 * 1322 * decrement inactive_shortage on success to account 1323 * for the (future) cleaned page. Otherwise we 1324 * could wind up laundering or cleaning too many 1325 * pages. 1326 * 1327 * NOTE: Cleaning the page here does not cause 1328 * force_deficit to be adjusted, because the 1329 * page is not being freed or moved to the 1330 * cache. 1331 */ 1332 count = vm_pageout_clean_helper(m, vmflush_flags); 1333 *max_launderp -= count; 1334 1335 /* 1336 * Clean ate busy, page no longer accessible 1337 */ 1338 if (vp != NULL) 1339 vput(vp); 1340 } else { 1341 vm_page_wakeup(m); 1342 } 1343 return count; 1344 } 1345 1346 /* 1347 * Scan active queue 1348 * 1349 * WARNING! Can be called from two pagedaemon threads simultaneously. 
1350 */ 1351 static int 1352 vm_pageout_scan_active(int pass, int q, 1353 long avail_shortage, long inactive_shortage, 1354 long *recycle_countp) 1355 { 1356 struct vm_page marker; 1357 vm_page_t m; 1358 int actcount; 1359 long delta = 0; 1360 long maxscan; 1361 int isep; 1362 1363 isep = (curthread == emergpager); 1364 1365 /* 1366 * We want to move pages from the active queue to the inactive 1367 * queue to get the inactive queue to the inactive target. If 1368 * we still have a page shortage from above we try to directly free 1369 * clean pages instead of moving them. 1370 * 1371 * If we do still have a shortage we keep track of the number of 1372 * pages we free or cache (recycle_count) as a measure of thrashing 1373 * between the active and inactive queues. 1374 * 1375 * If we were able to completely satisfy the free+cache targets 1376 * from the inactive pool we limit the number of pages we move 1377 * from the active pool to the inactive pool to 2x the pages we 1378 * had removed from the inactive pool (with a minimum of 1/5 the 1379 * inactive target). If we were not able to completely satisfy 1380 * the free+cache targets we go for the whole target aggressively. 1381 * 1382 * NOTE: Both variables can end up negative. 1383 * NOTE: We are still in a critical section. 1384 * 1385 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED 1386 * PAGES. 1387 */ 1388 1389 bzero(&marker, sizeof(marker)); 1390 marker.flags = PG_FICTITIOUS | PG_MARKER; 1391 marker.busy_count = PBUSY_LOCKED; 1392 marker.queue = PQ_ACTIVE + q; 1393 marker.pc = q; 1394 marker.wire_count = 1; 1395 1396 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1397 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1398 maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt / 10 + 1; 1399 1400 /* 1401 * Queue locked at top of loop to avoid stack marker issues. 1402 */ 1403 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 1404 maxscan-- > 0 && (avail_shortage - delta > 0 || 1405 inactive_shortage > 0)) 1406 { 1407 KKASSERT(m->queue == PQ_ACTIVE + q); 1408 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, 1409 &marker, pageq); 1410 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m, 1411 &marker, pageq); 1412 1413 /* 1414 * Skip marker pages (atomic against other markers to avoid 1415 * infinite hop-over scans). 1416 */ 1417 if (m->flags & PG_MARKER) 1418 continue; 1419 1420 /* 1421 * Try to busy the page. Don't mess with pages which are 1422 * already busy or reorder them in the queue. 1423 */ 1424 if (vm_page_busy_try(m, TRUE)) 1425 continue; 1426 1427 /* 1428 * Remaining operations run with the page busy and neither 1429 * the page or the queue will be spin-locked. 1430 */ 1431 KKASSERT(m->queue == PQ_ACTIVE + q); 1432 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1433 1434 #if 0 1435 /* 1436 * Don't deactivate pages that are held, even if we can 1437 * busy them. (XXX why not?) 
1438 */ 1439 if (m->hold_count) { 1440 vm_page_and_queue_spin_lock(m); 1441 if (m->queue - m->pc == PQ_ACTIVE) { 1442 TAILQ_REMOVE( 1443 &vm_page_queues[PQ_ACTIVE + q].pl, 1444 m, pageq); 1445 TAILQ_INSERT_TAIL( 1446 &vm_page_queues[PQ_ACTIVE + q].pl, 1447 m, pageq); 1448 } 1449 vm_page_and_queue_spin_unlock(m); 1450 vm_page_wakeup(m); 1451 goto next; 1452 } 1453 #endif 1454 /* 1455 * We can just remove wired pages from the queue 1456 */ 1457 if (m->wire_count) { 1458 vm_page_unqueue_nowakeup(m); 1459 vm_page_wakeup(m); 1460 goto next; 1461 } 1462 1463 /* 1464 * The emergency pager ignores vnode-backed pages as these 1465 * are the pages that probably bricked the main pager. 1466 */ 1467 if (isep && m->object && m->object->type == OBJT_VNODE) { 1468 vm_page_and_queue_spin_lock(m); 1469 if (m->queue - m->pc == PQ_ACTIVE) { 1470 TAILQ_REMOVE( 1471 &vm_page_queues[PQ_ACTIVE + q].pl, 1472 m, pageq); 1473 TAILQ_INSERT_TAIL( 1474 &vm_page_queues[PQ_ACTIVE + q].pl, 1475 m, pageq); 1476 } 1477 vm_page_and_queue_spin_unlock(m); 1478 vm_page_wakeup(m); 1479 goto next; 1480 } 1481 1482 /* 1483 * The count for pagedaemon pages is done after checking the 1484 * page for eligibility... 1485 */ 1486 mycpu->gd_cnt.v_pdpages++; 1487 1488 /* 1489 * Check to see "how much" the page has been used and clear 1490 * the tracking access bits. If the object has no references 1491 * don't bother paying the expense. 1492 */ 1493 actcount = 0; 1494 if (m->object && m->object->ref_count != 0) { 1495 if (m->flags & PG_REFERENCED) 1496 ++actcount; 1497 actcount += pmap_ts_referenced(m); 1498 if (actcount) { 1499 m->act_count += ACT_ADVANCE + actcount; 1500 if (m->act_count > ACT_MAX) 1501 m->act_count = ACT_MAX; 1502 } 1503 } 1504 vm_page_flag_clear(m, PG_REFERENCED); 1505 1506 /* 1507 * actcount is only valid if the object ref_count is non-zero. 1508 * If the page does not have an object, actcount will be zero. 1509 */ 1510 if (actcount && m->object->ref_count != 0) { 1511 vm_page_and_queue_spin_lock(m); 1512 if (m->queue - m->pc == PQ_ACTIVE) { 1513 TAILQ_REMOVE( 1514 &vm_page_queues[PQ_ACTIVE + q].pl, 1515 m, pageq); 1516 TAILQ_INSERT_TAIL( 1517 &vm_page_queues[PQ_ACTIVE + q].pl, 1518 m, pageq); 1519 } 1520 vm_page_and_queue_spin_unlock(m); 1521 vm_page_wakeup(m); 1522 } else { 1523 switch(m->object->type) { 1524 case OBJT_DEFAULT: 1525 case OBJT_SWAP: 1526 m->act_count -= min(m->act_count, 1527 vm_anonmem_decline); 1528 break; 1529 default: 1530 m->act_count -= min(m->act_count, 1531 vm_filemem_decline); 1532 break; 1533 } 1534 if (vm_pageout_algorithm || 1535 (m->object == NULL) || 1536 (m->object && (m->object->ref_count == 0)) || 1537 m->act_count < pass + 1 1538 ) { 1539 /* 1540 * Deactivate the page. If we had a 1541 * shortage from our inactive scan try to 1542 * free (cache) the page instead. 1543 * 1544 * Don't just blindly cache the page if 1545 * we do not have a shortage from the 1546 * inactive scan, that could lead to 1547 * gigabytes being moved. 
1548 */ 1549 --inactive_shortage; 1550 if (avail_shortage - delta > 0 || 1551 (m->object && (m->object->ref_count == 0))) 1552 { 1553 if (avail_shortage - delta > 0) 1554 ++*recycle_countp; 1555 vm_page_protect(m, VM_PROT_NONE); 1556 if (m->dirty == 0 && 1557 (m->flags & PG_NEED_COMMIT) == 0 && 1558 avail_shortage - delta > 0) { 1559 vm_page_cache(m); 1560 } else { 1561 vm_page_deactivate(m); 1562 vm_page_wakeup(m); 1563 } 1564 } else { 1565 vm_page_deactivate(m); 1566 vm_page_wakeup(m); 1567 } 1568 ++delta; 1569 } else { 1570 vm_page_and_queue_spin_lock(m); 1571 if (m->queue - m->pc == PQ_ACTIVE) { 1572 TAILQ_REMOVE( 1573 &vm_page_queues[PQ_ACTIVE + q].pl, 1574 m, pageq); 1575 TAILQ_INSERT_TAIL( 1576 &vm_page_queues[PQ_ACTIVE + q].pl, 1577 m, pageq); 1578 } 1579 vm_page_and_queue_spin_unlock(m); 1580 vm_page_wakeup(m); 1581 } 1582 } 1583 next: 1584 lwkt_yield(); 1585 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1586 } 1587 1588 /* 1589 * Clean out our local marker. 1590 * 1591 * Page queue still spin-locked. 1592 */ 1593 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1594 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1595 1596 return (delta); 1597 } 1598 1599 /* 1600 * The number of actually free pages can drop down to v_free_reserved, 1601 * we try to build the free count back above v_free_min. Note that 1602 * vm_paging_needed() also returns TRUE if v_free_count is not at 1603 * least v_free_min so that is the minimum we must build the free 1604 * count to. 1605 * 1606 * We use a slightly higher target to improve hysteresis, 1607 * ((v_free_target + v_free_min) / 2). Since v_free_target 1608 * is usually the same as v_cache_min this maintains about 1609 * half the pages in the free queue as are in the cache queue, 1610 * providing pretty good pipelining for pageout operation. 1611 * 1612 * The system operator can manipulate vm.v_cache_min and 1613 * vm.v_free_target to tune the pageout demon. Be sure 1614 * to keep vm.v_free_min < vm.v_free_target. 1615 * 1616 * Note that the original paging target is to get at least 1617 * (free_min + cache_min) into (free + cache). The slightly 1618 * higher target will shift additional pages from cache to free 1619 * without effecting the original paging target in order to 1620 * maintain better hysteresis and not have the free count always 1621 * be dead-on v_free_min. 1622 * 1623 * NOTE: we are still in a critical section. 1624 * 1625 * Pages moved from PQ_CACHE to totally free are not counted in the 1626 * pages_freed counter. 1627 * 1628 * WARNING! Can be called from two pagedaemon threads simultaneously. 1629 */ 1630 static void 1631 vm_pageout_scan_cache(long avail_shortage, int pass, 1632 long vnodes_skipped, long recycle_count) 1633 { 1634 static int lastkillticks; 1635 struct vm_pageout_scan_info info; 1636 vm_page_t m; 1637 int isep; 1638 1639 isep = (curthread == emergpager); 1640 1641 while (vmstats.v_free_count < 1642 (vmstats.v_free_min + vmstats.v_free_target) / 2) { 1643 /* 1644 * This steals some code from vm/vm_page.c 1645 * 1646 * Create two rovers and adjust the code to reduce 1647 * chances of them winding up at the same index (which 1648 * can cause a lot of contention). 
		 */
		static int cache_rover[2] = { 0, PQ_L2_MASK / 2 };

		if (((cache_rover[0] ^ cache_rover[1]) & PQ_L2_MASK) == 0)
			goto next_rover;

		m = vm_page_list_find(PQ_CACHE, cache_rover[isep] & PQ_L2_MASK);
		if (m == NULL)
			break;
		/*
		 * page is returned removed from its queue and spinlocked
		 *
		 * If the busy attempt fails we can still deactivate the page.
		 */
		if (vm_page_busy_try(m, TRUE)) {
			vm_page_deactivate_locked(m);
			vm_page_spin_unlock(m);
			continue;
		}
		vm_page_spin_unlock(m);
		pagedaemon_wakeup();
		lwkt_yield();

		/*
		 * Remaining operations run with the page busy and neither
		 * the page or the queue will be spin-locked.
		 */
		if ((m->flags & (PG_UNQUEUED | PG_NEED_COMMIT)) ||
		    m->hold_count ||
		    m->wire_count) {
			vm_page_deactivate(m);
			vm_page_wakeup(m);
			continue;
		}

		/*
		 * Because the page is in the cache, it shouldn't be mapped.
		 */
		pmap_mapped_sync(m);
		KKASSERT((m->flags & PG_MAPPED) == 0);
		KKASSERT(m->dirty == 0);
		vm_pageout_page_free(m);
		mycpu->gd_cnt.v_dfree++;
next_rover:
		if (isep)
			cache_rover[1] -= PQ_PRIME2;
		else
			cache_rover[0] += PQ_PRIME2;
	}

#if !defined(NO_SWAPPING)
	/*
	 * Idle process swapout -- run once per second.
	 */
	if (vm_swap_idle_enabled) {
		static time_t lsec;
		if (time_uptime != lsec) {
			atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_IDLE);
			vm_req_vmdaemon();
			lsec = time_uptime;
		}
	}
#endif

	/*
	 * If we didn't get enough free pages, and we have skipped a vnode
	 * in a writeable object, wakeup the sync daemon.  And kick swapout
	 * if we did not get enough free pages.
	 */
	if (vm_paging_target() > 0) {
		if (vnodes_skipped && vm_page_count_min(0))
			speedup_syncer(NULL);
#if !defined(NO_SWAPPING)
		if (vm_swap_enabled && vm_page_count_target()) {
			atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_NORMAL);
			vm_req_vmdaemon();
		}
#endif
	}

	/*
	 * Handle catastrophic conditions.  Under good conditions we should
	 * be at the target, well beyond our minimum.  If we could not even
	 * reach our minimum the system is under heavy stress.  But just being
	 * under heavy stress does not trigger process killing.
	 *
	 * We consider ourselves to have run out of memory if the swap pager
	 * is full and avail_shortage is still positive.  The secondary check
	 * ensures that we do not kill processes if the instantaneous
	 * availability is good, even if the pageout daemon pass says it
	 * couldn't get to the target.
	 *
	 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT HANDLE SWAP FULL
	 *	 SITUATIONS.
	 */
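	/*
	 * Editorial note (added commentary, not original to this file):
	 * the two tests below are deliberately staggered.  The first only
	 * prints a warning when the swap pager is merely almost full; the
	 * second may actually kill a process and, in addition to requiring
	 * swap_pager_full, is rate-limited via lastkillticks so that, with
	 * hz = 100 for example, at most one kill can occur per 100 ticks
	 * (one second), giving the victim time to exit and release memory.
	 */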
	if (swap_pager_almost_full &&
	    pass > 0 &&
	    isep == 0 &&
	    (vm_page_count_min(recycle_count) || avail_shortage > 0)) {
		kprintf("Warning: system low on memory+swap "
			"shortage %ld for %d ticks!\n",
			avail_shortage, ticks - swap_fail_ticks);
		if (bootverbose)
			kprintf("Metrics: spaf=%d spf=%d pass=%d "
				"avail=%ld target=%ld last=%u\n",
				swap_pager_almost_full,
				swap_pager_full,
				pass,
				avail_shortage,
				vm_paging_target(),
				(unsigned int)(ticks - lastkillticks));
	}
	if (swap_pager_full &&
	    pass > 1 &&
	    isep == 0 &&
	    avail_shortage > 0 &&
	    vm_paging_target() > 0 &&
	    (unsigned int)(ticks - lastkillticks) >= hz) {
		/*
		 * Kill something, maximum rate once per second to give
		 * the process time to free up sufficient memory.
		 */
		lastkillticks = ticks;
		info.bigproc = NULL;
		info.bigsize = 0;
		allproc_scan(vm_pageout_scan_callback, &info, 0);
		if (info.bigproc != NULL) {
			kprintf("Try to kill process %d %s\n",
				info.bigproc->p_pid, info.bigproc->p_comm);
			info.bigproc->p_nice = PRIO_MIN;
			info.bigproc->p_usched->resetpriority(
				FIRST_LWP_IN_PROC(info.bigproc));
			atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL);
			killproc(info.bigproc, "out of swap space");
			wakeup(&vmstats.v_free_count);
			PRELE(info.bigproc);
		}
	}
}

static int
vm_pageout_scan_callback(struct proc *p, void *data)
{
	struct vm_pageout_scan_info *info = data;
	vm_offset_t size;

	/*
	 * Never kill system processes or init.  If we have configured swap
	 * then try to avoid killing low-numbered pids.
	 */
	if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) ||
	    ((p->p_pid < 48) && (vm_swap_size != 0))) {
		return (0);
	}

	lwkt_gettoken(&p->p_token);

	/*
	 * If the process is in a non-running type state,
	 * don't touch it.
	 */
	if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
		lwkt_reltoken(&p->p_token);
		return (0);
	}

	/*
	 * Get the approximate process size.  Note that anonymous pages
	 * with backing swap will be counted twice, but there should not
	 * be too many such pages due to the stress the VM system is
	 * under at this point.
	 */
	size = vmspace_anonymous_count(p->p_vmspace) +
	       vmspace_swap_count(p->p_vmspace);

	/*
	 * If this process is bigger than the biggest one,
	 * remember it.
	 */
	if (info->bigsize < size) {
		if (info->bigproc)
			PRELE(info->bigproc);
		PHOLD(p);
		info->bigproc = p;
		info->bigsize = size;
	}
	lwkt_reltoken(&p->p_token);
	lwkt_yield();

	return(0);
}

/*
 * This old guy slowly walks PQ_HOLD looking for pages which need to be
 * moved back to PQ_FREE.  It is possible for pages to accumulate here
 * when vm_page_free() races against vm_page_unhold(), resulting in a
 * page being left on a PQ_HOLD queue with hold_count == 0.
 *
 * It is easier to handle this edge condition here, in non-critical code,
 * rather than enforce a spin-lock for every 1->0 transition in
 * vm_page_unhold().
 *
 * NOTE: TAILQ_FOREACH becomes invalid the instant we unlock the queue.
 */
static void
vm_pageout_scan_hold(int q)
{
        vm_page_t m;

        vm_page_queues_spin_lock(PQ_HOLD + q);
        TAILQ_FOREACH(m, &vm_page_queues[PQ_HOLD + q].pl, pageq) {
                if (m->flags & PG_MARKER)
                        continue;

                /*
                 * Process one page and return.
                 */
                if (m->hold_count)
                        break;
                kprintf("DEBUG: pageout HOLD->FREE %p\n", m);
                vm_page_hold(m);
                vm_page_queues_spin_unlock(PQ_HOLD + q);
                vm_page_unhold(m);      /* reprocess */
                return;
        }
        vm_page_queues_spin_unlock(PQ_HOLD + q);
}

/*
 * This routine tries to maintain the pseudo-LRU active queue so that
 * some statistics accumulation still occurs during long periods when
 * there is no paging.  This helps the situation where paging just
 * starts to occur.
 */
static void
vm_pageout_page_stats(int q)
{
        static int fullintervalcount = 0;
        struct vm_page marker;
        vm_page_t m;
        long pcount, tpcount;           /* number of pages to check */
        long page_shortage;

        page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max +
                         vmstats.v_free_min) -
                        (vmstats.v_free_count + vmstats.v_inactive_count +
                         vmstats.v_cache_count);

        if (page_shortage <= 0)
                return;

        pcount = vm_page_queues[PQ_ACTIVE + q].lcnt;
        fullintervalcount += vm_pageout_stats_interval;
        if (fullintervalcount < vm_pageout_full_stats_interval) {
                tpcount = (vm_pageout_stats_max * pcount) /
                          vmstats.v_page_count + 1;
                if (pcount > tpcount)
                        pcount = tpcount;
        } else {
                fullintervalcount = 0;
        }

        bzero(&marker, sizeof(marker));
        marker.flags = PG_FICTITIOUS | PG_MARKER;
        marker.busy_count = PBUSY_LOCKED;
        marker.queue = PQ_ACTIVE + q;
        marker.pc = q;
        marker.wire_count = 1;

        vm_page_queues_spin_lock(PQ_ACTIVE + q);
        TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);

        /*
         * Queue locked at top of loop to avoid stack marker issues.
         */
        while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
               pcount-- > 0)
        {
                int actcount;

                KKASSERT(m->queue == PQ_ACTIVE + q);
                TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
                TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
                                   &marker, pageq);

                /*
                 * Skip marker pages (atomic against other markers to avoid
                 * infinite hop-over scans).
                 */
                if (m->flags & PG_MARKER)
                        continue;

                /*
                 * Ignore pages we can't busy.
                 */
                if (vm_page_busy_try(m, TRUE))
                        continue;

                /*
                 * Remaining operations run with the page busy and neither
                 * the page nor the queue will be spin-locked.
                 */
                KKASSERT(m->queue == PQ_ACTIVE + q);
                vm_page_queues_spin_unlock(PQ_ACTIVE + q);

                /*
                 * We can just remove wired pages from the queue.
                 */
                if (m->wire_count) {
                        vm_page_unqueue_nowakeup(m);
                        vm_page_wakeup(m);
                        goto next;
                }

                /*
                 * We now have a safely busied page, the page and queue
                 * spinlocks have been released.
                 *
                 * Ignore held and wired pages.
                 */
                if (m->hold_count || m->wire_count) {
                        vm_page_wakeup(m);
                        goto next;
                }

                /*
                 * Calculate activity.
                 */
                actcount = 0;
                if (m->flags & PG_REFERENCED) {
                        vm_page_flag_clear(m, PG_REFERENCED);
                        actcount += 1;
                }
                actcount += pmap_ts_referenced(m);

                /*
                 * Update act_count and move the page to the end of the queue.
                 */
                if (actcount) {
                        m->act_count += ACT_ADVANCE + actcount;
                        if (m->act_count > ACT_MAX)
                                m->act_count = ACT_MAX;
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_ACTIVE) {
                                TAILQ_REMOVE(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                                TAILQ_INSERT_TAIL(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                        vm_page_wakeup(m);
                        goto next;
                }

                if (m->act_count == 0) {
                        /*
                         * We turn off page access, so that we have
                         * more accurate RSS stats.  We don't do this
                         * in the normal page deactivation when the
                         * system is loaded VM wise, because the
                         * cost of the large number of page protect
                         * operations would be higher than the value
                         * of doing the operation.
                         *
                         * We use the marker to save our place so
                         * we can release the spin lock.  Both (m)
                         * and (next) will be invalid.
                         */
                        vm_page_protect(m, VM_PROT_NONE);
                        vm_page_deactivate(m);
                } else {
                        m->act_count -= min(m->act_count, ACT_DECLINE);
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_ACTIVE) {
                                TAILQ_REMOVE(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                                TAILQ_INSERT_TAIL(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                }
                vm_page_wakeup(m);
next:
                vm_page_queues_spin_lock(PQ_ACTIVE + q);
        }

        /*
         * Remove our local marker.
         *
         * Page queue still spin-locked.
         */
        TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
        vm_page_queues_spin_unlock(PQ_ACTIVE + q);
}

static void
vm_pageout_free_page_calc(vm_size_t count)
{
        /*
         * v_free_min              normal allocations
         * v_free_reserved         system allocations
         * v_pageout_free_min      allocations by pageout daemon
         * v_interrupt_free_min    low level allocations (e.g. swap structures)
         *
         * v_free_min is used to generate several other baselines, and they
         * can get pretty silly on systems with a lot of memory.
         */
        vmstats.v_free_min = 64 + vmstats.v_page_count / 200;
        vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
        vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
        vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
        vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;
}

/*
 * vm_pageout is the high level pageout daemon.  TWO kernel threads run
 * this daemon, the primary pageout daemon and the emergency pageout daemon.
 *
 * The emergency pageout daemon takes over when the primary pageout daemon
 * deadlocks.  The emergency pageout daemon ONLY pages out to swap, thus
 * avoiding the many low-memory deadlocks which can occur when paging out
 * to VFS's.
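 *
 * For example (an illustrative scenario only): if the primary daemon
 * blocks while pushing a dirty vnode-backed page through a filesystem
 * that itself needs memory to make progress, the emergency daemon can
 * still relieve the pressure because writing anonymous pages to swap
 * does not re-enter the VFS.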
 */
static void
vm_pageout_thread(void)
{
        int pass;
        int q;
        int q1iterator = 0;
        int q2iterator = 0;
        int q3iterator = 0;
        int isep;

        curthread->td_flags |= TDF_SYSTHREAD;

        /*
         * We only need to do the setup once.
         */
        isep = 0;
        if (curthread == emergpager) {
                isep = 1;
                goto skip_setup;
        }

        /*
         * Initialize some paging parameters.
         */
        vm_pageout_free_page_calc(vmstats.v_page_count);

        /*
         * v_free_target and v_cache_min control pageout hysteresis.  Note
         * that these are more a measure of the VM cache queue hysteresis
         * than the VM free queue.  Specifically, v_free_target is the
         * high water mark (free+cache pages).
         *
         * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
         * low water mark, while v_free_min is the stop.  v_cache_min must
         * be big enough to handle memory needs while the pageout daemon
         * is signalled and run to free more pages.
         */
        vmstats.v_free_target = 4 * vmstats.v_free_min +
                                vmstats.v_free_reserved;

        /*
         * NOTE: With the new buffer cache b_act_count we want the default
         *       inactive target to be a percentage of available memory.
         *
         *       The inactive target essentially determines the minimum
         *       number of 'temporary' pages capable of caching one-time-use
         *       files when the VM system is otherwise full of pages
         *       belonging to multi-time-use files or active program data.
         *
         * NOTE: The inactive target is aggressively pursued only if the
         *       inactive queue becomes too small.  If the inactive queue
         *       is large enough to satisfy page movement to free+cache
         *       then it is repopulated more slowly from the active queue.
         *       This allows a general inactive_target default to be set.
         *
         *       There is an issue here for processes which sit mostly idle
         *       'overnight', such as sshd, tcsh, and X.  Any movement from
         *       the active queue will eventually cause such pages to
         *       recycle, eventually causing a lot of paging in the morning.
         *       To reduce the incidence of this, pages cycled out of the
         *       buffer cache are moved directly to the inactive queue if
         *       they were only used once or twice.
         *
         *       The vfs.vm_cycle_point sysctl can be used to adjust this.
         *       Increasing the value (up to 64) increases the number of
         *       buffer recyclements which go directly to the inactive queue.
         */
        if (vmstats.v_free_count > 2048) {
                vmstats.v_cache_min = vmstats.v_free_target;
                vmstats.v_cache_max = 2 * vmstats.v_cache_min;
        } else {
                vmstats.v_cache_min = 0;
                vmstats.v_cache_max = 0;
        }
        vmstats.v_inactive_target = vmstats.v_free_count / 4;

        /* XXX does not really belong here */
        if (vm_page_max_wired == 0)
                vm_page_max_wired = vmstats.v_free_count / 3;

        if (vm_pageout_stats_max == 0)
                vm_pageout_stats_max = vmstats.v_free_target;

        /*
         * Set the interval, in seconds, for the stats scan.
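         *
         * Note that vm_pageout_stats_interval also serves as the timeout,
         * in seconds, for the primary daemon's idle tsleep() in the main
         * loop below; if that sleep times out and no paging is needed, the
         * daemon runs vm_pageout_page_stats() across the queues instead.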
         */
        if (vm_pageout_stats_interval == 0)
                vm_pageout_stats_interval = 5;
        if (vm_pageout_full_stats_interval == 0)
                vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;

        /*
         * Set the maximum number of pages to free per pass.
         */
        if (vm_pageout_stats_free_max == 0)
                vm_pageout_stats_free_max = 5;

        swap_pager_swap_init();
        pass = 0;

        atomic_swap_int(&sequence_emerg_pager, 1);
        wakeup(&sequence_emerg_pager);

skip_setup:
        /*
         * Sequence the emergency pager startup.
         */
        if (isep) {
                while (sequence_emerg_pager == 0)
                        tsleep(&sequence_emerg_pager, 0, "pstartup", hz);
        }

        /*
         * The pageout daemon is never done, so loop forever.
         *
         * WARNING! This code is being executed by two kernel threads
         *          potentially simultaneously.
         */
        while (TRUE) {
                int error;
                long avail_shortage;
                long inactive_shortage;
                long vnodes_skipped = 0;
                long recycle_count = 0;
                long tmp;

                /*
                 * Wait for an action request.  If we time out, check to
                 * see if paging is needed (in case the normal wakeup
                 * code raced us).
                 */
                if (isep) {
                        /*
                         * The emergency pagedaemon monitors the primary
                         * pagedaemon while vm_pages_needed != 0.
                         *
                         * The emergency pagedaemon only runs if VM paging
                         * is needed and the primary pagedaemon has not
                         * updated vm_pagedaemon_time for more than 2 seconds.
                         */
                        if (vm_pages_needed)
                                tsleep(&vm_pagedaemon_time, 0, "psleep", hz);
                        else
                                tsleep(&vm_pagedaemon_time, 0, "psleep", hz * 10);
                        if (vm_pages_needed == 0) {
                                pass = 0;
                                continue;
                        }
                        if ((int)(ticks - vm_pagedaemon_time) < hz * 2) {
                                pass = 0;
                                continue;
                        }
                } else {
                        /*
                         * Primary pagedaemon
                         *
                         * NOTE: We unconditionally clean up PQ_HOLD even
                         *       when there is no work to do.
                         */
                        vm_pageout_scan_hold(q3iterator & PQ_L2_MASK);
                        ++q3iterator;

                        if (vm_pages_needed == 0) {
                                error = tsleep(&vm_pages_needed,
                                               0, "psleep",
                                               vm_pageout_stats_interval * hz);
                                if (error &&
                                    vm_paging_needed(0) == 0 &&
                                    vm_pages_needed == 0) {
                                        for (q = 0; q < PQ_L2_SIZE; ++q)
                                                vm_pageout_page_stats(q);
                                        continue;
                                }
                                vm_pagedaemon_time = ticks;
                                vm_pages_needed = 1;

                                /*
                                 * Wake the emergency pagedaemon up so it
                                 * can monitor us.  It will automatically
                                 * go back into a long sleep when
                                 * vm_pages_needed returns to 0.
                                 */
                                wakeup(&vm_pagedaemon_time);
                        }
                }

                mycpu->gd_cnt.v_pdwakeups++;

                /*
                 * Scan for INACTIVE->CLEAN/PAGEOUT
                 *
                 * This routine tries to avoid thrashing the system with
                 * unnecessary activity.
                 *
                 * Calculate our target for the number of free+cache pages we
                 * want to get to.  This is higher than the number that causes
                 * allocations to stall (severe) in order to provide hysteresis,
                 * and if we don't make it all the way but get to the minimum
                 * we're happy.  Goose it a bit if there are multiple requests
                 * for memory.
                 *
                 * Don't reduce avail_shortage inside the loop or the
                 * PQAVERAGE() calculation will break.
                 *
                 * NOTE! deficit is differentiated from avail_shortage as
                 *       REQUIRING at least (deficit) pages to be cleaned,
                 *       even if the page queues are in good shape.
                 *       This is used primarily for handling per-process
                 *       RLIMIT_RSS and may also see small values when
                 *       processes block due to low memory.
                 */
                vmstats_rollup();
                if (isep == 0)
                        vm_pagedaemon_time = ticks;
                avail_shortage = vm_paging_target() + vm_pageout_deficit;
                vm_pageout_deficit = 0;

                if (avail_shortage > 0) {
                        long delta = 0;
                        int qq;

                        qq = q1iterator;
                        for (q = 0; q < PQ_L2_SIZE; ++q) {
                                delta += vm_pageout_scan_inactive(
                                                pass,
                                                qq & PQ_L2_MASK,
                                                PQAVERAGE(avail_shortage),
                                                &vnodes_skipped);
                                if (isep)
                                        --qq;
                                else
                                        ++qq;
                                if (avail_shortage - delta <= 0)
                                        break;

                                /*
                                 * It is possible for avail_shortage to be
                                 * very large.  If a large program exits or
                                 * frees a ton of memory all at once, we do
                                 * not have to continue deactivations.
                                 *
                                 * (We will still run the active->inactive
                                 * target, however).
                                 */
                                if (!vm_page_count_target() &&
                                    !vm_page_count_min(
                                                vm_page_free_hysteresis)) {
                                        avail_shortage = 0;
                                        break;
                                }
                        }
                        avail_shortage -= delta;
                        q1iterator = qq;
                }

                /*
                 * Figure out how many active pages we must deactivate.  If
                 * we were able to reach our target with just the inactive
                 * scan above we limit the number of active pages we
                 * deactivate to reduce unnecessary work.
                 */
                vmstats_rollup();
                if (isep == 0)
                        vm_pagedaemon_time = ticks;
                inactive_shortage = vmstats.v_inactive_target -
                                    vmstats.v_inactive_count;

                /*
                 * If we were unable to free sufficient inactive pages to
                 * satisfy the free/cache queue requirements then simply
                 * reaching the inactive target may not be good enough.
                 * Try to deactivate pages in excess of the target based
                 * on the shortfall.
                 *
                 * However, to prevent thrashing the VM system, do not
                 * deactivate more than an additional 1/10 of the inactive
                 * target's worth of active pages.
                 */
                if (avail_shortage > 0) {
                        tmp = avail_shortage * 2;
                        if (tmp > vmstats.v_inactive_target / 10)
                                tmp = vmstats.v_inactive_target / 10;
                        inactive_shortage += tmp;
                }

                /*
                 * Only trigger a pmap cleanup on inactive shortage.
                 */
                if (isep == 0 && inactive_shortage > 0) {
                        pmap_collect();
                }

                /*
                 * Scan for ACTIVE->INACTIVE
                 *
                 * Only trigger on inactive shortage.  Triggering on
                 * avail_shortage can starve the active queue with
                 * unnecessary active->inactive transitions and destroy
                 * performance.
                 *
                 * If this is the emergency pager, always try to move
                 * a few pages from active to inactive because the inactive
                 * queue might have enough pages, but not enough anonymous
                 * pages.
                 */
                if (isep && inactive_shortage < vm_emerg_launder)
                        inactive_shortage = vm_emerg_launder;

                if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) {
                        long delta = 0;
                        int qq;

                        qq = q2iterator;
                        for (q = 0; q < PQ_L2_SIZE; ++q) {
                                delta += vm_pageout_scan_active(
                                                pass,
                                                qq & PQ_L2_MASK,
                                                PQAVERAGE(avail_shortage),
                                                PQAVERAGE(inactive_shortage),
                                                &recycle_count);
                                if (isep)
                                        --qq;
                                else
                                        ++qq;
                                if (inactive_shortage - delta <= 0 &&
                                    avail_shortage - delta <= 0) {
                                        break;
                                }

                                /*
                                 * inactive_shortage can be a very large
                                 * number.
                                 * This is intended to break out early if
                                 * our inactive_target has been reached due
                                 * to other system activity.
                                 */
                                if (vmstats.v_inactive_count >
                                    vmstats.v_inactive_target) {
                                        inactive_shortage = 0;
                                        break;
                                }
                        }
                        inactive_shortage -= delta;
                        avail_shortage -= delta;
                        q2iterator = qq;
                }

                /*
                 * Scan for CACHE->FREE
                 *
                 * Finally free enough cache pages to meet our free page
                 * requirement and take more drastic measures if we are
                 * still in trouble.
                 */
                vmstats_rollup();
                if (isep == 0)
                        vm_pagedaemon_time = ticks;
                vm_pageout_scan_cache(avail_shortage, pass,
                                      vnodes_skipped, recycle_count);

                /*
                 * This is a bit sophisticated because we do not necessarily
                 * want to force paging until our targets are reached if we
                 * were able to successfully retire the shortage we calculated.
                 */
                if (avail_shortage > 0) {
                        /*
                         * If we did not retire enough pages continue the
                         * pageout operation until we are able to.
                         */
                        ++pass;

                        if (pass < 10 && vm_pages_needed > 1) {
                                /*
                                 * Normal operation, additional processes
                                 * have already kicked us.  Retry immediately
                                 * unless swap space is completely full in
                                 * which case delay a bit.
                                 */
                                if (swap_pager_full) {
                                        tsleep(&vm_pages_needed, 0, "pdelay",
                                               hz / 5);
                                } /* else immediate retry */
                        } else if (pass < 10) {
                                /*
                                 * Do a short sleep for the first 10 passes,
                                 * allow the sleep to be woken up by resetting
                                 * vm_pages_needed to 1 (NOTE: we are still
                                 * actively paging!).
                                 */
                                if (isep == 0)
                                        vm_pages_needed = 1;
                                tsleep(&vm_pages_needed, 0, "pdelay", 2);
                        } else if (swap_pager_full == 0) {
                                /*
                                 * We've taken too many passes, force a
                                 * longer delay.
                                 */
                                tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
                        } else {
                                /*
                                 * Running out of memory, catastrophic
                                 * back-off to one-second intervals.
                                 */
                                tsleep(&vm_pages_needed, 0, "pdelay", hz);
                        }
                } else if (vm_pages_needed) {
                        /*
                         * We retired our calculated shortage but we may have
                         * to continue paging if threads drain memory too far
                         * below our target.
                         *
                         * Similar to vm_page_free_wakeup() in vm_page.c.
                         */
                        pass = 0;
                        if (!vm_paging_needed(0)) {
                                /* still more than half-way to our target */
                                vm_pages_needed = 0;
                                wakeup(&vmstats.v_free_count);
                        } else if (!vm_page_count_min(vm_page_free_hysteresis)) {
                                /*
                                 * Continue operations with wakeup
                                 * (set variable to avoid overflow)
                                 */
                                vm_pages_needed = 2;
                                wakeup(&vmstats.v_free_count);
                        } else {
                                /*
                                 * No wakeup() needed, continue operations.
                                 * (set variable to avoid overflow)
                                 */
                                vm_pages_needed = 2;
                        }
                } else {
                        /*
                         * Turn paging back on immediately if we are under
                         * the minimum.
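                         *
                         * Resetting pass also means that if a shortage
                         * develops again later, the loop above restarts
                         * with the short-delay retries rather than the
                         * longer catastrophic back-off sleeps.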
                         */
                        pass = 0;
                }
        }
}

static struct kproc_desc pg1_kp = {
        "pagedaemon",
        vm_pageout_thread,
        &pagethread
};
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &pg1_kp);

static struct kproc_desc pg2_kp = {
        "emergpager",
        vm_pageout_thread,
        &emergpager
};
SYSINIT(emergpager, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &pg2_kp);

/*
 * Called after allocating a page out of the cache or free queue
 * to possibly wake the pagedaemon up to replenish our supply.
 *
 * We try to generate some hysteresis by waking the pagedaemon up
 * when our free+cache pages go below the free_min+cache_min level.
 * The pagedaemon tries to get the count back up to at least the
 * minimum, and through to the target level if possible.
 *
 * If the pagedaemon is already active bump vm_pages_needed as a hint
 * that there are even more requests pending.
 *
 * SMP races ok?
 * No requirements.
 */
void
pagedaemon_wakeup(void)
{
        if (vm_paging_needed(0) && curthread != pagethread) {
                if (vm_pages_needed <= 1) {
                        vm_pages_needed = 1;            /* SMP race ok */
                        wakeup(&vm_pages_needed);       /* tickle pageout */
                } else if (vm_page_count_min(0)) {
                        ++vm_pages_needed;              /* SMP race ok */
                        /* a wakeup() would be wasted here */
                }
        }
}

#if !defined(NO_SWAPPING)

/*
 * SMP races ok?
 * No requirements.
 */
static void
vm_req_vmdaemon(void)
{
        static int lastrun = 0;

        if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
                wakeup(&vm_daemon_needed);
                lastrun = ticks;
        }
}

static int vm_daemon_callback(struct proc *p, void *data __unused);

/*
 * No requirements.
 */
static void
vm_daemon(void)
{
        int req_swapout;

        while (TRUE) {
                tsleep(&vm_daemon_needed, 0, "psleep", 0);
                req_swapout = atomic_swap_int(&vm_pageout_req_swapout, 0);

                /*
                 * Forced swapouts.  Pass the snapshotted request flags so
                 * we act on the value we atomically cleared above.
                 */
                if (req_swapout)
                        swapout_procs(req_swapout);

                /*
                 * Scan processes which exceed their RSS rlimit or which
                 * are swapped out, and deactivate their pages.
                 */
                allproc_scan(vm_daemon_callback, NULL, 0);
        }
}

static int
vm_daemon_callback(struct proc *p, void *data __unused)
{
        struct vmspace *vm;
        vm_pindex_t limit, size;

        /*
         * If this is a system process or if we have already
         * looked at this process, skip it.
         */
        lwkt_gettoken(&p->p_token);

        if (p->p_flags & (P_SYSTEM | P_WEXIT)) {
                lwkt_reltoken(&p->p_token);
                return (0);
        }

        /*
         * If the process is in a non-running type state,
         * don't touch it.
         */
        if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
                lwkt_reltoken(&p->p_token);
                return (0);
        }

        /*
         * Get a limit.
         */
        limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
                                p->p_rlimit[RLIMIT_RSS].rlim_max));

        /*
         * Let processes that are swapped out really be
         * swapped out.  Set the limit to nothing to get as
         * many pages out to swap as possible.
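         *
         * With the limit forced to zero, the resident-size test below
         * (which allows roughly 4096 pages of slop) will deactivate
         * essentially the entire map via vm_pageout_map_deactivate_pages()
         * whenever vm_pageout_memuse_mode is at least 1, pushing the
         * process's pages toward the inactive queue and, ultimately, swap.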
         */
        if (p->p_flags & P_SWAPPEDOUT)
                limit = 0;

        vm = p->p_vmspace;
        vmspace_hold(vm);
        size = pmap_resident_tlnw_count(&vm->vm_pmap);
        if (limit >= 0 && size > 4096 &&
            size - 4096 >= limit && vm_pageout_memuse_mode >= 1) {
                vm_pageout_map_deactivate_pages(&vm->vm_map, limit);
        }
        vmspace_drop(vm);

        lwkt_reltoken(&p->p_token);

        return (0);
}

#endif