/*
 * Copyright (c) 2003-2020 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * The proverbial page-out daemon, rewritten many times over the decades.
 */

#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/conf.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>

#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

/*
 * System initialization
 */

/* the kernel process "vm_pageout" */
static int vm_pageout_page(vm_page_t m, long *max_launderp,
			   long *vnodes_skippedp, struct vnode **vpfailedp,
			   int pass, int vmflush_flags);
static int vm_pageout_clean_helper (vm_page_t, int);
static void vm_pageout_free_page_calc (vm_size_t count);
static void vm_pageout_page_free(vm_page_t m);
struct thread *emergpager;
struct thread *pagethread;
static int sequence_emerg_pager;

#if !defined(NO_SWAPPING)
/* the kernel process "vm_daemon" */
static void vm_daemon (void);
static struct thread *vmthread;

static struct kproc_desc vm_kp = {
	"vmdaemon",
	vm_daemon,
	&vmthread
};
SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp);
#endif

int vm_pages_needed = 0;	/* Event on which pageout daemon sleeps */
int vm_pageout_deficit = 0;	/* Estimated number of pages deficit */
int vm_pageout_pages_needed = 0;/* pageout daemon needs pages */
int vm_page_free_hysteresis = 16;
static int vm_pagedaemon_time;

#if !defined(NO_SWAPPING)
static int vm_pageout_req_swapout;
static int vm_daemon_needed;
#endif
__read_mostly static int vm_max_launder = 4096;
__read_mostly static int vm_emerg_launder = 100;
__read_mostly static int vm_pageout_stats_max = 0, vm_pageout_stats_interval = 0;
__read_mostly static int vm_pageout_full_stats_interval = 0;
__read_mostly static int vm_pageout_stats_free_max = 0, vm_pageout_algorithm = 0;
__read_mostly static int defer_swap_pageouts = 0;
__read_mostly static int disable_swap_pageouts = 0;
__read_mostly static u_int vm_anonmem_decline = ACT_DECLINE;
__read_mostly static u_int vm_filemem_decline = ACT_DECLINE * 2;
__read_mostly static int vm_pageout_debug;

#if defined(NO_SWAPPING)
__read_mostly static int vm_swap_enabled = 0;
__read_mostly static int vm_swap_idle_enabled = 0;
#else
__read_mostly static int vm_swap_enabled = 1;
__read_mostly static int vm_swap_idle_enabled = 0;
#endif

/* 0-disable, 1-passive, 2-active swp */
__read_mostly int vm_pageout_memuse_mode = 2;
__read_mostly int vm_pageout_allow_active = 1;

SYSCTL_UINT(_vm, VM_PAGEOUT_ALGORITHM, anonmem_decline,
	CTLFLAG_RW, &vm_anonmem_decline, 0, "active->inactive anon memory");

SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, filemem_decline,
	CTLFLAG_RW, &vm_filemem_decline, 0, "active->inactive file cache");

SYSCTL_INT(_vm, OID_AUTO, page_free_hysteresis,
	CTLFLAG_RW, &vm_page_free_hysteresis, 0,
	"Free more pages than the minimum required");

SYSCTL_INT(_vm, OID_AUTO, max_launder,
	CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");
SYSCTL_INT(_vm, OID_AUTO, emerg_launder,
	CTLFLAG_RW, &vm_emerg_launder, 0, "Emergency pager minimum");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
	CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");

SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
	CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
	CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
	CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");
SYSCTL_INT(_vm, OID_AUTO, pageout_memuse_mode,
	CTLFLAG_RW, &vm_pageout_memuse_mode, 0, "memoryuse resource mode");
SYSCTL_INT(_vm, OID_AUTO, pageout_allow_active,
	CTLFLAG_RW, &vm_pageout_allow_active, 0, "allow inactive+active");
SYSCTL_INT(_vm, OID_AUTO, pageout_debug,
	CTLFLAG_RW, &vm_pageout_debug, 0, "debug pageout pages (count)");


#if defined(NO_SWAPPING)
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RD, &vm_swap_enabled, 0, "");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
	CTLFLAG_RD, &vm_swap_idle_enabled, 0, "");
#else
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
	CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
	CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
#endif

SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
	CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");

SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
	CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");

static int pageout_lock_miss;
SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
	CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");

int vm_page_max_wired;		/* XXX max # of wired pages system-wide */

#if !defined(NO_SWAPPING)
static void vm_req_vmdaemon (void);
#endif
static void vm_pageout_page_stats(int q);

/*
 * Calculate approximately how many pages on each queue to try to
 * clean.  An exact calculation creates an edge condition when the
 * queues are unbalanced so add significant slop.  The queue scans
 * will stop early when targets are reached and will start where they
 * left off on the next pass.
 *
 * We need to be generous here because there are all sorts of loading
 * conditions that can cause edge cases if we try to average over all
 * queues.  In particular, storage subsystems have become so fast that
 * paging activity can become quite frantic.  Eventually we will probably
 * need two paging threads, one for dirty pages and one for clean, to
 * deal with the bandwidth requirements.
 *
 * So what we do is calculate a value that can be satisfied nominally by
 * only having to scan half the queues.
 */
static __inline long
PQAVERAGE(long n)
{
	long avg;

	if (n >= 0) {
		avg = ((n + (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) + 1);
	} else {
		avg = ((n - (PQ_L2_SIZE - 1)) / (PQ_L2_SIZE / 2) - 1);
	}
	return avg;
}
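
/*
 * Illustrative note (hypothetical numbers, not taken from the code above):
 * if PQ_L2_SIZE were 32 and the shortage n were 100, PQAVERAGE(100) would
 * return ((100 + 31) / 16 + 1) = 9, so scanning roughly half of the 32
 * queues at up to 9 pages each (~144 pages) already covers the full
 * shortage; that head-room is the "slop" described above.
 */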

/*
 * vm_pageout_clean_helper:
 *
 * Clean the page and remove it from the laundry.  The page must be busied
 * by the caller and will be disposed of (put away, flushed) by this routine.
 */
static int
vm_pageout_clean_helper(vm_page_t m, int vmflush_flags)
{
	vm_object_t object;
	vm_page_t mc[BLIST_MAX_ALLOC];
	int error;
	int ib, is, page_base;
	vm_pindex_t pindex = m->pindex;

	object = m->object;

	/*
	 * Don't mess with the page if it's held or special.  Theoretically
	 * we can pageout held pages but there is no real need to press our
	 * luck, so don't.
	 */
	if (m->hold_count != 0 || (m->flags & PG_UNQUEUED)) {
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * Place page in cluster.  Align cluster for optimal swap space
	 * allocation (whether it is swap or not).  This is typically ~16-32
	 * pages, which also tends to align the cluster to multiples of the
	 * filesystem block size if backed by a filesystem.
	 */
	page_base = pindex % BLIST_MAX_ALLOC;
	mc[page_base] = m;
	ib = page_base - 1;
	is = page_base + 1;

	/*
	 * Scan object for clusterable pages.
	 *
	 * We can cluster ONLY if: ->> the page is NOT
	 * clean, wired, busy, held, or mapped into a
	 * buffer, and one of the following:
	 * 1) The page is inactive, or a seldom used
	 *    active page.
	 * -or-
	 * 2) we force the issue.
	 *
	 * During heavy mmap/modification loads the pageout
	 * daemon can really fragment the underlying file
	 * due to flushing pages out of order and not trying
	 * to align the clusters (which leaves sporadic
	 * out-of-order holes).  To solve this problem we do
	 * the reverse scan first and attempt to align our
	 * cluster, then do a forward scan if room remains.
	 */
	vm_object_hold(object);

	while (ib >= 0) {
		vm_page_t p;

		p = vm_page_lookup_busy_try(object, pindex - page_base + ib,
					    TRUE, &error);
		if (error || p == NULL)
			break;
		if ((p->queue - p->pc) == PQ_CACHE ||
		    (p->flags & PG_UNQUEUED)) {
			vm_page_wakeup(p);
			break;
		}
		vm_page_test_dirty(p);
		if (((p->dirty & p->valid) == 0 &&
		     (p->flags & PG_NEED_COMMIT) == 0) ||
		    p->wire_count != 0 ||	/* may be held by buf cache */
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_wakeup(p);
			break;
		}
		if (p->queue - p->pc != PQ_INACTIVE) {
			if (p->queue - p->pc != PQ_ACTIVE ||
			    (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) {
				vm_page_wakeup(p);
				break;
			}
		}

		/*
		 * Try to maintain page groupings in the cluster.
		 */
		if (m->flags & PG_WINATCFLS)
			vm_page_flag_set(p, PG_WINATCFLS);
		else
			vm_page_flag_clear(p, PG_WINATCFLS);
		p->act_count = m->act_count;

		mc[ib] = p;
		--ib;
	}
	++ib;	/* fixup */

	while (is < BLIST_MAX_ALLOC &&
	       pindex - page_base + is < object->size) {
		vm_page_t p;

		p = vm_page_lookup_busy_try(object, pindex - page_base + is,
					    TRUE, &error);
		if (error || p == NULL)
			break;
		if (((p->queue - p->pc) == PQ_CACHE) ||
		    (p->flags & PG_UNQUEUED)) {
			vm_page_wakeup(p);
			break;
		}
		vm_page_test_dirty(p);
		if (((p->dirty & p->valid) == 0 &&
		     (p->flags & PG_NEED_COMMIT) == 0) ||
		    p->wire_count != 0 ||	/* may be held by buf cache */
		    p->hold_count != 0) {	/* may be undergoing I/O */
			vm_page_wakeup(p);
			break;
		}
		if (p->queue - p->pc != PQ_INACTIVE) {
			if (p->queue - p->pc != PQ_ACTIVE ||
			    (vmflush_flags & VM_PAGER_ALLOW_ACTIVE) == 0) {
				vm_page_wakeup(p);
				break;
			}
		}

		/*
		 * Try to maintain page groupings in the cluster.
		 */
		if (m->flags & PG_WINATCFLS)
			vm_page_flag_set(p, PG_WINATCFLS);
		else
			vm_page_flag_clear(p, PG_WINATCFLS);
		p->act_count = m->act_count;

		mc[is] = p;
		++is;
	}

	vm_object_drop(object);

	/*
	 * we allow reads during pageouts...
	 */
	return vm_pageout_flush(&mc[ib], is - ib, vmflush_flags);
}

/*
 * vm_pageout_flush() - launder the given pages
 *
 * The given pages are laundered.  Note that we setup for the start of
 * I/O (i.e. busy the page), mark it read-only, and bump the object
 * reference count all in here rather than in the parent.  If we want
 * the parent to do more sophisticated things we may have to change
 * the ordering.
 *
 * The pages in the array must be busied by the caller and will be
 * unbusied by this function.
 */
int
vm_pageout_flush(vm_page_t *mc, int count, int vmflush_flags)
{
	vm_object_t object;
	int pageout_status[count];
	int numpagedout = 0;
	int i;
	int dodebug;

	if (vm_pageout_debug > 0) {
		--vm_pageout_debug;
		dodebug = 1;
	} else {
		dodebug = 0;
	}

	/*
	 * Initiate I/O.  Bump the vm_page_t->busy counter.
	 */
	for (i = 0; i < count; i++) {
		KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
			("vm_pageout_flush page %p index %d/%d: partially "
			 "invalid page", mc[i], i, count));
		vm_page_io_start(mc[i]);
	}

	/*
	 * We must make the pages read-only.  This will also force the
	 * modified bit in the related pmaps to be cleared.  The pager
	 * cannot clear the bit for us since the I/O completion code
	 * typically runs from an interrupt.  The act of making the page
	 * read-only handles the case for us.
	 *
	 * Then we can unbusy the pages, we still hold a reference by virtue
	 * of our soft-busy.
	 */
	if (dodebug)
		kprintf("pageout(%d): ", count);
	for (i = 0; i < count; i++) {
		if (vmflush_flags & VM_PAGER_TRY_TO_CACHE)
			vm_page_protect(mc[i], VM_PROT_NONE);
		else
			vm_page_protect(mc[i], VM_PROT_READ);
		vm_page_wakeup(mc[i]);
		if (dodebug)
			kprintf(" %p", mc[i]);
	}
	if (dodebug)
		kprintf("\n");

	object = mc[0]->object;
	vm_object_pip_add(object, count);

	vm_pager_put_pages(object, mc, count,
			   (vmflush_flags |
			    ((object == &kernel_object) ?
				VM_PAGER_PUT_SYNC : 0)),
			   pageout_status);

	if (dodebug)
		kprintf("result: ");
	for (i = 0; i < count; i++) {
		vm_page_t mt = mc[i];

		if (dodebug)
			kprintf("  S%d", pageout_status[i]);

		switch (pageout_status[i]) {
		case VM_PAGER_OK:
			numpagedout++;
			break;
		case VM_PAGER_PEND:
			numpagedout++;
			break;
		case VM_PAGER_BAD:
			/*
			 * Page outside of range of object.  Right now we
			 * essentially lose the changes by pretending it
			 * worked.
			 */
			vm_page_busy_wait(mt, FALSE, "pgbad");
			pmap_clear_modify(mt);
			vm_page_undirty(mt);
			vm_page_wakeup(mt);
			break;
		case VM_PAGER_ERROR:
		case VM_PAGER_FAIL:
			/*
			 * A page typically cannot be paged out when we
			 * have run out of swap.  We leave the page
			 * marked inactive and will try to page it out
			 * again later.
			 *
			 * Starvation of the active page list is used to
			 * determine when the system is massively memory
			 * starved.
			 */
			break;
		case VM_PAGER_AGAIN:
			break;
		}

		/*
		 * If not PENDing this was a synchronous operation and we
		 * clean up after the I/O.
		 * If it is PENDing the mess is cleaned up asynchronously.
		 *
		 * Also nominally act on the caller's wishes if the caller
		 * wants to try to really clean (cache or free) the page.
		 *
		 * Also nominally deactivate the page if the system is
		 * memory-stressed.
		 */
		if (pageout_status[i] != VM_PAGER_PEND) {
			vm_page_busy_wait(mt, FALSE, "pgouw");
			vm_page_io_finish(mt);
			if (vmflush_flags & VM_PAGER_TRY_TO_CACHE) {
				vm_page_try_to_cache(mt);
				if (dodebug)
					kprintf("A[pq_cache=%d]",
						((mt->queue - mt->pc) == PQ_CACHE));
			} else if (vm_page_count_severe()) {
				vm_page_deactivate(mt);
				vm_page_wakeup(mt);
				if (dodebug)
					kprintf("B");
			} else {
				vm_page_wakeup(mt);
				if (dodebug)
					kprintf("C");
			}
			vm_object_pip_wakeup(object);
		}
	}
	if (dodebug)
		kprintf("(%d paged out)\n", numpagedout);
	return numpagedout;
}

#if !defined(NO_SWAPPING)

/*
 * Callback function, page busied for us.  We must dispose of the busy
 * condition.  Any related pmap pages may be held but will not be locked.
 */
static
int
vm_pageout_mdp_callback(struct pmap_pgscan_info *info, vm_offset_t va,
			vm_page_t p)
{
	int actcount;
	int cleanit = 0;

	/*
	 * Basic tests - There should never be a marker, and we can stop
	 *		 once the RSS is below the required level.
	 */
	KKASSERT((p->flags & PG_MARKER) == 0);
	if (pmap_resident_tlnw_count(info->pmap) <= info->limit) {
		vm_page_wakeup(p);
		return(-1);
	}

	mycpu->gd_cnt.v_pdpages++;

	if (p->wire_count || p->hold_count || (p->flags & PG_UNQUEUED)) {
		vm_page_wakeup(p);
		goto done;
	}

	++info->actioncount;

	/*
	 * Check if the page has been referenced recently.  If it has,
	 * activate it and skip.
	 */
	actcount = pmap_ts_referenced(p);
	if (actcount) {
		vm_page_flag_set(p, PG_REFERENCED);
	} else if (p->flags & PG_REFERENCED) {
		actcount = 1;
	}

	if (actcount) {
		if (p->queue - p->pc != PQ_ACTIVE) {
			vm_page_and_queue_spin_lock(p);
			if (p->queue - p->pc != PQ_ACTIVE) {
				vm_page_and_queue_spin_unlock(p);
				vm_page_activate(p);
			} else {
				vm_page_and_queue_spin_unlock(p);
			}
		} else {
			p->act_count += actcount;
			if (p->act_count > ACT_MAX)
				p->act_count = ACT_MAX;
		}
		vm_page_flag_clear(p, PG_REFERENCED);
		vm_page_wakeup(p);
		goto done;
	}

	/*
	 * Remove the page from this particular pmap.  Once we do this, our
	 * pmap scans will not see it again (unless it gets faulted in), so
	 * we must actively dispose of or deal with the page.
	 */
	pmap_remove_specific(info->pmap, p);

	/*
	 * If the page is not mapped to another process (i.e. as would be
	 * typical if this were a shared page from a library) then deactivate
	 * the page and clean it in two passes only.
	 *
	 * If the page hasn't been referenced since the last check, remove it
	 * from the pmap.  If it is no longer mapped, deactivate it
	 * immediately, accelerating the normal decline.
	 *
	 * Once the page has been removed from the pmap the RSS code no
	 * longer tracks it so we have to make sure that it is staged for
	 * potential flush action.
	 */
	if ((p->flags & PG_MAPPED) == 0 ||
	    (pmap_mapped_sync(p) & PG_MAPPED) == 0) {
		if (p->queue - p->pc == PQ_ACTIVE) {
			vm_page_deactivate(p);
		}
		if (p->queue - p->pc == PQ_INACTIVE) {
			cleanit = 1;
		}
	}

	/*
	 * Ok, try to fully clean the page and any nearby pages such that at
	 * least the requested page is freed or moved to the cache queue.
	 *
	 * We usually do this synchronously to allow us to get the page into
	 * the CACHE queue quickly, which will prevent memory exhaustion if
	 * a process with a memoryuse limit is running away.  However, the
	 * sysadmin may desire to set vm.swap_user_async which relaxes this
	 * and improves write performance.
	 */
	if (cleanit) {
		long max_launder = 0x7FFF;
		long vnodes_skipped = 0;
		int vmflush_flags;
		struct vnode *vpfailed = NULL;

		info->offset = va;

		if (vm_pageout_memuse_mode >= 2) {
			vmflush_flags = VM_PAGER_TRY_TO_CACHE |
					VM_PAGER_ALLOW_ACTIVE;
			if (swap_user_async == 0)
				vmflush_flags |= VM_PAGER_PUT_SYNC;
			vm_page_flag_set(p, PG_WINATCFLS);
			info->cleancount +=
				vm_pageout_page(p, &max_launder,
						&vnodes_skipped,
						&vpfailed, 1, vmflush_flags);
		} else {
			vm_page_wakeup(p);
			++info->cleancount;
		}
	} else {
		vm_page_wakeup(p);
	}

	/*
	 * Must be at end to avoid SMP races.
	 */
done:
	lwkt_user_yield();
	return 0;
}

/*
 * Deactivate some number of pages in a map due to set RLIMIT_RSS limits,
 * which is relatively difficult to do.  We try to keep track of where we
 * left off last time to reduce scan overhead.
 *
 * Called when vm_pageout_memuse_mode is >= 1.
 */
void
vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t limit)
{
	vm_offset_t pgout_offset;
	struct pmap_pgscan_info info;
	int retries = 3;

	pgout_offset = map->pgout_offset;
again:
#if 0
	kprintf("%016jx ", pgout_offset);
#endif
	if (pgout_offset < VM_MIN_USER_ADDRESS)
		pgout_offset = VM_MIN_USER_ADDRESS;
	if (pgout_offset >= VM_MAX_USER_ADDRESS)
		pgout_offset = 0;
	info.pmap = vm_map_pmap(map);
	info.limit = limit;
	info.beg_addr = pgout_offset;
	info.end_addr = VM_MAX_USER_ADDRESS;
	info.callback = vm_pageout_mdp_callback;
	info.cleancount = 0;
	info.actioncount = 0;
	info.busycount = 0;

	pmap_pgscan(&info);
	pgout_offset = info.offset;
#if 0
	kprintf("%016jx %08lx %08lx\n", pgout_offset,
		info.cleancount, info.actioncount);
#endif

	if (pgout_offset != VM_MAX_USER_ADDRESS &&
	    pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
		goto again;
	} else if (retries &&
		   pmap_resident_tlnw_count(vm_map_pmap(map)) > limit) {
		--retries;
		goto again;
	}
	map->pgout_offset = pgout_offset;
}
#endif

/*
 * Called when the pageout scan wants to free a page.  We no longer
 * try to cycle the vm_object here with a reference & dealloc, which can
 * cause a non-trivial object collapse in a critical path.
 *
 * It is unclear why we cycled the ref_count in the past, perhaps to try
 * to optimize shadow chain collapses but I don't quite see why it would
 * be necessary.  An OBJ_DEAD object should terminate any and all vm_pages
 * synchronously and not have to be kick-started.
 */
static void
vm_pageout_page_free(vm_page_t m)
{
	vm_page_protect(m, VM_PROT_NONE);
	vm_page_free(m);
}

/*
 * vm_pageout_scan does the dirty work for the pageout daemon.
 */
struct vm_pageout_scan_info {
	struct proc *bigproc;
	vm_offset_t bigsize;
};

static int vm_pageout_scan_callback(struct proc *p, void *data);

/*
 * Scan inactive queue
 *
 * WARNING! Can be called from two pagedaemon threads simultaneously.
 */
static int
vm_pageout_scan_inactive(int pass, int q, long avail_shortage,
			 long *vnodes_skipped)
{
	vm_page_t m;
	struct vm_page marker;
	struct vnode *vpfailed;		/* warning, allowed to be stale */
	long maxscan;
	long delta = 0;
	long max_launder;
	int isep;
	int vmflush_flags;

	isep = (curthread == emergpager);

	/*
	 * Start scanning the inactive queue for pages we can move to the
	 * cache or free.  The scan will stop when the target is reached or
	 * we have scanned the entire inactive queue.  Note that m->act_count
	 * is not used to form decisions for the inactive queue, only for the
	 * active queue.
	 *
	 * max_launder limits the number of dirty pages we flush per scan.
	 * For most systems a smaller value (16 or 32) is more robust under
	 * extreme memory and disk pressure because any unnecessary writes
	 * to disk can result in extreme performance degradation.  However,
	 * systems with excessive dirty pages (especially when MAP_NOSYNC is
	 * used) will die horribly with limited laundering.  If the pageout
	 * daemon cannot clean enough pages in the first pass, we let it go
	 * all out in succeeding passes.
	 *
	 * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
	 *	  PAGES.
	 */
	if ((max_launder = vm_max_launder) <= 1)
		max_launder = 1;
	if (pass)
		max_launder = 10000;

	/*
	 * Initialize our marker
	 */
	bzero(&marker, sizeof(marker));
	marker.flags = PG_FICTITIOUS | PG_MARKER;
	marker.busy_count = PBUSY_LOCKED;
	marker.queue = PQ_INACTIVE + q;
	marker.pc = q;
	marker.wire_count = 1;

	/*
	 * Inactive queue scan.
	 *
	 * We pick off approximately 1/10 of each queue.  Each queue is
	 * effectively organized LRU so scanning the entire queue would
	 * improperly pick up pages that might still be in regular use.
	 *
	 * NOTE: The vm_page must be spinlocked before the queue to avoid
	 *	 deadlocks, so it is easiest to simply iterate the loop
	 *	 with the queue unlocked at the top.
	 */
	vpfailed = NULL;

	vm_page_queues_spin_lock(PQ_INACTIVE + q);
	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
	maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt / 10 + 1;

	/*
	 * Queue locked at top of loop to avoid stack marker issues.
	 */
	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
	       maxscan-- > 0 && avail_shortage - delta > 0)
	{
		int count;

		KKASSERT(m->queue == PQ_INACTIVE + q);
		TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl,
			     &marker, pageq);
		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m,
				   &marker, pageq);
		mycpu->gd_cnt.v_pdpages++;

		/*
		 * Skip marker pages (atomic against other markers to avoid
		 * infinite hop-over scans).
		 */
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Try to busy the page.  Don't mess with pages which are
		 * already busy or reorder them in the queue.
		 */
		if (vm_page_busy_try(m, TRUE))
			continue;

		/*
		 * Remaining operations run with the page busy and neither
		 * the page nor the queue will be spin-locked.
		 */
		KKASSERT(m->queue == PQ_INACTIVE + q);
		vm_page_queues_spin_unlock(PQ_INACTIVE + q);

		/*
		 * The emergency pager runs when the primary pager gets
		 * stuck, which typically means the primary pager deadlocked
		 * on a vnode-backed page.  Therefore, the emergency pager
		 * must skip any complex objects.
		 *
		 * We disallow VNODEs unless they are VCHR whose device ops
		 * do not flag D_NOEMERGPGR.
		 */
		if (isep && m->object) {
			struct vnode *vp;

			switch(m->object->type) {
			case OBJT_DEFAULT:
			case OBJT_SWAP:
				/*
				 * Allow anonymous memory and assume that
				 * swap devices are not complex, since it's
				 * kinda worthless if we can't swap out dirty
				 * anonymous pages.
				 */
				break;
			case OBJT_VNODE:
				/*
				 * Allow VCHR device if the D_NOEMERGPGR
				 * flag is not set, deny other vnode types
				 * as being too complex.
				 */
				vp = m->object->handle;
				if (vp && vp->v_type == VCHR &&
				    vp->v_rdev && vp->v_rdev->si_ops &&
				    (vp->v_rdev->si_ops->head.flags &
				     D_NOEMERGPGR) == 0) {
					break;
				}
				/* Deny - fall through */
			default:
				/*
				 * Deny
				 */
				vm_page_wakeup(m);
				vm_page_queues_spin_lock(PQ_INACTIVE + q);
				lwkt_yield();
				continue;
			}
		}

		/*
		 * Try to pageout the page and perhaps other nearby pages.
		 * We want to get the pages into the cache eventually
		 * (first or second pass).  Otherwise the pages can wind up
		 * just cycling in the inactive queue, getting flushed over
		 * and over again.
		 */
		if (vm_pageout_memuse_mode >= 2)
			vm_page_flag_set(m, PG_WINATCFLS);

		vmflush_flags = 0;
		if (vm_pageout_allow_active)
			vmflush_flags |= VM_PAGER_ALLOW_ACTIVE;
		if (m->flags & PG_WINATCFLS)
			vmflush_flags |= VM_PAGER_TRY_TO_CACHE;
		count = vm_pageout_page(m, &max_launder, vnodes_skipped,
					&vpfailed, pass, vmflush_flags);
		delta += count;

		/*
		 * Systems with a ton of memory can wind up with huge
		 * deactivation counts.  Because the inactive scan is
		 * doing a lot of flushing, the combination can result
		 * in excessive paging even in situations where other
		 * unrelated threads free up sufficient VM.
		 *
		 * To deal with this we abort the nominal active->inactive
		 * scan before we hit the inactive target when free+cache
		 * levels have reached a reasonable target.
		 *
		 * When deciding to stop early we need to add some slop to
		 * the test and we need to return full completion to the
		 * caller to prevent the caller from thinking there is
		 * something wrong and issuing a low-memory+swap warning
		 * or pkill.
		 *
		 * A deficit forces paging regardless of the state of the
		 * VM page queues (used for RSS enforcement).
		 */
		lwkt_yield();
		vm_page_queues_spin_lock(PQ_INACTIVE + q);
		if (vm_paging_target() < -vm_max_launder) {
			/*
			 * Stopping early, return full completion to caller.
			 */
			if (delta < avail_shortage)
				delta = avail_shortage;
			break;
		}
	}

	/* page queue still spin-locked */
	TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
	vm_page_queues_spin_unlock(PQ_INACTIVE + q);

	return (delta);
}

/*
 * Pageout the specified page, return the total number of pages paged out
 * (this routine may cluster).
 *
 * The page must be busied and soft-busied by the caller and will be disposed
 * of by this function.
 */
static int
vm_pageout_page(vm_page_t m, long *max_launderp, long *vnodes_skippedp,
		struct vnode **vpfailedp, int pass, int vmflush_flags)
{
	vm_object_t object;
	int actcount;
	int count = 0;

	/*
	 * Wiring no longer removes a page from its queue.  The last unwiring
	 * will requeue the page.  Obviously wired pages cannot be paged out
	 * so unqueue it and return.
	 */
	if (m->wire_count) {
		vm_page_unqueue_nowakeup(m);
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * A held page may be undergoing I/O, so skip it.
	 */
	if (m->hold_count) {
		vm_page_and_queue_spin_lock(m);
		if (m->queue - m->pc == PQ_INACTIVE) {
			TAILQ_REMOVE(
				&vm_page_queues[m->queue].pl, m, pageq);
			TAILQ_INSERT_TAIL(
				&vm_page_queues[m->queue].pl, m, pageq);
		}
		vm_page_and_queue_spin_unlock(m);
		vm_page_wakeup(m);
		return 0;
	}

	if (m->object == NULL || m->object->ref_count == 0) {
		/*
		 * If the object is not being used, we ignore previous
		 * references.
		 */
		vm_page_flag_clear(m, PG_REFERENCED);
		pmap_clear_reference(m);
		/* fall through to end */
	} else if (((m->flags & PG_REFERENCED) == 0) &&
		   (actcount = pmap_ts_referenced(m))) {
		/*
		 * Otherwise, if the page has been referenced while
		 * in the inactive queue, we bump the "activation
		 * count" upwards, making it less likely that the
		 * page will be added back to the inactive queue
		 * prematurely again.  Here we check the page tables
		 * (or emulated bits, if any), since the upper level
		 * VM system knows nothing about existing references.
		 */
		vm_page_activate(m);
		m->act_count += (actcount + ACT_ADVANCE);
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * (m) is still busied.
	 *
	 * If the upper level VM system knows about any page
	 * references, we activate the page.  We also set the
	 * "activation count" higher than normal so that we are
	 * less likely to place pages back onto the inactive
	 * queue again.
	 */
	if ((m->flags & PG_REFERENCED) != 0) {
		vm_page_flag_clear(m, PG_REFERENCED);
		actcount = pmap_ts_referenced(m);
		vm_page_activate(m);
		m->act_count += (actcount + ACT_ADVANCE + 1);
		vm_page_wakeup(m);
		return 0;
	}

	/*
	 * If the upper level VM system doesn't know anything about
	 * the page being dirty, we have to check for it again.  As
	 * far as the VM code knows, any partially dirty pages are
	 * fully dirty.
	 *
	 * Pages marked PG_WRITEABLE may be mapped into the user
	 * address space of a process running on another cpu.  A
	 * user process (without holding the MP lock) running on
	 * another cpu may be able to touch the page while we are
	 * trying to remove it.  vm_page_cache() will handle this
	 * case for us.
	 */
	if (m->dirty == 0) {
		vm_page_test_dirty(m);
	} else {
		vm_page_dirty(m);
	}

	if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
		/*
		 * Invalid pages can be easily freed
		 */
		vm_pageout_page_free(m);
		mycpu->gd_cnt.v_dfree++;
		++count;
	} else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) {
		/*
		 * Clean pages can be placed onto the cache queue.
		 * This effectively frees them.
		 */
		vm_page_cache(m);
		++count;
	} else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
		/*
		 * Dirty pages need to be paged out, but flushing
		 * a page is extremely expensive versus freeing
		 * a clean page.  Rather than artificially limiting
		 * the number of pages we can flush, we instead give
		 * dirty pages extra priority on the inactive queue
		 * by forcing them to be cycled through the queue
		 * twice before being flushed, after which the
		 * (now clean) page will cycle through once more
		 * before being freed.  This significantly extends
		 * the thrash point for a heavily loaded machine.
		 */
		vm_page_flag_set(m, PG_WINATCFLS);
		vm_page_and_queue_spin_lock(m);
		if (m->queue - m->pc == PQ_INACTIVE) {
			TAILQ_REMOVE(
				&vm_page_queues[m->queue].pl, m, pageq);
			TAILQ_INSERT_TAIL(
				&vm_page_queues[m->queue].pl, m, pageq);
		}
		vm_page_and_queue_spin_unlock(m);
		vm_page_wakeup(m);
	} else if (*max_launderp > 0) {
		/*
		 * We always want to try to flush some dirty pages if
		 * we encounter them, to keep the system stable.
		 * Normally this number is small, but under extreme
		 * pressure where there are insufficient clean pages
		 * on the inactive queue, we may have to go all out.
		 */
		int swap_pageouts_ok;
		struct vnode *vp = NULL;

		swap_pageouts_ok = 0;
		object = m->object;
		if (object &&
		    (object->type != OBJT_SWAP) &&
		    (object->type != OBJT_DEFAULT)) {
			swap_pageouts_ok = 1;
		} else {
			swap_pageouts_ok = !(defer_swap_pageouts ||
					     disable_swap_pageouts);
			swap_pageouts_ok |= (!disable_swap_pageouts &&
					     defer_swap_pageouts &&
					     vm_page_count_min(0));
		}

		/*
		 * We don't bother paging objects that are "dead".
		 * Those objects are in a "rundown" state.
		 */
		if (!swap_pageouts_ok ||
		    (object == NULL) ||
		    (object->flags & OBJ_DEAD)) {
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_INACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[m->queue].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[m->queue].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
			return 0;
		}

		/*
		 * (m) is still busied.
		 *
		 * The object is already known NOT to be dead.  It
		 * is possible for the vget() to block the whole
		 * pageout daemon, but the new low-memory handling
		 * code should prevent it.
		 *
		 * The previous code skipped locked vnodes and, worse,
		 * reordered pages in the queue.  This results in
		 * completely non-deterministic operation because,
		 * quite often, a vm_fault has initiated an I/O and
		 * is holding a locked vnode at just the point where
		 * the pageout daemon is woken up.
		 *
		 * We can't wait forever for the vnode lock, we might
		 * deadlock due to a vn_read() getting stuck in
		 * vm_wait while holding this vnode.  We skip the
		 * vnode if we can't get it in a reasonable amount
		 * of time.
		 *
		 * vpfailed is used to (try to) avoid the case where
		 * a large number of pages are associated with a
		 * locked vnode, which could cause the pageout daemon
		 * to stall for an excessive amount of time.
		 */
		if (object->type == OBJT_VNODE) {
			int flags;

			vp = object->handle;
			flags = LK_EXCLUSIVE;
			if (vp == *vpfailedp)
				flags |= LK_NOWAIT;
			else
				flags |= LK_TIMELOCK;
			vm_page_hold(m);
			vm_page_wakeup(m);

			/*
			 * We have unbusied (m) temporarily so we can
			 * acquire the vp lock without deadlocking.
			 * (m) is held to prevent destruction.
			 */
			if (vget(vp, flags) != 0) {
				*vpfailedp = vp;
				++pageout_lock_miss;
				if (object->flags & OBJ_MIGHTBEDIRTY)
					++*vnodes_skippedp;
				vm_page_unhold(m);
				return 0;
			}

			/*
			 * The page might have been moved to another
			 * queue during potential blocking in vget()
			 * above.  The page might have been freed and
			 * reused for another vnode.  The object might
			 * have been reused for another vnode.
			 */
			if (m->queue - m->pc != PQ_INACTIVE ||
			    m->object != object ||
			    object->handle != vp) {
				if (object->flags & OBJ_MIGHTBEDIRTY)
					++*vnodes_skippedp;
				vput(vp);
				vm_page_unhold(m);
				return 0;
			}

			/*
			 * The page may have been busied during the
			 * blocking in vput();  We don't move the
			 * page back onto the end of the queue so that
			 * statistics are more correct if we don't.
			 */
			if (vm_page_busy_try(m, TRUE)) {
				vput(vp);
				vm_page_unhold(m);
				return 0;
			}
			vm_page_unhold(m);

			/*
			 * If it was wired while we didn't own it.
			 */
			if (m->wire_count) {
				vm_page_unqueue_nowakeup(m);
				vput(vp);
				vm_page_wakeup(m);
				return 0;
			}

			/*
			 * (m) is busied again
			 *
			 * We own the busy bit and remove our hold
			 * bit.  If the page is still held it
			 * might be undergoing I/O, so skip it.
			 */
			if (m->hold_count) {
				vm_page_and_queue_spin_lock(m);
				if (m->queue - m->pc == PQ_INACTIVE) {
					TAILQ_REMOVE(
					    &vm_page_queues[m->queue].pl,
					    m, pageq);
					TAILQ_INSERT_TAIL(
					    &vm_page_queues[m->queue].pl,
					    m, pageq);
				}
				vm_page_and_queue_spin_unlock(m);
				if (object->flags & OBJ_MIGHTBEDIRTY)
					++*vnodes_skippedp;
				vm_page_wakeup(m);
				vput(vp);
				return 0;
			}
			/* (m) is left busied as we fall through */
		}

		/*
		 * page is busy and not held here.
		 *
		 * If a page is dirty, then it is either being washed
		 * (but not yet cleaned) or it is still in the
		 * laundry.  If it is still in the laundry, then we
		 * start the cleaning operation.
		 *
		 * decrement inactive_shortage on success to account
		 * for the (future) cleaned page.  Otherwise we
		 * could wind up laundering or cleaning too many
		 * pages.
		 *
		 * NOTE: Cleaning the page here does not cause
		 *	 force_deficit to be adjusted, because the
		 *	 page is not being freed or moved to the
		 *	 cache.
		 */
		count = vm_pageout_clean_helper(m, vmflush_flags);
		*max_launderp -= count;

		/*
		 * Clean ate busy, page no longer accessible
		 */
		if (vp != NULL)
			vput(vp);
	} else {
		vm_page_wakeup(m);
	}
	return count;
}
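
/*
 * Recap of the disposition cases in vm_pageout_page() above (descriptive
 * note only): wired or held pages are unqueued/requeued and 0 is returned,
 * referenced pages are reactivated and 0 is returned, invalid pages are
 * freed and clean pages are cached (count of 1), and dirty pages either
 * get a second trip through the inactive queue (PG_WINATCFLS) or are
 * laundered via vm_pageout_clean_helper(), which may cluster and return
 * a count greater than 1.
 */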

/*
 * Scan active queue
 *
 * WARNING! Can be called from two pagedaemon threads simultaneously.
 */
static int
vm_pageout_scan_active(int pass, int q,
		       long avail_shortage, long inactive_shortage,
		       long *recycle_countp)
{
	struct vm_page marker;
	vm_page_t m;
	int actcount;
	long delta = 0;
	long maxscan;
	int isep;

	isep = (curthread == emergpager);

	/*
	 * We want to move pages from the active queue to the inactive
	 * queue to get the inactive queue to the inactive target.  If
	 * we still have a page shortage from above we try to directly free
	 * clean pages instead of moving them.
	 *
	 * If we do still have a shortage we keep track of the number of
	 * pages we free or cache (recycle_count) as a measure of thrashing
	 * between the active and inactive queues.
	 *
	 * If we were able to completely satisfy the free+cache targets
	 * from the inactive pool we limit the number of pages we move
	 * from the active pool to the inactive pool to 2x the pages we
	 * had removed from the inactive pool (with a minimum of 1/5 the
	 * inactive target).  If we were not able to completely satisfy
	 * the free+cache targets we go for the whole target aggressively.
	 *
	 * NOTE: Both variables can end up negative.
	 * NOTE: We are still in a critical section.
	 *
	 * NOTE!  THE EMERGENCY PAGER (isep) DOES NOT LAUNDER VNODE-BACKED
	 *	  PAGES.
	 */

	bzero(&marker, sizeof(marker));
	marker.flags = PG_FICTITIOUS | PG_MARKER;
	marker.busy_count = PBUSY_LOCKED;
	marker.queue = PQ_ACTIVE + q;
	marker.pc = q;
	marker.wire_count = 1;

	vm_page_queues_spin_lock(PQ_ACTIVE + q);
	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
	maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt / 10 + 1;

	/*
	 * Queue locked at top of loop to avoid stack marker issues.
	 */
	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
	       maxscan-- > 0 && (avail_shortage - delta > 0 ||
				 inactive_shortage > 0))
	{
		KKASSERT(m->queue == PQ_ACTIVE + q);
		TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
			     &marker, pageq);
		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
				   &marker, pageq);

		/*
		 * Skip marker pages (atomic against other markers to avoid
		 * infinite hop-over scans).
		 */
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Try to busy the page.  Don't mess with pages which are
		 * already busy or reorder them in the queue.
		 */
		if (vm_page_busy_try(m, TRUE))
			continue;

		/*
		 * Remaining operations run with the page busy and neither
		 * the page nor the queue will be spin-locked.
		 */
		KKASSERT(m->queue == PQ_ACTIVE + q);
		vm_page_queues_spin_unlock(PQ_ACTIVE + q);

#if 0
		/*
		 * Don't deactivate pages that are held, even if we can
		 * busy them.  (XXX why not?)
		 */
		if (m->hold_count) {
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
			goto next;
		}
#endif
		/*
		 * We can just remove wired pages from the queue
		 */
		if (m->wire_count) {
			vm_page_unqueue_nowakeup(m);
			vm_page_wakeup(m);
			goto next;
		}

		/*
		 * The emergency pager ignores vnode-backed pages as these
		 * are the pages that probably bricked the main pager.
		 */
		if (isep && m->object && m->object->type == OBJT_VNODE) {
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
			goto next;
		}

		/*
		 * The count for pagedaemon pages is done after checking the
		 * page for eligibility...
		 */
		mycpu->gd_cnt.v_pdpages++;

		/*
		 * Check to see "how much" the page has been used and clear
		 * the tracking access bits.  If the object has no references
		 * don't bother paying the expense.
		 */
		actcount = 0;
		if (m->object && m->object->ref_count != 0) {
			if (m->flags & PG_REFERENCED)
				++actcount;
			actcount += pmap_ts_referenced(m);
			if (actcount) {
				m->act_count += ACT_ADVANCE + actcount;
				if (m->act_count > ACT_MAX)
					m->act_count = ACT_MAX;
			}
		}
		vm_page_flag_clear(m, PG_REFERENCED);

		/*
		 * actcount is only valid if the object ref_count is non-zero.
		 * If the page does not have an object, actcount will be zero.
		 */
		if (actcount && m->object->ref_count != 0) {
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
		} else {
			switch(m->object->type) {
			case OBJT_DEFAULT:
			case OBJT_SWAP:
				m->act_count -= min(m->act_count,
						    vm_anonmem_decline);
				break;
			default:
				m->act_count -= min(m->act_count,
						    vm_filemem_decline);
				break;
			}
			if (vm_pageout_algorithm ||
			    (m->object == NULL) ||
			    (m->object && (m->object->ref_count == 0)) ||
			    m->act_count < pass + 1
			) {
				/*
				 * Deactivate the page.  If we had a
				 * shortage from our inactive scan try to
				 * free (cache) the page instead.
				 *
				 * Don't just blindly cache the page if
				 * we do not have a shortage from the
				 * inactive scan, that could lead to
				 * gigabytes being moved.
				 */
				--inactive_shortage;
				if (avail_shortage - delta > 0 ||
				    (m->object && (m->object->ref_count == 0)))
				{
					if (avail_shortage - delta > 0)
						++*recycle_countp;
					vm_page_protect(m, VM_PROT_NONE);
					if (m->dirty == 0 &&
					    (m->flags & PG_NEED_COMMIT) == 0 &&
					    avail_shortage - delta > 0) {
						vm_page_cache(m);
					} else {
						vm_page_deactivate(m);
						vm_page_wakeup(m);
					}
				} else {
					vm_page_deactivate(m);
					vm_page_wakeup(m);
				}
				++delta;
			} else {
				vm_page_and_queue_spin_lock(m);
				if (m->queue - m->pc == PQ_ACTIVE) {
					TAILQ_REMOVE(
					    &vm_page_queues[PQ_ACTIVE + q].pl,
					    m, pageq);
					TAILQ_INSERT_TAIL(
					    &vm_page_queues[PQ_ACTIVE + q].pl,
					    m, pageq);
				}
				vm_page_and_queue_spin_unlock(m);
				vm_page_wakeup(m);
			}
		}
next:
		lwkt_yield();
		vm_page_queues_spin_lock(PQ_ACTIVE + q);
	}

	/*
	 * Clean out our local marker.
	 *
	 * Page queue still spin-locked.
	 */
	TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
	vm_page_queues_spin_unlock(PQ_ACTIVE + q);

	return (delta);
}

/*
 * The number of actually free pages can drop down to v_free_reserved,
 * we try to build the free count back above v_free_min.  Note that
 * vm_paging_needed() also returns TRUE if v_free_count is not at
 * least v_free_min so that is the minimum we must build the free
 * count to.
 *
 * We use a slightly higher target to improve hysteresis,
 * ((v_free_target + v_free_min) / 2).  Since v_free_target
 * is usually the same as v_cache_min this maintains about
 * half the pages in the free queue as are in the cache queue,
 * providing pretty good pipelining for pageout operation.
 *
 * The system operator can manipulate vm.v_cache_min and
 * vm.v_free_target to tune the pageout daemon.  Be sure
 * to keep vm.v_free_min < vm.v_free_target.
 *
 * Note that the original paging target is to get at least
 * (free_min + cache_min) into (free + cache).  The slightly
 * higher target will shift additional pages from cache to free
 * without affecting the original paging target in order to
 * maintain better hysteresis and not have the free count always
 * be dead-on v_free_min.
 *
 * NOTE: we are still in a critical section.
 *
 * Pages moved from PQ_CACHE to totally free are not counted in the
 * pages_freed counter.
 *
 * WARNING! Can be called from two pagedaemon threads simultaneously.
 */
static void
vm_pageout_scan_cache(long avail_shortage, int pass,
		      long vnodes_skipped, long recycle_count)
{
	static int lastkillticks;
	struct vm_pageout_scan_info info;
	vm_page_t m;
	int isep;

	isep = (curthread == emergpager);

	while (vmstats.v_free_count <
	       (vmstats.v_free_min + vmstats.v_free_target) / 2) {
		/*
		 * This steals some code from vm/vm_page.c
		 *
		 * Create two rovers and adjust the code to reduce
		 * chances of them winding up at the same index (which
		 * can cause a lot of contention).
		 */
		static int cache_rover[2] = { 0, PQ_L2_MASK / 2 };

		if (((cache_rover[0] ^ cache_rover[1]) & PQ_L2_MASK) == 0)
			goto next_rover;

		m = vm_page_list_find(PQ_CACHE, cache_rover[isep] & PQ_L2_MASK);
		if (m == NULL)
			break;
		/*
		 * page is returned removed from its queue and spinlocked
		 *
		 * If the busy attempt fails we can still deactivate the page.
		 */
		if (vm_page_busy_try(m, TRUE)) {
			vm_page_deactivate_locked(m);
			vm_page_spin_unlock(m);
			continue;
		}
		vm_page_spin_unlock(m);
		pagedaemon_wakeup();
		lwkt_yield();

		/*
		 * Remaining operations run with the page busy and neither
		 * the page nor the queue will be spin-locked.
		 */
		if ((m->flags & (PG_UNQUEUED | PG_NEED_COMMIT)) ||
		    m->hold_count ||
		    m->wire_count) {
			vm_page_deactivate(m);
			vm_page_wakeup(m);
			continue;
		}
		pmap_mapped_sync(m);
		KKASSERT((m->flags & PG_MAPPED) == 0);
		KKASSERT(m->dirty == 0);
		vm_pageout_page_free(m);
		mycpu->gd_cnt.v_dfree++;
next_rover:
		if (isep)
			cache_rover[1] -= PQ_PRIME2;
		else
			cache_rover[0] += PQ_PRIME2;
	}

#if !defined(NO_SWAPPING)
	/*
	 * Idle process swapout -- run once per second.
	 */
	if (vm_swap_idle_enabled) {
		static time_t lsec;
		if (time_uptime != lsec) {
			atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_IDLE);
			vm_req_vmdaemon();
			lsec = time_uptime;
		}
	}
#endif

	/*
	 * If we didn't get enough free pages, and we have skipped a vnode
	 * in a writeable object, wakeup the sync daemon.  And kick swapout
	 * if we did not get enough free pages.
	 */
	if (vm_paging_target() > 0) {
		if (vnodes_skipped && vm_page_count_min(0))
			speedup_syncer(NULL);
#if !defined(NO_SWAPPING)
		if (vm_swap_enabled && vm_page_count_target()) {
			atomic_set_int(&vm_pageout_req_swapout, VM_SWAP_NORMAL);
			vm_req_vmdaemon();
		}
#endif
	}

	/*
	 * Handle catastrophic conditions.  Under good conditions we should
	 * be at the target, well beyond our minimum.  If we could not even
	 * reach our minimum the system is under heavy stress.  But just being
	 * under heavy stress does not trigger process killing.
	 *
	 * We consider ourselves to have run out of memory if the swap pager
	 * is full and avail_shortage is still positive.  The secondary check
	 * ensures that we do not kill processes if the instantaneous
	 * availability is good, even if the pageout daemon pass says it
	 * couldn't get to the target.
	 *
	 * NOTE! THE EMERGENCY PAGER (isep) DOES NOT HANDLE SWAP FULL
	 *	 SITUATIONS.
	 */
	if (swap_pager_almost_full &&
	    pass > 0 &&
	    isep == 0 &&
	    (vm_page_count_min(recycle_count) || avail_shortage > 0)) {
		kprintf("Warning: system low on memory+swap "
			"shortage %ld for %d ticks!\n",
			avail_shortage, ticks - swap_fail_ticks);
		if (bootverbose)
			kprintf("Metrics: spaf=%d spf=%d pass=%d "
				"avail=%ld target=%ld last=%u\n",
				swap_pager_almost_full,
				swap_pager_full,
				pass,
				avail_shortage,
				vm_paging_target(),
				(unsigned int)(ticks - lastkillticks));
	}
	if (swap_pager_full &&
	    pass > 1 &&
	    isep == 0 &&
	    avail_shortage > 0 &&
	    vm_paging_target() > 0 &&
	    (unsigned int)(ticks - lastkillticks) >= hz) {
		/*
		 * Kill something, maximum rate once per second to give
		 * the process time to free up sufficient memory.
		 */
		lastkillticks = ticks;
		info.bigproc = NULL;
		info.bigsize = 0;
		allproc_scan(vm_pageout_scan_callback, &info, 0);
		if (info.bigproc != NULL) {
			kprintf("Try to kill process %d %s\n",
				info.bigproc->p_pid, info.bigproc->p_comm);
			info.bigproc->p_nice = PRIO_MIN;
			info.bigproc->p_usched->resetpriority(
				FIRST_LWP_IN_PROC(info.bigproc));
			atomic_set_int(&info.bigproc->p_flags, P_LOWMEMKILL);
			killproc(info.bigproc, "out of swap space");
			wakeup(&vmstats.v_free_count);
			PRELE(info.bigproc);
		}
	}
}

static int
vm_pageout_scan_callback(struct proc *p, void *data)
{
	struct vm_pageout_scan_info *info = data;
	vm_offset_t size;

	/*
	 * Never kill system processes or init.  If we have configured swap
	 * then try to avoid killing low-numbered pids.
	 */
	if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) ||
	    ((p->p_pid < 48) && (vm_swap_size != 0))) {
		return (0);
	}

	lwkt_gettoken(&p->p_token);

	/*
	 * If the process is in a non-running type state,
	 * don't touch it.
	 */
	if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
		lwkt_reltoken(&p->p_token);
		return (0);
	}

	/*
	 * Get the approximate process size.  Note that anonymous pages
	 * with backing swap will be counted twice, but there should not
	 * be too many such pages due to the stress the VM system is
	 * under at this point.
	 */
	size = vmspace_anonymous_count(p->p_vmspace) +
	       vmspace_swap_count(p->p_vmspace);

	/*
	 * If this process is bigger than the biggest one so far,
	 * remember it.
	 */
	if (info->bigsize < size) {
		if (info->bigproc)
			PRELE(info->bigproc);
		PHOLD(p);
		info->bigproc = p;
		info->bigsize = size;
	}
	lwkt_reltoken(&p->p_token);
	lwkt_yield();

	return(0);
}

/*
 * This old guy slowly walks PQ_HOLD looking for pages which need to be
 * moved back to PQ_FREE.  It is possible for pages to accumulate here
 * when vm_page_free() races against vm_page_unhold(), resulting in a
 * page being left on a PQ_HOLD queue with hold_count == 0.
 *
 * It is easier to handle this edge condition here, in non-critical code,
 * rather than enforce a spin-lock for every 1->0 transition in
 * vm_page_unhold().
 *
 * NOTE: TAILQ_FOREACH becomes invalid the instant we unlock the queue.
 */
static void
vm_pageout_scan_hold(int q)
{
	vm_page_t m;

	vm_page_queues_spin_lock(PQ_HOLD + q);
	TAILQ_FOREACH(m, &vm_page_queues[PQ_HOLD + q].pl, pageq) {
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Process one page and return
		 */
		if (m->hold_count)
			break;
		kprintf("DEBUG: pageout HOLD->FREE %p\n", m);
		vm_page_hold(m);
		vm_page_queues_spin_unlock(PQ_HOLD + q);
		vm_page_unhold(m);	/* reprocess */
		return;
	}
	vm_page_queues_spin_unlock(PQ_HOLD + q);
}

/*
 * This routine tries to maintain the pseudo-LRU active queue so that
 * some statistics still accumulate during long periods when there is
 * no paging.  This helps the situation where paging is just starting
 * to occur.
 */
static void
vm_pageout_page_stats(int q)
{
	static int fullintervalcount = 0;
	struct vm_page marker;
	vm_page_t m;
	long pcount, tpcount;		/* Number of pages to check */
	long page_shortage;

	page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max +
			 vmstats.v_free_min) -
			(vmstats.v_free_count + vmstats.v_inactive_count +
			 vmstats.v_cache_count);

	if (page_shortage <= 0)
		return;

	pcount = vm_page_queues[PQ_ACTIVE + q].lcnt;
	fullintervalcount += vm_pageout_stats_interval;
	if (fullintervalcount < vm_pageout_full_stats_interval) {
		tpcount = (vm_pageout_stats_max * pcount) /
			  vmstats.v_page_count + 1;
		if (pcount > tpcount)
			pcount = tpcount;
	} else {
		fullintervalcount = 0;
	}

	bzero(&marker, sizeof(marker));
	marker.flags = PG_FICTITIOUS | PG_MARKER;
	marker.busy_count = PBUSY_LOCKED;
	marker.queue = PQ_ACTIVE + q;
	marker.pc = q;
	marker.wire_count = 1;

	vm_page_queues_spin_lock(PQ_ACTIVE + q);
	TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
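
	/*
	 * The marker allows the queue spinlock to be dropped while a page
	 * is busied.  After each page is processed the marker is moved to
	 * just after that page and the scan resumes from
	 * TAILQ_NEXT(&marker).
	 */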

	/*
	 * Queue locked at top of loop to avoid stack marker issues.
	 */
	while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
	       pcount-- > 0)
	{
		int actcount;

		KKASSERT(m->queue == PQ_ACTIVE + q);
		TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
		TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
				   &marker, pageq);

		/*
		 * Skip marker pages (atomic against other markers to avoid
		 * infinite hop-over scans).
		 */
		if (m->flags & PG_MARKER)
			continue;

		/*
		 * Ignore pages we can't busy
		 */
		if (vm_page_busy_try(m, TRUE))
			continue;

		/*
		 * Remaining operations run with the page busy and neither
		 * the page nor the queue will be spin-locked.
		 */
		KKASSERT(m->queue == PQ_ACTIVE + q);
		vm_page_queues_spin_unlock(PQ_ACTIVE + q);

		/*
		 * We can just remove wired pages from the queue
		 */
		if (m->wire_count) {
			vm_page_unqueue_nowakeup(m);
			vm_page_wakeup(m);
			goto next;
		}

		/*
		 * We now have a safely busied page, the page and queue
		 * spinlocks have been released.
		 *
		 * Ignore held and wired pages
		 */
		if (m->hold_count || m->wire_count) {
			vm_page_wakeup(m);
			goto next;
		}

		/*
		 * Calculate activity
		 */
		actcount = 0;
		if (m->flags & PG_REFERENCED) {
			vm_page_flag_clear(m, PG_REFERENCED);
			actcount += 1;
		}
		actcount += pmap_ts_referenced(m);

		/*
		 * Update act_count and move page to end of queue.
		 */
		if (actcount) {
			m->act_count += ACT_ADVANCE + actcount;
			if (m->act_count > ACT_MAX)
				m->act_count = ACT_MAX;
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
			vm_page_wakeup(m);
			goto next;
		}

		if (m->act_count == 0) {
			/*
			 * We turn off page access, so that we have
			 * more accurate RSS stats.  We don't do this
			 * in the normal page deactivation when the
			 * system is loaded VM wise, because the
			 * cost of the large number of page protect
			 * operations would be higher than the value
			 * of doing the operation.
			 *
			 * We use the marker to save our place so
			 * we can release the spin lock.  Both (m)
			 * and (next) will be invalid.
			 */
			vm_page_protect(m, VM_PROT_NONE);
			vm_page_deactivate(m);
		} else {
			m->act_count -= min(m->act_count, ACT_DECLINE);
			vm_page_and_queue_spin_lock(m);
			if (m->queue - m->pc == PQ_ACTIVE) {
				TAILQ_REMOVE(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
				TAILQ_INSERT_TAIL(
					&vm_page_queues[PQ_ACTIVE + q].pl,
					m, pageq);
			}
			vm_page_and_queue_spin_unlock(m);
		}
		vm_page_wakeup(m);
next:
		vm_page_queues_spin_lock(PQ_ACTIVE + q);
	}

	/*
	 * Remove our local marker
	 *
	 * Page queue still spin-locked.
	 */
	TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
	vm_page_queues_spin_unlock(PQ_ACTIVE + q);
}

static void
vm_pageout_free_page_calc(vm_size_t count)
{
	/*
	 * v_free_min		normal allocations
	 * v_free_reserved	system allocations
	 * v_pageout_free_min	allocations by pageout daemon
	 * v_interrupt_free_min	low level allocations (e.g. swap structures)
	 *
	 * v_free_min is used to generate several other baselines, and they
	 * can get pretty silly on systems with a lot of memory.
	 */
	vmstats.v_free_min = 64 + vmstats.v_page_count / 200;
	vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
	vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
	vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
	vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;
}
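
/*
 * Example (illustrative only): with roughly one million 4KB pages (~4GB
 * of RAM) the calculation above yields v_free_min ~5300 pages (~21MB),
 * v_free_reserved ~2660, v_free_severe ~2650, v_pageout_free_min ~1330
 * and v_interrupt_free_min ~670.
 */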

/*
 * vm_pageout is the high level pageout daemon.  TWO kernel threads run
 * this daemon, the primary pageout daemon and the emergency pageout daemon.
 *
 * The emergency pageout daemon takes over when the primary pageout daemon
 * deadlocks.  The emergency pageout daemon ONLY pages out to swap, thus
 * avoiding the many low-memory deadlocks which can occur when paging out
 * to VFS's.
 */
static void
vm_pageout_thread(void)
{
	int pass;
	int q;
	int q1iterator = 0;
	int q2iterator = 0;
	int q3iterator = 0;
	int isep;

	curthread->td_flags |= TDF_SYSTHREAD;

	/*
	 * We only need to set up once.
	 */
	isep = 0;
	if (curthread == emergpager) {
		isep = 1;
		goto skip_setup;
	}

	/*
	 * Initialize some paging parameters.
	 */
	vm_pageout_free_page_calc(vmstats.v_page_count);

	/*
	 * v_free_target and v_cache_min control pageout hysteresis.  Note
	 * that these are more a measure of the VM cache queue hysteresis
	 * than the VM free queue.  Specifically, v_free_target is the
	 * high water mark (free+cache pages).
	 *
	 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
	 * low water mark, while v_free_min is the stop.  v_cache_min must
	 * be big enough to handle memory needs while the pageout daemon
	 * is signalled and run to free more pages.
	 */
	vmstats.v_free_target = 4 * vmstats.v_free_min +
				vmstats.v_free_reserved;
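
	/*
	 * For example, with v_free_min around 5300 pages (as in the ~1M
	 * page example above) this puts the free+cache high water mark at
	 * roughly 23900 pages, i.e. about 93MB with 4KB pages.
	 */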

	/*
	 * NOTE: With the new buffer cache b_act_count we want the default
	 *	 inactive target to be a percentage of available memory.
	 *
	 *	 The inactive target essentially determines the minimum
	 *	 number of 'temporary' pages capable of caching one-time-use
	 *	 files when the VM system is otherwise full of pages
	 *	 belonging to multi-time-use files or active program data.
	 *
	 * NOTE: The inactive target is aggressively pursued only if the
	 *	 inactive queue becomes too small.  If the inactive queue
	 *	 is large enough to satisfy page movement to free+cache
	 *	 then it is repopulated more slowly from the active queue.
	 *	 This allows a general inactive_target default to be set.
	 *
	 *	 There is an issue here for processes which sit mostly idle
	 *	 'overnight', such as sshd, tcsh, and X.  Any movement from
	 *	 the active queue will eventually cause such pages to
	 *	 recycle, causing a lot of paging in the morning.  To reduce
	 *	 the incidence of this, pages cycled out of the buffer cache
	 *	 are moved directly to the inactive queue if they were only
	 *	 used once or twice.
	 *
	 *	 The vfs.vm_cycle_point sysctl can be used to adjust this.
	 *	 Increasing the value (up to 64) increases the number of
	 *	 buffer recyclements which go directly to the inactive queue.
	 */
	if (vmstats.v_free_count > 2048) {
		vmstats.v_cache_min = vmstats.v_free_target;
		vmstats.v_cache_max = 2 * vmstats.v_cache_min;
	} else {
		vmstats.v_cache_min = 0;
		vmstats.v_cache_max = 0;
	}
	vmstats.v_inactive_target = vmstats.v_free_count / 4;

	/* XXX does not really belong here */
	if (vm_page_max_wired == 0)
		vm_page_max_wired = vmstats.v_free_count / 3;

	if (vm_pageout_stats_max == 0)
		vm_pageout_stats_max = vmstats.v_free_target;

	/*
	 * Set interval in seconds for stats scan.
	 */
	if (vm_pageout_stats_interval == 0)
		vm_pageout_stats_interval = 5;
	if (vm_pageout_full_stats_interval == 0)
		vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;

	/*
	 * Set maximum free per pass
	 */
	if (vm_pageout_stats_free_max == 0)
		vm_pageout_stats_free_max = 5;

	swap_pager_swap_init();
	pass = 0;

	atomic_swap_int(&sequence_emerg_pager, 1);
	wakeup(&sequence_emerg_pager);

skip_setup:
	/*
	 * Sequence emergency pager startup
	 */
	if (isep) {
		while (sequence_emerg_pager == 0)
			tsleep(&sequence_emerg_pager, 0, "pstartup", hz);
	}

	/*
	 * The pageout daemon is never done, so loop forever.
	 *
	 * WARNING! This code is being executed by two kernel threads
	 *	    potentially simultaneously.
	 */
	while (TRUE) {
		int error;
		long avail_shortage;
		long inactive_shortage;
		long vnodes_skipped = 0;
		long recycle_count = 0;
		long tmp;

		/*
		 * Wait for an action request.  If we time out, check to
		 * see if paging is needed (in case the normal wakeup
		 * code raced us).
		 */
		if (isep) {
			/*
			 * The emergency pagedaemon monitors the primary
			 * pagedaemon while vm_pages_needed != 0.
			 *
			 * The emergency pagedaemon only runs if VM paging
			 * is needed and the primary pagedaemon has not
			 * updated vm_pagedaemon_time for more than 2 seconds.
			 */
			if (vm_pages_needed)
				tsleep(&vm_pagedaemon_time, 0, "psleep", hz);
			else
				tsleep(&vm_pagedaemon_time, 0, "psleep", hz*10);
			if (vm_pages_needed == 0) {
				pass = 0;
				continue;
			}
			if ((int)(ticks - vm_pagedaemon_time) < hz * 2) {
				pass = 0;
				continue;
			}
		} else {
			/*
			 * Primary pagedaemon
			 *
			 * NOTE: We unconditionally clean up PQ_HOLD even
			 *	 when there is no work to do.
			 */
			vm_pageout_scan_hold(q3iterator & PQ_L2_MASK);
			++q3iterator;

			if (vm_pages_needed == 0) {
				error = tsleep(&vm_pages_needed,
					       0, "psleep",
					       vm_pageout_stats_interval * hz);
				if (error &&
				    vm_paging_needed(0) == 0 &&
				    vm_pages_needed == 0) {
					for (q = 0; q < PQ_L2_SIZE; ++q)
						vm_pageout_page_stats(q);
					continue;
				}
				vm_pagedaemon_time = ticks;
				vm_pages_needed = 1;

				/*
				 * Wake the emergency pagedaemon up so it
				 * can monitor us.  It will automatically
				 * go back into a long sleep when
				 * vm_pages_needed returns to 0.
				 */
				wakeup(&vm_pagedaemon_time);
			}
		}

		mycpu->gd_cnt.v_pdwakeups++;

		/*
		 * Scan for INACTIVE->CLEAN/PAGEOUT
		 *
		 * This routine tries to avoid thrashing the system with
		 * unnecessary activity.
		 *
		 * Calculate our target for the number of free+cache pages we
		 * want to get to.  This is higher than the number that causes
		 * allocations to stall (severe) in order to provide
		 * hysteresis, and if we don't make it all the way but get to
		 * the minimum we're happy.  Goose it a bit if there are
		 * multiple requests for memory.
		 *
		 * Don't reduce avail_shortage inside the loop or the
		 * PQAVERAGE() calculation will break.
		 *
		 * NOTE! deficit is differentiated from avail_shortage as
		 *	 REQUIRING at least (deficit) pages to be cleaned,
		 *	 even if the page queues are in good shape.  This
		 *	 is used primarily for handling per-process
		 *	 RLIMIT_RSS and may also see small values when
		 *	 processes block due to low memory.
		 */
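		/*
		 * PQAVERAGE() splits the calculated shortage across the
		 * PQ_L2_SIZE sub-queues so that each per-queue scan handles
		 * roughly its proportional share; the scan loops below bail
		 * out early once the accumulated delta covers the shortage.
		 */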
		vmstats_rollup();
		if (isep == 0)
			vm_pagedaemon_time = ticks;
		avail_shortage = vm_paging_target() + vm_pageout_deficit;
		vm_pageout_deficit = 0;

		if (avail_shortage > 0) {
			long delta = 0;
			int qq;

			qq = q1iterator;
			for (q = 0; q < PQ_L2_SIZE; ++q) {
				delta += vm_pageout_scan_inactive(
						pass,
						qq & PQ_L2_MASK,
						PQAVERAGE(avail_shortage),
						&vnodes_skipped);
				if (isep)
					--qq;
				else
					++qq;
				if (avail_shortage - delta <= 0)
					break;

				/*
				 * It is possible for avail_shortage to be
				 * very large.  If a large program exits or
				 * frees a ton of memory all at once, we do
				 * not have to continue deactivations.
				 *
				 * (We will still run the active->inactive
				 * target, however).
				 */
				if (!vm_page_count_target() &&
				    !vm_page_count_min(
						vm_page_free_hysteresis)) {
					avail_shortage = 0;
					break;
				}
			}
			avail_shortage -= delta;
			q1iterator = qq;
		}

		/*
		 * Figure out how many active pages we must deactivate.  If
		 * we were able to reach our target with just the inactive
		 * scan above we limit the number of active pages we
		 * deactivate to reduce unnecessary work.
		 */
		vmstats_rollup();
		if (isep == 0)
			vm_pagedaemon_time = ticks;
		inactive_shortage = vmstats.v_inactive_target -
				    vmstats.v_inactive_count;

		/*
		 * If we were unable to free sufficient inactive pages to
		 * satisfy the free/cache queue requirements then simply
		 * reaching the inactive target may not be good enough.
		 * Try to deactivate pages in excess of the target based
		 * on the shortfall.
		 *
		 * However, to prevent thrashing the VM system, do not
		 * deactivate more than an additional 1/10 the inactive
		 * target's worth of active pages.
		 */
		if (avail_shortage > 0) {
			tmp = avail_shortage * 2;
			if (tmp > vmstats.v_inactive_target / 10)
				tmp = vmstats.v_inactive_target / 10;
			inactive_shortage += tmp;
		}

		/*
		 * Only trigger a pmap cleanup on inactive shortage.
		 */
		if (isep == 0 && inactive_shortage > 0) {
			pmap_collect();
		}

		/*
		 * Scan for ACTIVE->INACTIVE
		 *
		 * Only trigger on inactive shortage.  Triggering on
		 * avail_shortage can starve the active queue with
		 * unnecessary active->inactive transitions and destroy
		 * performance.
		 *
		 * If this is the emergency pager, always try to move
		 * a few pages from active to inactive because the inactive
		 * queue might have enough pages, but not enough anonymous
		 * pages.
		 */
		if (isep && inactive_shortage < vm_emerg_launder)
			inactive_shortage = vm_emerg_launder;

		if (/*avail_shortage > 0 ||*/ inactive_shortage > 0) {
			long delta = 0;
			int qq;

			qq = q2iterator;
			for (q = 0; q < PQ_L2_SIZE; ++q) {
				delta += vm_pageout_scan_active(
						pass,
						qq & PQ_L2_MASK,
						PQAVERAGE(avail_shortage),
						PQAVERAGE(inactive_shortage),
						&recycle_count);
				if (isep)
					--qq;
				else
					++qq;
				if (inactive_shortage - delta <= 0 &&
				    avail_shortage - delta <= 0) {
					break;
				}

				/*
				 * inactive_shortage can be a very large
				 * number.  This check is intended to break
				 * out early if our inactive_target has been
				 * reached due to other system activity.
				 */
				if (vmstats.v_inactive_count >
				    vmstats.v_inactive_target) {
					inactive_shortage = 0;
					break;
				}
			}
			inactive_shortage -= delta;
			avail_shortage -= delta;
			q2iterator = qq;
		}
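
		/*
		 * NOTE: In both scan loops above the emergency pager walks
		 *	 the sub-queues in the opposite direction from the
		 *	 primary pager (--qq vs ++qq), so the two daemons
		 *	 tend to work on different sub-queues when both are
		 *	 running.
		 */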

		/*
		 * Scan for CACHE->FREE
		 *
		 * Finally free enough cache pages to meet our free page
		 * requirement and take more drastic measures if we are
		 * still in trouble.
		 */
		vmstats_rollup();
		if (isep == 0)
			vm_pagedaemon_time = ticks;
		vm_pageout_scan_cache(avail_shortage, pass,
				      vnodes_skipped, recycle_count);

		/*
		 * This is a bit sophisticated because we do not necessarily
		 * want to force paging until our targets are reached if we
		 * were able to successfully retire the shortage we calculated.
		 */
		if (avail_shortage > 0) {
			/*
			 * If we did not retire enough pages continue the
			 * pageout operation until we are able to.
			 */
			++pass;

			if (pass < 10 && vm_pages_needed > 1) {
				/*
				 * Normal operation, additional processes
				 * have already kicked us.  Retry immediately
				 * unless swap space is completely full in
				 * which case delay a bit.
				 */
				if (swap_pager_full) {
					tsleep(&vm_pages_needed, 0, "pdelay",
					       hz / 5);
				} /* else immediate retry */
			} else if (pass < 10) {
				/*
				 * Do a short sleep for the first 10 passes,
				 * allow the sleep to be woken up by resetting
				 * vm_pages_needed to 1 (NOTE: we are still
				 * actively paging!).
				 */
				if (isep == 0)
					vm_pages_needed = 1;
				tsleep(&vm_pages_needed, 0, "pdelay", 2);
			} else if (swap_pager_full == 0) {
				/*
				 * We've taken too many passes, force a
				 * longer delay.
				 */
				tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
			} else {
				/*
				 * Running out of memory, catastrophic
				 * back-off to one-second intervals.
				 */
				tsleep(&vm_pages_needed, 0, "pdelay", hz);
			}
		} else if (vm_pages_needed) {
			/*
			 * We retired our calculated shortage but we may have
			 * to continue paging if threads drain memory too far
			 * below our target.
			 *
			 * Similar to vm_page_free_wakeup() in vm_page.c.
			 */
			pass = 0;
			if (!vm_paging_needed(0)) {
				/* still more than half-way to our target */
				vm_pages_needed = 0;
				wakeup(&vmstats.v_free_count);
			} else
			if (!vm_page_count_min(vm_page_free_hysteresis)) {
				/*
				 * Continue operations with wakeup
				 * (set variable to avoid overflow)
				 */
				vm_pages_needed = 2;
				wakeup(&vmstats.v_free_count);
			} else {
				/*
				 * No wakeup() needed, continue operations.
				 * (set variable to avoid overflow)
				 */
				vm_pages_needed = 2;
			}
		} else {
			/*
			 * Turn paging back on immediately if we are under
			 * minimum.
			 */
			pass = 0;
		}
	}
}
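
/*
 * Both kernel threads below run vm_pageout_thread().  The thread whose
 * curthread matches emergpager becomes the emergency pager, and startup
 * of the two is sequenced via sequence_emerg_pager above.
 */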

static struct kproc_desc pg1_kp = {
	"pagedaemon",
	vm_pageout_thread,
	&pagethread
};
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &pg1_kp);

static struct kproc_desc pg2_kp = {
	"emergpager",
	vm_pageout_thread,
	&emergpager
};
SYSINIT(emergpager, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, kproc_start, &pg2_kp);

/*
 * Called after allocating a page out of the cache or free queue
 * to possibly wake the pagedaemon up to replenish our supply.
 *
 * We try to generate some hysteresis by waking the pagedaemon up
 * when our free+cache pages go below the free_min+cache_min level.
 * The pagedaemon tries to get the count back up to at least the
 * minimum, and through to the target level if possible.
 *
 * If the pagedaemon is already active bump vm_pages_needed as a hint
 * that there are even more requests pending.
 *
 * SMP races ok?
 * No requirements.
 */
void
pagedaemon_wakeup(void)
{
	if (vm_paging_needed(0) && curthread != pagethread) {
		if (vm_pages_needed <= 1) {
			vm_pages_needed = 1;		/* SMP race ok */
			wakeup(&vm_pages_needed);	/* tickle pageout */
		} else if (vm_page_count_min(0)) {
			++vm_pages_needed;		/* SMP race ok */
			/* a wakeup() would be wasted here */
		}
	}
}

#if !defined(NO_SWAPPING)

/*
 * SMP races ok?
 * No requirements.
 */
static void
vm_req_vmdaemon(void)
{
	static int lastrun = 0;

	if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
		wakeup(&vm_daemon_needed);
		lastrun = ticks;
	}
}

static int vm_daemon_callback(struct proc *p, void *data __unused);

/*
 * No requirements.
 */
static void
vm_daemon(void)
{
	int req_swapout;

	while (TRUE) {
		tsleep(&vm_daemon_needed, 0, "psleep", 0);
		req_swapout = atomic_swap_int(&vm_pageout_req_swapout, 0);

		/*
		 * forced swapouts
		 */
		if (req_swapout)
			swapout_procs(vm_pageout_req_swapout);

		/*
		 * Scan the processes for those exceeding their rlimits or
		 * that are swapped out -- deactivate pages.
		 */
		allproc_scan(vm_daemon_callback, NULL, 0);
	}
}

static int
vm_daemon_callback(struct proc *p, void *data __unused)
{
	struct vmspace *vm;
	vm_pindex_t limit, size;

	/*
	 * If this is a system process or if we have already
	 * looked at this process, skip it.
	 */
	lwkt_gettoken(&p->p_token);

	if (p->p_flags & (P_SYSTEM | P_WEXIT)) {
		lwkt_reltoken(&p->p_token);
		return (0);
	}

	/*
	 * If the process is in a non-running type state,
	 * don't touch it.
	 */
	if (p->p_stat != SACTIVE && p->p_stat != SSTOP && p->p_stat != SCORE) {
		lwkt_reltoken(&p->p_token);
		return (0);
	}

	/*
	 * Get a limit
	 */
	limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
				p->p_rlimit[RLIMIT_RSS].rlim_max));

	/*
	 * Let processes that are swapped out really be
	 * swapped out.  Set the limit to nothing to get as
	 * many pages out to swap as possible.
	 */
	if (p->p_flags & P_SWAPPEDOUT)
		limit = 0;
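
	/*
	 * Compare the process' resident page count against the limit.
	 * Deactivation only kicks in once the resident set exceeds the
	 * limit by at least 4096 pages (16MB with 4KB pages) and only
	 * when vm_pageout_memuse_mode is enabled.
	 */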

	vm = p->p_vmspace;
	vmspace_hold(vm);
	size = pmap_resident_tlnw_count(&vm->vm_pmap);
	if (limit >= 0 && size > 4096 &&
	    size - 4096 >= limit && vm_pageout_memuse_mode >= 1) {
		vm_pageout_map_deactivate_pages(&vm->vm_map, limit);
	}
	vmspace_drop(vm);

	lwkt_reltoken(&p->p_token);

	return (0);
}

#endif