/*
 * (MPSAFE)
 *
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: @(#)vm_pageout.c	7.4 (Berkeley) 5/7/91
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Authors: Avadis Tevanian, Jr., Michael Wayne Young
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 *
 * $FreeBSD: src/sys/vm/vm_pageout.c,v 1.151.2.15 2002/12/29 18:21:04 dillon Exp $
 */

/*
 * The proverbial page-out daemon.
 */

#include "opt_vm.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/vnode.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/swap_pager.h>
#include <vm/vm_extern.h>

#include <sys/thread2.h>
#include <sys/spinlock2.h>
#include <vm/vm_page2.h>

/*
 * System initialization
 */

/* the kernel process "vm_pageout" */
static int vm_pageout_clean (vm_page_t);
static int vm_pageout_free_page_calc (vm_size_t count);
struct thread *pagethread;

#if !defined(NO_SWAPPING)
/* the kernel process "vm_daemon" */
static void vm_daemon (void);
static struct thread *vmthread;

static struct kproc_desc vm_kp = {
        "vmdaemon",
        vm_daemon,
        &vmthread
};
SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp)
#endif


int vm_pages_needed = 0;	/* Event on which pageout daemon sleeps */
int vm_pageout_deficit = 0;	/* Estimated number of pages deficit */
int vm_pageout_pages_needed = 0;/* flag saying that the pageout daemon needs pages */

#if !defined(NO_SWAPPING)
static int vm_pageout_req_swapout;	/* XXX */
static int vm_daemon_needed;
#endif
static int vm_max_launder = 32;
static int vm_pageout_stats_max = 0, vm_pageout_stats_interval = 0;
static int vm_pageout_full_stats_interval = 0;
static int vm_pageout_stats_free_max = 0, vm_pageout_algorithm = 0;
static int defer_swap_pageouts = 0;
static int disable_swap_pageouts = 0;

#if defined(NO_SWAPPING)
static int vm_swap_enabled = 0;
static int vm_swap_idle_enabled = 0;
#else
static int vm_swap_enabled = 1;
static int vm_swap_idle_enabled = 0;
#endif

SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm,
        CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt");

SYSCTL_INT(_vm, OID_AUTO, max_launder,
        CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max,
        CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length");

SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval,
        CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval,
        CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan");

SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max,
        CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented");

#if defined(NO_SWAPPING)
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
        CTLFLAG_RD, &vm_swap_enabled, 0, "");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
        CTLFLAG_RD, &vm_swap_idle_enabled, 0, "");
#else
SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled,
        CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout");
SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled,
        CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria");
#endif

SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts,
        CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem");

SYSCTL_INT(_vm, OID_AUTO, disable_swapspace_pageouts,
        CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages");

static int pageout_lock_miss;
SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss,
        CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout");

int vm_load;
SYSCTL_INT(_vm, OID_AUTO, vm_load,
        CTLFLAG_RD, &vm_load, 0, "load on the VM system");
int vm_load_enable = 1;
SYSCTL_INT(_vm, OID_AUTO, vm_load_enable,
        CTLFLAG_RW, &vm_load_enable, 0, "enable vm_load rate limiting");
#ifdef INVARIANTS
int vm_load_debug;
SYSCTL_INT(_vm, OID_AUTO, vm_load_debug,
        CTLFLAG_RW, &vm_load_debug, 0, "debug vm_load");
#endif

#define VM_PAGEOUT_PAGE_COUNT 16
int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT;

int vm_page_max_wired;		/* XXX max # of wired pages system-wide */

#if !defined(NO_SWAPPING)
typedef void freeer_fcn_t (vm_map_t, vm_object_t, vm_pindex_t, int);
static void vm_pageout_map_deactivate_pages (vm_map_t, vm_pindex_t);
static freeer_fcn_t vm_pageout_object_deactivate_pages;
static void vm_req_vmdaemon (void);
#endif
static void vm_pageout_page_stats(int q);

/*
 * Update vm_load to slow down faulting processes.
 *
 * SMP races ok.
 * No requirements.
 */
void
vm_fault_ratecheck(void)
{
        if (vm_pages_needed) {
                if (vm_load < 1000)
                        ++vm_load;
        } else {
                if (vm_load > 0)
                        --vm_load;
        }
}

/*
 * vm_pageout_clean:
 *
 * Clean the page and remove it from the laundry.  The page must not be
 * busy on-call.
 *
 * We set the busy bit to cause potential page faults on this page to
 * block.  Note the careful timing, however, the busy bit isn't set till
 * late and we cannot do anything that will mess with the page.
 */
static int
vm_pageout_clean(vm_page_t m)
{
        vm_object_t object;
        vm_page_t mc[2*vm_pageout_page_count];
        int pageout_count;
        int error;
        int ib, is, page_base;
        vm_pindex_t pindex = m->pindex;

        object = m->object;

        /*
         * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP
         * with the new swapper, but we could have serious problems paging
         * out other object types if there is insufficient memory.
         *
         * Unfortunately, checking free memory here is far too late, so the
         * check has been moved up a procedural level.
         */

        /*
         * Don't mess with the page if it's busy, held, or special
         *
         * XXX do we really need to check hold_count here?  hold_count
         * isn't supposed to mess with vm_page ops except prevent the
         * page from being reused.
         */
        if (m->hold_count != 0 || (m->flags & PG_UNMANAGED)) {
                vm_page_wakeup(m);
                return 0;
        }

        mc[vm_pageout_page_count] = m;
        pageout_count = 1;
        page_base = vm_pageout_page_count;
        ib = 1;
        is = 1;

        /*
         * Scan object for clusterable pages.
         *
         * We can cluster ONLY if: ->> the page is NOT
         * clean, wired, busy, held, or mapped into a
         * buffer, and one of the following:
         * 1) The page is inactive, or a seldom used
         *    active page.
         * -or-
         * 2) we force the issue.
         *
         * During heavy mmap/modification loads the pageout
         * daemon can really fragment the underlying file
         * due to flushing pages out of order and not trying
         * to align the clusters (which leave sporadic out-of-order
         * holes).  To solve this problem we do the reverse scan
         * first and attempt to align our cluster, then do a
         * forward scan if room remains.
         */

        vm_object_hold(object);
more:
        while (ib && pageout_count < vm_pageout_page_count) {
                vm_page_t p;

                if (ib > pindex) {
                        ib = 0;
                        break;
                }

                p = vm_page_lookup_busy_try(object, pindex - ib, TRUE, &error);
                if (error || p == NULL) {
                        ib = 0;
                        break;
                }
                if ((p->queue - p->pc) == PQ_CACHE ||
                    (p->flags & PG_UNMANAGED)) {
                        vm_page_wakeup(p);
                        ib = 0;
                        break;
                }
                vm_page_test_dirty(p);
                if ((p->dirty & p->valid) == 0 ||
                    p->queue - p->pc != PQ_INACTIVE ||
                    p->wire_count != 0 ||	/* may be held by buf cache */
                    p->hold_count != 0) {	/* may be undergoing I/O */
                        vm_page_wakeup(p);
                        ib = 0;
                        break;
                }
                mc[--page_base] = p;
                ++pageout_count;
                ++ib;
                /*
                 * alignment boundary, stop here and switch directions.  Do
                 * not clear ib.
                 */
                if ((pindex - (ib - 1)) % vm_pageout_page_count == 0)
                        break;
        }

        while (pageout_count < vm_pageout_page_count &&
               pindex + is < object->size) {
                vm_page_t p;

                p = vm_page_lookup_busy_try(object, pindex + is, TRUE, &error);
                if (error || p == NULL)
                        break;
                if (((p->queue - p->pc) == PQ_CACHE) ||
                    (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) {
                        vm_page_wakeup(p);
                        break;
                }
                vm_page_test_dirty(p);
                if ((p->dirty & p->valid) == 0 ||
                    p->queue - p->pc != PQ_INACTIVE ||
                    p->wire_count != 0 ||	/* may be held by buf cache */
                    p->hold_count != 0) {	/* may be undergoing I/O */
                        vm_page_wakeup(p);
                        break;
                }
                mc[page_base + pageout_count] = p;
                ++pageout_count;
                ++is;
        }

        /*
         * If we exhausted our forward scan, continue with the reverse scan
         * when possible, even past a page boundary.  This catches boundary
         * conditions.
         */
        if (ib && pageout_count < vm_pageout_page_count)
                goto more;

        vm_object_drop(object);

        /*
         * we allow reads during pageouts...
         */
        return vm_pageout_flush(&mc[page_base], pageout_count, 0);
}

/*
 * vm_pageout_flush() - launder the given pages
 *
 * The given pages are laundered.  Note that we setup for the start of
 * I/O ( i.e. busy the page ), mark it read-only, and bump the object
 * reference count all in here rather than in the parent.  If we want
 * the parent to do more sophisticated things we may have to change
 * the ordering.
 *
 * The pages in the array must be busied by the caller and will be
 * unbusied by this function.
 */
int
vm_pageout_flush(vm_page_t *mc, int count, int flags)
{
        vm_object_t object;
        int pageout_status[count];
        int numpagedout = 0;
        int i;

        /*
         * Initiate I/O.  Bump the vm_page_t->busy counter.
         */
        for (i = 0; i < count; i++) {
                KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL,
                        ("vm_pageout_flush page %p index %d/%d: partially "
                         "invalid page", mc[i], i, count));
                vm_page_io_start(mc[i]);
        }

        /*
         * We must make the pages read-only.  This will also force the
         * modified bit in the related pmaps to be cleared.  The pager
         * cannot clear the bit for us since the I/O completion code
         * typically runs from an interrupt.  The act of making the page
         * read-only handles the case for us.
         *
         * Then we can unbusy the pages, we still hold a reference by virtue
         * of our soft-busy.
         */
        for (i = 0; i < count; i++) {
                vm_page_protect(mc[i], VM_PROT_READ);
                vm_page_wakeup(mc[i]);
        }

        object = mc[0]->object;
        vm_object_pip_add(object, count);

        vm_pager_put_pages(object, mc, count,
            (flags | ((object == &kernel_object) ? VM_PAGER_PUT_SYNC : 0)),
            pageout_status);

        for (i = 0; i < count; i++) {
                vm_page_t mt = mc[i];

                switch (pageout_status[i]) {
                case VM_PAGER_OK:
                        numpagedout++;
                        break;
                case VM_PAGER_PEND:
                        numpagedout++;
                        break;
                case VM_PAGER_BAD:
                        /*
                         * Page outside of range of object.  Right now we
                         * essentially lose the changes by pretending it
                         * worked.
                         */
                        vm_page_busy_wait(mt, FALSE, "pgbad");
                        pmap_clear_modify(mt);
                        vm_page_undirty(mt);
                        vm_page_wakeup(mt);
                        break;
                case VM_PAGER_ERROR:
                case VM_PAGER_FAIL:
                        /*
                         * A page typically cannot be paged out when we
                         * have run out of swap.  We leave the page
                         * marked inactive and will try to page it out
                         * again later.
                         *
                         * Starvation of the active page list is used to
                         * determine when the system is massively memory
                         * starved.
                         */
                        break;
                case VM_PAGER_AGAIN:
                        break;
                }

                /*
                 * If the operation is still going, leave the page busy to
                 * block all other accesses.  Also, leave the paging in
                 * progress indicator set so that we don't attempt an object
                 * collapse.
                 *
                 * For any pages which have completed synchronously,
                 * deactivate the page if we are under a severe deficit.
                 * Do not try to enter them into the cache, though, they
                 * might still be read-heavy.
                 */
                if (pageout_status[i] != VM_PAGER_PEND) {
                        vm_page_busy_wait(mt, FALSE, "pgouw");
                        if (vm_page_count_severe())
                                vm_page_deactivate(mt);
#if 0
                        if (!vm_page_count_severe() || !vm_page_try_to_cache(mt))
                                vm_page_protect(mt, VM_PROT_READ);
#endif
                        vm_page_io_finish(mt);
                        vm_page_wakeup(mt);
                        vm_object_pip_wakeup(object);
                }
        }
        return numpagedout;
}

#if !defined(NO_SWAPPING)
/*
 * deactivate enough pages to satisfy the inactive target
 * requirements or if vm_page_proc_limit is set, then
 * deactivate all of the pages in the object and its
 * backing_objects.
 *
 * The map must be locked.
 * The caller must hold the vm_object.
 */
static int vm_pageout_object_deactivate_pages_callback(vm_page_t, void *);

static void
vm_pageout_object_deactivate_pages(vm_map_t map, vm_object_t object,
                                   vm_pindex_t desired, int map_remove_only)
{
        struct rb_vm_page_scan_info info;
        vm_object_t lobject;
        vm_object_t tobject;
        int remove_mode;

        ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
        lobject = object;

        while (lobject) {
                if (pmap_resident_count(vm_map_pmap(map)) <= desired)
                        break;
                if (lobject->type == OBJT_DEVICE || lobject->type == OBJT_PHYS)
                        break;
                if (lobject->paging_in_progress)
                        break;

                remove_mode = map_remove_only;
                if (lobject->shadow_count > 1)
                        remove_mode = 1;

                /*
                 * scan the object's entire memory queue.  We hold the
                 * object's token so the scan should not race anything.
                 */
                info.limit = remove_mode;
                info.map = map;
                info.desired = desired;
                vm_page_rb_tree_RB_SCAN(&lobject->rb_memq, NULL,
                                vm_pageout_object_deactivate_pages_callback,
                                &info
                );
                while ((tobject = lobject->backing_object) != NULL) {
                        KKASSERT(tobject != object);
                        vm_object_hold(tobject);
                        if (tobject == lobject->backing_object)
                                break;
                        vm_object_drop(tobject);
                }
                if (lobject != object) {
                        vm_object_lock_swap();
                        vm_object_drop(lobject);
                }
                lobject = tobject;
        }
        if (lobject != object)
                vm_object_drop(lobject);
}

/*
 * The caller must hold the vm_object.
 */
static int
vm_pageout_object_deactivate_pages_callback(vm_page_t p, void *data)
{
        struct rb_vm_page_scan_info *info = data;
        int actcount;

        if (pmap_resident_count(vm_map_pmap(info->map)) <= info->desired) {
                return(-1);
        }
        mycpu->gd_cnt.v_pdpages++;

        if (vm_page_busy_try(p, TRUE))
                return(0);
        if (p->wire_count || p->hold_count || (p->flags & PG_UNMANAGED)) {
                vm_page_wakeup(p);
                return(0);
        }
        if (!pmap_page_exists_quick(vm_map_pmap(info->map), p)) {
                vm_page_wakeup(p);
                return(0);
        }

        actcount = pmap_ts_referenced(p);
        if (actcount) {
                vm_page_flag_set(p, PG_REFERENCED);
        } else if (p->flags & PG_REFERENCED) {
                actcount = 1;
        }

        vm_page_and_queue_spin_lock(p);
        if (p->queue - p->pc != PQ_ACTIVE && (p->flags & PG_REFERENCED)) {
                vm_page_and_queue_spin_unlock(p);
                vm_page_activate(p);
                p->act_count += actcount;
                vm_page_flag_clear(p, PG_REFERENCED);
        } else if (p->queue - p->pc == PQ_ACTIVE) {
                if ((p->flags & PG_REFERENCED) == 0) {
                        p->act_count -= min(p->act_count, ACT_DECLINE);
                        if (!info->limit &&
                            (vm_pageout_algorithm || (p->act_count == 0))) {
                                vm_page_and_queue_spin_unlock(p);
                                vm_page_protect(p, VM_PROT_NONE);
                                vm_page_deactivate(p);
                        } else {
                                TAILQ_REMOVE(&vm_page_queues[p->queue].pl,
                                             p, pageq);
                                TAILQ_INSERT_TAIL(&vm_page_queues[p->queue].pl,
                                                  p, pageq);
                                vm_page_and_queue_spin_unlock(p);
                        }
                } else {
                        vm_page_and_queue_spin_unlock(p);
                        vm_page_activate(p);
                        vm_page_flag_clear(p, PG_REFERENCED);

                        vm_page_and_queue_spin_lock(p);
                        if (p->queue - p->pc == PQ_ACTIVE) {
                                if (p->act_count < (ACT_MAX - ACT_ADVANCE))
                                        p->act_count += ACT_ADVANCE;
                                TAILQ_REMOVE(&vm_page_queues[p->queue].pl,
                                             p, pageq);
                                TAILQ_INSERT_TAIL(&vm_page_queues[p->queue].pl,
                                                  p, pageq);
                        }
                        vm_page_and_queue_spin_unlock(p);
                }
        } else if (p->queue - p->pc == PQ_INACTIVE) {
                vm_page_and_queue_spin_unlock(p);
                vm_page_protect(p, VM_PROT_NONE);
        } else {
                vm_page_and_queue_spin_unlock(p);
        }
        vm_page_wakeup(p);
        return(0);
}

/*
 * Deactivate some number of pages in a map, try to do it fairly, but
 * that is really hard to do.
 */
static void
vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t desired)
{
        vm_map_entry_t tmpe;
        vm_object_t obj, bigobj;
        int nothingwired;

        if (lockmgr(&map->lock, LK_EXCLUSIVE | LK_NOWAIT)) {
                return;
        }

        bigobj = NULL;
        nothingwired = TRUE;

        /*
         * first, search out the biggest object, and try to free pages from
         * that.
         */
        tmpe = map->header.next;
        while (tmpe != &map->header) {
                switch(tmpe->maptype) {
                case VM_MAPTYPE_NORMAL:
                case VM_MAPTYPE_VPAGETABLE:
                        obj = tmpe->object.vm_object;
                        if ((obj != NULL) && (obj->shadow_count <= 1) &&
                            ((bigobj == NULL) ||
                             (bigobj->resident_page_count <
                              obj->resident_page_count))) {
                                bigobj = obj;
                        }
                        break;
                default:
                        break;
                }
                if (tmpe->wired_count > 0)
                        nothingwired = FALSE;
                tmpe = tmpe->next;
        }

        if (bigobj) {
                vm_object_hold(bigobj);
                vm_pageout_object_deactivate_pages(map, bigobj, desired, 0);
                vm_object_drop(bigobj);
        }

        /*
         * Next, hunt around for other pages to deactivate.  We actually
         * do this search sort of wrong -- .text first is not the best idea.
         */
        tmpe = map->header.next;
        while (tmpe != &map->header) {
                if (pmap_resident_count(vm_map_pmap(map)) <= desired)
                        break;
                switch(tmpe->maptype) {
                case VM_MAPTYPE_NORMAL:
                case VM_MAPTYPE_VPAGETABLE:
                        obj = tmpe->object.vm_object;
                        if (obj) {
                                vm_object_hold(obj);
                                vm_pageout_object_deactivate_pages(map, obj,
                                                                   desired, 0);
                                vm_object_drop(obj);
                        }
                        break;
                default:
                        break;
                }
                tmpe = tmpe->next;
        }

        /*
         * Remove all mappings if a process is swapped out, this will free
         * page table pages.
         */
        if (desired == 0 && nothingwired)
                pmap_remove(vm_map_pmap(map),
                            VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
        vm_map_unlock(map);
}
#endif

/*
 * Called when the pageout scan wants to free a page.  We no longer
 * try to cycle the vm_object here with a reference & dealloc, which can
 * cause a non-trivial object collapse in a critical path.
 *
 * It is unclear why we cycled the ref_count in the past, perhaps to try
 * to optimize shadow chain collapses but I don't quite see why it would
 * be necessary.  An OBJ_DEAD object should terminate any and all vm_pages
 * synchronously and not have to be kick-started.
 */
static void
vm_pageout_page_free(vm_page_t m)
{
        vm_page_protect(m, VM_PROT_NONE);
        vm_page_free(m);
}

/*
 * vm_pageout_scan does the dirty work for the pageout daemon.
 */
struct vm_pageout_scan_info {
        struct proc *bigproc;
        vm_offset_t bigsize;
};

static int vm_pageout_scan_callback(struct proc *p, void *data);

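/*
 * Scan one inactive sub-queue (PQ_INACTIVE + q) looking for pages that
 * can be freed, cached, or laundered.  Returns the number of pages
 * freed, cached, or successfully queued for laundering.  *vnodes_skippedp
 * is bumped for pages skipped because their (possibly dirty) vnode
 * could not be acquired in time.
 */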
static int
vm_pageout_scan_inactive(int pass, int q, int inactive_shortage,
                         int *vnodes_skippedp)
{
        vm_page_t m;
        struct vm_page marker;
        struct vnode *vpfailed;		/* warning, allowed to be stale */
        int maxscan;
        int delta = 0;
        vm_object_t object;
        int actcount;
        int maxlaunder;

        /*
         * Start scanning the inactive queue for pages we can move to the
         * cache or free.  The scan will stop when the target is reached or
         * we have scanned the entire inactive queue.  Note that m->act_count
         * is not used to form decisions for the inactive queue, only for the
         * active queue.
         *
         * maxlaunder limits the number of dirty pages we flush per scan.
         * For most systems a smaller value (16 or 32) is more robust under
         * extreme memory and disk pressure because any unnecessary writes
         * to disk can result in extreme performance degradation.  However,
         * systems with excessive dirty pages (especially when MAP_NOSYNC is
         * used) will die horribly with limited laundering.  If the pageout
         * daemon cannot clean enough pages in the first pass, we let it go
         * all out in succeeding passes.
         */
        if ((maxlaunder = vm_max_launder) <= 1)
                maxlaunder = 1;
        if (pass)
                maxlaunder = 10000;

        /*
         * Initialize our marker
         */
        bzero(&marker, sizeof(marker));
        marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
        marker.queue = PQ_INACTIVE + q;
        marker.pc = q;
        marker.wire_count = 1;

        /*
         * Inactive queue scan.
         *
         * NOTE: The vm_page must be spinlocked before the queue to avoid
         *	 deadlocks, so it is easiest to simply iterate the loop
         *	 with the queue unlocked at the top.
         */
        vpfailed = NULL;

        vm_page_queues_spin_lock(PQ_INACTIVE + q);
        TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
        maxscan = vmstats.v_inactive_count;
        vm_page_queues_spin_unlock(PQ_INACTIVE + q);

        while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
               maxscan-- > 0 && inactive_shortage - delta > 0)
        {
                vm_page_and_queue_spin_lock(m);
                if (m != TAILQ_NEXT(&marker, pageq)) {
                        vm_page_and_queue_spin_unlock(m);
                        ++maxscan;
                        continue;
                }
                KKASSERT(m->queue - m->pc == PQ_INACTIVE);
                TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl,
                             &marker, pageq);
                TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m,
                                   &marker, pageq);
                mycpu->gd_cnt.v_pdpages++;

                /*
                 * Skip marker pages
                 */
                if (m->flags & PG_MARKER) {
                        vm_page_and_queue_spin_unlock(m);
                        continue;
                }

                /*
                 * Try to busy the page.  Don't mess with pages which are
                 * already busy or reorder them in the queue.
                 */
                if (vm_page_busy_try(m, TRUE)) {
                        vm_page_and_queue_spin_unlock(m);
                        continue;
                }
                vm_page_and_queue_spin_unlock(m);
                KKASSERT(m->queue - m->pc == PQ_INACTIVE);

                lwkt_yield();

                /*
                 * The page has been successfully busied and is now no
                 * longer spinlocked.  The queue is no longer spinlocked
                 * either.
                 */

                /*
                 * A held page may be undergoing I/O, so skip it.
                 */
                if (m->hold_count) {
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_INACTIVE) {
                                TAILQ_REMOVE(
                                        &vm_page_queues[PQ_INACTIVE + q].pl,
                                        m, pageq);
                                TAILQ_INSERT_TAIL(
                                        &vm_page_queues[PQ_INACTIVE + q].pl,
                                        m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                        ++vm_swapcache_inactive_heuristic;
                        vm_page_wakeup(m);
                        continue;
                }

                if (m->object->ref_count == 0) {
                        /*
                         * If the object is not being used, we ignore previous
                         * references.
                         */
                        vm_page_flag_clear(m, PG_REFERENCED);
                        pmap_clear_reference(m);
                        /* fall through to end */
                } else if (((m->flags & PG_REFERENCED) == 0) &&
                           (actcount = pmap_ts_referenced(m))) {
                        /*
                         * Otherwise, if the page has been referenced while
                         * in the inactive queue, we bump the "activation
                         * count" upwards, making it less likely that the
                         * page will be added back to the inactive queue
                         * prematurely again.  Here we check the page tables
                         * (or emulated bits, if any), given the upper level
                         * VM system not knowing anything about existing
                         * references.
                         */
                        vm_page_activate(m);
                        m->act_count += (actcount + ACT_ADVANCE);
                        vm_page_wakeup(m);
                        continue;
                }

                /*
                 * (m) is still busied.
                 *
                 * If the upper level VM system knows about any page
                 * references, we activate the page.  We also set the
                 * "activation count" higher than normal so that we will be
                 * less likely to place pages back onto the inactive queue
                 * again.
                 */
                if ((m->flags & PG_REFERENCED) != 0) {
                        vm_page_flag_clear(m, PG_REFERENCED);
                        actcount = pmap_ts_referenced(m);
                        vm_page_activate(m);
                        m->act_count += (actcount + ACT_ADVANCE + 1);
                        vm_page_wakeup(m);
                        continue;
                }

                /*
                 * If the upper level VM system doesn't know anything about
                 * the page being dirty, we have to check for it again.  As
                 * far as the VM code knows, any partially dirty pages are
                 * fully dirty.
                 *
                 * Pages marked PG_WRITEABLE may be mapped into the user
                 * address space of a process running on another cpu.  A
                 * user process (without holding the MP lock) running on
                 * another cpu may be able to touch the page while we are
                 * trying to remove it.  vm_page_cache() will handle this
                 * case for us.
                 */
                if (m->dirty == 0) {
                        vm_page_test_dirty(m);
                } else {
                        vm_page_dirty(m);
                }

                if (m->valid == 0) {
                        /*
                         * Invalid pages can be easily freed
                         */
                        vm_pageout_page_free(m);
                        mycpu->gd_cnt.v_dfree++;
                        ++delta;
                } else if (m->dirty == 0) {
                        /*
                         * Clean pages can be placed onto the cache queue.
                         * This effectively frees them.
                         */
                        vm_page_cache(m);
                        ++delta;
                } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) {
                        /*
                         * Dirty pages need to be paged out, but flushing
                         * a page is extremely expensive versus freeing
                         * a clean page.  Rather than artificially limiting
                         * the number of pages we can flush, we instead give
                         * dirty pages extra priority on the inactive queue
                         * by forcing them to be cycled through the queue
                         * twice before being flushed, after which the
                         * (now clean) page will cycle through once more
                         * before being freed.  This significantly extends
                         * the thrash point for a heavily loaded machine.
                         */
                        vm_page_flag_set(m, PG_WINATCFLS);
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_INACTIVE) {
                                TAILQ_REMOVE(
                                        &vm_page_queues[PQ_INACTIVE + q].pl,
                                        m, pageq);
                                TAILQ_INSERT_TAIL(
                                        &vm_page_queues[PQ_INACTIVE + q].pl,
                                        m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                        ++vm_swapcache_inactive_heuristic;
                        vm_page_wakeup(m);
                } else if (maxlaunder > 0) {
                        /*
                         * We always want to try to flush some dirty pages if
                         * we encounter them, to keep the system stable.
                         * Normally this number is small, but under extreme
                         * pressure where there are insufficient clean pages
                         * on the inactive queue, we may have to go all out.
                         */
                        int swap_pageouts_ok;
                        struct vnode *vp = NULL;

                        object = m->object;

                        if ((object->type != OBJT_SWAP) &&
                            (object->type != OBJT_DEFAULT)) {
                                swap_pageouts_ok = 1;
                        } else {
                                swap_pageouts_ok = !(defer_swap_pageouts ||
                                                     disable_swap_pageouts);
                                swap_pageouts_ok |= (!disable_swap_pageouts &&
                                                     defer_swap_pageouts &&
                                                     vm_page_count_min(0));
                        }

                        /*
                         * We don't bother paging objects that are "dead".
                         * Those objects are in a "rundown" state.
                         */
                        if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) {
                                vm_page_and_queue_spin_lock(m);
                                if (m->queue - m->pc == PQ_INACTIVE) {
                                        TAILQ_REMOVE(
                                            &vm_page_queues[PQ_INACTIVE + q].pl,
                                            m, pageq);
                                        TAILQ_INSERT_TAIL(
                                            &vm_page_queues[PQ_INACTIVE + q].pl,
                                            m, pageq);
                                }
                                vm_page_and_queue_spin_unlock(m);
                                ++vm_swapcache_inactive_heuristic;
                                vm_page_wakeup(m);
                                continue;
                        }

                        /*
                         * (m) is still busied.
                         *
                         * The object is already known NOT to be dead.  It
                         * is possible for the vget() to block the whole
                         * pageout daemon, but the new low-memory handling
                         * code should prevent it.
                         *
                         * The previous code skipped locked vnodes and, worse,
                         * reordered pages in the queue.  This results in
                         * completely non-deterministic operation because,
                         * quite often, a vm_fault has initiated an I/O and
                         * is holding a locked vnode at just the point where
                         * the pageout daemon is woken up.
                         *
                         * We can't wait forever for the vnode lock, we might
                         * deadlock due to a vn_read() getting stuck in
                         * vm_wait while holding this vnode.  We skip the
                         * vnode if we can't get it in a reasonable amount
                         * of time.
                         *
                         * vpfailed is used to (try to) avoid the case where
                         * a large number of pages are associated with a
                         * locked vnode, which could cause the pageout daemon
                         * to stall for an excessive amount of time.
                         */
                        if (object->type == OBJT_VNODE) {
                                int flags;

                                vp = object->handle;
                                flags = LK_EXCLUSIVE | LK_NOOBJ;
                                if (vp == vpfailed)
                                        flags |= LK_NOWAIT;
                                else
                                        flags |= LK_TIMELOCK;
                                vm_page_hold(m);
                                vm_page_wakeup(m);

                                /*
                                 * We have unbusied (m) temporarily so we can
                                 * acquire the vp lock without deadlocking.
                                 * (m) is held to prevent destruction.
                                 */
                                if (vget(vp, flags) != 0) {
                                        vpfailed = vp;
                                        ++pageout_lock_miss;
                                        if (object->flags & OBJ_MIGHTBEDIRTY)
                                                ++*vnodes_skippedp;
                                        vm_page_unhold(m);
                                        continue;
                                }

                                /*
                                 * The page might have been moved to another
                                 * queue during potential blocking in vget()
                                 * above.  The page might have been freed and
                                 * reused for another vnode.  The object might
                                 * have been reused for another vnode.
                                 */
                                if (m->queue - m->pc != PQ_INACTIVE ||
                                    m->object != object ||
                                    object->handle != vp) {
                                        if (object->flags & OBJ_MIGHTBEDIRTY)
                                                ++*vnodes_skippedp;
                                        vput(vp);
                                        vm_page_unhold(m);
                                        continue;
                                }

                                /*
                                 * The page may have been busied during the
                                 * blocking in vget().  We don't move the
                                 * page back onto the end of the queue so that
                                 * statistics are more correct if we don't.
                                 */
                                if (vm_page_busy_try(m, TRUE)) {
                                        vput(vp);
                                        vm_page_unhold(m);
                                        continue;
                                }
                                vm_page_unhold(m);

                                /*
                                 * (m) is busied again
                                 *
                                 * We own the busy bit and remove our hold
                                 * bit.  If the page is still held it
                                 * might be undergoing I/O, so skip it.
                                 */
                                if (m->hold_count) {
                                        vm_page_and_queue_spin_lock(m);
                                        if (m->queue - m->pc == PQ_INACTIVE) {
                                                TAILQ_REMOVE(
                                                    &vm_page_queues[PQ_INACTIVE + q].pl,
                                                    m, pageq);
                                                TAILQ_INSERT_TAIL(
                                                    &vm_page_queues[PQ_INACTIVE + q].pl,
                                                    m, pageq);
                                        }
                                        vm_page_and_queue_spin_unlock(m);
                                        ++vm_swapcache_inactive_heuristic;
                                        if (object->flags & OBJ_MIGHTBEDIRTY)
                                                ++*vnodes_skippedp;
                                        vm_page_wakeup(m);
                                        vput(vp);
                                        continue;
                                }
                                /* (m) is left busied as we fall through */
                        }

                        /*
                         * page is busy and not held here.
                         *
                         * If a page is dirty, then it is either being washed
                         * (but not yet cleaned) or it is still in the
                         * laundry.  If it is still in the laundry, then we
                         * start the cleaning operation.
                         *
                         * decrement inactive_shortage on success to account
                         * for the (future) cleaned page.  Otherwise we
                         * could wind up laundering or cleaning too many
                         * pages.
                         */
                        if (vm_pageout_clean(m) != 0) {
                                ++delta;
                                --maxlaunder;
                        }
                        /* clean ate busy, page no longer accessible */
                        if (vp != NULL)
                                vput(vp);
                } else {
                        vm_page_wakeup(m);
                }
        }
        vm_page_queues_spin_lock(PQ_INACTIVE + q);
        TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq);
        vm_page_queues_spin_unlock(PQ_INACTIVE + q);

        return (delta);
}

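/*
 * Scan one active sub-queue (PQ_ACTIVE + q), deactivating pages to feed
 * the inactive queue.  If an inactive shortage still remains, candidate
 * clean pages are cached directly instead of merely being deactivated.
 * Returns the number of pages cached; *recycle_countp counts pages
 * recycled out of the active queue while such a shortage persists.
 */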
static int
vm_pageout_scan_active(int pass, int q,
                       int inactive_shortage, int active_shortage,
                       int *recycle_countp)
{
        struct vm_page marker;
        vm_page_t m;
        int actcount;
        int delta = 0;
        int pcount;

        /*
         * We want to move pages from the active queue to the inactive
         * queue to get the inactive queue to the inactive target.  If
         * we still have a page shortage from above we try to directly free
         * clean pages instead of moving them.
         *
         * If we do still have a shortage we keep track of the number of
         * pages we free or cache (recycle_count) as a measure of thrashing
         * between the active and inactive queues.
         *
         * If we were able to completely satisfy the free+cache targets
         * from the inactive pool we limit the number of pages we move
         * from the active pool to the inactive pool to 2x the pages we
         * had removed from the inactive pool (with a minimum of 1/5 the
         * inactive target).  If we were not able to completely satisfy
         * the free+cache targets we go for the whole target aggressively.
         *
         * NOTE: Both variables can end up negative.
         * NOTE: We are still in a critical section.
         */

        bzero(&marker, sizeof(marker));
        marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
        marker.queue = PQ_ACTIVE + q;
        marker.pc = q;
        marker.wire_count = 1;

        vm_page_queues_spin_lock(PQ_ACTIVE + q);
        TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
        vm_page_queues_spin_unlock(PQ_ACTIVE + q);
        pcount = vmstats.v_active_count;

        while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
               pcount-- > 0 && (inactive_shortage - delta > 0 ||
                                active_shortage > 0))
        {
                vm_page_and_queue_spin_lock(m);
                if (m != TAILQ_NEXT(&marker, pageq)) {
                        vm_page_and_queue_spin_unlock(m);
                        ++pcount;
                        continue;
                }
                KKASSERT(m->queue - m->pc == PQ_ACTIVE);
                TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
                             &marker, pageq);
                TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
                                   &marker, pageq);

                /*
                 * Skip marker pages
                 */
                if (m->flags & PG_MARKER) {
                        vm_page_and_queue_spin_unlock(m);
                        continue;
                }

                /*
                 * Try to busy the page.  Don't mess with pages which are
                 * already busy or reorder them in the queue.
                 */
                if (vm_page_busy_try(m, TRUE)) {
                        vm_page_and_queue_spin_unlock(m);
                        continue;
                }

                /*
                 * Don't deactivate pages that are held, even if we can
                 * busy them.  (XXX why not?)
                 */
                if (m->hold_count != 0) {
                        TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl,
                                     m, pageq);
                        TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE + q].pl,
                                          m, pageq);
                        vm_page_and_queue_spin_unlock(m);
                        vm_page_wakeup(m);
                        continue;
                }
                vm_page_and_queue_spin_unlock(m);
                lwkt_yield();

                /*
                 * The page has been successfully busied and the page and
                 * queue are no longer locked.
                 */

                /*
                 * The count for pagedaemon pages is done after checking the
                 * page for eligibility...
                 */
                mycpu->gd_cnt.v_pdpages++;

                /*
                 * Check to see "how much" the page has been used and clear
                 * the tracking access bits.  If the object has no references
                 * don't bother paying the expense.
                 */
                actcount = 0;
                if (m->object->ref_count != 0) {
                        if (m->flags & PG_REFERENCED)
                                ++actcount;
                        actcount += pmap_ts_referenced(m);
                        if (actcount) {
                                m->act_count += ACT_ADVANCE + actcount;
                                if (m->act_count > ACT_MAX)
                                        m->act_count = ACT_MAX;
                        }
                }
                vm_page_flag_clear(m, PG_REFERENCED);

                /*
                 * actcount is only valid if the object ref_count is non-zero.
                 */
                if (actcount && m->object->ref_count != 0) {
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_ACTIVE) {
                                TAILQ_REMOVE(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                                TAILQ_INSERT_TAIL(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                        vm_page_wakeup(m);
                } else {
                        m->act_count -= min(m->act_count, ACT_DECLINE);
                        if (vm_pageout_algorithm ||
                            m->object->ref_count == 0 ||
                            m->act_count < pass + 1
                        ) {
                                /*
                                 * Deactivate the page.  If we had a
                                 * shortage from our inactive scan try to
                                 * free (cache) the page instead.
                                 *
                                 * Don't just blindly cache the page if
                                 * we do not have a shortage from the
                                 * inactive scan, that could lead to
                                 * gigabytes being moved.
                                 */
                                --active_shortage;
                                if (inactive_shortage - delta > 0 ||
                                    m->object->ref_count == 0) {
                                        if (inactive_shortage - delta > 0)
                                                ++*recycle_countp;
                                        vm_page_protect(m, VM_PROT_NONE);
                                        if (m->dirty == 0 &&
                                            inactive_shortage - delta > 0) {
                                                ++delta;
                                                vm_page_cache(m);
                                        } else {
                                                vm_page_deactivate(m);
                                                vm_page_wakeup(m);
                                        }
                                } else {
                                        vm_page_deactivate(m);
                                        vm_page_wakeup(m);
                                }
                        } else {
                                vm_page_and_queue_spin_lock(m);
                                if (m->queue - m->pc == PQ_ACTIVE) {
                                        TAILQ_REMOVE(
                                            &vm_page_queues[PQ_ACTIVE + q].pl,
                                            m, pageq);
                                        TAILQ_INSERT_TAIL(
                                            &vm_page_queues[PQ_ACTIVE + q].pl,
                                            m, pageq);
                                }
                                vm_page_and_queue_spin_unlock(m);
                                vm_page_wakeup(m);
                        }
                }
        }

        /*
         * Clean out our local marker.
         */
        vm_page_queues_spin_lock(PQ_ACTIVE + q);
        TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
        vm_page_queues_spin_unlock(PQ_ACTIVE + q);

        return (delta);
}

/*
 * The number of actually free pages can drop down to v_free_reserved,
 * we try to build the free count back above v_free_min.  Note that
 * vm_paging_needed() also returns TRUE if v_free_count is not at
 * least v_free_min so that is the minimum we must build the free
 * count to.
 *
 * We use a slightly higher target to improve hysteresis,
 * ((v_free_target + v_free_min) / 2).  Since v_free_target
 * is usually the same as v_cache_min this maintains about
 * half the pages in the free queue as are in the cache queue,
 * providing pretty good pipelining for pageout operation.
 *
 * The system operator can manipulate vm.v_cache_min and
 * vm.v_free_target to tune the pageout daemon.  Be sure
 * to keep vm.v_free_min < vm.v_free_target.
 *
 * Note that the original paging target is to get at least
 * (free_min + cache_min) into (free + cache).  The slightly
 * higher target will shift additional pages from cache to free
 * without affecting the original paging target in order to
 * maintain better hysteresis and not have the free count always
 * be dead-on v_free_min.
 *
 * NOTE: we are still in a critical section.
 *
 * Pages moved from PQ_CACHE to totally free are not counted in the
 * pages_freed counter.
 */
static void
vm_pageout_scan_cache(int inactive_shortage,
                      int vnodes_skipped, int recycle_count)
{
        struct vm_pageout_scan_info info;
        vm_page_t m;

        while (vmstats.v_free_count <
               (vmstats.v_free_min + vmstats.v_free_target) / 2) {
                /*
                 * This steals some code from vm/vm_page.c
                 */
                static int cache_rover = 0;

                m = vm_page_list_find(PQ_CACHE, cache_rover & PQ_L2_MASK, FALSE);
                if (m == NULL)
                        break;
                /* page is returned removed from its queue and spinlocked */
                if (vm_page_busy_try(m, TRUE)) {
                        vm_page_deactivate_locked(m);
                        vm_page_spin_unlock(m);
#ifdef INVARIANTS
                        kprintf("Warning: busy page %p found in cache\n", m);
#endif
                        continue;
                }
                vm_page_spin_unlock(m);
                pagedaemon_wakeup();
                lwkt_yield();

                /*
                 * Page has been successfully busied and it and its queue
                 * is no longer spinlocked.
                 */
                if ((m->flags & PG_UNMANAGED) ||
                    m->hold_count ||
                    m->wire_count) {
                        vm_page_deactivate(m);
                        vm_page_wakeup(m);
                        continue;
                }
                KKASSERT((m->flags & PG_MAPPED) == 0);
                KKASSERT(m->dirty == 0);
                cache_rover += PQ_PRIME2;
                vm_pageout_page_free(m);
                mycpu->gd_cnt.v_dfree++;
        }

#if !defined(NO_SWAPPING)
        /*
         * Idle process swapout -- run once per second.
         */
        if (vm_swap_idle_enabled) {
                static long lsec;
                if (time_second != lsec) {
                        vm_pageout_req_swapout |= VM_SWAP_IDLE;
                        vm_req_vmdaemon();
                        lsec = time_second;
                }
        }
#endif

        /*
         * If we didn't get enough free pages, and we have skipped a vnode
         * in a writeable object, wakeup the sync daemon.  And kick swapout
         * if we did not get enough free pages.
         */
        if (vm_paging_target() > 0) {
                if (vnodes_skipped && vm_page_count_min(0))
                        speedup_syncer();
#if !defined(NO_SWAPPING)
                if (vm_swap_enabled && vm_page_count_target()) {
                        vm_req_vmdaemon();
                        vm_pageout_req_swapout |= VM_SWAP_NORMAL;
                }
#endif
        }

        /*
         * Handle catastrophic conditions.  Under good conditions we should
         * be at the target, well beyond our minimum.  If we could not even
         * reach our minimum the system is under heavy stress.
         *
         * Determine whether we have run out of memory.  This occurs when
         * swap_pager_full is TRUE and the only pages left in the page
         * queues are dirty.  We will still likely have page shortages.
         *
         * - swap_pager_full is set if insufficient swap was
         *   available to satisfy a requested pageout.
         *
         * - the inactive queue is bloated (4 x size of active queue),
         *   meaning it is unable to get rid of dirty pages.
         *
         * - vm_page_count_min() without counting pages recycled from the
         *   active queue (recycle_count) means we could not recover
         *   enough pages to meet bare minimum needs.  This test only
         *   works if the inactive queue is bloated.
         *
         * - due to a positive inactive_shortage we shifted the remaining
         *   dirty pages from the active queue to the inactive queue
         *   trying to find clean ones to free.
         */
        if (swap_pager_full && vm_page_count_min(recycle_count))
                kprintf("Warning: system low on memory+swap!\n");
        if (swap_pager_full && vm_page_count_min(recycle_count) &&
            vmstats.v_inactive_count > vmstats.v_active_count * 4 &&
            inactive_shortage > 0) {
                /*
                 * Kill something.
                 */
                info.bigproc = NULL;
                info.bigsize = 0;
                allproc_scan(vm_pageout_scan_callback, &info);
                if (info.bigproc != NULL) {
                        killproc(info.bigproc, "out of swap space");
                        info.bigproc->p_nice = PRIO_MIN;
                        info.bigproc->p_usched->resetpriority(
                                FIRST_LWP_IN_PROC(info.bigproc));
                        wakeup(&vmstats.v_free_count);
                        PRELE(info.bigproc);
                }
        }
}

/*
 * The caller must hold proc_token.
 */
static int
vm_pageout_scan_callback(struct proc *p, void *data)
{
        struct vm_pageout_scan_info *info = data;
        vm_offset_t size;

        /*
         * Never kill system processes or init.  If we have configured swap
         * then try to avoid killing low-numbered pids.
         */
        if ((p->p_flag & P_SYSTEM) || (p->p_pid == 1) ||
            ((p->p_pid < 48) && (vm_swap_size != 0))) {
                return (0);
        }

        /*
         * if the process is in a non-running type state,
         * don't touch it.
         */
        if (p->p_stat != SACTIVE && p->p_stat != SSTOP)
                return (0);

        /*
         * Get the approximate process size.  Note that anonymous pages
         * with backing swap will be counted twice, but there should not
         * be too many such pages due to the stress the VM system is
         * under at this point.
         */
        size = vmspace_anonymous_count(p->p_vmspace) +
               vmspace_swap_count(p->p_vmspace);

        /*
         * If this process is bigger than the biggest one seen so far,
         * remember it.
         */
        if (info->bigsize < size) {
                if (info->bigproc)
                        PRELE(info->bigproc);
                PHOLD(p);
                info->bigproc = p;
                info->bigsize = size;
        }
        lwkt_yield();
        return(0);
}

/*
 * This routine tries to maintain the pseudo LRU active queue, so that
 * some statistics accumulation still occurs during long periods in
 * which there is no paging.  This code helps the situation where
 * paging just starts to occur.
 */
static void
vm_pageout_page_stats(int q)
{
        static int fullintervalcount = 0;
        struct vm_page marker;
        vm_page_t m;
        int pcount, tpcount;		/* Number of pages to check */
        int page_shortage;

        page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max +
                         vmstats.v_free_min) -
                        (vmstats.v_free_count + vmstats.v_inactive_count +
                         vmstats.v_cache_count);

        if (page_shortage <= 0)
                return;

        pcount = vmstats.v_active_count;
        fullintervalcount += vm_pageout_stats_interval;
        if (fullintervalcount < vm_pageout_full_stats_interval) {
                tpcount = (vm_pageout_stats_max * vmstats.v_active_count) /
                          vmstats.v_page_count;
                if (pcount > tpcount)
                        pcount = tpcount;
        } else {
                fullintervalcount = 0;
        }

        bzero(&marker, sizeof(marker));
        marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
        marker.queue = PQ_ACTIVE + q;
        marker.pc = q;
        marker.wire_count = 1;

        vm_page_queues_spin_lock(PQ_ACTIVE + q);
        TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
        vm_page_queues_spin_unlock(PQ_ACTIVE + q);

        while ((m = TAILQ_NEXT(&marker, pageq)) != NULL &&
               pcount-- > 0)
        {
                int actcount;

                vm_page_and_queue_spin_lock(m);
                if (m != TAILQ_NEXT(&marker, pageq)) {
                        vm_page_and_queue_spin_unlock(m);
                        ++pcount;
                        continue;
                }
                KKASSERT(m->queue - m->pc == PQ_ACTIVE);
                TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
                TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m,
                                   &marker, pageq);

                /*
                 * Ignore markers
                 */
                if (m->flags & PG_MARKER) {
                        vm_page_and_queue_spin_unlock(m);
                        continue;
                }

                /*
                 * Ignore pages we can't busy
                 */
                if (vm_page_busy_try(m, TRUE)) {
                        vm_page_and_queue_spin_unlock(m);
                        continue;
                }
                vm_page_and_queue_spin_unlock(m);
                KKASSERT(m->queue - m->pc == PQ_ACTIVE);

                /*
                 * We now have a safely busied page, the page and queue
                 * spinlocks have been released.
                 *
                 * Ignore held pages
                 */
                if (m->hold_count) {
                        vm_page_wakeup(m);
                        continue;
                }

                /*
                 * Calculate activity
                 */
                actcount = 0;
                if (m->flags & PG_REFERENCED) {
                        vm_page_flag_clear(m, PG_REFERENCED);
                        actcount += 1;
                }
                actcount += pmap_ts_referenced(m);

                /*
                 * Update act_count and move page to end of queue.
                 */
                if (actcount) {
                        m->act_count += ACT_ADVANCE + actcount;
                        if (m->act_count > ACT_MAX)
                                m->act_count = ACT_MAX;
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_ACTIVE) {
                                TAILQ_REMOVE(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                                TAILQ_INSERT_TAIL(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                        vm_page_wakeup(m);
                        continue;
                }

                if (m->act_count == 0) {
                        /*
                         * We turn off page access, so that we have
                         * more accurate RSS stats.  We don't do this
                         * in the normal page deactivation when the
                         * system is loaded VM wise, because the
                         * cost of the large number of page protect
                         * operations would be higher than the value
                         * of doing the operation.
                         *
                         * We use the marker to save our place so
                         * we can release the spin lock.  Both (m)
                         * and (next) will be invalid.
                         */
                        vm_page_protect(m, VM_PROT_NONE);
                        vm_page_deactivate(m);
                } else {
                        m->act_count -= min(m->act_count, ACT_DECLINE);
                        vm_page_and_queue_spin_lock(m);
                        if (m->queue - m->pc == PQ_ACTIVE) {
                                TAILQ_REMOVE(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                                TAILQ_INSERT_TAIL(
                                        &vm_page_queues[PQ_ACTIVE + q].pl,
                                        m, pageq);
                        }
                        vm_page_and_queue_spin_unlock(m);
                }
                vm_page_wakeup(m);
        }

        /*
         * Remove our local marker
         */
        vm_page_queues_spin_lock(PQ_ACTIVE + q);
        TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq);
        vm_page_queues_spin_unlock(PQ_ACTIVE + q);
}

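/*
 * Illustrative example of the thresholds computed below (assuming 4KB
 * pages): on a machine with roughly 1M pages (~4GB of RAM),
 * v_free_min = 64 + (1048576 - 1024) / 200, about 5300 pages (~20MB).
 * v_free_reserved and v_free_severe then come to about half of that,
 * v_pageout_free_min to about a quarter, and v_interrupt_free_min to
 * about an eighth.
 */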
static int
vm_pageout_free_page_calc(vm_size_t count)
{
        if (count < vmstats.v_page_count)
                 return 0;
        /*
         * free_reserved needs to include enough for the largest swap pager
         * structures plus enough for any pv_entry structs when paging.
         *
         * v_free_min		normal allocations
         * v_free_reserved	system allocations
         * v_pageout_free_min	allocations by pageout daemon
         * v_interrupt_free_min	low level allocations (e.g swap structures)
         */
        if (vmstats.v_page_count > 1024)
                vmstats.v_free_min = 64 + (vmstats.v_page_count - 1024) / 200;
        else
                vmstats.v_free_min = 64;
        vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7;
        vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0;
        vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7;
        vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7;

        return 1;
}


/*
 * vm_pageout is the high level pageout daemon.
 *
 * No requirements.
 */
static void
vm_pageout_thread(void)
{
        int pass;
        int q;

        /*
         * Initialize some paging parameters.
         */
        curthread->td_flags |= TDF_SYSTHREAD;

        if (vmstats.v_page_count < 2000)
                vm_pageout_page_count = 8;

        vm_pageout_free_page_calc(vmstats.v_page_count);

        /*
         * v_free_target and v_cache_min control pageout hysteresis.  Note
         * that these are more a measure of the VM cache queue hysteresis
         * than the VM free queue.  Specifically, v_free_target is the
         * high water mark (free+cache pages).
         *
         * v_free_reserved + v_cache_min (mostly means v_cache_min) is the
         * low water mark, while v_free_min is the stop.  v_cache_min must
         * be big enough to handle memory needs while the pageout daemon
         * is signalled and run to free more pages.
         */
        if (vmstats.v_free_count > 6144)
                vmstats.v_free_target = 4 * vmstats.v_free_min +
                                        vmstats.v_free_reserved;
        else
                vmstats.v_free_target = 2 * vmstats.v_free_min +
                                        vmstats.v_free_reserved;

        /*
         * NOTE: With the new buffer cache b_act_count we want the default
         *	 inactive target to be a percentage of available memory.
         *
         *	 The inactive target essentially determines the minimum
         *	 number of 'temporary' pages capable of caching one-time-use
         *	 files when the VM system is otherwise full of pages
         *	 belonging to multi-time-use files or active program data.
         *
         * NOTE: The inactive target is aggressively pursued only if the
         *	 inactive queue becomes too small.  If the inactive queue
         *	 is large enough to satisfy page movement to free+cache
         *	 then it is repopulated more slowly from the active queue.
         *	 This allows a general inactive_target default to be set.
         *
         *	 There is an issue here for processes which sit mostly idle
         *	 'overnight', such as sshd, tcsh, and X.  Any movement from
         *	 the active queue will eventually cause such pages to be
         *	 recycled, eventually causing a lot of paging in the morning.
         *	 To reduce the incidence of this, pages cycled out of the
         *	 buffer cache are moved directly to the inactive queue if
         *	 they were only used once or twice.
         *
         *	 The vfs.vm_cycle_point sysctl can be used to adjust this.
         *	 Increasing the value (up to 64) increases the number of
         *	 buffer recyclements which go directly to the inactive queue.
         */
        if (vmstats.v_free_count > 2048) {
                vmstats.v_cache_min = vmstats.v_free_target;
                vmstats.v_cache_max = 2 * vmstats.v_cache_min;
        } else {
                vmstats.v_cache_min = 0;
                vmstats.v_cache_max = 0;
        }
        vmstats.v_inactive_target = vmstats.v_free_count / 4;

        /* XXX does not really belong here */
        if (vm_page_max_wired == 0)
                vm_page_max_wired = vmstats.v_free_count / 3;

        if (vm_pageout_stats_max == 0)
                vm_pageout_stats_max = vmstats.v_free_target;

        /*
         * Set interval in seconds for stats scan.
         */
        if (vm_pageout_stats_interval == 0)
                vm_pageout_stats_interval = 5;
        if (vm_pageout_full_stats_interval == 0)
                vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4;


        /*
         * Set maximum free per pass
         */
        if (vm_pageout_stats_free_max == 0)
                vm_pageout_stats_free_max = 5;

        swap_pager_swap_init();
        pass = 0;

        /*
         * The pageout daemon is never done, so loop forever.
         */
        while (TRUE) {
                int error;
                int delta1;
                int delta2;
                int inactive_shortage;
                int active_shortage;
                int vnodes_skipped = 0;
                int recycle_count = 0;
                int tmp;

                /*
                 * Wait for an action request.  If we timeout check to
                 * see if paging is needed (in case the normal wakeup
                 * code raced us).
                 */
                if (vm_pages_needed == 0) {
                        error = tsleep(&vm_pages_needed,
                                       0, "psleep",
                                       vm_pageout_stats_interval * hz);
                        if (error &&
                            vm_paging_needed() == 0 &&
                            vm_pages_needed == 0) {
                                for (q = 0; q < PQ_MAXL2_SIZE; ++q)
                                        vm_pageout_page_stats(q);
                                continue;
                        }
                        vm_pages_needed = 1;
                }

                mycpu->gd_cnt.v_pdwakeups++;

                /*
                 * Do whatever cleanup that the pmap code can.
                 */
                pmap_collect();

                /*
                 * Scan for pageout.  Try to avoid thrashing the system
                 * with activity.
                 *
                 * Calculate our target for the number of free+cache pages we
                 * want to get to.  This is higher than the number that causes
                 * allocations to stall (severe) in order to provide
                 * hysteresis, and if we don't make it all the way but get to
                 * the minimum we're happy.
                 */
                inactive_shortage = vm_paging_target() + vm_pageout_deficit;
                vm_pageout_deficit = 0;
                delta1 = 0;
                for (q = 0; q < PQ_MAXL2_SIZE; ++q) {
                        delta1 += vm_pageout_scan_inactive(
                                        pass, q,
                                        inactive_shortage / PQ_MAXL2_SIZE + 1,
                                        &vnodes_skipped);
                }

                /*
                 * Figure out how many active pages we must deactivate.  If
                 * we were able to reach our target with just the inactive
                 * scan above we limit the number of active pages we
                 * deactivate to reduce unnecessary work.
                 */
                active_shortage = vmstats.v_inactive_target -
                                  vmstats.v_inactive_count;

                tmp = inactive_shortage;
                if (tmp < vmstats.v_inactive_target / 10)
                        tmp = vmstats.v_inactive_target / 10;
                inactive_shortage -= delta1;
                if (inactive_shortage <= 0 && active_shortage > tmp * 2)
                        active_shortage = tmp * 2;

                delta2 = 0;
                for (q = 0; q < PQ_MAXL2_SIZE; ++q) {
                        delta2 += vm_pageout_scan_active(
                                        pass, q,
                                        inactive_shortage / PQ_MAXL2_SIZE + 1,
                                        active_shortage / PQ_MAXL2_SIZE + 1,
                                        &recycle_count);
                }

                /*
                 * Finally free enough cache pages to meet our free page
                 * requirement and take more drastic measures if we are
                 * still in trouble.
                 */
                inactive_shortage -= delta2;
                vm_pageout_scan_cache(inactive_shortage, vnodes_skipped,
                                      recycle_count);

                /*
                 * Wait for more work.
                 */
                if (inactive_shortage > 0) {
                        ++pass;
                        if (swap_pager_full) {
                                /*
                                 * Running out of memory, catastrophic back-off
                                 * to one-second intervals.
                                 */
                                tsleep(&vm_pages_needed, 0, "pdelay", hz);
                        } else if (pass < 10 && vm_pages_needed > 1) {
                                /*
                                 * Normal operation, additional processes
                                 * have already kicked us.  Retry immediately.
                                 */
                        } else if (pass < 10) {
                                /*
                                 * Normal operation, fewer processes.  Delay
                                 * a bit but allow wakeups.
                                 */
                                vm_pages_needed = 0;
                                tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
                                vm_pages_needed = 1;
                        } else {
                                /*
                                 * We've taken too many passes, forced delay.
                                 */
                                tsleep(&vm_pages_needed, 0, "pdelay", hz / 10);
                        }
                } else {
                        /*
                         * Interlocked wakeup of waiters (non-optional)
                         */
                        pass = 0;
                        if (vm_pages_needed && !vm_page_count_min(0)) {
                                wakeup(&vmstats.v_free_count);
                                vm_pages_needed = 0;
                        }
                }
        }
}

static struct kproc_desc page_kp = {
        "pagedaemon",
        vm_pageout_thread,
        &pagethread
};
SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp)


/*
 * Called after allocating a page out of the cache or free queue
 * to possibly wake the pagedaemon up to replenish our supply.
 *
 * We try to generate some hysteresis by waking the pagedaemon up
 * when our free+cache pages go below the free_min+cache_min level.
 * The pagedaemon tries to get the count back up to at least the
 * minimum, and through to the target level if possible.
 *
 * If the pagedaemon is already active bump vm_pages_needed as a hint
 * that there are even more requests pending.
 *
 * SMP races ok?
 * No requirements.
 */
void
pagedaemon_wakeup(void)
{
        if (vm_paging_needed() && curthread != pagethread) {
                if (vm_pages_needed == 0) {
                        vm_pages_needed = 1;	/* SMP race ok */
                        wakeup(&vm_pages_needed);
                } else if (vm_page_count_min(0)) {
                        ++vm_pages_needed;	/* SMP race ok */
                }
        }
}

#if !defined(NO_SWAPPING)

/*
 * SMP races ok?
 * No requirements.
 */
static void
vm_req_vmdaemon(void)
{
        static int lastrun = 0;

        if ((ticks > (lastrun + hz)) || (ticks < lastrun)) {
                wakeup(&vm_daemon_needed);
                lastrun = ticks;
        }
}

static int vm_daemon_callback(struct proc *p, void *data __unused);

/*
 * No requirements.
 */
static void
vm_daemon(void)
{
        /*
         * XXX vm_daemon_needed specific token?
         */
        while (TRUE) {
                tsleep(&vm_daemon_needed, 0, "psleep", 0);
                if (vm_pageout_req_swapout) {
                        swapout_procs(vm_pageout_req_swapout);
                        vm_pageout_req_swapout = 0;
                }
                /*
                 * scan the processes for exceeding their rlimits or if
                 * a process is swapped out -- deactivate pages
                 */
                allproc_scan(vm_daemon_callback, NULL);
        }
}

/*
 * Caller must hold proc_token.
 */
static int
vm_daemon_callback(struct proc *p, void *data __unused)
{
        vm_pindex_t limit, size;

        /*
         * if this is a system process or if we have already
         * looked at this process, skip it.
         */
        if (p->p_flag & (P_SYSTEM | P_WEXIT))
                return (0);

        /*
         * if the process is in a non-running type state,
         * don't touch it.
         */
        if (p->p_stat != SACTIVE && p->p_stat != SSTOP)
                return (0);

        /*
         * get a limit
         */
        limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur,
                                p->p_rlimit[RLIMIT_RSS].rlim_max));

        /*
         * let processes that are swapped out really be
         * swapped out.  Set the limit to nothing to get as
         * many pages out to swap as possible.
         */
        if (p->p_flag & P_SWAPPEDOUT)
                limit = 0;

        lwkt_gettoken(&p->p_vmspace->vm_map.token);
        size = vmspace_resident_count(p->p_vmspace);
        if (limit >= 0 && size >= limit) {
                vm_pageout_map_deactivate_pages(&p->p_vmspace->vm_map, limit);
        }
        lwkt_reltoken(&p->p_vmspace->vm_map.token);
        return (0);
}

#endif