1 /* 2 * (MPSAFE) 3 * 4 * Copyright (c) 1991 Regents of the University of California. 5 * All rights reserved. 6 * Copyright (c) 1994 John S. Dyson 7 * All rights reserved. 8 * Copyright (c) 1994 David Greenman 9 * All rights reserved. 10 * 11 * This code is derived from software contributed to Berkeley by 12 * The Mach Operating System project at Carnegie-Mellon University. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 4. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 37 * 38 * from: @(#)vm_pageout.c 7.4 (Berkeley) 5/7/91 39 * 40 * 41 * Copyright (c) 1987, 1990 Carnegie-Mellon University. 42 * All rights reserved. 43 * 44 * Authors: Avadis Tevanian, Jr., Michael Wayne Young 45 * 46 * Permission to use, copy, modify and distribute this software and 47 * its documentation is hereby granted, provided that both the copyright 48 * notice and this permission notice appear in all copies of the 49 * software, derivative works or modified versions, and any portions 50 * thereof, and that both notices appear in supporting documentation. 51 * 52 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" 53 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND 54 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 55 * 56 * Carnegie Mellon requests users of this software to return to 57 * 58 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU 59 * School of Computer Science 60 * Carnegie Mellon University 61 * Pittsburgh PA 15213-3890 62 * 63 * any improvements or extensions that they make and grant Carnegie the 64 * rights to redistribute these changes. 65 * 66 * $FreeBSD: src/sys/vm/vm_pageout.c,v 1.151.2.15 2002/12/29 18:21:04 dillon Exp $ 67 */ 68 69 /* 70 * The proverbial page-out daemon. 
71 */ 72 73 #include "opt_vm.h" 74 #include <sys/param.h> 75 #include <sys/systm.h> 76 #include <sys/kernel.h> 77 #include <sys/proc.h> 78 #include <sys/kthread.h> 79 #include <sys/resourcevar.h> 80 #include <sys/signalvar.h> 81 #include <sys/vnode.h> 82 #include <sys/vmmeter.h> 83 #include <sys/sysctl.h> 84 85 #include <vm/vm.h> 86 #include <vm/vm_param.h> 87 #include <sys/lock.h> 88 #include <vm/vm_object.h> 89 #include <vm/vm_page.h> 90 #include <vm/vm_map.h> 91 #include <vm/vm_pageout.h> 92 #include <vm/vm_pager.h> 93 #include <vm/swap_pager.h> 94 #include <vm/vm_extern.h> 95 96 #include <sys/thread2.h> 97 #include <sys/spinlock2.h> 98 #include <vm/vm_page2.h> 99 100 /* 101 * System initialization 102 */ 103 104 /* the kernel process "vm_pageout"*/ 105 static int vm_pageout_clean (vm_page_t); 106 static int vm_pageout_free_page_calc (vm_size_t count); 107 struct thread *pagethread; 108 109 #if !defined(NO_SWAPPING) 110 /* the kernel process "vm_daemon"*/ 111 static void vm_daemon (void); 112 static struct thread *vmthread; 113 114 static struct kproc_desc vm_kp = { 115 "vmdaemon", 116 vm_daemon, 117 &vmthread 118 }; 119 SYSINIT(vmdaemon, SI_SUB_KTHREAD_VM, SI_ORDER_FIRST, kproc_start, &vm_kp) 120 #endif 121 122 int vm_pages_needed=0; /* Event on which pageout daemon sleeps */ 123 int vm_pageout_deficit=0; /* Estimated number of pages deficit */ 124 int vm_pageout_pages_needed=0; /* flag saying that the pageout daemon needs pages */ 125 126 #if !defined(NO_SWAPPING) 127 static int vm_pageout_req_swapout; /* XXX */ 128 static int vm_daemon_needed; 129 #endif 130 static int vm_max_launder = 32; 131 static int vm_pageout_stats_max=0, vm_pageout_stats_interval = 0; 132 static int vm_pageout_full_stats_interval = 0; 133 static int vm_pageout_stats_free_max=0, vm_pageout_algorithm=0; 134 static int defer_swap_pageouts=0; 135 static int disable_swap_pageouts=0; 136 137 #if defined(NO_SWAPPING) 138 static int vm_swap_enabled=0; 139 static int vm_swap_idle_enabled=0; 140 #else 141 static int vm_swap_enabled=1; 142 static int vm_swap_idle_enabled=0; 143 #endif 144 145 SYSCTL_INT(_vm, VM_PAGEOUT_ALGORITHM, pageout_algorithm, 146 CTLFLAG_RW, &vm_pageout_algorithm, 0, "LRU page mgmt"); 147 148 SYSCTL_INT(_vm, OID_AUTO, max_launder, 149 CTLFLAG_RW, &vm_max_launder, 0, "Limit dirty flushes in pageout"); 150 151 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_max, 152 CTLFLAG_RW, &vm_pageout_stats_max, 0, "Max pageout stats scan length"); 153 154 SYSCTL_INT(_vm, OID_AUTO, pageout_full_stats_interval, 155 CTLFLAG_RW, &vm_pageout_full_stats_interval, 0, "Interval for full stats scan"); 156 157 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_interval, 158 CTLFLAG_RW, &vm_pageout_stats_interval, 0, "Interval for partial stats scan"); 159 160 SYSCTL_INT(_vm, OID_AUTO, pageout_stats_free_max, 161 CTLFLAG_RW, &vm_pageout_stats_free_max, 0, "Not implemented"); 162 163 #if defined(NO_SWAPPING) 164 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 165 CTLFLAG_RD, &vm_swap_enabled, 0, ""); 166 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, 167 CTLFLAG_RD, &vm_swap_idle_enabled, 0, ""); 168 #else 169 SYSCTL_INT(_vm, VM_SWAPPING_ENABLED, swap_enabled, 170 CTLFLAG_RW, &vm_swap_enabled, 0, "Enable entire process swapout"); 171 SYSCTL_INT(_vm, OID_AUTO, swap_idle_enabled, 172 CTLFLAG_RW, &vm_swap_idle_enabled, 0, "Allow swapout on idle criteria"); 173 #endif 174 175 SYSCTL_INT(_vm, OID_AUTO, defer_swapspace_pageouts, 176 CTLFLAG_RW, &defer_swap_pageouts, 0, "Give preference to dirty pages in mem"); 177 178 SYSCTL_INT(_vm, OID_AUTO, 
disable_swapspace_pageouts, 179 CTLFLAG_RW, &disable_swap_pageouts, 0, "Disallow swapout of dirty pages"); 180 181 static int pageout_lock_miss; 182 SYSCTL_INT(_vm, OID_AUTO, pageout_lock_miss, 183 CTLFLAG_RD, &pageout_lock_miss, 0, "vget() lock misses during pageout"); 184 185 #define VM_PAGEOUT_PAGE_COUNT 16 186 int vm_pageout_page_count = VM_PAGEOUT_PAGE_COUNT; 187 188 int vm_page_max_wired; /* XXX max # of wired pages system-wide */ 189 190 #if !defined(NO_SWAPPING) 191 typedef void freeer_fcn_t (vm_map_t, vm_object_t, vm_pindex_t, int); 192 static void vm_pageout_map_deactivate_pages (vm_map_t, vm_pindex_t); 193 static freeer_fcn_t vm_pageout_object_deactivate_pages; 194 static void vm_req_vmdaemon (void); 195 #endif 196 static void vm_pageout_page_stats(int q); 197 198 static __inline int 199 PQAVERAGE(int n) 200 { 201 if (n >= 0) 202 return((n + (PQ_L2_SIZE - 1)) / PQ_L2_SIZE + 1); 203 else 204 return((n - (PQ_L2_SIZE - 1)) / PQ_L2_SIZE - 1); 205 } 206 207 /* 208 * vm_pageout_clean: 209 * 210 * Clean the page and remove it from the laundry. The page must not be 211 * busy on-call. 212 * 213 * We set the busy bit to cause potential page faults on this page to 214 * block. Note the careful timing, however, the busy bit isn't set till 215 * late and we cannot do anything that will mess with the page. 216 */ 217 static int 218 vm_pageout_clean(vm_page_t m) 219 { 220 vm_object_t object; 221 vm_page_t mc[2*vm_pageout_page_count]; 222 int pageout_count; 223 int error; 224 int ib, is, page_base; 225 vm_pindex_t pindex = m->pindex; 226 227 object = m->object; 228 229 /* 230 * It doesn't cost us anything to pageout OBJT_DEFAULT or OBJT_SWAP 231 * with the new swapper, but we could have serious problems paging 232 * out other object types if there is insufficient memory. 233 * 234 * Unfortunately, checking free memory here is far too late, so the 235 * check has been moved up a procedural level. 236 */ 237 238 /* 239 * Don't mess with the page if it's busy, held, or special 240 * 241 * XXX do we really need to check hold_count here? hold_count 242 * isn't supposed to mess with vm_page ops except prevent the 243 * page from being reused. 244 */ 245 if (m->hold_count != 0 || (m->flags & PG_UNMANAGED)) { 246 vm_page_wakeup(m); 247 return 0; 248 } 249 250 mc[vm_pageout_page_count] = m; 251 pageout_count = 1; 252 page_base = vm_pageout_page_count; 253 ib = 1; 254 is = 1; 255 256 /* 257 * Scan object for clusterable pages. 258 * 259 * We can cluster ONLY if: ->> the page is NOT 260 * clean, wired, busy, held, or mapped into a 261 * buffer, and one of the following: 262 * 1) The page is inactive, or a seldom used 263 * active page. 264 * -or- 265 * 2) we force the issue. 266 * 267 * During heavy mmap/modification loads the pageout 268 * daemon can really fragment the underlying file 269 * due to flushing pages out of order and not trying 270 * align the clusters (which leave sporatic out-of-order 271 * holes). To solve this problem we do the reverse scan 272 * first and attempt to align our cluster, then do a 273 * forward scan if room remains. 
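 *
 * As a purely illustrative example (made-up numbers): with the
 * default vm_pageout_page_count of 16 and a dirty page at pindex 37,
 * the reverse scan below can collect pindex 36 down to 32 before the
 * alignment check stops it, and the forward scan can then fill 38
 * through 47, so the flush covers the naturally aligned block 32-47
 * when the neighboring pages qualify.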
274 */ 275 276 vm_object_hold(object); 277 more: 278 while (ib && pageout_count < vm_pageout_page_count) { 279 vm_page_t p; 280 281 if (ib > pindex) { 282 ib = 0; 283 break; 284 } 285 286 p = vm_page_lookup_busy_try(object, pindex - ib, TRUE, &error); 287 if (error || p == NULL) { 288 ib = 0; 289 break; 290 } 291 if ((p->queue - p->pc) == PQ_CACHE || 292 (p->flags & PG_UNMANAGED)) { 293 vm_page_wakeup(p); 294 ib = 0; 295 break; 296 } 297 vm_page_test_dirty(p); 298 if (((p->dirty & p->valid) == 0 && 299 (p->flags & PG_NEED_COMMIT) == 0) || 300 p->queue - p->pc != PQ_INACTIVE || 301 p->wire_count != 0 || /* may be held by buf cache */ 302 p->hold_count != 0) { /* may be undergoing I/O */ 303 vm_page_wakeup(p); 304 ib = 0; 305 break; 306 } 307 mc[--page_base] = p; 308 ++pageout_count; 309 ++ib; 310 /* 311 * alignment boundry, stop here and switch directions. Do 312 * not clear ib. 313 */ 314 if ((pindex - (ib - 1)) % vm_pageout_page_count == 0) 315 break; 316 } 317 318 while (pageout_count < vm_pageout_page_count && 319 pindex + is < object->size) { 320 vm_page_t p; 321 322 p = vm_page_lookup_busy_try(object, pindex + is, TRUE, &error); 323 if (error || p == NULL) 324 break; 325 if (((p->queue - p->pc) == PQ_CACHE) || 326 (p->flags & (PG_BUSY|PG_UNMANAGED)) || p->busy) { 327 vm_page_wakeup(p); 328 break; 329 } 330 vm_page_test_dirty(p); 331 if (((p->dirty & p->valid) == 0 && 332 (p->flags & PG_NEED_COMMIT) == 0) || 333 p->queue - p->pc != PQ_INACTIVE || 334 p->wire_count != 0 || /* may be held by buf cache */ 335 p->hold_count != 0) { /* may be undergoing I/O */ 336 vm_page_wakeup(p); 337 break; 338 } 339 mc[page_base + pageout_count] = p; 340 ++pageout_count; 341 ++is; 342 } 343 344 /* 345 * If we exhausted our forward scan, continue with the reverse scan 346 * when possible, even past a page boundry. This catches boundry 347 * conditions. 348 */ 349 if (ib && pageout_count < vm_pageout_page_count) 350 goto more; 351 352 vm_object_drop(object); 353 354 /* 355 * we allow reads during pageouts... 356 */ 357 return vm_pageout_flush(&mc[page_base], pageout_count, 0); 358 } 359 360 /* 361 * vm_pageout_flush() - launder the given pages 362 * 363 * The given pages are laundered. Note that we setup for the start of 364 * I/O ( i.e. busy the page ), mark it read-only, and bump the object 365 * reference count all in here rather then in the parent. If we want 366 * the parent to do more sophisticated things we may have to change 367 * the ordering. 368 * 369 * The pages in the array must be busied by the caller and will be 370 * unbusied by this function. 371 */ 372 int 373 vm_pageout_flush(vm_page_t *mc, int count, int flags) 374 { 375 vm_object_t object; 376 int pageout_status[count]; 377 int numpagedout = 0; 378 int i; 379 380 /* 381 * Initiate I/O. Bump the vm_page_t->busy counter. 382 */ 383 for (i = 0; i < count; i++) { 384 KASSERT(mc[i]->valid == VM_PAGE_BITS_ALL, 385 ("vm_pageout_flush page %p index %d/%d: partially " 386 "invalid page", mc[i], i, count)); 387 vm_page_io_start(mc[i]); 388 } 389 390 /* 391 * We must make the pages read-only. This will also force the 392 * modified bit in the related pmaps to be cleared. The pager 393 * cannot clear the bit for us since the I/O completion code 394 * typically runs from an interrupt. The act of making the page 395 * read-only handles the case for us. 396 * 397 * Then we can unbusy the pages, we still hold a reference by virtue 398 * of our soft-busy. 
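 *
 * (The hard busy taken by the caller is what the vm_page_wakeup()
 * calls below drop; the soft-busy bumped by vm_page_io_start()
 * above is what keeps each page from being freed or reused while
 * the pager I/O is in flight.)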
399 */ 400 for (i = 0; i < count; i++) { 401 vm_page_protect(mc[i], VM_PROT_READ); 402 vm_page_wakeup(mc[i]); 403 } 404 405 object = mc[0]->object; 406 vm_object_pip_add(object, count); 407 408 vm_pager_put_pages(object, mc, count, 409 (flags | ((object == &kernel_object) ? VM_PAGER_PUT_SYNC : 0)), 410 pageout_status); 411 412 for (i = 0; i < count; i++) { 413 vm_page_t mt = mc[i]; 414 415 switch (pageout_status[i]) { 416 case VM_PAGER_OK: 417 numpagedout++; 418 break; 419 case VM_PAGER_PEND: 420 numpagedout++; 421 break; 422 case VM_PAGER_BAD: 423 /* 424 * Page outside of range of object. Right now we 425 * essentially lose the changes by pretending it 426 * worked. 427 */ 428 vm_page_busy_wait(mt, FALSE, "pgbad"); 429 pmap_clear_modify(mt); 430 vm_page_undirty(mt); 431 vm_page_wakeup(mt); 432 break; 433 case VM_PAGER_ERROR: 434 case VM_PAGER_FAIL: 435 /* 436 * A page typically cannot be paged out when we 437 * have run out of swap. We leave the page 438 * marked inactive and will try to page it out 439 * again later. 440 * 441 * Starvation of the active page list is used to 442 * determine when the system is massively memory 443 * starved. 444 */ 445 break; 446 case VM_PAGER_AGAIN: 447 break; 448 } 449 450 /* 451 * If the operation is still going, leave the page busy to 452 * block all other accesses. Also, leave the paging in 453 * progress indicator set so that we don't attempt an object 454 * collapse. 455 * 456 * For any pages which have completed synchronously, 457 * deactivate the page if we are under a severe deficit. 458 * Do not try to enter them into the cache, though, they 459 * might still be read-heavy. 460 */ 461 if (pageout_status[i] != VM_PAGER_PEND) { 462 vm_page_busy_wait(mt, FALSE, "pgouw"); 463 if (vm_page_count_severe()) 464 vm_page_deactivate(mt); 465 #if 0 466 if (!vm_page_count_severe() || !vm_page_try_to_cache(mt)) 467 vm_page_protect(mt, VM_PROT_READ); 468 #endif 469 vm_page_io_finish(mt); 470 vm_page_wakeup(mt); 471 vm_object_pip_wakeup(object); 472 } 473 } 474 return numpagedout; 475 } 476 477 #if !defined(NO_SWAPPING) 478 /* 479 * deactivate enough pages to satisfy the inactive target 480 * requirements or if vm_page_proc_limit is set, then 481 * deactivate all of the pages in the object and its 482 * backing_objects. 483 * 484 * The map must be locked. 485 * The caller must hold the vm_object. 486 */ 487 static int vm_pageout_object_deactivate_pages_callback(vm_page_t, void *); 488 489 static void 490 vm_pageout_object_deactivate_pages(vm_map_t map, vm_object_t object, 491 vm_pindex_t desired, int map_remove_only) 492 { 493 struct rb_vm_page_scan_info info; 494 vm_object_t lobject; 495 vm_object_t tobject; 496 int remove_mode; 497 498 ASSERT_LWKT_TOKEN_HELD(vm_object_token(object)); 499 lobject = object; 500 501 while (lobject) { 502 if (pmap_resident_count(vm_map_pmap(map)) <= desired) 503 break; 504 if (lobject->type == OBJT_DEVICE || lobject->type == OBJT_PHYS) 505 break; 506 if (lobject->paging_in_progress) 507 break; 508 509 remove_mode = map_remove_only; 510 if (lobject->shadow_count > 1) 511 remove_mode = 1; 512 513 /* 514 * scan the objects entire memory queue. We hold the 515 * object's token so the scan should not race anything. 
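 *
 * The backing_object walk below uses a hold-then-recheck pattern:
 * the candidate backing object is held and then compared against
 * lobject->backing_object again, and if it changed while we blocked
 * the hold is dropped and the lookup retried.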
516 */ 517 info.limit = remove_mode; 518 info.map = map; 519 info.desired = desired; 520 vm_page_rb_tree_RB_SCAN(&lobject->rb_memq, NULL, 521 vm_pageout_object_deactivate_pages_callback, 522 &info 523 ); 524 while ((tobject = lobject->backing_object) != NULL) { 525 KKASSERT(tobject != object); 526 vm_object_hold(tobject); 527 if (tobject == lobject->backing_object) 528 break; 529 vm_object_drop(tobject); 530 } 531 if (lobject != object) { 532 vm_object_lock_swap(); 533 vm_object_drop(lobject); 534 } 535 lobject = tobject; 536 } 537 if (lobject != object) 538 vm_object_drop(lobject); 539 } 540 541 /* 542 * The caller must hold the vm_object. 543 */ 544 static int 545 vm_pageout_object_deactivate_pages_callback(vm_page_t p, void *data) 546 { 547 struct rb_vm_page_scan_info *info = data; 548 int actcount; 549 550 if (pmap_resident_count(vm_map_pmap(info->map)) <= info->desired) { 551 return(-1); 552 } 553 mycpu->gd_cnt.v_pdpages++; 554 555 if (vm_page_busy_try(p, TRUE)) 556 return(0); 557 if (p->wire_count || p->hold_count || (p->flags & PG_UNMANAGED)) { 558 vm_page_wakeup(p); 559 return(0); 560 } 561 if (!pmap_page_exists_quick(vm_map_pmap(info->map), p)) { 562 vm_page_wakeup(p); 563 return(0); 564 } 565 566 actcount = pmap_ts_referenced(p); 567 if (actcount) { 568 vm_page_flag_set(p, PG_REFERENCED); 569 } else if (p->flags & PG_REFERENCED) { 570 actcount = 1; 571 } 572 573 vm_page_and_queue_spin_lock(p); 574 if (p->queue - p->pc != PQ_ACTIVE && (p->flags & PG_REFERENCED)) { 575 vm_page_and_queue_spin_unlock(p); 576 vm_page_activate(p); 577 p->act_count += actcount; 578 vm_page_flag_clear(p, PG_REFERENCED); 579 } else if (p->queue - p->pc == PQ_ACTIVE) { 580 if ((p->flags & PG_REFERENCED) == 0) { 581 p->act_count -= min(p->act_count, ACT_DECLINE); 582 if (!info->limit && 583 (vm_pageout_algorithm || (p->act_count == 0))) { 584 vm_page_and_queue_spin_unlock(p); 585 vm_page_protect(p, VM_PROT_NONE); 586 vm_page_deactivate(p); 587 } else { 588 TAILQ_REMOVE(&vm_page_queues[p->queue].pl, 589 p, pageq); 590 TAILQ_INSERT_TAIL(&vm_page_queues[p->queue].pl, 591 p, pageq); 592 vm_page_and_queue_spin_unlock(p); 593 } 594 } else { 595 vm_page_and_queue_spin_unlock(p); 596 vm_page_activate(p); 597 vm_page_flag_clear(p, PG_REFERENCED); 598 599 vm_page_and_queue_spin_lock(p); 600 if (p->queue - p->pc == PQ_ACTIVE) { 601 if (p->act_count < (ACT_MAX - ACT_ADVANCE)) 602 p->act_count += ACT_ADVANCE; 603 TAILQ_REMOVE(&vm_page_queues[p->queue].pl, 604 p, pageq); 605 TAILQ_INSERT_TAIL(&vm_page_queues[p->queue].pl, 606 p, pageq); 607 } 608 vm_page_and_queue_spin_unlock(p); 609 } 610 } else if (p->queue - p->pc == PQ_INACTIVE) { 611 vm_page_and_queue_spin_unlock(p); 612 vm_page_protect(p, VM_PROT_NONE); 613 } else { 614 vm_page_and_queue_spin_unlock(p); 615 } 616 vm_page_wakeup(p); 617 return(0); 618 } 619 620 /* 621 * Deactivate some number of pages in a map, try to do it fairly, but 622 * that is really hard to do. 623 */ 624 static void 625 vm_pageout_map_deactivate_pages(vm_map_t map, vm_pindex_t desired) 626 { 627 vm_map_entry_t tmpe; 628 vm_object_t obj, bigobj; 629 int nothingwired; 630 631 if (lockmgr(&map->lock, LK_EXCLUSIVE | LK_NOWAIT)) { 632 return; 633 } 634 635 bigobj = NULL; 636 nothingwired = TRUE; 637 638 /* 639 * first, search out the biggest object, and try to free pages from 640 * that. 
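 *
 * Only VM_MAPTYPE_NORMAL and VM_MAPTYPE_VPAGETABLE entries are
 * considered, and an object shadowed by more than one other object
 * (shadow_count > 1) is never chosen as the big object.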
641 */ 642 tmpe = map->header.next; 643 while (tmpe != &map->header) { 644 switch(tmpe->maptype) { 645 case VM_MAPTYPE_NORMAL: 646 case VM_MAPTYPE_VPAGETABLE: 647 obj = tmpe->object.vm_object; 648 if ((obj != NULL) && (obj->shadow_count <= 1) && 649 ((bigobj == NULL) || 650 (bigobj->resident_page_count < obj->resident_page_count))) { 651 bigobj = obj; 652 } 653 break; 654 default: 655 break; 656 } 657 if (tmpe->wired_count > 0) 658 nothingwired = FALSE; 659 tmpe = tmpe->next; 660 } 661 662 if (bigobj) { 663 vm_object_hold(bigobj); 664 vm_pageout_object_deactivate_pages(map, bigobj, desired, 0); 665 vm_object_drop(bigobj); 666 } 667 668 /* 669 * Next, hunt around for other pages to deactivate. We actually 670 * do this search sort of wrong -- .text first is not the best idea. 671 */ 672 tmpe = map->header.next; 673 while (tmpe != &map->header) { 674 if (pmap_resident_count(vm_map_pmap(map)) <= desired) 675 break; 676 switch(tmpe->maptype) { 677 case VM_MAPTYPE_NORMAL: 678 case VM_MAPTYPE_VPAGETABLE: 679 obj = tmpe->object.vm_object; 680 if (obj) { 681 vm_object_hold(obj); 682 vm_pageout_object_deactivate_pages(map, obj, desired, 0); 683 vm_object_drop(obj); 684 } 685 break; 686 default: 687 break; 688 } 689 tmpe = tmpe->next; 690 }; 691 692 /* 693 * Remove all mappings if a process is swapped out, this will free page 694 * table pages. 695 */ 696 if (desired == 0 && nothingwired) 697 pmap_remove(vm_map_pmap(map), 698 VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS); 699 vm_map_unlock(map); 700 } 701 #endif 702 703 /* 704 * Called when the pageout scan wants to free a page. We no longer 705 * try to cycle the vm_object here with a reference & dealloc, which can 706 * cause a non-trivial object collapse in a critical path. 707 * 708 * It is unclear why we cycled the ref_count in the past, perhaps to try 709 * to optimize shadow chain collapses but I don't quite see why it would 710 * be necessary. An OBJ_DEAD object should terminate any and all vm_pages 711 * synchronously and not have to be kicked-start. 712 */ 713 static void 714 vm_pageout_page_free(vm_page_t m) 715 { 716 vm_page_protect(m, VM_PROT_NONE); 717 vm_page_free(m); 718 } 719 720 /* 721 * vm_pageout_scan does the dirty work for the pageout daemon. 722 */ 723 struct vm_pageout_scan_info { 724 struct proc *bigproc; 725 vm_offset_t bigsize; 726 }; 727 728 static int vm_pageout_scan_callback(struct proc *p, void *data); 729 730 static int 731 vm_pageout_scan_inactive(int pass, int q, int avail_shortage, 732 int *vnodes_skippedp) 733 { 734 vm_page_t m; 735 struct vm_page marker; 736 struct vnode *vpfailed; /* warning, allowed to be stale */ 737 int maxscan; 738 int delta = 0; 739 vm_object_t object; 740 int actcount; 741 int maxlaunder; 742 743 /* 744 * Start scanning the inactive queue for pages we can move to the 745 * cache or free. The scan will stop when the target is reached or 746 * we have scanned the entire inactive queue. Note that m->act_count 747 * is not used to form decisions for the inactive queue, only for the 748 * active queue. 749 * 750 * maxlaunder limits the number of dirty pages we flush per scan. 751 * For most systems a smaller value (16 or 32) is more robust under 752 * extreme memory and disk pressure because any unnecessary writes 753 * to disk can result in extreme performance degredation. However, 754 * systems with excessive dirty pages (especially when MAP_NOSYNC is 755 * used) will die horribly with limited laundering. 
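 * (The limit comes from the vm.max_launder sysctl, 32 by default
 * here, and only applies to pass 0, as set up just below.)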
If the pageout 756 * daemon cannot clean enough pages in the first pass, we let it go 757 * all out in succeeding passes. 758 */ 759 if ((maxlaunder = vm_max_launder) <= 1) 760 maxlaunder = 1; 761 if (pass) 762 maxlaunder = 10000; 763 764 /* 765 * Initialize our marker 766 */ 767 bzero(&marker, sizeof(marker)); 768 marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER; 769 marker.queue = PQ_INACTIVE + q; 770 marker.pc = q; 771 marker.wire_count = 1; 772 773 /* 774 * Inactive queue scan. 775 * 776 * NOTE: The vm_page must be spinlocked before the queue to avoid 777 * deadlocks, so it is easiest to simply iterate the loop 778 * with the queue unlocked at the top. 779 */ 780 vpfailed = NULL; 781 782 vm_page_queues_spin_lock(PQ_INACTIVE + q); 783 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); 784 maxscan = vm_page_queues[PQ_INACTIVE + q].lcnt; 785 vm_page_queues_spin_unlock(PQ_INACTIVE + q); 786 787 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 788 maxscan-- > 0 && avail_shortage - delta > 0) 789 { 790 vm_page_and_queue_spin_lock(m); 791 if (m != TAILQ_NEXT(&marker, pageq)) { 792 vm_page_and_queue_spin_unlock(m); 793 ++maxscan; 794 continue; 795 } 796 KKASSERT(m->queue - m->pc == PQ_INACTIVE); 797 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, 798 &marker, pageq); 799 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_INACTIVE + q].pl, m, 800 &marker, pageq); 801 mycpu->gd_cnt.v_pdpages++; 802 803 /* 804 * Skip marker pages 805 */ 806 if (m->flags & PG_MARKER) { 807 vm_page_and_queue_spin_unlock(m); 808 continue; 809 } 810 811 /* 812 * Try to busy the page. Don't mess with pages which are 813 * already busy or reorder them in the queue. 814 */ 815 if (vm_page_busy_try(m, TRUE)) { 816 vm_page_and_queue_spin_unlock(m); 817 continue; 818 } 819 vm_page_and_queue_spin_unlock(m); 820 KKASSERT(m->queue - m->pc == PQ_INACTIVE); 821 822 lwkt_yield(); 823 824 /* 825 * The page has been successfully busied and is now no 826 * longer spinlocked. The queue is no longer spinlocked 827 * either. 828 */ 829 830 /* 831 * It is possible for a page to be busied ad-hoc (e.g. the 832 * pmap_collect() code) and wired and race against the 833 * allocation of a new page. vm_page_alloc() may be forced 834 * to deactivate the wired page in which case it winds up 835 * on the inactive queue and must be handled here. We 836 * correct the problem simply by unqueuing the page. 837 */ 838 if (m->wire_count) { 839 vm_page_unqueue_nowakeup(m); 840 vm_page_wakeup(m); 841 kprintf("WARNING: pagedaemon: wired page on " 842 "inactive queue %p\n", m); 843 continue; 844 } 845 846 /* 847 * A held page may be undergoing I/O, so skip it. 848 */ 849 if (m->hold_count) { 850 vm_page_and_queue_spin_lock(m); 851 if (m->queue - m->pc == PQ_INACTIVE) { 852 TAILQ_REMOVE( 853 &vm_page_queues[PQ_INACTIVE + q].pl, 854 m, pageq); 855 TAILQ_INSERT_TAIL( 856 &vm_page_queues[PQ_INACTIVE + q].pl, 857 m, pageq); 858 ++vm_swapcache_inactive_heuristic; 859 } 860 vm_page_and_queue_spin_unlock(m); 861 vm_page_wakeup(m); 862 continue; 863 } 864 865 if (m->object->ref_count == 0) { 866 /* 867 * If the object is not being used, we ignore previous 868 * references. 
869 */ 870 vm_page_flag_clear(m, PG_REFERENCED); 871 pmap_clear_reference(m); 872 /* fall through to end */ 873 } else if (((m->flags & PG_REFERENCED) == 0) && 874 (actcount = pmap_ts_referenced(m))) { 875 /* 876 * Otherwise, if the page has been referenced while 877 * in the inactive queue, we bump the "activation 878 * count" upwards, making it less likely that the 879 * page will be added back to the inactive queue 880 * prematurely again. Here we check the page tables 881 * (or emulated bits, if any), given the upper level 882 * VM system not knowing anything about existing 883 * references. 884 */ 885 vm_page_activate(m); 886 m->act_count += (actcount + ACT_ADVANCE); 887 vm_page_wakeup(m); 888 continue; 889 } 890 891 /* 892 * (m) is still busied. 893 * 894 * If the upper level VM system knows about any page 895 * references, we activate the page. We also set the 896 * "activation count" higher than normal so that we will less 897 * likely place pages back onto the inactive queue again. 898 */ 899 if ((m->flags & PG_REFERENCED) != 0) { 900 vm_page_flag_clear(m, PG_REFERENCED); 901 actcount = pmap_ts_referenced(m); 902 vm_page_activate(m); 903 m->act_count += (actcount + ACT_ADVANCE + 1); 904 vm_page_wakeup(m); 905 continue; 906 } 907 908 /* 909 * If the upper level VM system doesn't know anything about 910 * the page being dirty, we have to check for it again. As 911 * far as the VM code knows, any partially dirty pages are 912 * fully dirty. 913 * 914 * Pages marked PG_WRITEABLE may be mapped into the user 915 * address space of a process running on another cpu. A 916 * user process (without holding the MP lock) running on 917 * another cpu may be able to touch the page while we are 918 * trying to remove it. vm_page_cache() will handle this 919 * case for us. 920 */ 921 if (m->dirty == 0) { 922 vm_page_test_dirty(m); 923 } else { 924 vm_page_dirty(m); 925 } 926 927 if (m->valid == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 928 /* 929 * Invalid pages can be easily freed 930 */ 931 vm_pageout_page_free(m); 932 mycpu->gd_cnt.v_dfree++; 933 ++delta; 934 } else if (m->dirty == 0 && (m->flags & PG_NEED_COMMIT) == 0) { 935 /* 936 * Clean pages can be placed onto the cache queue. 937 * This effectively frees them. 938 */ 939 vm_page_cache(m); 940 ++delta; 941 } else if ((m->flags & PG_WINATCFLS) == 0 && pass == 0) { 942 /* 943 * Dirty pages need to be paged out, but flushing 944 * a page is extremely expensive verses freeing 945 * a clean page. Rather then artificially limiting 946 * the number of pages we can flush, we instead give 947 * dirty pages extra priority on the inactive queue 948 * by forcing them to be cycled through the queue 949 * twice before being flushed, after which the 950 * (now clean) page will cycle through once more 951 * before being freed. This significantly extends 952 * the thrash point for a heavily loaded machine. 953 */ 954 vm_page_flag_set(m, PG_WINATCFLS); 955 vm_page_and_queue_spin_lock(m); 956 if (m->queue - m->pc == PQ_INACTIVE) { 957 TAILQ_REMOVE( 958 &vm_page_queues[PQ_INACTIVE + q].pl, 959 m, pageq); 960 TAILQ_INSERT_TAIL( 961 &vm_page_queues[PQ_INACTIVE + q].pl, 962 m, pageq); 963 ++vm_swapcache_inactive_heuristic; 964 } 965 vm_page_and_queue_spin_unlock(m); 966 vm_page_wakeup(m); 967 } else if (maxlaunder > 0) { 968 /* 969 * We always want to try to flush some dirty pages if 970 * we encounter them, to keep the system stable. 
971 * Normally this number is small, but under extreme 972 * pressure where there are insufficient clean pages 973 * on the inactive queue, we may have to go all out. 974 */ 975 int swap_pageouts_ok; 976 struct vnode *vp = NULL; 977 978 object = m->object; 979 980 if ((object->type != OBJT_SWAP) && (object->type != OBJT_DEFAULT)) { 981 swap_pageouts_ok = 1; 982 } else { 983 swap_pageouts_ok = !(defer_swap_pageouts || disable_swap_pageouts); 984 swap_pageouts_ok |= (!disable_swap_pageouts && defer_swap_pageouts && 985 vm_page_count_min(0)); 986 987 } 988 989 /* 990 * We don't bother paging objects that are "dead". 991 * Those objects are in a "rundown" state. 992 */ 993 if (!swap_pageouts_ok || (object->flags & OBJ_DEAD)) { 994 vm_page_and_queue_spin_lock(m); 995 if (m->queue - m->pc == PQ_INACTIVE) { 996 TAILQ_REMOVE( 997 &vm_page_queues[PQ_INACTIVE + q].pl, 998 m, pageq); 999 TAILQ_INSERT_TAIL( 1000 &vm_page_queues[PQ_INACTIVE + q].pl, 1001 m, pageq); 1002 ++vm_swapcache_inactive_heuristic; 1003 } 1004 vm_page_and_queue_spin_unlock(m); 1005 vm_page_wakeup(m); 1006 continue; 1007 } 1008 1009 /* 1010 * (m) is still busied. 1011 * 1012 * The object is already known NOT to be dead. It 1013 * is possible for the vget() to block the whole 1014 * pageout daemon, but the new low-memory handling 1015 * code should prevent it. 1016 * 1017 * The previous code skipped locked vnodes and, worse, 1018 * reordered pages in the queue. This results in 1019 * completely non-deterministic operation because, 1020 * quite often, a vm_fault has initiated an I/O and 1021 * is holding a locked vnode at just the point where 1022 * the pageout daemon is woken up. 1023 * 1024 * We can't wait forever for the vnode lock, we might 1025 * deadlock due to a vn_read() getting stuck in 1026 * vm_wait while holding this vnode. We skip the 1027 * vnode if we can't get it in a reasonable amount 1028 * of time. 1029 * 1030 * vpfailed is used to (try to) avoid the case where 1031 * a large number of pages are associated with a 1032 * locked vnode, which could cause the pageout daemon 1033 * to stall for an excessive amount of time. 1034 */ 1035 if (object->type == OBJT_VNODE) { 1036 int flags; 1037 1038 vp = object->handle; 1039 flags = LK_EXCLUSIVE | LK_NOOBJ; 1040 if (vp == vpfailed) 1041 flags |= LK_NOWAIT; 1042 else 1043 flags |= LK_TIMELOCK; 1044 vm_page_hold(m); 1045 vm_page_wakeup(m); 1046 1047 /* 1048 * We have unbusied (m) temporarily so we can 1049 * acquire the vp lock without deadlocking. 1050 * (m) is held to prevent destruction. 1051 */ 1052 if (vget(vp, flags) != 0) { 1053 vpfailed = vp; 1054 ++pageout_lock_miss; 1055 if (object->flags & OBJ_MIGHTBEDIRTY) 1056 ++*vnodes_skippedp; 1057 vm_page_unhold(m); 1058 continue; 1059 } 1060 1061 /* 1062 * The page might have been moved to another 1063 * queue during potential blocking in vget() 1064 * above. The page might have been freed and 1065 * reused for another vnode. The object might 1066 * have been reused for another vnode. 1067 */ 1068 if (m->queue - m->pc != PQ_INACTIVE || 1069 m->object != object || 1070 object->handle != vp) { 1071 if (object->flags & OBJ_MIGHTBEDIRTY) 1072 ++*vnodes_skippedp; 1073 vput(vp); 1074 vm_page_unhold(m); 1075 continue; 1076 } 1077 1078 /* 1079 * The page may have been busied during the 1080 * blocking in vput(); We don't move the 1081 * page back onto the end of the queue so that 1082 * statistics are more correct if we don't. 
1083 */ 1084 if (vm_page_busy_try(m, TRUE)) { 1085 vput(vp); 1086 vm_page_unhold(m); 1087 continue; 1088 } 1089 vm_page_unhold(m); 1090 1091 /* 1092 * (m) is busied again 1093 * 1094 * We own the busy bit and remove our hold 1095 * bit. If the page is still held it 1096 * might be undergoing I/O, so skip it. 1097 */ 1098 if (m->hold_count) { 1099 vm_page_and_queue_spin_lock(m); 1100 if (m->queue - m->pc == PQ_INACTIVE) { 1101 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, m, pageq); 1102 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_INACTIVE + q].pl, m, pageq); 1103 ++vm_swapcache_inactive_heuristic; 1104 } 1105 vm_page_and_queue_spin_unlock(m); 1106 if (object->flags & OBJ_MIGHTBEDIRTY) 1107 ++*vnodes_skippedp; 1108 vm_page_wakeup(m); 1109 vput(vp); 1110 continue; 1111 } 1112 /* (m) is left busied as we fall through */ 1113 } 1114 1115 /* 1116 * page is busy and not held here. 1117 * 1118 * If a page is dirty, then it is either being washed 1119 * (but not yet cleaned) or it is still in the 1120 * laundry. If it is still in the laundry, then we 1121 * start the cleaning operation. 1122 * 1123 * decrement inactive_shortage on success to account 1124 * for the (future) cleaned page. Otherwise we 1125 * could wind up laundering or cleaning too many 1126 * pages. 1127 */ 1128 if (vm_pageout_clean(m) != 0) { 1129 ++delta; 1130 --maxlaunder; 1131 } 1132 /* clean ate busy, page no longer accessible */ 1133 if (vp != NULL) 1134 vput(vp); 1135 } else { 1136 vm_page_wakeup(m); 1137 } 1138 } 1139 vm_page_queues_spin_lock(PQ_INACTIVE + q); 1140 TAILQ_REMOVE(&vm_page_queues[PQ_INACTIVE + q].pl, &marker, pageq); 1141 vm_page_queues_spin_unlock(PQ_INACTIVE + q); 1142 return (delta); 1143 } 1144 1145 static int 1146 vm_pageout_scan_active(int pass, int q, 1147 int avail_shortage, int inactive_shortage, 1148 int *recycle_countp) 1149 { 1150 struct vm_page marker; 1151 vm_page_t m; 1152 int actcount; 1153 int delta = 0; 1154 int maxscan; 1155 1156 /* 1157 * We want to move pages from the active queue to the inactive 1158 * queue to get the inactive queue to the inactive target. If 1159 * we still have a page shortage from above we try to directly free 1160 * clean pages instead of moving them. 1161 * 1162 * If we do still have a shortage we keep track of the number of 1163 * pages we free or cache (recycle_count) as a measure of thrashing 1164 * between the active and inactive queues. 1165 * 1166 * If we were able to completely satisfy the free+cache targets 1167 * from the inactive pool we limit the number of pages we move 1168 * from the active pool to the inactive pool to 2x the pages we 1169 * had removed from the inactive pool (with a minimum of 1/5 the 1170 * inactive target). If we were not able to completely satisfy 1171 * the free+cache targets we go for the whole target aggressively. 1172 * 1173 * NOTE: Both variables can end up negative. 1174 * NOTE: We are still in a critical section. 
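 *
 * The scan below inserts a fictitious PG_MARKER page into the queue
 * so the per-page and queue spinlocks can be dropped while the
 * marker holds our place; iteration always resumes from
 * TAILQ_NEXT(&marker).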
1175 */ 1176 1177 bzero(&marker, sizeof(marker)); 1178 marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER; 1179 marker.queue = PQ_ACTIVE + q; 1180 marker.pc = q; 1181 marker.wire_count = 1; 1182 1183 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1184 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1185 maxscan = vm_page_queues[PQ_ACTIVE + q].lcnt; 1186 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1187 1188 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 1189 maxscan-- > 0 && (avail_shortage - delta > 0 || 1190 inactive_shortage > 0)) 1191 { 1192 vm_page_and_queue_spin_lock(m); 1193 if (m != TAILQ_NEXT(&marker, pageq)) { 1194 vm_page_and_queue_spin_unlock(m); 1195 ++maxscan; 1196 continue; 1197 } 1198 KKASSERT(m->queue - m->pc == PQ_ACTIVE); 1199 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, 1200 &marker, pageq); 1201 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m, 1202 &marker, pageq); 1203 1204 /* 1205 * Skip marker pages 1206 */ 1207 if (m->flags & PG_MARKER) { 1208 vm_page_and_queue_spin_unlock(m); 1209 continue; 1210 } 1211 1212 /* 1213 * Try to busy the page. Don't mess with pages which are 1214 * already busy or reorder them in the queue. 1215 */ 1216 if (vm_page_busy_try(m, TRUE)) { 1217 vm_page_and_queue_spin_unlock(m); 1218 continue; 1219 } 1220 1221 /* 1222 * Don't deactivate pages that are held, even if we can 1223 * busy them. (XXX why not?) 1224 */ 1225 if (m->hold_count != 0) { 1226 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, 1227 m, pageq); 1228 TAILQ_INSERT_TAIL(&vm_page_queues[PQ_ACTIVE + q].pl, 1229 m, pageq); 1230 vm_page_and_queue_spin_unlock(m); 1231 vm_page_wakeup(m); 1232 continue; 1233 } 1234 vm_page_and_queue_spin_unlock(m); 1235 lwkt_yield(); 1236 1237 /* 1238 * The page has been successfully busied and the page and 1239 * queue are no longer locked. 1240 */ 1241 1242 /* 1243 * The count for pagedaemon pages is done after checking the 1244 * page for eligibility... 1245 */ 1246 mycpu->gd_cnt.v_pdpages++; 1247 1248 /* 1249 * Check to see "how much" the page has been used and clear 1250 * the tracking access bits. If the object has no references 1251 * don't bother paying the expense. 1252 */ 1253 actcount = 0; 1254 if (m->object->ref_count != 0) { 1255 if (m->flags & PG_REFERENCED) 1256 ++actcount; 1257 actcount += pmap_ts_referenced(m); 1258 if (actcount) { 1259 m->act_count += ACT_ADVANCE + actcount; 1260 if (m->act_count > ACT_MAX) 1261 m->act_count = ACT_MAX; 1262 } 1263 } 1264 vm_page_flag_clear(m, PG_REFERENCED); 1265 1266 /* 1267 * actcount is only valid if the object ref_count is non-zero. 1268 */ 1269 if (actcount && m->object->ref_count != 0) { 1270 vm_page_and_queue_spin_lock(m); 1271 if (m->queue - m->pc == PQ_ACTIVE) { 1272 TAILQ_REMOVE( 1273 &vm_page_queues[PQ_ACTIVE + q].pl, 1274 m, pageq); 1275 TAILQ_INSERT_TAIL( 1276 &vm_page_queues[PQ_ACTIVE + q].pl, 1277 m, pageq); 1278 } 1279 vm_page_and_queue_spin_unlock(m); 1280 vm_page_wakeup(m); 1281 } else { 1282 m->act_count -= min(m->act_count, ACT_DECLINE); 1283 if (vm_pageout_algorithm || 1284 m->object->ref_count == 0 || 1285 m->act_count < pass + 1 1286 ) { 1287 /* 1288 * Deactivate the page. If we had a 1289 * shortage from our inactive scan try to 1290 * free (cache) the page instead. 1291 * 1292 * Don't just blindly cache the page if 1293 * we do not have a shortage from the 1294 * inactive scan, that could lead to 1295 * gigabytes being moved. 
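 *
 * Concretely: when the object has no references or an avail
 * shortage remains, the page's mappings are removed; a clean,
 * non-PG_NEED_COMMIT page is then cached only while the avail
 * shortage persists, and everything else is simply deactivated.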
1296 */ 1297 --inactive_shortage; 1298 if (avail_shortage - delta > 0 || 1299 m->object->ref_count == 0) { 1300 if (avail_shortage - delta > 0) 1301 ++*recycle_countp; 1302 vm_page_protect(m, VM_PROT_NONE); 1303 if (m->dirty == 0 && 1304 (m->flags & PG_NEED_COMMIT) == 0 && 1305 avail_shortage - delta > 0) { 1306 vm_page_cache(m); 1307 } else { 1308 vm_page_deactivate(m); 1309 vm_page_wakeup(m); 1310 } 1311 } else { 1312 vm_page_deactivate(m); 1313 vm_page_wakeup(m); 1314 } 1315 ++delta; 1316 } else { 1317 vm_page_and_queue_spin_lock(m); 1318 if (m->queue - m->pc == PQ_ACTIVE) { 1319 TAILQ_REMOVE( 1320 &vm_page_queues[PQ_ACTIVE + q].pl, 1321 m, pageq); 1322 TAILQ_INSERT_TAIL( 1323 &vm_page_queues[PQ_ACTIVE + q].pl, 1324 m, pageq); 1325 } 1326 vm_page_and_queue_spin_unlock(m); 1327 vm_page_wakeup(m); 1328 } 1329 } 1330 } 1331 1332 /* 1333 * Clean out our local marker. 1334 */ 1335 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1336 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1337 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1338 1339 return (delta); 1340 } 1341 1342 /* 1343 * The number of actually free pages can drop down to v_free_reserved, 1344 * we try to build the free count back above v_free_min. Note that 1345 * vm_paging_needed() also returns TRUE if v_free_count is not at 1346 * least v_free_min so that is the minimum we must build the free 1347 * count to. 1348 * 1349 * We use a slightly higher target to improve hysteresis, 1350 * ((v_free_target + v_free_min) / 2). Since v_free_target 1351 * is usually the same as v_cache_min this maintains about 1352 * half the pages in the free queue as are in the cache queue, 1353 * providing pretty good pipelining for pageout operation. 1354 * 1355 * The system operator can manipulate vm.v_cache_min and 1356 * vm.v_free_target to tune the pageout demon. Be sure 1357 * to keep vm.v_free_min < vm.v_free_target. 1358 * 1359 * Note that the original paging target is to get at least 1360 * (free_min + cache_min) into (free + cache). The slightly 1361 * higher target will shift additional pages from cache to free 1362 * without effecting the original paging target in order to 1363 * maintain better hysteresis and not have the free count always 1364 * be dead-on v_free_min. 1365 * 1366 * NOTE: we are still in a critical section. 1367 * 1368 * Pages moved from PQ_CACHE to totally free are not counted in the 1369 * pages_freed counter. 1370 */ 1371 static void 1372 vm_pageout_scan_cache(int avail_shortage, int vnodes_skipped, int recycle_count) 1373 { 1374 struct vm_pageout_scan_info info; 1375 vm_page_t m; 1376 1377 while (vmstats.v_free_count < 1378 (vmstats.v_free_min + vmstats.v_free_target) / 2) { 1379 /* 1380 * This steals some code from vm/vm_page.c 1381 */ 1382 static int cache_rover = 0; 1383 1384 m = vm_page_list_find(PQ_CACHE, cache_rover & PQ_L2_MASK, FALSE); 1385 if (m == NULL) 1386 break; 1387 /* page is returned removed from its queue and spinlocked */ 1388 if (vm_page_busy_try(m, TRUE)) { 1389 vm_page_deactivate_locked(m); 1390 vm_page_spin_unlock(m); 1391 #ifdef INVARIANTS 1392 kprintf("Warning: busy page %p found in cache\n", m); 1393 #endif 1394 continue; 1395 } 1396 vm_page_spin_unlock(m); 1397 pagedaemon_wakeup(); 1398 lwkt_yield(); 1399 1400 /* 1401 * Page has been successfully busied and it and its queue 1402 * is no longer spinlocked. 
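 *
 * Unmanaged, PG_NEED_COMMIT, held, or wired pages found here are
 * not freeable after all and are pushed back onto the inactive
 * queue via vm_page_deactivate() below.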
1403 */ 1404 if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) || 1405 m->hold_count || 1406 m->wire_count) { 1407 vm_page_deactivate(m); 1408 vm_page_wakeup(m); 1409 continue; 1410 } 1411 KKASSERT((m->flags & PG_MAPPED) == 0); 1412 KKASSERT(m->dirty == 0); 1413 cache_rover += PQ_PRIME2; 1414 vm_pageout_page_free(m); 1415 mycpu->gd_cnt.v_dfree++; 1416 } 1417 1418 #if !defined(NO_SWAPPING) 1419 /* 1420 * Idle process swapout -- run once per second. 1421 */ 1422 if (vm_swap_idle_enabled) { 1423 static long lsec; 1424 if (time_second != lsec) { 1425 vm_pageout_req_swapout |= VM_SWAP_IDLE; 1426 vm_req_vmdaemon(); 1427 lsec = time_second; 1428 } 1429 } 1430 #endif 1431 1432 /* 1433 * If we didn't get enough free pages, and we have skipped a vnode 1434 * in a writeable object, wakeup the sync daemon. And kick swapout 1435 * if we did not get enough free pages. 1436 */ 1437 if (vm_paging_target() > 0) { 1438 if (vnodes_skipped && vm_page_count_min(0)) 1439 speedup_syncer(); 1440 #if !defined(NO_SWAPPING) 1441 if (vm_swap_enabled && vm_page_count_target()) { 1442 vm_req_vmdaemon(); 1443 vm_pageout_req_swapout |= VM_SWAP_NORMAL; 1444 } 1445 #endif 1446 } 1447 1448 /* 1449 * Handle catastrophic conditions. Under good conditions we should 1450 * be at the target, well beyond our minimum. If we could not even 1451 * reach our minimum the system is under heavy stress. 1452 * 1453 * Determine whether we have run out of memory. This occurs when 1454 * swap_pager_full is TRUE and the only pages left in the page 1455 * queues are dirty. We will still likely have page shortages. 1456 * 1457 * - swap_pager_full is set if insufficient swap was 1458 * available to satisfy a requested pageout. 1459 * 1460 * - the inactive queue is bloated (4 x size of active queue), 1461 * meaning it is unable to get rid of dirty pages and. 1462 * 1463 * - vm_page_count_min() without counting pages recycled from the 1464 * active queue (recycle_count) means we could not recover 1465 * enough pages to meet bare minimum needs. This test only 1466 * works if the inactive queue is bloated. 1467 * 1468 * - due to a positive avail_shortage we shifted the remaining 1469 * dirty pages from the active queue to the inactive queue 1470 * trying to find clean ones to free. 1471 */ 1472 if (swap_pager_full && vm_page_count_min(recycle_count)) 1473 kprintf("Warning: system low on memory+swap!\n"); 1474 if (swap_pager_full && vm_page_count_min(recycle_count) && 1475 vmstats.v_inactive_count > vmstats.v_active_count * 4 && 1476 avail_shortage > 0) { 1477 /* 1478 * Kill something. 1479 */ 1480 info.bigproc = NULL; 1481 info.bigsize = 0; 1482 allproc_scan(vm_pageout_scan_callback, &info); 1483 if (info.bigproc != NULL) { 1484 killproc(info.bigproc, "out of swap space"); 1485 info.bigproc->p_nice = PRIO_MIN; 1486 info.bigproc->p_usched->resetpriority( 1487 FIRST_LWP_IN_PROC(info.bigproc)); 1488 wakeup(&vmstats.v_free_count); 1489 PRELE(info.bigproc); 1490 } 1491 } 1492 } 1493 1494 /* 1495 * The caller must hold proc_token. 1496 */ 1497 static int 1498 vm_pageout_scan_callback(struct proc *p, void *data) 1499 { 1500 struct vm_pageout_scan_info *info = data; 1501 vm_offset_t size; 1502 1503 /* 1504 * Never kill system processes or init. If we have configured swap 1505 * then try to avoid killing low-numbered pids. 1506 */ 1507 if ((p->p_flags & P_SYSTEM) || (p->p_pid == 1) || 1508 ((p->p_pid < 48) && (vm_swap_size != 0))) { 1509 return (0); 1510 } 1511 1512 /* 1513 * if the process is in a non-running type state, 1514 * don't touch it. 
1515 */ 1516 if (p->p_stat != SACTIVE && p->p_stat != SSTOP) 1517 return (0); 1518 1519 /* 1520 * Get the approximate process size. Note that anonymous pages 1521 * with backing swap will be counted twice, but there should not 1522 * be too many such pages due to the stress the VM system is 1523 * under at this point. 1524 */ 1525 size = vmspace_anonymous_count(p->p_vmspace) + 1526 vmspace_swap_count(p->p_vmspace); 1527 1528 /* 1529 * If the this process is bigger than the biggest one 1530 * remember it. 1531 */ 1532 if (info->bigsize < size) { 1533 if (info->bigproc) 1534 PRELE(info->bigproc); 1535 PHOLD(p); 1536 info->bigproc = p; 1537 info->bigsize = size; 1538 } 1539 lwkt_yield(); 1540 return(0); 1541 } 1542 1543 /* 1544 * This routine tries to maintain the pseudo LRU active queue, 1545 * so that during long periods of time where there is no paging, 1546 * that some statistic accumulation still occurs. This code 1547 * helps the situation where paging just starts to occur. 1548 */ 1549 static void 1550 vm_pageout_page_stats(int q) 1551 { 1552 static int fullintervalcount = 0; 1553 struct vm_page marker; 1554 vm_page_t m; 1555 int pcount, tpcount; /* Number of pages to check */ 1556 int page_shortage; 1557 1558 page_shortage = (vmstats.v_inactive_target + vmstats.v_cache_max + 1559 vmstats.v_free_min) - 1560 (vmstats.v_free_count + vmstats.v_inactive_count + 1561 vmstats.v_cache_count); 1562 1563 if (page_shortage <= 0) 1564 return; 1565 1566 pcount = vm_page_queues[PQ_ACTIVE + q].lcnt; 1567 fullintervalcount += vm_pageout_stats_interval; 1568 if (fullintervalcount < vm_pageout_full_stats_interval) { 1569 tpcount = (vm_pageout_stats_max * pcount) / 1570 vmstats.v_page_count + 1; 1571 if (pcount > tpcount) 1572 pcount = tpcount; 1573 } else { 1574 fullintervalcount = 0; 1575 } 1576 1577 bzero(&marker, sizeof(marker)); 1578 marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER; 1579 marker.queue = PQ_ACTIVE + q; 1580 marker.pc = q; 1581 marker.wire_count = 1; 1582 1583 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1584 TAILQ_INSERT_HEAD(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1585 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1586 1587 while ((m = TAILQ_NEXT(&marker, pageq)) != NULL && 1588 pcount-- > 0) 1589 { 1590 int actcount; 1591 1592 vm_page_and_queue_spin_lock(m); 1593 if (m != TAILQ_NEXT(&marker, pageq)) { 1594 vm_page_and_queue_spin_unlock(m); 1595 ++pcount; 1596 continue; 1597 } 1598 KKASSERT(m->queue - m->pc == PQ_ACTIVE); 1599 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1600 TAILQ_INSERT_AFTER(&vm_page_queues[PQ_ACTIVE + q].pl, m, 1601 &marker, pageq); 1602 1603 /* 1604 * Ignore markers 1605 */ 1606 if (m->flags & PG_MARKER) { 1607 vm_page_and_queue_spin_unlock(m); 1608 continue; 1609 } 1610 1611 /* 1612 * Ignore pages we can't busy 1613 */ 1614 if (vm_page_busy_try(m, TRUE)) { 1615 vm_page_and_queue_spin_unlock(m); 1616 continue; 1617 } 1618 vm_page_and_queue_spin_unlock(m); 1619 KKASSERT(m->queue - m->pc == PQ_ACTIVE); 1620 1621 /* 1622 * We now have a safely busied page, the page and queue 1623 * spinlocks have been released. 1624 * 1625 * Ignore held pages 1626 */ 1627 if (m->hold_count) { 1628 vm_page_wakeup(m); 1629 continue; 1630 } 1631 1632 /* 1633 * Calculate activity 1634 */ 1635 actcount = 0; 1636 if (m->flags & PG_REFERENCED) { 1637 vm_page_flag_clear(m, PG_REFERENCED); 1638 actcount += 1; 1639 } 1640 actcount += pmap_ts_referenced(m); 1641 1642 /* 1643 * Update act_count and move page to end of queue. 
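 *
 * Referenced pages gain ACT_ADVANCE plus the pmap reference count
 * (capped at ACT_MAX); unreferenced pages decay by ACT_DECLINE and
 * are deactivated once act_count has reached zero.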
1644 */ 1645 if (actcount) { 1646 m->act_count += ACT_ADVANCE + actcount; 1647 if (m->act_count > ACT_MAX) 1648 m->act_count = ACT_MAX; 1649 vm_page_and_queue_spin_lock(m); 1650 if (m->queue - m->pc == PQ_ACTIVE) { 1651 TAILQ_REMOVE( 1652 &vm_page_queues[PQ_ACTIVE + q].pl, 1653 m, pageq); 1654 TAILQ_INSERT_TAIL( 1655 &vm_page_queues[PQ_ACTIVE + q].pl, 1656 m, pageq); 1657 } 1658 vm_page_and_queue_spin_unlock(m); 1659 vm_page_wakeup(m); 1660 continue; 1661 } 1662 1663 if (m->act_count == 0) { 1664 /* 1665 * We turn off page access, so that we have 1666 * more accurate RSS stats. We don't do this 1667 * in the normal page deactivation when the 1668 * system is loaded VM wise, because the 1669 * cost of the large number of page protect 1670 * operations would be higher than the value 1671 * of doing the operation. 1672 * 1673 * We use the marker to save our place so 1674 * we can release the spin lock. both (m) 1675 * and (next) will be invalid. 1676 */ 1677 vm_page_protect(m, VM_PROT_NONE); 1678 vm_page_deactivate(m); 1679 } else { 1680 m->act_count -= min(m->act_count, ACT_DECLINE); 1681 vm_page_and_queue_spin_lock(m); 1682 if (m->queue - m->pc == PQ_ACTIVE) { 1683 TAILQ_REMOVE( 1684 &vm_page_queues[PQ_ACTIVE + q].pl, 1685 m, pageq); 1686 TAILQ_INSERT_TAIL( 1687 &vm_page_queues[PQ_ACTIVE + q].pl, 1688 m, pageq); 1689 } 1690 vm_page_and_queue_spin_unlock(m); 1691 } 1692 vm_page_wakeup(m); 1693 } 1694 1695 /* 1696 * Remove our local marker 1697 */ 1698 vm_page_queues_spin_lock(PQ_ACTIVE + q); 1699 TAILQ_REMOVE(&vm_page_queues[PQ_ACTIVE + q].pl, &marker, pageq); 1700 vm_page_queues_spin_unlock(PQ_ACTIVE + q); 1701 } 1702 1703 static int 1704 vm_pageout_free_page_calc(vm_size_t count) 1705 { 1706 if (count < vmstats.v_page_count) 1707 return 0; 1708 /* 1709 * free_reserved needs to include enough for the largest swap pager 1710 * structures plus enough for any pv_entry structs when paging. 1711 * 1712 * v_free_min normal allocations 1713 * v_free_reserved system allocations 1714 * v_pageout_free_min allocations by pageout daemon 1715 * v_interrupt_free_min low level allocations (e.g swap structures) 1716 */ 1717 if (vmstats.v_page_count > 1024) 1718 vmstats.v_free_min = 64 + (vmstats.v_page_count - 1024) / 200; 1719 else 1720 vmstats.v_free_min = 64; 1721 vmstats.v_free_reserved = vmstats.v_free_min * 4 / 8 + 7; 1722 vmstats.v_free_severe = vmstats.v_free_min * 4 / 8 + 0; 1723 vmstats.v_pageout_free_min = vmstats.v_free_min * 2 / 8 + 7; 1724 vmstats.v_interrupt_free_min = vmstats.v_free_min * 1 / 8 + 7; 1725 1726 return 1; 1727 } 1728 1729 1730 /* 1731 * vm_pageout is the high level pageout daemon. 1732 * 1733 * No requirements. 1734 */ 1735 static void 1736 vm_pageout_thread(void) 1737 { 1738 int pass; 1739 int q; 1740 1741 /* 1742 * Initialize some paging parameters. 1743 */ 1744 curthread->td_flags |= TDF_SYSTHREAD; 1745 1746 if (vmstats.v_page_count < 2000) 1747 vm_pageout_page_count = 8; 1748 1749 vm_pageout_free_page_calc(vmstats.v_page_count); 1750 1751 /* 1752 * v_free_target and v_cache_min control pageout hysteresis. Note 1753 * that these are more a measure of the VM cache queue hysteresis 1754 * then the VM free queue. Specifically, v_free_target is the 1755 * high water mark (free+cache pages). 1756 * 1757 * v_free_reserved + v_cache_min (mostly means v_cache_min) is the 1758 * low water mark, while v_free_min is the stop. v_cache_min must 1759 * be big enough to handle memory needs while the pageout daemon 1760 * is signalled and run to free more pages. 
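 *
 * As a purely illustrative example (assuming 4K pages and roughly
 * 262144 pages of managed memory), vm_pageout_free_page_calc()
 * above yields v_free_min = 64 + (262144 - 1024) / 200 = 1369
 * pages and v_free_reserved = 1369 * 4 / 8 + 7 = 691 pages, so the
 * larger branch below would set v_free_target to
 * 4 * 1369 + 691 = 6167 pages.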
1761 */ 1762 if (vmstats.v_free_count > 6144) 1763 vmstats.v_free_target = 4 * vmstats.v_free_min + vmstats.v_free_reserved; 1764 else 1765 vmstats.v_free_target = 2 * vmstats.v_free_min + vmstats.v_free_reserved; 1766 1767 /* 1768 * NOTE: With the new buffer cache b_act_count we want the default 1769 * inactive target to be a percentage of available memory. 1770 * 1771 * The inactive target essentially determines the minimum 1772 * number of 'temporary' pages capable of caching one-time-use 1773 * files when the VM system is otherwise full of pages 1774 * belonging to multi-time-use files or active program data. 1775 * 1776 * NOTE: The inactive target is aggressively persued only if the 1777 * inactive queue becomes too small. If the inactive queue 1778 * is large enough to satisfy page movement to free+cache 1779 * then it is repopulated more slowly from the active queue. 1780 * This allows a general inactive_target default to be set. 1781 * 1782 * There is an issue here for processes which sit mostly idle 1783 * 'overnight', such as sshd, tcsh, and X. Any movement from 1784 * the active queue will eventually cause such pages to 1785 * recycle eventually causing a lot of paging in the morning. 1786 * To reduce the incidence of this pages cycled out of the 1787 * buffer cache are moved directly to the inactive queue if 1788 * they were only used once or twice. 1789 * 1790 * The vfs.vm_cycle_point sysctl can be used to adjust this. 1791 * Increasing the value (up to 64) increases the number of 1792 * buffer recyclements which go directly to the inactive queue. 1793 */ 1794 if (vmstats.v_free_count > 2048) { 1795 vmstats.v_cache_min = vmstats.v_free_target; 1796 vmstats.v_cache_max = 2 * vmstats.v_cache_min; 1797 } else { 1798 vmstats.v_cache_min = 0; 1799 vmstats.v_cache_max = 0; 1800 } 1801 vmstats.v_inactive_target = vmstats.v_free_count / 4; 1802 1803 /* XXX does not really belong here */ 1804 if (vm_page_max_wired == 0) 1805 vm_page_max_wired = vmstats.v_free_count / 3; 1806 1807 if (vm_pageout_stats_max == 0) 1808 vm_pageout_stats_max = vmstats.v_free_target; 1809 1810 /* 1811 * Set interval in seconds for stats scan. 1812 */ 1813 if (vm_pageout_stats_interval == 0) 1814 vm_pageout_stats_interval = 5; 1815 if (vm_pageout_full_stats_interval == 0) 1816 vm_pageout_full_stats_interval = vm_pageout_stats_interval * 4; 1817 1818 1819 /* 1820 * Set maximum free per pass 1821 */ 1822 if (vm_pageout_stats_free_max == 0) 1823 vm_pageout_stats_free_max = 5; 1824 1825 swap_pager_swap_init(); 1826 pass = 0; 1827 1828 /* 1829 * The pageout daemon is never done, so loop forever. 1830 */ 1831 while (TRUE) { 1832 int error; 1833 int delta1; 1834 int delta2; 1835 int avail_shortage; 1836 int inactive_shortage; 1837 int vnodes_skipped = 0; 1838 int recycle_count = 0; 1839 int tmp; 1840 1841 /* 1842 * Wait for an action request. If we timeout check to 1843 * see if paging is needed (in case the normal wakeup 1844 * code raced us). 1845 */ 1846 if (vm_pages_needed == 0) { 1847 error = tsleep(&vm_pages_needed, 1848 0, "psleep", 1849 vm_pageout_stats_interval * hz); 1850 if (error && 1851 vm_paging_needed() == 0 && 1852 vm_pages_needed == 0) { 1853 for (q = 0; q < PQ_L2_SIZE; ++q) 1854 vm_pageout_page_stats(q); 1855 continue; 1856 } 1857 vm_pages_needed = 1; 1858 } 1859 1860 mycpu->gd_cnt.v_pdwakeups++; 1861 1862 /* 1863 * Do whatever cleanup that the pmap code can. 1864 */ 1865 pmap_collect(); 1866 1867 /* 1868 * Scan for pageout. Try to avoid thrashing the system 1869 * with activity. 
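 *
 * The per-queue scans below distribute the shortage across the
 * PQ_L2_SIZE page queues using PQAVERAGE(), a ceiling division
 * with one extra page of slack per queue.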
1870 * 1871 * Calculate our target for the number of free+cache pages we 1872 * want to get to. This is higher then the number that causes 1873 * allocations to stall (severe) in order to provide hysteresis, 1874 * and if we don't make it all the way but get to the minimum 1875 * we're happy. Goose it a bit if there are multipler 1876 * requests for memory. 1877 */ 1878 avail_shortage = vm_paging_target() + vm_pageout_deficit; 1879 vm_pageout_deficit = 0; 1880 delta1 = 0; 1881 if (avail_shortage > 0) { 1882 for (q = 0; q < PQ_L2_SIZE; ++q) { 1883 delta1 += vm_pageout_scan_inactive( 1884 pass, q, 1885 PQAVERAGE(avail_shortage), 1886 &vnodes_skipped); 1887 } 1888 avail_shortage -= delta1; 1889 } 1890 1891 /* 1892 * Figure out how many active pages we must deactivate. If 1893 * we were able to reach our target with just the inactive 1894 * scan above we limit the number of active pages we 1895 * deactivate to reduce unnecessary work. 1896 */ 1897 inactive_shortage = vmstats.v_inactive_target - 1898 vmstats.v_inactive_count; 1899 1900 /* 1901 * If we were unable to free sufficient inactive pages to 1902 * satisfy the free/cache queue requirements then simply 1903 * reaching the inactive target may not be good enough. 1904 * Try to deactivate pages in excess of the target based 1905 * on the shortfall. 1906 * 1907 * However to prevent thrashing the VM system do not 1908 * deactivate more than an additional 1/10 the inactive 1909 * target's worth of active pages. 1910 */ 1911 if (avail_shortage > 0) { 1912 tmp = avail_shortage * 2; 1913 if (tmp > vmstats.v_inactive_target / 10) 1914 tmp = vmstats.v_inactive_target / 10; 1915 inactive_shortage += tmp; 1916 } 1917 1918 if (avail_shortage > 0 || inactive_shortage > 0) { 1919 delta2 = 0; 1920 for (q = 0; q < PQ_L2_SIZE; ++q) { 1921 delta2 += vm_pageout_scan_active( 1922 pass, q, 1923 PQAVERAGE(avail_shortage), 1924 PQAVERAGE(inactive_shortage), 1925 &recycle_count); 1926 } 1927 inactive_shortage -= delta2; 1928 avail_shortage -= delta2; 1929 } 1930 1931 /* 1932 * Finally free enough cache pages to meet our free page 1933 * requirement and take more drastic measures if we are 1934 * still in trouble. 1935 */ 1936 vm_pageout_scan_cache(avail_shortage, vnodes_skipped, 1937 recycle_count); 1938 1939 /* 1940 * Wait for more work. 1941 */ 1942 if (avail_shortage > 0) { 1943 ++pass; 1944 if (swap_pager_full) { 1945 /* 1946 * Running out of memory, catastrophic back-off 1947 * to one-second intervals. 1948 */ 1949 tsleep(&vm_pages_needed, 0, "pdelay", hz); 1950 } else if (pass < 10 && vm_pages_needed > 1) { 1951 /* 1952 * Normal operation, additional processes 1953 * have already kicked us. Retry immediately. 1954 */ 1955 } else if (pass < 10) { 1956 /* 1957 * Normal operation, fewer processes. Delay 1958 * a bit but allow wakeups. 1959 */ 1960 vm_pages_needed = 0; 1961 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10); 1962 vm_pages_needed = 1; 1963 } else { 1964 /* 1965 * We've taken too many passes, forced delay. 
1966 */ 1967 tsleep(&vm_pages_needed, 0, "pdelay", hz / 10); 1968 } 1969 } else { 1970 /* 1971 * Interlocked wakeup of waiters (non-optional) 1972 */ 1973 pass = 0; 1974 if (vm_pages_needed && !vm_page_count_min(0)) { 1975 wakeup(&vmstats.v_free_count); 1976 vm_pages_needed = 0; 1977 } 1978 } 1979 } 1980 } 1981 1982 static struct kproc_desc page_kp = { 1983 "pagedaemon", 1984 vm_pageout_thread, 1985 &pagethread 1986 }; 1987 SYSINIT(pagedaemon, SI_SUB_KTHREAD_PAGE, SI_ORDER_FIRST, kproc_start, &page_kp) 1988 1989 1990 /* 1991 * Called after allocating a page out of the cache or free queue 1992 * to possibly wake the pagedaemon up to replentish our supply. 1993 * 1994 * We try to generate some hysteresis by waking the pagedaemon up 1995 * when our free+cache pages go below the free_min+cache_min level. 1996 * The pagedaemon tries to get the count back up to at least the 1997 * minimum, and through to the target level if possible. 1998 * 1999 * If the pagedaemon is already active bump vm_pages_needed as a hint 2000 * that there are even more requests pending. 2001 * 2002 * SMP races ok? 2003 * No requirements. 2004 */ 2005 void 2006 pagedaemon_wakeup(void) 2007 { 2008 if (vm_paging_needed() && curthread != pagethread) { 2009 if (vm_pages_needed == 0) { 2010 vm_pages_needed = 1; /* SMP race ok */ 2011 wakeup(&vm_pages_needed); 2012 } else if (vm_page_count_min(0)) { 2013 ++vm_pages_needed; /* SMP race ok */ 2014 } 2015 } 2016 } 2017 2018 #if !defined(NO_SWAPPING) 2019 2020 /* 2021 * SMP races ok? 2022 * No requirements. 2023 */ 2024 static void 2025 vm_req_vmdaemon(void) 2026 { 2027 static int lastrun = 0; 2028 2029 if ((ticks > (lastrun + hz)) || (ticks < lastrun)) { 2030 wakeup(&vm_daemon_needed); 2031 lastrun = ticks; 2032 } 2033 } 2034 2035 static int vm_daemon_callback(struct proc *p, void *data __unused); 2036 2037 /* 2038 * No requirements. 2039 */ 2040 static void 2041 vm_daemon(void) 2042 { 2043 /* 2044 * XXX vm_daemon_needed specific token? 2045 */ 2046 while (TRUE) { 2047 tsleep(&vm_daemon_needed, 0, "psleep", 0); 2048 if (vm_pageout_req_swapout) { 2049 swapout_procs(vm_pageout_req_swapout); 2050 vm_pageout_req_swapout = 0; 2051 } 2052 /* 2053 * scan the processes for exceeding their rlimits or if 2054 * process is swapped out -- deactivate pages 2055 */ 2056 allproc_scan(vm_daemon_callback, NULL); 2057 } 2058 } 2059 2060 /* 2061 * Caller must hold proc_token. 2062 */ 2063 static int 2064 vm_daemon_callback(struct proc *p, void *data __unused) 2065 { 2066 vm_pindex_t limit, size; 2067 2068 /* 2069 * if this is a system process or if we have already 2070 * looked at this process, skip it. 2071 */ 2072 if (p->p_flags & (P_SYSTEM | P_WEXIT)) 2073 return (0); 2074 2075 /* 2076 * if the process is in a non-running type state, 2077 * don't touch it. 2078 */ 2079 if (p->p_stat != SACTIVE && p->p_stat != SSTOP) 2080 return (0); 2081 2082 /* 2083 * get a limit 2084 */ 2085 limit = OFF_TO_IDX(qmin(p->p_rlimit[RLIMIT_RSS].rlim_cur, 2086 p->p_rlimit[RLIMIT_RSS].rlim_max)); 2087 2088 /* 2089 * let processes that are swapped out really be 2090 * swapped out. Set the limit to nothing to get as 2091 * many pages out to swap as possible. 
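 *
 * (The limit computed above comes from RLIMIT_RSS via OFF_TO_IDX(),
 * so it is expressed in pages; forcing it to 0 below asks
 * vm_pageout_map_deactivate_pages() to deactivate as much of the
 * resident set as it can.)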
	 */
	if (p->p_flags & P_SWAPPEDOUT)
		limit = 0;

	lwkt_gettoken(&p->p_vmspace->vm_map.token);
	size = vmspace_resident_count(p->p_vmspace);
	if (limit >= 0 && size >= limit) {
		vm_pageout_map_deactivate_pages(&p->p_vmspace->vm_map, limit);
	}
	lwkt_reltoken(&p->p_vmspace->vm_map.token);
	return (0);
}

#endif