xref: /netbsd-src/sys/uvm/uvm_pdaemon.c (revision 9fb66d812c00ebfb445c0b47dea128f32aa6fe96)
1 /*	$NetBSD: uvm_pdaemon.c,v 1.132 2021/04/17 01:53:58 mrg Exp $	*/
2 
3 /*
4  * Copyright (c) 1997 Charles D. Cranor and Washington University.
5  * Copyright (c) 1991, 1993, The Regents of the University of California.
6  *
7  * All rights reserved.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * The Mach Operating System project at Carnegie-Mellon University.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)vm_pageout.c        8.5 (Berkeley) 2/14/94
37  * from: Id: uvm_pdaemon.c,v 1.1.2.32 1998/02/06 05:26:30 chs Exp
38  *
39  *
40  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
41  * All rights reserved.
42  *
43  * Permission to use, copy, modify and distribute this software and
44  * its documentation is hereby granted, provided that both the copyright
45  * notice and this permission notice appear in all copies of the
46  * software, derivative works or modified versions, and any portions
47  * thereof, and that both notices appear in supporting documentation.
48  *
49  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
50  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
51  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
52  *
53  * Carnegie Mellon requests users of this software to return to
54  *
55  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
56  *  School of Computer Science
57  *  Carnegie Mellon University
58  *  Pittsburgh PA 15213-3890
59  *
60  * any improvements or extensions that they make and grant Carnegie the
61  * rights to redistribute these changes.
62  */
63 
64 /*
65  * uvm_pdaemon.c: the page daemon
66  */
67 
68 #include <sys/cdefs.h>
69 __KERNEL_RCSID(0, "$NetBSD: uvm_pdaemon.c,v 1.132 2021/04/17 01:53:58 mrg Exp $");
70 
71 #include "opt_uvmhist.h"
72 #include "opt_readahead.h"
73 
74 #define	__RWLOCK_PRIVATE
75 
76 #include <sys/param.h>
77 #include <sys/proc.h>
78 #include <sys/systm.h>
79 #include <sys/kernel.h>
80 #include <sys/pool.h>
81 #include <sys/buf.h>
82 #include <sys/module.h>
83 #include <sys/atomic.h>
84 #include <sys/kthread.h>
85 
86 #include <uvm/uvm.h>
87 #include <uvm/uvm_pdpolicy.h>
88 #include <uvm/uvm_pgflcache.h>
89 
90 #ifdef UVMHIST
91 static struct kern_history_ent pdhistbuf[UVMHIST_PDHIST_SIZE];
92 UVMHIST_DEFINE(pdhist) = UVMHIST_INITIALIZER(pdhisthist, pdhistbuf);
93 #endif
94 
95 /*
96  * UVMPD_NUMDIRTYREACTS is how many dirty pages the pagedaemon will reactivate
97  * in a pass thru the inactive list when swap is full.  the value should be
98  * "small"... if it's too large we'll cycle the active pages thru the inactive
99  * queue too quickly for them to be referenced and avoid being freed.
100  */
101 
102 #define	UVMPD_NUMDIRTYREACTS	16
103 
104 /*
105  * local prototypes
106  */
107 
108 static void	uvmpd_scan(void);
109 static void	uvmpd_scan_queue(void);
110 static void	uvmpd_tune(void);
111 static void	uvmpd_pool_drain_thread(void *);
112 static void	uvmpd_pool_drain_wakeup(void);
113 
114 static unsigned int uvm_pagedaemon_waiters;
115 
116 /* State for the pool drainer thread */
117 static kmutex_t uvmpd_lock __cacheline_aligned;
118 static kcondvar_t uvmpd_pool_drain_cv;
119 static bool uvmpd_pool_drain_run = false;
120 
121 /*
122  * XXX hack to avoid hangs when large processes fork.
123  */
124 u_int uvm_extrapages;
125 
126 /*
127  * uvm_wait: wait (sleep) for the page daemon to free some pages
128  *
129  * => should be called with all locks released
130  * => should _not_ be called by the page daemon (to avoid deadlock)
131  */
132 
133 void
134 uvm_wait(const char *wmsg)
135 {
136 	int timo = 0;
137 
138 	if (uvm.pagedaemon_lwp == NULL)
139 		panic("out of memory before the pagedaemon thread exists");
140 
141 	mutex_spin_enter(&uvmpd_lock);
142 
143 	/*
144 	 * check for page daemon going to sleep (waiting for itself)
145 	 */
146 
147 	if (curlwp == uvm.pagedaemon_lwp && uvmexp.paging == 0) {
148 		/*
149 		 * now we have a problem: the pagedaemon wants to go to
150 		 * sleep until it frees more memory.   but how can it
151 		 * free more memory if it is asleep?  that is a deadlock.
152 		 * we have two options:
153 		 *  [1] panic now
154 		 *  [2] put a timeout on the sleep, thus causing the
155 		 *      pagedaemon to only pause (rather than sleep forever)
156 		 *
157 		 * note that option [2] will only help us if we get lucky
158 		 * and some other process on the system breaks the deadlock
159 		 * by exiting or freeing memory (thus allowing the pagedaemon
160 		 * to continue).  for now we panic if DEBUG is defined,
161 		 * otherwise we hope for the best with option [2] (better
162 		 * yet, this should never happen in the first place!).
163 		 */
164 
165 		printf("pagedaemon: deadlock detected!\n");
166 		timo = hz >> 3;		/* set timeout */
167 #if defined(DEBUG)
168 		/* DEBUG: panic so we can debug it */
169 		panic("pagedaemon deadlock");
170 #endif
171 	}
172 
173 	uvm_pagedaemon_waiters++;
174 	wakeup(&uvm.pagedaemon);		/* wake the daemon! */
175 	UVM_UNLOCK_AND_WAIT(&uvmexp.free, &uvmpd_lock, false, wmsg, timo);
176 }
177 
178 /*
179  * uvm_kick_pdaemon: perform checks to determine if we need to
180  * give the pagedaemon a nudge, and do so if necessary.
181  */
182 
183 void
184 uvm_kick_pdaemon(void)
185 {
186 	int fpages = uvm_availmem(false);
187 
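	/*
	 * wake the daemon if free pages plus those already being paged
	 * out are below freemin, or below freetarg while the pdpolicy
	 * still wants a scan, or if kernel VA is starved.
	 */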
188 	if (fpages + uvmexp.paging < uvmexp.freemin ||
189 	    (fpages + uvmexp.paging < uvmexp.freetarg &&
190 	     uvmpdpol_needsscan_p()) ||
191 	     uvm_km_va_starved_p()) {
192 	     	mutex_spin_enter(&uvmpd_lock);
193 		wakeup(&uvm.pagedaemon);
194 	     	mutex_spin_exit(&uvmpd_lock);
195 	}
196 }
197 
198 /*
199  * uvmpd_tune: tune paging parameters
200  *
201  * => called whenever memory is added to (or removed from) the system
202  */
203 
204 static void
205 uvmpd_tune(void)
206 {
207 	int val;
208 
209 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
210 
211 	/*
212 	 * try to keep 0.5% of available RAM free, but limit to between
213 	 * 128k and 1024k per-CPU.  XXX: what are these values good for?
214 	 */
215 	val = uvmexp.npages / 200;
216 	val = MAX(val, (128*1024) >> PAGE_SHIFT);
217 	val = MIN(val, (1024*1024) >> PAGE_SHIFT);
218 	val *= ncpu;
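	/*
	 * e.g. with 4 KB pages the clamp above is 32..256 pages per CPU,
	 * so an 8-CPU machine gets a freemin between 256 and 2048 pages
	 * (1 MB to 8 MB) before the reserve_kernel floor below.
	 */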
219 
220 	/* Make sure there's always a user page free. */
221 	if (val < uvmexp.reserve_kernel + 1)
222 		val = uvmexp.reserve_kernel + 1;
223 	uvmexp.freemin = val;
224 
225 	/* Calculate free target. */
226 	val = (uvmexp.freemin * 4) / 3;
227 	if (val <= uvmexp.freemin)
228 		val = uvmexp.freemin + 1;
229 	uvmexp.freetarg = val + atomic_swap_uint(&uvm_extrapages, 0);
230 
231 	uvmexp.wiredmax = uvmexp.npages / 3;
232 	UVMHIST_LOG(pdhist, "<- done, freemin=%jd, freetarg=%jd, wiredmax=%jd",
233 	      uvmexp.freemin, uvmexp.freetarg, uvmexp.wiredmax, 0);
234 }
235 
236 /*
237  * uvm_pageout: the main loop for the pagedaemon
238  */
239 
240 void
241 uvm_pageout(void *arg)
242 {
243 	int npages = 0;
244 	int extrapages = 0;
245 	int fpages;
246 
247 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
248 
249 	UVMHIST_LOG(pdhist,"<starting uvm pagedaemon>", 0, 0, 0, 0);
250 
251 	mutex_init(&uvmpd_lock, MUTEX_DEFAULT, IPL_VM);
252 	cv_init(&uvmpd_pool_drain_cv, "pooldrain");
253 
254 	/* Create the pool drainer kernel thread. */
255 	if (kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL,
256 	    uvmpd_pool_drain_thread, NULL, NULL, "pooldrain"))
257 		panic("fork pooldrain");
258 
259 	/*
260 	 * ensure correct priority and set paging parameters...
261 	 */
262 
263 	uvm.pagedaemon_lwp = curlwp;
264 	npages = uvmexp.npages;
265 	uvmpd_tune();
266 
267 	/*
268 	 * main loop
269 	 */
270 
271 	for (;;) {
272 		bool needsscan, needsfree, kmem_va_starved;
273 
274 		kmem_va_starved = uvm_km_va_starved_p();
275 
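		/*
		 * sleep unless there are waiters with no pageouts still
		 * in flight, or the kernel VA space is starved.
		 */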
276 		mutex_spin_enter(&uvmpd_lock);
277 		if ((uvm_pagedaemon_waiters == 0 || uvmexp.paging > 0) &&
278 		    !kmem_va_starved) {
279 			UVMHIST_LOG(pdhist,"  <<SLEEPING>>",0,0,0,0);
280 			UVM_UNLOCK_AND_WAIT(&uvm.pagedaemon,
281 			    &uvmpd_lock, false, "pgdaemon", 0);
282 			uvmexp.pdwoke++;
283 			UVMHIST_LOG(pdhist,"  <<WOKE UP>>",0,0,0,0);
284 		} else {
285 			mutex_spin_exit(&uvmpd_lock);
286 		}
287 
288 		/*
289 		 * now recompute inactive count
290 		 */
291 
292 		if (npages != uvmexp.npages || extrapages != uvm_extrapages) {
293 			npages = uvmexp.npages;
294 			extrapages = uvm_extrapages;
295 			uvmpd_tune();
296 		}
297 
298 		uvmpdpol_tune();
299 
300 		/*
301 		 * Estimate a hint.  Note that buffer memory is returned to
302 		 * the system only when an entire pool page is empty.
303 		 */
304 		fpages = uvm_availmem(false);
305 		UVMHIST_LOG(pdhist,"  free/ftarg=%jd/%jd",
306 		    fpages, uvmexp.freetarg, 0,0);
307 
308 		needsfree = fpages + uvmexp.paging < uvmexp.freetarg;
309 		needsscan = needsfree || uvmpdpol_needsscan_p();
310 
311 		/*
312 		 * scan if needed
313 		 */
314 		if (needsscan) {
315 			uvmpd_scan();
316 		}
317 
318 		/*
319 		 * if there's any free memory to be had,
320 		 * wake up any waiters.
321 		 */
322 		if (uvm_availmem(false) > uvmexp.reserve_kernel ||
323 		    uvmexp.paging == 0) {
324 			mutex_spin_enter(&uvmpd_lock);
325 			wakeup(&uvmexp.free);
326 			uvm_pagedaemon_waiters = 0;
327 			mutex_spin_exit(&uvmpd_lock);
328 		}
329 
330 		/*
331 		 * scan done.  if we don't need free memory, we're done.
332 		 */
333 
334 		if (!needsfree && !kmem_va_starved)
335 			continue;
336 
337 		/*
338 		 * kick the pool drainer thread.
339 		 */
340 
341 		uvmpd_pool_drain_wakeup();
342 	}
343 	/*NOTREACHED*/
344 }
345 
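/*
 * uvm_pageout_start, uvm_pageout_done: track the number of pages
 * currently being paged out (uvmexp.paging), so that the pagedaemon
 * can account for I/O that is already in flight.
 */
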
346 void
347 uvm_pageout_start(int npages)
348 {
349 
350 	atomic_add_int(&uvmexp.paging, npages);
351 }
352 
353 void
354 uvm_pageout_done(int npages)
355 {
356 
357 	KASSERT(atomic_load_relaxed(&uvmexp.paging) >= npages);
358 
359 	if (npages == 0) {
360 		return;
361 	}
362 
363 	atomic_add_int(&uvmexp.paging, -npages);
364 
365 	/*
366 	 * wake up either the pagedaemon or the LWPs waiting for it.
367 	 */
368 
369 	mutex_spin_enter(&uvmpd_lock);
370 	if (uvm_availmem(false) <= uvmexp.reserve_kernel) {
371 		wakeup(&uvm.pagedaemon);
372 	} else if (uvm_pagedaemon_waiters != 0) {
373 		wakeup(&uvmexp.free);
374 		uvm_pagedaemon_waiters = 0;
375 	}
376 	mutex_spin_exit(&uvmpd_lock);
377 }
378 
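/*
 * uvmpd_page_owner_lock: return the lock protecting the page's owner:
 * the object's vmobjlock, the anon's an_lock, or NULL if the page
 * appears to have no owner.
 */
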
379 static krwlock_t *
380 uvmpd_page_owner_lock(struct vm_page *pg)
381 {
382 	struct uvm_object *uobj = pg->uobject;
383 	struct vm_anon *anon = pg->uanon;
384 	krwlock_t *slock;
385 
386 	KASSERT(mutex_owned(&pg->interlock));
387 
388 #ifdef DEBUG
389 	if (uobj == (void *)0xdeadbeef || anon == (void *)0xdeadbeef) {
390 		return NULL;
391 	}
392 #endif
393 	if (uobj != NULL) {
394 		slock = uobj->vmobjlock;
395 		KASSERTMSG(slock != NULL, "pg %p uobj %p, NULL lock", pg, uobj);
396 	} else if (anon != NULL) {
397 		slock = anon->an_lock;
398 		KASSERTMSG(slock != NULL, "pg %p anon %p, NULL lock", pg, anon);
399 	} else {
400 		slock = NULL;
401 	}
402 	return slock;
403 }
404 
405 /*
406  * uvmpd_trylockowner: trylock the page's owner.
407  *
408  * => called with page interlock held.
409  * => resolve orphaned O->A loaned page.
410  * => return the locked rwlock on success.  otherwise, return NULL.
411  */
412 
413 krwlock_t *
414 uvmpd_trylockowner(struct vm_page *pg)
415 {
416 	krwlock_t *slock, *heldslock;
417 
418 	KASSERT(mutex_owned(&pg->interlock));
419 
420 	slock = uvmpd_page_owner_lock(pg);
421 	if (slock == NULL) {
422 		/* Page may be in state of flux - ignore. */
423 		mutex_exit(&pg->interlock);
424 		return NULL;
425 	}
426 
427 	if (rw_tryenter(slock, RW_WRITER)) {
428 		goto success;
429 	}
430 
431 	/*
432 	 * The try-lock didn't work, so now do a blocking lock after
433 	 * dropping the page interlock.  Prevent the owner lock from
434 	 * being freed by taking a hold on it first.
435 	 */
436 
437 	rw_obj_hold(slock);
438 	mutex_exit(&pg->interlock);
439 	rw_enter(slock, RW_WRITER);
440 	heldslock = slock;
441 
442 	/*
443 	 * Now we hold some owner lock.  Check if the lock we hold
444 	 * is still the lock for the owner of the page.
445 	 * If it is then return it, otherwise release it and return NULL.
446 	 */
447 
448 	mutex_enter(&pg->interlock);
449 	slock = uvmpd_page_owner_lock(pg);
450 	if (heldslock != slock) {
451 		rw_exit(heldslock);
452 		slock = NULL;
453 	}
454 	rw_obj_free(heldslock);
455 	if (slock != NULL) {
456 success:
457 		/*
458 		 * Set PG_ANON if it isn't set already.
459 		 */
460 		if (pg->uobject == NULL && (pg->flags & PG_ANON) == 0) {
461 			KASSERT(pg->loan_count > 0);
462 			pg->loan_count--;
463 			pg->flags |= PG_ANON;
464 			/* anon now owns it */
465 		}
466 	}
467 	mutex_exit(&pg->interlock);
468 	return slock;
469 }
470 
471 #if defined(VMSWAP)
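/*
 * a swapcluster collects dirty swap-backed pages into a contiguous
 * run of swap slots so they can be written out with a single
 * uvm_swap_put() of up to MAXPHYS bytes.
 */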
472 struct swapcluster {
473 	int swc_slot;
474 	int swc_nallocated;
475 	int swc_nused;
476 	struct vm_page *swc_pages[howmany(MAXPHYS, MIN_PAGE_SIZE)];
477 };
478 
479 static void
480 swapcluster_init(struct swapcluster *swc)
481 {
482 
483 	swc->swc_slot = 0;
484 	swc->swc_nused = 0;
485 }
486 
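/*
 * swapcluster_allocslots: reserve a contiguous run of swap slots sized
 * for one MAXPHYS-sized I/O.  returns ENOMEM if swap space is full.
 */
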
487 static int
488 swapcluster_allocslots(struct swapcluster *swc)
489 {
490 	int slot;
491 	int npages;
492 
493 	if (swc->swc_slot != 0) {
494 		return 0;
495 	}
496 
497 	/* Even with strange MAXPHYS, the shift
498 	   implicitly rounds down to a page. */
499 	npages = MAXPHYS >> PAGE_SHIFT;
500 	slot = uvm_swap_alloc(&npages, true);
501 	if (slot == 0) {
502 		return ENOMEM;
503 	}
504 	swc->swc_slot = slot;
505 	swc->swc_nallocated = npages;
506 	swc->swc_nused = 0;
507 
508 	return 0;
509 }
510 
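/*
 * swapcluster_add: assign the next slot in the cluster to the page's
 * owner (anon or aobj) and stash the page for the eventual pageout.
 */
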
511 static int
512 swapcluster_add(struct swapcluster *swc, struct vm_page *pg)
513 {
514 	int slot;
515 	struct uvm_object *uobj;
516 
517 	KASSERT(swc->swc_slot != 0);
518 	KASSERT(swc->swc_nused < swc->swc_nallocated);
519 	KASSERT((pg->flags & PG_SWAPBACKED) != 0);
520 
521 	slot = swc->swc_slot + swc->swc_nused;
522 	uobj = pg->uobject;
523 	if (uobj == NULL) {
524 		KASSERT(rw_write_held(pg->uanon->an_lock));
525 		pg->uanon->an_swslot = slot;
526 	} else {
527 		int result;
528 
529 		KASSERT(rw_write_held(uobj->vmobjlock));
530 		result = uao_set_swslot(uobj, pg->offset >> PAGE_SHIFT, slot);
531 		if (result == -1) {
532 			return ENOMEM;
533 		}
534 	}
535 	swc->swc_pages[swc->swc_nused] = pg;
536 	swc->swc_nused++;
537 
538 	return 0;
539 }
540 
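/*
 * swapcluster_flush: start the pageout of the clustered pages.  a
 * partially filled cluster is only written (and its unused slots
 * freed) when "now" is true; otherwise we keep building it.
 */
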
541 static void
542 swapcluster_flush(struct swapcluster *swc, bool now)
543 {
544 	int slot;
545 	int nused;
546 	int nallocated;
547 	int error __diagused;
548 
549 	if (swc->swc_slot == 0) {
550 		return;
551 	}
552 	KASSERT(swc->swc_nused <= swc->swc_nallocated);
553 
554 	slot = swc->swc_slot;
555 	nused = swc->swc_nused;
556 	nallocated = swc->swc_nallocated;
557 
558 	/*
559 	 * if this is the final pageout we could have a few
560 	 * unused swap blocks.  if so, free them now.
561 	 */
562 
563 	if (nused < nallocated) {
564 		if (!now) {
565 			return;
566 		}
567 		uvm_swap_free(slot + nused, nallocated - nused);
568 	}
569 
570 	/*
571 	 * now start the pageout.
572 	 */
573 
574 	if (nused > 0) {
575 		uvmexp.pdpageouts++;
576 		uvm_pageout_start(nused);
577 		error = uvm_swap_put(slot, swc->swc_pages, nused, 0);
578 		KASSERT(error == 0 || error == ENOMEM);
579 	}
580 
581 	/*
582 	 * zero swc_slot to indicate that we are
583 	 * no longer building a swap-backed cluster.
584 	 */
585 
586 	swc->swc_slot = 0;
587 	swc->swc_nused = 0;
588 }
589 
590 static int
591 swapcluster_nused(struct swapcluster *swc)
592 {
593 
594 	return swc->swc_nused;
595 }
596 
597 /*
598  * uvmpd_dropswap: free any swap allocated to this page.
599  *
600  * => called with owner locked.
601  * => return true if a page had an associated slot.
602  */
603 
604 bool
605 uvmpd_dropswap(struct vm_page *pg)
606 {
607 	bool result = false;
608 	struct vm_anon *anon = pg->uanon;
609 
610 	if ((pg->flags & PG_ANON) && anon->an_swslot) {
611 		uvm_swap_free(anon->an_swslot, 1);
612 		anon->an_swslot = 0;
613 		uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
614 		result = true;
615 	} else if (pg->flags & PG_AOBJ) {
616 		int slot = uao_set_swslot(pg->uobject,
617 		    pg->offset >> PAGE_SHIFT, 0);
618 		if (slot) {
619 			uvm_swap_free(slot, 1);
620 			uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
621 			result = true;
622 		}
623 	}
624 
625 	return result;
626 }
627 
628 #endif /* defined(VMSWAP) */
629 
630 /*
631  * uvmpd_scan_queue: scan the replacement candidate list for pages
632  * to clean or free.
633  *
634  * => we work on meeting our free target by converting inactive pages
635  *    into free pages.
636  * => we handle the building of swap-backed clusters
637  */
638 
639 static void
640 uvmpd_scan_queue(void)
641 {
642 	struct vm_page *p;
643 	struct uvm_object *uobj;
644 	struct vm_anon *anon;
645 #if defined(VMSWAP)
646 	struct swapcluster swc;
647 #endif /* defined(VMSWAP) */
648 	int dirtyreacts;
649 	krwlock_t *slock;
650 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
651 
652 	/*
653 	 * swc.swc_slot is non-zero if we are building a swap cluster.  we want
654 	 * to stay in the loop while we have a page to scan or we have
655 	 * a swap-cluster to build.
656 	 */
657 
658 #if defined(VMSWAP)
659 	swapcluster_init(&swc);
660 #endif /* defined(VMSWAP) */
661 
662 	dirtyreacts = 0;
663 	uvmpdpol_scaninit();
664 
665 	while (/* CONSTCOND */ 1) {
666 
667 		/*
668 		 * see if we've met the free target.
669 		 */
670 
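		/*
		 * we aim for four times freetarg here, and count both the
		 * pageouts already in flight and the pages queued in the
		 * swap cluster as if they were already free.
		 */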
671 		if (uvm_availmem(false) + uvmexp.paging
672 #if defined(VMSWAP)
673 		    + swapcluster_nused(&swc)
674 #endif /* defined(VMSWAP) */
675 		    >= uvmexp.freetarg << 2 ||
676 		    dirtyreacts == UVMPD_NUMDIRTYREACTS) {
677 			UVMHIST_LOG(pdhist,"  met free target: "
678 				    "exit loop", 0, 0, 0, 0);
679 			break;
680 		}
681 
682 		/*
683 		 * first we have the pdpolicy select a victim page
684 		 * and attempt to lock the object that the page
685 		 * belongs to.  if our attempt fails we skip on to
686 		 * the next page (no harm done).  it is important to
687 		 * "try" locking the object as we are locking in the
688 		 * wrong order (pageq -> object) and we don't want to
689 		 * deadlock.
690 		 *
691 		 * the only time we expect to see an ownerless page
692 		 * (i.e. a page with no uobject and !PG_ANON) is if an
693 		 * anon has loaned a page from a uvm_object and the
694 		 * uvm_object has dropped the ownership.  in that
695 		 * case, the anon can "take over" the loaned page
696 		 * and make it its own.
697 		 */
698 
699 		p = uvmpdpol_selectvictim(&slock);
700 		if (p == NULL) {
701 			break;
702 		}
703 		KASSERT(uvmpdpol_pageisqueued_p(p));
704 		KASSERT(uvm_page_owner_locked_p(p, true));
705 		KASSERT(p->wire_count == 0);
706 
707 		/*
708 		 * we are below target and have a new page to consider.
709 		 */
710 
711 		anon = p->uanon;
712 		uobj = p->uobject;
713 
714 		if (p->flags & PG_BUSY) {
715 			rw_exit(slock);
716 			uvmexp.pdbusy++;
717 			continue;
718 		}
719 
720 		/* does the page belong to an object? */
721 		if (uobj != NULL) {
722 			uvmexp.pdobscan++;
723 		} else {
724 #if defined(VMSWAP)
725 			KASSERT(anon != NULL);
726 			uvmexp.pdanscan++;
727 #else /* defined(VMSWAP) */
728 			panic("%s: anon", __func__);
729 #endif /* defined(VMSWAP) */
730 		}
731 
732 
733 		/*
734 		 * we now have the object locked.
735 		 * if the page is not swap-backed, call the object's
736 		 * pager to flush and free the page.
737 		 */
738 
739 #if defined(READAHEAD_STATS)
740 		if ((p->flags & PG_READAHEAD) != 0) {
741 			p->flags &= ~PG_READAHEAD;
742 			uvm_ra_miss.ev_count++;
743 		}
744 #endif /* defined(READAHEAD_STATS) */
745 
746 		if ((p->flags & PG_SWAPBACKED) == 0) {
747 			KASSERT(uobj != NULL);
748 			(void) (uobj->pgops->pgo_put)(uobj, p->offset,
749 			    p->offset + PAGE_SIZE, PGO_CLEANIT|PGO_FREE);
750 			continue;
751 		}
752 
753 		/*
754 		 * the page is swap-backed.  remove all the permissions
755 		 * from the page so we can sync the modified info
756 		 * without any race conditions.  if the page is clean
757 		 * we can free it now and continue.
758 		 */
759 
760 		pmap_page_protect(p, VM_PROT_NONE);
761 		if (uvm_pagegetdirty(p) == UVM_PAGE_STATUS_UNKNOWN) {
762 			if (pmap_clear_modify(p)) {
763 				uvm_pagemarkdirty(p, UVM_PAGE_STATUS_DIRTY);
764 			} else {
765 				uvm_pagemarkdirty(p, UVM_PAGE_STATUS_CLEAN);
766 			}
767 		}
768 		if (uvm_pagegetdirty(p) != UVM_PAGE_STATUS_DIRTY) {
769 			int slot;
770 			int pageidx;
771 
772 			pageidx = p->offset >> PAGE_SHIFT;
773 			uvm_pagefree(p);
774 			atomic_inc_uint(&uvmexp.pdfreed);
775 
776 			/*
777 			 * for anons, we need to remove the page
778 			 * from the anon ourselves.  for aobjs,
779 			 * pagefree did that for us.
780 			 */
781 
782 			if (anon) {
783 				KASSERT(anon->an_swslot != 0);
784 				anon->an_page = NULL;
785 				slot = anon->an_swslot;
786 			} else {
787 				slot = uao_find_swslot(uobj, pageidx);
788 			}
789 			if (slot > 0) {
790 				/* this page is now only in swap. */
791 				KASSERT(uvmexp.swpgonly < uvmexp.swpginuse);
792 				atomic_inc_uint(&uvmexp.swpgonly);
793 			}
794 			rw_exit(slock);
795 			continue;
796 		}
797 
798 #if defined(VMSWAP)
799 		/*
800 		 * this page is dirty, skip it if we'll have met our
801 		 * free target when all the current pageouts complete.
802 		 */
803 
804 		if (uvm_availmem(false) + uvmexp.paging >
805 		    uvmexp.freetarg << 2) {
806 			rw_exit(slock);
807 			continue;
808 		}
809 
810 		/*
811 		 * free any swap space allocated to the page since
812 		 * we'll have to write it again with its new data.
813 		 */
814 
815 		uvmpd_dropswap(p);
816 
817 		/*
818 		 * start new swap pageout cluster (if necessary).
819 		 *
820 		 * if swap is full reactivate this page so that
821 		 * we eventually cycle all pages through the
822 		 * inactive queue.
823 		 */
824 
825 		if (swapcluster_allocslots(&swc)) {
826 			dirtyreacts++;
827 			uvm_pagelock(p);
828 			uvm_pageactivate(p);
829 			uvm_pageunlock(p);
830 			rw_exit(slock);
831 			continue;
832 		}
833 
834 		/*
835 		 * at this point, we're definitely going to reuse this
836 		 * page.  mark the page busy and delayed-free.
837 		 * we should remove the page from the page queues
838 		 * so we don't ever look at it again.
839 		 * adjust counters and such.
840 		 */
841 
842 		p->flags |= PG_BUSY;
843 		UVM_PAGE_OWN(p, "scan_queue");
844 		p->flags |= PG_PAGEOUT;
845 		uvmexp.pgswapout++;
846 
847 		uvm_pagelock(p);
848 		uvm_pagedequeue(p);
849 		uvm_pageunlock(p);
850 
851 		/*
852 		 * add the new page to the cluster.
853 		 */
854 
855 		if (swapcluster_add(&swc, p)) {
856 			p->flags &= ~(PG_BUSY|PG_PAGEOUT);
857 			UVM_PAGE_OWN(p, NULL);
858 			dirtyreacts++;
859 			uvm_pagelock(p);
860 			uvm_pageactivate(p);
861 			uvm_pageunlock(p);
862 			rw_exit(slock);
863 			continue;
864 		}
865 		rw_exit(slock);
866 
867 		swapcluster_flush(&swc, false);
868 
869 		/*
870 		 * the pageout is in progress.  bump counters and set up
871 		 * for the next loop.
872 		 */
873 
874 		atomic_inc_uint(&uvmexp.pdpending);
875 
876 #else /* defined(VMSWAP) */
877 		uvm_pagelock(p);
878 		uvm_pageactivate(p);
879 		uvm_pageunlock(p);
880 		rw_exit(slock);
881 #endif /* defined(VMSWAP) */
882 	}
883 
884 	uvmpdpol_scanfini();
885 
886 #if defined(VMSWAP)
887 	swapcluster_flush(&swc, true);
888 #endif /* defined(VMSWAP) */
889 }
890 
891 /*
892  * uvmpd_scan: scan the page queues and attempt to meet our targets.
893  */
894 
895 static void
896 uvmpd_scan(void)
897 {
898 	int swap_shortage, pages_freed, fpages;
899 	UVMHIST_FUNC(__func__); UVMHIST_CALLED(pdhist);
900 
901 	uvmexp.pdrevs++;
902 
903 	/*
904 	 * work on meeting our targets.   first we work on our free target
905 	 * by converting inactive pages into free pages.  then we work on
906 	 * meeting our inactive target by converting active pages to
907 	 * inactive ones.
908 	 */
909 
910 	UVMHIST_LOG(pdhist, "  starting 'free' loop",0,0,0,0);
911 
912 	pages_freed = uvmexp.pdfreed;
913 	uvmpd_scan_queue();
914 	pages_freed = uvmexp.pdfreed - pages_freed;
915 
916 	/*
917 	 * detect if we're not going to be able to page anything out
918 	 * until we free some swap resources from active pages.
919 	 */
920 
921 	swap_shortage = 0;
922 	fpages = uvm_availmem(false);
923 	if (fpages < uvmexp.freetarg &&
924 	    uvmexp.swpginuse >= uvmexp.swpgavail &&
925 	    !uvm_swapisfull() &&
926 	    pages_freed == 0) {
927 		swap_shortage = uvmexp.freetarg - fpages;
928 	}
929 
930 	uvmpdpol_balancequeue(swap_shortage);
931 
932 	/*
933 	 * if still below the minimum target, try unloading kernel
934 	 * modules.
935 	 */
936 
937 	if (uvm_availmem(false) < uvmexp.freemin) {
938 		module_thread_kick();
939 	}
940 }
941 
942 /*
943  * uvm_reclaimable: decide whether to wait for pagedaemon.
944  *
945  * => return true if it seems to be worth to do uvm_wait.
946  *
947  * XXX should be tunable.
948  * XXX should consider pools, etc?
949  */
950 
951 bool
952 uvm_reclaimable(void)
953 {
954 	int filepages;
955 	int active, inactive;
956 
957 	/*
958 	 * if swap is not full, no problem.
959 	 */
960 
961 	if (!uvm_swapisfull()) {
962 		return true;
963 	}
964 
965 	/*
966 	 * file-backed pages can be reclaimed even when swap is full.
967  * if we have the lesser of 1/16 of pageable memory or 5MB, try to reclaim.
968 	 * NB: filepages calculation does not exclude EXECPAGES - intentional.
969 	 *
970 	 * XXX assume the worst case, ie. all wired pages are file-backed.
971 	 *
972 	 * XXX should consider other reclaimable memory.
973 	 * XXX ie. pools, traditional buffer cache.
974 	 */
975 
976 	cpu_count_sync(false);
977 	filepages = (int)(cpu_count_get(CPU_COUNT_FILECLEAN) +
978 	    cpu_count_get(CPU_COUNT_FILEUNKNOWN) +
979 	    cpu_count_get(CPU_COUNT_FILEDIRTY) - uvmexp.wired);
980 	uvm_estimatepageable(&active, &inactive);
981 	if (filepages >= MIN((active + inactive) >> 4,
982 	    5 * 1024 * 1024 >> PAGE_SHIFT)) {
983 		return true;
984 	}
985 
986 	/*
987 	 * kill the process, fail allocation, etc..
988 	 */
989 
990 	return false;
991 }
992 
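/*
 * uvm_estimatepageable: report the pdpolicy's estimate of the number
 * of active and inactive pageable pages.
 */
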
993 void
994 uvm_estimatepageable(int *active, int *inactive)
995 {
996 
997 	uvmpdpol_estimatepageable(active, inactive);
998 }
999 
1000 
1001 /*
1002  * Use a separate thread for draining pools.
1003  * This work can't be done from the main pagedaemon thread because
1004  * some pool allocators need to take vm_map locks.
1005  */
1006 
1007 static void
1008 uvmpd_pool_drain_thread(void *arg)
1009 {
1010 	struct pool *firstpool, *curpool;
1011 	int bufcnt, lastslept;
1012 	bool cycled;
1013 
1014 	firstpool = NULL;
1015 	cycled = true;
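	/*
	 * "firstpool" remembers where a drain cycle started; once
	 * pool_drain() hands back that same pool again we have visited
	 * every pool, and "cycled" makes the next round pause briefly.
	 */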
1016 	for (;;) {
1017 		/*
1018 		 * sleep until awoken by the pagedaemon.
1019 		 */
1020 		mutex_enter(&uvmpd_lock);
1021 		if (!uvmpd_pool_drain_run) {
1022 			lastslept = getticks();
1023 			cv_wait(&uvmpd_pool_drain_cv, &uvmpd_lock);
1024 			if (getticks() != lastslept) {
1025 				cycled = false;
1026 				firstpool = NULL;
1027 			}
1028 		}
1029 		uvmpd_pool_drain_run = false;
1030 		mutex_exit(&uvmpd_lock);
1031 
1032 		/*
1033 		 * rate limit draining, otherwise in desperate circumstances
1034 		 * this can totally saturate the system with xcall activity.
1035 		 */
1036 		if (cycled) {
1037 			kpause("uvmpdlmt", false, 1, NULL);
1038 			cycled = false;
1039 			firstpool = NULL;
1040 		}
1041 
1042 		/*
1043 		 * drain and temporarily disable the freelist cache.
1044 		 */
1045 		uvm_pgflcache_pause();
1046 
1047 		/*
1048 		 * kill unused metadata buffers.
1049 		 */
1050 		bufcnt = uvmexp.freetarg - uvm_availmem(false);
1051 		if (bufcnt < 0)
1052 			bufcnt = 0;
1053 
1054 		mutex_enter(&bufcache_lock);
1055 		buf_drain(bufcnt << PAGE_SHIFT);
1056 		mutex_exit(&bufcache_lock);
1057 
1058 		/*
1059 		 * drain a pool, and then re-enable the freelist cache.
1060 		 */
1061 		(void)pool_drain(&curpool);
1062 		KASSERT(curpool != NULL);
1063 		if (firstpool == NULL) {
1064 			firstpool = curpool;
1065 		} else if (firstpool == curpool) {
1066 			cycled = true;
1067 		}
1068 		uvm_pgflcache_resume();
1069 	}
1070 	/*NOTREACHED*/
1071 }
1072 
1073 static void
1074 uvmpd_pool_drain_wakeup(void)
1075 {
1076 
1077 	mutex_enter(&uvmpd_lock);
1078 	uvmpd_pool_drain_run = true;
1079 	cv_signal(&uvmpd_pool_drain_cv);
1080 	mutex_exit(&uvmpd_lock);
1081 }
1082