xref: /netbsd-src/sys/uvm/uvm_pdaemon.c (revision f3cfa6f6ce31685c6c4a758bc430e69eb99f50a4)
1 /*	$NetBSD: uvm_pdaemon.c,v 1.110 2019/04/21 15:32:18 chs Exp $	*/
2 
3 /*
4  * Copyright (c) 1997 Charles D. Cranor and Washington University.
5  * Copyright (c) 1991, 1993, The Regents of the University of California.
6  *
7  * All rights reserved.
8  *
9  * This code is derived from software contributed to Berkeley by
10  * The Mach Operating System project at Carnegie-Mellon University.
11  *
12  * Redistribution and use in source and binary forms, with or without
13  * modification, are permitted provided that the following conditions
14  * are met:
15  * 1. Redistributions of source code must retain the above copyright
16  *    notice, this list of conditions and the following disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  * 3. Neither the name of the University nor the names of its contributors
21  *    may be used to endorse or promote products derived from this software
22  *    without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
25  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
26  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
27  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
28  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
29  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
30  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
33  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34  * SUCH DAMAGE.
35  *
36  *	@(#)vm_pageout.c        8.5 (Berkeley) 2/14/94
37  * from: Id: uvm_pdaemon.c,v 1.1.2.32 1998/02/06 05:26:30 chs Exp
38  *
39  *
40  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
41  * All rights reserved.
42  *
43  * Permission to use, copy, modify and distribute this software and
44  * its documentation is hereby granted, provided that both the copyright
45  * notice and this permission notice appear in all copies of the
46  * software, derivative works or modified versions, and any portions
47  * thereof, and that both notices appear in supporting documentation.
48  *
49  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
50  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
51  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
52  *
53  * Carnegie Mellon requests users of this software to return to
54  *
55  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
56  *  School of Computer Science
57  *  Carnegie Mellon University
58  *  Pittsburgh PA 15213-3890
59  *
60  * any improvements or extensions that they make and grant Carnegie the
61  * rights to redistribute these changes.
62  */
63 
64 /*
65  * uvm_pdaemon.c: the page daemon
66  */
67 
68 #include <sys/cdefs.h>
69 __KERNEL_RCSID(0, "$NetBSD: uvm_pdaemon.c,v 1.110 2019/04/21 15:32:18 chs Exp $");
70 
71 #include "opt_uvmhist.h"
72 #include "opt_readahead.h"
73 
74 #include <sys/param.h>
75 #include <sys/proc.h>
76 #include <sys/systm.h>
77 #include <sys/kernel.h>
78 #include <sys/pool.h>
79 #include <sys/buf.h>
80 #include <sys/module.h>
81 #include <sys/atomic.h>
82 #include <sys/kthread.h>
83 
84 #include <uvm/uvm.h>
85 #include <uvm/uvm_pdpolicy.h>
86 
87 #ifdef UVMHIST
88 UVMHIST_DEFINE(pdhist);
89 #endif
90 
91 /*
92  * UVMPD_NUMDIRTYREACTS is how many dirty pages the pagedaemon will reactivate
93  * in a pass thru the inactive list when swap is full.  the value should be
94  * "small"... if it's too large we'll cycle the active pages thru the inactive
95  * queue too quickly for them to be referenced and avoid being freed.
96  */
97 
98 #define	UVMPD_NUMDIRTYREACTS	16
99 
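/*
 * UVMPD_NUMTRYLOCKOWNER is how many failures to trylock a page owner
 * uvmpd_scan_queue() accumulates before it briefly pauses (kpause) to
 * give the LWPs holding those locks a chance to run.
 */
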
100 #define	UVMPD_NUMTRYLOCKOWNER	16
101 
102 /*
103  * local prototypes
104  */
105 
106 static void	uvmpd_scan(void);
107 static void	uvmpd_scan_queue(void);
108 static void	uvmpd_tune(void);
109 static void	uvmpd_pool_drain_thread(void *);
110 static void	uvmpd_pool_drain_wakeup(void);
111 
112 static unsigned int uvm_pagedaemon_waiters;
113 
114 /* State for the pool drainer thread */
115 static kmutex_t uvmpd_pool_drain_lock;
116 static kcondvar_t uvmpd_pool_drain_cv;
117 static bool uvmpd_pool_drain_run = false;
118 
119 /*
120  * XXX hack to avoid hangs when large processes fork.
121  */
122 u_int uvm_extrapages;
123 
124 /*
125  * uvm_wait: wait (sleep) for the page daemon to free some pages
126  *
127  * => should be called with all locks released
128  * => should _not_ be called by the page daemon (to avoid deadlock)
129  */
130 
131 void
132 uvm_wait(const char *wmsg)
133 {
134 	int timo = 0;
135 
136 	mutex_spin_enter(&uvm_fpageqlock);
137 
138 	/*
139 	 * check for page daemon going to sleep (waiting for itself)
140 	 */
141 
142 	if (curlwp == uvm.pagedaemon_lwp && uvmexp.paging == 0) {
143 		/*
144 		 * now we have a problem: the pagedaemon wants to go to
145 		 * sleep until it frees more memory.   but how can it
146 		 * free more memory if it is asleep?  that is a deadlock.
147 		 * we have two options:
148 		 *  [1] panic now
149 		 *  [2] put a timeout on the sleep, thus causing the
150 		 *      pagedaemon to only pause (rather than sleep forever)
151 		 *
152 		 * note that option [2] will only help us if we get lucky
153 		 * and some other process on the system breaks the deadlock
154 		 * by exiting or freeing memory (thus allowing the pagedaemon
155 		 * to continue).  for now we panic if DEBUG is defined,
156 		 * otherwise we hope for the best with option [2] (better
157 		 * yet, this should never happen in the first place!).
158 		 */
159 
160 		printf("pagedaemon: deadlock detected!\n");
161 		timo = hz >> 3;		/* set timeout */
162 #if defined(DEBUG)
163 		/* DEBUG: panic so we can debug it */
164 		panic("pagedaemon deadlock");
165 #endif
166 	}
167 
168 	uvm_pagedaemon_waiters++;
169 	wakeup(&uvm.pagedaemon);		/* wake the daemon! */
170 	UVM_UNLOCK_AND_WAIT(&uvmexp.free, &uvm_fpageqlock, false, wmsg, timo);
171 }
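
/*
 * Illustrative caller pattern (a sketch, not an excerpt from this file):
 * retry a failed object page allocation after waiting for the pagedaemon,
 * dropping the object lock around the sleep as required above.  The
 * object, offset and wait message below are placeholders.
 *
 *	while ((pg = uvm_pagealloc(uobj, off, NULL, 0)) == NULL) {
 *		mutex_exit(uobj->vmobjlock);
 *		uvm_wait("pgwait");
 *		mutex_enter(uobj->vmobjlock);
 *	}
 */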
172 
173 /*
174  * uvm_kick_pdaemon: perform checks to determine if we need to
175  * give the pagedaemon a nudge, and do so if necessary.
176  *
177  * => called with uvm_fpageqlock held.
178  */
179 
180 void
181 uvm_kick_pdaemon(void)
182 {
183 
184 	KASSERT(mutex_owned(&uvm_fpageqlock));
185 
186 	if (uvmexp.free + uvmexp.paging < uvmexp.freemin ||
187 	    (uvmexp.free + uvmexp.paging < uvmexp.freetarg &&
188 	     uvmpdpol_needsscan_p()) ||
189 	     uvm_km_va_starved_p()) {
190 		wakeup(&uvm.pagedaemon);
191 	}
192 }
193 
194 /*
195  * uvmpd_tune: tune paging parameters
196  *
197  * => called whenever memory is added to (or removed from?) the system
198  * => caller must call with page queues locked
199  */
200 
201 static void
202 uvmpd_tune(void)
203 {
204 	int val;
205 
206 	UVMHIST_FUNC("uvmpd_tune"); UVMHIST_CALLED(pdhist);
207 
208 	/*
209 	 * try to keep 0.5% of available RAM free, but limit to between
210 	 * 128k and 1024k per-CPU.  XXX: what are these values good for?
211 	 */
212 	val = uvmexp.npages / 200;
213 	val = MAX(val, (128*1024) >> PAGE_SHIFT);
214 	val = MIN(val, (1024*1024) >> PAGE_SHIFT);
215 	val *= ncpu;
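
	/*
	 * Worked example (illustrative only, the values are hypothetical):
	 * with 1 GiB of RAM, 4 KiB pages and 2 CPUs, npages/200 is ~1310
	 * pages (~5 MiB), which the clamp above reduces to 256 pages
	 * (1024 KiB) per CPU, so val is 512 pages (2 MiB) before the
	 * reserve_kernel check below.
	 */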
216 
217 	/* Make sure there's always a user page free. */
218 	if (val < uvmexp.reserve_kernel + 1)
219 		val = uvmexp.reserve_kernel + 1;
220 	uvmexp.freemin = val;
221 
222 	/* Calculate free target. */
223 	val = (uvmexp.freemin * 4) / 3;
224 	if (val <= uvmexp.freemin)
225 		val = uvmexp.freemin + 1;
226 	uvmexp.freetarg = val + atomic_swap_uint(&uvm_extrapages, 0);
227 
228 	uvmexp.wiredmax = uvmexp.npages / 3;
229 	UVMHIST_LOG(pdhist, "<- done, freemin=%jd, freetarg=%jd, wiredmax=%jd",
230 	      uvmexp.freemin, uvmexp.freetarg, uvmexp.wiredmax, 0);
231 }
232 
233 /*
234  * uvm_pageout: the main loop for the pagedaemon
235  */
236 
237 void
238 uvm_pageout(void *arg)
239 {
240 	int npages = 0;
241 	int extrapages = 0;
242 
243 	UVMHIST_FUNC("uvm_pageout"); UVMHIST_CALLED(pdhist);
244 
245 	UVMHIST_LOG(pdhist,"<starting uvm pagedaemon>", 0, 0, 0, 0);
246 
247 	mutex_init(&uvmpd_pool_drain_lock, MUTEX_DEFAULT, IPL_VM);
248 	cv_init(&uvmpd_pool_drain_cv, "pooldrain");
249 
250 	/* Create the pool drainer kernel thread. */
251 	if (kthread_create(PRI_VM, KTHREAD_MPSAFE, NULL,
252 	    uvmpd_pool_drain_thread, NULL, NULL, "pooldrain"))
253 		panic("fork pooldrain");
254 
255 	/*
256 	 * ensure correct priority and set paging parameters...
257 	 */
258 
259 	uvm.pagedaemon_lwp = curlwp;
260 	mutex_enter(&uvm_pageqlock);
261 	npages = uvmexp.npages;
262 	uvmpd_tune();
263 	mutex_exit(&uvm_pageqlock);
264 
265 	/*
266 	 * main loop
267 	 */
268 
269 	for (;;) {
270 		bool needsscan, needsfree, kmem_va_starved;
271 
272 		kmem_va_starved = uvm_km_va_starved_p();
273 
274 		mutex_spin_enter(&uvm_fpageqlock);
275 		if ((uvm_pagedaemon_waiters == 0 || uvmexp.paging > 0) &&
276 		    !kmem_va_starved) {
277 			UVMHIST_LOG(pdhist,"  <<SLEEPING>>",0,0,0,0);
278 			UVM_UNLOCK_AND_WAIT(&uvm.pagedaemon,
279 			    &uvm_fpageqlock, false, "pgdaemon", 0);
280 			uvmexp.pdwoke++;
281 			UVMHIST_LOG(pdhist,"  <<WOKE UP>>",0,0,0,0);
282 		} else {
283 			mutex_spin_exit(&uvm_fpageqlock);
284 		}
285 
286 		/*
287 		 * now lock page queues and recompute inactive count
288 		 */
289 
290 		mutex_enter(&uvm_pageqlock);
291 		if (npages != uvmexp.npages || extrapages != uvm_extrapages) {
292 			npages = uvmexp.npages;
293 			extrapages = uvm_extrapages;
294 			mutex_spin_enter(&uvm_fpageqlock);
295 			uvmpd_tune();
296 			mutex_spin_exit(&uvm_fpageqlock);
297 		}
298 
299 		uvmpdpol_tune();
300 
301 		/*
302 		 * Estimate whether we need to scan.  Note that bufmem is
303 		 * returned to the system only when an entire pool page is empty.
304 		 */
305 		mutex_spin_enter(&uvm_fpageqlock);
306 
307 		UVMHIST_LOG(pdhist,"  free/ftarg=%jd/%jd",
308 		    uvmexp.free, uvmexp.freetarg, 0,0);
309 
310 		needsfree = uvmexp.free + uvmexp.paging < uvmexp.freetarg;
311 		needsscan = needsfree || uvmpdpol_needsscan_p();
312 
313 		/*
314 		 * scan if needed
315 		 */
316 		if (needsscan) {
317 			mutex_spin_exit(&uvm_fpageqlock);
318 			uvmpd_scan();
319 			mutex_spin_enter(&uvm_fpageqlock);
320 		}
321 
322 		/*
323 		 * if there's any free memory to be had,
324 		 * wake up any waiters.
325 		 */
326 		if (uvmexp.free > uvmexp.reserve_kernel ||
327 		    uvmexp.paging == 0) {
328 			wakeup(&uvmexp.free);
329 			uvm_pagedaemon_waiters = 0;
330 		}
331 		mutex_spin_exit(&uvm_fpageqlock);
332 
333 		/*
334 		 * scan done.  unlock page queues (the only lock we are holding)
335 		 */
336 		mutex_exit(&uvm_pageqlock);
337 
338 		/*
339 		 * if we don't need free memory, we're done.
340 		 */
341 
342 		if (!needsfree && !kmem_va_starved)
343 			continue;
344 
345 		/*
346 		 * kick the pool drainer thread.
347 		 */
348 
349 		uvmpd_pool_drain_wakeup();
350 	}
351 	/*NOTREACHED*/
352 }
353 
354 
355 /*
356  * uvm_aiodone_worker: a workqueue callback for the aiodone daemon.
357  */
358 
359 void
360 uvm_aiodone_worker(struct work *wk, void *dummy)
361 {
362 	struct buf *bp = (void *)wk;
363 
364 	KASSERT(&bp->b_work == wk);
365 
366 	/*
367 	 * process an i/o that's done.
368 	 */
369 
370 	(*bp->b_iodone)(bp);
371 }
372 
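/*
 * uvm_pageout_start/uvm_pageout_done: track the number of pages being
 * paged out (uvmexp.paging).  uvm_pageout_done also wakes the pagedaemon
 * when free memory is still at or below the kernel reserve, or otherwise
 * wakes any LWPs sleeping in uvm_wait().
 */
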
373 void
374 uvm_pageout_start(int npages)
375 {
376 
377 	mutex_spin_enter(&uvm_fpageqlock);
378 	uvmexp.paging += npages;
379 	mutex_spin_exit(&uvm_fpageqlock);
380 }
381 
382 void
383 uvm_pageout_done(int npages)
384 {
385 
386 	mutex_spin_enter(&uvm_fpageqlock);
387 	KASSERT(uvmexp.paging >= npages);
388 	uvmexp.paging -= npages;
389 
390 	/*
391 	 * wake up either the pagedaemon or the LWPs waiting for it.
392 	 */
393 
394 	if (uvmexp.free <= uvmexp.reserve_kernel) {
395 		wakeup(&uvm.pagedaemon);
396 	} else {
397 		wakeup(&uvmexp.free);
398 		uvm_pagedaemon_waiters = 0;
399 	}
400 	mutex_spin_exit(&uvm_fpageqlock);
401 }
402 
403 /*
404  * uvmpd_trylockowner: trylock the page's owner.
405  *
406  * => called with pageq locked.
407  * => resolve orphaned O->A loaned page.
408  * => return the locked mutex on success.  otherwise, return NULL.
409  */
410 
411 kmutex_t *
412 uvmpd_trylockowner(struct vm_page *pg)
413 {
414 	struct uvm_object *uobj = pg->uobject;
415 	kmutex_t *slock;
416 
417 	KASSERT(mutex_owned(&uvm_pageqlock));
418 
419 	if (uobj != NULL) {
420 		slock = uobj->vmobjlock;
421 	} else {
422 		struct vm_anon *anon = pg->uanon;
423 
424 		KASSERT(anon != NULL);
425 		slock = anon->an_lock;
426 	}
427 
428 	if (!mutex_tryenter(slock)) {
429 		return NULL;
430 	}
431 
432 	if (uobj == NULL) {
433 
434 		/*
435 		 * set PQ_ANON if it isn't set already.
436 		 */
437 
438 		if ((pg->pqflags & PQ_ANON) == 0) {
439 			KASSERT(pg->loan_count > 0);
440 			pg->loan_count--;
441 			pg->pqflags |= PQ_ANON;
442 			/* anon now owns it */
443 		}
444 	}
445 
446 	return slock;
447 }
448 
449 #if defined(VMSWAP)
450 struct swapcluster {
451 	int swc_slot;
452 	int swc_nallocated;
453 	int swc_nused;
454 	struct vm_page *swc_pages[howmany(MAXPHYS, MIN_PAGE_SIZE)];
455 };
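
/*
 * The helpers below build up a cluster of swap-backed pages that is
 * written out with a single uvm_swap_put().  Typical lifecycle, as used
 * by uvmpd_scan_queue() (simplified sketch; locking and error handling
 * omitted):
 *
 *	swapcluster_init(&swc);
 *	for each dirty swap-backed victim pg:
 *		swapcluster_allocslots(&swc);	reserve swap slots if needed
 *		swapcluster_add(&swc, pg);	record the page and its slot
 *		swapcluster_flush(&swc, false);	starts i/o once the cluster fills
 *	swapcluster_flush(&swc, true);		final flush at the end of the scan
 *
 * With a MAXPHYS of 64 KiB and 4 KiB pages (typical values), a cluster
 * holds up to 16 pages.
 */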
456 
457 static void
458 swapcluster_init(struct swapcluster *swc)
459 {
460 
461 	swc->swc_slot = 0;
462 	swc->swc_nused = 0;
463 }
464 
465 static int
466 swapcluster_allocslots(struct swapcluster *swc)
467 {
468 	int slot;
469 	int npages;
470 
471 	if (swc->swc_slot != 0) {
472 		return 0;
473 	}
474 
475 	/* Even with strange MAXPHYS, the shift
476 	   implicitly rounds down to a page. */
477 	npages = MAXPHYS >> PAGE_SHIFT;
478 	slot = uvm_swap_alloc(&npages, true);
479 	if (slot == 0) {
480 		return ENOMEM;
481 	}
482 	swc->swc_slot = slot;
483 	swc->swc_nallocated = npages;
484 	swc->swc_nused = 0;
485 
486 	return 0;
487 }
488 
489 static int
490 swapcluster_add(struct swapcluster *swc, struct vm_page *pg)
491 {
492 	int slot;
493 	struct uvm_object *uobj;
494 
495 	KASSERT(swc->swc_slot != 0);
496 	KASSERT(swc->swc_nused < swc->swc_nallocated);
497 	KASSERT((pg->pqflags & PQ_SWAPBACKED) != 0);
498 
499 	slot = swc->swc_slot + swc->swc_nused;
500 	uobj = pg->uobject;
501 	if (uobj == NULL) {
502 		KASSERT(mutex_owned(pg->uanon->an_lock));
503 		pg->uanon->an_swslot = slot;
504 	} else {
505 		int result;
506 
507 		KASSERT(mutex_owned(uobj->vmobjlock));
508 		result = uao_set_swslot(uobj, pg->offset >> PAGE_SHIFT, slot);
509 		if (result == -1) {
510 			return ENOMEM;
511 		}
512 	}
513 	swc->swc_pages[swc->swc_nused] = pg;
514 	swc->swc_nused++;
515 
516 	return 0;
517 }
518 
519 static void
520 swapcluster_flush(struct swapcluster *swc, bool now)
521 {
522 	int slot;
523 	int nused;
524 	int nallocated;
525 	int error __diagused;
526 
527 	if (swc->swc_slot == 0) {
528 		return;
529 	}
530 	KASSERT(swc->swc_nused <= swc->swc_nallocated);
531 
532 	slot = swc->swc_slot;
533 	nused = swc->swc_nused;
534 	nallocated = swc->swc_nallocated;
535 
536 	/*
537 	 * if this is the final pageout we could have a few
538 	 * unused swap blocks.  if so, free them now.
539 	 */
540 
541 	if (nused < nallocated) {
542 		if (!now) {
543 			return;
544 		}
545 		uvm_swap_free(slot + nused, nallocated - nused);
546 	}
547 
548 	/*
549 	 * now start the pageout.
550 	 */
551 
552 	if (nused > 0) {
553 		uvmexp.pdpageouts++;
554 		uvm_pageout_start(nused);
555 		error = uvm_swap_put(slot, swc->swc_pages, nused, 0);
556 		KASSERT(error == 0 || error == ENOMEM);
557 	}
558 
559 	/*
560 	 * zero swslot to indicate that we are
561 	 * no longer building a swap-backed cluster.
562 	 */
563 
564 	swc->swc_slot = 0;
565 	swc->swc_nused = 0;
566 }
567 
568 static int
569 swapcluster_nused(struct swapcluster *swc)
570 {
571 
572 	return swc->swc_nused;
573 }
574 
575 /*
576  * uvmpd_dropswap: free any swap allocated to this page.
577  *
578  * => called with owner locked.
579  * => return true if a page had an associated slot.
580  */
581 
582 static bool
583 uvmpd_dropswap(struct vm_page *pg)
584 {
585 	bool result = false;
586 	struct vm_anon *anon = pg->uanon;
587 
588 	if ((pg->pqflags & PQ_ANON) && anon->an_swslot) {
589 		uvm_swap_free(anon->an_swslot, 1);
590 		anon->an_swslot = 0;
591 		pg->flags &= ~PG_CLEAN;
592 		result = true;
593 	} else if (pg->pqflags & PQ_AOBJ) {
594 		int slot = uao_set_swslot(pg->uobject,
595 		    pg->offset >> PAGE_SHIFT, 0);
596 		if (slot) {
597 			uvm_swap_free(slot, 1);
598 			pg->flags &= ~PG_CLEAN;
599 			result = true;
600 		}
601 	}
602 
603 	return result;
604 }
605 
606 /*
607  * uvmpd_trydropswap: try to free any swap allocated to this page.
608  *
609  * => return true if a slot is successfully freed.
610  */
611 
612 bool
613 uvmpd_trydropswap(struct vm_page *pg)
614 {
615 	kmutex_t *slock;
616 	bool result;
617 
618 	if ((pg->flags & PG_BUSY) != 0) {
619 		return false;
620 	}
621 
622 	/*
623 	 * lock the page's owner.
624 	 */
625 
626 	slock = uvmpd_trylockowner(pg);
627 	if (slock == NULL) {
628 		return false;
629 	}
630 
631 	/*
632 	 * skip this page if it's busy.
633 	 */
634 
635 	if ((pg->flags & PG_BUSY) != 0) {
636 		mutex_exit(slock);
637 		return false;
638 	}
639 
640 	result = uvmpd_dropswap(pg);
641 
642 	mutex_exit(slock);
643 
644 	return result;
645 }
646 
647 #endif /* defined(VMSWAP) */
648 
649 /*
650  * uvmpd_scan_queue: scan the list of replacement candidates for pages
651  * to clean or free.
652  *
653  * => called with page queues locked
654  * => we work on meeting our free target by converting inactive pages
655  *    into free pages.
656  * => we handle the building of swap-backed clusters
657  */
658 
659 static void
660 uvmpd_scan_queue(void)
661 {
662 	struct vm_page *p;
663 	struct uvm_object *uobj;
664 	struct vm_anon *anon;
665 #if defined(VMSWAP)
666 	struct swapcluster swc;
667 #endif /* defined(VMSWAP) */
668 	int dirtyreacts;
669 	int lockownerfail;
670 	kmutex_t *slock;
671 	UVMHIST_FUNC("uvmpd_scan_queue"); UVMHIST_CALLED(pdhist);
672 
673 	/*
674 	 * swc.swc_slot is non-zero while we are building a swap cluster.
675 	 * we want to stay in the loop while we have a page to scan or we
676 	 * have a swap cluster to build.
677 	 */
678 
679 #if defined(VMSWAP)
680 	swapcluster_init(&swc);
681 #endif /* defined(VMSWAP) */
682 
683 	dirtyreacts = 0;
684 	lockownerfail = 0;
685 	uvmpdpol_scaninit();
686 
687 	while (/* CONSTCOND */ 1) {
688 
689 		/*
690 		 * see if we've met the free target.
691 		 */
692 
693 		if (uvmexp.free + uvmexp.paging
694 #if defined(VMSWAP)
695 		    + swapcluster_nused(&swc)
696 #endif /* defined(VMSWAP) */
697 		    >= uvmexp.freetarg << 2 ||
698 		    dirtyreacts == UVMPD_NUMDIRTYREACTS) {
699 			UVMHIST_LOG(pdhist,"  met free target: "
700 				    "exit loop", 0, 0, 0, 0);
701 			break;
702 		}
703 
704 		p = uvmpdpol_selectvictim();
705 		if (p == NULL) {
706 			break;
707 		}
708 		KASSERT(uvmpdpol_pageisqueued_p(p));
709 		KASSERT(p->wire_count == 0);
710 
711 		/*
712 		 * we are below target and have a new page to consider.
713 		 */
714 
715 		anon = p->uanon;
716 		uobj = p->uobject;
717 
718 		/*
719 		 * first we attempt to lock the object that this page
720 		 * belongs to.  if our attempt fails we skip on to
721 		 * the next page (no harm done).  it is important to
722 		 * "try" locking the object as we are locking in the
723 		 * wrong order (pageq -> object) and we don't want to
724 		 * deadlock.
725 		 *
726 		 * the only time we expect to see an ownerless page
727 		 * (i.e. a page with no uobject and !PQ_ANON) is if an
728 		 * anon has loaned a page from a uvm_object and the
729 		 * uvm_object has dropped the ownership.  in that
730 		 * case, the anon can "take over" the loaned page
731 		 * and make it its own.
732 		 */
733 
734 		slock = uvmpd_trylockowner(p);
735 		if (slock == NULL) {
736 			/*
737 			 * yield the cpu to give an LWP holding the lock a
738 			 * chance to run.  otherwise we can busy-loop for too
739 			 * long if the page queue is filled with a lot of
740 			 * pages from only a few objects.
741 			 */
742 			lockownerfail++;
743 			if (lockownerfail > UVMPD_NUMTRYLOCKOWNER) {
744 				mutex_exit(&uvm_pageqlock);
745 				/* XXX Better than yielding but inadequate. */
746 				kpause("livelock", false, 1, NULL);
747 				mutex_enter(&uvm_pageqlock);
748 				lockownerfail = 0;
749 			}
750 			continue;
751 		}
752 		if (p->flags & PG_BUSY) {
753 			mutex_exit(slock);
754 			uvmexp.pdbusy++;
755 			continue;
756 		}
757 
758 		/* does the page belong to an object? */
759 		if (uobj != NULL) {
760 			uvmexp.pdobscan++;
761 		} else {
762 #if defined(VMSWAP)
763 			KASSERT(anon != NULL);
764 			uvmexp.pdanscan++;
765 #else /* defined(VMSWAP) */
766 			panic("%s: anon", __func__);
767 #endif /* defined(VMSWAP) */
768 		}
769 
770 
771 		/*
772 		 * we now have the object and the page queues locked.
773 		 * if the page is not swap-backed, call the object's
774 		 * pager to flush and free the page.
775 		 */
776 
777 #if defined(READAHEAD_STATS)
778 		if ((p->pqflags & PQ_READAHEAD) != 0) {
779 			p->pqflags &= ~PQ_READAHEAD;
780 			uvm_ra_miss.ev_count++;
781 		}
782 #endif /* defined(READAHEAD_STATS) */
783 
784 		if ((p->pqflags & PQ_SWAPBACKED) == 0) {
785 			KASSERT(uobj != NULL);
786 			mutex_exit(&uvm_pageqlock);
787 			(void) (uobj->pgops->pgo_put)(uobj, p->offset,
788 			    p->offset + PAGE_SIZE, PGO_CLEANIT|PGO_FREE);
789 			mutex_enter(&uvm_pageqlock);
790 			continue;
791 		}
792 
793 		/*
794 		 * the page is swap-backed.  remove all the permissions
795 		 * from the page so we can sync the modified info
796 		 * without any race conditions.  if the page is clean
797 		 * we can free it now and continue.
798 		 */
799 
800 		pmap_page_protect(p, VM_PROT_NONE);
801 		if ((p->flags & PG_CLEAN) && pmap_clear_modify(p)) {
802 			p->flags &= ~(PG_CLEAN);
803 		}
804 		if (p->flags & PG_CLEAN) {
805 			int slot;
806 			int pageidx;
807 
808 			pageidx = p->offset >> PAGE_SHIFT;
809 			uvm_pagefree(p);
810 			uvmexp.pdfreed++;
811 
812 			/*
813 			 * for anons, we need to remove the page
814 			 * from the anon ourselves.  for aobjs,
815 			 * pagefree did that for us.
816 			 */
817 
818 			if (anon) {
819 				KASSERT(anon->an_swslot != 0);
820 				anon->an_page = NULL;
821 				slot = anon->an_swslot;
822 			} else {
823 				slot = uao_find_swslot(uobj, pageidx);
824 			}
825 			mutex_exit(slock);
826 
827 			if (slot > 0) {
828 				/* this page is now only in swap. */
829 				mutex_enter(&uvm_swap_data_lock);
830 				KASSERT(uvmexp.swpgonly < uvmexp.swpginuse);
831 				uvmexp.swpgonly++;
832 				mutex_exit(&uvm_swap_data_lock);
833 			}
834 			continue;
835 		}
836 
837 #if defined(VMSWAP)
838 		/*
839 		 * this page is dirty, skip it if we'll have met our
840 		 * free target when all the current pageouts complete.
841 		 */
842 
843 		if (uvmexp.free + uvmexp.paging > uvmexp.freetarg << 2) {
844 			mutex_exit(slock);
845 			continue;
846 		}
847 
848 		/*
849 		 * free any swap space allocated to the page since
850 		 * we'll have to write it again with its new data.
851 		 */
852 
853 		uvmpd_dropswap(p);
854 
855 		/*
856 		 * start new swap pageout cluster (if necessary).
857 		 *
858 		 * if swap is full reactivate this page so that
859 		 * we eventually cycle all pages through the
860 		 * inactive queue.
861 		 */
862 
863 		if (swapcluster_allocslots(&swc)) {
864 			dirtyreacts++;
865 			uvm_pageactivate(p);
866 			mutex_exit(slock);
867 			continue;
868 		}
869 
870 		/*
871 		 * at this point, we're definitely going to reuse this
872 		 * page.  mark the page busy and delayed-free.
873 		 * we should remove the page from the page queues
874 		 * so we don't ever look at it again.
875 		 * adjust counters and such.
876 		 */
877 
878 		p->flags |= PG_BUSY;
879 		UVM_PAGE_OWN(p, "scan_queue");
880 
881 		p->flags |= PG_PAGEOUT;
882 		uvm_pagedequeue(p);
883 
884 		uvmexp.pgswapout++;
885 		mutex_exit(&uvm_pageqlock);
886 
887 		/*
888 		 * add the new page to the cluster.
889 		 */
890 
891 		if (swapcluster_add(&swc, p)) {
892 			p->flags &= ~(PG_BUSY|PG_PAGEOUT);
893 			UVM_PAGE_OWN(p, NULL);
894 			mutex_enter(&uvm_pageqlock);
895 			dirtyreacts++;
896 			uvm_pageactivate(p);
897 			mutex_exit(slock);
898 			continue;
899 		}
900 		mutex_exit(slock);
901 
902 		swapcluster_flush(&swc, false);
903 		mutex_enter(&uvm_pageqlock);
904 
905 		/*
906 		 * the pageout is in progress.  bump counters and set up
907 		 * for the next loop.
908 		 */
909 
910 		uvmexp.pdpending++;
911 
912 #else /* defined(VMSWAP) */
913 		uvm_pageactivate(p);
914 		mutex_exit(slock);
915 #endif /* defined(VMSWAP) */
916 	}
917 
918 #if defined(VMSWAP)
919 	mutex_exit(&uvm_pageqlock);
920 	swapcluster_flush(&swc, true);
921 	mutex_enter(&uvm_pageqlock);
922 #endif /* defined(VMSWAP) */
923 }
924 
925 /*
926  * uvmpd_scan: scan the page queues and attempt to meet our targets.
927  *
928  * => called with pageq's locked
929  */
930 
931 static void
932 uvmpd_scan(void)
933 {
934 	int swap_shortage, pages_freed;
935 	UVMHIST_FUNC("uvmpd_scan"); UVMHIST_CALLED(pdhist);
936 
937 	uvmexp.pdrevs++;
938 
939 	/*
940 	 * work on meeting our targets.   first we work on our free target
941 	 * by converting inactive pages into free pages.  then we work on
942 	 * meeting our inactive target by converting active pages to
943 	 * inactive ones.
944 	 */
945 
946 	UVMHIST_LOG(pdhist, "  starting 'free' loop",0,0,0,0);
947 
948 	pages_freed = uvmexp.pdfreed;
949 	uvmpd_scan_queue();
950 	pages_freed = uvmexp.pdfreed - pages_freed;
951 
952 	/*
953 	 * detect if we're not going to be able to page anything out
954 	 * until we free some swap resources from active pages.
955 	 */
956 
957 	swap_shortage = 0;
958 	if (uvmexp.free < uvmexp.freetarg &&
959 	    uvmexp.swpginuse >= uvmexp.swpgavail &&
960 	    !uvm_swapisfull() &&
961 	    pages_freed == 0) {
962 		swap_shortage = uvmexp.freetarg - uvmexp.free;
963 	}
964 
965 	uvmpdpol_balancequeue(swap_shortage);
966 
967 	/*
968 	 * if still below the minimum target, try unloading kernel
969 	 * modules.
970 	 */
971 
972 	if (uvmexp.free < uvmexp.freemin) {
973 		module_thread_kick();
974 	}
975 }
976 
977 /*
978  * uvm_reclaimable: decide whether to wait for pagedaemon.
979  *
980  * => return true if it seems worthwhile to do uvm_wait.
981  *
982  * XXX should be tunable.
983  * XXX should consider pools, etc?
984  */
985 
986 bool
987 uvm_reclaimable(void)
988 {
989 	int filepages;
990 	int active, inactive;
991 
992 	/*
993 	 * if swap is not full, no problem.
994 	 */
995 
996 	if (!uvm_swapisfull()) {
997 		return true;
998 	}
999 
1000 	/*
1001 	 * file-backed pages can be reclaimed even when swap is full.
1002 	 * if we have at least min(1/16 of pageable memory, 5MB) of them, try to reclaim.
1003 	 *
1004 	 * XXX assume the worst case, ie. all wired pages are file-backed.
1005 	 *
1006 	 * XXX should consider other reclaimable memory.
1007 	 * XXX ie. pools, traditional buffer cache.
1008 	 */
1009 
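	/*
	 * Worked example (illustrative only): with 4 KiB pages, the 5MB cap
	 * below is 1280 pages; on a hypothetical system with 512 MiB of
	 * pageable memory (131072 pages), 1/16 of that is 8192 pages, so
	 * this returns true once at least 1280 pages are file-backed.
	 */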
1010 	filepages = uvmexp.filepages + uvmexp.execpages - uvmexp.wired;
1011 	uvm_estimatepageable(&active, &inactive);
1012 	if (filepages >= MIN((active + inactive) >> 4,
1013 	    5 * 1024 * 1024 >> PAGE_SHIFT)) {
1014 		return true;
1015 	}
1016 
1017 	/*
1018 	 * kill the process, fail allocation, etc..
1019 	 */
1020 
1021 	return false;
1022 }
1023 
1024 void
1025 uvm_estimatepageable(int *active, int *inactive)
1026 {
1027 
1028 	uvmpdpol_estimatepageable(active, inactive);
1029 }
1030 
1031 
1032 /*
1033  * Use a separate thread for draining pools.
1034  * This work can't be done from the main pagedaemon thread because
1035  * some pool allocators need to take vm_map locks.
1036  */
1037 
1038 static void
1039 uvmpd_pool_drain_thread(void *arg)
1040 {
1041 	int bufcnt;
1042 
1043 	for (;;) {
1044 		mutex_enter(&uvmpd_pool_drain_lock);
1045 		if (!uvmpd_pool_drain_run) {
1046 			cv_wait(&uvmpd_pool_drain_cv, &uvmpd_pool_drain_lock);
1047 		}
1048 		uvmpd_pool_drain_run = false;
1049 		mutex_exit(&uvmpd_pool_drain_lock);
1050 
1051 		/*
1052 		 * kill unused metadata buffers.
1053 		 */
1054 		mutex_spin_enter(&uvm_fpageqlock);
1055 		bufcnt = uvmexp.freetarg - uvmexp.free;
1056 		mutex_spin_exit(&uvm_fpageqlock);
1057 		if (bufcnt < 0)
1058 			bufcnt = 0;
1059 
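		/*
		 * bufcnt is the shortfall relative to the free target, in
		 * pages; it is converted to a byte count for buf_drain().
		 */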
1060 		mutex_enter(&bufcache_lock);
1061 		buf_drain(bufcnt << PAGE_SHIFT);
1062 		mutex_exit(&bufcache_lock);
1063 
1064 		/*
1065 		 * drain a pool.
1066 		 */
1067 		pool_drain(NULL);
1068 	}
1069 	/*NOTREACHED*/
1070 }
1071 
1072 static void
1073 uvmpd_pool_drain_wakeup(void)
1074 {
1075 
1076 	mutex_enter(&uvmpd_pool_drain_lock);
1077 	uvmpd_pool_drain_run = true;
1078 	cv_signal(&uvmpd_pool_drain_cv);
1079 	mutex_exit(&uvmpd_pool_drain_lock);
1080 }
1081