xref: /openbsd-src/sys/uvm/uvm_pdaemon.c (revision a28daedfc357b214be5c701aa8ba8adb29a7f1c2)
1 /*	$OpenBSD: uvm_pdaemon.c,v 1.42 2009/04/17 07:14:04 oga Exp $	*/
2 /*	$NetBSD: uvm_pdaemon.c,v 1.23 2000/08/20 10:24:14 bjh21 Exp $	*/
3 
4 /*
5  * Copyright (c) 1997 Charles D. Cranor and Washington University.
6  * Copyright (c) 1991, 1993, The Regents of the University of California.
7  *
8  * All rights reserved.
9  *
10  * This code is derived from software contributed to Berkeley by
11  * The Mach Operating System project at Carnegie-Mellon University.
12  *
13  * Redistribution and use in source and binary forms, with or without
14  * modification, are permitted provided that the following conditions
15  * are met:
16  * 1. Redistributions of source code must retain the above copyright
17  *    notice, this list of conditions and the following disclaimer.
18  * 2. Redistributions in binary form must reproduce the above copyright
19  *    notice, this list of conditions and the following disclaimer in the
20  *    documentation and/or other materials provided with the distribution.
21  * 3. All advertising materials mentioning features or use of this software
22  *    must display the following acknowledgement:
23  *	This product includes software developed by Charles D. Cranor,
24  *      Washington University, the University of California, Berkeley and
25  *      its contributors.
26  * 4. Neither the name of the University nor the names of its contributors
27  *    may be used to endorse or promote products derived from this software
28  *    without specific prior written permission.
29  *
30  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
31  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
32  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
33  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
34  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
35  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
36  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
37  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
38  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
39  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
40  * SUCH DAMAGE.
41  *
42  *	@(#)vm_pageout.c        8.5 (Berkeley) 2/14/94
43  * from: Id: uvm_pdaemon.c,v 1.1.2.32 1998/02/06 05:26:30 chs Exp
44  *
45  *
46  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
47  * All rights reserved.
48  *
49  * Permission to use, copy, modify and distribute this software and
50  * its documentation is hereby granted, provided that both the copyright
51  * notice and this permission notice appear in all copies of the
52  * software, derivative works or modified versions, and any portions
53  * thereof, and that both notices appear in supporting documentation.
54  *
55  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
56  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
57  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
58  *
59  * Carnegie Mellon requests users of this software to return to
60  *
61  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
62  *  School of Computer Science
63  *  Carnegie Mellon University
64  *  Pittsburgh PA 15213-3890
65  *
66  * any improvements or extensions that they make and grant Carnegie the
67  * rights to redistribute these changes.
68  */
69 
70 /*
71  * uvm_pdaemon.c: the page daemon
72  */
73 
74 #include <sys/param.h>
75 #include <sys/proc.h>
76 #include <sys/systm.h>
77 #include <sys/kernel.h>
78 #include <sys/pool.h>
79 #include <sys/buf.h>
80 #include <sys/vnode.h>
81 #include <sys/mount.h>
82 
83 #include <uvm/uvm.h>
84 
85 /*
86  * UVMPD_NUMDIRTYREACTS is how many dirty pages the pagedaemon will reactivate
87  * in a pass thru the inactive list when swap is full.  the value should be
88  * "small"... if it's too large we'll cycle the active pages thru the inactive
89  * queue too quickly for them to be referenced and avoid being freed.
90  */
91 
92 #define UVMPD_NUMDIRTYREACTS 16
93 
94 
95 /*
96  * local prototypes
97  */
98 
99 static void		uvmpd_scan(void);
100 static boolean_t	uvmpd_scan_inactive(struct pglist *);
101 static void		uvmpd_tune(void);
102 
103 /*
104  * uvm_wait: wait (sleep) for the page daemon to free some pages
105  *
106  * => should be called with all locks released
107  * => should _not_ be called by the page daemon (to avoid deadlock)
108  */
109 
110 void
111 uvm_wait(const char *wmsg)
112 {
113 	int	timo = 0;
114 
115 	/*
116 	 * check for page daemon going to sleep (waiting for itself)
117 	 */
118 
119 	if (curproc == uvm.pagedaemon_proc) {
120 		/*
121 		 * now we have a problem: the pagedaemon wants to go to
122 		 * sleep until it frees more memory.   but how can it
123 		 * free more memory if it is asleep?  that is a deadlock.
124 		 * we have two options:
125 		 *  [1] panic now
126 		 *  [2] put a timeout on the sleep, thus causing the
127 		 *      pagedaemon to only pause (rather than sleep forever)
128 		 *
129 		 * note that option [2] will only help us if we get lucky
130 		 * and some other process on the system breaks the deadlock
131 		 * by exiting or freeing memory (thus allowing the pagedaemon
132 		 * to continue).  for now we panic if DEBUG is defined,
133 		 * otherwise we hope for the best with option [2] (better
134 		 * yet, this should never happen in the first place!).
135 		 */
136 
137 		printf("pagedaemon: deadlock detected!\n");
138 		timo = hz >> 3;		/* set timeout */
139 #if defined(DEBUG)
140 		/* DEBUG: panic so we can debug it */
141 		panic("pagedaemon deadlock");
142 #endif
143 	}
144 
145 	uvm_lock_fpageq();
146 	wakeup(&uvm.pagedaemon);		/* wake the daemon! */
147 	msleep(&uvmexp.free, &uvm.fpageqlock, PVM | PNORELOCK, wmsg, timo);
148 }
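
/*
 * a minimal usage sketch (the allocator call is the usual uvm_pagealloc()
 * interface; the wait channel name is made up, shown for illustration
 * only): callers normally retry an allocation in a loop and sleep here
 * when memory is short, with all their locks released first:
 *
 *	while ((pg = uvm_pagealloc(uobj, off, NULL, 0)) == NULL)
 *		uvm_wait("pgwait");
 *
 * the page daemon itself must never sit in such a loop, which is what
 * the deadlock check above is for.
 */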
149 
150 
151 /*
152  * uvmpd_tune: tune paging parameters
153  *
154  * => called whenever memory is added to (or removed from?) the system
155  * => caller must call with page queues locked
156  */
157 
158 static void
159 uvmpd_tune(void)
160 {
161 	UVMHIST_FUNC("uvmpd_tune"); UVMHIST_CALLED(pdhist);
162 
163 	uvmexp.freemin = uvmexp.npages / 30;
164 
165 	/* between 16k and 512k */
166 	/* XXX:  what are these values good for? */
167 	uvmexp.freemin = max(uvmexp.freemin, (16*1024) >> PAGE_SHIFT);
168 #if 0
169 	uvmexp.freemin = min(uvmexp.freemin, (512*1024) >> PAGE_SHIFT);
170 #endif
171 
172 	/* Make sure there's always a user page free. */
173 	if (uvmexp.freemin < uvmexp.reserve_kernel + 1)
174 		uvmexp.freemin = uvmexp.reserve_kernel + 1;
175 
176 	uvmexp.freetarg = (uvmexp.freemin * 4) / 3;
177 	if (uvmexp.freetarg <= uvmexp.freemin)
178 		uvmexp.freetarg = uvmexp.freemin + 1;
179 
180 	/* uvmexp.inactarg: computed in main daemon loop */
181 
182 	uvmexp.wiredmax = uvmexp.npages / 3;
183 	UVMHIST_LOG(pdhist, "<- done, freemin=%ld, freetarg=%ld, wiredmax=%ld",
184 	      uvmexp.freemin, uvmexp.freetarg, uvmexp.wiredmax, 0);
185 }
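
/*
 * worked example of the tuning above (figures for illustration only,
 * assuming PAGE_SHIFT == 12, i.e. 4KB pages, and npages == 32768, which
 * is roughly 128MB of managed memory):
 *
 *	freemin  = 32768 / 30     = 1092 pages (~4.3MB, well above the
 *	                            16KB floor of 4 pages)
 *	freetarg = (1092 * 4) / 3 = 1456 pages (~5.7MB)
 *	wiredmax = 32768 / 3      = 10922 pages
 *
 * freemin is additionally forced above reserve_kernel so a user
 * allocation can always be satisfied.
 */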
186 
187 /*
188  * uvm_pageout: the main loop for the pagedaemon
189  */
190 
191 void
192 uvm_pageout(void *arg)
193 {
194 	int npages = 0;
195 	UVMHIST_FUNC("uvm_pageout"); UVMHIST_CALLED(pdhist);
196 
197 	UVMHIST_LOG(pdhist,"<starting uvm pagedaemon>", 0, 0, 0, 0);
198 
199 	/*
200 	 * ensure correct priority and set paging parameters...
201 	 */
202 
203 	uvm.pagedaemon_proc = curproc;
204 	(void) spl0();
205 	uvm_lock_pageq();
206 	npages = uvmexp.npages;
207 	uvmpd_tune();
208 	uvm_unlock_pageq();
209 
210 	/*
211 	 * main loop
212 	 */
213 
214 	for (;;) {
215 		uvm_lock_fpageq();
216 		UVMHIST_LOG(pdhist,"  <<SLEEPING>>",0,0,0,0);
217 		msleep(&uvm.pagedaemon, &uvm.fpageqlock, PVM | PNORELOCK,
218 		    "pgdaemon", 0);
219 		uvmexp.pdwoke++;
220 		UVMHIST_LOG(pdhist,"  <<WOKE UP>>",0,0,0,0);
221 
222 		/*
223 		 * now lock page queues and recompute inactive count
224 		 */
225 
226 		uvm_lock_pageq();
227 		if (npages != uvmexp.npages) {	/* check for new pages? */
228 			npages = uvmexp.npages;
229 			uvmpd_tune();
230 		}
231 
232 		uvmexp.inactarg = (uvmexp.active + uvmexp.inactive) / 3;
233 		if (uvmexp.inactarg <= uvmexp.freetarg) {
234 			uvmexp.inactarg = uvmexp.freetarg + 1;
235 		}
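
		/*
		 * e.g. (illustration only) with 2400 active and 600 inactive
		 * pages the target becomes (2400 + 600) / 3 == 1000, so the
		 * scan below will deactivate active pages until roughly a
		 * third of the pageable pages sit on the inactive queues.
		 */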
236 
237 		UVMHIST_LOG(pdhist,"  free/ftarg=%ld/%ld, inact/itarg=%ld/%ld",
238 		    uvmexp.free, uvmexp.freetarg, uvmexp.inactive,
239 		    uvmexp.inactarg);
240 
241 		/*
242 		 * scan if needed
243 		 */
244 		if ((uvmexp.free - BUFPAGES_DEFICIT) < uvmexp.freetarg ||
245 		    uvmexp.inactive < uvmexp.inactarg) {
246 			uvmpd_scan();
247 		}
248 
249 		/*
250 		 * if there's any free memory to be had,
251 		 * wake up any waiters.
252 		 */
253 		uvm_lock_fpageq();
254 		if (uvmexp.free > uvmexp.reserve_kernel ||
255 		    uvmexp.paging == 0) {
256 			wakeup(&uvmexp.free);
257 		}
258 		uvm_unlock_fpageq();
259 
260 		/*
261 		 * scan done.  unlock page queues (the only lock we are holding)
262 		 */
263 
264 		uvm_unlock_pageq();
265 	}
266 	/*NOTREACHED*/
267 }
268 
269 
270 /*
271  * uvm_aiodone_daemon:  main loop for the aiodone daemon.
272  */
273 
274 void
275 uvm_aiodone_daemon(void *arg)
276 {
277 	int s, free;
278 	struct buf *bp, *nbp;
279 	UVMHIST_FUNC("uvm_aiodoned"); UVMHIST_CALLED(pdhist);
280 
281 	uvm.aiodoned_proc = curproc;
282 
283 	for (;;) {
284 
285 		/*
286 		 * Check for done aio structures. If we've got structures to
287 		 * process, do so. Otherwise sleep while avoiding races.
288 		 */
289 		mtx_enter(&uvm.aiodoned_lock);
290 		while ((bp = TAILQ_FIRST(&uvm.aio_done)) == NULL)
291 			msleep(&uvm.aiodoned, &uvm.aiodoned_lock,
292 			    PVM, "aiodoned", 0);
293 		/* Take the list for ourselves. */
294 		TAILQ_INIT(&uvm.aio_done);
295 		mtx_leave(&uvm.aiodoned_lock);
296 
297 		/*
298 		 * process each i/o that's done.
299 		 */
300 
301 		free = uvmexp.free;
302 		while (bp != NULL) {
303 			if (bp->b_flags & B_PDAEMON) {
304 				uvmexp.paging -= bp->b_bufsize >> PAGE_SHIFT;
305 			}
306 			nbp = TAILQ_NEXT(bp, b_freelist);
307 			s = splbio();	/* b_iodone must be called at splbio */
308 			(*bp->b_iodone)(bp);
309 			splx(s);
310 			bp = nbp;
311 		}
312 		uvm_lock_fpageq();
313 		wakeup(free <= uvmexp.reserve_kernel ? &uvm.pagedaemon :
314 		    &uvmexp.free);
315 		uvm_unlock_fpageq();
316 	}
317 }
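
/*
 * producer-side sketch (for illustration; the real enqueue lives in the
 * pager's async-completion path, not in this file): an interrupt-time
 * completion handler hands a finished buf to this daemon roughly like
 *
 *	mtx_enter(&uvm.aiodoned_lock);
 *	TAILQ_INSERT_TAIL(&uvm.aio_done, bp, b_freelist);
 *	wakeup(&uvm.aiodoned);
 *	mtx_leave(&uvm.aiodoned_lock);
 *
 * deferring the b_iodone work to this kernel thread keeps it out of
 * interrupt context.
 */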
318 
319 
320 
321 /*
322  * uvmpd_scan_inactive: scan an inactive list for pages to clean or free.
323  *
324  * => called with page queues locked
325  * => we work on meeting our free target by converting inactive pages
326  *    into free pages.
327  * => we handle the building of swap-backed clusters
328  * => we return TRUE if we are exiting because we met our target
329  */
330 
331 static boolean_t
332 uvmpd_scan_inactive(struct pglist *pglst)
333 {
334 	boolean_t retval = FALSE;	/* assume we haven't hit target */
335 	int free, result;
336 	struct vm_page *p, *nextpg;
337 	struct uvm_object *uobj;
338 	struct vm_page *pps[MAXBSIZE >> PAGE_SHIFT], **ppsp;
339 	int npages;
340 	struct vm_page *swpps[MAXBSIZE >> PAGE_SHIFT]; 	/* XXX: see below */
341 	int swnpages, swcpages;				/* XXX: see below */
342 	int swslot;
343 	struct vm_anon *anon;
344 	boolean_t swap_backed;
345 	vaddr_t start;
346 	int dirtyreacts;
347 	UVMHIST_FUNC("uvmpd_scan_inactive"); UVMHIST_CALLED(pdhist);
348 
349 	/*
350 	 * note: we currently keep swap-backed pages on a separate inactive
351 	 * list from object-backed pages.   however, merging the two lists
352 	 * back together again hasn't been ruled out.   thus, we keep our
353 	 * swap cluster in "swpps" rather than in pps (allows us to mix
354 	 * clustering types in the event of a mixed inactive queue).
355 	 */
356 
357 	/*
358 	 * swslot is non-zero if we are building a swap cluster.  we want
359 	 * to stay in the loop while we have a page to scan or we have
360 	 * a swap-cluster to build.
361 	 */
362 
363 	swslot = 0;
364 	swnpages = swcpages = 0;
365 	free = 0;
366 	dirtyreacts = 0;
367 
368 	for (p = TAILQ_FIRST(pglst); p != NULL || swslot != 0; p = nextpg) {
369 
370 		/*
371 		 * note that p can be NULL iff we have traversed the whole
372 		 * list and need to do one final swap-backed clustered pageout.
373 		 */
374 
375 		uobj = NULL;
376 		anon = NULL;
377 
378 		if (p) {
379 
380 			/*
381 			 * update our copy of "free" and see if we've met
382 			 * our target
383 			 */
384 			free = uvmexp.free - BUFPAGES_DEFICIT;
385 
386 			if (free + uvmexp.paging >= uvmexp.freetarg << 2 ||
387 			    dirtyreacts == UVMPD_NUMDIRTYREACTS) {
388 				UVMHIST_LOG(pdhist,"  met free target: "
389 					    "exit loop", 0, 0, 0, 0);
390 				retval = TRUE;
391 
392 				if (swslot == 0) {
393 					/* exit now if no swap-i/o pending */
394 					break;
395 				}
396 
397 				/* set p to null to signal final swap i/o */
398 				p = NULL;
399 			}
400 		}
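
		/*
		 * a note on the threshold above (reading for illustration):
		 * the scan keeps going until free pages plus pages already
		 * being paged out reach four times the free target (e.g.
		 * 1024 with freetarg == 256), or until UVMPD_NUMDIRTYREACTS
		 * dirty pages have been reactivated because swap is full.
		 * aiming past freetarg means that once the in-flight
		 * pageouts finish we are comfortably above the target
		 * rather than right at it.
		 */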
401 
402 		if (p) {	/* if (we have a new page to consider) */
403 
404 			/*
405 			 * we are below target and have a new page to consider.
406 			 */
407 			uvmexp.pdscans++;
408 			nextpg = TAILQ_NEXT(p, pageq);
409 
410 			/*
411 			 * move referenced pages back to active queue and
412 			 * skip to next page (unlikely to happen since
413 			 * inactive pages shouldn't have any valid mappings
414 			 * and we cleared reference before deactivating).
415 			 */
416 
417 			if (pmap_is_referenced(p)) {
418 				uvm_pageactivate(p);
419 				uvmexp.pdreact++;
420 				continue;
421 			}
422 
423 			/*
424 			 * first we attempt to lock the object that this page
425 			 * belongs to.  if our attempt fails we skip on to
426 			 * the next page (no harm done).  it is important to
427 			 * "try" locking the object as we are locking in the
428 			 * wrong order (pageq -> object) and we don't want to
429 			 * deadlock.
430 			 *
431 			 * the only time we expect to see an ownerless page
432 			 * (i.e. a page with no uobject and !PQ_ANON) is if an
433 			 * anon has loaned a page from a uvm_object and the
434 			 * uvm_object has dropped the ownership.  in that
435 			 * case, the anon can "take over" the loaned page
436 			 * and make it its own.
437 			 */
438 
439 			/* is page part of an anon or ownerless ? */
440 			if ((p->pg_flags & PQ_ANON) || p->uobject == NULL) {
441 				anon = p->uanon;
442 				KASSERT(anon != NULL);
443 				if (!simple_lock_try(&anon->an_lock)) {
444 					/* lock failed, skip this page */
445 					continue;
446 				}
447 
448 				/*
449 				 * if the page is ownerless, claim it in the
450 				 * name of "anon"!
451 				 */
452 
453 				if ((p->pg_flags & PQ_ANON) == 0) {
454 					KASSERT(p->loan_count > 0);
455 					p->loan_count--;
456 					atomic_setbits_int(&p->pg_flags,
457 					    PQ_ANON);
458 					/* anon now owns it */
459 				}
460 				if (p->pg_flags & PG_BUSY) {
461 					simple_unlock(&anon->an_lock);
462 					uvmexp.pdbusy++;
463 					/* someone else owns page, skip it */
464 					continue;
465 				}
466 				uvmexp.pdanscan++;
467 			} else {
468 				uobj = p->uobject;
469 				KASSERT(uobj != NULL);
470 				if (!simple_lock_try(&uobj->vmobjlock)) {
471 					/* lock failed, skip this page */
472 					continue;
473 				}
474 				if (p->pg_flags & PG_BUSY) {
475 					simple_unlock(&uobj->vmobjlock);
476 					uvmexp.pdbusy++;
477 					/* someone else owns page, skip it */
478 					continue;
479 				}
480 				uvmexp.pdobscan++;
481 			}
482 
483 			/*
484 			 * we now have the object and the page queues locked.
485 			 * the page is not busy.   if the page is clean we
486 			 * can free it now and continue.
487 			 */
488 
489 			if (p->pg_flags & PG_CLEAN) {
490 				if (p->pg_flags & PQ_SWAPBACKED) {
491 					/* this page now lives only in swap */
492 					simple_lock(&uvm.swap_data_lock);
493 					uvmexp.swpgonly++;
494 					simple_unlock(&uvm.swap_data_lock);
495 				}
496 
497 				/* zap all mappings with pmap_page_protect... */
498 				pmap_page_protect(p, VM_PROT_NONE);
499 				uvm_pagefree(p);
500 				uvmexp.pdfreed++;
501 
502 				if (anon) {
503 
504 					/*
505 					 * an anonymous page can only be clean
506 					 * if it has backing store assigned.
507 					 */
508 
509 					KASSERT(anon->an_swslot != 0);
510 
511 					/* remove from object */
512 					anon->an_page = NULL;
513 					simple_unlock(&anon->an_lock);
514 				} else {
515 					/* pagefree has already removed the
516 					 * page from the object */
517 					simple_unlock(&uobj->vmobjlock);
518 				}
519 				continue;
520 			}
521 
522 			/*
523 			 * this page is dirty, skip it if we'll have met our
524 			 * free target when all the current pageouts complete.
525 			 */
526 
527 			if (free + uvmexp.paging > uvmexp.freetarg << 2) {
528 				if (anon) {
529 					simple_unlock(&anon->an_lock);
530 				} else {
531 					simple_unlock(&uobj->vmobjlock);
532 				}
533 				continue;
534 			}
535 
536 			/*
537 			 * this page is dirty, but we can't page it out
538 			 * this page is dirty, but we can't page it out: swap
539 			 * is full of pages that live only in swap, so no swap
540 			 * slot can be reclaimed.  reactivate it so that we
541 			 * eventually cycle all pages thru the inactive queue.
542 
543 			KASSERT(uvmexp.swpgonly <= uvmexp.swpages);
544 			if ((p->pg_flags & PQ_SWAPBACKED) &&
545 			    uvmexp.swpgonly == uvmexp.swpages) {
546 				dirtyreacts++;
547 				uvm_pageactivate(p);
548 				if (anon) {
549 					simple_unlock(&anon->an_lock);
550 				} else {
551 					simple_unlock(&uobj->vmobjlock);
552 				}
553 				continue;
554 			}
555 
556 			/*
557 			 * if the page is swap-backed and dirty and swap space
558 			 * is full, free any swap allocated to the page
559 			 * so that other pages can be paged out.
560 			 */
561 
562 			KASSERT(uvmexp.swpginuse <= uvmexp.swpages);
563 			if ((p->pg_flags & PQ_SWAPBACKED) &&
564 			    uvmexp.swpginuse == uvmexp.swpages) {
565 
566 				if ((p->pg_flags & PQ_ANON) &&
567 				    p->uanon->an_swslot) {
568 					uvm_swap_free(p->uanon->an_swslot, 1);
569 					p->uanon->an_swslot = 0;
570 				}
571 				if (p->pg_flags & PQ_AOBJ) {
572 					uao_dropswap(p->uobject,
573 						     p->offset >> PAGE_SHIFT);
574 				}
575 			}
576 
577 			/*
578 			 * the page we are looking at is dirty.   we must
579 			 * clean it before it can be freed.  to do this we
580 			 * first mark the page busy so that no one else will
581 			 * touch the page.   we write protect all the mappings
582 			 * of the page so that no one touches it while it is
583 			 * in I/O.
584 			 */
585 
586 			swap_backed = ((p->pg_flags & PQ_SWAPBACKED) != 0);
587 			atomic_setbits_int(&p->pg_flags, PG_BUSY);
588 			UVM_PAGE_OWN(p, "scan_inactive");
589 			pmap_page_protect(p, VM_PROT_READ);
590 			uvmexp.pgswapout++;
591 
592 			/*
593 			 * for swap-backed pages we need to (re)allocate
594 			 * swap space.
595 			 */
596 
597 			if (swap_backed) {
598 
599 				/*
600 				 * free old swap slot (if any)
601 				 */
602 
603 				if (anon) {
604 					if (anon->an_swslot) {
605 						uvm_swap_free(anon->an_swslot,
606 						    1);
607 						anon->an_swslot = 0;
608 					}
609 				} else {
610 					uao_dropswap(uobj,
611 						     p->offset >> PAGE_SHIFT);
612 				}
613 
614 				/*
615 				 * start new cluster (if necessary)
616 				 */
617 
618 				if (swslot == 0) {
619 					swnpages = MAXBSIZE >> PAGE_SHIFT;
620 					swslot = uvm_swap_alloc(&swnpages,
621 					    TRUE);
622 					if (swslot == 0) {
623 						/* no swap?  give up! */
624 						atomic_clearbits_int(
625 						    &p->pg_flags,
626 						    PG_BUSY);
627 						UVM_PAGE_OWN(p, NULL);
628 						if (anon)
629 							simple_unlock(
630 							    &anon->an_lock);
631 						else
632 							simple_unlock(
633 							    &uobj->vmobjlock);
634 						continue;
635 					}
636 					swcpages = 0;	/* cluster is empty */
637 				}
638 
639 				/*
640 				 * add block to cluster
641 				 */
642 
643 				swpps[swcpages] = p;
644 				if (anon)
645 					anon->an_swslot = swslot + swcpages;
646 				else
647 					uao_set_swslot(uobj,
648 					    p->offset >> PAGE_SHIFT,
649 					    swslot + swcpages);
650 				swcpages++;
651 			}
652 		} else {
653 
654 			/* if p == NULL we must be doing a last swap i/o */
655 			swap_backed = TRUE;
656 		}
657 
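		/*
		 * cluster bookkeeping recap (numbers for illustration):
		 * swslot is the base of a contiguous run of swnpages swap
		 * slots; the i-th page added goes into swpps[i] and is
		 * assigned slot swslot + i.  with 4KB pages and a 64KB
		 * MAXBSIZE a full cluster is 16 pages, so a base slot of 100
		 * maps swpps[0..15] to slots 100..115.  the cluster is
		 * written out below once swcpages == swnpages or the
		 * inactive list runs out.
		 */
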
658 		/*
659 		 * now consider doing the pageout.
660 		 *
661 		 * for swap-backed pages, we do the pageout if we have either
662 		 * filled the cluster (in which case swnpages == swcpages) or
663 		 * run out of pages (p == NULL).
664 		 *
665 		 * for object pages, we always do the pageout.
666 		 */
667 
668 		if (swap_backed) {
669 			if (p) {	/* if we just added a page to cluster */
670 				if (anon)
671 					simple_unlock(&anon->an_lock);
672 				else
673 					simple_unlock(&uobj->vmobjlock);
674 
675 				/* cluster not full yet? */
676 				if (swcpages < swnpages)
677 					continue;
678 			}
679 
680 			/* starting I/O now... set up for it */
681 			npages = swcpages;
682 			ppsp = swpps;
683 			/* for swap-backed pages only */
684 			start = (vaddr_t) swslot;
685 
686 			/* if this is final pageout we could have a few
687 			 * extra swap blocks */
688 			if (swcpages < swnpages) {
689 				uvm_swap_free(swslot + swcpages,
690 				    (swnpages - swcpages));
691 			}
692 		} else {
693 			/* normal object pageout */
694 			ppsp = pps;
695 			npages = sizeof(pps) / sizeof(struct vm_page *);
696 			/* not looked at because PGO_ALLPAGES is set */
697 			start = 0;
698 		}
699 
700 		/*
701 		 * now do the pageout.
702 		 *
703 		 * for swap_backed pages we have already built the cluster.
704 		 * for !swap_backed pages, uvm_pager_put will call the object's
705 		 * "make put cluster" function to build a cluster on our behalf.
706 		 *
707 		 * we pass the PGO_PDFREECLUST flag to uvm_pager_put to instruct
708 		 * it to free the cluster pages for us on a successful I/O (it
709 		 * always does this for un-successful I/O requests).  this
710 		 * allows us to do clustered pageout without having to deal
711 		 * with cluster pages at this level.
712 		 *
713 		 * note locking semantics of uvm_pager_put with PGO_PDFREECLUST:
714 		 *  IN: locked: uobj (if !swap_backed), page queues
715 		 * OUT: locked: uobj (if !swap_backed && result !=VM_PAGER_PEND)
716 		 *     !locked: pageqs, uobj (if swap_backed || VM_PAGER_PEND)
717 		 *
718 		 * [the bit about VM_PAGER_PEND saves us one lock-unlock pair]
719 		 */
720 
721 		/* locked: uobj (if !swap_backed), page queues */
722 		uvmexp.pdpageouts++;
723 		result = uvm_pager_put(swap_backed ? NULL : uobj, p,
724 		    &ppsp, &npages, PGO_ALLPAGES|PGO_PDFREECLUST, start, 0);
725 		/* locked: uobj (if !swap_backed && result != PEND) */
726 		/* unlocked: pageqs, object (if swap_backed ||result == PEND) */
727 
728 		/*
729 		 * if we did i/o to swap, zero swslot to indicate that we are
730 		 * no longer building a swap-backed cluster.
731 		 */
732 
733 		if (swap_backed)
734 			swslot = 0;		/* done with this cluster */
735 
736 		/*
737 		 * first, we check for VM_PAGER_PEND which means that the
738 		 * async I/O is in progress and the async I/O done routine
739 		 * will clean up after us.   in this case we move on to the
740 		 * next page.
741 		 *
742 		 * there is a very remote chance that the pending async i/o can
743 		 * finish _before_ we get here.   if that happens, our page "p"
744 		 * may no longer be on the inactive queue.   so we verify this
745 		 * when determining the next page (starting over at the head if
746 		 * we've lost our inactive page).
747 		 */
748 
749 		if (result == VM_PAGER_PEND) {
750 			uvmexp.paging += npages;
751 			uvm_lock_pageq();
752 			uvmexp.pdpending++;
753 			if (p) {
754 				if (p->pg_flags & PQ_INACTIVE)
755 					nextpg = TAILQ_NEXT(p, pageq);
756 				else
757 					nextpg = TAILQ_FIRST(pglst);
758 			} else {
759 				nextpg = NULL;
760 			}
761 			continue;
762 		}
763 
764 #ifdef UBC
765 		if (result == VM_PAGER_ERROR &&
766 		    curproc == uvm.pagedaemon_proc) {
767 			uvm_lock_pageq();
768 			nextpg = TAILQ_NEXT(p, pageq);
769 			uvm_pageactivate(p);
770 			continue;
771 		}
772 #endif
773 
774 		/*
775 		 * clean up "p" if we have one
776 		 */
777 
778 		if (p) {
779 			/*
780 			 * the I/O request to "p" is done and uvm_pager_put
781 			 * has freed any cluster pages it may have allocated
782 			 * during I/O.  all that is left for us to do is
783 			 * clean up page "p" (which is still PG_BUSY).
784 			 *
785 			 * our result could be one of the following:
786 			 *   VM_PAGER_OK: successful pageout
787 			 *
788 			 *   VM_PAGER_AGAIN: tmp resource shortage, we skip
789 			 *     to next page
790 			 *   VM_PAGER_{FAIL,ERROR,BAD}: an error.   we
791 			 *     "reactivate" page to get it out of the way (it
792 			 *     will eventually drift back into the inactive
793 			 *     queue for a retry).
794 			 *   VM_PAGER_UNLOCK: should never see this as it is
795 			 *     only valid for "get" operations
796 			 */
797 
798 			/* relock p's object: page queues not locked yet, so
799 			 * no need for "try" */
800 
801 			/* !swap_backed case: already locked... */
802 			if (swap_backed) {
803 				if (anon)
804 					simple_lock(&anon->an_lock);
805 				else
806 					simple_lock(&uobj->vmobjlock);
807 			}
808 
809 #ifdef DIAGNOSTIC
810 			if (result == VM_PAGER_UNLOCK)
811 				panic("pagedaemon: pageout returned "
812 				    "invalid 'unlock' code");
813 #endif
814 
815 			/* handle PG_WANTED now */
816 			if (p->pg_flags & PG_WANTED)
817 				/* still holding object lock */
818 				wakeup(p);
819 
820 			atomic_clearbits_int(&p->pg_flags, PG_BUSY|PG_WANTED);
821 			UVM_PAGE_OWN(p, NULL);
822 
823 			/* released during I/O? */
824 			if (p->pg_flags & PG_RELEASED) {
825 				if (anon) {
826 					/* remove page so we can get nextpg */
827 					anon->an_page = NULL;
828 
829 					simple_unlock(&anon->an_lock);
830 					uvm_anfree(anon);	/* kills anon */
831 					pmap_page_protect(p, VM_PROT_NONE);
832 					anon = NULL;
833 					uvm_lock_pageq();
834 					nextpg = TAILQ_NEXT(p, pageq);
835 					/* free released page */
836 					uvm_pagefree(p);
837 
838 				} else {
839 
840 					/*
841 					 * pgo_releasepg nukes the page and
842 					 * gets "nextpg" for us.  it returns
843 					 * with the page queues locked (when
844 					 * given nextpg ptr).
845 					 */
846 
847 					if (!uobj->pgops->pgo_releasepg(p,
848 					    &nextpg))
849 						/* uobj died after release */
850 						uobj = NULL;
851 				}
852 			} else {	/* page was not released during I/O */
853 				uvm_lock_pageq();
854 				nextpg = TAILQ_NEXT(p, pageq);
855 				if (result != VM_PAGER_OK) {
856 					/* pageout was a failure... */
857 					if (result != VM_PAGER_AGAIN)
858 						uvm_pageactivate(p);
859 					pmap_clear_reference(p);
860 					/* XXXCDC: if (swap_backed) FREE p's
861 					 * swap block? */
862 				} else {
863 					/* pageout was a success... */
864 					pmap_clear_reference(p);
865 					pmap_clear_modify(p);
866 					atomic_setbits_int(&p->pg_flags,
867 					    PG_CLEAN);
868 				}
869 			}
870 
871 			/*
872 			 * drop object lock (if there is an object left).   do
873 			 * a safety check of nextpg to make sure it is on the
874 			 * inactive queue (it should be since PG_BUSY pages on
875 			 * the inactive queue can't be re-queued [note: not
876 			 * true for active queue]).
877 			 */
878 
879 			if (anon)
880 				simple_unlock(&anon->an_lock);
881 			else if (uobj)
882 				simple_unlock(&uobj->vmobjlock);
883 
884 			if (nextpg && (nextpg->pg_flags & PQ_INACTIVE) == 0) {
885 				nextpg = TAILQ_FIRST(pglst);	/* reload! */
886 			}
887 		} else {
888 
889 			/*
890 			 * if p is null in this iteration, make sure it stays
891 			 * null in the next one.
892 			 */
893 
894 			nextpg = NULL;
895 
896 			/*
897 			 * lock page queues here just so they're always locked
898 			 * at the end of the loop.
899 			 */
900 
901 			uvm_lock_pageq();
902 		}
903 	}
904 	return (retval);
905 }
906 
907 /*
908  * uvmpd_scan: scan the page queues and attempt to meet our targets.
909  *
910  * => called with pageq's locked
911  */
912 
913 void
914 uvmpd_scan(void)
915 {
916 	int free, inactive_shortage, swap_shortage, pages_freed;
917 	struct vm_page *p, *nextpg;
918 	struct uvm_object *uobj;
919 	boolean_t got_it;
920 	UVMHIST_FUNC("uvmpd_scan"); UVMHIST_CALLED(pdhist);
921 
922 	uvmexp.pdrevs++;		/* counter */
923 	uobj = NULL;
924 
925 	/*
926 	 * get current "free" page count
927 	 */
928 	free = uvmexp.free - BUFPAGES_DEFICIT;
929 
930 #ifndef __SWAP_BROKEN
931 	/*
932 	 * swap out some processes if we are below our free target.
933 	 * we need to unlock the page queues for this.
934 	 */
935 	if (free < uvmexp.freetarg) {
936 		uvmexp.pdswout++;
937 		UVMHIST_LOG(pdhist,"  free %ld < target %ld: swapout", free,
938 		    uvmexp.freetarg, 0, 0);
939 		uvm_unlock_pageq();
940 		uvm_swapout_threads();
941 		uvm_lock_pageq();
942 	}
943 #endif
944 
945 	/*
946 	 * now we want to work on meeting our targets.   first we work on our
947 	 * free target by converting inactive pages into free pages.  then
948 	 * we work on meeting our inactive target by converting active pages
949 	 * to inactive ones.
950 	 */
951 
952 	UVMHIST_LOG(pdhist, "  starting 'free' loop",0,0,0,0);
953 
954 	/*
955 	 * alternate starting queue between swap and object based on the
956 	 * low bit of uvmexp.pdrevs (which we bump by one each call).
957 	 */
958 
959 	got_it = FALSE;
960 	pages_freed = uvmexp.pdfreed;	/* XXX - int */
961 	if ((uvmexp.pdrevs & 1) != 0 && uvmexp.nswapdev != 0)
962 		got_it = uvmpd_scan_inactive(&uvm.page_inactive_swp);
963 	if (!got_it)
964 		got_it = uvmpd_scan_inactive(&uvm.page_inactive_obj);
965 	if (!got_it && (uvmexp.pdrevs & 1) == 0 && uvmexp.nswapdev != 0)
966 		(void) uvmpd_scan_inactive(&uvm.page_inactive_swp);
967 	pages_freed = uvmexp.pdfreed - pages_freed;
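
	/*
	 * to spell out the alternation (illustration): on odd passes the
	 * swap-backed inactive queue is scanned first and the object-backed
	 * queue second; on even passes the order is reversed.  the later
	 * scans only run when an earlier one did not report the free target
	 * met, so neither queue can starve the other.
	 */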
968 
969 	/*
970 	 * we have done the scan to get free pages.   now we work on meeting
971 	 * our inactive target.
972 	 */
973 
974 	inactive_shortage = uvmexp.inactarg - uvmexp.inactive;
975 
976 	/*
977 	 * detect if we're not going to be able to page anything out
978 	 * until we free some swap resources from active pages.
979 	 */
980 
981 	swap_shortage = 0;
982 	if (uvmexp.free < uvmexp.freetarg &&
983 	    uvmexp.swpginuse == uvmexp.swpages &&
984 	    uvmexp.swpgonly < uvmexp.swpages &&
985 	    pages_freed == 0) {
986 		swap_shortage = uvmexp.freetarg - uvmexp.free;
987 	}
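
	/*
	 * e.g. (illustration): free == 200, freetarg == 256, every swap slot
	 * allocated (swpginuse == swpages) but some of those slots still have
	 * an in-core copy (swpgonly < swpages), and the inactive scan freed
	 * nothing.  then swap_shortage == 56 and the loop below may free the
	 * swap slots of up to 56 resident active pages so that pageout can
	 * make progress again.
	 */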
988 
989 	UVMHIST_LOG(pdhist, "  loop 2: inactive_shortage=%ld swap_shortage=%ld",
990 		    inactive_shortage, swap_shortage,0,0);
991 	for (p = TAILQ_FIRST(&uvm.page_active);
992 	     p != NULL && (inactive_shortage > 0 || swap_shortage > 0);
993 	     p = nextpg) {
994 		nextpg = TAILQ_NEXT(p, pageq);
995 		if (p->pg_flags & PG_BUSY)
996 			continue;	/* quick check before trying to lock */
997 
998 		/*
999 		 * lock the page's owner.
1000 		 */
1001 		/* is page anon owned or ownerless? */
1002 		if ((p->pg_flags & PQ_ANON) || p->uobject == NULL) {
1003 			KASSERT(p->uanon != NULL);
1004 			if (!simple_lock_try(&p->uanon->an_lock))
1005 				continue;
1006 
1007 			/* take over the page? */
1008 			if ((p->pg_flags & PQ_ANON) == 0) {
1009 				KASSERT(p->loan_count > 0);
1010 				p->loan_count--;
1011 				atomic_setbits_int(&p->pg_flags, PQ_ANON);
1012 			}
1013 		} else {
1014 			if (!simple_lock_try(&p->uobject->vmobjlock))
1015 				continue;
1016 		}
1017 
1018 		/*
1019 		 * skip this page if it's busy.
1020 		 */
1021 
1022 		if ((p->pg_flags & PG_BUSY) != 0) {
1023 			if (p->pg_flags & PQ_ANON)
1024 				simple_unlock(&p->uanon->an_lock);
1025 			else
1026 				simple_unlock(&p->uobject->vmobjlock);
1027 			continue;
1028 		}
1029 
1030 		/*
1031 		 * if there's a shortage of swap, free any swap allocated
1032 		 * to this page so that other pages can be paged out.
1033 		 */
1034 
1035 		if (swap_shortage > 0) {
1036 			if ((p->pg_flags & PQ_ANON) && p->uanon->an_swslot) {
1037 				uvm_swap_free(p->uanon->an_swslot, 1);
1038 				p->uanon->an_swslot = 0;
1039 				atomic_clearbits_int(&p->pg_flags, PG_CLEAN);
1040 				swap_shortage--;
1041 			}
1042 			if (p->pg_flags & PQ_AOBJ) {
1043 				int slot = uao_set_swslot(p->uobject,
1044 					p->offset >> PAGE_SHIFT, 0);
1045 				if (slot) {
1046 					uvm_swap_free(slot, 1);
1047 					atomic_clearbits_int(&p->pg_flags,
1048 					    PG_CLEAN);
1049 					swap_shortage--;
1050 				}
1051 			}
1052 		}
1053 
1054 		/*
1055 		 * deactivate this page if there's a shortage of
1056 		 * inactive pages.
1057 		 */
1058 
1059 		if (inactive_shortage > 0) {
1060 			pmap_page_protect(p, VM_PROT_NONE);
1061 			/* no need to check wire_count as pg is "active" */
1062 			uvm_pagedeactivate(p);
1063 			uvmexp.pddeact++;
1064 			inactive_shortage--;
1065 		}
1066 		if (p->pg_flags & PQ_ANON)
1067 			simple_unlock(&p->uanon->an_lock);
1068 		else
1069 			simple_unlock(&p->uobject->vmobjlock);
1070 	}
1071 }
1072