/*	$NetBSD: uvm_pdpolicy_clock.c,v 1.36 2020/04/02 16:29:30 maxv Exp $	*/
/*	NetBSD: uvm_pdaemon.c,v 1.72 2006/01/05 10:47:33 yamt Exp $	*/

/*-
 * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * Copyright (c) 1991, 1993, The Regents of the University of California.
 *
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * The Mach Operating System project at Carnegie-Mellon University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vm_pageout.c        8.5 (Berkeley) 2/14/94
 * from: Id: uvm_pdaemon.c,v 1.1.2.32 1998/02/06 05:26:30 chs Exp
 *
 *
 * Copyright (c) 1987, 1990 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

#if defined(PDSIM)

#include "pdsim.h"

#else /* defined(PDSIM) */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_pdpolicy_clock.c,v 1.36 2020/04/02 16:29:30 maxv Exp $");

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/atomic.h>

#include <uvm/uvm.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_pdpolicy_impl.h>
#include <uvm/uvm_stat.h>

#endif /* defined(PDSIM) */

/*
 * per-CPU queue of pending page status changes.  128 entries makes for a
 * 1kB queue on _LP64 and has been found to be a reasonable compromise that
 * keeps lock contention events and wait times low, while not using too much
 * memory nor allowing global state to fall too far behind.
 */
#if !defined(CLOCK_PDQ_SIZE)
#define	CLOCK_PDQ_SIZE	128
#endif /* !defined(CLOCK_PDQ_SIZE) */

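/*
 * Illustrative arithmetic only (not used by the code): each queue entry is
 * a struct vm_page pointer, so on an _LP64 platform the per-CPU queue takes
 * roughly CLOCK_PDQ_SIZE * sizeof(struct vm_page *) = 128 * 8 = 1024 bytes;
 * the same 128 entries take about 512 bytes on a 32-bit platform.
 */
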
#define PQ_INACTIVE	0x00000010	/* page is in inactive list */
#define PQ_ACTIVE	0x00000020	/* page is in active list */

#if !defined(CLOCK_INACTIVEPCT)
#define	CLOCK_INACTIVEPCT	33
#endif /* !defined(CLOCK_INACTIVEPCT) */

struct uvmpdpol_globalstate {
	kmutex_t lock;			/* lock on state */
					/* <= compiler pads here */
	struct pglist s_activeq		/* allocated pages, in use */
	    __aligned(COHERENCY_UNIT);
	struct pglist s_inactiveq;	/* pages between the clock hands */
	int s_active;
	int s_inactive;
	int s_inactarg;
	struct uvm_pctparam s_anonmin;
	struct uvm_pctparam s_filemin;
	struct uvm_pctparam s_execmin;
	struct uvm_pctparam s_anonmax;
	struct uvm_pctparam s_filemax;
	struct uvm_pctparam s_execmax;
	struct uvm_pctparam s_inactivepct;
};

struct uvmpdpol_scanstate {
	bool ss_anonreact, ss_filereact, ss_execreact;
	struct vm_page ss_marker;
};

static void	uvmpdpol_pageactivate_locked(struct vm_page *);
static void	uvmpdpol_pagedeactivate_locked(struct vm_page *);
static void	uvmpdpol_pagedequeue_locked(struct vm_page *);
static bool	uvmpdpol_pagerealize_locked(struct vm_page *);
static struct uvm_cpu *uvmpdpol_flush(void);

static struct uvmpdpol_globalstate pdpol_state __cacheline_aligned;
static struct uvmpdpol_scanstate pdpol_scanstate;

PDPOL_EVCNT_DEFINE(reactexec)
PDPOL_EVCNT_DEFINE(reactfile)
PDPOL_EVCNT_DEFINE(reactanon)

static void
clock_tune(void)
{
	struct uvmpdpol_globalstate *s = &pdpol_state;

	s->s_inactarg = UVM_PCTPARAM_APPLY(&s->s_inactivepct,
	    s->s_active + s->s_inactive);
	if (s->s_inactarg <= uvmexp.freetarg) {
		s->s_inactarg = uvmexp.freetarg + 1;
	}
}

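/*
 * Worked example (illustrative numbers only): with the default
 * CLOCK_INACTIVEPCT of 33 and 30000 pages on the active plus inactive
 * queues, the inactive target above comes out at roughly 33% of 30000,
 * i.e. 9900 pages.  If uvmexp.freetarg happened to be 10000, the target
 * would instead be clamped to freetarg + 1 = 10001, so the pagedaemon
 * always has at least some inactive pages to scan.
 */
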
void
uvmpdpol_scaninit(void)
{
	struct uvmpdpol_globalstate *s = &pdpol_state;
	struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
	int t;
	bool anonunder, fileunder, execunder;
	bool anonover, fileover, execover;
	bool anonreact, filereact, execreact;
	int64_t freepg, anonpg, filepg, execpg;

	/*
	 * decide which types of pages we want to reactivate instead of freeing
	 * to keep usage within the minimum and maximum usage limits.
	 */

	cpu_count_sync_all();
	freepg = uvm_availmem();
	anonpg = cpu_count_get(CPU_COUNT_ANONPAGES);
	filepg = cpu_count_get(CPU_COUNT_FILEPAGES);
	execpg = cpu_count_get(CPU_COUNT_EXECPAGES);

	mutex_enter(&s->lock);
	t = s->s_active + s->s_inactive + freepg;
	anonunder = anonpg <= UVM_PCTPARAM_APPLY(&s->s_anonmin, t);
	fileunder = filepg <= UVM_PCTPARAM_APPLY(&s->s_filemin, t);
	execunder = execpg <= UVM_PCTPARAM_APPLY(&s->s_execmin, t);
	anonover = anonpg > UVM_PCTPARAM_APPLY(&s->s_anonmax, t);
	fileover = filepg > UVM_PCTPARAM_APPLY(&s->s_filemax, t);
	execover = execpg > UVM_PCTPARAM_APPLY(&s->s_execmax, t);
	anonreact = anonunder || (!anonover && (fileover || execover));
	filereact = fileunder || (!fileover && (anonover || execover));
	execreact = execunder || (!execover && (anonover || fileover));
	if (filereact && execreact && (anonreact || uvm_swapisfull())) {
		anonreact = filereact = execreact = false;
	}
	ss->ss_anonreact = anonreact;
	ss->ss_filereact = filereact;
	ss->ss_execreact = execreact;
	memset(&ss->ss_marker, 0, sizeof(ss->ss_marker));
	ss->ss_marker.flags = PG_MARKER;
	TAILQ_INSERT_HEAD(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue);
	mutex_exit(&s->lock);
}

void
uvmpdpol_scanfini(void)
{
	struct uvmpdpol_globalstate *s = &pdpol_state;
	struct uvmpdpol_scanstate *ss = &pdpol_scanstate;

	mutex_enter(&s->lock);
	TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue);
	mutex_exit(&s->lock);
}

struct vm_page *
uvmpdpol_selectvictim(krwlock_t **plock)
{
	struct uvmpdpol_globalstate *s = &pdpol_state;
	struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
	struct vm_page *pg;
	krwlock_t *lock;

	mutex_enter(&s->lock);
	while (/* CONSTCOND */ 1) {
		struct vm_anon *anon;
		struct uvm_object *uobj;

		pg = TAILQ_NEXT(&ss->ss_marker, pdqueue);
		if (pg == NULL) {
			break;
		}
		KASSERT((pg->flags & PG_MARKER) == 0);
		uvmexp.pdscans++;

		/*
		 * acquire the interlock to stabilize page identity.
		 * if we have caught the page in a state of flux,
		 * deal with it and retry.
		 */
		mutex_enter(&pg->interlock);
		if (uvmpdpol_pagerealize_locked(pg)) {
			mutex_exit(&pg->interlock);
			continue;
		}

		/*
		 * now prepare to move on to the next page.
		 */
		TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker,
		    pdqueue);
		TAILQ_INSERT_AFTER(&pdpol_state.s_inactiveq, pg,
		    &ss->ss_marker, pdqueue);

		/*
		 * enforce the minimum thresholds on different
		 * types of memory usage.  if reusing the current
		 * page would reduce that type of usage below its
		 * minimum, reactivate the page instead and move
		 * on to the next page.
		 */
		anon = pg->uanon;
		uobj = pg->uobject;
		if (uobj && UVM_OBJ_IS_VTEXT(uobj) && ss->ss_execreact) {
			uvmpdpol_pageactivate_locked(pg);
			mutex_exit(&pg->interlock);
			PDPOL_EVCNT_INCR(reactexec);
			continue;
		}
		if (uobj && UVM_OBJ_IS_VNODE(uobj) &&
		    !UVM_OBJ_IS_VTEXT(uobj) && ss->ss_filereact) {
			uvmpdpol_pageactivate_locked(pg);
			mutex_exit(&pg->interlock);
			PDPOL_EVCNT_INCR(reactfile);
			continue;
		}
		if ((anon || UVM_OBJ_IS_AOBJ(uobj)) && ss->ss_anonreact) {
			uvmpdpol_pageactivate_locked(pg);
			mutex_exit(&pg->interlock);
			PDPOL_EVCNT_INCR(reactanon);
			continue;
		}

		/*
		 * try to lock the object that owns the page.
		 *
		 * with the page interlock held, we can drop s->lock, which
		 * could otherwise serve as a barrier to us getting the
		 * object locked, because the owner of the object's lock may
		 * be blocked on s->lock (i.e. a deadlock).
		 *
		 * whatever happens, uvmpd_trylockowner() will release the
		 * interlock.  with the interlock dropped we can then
		 * re-acquire our own lock.  the order is:
		 *
		 *	object -> pdpol -> interlock.
		 */
		mutex_exit(&s->lock);
		lock = uvmpd_trylockowner(pg);
		/* pg->interlock now released */
		mutex_enter(&s->lock);
		if (lock == NULL) {
			/* didn't get it - try the next page. */
			continue;
		}

		/*
		 * move referenced pages back to the active queue and skip
		 * to the next page.
		 */
		if (pmap_is_referenced(pg)) {
			mutex_enter(&pg->interlock);
			uvmpdpol_pageactivate_locked(pg);
			mutex_exit(&pg->interlock);
			uvmexp.pdreact++;
			rw_exit(lock);
			continue;
		}

		/* we have a potential victim. */
		*plock = lock;
		break;
	}
	mutex_exit(&s->lock);
	return pg;
}

void
uvmpdpol_balancequeue(int swap_shortage)
{
	struct uvmpdpol_globalstate *s = &pdpol_state;
	int inactive_shortage;
	struct vm_page *p, marker;
	krwlock_t *lock;

	/*
	 * we have done the scan to get free pages.  now we work on meeting
	 * our inactive target.
	 */

	memset(&marker, 0, sizeof(marker));
	marker.flags = PG_MARKER;

	mutex_enter(&s->lock);
	TAILQ_INSERT_HEAD(&pdpol_state.s_activeq, &marker, pdqueue);
	for (;;) {
		inactive_shortage =
		    pdpol_state.s_inactarg - pdpol_state.s_inactive;
		if (inactive_shortage <= 0 && swap_shortage <= 0) {
			break;
		}
		p = TAILQ_NEXT(&marker, pdqueue);
		if (p == NULL) {
			break;
		}
		KASSERT((p->flags & PG_MARKER) == 0);

		/*
		 * acquire the interlock to stabilize page identity.
		 * if we have caught the page in a state of flux,
		 * deal with it and retry.
		 */
		mutex_enter(&p->interlock);
		if (uvmpdpol_pagerealize_locked(p)) {
			mutex_exit(&p->interlock);
			continue;
		}

		/*
		 * now prepare to move on to the next page.
		 */
		TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue);
		TAILQ_INSERT_AFTER(&pdpol_state.s_activeq, p, &marker,
		    pdqueue);

		/*
		 * try to lock the object that owns the page.  see comments
		 * in uvmpdpol_selectvictim().
		 */
		mutex_exit(&s->lock);
		lock = uvmpd_trylockowner(p);
		/* p->interlock now released */
		mutex_enter(&s->lock);
		if (lock == NULL) {
			/* didn't get it - try the next page. */
			continue;
		}

		/*
		 * if there's a shortage of swap slots, try to free this
		 * page's swap slot.
		 */
		if (swap_shortage > 0 && (p->flags & PG_SWAPBACKED) != 0 &&
		    (p->flags & PG_BUSY) == 0) {
			if (uvmpd_dropswap(p)) {
				swap_shortage--;
			}
		}

		/*
		 * if there's a shortage of inactive pages, deactivate.
		 */
		if (inactive_shortage > 0) {
			pmap_clear_reference(p);
			mutex_enter(&p->interlock);
			uvmpdpol_pagedeactivate_locked(p);
			mutex_exit(&p->interlock);
			uvmexp.pddeact++;
			inactive_shortage--;
		}
		rw_exit(lock);
	}
	TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue);
	mutex_exit(&s->lock);
}

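/*
 * Illustrative numbers only: if s_inactarg is 12000 and s_inactive is 9000
 * when the loop above starts, up to 3000 pages are moved from the active
 * queue to the inactive queue in this pass (fewer if the active queue is
 * exhausted or pages cannot be locked), and pddeact is bumped for each one.
 */
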
static void
uvmpdpol_pagedeactivate_locked(struct vm_page *pg)
{
	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;

	KASSERT(mutex_owned(&s->lock));
	KASSERT(mutex_owned(&pg->interlock));
	KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
	    (PQ_INTENT_D | PQ_INTENT_SET));

	if (pg->pqflags & PQ_ACTIVE) {
		TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
		KASSERT(pdpol_state.s_active > 0);
		pdpol_state.s_active--;
	}
	if ((pg->pqflags & PQ_INACTIVE) == 0) {
		KASSERT(pg->wire_count == 0);
		TAILQ_INSERT_TAIL(&pdpol_state.s_inactiveq, pg, pdqueue);
		pdpol_state.s_inactive++;
	}
	pg->pqflags &= ~(PQ_ACTIVE | PQ_INTENT_SET);
	pg->pqflags |= PQ_INACTIVE;
}

void
uvmpdpol_pagedeactivate(struct vm_page *pg)
{

	KASSERT(uvm_page_owner_locked_p(pg, false));
	KASSERT(mutex_owned(&pg->interlock));

	/*
	 * we have to clear the reference bit now, as when it comes time to
	 * realize the intent we won't have the object locked any more.
	 */
	pmap_clear_reference(pg);
	uvmpdpol_set_intent(pg, PQ_INTENT_I);
}

static void
uvmpdpol_pageactivate_locked(struct vm_page *pg)
{
	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;

	KASSERT(mutex_owned(&s->lock));
	KASSERT(mutex_owned(&pg->interlock));
	KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
	    (PQ_INTENT_D | PQ_INTENT_SET));

	uvmpdpol_pagedequeue_locked(pg);
	TAILQ_INSERT_TAIL(&pdpol_state.s_activeq, pg, pdqueue);
	pdpol_state.s_active++;
	pg->pqflags &= ~(PQ_INACTIVE | PQ_INTENT_SET);
	pg->pqflags |= PQ_ACTIVE;
}

void
uvmpdpol_pageactivate(struct vm_page *pg)
{

	KASSERT(uvm_page_owner_locked_p(pg, false));
	KASSERT(mutex_owned(&pg->interlock));

	uvmpdpol_set_intent(pg, PQ_INTENT_A);
}

static void
uvmpdpol_pagedequeue_locked(struct vm_page *pg)
{
	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;

	KASSERT(mutex_owned(&s->lock));
	KASSERT(mutex_owned(&pg->interlock));

	if (pg->pqflags & PQ_ACTIVE) {
		TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
		KASSERT((pg->pqflags & PQ_INACTIVE) == 0);
		KASSERT(pdpol_state.s_active > 0);
		pdpol_state.s_active--;
	} else if (pg->pqflags & PQ_INACTIVE) {
		TAILQ_REMOVE(&pdpol_state.s_inactiveq, pg, pdqueue);
		KASSERT(pdpol_state.s_inactive > 0);
		pdpol_state.s_inactive--;
	}
	pg->pqflags &= ~(PQ_ACTIVE | PQ_INACTIVE | PQ_INTENT_SET);
}

void
uvmpdpol_pagedequeue(struct vm_page *pg)
{

	KASSERT(uvm_page_owner_locked_p(pg, true));
	KASSERT(mutex_owned(&pg->interlock));

	uvmpdpol_set_intent(pg, PQ_INTENT_D);
}

void
uvmpdpol_pageenqueue(struct vm_page *pg)
{

	KASSERT(uvm_page_owner_locked_p(pg, false));
	KASSERT(mutex_owned(&pg->interlock));

	uvmpdpol_set_intent(pg, PQ_INTENT_E);
}

void
uvmpdpol_anfree(struct vm_anon *an)
{
}

bool
uvmpdpol_pageisqueued_p(struct vm_page *pg)
{
	uint32_t pqflags;

	/*
	 * if there's an intent set, we have to consider it.  otherwise,
	 * return the actual state.  we may be called unlocked for the
	 * purpose of assertions, which is safe due to the page lifecycle.
	 */
	pqflags = atomic_load_relaxed(&pg->pqflags);
	if ((pqflags & PQ_INTENT_SET) != 0) {
		return (pqflags & PQ_INTENT_MASK) != PQ_INTENT_D;
	} else {
		return (pqflags & (PQ_ACTIVE | PQ_INACTIVE)) != 0;
	}
}

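/*
 * Example of the precedence above (illustrative only): a page still on the
 * active queue (PQ_ACTIVE set) but carrying a pending dequeue intent
 * (PQ_INTENT_D | PQ_INTENT_SET) is reported as not queued, while a page
 * with no queue flags but a pending PQ_INTENT_A is reported as queued,
 * because the pending intents will be realized later.
 */
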
void
uvmpdpol_estimatepageable(int *active, int *inactive)
{
	struct uvmpdpol_globalstate *s = &pdpol_state;

	/*
	 * Don't take any locks here.  This can be called from DDB, and in
	 * any case the numbers are stale the instant the lock is dropped,
	 * so it just doesn't matter.
	 */
	if (active) {
		*active = s->s_active;
	}
	if (inactive) {
		*inactive = s->s_inactive;
	}
}

#if !defined(PDSIM)
static int
min_check(struct uvm_pctparam *pct, int t)
{
	struct uvmpdpol_globalstate *s = &pdpol_state;
	int total = t;

	if (pct != &s->s_anonmin) {
		total += uvm_pctparam_get(&s->s_anonmin);
	}
	if (pct != &s->s_filemin) {
		total += uvm_pctparam_get(&s->s_filemin);
	}
	if (pct != &s->s_execmin) {
		total += uvm_pctparam_get(&s->s_execmin);
	}
	if (total > 95) {
		return EINVAL;
	}
	return 0;
}
#endif /* !defined(PDSIM) */

void
uvmpdpol_init(void)
{
	struct uvmpdpol_globalstate *s = &pdpol_state;

	mutex_init(&s->lock, MUTEX_DEFAULT, IPL_NONE);
	TAILQ_INIT(&s->s_activeq);
	TAILQ_INIT(&s->s_inactiveq);
	uvm_pctparam_init(&s->s_inactivepct, CLOCK_INACTIVEPCT, NULL);
	uvm_pctparam_init(&s->s_anonmin, 10, min_check);
	uvm_pctparam_init(&s->s_filemin, 10, min_check);
	uvm_pctparam_init(&s->s_execmin,  5, min_check);
	uvm_pctparam_init(&s->s_anonmax, 80, NULL);
	uvm_pctparam_init(&s->s_filemax, 50, NULL);
	uvm_pctparam_init(&s->s_execmax, 30, NULL);
}

void
uvmpdpol_init_cpu(struct uvm_cpu *ucpu)
{

	ucpu->pdq =
	    kmem_alloc(CLOCK_PDQ_SIZE * sizeof(struct vm_page *), KM_SLEEP);
	ucpu->pdqhead = CLOCK_PDQ_SIZE;
	ucpu->pdqtail = CLOCK_PDQ_SIZE;
}

void
uvmpdpol_reinit(void)
{
}

bool
uvmpdpol_needsscan_p(void)
{

	/*
	 * this must be an unlocked check: can be called from interrupt.
	 */
	return pdpol_state.s_inactive < pdpol_state.s_inactarg;
}

void
uvmpdpol_tune(void)
{
	struct uvmpdpol_globalstate *s = &pdpol_state;

	mutex_enter(&s->lock);
	clock_tune();
	mutex_exit(&s->lock);
}

/*
 * uvmpdpol_pagerealize_locked: take the intended state set on a page and
 * make it real.  return true if any work was done.
 */
static bool
uvmpdpol_pagerealize_locked(struct vm_page *pg)
{
	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;

	KASSERT(mutex_owned(&s->lock));
	KASSERT(mutex_owned(&pg->interlock));

	switch (pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) {
	case PQ_INTENT_A | PQ_INTENT_SET:
	case PQ_INTENT_E | PQ_INTENT_SET:
		uvmpdpol_pageactivate_locked(pg);
		return true;
	case PQ_INTENT_I | PQ_INTENT_SET:
		uvmpdpol_pagedeactivate_locked(pg);
		return true;
	case PQ_INTENT_D | PQ_INTENT_SET:
		uvmpdpol_pagedequeue_locked(pg);
		return true;
	default:
		return false;
	}
}

/*
 * uvmpdpol_flush: return the current uvm_cpu with all of its pending
 * updates flushed to the global queues.  this routine may block, and
 * so can switch cpu.  the idea is to empty the queue on whatever cpu
 * we finally end up on.
 */
static struct uvm_cpu *
uvmpdpol_flush(void)
{
	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
	struct uvm_cpu *ucpu;
	struct vm_page *pg;

	KASSERT(kpreempt_disabled());

	mutex_enter(&s->lock);
	for (;;) {
		/*
		 * prefer scanning forwards (even though mutex_enter() is
		 * serializing) so as to not defeat any prefetch logic in
		 * the CPU.  that means enqueuing backwards elsewhere, like
		 * a stack, but that matters less there as pages are added
		 * one at a time.
		 *
		 * prefetch the next "struct vm_page" while working on the
		 * current one.  this has a measurable and very positive
		 * effect in reducing the amount of time spent here under
		 * the global lock.
		 */
		ucpu = curcpu()->ci_data.cpu_uvm;
		KASSERT(ucpu->pdqhead <= ucpu->pdqtail);
		if (__predict_false(ucpu->pdqhead == ucpu->pdqtail)) {
			break;
		}
		pg = ucpu->pdq[ucpu->pdqhead++];
		if (__predict_true(ucpu->pdqhead != ucpu->pdqtail)) {
			__builtin_prefetch(ucpu->pdq[ucpu->pdqhead]);
		}
		mutex_enter(&pg->interlock);
		pg->pqflags &= ~PQ_INTENT_QUEUED;
		(void)uvmpdpol_pagerealize_locked(pg);
		mutex_exit(&pg->interlock);
	}
	mutex_exit(&s->lock);
	return ucpu;
}

/*
 * uvmpdpol_pagerealize: realize any intent set on the page.  in this
 * implementation, that means putting the page on a per-CPU queue to be
 * dealt with later.
 */
void
uvmpdpol_pagerealize(struct vm_page *pg)
{
	struct uvm_cpu *ucpu;

	/*
	 * drain the per-CPU queue if full, then enter the page.
	 */
	kpreempt_disable();
	ucpu = curcpu()->ci_data.cpu_uvm;
	if (__predict_false(ucpu->pdqhead == 0)) {
		ucpu = uvmpdpol_flush();
	}
	ucpu->pdq[--(ucpu->pdqhead)] = pg;
	kpreempt_enable();
}

/*
 * uvmpdpol_idle: called from the system idle loop.  periodically purge any
 * pending updates back to the global queues.
 */
void
uvmpdpol_idle(struct uvm_cpu *ucpu)
{
	struct uvmpdpol_globalstate *s = &pdpol_state;
	struct vm_page *pg;

	KASSERT(kpreempt_disabled());

	/*
	 * if no pages in the queue, we have nothing to do.
	 */
	if (ucpu->pdqhead == ucpu->pdqtail) {
		ucpu->pdqtime = getticks();
		return;
	}

	/*
	 * don't do this more than ~8 times a second as it would needlessly
	 * exert pressure.
	 */
	if (getticks() - ucpu->pdqtime < (hz >> 3)) {
		return;
	}

	/*
	 * the idle LWP can't block, so we have to try for the lock.  if we
	 * get it, purge the per-CPU pending update queue.  continually
	 * check for a pending resched: in that case exit immediately.
	 */
	if (mutex_tryenter(&s->lock)) {
		while (ucpu->pdqhead != ucpu->pdqtail) {
			pg = ucpu->pdq[ucpu->pdqhead];
			if (!mutex_tryenter(&pg->interlock)) {
				break;
			}
			ucpu->pdqhead++;
			pg->pqflags &= ~PQ_INTENT_QUEUED;
			(void)uvmpdpol_pagerealize_locked(pg);
			mutex_exit(&pg->interlock);
			if (curcpu()->ci_want_resched) {
				break;
			}
		}
		if (ucpu->pdqhead == ucpu->pdqtail) {
			ucpu->pdqtime = getticks();
		}
		mutex_exit(&s->lock);
	}
}

#if !defined(PDSIM)

#include <sys/sysctl.h>	/* XXX SYSCTL_DESCR */

void
uvmpdpol_sysctlsetup(void)
{
	struct uvmpdpol_globalstate *s = &pdpol_state;

	uvm_pctparam_createsysctlnode(&s->s_anonmin, "anonmin",
	    SYSCTL_DESCR("Percentage of physical memory reserved "
	    "for anonymous application data"));
	uvm_pctparam_createsysctlnode(&s->s_filemin, "filemin",
	    SYSCTL_DESCR("Percentage of physical memory reserved "
	    "for cached file data"));
	uvm_pctparam_createsysctlnode(&s->s_execmin, "execmin",
	    SYSCTL_DESCR("Percentage of physical memory reserved "
	    "for cached executable data"));

	uvm_pctparam_createsysctlnode(&s->s_anonmax, "anonmax",
	    SYSCTL_DESCR("Percentage of physical memory which will "
	    "be reclaimed from other usage for "
	    "anonymous application data"));
	uvm_pctparam_createsysctlnode(&s->s_filemax, "filemax",
	    SYSCTL_DESCR("Percentage of physical memory which will "
	    "be reclaimed from other usage for cached "
	    "file data"));
	uvm_pctparam_createsysctlnode(&s->s_execmax, "execmax",
	    SYSCTL_DESCR("Percentage of physical memory which will "
	    "be reclaimed from other usage for cached "
	    "executable data"));

	uvm_pctparam_createsysctlnode(&s->s_inactivepct, "inactivepct",
	    SYSCTL_DESCR("Percentage of inactive queue of "
	    "the entire (active + inactive) queue"));
}

#endif /* !defined(PDSIM) */

#if defined(PDSIM)
void
pdsim_dump(const char *id)
{
#if defined(DEBUG)
	/* XXX */
#endif /* defined(DEBUG) */
}
#endif /* defined(PDSIM) */