1 /*	$NetBSD: uvm_pdpolicy_clock.c,v 1.40 2022/04/12 20:27:56 andvar Exp $	*/
2 /*	NetBSD: uvm_pdaemon.c,v 1.72 2006/01/05 10:47:33 yamt Exp $	*/
3 
4 /*-
5  * Copyright (c) 2019, 2020 The NetBSD Foundation, Inc.
6  * All rights reserved.
7  *
8  * This code is derived from software contributed to The NetBSD Foundation
9  * by Andrew Doran.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice, this list of conditions and the following disclaimer.
16  * 2. Redistributions in binary form must reproduce the above copyright
17  *    notice, this list of conditions and the following disclaimer in the
18  *    documentation and/or other materials provided with the distribution.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
21  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
23  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
24  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * Copyright (c) 1997 Charles D. Cranor and Washington University.
35  * Copyright (c) 1991, 1993, The Regents of the University of California.
36  *
37  * All rights reserved.
38  *
39  * This code is derived from software contributed to Berkeley by
40  * The Mach Operating System project at Carnegie-Mellon University.
41  *
42  * Redistribution and use in source and binary forms, with or without
43  * modification, are permitted provided that the following conditions
44  * are met:
45  * 1. Redistributions of source code must retain the above copyright
46  *    notice, this list of conditions and the following disclaimer.
47  * 2. Redistributions in binary form must reproduce the above copyright
48  *    notice, this list of conditions and the following disclaimer in the
49  *    documentation and/or other materials provided with the distribution.
50  * 3. Neither the name of the University nor the names of its contributors
51  *    may be used to endorse or promote products derived from this software
52  *    without specific prior written permission.
53  *
54  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  *
66  *	@(#)vm_pageout.c        8.5 (Berkeley) 2/14/94
67  * from: Id: uvm_pdaemon.c,v 1.1.2.32 1998/02/06 05:26:30 chs Exp
68  *
69  *
70  * Copyright (c) 1987, 1990 Carnegie-Mellon University.
71  * All rights reserved.
72  *
73  * Permission to use, copy, modify and distribute this software and
74  * its documentation is hereby granted, provided that both the copyright
75  * notice and this permission notice appear in all copies of the
76  * software, derivative works or modified versions, and any portions
77  * thereof, and that both notices appear in supporting documentation.
78  *
79  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
80  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
81  * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
82  *
83  * Carnegie Mellon requests users of this software to return to
84  *
85  *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
86  *  School of Computer Science
87  *  Carnegie Mellon University
88  *  Pittsburgh PA 15213-3890
89  *
90  * any improvements or extensions that they make and grant Carnegie the
91  * rights to redistribute these changes.
92  */
93 
94 #if defined(PDSIM)
95 
96 #include "pdsim.h"
97 
98 #else /* defined(PDSIM) */
99 
100 #include <sys/cdefs.h>
101 __KERNEL_RCSID(0, "$NetBSD: uvm_pdpolicy_clock.c,v 1.40 2022/04/12 20:27:56 andvar Exp $");
102 
103 #include <sys/param.h>
104 #include <sys/proc.h>
105 #include <sys/systm.h>
106 #include <sys/kernel.h>
107 #include <sys/kmem.h>
108 #include <sys/atomic.h>
109 
110 #include <uvm/uvm.h>
111 #include <uvm/uvm_pdpolicy.h>
112 #include <uvm/uvm_pdpolicy_impl.h>
113 #include <uvm/uvm_stat.h>
114 
115 #endif /* defined(PDSIM) */
116 
117 /*
118  * per-CPU queue of pending page status changes.  128 entries makes for a
119  * 1kB queue on _LP64 and has been found to be a reasonable compromise that
120  * keeps lock contention events and wait times low, while not using too much
121  * memory nor allowing global state to fall too far behind.
122  */
123 #if !defined(CLOCK_PDQ_SIZE)
124 #define	CLOCK_PDQ_SIZE	128
125 #endif /* !defined(CLOCK_PDQ_SIZE) */
126 
127 #define PQ_INACTIVE	0x00000010	/* page is in inactive list */
128 #define PQ_ACTIVE	0x00000020	/* page is in active list */
129 
130 #if !defined(CLOCK_INACTIVEPCT)
131 #define	CLOCK_INACTIVEPCT	33
132 #endif /* !defined(CLOCK_INACTIVEPCT) */
133 
134 struct uvmpdpol_globalstate {
135 	kmutex_t lock;			/* lock on state */
136 					/* <= compiler pads here */
137 	struct pglist s_activeq		/* allocated pages, in use */
138 	    __aligned(COHERENCY_UNIT);
139 	struct pglist s_inactiveq;	/* pages between the clock hands */
140 	int s_active;
141 	int s_inactive;
142 	int s_inactarg;
143 	struct uvm_pctparam s_anonmin;
144 	struct uvm_pctparam s_filemin;
145 	struct uvm_pctparam s_execmin;
146 	struct uvm_pctparam s_anonmax;
147 	struct uvm_pctparam s_filemax;
148 	struct uvm_pctparam s_execmax;
149 	struct uvm_pctparam s_inactivepct;
150 };
151 
152 struct uvmpdpol_scanstate {
153 	bool ss_anonreact, ss_filereact, ss_execreact;
154 	struct vm_page ss_marker;
155 };
156 
157 static void	uvmpdpol_pageactivate_locked(struct vm_page *);
158 static void	uvmpdpol_pagedeactivate_locked(struct vm_page *);
159 static void	uvmpdpol_pagedequeue_locked(struct vm_page *);
160 static bool	uvmpdpol_pagerealize_locked(struct vm_page *);
161 static struct uvm_cpu *uvmpdpol_flush(void);
162 
163 static struct uvmpdpol_globalstate pdpol_state __cacheline_aligned;
164 static struct uvmpdpol_scanstate pdpol_scanstate;
165 
166 PDPOL_EVCNT_DEFINE(reactexec)
167 PDPOL_EVCNT_DEFINE(reactfile)
168 PDPOL_EVCNT_DEFINE(reactanon)
169 
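/*
 * clock_tune: recompute the inactive queue target as a percentage
 * (s_inactivepct) of the pages on the active and inactive queues,
 * never letting it drop to the free target or below.  e.g. with the
 * default 33% and 3000 queued pages the target works out to roughly
 * 990 pages (or freetarg + 1, whichever is larger).
 */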
170 static void
171 clock_tune(void)
172 {
173 	struct uvmpdpol_globalstate *s = &pdpol_state;
174 
175 	s->s_inactarg = UVM_PCTPARAM_APPLY(&s->s_inactivepct,
176 	    s->s_active + s->s_inactive);
177 	if (s->s_inactarg <= uvmexp.freetarg) {
178 		s->s_inactarg = uvmexp.freetarg + 1;
179 	}
180 }
181 
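/*
 * uvmpdpol_scaninit: prepare for a pass over the inactive queue.
 * decide, from the per-type min/max usage limits, which page types
 * (anon, file, exec) should be reactivated rather than reclaimed,
 * and insert a marker page at the head of the inactive queue for
 * the scan to work from.
 */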
182 void
183 uvmpdpol_scaninit(void)
184 {
185 	struct uvmpdpol_globalstate *s = &pdpol_state;
186 	struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
187 	int t;
188 	bool anonunder, fileunder, execunder;
189 	bool anonover, fileover, execover;
190 	bool anonreact, filereact, execreact;
191 	int64_t freepg, anonpg, filepg, execpg;
192 
193 	/*
194 	 * decide which types of pages we want to reactivate instead of freeing
195 	 * to keep usage within the minimum and maximum usage limits.
196 	 * uvm_availmem() will sync the counters.
197 	 */
198 
199 	freepg = uvm_availmem(false);
200 	anonpg = cpu_count_get(CPU_COUNT_ANONCLEAN) +
201 	    cpu_count_get(CPU_COUNT_ANONDIRTY) +
202 	    cpu_count_get(CPU_COUNT_ANONUNKNOWN);
203 	execpg = cpu_count_get(CPU_COUNT_EXECPAGES);
204 	filepg = cpu_count_get(CPU_COUNT_FILECLEAN) +
205 	    cpu_count_get(CPU_COUNT_FILEDIRTY) +
206 	    cpu_count_get(CPU_COUNT_FILEUNKNOWN) -
207 	    execpg;
208 
209 	mutex_enter(&s->lock);
210 	t = s->s_active + s->s_inactive + freepg;
211 	anonunder = anonpg <= UVM_PCTPARAM_APPLY(&s->s_anonmin, t);
212 	fileunder = filepg <= UVM_PCTPARAM_APPLY(&s->s_filemin, t);
213 	execunder = execpg <= UVM_PCTPARAM_APPLY(&s->s_execmin, t);
214 	anonover = anonpg > UVM_PCTPARAM_APPLY(&s->s_anonmax, t);
215 	fileover = filepg > UVM_PCTPARAM_APPLY(&s->s_filemax, t);
216 	execover = execpg > UVM_PCTPARAM_APPLY(&s->s_execmax, t);
217 	anonreact = anonunder || (!anonover && (fileover || execover));
218 	filereact = fileunder || (!fileover && (anonover || execover));
219 	execreact = execunder || (!execover && (anonover || fileover));
220 	if (filereact && execreact && (anonreact || uvm_swapisfull())) {
221 		anonreact = filereact = execreact = false;
222 	}
223 	ss->ss_anonreact = anonreact;
224 	ss->ss_filereact = filereact;
225 	ss->ss_execreact = execreact;
226 	memset(&ss->ss_marker, 0, sizeof(ss->ss_marker));
227 	ss->ss_marker.flags = PG_MARKER;
228 	TAILQ_INSERT_HEAD(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue);
229 	mutex_exit(&s->lock);
230 }
231 
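/*
 * uvmpdpol_scanfini: end the scan by removing the marker inserted
 * by uvmpdpol_scaninit().
 */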
232 void
233 uvmpdpol_scanfini(void)
234 {
235 	struct uvmpdpol_globalstate *s = &pdpol_state;
236 	struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
237 
238 	mutex_enter(&s->lock);
239 	TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker, pdqueue);
240 	mutex_exit(&s->lock);
241 }
242 
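/*
 * uvmpdpol_selectvictim: find the next inactive page that may be
 * reclaimed and return it with its owner's lock held in *plock.
 * pages of a type being shielded this pass, and pages found to be
 * referenced, are reactivated and skipped.  returns NULL when the
 * end of the inactive queue is reached.
 */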
243 struct vm_page *
244 uvmpdpol_selectvictim(krwlock_t **plock)
245 {
246 	struct uvmpdpol_globalstate *s = &pdpol_state;
247 	struct uvmpdpol_scanstate *ss = &pdpol_scanstate;
248 	struct vm_page *pg;
249 	krwlock_t *lock;
250 
251 	mutex_enter(&s->lock);
252 	while (/* CONSTCOND */ 1) {
253 		struct vm_anon *anon;
254 		struct uvm_object *uobj;
255 
256 		pg = TAILQ_NEXT(&ss->ss_marker, pdqueue);
257 		if (pg == NULL) {
258 			break;
259 		}
260 		KASSERT((pg->flags & PG_MARKER) == 0);
261 		uvmexp.pdscans++;
262 
263 		/*
264 		 * acquire interlock to stabilize page identity.
265 		 * if we have caught the page in a state of flux
266 		 * deal with it and retry.
267 		 */
268 		mutex_enter(&pg->interlock);
269 		if (uvmpdpol_pagerealize_locked(pg)) {
270 			mutex_exit(&pg->interlock);
271 			continue;
272 		}
273 
274 		/*
275 		 * now prepare to move on to the next page.
276 		 */
277 		TAILQ_REMOVE(&pdpol_state.s_inactiveq, &ss->ss_marker,
278 		    pdqueue);
279 		TAILQ_INSERT_AFTER(&pdpol_state.s_inactiveq, pg,
280 		    &ss->ss_marker, pdqueue);
281 
282 		/*
283 		 * enforce the minimum thresholds on different
284 		 * types of memory usage.  if reusing the current
285 		 * page would reduce that type of usage below its
286 		 * minimum, reactivate the page instead and move
287 		 * on to the next page.
288 		 */
289 		anon = pg->uanon;
290 		uobj = pg->uobject;
291 		if (uobj && UVM_OBJ_IS_VTEXT(uobj) && ss->ss_execreact) {
292 			uvmpdpol_pageactivate_locked(pg);
293 			mutex_exit(&pg->interlock);
294 			PDPOL_EVCNT_INCR(reactexec);
295 			continue;
296 		}
297 		if (uobj && UVM_OBJ_IS_VNODE(uobj) &&
298 		    !UVM_OBJ_IS_VTEXT(uobj) && ss->ss_filereact) {
299 			uvmpdpol_pageactivate_locked(pg);
300 			mutex_exit(&pg->interlock);
301 			PDPOL_EVCNT_INCR(reactfile);
302 			continue;
303 		}
304 		if ((anon || UVM_OBJ_IS_AOBJ(uobj)) && ss->ss_anonreact) {
305 			uvmpdpol_pageactivate_locked(pg);
306 			mutex_exit(&pg->interlock);
307 			PDPOL_EVCNT_INCR(reactanon);
308 			continue;
309 		}
310 
311 		/*
312 		 * try to lock the object that owns the page.
313 		 *
314 		 * with the page interlock held, we can drop s->lock, which
315 		 * could otherwise serve as a barrier to us getting the
316 		 * object locked, because the owner of the object's lock may
317 		 * be blocked on s->lock (i.e. a deadlock).
318 		 *
319 		 * whatever happens, uvmpd_trylockowner() will release the
320 		 * interlock.  with the interlock dropped we can then
321 		 * re-acquire our own lock.  the order is:
322 		 *
323 		 *	object -> pdpol -> interlock.
324 	         */
325 	        mutex_exit(&s->lock);
326         	lock = uvmpd_trylockowner(pg);
327         	/* pg->interlock now released */
328         	mutex_enter(&s->lock);
329 		if (lock == NULL) {
330 			/* didn't get it - try the next page. */
331 			continue;
332 		}
333 
334 		/*
335 		 * move referenced pages back to active queue and skip to
336 		 * next page.
337 		 */
338 		if (pmap_is_referenced(pg)) {
339 			mutex_enter(&pg->interlock);
340 			uvmpdpol_pageactivate_locked(pg);
341 			mutex_exit(&pg->interlock);
342 			uvmexp.pdreact++;
343 			rw_exit(lock);
344 			continue;
345 		}
346 
347 		/* we have a potential victim. */
348 		*plock = lock;
349 		break;
350 	}
351 	mutex_exit(&s->lock);
352 	return pg;
353 }
354 
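/*
 * uvmpdpol_balancequeue: walk the active queue, releasing swap
 * slots from swap-backed pages while swap is short and deactivating
 * pages until the inactive target is met.
 */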
355 void
356 uvmpdpol_balancequeue(int swap_shortage)
357 {
358 	struct uvmpdpol_globalstate *s = &pdpol_state;
359 	int inactive_shortage;
360 	struct vm_page *p, marker;
361 	krwlock_t *lock;
362 
363 	/*
364 	 * we have done the scan to get free pages.   now we work on meeting
365 	 * our inactive target.
366 	 */
367 
368 	memset(&marker, 0, sizeof(marker));
369 	marker.flags = PG_MARKER;
370 
371 	mutex_enter(&s->lock);
372 	TAILQ_INSERT_HEAD(&pdpol_state.s_activeq, &marker, pdqueue);
373 	for (;;) {
374 		inactive_shortage =
375 		    pdpol_state.s_inactarg - pdpol_state.s_inactive;
376 		if (inactive_shortage <= 0 && swap_shortage <= 0) {
377 			break;
378 		}
379 		p = TAILQ_NEXT(&marker, pdqueue);
380 		if (p == NULL) {
381 			break;
382 		}
383 		KASSERT((p->flags & PG_MARKER) == 0);
384 
385 		/*
386 		 * acquire interlock to stabilize page identity.
387 		 * if we have caught the page in a state of flux
388 		 * deal with it and retry.
389 		 */
390 		mutex_enter(&p->interlock);
391 		if (uvmpdpol_pagerealize_locked(p)) {
392 			mutex_exit(&p->interlock);
393 			continue;
394 		}
395 
396 		/*
397 		 * now prepare to move on to the next page.
398 		 */
399 		TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue);
400 		TAILQ_INSERT_AFTER(&pdpol_state.s_activeq, p, &marker,
401 		    pdqueue);
402 
403 		/*
404 		 * try to lock the object that owns the page.  see comments
405 		 * in uvmpdpol_selectvictim().
406 	         */
407 	        mutex_exit(&s->lock);
408         	lock = uvmpd_trylockowner(p);
409         	/* p->interlock now released */
410         	mutex_enter(&s->lock);
411 		if (lock == NULL) {
412 			/* didn't get it - try the next page. */
413 			continue;
414 		}
415 
416 		/*
417 		 * if there's a shortage of swap slots, try to free it.
418 		 */
419 		if (swap_shortage > 0 && (p->flags & PG_SWAPBACKED) != 0 &&
420 		    (p->flags & PG_BUSY) == 0) {
421 			if (uvmpd_dropswap(p)) {
422 				swap_shortage--;
423 			}
424 		}
425 
426 		/*
427 		 * if there's a shortage of inactive pages, deactivate.
428 		 */
429 		if (inactive_shortage > 0) {
430 			pmap_clear_reference(p);
431 			mutex_enter(&p->interlock);
432 			uvmpdpol_pagedeactivate_locked(p);
433 			mutex_exit(&p->interlock);
434 			uvmexp.pddeact++;
435 			inactive_shortage--;
436 		}
437 		rw_exit(lock);
438 	}
439 	TAILQ_REMOVE(&pdpol_state.s_activeq, &marker, pdqueue);
440 	mutex_exit(&s->lock);
441 }
442 
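/*
 * uvmpdpol_pagedeactivate_locked: move a page to the tail of the
 * inactive queue.  called with the policy lock and the page
 * interlock held.
 */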
443 static void
444 uvmpdpol_pagedeactivate_locked(struct vm_page *pg)
445 {
446 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
447 
448 	KASSERT(mutex_owned(&s->lock));
449 	KASSERT(mutex_owned(&pg->interlock));
450 	KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
451 	    (PQ_INTENT_D | PQ_INTENT_SET));
452 
453 	if (pg->pqflags & PQ_ACTIVE) {
454 		TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
455 		KASSERT(pdpol_state.s_active > 0);
456 		pdpol_state.s_active--;
457 	}
458 	if ((pg->pqflags & PQ_INACTIVE) == 0) {
459 		KASSERT(pg->wire_count == 0);
460 		TAILQ_INSERT_TAIL(&pdpol_state.s_inactiveq, pg, pdqueue);
461 		pdpol_state.s_inactive++;
462 	}
463 	pg->pqflags &= ~(PQ_ACTIVE | PQ_INTENT_SET);
464 	pg->pqflags |= PQ_INACTIVE;
465 }
466 
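/*
 * uvmpdpol_pagedeactivate: record a deactivation intent on the page.
 * the intent is applied to the global queues later, by
 * uvmpdpol_pagerealize().
 */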
467 void
468 uvmpdpol_pagedeactivate(struct vm_page *pg)
469 {
470 
471 	KASSERT(uvm_page_owner_locked_p(pg, false));
472 	KASSERT(mutex_owned(&pg->interlock));
473 
474 	/*
475 	 * we have to clear the reference bit now, as when it comes time to
476 	 * realize the intent we won't have the object locked any more.
477 	 */
478 	pmap_clear_reference(pg);
479 	uvmpdpol_set_intent(pg, PQ_INTENT_I);
480 }
481 
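/*
 * uvmpdpol_pageactivate_locked: move a page to the tail of the
 * active queue, removing it from whichever queue it was on.
 */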
482 static void
483 uvmpdpol_pageactivate_locked(struct vm_page *pg)
484 {
485 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
486 
487 	KASSERT(mutex_owned(&s->lock));
488 	KASSERT(mutex_owned(&pg->interlock));
489 	KASSERT((pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) !=
490 	    (PQ_INTENT_D | PQ_INTENT_SET));
491 
492 	uvmpdpol_pagedequeue_locked(pg);
493 	TAILQ_INSERT_TAIL(&pdpol_state.s_activeq, pg, pdqueue);
494 	pdpol_state.s_active++;
495 	pg->pqflags &= ~(PQ_INACTIVE | PQ_INTENT_SET);
496 	pg->pqflags |= PQ_ACTIVE;
497 }
498 
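/*
 * uvmpdpol_pageactivate: record an activation intent on the page,
 * to be realized later.
 */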
499 void
500 uvmpdpol_pageactivate(struct vm_page *pg)
501 {
502 
503 	KASSERT(uvm_page_owner_locked_p(pg, false));
504 	KASSERT(mutex_owned(&pg->interlock));
505 
506 	uvmpdpol_set_intent(pg, PQ_INTENT_A);
507 }
508 
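/*
 * uvmpdpol_pagedequeue_locked: remove a page from whichever paging
 * queue it is on and clear its queue state.
 */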
509 static void
510 uvmpdpol_pagedequeue_locked(struct vm_page *pg)
511 {
512 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
513 
514 	KASSERT(mutex_owned(&s->lock));
515 	KASSERT(mutex_owned(&pg->interlock));
516 
517 	if (pg->pqflags & PQ_ACTIVE) {
518 		TAILQ_REMOVE(&pdpol_state.s_activeq, pg, pdqueue);
519 		KASSERT((pg->pqflags & PQ_INACTIVE) == 0);
520 		KASSERT(pdpol_state.s_active > 0);
521 		pdpol_state.s_active--;
522 	} else if (pg->pqflags & PQ_INACTIVE) {
523 		TAILQ_REMOVE(&pdpol_state.s_inactiveq, pg, pdqueue);
524 		KASSERT(pdpol_state.s_inactive > 0);
525 		pdpol_state.s_inactive--;
526 	}
527 	pg->pqflags &= ~(PQ_ACTIVE | PQ_INACTIVE | PQ_INTENT_SET);
528 }
529 
530 void
531 uvmpdpol_pagedequeue(struct vm_page *pg)
532 {
533 
534 	KASSERT(uvm_page_owner_locked_p(pg, true));
535 	KASSERT(mutex_owned(&pg->interlock));
536 
537 	uvmpdpol_set_intent(pg, PQ_INTENT_D);
538 }
539 
540 void
541 uvmpdpol_pageenqueue(struct vm_page *pg)
542 {
543 
544 	KASSERT(uvm_page_owner_locked_p(pg, false));
545 	KASSERT(mutex_owned(&pg->interlock));
546 
547 	uvmpdpol_set_intent(pg, PQ_INTENT_E);
548 }
549 
550 void
551 uvmpdpol_anfree(struct vm_anon *an)
552 {
553 }
554 
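/*
 * uvmpdpol_pageisqueued_p: return true if the page is on (or is
 * about to be put on) a paging queue, considering any pending
 * intent before the recorded state.
 */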
555 bool
556 uvmpdpol_pageisqueued_p(struct vm_page *pg)
557 {
558 	uint32_t pqflags;
559 
560 	/*
561 	 * if there's an intent set, we have to consider it.  otherwise,
562 	 * return the actual state.  we may be called unlocked for the
563 	 * purpose of assertions, which is safe due to the page lifecycle.
564 	 */
565 	pqflags = atomic_load_relaxed(&pg->pqflags);
566 	if ((pqflags & PQ_INTENT_SET) != 0) {
567 		return (pqflags & PQ_INTENT_MASK) != PQ_INTENT_D;
568 	} else {
569 		return (pqflags & (PQ_ACTIVE | PQ_INACTIVE)) != 0;
570 	}
571 }
572 
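/*
 * uvmpdpol_pageactivate_p: return true if setting an activation
 * intent on this page would be worthwhile; a pending activate or
 * enqueue intent means it would not be.
 */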
573 bool
574 uvmpdpol_pageactivate_p(struct vm_page *pg)
575 {
576 	uint32_t pqflags;
577 
578 	/* consider intent in preference to actual state. */
579 	pqflags = atomic_load_relaxed(&pg->pqflags);
580 	if ((pqflags & PQ_INTENT_SET) != 0) {
581 		pqflags &= PQ_INTENT_MASK;
582 		return pqflags != PQ_INTENT_A && pqflags != PQ_INTENT_E;
583 	} else {
584 		/*
585 		 * TODO: Enabling this may be too much of a big hammer,
586 		 * since we do get useful information from activations.
587 		 * Think about it more and maybe come up with a heuristic
588 		 * or something.
589 		 *
590 		 * return (pqflags & PQ_ACTIVE) == 0;
591 		 */
592 		return true;
593 	}
594 }
595 
596 void
597 uvmpdpol_estimatepageable(int *active, int *inactive)
598 {
599 	struct uvmpdpol_globalstate *s = &pdpol_state;
600 
601 	/*
602 	 * Don't take any locks here.  This can be called from DDB, and in
603 	 * any case the numbers are stale the instant the lock is dropped,
604 	 * so it just doesn't matter.
605 	 */
606 	if (active) {
607 		*active = s->s_active;
608 	}
609 	if (inactive) {
610 		*inactive = s->s_inactive;
611 	}
612 }
613 
614 #if !defined(PDSIM)
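/*
 * min_check: sysctl bounds check for the "min" percentages.  reject
 * an update that would push anonmin + filemin + execmin above 95%.
 */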
615 static int
616 min_check(struct uvm_pctparam *pct, int t)
617 {
618 	struct uvmpdpol_globalstate *s = &pdpol_state;
619 	int total = t;
620 
621 	if (pct != &s->s_anonmin) {
622 		total += uvm_pctparam_get(&s->s_anonmin);
623 	}
624 	if (pct != &s->s_filemin) {
625 		total += uvm_pctparam_get(&s->s_filemin);
626 	}
627 	if (pct != &s->s_execmin) {
628 		total += uvm_pctparam_get(&s->s_execmin);
629 	}
630 	if (total > 95) {
631 		return EINVAL;
632 	}
633 	return 0;
634 }
635 #endif /* !defined(PDSIM) */
636 
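/*
 * uvmpdpol_init: set up the policy lock, the active and inactive
 * queues, and the default tunable percentages.
 */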
637 void
638 uvmpdpol_init(void)
639 {
640 	struct uvmpdpol_globalstate *s = &pdpol_state;
641 
642 	mutex_init(&s->lock, MUTEX_DEFAULT, IPL_NONE);
643 	TAILQ_INIT(&s->s_activeq);
644 	TAILQ_INIT(&s->s_inactiveq);
645 	uvm_pctparam_init(&s->s_inactivepct, CLOCK_INACTIVEPCT, NULL);
646 	uvm_pctparam_init(&s->s_anonmin, 10, min_check);
647 	uvm_pctparam_init(&s->s_filemin, 10, min_check);
648 	uvm_pctparam_init(&s->s_execmin,  5, min_check);
649 	uvm_pctparam_init(&s->s_anonmax, 80, NULL);
650 	uvm_pctparam_init(&s->s_filemax, 50, NULL);
651 	uvm_pctparam_init(&s->s_execmax, 30, NULL);
652 }
653 
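/*
 * uvmpdpol_init_cpu: allocate this CPU's pending-update queue.  the
 * queue is filled from the tail end downward (pdqhead is decremented
 * on each push), so pdqhead == pdqtail means empty.
 */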
654 void
655 uvmpdpol_init_cpu(struct uvm_cpu *ucpu)
656 {
657 
658 	ucpu->pdq =
659 	    kmem_alloc(CLOCK_PDQ_SIZE * sizeof(struct vm_page *), KM_SLEEP);
660 	ucpu->pdqhead = CLOCK_PDQ_SIZE;
661 	ucpu->pdqtail = CLOCK_PDQ_SIZE;
662 }
663 
664 void
665 uvmpdpol_reinit(void)
666 {
667 }
668 
669 bool
670 uvmpdpol_needsscan_p(void)
671 {
672 
673 	/*
674 	 * this must be an unlocked check: can be called from interrupt.
675 	 */
676 	return pdpol_state.s_inactive < pdpol_state.s_inactarg;
677 }
678 
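/*
 * uvmpdpol_tune: recompute the inactive queue target under the
 * policy lock.
 */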
679 void
680 uvmpdpol_tune(void)
681 {
682 	struct uvmpdpol_globalstate *s = &pdpol_state;
683 
684 	mutex_enter(&s->lock);
685 	clock_tune();
686 	mutex_exit(&s->lock);
687 }
688 
689 /*
690  * uvmpdpol_pagerealize_locked: take the intended state set on a page and
691  * make it real.  return true if any work was done.
692  */
693 static bool
694 uvmpdpol_pagerealize_locked(struct vm_page *pg)
695 {
696 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
697 
698 	KASSERT(mutex_owned(&s->lock));
699 	KASSERT(mutex_owned(&pg->interlock));
700 
701 	switch (pg->pqflags & (PQ_INTENT_MASK | PQ_INTENT_SET)) {
702 	case PQ_INTENT_A | PQ_INTENT_SET:
703 	case PQ_INTENT_E | PQ_INTENT_SET:
704 		uvmpdpol_pageactivate_locked(pg);
705 		return true;
706 	case PQ_INTENT_I | PQ_INTENT_SET:
707 		uvmpdpol_pagedeactivate_locked(pg);
708 		return true;
709 	case PQ_INTENT_D | PQ_INTENT_SET:
710 		uvmpdpol_pagedequeue_locked(pg);
711 		return true;
712 	default:
713 		return false;
714 	}
715 }
716 
717 /*
718  * uvmpdpol_flush: return the current uvm_cpu with all of its pending
719  * updates flushed to the global queues.  this routine may block, and
720  * so can switch cpu.  the idea is to empty the queue on whatever cpu
721  * we finally end up on.
722  */
723 static struct uvm_cpu *
724 uvmpdpol_flush(void)
725 {
726 	struct uvmpdpol_globalstate *s __diagused = &pdpol_state;
727 	struct uvm_cpu *ucpu;
728 	struct vm_page *pg;
729 
730 	KASSERT(kpreempt_disabled());
731 
732 	mutex_enter(&s->lock);
733 	for (;;) {
734 		/*
735 		 * prefer scanning forwards (even though mutex_enter() is
736 		 * serializing) so as to not defeat any prefetch logic in
737 		 * the CPU.  that means elsewhere enqueuing backwards, like
738 		 * a stack, but not so important there as pages are being
739 		 * added singularly.
740 		 * added singly.
741 		 * prefetch the next "struct vm_page" while working on the
742 		 * current one.  this has a measurable and very positive
743 		 * effect in reducing the amount of time spent here under
744 		 * the global lock.
745 		 */
746 		ucpu = curcpu()->ci_data.cpu_uvm;
747 		KASSERT(ucpu->pdqhead <= ucpu->pdqtail);
748 		if (__predict_false(ucpu->pdqhead == ucpu->pdqtail)) {
749 			break;
750 		}
751 		pg = ucpu->pdq[ucpu->pdqhead++];
752 		if (__predict_true(ucpu->pdqhead != ucpu->pdqtail)) {
753 			__builtin_prefetch(ucpu->pdq[ucpu->pdqhead]);
754 		}
755 		mutex_enter(&pg->interlock);
756 		pg->pqflags &= ~PQ_INTENT_QUEUED;
757 		(void)uvmpdpol_pagerealize_locked(pg);
758 		mutex_exit(&pg->interlock);
759 	}
760 	mutex_exit(&s->lock);
761 	return ucpu;
762 }
763 
764 /*
765  * uvmpdpol_pagerealize: realize any intent set on the page.  in this
766  * implementation, that means putting the page on a per-CPU queue to be
767  * dealt with later.
768  */
769 void
770 uvmpdpol_pagerealize(struct vm_page *pg)
771 {
772 	struct uvm_cpu *ucpu;
773 
774 	/*
775 	 * drain the per-CPU queue if full, then enter the page.
776 	 */
777 	kpreempt_disable();
778 	ucpu = curcpu()->ci_data.cpu_uvm;
779 	if (__predict_false(ucpu->pdqhead == 0)) {
780 		ucpu = uvmpdpol_flush();
781 	}
782 	ucpu->pdq[--(ucpu->pdqhead)] = pg;
783 	kpreempt_enable();
784 }
785 
786 /*
787  * uvmpdpol_idle: called from the system idle loop.  periodically purge any
788  * pending updates back to the global queues.
789  */
790 void
791 uvmpdpol_idle(struct uvm_cpu *ucpu)
792 {
793 	struct uvmpdpol_globalstate *s = &pdpol_state;
794 	struct vm_page *pg;
795 
796 	KASSERT(kpreempt_disabled());
797 
798 	/*
799 	 * if no pages in the queue, we have nothing to do.
800 	 */
801 	if (ucpu->pdqhead == ucpu->pdqtail) {
802 		ucpu->pdqtime = getticks();
803 		return;
804 	}
805 
806 	/*
807 	 * don't do this more than ~8 times a second as it would needlessly
808 	 * exert pressure.
809 	 */
810 	if (getticks() - ucpu->pdqtime < (hz >> 3)) {
811 		return;
812 	}
813 
814 	/*
815 	 * the idle LWP can't block, so we have to try for the lock.  if we
816 	 * get it, purge the per-CPU pending update queue.  continually
817 	 * check for a pending resched: in that case exit immediately.
818 	 */
819 	if (mutex_tryenter(&s->lock)) {
820 		while (ucpu->pdqhead != ucpu->pdqtail) {
821 			pg = ucpu->pdq[ucpu->pdqhead];
822 			if (!mutex_tryenter(&pg->interlock)) {
823 				break;
824 			}
825 			ucpu->pdqhead++;
826 			pg->pqflags &= ~PQ_INTENT_QUEUED;
827 			(void)uvmpdpol_pagerealize_locked(pg);
828 			mutex_exit(&pg->interlock);
829 			if (curcpu()->ci_want_resched) {
830 				break;
831 			}
832 		}
833 		if (ucpu->pdqhead == ucpu->pdqtail) {
834 			ucpu->pdqtime = getticks();
835 		}
836 		mutex_exit(&s->lock);
837 	}
838 }
839 
840 #if !defined(PDSIM)
841 
842 #include <sys/sysctl.h>	/* XXX SYSCTL_DESCR */
843 
844 void
845 uvmpdpol_sysctlsetup(void)
846 {
847 	struct uvmpdpol_globalstate *s = &pdpol_state;
848 
849 	uvm_pctparam_createsysctlnode(&s->s_anonmin, "anonmin",
850 	    SYSCTL_DESCR("Percentage of physical memory reserved "
851 	    "for anonymous application data"));
852 	uvm_pctparam_createsysctlnode(&s->s_filemin, "filemin",
853 	    SYSCTL_DESCR("Percentage of physical memory reserved "
854 	    "for cached file data"));
855 	uvm_pctparam_createsysctlnode(&s->s_execmin, "execmin",
856 	    SYSCTL_DESCR("Percentage of physical memory reserved "
857 	    "for cached executable data"));
858 
859 	uvm_pctparam_createsysctlnode(&s->s_anonmax, "anonmax",
860 	    SYSCTL_DESCR("Percentage of physical memory which will "
861 	    "be reclaimed from other usage for "
862 	    "anonymous application data"));
863 	uvm_pctparam_createsysctlnode(&s->s_filemax, "filemax",
864 	    SYSCTL_DESCR("Percentage of physical memory which will "
865 	    "be reclaimed from other usage for cached "
866 	    "file data"));
867 	uvm_pctparam_createsysctlnode(&s->s_execmax, "execmax",
868 	    SYSCTL_DESCR("Percentage of physical memory which will "
869 	    "be reclaimed from other usage for cached "
870 	    "executable data"));
871 
872 	uvm_pctparam_createsysctlnode(&s->s_inactivepct, "inactivepct",
873 	    SYSCTL_DESCR("Percentage of inactive queue of "
874 	    "the entire (active + inactive) queue"));
875 }
876 
877 #endif /* !defined(PDSIM) */
878 
879 #if defined(PDSIM)
880 void
881 pdsim_dump(const char *id)
882 {
883 #if defined(DEBUG)
884 	/* XXX */
885 #endif /* defined(DEBUG) */
886 }
887 #endif /* defined(PDSIM) */
888