xref: /openbsd-src/sys/kern/sched_bsd.c (revision cf08c0f1c1a5f2177af9dd85c2dfd49647e9816e)
1*cf08c0f1Sclaudio /*	$OpenBSD: sched_bsd.c,v 1.98 2024/11/24 13:02:37 claudio Exp $	*/
24bde56adStedu /*	$NetBSD: kern_synch.c,v 1.37 1996/04/22 01:38:37 christos Exp $	*/
34bde56adStedu 
44bde56adStedu /*-
54bde56adStedu  * Copyright (c) 1982, 1986, 1990, 1991, 1993
64bde56adStedu  *	The Regents of the University of California.  All rights reserved.
74bde56adStedu  * (c) UNIX System Laboratories, Inc.
84bde56adStedu  * All or some portions of this file are derived from material licensed
94bde56adStedu  * to the University of California by American Telephone and Telegraph
104bde56adStedu  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
114bde56adStedu  * the permission of UNIX System Laboratories, Inc.
124bde56adStedu  *
134bde56adStedu  * Redistribution and use in source and binary forms, with or without
144bde56adStedu  * modification, are permitted provided that the following conditions
154bde56adStedu  * are met:
164bde56adStedu  * 1. Redistributions of source code must retain the above copyright
174bde56adStedu  *    notice, this list of conditions and the following disclaimer.
184bde56adStedu  * 2. Redistributions in binary form must reproduce the above copyright
194bde56adStedu  *    notice, this list of conditions and the following disclaimer in the
204bde56adStedu  *    documentation and/or other materials provided with the distribution.
214bde56adStedu  * 3. Neither the name of the University nor the names of its contributors
224bde56adStedu  *    may be used to endorse or promote products derived from this software
234bde56adStedu  *    without specific prior written permission.
244bde56adStedu  *
254bde56adStedu  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
264bde56adStedu  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
274bde56adStedu  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
284bde56adStedu  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
294bde56adStedu  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
304bde56adStedu  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
314bde56adStedu  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
324bde56adStedu  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
334bde56adStedu  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
344bde56adStedu  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
354bde56adStedu  * SUCH DAMAGE.
364bde56adStedu  *
374bde56adStedu  *	@(#)kern_synch.c	8.6 (Berkeley) 1/21/94
384bde56adStedu  */
394bde56adStedu 
404bde56adStedu #include <sys/param.h>
414bde56adStedu #include <sys/systm.h>
42671537bfScheloha #include <sys/clockintr.h>
434bde56adStedu #include <sys/proc.h>
444bde56adStedu #include <sys/kernel.h>
45c7a32f40Stedu #include <sys/malloc.h>
464bde56adStedu #include <sys/resourcevar.h>
474bde56adStedu #include <uvm/uvm_extern.h>
484bde56adStedu #include <sys/sched.h>
494bde56adStedu #include <sys/timeout.h>
50f2396460Svisa #include <sys/smr.h>
5191b2ecf6Smpi #include <sys/tracepoint.h>
524bde56adStedu 
534bde56adStedu #ifdef KTRACE
544bde56adStedu #include <sys/ktrace.h>
554bde56adStedu #endif
564bde56adStedu 
uint64_t roundrobin_period;	/* [I] roundrobin period (ns) */
int	lbolt;			/* once a second sleep address */

/* Global scheduler lock; taken/released via SCHED_LOCK()/SCHED_UNLOCK(). */
struct mutex sched_lock;

void			update_loadavg(void *);
void			schedcpu(void *);
uint32_t		decay_aftersleep(uint32_t, uint32_t);

/* Set of currently idle CPUs, maintained by the scheduler proper. */
extern struct cpuset sched_idle_cpus;

/*
 * constants for averages over 1, 5, and 15 minutes when sampling at
 * 5 second intervals.
 */
static const fixpt_t cexp[3] = {
	0.9200444146293232 * FSCALE,	/* exp(-1/12) */
	0.9834714538216174 * FSCALE,	/* exp(-1/60) */
	0.9944598480048967 * FSCALE,	/* exp(-1/180) */
};

/* 1/5/15 minute load averages, updated by update_loadavg(). */
struct loadavg averunnable;
7902d561d0Sclaudio 
/*
 * Force switch among equal priority processes every 100ms.
 *
 * Clock interrupt callback (see clockintr(9)-style clockrequest API).
 * `cr' is the posting clock request; `cf' and `arg' are unused here.
 */
void
roundrobin(struct clockrequest *cr, void *cf, void *arg)
{
	uint64_t count;
	struct cpu_info *ci = curcpu();
	struct schedstate_percpu *spc = &ci->ci_schedstate;

	/* Re-arm the request; count is the number of elapsed periods. */
	count = clockrequest_advance(cr, roundrobin_period);

	if (ci->ci_curproc != NULL) {
		if (spc->spc_schedflags & SPCF_SEENRR || count >= 2) {
			/*
			 * The process has already been through a roundrobin
			 * without switching and may be hogging the CPU.
			 * Indicate that the process should yield.
			 */
			atomic_setbits_int(&spc->spc_schedflags,
			    SPCF_SEENRR | SPCF_SHOULDYIELD);
		} else {
			atomic_setbits_int(&spc->spc_schedflags,
			    SPCF_SEENRR);
		}
	}

	/* Kick the CPU if work is queued or the current thread must yield. */
	if (spc->spc_nrun || spc->spc_schedflags & SPCF_SHOULDYIELD)
		need_resched(ci);
}
1104bde56adStedu 
11102d561d0Sclaudio 
11202d561d0Sclaudio 
/*
 * update_loadavg: compute a tenex style load average of a quantity on
 * 1, 5, and 15 minute intervals.
 *
 * Self-rearming timeout; fires every 5 seconds (matching the cexp[]
 * sampling constants).
 */
void
update_loadavg(void *unused)
{
	static struct timeout to = TIMEOUT_INITIALIZER(update_loadavg, NULL);
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	u_int i, nrun = 0;

	/*
	 * Count one for every CPU that is not idle, plus every thread
	 * sitting on a runqueue.
	 */
	CPU_INFO_FOREACH(cii, ci) {
		if (!cpuset_isset(&sched_idle_cpus, ci))
			nrun++;
		nrun += ci->ci_schedstate.spc_nrun;
	}

	/* Exponentially weighted moving average in FSCALE fixed point. */
	for (i = 0; i < 3; i++) {
		averunnable.ldavg[i] = (cexp[i] * averunnable.ldavg[i] +
		    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
	}

	timeout_add_sec(&to, 5);
}
13802d561d0Sclaudio 
1394bde56adStedu /*
1404bde56adStedu  * Constants for digital decay and forget:
1414bde56adStedu  *	90% of (p_estcpu) usage in 5 * loadav time
1424bde56adStedu  *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
1434bde56adStedu  *          Note that, as ps(1) mentions, this can let percentages
1444bde56adStedu  *          total over 100% (I've seen 137.9% for 3 processes).
1454bde56adStedu  *
1464bde56adStedu  * Note that hardclock updates p_estcpu and p_cpticks independently.
1474bde56adStedu  *
1484bde56adStedu  * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
1494bde56adStedu  * That is, the system wants to compute a value of decay such
1504bde56adStedu  * that the following for loop:
1514bde56adStedu  * 	for (i = 0; i < (5 * loadavg); i++)
1524bde56adStedu  * 		p_estcpu *= decay;
1534bde56adStedu  * will compute
1544bde56adStedu  * 	p_estcpu *= 0.1;
1554bde56adStedu  * for all values of loadavg:
1564bde56adStedu  *
1574bde56adStedu  * Mathematically this loop can be expressed by saying:
1584bde56adStedu  * 	decay ** (5 * loadavg) ~= .1
1594bde56adStedu  *
1604bde56adStedu  * The system computes decay as:
1614bde56adStedu  * 	decay = (2 * loadavg) / (2 * loadavg + 1)
1624bde56adStedu  *
1634bde56adStedu  * We wish to prove that the system's computation of decay
1644bde56adStedu  * will always fulfill the equation:
1654bde56adStedu  * 	decay ** (5 * loadavg) ~= .1
1664bde56adStedu  *
1674bde56adStedu  * If we compute b as:
1684bde56adStedu  * 	b = 2 * loadavg
1694bde56adStedu  * then
1704bde56adStedu  * 	decay = b / (b + 1)
1714bde56adStedu  *
1724bde56adStedu  * We now need to prove two things:
1734bde56adStedu  *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
1744bde56adStedu  *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
1754bde56adStedu  *
1764bde56adStedu  * Facts:
1774bde56adStedu  *         For x close to zero, exp(x) =~ 1 + x, since
1784bde56adStedu  *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
1794bde56adStedu  *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
1804bde56adStedu  *         For x close to zero, ln(1+x) =~ x, since
1814bde56adStedu  *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
1824bde56adStedu  *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
1834bde56adStedu  *         ln(.1) =~ -2.30
1844bde56adStedu  *
1854bde56adStedu  * Proof of (1):
1864bde56adStedu  *    Solve (factor)**(power) =~ .1 given power (5*loadav):
1874bde56adStedu  *	solving for factor,
1884bde56adStedu  *      ln(factor) =~ (-2.30/5*loadav), or
1894bde56adStedu  *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
1904bde56adStedu  *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
1914bde56adStedu  *
1924bde56adStedu  * Proof of (2):
1934bde56adStedu  *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
1944bde56adStedu  *	solving for power,
1954bde56adStedu  *      power*ln(b/(b+1)) =~ -2.30, or
1964bde56adStedu  *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
1974bde56adStedu  *
1984bde56adStedu  * Actual power values for the implemented algorithm are as follows:
1994bde56adStedu  *      loadav: 1       2       3       4
2004bde56adStedu  *      power:  5.68    10.32   14.94   19.55
2014bde56adStedu  */
2024bde56adStedu 
/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11
2234bde56adStedu 
/*
 * Recompute process priorities, every second.
 *
 * Self-rearming timeout.  Walks allproc, decays each thread's
 * p_estcpu/p_pctcpu, and requeues threads whose recomputed priority
 * moved them to a different runqueue.  Also wakes lbolt sleepers.
 */
void
schedcpu(void *unused)
{
	static struct timeout to = TIMEOUT_INITIALIZER(schedcpu, NULL);
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct proc *p;
	unsigned int newcpu;

	LIST_FOREACH(p, &allproc, p_list) {
		/*
		 * Idle threads are never placed on the runqueue,
		 * therefore computing their priority is pointless.
		 */
		if (p->p_cpu != NULL &&
		    p->p_cpu->ci_schedstate.spc_idleproc == p)
			continue;
		/*
		 * Increment sleep time (if sleeping). We ignore overflow.
		 */
		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
			p->p_slptime++;
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (p->p_slptime > 1)
			continue;
		SCHED_LOCK();
		/*
		 * p_pctcpu is only for diagnostic tools such as ps.
		 */
#if	(FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (stathz == 100)?
			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
                	100 * (((fixpt_t) p->p_cpticks)
				<< (FSHIFT - CCPU_SHIFT)) / stathz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
			(p->p_cpticks * FSCALE / stathz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu);
		setpriority(p, newcpu, p->p_p->ps_nice);

		/* Requeue only if the priority crossed a runqueue boundary. */
		if (p->p_stat == SRUN &&
		    (p->p_runpri / SCHED_PPQ) != (p->p_usrpri / SCHED_PPQ)) {
			remrunqueue(p);
			setrunqueue(p->p_cpu, p, p->p_usrpri);
		}
		SCHED_UNLOCK();
	}
	wakeup(&lbolt);
	timeout_add_sec(&to, 1);
}
2824bde56adStedu 
2834bde56adStedu /*
2844bde56adStedu  * Recalculate the priority of a process after it has slept for a while.
2854bde56adStedu  * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
2864bde56adStedu  * least six times the loadfactor will decay p_estcpu to zero.
2874bde56adStedu  */
28855eab86cSmpi uint32_t
28955eab86cSmpi decay_aftersleep(uint32_t estcpu, uint32_t slptime)
2904bde56adStedu {
291b813f2f7Stedu 	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
29255eab86cSmpi 	uint32_t newcpu;
2934bde56adStedu 
29455eab86cSmpi 	if (slptime > 5 * loadfac)
29555eab86cSmpi 		newcpu = 0;
2964bde56adStedu 	else {
29755eab86cSmpi 		newcpu = estcpu;
29855eab86cSmpi 		slptime--;	/* the first time was done in schedcpu */
29955eab86cSmpi 		while (newcpu && --slptime)
30055eab86cSmpi 			newcpu = decay_cpu(loadfac, newcpu);
30155eab86cSmpi 
3024bde56adStedu 	}
30355eab86cSmpi 
30455eab86cSmpi 	return (newcpu);
3054bde56adStedu }
3064bde56adStedu 
3074bde56adStedu /*
3084bde56adStedu  * General yield call.  Puts the current process back on its run queue and
3094bde56adStedu  * performs a voluntary context switch.
3104bde56adStedu  */
void
yield(void)
{
	struct proc *p = curproc;

	SCHED_LOCK();
	/* Put ourselves back on our runqueue at our user priority. */
	setrunqueue(p->p_cpu, p, p->p_usrpri);
	p->p_ru.ru_nvcsw++;	/* account a voluntary context switch */
	mi_switch();
	SCHED_UNLOCK();
}
3224bde56adStedu 
3234bde56adStedu /*
3244bde56adStedu  * General preemption call.  Puts the current process back on its run queue
3254bde56adStedu  * and performs an involuntary context switch.  If a process is supplied,
3264bde56adStedu  * we switch to that process.  Otherwise, we use the normal process selection
3274bde56adStedu  * criteria.
3284bde56adStedu  */
void
preempt(void)
{
	struct proc *p = curproc;

	SCHED_LOCK();
	/* Requeue the current thread before switching away. */
	setrunqueue(p->p_cpu, p, p->p_usrpri);
	p->p_ru.ru_nivcsw++;	/* account an involuntary context switch */
	mi_switch();
	SCHED_UNLOCK();
}
3404bde56adStedu 
/*
 * Machine-independent context switch.  Called with the scheduler lock
 * held and the current thread already moved off SONPROC; picks the next
 * thread and switches to it (or stays put if we were chosen again).
 * Returns with the scheduler lock re-held.
 */
void
mi_switch(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *p = curproc;
	struct proc *nextproc;
	int oldipl;
#ifdef MULTIPROCESSOR
	int hold_count;
#endif

	/* Caller must have already changed our state (e.g. to SRUN/SSLEEP). */
	KASSERT(p->p_stat != SONPROC);

	SCHED_ASSERT_LOCKED();

#ifdef MULTIPROCESSOR
	/*
	 * Release the kernel_lock, as we are about to yield the CPU.
	 */
	if (_kernel_lock_held())
		hold_count = __mp_release_all(&kernel_lock);
	else
		hold_count = 0;
#endif

	/* Update thread runtime */
	tuagg_add_runtime();

	/* Stop any optional clock interrupts. */
	if (ISSET(spc->spc_schedflags, SPCF_ITIMER)) {
		atomic_clearbits_int(&spc->spc_schedflags, SPCF_ITIMER);
		clockintr_cancel(&spc->spc_itimer);
	}
	if (ISSET(spc->spc_schedflags, SPCF_PROFCLOCK)) {
		atomic_clearbits_int(&spc->spc_schedflags, SPCF_PROFCLOCK);
		clockintr_cancel(&spc->spc_profclock);
	}

	/*
	 * Process is about to yield the CPU; clear the appropriate
	 * scheduling flags.
	 */
	atomic_clearbits_int(&spc->spc_schedflags, SPCF_SWITCHCLEAR);

	nextproc = sched_chooseproc();

	/* preserve old IPL level so we can switch back to that */
	oldipl = MUTEX_OLDIPL(&sched_lock);

	if (p != nextproc) {
		uvmexp.swtch++;
		TRACEPOINT(sched, off__cpu, nextproc->p_tid + THREAD_PID_OFFSET,
		    nextproc->p_p->ps_pid);
		cpu_switchto(p, nextproc);
		TRACEPOINT(sched, on__cpu, NULL);
	} else {
		/* We were the best candidate; keep running. */
		TRACEPOINT(sched, remain__cpu, NULL);
		p->p_stat = SONPROC;
	}

	clear_resched(curcpu());

	SCHED_ASSERT_LOCKED();

	/* Restore proc's IPL. */
	MUTEX_OLDIPL(&sched_lock) = oldipl;
	SCHED_UNLOCK();

	SCHED_ASSERT_UNLOCKED();

	/* Safe point to run deferred SMR callbacks. */
	assertwaitok();
	smr_idle();

	/*
	 * We're running again; record our new start time.  We might
	 * be running on a new CPU now, so refetch the schedstate_percpu
	 * pointer.
	 */
	KASSERT(p->p_cpu == curcpu());
	spc = &p->p_cpu->ci_schedstate;

	/* Start any optional clock interrupts needed by the thread. */
	if (ISSET(p->p_p->ps_flags, PS_ITIMER)) {
		atomic_setbits_int(&spc->spc_schedflags, SPCF_ITIMER);
		clockintr_advance(&spc->spc_itimer, hardclock_period);
	}
	if (ISSET(p->p_p->ps_flags, PS_PROFIL)) {
		atomic_setbits_int(&spc->spc_schedflags, SPCF_PROFCLOCK);
		clockintr_advance(&spc->spc_profclock, profclock_period);
	}

	nanouptime(&spc->spc_runtime);

#ifdef MULTIPROCESSOR
	/*
	 * Reacquire the kernel_lock now.  We do this after we've
	 * released the scheduler lock to avoid deadlock, and before
	 * we reacquire the interlock and the scheduler lock.
	 */
	if (hold_count)
		__mp_acquire_count(&kernel_lock, hold_count);
#endif
	SCHED_LOCK();
}
4454bde56adStedu 
4464bde56adStedu /*
4474bde56adStedu  * Change process state to be runnable,
448381e34d2Sguenther  * placing it on the run queue.
4494bde56adStedu  */
void
setrunnable(struct proc *p)
{
	struct process *pr = p->p_p;
	u_char prio;

	SCHED_ASSERT_LOCKED();

	switch (p->p_stat) {
	case 0:
	case SRUN:
	case SONPROC:
	case SDEAD:
	case SIDL:
	default:
		/* Only stopped or sleeping threads can become runnable. */
		panic("setrunnable");
	case SSTOP:
		prio = p->p_usrpri;
		/* if not yet asleep, unstop but don't add to runqueue */
		if (ISSET(p->p_flag, P_WSLEEP)) {
			p->p_stat = SSLEEP;
			return;
		}
		setrunqueue(NULL, p, prio);
		break;
	case SSLEEP:
		prio = p->p_slppri;

		/* if not yet asleep, don't add to runqueue */
		if (ISSET(p->p_flag, P_WSLEEP))
			return;
		setrunqueue(NULL, p, prio);
		TRACEPOINT(sched, wakeup, p->p_tid + THREAD_PID_OFFSET,
		    p->p_p->ps_pid, CPU_INFO_UNIT(p->p_cpu));
		break;
	}
	/* Long sleepers get their CPU usage estimate decayed. */
	if (p->p_slptime > 1) {
		uint32_t newcpu;

		newcpu = decay_aftersleep(p->p_estcpu, p->p_slptime);
		setpriority(p, newcpu, pr->ps_nice);
	}
	p->p_slptime = 0;
}
4944bde56adStedu 
4954bde56adStedu /*
49655eab86cSmpi  * Compute the priority of a process.
4974bde56adStedu  */
4984bde56adStedu void
49955eab86cSmpi setpriority(struct proc *p, uint32_t newcpu, uint8_t nice)
5004bde56adStedu {
50155eab86cSmpi 	unsigned int newprio;
50255eab86cSmpi 
50355eab86cSmpi 	newprio = min((PUSER + newcpu + NICE_WEIGHT * (nice - NZERO)), MAXPRI);
5044bde56adStedu 
5054bde56adStedu 	SCHED_ASSERT_LOCKED();
50655eab86cSmpi 	p->p_estcpu = newcpu;
50755eab86cSmpi 	p->p_usrpri = newprio;
5084bde56adStedu }
5094bde56adStedu 
5104bde56adStedu /*
5114bde56adStedu  * We adjust the priority of the current process.  The priority of a process
5124bde56adStedu  * gets worse as it accumulates CPU time.  The cpu usage estimator (p_estcpu)
5134bde56adStedu  * is increased here.  The formula for computing priorities (in kern_synch.c)
5144bde56adStedu  * will compute a different value each time p_estcpu increases. This can
5154bde56adStedu  * cause a switch, but unless the priority crosses a PPQ boundary the actual
5164bde56adStedu  * queue will not change.  The cpu usage estimator ramps up quite quickly
5174bde56adStedu  * when the process is running (linearly), and decays away exponentially, at
5184bde56adStedu  * a rate which is proportionally slower when the system is busy.  The basic
5194bde56adStedu  * principle is that the system will 90% forget that the process used a lot
5204bde56adStedu  * of CPU time in 5 * loadav seconds.  This causes the system to favor
5214bde56adStedu  * processes which haven't run much recently, and to round-robin among other
5224bde56adStedu  * processes.
5234bde56adStedu  */
void
schedclock(struct proc *p)
{
	struct cpu_info *ci = curcpu();
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	uint32_t newcpu;

	/* Idle and spinning threads do not accumulate estcpu. */
	if (p == spc->spc_idleproc || spc->spc_spinning)
		return;

	SCHED_LOCK();
	/* Charge one tick of CPU usage, capped by ESTCPULIM. */
	newcpu = ESTCPULIM(p->p_estcpu + 1);
	setpriority(p, newcpu, p->p_p->ps_nice);
	SCHED_UNLOCK();
}
539c7a32f40Stedu 
/* Machine-dependent hook to set CPU performance level (0-100); may be NULL. */
void (*cpu_setperf)(int);

#define PERFPOL_MANUAL 0	/* level controlled via hw.setperf */
#define PERFPOL_AUTO 1		/* level adjusted by setperf_auto() */
#define PERFPOL_HIGH 2		/* pinned at full performance */
int perflevel = 100;		/* current performance level, 0-100 */
int perfpolicy_on_ac = PERFPOL_HIGH;
int perfpolicy_on_battery = PERFPOL_AUTO;
548c7a32f40Stedu 
54945b8cd10Sderaadt #ifndef SMALL_KERNEL
55045b8cd10Sderaadt /*
55145b8cd10Sderaadt  * The code below handles CPU throttling.
55245b8cd10Sderaadt  */
55345b8cd10Sderaadt #include <sys/sysctl.h>
554c7a32f40Stedu 
void setperf_auto(void *);
struct timeout setperf_to = TIMEOUT_INITIALIZER(setperf_auto, NULL);
extern int hw_power;	/* nonzero selects the AC policy; see current_perfpolicy() */
558c7a32f40Stedu 
559cc51e07cSjca static inline int
560cc51e07cSjca perfpolicy_dynamic(void)
561cc51e07cSjca {
562cc51e07cSjca 	return (perfpolicy_on_ac == PERFPOL_AUTO ||
563cc51e07cSjca 	    perfpolicy_on_battery == PERFPOL_AUTO);
564cc51e07cSjca }
565cc51e07cSjca 
566cc51e07cSjca static inline int
567cc51e07cSjca current_perfpolicy(void)
568cc51e07cSjca {
569cc51e07cSjca 	return (hw_power) ? perfpolicy_on_ac : perfpolicy_on_battery;
570cc51e07cSjca }
571cc51e07cSjca 
572c7a32f40Stedu void
573c7a32f40Stedu setperf_auto(void *v)
574c7a32f40Stedu {
575c7a32f40Stedu 	static uint64_t *idleticks, *totalticks;
57675514d34Stedu 	static int downbeats;
5770294e0beSderaadt 	int i, j = 0;
5780294e0beSderaadt 	int speedup = 0;
579c7a32f40Stedu 	CPU_INFO_ITERATOR cii;
580c7a32f40Stedu 	struct cpu_info *ci;
5810294e0beSderaadt 	uint64_t idle, total, allidle = 0, alltotal = 0;
582c7a32f40Stedu 
583cc51e07cSjca 	if (!perfpolicy_dynamic())
584c7a32f40Stedu 		return;
585c7a32f40Stedu 
5860294e0beSderaadt 	if (cpu_setperf == NULL)
5870294e0beSderaadt 		return;
5880294e0beSderaadt 
589cc51e07cSjca 	if (current_perfpolicy() == PERFPOL_HIGH) {
5900294e0beSderaadt 		speedup = 1;
5910294e0beSderaadt 		goto faster;
5920294e0beSderaadt 	}
5930294e0beSderaadt 
594c7a32f40Stedu 	if (!idleticks)
595f8238f3eSdoug 		if (!(idleticks = mallocarray(ncpusfound, sizeof(*idleticks),
596c7a32f40Stedu 		    M_DEVBUF, M_NOWAIT | M_ZERO)))
597c7a32f40Stedu 			return;
598c7a32f40Stedu 	if (!totalticks)
599f8238f3eSdoug 		if (!(totalticks = mallocarray(ncpusfound, sizeof(*totalticks),
600c7a32f40Stedu 		    M_DEVBUF, M_NOWAIT | M_ZERO))) {
6015ff140d2Sderaadt 			free(idleticks, M_DEVBUF,
6025ff140d2Sderaadt 			    sizeof(*idleticks) * ncpusfound);
603c7a32f40Stedu 			return;
604c7a32f40Stedu 		}
605c7a32f40Stedu 	CPU_INFO_FOREACH(cii, ci) {
606a4278b79Ssolene 		if (!cpu_is_online(ci))
607a4278b79Ssolene 			continue;
608c7a32f40Stedu 		total = 0;
609c7a32f40Stedu 		for (i = 0; i < CPUSTATES; i++) {
610c7a32f40Stedu 			total += ci->ci_schedstate.spc_cp_time[i];
611c7a32f40Stedu 		}
612c7a32f40Stedu 		total -= totalticks[j];
613c7a32f40Stedu 		idle = ci->ci_schedstate.spc_cp_time[CP_IDLE] - idleticks[j];
614c7a32f40Stedu 		if (idle < total / 3)
615c7a32f40Stedu 			speedup = 1;
616c7a32f40Stedu 		alltotal += total;
617c7a32f40Stedu 		allidle += idle;
618c7a32f40Stedu 		idleticks[j] += idle;
619c7a32f40Stedu 		totalticks[j] += total;
620c7a32f40Stedu 		j++;
621c7a32f40Stedu 	}
622c7a32f40Stedu 	if (allidle < alltotal / 2)
623c7a32f40Stedu 		speedup = 1;
624be93862eSsolene 	if (speedup && downbeats < 5)
625be93862eSsolene 		downbeats++;
626c7a32f40Stedu 
627c7a32f40Stedu 	if (speedup && perflevel != 100) {
6280294e0beSderaadt faster:
629c7a32f40Stedu 		perflevel = 100;
630c7a32f40Stedu 		cpu_setperf(perflevel);
63175514d34Stedu 	} else if (!speedup && perflevel != 0 && --downbeats <= 0) {
632c7a32f40Stedu 		perflevel = 0;
633c7a32f40Stedu 		cpu_setperf(perflevel);
634c7a32f40Stedu 	}
635c7a32f40Stedu 
636c7a32f40Stedu 	timeout_add_msec(&setperf_to, 100);
637c7a32f40Stedu }
638c7a32f40Stedu 
/*
 * sysctl handler for hw.setperf: read or set the performance level
 * (0-100).  Writable only under the manual policy; otherwise read-only.
 */
int
sysctl_hwsetperf(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
{
	int err;

	if (!cpu_setperf)
		return EOPNOTSUPP;

	/*
	 * NOTE(review): only the AC policy gates writability here; the
	 * battery policy is not consulted -- confirm this is intended.
	 */
	if (perfpolicy_on_ac != PERFPOL_MANUAL)
		return sysctl_rdint(oldp, oldlenp, newp, perflevel);

	err = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
	    &perflevel, 0, 100);
	if (err)
		return err;

	/* Apply the new level only on an actual write. */
	if (newp != NULL)
		cpu_setperf(perflevel);

	return 0;
}
660c7a32f40Stedu 
/*
 * sysctl handler for hw.perfpolicy.  Reads report the currently active
 * policy as a string; writes accept "manual", "auto", "high", or a
 * two-policy form "acpolicy,batterypolicy" (manual not allowed in the
 * two-policy form).
 */
int
sysctl_hwperfpolicy(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
{
	char policy[32];
	char *policy_on_battery;
	int err, perfpolicy;

	if (!cpu_setperf)
		return EOPNOTSUPP;

	/* Render the active (power-source dependent) policy as a string. */
	switch (current_perfpolicy()) {
	case PERFPOL_MANUAL:
		strlcpy(policy, "manual", sizeof(policy));
		break;
	case PERFPOL_AUTO:
		strlcpy(policy, "auto", sizeof(policy));
		break;
	case PERFPOL_HIGH:
		strlcpy(policy, "high", sizeof(policy));
		break;
	default:
		strlcpy(policy, "unknown", sizeof(policy));
		break;
	}

	if (newp == NULL)
		return sysctl_rdstring(oldp, oldlenp, newp, policy);

	err = sysctl_string(oldp, oldlenp, newp, newlen, policy, sizeof(policy));
	if (err)
		return err;

	/* Split an optional ",batterypolicy" suffix off the written value. */
	policy_on_battery = strchr(policy, ',');
	if (policy_on_battery != NULL) {
		*policy_on_battery = '\0';
		policy_on_battery++;
	}

	if (strcmp(policy, "manual") == 0)
		perfpolicy = PERFPOL_MANUAL;
	else if (strcmp(policy, "auto") == 0)
		perfpolicy = PERFPOL_AUTO;
	else if (strcmp(policy, "high") == 0)
		perfpolicy = PERFPOL_HIGH;
	else
		return EINVAL;

	if (policy_on_battery == NULL)
		perfpolicy_on_battery = perfpolicy_on_ac = perfpolicy;
	else {
		/* "manual" cannot be combined with a battery policy. */
		if (strcmp(policy_on_battery, "manual") == 0 ||
		    perfpolicy == PERFPOL_MANUAL) {
			/* Not handled */
			return EINVAL;
		}
		if (strcmp(policy_on_battery, "auto") == 0)
			perfpolicy_on_battery = PERFPOL_AUTO;
		else if (strcmp(policy_on_battery, "high") == 0)
			perfpolicy_on_battery = PERFPOL_HIGH;
		else
			return EINVAL;
		perfpolicy_on_ac = perfpolicy;
	}

	/* Apply immediately if the active policy is now "high". */
	if (current_perfpolicy() == PERFPOL_HIGH) {
		perflevel = 100;
		cpu_setperf(perflevel);
	}

	/* (Re)start the auto-adjust timeout if any policy is dynamic. */
	if (perfpolicy_dynamic())
		timeout_add_msec(&setperf_to, 200);

	return 0;
}
735c7a32f40Stedu #endif
7360294e0beSderaadt 
7376c64bd7fScheloha /*
7386c64bd7fScheloha  * Start the scheduler's periodic timeouts.
7396c64bd7fScheloha  */
void
scheduler_start(void)
{
	/* Each call primes its own self-rearming timeout. */
	schedcpu(NULL);
	update_loadavg(NULL);

#ifndef SMALL_KERNEL
	/* Kick off CPU auto-throttling if a dynamic policy is configured. */
	if (perfpolicy_dynamic())
		timeout_add_msec(&setperf_to, 200);
#endif
}
7510294e0beSderaadt 
752