/*	$OpenBSD: sched_bsd.c,v 1.98 2024/11/24 13:02:37 claudio Exp $	*/
/*	$NetBSD: kern_synch.c,v 1.37 1996/04/22 01:38:37 christos Exp $	*/

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.6 (Berkeley) 1/21/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/clockintr.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <uvm/uvm_extern.h>
#include <sys/sched.h>
#include <sys/timeout.h>
#include <sys/smr.h>
#include <sys/tracepoint.h>

#ifdef KTRACE
#include <sys/ktrace.h>
#endif

uint64_t roundrobin_period;	/* [I] roundrobin period (ns) */
int	lbolt;			/* once a second sleep address */

struct mutex sched_lock;

void	update_loadavg(void *);
void	schedcpu(void *);
uint32_t decay_aftersleep(uint32_t, uint32_t);

extern struct cpuset sched_idle_cpus;

/*
 * Constants for averages over 1, 5, and 15 minutes when sampling at
 * 5 second intervals.
 */
static const fixpt_t cexp[3] = {
	0.9200444146293232 * FSCALE,	/* exp(-1/12) */
	0.9834714538216174 * FSCALE,	/* exp(-1/60) */
	0.9944598480048967 * FSCALE,	/* exp(-1/180) */
};

struct loadavg averunnable;

/*
 * Force switch among equal priority processes every 100ms.
 */
void
roundrobin(struct clockrequest *cr, void *cf, void *arg)
{
	uint64_t count;
	struct cpu_info *ci = curcpu();
	struct schedstate_percpu *spc = &ci->ci_schedstate;

	count = clockrequest_advance(cr, roundrobin_period);

	if (ci->ci_curproc != NULL) {
		if (spc->spc_schedflags & SPCF_SEENRR || count >= 2) {
			/*
			 * The process has already been through a roundrobin
			 * without switching and may be hogging the CPU.
			 * Indicate that the process should yield.
			 */
			atomic_setbits_int(&spc->spc_schedflags,
			    SPCF_SEENRR | SPCF_SHOULDYIELD);
		} else {
			atomic_setbits_int(&spc->spc_schedflags,
			    SPCF_SEENRR);
		}
	}

	if (spc->spc_nrun || spc->spc_schedflags & SPCF_SHOULDYIELD)
		need_resched(ci);
}

/*
 * update_loadavg: compute a tenex style load average of a quantity on
 * 1, 5, and 15 minute intervals.
 */
void
update_loadavg(void *unused)
{
	static struct timeout to = TIMEOUT_INITIALIZER(update_loadavg, NULL);
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	u_int i, nrun = 0;

	CPU_INFO_FOREACH(cii, ci) {
		if (!cpuset_isset(&sched_idle_cpus, ci))
			nrun++;
		nrun += ci->ci_schedstate.spc_nrun;
	}

	for (i = 0; i < 3; i++) {
		averunnable.ldavg[i] = (cexp[i] * averunnable.ldavg[i] +
		    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
	}

	timeout_add_sec(&to, 5);
}
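
/*
 * Illustrative sketch of the fixed-point averaging above; a standalone
 * userland program, not kernel code.  FSHIFT/FSCALE mirror
 * <sys/param.h>, the cexp values are the constants above rounded to
 * fixed point, and nrun is a made-up, constant run-queue length.
 */
#if 0
#include <stdio.h>

#define FSHIFT	11
#define FSCALE	(1L << FSHIFT)

int
main(void)
{
	const long cexp[3] = { 1884, 2014, 2036 };	/* exp(-1/12), ... */
	long ldavg[3] = { 0, 0, 0 };
	const long nrun = 2;	/* pretend two threads stay runnable */
	int i, t;

	/* one sample every 5 seconds; simulate 15 minutes of uptime */
	for (t = 0; t < 180; t++)
		for (i = 0; i < 3; i++)
			ldavg[i] = (cexp[i] * ldavg[i] +
			    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;

	/*
	 * The 1 minute average has converged to ~2.00; the 5 and
	 * 15 minute averages are still climbing toward it.
	 */
	for (i = 0; i < 3; i++)
		printf("%.2f\n", (double)ldavg[i] / FSCALE);
	return 0;
}
#endif
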
/*
 * Constants for digital decay and forget:
 *	90% of (p_estcpu) usage in 5 * loadav time
 *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 *          Note that, as ps(1) mentions, this can let percentages
 *          total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that hardclock updates p_estcpu and p_cpticks independently.
 *
 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 * 	for (i = 0; i < (5 * loadavg); i++)
 * 		p_estcpu *= decay;
 * will compute
 * 	p_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 * 	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 * 	b = 2 * loadavg
 * then
 * 	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 * 	For x close to zero, exp(x) =~ 1 + x, since
 * 		exp(x) = 0! + x**1/1! + x**2/2! + ... .
 * 	therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 * 	For x close to zero, ln(1+x) =~ x, since
 * 		ln(1+x) = x - x**2/2 + x**3/3 - ...	-1 < x < 1
 * 	therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 * 	ln(.1) =~ -2.30
 *
 * Proof of (1):
 *	Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *	ln(factor) =~ (-2.30/5*loadav), or
 *	factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *	    exp(-1/b) =~ (b-1)/b =~ b/(b+1).			QED
 *
 * Proof of (2):
 *	Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *	power*ln(b/(b+1)) =~ -2.30, or
 *	power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.	QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *	loadav: 1	2	3	4
 *	power:	5.68	10.32	14.94	19.55
 */

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11
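
/*
 * A userland sketch (not kernel code) of the decay rate: starting from
 * the maximum p_estcpu of 255 and holding the load average fixed, count
 * how many applications of decay_cpu() it takes to shed 90% of the
 * estimate.  It prints 6, 11, 15 and 20 seconds for load averages 1-4,
 * i.e. the 5.68/10.32/14.94/19.55 powers above rounded up to whole
 * seconds.  estcpu is kept scaled by FSCALE here so that integer
 * truncation stays negligible.
 */
#if 0
#include <stdio.h>

#define FSHIFT	11
#define FSCALE	(1LL << FSHIFT)
typedef long long fixpt_t;

#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

int
main(void)
{
	int la;

	for (la = 1; la <= 4; la++) {
		fixpt_t loadfac = loadfactor(la * FSCALE);
		fixpt_t estcpu = 255 * FSCALE;
		int n = 0;

		while (estcpu > 255 * FSCALE / 10) {
			estcpu = decay_cpu(loadfac, estcpu);
			n++;
		}
		printf("loadav %d: 90%% forgotten after %d seconds\n", la, n);
	}
	return 0;
}
#endif
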
/*
 * Recompute process priorities once per second.
 */
void
schedcpu(void *unused)
{
	static struct timeout to = TIMEOUT_INITIALIZER(schedcpu, NULL);
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct proc *p;
	unsigned int newcpu;

	LIST_FOREACH(p, &allproc, p_list) {
		/*
		 * Idle threads are never placed on the runqueue,
		 * therefore computing their priority is pointless.
		 */
		if (p->p_cpu != NULL &&
		    p->p_cpu->ci_schedstate.spc_idleproc == p)
			continue;
		/*
		 * Increment sleep time (if sleeping). We ignore overflow.
		 */
		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
			p->p_slptime++;
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (p->p_slptime > 1)
			continue;
		SCHED_LOCK();
		/*
		 * p_pctcpu is only for diagnostic tools such as ps.
		 */
#if	(FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (stathz == 100) ?
		    ((fixpt_t)p->p_cpticks) << (FSHIFT - CCPU_SHIFT) :
		    100 * (((fixpt_t)p->p_cpticks)
		    << (FSHIFT - CCPU_SHIFT)) / stathz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
		    (p->p_cpticks * FSCALE / stathz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		newcpu = (u_int)decay_cpu(loadfac, p->p_estcpu);
		setpriority(p, newcpu, p->p_p->ps_nice);

		if (p->p_stat == SRUN &&
		    (p->p_runpri / SCHED_PPQ) != (p->p_usrpri / SCHED_PPQ)) {
			remrunqueue(p);
			setrunqueue(p->p_cpu, p, p->p_usrpri);
		}
		SCHED_UNLOCK();
	}
	wakeup(&lbolt);
	timeout_add_sec(&to, 1);
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay p_estcpu to zero.
 */
uint32_t
decay_aftersleep(uint32_t estcpu, uint32_t slptime)
{
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	uint32_t newcpu;

	if (slptime > 5 * loadfac)
		newcpu = 0;
	else {
		newcpu = estcpu;
		slptime--;	/* the first time was done in schedcpu */
		while (newcpu && --slptime)
			newcpu = decay_cpu(loadfac, newcpu);
	}

	return (newcpu);
}
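
/*
 * Userland sketch of the table decay_aftersleep() produces: the function
 * is transcribed from above, but with the load factor passed as a
 * parameter instead of read from averunnable, and the load average
 * pinned at 1.0.  In the kernel the caller (setrunnable) only invokes
 * it for p_slptime > 1.
 */
#if 0
#include <stdio.h>

#define FSHIFT	11
#define FSCALE	(1LL << FSHIFT)
typedef long long fixpt_t;

#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

unsigned int
decay_aftersleep(fixpt_t loadfac, unsigned int estcpu, unsigned int slptime)
{
	unsigned int newcpu;

	if (slptime > 5 * loadfac)
		newcpu = 0;
	else {
		newcpu = estcpu;
		slptime--;	/* the first time was done in schedcpu */
		while (newcpu && --slptime)
			newcpu = decay_cpu(loadfac, newcpu);
	}
	return newcpu;
}

int
main(void)
{
	fixpt_t loadfac = loadfactor(1 * FSCALE);	/* loadavg 1.0 */
	unsigned int s;

	/* at this load, an estcpu of 255 reaches zero after ~15s asleep */
	for (s = 2; s <= 17; s += 3)
		printf("slept %2u sec: estcpu 255 -> %u\n", s,
		    decay_aftersleep(loadfac, 255, s));
	return 0;
}
#endif
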
/*
 * General yield call. Puts the current process back on its run queue and
 * performs a voluntary context switch.
 */
void
yield(void)
{
	struct proc *p = curproc;

	SCHED_LOCK();
	setrunqueue(p->p_cpu, p, p->p_usrpri);
	p->p_ru.ru_nvcsw++;
	mi_switch();
	SCHED_UNLOCK();
}

/*
 * General preemption call. Puts the current process back on its run queue
 * and performs an involuntary context switch. The next process to run is
 * picked by the normal selection criteria in mi_switch().
 */
void
preempt(void)
{
	struct proc *p = curproc;

	SCHED_LOCK();
	setrunqueue(p->p_cpu, p, p->p_usrpri);
	p->p_ru.ru_nivcsw++;
	mi_switch();
	SCHED_UNLOCK();
}

void
mi_switch(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *p = curproc;
	struct proc *nextproc;
	int oldipl;
#ifdef MULTIPROCESSOR
	int hold_count;
#endif

	KASSERT(p->p_stat != SONPROC);

	SCHED_ASSERT_LOCKED();

#ifdef MULTIPROCESSOR
	/*
	 * Release the kernel_lock, as we are about to yield the CPU.
	 */
	if (_kernel_lock_held())
		hold_count = __mp_release_all(&kernel_lock);
	else
		hold_count = 0;
#endif

	/* Update thread runtime */
	tuagg_add_runtime();

	/* Stop any optional clock interrupts. */
	if (ISSET(spc->spc_schedflags, SPCF_ITIMER)) {
		atomic_clearbits_int(&spc->spc_schedflags, SPCF_ITIMER);
		clockintr_cancel(&spc->spc_itimer);
	}
	if (ISSET(spc->spc_schedflags, SPCF_PROFCLOCK)) {
		atomic_clearbits_int(&spc->spc_schedflags, SPCF_PROFCLOCK);
		clockintr_cancel(&spc->spc_profclock);
	}

	/*
	 * Process is about to yield the CPU; clear the appropriate
	 * scheduling flags.
	 */
	atomic_clearbits_int(&spc->spc_schedflags, SPCF_SWITCHCLEAR);

	nextproc = sched_chooseproc();

	/* preserve old IPL level so we can switch back to that */
	oldipl = MUTEX_OLDIPL(&sched_lock);

	if (p != nextproc) {
		uvmexp.swtch++;
		TRACEPOINT(sched, off__cpu, nextproc->p_tid + THREAD_PID_OFFSET,
		    nextproc->p_p->ps_pid);
		cpu_switchto(p, nextproc);
		TRACEPOINT(sched, on__cpu, NULL);
	} else {
		TRACEPOINT(sched, remain__cpu, NULL);
		p->p_stat = SONPROC;
	}

	clear_resched(curcpu());

	SCHED_ASSERT_LOCKED();

	/* Restore proc's IPL. */
	MUTEX_OLDIPL(&sched_lock) = oldipl;
	SCHED_UNLOCK();

	SCHED_ASSERT_UNLOCKED();

	assertwaitok();
	smr_idle();

	/*
	 * We're running again; record our new start time. We might
	 * be running on a new CPU now, so refetch the schedstate_percpu
	 * pointer.
	 */
	KASSERT(p->p_cpu == curcpu());
	spc = &p->p_cpu->ci_schedstate;

	/* Start any optional clock interrupts needed by the thread. */
	if (ISSET(p->p_p->ps_flags, PS_ITIMER)) {
		atomic_setbits_int(&spc->spc_schedflags, SPCF_ITIMER);
		clockintr_advance(&spc->spc_itimer, hardclock_period);
	}
	if (ISSET(p->p_p->ps_flags, PS_PROFIL)) {
		atomic_setbits_int(&spc->spc_schedflags, SPCF_PROFCLOCK);
		clockintr_advance(&spc->spc_profclock, profclock_period);
	}

	nanouptime(&spc->spc_runtime);

#ifdef MULTIPROCESSOR
	/*
	 * Reacquire the kernel_lock now. We do this after we've
	 * released the scheduler lock to avoid deadlock, and before
	 * we reacquire the interlock and the scheduler lock.
	 */
	if (hold_count)
		__mp_acquire_count(&kernel_lock, hold_count);
#endif
	SCHED_LOCK();
}
/*
 * Change process state to be runnable,
 * placing it on the run queue.
 */
void
setrunnable(struct proc *p)
{
	struct process *pr = p->p_p;
	u_char prio;

	SCHED_ASSERT_LOCKED();

	switch (p->p_stat) {
	case 0:
	case SRUN:
	case SONPROC:
	case SDEAD:
	case SIDL:
	default:
		panic("setrunnable");
	case SSTOP:
		prio = p->p_usrpri;
		/* if not yet asleep, unstop but don't add to runqueue */
		if (ISSET(p->p_flag, P_WSLEEP)) {
			p->p_stat = SSLEEP;
			return;
		}
		setrunqueue(NULL, p, prio);
		break;
	case SSLEEP:
		prio = p->p_slppri;

		/* if not yet asleep, don't add to runqueue */
		if (ISSET(p->p_flag, P_WSLEEP))
			return;
		setrunqueue(NULL, p, prio);
		TRACEPOINT(sched, wakeup, p->p_tid + THREAD_PID_OFFSET,
		    p->p_p->ps_pid, CPU_INFO_UNIT(p->p_cpu));
		break;
	}
	if (p->p_slptime > 1) {
		uint32_t newcpu;

		newcpu = decay_aftersleep(p->p_estcpu, p->p_slptime);
		setpriority(p, newcpu, pr->ps_nice);
	}
	p->p_slptime = 0;
}

/*
 * Compute the priority of a process.
 */
void
setpriority(struct proc *p, uint32_t newcpu, uint8_t nice)
{
	unsigned int newprio;

	newprio = min((PUSER + newcpu + NICE_WEIGHT * (nice - NZERO)), MAXPRI);

	SCHED_ASSERT_LOCKED();
	p->p_estcpu = newcpu;
	p->p_usrpri = newprio;
}
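
/*
 * A worked sketch of the priority formula above, runnable in userland.
 * The constant values are assumed to match sys/param.h and sys/sched.h
 * (PUSER 50, MAXPRI 127, NZERO 20, NICE_WEIGHT 2); a larger priority
 * number means a less favorable priority.
 */
#if 0
#include <stdio.h>

#define PUSER		50	/* base user priority */
#define MAXPRI		127
#define NZERO		20	/* default nice value */
#define NICE_WEIGHT	2	/* priorities per nice level */

static int
prio(int estcpu, int nice)
{
	int p = PUSER + estcpu + NICE_WEIGHT * (nice - NZERO);

	return p < MAXPRI ? p : MAXPRI;
}

int
main(void)
{
	printf("%d\n", prio(0, NZERO));		/* fresh process: 50 */
	printf("%d\n", prio(60, NZERO));	/* CPU hog: 110, i.e. worse */
	printf("%d\n", prio(200, 39));		/* hog + max nice: clamped to 127 */
	printf("%d\n", prio(60, 0));		/* best nice offsets estcpu: 70 */
	return 0;
}
#endif
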
/*
 * We adjust the priority of the current process. The priority of a process
 * gets worse as it accumulates CPU time. The cpu usage estimator (p_estcpu)
 * is increased here. The formula for computing priorities (in setpriority()
 * above) will compute a different value each time p_estcpu increases. This
 * can cause a switch, but unless the priority crosses a PPQ boundary the
 * actual queue will not change. The cpu usage estimator ramps up quite
 * quickly when the process is running (linearly), and decays away
 * exponentially, at a rate which is proportionally slower when the system is
 * busy. The basic principle is that the system will 90% forget that the
 * process used a lot of CPU time in 5 * loadav seconds. This causes the
 * system to favor processes which haven't run much recently, and to
 * round-robin among other processes.
 */
void
schedclock(struct proc *p)
{
	struct cpu_info *ci = curcpu();
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	uint32_t newcpu;

	if (p == spc->spc_idleproc || spc->spc_spinning)
		return;

	SCHED_LOCK();
	newcpu = ESTCPULIM(p->p_estcpu + 1);
	setpriority(p, newcpu, p->p_p->ps_nice);
	SCHED_UNLOCK();
}

void (*cpu_setperf)(int);

#define PERFPOL_MANUAL 0
#define PERFPOL_AUTO 1
#define PERFPOL_HIGH 2
int perflevel = 100;
int perfpolicy_on_ac = PERFPOL_HIGH;
int perfpolicy_on_battery = PERFPOL_AUTO;

#ifndef SMALL_KERNEL
/*
 * The code below handles CPU throttling.
 */
#include <sys/sysctl.h>

void setperf_auto(void *);
struct timeout setperf_to = TIMEOUT_INITIALIZER(setperf_auto, NULL);
extern int hw_power;

static inline int
perfpolicy_dynamic(void)
{
	return (perfpolicy_on_ac == PERFPOL_AUTO ||
	    perfpolicy_on_battery == PERFPOL_AUTO);
}

static inline int
current_perfpolicy(void)
{
	return (hw_power) ? perfpolicy_on_ac : perfpolicy_on_battery;
}

void
setperf_auto(void *v)
{
	static uint64_t *idleticks, *totalticks;
	static int downbeats;
	int i, j = 0;
	int speedup = 0;
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	uint64_t idle, total, allidle = 0, alltotal = 0;

	if (!perfpolicy_dynamic())
		return;

	if (cpu_setperf == NULL)
		return;

	if (current_perfpolicy() == PERFPOL_HIGH) {
		speedup = 1;
		goto faster;
	}

	if (!idleticks)
		if (!(idleticks = mallocarray(ncpusfound, sizeof(*idleticks),
		    M_DEVBUF, M_NOWAIT | M_ZERO)))
			return;
	if (!totalticks)
		if (!(totalticks = mallocarray(ncpusfound, sizeof(*totalticks),
		    M_DEVBUF, M_NOWAIT | M_ZERO))) {
			free(idleticks, M_DEVBUF,
			    sizeof(*idleticks) * ncpusfound);
			return;
		}
	CPU_INFO_FOREACH(cii, ci) {
		if (!cpu_is_online(ci))
			continue;
		total = 0;
		for (i = 0; i < CPUSTATES; i++) {
			total += ci->ci_schedstate.spc_cp_time[i];
		}
		total -= totalticks[j];
		idle = ci->ci_schedstate.spc_cp_time[CP_IDLE] - idleticks[j];
		if (idle < total / 3)
			speedup = 1;
		alltotal += total;
		allidle += idle;
		idleticks[j] += idle;
		totalticks[j] += total;
		j++;
	}
	if (allidle < alltotal / 2)
		speedup = 1;
	if (speedup && downbeats < 5)
		downbeats++;

	if (speedup && perflevel != 100) {
faster:
		perflevel = 100;
		cpu_setperf(perflevel);
	} else if (!speedup && perflevel != 0 && --downbeats <= 0) {
		perflevel = 0;
		cpu_setperf(perflevel);
	}

	timeout_add_msec(&setperf_to, 100);
}
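
/*
 * Distilled userland sketch of the speedup decision above for a single
 * sampling interval, with made-up tick deltas: throttle up if any one
 * CPU was idle for less than a third of the interval, or if the machine
 * as a whole was idle for less than half of it.  The real function also
 * keeps cumulative per-CPU tick state and applies the "downbeats"
 * hysteresis before throttling back down.
 */
#if 0
#include <stdio.h>

static int
want_speedup(const int idle[], const int total[], int ncpu)
{
	int i, allidle = 0, alltotal = 0, speedup = 0;

	for (i = 0; i < ncpu; i++) {
		if (idle[i] < total[i] / 3)
			speedup = 1;	/* one busy CPU is enough */
		allidle += idle[i];
		alltotal += total[i];
	}
	if (allidle < alltotal / 2)
		speedup = 1;		/* or an overall busy machine */
	return speedup;
}

int
main(void)
{
	int idle1[2] = { 90, 95 }, idle2[2] = { 20, 95 };
	int total[2] = { 100, 100 };

	printf("%d\n", want_speedup(idle1, total, 2));	/* 0: stay slow */
	printf("%d\n", want_speedup(idle2, total, 2));	/* 1: speed up */
	return 0;
}
#endif
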
int
sysctl_hwsetperf(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
{
	int err;

	if (!cpu_setperf)
		return EOPNOTSUPP;

	if (perfpolicy_on_ac != PERFPOL_MANUAL)
		return sysctl_rdint(oldp, oldlenp, newp, perflevel);

	err = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
	    &perflevel, 0, 100);
	if (err)
		return err;

	if (newp != NULL)
		cpu_setperf(perflevel);

	return 0;
}

int
sysctl_hwperfpolicy(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
{
	char policy[32];
	char *policy_on_battery;
	int err, perfpolicy;

	if (!cpu_setperf)
		return EOPNOTSUPP;

	switch (current_perfpolicy()) {
	case PERFPOL_MANUAL:
		strlcpy(policy, "manual", sizeof(policy));
		break;
	case PERFPOL_AUTO:
		strlcpy(policy, "auto", sizeof(policy));
		break;
	case PERFPOL_HIGH:
		strlcpy(policy, "high", sizeof(policy));
		break;
	default:
		strlcpy(policy, "unknown", sizeof(policy));
		break;
	}

	if (newp == NULL)
		return sysctl_rdstring(oldp, oldlenp, newp, policy);

	err = sysctl_string(oldp, oldlenp, newp, newlen, policy, sizeof(policy));
	if (err)
		return err;

	policy_on_battery = strchr(policy, ',');
	if (policy_on_battery != NULL) {
		*policy_on_battery = '\0';
		policy_on_battery++;
	}

	if (strcmp(policy, "manual") == 0)
		perfpolicy = PERFPOL_MANUAL;
	else if (strcmp(policy, "auto") == 0)
		perfpolicy = PERFPOL_AUTO;
	else if (strcmp(policy, "high") == 0)
		perfpolicy = PERFPOL_HIGH;
	else
		return EINVAL;

	if (policy_on_battery == NULL)
		perfpolicy_on_battery = perfpolicy_on_ac = perfpolicy;
	else {
		if (strcmp(policy_on_battery, "manual") == 0 ||
		    perfpolicy == PERFPOL_MANUAL) {
			/* Not handled */
			return EINVAL;
		}
		if (strcmp(policy_on_battery, "auto") == 0)
			perfpolicy_on_battery = PERFPOL_AUTO;
		else if (strcmp(policy_on_battery, "high") == 0)
			perfpolicy_on_battery = PERFPOL_HIGH;
		else
			return EINVAL;
		perfpolicy_on_ac = perfpolicy;
	}

	if (current_perfpolicy() == PERFPOL_HIGH) {
		perflevel = 100;
		cpu_setperf(perflevel);
	}

	if (perfpolicy_dynamic())
		timeout_add_msec(&setperf_to, 200);

	return 0;
}
#endif

/*
 * Start the scheduler's periodic timeouts.
 */
void
scheduler_start(void)
{
	schedcpu(NULL);
	update_loadavg(NULL);

#ifndef SMALL_KERNEL
	if (perfpolicy_dynamic())
		timeout_add_msec(&setperf_to, 200);
#endif
}