/*	$OpenBSD: kern_sched.c,v 1.103 2024/11/24 13:05:14 claudio Exp $	*/
/*
 * Copyright (c) 2007, 2008 Artur Grabowski <art@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>

#include <sys/sched.h>
#include <sys/proc.h>
#include <sys/kthread.h>
#include <sys/systm.h>
#include <sys/clockintr.h>
#include <sys/resourcevar.h>
#include <sys/task.h>
#include <sys/time.h>
#include <sys/smr.h>
#include <sys/tracepoint.h>

#include <uvm/uvm_extern.h>

void sched_kthreads_create(void *);

int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
struct proc *sched_steal_proc(struct cpu_info *);

/*
 * To help choosing which cpu should run which process we keep track
 * of cpus which are currently idle and which cpus have processes
 * queued.
 */
struct cpuset sched_idle_cpus;
struct cpuset sched_queued_cpus;
struct cpuset sched_all_cpus;

/*
 * Some general scheduler counters.
 */
uint64_t sched_nmigrations;	/* Cpu migration counter */
uint64_t sched_nomigrations;	/* Cpu no migration counter */
uint64_t sched_noidle;		/* Times we didn't pick the idle task */
uint64_t sched_stolen;		/* Times we stole proc from other cpus */
uint64_t sched_choose;		/* Times we chose a cpu */
uint64_t sched_wasidle;		/* Times we came out of idle */

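/*
 * Non-zero when SMT sibling cpus may be used by the scheduler; toggled
 * through sysctl_hwsmt() below.
 */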
int sched_smt;

/*
 * A few notes about cpu_switchto that is implemented in MD code.
 *
 * cpu_switchto takes two arguments, the old proc and the proc
 * it should switch to. The new proc will never be NULL, so we always have
 * a saved state that we need to switch to. The old proc however can
 * be NULL if the process is exiting. NULL for the old proc simply
 * means "don't bother saving old state".
 *
 * cpu_switchto is supposed to atomically load the new state of the process
 * including the pcb, pmap and setting curproc, the p_cpu pointer in the
 * proc and p_stat to SONPROC. Atomically with respect to interrupts, other
 * cpus in the system must not depend on this state being consistent.
 * Therefore no locking is necessary in cpu_switchto other than blocking
 * interrupts during the context switch.
 */

/*
 * sched_init_cpu is called from main() for the boot cpu, then it's the
 * responsibility of the MD code to call it for all other cpus.
 */
void
sched_init_cpu(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int i;

	for (i = 0; i < SCHED_NQS; i++)
		TAILQ_INIT(&spc->spc_qs[i]);

	spc->spc_idleproc = NULL;

	clockintr_bind(&spc->spc_itimer, ci, itimer_update, NULL);
	clockintr_bind(&spc->spc_profclock, ci, profclock, NULL);
	clockintr_bind(&spc->spc_roundrobin, ci, roundrobin, NULL);
	clockintr_bind(&spc->spc_statclock, ci, statclock, NULL);

	kthread_create_deferred(sched_kthreads_create, ci);

	LIST_INIT(&spc->spc_deadproc);
	SIMPLEQ_INIT(&spc->spc_deferred);

	/*
	 * Slight hack here until the cpuset code handles cpu_info
	 * structures.
	 */
	cpuset_init_cpu(ci);

#ifdef __HAVE_CPU_TOPOLOGY
	if (!sched_smt && ci->ci_smt_id > 0)
		return;
#endif
	cpuset_add(&sched_all_cpus, ci);
}

void
sched_kthreads_create(void *v)
{
	struct cpu_info *ci = v;
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	static int num;

	if (fork1(&proc0, FORK_SHAREVM|FORK_SHAREFILES|FORK_NOZOMBIE|
	    FORK_SYSTEM|FORK_IDLE, sched_idle, ci, NULL,
	    &spc->spc_idleproc))
		panic("fork idle");

	/* Name it as specified. */
	snprintf(spc->spc_idleproc->p_p->ps_comm,
	    sizeof(spc->spc_idleproc->p_p->ps_comm),
	    "idle%d", num);

	num++;
}

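/*
 * Per-cpu idle thread, forked in sched_kthreads_create() above.  On first
 * entry it parks itself; afterwards it hands dead procs to exit2(), runs
 * smr_idle() and spins in cpu_idle_cycle() whenever this cpu has nothing
 * queued.
 */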
void
sched_idle(void *v)
{
	struct schedstate_percpu *spc;
	struct proc *p = curproc;
	struct cpu_info *ci = v;

	KERNEL_UNLOCK();

	spc = &ci->ci_schedstate;

	/*
	 * First time we enter here, we're not supposed to idle,
	 * just go away for a while.
	 */
	SCHED_LOCK();
	cpuset_add(&sched_idle_cpus, ci);
	p->p_stat = SSLEEP;
	p->p_cpu = ci;
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	mi_switch();
	cpuset_del(&sched_idle_cpus, ci);
	SCHED_UNLOCK();

	KASSERT(ci == curcpu());
	KASSERT(curproc == spc->spc_idleproc);

	while (1) {
		while (!cpu_is_idle(curcpu())) {
			struct proc *dead;

			SCHED_LOCK();
			p->p_stat = SSLEEP;
			mi_switch();
			SCHED_UNLOCK();

			while ((dead = LIST_FIRST(&spc->spc_deadproc))) {
				LIST_REMOVE(dead, p_hash);
				exit2(dead);
			}
		}

		splassert(IPL_NONE);

		smr_idle();

		cpuset_add(&sched_idle_cpus, ci);
		cpu_idle_enter();
		while (spc->spc_whichqs == 0) {
#ifdef MULTIPROCESSOR
			if (spc->spc_schedflags & SPCF_SHOULDHALT &&
			    (spc->spc_schedflags & SPCF_HALTED) == 0) {
				cpuset_del(&sched_idle_cpus, ci);
				SCHED_LOCK();
				atomic_setbits_int(&spc->spc_schedflags,
				    spc->spc_whichqs ? 0 : SPCF_HALTED);
				SCHED_UNLOCK();
				wakeup(spc);
			}
#endif
			cpu_idle_cycle();
		}
		cpu_idle_leave();
		cpuset_del(&sched_idle_cpus, ci);
	}
}

/*
 * To free our address space we have to jump through a few hoops.
 * The freeing is done by the reaper, but until we have one reaper
 * per cpu, we have no way of putting this proc on the deadproc list
 * and waking up the reaper without risking having our address space and
 * stack torn from under us before we manage to switch to another proc.
 * Therefore we have a per-cpu list of dead processes where we put this
 * proc and have idle clean up that list and move it to the reaper list.
 * All this will be unnecessary once we can bind the reaper to this cpu
 * and not risk having it switch to another in case it sleeps.
 */
void
sched_exit(struct proc *p)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;

	LIST_INSERT_HEAD(&spc->spc_deadproc, p, p_hash);

	tuagg_add_runtime();

	KERNEL_ASSERT_LOCKED();
	sched_toidle();
}

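/*
 * Hand this cpu over to its idle thread: drop the kernel lock, cancel the
 * per-cpu itimer and profiling clock interrupts if they are armed, and
 * switch to spc_idleproc without saving any state.  Never returns.
 */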
void
sched_toidle(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *idle;

#ifdef MULTIPROCESSOR
	/* This process no longer needs to hold the kernel lock. */
	if (_kernel_lock_held())
		__mp_release_all(&kernel_lock);
#endif

	if (ISSET(spc->spc_schedflags, SPCF_ITIMER)) {
		atomic_clearbits_int(&spc->spc_schedflags, SPCF_ITIMER);
		clockintr_cancel(&spc->spc_itimer);
	}
	if (ISSET(spc->spc_schedflags, SPCF_PROFCLOCK)) {
		atomic_clearbits_int(&spc->spc_schedflags, SPCF_PROFCLOCK);
		clockintr_cancel(&spc->spc_profclock);
	}

	atomic_clearbits_int(&spc->spc_schedflags, SPCF_SWITCHCLEAR);

	SCHED_LOCK();
	idle = spc->spc_idleproc;
	idle->p_stat = SRUN;

	uvmexp.swtch++;
	if (curproc != NULL)
		TRACEPOINT(sched, off__cpu, idle->p_tid + THREAD_PID_OFFSET,
		    idle->p_p->ps_pid);
	cpu_switchto(NULL, idle);
	panic("cpu_switchto returned");
}

/*
 * Run queue management.
 */
void
sched_init_runqueues(void)
{
}

void
setrunqueue(struct cpu_info *ci, struct proc *p, uint8_t prio)
{
	struct schedstate_percpu *spc;
	int queue = prio >> 2;

	if (ci == NULL)
		ci = sched_choosecpu(p);

	KASSERT(ci != NULL);
	SCHED_ASSERT_LOCKED();
	KASSERT(p->p_wchan == NULL);
	KASSERT(!ISSET(p->p_flag, P_WSLEEP));

	p->p_cpu = ci;
	p->p_stat = SRUN;
	p->p_runpri = prio;

	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun++;
	TRACEPOINT(sched, enqueue, p->p_tid + THREAD_PID_OFFSET,
	    p->p_p->ps_pid);

	TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq);
	spc->spc_whichqs |= (1U << queue);
	cpuset_add(&sched_queued_cpus, p->p_cpu);

	if (cpuset_isset(&sched_idle_cpus, p->p_cpu))
		cpu_unidle(p->p_cpu);
	else if (prio < spc->spc_curpriority)
		need_resched(ci);
}

void
remrunqueue(struct proc *p)
{
	struct schedstate_percpu *spc;
	int queue = p->p_runpri >> 2;

	SCHED_ASSERT_LOCKED();
	spc = &p->p_cpu->ci_schedstate;
	spc->spc_nrun--;
	TRACEPOINT(sched, dequeue, p->p_tid + THREAD_PID_OFFSET,
	    p->p_p->ps_pid);

	TAILQ_REMOVE(&spc->spc_qs[queue], p, p_runq);
	if (TAILQ_EMPTY(&spc->spc_qs[queue])) {
		spc->spc_whichqs &= ~(1U << queue);
		if (spc->spc_whichqs == 0)
			cpuset_del(&sched_queued_cpus, p->p_cpu);
	}
}

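/*
 * Pick the next thread for this cpu to run: the head of the highest
 * priority local queue, else something stolen from another cpu, else the
 * idle thread.  If this cpu is being halted, queued threads are first
 * pushed back through setrunqueue() so they land elsewhere.
 */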
struct proc *
sched_chooseproc(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *p;
	int queue;

	SCHED_ASSERT_LOCKED();

#ifdef MULTIPROCESSOR
	if (spc->spc_schedflags & SPCF_SHOULDHALT) {
		if (spc->spc_whichqs) {
			for (queue = 0; queue < SCHED_NQS; queue++) {
				while ((p = TAILQ_FIRST(&spc->spc_qs[queue]))) {
					remrunqueue(p);
					setrunqueue(NULL, p, p->p_runpri);
					if (p->p_cpu == curcpu()) {
						KASSERT(p->p_flag & P_CPUPEG);
						goto again;
					}
				}
			}
		}
		p = spc->spc_idleproc;
		if (p == NULL)
			panic("no idleproc set on CPU%d",
			    CPU_INFO_UNIT(curcpu()));
		p->p_stat = SRUN;
		KASSERT(p->p_wchan == NULL);
		return (p);
	}
again:
#endif

	if (spc->spc_whichqs) {
		queue = ffs(spc->spc_whichqs) - 1;
		p = TAILQ_FIRST(&spc->spc_qs[queue]);
		remrunqueue(p);
		sched_noidle++;
		if (p->p_stat != SRUN)
			panic("thread %d not in SRUN: %d", p->p_tid, p->p_stat);
	} else if ((p = sched_steal_proc(curcpu())) == NULL) {
		p = spc->spc_idleproc;
		if (p == NULL)
			panic("no idleproc set on CPU%d",
			    CPU_INFO_UNIT(curcpu()));
		p->p_stat = SRUN;
	}

	KASSERT(p->p_wchan == NULL);
	KASSERT(!ISSET(p->p_flag, P_WSLEEP));
	return (p);
}

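/*
 * Choose a cpu for a newly forked thread: prefer cpus that are idle with
 * nothing queued, falling back to the one with the fewest runnable
 * threads.
 */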
struct cpu_info *
sched_choosecpu_fork(struct proc *parent, int flags)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	int run, best_run = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

#if 0
	/*
	 * XXX
	 * Don't do this until we have a painless way to move the cpu in exec.
	 * Preferably when nuking the old pmap and getting a new one on a
	 * new cpu.
	 */
	/*
	 * PPWAIT forks are simple. We know that the parent will not
	 * run until we exec and choose another cpu, so we just steal its
	 * cpu.
	 */
	if (flags & FORK_PPWAIT)
		return (parent->p_cpu);
#endif

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, consider all cpus and pick the one with the
	 * fewest threads queued.
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);
	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		cpuset_del(&set, ci);

		run = ci->ci_schedstate.spc_nrun;

		if (choice == NULL || run < best_run) {
			choice = ci;
			best_run = run;
		}
	}

	return (choice);
#else
	return (curcpu());
#endif
}

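/*
 * Choose a cpu for a thread that is about to become runnable: keep pegged
 * threads where they are, prefer the thread's current cpu when it is cheap
 * to do so, otherwise pick the cpu where sched_proc_to_cpu_cost() is
 * lowest.
 */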
struct cpu_info *
sched_choosecpu(struct proc *p)
{
#ifdef MULTIPROCESSOR
	struct cpu_info *choice = NULL;
	int last_cost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	/*
	 * If pegged to a cpu, don't allow it to move.
	 */
	if (p->p_flag & P_CPUPEG)
		return (p->p_cpu);

	sched_choose++;

	/*
	 * Look at all cpus that are currently idle and have nothing queued.
	 * If there are none, fall back to all cpus and pick the cheapest.
	 * (idle + queued could mean that the cpu is handling an interrupt
	 * at this moment and hasn't had time to leave idle yet).
	 */
	cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
	cpuset_intersection(&set, &set, &sched_all_cpus);

	/*
	 * First, just check if our current cpu is in that set, if it is,
	 * this is simple.
	 * Also, our cpu might not be idle, but if it's the current cpu
	 * and it has nothing else queued and we're curproc, take it.
	 */
	if (cpuset_isset(&set, p->p_cpu) ||
	    (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
	    (p->p_cpu->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0 &&
	    curproc == p)) {
		sched_wasidle++;
		return (p->p_cpu);
	}

	if (cpuset_first(&set) == NULL)
		cpuset_copy(&set, &sched_all_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		int cost = sched_proc_to_cpu_cost(ci, p);

		if (choice == NULL || cost < last_cost) {
			choice = ci;
			last_cost = cost;
		}
		cpuset_del(&set, ci);
	}

	if (p->p_cpu != choice)
		sched_nmigrations++;
	else
		sched_nomigrations++;

	return (choice);
#else
	return (curcpu());
#endif
}

/*
 * Attempt to steal a proc from some cpu.
 */
struct proc *
sched_steal_proc(struct cpu_info *self)
{
	struct proc *best = NULL;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc;
	int bestcost = INT_MAX;
	struct cpu_info *ci;
	struct cpuset set;

	KASSERT((self->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0);

	/* Don't steal if we don't want to schedule processes on this CPU. */
	if (!cpuset_isset(&sched_all_cpus, self))
		return (NULL);

	cpuset_copy(&set, &sched_queued_cpus);

	while ((ci = cpuset_first(&set)) != NULL) {
		struct proc *p;
		int queue;
		int cost;

		cpuset_del(&set, ci);

		spc = &ci->ci_schedstate;

		queue = ffs(spc->spc_whichqs) - 1;
		TAILQ_FOREACH(p, &spc->spc_qs[queue], p_runq) {
			if (p->p_flag & P_CPUPEG)
				continue;

			cost = sched_proc_to_cpu_cost(self, p);

			if (best == NULL || cost < bestcost) {
				best = p;
				bestcost = cost;
			}
		}
	}
	if (best == NULL)
		return (NULL);

	TRACEPOINT(sched, steal, best->p_tid + THREAD_PID_OFFSET,
	    best->p_p->ps_pid, CPU_INFO_UNIT(self));

	remrunqueue(best);
	best->p_cpu = self;

	sched_stolen++;
#endif
	return (best);
}

#ifdef MULTIPROCESSOR
/*
 * Base 2 logarithm of an int. returns 0 for 0 (yeye, I know).
 */
static int
log2(unsigned int i)
{
	int ret = 0;

	while (i >>= 1)
		ret++;

	return (ret);
}

/*
 * Calculate the cost of moving the proc to this cpu.
 *
 * What we want is some guesstimate of how much "performance" it will
 * cost us to move the proc here. Not just for caches and TLBs and NUMA
 * memory, but also for the proc itself. A highly loaded cpu might not
 * be the best candidate for this proc since it won't get run.
 *
 * Just total guesstimates for now.
 */
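
/*
 * For example (numbers purely illustrative): with the default weights
 * below, moving a thread with p_usrpri 50 to a non-idle cpu running at
 * priority 30 with two threads queued costs (50 - 30) * 1 + 3 + 2 * 3 = 29,
 * while a thread that last ran on that cpu with about 2^10 resident pages
 * gets log2(1024) * 1 = 10 subtracted again.
 */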

int sched_cost_priority = 1;
int sched_cost_runnable = 3;
int sched_cost_resident = 1;
#endif

int
sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
{
	int cost = 0;
#ifdef MULTIPROCESSOR
	struct schedstate_percpu *spc;
	int l2resident = 0;

	spc = &ci->ci_schedstate;

	/*
	 * First, account for the priority of the proc we want to move.
	 * More willing to move, the lower the priority of the destination
	 * and the higher the priority of the proc.
	 */
	if (!cpuset_isset(&sched_idle_cpus, ci)) {
		cost += (p->p_usrpri - spc->spc_curpriority) *
		    sched_cost_priority;
		cost += sched_cost_runnable;
	}
	if (cpuset_isset(&sched_queued_cpus, ci))
		cost += spc->spc_nrun * sched_cost_runnable;

	/*
	 * Try to avoid the primary cpu as it handles hardware interrupts.
	 *
	 * XXX Needs to be revisited when we distribute interrupts
	 * over cpus.
	 */
	if (CPU_IS_PRIMARY(ci))
		cost += sched_cost_runnable;

	/*
	 * If the proc is on this cpu already, lower the cost by how much
	 * it has been running and an estimate of its footprint.
	 */
	if (p->p_cpu == ci && p->p_slptime == 0) {
		l2resident =
		    log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
		cost -= l2resident * sched_cost_resident;
	}
#endif
	return (cost);
}

/*
 * Peg a proc to a cpu.
 */
void
sched_peg_curproc(struct cpu_info *ci)
{
	struct proc *p = curproc;

	SCHED_LOCK();
	atomic_setbits_int(&p->p_flag, P_CPUPEG);
	setrunqueue(ci, p, p->p_usrpri);
	p->p_ru.ru_nvcsw++;
	mi_switch();
	SCHED_UNLOCK();
}

void
sched_unpeg_curproc(void)
{
	struct proc *p = curproc;

	atomic_clearbits_int(&p->p_flag, P_CPUPEG);
}

#ifdef MULTIPROCESSOR

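/*
 * Bringing secondary cpus in and out of the scheduler:
 * sched_start_secondary_cpus() clears any halt flags and adds the cpus
 * back to sched_all_cpus (subject to the SMT policy), while
 * sched_stop_secondary_cpus() removes them and waits for each idle loop
 * to acknowledge SPCF_SHOULDHALT with SPCF_HALTED.
 */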
void
sched_start_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
			continue;
		atomic_clearbits_int(&spc->spc_schedflags,
		    SPCF_SHOULDHALT | SPCF_HALTED);
#ifdef __HAVE_CPU_TOPOLOGY
		if (!sched_smt && ci->ci_smt_id > 0)
			continue;
#endif
		cpuset_add(&sched_all_cpus, ci);
	}
}

void
sched_stop_secondary_cpus(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;

	/*
	 * Make sure we stop the secondary CPUs.
	 */
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
			continue;
		cpuset_del(&sched_all_cpus, ci);
		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
	}
	CPU_INFO_FOREACH(cii, ci) {
		struct schedstate_percpu *spc = &ci->ci_schedstate;

		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
			continue;
		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
			sleep_setup(spc, PZERO, "schedstate");
			sleep_finish(0,
			    (spc->spc_schedflags & SPCF_HALTED) == 0);
		}
	}
}

struct sched_barrier_state {
	struct cpu_info *ci;
	struct cond cond;
};

void
sched_barrier_task(void *arg)
{
	struct sched_barrier_state *sb = arg;
	struct cpu_info *ci = sb->ci;

	sched_peg_curproc(ci);
	cond_signal(&sb->cond);
	sched_unpeg_curproc();
}

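/*
 * Make sure the given cpu (the primary cpu if ci is NULL) has gone through
 * a context switch by queueing a task that pegs itself to that cpu and
 * waiting for it to signal us.
 */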
void
sched_barrier(struct cpu_info *ci)
{
	struct sched_barrier_state sb;
	struct task task;
	CPU_INFO_ITERATOR cii;

	if (ci == NULL) {
		CPU_INFO_FOREACH(cii, ci) {
			if (CPU_IS_PRIMARY(ci))
				break;
		}
	}
	KASSERT(ci != NULL);

	if (ci == curcpu())
		return;

	sb.ci = ci;
	cond_init(&sb.cond);
	task_set(&task, sched_barrier_task, &sb);

	task_add(systqmp, &task);
	cond_wait(&sb.cond, "sbar");
}

#else

void
sched_barrier(struct cpu_info *ci)
{
}

#endif

/*
 * Functions to manipulate cpu sets.
 */
struct cpu_info *cpuset_infos[MAXCPUS];
static struct cpuset cpuset_all;

void
cpuset_init_cpu(struct cpu_info *ci)
{
	cpuset_add(&cpuset_all, ci);
	cpuset_infos[CPU_INFO_UNIT(ci)] = ci;
}

void
cpuset_clear(struct cpuset *cs)
{
	memset(cs, 0, sizeof(*cs));
}

void
cpuset_add(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_setbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
}

void
cpuset_del(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	atomic_clearbits_int(&cs->cs_set[num/32], (1U << (num % 32)));
}

int
cpuset_isset(struct cpuset *cs, struct cpu_info *ci)
{
	unsigned int num = CPU_INFO_UNIT(ci);
	return (cs->cs_set[num/32] & (1U << (num % 32)));
}

void
cpuset_add_all(struct cpuset *cs)
{
	cpuset_copy(cs, &cpuset_all);
}

void
cpuset_copy(struct cpuset *to, struct cpuset *from)
{
	memcpy(to, from, sizeof(*to));
}

struct cpu_info *
cpuset_first(struct cpuset *cs)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		if (cs->cs_set[i])
			return (cpuset_infos[i * 32 + ffs(cs->cs_set[i]) - 1]);

	return (NULL);
}

void
cpuset_union(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] | b->cs_set[i];
}

void
cpuset_intersection(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = a->cs_set[i] & b->cs_set[i];
}

void
cpuset_complement(struct cpuset *to, struct cpuset *a, struct cpuset *b)
{
	int i;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		to->cs_set[i] = b->cs_set[i] & ~a->cs_set[i];
}

int
cpuset_cardinality(struct cpuset *cs)
{
	int cardinality, i, n;

	cardinality = 0;

	for (i = 0; i < CPUSET_ASIZE(ncpus); i++)
		for (n = cs->cs_set[i]; n != 0; n &= n - 1)
			cardinality++;

	return (cardinality);
}

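/*
 * Report the number of cpus currently available to the scheduler, i.e.
 * the cardinality of sched_all_cpus.
 */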
int
sysctl_hwncpuonline(void)
{
	return cpuset_cardinality(&sched_all_cpus);
}

int
cpu_is_online(struct cpu_info *ci)
{
	return cpuset_isset(&sched_all_cpus, ci);
}

#ifdef __HAVE_CPU_TOPOLOGY

#include <sys/sysctl.h>

int
sysctl_hwsmt(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	int err, newsmt;

	newsmt = sched_smt;
	err = sysctl_int_bounded(oldp, oldlenp, newp, newlen, &newsmt, 0, 1);
	if (err)
		return err;
	if (newsmt == sched_smt)
		return 0;

	sched_smt = newsmt;
	CPU_INFO_FOREACH(cii, ci) {
		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
			continue;
		if (ci->ci_smt_id == 0)
			continue;
		if (sched_smt)
			cpuset_add(&sched_all_cpus, ci);
		else
			cpuset_del(&sched_all_cpus, ci);
	}

	return 0;
}

#endif