/*	$NetBSD: sched_4bsd.c,v 1.45 2021/08/09 19:57:57 andvar Exp $	*/

/*
 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2019, 2020
 *	The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran, and
 * Daniel Sieger.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sched_4bsd.c,v 1.45 2021/08/09 19:57:57 andvar Exp $");

#include "opt_ddb.h"
#include "opt_lockdebug.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/cpu.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/lockdebug.h>
#include <sys/intr.h>
#include <sys/atomic.h>

static void updatepri(struct lwp *);
static void resetpriority(struct lwp *);

extern unsigned int sched_pstats_ticks; /* defined in kern_synch.c */

/* Number of hardclock ticks per sched_tick() */
u_int sched_rrticks __read_mostly;

/*
 * Force a switch among equal priority processes every 100ms.
 * Called from hardclock() every sched_rrticks (== hz/10) hardclock ticks.
 */
/* ARGSUSED */
void
sched_tick(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	pri_t pri = PRI_NONE;
	lwp_t *l;

	spc->spc_ticks = sched_rrticks;

	if (CURCPU_IDLE_P()) {
		spc_lock(ci);
		sched_resched_cpu(ci, MAXPRI_KTHREAD, true);
		/* spc now unlocked */
		return;
	}
	l = ci->ci_onproc;
	if (l == NULL) {
		return;
	}
	/*
	 * Can only be spc_lwplock or a turnstile lock at this point
	 * (if we interrupted priority inheritance trylock dance).
	 */
	KASSERT(l->l_mutex != spc->spc_mutex);
	switch (l->l_class) {
	case SCHED_FIFO:
		/* No timeslicing for FIFO jobs. */
		break;
	case SCHED_RR:
		/* Force it into mi_switch() to look for other jobs to run. */
		pri = MAXPRI_KERNEL_RT;
		break;
	default:
		if (spc->spc_flags & SPCF_SHOULDYIELD) {
			/*
			 * Process is stuck in kernel somewhere, probably
			 * due to buggy or inefficient code.  Force a
			 * kernel preemption.
			 */
			pri = MAXPRI_KERNEL_RT;
		} else if (spc->spc_flags & SPCF_SEENRR) {
			/*
			 * The process has already been through a round-robin
			 * without switching and may be hogging the CPU.
			 * Indicate that the process should yield.
			 */
			pri = MAXPRI_KTHREAD;
			spc->spc_flags |= SPCF_SHOULDYIELD;
		} else if ((spc->spc_flags & SPCF_1STCLASS) == 0) {
			/*
			 * For SMT or asymmetric systems push a little
			 * harder: if this is not a 1st class CPU, try to
			 * find a better one to run this LWP.
			 */
			pri = MAXPRI_KTHREAD;
			spc->spc_flags |= SPCF_SHOULDYIELD;
		} else {
			spc->spc_flags |= SPCF_SEENRR;
		}
		break;
	}

	if (pri != PRI_NONE) {
		spc_lock(ci);
		sched_resched_cpu(ci, pri, true);
		/* spc now unlocked */
	}
}
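
/*
 * Added illustration (not in the original source): for a CPU-bound
 * SCHED_OTHER LWP on a 1st class CPU, successive sched_tick() calls
 * (one every sched_rrticks ticks, i.e. roughly every 100ms) escalate
 * as follows, assuming the SPCF_SEENRR/SPCF_SHOULDYIELD bits are only
 * cleared once the LWP is eventually switched out:
 *
 *	1st tick:  SPCF_SEENRR is set, no reschedule is requested
 *	2nd tick:  SPCF_SHOULDYIELD is set, reschedule at MAXPRI_KTHREAD
 *	3rd tick+: reschedule at MAXPRI_KERNEL_RT (forced kernel preemption)
 */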

/*
 * Why PRIO_MAX - 2? From setpriority(2):
 *
 *	prio is a value in the range -20 to 20.  The default priority is
 *	0; lower priorities cause more favorable scheduling.  A value of
 *	19 or 20 will schedule a process only when nothing at priority <=
 *	0 is runnable.
 *
 * This gives estcpu influence over 18 priority levels, and leaves nice
 * with 40 levels.  One way to think about it is that nice has 20 levels
 * either side of estcpu's 18.
 */
#define	ESTCPU_SHIFT	11
#define	ESTCPU_MAX	((PRIO_MAX - 2) << ESTCPU_SHIFT)
#define	ESTCPU_ACCUM	(1 << (ESTCPU_SHIFT - 1))
#define	ESTCPULIM(e)	uimin((e), ESTCPU_MAX)

/*
 * The main parameter used by this algorithm is 'l_estcpu'. It is an estimate
 * of the recent CPU utilization of the thread.
 *
 * l_estcpu is:
 *  - increased each time the hardclock ticks and the thread is found to
 *    be executing, in sched_schedclock() called from hardclock()
 *  - decreased (filtered) on each sched tick, in sched_pstats_hook()
 * If the lwp is sleeping for more than a second, we don't touch l_estcpu: it
 * will be updated in sched_setrunnable() when the lwp wakes up, in burst mode
 * (i.e., we decrease it n times).
 *
 * Note that hardclock updates l_estcpu and l_cpticks independently.
 *
 * -----------------------------------------------------------------------------
 *
 * Here we describe how l_estcpu is decreased.
 *
 * Constants for digital decay (filter):
 *	90% of l_estcpu usage in (5 * loadavg) seconds
 *
 * We wish to decay away 90% of l_estcpu in (5 * loadavg) seconds. That is, we
 * want to compute a value of decay such that the following loop:
 *	for (i = 0; i < (5 * loadavg); i++)
 *		l_estcpu *= decay;
 * will result in
 *	l_estcpu *= 0.1;
 * for all values of loadavg.
 *
 * Mathematically this loop can be expressed by saying:
 *	decay ** (5 * loadavg) ~= .1
 *
 * And finally, the corresponding value of decay we're using is:
 *	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * -----------------------------------------------------------------------------
 *
 * Now, let's prove that the value of decay stated above will always fulfill
 * the equation:
 *	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 *	b = 2 * loadavg
 * then
 *	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given [factor ** (5 * loadavg) =~ .1], prove [factor == b/(b+1)].
 *	2) Given [(b/(b+1)) ** power =~ .1], prove [power == (5 * loadavg)].
 *
 * Facts:
 *	* For x real: exp(x) = 0! + x**1/1! + x**2/2! + ...
 *	  Therefore, for x close to zero, exp(x) =~ 1 + x.
 *	  In turn, for b large enough, exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *
 *	* For b large enough, (b-1)/b =~ b/(b+1).
 *
 *	* For x belonging to [-1;1[, ln(1-x) = - x - x**2/2 - x**3/3 - ...
 *	  Therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *
 *	* ln(0.1) =~ -2.30
 *
 * Proof of (1):
 *	factor ** (5 * loadavg) =~ 0.1
 *   => ln(factor) =~ -2.30 / (5 * loadavg)
 *   => factor =~ exp(-1 / ((5 / 2.30) * loadavg))
 *	       =~ exp(-1 / (2 * loadavg))
 *	       =~ exp(-1 / b)
 *	       =~ (b - 1) / b
 *	       =~ b / (b + 1)
 *	       =~ (2 * loadavg) / ((2 * loadavg) + 1)
 *
 * Proof of (2):
 *	(b / (b + 1)) ** power =~ .1
 *   => power * ln(b / (b + 1)) =~ -2.30
 *   => power * (-1 / (b + 1)) =~ -2.30
 *   => power =~ 2.30 * (b + 1)
 *   => power =~ 4.60 * loadavg + 2.30
 *   => power =~ 5 * loadavg
 *
 * Conclusion: decay = (2 * loadavg) / (2 * loadavg + 1)
 */
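
/*
 * Added worked example (not in the original source): with a steady load
 * average of 2, b = 2 * loadavg = 4 and decay = 4 / 5 = 0.8.  Applying
 * the decay once per second for 5 * loadavg = 10 seconds gives
 * 0.8 ** 10 =~ 0.107, i.e. roughly 90% of l_estcpu is forgotten, as the
 * derivation above predicts.
 */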

/* See calculations above */
#define	loadfactor(loadavg)	(2 * (loadavg))

static fixpt_t
decay_cpu(fixpt_t loadfac, fixpt_t estcpu)
{

	if (estcpu == 0) {
		return 0;
	}

#if !defined(_LP64)
	/* avoid 64-bit arithmetic. */
#define	FIXPT_MAX ((fixpt_t)((UINTMAX_C(1) << sizeof(fixpt_t) * CHAR_BIT) - 1))
	if (__predict_true(loadfac <= FIXPT_MAX / ESTCPU_MAX)) {
		return estcpu * loadfac / (loadfac + FSCALE);
	}
#endif

	return (uint64_t)estcpu * loadfac / (loadfac + FSCALE);
}

static fixpt_t
decay_cpu_batch(fixpt_t loadfac, fixpt_t estcpu, unsigned int n)
{

	/*
	 * For all load averages >= 1 and max l_estcpu of (255 << ESTCPU_SHIFT),
	 * if we slept for at least seven times the loadfactor, we will decay
	 * l_estcpu to less than (1 << ESTCPU_SHIFT), and therefore we can
	 * return zero directly.
	 *
	 * Note that our ESTCPU_MAX is actually much smaller than
	 * (255 << ESTCPU_SHIFT).
	 */
	if ((n << FSHIFT) >= 7 * loadfac) {
		return 0;
	}

	while (estcpu != 0 && n > 1) {
		estcpu = decay_cpu(loadfac, estcpu);
		n--;
	}

	return estcpu;
}

/*
 * sched_pstats_hook:
 *
 *	Periodically called from sched_pstats(); used to recalculate priorities.
 */
void
sched_pstats_hook(struct lwp *l, int batch)
{
	fixpt_t loadfac;

	/*
	 * If the LWP has slept an entire second, stop recalculating
	 * its priority until it wakes up.
	 */
	KASSERT(lwp_locked(l, NULL));
	if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
	    l->l_stat == LSSUSPENDED) {
		if (l->l_slptime > 1) {
			return;
		}
	}

	loadfac = loadfactor(averunnable.ldavg[0]);
	l->l_estcpu = decay_cpu(loadfac, l->l_estcpu);
	resetpriority(l);
}

/*
 * Recalculate the priority of an LWP after it has slept for a while.
 */
static void
updatepri(struct lwp *l)
{
	fixpt_t loadfac;

	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_slptime > 1);

	loadfac = loadfactor(averunnable.ldavg[0]);

	l->l_slptime--;	/* the first time was done in sched_pstats */
	l->l_estcpu = decay_cpu_batch(loadfac, l->l_estcpu, l->l_slptime);
	resetpriority(l);
}

void
sched_rqinit(void)
{

}

void
sched_setrunnable(struct lwp *l)
{

	if (l->l_slptime > 1)
		updatepri(l);
}

void
sched_nice(struct proc *p, int n)
{
	struct lwp *l;

	KASSERT(mutex_owned(p->p_lock));

	p->p_nice = n;
	LIST_FOREACH(l, &p->p_lwps, l_sibling) {
		lwp_lock(l);
		resetpriority(l);
		lwp_unlock(l);
	}
}

/*
 * Recompute the priority of an LWP.  Arrange to reschedule if
 * the resulting priority is better than that of the current LWP.
 */
static void
resetpriority(struct lwp *l)
{
	pri_t pri;
	struct proc *p = l->l_proc;

	KASSERT(lwp_locked(l, NULL));

	if (l->l_class != SCHED_OTHER)
		return;

	/* See comments above ESTCPU_SHIFT definition. */
	pri = (PRI_KERNEL - 1) - (l->l_estcpu >> ESTCPU_SHIFT) - p->p_nice;
	pri = imax(pri, 0);
	if (pri != l->l_priority)
		lwp_changepri(l, pri);
}
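
/*
 * Added worked example (not in the original source; assumes the usual
 * NetBSD values PRI_KERNEL == 64 and a p_nice biased by NZERO == 20, so
 * nice 0 is stored as 20): a fresh LWP with l_estcpu == 0 and default
 * nice gets pri = (PRI_KERNEL - 1) - 0 - 20 = 43.  Each further
 * 1 << ESTCPU_SHIFT (2048) units of l_estcpu cost one priority level,
 * and ESTCPU_MAX caps that loss at PRIO_MAX - 2 == 18 levels, so CPU
 * usage alone can drag the same LWP no lower than pri 25; the nice
 * range accounts for the remaining 40 levels, and imax() keeps the
 * result non-negative.
 */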

/*
 * We adjust the priority of the current LWP.  The priority of an LWP
 * gets worse as it accumulates CPU time.  The CPU usage estimator (l_estcpu)
 * is increased here.  The formula for computing priorities will compute a
 * different value each time l_estcpu increases.  This can cause a switch,
 * but unless the priority crosses a PPQ boundary the actual queue will not
 * change.  The CPU usage estimator ramps up quite quickly when the process
 * is running (linearly), and decays away exponentially, at a rate which is
 * proportionally slower when the system is busy.  The basic principle is
 * that the system will 90% forget that the process used a lot of CPU time
 * in (5 * loadavg) seconds.  This causes the system to favor processes which
 * haven't run much recently, and to round-robin among other processes.
 */
void
sched_schedclock(struct lwp *l)
{

	if (l->l_class != SCHED_OTHER)
		return;

	KASSERT(!CURCPU_IDLE_P());
	l->l_estcpu = ESTCPULIM(l->l_estcpu + ESTCPU_ACCUM);
	lwp_lock(l);
	resetpriority(l);
	lwp_unlock(l);
}

/*
 * sched_proc_fork:
 *
 *	Inherit the parent's scheduler history.
 */
void
sched_proc_fork(struct proc *parent, struct proc *child)
{
	lwp_t *pl;

	KASSERT(mutex_owned(parent->p_lock));

	pl = LIST_FIRST(&parent->p_lwps);
	child->p_estcpu_inherited = pl->l_estcpu;
	child->p_forktime = sched_pstats_ticks;
}

/*
 * sched_proc_exit:
 *
 *	Chargeback parents for the sins of their children.
 */
void
sched_proc_exit(struct proc *parent, struct proc *child)
{
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	fixpt_t estcpu;
	lwp_t *pl, *cl;

	/* XXX Only if parent != init?? */

	mutex_enter(parent->p_lock);
	pl = LIST_FIRST(&parent->p_lwps);
	cl = LIST_FIRST(&child->p_lwps);
	estcpu = decay_cpu_batch(loadfac, child->p_estcpu_inherited,
	    sched_pstats_ticks - child->p_forktime);
	if (cl->l_estcpu > estcpu) {
		lwp_lock(pl);
		pl->l_estcpu = ESTCPULIM(pl->l_estcpu + cl->l_estcpu - estcpu);
		lwp_unlock(pl);
	}
	mutex_exit(parent->p_lock);
}
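
/*
 * Added worked example (not in the original source; FSHIFT/FSCALE are
 * the standard fixed-point load average scaling, nominally 11/2048):
 * with a load average of 1.0, loadfac = 2 * FSCALE = 4096.  If the
 * child lived for 20 sched_pstats ticks, (20 << FSHIFT) = 40960 is
 * >= 7 * loadfac = 28672, so decay_cpu_batch() above short-circuits to
 * zero: the inherited estimate has fully decayed, and the parent is
 * charged the child LWP's entire l_estcpu (clamped by ESTCPULIM).
 */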

void
sched_wakeup(struct lwp *l)
{

}

void
sched_slept(struct lwp *l)
{

}

void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{

	l2->l_estcpu = l1->l_estcpu;
}

void
sched_lwp_collect(struct lwp *t)
{
	lwp_t *l;

	/* Absorb estcpu value of collected LWP. */
	l = curlwp;
	lwp_lock(l);
	l->l_estcpu += t->l_estcpu;
	lwp_unlock(l);
}

void
sched_oncpu(lwp_t *l)
{

}

void
sched_newts(lwp_t *l)
{

}

/*
 * Sysctl nodes and initialization.
 */

static int
sysctl_sched_rtts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int rttsms = hztoms(sched_rrticks);

	node = *rnode;
	node.sysctl_data = &rttsms;
	return sysctl_lookup(SYSCTLFN_CALL(&node));
}

SYSCTL_SETUP(sysctl_sched_4bsd_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sched_rrticks = hz / 10;

	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_STRING, "name", NULL,
		NULL, 0, __UNCONST("4.4BSD"), 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_INT, "rtts",
		SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"),
		sysctl_sched_rtts, 0, NULL, 0,
		CTL_CREATE, CTL_EOL);
}
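
/*
 * Added usage note (not in the original source): the nodes created above
 * can be inspected from userland with sysctl(8), e.g. (illustrative
 * output, assuming hz == 100 so sched_rrticks == 10):
 *
 *	$ sysctl kern.sched.name kern.sched.rtts
 *	kern.sched.name = 4.4BSD
 *	kern.sched.rtts = 100
 *
 * where "rtts" reports hztoms(sched_rrticks), the round-robin quantum
 * in milliseconds.
 */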