/*	$OpenBSD: sched_bsd.c,v 1.9 2006/11/29 12:24:18 miod Exp $	*/
/*	$NetBSD: kern_synch.c,v 1.37 1996/04/22 01:38:37 christos Exp $	*/

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.6 (Berkeley) 1/21/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/buf.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <uvm/uvm_extern.h>
#include <sys/sched.h>
#include <sys/timeout.h>

#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

#ifndef __HAVE_CPUINFO
u_char	curpriority;		/* usrpri of curproc */
#endif
int	lbolt;			/* once a second sleep address */
#ifdef __HAVE_CPUINFO
int	rrticks_init;		/* # of hardclock ticks per roundrobin() */
#endif

int whichqs;			/* Bit mask summary of non-empty Q's. */
struct prochd qs[NQS];
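
/*
 * Descriptive note on the declarations above: qs[] is the classic
 * 4.4BSD run queue, NQS doubly-linked lists each headed by a struct
 * prochd and each holding the runnable processes of one PPQ-wide
 * priority band.  Bit i of whichqs is set whenever qs[i] is non-empty,
 * which lets the machine-dependent cpu_switch() locate the
 * highest-priority non-empty queue quickly (typically with a
 * find-first-bit operation).  See rqinit() below for how the queues
 * are initialized.
 */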

struct SIMPLELOCK sched_lock;

void scheduler_start(void);

#ifdef __HAVE_CPUINFO
void roundrobin(struct cpu_info *);
#else
void roundrobin(void *);
#endif
void schedcpu(void *);
void updatepri(struct proc *);
void endtsleep(void *);

void
scheduler_start(void)
{
#ifndef __HAVE_CPUINFO
	static struct timeout roundrobin_to;
#endif
	static struct timeout schedcpu_to;

	/*
	 * We avoid polluting the global namespace by keeping the scheduler
	 * timeouts static in this function.
	 * We setup the timeouts here and kick schedcpu and roundrobin once to
	 * make them do their job.
	 */

#ifndef __HAVE_CPUINFO
	timeout_set(&roundrobin_to, roundrobin, &roundrobin_to);
#endif
	timeout_set(&schedcpu_to, schedcpu, &schedcpu_to);

#ifdef __HAVE_CPUINFO
	rrticks_init = hz / 10;
#else
	roundrobin(&roundrobin_to);
#endif
	schedcpu(&schedcpu_to);
}

/*
 * Force switch among equal priority processes every 100ms.
 */
/* ARGSUSED */
#ifdef __HAVE_CPUINFO
void
roundrobin(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	int s;

	spc->spc_rrticks = rrticks_init;

	if (curproc != NULL) {
		s = splstatclock();
		if (spc->spc_schedflags & SPCF_SEENRR) {
			/*
			 * The process has already been through a roundrobin
			 * without switching and may be hogging the CPU.
			 * Indicate that the process should yield.
			 */
			spc->spc_schedflags |= SPCF_SHOULDYIELD;
		} else {
			spc->spc_schedflags |= SPCF_SEENRR;
		}
		splx(s);
	}

	need_resched(curcpu());
}
#else
void
roundrobin(void *arg)
{
	struct timeout *to = (struct timeout *)arg;
	struct proc *p = curproc;
	int s;

	if (p != NULL) {
		s = splstatclock();
		if (p->p_schedflags & PSCHED_SEENRR) {
			/*
			 * The process has already been through a roundrobin
			 * without switching and may be hogging the CPU.
			 * Indicate that the process should yield.
			 */
			p->p_schedflags |= PSCHED_SHOULDYIELD;
		} else {
			p->p_schedflags |= PSCHED_SEENRR;
		}
		splx(s);
	}

	need_resched(NULL);
	timeout_add(to, hz / 10);
}
#endif
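
/*
 * A note on the round-robin flags used above: the first roundrobin()
 * pass that finds a process still on the CPU only marks it with
 * SPCF_SEENRR (PSCHED_SEENRR without __HAVE_CPUINFO); only the second
 * consecutive pass adds SPCF_SHOULDYIELD to indicate that it should
 * yield.  A process that keeps the CPU is therefore asked to yield
 * after holding it for one to two of the 100ms intervals.  These flags
 * are cleared again in mi_switch() (SPCF_SWITCHCLEAR /
 * PSCHED_SWITCHCLEAR) once the process actually gives up the CPU.
 */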

/*
 * Constants for digital decay and forget:
 *	90% of (p_estcpu) usage in 5 * loadav time
 *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 *	Note that, as ps(1) mentions, this can let percentages
 *	total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that hardclock updates p_estcpu and p_cpticks independently.
 *
 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 * 	for (i = 0; i < (5 * loadavg); i++)
 * 		p_estcpu *= decay;
 * will compute
 * 	p_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 * 	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 * 	b = 2 * loadavg
 * then
 * 	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *	For x close to zero, exp(x) =~ 1 + x, since
 *		exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *		therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *	For x close to zero, ln(1+x) =~ x, since
 *		ln(1+x) = x - x**2/2 + x**3/3 - ...	-1 < x < 1
 *		therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *	ln(.1) =~ -2.30
 *
 * Proof of (1):
 *	Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	    solving for factor,
 *	    ln(factor) =~ (-2.30/5*loadav), or
 *	    factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *	    exp(-1/b) =~ (b-1)/b =~ b/(b+1).			QED
 *
 * Proof of (2):
 *	Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	    solving for power,
 *	    power*ln(b/(b+1)) =~ -2.30, or
 *	    power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.	QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *	loadav:	1	2	3	4
 *	power:	5.68	10.32	14.94	19.55
 */

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11

/*
 * Recompute process priorities, every hz ticks.
 */
/* ARGSUSED */
void
schedcpu(void *arg)
{
	struct timeout *to = (struct timeout *)arg;
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct proc *p;
	int s;
	unsigned int newcpu;
	int phz;

	/*
	 * If we have a statistics clock, use that to calculate CPU
	 * time, otherwise revert to using the profiling clock (which,
	 * in turn, defaults to hz if there is no separate profiling
	 * clock available)
	 */
	phz = stathz ? stathz : profhz;
	KASSERT(phz);

	for (p = LIST_FIRST(&allproc); p != NULL; p = LIST_NEXT(p, p_list)) {
		/*
		 * Increment time in/out of memory and sleep time
		 * (if sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		p->p_swtime++;
		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
			p->p_slptime++;
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (p->p_slptime > 1)
			continue;
		s = splstatclock();	/* prevent state changes */
		/*
		 * p_pctcpu is only for ps.
		 */
#if	(FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (phz == 100)?
		    ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
		    100 * (((fixpt_t) p->p_cpticks)
		    << (FSHIFT - CCPU_SHIFT)) / phz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
		    (p->p_cpticks * FSCALE / phz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu);
		p->p_estcpu = newcpu;
		splx(s);
		SCHED_LOCK(s);
		resetpriority(p);
		if (p->p_priority >= PUSER) {
			if ((p != curproc) &&
			    p->p_stat == SRUN &&
			    (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
				remrunqueue(p);
				p->p_priority = p->p_usrpri;
				setrunqueue(p);
			} else
				p->p_priority = p->p_usrpri;
		}
		SCHED_UNLOCK(s);
	}
	uvm_meter();
	wakeup(&lbolt);
	timeout_add(to, hz);
}
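
/*
 * Worked example of the decay above (an illustrative sketch only, never
 * compiled; it assumes a steady load average of 1, i.e. a loadfac of
 * 2 * FSCALE, so each schedcpu() pass multiplies p_estcpu by 2/3): a
 * saturated estimate of 255 decays as 255, 170, 113, 75, 50, 33, 22,
 * 14, 9, 6, 4, 2, 1, 0, reaching zero after 13 one-second passes.
 * Likewise, the per-second multiplication of p_pctcpu by
 * ccpu == exp(-1/20) leaves exp(-60/20), roughly 5%, of the old value
 * after 60 seconds, which is the "decay 95% in 60 seconds" behaviour
 * advertised above.
 */
#if 0
static int
decay_estcpu_sketch(unsigned int estcpu)	/* hypothetical helper */
{
	fixpt_t loadfac = loadfactor(1 * FSCALE);	/* loadav == 1 */
	int passes = 0;

	/* Apply the per-second decay until the estimate is forgotten. */
	while (estcpu != 0) {
		estcpu = (unsigned int)decay_cpu(loadfac, estcpu);
		passes++;
	}
	return (passes);	/* 13 when called with estcpu == 255 */
}
#endif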

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay p_estcpu to zero.
 */
void
updatepri(struct proc *p)
{
	unsigned int newcpu = p->p_estcpu;
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);

	SCHED_ASSERT_LOCKED();

	if (p->p_slptime > 5 * loadfac)
		p->p_estcpu = 0;
	else {
		p->p_slptime--;	/* the first time was done in schedcpu */
		while (newcpu && --p->p_slptime)
			newcpu = (int) decay_cpu(loadfac, newcpu);
		p->p_estcpu = newcpu;
	}
	resetpriority(p);
}

#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
void
sched_unlock_idle(void)
{
	SIMPLE_UNLOCK(&sched_lock);
}

void
sched_lock_idle(void)
{
	SIMPLE_LOCK(&sched_lock);
}
#endif /* MULTIPROCESSOR || LOCKDEBUG */

/*
 * General yield call.  Puts the current process back on its run queue and
 * performs a voluntary context switch.
 */
void
yield(void)
{
	struct proc *p = curproc;
	int s;

	SCHED_LOCK(s);
	p->p_priority = p->p_usrpri;
	p->p_stat = SRUN;
	setrunqueue(p);
	p->p_stats->p_ru.ru_nvcsw++;
	mi_switch();
	SCHED_UNLOCK(s);
}

/*
 * General preemption call.  Puts the current process back on its run queue
 * and performs an involuntary context switch.  If a process is supplied,
 * we switch to that process.  Otherwise, we use the normal process selection
 * criteria.
 */
void
preempt(struct proc *newp)
{
	struct proc *p = curproc;
	int s;

	/*
	 * XXX Switching to a specific process is not supported yet.
	 */
	if (newp != NULL)
		panic("preempt: cpu_preempt not yet implemented");

	SCHED_LOCK(s);
	p->p_priority = p->p_usrpri;
	p->p_stat = SRUN;
	setrunqueue(p);
	p->p_stats->p_ru.ru_nivcsw++;
	mi_switch();
	SCHED_UNLOCK(s);
}


/*
 * Must be called at splstatclock() or higher.
 */
void
mi_switch(void)
{
	struct proc *p = curproc;	/* XXX */
	struct rlimit *rlim;
	struct timeval tv;
#if defined(MULTIPROCESSOR)
	int hold_count;
	int sched_count;
#endif
#ifdef __HAVE_CPUINFO
	struct schedstate_percpu *spc = &p->p_cpu->ci_schedstate;
#endif

	SCHED_ASSERT_LOCKED();

#if defined(MULTIPROCESSOR)
	/*
	 * Release the kernel_lock, as we are about to yield the CPU.
	 * The scheduler lock is still held until cpu_switch()
	 * selects a new process and removes it from the run queue.
	 */
	sched_count = __mp_release_all_but_one(&sched_lock);
	if (p->p_flag & P_BIGLOCK)
		hold_count = __mp_release_all(&kernel_lock);
#endif

	/*
	 * Compute the amount of time during which the current
	 * process was running, and add that to its total so far.
	 * XXX - use microuptime here to avoid strangeness.
	 */
	microuptime(&tv);
#ifdef __HAVE_CPUINFO
	if (timercmp(&tv, &spc->spc_runtime, <)) {
#if 0
		printf("uptime is not monotonic! "
		    "tv=%lu.%06lu, runtime=%lu.%06lu\n",
		    tv.tv_sec, tv.tv_usec, spc->spc_runtime.tv_sec,
		    spc->spc_runtime.tv_usec);
#endif
	} else {
		timersub(&tv, &spc->spc_runtime, &tv);
		timeradd(&p->p_rtime, &tv, &p->p_rtime);
	}
#else
	if (timercmp(&tv, &runtime, <)) {
#if 0
		printf("uptime is not monotonic! "
" 464 "tv=%lu.%06lu, runtime=%lu.%06lu\n", 465 tv.tv_sec, tv.tv_usec, runtime.tv_sec, runtime.tv_usec); 466 #endif 467 } else { 468 timersub(&tv, &runtime, &tv); 469 timeradd(&p->p_rtime, &tv, &p->p_rtime); 470 } 471 #endif 472 473 /* 474 * Check if the process exceeds its cpu resource allocation. 475 * If over max, kill it. 476 */ 477 rlim = &p->p_rlimit[RLIMIT_CPU]; 478 if ((rlim_t)p->p_rtime.tv_sec >= rlim->rlim_cur) { 479 if ((rlim_t)p->p_rtime.tv_sec >= rlim->rlim_max) { 480 psignal(p, SIGKILL); 481 } else { 482 psignal(p, SIGXCPU); 483 if (rlim->rlim_cur < rlim->rlim_max) 484 rlim->rlim_cur += 5; 485 } 486 } 487 488 /* 489 * Process is about to yield the CPU; clear the appropriate 490 * scheduling flags. 491 */ 492 #ifdef __HAVE_CPUINFO 493 spc->spc_schedflags &= ~SPCF_SWITCHCLEAR; 494 #else 495 p->p_schedflags &= ~PSCHED_SWITCHCLEAR; 496 #endif 497 498 /* 499 * Pick a new current process and record its start time. 500 */ 501 uvmexp.swtch++; 502 cpu_switch(p); 503 504 /* 505 * Make sure that MD code released the scheduler lock before 506 * resuming us. 507 */ 508 SCHED_ASSERT_UNLOCKED(); 509 510 /* 511 * We're running again; record our new start time. We might 512 * be running on a new CPU now, so don't use the cache'd 513 * schedstate_percpu pointer. 514 */ 515 #ifdef __HAVE_CPUINFO 516 KDASSERT(p->p_cpu != NULL); 517 KDASSERT(p->p_cpu == curcpu()); 518 microuptime(&p->p_cpu->ci_schedstate.spc_runtime); 519 #else 520 microuptime(&runtime); 521 #endif 522 523 #if defined(MULTIPROCESSOR) 524 /* 525 * Reacquire the kernel_lock now. We do this after we've 526 * released the scheduler lock to avoid deadlock, and before 527 * we reacquire the interlock and the scheduler lock. 528 */ 529 if (p->p_flag & P_BIGLOCK) 530 __mp_acquire_count(&kernel_lock, hold_count); 531 __mp_acquire_count(&sched_lock, sched_count + 1); 532 #endif 533 } 534 535 /* 536 * Initialize the (doubly-linked) run queues 537 * to be empty. 538 */ 539 void 540 rqinit(void) 541 { 542 int i; 543 544 for (i = 0; i < NQS; i++) 545 qs[i].ph_link = qs[i].ph_rlink = (struct proc *)&qs[i]; 546 SIMPLE_LOCK_INIT(&sched_lock); 547 } 548 549 static __inline void 550 resched_proc(struct proc *p, u_char pri) 551 { 552 #ifdef __HAVE_CPUINFO 553 struct cpu_info *ci; 554 #endif 555 556 /* 557 * XXXSMP 558 * Since p->p_cpu persists across a context switch, 559 * this gives us *very weak* processor affinity, in 560 * that we notify the CPU on which the process last 561 * ran that it should try to switch. 562 * 563 * This does not guarantee that the process will run on 564 * that processor next, because another processor might 565 * grab it the next time it performs a context switch. 566 * 567 * This also does not handle the case where its last 568 * CPU is running a higher-priority process, but every 569 * other CPU is running a lower-priority process. There 570 * are ways to handle this situation, but they're not 571 * currently very pretty, and we also need to weigh the 572 * cost of moving a process from one CPU to another. 573 * 574 * XXXSMP 575 * There is also the issue of locking the other CPU's 576 * sched state, which we currently do not do. 577 */ 578 #ifdef __HAVE_CPUINFO 579 ci = (p->p_cpu != NULL) ? p->p_cpu : curcpu(); 580 if (pri < ci->ci_schedstate.spc_curpriority) 581 need_resched(ci); 582 #else 583 if (pri < curpriority) 584 need_resched(NULL); 585 #endif 586 } 587 588 /* 589 * Change process state to be runnable, 590 * placing it on the run queue if it is in memory, 591 * and awakening the swapper if it isn't in memory. 
 */
void
setrunnable(struct proc *p)
{
	SCHED_ASSERT_LOCKED();

	switch (p->p_stat) {
	case 0:
	case SRUN:
	case SONPROC:
	case SZOMB:
	case SDEAD:
	default:
		panic("setrunnable");
	case SSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_flag & P_TRACED) != 0 && p->p_xstat != 0)
			p->p_siglist |= sigmask(p->p_xstat);
	case SSLEEP:
		unsleep(p);		/* e.g. when sending signals */
		break;
	case SIDL:
		break;
	}
	p->p_stat = SRUN;
	setrunqueue(p);
	if (p->p_slptime > 1)
		updatepri(p);
	p->p_slptime = 0;
	resched_proc(p, p->p_priority);
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
void
resetpriority(struct proc *p)
{
	unsigned int newpriority;

	SCHED_ASSERT_LOCKED();

	newpriority = PUSER + p->p_estcpu + NICE_WEIGHT * (p->p_nice - NZERO);
	newpriority = min(newpriority, MAXPRI);
	p->p_usrpri = newpriority;
	resched_proc(p, p->p_usrpri);
}

/*
 * We adjust the priority of the current process.  The priority of a process
 * gets worse as it accumulates CPU time.  The cpu usage estimator (p_estcpu)
 * is increased here.  The formula for computing priorities (in kern_synch.c)
 * will compute a different value each time p_estcpu increases. This can
 * cause a switch, but unless the priority crosses a PPQ boundary the actual
 * queue will not change.  The cpu usage estimator ramps up quite quickly
 * when the process is running (linearly), and decays away exponentially, at
 * a rate which is proportionally slower when the system is busy.  The basic
 * principle is that the system will 90% forget that the process used a lot
 * of CPU time in 5 * loadav seconds.  This causes the system to favor
 * processes which haven't run much recently, and to round-robin among other
 * processes.
 */

void
schedclock(struct proc *p)
{
	int s;

	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
	SCHED_LOCK(s);
	resetpriority(p);
	SCHED_UNLOCK(s);
	if (p->p_priority >= PUSER)
		p->p_priority = p->p_usrpri;
}
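
/*
 * A short descriptive summary of how the pieces above interact:
 * schedclock() charges one point of p_estcpu each time it is called
 * while the process runs, resetpriority() turns that estimate (plus
 * the nice weighting) into p_usrpri, clamped at MAXPRI, and
 * schedcpu()/updatepri() decay the estimate again over time.  Because
 * the run queues are PPQ priorities wide, a process must gain or lose
 * a full PPQ worth of p_estcpu, relative to the decay, before it
 * actually moves to a different queue, so small changes in the
 * estimate do not cause constant queue migration.
 */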