/*	$NetBSD: kern_synch.c,v 1.136 2003/08/07 16:31:50 agc Exp $	*/

/*-
 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.136 2003/08/07 16:31:50 agc Exp $");

#include "opt_ddb.h"
#include "opt_ktrace.h"
#include "opt_kstack.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_perfctrs.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/buf.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sa.h>
#include <sys/savar.h>

#include <uvm/uvm_extern.h>

#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

int	lbolt;			/* once a second sleep address */
int	rrticks;		/* number of hardclock ticks per roundrobin() */

/*
 * The global scheduler state.
 */
struct prochd sched_qs[RUNQUE_NQS];	/* run queues */
__volatile u_int32_t sched_whichqs;	/* bitmap of non-empty queues */
struct slpque sched_slpque[SLPQUE_TABLESIZE]; /* sleep queues */

struct simplelock sched_lock = SIMPLELOCK_INITIALIZER;

void schedcpu(void *);
void updatepri(struct lwp *);
void endtsleep(void *);

__inline void awaken(struct lwp *);

struct callout schedcpu_ch = CALLOUT_INITIALIZER;

/*
 * Force switch among equal priority processes every 100ms.
 * Called from hardclock every hz/10 == rrticks hardclock ticks.
 */
/* ARGSUSED */
void
roundrobin(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;

	spc->spc_rrticks = rrticks;

	if (curlwp != NULL) {
		if (spc->spc_flags & SPCF_SEENRR) {
			/*
			 * The process has already been through a roundrobin
			 * without switching and may be hogging the CPU.
			 * Indicate that the process should yield.
			 */
			spc->spc_flags |= SPCF_SHOULDYIELD;
		} else
			spc->spc_flags |= SPCF_SEENRR;
	}
	need_resched(curcpu());
}

/*
 * Constants for digital decay and forget:
 *	90% of (p_estcpu) usage in 5 * loadav time
 *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 *	Note that, as ps(1) mentions, this can let percentages
 *	total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that hardclock updates p_estcpu and p_cpticks independently.
 *
 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 *	for (i = 0; i < (5 * loadavg); i++)
 *		p_estcpu *= decay;
 * will compute
 *	p_estcpu *= 0.1;
 * for all values of loadavg.
 *
 * Mathematically this loop can be expressed by saying:
 *	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 *	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 *	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 *	b = 2 * loadavg
 * then
 *	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *	For x close to zero, exp(x) =~ 1 + x, since
 *		exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *		therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *	For x close to zero, ln(1+x) =~ x, since
 *		ln(1+x) = x - x**2/2 + x**3/3 - ...	-1 < x < 1
 *		therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *	ln(.1) =~ -2.30
 *
 * Proof of (1):
 *    Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *	ln(factor) =~ (-2.30/5*loadav), or
 *	factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *	    exp(-1/b) =~ (b-1)/b =~ b/(b+1).			QED
 *
 * Proof of (2):
 *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *	power*ln(b/(b+1)) =~ -2.30, or
 *	power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.	QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *	loadav:	1	2	3	4
 *	power:	5.68	10.32	14.94	19.55
 */
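
/*
 * A worked example of the relation above (for illustration only): with
 * loadavg == 2 we get b == 4 and decay == 4/5 == 0.8.  The table gives
 * power == 10.32 for this load, and indeed
 *	0.8 ** 10.32 == exp(10.32 * ln(0.8)) ~= exp(-2.30) ~= 0.1,
 * i.e. about 90% of p_estcpu is forgotten after roughly
 * 5 * loadavg == 10 decay steps.
 */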

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11
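
/*
 * For illustration (assuming FSHIFT == 11, i.e. FSCALE == 2048, as on
 * most ports): FSCALE - ccpu ~= 2048 - 1948 == 100, so the exact update
 * used in schedcpu() below,
 *	p_pctcpu += ((FSCALE - ccpu) * (p_cpticks * FSCALE / clkhz)) >> FSHIFT
 * amounts to adding roughly 100 * p_cpticks / clkhz in FSCALE units.
 * With clkhz == 100 that is just p_cpticks, which is exactly what the
 * CCPU_SHIFT fast path adds.
 */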

/*
 * Recompute process priorities, every hz ticks.
 */
/* ARGSUSED */
void
schedcpu(void *arg)
{
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct lwp *l;
	struct proc *p;
	int s, minslp;
	unsigned int newcpu;
	int clkhz;

	proclist_lock_read();
	LIST_FOREACH(p, &allproc, p_list) {
		/*
		 * Increment time in/out of memory and sleep time
		 * (if sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		minslp = 2;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			l->l_swtime++;
			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
			    l->l_stat == LSSUSPENDED) {
				l->l_slptime++;
				minslp = min(minslp, l->l_slptime);
			} else
				minslp = 0;
		}
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (minslp > 1)
			continue;
		s = splstatclock();	/* prevent state changes */
		/*
		 * p_pctcpu is only for ps.
		 */
		clkhz = stathz != 0 ? stathz : hz;
#if	(FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (clkhz == 100)?
		    ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
		    100 * (((fixpt_t) p->p_cpticks)
		    << (FSHIFT - CCPU_SHIFT)) / clkhz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
		    (p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		newcpu = (u_int)decay_cpu(loadfac, p->p_estcpu);
		p->p_estcpu = newcpu;
		splx(s);	/* Done with the process CPU ticks update */
		SCHED_LOCK(s);
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l->l_slptime > 1)
				continue;
			resetpriority(l);
			if (l->l_priority >= PUSER) {
				if (l->l_stat == LSRUN &&
				    (l->l_flag & L_INMEM) &&
				    (l->l_priority / PPQ) != (l->l_usrpri / PPQ)) {
					remrunqueue(l);
					l->l_priority = l->l_usrpri;
					setrunqueue(l);
				} else
					l->l_priority = l->l_usrpri;
			}
		}
		SCHED_UNLOCK(s);
	}
	proclist_unlock_read();
	uvm_meter();
	wakeup((caddr_t)&lbolt);
	callout_reset(&schedcpu_ch, hz, schedcpu, NULL);
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay p_estcpu to zero.
 */
void
updatepri(struct lwp *l)
{
	struct proc *p = l->l_proc;
	unsigned int newcpu;
	fixpt_t loadfac;

	SCHED_ASSERT_LOCKED();

	newcpu = p->p_estcpu;
	loadfac = loadfactor(averunnable.ldavg[0]);

	if (l->l_slptime > 5 * loadfac)
		p->p_estcpu = 0; /* XXX NJWLWP */
	else {
		l->l_slptime--;	/* the first time was done in schedcpu */
		while (newcpu && --l->l_slptime)
			newcpu = (int) decay_cpu(loadfac, newcpu);
		p->p_estcpu = newcpu;
	}
	resetpriority(l);
}

/*
 * During autoconfiguration or after a panic, a sleep will simply
 * lower the priority briefly to allow interrupts, then return.
 * The priority to be used (safepri) is machine-dependent, thus this
 * value is initialized and maintained in the machine-dependent layers.
 * This priority will typically be 0, or the lowest priority
 * that is safe for use on the interrupt stack; it can be made
 * higher to block network software interrupts after panics.
 */
int safepri;

/*
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If pri includes the PCATCH flag, signals are
 * checked before and after sleeping; otherwise they are not checked.
 * Returns 0 if awakened and EWOULDBLOCK if the timeout expires.  If
 * PCATCH is set and a signal needs to be delivered, ERESTART is returned
 * if the current system call should be restarted if possible, and EINTR
 * is returned if the system call should be interrupted by the signal.
 *
 * The interlock is held until the scheduler lock (sched_lock) is
 * acquired.  The interlock will be re-locked before returning to the
 * caller unless the PNORELOCK flag is specified, in which case the
 * interlock will always be unlocked upon return.
 */
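
/*
 * A minimal sketch of a typical caller (illustrative only; "res_slock",
 * "res_busy" and the wait message are made-up names, not part of this
 * file).  The interlock closes the race between testing the condition
 * and going to sleep:
 *
 *	int error = 0;
 *
 *	simple_lock(&res_slock);
 *	while (res_busy && error == 0)
 *		error = ltsleep(&res_busy, PRIBIO | PCATCH, "reswait",
 *		    0, &res_slock);
 *	if (error == 0)
 *		res_busy = 1;
 *	simple_unlock(&res_slock);
 *
 * The waking side clears res_busy under the same interlock and calls
 * wakeup(&res_busy).
 */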
int
ltsleep(const void *ident, int priority, const char *wmesg, int timo,
    __volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	struct proc *p = l ? l->l_proc : NULL;
	struct slpque *qp;
	int sig, s;
	int catch = priority & PCATCH;
	int relock = (priority & PNORELOCK) == 0;
	int exiterr = (priority & PNOEXITERR) == 0;

	/*
	 * XXXSMP
	 * This is probably bogus.  Figure out what the right
	 * thing to do here really is.
	 * Note that not sleeping if ltsleep is called with curlwp == NULL
	 * in the shutdown case is disgusting but partly necessary given
	 * how shutdown (barely) works.
	 */
	if (cold || (doing_shutdown && (panicstr || (l == NULL)))) {
		/*
		 * After a panic, or during autoconfiguration,
		 * just give interrupts a chance, then just return;
		 * don't run any other procs or panic below,
		 * in case this is the idle process and already asleep.
		 */
		s = splhigh();
		splx(safepri);
		splx(s);
		if (interlock != NULL && relock == 0)
			simple_unlock(interlock);
		return (0);
	}

	KASSERT(p != NULL);
	LOCK_ASSERT(interlock == NULL || simple_lock_held(interlock));

#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p, 1, 0);
#endif

	SCHED_LOCK(s);

#ifdef DIAGNOSTIC
	if (ident == NULL)
		panic("ltsleep: ident == NULL");
	if (l->l_stat != LSONPROC)
		panic("ltsleep: l_stat %d != LSONPROC", l->l_stat);
	if (l->l_back != NULL)
		panic("ltsleep: l_back != NULL");
#endif

	l->l_wchan = ident;
	l->l_wmesg = wmesg;
	l->l_slptime = 0;
	l->l_priority = priority & PRIMASK;

	qp = SLPQUE(ident);
	if (qp->sq_head == 0)
		qp->sq_head = l;
	else {
		*qp->sq_tailp = l;
	}
	*(qp->sq_tailp = &l->l_forw) = 0;

	if (timo)
		callout_reset(&l->l_tsleep_ch, timo, endtsleep, l);

	/*
	 * We can now release the interlock; the scheduler lock
	 * is held, so a thread can't get in to do wakeup() before
	 * we do the switch.
	 *
	 * XXX We leave the code block here, after inserting ourselves
	 * on the sleep queue, because we might want a more clever
	 * data structure for the sleep queues at some point.
	 */
	if (interlock != NULL)
		simple_unlock(interlock);

	/*
	 * We put ourselves on the sleep queue and start our timeout
	 * before calling CURSIG, as we could stop there, and a wakeup
	 * or a SIGCONT (or both) could occur while we were stopped.
	 * A SIGCONT would cause us to be marked as LSSLEEP
	 * without resuming us, thus we must be ready for sleep
	 * when CURSIG is called.  If the wakeup happens while we're
	 * stopped, l->l_wchan will be 0 upon return from CURSIG.
	 */
	if (catch) {
		l->l_flag |= L_SINTR;
		if (((sig = CURSIG(l)) != 0) || (p->p_flag & P_WEXIT)) {
			if (l->l_wchan != NULL)
				unsleep(l);
			l->l_stat = LSONPROC;
			SCHED_UNLOCK(s);
			goto resume;
		}
		if (l->l_wchan == NULL) {
			catch = 0;
			SCHED_UNLOCK(s);
			goto resume;
		}
	} else
		sig = 0;
	l->l_stat = LSSLEEP;
	p->p_nrlwps--;
	p->p_stats->p_ru.ru_nvcsw++;
	SCHED_ASSERT_LOCKED();
	if (l->l_flag & L_SA)
		sa_switch(l, SA_UPCALL_BLOCKED);
	else
		mi_switch(l, NULL);

#if	defined(DDB) && !defined(GPROF)
	/* handy breakpoint location after process "wakes" */
	__asm(".globl bpendtsleep ; bpendtsleep:");
#endif
	/*
	 * p->p_nrlwps is incremented by whoever made us runnable again,
	 * either setrunnable() or awaken().
	 */

	SCHED_ASSERT_UNLOCKED();
	splx(s);

 resume:
	KDASSERT(l->l_cpu != NULL);
	KDASSERT(l->l_cpu == curcpu());
	l->l_cpu->ci_schedstate.spc_curpriority = l->l_usrpri;

	l->l_flag &= ~L_SINTR;
	if (l->l_flag & L_TIMEOUT) {
		l->l_flag &= ~(L_TIMEOUT|L_CANCELLED);
		if (sig == 0) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p, 0, 0);
#endif
			if (relock && interlock != NULL)
				simple_lock(interlock);
			return (EWOULDBLOCK);
		}
	} else if (timo)
		callout_stop(&l->l_tsleep_ch);

	if (catch) {
		const int cancelled = l->l_flag & L_CANCELLED;
		l->l_flag &= ~L_CANCELLED;
		if (sig != 0 || (sig = CURSIG(l)) != 0 || cancelled) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p, 0, 0);
#endif
			if (relock && interlock != NULL)
				simple_lock(interlock);
			/*
			 * If this sleep was canceled, don't let the syscall
			 * restart.
			 */
			if (cancelled ||
			    (SIGACTION(p, sig).sa_flags & SA_RESTART) == 0)
				return (EINTR);
			return (ERESTART);
		}
	}

#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p, 0, 0);
#endif
	if (relock && interlock != NULL)
		simple_lock(interlock);

	/*
	 * XXXNJW: this is very much a kluge; revisit.  A better way of
	 * preventing looping/hanging syscalls like wait4() and _lwp_wait()
	 * from wedging an exiting process would be preferred.
	 */
	if (catch && ((p->p_flag & P_WEXIT) && exiterr))
		return (EINTR);
	return (0);
}

/*
 * Implement timeout for tsleep.
 * If process hasn't been awakened (wchan non-zero),
 * set timeout flag and undo the sleep.  If proc
 * is stopped, just unsleep so it will remain stopped.
 */
void
endtsleep(void *arg)
{
	struct lwp *l;
	int s;

	l = (struct lwp *)arg;
	SCHED_LOCK(s);
	if (l->l_wchan) {
		if (l->l_stat == LSSLEEP)
			setrunnable(l);
		else
			unsleep(l);
		l->l_flag |= L_TIMEOUT;
	}
	SCHED_UNLOCK(s);
}

/*
 * Remove a process from its wait queue.
 */
void
unsleep(struct lwp *l)
{
	struct slpque *qp;
	struct lwp **hp;

	SCHED_ASSERT_LOCKED();

	if (l->l_wchan) {
		hp = &(qp = SLPQUE(l->l_wchan))->sq_head;
		while (*hp != l)
			hp = &(*hp)->l_forw;
		*hp = l->l_forw;
		if (qp->sq_tailp == &l->l_forw)
			qp->sq_tailp = hp;
		l->l_wchan = 0;
	}
}

/*
 * Optimized-for-wakeup() version of setrunnable().
 */
__inline void
awaken(struct lwp *l)
{

	SCHED_ASSERT_LOCKED();

	if (l->l_slptime > 1)
		updatepri(l);
	l->l_slptime = 0;
	l->l_stat = LSRUN;
	l->l_proc->p_nrlwps++;
	/*
	 * Since curpriority is a user priority, p->p_priority
	 * is always better than curpriority on the last CPU on
	 * which it ran.
	 *
	 * XXXSMP See affinity comment in resched_proc().
	 */
	if (l->l_flag & L_INMEM) {
		setrunqueue(l);
		if (l->l_flag & L_SA)
			l->l_proc->p_sa->sa_woken = l;
		KASSERT(l->l_cpu != NULL);
		need_resched(l->l_cpu);
	} else
		sched_wakeup(&proc0);
}

#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
void
sched_unlock_idle(void)
{

	simple_unlock(&sched_lock);
}

void
sched_lock_idle(void)
{

	simple_lock(&sched_lock);
}
#endif /* MULTIPROCESSOR || LOCKDEBUG */

/*
 * Make all processes sleeping on the specified identifier runnable.
 */

void
wakeup(const void *ident)
{
	int s;

	SCHED_ASSERT_UNLOCKED();

	SCHED_LOCK(s);
	sched_wakeup(ident);
	SCHED_UNLOCK(s);
}

void
sched_wakeup(const void *ident)
{
	struct slpque *qp;
	struct lwp *l, **q;

	SCHED_ASSERT_LOCKED();

	qp = SLPQUE(ident);
 restart:
	for (q = &qp->sq_head; (l = *q) != NULL; ) {
#ifdef DIAGNOSTIC
		if (l->l_back || (l->l_stat != LSSLEEP &&
		    l->l_stat != LSSTOP && l->l_stat != LSSUSPENDED))
			panic("wakeup");
#endif
		if (l->l_wchan == ident) {
			l->l_wchan = 0;
			*q = l->l_forw;
			if (qp->sq_tailp == &l->l_forw)
				qp->sq_tailp = q;
			if (l->l_stat == LSSLEEP) {
				awaken(l);
				goto restart;
			}
		} else
			q = &l->l_forw;
	}
}

/*
 * Make the highest priority process first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(const void *ident)
{
	struct slpque *qp;
	struct lwp *l, **q;
	struct lwp *best_sleepp, **best_sleepq;
	struct lwp *best_stopp, **best_stopq;
	int s;

	best_sleepp = best_stopp = NULL;
	best_sleepq = best_stopq = NULL;

	SCHED_LOCK(s);

	qp = SLPQUE(ident);

	for (q = &qp->sq_head; (l = *q) != NULL; q = &l->l_forw) {
#ifdef DIAGNOSTIC
		if (l->l_back || (l->l_stat != LSSLEEP &&
		    l->l_stat != LSSTOP && l->l_stat != LSSUSPENDED))
			panic("wakeup_one");
#endif
		if (l->l_wchan == ident) {
			if (l->l_stat == LSSLEEP) {
				if (best_sleepp == NULL ||
				    l->l_priority < best_sleepp->l_priority) {
					best_sleepp = l;
					best_sleepq = q;
				}
			} else {
				if (best_stopp == NULL ||
				    l->l_priority < best_stopp->l_priority) {
					best_stopp = l;
					best_stopq = q;
				}
			}
		}
	}

	/*
	 * Consider any LSSLEEP process higher than the highest priority
	 * LSSTOP process.
	 */
	if (best_sleepp != NULL) {
		l = best_sleepp;
		q = best_sleepq;
	} else {
		l = best_stopp;
		q = best_stopq;
	}

	if (l != NULL) {
		l->l_wchan = NULL;
		*q = l->l_forw;
		if (qp->sq_tailp == &l->l_forw)
			qp->sq_tailp = q;
		if (l->l_stat == LSSLEEP)
			awaken(l);
	}
	SCHED_UNLOCK(s);
}

/*
 * General yield call.  Puts the current process back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current process explicitly requests it (e.g. sched_yield(2) in compat
 * code).
 */
void
yield(void)
{
	struct lwp *l = curlwp;
	int s;

	SCHED_LOCK(s);
	l->l_priority = l->l_usrpri;
	l->l_stat = LSRUN;
	setrunqueue(l);
	l->l_proc->p_stats->p_ru.ru_nvcsw++;
	mi_switch(l, NULL);
	SCHED_ASSERT_UNLOCKED();
	splx(s);
}

/*
 * General preemption call.  Puts the current process back on its run queue
 * and performs an involuntary context switch; cpu_switch() then selects
 * the next LWP to run.
 */

void
preempt(int more)
{
	struct lwp *l = curlwp;
	int r, s;

	/* XXXUPSXXX Not needed for SMP patch */
#if 0
	/* XXX Until the preempt() bug is fixed. */
	if (more && (l->l_proc->p_flag & P_SA)) {
		l->l_cpu->ci_schedstate.spc_flags &= ~SPCF_SWITCHCLEAR;
		return;
	}
#endif

	SCHED_LOCK(s);
	l->l_priority = l->l_usrpri;
	l->l_stat = LSRUN;
	setrunqueue(l);
	l->l_proc->p_stats->p_ru.ru_nivcsw++;
	r = mi_switch(l, NULL);
	SCHED_ASSERT_UNLOCKED();
	splx(s);
	if ((l->l_flag & L_SA) != 0 && r != 0 && more == 0)
		sa_preempt(l);
}

/*
 * The machine independent parts of context switch.
 * Must be called at splsched() (no higher!) and with
 * the sched_lock held.
 * Switch to "newl" if non-NULL, otherwise let cpu_switch()
 * choose the next lwp.
 *
 * Returns 1 if another process was actually run.
 */
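
/*
 * Callers follow the pattern used by yield() and preempt() above (shown
 * here only as an illustration of the locking protocol): sched_lock is
 * taken, the LWP is put back on a run queue (or a sleep queue), and
 * mi_switch() returns with the scheduler lock already released:
 *
 *	SCHED_LOCK(s);
 *	l->l_priority = l->l_usrpri;
 *	l->l_stat = LSRUN;
 *	setrunqueue(l);
 *	mi_switch(l, NULL);
 *	SCHED_ASSERT_UNLOCKED();
 *	splx(s);
 */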
int
mi_switch(struct lwp *l, struct lwp *newl)
{
	struct schedstate_percpu *spc;
	struct rlimit *rlim;
	long s, u;
	struct timeval tv;
#if defined(MULTIPROCESSOR)
	int hold_count;
#endif
	struct proc *p = l->l_proc;
	int retval;

	SCHED_ASSERT_LOCKED();

#if defined(MULTIPROCESSOR)
	/*
	 * Release the kernel_lock, as we are about to yield the CPU.
	 * The scheduler lock is still held until cpu_switch()
	 * selects a new process and removes it from the run queue.
	 */
	if (l->l_flag & L_BIGLOCK)
		hold_count = spinlock_release_all(&kernel_lock);
#endif

	KDASSERT(l->l_cpu != NULL);
	KDASSERT(l->l_cpu == curcpu());

	spc = &l->l_cpu->ci_schedstate;

#if defined(LOCKDEBUG) || defined(DIAGNOSTIC)
	spinlock_switchcheck();
#endif
#ifdef LOCKDEBUG
	simple_lock_switchcheck();
#endif

	/*
	 * Compute the amount of time during which the current
	 * process was running.
	 */
	microtime(&tv);
	u = p->p_rtime.tv_usec +
	    (tv.tv_usec - spc->spc_runtime.tv_usec);
	s = p->p_rtime.tv_sec + (tv.tv_sec - spc->spc_runtime.tv_sec);
	if (u < 0) {
		u += 1000000;
		s--;
	} else if (u >= 1000000) {
		u -= 1000000;
		s++;
	}
	p->p_rtime.tv_usec = u;
	p->p_rtime.tv_sec = s;

	/*
	 * Check if the process exceeds its CPU resource allocation.
	 * If over max, kill it.  In any case, if it has run for more
	 * than 10 minutes, reduce priority to give others a chance.
	 */
	rlim = &p->p_rlimit[RLIMIT_CPU];
	if (s >= rlim->rlim_cur) {
		/*
		 * XXXSMP: we're inside the scheduler lock perimeter;
		 * use sched_psignal.
		 */
		if (s >= rlim->rlim_max)
			sched_psignal(p, SIGKILL);
		else {
			sched_psignal(p, SIGXCPU);
			if (rlim->rlim_cur < rlim->rlim_max)
				rlim->rlim_cur += 5;
		}
	}
	if (autonicetime && s > autonicetime && p->p_ucred->cr_uid &&
	    p->p_nice == NZERO) {
		p->p_nice = autoniceval + NZERO;
		resetpriority(l);
	}

	/*
	 * Process is about to yield the CPU; clear the appropriate
	 * scheduling flags.
	 */
	spc->spc_flags &= ~SPCF_SWITCHCLEAR;

#ifdef KSTACK_CHECK_MAGIC
	kstack_check_magic(l);
#endif

	/*
	 * If we are using h/w performance counters, save context.
	 */
#if PERFCTRS
	if (PMC_ENABLED(p))
		pmc_save_context(p);
#endif

	/*
	 * Switch to the new current process.  When we
	 * run again, we'll return back here.
	 */
	uvmexp.swtch++;
	if (newl == NULL) {
		retval = cpu_switch(l, NULL);
	} else {
		remrunqueue(newl);
		cpu_switchto(l, newl);
		retval = 0;
	}

	/*
	 * If we are using h/w performance counters, restore context.
	 */
#if PERFCTRS
	if (PMC_ENABLED(p))
		pmc_restore_context(p);
#endif

	/*
	 * Make sure that MD code released the scheduler lock before
	 * resuming us.
	 */
	SCHED_ASSERT_UNLOCKED();

	/*
	 * We're running again; record our new start time.  We might
	 * be running on a new CPU now, so don't use the cached
	 * schedstate_percpu pointer.
	 */
	KDASSERT(l->l_cpu != NULL);
	KDASSERT(l->l_cpu == curcpu());
	microtime(&l->l_cpu->ci_schedstate.spc_runtime);

#if defined(MULTIPROCESSOR)
	/*
	 * Reacquire the kernel_lock now.  We do this after we've
	 * released the scheduler lock to avoid deadlock, and before
	 * we reacquire the interlock.
	 */
	if (l->l_flag & L_BIGLOCK)
		spinlock_acquire_count(&kernel_lock, hold_count);
#endif

	return retval;
}

/*
 * Initialize the (doubly-linked) run queues
 * to be empty.
 */
void
rqinit()
{
	int i;

	for (i = 0; i < RUNQUE_NQS; i++)
		sched_qs[i].ph_link = sched_qs[i].ph_rlink =
		    (struct lwp *)&sched_qs[i];
}

static __inline void
resched_proc(struct lwp *l, u_char pri)
{
	struct cpu_info *ci;

	/*
	 * XXXSMP
	 * Since l->l_cpu persists across a context switch,
	 * this gives us *very weak* processor affinity, in
	 * that we notify the CPU on which the process last
	 * ran that it should try to switch.
	 *
	 * This does not guarantee that the process will run on
	 * that processor next, because another processor might
	 * grab it the next time it performs a context switch.
	 *
	 * This also does not handle the case where its last
	 * CPU is running a higher-priority process, but every
	 * other CPU is running a lower-priority process.  There
	 * are ways to handle this situation, but they're not
	 * currently very pretty, and we also need to weigh the
	 * cost of moving a process from one CPU to another.
	 *
	 * XXXSMP
	 * There is also the issue of locking the other CPU's
	 * sched state, which we currently do not do.
	 */
	ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
	if (pri < ci->ci_schedstate.spc_curpriority)
		need_resched(ci);
}

/*
 * Change process state to be runnable,
 * placing it on the run queue if it is in memory,
 * and awakening the swapper if it isn't in memory.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;

	SCHED_ASSERT_LOCKED();

	switch (l->l_stat) {
	case 0:
	case LSRUN:
	case LSONPROC:
	case LSZOMB:
	case LSDEAD:
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_flag & P_TRACED) != 0 && p->p_xstat != 0) {
			sigaddset(&p->p_sigctx.ps_siglist, p->p_xstat);
			CHECKSIGS(p);
		}
	case LSSLEEP:
		unsleep(l);		/* e.g. when sending signals */
		break;

	case LSIDL:
		break;
	case LSSUSPENDED:
		break;
	}
	l->l_stat = LSRUN;
	p->p_nrlwps++;

	if (l->l_flag & L_INMEM)
		setrunqueue(l);

	if (l->l_slptime > 1)
		updatepri(l);
	l->l_slptime = 0;
	if ((l->l_flag & L_INMEM) == 0)
		sched_wakeup((caddr_t)&proc0);
	else
		resched_proc(l, l->l_priority);
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
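
/*
 * For illustration (assuming the usual values PUSER == 50,
 * NICE_WEIGHT == 2, NZERO == 20 and PPQ == 4 from the scheduler
 * headers): an LWP whose process has p_estcpu == 16 and p_nice == 25
 * gets l_usrpri = 50 + 16 + 2 * (25 - 20) = 76, which maps to run
 * queue 76 / 4 == 19.  The result is clamped to MAXPRI, so a large
 * p_estcpu can never push the priority past the last queue.
 */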
void
resetpriority(struct lwp *l)
{
	unsigned int newpriority;
	struct proc *p = l->l_proc;

	SCHED_ASSERT_LOCKED();

	newpriority = PUSER + p->p_estcpu +
	    NICE_WEIGHT * (p->p_nice - NZERO);
	newpriority = min(newpriority, MAXPRI);
	l->l_usrpri = newpriority;
	resched_proc(l, l->l_usrpri);
}

/*
 * Recompute priority for all LWPs in a process.
 */
void
resetprocpriority(struct proc *p)
{
	struct lwp *l;

	LIST_FOREACH(l, &p->p_lwps, l_sibling)
		resetpriority(l);
}

/*
 * We adjust the priority of the current process.  The priority of a process
 * gets worse as it accumulates CPU time.  The CPU usage estimator (p_estcpu)
 * is increased here.  The formula for computing priorities (in
 * resetpriority() above) will compute a different value each time p_estcpu
 * increases.  This can cause a switch, but unless the priority crosses a
 * PPQ boundary the actual queue will not change.  The CPU usage estimator
 * ramps up quite quickly when the process is running (linearly), and decays
 * away exponentially, at a rate which is proportionally slower when the
 * system is busy.  The basic principle is that the system will 90% forget
 * that the process used a lot of CPU time in 5 * loadav seconds.  This
 * causes the system to favor processes which haven't run much recently,
 * and to round-robin among other processes.
 */

void
schedclock(struct lwp *l)
{
	struct proc *p = l->l_proc;
	int s;

	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
	SCHED_LOCK(s);
	resetpriority(l);
	SCHED_UNLOCK(s);

	if (l->l_priority >= PUSER)
		l->l_priority = l->l_usrpri;
}

void
suspendsched()
{
	struct lwp *l;
	int s;

	/*
	 * Convert all non-P_SYSTEM LSSLEEP or LSRUN LWPs to
	 * LSSUSPENDED.
	 */
	proclist_lock_read();
	SCHED_LOCK(s);
	LIST_FOREACH(l, &alllwp, l_list) {
		if ((l->l_proc->p_flag & P_SYSTEM) != 0)
			continue;

		switch (l->l_stat) {
		case LSRUN:
			l->l_proc->p_nrlwps--;
			if ((l->l_flag & L_INMEM) != 0)
				remrunqueue(l);
			/* FALLTHROUGH */
		case LSSLEEP:
			l->l_stat = LSSUSPENDED;
			break;
		case LSONPROC:
			/*
			 * XXX SMP: we need to deal with LWPs on
			 * other CPUs!
			 */
			break;
		default:
			break;
		}
	}
	SCHED_UNLOCK(s);
	proclist_unlock_read();
}

/*
 * Low-level routines to access the run queue.  Optimised assembler
 * routines can override these.
 */

#ifndef __HAVE_MD_RUNQUEUE

/*
 * On some architectures, it's faster to use a MSB ordering for the
 * priorities than the traditional LSB ordering.
 */
#ifdef __HAVE_BIGENDIAN_BITOPS
#define	RQMASK(n) (0x80000000 >> (n))
#else
#define	RQMASK(n) (0x00000001 << (n))
#endif

/*
 * The primitives that manipulate the run queues.  whichqs tells which
 * of the 32 queues qs have processes in them.  Setrunqueue puts processes
 * into queues, remrunqueue removes them from queues.  The running process is
 * on no queue, other processes are on a queue related to p->p_priority,
 * divided by 4 actually to shrink the 0-127 range of priorities into the 32
 * available queues.
 */
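
/*
 * For illustration: RUNQUE_NQS is 32, so an LWP at priority 77 lands on
 * sched_qs[77 / 4] == sched_qs[19], and RQMASK(19) marks that queue as
 * non-empty in sched_whichqs.  The machine-dependent cpu_switch() then
 * picks an LWP from the best-priority (lowest-numbered) non-empty queue.
 */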
void
setrunqueue(struct lwp *l)
{
	struct prochd *rq;
	struct lwp *prev;
	const int whichq = l->l_priority / 4;

#ifdef DIAGNOSTIC
	if (l->l_back != NULL || l->l_wchan != NULL || l->l_stat != LSRUN)
		panic("setrunqueue");
#endif
	sched_whichqs |= RQMASK(whichq);
	rq = &sched_qs[whichq];
	prev = rq->ph_rlink;
	l->l_forw = (struct lwp *)rq;
	rq->ph_rlink = l;
	prev->l_forw = l;
	l->l_back = prev;
}

void
remrunqueue(struct lwp *l)
{
	struct lwp *prev, *next;
	const int whichq = l->l_priority / 4;

#ifdef DIAGNOSTIC
	if (((sched_whichqs & RQMASK(whichq)) == 0))
		panic("remrunqueue");
#endif
	prev = l->l_back;
	l->l_back = NULL;
	next = l->l_forw;
	prev->l_forw = next;
	next->l_back = prev;
	if (prev == next)
		sched_whichqs &= ~RQMASK(whichq);
}

#undef RQMASK
#endif /* !defined(__HAVE_MD_RUNQUEUE) */