/*	$NetBSD: kern_synch.c,v 1.149 2005/05/29 22:24:15 christos Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 * This code is derived from software contributed to The NetBSD Foundation
 * by Charles M. Hannum.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.149 2005/05/29 22:24:15 christos Exp $");

#include "opt_ddb.h"
#include "opt_ktrace.h"
#include "opt_kstack.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_perfctrs.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/buf.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sa.h>
#include <sys/savar.h>

#include <uvm/uvm_extern.h>

#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

int	lbolt;			/* once a second sleep address */
int	rrticks;		/* number of hardclock ticks per roundrobin() */

/*
 * The global scheduler state.
 */
struct prochd sched_qs[RUNQUE_NQS];	/* run queues */
__volatile u_int32_t sched_whichqs;	/* bitmap of non-empty queues */
struct slpque sched_slpque[SLPQUE_TABLESIZE];	/* sleep queues */

struct simplelock sched_lock = SIMPLELOCK_INITIALIZER;

void schedcpu(void *);
void updatepri(struct lwp *);
void endtsleep(void *);

__inline void sa_awaken(struct lwp *);
__inline void awaken(struct lwp *);

struct callout schedcpu_ch = CALLOUT_INITIALIZER_SETFUNC(schedcpu, NULL);

/*
 * Force switch among equal priority processes every 100ms.
 * Called from hardclock every hz/10 == rrticks hardclock ticks.
 */
/* ARGSUSED */
void
roundrobin(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;

	spc->spc_rrticks = rrticks;

	if (curlwp != NULL) {
		if (spc->spc_flags & SPCF_SEENRR) {
			/*
			 * The process has already been through a roundrobin
			 * without switching and may be hogging the CPU.
			 * Indicate that the process should yield.
			 */
			spc->spc_flags |= SPCF_SHOULDYIELD;
		} else
			spc->spc_flags |= SPCF_SEENRR;
	}
	need_resched(curcpu());
}
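
/*
 * Worked example (illustrative only, not part of the algorithm above):
 * with the common hz == 100, a roundrobin() interval is hz/10 == 10
 * hardclock ticks, i.e. 100ms.  The round-robin flags are cleared again
 * when the LWP context switches (the SPCF_SWITCHCLEAR handling in
 * mi_switch()), so only an LWP that stays on its CPU across two
 * consecutive roundrobin() calls (somewhere between 100 and 200ms
 * without a context switch) ends up marked SPCF_SHOULDYIELD.
 */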

/*
 * Constants for digital decay and forget:
 *	90% of (p_estcpu) usage in 5 * loadav time
 *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 *          Note that, as ps(1) mentions, this can let percentages
 *          total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that hardclock updates p_estcpu and p_cpticks independently.
 *
 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 *	for (i = 0; i < (5 * loadavg); i++)
 *		p_estcpu *= decay;
 * will compute
 *	p_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 *	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 *	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 *	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 *	b = 2 * loadavg
 * then
 *	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *	For x close to zero, exp(x) =~ 1 + x, since
 *		exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *		therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *	For x close to zero, ln(1+x) =~ x, since
 *		ln(1+x) = x - x**2/2 + x**3/3 - ...	-1 < x < 1
 *		therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *	ln(.1) =~ -2.30
 *
 * Proof of (1):
 *	Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *	ln(factor) =~ (-2.30/5*loadav), or
 *	factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *	    exp(-1/b) =~ (b-1)/b =~ b/(b+1).  QED
 *
 * Proof of (2):
 *	Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *	power*ln(b/(b+1)) =~ -2.30, or
 *	power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *	loadav:	1	2	3	4
 *	power:	5.68	10.32	14.94	19.55
 */

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
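
/*
 * Worked example (illustrative only): with a load average of 2.0,
 * loadfactor() gives loadfac == 4 (in FSCALE fixed point), so
 * decay_cpu() multiplies p_estcpu by 4/(4+1) == 0.8 each time
 * schedcpu() runs, i.e. once per second.  0.8**10 ~= 0.107, and the
 * exact power needed to reach 0.1 is ln(.1)/ln(.8) ~= 10.32, matching
 * the table above: p_estcpu is ~90% forgotten in about 10 seconds,
 * which is 5 * loadavg seconds.
 */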

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11
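
/*
 * Worked check (illustrative only): schedcpu() multiplies p_pctcpu by
 * ccpu == exp(-1/20) once per second, so after 60 seconds the old value
 * has been scaled by exp(-60/20) == exp(-3) ~= 0.05, i.e. ~95% of the
 * old %CPU is forgotten, as promised above.  For the per-tick charge,
 * (1 - exp(-1/20))/100 ~= 0.0488/100 ~= 1/2048 == 2**-11, which is why
 * the fast formula can simply shift p_cpticks by (FSHIFT - CCPU_SHIFT)
 * with CCPU_SHIFT == 11 when the clock runs at 100Hz.
 */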

/*
 * Recompute process priorities, every hz ticks.
 */
/* ARGSUSED */
void
schedcpu(void *arg)
{
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct lwp *l;
	struct proc *p;
	int s, minslp;
	unsigned int newcpu;
	int clkhz;

	proclist_lock_read();
	PROCLIST_FOREACH(p, &allproc) {
		/*
		 * Increment time in/out of memory and sleep time
		 * (if sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		minslp = 2;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			l->l_swtime++;
			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
			    l->l_stat == LSSUSPENDED) {
				l->l_slptime++;
				minslp = min(minslp, l->l_slptime);
			} else
				minslp = 0;
		}
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (minslp > 1)
			continue;
		s = splstatclock();	/* prevent state changes */
		/*
		 * p_pctcpu is only for ps.
		 */
		clkhz = stathz != 0 ? stathz : hz;
#if (FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (clkhz == 100) ?
		    ((fixpt_t)p->p_cpticks) << (FSHIFT - CCPU_SHIFT) :
		    100 * (((fixpt_t)p->p_cpticks)
		    << (FSHIFT - CCPU_SHIFT)) / clkhz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
		    (p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		newcpu = (u_int)decay_cpu(loadfac, p->p_estcpu);
		p->p_estcpu = newcpu;
		splx(s);	/* Done with the process CPU ticks update */
		SCHED_LOCK(s);
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l->l_slptime > 1)
				continue;
			resetpriority(l);
			if (l->l_priority >= PUSER) {
				if (l->l_stat == LSRUN &&
				    (l->l_flag & L_INMEM) &&
				    (l->l_priority / PPQ) != (l->l_usrpri / PPQ)) {
					remrunqueue(l);
					l->l_priority = l->l_usrpri;
					setrunqueue(l);
				} else
					l->l_priority = l->l_usrpri;
			}
		}
		SCHED_UNLOCK(s);
	}
	proclist_unlock_read();
	uvm_meter();
	wakeup((caddr_t)&lbolt);
	callout_schedule(&schedcpu_ch, hz);
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay p_estcpu to zero.
 */
void
updatepri(struct lwp *l)
{
	struct proc *p = l->l_proc;
	unsigned int newcpu;
	fixpt_t loadfac;

	SCHED_ASSERT_LOCKED();

	newcpu = p->p_estcpu;
	loadfac = loadfactor(averunnable.ldavg[0]);

	if (l->l_slptime > 5 * loadfac)
		p->p_estcpu = 0;	/* XXX NJWLWP */
	else {
		l->l_slptime--;	/* the first time was done in schedcpu */
		while (newcpu && --l->l_slptime)
			newcpu = (int)decay_cpu(loadfac, newcpu);
		p->p_estcpu = newcpu;
	}
	resetpriority(l);
}

/*
 * During autoconfiguration or after a panic, a sleep will simply
 * lower the priority briefly to allow interrupts, then return.
 * The priority to be used (safepri) is machine-dependent, thus this
 * value is initialized and maintained in the machine-dependent layers.
 * This priority will typically be 0, or the lowest priority
 * that is safe for use on the interrupt stack; it can be made
 * higher to block network software interrupts after panics.
 */
int safepri;

/*
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If pri includes the PCATCH flag, signals are
 * checked before and after sleeping, otherwise signals are not checked.
 * Returns 0 if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH
 * is set and a signal needs to be delivered, ERESTART is returned if the
 * current system call should be restarted if possible, and EINTR is
 * returned if the system call should be interrupted by the signal.
 *
 * The interlock is held until the scheduler lock is acquired.  The
 * interlock will be locked before returning back to the caller
 * unless the PNORELOCK flag is specified, in which case the
 * interlock will always be unlocked upon return.
 */
int
ltsleep(__volatile const void *ident, int priority, const char *wmesg, int timo,
    __volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	struct proc *p = l ? l->l_proc : NULL;
	struct slpque *qp;
	int sig, s;
	int catch = priority & PCATCH;
	int relock = (priority & PNORELOCK) == 0;
	int exiterr = (priority & PNOEXITERR) == 0;

	/*
	 * XXXSMP
	 * This is probably bogus.  Figure out what the right
	 * thing to do here really is.
	 * Note that not sleeping if ltsleep is called with curlwp == NULL
	 * in the shutdown case is disgusting but partly necessary given
	 * how shutdown (barely) works.
	 */
	if (cold || (doing_shutdown && (panicstr || (l == NULL)))) {
		/*
		 * After a panic, or during autoconfiguration,
		 * just give interrupts a chance, then just return;
		 * don't run any other procs or panic below,
		 * in case this is the idle process and already asleep.
		 */
		s = splhigh();
		splx(safepri);
		splx(s);
		if (interlock != NULL && relock == 0)
			simple_unlock(interlock);
		return (0);
	}

	KASSERT(p != NULL);
	LOCK_ASSERT(interlock == NULL || simple_lock_held(interlock));

#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p, 1, 0);
#endif

	SCHED_LOCK(s);

#ifdef DIAGNOSTIC
	if (ident == NULL)
		panic("ltsleep: ident == NULL");
	if (l->l_stat != LSONPROC)
		panic("ltsleep: l_stat %d != LSONPROC", l->l_stat);
	if (l->l_back != NULL)
		panic("ltsleep: l_back != NULL");
#endif

	l->l_wchan = ident;
	l->l_wmesg = wmesg;
	l->l_slptime = 0;
	l->l_priority = priority & PRIMASK;

	qp = SLPQUE(ident);
	if (qp->sq_head == 0)
		qp->sq_head = l;
	else {
		*qp->sq_tailp = l;
	}
	*(qp->sq_tailp = &l->l_forw) = 0;

	if (timo)
		callout_reset(&l->l_tsleep_ch, timo, endtsleep, l);

	/*
	 * We can now release the interlock; the scheduler lock
	 * is held, so a thread can't get in to do wakeup() before
	 * we do the switch.
	 *
	 * XXX We leave the code block here, after inserting ourselves
	 * on the sleep queue, because we might want a more clever
	 * data structure for the sleep queues at some point.
	 */
	if (interlock != NULL)
		simple_unlock(interlock);

	/*
	 * We put ourselves on the sleep queue and start our timeout
	 * before calling CURSIG, as we could stop there, and a wakeup
	 * or a SIGCONT (or both) could occur while we were stopped.
	 * A SIGCONT would cause us to be marked as SSLEEP
	 * without resuming us, thus we must be ready for sleep
	 * when CURSIG is called.  If the wakeup happens while we're
	 * stopped, p->p_wchan will be 0 upon return from CURSIG.
	 */
	if (catch) {
		l->l_flag |= L_SINTR;
		if (((sig = CURSIG(l)) != 0) ||
		    ((p->p_flag & P_WEXIT) && p->p_nlwps > 1)) {
			if (l->l_wchan != NULL)
				unsleep(l);
			l->l_stat = LSONPROC;
			SCHED_UNLOCK(s);
			goto resume;
		}
		if (l->l_wchan == NULL) {
			catch = 0;
			SCHED_UNLOCK(s);
			goto resume;
		}
	} else
		sig = 0;
	l->l_stat = LSSLEEP;
	p->p_nrlwps--;
	p->p_stats->p_ru.ru_nvcsw++;
	SCHED_ASSERT_LOCKED();
	if (l->l_flag & L_SA)
		sa_switch(l, SA_UPCALL_BLOCKED);
	else
		mi_switch(l, NULL);

#if defined(DDB) && !defined(GPROF)
	/* handy breakpoint location after process "wakes" */
	__asm(".globl bpendtsleep\nbpendtsleep:");
#endif

	/*
	 * p->p_nrlwps is incremented by whoever made us runnable again,
	 * either setrunnable() or awaken().
	 */

	SCHED_ASSERT_UNLOCKED();
	splx(s);

 resume:
	KDASSERT(l->l_cpu != NULL);
	KDASSERT(l->l_cpu == curcpu());
	l->l_cpu->ci_schedstate.spc_curpriority = l->l_usrpri;

	l->l_flag &= ~L_SINTR;
	if (l->l_flag & L_TIMEOUT) {
		l->l_flag &= ~(L_TIMEOUT|L_CANCELLED);
		if (sig == 0) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p, 0, 0);
#endif
			if (relock && interlock != NULL)
				simple_lock(interlock);
			return (EWOULDBLOCK);
		}
	} else if (timo)
		callout_stop(&l->l_tsleep_ch);

	if (catch) {
		const int cancelled = l->l_flag & L_CANCELLED;
		l->l_flag &= ~L_CANCELLED;
		if (sig != 0 || (sig = CURSIG(l)) != 0 || cancelled) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p, 0, 0);
#endif
			if (relock && interlock != NULL)
				simple_lock(interlock);
			/*
			 * If this sleep was canceled, don't let the syscall
			 * restart.
			 */
			if (cancelled ||
			    (SIGACTION(p, sig).sa_flags & SA_RESTART) == 0)
				return (EINTR);
			return (ERESTART);
		}
	}

#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p, 0, 0);
#endif
	if (relock && interlock != NULL)
		simple_lock(interlock);

	/*
	 * XXXNJW this is very much a kluge.
	 * revisit. a better way of preventing looping/hanging syscalls like
	 * wait4() and _lwp_wait() from wedging an exiting process
	 * would be preferred.
	 */
	if (catch && ((p->p_flag & P_WEXIT) && p->p_nlwps > 1 && exiterr))
		return (EINTR);
	return (0);
}
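
/*
 * Illustrative sketch (comment only; "sc", sc_lock and sc_ready are
 * hypothetical driver state) of the usual ltsleep()/wakeup() pattern.
 * The wait channel is simply the address of the datum being waited on,
 * and the simplelock passed as the interlock protects that datum:
 *
 *	consumer:
 *		simple_lock(&sc->sc_lock);
 *		while (sc->sc_ready == 0) {
 *			error = ltsleep(&sc->sc_ready, PWAIT | PCATCH,
 *			    "scready", 0, &sc->sc_lock);
 *			if (error != 0)
 *				break;
 *		}
 *		simple_unlock(&sc->sc_lock);
 *
 *	producer:
 *		simple_lock(&sc->sc_lock);
 *		sc->sc_ready = 1;
 *		simple_unlock(&sc->sc_lock);
 *		wakeup(&sc->sc_ready);
 *
 * Because ltsleep() drops the interlock only after the LWP is on the
 * sleep queue with the scheduler lock held, a wakeup() issued after
 * the producer releases the interlock cannot be lost.  The interlock
 * is re-taken before ltsleep() returns unless PNORELOCK is given, and
 * the return value is 0 on wakeup, EWOULDBLOCK on timeout, or
 * EINTR/ERESTART for a signal when PCATCH is set.
 */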

/*
 * Implement timeout for tsleep.
 * If process hasn't been awakened (wchan non-zero),
 * set timeout flag and undo the sleep.  If proc
 * is stopped, just unsleep so it will remain stopped.
 */
void
endtsleep(void *arg)
{
	struct lwp *l;
	int s;

	l = (struct lwp *)arg;
	SCHED_LOCK(s);
	if (l->l_wchan) {
		if (l->l_stat == LSSLEEP)
			setrunnable(l);
		else
			unsleep(l);
		l->l_flag |= L_TIMEOUT;
	}
	SCHED_UNLOCK(s);
}

/*
 * Remove a process from its wait queue.
 */
void
unsleep(struct lwp *l)
{
	struct slpque *qp;
	struct lwp **hp;

	SCHED_ASSERT_LOCKED();

	if (l->l_wchan) {
		hp = &(qp = SLPQUE(l->l_wchan))->sq_head;
		while (*hp != l)
			hp = &(*hp)->l_forw;
		*hp = l->l_forw;
		if (qp->sq_tailp == &l->l_forw)
			qp->sq_tailp = hp;
		l->l_wchan = 0;
	}
}

__inline void
sa_awaken(struct lwp *l)
{

	SCHED_ASSERT_LOCKED();

	if (l == l->l_savp->savp_lwp && l->l_flag & L_SA_YIELD)
		l->l_flag &= ~L_SA_IDLE;
}

/*
 * Optimized-for-wakeup() version of setrunnable().
 */
__inline void
awaken(struct lwp *l)
{

	SCHED_ASSERT_LOCKED();

	if (l->l_proc->p_sa)
		sa_awaken(l);

	if (l->l_slptime > 1)
		updatepri(l);
	l->l_slptime = 0;
	l->l_stat = LSRUN;
	l->l_proc->p_nrlwps++;
	/*
	 * Since curpriority is a user priority, p->p_priority
	 * is always better than curpriority on the last CPU on
	 * which it ran.
	 *
	 * XXXSMP See affinity comment in resched_proc().
	 */
	if (l->l_flag & L_INMEM) {
		setrunqueue(l);
		KASSERT(l->l_cpu != NULL);
		need_resched(l->l_cpu);
	} else
		sched_wakeup(&proc0);
}

#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
void
sched_unlock_idle(void)
{

	simple_unlock(&sched_lock);
}

void
sched_lock_idle(void)
{

	simple_lock(&sched_lock);
}
#endif /* MULTIPROCESSOR || LOCKDEBUG */

/*
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(__volatile const void *ident)
{
	int s;

	SCHED_ASSERT_UNLOCKED();

	SCHED_LOCK(s);
	sched_wakeup(ident);
	SCHED_UNLOCK(s);
}

void
sched_wakeup(__volatile const void *ident)
{
	struct slpque *qp;
	struct lwp *l, **q;

	SCHED_ASSERT_LOCKED();

	qp = SLPQUE(ident);
 restart:
	for (q = &qp->sq_head; (l = *q) != NULL; ) {
#ifdef DIAGNOSTIC
		if (l->l_back || (l->l_stat != LSSLEEP &&
		    l->l_stat != LSSTOP && l->l_stat != LSSUSPENDED))
			panic("wakeup");
#endif
		if (l->l_wchan == ident) {
			l->l_wchan = 0;
			*q = l->l_forw;
			if (qp->sq_tailp == &l->l_forw)
				qp->sq_tailp = q;
			if (l->l_stat == LSSLEEP) {
				awaken(l);
				goto restart;
			}
		} else
			q = &l->l_forw;
	}
}

/*
 * Make the highest priority process first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(__volatile const void *ident)
{
	struct slpque *qp;
	struct lwp *l, **q;
	struct lwp *best_sleepp, **best_sleepq;
	struct lwp *best_stopp, **best_stopq;
	int s;

	best_sleepp = best_stopp = NULL;
	best_sleepq = best_stopq = NULL;

	SCHED_LOCK(s);

	qp = SLPQUE(ident);

	for (q = &qp->sq_head; (l = *q) != NULL; q = &l->l_forw) {
#ifdef DIAGNOSTIC
		if (l->l_back || (l->l_stat != LSSLEEP &&
		    l->l_stat != LSSTOP && l->l_stat != LSSUSPENDED))
			panic("wakeup_one");
#endif
		if (l->l_wchan == ident) {
			if (l->l_stat == LSSLEEP) {
				if (best_sleepp == NULL ||
				    l->l_priority < best_sleepp->l_priority) {
					best_sleepp = l;
					best_sleepq = q;
				}
			} else {
				if (best_stopp == NULL ||
				    l->l_priority < best_stopp->l_priority) {
					best_stopp = l;
					best_stopq = q;
				}
			}
		}
	}

	/*
	 * Consider any SSLEEP process higher than the highest priority SSTOP
	 * process.
	 */
	if (best_sleepp != NULL) {
		l = best_sleepp;
		q = best_sleepq;
	} else {
		l = best_stopp;
		q = best_stopq;
	}

	if (l != NULL) {
		l->l_wchan = NULL;
		*q = l->l_forw;
		if (qp->sq_tailp == &l->l_forw)
			qp->sq_tailp = q;
		if (l->l_stat == LSSLEEP)
			awaken(l);
	}
	SCHED_UNLOCK(s);
}

/*
 * General yield call.  Puts the current process back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current process explicitly requests it (e.g. sched_yield(2) in compat
 * code).
 */
void
yield(void)
{
	struct lwp *l = curlwp;
	int s;

	SCHED_LOCK(s);
	l->l_priority = l->l_usrpri;
	l->l_stat = LSRUN;
	setrunqueue(l);
	l->l_proc->p_stats->p_ru.ru_nvcsw++;
	mi_switch(l, NULL);
	SCHED_ASSERT_UNLOCKED();
	splx(s);
}

/*
 * General preemption call.  Puts the current process back on its run queue
 * and performs an involuntary context switch.  If a process is supplied,
 * we switch to that process.  Otherwise, we use the normal process selection
 * criteria.
 */
void
preempt(int more)
{
	struct lwp *l = curlwp;
	int r, s;

	SCHED_LOCK(s);
	l->l_priority = l->l_usrpri;
	l->l_stat = LSRUN;
	setrunqueue(l);
	l->l_proc->p_stats->p_ru.ru_nivcsw++;
	r = mi_switch(l, NULL);
	SCHED_ASSERT_UNLOCKED();
	splx(s);
	if ((l->l_flag & L_SA) != 0 && r != 0 && more == 0)
		sa_preempt(l);
}

/*
 * The machine independent parts of context switch.
 * Must be called at splsched() (no higher!) and with
 * the sched_lock held.
 * Switch to "newl" if non-NULL, otherwise let cpu_switch()
 * choose the next lwp.
 *
 * Returns 1 if another process was actually run.
 */
int
mi_switch(struct lwp *l, struct lwp *newl)
{
	struct schedstate_percpu *spc;
	struct rlimit *rlim;
	long s, u;
	struct timeval tv;
	int hold_count;
	struct proc *p = l->l_proc;
	int retval;

	SCHED_ASSERT_LOCKED();

	/*
	 * Release the kernel_lock, as we are about to yield the CPU.
	 * The scheduler lock is still held until cpu_switch()
	 * selects a new process and removes it from the run queue.
	 */
	hold_count = KERNEL_LOCK_RELEASE_ALL();

	KDASSERT(l->l_cpu != NULL);
	KDASSERT(l->l_cpu == curcpu());

	spc = &l->l_cpu->ci_schedstate;

#if defined(LOCKDEBUG) || defined(DIAGNOSTIC)
	spinlock_switchcheck();
#endif
#ifdef LOCKDEBUG
	simple_lock_switchcheck();
#endif

	/*
	 * Compute the amount of time during which the current
	 * process was running.
	 */
	microtime(&tv);
	u = p->p_rtime.tv_usec +
	    (tv.tv_usec - spc->spc_runtime.tv_usec);
	s = p->p_rtime.tv_sec + (tv.tv_sec - spc->spc_runtime.tv_sec);
	if (u < 0) {
		u += 1000000;
		s--;
	} else if (u >= 1000000) {
		u -= 1000000;
		s++;
	}
	p->p_rtime.tv_usec = u;
	p->p_rtime.tv_sec = s;

	/*
	 * Check if the process exceeds its CPU resource allocation.
	 * If over max, kill it.  In any case, if it has run for more
	 * than 10 minutes, reduce priority to give others a chance.
	 */
	rlim = &p->p_rlimit[RLIMIT_CPU];
	if (s >= rlim->rlim_cur) {
		/*
		 * XXXSMP: we're inside the scheduler lock perimeter;
		 * use sched_psignal.
		 */
		if (s >= rlim->rlim_max)
			sched_psignal(p, SIGKILL);
		else {
			sched_psignal(p, SIGXCPU);
			if (rlim->rlim_cur < rlim->rlim_max)
				rlim->rlim_cur += 5;
		}
	}
	if (autonicetime && s > autonicetime && p->p_ucred->cr_uid &&
	    p->p_nice == NZERO) {
		p->p_nice = autoniceval + NZERO;
		resetpriority(l);
	}

	/*
	 * Process is about to yield the CPU; clear the appropriate
	 * scheduling flags.
	 */
	spc->spc_flags &= ~SPCF_SWITCHCLEAR;

#ifdef KSTACK_CHECK_MAGIC
	kstack_check_magic(l);
#endif

	/*
	 * If we are using h/w performance counters, save context.
	 */
#if PERFCTRS
	if (PMC_ENABLED(p))
		pmc_save_context(p);
#endif

	/*
	 * Switch to the new current process.  When we
	 * run again, we'll return back here.
	 */
	uvmexp.swtch++;
	if (newl == NULL) {
		retval = cpu_switch(l, NULL);
	} else {
		remrunqueue(newl);
		cpu_switchto(l, newl);
		retval = 0;
	}

	/*
	 * If we are using h/w performance counters, restore context.
	 */
#if PERFCTRS
	if (PMC_ENABLED(p))
		pmc_restore_context(p);
#endif

	/*
	 * Make sure that MD code released the scheduler lock before
	 * resuming us.
	 */
	SCHED_ASSERT_UNLOCKED();

	/*
	 * We're running again; record our new start time.  We might
	 * be running on a new CPU now, so don't use the cached
	 * schedstate_percpu pointer.
	 */
	KDASSERT(l->l_cpu != NULL);
	KDASSERT(l->l_cpu == curcpu());
	microtime(&l->l_cpu->ci_schedstate.spc_runtime);

	/*
	 * Reacquire the kernel_lock now.  We do this after we've
	 * released the scheduler lock to avoid deadlock, and before
	 * we reacquire the interlock.
	 */
	KERNEL_LOCK_ACQUIRE_COUNT(hold_count);

	return retval;
}

/*
 * Initialize the (doubly-linked) run queues
 * to be empty.
 */
void
rqinit()
{
	int i;

	for (i = 0; i < RUNQUE_NQS; i++)
		sched_qs[i].ph_link = sched_qs[i].ph_rlink =
		    (struct lwp *)&sched_qs[i];
}

static __inline void
resched_proc(struct lwp *l, u_char pri)
{
	struct cpu_info *ci;

	/*
	 * XXXSMP
	 * Since l->l_cpu persists across a context switch,
	 * this gives us *very weak* processor affinity, in
	 * that we notify the CPU on which the process last
	 * ran that it should try to switch.
	 *
	 * This does not guarantee that the process will run on
	 * that processor next, because another processor might
	 * grab it the next time it performs a context switch.
	 *
	 * This also does not handle the case where its last
	 * CPU is running a higher-priority process, but every
	 * other CPU is running a lower-priority process.  There
	 * are ways to handle this situation, but they're not
	 * currently very pretty, and we also need to weigh the
	 * cost of moving a process from one CPU to another.
	 *
	 * XXXSMP
	 * There is also the issue of locking the other CPU's
	 * sched state, which we currently do not do.
	 */
	ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
	if (pri < ci->ci_schedstate.spc_curpriority)
		need_resched(ci);
}

/*
 * Change process state to be runnable,
 * placing it on the run queue if it is in memory,
 * and awakening the swapper if it isn't in memory.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;

	SCHED_ASSERT_LOCKED();

	switch (l->l_stat) {
	case 0:
	case LSRUN:
	case LSONPROC:
	case LSZOMB:
	case LSDEAD:
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_flag & P_TRACED) != 0 && p->p_xstat != 0) {
			sigaddset(&p->p_sigctx.ps_siglist, p->p_xstat);
			CHECKSIGS(p);
		}
		/* FALLTHROUGH */
	case LSSLEEP:
		unsleep(l);		/* e.g. when sending signals */
		break;

	case LSIDL:
		break;
	case LSSUSPENDED:
		break;
	}

	if (l->l_proc->p_sa)
		sa_awaken(l);

	l->l_stat = LSRUN;
	p->p_nrlwps++;

	if (l->l_flag & L_INMEM)
		setrunqueue(l);

	if (l->l_slptime > 1)
		updatepri(l);
	l->l_slptime = 0;
	if ((l->l_flag & L_INMEM) == 0)
		sched_wakeup((caddr_t)&proc0);
	else
		resched_proc(l, l->l_priority);
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
void
resetpriority(struct lwp *l)
{
	unsigned int newpriority;
	struct proc *p = l->l_proc;

	SCHED_ASSERT_LOCKED();

	newpriority = PUSER + p->p_estcpu +
	    NICE_WEIGHT * (p->p_nice - NZERO);
	newpriority = min(newpriority, MAXPRI);
	l->l_usrpri = newpriority;
	resched_proc(l, l->l_usrpri);
}

/*
 * Recompute priority for all LWPs in a process.
 */
void
resetprocpriority(struct proc *p)
{
	struct lwp *l;

	LIST_FOREACH(l, &p->p_lwps, l_sibling)
		resetpriority(l);
}

/*
 * We adjust the priority of the current process.  The priority of a process
 * gets worse as it accumulates CPU time.  The CPU usage estimator (p_estcpu)
 * is increased here.  The formula for computing priorities (in kern_synch.c)
 * will compute a different value each time p_estcpu increases.  This can
 * cause a switch, but unless the priority crosses a PPQ boundary the actual
 * queue will not change.  The CPU usage estimator ramps up quite quickly
 * when the process is running (linearly), and decays away exponentially, at
 * a rate which is proportionally slower when the system is busy.  The basic
 * principle is that the system will 90% forget that the process used a lot
 * of CPU time in 5 * loadav seconds.  This causes the system to favor
 * processes which haven't run much recently, and to round-robin among other
 * processes.
 */
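
/*
 * Worked example (illustrative only): with p_nice == NZERO,
 * resetpriority() computes l_usrpri = PUSER + p_estcpu, clamped to
 * MAXPRI.  Each run queue covers PPQ adjacent priority values (4, with
 * the 128 priorities spread over the 32 queues used below), so although
 * every schedclock() call below bumps p_estcpu, and hence the priority,
 * by one, the LWP only moves to the next run queue when
 * PUSER + p_estcpu crosses a multiple of PPQ, i.e. roughly every fourth
 * tick charged to it.  schedcpu() and updatepri() later decay p_estcpu
 * again, letting the priority drift back toward PUSER.
 */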

void
schedclock(struct lwp *l)
{
	struct proc *p = l->l_proc;
	int s;

	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
	SCHED_LOCK(s);
	resetpriority(l);
	SCHED_UNLOCK(s);

	if (l->l_priority >= PUSER)
		l->l_priority = l->l_usrpri;
}

void
suspendsched()
{
	struct lwp *l;
	int s;

	/*
	 * Convert all non-P_SYSTEM LSSLEEP or LSRUN LWPs to
	 * LSSUSPENDED.
	 */
	proclist_lock_read();
	SCHED_LOCK(s);
	LIST_FOREACH(l, &alllwp, l_list) {
		if ((l->l_proc->p_flag & P_SYSTEM) != 0)
			continue;

		switch (l->l_stat) {
		case LSRUN:
			l->l_proc->p_nrlwps--;
			if ((l->l_flag & L_INMEM) != 0)
				remrunqueue(l);
			/* FALLTHROUGH */
		case LSSLEEP:
			l->l_stat = LSSUSPENDED;
			break;
		case LSONPROC:
			/*
			 * XXX SMP: we need to deal with LWPs on
			 * other CPUs!
			 */
			break;
		default:
			break;
		}
	}
	SCHED_UNLOCK(s);
	proclist_unlock_read();
}

/*
 * Low-level routines to access the run queue.  Optimised assembler
 * routines can override these.
 */

#ifndef __HAVE_MD_RUNQUEUE

/*
 * On some architectures, it's faster to use a MSB ordering for the
 * priorities than the traditional LSB ordering.
 */
#ifdef __HAVE_BIGENDIAN_BITOPS
#define	RQMASK(n) (0x80000000 >> (n))
#else
#define	RQMASK(n) (0x00000001 << (n))
#endif

/*
 * The primitives that manipulate the run queues.  whichqs tells which
 * of the 32 queues qs have processes in them.  setrunqueue() puts processes
 * into queues, remrunqueue() removes them from queues.  The running process
 * is on no queue; other processes are on a queue related to p->p_priority,
 * divided by 4 to shrink the 0-127 range of priorities into the 32 available
 * queues.
 */
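
/*
 * For illustration (a sketch, not the machine-dependent implementation):
 * priorities 0-3 map to queue 0, 4-7 to queue 1, ..., 124-127 to queue
 * 31, and bit n of sched_whichqs is set exactly when sched_qs[n] is
 * non-empty.  With the LSB ordering above, cpu_switch() conceptually
 * picks the next LWP with something like:
 *
 *	if (sched_whichqs != 0) {
 *		whichq = ffs(sched_whichqs) - 1;
 *		l = sched_qs[whichq].ph_link;
 *		remrunqueue(l);
 *	}
 *
 * (with the best queue being the most significant set bit instead when
 * __HAVE_BIGENDIAN_BITOPS is defined).
 */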

#ifdef RQDEBUG
static void
checkrunqueue(int whichq, struct lwp *l)
{
	const struct prochd * const rq = &sched_qs[whichq];
	struct lwp *l2;
	int found = 0;
	int die = 0;
	int empty = 1;

	for (l2 = rq->ph_link; l2 != (void *)rq; l2 = l2->l_forw) {
		if (l2->l_stat != LSRUN) {
			printf("checkrunqueue[%d]: lwp %p state (%d) "
			    "!= LSRUN\n", whichq, l2, l2->l_stat);
		}
		if (l2->l_back->l_forw != l2) {
			printf("checkrunqueue[%d]: lwp %p back-qptr (%p) "
			    "corrupt %p\n", whichq, l2, l2->l_back,
			    l2->l_back->l_forw);
			die = 1;
		}
		if (l2->l_forw->l_back != l2) {
			printf("checkrunqueue[%d]: lwp %p forw-qptr (%p) "
			    "corrupt %p\n", whichq, l2, l2->l_forw,
			    l2->l_forw->l_back);
			die = 1;
		}
		if (l2 == l)
			found = 1;
		empty = 0;
	}
	if (empty && (sched_whichqs & RQMASK(whichq)) != 0) {
		printf("checkrunqueue[%d]: bit set for empty run-queue %p\n",
		    whichq, rq);
		die = 1;
	} else if (!empty && (sched_whichqs & RQMASK(whichq)) == 0) {
		printf("checkrunqueue[%d]: bit clear for non-empty "
		    "run-queue %p\n", whichq, rq);
		die = 1;
	}
	if (l != NULL && (sched_whichqs & RQMASK(whichq)) == 0) {
		printf("checkrunqueue[%d]: bit clear for active lwp %p\n",
		    whichq, l);
		die = 1;
	}
	if (l != NULL && empty) {
		printf("checkrunqueue[%d]: empty run-queue %p with "
		    "active lwp %p\n", whichq, rq, l);
		die = 1;
	}
	if (l != NULL && !found) {
		printf("checkrunqueue[%d]: lwp %p not in runqueue %p!\n",
		    whichq, l, rq);
		die = 1;
	}
	if (die)
		panic("checkrunqueue: inconsistency found");
}
#endif /* RQDEBUG */

void
setrunqueue(struct lwp *l)
{
	struct prochd *rq;
	struct lwp *prev;
	const int whichq = l->l_priority / 4;

#ifdef RQDEBUG
	checkrunqueue(whichq, NULL);
#endif
#ifdef DIAGNOSTIC
	if (l->l_back != NULL || l->l_wchan != NULL || l->l_stat != LSRUN)
		panic("setrunqueue");
#endif
	sched_whichqs |= RQMASK(whichq);
	rq = &sched_qs[whichq];
	prev = rq->ph_rlink;
	l->l_forw = (struct lwp *)rq;
	rq->ph_rlink = l;
	prev->l_forw = l;
	l->l_back = prev;
#ifdef RQDEBUG
	checkrunqueue(whichq, l);
#endif
}

void
remrunqueue(struct lwp *l)
{
	struct lwp *prev, *next;
	const int whichq = l->l_priority / 4;

#ifdef RQDEBUG
	checkrunqueue(whichq, l);
#endif
#ifdef DIAGNOSTIC
	if (((sched_whichqs & RQMASK(whichq)) == 0))
		panic("remrunqueue: bit %d not set", whichq);
#endif
	prev = l->l_back;
	l->l_back = NULL;
	next = l->l_forw;
	prev->l_forw = next;
	next->l_back = prev;
	if (prev == next)
		sched_whichqs &= ~RQMASK(whichq);
#ifdef RQDEBUG
	checkrunqueue(whichq, NULL);
#endif
}

#undef RQMASK
#endif /* !defined(__HAVE_MD_RUNQUEUE) */