/*	$NetBSD: kern_synch.c,v 1.135 2003/07/28 23:35:20 matt Exp $	*/

/*-
 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.135 2003/07/28 23:35:20 matt Exp $");

#include "opt_ddb.h"
#include "opt_ktrace.h"
#include "opt_kstack.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_perfctrs.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/buf.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sa.h>
#include <sys/savar.h>

#include <uvm/uvm_extern.h>

#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

int	lbolt;			/* once a second sleep address */
int	rrticks;		/* number of hardclock ticks per roundrobin() */

/*
 * The global scheduler state.
 */
struct prochd sched_qs[RUNQUE_NQS];	/* run queues */
__volatile u_int32_t sched_whichqs;	/* bitmap of non-empty queues */
struct slpque sched_slpque[SLPQUE_TABLESIZE]; /* sleep queues */

struct simplelock sched_lock = SIMPLELOCK_INITIALIZER;

void schedcpu(void *);
void updatepri(struct lwp *);
void endtsleep(void *);

__inline void awaken(struct lwp *);

struct callout schedcpu_ch = CALLOUT_INITIALIZER;

/*
 * Force switch among equal priority processes every 100ms.
 * Called from hardclock every hz/10 == rrticks hardclock ticks.
 */
/* ARGSUSED */
void
roundrobin(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;

	spc->spc_rrticks = rrticks;

	if (curlwp != NULL) {
		if (spc->spc_flags & SPCF_SEENRR) {
			/*
			 * The process has already been through a roundrobin
			 * without switching and may be hogging the CPU.
			 * Indicate that the process should yield.
			 */
			spc->spc_flags |= SPCF_SHOULDYIELD;
		} else
			spc->spc_flags |= SPCF_SEENRR;
	}
	need_resched(curcpu());
}

/*
 * Constants for digital decay and forget:
 *	90% of (p_estcpu) usage in 5 * loadav time
 *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 *	Note that, as ps(1) mentions, this can let percentages
 *	total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that hardclock updates p_estcpu and p_cpticks independently.
 *
 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 *	for (i = 0; i < (5 * loadavg); i++)
 *		p_estcpu *= decay;
 * will compute
 *	p_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 *	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 *	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 *	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 *	b = 2 * loadavg
 * then
 *	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *	For x close to zero, exp(x) =~ 1 + x, since
 *		exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *		therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *	For x close to zero, ln(1+x) =~ x, since
 *		ln(1+x) = x - x**2/2 + x**3/3 - ...	-1 < x < 1
 *		therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *	ln(.1) =~ -2.30
 *
 * Proof of (1):
 *	Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *	ln(factor) =~ (-2.30/5*loadav), or
 *	factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *	    exp(-1/b) =~ (b-1)/b =~ b/(b+1).			QED
 *
 * Proof of (2):
 *	Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *	power*ln(b/(b+1)) =~ -2.30, or
 *	power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.	QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *	loadav: 1	2	3	4
 *	power:	5.68	10.32	14.94	19.55
 */

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
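
/*
 * Illustrative worked example (numbers chosen here for concreteness, using
 * the FSCALE fixed-point representation the macros above already assume):
 * with a load average of 1.0, loadfactor() yields loadfac = 2 * FSCALE, so
 * decay_cpu() multiplies by 2*FSCALE / (2*FSCALE + FSCALE) = 2/3 each second.
 * Applying that five times to p_estcpu = 255 leaves about
 * 255 * (2/3)**5 =~ 33, i.e. roughly 87% of the accumulated CPU usage has
 * been forgotten after 5 * loadavg seconds, matching the ~90% target above.
 */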

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the percentage of CPU used by a
 * process.
 */
#define	CCPU_SHIFT	11
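
/*
 * Illustrative example: ccpu is exp(-1/20) in FSCALE fixed point, and
 * schedcpu() multiplies p_pctcpu by it once per second, so after 60 idle
 * seconds p_pctcpu has been scaled by exp(-60/20) = exp(-3) =~ 0.05;
 * roughly 95% of the old %CPU figure has been forgotten, as advertised
 * above.
 */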

/*
 * Recompute process priorities, every hz ticks.
 */
/* ARGSUSED */
void
schedcpu(void *arg)
{
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct lwp *l;
	struct proc *p;
	int s, minslp;
	unsigned int newcpu;
	int clkhz;

	proclist_lock_read();
	LIST_FOREACH(p, &allproc, p_list) {
		/*
		 * Increment time in/out of memory and sleep time
		 * (if sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		minslp = 2;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			l->l_swtime++;
			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
			    l->l_stat == LSSUSPENDED) {
				l->l_slptime++;
				minslp = min(minslp, l->l_slptime);
			} else
				minslp = 0;
		}
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (minslp > 1)
			continue;
		s = splstatclock();	/* prevent state changes */
		/*
		 * p_pctcpu is only for ps.
		 */
		clkhz = stathz != 0 ? stathz : hz;
#if	(FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (clkhz == 100) ?
		    ((fixpt_t)p->p_cpticks) << (FSHIFT - CCPU_SHIFT) :
		    100 * (((fixpt_t)p->p_cpticks)
		    << (FSHIFT - CCPU_SHIFT)) / clkhz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
		    (p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		newcpu = (u_int)decay_cpu(loadfac, p->p_estcpu);
		p->p_estcpu = newcpu;
		splx(s);	/* Done with the process CPU ticks update */
		SCHED_LOCK(s);
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l->l_slptime > 1)
				continue;
			resetpriority(l);
			if (l->l_priority >= PUSER) {
				if (l->l_stat == LSRUN &&
				    (l->l_flag & L_INMEM) &&
				    (l->l_priority / PPQ) != (l->l_usrpri / PPQ)) {
					remrunqueue(l);
					l->l_priority = l->l_usrpri;
					setrunqueue(l);
				} else
					l->l_priority = l->l_usrpri;
			}
		}
		SCHED_UNLOCK(s);
	}
	proclist_unlock_read();
	uvm_meter();
	wakeup((caddr_t)&lbolt);
	callout_reset(&schedcpu_ch, hz, schedcpu, NULL);
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay p_estcpu to zero.
 */
void
updatepri(struct lwp *l)
{
	struct proc *p = l->l_proc;
	unsigned int newcpu;
	fixpt_t loadfac;

	SCHED_ASSERT_LOCKED();

	newcpu = p->p_estcpu;
	loadfac = loadfactor(averunnable.ldavg[0]);

	if (l->l_slptime > 5 * loadfac)
		p->p_estcpu = 0; /* XXX NJWLWP */
	else {
		l->l_slptime--;	/* the first time was done in schedcpu */
		while (newcpu && --l->l_slptime)
			newcpu = (int)decay_cpu(loadfac, newcpu);
		p->p_estcpu = newcpu;
	}
	resetpriority(l);
}

/*
 * During autoconfiguration or after a panic, a sleep will simply
 * lower the priority briefly to allow interrupts, then return.
 * The priority to be used (safepri) is machine-dependent, thus this
 * value is initialized and maintained in the machine-dependent layers.
 * This priority will typically be 0, or the lowest priority
 * that is safe for use on the interrupt stack; it can be made
 * higher to block network software interrupts after panics.
 */
int	safepri;

/*
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If pri includes the PCATCH flag, signals are
 * checked before and after sleeping, else signals are not checked.
 * Returns 0 if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH
 * is set and a signal needs to be delivered, ERESTART is returned if the
 * current system call should be restarted if possible, and EINTR is
 * returned if the system call should be interrupted by the signal.
 *
 * The interlock is held until the scheduler lock (sched_lock) is acquired.
 * The interlock will be locked before returning back to the caller
 * unless the PNORELOCK flag is specified, in which case the
 * interlock will always be unlocked upon return.
 */
int
ltsleep(const void *ident, int priority, const char *wmesg, int timo,
    __volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	struct proc *p = l ? l->l_proc : NULL;
	struct slpque *qp;
	int sig, s;
	int catch = priority & PCATCH;
	int relock = (priority & PNORELOCK) == 0;
	int exiterr = (priority & PNOEXITERR) == 0;

	/*
	 * XXXSMP
	 * This is probably bogus.  Figure out what the right
	 * thing to do here really is.
	 * Note that not sleeping if ltsleep is called with curlwp == NULL
	 * in the shutdown case is disgusting but partly necessary given
	 * how shutdown (barely) works.
	 */
	if (cold || (doing_shutdown && (panicstr || (l == NULL)))) {
		/*
		 * After a panic, or during autoconfiguration,
		 * just give interrupts a chance, then just return;
		 * don't run any other procs or panic below,
		 * in case this is the idle process and already asleep.
		 */
		s = splhigh();
		splx(safepri);
		splx(s);
		if (interlock != NULL && relock == 0)
			simple_unlock(interlock);
		return (0);
	}

	KASSERT(p != NULL);
	LOCK_ASSERT(interlock == NULL || simple_lock_held(interlock));

#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p, 1, 0);
#endif

	SCHED_LOCK(s);

#ifdef DIAGNOSTIC
	if (ident == NULL)
		panic("ltsleep: ident == NULL");
	if (l->l_stat != LSONPROC)
		panic("ltsleep: l_stat %d != LSONPROC", l->l_stat);
	if (l->l_back != NULL)
		panic("ltsleep: l_back != NULL");
#endif

	l->l_wchan = ident;
	l->l_wmesg = wmesg;
	l->l_slptime = 0;
	l->l_priority = priority & PRIMASK;

	qp = SLPQUE(ident);
	if (qp->sq_head == 0)
		qp->sq_head = l;
	else {
		*qp->sq_tailp = l;
	}
	*(qp->sq_tailp = &l->l_forw) = 0;

	if (timo)
		callout_reset(&l->l_tsleep_ch, timo, endtsleep, l);

	/*
	 * We can now release the interlock; the scheduler lock
	 * is held, so a thread can't get in to do wakeup() before
	 * we do the switch.
	 *
	 * XXX We leave the code block here, after inserting ourselves
	 * on the sleep queue, because we might want a more clever
	 * data structure for the sleep queues at some point.
	 */
	if (interlock != NULL)
		simple_unlock(interlock);

	/*
	 * We put ourselves on the sleep queue and start our timeout
	 * before calling CURSIG, as we could stop there, and a wakeup
	 * or a SIGCONT (or both) could occur while we were stopped.
	 * A SIGCONT would cause us to be marked as SSLEEP
	 * without resuming us, thus we must be ready for sleep
	 * when CURSIG is called.  If the wakeup happens while we're
	 * stopped, l->l_wchan will be 0 upon return from CURSIG.
	 */
	if (catch) {
		l->l_flag |= L_SINTR;
		if (((sig = CURSIG(l)) != 0) || (p->p_flag & P_WEXIT)) {
			if (l->l_wchan != NULL)
				unsleep(l);
			l->l_stat = LSONPROC;
			SCHED_UNLOCK(s);
			goto resume;
		}
		if (l->l_wchan == NULL) {
			catch = 0;
			SCHED_UNLOCK(s);
			goto resume;
		}
	} else
		sig = 0;
	l->l_stat = LSSLEEP;
	p->p_nrlwps--;
	p->p_stats->p_ru.ru_nvcsw++;
	SCHED_ASSERT_LOCKED();
	if (l->l_flag & L_SA)
		sa_switch(l, SA_UPCALL_BLOCKED);
	else
		mi_switch(l, NULL);

#if	defined(DDB) && !defined(GPROF)
	/* handy breakpoint location after process "wakes" */
	__asm(".globl bpendtsleep ; bpendtsleep:");
#endif
	/*
	 * p->p_nrlwps is incremented by whoever made us runnable again,
	 * either setrunnable() or awaken().
	 */

	SCHED_ASSERT_UNLOCKED();
	splx(s);

 resume:
	KDASSERT(l->l_cpu != NULL);
	KDASSERT(l->l_cpu == curcpu());
	l->l_cpu->ci_schedstate.spc_curpriority = l->l_usrpri;

	l->l_flag &= ~L_SINTR;
	if (l->l_flag & L_TIMEOUT) {
		l->l_flag &= ~(L_TIMEOUT|L_CANCELLED);
		if (sig == 0) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p, 0, 0);
#endif
			if (relock && interlock != NULL)
				simple_lock(interlock);
			return (EWOULDBLOCK);
		}
	} else if (timo)
		callout_stop(&l->l_tsleep_ch);

	if (catch) {
		const int cancelled = l->l_flag & L_CANCELLED;
		l->l_flag &= ~L_CANCELLED;
		if (sig != 0 || (sig = CURSIG(l)) != 0 || cancelled) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p, 0, 0);
#endif
			if (relock && interlock != NULL)
				simple_lock(interlock);
			/*
			 * If this sleep was canceled, don't let the syscall
			 * restart.
			 */
			if (cancelled ||
			    (SIGACTION(p, sig).sa_flags & SA_RESTART) == 0)
				return (EINTR);
			return (ERESTART);
		}
	}

#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p, 0, 0);
#endif
	if (relock && interlock != NULL)
		simple_lock(interlock);

	/*
	 * XXXNJW this is very much a kluge; revisit.  A better way of
	 * preventing looping/hanging syscalls like wait4() and _lwp_wait()
	 * from wedging an exiting process would be preferred.
	 */
	if (catch && ((p->p_flag & P_WEXIT) && exiterr))
		return (EINTR);
	return (0);
}
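
/*
 * Illustrative usage sketch (not part of the kernel proper): the canonical
 * ltsleep()/wakeup() pattern with a simple_lock interlock.  The softc, its
 * sc_slock/sc_busy members and the "xcbusy" wait message are hypothetical
 * names, shown only to make the interlock hand-off concrete; the fragment
 * assumes it runs inside a driver routine that can return an error.
 */
#if 0
	int error;

	simple_lock(&sc->sc_slock);
	while (sc->sc_busy) {
		/* sc_slock is dropped while asleep and re-taken on return */
		error = ltsleep(&sc->sc_busy, PRIBIO | PCATCH, "xcbusy",
		    0, &sc->sc_slock);
		if (error) {
			simple_unlock(&sc->sc_slock);
			return (error);
		}
	}
	sc->sc_busy = 1;
	simple_unlock(&sc->sc_slock);

	/* ... do the work ... */

	simple_lock(&sc->sc_slock);
	sc->sc_busy = 0;
	simple_unlock(&sc->sc_slock);
	wakeup(&sc->sc_busy);		/* wake all waiters on the channel */
#endif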

/*
 * Implement timeout for tsleep.
 * If process hasn't been awakened (wchan non-zero),
 * set timeout flag and undo the sleep.  If proc
 * is stopped, just unsleep so it will remain stopped.
 */
void
endtsleep(void *arg)
{
	struct lwp *l;
	int s;

	l = (struct lwp *)arg;
	SCHED_LOCK(s);
	if (l->l_wchan) {
		if (l->l_stat == LSSLEEP)
			setrunnable(l);
		else
			unsleep(l);
		l->l_flag |= L_TIMEOUT;
	}
	SCHED_UNLOCK(s);
}

/*
 * Remove a process from its wait queue.
 */
void
unsleep(struct lwp *l)
{
	struct slpque *qp;
	struct lwp **hp;

	SCHED_ASSERT_LOCKED();

	if (l->l_wchan) {
		hp = &(qp = SLPQUE(l->l_wchan))->sq_head;
		while (*hp != l)
			hp = &(*hp)->l_forw;
		*hp = l->l_forw;
		if (qp->sq_tailp == &l->l_forw)
			qp->sq_tailp = hp;
		l->l_wchan = 0;
	}
}

/*
 * Optimized-for-wakeup() version of setrunnable().
 */
__inline void
awaken(struct lwp *l)
{

	SCHED_ASSERT_LOCKED();

	if (l->l_slptime > 1)
		updatepri(l);
	l->l_slptime = 0;
	l->l_stat = LSRUN;
	l->l_proc->p_nrlwps++;
	/*
	 * Since curpriority is a user priority, p->p_priority
	 * is always better than curpriority on the last CPU on
	 * which it ran.
	 *
	 * XXXSMP See affinity comment in resched_proc().
	 */
	if (l->l_flag & L_INMEM) {
		setrunqueue(l);
		if (l->l_flag & L_SA)
			l->l_proc->p_sa->sa_woken = l;
		KASSERT(l->l_cpu != NULL);
		need_resched(l->l_cpu);
	} else
		sched_wakeup(&proc0);
}

#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
void
sched_unlock_idle(void)
{

	simple_unlock(&sched_lock);
}

void
sched_lock_idle(void)
{

	simple_lock(&sched_lock);
}
#endif /* MULTIPROCESSOR || LOCKDEBUG */

/*
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(const void *ident)
{
	int s;

	SCHED_ASSERT_UNLOCKED();

	SCHED_LOCK(s);
	sched_wakeup(ident);
	SCHED_UNLOCK(s);
}

void
sched_wakeup(const void *ident)
{
	struct slpque *qp;
	struct lwp *l, **q;

	SCHED_ASSERT_LOCKED();

	qp = SLPQUE(ident);
 restart:
	for (q = &qp->sq_head; (l = *q) != NULL; ) {
#ifdef DIAGNOSTIC
		if (l->l_back || (l->l_stat != LSSLEEP &&
		    l->l_stat != LSSTOP && l->l_stat != LSSUSPENDED))
			panic("wakeup");
#endif
		if (l->l_wchan == ident) {
			l->l_wchan = 0;
			*q = l->l_forw;
			if (qp->sq_tailp == &l->l_forw)
				qp->sq_tailp = q;
			if (l->l_stat == LSSLEEP) {
				awaken(l);
				goto restart;
			}
		} else
			q = &l->l_forw;
	}
}

/*
 * Make the highest priority process first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(const void *ident)
{
	struct slpque *qp;
	struct lwp *l, **q;
	struct lwp *best_sleepp, **best_sleepq;
	struct lwp *best_stopp, **best_stopq;
	int s;

	best_sleepp = best_stopp = NULL;
	best_sleepq = best_stopq = NULL;

	SCHED_LOCK(s);

	qp = SLPQUE(ident);

	for (q = &qp->sq_head; (l = *q) != NULL; q = &l->l_forw) {
#ifdef DIAGNOSTIC
		if (l->l_back || (l->l_stat != LSSLEEP &&
		    l->l_stat != LSSTOP && l->l_stat != LSSUSPENDED))
			panic("wakeup_one");
#endif
		if (l->l_wchan == ident) {
			if (l->l_stat == LSSLEEP) {
				if (best_sleepp == NULL ||
				    l->l_priority < best_sleepp->l_priority) {
					best_sleepp = l;
					best_sleepq = q;
				}
			} else {
				if (best_stopp == NULL ||
				    l->l_priority < best_stopp->l_priority) {
					best_stopp = l;
					best_stopq = q;
				}
			}
		}
	}

	/*
	 * Consider any SSLEEP process higher than the highest priority SSTOP
	 * process.
	 */
	if (best_sleepp != NULL) {
		l = best_sleepp;
		q = best_sleepq;
	} else {
		l = best_stopp;
		q = best_stopq;
	}

	if (l != NULL) {
		l->l_wchan = NULL;
		*q = l->l_forw;
		if (qp->sq_tailp == &l->l_forw)
			qp->sq_tailp = q;
		if (l->l_stat == LSSLEEP)
			awaken(l);
	}
	SCHED_UNLOCK(s);
}

/*
 * General yield call.  Puts the current process back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current process explicitly requests it (e.g., sched_yield(2) in compat
 * code).
 */
void
yield(void)
{
	struct lwp *l = curlwp;
	int s;

	SCHED_LOCK(s);
	l->l_priority = l->l_usrpri;
	l->l_stat = LSRUN;
	setrunqueue(l);
	l->l_proc->p_stats->p_ru.ru_nvcsw++;
	mi_switch(l, NULL);
	SCHED_ASSERT_UNLOCKED();
	splx(s);
}

/*
 * General preemption call.  Puts the current process back on its run queue
 * and performs an involuntary context switch.
 */
void
preempt(int more)
{
	struct lwp *l = curlwp;
	int r, s;

	/* XXXUPSXXX Not needed for SMP patch */
#if 0
	/* XXX Until the preempt() bug is fixed. */
	if (more && (l->l_proc->p_flag & P_SA)) {
		l->l_cpu->ci_schedstate.spc_flags &= ~SPCF_SWITCHCLEAR;
		return;
	}
#endif

	SCHED_LOCK(s);
	l->l_priority = l->l_usrpri;
	l->l_stat = LSRUN;
	setrunqueue(l);
	l->l_proc->p_stats->p_ru.ru_nivcsw++;
	r = mi_switch(l, NULL);
	SCHED_ASSERT_UNLOCKED();
	splx(s);
	if ((l->l_flag & L_SA) != 0 && r != 0 && more == 0)
		sa_preempt(l);
}

/*
 * The machine independent parts of context switch.
 * Must be called at splsched() (no higher!) and with
 * the sched_lock held.
 * Switch to "newl" if non-NULL, otherwise let cpu_switch choose
 * the next lwp.
 *
 * Returns 1 if another process was actually run.
 */
int
mi_switch(struct lwp *l, struct lwp *newl)
{
	struct schedstate_percpu *spc;
	struct rlimit *rlim;
	long s, u;
	struct timeval tv;
#if defined(MULTIPROCESSOR)
	int hold_count;
#endif
	struct proc *p = l->l_proc;
	int retval;

	SCHED_ASSERT_LOCKED();

#if defined(MULTIPROCESSOR)
	/*
	 * Release the kernel_lock, as we are about to yield the CPU.
	 * The scheduler lock is still held until cpu_switch()
	 * selects a new process and removes it from the run queue.
	 */
	if (l->l_flag & L_BIGLOCK)
		hold_count = spinlock_release_all(&kernel_lock);
#endif

	KDASSERT(l->l_cpu != NULL);
	KDASSERT(l->l_cpu == curcpu());

	spc = &l->l_cpu->ci_schedstate;

#if defined(LOCKDEBUG) || defined(DIAGNOSTIC)
	spinlock_switchcheck();
#endif
#ifdef LOCKDEBUG
	simple_lock_switchcheck();
#endif

	/*
	 * Compute the amount of time during which the current
	 * process was running.
	 */
	microtime(&tv);
	u = p->p_rtime.tv_usec +
	    (tv.tv_usec - spc->spc_runtime.tv_usec);
	s = p->p_rtime.tv_sec + (tv.tv_sec - spc->spc_runtime.tv_sec);
	if (u < 0) {
		u += 1000000;
		s--;
	} else if (u >= 1000000) {
		u -= 1000000;
		s++;
	}
	p->p_rtime.tv_usec = u;
	p->p_rtime.tv_sec = s;

	/*
	 * Check if the process exceeds its CPU resource allocation.
	 * If over max, kill it.  In any case, if it has run for more
	 * than 10 minutes, reduce priority to give others a chance.
	 */
	rlim = &p->p_rlimit[RLIMIT_CPU];
	if (s >= rlim->rlim_cur) {
		/*
		 * XXXSMP: we're inside the scheduler lock perimeter;
		 * use sched_psignal.
		 */
		if (s >= rlim->rlim_max)
			sched_psignal(p, SIGKILL);
		else {
			sched_psignal(p, SIGXCPU);
			if (rlim->rlim_cur < rlim->rlim_max)
				rlim->rlim_cur += 5;
		}
	}
	if (autonicetime && s > autonicetime && p->p_ucred->cr_uid &&
	    p->p_nice == NZERO) {
		p->p_nice = autoniceval + NZERO;
		resetpriority(l);
	}

	/*
	 * Process is about to yield the CPU; clear the appropriate
	 * scheduling flags.
	 */
	spc->spc_flags &= ~SPCF_SWITCHCLEAR;

#ifdef KSTACK_CHECK_MAGIC
	kstack_check_magic(l);
#endif

	/*
	 * If we are using h/w performance counters, save context.
	 */
#if PERFCTRS
	if (PMC_ENABLED(p))
		pmc_save_context(p);
#endif

	/*
	 * Switch to the new current process.  When we
	 * run again, we'll return back here.
	 */
	uvmexp.swtch++;
	if (newl == NULL) {
		retval = cpu_switch(l, NULL);
	} else {
		remrunqueue(newl);
		cpu_switchto(l, newl);
		retval = 0;
	}

	/*
	 * If we are using h/w performance counters, restore context.
	 */
#if PERFCTRS
	if (PMC_ENABLED(p))
		pmc_restore_context(p);
#endif

	/*
	 * Make sure that MD code released the scheduler lock before
	 * resuming us.
	 */
	SCHED_ASSERT_UNLOCKED();

	/*
	 * We're running again; record our new start time.  We might
	 * be running on a new CPU now, so don't use the cached
	 * schedstate_percpu pointer.
	 */
	KDASSERT(l->l_cpu != NULL);
	KDASSERT(l->l_cpu == curcpu());
	microtime(&l->l_cpu->ci_schedstate.spc_runtime);

#if defined(MULTIPROCESSOR)
	/*
	 * Reacquire the kernel_lock now.  We do this after we've
	 * released the scheduler lock to avoid deadlock, and before
	 * we reacquire the interlock.
	 */
	if (l->l_flag & L_BIGLOCK)
		spinlock_acquire_count(&kernel_lock, hold_count);
#endif

	return retval;
}
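
/*
 * Worked example of the run-time accounting in mi_switch() (the numbers are
 * illustrative only): with p_rtime = 10.900000s, spc_runtime = 100.400000s
 * and microtime() returning 103.700000s, the code computes
 * u = 900000 + 300000 = 1200000 and s = 10 + 3 = 13; the u >= 1000000
 * branch then carries, leaving p_rtime = 14.200000s, i.e. the old total
 * plus the 3.3 seconds just consumed on this CPU.
 */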

/*
 * Initialize the (doubly-linked) run queues
 * to be empty.
 */
void
rqinit()
{
	int i;

	for (i = 0; i < RUNQUE_NQS; i++)
		sched_qs[i].ph_link = sched_qs[i].ph_rlink =
		    (struct lwp *)&sched_qs[i];
}

static __inline void
resched_proc(struct lwp *l, u_char pri)
{
	struct cpu_info *ci;

	/*
	 * XXXSMP
	 * Since l->l_cpu persists across a context switch,
	 * this gives us *very weak* processor affinity, in
	 * that we notify the CPU on which the process last
	 * ran that it should try to switch.
	 *
	 * This does not guarantee that the process will run on
	 * that processor next, because another processor might
	 * grab it the next time it performs a context switch.
	 *
	 * This also does not handle the case where its last
	 * CPU is running a higher-priority process, but every
	 * other CPU is running a lower-priority process.  There
	 * are ways to handle this situation, but they're not
	 * currently very pretty, and we also need to weigh the
	 * cost of moving a process from one CPU to another.
	 *
	 * XXXSMP
	 * There is also the issue of locking the other CPU's
	 * sched state, which we currently do not do.
	 */
	ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
	if (pri < ci->ci_schedstate.spc_curpriority)
		need_resched(ci);
}

/*
 * Change process state to be runnable,
 * placing it on the run queue if it is in memory,
 * and awakening the swapper if it isn't in memory.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;

	SCHED_ASSERT_LOCKED();

	switch (l->l_stat) {
	case 0:
	case LSRUN:
	case LSONPROC:
	case LSZOMB:
	case LSDEAD:
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_flag & P_TRACED) != 0 && p->p_xstat != 0) {
			sigaddset(&p->p_sigctx.ps_siglist, p->p_xstat);
			CHECKSIGS(p);
		}
	case LSSLEEP:
		unsleep(l);		/* e.g. when sending signals */
		break;

	case LSIDL:
		break;
	case LSSUSPENDED:
		break;
	}
	l->l_stat = LSRUN;
	p->p_nrlwps++;

	if (l->l_flag & L_INMEM)
		setrunqueue(l);

	if (l->l_slptime > 1)
		updatepri(l);
	l->l_slptime = 0;
	if ((l->l_flag & L_INMEM) == 0)
		sched_wakeup((caddr_t)&proc0);
	else
		resched_proc(l, l->l_priority);
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
void
resetpriority(struct lwp *l)
{
	unsigned int newpriority;
	struct proc *p = l->l_proc;

	SCHED_ASSERT_LOCKED();

	newpriority = PUSER + p->p_estcpu +
	    NICE_WEIGHT * (p->p_nice - NZERO);
	newpriority = min(newpriority, MAXPRI);
	l->l_usrpri = newpriority;
	resched_proc(l, l->l_usrpri);
}
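
/*
 * Illustrative example of the priority calculation above (the numeric
 * values of PUSER, NICE_WEIGHT, NZERO and PPQ quoted here are the customary
 * ones from sys/sched.h and sys/param.h, shown only for concreteness):
 * with PUSER = 50, NICE_WEIGHT = 2 and NZERO = 20, an un-niced process with
 * p_estcpu = 24 gets l_usrpri = 50 + 24 + 2 * 0 = 74; since PPQ = 4 it
 * lands on run queue 74 / PPQ = 18.  A "nice 19" process with the same
 * p_estcpu gets 74 + 2 * 19 = 112, i.e. queue 28, so it is only chosen
 * when the better queues are empty.
 */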

/*
 * Recompute priority for all LWPs in a process.
 */
void
resetprocpriority(struct proc *p)
{
	struct lwp *l;

	LIST_FOREACH(l, &p->p_lwps, l_sibling)
		resetpriority(l);
}

/*
 * We adjust the priority of the current process.  The priority of a process
 * gets worse as it accumulates CPU time.  The CPU usage estimator (p_estcpu)
 * is increased here.  The formula for computing priorities (in kern_synch.c)
 * will compute a different value each time p_estcpu increases.  This can
 * cause a switch, but unless the priority crosses a PPQ boundary the actual
 * queue will not change.  The CPU usage estimator ramps up quite quickly
 * when the process is running (linearly), and decays away exponentially, at
 * a rate which is proportionally slower when the system is busy.  The basic
 * principle is that the system will 90% forget that the process used a lot
 * of CPU time in 5 * loadav seconds.  This causes the system to favor
 * processes which haven't run much recently, and to round-robin among other
 * processes.
 */

void
schedclock(struct lwp *l)
{
	struct proc *p = l->l_proc;
	int s;

	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
	SCHED_LOCK(s);
	resetpriority(l);
	SCHED_UNLOCK(s);

	if (l->l_priority >= PUSER)
		l->l_priority = l->l_usrpri;
}

void
suspendsched()
{
	struct lwp *l;
	int s;

	/*
	 * Convert all non-P_SYSTEM LSSLEEP or LSRUN LWPs to
	 * LSSUSPENDED.
	 */
	proclist_lock_read();
	SCHED_LOCK(s);
	LIST_FOREACH(l, &alllwp, l_list) {
		if ((l->l_proc->p_flag & P_SYSTEM) != 0)
			continue;

		switch (l->l_stat) {
		case LSRUN:
			l->l_proc->p_nrlwps--;
			if ((l->l_flag & L_INMEM) != 0)
				remrunqueue(l);
			/* FALLTHROUGH */
		case LSSLEEP:
			l->l_stat = LSSUSPENDED;
			break;
		case LSONPROC:
			/*
			 * XXX SMP: we need to deal with LWPs on
			 * other CPUs!
			 */
			break;
		default:
			break;
		}
	}
	SCHED_UNLOCK(s);
	proclist_unlock_read();
}

/*
 * Low-level routines to access the run queue.  Optimised assembler
 * routines can override these.
 */

#ifndef __HAVE_MD_RUNQUEUE

/*
 * On some architectures, it's faster to use an MSB ordering for the
 * priorities than the traditional LSB ordering.
 */
#ifdef __HAVE_BIGENDIAN_BITOPS
#define	RQMASK(n) (0x80000000 >> (n))
#else
#define	RQMASK(n) (0x00000001 << (n))
#endif

/*
 * The primitives that manipulate the run queues.  whichqs tells which
 * of the 32 queues qs have processes in them.  Setrunqueue puts processes
 * into queues, remrunqueue removes them from queues.  The running process is
 * on no queue, other processes are on a queue related to p->p_priority,
 * divided by 4 actually to shrink the 0-127 range of priorities into the 32
 * available queues.
 */
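
/*
 * Illustrative example: an LWP at priority 74 goes on sched_qs[74 / 4] =
 * sched_qs[18], and RQMASK(18) is set in sched_whichqs, so the
 * context-switch code can locate the best non-empty queue by finding the
 * first set bit of the word rather than scanning all 32 queues.
 */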

void
setrunqueue(struct lwp *l)
{
	struct prochd *rq;
	struct lwp *prev;
	const int whichq = l->l_priority / 4;

#ifdef DIAGNOSTIC
	if (l->l_back != NULL || l->l_wchan != NULL || l->l_stat != LSRUN)
		panic("setrunqueue");
#endif
	sched_whichqs |= RQMASK(whichq);
	rq = &sched_qs[whichq];
	prev = rq->ph_rlink;
	l->l_forw = (struct lwp *)rq;
	rq->ph_rlink = l;
	prev->l_forw = l;
	l->l_back = prev;
}

void
remrunqueue(struct lwp *l)
{
	struct lwp *prev, *next;
	const int whichq = l->l_priority / 4;

#ifdef DIAGNOSTIC
	if (((sched_whichqs & RQMASK(whichq)) == 0))
		panic("remrunqueue");
#endif
	prev = l->l_back;
	l->l_back = NULL;
	next = l->l_forw;
	prev->l_forw = next;
	next->l_back = prev;
	if (prev == next)
		sched_whichqs &= ~RQMASK(whichq);
}

#undef RQMASK
#endif /* !defined(__HAVE_MD_RUNQUEUE) */
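
/*
 * Illustrative sketch (not part of this file): the MD cpu_switch()
 * implementations consume sched_whichqs roughly like the C below, picking
 * the lowest-numbered (best-priority) non-empty queue and dequeueing its
 * head.  The LSB queue ordering and the ffs() call are assumptions made for
 * the sake of the example; __HAVE_BIGENDIAN_BITOPS ports scan from the MSB
 * instead, and the real selection code is hand-tuned assembler.
 */
#if 0
static struct lwp *
nextrunqueue(void)
{
	struct prochd *rq;
	struct lwp *l;
	int whichq;

	if (sched_whichqs == 0)
		return (NULL);			/* nothing runnable; idle */

	whichq = ffs(sched_whichqs) - 1;	/* lowest set bit = best queue */
	rq = &sched_qs[whichq];
	l = rq->ph_link;			/* head of that queue */

	/* Unlink the head; the prochd doubles as the list sentinel. */
	rq->ph_link = l->l_forw;
	l->l_forw->l_back = (struct lwp *)rq;
	l->l_back = NULL;
	if (rq->ph_link == (struct lwp *)rq)	/* queue now empty? */
		sched_whichqs &= ~(0x00000001 << whichq);
	return (l);
}
#endif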