/*	$NetBSD: kern_synch.c,v 1.141 2004/02/13 11:36:23 wiz Exp $	*/

/*-
 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *        This product includes software developed by the NetBSD
 *        Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.141 2004/02/13 11:36:23 wiz Exp $");

#include "opt_ddb.h"
#include "opt_ktrace.h"
#include "opt_kstack.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_perfctrs.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/buf.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sa.h>
#include <sys/savar.h>

#include <uvm/uvm_extern.h>

#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

int	lbolt;			/* once a second sleep address */
int	rrticks;		/* number of hardclock ticks per roundrobin() */

/*
 * The global scheduler state.
 */
struct prochd sched_qs[RUNQUE_NQS];		/* run queues */
__volatile u_int32_t sched_whichqs;		/* bitmap of non-empty queues */
struct slpque sched_slpque[SLPQUE_TABLESIZE];	/* sleep queues */

struct simplelock sched_lock = SIMPLELOCK_INITIALIZER;

void schedcpu(void *);
void updatepri(struct lwp *);
void endtsleep(void *);

__inline void sa_awaken(struct lwp *);
__inline void awaken(struct lwp *);

struct callout schedcpu_ch = CALLOUT_INITIALIZER;

/*
 * Force switch among equal priority processes every 100ms.
 * Called from hardclock every hz/10 == rrticks hardclock ticks.
 */
/* ARGSUSED */
void
roundrobin(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;

	spc->spc_rrticks = rrticks;

	if (curlwp != NULL) {
		if (spc->spc_flags & SPCF_SEENRR) {
			/*
			 * The process has already been through a roundrobin
			 * without switching and may be hogging the CPU.
			 * Indicate that the process should yield.
			 */
			spc->spc_flags |= SPCF_SHOULDYIELD;
		} else
			spc->spc_flags |= SPCF_SEENRR;
	}
	need_resched(curcpu());
}
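
/*
 * Note: with the traditional hz = 100 clock, rrticks == hz / 10 == 10, so
 * roundrobin() runs every 10 hardclock ticks, i.e. every 100 ms.  A process
 * that is still on the CPU when a second consecutive roundrobin() arrives
 * (SPCF_SEENRR already set) is marked SPCF_SHOULDYIELD so that it gives up
 * the CPU at the next convenient point.  The value of hz is configuration
 * dependent; 100 is used here only as a worked example.
 */
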
/*
 * Constants for digital decay and forget:
 *	90% of (p_estcpu) usage in 5 * loadav time
 *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 *          Note that, as ps(1) mentions, this can let percentages
 *          total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that hardclock updates p_estcpu and p_cpticks independently.
 *
 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 * 	for (i = 0; i < (5 * loadavg); i++)
 * 		p_estcpu *= decay;
 * will compute
 * 	p_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 * 	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 * 	b = 2 * loadavg
 * then
 * 	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *	For x close to zero, exp(x) =~ 1 + x, since
 *		exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *		therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *	For x close to zero, ln(1+x) =~ x, since
 *		ln(1+x) = x - x**2/2 + x**3/3 - ...	-1 < x < 1
 *		therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *	ln(.1) =~ -2.30
 *
 * Proof of (1):
 *	Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *	ln(factor) =~ (-2.30/5*loadav), or
 *	factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *	    exp(-1/b) =~ (b-1)/b =~ b/(b+1).			QED
 *
 * Proof of (2):
 *	Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *	power*ln(b/(b+1)) =~ -2.30, or
 *	power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.	QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *	loadav: 1	2	3	4
 *	power:	5.68	10.32	14.94	19.55
 */

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the percentage of CPU used by a
 * process.
 */
#define	CCPU_SHIFT	11
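
/*
 * Illustrative sketch, not compiled into the kernel (hence the #if 0): a
 * stand-alone user-space program that mimics decay_cpu() with an assumed
 * FSHIFT of 11 to show that, at a load average of 2, ten applications of
 * the decay (5 * loadavg seconds worth of schedcpu() runs) forget roughly
 * 90% of p_estcpu.  The DEMO_* names and the starting estcpu of 100 exist
 * only for this demonstration.
 */
#if 0
#include <stdio.h>

#define DEMO_FSHIFT	11
#define DEMO_FSCALE	(1 << DEMO_FSHIFT)
#define demo_loadfactor(loadav)		(2 * (loadav))
#define demo_decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + DEMO_FSCALE))

int
main(void)
{
	long loadav = 2 * DEMO_FSCALE;		/* load average 2.0, fixed point */
	long loadfac = demo_loadfactor(loadav);
	long estcpu = 100;			/* assumed starting p_estcpu */
	int i;

	for (i = 0; i < 10; i++)		/* 5 * loadavg iterations */
		estcpu = demo_decay_cpu(loadfac, estcpu);
	printf("estcpu after 10 decays: %ld\n", estcpu);	/* prints 9 */
	return 0;
}
#endif
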
/*
 * Recompute process priorities, every hz ticks.
 */
/* ARGSUSED */
void
schedcpu(void *arg)
{
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct lwp *l;
	struct proc *p;
	int s, minslp;
	unsigned int newcpu;
	int clkhz;

	proclist_lock_read();
	LIST_FOREACH(p, &allproc, p_list) {
		/*
		 * Increment time in/out of memory and sleep time
		 * (if sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		minslp = 2;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			l->l_swtime++;
			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
			    l->l_stat == LSSUSPENDED) {
				l->l_slptime++;
				minslp = min(minslp, l->l_slptime);
			} else
				minslp = 0;
		}
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (minslp > 1)
			continue;
		s = splstatclock();	/* prevent state changes */
		/*
		 * p_pctcpu is only for ps.
		 */
		clkhz = stathz != 0 ? stathz : hz;
#if (FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (clkhz == 100) ?
		    ((fixpt_t)p->p_cpticks) << (FSHIFT - CCPU_SHIFT) :
		    100 * (((fixpt_t)p->p_cpticks)
		    << (FSHIFT - CCPU_SHIFT)) / clkhz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
		    (p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		newcpu = (u_int)decay_cpu(loadfac, p->p_estcpu);
		p->p_estcpu = newcpu;
		splx(s);	/* Done with the process CPU ticks update */
		SCHED_LOCK(s);
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l->l_slptime > 1)
				continue;
			resetpriority(l);
			if (l->l_priority >= PUSER) {
				if (l->l_stat == LSRUN &&
				    (l->l_flag & L_INMEM) &&
				    (l->l_priority / PPQ) != (l->l_usrpri / PPQ)) {
					remrunqueue(l);
					l->l_priority = l->l_usrpri;
					setrunqueue(l);
				} else
					l->l_priority = l->l_usrpri;
			}
		}
		SCHED_UNLOCK(s);
	}
	proclist_unlock_read();
	uvm_meter();
	wakeup((caddr_t)&lbolt);
	callout_reset(&schedcpu_ch, hz, schedcpu, NULL);
}
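
/*
 * Illustrative sketch, not compiled into the kernel (hence the #if 0): a
 * stand-alone simulation of the p_pctcpu recurrence above for the fast
 * path (FSHIFT >= CCPU_SHIFT) with an assumed 100 Hz statclock and
 * FSHIFT = 11.  Every simulated second the estimate is first decayed by
 * ccpu and then credited with the statclock ticks used.  For a process
 * that is busy the whole time, 60 seconds take the estimate roughly 95%
 * of the way to FSCALE (100% CPU), mirroring the "95% in 60 seconds"
 * constant above.  All DEMO_* values are assumptions for the example.
 */
#if 0
#include <stdio.h>

#define DEMO_FSHIFT	11
#define DEMO_FSCALE	(1 << DEMO_FSHIFT)
#define DEMO_CCPU_SHIFT	11

int
main(void)
{
	long demo_ccpu = 0.95122942450071400909 * DEMO_FSCALE; /* exp(-1/20) */
	long pctcpu = 0;
	int cpticks = 100;	/* fully busy at a 100 Hz statclock */
	int sec;

	for (sec = 1; sec <= 60; sec++) {
		pctcpu = (pctcpu * demo_ccpu) >> DEMO_FSHIFT;
		pctcpu += (long)cpticks << (DEMO_FSHIFT - DEMO_CCPU_SHIFT);
	}
	printf("pctcpu = %ld/%d (%.1f%% CPU)\n",
	    pctcpu, DEMO_FSCALE, pctcpu * 100.0 / DEMO_FSCALE);
	return 0;
}
#endif
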
/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay p_estcpu to zero.
 */
void
updatepri(struct lwp *l)
{
	struct proc *p = l->l_proc;
	unsigned int newcpu;
	fixpt_t loadfac;

	SCHED_ASSERT_LOCKED();

	newcpu = p->p_estcpu;
	loadfac = loadfactor(averunnable.ldavg[0]);

	if (l->l_slptime > 5 * loadfac)
		p->p_estcpu = 0; /* XXX NJWLWP */
	else {
		l->l_slptime--;	/* the first time was done in schedcpu */
		while (newcpu && --l->l_slptime)
			newcpu = (int) decay_cpu(loadfac, newcpu);
		p->p_estcpu = newcpu;
	}
	resetpriority(l);
}

/*
 * During autoconfiguration or after a panic, a sleep will simply
 * lower the priority briefly to allow interrupts, then return.
 * The priority to be used (safepri) is machine-dependent, thus this
 * value is initialized and maintained in the machine-dependent layers.
 * This priority will typically be 0, or the lowest priority
 * that is safe for use on the interrupt stack; it can be made
 * higher to block network software interrupts after panics.
 */
int safepri;

/*
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If pri includes the PCATCH flag, signals are
 * checked before and after sleeping, else signals are not checked.
 * Returns 0 if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH
 * is set and a signal needs to be delivered, ERESTART is returned if the
 * current system call should be restarted if possible, and EINTR is
 * returned if the system call should be interrupted by the signal.
 *
 * The interlock is held until the scheduler lock is acquired.  The
 * interlock will be locked before returning back to the caller
 * unless the PNORELOCK flag is specified, in which case the
 * interlock will always be unlocked upon return.
 */
int
ltsleep(const void *ident, int priority, const char *wmesg, int timo,
    __volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	struct proc *p = l ? l->l_proc : NULL;
	struct slpque *qp;
	int sig, s;
	int catch = priority & PCATCH;
	int relock = (priority & PNORELOCK) == 0;
	int exiterr = (priority & PNOEXITERR) == 0;

	/*
	 * XXXSMP
	 * This is probably bogus.  Figure out what the right
	 * thing to do here really is.
	 * Note that not sleeping if ltsleep is called with curlwp == NULL
	 * in the shutdown case is disgusting but partly necessary given
	 * how shutdown (barely) works.
	 */
	if (cold || (doing_shutdown && (panicstr || (l == NULL)))) {
		/*
		 * After a panic, or during autoconfiguration,
		 * just give interrupts a chance, then just return;
		 * don't run any other procs or panic below,
		 * in case this is the idle process and already asleep.
		 */
		s = splhigh();
		splx(safepri);
		splx(s);
		if (interlock != NULL && relock == 0)
			simple_unlock(interlock);
		return (0);
	}

	KASSERT(p != NULL);
	LOCK_ASSERT(interlock == NULL || simple_lock_held(interlock));

#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p, 1, 0);
#endif

	SCHED_LOCK(s);

#ifdef DIAGNOSTIC
	if (ident == NULL)
		panic("ltsleep: ident == NULL");
	if (l->l_stat != LSONPROC)
		panic("ltsleep: l_stat %d != LSONPROC", l->l_stat);
	if (l->l_back != NULL)
		panic("ltsleep: l_back != NULL");
#endif

	l->l_wchan = ident;
	l->l_wmesg = wmesg;
	l->l_slptime = 0;
	l->l_priority = priority & PRIMASK;

	qp = SLPQUE(ident);
	if (qp->sq_head == 0)
		qp->sq_head = l;
	else {
		*qp->sq_tailp = l;
	}
	*(qp->sq_tailp = &l->l_forw) = 0;

	if (timo)
		callout_reset(&l->l_tsleep_ch, timo, endtsleep, l);

	/*
	 * We can now release the interlock; the scheduler lock
	 * is held, so a thread can't get in to do wakeup() before
	 * we do the switch.
	 *
	 * XXX We leave the code block here, after inserting ourselves
	 * on the sleep queue, because we might want a more clever
	 * data structure for the sleep queues at some point.
	 */
	if (interlock != NULL)
		simple_unlock(interlock);

	/*
	 * We put ourselves on the sleep queue and start our timeout
	 * before calling CURSIG, as we could stop there, and a wakeup
	 * or a SIGCONT (or both) could occur while we were stopped.
	 * A SIGCONT would cause us to be marked as SSLEEP
	 * without resuming us, thus we must be ready for sleep
	 * when CURSIG is called.  If the wakeup happens while we're
	 * stopped, l->l_wchan will be 0 upon return from CURSIG.
	 */
	if (catch) {
		l->l_flag |= L_SINTR;
		if (((sig = CURSIG(l)) != 0) ||
		    ((p->p_flag & P_WEXIT) && p->p_nlwps > 1)) {
			if (l->l_wchan != NULL)
				unsleep(l);
			l->l_stat = LSONPROC;
			SCHED_UNLOCK(s);
			goto resume;
		}
		if (l->l_wchan == NULL) {
			catch = 0;
			SCHED_UNLOCK(s);
			goto resume;
		}
	} else
		sig = 0;
	l->l_stat = LSSLEEP;
	p->p_nrlwps--;
	p->p_stats->p_ru.ru_nvcsw++;
	SCHED_ASSERT_LOCKED();
	if (l->l_flag & L_SA)
		sa_switch(l, SA_UPCALL_BLOCKED);
	else
		mi_switch(l, NULL);

#if defined(DDB) && !defined(GPROF)
	/* handy breakpoint location after process "wakes" */
	__asm(".globl bpendtsleep\nbpendtsleep:");
#endif
	/*
	 * p->p_nrlwps is incremented by whoever made us runnable again,
	 * either setrunnable() or awaken().
	 */

	SCHED_ASSERT_UNLOCKED();
	splx(s);

 resume:
	KDASSERT(l->l_cpu != NULL);
	KDASSERT(l->l_cpu == curcpu());
	l->l_cpu->ci_schedstate.spc_curpriority = l->l_usrpri;

	l->l_flag &= ~L_SINTR;
	if (l->l_flag & L_TIMEOUT) {
		l->l_flag &= ~(L_TIMEOUT|L_CANCELLED);
		if (sig == 0) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p, 0, 0);
#endif
			if (relock && interlock != NULL)
				simple_lock(interlock);
			return (EWOULDBLOCK);
		}
	} else if (timo)
		callout_stop(&l->l_tsleep_ch);

	if (catch) {
		const int cancelled = l->l_flag & L_CANCELLED;
		l->l_flag &= ~L_CANCELLED;
		if (sig != 0 || (sig = CURSIG(l)) != 0 || cancelled) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p, 0, 0);
#endif
			if (relock && interlock != NULL)
				simple_lock(interlock);
			/*
			 * If this sleep was canceled, don't let the syscall
			 * restart.
			 */
			if (cancelled ||
			    (SIGACTION(p, sig).sa_flags & SA_RESTART) == 0)
				return (EINTR);
			return (ERESTART);
		}
	}

#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p, 0, 0);
#endif
	if (relock && interlock != NULL)
		simple_lock(interlock);

	/*
	 * XXXNJW this is very much a kluge.
	 * revisit. a better way of preventing looping/hanging syscalls like
	 * wait4() and _lwp_wait() from wedging an exiting process
	 * would be preferred.
	 */
	if (catch && ((p->p_flag & P_WEXIT) && p->p_nlwps > 1 && exiterr))
		return (EINTR);
	return (0);
}
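
/*
 * Illustrative sketch, not compiled into the kernel (hence the #if 0): the
 * typical way a caller uses ltsleep() with an interlock, following the
 * interface comment above.  The names example_lock, example_ready,
 * example_wait() and example_signal() are hypothetical, and PZERO is used
 * only as a representative sleep priority.  Because PNORELOCK is not
 * passed, the interlock is dropped across the sleep and re-held on return.
 */
#if 0
static struct simplelock example_lock = SIMPLELOCK_INITIALIZER;
static int example_ready;

static int
example_wait(void)
{
	int error = 0;

	simple_lock(&example_lock);
	while (!example_ready && error == 0)
		error = ltsleep(&example_ready, PZERO | PCATCH, "exwait", 0,
		    &example_lock);
	simple_unlock(&example_lock);
	return (error);		/* 0, or EINTR/ERESTART if interrupted */
}

static void
example_signal(void)
{

	simple_lock(&example_lock);
	example_ready = 1;
	simple_unlock(&example_lock);
	wakeup(&example_ready);
}
#endif
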
/*
 * Implement timeout for tsleep.
 * If process hasn't been awakened (wchan non-zero),
 * set timeout flag and undo the sleep.  If proc
 * is stopped, just unsleep so it will remain stopped.
 */
void
endtsleep(void *arg)
{
	struct lwp *l;
	int s;

	l = (struct lwp *)arg;
	SCHED_LOCK(s);
	if (l->l_wchan) {
		if (l->l_stat == LSSLEEP)
			setrunnable(l);
		else
			unsleep(l);
		l->l_flag |= L_TIMEOUT;
	}
	SCHED_UNLOCK(s);
}

/*
 * Remove a process from its wait queue
 */
void
unsleep(struct lwp *l)
{
	struct slpque *qp;
	struct lwp **hp;

	SCHED_ASSERT_LOCKED();

	if (l->l_wchan) {
		hp = &(qp = SLPQUE(l->l_wchan))->sq_head;
		while (*hp != l)
			hp = &(*hp)->l_forw;
		*hp = l->l_forw;
		if (qp->sq_tailp == &l->l_forw)
			qp->sq_tailp = hp;
		l->l_wchan = 0;
	}
}

__inline void
sa_awaken(struct lwp *l)
{
	struct sadata *sa = l->l_proc->p_sa;

	SCHED_ASSERT_LOCKED();

	if (l == sa->sa_vp && l->l_flag & L_SA_YIELD)
		l->l_flag &= ~L_SA_IDLE;
}

/*
 * Optimized-for-wakeup() version of setrunnable().
 */
__inline void
awaken(struct lwp *l)
{

	SCHED_ASSERT_LOCKED();

	if (l->l_proc->p_sa)
		sa_awaken(l);

	if (l->l_slptime > 1)
		updatepri(l);
	l->l_slptime = 0;
	l->l_stat = LSRUN;
	l->l_proc->p_nrlwps++;
	/*
	 * Since curpriority is a user priority, p->p_priority
	 * is always better than curpriority on the last CPU on
	 * which it ran.
	 *
	 * XXXSMP See affinity comment in resched_proc().
	 */
	if (l->l_flag & L_INMEM) {
		setrunqueue(l);
		KASSERT(l->l_cpu != NULL);
		need_resched(l->l_cpu);
	} else
		sched_wakeup(&proc0);
}

#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
void
sched_unlock_idle(void)
{

	simple_unlock(&sched_lock);
}

void
sched_lock_idle(void)
{

	simple_lock(&sched_lock);
}
#endif /* MULTIPROCESSOR || LOCKDEBUG */

/*
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(const void *ident)
{
	int s;

	SCHED_ASSERT_UNLOCKED();

	SCHED_LOCK(s);
	sched_wakeup(ident);
	SCHED_UNLOCK(s);
}

void
sched_wakeup(const void *ident)
{
	struct slpque *qp;
	struct lwp *l, **q;

	SCHED_ASSERT_LOCKED();

	qp = SLPQUE(ident);
 restart:
	for (q = &qp->sq_head; (l = *q) != NULL; ) {
#ifdef DIAGNOSTIC
		if (l->l_back || (l->l_stat != LSSLEEP &&
		    l->l_stat != LSSTOP && l->l_stat != LSSUSPENDED))
			panic("wakeup");
#endif
		if (l->l_wchan == ident) {
			l->l_wchan = 0;
			*q = l->l_forw;
			if (qp->sq_tailp == &l->l_forw)
				qp->sq_tailp = q;
			if (l->l_stat == LSSLEEP) {
				awaken(l);
				goto restart;
			}
		} else
			q = &l->l_forw;
	}
}
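
/*
 * Note: SLPQUE() hashes the sleep identifier into one of the
 * SLPQUE_TABLESIZE buckets of sched_slpque[] declared at the top of this
 * file, so a single sleep queue can hold LWPs sleeping on *different*
 * identifiers.  That is why sched_wakeup() above and wakeup_one() below
 * compare l_wchan against ident for every entry instead of waking the
 * whole queue.
 */
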
/*
 * Make the highest priority process first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(const void *ident)
{
	struct slpque *qp;
	struct lwp *l, **q;
	struct lwp *best_sleepp, **best_sleepq;
	struct lwp *best_stopp, **best_stopq;
	int s;

	best_sleepp = best_stopp = NULL;
	best_sleepq = best_stopq = NULL;

	SCHED_LOCK(s);

	qp = SLPQUE(ident);

	for (q = &qp->sq_head; (l = *q) != NULL; q = &l->l_forw) {
#ifdef DIAGNOSTIC
		if (l->l_back || (l->l_stat != LSSLEEP &&
		    l->l_stat != LSSTOP && l->l_stat != LSSUSPENDED))
			panic("wakeup_one");
#endif
		if (l->l_wchan == ident) {
			if (l->l_stat == LSSLEEP) {
				if (best_sleepp == NULL ||
				    l->l_priority < best_sleepp->l_priority) {
					best_sleepp = l;
					best_sleepq = q;
				}
			} else {
				if (best_stopp == NULL ||
				    l->l_priority < best_stopp->l_priority) {
					best_stopp = l;
					best_stopq = q;
				}
			}
		}
	}

	/*
	 * Consider any SSLEEP process higher than the highest priority SSTOP
	 * process.
	 */
	if (best_sleepp != NULL) {
		l = best_sleepp;
		q = best_sleepq;
	} else {
		l = best_stopp;
		q = best_stopq;
	}

	if (l != NULL) {
		l->l_wchan = NULL;
		*q = l->l_forw;
		if (qp->sq_tailp == &l->l_forw)
			qp->sq_tailp = q;
		if (l->l_stat == LSSLEEP)
			awaken(l);
	}
	SCHED_UNLOCK(s);
}

/*
 * General yield call.  Puts the current process back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current process explicitly requests it (e.g. sched_yield(2) in compat code).
 */
void
yield(void)
{
	struct lwp *l = curlwp;
	int s;

	SCHED_LOCK(s);
	l->l_priority = l->l_usrpri;
	l->l_stat = LSRUN;
	setrunqueue(l);
	l->l_proc->p_stats->p_ru.ru_nvcsw++;
	mi_switch(l, NULL);
	SCHED_ASSERT_UNLOCKED();
	splx(s);
}

/*
 * General preemption call.  Puts the current process back on its run queue
 * and performs an involuntary context switch.  If a process is supplied,
 * we switch to that process.  Otherwise, we use the normal process selection
 * criteria.
 */
void
preempt(int more)
{
	struct lwp *l = curlwp;
	int r, s;

	SCHED_LOCK(s);
	l->l_priority = l->l_usrpri;
	l->l_stat = LSRUN;
	setrunqueue(l);
	l->l_proc->p_stats->p_ru.ru_nivcsw++;
	r = mi_switch(l, NULL);
	SCHED_ASSERT_UNLOCKED();
	splx(s);
	if ((l->l_flag & L_SA) != 0 && r != 0 && more == 0)
		sa_preempt(l);
}

/*
 * The machine independent parts of context switch.
 * Must be called at splsched() (no higher!) and with
 * the sched_lock held.
 * Switch to "new" if non-NULL, otherwise let cpu_switch choose
 * the next lwp.
 *
 * Returns 1 if another process was actually run.
 */
int
mi_switch(struct lwp *l, struct lwp *newl)
{
	struct schedstate_percpu *spc;
	struct rlimit *rlim;
	long s, u;
	struct timeval tv;
#if defined(MULTIPROCESSOR)
	int hold_count = 0;	/* XXX: gcc */
#endif
	struct proc *p = l->l_proc;
	int retval;

	SCHED_ASSERT_LOCKED();

#if defined(MULTIPROCESSOR)
	/*
	 * Release the kernel_lock, as we are about to yield the CPU.
	 * The scheduler lock is still held until cpu_switch()
	 * selects a new process and removes it from the run queue.
	 */
	if (l->l_flag & L_BIGLOCK)
		hold_count = spinlock_release_all(&kernel_lock);
#endif

	KDASSERT(l->l_cpu != NULL);
	KDASSERT(l->l_cpu == curcpu());

	spc = &l->l_cpu->ci_schedstate;

#if defined(LOCKDEBUG) || defined(DIAGNOSTIC)
	spinlock_switchcheck();
#endif
#ifdef LOCKDEBUG
	simple_lock_switchcheck();
#endif

	/*
	 * Compute the amount of time during which the current
	 * process was running.
	 */
	microtime(&tv);
	u = p->p_rtime.tv_usec +
	    (tv.tv_usec - spc->spc_runtime.tv_usec);
	s = p->p_rtime.tv_sec + (tv.tv_sec - spc->spc_runtime.tv_sec);
	if (u < 0) {
		u += 1000000;
		s--;
	} else if (u >= 1000000) {
		u -= 1000000;
		s++;
	}
	p->p_rtime.tv_usec = u;
	p->p_rtime.tv_sec = s;

	/*
	 * Check if the process exceeds its CPU resource allocation.
	 * If over max, kill it.  In any case, if it has run for more
	 * than 10 minutes, reduce priority to give others a chance.
	 */
	rlim = &p->p_rlimit[RLIMIT_CPU];
	if (s >= rlim->rlim_cur) {
		/*
		 * XXXSMP: we're inside the scheduler lock perimeter;
		 * use sched_psignal.
		 */
		if (s >= rlim->rlim_max)
			sched_psignal(p, SIGKILL);
		else {
			sched_psignal(p, SIGXCPU);
			if (rlim->rlim_cur < rlim->rlim_max)
				rlim->rlim_cur += 5;
		}
	}
	if (autonicetime && s > autonicetime && p->p_ucred->cr_uid &&
	    p->p_nice == NZERO) {
		p->p_nice = autoniceval + NZERO;
		resetpriority(l);
	}

	/*
	 * Process is about to yield the CPU; clear the appropriate
	 * scheduling flags.
	 */
	spc->spc_flags &= ~SPCF_SWITCHCLEAR;

#ifdef KSTACK_CHECK_MAGIC
	kstack_check_magic(l);
#endif

	/*
	 * If we are using h/w performance counters, save context.
	 */
#if PERFCTRS
	if (PMC_ENABLED(p))
		pmc_save_context(p);
#endif

	/*
	 * Switch to the new current process.  When we
	 * run again, we'll return back here.
	 */
	uvmexp.swtch++;
	if (newl == NULL) {
		retval = cpu_switch(l, NULL);
	} else {
		remrunqueue(newl);
		cpu_switchto(l, newl);
		retval = 0;
	}

	/*
	 * If we are using h/w performance counters, restore context.
	 */
#if PERFCTRS
	if (PMC_ENABLED(p))
		pmc_restore_context(p);
#endif

	/*
	 * Make sure that MD code released the scheduler lock before
	 * resuming us.
	 */
	SCHED_ASSERT_UNLOCKED();

	/*
	 * We're running again; record our new start time.  We might
	 * be running on a new CPU now, so don't use the cached
	 * schedstate_percpu pointer.
	 */
	KDASSERT(l->l_cpu != NULL);
	KDASSERT(l->l_cpu == curcpu());
	microtime(&l->l_cpu->ci_schedstate.spc_runtime);

#if defined(MULTIPROCESSOR)
	/*
	 * Reacquire the kernel_lock now.  We do this after we've
	 * released the scheduler lock to avoid deadlock, and before
	 * we reacquire the interlock.
	 */
	if (l->l_flag & L_BIGLOCK)
		spinlock_acquire_count(&kernel_lock, hold_count);
#endif

	return retval;
}
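
/*
 * Illustrative sketch, not compiled into the kernel (hence the #if 0): the
 * microsecond borrow/carry normalization that mi_switch() applies to
 * p_rtime above, extracted into a stand-alone function.  The timeval-like
 * type and the sample timestamps are made up purely to demonstrate the
 * borrow case, where the new timestamp's tv_usec is smaller than the
 * stored one.
 */
#if 0
#include <stdio.h>

struct demo_tv {
	long tv_sec;
	long tv_usec;
};

static struct demo_tv
demo_add_interval(struct demo_tv acc, struct demo_tv start, struct demo_tv now)
{
	acc.tv_usec += now.tv_usec - start.tv_usec;
	acc.tv_sec += now.tv_sec - start.tv_sec;
	if (acc.tv_usec < 0) {			/* borrow a second */
		acc.tv_usec += 1000000;
		acc.tv_sec--;
	} else if (acc.tv_usec >= 1000000) {	/* carry a second */
		acc.tv_usec -= 1000000;
		acc.tv_sec++;
	}
	return acc;
}

int
main(void)
{
	struct demo_tv rtime = { 3, 100000 };	/* accumulated p_rtime */
	struct demo_tv start = { 100, 750000 };	/* spc_runtime analogue */
	struct demo_tv now = { 101, 250000 };	/* microtime() analogue */

	rtime = demo_add_interval(rtime, start, now);
	printf("%ld.%06ld\n", rtime.tv_sec, rtime.tv_usec);	/* 3.600000 */
	return 0;
}
#endif
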
/*
 * Initialize the (doubly-linked) run queues
 * to be empty.
 */
void
rqinit()
{
	int i;

	for (i = 0; i < RUNQUE_NQS; i++)
		sched_qs[i].ph_link = sched_qs[i].ph_rlink =
		    (struct lwp *)&sched_qs[i];
}

static __inline void
resched_proc(struct lwp *l, u_char pri)
{
	struct cpu_info *ci;

	/*
	 * XXXSMP
	 * Since l->l_cpu persists across a context switch,
	 * this gives us *very weak* processor affinity, in
	 * that we notify the CPU on which the process last
	 * ran that it should try to switch.
	 *
	 * This does not guarantee that the process will run on
	 * that processor next, because another processor might
	 * grab it the next time it performs a context switch.
	 *
	 * This also does not handle the case where its last
	 * CPU is running a higher-priority process, but every
	 * other CPU is running a lower-priority process.  There
	 * are ways to handle this situation, but they're not
	 * currently very pretty, and we also need to weigh the
	 * cost of moving a process from one CPU to another.
	 *
	 * XXXSMP
	 * There is also the issue of locking the other CPU's
	 * sched state, which we currently do not do.
	 */
	ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
	if (pri < ci->ci_schedstate.spc_curpriority)
		need_resched(ci);
}

/*
 * Change process state to be runnable,
 * placing it on the run queue if it is in memory,
 * and awakening the swapper if it isn't in memory.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;

	SCHED_ASSERT_LOCKED();

	switch (l->l_stat) {
	case 0:
	case LSRUN:
	case LSONPROC:
	case LSZOMB:
	case LSDEAD:
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_flag & P_TRACED) != 0 && p->p_xstat != 0) {
			sigaddset(&p->p_sigctx.ps_siglist, p->p_xstat);
			CHECKSIGS(p);
		}
	case LSSLEEP:
		unsleep(l);		/* e.g. when sending signals */
		break;

	case LSIDL:
		break;
	case LSSUSPENDED:
		break;
	}

	if (l->l_proc->p_sa)
		sa_awaken(l);

	l->l_stat = LSRUN;
	p->p_nrlwps++;

	if (l->l_flag & L_INMEM)
		setrunqueue(l);

	if (l->l_slptime > 1)
		updatepri(l);
	l->l_slptime = 0;
	if ((l->l_flag & L_INMEM) == 0)
		sched_wakeup((caddr_t)&proc0);
	else
		resched_proc(l, l->l_priority);
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
void
resetpriority(struct lwp *l)
{
	unsigned int newpriority;
	struct proc *p = l->l_proc;

	SCHED_ASSERT_LOCKED();

	newpriority = PUSER + p->p_estcpu +
	    NICE_WEIGHT * (p->p_nice - NZERO);
	newpriority = min(newpriority, MAXPRI);
	l->l_usrpri = newpriority;
	resched_proc(l, l->l_usrpri);
}

/*
 * Recompute priority for all LWPs in a process.
 */
void
resetprocpriority(struct proc *p)
{
	struct lwp *l;

	LIST_FOREACH(l, &p->p_lwps, l_sibling)
		resetpriority(l);
}
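
/*
 * Illustrative sketch, not compiled into the kernel (hence the #if 0): the
 * user-mode priority formula used by resetpriority() above, evaluated with
 * representative constants (PUSER = 50, NICE_WEIGHT = 2, NZERO = 20,
 * MAXPRI = 127; the values that actually apply come from <sys/param.h> and
 * <sys/sched.h>).  Larger numbers mean a *worse* priority, so accumulated
 * p_estcpu and a positive nice setting both push an LWP toward the back.
 */
#if 0
#include <stdio.h>

#define DEMO_PUSER		50
#define DEMO_NICE_WEIGHT	2
#define DEMO_NZERO		20
#define DEMO_MAXPRI		127

static int
demo_user_priority(int estcpu, int p_nice)
{
	int pri;

	pri = DEMO_PUSER + estcpu + DEMO_NICE_WEIGHT * (p_nice - DEMO_NZERO);
	if (pri > DEMO_MAXPRI)
		pri = DEMO_MAXPRI;
	return pri;
}

int
main(void)
{
	printf("%d\n", demo_user_priority(0, 20));	/* fresh, nice 0: 50 */
	printf("%d\n", demo_user_priority(36, 20));	/* CPU hog: 86 */
	printf("%d\n", demo_user_priority(36, 39));	/* hog, nice +19: 124 */
	printf("%d\n", demo_user_priority(200, 39));	/* clamped to 127 */
	return 0;
}
#endif
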
/*
 * We adjust the priority of the current process.  The priority of a process
 * gets worse as it accumulates CPU time.  The CPU usage estimator (p_estcpu)
 * is increased here.  The formula for computing priorities (in kern_synch.c)
 * will compute a different value each time p_estcpu increases.  This can
 * cause a switch, but unless the priority crosses a PPQ boundary the actual
 * queue will not change.  The CPU usage estimator ramps up quite quickly
 * when the process is running (linearly), and decays away exponentially, at
 * a rate which is proportionally slower when the system is busy.  The basic
 * principle is that the system will 90% forget that the process used a lot
 * of CPU time in 5 * loadav seconds.  This causes the system to favor
 * processes which haven't run much recently, and to round-robin among other
 * processes.
 */

void
schedclock(struct lwp *l)
{
	struct proc *p = l->l_proc;
	int s;

	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
	SCHED_LOCK(s);
	resetpriority(l);
	SCHED_UNLOCK(s);

	if (l->l_priority >= PUSER)
		l->l_priority = l->l_usrpri;
}

void
suspendsched()
{
	struct lwp *l;
	int s;

	/*
	 * Convert all non-P_SYSTEM LSSLEEP or LSRUN processes to
	 * LSSUSPENDED.
	 */
	proclist_lock_read();
	SCHED_LOCK(s);
	LIST_FOREACH(l, &alllwp, l_list) {
		if ((l->l_proc->p_flag & P_SYSTEM) != 0)
			continue;

		switch (l->l_stat) {
		case LSRUN:
			l->l_proc->p_nrlwps--;
			if ((l->l_flag & L_INMEM) != 0)
				remrunqueue(l);
			/* FALLTHROUGH */
		case LSSLEEP:
			l->l_stat = LSSUSPENDED;
			break;
		case LSONPROC:
			/*
			 * XXX SMP: we need to deal with processes on
			 * other CPUs!
			 */
			break;
		default:
			break;
		}
	}
	SCHED_UNLOCK(s);
	proclist_unlock_read();
}

/*
 * Low-level routines to access the run queue.  Optimised assembler
 * routines can override these.
 */

#ifndef __HAVE_MD_RUNQUEUE

/*
 * On some architectures, it's faster to use a MSB ordering for the priorities
 * than the traditional LSB ordering.
 */
#ifdef __HAVE_BIGENDIAN_BITOPS
#define	RQMASK(n) (0x80000000 >> (n))
#else
#define	RQMASK(n) (0x00000001 << (n))
#endif
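
/*
 * Illustrative sketch, not compiled into the kernel (hence the #if 0): how
 * a priority maps to a run queue index and to its bit in sched_whichqs,
 * assuming the LSB ordering above, 32 queues and the four-priorities-per-
 * queue spacing described in the comment that follows.  The sample
 * priorities are arbitrary.
 */
#if 0
#include <stdio.h>

int
main(void)
{
	unsigned int whichqs = 0;
	int priorities[] = { 0, 50, 86, 127 };
	int i;

	for (i = 0; i < 4; i++) {
		int whichq = priorities[i] / 4;	/* queue index, 0..31 */
		whichqs |= 1u << whichq;	/* LSB RQMASK(whichq) */
		printf("priority %3d -> queue %2d\n", priorities[i], whichq);
	}
	printf("whichqs = 0x%08x\n", whichqs);	/* bits 0, 12, 21, 31 set */
	return 0;
}
#endif
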
/*
 * The primitives that manipulate the run queues.  whichqs tells which
 * of the 32 queues qs have processes in them.  Setrunqueue puts processes
 * into queues, remrunqueue removes them from queues.  The running process is
 * on no queue, other processes are on a queue related to p->p_priority,
 * divided by 4 actually to shrink the 0-127 range of priorities into the 32
 * available queues.
 */

void
setrunqueue(struct lwp *l)
{
	struct prochd *rq;
	struct lwp *prev;
	const int whichq = l->l_priority / 4;

#ifdef DIAGNOSTIC
	if (l->l_back != NULL || l->l_wchan != NULL || l->l_stat != LSRUN)
		panic("setrunqueue");
#endif
	sched_whichqs |= RQMASK(whichq);
	rq = &sched_qs[whichq];
	prev = rq->ph_rlink;
	l->l_forw = (struct lwp *)rq;
	rq->ph_rlink = l;
	prev->l_forw = l;
	l->l_back = prev;
}

void
remrunqueue(struct lwp *l)
{
	struct lwp *prev, *next;
	const int whichq = l->l_priority / 4;

#ifdef DIAGNOSTIC
	if (((sched_whichqs & RQMASK(whichq)) == 0))
		panic("remrunqueue");
#endif
	prev = l->l_back;
	l->l_back = NULL;
	next = l->l_forw;
	prev->l_forw = next;
	next->l_back = prev;
	if (prev == next)
		sched_whichqs &= ~RQMASK(whichq);
}

#undef RQMASK
#endif /* !defined(__HAVE_MD_RUNQUEUE) */
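
/*
 * Illustrative sketch, not compiled into the kernel (hence the #if 0): the
 * run queues above are circular doubly-linked lists whose head
 * (struct prochd) doubles as a sentinel, which is why setrunqueue() can
 * treat the queue head as a struct lwp.  The toy list below reproduces the
 * same tail-insert and unlink steps and shows that after the last element
 * is removed, prev == next (both point at the sentinel), which is the
 * condition remrunqueue() uses to clear the queue's bit in sched_whichqs.
 * The demo_node type is invented for the example.
 */
#if 0
#include <stdio.h>

struct demo_node {
	struct demo_node *forw;
	struct demo_node *back;
};

int
main(void)
{
	struct demo_node head, a;

	head.forw = head.back = &head;	/* empty queue: sentinel only */

	/* setrunqueue()-style tail insert of 'a' */
	a.forw = &head;
	a.back = head.back;
	head.back->forw = &a;
	head.back = &a;

	/* remrunqueue()-style unlink of 'a' */
	a.back->forw = a.forw;
	a.forw->back = a.back;
	printf("queue empty again: %d\n", a.back == a.forw);	/* prints 1 */
	return 0;
}
#endif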