/*	$NetBSD: kern_synch.c,v 1.145 2004/10/01 16:30:55 yamt Exp $	*/

/*-
 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.145 2004/10/01 16:30:55 yamt Exp $");

#include "opt_ddb.h"
#include "opt_ktrace.h"
#include "opt_kstack.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_perfctrs.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/buf.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sa.h>
#include <sys/savar.h>

#include <uvm/uvm_extern.h>

#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

int	lbolt;			/* once a second sleep address */
int	rrticks;		/* number of hardclock ticks per roundrobin() */

/*
 * The global scheduler state.
 */
struct prochd sched_qs[RUNQUE_NQS];		/* run queues */
__volatile u_int32_t sched_whichqs;		/* bitmap of non-empty queues */
struct slpque sched_slpque[SLPQUE_TABLESIZE];	/* sleep queues */

struct simplelock sched_lock = SIMPLELOCK_INITIALIZER;

void schedcpu(void *);
void updatepri(struct lwp *);
void endtsleep(void *);

__inline void sa_awaken(struct lwp *);
__inline void awaken(struct lwp *);

struct callout schedcpu_ch = CALLOUT_INITIALIZER_SETFUNC(schedcpu, NULL);

/*
 * Force switch among equal priority processes every 100ms.
 * Called from hardclock every hz/10 == rrticks hardclock ticks.
 */
/* ARGSUSED */
void
roundrobin(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;

	spc->spc_rrticks = rrticks;

	if (curlwp != NULL) {
		if (spc->spc_flags & SPCF_SEENRR) {
			/*
			 * The process has already been through a roundrobin
			 * without switching and may be hogging the CPU.
			 * Indicate that the process should yield.
			 */
			spc->spc_flags |= SPCF_SHOULDYIELD;
		} else
			spc->spc_flags |= SPCF_SEENRR;
	}
	need_resched(curcpu());
}

/*
 * Constants for digital decay and forget:
 *	90% of (p_estcpu) usage in 5 * loadav time
 *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 *	Note that, as ps(1) mentions, this can let percentages
 *	total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that hardclock updates p_estcpu and p_cpticks independently.
 *
 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 *	for (i = 0; i < (5 * loadavg); i++)
 *		p_estcpu *= decay;
 * will compute
 *	p_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 *	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 *	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 *	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 *	b = 2 * loadavg
 * then
 *	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *	For x close to zero, exp(x) =~ 1 + x, since
 *		exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *		therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *	For x close to zero, ln(1+x) =~ x, since
 *		ln(1+x) = x - x**2/2 + x**3/3 - ...	-1 < x < 1
 *		therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *	ln(.1) =~ -2.30
 *
 * Proof of (1):
 *	Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *	ln(factor) =~ (-2.30/5*loadav), or
 *	factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *	    exp(-1/b) =~ (b-1)/b =~ b/(b+1).			QED
 *
 * Proof of (2):
 *	Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *	power*ln(b/(b+1)) =~ -2.30, or
 *	power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.	QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *	loadav:	1	2	3	4
 *	power:	5.68	10.32	14.94	19.55
 */

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
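
/*
 * Worked example (illustrative only; not part of the original derivation
 * above): with a load average of 1.0, averunnable.ldavg[0] == FSCALE, so
 * loadfactor() yields loadfac == 2 * FSCALE and each decay_cpu() call
 * multiplies p_estcpu by (2 * FSCALE) / (2 * FSCALE + FSCALE) == 2/3.
 * Since ln(.1) / ln(2/3) =~ 5.68, roughly 5.68 once-a-second schedcpu()
 * passes bring p_estcpu down to ~10% of its starting value, matching the
 * "loadav: 1 / power: 5.68" entry in the table above.
 */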

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11

/*
 * Recompute process priorities, every hz ticks.
 */
/* ARGSUSED */
void
schedcpu(void *arg)
{
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct lwp *l;
	struct proc *p;
	int s, minslp;
	unsigned int newcpu;
	int clkhz;

	proclist_lock_read();
	PROCLIST_FOREACH(p, &allproc) {
		/*
		 * Increment time in/out of memory and sleep time
		 * (if sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		minslp = 2;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			l->l_swtime++;
			if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
			    l->l_stat == LSSUSPENDED) {
				l->l_slptime++;
				minslp = min(minslp, l->l_slptime);
			} else
				minslp = 0;
		}
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (minslp > 1)
			continue;
		s = splstatclock();	/* prevent state changes */
		/*
		 * p_pctcpu is only for ps.
		 */
		clkhz = stathz != 0 ? stathz : hz;
#if	(FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (clkhz == 100) ?
		    ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT) :
		    100 * (((fixpt_t) p->p_cpticks)
		    << (FSHIFT - CCPU_SHIFT)) / clkhz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
		    (p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		newcpu = (u_int)decay_cpu(loadfac, p->p_estcpu);
		p->p_estcpu = newcpu;
		splx(s);	/* Done with the process CPU ticks update */
		SCHED_LOCK(s);
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l->l_slptime > 1)
				continue;
			resetpriority(l);
			if (l->l_priority >= PUSER) {
				if (l->l_stat == LSRUN &&
				    (l->l_flag & L_INMEM) &&
				    (l->l_priority / PPQ) != (l->l_usrpri / PPQ)) {
					remrunqueue(l);
					l->l_priority = l->l_usrpri;
					setrunqueue(l);
				} else
					l->l_priority = l->l_usrpri;
			}
		}
		SCHED_UNLOCK(s);
	}
	proclist_unlock_read();
	uvm_meter();
	wakeup((caddr_t)&lbolt);
	callout_schedule(&schedcpu_ch, hz);
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay p_estcpu to zero.
 */
void
updatepri(struct lwp *l)
{
	struct proc *p = l->l_proc;
	unsigned int newcpu;
	fixpt_t loadfac;

	SCHED_ASSERT_LOCKED();

	newcpu = p->p_estcpu;
	loadfac = loadfactor(averunnable.ldavg[0]);

	if (l->l_slptime > 5 * loadfac)
		p->p_estcpu = 0; /* XXX NJWLWP */
	else {
		l->l_slptime--;	/* the first time was done in schedcpu */
		while (newcpu && --l->l_slptime)
			newcpu = (int) decay_cpu(loadfac, newcpu);
		p->p_estcpu = newcpu;
	}
	resetpriority(l);
}

/*
 * During autoconfiguration or after a panic, a sleep will simply
 * lower the priority briefly to allow interrupts, then return.
 * The priority to be used (safepri) is machine-dependent, thus this
 * value is initialized and maintained in the machine-dependent layers.
 * This priority will typically be 0, or the lowest priority
 * that is safe for use on the interrupt stack; it can be made
 * higher to block network software interrupts after panics.
 */
int safepri;

/*
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If pri includes PCATCH flag, signals are checked
 * before and after sleeping, else signals are not checked.  Returns 0 if
 * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 * signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the system
 * call should be interrupted by the signal (return EINTR).
 *
 * The interlock is held until the scheduler_slock is acquired.  The
 * interlock will be locked before returning back to the caller
 * unless the PNORELOCK flag is specified, in which case the
 * interlock will always be unlocked upon return.
 */
int
ltsleep(const void *ident, int priority, const char *wmesg, int timo,
    __volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	struct proc *p = l ? l->l_proc : NULL;
	struct slpque *qp;
	int sig, s;
	int catch = priority & PCATCH;
	int relock = (priority & PNORELOCK) == 0;
	int exiterr = (priority & PNOEXITERR) == 0;

	/*
	 * XXXSMP
	 * This is probably bogus.  Figure out what the right
	 * thing to do here really is.
	 * Note that not sleeping if ltsleep is called with curlwp == NULL
	 * in the shutdown case is disgusting but partly necessary given
	 * how shutdown (barely) works.
	 */
	if (cold || (doing_shutdown && (panicstr || (l == NULL)))) {
		/*
		 * After a panic, or during autoconfiguration,
		 * just give interrupts a chance, then just return;
		 * don't run any other procs or panic below,
		 * in case this is the idle process and already asleep.
		 */
		s = splhigh();
		splx(safepri);
		splx(s);
		if (interlock != NULL && relock == 0)
			simple_unlock(interlock);
		return (0);
	}

	KASSERT(p != NULL);
	LOCK_ASSERT(interlock == NULL || simple_lock_held(interlock));

#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p, 1, 0);
#endif

	SCHED_LOCK(s);

#ifdef DIAGNOSTIC
	if (ident == NULL)
		panic("ltsleep: ident == NULL");
	if (l->l_stat != LSONPROC)
		panic("ltsleep: l_stat %d != LSONPROC", l->l_stat);
	if (l->l_back != NULL)
		panic("ltsleep: l_back != NULL");
#endif

	l->l_wchan = ident;
	l->l_wmesg = wmesg;
	l->l_slptime = 0;
	l->l_priority = priority & PRIMASK;

	qp = SLPQUE(ident);
	if (qp->sq_head == 0)
		qp->sq_head = l;
	else {
		*qp->sq_tailp = l;
	}
	*(qp->sq_tailp = &l->l_forw) = 0;

	if (timo)
		callout_reset(&l->l_tsleep_ch, timo, endtsleep, l);

	/*
	 * We can now release the interlock; the scheduler_slock
	 * is held, so a thread can't get in to do wakeup() before
	 * we do the switch.
	 *
	 * XXX We leave the code block here, after inserting ourselves
	 * on the sleep queue, because we might want a more clever
	 * data structure for the sleep queues at some point.
	 */
	if (interlock != NULL)
		simple_unlock(interlock);

	/*
	 * We put ourselves on the sleep queue and start our timeout
	 * before calling CURSIG, as we could stop there, and a wakeup
	 * or a SIGCONT (or both) could occur while we were stopped.
	 * A SIGCONT would cause us to be marked as SSLEEP
	 * without resuming us, thus we must be ready for sleep
	 * when CURSIG is called.  If the wakeup happens while we're
	 * stopped, p->p_wchan will be 0 upon return from CURSIG.
	 */
	if (catch) {
		l->l_flag |= L_SINTR;
		if (((sig = CURSIG(l)) != 0) ||
		    ((p->p_flag & P_WEXIT) && p->p_nlwps > 1)) {
			if (l->l_wchan != NULL)
				unsleep(l);
			l->l_stat = LSONPROC;
			SCHED_UNLOCK(s);
			goto resume;
		}
		if (l->l_wchan == NULL) {
			catch = 0;
			SCHED_UNLOCK(s);
			goto resume;
		}
	} else
		sig = 0;
	l->l_stat = LSSLEEP;
	p->p_nrlwps--;
	p->p_stats->p_ru.ru_nvcsw++;
	SCHED_ASSERT_LOCKED();
	if (l->l_flag & L_SA)
		sa_switch(l, SA_UPCALL_BLOCKED);
	else
		mi_switch(l, NULL);

#if	defined(DDB) && !defined(GPROF)
	/* handy breakpoint location after process "wakes" */
	__asm(".globl bpendtsleep\nbpendtsleep:");
#endif
	/*
	 * p->p_nrlwps is incremented by whoever made us runnable again,
	 * either setrunnable() or awaken().
	 */

	SCHED_ASSERT_UNLOCKED();
	splx(s);

 resume:
	KDASSERT(l->l_cpu != NULL);
	KDASSERT(l->l_cpu == curcpu());
	l->l_cpu->ci_schedstate.spc_curpriority = l->l_usrpri;

	l->l_flag &= ~L_SINTR;
	if (l->l_flag & L_TIMEOUT) {
		l->l_flag &= ~(L_TIMEOUT|L_CANCELLED);
		if (sig == 0) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p, 0, 0);
#endif
			if (relock && interlock != NULL)
				simple_lock(interlock);
			return (EWOULDBLOCK);
		}
	} else if (timo)
		callout_stop(&l->l_tsleep_ch);

	if (catch) {
		const int cancelled = l->l_flag & L_CANCELLED;
		l->l_flag &= ~L_CANCELLED;
		if (sig != 0 || (sig = CURSIG(l)) != 0 || cancelled) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p, 0, 0);
#endif
			if (relock && interlock != NULL)
				simple_lock(interlock);
			/*
			 * If this sleep was canceled, don't let the syscall
			 * restart.
			 */
			if (cancelled ||
			    (SIGACTION(p, sig).sa_flags & SA_RESTART) == 0)
				return (EINTR);
			return (ERESTART);
		}
	}

#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p, 0, 0);
#endif
	if (relock && interlock != NULL)
		simple_lock(interlock);

	/*
	 * XXXNJW this is very much a kluge.
	 * revisit. a better way of preventing looping/hanging syscalls like
	 * wait4() and _lwp_wait() from wedging an exiting process
	 * would be preferred.
	 */
	if (catch && ((p->p_flag & P_WEXIT) && p->p_nlwps > 1 && exiterr))
		return (EINTR);
	return (0);
}

/*
 * Implement timeout for tsleep.
 * If process hasn't been awakened (wchan non-zero),
 * set timeout flag and undo the sleep.  If proc
 * is stopped, just unsleep so it will remain stopped.
 */
void
endtsleep(void *arg)
{
	struct lwp *l;
	int s;

	l = (struct lwp *)arg;
	SCHED_LOCK(s);
	if (l->l_wchan) {
		if (l->l_stat == LSSLEEP)
			setrunnable(l);
		else
			unsleep(l);
		l->l_flag |= L_TIMEOUT;
	}
	SCHED_UNLOCK(s);
}

/*
 * Remove a process from its wait queue
 */
void
unsleep(struct lwp *l)
{
	struct slpque *qp;
	struct lwp **hp;

	SCHED_ASSERT_LOCKED();

	if (l->l_wchan) {
		hp = &(qp = SLPQUE(l->l_wchan))->sq_head;
		while (*hp != l)
			hp = &(*hp)->l_forw;
		*hp = l->l_forw;
		if (qp->sq_tailp == &l->l_forw)
			qp->sq_tailp = hp;
		l->l_wchan = 0;
	}
}

__inline void
sa_awaken(struct lwp *l)
{

	SCHED_ASSERT_LOCKED();

	if (l == l->l_savp->savp_lwp && l->l_flag & L_SA_YIELD)
		l->l_flag &= ~L_SA_IDLE;
}

/*
 * Optimized-for-wakeup() version of setrunnable().
 */
__inline void
awaken(struct lwp *l)
{

	SCHED_ASSERT_LOCKED();

	if (l->l_proc->p_sa)
		sa_awaken(l);

	if (l->l_slptime > 1)
		updatepri(l);
	l->l_slptime = 0;
	l->l_stat = LSRUN;
	l->l_proc->p_nrlwps++;
	/*
	 * Since curpriority is a user priority, p->p_priority
	 * is always better than curpriority on the last CPU on
	 * which it ran.
	 *
	 * XXXSMP See affinity comment in resched_proc().
	 */
	if (l->l_flag & L_INMEM) {
		setrunqueue(l);
		KASSERT(l->l_cpu != NULL);
		need_resched(l->l_cpu);
	} else
		sched_wakeup(&proc0);
}

#if defined(MULTIPROCESSOR) || defined(LOCKDEBUG)
void
sched_unlock_idle(void)
{

	simple_unlock(&sched_lock);
}

void
sched_lock_idle(void)
{

	simple_lock(&sched_lock);
}
#endif /* MULTIPROCESSOR || LOCKDEBUG */

/*
 * Make all processes sleeping on the specified identifier runnable.
 */

void
wakeup(const void *ident)
{
	int s;

	SCHED_ASSERT_UNLOCKED();

	SCHED_LOCK(s);
	sched_wakeup(ident);
	SCHED_UNLOCK(s);
}

void
sched_wakeup(const void *ident)
{
	struct slpque *qp;
	struct lwp *l, **q;

	SCHED_ASSERT_LOCKED();

	qp = SLPQUE(ident);
 restart:
	for (q = &qp->sq_head; (l = *q) != NULL; ) {
#ifdef DIAGNOSTIC
		if (l->l_back || (l->l_stat != LSSLEEP &&
		    l->l_stat != LSSTOP && l->l_stat != LSSUSPENDED))
			panic("wakeup");
#endif
		if (l->l_wchan == ident) {
			l->l_wchan = 0;
			*q = l->l_forw;
			if (qp->sq_tailp == &l->l_forw)
				qp->sq_tailp = q;
			if (l->l_stat == LSSLEEP) {
				awaken(l);
				goto restart;
			}
		} else
			q = &l->l_forw;
	}
}

/*
 * Make the highest priority process first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(const void *ident)
{
	struct slpque *qp;
	struct lwp *l, **q;
	struct lwp *best_sleepp, **best_sleepq;
	struct lwp *best_stopp, **best_stopq;
	int s;

	best_sleepp = best_stopp = NULL;
	best_sleepq = best_stopq = NULL;

	SCHED_LOCK(s);

	qp = SLPQUE(ident);

	for (q = &qp->sq_head; (l = *q) != NULL; q = &l->l_forw) {
#ifdef DIAGNOSTIC
		if (l->l_back || (l->l_stat != LSSLEEP &&
		    l->l_stat != LSSTOP && l->l_stat != LSSUSPENDED))
			panic("wakeup_one");
#endif
		if (l->l_wchan == ident) {
			if (l->l_stat == LSSLEEP) {
				if (best_sleepp == NULL ||
				    l->l_priority < best_sleepp->l_priority) {
					best_sleepp = l;
					best_sleepq = q;
				}
			} else {
				if (best_stopp == NULL ||
				    l->l_priority < best_stopp->l_priority) {
					best_stopp = l;
					best_stopq = q;
				}
			}
		}
	}

	/*
	 * Consider any SSLEEP process higher than the highest priority SSTOP
	 * process.
	 */
	if (best_sleepp != NULL) {
		l = best_sleepp;
		q = best_sleepq;
	} else {
		l = best_stopp;
		q = best_stopq;
	}

	if (l != NULL) {
		l->l_wchan = NULL;
		*q = l->l_forw;
		if (qp->sq_tailp == &l->l_forw)
			qp->sq_tailp = q;
		if (l->l_stat == LSSLEEP)
			awaken(l);
	}
	SCHED_UNLOCK(s);
}

/*
 * General yield call.  Puts the current process back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current process explicitly requests it (eg sched_yield(2) in compat code).
 */
void
yield(void)
{
	struct lwp *l = curlwp;
	int s;

	SCHED_LOCK(s);
	l->l_priority = l->l_usrpri;
	l->l_stat = LSRUN;
	setrunqueue(l);
	l->l_proc->p_stats->p_ru.ru_nvcsw++;
	mi_switch(l, NULL);
	SCHED_ASSERT_UNLOCKED();
	splx(s);
}

/*
 * General preemption call.  Puts the current process back on its run queue
 * and performs an involuntary context switch.  If a process is supplied,
 * we switch to that process.  Otherwise, we use the normal process selection
 * criteria.
 */

void
preempt(int more)
{
	struct lwp *l = curlwp;
	int r, s;

	SCHED_LOCK(s);
	l->l_priority = l->l_usrpri;
	l->l_stat = LSRUN;
	setrunqueue(l);
	l->l_proc->p_stats->p_ru.ru_nivcsw++;
	r = mi_switch(l, NULL);
	SCHED_ASSERT_UNLOCKED();
	splx(s);
	if ((l->l_flag & L_SA) != 0 && r != 0 && more == 0)
		sa_preempt(l);
}

/*
 * The machine independent parts of context switch.
 * Must be called at splsched() (no higher!) and with
 * the sched_lock held.
 * Switch to "new" if non-NULL, otherwise let cpu_switch choose
 * the next lwp.
 *
 * Returns 1 if another process was actually run.
 */
int
mi_switch(struct lwp *l, struct lwp *newl)
{
	struct schedstate_percpu *spc;
	struct rlimit *rlim;
	long s, u;
	struct timeval tv;
	int hold_count;
	struct proc *p = l->l_proc;
	int retval;

	SCHED_ASSERT_LOCKED();

	/*
	 * Release the kernel_lock, as we are about to yield the CPU.
	 * The scheduler lock is still held until cpu_switch()
	 * selects a new process and removes it from the run queue.
	 */
	hold_count = KERNEL_LOCK_RELEASE_ALL();

	KDASSERT(l->l_cpu != NULL);
	KDASSERT(l->l_cpu == curcpu());

	spc = &l->l_cpu->ci_schedstate;

#if defined(LOCKDEBUG) || defined(DIAGNOSTIC)
	spinlock_switchcheck();
#endif
#ifdef LOCKDEBUG
	simple_lock_switchcheck();
#endif

	/*
	 * Compute the amount of time during which the current
	 * process was running.
	 */
	microtime(&tv);
	u = p->p_rtime.tv_usec +
	    (tv.tv_usec - spc->spc_runtime.tv_usec);
	s = p->p_rtime.tv_sec + (tv.tv_sec - spc->spc_runtime.tv_sec);
	if (u < 0) {
		u += 1000000;
		s--;
	} else if (u >= 1000000) {
		u -= 1000000;
		s++;
	}
	p->p_rtime.tv_usec = u;
	p->p_rtime.tv_sec = s;

	/*
	 * Check if the process exceeds its CPU resource allocation.
	 * If over max, kill it.  In any case, if it has run for more
	 * than 10 minutes, reduce priority to give others a chance.
	 */
	rlim = &p->p_rlimit[RLIMIT_CPU];
	if (s >= rlim->rlim_cur) {
		/*
		 * XXXSMP: we're inside the scheduler lock perimeter;
		 * use sched_psignal.
		 */
		if (s >= rlim->rlim_max)
			sched_psignal(p, SIGKILL);
		else {
			sched_psignal(p, SIGXCPU);
			if (rlim->rlim_cur < rlim->rlim_max)
				rlim->rlim_cur += 5;
		}
	}
	if (autonicetime && s > autonicetime && p->p_ucred->cr_uid &&
	    p->p_nice == NZERO) {
		p->p_nice = autoniceval + NZERO;
		resetpriority(l);
	}

	/*
	 * Process is about to yield the CPU; clear the appropriate
	 * scheduling flags.
	 */
	spc->spc_flags &= ~SPCF_SWITCHCLEAR;

#ifdef KSTACK_CHECK_MAGIC
	kstack_check_magic(l);
#endif

	/*
	 * If we are using h/w performance counters, save context.
	 */
#if PERFCTRS
	if (PMC_ENABLED(p))
		pmc_save_context(p);
#endif

	/*
	 * Switch to the new current process.  When we
	 * run again, we'll return back here.
	 */
	uvmexp.swtch++;
	if (newl == NULL) {
		retval = cpu_switch(l, NULL);
	} else {
		remrunqueue(newl);
		cpu_switchto(l, newl);
		retval = 0;
	}

	/*
	 * If we are using h/w performance counters, restore context.
	 */
#if PERFCTRS
	if (PMC_ENABLED(p))
		pmc_restore_context(p);
#endif

	/*
	 * Make sure that MD code released the scheduler lock before
	 * resuming us.
	 */
	SCHED_ASSERT_UNLOCKED();

	/*
	 * We're running again; record our new start time.  We might
	 * be running on a new CPU now, so don't use the cached
	 * schedstate_percpu pointer.
	 */
	KDASSERT(l->l_cpu != NULL);
	KDASSERT(l->l_cpu == curcpu());
	microtime(&l->l_cpu->ci_schedstate.spc_runtime);

	/*
	 * Reacquire the kernel_lock now.  We do this after we've
	 * released the scheduler lock to avoid deadlock, and before
	 * we reacquire the interlock.
	 */
	KERNEL_LOCK_ACQUIRE_COUNT(hold_count);

	return retval;
}

/*
 * Initialize the (doubly-linked) run queues
 * to be empty.
 */
void
rqinit()
{
	int i;

	for (i = 0; i < RUNQUE_NQS; i++)
		sched_qs[i].ph_link = sched_qs[i].ph_rlink =
		    (struct lwp *)&sched_qs[i];
}

static __inline void
resched_proc(struct lwp *l, u_char pri)
{
	struct cpu_info *ci;

	/*
	 * XXXSMP
	 * Since l->l_cpu persists across a context switch,
	 * this gives us *very weak* processor affinity, in
	 * that we notify the CPU on which the process last
	 * ran that it should try to switch.
	 *
	 * This does not guarantee that the process will run on
	 * that processor next, because another processor might
	 * grab it the next time it performs a context switch.
	 *
	 * This also does not handle the case where its last
	 * CPU is running a higher-priority process, but every
	 * other CPU is running a lower-priority process.  There
	 * are ways to handle this situation, but they're not
	 * currently very pretty, and we also need to weigh the
	 * cost of moving a process from one CPU to another.
	 *
	 * XXXSMP
	 * There is also the issue of locking the other CPU's
	 * sched state, which we currently do not do.
	 */
	ci = (l->l_cpu != NULL) ? l->l_cpu : curcpu();
	if (pri < ci->ci_schedstate.spc_curpriority)
		need_resched(ci);
}

/*
 * Change process state to be runnable,
 * placing it on the run queue if it is in memory,
 * and awakening the swapper if it isn't in memory.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;

	SCHED_ASSERT_LOCKED();

	switch (l->l_stat) {
	case 0:
	case LSRUN:
	case LSONPROC:
	case LSZOMB:
	case LSDEAD:
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_flag & P_TRACED) != 0 && p->p_xstat != 0) {
			sigaddset(&p->p_sigctx.ps_siglist, p->p_xstat);
			CHECKSIGS(p);
		}
	case LSSLEEP:
		unsleep(l);		/* e.g. when sending signals */
		break;

	case LSIDL:
		break;
	case LSSUSPENDED:
		break;
	}

	if (l->l_proc->p_sa)
		sa_awaken(l);

	l->l_stat = LSRUN;
	p->p_nrlwps++;

	if (l->l_flag & L_INMEM)
		setrunqueue(l);

	if (l->l_slptime > 1)
		updatepri(l);
	l->l_slptime = 0;
	if ((l->l_flag & L_INMEM) == 0)
		sched_wakeup((caddr_t)&proc0);
	else
		resched_proc(l, l->l_priority);
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
void
resetpriority(struct lwp *l)
{
	unsigned int newpriority;
	struct proc *p = l->l_proc;

	SCHED_ASSERT_LOCKED();

	newpriority = PUSER + p->p_estcpu +
	    NICE_WEIGHT * (p->p_nice - NZERO);
	newpriority = min(newpriority, MAXPRI);
	l->l_usrpri = newpriority;
	resched_proc(l, l->l_usrpri);
}

/*
 * Recompute priority for all LWPs in a process.
 */
void
resetprocpriority(struct proc *p)
{
	struct lwp *l;

	LIST_FOREACH(l, &p->p_lwps, l_sibling)
		resetpriority(l);
}

/*
 * We adjust the priority of the current process.  The priority of a process
 * gets worse as it accumulates CPU time.  The CPU usage estimator (p_estcpu)
 * is increased here.  The formula for computing priorities (in kern_synch.c)
 * will compute a different value each time p_estcpu increases.  This can
 * cause a switch, but unless the priority crosses a PPQ boundary the actual
 * queue will not change.  The CPU usage estimator ramps up quite quickly
 * when the process is running (linearly), and decays away exponentially, at
 * a rate which is proportionally slower when the system is busy.  The basic
 * principle is that the system will 90% forget that the process used a lot
 * of CPU time in 5 * loadav seconds.  This causes the system to favor
 * processes which haven't run much recently, and to round-robin among other
 * processes.
 */

void
schedclock(struct lwp *l)
{
	struct proc *p = l->l_proc;
	int s;

	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
	SCHED_LOCK(s);
	resetpriority(l);
	SCHED_UNLOCK(s);

	if (l->l_priority >= PUSER)
		l->l_priority = l->l_usrpri;
}

void
suspendsched()
{
	struct lwp *l;
	int s;

	/*
	 * Convert all non-P_SYSTEM LSSLEEP or LSRUN processes to
	 * LSSUSPENDED.
	 */
	proclist_lock_read();
	SCHED_LOCK(s);
	LIST_FOREACH(l, &alllwp, l_list) {
		if ((l->l_proc->p_flag & P_SYSTEM) != 0)
			continue;

		switch (l->l_stat) {
		case LSRUN:
			l->l_proc->p_nrlwps--;
			if ((l->l_flag & L_INMEM) != 0)
				remrunqueue(l);
			/* FALLTHROUGH */
		case LSSLEEP:
			l->l_stat = LSSUSPENDED;
			break;
		case LSONPROC:
			/*
			 * XXX SMP: we need to deal with processes on
			 * other CPUs!
			 */
			break;
		default:
			break;
		}
	}
	SCHED_UNLOCK(s);
	proclist_unlock_read();
}

/*
 * Low-level routines to access the run queue.  Optimised assembler
 * routines can override these.
 */

#ifndef __HAVE_MD_RUNQUEUE

/*
 * On some architectures, it's faster to use a MSB ordering for the priorities
 * than the traditional LSB ordering.
 */
#ifdef __HAVE_BIGENDIAN_BITOPS
#define	RQMASK(n) (0x80000000 >> (n))
#else
#define	RQMASK(n) (0x00000001 << (n))
#endif

/*
 * The primitives that manipulate the run queues.  whichqs tells which
 * of the 32 queues qs have processes in them.  Setrunqueue puts processes
 * into queues, remrunqueue removes them from queues.  The running process is
 * on no queue, other processes are on a queue related to p->p_priority,
 * divided by 4 actually to shrink the 0-127 range of priorities into the 32
 * available queues.
 */

void
setrunqueue(struct lwp *l)
{
	struct prochd *rq;
	struct lwp *prev;
	const int whichq = l->l_priority / 4;

#ifdef DIAGNOSTIC
	if (l->l_back != NULL || l->l_wchan != NULL || l->l_stat != LSRUN)
		panic("setrunqueue");
#endif
	sched_whichqs |= RQMASK(whichq);
	rq = &sched_qs[whichq];
	prev = rq->ph_rlink;
	l->l_forw = (struct lwp *)rq;
	rq->ph_rlink = l;
	prev->l_forw = l;
	l->l_back = prev;
}

void
remrunqueue(struct lwp *l)
{
	struct lwp *prev, *next;
	const int whichq = l->l_priority / 4;
#ifdef DIAGNOSTIC
	if (((sched_whichqs & RQMASK(whichq)) == 0))
		panic("remrunqueue");
#endif
	prev = l->l_back;
	l->l_back = NULL;
	next = l->l_forw;
	prev->l_forw = next;
	next->l_back = prev;
	if (prev == next)
		sched_whichqs &= ~RQMASK(whichq);
}

#undef RQMASK
#endif /* !defined(__HAVE_MD_RUNQUEUE) */
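
/*
 * Illustrative note (not part of the original file): with the LSB bit
 * ordering above, an LWP at priority 57 maps to run queue 57 / 4 == 14,
 * so setrunqueue() sets bit 14 of sched_whichqs (RQMASK(14) == 0x00004000)
 * and links the LWP at the tail of sched_qs[14].  remrunqueue() clears the
 * bit again once the queue head's links meet (prev == next), i.e. when the
 * queue becomes empty, so cpu_switch() can pick the next LWP by scanning
 * the bitmap for the lowest set bit (best priority).
 */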