/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 * $FreeBSD: src/sys/kern/kern_synch.c,v 1.87.2.6 2002/10/13 07:29:53 kbyanc Exp $
 * $DragonFly: src/sys/kern/kern_synch.c,v 1.29 2004/03/08 03:05:27 dillon Exp $
 */

#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>
#include <sys/thread2.h>
#ifdef KTRACE
#include <sys/uio.h>
#include <sys/ktrace.h>
#endif
#include <sys/xwait.h>

#include <machine/cpu.h>
#include <machine/ipl.h>
#include <machine/smp.h>

static void sched_setup (void *dummy);
SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)

int	hogticks;
int	lbolt;
int	sched_quantum;		/* Roundrobin scheduling quantum in ticks. */
int	ncpus;
int	ncpus2, ncpus2_shift, ncpus2_mask;

static struct callout loadav_callout;

struct loadavg averunnable =
	{ {0, 0, 0}, FSCALE };	/* load average, of runnable procs */
/*
 * Constants for averages over 1, 5, and 15 minutes
 * when sampling at 5 second intervals.
 */
static fixpt_t cexp[3] = {
	0.9200444146293232 * FSCALE,	/* exp(-1/12) */
	0.9834714538216174 * FSCALE,	/* exp(-1/60) */
	0.9944598480048967 * FSCALE,	/* exp(-1/180) */
};
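
/*
 * Illustrative note (added commentary, not part of the original source):
 * loadav() below folds these constants into a fixed-point exponential
 * moving average sampled roughly every 5 seconds:
 *
 *	ldavg = (cexp[i] * ldavg + nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT
 *
 * which in real terms is ldavg = ldavg * exp(-5/T) + nrun * (1 - exp(-5/T))
 * for T = 60, 300 and 900 seconds.  For example, starting from a load of 0
 * with two processes continuously runnable, the 1-minute average rises to
 * about 2 * (1 - 0.92) = 0.16 after one sample and converges toward 2.
 */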

static void	endtsleep (void *);
static void	loadav (void *arg);
static void	roundrobin (void *arg);
static void	schedcpu (void *arg);
static void	updatepri (struct proc *p);
static void	crit_panicints(void);

static int
sysctl_kern_quantum(SYSCTL_HANDLER_ARGS)
{
	int error, new_val;

	new_val = sched_quantum * tick;
	error = sysctl_handle_int(oidp, &new_val, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	if (new_val < tick)
		return (EINVAL);
	sched_quantum = new_val / tick;
	hogticks = 2 * sched_quantum;
	return (0);
}

SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW,
	0, sizeof sched_quantum, sysctl_kern_quantum, "I", "");

int
roundrobin_interval(void)
{
	return (sched_quantum);
}

/*
 * Force switch among equal priority processes every 100ms.
 *
 * WARNING! The MP lock is not held on ipi message remotes.
 */
#ifdef SMP

static void
roundrobin_remote(void *arg)
{
	struct proc *p = lwkt_preempted_proc();
	if (p == NULL || RTP_PRIO_NEED_RR(p->p_rtprio.type))
		need_resched();
}

#endif

static void
roundrobin(void *arg)
{
	struct proc *p = lwkt_preempted_proc();
	if (p == NULL || RTP_PRIO_NEED_RR(p->p_rtprio.type))
		need_resched();
#ifdef SMP
	lwkt_send_ipiq_mask(mycpu->gd_other_cpus, roundrobin_remote, NULL);
#endif
	timeout(roundrobin, NULL, sched_quantum);
}

#ifdef SMP

void
resched_cpus(u_int32_t mask)
{
	lwkt_send_ipiq_mask(mask, roundrobin_remote, NULL);
}

#endif

/*
 * Constants for digital decay and forget:
 *	90% of (p_estcpu) usage in 5 * loadav time
 *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 *          Note that, as ps(1) mentions, this can let percentages
 *          total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that schedulerclock() updates p_estcpu and p_cpticks asynchronously.
 *
 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 * 	for (i = 0; i < (5 * loadavg); i++)
 * 		p_estcpu *= decay;
 * will compute
 * 	p_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 * 	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 * 	b = 2 * loadavg
 * then
 * 	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *         For x close to zero, exp(x) =~ 1 + x, since
 *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *         For x close to zero, ln(1+x) =~ x, since
 *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
 *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *         ln(.1) =~ -2.30
 *
 * Proof of (1):
 *    Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *      ln(factor) =~ (-2.30/5*loadav), or
 *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
 *
 * Proof of (2):
 *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *      power*ln(b/(b+1)) =~ -2.30, or
 *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *      loadav: 1       2       3       4
 *      power:  5.68    10.32   14.94   19.55
 */

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
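
/*
 * Illustrative note (added commentary, not part of the original source):
 * schedcpu() applies decay_cpu() to p_estcpu once per second.  With a load
 * average of 1, loadfactor() yields 2 (2 * FSCALE in fixed point), so each
 * application multiplies p_estcpu by 2/(2+1) =~ 0.667.  After the 5.68
 * iterations listed in the table above, 0.667**5.68 =~ 0.1, i.e. 90% of the
 * accumulated estimate has been forgotten in roughly 5 * loadav seconds,
 * as claimed.
 */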

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");

/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */
static int	fscale __unused = FSCALE;
SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, "");

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11

/*
 * Recompute process priorities, every hz ticks.
 */
/* ARGSUSED */
static void
schedcpu(void *arg)
{
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct proc *p;
	int realstathz, s;

	realstathz = stathz ? stathz : hz;
	FOREACH_PROC_IN_SYSTEM(p) {
		/*
		 * Increment time in/out of memory and sleep time
		 * (if sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		p->p_swtime++;
		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
			p->p_slptime++;
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (p->p_slptime > 1)
			continue;
		s = splhigh();	/* prevent state changes and protect run queue */
		/*
		 * p_pctcpu is only for ps.
		 */
#if	(FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (realstathz == 100)?
			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
			100 * (((fixpt_t) p->p_cpticks)
				<< (FSHIFT - CCPU_SHIFT)) / realstathz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
			(p->p_cpticks * FSCALE / realstathz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		p->p_estcpu = decay_cpu(loadfac, p->p_estcpu);
		resetpriority(p);
		splx(s);
	}
	wakeup((caddr_t)&lbolt);
	timeout(schedcpu, (void *)0, hz);
}
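
/*
 * Illustrative note (added commentary, not part of the original source):
 * p_pctcpu is an exponential average of recent cpu use.  The decay step
 * above multiplies it by ccpu = exp(-1/20) once per second, so after 60
 * seconds only exp(-3) =~ 5% of the old value remains (the "95% in 60
 * seconds" figure quoted above).  The increment uses CCPU_SHIFT as a
 * shortcut: with realstathz == 100 the exact contribution would be
 * p_cpticks * FSCALE * (1 - exp(-1/20)) / 100 =~ p_cpticks * FSCALE / 2048,
 * which is simply p_cpticks << (FSHIFT - 11), hence CCPU_SHIFT == 11.
 */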

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay p_estcpu to zero.
 */
static void
updatepri(struct proc *p)
{
	unsigned int newcpu = p->p_estcpu;
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);

	if (p->p_slptime > 5 * loadfac) {
		p->p_estcpu = 0;
	} else {
		p->p_slptime--;	/* the first time was done in schedcpu */
		while (newcpu && --p->p_slptime)
			newcpu = decay_cpu(loadfac, newcpu);
		p->p_estcpu = newcpu;
	}
	resetpriority(p);
}

/*
 * We're only looking at 7 bits of the address; everything is
 * aligned to 4, lots of things are aligned to greater powers
 * of 2.  Shift right by 8, i.e. drop the bottom 256 worth.
 */
#define TABLESIZE	128
static TAILQ_HEAD(slpquehead, thread) slpque[TABLESIZE];
#define LOOKUP(x)	(((intptr_t)(x) >> 8) & (TABLESIZE - 1))

/*
 * During autoconfiguration or after a panic, a sleep will simply
 * lower the priority briefly to allow interrupts, then return.
 * The priority to be used (safepri) is machine-dependent, thus this
 * value is initialized and maintained in the machine-dependent layers.
 * This priority will typically be 0, or the lowest priority
 * that is safe for use on the interrupt stack; it can be made
 * higher to block network software interrupts after panics.
 */
int safepri;

void
sleepinit(void)
{
	int i;

	sched_quantum = hz/10;
	hogticks = 2 * sched_quantum;
	for (i = 0; i < TABLESIZE; i++)
		TAILQ_INIT(&slpque[i]);
}

/*
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If flags includes PCATCH flag, signals are checked
 * before and after sleeping, else signals are not checked.  Returns 0 if
 * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 * signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the system
 * call should be interrupted by the signal (return EINTR).
 *
 * If the process has P_CURPROC set mi_switch() will not re-queue it to
 * the userland scheduler queues because we are in a SSLEEP state.  If
 * we are not the current process then we have to remove ourselves from
 * the scheduler queues.
 *
 * YYY priority now unused
 */
int
tsleep(void *ident, int flags, const char *wmesg, int timo)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;		/* may be NULL */
	int s, sig = 0, catch = flags & PCATCH;
	int id = LOOKUP(ident);
	struct callout_handle thandle;

	/*
	 * NOTE: removed KTRPOINT, it could cause races due to blocking
	 * even in stable.  Just scrap it for now.
	 */
	if (cold || panicstr) {
		/*
		 * After a panic, or during autoconfiguration,
		 * just give interrupts a chance, then just return;
		 * don't run any other procs or panic below,
		 * in case this is the idle process and already asleep.
		 */
		crit_panicints();
		return (0);
	}
	KKASSERT(td != &mycpu->gd_idlethread);	/* you must be kidding! */
	s = splhigh();
	KASSERT(ident != NULL, ("tsleep: no ident"));
	KASSERT(p == NULL || p->p_stat == SRUN, ("tsleep %p %s %d",
		ident, wmesg, p->p_stat));

	crit_enter();
	td->td_wchan = ident;
	td->td_wmesg = wmesg;
	if (p)
		p->p_slptime = 0;
	lwkt_deschedule_self();
	TAILQ_INSERT_TAIL(&slpque[id], td, td_threadq);
	if (timo)
		thandle = timeout(endtsleep, (void *)td, timo);
	/*
	 * We put ourselves on the sleep queue and start our timeout
	 * before calling CURSIG, as we could stop there, and a wakeup
	 * or a SIGCONT (or both) could occur while we were stopped.
	 * A SIGCONT would cause us to be marked as SSLEEP
	 * without resuming us, thus we must be ready for sleep
	 * when CURSIG is called.  If the wakeup happens while we're
	 * stopped, td->td_wchan will be 0 upon return from CURSIG.
	 */
	if (p) {
		if (catch) {
			p->p_flag |= P_SINTR;
			if ((sig = CURSIG(p))) {
				if (td->td_wchan) {
					unsleep(td);
					lwkt_schedule_self();
				}
				p->p_stat = SRUN;
				goto resume;
			}
			if (td->td_wchan == NULL) {
				catch = 0;
				goto resume;
			}
		} else {
			sig = 0;
		}

		/*
		 * If we are not the current process we have to remove
		 * ourselves from the run queue.
		 */
		KASSERT(p->p_stat == SRUN, ("PSTAT NOT SRUN %d %d", p->p_pid, p->p_stat));
		/*
		 * If this is the current 'user' process schedule another one.
		 */
		clrrunnable(p, SSLEEP);
		p->p_stats->p_ru.ru_nvcsw++;
		KKASSERT(td->td_release || (p->p_flag & P_CURPROC) == 0);
		mi_switch();
		KASSERT(p->p_stat == SRUN, ("tsleep: stat not srun"));
	} else {
		lwkt_switch();
	}
resume:
	crit_exit();
	if (p)
		p->p_flag &= ~P_SINTR;
	splx(s);
	if (td->td_flags & TDF_TIMEOUT) {
		td->td_flags &= ~TDF_TIMEOUT;
		if (sig == 0)
			return (EWOULDBLOCK);
	} else if (timo) {
		untimeout(endtsleep, (void *)td, thandle);
	} else if (td->td_wmesg) {
		/*
		 * This can happen if a thread is woken up directly.  Clear
		 * wmesg to avoid debugging confusion.
		 */
		td->td_wmesg = NULL;
	}
	/* inline of iscaught() */
	if (p) {
		if (catch && (sig != 0 || (sig = CURSIG(p)))) {
			if (SIGISMEMBER(p->p_sigacts->ps_sigintr, sig))
				return (EINTR);
			return (ERESTART);
		}
	}
	return (0);
}
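
/*
 * Illustrative usage sketch (added commentary, not part of the original
 * source; 'sc' and its fields are hypothetical).  The conventional pattern
 * is to re-test the awaited condition in a loop, holding whatever interlock
 * protects it (typically an spl level that blocks the interrupt which will
 * issue the wakeup) across the test and the tsleep() call:
 *
 *	s = splhigh();
 *	while (sc->sc_ready == 0)
 *		(void) tsleep(&sc->sc_ready, 0, "screy", 0);
 *	splx(s);
 *
 * and on the producing side:
 *
 *	sc->sc_ready = 1;
 *	wakeup(&sc->sc_ready);
 *
 * Passing PCATCH and/or a non-zero timo adds signal and timeout handling,
 * in which case the ERESTART/EINTR/EWOULDBLOCK returns documented above
 * must be checked by the caller.
 */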

/*
 * Implement the timeout for tsleep.  We interlock against
 * wchan when setting TDF_TIMEOUT.  For processes we remove
 * the sleep if the process is stopped rather than sleeping,
 * so it remains stopped.
 */
static void
endtsleep(void *arg)
{
	thread_t td = arg;
	struct proc *p;
	int s;

	s = splhigh();
	if (td->td_wchan) {
		td->td_flags |= TDF_TIMEOUT;
		if ((p = td->td_proc) != NULL) {
			if (p->p_stat == SSLEEP)
				setrunnable(p);
			else
				unsleep(td);
		} else {
			unsleep(td);
			lwkt_schedule(td);
		}
	}
	splx(s);
}

/*
 * Remove a process from its wait queue
 */
void
unsleep(struct thread *td)
{
	int s;

	s = splhigh();
	if (td->td_wchan) {
#if 0
		if (p->p_flag & P_XSLEEP) {
			struct xwait *w = p->p_wchan;
			TAILQ_REMOVE(&w->waitq, p, p_procq);
			p->p_flag &= ~P_XSLEEP;
		} else
#endif
		TAILQ_REMOVE(&slpque[LOOKUP(td->td_wchan)], td, td_threadq);
		td->td_wchan = NULL;
	}
	splx(s);
}
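
/*
 * Illustrative note (added commentary, not part of the original source):
 * when tsleep() is given a non-zero timo it registers endtsleep() with
 * timeout().  If the timer fires before a wakeup, endtsleep() sets
 * TDF_TIMEOUT and makes the thread runnable again, and tsleep() reports
 * EWOULDBLOCK (unless a caught signal takes precedence).  If a wakeup
 * arrives first, tsleep() cancels the pending callout with untimeout()
 * on the way out.
 */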

#if 0
/*
 * Make all processes sleeping on the explicit lock structure runnable.
 */
void
xwakeup(struct xwait *w)
{
	struct proc *p;
	int s;

	s = splhigh();
	++w->gen;
	while ((p = TAILQ_FIRST(&w->waitq)) != NULL) {
		TAILQ_REMOVE(&w->waitq, p, p_procq);
		KASSERT(p->p_wchan == w && (p->p_flag & P_XSLEEP),
		    ("xwakeup: wchan mismatch for %p (%p/%p) %08x", p, p->p_wchan, w, p->p_flag & P_XSLEEP));
		p->p_wchan = NULL;
		p->p_flag &= ~P_XSLEEP;
		if (p->p_stat == SSLEEP) {
			/* OPTIMIZED EXPANSION OF setrunnable(p); */
			if (p->p_slptime > 1)
				updatepri(p);
			p->p_slptime = 0;
			p->p_stat = SRUN;
			if (p->p_flag & P_INMEM) {
				setrunqueue(p);
			} else {
				p->p_flag |= P_SWAPINREQ;
				wakeup((caddr_t)&proc0);
			}
		}
	}
	splx(s);
}
#endif

/*
 * Make all processes sleeping on the specified identifier runnable.
 */
static void
_wakeup(void *ident, int count)
{
	struct slpquehead *qp;
	struct thread *td;
	struct thread *ntd;
	struct proc *p;
	int s;
	int id = LOOKUP(ident);

	s = splhigh();
	qp = &slpque[id];
restart:
	for (td = TAILQ_FIRST(qp); td != NULL; td = ntd) {
		ntd = TAILQ_NEXT(td, td_threadq);
		if (td->td_wchan == ident) {
			TAILQ_REMOVE(qp, td, td_threadq);
			td->td_wchan = NULL;
			if ((p = td->td_proc) != NULL && p->p_stat == SSLEEP) {
				/* OPTIMIZED EXPANSION OF setrunnable(p); */
				if (p->p_slptime > 1)
					updatepri(p);
				p->p_slptime = 0;
				p->p_stat = SRUN;
				if (p->p_flag & P_INMEM) {
					setrunqueue(p);
				} else {
					p->p_flag |= P_SWAPINREQ;
					wakeup((caddr_t)&proc0);
				}
				/* END INLINE EXPANSION */
			} else if (p == NULL) {
				lwkt_schedule(td);
			}
			if (--count == 0)
				break;
			goto restart;
		}
	}
	splx(s);
}

void
wakeup(void *ident)
{
	_wakeup(ident, 0);
}

void
wakeup_one(void *ident)
{
	_wakeup(ident, 1);
}
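
/*
 * Illustrative note (added commentary, not part of the original source):
 * the count argument to _wakeup() bounds how many sleepers are awakened.
 * wakeup() passes 0, which the pre-decrement never brings to zero, so every
 * thread sleeping on the identifier is made runnable; wakeup_one() passes 1
 * and stops after waking a single thread, which helps avoid a thundering
 * herd when only one waiter can make progress anyway.
 */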

/*
 * The machine independent parts of mi_switch().
 * Must be called at splstatclock() or higher.
 */
void
mi_switch()
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;	/* XXX */
	struct rlimit *rlim;
	int x;
	u_int64_t ttime;

	/*
	 * XXX this spl is almost unnecessary.  It is partly to allow for
	 * sloppy callers that don't do it (issignal() via CURSIG() is the
	 * main offender).  It is partly to work around a bug in the i386
	 * cpu_switch() (the ipl is not preserved).  We ran for years
	 * without it.  I think there was only an interrupt latency problem.
	 * The main caller, tsleep(), does an splx() a couple of instructions
	 * after calling here.  The buggy caller, issignal(), usually calls
	 * here at spl0() and sometimes returns at splhigh().  The process
	 * then runs for a little too long at splhigh().  The ipl gets fixed
	 * when the process returns to user mode (or earlier).
	 *
	 * It would probably be better to always call here at spl0(). Callers
	 * are prepared to give up control to another process, so they must
	 * be prepared to be interrupted.  The clock stuff here may not
	 * actually need splstatclock().
	 */
	x = splstatclock();
	clear_resched();

	/*
	 * Check if the process exceeds its cpu resource allocation.
	 * If over max, kill it.  Time spent in interrupts is not
	 * included.  YYY 64 bit math is expensive.  Ick.
	 */
	ttime = td->td_sticks + td->td_uticks;
	if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY &&
	    ttime > p->p_limit->p_cpulimit) {
		rlim = &p->p_rlimit[RLIMIT_CPU];
		if (ttime / (rlim_t)1000000 >= rlim->rlim_max) {
			killproc(p, "exceeded maximum CPU limit");
		} else {
			psignal(p, SIGXCPU);
			if (rlim->rlim_cur < rlim->rlim_max) {
				/* XXX: we should make a private copy */
				rlim->rlim_cur += 5;
			}
		}
	}

	/*
	 * Pick a new current process and record its start time.  If we
	 * are in a SSTOPped state we deschedule ourselves.  YYY this needs
	 * to be cleaned up, remember that LWKTs stay on their run queue
	 * which works differently than the user scheduler which removes
	 * the process from the runq when it runs it.
	 */
	mycpu->gd_cnt.v_swtch++;
	if (p->p_stat == SSTOP)
		lwkt_deschedule_self();
	lwkt_switch();

	splx(x);
}

/*
 * Change process state to be runnable,
 * placing it on the run queue if it is in memory,
 * and awakening the swapper if it isn't in memory.
 */
void
setrunnable(struct proc *p)
{
	int s;

	s = splhigh();
	switch (p->p_stat) {
	case 0:
	case SRUN:
	case SZOMB:
	default:
		panic("setrunnable");
	case SSTOP:
	case SSLEEP:
		unsleep(p->p_thread);	/* e.g. when sending signals */
		break;

	case SIDL:
		break;
	}
	p->p_stat = SRUN;
	if (p->p_flag & P_INMEM)
		setrunqueue(p);
	splx(s);
	if (p->p_slptime > 1)
		updatepri(p);
	p->p_slptime = 0;
	if ((p->p_flag & P_INMEM) == 0) {
		p->p_flag |= P_SWAPINREQ;
		wakeup((caddr_t)&proc0);
	}
}

/*
 * Change the process state to NOT be runnable, removing it from the run
 * queue.  If P_CURPROC is not set and we are in SRUN the process is on the
 * run queue (If P_INMEM is not set then it isn't because it is swapped).
 */
void
clrrunnable(struct proc *p, int stat)
{
	int s;

	s = splhigh();
	switch(p->p_stat) {
	case SRUN:
		if (p->p_flag & P_ONRUNQ)
			remrunqueue(p);
		break;
	default:
		break;
	}
	p->p_stat = stat;
	splx(s);
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
void
resetpriority(struct proc *p)
{
	unsigned int newpriority;
	int opq;
	int npq;

	/*
	 * Set p_priority for general process comparisons
	 */
	switch(p->p_rtprio.type) {
	case RTP_PRIO_REALTIME:
		p->p_priority = PRIBASE_REALTIME + p->p_rtprio.prio;
		return;
	case RTP_PRIO_NORMAL:
		break;
	case RTP_PRIO_IDLE:
		p->p_priority = PRIBASE_IDLE + p->p_rtprio.prio;
		return;
	case RTP_PRIO_THREAD:
		p->p_priority = PRIBASE_THREAD + p->p_rtprio.prio;
		return;
	}

	/*
	 * NORMAL priorities fall through.  These are based on niceness
	 * and cpu use.
	 */
	newpriority = NICE_ADJUST(p->p_nice - PRIO_MIN) +
			p->p_estcpu / ESTCPURAMP;
	newpriority = min(newpriority, MAXPRI);
	npq = newpriority / PPQ;
	crit_enter();
	opq = (p->p_priority & PRIMASK) / PPQ;
	if (p->p_stat == SRUN && (p->p_flag & P_ONRUNQ) && opq != npq) {
		/*
		 * We have to move the process to another queue
		 */
		remrunqueue(p);
		p->p_priority = PRIBASE_NORMAL + newpriority;
		setrunqueue(p);
	} else {
		/*
		 * We can just adjust the priority and it will be picked
		 * up later.
		 */
		KKASSERT(opq == npq || (p->p_flag & P_ONRUNQ) == 0);
		p->p_priority = PRIBASE_NORMAL + newpriority;
	}
	crit_exit();
}
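
/*
 * Illustrative note (added commentary, not part of the original source):
 * for RTP_PRIO_NORMAL processes the computed newpriority lands in the
 * range [0, MAXPRI] and selects a user run queue via newpriority / PPQ.
 * The process is physically moved to a different run queue only when it
 * is already queued (P_ONRUNQ) and the queue index actually changes;
 * otherwise only p_priority is updated and the new value takes effect the
 * next time the process is placed on a run queue.
 */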

/*
 * Compute a tenex style load average of a quantity on
 * 1, 5 and 15 minute intervals.
 */
static void
loadav(void *arg)
{
	int i, nrun;
	struct loadavg *avg;
	struct proc *p;

	avg = &averunnable;
	nrun = 0;
	FOREACH_PROC_IN_SYSTEM(p) {
		switch (p->p_stat) {
		case SRUN:
		case SIDL:
			nrun++;
		}
	}
	for (i = 0; i < 3; i++)
		avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
		    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;

	/*
	 * Schedule the next update to occur after 5 seconds, but add a
	 * random variation to avoid synchronisation with processes that
	 * run at regular intervals.
	 */
	callout_reset(&loadav_callout, hz * 4 + (int)(random() % (hz * 2 + 1)),
	    loadav, NULL);
}

/* ARGSUSED */
static void
sched_setup(void *dummy)
{

	callout_init(&loadav_callout);

	/* Kick off timeout driven events by calling first time. */
	roundrobin(NULL);
	schedcpu(NULL);
	loadav(NULL);
}

/*
 * We adjust the priority of the current process.  The priority of
 * a process gets worse as it accumulates CPU time.  The cpu usage
 * estimator (p_estcpu) is increased here.  resetpriority() will
 * compute a different priority each time p_estcpu increases by
 * INVERSE_ESTCPU_WEIGHT (until MAXPRI is reached).
 *
 * The cpu usage estimator ramps up quite quickly when the process is
 * running (linearly), and decays away exponentially, at a rate which
 * is proportionally slower when the system is busy.  The basic principle
 * is that the system will 90% forget that the process used a lot of CPU
 * time in 5 * loadav seconds.  This causes the system to favor processes
 * which haven't run much recently, and to round-robin among other processes.
 *
 * WARNING! called from a fast-int or an IPI, the MP lock MIGHT NOT BE HELD
 * and we cannot block.
 */
void
schedulerclock(void *dummy)
{
	struct thread *td;
	struct proc *p;

	td = curthread;
	if ((p = td->td_proc) != NULL) {
		p->p_cpticks++;
		p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
		if ((p->p_estcpu % PPQ) == 0 && try_mplock()) {
			resetpriority(p);
			rel_mplock();
		}
	}
}

static
void
crit_panicints(void)
{
	int s;
	int cpri;

	s = splhigh();
	cpri = crit_panic_save();
	splx(safepri);
	crit_panic_restore(cpri);
	splx(s);
}