/*	$NetBSD: kern_synch.c,v 1.283 2010/04/30 10:02:00 martin Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009
 *    The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 * Daniel Sieger.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.283 2010/04/30 10:02:00 martin Exp $");

#include "opt_kstack.h"
#include "opt_perfctrs.h"
#include "opt_sa.h"
#include "opt_dtrace.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/cpu.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sa.h>
#include <sys/savar.h>
#include <sys/syscall_stats.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/lwpctl.h>
#include <sys/atomic.h>
#include <sys/simplelock.h>

#include <uvm/uvm_extern.h>

#include <dev/lockstat.h>

#include <sys/dtrace_bsd.h>
int				dtrace_vtime_active = 0;
dtrace_vtime_switch_func_t	dtrace_vtime_switch_func;

static void	sched_unsleep(struct lwp *, bool);
static void	sched_changepri(struct lwp *, pri_t);
static void	sched_lendpri(struct lwp *, pri_t);
static void	resched_cpu(struct lwp *);

syncobj_t sleep_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sleepq_unsleep,
	sleepq_changepri,
	sleepq_lendpri,
	syncobj_noowner,
};

syncobj_t sched_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sched_unsleep,
	sched_changepri,
	sched_lendpri,
	syncobj_noowner,
};

unsigned	sched_pstats_ticks;
kcondvar_t	lbolt;			/* once a second sleep address */

/* Preemption event counters */
static struct evcnt kpreempt_ev_crit;
static struct evcnt kpreempt_ev_klock;
static struct evcnt kpreempt_ev_immed;

/*
 * During autoconfiguration or after a panic, a sleep will simply lower the
 * priority briefly to allow interrupts, then return.  The priority to be
 * used (safepri) is machine-dependent, thus this value is initialized and
 * maintained in the machine-dependent layers.  This priority will typically
 * be 0, or the lowest priority that is safe for use on the interrupt stack;
 * it can be made higher to block network software interrupts after panics.
 */
int	safepri;

void
synch_init(void)
{

	cv_init(&lbolt, "lbolt");

	evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: critical section");
	evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: kernel_lock");
	evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "immediate");
}

/*
 * OBSOLETE INTERFACE
 *
 * General sleep call.  Suspends the current LWP until a wakeup is
 * performed on the specified identifier.  The LWP will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).
 * If pri includes the PCATCH flag, signals are checked before and after
 * sleeping; otherwise signals are not checked.  Returns 0 if awakened,
 * EWOULDBLOCK if the timeout expires.  If PCATCH is set and a signal
 * needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the
 * system call should be interrupted by the signal.
 *
 * The interlock is held until we are on a sleep queue.  The interlock will
 * be locked before returning back to the caller unless the PNORELOCK flag
 * is specified, in which case the interlock will always be unlocked upon
 * return.
 */
int
ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		if ((priority & PNORELOCK) != 0)
			simple_unlock(interlock);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);

	if (interlock != NULL) {
		KASSERT(simple_lock_held(interlock));
		simple_unlock(interlock);
	}

	error = sleepq_block(timo, priority & PCATCH);

	if (interlock != NULL && (priority & PNORELOCK) == 0)
		simple_lock(interlock);

	return error;
}

int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
	mutex_exit(mtx);
	error = sleepq_block(timo, priority & PCATCH);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	kmutex_t *mp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, l, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, l, wmesg, &sleep_syncobj);
	error = sleepq_block(timo, intr);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}

#ifdef KERN_SA
/*
 * sa_awaken:
 *
 *	We believe this lwp is an SA lwp.  If it's yielding,
 *	let it know it needs to wake up.
 *
 *	We are called and exit with the lwp locked.  We are
 *	called in the middle of wakeup operations, so we must
 *	not touch the locks at all.
 */
void
sa_awaken(struct lwp *l)
{
	/* LOCK_ASSERT(lwp_locked(l, NULL)); */

	if (l == l->l_savp->savp_lwp && l->l_flag & LW_SA_YIELD)
		l->l_flag &= ~LW_SA_IDLE;
}
#endif /* KERN_SA */

/*
 * OBSOLETE INTERFACE
 *
 * Make all LWPs sleeping on the specified identifier runnable.
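 *
 * A minimal usage sketch pairing mtsleep() with wakeup(); the softc,
 * sc_lock and sc_ready names are hypothetical, and new code should use
 * condvar(9) instead of these obsolete interfaces.  Waiter side:
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_ready)
 *		mtsleep(&sc->sc_ready, PRIBIO, "scready", 0, &sc->sc_lock);
 *	mutex_exit(&sc->sc_lock);
 *
 * Producer side, with sc_lock held:
 *
 *	sc->sc_ready = true;
 *	wakeup(&sc->sc_ready);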
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (__predict_false(cold))
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, (u_int)-1, mp);
}

/*
 * OBSOLETE INTERFACE
 *
 * Make the highest-priority LWP that is first in line on the
 * specified identifier runnable.
 */
void
wakeup_one(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (__predict_false(cold))
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, 1, mp);
}

/*
 * General yield call.  Puts the current LWP back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current LWP explicitly requests it (e.g. sched_yield(2)).
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * General preemption call.  Puts the current LWP back on its run queue
 * and performs an involuntary context switch.
 */
void
preempt(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	l->l_nivcsw++;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * Handle a request made by another agent to preempt the current LWP
 * in-kernel.  Usually called when l_dopreempt may be non-zero.
 *
 * Character addresses for lockstat only.
 */
static char	in_critical_section;
static char	kernel_lock_held;
static char	is_softint;
static char	cpu_kpreempt_enter_fail;

bool
kpreempt(uintptr_t where)
{
	uintptr_t failed;
	lwp_t *l;
	int s, dop, lsflag;

	l = curlwp;
	failed = 0;
	while ((dop = l->l_dopreempt) != 0) {
		if (l->l_stat != LSONPROC) {
			/*
			 * About to block (or die), let it happen.
			 * Doesn't really count as "preemption has
			 * been blocked", since we're going to
			 * context switch.
			 */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false((l->l_flag & LW_IDLE) != 0)) {
			/* Can't preempt idle loop, don't count as failure. */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false(l->l_nopreempt != 0)) {
			/* LWP holds preemption disabled, explicitly. */
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_crit.ev_count++;
			}
			failed = (uintptr_t)&in_critical_section;
			break;
		}
		if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
			/* Can't preempt soft interrupts yet. */
			l->l_dopreempt = 0;
			failed = (uintptr_t)&is_softint;
			break;
		}
		s = splsched();
		if (__predict_false(l->l_blcnt != 0 ||
		    curcpu()->ci_biglock_wanted != NULL)) {
			/* Hold or want kernel_lock, code is not MT safe. */
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_klock.ev_count++;
			}
			failed = (uintptr_t)&kernel_lock_held;
			break;
		}
		if (__predict_false(!cpu_kpreempt_enter(where, s))) {
			/*
			 * It may be that the IPL is too high.
			 * cpu_kpreempt_enter() can schedule an
			 * interrupt to retry later.
			 */
			splx(s);
			failed = (uintptr_t)&cpu_kpreempt_enter_fail;
			break;
		}
		/* Do it! */
		if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
			kpreempt_ev_immed.ev_count++;
		}
		lwp_lock(l);
		mi_switch(l);
		l->l_nopreempt++;
		splx(s);

		/* Take care of any MD cleanup. */
		cpu_kpreempt_exit(where);
		l->l_nopreempt--;
	}

	if (__predict_true(!failed)) {
		return false;
	}

	/* Record preemption failure for reporting via lockstat. */
	atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
	lsflag = 0;
	LOCKSTAT_ENTER(lsflag);
	if (__predict_false(lsflag)) {
		if (where == 0) {
			where = (uintptr_t)__builtin_return_address(0);
		}
		/* Preemption is on, might recurse, so make it atomic. */
		if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr, NULL,
		    (void *)where) == NULL) {
			LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
			l->l_pfaillock = failed;
		}
	}
	LOCKSTAT_EXIT(lsflag);
	return true;
}

/*
 * Return true if preemption is explicitly disabled.
 */
bool
kpreempt_disabled(void)
{
	const lwp_t *l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
}

/*
 * Disable kernel preemption.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

/*
 * Reenable kernel preemption.
 */
void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}

/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */

void
updatertime(lwp_t *l, const struct bintime *now)
{

	if (__predict_false(l->l_flag & LW_IDLE))
		return;

	/* rtime += now - stime */
	bintime_add(&l->l_rtime, now);
	bintime_sub(&l->l_rtime, &l->l_stime);
}

/*
 * Select the next LWP for the current CPU to run.
 */
static inline lwp_t *
nextlwp(struct cpu_info *ci, struct schedstate_percpu *spc)
{
	lwp_t *newl;

	/*
	 * Let sched_nextlwp() select the LWP to run the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * that the new thread will be unlocked after setting the
	 * LWP lock.
	 */
	newl = sched_nextlwp();
	if (newl != NULL) {
		sched_dequeue(newl);
		KASSERT(lwp_locked(newl, spc->spc_mutex));
		KASSERT(newl->l_cpu == ci);
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
		lwp_setlock(newl, spc->spc_lwplock);
	} else {
		newl = ci->ci_data.cpu_idlelwp;
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}

	/*
	 * Only clear want_resched if there are no pending (slow)
	 * software interrupts.
	 */
	ci->ci_want_resched = ci->ci_data.cpu_softints;
	spc->spc_flags &= ~SPCF_SWITCHCLEAR;
	spc->spc_curpriority = lwp_eprio(newl);

	return newl;
}

/*
 * The machine independent parts of context switch.
 *
 * Returns 1 if another LWP was actually run.
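 *
 * Expected call pattern (a sketch modelled on yield() and preempt()
 * above): the caller releases kernel_lock, takes the LWP lock, and
 * calls mi_switch(), which drops that lock before returning:
 *
 *	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
 *	lwp_lock(l);
 *	(void)mi_switch(l);
 *	KERNEL_LOCK(l->l_biglocks, l);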
 */
int
mi_switch(lwp_t *l)
{
	struct cpu_info *ci;
	struct schedstate_percpu *spc;
	struct lwp *newl;
	int retval, oldspl;
	struct bintime bt;
	bool returning;

	KASSERT(lwp_locked(l, NULL));
	KASSERT(kpreempt_disabled());
	LOCKDEBUG_BARRIER(l->l_mutex, 1);

	kstack_check_magic(l);

	binuptime(&bt);

	KASSERT((l->l_pflag & LP_RUNNING) != 0);
	KASSERT(l->l_cpu == curcpu());
	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	returning = false;
	newl = NULL;

	/*
	 * If we have been asked to switch to a specific LWP, then there
	 * is no need to inspect the run queues.  If a soft interrupt is
	 * blocking, then return to the interrupted thread without adjusting
	 * VM context or its start time: neither has been changed in order
	 * to take the interrupt.
	 */
	if (l->l_switchto != NULL) {
		if ((l->l_pflag & LP_INTR) != 0) {
			returning = true;
			softint_block(l);
			if ((l->l_pflag & LP_TIMEINTR) != 0)
				updatertime(l, &bt);
		}
		newl = l->l_switchto;
		l->l_switchto = NULL;
	}
#ifndef __HAVE_FAST_SOFTINTS
	else if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}
#endif	/* !__HAVE_FAST_SOFTINTS */

	/* Count time spent in current system call */
	if (!returning) {
		SYSCALL_TIME_SLEEP(l);

		/*
		 * XXXSMP If we are using h/w performance counters,
		 * save context.
		 */
#if PERFCTRS
		if (PMC_ENABLED(l->l_proc)) {
			pmc_save_context(l->l_proc);
		}
#endif
		updatertime(l, &bt);
	}

	/* Lock the runqueue */
	KASSERT(l->l_stat != LSRUN);
	mutex_spin_enter(spc->spc_mutex);

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	if (l->l_stat == LSONPROC && l != newl) {
		KASSERT(lwp_locked(l, spc->spc_lwplock));
		if ((l->l_flag & LW_IDLE) == 0) {
			l->l_stat = LSRUN;
			lwp_setlock(l, spc->spc_mutex);
			sched_enqueue(l, true);
			/* Handle migration case */
			KASSERT(spc->spc_migrating == NULL);
			if (l->l_target_cpu != NULL) {
				spc->spc_migrating = l;
			}
		} else
			l->l_stat = LSIDL;
	}

	/* Pick new LWP to run. */
	if (newl == NULL) {
		newl = nextlwp(ci, spc);
	}

	/* Items that must be updated with the CPU locked. */
	if (!returning) {
		/* Update the new LWP's start time. */
		newl->l_stime = bt;

		/*
		 * ci_curlwp changes when a fast soft interrupt occurs.
		 * We use cpu_onproc to keep track of which kernel or
		 * user thread is running 'underneath' the software
		 * interrupt.  This is important for time accounting,
		 * itimers and forcing user threads to preempt (aston).
		 */
		ci->ci_data.cpu_onproc = newl;
	}

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
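	 *
	 * This acknowledges the resched request for this CPU, clears
	 * any deferred kpreempt() request, and, if a preemption was
	 * held off earlier, reports the hold-off time via lockstat.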
	 */
	cpu_did_resched(l);
	l->l_dopreempt = 0;
	if (__predict_false(l->l_pfailaddr != 0)) {
		LOCKSTAT_FLAG(lsflag);
		LOCKSTAT_ENTER(lsflag);
		LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
		LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
		    1, l->l_pfailtime, l->l_pfailaddr);
		LOCKSTAT_EXIT(lsflag);
		l->l_pfailtime = 0;
		l->l_pfaillock = 0;
		l->l_pfailaddr = 0;
	}

	if (l != newl) {
		struct lwp *prevlwp;

		/* Release all locks, but leave the current LWP locked */
		if (l->l_mutex == spc->spc_mutex) {
			/*
			 * Drop spc_lwplock, if the current LWP has been moved
			 * to the run queue (it is now locked by spc_mutex).
			 */
			mutex_spin_exit(spc->spc_lwplock);
		} else {
			/*
			 * Otherwise, drop the spc_mutex, we are done with the
			 * run queues.
			 */
			mutex_spin_exit(spc->spc_mutex);
		}

		/*
		 * Mark that a context switch is going to be performed
		 * for this LWP, to protect it from being switched
		 * to on another CPU.
		 */
		KASSERT(l->l_ctxswtch == 0);
		l->l_ctxswtch = 1;
		l->l_ncsw++;
		KASSERT((l->l_pflag & LP_RUNNING) != 0);
		l->l_pflag &= ~LP_RUNNING;

		/*
		 * Increase the count of spin-mutexes before the release
		 * of the last lock - we must remain at IPL_SCHED during
		 * the context switch.
		 */
		oldspl = MUTEX_SPIN_OLDSPL(ci);
		ci->ci_mtx_count--;
		lwp_unlock(l);

		/* Count the context switch on this CPU. */
		ci->ci_data.cpu_nswtch++;

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL)
			l->l_lwpctl->lc_curcpu = LWPCTL_CPU_NONE;

		/*
		 * Save old VM context, unless a soft interrupt
		 * handler is blocking.
		 */
		if (!returning)
			pmap_deactivate(l);

		/*
		 * We may need to spin-wait if 'newl' is still
		 * context switching on another CPU.
		 */
		if (__predict_false(newl->l_ctxswtch != 0)) {
			u_int count;
			count = SPINLOCK_BACKOFF_MIN;
			while (newl->l_ctxswtch)
				SPINLOCK_BACKOFF(count);
		}

		/*
		 * If DTrace has set the active vtime enum to anything
		 * other than INACTIVE (0), then it should have set the
		 * function to call.
		 */
		if (__predict_false(dtrace_vtime_active)) {
			(*dtrace_vtime_switch_func)(newl);
		}

		/* Switch to the new LWP. */
		prevlwp = cpu_switchto(l, newl, returning);
		ci = curcpu();

		/*
		 * Switched away - we have new curlwp.
		 * Restore VM context and IPL.
		 */
		pmap_activate(l);
		uvm_emap_switch(l);

		if (prevlwp != NULL) {
			/* Normalize the count of the spin-mutexes */
			ci->ci_mtx_count++;
			/* Unmark the state of context switch */
			membar_exit();
			prevlwp->l_ctxswtch = 0;
		}

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL) {
			l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
			l->l_lwpctl->lc_pctr++;
		}

		KASSERT(l->l_cpu == ci);
		splx(oldspl);
		retval = 1;
	} else {
		/* Nothing to do - just unlock and return. */
		mutex_spin_exit(spc->spc_mutex);
		lwp_unlock(l);
		retval = 0;
	}

	KASSERT(l == curlwp);
	KASSERT(l->l_stat == LSONPROC);

	/*
	 * XXXSMP If we are using h/w performance counters, restore context.
	 * XXXSMP preemption problem.
	 */
#if PERFCTRS
	if (PMC_ENABLED(l->l_proc)) {
		pmc_restore_context(l->l_proc);
	}
#endif
	SYSCALL_TIME_WAKEUP(l);
	LOCKDEBUG_BARRIER(NULL, 1);

	return retval;
}

/*
 * The machine independent parts of context switch to oblivion.
 * Does not return.  Call with the LWP unlocked.
 */
void
lwp_exit_switchaway(lwp_t *l)
{
	struct cpu_info *ci;
	struct lwp *newl;
	struct bintime bt;

	ci = l->l_cpu;

	KASSERT(kpreempt_disabled());
	KASSERT(l->l_stat == LSZOMB || l->l_stat == LSIDL);
	KASSERT(ci == curcpu());
	LOCKDEBUG_BARRIER(NULL, 0);

	kstack_check_magic(l);

	/* Count time spent in current system call */
	SYSCALL_TIME_SLEEP(l);
	binuptime(&bt);
	updatertime(l, &bt);

	/* Must stay at IPL_SCHED even after releasing run queue lock. */
	(void)splsched();

	/*
	 * Let sched_nextlwp() select the LWP to run the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * that the new thread will be unlocked after setting the
	 * LWP lock.
	 */
	spc_lock(ci);
#ifndef __HAVE_FAST_SOFTINTS
	if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	} else
#endif	/* !__HAVE_FAST_SOFTINTS */
	{
		newl = nextlwp(ci, &ci->ci_schedstate);
	}

	/* Update the new LWP's start time. */
	newl->l_stime = bt;
	l->l_pflag &= ~LP_RUNNING;

	/*
	 * ci_curlwp changes when a fast soft interrupt occurs.
	 * We use cpu_onproc to keep track of which kernel or
	 * user thread is running 'underneath' the software
	 * interrupt.  This is important for time accounting,
	 * itimers and forcing user threads to preempt (aston).
	 */
	ci->ci_data.cpu_onproc = newl;

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
	 */
	cpu_did_resched(l);

	/* Unlock the run queue. */
	spc_unlock(ci);

	/* Count the context switch on this CPU. */
	ci->ci_data.cpu_nswtch++;

	/* Update status for lwpctl, if present. */
	if (l->l_lwpctl != NULL)
		l->l_lwpctl->lc_curcpu = LWPCTL_CPU_EXITED;

	/*
	 * We may need to spin-wait if 'newl' is still
	 * context switching on another CPU.
	 */
	if (__predict_false(newl->l_ctxswtch != 0)) {
		u_int count;
		count = SPINLOCK_BACKOFF_MIN;
		while (newl->l_ctxswtch)
			SPINLOCK_BACKOFF(count);
	}

	/*
	 * If DTrace has set the active vtime enum to anything
	 * other than INACTIVE (0), then it should have set the
	 * function to call.
	 */
	if (__predict_false(dtrace_vtime_active)) {
		(*dtrace_vtime_switch_func)(newl);
	}

	/* Switch to the new LWP. */
	(void)cpu_switchto(NULL, newl, false);

	for (;;) continue;	/* XXX: convince gcc about "noreturn" */
	/* NOTREACHED */
}

/*
 * setrunnable: change LWP state to be runnable, placing it on the run queue.
 *
 * Call with the process and LWP locked.  Will return with the LWP unlocked.
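 *
 * Typical call pattern (a sketch; compare suspendsched() below, which
 * calls setrunnable() this way for interruptibly sleeping LWPs):
 *
 *	mutex_enter(p->p_lock);
 *	lwp_lock(l);
 *	setrunnable(l);			(releases the LWP lock)
 *	mutex_exit(p->p_lock);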
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	struct cpu_info *ci;

	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT(mutex_owned(p->p_lock));
	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0)
			signotify(l);
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		l->l_flag &= ~LW_WSUSPEND;
		p->p_nrlwps++;
		cv_broadcast(&p->p_lwpcv);
		break;
	case LSSLEEP:
		KASSERT(l->l_wchan != NULL);
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

#ifdef KERN_SA
	if (l->l_proc->p_sa)
		sa_awaken(l);
#endif /* KERN_SA */

	/*
	 * If the LWP was sleeping interruptibly, then it's OK to start it
	 * again.  If not, mark it as still sleeping.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l, true);
		return;
	}

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_pflag & LP_RUNNING) != 0) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Look for a CPU to run.
	 * Set the LWP runnable.
	 */
	ci = sched_takecpu(l);
	l->l_cpu = ci;
	spc_lock(ci);
	lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
	sched_setrunnable(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;

	sched_enqueue(l, false);
	resched_cpu(l);
	lwp_unlock(l);
}

/*
 * suspendsched:
 *
 *	Convert all non-LW_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct lwp *l;
	struct proc *p;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		mutex_enter(p->p_lock);
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			continue;
		}

		p->p_stat = SSTOP;

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l == curlwp)
				continue;

			lwp_lock(l);

			/*
			 * Set LW_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode.  We want to
			 * get as many LWPs as possible to the user/kernel
			 * boundary, so that they will release any locks
			 * that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */
				setrunnable(l);
				continue;
			}

			lwp_unlock(l);
		}

		mutex_exit(p->p_lock);
	}
	mutex_exit(proc_lock);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
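	 *
	 * (RESCHED_IMMED asks for the resched to be noticed right away,
	 * typically via an IPI to the remote CPU, rather than at the
	 * next convenient reschedule point.)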
	 */
	for (CPU_INFO_FOREACH(cii, ci)) {
		spc_lock(ci);
		cpu_need_resched(ci, RESCHED_IMMED);
		spc_unlock(ci);
	}
}

/*
 * sched_unsleep:
 *
 *	This is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, if the sleep timed out.  Because of this,
 *	it's not a valid action for running or idle LWPs.
 */
static void
sched_unsleep(struct lwp *l, bool cleanup)
{

	lwp_unlock(l);
	panic("sched_unsleep");
}

static void
resched_cpu(struct lwp *l)
{
	struct cpu_info *ci = l->l_cpu;

	KASSERT(lwp_locked(l, NULL));
	if (lwp_eprio(l) > ci->ci_schedstate.spc_curpriority)
		cpu_need_resched(ci, 0);
}

static void
sched_changepri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_priority = pri;
		sched_enqueue(l, false);
	} else {
		l->l_priority = pri;
	}
	resched_cpu(l);
}

static void
sched_lendpri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_inheritedprio = pri;
		sched_enqueue(l, false);
	} else {
		l->l_inheritedprio = pri;
	}
	resched_cpu(l);
}

struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}

/* Decay 95% of proc::p_pctcpu in 60 seconds, ccpu = exp(-1/20) */
const fixpt_t ccpu = 0.95122942450071400909 * FSCALE;

/*
 * Constants for averages over 1, 5 and 15 minutes when sampling at
 * 5 second intervals.
 */
static const fixpt_t cexp[ ] = {
	0.9200444146293232 * FSCALE,	/* exp(-1/12) */
	0.9834714538216174 * FSCALE,	/* exp(-1/60) */
	0.9944598480048967 * FSCALE,	/* exp(-1/180) */
};

/*
 * sched_pstats:
 *
 * => Update process statistics and check CPU resource allocation.
 * => Call scheduler-specific hook to eventually adjust LWP priorities.
 * => Compute load averages over 1, 5 and 15 minute intervals.
 */
void
sched_pstats(void)
{
	extern struct loadavg averunnable;
	struct loadavg *avg = &averunnable;
	const int clkhz = (stathz != 0 ? stathz : hz);
	static bool backwards = false;
	static u_int lavg_count = 0;
	struct proc *p;
	int nrun;

	sched_pstats_ticks++;
	if (++lavg_count >= 5) {
		lavg_count = 0;
		nrun = 0;
	}
	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		struct lwp *l;
		struct rlimit *rlim;
		long runtm;
		int sig;

		/* Increment sleep time (if sleeping), ignore overflow. */
		mutex_enter(p->p_lock);
		runtm = p->p_rtime.sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			fixpt_t lpctcpu;
			u_int lcpticks;

			if (__predict_false((l->l_flag & LW_IDLE) != 0))
				continue;
			lwp_lock(l);
			runtm += l->l_rtime.sec;
			l->l_swtime++;
			sched_lwp_stats(l);

			/*
			 * For load average calculation.
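			 *
			 * Roughly, an LWP is sampled into nrun if it is
			 * runnable, on a CPU, or in an uninterruptible
			 * sleep of at most one tick; LW_SINTR and
			 * LW_SYSTEM LWPs are excluded.  Every fifth call
			 * the sample is folded into the 1, 5 and 15
			 * minute averages below, in fixed point:
			 * avg = avg * cexp + nrun * (1 - cexp).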
			 */
			if (__predict_false(lavg_count == 0) &&
			    (l->l_flag & (LW_SINTR | LW_SYSTEM)) == 0) {
				switch (l->l_stat) {
				case LSSLEEP:
					if (l->l_slptime > 1) {
						break;
					}
					/* FALLTHROUGH */
				case LSRUN:
				case LSONPROC:
				case LSIDL:
					nrun++;
				}
			}
			lwp_unlock(l);

			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
			if (l->l_slptime != 0)
				continue;

			lpctcpu = l->l_pctcpu;
			lcpticks = atomic_swap_uint(&l->l_cpticks, 0);
			lpctcpu += ((FSCALE - ccpu) *
			    (lcpticks * FSCALE / clkhz)) >> FSHIFT;
			l->l_pctcpu = lpctcpu;
		}
		/* Calculating p_pctcpu only for ps(1) */
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over max, kill it.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		sig = 0;
		if (__predict_false(runtm >= rlim->rlim_cur)) {
			if (runtm >= rlim->rlim_max)
				sig = SIGKILL;
			else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}
		mutex_exit(p->p_lock);
		if (__predict_false(runtm < 0)) {
			if (!backwards) {
				backwards = true;
				printf("WARNING: negative runtime; "
				    "monotonic clock has gone backwards\n");
			}
		} else if (__predict_false(sig)) {
			KASSERT((p->p_flag & PK_SYSTEM) == 0);
			psignal(p, sig);
		}
	}
	mutex_exit(proc_lock);

	/* Load average calculation. */
	if (__predict_false(lavg_count == 0)) {
		int i;
		CTASSERT(__arraycount(cexp) == __arraycount(avg->ldavg));
		for (i = 0; i < __arraycount(cexp); i++) {
			avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
			    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
		}
	}

	/* Lightning bolt. */
	cv_broadcast(&lbolt);
}