/*	$NetBSD: kern_synch.c,v 1.289 2011/05/13 22:16:43 rmind Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009
 *    The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 * Daniel Sieger.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.289 2011/05/13 22:16:43 rmind Exp $");

#include "opt_kstack.h"
#include "opt_perfctrs.h"
#include "opt_sa.h"
#include "opt_dtrace.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/cpu.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sa.h>
#include <sys/savar.h>
#include <sys/syscall_stats.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/lwpctl.h>
#include <sys/atomic.h>
#include <sys/simplelock.h>

#include <uvm/uvm_extern.h>

#include <dev/lockstat.h>

#include <sys/dtrace_bsd.h>
int				dtrace_vtime_active = 0;
dtrace_vtime_switch_func_t	dtrace_vtime_switch_func;

static void	sched_unsleep(struct lwp *, bool);
static void	sched_changepri(struct lwp *, pri_t);
static void	sched_lendpri(struct lwp *, pri_t);
static void	resched_cpu(struct lwp *);

syncobj_t sleep_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sleepq_unsleep,
	sleepq_changepri,
	sleepq_lendpri,
	syncobj_noowner,
};

syncobj_t sched_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sched_unsleep,
	sched_changepri,
	sched_lendpri,
	syncobj_noowner,
};

/* "Lightning bolt": once a second sleep address. */
kcondvar_t		lbolt			__cacheline_aligned;

u_int			sched_pstats_ticks	__cacheline_aligned;

/* Preemption event counters. */
static struct evcnt	kpreempt_ev_crit	__cacheline_aligned;
static struct evcnt	kpreempt_ev_klock	__cacheline_aligned;
static struct evcnt	kpreempt_ev_immed	__cacheline_aligned;

/*
 * During autoconfiguration or after a panic, a sleep will simply lower the
 * priority briefly to allow interrupts, then return.  The priority to be
 * used (safepri) is machine-dependent, thus this value is initialized and
 * maintained in the machine-dependent layers.  This priority will typically
 * be 0, or the lowest priority that is safe for use on the interrupt stack;
 * it can be made higher to block network software interrupts after panics.
 */
int	safepri;

void
synch_init(void)
{

	cv_init(&lbolt, "lbolt");

	evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: critical section");
	evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: kernel_lock");
	evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "immediate");
}

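/*
 * Illustrative sketch only: lbolt is broadcast once per second from
 * sched_pstats() below, so a caller that simply wants to wait for the
 * next broadcast can do so with any kmutex of its own (the names
 * below are placeholders):
 *
 *	mutex_enter(&sc->sc_lock);
 *	cv_wait(&lbolt, &sc->sc_lock);
 *	mutex_exit(&sc->sc_lock);
 *
 * Note that ltsleep()/mtsleep() below explicitly refuse &lbolt as an
 * identifier (see their KASSERTs); the condition variable interface is
 * the way to use it.
 */
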
/*
 * OBSOLETE INTERFACE
 *
 * General sleep call.  Suspends the current LWP until a wakeup is
 * performed on the specified identifier.  The LWP will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
 * means no timeout).  If pri includes the PCATCH flag, signals are checked
 * before and after sleeping, otherwise signals are not checked.  Returns 0
 * if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 * signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the system
 * call should be interrupted by the signal.
 *
 * The interlock is held until we are on a sleep queue.  The interlock will
 * be locked before returning back to the caller unless the PNORELOCK flag
 * is specified, in which case the interlock will always be unlocked upon
 * return.
 */
int
ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		if ((priority & PNORELOCK) != 0)
			simple_unlock(interlock);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);

	if (interlock != NULL) {
		KASSERT(simple_lock_held(interlock));
		simple_unlock(interlock);
	}

	error = sleepq_block(timo, priority & PCATCH);

	if (interlock != NULL && (priority & PNORELOCK) == 0)
		simple_lock(interlock);

	return error;
}

int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
	mutex_exit(mtx);
	error = sleepq_block(timo, priority & PCATCH);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	kmutex_t *mp;
	sleepq_t *sq;
	int error;

	KASSERT(!(timo == 0 && intr == false));

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, l, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, l, wmesg, &sleep_syncobj);
	error = sleepq_block(timo, intr);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}

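/*
 * Illustrative sketch only: code written against the obsolete interface
 * above typically pairs mtsleep() with wakeup() on an agreed-upon
 * identifier, with a kmutex as the interlock.  The structure and field
 * names here are placeholders, not part of this file:
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (sc->sc_busy) {
 *		error = mtsleep(&sc->sc_busy, PCATCH, "busywait", hz,
 *		    &sc->sc_lock);
 *		if (error)
 *			break;
 *	}
 *	mutex_exit(&sc->sc_lock);
 *
 * and the waker, holding the same mutex, would do:
 *
 *	sc->sc_busy = 0;
 *	wakeup(&sc->sc_busy);
 *
 * For a plain delay where no wakeup is expected, kpause() suffices:
 *
 *	(void)kpause("pause", false, hz / 10, NULL);
 */
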
#ifdef KERN_SA
/*
 * sa_awaken:
 *
 *	We believe this lwp is an SA lwp. If it's yielding,
 *	let it know it needs to wake up.
 *
 *	We are called and exit with the lwp locked. We are
 *	called in the middle of wakeup operations, so we need
 *	to not touch the locks at all.
 */
void
sa_awaken(struct lwp *l)
{
	/* LOCK_ASSERT(lwp_locked(l, NULL)); */

	if (l == l->l_savp->savp_lwp && l->l_flag & LW_SA_YIELD)
		l->l_flag &= ~LW_SA_IDLE;
}
#endif /* KERN_SA */

/*
 * OBSOLETE INTERFACE
 *
 * Make all LWPs sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (__predict_false(cold))
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, (u_int)-1, mp);
}

/*
 * OBSOLETE INTERFACE
 *
 * Make the highest priority LWP first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (__predict_false(cold))
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, 1, mp);
}


/*
 * General yield call.  Puts the current LWP back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current LWP explicitly requests it (eg sched_yield(2)).
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * General preemption call.  Puts the current LWP back on its run queue
 * and performs an involuntary context switch.
 */
void
preempt(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	l->l_nivcsw++;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * Handle a request made by another agent to preempt the current LWP
 * in-kernel.  Usually called when l_dopreempt may be non-zero.
 *
 * Character addresses for lockstat only.
 */
static char	in_critical_section;
static char	kernel_lock_held;
static char	is_softint;
static char	cpu_kpreempt_enter_fail;

bool
kpreempt(uintptr_t where)
{
	uintptr_t failed;
	lwp_t *l;
	int s, dop, lsflag;

	l = curlwp;
	failed = 0;
	while ((dop = l->l_dopreempt) != 0) {
		if (l->l_stat != LSONPROC) {
			/*
			 * About to block (or die), let it happen.
			 * Doesn't really count as "preemption has
			 * been blocked", since we're going to
			 * context switch.
			 */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false((l->l_flag & LW_IDLE) != 0)) {
			/* Can't preempt idle loop, don't count as failure. */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false(l->l_nopreempt != 0)) {
			/* LWP holds preemption disabled, explicitly. */
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_crit.ev_count++;
			}
			failed = (uintptr_t)&in_critical_section;
			break;
		}
		if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
			/* Can't preempt soft interrupts yet. */
			l->l_dopreempt = 0;
			failed = (uintptr_t)&is_softint;
			break;
		}
		s = splsched();
		if (__predict_false(l->l_blcnt != 0 ||
		    curcpu()->ci_biglock_wanted != NULL)) {
			/* Hold or want kernel_lock, code is not MT safe. */
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_klock.ev_count++;
			}
			failed = (uintptr_t)&kernel_lock_held;
			break;
		}
		if (__predict_false(!cpu_kpreempt_enter(where, s))) {
			/*
			 * It may be that the IPL is too high.
			 * cpu_kpreempt_enter() can schedule an
			 * interrupt to retry later.
			 */
			splx(s);
			failed = (uintptr_t)&cpu_kpreempt_enter_fail;
			break;
		}
		/* Do it! */
		if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
			kpreempt_ev_immed.ev_count++;
		}
		lwp_lock(l);
		mi_switch(l);
		l->l_nopreempt++;
		splx(s);

		/* Take care of any MD cleanup. */
		cpu_kpreempt_exit(where);
		l->l_nopreempt--;
	}

	if (__predict_true(!failed)) {
		return false;
	}

	/* Record preemption failure for reporting via lockstat. */
	atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
	lsflag = 0;
	LOCKSTAT_ENTER(lsflag);
	if (__predict_false(lsflag)) {
		if (where == 0) {
			where = (uintptr_t)__builtin_return_address(0);
		}
		/* Preemption is on, might recurse, so make it atomic. */
		if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr, NULL,
		    (void *)where) == NULL) {
			LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
			l->l_pfaillock = failed;
		}
	}
	LOCKSTAT_EXIT(lsflag);
	return true;
}

/*
 * Return true if preemption is explicitly disabled.
 */
bool
kpreempt_disabled(void)
{
	const lwp_t *l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
}

/*
 * Disable kernel preemption.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

/*
 * Reenable kernel preemption.
 */
void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}

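/*
 * Illustrative sketch only: a caller that needs to stay on one CPU for a
 * short stretch (for example while working with curcpu()'s schedstate)
 * can bracket that stretch with the helpers above.  The local names are
 * placeholders:
 *
 *	struct cpu_info *ci;
 *
 *	kpreempt_disable();
 *	ci = curcpu();
 *	... use ci, which cannot change underneath us here ...
 *	kpreempt_enable();
 *
 * The disable is counted through l_nopreempt (see kpreempt() above), so
 * such sections may nest.
 */
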
/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */

void
updatertime(lwp_t *l, const struct bintime *now)
{

	if (__predict_false(l->l_flag & LW_IDLE))
		return;

	/* rtime += now - stime */
	bintime_add(&l->l_rtime, now);
	bintime_sub(&l->l_rtime, &l->l_stime);
}

/*
 * Select next LWP from the current CPU to run.
 */
static inline lwp_t *
nextlwp(struct cpu_info *ci, struct schedstate_percpu *spc)
{
	lwp_t *newl;

	/*
	 * Let sched_nextlwp() select the LWP to run the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * the new thread will be unlocked after setting the LWP-lock.
	 */
	newl = sched_nextlwp();
	if (newl != NULL) {
		sched_dequeue(newl);
		KASSERT(lwp_locked(newl, spc->spc_mutex));
		KASSERT(newl->l_cpu == ci);
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
		lwp_setlock(newl, spc->spc_lwplock);
	} else {
		newl = ci->ci_data.cpu_idlelwp;
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}

	/*
	 * Only clear want_resched if there are no pending (slow)
	 * software interrupts.
	 */
	ci->ci_want_resched = ci->ci_data.cpu_softints;
	spc->spc_flags &= ~SPCF_SWITCHCLEAR;
	spc->spc_curpriority = lwp_eprio(newl);

	return newl;
}

/*
 * The machine independent parts of context switch.
 *
 * Returns 1 if another LWP was actually run.
 */
int
mi_switch(lwp_t *l)
{
	struct cpu_info *ci;
	struct schedstate_percpu *spc;
	struct lwp *newl;
	int retval, oldspl;
	struct bintime bt;
	bool returning;

	KASSERT(lwp_locked(l, NULL));
	KASSERT(kpreempt_disabled());
	LOCKDEBUG_BARRIER(l->l_mutex, 1);

	kstack_check_magic(l);

	binuptime(&bt);

	KASSERT((l->l_pflag & LP_RUNNING) != 0);
	KASSERT(l->l_cpu == curcpu());
	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	returning = false;
	newl = NULL;

	/*
	 * If we have been asked to switch to a specific LWP, then there
	 * is no need to inspect the run queues.  If a soft interrupt is
	 * blocking, then return to the interrupted thread without adjusting
	 * VM context or its start time: neither have been changed in order
	 * to take the interrupt.
	 */
	if (l->l_switchto != NULL) {
		if ((l->l_pflag & LP_INTR) != 0) {
			returning = true;
			softint_block(l);
			if ((l->l_pflag & LP_TIMEINTR) != 0)
				updatertime(l, &bt);
		}
		newl = l->l_switchto;
		l->l_switchto = NULL;
	}
#ifndef __HAVE_FAST_SOFTINTS
	else if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}
#endif	/* !__HAVE_FAST_SOFTINTS */

	/* Count time spent in current system call */
	if (!returning) {
		SYSCALL_TIME_SLEEP(l);

		/*
		 * XXXSMP If we are using h/w performance counters,
		 * save context.
		 */
#if PERFCTRS
		if (PMC_ENABLED(l->l_proc)) {
			pmc_save_context(l->l_proc);
		}
#endif
		updatertime(l, &bt);
	}

	/* Lock the runqueue */
	KASSERT(l->l_stat != LSRUN);
	mutex_spin_enter(spc->spc_mutex);

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	if (l->l_stat == LSONPROC && l != newl) {
		KASSERT(lwp_locked(l, spc->spc_lwplock));
		if ((l->l_flag & LW_IDLE) == 0) {
			l->l_stat = LSRUN;
			lwp_setlock(l, spc->spc_mutex);
			sched_enqueue(l, true);
			/*
			 * Handle migration.  Note that "migrating LWP" may
			 * be reset here, if interrupt/preemption happens
			 * early in idle LWP.
			 */
			if (l->l_target_cpu != NULL) {
				KASSERT((l->l_pflag & LP_INTR) == 0);
				spc->spc_migrating = l;
			}
		} else
			l->l_stat = LSIDL;
	}

	/* Pick new LWP to run. */
	if (newl == NULL) {
		newl = nextlwp(ci, spc);
	}

	/* Items that must be updated with the CPU locked. */
	if (!returning) {
		/* Update the new LWP's start time. */
		newl->l_stime = bt;

		/*
		 * ci_curlwp changes when a fast soft interrupt occurs.
		 * We use cpu_onproc to keep track of which kernel or
		 * user thread is running 'underneath' the software
		 * interrupt.  This is important for time accounting,
		 * itimers and forcing user threads to preempt (aston).
		 */
		ci->ci_data.cpu_onproc = newl;
	}

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
	 */
	cpu_did_resched(l);
	l->l_dopreempt = 0;
	if (__predict_false(l->l_pfailaddr != 0)) {
		LOCKSTAT_FLAG(lsflag);
		LOCKSTAT_ENTER(lsflag);
		LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
		LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
		    1, l->l_pfailtime, l->l_pfailaddr);
		LOCKSTAT_EXIT(lsflag);
		l->l_pfailtime = 0;
		l->l_pfaillock = 0;
		l->l_pfailaddr = 0;
	}

	if (l != newl) {
		struct lwp *prevlwp;

		/* Release all locks, but leave the current LWP locked */
		if (l->l_mutex == spc->spc_mutex) {
			/*
			 * Drop spc_lwplock, if the current LWP has been moved
			 * to the run queue (it is now locked by spc_mutex).
			 */
			mutex_spin_exit(spc->spc_lwplock);
		} else {
			/*
			 * Otherwise, drop the spc_mutex, we are done with the
			 * run queues.
			 */
			mutex_spin_exit(spc->spc_mutex);
		}

		/*
		 * Mark that context switch is going to be performed
		 * for this LWP, to protect it from being switched
		 * to on another CPU.
		 */
		KASSERT(l->l_ctxswtch == 0);
		l->l_ctxswtch = 1;
		l->l_ncsw++;
		KASSERT((l->l_pflag & LP_RUNNING) != 0);
		l->l_pflag &= ~LP_RUNNING;

		/*
		 * Increase the count of spin-mutexes before the release
		 * of the last lock - we must remain at IPL_SCHED during
		 * the context switch.
		 */
		KASSERTMSG(ci->ci_mtx_count == -1,
		    ("%s: cpu%u: ci_mtx_count (%d) != -1",
		     __func__, cpu_index(ci), ci->ci_mtx_count));
		oldspl = MUTEX_SPIN_OLDSPL(ci);
		ci->ci_mtx_count--;
		lwp_unlock(l);

		/* Count the context switch on this CPU. */
		ci->ci_data.cpu_nswtch++;

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL)
			l->l_lwpctl->lc_curcpu = LWPCTL_CPU_NONE;

		/*
		 * Save old VM context, unless a soft interrupt
		 * handler is blocking.
		 */
		if (!returning)
			pmap_deactivate(l);

		/*
		 * We may need to spin-wait if 'newl' is still
		 * context switching on another CPU.
		 */
		if (__predict_false(newl->l_ctxswtch != 0)) {
			u_int count;
			count = SPINLOCK_BACKOFF_MIN;
			while (newl->l_ctxswtch)
				SPINLOCK_BACKOFF(count);
		}

		/*
		 * If DTrace has set the active vtime enum to anything
		 * other than INACTIVE (0), then it should have set the
		 * function to call.
		 */
		if (__predict_false(dtrace_vtime_active)) {
			(*dtrace_vtime_switch_func)(newl);
		}

		/* Switch to the new LWP. */
		prevlwp = cpu_switchto(l, newl, returning);
		ci = curcpu();

		/*
		 * Switched away - we have new curlwp.
		 * Restore VM context and IPL.
		 */
		pmap_activate(l);
		uvm_emap_switch(l);
		pcu_switchpoint(l);

		if (prevlwp != NULL) {
			/* Normalize the count of the spin-mutexes */
			ci->ci_mtx_count++;
			/* Unmark the state of context switch */
			membar_exit();
			prevlwp->l_ctxswtch = 0;
		}

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL) {
			l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
			l->l_lwpctl->lc_pctr++;
		}

		KASSERT(l->l_cpu == ci);
		splx(oldspl);
		retval = 1;
	} else {
		/* Nothing to do - just unlock and return. */
		mutex_spin_exit(spc->spc_mutex);
		lwp_unlock(l);
		retval = 0;
	}

	KASSERT(l == curlwp);
	KASSERT(l->l_stat == LSONPROC);

	/*
	 * XXXSMP If we are using h/w performance counters, restore context.
	 * XXXSMP preemption problem.
	 */
#if PERFCTRS
	if (PMC_ENABLED(l->l_proc)) {
		pmc_restore_context(l->l_proc);
	}
#endif
	SYSCALL_TIME_WAKEUP(l);
	LOCKDEBUG_BARRIER(NULL, 1);

	return retval;
}

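/*
 * Illustrative sketch only: the minimal calling sequence for mi_switch()
 * mirrors what yield() and preempt() above already do - drop the big
 * locks, lock the LWP, switch, and reacquire the big locks on return:
 *
 *	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
 *	lwp_lock(l);
 *	... adjust l_kpriority / counters as needed ...
 *	(void)mi_switch(l);
 *	KERNEL_LOCK(l->l_biglocks, l);
 *
 * mi_switch() must be entered with the LWP locked and kernel preemption
 * disabled (see the KASSERTs above); it releases the LWP lock as part of
 * the switch and returns with the LWP unlocked.
 */
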
/*
 * The machine independent parts of context switch to oblivion.
 * Does not return.  Call with the LWP unlocked.
 */
void
lwp_exit_switchaway(lwp_t *l)
{
	struct cpu_info *ci;
	struct lwp *newl;
	struct bintime bt;

	ci = l->l_cpu;

	KASSERT(kpreempt_disabled());
	KASSERT(l->l_stat == LSZOMB || l->l_stat == LSIDL);
	KASSERT(ci == curcpu());
	LOCKDEBUG_BARRIER(NULL, 0);

	kstack_check_magic(l);

	/* Count time spent in current system call */
	SYSCALL_TIME_SLEEP(l);
	binuptime(&bt);
	updatertime(l, &bt);

	/* Must stay at IPL_SCHED even after releasing run queue lock. */
	(void)splsched();

	/*
	 * Let sched_nextlwp() select the LWP to run the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * the new thread will be unlocked after setting the LWP-lock.
	 */
	spc_lock(ci);
#ifndef __HAVE_FAST_SOFTINTS
	if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	} else
#endif	/* !__HAVE_FAST_SOFTINTS */
	{
		newl = nextlwp(ci, &ci->ci_schedstate);
	}

	/* Update the new LWP's start time. */
	newl->l_stime = bt;
	l->l_pflag &= ~LP_RUNNING;

	/*
	 * ci_curlwp changes when a fast soft interrupt occurs.
	 * We use cpu_onproc to keep track of which kernel or
	 * user thread is running 'underneath' the software
	 * interrupt.  This is important for time accounting,
	 * itimers and forcing user threads to preempt (aston).
	 */
	ci->ci_data.cpu_onproc = newl;

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
	 */
	cpu_did_resched(l);

	/* Unlock the run queue. */
	spc_unlock(ci);

	/* Count the context switch on this CPU. */
	ci->ci_data.cpu_nswtch++;

	/* Update status for lwpctl, if present. */
	if (l->l_lwpctl != NULL)
		l->l_lwpctl->lc_curcpu = LWPCTL_CPU_EXITED;

	/*
	 * We may need to spin-wait if 'newl' is still
	 * context switching on another CPU.
	 */
	if (__predict_false(newl->l_ctxswtch != 0)) {
		u_int count;
		count = SPINLOCK_BACKOFF_MIN;
		while (newl->l_ctxswtch)
			SPINLOCK_BACKOFF(count);
	}

	/*
	 * If DTrace has set the active vtime enum to anything
	 * other than INACTIVE (0), then it should have set the
	 * function to call.
	 */
	if (__predict_false(dtrace_vtime_active)) {
		(*dtrace_vtime_switch_func)(newl);
	}

	/* Switch to the new LWP. */
	(void)cpu_switchto(NULL, newl, false);

	for (;;) continue;	/* XXX: convince gcc about "noreturn" */
	/* NOTREACHED */
}

/*
 * setrunnable: change LWP state to be runnable, placing it on the run queue.
 *
 * Call with the process and LWP locked.  Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	struct cpu_info *ci;

	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT(mutex_owned(p->p_lock));
	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0)
			signotify(l);
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		l->l_flag &= ~LW_WSUSPEND;
		p->p_nrlwps++;
		cv_broadcast(&p->p_lwpcv);
		break;
	case LSSLEEP:
		KASSERT(l->l_wchan != NULL);
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

#ifdef KERN_SA
	if (l->l_proc->p_sa)
		sa_awaken(l);
#endif /* KERN_SA */

	/*
	 * If the LWP was sleeping, start it again.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l, true);
		return;
	}

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_pflag & LP_RUNNING) != 0) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Look for a CPU to run.
	 * Set the LWP runnable.
	 */
	ci = sched_takecpu(l);
	l->l_cpu = ci;
	spc_lock(ci);
	lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
	sched_setrunnable(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;

	sched_enqueue(l, false);
	resched_cpu(l);
	lwp_unlock(l);
}

/*
 * suspendsched:
 *
 *	Convert all non-LW_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct lwp *l;
	struct proc *p;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		mutex_enter(p->p_lock);
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			continue;
		}

		p->p_stat = SSTOP;

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l == curlwp)
				continue;

			lwp_lock(l);

			/*
			 * Set LW_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode.  We want to
			 * get as many LWPs as possible to the user / kernel
			 * boundary, so that they will release any locks
			 * that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */
				setrunnable(l);
				continue;
			}

			lwp_unlock(l);
		}

		mutex_exit(p->p_lock);
	}
	mutex_exit(proc_lock);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
	 */
	for (CPU_INFO_FOREACH(cii, ci)) {
		spc_lock(ci);
		cpu_need_resched(ci, RESCHED_IMMED);
		spc_unlock(ci);
	}
}

/*
 * sched_unsleep:
 *
 *	This is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, if the sleep timed out.  Because of this,
 *	it's not a valid action for running or idle LWPs.
 */
static void
sched_unsleep(struct lwp *l, bool cleanup)
{

	lwp_unlock(l);
	panic("sched_unsleep");
}

static void
resched_cpu(struct lwp *l)
{
	struct cpu_info *ci = l->l_cpu;

	KASSERT(lwp_locked(l, NULL));
	if (lwp_eprio(l) > ci->ci_schedstate.spc_curpriority)
		cpu_need_resched(ci, 0);
}

static void
sched_changepri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_priority = pri;
		sched_enqueue(l, false);
	} else {
		l->l_priority = pri;
	}
	resched_cpu(l);
}

static void
sched_lendpri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_inheritedprio = pri;
		sched_enqueue(l, false);
	} else {
		l->l_inheritedprio = pri;
	}
	resched_cpu(l);
}

struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}

/* Decay 95% of proc::p_pctcpu in 60 seconds, ccpu = exp(-1/20) */
const fixpt_t ccpu = 0.95122942450071400909 * FSCALE;

/*
 * Constants for averages over 1, 5 and 15 minutes when sampling at
 * 5 second intervals.
 */
static const fixpt_t cexp[ ] = {
	0.9200444146293232 * FSCALE,	/* exp(-1/12) */
	0.9834714538216174 * FSCALE,	/* exp(-1/60) */
	0.9944598480048967 * FSCALE,	/* exp(-1/180) */
};

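/*
 * A short derivation of the constants above (informational only):
 * sched_pstats() runs once per second and multiplies p_pctcpu/l_pctcpu
 * by ccpu each time, so after 60 seconds the remaining weight is
 * ccpu^60 = exp(-60/20) = exp(-3) ~= 0.05, i.e. about 95% decayed.
 * The load average is sampled every 5 seconds (lavg_count wraps at 5);
 * an exponential average with time constant T seconds therefore uses a
 * per-sample factor of exp(-5/T): exp(-5/60) = exp(-1/12),
 * exp(-5/300) = exp(-1/60) and exp(-5/900) = exp(-1/180) for the 1, 5
 * and 15 minute averages, applied below as
 * ldavg = (cexp * ldavg + nrun * FSCALE * (FSCALE - cexp)) >> FSHIFT.
 */
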
/*
 * sched_pstats:
 *
 * => Update process statistics and check CPU resource allocation.
 * => Call scheduler-specific hook to eventually adjust LWP priorities.
 * => Compute load average of a quantity on 1, 5 and 15 minute intervals.
 */
void
sched_pstats(void)
{
	extern struct loadavg averunnable;
	struct loadavg *avg = &averunnable;
	const int clkhz = (stathz != 0 ? stathz : hz);
	static bool backwards = false;
	static u_int lavg_count = 0;
	struct proc *p;
	int nrun;

	sched_pstats_ticks++;
	if (++lavg_count >= 5) {
		lavg_count = 0;
		nrun = 0;
	}
	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		struct lwp *l;
		struct rlimit *rlim;
		long runtm;
		int sig;

		/* Increment sleep time (if sleeping), ignore overflow. */
		mutex_enter(p->p_lock);
		runtm = p->p_rtime.sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			fixpt_t lpctcpu;
			u_int lcpticks;

			if (__predict_false((l->l_flag & LW_IDLE) != 0))
				continue;
			lwp_lock(l);
			runtm += l->l_rtime.sec;
			l->l_swtime++;
			sched_lwp_stats(l);

			/* For load average calculation. */
			if (__predict_false(lavg_count == 0) &&
			    (l->l_flag & (LW_SINTR | LW_SYSTEM)) == 0) {
				switch (l->l_stat) {
				case LSSLEEP:
					if (l->l_slptime > 1) {
						break;
					}
					/* FALLTHROUGH */
				case LSRUN:
				case LSONPROC:
				case LSIDL:
					nrun++;
				}
			}
			lwp_unlock(l);

			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
			if (l->l_slptime != 0)
				continue;

			lpctcpu = l->l_pctcpu;
			lcpticks = atomic_swap_uint(&l->l_cpticks, 0);
			lpctcpu += ((FSCALE - ccpu) *
			    (lcpticks * FSCALE / clkhz)) >> FSHIFT;
			l->l_pctcpu = lpctcpu;
		}
		/* Calculating p_pctcpu only for ps(1) */
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over the maximum, kill it.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		sig = 0;
		if (__predict_false(runtm >= rlim->rlim_cur)) {
			if (runtm >= rlim->rlim_max)
				sig = SIGKILL;
			else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}
		mutex_exit(p->p_lock);
		if (__predict_false(runtm < 0)) {
			if (!backwards) {
				backwards = true;
				printf("WARNING: negative runtime; "
				    "monotonic clock has gone backwards\n");
			}
		} else if (__predict_false(sig)) {
			KASSERT((p->p_flag & PK_SYSTEM) == 0);
			psignal(p, sig);
		}
	}
	mutex_exit(proc_lock);

	/* Load average calculation. */
	if (__predict_false(lavg_count == 0)) {
		int i;
		CTASSERT(__arraycount(cexp) == __arraycount(avg->ldavg));
		for (i = 0; i < __arraycount(cexp); i++) {
			avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
			    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
		}
	}

	/* Lightning bolt. */
	cv_broadcast(&lbolt);
}