/*	$NetBSD: kern_synch.c,v 1.295 2011/10/05 20:37:40 njoly Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009
 *    The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 * Daniel Sieger.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.295 2011/10/05 20:37:40 njoly Exp $");

#include "opt_kstack.h"
#include "opt_perfctrs.h"
#include "opt_sa.h"
#include "opt_dtrace.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/cpu.h>
#include <sys/pserialize.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sa.h>
#include <sys/savar.h>
#include <sys/syscall_stats.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/lwpctl.h>
#include <sys/atomic.h>
#include <sys/simplelock.h>
#include <sys/syslog.h>

#include <uvm/uvm_extern.h>

#include <dev/lockstat.h>

#include <sys/dtrace_bsd.h>
int				dtrace_vtime_active = 0;
dtrace_vtime_switch_func_t	dtrace_vtime_switch_func;

static void	sched_unsleep(struct lwp *, bool);
static void	sched_changepri(struct lwp *, pri_t);
static void	sched_lendpri(struct lwp *, pri_t);
static void	resched_cpu(struct lwp *);

syncobj_t sleep_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sleepq_unsleep,
	sleepq_changepri,
	sleepq_lendpri,
	syncobj_noowner,
};

syncobj_t sched_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sched_unsleep,
	sched_changepri,
	sched_lendpri,
	syncobj_noowner,
};

/* "Lightning bolt": once a second sleep address. */
kcondvar_t	lbolt			__cacheline_aligned;

u_int		sched_pstats_ticks	__cacheline_aligned;

/* Preemption event counters. */
static struct evcnt kpreempt_ev_crit	__cacheline_aligned;
static struct evcnt kpreempt_ev_klock	__cacheline_aligned;
static struct evcnt kpreempt_ev_immed	__cacheline_aligned;

/*
 * During autoconfiguration or after a panic, a sleep will simply lower the
 * priority briefly to allow interrupts, then return.  The priority to be
 * used (safepri) is machine-dependent, thus this value is initialized and
 * maintained in the machine-dependent layers.  This priority will typically
 * be 0, or the lowest priority that is safe for use on the interrupt stack;
 * it can be made higher to block network software interrupts after panics.
 */
int	safepri;

void
synch_init(void)
{

	cv_init(&lbolt, "lbolt");

	evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: critical section");
	evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: kernel_lock");
	evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "immediate");
}

/*
 * OBSOLETE INTERFACE
 *
 * General sleep call.  Suspends the current LWP until a wakeup is
 * performed on the specified identifier.  The LWP will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
 * means no timeout).  If pri includes PCATCH flag, signals are checked
 * before and after sleeping, else signals are not checked.  Returns 0 if
 * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 * signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the system
 * call should be interrupted by the signal (return EINTR).
 *
 * The interlock is held until we are on a sleep queue.  The interlock will
 * be locked before returning back to the caller unless the PNORELOCK flag
 * is specified, in which case the interlock will always be unlocked upon
 * return.
 */
int
ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		if ((priority & PNORELOCK) != 0)
			simple_unlock(interlock);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);

	if (interlock != NULL) {
		KASSERT(simple_lock_held(interlock));
		simple_unlock(interlock);
	}

	error = sleepq_block(timo, priority & PCATCH);

	if (interlock != NULL && (priority & PNORELOCK) == 0)
		simple_lock(interlock);

	return error;
}

int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
	mutex_exit(mtx);
	error = sleepq_block(timo, priority & PCATCH);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	kmutex_t *mp;
	sleepq_t *sq;
	int error;

	KASSERT(!(timo == 0 && intr == false));

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, l, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, l, wmesg, &sleep_syncobj);
	error = sleepq_block(timo, intr);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}
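
/*
 * Illustrative sketch (not part of the original file, excluded from the
 * build): a typical consumer of the sleep interfaces above waits on an
 * identifier under its own mutex and is woken by another thread with
 * wakeup().  The names example_lock, example_ready, example_wait(),
 * example_poke() and example_delay() are hypothetical.
 */
#if 0
static kmutex_t example_lock;
static bool example_ready;

static int
example_wait(void)
{
	int error = 0;

	mutex_enter(&example_lock);
	/*
	 * Sleep interruptibly (PCATCH) with no timeout; the mutex is
	 * dropped while asleep and re-acquired on return because
	 * PNORELOCK is not set.
	 */
	while (!example_ready && error == 0)
		error = mtsleep(&example_ready, PCATCH, "exwait", 0,
		    &example_lock);
	mutex_exit(&example_lock);

	return error;
}

static void
example_poke(void)
{

	mutex_enter(&example_lock);
	example_ready = true;
	/* Make all LWPs sleeping on &example_ready runnable. */
	wakeup(&example_ready);
	mutex_exit(&example_lock);
}

static void
example_delay(void)
{

	/* Sleep for roughly 100ms; no wake-up is expected. */
	(void)kpause("exdelay", false, hz / 10, NULL);
}
#endif	/* 0 */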

#ifdef KERN_SA
/*
 * sa_awaken:
 *
 *	We believe this lwp is an SA lwp.  If it's yielding,
 *	let it know it needs to wake up.
 *
 *	We are called and exit with the lwp locked.  We are
 *	called in the middle of wakeup operations, so we must
 *	not touch the locks at all.
 */
void
sa_awaken(struct lwp *l)
{
	/* LOCK_ASSERT(lwp_locked(l, NULL)); */

	if (l == l->l_savp->savp_lwp && l->l_flag & LW_SA_YIELD)
		l->l_flag &= ~LW_SA_IDLE;
}
#endif /* KERN_SA */

/*
 * OBSOLETE INTERFACE
 *
 * Make all LWPs sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (__predict_false(cold))
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, (u_int)-1, mp);
}

/*
 * OBSOLETE INTERFACE
 *
 * Make the highest priority LWP first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (__predict_false(cold))
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, 1, mp);
}

/*
 * General yield call.  Puts the current LWP back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current LWP explicitly requests it (e.g. sched_yield(2)).
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * General preemption call.  Puts the current LWP back on its run queue
 * and performs an involuntary context switch.
 */
void
preempt(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	l->l_nivcsw++;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * Handle a request made by another agent to preempt the current LWP
 * in-kernel.  Usually called when l_dopreempt may be non-zero.
 *
 * Character addresses for lockstat only.
 */
static char	in_critical_section;
static char	kernel_lock_held;
static char	is_softint;
static char	cpu_kpreempt_enter_fail;

bool
kpreempt(uintptr_t where)
{
	uintptr_t failed;
	lwp_t *l;
	int s, dop, lsflag;

	l = curlwp;
	failed = 0;
	while ((dop = l->l_dopreempt) != 0) {
		if (l->l_stat != LSONPROC) {
			/*
			 * About to block (or die), let it happen.
			 * Doesn't really count as "preemption has
			 * been blocked", since we're going to
			 * context switch.
			 */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false((l->l_flag & LW_IDLE) != 0)) {
			/* Can't preempt idle loop, don't count as failure. */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false(l->l_nopreempt != 0)) {
			/* LWP holds preemption disabled, explicitly. */
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_crit.ev_count++;
			}
			failed = (uintptr_t)&in_critical_section;
			break;
		}
		if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
			/* Can't preempt soft interrupts yet. */
			l->l_dopreempt = 0;
			failed = (uintptr_t)&is_softint;
			break;
		}
		s = splsched();
		if (__predict_false(l->l_blcnt != 0 ||
		    curcpu()->ci_biglock_wanted != NULL)) {
			/* Hold or want kernel_lock, code is not MT safe. */
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_klock.ev_count++;
			}
			failed = (uintptr_t)&kernel_lock_held;
			break;
		}
		if (__predict_false(!cpu_kpreempt_enter(where, s))) {
			/*
			 * It may be that the IPL is too high.
			 * cpu_kpreempt_enter() can schedule an
			 * interrupt to retry later.
			 */
			splx(s);
			failed = (uintptr_t)&cpu_kpreempt_enter_fail;
			break;
		}
		/* Do it! */
		if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
			kpreempt_ev_immed.ev_count++;
		}
		lwp_lock(l);
		mi_switch(l);
		l->l_nopreempt++;
		splx(s);

		/* Take care of any MD cleanup. */
		cpu_kpreempt_exit(where);
		l->l_nopreempt--;
	}

	if (__predict_true(!failed)) {
		return false;
	}

	/* Record preemption failure for reporting via lockstat. */
	atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
	lsflag = 0;
	LOCKSTAT_ENTER(lsflag);
	if (__predict_false(lsflag)) {
		if (where == 0) {
			where = (uintptr_t)__builtin_return_address(0);
		}
		/* Preemption is on, might recurse, so make it atomic. */
		if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr, NULL,
		    (void *)where) == NULL) {
			LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
			l->l_pfaillock = failed;
		}
	}
	LOCKSTAT_EXIT(lsflag);
	return true;
}

/*
 * Return true if preemption is explicitly disabled.
 */
bool
kpreempt_disabled(void)
{
	const lwp_t *l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
}

/*
 * Disable kernel preemption.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

/*
 * Reenable kernel preemption.
 */
void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}
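
/*
 * Illustrative sketch (not part of the original file, excluded from the
 * build): a caller that briefly needs to keep the LWP on its current CPU,
 * for example while it inspects per-CPU state, can bracket that work with
 * the helpers above.  example_curcpu_poke() is a hypothetical function.
 */
#if 0
static void
example_curcpu_poke(void)
{
	struct cpu_info *ci;

	kpreempt_disable();
	KASSERT(kpreempt_disabled());
	/*
	 * With kernel preemption disabled the LWP cannot migrate, so
	 * curcpu() remains stable until the matching enable below.
	 */
	ci = curcpu();
	/* ... read or update per-CPU state via 'ci' here ... */
	(void)ci;
	kpreempt_enable();
}
#endif	/* 0 */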

/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */

void
updatertime(lwp_t *l, const struct bintime *now)
{

	if (__predict_false(l->l_flag & LW_IDLE))
		return;

	/* rtime += now - stime */
	bintime_add(&l->l_rtime, now);
	bintime_sub(&l->l_rtime, &l->l_stime);
}

/*
 * Select next LWP from the current CPU to run.
 */
static inline lwp_t *
nextlwp(struct cpu_info *ci, struct schedstate_percpu *spc)
{
	lwp_t *newl;

	/*
	 * Let sched_nextlwp() select the LWP to run the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * the new thread would be unlocked after setting the LWP-lock.
	 */
	newl = sched_nextlwp();
	if (newl != NULL) {
		sched_dequeue(newl);
		KASSERT(lwp_locked(newl, spc->spc_mutex));
		KASSERT(newl->l_cpu == ci);
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
		lwp_setlock(newl, spc->spc_lwplock);
	} else {
		newl = ci->ci_data.cpu_idlelwp;
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}

	/*
	 * Only clear want_resched if there are no pending (slow)
	 * software interrupts.
	 */
	ci->ci_want_resched = ci->ci_data.cpu_softints;
	spc->spc_flags &= ~SPCF_SWITCHCLEAR;
	spc->spc_curpriority = lwp_eprio(newl);

	return newl;
}

/*
 * The machine independent parts of context switch.
 *
 * Returns 1 if another LWP was actually run.
 */
int
mi_switch(lwp_t *l)
{
	struct cpu_info *ci;
	struct schedstate_percpu *spc;
	struct lwp *newl;
	int retval, oldspl;
	struct bintime bt;
	bool returning;

	KASSERT(lwp_locked(l, NULL));
	KASSERT(kpreempt_disabled());
	LOCKDEBUG_BARRIER(l->l_mutex, 1);

	kstack_check_magic(l);

	binuptime(&bt);

	KASSERT((l->l_pflag & LP_RUNNING) != 0);
	KASSERT(l->l_cpu == curcpu());
	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	returning = false;
	newl = NULL;

	/*
	 * If we have been asked to switch to a specific LWP, then there
	 * is no need to inspect the run queues.  If a soft interrupt is
	 * blocking, then return to the interrupted thread without adjusting
	 * VM context or its start time: neither has been changed in order
	 * to take the interrupt.
	 */
	if (l->l_switchto != NULL) {
		if ((l->l_pflag & LP_INTR) != 0) {
			returning = true;
			softint_block(l);
			if ((l->l_pflag & LP_TIMEINTR) != 0)
				updatertime(l, &bt);
		}
		newl = l->l_switchto;
		l->l_switchto = NULL;
	}
#ifndef __HAVE_FAST_SOFTINTS
	else if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}
#endif	/* !__HAVE_FAST_SOFTINTS */

	/* Count time spent in current system call */
	if (!returning) {
		SYSCALL_TIME_SLEEP(l);

		/*
		 * XXXSMP If we are using h/w performance counters,
		 * save context.
		 */
#if PERFCTRS
		if (PMC_ENABLED(l->l_proc)) {
			pmc_save_context(l->l_proc);
		}
#endif
		updatertime(l, &bt);
	}

	/* Lock the runqueue */
	KASSERT(l->l_stat != LSRUN);
	mutex_spin_enter(spc->spc_mutex);

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	if (l->l_stat == LSONPROC && l != newl) {
		KASSERT(lwp_locked(l, spc->spc_lwplock));
		if ((l->l_flag & LW_IDLE) == 0) {
			l->l_stat = LSRUN;
			lwp_setlock(l, spc->spc_mutex);
			sched_enqueue(l, true);
			/*
			 * Handle migration.  Note that "migrating LWP" may
			 * be reset here, if interrupt/preemption happens
			 * early in idle LWP.
			 */
			if (l->l_target_cpu != NULL) {
				KASSERT((l->l_pflag & LP_INTR) == 0);
				spc->spc_migrating = l;
			}
		} else
			l->l_stat = LSIDL;
	}

	/* Pick new LWP to run. */
	if (newl == NULL) {
		newl = nextlwp(ci, spc);
	}

	/* Items that must be updated with the CPU locked. */
	if (!returning) {
		/* Update the new LWP's start time. */
		newl->l_stime = bt;

		/*
		 * ci_curlwp changes when a fast soft interrupt occurs.
		 * We use cpu_onproc to keep track of which kernel or
		 * user thread is running 'underneath' the software
		 * interrupt.  This is important for time accounting,
		 * itimers and forcing user threads to preempt (aston).
		 */
		ci->ci_data.cpu_onproc = newl;
	}

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
	 */
	cpu_did_resched(l);
	l->l_dopreempt = 0;
	if (__predict_false(l->l_pfailaddr != 0)) {
		LOCKSTAT_FLAG(lsflag);
		LOCKSTAT_ENTER(lsflag);
		LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
		LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
		    1, l->l_pfailtime, l->l_pfailaddr);
		LOCKSTAT_EXIT(lsflag);
		l->l_pfailtime = 0;
		l->l_pfaillock = 0;
		l->l_pfailaddr = 0;
	}

	if (l != newl) {
		struct lwp *prevlwp;

		/* Release all locks, but leave the current LWP locked */
		if (l->l_mutex == spc->spc_mutex) {
			/*
			 * Drop spc_lwplock, if the current LWP has been moved
			 * to the run queue (it is now locked by spc_mutex).
			 */
			mutex_spin_exit(spc->spc_lwplock);
		} else {
			/*
			 * Otherwise, drop the spc_mutex, we are done with the
			 * run queues.
			 */
			mutex_spin_exit(spc->spc_mutex);
		}

		/*
		 * Mark that context switch is going to be performed
		 * for this LWP, to protect it from being switched
		 * to on another CPU.
		 */
		KASSERT(l->l_ctxswtch == 0);
		l->l_ctxswtch = 1;
		l->l_ncsw++;
		KASSERT((l->l_pflag & LP_RUNNING) != 0);
		l->l_pflag &= ~LP_RUNNING;

		/*
		 * Increase the count of spin-mutexes before the release
		 * of the last lock - we must remain at IPL_SCHED during
		 * the context switch.
		 */
		KASSERTMSG(ci->ci_mtx_count == -1,
		    "%s: cpu%u: ci_mtx_count (%d) != -1",
		    __func__, cpu_index(ci), ci->ci_mtx_count);
		oldspl = MUTEX_SPIN_OLDSPL(ci);
		ci->ci_mtx_count--;
		lwp_unlock(l);

		/* Count the context switch on this CPU. */
		ci->ci_data.cpu_nswtch++;

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL)
			l->l_lwpctl->lc_curcpu = LWPCTL_CPU_NONE;

		/*
		 * Save old VM context, unless a soft interrupt
		 * handler is blocking.
		 */
		if (!returning)
			pmap_deactivate(l);

		/*
		 * We may need to spin-wait if 'newl' is still
		 * context switching on another CPU.
		 */
		if (__predict_false(newl->l_ctxswtch != 0)) {
			u_int count;
			count = SPINLOCK_BACKOFF_MIN;
			while (newl->l_ctxswtch)
				SPINLOCK_BACKOFF(count);
		}

		/*
		 * If DTrace has set the active vtime enum to anything
		 * other than INACTIVE (0), then it should have set the
		 * function to call.
		 */
		if (__predict_false(dtrace_vtime_active)) {
			(*dtrace_vtime_switch_func)(newl);
		}

		/* Switch to the new LWP. */
		prevlwp = cpu_switchto(l, newl, returning);
		ci = curcpu();

		/*
		 * Switched away - we have new curlwp.
		 * Restore VM context and IPL.
		 */
		pmap_activate(l);
		uvm_emap_switch(l);
		pcu_switchpoint(l);

		if (prevlwp != NULL) {
			/* Normalize the count of the spin-mutexes */
			ci->ci_mtx_count++;
			/* Unmark the state of context switch */
			membar_exit();
			prevlwp->l_ctxswtch = 0;
		}

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL) {
			l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
			l->l_lwpctl->lc_pctr++;
		}

		/* Note trip through cpu_switchto(). */
		pserialize_switchpoint();

		KASSERT(l->l_cpu == ci);
		splx(oldspl);
		retval = 1;
	} else {
		/* Nothing to do - just unlock and return. */
		mutex_spin_exit(spc->spc_mutex);
		lwp_unlock(l);
		retval = 0;
	}

	KASSERT(l == curlwp);
	KASSERT(l->l_stat == LSONPROC);

	/*
	 * XXXSMP If we are using h/w performance counters, restore context.
	 * XXXSMP preemption problem.
	 */
#if PERFCTRS
	if (PMC_ENABLED(l->l_proc)) {
		pmc_restore_context(l->l_proc);
	}
#endif
	SYSCALL_TIME_WAKEUP(l);
	LOCKDEBUG_BARRIER(NULL, 1);

	return retval;
}

/*
 * The machine independent parts of context switch to oblivion.
 * Does not return.  Call with the LWP unlocked.
 */
void
lwp_exit_switchaway(lwp_t *l)
{
	struct cpu_info *ci;
	struct lwp *newl;
	struct bintime bt;

	ci = l->l_cpu;

	KASSERT(kpreempt_disabled());
	KASSERT(l->l_stat == LSZOMB || l->l_stat == LSIDL);
	KASSERT(ci == curcpu());
	LOCKDEBUG_BARRIER(NULL, 0);

	kstack_check_magic(l);

	/* Count time spent in current system call */
	SYSCALL_TIME_SLEEP(l);
	binuptime(&bt);
	updatertime(l, &bt);

	/* Must stay at IPL_SCHED even after releasing run queue lock. */
	(void)splsched();

	/*
	 * Let sched_nextlwp() select the LWP to run the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * the new thread would be unlocked after setting the LWP-lock.
	 */
	spc_lock(ci);
#ifndef __HAVE_FAST_SOFTINTS
	if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	} else
#endif	/* !__HAVE_FAST_SOFTINTS */
	{
		newl = nextlwp(ci, &ci->ci_schedstate);
	}

	/* Update the new LWP's start time. */
	newl->l_stime = bt;
	l->l_pflag &= ~LP_RUNNING;

	/*
	 * ci_curlwp changes when a fast soft interrupt occurs.
	 * We use cpu_onproc to keep track of which kernel or
	 * user thread is running 'underneath' the software
	 * interrupt.  This is important for time accounting,
	 * itimers and forcing user threads to preempt (aston).
	 */
	ci->ci_data.cpu_onproc = newl;

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
	 */
	cpu_did_resched(l);

	/* Unlock the run queue. */
	spc_unlock(ci);

	/* Count the context switch on this CPU. */
	ci->ci_data.cpu_nswtch++;

	/* Update status for lwpctl, if present. */
	if (l->l_lwpctl != NULL)
		l->l_lwpctl->lc_curcpu = LWPCTL_CPU_EXITED;

	/*
	 * We may need to spin-wait if 'newl' is still
	 * context switching on another CPU.
	 */
	if (__predict_false(newl->l_ctxswtch != 0)) {
		u_int count;
		count = SPINLOCK_BACKOFF_MIN;
		while (newl->l_ctxswtch)
			SPINLOCK_BACKOFF(count);
	}

	/*
	 * If DTrace has set the active vtime enum to anything
	 * other than INACTIVE (0), then it should have set the
	 * function to call.
	 */
	if (__predict_false(dtrace_vtime_active)) {
		(*dtrace_vtime_switch_func)(newl);
	}

	/* Switch to the new LWP. */
	(void)cpu_switchto(NULL, newl, false);

	for (;;) continue;	/* XXX: convince gcc about "noreturn" */
	/* NOTREACHED */
}

/*
 * setrunnable: change LWP state to be runnable, placing it on the run queue.
 *
 * Call with the process and LWP locked.  Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	struct cpu_info *ci;

	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT(mutex_owned(p->p_lock));
	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0)
			signotify(l);
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		l->l_flag &= ~LW_WSUSPEND;
		p->p_nrlwps++;
		cv_broadcast(&p->p_lwpcv);
		break;
	case LSSLEEP:
		KASSERT(l->l_wchan != NULL);
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

#ifdef KERN_SA
	if (l->l_proc->p_sa)
		sa_awaken(l);
#endif /* KERN_SA */

	/*
	 * If the LWP was sleeping, start it again.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l, true);
		return;
	}

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_pflag & LP_RUNNING) != 0) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Look for a CPU to run.
	 * Set the LWP runnable.
	 */
	ci = sched_takecpu(l);
	l->l_cpu = ci;
	spc_lock(ci);
	lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
	sched_setrunnable(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;

	sched_enqueue(l, false);
	resched_cpu(l);
	lwp_unlock(l);
}

/*
 * suspendsched:
 *
 *	Convert all non-LW_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct lwp *l;
	struct proc *p;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		mutex_enter(p->p_lock);
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			continue;
		}

		p->p_stat = SSTOP;

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l == curlwp)
				continue;

			lwp_lock(l);

			/*
			 * Set L_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode.  We want to
			 * try to get as many LWPs as possible to the
			 * user / kernel boundary, so that they will
			 * release any locks that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */
				setrunnable(l);
				continue;
			}

			lwp_unlock(l);
		}

		mutex_exit(p->p_lock);
	}
	mutex_exit(proc_lock);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
	 */
	for (CPU_INFO_FOREACH(cii, ci)) {
		spc_lock(ci);
		cpu_need_resched(ci, RESCHED_IMMED);
		spc_unlock(ci);
	}
}

/*
 * sched_unsleep:
 *
 *	This is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, if the sleep timed out.  Because of this,
 *	it's not a valid action for running or idle LWPs.
 */
static void
sched_unsleep(struct lwp *l, bool cleanup)
{

	lwp_unlock(l);
	panic("sched_unsleep");
}

static void
resched_cpu(struct lwp *l)
{
	struct cpu_info *ci = l->l_cpu;

	KASSERT(lwp_locked(l, NULL));
	if (lwp_eprio(l) > ci->ci_schedstate.spc_curpriority)
		cpu_need_resched(ci, 0);
}

static void
sched_changepri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_priority = pri;
		sched_enqueue(l, false);
	} else {
		l->l_priority = pri;
	}
	resched_cpu(l);
}

static void
sched_lendpri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_inheritedprio = pri;
		sched_enqueue(l, false);
	} else {
		l->l_inheritedprio = pri;
	}
	resched_cpu(l);
}

struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}

/* Decay 95% of proc::p_pctcpu in 60 seconds, ccpu = exp(-1/20) */
const fixpt_t ccpu = 0.95122942450071400909 * FSCALE;

/*
 * Constants for averages over 1, 5 and 15 minutes when sampling at
 * 5 second intervals.
 */
static const fixpt_t cexp[ ] = {
	0.9200444146293232 * FSCALE,	/* exp(-1/12) */
	0.9834714538216174 * FSCALE,	/* exp(-1/60) */
	0.9944598480048967 * FSCALE,	/* exp(-1/180) */
};
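
/*
 * Informal check on the constants above: sched_pstats() is expected to run
 * once per second, so l_pctcpu/p_pctcpu are scaled by ccpu = exp(-1/20)
 * each second; after 60 seconds the remaining fraction is
 * exp(-60/20) = exp(-3) ~= 0.05, i.e. roughly 95% of the value has
 * decayed, matching the comment on ccpu.  The load averages are updated
 * every fifth call (every 5 seconds) as exponentially weighted moving
 * averages,
 *
 *	ldavg = cexp * ldavg + (1 - cexp) * nrun
 *
 * in FSCALE fixed point, where cexp is exp(-5/60), exp(-5/300) and
 * exp(-5/900) for the 1, 5 and 15 minute averages respectively, i.e. the
 * same values as exp(-1/12), exp(-1/60) and exp(-1/180) listed above.
 */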

/*
 * sched_pstats:
 *
 * => Update process statistics and check CPU resource allocation.
 * => Call scheduler-specific hook to eventually adjust LWP priorities.
 * => Compute load average of a quantity on 1, 5 and 15 minute intervals.
 */
void
sched_pstats(void)
{
	extern struct loadavg averunnable;
	struct loadavg *avg = &averunnable;
	const int clkhz = (stathz != 0 ? stathz : hz);
	static bool backwards = false;
	static u_int lavg_count = 0;
	struct proc *p;
	int nrun;

	sched_pstats_ticks++;
	if (++lavg_count >= 5) {
		lavg_count = 0;
		nrun = 0;
	}
	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		struct lwp *l;
		struct rlimit *rlim;
		long runtm;
		int sig;

		/* Increment sleep time (if sleeping), ignore overflow. */
		mutex_enter(p->p_lock);
		runtm = p->p_rtime.sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			fixpt_t lpctcpu;
			u_int lcpticks;

			if (__predict_false((l->l_flag & LW_IDLE) != 0))
				continue;
			lwp_lock(l);
			runtm += l->l_rtime.sec;
			l->l_swtime++;
			sched_lwp_stats(l);

			/* For load average calculation. */
			if (__predict_false(lavg_count == 0) &&
			    (l->l_flag & (LW_SINTR | LW_SYSTEM)) == 0) {
				switch (l->l_stat) {
				case LSSLEEP:
					if (l->l_slptime > 1) {
						break;
					}
				case LSRUN:
				case LSONPROC:
				case LSIDL:
					nrun++;
				}
			}
			lwp_unlock(l);

			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
			if (l->l_slptime != 0)
				continue;

			lpctcpu = l->l_pctcpu;
			lcpticks = atomic_swap_uint(&l->l_cpticks, 0);
			lpctcpu += ((FSCALE - ccpu) *
			    (lcpticks * FSCALE / clkhz)) >> FSHIFT;
			l->l_pctcpu = lpctcpu;
		}
		/* Calculating p_pctcpu only for ps(1) */
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over the hard limit, kill it with SIGKILL.
		 * If over the soft limit, send SIGXCPU and raise
		 * the soft limit a little.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		sig = 0;
		if (__predict_false(runtm >= rlim->rlim_cur)) {
			if (runtm >= rlim->rlim_max) {
				sig = SIGKILL;
				log(LOG_NOTICE, "pid %d is killed: %s\n",
				    p->p_pid, "exceeded RLIMIT_CPU");
				uprintf("pid %d, command %s, is killed: %s\n",
				    p->p_pid, p->p_comm,
				    "exceeded RLIMIT_CPU");
			} else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}
		mutex_exit(p->p_lock);
		if (__predict_false(runtm < 0)) {
			if (!backwards) {
				backwards = true;
				printf("WARNING: negative runtime; "
				    "monotonic clock has gone backwards\n");
			}
		} else if (__predict_false(sig)) {
			KASSERT((p->p_flag & PK_SYSTEM) == 0);
			psignal(p, sig);
		}
	}
	mutex_exit(proc_lock);

	/* Load average calculation. */
	if (__predict_false(lavg_count == 0)) {
		int i;
		CTASSERT(__arraycount(cexp) == __arraycount(avg->ldavg));
		for (i = 0; i < __arraycount(cexp); i++) {
			avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
			    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
		}
	}

	/* Lightning bolt. */
	cv_broadcast(&lbolt);
}