/*	$NetBSD: kern_synch.c,v 1.305 2012/09/02 16:00:00 mlelstv Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009
 *    The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 * Daniel Sieger.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.305 2012/09/02 16:00:00 mlelstv Exp $");

#include "opt_kstack.h"
#include "opt_perfctrs.h"
#include "opt_dtrace.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/cpu.h>
#include <sys/pserialize.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscall_stats.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/lwpctl.h>
#include <sys/atomic.h>
#include <sys/simplelock.h>
#include <sys/syslog.h>

#include <uvm/uvm_extern.h>

#include <dev/lockstat.h>

#include <sys/dtrace_bsd.h>
int				dtrace_vtime_active = 0;
dtrace_vtime_switch_func_t	dtrace_vtime_switch_func;

static void	sched_unsleep(struct lwp *, bool);
static void	sched_changepri(struct lwp *, pri_t);
static void	sched_lendpri(struct lwp *, pri_t);
static void	resched_cpu(struct lwp *);

syncobj_t sleep_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sleepq_unsleep,
	sleepq_changepri,
	sleepq_lendpri,
	syncobj_noowner,
};

syncobj_t sched_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sched_unsleep,
	sched_changepri,
	sched_lendpri,
	syncobj_noowner,
};

/* "Lightning bolt": once a second sleep address. */
kcondvar_t	lbolt			__cacheline_aligned;

u_int		sched_pstats_ticks	__cacheline_aligned;

/* Preemption event counters. */
static struct evcnt kpreempt_ev_crit	__cacheline_aligned;
static struct evcnt kpreempt_ev_klock	__cacheline_aligned;
static struct evcnt kpreempt_ev_immed	__cacheline_aligned;

void
synch_init(void)
{

	cv_init(&lbolt, "lbolt");

	evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
	   "kpreempt", "defer: critical section");
	evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
	   "kpreempt", "defer: kernel_lock");
	evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
	   "kpreempt", "immediate");
}

/*
 * OBSOLETE INTERFACE
 *
 * General sleep call.  Suspends the current LWP until a wakeup is
 * performed on the specified identifier.  The LWP will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If pri includes the PCATCH flag, signals are
 * checked before and after sleeping, else signals are not checked.
 * Returns 0 if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH
 * is set and a signal needs to be delivered, ERESTART is returned if the
 * current system call should be restarted if possible, and EINTR is
 * returned if the system call should be interrupted by the signal.
 */
int
tsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;

	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
	return sleepq_block(timo, priority & PCATCH);
}

int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
	mutex_exit(mtx);
	error = sleepq_block(timo, priority & PCATCH);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	kmutex_t *mp;
	sleepq_t *sq;
	int error;

	KASSERT(!(timo == 0 && intr == false));

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, l, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, l, wmesg, &sleep_syncobj);
	error = sleepq_block(timo, intr);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}

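/*
 * Illustrative usage (not part of this file): a hypothetical driver
 * waits under its own mutex with mtsleep() and is woken with wakeup(),
 * then pauses with kpause().  The softc layout, the "sc_done"/"sc_lock"
 * names and the mstohz() conversion are assumptions made only for this
 * sketch; new code would normally use condition variables rather than
 * these obsolete interfaces.
 *
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_done) {
 *		error = mtsleep(&sc->sc_done, PWAIT | PCATCH, "scwait",
 *		    mstohz(1000), &sc->sc_lock);
 *		if (error != 0)
 *			break;		// EINTR, ERESTART or EWOULDBLOCK
 *	}
 *	mutex_exit(&sc->sc_lock);
 *
 *	// Waker, typically with sc_lock held:
 *	sc->sc_done = true;
 *	wakeup(&sc->sc_done);
 *
 *	// Sleep ~100ms with no wakeup expected, interruptible by signals:
 *	(void)kpause("zzz", true, mstohz(100), NULL);
 */
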
/*
 * OBSOLETE INTERFACE
 *
 * Make all LWPs sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (__predict_false(cold))
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, (u_int)-1, mp);
}

/*
 * General yield call.  Puts the current LWP back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current LWP explicitly requests it (e.g. sched_yield(2)).
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * General preemption call.  Puts the current LWP back on its run queue
 * and performs an involuntary context switch.
 */
void
preempt(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	l->l_nivcsw++;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * Handle a request made by another agent to preempt the current LWP
 * in-kernel.  Usually called when l_dopreempt may be non-zero.
 *
 * Character addresses for lockstat only.
 */
static char	in_critical_section;
static char	kernel_lock_held;
static char	is_softint;
static char	cpu_kpreempt_enter_fail;

bool
kpreempt(uintptr_t where)
{
	uintptr_t failed;
	lwp_t *l;
	int s, dop, lsflag;

	l = curlwp;
	failed = 0;
	while ((dop = l->l_dopreempt) != 0) {
		if (l->l_stat != LSONPROC) {
			/*
			 * About to block (or die), let it happen.
			 * Doesn't really count as "preemption has
			 * been blocked", since we're going to
			 * context switch.
			 */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false((l->l_flag & LW_IDLE) != 0)) {
			/* Can't preempt idle loop, don't count as failure. */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false(l->l_nopreempt != 0)) {
			/* LWP holds preemption disabled, explicitly. */
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_crit.ev_count++;
			}
			failed = (uintptr_t)&in_critical_section;
			break;
		}
		if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
			/* Can't preempt soft interrupts yet. */
			l->l_dopreempt = 0;
			failed = (uintptr_t)&is_softint;
			break;
		}
		s = splsched();
		if (__predict_false(l->l_blcnt != 0 ||
		    curcpu()->ci_biglock_wanted != NULL)) {
			/* Hold or want kernel_lock, code is not MT safe. */
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_klock.ev_count++;
			}
			failed = (uintptr_t)&kernel_lock_held;
			break;
		}
		if (__predict_false(!cpu_kpreempt_enter(where, s))) {
			/*
			 * It may be that the IPL is too high.
			 * cpu_kpreempt_enter() can schedule an
			 * interrupt to retry later.
			 */
			splx(s);
			failed = (uintptr_t)&cpu_kpreempt_enter_fail;
			break;
		}
		/* Do it! */
		if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
			kpreempt_ev_immed.ev_count++;
		}
		lwp_lock(l);
		mi_switch(l);
		l->l_nopreempt++;
		splx(s);

		/* Take care of any MD cleanup. */
		cpu_kpreempt_exit(where);
		l->l_nopreempt--;
	}

	if (__predict_true(!failed)) {
		return false;
	}

	/* Record preemption failure for reporting via lockstat. */
	atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
	lsflag = 0;
	LOCKSTAT_ENTER(lsflag);
	if (__predict_false(lsflag)) {
		if (where == 0) {
			where = (uintptr_t)__builtin_return_address(0);
		}
		/* Preemption is on, might recurse, so make it atomic. */
		if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr, NULL,
		    (void *)where) == NULL) {
			LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
			l->l_pfaillock = failed;
		}
	}
	LOCKSTAT_EXIT(lsflag);
	return true;
}

/*
 * Return true if preemption is explicitly disabled.
 */
bool
kpreempt_disabled(void)
{
	const lwp_t *l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
}

/*
 * Disable kernel preemption.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

/*
 * Reenable kernel preemption.
 */
void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}

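/*
 * Illustrative sketch (not part of this file): code that must stay on
 * its current CPU while touching per-CPU state can bracket the access
 * with the two functions above.  The "snapshot" variable is purely
 * hypothetical; ci_data.cpu_nswtch is just a convenient per-CPU field.
 *
 *	uint64_t snapshot;
 *
 *	kpreempt_disable();
 *	snapshot = curcpu()->ci_data.cpu_nswtch;  // cannot migrate here
 *	kpreempt_enable();
 */
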
/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */

void
updatertime(lwp_t *l, const struct bintime *now)
{

	if (__predict_false(l->l_flag & LW_IDLE))
		return;

	/* rtime += now - stime */
	bintime_add(&l->l_rtime, now);
	bintime_sub(&l->l_rtime, &l->l_stime);
}

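/*
 * For example (illustrative numbers only): if the LWP last went on the
 * CPU at l_stime = 10.25s and binuptime() now reads 10.75s, the two
 * calls above add 10.75s and subtract 10.25s, growing l_rtime by the
 * 0.50s just spent running.
 */
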
/*
 * Select next LWP from the current CPU to run.
 */
static inline lwp_t *
nextlwp(struct cpu_info *ci, struct schedstate_percpu *spc)
{
	lwp_t *newl;

	/*
	 * Let sched_nextlwp() select the LWP to run the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * the new thread will be unlocked after setting the LWP-lock.
	 */
	newl = sched_nextlwp();
	if (newl != NULL) {
		sched_dequeue(newl);
		KASSERT(lwp_locked(newl, spc->spc_mutex));
		KASSERT(newl->l_cpu == ci);
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
		lwp_setlock(newl, spc->spc_lwplock);
	} else {
		newl = ci->ci_data.cpu_idlelwp;
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}

	/*
	 * Only clear want_resched if there are no pending (slow)
	 * software interrupts.
	 */
	ci->ci_want_resched = ci->ci_data.cpu_softints;
	spc->spc_flags &= ~SPCF_SWITCHCLEAR;
	spc->spc_curpriority = lwp_eprio(newl);

	return newl;
}

/*
 * The machine-independent parts of context switch.
 *
 * Returns 1 if another LWP was actually run.
 */
int
mi_switch(lwp_t *l)
{
	struct cpu_info *ci;
	struct schedstate_percpu *spc;
	struct lwp *newl;
	int retval, oldspl;
	struct bintime bt;
	bool returning;

	KASSERT(lwp_locked(l, NULL));
	KASSERT(kpreempt_disabled());
	LOCKDEBUG_BARRIER(l->l_mutex, 1);

	kstack_check_magic(l);

	binuptime(&bt);

	KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp);
	KASSERT((l->l_pflag & LP_RUNNING) != 0);
	KASSERT(l->l_cpu == curcpu());
	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	returning = false;
	newl = NULL;

	/*
	 * If we have been asked to switch to a specific LWP, then there
	 * is no need to inspect the run queues.  If a soft interrupt is
	 * blocking, then return to the interrupted thread without adjusting
	 * VM context or its start time: neither have been changed in order
	 * to take the interrupt.
	 */
	if (l->l_switchto != NULL) {
		if ((l->l_pflag & LP_INTR) != 0) {
			returning = true;
			softint_block(l);
			if ((l->l_pflag & LP_TIMEINTR) != 0)
				updatertime(l, &bt);
		}
		newl = l->l_switchto;
		l->l_switchto = NULL;
	}
#ifndef __HAVE_FAST_SOFTINTS
	else if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}
#endif	/* !__HAVE_FAST_SOFTINTS */

	/* Count time spent in current system call */
	if (!returning) {
		SYSCALL_TIME_SLEEP(l);

		/*
		 * XXXSMP If we are using h/w performance counters,
		 * save context.
		 */
#if PERFCTRS
		if (PMC_ENABLED(l->l_proc)) {
			pmc_save_context(l->l_proc);
		}
#endif
		updatertime(l, &bt);
	}

	/* Lock the runqueue */
	KASSERT(l->l_stat != LSRUN);
	mutex_spin_enter(spc->spc_mutex);

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
581 */ 582 if (l->l_stat == LSONPROC && l != newl) { 583 KASSERT(lwp_locked(l, spc->spc_lwplock)); 584 if ((l->l_flag & LW_IDLE) == 0) { 585 l->l_stat = LSRUN; 586 lwp_setlock(l, spc->spc_mutex); 587 sched_enqueue(l, true); 588 /* 589 * Handle migration. Note that "migrating LWP" may 590 * be reset here, if interrupt/preemption happens 591 * early in idle LWP. 592 */ 593 if (l->l_target_cpu != NULL) { 594 KASSERT((l->l_pflag & LP_INTR) == 0); 595 spc->spc_migrating = l; 596 } 597 } else 598 l->l_stat = LSIDL; 599 } 600 601 /* Pick new LWP to run. */ 602 if (newl == NULL) { 603 newl = nextlwp(ci, spc); 604 } 605 606 /* Items that must be updated with the CPU locked. */ 607 if (!returning) { 608 /* Update the new LWP's start time. */ 609 newl->l_stime = bt; 610 611 /* 612 * ci_curlwp changes when a fast soft interrupt occurs. 613 * We use cpu_onproc to keep track of which kernel or 614 * user thread is running 'underneath' the software 615 * interrupt. This is important for time accounting, 616 * itimers and forcing user threads to preempt (aston). 617 */ 618 ci->ci_data.cpu_onproc = newl; 619 } 620 621 /* 622 * Preemption related tasks. Must be done with the current 623 * CPU locked. 624 */ 625 cpu_did_resched(l); 626 l->l_dopreempt = 0; 627 if (__predict_false(l->l_pfailaddr != 0)) { 628 LOCKSTAT_FLAG(lsflag); 629 LOCKSTAT_ENTER(lsflag); 630 LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime); 631 LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN, 632 1, l->l_pfailtime, l->l_pfailaddr); 633 LOCKSTAT_EXIT(lsflag); 634 l->l_pfailtime = 0; 635 l->l_pfaillock = 0; 636 l->l_pfailaddr = 0; 637 } 638 639 if (l != newl) { 640 struct lwp *prevlwp; 641 642 /* Release all locks, but leave the current LWP locked */ 643 if (l->l_mutex == spc->spc_mutex) { 644 /* 645 * Drop spc_lwplock, if the current LWP has been moved 646 * to the run queue (it is now locked by spc_mutex). 647 */ 648 mutex_spin_exit(spc->spc_lwplock); 649 } else { 650 /* 651 * Otherwise, drop the spc_mutex, we are done with the 652 * run queues. 653 */ 654 mutex_spin_exit(spc->spc_mutex); 655 } 656 657 /* 658 * Mark that context switch is going to be performed 659 * for this LWP, to protect it from being switched 660 * to on another CPU. 661 */ 662 KASSERT(l->l_ctxswtch == 0); 663 l->l_ctxswtch = 1; 664 l->l_ncsw++; 665 KASSERT((l->l_pflag & LP_RUNNING) != 0); 666 l->l_pflag &= ~LP_RUNNING; 667 668 /* 669 * Increase the count of spin-mutexes before the release 670 * of the last lock - we must remain at IPL_SCHED during 671 * the context switch. 672 */ 673 KASSERTMSG(ci->ci_mtx_count == -1, 674 "%s: cpu%u: ci_mtx_count (%d) != -1 " 675 "(block with spin-mutex held)", 676 __func__, cpu_index(ci), ci->ci_mtx_count); 677 oldspl = MUTEX_SPIN_OLDSPL(ci); 678 ci->ci_mtx_count--; 679 lwp_unlock(l); 680 681 /* Count the context switch on this CPU. */ 682 ci->ci_data.cpu_nswtch++; 683 684 /* Update status for lwpctl, if present. */ 685 if (l->l_lwpctl != NULL) 686 l->l_lwpctl->lc_curcpu = LWPCTL_CPU_NONE; 687 688 /* 689 * Save old VM context, unless a soft interrupt 690 * handler is blocking. 691 */ 692 if (!returning) 693 pmap_deactivate(l); 694 695 /* 696 * We may need to spin-wait if 'newl' is still 697 * context switching on another CPU. 
		if (__predict_false(newl->l_ctxswtch != 0)) {
			u_int count;
			count = SPINLOCK_BACKOFF_MIN;
			while (newl->l_ctxswtch)
				SPINLOCK_BACKOFF(count);
		}

		/*
		 * If DTrace has set the active vtime enum to anything
		 * other than INACTIVE (0), then it should have set the
		 * function to call.
		 */
		if (__predict_false(dtrace_vtime_active)) {
			(*dtrace_vtime_switch_func)(newl);
		}

		/* Switch to the new LWP. */
#ifdef MULTIPROCESSOR
		KASSERT(curlwp == ci->ci_curlwp);
#endif
		KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp);
		prevlwp = cpu_switchto(l, newl, returning);
		ci = curcpu();
#ifdef MULTIPROCESSOR
		KASSERT(curlwp == ci->ci_curlwp);
#endif
		KASSERTMSG(l == curlwp, "l %p curlwp %p prevlwp %p",
		    l, curlwp, prevlwp);

		/*
		 * Switched away - we have new curlwp.
		 * Restore VM context and IPL.
		 */
		pmap_activate(l);
		uvm_emap_switch(l);
		pcu_switchpoint(l);

		if (prevlwp != NULL) {
			/* Normalize the count of the spin-mutexes */
			ci->ci_mtx_count++;
			/* Unmark the state of context switch */
			membar_exit();
			prevlwp->l_ctxswtch = 0;
		}

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL) {
			l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
			l->l_lwpctl->lc_pctr++;
		}

		/* Note trip through cpu_switchto(). */
		pserialize_switchpoint();

		KASSERT(l->l_cpu == ci);
		splx(oldspl);
		/*
		 * note that, unless the caller disabled preemption,
		 * we can be preempted at any time after the above splx() call.
		 */
		retval = 1;
	} else {
		/* Nothing to do - just unlock and return. */
		mutex_spin_exit(spc->spc_mutex);
		lwp_unlock(l);
		retval = 0;
	}

	KASSERT(l == curlwp);
	KASSERT(l->l_stat == LSONPROC);

	/*
	 * XXXSMP If we are using h/w performance counters, restore context.
	 * XXXSMP preemption problem.
	 */
#if PERFCTRS
	if (PMC_ENABLED(l->l_proc)) {
		pmc_restore_context(l->l_proc);
	}
#endif
	SYSCALL_TIME_WAKEUP(l);
	LOCKDEBUG_BARRIER(NULL, 1);

	return retval;
}

/*
 * The machine-independent parts of context switch to oblivion.
 * Does not return.  Call with the LWP unlocked.
 */
void
lwp_exit_switchaway(lwp_t *l)
{
	struct cpu_info *ci;
	struct lwp *newl;
	struct bintime bt;

	ci = l->l_cpu;

	KASSERT(kpreempt_disabled());
	KASSERT(l->l_stat == LSZOMB || l->l_stat == LSIDL);
	KASSERT(ci == curcpu());
	LOCKDEBUG_BARRIER(NULL, 0);

	kstack_check_magic(l);

	/* Count time spent in current system call */
	SYSCALL_TIME_SLEEP(l);
	binuptime(&bt);
	updatertime(l, &bt);

	/* Must stay at IPL_SCHED even after releasing run queue lock. */
	(void)splsched();

	/*
	 * Let sched_nextlwp() select the LWP to run the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * the new thread will be unlocked after setting the LWP-lock.
	 */
	spc_lock(ci);
#ifndef __HAVE_FAST_SOFTINTS
	if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	} else
#endif	/* !__HAVE_FAST_SOFTINTS */
	{
		newl = nextlwp(ci, &ci->ci_schedstate);
	}

	/* Update the new LWP's start time. */
	newl->l_stime = bt;
	l->l_pflag &= ~LP_RUNNING;

	/*
	 * ci_curlwp changes when a fast soft interrupt occurs.
	 * We use cpu_onproc to keep track of which kernel or
	 * user thread is running 'underneath' the software
	 * interrupt.  This is important for time accounting,
	 * itimers and forcing user threads to preempt (aston).
	 */
	ci->ci_data.cpu_onproc = newl;

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
	 */
	cpu_did_resched(l);

	/* Unlock the run queue. */
	spc_unlock(ci);

	/* Count the context switch on this CPU. */
	ci->ci_data.cpu_nswtch++;

	/* Update status for lwpctl, if present. */
	if (l->l_lwpctl != NULL)
		l->l_lwpctl->lc_curcpu = LWPCTL_CPU_EXITED;

	/*
	 * We may need to spin-wait if 'newl' is still
	 * context switching on another CPU.
	 */
	if (__predict_false(newl->l_ctxswtch != 0)) {
		u_int count;
		count = SPINLOCK_BACKOFF_MIN;
		while (newl->l_ctxswtch)
			SPINLOCK_BACKOFF(count);
	}

	/*
	 * If DTrace has set the active vtime enum to anything
	 * other than INACTIVE (0), then it should have set the
	 * function to call.
	 */
	if (__predict_false(dtrace_vtime_active)) {
		(*dtrace_vtime_switch_func)(newl);
	}

	/* Switch to the new LWP. */
	(void)cpu_switchto(NULL, newl, false);

	for (;;) continue;	/* XXX: convince gcc about "noreturn" */
	/* NOTREACHED */
}

/*
 * setrunnable: change LWP state to be runnable, placing it on the run queue.
 *
 * Call with the process and LWP locked.  Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	struct cpu_info *ci;

	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT(mutex_owned(p->p_lock));
	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0)
			signotify(l);
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		l->l_flag &= ~LW_WSUSPEND;
		p->p_nrlwps++;
		cv_broadcast(&p->p_lwpcv);
		break;
	case LSSLEEP:
		KASSERT(l->l_wchan != NULL);
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

	/*
	 * If the LWP was sleeping, start it again.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l, true);
		return;
	}

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_pflag & LP_RUNNING) != 0) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Look for a CPU to run.
	 * Set the LWP runnable.
	 */
	ci = sched_takecpu(l);
	l->l_cpu = ci;
	spc_lock(ci);
	lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
	sched_setrunnable(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;

	sched_enqueue(l, false);
	resched_cpu(l);
	lwp_unlock(l);
}

/*
 * suspendsched:
 *
 * Convert all non-LW_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct lwp *l;
	struct proc *p;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		mutex_enter(p->p_lock);
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			continue;
		}

		p->p_stat = SSTOP;

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l == curlwp)
				continue;

			lwp_lock(l);

			/*
			 * Set LW_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode.  We want to
			 * get as many LWPs as possible to the user/kernel
			 * boundary, so that they will release any locks
			 * that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */
				setrunnable(l);
				continue;
			}

			lwp_unlock(l);
		}

		mutex_exit(p->p_lock);
	}
	mutex_exit(proc_lock);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
	 */
	for (CPU_INFO_FOREACH(cii, ci)) {
		spc_lock(ci);
		cpu_need_resched(ci, RESCHED_IMMED);
		spc_unlock(ci);
	}
}

/*
 * sched_unsleep:
 *
 * This is called when the LWP has not been awoken normally but instead
 * interrupted: for example, if the sleep timed out.  Because of this,
 * it's not a valid action for running or idle LWPs.
 */
static void
sched_unsleep(struct lwp *l, bool cleanup)
{

	lwp_unlock(l);
	panic("sched_unsleep");
}

static void
resched_cpu(struct lwp *l)
{
	struct cpu_info *ci = l->l_cpu;

	KASSERT(lwp_locked(l, NULL));
	if (lwp_eprio(l) > ci->ci_schedstate.spc_curpriority)
		cpu_need_resched(ci, 0);
}

static void
sched_changepri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_priority = pri;
		sched_enqueue(l, false);
	} else {
		l->l_priority = pri;
	}
	resched_cpu(l);
}

static void
sched_lendpri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_inheritedprio = pri;
		sched_enqueue(l, false);
	} else {
		l->l_inheritedprio = pri;
	}
	resched_cpu(l);
}

struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}

/* Decay 95% of proc::p_pctcpu in 60 seconds, ccpu = exp(-1/20) */
const fixpt_t ccpu = 0.95122942450071400909 * FSCALE;

/*
 * Constants for averages over 1, 5 and 15 minutes when sampling at
 * 5 second intervals.
 */
static const fixpt_t cexp[] = {
	0.9200444146293232 * FSCALE,	/* exp(-1/12) */
	0.9834714538216174 * FSCALE,	/* exp(-1/60) */
	0.9944598480048967 * FSCALE,	/* exp(-1/180) */
};

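/*
 * Where these constants come from (a worked sketch of the arithmetic,
 * assuming sched_pstats() is called once per second, which matches the
 * 5-call / 5-second load sampling below):
 *
 *  - p_pctcpu/l_pctcpu are multiplied by ccpu = exp(-1/20) on every
 *    call, so after 60 seconds the old contribution has shrunk to
 *    exp(-60/20) = exp(-3) ~= 0.05, i.e. 95% has decayed away.
 *
 *  - The load averages are sampled every 5th call and updated as an
 *    exponentially weighted moving average, in real terms:
 *
 *	ldavg = c * ldavg + nrun * (1 - c),	c = cexp[i] / FSCALE
 *
 *    With 12, 60 and 180 samples per 1, 5 and 15 minutes respectively,
 *    cexp[] = exp(-1/12), exp(-1/60), exp(-1/180) gives each average a
 *    time constant of 1, 5 and 15 minutes (the old value is weighted by
 *    exp(-1) ~= 0.37 after one such interval).
 */
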
/*
 * sched_pstats:
 *
 * => Update process statistics and check CPU resource allocation.
 * => Call scheduler-specific hook to eventually adjust LWP priorities.
 * => Compute load average of a quantity on 1, 5 and 15 minute intervals.
 */
void
sched_pstats(void)
{
	extern struct loadavg averunnable;
	struct loadavg *avg = &averunnable;
	const int clkhz = (stathz != 0 ? stathz : hz);
	static bool backwards = false;
	static u_int lavg_count = 0;
	struct proc *p;
	int nrun;

	sched_pstats_ticks++;
	if (++lavg_count >= 5) {
		lavg_count = 0;
		nrun = 0;
	}
	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		struct lwp *l;
		struct rlimit *rlim;
		time_t runtm;
		int sig;

		/* Increment sleep time (if sleeping), ignore overflow. */
		mutex_enter(p->p_lock);
		runtm = p->p_rtime.sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			fixpt_t lpctcpu;
			u_int lcpticks;

			if (__predict_false((l->l_flag & LW_IDLE) != 0))
				continue;
			lwp_lock(l);
			runtm += l->l_rtime.sec;
			l->l_swtime++;
			sched_lwp_stats(l);

			/* For load average calculation. */
			if (__predict_false(lavg_count == 0) &&
			    (l->l_flag & (LW_SINTR | LW_SYSTEM)) == 0) {
				switch (l->l_stat) {
				case LSSLEEP:
					if (l->l_slptime > 1) {
						break;
					}
				case LSRUN:
				case LSONPROC:
				case LSIDL:
					nrun++;
				}
			}
			lwp_unlock(l);

			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
			if (l->l_slptime != 0)
				continue;

			lpctcpu = l->l_pctcpu;
			lcpticks = atomic_swap_uint(&l->l_cpticks, 0);
			lpctcpu += ((FSCALE - ccpu) *
			    (lcpticks * FSCALE / clkhz)) >> FSHIFT;
			l->l_pctcpu = lpctcpu;
		}
		/* Calculating p_pctcpu only for ps(1) */
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;

		if (__predict_false(runtm < 0)) {
			if (!backwards) {
				backwards = true;
				printf("WARNING: negative runtime; "
				    "monotonic clock has gone backwards\n");
			}
			mutex_exit(p->p_lock);
			continue;
		}

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over the hard limit, kill it with SIGKILL.
		 * If over the soft limit, send SIGXCPU and raise
		 * the soft limit a little.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		sig = 0;
		if (__predict_false(runtm >= rlim->rlim_cur)) {
			if (runtm >= rlim->rlim_max) {
				sig = SIGKILL;
				log(LOG_NOTICE, "pid %d is killed: %s\n",
				    p->p_pid, "exceeded RLIMIT_CPU");
				uprintf("pid %d, command %s, is killed: %s\n",
				    p->p_pid, p->p_comm,
				    "exceeded RLIMIT_CPU");
			} else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}
		mutex_exit(p->p_lock);
		if (__predict_false(sig)) {
			KASSERT((p->p_flag & PK_SYSTEM) == 0);
			psignal(p, sig);
		}
	}
	mutex_exit(proc_lock);

	/* Load average calculation. */
	if (__predict_false(lavg_count == 0)) {
		int i;
		CTASSERT(__arraycount(cexp) == __arraycount(avg->ldavg));
		for (i = 0; i < __arraycount(cexp); i++) {
			avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
			    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
		}
	}

	/* Lightning bolt. */
	cv_broadcast(&lbolt);
}