/*	$NetBSD: kern_synch.c,v 1.314 2018/02/16 07:04:51 ozaki-r Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009
 *    The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 * Daniel Sieger.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.314 2018/02/16 07:04:51 ozaki-r Exp $");

#include "opt_kstack.h"
#include "opt_perfctrs.h"
#include "opt_dtrace.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/cpu.h>
#include <sys/pserialize.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscall_stats.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/lwpctl.h>
#include <sys/atomic.h>
#include <sys/syslog.h>

#include <uvm/uvm_extern.h>

#include <dev/lockstat.h>

#include <sys/dtrace_bsd.h>
int dtrace_vtime_active = 0;
dtrace_vtime_switch_func_t dtrace_vtime_switch_func;

static void	sched_unsleep(struct lwp *, bool);
static void	sched_changepri(struct lwp *, pri_t);
static void	sched_lendpri(struct lwp *, pri_t);
static void	resched_cpu(struct lwp *);

syncobj_t sleep_syncobj = {
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_unsleep	= sleepq_unsleep,
	.sobj_changepri	= sleepq_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= syncobj_noowner,
};

syncobj_t sched_syncobj = {
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_unsleep	= sched_unsleep,
	.sobj_changepri	= sched_changepri,
	.sobj_lendpri	= sched_lendpri,
	.sobj_owner	= syncobj_noowner,
};

/* "Lightning bolt": once a second sleep address. */
kcondvar_t		lbolt			__cacheline_aligned;

u_int			sched_pstats_ticks	__cacheline_aligned;

/* Preemption event counters. */
static struct evcnt	kpreempt_ev_crit	__cacheline_aligned;
static struct evcnt	kpreempt_ev_klock	__cacheline_aligned;
static struct evcnt	kpreempt_ev_immed	__cacheline_aligned;

void
synch_init(void)
{

	cv_init(&lbolt, "lbolt");

	evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: critical section");
	evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: kernel_lock");
	evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "immediate");
}

/*
 * OBSOLETE INTERFACE
 *
 * General sleep call.  Suspends the current LWP until a wakeup is
 * performed on the specified identifier.  The LWP will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If pri includes the PCATCH flag, signals are
 * checked before and after sleeping, else signals are not checked.
 * Returns 0 if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH
 * is set and a signal needs to be delivered, ERESTART is returned if the
 * current system call should be restarted if possible, and EINTR is
 * returned if the system call should be interrupted by the signal.
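 *
 * Illustrative sketch only (hypothetical driver code, not part of this
 * file): callers of this obsolete interface typically pair mtsleep()
 * with wakeup() on the same identifier, with a condition checked under
 * a mutex.  New code should use condvar(9) instead.
 *
 *	int
 *	example_wait(struct example_softc *sc)
 *	{
 *		int error = 0;
 *
 *		mutex_enter(&sc->sc_lock);
 *		while (!sc->sc_done && error == 0) {
 *			error = mtsleep(&sc->sc_done, PCATCH, "exwait", 0,
 *			    &sc->sc_lock);
 *		}
 *		mutex_exit(&sc->sc_lock);
 *		return error;
 *	}
 *
 * The waking side would set sc->sc_done under sc_lock and then call
 * wakeup(&sc->sc_done).  Here "example_softc", "sc_lock" and "sc_done"
 * are hypothetical names used only for illustration.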
 */
int
tsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;

	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
	return sleepq_block(timo, priority & PCATCH);
}

int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
	mutex_exit(mtx);
	error = sleepq_block(timo, priority & PCATCH);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	kmutex_t *mp;
	sleepq_t *sq;
	int error;

	KASSERT(!(timo == 0 && intr == false));

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, l, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, l, wmesg, &sleep_syncobj);
	error = sleepq_block(timo, intr);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}

/*
 * OBSOLETE INTERFACE
 *
 * Make all LWPs sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (__predict_false(cold))
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, (u_int)-1, mp);
}

/*
 * General yield call.  Puts the current LWP back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current LWP explicitly requests it (e.g. sched_yield(2)).
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * General preemption call.  Puts the current LWP back on its run queue
 * and performs an involuntary context switch.
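 *
 * Illustrative sketch only (not code from this file): machine-dependent
 * return-to-user/AST paths typically end up doing something along the
 * lines of
 *
 *	if (curcpu()->ci_want_resched)
 *		preempt();
 *
 * so that an LWP marked for rescheduling gives up the CPU on its way
 * out of the kernel.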
 */
void
preempt(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	l->l_nivcsw++;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * Handle a request made by another agent to preempt the current LWP
 * in-kernel.  Usually called when l_dopreempt may be non-zero.
 *
 * Character addresses for lockstat only.
 */
static char	in_critical_section;
static char	kernel_lock_held;
static char	is_softint;
static char	cpu_kpreempt_enter_fail;

bool
kpreempt(uintptr_t where)
{
	uintptr_t failed;
	lwp_t *l;
	int s, dop, lsflag;

	l = curlwp;
	failed = 0;
	while ((dop = l->l_dopreempt) != 0) {
		if (l->l_stat != LSONPROC) {
			/*
			 * About to block (or die), let it happen.
			 * Doesn't really count as "preemption has
			 * been blocked", since we're going to
			 * context switch.
			 */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false((l->l_flag & LW_IDLE) != 0)) {
			/* Can't preempt idle loop, don't count as failure. */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false(l->l_nopreempt != 0)) {
			/* LWP holds preemption disabled, explicitly. */
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_crit.ev_count++;
			}
			failed = (uintptr_t)&in_critical_section;
			break;
		}
		if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
			/* Can't preempt soft interrupts yet. */
			l->l_dopreempt = 0;
			failed = (uintptr_t)&is_softint;
			break;
		}
		s = splsched();
		if (__predict_false(l->l_blcnt != 0 ||
		    curcpu()->ci_biglock_wanted != NULL)) {
			/* Hold or want kernel_lock, code is not MT safe. */
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_klock.ev_count++;
			}
			failed = (uintptr_t)&kernel_lock_held;
			break;
		}
		if (__predict_false(!cpu_kpreempt_enter(where, s))) {
			/*
			 * It may be that the IPL is too high.
			 * cpu_kpreempt_enter() can schedule an
			 * interrupt to retry later.
			 */
			splx(s);
			failed = (uintptr_t)&cpu_kpreempt_enter_fail;
			break;
		}
		/* Do it! */
		if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
			kpreempt_ev_immed.ev_count++;
		}
		lwp_lock(l);
		mi_switch(l);
		l->l_nopreempt++;
		splx(s);

		/* Take care of any MD cleanup. */
		cpu_kpreempt_exit(where);
		l->l_nopreempt--;
	}

	if (__predict_true(!failed)) {
		return false;
	}

	/* Record preemption failure for reporting via lockstat. */
	atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
	lsflag = 0;
	LOCKSTAT_ENTER(lsflag);
	if (__predict_false(lsflag)) {
		if (where == 0) {
			where = (uintptr_t)__builtin_return_address(0);
		}
		/* Preemption is on, might recurse, so make it atomic. */
		if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr, NULL,
		    (void *)where) == NULL) {
			LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
			l->l_pfaillock = failed;
		}
	}
	LOCKSTAT_EXIT(lsflag);
	return true;
}

/*
 * Return true if preemption is explicitly disabled.
 */
bool
kpreempt_disabled(void)
{
	const lwp_t *l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
}

/*
 * Disable kernel preemption.
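 *
 * Illustrative sketch only (not code from this file): callers bracket a
 * short section that must stay on the current CPU, e.g. when touching
 * curcpu()-private state:
 *
 *	kpreempt_disable();
 *	ci = curcpu();
 *	... update per-CPU state via ci ...
 *	kpreempt_enable();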
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

/*
 * Reenable kernel preemption.
 */
void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}

/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */

void
updatertime(lwp_t *l, const struct bintime *now)
{

	if (__predict_false(l->l_flag & LW_IDLE))
		return;

	/* rtime += now - stime */
	bintime_add(&l->l_rtime, now);
	bintime_sub(&l->l_rtime, &l->l_stime);
}

/*
 * Select the next LWP to run on the current CPU.
 */
static inline lwp_t *
nextlwp(struct cpu_info *ci, struct schedstate_percpu *spc)
{
	lwp_t *newl;

	/*
	 * Let sched_nextlwp() select the LWP to run on the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * the new thread would be unlocked after setting the LWP lock.
	 */
	newl = sched_nextlwp();
	if (newl != NULL) {
		sched_dequeue(newl);
		KASSERT(lwp_locked(newl, spc->spc_mutex));
		KASSERT(newl->l_cpu == ci);
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
		lwp_setlock(newl, spc->spc_lwplock);
	} else {
		newl = ci->ci_data.cpu_idlelwp;
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}

	/*
	 * Only clear want_resched if there are no pending (slow)
	 * software interrupts.
	 */
	ci->ci_want_resched = ci->ci_data.cpu_softints;
	spc->spc_flags &= ~SPCF_SWITCHCLEAR;
	spc->spc_curpriority = lwp_eprio(newl);

	return newl;
}

/*
 * The machine independent parts of context switch.
 *
 * Returns 1 if another LWP was actually run.
 */
int
mi_switch(lwp_t *l)
{
	struct cpu_info *ci;
	struct schedstate_percpu *spc;
	struct lwp *newl;
	int retval, oldspl;
	struct bintime bt;
	bool returning;

	KASSERT(lwp_locked(l, NULL));
	KASSERT(kpreempt_disabled());
	LOCKDEBUG_BARRIER(l->l_mutex, 1);

	kstack_check_magic(l);

	binuptime(&bt);

	KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp);
	KASSERT((l->l_pflag & LP_RUNNING) != 0);
	KASSERT(l->l_cpu == curcpu());
	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	returning = false;
	newl = NULL;

	/*
	 * If we have been asked to switch to a specific LWP, then there
	 * is no need to inspect the run queues.  If a soft interrupt is
	 * blocking, then return to the interrupted thread without adjusting
	 * VM context or its start time: neither has been changed in order
	 * to take the interrupt.
	 */
	if (l->l_switchto != NULL) {
		if ((l->l_pflag & LP_INTR) != 0) {
			returning = true;
			softint_block(l);
			if ((l->l_pflag & LP_TIMEINTR) != 0)
				updatertime(l, &bt);
		}
		newl = l->l_switchto;
		l->l_switchto = NULL;
	}
#ifndef __HAVE_FAST_SOFTINTS
	else if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}
#endif	/* !__HAVE_FAST_SOFTINTS */

	/* Count time spent in current system call */
	if (!returning) {
		SYSCALL_TIME_SLEEP(l);

		/*
		 * XXXSMP If we are using h/w performance counters,
		 * save context.
		 */
#if PERFCTRS
		if (PMC_ENABLED(l->l_proc)) {
			pmc_save_context(l->l_proc);
		}
#endif
		updatertime(l, &bt);
	}

	/* Lock the runqueue */
	KASSERT(l->l_stat != LSRUN);
	mutex_spin_enter(spc->spc_mutex);

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	if (l->l_stat == LSONPROC && l != newl) {
		KASSERT(lwp_locked(l, spc->spc_lwplock));
		if ((l->l_flag & LW_IDLE) == 0) {
			l->l_stat = LSRUN;
			lwp_setlock(l, spc->spc_mutex);
			sched_enqueue(l, true);
			/*
			 * Handle migration.  Note that "migrating LWP" may
			 * be reset here, if interrupt/preemption happens
			 * early in idle LWP.
			 */
			if (l->l_target_cpu != NULL &&
			    (l->l_pflag & LP_BOUND) == 0) {
				KASSERT((l->l_pflag & LP_INTR) == 0);
				spc->spc_migrating = l;
			}
		} else
			l->l_stat = LSIDL;
	}

	/* Pick new LWP to run. */
	if (newl == NULL) {
		newl = nextlwp(ci, spc);
	}

	/* Items that must be updated with the CPU locked. */
	if (!returning) {
		/* Update the new LWP's start time. */
		newl->l_stime = bt;

		/*
		 * ci_curlwp changes when a fast soft interrupt occurs.
		 * We use cpu_onproc to keep track of which kernel or
		 * user thread is running 'underneath' the software
		 * interrupt.  This is important for time accounting,
		 * itimers and forcing user threads to preempt (aston).
		 */
		ci->ci_data.cpu_onproc = newl;
	}

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
	 */
	cpu_did_resched(l);
	l->l_dopreempt = 0;
	if (__predict_false(l->l_pfailaddr != 0)) {
		LOCKSTAT_FLAG(lsflag);
		LOCKSTAT_ENTER(lsflag);
		LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
		LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
		    1, l->l_pfailtime, l->l_pfailaddr);
		LOCKSTAT_EXIT(lsflag);
		l->l_pfailtime = 0;
		l->l_pfaillock = 0;
		l->l_pfailaddr = 0;
	}

	if (l != newl) {
		struct lwp *prevlwp;

		/* Release all locks, but leave the current LWP locked */
		if (l->l_mutex == spc->spc_mutex) {
			/*
			 * Drop spc_lwplock, if the current LWP has been moved
			 * to the run queue (it is now locked by spc_mutex).
			 */
			mutex_spin_exit(spc->spc_lwplock);
		} else {
			/*
			 * Otherwise, drop the spc_mutex, we are done with the
			 * run queues.
			 */
			mutex_spin_exit(spc->spc_mutex);
		}

		/*
		 * Mark that context switch is going to be performed
		 * for this LWP, to protect it from being switched
		 * to on another CPU.
		 */
		KASSERT(l->l_ctxswtch == 0);
		l->l_ctxswtch = 1;
		l->l_ncsw++;
		KASSERT((l->l_pflag & LP_RUNNING) != 0);
		l->l_pflag &= ~LP_RUNNING;

		/*
		 * Increase the count of spin-mutexes before the release
		 * of the last lock - we must remain at IPL_SCHED during
		 * the context switch.
		 */
		KASSERTMSG(ci->ci_mtx_count == -1,
		    "%s: cpu%u: ci_mtx_count (%d) != -1 "
		    "(block with spin-mutex held)",
		    __func__, cpu_index(ci), ci->ci_mtx_count);
		oldspl = MUTEX_SPIN_OLDSPL(ci);
		ci->ci_mtx_count--;
		lwp_unlock(l);

		/* Count the context switch on this CPU. */
		ci->ci_data.cpu_nswtch++;

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL)
			l->l_lwpctl->lc_curcpu = LWPCTL_CPU_NONE;

		/*
		 * Save old VM context, unless a soft interrupt
		 * handler is blocking.
		 */
		if (!returning)
			pmap_deactivate(l);

		/*
		 * We may need to spin-wait if 'newl' is still
		 * context switching on another CPU.
		 */
		if (__predict_false(newl->l_ctxswtch != 0)) {
			u_int count;
			count = SPINLOCK_BACKOFF_MIN;
			while (newl->l_ctxswtch)
				SPINLOCK_BACKOFF(count);
		}

		/*
		 * If DTrace has set the active vtime enum to anything
		 * other than INACTIVE (0), then it should have set the
		 * function to call.
		 */
		if (__predict_false(dtrace_vtime_active)) {
			(*dtrace_vtime_switch_func)(newl);
		}

		/* Switch to the new LWP. */
#ifdef MULTIPROCESSOR
		KASSERT(curlwp == ci->ci_curlwp);
#endif
		KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp);
		prevlwp = cpu_switchto(l, newl, returning);
		ci = curcpu();
#ifdef MULTIPROCESSOR
		KASSERT(curlwp == ci->ci_curlwp);
#endif
		KASSERTMSG(l == curlwp, "l %p curlwp %p prevlwp %p",
		    l, curlwp, prevlwp);

		/*
		 * Switched away - we have new curlwp.
		 * Restore VM context and IPL.
		 */
		pmap_activate(l);
		uvm_emap_switch(l);
		pcu_switchpoint(l);

		if (prevlwp != NULL) {
			/* Normalize the count of the spin-mutexes */
			ci->ci_mtx_count++;
			/* Unmark the state of context switch */
			membar_exit();
			prevlwp->l_ctxswtch = 0;
		}

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL) {
			l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
			l->l_lwpctl->lc_pctr++;
		}

		/* Note trip through cpu_switchto(). */
		pserialize_switchpoint();

		KASSERT(l->l_cpu == ci);
		splx(oldspl);
		/*
		 * note that, unless the caller disabled preemption,
		 * we can be preempted at any time after the above splx() call.
		 */
		retval = 1;
	} else {
		/* Nothing to do - just unlock and return. */
		mutex_spin_exit(spc->spc_mutex);
		lwp_unlock(l);
		retval = 0;
	}

	KASSERT(l == curlwp);
	KASSERT(l->l_stat == LSONPROC);

	/*
	 * XXXSMP If we are using h/w performance counters, restore context.
	 * XXXSMP preemption problem.
	 */
#if PERFCTRS
	if (PMC_ENABLED(l->l_proc)) {
		pmc_restore_context(l->l_proc);
	}
#endif
	SYSCALL_TIME_WAKEUP(l);
	LOCKDEBUG_BARRIER(NULL, 1);

	return retval;
}

/*
 * The machine independent parts of context switch to oblivion.
 * Does not return.  Call with the LWP unlocked.
 */
void
lwp_exit_switchaway(lwp_t *l)
{
	struct cpu_info *ci;
	struct lwp *newl;
	struct bintime bt;

	ci = l->l_cpu;

	KASSERT(kpreempt_disabled());
	KASSERT(l->l_stat == LSZOMB || l->l_stat == LSIDL);
	KASSERT(ci == curcpu());
	LOCKDEBUG_BARRIER(NULL, 0);

	kstack_check_magic(l);

	/* Count time spent in current system call */
	SYSCALL_TIME_SLEEP(l);
	binuptime(&bt);
	updatertime(l, &bt);

	/* Must stay at IPL_SCHED even after releasing run queue lock. */
	(void)splsched();

	/*
	 * Let sched_nextlwp() select the LWP to run on the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * the new thread would be unlocked after setting the LWP lock.
	 */
	spc_lock(ci);
#ifndef __HAVE_FAST_SOFTINTS
	if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	} else
#endif	/* !__HAVE_FAST_SOFTINTS */
	{
		newl = nextlwp(ci, &ci->ci_schedstate);
	}

	/* Update the new LWP's start time. */
	newl->l_stime = bt;
	l->l_pflag &= ~LP_RUNNING;

	/*
	 * ci_curlwp changes when a fast soft interrupt occurs.
	 * We use cpu_onproc to keep track of which kernel or
	 * user thread is running 'underneath' the software
	 * interrupt.  This is important for time accounting,
	 * itimers and forcing user threads to preempt (aston).
	 */
	ci->ci_data.cpu_onproc = newl;

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
	 */
	cpu_did_resched(l);

	/* Unlock the run queue. */
	spc_unlock(ci);

	/* Count the context switch on this CPU. */
	ci->ci_data.cpu_nswtch++;

	/* Update status for lwpctl, if present. */
	if (l->l_lwpctl != NULL)
		l->l_lwpctl->lc_curcpu = LWPCTL_CPU_EXITED;

	/*
	 * We may need to spin-wait if 'newl' is still
	 * context switching on another CPU.
	 */
	if (__predict_false(newl->l_ctxswtch != 0)) {
		u_int count;
		count = SPINLOCK_BACKOFF_MIN;
		while (newl->l_ctxswtch)
			SPINLOCK_BACKOFF(count);
	}

	/*
	 * If DTrace has set the active vtime enum to anything
	 * other than INACTIVE (0), then it should have set the
	 * function to call.
	 */
	if (__predict_false(dtrace_vtime_active)) {
		(*dtrace_vtime_switch_func)(newl);
	}

	/* Switch to the new LWP. */
	(void)cpu_switchto(NULL, newl, false);

	for (;;) continue;	/* XXX: convince gcc about "noreturn" */
	/* NOTREACHED */
}

/*
 * setrunnable: change LWP state to be runnable, placing it on the run queue.
 *
 * Call with the process and LWP locked.  Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	struct cpu_info *ci;

	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT(mutex_owned(p->p_lock));
	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xsig != 0)
			signotify(l);
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		l->l_flag &= ~LW_WSUSPEND;
		p->p_nrlwps++;
		cv_broadcast(&p->p_lwpcv);
		break;
	case LSSLEEP:
		KASSERT(l->l_wchan != NULL);
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

	/*
	 * If the LWP was sleeping, start it again.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l, true);
		return;
	}

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_pflag & LP_RUNNING) != 0) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Look for a CPU to run.
	 * Set the LWP runnable.
	 */
	ci = sched_takecpu(l);
	l->l_cpu = ci;
	spc_lock(ci);
	lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
	sched_setrunnable(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;

	sched_enqueue(l, false);
	resched_cpu(l);
	lwp_unlock(l);
}

/*
 * suspendsched:
 *
 * Convert all non-LW_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct lwp *l;
	struct proc *p;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		mutex_enter(p->p_lock);
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			continue;
		}

		if (p->p_stat != SSTOP) {
			if (p->p_stat != SZOMB && p->p_stat != SDEAD) {
				p->p_pptr->p_nstopchild++;
				p->p_waited = 0;
			}
			p->p_stat = SSTOP;
		}

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l == curlwp)
				continue;

			lwp_lock(l);

			/*
			 * Set L_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode.  We want to
			 * try to get as many LWPs as possible to the
			 * user/kernel boundary, so that they will release
			 * any locks that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */
				setrunnable(l);
				continue;
			}

			lwp_unlock(l);
		}

		mutex_exit(p->p_lock);
	}
	mutex_exit(proc_lock);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
	 */
	for (CPU_INFO_FOREACH(cii, ci)) {
		spc_lock(ci);
		cpu_need_resched(ci, RESCHED_IMMED);
		spc_unlock(ci);
	}
}

/*
 * sched_unsleep:
 *
 *	This is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, if the sleep timed out.  Because of this,
 *	it's not a valid action for running or idle LWPs.
 */
static void
sched_unsleep(struct lwp *l, bool cleanup)
{

	lwp_unlock(l);
	panic("sched_unsleep");
}

static void
resched_cpu(struct lwp *l)
{
	struct cpu_info *ci = l->l_cpu;

	KASSERT(lwp_locked(l, NULL));
	if (lwp_eprio(l) > ci->ci_schedstate.spc_curpriority)
		cpu_need_resched(ci, 0);
}

static void
sched_changepri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_priority = pri;
		sched_enqueue(l, false);
	} else {
		l->l_priority = pri;
	}
	resched_cpu(l);
}

static void
sched_lendpri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_inheritedprio = pri;
		l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
		sched_enqueue(l, false);
	} else {
		l->l_inheritedprio = pri;
		l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
	}
	resched_cpu(l);
}

struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}

/* Decay 95% of proc::p_pctcpu in 60 seconds, ccpu = exp(-1/20) */
const fixpt_t ccpu = 0.95122942450071400909 * FSCALE;

/*
 * Constants for averages over 1, 5 and 15 minutes when sampling at
 * 5 second intervals.
 */
static const fixpt_t cexp[] = {
	0.9200444146293232 * FSCALE,	/* exp(-1/12) */
	0.9834714538216174 * FSCALE,	/* exp(-1/60) */
	0.9944598480048967 * FSCALE,	/* exp(-1/180) */
};

/*
 * sched_pstats:
 *
 * => Update process statistics and check CPU resource allocation.
 * => Call scheduler-specific hook to eventually adjust LWP priorities.
 * => Compute load average of a quantity on 1, 5 and 15 minute intervals.
 */
void
sched_pstats(void)
{
	extern struct loadavg averunnable;
	struct loadavg *avg = &averunnable;
	const int clkhz = (stathz != 0 ? stathz : hz);
	static bool backwards = false;
	static u_int lavg_count = 0;
	struct proc *p;
	int nrun;

	sched_pstats_ticks++;
	if (++lavg_count >= 5) {
		lavg_count = 0;
		nrun = 0;
	}
	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		struct lwp *l;
		struct rlimit *rlim;
		time_t runtm;
		int sig;

		/* Increment sleep time (if sleeping), ignore overflow. */
		mutex_enter(p->p_lock);
		runtm = p->p_rtime.sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			fixpt_t lpctcpu;
			u_int lcpticks;

			if (__predict_false((l->l_flag & LW_IDLE) != 0))
				continue;
			lwp_lock(l);
			runtm += l->l_rtime.sec;
			l->l_swtime++;
			sched_lwp_stats(l);

			/* For load average calculation. */
			if (__predict_false(lavg_count == 0) &&
			    (l->l_flag & (LW_SINTR | LW_SYSTEM)) == 0) {
				switch (l->l_stat) {
				case LSSLEEP:
					if (l->l_slptime > 1) {
						break;
					}
					/* FALLTHROUGH */
				case LSRUN:
				case LSONPROC:
				case LSIDL:
					nrun++;
				}
			}
			lwp_unlock(l);

			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
			if (l->l_slptime != 0)
				continue;

			lpctcpu = l->l_pctcpu;
			lcpticks = atomic_swap_uint(&l->l_cpticks, 0);
			lpctcpu += ((FSCALE - ccpu) *
			    (lcpticks * FSCALE / clkhz)) >> FSHIFT;
			l->l_pctcpu = lpctcpu;
		}
		/* Calculating p_pctcpu only for ps(1) */
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;

		if (__predict_false(runtm < 0)) {
			if (!backwards) {
				backwards = true;
				printf("WARNING: negative runtime; "
				    "monotonic clock has gone backwards\n");
			}
			mutex_exit(p->p_lock);
			continue;
		}

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over the hard limit, kill it with SIGKILL.
		 * If over the soft limit, send SIGXCPU and raise
		 * the soft limit a little.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		sig = 0;
		if (__predict_false(runtm >= rlim->rlim_cur)) {
			if (runtm >= rlim->rlim_max) {
				sig = SIGKILL;
				log(LOG_NOTICE,
				    "pid %d, command %s, is killed: %s\n",
				    p->p_pid, p->p_comm, "exceeded RLIMIT_CPU");
				uprintf("pid %d, command %s, is killed: %s\n",
				    p->p_pid, p->p_comm, "exceeded RLIMIT_CPU");
			} else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}
		mutex_exit(p->p_lock);
		if (__predict_false(sig)) {
			KASSERT((p->p_flag & PK_SYSTEM) == 0);
			psignal(p, sig);
		}
	}
	mutex_exit(proc_lock);

	/* Load average calculation. */
	if (__predict_false(lavg_count == 0)) {
		int i;
		CTASSERT(__arraycount(cexp) == __arraycount(avg->ldavg));
		for (i = 0; i < __arraycount(cexp); i++) {
			avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
			    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
		}
	}

	/* Lightning bolt. */
	cv_broadcast(&lbolt);
}