/*	$NetBSD: kern_synch.c,v 1.358 2023/07/17 12:54:29 riastradh Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009, 2019, 2020
 *    The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 * Daniel Sieger.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.358 2023/07/17 12:54:29 riastradh Exp $");

#include "opt_kstack.h"
#include "opt_ddb.h"
#include "opt_dtrace.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/cpu.h>
#include <sys/pserialize.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/syscall_stats.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/lwpctl.h>
#include <sys/atomic.h>
#include <sys/syslog.h>

#include <uvm/uvm_extern.h>

#include <dev/lockstat.h>

#include <sys/dtrace_bsd.h>
int				dtrace_vtime_active=0;
dtrace_vtime_switch_func_t	dtrace_vtime_switch_func;

#ifdef DDB
#include <ddb/ddb.h>
#endif

static void	sched_unsleep(struct lwp *, bool);
static void	sched_changepri(struct lwp *, pri_t);
static void	sched_lendpri(struct lwp *, pri_t);

syncobj_t sleep_syncobj = {
	.sobj_name	= "sleep",
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_unsleep	= sleepq_unsleep,
	.sobj_changepri	= sleepq_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= syncobj_noowner,
};

syncobj_t sched_syncobj = {
	.sobj_name	= "sched",
	.sobj_flag	= SOBJ_SLEEPQ_SORTED,
	.sobj_unsleep	= sched_unsleep,
	.sobj_changepri	= sched_changepri,
	.sobj_lendpri	= sched_lendpri,
	.sobj_owner	= syncobj_noowner,
};

syncobj_t kpause_syncobj = {
	.sobj_name	= "kpause",
	.sobj_flag	= SOBJ_SLEEPQ_NULL,
	.sobj_unsleep	= sleepq_unsleep,
	.sobj_changepri	= sleepq_changepri,
	.sobj_lendpri	= sleepq_lendpri,
	.sobj_owner	= syncobj_noowner,
};

/* "Lightning bolt": once a second sleep address. */
kcondvar_t		lbolt			__cacheline_aligned;

u_int			sched_pstats_ticks	__cacheline_aligned;

/* Preemption event counters. */
static struct evcnt	kpreempt_ev_crit	__cacheline_aligned;
static struct evcnt	kpreempt_ev_klock	__cacheline_aligned;
static struct evcnt	kpreempt_ev_immed	__cacheline_aligned;

void
synch_init(void)
{

	cv_init(&lbolt, "lbolt");

	evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: critical section");
	evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: kernel_lock");
	evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "immediate");
}

/*
 * OBSOLETE INTERFACE
 *
 * General sleep call.  Suspends the current LWP until a wakeup is
 * performed on the specified identifier.  The LWP will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If pri includes the PCATCH flag, signals are
 * checked before and after sleeping, else signals are not checked.
 * Returns 0 if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH
 * is set and a signal needs to be delivered, ERESTART is returned if the
 * current system call should be restarted if possible, and EINTR is
 * returned if the system call should be interrupted by the signal.
 */
int
tsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	bool catch_p;

	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);
	//KASSERT(KERNEL_LOCKED_P());

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		return 0;
	}

	l->l_kpriority = true;
	catch_p = priority & PCATCH;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj, catch_p);
	return sleepq_block(timo, catch_p, &sleep_syncobj);
}
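
/*
 * Illustrative sketch only, not part of this file: a typical consumer of
 * the obsolete tsleep()/wakeup() pair sleeps on the address of some shared
 * datum and is woken by whoever clears it.  The "sc" softc, its sc_busy
 * field and the chosen priority are hypothetical.
 *
 *	while (sc->sc_busy) {
 *		error = tsleep(&sc->sc_busy, PRIBIO | PCATCH, "scbusy", hz);
 *		if (error != 0 && error != EWOULDBLOCK)
 *			return error;		// interrupted by a signal
 *	}
 *	...
 *	sc->sc_busy = false;
 *	wakeup(&sc->sc_busy);
 *
 * New code generally uses condition variables (cv_wait()/cv_broadcast())
 * with an explicit interlock mutex instead of this interface.
 */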

int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	bool catch_p;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);
	KASSERT(ident != &lbolt);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	l->l_kpriority = true;
	catch_p = priority & PCATCH;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj, catch_p);
	mutex_exit(mtx);
	error = sleepq_block(timo, catch_p, &sleep_syncobj);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	int error;

	KASSERT(timo != 0 || intr);

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	l->l_kpriority = true;
	lwp_lock(l);
	KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks);
	sleepq_enqueue(NULL, l, wmesg, &kpause_syncobj, intr);
	error = sleepq_block(timo, intr, &kpause_syncobj);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}
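
/*
 * Illustrative sketch only, not part of this file: kpause() suits simple
 * delays where no wakeup() is expected, for example polling a hypothetical
 * device register every 10ms until it reports ready (FOO_STATUS and
 * FOO_READY are made-up names):
 *
 *	while ((bus_space_read_4(sc->sc_iot, sc->sc_ioh, FOO_STATUS)
 *	    & FOO_READY) == 0) {
 *		error = kpause("foordy", true, mstohz(10), NULL);
 *		if (error != 0 && error != EWOULDBLOCK)
 *			return error;		// interrupted by a signal
 *	}
 *
 * Passing a non-NULL mutex makes kpause() drop it across the sleep and
 * reacquire it before returning.
 */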

/*
 * OBSOLETE INTERFACE
 *
 * Make all LWPs sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (__predict_false(cold))
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, (u_int)-1, mp);
}

/*
 * General yield call.  Puts the current LWP back on its run queue and
 * performs a context switch.
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);

	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);

	/* Voluntary - ditch kpriority boost. */
	l->l_kpriority = false;
	spc_lock(l->l_cpu);
	mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * General preemption call.  Puts the current LWP back on its run queue
 * and performs an involuntary context switch.  Differs from yield()
 * in that:
 *
 * - It's counted differently (involuntary vs. voluntary).
 * - Realtime threads go to the head of their runqueue vs. tail for yield().
 * - Priority boost is retained unless LWP has exceeded timeslice.
 */
void
preempt(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);

	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);

	spc_lock(l->l_cpu);
	/* Involuntary - keep kpriority boost unless a CPU hog. */
	if ((l->l_cpu->ci_schedstate.spc_flags & SPCF_SHOULDYIELD) != 0) {
		l->l_kpriority = false;
	}
	l->l_pflag |= LP_PREEMPTING;
	mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * Return true if the current LWP should yield the processor.  Intended to
 * be used by long-running code in the kernel.
 */
inline bool
preempt_needed(void)
{
	lwp_t *l = curlwp;
	int needed;

	KPREEMPT_DISABLE(l);
	needed = l->l_cpu->ci_want_resched;
	KPREEMPT_ENABLE(l);

	return (needed != 0);
}

/*
 * A breathing point for long-running code in the kernel.
 */
void
preempt_point(void)
{

	if (__predict_false(preempt_needed())) {
		preempt();
	}
}
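
/*
 * Illustrative sketch only, not part of this file: long-running kernel
 * loops can call preempt_point() periodically so that a pending reschedule
 * is honoured instead of holding the CPU.  The work list walked here is
 * hypothetical.
 *
 *	for (i = 0; i < npages; i++) {
 *		scrub_one_page(&pages[i]);
 *		preempt_point();
 *	}
 *
 * preempt_needed() can be checked directly when the caller wants to batch
 * up more work before yielding.
 */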

/*
 * Handle a request made by another agent to preempt the current LWP
 * in-kernel.  Usually called when l_dopreempt may be non-zero.
 *
 * Character addresses for lockstat only.
 */
static char	kpreempt_is_disabled;
static char	kernel_lock_held;
static char	is_softint_lwp;
static char	spl_is_raised;

bool
kpreempt(uintptr_t where)
{
	uintptr_t failed;
	lwp_t *l;
	int s, dop, lsflag;

	l = curlwp;
	failed = 0;
	while ((dop = l->l_dopreempt) != 0) {
		if (l->l_stat != LSONPROC) {
			/*
			 * About to block (or die), let it happen.
			 * Doesn't really count as "preemption has
			 * been blocked", since we're going to
			 * context switch.
			 */
			atomic_swap_uint(&l->l_dopreempt, 0);
			return true;
		}
		KASSERT((l->l_flag & LW_IDLE) == 0);
		if (__predict_false(l->l_nopreempt != 0)) {
			/* LWP holds preemption disabled, explicitly. */
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_crit.ev_count++;
			}
			failed = (uintptr_t)&kpreempt_is_disabled;
			break;
		}
		if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
			/* Can't preempt soft interrupts yet. */
			atomic_swap_uint(&l->l_dopreempt, 0);
			failed = (uintptr_t)&is_softint_lwp;
			break;
		}
		s = splsched();
		if (__predict_false(l->l_blcnt != 0 ||
		    curcpu()->ci_biglock_wanted != NULL)) {
			/* Hold or want kernel_lock, code is not MT safe. */
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_klock.ev_count++;
			}
			failed = (uintptr_t)&kernel_lock_held;
			break;
		}
		if (__predict_false(!cpu_kpreempt_enter(where, s))) {
			/*
			 * It may be that the IPL is too high.
			 * cpu_kpreempt_enter() can schedule an
			 * interrupt to retry later.
			 */
			splx(s);
			failed = (uintptr_t)&spl_is_raised;
			break;
		}
		/* Do it! */
		if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
			kpreempt_ev_immed.ev_count++;
		}
		lwp_lock(l);
		/* Involuntary - keep kpriority boost. */
		l->l_pflag |= LP_PREEMPTING;
		spc_lock(l->l_cpu);
		mi_switch(l);
		l->l_nopreempt++;
		splx(s);

		/* Take care of any MD cleanup. */
		cpu_kpreempt_exit(where);
		l->l_nopreempt--;
	}

	if (__predict_true(!failed)) {
		return false;
	}

	/* Record preemption failure for reporting via lockstat. */
	atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
	lsflag = 0;
	LOCKSTAT_ENTER(lsflag);
	if (__predict_false(lsflag)) {
		if (where == 0) {
			where = (uintptr_t)__builtin_return_address(0);
		}
		/* Preemption is on, might recurse, so make it atomic. */
		if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr, NULL,
		    (void *)where) == NULL) {
			LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
			l->l_pfaillock = failed;
		}
	}
	LOCKSTAT_EXIT(lsflag);
	return true;
}

/*
 * Return true if preemption is explicitly disabled.
 */
bool
kpreempt_disabled(void)
{
	const lwp_t *l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || (l->l_pflag & LP_INTR) != 0 ||
	    cpu_kpreempt_disabled();
}

/*
 * Disable kernel preemption.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

/*
 * Reenable kernel preemption.
 */
void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}
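
/*
 * Illustrative sketch only, not part of this file: kpreempt_disable() and
 * kpreempt_enable() bracket short sections that must not migrate to
 * another CPU, typically while working on per-CPU state:
 *
 *	kpreempt_disable();
 *	ci = curcpu();
 *	... use ci; the LWP cannot be preempted and moved off this CPU ...
 *	kpreempt_enable();
 *
 * The bracketed section should be kept short, since it delays kernel
 * preemption.
 */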

/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */

void
updatertime(lwp_t *l, const struct bintime *now)
{
	static bool backwards = false;

	if (__predict_false(l->l_flag & LW_IDLE))
		return;

	if (__predict_false(bintimecmp(now, &l->l_stime, <)) && !backwards) {
		char caller[128];

#ifdef DDB
		db_symstr(caller, sizeof(caller),
		    (db_expr_t)(intptr_t)__builtin_return_address(0),
		    DB_STGY_PROC);
#else
		snprintf(caller, sizeof(caller), "%p",
		    __builtin_return_address(0));
#endif
		backwards = true;
		printf("WARNING: lwp %ld (%s%s%s) flags 0x%x:"
		    " timecounter went backwards"
		    " from (%jd + 0x%016"PRIx64"/2^64) sec"
		    " to (%jd + 0x%016"PRIx64"/2^64) sec"
		    " in %s\n",
		    (long)l->l_lid,
		    l->l_proc->p_comm,
		    l->l_name ? " " : "",
		    l->l_name ? l->l_name : "",
		    l->l_pflag,
		    (intmax_t)l->l_stime.sec, l->l_stime.frac,
		    (intmax_t)now->sec, now->frac,
		    caller);
	}

	/* rtime += now - stime */
	bintime_add(&l->l_rtime, now);
	bintime_sub(&l->l_rtime, &l->l_stime);
}
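
/*
 * Illustrative sketch only, not part of this file: the "rtime += now -
 * stime" arithmetic above uses the same bintime primitives a hypothetical
 * caller would use to measure an elapsed interval:
 *
 *	struct bintime start, delta;
 *
 *	binuptime(&start);
 *	... do some work ...
 *	binuptime(&delta);
 *	bintime_sub(&delta, &start);	// delta = end - start
 */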

/*
 * Select the next LWP from the current CPU to run.
 */
static inline lwp_t *
nextlwp(struct cpu_info *ci, struct schedstate_percpu *spc)
{
	lwp_t *newl;

	/*
	 * Let sched_nextlwp() select the LWP to run on the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * On arrival here LWPs on a run queue are locked by spc_mutex which
	 * is currently held.  Idle LWPs are always locked by spc_lwplock,
	 * which may or may not be held here.  On exit from this code block,
	 * in all cases newl is locked by spc_lwplock.
	 */
	newl = sched_nextlwp();
	if (newl != NULL) {
		sched_dequeue(newl);
		KASSERT(lwp_locked(newl, spc->spc_mutex));
		KASSERT(newl->l_cpu == ci);
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
		spc->spc_curpriority = lwp_eprio(newl);
		spc->spc_flags &= ~(SPCF_SWITCHCLEAR | SPCF_IDLE);
		lwp_setlock(newl, spc->spc_lwplock);
	} else {
		/*
		 * The idle LWP does not get set to LSONPROC, because
		 * otherwise it screws up the output from top(1) etc.
		 */
		newl = ci->ci_data.cpu_idlelwp;
		newl->l_pflag |= LP_RUNNING;
		spc->spc_curpriority = PRI_IDLE;
		spc->spc_flags = (spc->spc_flags & ~SPCF_SWITCHCLEAR) |
		    SPCF_IDLE;
	}

	/*
	 * Only clear want_resched if there are no pending (slow) software
	 * interrupts.  We can do this without an atomic, because no new
	 * LWPs can appear in the queue due to our hold on spc_mutex, and
	 * the update to ci_want_resched will become globally visible before
	 * the release of spc_mutex becomes globally visible.
	 */
	if (ci->ci_data.cpu_softints == 0)
		ci->ci_want_resched = 0;

	return newl;
}

/*
 * The machine independent parts of context switch.
 *
 * NOTE: l->l_cpu is not changed in this routine, because an LWP never
 * changes its own l_cpu (that would screw up curcpu on many ports and could
 * cause all kinds of other evil stuff).  l_cpu is always changed by some
 * other actor, when it's known the LWP is not running (the LP_RUNNING flag
 * is checked under lock).
 */
void
mi_switch(lwp_t *l)
{
	struct cpu_info *ci;
	struct schedstate_percpu *spc;
	struct lwp *newl;
	kmutex_t *lock;
	int oldspl;
	struct bintime bt;
	bool returning;

	KASSERT(lwp_locked(l, NULL));
	KASSERT(kpreempt_disabled());
	KASSERT(mutex_owned(curcpu()->ci_schedstate.spc_mutex));
	KASSERTMSG(l->l_blcnt == 0, "kernel_lock leaked");

	kstack_check_magic(l);

	binuptime(&bt);

	KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp);
	KASSERT((l->l_pflag & LP_RUNNING) != 0);
	KASSERT(l->l_cpu == curcpu() || l->l_stat == LSRUN);
	ci = curcpu();
	spc = &ci->ci_schedstate;
	returning = false;
	newl = NULL;

	/*
	 * If we have been asked to switch to a specific LWP, then there
	 * is no need to inspect the run queues.  If a soft interrupt is
	 * blocking, then return to the interrupted thread without adjusting
	 * VM context or its start time: neither has been changed in order
	 * to take the interrupt.
	 */
	if (l->l_switchto != NULL) {
		if ((l->l_pflag & LP_INTR) != 0) {
			returning = true;
			softint_block(l);
			if ((l->l_pflag & LP_TIMEINTR) != 0)
				updatertime(l, &bt);
		}
		newl = l->l_switchto;
		l->l_switchto = NULL;
	}
#ifndef __HAVE_FAST_SOFTINTS
	else if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}
#endif	/* !__HAVE_FAST_SOFTINTS */

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	if (l->l_stat == LSONPROC && l != newl) {
		KASSERT(lwp_locked(l, spc->spc_lwplock));
		KASSERT((l->l_flag & LW_IDLE) == 0);
		l->l_stat = LSRUN;
		lwp_setlock(l, spc->spc_mutex);
		sched_enqueue(l);
		sched_preempted(l);

		/*
		 * Handle migration.  Note that "migrating LWP" may
		 * be reset here, if interrupt/preemption happens
		 * early in idle LWP.
		 */
		if (l->l_target_cpu != NULL && (l->l_pflag & LP_BOUND) == 0) {
			KASSERT((l->l_pflag & LP_INTR) == 0);
			spc->spc_migrating = l;
		}
	}

	/* Pick new LWP to run. */
	if (newl == NULL) {
		newl = nextlwp(ci, spc);
	}

	/* Items that must be updated with the CPU locked. */
	if (!returning) {
		/* Count time spent in current system call */
		SYSCALL_TIME_SLEEP(l);

		updatertime(l, &bt);

		/* Update the new LWP's start time. */
		newl->l_stime = bt;

		/*
		 * ci_curlwp changes when a fast soft interrupt occurs.
		 * We use ci_onproc to keep track of which kernel or
		 * user thread is running 'underneath' the software
		 * interrupt.  This is important for time accounting,
		 * itimers and forcing user threads to preempt (aston).
		 */
		ci->ci_onproc = newl;
	}

	/*
	 * Preemption related tasks.  Must be done holding spc_mutex.  Clear
	 * l_dopreempt without an atomic - it's only ever set non-zero by
	 * sched_resched_cpu() which also holds spc_mutex, and only ever
	 * cleared by the LWP itself (us) with atomics when not under lock.
	 */
	l->l_dopreempt = 0;
	if (__predict_false(l->l_pfailaddr != 0)) {
		LOCKSTAT_FLAG(lsflag);
		LOCKSTAT_ENTER(lsflag);
		LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
		LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
		    1, l->l_pfailtime, l->l_pfailaddr);
		LOCKSTAT_EXIT(lsflag);
		l->l_pfailtime = 0;
		l->l_pfaillock = 0;
		l->l_pfailaddr = 0;
	}

	if (l != newl) {
		struct lwp *prevlwp;

		/* Release all locks, but leave the current LWP locked */
		if (l->l_mutex == spc->spc_mutex) {
			/*
			 * Drop spc_lwplock, if the current LWP has been moved
			 * to the run queue (it is now locked by spc_mutex).
			 */
			mutex_spin_exit(spc->spc_lwplock);
		} else {
			/*
			 * Otherwise, drop the spc_mutex, we are done with the
			 * run queues.
			 */
			mutex_spin_exit(spc->spc_mutex);
		}

		/* We're down to only one lock, so do debug checks. */
		LOCKDEBUG_BARRIER(l->l_mutex, 1);

		/* Count the context switch. */
		CPU_COUNT(CPU_COUNT_NSWTCH, 1);
		l->l_ncsw++;
		if ((l->l_pflag & LP_PREEMPTING) != 0) {
			l->l_nivcsw++;
			l->l_pflag &= ~LP_PREEMPTING;
		}

		/*
		 * Increase the count of spin-mutexes before the release
		 * of the last lock - we must remain at IPL_SCHED after
		 * releasing the lock.
		 */
		KASSERTMSG(ci->ci_mtx_count == -1,
		    "%s: cpu%u: ci_mtx_count (%d) != -1 "
		    "(block with spin-mutex held)",
		    __func__, cpu_index(ci), ci->ci_mtx_count);
		oldspl = MUTEX_SPIN_OLDSPL(ci);
		ci->ci_mtx_count = -2;

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL) {
			l->l_lwpctl->lc_curcpu = (l->l_stat == LSZOMB ?
			    LWPCTL_CPU_EXITED : LWPCTL_CPU_NONE);
		}

		/*
		 * If curlwp is a soft interrupt LWP, there's nobody on the
		 * other side to unlock - we're returning into an assembly
		 * trampoline.  Unlock now.  This is safe because this is a
		 * kernel LWP and is bound to current CPU: the worst anyone
		 * else will do to it, is to put it back onto this CPU's run
		 * queue (and the CPU is busy here right now!).
		 */
		if (returning) {
			/* Keep IPL_SCHED after this; MD code will fix up. */
			l->l_pflag &= ~LP_RUNNING;
			lwp_unlock(l);
		} else {
			/* A normal LWP: save old VM context. */
			pmap_deactivate(l);
		}

		/*
		 * If DTrace has set the active vtime enum to anything
		 * other than INACTIVE (0), then it should have set the
		 * function to call.
		 */
		if (__predict_false(dtrace_vtime_active)) {
			(*dtrace_vtime_switch_func)(newl);
		}

		/*
		 * We must ensure not to come here from inside a read section.
		 */
		KASSERT(pserialize_not_in_read_section());

		/* Switch to the new LWP. */
#ifdef MULTIPROCESSOR
		KASSERT(curlwp == ci->ci_curlwp);
#endif
		KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp);
		prevlwp = cpu_switchto(l, newl, returning);
		ci = curcpu();
#ifdef MULTIPROCESSOR
		KASSERT(curlwp == ci->ci_curlwp);
#endif
		KASSERTMSG(l == curlwp, "l %p curlwp %p prevlwp %p",
		    l, curlwp, prevlwp);
		KASSERT(prevlwp != NULL);
		KASSERT(l->l_cpu == ci);
		KASSERT(ci->ci_mtx_count == -2);

		/*
		 * Immediately mark the previous LWP as no longer running
		 * and unlock (to keep lock wait times as short as possible).
		 * We'll still be at IPL_SCHED afterwards.  If a zombie,
		 * don't touch after clearing LP_RUNNING as it could be
		 * reaped by another CPU.  Issue a memory barrier to ensure
		 * this.
		 *
		 * atomic_store_release matches atomic_load_acquire in
		 * lwp_free.
		 */
		KASSERT((prevlwp->l_pflag & LP_RUNNING) != 0);
		lock = prevlwp->l_mutex;
		if (__predict_false(prevlwp->l_stat == LSZOMB)) {
			atomic_store_release(&prevlwp->l_pflag,
			    prevlwp->l_pflag & ~LP_RUNNING);
		} else {
			prevlwp->l_pflag &= ~LP_RUNNING;
		}
		mutex_spin_exit(lock);

		/*
		 * Switched away - we have new curlwp.
		 * Restore VM context and IPL.
		 */
		pmap_activate(l);
		pcu_switchpoint(l);

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL) {
			l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
			l->l_lwpctl->lc_pctr++;
		}

		/*
		 * Normalize the spin mutex count and restore the previous
		 * SPL.  Note that, unless the caller disabled preemption,
		 * we can be preempted at any time after this splx().
		 */
		KASSERT(l->l_cpu == ci);
		KASSERT(ci->ci_mtx_count == -1);
		ci->ci_mtx_count = 0;
		splx(oldspl);
	} else {
		/* Nothing to do - just unlock and return. */
		mutex_spin_exit(spc->spc_mutex);
		l->l_pflag &= ~LP_PREEMPTING;
		lwp_unlock(l);
	}

	KASSERT(l == curlwp);
	KASSERT(l->l_stat == LSONPROC || (l->l_flag & LW_IDLE) != 0);

	SYSCALL_TIME_WAKEUP(l);
	LOCKDEBUG_BARRIER(NULL, 1);
}

/*
 * setrunnable: change LWP state to be runnable, placing it on the run queue.
 *
 * Call with the process and LWP locked.  Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	struct cpu_info *ci;
	kmutex_t *oldlock;

	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT((l->l_flag & LW_DBGSUSPEND) == 0);
	KASSERT(mutex_owned(p->p_lock));
	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xsig != 0)
			signotify(l);
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
		l->l_flag &= ~LW_WSUSPEND;
		p->p_nrlwps++;
		cv_broadcast(&p->p_lwpcv);
		break;
	case LSSLEEP:
		KASSERT(l->l_wchan != NULL);
		break;
	case LSIDL:
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

	/*
	 * If the LWP was sleeping, start it again.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l, true);
		return;
	}

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_pflag & LP_RUNNING) != 0) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Look for a CPU to run.
	 * Set the LWP runnable.
	 */
	ci = sched_takecpu(l);
	l->l_cpu = ci;
	spc_lock(ci);
	oldlock = lwp_setlock(l, l->l_cpu->ci_schedstate.spc_mutex);
	sched_setrunnable(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;
	sched_enqueue(l);
	sched_resched_lwp(l, true);
	/* SPC & LWP now unlocked. */
	mutex_spin_exit(oldlock);
}

/*
 * suspendsched:
 *
 *	Convert all non-LW_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct lwp *l;
	struct proc *p;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(&proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		mutex_enter(p->p_lock);
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			continue;
		}

		if (p->p_stat != SSTOP) {
			if (p->p_stat != SZOMB && p->p_stat != SDEAD) {
				p->p_pptr->p_nstopchild++;
				p->p_waited = 0;
			}
			p->p_stat = SSTOP;
		}

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l == curlwp)
				continue;

			lwp_lock(l);

			/*
			 * Set L_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode.  We want to
			 * try and get as many LWPs as possible to
			 * the user / kernel boundary, so that they will
			 * release any locks that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */
				setrunnable(l);
				continue;
			}

			lwp_unlock(l);
		}

		mutex_exit(p->p_lock);
	}
	mutex_exit(&proc_lock);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
	 *
	 * Unusually, we don't hold any other scheduler object locked, which
	 * would keep preemption off for sched_resched_cpu(), so disable it
	 * explicitly.
	 */
	kpreempt_disable();
	for (CPU_INFO_FOREACH(cii, ci)) {
		spc_lock(ci);
		sched_resched_cpu(ci, PRI_KERNEL, true);
		/* spc now unlocked */
	}
	kpreempt_enable();
}

/*
 * sched_unsleep:
 *
 *	This is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, if the sleep timed out.  Because of this,
 *	it's not a valid action for running or idle LWPs.
 */
static void
sched_unsleep(struct lwp *l, bool cleanup)
{

	lwp_unlock(l);
	panic("sched_unsleep");
}

static void
sched_changepri(struct lwp *l, pri_t pri)
{
	struct schedstate_percpu *spc;
	struct cpu_info *ci;

	KASSERT(lwp_locked(l, NULL));

	ci = l->l_cpu;
	spc = &ci->ci_schedstate;

	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, spc->spc_mutex));
		sched_dequeue(l);
		l->l_priority = pri;
		sched_enqueue(l);
		sched_resched_lwp(l, false);
	} else if (l->l_stat == LSONPROC && l->l_class != SCHED_OTHER) {
		/* On priority drop, only evict realtime LWPs. */
		KASSERT(lwp_locked(l, spc->spc_lwplock));
		l->l_priority = pri;
		spc_lock(ci);
		sched_resched_cpu(ci, spc->spc_maxpriority, true);
		/* spc now unlocked */
	} else {
		l->l_priority = pri;
	}
}

static void
sched_lendpri(struct lwp *l, pri_t pri)
{
	struct schedstate_percpu *spc;
	struct cpu_info *ci;

	KASSERT(lwp_locked(l, NULL));

	ci = l->l_cpu;
	spc = &ci->ci_schedstate;

	if (l->l_stat == LSRUN) {
		KASSERT(lwp_locked(l, spc->spc_mutex));
		sched_dequeue(l);
		l->l_inheritedprio = pri;
		l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
		sched_enqueue(l);
		sched_resched_lwp(l, false);
	} else if (l->l_stat == LSONPROC && l->l_class != SCHED_OTHER) {
		/* On priority drop, only evict realtime LWPs. */
		KASSERT(lwp_locked(l, spc->spc_lwplock));
		l->l_inheritedprio = pri;
		l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
		spc_lock(ci);
		sched_resched_cpu(ci, spc->spc_maxpriority, true);
		/* spc now unlocked */
	} else {
		l->l_inheritedprio = pri;
		l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio);
	}
}

struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}

/* Decay 95% of proc::p_pctcpu in 60 seconds, ccpu = exp(-1/20) */
const fixpt_t ccpu = 0.95122942450071400909 * FSCALE;

/*
 * Constants for averages over 1, 5 and 15 minutes when sampling at
 * 5 second intervals.
 */
static const fixpt_t cexp[ ] = {
	0.9200444146293232 * FSCALE,	/* exp(-1/12) */
	0.9834714538216174 * FSCALE,	/* exp(-1/60) */
	0.9944598480048967 * FSCALE,	/* exp(-1/180) */
};
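
/*
 * Worked example of the constants above (illustrative only): sched_pstats()
 * runs once per second, so after 60 seconds p_pctcpu has been scaled by
 * ccpu^60 = exp(-60/20) = exp(-3) ~= 0.05, i.e. 95% of the old value has
 * decayed away.  The load averages are sampled every 5 seconds, so the
 * 1 minute average decays by exp(-1/12) per sample and by
 * exp(-12/12) = exp(-1) ~= 0.37 over a full minute.
 */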

/*
 * sched_pstats:
 *
 * => Update process statistics and check CPU resource allocation.
 * => Call scheduler-specific hook to eventually adjust LWP priorities.
 * => Compute the load averages over 1, 5 and 15 minute intervals.
 */
void
sched_pstats(void)
{
	struct loadavg *avg = &averunnable;
	const int clkhz = (stathz != 0 ? stathz : hz);
	static bool backwardslwp = false;
	static bool backwardsproc = false;
	static u_int lavg_count = 0;
	struct proc *p;
	int nrun;

	sched_pstats_ticks++;
	if (++lavg_count >= 5) {
		lavg_count = 0;
		nrun = 0;
	}
	mutex_enter(&proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		struct lwp *l;
		struct rlimit *rlim;
		time_t runtm;
		int sig;

		/* Increment sleep time (if sleeping), ignore overflow. */
		mutex_enter(p->p_lock);
		runtm = p->p_rtime.sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			fixpt_t lpctcpu;
			u_int lcpticks;

			if (__predict_false((l->l_flag & LW_IDLE) != 0))
				continue;
			lwp_lock(l);
			if (__predict_false(l->l_rtime.sec < 0) &&
			    !backwardslwp) {
				backwardslwp = true;
				printf("WARNING: lwp %ld (%s%s%s): "
				    "negative runtime: "
				    "(%jd + 0x%016"PRIx64"/2^64) sec\n",
				    (long)l->l_lid,
				    l->l_proc->p_comm,
				    l->l_name ? " " : "",
				    l->l_name ? l->l_name : "",
				    (intmax_t)l->l_rtime.sec,
				    l->l_rtime.frac);
			}
			runtm += l->l_rtime.sec;
			l->l_swtime++;
			sched_lwp_stats(l);

			/* For load average calculation. */
			if (__predict_false(lavg_count == 0) &&
			    (l->l_flag & (LW_SINTR | LW_SYSTEM)) == 0) {
				switch (l->l_stat) {
				case LSSLEEP:
					if (l->l_slptime > 1) {
						break;
					}
					/* FALLTHROUGH */
				case LSRUN:
				case LSONPROC:
				case LSIDL:
					nrun++;
				}
			}
			lwp_unlock(l);

			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
			if (l->l_slptime != 0)
				continue;

			lpctcpu = l->l_pctcpu;
			lcpticks = atomic_swap_uint(&l->l_cpticks, 0);
			lpctcpu += ((FSCALE - ccpu) *
			    (lcpticks * FSCALE / clkhz)) >> FSHIFT;
			l->l_pctcpu = lpctcpu;
		}
		/* Calculate p_pctcpu only for ps(1). */
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;

		if (__predict_false(runtm < 0)) {
			if (!backwardsproc) {
				backwardsproc = true;
				printf("WARNING: pid %ld (%s): "
				    "negative runtime; "
				    "monotonic clock has gone backwards\n",
				    (long)p->p_pid, p->p_comm);
			}
			mutex_exit(p->p_lock);
			continue;
		}

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over the hard limit, kill it with SIGKILL.
		 * If over the soft limit, send SIGXCPU and raise
		 * the soft limit a little.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		sig = 0;
		if (__predict_false(runtm >= rlim->rlim_cur)) {
			if (runtm >= rlim->rlim_max) {
				sig = SIGKILL;
				log(LOG_NOTICE,
				    "pid %d, command %s, is killed: %s\n",
				    p->p_pid, p->p_comm, "exceeded RLIMIT_CPU");
				uprintf("pid %d, command %s, is killed: %s\n",
				    p->p_pid, p->p_comm, "exceeded RLIMIT_CPU");
			} else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}
		mutex_exit(p->p_lock);
		if (__predict_false(sig)) {
			KASSERT((p->p_flag & PK_SYSTEM) == 0);
			psignal(p, sig);
		}
	}

	/* Load average calculation. */
	if (__predict_false(lavg_count == 0)) {
		int i;
		CTASSERT(__arraycount(cexp) == __arraycount(avg->ldavg));
		for (i = 0; i < __arraycount(cexp); i++) {
			avg->ldavg[i] = (cexp[i] * avg->ldavg[i] +
			    nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT;
		}
	}

	/* Lightning bolt. */
	cv_broadcast(&lbolt);

	mutex_exit(&proc_lock);
}