1 /* $NetBSD: kern_synch.c,v 1.357 2023/07/13 13:33:55 riastradh Exp $ */ 2 3 /*- 4 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009, 2019, 2020 5 * The NetBSD Foundation, Inc. 6 * All rights reserved. 7 * 8 * This code is derived from software contributed to The NetBSD Foundation 9 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 10 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and 11 * Daniel Sieger. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 22 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 24 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 26 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 27 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 28 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 29 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 30 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 31 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 32 * POSSIBILITY OF SUCH DAMAGE. 33 */ 34 35 /*- 36 * Copyright (c) 1982, 1986, 1990, 1991, 1993 37 * The Regents of the University of California. All rights reserved. 38 * (c) UNIX System Laboratories, Inc. 39 * All or some portions of this file are derived from material licensed 40 * to the University of California by American Telephone and Telegraph 41 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 42 * the permission of UNIX System Laboratories, Inc. 43 * 44 * Redistribution and use in source and binary forms, with or without 45 * modification, are permitted provided that the following conditions 46 * are met: 47 * 1. Redistributions of source code must retain the above copyright 48 * notice, this list of conditions and the following disclaimer. 49 * 2. Redistributions in binary form must reproduce the above copyright 50 * notice, this list of conditions and the following disclaimer in the 51 * documentation and/or other materials provided with the distribution. 52 * 3. Neither the name of the University nor the names of its contributors 53 * may be used to endorse or promote products derived from this software 54 * without specific prior written permission. 55 * 56 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 57 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 58 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 59 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 60 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 61 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 62 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 63 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 64 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 65 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 66 * SUCH DAMAGE. 67 * 68 * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 69 */ 70 71 #include <sys/cdefs.h> 72 __KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.357 2023/07/13 13:33:55 riastradh Exp $"); 73 74 #include "opt_kstack.h" 75 #include "opt_ddb.h" 76 #include "opt_dtrace.h" 77 78 #define __MUTEX_PRIVATE 79 80 #include <sys/param.h> 81 #include <sys/systm.h> 82 #include <sys/proc.h> 83 #include <sys/kernel.h> 84 #include <sys/cpu.h> 85 #include <sys/pserialize.h> 86 #include <sys/resource.h> 87 #include <sys/resourcevar.h> 88 #include <sys/rwlock.h> 89 #include <sys/sched.h> 90 #include <sys/syscall_stats.h> 91 #include <sys/sleepq.h> 92 #include <sys/lockdebug.h> 93 #include <sys/evcnt.h> 94 #include <sys/intr.h> 95 #include <sys/lwpctl.h> 96 #include <sys/atomic.h> 97 #include <sys/syslog.h> 98 99 #include <uvm/uvm_extern.h> 100 101 #include <dev/lockstat.h> 102 103 #include <sys/dtrace_bsd.h> 104 int dtrace_vtime_active=0; 105 dtrace_vtime_switch_func_t dtrace_vtime_switch_func; 106 107 #ifdef DDB 108 #include <ddb/ddb.h> 109 #endif 110 111 static void sched_unsleep(struct lwp *, bool); 112 static void sched_changepri(struct lwp *, pri_t); 113 static void sched_lendpri(struct lwp *, pri_t); 114 115 syncobj_t sleep_syncobj = { 116 .sobj_flag = SOBJ_SLEEPQ_SORTED, 117 .sobj_unsleep = sleepq_unsleep, 118 .sobj_changepri = sleepq_changepri, 119 .sobj_lendpri = sleepq_lendpri, 120 .sobj_owner = syncobj_noowner, 121 }; 122 123 syncobj_t sched_syncobj = { 124 .sobj_flag = SOBJ_SLEEPQ_SORTED, 125 .sobj_unsleep = sched_unsleep, 126 .sobj_changepri = sched_changepri, 127 .sobj_lendpri = sched_lendpri, 128 .sobj_owner = syncobj_noowner, 129 }; 130 131 syncobj_t kpause_syncobj = { 132 .sobj_flag = SOBJ_SLEEPQ_NULL, 133 .sobj_unsleep = sleepq_unsleep, 134 .sobj_changepri = sleepq_changepri, 135 .sobj_lendpri = sleepq_lendpri, 136 .sobj_owner = syncobj_noowner, 137 }; 138 139 /* "Lightning bolt": once a second sleep address. */ 140 kcondvar_t lbolt __cacheline_aligned; 141 142 u_int sched_pstats_ticks __cacheline_aligned; 143 144 /* Preemption event counters. */ 145 static struct evcnt kpreempt_ev_crit __cacheline_aligned; 146 static struct evcnt kpreempt_ev_klock __cacheline_aligned; 147 static struct evcnt kpreempt_ev_immed __cacheline_aligned; 148 149 void 150 synch_init(void) 151 { 152 153 cv_init(&lbolt, "lbolt"); 154 155 evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL, 156 "kpreempt", "defer: critical section"); 157 evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL, 158 "kpreempt", "defer: kernel_lock"); 159 evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL, 160 "kpreempt", "immediate"); 161 } 162 163 /* 164 * OBSOLETE INTERFACE 165 * 166 * General sleep call. Suspends the current LWP until a wakeup is 167 * performed on the specified identifier. The LWP will then be made 168 * runnable with the specified priority. Sleeps at most timo/hz seconds (0 169 * means no timeout). If pri includes PCATCH flag, signals are checked 170 * before and after sleeping, else signals are not checked. Returns 0 if 171 * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a 172 * signal needs to be delivered, ERESTART is returned if the current system 173 * call should be restarted if possible, and EINTR is returned if the system 174 * call should be interrupted by the signal (return EINTR). 175 */ 176 int 177 tsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo) 178 { 179 struct lwp *l = curlwp; 180 sleepq_t *sq; 181 kmutex_t *mp; 182 bool catch_p; 183 184 KASSERT((l->l_pflag & LP_INTR) == 0); 185 KASSERT(ident != &lbolt); 186 //KASSERT(KERNEL_LOCKED_P()); 187 188 if (sleepq_dontsleep(l)) { 189 (void)sleepq_abort(NULL, 0); 190 return 0; 191 } 192 193 l->l_kpriority = true; 194 catch_p = priority & PCATCH; 195 sq = sleeptab_lookup(&sleeptab, ident, &mp); 196 sleepq_enter(sq, l, mp); 197 sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj, catch_p); 198 return sleepq_block(timo, catch_p, &sleep_syncobj); 199 } 200 201 int 202 mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo, 203 kmutex_t *mtx) 204 { 205 struct lwp *l = curlwp; 206 sleepq_t *sq; 207 kmutex_t *mp; 208 bool catch_p; 209 int error; 210 211 KASSERT((l->l_pflag & LP_INTR) == 0); 212 KASSERT(ident != &lbolt); 213 214 if (sleepq_dontsleep(l)) { 215 (void)sleepq_abort(mtx, (priority & PNORELOCK) != 0); 216 return 0; 217 } 218 219 l->l_kpriority = true; 220 catch_p = priority & PCATCH; 221 sq = sleeptab_lookup(&sleeptab, ident, &mp); 222 sleepq_enter(sq, l, mp); 223 sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj, catch_p); 224 mutex_exit(mtx); 225 error = sleepq_block(timo, catch_p, &sleep_syncobj); 226 227 if ((priority & PNORELOCK) == 0) 228 mutex_enter(mtx); 229 230 return error; 231 } 232 233 /* 234 * General sleep call for situations where a wake-up is not expected. 235 */ 236 int 237 kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx) 238 { 239 struct lwp *l = curlwp; 240 int error; 241 242 KASSERT(timo != 0 || intr); 243 244 if (sleepq_dontsleep(l)) 245 return sleepq_abort(NULL, 0); 246 247 if (mtx != NULL) 248 mutex_exit(mtx); 249 l->l_kpriority = true; 250 lwp_lock(l); 251 KERNEL_UNLOCK_ALL(NULL, &l->l_biglocks); 252 sleepq_enqueue(NULL, l, wmesg, &kpause_syncobj, intr); 253 error = sleepq_block(timo, intr, &kpause_syncobj); 254 if (mtx != NULL) 255 mutex_enter(mtx); 256 257 return error; 258 } 259 260 /* 261 * OBSOLETE INTERFACE 262 * 263 * Make all LWPs sleeping on the specified identifier runnable. 264 */ 265 void 266 wakeup(wchan_t ident) 267 { 268 sleepq_t *sq; 269 kmutex_t *mp; 270 271 if (__predict_false(cold)) 272 return; 273 274 sq = sleeptab_lookup(&sleeptab, ident, &mp); 275 sleepq_wake(sq, ident, (u_int)-1, mp); 276 } 277 278 /* 279 * General yield call. Puts the current LWP back on its run queue and 280 * performs a context switch. 281 */ 282 void 283 yield(void) 284 { 285 struct lwp *l = curlwp; 286 287 KERNEL_UNLOCK_ALL(l, &l->l_biglocks); 288 lwp_lock(l); 289 290 KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock)); 291 KASSERT(l->l_stat == LSONPROC); 292 293 /* Voluntary - ditch kpriority boost. */ 294 l->l_kpriority = false; 295 spc_lock(l->l_cpu); 296 mi_switch(l); 297 KERNEL_LOCK(l->l_biglocks, l); 298 } 299 300 /* 301 * General preemption call. Puts the current LWP back on its run queue 302 * and performs an involuntary context switch. Different from yield() 303 * in that: 304 * 305 * - It's counted differently (involuntary vs. voluntary). 306 * - Realtime threads go to the head of their runqueue vs. tail for yield(). 307 * - Priority boost is retained unless LWP has exceeded timeslice. 308 */ 309 void 310 preempt(void) 311 { 312 struct lwp *l = curlwp; 313 314 KERNEL_UNLOCK_ALL(l, &l->l_biglocks); 315 lwp_lock(l); 316 317 KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock)); 318 KASSERT(l->l_stat == LSONPROC); 319 320 spc_lock(l->l_cpu); 321 /* Involuntary - keep kpriority boost unless a CPU hog. */ 322 if ((l->l_cpu->ci_schedstate.spc_flags & SPCF_SHOULDYIELD) != 0) { 323 l->l_kpriority = false; 324 } 325 l->l_pflag |= LP_PREEMPTING; 326 mi_switch(l); 327 KERNEL_LOCK(l->l_biglocks, l); 328 } 329 330 /* 331 * Return true if the current LWP should yield the processor. Intended to 332 * be used by long-running code in kernel. 333 */ 334 inline bool 335 preempt_needed(void) 336 { 337 lwp_t *l = curlwp; 338 int needed; 339 340 KPREEMPT_DISABLE(l); 341 needed = l->l_cpu->ci_want_resched; 342 KPREEMPT_ENABLE(l); 343 344 return (needed != 0); 345 } 346 347 /* 348 * A breathing point for long running code in kernel. 349 */ 350 void 351 preempt_point(void) 352 { 353 354 if (__predict_false(preempt_needed())) { 355 preempt(); 356 } 357 } 358 359 /* 360 * Handle a request made by another agent to preempt the current LWP 361 * in-kernel. Usually called when l_dopreempt may be non-zero. 362 * 363 * Character addresses for lockstat only. 364 */ 365 static char kpreempt_is_disabled; 366 static char kernel_lock_held; 367 static char is_softint_lwp; 368 static char spl_is_raised; 369 370 bool 371 kpreempt(uintptr_t where) 372 { 373 uintptr_t failed; 374 lwp_t *l; 375 int s, dop, lsflag; 376 377 l = curlwp; 378 failed = 0; 379 while ((dop = l->l_dopreempt) != 0) { 380 if (l->l_stat != LSONPROC) { 381 /* 382 * About to block (or die), let it happen. 383 * Doesn't really count as "preemption has 384 * been blocked", since we're going to 385 * context switch. 386 */ 387 atomic_swap_uint(&l->l_dopreempt, 0); 388 return true; 389 } 390 KASSERT((l->l_flag & LW_IDLE) == 0); 391 if (__predict_false(l->l_nopreempt != 0)) { 392 /* LWP holds preemption disabled, explicitly. */ 393 if ((dop & DOPREEMPT_COUNTED) == 0) { 394 kpreempt_ev_crit.ev_count++; 395 } 396 failed = (uintptr_t)&kpreempt_is_disabled; 397 break; 398 } 399 if (__predict_false((l->l_pflag & LP_INTR) != 0)) { 400 /* Can't preempt soft interrupts yet. */ 401 atomic_swap_uint(&l->l_dopreempt, 0); 402 failed = (uintptr_t)&is_softint_lwp; 403 break; 404 } 405 s = splsched(); 406 if (__predict_false(l->l_blcnt != 0 || 407 curcpu()->ci_biglock_wanted != NULL)) { 408 /* Hold or want kernel_lock, code is not MT safe. */ 409 splx(s); 410 if ((dop & DOPREEMPT_COUNTED) == 0) { 411 kpreempt_ev_klock.ev_count++; 412 } 413 failed = (uintptr_t)&kernel_lock_held; 414 break; 415 } 416 if (__predict_false(!cpu_kpreempt_enter(where, s))) { 417 /* 418 * It may be that the IPL is too high. 419 * kpreempt_enter() can schedule an 420 * interrupt to retry later. 421 */ 422 splx(s); 423 failed = (uintptr_t)&spl_is_raised; 424 break; 425 } 426 /* Do it! */ 427 if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) { 428 kpreempt_ev_immed.ev_count++; 429 } 430 lwp_lock(l); 431 /* Involuntary - keep kpriority boost. */ 432 l->l_pflag |= LP_PREEMPTING; 433 spc_lock(l->l_cpu); 434 mi_switch(l); 435 l->l_nopreempt++; 436 splx(s); 437 438 /* Take care of any MD cleanup. */ 439 cpu_kpreempt_exit(where); 440 l->l_nopreempt--; 441 } 442 443 if (__predict_true(!failed)) { 444 return false; 445 } 446 447 /* Record preemption failure for reporting via lockstat. */ 448 atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED); 449 lsflag = 0; 450 LOCKSTAT_ENTER(lsflag); 451 if (__predict_false(lsflag)) { 452 if (where == 0) { 453 where = (uintptr_t)__builtin_return_address(0); 454 } 455 /* Preemption is on, might recurse, so make it atomic. */ 456 if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr, NULL, 457 (void *)where) == NULL) { 458 LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime); 459 l->l_pfaillock = failed; 460 } 461 } 462 LOCKSTAT_EXIT(lsflag); 463 return true; 464 } 465 466 /* 467 * Return true if preemption is explicitly disabled. 468 */ 469 bool 470 kpreempt_disabled(void) 471 { 472 const lwp_t *l = curlwp; 473 474 return l->l_nopreempt != 0 || l->l_stat == LSZOMB || 475 (l->l_flag & LW_IDLE) != 0 || (l->l_pflag & LP_INTR) != 0 || 476 cpu_kpreempt_disabled(); 477 } 478 479 /* 480 * Disable kernel preemption. 481 */ 482 void 483 kpreempt_disable(void) 484 { 485 486 KPREEMPT_DISABLE(curlwp); 487 } 488 489 /* 490 * Reenable kernel preemption. 491 */ 492 void 493 kpreempt_enable(void) 494 { 495 496 KPREEMPT_ENABLE(curlwp); 497 } 498 499 /* 500 * Compute the amount of time during which the current lwp was running. 501 * 502 * - update l_rtime unless it's an idle lwp. 503 */ 504 505 void 506 updatertime(lwp_t *l, const struct bintime *now) 507 { 508 static bool backwards = false; 509 510 if (__predict_false(l->l_flag & LW_IDLE)) 511 return; 512 513 if (__predict_false(bintimecmp(now, &l->l_stime, <)) && !backwards) { 514 char caller[128]; 515 516 #ifdef DDB 517 db_symstr(caller, sizeof(caller), 518 (db_expr_t)(intptr_t)__builtin_return_address(0), 519 DB_STGY_PROC); 520 #else 521 snprintf(caller, sizeof(caller), "%p", 522 __builtin_return_address(0)); 523 #endif 524 backwards = true; 525 printf("WARNING: lwp %ld (%s%s%s) flags 0x%x:" 526 " timecounter went backwards" 527 " from (%jd + 0x%016"PRIx64"/2^64) sec" 528 " to (%jd + 0x%016"PRIx64"/2^64) sec" 529 " in %s\n", 530 (long)l->l_lid, 531 l->l_proc->p_comm, 532 l->l_name ? " " : "", 533 l->l_name ? l->l_name : "", 534 l->l_pflag, 535 (intmax_t)l->l_stime.sec, l->l_stime.frac, 536 (intmax_t)now->sec, now->frac, 537 caller); 538 } 539 540 /* rtime += now - stime */ 541 bintime_add(&l->l_rtime, now); 542 bintime_sub(&l->l_rtime, &l->l_stime); 543 } 544 545 /* 546 * Select next LWP from the current CPU to run.. 547 */ 548 static inline lwp_t * 549 nextlwp(struct cpu_info *ci, struct schedstate_percpu *spc) 550 { 551 lwp_t *newl; 552 553 /* 554 * Let sched_nextlwp() select the LWP to run the CPU next. 555 * If no LWP is runnable, select the idle LWP. 556 * 557 * On arrival here LWPs on a run queue are locked by spc_mutex which 558 * is currently held. Idle LWPs are always locked by spc_lwplock, 559 * which may or may not be held here. On exit from this code block, 560 * in all cases newl is locked by spc_lwplock. 561 */ 562 newl = sched_nextlwp(); 563 if (newl != NULL) { 564 sched_dequeue(newl); 565 KASSERT(lwp_locked(newl, spc->spc_mutex)); 566 KASSERT(newl->l_cpu == ci); 567 newl->l_stat = LSONPROC; 568 newl->l_pflag |= LP_RUNNING; 569 spc->spc_curpriority = lwp_eprio(newl); 570 spc->spc_flags &= ~(SPCF_SWITCHCLEAR | SPCF_IDLE); 571 lwp_setlock(newl, spc->spc_lwplock); 572 } else { 573 /* 574 * The idle LWP does not get set to LSONPROC, because 575 * otherwise it screws up the output from top(1) etc. 576 */ 577 newl = ci->ci_data.cpu_idlelwp; 578 newl->l_pflag |= LP_RUNNING; 579 spc->spc_curpriority = PRI_IDLE; 580 spc->spc_flags = (spc->spc_flags & ~SPCF_SWITCHCLEAR) | 581 SPCF_IDLE; 582 } 583 584 /* 585 * Only clear want_resched if there are no pending (slow) software 586 * interrupts. We can do this without an atomic, because no new 587 * LWPs can appear in the queue due to our hold on spc_mutex, and 588 * the update to ci_want_resched will become globally visible before 589 * the release of spc_mutex becomes globally visible. 590 */ 591 if (ci->ci_data.cpu_softints == 0) 592 ci->ci_want_resched = 0; 593 594 return newl; 595 } 596 597 /* 598 * The machine independent parts of context switch. 599 * 600 * NOTE: l->l_cpu is not changed in this routine, because an LWP never 601 * changes its own l_cpu (that would screw up curcpu on many ports and could 602 * cause all kinds of other evil stuff). l_cpu is always changed by some 603 * other actor, when it's known the LWP is not running (the LP_RUNNING flag 604 * is checked under lock). 605 */ 606 void 607 mi_switch(lwp_t *l) 608 { 609 struct cpu_info *ci; 610 struct schedstate_percpu *spc; 611 struct lwp *newl; 612 kmutex_t *lock; 613 int oldspl; 614 struct bintime bt; 615 bool returning; 616 617 KASSERT(lwp_locked(l, NULL)); 618 KASSERT(kpreempt_disabled()); 619 KASSERT(mutex_owned(curcpu()->ci_schedstate.spc_mutex)); 620 KASSERTMSG(l->l_blcnt == 0, "kernel_lock leaked"); 621 622 kstack_check_magic(l); 623 624 binuptime(&bt); 625 626 KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp); 627 KASSERT((l->l_pflag & LP_RUNNING) != 0); 628 KASSERT(l->l_cpu == curcpu() || l->l_stat == LSRUN); 629 ci = curcpu(); 630 spc = &ci->ci_schedstate; 631 returning = false; 632 newl = NULL; 633 634 /* 635 * If we have been asked to switch to a specific LWP, then there 636 * is no need to inspect the run queues. If a soft interrupt is 637 * blocking, then return to the interrupted thread without adjusting 638 * VM context or its start time: neither have been changed in order 639 * to take the interrupt. 640 */ 641 if (l->l_switchto != NULL) { 642 if ((l->l_pflag & LP_INTR) != 0) { 643 returning = true; 644 softint_block(l); 645 if ((l->l_pflag & LP_TIMEINTR) != 0) 646 updatertime(l, &bt); 647 } 648 newl = l->l_switchto; 649 l->l_switchto = NULL; 650 } 651 #ifndef __HAVE_FAST_SOFTINTS 652 else if (ci->ci_data.cpu_softints != 0) { 653 /* There are pending soft interrupts, so pick one. */ 654 newl = softint_picklwp(); 655 newl->l_stat = LSONPROC; 656 newl->l_pflag |= LP_RUNNING; 657 } 658 #endif /* !__HAVE_FAST_SOFTINTS */ 659 660 /* 661 * If on the CPU and we have gotten this far, then we must yield. 662 */ 663 if (l->l_stat == LSONPROC && l != newl) { 664 KASSERT(lwp_locked(l, spc->spc_lwplock)); 665 KASSERT((l->l_flag & LW_IDLE) == 0); 666 l->l_stat = LSRUN; 667 lwp_setlock(l, spc->spc_mutex); 668 sched_enqueue(l); 669 sched_preempted(l); 670 671 /* 672 * Handle migration. Note that "migrating LWP" may 673 * be reset here, if interrupt/preemption happens 674 * early in idle LWP. 675 */ 676 if (l->l_target_cpu != NULL && (l->l_pflag & LP_BOUND) == 0) { 677 KASSERT((l->l_pflag & LP_INTR) == 0); 678 spc->spc_migrating = l; 679 } 680 } 681 682 /* Pick new LWP to run. */ 683 if (newl == NULL) { 684 newl = nextlwp(ci, spc); 685 } 686 687 /* Items that must be updated with the CPU locked. */ 688 if (!returning) { 689 /* Count time spent in current system call */ 690 SYSCALL_TIME_SLEEP(l); 691 692 updatertime(l, &bt); 693 694 /* Update the new LWP's start time. */ 695 newl->l_stime = bt; 696 697 /* 698 * ci_curlwp changes when a fast soft interrupt occurs. 699 * We use ci_onproc to keep track of which kernel or 700 * user thread is running 'underneath' the software 701 * interrupt. This is important for time accounting, 702 * itimers and forcing user threads to preempt (aston). 703 */ 704 ci->ci_onproc = newl; 705 } 706 707 /* 708 * Preemption related tasks. Must be done holding spc_mutex. Clear 709 * l_dopreempt without an atomic - it's only ever set non-zero by 710 * sched_resched_cpu() which also holds spc_mutex, and only ever 711 * cleared by the LWP itself (us) with atomics when not under lock. 712 */ 713 l->l_dopreempt = 0; 714 if (__predict_false(l->l_pfailaddr != 0)) { 715 LOCKSTAT_FLAG(lsflag); 716 LOCKSTAT_ENTER(lsflag); 717 LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime); 718 LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN, 719 1, l->l_pfailtime, l->l_pfailaddr); 720 LOCKSTAT_EXIT(lsflag); 721 l->l_pfailtime = 0; 722 l->l_pfaillock = 0; 723 l->l_pfailaddr = 0; 724 } 725 726 if (l != newl) { 727 struct lwp *prevlwp; 728 729 /* Release all locks, but leave the current LWP locked */ 730 if (l->l_mutex == spc->spc_mutex) { 731 /* 732 * Drop spc_lwplock, if the current LWP has been moved 733 * to the run queue (it is now locked by spc_mutex). 734 */ 735 mutex_spin_exit(spc->spc_lwplock); 736 } else { 737 /* 738 * Otherwise, drop the spc_mutex, we are done with the 739 * run queues. 740 */ 741 mutex_spin_exit(spc->spc_mutex); 742 } 743 744 /* We're down to only one lock, so do debug checks. */ 745 LOCKDEBUG_BARRIER(l->l_mutex, 1); 746 747 /* Count the context switch. */ 748 CPU_COUNT(CPU_COUNT_NSWTCH, 1); 749 l->l_ncsw++; 750 if ((l->l_pflag & LP_PREEMPTING) != 0) { 751 l->l_nivcsw++; 752 l->l_pflag &= ~LP_PREEMPTING; 753 } 754 755 /* 756 * Increase the count of spin-mutexes before the release 757 * of the last lock - we must remain at IPL_SCHED after 758 * releasing the lock. 759 */ 760 KASSERTMSG(ci->ci_mtx_count == -1, 761 "%s: cpu%u: ci_mtx_count (%d) != -1 " 762 "(block with spin-mutex held)", 763 __func__, cpu_index(ci), ci->ci_mtx_count); 764 oldspl = MUTEX_SPIN_OLDSPL(ci); 765 ci->ci_mtx_count = -2; 766 767 /* Update status for lwpctl, if present. */ 768 if (l->l_lwpctl != NULL) { 769 l->l_lwpctl->lc_curcpu = (l->l_stat == LSZOMB ? 770 LWPCTL_CPU_EXITED : LWPCTL_CPU_NONE); 771 } 772 773 /* 774 * If curlwp is a soft interrupt LWP, there's nobody on the 775 * other side to unlock - we're returning into an assembly 776 * trampoline. Unlock now. This is safe because this is a 777 * kernel LWP and is bound to current CPU: the worst anyone 778 * else will do to it, is to put it back onto this CPU's run 779 * queue (and the CPU is busy here right now!). 780 */ 781 if (returning) { 782 /* Keep IPL_SCHED after this; MD code will fix up. */ 783 l->l_pflag &= ~LP_RUNNING; 784 lwp_unlock(l); 785 } else { 786 /* A normal LWP: save old VM context. */ 787 pmap_deactivate(l); 788 } 789 790 /* 791 * If DTrace has set the active vtime enum to anything 792 * other than INACTIVE (0), then it should have set the 793 * function to call. 794 */ 795 if (__predict_false(dtrace_vtime_active)) { 796 (*dtrace_vtime_switch_func)(newl); 797 } 798 799 /* 800 * We must ensure not to come here from inside a read section. 801 */ 802 KASSERT(pserialize_not_in_read_section()); 803 804 /* Switch to the new LWP.. */ 805 #ifdef MULTIPROCESSOR 806 KASSERT(curlwp == ci->ci_curlwp); 807 #endif 808 KASSERTMSG(l == curlwp, "l %p curlwp %p", l, curlwp); 809 prevlwp = cpu_switchto(l, newl, returning); 810 ci = curcpu(); 811 #ifdef MULTIPROCESSOR 812 KASSERT(curlwp == ci->ci_curlwp); 813 #endif 814 KASSERTMSG(l == curlwp, "l %p curlwp %p prevlwp %p", 815 l, curlwp, prevlwp); 816 KASSERT(prevlwp != NULL); 817 KASSERT(l->l_cpu == ci); 818 KASSERT(ci->ci_mtx_count == -2); 819 820 /* 821 * Immediately mark the previous LWP as no longer running 822 * and unlock (to keep lock wait times short as possible). 823 * We'll still be at IPL_SCHED afterwards. If a zombie, 824 * don't touch after clearing LP_RUNNING as it could be 825 * reaped by another CPU. Issue a memory barrier to ensure 826 * this. 827 * 828 * atomic_store_release matches atomic_load_acquire in 829 * lwp_free. 830 */ 831 KASSERT((prevlwp->l_pflag & LP_RUNNING) != 0); 832 lock = prevlwp->l_mutex; 833 if (__predict_false(prevlwp->l_stat == LSZOMB)) { 834 atomic_store_release(&prevlwp->l_pflag, 835 prevlwp->l_pflag & ~LP_RUNNING); 836 } else { 837 prevlwp->l_pflag &= ~LP_RUNNING; 838 } 839 mutex_spin_exit(lock); 840 841 /* 842 * Switched away - we have new curlwp. 843 * Restore VM context and IPL. 844 */ 845 pmap_activate(l); 846 pcu_switchpoint(l); 847 848 /* Update status for lwpctl, if present. */ 849 if (l->l_lwpctl != NULL) { 850 l->l_lwpctl->lc_curcpu = (int)cpu_index(ci); 851 l->l_lwpctl->lc_pctr++; 852 } 853 854 /* 855 * Normalize the spin mutex count and restore the previous 856 * SPL. Note that, unless the caller disabled preemption, 857 * we can be preempted at any time after this splx(). 858 */ 859 KASSERT(l->l_cpu == ci); 860 KASSERT(ci->ci_mtx_count == -1); 861 ci->ci_mtx_count = 0; 862 splx(oldspl); 863 } else { 864 /* Nothing to do - just unlock and return. */ 865 mutex_spin_exit(spc->spc_mutex); 866 l->l_pflag &= ~LP_PREEMPTING; 867 lwp_unlock(l); 868 } 869 870 KASSERT(l == curlwp); 871 KASSERT(l->l_stat == LSONPROC || (l->l_flag & LW_IDLE) != 0); 872 873 SYSCALL_TIME_WAKEUP(l); 874 LOCKDEBUG_BARRIER(NULL, 1); 875 } 876 877 /* 878 * setrunnable: change LWP state to be runnable, placing it on the run queue. 879 * 880 * Call with the process and LWP locked. Will return with the LWP unlocked. 881 */ 882 void 883 setrunnable(struct lwp *l) 884 { 885 struct proc *p = l->l_proc; 886 struct cpu_info *ci; 887 kmutex_t *oldlock; 888 889 KASSERT((l->l_flag & LW_IDLE) == 0); 890 KASSERT((l->l_flag & LW_DBGSUSPEND) == 0); 891 KASSERT(mutex_owned(p->p_lock)); 892 KASSERT(lwp_locked(l, NULL)); 893 KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex); 894 895 switch (l->l_stat) { 896 case LSSTOP: 897 /* 898 * If we're being traced (possibly because someone attached us 899 * while we were stopped), check for a signal from the debugger. 900 */ 901 if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xsig != 0) 902 signotify(l); 903 p->p_nrlwps++; 904 break; 905 case LSSUSPENDED: 906 KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock)); 907 l->l_flag &= ~LW_WSUSPEND; 908 p->p_nrlwps++; 909 cv_broadcast(&p->p_lwpcv); 910 break; 911 case LSSLEEP: 912 KASSERT(l->l_wchan != NULL); 913 break; 914 case LSIDL: 915 KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock)); 916 break; 917 default: 918 panic("setrunnable: lwp %p state was %d", l, l->l_stat); 919 } 920 921 /* 922 * If the LWP was sleeping, start it again. 923 */ 924 if (l->l_wchan != NULL) { 925 l->l_stat = LSSLEEP; 926 /* lwp_unsleep() will release the lock. */ 927 lwp_unsleep(l, true); 928 return; 929 } 930 931 /* 932 * If the LWP is still on the CPU, mark it as LSONPROC. It may be 933 * about to call mi_switch(), in which case it will yield. 934 */ 935 if ((l->l_pflag & LP_RUNNING) != 0) { 936 l->l_stat = LSONPROC; 937 l->l_slptime = 0; 938 lwp_unlock(l); 939 return; 940 } 941 942 /* 943 * Look for a CPU to run. 944 * Set the LWP runnable. 945 */ 946 ci = sched_takecpu(l); 947 l->l_cpu = ci; 948 spc_lock(ci); 949 oldlock = lwp_setlock(l, l->l_cpu->ci_schedstate.spc_mutex); 950 sched_setrunnable(l); 951 l->l_stat = LSRUN; 952 l->l_slptime = 0; 953 sched_enqueue(l); 954 sched_resched_lwp(l, true); 955 /* SPC & LWP now unlocked. */ 956 mutex_spin_exit(oldlock); 957 } 958 959 /* 960 * suspendsched: 961 * 962 * Convert all non-LW_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED. 963 */ 964 void 965 suspendsched(void) 966 { 967 CPU_INFO_ITERATOR cii; 968 struct cpu_info *ci; 969 struct lwp *l; 970 struct proc *p; 971 972 /* 973 * We do this by process in order not to violate the locking rules. 974 */ 975 mutex_enter(&proc_lock); 976 PROCLIST_FOREACH(p, &allproc) { 977 mutex_enter(p->p_lock); 978 if ((p->p_flag & PK_SYSTEM) != 0) { 979 mutex_exit(p->p_lock); 980 continue; 981 } 982 983 if (p->p_stat != SSTOP) { 984 if (p->p_stat != SZOMB && p->p_stat != SDEAD) { 985 p->p_pptr->p_nstopchild++; 986 p->p_waited = 0; 987 } 988 p->p_stat = SSTOP; 989 } 990 991 LIST_FOREACH(l, &p->p_lwps, l_sibling) { 992 if (l == curlwp) 993 continue; 994 995 lwp_lock(l); 996 997 /* 998 * Set L_WREBOOT so that the LWP will suspend itself 999 * when it tries to return to user mode. We want to 1000 * try and get to get as many LWPs as possible to 1001 * the user / kernel boundary, so that they will 1002 * release any locks that they hold. 1003 */ 1004 l->l_flag |= (LW_WREBOOT | LW_WSUSPEND); 1005 1006 if (l->l_stat == LSSLEEP && 1007 (l->l_flag & LW_SINTR) != 0) { 1008 /* setrunnable() will release the lock. */ 1009 setrunnable(l); 1010 continue; 1011 } 1012 1013 lwp_unlock(l); 1014 } 1015 1016 mutex_exit(p->p_lock); 1017 } 1018 mutex_exit(&proc_lock); 1019 1020 /* 1021 * Kick all CPUs to make them preempt any LWPs running in user mode. 1022 * They'll trap into the kernel and suspend themselves in userret(). 1023 * 1024 * Unusually, we don't hold any other scheduler object locked, which 1025 * would keep preemption off for sched_resched_cpu(), so disable it 1026 * explicitly. 1027 */ 1028 kpreempt_disable(); 1029 for (CPU_INFO_FOREACH(cii, ci)) { 1030 spc_lock(ci); 1031 sched_resched_cpu(ci, PRI_KERNEL, true); 1032 /* spc now unlocked */ 1033 } 1034 kpreempt_enable(); 1035 } 1036 1037 /* 1038 * sched_unsleep: 1039 * 1040 * The is called when the LWP has not been awoken normally but instead 1041 * interrupted: for example, if the sleep timed out. Because of this, 1042 * it's not a valid action for running or idle LWPs. 1043 */ 1044 static void 1045 sched_unsleep(struct lwp *l, bool cleanup) 1046 { 1047 1048 lwp_unlock(l); 1049 panic("sched_unsleep"); 1050 } 1051 1052 static void 1053 sched_changepri(struct lwp *l, pri_t pri) 1054 { 1055 struct schedstate_percpu *spc; 1056 struct cpu_info *ci; 1057 1058 KASSERT(lwp_locked(l, NULL)); 1059 1060 ci = l->l_cpu; 1061 spc = &ci->ci_schedstate; 1062 1063 if (l->l_stat == LSRUN) { 1064 KASSERT(lwp_locked(l, spc->spc_mutex)); 1065 sched_dequeue(l); 1066 l->l_priority = pri; 1067 sched_enqueue(l); 1068 sched_resched_lwp(l, false); 1069 } else if (l->l_stat == LSONPROC && l->l_class != SCHED_OTHER) { 1070 /* On priority drop, only evict realtime LWPs. */ 1071 KASSERT(lwp_locked(l, spc->spc_lwplock)); 1072 l->l_priority = pri; 1073 spc_lock(ci); 1074 sched_resched_cpu(ci, spc->spc_maxpriority, true); 1075 /* spc now unlocked */ 1076 } else { 1077 l->l_priority = pri; 1078 } 1079 } 1080 1081 static void 1082 sched_lendpri(struct lwp *l, pri_t pri) 1083 { 1084 struct schedstate_percpu *spc; 1085 struct cpu_info *ci; 1086 1087 KASSERT(lwp_locked(l, NULL)); 1088 1089 ci = l->l_cpu; 1090 spc = &ci->ci_schedstate; 1091 1092 if (l->l_stat == LSRUN) { 1093 KASSERT(lwp_locked(l, spc->spc_mutex)); 1094 sched_dequeue(l); 1095 l->l_inheritedprio = pri; 1096 l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio); 1097 sched_enqueue(l); 1098 sched_resched_lwp(l, false); 1099 } else if (l->l_stat == LSONPROC && l->l_class != SCHED_OTHER) { 1100 /* On priority drop, only evict realtime LWPs. */ 1101 KASSERT(lwp_locked(l, spc->spc_lwplock)); 1102 l->l_inheritedprio = pri; 1103 l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio); 1104 spc_lock(ci); 1105 sched_resched_cpu(ci, spc->spc_maxpriority, true); 1106 /* spc now unlocked */ 1107 } else { 1108 l->l_inheritedprio = pri; 1109 l->l_auxprio = MAX(l->l_inheritedprio, l->l_protectprio); 1110 } 1111 } 1112 1113 struct lwp * 1114 syncobj_noowner(wchan_t wchan) 1115 { 1116 1117 return NULL; 1118 } 1119 1120 /* Decay 95% of proc::p_pctcpu in 60 seconds, ccpu = exp(-1/20) */ 1121 const fixpt_t ccpu = 0.95122942450071400909 * FSCALE; 1122 1123 /* 1124 * Constants for averages over 1, 5 and 15 minutes when sampling at 1125 * 5 second intervals. 1126 */ 1127 static const fixpt_t cexp[ ] = { 1128 0.9200444146293232 * FSCALE, /* exp(-1/12) */ 1129 0.9834714538216174 * FSCALE, /* exp(-1/60) */ 1130 0.9944598480048967 * FSCALE, /* exp(-1/180) */ 1131 }; 1132 1133 /* 1134 * sched_pstats: 1135 * 1136 * => Update process statistics and check CPU resource allocation. 1137 * => Call scheduler-specific hook to eventually adjust LWP priorities. 1138 * => Compute load average of a quantity on 1, 5 and 15 minute intervals. 1139 */ 1140 void 1141 sched_pstats(void) 1142 { 1143 struct loadavg *avg = &averunnable; 1144 const int clkhz = (stathz != 0 ? stathz : hz); 1145 static bool backwardslwp = false; 1146 static bool backwardsproc = false; 1147 static u_int lavg_count = 0; 1148 struct proc *p; 1149 int nrun; 1150 1151 sched_pstats_ticks++; 1152 if (++lavg_count >= 5) { 1153 lavg_count = 0; 1154 nrun = 0; 1155 } 1156 mutex_enter(&proc_lock); 1157 PROCLIST_FOREACH(p, &allproc) { 1158 struct lwp *l; 1159 struct rlimit *rlim; 1160 time_t runtm; 1161 int sig; 1162 1163 /* Increment sleep time (if sleeping), ignore overflow. */ 1164 mutex_enter(p->p_lock); 1165 runtm = p->p_rtime.sec; 1166 LIST_FOREACH(l, &p->p_lwps, l_sibling) { 1167 fixpt_t lpctcpu; 1168 u_int lcpticks; 1169 1170 if (__predict_false((l->l_flag & LW_IDLE) != 0)) 1171 continue; 1172 lwp_lock(l); 1173 if (__predict_false(l->l_rtime.sec < 0) && 1174 !backwardslwp) { 1175 backwardslwp = true; 1176 printf("WARNING: lwp %ld (%s%s%s): " 1177 "negative runtime: " 1178 "(%jd + 0x%016"PRIx64"/2^64) sec\n", 1179 (long)l->l_lid, 1180 l->l_proc->p_comm, 1181 l->l_name ? " " : "", 1182 l->l_name ? l->l_name : "", 1183 (intmax_t)l->l_rtime.sec, 1184 l->l_rtime.frac); 1185 } 1186 runtm += l->l_rtime.sec; 1187 l->l_swtime++; 1188 sched_lwp_stats(l); 1189 1190 /* For load average calculation. */ 1191 if (__predict_false(lavg_count == 0) && 1192 (l->l_flag & (LW_SINTR | LW_SYSTEM)) == 0) { 1193 switch (l->l_stat) { 1194 case LSSLEEP: 1195 if (l->l_slptime > 1) { 1196 break; 1197 } 1198 /* FALLTHROUGH */ 1199 case LSRUN: 1200 case LSONPROC: 1201 case LSIDL: 1202 nrun++; 1203 } 1204 } 1205 lwp_unlock(l); 1206 1207 l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT; 1208 if (l->l_slptime != 0) 1209 continue; 1210 1211 lpctcpu = l->l_pctcpu; 1212 lcpticks = atomic_swap_uint(&l->l_cpticks, 0); 1213 lpctcpu += ((FSCALE - ccpu) * 1214 (lcpticks * FSCALE / clkhz)) >> FSHIFT; 1215 l->l_pctcpu = lpctcpu; 1216 } 1217 /* Calculating p_pctcpu only for ps(1) */ 1218 p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT; 1219 1220 if (__predict_false(runtm < 0)) { 1221 if (!backwardsproc) { 1222 backwardsproc = true; 1223 printf("WARNING: pid %ld (%s): " 1224 "negative runtime; " 1225 "monotonic clock has gone backwards\n", 1226 (long)p->p_pid, p->p_comm); 1227 } 1228 mutex_exit(p->p_lock); 1229 continue; 1230 } 1231 1232 /* 1233 * Check if the process exceeds its CPU resource allocation. 1234 * If over the hard limit, kill it with SIGKILL. 1235 * If over the soft limit, send SIGXCPU and raise 1236 * the soft limit a little. 1237 */ 1238 rlim = &p->p_rlimit[RLIMIT_CPU]; 1239 sig = 0; 1240 if (__predict_false(runtm >= rlim->rlim_cur)) { 1241 if (runtm >= rlim->rlim_max) { 1242 sig = SIGKILL; 1243 log(LOG_NOTICE, 1244 "pid %d, command %s, is killed: %s\n", 1245 p->p_pid, p->p_comm, "exceeded RLIMIT_CPU"); 1246 uprintf("pid %d, command %s, is killed: %s\n", 1247 p->p_pid, p->p_comm, "exceeded RLIMIT_CPU"); 1248 } else { 1249 sig = SIGXCPU; 1250 if (rlim->rlim_cur < rlim->rlim_max) 1251 rlim->rlim_cur += 5; 1252 } 1253 } 1254 mutex_exit(p->p_lock); 1255 if (__predict_false(sig)) { 1256 KASSERT((p->p_flag & PK_SYSTEM) == 0); 1257 psignal(p, sig); 1258 } 1259 } 1260 1261 /* Load average calculation. */ 1262 if (__predict_false(lavg_count == 0)) { 1263 int i; 1264 CTASSERT(__arraycount(cexp) == __arraycount(avg->ldavg)); 1265 for (i = 0; i < __arraycount(cexp); i++) { 1266 avg->ldavg[i] = (cexp[i] * avg->ldavg[i] + 1267 nrun * FSCALE * (FSCALE - cexp[i])) >> FSHIFT; 1268 } 1269 } 1270 1271 /* Lightning bolt. */ 1272 cv_broadcast(&lbolt); 1273 1274 mutex_exit(&proc_lock); 1275 } 1276