/*	$NetBSD: kern_synch.c,v 1.255 2008/11/15 10:54:32 skrll Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 * Daniel Sieger.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.255 2008/11/15 10:54:32 skrll Exp $");

#include "opt_kstack.h"
#include "opt_perfctrs.h"
#include "opt_sa.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/cpu.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sa.h>
#include <sys/savar.h>
#include <sys/syscall_stats.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/lwpctl.h>
#include <sys/atomic.h>
#include <sys/simplelock.h>

#include <uvm/uvm_extern.h>

#include <dev/lockstat.h>

static u_int	sched_unsleep(struct lwp *, bool);
static void	sched_changepri(struct lwp *, pri_t);
static void	sched_lendpri(struct lwp *, pri_t);
static void	resched_cpu(struct lwp *);

syncobj_t sleep_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sleepq_unsleep,
	sleepq_changepri,
	sleepq_lendpri,
	syncobj_noowner,
};

syncobj_t sched_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sched_unsleep,
	sched_changepri,
	sched_lendpri,
	syncobj_noowner,
};

callout_t	sched_pstats_ch;
unsigned	sched_pstats_ticks;
kcondvar_t	lbolt;			/* once a second sleep address */

/* Preemption event counters */
static struct evcnt kpreempt_ev_crit;
static struct evcnt kpreempt_ev_klock;
static struct evcnt kpreempt_ev_ipl;
static struct evcnt kpreempt_ev_immed;

/*
 * During autoconfiguration or after a panic, a sleep will simply lower the
 * priority briefly to allow interrupts, then return.  The priority to be
 * used (safepri) is machine-dependent, thus this value is initialized and
 * maintained in the machine-dependent layers.  This priority will typically
 * be 0, or the lowest priority that is safe for use on the interrupt stack;
 * it can be made higher to block network software interrupts after panics.
 */
int	safepri;

void
sched_init(void)
{

	cv_init(&lbolt, "lbolt");
	callout_init(&sched_pstats_ch, CALLOUT_MPSAFE);
	callout_setfunc(&sched_pstats_ch, sched_pstats, NULL);

	evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: critical section");
	evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: kernel_lock");
	evcnt_attach_dynamic(&kpreempt_ev_ipl, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: IPL");
	evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "immediate");

	sched_pstats(NULL);
}

/*
 * OBSOLETE INTERFACE
 *
 * General sleep call.  Suspends the current LWP until a wakeup is
 * performed on the specified identifier.  The LWP will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If pri includes the PCATCH flag, signals are
 * checked before and after sleeping, otherwise signals are not checked.
 * Returns 0 if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH
 * is set and a signal needs to be delivered, ERESTART is returned if the
 * current system call should be restarted if possible, and EINTR is
 * returned if the system call should be interrupted by the signal.
 *
 * The interlock is held until we are on a sleep queue.  The interlock will
 * be locked before returning to the caller unless the PNORELOCK flag is
 * specified, in which case the interlock will always be unlocked upon
 * return.
 */
int
ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		if ((priority & PNORELOCK) != 0)
			simple_unlock(interlock);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);

	if (interlock != NULL) {
		KASSERT(simple_lock_held(interlock));
		simple_unlock(interlock);
	}

	error = sleepq_block(timo, priority & PCATCH);

	if (interlock != NULL && (priority & PNORELOCK) == 0)
		simple_lock(interlock);

	return error;
}

int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
	mutex_exit(mtx);
	error = sleepq_block(timo, priority & PCATCH);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	kmutex_t *mp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, l, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, l, wmesg, &sleep_syncobj);
	error = sleepq_block(timo, intr);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}

#ifdef KERN_SA
/*
 * sa_awaken:
 *
 *	We believe this lwp is an SA lwp.  If it's yielding,
 *	let it know it needs to wake up.
 *
 *	We are called and exit with the lwp locked.  We are
 *	called in the middle of wakeup operations, so we must
 *	not touch the locks at all.
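 *
 *	Only the LW_SA_IDLE flag is cleared here; the wakeup itself
 *	is handled by the caller.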
 */
void
sa_awaken(struct lwp *l)
{
	/* LOCK_ASSERT(lwp_locked(l, NULL)); */

	if (l == l->l_savp->savp_lwp && l->l_flag & LW_SA_YIELD)
		l->l_flag &= ~LW_SA_IDLE;
}
#endif /* KERN_SA */

/*
 * OBSOLETE INTERFACE
 *
 * Make all LWPs sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, (u_int)-1, mp);
}

/*
 * OBSOLETE INTERFACE
 *
 * Make the highest priority LWP first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, 1, mp);
}


/*
 * General yield call.  Puts the current LWP back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current LWP explicitly requests it (e.g. sched_yield(2)).
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * General preemption call.  Puts the current LWP back on its run queue
 * and performs an involuntary context switch.
 */
void
preempt(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	l->l_nivcsw++;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * Handle a request made by another agent to preempt the current LWP
 * in-kernel.  Usually called when l_dopreempt may be non-zero.
 *
 * Character addresses for lockstat only.
 */
static char	in_critical_section;
static char	kernel_lock_held;
static char	spl_raised;
static char	is_softint;

bool
kpreempt(uintptr_t where)
{
	uintptr_t failed;
	lwp_t *l;
	int s, dop;

	l = curlwp;
	failed = 0;
	while ((dop = l->l_dopreempt) != 0) {
		if (l->l_stat != LSONPROC) {
			/*
			 * About to block (or die), let it happen.
			 * Doesn't really count as "preemption has
			 * been blocked", since we're going to
			 * context switch.
			 */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false((l->l_flag & LW_IDLE) != 0)) {
			/* Can't preempt idle loop, don't count as failure. */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false(l->l_nopreempt != 0)) {
			/* LWP holds preemption disabled, explicitly. */
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_crit.ev_count++;
			}
			failed = (uintptr_t)&in_critical_section;
			break;
		}
		if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
			/* Can't preempt soft interrupts yet. */
			l->l_dopreempt = 0;
			failed = (uintptr_t)&is_softint;
			break;
		}
		s = splsched();
		if (__predict_false(l->l_blcnt != 0 ||
		    curcpu()->ci_biglock_wanted != NULL)) {
			/* Hold or want kernel_lock, code is not MT safe. */
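			/*
			 * Defer the preemption: the deferral is counted
			 * once via kpreempt_ev_klock and recorded below
			 * for reporting via lockstat.
			 */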
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_klock.ev_count++;
			}
			failed = (uintptr_t)&kernel_lock_held;
			break;
		}
		if (__predict_false(!cpu_kpreempt_enter(where, s))) {
			/*
			 * It may be that the IPL is too high.
			 * cpu_kpreempt_enter() can schedule an
			 * interrupt to retry later.
			 */
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_ipl.ev_count++;
			}
			failed = (uintptr_t)&spl_raised;
			break;
		}
		/* Do it! */
		if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
			kpreempt_ev_immed.ev_count++;
		}
		lwp_lock(l);
		mi_switch(l);
		l->l_nopreempt++;
		splx(s);

		/* Take care of any MD cleanup. */
		cpu_kpreempt_exit(where);
		l->l_nopreempt--;
	}

	/* Record preemption failure for reporting via lockstat. */
	if (__predict_false(failed)) {
		int lsflag = 0;
		atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
		LOCKSTAT_ENTER(lsflag);
		/* Might recurse, make it atomic. */
		if (__predict_false(lsflag)) {
			if (where == 0) {
				where = (uintptr_t)__builtin_return_address(0);
			}
			if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr,
			    NULL, (void *)where) == NULL) {
				LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
				l->l_pfaillock = failed;
			}
		}
		LOCKSTAT_EXIT(lsflag);
	}

	return failed;
}

/*
 * Return true if preemption is explicitly disabled.
 */
bool
kpreempt_disabled(void)
{
	lwp_t *l;

	l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
}

/*
 * Disable kernel preemption.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

/*
 * Reenable kernel preemption.
 */
void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}

/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */

void
updatertime(lwp_t *l, const struct bintime *now)
{

	if ((l->l_flag & LW_IDLE) != 0)
		return;

	/* rtime += now - stime */
	bintime_add(&l->l_rtime, now);
	bintime_sub(&l->l_rtime, &l->l_stime);
}

/*
 * Select the next LWP from the current CPU to run.
 */
static inline lwp_t *
nextlwp(struct cpu_info *ci, struct schedstate_percpu *spc)
{
	lwp_t *newl;

	/*
	 * Let sched_nextlwp() select the LWP to run the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * the new thread will be unlocked after setting the LWP-lock.
	 */
	newl = sched_nextlwp();
	if (newl != NULL) {
		sched_dequeue(newl);
		KASSERT(lwp_locked(newl, spc->spc_mutex));
		newl->l_stat = LSONPROC;
		newl->l_cpu = ci;
		newl->l_pflag |= LP_RUNNING;
		lwp_setlock(newl, spc->spc_lwplock);
	} else {
		newl = ci->ci_data.cpu_idlelwp;
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}

	/*
	 * Only clear want_resched if there are no pending (slow)
	 * software interrupts.
	 */
	ci->ci_want_resched = ci->ci_data.cpu_softints;
	spc->spc_flags &= ~SPCF_SWITCHCLEAR;
	spc->spc_curpriority = lwp_eprio(newl);

	return newl;
}

/*
 * The machine independent parts of context switch.
 *
 * Returns 1 if another LWP was actually run.
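 *
 * Must be called with the current LWP locked and with kernel preemption
 * disabled; the LWP lock is released before mi_switch() returns.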
 */
int
mi_switch(lwp_t *l)
{
	struct cpu_info *ci;
	struct schedstate_percpu *spc;
	struct lwp *newl;
	int retval, oldspl;
	struct bintime bt;
	bool returning;

	KASSERT(lwp_locked(l, NULL));
	KASSERT(kpreempt_disabled());
	LOCKDEBUG_BARRIER(l->l_mutex, 1);

#ifdef KSTACK_CHECK_MAGIC
	kstack_check_magic(l);
#endif

	binuptime(&bt);

	KASSERT(l->l_cpu == curcpu());
	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	returning = false;
	newl = NULL;

	/*
	 * If we have been asked to switch to a specific LWP, then there
	 * is no need to inspect the run queues.  If a soft interrupt is
	 * blocking, then return to the interrupted thread without adjusting
	 * VM context or its start time: neither have been changed in order
	 * to take the interrupt.
	 */
	if (l->l_switchto != NULL) {
		if ((l->l_pflag & LP_INTR) != 0) {
			returning = true;
			softint_block(l);
			if ((l->l_pflag & LP_TIMEINTR) != 0)
				updatertime(l, &bt);
		}
		newl = l->l_switchto;
		l->l_switchto = NULL;
	}
#ifndef __HAVE_FAST_SOFTINTS
	else if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}
#endif	/* !__HAVE_FAST_SOFTINTS */

	/* Count time spent in current system call */
	if (!returning) {
		SYSCALL_TIME_SLEEP(l);

		/*
		 * XXXSMP If we are using h/w performance counters,
		 * save context.
		 */
#if PERFCTRS
		if (PMC_ENABLED(l->l_proc)) {
			pmc_save_context(l->l_proc);
		}
#endif
		updatertime(l, &bt);
	}

	/* Lock the runqueue */
	KASSERT(l->l_stat != LSRUN);
	mutex_spin_enter(spc->spc_mutex);

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	if (l->l_stat == LSONPROC && l != newl) {
		KASSERT(lwp_locked(l, spc->spc_lwplock));
		if ((l->l_flag & LW_IDLE) == 0) {
			l->l_stat = LSRUN;
			lwp_setlock(l, spc->spc_mutex);
			sched_enqueue(l, true);
			/* Handle migration case */
			KASSERT(spc->spc_migrating == NULL);
			if (l->l_target_cpu != NULL) {
				spc->spc_migrating = l;
			}
		} else
			l->l_stat = LSIDL;
	}

	/* Pick new LWP to run. */
	if (newl == NULL) {
		newl = nextlwp(ci, spc);
	}

	/* Items that must be updated with the CPU locked. */
	if (!returning) {
		/* Update the new LWP's start time. */
		newl->l_stime = bt;

		/*
		 * ci_curlwp changes when a fast soft interrupt occurs.
		 * We use cpu_onproc to keep track of which kernel or
		 * user thread is running 'underneath' the software
		 * interrupt.  This is important for time accounting,
		 * itimers and forcing user threads to preempt (aston).
		 */
		ci->ci_data.cpu_onproc = newl;
	}

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
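	 *
	 * Acknowledge the preemption request: clear l_dopreempt and, if a
	 * failed preemption was being timed, report it to lockstat.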
	 */
	cpu_did_resched(l);
	l->l_dopreempt = 0;
	if (__predict_false(l->l_pfailaddr != 0)) {
		LOCKSTAT_FLAG(lsflag);
		LOCKSTAT_ENTER(lsflag);
		LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
		LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
		    1, l->l_pfailtime, l->l_pfailaddr);
		LOCKSTAT_EXIT(lsflag);
		l->l_pfailtime = 0;
		l->l_pfaillock = 0;
		l->l_pfailaddr = 0;
	}

	if (l != newl) {
		struct lwp *prevlwp;

		/* Release all locks, but leave the current LWP locked */
		if (l->l_mutex == spc->spc_mutex) {
			/*
			 * Drop spc_lwplock, if the current LWP has been moved
			 * to the run queue (it is now locked by spc_mutex).
			 */
			mutex_spin_exit(spc->spc_lwplock);
		} else {
			/*
			 * Otherwise, drop the spc_mutex, we are done with the
			 * run queues.
			 */
			mutex_spin_exit(spc->spc_mutex);
		}

		/*
		 * Mark that context switch is going to be performed
		 * for this LWP, to protect it from being switched
		 * to on another CPU.
		 */
		KASSERT(l->l_ctxswtch == 0);
		l->l_ctxswtch = 1;
		l->l_ncsw++;
		l->l_pflag &= ~LP_RUNNING;

		/*
		 * Increase the count of spin-mutexes before the release
		 * of the last lock - we must remain at IPL_SCHED during
		 * the context switch.
		 */
		oldspl = MUTEX_SPIN_OLDSPL(ci);
		ci->ci_mtx_count--;
		lwp_unlock(l);

		/* Count the context switch on this CPU. */
		ci->ci_data.cpu_nswtch++;

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL)
			l->l_lwpctl->lc_curcpu = LWPCTL_CPU_NONE;

		/*
		 * Save old VM context, unless a soft interrupt
		 * handler is blocking.
		 */
		if (!returning)
			pmap_deactivate(l);

		/*
		 * We may need to spin-wait if 'newl' is still
		 * context switching on another CPU.
		 */
		if (newl->l_ctxswtch != 0) {
			u_int count;
			count = SPINLOCK_BACKOFF_MIN;
			while (newl->l_ctxswtch)
				SPINLOCK_BACKOFF(count);
		}

		/* Switch to the new LWP. */
		prevlwp = cpu_switchto(l, newl, returning);
		ci = curcpu();

		/*
		 * Switched away - we have new curlwp.
		 * Restore VM context and IPL.
		 */
		pmap_activate(l);
		if (prevlwp != NULL) {
			/* Normalize the count of the spin-mutexes */
			ci->ci_mtx_count++;
			/* Unmark the state of context switch */
			membar_exit();
			prevlwp->l_ctxswtch = 0;
		}

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL) {
			l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
			l->l_lwpctl->lc_pctr++;
		}

		KASSERT(l->l_cpu == ci);
		splx(oldspl);
		retval = 1;
	} else {
		/* Nothing to do - just unlock and return. */
		mutex_spin_exit(spc->spc_mutex);
		lwp_unlock(l);
		retval = 0;
	}

	KASSERT(l == curlwp);
	KASSERT(l->l_stat == LSONPROC);

	/*
	 * XXXSMP If we are using h/w performance counters, restore context.
	 * XXXSMP preemption problem.
	 */
#if PERFCTRS
	if (PMC_ENABLED(l->l_proc)) {
		pmc_restore_context(l->l_proc);
	}
#endif
	SYSCALL_TIME_WAKEUP(l);
	LOCKDEBUG_BARRIER(NULL, 1);

	return retval;
}

/*
 * The machine independent parts of context switch to oblivion.
 * Does not return.  Call with the LWP unlocked.
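 *
 * The LWP must be in the LSZOMB or LSIDL state and kernel preemption
 * must be disabled.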
 */
void
lwp_exit_switchaway(lwp_t *l)
{
	struct cpu_info *ci;
	struct lwp *newl;
	struct bintime bt;

	ci = l->l_cpu;

	KASSERT(kpreempt_disabled());
	KASSERT(l->l_stat == LSZOMB || l->l_stat == LSIDL);
	KASSERT(ci == curcpu());
	LOCKDEBUG_BARRIER(NULL, 0);

#ifdef KSTACK_CHECK_MAGIC
	kstack_check_magic(l);
#endif

	/* Count time spent in current system call */
	SYSCALL_TIME_SLEEP(l);
	binuptime(&bt);
	updatertime(l, &bt);

	/* Must stay at IPL_SCHED even after releasing run queue lock. */
	(void)splsched();

	/*
	 * Let sched_nextlwp() select the LWP to run the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * the new thread will be unlocked after setting the LWP-lock.
	 */
	spc_lock(ci);
#ifndef __HAVE_FAST_SOFTINTS
	if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	} else
#endif	/* !__HAVE_FAST_SOFTINTS */
	{
		newl = nextlwp(ci, &ci->ci_schedstate);
	}

	/* Update the new LWP's start time. */
	newl->l_stime = bt;
	l->l_pflag &= ~LP_RUNNING;

	/*
	 * ci_curlwp changes when a fast soft interrupt occurs.
	 * We use cpu_onproc to keep track of which kernel or
	 * user thread is running 'underneath' the software
	 * interrupt.  This is important for time accounting,
	 * itimers and forcing user threads to preempt (aston).
	 */
	ci->ci_data.cpu_onproc = newl;

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
	 */
	cpu_did_resched(l);

	/* Unlock the run queue. */
	spc_unlock(ci);

	/* Count the context switch on this CPU. */
	ci->ci_data.cpu_nswtch++;

	/* Update status for lwpctl, if present. */
	if (l->l_lwpctl != NULL)
		l->l_lwpctl->lc_curcpu = LWPCTL_CPU_EXITED;

	/*
	 * We may need to spin-wait if 'newl' is still
	 * context switching on another CPU.
	 */
	if (newl->l_ctxswtch != 0) {
		u_int count;
		count = SPINLOCK_BACKOFF_MIN;
		while (newl->l_ctxswtch)
			SPINLOCK_BACKOFF(count);
	}

	/* Switch to the new LWP. */
	(void)cpu_switchto(NULL, newl, false);

	for (;;) continue;	/* XXX: convince gcc about "noreturn" */
	/* NOTREACHED */
}

/*
 * Change LWP state to be runnable, placing it on the run queue if it is
 * in memory, and awakening the swapper if it isn't in memory.
 *
 * Call with the process and LWP locked.  Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	struct cpu_info *ci;
	sigset_t *ss;

	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT(mutex_owned(p->p_lock));
	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
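		 * The signal is pended against the LWP itself when sigprop
		 * marks it as LWP-directed (SA_TOLWP), otherwise against
		 * the whole process.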
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
			if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
				ss = &l->l_sigpend.sp_set;
			else
				ss = &p->p_sigpend.sp_set;
			sigaddset(ss, p->p_xstat);
			signotify(l);
		}
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		l->l_flag &= ~LW_WSUSPEND;
		p->p_nrlwps++;
		cv_broadcast(&p->p_lwpcv);
		break;
	case LSSLEEP:
		KASSERT(l->l_wchan != NULL);
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

#ifdef KERN_SA
	if (l->l_proc->p_sa)
		sa_awaken(l);
#endif /* KERN_SA */

	/*
	 * If the LWP was sleeping interruptibly, then it's OK to start it
	 * again.  If not, mark it as still sleeping.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l, true);
		return;
	}

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_pflag & LP_RUNNING) != 0) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Look for a CPU to run.
	 * Set the LWP runnable.
	 */
	ci = sched_takecpu(l);
	l->l_cpu = ci;
	spc_lock(ci);
	lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
	sched_setrunnable(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;

	/*
	 * If thread is swapped out - wake the swapper to bring it back in.
	 * Otherwise, enter it into a run queue.
	 */
	if (l->l_flag & LW_INMEM) {
		sched_enqueue(l, false);
		resched_cpu(l);
		lwp_unlock(l);
	} else {
		lwp_unlock(l);
		uvm_kick_scheduler();
	}
}

/*
 * suspendsched:
 *
 *	Convert all non-L_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct lwp *l;
	struct proc *p;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		if ((p->p_flag & PK_MARKER) != 0)
			continue;

		mutex_enter(p->p_lock);
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			continue;
		}

		p->p_stat = SSTOP;

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l == curlwp)
				continue;

			lwp_lock(l);

			/*
			 * Set L_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode.  We want to
			 * try to get as many LWPs as possible to
			 * the user / kernel boundary, so that they will
			 * release any locks that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */
				setrunnable(l);
				continue;
			}

			lwp_unlock(l);
		}

		mutex_exit(p->p_lock);
	}
	mutex_exit(proc_lock);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
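	 * RESCHED_IMMED requests that the reschedule happen immediately
	 * rather than being deferred to the next convenient point.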
	 */
	for (CPU_INFO_FOREACH(cii, ci)) {
		spc_lock(ci);
		cpu_need_resched(ci, RESCHED_IMMED);
		spc_unlock(ci);
	}
}

/*
 * sched_unsleep:
 *
 *	This is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, if the sleep timed out.  Because of this,
 *	it's not a valid action for running or idle LWPs.
 */
static u_int
sched_unsleep(struct lwp *l, bool cleanup)
{

	lwp_unlock(l);
	panic("sched_unsleep");
}

static void
resched_cpu(struct lwp *l)
{
	struct cpu_info *ci = l->l_cpu;

	KASSERT(lwp_locked(l, NULL));
	if (lwp_eprio(l) > ci->ci_schedstate.spc_curpriority)
		cpu_need_resched(ci, 0);
}

static void
sched_changepri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_priority = pri;
		sched_enqueue(l, false);
	} else {
		l->l_priority = pri;
	}
	resched_cpu(l);
}

static void
sched_lendpri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_inheritedprio = pri;
		sched_enqueue(l, false);
	} else {
		l->l_inheritedprio = pri;
	}
	resched_cpu(l);
}

struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}

/* Decay 95% of proc::p_pctcpu in 60 seconds, ccpu = exp(-1/20) */
const fixpt_t ccpu = 0.95122942450071400909 * FSCALE;

/*
 * sched_pstats:
 *
 *	Update process statistics and check CPU resource allocation.
 *	Call scheduler-specific hook to eventually adjust process/LWP
 *	priorities.
 */
/* ARGSUSED */
void
sched_pstats(void *arg)
{
	const int clkhz = (stathz != 0 ? stathz : hz);
	struct rlimit *rlim;
	struct lwp *l;
	struct proc *p;
	long runtm;
	fixpt_t lpctcpu;
	u_int lcpticks;
	int sig;

	sched_pstats_ticks++;

	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		if (__predict_false((p->p_flag & PK_MARKER) != 0))
			continue;

		/*
		 * Increment time in/out of memory and sleep
		 * time (if sleeping), ignore overflow.
		 */
		mutex_enter(p->p_lock);
		runtm = p->p_rtime.sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (__predict_false((l->l_flag & LW_IDLE) != 0))
				continue;
			lwp_lock(l);
			runtm += l->l_rtime.sec;
			l->l_swtime++;
			sched_lwp_stats(l);
			lwp_unlock(l);

			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
			if (l->l_slptime != 0)
				continue;

			lpctcpu = l->l_pctcpu;
			lcpticks = atomic_swap_uint(&l->l_cpticks, 0);
			lpctcpu += ((FSCALE - ccpu) *
			    (lcpticks * FSCALE / clkhz)) >> FSHIFT;
			l->l_pctcpu = lpctcpu;
		}
		/* Calculating p_pctcpu only for ps(1) */
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over max, kill it.
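		 * Over the soft limit: send SIGXCPU and, while still below
		 * the hard limit, stretch the soft limit by five seconds.
		 * Over the hard limit: send SIGKILL.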
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		sig = 0;
		if (__predict_false(runtm >= rlim->rlim_cur)) {
			if (runtm >= rlim->rlim_max)
				sig = SIGKILL;
			else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}
		mutex_exit(p->p_lock);
		if (__predict_false(sig))
			psignal(p, sig);
	}
	mutex_exit(proc_lock);
	uvm_meter();
	cv_wakeup(&lbolt);
	callout_schedule(&sched_pstats_ch, hz);
}