1 /* $NetBSD: kern_synch.c,v 1.258 2008/12/21 13:26:58 ad Exp $ */ 2 3 /*- 4 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility, 9 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and 10 * Daniel Sieger. 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 1. Redistributions of source code must retain the above copyright 16 * notice, this list of conditions and the following disclaimer. 17 * 2. Redistributions in binary form must reproduce the above copyright 18 * notice, this list of conditions and the following disclaimer in the 19 * documentation and/or other materials provided with the distribution. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 23 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 24 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 25 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 26 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 27 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 28 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 29 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 30 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 31 * POSSIBILITY OF SUCH DAMAGE. 32 */ 33 34 /*- 35 * Copyright (c) 1982, 1986, 1990, 1991, 1993 36 * The Regents of the University of California. All rights reserved. 37 * (c) UNIX System Laboratories, Inc. 38 * All or some portions of this file are derived from material licensed 39 * to the University of California by American Telephone and Telegraph 40 * Co. or Unix System Laboratories, Inc. and are reproduced herein with 41 * the permission of UNIX System Laboratories, Inc. 42 * 43 * Redistribution and use in source and binary forms, with or without 44 * modification, are permitted provided that the following conditions 45 * are met: 46 * 1. Redistributions of source code must retain the above copyright 47 * notice, this list of conditions and the following disclaimer. 48 * 2. Redistributions in binary form must reproduce the above copyright 49 * notice, this list of conditions and the following disclaimer in the 50 * documentation and/or other materials provided with the distribution. 51 * 3. Neither the name of the University nor the names of its contributors 52 * may be used to endorse or promote products derived from this software 53 * without specific prior written permission. 54 * 55 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 56 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 57 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 58 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 59 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 60 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 61 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 62 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 63 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 64 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 65 * SUCH DAMAGE. 66 * 67 * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 68 */ 69 70 #include <sys/cdefs.h> 71 __KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.258 2008/12/21 13:26:58 ad Exp $"); 72 73 #include "opt_kstack.h" 74 #include "opt_perfctrs.h" 75 #include "opt_sa.h" 76 77 #define __MUTEX_PRIVATE 78 79 #include <sys/param.h> 80 #include <sys/systm.h> 81 #include <sys/proc.h> 82 #include <sys/kernel.h> 83 #if defined(PERFCTRS) 84 #include <sys/pmc.h> 85 #endif 86 #include <sys/cpu.h> 87 #include <sys/resourcevar.h> 88 #include <sys/sched.h> 89 #include <sys/sa.h> 90 #include <sys/savar.h> 91 #include <sys/syscall_stats.h> 92 #include <sys/sleepq.h> 93 #include <sys/lockdebug.h> 94 #include <sys/evcnt.h> 95 #include <sys/intr.h> 96 #include <sys/lwpctl.h> 97 #include <sys/atomic.h> 98 #include <sys/simplelock.h> 99 100 #include <uvm/uvm_extern.h> 101 102 #include <dev/lockstat.h> 103 104 static u_int sched_unsleep(struct lwp *, bool); 105 static void sched_changepri(struct lwp *, pri_t); 106 static void sched_lendpri(struct lwp *, pri_t); 107 static void resched_cpu(struct lwp *); 108 109 syncobj_t sleep_syncobj = { 110 SOBJ_SLEEPQ_SORTED, 111 sleepq_unsleep, 112 sleepq_changepri, 113 sleepq_lendpri, 114 syncobj_noowner, 115 }; 116 117 syncobj_t sched_syncobj = { 118 SOBJ_SLEEPQ_SORTED, 119 sched_unsleep, 120 sched_changepri, 121 sched_lendpri, 122 syncobj_noowner, 123 }; 124 125 callout_t sched_pstats_ch; 126 unsigned sched_pstats_ticks; 127 kcondvar_t lbolt; /* once a second sleep address */ 128 129 /* Preemption event counters */ 130 static struct evcnt kpreempt_ev_crit; 131 static struct evcnt kpreempt_ev_klock; 132 static struct evcnt kpreempt_ev_immed; 133 134 /* 135 * During autoconfiguration or after a panic, a sleep will simply lower the 136 * priority briefly to allow interrupts, then return. The priority to be 137 * used (safepri) is machine-dependent, thus this value is initialized and 138 * maintained in the machine-dependent layers. This priority will typically 139 * be 0, or the lowest priority that is safe for use on the interrupt stack; 140 * it can be made higher to block network software interrupts after panics. 141 */ 142 int safepri; 143 144 void 145 sched_init(void) 146 { 147 148 cv_init(&lbolt, "lbolt"); 149 callout_init(&sched_pstats_ch, CALLOUT_MPSAFE); 150 callout_setfunc(&sched_pstats_ch, sched_pstats, NULL); 151 152 evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL, 153 "kpreempt", "defer: critical section"); 154 evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL, 155 "kpreempt", "defer: kernel_lock"); 156 evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL, 157 "kpreempt", "immediate"); 158 159 sched_pstats(NULL); 160 } 161 162 /* 163 * OBSOLETE INTERFACE 164 * 165 * General sleep call. Suspends the current LWP until a wakeup is 166 * performed on the specified identifier. The LWP will then be made 167 * runnable with the specified priority. Sleeps at most timo/hz seconds (0 168 * means no timeout). If pri includes PCATCH flag, signals are checked 169 * before and after sleeping, else signals are not checked. Returns 0 if 170 * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a 171 * signal needs to be delivered, ERESTART is returned if the current system 172 * call should be restarted if possible, and EINTR is returned if the system 173 * call should be interrupted by the signal (return EINTR). 174 * 175 * The interlock is held until we are on a sleep queue. The interlock will 176 * be locked before returning back to the caller unless the PNORELOCK flag 177 * is specified, in which case the interlock will always be unlocked upon 178 * return. 179 */ 180 int 181 ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo, 182 volatile struct simplelock *interlock) 183 { 184 struct lwp *l = curlwp; 185 sleepq_t *sq; 186 kmutex_t *mp; 187 int error; 188 189 KASSERT((l->l_pflag & LP_INTR) == 0); 190 191 if (sleepq_dontsleep(l)) { 192 (void)sleepq_abort(NULL, 0); 193 if ((priority & PNORELOCK) != 0) 194 simple_unlock(interlock); 195 return 0; 196 } 197 198 l->l_kpriority = true; 199 sq = sleeptab_lookup(&sleeptab, ident, &mp); 200 sleepq_enter(sq, l, mp); 201 sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj); 202 203 if (interlock != NULL) { 204 KASSERT(simple_lock_held(interlock)); 205 simple_unlock(interlock); 206 } 207 208 error = sleepq_block(timo, priority & PCATCH); 209 210 if (interlock != NULL && (priority & PNORELOCK) == 0) 211 simple_lock(interlock); 212 213 return error; 214 } 215 216 int 217 mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo, 218 kmutex_t *mtx) 219 { 220 struct lwp *l = curlwp; 221 sleepq_t *sq; 222 kmutex_t *mp; 223 int error; 224 225 KASSERT((l->l_pflag & LP_INTR) == 0); 226 227 if (sleepq_dontsleep(l)) { 228 (void)sleepq_abort(mtx, (priority & PNORELOCK) != 0); 229 return 0; 230 } 231 232 l->l_kpriority = true; 233 sq = sleeptab_lookup(&sleeptab, ident, &mp); 234 sleepq_enter(sq, l, mp); 235 sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj); 236 mutex_exit(mtx); 237 error = sleepq_block(timo, priority & PCATCH); 238 239 if ((priority & PNORELOCK) == 0) 240 mutex_enter(mtx); 241 242 return error; 243 } 244 245 /* 246 * General sleep call for situations where a wake-up is not expected. 247 */ 248 int 249 kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx) 250 { 251 struct lwp *l = curlwp; 252 kmutex_t *mp; 253 sleepq_t *sq; 254 int error; 255 256 if (sleepq_dontsleep(l)) 257 return sleepq_abort(NULL, 0); 258 259 if (mtx != NULL) 260 mutex_exit(mtx); 261 l->l_kpriority = true; 262 sq = sleeptab_lookup(&sleeptab, l, &mp); 263 sleepq_enter(sq, l, mp); 264 sleepq_enqueue(sq, l, wmesg, &sleep_syncobj); 265 error = sleepq_block(timo, intr); 266 if (mtx != NULL) 267 mutex_enter(mtx); 268 269 return error; 270 } 271 272 #ifdef KERN_SA 273 /* 274 * sa_awaken: 275 * 276 * We believe this lwp is an SA lwp. If it's yielding, 277 * let it know it needs to wake up. 278 * 279 * We are called and exit with the lwp locked. We are 280 * called in the middle of wakeup operations, so we need 281 * to not touch the locks at all. 282 */ 283 void 284 sa_awaken(struct lwp *l) 285 { 286 /* LOCK_ASSERT(lwp_locked(l, NULL)); */ 287 288 if (l == l->l_savp->savp_lwp && l->l_flag & LW_SA_YIELD) 289 l->l_flag &= ~LW_SA_IDLE; 290 } 291 #endif /* KERN_SA */ 292 293 /* 294 * OBSOLETE INTERFACE 295 * 296 * Make all LWPs sleeping on the specified identifier runnable. 297 */ 298 void 299 wakeup(wchan_t ident) 300 { 301 sleepq_t *sq; 302 kmutex_t *mp; 303 304 if (cold) 305 return; 306 307 sq = sleeptab_lookup(&sleeptab, ident, &mp); 308 sleepq_wake(sq, ident, (u_int)-1, mp); 309 } 310 311 /* 312 * OBSOLETE INTERFACE 313 * 314 * Make the highest priority LWP first in line on the specified 315 * identifier runnable. 316 */ 317 void 318 wakeup_one(wchan_t ident) 319 { 320 sleepq_t *sq; 321 kmutex_t *mp; 322 323 if (cold) 324 return; 325 326 sq = sleeptab_lookup(&sleeptab, ident, &mp); 327 sleepq_wake(sq, ident, 1, mp); 328 } 329 330 331 /* 332 * General yield call. Puts the current LWP back on its run queue and 333 * performs a voluntary context switch. Should only be called when the 334 * current LWP explicitly requests it (eg sched_yield(2)). 335 */ 336 void 337 yield(void) 338 { 339 struct lwp *l = curlwp; 340 341 KERNEL_UNLOCK_ALL(l, &l->l_biglocks); 342 lwp_lock(l); 343 KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock)); 344 KASSERT(l->l_stat == LSONPROC); 345 l->l_kpriority = false; 346 (void)mi_switch(l); 347 KERNEL_LOCK(l->l_biglocks, l); 348 } 349 350 /* 351 * General preemption call. Puts the current LWP back on its run queue 352 * and performs an involuntary context switch. 353 */ 354 void 355 preempt(void) 356 { 357 struct lwp *l = curlwp; 358 359 KERNEL_UNLOCK_ALL(l, &l->l_biglocks); 360 lwp_lock(l); 361 KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock)); 362 KASSERT(l->l_stat == LSONPROC); 363 l->l_kpriority = false; 364 l->l_nivcsw++; 365 (void)mi_switch(l); 366 KERNEL_LOCK(l->l_biglocks, l); 367 } 368 369 /* 370 * Handle a request made by another agent to preempt the current LWP 371 * in-kernel. Usually called when l_dopreempt may be non-zero. 372 * 373 * Character addresses for lockstat only. 374 */ 375 static char in_critical_section; 376 static char kernel_lock_held; 377 static char is_softint; 378 379 bool 380 kpreempt(uintptr_t where) 381 { 382 uintptr_t failed; 383 lwp_t *l; 384 int s, dop; 385 386 l = curlwp; 387 failed = 0; 388 while ((dop = l->l_dopreempt) != 0) { 389 if (l->l_stat != LSONPROC) { 390 /* 391 * About to block (or die), let it happen. 392 * Doesn't really count as "preemption has 393 * been blocked", since we're going to 394 * context switch. 395 */ 396 l->l_dopreempt = 0; 397 return true; 398 } 399 if (__predict_false((l->l_flag & LW_IDLE) != 0)) { 400 /* Can't preempt idle loop, don't count as failure. */ 401 l->l_dopreempt = 0; 402 return true; 403 } 404 if (__predict_false(l->l_nopreempt != 0)) { 405 /* LWP holds preemption disabled, explicitly. */ 406 if ((dop & DOPREEMPT_COUNTED) == 0) { 407 kpreempt_ev_crit.ev_count++; 408 } 409 failed = (uintptr_t)&in_critical_section; 410 break; 411 } 412 if (__predict_false((l->l_pflag & LP_INTR) != 0)) { 413 /* Can't preempt soft interrupts yet. */ 414 l->l_dopreempt = 0; 415 failed = (uintptr_t)&is_softint; 416 break; 417 } 418 s = splsched(); 419 if (__predict_false(l->l_blcnt != 0 || 420 curcpu()->ci_biglock_wanted != NULL)) { 421 /* Hold or want kernel_lock, code is not MT safe. */ 422 splx(s); 423 if ((dop & DOPREEMPT_COUNTED) == 0) { 424 kpreempt_ev_klock.ev_count++; 425 } 426 failed = (uintptr_t)&kernel_lock_held; 427 break; 428 } 429 if (__predict_false(!cpu_kpreempt_enter(where, s))) { 430 /* 431 * It may be that the IPL is too high. 432 * kpreempt_enter() can schedule an 433 * interrupt to retry later. 434 */ 435 splx(s); 436 break; 437 } 438 /* Do it! */ 439 if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) { 440 kpreempt_ev_immed.ev_count++; 441 } 442 lwp_lock(l); 443 mi_switch(l); 444 l->l_nopreempt++; 445 splx(s); 446 447 /* Take care of any MD cleanup. */ 448 cpu_kpreempt_exit(where); 449 l->l_nopreempt--; 450 } 451 452 /* Record preemption failure for reporting via lockstat. */ 453 if (__predict_false(failed)) { 454 int lsflag = 0; 455 atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED); 456 LOCKSTAT_ENTER(lsflag); 457 /* Might recurse, make it atomic. */ 458 if (__predict_false(lsflag)) { 459 if (where == 0) { 460 where = (uintptr_t)__builtin_return_address(0); 461 } 462 if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr, 463 NULL, (void *)where) == NULL) { 464 LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime); 465 l->l_pfaillock = failed; 466 } 467 } 468 LOCKSTAT_EXIT(lsflag); 469 } 470 471 return failed; 472 } 473 474 /* 475 * Return true if preemption is explicitly disabled. 476 */ 477 bool 478 kpreempt_disabled(void) 479 { 480 lwp_t *l; 481 482 l = curlwp; 483 484 return l->l_nopreempt != 0 || l->l_stat == LSZOMB || 485 (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled(); 486 } 487 488 /* 489 * Disable kernel preemption. 490 */ 491 void 492 kpreempt_disable(void) 493 { 494 495 KPREEMPT_DISABLE(curlwp); 496 } 497 498 /* 499 * Reenable kernel preemption. 500 */ 501 void 502 kpreempt_enable(void) 503 { 504 505 KPREEMPT_ENABLE(curlwp); 506 } 507 508 /* 509 * Compute the amount of time during which the current lwp was running. 510 * 511 * - update l_rtime unless it's an idle lwp. 512 */ 513 514 void 515 updatertime(lwp_t *l, const struct bintime *now) 516 { 517 518 if ((l->l_flag & LW_IDLE) != 0) 519 return; 520 521 /* rtime += now - stime */ 522 bintime_add(&l->l_rtime, now); 523 bintime_sub(&l->l_rtime, &l->l_stime); 524 } 525 526 /* 527 * Select next LWP from the current CPU to run.. 528 */ 529 static inline lwp_t * 530 nextlwp(struct cpu_info *ci, struct schedstate_percpu *spc) 531 { 532 lwp_t *newl; 533 534 /* 535 * Let sched_nextlwp() select the LWP to run the CPU next. 536 * If no LWP is runnable, select the idle LWP. 537 * 538 * Note that spc_lwplock might not necessary be held, and 539 * new thread would be unlocked after setting the LWP-lock. 540 */ 541 newl = sched_nextlwp(); 542 if (newl != NULL) { 543 sched_dequeue(newl); 544 KASSERT(lwp_locked(newl, spc->spc_mutex)); 545 newl->l_stat = LSONPROC; 546 newl->l_cpu = ci; 547 newl->l_pflag |= LP_RUNNING; 548 lwp_setlock(newl, spc->spc_lwplock); 549 } else { 550 newl = ci->ci_data.cpu_idlelwp; 551 newl->l_stat = LSONPROC; 552 newl->l_pflag |= LP_RUNNING; 553 } 554 555 /* 556 * Only clear want_resched if there are no pending (slow) 557 * software interrupts. 558 */ 559 ci->ci_want_resched = ci->ci_data.cpu_softints; 560 spc->spc_flags &= ~SPCF_SWITCHCLEAR; 561 spc->spc_curpriority = lwp_eprio(newl); 562 563 return newl; 564 } 565 566 /* 567 * The machine independent parts of context switch. 568 * 569 * Returns 1 if another LWP was actually run. 570 */ 571 int 572 mi_switch(lwp_t *l) 573 { 574 struct cpu_info *ci; 575 struct schedstate_percpu *spc; 576 struct lwp *newl; 577 int retval, oldspl; 578 struct bintime bt; 579 bool returning; 580 581 KASSERT(lwp_locked(l, NULL)); 582 KASSERT(kpreempt_disabled()); 583 LOCKDEBUG_BARRIER(l->l_mutex, 1); 584 585 #ifdef KSTACK_CHECK_MAGIC 586 kstack_check_magic(l); 587 #endif 588 589 binuptime(&bt); 590 591 KASSERT(l->l_cpu == curcpu()); 592 ci = l->l_cpu; 593 spc = &ci->ci_schedstate; 594 returning = false; 595 newl = NULL; 596 597 /* 598 * If we have been asked to switch to a specific LWP, then there 599 * is no need to inspect the run queues. If a soft interrupt is 600 * blocking, then return to the interrupted thread without adjusting 601 * VM context or its start time: neither have been changed in order 602 * to take the interrupt. 603 */ 604 if (l->l_switchto != NULL) { 605 if ((l->l_pflag & LP_INTR) != 0) { 606 returning = true; 607 softint_block(l); 608 if ((l->l_pflag & LP_TIMEINTR) != 0) 609 updatertime(l, &bt); 610 } 611 newl = l->l_switchto; 612 l->l_switchto = NULL; 613 } 614 #ifndef __HAVE_FAST_SOFTINTS 615 else if (ci->ci_data.cpu_softints != 0) { 616 /* There are pending soft interrupts, so pick one. */ 617 newl = softint_picklwp(); 618 newl->l_stat = LSONPROC; 619 newl->l_pflag |= LP_RUNNING; 620 } 621 #endif /* !__HAVE_FAST_SOFTINTS */ 622 623 /* Count time spent in current system call */ 624 if (!returning) { 625 SYSCALL_TIME_SLEEP(l); 626 627 /* 628 * XXXSMP If we are using h/w performance counters, 629 * save context. 630 */ 631 #if PERFCTRS 632 if (PMC_ENABLED(l->l_proc)) { 633 pmc_save_context(l->l_proc); 634 } 635 #endif 636 updatertime(l, &bt); 637 } 638 639 /* Lock the runqueue */ 640 KASSERT(l->l_stat != LSRUN); 641 mutex_spin_enter(spc->spc_mutex); 642 643 /* 644 * If on the CPU and we have gotten this far, then we must yield. 645 */ 646 if (l->l_stat == LSONPROC && l != newl) { 647 KASSERT(lwp_locked(l, spc->spc_lwplock)); 648 if ((l->l_flag & LW_IDLE) == 0) { 649 l->l_stat = LSRUN; 650 lwp_setlock(l, spc->spc_mutex); 651 sched_enqueue(l, true); 652 /* Handle migration case */ 653 KASSERT(spc->spc_migrating == NULL); 654 if (l->l_target_cpu != NULL) { 655 spc->spc_migrating = l; 656 } 657 } else 658 l->l_stat = LSIDL; 659 } 660 661 /* Pick new LWP to run. */ 662 if (newl == NULL) { 663 newl = nextlwp(ci, spc); 664 } 665 666 /* Items that must be updated with the CPU locked. */ 667 if (!returning) { 668 /* Update the new LWP's start time. */ 669 newl->l_stime = bt; 670 671 /* 672 * ci_curlwp changes when a fast soft interrupt occurs. 673 * We use cpu_onproc to keep track of which kernel or 674 * user thread is running 'underneath' the software 675 * interrupt. This is important for time accounting, 676 * itimers and forcing user threads to preempt (aston). 677 */ 678 ci->ci_data.cpu_onproc = newl; 679 } 680 681 /* 682 * Preemption related tasks. Must be done with the current 683 * CPU locked. 684 */ 685 cpu_did_resched(l); 686 l->l_dopreempt = 0; 687 if (__predict_false(l->l_pfailaddr != 0)) { 688 LOCKSTAT_FLAG(lsflag); 689 LOCKSTAT_ENTER(lsflag); 690 LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime); 691 LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN, 692 1, l->l_pfailtime, l->l_pfailaddr); 693 LOCKSTAT_EXIT(lsflag); 694 l->l_pfailtime = 0; 695 l->l_pfaillock = 0; 696 l->l_pfailaddr = 0; 697 } 698 699 if (l != newl) { 700 struct lwp *prevlwp; 701 702 /* Release all locks, but leave the current LWP locked */ 703 if (l->l_mutex == spc->spc_mutex) { 704 /* 705 * Drop spc_lwplock, if the current LWP has been moved 706 * to the run queue (it is now locked by spc_mutex). 707 */ 708 mutex_spin_exit(spc->spc_lwplock); 709 } else { 710 /* 711 * Otherwise, drop the spc_mutex, we are done with the 712 * run queues. 713 */ 714 mutex_spin_exit(spc->spc_mutex); 715 } 716 717 /* 718 * Mark that context switch is going to be performed 719 * for this LWP, to protect it from being switched 720 * to on another CPU. 721 */ 722 KASSERT(l->l_ctxswtch == 0); 723 l->l_ctxswtch = 1; 724 l->l_ncsw++; 725 l->l_pflag &= ~LP_RUNNING; 726 727 /* 728 * Increase the count of spin-mutexes before the release 729 * of the last lock - we must remain at IPL_SCHED during 730 * the context switch. 731 */ 732 oldspl = MUTEX_SPIN_OLDSPL(ci); 733 ci->ci_mtx_count--; 734 lwp_unlock(l); 735 736 /* Count the context switch on this CPU. */ 737 ci->ci_data.cpu_nswtch++; 738 739 /* Update status for lwpctl, if present. */ 740 if (l->l_lwpctl != NULL) 741 l->l_lwpctl->lc_curcpu = LWPCTL_CPU_NONE; 742 743 /* 744 * Save old VM context, unless a soft interrupt 745 * handler is blocking. 746 */ 747 if (!returning) 748 pmap_deactivate(l); 749 750 /* 751 * We may need to spin-wait for if 'newl' is still 752 * context switching on another CPU. 753 */ 754 if (newl->l_ctxswtch != 0) { 755 u_int count; 756 count = SPINLOCK_BACKOFF_MIN; 757 while (newl->l_ctxswtch) 758 SPINLOCK_BACKOFF(count); 759 } 760 761 /* Switch to the new LWP.. */ 762 prevlwp = cpu_switchto(l, newl, returning); 763 ci = curcpu(); 764 765 /* 766 * Switched away - we have new curlwp. 767 * Restore VM context and IPL. 768 */ 769 pmap_activate(l); 770 if (prevlwp != NULL) { 771 /* Normalize the count of the spin-mutexes */ 772 ci->ci_mtx_count++; 773 /* Unmark the state of context switch */ 774 membar_exit(); 775 prevlwp->l_ctxswtch = 0; 776 } 777 778 /* Update status for lwpctl, if present. */ 779 if (l->l_lwpctl != NULL) { 780 l->l_lwpctl->lc_curcpu = (int)cpu_index(ci); 781 l->l_lwpctl->lc_pctr++; 782 } 783 784 KASSERT(l->l_cpu == ci); 785 splx(oldspl); 786 retval = 1; 787 } else { 788 /* Nothing to do - just unlock and return. */ 789 mutex_spin_exit(spc->spc_mutex); 790 lwp_unlock(l); 791 retval = 0; 792 } 793 794 KASSERT(l == curlwp); 795 KASSERT(l->l_stat == LSONPROC); 796 797 /* 798 * XXXSMP If we are using h/w performance counters, restore context. 799 * XXXSMP preemption problem. 800 */ 801 #if PERFCTRS 802 if (PMC_ENABLED(l->l_proc)) { 803 pmc_restore_context(l->l_proc); 804 } 805 #endif 806 SYSCALL_TIME_WAKEUP(l); 807 LOCKDEBUG_BARRIER(NULL, 1); 808 809 return retval; 810 } 811 812 /* 813 * The machine independent parts of context switch to oblivion. 814 * Does not return. Call with the LWP unlocked. 815 */ 816 void 817 lwp_exit_switchaway(lwp_t *l) 818 { 819 struct cpu_info *ci; 820 struct lwp *newl; 821 struct bintime bt; 822 823 ci = l->l_cpu; 824 825 KASSERT(kpreempt_disabled()); 826 KASSERT(l->l_stat == LSZOMB || l->l_stat == LSIDL); 827 KASSERT(ci == curcpu()); 828 LOCKDEBUG_BARRIER(NULL, 0); 829 830 #ifdef KSTACK_CHECK_MAGIC 831 kstack_check_magic(l); 832 #endif 833 834 /* Count time spent in current system call */ 835 SYSCALL_TIME_SLEEP(l); 836 binuptime(&bt); 837 updatertime(l, &bt); 838 839 /* Must stay at IPL_SCHED even after releasing run queue lock. */ 840 (void)splsched(); 841 842 /* 843 * Let sched_nextlwp() select the LWP to run the CPU next. 844 * If no LWP is runnable, select the idle LWP. 845 * 846 * Note that spc_lwplock might not necessary be held, and 847 * new thread would be unlocked after setting the LWP-lock. 848 */ 849 spc_lock(ci); 850 #ifndef __HAVE_FAST_SOFTINTS 851 if (ci->ci_data.cpu_softints != 0) { 852 /* There are pending soft interrupts, so pick one. */ 853 newl = softint_picklwp(); 854 newl->l_stat = LSONPROC; 855 newl->l_pflag |= LP_RUNNING; 856 } else 857 #endif /* !__HAVE_FAST_SOFTINTS */ 858 { 859 newl = nextlwp(ci, &ci->ci_schedstate); 860 } 861 862 /* Update the new LWP's start time. */ 863 newl->l_stime = bt; 864 l->l_pflag &= ~LP_RUNNING; 865 866 /* 867 * ci_curlwp changes when a fast soft interrupt occurs. 868 * We use cpu_onproc to keep track of which kernel or 869 * user thread is running 'underneath' the software 870 * interrupt. This is important for time accounting, 871 * itimers and forcing user threads to preempt (aston). 872 */ 873 ci->ci_data.cpu_onproc = newl; 874 875 /* 876 * Preemption related tasks. Must be done with the current 877 * CPU locked. 878 */ 879 cpu_did_resched(l); 880 881 /* Unlock the run queue. */ 882 spc_unlock(ci); 883 884 /* Count the context switch on this CPU. */ 885 ci->ci_data.cpu_nswtch++; 886 887 /* Update status for lwpctl, if present. */ 888 if (l->l_lwpctl != NULL) 889 l->l_lwpctl->lc_curcpu = LWPCTL_CPU_EXITED; 890 891 /* 892 * We may need to spin-wait for if 'newl' is still 893 * context switching on another CPU. 894 */ 895 if (newl->l_ctxswtch != 0) { 896 u_int count; 897 count = SPINLOCK_BACKOFF_MIN; 898 while (newl->l_ctxswtch) 899 SPINLOCK_BACKOFF(count); 900 } 901 902 /* Switch to the new LWP.. */ 903 (void)cpu_switchto(NULL, newl, false); 904 905 for (;;) continue; /* XXX: convince gcc about "noreturn" */ 906 /* NOTREACHED */ 907 } 908 909 /* 910 * Change LWP state to be runnable, placing it on the run queue if it is 911 * in memory, and awakening the swapper if it isn't in memory. 912 * 913 * Call with the process and LWP locked. Will return with the LWP unlocked. 914 */ 915 void 916 setrunnable(struct lwp *l) 917 { 918 struct proc *p = l->l_proc; 919 struct cpu_info *ci; 920 921 KASSERT((l->l_flag & LW_IDLE) == 0); 922 KASSERT(mutex_owned(p->p_lock)); 923 KASSERT(lwp_locked(l, NULL)); 924 KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex); 925 926 switch (l->l_stat) { 927 case LSSTOP: 928 /* 929 * If we're being traced (possibly because someone attached us 930 * while we were stopped), check for a signal from the debugger. 931 */ 932 if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) 933 signotify(l); 934 p->p_nrlwps++; 935 break; 936 case LSSUSPENDED: 937 l->l_flag &= ~LW_WSUSPEND; 938 p->p_nrlwps++; 939 cv_broadcast(&p->p_lwpcv); 940 break; 941 case LSSLEEP: 942 KASSERT(l->l_wchan != NULL); 943 break; 944 default: 945 panic("setrunnable: lwp %p state was %d", l, l->l_stat); 946 } 947 948 #ifdef KERN_SA 949 if (l->l_proc->p_sa) 950 sa_awaken(l); 951 #endif /* KERN_SA */ 952 953 /* 954 * If the LWP was sleeping interruptably, then it's OK to start it 955 * again. If not, mark it as still sleeping. 956 */ 957 if (l->l_wchan != NULL) { 958 l->l_stat = LSSLEEP; 959 /* lwp_unsleep() will release the lock. */ 960 lwp_unsleep(l, true); 961 return; 962 } 963 964 /* 965 * If the LWP is still on the CPU, mark it as LSONPROC. It may be 966 * about to call mi_switch(), in which case it will yield. 967 */ 968 if ((l->l_pflag & LP_RUNNING) != 0) { 969 l->l_stat = LSONPROC; 970 l->l_slptime = 0; 971 lwp_unlock(l); 972 return; 973 } 974 975 /* 976 * Look for a CPU to run. 977 * Set the LWP runnable. 978 */ 979 ci = sched_takecpu(l); 980 l->l_cpu = ci; 981 spc_lock(ci); 982 lwp_unlock_to(l, ci->ci_schedstate.spc_mutex); 983 sched_setrunnable(l); 984 l->l_stat = LSRUN; 985 l->l_slptime = 0; 986 987 /* 988 * If thread is swapped out - wake the swapper to bring it back in. 989 * Otherwise, enter it into a run queue. 990 */ 991 if (l->l_flag & LW_INMEM) { 992 sched_enqueue(l, false); 993 resched_cpu(l); 994 lwp_unlock(l); 995 } else { 996 lwp_unlock(l); 997 uvm_kick_scheduler(); 998 } 999 } 1000 1001 /* 1002 * suspendsched: 1003 * 1004 * Convert all non-L_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED. 1005 */ 1006 void 1007 suspendsched(void) 1008 { 1009 CPU_INFO_ITERATOR cii; 1010 struct cpu_info *ci; 1011 struct lwp *l; 1012 struct proc *p; 1013 1014 /* 1015 * We do this by process in order not to violate the locking rules. 1016 */ 1017 mutex_enter(proc_lock); 1018 PROCLIST_FOREACH(p, &allproc) { 1019 if ((p->p_flag & PK_MARKER) != 0) 1020 continue; 1021 1022 mutex_enter(p->p_lock); 1023 if ((p->p_flag & PK_SYSTEM) != 0) { 1024 mutex_exit(p->p_lock); 1025 continue; 1026 } 1027 1028 p->p_stat = SSTOP; 1029 1030 LIST_FOREACH(l, &p->p_lwps, l_sibling) { 1031 if (l == curlwp) 1032 continue; 1033 1034 lwp_lock(l); 1035 1036 /* 1037 * Set L_WREBOOT so that the LWP will suspend itself 1038 * when it tries to return to user mode. We want to 1039 * try and get to get as many LWPs as possible to 1040 * the user / kernel boundary, so that they will 1041 * release any locks that they hold. 1042 */ 1043 l->l_flag |= (LW_WREBOOT | LW_WSUSPEND); 1044 1045 if (l->l_stat == LSSLEEP && 1046 (l->l_flag & LW_SINTR) != 0) { 1047 /* setrunnable() will release the lock. */ 1048 setrunnable(l); 1049 continue; 1050 } 1051 1052 lwp_unlock(l); 1053 } 1054 1055 mutex_exit(p->p_lock); 1056 } 1057 mutex_exit(proc_lock); 1058 1059 /* 1060 * Kick all CPUs to make them preempt any LWPs running in user mode. 1061 * They'll trap into the kernel and suspend themselves in userret(). 1062 */ 1063 for (CPU_INFO_FOREACH(cii, ci)) { 1064 spc_lock(ci); 1065 cpu_need_resched(ci, RESCHED_IMMED); 1066 spc_unlock(ci); 1067 } 1068 } 1069 1070 /* 1071 * sched_unsleep: 1072 * 1073 * The is called when the LWP has not been awoken normally but instead 1074 * interrupted: for example, if the sleep timed out. Because of this, 1075 * it's not a valid action for running or idle LWPs. 1076 */ 1077 static u_int 1078 sched_unsleep(struct lwp *l, bool cleanup) 1079 { 1080 1081 lwp_unlock(l); 1082 panic("sched_unsleep"); 1083 } 1084 1085 static void 1086 resched_cpu(struct lwp *l) 1087 { 1088 struct cpu_info *ci = ci = l->l_cpu; 1089 1090 KASSERT(lwp_locked(l, NULL)); 1091 if (lwp_eprio(l) > ci->ci_schedstate.spc_curpriority) 1092 cpu_need_resched(ci, 0); 1093 } 1094 1095 static void 1096 sched_changepri(struct lwp *l, pri_t pri) 1097 { 1098 1099 KASSERT(lwp_locked(l, NULL)); 1100 1101 if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) { 1102 KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex)); 1103 sched_dequeue(l); 1104 l->l_priority = pri; 1105 sched_enqueue(l, false); 1106 } else { 1107 l->l_priority = pri; 1108 } 1109 resched_cpu(l); 1110 } 1111 1112 static void 1113 sched_lendpri(struct lwp *l, pri_t pri) 1114 { 1115 1116 KASSERT(lwp_locked(l, NULL)); 1117 1118 if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) { 1119 KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex)); 1120 sched_dequeue(l); 1121 l->l_inheritedprio = pri; 1122 sched_enqueue(l, false); 1123 } else { 1124 l->l_inheritedprio = pri; 1125 } 1126 resched_cpu(l); 1127 } 1128 1129 struct lwp * 1130 syncobj_noowner(wchan_t wchan) 1131 { 1132 1133 return NULL; 1134 } 1135 1136 /* Decay 95% of proc::p_pctcpu in 60 seconds, ccpu = exp(-1/20) */ 1137 const fixpt_t ccpu = 0.95122942450071400909 * FSCALE; 1138 1139 /* 1140 * sched_pstats: 1141 * 1142 * Update process statistics and check CPU resource allocation. 1143 * Call scheduler-specific hook to eventually adjust process/LWP 1144 * priorities. 1145 */ 1146 /* ARGSUSED */ 1147 void 1148 sched_pstats(void *arg) 1149 { 1150 const int clkhz = (stathz != 0 ? stathz : hz); 1151 struct rlimit *rlim; 1152 struct lwp *l; 1153 struct proc *p; 1154 long runtm; 1155 fixpt_t lpctcpu; 1156 u_int lcpticks; 1157 int sig; 1158 1159 sched_pstats_ticks++; 1160 1161 mutex_enter(proc_lock); 1162 PROCLIST_FOREACH(p, &allproc) { 1163 if (__predict_false((p->p_flag & PK_MARKER) != 0)) 1164 continue; 1165 1166 /* 1167 * Increment time in/out of memory and sleep 1168 * time (if sleeping), ignore overflow. 1169 */ 1170 mutex_enter(p->p_lock); 1171 runtm = p->p_rtime.sec; 1172 LIST_FOREACH(l, &p->p_lwps, l_sibling) { 1173 if (__predict_false((l->l_flag & LW_IDLE) != 0)) 1174 continue; 1175 lwp_lock(l); 1176 runtm += l->l_rtime.sec; 1177 l->l_swtime++; 1178 sched_lwp_stats(l); 1179 lwp_unlock(l); 1180 1181 l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT; 1182 if (l->l_slptime != 0) 1183 continue; 1184 1185 lpctcpu = l->l_pctcpu; 1186 lcpticks = atomic_swap_uint(&l->l_cpticks, 0); 1187 lpctcpu += ((FSCALE - ccpu) * 1188 (lcpticks * FSCALE / clkhz)) >> FSHIFT; 1189 l->l_pctcpu = lpctcpu; 1190 } 1191 /* Calculating p_pctcpu only for ps(1) */ 1192 p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT; 1193 1194 /* 1195 * Check if the process exceeds its CPU resource allocation. 1196 * If over max, kill it. 1197 */ 1198 rlim = &p->p_rlimit[RLIMIT_CPU]; 1199 sig = 0; 1200 if (__predict_false(runtm >= rlim->rlim_cur)) { 1201 if (runtm >= rlim->rlim_max) 1202 sig = SIGKILL; 1203 else { 1204 sig = SIGXCPU; 1205 if (rlim->rlim_cur < rlim->rlim_max) 1206 rlim->rlim_cur += 5; 1207 } 1208 } 1209 mutex_exit(p->p_lock); 1210 if (__predict_false(sig)) 1211 psignal(p, sig); 1212 } 1213 mutex_exit(proc_lock); 1214 uvm_meter(); 1215 cv_wakeup(&lbolt); 1216 callout_schedule(&sched_pstats_ch, hz); 1217 } 1218