/*	$NetBSD: kern_synch.c,v 1.267 2009/07/19 10:11:55 yamt Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008, 2009
 *    The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 * Daniel Sieger.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.267 2009/07/19 10:11:55 yamt Exp $");

#include "opt_kstack.h"
#include "opt_perfctrs.h"
#include "opt_sa.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/cpu.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/sa.h>
#include <sys/savar.h>
#include <sys/syscall_stats.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/lwpctl.h>
#include <sys/atomic.h>
#include <sys/simplelock.h>

#include <uvm/uvm_extern.h>

#include <dev/lockstat.h>

static u_int	sched_unsleep(struct lwp *, bool);
static void	sched_changepri(struct lwp *, pri_t);
static void	sched_lendpri(struct lwp *, pri_t);
static void	resched_cpu(struct lwp *);

syncobj_t sleep_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sleepq_unsleep,
	sleepq_changepri,
	sleepq_lendpri,
	syncobj_noowner,
};

syncobj_t sched_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sched_unsleep,
	sched_changepri,
	sched_lendpri,
	syncobj_noowner,
};

callout_t	sched_pstats_ch;
unsigned	sched_pstats_ticks;
kcondvar_t	lbolt;			/* once a second sleep address */

/* Preemption event counters */
static struct evcnt kpreempt_ev_crit;
static struct evcnt kpreempt_ev_klock;
static struct evcnt kpreempt_ev_immed;

/*
 * During autoconfiguration or after a panic, a sleep will simply lower the
 * priority briefly to allow interrupts, then return.  The priority to be
 * used (safepri) is machine-dependent, thus this value is initialized and
 * maintained in the machine-dependent layers.  This priority will typically
 * be 0, or the lowest priority that is safe for use on the interrupt stack;
 * it can be made higher to block network software interrupts after panics.
 */
int	safepri;

void
sched_init(void)
{

	cv_init(&lbolt, "lbolt");
	callout_init(&sched_pstats_ch, CALLOUT_MPSAFE);
	callout_setfunc(&sched_pstats_ch, sched_pstats, NULL);

	evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: critical section");
	evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: kernel_lock");
	evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "immediate");

	sched_pstats(NULL);
}

/*
 * OBSOLETE INTERFACE
 *
 * General sleep call.  Suspends the current LWP until a wakeup is
 * performed on the specified identifier.  The LWP will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
 * means no timeout).  If pri includes PCATCH flag, signals are checked
 * before and after sleeping, else signals are not checked.  Returns 0 if
 * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 * signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the system
 * call should be interrupted by the signal.
 *
 * The interlock is held until we are on a sleep queue.  The interlock will
 * be locked before returning to the caller unless the PNORELOCK flag
 * is specified, in which case the interlock will always be unlocked upon
 * return.
 */
int
ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		if ((priority & PNORELOCK) != 0)
			simple_unlock(interlock);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);

	if (interlock != NULL) {
		KASSERT(simple_lock_held(interlock));
		simple_unlock(interlock);
	}

	error = sleepq_block(timo, priority & PCATCH);

	if (interlock != NULL && (priority & PNORELOCK) == 0)
		simple_lock(interlock);

	return error;
}

int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
	mutex_exit(mtx);
	error = sleepq_block(timo, priority & PCATCH);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	kmutex_t *mp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, l, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, l, wmesg, &sleep_syncobj);
	error = sleepq_block(timo, intr);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}
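
/*
 * Illustrative sketch (not part of this file): the sleep interfaces above
 * are normally paired with wakeup()/wakeup_one() below; new code should
 * prefer condition variables.  The "example_softc" structure, its "sc_lock"
 * mutex and "sc_ready" flag are hypothetical names used only here.
 * mtsleep() drops and re-takes sc_lock around the sleep and returns 0 on
 * wakeup(sc), EWOULDBLOCK on timeout, or an error if a signal is caught
 * (PCATCH).
 *
 *	static int
 *	example_wait_ready(struct example_softc *sc)
 *	{
 *		int error = 0;
 *
 *		mutex_enter(&sc->sc_lock);
 *		while (!sc->sc_ready && error == 0) {
 *			error = mtsleep(sc, PZERO | PCATCH, "exwait", hz,
 *			    &sc->sc_lock);
 *		}
 *		mutex_exit(&sc->sc_lock);
 *		return error;
 *	}
 *
 *	static void
 *	example_mark_ready(struct example_softc *sc)
 *	{
 *
 *		mutex_enter(&sc->sc_lock);
 *		sc->sc_ready = true;
 *		wakeup(sc);
 *		mutex_exit(&sc->sc_lock);
 *	}
 */
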
#ifdef KERN_SA
/*
 * sa_awaken:
 *
 *	We believe this lwp is an SA lwp.  If it's yielding,
 *	let it know it needs to wake up.
 *
 *	We are called and exit with the lwp locked.  We are
 *	called in the middle of wakeup operations, so we must
 *	not touch any locks.
 */
void
sa_awaken(struct lwp *l)
{
	/* LOCK_ASSERT(lwp_locked(l, NULL)); */

	if (l == l->l_savp->savp_lwp && l->l_flag & LW_SA_YIELD)
		l->l_flag &= ~LW_SA_IDLE;
}
#endif /* KERN_SA */

/*
 * OBSOLETE INTERFACE
 *
 * Make all LWPs sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (__predict_false(cold))
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, (u_int)-1, mp);
}

/*
 * OBSOLETE INTERFACE
 *
 * Make the highest priority LWP first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (__predict_false(cold))
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, 1, mp);
}


/*
 * General yield call.  Puts the current LWP back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current LWP explicitly requests it (e.g. sched_yield(2)).
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * General preemption call.  Puts the current LWP back on its run queue
 * and performs an involuntary context switch.
 */
void
preempt(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	l->l_nivcsw++;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * Handle a request made by another agent to preempt the current LWP
 * in-kernel.  Usually called when l_dopreempt may be non-zero.
 *
 * Character addresses for lockstat only.
 */
static char	in_critical_section;
static char	kernel_lock_held;
static char	is_softint;
static char	cpu_kpreempt_enter_fail;

bool
kpreempt(uintptr_t where)
{
	uintptr_t failed;
	lwp_t *l;
	int s, dop, lsflag;

	l = curlwp;
	failed = 0;
	while ((dop = l->l_dopreempt) != 0) {
		if (l->l_stat != LSONPROC) {
			/*
			 * About to block (or die), let it happen.
			 * Doesn't really count as "preemption has
			 * been blocked", since we're going to
			 * context switch.
			 */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false((l->l_flag & LW_IDLE) != 0)) {
			/* Can't preempt idle loop, don't count as failure. */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false(l->l_nopreempt != 0)) {
			/* LWP holds preemption disabled, explicitly. */
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_crit.ev_count++;
			}
			failed = (uintptr_t)&in_critical_section;
			break;
		}
		if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
			/* Can't preempt soft interrupts yet. */
			l->l_dopreempt = 0;
			failed = (uintptr_t)&is_softint;
			break;
		}
		s = splsched();
		if (__predict_false(l->l_blcnt != 0 ||
		    curcpu()->ci_biglock_wanted != NULL)) {
			/* Hold or want kernel_lock, code is not MT safe. */
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_klock.ev_count++;
			}
			failed = (uintptr_t)&kernel_lock_held;
			break;
		}
		if (__predict_false(!cpu_kpreempt_enter(where, s))) {
			/*
			 * It may be that the IPL is too high.
			 * cpu_kpreempt_enter() can schedule an
			 * interrupt to retry later.
			 */
			splx(s);
			failed = (uintptr_t)&cpu_kpreempt_enter_fail;
			break;
		}
		/* Do it! */
		if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
			kpreempt_ev_immed.ev_count++;
		}
		lwp_lock(l);
		mi_switch(l);
		l->l_nopreempt++;
		splx(s);

		/* Take care of any MD cleanup. */
		cpu_kpreempt_exit(where);
		l->l_nopreempt--;
	}

	if (__predict_true(!failed)) {
		return false;
	}

	/* Record preemption failure for reporting via lockstat. */
	atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
	lsflag = 0;
	LOCKSTAT_ENTER(lsflag);
	if (__predict_false(lsflag)) {
		if (where == 0) {
			where = (uintptr_t)__builtin_return_address(0);
		}
		/* Preemption is on, might recurse, so make it atomic. */
		if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr, NULL,
		    (void *)where) == NULL) {
			LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
			l->l_pfaillock = failed;
		}
	}
	LOCKSTAT_EXIT(lsflag);
	return true;
}

/*
 * Return true if preemption is explicitly disabled.
 */
bool
kpreempt_disabled(void)
{
	const lwp_t *l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
}

/*
 * Disable kernel preemption.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

/*
 * Reenable kernel preemption.
 */
void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}
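
/*
 * Illustrative sketch (not part of this file): code that briefly needs to
 * stay on its current CPU, e.g. while touching curcpu()-private state,
 * brackets the access with the pair above.  The function name and the
 * "ci_example_count" field are hypothetical and exist only for this
 * example; kpreempt_disable() keeps the LWP from being preempted (and so
 * from migrating) until the matching kpreempt_enable().
 *
 *	static void
 *	example_percpu_bump(void)
 *	{
 *		struct cpu_info *ci;
 *
 *		kpreempt_disable();
 *		ci = curcpu();
 *		ci->ci_example_count++;
 *		kpreempt_enable();
 *	}
 */
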
/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */

void
updatertime(lwp_t *l, const struct bintime *now)
{

	if (__predict_false(l->l_flag & LW_IDLE))
		return;

	/* rtime += now - stime */
	bintime_add(&l->l_rtime, now);
	bintime_sub(&l->l_rtime, &l->l_stime);
}

/*
 * Select next LWP from the current CPU to run.
 */
static inline lwp_t *
nextlwp(struct cpu_info *ci, struct schedstate_percpu *spc)
{
	lwp_t *newl;

	/*
	 * Let sched_nextlwp() select the LWP to run the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * the new thread would be unlocked after setting the LWP-lock.
	 */
	newl = sched_nextlwp();
	if (newl != NULL) {
		sched_dequeue(newl);
		KASSERT(lwp_locked(newl, spc->spc_mutex));
		newl->l_stat = LSONPROC;
		newl->l_cpu = ci;
		newl->l_pflag |= LP_RUNNING;
		lwp_setlock(newl, spc->spc_lwplock);
	} else {
		newl = ci->ci_data.cpu_idlelwp;
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}

	/*
	 * Only clear want_resched if there are no pending (slow)
	 * software interrupts.
	 */
	ci->ci_want_resched = ci->ci_data.cpu_softints;
	spc->spc_flags &= ~SPCF_SWITCHCLEAR;
	spc->spc_curpriority = lwp_eprio(newl);

	return newl;
}

/*
 * The machine independent parts of context switch.
 *
 * Returns 1 if another LWP was actually run.
 */
int
mi_switch(lwp_t *l)
{
	struct cpu_info *ci;
	struct schedstate_percpu *spc;
	struct lwp *newl;
	int retval, oldspl;
	struct bintime bt;
	bool returning;

	KASSERT(lwp_locked(l, NULL));
	KASSERT(kpreempt_disabled());
	LOCKDEBUG_BARRIER(l->l_mutex, 1);

	kstack_check_magic(l);

	binuptime(&bt);

	KASSERT((l->l_pflag & LP_RUNNING) != 0);
	KASSERT(l->l_cpu == curcpu());
	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	returning = false;
	newl = NULL;

	/*
	 * If we have been asked to switch to a specific LWP, then there
	 * is no need to inspect the run queues.  If a soft interrupt is
	 * blocking, then return to the interrupted thread without adjusting
	 * VM context or its start time: neither has been changed in order
	 * to take the interrupt.
	 */
	if (l->l_switchto != NULL) {
		if ((l->l_pflag & LP_INTR) != 0) {
			returning = true;
			softint_block(l);
			if ((l->l_pflag & LP_TIMEINTR) != 0)
				updatertime(l, &bt);
		}
		newl = l->l_switchto;
		l->l_switchto = NULL;
	}
#ifndef __HAVE_FAST_SOFTINTS
	else if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}
#endif	/* !__HAVE_FAST_SOFTINTS */

	/* Count time spent in current system call */
	if (!returning) {
		SYSCALL_TIME_SLEEP(l);

		/*
		 * XXXSMP If we are using h/w performance counters,
		 * save context.
		 */
#if PERFCTRS
		if (PMC_ENABLED(l->l_proc)) {
			pmc_save_context(l->l_proc);
		}
#endif
		updatertime(l, &bt);
	}

	/* Lock the runqueue */
	KASSERT(l->l_stat != LSRUN);
	mutex_spin_enter(spc->spc_mutex);

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	if (l->l_stat == LSONPROC && l != newl) {
		KASSERT(lwp_locked(l, spc->spc_lwplock));
		if ((l->l_flag & LW_IDLE) == 0) {
			l->l_stat = LSRUN;
			lwp_setlock(l, spc->spc_mutex);
			sched_enqueue(l, true);
			/* Handle migration case */
			KASSERT(spc->spc_migrating == NULL);
			if (l->l_target_cpu != NULL) {
				spc->spc_migrating = l;
			}
		} else
			l->l_stat = LSIDL;
	}

	/* Pick new LWP to run. */
	if (newl == NULL) {
		newl = nextlwp(ci, spc);
	}

	/* Items that must be updated with the CPU locked. */
	if (!returning) {
		/* Update the new LWP's start time. */
		newl->l_stime = bt;

		/*
		 * ci_curlwp changes when a fast soft interrupt occurs.
		 * We use cpu_onproc to keep track of which kernel or
		 * user thread is running 'underneath' the software
		 * interrupt.  This is important for time accounting,
		 * itimers and forcing user threads to preempt (aston).
		 */
		ci->ci_data.cpu_onproc = newl;
	}

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
	 */
	cpu_did_resched(l);
	l->l_dopreempt = 0;
	if (__predict_false(l->l_pfailaddr != 0)) {
		LOCKSTAT_FLAG(lsflag);
		LOCKSTAT_ENTER(lsflag);
		LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
		LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
		    1, l->l_pfailtime, l->l_pfailaddr);
		LOCKSTAT_EXIT(lsflag);
		l->l_pfailtime = 0;
		l->l_pfaillock = 0;
		l->l_pfailaddr = 0;
	}

	if (l != newl) {
		struct lwp *prevlwp;

		/* Release all locks, but leave the current LWP locked */
		if (l->l_mutex == spc->spc_mutex) {
			/*
			 * Drop spc_lwplock, if the current LWP has been moved
			 * to the run queue (it is now locked by spc_mutex).
			 */
			mutex_spin_exit(spc->spc_lwplock);
		} else {
			/*
			 * Otherwise, drop the spc_mutex, we are done with the
			 * run queues.
			 */
			mutex_spin_exit(spc->spc_mutex);
		}

		/*
		 * Mark that context switch is going to be performed
		 * for this LWP, to protect it from being switched
		 * to on another CPU.
		 */
		KASSERT(l->l_ctxswtch == 0);
		l->l_ctxswtch = 1;
		l->l_ncsw++;
		KASSERT((l->l_pflag & LP_RUNNING) != 0);
		l->l_pflag &= ~LP_RUNNING;

		/*
		 * Increase the count of spin-mutexes before the release
		 * of the last lock - we must remain at IPL_SCHED during
		 * the context switch.
		 */
		oldspl = MUTEX_SPIN_OLDSPL(ci);
		ci->ci_mtx_count--;
		lwp_unlock(l);

		/* Count the context switch on this CPU. */
		ci->ci_data.cpu_nswtch++;

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL)
			l->l_lwpctl->lc_curcpu = LWPCTL_CPU_NONE;

		/*
		 * Save old VM context, unless a soft interrupt
		 * handler is blocking.
		 */
		if (!returning)
			pmap_deactivate(l);

		/*
		 * We may need to spin-wait if 'newl' is still
		 * context switching on another CPU.
		 */
		if (__predict_false(newl->l_ctxswtch != 0)) {
			u_int count;
			count = SPINLOCK_BACKOFF_MIN;
			while (newl->l_ctxswtch)
				SPINLOCK_BACKOFF(count);
		}

		/* Switch to the new LWP. */
		prevlwp = cpu_switchto(l, newl, returning);
		ci = curcpu();

		/*
		 * Switched away - we have new curlwp.
		 * Restore VM context and IPL.
		 */
		pmap_activate(l);
		uvm_emap_switch(l);

		if (prevlwp != NULL) {
			/* Normalize the count of the spin-mutexes */
			ci->ci_mtx_count++;
			/* Unmark the state of context switch */
			membar_exit();
			prevlwp->l_ctxswtch = 0;
		}

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL) {
			l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
			l->l_lwpctl->lc_pctr++;
		}

		KASSERT(l->l_cpu == ci);
		splx(oldspl);
		retval = 1;
	} else {
		/* Nothing to do - just unlock and return. */
		mutex_spin_exit(spc->spc_mutex);
		lwp_unlock(l);
		retval = 0;
	}

	KASSERT(l == curlwp);
	KASSERT(l->l_stat == LSONPROC);

	/*
	 * XXXSMP If we are using h/w performance counters, restore context.
	 * XXXSMP preemption problem.
	 */
#if PERFCTRS
	if (PMC_ENABLED(l->l_proc)) {
		pmc_restore_context(l->l_proc);
	}
#endif
	SYSCALL_TIME_WAKEUP(l);
	LOCKDEBUG_BARRIER(NULL, 1);

	return retval;
}
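
/*
 * Illustrative sketch (not part of this file) of the mi_switch() caller
 * contract as exercised by yield()/preempt() above: the LWP is locked on
 * entry and comes back unlocked, and the big lock is dropped around the
 * switch and re-acquired afterwards.  The spin lock taken by lwp_lock()
 * also keeps kernel preemption off across the call.  The function name is
 * hypothetical and exists only for this example.
 *
 *	static void
 *	example_voluntary_switch(void)
 *	{
 *		struct lwp *l = curlwp;
 *
 *		KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
 *		lwp_lock(l);
 *		(void)mi_switch(l);
 *		KERNEL_LOCK(l->l_biglocks, l);
 *	}
 */
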
/*
 * The machine independent parts of context switch to oblivion.
 * Does not return.  Call with the LWP unlocked.
 */
void
lwp_exit_switchaway(lwp_t *l)
{
	struct cpu_info *ci;
	struct lwp *newl;
	struct bintime bt;

	ci = l->l_cpu;

	KASSERT(kpreempt_disabled());
	KASSERT(l->l_stat == LSZOMB || l->l_stat == LSIDL);
	KASSERT(ci == curcpu());
	LOCKDEBUG_BARRIER(NULL, 0);

	kstack_check_magic(l);

	/* Count time spent in current system call */
	SYSCALL_TIME_SLEEP(l);
	binuptime(&bt);
	updatertime(l, &bt);

	/* Must stay at IPL_SCHED even after releasing run queue lock. */
	(void)splsched();

	/*
	 * Let sched_nextlwp() select the LWP to run the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * the new thread would be unlocked after setting the LWP-lock.
	 */
	spc_lock(ci);
#ifndef __HAVE_FAST_SOFTINTS
	if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	} else
#endif	/* !__HAVE_FAST_SOFTINTS */
	{
		newl = nextlwp(ci, &ci->ci_schedstate);
	}

	/* Update the new LWP's start time. */
	newl->l_stime = bt;
	l->l_pflag &= ~LP_RUNNING;

	/*
	 * ci_curlwp changes when a fast soft interrupt occurs.
	 * We use cpu_onproc to keep track of which kernel or
	 * user thread is running 'underneath' the software
	 * interrupt.  This is important for time accounting,
	 * itimers and forcing user threads to preempt (aston).
	 */
	ci->ci_data.cpu_onproc = newl;

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
	 */
	cpu_did_resched(l);

	/* Unlock the run queue. */
	spc_unlock(ci);

	/* Count the context switch on this CPU. */
	ci->ci_data.cpu_nswtch++;

	/* Update status for lwpctl, if present. */
	if (l->l_lwpctl != NULL)
		l->l_lwpctl->lc_curcpu = LWPCTL_CPU_EXITED;

	/*
	 * We may need to spin-wait if 'newl' is still
	 * context switching on another CPU.
	 */
	if (__predict_false(newl->l_ctxswtch != 0)) {
		u_int count;
		count = SPINLOCK_BACKOFF_MIN;
		while (newl->l_ctxswtch)
			SPINLOCK_BACKOFF(count);
	}

	/* Switch to the new LWP. */
	(void)cpu_switchto(NULL, newl, false);

	for (;;) continue;	/* XXX: convince gcc about "noreturn" */
	/* NOTREACHED */
}

/*
 * Change LWP state to be runnable, placing it on the run queue if it is
 * in memory, and awakening the swapper if it isn't in memory.
 *
 * Call with the process and LWP locked.  Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	struct cpu_info *ci;

	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT(mutex_owned(p->p_lock));
	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0)
			signotify(l);
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		l->l_flag &= ~LW_WSUSPEND;
		p->p_nrlwps++;
		cv_broadcast(&p->p_lwpcv);
		break;
	case LSSLEEP:
		KASSERT(l->l_wchan != NULL);
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

#ifdef KERN_SA
	if (l->l_proc->p_sa)
		sa_awaken(l);
#endif /* KERN_SA */

	/*
	 * If the LWP was sleeping interruptibly, then it's OK to start it
	 * again.  If not, mark it as still sleeping.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l, true);
		return;
	}

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_pflag & LP_RUNNING) != 0) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Look for a CPU to run.
	 * Set the LWP runnable.
	 */
	ci = sched_takecpu(l);
	l->l_cpu = ci;
	spc_lock(ci);
	lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
	sched_setrunnable(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;

	/*
	 * If thread is swapped out - wake the swapper to bring it back in.
	 * Otherwise, enter it into a run queue.
	 */
	if (l->l_flag & LW_INMEM) {
		sched_enqueue(l, false);
		resched_cpu(l);
		lwp_unlock(l);
	} else {
		lwp_unlock(l);
		uvm_kick_scheduler();
	}
}

/*
 * suspendsched:
 *
 *	Convert all non-LW_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct lwp *l;
	struct proc *p;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		if ((p->p_flag & PK_MARKER) != 0)
			continue;

		mutex_enter(p->p_lock);
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			continue;
		}

		p->p_stat = SSTOP;

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l == curlwp)
				continue;

			lwp_lock(l);

			/*
			 * Set LW_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode.  We want to
			 * try to get as many LWPs as possible to
			 * the user / kernel boundary, so that they will
			 * release any locks that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */
				setrunnable(l);
				continue;
			}

			lwp_unlock(l);
		}

		mutex_exit(p->p_lock);
	}
	mutex_exit(proc_lock);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
	 */
	for (CPU_INFO_FOREACH(cii, ci)) {
		spc_lock(ci);
		cpu_need_resched(ci, RESCHED_IMMED);
		spc_unlock(ci);
	}
}

/*
 * sched_unsleep:
 *
 *	This is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, if the sleep timed out.  Because of this,
 *	it's not a valid action for running or idle LWPs.
 */
static u_int
sched_unsleep(struct lwp *l, bool cleanup)
{

	lwp_unlock(l);
	panic("sched_unsleep");
}

static void
resched_cpu(struct lwp *l)
{
	struct cpu_info *ci = l->l_cpu;

	KASSERT(lwp_locked(l, NULL));
	if (lwp_eprio(l) > ci->ci_schedstate.spc_curpriority)
		cpu_need_resched(ci, 0);
}

static void
sched_changepri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_priority = pri;
		sched_enqueue(l, false);
	} else {
		l->l_priority = pri;
	}
	resched_cpu(l);
}

static void
sched_lendpri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_inheritedprio = pri;
		sched_enqueue(l, false);
	} else {
		l->l_inheritedprio = pri;
	}
	resched_cpu(l);
}

struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}

/* Decay 95% of proc::p_pctcpu in 60 seconds, ccpu = exp(-1/20) */
const fixpt_t ccpu = 0.95122942450071400909 * FSCALE;
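
/*
 * Worked example (informational): sched_pstats() below scales each
 * l_pctcpu by ccpu roughly once per second, so after 60 seconds an idle
 * LWP's estimate has been multiplied by
 *
 *	exp(-1/20)^60 = exp(-3) ~= 0.05
 *
 * i.e. about 95% of the old value has decayed away, matching the comment
 * above.  The value is kept as a fixed-point fraction scaled by FSCALE
 * (1 << FSHIFT), which is why the update below is written as a multiply
 * followed by a right shift:
 *
 *	l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
 */
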
/*
 * sched_pstats:
 *
 *	Update process statistics and check CPU resource allocation.
 *	Call scheduler-specific hook to eventually adjust process/LWP
 *	priorities.
 */
/* ARGSUSED */
void
sched_pstats(void *arg)
{
	const int clkhz = (stathz != 0 ? stathz : hz);
	static bool backwards;
	struct rlimit *rlim;
	struct lwp *l;
	struct proc *p;
	long runtm;
	fixpt_t lpctcpu;
	u_int lcpticks;
	int sig;

	sched_pstats_ticks++;

	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		if (__predict_false((p->p_flag & PK_MARKER) != 0))
			continue;

		/*
		 * Increment time in/out of memory and sleep
		 * time (if sleeping), ignore overflow.
		 */
		mutex_enter(p->p_lock);
		runtm = p->p_rtime.sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (__predict_false((l->l_flag & LW_IDLE) != 0))
				continue;
			lwp_lock(l);
			runtm += l->l_rtime.sec;
			l->l_swtime++;
			sched_lwp_stats(l);
			lwp_unlock(l);

			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
			if (l->l_slptime != 0)
				continue;

			lpctcpu = l->l_pctcpu;
			lcpticks = atomic_swap_uint(&l->l_cpticks, 0);
			lpctcpu += ((FSCALE - ccpu) *
			    (lcpticks * FSCALE / clkhz)) >> FSHIFT;
			l->l_pctcpu = lpctcpu;
		}
		/* Calculating p_pctcpu only for ps(1) */
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over max, kill it.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		sig = 0;
		if (__predict_false(runtm >= rlim->rlim_cur)) {
			if (runtm >= rlim->rlim_max)
				sig = SIGKILL;
			else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}
		mutex_exit(p->p_lock);
		if (__predict_false(runtm < 0)) {
			if (!backwards) {
				backwards = true;
				printf("WARNING: negative runtime; "
				    "monotonic clock has gone backwards\n");
			}
		} else if (__predict_false(sig)) {
			KASSERT((p->p_flag & PK_SYSTEM) == 0);
			psignal(p, sig);
		}
	}
	mutex_exit(proc_lock);
	uvm_meter();
	cv_wakeup(&lbolt);
	callout_schedule(&sched_pstats_ch, hz);
}