/*	$NetBSD: kern_synch.c,v 1.251 2008/07/25 00:48:59 uwe Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 * Daniel Sieger.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.251 2008/07/25 00:48:59 uwe Exp $");

#include "opt_kstack.h"
#include "opt_perfctrs.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/cpu.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscall_stats.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/lwpctl.h>
#include <sys/atomic.h>
#include <sys/simplelock.h>

#include <uvm/uvm_extern.h>

#include <dev/lockstat.h>

static u_int	sched_unsleep(struct lwp *, bool);
static void	sched_changepri(struct lwp *, pri_t);
static void	sched_lendpri(struct lwp *, pri_t);
static void	resched_cpu(struct lwp *);

syncobj_t sleep_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sleepq_unsleep,
	sleepq_changepri,
	sleepq_lendpri,
	syncobj_noowner,
};

syncobj_t sched_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sched_unsleep,
	sched_changepri,
	sched_lendpri,
	syncobj_noowner,
};

callout_t	sched_pstats_ch;
unsigned	sched_pstats_ticks;
kcondvar_t	lbolt;			/* once a second sleep address */

/* Preemption event counters */
static struct evcnt kpreempt_ev_crit;
static struct evcnt kpreempt_ev_klock;
static struct evcnt kpreempt_ev_ipl;
static struct evcnt kpreempt_ev_immed;

/*
 * During autoconfiguration or after a panic, a sleep will simply lower the
 * priority briefly to allow interrupts, then return.  The priority to be
 * used (safepri) is machine-dependent, thus this value is initialized and
 * maintained in the machine-dependent layers.  This priority will typically
 * be 0, or the lowest priority that is safe for use on the interrupt stack;
 * it can be made higher to block network software interrupts after panics.
 */
int	safepri;

void
sched_init(void)
{

	cv_init(&lbolt, "lbolt");
	callout_init(&sched_pstats_ch, CALLOUT_MPSAFE);
	callout_setfunc(&sched_pstats_ch, sched_pstats, NULL);

	evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: critical section");
	evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: kernel_lock");
	evcnt_attach_dynamic(&kpreempt_ev_ipl, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: IPL");
	evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "immediate");

	sched_pstats(NULL);
}

/*
 * OBSOLETE INTERFACE
 *
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.
 * Sleeps at most timo/hz seconds (0 means no timeout).  If pri includes the
 * PCATCH flag, signals are checked before and after sleeping, otherwise
 * signals are not checked.  Returns 0 if awakened, EWOULDBLOCK if the
 * timeout expires.  If PCATCH is set and a signal needs to be delivered,
 * ERESTART is returned if the current system call should be restarted if
 * possible, and EINTR is returned if the system call should be interrupted
 * by the signal.
 *
 * The interlock is held until we are on a sleep queue.  The interlock will
 * be locked before returning to the caller unless the PNORELOCK flag is
 * specified, in which case the interlock will always be unlocked upon
 * return.
 */
int
ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		if ((priority & PNORELOCK) != 0)
			simple_unlock(interlock);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);

	if (interlock != NULL) {
		KASSERT(simple_lock_held(interlock));
		simple_unlock(interlock);
	}

	error = sleepq_block(timo, priority & PCATCH);

	if (interlock != NULL && (priority & PNORELOCK) == 0)
		simple_lock(interlock);

	return error;
}

int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
	mutex_exit(mtx);
	error = sleepq_block(timo, priority & PCATCH);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	kmutex_t *mp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, l, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, l, wmesg, &sleep_syncobj);
	error = sleepq_block(timo, intr);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}

/*
 * OBSOLETE INTERFACE
 *
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, (u_int)-1, mp);
}
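
/*
 * Illustrative sketch of how these interfaces are typically consumed.  The
 * names "example_lock", "example_ready" and "example_obj" are hypothetical
 * and not defined in this file; new code would normally prefer condition
 * variables over these obsolete calls.
 *
 *	mutex_enter(&example_lock);
 *	while (!example_ready)
 *		(void)mtsleep(&example_obj, 0, "exwait", 0, &example_lock);
 *	mutex_exit(&example_lock);
 *
 *	// Producer side, with example_lock held:
 *	example_ready = true;
 *	wakeup(&example_obj);
 *
 *	// Plain timed delay of half a second, no wakeup expected:
 *	(void)kpause("exslp", false, hz / 2, NULL);
 */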

/*
 * OBSOLETE INTERFACE
 *
 * Make the highest priority process first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, 1, mp);
}


/*
 * General yield call.  Puts the current process back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current process explicitly requests it (e.g. sched_yield(2)).
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * General preemption call.  Puts the current process back on its run queue
 * and performs an involuntary context switch.
 */
void
preempt(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	l->l_nivcsw++;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * Handle a request made by another agent to preempt the current LWP
 * in-kernel.  Usually called when l_dopreempt may be non-zero.
 *
 * Character addresses for lockstat only.
 */
static char	in_critical_section;
static char	kernel_lock_held;
static char	spl_raised;
static char	is_softint;

bool
kpreempt(uintptr_t where)
{
	uintptr_t failed;
	lwp_t *l;
	int s, dop;

	l = curlwp;
	failed = 0;
	while ((dop = l->l_dopreempt) != 0) {
		if (l->l_stat != LSONPROC) {
			/*
			 * About to block (or die), let it happen.
			 * Doesn't really count as "preemption has
			 * been blocked", since we're going to
			 * context switch.
			 */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false((l->l_flag & LW_IDLE) != 0)) {
			/* Can't preempt idle loop, don't count as failure. */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false(l->l_nopreempt != 0)) {
			/* LWP holds preemption disabled, explicitly. */
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_crit.ev_count++;
			}
			failed = (uintptr_t)&in_critical_section;
			break;
		}
		if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
			/* Can't preempt soft interrupts yet. */
			l->l_dopreempt = 0;
			failed = (uintptr_t)&is_softint;
			break;
		}
		s = splsched();
		if (__predict_false(l->l_blcnt != 0 ||
		    curcpu()->ci_biglock_wanted != NULL)) {
			/* Hold or want kernel_lock, code is not MT safe. */
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_klock.ev_count++;
			}
			failed = (uintptr_t)&kernel_lock_held;
			break;
		}
		if (__predict_false(!cpu_kpreempt_enter(where, s))) {
			/*
			 * It may be that the IPL is too high.
			 * cpu_kpreempt_enter() can schedule an
			 * interrupt to retry later.
			 */
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_ipl.ev_count++;
			}
			failed = (uintptr_t)&spl_raised;
			break;
		}
		/* Do it! */
		if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
			kpreempt_ev_immed.ev_count++;
		}
		lwp_lock(l);
		mi_switch(l);
		l->l_nopreempt++;
		splx(s);

		/* Take care of any MD cleanup. */
		cpu_kpreempt_exit(where);
		l->l_nopreempt--;
	}

	/* Record preemption failure for reporting via lockstat. */
	if (__predict_false(failed)) {
		int lsflag = 0;
		atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
		LOCKSTAT_ENTER(lsflag);
		/* Might recurse, make it atomic. */
		if (__predict_false(lsflag)) {
			if (where == 0) {
				where = (uintptr_t)__builtin_return_address(0);
			}
			if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr,
			    NULL, (void *)where) == NULL) {
				LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
				l->l_pfaillock = failed;
			}
		}
		LOCKSTAT_EXIT(lsflag);
	}

	return failed;
}

/*
 * Return true if preemption is explicitly disabled.
 */
bool
kpreempt_disabled(void)
{
	lwp_t *l;

	l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
}

/*
 * Disable kernel preemption.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

/*
 * Reenable kernel preemption.
 */
void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}
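
/*
 * Illustrative sketch of the disable/enable pairing (hypothetical code, not
 * part of this file): holding off preemption keeps the LWP on the current
 * CPU, so per-CPU data obtained from curcpu() stays valid.
 *
 *	kpreempt_disable();
 *	struct cpu_info *ci = curcpu();
 *	uint64_t nswitch = ci->ci_data.cpu_nswtch;	// stable CPU reference
 *	kpreempt_enable();
 *
 * Such sections should be kept short; while preemption is disabled, any
 * request is deferred and counted by kpreempt() against kpreempt_ev_crit.
 */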

/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */

void
updatertime(lwp_t *l, const struct bintime *now)
{

	if ((l->l_flag & LW_IDLE) != 0)
		return;

	/* rtime += now - stime */
	bintime_add(&l->l_rtime, now);
	bintime_sub(&l->l_rtime, &l->l_stime);
}

/*
 * Select the next LWP to run on the current CPU.
 */
static inline lwp_t *
nextlwp(struct cpu_info *ci, struct schedstate_percpu *spc)
{
	lwp_t *newl;

	/*
	 * Let sched_nextlwp() select the LWP to run on the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * the new thread would be unlocked after setting the LWP-lock.
	 */
	newl = sched_nextlwp();
	if (newl != NULL) {
		sched_dequeue(newl);
		KASSERT(lwp_locked(newl, spc->spc_mutex));
		newl->l_stat = LSONPROC;
		newl->l_cpu = ci;
		newl->l_pflag |= LP_RUNNING;
		lwp_setlock(newl, spc->spc_lwplock);
	} else {
		newl = ci->ci_data.cpu_idlelwp;
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}

	/*
	 * Only clear want_resched if there are no pending (slow)
	 * software interrupts.
	 */
	ci->ci_want_resched = ci->ci_data.cpu_softints;
	spc->spc_flags &= ~SPCF_SWITCHCLEAR;
	spc->spc_curpriority = lwp_eprio(newl);

	return newl;
}

/*
 * The machine independent parts of context switch.
 *
 * Returns 1 if another LWP was actually run.
 */
int
mi_switch(lwp_t *l)
{
	struct cpu_info *ci;
	struct schedstate_percpu *spc;
	struct lwp *newl;
	int retval, oldspl;
	struct bintime bt;
	bool returning;

	KASSERT(lwp_locked(l, NULL));
	KASSERT(kpreempt_disabled());
	LOCKDEBUG_BARRIER(l->l_mutex, 1);

#ifdef KSTACK_CHECK_MAGIC
	kstack_check_magic(l);
#endif

	binuptime(&bt);

	KASSERT(l->l_cpu == curcpu());
	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	returning = false;
	newl = NULL;

	/*
	 * If we have been asked to switch to a specific LWP, then there
	 * is no need to inspect the run queues.  If a soft interrupt is
	 * blocking, then return to the interrupted thread without adjusting
	 * VM context or its start time: neither has been changed in order
	 * to take the interrupt.
	 */
	if (l->l_switchto != NULL) {
		if ((l->l_pflag & LP_INTR) != 0) {
			returning = true;
			softint_block(l);
			if ((l->l_pflag & LP_TIMEINTR) != 0)
				updatertime(l, &bt);
		}
		newl = l->l_switchto;
		l->l_switchto = NULL;
	}
#ifndef __HAVE_FAST_SOFTINTS
	else if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}
#endif	/* !__HAVE_FAST_SOFTINTS */

	/* Count time spent in current system call */
	if (!returning) {
		SYSCALL_TIME_SLEEP(l);

		/*
		 * XXXSMP If we are using h/w performance counters,
		 * save context.
		 */
#if PERFCTRS
		if (PMC_ENABLED(l->l_proc)) {
			pmc_save_context(l->l_proc);
		}
#endif
		updatertime(l, &bt);
	}

	/* Lock the runqueue */
	KASSERT(l->l_stat != LSRUN);
	mutex_spin_enter(spc->spc_mutex);

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	if (l->l_stat == LSONPROC && l != newl) {
		KASSERT(lwp_locked(l, spc->spc_lwplock));
		if ((l->l_flag & LW_IDLE) == 0) {
			l->l_stat = LSRUN;
			lwp_setlock(l, spc->spc_mutex);
			sched_enqueue(l, true);
			/* Handle migration case */
			KASSERT(spc->spc_migrating == NULL);
			if (l->l_target_cpu != NULL) {
				spc->spc_migrating = l;
			}
		} else
			l->l_stat = LSIDL;
	}

	/* Pick new LWP to run. */
	if (newl == NULL) {
		newl = nextlwp(ci, spc);
	}

	/* Items that must be updated with the CPU locked. */
	if (!returning) {
		/* Update the new LWP's start time. */
		newl->l_stime = bt;

		/*
		 * ci_curlwp changes when a fast soft interrupt occurs.
		 * We use cpu_onproc to keep track of which kernel or
		 * user thread is running 'underneath' the software
		 * interrupt.  This is important for time accounting,
		 * itimers and forcing user threads to preempt (aston).
		 */
		ci->ci_data.cpu_onproc = newl;
	}

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
	 */
	cpu_did_resched(l);
	l->l_dopreempt = 0;
	if (__predict_false(l->l_pfailaddr != 0)) {
		LOCKSTAT_FLAG(lsflag);
		LOCKSTAT_ENTER(lsflag);
		LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
		LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
		    1, l->l_pfailtime, l->l_pfailaddr);
		LOCKSTAT_EXIT(lsflag);
		l->l_pfailtime = 0;
		l->l_pfaillock = 0;
		l->l_pfailaddr = 0;
	}

	if (l != newl) {
		struct lwp *prevlwp;

		/* Release all locks, but leave the current LWP locked */
		if (l->l_mutex == spc->spc_mutex) {
			/*
			 * Drop spc_lwplock, if the current LWP has been moved
			 * to the run queue (it is now locked by spc_mutex).
			 */
			mutex_spin_exit(spc->spc_lwplock);
		} else {
			/*
			 * Otherwise, drop the spc_mutex, we are done with the
			 * run queues.
			 */
			mutex_spin_exit(spc->spc_mutex);
		}

		/*
		 * Mark that context switch is going to be performed
		 * for this LWP, to protect it from being switched
		 * to on another CPU.
		 */
		KASSERT(l->l_ctxswtch == 0);
		l->l_ctxswtch = 1;
		l->l_ncsw++;
		l->l_pflag &= ~LP_RUNNING;

		/*
		 * Increase the count of spin-mutexes before the release
		 * of the last lock - we must remain at IPL_SCHED during
		 * the context switch.
		 */
		oldspl = MUTEX_SPIN_OLDSPL(ci);
		ci->ci_mtx_count--;
		lwp_unlock(l);

		/* Count the context switch on this CPU. */
		ci->ci_data.cpu_nswtch++;

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL)
			l->l_lwpctl->lc_curcpu = LWPCTL_CPU_NONE;

		/*
		 * Save old VM context, unless a soft interrupt
		 * handler is blocking.
		 */
		if (!returning)
			pmap_deactivate(l);

		/*
		 * We may need to spin-wait if 'newl' is still
		 * context switching on another CPU.
		 */
		if (newl->l_ctxswtch != 0) {
			u_int count;
			count = SPINLOCK_BACKOFF_MIN;
			while (newl->l_ctxswtch)
				SPINLOCK_BACKOFF(count);
		}

		/* Switch to the new LWP. */
		prevlwp = cpu_switchto(l, newl, returning);
		ci = curcpu();

		/*
		 * Switched away - we have new curlwp.
		 * Restore VM context and IPL.
		 */
		pmap_activate(l);
		if (prevlwp != NULL) {
			/* Normalize the count of the spin-mutexes */
			ci->ci_mtx_count++;
			/* Unmark the state of context switch */
			membar_exit();
			prevlwp->l_ctxswtch = 0;
		}

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL) {
			l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
			l->l_lwpctl->lc_pctr++;
		}

		KASSERT(l->l_cpu == ci);
		splx(oldspl);
		retval = 1;
	} else {
		/* Nothing to do - just unlock and return. */
		mutex_spin_exit(spc->spc_mutex);
		lwp_unlock(l);
		retval = 0;
	}

	KASSERT(l == curlwp);
	KASSERT(l->l_stat == LSONPROC);

	/*
	 * XXXSMP If we are using h/w performance counters, restore context.
	 * XXXSMP preemption problem.
	 */
#if PERFCTRS
	if (PMC_ENABLED(l->l_proc)) {
		pmc_restore_context(l->l_proc);
	}
#endif
	SYSCALL_TIME_WAKEUP(l);
	LOCKDEBUG_BARRIER(NULL, 1);

	return retval;
}

/*
 * The machine independent parts of context switch to oblivion.
 * Does not return.  Call with the LWP unlocked.
 */
void
lwp_exit_switchaway(lwp_t *l)
{
	struct cpu_info *ci;
	struct lwp *newl;
	struct bintime bt;

	ci = l->l_cpu;

	KASSERT(kpreempt_disabled());
	KASSERT(l->l_stat == LSZOMB || l->l_stat == LSIDL);
	KASSERT(ci == curcpu());
	LOCKDEBUG_BARRIER(NULL, 0);

#ifdef KSTACK_CHECK_MAGIC
	kstack_check_magic(l);
#endif

	/* Count time spent in current system call */
	SYSCALL_TIME_SLEEP(l);
	binuptime(&bt);
	updatertime(l, &bt);

	/* Must stay at IPL_SCHED even after releasing run queue lock. */
	(void)splsched();

	/*
	 * Let sched_nextlwp() select the LWP to run on the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * the new thread would be unlocked after setting the LWP-lock.
	 */
	spc_lock(ci);
#ifndef __HAVE_FAST_SOFTINTS
	if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	} else
#endif	/* !__HAVE_FAST_SOFTINTS */
	{
		newl = nextlwp(ci, &ci->ci_schedstate);
	}

	/* Update the new LWP's start time. */
	newl->l_stime = bt;
	l->l_pflag &= ~LP_RUNNING;

	/*
	 * ci_curlwp changes when a fast soft interrupt occurs.
	 * We use cpu_onproc to keep track of which kernel or
	 * user thread is running 'underneath' the software
	 * interrupt.  This is important for time accounting,
	 * itimers and forcing user threads to preempt (aston).
	 */
	ci->ci_data.cpu_onproc = newl;

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
	 */
	cpu_did_resched(l);

	/* Unlock the run queue. */
	spc_unlock(ci);

	/* Count the context switch on this CPU. */
	ci->ci_data.cpu_nswtch++;

	/* Update status for lwpctl, if present. */
	if (l->l_lwpctl != NULL)
		l->l_lwpctl->lc_curcpu = LWPCTL_CPU_EXITED;

	/*
	 * We may need to spin-wait if 'newl' is still
	 * context switching on another CPU.
	 */
	if (newl->l_ctxswtch != 0) {
		u_int count;
		count = SPINLOCK_BACKOFF_MIN;
		while (newl->l_ctxswtch)
			SPINLOCK_BACKOFF(count);
	}

	/* Switch to the new LWP. */
	(void)cpu_switchto(NULL, newl, false);

	for (;;) continue;	/* XXX: convince gcc about "noreturn" */
	/* NOTREACHED */
}

/*
 * Change process state to be runnable, placing it on the run queue if it is
 * in memory, and awakening the swapper if it isn't in memory.
 *
 * Call with the process and LWP locked.  Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	struct cpu_info *ci;
	sigset_t *ss;

	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT(mutex_owned(p->p_lock));
	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
			if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
				ss = &l->l_sigpend.sp_set;
			else
				ss = &p->p_sigpend.sp_set;
			sigaddset(ss, p->p_xstat);
			signotify(l);
		}
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		l->l_flag &= ~LW_WSUSPEND;
		p->p_nrlwps++;
		cv_broadcast(&p->p_lwpcv);
		break;
	case LSSLEEP:
		KASSERT(l->l_wchan != NULL);
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

	/*
	 * If the LWP was sleeping interruptibly, then it's OK to start it
	 * again.  If not, mark it as still sleeping.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l, true);
		return;
	}

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_pflag & LP_RUNNING) != 0) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Look for a CPU to run.
	 * Set the LWP runnable.
	 */
	ci = sched_takecpu(l);
	l->l_cpu = ci;
	spc_lock(ci);
	lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
	sched_setrunnable(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;

	/*
	 * If the thread is swapped out, wake the swapper to bring it back
	 * in.  Otherwise, enter it into a run queue.
	 */
	if (l->l_flag & LW_INMEM) {
		sched_enqueue(l, false);
		resched_cpu(l);
		lwp_unlock(l);
	} else {
		lwp_unlock(l);
		uvm_kick_scheduler();
	}
}

/*
 * suspendsched:
 *
 *	Convert all non-PK_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct lwp *l;
	struct proc *p;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		if ((p->p_flag & PK_MARKER) != 0)
			continue;

		mutex_enter(p->p_lock);
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			continue;
		}

		p->p_stat = SSTOP;

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l == curlwp)
				continue;

			lwp_lock(l);

			/*
			 * Set LW_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode.  We want to
			 * try to get as many LWPs as possible to
			 * the user / kernel boundary, so that they will
			 * release any locks that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */
				setrunnable(l);
				continue;
			}

			lwp_unlock(l);
		}

		mutex_exit(p->p_lock);
	}
	mutex_exit(proc_lock);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
	 */
	for (CPU_INFO_FOREACH(cii, ci)) {
		spc_lock(ci);
		cpu_need_resched(ci, RESCHED_IMMED);
		spc_unlock(ci);
	}
}

/*
 * sched_unsleep:
 *
 *	This is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, if the sleep timed out.  Because of this,
 *	it's not a valid action for running or idle LWPs.
 */
static u_int
sched_unsleep(struct lwp *l, bool cleanup)
{

	lwp_unlock(l);
	panic("sched_unsleep");
}

static void
resched_cpu(struct lwp *l)
{
	struct cpu_info *ci = l->l_cpu;

	KASSERT(lwp_locked(l, NULL));
	if (lwp_eprio(l) > ci->ci_schedstate.spc_curpriority)
		cpu_need_resched(ci, 0);
}

static void
sched_changepri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_priority = pri;
		sched_enqueue(l, false);
	} else {
		l->l_priority = pri;
	}
	resched_cpu(l);
}

static void
sched_lendpri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_inheritedprio = pri;
		sched_enqueue(l, false);
	} else {
		l->l_inheritedprio = pri;
	}
	resched_cpu(l);
}

struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}

/* Decay 95% of proc::p_pctcpu in 60 seconds, ccpu = exp(-1/20) */
const fixpt_t ccpu = 0.95122942450071400909 * FSCALE;

/*
 * sched_pstats:
 *
 *	Update process statistics and check CPU resource allocation.
 *	Call scheduler-specific hook to eventually adjust process/LWP
 *	priorities.
 */
/* ARGSUSED */
void
sched_pstats(void *arg)
{
	const int clkhz = (stathz != 0 ? stathz : hz);
	struct rlimit *rlim;
	struct lwp *l;
	struct proc *p;
	long runtm;
	fixpt_t lpctcpu;
	u_int lcpticks;
	int sig;

	sched_pstats_ticks++;

	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		if (__predict_false((p->p_flag & PK_MARKER) != 0))
			continue;

		/*
		 * Increment time in/out of memory and sleep
		 * time (if sleeping), ignore overflow.
		 */
		mutex_enter(p->p_lock);
		runtm = p->p_rtime.sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (__predict_false((l->l_flag & LW_IDLE) != 0))
				continue;
			lwp_lock(l);
			runtm += l->l_rtime.sec;
			l->l_swtime++;
			sched_lwp_stats(l);
			lwp_unlock(l);

			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
			if (l->l_slptime != 0)
				continue;

			lpctcpu = l->l_pctcpu;
			lcpticks = atomic_swap_uint(&l->l_cpticks, 0);
			lpctcpu += ((FSCALE - ccpu) *
			    (lcpticks * FSCALE / clkhz)) >> FSHIFT;
			l->l_pctcpu = lpctcpu;
		}
		/* Calculate p_pctcpu only for ps(1). */
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over the maximum, kill it.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		sig = 0;
		if (__predict_false(runtm >= rlim->rlim_cur)) {
			if (runtm >= rlim->rlim_max)
				sig = SIGKILL;
			else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}
		mutex_exit(p->p_lock);
		if (__predict_false(sig))
			psignal(p, sig);
	}
	mutex_exit(proc_lock);
	uvm_meter();
	cv_wakeup(&lbolt);
	callout_schedule(&sched_pstats_ch, hz);
}
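
/*
 * A worked example of the ccpu decay used by sched_pstats() above
 * (illustrative only): l_pctcpu and p_pctcpu are scaled by
 * ccpu = exp(-1/20) on each run, and the callout reschedules once per
 * second.  With no new ticks the estimate after t seconds is therefore
 * exp(-t/20) of its old value; for t = 60 that is exp(-3), roughly 5%,
 * which is where the "decay 95% in 60 seconds" figure at the definition
 * of ccpu comes from.
 */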