/*	$NetBSD: kern_synch.c,v 1.248 2008/05/31 21:26:01 ad Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 * Daniel Sieger.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.248 2008/05/31 21:26:01 ad Exp $");

#include "opt_kstack.h"
#include "opt_perfctrs.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/cpu.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscall_stats.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/lwpctl.h>
#include <sys/atomic.h>
#include <sys/simplelock.h>

#include <uvm/uvm_extern.h>

#include <dev/lockstat.h>

static u_int	sched_unsleep(struct lwp *, bool);
static void	sched_changepri(struct lwp *, pri_t);
static void	sched_lendpri(struct lwp *, pri_t);

syncobj_t sleep_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sleepq_unsleep,
	sleepq_changepri,
	sleepq_lendpri,
	syncobj_noowner,
};

syncobj_t sched_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sched_unsleep,
	sched_changepri,
	sched_lendpri,
	syncobj_noowner,
};

callout_t	sched_pstats_ch;
unsigned	sched_pstats_ticks;
kcondvar_t	lbolt;			/* once a second sleep address */

/* Preemption event counters */
static struct evcnt kpreempt_ev_crit;
static struct evcnt kpreempt_ev_klock;
static struct evcnt kpreempt_ev_ipl;
static struct evcnt kpreempt_ev_immed;

/*
 * During autoconfiguration or after a panic, a sleep will simply lower the
 * priority briefly to allow interrupts, then return.  The priority to be
 * used (safepri) is machine-dependent, thus this value is initialized and
 * maintained in the machine-dependent layers.  This priority will typically
 * be 0, or the lowest priority that is safe for use on the interrupt stack;
 * it can be made higher to block network software interrupts after panics.
 */
int	safepri;

void
sched_init(void)
{

	cv_init(&lbolt, "lbolt");
	callout_init(&sched_pstats_ch, CALLOUT_MPSAFE);
	callout_setfunc(&sched_pstats_ch, sched_pstats, NULL);

	evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: critical section");
	evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: kernel_lock");
	evcnt_attach_dynamic(&kpreempt_ev_ipl, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: IPL");
	evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "immediate");

	sched_pstats(NULL);
}
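
/*
 * Illustrative sketch, not part of the original source: the lbolt
 * condvar initialized above is broadcast once per second from
 * sched_pstats() at the bottom of this file.  A caller that already
 * holds a kmutex (the hypothetical name "mtx" below) can use it to
 * throttle a retry loop to roughly one pass per second:
 *
 *	mutex_enter(mtx);
 *	while (resource_busy(sc))	(hypothetical condition)
 *		cv_wait(&lbolt, mtx);
 *	mutex_exit(mtx);
 *
 * cv_wait() drops mtx while sleeping and reacquires it before it
 * returns, so the condition is always re-checked under the lock.
 */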

/*
 * OBSOLETE INTERFACE
 *
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
 * means no timeout).  If pri includes the PCATCH flag, signals are checked
 * before and after sleeping, else signals are not checked.  Returns 0 if
 * awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 * signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the system
 * call should be interrupted by the signal.
 *
 * The interlock is held until we are on a sleep queue.  The interlock will
 * be locked before returning to the caller unless the PNORELOCK flag is
 * specified, in which case the interlock will always be unlocked upon
 * return.
 */
int
ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		if ((priority & PNORELOCK) != 0)
			simple_unlock(interlock);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);

	if (interlock != NULL) {
		KASSERT(simple_lock_held(interlock));
		simple_unlock(interlock);
	}

	error = sleepq_block(timo, priority & PCATCH);

	if (interlock != NULL && (priority & PNORELOCK) == 0)
		simple_lock(interlock);

	return error;
}

int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	kmutex_t *mp;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
	mutex_exit(mtx);
	error = sleepq_block(timo, priority & PCATCH);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}
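
/*
 * Illustrative sketch, not part of the original source: a typical
 * consumer/producer pairing of the obsolete interface, assuming a
 * hypothetical softc "sc" with a simplelock sc_slock and a flag
 * sc_busy:
 *
 *	consumer:
 *		simple_lock(&sc->sc_slock);
 *		while (sc->sc_busy) {
 *			error = ltsleep(&sc->sc_busy, PWAIT | PCATCH,
 *			    "busywt", 0, &sc->sc_slock);
 *			if (error)
 *				break;
 *		}
 *		simple_unlock(&sc->sc_slock);
 *
 *	producer:
 *		simple_lock(&sc->sc_slock);
 *		sc->sc_busy = 0;
 *		wakeup(&sc->sc_busy);
 *		simple_unlock(&sc->sc_slock);
 *
 * Because the interlock is only dropped once the consumer is on the
 * sleep queue, a wakeup() issued between the check of sc_busy and the
 * sleep cannot be lost.  New code should use mutex(9) and condvar(9)
 * instead.
 */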

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	kmutex_t *mp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, l, &mp);
	sleepq_enter(sq, l, mp);
	sleepq_enqueue(sq, l, wmesg, &sleep_syncobj);
	error = sleepq_block(timo, intr);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}
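
/*
 * Illustrative sketch, not part of the original source: kpause() is
 * the simple way to sleep for a fixed interval when no wakeup is
 * expected, e.g. polling hardware every 100ms or so (device_ready()
 * and "sc" are hypothetical):
 *
 *	for (try = 0; try < 10; try++) {
 *		if (device_ready(sc))
 *			break;
 *		(void)kpause("devwait", false, mstohz(100), NULL);
 *	}
 *
 * Passing a kmutex instead of NULL as the last argument makes
 * kpause() drop it across the sleep and reacquire it before
 * returning.
 */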

/*
 * OBSOLETE INTERFACE
 *
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, (u_int)-1, mp);
}

/*
 * OBSOLETE INTERFACE
 *
 * Make the highest priority process first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(wchan_t ident)
{
	sleepq_t *sq;
	kmutex_t *mp;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident, &mp);
	sleepq_wake(sq, ident, 1, mp);
}


/*
 * General yield call.  Puts the current process back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current process explicitly requests it (e.g. sched_yield(2)).
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * General preemption call.  Puts the current process back on its run queue
 * and performs an involuntary context switch.
 */
void
preempt(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	l->l_nivcsw++;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * Handle a request made by another agent to preempt the current LWP
 * in-kernel.  Usually called when l_dopreempt may be non-zero.
 *
 * Character addresses for lockstat only.
 */
static char	in_critical_section;
static char	kernel_lock_held;
static char	spl_raised;
static char	is_softint;

bool
kpreempt(uintptr_t where)
{
	uintptr_t failed;
	lwp_t *l;
	int s, dop;

	l = curlwp;
	failed = 0;
	while ((dop = l->l_dopreempt) != 0) {
		if (l->l_stat != LSONPROC) {
			/*
			 * About to block (or die), let it happen.
			 * Doesn't really count as "preemption has
			 * been blocked", since we're going to
			 * context switch.
			 */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false((l->l_flag & LW_IDLE) != 0)) {
			/* Can't preempt idle loop, don't count as failure. */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false(l->l_nopreempt != 0)) {
			/* LWP holds preemption disabled, explicitly. */
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_crit.ev_count++;
			}
			failed = (uintptr_t)&in_critical_section;
			break;
		}
		if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
			/* Can't preempt soft interrupts yet. */
			l->l_dopreempt = 0;
			failed = (uintptr_t)&is_softint;
			break;
		}
		s = splsched();
		if (__predict_false(l->l_blcnt != 0 ||
		    curcpu()->ci_biglock_wanted != NULL)) {
			/* Hold or want kernel_lock, code is not MT safe. */
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_klock.ev_count++;
			}
			failed = (uintptr_t)&kernel_lock_held;
			break;
		}
		if (__predict_false(!cpu_kpreempt_enter(where, s))) {
			/*
			 * It may be that the IPL is too high.
			 * cpu_kpreempt_enter() can schedule an
			 * interrupt to retry later.
			 */
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_ipl.ev_count++;
			}
			failed = (uintptr_t)&spl_raised;
			break;
		}
		/* Do it! */
		if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
			kpreempt_ev_immed.ev_count++;
		}
		lwp_lock(l);
		mi_switch(l);
		l->l_nopreempt++;
		splx(s);

		/* Take care of any MD cleanup. */
		cpu_kpreempt_exit(where);
		l->l_nopreempt--;
	}

	/* Record preemption failure for reporting via lockstat. */
	if (__predict_false(failed)) {
		int lsflag = 0;
		atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
		LOCKSTAT_ENTER(lsflag);
		/* Might recurse, make it atomic. */
		if (__predict_false(lsflag)) {
			if (where == 0) {
				where = (uintptr_t)__builtin_return_address(0);
			}
			if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr,
			    NULL, (void *)where) == NULL) {
				LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
				l->l_pfaillock = failed;
			}
		}
		LOCKSTAT_EXIT(lsflag);
	}

	return failed;
}

/*
 * Return true if preemption is explicitly disabled.
 */
bool
kpreempt_disabled(void)
{
	lwp_t *l;

	l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
}

/*
 * Disable kernel preemption.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

/*
 * Reenable kernel preemption.
 */
void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}
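
/*
 * Illustrative sketch, not part of the original source: kernel code
 * that must not migrate between CPUs, for example while it works on
 * per-CPU state, brackets the region with the pair above.  The calls
 * nest (l_nopreempt is a counter), so a caller that is already
 * non-preemptible stays that way:
 *
 *	kpreempt_disable();
 *	ci = curcpu();
 *	... use ci / per-CPU state; keep the region short, don't sleep ...
 *	kpreempt_enable();
 */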

/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */

void
updatertime(lwp_t *l, const struct bintime *now)
{

	if ((l->l_flag & LW_IDLE) != 0)
		return;

	/* rtime += now - stime */
	bintime_add(&l->l_rtime, now);
	bintime_sub(&l->l_rtime, &l->l_stime);
}

/*
 * Select the next LWP to run on the current CPU.
 */
static inline lwp_t *
nextlwp(struct cpu_info *ci, struct schedstate_percpu *spc)
{
	lwp_t *newl;

	/*
	 * Let sched_nextlwp() select the LWP to run on the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * the new thread will be unlocked after setting the LWP lock.
	 */
	newl = sched_nextlwp();
	if (newl != NULL) {
		sched_dequeue(newl);
		KASSERT(lwp_locked(newl, spc->spc_mutex));
		newl->l_stat = LSONPROC;
		newl->l_cpu = ci;
		newl->l_pflag |= LP_RUNNING;
		lwp_setlock(newl, spc->spc_lwplock);
	} else {
		newl = ci->ci_data.cpu_idlelwp;
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}

	/*
	 * Only clear want_resched if there are no pending (slow)
	 * software interrupts.
	 */
	ci->ci_want_resched = ci->ci_data.cpu_softints;
	spc->spc_flags &= ~SPCF_SWITCHCLEAR;
	spc->spc_curpriority = lwp_eprio(newl);

	return newl;
}

/*
 * The machine independent parts of context switch.
 *
 * Returns 1 if another LWP was actually run.
 */
int
mi_switch(lwp_t *l)
{
	struct cpu_info *ci;
	struct schedstate_percpu *spc;
	struct lwp *newl;
	int retval, oldspl;
	struct bintime bt;
	bool returning;

	KASSERT(lwp_locked(l, NULL));
	KASSERT(kpreempt_disabled());
	LOCKDEBUG_BARRIER(l->l_mutex, 1);

#ifdef KSTACK_CHECK_MAGIC
	kstack_check_magic(l);
#endif

	binuptime(&bt);

	KASSERT(l->l_cpu == curcpu());
	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	returning = false;
	newl = NULL;

	/*
	 * If we have been asked to switch to a specific LWP, then there
	 * is no need to inspect the run queues.  If a soft interrupt is
	 * blocking, then return to the interrupted thread without adjusting
	 * VM context or its start time: neither was changed in order
	 * to take the interrupt.
	 */
	if (l->l_switchto != NULL) {
		if ((l->l_pflag & LP_INTR) != 0) {
			returning = true;
			softint_block(l);
			if ((l->l_pflag & LP_TIMEINTR) != 0)
				updatertime(l, &bt);
		}
		newl = l->l_switchto;
		l->l_switchto = NULL;
	}
#ifndef __HAVE_FAST_SOFTINTS
	else if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	}
#endif	/* !__HAVE_FAST_SOFTINTS */

	/* Count time spent in current system call */
	if (!returning) {
		SYSCALL_TIME_SLEEP(l);

		/*
		 * XXXSMP If we are using h/w performance counters,
		 * save context.
		 */
#if PERFCTRS
		if (PMC_ENABLED(l->l_proc)) {
			pmc_save_context(l->l_proc);
		}
#endif
		updatertime(l, &bt);
	}

	/* Lock the runqueue */
	KASSERT(l->l_stat != LSRUN);
	mutex_spin_enter(spc->spc_mutex);

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	if (l->l_stat == LSONPROC && l != newl) {
		KASSERT(lwp_locked(l, spc->spc_lwplock));
		if ((l->l_flag & LW_IDLE) == 0) {
			l->l_stat = LSRUN;
			lwp_setlock(l, spc->spc_mutex);
			sched_enqueue(l, true);
			/* Handle migration case */
			KASSERT(spc->spc_migrating == NULL);
			if (l->l_target_cpu != NULL) {
				spc->spc_migrating = l;
			}
		} else
			l->l_stat = LSIDL;
	}

	/* Pick new LWP to run. */
	if (newl == NULL) {
		newl = nextlwp(ci, spc);
	}

	/* Items that must be updated with the CPU locked. */
	if (!returning) {
		/* Update the new LWP's start time. */
		newl->l_stime = bt;

		/*
		 * ci_curlwp changes when a fast soft interrupt occurs.
		 * We use cpu_onproc to keep track of which kernel or
		 * user thread is running 'underneath' the software
		 * interrupt.  This is important for time accounting,
		 * itimers and forcing user threads to preempt (aston).
		 */
		ci->ci_data.cpu_onproc = newl;
	}

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
	 */
	cpu_did_resched(l);
	l->l_dopreempt = 0;
	if (__predict_false(l->l_pfailaddr != 0)) {
		LOCKSTAT_FLAG(lsflag);
		LOCKSTAT_ENTER(lsflag);
		LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
		LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
		    1, l->l_pfailtime, l->l_pfailaddr);
		LOCKSTAT_EXIT(lsflag);
		l->l_pfailtime = 0;
		l->l_pfaillock = 0;
		l->l_pfailaddr = 0;
	}

	if (l != newl) {
		struct lwp *prevlwp;

		/* Release all locks, but leave the current LWP locked */
		if (l->l_mutex == spc->spc_mutex) {
			/*
			 * Drop spc_lwplock, if the current LWP has been moved
			 * to the run queue (it is now locked by spc_mutex).
			 */
			mutex_spin_exit(spc->spc_lwplock);
		} else {
			/*
			 * Otherwise, drop the spc_mutex, we are done with the
			 * run queues.
			 */
			mutex_spin_exit(spc->spc_mutex);
		}

		/*
		 * Mark that a context switch is going to be performed
		 * for this LWP, to protect it from being switched
		 * to on another CPU.
		 */
		KASSERT(l->l_ctxswtch == 0);
		l->l_ctxswtch = 1;
		l->l_ncsw++;
		l->l_pflag &= ~LP_RUNNING;

		/*
		 * Increase the count of spin-mutexes before the release
		 * of the last lock - we must remain at IPL_SCHED during
		 * the context switch.
		 */
		oldspl = MUTEX_SPIN_OLDSPL(ci);
		ci->ci_mtx_count--;
		lwp_unlock(l);

		/* Count the context switch on this CPU. */
		ci->ci_data.cpu_nswtch++;

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL)
			l->l_lwpctl->lc_curcpu = LWPCTL_CPU_NONE;

		/*
		 * Save old VM context, unless a soft interrupt
		 * handler is blocking.
		 */
		if (!returning)
			pmap_deactivate(l);

		/*
		 * We may need to spin-wait if 'newl' is still
		 * context switching on another CPU.
		 */
		if (newl->l_ctxswtch != 0) {
			u_int count;
			count = SPINLOCK_BACKOFF_MIN;
			while (newl->l_ctxswtch)
				SPINLOCK_BACKOFF(count);
		}

		/* Switch to the new LWP. */
		prevlwp = cpu_switchto(l, newl, returning);
		ci = curcpu();

		/*
		 * Switched away - we have new curlwp.
		 * Restore VM context and IPL.
		 */
		pmap_activate(l);
		if (prevlwp != NULL) {
			/* Normalize the count of the spin-mutexes */
			ci->ci_mtx_count++;
			/* Unmark the state of context switch */
			membar_exit();
			prevlwp->l_ctxswtch = 0;
		}

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL) {
			l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
			l->l_lwpctl->lc_pctr++;
		}

		KASSERT(l->l_cpu == ci);
		splx(oldspl);
		retval = 1;
	} else {
		/* Nothing to do - just unlock and return. */
		mutex_spin_exit(spc->spc_mutex);
		lwp_unlock(l);
		retval = 0;
	}

	KASSERT(l == curlwp);
	KASSERT(l->l_stat == LSONPROC);

	/*
	 * XXXSMP If we are using h/w performance counters, restore context.
	 * XXXSMP preemption problem.
	 */
#if PERFCTRS
	if (PMC_ENABLED(l->l_proc)) {
		pmc_restore_context(l->l_proc);
	}
#endif
	SYSCALL_TIME_WAKEUP(l);
	LOCKDEBUG_BARRIER(NULL, 1);

	return retval;
}

/*
 * The machine independent parts of context switch to oblivion.
 * Does not return.  Call with the LWP unlocked.
 */
void
lwp_exit_switchaway(lwp_t *l)
{
	struct cpu_info *ci;
	struct lwp *newl;
	struct bintime bt;

	ci = l->l_cpu;

	KASSERT(kpreempt_disabled());
	KASSERT(l->l_stat == LSZOMB || l->l_stat == LSIDL);
	KASSERT(ci == curcpu());
	LOCKDEBUG_BARRIER(NULL, 0);

#ifdef KSTACK_CHECK_MAGIC
	kstack_check_magic(l);
#endif

	/* Count time spent in current system call */
	SYSCALL_TIME_SLEEP(l);
	binuptime(&bt);
	updatertime(l, &bt);

	/* Must stay at IPL_SCHED even after releasing run queue lock. */
	(void)splsched();

	/*
	 * Let sched_nextlwp() select the LWP to run on the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and
	 * the new thread will be unlocked after setting the LWP lock.
	 */
	spc_lock(ci);
#ifndef __HAVE_FAST_SOFTINTS
	if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_pflag |= LP_RUNNING;
	} else
#endif	/* !__HAVE_FAST_SOFTINTS */
	{
		newl = nextlwp(ci, &ci->ci_schedstate);
	}

	/* Update the new LWP's start time. */
	newl->l_stime = bt;
	l->l_pflag &= ~LP_RUNNING;

	/*
	 * ci_curlwp changes when a fast soft interrupt occurs.
	 * We use cpu_onproc to keep track of which kernel or
	 * user thread is running 'underneath' the software
	 * interrupt.  This is important for time accounting,
	 * itimers and forcing user threads to preempt (aston).
	 */
	ci->ci_data.cpu_onproc = newl;

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
	 */
	cpu_did_resched(l);

	/* Unlock the run queue. */
	spc_unlock(ci);

	/* Count the context switch on this CPU. */
	ci->ci_data.cpu_nswtch++;

	/* Update status for lwpctl, if present. */
	if (l->l_lwpctl != NULL)
		l->l_lwpctl->lc_curcpu = LWPCTL_CPU_EXITED;

	/*
	 * We may need to spin-wait if 'newl' is still
	 * context switching on another CPU.
	 */
	if (newl->l_ctxswtch != 0) {
		u_int count;
		count = SPINLOCK_BACKOFF_MIN;
		while (newl->l_ctxswtch)
			SPINLOCK_BACKOFF(count);
	}

	/* Switch to the new LWP. */
	(void)cpu_switchto(NULL, newl, false);

	/* NOTREACHED */
}

/*
 * Change process state to be runnable, placing it on the run queue if it is
 * in memory, and awakening the swapper if it isn't in memory.
 *
 * Call with the process and LWP locked.  Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	struct cpu_info *ci;
	sigset_t *ss;

	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT(mutex_owned(p->p_lock));
	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
			if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
				ss = &l->l_sigpend.sp_set;
			else
				ss = &p->p_sigpend.sp_set;
			sigaddset(ss, p->p_xstat);
			signotify(l);
		}
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		l->l_flag &= ~LW_WSUSPEND;
		p->p_nrlwps++;
		cv_broadcast(&p->p_lwpcv);
		break;
	case LSSLEEP:
		KASSERT(l->l_wchan != NULL);
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

	/*
	 * If the LWP was sleeping interruptibly, then it's OK to start it
	 * again.  If not, mark it as still sleeping.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l, true);
		return;
	}

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_pflag & LP_RUNNING) != 0) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Look for a CPU to run.
	 * Set the LWP runnable.
	 */
	ci = sched_takecpu(l);
	l->l_cpu = ci;
	spc_lock(ci);
	lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
	sched_setrunnable(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;

	/*
	 * If the thread is swapped out, wake the swapper to bring it back in.
	 * Otherwise, enter it into a run queue.
	 */
	if (l->l_flag & LW_INMEM) {
		sched_enqueue(l, false);
		resched_cpu(l);
		lwp_unlock(l);
	} else {
		lwp_unlock(l);
		uvm_kick_scheduler();
	}
}

/*
 * suspendsched:
 *
 *	Convert all non-PK_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct lwp *l;
	struct proc *p;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		if ((p->p_flag & PK_MARKER) != 0)
			continue;

		mutex_enter(p->p_lock);
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			continue;
		}

		p->p_stat = SSTOP;

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l == curlwp)
				continue;

			lwp_lock(l);

			/*
			 * Set LW_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode.  We want to
			 * try to get as many LWPs as possible to
			 * the user / kernel boundary, so that they will
			 * release any locks that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */
				setrunnable(l);
				continue;
			}

			lwp_unlock(l);
		}

		mutex_exit(p->p_lock);
	}
	mutex_exit(proc_lock);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
	 */
	for (CPU_INFO_FOREACH(cii, ci)) {
		spc_lock(ci);
		cpu_need_resched(ci, RESCHED_IMMED);
		spc_unlock(ci);
	}
}

/*
 * sched_unsleep:
 *
 *	This is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, if the sleep timed out.  Because of this,
 *	it's not a valid action for running or idle LWPs.
 */
static u_int
sched_unsleep(struct lwp *l, bool cleanup)
{

	lwp_unlock(l);
	panic("sched_unsleep");
}

void
resched_cpu(struct lwp *l)
{
	struct cpu_info *ci;

	/*
	 * XXXSMP
	 * Since l->l_cpu persists across a context switch,
	 * this gives us *very weak* processor affinity, in
	 * that we notify the CPU on which the process last
	 * ran that it should try to switch.
	 *
	 * This does not guarantee that the process will run on
	 * that processor next, because another processor might
	 * grab it the next time it performs a context switch.
	 *
	 * This also does not handle the case where its last
	 * CPU is running a higher-priority process, but every
	 * other CPU is running a lower-priority process.  There
	 * are ways to handle this situation, but they're not
	 * currently very pretty, and we also need to weigh the
	 * cost of moving a process from one CPU to another.
	 */
	ci = l->l_cpu;
	if (lwp_eprio(l) > ci->ci_schedstate.spc_curpriority)
		cpu_need_resched(ci, 0);
}

static void
sched_changepri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_priority = pri;
		sched_enqueue(l, false);
	} else {
		l->l_priority = pri;
	}
	resched_cpu(l);
}

static void
sched_lendpri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_inheritedprio = pri;
		sched_enqueue(l, false);
	} else {
		l->l_inheritedprio = pri;
	}
	resched_cpu(l);
}

struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the percentage of CPU used by a
 * process.
 */
#define	CCPU_SHIFT	(FSHIFT + 1)
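
/*
 * Worked example (illustrative, not part of the original source):
 * with ccpu = exp(-1/20) and sched_pstats() running once per second,
 * each step computes l_pctcpu = (l_pctcpu * ccpu) >> FSHIFT, so after
 * 60 idle seconds an LWP retains exp(-60/20) ~= 5% of its old %CPU,
 * which is where the "decay 95% in 60 seconds" figure above comes
 * from.  With FSHIFT = 11, FSCALE = 2048 and ccpu ~= 1948, so one
 * decay step is roughly l_pctcpu = (l_pctcpu * 1948) / 2048.
 */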

/*
 * sched_pstats:
 *
 *	Update process statistics and check CPU resource allocation.
 *	Call scheduler-specific hook to eventually adjust process/LWP
 *	priorities.
 */
/* ARGSUSED */
void
sched_pstats(void *arg)
{
	struct rlimit *rlim;
	struct lwp *l;
	struct proc *p;
	int sig, clkhz;
	long runtm;

	sched_pstats_ticks++;

	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		if ((p->p_flag & PK_MARKER) != 0)
			continue;

		/*
		 * Increment time in/out of memory and sleep time (if
		 * sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		mutex_enter(p->p_lock);
		mutex_spin_enter(&p->p_stmutex);
		runtm = p->p_rtime.sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if ((l->l_flag & LW_IDLE) != 0)
				continue;
			lwp_lock(l);
			runtm += l->l_rtime.sec;
			l->l_swtime++;
			sched_lwp_stats(l);
			lwp_unlock(l);

			/*
			 * l_pctcpu is only for ps.
			 */
			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
			if (l->l_slptime < 1) {
				clkhz = stathz != 0 ? stathz : hz;
#if	(FSHIFT >= CCPU_SHIFT)
				l->l_pctcpu += (clkhz == 100) ?
				    ((fixpt_t)l->l_cpticks) <<
				    (FSHIFT - CCPU_SHIFT) :
				    100 * (((fixpt_t)l->l_cpticks)
				    << (FSHIFT - CCPU_SHIFT)) / clkhz;
#else
				l->l_pctcpu += ((FSCALE - ccpu) *
				    (l->l_cpticks * FSCALE / clkhz)) >> FSHIFT;
#endif
				l->l_cpticks = 0;
			}
		}
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		mutex_spin_exit(&p->p_stmutex);

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over max, kill it.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		sig = 0;
		if (runtm >= rlim->rlim_cur) {
			if (runtm >= rlim->rlim_max)
				sig = SIGKILL;
			else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}
		mutex_exit(p->p_lock);
		if (sig)
			psignal(p, sig);
	}
	mutex_exit(proc_lock);
	uvm_meter();
	cv_wakeup(&lbolt);
	callout_schedule(&sched_pstats_ch, hz);
}