/*	$NetBSD: kern_synch.c,v 1.241 2008/04/30 12:44:27 ad Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 * Daniel Sieger.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.
 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.241 2008/04/30 12:44:27 ad Exp $");

#include "opt_kstack.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_perfctrs.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/cpu.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscall_stats.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/lwpctl.h>
#include <sys/atomic.h>
#include <sys/simplelock.h>

#include <uvm/uvm_extern.h>

#include <dev/lockstat.h>

static u_int	sched_unsleep(struct lwp *, bool);
static void	sched_changepri(struct lwp *, pri_t);
static void	sched_lendpri(struct lwp *, pri_t);

syncobj_t sleep_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sleepq_unsleep,
	sleepq_changepri,
	sleepq_lendpri,
	syncobj_noowner,
};

syncobj_t sched_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sched_unsleep,
	sched_changepri,
	sched_lendpri,
	syncobj_noowner,
};

callout_t	sched_pstats_ch;
unsigned	sched_pstats_ticks;
kcondvar_t	lbolt;			/* once a second sleep address */

/* Preemption event counters */
static struct evcnt kpreempt_ev_crit;
static struct evcnt kpreempt_ev_klock;
static struct evcnt kpreempt_ev_ipl;
static struct evcnt kpreempt_ev_immed;

/*
 * During autoconfiguration or after a panic, a sleep will simply lower the
 * priority briefly to allow interrupts, then return.  The priority to be
 * used (safepri) is machine-dependent, thus this value is initialized and
 * maintained in the machine-dependent layers.  This priority will typically
 * be 0, or the lowest priority that is safe for use on the interrupt stack;
 * it can be made higher to block network software interrupts after panics.
 */
int	safepri;

void
sched_init(void)
{

	cv_init(&lbolt, "lbolt");
	callout_init(&sched_pstats_ch, CALLOUT_MPSAFE);
	callout_setfunc(&sched_pstats_ch, sched_pstats, NULL);

	evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: critical section");
	evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: kernel_lock");
	evcnt_attach_dynamic(&kpreempt_ev_ipl, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: IPL");
	evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "immediate");

	sched_pstats(NULL);
}
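
/*
 * The lbolt condvar above is broadcast by sched_pstats() roughly once per
 * second (see cv_wakeup(&lbolt) at the end of this file).  A minimal usage
 * sketch, assuming a hypothetical lock "sc_lock" and a hypothetical exit
 * condition "sc_done":
 *
 *	mutex_enter(&sc_lock);
 *	while (!sc_done)
 *		cv_wait(&lbolt, &sc_lock);
 *	mutex_exit(&sc_lock);
 *
 * Each cv_wait() here returns after roughly one second, when the
 * callout-driven sched_pstats() wakes all lbolt waiters.
 */
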
/*
 * OBSOLETE INTERFACE
 *
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds (0
 * means no timeout).  If pri includes the PCATCH flag, signals are checked
 * before and after sleeping, otherwise signals are not checked.  Returns 0
 * if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH is set and a
 * signal needs to be delivered, ERESTART is returned if the current system
 * call should be restarted if possible, and EINTR is returned if the system
 * call should be interrupted by the signal.
 *
 * The interlock is held until we are on a sleep queue.  The interlock will
 * be locked before returning back to the caller unless the PNORELOCK flag
 * is specified, in which case the interlock will always be unlocked upon
 * return.
 */
int
ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		if ((priority & PNORELOCK) != 0)
			simple_unlock(interlock);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);

	if (interlock != NULL) {
		KASSERT(simple_lock_held(interlock));
		simple_unlock(interlock);
	}

	error = sleepq_block(timo, priority & PCATCH);

	if (interlock != NULL && (priority & PNORELOCK) == 0)
		simple_lock(interlock);

	return error;
}

int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
	mutex_exit(mtx);
	error = sleepq_block(timo, priority & PCATCH);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, l);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, l, wmesg, &sleep_syncobj);
	error = sleepq_block(timo, intr);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}

/*
 * OBSOLETE INTERFACE
 *
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_wake(sq, ident, (u_int)-1);
}

/*
 * OBSOLETE INTERFACE
 *
 * Make only the highest priority process sleeping on the specified
 * identifier runnable.
 */
void
wakeup_one(wchan_t ident)
{
	sleepq_t *sq;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_wake(sq, ident, 1);
}
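
/*
 * A minimal sketch of the classic condition-wait pattern built from the
 * obsolete interfaces above; "sc_lock", "sc_ready" and the wait channel
 * are hypothetical driver state, and new code should normally use
 * condition variables (condvar(9)) instead.
 *
 * Waiter:
 *	mutex_enter(&sc_lock);
 *	while (!sc_ready) {
 *		error = mtsleep(&sc_ready, PRIBIO, "scwait", 0, &sc_lock);
 *		if (error != 0)
 *			break;
 *	}
 *	mutex_exit(&sc_lock);
 *
 * Waker:
 *	mutex_enter(&sc_lock);
 *	sc_ready = true;
 *	wakeup(&sc_ready);
 *	mutex_exit(&sc_lock);
 *
 * The condition is re-tested after every wakeup, since the sleep may end
 * for reasons other than the waker setting sc_ready.
 */
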
/*
 * General yield call.  Puts the current process back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current process explicitly requests it (e.g. sched_yield(2)).
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * General preemption call.  Puts the current process back on its run queue
 * and performs an involuntary context switch.
 */
void
preempt(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	l->l_nivcsw++;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}
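
/*
 * A minimal sketch of a voluntary yield from a long-running kernel loop;
 * the work queue and its helpers are hypothetical.  yield() is for the
 * case where this LWP explicitly offers the CPU, while preempt() above is
 * used when the switch is forced on behalf of another, higher priority LWP:
 *
 *	while ((wk = dequeue_work()) != NULL) {
 *		process_work(wk);
 *		yield();
 *	}
 */
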
/*
 * Handle a request made by another agent to preempt the current LWP
 * in-kernel.  Usually called when l_dopreempt may be non-zero.
 *
 * Character addresses for lockstat only.
 */
static char	in_critical_section;
static char	kernel_lock_held;
static char	spl_raised;
static char	is_softint;

bool
kpreempt(uintptr_t where)
{
	uintptr_t failed;
	lwp_t *l;
	int s, dop;

	l = curlwp;
	failed = 0;
	while ((dop = l->l_dopreempt) != 0) {
		if (l->l_stat != LSONPROC) {
			/*
			 * About to block (or die), let it happen.
			 * Doesn't really count as "preemption has
			 * been blocked", since we're going to
			 * context switch.
			 */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false((l->l_flag & LW_IDLE) != 0)) {
			/* Can't preempt idle loop, don't count as failure. */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false(l->l_nopreempt != 0)) {
			/* LWP holds preemption disabled, explicitly. */
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_crit.ev_count++;
			}
			failed = (uintptr_t)&in_critical_section;
			break;
		}
		if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
			/* Can't preempt soft interrupts yet. */
			l->l_dopreempt = 0;
			failed = (uintptr_t)&is_softint;
			break;
		}
		s = splsched();
		if (__predict_false(l->l_blcnt != 0 ||
		    curcpu()->ci_biglock_wanted != NULL)) {
			/* Hold or want kernel_lock, code is not MT safe. */
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_klock.ev_count++;
			}
			failed = (uintptr_t)&kernel_lock_held;
			break;
		}
		if (__predict_false(!cpu_kpreempt_enter(where, s))) {
			/*
			 * It may be that the IPL is too high.
			 * cpu_kpreempt_enter() can schedule an
			 * interrupt to retry later.
			 */
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_ipl.ev_count++;
			}
			failed = (uintptr_t)&spl_raised;
			break;
		}
		/* Do it! */
		if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
			kpreempt_ev_immed.ev_count++;
		}
		lwp_lock(l);
		mi_switch(l);
		l->l_nopreempt++;
		splx(s);

		/* Take care of any MD cleanup. */
		cpu_kpreempt_exit(where);
		l->l_nopreempt--;
	}

	/* Record preemption failure for reporting via lockstat. */
	if (__predict_false(failed)) {
		int lsflag = 0;
		atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
		LOCKSTAT_ENTER(lsflag);
		/* Might recurse, make it atomic. */
		if (__predict_false(lsflag)) {
			if (where == 0) {
				where = (uintptr_t)__builtin_return_address(0);
			}
			if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr,
			    NULL, (void *)where) == NULL) {
				LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
				l->l_pfaillock = failed;
			}
		}
		LOCKSTAT_EXIT(lsflag);
	}

	return failed;
}

/*
 * Return true if preemption is explicitly disabled.
 */
bool
kpreempt_disabled(void)
{
	lwp_t *l;

	l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
}

/*
 * Disable kernel preemption.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

/*
 * Re-enable kernel preemption.
 */
void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}
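
/*
 * A minimal sketch of the intended use of the pair above: holding off
 * kernel preemption while manipulating per-CPU state, so that the LWP
 * cannot migrate to another CPU mid-sequence.  The per-CPU counter field
 * shown here is hypothetical:
 *
 *	struct cpu_info *ci;
 *
 *	kpreempt_disable();
 *	ci = curcpu();
 *	ci->ci_data.cpu_example_count++;
 *	kpreempt_enable();
 */
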
/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */

void
updatertime(lwp_t *l, const struct bintime *now)
{

	if ((l->l_flag & LW_IDLE) != 0)
		return;

	/* rtime += now - stime */
	bintime_add(&l->l_rtime, now);
	bintime_sub(&l->l_rtime, &l->l_stime);
}

/*
 * The machine independent parts of context switch.
 *
 * Returns 1 if another LWP was actually run.
 */
int
mi_switch(lwp_t *l)
{
	struct cpu_info *ci, *tci = NULL;
	struct schedstate_percpu *spc;
	struct lwp *newl;
	int retval, oldspl;
	struct bintime bt;
	bool returning;

	KASSERT(lwp_locked(l, NULL));
	KASSERT(kpreempt_disabled());
	LOCKDEBUG_BARRIER(l->l_mutex, 1);

#ifdef KSTACK_CHECK_MAGIC
	kstack_check_magic(l);
#endif

	binuptime(&bt);

	KASSERT(l->l_cpu == curcpu());
	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	returning = false;
	newl = NULL;

	/*
	 * If we have been asked to switch to a specific LWP, then there
	 * is no need to inspect the run queues.  If a soft interrupt is
	 * blocking, then return to the interrupted thread without adjusting
	 * VM context or its start time: neither have been changed in order
	 * to take the interrupt.
	 */
	if (l->l_switchto != NULL) {
		if ((l->l_pflag & LP_INTR) != 0) {
			returning = true;
			softint_block(l);
			if ((l->l_flag & LW_TIMEINTR) != 0)
				updatertime(l, &bt);
		}
		newl = l->l_switchto;
		l->l_switchto = NULL;
	}
#ifndef __HAVE_FAST_SOFTINTS
	else if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_flag |= LW_RUNNING;
	}
#endif	/* !__HAVE_FAST_SOFTINTS */

	/* Count time spent in current system call */
	if (!returning) {
		SYSCALL_TIME_SLEEP(l);

		/*
		 * XXXSMP If we are using h/w performance counters,
		 * save context.
		 */
#if PERFCTRS
		if (PMC_ENABLED(l->l_proc)) {
			pmc_save_context(l->l_proc);
		}
#endif
		updatertime(l, &bt);
	}

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	KASSERT(l->l_stat != LSRUN);
	if (l->l_stat == LSONPROC && (l->l_target_cpu || l != newl)) {
		KASSERT(lwp_locked(l, spc->spc_lwplock));

		if (l->l_target_cpu == l->l_cpu) {
			l->l_target_cpu = NULL;
		} else {
			tci = l->l_target_cpu;
		}

		if (__predict_false(tci != NULL)) {
			/* Double-lock the runqueues */
			spc_dlock(ci, tci);
		} else {
			/* Lock the runqueue */
			spc_lock(ci);
		}

		if ((l->l_flag & LW_IDLE) == 0) {
			l->l_stat = LSRUN;
			if (__predict_false(tci != NULL)) {
				/*
				 * Set the new CPU, lock and unset the
				 * l_target_cpu - thread will be enqueued
				 * to the runqueue of target CPU.
				 */
				l->l_cpu = tci;
				lwp_setlock(l, tci->ci_schedstate.spc_mutex);
				l->l_target_cpu = NULL;
			} else {
				lwp_setlock(l, spc->spc_mutex);
			}
			sched_enqueue(l, true);
		} else {
			KASSERT(tci == NULL);
			l->l_stat = LSIDL;
		}
	} else {
		/* Lock the runqueue */
		spc_lock(ci);
	}

	/*
	 * Let sched_nextlwp() select the LWP to run on this CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and that
	 * the new thread will be unlocked after setting the LWP lock.
	 */
	if (newl == NULL) {
		newl = sched_nextlwp();
		if (newl != NULL) {
			sched_dequeue(newl);
			KASSERT(lwp_locked(newl, spc->spc_mutex));
			newl->l_stat = LSONPROC;
			newl->l_cpu = ci;
			newl->l_flag |= LW_RUNNING;
			lwp_setlock(newl, spc->spc_lwplock);
		} else {
			newl = ci->ci_data.cpu_idlelwp;
			newl->l_stat = LSONPROC;
			newl->l_flag |= LW_RUNNING;
		}
		/*
		 * Only clear want_resched if there are no
		 * pending (slow) software interrupts.
		 */
		ci->ci_want_resched = ci->ci_data.cpu_softints;
		spc->spc_flags &= ~SPCF_SWITCHCLEAR;
		spc->spc_curpriority = lwp_eprio(newl);
	}

	/* Items that must be updated with the CPU locked. */
	if (!returning) {
		/* Update the new LWP's start time. */
		newl->l_stime = bt;

		/*
		 * ci_curlwp changes when a fast soft interrupt occurs.
		 * We use cpu_onproc to keep track of which kernel or
		 * user thread is running 'underneath' the software
		 * interrupt.  This is important for time accounting,
		 * itimers and forcing user threads to preempt (aston).
		 */
		ci->ci_data.cpu_onproc = newl;
	}

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
	 */
	cpu_did_resched(l);
	l->l_dopreempt = 0;
	if (__predict_false(l->l_pfailaddr != 0)) {
		LOCKSTAT_FLAG(lsflag);
		LOCKSTAT_ENTER(lsflag);
		LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
		LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
		    1, l->l_pfailtime, l->l_pfailaddr);
		LOCKSTAT_EXIT(lsflag);
		l->l_pfailtime = 0;
		l->l_pfaillock = 0;
		l->l_pfailaddr = 0;
	}

	if (l != newl) {
		struct lwp *prevlwp;

		/* Release all locks, but leave the current LWP locked */
		if (l->l_mutex == l->l_cpu->ci_schedstate.spc_mutex) {
			/*
			 * In case of migration, drop the local runqueue
			 * lock, thread is on other runqueue now.
			 */
			if (__predict_false(tci != NULL))
				spc_unlock(ci);
			/*
			 * Drop spc_lwplock, if the current LWP has been moved
			 * to the run queue (it is now locked by spc_mutex).
			 */
			mutex_spin_exit(spc->spc_lwplock);
		} else {
			/*
			 * Otherwise, drop the spc_mutex, we are done with the
			 * run queues.
			 */
			mutex_spin_exit(spc->spc_mutex);
			KASSERT(tci == NULL);
		}

		/*
		 * Mark that a context switch is going to be performed
		 * for this LWP, to protect it from being switched
		 * to on another CPU.
		 */
		KASSERT(l->l_ctxswtch == 0);
		l->l_ctxswtch = 1;
		l->l_ncsw++;
		l->l_flag &= ~LW_RUNNING;

		/*
		 * Increase the count of spin-mutexes before the release
		 * of the last lock - we must remain at IPL_SCHED during
		 * the context switch.
		 */
		oldspl = MUTEX_SPIN_OLDSPL(ci);
		ci->ci_mtx_count--;
		lwp_unlock(l);

		/* Count the context switch on this CPU. */
		ci->ci_data.cpu_nswtch++;

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL)
			l->l_lwpctl->lc_curcpu = LWPCTL_CPU_NONE;

		/*
		 * Save old VM context, unless a soft interrupt
		 * handler is blocking.
		 */
		if (!returning)
			pmap_deactivate(l);

		/*
		 * We may need to spin-wait if 'newl' is still
		 * context switching on another CPU.
		 */
		if (newl->l_ctxswtch != 0) {
			u_int count;
			count = SPINLOCK_BACKOFF_MIN;
			while (newl->l_ctxswtch)
				SPINLOCK_BACKOFF(count);
		}

		/* Switch to the new LWP. */
		prevlwp = cpu_switchto(l, newl, returning);
		ci = curcpu();

		/*
		 * Switched away - we have new curlwp.
		 * Restore VM context and IPL.
		 */
		pmap_activate(l);
		if (prevlwp != NULL) {
			/* Normalize the count of the spin-mutexes */
			ci->ci_mtx_count++;
			/* Unmark the state of context switch */
			membar_exit();
			prevlwp->l_ctxswtch = 0;
		}

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL) {
			l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
			l->l_lwpctl->lc_pctr++;
		}

		KASSERT(l->l_cpu == ci);
		splx(oldspl);
		retval = 1;
	} else {
		/* Nothing to do - just unlock and return. */
		KASSERT(tci == NULL);
		spc_unlock(ci);
		lwp_unlock(l);
		retval = 0;
	}

	KASSERT(l == curlwp);
	KASSERT(l->l_stat == LSONPROC);

	/*
	 * XXXSMP If we are using h/w performance counters, restore context.
	 * XXXSMP preemption problem.
	 */
#if PERFCTRS
	if (PMC_ENABLED(l->l_proc)) {
		pmc_restore_context(l->l_proc);
	}
#endif
	SYSCALL_TIME_WAKEUP(l);
	LOCKDEBUG_BARRIER(NULL, 1);

	return retval;
}

/*
 * Change process state to be runnable, placing it on the run queue if it is
 * in memory, and awakening the swapper if it isn't in memory.
 *
 * Call with the process and LWP locked.  Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	struct cpu_info *ci;
	sigset_t *ss;

	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT(mutex_owned(p->p_lock));
	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
			if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
				ss = &l->l_sigpend.sp_set;
			else
				ss = &p->p_sigpend.sp_set;
			sigaddset(ss, p->p_xstat);
			signotify(l);
		}
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		l->l_flag &= ~LW_WSUSPEND;
		p->p_nrlwps++;
		cv_broadcast(&p->p_lwpcv);
		break;
	case LSSLEEP:
		KASSERT(l->l_wchan != NULL);
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

	/*
	 * If the LWP was sleeping interruptibly, then it's OK to start it
	 * again.  If not, mark it as still sleeping.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l, true);
		return;
	}

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_flag & LW_RUNNING) != 0) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Look for a CPU to run.
	 * Set the LWP runnable.
	 */
	ci = sched_takecpu(l);
	l->l_cpu = ci;
	spc_lock(ci);
	lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
	sched_setrunnable(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;

	/*
	 * If the thread is swapped out - wake the swapper to bring it back
	 * in.  Otherwise, enter it into a run queue.
	 */
	if (l->l_flag & LW_INMEM) {
		sched_enqueue(l, false);
		resched_cpu(l);
		lwp_unlock(l);
	} else {
		lwp_unlock(l);
		uvm_kick_scheduler();
	}
}

/*
 * suspendsched:
 *
 *	Convert all non-L_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct lwp *l;
	struct proc *p;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		if ((p->p_flag & PK_MARKER) != 0)
			continue;

		mutex_enter(p->p_lock);
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			continue;
		}

		p->p_stat = SSTOP;

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l == curlwp)
				continue;

			lwp_lock(l);

			/*
			 * Set L_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode.  We want to
			 * try to get as many LWPs as possible to
			 * the user / kernel boundary, so that they will
			 * release any locks that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */
				setrunnable(l);
				continue;
			}

			lwp_unlock(l);
		}

		mutex_exit(p->p_lock);
	}
	mutex_exit(proc_lock);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
	 */
	for (CPU_INFO_FOREACH(cii, ci)) {
		spc_lock(ci);
		cpu_need_resched(ci, RESCHED_IMMED);
		spc_unlock(ci);
	}
}

/*
 * sched_unsleep:
 *
 *	This is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, if the sleep timed out.  Because of this,
 *	it's not a valid action for running or idle LWPs.
 */
static u_int
sched_unsleep(struct lwp *l, bool cleanup)
{

	lwp_unlock(l);
	panic("sched_unsleep");
}

void
resched_cpu(struct lwp *l)
{
	struct cpu_info *ci;

	/*
	 * XXXSMP
	 * Since l->l_cpu persists across a context switch,
	 * this gives us *very weak* processor affinity, in
	 * that we notify the CPU on which the process last
	 * ran that it should try to switch.
	 *
	 * This does not guarantee that the process will run on
	 * that processor next, because another processor might
	 * grab it the next time it performs a context switch.
	 *
	 * This also does not handle the case where its last
	 * CPU is running a higher-priority process, but every
	 * other CPU is running a lower-priority process.  There
	 * are ways to handle this situation, but they're not
	 * currently very pretty, and we also need to weigh the
	 * cost of moving a process from one CPU to another.
	 */
	ci = l->l_cpu;
	if (lwp_eprio(l) > ci->ci_schedstate.spc_curpriority)
		cpu_need_resched(ci, 0);
}

static void
sched_changepri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_priority = pri;
		sched_enqueue(l, false);
	} else {
		l->l_priority = pri;
	}
	resched_cpu(l);
}

static void
sched_lendpri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_inheritedprio = pri;
		sched_enqueue(l, false);
	} else {
		l->l_inheritedprio = pri;
	}
	resched_cpu(l);
}

struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the percentage of CPU used by a
 * process.
 */
#define	CCPU_SHIFT	(FSHIFT + 1)
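
/*
 * Worked example of the decay above: each second, sched_pstats() scales
 * l_pctcpu by ccpu = exp(-1/20), so after 60 seconds with no new ticks
 * the remaining fraction is exp(-60/20) = exp(-3) ~= 0.0498, i.e. about
 * 5% of the original value - hence "decay 95% in 60 seconds".  With the
 * usual FSHIFT of 11 (FSCALE = 2048), ccpu ~= 1948 and one decay step is
 *
 *	l_pctcpu = (l_pctcpu * 1948) >> 11;
 */
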
/*
 * sched_pstats:
 *
 *	Update process statistics and check CPU resource allocation.
 *	Call scheduler-specific hook to eventually adjust process/LWP
 *	priorities.
 */
/* ARGSUSED */
void
sched_pstats(void *arg)
{
	struct rlimit *rlim;
	struct lwp *l;
	struct proc *p;
	int sig, clkhz;
	long runtm;

	sched_pstats_ticks++;

	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		if ((p->p_flag & PK_MARKER) != 0)
			continue;

		/*
		 * Increment time in/out of memory and sleep time (if
		 * sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		mutex_enter(p->p_lock);
		mutex_spin_enter(&p->p_stmutex);
		runtm = p->p_rtime.sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if ((l->l_flag & LW_IDLE) != 0)
				continue;
			lwp_lock(l);
			runtm += l->l_rtime.sec;
			l->l_swtime++;
			sched_pstats_hook(l);
			lwp_unlock(l);

			/*
			 * p_pctcpu is only for ps.
			 */
			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
			if (l->l_slptime < 1) {
				clkhz = stathz != 0 ? stathz : hz;
#if	(FSHIFT >= CCPU_SHIFT)
				l->l_pctcpu += (clkhz == 100) ?
				    ((fixpt_t)l->l_cpticks) <<
				    (FSHIFT - CCPU_SHIFT) :
				    100 * (((fixpt_t) p->p_cpticks)
				    << (FSHIFT - CCPU_SHIFT)) / clkhz;
#else
				l->l_pctcpu += ((FSCALE - ccpu) *
				    (l->l_cpticks * FSCALE / clkhz)) >> FSHIFT;
#endif
				l->l_cpticks = 0;
			}
		}
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		mutex_spin_exit(&p->p_stmutex);

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over max, kill it.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		sig = 0;
		if (runtm >= rlim->rlim_cur) {
			if (runtm >= rlim->rlim_max)
				sig = SIGKILL;
			else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}
		mutex_exit(p->p_lock);
		if (sig)
			psignal(p, sig);
	}
	mutex_exit(proc_lock);
	uvm_meter();
	cv_wakeup(&lbolt);
	callout_schedule(&sched_pstats_ch, hz);
}