/*	$NetBSD: kern_synch.c,v 1.243 2008/05/19 17:06:02 ad Exp $	*/

/*-
 * Copyright (c) 1999, 2000, 2004, 2006, 2007, 2008 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center, by Charles M. Hannum, Andrew Doran and
 * Daniel Sieger.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: kern_synch.c,v 1.243 2008/05/19 17:06:02 ad Exp $");

#include "opt_kstack.h"
#include "opt_perfctrs.h"

#define	__MUTEX_PRIVATE

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#if defined(PERFCTRS)
#include <sys/pmc.h>
#endif
#include <sys/cpu.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscall_stats.h>
#include <sys/sleepq.h>
#include <sys/lockdebug.h>
#include <sys/evcnt.h>
#include <sys/intr.h>
#include <sys/lwpctl.h>
#include <sys/atomic.h>
#include <sys/simplelock.h>

#include <uvm/uvm_extern.h>

#include <dev/lockstat.h>

static u_int	sched_unsleep(struct lwp *, bool);
static void	sched_changepri(struct lwp *, pri_t);
static void	sched_lendpri(struct lwp *, pri_t);

syncobj_t sleep_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sleepq_unsleep,
	sleepq_changepri,
	sleepq_lendpri,
	syncobj_noowner,
};

syncobj_t sched_syncobj = {
	SOBJ_SLEEPQ_SORTED,
	sched_unsleep,
	sched_changepri,
	sched_lendpri,
	syncobj_noowner,
};

callout_t	sched_pstats_ch;
unsigned	sched_pstats_ticks;
kcondvar_t	lbolt;			/* once a second sleep address */

/* Preemption event counters */
static struct evcnt kpreempt_ev_crit;
static struct evcnt kpreempt_ev_klock;
static struct evcnt kpreempt_ev_ipl;
static struct evcnt kpreempt_ev_immed;

/*
 * During autoconfiguration or after a panic, a sleep will simply lower the
 * priority briefly to allow interrupts, then return.  The priority to be
 * used (safepri) is machine-dependent, thus this value is initialized and
 * maintained in the machine-dependent layers.  This priority will typically
 * be 0, or the lowest priority that is safe for use on the interrupt stack;
 * it can be made higher to block network software interrupts after panics.
 */
int	safepri;

void
sched_init(void)
{

	cv_init(&lbolt, "lbolt");
	callout_init(&sched_pstats_ch, CALLOUT_MPSAFE);
	callout_setfunc(&sched_pstats_ch, sched_pstats, NULL);

	evcnt_attach_dynamic(&kpreempt_ev_crit, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: critical section");
	evcnt_attach_dynamic(&kpreempt_ev_klock, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: kernel_lock");
	evcnt_attach_dynamic(&kpreempt_ev_ipl, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "defer: IPL");
	evcnt_attach_dynamic(&kpreempt_ev_immed, EVCNT_TYPE_MISC, NULL,
	    "kpreempt", "immediate");

	sched_pstats(NULL);
}

/*
 * OBSOLETE INTERFACE
 *
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.
 * Sleeps at most timo/hz seconds (0 means no timeout).  If pri includes the
 * PCATCH flag, signals are checked before and after sleeping, else signals
 * are not checked.  Returns 0 if awakened, EWOULDBLOCK if the timeout
 * expires.  If PCATCH is set and a signal needs to be delivered, ERESTART
 * is returned if the current system call should be restarted if possible,
 * and EINTR is returned if the system call should be interrupted by the
 * signal.
 *
 * The interlock is held until we are on a sleep queue.  The interlock will
 * be locked before returning to the caller unless the PNORELOCK flag is
 * specified, in which case the interlock will always be unlocked upon
 * return.
 */
int
ltsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	volatile struct simplelock *interlock)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(NULL, 0);
		if ((priority & PNORELOCK) != 0)
			simple_unlock(interlock);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);

	if (interlock != NULL) {
		KASSERT(simple_lock_held(interlock));
		simple_unlock(interlock);
	}

	error = sleepq_block(timo, priority & PCATCH);

	if (interlock != NULL && (priority & PNORELOCK) == 0)
		simple_lock(interlock);

	return error;
}

int
mtsleep(wchan_t ident, pri_t priority, const char *wmesg, int timo,
	kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	KASSERT((l->l_pflag & LP_INTR) == 0);

	if (sleepq_dontsleep(l)) {
		(void)sleepq_abort(mtx, (priority & PNORELOCK) != 0);
		return 0;
	}

	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, ident, wmesg, &sleep_syncobj);
	mutex_exit(mtx);
	error = sleepq_block(timo, priority & PCATCH);

	if ((priority & PNORELOCK) == 0)
		mutex_enter(mtx);

	return error;
}

/*
 * General sleep call for situations where a wake-up is not expected.
 */
int
kpause(const char *wmesg, bool intr, int timo, kmutex_t *mtx)
{
	struct lwp *l = curlwp;
	sleepq_t *sq;
	int error;

	if (sleepq_dontsleep(l))
		return sleepq_abort(NULL, 0);

	if (mtx != NULL)
		mutex_exit(mtx);
	l->l_kpriority = true;
	sq = sleeptab_lookup(&sleeptab, l);
	sleepq_enter(sq, l);
	sleepq_enqueue(sq, l, wmesg, &sleep_syncobj);
	error = sleepq_block(timo, intr);
	if (mtx != NULL)
		mutex_enter(mtx);

	return error;
}

/*
 * OBSOLETE INTERFACE
 *
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(wchan_t ident)
{
	sleepq_t *sq;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_wake(sq, ident, (u_int)-1);
}

/*
 * OBSOLETE INTERFACE
 *
 * Make the highest priority process first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(wchan_t ident)
{
	sleepq_t *sq;

	if (cold)
		return;

	sq = sleeptab_lookup(&sleeptab, ident);
	sleepq_wake(sq, ident, 1);
}
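
/*
 * Example (illustrative sketch only, not part of this file's interfaces):
 * the classic sleep/wakeup pattern documented above, using a hypothetical
 * driver softc "sc" with a kmutex_t sc_lock and a flag sc_ready.  As the
 * code above shows, only the PCATCH/PNORELOCK bits of the priority argument
 * are interpreted here; the numeric priority is not.  The sleeper re-checks
 * its condition in a loop, since wakeup() does not guarantee the condition
 * still holds when the sleeper runs again.  New code should prefer
 * condition variables (condvar(9)) over mtsleep()/wakeup().
 *
 *	// Sleeper: wait until sc->sc_ready becomes true.
 *	mutex_enter(&sc->sc_lock);
 *	while (!sc->sc_ready) {
 *		// mtsleep() drops sc_lock while asleep and, without
 *		// PNORELOCK, reacquires it before returning.  PCATCH
 *		// lets signals interrupt the sleep; timo == 0 means
 *		// no timeout.
 *		error = mtsleep(&sc->sc_ready, PCATCH, "excwait", 0,
 *		    &sc->sc_lock);
 *		if (error != 0)
 *			break;
 *	}
 *	mutex_exit(&sc->sc_lock);
 *
 *	// Waker: establish the condition, then wake all sleepers on the
 *	// channel.
 *	mutex_enter(&sc->sc_lock);
 *	sc->sc_ready = true;
 *	mutex_exit(&sc->sc_lock);
 *	wakeup(&sc->sc_ready);
 */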

/*
 * General yield call.  Puts the current process back on its run queue and
 * performs a voluntary context switch.  Should only be called when the
 * current process explicitly requests it (e.g. sched_yield(2)).
 */
void
yield(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}

/*
 * General preemption call.  Puts the current process back on its run queue
 * and performs an involuntary context switch.
 */
void
preempt(void)
{
	struct lwp *l = curlwp;

	KERNEL_UNLOCK_ALL(l, &l->l_biglocks);
	lwp_lock(l);
	KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_lwplock));
	KASSERT(l->l_stat == LSONPROC);
	l->l_kpriority = false;
	l->l_nivcsw++;
	(void)mi_switch(l);
	KERNEL_LOCK(l->l_biglocks, l);
}
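
/*
 * Example (illustrative sketch only): how the two calls above differ in
 * use.  yield() backs an explicit request such as sched_yield(2), while
 * preempt() is the call to make when some other agent (clock tick, a
 * higher-priority LWP waking up) has asked the current LWP to get off the
 * CPU; it is accounted as an involuntary switch via l_nivcsw.  The loop
 * below is hypothetical and the exact condition a real kernel thread tests
 * may differ; ci_want_resched is the per-CPU reschedule hint manipulated
 * in mi_switch() below.
 *
 *	// Hypothetical long-running kernel thread:
 *	while (more_work(sc)) {
 *		do_one_unit(sc);
 *		if (curcpu()->ci_want_resched != 0)
 *			preempt();	// give the CPU away between units
 *	}
 */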

/*
 * Handle a request made by another agent to preempt the current LWP
 * in-kernel.  Usually called when l_dopreempt may be non-zero.
 *
 * Character addresses for lockstat only.
 */
static char	in_critical_section;
static char	kernel_lock_held;
static char	spl_raised;
static char	is_softint;

bool
kpreempt(uintptr_t where)
{
	uintptr_t failed;
	lwp_t *l;
	int s, dop;

	l = curlwp;
	failed = 0;
	while ((dop = l->l_dopreempt) != 0) {
		if (l->l_stat != LSONPROC) {
			/*
			 * About to block (or die), let it happen.
			 * Doesn't really count as "preemption has
			 * been blocked", since we're going to
			 * context switch.
			 */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false((l->l_flag & LW_IDLE) != 0)) {
			/* Can't preempt idle loop, don't count as failure. */
			l->l_dopreempt = 0;
			return true;
		}
		if (__predict_false(l->l_nopreempt != 0)) {
			/* LWP holds preemption disabled, explicitly. */
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_crit.ev_count++;
			}
			failed = (uintptr_t)&in_critical_section;
			break;
		}
		if (__predict_false((l->l_pflag & LP_INTR) != 0)) {
			/* Can't preempt soft interrupts yet. */
			l->l_dopreempt = 0;
			failed = (uintptr_t)&is_softint;
			break;
		}
		s = splsched();
		if (__predict_false(l->l_blcnt != 0 ||
		    curcpu()->ci_biglock_wanted != NULL)) {
			/* Hold or want kernel_lock, code is not MT safe. */
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_klock.ev_count++;
			}
			failed = (uintptr_t)&kernel_lock_held;
			break;
		}
		if (__predict_false(!cpu_kpreempt_enter(where, s))) {
			/*
			 * It may be that the IPL is too high.
			 * kpreempt_enter() can schedule an
			 * interrupt to retry later.
			 */
			splx(s);
			if ((dop & DOPREEMPT_COUNTED) == 0) {
				kpreempt_ev_ipl.ev_count++;
			}
			failed = (uintptr_t)&spl_raised;
			break;
		}
		/* Do it! */
		if (__predict_true((dop & DOPREEMPT_COUNTED) == 0)) {
			kpreempt_ev_immed.ev_count++;
		}
		lwp_lock(l);
		mi_switch(l);
		l->l_nopreempt++;
		splx(s);

		/* Take care of any MD cleanup. */
		cpu_kpreempt_exit(where);
		l->l_nopreempt--;
	}

	/* Record preemption failure for reporting via lockstat. */
	if (__predict_false(failed)) {
		int lsflag = 0;
		atomic_or_uint(&l->l_dopreempt, DOPREEMPT_COUNTED);
		LOCKSTAT_ENTER(lsflag);
		/* Might recurse, make it atomic. */
		if (__predict_false(lsflag)) {
			if (where == 0) {
				where = (uintptr_t)__builtin_return_address(0);
			}
			if (atomic_cas_ptr_ni((void *)&l->l_pfailaddr,
			    NULL, (void *)where) == NULL) {
				LOCKSTAT_START_TIMER(lsflag, l->l_pfailtime);
				l->l_pfaillock = failed;
			}
		}
		LOCKSTAT_EXIT(lsflag);
	}

	return failed;
}

/*
 * Return true if preemption is explicitly disabled.
 */
bool
kpreempt_disabled(void)
{
	lwp_t *l;

	l = curlwp;

	return l->l_nopreempt != 0 || l->l_stat == LSZOMB ||
	    (l->l_flag & LW_IDLE) != 0 || cpu_kpreempt_disabled();
}

/*
 * Disable kernel preemption.
 */
void
kpreempt_disable(void)
{

	KPREEMPT_DISABLE(curlwp);
}

/*
 * Reenable kernel preemption.
 */
void
kpreempt_enable(void)
{

	KPREEMPT_ENABLE(curlwp);
}
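
/*
 * Example (illustrative sketch only): bracketing access to per-CPU data
 * with kpreempt_disable()/kpreempt_enable().  While preemption is disabled
 * (l_nopreempt non-zero, which kpreempt() above honours), the LWP cannot be
 * preempted in the kernel and so cannot migrate, so a pointer obtained from
 * curcpu() stays valid - provided the code does not sleep.  "struct
 * example_pcpu" and "ci_example" are hypothetical.
 *
 *	struct example_pcpu *pcpu;
 *
 *	kpreempt_disable();
 *	pcpu = &curcpu()->ci_example;	// safe: we stay on this CPU
 *	pcpu->counter++;
 *	kpreempt_enable();
 */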

/*
 * Compute the amount of time during which the current lwp was running.
 *
 * - update l_rtime unless it's an idle lwp.
 */

void
updatertime(lwp_t *l, const struct bintime *now)
{

	if ((l->l_flag & LW_IDLE) != 0)
		return;

	/* rtime += now - stime */
	bintime_add(&l->l_rtime, now);
	bintime_sub(&l->l_rtime, &l->l_stime);
}

/*
 * The machine independent parts of context switch.
 *
 * Returns 1 if another LWP was actually run.
 */
int
mi_switch(lwp_t *l)
{
	struct cpu_info *ci, *tci = NULL;
	struct schedstate_percpu *spc;
	struct lwp *newl;
	int retval, oldspl;
	struct bintime bt;
	bool returning;

	KASSERT(lwp_locked(l, NULL));
	KASSERT(kpreempt_disabled());
	LOCKDEBUG_BARRIER(l->l_mutex, 1);

#ifdef KSTACK_CHECK_MAGIC
	kstack_check_magic(l);
#endif

	binuptime(&bt);

	KASSERT(l->l_cpu == curcpu());
	ci = l->l_cpu;
	spc = &ci->ci_schedstate;
	returning = false;
	newl = NULL;

	/*
	 * If we have been asked to switch to a specific LWP, then there
	 * is no need to inspect the run queues.  If a soft interrupt is
	 * blocking, then return to the interrupted thread without adjusting
	 * VM context or its start time: neither have been changed in order
	 * to take the interrupt.
	 */
	if (l->l_switchto != NULL) {
		if ((l->l_pflag & LP_INTR) != 0) {
			returning = true;
			softint_block(l);
			if ((l->l_flag & LW_TIMEINTR) != 0)
				updatertime(l, &bt);
		}
		newl = l->l_switchto;
		l->l_switchto = NULL;
	}
#ifndef __HAVE_FAST_SOFTINTS
	else if (ci->ci_data.cpu_softints != 0) {
		/* There are pending soft interrupts, so pick one. */
		newl = softint_picklwp();
		newl->l_stat = LSONPROC;
		newl->l_flag |= LW_RUNNING;
	}
#endif	/* !__HAVE_FAST_SOFTINTS */

	/* Count time spent in current system call */
	if (!returning) {
		SYSCALL_TIME_SLEEP(l);

		/*
		 * XXXSMP If we are using h/w performance counters,
		 * save context.
		 */
#if PERFCTRS
		if (PMC_ENABLED(l->l_proc)) {
			pmc_save_context(l->l_proc);
		}
#endif
		updatertime(l, &bt);
	}

	/*
	 * If on the CPU and we have gotten this far, then we must yield.
	 */
	KASSERT(l->l_stat != LSRUN);
	if (l->l_stat == LSONPROC && (l->l_target_cpu || l != newl)) {
		KASSERT(lwp_locked(l, spc->spc_lwplock));

		if (l->l_target_cpu == l->l_cpu) {
			l->l_target_cpu = NULL;
		} else {
			tci = l->l_target_cpu;
		}

		if (__predict_false(tci != NULL)) {
			/* Double-lock the runqueues */
			spc_dlock(ci, tci);
		} else {
			/* Lock the runqueue */
			spc_lock(ci);
		}

		if ((l->l_flag & LW_IDLE) == 0) {
			l->l_stat = LSRUN;
			if (__predict_false(tci != NULL)) {
				/*
				 * Set the new CPU, lock and unset the
				 * l_target_cpu - thread will be enqueued
				 * to the runqueue of target CPU.
				 */
				l->l_cpu = tci;
				lwp_setlock(l, tci->ci_schedstate.spc_mutex);
				l->l_target_cpu = NULL;
			} else {
				lwp_setlock(l, spc->spc_mutex);
			}
			sched_enqueue(l, true);
		} else {
			KASSERT(tci == NULL);
			l->l_stat = LSIDL;
		}
	} else {
		/* Lock the runqueue */
		spc_lock(ci);
	}

	/*
	 * Let sched_nextlwp() select the LWP to run on the CPU next.
	 * If no LWP is runnable, select the idle LWP.
	 *
	 * Note that spc_lwplock might not necessarily be held, and that
	 * the new thread will be unlocked after setting the LWP-lock.
	 */
	if (newl == NULL) {
		newl = sched_nextlwp();
		if (newl != NULL) {
			sched_dequeue(newl);
			KASSERT(lwp_locked(newl, spc->spc_mutex));
			newl->l_stat = LSONPROC;
			newl->l_cpu = ci;
			newl->l_flag |= LW_RUNNING;
			lwp_setlock(newl, spc->spc_lwplock);
		} else {
			newl = ci->ci_data.cpu_idlelwp;
			newl->l_stat = LSONPROC;
			newl->l_flag |= LW_RUNNING;
		}
		/*
		 * Only clear want_resched if there are no
		 * pending (slow) software interrupts.
		 */
		ci->ci_want_resched = ci->ci_data.cpu_softints;
		spc->spc_flags &= ~SPCF_SWITCHCLEAR;
		spc->spc_curpriority = lwp_eprio(newl);
	}

	/* Items that must be updated with the CPU locked. */
	if (!returning) {
		/* Update the new LWP's start time. */
		newl->l_stime = bt;

		/*
		 * ci_curlwp changes when a fast soft interrupt occurs.
		 * We use cpu_onproc to keep track of which kernel or
		 * user thread is running 'underneath' the software
		 * interrupt.  This is important for time accounting,
		 * itimers and forcing user threads to preempt (aston).
		 */
		ci->ci_data.cpu_onproc = newl;
	}

	/*
	 * Preemption related tasks.  Must be done with the current
	 * CPU locked.
	 */
	cpu_did_resched(l);
	l->l_dopreempt = 0;
	if (__predict_false(l->l_pfailaddr != 0)) {
		LOCKSTAT_FLAG(lsflag);
		LOCKSTAT_ENTER(lsflag);
		LOCKSTAT_STOP_TIMER(lsflag, l->l_pfailtime);
		LOCKSTAT_EVENT_RA(lsflag, l->l_pfaillock, LB_NOPREEMPT|LB_SPIN,
		    1, l->l_pfailtime, l->l_pfailaddr);
		LOCKSTAT_EXIT(lsflag);
		l->l_pfailtime = 0;
		l->l_pfaillock = 0;
		l->l_pfailaddr = 0;
	}

	if (l != newl) {
		struct lwp *prevlwp;

		/* Release all locks, but leave the current LWP locked */
		if (l->l_mutex == l->l_cpu->ci_schedstate.spc_mutex) {
			/*
			 * In case of migration, drop the local runqueue
			 * lock, thread is on other runqueue now.
			 */
			if (__predict_false(tci != NULL))
				spc_unlock(ci);
			/*
			 * Drop spc_lwplock, if the current LWP has been moved
			 * to the run queue (it is now locked by spc_mutex).
			 */
			mutex_spin_exit(spc->spc_lwplock);
		} else {
			/*
			 * Otherwise, drop the spc_mutex, we are done with the
			 * run queues.
			 */
			mutex_spin_exit(spc->spc_mutex);
			KASSERT(tci == NULL);
		}

		/*
		 * Mark that a context switch is going to be performed
		 * for this LWP, to protect it from being switched
		 * to on another CPU.
		 */
		KASSERT(l->l_ctxswtch == 0);
		l->l_ctxswtch = 1;
		l->l_ncsw++;
		l->l_flag &= ~LW_RUNNING;

		/*
		 * Increase the count of spin-mutexes before the release
		 * of the last lock - we must remain at IPL_SCHED during
		 * the context switch.
		 */
		oldspl = MUTEX_SPIN_OLDSPL(ci);
		ci->ci_mtx_count--;
		lwp_unlock(l);

		/* Count the context switch on this CPU. */
		ci->ci_data.cpu_nswtch++;

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL)
			l->l_lwpctl->lc_curcpu = LWPCTL_CPU_NONE;

		/*
		 * Save old VM context, unless a soft interrupt
		 * handler is blocking.
		 */
		if (!returning)
			pmap_deactivate(l);

		/*
		 * We may need to spin-wait if 'newl' is still
		 * context switching on another CPU.
		 */
		if (newl->l_ctxswtch != 0) {
			u_int count;
			count = SPINLOCK_BACKOFF_MIN;
			while (newl->l_ctxswtch)
				SPINLOCK_BACKOFF(count);
		}

		/* Switch to the new LWP. */
		prevlwp = cpu_switchto(l, newl, returning);
		ci = curcpu();

		/*
		 * Switched away - we have new curlwp.
		 * Restore VM context and IPL.
		 */
		pmap_activate(l);
		if (prevlwp != NULL) {
			/* Normalize the count of the spin-mutexes */
			ci->ci_mtx_count++;
			/* Unmark the state of context switch */
			membar_exit();
			prevlwp->l_ctxswtch = 0;
		}

		/* Update status for lwpctl, if present. */
		if (l->l_lwpctl != NULL) {
			l->l_lwpctl->lc_curcpu = (int)cpu_index(ci);
			l->l_lwpctl->lc_pctr++;
		}

		KASSERT(l->l_cpu == ci);
		splx(oldspl);
		retval = 1;
	} else {
		/* Nothing to do - just unlock and return. */
		KASSERT(tci == NULL);
		spc_unlock(ci);
		lwp_unlock(l);
		retval = 0;
	}

	KASSERT(l == curlwp);
	KASSERT(l->l_stat == LSONPROC);

	/*
	 * XXXSMP If we are using h/w performance counters, restore context.
	 * XXXSMP preemption problem.
	 */
#if PERFCTRS
	if (PMC_ENABLED(l->l_proc)) {
		pmc_restore_context(l->l_proc);
	}
#endif
	SYSCALL_TIME_WAKEUP(l);
	LOCKDEBUG_BARRIER(NULL, 1);

	return retval;
}

/*
 * Change process state to be runnable, placing it on the run queue if it is
 * in memory, and awakening the swapper if it isn't in memory.
 *
 * Call with the process and LWP locked.  Will return with the LWP unlocked.
 */
void
setrunnable(struct lwp *l)
{
	struct proc *p = l->l_proc;
	struct cpu_info *ci;
	sigset_t *ss;

	KASSERT((l->l_flag & LW_IDLE) == 0);
	KASSERT(mutex_owned(p->p_lock));
	KASSERT(lwp_locked(l, NULL));
	KASSERT(l->l_mutex != l->l_cpu->ci_schedstate.spc_mutex);

	switch (l->l_stat) {
	case LSSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the
		 * debugger.
		 */
		if ((p->p_slflag & PSL_TRACED) != 0 && p->p_xstat != 0) {
			if ((sigprop[p->p_xstat] & SA_TOLWP) != 0)
				ss = &l->l_sigpend.sp_set;
			else
				ss = &p->p_sigpend.sp_set;
			sigaddset(ss, p->p_xstat);
			signotify(l);
		}
		p->p_nrlwps++;
		break;
	case LSSUSPENDED:
		l->l_flag &= ~LW_WSUSPEND;
		p->p_nrlwps++;
		cv_broadcast(&p->p_lwpcv);
		break;
	case LSSLEEP:
		KASSERT(l->l_wchan != NULL);
		break;
	default:
		panic("setrunnable: lwp %p state was %d", l, l->l_stat);
	}

	/*
	 * If the LWP was sleeping interruptibly, then it's OK to start it
	 * again.  If not, mark it as still sleeping.
	 */
	if (l->l_wchan != NULL) {
		l->l_stat = LSSLEEP;
		/* lwp_unsleep() will release the lock. */
		lwp_unsleep(l, true);
		return;
	}

	/*
	 * If the LWP is still on the CPU, mark it as LSONPROC.  It may be
	 * about to call mi_switch(), in which case it will yield.
	 */
	if ((l->l_flag & LW_RUNNING) != 0) {
		l->l_stat = LSONPROC;
		l->l_slptime = 0;
		lwp_unlock(l);
		return;
	}

	/*
	 * Look for a CPU to run.
	 * Set the LWP runnable.
	 */
	ci = sched_takecpu(l);
	l->l_cpu = ci;
	spc_lock(ci);
	lwp_unlock_to(l, ci->ci_schedstate.spc_mutex);
	sched_setrunnable(l);
	l->l_stat = LSRUN;
	l->l_slptime = 0;

	/*
	 * If the thread is swapped out, wake the swapper to bring it back
	 * in.  Otherwise, enter it into a run queue.
	 */
	if (l->l_flag & LW_INMEM) {
		sched_enqueue(l, false);
		resched_cpu(l);
		lwp_unlock(l);
	} else {
		lwp_unlock(l);
		uvm_kick_scheduler();
	}
}
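
/*
 * Example (illustrative sketch only): the locking contract documented
 * above.  Given a struct proc *p and one of its LWPs l, a hypothetical
 * caller that wants to restart a stopped or interruptibly-sleeping LWP
 * takes p_lock, locks the LWP, and lets setrunnable() consume the LWP
 * lock.  A real caller (for example, the code that continues a stopped
 * process) does additional bookkeeping such as adjusting p_stat;
 * suspendsched() below uses the same pattern for sleeping LWPs.
 *
 *	mutex_enter(p->p_lock);
 *	lwp_lock(l);
 *	if (l->l_stat == LSSTOP ||
 *	    (l->l_stat == LSSLEEP && (l->l_flag & LW_SINTR) != 0))
 *		setrunnable(l);		// releases the LWP lock
 *	else
 *		lwp_unlock(l);
 *	mutex_exit(p->p_lock);
 */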

/*
 * suspendsched:
 *
 *	Convert all non-PK_SYSTEM LSSLEEP or LSRUN LWPs to LSSUSPENDED.
 */
void
suspendsched(void)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	struct lwp *l;
	struct proc *p;

	/*
	 * We do this by process in order not to violate the locking rules.
	 */
	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		if ((p->p_flag & PK_MARKER) != 0)
			continue;

		mutex_enter(p->p_lock);
		if ((p->p_flag & PK_SYSTEM) != 0) {
			mutex_exit(p->p_lock);
			continue;
		}

		p->p_stat = SSTOP;

		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if (l == curlwp)
				continue;

			lwp_lock(l);

			/*
			 * Set LW_WREBOOT so that the LWP will suspend itself
			 * when it tries to return to user mode.  We want to
			 * try to get as many LWPs as possible to the
			 * user / kernel boundary, so that they will
			 * release any locks that they hold.
			 */
			l->l_flag |= (LW_WREBOOT | LW_WSUSPEND);

			if (l->l_stat == LSSLEEP &&
			    (l->l_flag & LW_SINTR) != 0) {
				/* setrunnable() will release the lock. */
				setrunnable(l);
				continue;
			}

			lwp_unlock(l);
		}

		mutex_exit(p->p_lock);
	}
	mutex_exit(proc_lock);

	/*
	 * Kick all CPUs to make them preempt any LWPs running in user mode.
	 * They'll trap into the kernel and suspend themselves in userret().
	 */
	for (CPU_INFO_FOREACH(cii, ci)) {
		spc_lock(ci);
		cpu_need_resched(ci, RESCHED_IMMED);
		spc_unlock(ci);
	}
}

/*
 * sched_unsleep:
 *
 *	This is called when the LWP has not been awoken normally but instead
 *	interrupted: for example, if the sleep timed out.  Because of this,
 *	it's not a valid action for running or idle LWPs.
 */
static u_int
sched_unsleep(struct lwp *l, bool cleanup)
{

	lwp_unlock(l);
	panic("sched_unsleep");
}

void
resched_cpu(struct lwp *l)
{
	struct cpu_info *ci;

	/*
	 * XXXSMP
	 * Since l->l_cpu persists across a context switch,
	 * this gives us *very weak* processor affinity, in
	 * that we notify the CPU on which the process last
	 * ran that it should try to switch.
	 *
	 * This does not guarantee that the process will run on
	 * that processor next, because another processor might
	 * grab it the next time it performs a context switch.
	 *
	 * This also does not handle the case where its last
	 * CPU is running a higher-priority process, but every
	 * other CPU is running a lower-priority process.  There
	 * are ways to handle this situation, but they're not
	 * currently very pretty, and we also need to weigh the
	 * cost of moving a process from one CPU to another.
	 */
	ci = l->l_cpu;
	if (lwp_eprio(l) > ci->ci_schedstate.spc_curpriority)
		cpu_need_resched(ci, 0);
}

static void
sched_changepri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_priority = pri;
		sched_enqueue(l, false);
	} else {
		l->l_priority = pri;
	}
	resched_cpu(l);
}

static void
sched_lendpri(struct lwp *l, pri_t pri)
{

	KASSERT(lwp_locked(l, NULL));

	if (l->l_stat == LSRUN && (l->l_flag & LW_INMEM) != 0) {
		KASSERT(lwp_locked(l, l->l_cpu->ci_schedstate.spc_mutex));
		sched_dequeue(l);
		l->l_inheritedprio = pri;
		sched_enqueue(l, false);
	} else {
		l->l_inheritedprio = pri;
	}
	resched_cpu(l);
}

struct lwp *
syncobj_noowner(wchan_t wchan)
{

	return NULL;
}

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	(FSHIFT + 1)
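
/*
 * Worked example (informative only) of the decay above: once per second,
 * sched_pstats() multiplies l_pctcpu and p_pctcpu by ccpu/FSCALE, i.e. by
 * exp(-1/20), so after 60 seconds an otherwise idle process retains
 * exp(-60/20) = exp(-3) ~= 0.0498 of its old %CPU - roughly 95% has
 * decayed away.  In fixed point, with FSCALE = 1 << FSHIFT, one decay
 * step is:
 *
 *	p_pctcpu = (p_pctcpu * ccpu) >> FSHIFT;
 *
 * For instance, with FSHIFT = 11 (FSCALE = 2048), ccpu truncates to
 * 0.951229... * 2048 ~= 1948, so a value of 1024 (50%) becomes
 * (1024 * 1948) >> 11 = 974, about 47.6%.
 */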

/*
 * sched_pstats:
 *
 *	Update process statistics and check CPU resource allocation.
 *	Call scheduler-specific hook to eventually adjust process/LWP
 *	priorities.
 */
/* ARGSUSED */
void
sched_pstats(void *arg)
{
	struct rlimit *rlim;
	struct lwp *l;
	struct proc *p;
	int sig, clkhz;
	long runtm;

	sched_pstats_ticks++;

	mutex_enter(proc_lock);
	PROCLIST_FOREACH(p, &allproc) {
		if ((p->p_flag & PK_MARKER) != 0)
			continue;

		/*
		 * Increment time in/out of memory and sleep time (if
		 * sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		mutex_enter(p->p_lock);
		mutex_spin_enter(&p->p_stmutex);
		runtm = p->p_rtime.sec;
		LIST_FOREACH(l, &p->p_lwps, l_sibling) {
			if ((l->l_flag & LW_IDLE) != 0)
				continue;
			lwp_lock(l);
			runtm += l->l_rtime.sec;
			l->l_swtime++;
			sched_lwp_stats(l);
			lwp_unlock(l);

			/*
			 * p_pctcpu is only for ps.
			 */
			l->l_pctcpu = (l->l_pctcpu * ccpu) >> FSHIFT;
			if (l->l_slptime < 1) {
				clkhz = stathz != 0 ? stathz : hz;
#if	(FSHIFT >= CCPU_SHIFT)
				l->l_pctcpu += (clkhz == 100) ?
				    ((fixpt_t)l->l_cpticks) <<
				    (FSHIFT - CCPU_SHIFT) :
				    100 * (((fixpt_t) p->p_cpticks)
				    << (FSHIFT - CCPU_SHIFT)) / clkhz;
#else
				l->l_pctcpu += ((FSCALE - ccpu) *
				    (l->l_cpticks * FSCALE / clkhz)) >> FSHIFT;
#endif
				l->l_cpticks = 0;
			}
		}
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		mutex_spin_exit(&p->p_stmutex);

		/*
		 * Check if the process exceeds its CPU resource allocation.
		 * If over max, kill it.
		 */
		rlim = &p->p_rlimit[RLIMIT_CPU];
		sig = 0;
		if (runtm >= rlim->rlim_cur) {
			if (runtm >= rlim->rlim_max)
				sig = SIGKILL;
			else {
				sig = SIGXCPU;
				if (rlim->rlim_cur < rlim->rlim_max)
					rlim->rlim_cur += 5;
			}
		}
		mutex_exit(p->p_lock);
		if (sig)
			psignal(p, sig);
	}
	mutex_exit(proc_lock);
	uvm_meter();
	cv_wakeup(&lbolt);
	callout_schedule(&sched_pstats_ch, hz);
}