/*
 * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/usched_bsd4.c,v 1.3 2005/10/11 09:59:56 corecode Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/queue.h>
#include <sys/proc.h>
#include <sys/rtprio.h>
#include <sys/thread2.h>
#include <sys/uio.h>
#include <sys/sysctl.h>
#include <sys/resourcevar.h>
#include <machine/ipl.h>
#include <machine/cpu.h>
#include <machine/smp.h>

/*
 * Priorities.  Note that with 32 run queues per scheduler each queue
 * represents four priority levels.
 */

#define MAXPRI                  128
#define PRIMASK                 (MAXPRI - 1)
#define PRIBASE_REALTIME        0
#define PRIBASE_NORMAL          MAXPRI
#define PRIBASE_IDLE            (MAXPRI * 2)
#define PRIBASE_THREAD          (MAXPRI * 3)
#define PRIBASE_NULL            (MAXPRI * 4)

#define NQS     32                      /* 32 run queues. */
#define PPQ     (MAXPRI / NQS)          /* priorities per queue */

/*
 * NICEPPQ      - number of nice units per priority queue
 * ESTCPURAMP   - number of scheduler ticks for estcpu to switch queues
 *
 * ESTCPUPPQ    - number of estcpu units per priority queue
 * ESTCPUMAX    - number of estcpu units
 * ESTCPUINCR   - amount we have to increment p_estcpu per scheduling tick at
 *                100% cpu.
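 *
 * For illustration (values taken from the defines below, scenario is
 * hypothetical): ESTCPUINCR is ESTCPUPPQ / ESTCPURAMP == 512 / 4 == 128,
 * so a thread running at 100% cpu accumulates one queue's worth of
 * estcpu (ESTCPUPPQ == 512) in roughly ESTCPURAMP == 4 scheduler ticks.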
 */
#define NICEPPQ         2
#define ESTCPURAMP      4
#define ESTCPUPPQ       512
#define ESTCPUMAX       (ESTCPUPPQ * NQS)
#define ESTCPUINCR      (ESTCPUPPQ / ESTCPURAMP)
#define PRIO_RANGE      (PRIO_MAX - PRIO_MIN + 1)

#define ESTCPULIM(v)    min((v), ESTCPUMAX)

TAILQ_HEAD(rq, lwp);

#define lwp_priority    lwp_usdata.bsd4.priority
#define lwp_rqindex     lwp_usdata.bsd4.rqindex
#define lwp_origcpu     lwp_usdata.bsd4.origcpu
#define lwp_estcpu      lwp_usdata.bsd4.estcpu

static void bsd4_acquire_curproc(struct lwp *lp);
static void bsd4_release_curproc(struct lwp *lp);
static void bsd4_select_curproc(globaldata_t gd);
static void bsd4_setrunqueue(struct lwp *lp);
static void bsd4_remrunqueue(struct lwp *lp);
static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
                                sysclock_t cpstamp);
static void bsd4_resetpriority(struct lwp *lp);
static void bsd4_forking(struct lwp *plp, struct lwp *lp);
static void bsd4_exiting(struct lwp *plp, struct lwp *lp);

static void bsd4_recalculate_estcpu(struct lwp *lp);

struct usched usched_bsd4 = {
        { NULL },
        "bsd4", "Original DragonFly Scheduler",
        bsd4_acquire_curproc,
        bsd4_release_curproc,
        bsd4_select_curproc,
        bsd4_setrunqueue,
        bsd4_remrunqueue,
        bsd4_schedulerclock,
        bsd4_recalculate_estcpu,
        bsd4_resetpriority,
        bsd4_forking,
        bsd4_exiting
};

/*
 * We have NQS (32) run queues per scheduling class.  For the normal
 * class, there are 128 priorities scaled onto these 32 queues.  New
 * processes are added to the last entry in each queue, and processes
 * are selected for running by taking them from the head and maintaining
 * a simple FIFO arrangement.  Realtime and Idle priority processes have
 * an explicit 0-31 priority which maps directly onto their class queue
 * index.  When a queue has something in it, the corresponding bit is
 * set in the queuebits variable, allowing a single read to determine
 * the state of all 32 queues and then a ffs() to find the first busy
 * queue.
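 *
 * For example (illustrative values): if only queues 2 and 17 are
 * non-empty, queuebits is (1 << 2) | (1 << 17) == 0x00020004 and the
 * forward bit scan (bsfl() in chooseproc() below) returns 2, the index
 * of the best-priority non-empty queue.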
 */
static struct rq queues[NQS];
static struct rq rtqueues[NQS];
static struct rq idqueues[NQS];
static u_int32_t queuebits;
static u_int32_t rtqueuebits;
static u_int32_t idqueuebits;
static cpumask_t curprocmask = -1;      /* currently running a user process */
static cpumask_t rdyprocmask;           /* ready to accept a user process */
static int       runqcount;
#ifdef SMP
static int       scancpu;
#endif

SYSCTL_INT(_debug, OID_AUTO, runqcount, CTLFLAG_RD, &runqcount, 0, "");
#ifdef INVARIANTS
static int usched_nonoptimal;
SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
        &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
static int usched_optimal;
SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
        &usched_optimal, 0, "acquire_curproc() was optimal");
#endif
static int usched_debug = -1;
SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0, "");
#ifdef SMP
static int remote_resched = 1;
static int remote_resched_nonaffinity;
static int remote_resched_affinity;
static int choose_affinity;
SYSCTL_INT(_debug, OID_AUTO, remote_resched, CTLFLAG_RW,
        &remote_resched, 0, "Resched to another cpu");
SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
        &remote_resched_nonaffinity, 0, "Number of remote rescheds");
SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
        &remote_resched_affinity, 0, "Number of remote rescheds");
SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
        &choose_affinity, 0, "chooseproc() was smart");
#endif

static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_rrinterval, CTLFLAG_RW,
        &usched_bsd4_rrinterval, 0, "");
static int usched_bsd4_decay = ESTCPUINCR / 2;
SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_decay, CTLFLAG_RW,
        &usched_bsd4_decay, 0, "");

/*
 * Initialize the run queues at boot time.
 */
static void
rqinit(void *dummy)
{
        int i;

        for (i = 0; i < NQS; i++) {
                TAILQ_INIT(&queues[i]);
                TAILQ_INIT(&rtqueues[i]);
                TAILQ_INIT(&idqueues[i]);
        }
        curprocmask &= ~1;
}
SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)

/*
 * chooseproc() is called when a cpu needs a user process to LWKT schedule,
 * it selects a user process and returns it.  If chklp is non-NULL and chklp
 * has a better or equal priority than the process that would otherwise be
 * chosen, NULL is returned.
 *
 * Until we fix the RUNQ code the chklp test has to be strict or we may
 * bounce between processes trying to acquire the current process designation.
 */
static
struct lwp *
chooseproc(struct lwp *chklp)
{
        struct lwp *lp;
        struct rq *q;
        u_int32_t *which;
        u_int32_t pri;

        if (rtqueuebits) {
                pri = bsfl(rtqueuebits);
                q = &rtqueues[pri];
                which = &rtqueuebits;
        } else if (queuebits) {
                pri = bsfl(queuebits);
                q = &queues[pri];
                which = &queuebits;
        } else if (idqueuebits) {
                pri = bsfl(idqueuebits);
                q = &idqueues[pri];
                which = &idqueuebits;
        } else {
                return NULL;
        }
        lp = TAILQ_FIRST(q);
        KASSERT(lp, ("chooseproc: no lwp on busy queue"));

        /*
         * If the passed lwp <chklp> is reasonably close to the selected
         * lwp <lp>, return NULL (indicating that <chklp> should be kept).
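         *
         * For illustration (PPQ is 4 here): a <chklp> at priority 20 is
         * kept against an <lp> at priority 17 (20 < 17 + 4), but an <lp>
         * at priority 16 or better wins; <lp> must be at least one full
         * queue better to displace <chklp>.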
         *
         * Note that we must error on the side of <chklp> to avoid bouncing
         * between threads in the acquire code.
         */
        if (chklp) {
                if (chklp->lwp_priority < lp->lwp_priority + PPQ)
                        return(NULL);
        }

#ifdef SMP
        /*
         * If the chosen lwp does not reside on this cpu spend a few
         * cycles looking for a better candidate at the same priority level.
         * This is a fallback check, setrunqueue() tries to wakeup the
         * correct cpu and is our front-line affinity.
         */
        if (lp->lwp_thread->td_gd != mycpu &&
            (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
        ) {
                if (chklp->lwp_thread->td_gd == mycpu) {
                        ++choose_affinity;
                        lp = chklp;
                }
        }
#endif

        TAILQ_REMOVE(q, lp, lwp_procq);
        --runqcount;
        if (TAILQ_EMPTY(q))
                *which &= ~(1 << pri);
        KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq6!"));
        lp->lwp_proc->p_flag &= ~P_ONRUNQ;
        return lp;
}

#ifdef SMP
/*
 * called via an ipi message to reschedule on another cpu.
 */
static
void
need_user_resched_remote(void *dummy)
{
        need_user_resched();
}

#endif

/*
 * setrunqueue() 'wakes up' a 'user' process.  GIANT must be held.  The
 * user process may represent any user process, including the current
 * process.
 *
 * If P_PASSIVE_ACQ is set setrunqueue() will not wakeup potential target
 * cpus in an attempt to keep the process on the current cpu at least for
 * a little while to take advantage of locality of reference (e.g. fork/exec
 * or short fork/exit, and uio_yield()).
 *
 * CPU AFFINITY: cpu affinity is handled by attempting to either schedule
 * or (user level) preempt on the same cpu that a process was previously
 * scheduled to.  If we cannot do this but we are at enough of a higher
 * priority than the processes running on other cpus, we will allow the
 * process to be stolen by another cpu.
 *
 * WARNING! a thread can be acquired by another cpu the moment it is put
 * on the user scheduler's run queue AND we release the MP lock.  Since we
 * release the MP lock before switching out another cpu may begin stealing
 * our current thread before we are completely switched out!  The
 * lwkt_acquire() function will stall until TDF_RUNNING is cleared on the
 * thread before stealing it.
 *
 * NOTE on need_user_resched() calls: we have to call need_user_resched()
 * if the new process is more important than the current process, or if
 * the new process is the current process and is now less important than
 * other processes.
 *
 * The associated thread must NOT be scheduled.
 * The process must be runnable.
 * This must be called at splhigh().
 */
static void
bsd4_setrunqueue(struct lwp *lp)
{
        struct rq *q;
        struct globaldata *gd;
        int pri;
        int cpuid;
        u_int32_t needresched;
#ifdef SMP
        int count;
        cpumask_t mask;
#endif

        crit_enter();
        KASSERT(lp->lwp_proc->p_stat == SRUN, ("setrunqueue: proc not SRUN"));
        KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0,
            ("lwp %d/%d already on runq! flag %08x", lp->lwp_proc->p_pid,
             lp->lwp_tid, lp->lwp_proc->p_flag));
        KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);

        /*
         * Note: gd is the gd of the TARGET thread's cpu, not our cpu.
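         * A woken lwp typically still points at the cpu it last ran on,
         * so the affinity checks below may be examining another cpu's
         * globaldata; as noted further down, gd and cpuid are only a
         * placement hint at this point.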
         */
        gd = lp->lwp_thread->td_gd;

        /*
         * Because recalculate is only called once or twice for long sleeps,
         * not every second forever while the process is sleeping, we have
         * to manually call it to resynchronize p_cpbase on wakeup or it
         * will wrap if the process was sleeping long enough (e.g. ~10 min
         * with the ACPI timer) and really mess up the nticks calculation.
         */
        if (lp->lwp_slptime) {
                bsd4_recalculate_estcpu(lp);
                lp->lwp_slptime = 0;
        }
        /*
         * We have not been released, make sure that we are not the currently
         * designated process.
         */
        KKASSERT(gd->gd_uschedcp != lp);

        /*
         * Check cpu affinity.  The associated thread is stable at the
         * moment.  Note that we may be checking another cpu here so we
         * have to be careful.  We are currently protected by the BGL.
         *
         * This allows us to avoid actually queueing the process.
         * acquire_curproc() will handle any threads we mistakenly schedule.
         */
        cpuid = gd->gd_cpuid;

        if ((curprocmask & (1 << cpuid)) == 0) {
                curprocmask |= 1 << cpuid;
                gd->gd_uschedcp = lp;
                gd->gd_upri = lp->lwp_priority;
                lwkt_schedule(lp->lwp_thread);
                /* CANNOT TOUCH PROC OR TD AFTER SCHEDULE CALL TO REMOTE CPU */
                crit_exit();
#ifdef SMP
                if (gd != mycpu)
                        ++remote_resched_affinity;
#endif
                return;
        }

        /*
         * gd and cpuid may still 'hint' at another cpu.  Even so we have
         * to place this process on the userland scheduler's run queue for
         * action by the target cpu.
         */
        ++runqcount;
        lp->lwp_proc->p_flag |= P_ONRUNQ;
        if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) {
                pri = (lp->lwp_priority & PRIMASK) / PPQ;
                q = &queues[pri];
                queuebits |= 1 << pri;
                needresched = (queuebits & ((1 << pri) - 1));
        } else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME ||
                   lp->lwp_rtprio.type == RTP_PRIO_FIFO) {
                pri = (u_int8_t)lp->lwp_rtprio.prio;
                q = &rtqueues[pri];
                rtqueuebits |= 1 << pri;
                needresched = (rtqueuebits & ((1 << pri) - 1));
        } else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) {
                pri = (u_int8_t)lp->lwp_rtprio.prio;
                q = &idqueues[pri];
                idqueuebits |= 1 << pri;
                needresched = (idqueuebits & ((1 << pri) - 1));
        } else {
                needresched = 0;
                panic("setrunqueue: invalid rtprio type");
        }
        KKASSERT(pri < 32);
        lp->lwp_rqindex = pri;          /* remember the queue index */
        TAILQ_INSERT_TAIL(q, lp, lwp_procq);

#ifdef SMP
        /*
         * Either wakeup other cpus user thread scheduler or request
         * preemption on other cpus (which will also wakeup a HLT).
         *
         * NOTE!  gd and cpuid may still be our 'hint', not our current
         * cpu info.
         */

        count = runqcount;

        /*
         * Check cpu affinity for user preemption (when the curprocmask bit
         * is set).  Note that gd_upri is a speculative field (we modify
         * another cpu's gd_upri to avoid sending ipiq storms).
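         *
         * For illustration (PPQ is 4): if this cpu's gd_upri is 40, only
         * an lwp with lwp_priority below 36, i.e. better by more than one
         * full queue, triggers a user preemption here; this hysteresis
         * keeps nearly-equal processes from ping-ponging the designation.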
         */
        if (gd == mycpu) {
                if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) {
                        if (lp->lwp_priority < gd->gd_upri - PPQ) {
                                gd->gd_upri = lp->lwp_priority;
                                gd->gd_rrcount = 0;
                                need_user_resched();
                                --count;
                        } else if (gd->gd_uschedcp == lp && needresched) {
                                gd->gd_rrcount = 0;
                                need_user_resched();
                                --count;
                        }
                }
        } else if (remote_resched) {
                if (lp->lwp_priority < gd->gd_upri - PPQ) {
                        gd->gd_upri = lp->lwp_priority;
                        lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
                        --count;
                        ++remote_resched_affinity;
                }
        }

        /*
         * No affinity, first schedule to any cpus that do not have a current
         * process.  If there is a free cpu we always schedule to it.
         */
        if (count &&
            (mask = ~curprocmask & rdyprocmask & mycpu->gd_other_cpus) != 0 &&
            (lp->lwp_proc->p_flag & P_PASSIVE_ACQ) == 0) {
                if (!mask)
                        printf("lwp %d/%d nocpu to schedule it on\n",
                               lp->lwp_proc->p_pid, lp->lwp_tid);
                while (mask && count) {
                        cpuid = bsfl(mask);
                        KKASSERT((curprocmask & (1 << cpuid)) == 0);
                        rdyprocmask &= ~(1 << cpuid);
                        lwkt_schedule(&globaldata_find(cpuid)->gd_schedthread);
                        --count;
                        mask &= ~(1 << cpuid);
                }
        }

        /*
         * If there are still runnable processes try to wakeup a random
         * cpu that is running a much lower priority process in order to
         * preempt on it.  Note that gd_upri is only a hint, so we can
         * overwrite it from the wrong cpu.  If we can't find one, we
         * are SOL.
         *
         * We depress the priority check so multiple cpu bound programs
         * do not bounce between cpus.  Remember that the clock interrupt
         * will also cause all cpus to reschedule.
         *
         * We must mask against rdyprocmask or we will race in the boot
         * code (before all cpus have working scheduler helpers), plus
         * some cpus might not be operational and/or not configured to
         * handle user processes.
         */
        if (count && remote_resched && ncpus > 1) {
                cpuid = scancpu;
                do {
                        if (++cpuid == ncpus)
                                cpuid = 0;
                } while (cpuid == mycpu->gd_cpuid);
                scancpu = cpuid;

                if (rdyprocmask & (1 << cpuid)) {
                        gd = globaldata_find(cpuid);

                        if (lp->lwp_priority < gd->gd_upri - PPQ) {
                                gd->gd_upri = lp->lwp_priority;
                                lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
                                ++remote_resched_nonaffinity;
                        }
                }
        }
#else
        if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) {
                if (lp->lwp_priority < gd->gd_upri - PPQ) {
                        gd->gd_upri = lp->lwp_priority;
                        gd->gd_rrcount = 0;
                        need_user_resched();
                } else if (gd->gd_uschedcp == lp && needresched) {
                        gd->gd_rrcount = 0;
                        need_user_resched();
                }
        }
#endif
        crit_exit();
}

/*
 * remrunqueue() removes a given process from the run queue that it is on,
 * clearing the queue busy bit if it becomes empty.  This function is called
 * when a userland process is selected for LWKT scheduling.  Note that
 * LWKT scheduling is an abstraction of 'curproc'.. there could very well be
 * several userland processes whose threads are scheduled or otherwise in
 * a special state, and such processes are NOT on the userland scheduler's
 * run queue.
 *
 * This must be called at splhigh().
 */
static void
bsd4_remrunqueue(struct lwp *lp)
{
        struct rq *q;
        u_int32_t *which;
        u_int8_t pri;

        crit_enter();
        KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq4!"));
        lp->lwp_proc->p_flag &= ~P_ONRUNQ;
        --runqcount;
        KKASSERT(runqcount >= 0);
        pri = lp->lwp_rqindex;
        if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) {
                q = &queues[pri];
                which = &queuebits;
        } else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME ||
                   lp->lwp_rtprio.type == RTP_PRIO_FIFO) {
                q = &rtqueues[pri];
                which = &rtqueuebits;
        } else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) {
                q = &idqueues[pri];
                which = &idqueuebits;
        } else {
                panic("remrunqueue: invalid rtprio type");
        }
        TAILQ_REMOVE(q, lp, lwp_procq);
        if (TAILQ_EMPTY(q)) {
                KASSERT((*which & (1 << pri)) != 0,
                        ("remrunqueue: remove from empty queue"));
                *which &= ~(1 << pri);
        }
        crit_exit();
}

/*
 * This routine is called from a systimer IPI.  It MUST be MP-safe and
 * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ.
 */
static
void
bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
{
        globaldata_t gd = mycpu;

        /*
         * Do we need to round-robin?  We round-robin 10 times a second.
         * This should only occur for cpu-bound batch processes.
         */
        if (++gd->gd_rrcount >= usched_bsd4_rrinterval) {
                gd->gd_rrcount = 0;
                need_user_resched();
        }

        /*
         * As the process accumulates cpu time p_estcpu is bumped and may
         * push the process into another scheduling queue.  It typically
         * takes 4 ticks to bump the queue.
         */
        lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);

        /*
         * Reducing p_origcpu over time causes more of our estcpu to be
         * returned to the parent when we exit.  This is a small tweak
         * for the batch detection heuristic.
         */
        if (lp->lwp_origcpu)
                --lp->lwp_origcpu;

        /* XXX optimize, avoid lock if no reset is required */
        if (try_mplock()) {
                bsd4_resetpriority(lp);
                rel_mplock();
        }
}

/*
 * Release the current process designation on p.  P MUST BE CURPROC.
 * Attempt to assign a new current process from the run queue.
 *
 * This function is called from exit1(), tsleep(), and the passive
 * release code setup in <arch>/<arch>/trap.c
 *
 * If we do not have or cannot get the MP lock we just wakeup the userland
 * helper scheduler thread for this cpu to do the work for us.
 *
 * WARNING!  The MP lock may be in an unsynchronized state due to the
 * way get_mplock() works and the fact that this function may be called
 * from a passive release during a lwkt_switch().   try_mplock() will deal
 * with this for us but you should be aware that td_mpcount may not be
 * useable.
 */
static void
bsd4_release_curproc(struct lwp *lp)
{
        int cpuid;
        globaldata_t gd = mycpu;

        KKASSERT(lp->lwp_thread->td_gd == gd);
        crit_enter();
        cpuid = gd->gd_cpuid;

        if (gd->gd_uschedcp == lp) {
                if (try_mplock()) {
                        /*
                         * YYY when the MP lock is not assumed (see else) we
                         * will have to check that gd_uschedcp is still == lp
                         * after acquisition of the MP lock
                         */
                        gd->gd_uschedcp = NULL;
                        gd->gd_upri = PRIBASE_NULL;
                        bsd4_select_curproc(gd);
                        rel_mplock();
                } else {
                        KKASSERT(0);    /* MP LOCK ALWAYS HELD AT THE MOMENT */
                        gd->gd_uschedcp = NULL;
                        gd->gd_upri = PRIBASE_NULL;
                        /* YYY uschedcp and curprocmask */
                        if (runqcount && (rdyprocmask & (1 << cpuid))) {
                                rdyprocmask &= ~(1 << cpuid);
                                lwkt_schedule(&mycpu->gd_schedthread);
                        }
                }
        }
        crit_exit();
}

/*
 * Select a new current process, potentially retaining gd_uschedcp.  However,
 * be sure to round-robin.  This routine is generally only called if a
 * reschedule is requested and that typically only occurs if a new process
 * has a better priority or when we are round-robining.
 *
 * NOTE: Must be called with giant held and the current cpu's gd.
 * NOTE: The caller must handle the situation where it loses a
 *       uschedcp designation that it previously held, typically by
 *       calling acquire_curproc() again.
 * NOTE: May not block
 */
static
void
bsd4_select_curproc(globaldata_t gd)
{
        struct lwp *nlp;
        int cpuid = gd->gd_cpuid;
        void *old;

        clear_user_resched();

        /*
         * Choose the next designated current user process.
         * Note that we cannot schedule gd_schedthread
         * if runqcount is 0 without creating a scheduling
         * loop.
         *
         * We do not clear the user resched request here,
         * we need to test it later when we re-acquire.
         *
         * NOTE: chooseproc returns NULL if the chosen lwp
         * is gd_uschedcp. XXX needs cleanup.
         */
        old = gd->gd_uschedcp;
        if ((nlp = chooseproc(gd->gd_uschedcp)) != NULL) {
                curprocmask |= 1 << cpuid;
                gd->gd_upri = nlp->lwp_priority;
                gd->gd_uschedcp = nlp;
                lwkt_acquire(nlp->lwp_thread);
                lwkt_schedule(nlp->lwp_thread);
        } else if (gd->gd_uschedcp) {
                gd->gd_upri = gd->gd_uschedcp->lwp_priority;
                KKASSERT(curprocmask & (1 << cpuid));
        } else if (runqcount && (rdyprocmask & (1 << cpuid))) {
                /*gd->gd_uschedcp = NULL;*/
                curprocmask &= ~(1 << cpuid);
                rdyprocmask &= ~(1 << cpuid);
                lwkt_schedule(&gd->gd_schedthread);
        } else {
                /*gd->gd_uschedcp = NULL;*/
                curprocmask &= ~(1 << cpuid);
        }
}

/*
 * Acquire the current process designation on the CURRENT process only.
 * This function is called at kernel-user priority (not userland priority)
 * when curlwp does not match gd_uschedcp.
 *
 * Basically we recalculate our estcpu to hopefully give us a more
 * favorable disposition, setrunqueue, then wait for the curlwp
 * designation to be handed to us (if the setrunqueue didn't do it).
 */
static void
bsd4_acquire_curproc(struct lwp *lp)
{
        globaldata_t gd = mycpu;

        crit_enter();
        ++lp->lwp_stats->p_ru.ru_nivcsw;

        /*
         * Loop until we become the current process.
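         * Each pass deschedules ourselves, requeues us via setrunqueue(),
         * and switches away; another cpu or a better-priority process may
         * claim the gd_uschedcp designation while we are switched out, in
         * which case we simply try again.  We may also wake up on a
         * different cpu, which is why gd is reloaded after the switch.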
         */
        do {
                KKASSERT(lp == gd->gd_curthread->td_lwp);
                bsd4_recalculate_estcpu(lp);
                lwkt_deschedule_self(gd->gd_curthread);
                bsd4_setrunqueue(lp);
                lwkt_switch();

                /*
                 * WE MAY HAVE BEEN MIGRATED TO ANOTHER CPU, RELOAD GD.
                 */
                gd = mycpu;
        } while (gd->gd_uschedcp != lp);

        crit_exit();

        /*
         * That's it.  Cleanup, we are done.  The caller can return to
         * user mode now.
         */
        KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0);
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
static void
bsd4_resetpriority(struct lwp *lp)
{
        int newpriority;
        int opq;
        int npq;

        /*
         * Set p_priority for general process comparisons
         */
        switch(lp->lwp_rtprio.type) {
        case RTP_PRIO_REALTIME:
                lp->lwp_priority = PRIBASE_REALTIME + lp->lwp_rtprio.prio;
                return;
        case RTP_PRIO_NORMAL:
                break;
        case RTP_PRIO_IDLE:
                lp->lwp_priority = PRIBASE_IDLE + lp->lwp_rtprio.prio;
                return;
        case RTP_PRIO_THREAD:
                lp->lwp_priority = PRIBASE_THREAD + lp->lwp_rtprio.prio;
                return;
        }

        /*
         * NORMAL priorities fall through.  These are based on niceness
         * and cpu use.  Lower numbers == higher priorities.
         *
         * Calculate our priority based on our niceness and estimated cpu.
         * Note that the nice value adjusts the baseline, which affects
         * cpu bursts but does not affect overall cpu use between cpu-bound
         * processes.  The use of the nice field in the decay calculation
         * controls the overall cpu use.
         *
         * This isn't an exact calculation.  We fit the full nice and
         * estcpu range into the priority range so the actual PPQ value
         * is incorrect, but it's still a reasonable way to think about it.
         */
        newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
        newpriority += lp->lwp_estcpu * PPQ / ESTCPUPPQ;
        newpriority = newpriority * MAXPRI /
                      (PRIO_RANGE * PPQ / NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
        newpriority = MIN(newpriority, MAXPRI - 1);     /* sanity */
        newpriority = MAX(newpriority, 0);              /* sanity */
        npq = newpriority / PPQ;
        crit_enter();
        opq = (lp->lwp_priority & PRIMASK) / PPQ;
        if (lp->lwp_proc->p_stat == SRUN &&
            (lp->lwp_proc->p_flag & P_ONRUNQ) && opq != npq) {
                /*
                 * We have to move the process to another queue
                 */
                bsd4_remrunqueue(lp);
                lp->lwp_priority = PRIBASE_NORMAL + newpriority;
                bsd4_setrunqueue(lp);
        } else {
                /*
                 * We can just adjust the priority and it will be picked
                 * up later.
                 */
                KKASSERT(opq == npq || (lp->lwp_proc->p_flag & P_ONRUNQ) == 0);
                lp->lwp_priority = PRIBASE_NORMAL + newpriority;
        }
        crit_exit();
}

/*
 * Called from fork1() when a new child process is being created.
 *
 * Give the child process an initial estcpu that is more batch than
 * its parent and dock the parent for the fork (but do not
 * reschedule the parent).  This comprises the main part of our batch
 * detection heuristic for both parallel forking and sequential execs.
 *
 * Interactive processes will decay the boosted estcpu quickly while batch
 * processes will tend to compound it.
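 *
 * Worked example (hypothetical numbers): a parent with estcpu 1000 gives
 * its child ESTCPULIM(1000 + ESTCPUPPQ) == 1512 (also recorded in
 * lwp_origcpu) and is itself bumped to 1512; on exit only the estcpu the
 * child accumulated beyond that origcpu baseline is returned to the
 * parent (see bsd4_exiting()).
 *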
 * XXX lwp should be "spawning" instead of "forking"
 */
static void
bsd4_forking(struct lwp *plp, struct lwp *lp)
{
        lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ);
        lp->lwp_origcpu = lp->lwp_estcpu;
        plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ);
}

/*
 * Called when the parent reaps a child.  Propagate cpu use by the child
 * back to the parent.
 */
static void
bsd4_exiting(struct lwp *plp, struct lwp *lp)
{
        int delta;

        if (plp->lwp_proc->p_pid != 1) {
                delta = lp->lwp_estcpu - lp->lwp_origcpu;
                if (delta > 0)
                        plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + delta);
        }
}

/*
 * Called from acquire and from kern_synch's one-second timer with a
 * critical section held.
 *
 * Decay p_estcpu based on the number of ticks we haven't been running
 * and our p_nice.  As the load increases each process observes a larger
 * number of idle ticks (because other processes are running in them).
 * This observation leads to a larger correction which tends to make the
 * system more 'batchy'.
 *
 * Note that no recalculation occurs for a process which sleeps and wakes
 * up in the same tick.  That is, a system doing thousands of context
 * switches per second will still only do serious estcpu calculations
 * ESTCPUFREQ times per second.
 */
static
void
bsd4_recalculate_estcpu(struct lwp *lp)
{
        globaldata_t gd = mycpu;
        sysclock_t cpbase;
        int loadfac;
        int ndecay;
        int nticks;
        int nleft;

        /*
         * We have to subtract periodic to get the last schedclock
         * timeout time, otherwise we would get the upcoming timeout.
         * Keep in mind that a process can migrate between cpus and
         * while the scheduler clock should be very close, boundary
         * conditions could lead to a small negative delta.
         */
        cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;

        if (lp->lwp_slptime > 1) {
                /*
                 * Too much time has passed, do a coarse correction.
                 */
                lp->lwp_estcpu = lp->lwp_estcpu >> 1;
                bsd4_resetpriority(lp);
                lp->lwp_cpbase = cpbase;
                lp->lwp_cpticks = 0;
        } else if (lp->lwp_cpbase != cpbase) {
                /*
                 * Adjust estcpu if we are in a different tick.  Don't waste
                 * time if we are in the same tick.
                 *
                 * First calculate the number of ticks in the measurement
                 * interval.
                 */
                nticks = (cpbase - lp->lwp_cpbase) / gd->gd_schedclock.periodic;
                updatepcpu(lp, lp->lwp_cpticks, nticks);

                if ((nleft = nticks - lp->lwp_cpticks) < 0)
                        nleft = 0;
                if (usched_debug == lp->lwp_proc->p_pid) {
                        printf("pid %d tid %d estcpu %d cpticks %d nticks %d nleft %d",
                                lp->lwp_proc->p_pid, lp->lwp_tid, lp->lwp_estcpu,
                                lp->lwp_cpticks, nticks, nleft);
                }

                /*
                 * Calculate a decay value based on ticks remaining scaled
                 * down by the instantaneous load and p_nice.
                 */
                if ((loadfac = runqcount) < 2)
                        loadfac = 2;
                ndecay = nleft * usched_bsd4_decay * 2 *
                         (PRIO_MAX * 2 - lp->lwp_proc->p_nice) / (loadfac * PRIO_MAX * 2);

                /*
                 * Adjust p_estcpu.  Handle a border case where batch jobs
                 * can get stalled long enough to decay to zero when they
                 * shouldn't.
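                 *
                 * Illustrative numbers: with the default usched_bsd4_decay
                 * of 64, nice 0 and the minimum loadfac of 2, ndecay works
                 * out to nleft * 64, e.g. 10 idle ticks decay roughly 640
                 * estcpu; a higher runqcount or a positive nice value
                 * shrinks the decay proportionally.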
                 */
                if (lp->lwp_estcpu > ndecay * 2)
                        lp->lwp_estcpu -= ndecay;
                else
                        lp->lwp_estcpu >>= 1;

                if (usched_debug == lp->lwp_proc->p_pid)
                        printf(" ndecay %d estcpu %d\n", ndecay, lp->lwp_estcpu);

                bsd4_resetpriority(lp);
                lp->lwp_cpbase = cpbase;
                lp->lwp_cpticks = 0;
        }
}

#ifdef SMP

/*
 * For SMP systems a user scheduler helper thread is created for each
 * cpu and is used to allow one cpu to wakeup another for the purposes of
 * scheduling userland threads from setrunqueue().  UP systems do not
 * need the helper since there is only one cpu.  We can't use the idle
 * thread for this because we need to hold the MP lock.  Additionally,
 * doing things this way allows us to HLT idle cpus on MP systems.
 */
static void
sched_thread(void *dummy)
{
        globaldata_t gd = mycpu;
        int cpuid = gd->gd_cpuid;               /* doesn't change */
        u_int32_t cpumask = 1 << cpuid;         /* doesn't change */

        get_mplock();                   /* hold the MP lock */
        for (;;) {
                struct lwp *nlp;

                lwkt_deschedule_self(gd->gd_curthread); /* interlock */
                rdyprocmask |= cpumask;
                crit_enter_quick(gd->gd_curthread);
                if ((curprocmask & cpumask) == 0 &&
                    (nlp = chooseproc(NULL)) != NULL) {
                        curprocmask |= cpumask;
                        gd->gd_upri = nlp->lwp_priority;
                        gd->gd_uschedcp = nlp;
                        lwkt_acquire(nlp->lwp_thread);
                        lwkt_schedule(nlp->lwp_thread);
                }
                crit_exit_quick(gd->gd_curthread);
                lwkt_switch();
        }
}

/*
 * Setup our scheduler helpers.  Note that curprocmask bit 0 has already
 * been cleared by rqinit() and we should not mess with it further.
 */
static void
sched_thread_cpu_init(void)
{
        int i;

        if (bootverbose)
                printf("start scheduler helpers on cpus:");

        for (i = 0; i < ncpus; ++i) {
                globaldata_t dgd = globaldata_find(i);
                cpumask_t mask = 1 << i;

                if ((mask & smp_active_mask) == 0)
                        continue;

                if (bootverbose)
                        printf(" %d", i);

                lwkt_create(sched_thread, NULL, NULL, &dgd->gd_schedthread,
                            TDF_STOPREQ, i, "usched %d", i);

                /*
                 * Allow user scheduling on the target cpu.  cpu #0 has
                 * already been enabled in rqinit().
                 */
                if (i)
                        curprocmask &= ~mask;
                rdyprocmask |= mask;
        }
        if (bootverbose)
                printf("\n");
}
SYSINIT(uschedtd, SI_SUB_FINISH_SMP, SI_ORDER_ANY, sched_thread_cpu_init, NULL)

#endif