xref: /dflybsd-src/sys/kern/usched_bsd4.c (revision c9fbf0d3b1d54097180190816f8fa4f5d415b174)
1 /*
2  * Copyright (c) 1999 Peter Wemm <peter@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  * $DragonFly: src/sys/kern/usched_bsd4.c,v 1.3 2005/10/11 09:59:56 corecode Exp $
27  */
28 
29 #include <sys/param.h>
30 #include <sys/systm.h>
31 #include <sys/kernel.h>
32 #include <sys/lock.h>
33 #include <sys/queue.h>
34 #include <sys/proc.h>
35 #include <sys/rtprio.h>
36 #include <sys/thread2.h>
37 #include <sys/uio.h>
38 #include <sys/sysctl.h>
39 #include <sys/resourcevar.h>
40 #include <machine/ipl.h>
41 #include <machine/cpu.h>
42 #include <machine/smp.h>
43 
44 /*
45  * Priorities.  Note that with 32 run queues per scheduler each queue
46  * represents four priority levels.
47  */
48 
49 #define MAXPRI			128
50 #define PRIMASK			(MAXPRI - 1)
51 #define PRIBASE_REALTIME	0
52 #define PRIBASE_NORMAL		MAXPRI
53 #define PRIBASE_IDLE		(MAXPRI * 2)
54 #define PRIBASE_THREAD		(MAXPRI * 3)
55 #define PRIBASE_NULL		(MAXPRI * 4)
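/*
 * An lwp's lwp_priority is one of the PRIBASE_* class bases above plus a
 * per-class level (0-127 for the normal class, the rtprio priority for the
 * others).  Lower numeric values are better, so a plain integer comparison
 * orders realtime < normal < idle < thread, and PRIBASE_NULL serves as the
 * "no user process" value stored in gd_upri by release_curproc().
 */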
56 
57 #define NQS	32			/* 32 run queues. */
58 #define PPQ	(MAXPRI / NQS)		/* priorities per queue */
59 
60 /*
61  * NICEPPQ	- number of nice units per priority queue
62  * ESTCPURAMP	- number of scheduler ticks for estcpu to switch queues
63  *
64  * ESTCPUPPQ	- number of estcpu units per priority queue
65  * ESTCPUMAX	- number of estcpu units
66  * ESTCPUINCR	- amount we have to increment p_estcpu per scheduling tick at
67  *		  100% cpu.
68  */
69 #define NICEPPQ		2
70 #define ESTCPURAMP	4
71 #define ESTCPUPPQ	512
72 #define ESTCPUMAX	(ESTCPUPPQ * NQS)
73 #define ESTCPUINCR	(ESTCPUPPQ / ESTCPURAMP)
74 #define PRIO_RANGE	(PRIO_MAX - PRIO_MIN + 1)
75 
76 #define ESTCPULIM(v)	min((v), ESTCPUMAX)
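/*
 * With the values above, ESTCPUMAX = 512 * 32 = 16384 and ESTCPUINCR =
 * 512 / 4 = 128, so an lwp running at 100% cpu accumulates one queue's
 * worth of estcpu (ESTCPUPPQ) every ESTCPURAMP (4) scheduler ticks.
 */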
77 
78 TAILQ_HEAD(rq, lwp);
79 
80 #define lwp_priority	lwp_usdata.bsd4.priority
81 #define lwp_rqindex	lwp_usdata.bsd4.rqindex
82 #define lwp_origcpu	lwp_usdata.bsd4.origcpu
83 #define lwp_estcpu	lwp_usdata.bsd4.estcpu
84 
85 static void bsd4_acquire_curproc(struct lwp *lp);
86 static void bsd4_release_curproc(struct lwp *lp);
87 static void bsd4_select_curproc(globaldata_t gd);
88 static void bsd4_setrunqueue(struct lwp *lp);
89 static void bsd4_remrunqueue(struct lwp *lp);
90 static void bsd4_schedulerclock(struct lwp *lp, sysclock_t period,
91 				sysclock_t cpstamp);
92 static void bsd4_resetpriority(struct lwp *lp);
93 static void bsd4_forking(struct lwp *plp, struct lwp *lp);
94 static void bsd4_exiting(struct lwp *plp, struct lwp *lp);
95 
96 static void bsd4_recalculate_estcpu(struct lwp *lp);
97 
98 struct usched usched_bsd4 = {
99 	{ NULL },
100 	"bsd4", "Original DragonFly Scheduler",
101 	bsd4_acquire_curproc,
102 	bsd4_release_curproc,
103 	bsd4_select_curproc,
104 	bsd4_setrunqueue,
105 	bsd4_remrunqueue,
106 	bsd4_schedulerclock,
107 	bsd4_recalculate_estcpu,
108 	bsd4_resetpriority,
109 	bsd4_forking,
110 	bsd4_exiting
111 };
112 
113 /*
114  * We have NQS (32) run queues per scheduling class.  For the normal
115  * class, there are 128 priorities scaled onto these 32 queues.  New
116  * processes are added to the last entry in each queue, and processes
117  * are selected for running by taking them from the head and maintaining
118  * a simple FIFO arrangement.  Realtime and Idle priority processes have
119  * an explicit 0-31 priority which maps directly onto their class queue
120  * index.  When a queue has something in it, the corresponding bit is
121  * set in the queuebits variable, allowing a single read to determine
122  * the state of all 32 queues and then a ffs() to find the first busy
123  * queue.
124  */
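/*
 * Illustrative example: if queuebits == 0x00000009 then normal-class queues
 * 0 and 3 are non-empty; bsfl(queuebits) returns 0 and chooseproc() dequeues
 * from the head of queues[0] first (a lower queue index is a better
 * priority).
 */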
125 static struct rq queues[NQS];
126 static struct rq rtqueues[NQS];
127 static struct rq idqueues[NQS];
128 static u_int32_t queuebits;
129 static u_int32_t rtqueuebits;
130 static u_int32_t idqueuebits;
131 static cpumask_t curprocmask = -1;	/* currently running a user process */
132 static cpumask_t rdyprocmask;		/* ready to accept a user process */
133 static int	 runqcount;
134 #ifdef SMP
135 static int	 scancpu;
136 #endif
137 
138 SYSCTL_INT(_debug, OID_AUTO, runqcount, CTLFLAG_RD, &runqcount, 0, "");
139 #ifdef INVARIANTS
140 static int usched_nonoptimal;
141 SYSCTL_INT(_debug, OID_AUTO, usched_nonoptimal, CTLFLAG_RW,
142         &usched_nonoptimal, 0, "acquire_curproc() was not optimal");
143 static int usched_optimal;
144 SYSCTL_INT(_debug, OID_AUTO, usched_optimal, CTLFLAG_RW,
145         &usched_optimal, 0, "acquire_curproc() was optimal");
146 #endif
147 static int usched_debug = -1;
148 SYSCTL_INT(_debug, OID_AUTO, scdebug, CTLFLAG_RW, &usched_debug, 0, "");
149 #ifdef SMP
150 static int remote_resched = 1;
151 static int remote_resched_nonaffinity;
152 static int remote_resched_affinity;
153 static int choose_affinity;
154 SYSCTL_INT(_debug, OID_AUTO, remote_resched, CTLFLAG_RW,
155         &remote_resched, 0, "Resched to another cpu");
156 SYSCTL_INT(_debug, OID_AUTO, remote_resched_nonaffinity, CTLFLAG_RD,
157         &remote_resched_nonaffinity, 0, "Number of non-affinity remote rescheds");
158 SYSCTL_INT(_debug, OID_AUTO, remote_resched_affinity, CTLFLAG_RD,
159         &remote_resched_affinity, 0, "Number of affinity remote rescheds");
160 SYSCTL_INT(_debug, OID_AUTO, choose_affinity, CTLFLAG_RD,
161         &choose_affinity, 0, "chooseproc() was smart");
162 #endif
163 
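/*
 * The scheduler clock fires at ESTCPUFREQ, so rounding ESTCPUFREQ/10 up
 * here round-robins cpu-bound lwps roughly 10 times per second (see
 * bsd4_schedulerclock()).
 */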
164 static int usched_bsd4_rrinterval = (ESTCPUFREQ + 9) / 10;
165 SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_rrinterval, CTLFLAG_RW,
166         &usched_bsd4_rrinterval, 0, "");
167 static int usched_bsd4_decay = ESTCPUINCR / 2;
168 SYSCTL_INT(_kern, OID_AUTO, usched_bsd4_decay, CTLFLAG_RW,
169         &usched_bsd4_decay, 0, "");
170 
171 /*
172  * Initialize the run queues at boot time.
173  */
174 static void
175 rqinit(void *dummy)
176 {
177 	int i;
178 
179 	for (i = 0; i < NQS; i++) {
180 		TAILQ_INIT(&queues[i]);
181 		TAILQ_INIT(&rtqueues[i]);
182 		TAILQ_INIT(&idqueues[i]);
183 	}
184 	curprocmask &= ~1;
185 }
186 SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)
187 
188 /*
189  * chooseproc() is called when a cpu needs a user process to LWKT schedule;
190  * it selects a user process and returns it.  If chklp is non-NULL and its
191  * priority is better than or equal to that of the process that would
192  * otherwise be chosen, NULL is returned.
193  *
194  * Until we fix the RUNQ code the chklp test has to be strict or we may
195  * bounce between processes trying to acquire the current process designation.
196  */
197 static
198 struct lwp *
199 chooseproc(struct lwp *chklp)
200 {
201 	struct lwp *lp;
202 	struct rq *q;
203 	u_int32_t *which;
204 	u_int32_t pri;
205 
206 	if (rtqueuebits) {
207 		pri = bsfl(rtqueuebits);
208 		q = &rtqueues[pri];
209 		which = &rtqueuebits;
210 	} else if (queuebits) {
211 		pri = bsfl(queuebits);
212 		q = &queues[pri];
213 		which = &queuebits;
214 	} else if (idqueuebits) {
215 		pri = bsfl(idqueuebits);
216 		q = &idqueues[pri];
217 		which = &idqueuebits;
218 	} else {
219 		return NULL;
220 	}
221 	lp = TAILQ_FIRST(q);
222 	KASSERT(lp, ("chooseproc: no lwp on busy queue"));
223 
224 	/*
225 	 * If the passed lwp <chklp> is reasonably close to the selected
226 	 * lwp <lp>, return NULL (indicating that <chklp> should be kept).
227 	 *
228 	 * Note that we must err on the side of <chklp> to avoid bouncing
229 	 * between threads in the acquire code.
230 	 */
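	/*
	 * "Reasonably close" means within one full queue: lower lwp_priority
	 * values are better, so <lp> must beat <chklp> by at least PPQ before
	 * we give up on <chklp>.
	 */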
231 	if (chklp) {
232 		if (chklp->lwp_priority < lp->lwp_priority + PPQ)
233 			return(NULL);
234 	}
235 
236 #ifdef SMP
237 	/*
238 	 * If the chosen lwp does not reside on this cpu spend a few
239 	 * cycles looking for a better candidate at the same priority level.
240 	 * This is a fallback check, setrunqueue() tries to wakeup the
241 	 * correct cpu and is our front-line affinity.
242 	 */
243 	if (lp->lwp_thread->td_gd != mycpu &&
244 	    (chklp = TAILQ_NEXT(lp, lwp_procq)) != NULL
245 	) {
246 		if (chklp->lwp_thread->td_gd == mycpu) {
247 			++choose_affinity;
248 			lp = chklp;
249 		}
250 	}
251 #endif
252 
253 	TAILQ_REMOVE(q, lp, lwp_procq);
254 	--runqcount;
255 	if (TAILQ_EMPTY(q))
256 		*which &= ~(1 << pri);
257 	KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq6!"));
258 	lp->lwp_proc->p_flag &= ~P_ONRUNQ;
259 	return lp;
260 }
261 
262 #ifdef SMP
263 /*
264  * called via an ipi message to reschedule on another cpu.
265  */
266 static
267 void
268 need_user_resched_remote(void *dummy)
269 {
270 	need_user_resched();
271 }
272 
273 #endif
274 
275 /*
276  * setrunqueue() 'wakes up' a 'user' process.  GIANT must be held.  The
277  * user process may represent any user process, including the current
278  * process.
279  *
280  * If P_PASSIVE_ACQ is set setrunqueue() will not wakeup potential target
281  * cpus in an attempt to keep the process on the current cpu at least for
282  * a little while to take advantage of locality of reference (e.g. fork/exec
283  * or short fork/exit, and uio_yield()).
284  *
285  * CPU AFFINITY: cpu affinity is handled by attempting to either schedule
286  * or (user level) preempt on the same cpu that a process was previously
287  * scheduled to.  If we cannot do this but we are at a sufficiently higher
288  * priority than the processes running on other cpus, we will allow the
289  * process to be stolen by another cpu.
290  *
291  * WARNING! a thread can be acquired by another cpu the moment it is put
292  * on the user scheduler's run queue AND we release the MP lock.  Since we
293  * release the MP lock before switching out another cpu may begin stealing
294  * our current thread before we are completely switched out!  The
295  * lwkt_acquire() function will stall until TDF_RUNNING is cleared on the
296  * thread before stealing it.
297  *
298  * NOTE on need_user_resched() calls: we have to call need_user_resched()
299  * if the new process is more important than the current process, or if
300  * the new process is the current process and is now less important than
301  * other processes.
302  *
303  * The associated thread must NOT be scheduled.
304  * The process must be runnable.
305  * This must be called at splhigh().
306  */
307 static void
308 bsd4_setrunqueue(struct lwp *lp)
309 {
310 	struct rq *q;
311 	struct globaldata *gd;
312 	int pri;
313 	int cpuid;
314 	u_int32_t needresched;
315 #ifdef SMP
316 	int count;
317 	cpumask_t mask;
318 #endif
319 
320 	crit_enter();
321 	KASSERT(lp->lwp_proc->p_stat == SRUN, ("setrunqueue: proc not SRUN"));
322 	KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0,
323 	    ("lwp %d/%d already on runq! flag %08x", lp->lwp_proc->p_pid,
324 	     lp->lwp_tid, lp->lwp_proc->p_flag));
325 	KKASSERT((lp->lwp_thread->td_flags & TDF_RUNQ) == 0);
326 
327 	/*
328 	 * Note: gd is the gd of the TARGET thread's cpu, not our cpu.
329 	 */
330 	gd = lp->lwp_thread->td_gd;
331 
332 	/*
333 	 * Because recalculate is only called once or twice for long sleeps,
334 	 * not every second forever while the process is sleeping, we have
335 	 * to manually call it to resynchronize p_cpbase on wakeup or it
336 	 * will wrap if the process was sleeping long enough (e.g. ~10 min
337 	 * with the ACPI timer) and really mess up the nticks calculation.
338 	 */
339 	if (lp->lwp_slptime) {
340 	    bsd4_recalculate_estcpu(lp);
341 	    lp->lwp_slptime = 0;
342 	}
343 	/*
344 	 * We have not been released, make sure that we are not the currently
345 	 * designated process.
346 	 */
347 	KKASSERT(gd->gd_uschedcp != lp);
348 
349 	/*
350 	 * Check cpu affinity.  The associated thread is stable at the
351 	 * moment.  Note that we may be checking another cpu here so we
352 	 * have to be careful.  We are currently protected by the BGL.
353 	 *
354 	 * This allows us to avoid actually queueing the process.
355 	 * acquire_curproc() will handle any threads we mistakenly schedule.
356 	 */
357 	cpuid = gd->gd_cpuid;
358 
359 	if ((curprocmask & (1 << cpuid)) == 0) {
360 		curprocmask |= 1 << cpuid;
361 		gd->gd_uschedcp = lp;
362 		gd->gd_upri = lp->lwp_priority;
363 		lwkt_schedule(lp->lwp_thread);
364 		/* CANNOT TOUCH PROC OR TD AFTER SCHEDULE CALL TO REMOTE CPU */
365 		crit_exit();
366 #ifdef SMP
367 		if (gd != mycpu)
368 			++remote_resched_affinity;
369 #endif
370 		return;
371 	}
372 
373 	/*
374 	 * gd and cpuid may still 'hint' at another cpu.  Even so we have
375 	 * to place this process on the userland scheduler's run queue for
376 	 * action by the target cpu.
377 	 */
378 	++runqcount;
379 	lp->lwp_proc->p_flag |= P_ONRUNQ;
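	/*
	 * For each class, needresched is left non-zero when a higher-priority
	 * (lower index) queue in that class already has queued work; it is
	 * used below to decide whether requeueing the current uschedcp should
	 * trigger a user reschedule.
	 */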
380 	if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) {
381 		pri = (lp->lwp_priority & PRIMASK) / PPQ;
382 		q = &queues[pri];
383 		queuebits |= 1 << pri;
384 		needresched = (queuebits & ((1 << pri) - 1));
385 	} else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME ||
386 		   lp->lwp_rtprio.type == RTP_PRIO_FIFO) {
387 		pri = (u_int8_t)lp->lwp_rtprio.prio;
388 		q = &rtqueues[pri];
389 		rtqueuebits |= 1 << pri;
390 		needresched = (rtqueuebits & ((1 << pri) - 1));
391 	} else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) {
392 		pri = (u_int8_t)lp->lwp_rtprio.prio;
393 		q = &idqueues[pri];
394 		idqueuebits |= 1 << pri;
395 		needresched = (idqueuebits & ((1 << pri) - 1));
396 	} else {
397 		needresched = 0;
398 		panic("setrunqueue: invalid rtprio type");
399 	}
400 	KKASSERT(pri < 32);
401 	lp->lwp_rqindex = pri;		/* remember the queue index */
402 	TAILQ_INSERT_TAIL(q, lp, lwp_procq);
403 
404 #ifdef SMP
405 	/*
406 	 * Either wake up another cpu's user thread scheduler or request
407 	 * preemption on other cpus (which will also wake up a HLTed cpu).
408 	 *
409 	 * NOTE!  gd and cpuid may still be our 'hint', not our current
410 	 * cpu info.
411 	 */
412 
413 	count = runqcount;
414 
415 	/*
416 	 * Check cpu affinity for user preemption (when the curprocmask bit
417 	 * is set).  Note that gd_upri is a speculative field (we modify
418 	 * another cpu's gd_upri to avoid sending ipiq storms).
419 	 */
420 	if (gd == mycpu) {
421 		if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) {
422 			if (lp->lwp_priority < gd->gd_upri - PPQ) {
423 				gd->gd_upri = lp->lwp_priority;
424 				gd->gd_rrcount = 0;
425 				need_user_resched();
426 				--count;
427 			} else if (gd->gd_uschedcp == lp && needresched) {
428 				gd->gd_rrcount = 0;
429 				need_user_resched();
430 				--count;
431 			}
432 		}
433 	} else if (remote_resched) {
434 		if (lp->lwp_priority < gd->gd_upri - PPQ) {
435 			gd->gd_upri = lp->lwp_priority;
436 			lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
437 			--count;
438 			++remote_resched_affinity;
439 		}
440 	}
441 
442 	/*
443 	 * No affinity: first schedule to any cpus that do not have a current
444 	 * process.  If there is a free cpu we always schedule to it.
445 	 */
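	/*
	 * Candidate mask: cpus that are ready (rdyprocmask), are not already
	 * running a user process (~curprocmask), and are not ourselves
	 * (gd_other_cpus).
	 */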
446 	if (count &&
447 	    (mask = ~curprocmask & rdyprocmask & mycpu->gd_other_cpus) != 0 &&
448 	    (lp->lwp_proc->p_flag & P_PASSIVE_ACQ) == 0) {
449 		if (!mask)
450 			printf("lwp %d/%d nocpu to schedule it on\n",
451 			       lp->lwp_proc->p_pid, lp->lwp_tid);
452 		while (mask && count) {
453 			cpuid = bsfl(mask);
454 			KKASSERT((curprocmask & (1 << cpuid)) == 0);
455 			rdyprocmask &= ~(1 << cpuid);
456 			lwkt_schedule(&globaldata_find(cpuid)->gd_schedthread);
457 			--count;
458 			mask &= ~(1 << cpuid);
459 		}
460 	}
461 
462 	/*
463 	 * If there are still runnable processes try to wakeup a random
464 	 * cpu that is running a much lower priority process in order to
465 	 * preempt on it.  Note that gd_upri is only a hint, so we can
466 	 * overwrite it from the wrong cpu.   If we can't find one, we
467 	 * are SOL.
468 	 *
469 	 * We depress the priority check so multiple cpu bound programs
470 	 * do not bounce between cpus.  Remember that the clock interrupt
471 	 * will also cause all cpus to reschedule.
472 	 *
473 	 * We must mask against rdyprocmask or we will race in the boot
474 	 * code (before all cpus have working scheduler helpers), plus
475 	 * some cpus might not be operational and/or not configured to
476 	 * handle user processes.
477 	 */
478 	if (count && remote_resched && ncpus > 1) {
479 		cpuid = scancpu;
480 		do {
481 			if (++cpuid == ncpus)
482 				cpuid = 0;
483 		} while (cpuid == mycpu->gd_cpuid);
484 		scancpu = cpuid;
485 
486 		if (rdyprocmask & (1 << cpuid)) {
487 			gd = globaldata_find(cpuid);
488 
489 			if (lp->lwp_priority < gd->gd_upri - PPQ) {
490 				gd->gd_upri = lp->lwp_priority;
491 				lwkt_send_ipiq(gd, need_user_resched_remote, NULL);
492 				++remote_resched_nonaffinity;
493 			}
494 		}
495 	}
496 #else
497 	if ((lp->lwp_thread->td_flags & TDF_NORESCHED) == 0) {
498 		if (lp->lwp_priority < gd->gd_upri - PPQ) {
499 			gd->gd_upri = lp->lwp_priority;
500 			gd->gd_rrcount = 0;
501 			need_user_resched();
502 		} else if (gd->gd_uschedcp == lp && needresched) {
503 			gd->gd_rrcount = 0;
504 			need_user_resched();
505 		}
506 	}
507 #endif
508 	crit_exit();
509 }
510 
511 /*
512  * remrunqueue() removes a given process from the run queue that it is on,
513  * clearing the queue busy bit if it becomes empty.  This function is called
514  * when a userland process is selected for LWKT scheduling.  Note that
515  * LWKT scheduling is an abstraction of 'curproc'... there could very well be
516  * several userland processes whose threads are scheduled or otherwise in
517  * a special state, and such processes are NOT on the userland scheduler's
518  * run queue.
519  *
520  * This must be called at splhigh().
521  */
522 static void
523 bsd4_remrunqueue(struct lwp *lp)
524 {
525 	struct rq *q;
526 	u_int32_t *which;
527 	u_int8_t pri;
528 
529 	crit_enter();
530 	KASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) != 0, ("not on runq4!"));
531 	lp->lwp_proc->p_flag &= ~P_ONRUNQ;
532 	--runqcount;
533 	KKASSERT(runqcount >= 0);
534 	pri = lp->lwp_rqindex;
535 	if (lp->lwp_rtprio.type == RTP_PRIO_NORMAL) {
536 		q = &queues[pri];
537 		which = &queuebits;
538 	} else if (lp->lwp_rtprio.type == RTP_PRIO_REALTIME ||
539 		   lp->lwp_rtprio.type == RTP_PRIO_FIFO) {
540 		q = &rtqueues[pri];
541 		which = &rtqueuebits;
542 	} else if (lp->lwp_rtprio.type == RTP_PRIO_IDLE) {
543 		q = &idqueues[pri];
544 		which = &idqueuebits;
545 	} else {
546 		panic("remrunqueue: invalid rtprio type");
547 	}
548 	TAILQ_REMOVE(q, lp, lwp_procq);
549 	if (TAILQ_EMPTY(q)) {
550 		KASSERT((*which & (1 << pri)) != 0,
551 			("remrunqueue: remove from empty queue"));
552 		*which &= ~(1 << pri);
553 	}
554 	crit_exit();
555 }
556 
557 /*
558  * This routine is called from a systimer IPI.  It MUST be MP-safe and
559  * the BGL IS NOT HELD ON ENTRY.  This routine is called at ESTCPUFREQ.
560  */
561 static
562 void
563 bsd4_schedulerclock(struct lwp *lp, sysclock_t period, sysclock_t cpstamp)
564 {
565 	globaldata_t gd = mycpu;
566 
567 	/*
568 	 * Do we need to round-robin?  We round-robin 10 times a second.
569 	 * This should only occur for cpu-bound batch processes.
570 	 */
571 	if (++gd->gd_rrcount >= usched_bsd4_rrinterval) {
572 		gd->gd_rrcount = 0;
573 		need_user_resched();
574 	}
575 
576 	/*
577 	 * As the process accumulates cpu time p_estcpu is bumped and may
578 	 * push the process into another scheduling queue.  It typically
579 	 * takes 4 ticks to bump the queue.
580 	 */
581 	lp->lwp_estcpu = ESTCPULIM(lp->lwp_estcpu + ESTCPUINCR);
582 
583 	/*
584 	 * Reducing p_origcpu over time causes more of our estcpu to be
585 	 * returned to the parent when we exit.  This is a small tweak
586 	 * for the batch detection heuristic.
587 	 */
588 	if (lp->lwp_origcpu)
589 		--lp->lwp_origcpu;
590 
591 	/* XXX optimize, avoid lock if no reset is required */
592 	if (try_mplock()) {
593 		bsd4_resetpriority(lp);
594 		rel_mplock();
595 	}
596 }
597 
598 /*
599  * Release the current process designation on lp.  lp MUST BE THE CURRENT LWP.
600  * Attempt to assign a new current process from the run queue.
601  *
602  * This function is called from exit1(), tsleep(), and the passive
603  * release code setup in <arch>/<arch>/trap.c
604  *
605  * If we do not have or cannot get the MP lock we just wakeup the userland
606  * helper scheduler thread for this cpu to do the work for us.
607  *
608  * WARNING!  The MP lock may be in an unsynchronized state due to the
609  * way get_mplock() works and the fact that this function may be called
610  * from a passive release during a lwkt_switch().   try_mplock() will deal
611  * with this for us but you should be aware that td_mpcount may not be
612  * useable.
613  * usable.
614 static void
615 bsd4_release_curproc(struct lwp *lp)
616 {
617 	int cpuid;
618 	globaldata_t gd = mycpu;
619 
620 	KKASSERT(lp->lwp_thread->td_gd == gd);
621 	crit_enter();
622 	cpuid = gd->gd_cpuid;
623 
624 	if (gd->gd_uschedcp == lp) {
625 		if (try_mplock()) {
626 			/*
627 			 * YYY when the MP lock is not assumed (see else) we
628 			 * will have to check that gd_uschedcp is still == lp
629 			 * after acquisition of the MP lock
630 			 */
631 			gd->gd_uschedcp = NULL;
632 			gd->gd_upri = PRIBASE_NULL;
633 			bsd4_select_curproc(gd);
634 			rel_mplock();
635 		} else {
636 			KKASSERT(0);	/* MP LOCK ALWAYS HELD AT THE MOMENT */
637 			gd->gd_uschedcp = NULL;
638 			gd->gd_upri = PRIBASE_NULL;
639 			/* YYY uschedcp and curprocmask */
640 			if (runqcount && (rdyprocmask & (1 << cpuid))) {
641 				rdyprocmask &= ~(1 << cpuid);
642 				lwkt_schedule(&mycpu->gd_schedthread);
643 			}
644 		}
645 	}
646 	crit_exit();
647 }
648 
649 /*
650  * Select a new current process, potentially retaining gd_uschedcp.  However,
651  * be sure to round-robin.  This routine is generally only called if a
652  * reschedule is requested and that typically only occurs if a new process
653  * has a better priority or when we are round-robining.
654  *
655  * NOTE: Must be called with giant held and the current cpu's gd.
656  * NOTE: The caller must handle the situation where it loses a
657  *	uschedcp designation that it previously held, typically by
658  *	calling acquire_curproc() again.
659  * NOTE: May not block
660  */
661 static
662 void
663 bsd4_select_curproc(globaldata_t gd)
664 {
665 	struct lwp *nlp;
666 	int cpuid = gd->gd_cpuid;
667 	void *old;
668 
669 	clear_user_resched();
670 
671 	/*
672 	 * Choose the next designated current user process.
673 	 * Note that we cannot schedule gd_schedthread
674 	 * if runqcount is 0 without creating a scheduling
675 	 * loop.
676 	 *
677 	 * We do not clear the user resched request here,
678 	 * we need to test it later when we re-acquire.
679 	 *
680 	 * NOTE: chooseproc returns NULL if the chosen lwp
681 	 * is gd_uschedcp. XXX needs cleanup.
682 	 */
683 	old = gd->gd_uschedcp;
684 	if ((nlp = chooseproc(gd->gd_uschedcp)) != NULL) {
685 		curprocmask |= 1 << cpuid;
686 		gd->gd_upri = nlp->lwp_priority;
687 		gd->gd_uschedcp = nlp;
688 		lwkt_acquire(nlp->lwp_thread);
689 		lwkt_schedule(nlp->lwp_thread);
690 	} else if (gd->gd_uschedcp) {
691 		gd->gd_upri = gd->gd_uschedcp->lwp_priority;
692 		KKASSERT(curprocmask & (1 << cpuid));
693 	} else if (runqcount && (rdyprocmask & (1 << cpuid))) {
694 		/*gd->gd_uschedcp = NULL;*/
695 		curprocmask &= ~(1 << cpuid);
696 		rdyprocmask &= ~(1 << cpuid);
697 		lwkt_schedule(&gd->gd_schedthread);
698 	} else {
699 		/*gd->gd_uschedcp = NULL;*/
700 		curprocmask &= ~(1 << cpuid);
701 	}
702 }
703 
704 /*
705  * Acquire the current process designation on the CURRENT process only.
706  * This function is called at kernel-user priority (not userland priority)
707  * when curlwp does not match gd_uschedcp.
708  *
709  * Basically we recalculate our estcpu to hopefully give us a more
710  * favorable disposition, setrunqueue, then wait for the curlwp
711  * designation to be handed to us (if the setrunqueue didn't do it).
712  */
713 static void
714 bsd4_acquire_curproc(struct lwp *lp)
715 {
716 	globaldata_t gd = mycpu;
717 
718 	crit_enter();
719 	++lp->lwp_stats->p_ru.ru_nivcsw;
720 
721 	/*
722 	 * Loop until we become the current process.
723 	 */
724 	do {
725 		KKASSERT(lp == gd->gd_curthread->td_lwp);
726 		bsd4_recalculate_estcpu(lp);
727 		lwkt_deschedule_self(gd->gd_curthread);
728 		bsd4_setrunqueue(lp);
729 		lwkt_switch();
730 
731 		/*
732 		 * WE MAY HAVE BEEN MIGRATED TO ANOTHER CPU, RELOAD GD.
733 		 */
734 		gd = mycpu;
735 	} while (gd->gd_uschedcp != lp);
736 
737 	crit_exit();
738 
739 	/*
740 	 * That's it.  Cleanup, we are done.  The caller can return to
741 	 * user mode now.
742 	 */
743 	KKASSERT((lp->lwp_proc->p_flag & P_ONRUNQ) == 0);
744 }
745 
746 /*
747  * Compute the priority of a process when running in user mode.
748  * Arrange to reschedule if the resulting priority is better
749  * than that of the current process.
750  */
751 static void
752 bsd4_resetpriority(struct lwp *lp)
753 {
754 	int newpriority;
755 	int opq;
756 	int npq;
757 
758 	/*
759 	 * Set p_priority for general process comparisons
760 	 */
761 	switch(lp->lwp_rtprio.type) {
762 	case RTP_PRIO_REALTIME:
763 		lp->lwp_priority = PRIBASE_REALTIME + lp->lwp_rtprio.prio;
764 		return;
765 	case RTP_PRIO_NORMAL:
766 		break;
767 	case RTP_PRIO_IDLE:
768 		lp->lwp_priority = PRIBASE_IDLE + lp->lwp_rtprio.prio;
769 		return;
770 	case RTP_PRIO_THREAD:
771 		lp->lwp_priority = PRIBASE_THREAD + lp->lwp_rtprio.prio;
772 		return;
773 	}
774 
775 	/*
776 	 * NORMAL priorities fall through.  These are based on niceness
777 	 * and cpu use.  Lower numbers == higher priorities.
778 	 *
779 	 * Calculate our priority based on our niceness and estimated cpu.
780 	 * Note that the nice value adjusts the baseline, which affects
781 	 * cpu bursts but does not affect overall cpu use between cpu-bound
782 	 * processes.  The use of the nice field in the decay calculation
783 	 * controls the overall cpu use.
784 	 *
785 	 * This isn't an exact calculation.  We fit the full nice and
786 	 * estcpu range into the priority range so the actual PPQ value
787 	 * is incorrect, but it's still a reasonable way to think about it.
788 	 */
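	/*
	 * Worked example, assuming the usual PRIO_MIN == -20 and
	 * PRIO_MAX == 20 (PRIO_RANGE == 41): the scale denominator below is
	 * 41*4/2 + 16384*4/512 = 82 + 128 = 210, so a nice 0 lwp with zero
	 * estcpu gets newpriority = 40 * 128 / 210 = 24, i.e. queue 24/4 = 6.
	 */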
789 	newpriority = (lp->lwp_proc->p_nice - PRIO_MIN) * PPQ / NICEPPQ;
790 	newpriority += lp->lwp_estcpu * PPQ / ESTCPUPPQ;
791 	newpriority = newpriority * MAXPRI /
792 		    (PRIO_RANGE * PPQ / NICEPPQ + ESTCPUMAX * PPQ / ESTCPUPPQ);
793 	newpriority = MIN(newpriority, MAXPRI - 1);	/* sanity */
794 	newpriority = MAX(newpriority, 0);		/* sanity */
795 	npq = newpriority / PPQ;
796 	crit_enter();
797 	opq = (lp->lwp_priority & PRIMASK) / PPQ;
798 	if (lp->lwp_proc->p_stat == SRUN && (lp->lwp_proc->p_flag & P_ONRUNQ) && opq != npq) {
799 		/*
800 		 * We have to move the process to another queue
801 		 */
802 		bsd4_remrunqueue(lp);
803 		lp->lwp_priority = PRIBASE_NORMAL + newpriority;
804 		bsd4_setrunqueue(lp);
805 	} else {
806 		/*
807 		 * We can just adjust the priority and it will be picked
808 		 * up later.
809 		 */
810 		KKASSERT(opq == npq || (lp->lwp_proc->p_flag & P_ONRUNQ) == 0);
811 		lp->lwp_priority = PRIBASE_NORMAL + newpriority;
812 	}
813 	crit_exit();
814 }
815 
816 /*
817  * Called from fork1() when a new child process is being created.
818  *
819  * Give the child process an initial estcpu that is more batch than
820  * its parent and dock the parent for the fork (but do not
821  * reschedule the parent).   This comprises the main part of our batch
822  * detection heuristic for both parallel forking and sequential execs.
823  *
824  * Interactive processes will decay the boosted estcpu quickly while batch
825  * processes will tend to compound it.
826  * XXX lwp should be "spawning" instead of "forking"
827  */
828 static void
829 bsd4_forking(struct lwp *plp, struct lwp *lp)
830 {
831 	lp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ);
832 	lp->lwp_origcpu = lp->lwp_estcpu;
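	/*
	 * lwp_origcpu records the estcpu the child starts with; bsd4_exiting()
	 * only propagates growth beyond it back to the parent, and the
	 * scheduler clock slowly decays it as part of the batch detection
	 * heuristic.
	 */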
833 	plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + ESTCPUPPQ);
834 }
835 
836 /*
837  * Called when the parent reaps a child.   Propagate cpu use by the child
838  * back to the parent.
839  */
840 static void
841 bsd4_exiting(struct lwp *plp, struct lwp *lp)
842 {
843 	int delta;
844 
845 	if (plp->lwp_proc->p_pid != 1) {
846 		delta = lp->lwp_estcpu - lp->lwp_origcpu;
847 		if (delta > 0)
848 			plp->lwp_estcpu = ESTCPULIM(plp->lwp_estcpu + delta);
849 	}
850 }
851 
852 /*
853  * Called from acquire and from kern_synch's one-second timer with a
854  * critical section held.
855  *
856  * Decay p_estcpu based on the number of ticks we haven't been running
857  * and our p_nice.  As the load increases each process observes a larger
858  * number of idle ticks (because other processes are running in them).
859  * This observation leads to a larger correction which tends to make the
860  * system more 'batchy'.
861  *
862  * Note that no recalculation occurs for a process which sleeps and wakes
863  * up in the same tick.  That is, a system doing thousands of context
864  * switches per second will still only do serious estcpu calculations
865  * ESTCPUFREQ times per second.
866  */
867 static
868 void
869 bsd4_recalculate_estcpu(struct lwp *lp)
870 {
871 	globaldata_t gd = mycpu;
872 	sysclock_t cpbase;
873 	int loadfac;
874 	int ndecay;
875 	int nticks;
876 	int nleft;
877 
878 	/*
879 	 * We have to subtract periodic to get the last schedclock
880 	 * timeout time, otherwise we would get the upcoming timeout.
881 	 * Keep in mind that a process can migrate between cpus and
882 	 * while the scheduler clock should be very close, boundary
883 	 * conditions could lead to a small negative delta.
884 	 */
885 	cpbase = gd->gd_schedclock.time - gd->gd_schedclock.periodic;
886 
887 	if (lp->lwp_slptime > 1) {
888 		/*
889 		 * Too much time has passed, do a coarse correction.
890 		 */
891 		lp->lwp_estcpu = lp->lwp_estcpu >> 1;
892 		bsd4_resetpriority(lp);
893 		lp->lwp_cpbase = cpbase;
894 		lp->lwp_cpticks = 0;
895 	} else if (lp->lwp_cpbase != cpbase) {
896 		/*
897 		 * Adjust estcpu if we are in a different tick.  Don't waste
898 		 * time if we are in the same tick.
899 		 *
900 		 * First calculate the number of ticks in the measurement
901 		 * interval.
902 		 */
903 		nticks = (cpbase - lp->lwp_cpbase) / gd->gd_schedclock.periodic;
904 		updatepcpu(lp, lp->lwp_cpticks, nticks);
905 
906 		if ((nleft = nticks - lp->lwp_cpticks) < 0)
907 			nleft = 0;
908 		if (usched_debug == lp->lwp_proc->p_pid) {
909 			printf("pid %d tid %d estcpu %d cpticks %d nticks %d nleft %d",
910 				lp->lwp_proc->p_pid, lp->lwp_tid, lp->lwp_estcpu,
911 				lp->lwp_cpticks, nticks, nleft);
912 		}
913 
914 		/*
915 		 * Calculate a decay value based on ticks remaining scaled
916 		 * down by the instantaneous load and p_nice.
917 		 */
918 		if ((loadfac = runqcount) < 2)
919 			loadfac = 2;
920 		ndecay = nleft * usched_bsd4_decay * 2 *
921 			(PRIO_MAX * 2 - lp->lwp_proc->p_nice) / (loadfac * PRIO_MAX * 2);
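		/*
		 * With the default usched_bsd4_decay (ESTCPUINCR/2 == 64),
		 * nice 0 and the minimum loadfac of 2, this reduces to
		 * ndecay == nleft * 64: an idle tick decays estcpu at half
		 * the rate a 100% cpu-bound tick accumulates it.
		 */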
922 
923 		/*
924 		 * Adjust p_estcpu.  Handle a border case where batch jobs
925 		 * can get stalled long enough to decay to zero when they
926 		 * shouldn't.
927 		 */
928 		if (lp->lwp_estcpu > ndecay * 2)
929 			lp->lwp_estcpu -= ndecay;
930 		else
931 			lp->lwp_estcpu >>= 1;
932 
933 		if (usched_debug == lp->lwp_proc->p_pid)
934 			printf(" ndecay %d estcpu %d\n", ndecay, lp->lwp_estcpu);
935 
936 		bsd4_resetpriority(lp);
937 		lp->lwp_cpbase = cpbase;
938 		lp->lwp_cpticks = 0;
939 	}
940 }
941 
942 #ifdef SMP
943 
944 /*
945  * For SMP systems a user scheduler helper thread is created for each
946  * cpu and is used to allow one cpu to wakeup another for the purposes of
947  * scheduling userland threads from setrunqueue().  UP systems do not
948  * need the helper since there is only one cpu.  We can't use the idle
949  * thread for this because we need to hold the MP lock.  Additionally,
950  * doing things this way allows us to HLT idle cpus on MP systems.
951  */
952 static void
953 sched_thread(void *dummy)
954 {
955     globaldata_t gd = mycpu;
956     int cpuid = gd->gd_cpuid;		/* doesn't change */
957     u_int32_t cpumask = 1 << cpuid;	/* doesn't change */
958 
959     get_mplock();			/* hold the MP lock */
960     for (;;) {
961 	struct lwp *nlp;
962 
963 	lwkt_deschedule_self(gd->gd_curthread);	/* interlock */
964 	rdyprocmask |= cpumask;
965 	crit_enter_quick(gd->gd_curthread);
966 	if ((curprocmask & cpumask) == 0 && (nlp = chooseproc(NULL)) != NULL) {
967 	    curprocmask |= cpumask;
968 	    gd->gd_upri = nlp->lwp_priority;
969 	    gd->gd_uschedcp = nlp;
970 	    lwkt_acquire(nlp->lwp_thread);
971 	    lwkt_schedule(nlp->lwp_thread);
972 	}
973 	crit_exit_quick(gd->gd_curthread);
974 	lwkt_switch();
975     }
976 }
977 
978 /*
979  * Setup our scheduler helpers.  Note that curprocmask bit 0 has already
980  * been cleared by rqinit() and we should not mess with it further.
981  */
982 static void
983 sched_thread_cpu_init(void)
984 {
985     int i;
986 
987     if (bootverbose)
988 	printf("start scheduler helpers on cpus:");
989 
990     for (i = 0; i < ncpus; ++i) {
991 	globaldata_t dgd = globaldata_find(i);
992 	cpumask_t mask = 1 << i;
993 
994 	if ((mask & smp_active_mask) == 0)
995 	    continue;
996 
997 	if (bootverbose)
998 	    printf(" %d", i);
999 
1000 	lwkt_create(sched_thread, NULL, NULL, &dgd->gd_schedthread,
1001 		    TDF_STOPREQ, i, "usched %d", i);
1002 
1003 	/*
1004 	 * Allow user scheduling on the target cpu.  cpu #0 has already
1005 	 * been enabled in rqinit().
1006 	 */
1007 	if (i)
1008 	    curprocmask &= ~mask;
1009 	rdyprocmask |= mask;
1010     }
1011     if (bootverbose)
1012 	printf("\n");
1013 }
1014 SYSINIT(uschedtd, SI_SUB_FINISH_SMP, SI_ORDER_ANY, sched_thread_cpu_init, NULL)
1015 
1016 #endif
1017 
1018