/*	$NetBSD: kern_synch.c,v 1.79 2000/06/27 17:41:27 mrg Exp $	*/

/*-
 * Copyright (c) 1999, 2000 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
 * NASA Ames Research Center.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the NetBSD
 *	Foundation, Inc. and its contributors.
 * 4. Neither the name of The NetBSD Foundation nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.9 (Berkeley) 5/19/95
 */

#include "opt_ddb.h"
#include "opt_ktrace.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/buf.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>

#include <uvm/uvm_extern.h>

#ifdef KTRACE
#include <sys/ktrace.h>
#endif

#include <machine/cpu.h>

int	lbolt;			/* once a second sleep address */

/*
 * The global scheduler state.
 */
struct prochd sched_qs[RUNQUE_NQS];	/* run queues */
__volatile u_int32_t sched_whichqs;	/* bitmap of non-empty queues */
struct slpque sched_slpque[SLPQUE_TABLESIZE]; /* sleep queues */

void roundrobin(void *);
void schedcpu(void *);
void updatepri(struct proc *);
void endtsleep(void *);

__inline void awaken(struct proc *);

struct callout roundrobin_ch = CALLOUT_INITIALIZER;
struct callout schedcpu_ch = CALLOUT_INITIALIZER;

/*
 * Force switch among equal priority processes every 100ms.
 */
/* ARGSUSED */
void
roundrobin(void *arg)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	int s;

	if (curproc != NULL) {
		s = splstatclock();
		if (spc->spc_flags & SPCF_SEENRR) {
			/*
			 * The process has already been through a roundrobin
			 * without switching and may be hogging the CPU.
			 * Indicate that the process should yield.
			 */
			spc->spc_flags |= SPCF_SHOULDYIELD;
		} else
			spc->spc_flags |= SPCF_SEENRR;
		splx(s);
	}
	need_resched();
	callout_reset(&roundrobin_ch, hz / 10, roundrobin, NULL);
}

/*
 * Constants for digital decay and forget:
 *	90% of (p_estcpu) usage in 5 * loadav time
 *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 *          Note that, as ps(1) mentions, this can let percentages
 *          total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that hardclock updates p_estcpu and p_cpticks independently.
 *
 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 * 	for (i = 0; i < (5 * loadavg); i++)
 * 		p_estcpu *= decay;
 * will compute
 * 	p_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 * 	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 * 	b = 2 * loadavg
 * then
 * 	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *         For x close to zero, exp(x) =~ 1 + x, since
 *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *         For x close to zero, ln(1+x) =~ x, since
 *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
 *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *         ln(.1) =~ -2.30
 *
 * Proof of (1):
 *    Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *      ln(factor) =~ (-2.30/5*loadav), or
 *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
 *
 * Proof of (2):
 *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *      power*ln(b/(b+1)) =~ -2.30, or
 *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *      loadav: 1       2       3       4
 *      power:  5.68    10.32   14.94   19.55
 */

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

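/*
 * Worked example of the decay above (added commentary, a sketch rather
 * than part of the original derivation): with a load average of 1.0,
 * loadfactor() yields 2.0 in fixed point (2 * FSCALE), so each call of
 * decay_cpu() multiplies p_estcpu by (2 * FSCALE) / (2 * FSCALE + FSCALE),
 * i.e. 2/3.  Applied 5 * loadavg = 5 times this gives (2/3)^5 ~= 0.13,
 * roughly the 90% decay promised.  Concretely, assuming the usual
 * FSHIFT of 11 (FSCALE == 2048), decay_cpu(2 * 2048, 100) computes
 * (4096 * 100) / (4096 + 2048) == 66.
 */
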
/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11
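
/*
 * Added commentary (a sketch of the arithmetic above, not original text):
 * with CCPU_SHIFT = 11 and a 100Hz statistics clock, the exact per-second
 * contribution (1 - exp(-1/20)) * (p_cpticks / clkhz) is about
 * 0.000488 * p_cpticks, and 0.000488 ~= 2^-11, so the fast formula in
 * schedcpu() can simply add p_cpticks << (FSHIFT - CCPU_SHIFT) instead of
 * performing the multiply and divide of the general formula.
 */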

/*
 * Recompute process priorities, every hz ticks.
 */
/* ARGSUSED */
void
schedcpu(void *arg)
{
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct proc *p;
	int s;
	unsigned int newcpu;
	int clkhz;

	proclist_lock_read();
	for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
		/*
		 * Increment time in/out of memory and sleep time
		 * (if sleeping).  We ignore overflow; with 16-bit int's
		 * (remember them?) overflow takes 45 days.
		 */
		p->p_swtime++;
		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
			p->p_slptime++;
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (p->p_slptime > 1)
			continue;
		s = splstatclock();	/* prevent state changes */
		/*
		 * p_pctcpu is only for ps.
		 */
		clkhz = stathz != 0 ? stathz : hz;
#if	(FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (clkhz == 100)?
			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
			100 * (((fixpt_t) p->p_cpticks)
				<< (FSHIFT - CCPU_SHIFT)) / clkhz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
			(p->p_cpticks * FSCALE / clkhz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		newcpu = (u_int)decay_cpu(loadfac, p->p_estcpu);
		p->p_estcpu = newcpu;
		resetpriority(p);
		if (p->p_priority >= PUSER) {
			if (p->p_stat == SRUN &&
			    (p->p_flag & P_INMEM) &&
			    (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) {
				remrunqueue(p);
				p->p_priority = p->p_usrpri;
				setrunqueue(p);
			} else
				p->p_priority = p->p_usrpri;
		}
		splx(s);
	}
	proclist_unlock_read();
	uvm_meter();
	wakeup((caddr_t)&lbolt);
	callout_reset(&schedcpu_ch, hz, schedcpu, NULL);
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay p_estcpu to zero.
 */
void
updatepri(struct proc *p)
{
	unsigned int newcpu = p->p_estcpu;
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);

	if (p->p_slptime > 5 * loadfac)
		p->p_estcpu = 0;
	else {
		p->p_slptime--;	/* the first time was done in schedcpu */
		while (newcpu && --p->p_slptime)
			newcpu = (int) decay_cpu(loadfac, newcpu);
		p->p_estcpu = newcpu;
	}
	resetpriority(p);
}

/*
 * During autoconfiguration or after a panic, a sleep will simply
 * lower the priority briefly to allow interrupts, then return.
 * The priority to be used (safepri) is machine-dependent, thus this
 * value is initialized and maintained in the machine-dependent layers.
 * This priority will typically be 0, or the lowest priority
 * that is safe for use on the interrupt stack; it can be made
 * higher to block network software interrupts after panics.
 */
int safepri;

/*
 * General sleep call.  Suspends the current process until a wakeup is
 * performed on the specified identifier.  The process will then be made
 * runnable with the specified priority.  Sleeps at most timo/hz seconds
 * (0 means no timeout).  If pri includes the PCATCH flag, signals are
 * checked before and after sleeping, else signals are not checked.
 * Returns 0 if awakened, EWOULDBLOCK if the timeout expires.  If PCATCH
 * is set and a signal needs to be delivered, ERESTART is returned if the
 * current system call should be restarted if possible, and EINTR is
 * returned if the system call should be interrupted by the signal.
 *
 * The interlock is held until the scheduler_slock is held.  The
 * interlock will be locked before returning back to the caller
 * unless the PNORELOCK flag is specified, in which case the
 * interlock will always be unlocked upon return.
 */
int
ltsleep(void *ident, int priority, const char *wmesg, int timo,
    __volatile struct simplelock *interlock)
{
	struct proc *p = curproc;
	struct slpque *qp;
	int sig, s;
	int catch = priority & PCATCH;
	int relock = (priority & PNORELOCK) == 0;
#if 0 /* XXXSMP */
	int dobiglock;
#endif

	/*
	 * XXXSMP
	 * This is probably bogus.  Figure out what the right
	 * thing to do here really is.
	 * Note that not sleeping if ltsleep is called with curproc == NULL
	 * in the shutdown case is disgusting but partly necessary given
	 * how shutdown (barely) works.
	 */
	if (cold || (doing_shutdown && (panicstr || (p == NULL)))) {
		/*
		 * After a panic, or during autoconfiguration,
		 * just give interrupts a chance, then just return;
		 * don't run any other procs or panic below,
		 * in case this is the idle process and already asleep.
		 */
		s = splhigh();
		splx(safepri);
		splx(s);
		if (interlock != NULL && relock == 0)
			simple_unlock(interlock);
		return (0);
	}

#if 0 /* XXXSMP */
	dobiglock = (p->p_flag & P_BIGLOCK) != 0;
#endif

#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p, 1, 0);
#endif

	s = splhigh();			/* XXXSMP: SCHED_LOCK(s) */

#ifdef DIAGNOSTIC
	if (ident == NULL)
		panic("ltsleep: ident == NULL");
	if (p->p_stat != SONPROC)
		panic("ltsleep: p_stat %d != SONPROC", p->p_stat);
	if (p->p_back != NULL)
		panic("ltsleep: p_back != NULL");
#endif

	p->p_wchan = ident;
	p->p_wmesg = wmesg;
	p->p_slptime = 0;
	p->p_priority = priority & PRIMASK;

	qp = SLPQUE(ident);
	if (qp->sq_head == 0)
		qp->sq_head = p;
	else
		*qp->sq_tailp = p;
	*(qp->sq_tailp = &p->p_forw) = 0;

	if (timo)
		callout_reset(&p->p_tsleep_ch, timo, endtsleep, p);

	/*
	 * We can now release the interlock; the scheduler_slock
	 * is held, so a thread can't get in to do wakeup() before
	 * we do the switch.
	 *
	 * XXX We leave the code block here, after inserting ourselves
	 * on the sleep queue, because we might want a more clever
	 * data structure for the sleep queues at some point.
	 */
	if (interlock != NULL)
		simple_unlock(interlock);

	/*
	 * We put ourselves on the sleep queue and start our timeout
	 * before calling CURSIG, as we could stop there, and a wakeup
	 * or a SIGCONT (or both) could occur while we were stopped.
	 * A SIGCONT would cause us to be marked as SSLEEP
	 * without resuming us, thus we must be ready for sleep
	 * when CURSIG is called.  If the wakeup happens while we're
	 * stopped, p->p_wchan will be 0 upon return from CURSIG.
	 */
	if (catch) {
		p->p_flag |= P_SINTR;
		if ((sig = CURSIG(p)) != 0) {
			if (p->p_wchan != NULL)
				unsleep(p);
			p->p_stat = SONPROC;
#if 0 /* XXXSMP */
			/*
			 * We're going to skip the unlock, so
			 * we don't need to relock after resume.
			 */
			dobiglock = 0;
#endif
			goto resume;
		}
		if (p->p_wchan == NULL) {
			catch = 0;
#if 0 /* XXXSMP */
			/* See above. */
			dobiglock = 0;
#endif
			goto resume;
		}
	} else
		sig = 0;
	p->p_stat = SSLEEP;
	p->p_stats->p_ru.ru_nvcsw++;

#if 0 /* XXXSMP */
	if (dobiglock) {
		/*
		 * Release the kernel_lock, as we are about to
		 * yield the CPU.  The scheduler_slock is still
		 * held until cpu_switch() selects a new process
		 * and removes it from the run queue.
		 */
		kernel_lock_release();
	}
#endif

	/* scheduler_slock held */
	mi_switch(p);
	/* scheduler_slock held */
#ifdef	DDB
	/* handy breakpoint location after process "wakes" */
	asm(".globl bpendtsleep ; bpendtsleep:");
#endif

 resume:
	KDASSERT(p->p_cpu != NULL);
	KDASSERT(p->p_cpu == curcpu());
	p->p_cpu->ci_schedstate.spc_curpriority = p->p_usrpri;
	splx(s);			/* XXXSMP: SCHED_UNLOCK(s) */
#if 0 /* XXXSMP */
	if (dobiglock) {
		/*
		 * Reacquire the kernel_lock now.  We do this after
		 * we've released scheduler_slock to avoid deadlock.
		 */
		kernel_lock_acquire(LK_EXCLUSIVE);
	}
#endif
	p->p_flag &= ~P_SINTR;
	if (p->p_flag & P_TIMEOUT) {
		p->p_flag &= ~P_TIMEOUT;
		if (sig == 0) {
#ifdef KTRACE
			if (KTRPOINT(p, KTR_CSW))
				ktrcsw(p, 0, 0);
#endif
			if (relock && interlock != NULL)
				simple_lock(interlock);
			return (EWOULDBLOCK);
		}
	} else if (timo)
		callout_stop(&p->p_tsleep_ch);
	if (catch && (sig != 0 || (sig = CURSIG(p)) != 0)) {
#ifdef KTRACE
		if (KTRPOINT(p, KTR_CSW))
			ktrcsw(p, 0, 0);
#endif
		if (relock && interlock != NULL)
			simple_lock(interlock);
		if ((p->p_sigacts->ps_sigact[sig].sa_flags & SA_RESTART) == 0)
			return (EINTR);
		return (ERESTART);
	}
#ifdef KTRACE
	if (KTRPOINT(p, KTR_CSW))
		ktrcsw(p, 0, 0);
#endif
	if (relock && interlock != NULL)
		simple_lock(interlock);
	return (0);
}
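
/*
 * Illustrative usage sketch for ltsleep()/wakeup() (added commentary;
 * "sc", "sc_slock" and "sc_busy" are hypothetical driver state, not
 * anything defined in this file):
 *
 *	simple_lock(&sc->sc_slock);
 *	while (sc->sc_busy) {
 *		error = ltsleep(&sc->sc_busy, PRIBIO | PCATCH, "example",
 *		    0, &sc->sc_slock);
 *		if (error) {
 *			simple_unlock(&sc->sc_slock);
 *			return (error);
 *		}
 *	}
 *	sc->sc_busy = 1;
 *	simple_unlock(&sc->sc_slock);
 *
 * The waker clears sc_busy while holding sc_slock and then calls
 * wakeup(&sc->sc_busy).  Because PNORELOCK is not passed, ltsleep()
 * reacquires the interlock before returning, even on error.
 */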

/*
 * Implement timeout for tsleep.
 * If process hasn't been awakened (wchan non-zero),
 * set timeout flag and undo the sleep.  If proc
 * is stopped, just unsleep so it will remain stopped.
 */
void
endtsleep(void *arg)
{
	struct proc *p;
	int s;

	p = (struct proc *)arg;
	s = splhigh();
	if (p->p_wchan) {
		if (p->p_stat == SSLEEP)
			setrunnable(p);
		else
			unsleep(p);
		p->p_flag |= P_TIMEOUT;
	}
	splx(s);
}

/*
 * Remove a process from its wait queue
 */
void
unsleep(struct proc *p)
{
	struct slpque *qp;
	struct proc **hp;
	int s;

	s = splhigh();
	if (p->p_wchan) {
		hp = &(qp = SLPQUE(p->p_wchan))->sq_head;
		while (*hp != p)
			hp = &(*hp)->p_forw;
		*hp = p->p_forw;
		if (qp->sq_tailp == &p->p_forw)
			qp->sq_tailp = hp;
		p->p_wchan = 0;
	}
	splx(s);
}

/*
 * Optimized-for-wakeup() version of setrunnable().
 */
__inline void
awaken(struct proc *p)
{

	if (p->p_slptime > 1)
		updatepri(p);
	p->p_slptime = 0;
	p->p_stat = SRUN;

	/*
	 * Since curpriority is a user priority, p->p_priority
	 * is always better than curpriority.
	 */
	if (p->p_flag & P_INMEM) {
		setrunqueue(p);
		need_resched();
	} else
		wakeup(&proc0);
}

/*
 * Make all processes sleeping on the specified identifier runnable.
 */
void
wakeup(void *ident)
{
	struct slpque *qp;
	struct proc *p, **q;
	int s;

	s = splhigh();			/* XXXSMP: SCHED_LOCK(s) */

	qp = SLPQUE(ident);
 restart:
	for (q = &qp->sq_head; (p = *q) != NULL; ) {
#ifdef DIAGNOSTIC
		if (p->p_back || (p->p_stat != SSLEEP && p->p_stat != SSTOP))
			panic("wakeup");
#endif
		if (p->p_wchan == ident) {
			p->p_wchan = 0;
			*q = p->p_forw;
			if (qp->sq_tailp == &p->p_forw)
				qp->sq_tailp = q;
			if (p->p_stat == SSLEEP) {
				awaken(p);
				goto restart;
			}
		} else
			q = &p->p_forw;
	}
	splx(s);			/* XXXSMP: SCHED_UNLOCK(s) */
}

/*
 * Make the highest priority process first in line on the specified
 * identifier runnable.
 */
void
wakeup_one(void *ident)
{
	struct slpque *qp;
	struct proc *p, **q;
	struct proc *best_sleepp, **best_sleepq;
	struct proc *best_stopp, **best_stopq;
	int s;

	best_sleepp = best_stopp = NULL;
	best_sleepq = best_stopq = NULL;

	s = splhigh();		/* XXXSMP: SCHED_LOCK(s) */

	qp = SLPQUE(ident);

	for (q = &qp->sq_head; (p = *q) != NULL; q = &p->p_forw) {
#ifdef DIAGNOSTIC
		if (p->p_back || (p->p_stat != SSLEEP && p->p_stat != SSTOP))
			panic("wakeup_one");
#endif
		if (p->p_wchan == ident) {
			if (p->p_stat == SSLEEP) {
				if (best_sleepp == NULL ||
				    p->p_priority < best_sleepp->p_priority) {
					best_sleepp = p;
					best_sleepq = q;
				}
			} else {
				if (best_stopp == NULL ||
				    p->p_priority < best_stopp->p_priority) {
					best_stopp = p;
					best_stopq = q;
				}
			}
		}
	}

	/*
	 * Consider any SSLEEP process higher than the highest priority SSTOP
	 * process.
	 */
	if (best_sleepp != NULL) {
		p = best_sleepp;
		q = best_sleepq;
	} else {
		p = best_stopp;
		q = best_stopq;
	}

	if (p != NULL) {
		p->p_wchan = NULL;
		*q = p->p_forw;
		if (qp->sq_tailp == &p->p_forw)
			qp->sq_tailp = q;
		if (p->p_stat == SSLEEP)
			awaken(p);
	}
	splx(s);		/* XXXSMP: SCHED_UNLOCK(s) */
}

/*
 * General yield call.  Puts the current process back on its run queue and
 * performs a voluntary context switch.
 */
void
yield(void)
{
	struct proc *p = curproc;
	int s;

	s = splstatclock();
	p->p_priority = p->p_usrpri;
	p->p_stat = SRUN;
	setrunqueue(p);
	p->p_stats->p_ru.ru_nvcsw++;
	mi_switch(p);
	splx(s);
}

/*
 * General preemption call.  Puts the current process back on its run queue
 * and performs an involuntary context switch.  If a process is supplied,
 * we switch to that process.  Otherwise, we use the normal process selection
 * criteria.
 */
void
preempt(struct proc *newp)
{
	struct proc *p = curproc;
	int s;

	/*
	 * XXX Switching to a specific process is not supported yet.
	 */
	if (newp != NULL)
		panic("preempt: cpu_preempt not yet implemented");

	s = splstatclock();
	p->p_priority = p->p_usrpri;
	p->p_stat = SRUN;
	setrunqueue(p);
	p->p_stats->p_ru.ru_nivcsw++;
	mi_switch(p);
	splx(s);
}

/*
 * The machine independent parts of context switch.
 * Must be called at splstatclock() or higher.
 */
void
mi_switch(struct proc *p)
{
	struct schedstate_percpu *spc;
	struct rlimit *rlim;
	long s, u;
	struct timeval tv;

	KDASSERT(p->p_cpu != NULL);
	KDASSERT(p->p_cpu == curcpu());

	spc = &p->p_cpu->ci_schedstate;

#ifdef DEBUG
	if (p->p_simple_locks) {
		printf("p->p_simple_locks %d\n", p->p_simple_locks);
#ifdef LOCKDEBUG
		simple_lock_dump();
#endif
		panic("sleep: holding simple lock");
	}
#endif
	/*
	 * Compute the amount of time during which the current
	 * process was running, and add that to its total so far.
	 */
	microtime(&tv);
	u = p->p_rtime.tv_usec + (tv.tv_usec - spc->spc_runtime.tv_usec);
	s = p->p_rtime.tv_sec + (tv.tv_sec - spc->spc_runtime.tv_sec);
	if (u < 0) {
		u += 1000000;
		s--;
	} else if (u >= 1000000) {
		u -= 1000000;
		s++;
	}
	p->p_rtime.tv_usec = u;
	p->p_rtime.tv_sec = s;

	/*
	 * Check if the process exceeds its cpu resource allocation.
	 * If over max, kill it.  In any case, if it has run for more
	 * than 10 minutes, reduce priority to give others a chance.
	 */
	rlim = &p->p_rlimit[RLIMIT_CPU];
	if (s >= rlim->rlim_cur) {
		if (s >= rlim->rlim_max)
			psignal(p, SIGKILL);
		else {
			psignal(p, SIGXCPU);
			if (rlim->rlim_cur < rlim->rlim_max)
				rlim->rlim_cur += 5;
		}
	}
	if (autonicetime && s > autonicetime && p->p_ucred->cr_uid &&
	    p->p_nice == NZERO) {
		p->p_nice = autoniceval + NZERO;
		resetpriority(p);
	}

	/*
	 * Process is about to yield the CPU; clear the appropriate
	 * scheduling flags.
	 */
	spc->spc_flags &= ~SPCF_SWITCHCLEAR;

	/*
	 * Pick a new current process and switch to it.  When we
	 * run again, we'll return back here.
	 */
	uvmexp.swtch++;
	cpu_switch(p);

	/*
	 * We're running again; record our new start time.  We might
	 * be running on a new CPU now, so don't use the cached
	 * schedstate_percpu pointer.
	 */
	KDASSERT(p->p_cpu != NULL);
	KDASSERT(p->p_cpu == curcpu());
	microtime(&p->p_cpu->ci_schedstate.spc_runtime);
}

/*
 * Initialize the (doubly-linked) run queues
 * to be empty.
 */
void
rqinit(void)
{
	int i;

	for (i = 0; i < RUNQUE_NQS; i++)
		sched_qs[i].ph_link = sched_qs[i].ph_rlink =
		    (struct proc *)&sched_qs[i];
}

/*
 * Change process state to be runnable,
 * placing it on the run queue if it is in memory,
 * and awakening the swapper if it isn't in memory.
 */
void
setrunnable(struct proc *p)
{
	int s;

	s = splhigh();
	switch (p->p_stat) {
	case 0:
	case SRUN:
	case SONPROC:
	case SZOMB:
	case SDEAD:
	default:
		panic("setrunnable");
	case SSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((p->p_flag & P_TRACED) != 0 && p->p_xstat != 0) {
			sigaddset(&p->p_siglist, p->p_xstat);
			p->p_sigcheck = 1;
		}
	case SSLEEP:
		unsleep(p);		/* e.g. when sending signals */
		break;

	case SIDL:
		break;
	}
	p->p_stat = SRUN;
	if (p->p_flag & P_INMEM)
		setrunqueue(p);
	splx(s);
	if (p->p_slptime > 1)
		updatepri(p);
	p->p_slptime = 0;
	if ((p->p_flag & P_INMEM) == 0)
		wakeup((caddr_t)&proc0);
	else if (p->p_priority < curcpu()->ci_schedstate.spc_curpriority) {
		/*
		 * XXXSMP
		 * This is wrong.  It will work, but what really
		 * needs to happen is:
		 *
		 *	- Need to check if p is higher priority
		 *	  than the process currently running on
		 *	  the CPU p last ran on (let p_cpu persist
		 *	  after a context switch?), and preempt
		 *	  that one (or, if there is no process
		 *	  there, simply need_resched() that CPU).
		 *
		 *	- Failing that, traverse a list of
		 *	  available CPUs and need_resched() the
		 *	  CPU with the lowest priority that's
		 *	  lower than p's.
		 */
		need_resched();
	}
}

/*
 * Compute the priority of a process when running in user mode.
 * Arrange to reschedule if the resulting priority is better
 * than that of the current process.
 */
void
resetpriority(struct proc *p)
{
	unsigned int newpriority;

	newpriority = PUSER + p->p_estcpu + NICE_WEIGHT * (p->p_nice - NZERO);
	newpriority = min(newpriority, MAXPRI);
	p->p_usrpri = newpriority;
	if (newpriority < curcpu()->ci_schedstate.spc_curpriority) {
		/*
		 * XXXSMP
		 * Same applies as in setrunnable() above.
		 */
		need_resched();
	}
}
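
/*
 * Worked example for the formula above (added commentary; assumes the
 * usual PUSER of 50 and NICE_WEIGHT of 2 -- check sys/param.h and
 * sys/sched.h for the values actually in effect): a process at the
 * default nice (p_nice == NZERO) with p_estcpu == 12 gets user priority
 * 50 + 12 == 62; reniced by +4 it would get 50 + 12 + 2 * 4 == 70.
 * Larger numbers are worse priorities, and the result is clamped
 * to MAXPRI.
 */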

/*
 * We adjust the priority of the current process.  The priority of a process
 * gets worse as it accumulates CPU time.  The cpu usage estimator (p_estcpu)
 * is increased here.  The formula for computing priorities (in kern_synch.c)
 * will compute a different value each time p_estcpu increases. This can
 * cause a switch, but unless the priority crosses a PPQ boundary the actual
 * queue will not change.  The cpu usage estimator ramps up quite quickly
 * when the process is running (linearly), and decays away exponentially, at
 * a rate which is proportionally slower when the system is busy.  The basic
 * principle is that the system will 90% forget that the process used a lot
 * of CPU time in 5 * loadav seconds.  This causes the system to favor
 * processes which haven't run much recently, and to round-robin among other
 * processes.
 */

void
schedclock(struct proc *p)
{

	p->p_estcpu = ESTCPULIM(p->p_estcpu + 1);
	resetpriority(p);
	if (p->p_priority >= PUSER)
		p->p_priority = p->p_usrpri;
}