/*	$OpenBSD: sched_bsd.c,v 1.75 2023/06/20 16:30:30 cheloha Exp $	*/
/*	$NetBSD: kern_synch.c,v 1.37 1996/04/22 01:38:37 christos Exp $	*/

/*-
 * Copyright (c) 1982, 1986, 1990, 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)kern_synch.c	8.6 (Berkeley) 1/21/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <uvm/uvm_extern.h>
#include <sys/sched.h>
#include <sys/timeout.h>
#include <sys/smr.h>
#include <sys/tracepoint.h>

#ifdef KTRACE
#include <sys/ktrace.h>
#endif


int	lbolt;			/* once a second sleep address */
int	rrticks_init;		/* # of hardclock ticks per roundrobin() */

#ifdef MULTIPROCESSOR
struct __mp_lock sched_lock;
#endif

void			schedcpu(void *);
uint32_t		decay_aftersleep(uint32_t, uint32_t);

/*
 * Force switch among equal priority processes every 100ms.
 */
void
roundrobin(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;

	spc->spc_rrticks = rrticks_init;

	if (ci->ci_curproc != NULL) {
		if (spc->spc_schedflags & SPCF_SEENRR) {
			/*
			 * The process has already been through a roundrobin
			 * without switching and may be hogging the CPU.
			 * Indicate that the process should yield.
			 */
			atomic_setbits_int(&spc->spc_schedflags,
			    SPCF_SHOULDYIELD);
		} else {
			atomic_setbits_int(&spc->spc_schedflags,
			    SPCF_SEENRR);
		}
	}

	if (spc->spc_nrun)
		need_resched(ci);
}
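
/*
 * Informal note (added comment, not in the original file): rrticks_init is
 * the number of hardclock ticks between roundrobin() calls (see its
 * declaration above).  scheduler_start() at the bottom of this file sets it
 * to hz / 10, e.g. with the common hz value of 100 that is every 10 ticks,
 * which is where the 100ms quantum mentioned above comes from.
 */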

/*
 * Constants for digital decay and forget:
 *	90% of (p_estcpu) usage in 5 * loadav time
 *	95% of (p_pctcpu) usage in 60 seconds (load insensitive)
 *          Note that, as ps(1) mentions, this can let percentages
 *          total over 100% (I've seen 137.9% for 3 processes).
 *
 * Note that hardclock updates p_estcpu and p_cpticks independently.
 *
 * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
 * That is, the system wants to compute a value of decay such
 * that the following for loop:
 * 	for (i = 0; i < (5 * loadavg); i++)
 * 		p_estcpu *= decay;
 * will compute
 * 	p_estcpu *= 0.1;
 * for all values of loadavg:
 *
 * Mathematically this loop can be expressed by saying:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * The system computes decay as:
 * 	decay = (2 * loadavg) / (2 * loadavg + 1)
 *
 * We wish to prove that the system's computation of decay
 * will always fulfill the equation:
 * 	decay ** (5 * loadavg) ~= .1
 *
 * If we compute b as:
 * 	b = 2 * loadavg
 * then
 * 	decay = b / (b + 1)
 *
 * We now need to prove two things:
 *	1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
 *	2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
 *
 * Facts:
 *         For x close to zero, exp(x) =~ 1 + x, since
 *              exp(x) = 0! + x**1/1! + x**2/2! + ... .
 *              therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b.
 *         For x close to zero, ln(1+x) =~ x, since
 *              ln(1+x) = x - x**2/2 + x**3/3 - ...     -1 < x < 1
 *              therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1).
 *         ln(.1) =~ -2.30
 *
 * Proof of (1):
 *    Solve (factor)**(power) =~ .1 given power (5*loadav):
 *	solving for factor,
 *      ln(factor) =~ (-2.30/5*loadav), or
 *      factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) =
 *          exp(-1/b) =~ (b-1)/b =~ b/(b+1).                    QED
 *
 * Proof of (2):
 *    Solve (factor)**(power) =~ .1 given factor == (b/(b+1)):
 *	solving for power,
 *      power*ln(b/(b+1)) =~ -2.30, or
 *      power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav.  QED
 *
 * Actual power values for the implemented algorithm are as follows:
 *      loadav: 1       2       3       4
 *      power:  5.68    10.32   14.94   19.55
 */

/* calculations for digital decay to forget 90% of usage in 5*loadav sec */
#define	loadfactor(loadav)	(2 * (loadav))
#define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
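
/*
 * Worked example (illustration only, not part of the original algorithm):
 * for a load average of 1.0, loadfactor() gives b = 2 (2 * FSCALE in fixed
 * point), so decay_cpu() multiplies by b / (b + 1) = 2/3 each second.
 * Starting from the maximum p_estcpu of 255, six applications give
 * 255 * (2/3)^6 =~ 22, i.e. less than 10% of the original value, which
 * matches the "power" of 5.68 for loadav 1 in the table above.
 */
#if 0
	/* Hypothetical sketch of the per-second decay, assuming loadavg 1. */
	fixpt_t loadfac = loadfactor(1 * FSCALE);	/* b == 2, fixed point */
	uint32_t cpu = 255;
	int i;

	for (i = 0; i < 6; i++)
		cpu = decay_cpu(loadfac, cpu);	/* 170, 113, 75, 50, 33, 22 */
#endif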

/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;		/* exp(-1/20) */

/*
 * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the
 * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below
 * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT).
 *
 * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used:
 *	1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits).
 *
 * If you don't want to bother with the faster/more-accurate formula, you
 * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate
 * (more general) method of calculating the %age of CPU used by a process.
 */
#define	CCPU_SHIFT	11
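
/*
 * Worked example (informal, added comment): ccpu is exp(-1/20) in fixed
 * point, and schedcpu() below multiplies p_pctcpu by it once per second.
 * After 60 seconds an otherwise idle process therefore retains
 * exp(-60/20) = exp(-3) =~ 5% of its old p_pctcpu, i.e. 95% has been
 * forgotten, as the comment above the ccpu definition states.
 */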

/*
 * Recompute process priorities, every second.
 */
void
schedcpu(void *arg)
{
	struct timeout *to = (struct timeout *)arg;
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	struct proc *p;
	int s;
	unsigned int newcpu;

	LIST_FOREACH(p, &allproc, p_list) {
		/*
		 * Idle threads are never placed on the runqueue,
		 * therefore computing their priority is pointless.
		 */
		if (p->p_cpu != NULL &&
		    p->p_cpu->ci_schedstate.spc_idleproc == p)
			continue;
		/*
		 * Increment sleep time (if sleeping). We ignore overflow.
		 */
		if (p->p_stat == SSLEEP || p->p_stat == SSTOP)
			p->p_slptime++;
		p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT;
		/*
		 * If the process has slept the entire second,
		 * stop recalculating its priority until it wakes up.
		 */
		if (p->p_slptime > 1)
			continue;
		SCHED_LOCK(s);
		/*
		 * p_pctcpu is only for diagnostic tools such as ps.
		 */
#if	(FSHIFT >= CCPU_SHIFT)
		p->p_pctcpu += (stathz == 100)?
			((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT):
			100 * (((fixpt_t) p->p_cpticks)
				<< (FSHIFT - CCPU_SHIFT)) / stathz;
#else
		p->p_pctcpu += ((FSCALE - ccpu) *
			(p->p_cpticks * FSCALE / stathz)) >> FSHIFT;
#endif
		p->p_cpticks = 0;
		newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu);
		setpriority(p, newcpu, p->p_p->ps_nice);

		if (p->p_stat == SRUN &&
		    (p->p_runpri / SCHED_PPQ) != (p->p_usrpri / SCHED_PPQ)) {
			remrunqueue(p);
			setrunqueue(p->p_cpu, p, p->p_usrpri);
		}
		SCHED_UNLOCK(s);
	}
	wakeup(&lbolt);
	timeout_add_sec(to, 1);
}

/*
 * Recalculate the priority of a process after it has slept for a while.
 * For all load averages >= 1 and max p_estcpu of 255, sleeping for at
 * least six times the loadfactor will decay p_estcpu to zero.
 */
uint32_t
decay_aftersleep(uint32_t estcpu, uint32_t slptime)
{
	fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
	uint32_t newcpu;

	if (slptime > 5 * loadfac)
		newcpu = 0;
	else {
		newcpu = estcpu;
		slptime--;	/* the first time was done in schedcpu */
		while (newcpu && --slptime)
			newcpu = decay_cpu(loadfac, newcpu);

	}

	return (newcpu);
}
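
/*
 * Informal usage sketch (added comment, not in the original file):
 * setrunnable() below is the caller, e.g.
 *
 *	newcpu = decay_aftersleep(p->p_estcpu, p->p_slptime);
 *	setpriority(p, newcpu, pr->ps_nice);
 *
 * so a thread that slept for several seconds wakes up with a much smaller
 * CPU usage estimate and hence a better (numerically lower) user priority.
 */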

/*
 * General yield call.  Puts the current process back on its run queue and
 * performs a voluntary context switch.
 */
void
yield(void)
{
	struct proc *p = curproc;
	int s;

	SCHED_LOCK(s);
	setrunqueue(p->p_cpu, p, p->p_usrpri);
	p->p_ru.ru_nvcsw++;
	mi_switch();
	SCHED_UNLOCK(s);
}
/*
 * General preemption call.  Puts the current process back on its run queue
 * and performs an involuntary context switch.
 */
void
preempt(void)
{
	struct proc *p = curproc;
	int s;

	SCHED_LOCK(s);
	setrunqueue(p->p_cpu, p, p->p_usrpri);
	p->p_ru.ru_nivcsw++;
	mi_switch();
	SCHED_UNLOCK(s);
}

void
mi_switch(void)
{
	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
	struct proc *p = curproc;
	struct proc *nextproc;
	struct process *pr = p->p_p;
	struct timespec ts;
#ifdef MULTIPROCESSOR
	int hold_count;
	int sched_count;
#endif

	assertwaitok();
	KASSERT(p->p_stat != SONPROC);

	SCHED_ASSERT_LOCKED();

#ifdef MULTIPROCESSOR
	/*
	 * Release the kernel_lock, as we are about to yield the CPU.
	 */
	sched_count = __mp_release_all_but_one(&sched_lock);
	if (_kernel_lock_held())
		hold_count = __mp_release_all(&kernel_lock);
	else
		hold_count = 0;
#endif

	/*
	 * Compute the amount of time during which the current
	 * process was running, and add that to its total so far.
	 */
	nanouptime(&ts);
	if (timespeccmp(&ts, &spc->spc_runtime, <)) {
#if 0
		printf("uptime is not monotonic! "
		    "ts=%lld.%09lu, runtime=%lld.%09lu\n",
		    (long long)ts.tv_sec, ts.tv_nsec,
		    (long long)spc->spc_runtime.tv_sec,
		    spc->spc_runtime.tv_nsec);
#endif
	} else {
		timespecsub(&ts, &spc->spc_runtime, &ts);
		timespecadd(&p->p_rtime, &ts, &p->p_rtime);
	}

	/* add the time counts for this thread to the process's total */
	tuagg_unlocked(pr, p);

	/*
	 * Process is about to yield the CPU; clear the appropriate
	 * scheduling flags.
	 */
	atomic_clearbits_int(&spc->spc_schedflags, SPCF_SWITCHCLEAR);

	nextproc = sched_chooseproc();

	if (p != nextproc) {
		uvmexp.swtch++;
		TRACEPOINT(sched, off__cpu, nextproc->p_tid + THREAD_PID_OFFSET,
		    nextproc->p_p->ps_pid);
		cpu_switchto(p, nextproc);
		TRACEPOINT(sched, on__cpu, NULL);
	} else {
		TRACEPOINT(sched, remain__cpu, NULL);
		p->p_stat = SONPROC;
	}

	clear_resched(curcpu());

	SCHED_ASSERT_LOCKED();

	/*
	 * To preserve lock ordering, we need to release the sched lock
	 * and grab it after we grab the big lock.
	 * In the future, when the sched lock isn't recursive, we'll
	 * just release it here.
	 */
#ifdef MULTIPROCESSOR
	__mp_unlock(&sched_lock);
#endif

	SCHED_ASSERT_UNLOCKED();

	smr_idle();

	/*
	 * We're running again; record our new start time.  We might
	 * be running on a new CPU now, so don't use the cache'd
	 * schedstate_percpu pointer.
	 */
	KASSERT(p->p_cpu == curcpu());

	nanouptime(&p->p_cpu->ci_schedstate.spc_runtime);

#ifdef MULTIPROCESSOR
	/*
	 * Reacquire the kernel_lock now.  We do this after we've
	 * released the scheduler lock to avoid deadlock, and before
	 * we reacquire the interlock and the scheduler lock.
	 */
	if (hold_count)
		__mp_acquire_count(&kernel_lock, hold_count);
	__mp_acquire_count(&sched_lock, sched_count + 1);
#endif
}

/*
 * Change process state to be runnable,
 * placing it on the run queue.
 */
void
setrunnable(struct proc *p)
{
	struct process *pr = p->p_p;
	u_char prio;

	SCHED_ASSERT_LOCKED();

	switch (p->p_stat) {
	case 0:
	case SRUN:
	case SONPROC:
	case SDEAD:
	case SIDL:
	default:
		panic("setrunnable");
	case SSTOP:
		/*
		 * If we're being traced (possibly because someone attached us
		 * while we were stopped), check for a signal from the debugger.
		 */
		if ((pr->ps_flags & PS_TRACED) != 0 && pr->ps_xsig != 0)
			atomic_setbits_int(&p->p_siglist, sigmask(pr->ps_xsig));
		prio = p->p_usrpri;
		unsleep(p);
		break;
	case SSLEEP:
		prio = p->p_slppri;
		unsleep(p);		/* e.g. when sending signals */
		break;
	}
	setrunqueue(NULL, p, prio);
	if (p->p_slptime > 1) {
		uint32_t newcpu;

		newcpu = decay_aftersleep(p->p_estcpu, p->p_slptime);
		setpriority(p, newcpu, pr->ps_nice);
	}
	p->p_slptime = 0;
}

/*
 * Compute the priority of a process.
 */
void
setpriority(struct proc *p, uint32_t newcpu, uint8_t nice)
{
	unsigned int newprio;

	newprio = min((PUSER + newcpu + NICE_WEIGHT * (nice - NZERO)), MAXPRI);

	SCHED_ASSERT_LOCKED();
	p->p_estcpu = newcpu;
	p->p_usrpri = newprio;
}
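
/*
 * Worked example (informal, added comment; assumes the customary OpenBSD
 * values PUSER == 50, NZERO == 20, NICE_WEIGHT == 2 and MAXPRI == 127):
 * an un-niced thread (nice == NZERO) with newcpu == 0 gets p_usrpri ==
 * PUSER.  Each point of estimated CPU usage then worsens the priority by
 * one and each point of nice by NICE_WEIGHT, clamped at MAXPRI, so e.g.
 * newcpu == 20 and nice == 30 yield min(50 + 20 + 2 * 10, 127) == 90.
 */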

/*
 * We adjust the priority of the current process.  The priority of a process
 * gets worse as it accumulates CPU time.  The cpu usage estimator (p_estcpu)
 * is increased here.  The formula for computing priorities (setpriority()
 * above) will compute a different value each time p_estcpu increases. This
 * can cause a switch, but unless the priority crosses a PPQ boundary the
 * actual queue will not change.  The cpu usage estimator ramps up quite
 * quickly when the process is running (linearly), and decays away
 * exponentially, at a rate which is proportionally slower when the system is
 * busy.  The basic principle is that the system will 90% forget that the
 * process used a lot of CPU time in 5 * loadav seconds.  This causes the
 * system to favor processes which haven't run much recently, and to
 * round-robin among other processes.
 */
void
schedclock(struct proc *p)
{
	struct cpu_info *ci = curcpu();
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	uint32_t newcpu;
	int s;

	if (p == spc->spc_idleproc || spc->spc_spinning)
		return;

	SCHED_LOCK(s);
	newcpu = ESTCPULIM(p->p_estcpu + 1);
	setpriority(p, newcpu, p->p_p->ps_nice);
	SCHED_UNLOCK(s);
}
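
/*
 * Illustration of the PPQ remark above (informal, added comment; assumes
 * SCHED_PPQ == 128 / SCHED_NQS with the usual 32 run queues, i.e. a PPQ of
 * 4): each point of p_estcpu worsens p_usrpri by one, so p_estcpu must
 * grow by about four before the thread lands in a different run queue,
 * matching the comment above schedclock().
 */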

void (*cpu_setperf)(int);

#define PERFPOL_MANUAL 0
#define PERFPOL_AUTO 1
#define PERFPOL_HIGH 2
int perflevel = 100;
int perfpolicy = PERFPOL_AUTO;

#ifndef SMALL_KERNEL
/*
 * The code below handles CPU throttling.
 */
#include <sys/sysctl.h>

void setperf_auto(void *);
struct timeout setperf_to = TIMEOUT_INITIALIZER(setperf_auto, NULL);
extern int hw_power;

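/*
 * Descriptive note (added comment, not in the original file): setperf_auto()
 * implements the "auto" performance policy and re-arms itself every 100ms
 * via setperf_to.  When hw_power indicates external power it always selects
 * full speed.  Otherwise it compares, per interval, each online CPU's idle
 * time against its total time: if any CPU was idle for less than a third of
 * the interval, or all CPUs together for less than half, it raises perflevel
 * to 100 immediately; it only drops back to 0 after the "downbeats" counter
 * has drained, i.e. after several consecutive quiet intervals.
 */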
void
setperf_auto(void *v)
{
	static uint64_t *idleticks, *totalticks;
	static int downbeats;
	int i, j = 0;
	int speedup = 0;
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	uint64_t idle, total, allidle = 0, alltotal = 0;

	if (perfpolicy != PERFPOL_AUTO)
		return;

	if (cpu_setperf == NULL)
		return;

	if (hw_power) {
		speedup = 1;
		goto faster;
	}

	if (!idleticks)
		if (!(idleticks = mallocarray(ncpusfound, sizeof(*idleticks),
		    M_DEVBUF, M_NOWAIT | M_ZERO)))
			return;
	if (!totalticks)
		if (!(totalticks = mallocarray(ncpusfound, sizeof(*totalticks),
		    M_DEVBUF, M_NOWAIT | M_ZERO))) {
			free(idleticks, M_DEVBUF,
			    sizeof(*idleticks) * ncpusfound);
			/* don't leave a stale pointer for the next run */
			idleticks = NULL;
			return;
		}
	CPU_INFO_FOREACH(cii, ci) {
		if (!cpu_is_online(ci))
			continue;
		total = 0;
		for (i = 0; i < CPUSTATES; i++) {
			total += ci->ci_schedstate.spc_cp_time[i];
		}
		total -= totalticks[j];
		idle = ci->ci_schedstate.spc_cp_time[CP_IDLE] - idleticks[j];
		if (idle < total / 3)
			speedup = 1;
		alltotal += total;
		allidle += idle;
		idleticks[j] += idle;
		totalticks[j] += total;
		j++;
	}
	if (allidle < alltotal / 2)
		speedup = 1;
	if (speedup && downbeats < 5)
		downbeats++;

	if (speedup && perflevel != 100) {
faster:
		perflevel = 100;
		cpu_setperf(perflevel);
	} else if (!speedup && perflevel != 0 && --downbeats <= 0) {
		perflevel = 0;
		cpu_setperf(perflevel);
	}

	timeout_add_msec(&setperf_to, 100);
}

int
sysctl_hwsetperf(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
{
	int err;

	if (!cpu_setperf)
		return EOPNOTSUPP;

	if (perfpolicy != PERFPOL_MANUAL)
		return sysctl_rdint(oldp, oldlenp, newp, perflevel);

	err = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
	    &perflevel, 0, 100);
	if (err)
		return err;

	if (newp != NULL)
		cpu_setperf(perflevel);

	return 0;
}

int
sysctl_hwperfpolicy(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
{
	char policy[32];
	int err;

	if (!cpu_setperf)
		return EOPNOTSUPP;

	switch (perfpolicy) {
	case PERFPOL_MANUAL:
		strlcpy(policy, "manual", sizeof(policy));
		break;
	case PERFPOL_AUTO:
		strlcpy(policy, "auto", sizeof(policy));
		break;
	case PERFPOL_HIGH:
		strlcpy(policy, "high", sizeof(policy));
		break;
	default:
		strlcpy(policy, "unknown", sizeof(policy));
		break;
	}

	if (newp == NULL)
		return sysctl_rdstring(oldp, oldlenp, newp, policy);

	err = sysctl_string(oldp, oldlenp, newp, newlen, policy, sizeof(policy));
	if (err)
		return err;
	if (strcmp(policy, "manual") == 0)
		perfpolicy = PERFPOL_MANUAL;
	else if (strcmp(policy, "auto") == 0)
		perfpolicy = PERFPOL_AUTO;
	else if (strcmp(policy, "high") == 0)
		perfpolicy = PERFPOL_HIGH;
	else
		return EINVAL;

	if (perfpolicy == PERFPOL_AUTO) {
		timeout_add_msec(&setperf_to, 200);
	} else if (perfpolicy == PERFPOL_HIGH) {
		perflevel = 100;
		cpu_setperf(perflevel);
	}
	return 0;
}
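
/*
 * Usage sketch (added comment, not in the original file; assumes these
 * handlers back the hw.setperf and hw.perfpolicy sysctl variables):
 *
 *	# sysctl hw.perfpolicy=manual
 *	# sysctl hw.setperf=50
 *
 * As the code above shows, hw.setperf is read-only unless the policy is
 * "manual"; switching the policy to "auto" re-arms the setperf_auto()
 * timeout, while "high" pins perflevel at 100.
 */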
#endif

void
scheduler_start(void)
{
	static struct timeout schedcpu_to;

	/*
	 * We avoid polluting the global namespace by keeping the scheduler
	 * timeouts static in this function.
	 * We set up the timeout here and kick schedcpu once to make it do
	 * its job.
	 */
	timeout_set(&schedcpu_to, schedcpu, &schedcpu_to);

	rrticks_init = hz / 10;
	schedcpu(&schedcpu_to);
	uvm_meter(NULL);

#ifndef SMALL_KERNEL
	if (perfpolicy == PERFPOL_AUTO)
		timeout_add_msec(&setperf_to, 200);
#endif
}
679