xref: /netbsd-src/sys/kern/sched_m2.c (revision b1c86f5f087524e68db12794ee9c3e3da1ab17a0)
1 /*	$NetBSD: sched_m2.c,v 1.29 2009/11/22 19:09:16 mbalmer Exp $	*/
2 
3 /*
4  * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 /*
30  * TODO:
31  *  - Implementation of fair share queue;
32  *  - Support for NUMA;
33  */
34 
35 #include <sys/cdefs.h>
36 __KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.29 2009/11/22 19:09:16 mbalmer Exp $");
37 
38 #include <sys/param.h>
39 
40 #include <sys/cpu.h>
41 #include <sys/callout.h>
42 #include <sys/errno.h>
43 #include <sys/kernel.h>
44 #include <sys/kmem.h>
45 #include <sys/lwp.h>
46 #include <sys/mutex.h>
47 #include <sys/pool.h>
48 #include <sys/proc.h>
49 #include <sys/pset.h>
50 #include <sys/resource.h>
51 #include <sys/resourcevar.h>
52 #include <sys/sched.h>
53 #include <sys/syscallargs.h>
54 #include <sys/sysctl.h>
55 #include <sys/types.h>
56 
57 /*
58  * Priority related definitions.
59  */
60 #define	PRI_TS_COUNT	(NPRI_USER)
61 #define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
62 #define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)
63 
64 #define	PRI_HIGHEST_TS	(MAXPRI_USER)
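/*
 * Illustrative note (not part of the original source): with the stock
 * NetBSD priority layout, where PRI_COUNT is 224 and NPRI_USER is 64,
 * the macros above evaluate to PRI_TS_COUNT = 64, PRI_RT_COUNT = 160,
 * PRI_HTS_RANGE = 6 and PRI_HIGHEST_TS = 63.  A build-time sketch of
 * those assumptions:
 */
#if 0	/* hypothetical sanity checks, assuming the values quoted above */
CTASSERT(PRI_TS_COUNT == 64);
CTASSERT(PRI_HTS_RANGE == 6);
CTASSERT(PRI_HIGHEST_TS == 63);
#endif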
65 
66 /*
67  * Time-slices and priorities.
68  */
69 static u_int	min_ts;			/* Minimal time-slice */
70 static u_int	max_ts;			/* Maximal time-slice */
71 static u_int	rt_ts;			/* Real-time time-slice */
72 static u_int	ts_map[PRI_COUNT];	/* Map of time-slices */
73 static pri_t	high_pri[PRI_COUNT];	/* Map for priority increase */
74 
75 static void	sched_precalcts(void);
76 
77 /*
78  * Initialization and setup.
79  */
80 
81 void
82 sched_rqinit(void)
83 {
84 	struct cpu_info *ci = curcpu();
85 
86 	if (hz < 100) {
87 		panic("sched_rqinit: value of HZ is too low\n");
88 	}
89 
90 	/* Default timing ranges */
91 	min_ts = mstohz(20);			/*  ~20 ms */
92 	max_ts = mstohz(150);			/* ~150 ms */
93 	rt_ts = mstohz(100);			/* ~100 ms */
94 	sched_precalcts();
95 
96 	/* Attach the primary CPU here */
97 	sched_cpuattach(ci);
98 
99 	sched_lwp_fork(NULL, &lwp0);
100 	sched_newts(&lwp0);
101 }
102 
103 /* Pre-calculate the time-slices for the priorities */
104 static void
105 sched_precalcts(void)
106 {
107 	pri_t p;
108 
109 	/* Time-sharing range */
110 	for (p = 0; p <= PRI_HIGHEST_TS; p++) {
111 		ts_map[p] = max_ts -
112 		    (p * 100 / (PRI_TS_COUNT - 1) * (max_ts - min_ts) / 100);
113 		high_pri[p] = (PRI_HIGHEST_TS - PRI_HTS_RANGE) +
114 		    ((p * PRI_HTS_RANGE) / (PRI_TS_COUNT - 1));
115 	}
116 
117 	/* Real-time range */
118 	for (p = (PRI_HIGHEST_TS + 1); p < PRI_COUNT; p++) {
119 		ts_map[p] = rt_ts;
120 		high_pri[p] = p;
121 	}
122 }
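/*
 * Worked example (illustrative, assuming hz = 100, so that the defaults in
 * sched_rqinit() give min_ts = 2, max_ts = 15 and rt_ts = 10 ticks, and
 * assuming PRI_TS_COUNT = 64):
 *
 *	p =  0 (lowest user priority):	ts_map[0]  = 15 ticks, high_pri[0]  = 57
 *	p = 31 (middle of the range):	ts_map[31] =  9 ticks, high_pri[31] = 59
 *	p = 63 (PRI_HIGHEST_TS):	ts_map[63] =  2 ticks, high_pri[63] = 63
 *
 * The time-slice shrinks linearly from max_ts down to min_ts as the
 * priority rises, and high_pri[] maps every time-sharing priority into the
 * top PRI_HTS_RANGE band used by sched_wakeup().  A debugging helper along
 * these lines (hypothetical, not in the original file) could dump the tables:
 */
#if 0
static void
sched_dumpts(void)
{
	pri_t p;

	for (p = 0; p < PRI_COUNT; p++)
		printf("pri %3d: slice %u ticks, wakeup boost -> %d\n",
		    (int)p, ts_map[p], (int)high_pri[p]);
}
#endif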
123 
124 /*
125  * Hooks.
126  */
127 
128 void
129 sched_proc_fork(struct proc *parent, struct proc *child)
130 {
131 	struct lwp *l;
132 
133 	LIST_FOREACH(l, &child->p_lwps, l_sibling) {
134 		lwp_lock(l);
135 		sched_newts(l);
136 		lwp_unlock(l);
137 	}
138 }
139 
140 void
141 sched_proc_exit(struct proc *child, struct proc *parent)
142 {
143 
144 }
145 
146 void
147 sched_lwp_fork(struct lwp *l1, struct lwp *l2)
148 {
149 
150 }
151 
152 void
153 sched_lwp_collect(struct lwp *l)
154 {
155 
156 }
157 
158 void
159 sched_setrunnable(struct lwp *l)
160 {
161 
162 }
163 
164 void
165 sched_schedclock(struct lwp *l)
166 {
167 
168 }
169 
170 /*
171  * Priorities and time-slice.
172  */
173 
174 void
175 sched_nice(struct proc *p, int prio)
176 {
177 	struct lwp *l;
178 	int n;
179 
180 	KASSERT(mutex_owned(p->p_lock));
181 
182 	p->p_nice = prio;
183 	n = (prio - NZERO) >> 2;
184 	if (n == 0)
185 		return;
186 
187 	LIST_FOREACH(l, &p->p_lwps, l_sibling) {
188 		lwp_lock(l);
189 		if (l->l_class == SCHED_OTHER) {
190 			pri_t pri = l->l_priority - n;
191 			pri = (n < 0) ? min(pri, PRI_HIGHEST_TS) : imax(pri, 0);
192 			lwp_changepri(l, pri);
193 		}
194 		lwp_unlock(l);
195 	}
196 }
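/*
 * Example (illustrative, assuming the usual NZERO of 20, so p_nice ranges
 * over 0..39 with 20 meaning "nice 0"): nice +8 gives p_nice = 28 and
 * n = (28 - 20) >> 2 = 2, lowering every SCHED_OTHER LWP of the process by
 * two priority levels (clamped at 0); nice -8 gives p_nice = 12 and
 * n = (12 - 20) >> 2 = -2, raising them by two levels, clamped at
 * PRI_HIGHEST_TS.
 */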
197 
198 /* Recalculate the time-slice */
199 void
200 sched_newts(struct lwp *l)
201 {
202 
203 	l->l_sched.timeslice = ts_map[lwp_eprio(l)];
204 }
205 
206 void
207 sched_slept(struct lwp *l)
208 {
209 
210 	/*
211 	 * If the thread is on the time-sharing queue and not marked as batch,
212 	 * raise its priority; it will then run with a shorter time-quantum.
213 	 */
214 	if (l->l_priority < PRI_HIGHEST_TS && (l->l_flag & LW_BATCH) == 0) {
215 		struct proc *p = l->l_proc;
216 
217 		KASSERT(l->l_class == SCHED_OTHER);
218 		if (__predict_false(p->p_nice < NZERO)) {
219 			const int n = max((NZERO - p->p_nice) >> 2, 1);
220 			l->l_priority = min(l->l_priority + n, PRI_HIGHEST_TS);
221 		} else {
222 			l->l_priority++;
223 		}
224 	}
225 }
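/*
 * Example (illustrative, assuming NZERO = 20): a thread of a process with
 * the default nice gains one priority level per sleep, while one re-niced
 * to -8 (p_nice = 12) gains max((20 - 12) >> 2, 1) = 2 levels per sleep,
 * so interactive threads of favoured processes climb towards
 * PRI_HIGHEST_TS faster.
 */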
226 
227 void
228 sched_wakeup(struct lwp *l)
229 {
230 
231 	/* If the thread slept for a second or more, set a high priority */
232 	if (l->l_slptime >= 1)
233 		l->l_priority = high_pri[l->l_priority];
234 }
235 
236 void
237 sched_pstats_hook(struct lwp *l, int batch)
238 {
239 	pri_t prio;
240 
241 	/*
242 	 * Adjust only threads on the time-sharing queue; for performance,
243 	 * threads already at the highest time-sharing priority are skipped.
244 	 */
245 	KASSERT(lwp_locked(l, NULL));
246 	if (l->l_priority >= PRI_HIGHEST_TS)
247 		return;
248 	KASSERT(l->l_class == SCHED_OTHER);
249 
250 	/* If it is CPU-bound, and not for the first time, decrease the priority */
251 	prio = l->l_priority;
252 	if (batch && prio != 0)
253 		prio--;
254 
255 	/* If the thread has not run for a second or more, set a high priority */
256 	if (l->l_stat == LSRUN) {
257 		if (l->l_rticks && (hardclock_ticks - l->l_rticks >= hz))
258 			prio = high_pri[prio];
259 		/* Re-enqueue the thread if priority has changed */
260 		if (prio != l->l_priority)
261 			lwp_changepri(l, prio);
262 	} else {
263 		/* In other states, change the priority directly */
264 		l->l_priority = prio;
265 	}
266 }
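/*
 * Example (illustrative): sched_pstats() invokes this hook roughly once a
 * second.  A CPU-bound (batch-flagged) thread at priority 40 thus drifts
 * down one level per call towards 0, while a runnable thread that has not
 * been given the CPU for a full second (hardclock_ticks - l_rticks >= hz)
 * is bounced up to high_pri[] (priority 57..63 under the defaults assumed
 * above), so starved threads get dispatched ahead of the CPU hogs.
 */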
267 
268 void
269 sched_oncpu(lwp_t *l)
270 {
271 	struct schedstate_percpu *spc = &l->l_cpu->ci_schedstate;
272 
273 	/* Update the counters */
274 	KASSERT(l->l_sched.timeslice >= min_ts);
275 	KASSERT(l->l_sched.timeslice <= max_ts);
276 	spc->spc_ticks = l->l_sched.timeslice;
277 }
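/*
 * Note (assumption about the surrounding kernel, not stated in this file):
 * spc_ticks loaded here is the per-CPU countdown that the clock interrupt
 * decrements; when it reaches zero, sched_tick() below is called to decide
 * whether the thread keeps the CPU or must yield.
 */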
278 
279 /*
280  * Time-driven events.
281  */
282 
283 /*
284  * Called once per time-quantum.  This routine is CPU-local and runs at
285  * IPL_SCHED, so no locking is needed.
286  */
287 void
288 sched_tick(struct cpu_info *ci)
289 {
290 	struct schedstate_percpu *spc = &ci->ci_schedstate;
291 	struct lwp *l = curlwp;
292 	struct proc *p;
293 
294 	if (__predict_false(CURCPU_IDLE_P()))
295 		return;
296 
297 	switch (l->l_class) {
298 	case SCHED_FIFO:
299 		/*
300 		 * If the thread runs under the FIFO real-time policy, reload
301 		 * its time-quantum and let it continue running.
302 		 */
303 		KASSERT(l->l_priority > PRI_HIGHEST_TS);
304 		spc->spc_ticks = l->l_sched.timeslice;
305 		return;
306 	case SCHED_OTHER:
307 		/*
308 		 * A thread on the time-sharing queue has its priority decreased;
309 		 * it will then run with a longer time-quantum.
310 		 */
311 		KASSERT(l->l_priority <= PRI_HIGHEST_TS);
312 		if (l->l_priority == 0)
313 			break;
314 
315 		p = l->l_proc;
316 		if (__predict_false(p->p_nice > NZERO)) {
317 			const int n = max((p->p_nice - NZERO) >> 2, 1);
318 			l->l_priority = imax(l->l_priority - n, 0);
319 		} else
320 			l->l_priority--;
321 		break;
322 	}
323 
324 	/*
325 	 * If there are runnable threads of higher or equal priority, mark
326 	 * the thread as due to yield; otherwise let it continue running.
327 	 */
328 	if (lwp_eprio(l) <= spc->spc_maxpriority || l->l_target_cpu) {
329 		spc->spc_flags |= SPCF_SHOULDYIELD;
330 		cpu_need_resched(ci, 0);
331 	} else
332 		spc->spc_ticks = l->l_sched.timeslice;
333 }
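/*
 * Example (illustrative, assuming NZERO = 20): a SCHED_OTHER thread with
 * the default nice that exhausts its quantum at priority 10 drops to
 * priority 9 and gets a slightly longer slice next time, while a thread of
 * a process re-niced to +8 (p_nice = 28) loses max((28 - 20) >> 2, 1) = 2
 * levels per expired quantum.  Together with sched_slept() this is the
 * interactivity heuristic: sleepers drift up, CPU hogs drift down.
 */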
334 
335 /*
336  * Sysctl nodes and initialization.
337  */
338 
339 static int
340 sysctl_sched_rtts(SYSCTLFN_ARGS)
341 {
342 	struct sysctlnode node;
343 	int rttsms = hztoms(rt_ts);
344 
345 	node = *rnode;
346 	node.sysctl_data = &rttsms;
347 	return sysctl_lookup(SYSCTLFN_CALL(&node));
348 }
349 
350 static int
351 sysctl_sched_mints(SYSCTLFN_ARGS)
352 {
353 	struct sysctlnode node;
354 	struct cpu_info *ci;
355 	int error, newsize;
356 	CPU_INFO_ITERATOR cii;
357 
358 	node = *rnode;
359 	node.sysctl_data = &newsize;
360 
361 	newsize = hztoms(min_ts);
362 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
363 	if (error || newp == NULL)
364 		return error;
365 
366 	newsize = mstohz(newsize);
367 	if (newsize < 1 || newsize > hz || newsize >= max_ts)
368 		return EINVAL;
369 
370 	/* It is safe to take the locks in this order */
371 	for (CPU_INFO_FOREACH(cii, ci))
372 		spc_lock(ci);
373 
374 	min_ts = newsize;
375 	sched_precalcts();
376 
377 	for (CPU_INFO_FOREACH(cii, ci))
378 		spc_unlock(ci);
379 
380 	return 0;
381 }
382 
383 static int
384 sysctl_sched_maxts(SYSCTLFN_ARGS)
385 {
386 	struct sysctlnode node;
387 	struct cpu_info *ci;
388 	int error, newsize;
389 	CPU_INFO_ITERATOR cii;
390 
391 	node = *rnode;
392 	node.sysctl_data = &newsize;
393 
394 	newsize = hztoms(max_ts);
395 	error = sysctl_lookup(SYSCTLFN_CALL(&node));
396 	if (error || newp == NULL)
397 		return error;
398 
399 	newsize = mstohz(newsize);
400 	if (newsize < 10 || newsize > hz || newsize <= min_ts)
401 		return EINVAL;
402 
403 	/* It is safe to take the locks in this order */
404 	for (CPU_INFO_FOREACH(cii, ci))
405 		spc_lock(ci);
406 
407 	max_ts = newsize;
408 	sched_precalcts();
409 
410 	for (CPU_INFO_FOREACH(cii, ci))
411 		spc_unlock(ci);
412 
413 	return 0;
414 }
415 
416 SYSCTL_SETUP(sysctl_sched_m2_setup, "sysctl sched setup")
417 {
418 	const struct sysctlnode *node = NULL;
419 
420 	sysctl_createv(clog, 0, NULL, NULL,
421 		CTLFLAG_PERMANENT,
422 		CTLTYPE_NODE, "kern", NULL,
423 		NULL, 0, NULL, 0,
424 		CTL_KERN, CTL_EOL);
425 	sysctl_createv(clog, 0, NULL, &node,
426 		CTLFLAG_PERMANENT,
427 		CTLTYPE_NODE, "sched",
428 		SYSCTL_DESCR("Scheduler options"),
429 		NULL, 0, NULL, 0,
430 		CTL_KERN, CTL_CREATE, CTL_EOL);
431 
432 	if (node == NULL)
433 		return;
434 
435 	sysctl_createv(NULL, 0, &node, NULL,
436 		CTLFLAG_PERMANENT,
437 		CTLTYPE_STRING, "name", NULL,
438 		NULL, 0, __UNCONST("M2"), 0,
439 		CTL_CREATE, CTL_EOL);
440 	sysctl_createv(NULL, 0, &node, NULL,
441 		CTLFLAG_PERMANENT,
442 		CTLTYPE_INT, "rtts",
443 		SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"),
444 		sysctl_sched_rtts, 0, NULL, 0,
445 		CTL_CREATE, CTL_EOL);
446 	sysctl_createv(NULL, 0, &node, NULL,
447 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
448 		CTLTYPE_INT, "maxts",
449 		SYSCTL_DESCR("Maximal time quantum (in milliseconds)"),
450 		sysctl_sched_maxts, 0, &max_ts, 0,
451 		CTL_CREATE, CTL_EOL);
452 	sysctl_createv(NULL, 0, &node, NULL,
453 		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
454 		CTLTYPE_INT, "mints",
455 		SYSCTL_DESCR("Minimal time quantum (in milliseconds)"),
456 		sysctl_sched_mints, 0, &min_ts, 0,
457 		CTL_CREATE, CTL_EOL);
458 }
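/*
 * Usage sketch (illustrative): with this scheduler active the knobs above
 * appear under kern.sched, e.g.
 *
 *	# sysctl kern.sched.name kern.sched.rtts
 *	# sysctl -w kern.sched.maxts=200
 *	# sysctl -w kern.sched.mints=10
 *
 * Values are in milliseconds.  Writes are validated by sysctl_sched_maxts()
 * and sysctl_sched_mints(): the new value may not exceed one second, maxts
 * may not drop below 10 ticks, and mints must stay strictly below maxts,
 * otherwise EINVAL is returned.
 */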
459