/*	$NetBSD: sched_m2.c,v 1.25 2008/05/19 12:48:54 rmind Exp $	*/

/*
 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * TODO:
 *  - Implementation of fair share queue;
 *  - Support for NUMA;
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.25 2008/05/19 12:48:54 rmind Exp $");

#include <sys/param.h>

#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/callout.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/types.h>

/*
 * Priority related definitions.
 */
#define	PRI_TS_COUNT	(NPRI_USER)
#define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
#define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)

#define	PRI_HIGHEST_TS	(MAXPRI_USER)

/*
 * Time-slices and priorities.
 */
static u_int	min_ts;			/* Minimal time-slice */
static u_int	max_ts;			/* Maximal time-slice */
static u_int	rt_ts;			/* Real-time time-slice */
static u_int	ts_map[PRI_COUNT];	/* Map of time-slices */
static pri_t	high_pri[PRI_COUNT];	/* Map for priority increase */

static void	sched_precalcts(void);

typedef struct {
	u_int		sl_timeslice;	/* Time-slice of thread */
} sched_info_lwp_t;

static pool_cache_t	sil_pool;

/*
 * Initialization and setup.
 */

void
sched_rqinit(void)
{
	struct cpu_info *ci = curcpu();

	if (hz < 100) {
		panic("sched_rqinit: value of HZ is too low\n");
	}

	/* Default timing ranges */
	min_ts = mstohz(50);			/* ~50ms  */
	max_ts = mstohz(150);			/* ~150ms */
	rt_ts = mstohz(100);			/* ~100ms */
	sched_precalcts();

	/* Pool of the scheduler-specific structures */
	sil_pool = pool_cache_init(sizeof(sched_info_lwp_t), coherency_unit,
	    0, 0, "lwpsd", NULL, IPL_NONE, NULL, NULL, NULL);

	/* Attach the primary CPU here */
	sched_cpuattach(ci);

	sched_lwp_fork(NULL, &lwp0);
	sched_newts(&lwp0);
}

/* Pre-calculate the time-slices for the priorities */
static void
sched_precalcts(void)
{
	pri_t p;

	/* Time-sharing range */
	for (p = 0; p <= PRI_HIGHEST_TS; p++) {
		ts_map[p] = max_ts -
		    (p * 100 / (PRI_TS_COUNT - 1) * (max_ts - min_ts) / 100);
		high_pri[p] = (PRI_HIGHEST_TS - PRI_HTS_RANGE) +
		    ((p * PRI_HTS_RANGE) / (PRI_TS_COUNT - 1));
	}

	/* Real-time range */
	for (p = (PRI_HIGHEST_TS + 1); p < PRI_COUNT; p++) {
		ts_map[p] = rt_ts;
		high_pri[p] = p;
	}
}
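
/*
 * Worked example of the maps above (an illustrative sketch, assuming
 * hz = 100 and NPRI_USER = 64): the defaults give min_ts = mstohz(50)
 * = 5 ticks and max_ts = mstohz(150) = 15 ticks, and ts_map[]
 * interpolates linearly from ts_map[0] = 15 ticks (~150ms) down to
 * ts_map[63] = 5 ticks (~50ms), so lower-priority time-sharing
 * threads get longer slices.  high_pri[] compresses the whole user
 * range into its top tenth: high_pri[0] = 57 and high_pri[63] = 63,
 * which is the boost target used by sched_wakeup() and
 * sched_pstats_hook() below.
 */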

/*
 * Hooks.
 */

void
sched_proc_fork(struct proc *parent, struct proc *child)
{
	struct lwp *l;

	LIST_FOREACH(l, &child->p_lwps, l_sibling) {
		lwp_lock(l);
		sched_newts(l);
		lwp_unlock(l);
	}
}

void
sched_proc_exit(struct proc *child, struct proc *parent)
{

	/* Dummy */
}

void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{

	KASSERT(l2->l_sched_info == NULL);
	l2->l_sched_info = pool_cache_get(sil_pool, PR_WAITOK);
	memset(l2->l_sched_info, 0, sizeof(sched_info_lwp_t));
}

void
sched_lwp_exit(struct lwp *l)
{

	KASSERT(l->l_sched_info != NULL);
	pool_cache_put(sil_pool, l->l_sched_info);
	l->l_sched_info = NULL;
}

void
sched_lwp_collect(struct lwp *l)
{

}

void
sched_setrunnable(struct lwp *l)
{

	/* Dummy */
}

void
sched_schedclock(struct lwp *l)
{

	/* Dummy */
}

/*
 * Priorities and time-slice.
 */

void
sched_nice(struct proc *p, int prio)
{

	/* TODO: implement as SCHED_IA */
}

/* Recalculate the time-slice */
void
sched_newts(struct lwp *l)
{
	sched_info_lwp_t *sil = l->l_sched_info;

	sil->sl_timeslice = ts_map[lwp_eprio(l)];
}

void
sched_slept(struct lwp *l)
{

	/*
	 * If the thread is in the time-sharing queue and the batch flag
	 * is not set, increase its priority, so that it will run with a
	 * lower (shorter) time-quantum.
	 */
	if (l->l_priority < PRI_HIGHEST_TS && (l->l_flag & LW_BATCH) == 0) {
		KASSERT(l->l_class == SCHED_OTHER);
		l->l_priority++;
	}
}

void
sched_wakeup(struct lwp *l)
{

	/* If the thread slept for a second or more, give it a high priority */
	if (l->l_slptime >= 1)
		l->l_priority = high_pri[l->l_priority];
}
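
/*
 * Illustrative note on the two routines above (a sketch, using the
 * example values from the sched_precalcts() comment): an interactive
 * SCHED_OTHER thread climbs one priority step per sleep via
 * sched_slept(), while a long sleep (one second or more) jumps it
 * straight into the high_pri[] band, e.g. from priority 10 to
 * high_pri[10] = 57.  sched_tick() below applies the opposite
 * pressure to CPU-bound threads.
 */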

void
sched_pstats_hook(struct lwp *l, int batch)
{
	pri_t prio;

	/*
	 * Estimate threads on the time-sharing queue only; however,
	 * exclude the highest priority for performance purposes.
	 */
	if (l->l_priority >= PRI_HIGHEST_TS)
		return;
	KASSERT(l->l_class == SCHED_OTHER);

	/* If the thread is CPU-bound, and not for the first time, decrease its priority */
	prio = l->l_priority;
	if (batch && prio != 0)
		prio--;

	/* If the thread has not run for a second or more, give it a high priority */
	if (l->l_stat == LSRUN) {
		if (l->l_rticks && (hardclock_ticks - l->l_rticks >= hz))
			prio = high_pri[prio];
		/* Re-enqueue the thread if the priority has changed */
		if (prio != l->l_priority)
			lwp_changepri(l, prio);
	} else {
		/* In other states, change the priority directly */
		l->l_priority = prio;
	}
}
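
/*
 * For example (a sketch under the hz = 100 assumption): a runnable
 * thread whose l_rticks shows it last ran 100 or more ticks ago is
 * considered starved and is boosted via high_pri[], while a thread
 * flagged as batch decays one priority level per pstats pass.
 */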

void
sched_oncpu(lwp_t *l)
{
	sched_info_lwp_t *sil = l->l_sched_info;

	/* Arm the time-slice counter of this CPU for the thread */
	KASSERT(sil->sl_timeslice >= min_ts);
	KASSERT(sil->sl_timeslice <= max_ts);
	l->l_cpu->ci_schedstate.spc_ticks = sil->sl_timeslice;
}

/*
 * Time-driven events.
 */

/*
 * Called once per time-quantum.  This routine is CPU-local and runs at
 * IPL_SCHED, so no locking is needed.
 */
void
sched_tick(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	struct lwp *l = curlwp;
	const sched_info_lwp_t *sil = l->l_sched_info;

	if (CURCPU_IDLE_P())
		return;

	switch (l->l_class) {
	case SCHED_FIFO:
		/*
		 * If the thread runs under the FIFO real-time policy,
		 * refill its time-quantum and let it continue running.
		 */
		KASSERT(l->l_priority > PRI_HIGHEST_TS);
		spc->spc_ticks = sil->sl_timeslice;
		return;
	case SCHED_OTHER:
		/*
		 * If the thread is in the time-sharing queue, decrease its
		 * priority, so that it will run with a higher (longer)
		 * time-quantum.
		 */
		KASSERT(l->l_priority <= PRI_HIGHEST_TS);
		if (l->l_priority != 0)
			l->l_priority--;
		break;
	}

	/*
	 * If there are threads of higher or equal priority on the run
	 * queue, mark that this thread should yield; otherwise, let it
	 * continue running with a fresh time-quantum.
	 */
	if (lwp_eprio(l) <= spc->spc_maxpriority || l->l_target_cpu) {
		spc->spc_flags |= SPCF_SHOULDYIELD;
		cpu_need_resched(ci, 0);
	} else
		spc->spc_ticks = sil->sl_timeslice;
}
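
/*
 * Putting the pieces together (an illustrative sketch, assuming the
 * hz = 100 defaults): a CPU-bound SCHED_OTHER thread loses one
 * priority level each time it burns through its slice, and each lower
 * level grants a slightly longer slice from ts_map[], so it trades
 * latency for throughput; sched_slept()/sched_wakeup() push
 * interactive threads the other way.  SCHED_FIFO threads are never
 * demoted here and only yield to higher-priority activity.
 */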

/*
 * Sysctl nodes and initialization.
 */

static int
sysctl_sched_rtts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int rttsms = hztoms(rt_ts);

	node = *rnode;
	node.sysctl_data = &rttsms;
	return sysctl_lookup(SYSCTLFN_CALL(&node));
}

static int
sysctl_sched_mints(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(min_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 1 || newsize > hz || newsize >= max_ts)
		return EINVAL;

	/* Holding all of the per-CPU run-queue locks makes this update safe */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	min_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

static int
sysctl_sched_maxts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(max_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 10 || newsize > hz || newsize <= min_ts)
		return EINVAL;

	/* Holding all of the per-CPU run-queue locks makes this update safe */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	max_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

SYSCTL_SETUP(sysctl_sched_m2_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "kern", NULL,
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_EOL);
	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_STRING, "name", NULL,
		NULL, 0, __UNCONST("M2"), 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_INT, "rtts",
		SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"),
		sysctl_sched_rtts, 0, NULL, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "maxts",
		SYSCTL_DESCR("Maximal time quantum (in milliseconds)"),
		sysctl_sched_maxts, 0, &max_ts, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "mints",
		SYSCTL_DESCR("Minimal time quantum (in milliseconds)"),
		sysctl_sched_mints, 0, &min_ts, 0,
		CTL_CREATE, CTL_EOL);
}
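
/*
 * Example usage from userland (a sketch; the node names are the ones
 * created above, the values shown are hypothetical):
 *
 *	$ sysctl kern.sched.name
 *	kern.sched.name = M2
 *	$ sysctl kern.sched.rtts
 *	kern.sched.rtts = 100
 *	# sysctl -w kern.sched.maxts=200
 *	# sysctl -w kern.sched.mints=20
 *
 * Writes to maxts/mints go through sysctl_sched_maxts() and
 * sysctl_sched_mints(), which validate the new value and re-run
 * sched_precalcts() with all per-CPU run-queue locks held.
 */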
455