/*	$NetBSD: sched_m2.c,v 1.24 2008/04/12 17:02:08 ad Exp $	*/

/*
 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * TODO:
 *  - Implementation of fair share queue;
 *  - Support for NUMA;
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.24 2008/04/12 17:02:08 ad Exp $");

#include <sys/param.h>

#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/callout.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/types.h>

/*
 * Priority-related definitions.
 */
#define	PRI_TS_COUNT	(NPRI_USER)
#define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
#define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)

#define	PRI_HIGHEST_TS	(MAXPRI_USER)
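
/*
 * Illustrative note (assuming the usual NetBSD priority layout, where
 * NPRI_USER is 64 and MAXPRI_USER is 63): PRI_TS_COUNT covers the 64
 * time-sharing levels 0..63, PRI_HTS_RANGE is 6, and PRI_HIGHEST_TS is
 * 63, so the "high" time-sharing band used below is roughly 57..63.
 */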

/*
 * Time-slices and priorities.
 */
static u_int	min_ts;			/* Minimal time-slice */
static u_int	max_ts;			/* Maximal time-slice */
static u_int	rt_ts;			/* Real-time time-slice */
static u_int	ts_map[PRI_COUNT];	/* Map of time-slices */
static pri_t	high_pri[PRI_COUNT];	/* Map for priority increase */

typedef struct {
	u_int		sl_flags;
	u_int		sl_timeslice;	/* Time-slice of thread */
	u_int		sl_slept;	/* Saved sleep time for sleep sum */
	u_int		sl_slpsum;	/* Sum of sleep time */
	u_int		sl_lrtime;	/* Last run time */
} sched_info_lwp_t;

/* Flags */
#define	SL_BATCH	0x01

/* Pool of the scheduler-specific structures for threads */
static pool_cache_t	sil_pool;

/*
 * Prototypes.
 */

static void		sched_precalcts(void);

/*
 * Initialization and setup.
 */

void
sched_rqinit(void)
{
	struct cpu_info *ci = curcpu();

	if (hz < 100) {
		panic("sched_rqinit: value of HZ is too low\n");
	}

	/* Default timing ranges */
	min_ts = mstohz(50);			/* ~50ms  */
	max_ts = mstohz(150);			/* ~150ms */
	rt_ts = mstohz(100);			/* ~100ms */
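	/* E.g. with hz = 100 these defaults are 5, 15 and 10 ticks, respectively. */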
	sched_precalcts();

	/* Pool of the scheduler-specific structures */
	sil_pool = pool_cache_init(sizeof(sched_info_lwp_t), coherency_unit,
	    0, 0, "lwpsd", NULL, IPL_NONE, NULL, NULL, NULL);

	/* Attach the primary CPU here */
	sched_cpuattach(ci);

	sched_lwp_fork(NULL, &lwp0);
	sched_newts(&lwp0);
}

/* Pre-calculate the time-slices for the priorities */
static void
sched_precalcts(void)
{
	pri_t p;

	/* Time-sharing range */
	for (p = 0; p <= PRI_HIGHEST_TS; p++) {
		ts_map[p] = max_ts -
		    (p * 100 / (PRI_TS_COUNT - 1) * (max_ts - min_ts) / 100);
		high_pri[p] = (PRI_HIGHEST_TS - PRI_HTS_RANGE) +
		    ((p * PRI_HTS_RANGE) / (PRI_TS_COUNT - 1));
	}

	/* Real-time range */
	for (p = (PRI_HIGHEST_TS + 1); p < PRI_COUNT; p++) {
		ts_map[p] = rt_ts;
		high_pri[p] = p;
	}
}
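
/*
 * A rough worked example of the tables above (illustrative only, using
 * the hz = 100 defaults of min_ts = 5 and max_ts = 15 ticks, and a
 * time-sharing range of 64 priorities, 0..63):
 *
 *	p = 0 (lowest):		ts_map[0] = max_ts = 15 ticks,
 *				high_pri[0] = PRI_HIGHEST_TS - PRI_HTS_RANGE
 *	p = PRI_HIGHEST_TS:	ts_map[p] = min_ts = 5 ticks,
 *				high_pri[p] = PRI_HIGHEST_TS
 *
 * That is, higher time-sharing priorities get shorter time-slices, and
 * high_pri[] maps a priority into the top PRI_HTS_RANGE band, which is
 * what sched_wakeup() uses to boost threads that slept for a while.
 */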

/*
 * Hooks.
 */

void
sched_proc_fork(struct proc *parent, struct proc *child)
{
	struct lwp *l;

	LIST_FOREACH(l, &child->p_lwps, l_sibling) {
		lwp_lock(l);
		sched_newts(l);
		lwp_unlock(l);
	}
}

void
sched_proc_exit(struct proc *child, struct proc *parent)
{

	/* Dummy */
}

void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{

	KASSERT(l2->l_sched_info == NULL);
	l2->l_sched_info = pool_cache_get(sil_pool, PR_WAITOK);
	memset(l2->l_sched_info, 0, sizeof(sched_info_lwp_t));
}

void
sched_lwp_exit(struct lwp *l)
{

	KASSERT(l->l_sched_info != NULL);
	pool_cache_put(sil_pool, l->l_sched_info);
	l->l_sched_info = NULL;
}

void
sched_lwp_collect(struct lwp *l)
{

}

void
sched_setrunnable(struct lwp *l)
{

	/* Dummy */
}

void
sched_schedclock(struct lwp *l)
{

	/* Dummy */
}

/*
 * Priorities and time-slice.
 */

void
sched_nice(struct proc *p, int prio)
{

	/* TODO: implement as SCHED_IA */
}

/* Recalculate the time-slice */
void
sched_newts(struct lwp *l)
{
	sched_info_lwp_t *sil = l->l_sched_info;

	sil->sl_timeslice = ts_map[lwp_eprio(l)];
}

void
sched_slept(struct lwp *l)
{
	sched_info_lwp_t *sil = l->l_sched_info;

	/* Record the time when the thread went to sleep */
	sil->sl_slept = hardclock_ticks;

	/*
	 * If the thread is in the time-sharing queue and the batch flag
	 * is not set, increase its priority, so that it will run with a
	 * lower time-quantum.
	 */
	if (l->l_priority < PRI_HIGHEST_TS &&
	    (sil->sl_flags & SL_BATCH) == 0) {
		KASSERT(l->l_class == SCHED_OTHER);
		l->l_priority++;
	}
}

void
sched_wakeup(struct lwp *l)
{
	sched_info_lwp_t *sil = l->l_sched_info;
	const u_int slptime = hardclock_ticks - sil->sl_slept;

	/* Update sleep time delta */
	sil->sl_slpsum += (l->l_slptime == 0) ? slptime : hz;

	/* If the thread slept for a second or more - set a high priority */
	if (l->l_slptime > 1 || slptime >= hz)
		l->l_priority = high_pri[l->l_priority];

	/* Also, consider looking for a better CPU to wake up */
	l->l_cpu = sched_takecpu(l);
}

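/*
 * Periodic hook, invoked from sched_pstats() (roughly once per second).
 * In rough terms: a thread which ran for more ticks than it slept during
 * the last interval is treated as CPU-bound (batch); once it has been
 * batch for two consecutive passes, its priority decays by one step per
 * pass, and a batch thread which never sleeps may be marked for
 * migration to another CPU.
 */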
void
sched_pstats_hook(struct lwp *l)
{
	sched_info_lwp_t *sil = l->l_sched_info;
	pri_t prio;
	bool batch;

	if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
	    l->l_stat == LSSUSPENDED)
		l->l_slptime++;

	/*
	 * Mark the thread as more CPU-bound (batch) if its run time exceeds
	 * its sleep time, and note whether this is the first interval in
	 * which it has been seen as CPU-bound.
	 */
	batch = (l->l_rticksum > sil->sl_slpsum);
	if (batch) {
		if ((sil->sl_flags & SL_BATCH) == 0)
			batch = false;
		sil->sl_flags |= SL_BATCH;
	} else
		sil->sl_flags &= ~SL_BATCH;

	/*
	 * A CPU-bound thread which never sleeps would monopolise the CPU.
	 * In that case reset the last-sleep mark and check it on the next
	 * pass: if it is still zero, consider migrating the thread to
	 * another CPU and clear the batch flag.
	 */
	if (batch && (l->l_slptime + sil->sl_slpsum) == 0) {
		if (l->l_stat != LSONPROC && sil->sl_slept == 0) {
			struct cpu_info *ci = sched_takecpu(l);

			if (l->l_cpu != ci)
				l->l_target_cpu = ci;
			sil->sl_flags &= ~SL_BATCH;
		} else {
			sil->sl_slept = 0;
		}
	}

	/* Reset the time sums */
	sil->sl_slpsum = 0;
	l->l_rticksum = 0;

	/*
	 * Only time-sharing threads are estimated here; the highest
	 * time-sharing priority is excluded for performance reasons.
	 */
	if (l->l_priority >= PRI_HIGHEST_TS)
		return;
	KASSERT(l->l_class == SCHED_OTHER);

	/* If it stays CPU-bound for a second pass, decrease the priority */
	prio = l->l_priority;
	if (batch && prio != 0)
		prio--;

	/* If the thread has not run for a second or more - set a high priority */
	if (l->l_stat == LSRUN) {
		if (sil->sl_lrtime && (hardclock_ticks - sil->sl_lrtime >= hz))
			prio = high_pri[prio];
		/* Re-enqueue the thread if priority has changed */
		if (prio != l->l_priority)
			lwp_changepri(l, prio);
	} else {
		/* In other states, change the priority directly */
		l->l_priority = prio;
	}
}

void
sched_oncpu(lwp_t *l)
{
	sched_info_lwp_t *sil = l->l_sched_info;

	/* Update the counters */
	KASSERT(sil->sl_timeslice >= min_ts);
	KASSERT(sil->sl_timeslice <= max_ts);
	l->l_cpu->ci_schedstate.spc_ticks = sil->sl_timeslice;
}

/*
 * Time-driven events.
 */

/*
 * Called once per time-quantum.  This routine is CPU-local and runs at
 * IPL_SCHED, thus the locking is not needed.
 */
void
sched_tick(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	struct lwp *l = curlwp;
	const sched_info_lwp_t *sil = l->l_sched_info;

	if (CURCPU_IDLE_P())
		return;

	switch (l->l_class) {
	case SCHED_FIFO:
		/*
		 * A thread running with the FIFO real-time policy is not
		 * preempted here: just refresh its time-quantum and let it
		 * continue running.
		 */
		KASSERT(l->l_priority > PRI_HIGHEST_TS);
		spc->spc_ticks = sil->sl_timeslice;
		return;
	case SCHED_OTHER:
		/*
		 * A time-sharing thread has its priority decreased, so that
		 * it subsequently runs with a larger time-quantum.
		 */
		KASSERT(l->l_priority <= PRI_HIGHEST_TS);
		if (l->l_priority != 0)
			l->l_priority--;
		break;
	}

	/*
	 * If there are runnable threads of higher or equal priority, mark
	 * that this thread should yield; otherwise let it continue running.
	 */
	if (lwp_eprio(l) <= spc->spc_maxpriority || l->l_target_cpu) {
		spc->spc_flags |= SPCF_SHOULDYIELD;
		cpu_need_resched(ci, 0);
	} else
		spc->spc_ticks = sil->sl_timeslice;
}

/*
 * Sysctl nodes and initialization.
 */

static int
sysctl_sched_rtts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int rttsms = hztoms(rt_ts);

	node = *rnode;
	node.sysctl_data = &rttsms;
	return sysctl_lookup(SYSCTLFN_CALL(&node));
}

static int
sysctl_sched_mints(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(min_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 1 || newsize > hz || newsize >= max_ts)
		return EINVAL;

	/* It is safe to do this in this order */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	min_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

static int
sysctl_sched_maxts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(max_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 10 || newsize > hz || newsize <= min_ts)
		return EINVAL;

	/* It is safe to do this in this order */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	max_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

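/*
 * The nodes below appear under kern.sched; for example (illustrative),
 * the maximal time-quantum could be changed at run-time with:
 *
 *	sysctl -w kern.sched.maxts=200
 *
 * sysctl_sched_maxts() then converts the value to ticks, validates it
 * against min_ts and hz, and re-computes the time-slice tables.
 */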
SYSCTL_SETUP(sysctl_sched_m2_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "kern", NULL,
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_EOL);
	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_STRING, "name", NULL,
		NULL, 0, __UNCONST("M2"), 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_INT, "rtts",
		SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"),
		sysctl_sched_rtts, 0, NULL, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "maxts",
		SYSCTL_DESCR("Maximal time quantum (in milliseconds)"),
		sysctl_sched_maxts, 0, &max_ts, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "mints",
		SYSCTL_DESCR("Minimal time quantum (in milliseconds)"),
		sysctl_sched_mints, 0, &min_ts, 0,
		CTL_CREATE, CTL_EOL);
}
519