xref: /netbsd-src/sys/kern/sched_m2.c (revision 946379e7b37692fc43f68eb0d1c10daa0a7f3b6c)
/*	$NetBSD: sched_m2.c,v 1.32 2014/06/24 10:08:45 maxv Exp $	*/

/*
 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * TODO:
 *  - Implementation of fair share queue;
 *  - Support for NUMA;
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.32 2014/06/24 10:08:45 maxv Exp $");

#include <sys/param.h>

#include <sys/cpu.h>
#include <sys/callout.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/types.h>

/*
 * Priority related definitions.
 */
#define	PRI_TS_COUNT	(NPRI_USER)
#define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
#define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)

#define	PRI_HIGHEST_TS	(MAXPRI_USER)

/*
 * Time-slices and priorities.
 */
static u_int	min_ts;			/* Minimal time-slice */
static u_int	max_ts;			/* Maximal time-slice */
static u_int	rt_ts;			/* Real-time time-slice */
static u_int	ts_map[PRI_COUNT];	/* Map of time-slices */
static pri_t	high_pri[PRI_COUNT];	/* Map for priority increase */

static void	sched_precalcts(void);

/*
 * Initialization and setup.
 */

void
sched_rqinit(void)
{
	if (hz < 100) {
		panic("sched_rqinit: value of HZ is too low");
	}

	/* Default timing ranges */
	min_ts = mstohz(20);			/*  ~20 ms */
	max_ts = mstohz(150);			/* ~150 ms */
	rt_ts = mstohz(100);			/* ~100 ms */
	sched_precalcts();

#ifdef notdef
	/* Need to set the name etc. This does not belong here */
	/* Attach the primary CPU here */
	sched_cpuattach(curcpu());
#endif

	sched_lwp_fork(NULL, &lwp0);
	sched_newts(&lwp0);
}

/* Pre-calculate the time-slices for the priorities */
static void
sched_precalcts(void)
{
	pri_t p;

	/* Time-sharing range */
	for (p = 0; p <= PRI_HIGHEST_TS; p++) {
		ts_map[p] = max_ts -
		    (p * 100 / (PRI_TS_COUNT - 1) * (max_ts - min_ts) / 100);
		high_pri[p] = (PRI_HIGHEST_TS - PRI_HTS_RANGE) +
		    ((p * PRI_HTS_RANGE) / (PRI_TS_COUNT - 1));
	}

	/* Real-time range */
	for (p = (PRI_HIGHEST_TS + 1); p < PRI_COUNT; p++) {
		ts_map[p] = rt_ts;
		high_pri[p] = p;
	}
}
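
/*
 * A worked example of the pre-calculation above (a sketch; the
 * concrete numbers assume hz = 100, so min_ts = 2 ticks (~20 ms) and
 * max_ts = 15 ticks (~150 ms), and NPRI_USER = 64, i.e. time-sharing
 * priorities 0 .. 63):
 *
 *	ts_map[0]  = 15 - ( 0 * 100 / 63 * 13 / 100) = 15 ticks
 *	ts_map[63] = 15 - (63 * 100 / 63 * 13 / 100) =  2 ticks
 *
 * The lowest priority gets the longest slice (max_ts), the highest
 * time-sharing priority the shortest (min_ts).  With PRI_HTS_RANGE =
 * 64 / 10 = 6, high_pri[] maps every time-sharing priority into the
 * top band 57 .. 63, which is where sched_wakeup() puts threads that
 * slept for a second or more.
 */
#ifdef notdef
/*
 * Optional sanity check of the maps (an illustrative sketch, not part
 * of the original code): slices must not grow and boost targets must
 * not shrink as the priority rises.
 */
static void
sched_checkts(void)
{
	pri_t p;

	for (p = 1; p <= PRI_HIGHEST_TS; p++) {
		KASSERT(ts_map[p] <= ts_map[p - 1]);
		KASSERT(high_pri[p] >= high_pri[p - 1]);
	}
}
#endif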

/*
 * Hooks.
 */

void
sched_proc_fork(struct proc *parent, struct proc *child)
{
	struct lwp *l;

	LIST_FOREACH(l, &child->p_lwps, l_sibling) {
		lwp_lock(l);
		sched_newts(l);
		lwp_unlock(l);
	}
}

void
sched_proc_exit(struct proc *child, struct proc *parent)
{

}

void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{

}

void
sched_lwp_collect(struct lwp *l)
{

}

void
sched_setrunnable(struct lwp *l)
{

}

void
sched_schedclock(struct lwp *l)
{

}

/*
 * Priorities and time-slice.
 */

void
sched_nice(struct proc *p, int prio)
{
	struct lwp *l;
	int n;

	KASSERT(mutex_owned(p->p_lock));

	p->p_nice = prio;
	n = (prio - NZERO) >> 2;
	if (n == 0)
		return;

	LIST_FOREACH(l, &p->p_lwps, l_sibling) {
		lwp_lock(l);
		if (l->l_class == SCHED_OTHER) {
			pri_t pri = l->l_priority - n;
			pri = (n < 0) ? min(pri, PRI_HIGHEST_TS) : imax(pri, 0);
			lwp_changepri(l, pri);
		}
		lwp_unlock(l);
	}
}
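
/*
 * Example of the nice-to-priority mapping above (a sketch; the
 * numbers assume NZERO = 20, so p_nice ranges over 0 .. 39):
 *
 *	nice +8 => prio = 28, n = (28 - 20) >> 2 =  2: every
 *	           SCHED_OTHER LWP drops two levels, clamped at 0;
 *	nice -8 => prio = 12, n = (12 - 20) >> 2 = -2: every
 *	           SCHED_OTHER LWP rises two levels, clamped at
 *	           PRI_HIGHEST_TS;
 *	nice +3 => prio = 23, n = 3 >> 2 = 0: no immediate change.
 *
 * The right shift of a negative value assumes an arithmetic shift,
 * as kernel code generally does.
 */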

/* Recalculate the time-slice */
void
sched_newts(struct lwp *l)
{

	l->l_sched.timeslice = ts_map[lwp_eprio(l)];
}

void
sched_slept(struct lwp *l)
{

	/*
	 * If the thread is in the time-sharing queue and the batch flag
	 * is not set, increase its priority and run it with a shorter
	 * time-quantum.
	 */
	if (l->l_priority < PRI_HIGHEST_TS && (l->l_flag & LW_BATCH) == 0) {
		struct proc *p = l->l_proc;

		KASSERT(l->l_class == SCHED_OTHER);
		if (__predict_false(p->p_nice < NZERO)) {
			const int n = max((NZERO - p->p_nice) >> 2, 1);
			l->l_priority = min(l->l_priority + n, PRI_HIGHEST_TS);
		} else {
			l->l_priority++;
		}
	}
}
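
/*
 * For example (a sketch, assuming NZERO = 20): a thread of a process
 * with p_nice = 12 (nice -8) gets n = max((20 - 12) >> 2, 1) = 2 and
 * is boosted by two levels on each sleep, while a thread at or above
 * NZERO is boosted by one; in both cases the priority stays at or
 * below PRI_HIGHEST_TS.  Together with the decrement in sched_tick(),
 * this is what separates interactive threads from CPU-bound ones.
 */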

void
sched_wakeup(struct lwp *l)
{

	/* If the thread slept for a second or more, give it a high priority */
	if (l->l_slptime >= 1)
		l->l_priority = high_pri[l->l_priority];
}

void
sched_pstats_hook(struct lwp *l, int batch)
{
	pri_t prio;

	/*
	 * Estimate only threads in the time-sharing queue and, for
	 * performance reasons, exclude those already at the highest
	 * priority.
	 */
	KASSERT(lwp_locked(l, NULL));
	if (l->l_priority >= PRI_HIGHEST_TS)
		return;
	KASSERT(l->l_class == SCHED_OTHER);

	/* If it is CPU-bound and not for the first time, decrease the priority */
	prio = l->l_priority;
	if (batch && prio != 0)
		prio--;

	/* If the thread has not run for a second or more, give it a high priority */
	if (l->l_stat == LSRUN) {
		if (l->l_rticks && (hardclock_ticks - l->l_rticks >= hz))
			prio = high_pri[prio];
		/* Re-enqueue the thread if the priority has changed */
		if (prio != l->l_priority)
			lwp_changepri(l, prio);
	} else {
		/* In other states, change the priority directly */
		l->l_priority = prio;
	}
}
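
/*
 * Example (a sketch): a CPU-bound (batch) thread at priority 10 decays
 * to 9 on a pass of the pstats hook, while a runnable thread that has
 * received no CPU time for a second or more (hz ticks) jumps straight
 * to high_pri[], i.e. into the 57 .. 63 band with the defaults above,
 * so starved threads recover quickly.
 */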

void
sched_oncpu(lwp_t *l)
{
	struct schedstate_percpu *spc = &l->l_cpu->ci_schedstate;

	/* Update the counters */
	KASSERT(l->l_sched.timeslice >= min_ts);
	KASSERT(l->l_sched.timeslice <= max_ts);
	spc->spc_ticks = l->l_sched.timeslice;
}

/*
 * Time-driven events.
 */

/*
 * Called once per time-quantum.  This routine is CPU-local and runs at
 * IPL_SCHED, so no locking is needed.
 */
void
sched_tick(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	struct lwp *l = curlwp;
	struct proc *p;

	if (__predict_false(CURCPU_IDLE_P()))
		return;

	switch (l->l_class) {
	case SCHED_FIFO:
		/*
		 * If the thread runs with the FIFO real-time policy,
		 * update the time-quantum and continue running.
		 */
		KASSERT(l->l_priority > PRI_HIGHEST_TS);
		spc->spc_ticks = l->l_sched.timeslice;
		return;
	case SCHED_OTHER:
		/*
		 * If the thread is in the time-sharing queue, decrease
		 * its priority and run it with a longer time-quantum.
		 */
		KASSERT(l->l_priority <= PRI_HIGHEST_TS);
		if (l->l_priority == 0)
			break;

		p = l->l_proc;
		if (__predict_false(p->p_nice > NZERO)) {
			const int n = max((p->p_nice - NZERO) >> 2, 1);
			l->l_priority = imax(l->l_priority - n, 0);
		} else
			l->l_priority--;
		break;
	}

	/*
	 * If there are higher-priority threads or threads in the same
	 * queue, mark that the thread should yield; otherwise, continue
	 * running.
	 */
	if (lwp_eprio(l) <= spc->spc_maxpriority || l->l_target_cpu) {
		spc->spc_flags |= SPCF_SHOULDYIELD;
		cpu_need_resched(ci, 0);
	} else
		spc->spc_ticks = l->l_sched.timeslice;
}
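
/*
 * Worked example of a quantum expiry (a sketch, with the defaults
 * above): a SCHED_OTHER thread at priority 30 in a process with
 * p_nice = 28 (nice +8) is demoted by max((28 - 20) >> 2, 1) = 2
 * levels to 28 (lower priorities map to longer slices in ts_map[]).
 * If the run queue now holds a thread of equal or higher priority,
 * i.e. lwp_eprio(l) <= spc_maxpriority, SPCF_SHOULDYIELD is set and
 * cpu_need_resched() forces a switch; otherwise the thread simply
 * starts a fresh time-slice.
 */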

/*
 * Sysctl nodes and initialization.
 */

static int
sysctl_sched_rtts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int rttsms = hztoms(rt_ts);

	node = *rnode;
	node.sysctl_data = &rttsms;
	return sysctl_lookup(SYSCTLFN_CALL(&node));
}

static int
sysctl_sched_mints(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(min_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 1 || newsize > hz || newsize >= max_ts)
		return EINVAL;

	/* Locking all CPUs first makes this update safe */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	min_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

static int
sysctl_sched_maxts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(max_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 10 || newsize > hz || newsize <= min_ts)
		return EINVAL;

	/* Locking all CPUs first makes this update safe */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	max_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}
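
/*
 * Both handlers above convert between milliseconds (the user-visible
 * unit) and clock ticks, and reject values that would violate the
 * ordering min_ts < max_ts <= hz relied upon by the KASSERTs in
 * sched_oncpu().  All CPUs are locked before ts_map[] is re-computed,
 * so no CPU can observe a partially updated map.  At run-time the
 * quanta can be tuned with, for example:
 *
 *	sysctl -w kern.sched.maxts=200
 *	sysctl -w kern.sched.mints=10
 *
 * (values in milliseconds; the kern.sched.* names assume this
 * scheduler is the one configured into the kernel).
 */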

SYSCTL_SETUP(sysctl_sched_m2_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_STRING, "name", NULL,
		NULL, 0, __UNCONST("M2"), 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_INT, "rtts",
		SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"),
		sysctl_sched_rtts, 0, NULL, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "maxts",
		SYSCTL_DESCR("Maximal time quantum (in milliseconds)"),
		sysctl_sched_maxts, 0, &max_ts, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "mints",
		SYSCTL_DESCR("Minimal time quantum (in milliseconds)"),
		sysctl_sched_mints, 0, &min_ts, 0,
		CTL_CREATE, CTL_EOL);
}
455