/*	$NetBSD: sched_m2.c,v 1.29 2009/11/22 19:09:16 mbalmer Exp $	*/

/*
 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * TODO:
 *  - Implementation of fair share queue;
 *  - Support for NUMA;
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.29 2009/11/22 19:09:16 mbalmer Exp $");

#include <sys/param.h>

#include <sys/cpu.h>
#include <sys/callout.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/types.h>

/*
 * Priority related definitions.
 */
#define	PRI_TS_COUNT	(NPRI_USER)
#define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
#define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)

#define	PRI_HIGHEST_TS	(MAXPRI_USER)

/*
 * Time-slices and priorities.
 */
static u_int	min_ts;			/* Minimal time-slice */
static u_int	max_ts;			/* Maximal time-slice */
static u_int	rt_ts;			/* Real-time time-slice */
static u_int	ts_map[PRI_COUNT];	/* Map of time-slices */
static pri_t	high_pri[PRI_COUNT];	/* Map for priority increase */

static void	sched_precalcts(void);

/*
 * Initialization and setup.
 */

void
sched_rqinit(void)
{
	struct cpu_info *ci = curcpu();

	if (hz < 100) {
		panic("sched_rqinit: value of HZ is too low\n");
	}

	/* Default timing ranges */
	min_ts = mstohz(20);			/*  ~20 ms */
	max_ts = mstohz(150);			/* ~150 ms */
	rt_ts = mstohz(100);			/* ~100 ms */
	sched_precalcts();

	/* Attach the primary CPU here */
	sched_cpuattach(ci);

	sched_lwp_fork(NULL, &lwp0);
	sched_newts(&lwp0);
}

/*
 * Pre-calculate the time-slices for the priorities: time-sharing slices
 * are interpolated linearly from max_ts (priority 0) down to min_ts
 * (PRI_HIGHEST_TS), while real-time priorities always get the fixed
 * rt_ts quantum.
 */
static void
sched_precalcts(void)
{
	pri_t p;

	/* Time-sharing range */
	for (p = 0; p <= PRI_HIGHEST_TS; p++) {
		ts_map[p] = max_ts -
		    (p * 100 / (PRI_TS_COUNT - 1) * (max_ts - min_ts) / 100);
		high_pri[p] = (PRI_HIGHEST_TS - PRI_HTS_RANGE) +
		    ((p * PRI_HTS_RANGE) / (PRI_TS_COUNT - 1));
	}

	/* Real-time range */
	for (p = (PRI_HIGHEST_TS + 1); p < PRI_COUNT; p++) {
		ts_map[p] = rt_ts;
		high_pri[p] = p;
	}
}

/*
 * Hooks.
 */

void
sched_proc_fork(struct proc *parent, struct proc *child)
{
	struct lwp *l;

	LIST_FOREACH(l, &child->p_lwps, l_sibling) {
		lwp_lock(l);
		sched_newts(l);
		lwp_unlock(l);
	}
}

void
sched_proc_exit(struct proc *child, struct proc *parent)
{

}

void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{

}

void
sched_lwp_collect(struct lwp *l)
{

}

void
sched_setrunnable(struct lwp *l)
{

}

void
sched_schedclock(struct lwp *l)
{

}

/*
 * Priorities and time-slice.
 */

void
sched_nice(struct proc *p, int prio)
{
	struct lwp *l;
	int n;

	KASSERT(mutex_owned(p->p_lock));

	p->p_nice = prio;
	n = (prio - NZERO) >> 2;
	if (n == 0)
		return;

	LIST_FOREACH(l, &p->p_lwps, l_sibling) {
		lwp_lock(l);
		if (l->l_class == SCHED_OTHER) {
			pri_t pri = l->l_priority - n;
			pri = (n < 0) ? min(pri, PRI_HIGHEST_TS) : imax(pri, 0);
			lwp_changepri(l, pri);
		}
		lwp_unlock(l);
	}
}

/* Recalculate the time-slice */
void
sched_newts(struct lwp *l)
{

	l->l_sched.timeslice = ts_map[lwp_eprio(l)];
}

void
sched_slept(struct lwp *l)
{

	/*
	 * If the thread is in the time-sharing queue and the batch flag
	 * is not set, increase its priority so that it runs with a lower
	 * time-quantum.
	 */
	if (l->l_priority < PRI_HIGHEST_TS && (l->l_flag & LW_BATCH) == 0) {
		struct proc *p = l->l_proc;

		KASSERT(l->l_class == SCHED_OTHER);
		if (__predict_false(p->p_nice < NZERO)) {
			const int n = max((NZERO - p->p_nice) >> 2, 1);
			l->l_priority = min(l->l_priority + n, PRI_HIGHEST_TS);
		} else {
			l->l_priority++;
		}
	}
}

void
sched_wakeup(struct lwp *l)
{

	/* If the thread slept for a second or more, give it a high priority */
	if (l->l_slptime >= 1)
		l->l_priority = high_pri[l->l_priority];
}

void
sched_pstats_hook(struct lwp *l, int batch)
{
	pri_t prio;

	/*
	 * Estimate only threads on the time-sharing queue and, for
	 * performance reasons, skip the highest priority.
	 */
	KASSERT(lwp_locked(l, NULL));
	if (l->l_priority >= PRI_HIGHEST_TS)
		return;
	KASSERT(l->l_class == SCHED_OTHER);

	/* If the thread keeps being CPU-bound, decrease its priority */
	prio = l->l_priority;
	if (batch && prio != 0)
		prio--;

	/* If the thread has not run for a second or more, give it a high priority */
	if (l->l_stat == LSRUN) {
		if (l->l_rticks && (hardclock_ticks - l->l_rticks >= hz))
			prio = high_pri[prio];
		/* Re-enqueue the thread if the priority has changed */
		if (prio != l->l_priority)
			lwp_changepri(l, prio);
	} else {
		/* In other states, change the priority directly */
		l->l_priority = prio;
	}
}

void
sched_oncpu(lwp_t *l)
{
	struct schedstate_percpu *spc = &l->l_cpu->ci_schedstate;

	/* Update the counters */
	KASSERT(l->l_sched.timeslice >= min_ts);
	KASSERT(l->l_sched.timeslice <= max_ts);
	spc->spc_ticks = l->l_sched.timeslice;
}

/*
 * Time-driven events.
 */

/*
 * Called once per time-quantum.  This routine is CPU-local and runs at
 * IPL_SCHED, thus no locking is needed.
 */
void
sched_tick(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	struct lwp *l = curlwp;
	struct proc *p;

	if (__predict_false(CURCPU_IDLE_P()))
		return;

	switch (l->l_class) {
	case SCHED_FIFO:
		/*
		 * If the thread runs under the FIFO real-time policy,
		 * refresh the time-quantum and continue running.
		 */
		KASSERT(l->l_priority > PRI_HIGHEST_TS);
		spc->spc_ticks = l->l_sched.timeslice;
		return;
	case SCHED_OTHER:
		/*
		 * If the thread is in the time-sharing queue, decrease its
		 * priority so that it runs with a higher time-quantum.
		 */
		KASSERT(l->l_priority <= PRI_HIGHEST_TS);
		if (l->l_priority == 0)
			break;

		p = l->l_proc;
		if (__predict_false(p->p_nice > NZERO)) {
			const int n = max((p->p_nice - NZERO) >> 2, 1);
			l->l_priority = imax(l->l_priority - n, 0);
		} else
			l->l_priority--;
		break;
	}

	/*
	 * If there are higher priority threads, or threads in the same
	 * queue, mark that the thread should yield; otherwise, continue
	 * running.
	 */
	if (lwp_eprio(l) <= spc->spc_maxpriority || l->l_target_cpu) {
		spc->spc_flags |= SPCF_SHOULDYIELD;
		cpu_need_resched(ci, 0);
	} else
		spc->spc_ticks = l->l_sched.timeslice;
}

/*
 * Sysctl nodes and initialization.
 */

static int
sysctl_sched_rtts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int rttsms = hztoms(rt_ts);

	node = *rnode;
	node.sysctl_data = &rttsms;
	return sysctl_lookup(SYSCTLFN_CALL(&node));
}

static int
sysctl_sched_mints(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(min_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 1 || newsize > hz || newsize >= max_ts)
		return EINVAL;

	/* It is safe to do this in this order */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	min_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

static int
sysctl_sched_maxts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(max_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 10 || newsize > hz || newsize <= min_ts)
		return EINVAL;

	/* It is safe to do this in this order */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	max_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

SYSCTL_SETUP(sysctl_sched_m2_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "kern", NULL,
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_EOL);
	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_STRING, "name", NULL,
		NULL, 0, __UNCONST("M2"), 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_INT, "rtts",
		SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"),
		sysctl_sched_rtts, 0, NULL, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "maxts",
		SYSCTL_DESCR("Maximal time quantum (in milliseconds)"),
		sysctl_sched_maxts, 0, &max_ts, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "mints",
		SYSCTL_DESCR("Minimal time quantum (in milliseconds)"),
		sysctl_sched_mints, 0, &min_ts, 0,
		CTL_CREATE, CTL_EOL);
}
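
/*
 * Illustrative note, not part of the original sources: the nodes
 * registered above appear under kern.sched and can be inspected or
 * tuned from userland with sysctl(8).  The exact values reported depend
 * on HZ and the configured defaults, but a session might look like:
 *
 *	$ sysctl kern.sched.name kern.sched.rtts
 *	kern.sched.name = M2
 *	kern.sched.rtts = 100
 *	# sysctl -w kern.sched.maxts=200
 *	# sysctl -w kern.sched.mints=10
 *
 * The maxts/mints handlers convert the millisecond values back to ticks
 * and reject settings that would exceed one second or make the minimal
 * and maximal time-quanta inconsistent.
 */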