/*	$NetBSD: sched_m2.c,v 1.32 2014/06/24 10:08:45 maxv Exp $	*/

/*
 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * TODO:
 *  - Implementation of fair share queue;
 *  - Support for NUMA;
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.32 2014/06/24 10:08:45 maxv Exp $");

#include <sys/param.h>

#include <sys/cpu.h>
#include <sys/callout.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/types.h>

/*
 * Priority related definitions.
 */
#define	PRI_TS_COUNT	(NPRI_USER)
#define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
#define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)

#define	PRI_HIGHEST_TS	(MAXPRI_USER)

/*
 * Time-slices and priorities.
 */
static u_int	min_ts;			/* Minimal time-slice */
static u_int	max_ts;			/* Maximal time-slice */
static u_int	rt_ts;			/* Real-time time-slice */
static u_int	ts_map[PRI_COUNT];	/* Map of time-slices */
static pri_t	high_pri[PRI_COUNT];	/* Map for priority increase */

static void	sched_precalcts(void);

/*
 * Initialization and setup.
 */

void
sched_rqinit(void)
{
	if (hz < 100) {
		panic("sched_rqinit: value of HZ is too low\n");
	}

	/* Default timing ranges */
	min_ts = mstohz(20);			/*  ~20 ms */
	max_ts = mstohz(150);			/* ~150 ms */
	rt_ts = mstohz(100);			/* ~100 ms */
	sched_precalcts();

#ifdef notdef
	/* Need to set the name etc. This does not belong here */
	/* Attach the primary CPU here */
	sched_cpuattach(curcpu());
#endif

	sched_lwp_fork(NULL, &lwp0);
	sched_newts(&lwp0);
}

/* Pre-calculate the time-slices for the priorities */
static void
sched_precalcts(void)
{
	pri_t p;

	/* Time-sharing range */
	for (p = 0; p <= PRI_HIGHEST_TS; p++) {
		ts_map[p] = max_ts -
		    (p * 100 / (PRI_TS_COUNT - 1) * (max_ts - min_ts) / 100);
		high_pri[p] = (PRI_HIGHEST_TS - PRI_HTS_RANGE) +
		    ((p * PRI_HTS_RANGE) / (PRI_TS_COUNT - 1));
	}

	/* Real-time range */
	for (p = (PRI_HIGHEST_TS + 1); p < PRI_COUNT; p++) {
		ts_map[p] = rt_ts;
		high_pri[p] = p;
	}
}
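
/*
 * Worked example, for illustration only and not part of the algorithm:
 * assuming hz = 100 and the usual NetBSD values NPRI_USER = 64 and
 * MAXPRI_USER = 63, the defaults give min_ts = 2 ticks and max_ts = 15
 * ticks, so ts_map[] decreases linearly from 15 ticks (150 ms) at
 * priority 0 to 2 ticks (20 ms) at PRI_HIGHEST_TS, while high_pri[] maps
 * every time-sharing priority into the top band 57..63
 * (PRI_HIGHEST_TS - PRI_HTS_RANGE .. PRI_HIGHEST_TS), used when boosting
 * threads that have been sleeping or waiting for a second or more (see
 * sched_wakeup() and sched_pstats_hook()).
 */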

/*
 * Hooks.
 */

void
sched_proc_fork(struct proc *parent, struct proc *child)
{
	struct lwp *l;

	LIST_FOREACH(l, &child->p_lwps, l_sibling) {
		lwp_lock(l);
		sched_newts(l);
		lwp_unlock(l);
	}
}

void
sched_proc_exit(struct proc *child, struct proc *parent)
{

}

void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{

}

void
sched_lwp_collect(struct lwp *l)
{

}

void
sched_setrunnable(struct lwp *l)
{

}

void
sched_schedclock(struct lwp *l)
{

}

/*
 * Priorities and time-slice.
 */

void
sched_nice(struct proc *p, int prio)
{
	struct lwp *l;
	int n;

	KASSERT(mutex_owned(p->p_lock));

	p->p_nice = prio;
	n = (prio - NZERO) >> 2;
	if (n == 0)
		return;

	LIST_FOREACH(l, &p->p_lwps, l_sibling) {
		lwp_lock(l);
		if (l->l_class == SCHED_OTHER) {
			pri_t pri = l->l_priority - n;
			pri = (n < 0) ? min(pri, PRI_HIGHEST_TS) : imax(pri, 0);
			lwp_changepri(l, pri);
		}
		lwp_unlock(l);
	}
}
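
/*
 * A worked example, for illustration only and assuming the traditional
 * NZERO of 20: a nice value of +8 is stored as prio = NZERO + 8 = 28, so
 * n = (28 - 20) >> 2 = 2 and every SCHED_OTHER LWP of the process drops
 * two priority levels (clamped at 0); a negative nice value raises the
 * priority symmetrically, clamped at PRI_HIGHEST_TS.
 */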

/* Recalculate the time-slice */
void
sched_newts(struct lwp *l)
{

	l->l_sched.timeslice = ts_map[lwp_eprio(l)];
}

void
sched_slept(struct lwp *l)
{

	/*
	 * If the thread is in the time-sharing queue and the batch flag
	 * is not set, increase its priority and run it with a smaller
	 * time-quantum.
	 */
	if (l->l_priority < PRI_HIGHEST_TS && (l->l_flag & LW_BATCH) == 0) {
		struct proc *p = l->l_proc;

		KASSERT(l->l_class == SCHED_OTHER);
		if (__predict_false(p->p_nice < NZERO)) {
			const int n = max((NZERO - p->p_nice) >> 2, 1);
			l->l_priority = min(l->l_priority + n, PRI_HIGHEST_TS);
		} else {
			l->l_priority++;
		}
	}
}

void
sched_wakeup(struct lwp *l)
{

	/* If the thread slept for a second or more, give it a high priority */
	if (l->l_slptime >= 1)
		l->l_priority = high_pri[l->l_priority];
}

void
sched_pstats_hook(struct lwp *l, int batch)
{
	pri_t prio;

	/*
	 * Only estimate threads on the time-sharing queue; the highest
	 * priority is excluded for performance reasons.
	 */
	KASSERT(lwp_locked(l, NULL));
	if (l->l_priority >= PRI_HIGHEST_TS)
		return;
	KASSERT(l->l_class == SCHED_OTHER);

	/* If it is CPU-bound, and not for the first time, decrease the priority */
	prio = l->l_priority;
	if (batch && prio != 0)
		prio--;

	/* If the thread has not run for a second or more, give it a high priority */
	if (l->l_stat == LSRUN) {
		if (l->l_rticks && (hardclock_ticks - l->l_rticks >= hz))
			prio = high_pri[prio];
		/* Re-enqueue the thread if the priority has changed */
		if (prio != l->l_priority)
			lwp_changepri(l, prio);
	} else {
		/* In other states, change the priority directly */
		l->l_priority = prio;
	}
}

void
sched_oncpu(lwp_t *l)
{
	struct schedstate_percpu *spc = &l->l_cpu->ci_schedstate;

	/* Update the counters */
	KASSERT(l->l_sched.timeslice >= min_ts);
	KASSERT(l->l_sched.timeslice <= max_ts);
	spc->spc_ticks = l->l_sched.timeslice;
}

/*
 * Time-driven events.
 */

/*
 * Called once per time-quantum.  This routine is CPU-local and runs at
 * IPL_SCHED, thus locking is not needed.
 */
void
sched_tick(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	struct lwp *l = curlwp;
	struct proc *p;

	if (__predict_false(CURCPU_IDLE_P()))
		return;

	switch (l->l_class) {
	case SCHED_FIFO:
		/*
		 * If the thread runs with the FIFO real-time policy,
		 * update the time-quantum and continue running.
		 */
		KASSERT(l->l_priority > PRI_HIGHEST_TS);
		spc->spc_ticks = l->l_sched.timeslice;
		return;
	case SCHED_OTHER:
		/*
		 * If the thread is in the time-sharing queue, decrease its
		 * priority and run it with a larger time-quantum.
		 */
		KASSERT(l->l_priority <= PRI_HIGHEST_TS);
		if (l->l_priority == 0)
			break;

		p = l->l_proc;
		if (__predict_false(p->p_nice > NZERO)) {
			const int n = max((p->p_nice - NZERO) >> 2, 1);
			l->l_priority = imax(l->l_priority - n, 0);
		} else
			l->l_priority--;
		break;
	}

	/*
	 * If there are higher-priority threads or threads in the same queue,
	 * mark that the thread should yield; otherwise, continue running.
	 */
	if (lwp_eprio(l) <= spc->spc_maxpriority || l->l_target_cpu) {
		spc->spc_flags |= SPCF_SHOULDYIELD;
		cpu_need_resched(ci, 0);
	} else
		spc->spc_ticks = l->l_sched.timeslice;
}

/*
 * Sysctl nodes and initialization.
 */

static int
sysctl_sched_rtts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int rttsms = hztoms(rt_ts);

	node = *rnode;
	node.sysctl_data = &rttsms;
	return sysctl_lookup(SYSCTLFN_CALL(&node));
}

static int
sysctl_sched_mints(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(min_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 1 || newsize > hz || newsize >= max_ts)
		return EINVAL;

	/* It is safe to do this in this order */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	min_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

static int
sysctl_sched_maxts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(max_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 10 || newsize > hz || newsize <= min_ts)
		return EINVAL;

	/* It is safe to do this in this order */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	max_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

SYSCTL_SETUP(sysctl_sched_m2_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_STRING, "name", NULL,
		NULL, 0, __UNCONST("M2"), 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_INT, "rtts",
		SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"),
		sysctl_sched_rtts, 0, NULL, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "maxts",
		SYSCTL_DESCR("Maximal time quantum (in milliseconds)"),
		sysctl_sched_maxts, 0, &max_ts, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "mints",
		SYSCTL_DESCR("Minimal time quantum (in milliseconds)"),
		sysctl_sched_mints, 0, &min_ts, 0,
		CTL_CREATE, CTL_EOL);
}
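
/*
 * A minimal userland sketch, for illustration only, of how the tunables
 * created above could be read; the "kern.sched.maxts" name assumes the
 * node layout set up in sysctl_sched_m2_setup():
 *
 *	#include <sys/param.h>
 *	#include <sys/sysctl.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int ms;
 *		size_t len = sizeof(ms);
 *
 *		// Read the maximum time quantum, reported in milliseconds.
 *		if (sysctlbyname("kern.sched.maxts", &ms, &len, NULL, 0) == -1)
 *			return 1;
 *		printf("maximum time quantum: %d ms\n", ms);
 *		return 0;
 *	}
 *
 * The read-write nodes can also be changed at run time, for example with
 * "sysctl -w kern.sched.maxts=200", subject to the bounds checks in
 * sysctl_sched_maxts() and sysctl_sched_mints().
 */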