/* $NetBSD: sched_m2.c,v 1.25 2008/05/19 12:48:54 rmind Exp $ */

/*
 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * TODO:
 * - Implementation of fair share queue;
 * - Support for NUMA;
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.25 2008/05/19 12:48:54 rmind Exp $");

#include <sys/param.h>

#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/callout.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/types.h>

/*
 * Priority related definitions.
 */
#define PRI_TS_COUNT    (NPRI_USER)
#define PRI_RT_COUNT    (PRI_COUNT - PRI_TS_COUNT)
#define PRI_HTS_RANGE   (PRI_TS_COUNT / 10)

#define PRI_HIGHEST_TS  (MAXPRI_USER)

/*
 * Time-slices and priorities.
 */
static u_int    min_ts;                 /* Minimum time-slice */
static u_int    max_ts;                 /* Maximum time-slice */
static u_int    rt_ts;                  /* Real-time time-slice */
static u_int    ts_map[PRI_COUNT];      /* Map of time-slices */
static pri_t    high_pri[PRI_COUNT];    /* Map for priority increase */

static void     sched_precalcts(void);

typedef struct {
        u_int           sl_timeslice;   /* Time-slice of thread */
} sched_info_lwp_t;

static pool_cache_t     sil_pool;
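
/*
 * ts_map[] assigns each time-sharing priority a time-slice by linear
 * interpolation: priority 0 gets max_ts, PRI_HIGHEST_TS gets min_ts,
 * so CPU-bound threads at low priorities run longer but less often.
 * high_pri[] compresses the whole time-sharing range into its topmost
 * PRI_HTS_RANGE levels and is used to boost threads that have slept
 * or starved for a second or more.
 *
 * Illustrative numbers only (a sketch, assuming hz = 100 and
 * NPRI_USER = 64): min_ts = 5 ticks (~50 ms), max_ts = 15 ticks
 * (~150 ms); a thread at priority 20 gets ts_map[20] = 12 ticks
 * (~120 ms) and would be boosted to high_pri[20] = 58.
 */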

/*
 * Initialization and setup.
 */

void
sched_rqinit(void)
{
        struct cpu_info *ci = curcpu();

        if (hz < 100) {
                panic("sched_rqinit: value of HZ is too low\n");
        }

        /* Default timing ranges */
        min_ts = mstohz(50);    /* ~50 ms */
        max_ts = mstohz(150);   /* ~150 ms */
        rt_ts = mstohz(100);    /* ~100 ms */
        sched_precalcts();

        /* Pool of the scheduler-specific structures */
        sil_pool = pool_cache_init(sizeof(sched_info_lwp_t), coherency_unit,
            0, 0, "lwpsd", NULL, IPL_NONE, NULL, NULL, NULL);

        /* Attach the primary CPU here */
        sched_cpuattach(ci);

        sched_lwp_fork(NULL, &lwp0);
        sched_newts(&lwp0);
}

/* Pre-calculate the time-slices for the priorities */
static void
sched_precalcts(void)
{
        pri_t p;

        /* Time-sharing range */
        for (p = 0; p <= PRI_HIGHEST_TS; p++) {
                ts_map[p] = max_ts -
                    (p * 100 / (PRI_TS_COUNT - 1) * (max_ts - min_ts) / 100);
                high_pri[p] = (PRI_HIGHEST_TS - PRI_HTS_RANGE) +
                    ((p * PRI_HTS_RANGE) / (PRI_TS_COUNT - 1));
        }

        /* Real-time range */
        for (p = (PRI_HIGHEST_TS + 1); p < PRI_COUNT; p++) {
                ts_map[p] = rt_ts;
                high_pri[p] = p;
        }
}

/*
 * Hooks.
 */

void
sched_proc_fork(struct proc *parent, struct proc *child)
{
        struct lwp *l;

        LIST_FOREACH(l, &child->p_lwps, l_sibling) {
                lwp_lock(l);
                sched_newts(l);
                lwp_unlock(l);
        }
}

void
sched_proc_exit(struct proc *child, struct proc *parent)
{

        /* Dummy */
}

void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{

        KASSERT(l2->l_sched_info == NULL);
        l2->l_sched_info = pool_cache_get(sil_pool, PR_WAITOK);
        memset(l2->l_sched_info, 0, sizeof(sched_info_lwp_t));
}

void
sched_lwp_exit(struct lwp *l)
{

        KASSERT(l->l_sched_info != NULL);
        pool_cache_put(sil_pool, l->l_sched_info);
        l->l_sched_info = NULL;
}

void
sched_lwp_collect(struct lwp *l)
{

}

void
sched_setrunnable(struct lwp *l)
{

        /* Dummy */
}

void
sched_schedclock(struct lwp *l)
{

        /* Dummy */
}

/*
 * Priorities and time-slice.
 */

void
sched_nice(struct proc *p, int prio)
{

        /* TODO: implement as SCHED_IA */
}

/* Recalculate the time-slice */
void
sched_newts(struct lwp *l)
{
        sched_info_lwp_t *sil = l->l_sched_info;

        sil->sl_timeslice = ts_map[lwp_eprio(l)];
}

void
sched_slept(struct lwp *l)
{

        /*
         * If the thread is in the time-sharing queue and the batch flag
         * is not set, increase its priority; it will then run with a
         * shorter time-quantum.
         */
        if (l->l_priority < PRI_HIGHEST_TS && (l->l_flag & LW_BATCH) == 0) {
                KASSERT(l->l_class == SCHED_OTHER);
                l->l_priority++;
        }
}

void
sched_wakeup(struct lwp *l)
{

        /* If the thread slept for a second or more, set a high priority */
        if (l->l_slptime >= 1)
                l->l_priority = high_pri[l->l_priority];
}
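
/*
 * Together with sched_tick() below, the hooks above implement a simple
 * interactivity heuristic: a non-batch thread gains one priority level
 * each time it sleeps, a thread that slept for a second or more jumps
 * straight into the high_pri[] band, and sched_pstats_hook() below
 * periodically decays batch threads and boosts starved runnable ones.
 * For example (a sketch, assuming NPRI_USER = 64), a thread at user
 * priority 10 that slept over a second wakes up at high_pri[10] == 57,
 * near the top of the time-sharing range.
 */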

void
sched_pstats_hook(struct lwp *l, int batch)
{
        pri_t prio;

        /*
         * Adjust only threads in the time-sharing range; skip the
         * highest priority for performance reasons.
         */
        if (l->l_priority >= PRI_HIGHEST_TS)
                return;
        KASSERT(l->l_class == SCHED_OTHER);

        /* If the thread is repeatedly CPU-bound, decrease its priority */
        prio = l->l_priority;
        if (batch && prio != 0)
                prio--;

        /* If the thread has not run for a second or more, set a high priority */
        if (l->l_stat == LSRUN) {
                if (l->l_rticks && (hardclock_ticks - l->l_rticks >= hz))
                        prio = high_pri[prio];
                /* Re-enqueue the thread if the priority has changed */
                if (prio != l->l_priority)
                        lwp_changepri(l, prio);
        } else {
                /* In other states, change the priority directly */
                l->l_priority = prio;
        }
}

void
sched_oncpu(lwp_t *l)
{
        sched_info_lwp_t *sil = l->l_sched_info;

        /* Update the counters */
        KASSERT(sil->sl_timeslice >= min_ts);
        KASSERT(sil->sl_timeslice <= max_ts);
        l->l_cpu->ci_schedstate.spc_ticks = sil->sl_timeslice;
}

/*
 * Time-driven events.
 */

/*
 * Called once per time-quantum.  This routine is CPU-local and runs at
 * IPL_SCHED, so no locking is needed.
 */
void
sched_tick(struct cpu_info *ci)
{
        struct schedstate_percpu *spc = &ci->ci_schedstate;
        struct lwp *l = curlwp;
        const sched_info_lwp_t *sil = l->l_sched_info;

        if (CURCPU_IDLE_P())
                return;

        switch (l->l_class) {
        case SCHED_FIFO:
                /*
                 * Update the time-quantum and continue running, if the
                 * thread runs under the FIFO real-time policy.
                 */
                KASSERT(l->l_priority > PRI_HIGHEST_TS);
                spc->spc_ticks = sil->sl_timeslice;
                return;
        case SCHED_OTHER:
                /*
                 * If the thread is in the time-sharing queue, decrease
                 * its priority; it will then run with a longer
                 * time-quantum.
                 */
                KASSERT(l->l_priority <= PRI_HIGHEST_TS);
                if (l->l_priority != 0)
                        l->l_priority--;
                break;
        }

        /*
         * If there are higher priority threads, or threads in the same
         * queue, mark that the thread should yield; otherwise, continue
         * running.
         */
        if (lwp_eprio(l) <= spc->spc_maxpriority || l->l_target_cpu) {
                spc->spc_flags |= SPCF_SHOULDYIELD;
                cpu_need_resched(ci, 0);
        } else
                spc->spc_ticks = sil->sl_timeslice;
}

/*
 * Sysctl nodes and initialization.
 */
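
/*
 * The handlers below expose the quanta under the kern.sched node,
 * converting between milliseconds (the user-visible unit) and ticks
 * (the internal unit), and validate new values against hz and against
 * each other before recomputing the time-slice maps under the per-CPU
 * locks.  A hypothetical invocation from userland (illustration only):
 *
 *      # sysctl -w kern.sched.maxts=200
 *      kern.sched.maxts: 150 -> 200
 */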

static int
sysctl_sched_rtts(SYSCTLFN_ARGS)
{
        struct sysctlnode node;
        int rttsms = hztoms(rt_ts);

        node = *rnode;
        node.sysctl_data = &rttsms;
        return sysctl_lookup(SYSCTLFN_CALL(&node));
}

static int
sysctl_sched_mints(SYSCTLFN_ARGS)
{
        struct sysctlnode node;
        struct cpu_info *ci;
        int error, newsize;
        CPU_INFO_ITERATOR cii;

        node = *rnode;
        node.sysctl_data = &newsize;

        newsize = hztoms(min_ts);
        error = sysctl_lookup(SYSCTLFN_CALL(&node));
        if (error || newp == NULL)
                return error;

        newsize = mstohz(newsize);
        if (newsize < 1 || newsize > hz || newsize >= max_ts)
                return EINVAL;

        /* It is safe to update these in this order */
        for (CPU_INFO_FOREACH(cii, ci))
                spc_lock(ci);

        min_ts = newsize;
        sched_precalcts();

        for (CPU_INFO_FOREACH(cii, ci))
                spc_unlock(ci);

        return 0;
}

static int
sysctl_sched_maxts(SYSCTLFN_ARGS)
{
        struct sysctlnode node;
        struct cpu_info *ci;
        int error, newsize;
        CPU_INFO_ITERATOR cii;

        node = *rnode;
        node.sysctl_data = &newsize;

        newsize = hztoms(max_ts);
        error = sysctl_lookup(SYSCTLFN_CALL(&node));
        if (error || newp == NULL)
                return error;

        newsize = mstohz(newsize);
        if (newsize < 10 || newsize > hz || newsize <= min_ts)
                return EINVAL;

        /* It is safe to update these in this order */
        for (CPU_INFO_FOREACH(cii, ci))
                spc_lock(ci);

        max_ts = newsize;
        sched_precalcts();

        for (CPU_INFO_FOREACH(cii, ci))
                spc_unlock(ci);

        return 0;
}

SYSCTL_SETUP(sysctl_sched_m2_setup, "sysctl sched setup")
{
        const struct sysctlnode *node = NULL;

        sysctl_createv(clog, 0, NULL, NULL,
            CTLFLAG_PERMANENT,
            CTLTYPE_NODE, "kern", NULL,
            NULL, 0, NULL, 0,
            CTL_KERN, CTL_EOL);
        sysctl_createv(clog, 0, NULL, &node,
            CTLFLAG_PERMANENT,
            CTLTYPE_NODE, "sched",
            SYSCTL_DESCR("Scheduler options"),
            NULL, 0, NULL, 0,
            CTL_KERN, CTL_CREATE, CTL_EOL);

        if (node == NULL)
                return;

        sysctl_createv(NULL, 0, &node, NULL,
            CTLFLAG_PERMANENT,
            CTLTYPE_STRING, "name", NULL,
            NULL, 0, __UNCONST("M2"), 0,
            CTL_CREATE, CTL_EOL);
        sysctl_createv(NULL, 0, &node, NULL,
            CTLFLAG_PERMANENT,
            CTLTYPE_INT, "rtts",
            SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"),
            sysctl_sched_rtts, 0, NULL, 0,
            CTL_CREATE, CTL_EOL);
        sysctl_createv(NULL, 0, &node, NULL,
            CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
            CTLTYPE_INT, "maxts",
            SYSCTL_DESCR("Maximum time quantum (in milliseconds)"),
            sysctl_sched_maxts, 0, &max_ts, 0,
            CTL_CREATE, CTL_EOL);
        sysctl_createv(NULL, 0, &node, NULL,
            CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
            CTLTYPE_INT, "mints",
            SYSCTL_DESCR("Minimum time quantum (in milliseconds)"),
            sysctl_sched_mints, 0, &min_ts, 0,
            CTL_CREATE, CTL_EOL);
}