/*	$NetBSD: sched_m2.c,v 1.24 2008/04/12 17:02:08 ad Exp $	*/

/*
 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * TODO:
 *  - Implementation of fair share queue;
 *  - Support for NUMA;
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.24 2008/04/12 17:02:08 ad Exp $");

#include <sys/param.h>

#include <sys/bitops.h>
#include <sys/cpu.h>
#include <sys/callout.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/types.h>

/*
 * Priority related definitions.
 */
#define	PRI_TS_COUNT	(NPRI_USER)
#define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
#define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)

#define	PRI_HIGHEST_TS	(MAXPRI_USER)

/*
 * Time-slices and priorities.
 */
static u_int	min_ts;			/* Minimal time-slice */
static u_int	max_ts;			/* Maximal time-slice */
static u_int	rt_ts;			/* Real-time time-slice */
static u_int	ts_map[PRI_COUNT];	/* Map of time-slices */
static pri_t	high_pri[PRI_COUNT];	/* Map for priority increase */

typedef struct {
	u_int		sl_flags;
	u_int		sl_timeslice;	/* Time-slice of thread */
	u_int		sl_slept;	/* Saved sleep time for sleep sum */
	u_int		sl_slpsum;	/* Sum of sleep time */
	u_int		sl_lrtime;	/* Last run time */
} sched_info_lwp_t;

/* Flags */
#define	SL_BATCH	0x01

/* Pool of the scheduler-specific structures for threads */
static pool_cache_t	sil_pool;

/*
 * Prototypes.
 */

static void	sched_precalcts(void);

/*
 * Initialization and setup.
 */

void
sched_rqinit(void)
{
	struct cpu_info *ci = curcpu();

	if (hz < 100) {
		panic("sched_rqinit: value of HZ is too low\n");
	}

	/* Default timing ranges */
	min_ts = mstohz(50);	/* ~50ms */
	max_ts = mstohz(150);	/* ~150ms */
	rt_ts = mstohz(100);	/* ~100ms */
	sched_precalcts();

	/* Pool of the scheduler-specific structures */
	sil_pool = pool_cache_init(sizeof(sched_info_lwp_t), coherency_unit,
	    0, 0, "lwpsd", NULL, IPL_NONE, NULL, NULL, NULL);

	/* Attach the primary CPU here */
	sched_cpuattach(ci);

	sched_lwp_fork(NULL, &lwp0);
	sched_newts(&lwp0);
}

/* Pre-calculate the time-slices for the priorities */
static void
sched_precalcts(void)
{
	pri_t p;

	/* Time-sharing range */
	for (p = 0; p <= PRI_HIGHEST_TS; p++) {
		ts_map[p] = max_ts -
		    (p * 100 / (PRI_TS_COUNT - 1) * (max_ts - min_ts) / 100);
		high_pri[p] = (PRI_HIGHEST_TS - PRI_HTS_RANGE) +
		    ((p * PRI_HTS_RANGE) / (PRI_TS_COUNT - 1));
	}

	/* Real-time range */
	for (p = (PRI_HIGHEST_TS + 1); p < PRI_COUNT; p++) {
		ts_map[p] = rt_ts;
		high_pri[p] = p;
	}
}
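/*
 * Worked example of the mapping above.  For illustration only, assuming
 * hz = 100 and NPRI_USER = 64 (so PRI_HIGHEST_TS = 63, PRI_HTS_RANGE = 6,
 * min_ts = 5, max_ts = 15 and rt_ts = 10 ticks), sched_precalcts() yields:
 *
 *	ts_map[0]   = 15 ticks (~150ms)    high_pri[0]   = 57
 *	ts_map[63]  =  5 ticks (~50ms)     high_pri[63]  = 63
 *	ts_map[p]   = 10 ticks (~100ms)    high_pri[p]   = p	(p > 63, real-time)
 *
 * That is, lower priorities get longer time-slices, and a sleep boost can
 * lift a time-sharing thread at most into the top PRI_HTS_RANGE slots.
 */
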
/*
 * Hooks.
 */

void
sched_proc_fork(struct proc *parent, struct proc *child)
{
	struct lwp *l;

	LIST_FOREACH(l, &child->p_lwps, l_sibling) {
		lwp_lock(l);
		sched_newts(l);
		lwp_unlock(l);
	}
}

void
sched_proc_exit(struct proc *child, struct proc *parent)
{

	/* Dummy */
}

void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{

	KASSERT(l2->l_sched_info == NULL);
	l2->l_sched_info = pool_cache_get(sil_pool, PR_WAITOK);
	memset(l2->l_sched_info, 0, sizeof(sched_info_lwp_t));
}

void
sched_lwp_exit(struct lwp *l)
{

	KASSERT(l->l_sched_info != NULL);
	pool_cache_put(sil_pool, l->l_sched_info);
	l->l_sched_info = NULL;
}

void
sched_lwp_collect(struct lwp *l)
{

}

void
sched_setrunnable(struct lwp *l)
{

	/* Dummy */
}

void
sched_schedclock(struct lwp *l)
{

	/* Dummy */
}

/*
 * Priorities and time-slice.
 */

void
sched_nice(struct proc *p, int prio)
{

	/* TODO: implement as SCHED_IA */
}

/* Recalculate the time-slice */
void
sched_newts(struct lwp *l)
{
	sched_info_lwp_t *sil = l->l_sched_info;

	sil->sl_timeslice = ts_map[lwp_eprio(l)];
}

void
sched_slept(struct lwp *l)
{
	sched_info_lwp_t *sil = l->l_sched_info;

	/* Save the time when the thread went to sleep */
	sil->sl_slept = hardclock_ticks;

	/*
	 * If the thread is in the time-sharing queue and the batch flag is
	 * not set, increase the priority, and run with a lower time-quantum.
	 */
	if (l->l_priority < PRI_HIGHEST_TS &&
	    (sil->sl_flags & SL_BATCH) == 0) {
		KASSERT(l->l_class == SCHED_OTHER);
		l->l_priority++;
	}
}

void
sched_wakeup(struct lwp *l)
{
	sched_info_lwp_t *sil = l->l_sched_info;
	const u_int slptime = hardclock_ticks - sil->sl_slept;

	/* Update the sleep time delta */
	sil->sl_slpsum += (l->l_slptime == 0) ? slptime : hz;

	/* If the thread slept for a second or more, set a high priority */
	if (l->l_slptime > 1 || slptime >= hz)
		l->l_priority = high_pri[l->l_priority];

	/* Also, look for a better CPU to wake up on */
	l->l_cpu = sched_takecpu(l);
}
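/*
 * Example of the sleep boost (figures assume hz = 100 and NPRI_USER = 64,
 * as in the note after sched_precalcts()): a SCHED_OTHER thread at
 * priority 30 without SL_BATCH gets +1 in sched_slept(); if it then stays
 * asleep for a second or more, sched_wakeup() lifts it straight to
 * high_pri[31] = 59, near the top of the time-sharing range, and
 * sched_takecpu() may move it to a less loaded CPU.
 */
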
void
sched_pstats_hook(struct lwp *l)
{
	sched_info_lwp_t *sil = l->l_sched_info;
	pri_t prio;
	bool batch;

	if (l->l_stat == LSSLEEP || l->l_stat == LSSTOP ||
	    l->l_stat == LSSUSPENDED)
		l->l_slptime++;

	/*
	 * Mark the thread as CPU-bound (batch) if its run time exceeds its
	 * sleep time.  The local flag stays set only if the thread was
	 * already marked on the previous pass, i.e. it is not CPU-bound for
	 * the first time.
	 */
	batch = (l->l_rticksum > sil->sl_slpsum);
	if (batch) {
		if ((sil->sl_flags & SL_BATCH) == 0)
			batch = false;
		sil->sl_flags |= SL_BATCH;
	} else
		sil->sl_flags &= ~SL_BATCH;

	/*
	 * A CPU-bound thread which never sleeps would occupy the CPU.
	 * In such a case, reset the last-sleep mark and check it on the
	 * next pass: if it is still zero, mark the thread for migration
	 * and clear the batch flag.
	 */
	if (batch && (l->l_slptime + sil->sl_slpsum) == 0) {
		if (l->l_stat != LSONPROC && sil->sl_slept == 0) {
			struct cpu_info *ci = sched_takecpu(l);

			if (l->l_cpu != ci)
				l->l_target_cpu = ci;
			sil->sl_flags &= ~SL_BATCH;
		} else {
			sil->sl_slept = 0;
		}
	}

	/* Reset the time sums */
	sil->sl_slpsum = 0;
	l->l_rticksum = 0;

	/*
	 * Re-estimate only threads on the time-sharing queue; the highest
	 * priority is excluded for performance reasons.
	 */
	if (l->l_priority >= PRI_HIGHEST_TS)
		return;
	KASSERT(l->l_class == SCHED_OTHER);

	/* If the thread is CPU-bound not for the first time, decrease its priority */
	prio = l->l_priority;
	if (batch && prio != 0)
		prio--;

	/* If the thread has not run for a second or more, set a high priority */
	if (l->l_stat == LSRUN) {
		if (sil->sl_lrtime && (hardclock_ticks - sil->sl_lrtime >= hz))
			prio = high_pri[prio];
		/* Re-enqueue the thread if the priority has changed */
		if (prio != l->l_priority)
			lwp_changepri(l, prio);
	} else {
		/* In other states, change the priority directly */
		l->l_priority = prio;
	}
}
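/*
 * Sketch of the batch heuristic above, with illustrative numbers assuming
 * hz = 100: a thread that accumulated l_rticksum = 80 run ticks against
 * sl_slpsum = 5 sleep ticks since the last pass only gets SL_BATCH set;
 * on the following pass, if it is still CPU-bound, its priority is
 * actually decremented, and if it never sleeps at all it becomes a
 * candidate for migration via l_target_cpu.
 */
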
void
sched_oncpu(lwp_t *l)
{
	sched_info_lwp_t *sil = l->l_sched_info;

	/* Update the counters */
	KASSERT(sil->sl_timeslice >= min_ts);
	KASSERT(sil->sl_timeslice <= max_ts);
	l->l_cpu->ci_schedstate.spc_ticks = sil->sl_timeslice;
}

/*
 * Time-driven events.
 */

/*
 * Called once per time-quantum.  This routine is CPU-local and runs at
 * IPL_SCHED, thus no locking is needed.
 */
void
sched_tick(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	struct lwp *l = curlwp;
	const sched_info_lwp_t *sil = l->l_sched_info;

	if (CURCPU_IDLE_P())
		return;

	switch (l->l_class) {
	case SCHED_FIFO:
		/*
		 * Update the time-quantum, and continue running,
		 * if the thread runs with the FIFO real-time policy.
		 */
		KASSERT(l->l_priority > PRI_HIGHEST_TS);
		spc->spc_ticks = sil->sl_timeslice;
		return;
	case SCHED_OTHER:
		/*
		 * If the thread is in the time-sharing queue, decrease the
		 * priority, and run with a higher time-quantum.
		 */
		KASSERT(l->l_priority <= PRI_HIGHEST_TS);
		if (l->l_priority != 0)
			l->l_priority--;
		break;
	}

	/*
	 * If there are threads of higher or equal priority on the run queue,
	 * or a migration is pending, mark that this thread should yield;
	 * otherwise, continue running.
	 */
	if (lwp_eprio(l) <= spc->spc_maxpriority || l->l_target_cpu) {
		spc->spc_flags |= SPCF_SHOULDYIELD;
		cpu_need_resched(ci, 0);
	} else
		spc->spc_ticks = sil->sl_timeslice;
}

/*
 * Sysctl nodes and initialization.
 */

static int
sysctl_sched_rtts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int rttsms = hztoms(rt_ts);

	node = *rnode;
	node.sysctl_data = &rttsms;
	return sysctl_lookup(SYSCTLFN_CALL(&node));
}

static int
sysctl_sched_mints(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(min_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 1 || newsize > hz || newsize >= max_ts)
		return EINVAL;

	/* It is safe to do this in this order */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	min_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

static int
sysctl_sched_maxts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(max_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 10 || newsize > hz || newsize <= min_ts)
		return EINVAL;

	/* It is safe to do this in this order */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	max_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

SYSCTL_SETUP(sysctl_sched_m2_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "kern", NULL,
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_EOL);
	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_STRING, "name", NULL,
		NULL, 0, __UNCONST("M2"), 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_INT, "rtts",
		SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"),
		sysctl_sched_rtts, 0, NULL, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "maxts",
		SYSCTL_DESCR("Maximal time quantum (in milliseconds)"),
		sysctl_sched_maxts, 0, &max_ts, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "mints",
		SYSCTL_DESCR("Minimal time quantum (in milliseconds)"),
		sysctl_sched_mints, 0, &min_ts, 0,
		CTL_CREATE, CTL_EOL);
}
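/*
 * A minimal userland sketch of how the knobs registered above can be read
 * and tuned (equivalent to "sysctl -w kern.sched.maxts=200"); the value of
 * 200ms and the error handling are illustrative only:
 *
 *	#include <sys/sysctl.h>
 *	#include <err.h>
 *	#include <stdio.h>
 *
 *	int
 *	main(void)
 *	{
 *		int maxts, newts = 200;
 *		size_t len = sizeof(maxts);
 *
 *		// Read the current maximal time quantum (in milliseconds)
 *		if (sysctlbyname("kern.sched.maxts", &maxts, &len,
 *		    NULL, 0) == -1)
 *			err(1, "sysctlbyname");
 *		printf("maxts = %d ms\n", maxts);
 *
 *		// Raise it to 200 ms (requires appropriate privileges)
 *		if (sysctlbyname("kern.sched.maxts", NULL, NULL,
 *		    &newts, sizeof(newts)) == -1)
 *			err(1, "sysctlbyname");
 *		return 0;
 *	}
 */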