/* $NetBSD: sched_m2.c,v 1.40 2024/01/24 16:11:48 christos Exp $ */

/*
 * Copyright (c) 2007, 2008 Mindaugas Rasiukevicius <rmind at NetBSD org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * TODO:
 *  - Implementation of fair share queue;
 *  - Support for NUMA;
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: sched_m2.c,v 1.40 2024/01/24 16:11:48 christos Exp $");

#include <sys/param.h>

#include <sys/cpu.h>
#include <sys/callout.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lwp.h>
#include <sys/mutex.h>
#include <sys/pool.h>
#include <sys/proc.h>
#include <sys/pset.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
#include <sys/syscallargs.h>
#include <sys/sysctl.h>
#include <sys/types.h>

/*
 * Priority related definitions.
 */
#define	PRI_TS_COUNT	(NPRI_USER)
#define	PRI_RT_COUNT	(PRI_COUNT - PRI_TS_COUNT)
#define	PRI_HTS_RANGE	(PRI_TS_COUNT / 10)

#define	PRI_HIGHEST_TS	(MAXPRI_USER)

/*
 * Time-slices and priorities.
 */
static u_int	min_ts;			/* Minimum time-slice */
static u_int	max_ts;			/* Maximum time-slice */
static u_int	ts_map[PRI_COUNT];	/* Map of time-slices */
static pri_t	high_pri[PRI_COUNT];	/* Map for priority increase */
u_int		sched_rrticks;		/* Real-time time-slice */

static void	sched_precalcts(void);

/*
 * Initialization and setup.
 */

void
sched_rqinit(void)
{
	if (hz < 100) {
		panic("sched_rqinit: value of HZ is too low\n");
	}

	/* Default timing ranges */
	min_ts = mstohz(20);		/*  ~20 ms */
	max_ts = mstohz(150);		/* ~150 ms */
	sched_rrticks = mstohz(100);	/* ~100 ms */
	sched_precalcts();

#ifdef notyet
	/* Need to set the name etc. This does not belong here */
	/* Attach the primary CPU here */
	sched_cpuattach(curcpu());
#endif

	sched_lwp_fork(NULL, &lwp0);
#ifdef notyet
	/* Without attaching the primary CPU, l_mutex does not get initialized */
	lwp_lock(&lwp0);
	sched_newts(&lwp0);
	lwp_unlock(&lwp0);
#else
	/* gross */
	lwp0.l_sched.timeslice = ts_map[lwp0.l_auxprio];
#endif
}

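/*
 * Worked example of the pre-calculation below (illustrative only;
 * assumes hz = 100 and NPRI_USER = 64, i.e. PRI_HIGHEST_TS = 63):
 * min_ts = mstohz(20) = 2 ticks and max_ts = mstohz(150) = 15 ticks,
 * so ts_map[0] = 15 ticks (~150 ms) for the lowest time-sharing
 * priority and ts_map[63] = 2 ticks (~20 ms) for the highest, with a
 * linear ramp in between.  high_pri[] compresses any time-sharing
 * priority into the top PRI_HTS_RANGE (64 / 10 = 6) levels of the
 * time-sharing band, i.e. the range 57..63.
 */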
/* Pre-calculate the time-slices for the priorities */
static void
sched_precalcts(void)
{
	pri_t p;

	/* Time-sharing range */
	for (p = 0; p <= PRI_HIGHEST_TS; p++) {
		ts_map[p] = max_ts -
		    (p * 100 / (PRI_TS_COUNT - 1) * (max_ts - min_ts) / 100);
		high_pri[p] = (PRI_HIGHEST_TS - PRI_HTS_RANGE) +
		    ((p * PRI_HTS_RANGE) / (PRI_TS_COUNT - 1));
	}

	/* Real-time range */
	for (p = (PRI_HIGHEST_TS + 1); p < PRI_COUNT; p++) {
		ts_map[p] = sched_rrticks;
		high_pri[p] = p;
	}
}

/*
 * Hooks.
 */

void
sched_proc_fork(struct proc *parent, struct proc *child)
{
	struct lwp *l;

	LIST_FOREACH(l, &child->p_lwps, l_sibling) {
		lwp_lock(l);
		sched_newts(l);
		lwp_unlock(l);
	}
}

void
sched_proc_exit(struct proc *child, struct proc *parent)
{

}

void
sched_lwp_fork(struct lwp *l1, struct lwp *l2)
{

}

void
sched_lwp_collect(struct lwp *l)
{

}

void
sched_setrunnable(struct lwp *l)
{

}

void
sched_schedclock(struct lwp *l)
{

}

/*
 * Priorities and time-slice.
 */

void
sched_nice(struct proc *p, int prio)
{
	struct lwp *l;
	int n;

	KASSERT(mutex_owned(p->p_lock));

	p->p_nice = prio;
	n = (prio - NZERO) >> 2;
	if (n == 0)
		return;

	LIST_FOREACH(l, &p->p_lwps, l_sibling) {
		lwp_lock(l);
		if (l->l_class == SCHED_OTHER) {
			pri_t pri = l->l_priority - n;
			pri = (n < 0) ? uimin(pri, PRI_HIGHEST_TS) : imax(pri, 0);
			lwp_changepri(l, pri);
		}
		lwp_unlock(l);
	}
}

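/*
 * Example of the nice handling above (illustrative; NZERO is 20 on
 * NetBSD): raising the nice value of a process from 0 to +8 stores
 * p_nice = 28, so n = (28 - 20) >> 2 = 2 and every SCHED_OTHER LWP is
 * lowered by two priority levels; nice -8 gives n = -2 and raises them
 * by two, clamped to the [0, PRI_HIGHEST_TS] range in both directions.
 */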
/* Recalculate the time-slice */
void
sched_newts(struct lwp *l)
{

	l->l_sched.timeslice = ts_map[lwp_eprio(l)];
}

void
sched_slept(struct lwp *l)
{

	/*
	 * If the thread is in the time-sharing queue and the batch flag
	 * is not set, increase its priority, so that it runs with a
	 * shorter time-quantum.
	 */
	if (l->l_priority < PRI_HIGHEST_TS && (l->l_flag & LW_BATCH) == 0) {
		struct proc *p = l->l_proc;

		KASSERT(l->l_class == SCHED_OTHER);
		if (__predict_false(p->p_nice < NZERO)) {
			const int n = uimax((NZERO - p->p_nice) >> 2, 1);
			l->l_priority = uimin(l->l_priority + n, PRI_HIGHEST_TS);
		} else {
			l->l_priority++;
		}
	}
}

void
sched_wakeup(struct lwp *l)
{

	/* If the thread slept for a second or more, set a high priority */
	if (l->l_slptime >= 1)
		l->l_priority = high_pri[l->l_priority];
}

void
sched_pstats_hook(struct lwp *l, int batch)
{
	pri_t prio;

	/*
	 * Adjust only threads in the time-sharing queue and, for
	 * performance reasons, skip those already at the highest
	 * time-sharing priority.
	 */
	KASSERT(lwp_locked(l, NULL));
	if (l->l_priority >= PRI_HIGHEST_TS)
		return;
	KASSERT(l->l_class == SCHED_OTHER);

	/* If it is CPU-bound, and not for the first time, decrease the priority */
	prio = l->l_priority;
	if (batch && prio != 0)
		prio--;

	/* If the thread has not run for a second or more, set a high priority */
	if (l->l_stat == LSRUN) {
		if (l->l_rticks && (getticks() - l->l_rticks >= hz))
			prio = high_pri[prio];
		/* Re-enqueue the thread if the priority has changed */
		if (prio != l->l_priority)
			lwp_changepri(l, prio);
	} else {
		/* In other states, change the priority directly */
		l->l_priority = prio;
	}
}

void
sched_oncpu(lwp_t *l)
{
	struct schedstate_percpu *spc = &l->l_cpu->ci_schedstate;

	/* Update the counters */
	KASSERT(l->l_sched.timeslice >= min_ts);
	KASSERT(l->l_sched.timeslice <= max_ts);
	spc->spc_ticks = l->l_sched.timeslice;
}

/*
 * Time-driven events.
 */

/*
 * Called once per time-quantum, with the running LWP lock held (spc_lwplock).
 */
void
sched_tick(struct cpu_info *ci)
{
	struct schedstate_percpu *spc = &ci->ci_schedstate;
	struct lwp *l = ci->ci_onproc;
	struct proc *p;

	if (__predict_false(CURCPU_IDLE_P()))
		return;

	lwp_lock(l);
	KASSERT(l->l_mutex != spc->spc_mutex);
	switch (l->l_class) {
	case SCHED_FIFO:
		/*
		 * A thread running under the FIFO real-time policy is
		 * not preempted here: just replenish its time-quantum
		 * and let it continue running.
		 */
		KASSERT(l->l_priority > PRI_HIGHEST_TS);
		spc->spc_ticks = l->l_sched.timeslice;
		lwp_unlock(l);
		return;
	case SCHED_OTHER:
		/*
		 * If the thread is in the time-sharing queue, decrease
		 * its priority, so that it runs with a longer
		 * time-quantum.
		 */
		KASSERT(l->l_priority <= PRI_HIGHEST_TS);
		if (l->l_priority == 0)
			break;

		p = l->l_proc;
		if (__predict_false(p->p_nice > NZERO)) {
			const int n = uimax((p->p_nice - NZERO) >> 2, 1);
			l->l_priority = imax(l->l_priority - n, 0);
		} else
			l->l_priority--;
		break;
	}

	/*
	 * If there are higher-priority threads, or other threads in the
	 * same queue, mark that the thread should yield; otherwise, let
	 * it continue running.
	 */
	if (lwp_eprio(l) <= spc->spc_maxpriority || l->l_target_cpu) {
		spc->spc_flags |= SPCF_SHOULDYIELD;
		spc_lock(ci);
		sched_resched_cpu(ci, MAXPRI_KTHREAD, true);
		/* spc now unlocked */
	} else
		spc->spc_ticks = l->l_sched.timeslice;
	lwp_unlock(l);
}

/*
 * Sysctl nodes and initialization.
 */

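/*
 * The handlers below back the kern.sched.* nodes created by
 * SYSCTL_SETUP at the end of this file.  Values are exchanged in
 * milliseconds and converted to ticks with mstohz(9) before the range
 * checks are applied.  Illustrative shell usage (not part of this
 * file's interface):
 *
 *	sysctl kern.sched.rtts
 *	sysctl -w kern.sched.maxts=200
 */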
static int
sysctl_sched_rtts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	int rttsms = hztoms(sched_rrticks);

	node = *rnode;
	node.sysctl_data = &rttsms;
	return sysctl_lookup(SYSCTLFN_CALL(&node));
}

static int
sysctl_sched_mints(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(min_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 1 || newsize > hz || newsize >= max_ts)
		return EINVAL;

	/* It is safe to do this in this order */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	min_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

static int
sysctl_sched_maxts(SYSCTLFN_ARGS)
{
	struct sysctlnode node;
	struct cpu_info *ci;
	int error, newsize;
	CPU_INFO_ITERATOR cii;

	node = *rnode;
	node.sysctl_data = &newsize;

	newsize = hztoms(max_ts);
	error = sysctl_lookup(SYSCTLFN_CALL(&node));
	if (error || newp == NULL)
		return error;

	newsize = mstohz(newsize);
	if (newsize < 10 || newsize > hz || newsize <= min_ts)
		return EINVAL;

	/* It is safe to do this in this order */
	for (CPU_INFO_FOREACH(cii, ci))
		spc_lock(ci);

	max_ts = newsize;
	sched_precalcts();

	for (CPU_INFO_FOREACH(cii, ci))
		spc_unlock(ci);

	return 0;
}

SYSCTL_SETUP(sysctl_sched_m2_setup, "sysctl sched setup")
{
	const struct sysctlnode *node = NULL;

	sysctl_createv(clog, 0, NULL, &node,
		CTLFLAG_PERMANENT,
		CTLTYPE_NODE, "sched",
		SYSCTL_DESCR("Scheduler options"),
		NULL, 0, NULL, 0,
		CTL_KERN, CTL_CREATE, CTL_EOL);

	if (node == NULL)
		return;

	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_STRING, "name", NULL,
		NULL, 0, __UNCONST("M2"), 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT,
		CTLTYPE_INT, "rtts",
		SYSCTL_DESCR("Round-robin time quantum (in milliseconds)"),
		sysctl_sched_rtts, 0, NULL, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "maxts",
		SYSCTL_DESCR("Maximum time quantum (in milliseconds)"),
		sysctl_sched_maxts, 0, &max_ts, 0,
		CTL_CREATE, CTL_EOL);
	sysctl_createv(NULL, 0, &node, NULL,
		CTLFLAG_PERMANENT | CTLFLAG_READWRITE,
		CTLTYPE_INT, "mints",
		SYSCTL_DESCR("Minimum time quantum (in milliseconds)"),
		sysctl_sched_mints, 0, &min_ts, 0,
		CTL_CREATE, CTL_EOL);
}