/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"
#include "spdk/likely.h"
#include "spdk/event.h"
#include "spdk/log.h"
#include "spdk/env.h"

#include "spdk/thread.h"
#include "spdk_internal/event.h"
#include "spdk/scheduler.h"
#include "spdk_internal/usdt.h"

static uint32_t g_main_lcore;

struct core_stats {
	uint64_t busy;
	uint64_t idle;
	uint32_t thread_count;
};

static struct core_stats *g_cores;
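
/* Tunable thresholds, each expressed as a percentage of busy TSC:
 * - g_scheduler_load_limit: a thread below this load is treated as idle and
 *   gathered onto the main core.
 * - g_scheduler_core_limit: a core at or above this load is considered full;
 *   no additional threads are placed on it.
 * - g_scheduler_core_busy: when a core at or above this load gives up a
 *   thread, its estimated remaining load is not allowed to drop below
 *   g_scheduler_core_limit (see _move_thread()).
 */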
uint8_t g_scheduler_load_limit = 20;
uint8_t g_scheduler_core_limit = 80;
uint8_t g_scheduler_core_busy = 95;

static uint8_t
_busy_pct(uint64_t busy, uint64_t idle)
{
	if ((busy + idle) == 0) {
		return 0;
	}

	return busy * 100 / (busy + idle);
}

static uint8_t
_get_thread_load(struct spdk_scheduler_thread_info *thread_info)
{
	uint64_t busy, idle;

	busy = thread_info->current_stats.busy_tsc;
	idle = thread_info->current_stats.idle_tsc;

	/* return percentage of time thread was busy */
	return _busy_pct(busy, idle);
}

typedef void (*_foreach_fn)(struct spdk_scheduler_thread_info *thread_info);

static void
_foreach_thread(struct spdk_scheduler_core_info *cores_info, _foreach_fn fn)
{
	struct spdk_scheduler_core_info *core;
	uint32_t i, j;

	SPDK_ENV_FOREACH_CORE(i) {
		core = &cores_info[i];
		for (j = 0; j < core->threads_count; j++) {
			fn(&core->thread_infos[j]);
		}
	}
}

static void
_move_thread(struct spdk_scheduler_thread_info *thread_info, uint32_t dst_core)
{
	struct core_stats *dst = &g_cores[dst_core];
	struct core_stats *src = &g_cores[thread_info->lcore];
	uint64_t busy_tsc = thread_info->current_stats.busy_tsc;
	uint8_t busy_pct = _busy_pct(src->busy, src->idle);
	uint64_t tsc;

	SPDK_DTRACE_PROBE2(dynsched_move, thread_info, dst_core);

	if (src == dst) {
		/* Don't modify stats if thread is already on that core. */
		return;
	}

	dst->busy += spdk_min(UINT64_MAX - dst->busy, busy_tsc);
	dst->idle -= spdk_min(dst->idle, busy_tsc);
	dst->thread_count++;

	/* Adjust busy/idle from core as if thread was not present on it.
	 * Core load will reflect the sum of all remaining threads on it. */
	src->busy -= spdk_min(src->busy, busy_tsc);
	src->idle += spdk_min(UINT64_MAX - src->idle, busy_tsc);

	if (busy_pct >= g_scheduler_core_busy &&
	    _busy_pct(src->busy, src->idle) < g_scheduler_core_limit) {
		/* This core was so busy that we cannot assume all of busy_tsc
		 * consumed by the moved thread will now be idle_tsc - it's
		 * very possible the remaining threads will use these cycles
		 * as busy_tsc.
		 *
		 * So make sure we don't drop the updated estimate below
		 * g_scheduler_core_limit, so that other cores can't
		 * move threads to this core during this scheduling
		 * period.
		 */
		tsc = src->busy + src->idle;
		src->busy = tsc * g_scheduler_core_limit / 100;
		src->idle = tsc - src->busy;
	}
	assert(src->thread_count > 0);
	src->thread_count--;

	thread_info->lcore = dst_core;
}

static bool
_is_core_at_limit(uint32_t core_id)
{
	struct core_stats *core = &g_cores[core_id];
	uint64_t busy, idle;

	/* Core with no or single thread cannot be over the limit. */
	if (core->thread_count <= 1) {
		return false;
	}

	busy = core->busy;
	idle = core->idle;

	/* No work was done, exit before possible division by 0. */
	if (busy == 0) {
		return false;
	}

	/* Work done was less than the limit */
	if (_busy_pct(busy, idle) < g_scheduler_core_limit) {
		return false;
	}

	return true;
}

static bool
_can_core_fit_thread(struct spdk_scheduler_thread_info *thread_info, uint32_t dst_core)
{
	struct core_stats *dst = &g_cores[dst_core];
	uint64_t new_busy_tsc, new_idle_tsc;

	/* Thread can always fit on the core it's currently on. */
	if (thread_info->lcore == dst_core) {
		return true;
	}

	/* Reactors in interrupt mode do not update stats,
	 * so a thread can always fit into a reactor in interrupt mode. */
	if (dst->busy + dst->idle == 0) {
		return true;
	}

	/* Core has no threads. */
	if (dst->thread_count == 0) {
		return true;
	}

	/* Core doesn't have enough idle_tsc to take this thread. */
	if (dst->idle < thread_info->current_stats.busy_tsc) {
		return false;
	}

	new_busy_tsc = dst->busy + thread_info->current_stats.busy_tsc;
	new_idle_tsc = dst->idle - thread_info->current_stats.busy_tsc;

	/* Core cannot fit this thread if it would put it over the
	 * g_scheduler_core_limit. */
	return _busy_pct(new_busy_tsc, new_idle_tsc) < g_scheduler_core_limit;
}
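
/* Select a destination core for an active thread, honoring its cpumask.
 * Preference order: the main core first, then a lower-numbered core (to
 * consolidate threads on the lowest core ids), then any core that fits when
 * the current core is over g_scheduler_core_limit. If an over-limit core has
 * no suitable destination, fall back to the least busy core; otherwise keep
 * the thread where it is.
 */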
static uint32_t
_find_optimal_core(struct spdk_scheduler_thread_info *thread_info)
{
	uint32_t i;
	uint32_t current_lcore = thread_info->lcore;
	uint32_t least_busy_lcore = thread_info->lcore;
	struct spdk_thread *thread;
	struct spdk_cpuset *cpumask;
	bool core_at_limit = _is_core_at_limit(current_lcore);

	thread = spdk_thread_get_by_id(thread_info->thread_id);
	if (thread == NULL) {
		return current_lcore;
	}
	cpumask = spdk_thread_get_cpumask(thread);

	/* Find a core that can fit the thread. */
	SPDK_ENV_FOREACH_CORE(i) {
		/* Ignore cores outside cpumask. */
		if (!spdk_cpuset_get_cpu(cpumask, i)) {
			continue;
		}

		/* Search for least busy core. */
		if (g_cores[i].busy < g_cores[least_busy_lcore].busy) {
			least_busy_lcore = i;
		}

		/* Skip the current core and any core that cannot fit the thread. */
		if (!_can_core_fit_thread(thread_info, i) || i == current_lcore) {
			continue;
		}
		if (i == g_main_lcore) {
			/* First consider g_main_lcore, consolidate threads on main lcore if possible. */
			return i;
		} else if (i < current_lcore && current_lcore != g_main_lcore) {
			/* Lower core id was found, move to consolidate threads on lowest core ids. */
			return i;
		} else if (core_at_limit) {
			/* When core is over the limit, any core id is better than current one. */
			return i;
		}
	}

	/* For cores over the limit, place the thread on the least busy core
	 * to balance threads. */
	if (core_at_limit) {
		return least_busy_lcore;
	}

	/* If no better core is found, remain on the same one. */
	return current_lcore;
}

static int
init(void)
{
	g_main_lcore = spdk_env_get_current_core();

	if (spdk_governor_set("dpdk_governor") != 0) {
		SPDK_NOTICELOG("Unable to initialize dpdk governor\n");
	}

	g_cores = calloc(spdk_env_get_last_core() + 1, sizeof(struct core_stats));
	if (g_cores == NULL) {
		SPDK_ERRLOG("Failed to allocate memory for dynamic scheduler core stats.\n");
		return -ENOMEM;
	}

	return 0;
}

static void
deinit(void)
{
	free(g_cores);
	g_cores = NULL;
	spdk_governor_set(NULL);
}

static void
_balance_idle(struct spdk_scheduler_thread_info *thread_info)
{
	if (_get_thread_load(thread_info) >= g_scheduler_load_limit) {
		return;
	}
	/* This thread is idle, move it to the main core. */
	_move_thread(thread_info, g_main_lcore);
}

static void
_balance_active(struct spdk_scheduler_thread_info *thread_info)
{
	uint32_t target_lcore;

	if (_get_thread_load(thread_info) < g_scheduler_load_limit) {
		return;
	}

	/* This thread is active. */
	target_lcore = _find_optimal_core(thread_info);
	_move_thread(thread_info, target_lcore);
}
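
/* Scheduling period callback. Refreshes per-core stats, rebalances threads in
 * two passes (idle threads onto the main core, then active threads across the
 * remaining cores), switches reactors between interrupt and polled mode based
 * on the result, and finally asks the governor to adjust the main core
 * frequency.
 */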
static void
balance(struct spdk_scheduler_core_info *cores_info, uint32_t cores_count)
{
	struct spdk_reactor *reactor;
	struct spdk_governor *governor;
	struct spdk_scheduler_core_info *core;
	struct core_stats *main_core;
	uint32_t i;
	int rc;
	bool busy_threads_present = false;

	SPDK_DTRACE_PROBE1(dynsched_balance, cores_count);

	SPDK_ENV_FOREACH_CORE(i) {
		g_cores[i].thread_count = cores_info[i].threads_count;
		g_cores[i].busy = cores_info[i].current_busy_tsc;
		g_cores[i].idle = cores_info[i].current_idle_tsc;
		SPDK_DTRACE_PROBE2(dynsched_core_info, i, &cores_info[i]);
	}
	main_core = &g_cores[g_main_lcore];

	/* Distribute threads in two passes to make sure updated core stats are considered on each pass.
	 * 1) Move all idle threads to main core. */
	_foreach_thread(cores_info, _balance_idle);
	/* 2) Distribute active threads across all cores. */
	_foreach_thread(cores_info, _balance_active);

	/* Switch unused cores to interrupt mode and switch cores to polled mode
	 * if they will be used after rebalancing. */
	SPDK_ENV_FOREACH_CORE(i) {
		reactor = spdk_reactor_get(i);
		core = &cores_info[i];
		/* We can switch mode only if the reactor does not already have any threads */
		if (g_cores[i].thread_count == 0 && TAILQ_EMPTY(&reactor->threads)) {
			core->interrupt_mode = true;
		} else if (g_cores[i].thread_count != 0) {
			core->interrupt_mode = false;
			if (i != g_main_lcore) {
				/* If a thread is present on a core other than g_main_lcore,
				 * it has to be busy. */
				busy_threads_present = true;
			}
		}
	}

	governor = spdk_governor_get();
	if (governor == NULL) {
		return;
	}

	/* Change main core frequency if needed */
	if (busy_threads_present) {
		rc = governor->set_core_freq_max(g_main_lcore);
		if (rc < 0) {
			SPDK_ERRLOG("setting default frequency for core %u failed\n", g_main_lcore);
		}
	} else if (main_core->busy > main_core->idle) {
		rc = governor->core_freq_up(g_main_lcore);
		if (rc < 0) {
			SPDK_ERRLOG("increasing frequency for core %u failed\n", g_main_lcore);
		}
	} else {
		rc = governor->core_freq_down(g_main_lcore);
		if (rc < 0) {
			SPDK_ERRLOG("lowering frequency for core %u failed\n", g_main_lcore);
		}
	}
}

struct json_scheduler_opts {
	uint8_t load_limit;
	uint8_t core_limit;
	uint8_t core_busy;
};

static const struct spdk_json_object_decoder sched_decoders[] = {
	{"load_limit", offsetof(struct json_scheduler_opts, load_limit), spdk_json_decode_uint8, true},
	{"core_limit", offsetof(struct json_scheduler_opts, core_limit), spdk_json_decode_uint8, true},
	{"core_busy", offsetof(struct json_scheduler_opts, core_busy), spdk_json_decode_uint8, true},
};

static int
set_opts(const struct spdk_json_val *opts)
{
	struct json_scheduler_opts scheduler_opts;

	scheduler_opts.load_limit = g_scheduler_load_limit;
	scheduler_opts.core_limit = g_scheduler_core_limit;
	scheduler_opts.core_busy = g_scheduler_core_busy;

	if (opts != NULL) {
		if (spdk_json_decode_object_relaxed(opts, sched_decoders,
						    SPDK_COUNTOF(sched_decoders), &scheduler_opts)) {
			SPDK_ERRLOG("Decoding scheduler opts JSON failed\n");
			return -1;
		}
	}

	SPDK_NOTICELOG("Setting scheduler load limit to %d\n", scheduler_opts.load_limit);
	g_scheduler_load_limit = scheduler_opts.load_limit;
	SPDK_NOTICELOG("Setting scheduler core limit to %d\n", scheduler_opts.core_limit);
	g_scheduler_core_limit = scheduler_opts.core_limit;
	SPDK_NOTICELOG("Setting scheduler core busy to %d\n", scheduler_opts.core_busy);
	g_scheduler_core_busy = scheduler_opts.core_busy;

	return 0;
}

static void
get_opts(struct spdk_json_write_ctx *ctx)
{
	spdk_json_write_named_uint8(ctx, "load_limit", g_scheduler_load_limit);
	spdk_json_write_named_uint8(ctx, "core_limit", g_scheduler_core_limit);
	spdk_json_write_named_uint8(ctx, "core_busy", g_scheduler_core_busy);
}

static struct spdk_scheduler scheduler_dynamic = {
	.name = "dynamic",
	.init = init,
	.deinit = deinit,
	.balance = balance,
	.set_opts = set_opts,
	.get_opts = get_opts,
};

SPDK_SCHEDULER_REGISTER(scheduler_dynamic);
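
/* Illustrative example of the options object accepted by set_opts() above.
 * Field names match sched_decoders, every field is optional, values are
 * percentages, and unknown keys are ignored by the relaxed decoder:
 *
 *   { "load_limit": 20, "core_limit": 80, "core_busy": 95 }
 */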