/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"
#include "spdk/likely.h"
#include "spdk/event.h"
#include "spdk/log.h"
#include "spdk/env.h"

#include "spdk/thread.h"
#include "spdk_internal/event.h"
#include "spdk/scheduler.h"

static uint32_t g_main_lcore;

struct core_stats {
	uint64_t busy;
	uint64_t idle;
	uint32_t thread_count;
};

static struct core_stats *g_cores;

#define SCHEDULER_LOAD_LIMIT 20
#define SCHEDULER_CORE_LIMIT 80
#define SCHEDULER_CORE_BUSY 95

static uint8_t
_busy_pct(uint64_t busy, uint64_t idle)
{
	if ((busy + idle) == 0) {
		return 0;
	}

	return busy * 100 / (busy + idle);
}

static uint8_t
_get_thread_load(struct spdk_scheduler_thread_info *thread_info)
{
	uint64_t busy, idle;

	busy = thread_info->current_stats.busy_tsc;
	idle = thread_info->current_stats.idle_tsc;

	/* Return the percentage of time the thread was busy. */
	return _busy_pct(busy, idle);
}

typedef void (*_foreach_fn)(struct spdk_scheduler_thread_info *thread_info);

static void
_foreach_thread(struct spdk_scheduler_core_info *cores_info, _foreach_fn fn)
{
	struct spdk_scheduler_core_info *core;
	uint32_t i, j;

	SPDK_ENV_FOREACH_CORE(i) {
		core = &cores_info[i];
		for (j = 0; j < core->threads_count; j++) {
			fn(&core->thread_infos[j]);
		}
	}
}
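/*
 * Account for moving a thread onto dst_core by shifting the thread's
 * busy_tsc between the source and destination core_stats, so that fit
 * checks later in the same scheduling period see updated estimates.
 * With illustrative (made-up) numbers: moving a thread with busy_tsc = 30
 * from a core with busy = 50, idle = 50 to a core with busy = 20, idle = 80
 * leaves the source at busy = 20, idle = 80 and the destination at
 * busy = 50, idle = 50.
 */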
static void
_move_thread(struct spdk_scheduler_thread_info *thread_info, uint32_t dst_core)
{
	struct core_stats *dst = &g_cores[dst_core];
	struct core_stats *src = &g_cores[thread_info->lcore];
	uint64_t busy_tsc = thread_info->current_stats.busy_tsc;
	uint8_t busy_pct = _busy_pct(src->busy, src->idle);
	uint64_t tsc;

	if (src == dst) {
		/* Don't modify stats if the thread is already on that core. */
		return;
	}

	dst->busy += spdk_min(UINT64_MAX - dst->busy, busy_tsc);
	dst->idle -= spdk_min(dst->idle, busy_tsc);
	dst->thread_count++;

	/* Adjust the source core's busy/idle as if the thread was not present on it.
	 * The core load will then reflect the sum of all remaining threads on it. */
	src->busy -= spdk_min(src->busy, busy_tsc);
	src->idle += spdk_min(UINT64_MAX - src->idle, busy_tsc);

	if (busy_pct >= SCHEDULER_CORE_BUSY &&
	    _busy_pct(src->busy, src->idle) < SCHEDULER_CORE_LIMIT) {
		/* This core was so busy that we cannot assume all of the busy_tsc
		 * consumed by the moved thread will now become idle_tsc - it's
		 * very possible the remaining threads will use these cycles
		 * as busy_tsc.
		 *
		 * So make sure the updated estimate does not drop below
		 * SCHEDULER_CORE_LIMIT, so that other threads can't be moved
		 * to this core during this scheduling period.
		 */
		tsc = src->busy + src->idle;
		src->busy = tsc * SCHEDULER_CORE_LIMIT / 100;
		src->idle = tsc - src->busy;
	}
	assert(src->thread_count > 0);
	src->thread_count--;

	thread_info->lcore = dst_core;
}

static bool
_is_core_at_limit(uint32_t core_id)
{
	struct core_stats *core = &g_cores[core_id];
	uint64_t busy, idle;

	/* A core with no threads, or only a single thread, cannot be over the limit. */
	if (core->thread_count <= 1) {
		return false;
	}

	busy = core->busy;
	idle = core->idle;

	/* No work was done, exit before a possible division by 0. */
	if (busy == 0) {
		return false;
	}

	/* Work done was less than the limit. */
	if (_busy_pct(busy, idle) < SCHEDULER_CORE_LIMIT) {
		return false;
	}

	return true;
}

static bool
_can_core_fit_thread(struct spdk_scheduler_thread_info *thread_info, uint32_t dst_core)
{
	struct core_stats *dst = &g_cores[dst_core];
	uint64_t new_busy_tsc, new_idle_tsc;

	/* A thread can always fit on the core it's currently on. */
	if (thread_info->lcore == dst_core) {
		return true;
	}

	/* Reactors in interrupt mode do not update stats, so a thread can
	 * always fit onto a reactor in interrupt mode. */
	if (dst->busy + dst->idle == 0) {
		return true;
	}

	/* Core has no threads. */
	if (dst->thread_count == 0) {
		return true;
	}

	/* Core doesn't have enough idle_tsc to take this thread. */
	if (dst->idle < thread_info->current_stats.busy_tsc) {
		return false;
	}

	new_busy_tsc = dst->busy + thread_info->current_stats.busy_tsc;
	new_idle_tsc = dst->idle - thread_info->current_stats.busy_tsc;

	/* The core cannot fit this thread if doing so would put it over
	 * SCHEDULER_CORE_LIMIT. */
	return _busy_pct(new_busy_tsc, new_idle_tsc) < SCHEDULER_CORE_LIMIT;
}
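/*
 * Pick a destination core for an active thread, restricted to the thread's
 * cpumask. Preference order: the main core (to consolidate threads there),
 * then a lower core id than the current one, then - if the current core is
 * over SCHEDULER_CORE_LIMIT - any core that fits, falling back to the
 * least busy core. If nothing better is found, the thread stays where it is.
 */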
static uint32_t
_find_optimal_core(struct spdk_scheduler_thread_info *thread_info)
{
	uint32_t i;
	uint32_t current_lcore = thread_info->lcore;
	uint32_t least_busy_lcore = thread_info->lcore;
	struct spdk_thread *thread;
	struct spdk_cpuset *cpumask;
	bool core_at_limit = _is_core_at_limit(current_lcore);

	thread = spdk_thread_get_by_id(thread_info->thread_id);
	if (thread == NULL) {
		return current_lcore;
	}
	cpumask = spdk_thread_get_cpumask(thread);

	/* Find a core that can fit the thread. */
	SPDK_ENV_FOREACH_CORE(i) {
		/* Ignore cores outside the cpumask. */
		if (!spdk_cpuset_get_cpu(cpumask, i)) {
			continue;
		}

		/* Search for the least busy core. */
		if (g_cores[i].busy < g_cores[least_busy_lcore].busy) {
			least_busy_lcore = i;
		}

		/* Skip the current core and any core that cannot fit the thread. */
		if (!_can_core_fit_thread(thread_info, i) || i == current_lcore) {
			continue;
		}
		if (i == g_main_lcore) {
			/* Consider g_main_lcore first, to consolidate threads on the main lcore if possible. */
			return i;
		} else if (i < current_lcore && current_lcore != g_main_lcore) {
			/* A lower core id was found; move there to consolidate threads on the lowest core ids. */
			return i;
		} else if (core_at_limit) {
			/* When the current core is over the limit, any fitting core is better than the current one. */
			return i;
		}
	}

	/* If the current core is over the limit and no fitting core was found above,
	 * place the thread on the least busy core to balance the load. */
	if (core_at_limit) {
		return least_busy_lcore;
	}

	/* If no better core is found, remain on the same one. */
	return current_lcore;
}

static int
init(void)
{
	g_main_lcore = spdk_env_get_current_core();

	if (spdk_governor_set("dpdk_governor") != 0) {
		SPDK_NOTICELOG("Unable to initialize dpdk governor\n");
	}

	g_cores = calloc(spdk_env_get_last_core() + 1, sizeof(struct core_stats));
	if (g_cores == NULL) {
		SPDK_ERRLOG("Failed to allocate memory for dynamic scheduler core stats.\n");
		return -ENOMEM;
	}

	return 0;
}

static void
deinit(void)
{
	free(g_cores);
	g_cores = NULL;
	spdk_governor_set(NULL);
}

static void
_balance_idle(struct spdk_scheduler_thread_info *thread_info)
{
	if (_get_thread_load(thread_info) >= SCHEDULER_LOAD_LIMIT) {
		return;
	}
	/* This thread is idle, move it to the main core. */
	_move_thread(thread_info, g_main_lcore);
}

static void
_balance_active(struct spdk_scheduler_thread_info *thread_info)
{
	uint32_t target_lcore;

	if (_get_thread_load(thread_info) < SCHEDULER_LOAD_LIMIT) {
		return;
	}

	/* This thread is active. */
	target_lcore = _find_optimal_core(thread_info);
	_move_thread(thread_info, target_lcore);
}
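/*
 * Scheduler callback invoked by the event framework once per scheduling
 * period with a snapshot of per-core and per-thread TSC stats. It copies
 * the snapshot into g_cores, rebalances threads in two passes, picks the
 * interrupt/polled mode for each core, and finally asks the governor to
 * adjust the main core frequency.
 */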
static void
balance(struct spdk_scheduler_core_info *cores_info, uint32_t cores_count)
{
	struct spdk_reactor *reactor;
	struct spdk_governor *governor;
	struct spdk_scheduler_core_info *core;
	struct core_stats *main_core;
	uint32_t i;
	int rc;
	bool busy_threads_present = false;

	SPDK_ENV_FOREACH_CORE(i) {
		g_cores[i].thread_count = cores_info[i].threads_count;
		g_cores[i].busy = cores_info[i].current_busy_tsc;
		g_cores[i].idle = cores_info[i].current_idle_tsc;
	}
	main_core = &g_cores[g_main_lcore];

	/* Distribute threads in two passes, to make sure the updated core stats
	 * are considered on each pass.
	 * 1) Move all idle threads to the main core. */
	_foreach_thread(cores_info, _balance_idle);
	/* 2) Distribute active threads across all cores. */
	_foreach_thread(cores_info, _balance_active);

	/* Switch unused cores to interrupt mode and switch cores back to polled
	 * mode if they will be used after rebalancing. */
	SPDK_ENV_FOREACH_CORE(i) {
		reactor = spdk_reactor_get(i);
		core = &cores_info[i];
		/* We can switch the mode only if the reactor already does not have any threads. */
		if (g_cores[i].thread_count == 0 && TAILQ_EMPTY(&reactor->threads)) {
			core->interrupt_mode = true;
		} else if (g_cores[i].thread_count != 0) {
			core->interrupt_mode = false;
			if (i != g_main_lcore) {
				/* If a thread is present on a core other than g_main_lcore,
				 * it has to be busy. */
				busy_threads_present = true;
			}
		}
	}

	governor = spdk_governor_get();
	if (governor == NULL) {
		return;
	}

	/* Change the main core frequency if needed. */
	if (busy_threads_present) {
		rc = governor->set_core_freq_max(g_main_lcore);
		if (rc < 0) {
			SPDK_ERRLOG("setting maximum frequency for core %u failed\n", g_main_lcore);
		}
	} else if (main_core->busy > main_core->idle) {
		rc = governor->core_freq_up(g_main_lcore);
		if (rc < 0) {
			SPDK_ERRLOG("increasing frequency for core %u failed\n", g_main_lcore);
		}
	} else {
		rc = governor->core_freq_down(g_main_lcore);
		if (rc < 0) {
			SPDK_ERRLOG("lowering frequency for core %u failed\n", g_main_lcore);
		}
	}
}

static struct spdk_scheduler scheduler_dynamic = {
	.name = "dynamic",
	.init = init,
	.deinit = deinit,
	.balance = balance,
};

SPDK_SCHEDULER_REGISTER(scheduler_dynamic);
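/*
 * Usage sketch (assumed typical invocation; RPC options may differ between
 * SPDK versions): this scheduler can be selected at runtime with the
 * framework_set_scheduler RPC, e.g.
 *
 *     ./scripts/rpc.py framework_set_scheduler dynamic
 *
 * after which the reactor framework periodically calls balance() above.
 */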