xref: /spdk/module/scheduler/dynamic/scheduler_dynamic.c (revision 7506a7aa53d239f533af3bc768f0d2af55e735fe)
/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"
#include "spdk/likely.h"
#include "spdk/event.h"
#include "spdk/log.h"
#include "spdk/env.h"

#include "spdk/thread.h"
#include "spdk_internal/event.h"
#include "spdk/scheduler.h"
#include "spdk_internal/usdt.h"

static uint32_t g_main_lcore;

struct core_stats {
	uint64_t busy;
	uint64_t idle;
	uint32_t thread_count;
};

static struct core_stats *g_cores;

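/*
 * Scheduler tunables, all expressed as a percentage of a core's total
 * (busy + idle) TSC time:
 * - g_scheduler_load_limit: threads below this load are treated as idle and
 *   are consolidated onto the main core.
 * - g_scheduler_core_limit: target upper bound on a core's load when placing
 *   active threads.
 * - g_scheduler_core_busy: a source core at or above this load is treated as
 *   saturated when its stats are adjusted after a thread is moved away.
 */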
uint8_t g_scheduler_load_limit = 20;
uint8_t g_scheduler_core_limit = 80;
uint8_t g_scheduler_core_busy = 95;

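/* Return busy time as a percentage of total (busy + idle) time, using integer
 * math. For example, busy = 900 and idle = 2100 gives 900 * 100 / 3000 = 30;
 * the division truncates toward zero. */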
static uint8_t
_busy_pct(uint64_t busy, uint64_t idle)
{
	if ((busy + idle) == 0) {
		return 0;
	}

	return busy * 100 / (busy + idle);
}

static uint8_t
_get_thread_load(struct spdk_scheduler_thread_info *thread_info)
{
	uint64_t busy, idle;

	busy = thread_info->current_stats.busy_tsc;
	idle = thread_info->current_stats.idle_tsc;

	/* return percentage of time thread was busy */
	return _busy_pct(busy, idle);
}

typedef void (*_foreach_fn)(struct spdk_scheduler_thread_info *thread_info);

static void
_foreach_thread(struct spdk_scheduler_core_info *cores_info, _foreach_fn fn)
{
	struct spdk_scheduler_core_info *core;
	uint32_t i, j;

	SPDK_ENV_FOREACH_CORE(i) {
		core = &cores_info[i];
		for (j = 0; j < core->threads_count; j++) {
			fn(&core->thread_infos[j]);
		}
	}
}

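/*
 * Transfer the thread's busy_tsc from the source core's accounting to the
 * destination core, treating the destination's idle cycles as consumed.
 * A rough worked example with the default limits (20/80/95): a thread with
 * busy_tsc = 400 moves from src (busy = 960, idle = 40, i.e. 96% busy) to
 * dst (busy = 100, idle = 900). dst becomes busy = 500, idle = 500; src
 * becomes busy = 560, idle = 440 (56%). Because src was at or above
 * g_scheduler_core_busy (95%) and the estimate dropped below
 * g_scheduler_core_limit (80%), src is clamped back to busy = 800,
 * idle = 200 so other threads are not moved onto it during this period.
 */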
static void
_move_thread(struct spdk_scheduler_thread_info *thread_info, uint32_t dst_core)
{
	struct core_stats *dst = &g_cores[dst_core];
	struct core_stats *src = &g_cores[thread_info->lcore];
	uint64_t busy_tsc = thread_info->current_stats.busy_tsc;
	uint8_t busy_pct = _busy_pct(src->busy, src->idle);
	uint64_t tsc;

	SPDK_DTRACE_PROBE2(dynsched_move, thread_info, dst_core);

	if (src == dst) {
		/* Don't modify stats if thread is already on that core. */
		return;
	}

	dst->busy += spdk_min(UINT64_MAX - dst->busy, busy_tsc);
	dst->idle -= spdk_min(dst->idle, busy_tsc);
	dst->thread_count++;

	/* Adjust the source core's busy/idle as if the thread was not present on it.
	 * The core load will then reflect the sum of all remaining threads on it. */
	src->busy -= spdk_min(src->busy, busy_tsc);
	src->idle += spdk_min(UINT64_MAX - src->idle, busy_tsc);

	if (busy_pct >= g_scheduler_core_busy &&
	    _busy_pct(src->busy, src->idle) < g_scheduler_core_limit) {
		/* This core was so busy that we cannot assume all of busy_tsc
		 * consumed by the moved thread will now be idle_tsc - it's
		 * very possible the remaining threads will use these cycles
		 * as busy_tsc.
		 *
		 * So make sure we don't drop the updated estimate below
		 * g_scheduler_core_limit, so that other cores can't
		 * move threads to this core during this scheduling
		 * period.
		 */
		tsc = src->busy + src->idle;
		src->busy = tsc * g_scheduler_core_limit / 100;
		src->idle = tsc - src->busy;
	}
	assert(src->thread_count > 0);
	src->thread_count--;

	thread_info->lcore = dst_core;
}

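/* A core counts as over the limit only when it hosts more than one thread and
 * its busy percentage is at or above g_scheduler_core_limit. */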
static bool
_is_core_at_limit(uint32_t core_id)
{
	struct core_stats *core = &g_cores[core_id];
	uint64_t busy, idle;

	/* A core with no threads, or only a single thread, cannot be over the limit. */
	if (core->thread_count <= 1) {
		return false;
	}

	busy = core->busy;
	idle = core->idle;

	/* No work was done, exit before possible division by 0. */
	if (busy == 0) {
		return false;
	}

	/* Work done was less than the limit. */
	if (_busy_pct(busy, idle) < g_scheduler_core_limit) {
		return false;
	}

	return true;
}

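/* A destination core can take a thread only if it has enough idle TSC to
 * absorb the thread's busy TSC without crossing g_scheduler_core_limit.
 * Cores with no recorded stats (interrupt mode) or with no threads always
 * fit. For example, dst busy = 200, idle = 800 and a thread with
 * busy_tsc = 300 gives a new estimate of 500/500, i.e. 50% < 80%, so the
 * thread fits under the default core limit. */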
static bool
_can_core_fit_thread(struct spdk_scheduler_thread_info *thread_info, uint32_t dst_core)
{
	struct core_stats *dst = &g_cores[dst_core];
	uint64_t new_busy_tsc, new_idle_tsc;

	/* Thread can always fit on the core it's currently on. */
	if (thread_info->lcore == dst_core) {
		return true;
	}

	/* Reactors in interrupt mode do not update stats,
	 * so a thread can always fit onto a reactor in interrupt mode. */
	if (dst->busy + dst->idle == 0) {
		return true;
	}

	/* Core has no threads. */
	if (dst->thread_count == 0) {
		return true;
	}

	/* Core doesn't have enough idle_tsc to take this thread. */
	if (dst->idle < thread_info->current_stats.busy_tsc) {
		return false;
	}

	new_busy_tsc = dst->busy + thread_info->current_stats.busy_tsc;
	new_idle_tsc = dst->idle - thread_info->current_stats.busy_tsc;

	/* Core cannot fit this thread if it would put it over the
	 * g_scheduler_core_limit. */
	return _busy_pct(new_busy_tsc, new_idle_tsc) < g_scheduler_core_limit;
}

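/*
 * Pick a new core for an active thread, restricted to the thread's cpumask.
 * Among cores that can fit the thread, the main core is preferred first,
 * then a lower-numbered core (when the thread is not already on the main
 * core) to consolidate threads on the lowest core ids, and, only when the
 * current core is over the limit, any other fitting core. If the current
 * core is over the limit but nothing fits, fall back to the least busy core;
 * otherwise stay on the current core.
 */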
static uint32_t
_find_optimal_core(struct spdk_scheduler_thread_info *thread_info)
{
	uint32_t i;
	uint32_t current_lcore = thread_info->lcore;
	uint32_t least_busy_lcore = thread_info->lcore;
	struct spdk_thread *thread;
	struct spdk_cpuset *cpumask;
	bool core_at_limit = _is_core_at_limit(current_lcore);

	thread = spdk_thread_get_by_id(thread_info->thread_id);
	if (thread == NULL) {
		return current_lcore;
	}
	cpumask = spdk_thread_get_cpumask(thread);

	/* Find a core that can fit the thread. */
	SPDK_ENV_FOREACH_CORE(i) {
		/* Ignore cores outside cpumask. */
		if (!spdk_cpuset_get_cpu(cpumask, i)) {
			continue;
		}

		/* Search for least busy core. */
		if (g_cores[i].busy < g_cores[least_busy_lcore].busy) {
			least_busy_lcore = i;
		}

		/* Skip the current core and any core that cannot fit the thread. */
		if (!_can_core_fit_thread(thread_info, i) || i == current_lcore) {
			continue;
		}
		if (i == g_main_lcore) {
			/* Consider g_main_lcore first; consolidate threads on the main lcore if possible. */
			return i;
		} else if (i < current_lcore && current_lcore != g_main_lcore) {
			/* A lower core id was found; move there to consolidate threads on the lowest core ids. */
			return i;
		} else if (core_at_limit) {
			/* When the core is over the limit, any fitting core is better than the current one. */
			return i;
		}
	}

	/* The current core is over the limit, but no other core could fit the
	 * thread; place it on the least busy core to balance the load. */
	if (core_at_limit) {
		return least_busy_lcore;
	}

	/* If no better core is found, remain on the same one. */
	return current_lcore;
}

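/* init() records the core it runs on as g_main_lcore and allocates the
 * per-core stats array sized by the highest core id in use. The dpdk_governor
 * is optional, so failure to set it is only logged as a notice. */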
static int
init(void)
{
	g_main_lcore = spdk_env_get_current_core();

	if (spdk_governor_set("dpdk_governor") != 0) {
		SPDK_NOTICELOG("Unable to initialize dpdk governor\n");
	}

	g_cores = calloc(spdk_env_get_last_core() + 1, sizeof(struct core_stats));
	if (g_cores == NULL) {
		SPDK_ERRLOG("Failed to allocate memory for dynamic scheduler core stats.\n");
		return -ENOMEM;
	}

	return 0;
}

static void
deinit(void)
{
	free(g_cores);
	g_cores = NULL;
	spdk_governor_set(NULL);
}

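/* Pass 1 of balancing: any thread whose load is below g_scheduler_load_limit
 * is considered idle and is consolidated onto the main core. Pass 2
 * (_balance_active below) redistributes the remaining, active threads. */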
static void
_balance_idle(struct spdk_scheduler_thread_info *thread_info)
{
	if (_get_thread_load(thread_info) >= g_scheduler_load_limit) {
		return;
	}
	/* This thread is idle, move it to the main core. */
	_move_thread(thread_info, g_main_lcore);
}

static void
_balance_active(struct spdk_scheduler_thread_info *thread_info)
{
	uint32_t target_lcore;

	if (_get_thread_load(thread_info) < g_scheduler_load_limit) {
		return;
	}

	/* This thread is active. */
	target_lcore = _find_optimal_core(thread_info);
	_move_thread(thread_info, target_lcore);
}

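/*
 * Entry point invoked by the scheduler framework every scheduling period.
 * It snapshots per-core stats, runs the two balancing passes, switches cores
 * with no threads to interrupt mode (and cores that gained threads back to
 * polled mode), and finally asks the governor to adjust the main core's
 * frequency: maximum when busy threads remain on other cores, otherwise up
 * or down depending on the main core's own load.
 */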
static void
balance(struct spdk_scheduler_core_info *cores_info, uint32_t cores_count)
{
	struct spdk_reactor *reactor;
	struct spdk_governor *governor;
	struct spdk_scheduler_core_info *core;
	struct core_stats *main_core;
	uint32_t i;
	int rc;
	bool busy_threads_present = false;

	SPDK_DTRACE_PROBE1(dynsched_balance, cores_count);

	SPDK_ENV_FOREACH_CORE(i) {
		g_cores[i].thread_count = cores_info[i].threads_count;
		g_cores[i].busy = cores_info[i].current_busy_tsc;
		g_cores[i].idle = cores_info[i].current_idle_tsc;
		SPDK_DTRACE_PROBE2(dynsched_core_info, i, &cores_info[i]);
	}
	main_core = &g_cores[g_main_lcore];

	/* Distribute threads in two passes, to make sure updated core stats are considered on each pass.
	 * 1) Move all idle threads to the main core. */
	_foreach_thread(cores_info, _balance_idle);
	/* 2) Distribute active threads across all cores. */
	_foreach_thread(cores_info, _balance_active);

	/* Switch unused cores to interrupt mode and switch cores to polled mode
	 * if they will be used after rebalancing. */
	SPDK_ENV_FOREACH_CORE(i) {
		reactor = spdk_reactor_get(i);
		core = &cores_info[i];
		/* We can switch the mode only if the reactor already has no threads on it. */
		if (g_cores[i].thread_count == 0 && TAILQ_EMPTY(&reactor->threads)) {
			core->interrupt_mode = true;
		} else if (g_cores[i].thread_count != 0) {
			core->interrupt_mode = false;
			if (i != g_main_lcore) {
				/* If a thread is present on a core other than g_main_lcore,
				 * it has to be busy. */
				busy_threads_present = true;
			}
		}
	}

	governor = spdk_governor_get();
	if (governor == NULL) {
		return;
	}

	/* Change the main core frequency if needed. */
	if (busy_threads_present) {
		rc = governor->set_core_freq_max(g_main_lcore);
		if (rc < 0) {
			SPDK_ERRLOG("setting default frequency for core %u failed\n", g_main_lcore);
		}
	} else if (main_core->busy > main_core->idle) {
		rc = governor->core_freq_up(g_main_lcore);
		if (rc < 0) {
			SPDK_ERRLOG("increasing frequency for core %u failed\n", g_main_lcore);
		}
	} else {
		rc = governor->core_freq_down(g_main_lcore);
		if (rc < 0) {
			SPDK_ERRLOG("lowering frequency for core %u failed\n", g_main_lcore);
		}
	}
}

struct json_scheduler_opts {
	uint8_t load_limit;
	uint8_t core_limit;
	uint8_t core_busy;
};

static const struct spdk_json_object_decoder sched_decoders[] = {
	{"load_limit", offsetof(struct json_scheduler_opts, load_limit), spdk_json_decode_uint8, true},
	{"core_limit", offsetof(struct json_scheduler_opts, core_limit), spdk_json_decode_uint8, true},
	{"core_busy", offsetof(struct json_scheduler_opts, core_busy), spdk_json_decode_uint8, true},
};

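/* Apply options passed in through the scheduler framework. All three fields
 * are optional; fields left unset keep their current values. A hypothetical
 * options blob decoded here could look like:
 *   { "load_limit": 20, "core_limit": 80, "core_busy": 95 }
 */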
static int
set_opts(const struct spdk_json_val *opts)
{
	struct json_scheduler_opts scheduler_opts;

	scheduler_opts.load_limit = g_scheduler_load_limit;
	scheduler_opts.core_limit = g_scheduler_core_limit;
	scheduler_opts.core_busy = g_scheduler_core_busy;

	if (opts != NULL) {
		if (spdk_json_decode_object_relaxed(opts, sched_decoders,
						    SPDK_COUNTOF(sched_decoders), &scheduler_opts)) {
			SPDK_ERRLOG("Decoding scheduler opts JSON failed\n");
			return -1;
		}
	}

	SPDK_NOTICELOG("Setting scheduler load limit to %d\n", scheduler_opts.load_limit);
	g_scheduler_load_limit = scheduler_opts.load_limit;
	SPDK_NOTICELOG("Setting scheduler core limit to %d\n", scheduler_opts.core_limit);
	g_scheduler_core_limit = scheduler_opts.core_limit;
	SPDK_NOTICELOG("Setting scheduler core busy to %d\n", scheduler_opts.core_busy);
	g_scheduler_core_busy = scheduler_opts.core_busy;

	return 0;
}

static void
get_opts(struct spdk_json_write_ctx *ctx)
{
	spdk_json_write_named_uint8(ctx, "load_limit", g_scheduler_load_limit);
	spdk_json_write_named_uint8(ctx, "core_limit", g_scheduler_core_limit);
	spdk_json_write_named_uint8(ctx, "core_busy", g_scheduler_core_busy);
}

static struct spdk_scheduler scheduler_dynamic = {
	.name = "dynamic",
	.init = init,
	.deinit = deinit,
	.balance = balance,
	.set_opts = set_opts,
	.get_opts = get_opts,
};

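/* Register the scheduler under the name "dynamic"; the scheduler framework
 * can then select it at runtime, typically through the framework_set_scheduler
 * RPC. */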
SPDK_SCHEDULER_REGISTER(scheduler_dynamic);