/*-
 *   BSD LICENSE
 *
 *   Copyright (c) Intel Corporation.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "spdk/stdinc.h"
#include "spdk/likely.h"
#include "spdk/event.h"
#include "spdk/log.h"
#include "spdk/env.h"

#include "spdk/thread.h"
#include "spdk_internal/event.h"
#include "spdk/scheduler.h"

static uint32_t g_main_lcore;

struct core_stats {
	uint64_t busy;
	uint64_t idle;
	uint32_t thread_count;
};

static struct core_stats *g_cores;

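/* Scheduling thresholds, all expressed as busy percentages:
 * - SCHEDULER_LOAD_LIMIT: threads below this load are treated as idle
 *   and consolidated onto the main core.
 * - SCHEDULER_CORE_LIMIT: a core at or above this load is considered
 *   full and will not accept additional threads.
 * - SCHEDULER_CORE_BUSY: cores this busy get special handling when a
 *   thread moves away, since the freed cycles may not become idle. */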
#define SCHEDULER_LOAD_LIMIT 20
#define SCHEDULER_CORE_LIMIT 80
#define SCHEDULER_CORE_BUSY 95

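/* Return busy time as a percentage of total (busy + idle) time,
 * e.g. busy = 800 and idle = 200 yields 800 * 100 / 1000 = 80. */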
static uint8_t
_busy_pct(uint64_t busy, uint64_t idle)
{
	if ((busy + idle) == 0) {
		return 0;
	}

	return busy * 100 / (busy + idle);
}

static uint8_t
_get_thread_load(struct spdk_scheduler_thread_info *thread_info)
{
	uint64_t busy, idle;

	busy = thread_info->current_stats.busy_tsc;
	idle = thread_info->current_stats.idle_tsc;

	/* return percentage of time thread was busy */
	return _busy_pct(busy, idle);
}

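/* Callback type and iterator that apply fn to every SPDK thread on
 * every core. */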
typedef void (*_foreach_fn)(struct spdk_scheduler_thread_info *thread_info);

static void
_foreach_thread(struct spdk_scheduler_core_info *cores_info, _foreach_fn fn)
{
	struct spdk_scheduler_core_info *core;
	uint32_t i, j;

	SPDK_ENV_FOREACH_CORE(i) {
		core = &cores_info[i];
		for (j = 0; j < core->threads_count; j++) {
			fn(&core->thread_infos[j]);
		}
	}
}

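/* Move thread_info to dst_core in the local accounting: transfer the
 * thread's busy_tsc between the source and destination core_stats so
 * that later placement decisions in this scheduling period see the
 * updated load. The framework performs the actual migration based on
 * the updated lcore value. */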
static void
_move_thread(struct spdk_scheduler_thread_info *thread_info, uint32_t dst_core)
{
	struct core_stats *dst = &g_cores[dst_core];
	struct core_stats *src = &g_cores[thread_info->lcore];
	uint64_t busy_tsc = thread_info->current_stats.busy_tsc;
	uint8_t busy_pct = _busy_pct(src->busy, src->idle);
	uint64_t tsc;

	if (src == dst) {
		/* Don't modify stats if thread is already on that core. */
		return;
	}

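	/* Assume the thread's busy cycles will consume idle cycles on the
	 * destination core; saturate the additions and subtractions so the
	 * counters cannot wrap around. */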
	dst->busy += spdk_min(UINT64_MAX - dst->busy, busy_tsc);
	dst->idle -= spdk_min(dst->idle, busy_tsc);
	dst->thread_count++;

	/* Adjust the source core's busy/idle as if the thread were not
	 * present on it. The core load will then reflect the sum of all
	 * remaining threads on it. */
	src->busy -= spdk_min(src->busy, busy_tsc);
	src->idle += spdk_min(UINT64_MAX - src->idle, busy_tsc);

	if (busy_pct >= SCHEDULER_CORE_BUSY &&
	    _busy_pct(src->busy, src->idle) < SCHEDULER_CORE_LIMIT) {
		/* This core was so busy that we cannot assume all of the
		 * busy_tsc consumed by the moved thread will now become
		 * idle_tsc - it's very possible the remaining threads will
		 * use these cycles as busy_tsc.
		 *
		 * So make sure the updated estimate does not drop below
		 * SCHEDULER_CORE_LIMIT; otherwise other cores could move
		 * threads to this core during this scheduling period.
		 */
		tsc = src->busy + src->idle;
		src->busy = tsc * SCHEDULER_CORE_LIMIT / 100;
		src->idle = tsc - src->busy;
	}
	assert(src->thread_count > 0);
	src->thread_count--;

	thread_info->lcore = dst_core;
}

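/* Return true if the core's busy percentage is at or above
 * SCHEDULER_CORE_LIMIT. For example, busy = 850 and idle = 150 gives
 * 85%, which is over the 80% limit. */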
static bool
_is_core_at_limit(uint32_t core_id)
{
	struct core_stats *core = &g_cores[core_id];
	uint64_t busy, idle;

	/* A core with no threads or a single thread cannot be over the limit. */
	if (core->thread_count <= 1) {
		return false;
	}

	busy = core->busy;
	idle = core->idle;

	/* No work was done, exit before a possible division by 0. */
	if (busy == 0) {
		return false;
	}

	/* Work done was less than the limit. */
	if (_busy_pct(busy, idle) < SCHEDULER_CORE_LIMIT) {
		return false;
	}

	return true;
}

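/* Check whether dst_core has enough idle cycles to absorb the thread's
 * busy cycles without crossing SCHEDULER_CORE_LIMIT. */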
static bool
_can_core_fit_thread(struct spdk_scheduler_thread_info *thread_info, uint32_t dst_core)
{
	struct core_stats *dst = &g_cores[dst_core];
	uint64_t new_busy_tsc, new_idle_tsc;

	/* A thread can always fit on the core it's currently on. */
	if (thread_info->lcore == dst_core) {
		return true;
	}

	/* Reactors in interrupt mode do not update stats, so a thread can
	 * always fit into a reactor in interrupt mode. */
	if (dst->busy + dst->idle == 0) {
		return true;
	}

	/* The core has no threads. */
	if (dst->thread_count == 0) {
		return true;
	}

	/* The core doesn't have enough idle_tsc to take this thread. */
	if (dst->idle < thread_info->current_stats.busy_tsc) {
		return false;
	}

	new_busy_tsc = dst->busy + thread_info->current_stats.busy_tsc;
	new_idle_tsc = dst->idle - thread_info->current_stats.busy_tsc;

	/* The core cannot fit this thread if doing so would put it over
	 * SCHEDULER_CORE_LIMIT. */
	return _busy_pct(new_busy_tsc, new_idle_tsc) < SCHEDULER_CORE_LIMIT;
}

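/* Pick a destination core for an active thread, restricted to the
 * thread's cpumask. Preference order: the main lcore first, then
 * lower-numbered cores (to consolidate threads), then, if the current
 * core is over the limit, any core that fits, falling back to the
 * least busy core. */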
static uint32_t
_find_optimal_core(struct spdk_scheduler_thread_info *thread_info)
{
	uint32_t i;
	uint32_t current_lcore = thread_info->lcore;
	uint32_t least_busy_lcore = thread_info->lcore;
	struct spdk_thread *thread;
	struct spdk_cpuset *cpumask;
	bool core_at_limit = _is_core_at_limit(current_lcore);

	thread = spdk_thread_get_by_id(thread_info->thread_id);
	if (thread == NULL) {
		return current_lcore;
	}
	cpumask = spdk_thread_get_cpumask(thread);

	/* Find a core that can fit the thread. */
	SPDK_ENV_FOREACH_CORE(i) {
		/* Ignore cores outside the cpumask. */
		if (!spdk_cpuset_get_cpu(cpumask, i)) {
			continue;
		}

		/* Search for the least busy core. */
		if (g_cores[i].busy < g_cores[least_busy_lcore].busy) {
			least_busy_lcore = i;
		}

		/* Skip the current core and any core that cannot fit the thread. */
		if (!_can_core_fit_thread(thread_info, i) || i == current_lcore) {
			continue;
		}
		if (i == g_main_lcore) {
			/* Consider g_main_lcore first, to consolidate threads on the main lcore if possible. */
			return i;
		} else if (i < current_lcore && current_lcore != g_main_lcore) {
			/* A lower core id was found; move to consolidate threads on the lowest core ids. */
			return i;
		} else if (core_at_limit) {
			/* When the core is over the limit, any other core id is better than the current one. */
			return i;
		}
	}

	/* For cores over the limit, place the thread on the least busy core
	 * to balance the threads. */
	if (core_at_limit) {
		return least_busy_lcore;
	}

	/* If no better core is found, remain on the same one. */
	return current_lcore;
}

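/* Scheduler init: record the current core as the main lcore, try to
 * enable the DPDK frequency governor (failure is non-fatal), and
 * allocate the per-core stats array. */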
static int
init(void)
{
	g_main_lcore = spdk_env_get_current_core();

	if (spdk_governor_set("dpdk_governor") != 0) {
		SPDK_NOTICELOG("Unable to initialize dpdk governor\n");
	}

	g_cores = calloc(spdk_env_get_last_core() + 1, sizeof(struct core_stats));
	if (g_cores == NULL) {
		SPDK_ERRLOG("Failed to allocate memory for dynamic scheduler core stats.\n");
		return -ENOMEM;
	}

	return 0;
}

static void
deinit(void)
{
	free(g_cores);
	g_cores = NULL;
	spdk_governor_set(NULL);
}

static void
_balance_idle(struct spdk_scheduler_thread_info *thread_info)
{
	if (_get_thread_load(thread_info) >= SCHEDULER_LOAD_LIMIT) {
		return;
	}
	/* This thread is idle, move it to the main core. */
	_move_thread(thread_info, g_main_lcore);
}

static void
_balance_active(struct spdk_scheduler_thread_info *thread_info)
{
	uint32_t target_lcore;

	if (_get_thread_load(thread_info) < SCHEDULER_LOAD_LIMIT) {
		return;
	}

	/* This thread is active. */
	target_lcore = _find_optimal_core(thread_info);
	_move_thread(thread_info, target_lcore);
}

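/* Main balance callback, run once per scheduling period: snapshot
 * per-core stats, rebalance threads in two passes (idle first, then
 * active), put unused reactors into interrupt mode, and let the
 * governor adjust the main core's frequency. */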
static void
balance(struct spdk_scheduler_core_info *cores_info, uint32_t cores_count)
{
	struct spdk_reactor *reactor;
	struct spdk_governor *governor;
	struct spdk_scheduler_core_info *core;
	struct core_stats *main_core;
	uint32_t i;
	int rc;
	bool busy_threads_present = false;

	SPDK_ENV_FOREACH_CORE(i) {
		g_cores[i].thread_count = cores_info[i].threads_count;
		g_cores[i].busy = cores_info[i].current_busy_tsc;
		g_cores[i].idle = cores_info[i].current_idle_tsc;
	}
	main_core = &g_cores[g_main_lcore];

	/* Distribute threads in two passes, to make sure updated core stats
	 * are considered on each pass.
	 * 1) Move all idle threads to the main core. */
	_foreach_thread(cores_info, _balance_idle);
	/* 2) Distribute active threads across all cores. */
	_foreach_thread(cores_info, _balance_active);

	/* Switch unused cores to interrupt mode, and switch cores to polled
	 * mode if they will be used after rebalancing. */
	SPDK_ENV_FOREACH_CORE(i) {
		reactor = spdk_reactor_get(i);
		core = &cores_info[i];
		/* We can only switch the mode if the reactor does not already
		 * have any threads. */
		if (g_cores[i].thread_count == 0 && TAILQ_EMPTY(&reactor->threads)) {
			core->interrupt_mode = true;
		} else if (g_cores[i].thread_count != 0) {
			core->interrupt_mode = false;
			if (i != g_main_lcore) {
				/* If a thread is present on a core other than
				 * g_main_lcore, it has to be busy. */
				busy_threads_present = true;
			}
		}
	}

	governor = spdk_governor_get();
	if (governor == NULL) {
		return;
	}

	/* Change the main core frequency if needed. */
	if (busy_threads_present) {
		rc = governor->set_core_freq_max(g_main_lcore);
		if (rc < 0) {
			SPDK_ERRLOG("setting maximum frequency for core %u failed\n", g_main_lcore);
		}
	} else if (main_core->busy > main_core->idle) {
		rc = governor->core_freq_up(g_main_lcore);
		if (rc < 0) {
			SPDK_ERRLOG("increasing frequency for core %u failed\n", g_main_lcore);
		}
	} else {
		rc = governor->core_freq_down(g_main_lcore);
		if (rc < 0) {
			SPDK_ERRLOG("lowering frequency for core %u failed\n", g_main_lcore);
		}
	}
}

static struct spdk_scheduler scheduler_dynamic = {
	.name = "dynamic",
	.init = init,
	.deinit = deinit,
	.balance = balance,
};

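/* Register the scheduler so it can be selected at runtime by its
 * name, "dynamic". */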
SPDK_SCHEDULER_REGISTER(scheduler_dynamic);