xref: /spdk/test/scheduler/common.sh (revision 18c8b52afa69f39481ebb75711b2f30b11693f9d)
# Glob behaviour the helpers in this file are written against: patterns
# without a match expand to nothing and extended patterns are enabled.
shopt -s nullglob
shopt -s extglob

# sysfs roots of the cpu and NUMA node hierarchies.
declare -r sysfs_system="/sys/devices/system"
declare -r sysfs_cpu="$sysfs_system/cpu"
declare -r sysfs_node="$sysfs_system/node"

# Path of test/event/scheduler/scheduler below $rootdir and the plugin name
# handed to "rpc_cmd --plugin" by the thread helpers further down.
declare -r scheduler="$rootdir/test/event/scheduler/scheduler"
declare -r plugin="scheduler_plugin"

source "$rootdir/test/scheduler/cgroups.sh"
11
# Fold a list of integers onto the indexed array named by $1, so that
# array[N]=N for every N passed in $2... Membership can afterwards be
# checked with a plain index lookup.
# Arguments: $1 - name of the target array, $2.. - values to fold
# Returns:   0 when called without any argument
fold_list_onto_array() {
	local array=$1
	local elem

	shift || return 0

	# A nameref (rather than eval) is used so that the values are only ever
	# treated as data and never re-parsed as shell code.
	local -n _fold_target=$array

	for elem; do
		# The bare subscript is evaluated arithmetically, i.e. to $elem.
		_fold_target[elem]=$elem
	done
}
22
# Print all arguments joined into one comma-separated string.
fold_array_onto_string() {
	# "$*" joins the positional parameters with the first character of IFS.
	local IFS=","
	echo "$*"
}
29
# Expand a cpu list such as "0-2,4,6-9", read from the first line of the file
# $1, into one cpu number per line. Numbers are printed in ascending order
# and without duplicates. Prints nothing and returns 0 if the line is empty.
parse_cpu_list() {
	local list=$1
	local item items cpus
	local first last cpu

	IFS="," read -ra items < "$list"

	if ((${#items[@]} == 0)); then
		return 0
	fi

	for item in "${items[@]}"; do
		case $item in
			*-*)
				# A "first-last" range, both ends inclusive.
				first=${item%-*} last=${item#*-}
				for ((cpu = first; cpu <= last; cpu++)); do
					cpus[cpu]=$cpu
				done
				;;
			*)
				cpus[item]=$item
				;;
		esac
	done
	# The indices of the array are what gets printed.
	printf '%u\n' "${!cpus[@]}"
}
51
# Record the cpus of NUMA node $1 in the maps prepared by map_cpus():
#   node_<N>_cpu        every cpu found in the node's cpulist
#   node_<N>_core_<C>   online cpus of the node whose core_id is C
#   cpu_node_map[cpu]   node index of the cpu
#   cpu_core_map[cpu]   core_id of the cpu (filled for online cpus only)
#   cpus, nodes         all cpus seen so far / node indices seen so far
map_cpus_node() {
	local node_idx=$1
	local cpu_idx core_idx
	local -n _cpu_node_map=node_${node_idx}_cpu

	for cpu_idx in $(parse_cpu_list "$sysfs_node/node$node_idx/cpulist"); do
		_cpu_node_map+=("$cpu_idx")
		cpu_node_map[cpu_idx]=$node_idx
		cpus+=("$cpu_idx")

		# The core id is looked up for online cpus only.
		is_cpu_online "$cpu_idx" || continue

		core_idx=$(< "$sysfs_cpu/cpu$cpu_idx/topology/core_id")
		# Re-point the nameref at the array of this cpu's core.
		local -n _cpu_core_map=node_${node_idx}_core_${core_idx}
		_cpu_core_map+=("$cpu_idx")
		cpu_core_map[cpu_idx]=$core_idx
	done

	nodes[node_idx]=$node_idx
}
69
# Rebuild the global topology maps (cpus, nodes, cpu_node_map, cpu_core_map,
# core_node_map) by calling map_cpus_node for every node<N> entry below
# $sysfs_node. All variables named node_* are dropped before the scan.
map_cpus() {
	local -g cpus=() nodes=()
	local -g cpu_node_map=() cpu_core_map=() core_node_map=()
	local node idx

	unset -v "${!node_@}"

	for node in "$sysfs_node/node"[0-9]*; do
		idx=${node##*/node}
		# Only entries named "node" followed purely by digits are mapped.
		[[ $idx =~ ^[0-9]+$ ]] || continue
		map_cpus_node "$idx"
	done
}
84
# Print, one per line, the cpus recorded by map_cpus():
#   no arguments     -> every cpu in the global cpus array
#   $1=node          -> the cpus in node_<node>_cpu
#   $1=node $2=core  -> the cpus in node_<node>_core_<core>
# Returns 1 (printing nothing) when the selected list is empty.
get_cpus() {
	local node=$1
	local core=$2
	local -a _cpus=()
	local ref

	if [[ -z $node ]]; then
		_cpus=("${cpus[@]}")
	else
		# Select the array by name through indirect expansion; the node and
		# core values only ever form a variable name, never evaluated code.
		ref="node_${node}_cpu[@]"
		if [[ -n $core ]]; then
			ref="node_${node}_core_${core}[@]"
		fi
		_cpus=("${!ref}")
	fi
	((${#_cpus[@]} > 0)) || return 1
	printf '%u\n' "${_cpus[@]}"
}
101
# Print the cpus listed in $sysfs_cpu/isolated, one per line.
# Prints nothing and returns 0 when that file does not exist.
get_isolated_cpus() {
	if [[ ! -e $sysfs_cpu/isolated ]]; then
		return 0
	fi
	parse_cpu_list "$sysfs_cpu/isolated"
}
106
# Print the cpus listed in $sysfs_cpu/offline, one per line.
# Prints nothing and returns 0 when that file does not exist.
get_offline_cpus() {
	[[ -e $sysfs_cpu/offline ]] || return 0
	parse_cpu_list "$sysfs_cpu/offline"
}
113
# Print the cpus listed in $sysfs_cpu/online, one per line.
# Prints nothing and returns 0 when that file does not exist.
get_online_cpus() {
	local online_list=$sysfs_cpu/online

	if [[ -e $online_list ]]; then
		parse_cpu_list "$online_list"
	fi
}
118
# Succeed when cpu $1 is among the cpus reported by get_online_cpus.
is_cpu_online() {
	local -a online_map=()

	# Index the online cpus by number so the check is a single lookup.
	fold_list_onto_array online_map $(get_online_cpus)
	[[ -v online_map[$1] ]]
}
125
# Succeed when is_cpu_online fails for cpu $1, and fail otherwise.
is_cpu_offline() {
	if is_cpu_online "$1"; then
		return 1
	fi
	return 0
}
129
# Bring cpu $1 online by writing 1 to its sysfs "online" file.
# Returns 0 right away if the cpu is not offline, and 1 when the cpu has no
# "online" file to write to.
online_cpu() {
	local knob=$sysfs_cpu/cpu$1/online

	if ! is_cpu_offline "$1"; then
		return 0
	fi
	if [[ -e $knob ]]; then
		echo 1 > "$knob"
	else
		return 1
	fi
}
134
# Take cpu $1 offline by writing 0 to its sysfs "online" file.
# Returns 0 right away if the cpu is not online, and 1 when the cpu has no
# "online" file to write to.
offline_cpu() {
	local knob=$sysfs_cpu/cpu$1/online

	if ! is_cpu_online "$1"; then
		return 0
	fi
	if [[ -e $knob ]]; then
		echo 0 > "$knob"
	else
		return 1
	fi
}
139
# Print the hexadecimal bitmask ("0x...") with one bit set per cpu number
# given as argument. Without arguments the mask is 0x0.
mask_cpus() {
	local cpu idx top
	# The mask is kept as 32-bit slices, least significant slice at index 0.
	# A single shell integer would wrap around for cpu numbers of 64 and up.
	local -a words=()
	local -a used=()

	for cpu; do
		((words[cpu / 32] |= 1 << (cpu % 32)))
	done

	if ((${#words[@]} == 0)); then
		printf '0x%x\n' 0
		return 0
	fi

	# Array indices expand in ascending order, so the last one is the top slice.
	used=("${!words[@]}")
	top=${used[-1]}

	# Top slice without padding, every lower slice as exactly 8 hex digits.
	printf '0x%x' "${words[top]}"
	for ((idx = top - 1; idx >= 0; idx--)); do
		printf '%08x' "${words[idx]:-0}"
	done
	printf '\n'
}
149
# Add the cpus printed by get_offline_cpus, followed by every cpu passed as
# argument, to the global "denied" array (denied[N]=N).
denied_list() {
	declare -g denied

	# Prepend the offline cpus to the caller supplied ones.
	set -- $(get_offline_cpus) "$@"
	fold_list_onto_array denied "$@"
}
155
# Remove from the global "allowed" array every cpu that has a non-empty
# entry in the global "denied" array.
filter_allowed_list() {
	local idx

	for idx in "${!allowed[@]}"; do
		[[ -z ${denied[idx]} ]] && continue
		unset -v "allowed[idx]"
	done
}
165
# Populate the global "allowed" array (allowed[N]=N) with cpus to use.
# Arguments: $1 - number of cpus to gather (default 4)
#            $2 - node to take the cpus from (default 0)
# The cpus printed by get_isolated_cpus are always added. The cpus of the
# node are then added core by core (get_cpus with the core of each cpu) until
# max is reached or the node has no more cpus, and filter_allowed_list drops
# the denied ones. If cpus are still missing afterwards the function calls
# itself to continue with the remaining cpus of the node.
allowed_list() {
	local max=${1:-4}
	local node=${2:-0}
	# Picks up a cpu_count that is already visible, e.g. the one of the
	# recursive call at the bottom, so the walk resumes rather than restarts.
	local cpu_count=${cpu_count:--1}

	local -g allowed

	fold_list_onto_array allowed $(get_isolated_cpus)

	# Only on a fresh walk (cpu_count < 0): raise max by the number of cpus
	# already in the list.
	if ((cpu_count < 0 && ${#allowed[@]} > 0)); then
		((max += ${#allowed[@]}))
	fi

	local -n node_cpu_ref=node_${node}_cpu

	while ((${#allowed[@]} < max && ++cpu_count < ${#node_cpu_ref[@]})); do
		fold_list_onto_array allowed $(get_cpus "$node" "${cpu_core_map[node_cpu_ref[cpu_count]]}")
	done

	filter_allowed_list

	# Whole cores are folded in at once, so the list may end up larger than
	# max. ">=" is required here: the loop above does not advance cpu_count
	# once the list is full, so recursing in that state would never end.
	if ((${#allowed[@]} >= max || cpu_count >= ${#node_cpu_ref[@]})); then
		return 0
	fi
	allowed_list "$max" "$node"
}
195
# Print, one per line, the cpus from the Cpus_allowed_list entry of
# /proc/<pid>/status.
# Arguments: $1 - pid to inspect (defaults to $$)
# Returns:   1 when /proc/<pid>/status does not exist
# xtrace_disable is paired with xtrace_restore on every return path.
get_proc_cpu_affinity() {
	xtrace_disable

	local pid=${1:-$$}
	local status val found=0

	if [[ ! -e /proc/$pid/status ]]; then
		xtrace_restore
		return 1
	fi

	# Lines look like "Key:<TAB>value"; split on the colon and the tab.
	while IFS=":"$'\t' read -r status val; do
		if [[ $status == Cpus_allowed_list ]]; then
			parse_cpu_list <(echo "$val")
			found=1
			break
		fi
	done < "/proc/$pid/status"

	if ((found)); then
		xtrace_restore
		return 0
	fi
	xtrace_restore
}
212
# Snapshot the cpufreq state of every cpu<N> below $sysfs_cpu that has a
# cpufreq directory. All results are global arrays indexed by cpu number:
#   cpufreq_drivers, cpufreq_governors        scaling_driver / scaling_governor
#   cpufreq_cur_freqs, _max_freqs, _min_freqs scaling_{cur,max,min}_freq
#   cpufreq_base_freqs                        base_frequency, when present
#   cpufreq_available_governors[N]            "available_governors_cpu_N[@]"
#   cpufreq_available_freqs[N]                "available_freqs_cpu_N[@]"
#   cpufreq_is_turbo, cpufreq_setspeed, ...   driver specific, see the case
# The two "available" maps hold names meant for indirect expansion. Finally
# turbo_enabled is set from cpufreq/boost or intel_pstate/no_turbo.
map_cpufreq() {
	# This info is used to cross-reference current cpufreq setup with
	# what DPDK's governor actually puts in place.

	local -g cpufreq_drivers=()
	local -g cpufreq_governors=()
	local -g cpufreq_base_freqs=()
	local -g cpufreq_max_freqs=()
	local -g cpufreq_min_freqs=()
	local -g cpufreq_cur_freqs=()
	local -g cpufreq_is_turbo=()
	local -g cpufreq_available_freqs=()
	local -g cpufreq_available_governors=()
	local -g cpufreq_high_prio=()
	local -g cpufreq_non_turbo_ratio=()
	local -g cpufreq_setspeed=()
	local -g cpuinfo_max_freqs=()
	local -g cpuinfo_min_freqs=()
	local -g turbo_enabled=0
	# Filled by the cppc_cpufreq branch. They stay global, but are declared
	# here so that every run starts from empty arrays.
	local -g scaling_min_freqs=()
	local -g scaling_max_freqs=()
	local -g nominal_perf=()
	local -g highest_perf=()
	local cpu cpu_idx
	# Scratch values shared by the driver branches; kept out of global scope.
	local non_turbo_ratio base_max_freq num_freqs freq

	for cpu in "$sysfs_cpu/cpu"[0-9]*; do
		cpu_idx=${cpu##*/cpu}
		# Only cpu<N> with a purely numeric N is a cpu directory.
		[[ $cpu_idx =~ ^[0-9]+$ ]] || continue
		[[ -e $cpu/cpufreq ]] || continue
		cpufreq_drivers[cpu_idx]=$(< "$cpu/cpufreq/scaling_driver")
		cpufreq_governors[cpu_idx]=$(< "$cpu/cpufreq/scaling_governor")

		# In case HWP is on
		if [[ -e $cpu/cpufreq/base_frequency ]]; then
			cpufreq_base_freqs[cpu_idx]=$(< "$cpu/cpufreq/base_frequency")
		fi

		cpufreq_cur_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_cur_freq")
		cpufreq_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_max_freq")
		cpufreq_min_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_min_freq")

		# The per-cpu lists live in their own arrays; the namerefs below are
		# re-pointed at the arrays of the current cpu on every iteration.
		local -n available_governors=available_governors_cpu_${cpu_idx}
		cpufreq_available_governors[cpu_idx]="available_governors_cpu_${cpu_idx}[@]"
		available_governors=($(< "$cpu/cpufreq/scaling_available_governors"))

		local -n available_freqs=available_freqs_cpu_${cpu_idx}
		cpufreq_available_freqs[cpu_idx]="available_freqs_cpu_${cpu_idx}[@]"

		case "${cpufreq_drivers[cpu_idx]}" in
			acpi-cpufreq)
				available_freqs=($(< "$cpu/cpufreq/scaling_available_frequencies"))
				# Flagged as turbo when the first entry is exactly 1000 above
				# the second one.
				if ((available_freqs[0] - 1000 == available_freqs[1])); then
					cpufreq_is_turbo[cpu_idx]=1
				else
					cpufreq_is_turbo[cpu_idx]=0
				fi
				cpufreq_setspeed[cpu_idx]=$(< "$cpu/cpufreq/scaling_setspeed")
				;;
			intel_pstate | intel_cpufreq) # active or passive
				non_turbo_ratio=$("$testdir/rdmsr.pl" "$cpu_idx" 0xce)
				cpuinfo_min_freqs[cpu_idx]=$(< "$cpu/cpufreq/cpuinfo_min_freq")
				cpuinfo_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/cpuinfo_max_freq")
				# Bits 15:8 of the value read from register 0xce.
				cpufreq_non_turbo_ratio[cpu_idx]=$(((non_turbo_ratio >> 8) & 0xff))
				if ((cpufreq_base_freqs[cpu_idx] / 100000 > cpufreq_non_turbo_ratio[cpu_idx])); then
					cpufreq_high_prio[cpu_idx]=1
					base_max_freq=${cpufreq_base_freqs[cpu_idx]}
				else
					cpufreq_high_prio[cpu_idx]=0
					base_max_freq=$((cpufreq_non_turbo_ratio[cpu_idx] * 100000))
				fi
				num_freqs=$(((base_max_freq - cpuinfo_min_freqs[cpu_idx]) / 100000 + 1))
				if ((base_max_freq < cpuinfo_max_freqs[cpu_idx])); then
					((num_freqs += 1))
					cpufreq_is_turbo[cpu_idx]=1
				else
					cpufreq_is_turbo[cpu_idx]=0
				fi
				# Build the list in steps of 100000 downwards from
				# base_max_freq; with turbo, slot 0 holds base_max_freq + 1.
				available_freqs=()
				for ((freq = 0; freq < num_freqs; freq++)); do
					if ((freq == 0 && cpufreq_is_turbo[cpu_idx] == 1)); then
						available_freqs[freq]=$((base_max_freq + 1))
					else
						available_freqs[freq]=$((base_max_freq - (freq - cpufreq_is_turbo[cpu_idx]) * 100000))
					fi
				done
				;;
			cppc_cpufreq)
				cpufreq_setspeed[cpu_idx]=$(< "$cpu/cpufreq/scaling_setspeed")
				scaling_min_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_min_freq")
				scaling_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_max_freq")
				cpuinfo_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/cpuinfo_max_freq")
				nominal_perf[cpu_idx]=$(< "$cpu/acpi_cppc/nominal_perf")
				highest_perf[cpu_idx]=$(< "$cpu/acpi_cppc/highest_perf")

				# The unit of highest_perf and nominal_perf differs between arm
				# platforms. For highest_perf it may be 300 or 3000000, both
				# meaning 3.0GHz.
				if ((highest_perf[cpu_idx] > nominal_perf[cpu_idx] && (\
					highest_perf[cpu_idx] == cpuinfo_max_freqs[cpu_idx] || \
					highest_perf[cpu_idx] * 10000 == cpuinfo_max_freqs[cpu_idx]))); then
					cpufreq_is_turbo[cpu_idx]=1
				else
					cpufreq_is_turbo[cpu_idx]=0
				fi

				# Bring the small unit up to the scale of the frequency files.
				if ((nominal_perf[cpu_idx] < 10000)); then
					nominal_perf[cpu_idx]=$((nominal_perf[cpu_idx] * 10000))
				fi

				num_freqs=$(((nominal_perf[cpu_idx] - scaling_min_freqs[cpu_idx]) / 100000 + 1 + \
					cpufreq_is_turbo[cpu_idx]))

				# With turbo, slot 0 holds scaling_max_freq; the remaining slots
				# step down from nominal_perf by 100000 each.
				available_freqs=()
				for ((freq = 0; freq < num_freqs; freq++)); do
					if ((freq == 0 && cpufreq_is_turbo[cpu_idx] == 1)); then
						available_freqs[freq]=$((scaling_max_freqs[cpu_idx]))
					else
						available_freqs[freq]=$((nominal_perf[cpu_idx] - (\
							freq - cpufreq_is_turbo[cpu_idx]) * 100000))
					fi
				done
				;;
		esac
	done
	if [[ -e $sysfs_cpu/cpufreq/boost ]]; then
		turbo_enabled=$(< "$sysfs_cpu/cpufreq/boost")
	elif [[ -e $sysfs_cpu/intel_pstate/no_turbo ]]; then
		# no_turbo is the inverse of what is stored here.
		turbo_enabled=$((!$(< "$sysfs_cpu/intel_pstate/no_turbo")))
	fi
}
339
# Apply a frequency setting to a cpu through its cpufreq sysfs directory.
# Arguments: $1 - cpu number
#            $2 - minimum frequency (for acpi-cpufreq: the speed to set)
#            $3 - maximum frequency (optional, intel drivers only)
# Returns:   1 when the cpu has no entry in cpufreq_drivers (map_cpufreq has
#            to run first) or when no minimum frequency was given.
set_cpufreq() {
	local cpu=$1
	local min_freq=$2
	local max_freq=$3
	local cpufreq=$sysfs_cpu/cpu$cpu/cpufreq

	# Map the cpufreq info first
	[[ -n ${cpufreq_drivers[cpu]} ]] || return 1
	[[ -n $min_freq ]] || return 1

	case "${cpufreq_drivers[cpu]}" in
		acpi-cpufreq)
			# Switch to the userspace governor before writing scaling_setspeed.
			# The attribute is "scaling_governor", the same file map_cpufreq
			# and set_cpufreq_governor use.
			if [[ ${cpufreq_governors[cpu]} != userspace ]]; then
				echo "userspace" > "$cpufreq/scaling_governor"
			fi
			echo "$min_freq" > "$cpufreq/scaling_setspeed"
			;;
		intel_pstate | intel_cpufreq)
			# The minimum is only written while it does not exceed the mapped
			# maximum; the maximum only when it is not below the minimum.
			if ((min_freq <= cpufreq_max_freqs[cpu])); then
				echo "$min_freq" > "$cpufreq/scaling_min_freq"
			fi
			if [[ -n $max_freq ]] && ((max_freq >= min_freq)); then
				echo "$max_freq" > "$cpufreq/scaling_max_freq"
			fi
			;;
	esac
}
367
# Write governor $2 to the scaling_governor file of cpu $1, unless the file
# already holds exactly that governor.
set_cpufreq_governor() {
	local cpu=$1
	local governor=$2
	local knob=$sysfs_cpu/cpu$cpu/cpufreq/scaling_governor

	# Nothing to write when the requested governor is already in place.
	if [[ $(< "$knob") == "$governor" ]]; then
		return 0
	fi
	echo "$governor" > "$knob"
}
377
# (Re)start the application given as arguments with the dynamic scheduler:
#  1. kill the process recorded in spdk_pid if /proc still lists it,
#  2. start "$@" --wait-for-rpc in the background through exec_in_cgroup
#     with "/cpuset/spdk" and remember its pid in spdk_pid,
#  3. once waitforlisten returns, select the dynamic scheduler and issue
#     framework_start_init through rpc.py.
exec_under_dynamic_scheduler() {
	local rpc_script=$rootdir/scripts/rpc.py

	if [[ -e /proc/$spdk_pid/status ]]; then
		killprocess "$spdk_pid"
	fi

	exec_in_cgroup "/cpuset/spdk" "$@" --wait-for-rpc &
	spdk_pid=$!

	# Give some time for the app to init itself
	waitforlisten "$spdk_pid"

	"$rpc_script" framework_set_scheduler dynamic
	"$rpc_script" framework_start_init
}
389
# Run _get_thread_stats with "busy" and "idle" as the names of the arrays to
# fill, bracketed by xtrace_disable and xtrace_restore.
get_thread_stats() {
	xtrace_disable
	_get_thread_stats "busy" "idle"
	xtrace_restore
}
395
# Query thread_get_stats over RPC and store, indexed by thread id:
#   the .busy value in the array named by $1,
#   the .idle value in the array named by $2,
#   the .name value in the thread_map array.
_get_thread_stats() {
	local list_busy=$1
	local list_idle=$2
	local thread threads stats
	# Namerefs (rather than eval) give access to the caller chosen arrays, so
	# ids coming back from the RPC are never interpolated into evaluated code.
	local -n _busy_ref=$list_busy _idle_ref=$list_idle

	stats=$(rpc_cmd thread_get_stats | jq -r '.threads[]')
	threads=($(jq -r '.id' <<< "$stats"))

	for thread in "${threads[@]}"; do
		_busy_ref[thread]=$(jq -r "select(.id == $thread) | .busy" <<< "$stats")
		_idle_ref[thread]=$(jq -r "select(.id == $thread) | .idle" <<< "$stats")
		thread_map[thread]=$(jq -r "select(.id == $thread) | .name" <<< "$stats")
	done
}
410
# Print counters from the "cpu<N>" line of /proc/stat.
# Arguments: $1 - cpu number
#            $2 - "idle": print the 4th counter of the line
#                 "all":  print every counter, one per line
# Any other selector prints nothing.
get_cpu_stat() {
	local cpu_idx=$1
	local stat=$2
	# cpu is local so that reading /proc/stat leaves the caller's cpu alone.
	local cpu stats astats=()

	while read -r cpu stats; do
		if [[ $cpu == "cpu$cpu_idx" ]]; then
			read -ra astats <<< "$stats"
			# The first matching line is used; no need to scan the rest.
			break
		fi
	done < /proc/stat

	case "$stat" in
		idle) echo "${astats[3]}" ;;
		all) printf '%u\n' "${astats[@]}" ;;
		*) ;;
	esac
}
425
# Call scheduler_thread_create through rpc_cmd using the plugin named by
# $plugin; all arguments are passed along unchanged.
create_thread() {
	rpc_cmd \
		--plugin "$plugin" \
		scheduler_thread_create "$@"
}
429
# Call scheduler_thread_delete through rpc_cmd using the plugin named by
# $plugin; all arguments are passed along unchanged.
destroy_thread() {
	rpc_cmd \
		--plugin "$plugin" \
		scheduler_thread_delete "$@"
}
433
# Call scheduler_thread_set_active through rpc_cmd using the plugin named by
# $plugin; all arguments are passed along unchanged.
active_thread() {
	rpc_cmd \
		--plugin "$plugin" \
		scheduler_thread_set_active "$@"
}
437
# Sample /proc/stat style counters (through get_cpu_stat) once per second and
# work out how the cpu time was spent between consecutive samples.
# Arguments: $1   - number of samples to collect (values <= 0 mean 1)
#            $2   - which time to report, see cpu_time_map below; unknown
#                   names fall back to index 3 (idle)
#            $3.. - cpus to sample
# Results for the caller:
#   cpu_times[cpu]     space separated percentages of the requested time,
#                      one per sample
#   avg_cpu_time[cpu]  integer average of those percentages
get_cpu_time() {
	xtrace_disable

	local interval=$1 cpu_time=$2 interval_count
	shift 2
	local cpus=("$@") cpu
	local stats stat old_stats avg_load
	local sample_stats total_sample

	# Exposed for the caller
	local -g cpu_times=()
	local -g avg_cpu_time=()

	# cpu_time:
	# 0 - user (time spent in user mode)
	# 1 - nice (Time spent in user mode with low priority)
	# 2 - system (Time spent in system mode)
	# 3 - idle (Time spent in the idle task)
	# 4 - iowait (Time waiting for I/O to complete)
	# 5 - irq (Time servicing interrupts)
	# 6 - softirq (Time servicing softirqs)
	# 7 - steal (Stolen time)
	# 8 - guest (Time spent running a virtual CPU)
	# 9 - guest_nice (Time spent running a niced guest)

	local -A cpu_time_map
	cpu_time_map["user"]=0
	cpu_time_map["nice"]=1
	cpu_time_map["system"]=2
	cpu_time_map["idle"]=3
	cpu_time_map["iowait"]=4
	cpu_time_map["irq"]=5
	cpu_time_map["softirq"]=6
	cpu_time_map["steal"]=7
	cpu_time_map["guest"]=8
	cpu_time_map["guest_nice"]=9

	# Clear up the env: the per-cpu arrays created through the namerefs below
	# (stat_<idx>_<cpu>, old_stats_<cpu>, avg_load_<cpu>) outlive a call.
	unset -v "${!stat_@}"
	unset -v "${!old_stats_@}"
	unset -v "${!avg_stat@}"
	unset -v "${!avg_load@}"

	cpu_time=${cpu_time_map["$cpu_time"]:-3}
	interval=$((interval <= 0 ? 1 : interval))
	# We skip first sample to have min 2 for stat comparison
	interval=$((interval + 1)) interval_count=0
	while ((interval_count++, --interval >= 0)); do
		for cpu in "${cpus[@]}"; do
			local -n old_stats=old_stats_$cpu
			local -n avg_load=avg_load_$cpu
			sample_stats=() total_sample=0

			stats=($(get_cpu_stat "$cpu" all))
			if ((interval_count == 1)); then
				# Skip first sample
				old_stats=("${stats[@]}")
				continue
			fi
			# Delta of every counter since the previous sample, and their sum.
			for stat in "${!stats[@]}"; do
				avg_load[stat]="stat_${stat}_${cpu}[@]"
				sample_stats[stat]=$((stats[stat] - old_stats[stat]))
				: $((total_sample += sample_stats[stat]))
			done
			# Share of each counter in this sample, as an integer percentage.
			for stat in "${!stats[@]}"; do
				local -n avg_stat=stat_${stat}_${cpu}
				avg_stat+=($((sample_stats[stat] * 100 / (total_sample == 0 ? 1 : total_sample))))
			done
			old_stats=("${stats[@]}")
		done
		sleep 1s
	done

	# We collected % for each time. Now determine the avg % for requested time.
	local load stat_load
	for cpu in "${cpus[@]}"; do
		load=0
		local -n avg_load_cpu=avg_load_$cpu
		# avg_load_<cpu>[idx] names the array holding the samples of that time.
		stat_load=("${!avg_load_cpu[cpu_time]}")
		for stat in "${stat_load[@]}"; do
			: $((load += stat))
		done
		cpu_times[cpu]=${stat_load[*]}
		avg_cpu_time[cpu]=$((load / ${#stat_load[@]}))
	done

	xtrace_restore
}
526
# Measure the idle time of the cpus listed in cpus_to_collect and record the
# verdict per cpu in the global is_idle array (1 idle, 0 not idle).
# Arguments: $1 - number of seconds handed to get_cpu_time (default 5)
# Returns:   1 when cpus_to_collect is empty
collect_cpu_idle() {
	if ((${#cpus_to_collect[@]} == 0)); then
		return 1
	fi

	local duration=${1:-5}
	local core samples
	local -g is_idle=()

	printf 'Collecting cpu idle stats (cpus: %s) for %u seconds...\n' \
		"${cpus_to_collect[*]}" "$duration"

	get_cpu_time "$duration" idle "${cpus_to_collect[@]}"

	for core in "${cpus_to_collect[@]}"; do
		read -ra samples <<< "${cpu_times[core]}"
		printf '* cpu%u idle samples: %s (avg: %u%%)\n' \
			"$core" "${samples[*]}" "${avg_cpu_time[core]}"
		# Cores with polling reactors have 0% idle time, while the ones in
		# interrupt mode won't have 100% idle. The kernel can potentially
		# schedule work to the core; to keep that from affecting tests a
		# reasonably high idle limit is used. Only the last sample counts.
		if ((samples[-1] >= 70)); then
			printf '* cpu%u is idle\n' "$core"
			is_idle[core]=1
		else
			printf '*cpu%u is not idle\n' "$core"
			is_idle[core]=0
		fi
	done
}
558
# Rebuild the global thread_cpus array (thread id -> cpu) from the reactors
# reported by framework_get_reactors, looking only at the cpus in the global
# cpus array. One line is printed per thread found.
# Returns: 1 when cpus is empty or when no thread was found at all
update_thread_cpus_map() {
	# thread is local so the loop below leaves a caller's variable alone.
	local cpu thread
	local -g thread_cpus=()
	local reactor_framework

	((${#cpus[@]} > 0)) || return 1

	# Provides thread_map, used for the thread names printed below.
	get_thread_stats

	reactor_framework=$(rpc_cmd framework_get_reactors | jq -r '.reactors[]')
	for cpu in "${cpus[@]}"; do
		for thread in $(jq -r "select(.lcore == $cpu) | .lw_threads[].id" <<< "$reactor_framework"); do
			printf '* Thread %u (%s) on cpu%u\n' "$thread" "${thread_map[thread]}" "$cpu"
			thread_cpus[thread]=$cpu
		done
	done
	((${#thread_cpus[@]} > 0))
}
577