xref: /spdk/test/scheduler/common.sh (revision 1078198e78653b2f39414c1566740018d76ee73d)
#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2020 Intel Corporation
#  All rights reserved.
#
5
# Read-only sysfs locations the topology helpers below read from.
declare -r sysfs_system="/sys/devices/system"
declare -r sysfs_cpu="$sysfs_system/cpu"
declare -r sysfs_node="$sysfs_system/node"

# Path of the scheduler test application; $rootdir must be set by the caller.
declare -r scheduler="$rootdir/test/event/scheduler/scheduler"
# Name of the rpc plugin passed via --plugin by the thread helpers below.
declare plugin="scheduler_plugin"

# NOTE(review): presumably provides exec_in_cgroup() used further down - confirm.
source "$rootdir/test/scheduler/cgroups.sh"
14
# Record every element of a list in the named indexed array, using the
# element both as index and as value (array[N]=N). Existing entries are kept,
# so repeated calls accumulate and duplicates collapse.
#
# Arguments: $1   - name of the target array, resolved in the caller's scope
#            $2.. - numeric elements to record
# Returns:   0 when called without any argument; otherwise the status of the
#            last assignment (0 when there are no elements)
fold_list_onto_array() {
	local _fold_name=$1
	local _fold_elem

	shift || return 0

	# A nameref stores each element as plain data. Building the assignment
	# with eval would re-parse the element as shell code, and the helper's own
	# locals would shadow a target that happens to share their name.
	local -n _fold_target=$_fold_name

	for _fold_elem; do
		_fold_target[_fold_elem]=$_fold_elem
	done
}
25
# Print all arguments joined with commas, followed by a newline.
fold_array_onto_string() {
	local IFS=","

	# "$*" joins the positional parameters with the first character of IFS.
	echo "$*"
}
32
# Expand a cpu list file (first line, e.g. "0-2,4,6-9") into individual cpu
# numbers, printed one per line in ascending order without duplicates.
#
# Arguments: $1 - path of the file holding the list
# Returns:   0 without output when the list is empty
parse_cpu_list() {
	local list_file=$1
	local chunk chunks cpu_set
	local first last

	# 0-2,4,6-9, etc.
	IFS="," read -ra chunks < "$list_file"

	if ((${#chunks[@]} == 0)); then
		return 0
	fi

	for chunk in "${chunks[@]}"; do
		case "$chunk" in
			*-*)
				# A range: mark every cpu from its start to its end.
				first=${chunk%-*} last=${chunk#*-}
				for (( ; first <= last; first++)); do
					cpu_set[first]=$first
				done
				;;
			*)
				cpu_set[chunk]=$chunk
				;;
		esac
	done
	# Indexes of the array come out sorted and unique.
	printf '%u\n' "${!cpu_set[@]}"
}
54
# Record the cpus of one NUMA node in the topology arrays.
#
# For every cpu listed in the node's cpulist this fills node_<N>_cpu, the
# global cpus list and cpu_node_map. For cpus that are online it additionally
# fills node_<N>_core_<C>, cpu_core_map, node_<N>_core_<C>_thread_<cpu> and
# cpu_siblings (which stores the name of that per-thread array as "name[@]"
# for use with indirect expansion).
#
# Arguments: $1 - node index
map_cpus_node() {
	local node_idx=$1
	local cpu_idx core_idx thread_var
	local -n _node_cpus=node_${node_idx}_cpu

	for cpu_idx in $(parse_cpu_list "$sysfs_node/node$node_idx/cpulist"); do
		# Every listed cpu is recorded for the node, online or not.
		_node_cpus[cpu_idx]=$cpu_idx
		cpu_node_map[cpu_idx]=$node_idx
		cpus+=("$cpu_idx")

		# Core and sibling details are only collected for online cpus.
		is_cpu_online "$cpu_idx" || continue

		core_idx=$(< "$sysfs_cpu/cpu$cpu_idx/topology/core_id")
		thread_var=node_${node_idx}_core_${core_idx}_thread_${cpu_idx}

		# Re-declaring the nameref retargets it at this cpu's core array.
		local -n _core_cpus=node_${node_idx}_core_${core_idx}
		_core_cpus+=("$cpu_idx")
		cpu_core_map[cpu_idx]=$core_idx

		local -n _thread_siblings=$thread_var
		_thread_siblings=($(parse_cpu_list "$sysfs_cpu/cpu$cpu_idx/topology/thread_siblings_list"))
		cpu_siblings[cpu_idx]="${thread_var}[@]"
	done

	nodes[node_idx]=$node_idx
}
75
# Rebuild the cpu topology maps from sysfs.
#
# Resets the global arrays cpus, cpu_siblings, nodes, cpu_node_map,
# cpu_core_map and core_node_map, drops every variable whose name starts with
# "node_" (the per-node helper arrays of a previous run) and then calls
# map_cpus_node() for each node<N> directory under $sysfs_node.
map_cpus() {
	local -g cpus=()
	local -g cpu_siblings=()
	local -g nodes=()
	local -g cpu_node_map=()
	local -g cpu_core_map=()
	local -g core_node_map=()
	local node

	unset -v "${!node_@}"

	for node in "$sysfs_node"/node[0-9]*; do
		# Skip the pattern itself when nothing matched, and any entry which is
		# not strictly node<number>. This also keeps the function independent
		# of the extglob shell option.
		[[ ${node##*node} =~ ^[0-9]+$ ]] || continue
		map_cpus_node "${node##*node}"
	done
}
91
# Print cpu numbers, one per line, from the maps built by map_cpus().
#
# Arguments: $1 - optional node index; without it the global cpus list is used
#            $2 - optional core index within that node
# Returns:   1 when the selected list is empty
get_cpus() {
	local node=$1
	local core=$2
	local _cpus ref

	if [[ -z $node ]]; then
		_cpus=("${cpus[@]}")
	else
		# Select the per-node (or per-core) array by name through indirect
		# expansion rather than an eval-built command.
		ref="node_${node}_cpu[@]"
		if [[ -n $core ]]; then
			ref="node_${node}_core_${core}[@]"
		fi
		_cpus=("${!ref}")
	fi
	((${#_cpus[@]} > 0)) || return 1
	printf '%u\n' "${_cpus[@]}"
}
108
# Print the cpus listed in $sysfs_cpu/isolated, one per line. Prints nothing
# and succeeds when that file does not exist.
get_isolated_cpus() {
	if [[ -e $sysfs_cpu/isolated ]]; then
		parse_cpu_list "$sysfs_cpu/isolated"
	fi
}
113
# Print the cpus listed in $sysfs_cpu/offline, one per line. Prints nothing
# and succeeds when that file does not exist.
get_offline_cpus() {
	[[ -e $sysfs_cpu/offline ]] || return 0
	parse_cpu_list "$sysfs_cpu/offline"
}
120
# Print the cpus listed in $sysfs_cpu/online, one per line. Prints nothing
# and succeeds when that file does not exist.
get_online_cpus() {
	local online_list=$sysfs_cpu/online

	if [[ ! -e $online_list ]]; then
		return 0
	fi
	parse_cpu_list "$online_list"
}
125
# Succeed when the given cpu number is reported by get_online_cpus().
#
# Arguments: $1 - cpu number
is_cpu_online() {
	local cpu_id=$1
	local online

	# Index the online cpus by number so the check is a plain lookup.
	fold_list_onto_array online $(get_online_cpus)
	[[ -v online[$cpu_id] ]]
}
132
# Succeed when is_cpu_online() fails for the given cpu number.
#
# Arguments: $1 - cpu number
is_cpu_offline() {
	if is_cpu_online "$1"; then
		return 1
	fi
	return 0
}
136
# Bring a cpu online by writing 1 to its sysfs "online" file.
#
# Arguments: $1 - cpu number
# Returns:   0 when the cpu is already online or the write succeeded,
#            non-zero when the cpu has no "online" file or the write failed
online_cpu() {
	local knob=$sysfs_cpu/cpu$1/online

	if ! is_cpu_offline "$1"; then
		return 0
	fi

	if [[ ! -e $knob ]]; then
		return 1
	fi
	echo 1 > "$knob"
}
141
# Take a cpu offline by writing 0 to its sysfs "online" file.
#
# Arguments: $1 - cpu number
# Returns:   0 when the cpu is already offline or the write succeeded,
#            non-zero when the cpu has no "online" file or the write failed
offline_cpu() {
	local knob=$sysfs_cpu/cpu$1/online

	if ! is_cpu_online "$1"; then
		return 0
	fi

	if [[ ! -e $knob ]]; then
		return 1
	fi
	echo 0 > "$knob"
}
146
# Print the given cpus as a bracketed, comma-separated list, e.g. [0,1,2].
mask_cpus() {
	local joined

	joined=$(fold_array_onto_string "$@")
	printf '[%s]\n' "$joined"
}
150
# Add the currently offline cpus plus any cpus given as arguments to the
# global "denied" array (indexed by cpu number). The array is not cleared
# first, so entries accumulate across calls.
denied_list() {
	local -a offline_cpus=($(get_offline_cpus))

	declare -g denied

	fold_list_onto_array denied "${offline_cpus[@]}" "$@"
}
156
# Remove from the "allowed" array every cpu that has a non-empty entry in the
# "denied" array or whose number is above 127.
# NOTE(review): the 127 cap presumably mirrors a cpu mask limit of the tested
# application - confirm.
filter_allowed_list() {
	local idx

	for idx in "${!allowed[@]}"; do
		if [[ -z ${denied[idx]} ]] && ((idx <= 127)); then
			continue
		fi
		unset -v "allowed[idx]"
	done
}
166
# Build the global "allowed" array (indexed by cpu number) for the tests.
#
# The isolated cpus are added first; when any exist on the initial call, max
# is raised by their count. Then, walking the cpus of the given node, all cpus
# sharing a core with the current cpu are added until at least max cpus are
# collected or the node is exhausted. Finally filter_allowed_list() drops the
# denied ones; if that leaves fewer than max cpus, the function calls itself
# to continue the walk.
#
# Arguments: $1 - number of cpus wanted (default: 4)
#            $2 - node index to take the cpus from (default: 0)
allowed_list() {
	local max=${1:-4}
	local node=${2:-0}
	# On a recursive call this picks up the position reached by the caller,
	# since its local cpu_count is still visible here.
	local cpu_count=${cpu_count:--1}

	local -g allowed

	fold_list_onto_array allowed $(get_isolated_cpus)

	if ((cpu_count < 0 && ${#allowed[@]} > 0)); then
		max=$((max + ${#allowed[@]}))
	fi

	local -n node_cpu_ref=node_${node}_cpu

	while ((${#allowed[@]} < max && ++cpu_count < ${#node_cpu_ref[@]})); do
		fold_list_onto_array allowed $(get_cpus "$node" "${cpu_core_map[node_cpu_ref[cpu_count]]}")
	done

	filter_allowed_list

	# Whole cores are added at once, so the list may end up bigger than max.
	# Testing for equality only would then recurse forever, as nothing more
	# can be added and the node is not exhausted either.
	if ((${#allowed[@]} >= max || cpu_count == ${#node_cpu_ref[@]})); then
		return 0
	fi
	allowed_list "$max" "$node"
}
196
# Print the cpus a process is allowed to run on, one per line, taken from the
# Cpus_allowed_list entry of /proc/<pid>/status.
#
# Arguments: $1 - pid to inspect (default: pid of the current shell)
# Returns:   1 when /proc/<pid>/status does not exist, 0 otherwise (also when
#            the file has no Cpus_allowed_list entry)
get_proc_cpu_affinity() {
	xtrace_disable

	local pid=${1:-$$}
	local status val rc=0

	if [[ -e /proc/$pid/status ]]; then
		while IFS=":"$'\t' read -r status val; do
			if [[ $status == Cpus_allowed_list ]]; then
				parse_cpu_list <(echo "$val")
				break
			fi
		done < "/proc/$pid/status"
	else
		rc=1
	fi

	# Every path ends up here, so each xtrace_disable is paired with an
	# xtrace_restore instead of being skipped by an early return.
	xtrace_restore
	return "$rc"
}
213
# Snapshot the cpufreq setup of every cpu which has a cpufreq directory under
# $sysfs_cpu/cpu<N>.
#
# All results are global arrays indexed by cpu number and are reset on each
# call. cpufreq_available_freqs and cpufreq_available_governors hold the name
# of a per-cpu array ("available_freqs_cpu_<N>[@]") meant to be read through
# indirect expansion. turbo_enabled is taken from cpufreq/boost or, when that
# is missing, from the inverse of intel_pstate/no_turbo.
map_cpufreq() {
	# This info is used to cross-reference current cpufreq setup with
	# what DPDK's governor actually puts in place.

	local -g cpufreq_drivers=()
	local -g cpufreq_governors=()
	local -g cpufreq_base_freqs=()
	local -g cpufreq_max_freqs=()
	local -g cpufreq_min_freqs=()
	local -g cpufreq_cur_freqs=()
	local -g cpufreq_is_turbo=()
	local -g cpufreq_available_freqs=()
	local -g cpufreq_available_governors=()
	local -g cpufreq_high_prio=()
	local -g cpufreq_non_turbo_ratio=()
	local -g cpufreq_setspeed=()
	local -g cpuinfo_max_freqs=()
	local -g cpuinfo_min_freqs=()
	local -g turbo_enabled=0
	# Filled in for cppc_cpufreq only. These stay global, as they always
	# were, but are now reset together with the other maps.
	local -g scaling_min_freqs=()
	local -g scaling_max_freqs=()
	local -g nominal_perf=()
	local -g highest_perf=()
	local cpu cpu_idx
	local non_turbo_ratio base_max_freq num_freqs freq

	for cpu in "$sysfs_cpu"/cpu[0-9]*; do
		cpu_idx=${cpu##*cpu}
		# Only cpu<number> entries are of interest; this also skips the
		# pattern itself when nothing matched.
		[[ $cpu_idx =~ ^[0-9]+$ ]] || continue
		[[ -e $cpu/cpufreq ]] || continue
		cpufreq_drivers[cpu_idx]=$(< "$cpu/cpufreq/scaling_driver")
		cpufreq_governors[cpu_idx]=$(< "$cpu/cpufreq/scaling_governor")

		# In case HWP is on
		if [[ -e $cpu/cpufreq/base_frequency ]]; then
			cpufreq_base_freqs[cpu_idx]=$(< "$cpu/cpufreq/base_frequency")
		fi

		cpufreq_cur_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_cur_freq")
		cpufreq_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_max_freq")
		cpufreq_min_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_min_freq")

		local -n available_governors=available_governors_cpu_${cpu_idx}
		cpufreq_available_governors[cpu_idx]="available_governors_cpu_${cpu_idx}[@]"
		available_governors=($(< "$cpu/cpufreq/scaling_available_governors"))

		local -n available_freqs=available_freqs_cpu_${cpu_idx}
		cpufreq_available_freqs[cpu_idx]="available_freqs_cpu_${cpu_idx}[@]"

		case "${cpufreq_drivers[cpu_idx]}" in
			acpi-cpufreq)
				available_freqs=($(< "$cpu/cpufreq/scaling_available_frequencies"))
				# A first entry exactly 1000 above the second one is
				# treated as the turbo frequency.
				if ((available_freqs[0] - 1000 == available_freqs[1])); then
					cpufreq_is_turbo[cpu_idx]=1
				else
					cpufreq_is_turbo[cpu_idx]=0
				fi
				cpufreq_setspeed[cpu_idx]=$(< "$cpu/cpufreq/scaling_setspeed")
				;;
			intel_pstate | intel_cpufreq) # active or passive
				non_turbo_ratio=$("$testdir/rdmsr.pl" "$cpu_idx" 0xce)
				cpuinfo_min_freqs[cpu_idx]=$(< "$cpu/cpufreq/cpuinfo_min_freq")
				cpuinfo_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/cpuinfo_max_freq")
				cpufreq_non_turbo_ratio[cpu_idx]=$(((non_turbo_ratio >> 8) & 0xff))
				if ((cpufreq_base_freqs[cpu_idx] / 100000 > cpufreq_non_turbo_ratio[cpu_idx])); then
					cpufreq_high_prio[cpu_idx]=1
					base_max_freq=${cpufreq_base_freqs[cpu_idx]}
				else
					cpufreq_high_prio[cpu_idx]=0
					base_max_freq=$((cpufreq_non_turbo_ratio[cpu_idx] * 100000))
				fi
				num_freqs=$(((base_max_freq - cpuinfo_min_freqs[cpu_idx]) / 100000 + 1))
				if ((base_max_freq < cpuinfo_max_freqs[cpu_idx])); then
					num_freqs=$((num_freqs + 1))
					cpufreq_is_turbo[cpu_idx]=1
				else
					cpufreq_is_turbo[cpu_idx]=0
				fi
				# Rebuild the list in 100000 steps going down from
				# base_max_freq, with base_max_freq + 1 on top when
				# turbo is available.
				available_freqs=()
				for ((freq = 0; freq < num_freqs; freq++)); do
					if ((freq == 0 && cpufreq_is_turbo[cpu_idx] == 1)); then
						available_freqs[freq]=$((base_max_freq + 1))
					else
						available_freqs[freq]=$((base_max_freq - (freq - cpufreq_is_turbo[cpu_idx]) * 100000))
					fi
				done
				;;
			cppc_cpufreq)
				cpufreq_setspeed[cpu_idx]=$(< "$cpu/cpufreq/scaling_setspeed")
				scaling_min_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_min_freq")
				scaling_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_max_freq")
				cpuinfo_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/cpuinfo_max_freq")
				nominal_perf[cpu_idx]=$(< "$cpu/acpi_cppc/nominal_perf")
				highest_perf[cpu_idx]=$(< "$cpu/acpi_cppc/highest_perf")

				# The unit of highest_perf and nominal_perf differs on different arm platforms.
				# For highest_perf, it may be 300 or 3000000, both mean 3.0GHz.
				if ((highest_perf[cpu_idx] > nominal_perf[cpu_idx] && (highest_perf[cpu_idx] == cpuinfo_max_freqs[cpu_idx] || highest_perf[cpu_idx] * 10000 == cpuinfo_max_freqs[cpu_idx]))); then
					cpufreq_is_turbo[cpu_idx]=1
				else
					cpufreq_is_turbo[cpu_idx]=0
				fi

				if ((nominal_perf[cpu_idx] < 10000)); then
					nominal_perf[cpu_idx]=$((nominal_perf[cpu_idx] * 10000))
				fi

				num_freqs=$(((nominal_perf[cpu_idx] - scaling_min_freqs[cpu_idx]) / 100000 + 1 + cpufreq_is_turbo[cpu_idx]))

				available_freqs=()
				for ((freq = 0; freq < num_freqs; freq++)); do
					if ((freq == 0 && cpufreq_is_turbo[cpu_idx] == 1)); then
						available_freqs[freq]=$((scaling_max_freqs[cpu_idx]))
					else
						available_freqs[freq]=$((nominal_perf[cpu_idx] - (freq - cpufreq_is_turbo[cpu_idx]) * 100000))
					fi
				done
				;;
		esac
	done
	if [[ -e $sysfs_cpu/cpufreq/boost ]]; then
		turbo_enabled=$(< "$sysfs_cpu/cpufreq/boost")
	elif [[ -e $sysfs_cpu/intel_pstate/no_turbo ]]; then
		turbo_enabled=$((!$(< "$sysfs_cpu/intel_pstate/no_turbo")))
	fi
}
340
# Apply a frequency setting to a cpu according to its mapped cpufreq driver.
#
# For acpi-cpufreq and cppc_cpufreq the governor is switched to userspace (if
# needed) and min_freq is written to scaling_setspeed. For intel_pstate and
# intel_cpufreq max_freq is written to scaling_max_freq when given and not
# below min_freq, and min_freq is written to scaling_min_freq when it does not
# exceed the max frequency recorded in cpufreq_max_freqs.
#
# Arguments: $1 - cpu number
#            $2 - min frequency (the frequency to set for the userspace case)
#            $3 - optional max frequency
# Returns:   1 when the cpu has no entry in cpufreq_drivers or no min
#            frequency was given
set_cpufreq() {
	local cpu=$1
	local min_freq=$2
	local max_freq=$3
	local cpufreq=$sysfs_cpu/cpu$cpu/cpufreq
	local driver=${cpufreq_drivers[cpu]}

	# Map the cpufreq info first
	if [[ -z $driver || -z $min_freq ]]; then
		return 1
	fi

	if [[ $driver == acpi-cpufreq || $driver == cppc_cpufreq ]]; then
		[[ $(< "$cpufreq/scaling_governor") == userspace ]] \
			|| echo "userspace" > "$cpufreq/scaling_governor"
		echo "$min_freq" > "$cpufreq/scaling_setspeed"
	elif [[ $driver == intel_pstate || $driver == intel_cpufreq ]]; then
		if [[ -n $max_freq ]] && ((max_freq >= min_freq)); then
			echo "$max_freq" > "$cpufreq/scaling_max_freq"
		fi
		if ((min_freq <= cpufreq_max_freqs[cpu])); then
			echo "$min_freq" > "$cpufreq/scaling_min_freq"
		fi
	fi
}
368
# Switch the scaling governor of a cpu, unless it is already in place.
#
# Arguments: $1 - cpu number
#            $2 - governor name
set_cpufreq_governor() {
	local cpu=$1
	local governor=$2
	local governor_file=$sysfs_cpu/cpu$cpu/cpufreq/scaling_governor

	# Write only when the current governor differs from the requested one.
	[[ $(< "$governor_file") == "$governor" ]] || echo "$governor" > "$governor_file"
}
378
# (Re)start the given application in the /cpuset/spdk cgroup with
# --wait-for-rpc, switch it to the dynamic scheduler and let it finish its
# initialization. The pid of the new instance is stored in spdk_pid.
#
# Arguments: $@ - application command line
exec_under_dynamic_scheduler() {
	local _sched_rpc=$rootdir/scripts/rpc.py

	# Get rid of the instance tracked by spdk_pid if it is still around.
	[[ ! -e /proc/$spdk_pid/status ]] || killprocess "$spdk_pid"

	exec_in_cgroup "/cpuset/spdk" "$@" --wait-for-rpc &
	spdk_pid=$!
	# Give some time for the app to init itself
	waitforlisten "$spdk_pid"
	"$_sched_rpc" framework_set_scheduler dynamic
	"$_sched_rpc" framework_start_init
}
390
# Refresh the thread statistics with xtrace silenced: _get_thread_stats()
# is asked to store its results in the arrays named "busy" and "idle".
get_thread_stats() {
	local -a _stat_lists=(busy idle)

	xtrace_disable
	_get_thread_stats "${_stat_lists[@]}"
	xtrace_restore
}
396
# Fetch thread_get_stats over RPC and store, per thread id, the busy and idle
# counters in the arrays named by the caller and the thread name in the
# global thread_map array. Existing entries of other thread ids are kept.
#
# Arguments: $1 - name of the array receiving the busy counters
#            $2 - name of the array receiving the idle counters
_get_thread_stats() {
	local list_busy=$1
	local list_idle=$2
	local thread threads stats
	# Namerefs write straight into the caller's arrays; assembling the
	# assignments with eval would re-parse the jq output as shell code.
	local -n _busy_ref=$list_busy _idle_ref=$list_idle

	stats=$(rpc_cmd thread_get_stats | jq -r '.threads[]')
	threads=($(jq -r '.id' <<< "$stats"))

	for thread in "${threads[@]}"; do
		_busy_ref[$thread]=$(jq -r "select(.id == $thread) | .busy" <<< "$stats")
		_idle_ref[$thread]=$(jq -r "select(.id == $thread) | .idle" <<< "$stats")
		thread_map[thread]=$(jq -r "select(.id == $thread) | .name" <<< "$stats")
	done
}
411
# Print time counters of a single cpu as found in its line of /proc/stat.
#
# Arguments: $1 - cpu number
#            $2 - "idle" to print the 4th counter only, "all" to print every
#                 counter on its own line; anything else prints nothing
get_cpu_stat() {
	local cpu_idx=$1
	local stat=$2 stats astats
	# Keep the label read from each line out of the caller's scope.
	local cpu

	while read -r cpu stats; do
		if [[ $cpu == "cpu$cpu_idx" ]]; then
			astats=($stats)
			# The line of interest was found, no need to read further.
			break
		fi
	done < /proc/stat

	case "$stat" in
		idle) echo "${astats[3]}" ;;
		all) printf '%u\n' "${astats[@]}" ;;
		*) ;;
	esac
}
426
# Issue the scheduler_thread_create RPC of the plugin named by $plugin,
# passing all arguments through.
create_thread() {
	local -a _thread_rpc=(scheduler_thread_create "$@")

	rpc_cmd --plugin "$plugin" "${_thread_rpc[@]}"
}
430
# Issue the scheduler_thread_delete RPC of the plugin named by $plugin,
# passing all arguments through.
destroy_thread() {
	local -a _thread_rpc=(scheduler_thread_delete "$@")

	rpc_cmd --plugin "$plugin" "${_thread_rpc[@]}"
}
434
# Issue the scheduler_thread_set_active RPC of the plugin named by $plugin,
# passing all arguments through.
active_thread() {
	local -a _thread_rpc=(scheduler_thread_set_active "$@")

	rpc_cmd --plugin "$plugin" "${_thread_rpc[@]}"
}
438
# Sample the time counters of the given cpus once per second and work out,
# for each sample, what percentage of the elapsed time went into each counter.
#
# Arguments: $1   - number of samples to collect (values <= 0 are taken as 1)
#            $2   - name of the time of interest, see cpu_time_map below
#            $3.. - cpu numbers to sample
# Globals written:
#   cpu_times[cpu]     - space separated percentages of the requested time
#   avg_cpu_time[cpu]  - average of those percentages
#   cpu_time_map       - time name to counter index
#   raw_samples_<cpu>, raw_samples_<stat>_<cpu>, avg_load_<cpu>,
#   stat_<stat>_<cpu>, old_stats_<cpu> - helper arrays, the raw ones being
#   read by cpu_usage_clk_tck()
get_cpu_time() {
	xtrace_disable

	local interval=$1 cpu_time=${2:-idle} interval_count
	shift 2
	local cpus=("$@") cpu
	local stats stat old_stats avg_load
	local total_sample
	# Per-sample scratch array, kept out of the global scope.
	local sample_stats

	# Exposed for the caller
	local -g cpu_times=()
	local -g avg_cpu_time=()

	# cpu_time:
	# 0 - user (time spent in user mode)
	# 1 - nice (Time spent in user mode with low priority)
	# 2 - system (Time spent in system mode)
	# 3 - idle (Time spent in the idle task)
	# 4 - iowait (Time waiting for I/O to complete)
	# 5 - irq (Time servicing interrupts)
	# 6 - softirq (Time servicing softirqs)
	# 7 - steal (Stolen time)
	# 8 - guest (Time spent running a virtual CPU)
	# 9 - guest_nice (Time spent running a niced guest)

	local -gA cpu_time_map
	cpu_time_map["user"]=0
	cpu_time_map["nice"]=1
	cpu_time_map["system"]=2
	cpu_time_map["idle"]=3
	cpu_time_map["iowait"]=4
	cpu_time_map["irq"]=5
	cpu_time_map["softirq"]=6
	cpu_time_map["steal"]=7
	cpu_time_map["guest"]=8
	cpu_time_map["guest_nice"]=9

	# Clear up the env
	unset -v ${!stat_@}
	unset -v ${!old_stat_@}
	unset -v ${!avg_stat@}
	unset -v ${!avg_load@}
	unset -v ${!raw_samples@}

	cpu_time=${cpu_time_map["$cpu_time"]}
	interval=$((interval <= 0 ? 1 : interval))
	# We skip first sample to have min 2 for stat comparison
	interval=$((interval + 1)) interval_count=0
	while ((interval_count++, --interval >= 0)); do
		for cpu in "${cpus[@]}"; do
			local -n old_stats=old_stats_$cpu
			local -n avg_load=avg_load_$cpu
			local -n raw_samples=raw_samples_$cpu

			sample_stats=() total_sample=0

			stats=($(get_cpu_stat "$cpu" all))
			if ((interval_count == 1)); then
				# Skip first sample
				old_stats=("${stats[@]}")
				continue
			fi
			# Delta of every counter since the previous sample, and their sum
			for stat in "${!stats[@]}"; do
				avg_load[stat]="stat_${stat}_${cpu}[@]"
				sample_stats[stat]=$((stats[stat] - old_stats[stat]))
				: $((total_sample += sample_stats[stat]))
			done
			# Keep the raw counter and its share (in %) of the sample
			for stat in "${!stats[@]}"; do
				local -n avg_stat=stat_${stat}_${cpu}
				local -n raw_samples_ref=raw_samples_${stat}_${cpu}
				raw_samples[stat]="raw_samples_${stat}_${cpu}[@]"
				raw_samples_ref+=("${stats[stat]}")
				avg_stat+=($((sample_stats[stat] * 100 / (total_sample == 0 ? 1 : total_sample))))
			done
			old_stats=("${stats[@]}")
		done
		sleep 1s
	done

	# We collected % for each time. Now determine the avg % for requested time.
	local load stat_load
	for cpu in "${cpus[@]}"; do
		load=0
		local -n avg_load_cpu=avg_load_$cpu
		stat_load=("${!avg_load_cpu[cpu_time]}")
		for stat in "${stat_load[@]}"; do
			: $((load += stat))
		done
		cpu_times[cpu]=${stat_load[*]}
		avg_cpu_time[cpu]=$((load / ${#stat_load[@]}))
	done

	xtrace_restore
}
533
# Sample the idle time of the cpus listed in cpus_to_collect and record in the
# global is_idle array (indexed by cpu number) whether each one counts as
# idle (1) or not (0).
#
# Arguments: $1 - number of seconds to collect for (default: 5)
# Returns:   1 when cpus_to_collect is empty
collect_cpu_idle() {
	if ((${#cpus_to_collect[@]} == 0)); then
		return 1
	fi

	local duration=${1:-5}
	local cpu
	local idle_samples user_load
	local -g is_idle=()

	printf 'Collecting cpu idle stats (cpus: %s) for %u seconds...\n' \
		"${cpus_to_collect[*]}" "$duration"

	get_cpu_time "$duration" idle "${cpus_to_collect[@]}"

	for cpu in "${cpus_to_collect[@]}"; do
		idle_samples=(${cpu_times[cpu]})
		printf '* cpu%u idle samples: %s (avg: %u%%)\n' \
			"$cpu" "${idle_samples[*]}" "${avg_cpu_time[cpu]}"
		# Cores with polling reactors have 0% idle time, while the ones in
		# interrupt mode won't have 100% idle. During the tests, polling
		# reactors spend the major portion of their cpu time in user mode.
		# So when the idle check (done on the last sample) fails, look at
		# the share of user mode load, taken from the raw samples in
		# SC_CLK_TCK context, before calling the cpu busy.
		user_load=$(cpu_usage_clk_tck "$cpu" user)

		# Idle unless both checks below fail.
		is_idle[cpu]=1
		if ((idle_samples[-1] >= 70)); then
			printf '* cpu%u is idle\n' "$cpu"
		elif ((user_load <= 15)); then
			printf '* cpu%u not fully idle, but user load is low so passing\n' "$cpu"
		else
			printf '* cpu%u is not idle\n' "$cpu"
			is_idle[cpu]=0
		fi
	done
}
573
# Print the load (0-100) of a cpu over its last two raw samples, expressed
# against the clock ticks per second reported by getconf CLK_TCK. The samples
# are the raw_samples_* arrays filled in by get_cpu_time(). A summary of the
# result and of the samples goes to stderr.
#
# Arguments: $1 - cpu number
#            $2 - time to account: user, nice, system or all (default: all)
# Returns:   1 when there are no raw samples for the cpu
cpu_usage_clk_tck() {
	local cpu=$1 time=${2:-all}
	local user nice system usage clk_delta=0

	# We should be called in get_cpu_time()'s environment.
	[[ -v raw_samples_$cpu ]] || return 1

	local -n raw_samples=raw_samples_$cpu
	user=("${!raw_samples[cpu_time_map["user"]]}")
	nice=("${!raw_samples[cpu_time_map["nice"]]}")
	system=("${!raw_samples[cpu_time_map["system"]]}")

	# Construct delta based on last two samples of a given time. A bare
	# ((...)) fails whenever the result is 0 and would abort the caller
	# under errexit, hence the ": $((...))" form.
	case "$time" in
		user | all) : $((clk_delta += (user[-1] - user[-2]))) ;;&
		nice | all) : $((clk_delta += (nice[-1] - nice[-2]))) ;;&
		system | all) : $((clk_delta += (system[-1] - system[-2]))) ;;
		*) ;;
	esac
	# We assume 1s between each sample. See get_cpu_time().
	usage=$((100 * clk_delta / $(getconf CLK_TCK)))
	usage=$((usage > 100 ? 100 : usage))

	printf '%u' "$usage"
	printf '* cpu%u %s usage: %u\n' "$cpu" "$time" "$usage" >&2
	printf '* cpu%u user samples: %s\n' "$cpu" "${user[*]}" >&2
	printf '* cpu%u nice samples: %s\n' "$cpu" "${nice[*]}" >&2
	printf '* cpu%u system samples: %s\n' "$cpu" "${system[*]}" >&2
}
603
# Rebuild the global thread_cpus array (thread id -> cpu) from the
# framework_get_reactors RPC, for the cpus listed in the "cpus" array, and
# print one line per thread found.
#
# Returns: 1 when the cpus array is empty; non-zero when no thread was found
update_thread_cpus_map() {
	# thread is a loop variable here, keep it out of the caller's scope
	local cpu thread
	local -g thread_cpus=()
	local reactor_framework

	((${#cpus[@]} > 0)) || return 1

	# Refreshes thread_map, used below to print the thread names
	get_thread_stats

	reactor_framework=$(rpc_cmd framework_get_reactors | jq -r '.reactors[]')
	for cpu in "${cpus[@]}"; do
		for thread in $(jq -r "select(.lcore == $cpu) | .lw_threads[].id" <<< "$reactor_framework"); do
			printf '* Thread %u (%s) on cpu%u\n' "$thread" "${thread_map[thread]}" "$cpu"
			thread_cpus[thread]=$cpu
		done
	done
	((${#thread_cpus[@]} > 0))
}
622