xref: /spdk/test/scheduler/common.sh (revision 7bc1134d6e34affdbfaaa4a41c1a5cc8f2d609de)
1#  SPDX-License-Identifier: BSD-3-Clause
2#  Copyright (C) 2020 Intel Corporation
3#  All rights reserved.
4#
5
6declare -r sysfs_system=/sys/devices/system
7declare -r sysfs_cpu=$sysfs_system/cpu
8declare -r sysfs_node=$sysfs_system/node
9
10declare -r scheduler=$rootdir/test/event/scheduler/scheduler
11declare plugin=scheduler_plugin
12
13source "$rootdir/test/scheduler/cgroups.sh"
14
# Store every given element in the named indexed array under an index equal
# to the element itself (array[N]=N). Existing entries of the array are kept,
# so repeated calls accumulate.
#  $1   - name of the target array, resolved in the caller's scope
#  $2.. - elements to store; each must be usable as an array index
# Returns 0, also when called without any arguments.
fold_list_onto_array() {
	(($# > 0)) || return 0

	# A nameref writes straight into the caller's array. The previous
	# implementation assembled the assignment as a string and ran it through
	# eval, which re-parsed every element as shell code.
	local -n _fold_target=$1
	local elem

	shift

	for elem; do
		_fold_target[elem]=$elem
	done
}
25
# Print all arguments joined into a single comma-separated line.
fold_array_onto_string() {
	# "$*" joins the positional parameters with the first character of IFS.
	local IFS=","

	echo "$*"
}
32
# Expand a kernel-style cpu list (e.g. "0-2,4,6-9") into one cpu id per line.
#  $1 - file holding the list; only its first line is read
# Ids are printed in ascending order without duplicates, since they are
# collected as indexes of a sparse array. Prints nothing for an empty list.
parse_cpu_list() {
	local list=$1
	local -a ranges=() cpus=()
	local range first last

	IFS="," read -ra ranges < "$list"

	if ((${#ranges[@]} == 0)); then
		return 0
	fi

	for range in "${ranges[@]}"; do
		case "$range" in
			*-*)
				# A-B, inclusive on both ends
				first=${range%-*} last=${range#*-}
				while ((first <= last)); do
					cpus[first]=$first
					first=$((first + 1))
				done
				;;
			*)
				cpus[range]=$range
				;;
		esac
	done
	printf '%u\n' "${!cpus[@]}"
}
54
# Record the topology of every cpu listed in a NUMA node's cpulist.
#  $1 - node index (the N of $sysfs_node/nodeN)
# For each cpu of the node, regardless of its state:
#  - node_N_cpu[cpu]=cpu (written through the _cpu_node_map nameref)
#  - cpu_node_map[cpu]=N and the cpu appended to cpus[]
# Additionally, for cpus which is_cpu_online_f() reports as online:
#  - node_N_core_C[] gains the cpu, C being the cpu's topology/core_id
#  - cpu_core_map[cpu]=C
#  - node_N_core_C_thread_CPU[] is set to the cpu's thread_siblings_list
#  - cpu_siblings[cpu] holds that array's name as a "name[@]" string
# Finally nodes[N]=N marks the node as mapped.
# Apart from the local loop variables, everything written here is created by
# plain assignment, i.e. not scoped to this function.
map_cpus_node() {
	local node_idx=$1
	local -n _cpu_node_map=node_${node_idx}_cpu
	local cpu_idx core_idx

	for cpu_idx in $(parse_cpu_list "$sysfs_node/node$node_idx/cpulist"); do
		if is_cpu_online_f "$cpu_idx"; then
			core_idx=$(< "$sysfs_cpu/cpu$cpu_idx/topology/core_id")
			# The namerefs are re-declared on every pass so they follow the
			# current core/cpu.
			local -n _cpu_core_map=node_${node_idx}_core_${core_idx}
			_cpu_core_map+=("$cpu_idx") cpu_core_map[cpu_idx]=$core_idx
			local -n _cpu_siblings=node_${node_idx}_core_${core_idx}_thread_${cpu_idx}
			_cpu_siblings=($(parse_cpu_list "$sysfs_cpu/cpu$cpu_idx/topology/thread_siblings_list"))
			cpu_siblings[cpu_idx]="node_${node_idx}_core_${core_idx}_thread_${cpu_idx}[@]"
		fi
		# Offline cpus still belong to the node, they just carry no core info.
		_cpu_node_map[cpu_idx]=$cpu_idx cpu_node_map[cpu_idx]=$node_idx
		cpus+=("$cpu_idx")
	done

	nodes[node_idx]=$node_idx
}
75
# (Re)build the global cpu topology maps from sysfs.
# Resets cpus, cpu_siblings, nodes, cpu_node_map, cpu_core_map and
# core_node_map, drops every variable whose name starts with "node_" and
# then runs map_cpus_node() for each $sysfs_node/nodeN directory.
# The +([0-9]) pattern needs extglob to be enabled when this file is parsed.
map_cpus() {
	local entry

	local -g cpus=() cpu_siblings=() nodes=()
	local -g cpu_node_map=() cpu_core_map=() core_node_map=()

	# Forget the per-node arrays left behind by a previous run.
	unset -v "${!node_@}"

	for entry in "$sysfs_node/node"+([0-9]); do
		map_cpus_node "${entry##*node}"
	done
}
91
# Print cpu ids, one per line, from the maps built by map_cpus().
#  $1 - node index; when empty, all entries of cpus[] are printed
#  $2 - core id; when set (together with $1), only cpus of that core
# Returns 1 without printing anything if the selection is empty.
get_cpus() {
	local node=$1
	local core=$2
	local _cpus=()

	# Namerefs replace the eval-based lookups of the per-node arrays, so the
	# node/core arguments are only ever used as part of a variable name.
	if [[ -z $node ]]; then
		_cpus=("${cpus[@]}")
	elif [[ -n $core ]]; then
		local -n _core_cpus="node_${node}_core_${core}"
		_cpus=("${_core_cpus[@]}")
	else
		local -n _node_cpus="node_${node}_cpu"
		_cpus=("${_node_cpus[@]}")
	fi
	((${#_cpus[@]} > 0)) || return 1
	printf '%u\n' "${_cpus[@]}"
}
108
# Print the cpus listed in $sysfs_cpu/isolated, one per line.
# Prints nothing and returns 0 when the attribute does not exist.
get_isolated_cpus() {
	local isolated_list=$sysfs_cpu/isolated

	if [[ ! -e $isolated_list ]]; then
		return 0
	fi
	parse_cpu_list "$isolated_list"
}
113
# Print the cpus listed in $sysfs_cpu/offline, one per line.
# Prints nothing and returns 0 when the attribute does not exist.
get_offline_cpus() {
	[[ -e $sysfs_cpu/offline ]] || return 0
	parse_cpu_list "$sysfs_cpu/offline"
}
120
# Print the cpus listed in $sysfs_cpu/online, one per line.
# Prints nothing and returns 0 when the attribute does not exist.
get_online_cpus() {
	local online_list=$sysfs_cpu/online

	if [[ ! -e $online_list ]]; then
		return 0
	fi
	parse_cpu_list "$online_list"
}
125
# Succeed if the given cpu id shows up in the list from get_online_cpus().
#  $1 - cpu id
is_cpu_online() {
	local online_map

	# Index == cpu id, so membership boils down to "is that index set".
	fold_list_onto_array online_map $(get_online_cpus)
	[[ -v online_map[$1] ]]
}
132
# Logical negation of is_cpu_online(): succeed when the cpu is not listed
# as online.
#  $1 - cpu id
is_cpu_offline() {
	if is_cpu_online "$1"; then
		return 1
	fi
	return 0
}
136
# Succeed if the cpu is online according to its own sysfs "online" attribute.
#  $1 - cpu id
# Fails when the attribute is missing or does not read 1.
is_cpu_online_f() {
	local cpu=$1
	local state

	if ((cpu == 0)); then
		# cpu0 is special as it requires proper support in the kernel to be
		# hot pluggable. As such, it usually does not have its own online
		# attribute so always check the online list instead.
		is_cpu_online "$cpu"
		return
	fi

	state=$sysfs_cpu/cpu$cpu/online
	[[ -e $state ]] || return 1
	(($(< "$state") == 1))
}
150
# Logical negation of is_cpu_online_f().
#  $1 - cpu id
is_cpu_offline_f() {
	if is_cpu_online_f "$1"; then
		return 1
	fi
	return 0
}
154
155is_numa() {
156	local nodes=("$sysfs_node/node"+([0-9]))
157
158	((${#nodes[@]} > 1))
159}
160
# Bring a cpu online by writing 1 to its sysfs "online" attribute.
# Nothing is written unless is_cpu_offline_f() reports the cpu as offline.
#  $1 - cpu id
online_cpu() {
	if is_cpu_offline_f "$1"; then
		echo 1 > "$sysfs_cpu/cpu$1/online"
	fi
}
165
# Take a cpu offline by writing 0 to its sysfs "online" attribute.
# Nothing is written unless is_cpu_online_f() reports the cpu as online.
#  $1 - cpu id
offline_cpu() {
	if is_cpu_online_f "$1"; then
		echo 0 > "$sysfs_cpu/cpu$1/online"
	fi
}
170
# Print the given cpu ids as a bracketed, comma-separated list, e.g. [1,2,3].
mask_cpus() {
	local joined

	joined=$(fold_array_onto_string "$@")
	printf '[%s]\n' "$joined"
}
174
# Add cpus to the global denied[] lookup (index == value == cpu id).
# The offline cpus reported by get_offline_cpus() are always included;
# any arguments are extra cpu ids to deny. Existing entries are kept.
denied_list() {
	declare -g denied

	# Word-splitting the command substitution is intended: one word per cpu.
	fold_list_onto_array denied $(get_offline_cpus) "$@"
}
180
# Prune the global allowed[] lookup in place: an entry is removed when
# denied[] holds a non-empty value at the same index or when its index is
# above 127.
filter_allowed_list() {
	local idx

	for idx in "${!allowed[@]}"; do
		if ((idx <= 127)) && [[ -z ${denied[idx]} ]]; then
			continue
		fi
		unset -v "allowed[idx]"
	done
}
190
# Populate the global allowed[] lookup (index == value == cpu id).
#  $1 - target number of entries in allowed[] (default: 4)
#  $2 - NUMA node to pick cpus from (default: 0)
# Relies on node_<node>_cpu and cpu_core_map being filled in already
# (presumably by map_cpus() - confirm against the callers).
# Returns 0 once allowed[] holds exactly max entries or the node's cpus are
# exhausted; otherwise it calls itself again and returns that result.
allowed_list() {
	local max=${1:-4}
	local node=${2:-0}
	# On a recursive call this picks up the caller's cpu_count through bash's
	# dynamic scoping, so the walk over the node's cpus continues where it
	# stopped. A fresh call starts at -1 (unless a cpu_count variable is
	# already visible in the calling scope).
	local cpu_count=${cpu_count:--1}

	local -g allowed

	fold_list_onto_array allowed $(get_isolated_cpus)

	# First pass only: whatever is in allowed[] at this point (isolated cpus
	# plus anything left from earlier calls) does not count against max.
	if ((cpu_count < 0 && ${#allowed[@]} > 0)); then
		((max += ${#allowed[@]}))
	fi

	local -n node_cpu_ref=node_${node}_cpu

	# Add cpus one core at a time - all cpus of the core that the next cpu of
	# the node belongs to - until enough are collected or the node runs out.
	# NOTE(review): node_<node>_cpu is indexed by cpu id in map_cpus_node(),
	# while cpu_count is a plain counter; these line up only if the node's
	# cpu ids are contiguous starting at 0 - confirm.
	while ((${#allowed[@]} < max && ++cpu_count < ${#node_cpu_ref[@]})); do
		fold_list_onto_array allowed $(get_cpus "$node" "${cpu_core_map[node_cpu_ref[cpu_count]]}")
	done

	filter_allowed_list

	if ((${#allowed[@]} == max)); then
		return 0
	elif ((cpu_count == ${#node_cpu_ref[@]})); then
		return 0
	else
		# Filtering removed some cpus (or a core pushed the count past max),
		# try again with the remaining cpus of the node.
		allowed_list "$max" "$node"
	fi
}
220
# Print the cpus a process/thread is allowed to run on, one id per line,
# based on the Cpus_allowed_list entry of its status file.
#  $1 - either a path to a status file or a pid (default: $$)
# Returns 1 if no status file can be found. Returns 0 otherwise, printing
# nothing when the file has no Cpus_allowed_list entry.
get_proc_cpu_affinity() {
	xtrace_disable

	local pid=${1:-$$}
	local status val status_file

	if [[ -e $pid ]]; then
		status_file=$pid
	elif [[ -e /proc/$pid/status ]]; then
		status_file=/proc/$pid/status
	else
		# Every exit path has to undo xtrace_disable, otherwise tracing stays
		# off for the caller.
		xtrace_restore
		return 1
	fi

	# Lines look like "Name:<tab>value"; splitting on ':' and tab yields the
	# key and the value. The file is read directly instead of going through
	# a "< file" process substitution, which bash documents only for $( ).
	while IFS=":"$'\t' read -r status val; do
		if [[ $status == Cpus_allowed_list ]]; then
			parse_cpu_list <(echo "$val")
			break
		fi
	done < "$status_file"

	xtrace_restore
	return 0
}
245
# Snapshot the cpufreq setup of every cpu exposing a cpufreq directory.
# All results land in global arrays indexed by cpu id:
#  cpufreq_drivers, cpufreq_governors, cpufreq_base_freqs (only if the
#  base_frequency attribute exists), cpufreq_cur_freqs, cpufreq_max_freqs,
#  cpufreq_min_freqs, cpufreq_is_turbo, cpufreq_setspeed, cpufreq_high_prio,
#  cpufreq_non_turbo_ratio, cpuinfo_max_freqs, cpuinfo_min_freqs.
# cpufreq_available_governors[cpu] and cpufreq_available_freqs[cpu] hold
# "name[@]" strings naming the per-cpu arrays available_governors_cpu_N and
# available_freqs_cpu_N. turbo_enabled is set from cpufreq/boost or, failing
# that, from the inverse of intel_pstate/no_turbo.
# The +([0-9]) pattern needs extglob to be enabled when this file is parsed.
map_cpufreq() {
	# This info is used to cross-reference current cpufreq setup with
	# what DPDK's governor actually puts in place.

	local -g cpufreq_drivers=()
	local -g cpufreq_governors=()
	local -g cpufreq_base_freqs=()
	local -g cpufreq_max_freqs=()
	local -g cpufreq_min_freqs=()
	local -g cpufreq_cur_freqs=()
	local -g cpufreq_is_turbo=()
	local -g cpufreq_available_freqs=()
	local -g cpufreq_available_governors=()
	local -g cpufreq_high_prio=()
	local -g cpufreq_non_turbo_ratio=()
	local -g cpufreq_setspeed=()
	local -g cpuinfo_max_freqs=()
	local -g cpuinfo_min_freqs=()
	local -g turbo_enabled=0
	local cpu cpu_idx
	# Scratch variables of the driver-specific branches. They are declared
	# here, under the names actually used below, so that neither the
	# intel_pstate nor the cppc_cpufreq branch leaks them into the global
	# scope.
	local non_turbo_ratio base_max_freq num_freqs freq

	for cpu in "$sysfs_cpu/cpu"+([0-9]); do
		cpu_idx=${cpu##*cpu}
		[[ -e $cpu/cpufreq ]] || continue
		cpufreq_drivers[cpu_idx]=$(< "$cpu/cpufreq/scaling_driver")
		cpufreq_governors[cpu_idx]=$(< "$cpu/cpufreq/scaling_governor")

		# In case HWP is on
		if [[ -e $cpu/cpufreq/base_frequency ]]; then
			cpufreq_base_freqs[cpu_idx]=$(< "$cpu/cpufreq/base_frequency")
		fi

		cpufreq_cur_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_cur_freq")
		cpufreq_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_max_freq")
		cpufreq_min_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_min_freq")

		local -n available_governors=available_governors_cpu_${cpu_idx}
		cpufreq_available_governors[cpu_idx]="available_governors_cpu_${cpu_idx}[@]"
		available_governors=($(< "$cpu/cpufreq/scaling_available_governors"))

		local -n available_freqs=available_freqs_cpu_${cpu_idx}
		cpufreq_available_freqs[cpu_idx]="available_freqs_cpu_${cpu_idx}[@]"

		case "${cpufreq_drivers[cpu_idx]}" in
			acpi-cpufreq)
				available_freqs=($(< "$cpu/cpufreq/scaling_available_frequencies"))
				# Turbo is flagged when the first listed frequency sits
				# exactly 1000 above the second one.
				if ((available_freqs[0] - 1000 == available_freqs[1])); then
					cpufreq_is_turbo[cpu_idx]=1
				else
					cpufreq_is_turbo[cpu_idx]=0
				fi
				cpufreq_setspeed[cpu_idx]=$(< "$cpu/cpufreq/scaling_setspeed")
				;;
			intel_pstate | intel_cpufreq) # active or passive
				# MSR 0xce; bits 15:8 are used as the non-turbo ratio
				# (presumably MSR_PLATFORM_INFO - confirm).
				non_turbo_ratio=$("$testdir/rdmsr.pl" "$cpu_idx" 0xce)
				cpuinfo_min_freqs[cpu_idx]=$(< "$cpu/cpufreq/cpuinfo_min_freq")
				cpuinfo_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/cpuinfo_max_freq")
				cpufreq_non_turbo_ratio[cpu_idx]=$(((non_turbo_ratio >> 8) & 0xff))
				if ((cpufreq_base_freqs[cpu_idx] / 100000 > cpufreq_non_turbo_ratio[cpu_idx])); then
					cpufreq_high_prio[cpu_idx]=1
					base_max_freq=${cpufreq_base_freqs[cpu_idx]}
				else
					cpufreq_high_prio[cpu_idx]=0
					base_max_freq=$((cpufreq_non_turbo_ratio[cpu_idx] * 100000))
				fi
				num_freqs=$(((base_max_freq - cpuinfo_min_freqs[cpu_idx]) / 100000 + 1))
				if ((base_max_freq < cpuinfo_max_freqs[cpu_idx])); then
					((num_freqs += 1))
					cpufreq_is_turbo[cpu_idx]=1
				else
					cpufreq_is_turbo[cpu_idx]=0
				fi
				# Synthesize the frequency table in steps of 100000, highest
				# first; with turbo, slot 0 is base_max_freq + 1.
				available_freqs=()
				for ((freq = 0; freq < num_freqs; freq++)); do
					if ((freq == 0 && cpufreq_is_turbo[cpu_idx] == 1)); then
						available_freqs[freq]=$((base_max_freq + 1))
					else
						available_freqs[freq]=$((base_max_freq - (freq - cpufreq_is_turbo[cpu_idx]) * 100000))
					fi
				done
				;;
			cppc_cpufreq)
				# NOTE(review): scaling_min_freqs, scaling_max_freqs,
				# nominal_perf and highest_perf are not declared anywhere in
				# this function, so they end up as globals. Left as is, since
				# callers outside this file may read them - confirm.
				cpufreq_setspeed[cpu_idx]=$(< "$cpu/cpufreq/scaling_setspeed")
				scaling_min_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_min_freq")
				scaling_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_max_freq")
				cpuinfo_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/cpuinfo_max_freq")
				nominal_perf[cpu_idx]=$(< "$cpu/acpi_cppc/nominal_perf")
				highest_perf[cpu_idx]=$(< "$cpu/acpi_cppc/highest_perf")

				# The unit of highest_perf and nominal_perf differs between
				# arm platforms. For highest_perf, it may be 300 or 3000000,
				# both meaning 3.0GHz.
				if ((highest_perf[cpu_idx] > nominal_perf[cpu_idx] && (\
					highest_perf[cpu_idx] == cpuinfo_max_freqs[cpu_idx] || \
					highest_perf[cpu_idx] * 10000 == cpuinfo_max_freqs[cpu_idx]))); then
					cpufreq_is_turbo[cpu_idx]=1
				else
					cpufreq_is_turbo[cpu_idx]=0
				fi

				# Normalize the short form to the same unit as the long one.
				if ((nominal_perf[cpu_idx] < 10000)); then
					nominal_perf[cpu_idx]=$((nominal_perf[cpu_idx] * 10000))
				fi

				num_freqs=$(((nominal_perf[cpu_idx] - scaling_min_freqs[cpu_idx]) / 100000 + 1 + \
					cpufreq_is_turbo[cpu_idx]))

				available_freqs=()
				for ((freq = 0; freq < num_freqs; freq++)); do
					if ((freq == 0 && cpufreq_is_turbo[cpu_idx] == 1)); then
						available_freqs[freq]=$((scaling_max_freqs[cpu_idx]))
					else
						available_freqs[freq]=$((nominal_perf[cpu_idx] - (\
							freq - cpufreq_is_turbo[cpu_idx]) * 100000))
					fi
				done
				;;
		esac
	done
	if [[ -e $sysfs_cpu/cpufreq/boost ]]; then
		turbo_enabled=$(< "$sysfs_cpu/cpufreq/boost")
	elif [[ -e $sysfs_cpu/intel_pstate/no_turbo ]]; then
		turbo_enabled=$((!$(< "$sysfs_cpu/intel_pstate/no_turbo")))
	fi
}
372
# Change the frequency setup of a cpu through its cpufreq sysfs attributes.
#  $1 - cpu id
#  $2 - minimum frequency (mandatory)
#  $3 - maximum frequency (optional, only used by the intel drivers)
# Returns 1 if the cpu has no entry in cpufreq_drivers[] (i.e. map_cpufreq()
# did not record it) or if no minimum frequency was given.
set_cpufreq() {
	local cpu=$1 min_freq=$2 max_freq=$3
	local cpufreq=$sysfs_cpu/cpu$cpu/cpufreq
	local driver=${cpufreq_drivers[cpu]}

	if [[ -z $driver || -z $min_freq ]]; then
		return 1
	fi

	if [[ $driver == acpi-cpufreq || $driver == cppc_cpufreq ]]; then
		# These drivers take the frequency via scaling_setspeed, which is
		# written only after making sure the userspace governor is selected.
		[[ $(< "$cpufreq/scaling_governor") == userspace ]] \
			|| echo "userspace" > "$cpufreq/scaling_governor"
		echo "$min_freq" > "$cpufreq/scaling_setspeed"
	elif [[ $driver == intel_pstate || $driver == intel_cpufreq ]]; then
		# Max goes first, and only when it does not fall below the new min.
		if [[ -n $max_freq ]] && ((max_freq >= min_freq)); then
			echo "$max_freq" > "$cpufreq/scaling_max_freq"
		fi
		# Compared against the max recorded by map_cpufreq(), not the file.
		if ((min_freq <= cpufreq_max_freqs[cpu])); then
			echo "$min_freq" > "$cpufreq/scaling_min_freq"
		fi
	fi
}
400
# Select a cpufreq governor for a cpu; the attribute is written only when
# it currently reads something else.
#  $1 - cpu id
#  $2 - governor name
set_cpufreq_governor() {
	local cpu=$1 governor=$2
	local governor_attr=$sysfs_cpu/cpu$cpu/cpufreq/scaling_governor

	[[ $(< "$governor_attr") == "$governor" ]] \
		|| echo "$governor" > "$governor_attr"
}
410
# Start the given command line in the background with --wait-for-rpc
# appended, select the dynamic scheduler over RPC and let the app finish
# its initialization.
#  $@ - command and its arguments
# A process still visible under /proc/$spdk_pid is killed first. The pid of
# the new process is stored in the global spdk_pid.
exec_under_dynamic_scheduler() {
	[[ ! -e /proc/$spdk_pid/status ]] || killprocess "$spdk_pid"

	"$@" --wait-for-rpc &
	spdk_pid=$!

	# Give some time for the app to init itself
	waitforlisten "$spdk_pid"

	"$rootdir/scripts/rpc.py" framework_set_scheduler dynamic
	"$rootdir/scripts/rpc.py" framework_start_init
}
422
# Start the given command line in the background with --wait-for-rpc
# appended and wait until it listens for RPCs. No scheduler is selected and
# initialization is not resumed here.
#  $@ - command and its arguments
# A process still visible under /proc/$spdk_pid is killed first. The pid of
# the new process is stored in the global spdk_pid.
exec_under_static_scheduler() {
	[[ ! -e /proc/$spdk_pid/status ]] || killprocess "$spdk_pid"

	"$@" --wait-for-rpc &
	spdk_pid=$!

	# Give some time for the app to init itself
	waitforlisten "$spdk_pid"
}
432
# Gather busy/idle stats since this function was last called.
# For every thread in thread_map[], busy[] and idle[] receive the difference
# between the totals fetched now and the totals remembered in past_busy[] /
# past_idle[], which are then updated to the new totals.
get_thread_stats_current() {
	xtrace_disable

	local total_busy total_idle
	# Keep the loop variable to ourselves; without this the loop would
	# overwrite whatever "thread" variable the caller happens to use.
	local thread

	_get_thread_stats total_busy total_idle

	for thread in "${!thread_map[@]}"; do
		: $((busy[thread] = total_busy[thread] - past_busy[thread], past_busy[thread] = total_busy[thread]))
		: $((idle[thread] = total_idle[thread] - past_idle[thread], past_idle[thread] = total_idle[thread]))
	done
	xtrace_restore
}
447
# Gather busy/idle stats since application start.
# The totals reported by _get_thread_stats() go straight into the global
# busy[] and idle[] arrays; xtrace is silenced for the duration.
get_thread_stats() {
	local -a stat_lists=(busy idle)

	xtrace_disable
	_get_thread_stats "${stat_lists[@]}"
	xtrace_restore
}
454
# Fetch per-thread stats via the thread_get_stats RPC.
#  $1 - name of the array receiving each thread's "busy" value
#  $2 - name of the array receiving each thread's "idle" value
# Both arrays are indexed by thread id and resolved in the caller's scope.
# The global thread_map[] is updated with each thread's name as well.
_get_thread_stats() {
	# Namerefs let us assign to the caller's arrays directly, without
	# assembling the assignments as strings for eval.
	local -n _stats_busy=$1
	local -n _stats_idle=$2
	local thread threads stats

	stats=$(rpc_cmd thread_get_stats | jq -r '.threads[]')
	threads=($(jq -r '.id' <<< "$stats"))

	for thread in "${threads[@]}"; do
		_stats_busy[thread]=$(jq -r "select(.id == $thread) | .busy" <<< "$stats")
		_stats_idle[thread]=$(jq -r "select(.id == $thread) | .idle" <<< "$stats")
		thread_map[thread]=$(jq -r "select(.id == $thread) | .name" <<< "$stats")
	done
}
469
# Load the per-cpu lines of /proc/stat into global arrays _cpuN and
# optionally print the counters of one cpu.
#  $1 - cpu id; anything without a matching _cpuN array (e.g. "all") only
#       refreshes the arrays and returns 0
#  $2 - "idle" prints just field 3; any other value prints every field,
#       one per line
get_cpu_stat() {
	local cpu_idx=$1 stat=$2

	# cpu0 0 0 0 0 0 0 0 0 0 -> _cpu0=(0 0 0 0 0 0 0 0 0)
	source <(sed -n 's/^\(cpu[0-9][^ ]*\) \(.*\)/_\1=(\2)/p' /proc/stat)

	# If we were called with valid cpu id return requested time
	if [[ ! -v _cpu$cpu_idx ]]; then
		return 0
	fi
	local -n cpu_stat=_cpu$cpu_idx

	if [[ $stat == idle ]]; then
		echo "${cpu_stat[3]}"
	else
		printf '%u\n' "${cpu_stat[@]}"
	fi
}
486
# Call the scheduler_thread_create RPC through the plugin named by $plugin.
#  $@ - passed to the RPC untouched
create_thread() {
	local -a rpc_args=(--plugin "$plugin" scheduler_thread_create "$@")

	rpc_cmd "${rpc_args[@]}"
}
490
# Call the scheduler_thread_delete RPC through the plugin named by $plugin.
#  $@ - passed to the RPC untouched
destroy_thread() {
	local -a rpc_args=(--plugin "$plugin" scheduler_thread_delete "$@")

	rpc_cmd "${rpc_args[@]}"
}
494
# Call the scheduler_thread_set_active RPC through the plugin named by
# $plugin.
#  $@ - passed to the RPC untouched
active_thread() {
	local -a rpc_args=(--plugin "$plugin" scheduler_thread_set_active "$@")

	rpc_cmd "${rpc_args[@]}"
}
498
# Sample the per-cpu counters of /proc/stat (through get_cpu_stat) and turn
# them into per-sample percentages for the given cpus.
#  $1   - number of samples to collect; <= 0 keeps sampling forever
#  $2   - name of the time to report, a key of cpu_time_map (default: idle)
#  $3   - 1 prints a header + one line per cpu for every sample (default: 0)
#  $4   - seconds to sleep after each sampling pass (default: 1)
#  $5.. - cpu ids to sample
# Results exposed to the caller:
#  cpu_times[cpu]    - space-separated percentages of the requested time,
#                      one per sample
#  avg_cpu_time[cpu] - integer average of those percentages
#  cpu_time_map      - name -> field index of the counters
# In addition the helper arrays avg_load_N, raw_samples_N, stat_S_N and
# raw_samples_S_N (N - cpu, S - field index) are left behind as globals;
# print_cpu_time() and cpu_usage_clk_tck() read them.
# NOTE(review): "shift 4" fails, leaving the positional parameters in place,
# when fewer than four arguments are given, so the cpu list effectively
# requires all four leading arguments - confirm that all callers pass them.
get_cpu_time() {
	xtrace_disable

	local interval=$1 cpu_time=${2:-idle} print=${3:-0} wait=${4:-1} interval_count
	shift 4
	local cpus=("$@") cpu
	local stats stat old_stats avg_load
	local total_sample
	local keep_going=0

	# Exposed for the caller
	local -g cpu_times=()
	local -g avg_cpu_time=()

	# cpu_time:
	# 0 - user (time spent in user mode)
	# 1 - nice (Time spent in user mode with low priority)
	# 2 - system (Time spent in system mode)
	# 3 - idle (Time spent in the idle task)
	# 4 - iowait (Time waiting for I/O to complete)
	# 5 - irq (Time servicing interrupts)
	# 6 - softirq (Time servicing softirqs)
	# 7 - steal (Stolen time)
	# 8 - guest (Time spent running a virtual CPU)
	# 9 - guest_nice (Time spent running a niced guest)

	local -gA cpu_time_map
	cpu_time_map["user"]=0
	cpu_time_map["nice"]=1
	cpu_time_map["system"]=2
	cpu_time_map["idle"]=3
	cpu_time_map["iowait"]=4
	cpu_time_map["irq"]=5
	cpu_time_map["softirq"]=6
	cpu_time_map["steal"]=7
	cpu_time_map["guest"]=8
	cpu_time_map["guest_nice"]=9

	# Clear up the env
	# NOTE(review): the arrays filled below are named old_stats_N, which the
	# "old_stat_" prefix does not match. They get overwritten with the first
	# sample anyway, so this looks harmless - confirm.
	unset -v ${!stat_@}
	unset -v ${!old_stat_@}
	unset -v ${!avg_stat@}
	unset -v ${!avg_load@}
	unset -v ${!raw_samples@}

	# From here on cpu_time is the field index, not the name.
	cpu_time=${cpu_time_map["$cpu_time"]}
	interval_count=0
	if ((interval <= 0)); then
		keep_going=1
	else
		# We skip first sample to have min 2 for stat comparison
		interval=$((interval + 1))
	fi
	while ((interval_count++, keep_going ? 1 : --interval >= 0)); do
		((interval_count > 1 && print == 1)) && print_cpu_time_header
		# "all" matches no cpu, so this merely refreshes the _cpuN arrays.
		get_cpu_stat all
		for cpu in "${cpus[@]}"; do
			# Re-point the namerefs at the arrays of the current cpu.
			local -n old_stats=old_stats_$cpu
			local -n avg_load=avg_load_$cpu
			local -n raw_samples=raw_samples_$cpu
			local -n stats=_cpu$cpu
			# NOTE(review): sample_stats is never declared local, so it
			# remains visible to the caller after the function returns.
			sample_stats=() total_sample=0

			if ((interval_count == 1)); then
				# Skip first sample
				old_stats=("${stats[@]}")
				continue
			fi
			# Delta of every counter since the previous sample, plus the
			# total of all deltas to normalize against.
			for stat in "${!stats[@]}"; do
				avg_load[stat]="stat_${stat}_${cpu}[@]"
				sample_stats[stat]=$((stats[stat] - old_stats[stat]))
				: $((total_sample += sample_stats[stat]))
			done
			# Record the raw counter and its share (in %) of this sample.
			for stat in "${!stats[@]}"; do
				local -n avg_stat=stat_${stat}_${cpu}
				local -n raw_samples_ref=raw_samples_${stat}_${cpu}
				raw_samples[stat]="raw_samples_${stat}_${cpu}[@]"
				raw_samples_ref+=("${stats[stat]}")
				avg_stat+=($((sample_stats[stat] * 100 / (total_sample == 0 ? 1 : total_sample))))
			done
			old_stats=("${stats[@]}")
			((print == 1)) && print_cpu_time "$cpu"
		done
		sleep "${wait}s"
	done

	# We collected % for each time. Now determine the avg % for requested time.
	# NOTE(review): a cpu without any collected sample ends up dividing by
	# zero below.
	local load stat_load
	for cpu in "${cpus[@]}"; do
		load=0
		local -n avg_load_cpu=avg_load_$cpu
		# avg_load_N[field] holds a "name[@]" string, expand it indirectly.
		stat_load=("${!avg_load_cpu[cpu_time]}")
		for stat in "${stat_load[@]}"; do
			: $((load += stat))
		done
		cpu_times[cpu]=${stat_load[*]}
		avg_cpu_time[cpu]=$((load / ${#stat_load[@]}))
	done

	xtrace_restore
}
600
# Print the column header for the lines produced by print_cpu_time(),
# prefixed with the current time and suffixed with $TEST_TAG (N/A if unset).
print_cpu_time_header() {
	local -a columns=(
		"CPU" "%usr" "%nice" "%sys" "%iowait" "%irq" "%soft" "%steal"
		"%guest" "%gnice" "%idle"
	)
	local now

	now=$(date "+%R:%S %Z")

	printf '(%s) %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s %8s (test:%s)\n' \
		"$now" "${columns[@]}" "${TEST_TAG:-N/A}"
}
610
# Print the most recent percentages collected by get_cpu_time() for a cpu
# as a single line matching print_cpu_time_header().
#  $1 - cpu id
# Prints nothing and returns 0 if avg_load_<cpu> holds no data.
print_cpu_time() {
	local cpu=$1
	# These have to stay local: the names are generic and "idle" in
	# particular is also the global array filled by get_thread_stats(),
	# which printing must not overwrite.
	local usr nice system idle iowait irq soft steal guest gnice

	local -n _cpu_ref=avg_load_$cpu
	((${#_cpu_ref[@]} > 0)) || return 0

	# Each entry of avg_load_<cpu> is a "name[@]" string, expanded
	# indirectly into the list of samples for that time.
	usr=("${!_cpu_ref[0]}")
	nice=("${!_cpu_ref[1]}")
	system=("${!_cpu_ref[2]}")
	idle=("${!_cpu_ref[3]}")
	iowait=("${!_cpu_ref[4]}")
	irq=("${!_cpu_ref[5]}")
	soft=("${!_cpu_ref[6]}")
	steal=("${!_cpu_ref[7]}")
	guest=("${!_cpu_ref[8]}")
	gnice=("${!_cpu_ref[9]}")

	# Only the latest sample is printed; idle goes last, as in the header.
	printf '%23u %8u %8u %8u %8u %8u %8u %8u %8u %8u %8u\n' \
		"$cpu" \
		"${usr[-1]}" \
		"${nice[-1]}" \
		"${system[-1]}" \
		"${iowait[-1]}" \
		"${irq[-1]}" \
		"${soft[-1]}" \
		"${steal[-1]}" \
		"${guest[-1]}" \
		"${gnice[-1]}" \
		"${idle[-1]}"
}
641
# Sample the cpus listed in cpus_to_collect[] and decide, per cpu, whether
# it counts as idle. The verdicts are stored in the global is_idle[]
# (1 - idle, 0 - busy), indexed by cpu id.
#  $1 - sampling time passed on to get_cpu_time() (default: 5)
# Returns 1 right away if cpus_to_collect[] is empty.
collect_cpu_idle() {
	if ((${#cpus_to_collect[@]} == 0)); then
		return 1
	fi

	local time=${1:-5}
	local cpu samples
	local user_load load_median user_spdk_load
	local -g is_idle=()

	printf 'Collecting cpu idle stats (cpus: %s) for %u seconds...\n' \
		"${cpus_to_collect[*]}" "$time"

	get_cpu_time "$time" idle 0 1 "${cpus_to_collect[@]}"

	for cpu in "${cpus_to_collect[@]}"; do
		# cpu_times[] keeps the samples as one space-separated string.
		samples=(${cpu_times[cpu]})
		load_median=$(calc_median "${samples[@]}")
		printf '* cpu%u idle samples: %s (avg: %u%%, median: %u%%)\n' \
			"$cpu" "${samples[*]}" "${avg_cpu_time[cpu]}" "$load_median"

		# Cores with polling reactors have 0% idle time,
		# while the ones in interrupt mode won't have 100% idle.
		# During the tests, polling reactors spend the major portion
		# of their cpu time in user mode. With that in mind, if the
		# general check for cpus's idleness fails, check what portion
		# of the cpu load falls into user mode. For the idle check
		# use the last sample. For the cpu load, compare user's raw
		# samples in SC_CLK_TCK context for a more detailed view.
		user_load=$(cpu_usage_clk_tck "$cpu" user)

		if ((samples[-1] >= 70)); then
			printf '* cpu%u is idle\n' "$cpu"
			is_idle[cpu]=1
			continue
		fi

		if ((user_load <= 15)); then
			printf '* cpu%u not fully idle, but user load is low so passing\n' "$cpu"
			is_idle[cpu]=1
			continue
		fi

		printf '* cpu%u is not idle\n' "$cpu"
		is_idle[cpu]=0
		# HACK: Since we verify this in context of business of particular SPDK threads, make
		# the last check against their {u,s}time to determine if we are really busy or not. This
		# is meant to null and void potential jitter on the cpu.
		# See https://github.com/spdk/spdk/issues/3362.
		user_spdk_load=$(get_spdk_proc_time "$time" "$cpu")
		if ((user_spdk_load <= 15)); then
			printf '* SPDK thread pinned to cpu%u seems to be idle regardless (%u%%)\n' \
				"$cpu" \
				"$user_spdk_load"
			is_idle[cpu]=1
		fi
	done
}
693
# Print, without a trailing newline, how busy a cpu was between the last two
# samples taken by get_cpu_time(): the tick delta of the selected time(s) as
# a percentage of CLK_TCK, capped at 100.
#  $1 - cpu id
#  $2 - user, nice, system or all (default: all); anything else counts no
#       ticks at all
# A summary and the raw user/nice/system samples are written to stderr.
# Returns 1 if raw_samples_<cpu> does not exist.
cpu_usage_clk_tck() {
	local cpu=$1 time=${2:-all}
	local user nice system usage clk_delta

	# We should be called in get_cpu_time()'s environment.
	if [[ ! -v raw_samples_$cpu ]]; then
		return 1
	fi

	local -n raw_samples=raw_samples_$cpu
	user=("${!raw_samples[cpu_time_map["user"]]}")
	nice=("${!raw_samples[cpu_time_map["nice"]]}")
	system=("${!raw_samples[cpu_time_map["system"]]}")

	# Construct delta based on last two samples of a given time.
	if [[ $time == user || $time == all ]]; then
		clk_delta=$((clk_delta + (user[-1] - user[-2])))
	fi
	if [[ $time == nice || $time == all ]]; then
		clk_delta=$((clk_delta + (nice[-1] - nice[-2])))
	fi
	if [[ $time == system || $time == all ]]; then
		clk_delta=$((clk_delta + (system[-1] - system[-2])))
	fi

	# We assume 1s between each sample. See get_cpu_time().
	usage=$((100 * clk_delta / $(getconf CLK_TCK)))
	if ((usage > 100)); then
		usage=100
	fi

	printf '%u' "$usage"
	printf '* cpu%u %s usage: %u\n' "$cpu" "$time" "$usage" >&2
	printf '* cpu%u user samples: %s\n' "$cpu" "${user[*]}" >&2
	printf '* cpu%u nice samples: %s\n' "$cpu" "${nice[*]}" >&2
	printf '* cpu%u system samples: %s\n' "$cpu" "${system[*]}" >&2
}
723
# Rebuild the global thread_cpus[] map (thread id -> cpu) from the
# framework_get_reactors RPC, printing one line per thread found.
# Only the cpus listed in cpus[] are looked at; thread names come from
# thread_map[], refreshed via get_thread_stats().
# Returns 1 if cpus[] is empty or if no thread was found on any of the cpus.
update_thread_cpus_map() {
	# "thread" is kept local so the loop below does not overwrite a variable
	# of the same name belonging to the caller.
	local cpu thread
	local -g thread_cpus=()
	local reactor_framework

	((${#cpus[@]} > 0)) || return 1

	get_thread_stats

	reactor_framework=$(rpc_cmd framework_get_reactors | jq -r '.reactors[]')
	for cpu in "${cpus[@]}"; do
		for thread in $(jq -r "select(.lcore == $cpu) | .lw_threads[].id" <<< "$reactor_framework"); do
			printf '* Thread %u (%s) on cpu%u\n' "$thread" "${thread_map[thread]}" "$cpu"
			thread_cpus[thread]=$cpu
		done
	done
	((${#thread_cpus[@]} > 0))
}
742
# Print the median of the given integers. For an even number of values the
# two middle ones are averaged using integer division.
calc_median() {
	local sorted count half median

	sorted=($(printf '%s\n' "$@" | sort -n))
	count=${#sorted[@]}
	half=$((count / 2))

	if ((count % 2 == 1)); then
		median=${sorted[half]}
	else
		median=$(((sorted[half - 1] + sorted[half]) / 2))
	fi

	echo "$median"
}
759
# Print the median cpu time (utime + stime delta between consecutive
# samples, taken 1s apart) of the SPDK thread pinned to the given cpu.
#  $1 - number of samples to take; values <= 1 are bumped to 2
#  $2 - cpu id the thread is expected to be bound to
# The per-sample stime/utime deltas are written to stderr.
# Returns 1 if the process given by $spdk_pid is gone or no stat file is
# found for a thread bound solely to that cpu.
get_spdk_proc_time() {
	# Similar to cpu_usage_clk_tck() but the values we are working here, per process, are already
	# divided by SC_CLK_TCK. See proc(5).

	xtrace_disable

	local interval=$1 cpu=$2
	local thread thread_to_time stats
	local _time time _stime stime _utime utime
	local thread_cpu_list

	# Every early exit has to undo xtrace_disable as well.
	if [[ ! -e /proc/$spdk_pid/status ]]; then
		xtrace_restore
		return 1
	fi

	# Find SPDK thread pinned to given cpu
	for thread in "/proc/$spdk_pid/task/"*; do
		thread_cpu_list=($(get_proc_cpu_affinity "$thread/status"))
		# we aim at reactor threads and these should be bound to a single cpu
		((${#thread_cpu_list[@]} > 1)) && continue
		((thread_cpu_list[0] == cpu)) && thread_to_time=$thread && break
	done

	if [[ ! -e $thread_to_time/stat ]]; then
		xtrace_restore
		return 1
	fi
	interval=$((interval <= 1 ? 2 : interval))

	while ((--interval >= 0)); do
		# Wait before every sample but the first one, so that each delta
		# spans a full second. Sleeping only after a delta was computed left
		# the first two samples back-to-back and the first delta at ~0.
		if ((${#_time[@]} > 0)); then
			sleep 1
		fi
		# See cgroups.sh -> id_proc()
		stats=$(< "$thread_to_time/stat") stats=(${stats/*) /})
		_utime[interval]=${stats[11]} # Amount of time spent in user mode
		_stime[interval]=${stats[12]} # Amount of time spent in kernel mode
		_time[interval]=$((_utime[interval] + _stime[interval]))
		# Samples are stored at descending indexes, so the previous one
		# lives at interval + 1. Nothing to compare the first sample with.
		((${#_time[@]} == 1)) && continue
		utime+=($((_utime[interval] - _utime[interval + 1])))
		stime+=($((_stime[interval] - _stime[interval + 1])))
		time+=($((_time[interval] - _time[interval + 1])))
	done

	echo "stime samples: ${stime[*]}" >&2
	echo "utime samples: ${utime[*]}" >&2

	calc_median "${time[@]}"

	xtrace_restore
}
804