xref: /spdk/scripts/perf/pm/collect-cpu-temp (revision 83ba9086796471697a4975a58f60e2392bccd08c)
1#!/usr/bin/env bash
2#  SPDX-License-Identifier: BSD-3-Clause
3#  Copyright (C) 2023 Intel Corporation
4#  All rights reserved.
5
# Do a similar thing to what turbostat.c does, but in a more compact
# scope. We just report the temperature as per coretemp's hwmon entries
# for each core|package and check if there is any indication that
# throttling took place (per cpu thread).
# Abort on the first unhandled error. extglob enables the +(...) patterns
# used further down, nullglob makes globs without a match expand to nothing.
set -o errexit
shopt -s extglob
shopt -s nullglob

# Resolve the directory holding this script (pmdir) and the directory three
# levels above it (rootdir), then pull in the helpers kept there.
pmdir=$(readlink -f "$(dirname "$0")")
rootdir=$(readlink -f "$pmdir/../../../")
source "$rootdir/test/scheduler/common.sh"
source "$pmdir/common"
17
# Scale a raw temperature reading ($1) down by 1000 (integer division) and
# print the result, i.e. whole degrees Celsius, on stdout.
to_celsius() {
	printf '%s\n' "$(($1 / 1000))"
}
19
# Walk all the hwmon devices and report their temperature readings.
# Devices named "coretemp" get the detailed per-core report, everything
# else goes through the generic, single-reading report.
report_hwmon() {
	local hwmon_dir sensor_name

	for hwmon_dir in /sys/class/hwmon/hwmon*; do
		if [[ ! -e $hwmon_dir/name ]]; then
			continue
		fi
		sensor_name=$(< "$hwmon_dir/name")
		# A device that fails to report is simply skipped.
		if [[ $sensor_name == coretemp ]]; then
			report_hwmon_coretemp "$hwmon_dir" || continue
		else
			report_hwmon_generic "$hwmon_dir" 2> /dev/null || continue
		fi
	done
}
32
# Report the temp1 reading of a single, non-coretemp hwmon device.
#
# Arguments: $1 - path to the hwmon device directory
# Globals:   TEST_TAG (read)
# Outputs:   "(<timestamp>) --- <label> (<temp> C) (test:<tag>)" followed by
#            a "---" separator line. The label is built from the device's
#            name, its temp1_label (if any) and the basename of the resolved
#            device link (if any). Nothing is printed when temp1_input is
#            missing or the entry cannot be read.
# Returns:   0, also when there was nothing to report.
report_hwmon_generic() {
	local hwmon=$1 temp ts
	local label dev

	# We report just the bare minimum as each device may come with
	# different set of labels, inputs, etc.
	[[ -e $hwmon/temp1_input ]] || return 0

	# These entries, especially the ones dedicated for nvme, may disappear during
	# tests, so try to handle them gracefully.
	temp=$(< "$hwmon/temp1_input") && label=$(< "$hwmon/name") || return 0

	if [[ -e $hwmon/temp1_label ]]; then
		label+=":$(< "$hwmon/temp1_label")"
	fi

	if [[ -e $hwmon/device ]] && dev=$(readlink -f "$hwmon/device"); then
		label+=":${dev##*/}"
	fi

	# Grab the timestamp only when there is actually something to report.
	ts=$(dater)

	# The reading is signed - sensors may legitimately report values below
	# zero, so it must not be formatted as an unsigned integer.
	printf '(%s) --- %s (%d C) (test:%s)\n' \
		"$ts" \
		"$label" \
		"$(to_celsius "$temp")" \
		"$TEST_TAG"
	printf '%s\n' "---"
}
63
# Report per-package and per-core readings of a single coretemp hwmon device.
#
# Arguments: $1 - path to the coretemp hwmon device directory
# Globals:   TEST_TAG (read)
# Outputs:   a "Node" line with the package temperature, one line per core
#            (listing the cpu threads returned by get_cpus together with the
#            input, crit and max readings) and a "---" separator line.
# Returns:   1 when no "Package" label was found.
report_hwmon_coretemp() {
	local hwmon=$1 ts
	local label_file sensor label id
	local input crit max
	local node core threads
	local package=()
	local cores_input=() cores_crit=() cores_max=()

	ts=$(dater)

	for label_file in "$hwmon/"temp+([0-9])_label; do
		# Read the input, crit and max entries sharing the label's tempN prefix.
		sensor=${label_file%_label}
		label=$(< "$label_file")
		input=$(< "${sensor}_input")
		crit=$(< "${sensor}_crit")
		max=$(< "${sensor}_max")
		# The id is whatever follows the last space of the label.
		id=${label##* }
		if [[ $label == Core* ]]; then
			cores_input[id]=$input
			cores_crit[id]=$crit
			cores_max[id]=$max
		elif [[ $label == Package* ]]; then
			node=$id
			package[node]=$input
		fi
	done

	# FIXME: This is a cheeky assumption that each physical package (socket) N maps to
	# a corresponding numa node N. For physical systems this may be usually the case
	# but for quirky vNUMA setups not necessarily. For similar systems (don't have
	# any at hand and it's a bit tricky to test thermal stuff under VMs) this probably
	# would need to drop the lookup of the "Package" label and just check each thread's
	# physical_package_id and/or numa assignment (cpu_node_map[@]).

	# No package id? Something is amiss.
	if [[ -z $node ]]; then
		return 1
	fi

	printf '(%s) --- Node%u (%u C) (test:%s)\n' \
		"$ts" \
		"$node" \
		"$(to_celsius "${package[node]}")" \
		"$TEST_TAG"

	for core in "${!cores_input[@]}"; do
		# Word splitting is intended, get_cpus output becomes a list of threads.
		threads=($(get_cpus "$node" "$core"))
		printf '  (%s) Core%u (%s): %u C (crit: %u C, max: %u C)\n' \
			"$ts" \
			"$core" \
			"${threads[*]}" \
			"$(to_celsius "${cores_input[core]}")" \
			"$(to_celsius "${cores_crit[core]}")" \
			"$(to_celsius "${cores_max[core]}")"
	done
	printf '%s\n' "---"
}
114
# Report cpus which were thermally throttled, grouped by node.
#
# Globals:   sysfs_cpu, nodes, debug (all read)
# Outputs:   a status line for each node that has at least one cpu with
#            a non-zero throttle count or time (or for every node when
#            debug == 1), followed by a line for each cpu of that node with
#            a non-zero throttle count.
# Returns:   1 when cpu0 has no thermal_throttle/core_throttle_count entry.
report_throttling() {
	# Quick check to see if the support is there at all - if cpu0 is missing
	# this entry, then there's no point in looking up the entire topology.
	if [[ ! -e $sysfs_cpu/cpu0/thermal_throttle/core_throttle_count ]]; then
		return 1
	fi

	local ts node cpu
	local node_cpus=() node_throttled
	local hits total_ms thermal
	local cpu_throttling=() cpu_throttling_time=()
	local throttler=()

	throttler[0]="Normal Operation"
	throttler[1]="Throttled"

	ts=$(dater)

	# Order the output similarly to hwmon, starting with a node|package
	for node in "${nodes[@]}"; do
		node_cpus=($(get_cpus "$node"))
		node_throttled=0
		cpu_throttling=()

		for cpu in "${node_cpus[@]}"; do
			thermal=$sysfs_cpu/cpu$cpu/thermal_throttle
			hits=$(< "$thermal/core_throttle_count")
			total_ms=$(< "$thermal/core_throttle_total_time_ms")
			cpu_throttling[cpu]=$hits
			# Keep the time in seconds
			cpu_throttling_time[cpu]=$((total_ms / 1000))
			if ((hits > 0 || total_ms > 0)); then
				node_throttled=1
			fi
		done

		# Stay quiet about healthy nodes unless debugging
		if ((node_throttled != 1 && debug != 1)); then
			continue
		fi

		printf '(%s) ###### Throttling Node%u Status: %s ######\n' \
			"$ts" "$node" "${throttler[node_throttled]}"

		for cpu in "${!cpu_throttling[@]}"; do
			if ((cpu_throttling[cpu] > 0)); then
				printf '(%s) CPU%u: %s (count: %u, time: %us)\n' \
					"$ts" "$cpu" \
					"${throttler[cpu_throttling[cpu] > 0 ? 1 : 0]}" \
					"${cpu_throttling[cpu]}" \
					"${cpu_throttling_time[cpu]}"
			fi
		done
	done
}
158
# Set up the global module tables.
#
# Arguments: names of the modules to run; all supported modules are used
#            when none is given.
# Globals:   modules_supported (module name -> reporting function),
#            modules_to_run (list of reporting functions),
#            modules_out_refs (reporting function -> name of the array
#            collecting its output); all three are (re)created here.
# Returns:   1, with a message on stderr, when a module is not supported.
init_modules() {
	local -gA modules_supported=()
	local -gA modules_out_refs=()
	local -ga modules_to_run=()
	local module handler

	modules_supported[hwmon]=report_hwmon
	modules_supported[throttle]=report_throttling

	# Fall back to every supported module if none was requested
	set -- "${@:-"${!modules_supported[@]}"}"

	for module; do
		if [[ -z ${modules_supported["$module"]} ]]; then
			printf 'Module (%s) not supported\n' "$module" >&2
			return 1
		fi
		handler=${modules_supported["$module"]}
		modules_to_run+=("$handler")
		modules_out_refs["$handler"]="_$handler"
	done
}
177
# Main loop - run each selected module, print what it reported and keep it
# for the final dump.
#
# Globals:   count, interval, modules_to_run, modules_out_refs (read); the
#            arrays named by modules_out_refs (appended to).
# Outputs:   non-empty output of each module on stdout; a one-time warning
#            on stderr for each module that failed.
#
# The loop runs count times, or indefinitely when count <= 0, sleeping
# interval seconds after each round.
collect_readings() {
	local _count=$count module data
	local -A _warned=()

	# Defined outside of this file; presumably sets up the cpu topology
	# (nodes[@], get_cpus) the modules depend on.
	map_cpus

	while ((count <= 0 ? 1 : _count--)); do
		for module in "${modules_to_run[@]}"; do
			local -n ref=${modules_out_refs["$module"]}
			# A module which fails (e.g. throttling is not supported on this
			# system) must not take the entire collector down through set -e,
			# as that would stop the remaining modules too. Whatever it
			# managed to print is still kept; complain only once per module.
			if ! data=$("$module") && [[ -z ${_warned["$module"]} ]]; then
				printf 'Module (%s) failed, its readings may be missing\n' "$module" >&2
				_warned["$module"]=1
			fi
			[[ -n $data ]] && ref+=("$data") && echo "$data"
		done
		sleep "${interval}s"
	done
}
192
# Print the current time as "HH:MM:SS <timezone>", used to stamp each reading.
dater() {
	local fmt="+%R:%S %Z"

	date "$fmt"
}
196
# Dump everything that was collected and drop our pid file (rm_pm_pid).
#
# Globals:   modules_out_refs, PM_OUTPUTDIR, prefix (all read)
# Outputs:   for each module which collected something, its readings are
#            written to $PM_OUTPUTDIR/[<prefix>_]<module>.pm.txt and a
#            "Dumped ..." notice is printed on stdout.
cleanup() {
	local module out_file

	for module in "${!modules_out_refs[@]}"; do
		local -n _readings=${modules_out_refs["$module"]}
		# Nothing collected, nothing to dump
		if ((${#_readings[@]} == 0)); then
			continue
		fi
		out_file=$PM_OUTPUTDIR/${prefix:+${prefix}_}$module.pm.txt
		printf '%s\n' "${_readings[@]}" > "$out_file"
		printf 'Dumped %s module to %s\n' \
			"$module" \
			"$out_file"
	done

	rm_pm_pid
}
211
# Print the usage message on stdout. Keep it in sync with the getopts
# string (c:d:hlp:t:v) used to parse the cmdline.
help() {
	cat <<- HELP

		Usage: $0 [-h] [-c count] [-d dir] [-l] [-p prefix] [-t interval] [-v] [module0 module1 ...]

		-h - Print this message.
		-c - Execute module count times. 0 is the default and it means to run
		     indefinitely.
		-d - Directory where the results should be saved. Default is /tmp.
		-l - Save output of the script to a log file (dir/${0##*/}.pm.log).
		-p - Add prefix to saved files.
		-t - How long to wait before executing modules. Default is 1s.
		-v - Report throttling status of each node even if no throttling
		     was detected.

		module - Module to execute. Currently supported: 'hwmon', 'throttle'. All modules are
		         executed by default.

		When started, ${0##*/} will enter loop to continuously execute specified
		modules. Each execution will be logged to stdout. Upon termination, all
		output will be dumped to /tmp or directory set by -d.

	HELP
}
234
# Defaults, see help() for what each of them means
count=0 debug=0 interval=1
log_to_file=no
prefix=""

while getopts "c:d:hlp:t:v" opt; do
	case "$opt" in
		c)
			count=$OPTARG
			;;
		d)
			PM_OUTPUTDIR=$OPTARG
			;;
		h)
			help
			exit 0
			;;
		l)
			log_to_file=yes
			;;
		p)
			prefix=$OPTARG
			;;
		t)
			interval=$OPTARG
			;;
		v)
			debug=1
			;;
		*) ;;
	esac
done
# Whatever is left on the cmdline are the modules to run
shift "$((OPTIND - 1))"

readonly log_file=${prefix:+${prefix}_}${0##*/}.pm.log

mkdir -p "$PM_OUTPUTDIR"
if [[ $log_to_file == yes ]]; then
	printf 'Redirecting to %s\n' "$PM_OUTPUTDIR/$log_file" >&2
	# From now on both stdout and stderr end up in the log file
	exec > "$PM_OUTPUTDIR/$log_file" 2>&1
fi

save_pm_pid
# Dump the readings on exit, no matter how we get there
trap 'cleanup' EXIT
trap 'retag' USR1

init_modules "$@"

collect_readings
273