#!/usr/bin/env bash
#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2022 Intel Corporation
#  All rights reserved.
#
#  Source path: /spdk/scripts/perf/pm/collect-bmc-pm
#  (revision fdea5c6daceeed2a89d8b82f67a9276d73db3526)

# Abort on the first unhandled command failure.
set -e

# Absolute directory of this script; the "common" helper file is expected next to it.
pmdir=$(readlink -f "$(dirname "$0")")
# Three directory levels above the script's own directory.
rootdir=$(readlink -f "$pmdir/../../../")
# NOTE(review): functions used below but not defined in this file (save_pm_pid,
# rm_pm_pid, retag) are presumably provided by "common" - confirm.
source "$pmdir/common"

# Print every argument as a zero-padded, 0x-prefixed hex byte, one per line.
# Arguments: numbers in any notation printf understands (decimal, 0x.., 0..).
# Outputs:   one "0xNN" line per argument; nothing when called without arguments.
hex() {
	# printf applies its format once even when there are no arguments, which
	# would fabricate a "0x00" byte out of an empty input.
	(($# > 0)) || return 0
	printf '0x%02x\n' "$@"
}

# Evaluate an arithmetic expression with bc using a scale of 2.
# Arguments: the expression; all arguments are joined into one string.
# Outputs:   bc's result on stdout.
calc() {
	bc <<< "scale=2; $*"
}

# Check that the script runs as root.
# Outputs: an explanation on stderr when it does not.
# Returns: 0 when UID is 0, 1 otherwise.
is_root() {
	local who=${USER:-}

	# Talking to local BMC device requires root privileges
	if ((UID != 0)); then
		# USER is not guaranteed to be set; fall back so the message
		# never starts with a bare comma.
		[[ -n $who ]] || who=$(id -un 2> /dev/null) || who="UID $UID"
		printf '%s, you need to be root to run this script\n' "$who" >&2
		return 1
	fi
}

# Locate the ipmitool executable in PATH.
# Outputs: the full path of ipmitool on stdout, or a diagnostic on stderr.
# Returns: 0 when found, 1 otherwise.
is_ipmitool() {
	local tool

	# type -P forces a PATH lookup, ignoring aliases and functions.
	if ! tool=$(type -P ipmitool); then
		printf 'ipmitool not detected, cannot run commands against the BMC\n' >&2
		return 1
	fi
	printf '%s\n' "$tool"
}

# Best-effort load of the core IPMI kernel modules.
# Returns: always 0 - the caller checks for the resulting device itself.
ipmi_load() {
	# Silently attempt to load core ipmi drivers - we will pick up the device later on.
	# stderr is discarded as well so that a missing modprobe binary stays quiet too.
	modprobe -qa ipmi_si ipmi_devintf ipmi_msghandler 2> /dev/null || return 0
}

# Verify that the kernel registered a BMC behind a KCS interface and that
# ipmitool is available to talk to it.
# Arguments: $1 - sysfs node of the IPMI interface (default: /sys/class/ipmi/ipmi0)
# Globals:   writes man_id, prod_id, dev_id, ipmi_ver, platform, board, ipmitool
# Outputs:   BMC details or a diagnostic on stderr
# Returns:   0 when a KCS BMC and ipmitool were found, non-zero otherwise
ipmi_supported() {
	# Verify if kernel detected and registered at least one BMC under
	# the ipmi platform. Look for KCS specifically as this the type
	# of the interface the script was tested against.

	local ipmi=${1:-/sys/class/ipmi/ipmi0}
	# Kept local so the probe does not leak a global named "type".
	local iface_type

	# Keep these details global for easy access if needed.
	local -g man_id prod_id dev_id ipmi_ver platform board ipmitool

	ipmi_load

	if [[ ! -e $ipmi ]]; then
		printf 'BMC not detected. Please, make sure your platform is IPMI-compatible\n' >&2
		return 1
	fi

	iface_type=$(< "$ipmi/device/type")
	if [[ $iface_type != kcs ]]; then
		printf 'No supported BMC interface detected (%s) - only KCS is supported\n' "$iface_type" >&2
		return 1
	fi

	man_id=$(< "$ipmi/device/bmc/manufacturer_id")
	prod_id=$(< "$ipmi/device/bmc/product_id")
	dev_id=$(hex "$(< "$ipmi/device/bmc/device_id")")
	ipmi_ver=$(< "$ipmi/device/bmc/ipmi_version")

	# DMI details are optional, "unknown" is reported when they are missing.
	if [[ -e /sys/class/dmi/id/board_vendor ]]; then
		platform=$(< /sys/class/dmi/id/board_vendor)
	fi
	if [[ -e /sys/class/dmi/id/board_name ]]; then
		board=$(< /sys/class/dmi/id/board_name)
	fi

	# Keep output similar to ipmi_si's
	{
		printf '\nBMC detected, details below:\n'
		printf 'Manufacturer ID: %s\n' "$man_id"
		printf 'Product ID: %s\n' "$prod_id"
		printf 'Device ID: %s\n' "$dev_id"
		printf 'IPMI Version: %s\n' "$ipmi_ver"
		printf 'Platform: %s\n' "${platform:-unknown}"
		printf 'Board: %s\n' "${board:-unknown}"
		printf '\n'
	} >&2

	# Verify if we have proper tools to work with. This must stay the last
	# command: its status is the status of the whole check.
	ipmitool=$(is_ipmitool)
}

# Send a raw IPMI request through ipmitool and print the response bytes.
# Arguments: the bytes of the request, passed verbatim to "ipmitool raw"
# Globals:   ipmitool (read) - command used to reach the BMC
# Outputs:   one "0xNN" line per response byte; nothing for an empty response
# Returns:   0 on success, ipmitool's exit status when the request failed
ipmiraw() {
	# For the majority of commands we use raw payload to not depend on specific ipmitool version
	# and the way how it interprets/parses the returned data. This also allows us to inspect the
	# integrity of data more closely to make sure we don't report nonsensical values to the user.

	local rsp

	# Propagate the failure instead of letting the status of the final hex
	# call hide it - callers test this function's status to detect errors.
	rsp=($("$ipmitool" raw "$@" 2> /dev/null)) || return

	# An empty response must stay empty rather than be rendered as a byte.
	((${#rsp[@]} > 0)) || return 0

	# Slap hex prefix to work with proper base
	rsp=("${rsp[@]/#/0x}")

	hex "${rsp[@]}"
}

# Issue a raw DCMI request.
# Arguments: $1 - command byte, $2.. - command specific data bytes
# Outputs:   whatever ipmiraw prints for the response
# Returns:   the status of ipmiraw
dcmiraw() {
	local cmd=$1 data=("${@:2}")

	# The request is sent as: 0x2c, the command, 0xdc, then the data bytes.
	ipmiraw 0x2c "$cmd" 0xdc "${data[@]}"
}

# Decode the averaging time periods from enhanced_power_attr and list them.
# Globals:   enhanced_power_attr (read) - element 4 holds the number of
#              periods, the period bytes start at element 5
#            available_time_periods (written) - "<duration> <unit>" labels,
#              element 0 is always "NOW"
#            available_time_periods_in_seconds (written) - raw period byte,
#              indexed by the period length in seconds
# Outputs:   the list of periods on stderr
print_dcmi_available_time_periods() {
	local time_periods=${enhanced_power_attr[4]}
	local -g available_time_periods=()
	local -g available_time_periods_in_seconds=()
	local first=5 idx raw unit_id duration
	# Indexed by the two top bits of a period byte.
	local -a unit_names=(seconds minutes hours days)
	local -a unit_seconds=(1 60 3600 86400)

	available_time_periods[0]="NOW"

	for ((idx = first; idx < first + time_periods; idx++)); do
		raw=${enhanced_power_attr[idx]}
		# Bits [5:0] carry the duration, bits [7:6] select the unit.
		duration=$((raw & 0x3f))
		unit_id=$(((raw >> 6) & 0x3))
		# A zero duration does not describe a usable period.
		if ((duration == 0)); then
			continue
		fi
		available_time_periods[idx]="$duration ${unit_names[unit_id]}"
		available_time_periods_in_seconds[duration * unit_seconds[unit_id]]=$raw
	done

	{
		printf '\nAvailable averaging time periods to request:\n'
		printf '  - %s\n' "${available_time_periods[@]}"
		printf '\n'
	} >&2
}

# Check whether the BMC offers DCMI Power Management.
# Globals:   enhanced_power_attr (written) - response to the Enhanced System
#              Power Statistics attributes query
# Outputs:   progress and diagnostics on stderr
# Returns:   0 when DCMI power readings can be used, 1 otherwise
dcmi_power_support() {
	# Verify if the BMC conforms to the DCMI spec
	local rsp

	# Table 6-2, Get DCMI Capabilities Command Format
	# A response too short to contain the capability byte is as inconclusive
	# as a failed request.
	if ! rsp=($(dcmiraw 0x1 0x1)) || ((${#rsp[@]} < 6)); then
		printf 'Cannot determine if BMC supports DCMI Power Management capability\n' >&2
		return 1
	fi

	# Table 6-3, DCMI Capabilities Parameters:
	#  - Supported DCMI Capabilities:
	#    - Byte 2 Platform capabilities: [0] Power management
	if (((rsp[5] & 0x1) == 0)); then
		printf 'BMC does not provide DCMI Power Management capability\n' >&2
		return 1
	fi

	# Check if BMC provides Enhanced System Power Statistics attributes - this allows to issue
	# requests for power readings at averaging time period, e.g. from last 5 seconds, 30 minutes,
	# 1 hour and so on. With this we can provide more detailed view on power usage within a
	# specific period of time. Without it, we need to depend only on current reading that should
	# be always available (the "NOW" reading).

	local -g enhanced_power_attr=()

	# Table 6-3, DCMI Capabilities Parameters:
	#  - Enhanced System Power Statistics attributes
	if enhanced_power_attr=($(dcmiraw 0x1 0x5)); then
		print_dcmi_available_time_periods
	fi

	printf 'Using DCMI Power Management\n' >&2
}

# Build the list of power sensors to poll through the SDR interface.
# Globals:   sdr_cache (read) - path of the SDR cache file
#            ipmitool (read) - command used to reach the BMC
#            extra_power_sensors (read) - sensors requested by the user
#            power_sensors (written) - resulting list, without duplicates
# Outputs:   progress and diagnostics on stderr
# Returns:   0 when at least one sensor is available, 1 otherwise
sdr_power_support() {
	# This is a fallback which only some platforms may provide (confirmed PowerEdge and CYP).
	# We are looking for a full, threshold sensor which reports overall power usage in Watts.
	# Different BMCs may have SDRs which describe such sensor(s) differently so this is not
	# 100% reliable. To make sure we pick up a proper sensor we also narrow it down to a
	# specific entity (System Board or Power Supply). Readings from the sensor should be
	# considered as "NOW" readings (without access to min, max readings).

	local -g power_sensors=()
	local sensor entity unit status known
	local -a candidates=("${extra_power_sensors[@]}")

	# Cache SDR to speed up sensor readings
	if [[ ! -f $sdr_cache ]]; then
		printf 'Saving SDR cache at %s\n' "$sdr_cache" >&2
		"$ipmitool" sdr dump "$sdr_cache" > /dev/null
	fi

	while IFS="," read -r sensor _ unit status _ entity _; do
		[[ $unit == Watts && $status == ok ]] || continue
		[[ $entity == "System Board" || $entity == "Power Supply" ]] || continue
		candidates+=("$sensor")
	done < <("$ipmitool" -S "$sdr_cache" -vc sdr list full 2>&1)

	# A sensor named by the user may be discovered as well - keep a single
	# entry so it is not polled and reported twice.
	for sensor in "${candidates[@]}"; do
		for known in "${power_sensors[@]}"; do
			if [[ $known == "$sensor" ]]; then
				continue 2
			fi
		done
		power_sensors+=("$sensor")
	done

	if ((${#power_sensors[@]} == 0)); then
		printf 'Cannot locate power sensors\n' >&2
		return 1
	fi
	printf 'Using SDR (Power sensors: %s)\n' "${power_sensors[*]}" >&2
}

# Select the source of power readings.
# Globals:   include_cpu, interface (read)
#            support (written) - "dcmi", "sdr" or empty when only CPU
#              readings are available
#            cpu_support (written) - 1 when RAPL readings were requested
#              and are available, 0 otherwise
# Outputs:   diagnostics on stderr
# Returns:   0 when at least one source of readings is usable, non-zero otherwise
power_support() {
	local -g support cpu_support=0

	# Never report an interface left over from a previous call.
	support=""

	if ((include_cpu == 1)) && rapl_supported; then
		cpu_support=1
	fi

	if [[ $interface == dcmi || $interface == sdr ]]; then
		# override - the requested interface is recorded only when its probe succeeded
		"${interface}_power_support" || return 1
		support=$interface
		return 0
	fi

	# No explicit request: DCMI has priority, SDR is the fallback.
	if dcmi_power_support; then
		support=dcmi
		return 0
	fi
	if sdr_power_support; then
		support=sdr
		return 0
	fi

	printf 'BMC does not provide Power Management support, cannot gather system-wide power measurements\n' >&2
	if ((cpu_support)); then
		printf 'Only CPU measurements will be provided\n' >&2
		return 0
	fi
	return 1
}

# Take one power reading through DCMI and record it.
# Globals:   interval, TEST_TAG, available_time_periods_in_seconds (read)
#            _DCMI, _DCMI_min, _DCMI_max, _DCMI_avg (appended)
#            power_readings (written) - maps a label to the array holding its samples
# Outputs:   one line per reported value, or an error note, on stderr
# Returns:   0, also when the reading could not be obtained
get_dcmi_now_reading() {
	local rsp reading=0 max min avg ts timeframe mode=01h
	local get_cmd get_avg=0 print period stamp

	# Table 6-16, Get Power Reading Command:
	get_cmd=(0x2 0x1 0x0 0x0)

	# We use System Power Statistics mode to get the "NOW" reading by default. In case
	# interval matches one supported by Enhanced System Power Statistics we use that
	# mode to obtain extra min, max, avg statistics.
	if [[ $interval =~ ^[0-9]+$ ]]; then
		# Force base 10 so that an interval such as "08" is not taken for an
		# invalid octal number when used as a subscript.
		period=${available_time_periods_in_seconds[10#$interval]}
		if [[ -n $period ]]; then
			get_cmd=(0x2 0x2 "$period" 0x0)
			get_avg=1
			mode=02h
		fi
	fi

	# Elements 0-16 are consumed below and element 0 has to echo the 0xdc
	# byte every request is sent with. Anything else is not a usable reading
	# and must not be recorded as 0 Watts.
	if ! rsp=($(dcmiraw "${get_cmd[@]}")) || ((${#rsp[@]} < 17)) || ((rsp[0] != 0xdc)); then
		printf 'DCMI reading: error\n' >&2
		return 0
	fi

	# Note that the BMC timestamp depends on the hwclock setup which we then attempt
	# to represent in UTC.
	ts=$((rsp[12] << 24 | rsp[11] << 16 | rsp[10] << 8 | rsp[9]))
	# This is interpreted differently by different BMCs so for now we make a note of
	# it but don't present it to the user.
	timeframe=$((rsp[16] << 24 | rsp[15] << 16 | rsp[14] << 8 | rsp[13]))
	reading=$((rsp[2] << 8 | rsp[1]))

	if ((get_avg == 1)); then
		min=$((rsp[4] << 8 | rsp[3]))
		max=$((rsp[6] << 8 | rsp[5]))
		avg=$((rsp[8] << 8 | rsp[7]))
		_DCMI_min+=("$min")
		_DCMI_max+=("$max")
		_DCMI_avg+=("$avg")
		power_readings["DCMI_MIN"]="_DCMI_min[@]"
		power_readings["DCMI_MAX"]="_DCMI_max[@]"
		power_readings["DCMI_AVG"]="_DCMI_avg[@]"
	fi
	_DCMI+=("$reading")
	power_readings["DCMI"]="_DCMI[@]"

	# Same timestamp for every line of this reading; fall back to the raw
	# value when it cannot be formatted.
	stamp=$(utc "$ts") || stamp=$ts

	# min, max and avg are left unset in the "NOW" mode and are skipped then.
	for print in min max avg reading; do
		[[ -n ${!print} ]] || continue
		printf '(%s) DCMI %s (mode: %s): %u Watts (interval: %ss, test: %s)\n' \
			"$stamp" \
			"$print" \
			"$mode" \
			"${!print}" \
			"$interval" \
			"$TEST_TAG" >&2
	done
}

# Take one reading from every sensor listed in power_sensors and record it.
# Globals:   power_sensors, sdr_cache, ipmitool, interval, TEST_TAG (read)
#            _sensor<N>_readings (appended) - samples of the N-th sensor
#            power_readings (written) - maps a sensor name to the array holding its samples
# Outputs:   one line per sensor on stderr
# Returns:   1 when no sensors are known, 0 otherwise
get_sdr_now_reading() {
	local sensor reading ts unit

	if ((${#power_sensors[@]} == 0)); then
		printf 'No power sensors were provided\n' >&2
		return 1
	fi

	for sensor in "${!power_sensors[@]}"; do
		ts=$(utc)
		reading="" unit=""
		# ipmitool's own complaints are discarded; an empty or partial
		# answer is caught by the check below.
		IFS="," read -r _ reading unit _ \
			< <("$ipmitool" -c -S "$sdr_cache" sdr get "${power_sensors[sensor]}" 2> /dev/null) || :

		# Only plain numbers are recorded. The value comes from an external
		# tool, hence it is validated and stored through a nameref rather
		# than being passed to eval.
		if [[ $reading =~ ^-?[0-9]+([.][0-9]+)?$ ]]; then
			local -n sensor_samples=_sensor${sensor}_readings
			sensor_samples+=("$reading")
			power_readings["${power_sensors[sensor]}"]="_sensor${sensor}_readings[@]"
			reading+=" $unit"
		else
			reading=error
		fi

		printf '(%s) Sensor %s reading: %s (interval: %ss, test: %s)\n' \
			"$ts" \
			"${power_sensors[sensor]}" \
			"$reading" \
			"$interval" \
			"$TEST_TAG" >&2
	done
}

# Tell whether /sys/class/powercap/intel-rapl exists.
# Returns: 0 when it does, 1 otherwise.
rapl_supported() {
	[[ -e /sys/class/powercap/intel-rapl ]]
}

# Sample the energy counter of every intel-rapl zone and turn the difference
# from the previous sample into Watts.
# Arguments: $1 - powercap sysfs directory (default: /sys/class/powercap)
# Globals:   interval, TEST_TAG (read)
#            socket_<id>_uj (appended) - raw energy_uj samples of a zone
#            _socket<id>_readings (appended) - computed readings of a zone
#            power_readings (written) - maps "<name>-<index>" to the array holding its readings
# Outputs:   one line per computed reading on stderr
get_cpu_socket_reading() {
	local rapl=${1:-/sys/class/powercap}
	local socket socket_idx _socket_idx socket_name
	local ts reading

	# power_uw is usually not available so we need to relay on energy_uj. It's also rarely
	# rw so we can't zero it out, hence we need to keep track of the initial counter. For
	# details see kernel documentation (powercap.rst).
	ts=$(utc)
	for socket in "$rapl"/intel-rapl:*; do
		[[ -e $socket ]] || continue

		# "intel-rapl:0" -> "0", "intel-rapl:0:1" -> "0:1"
		socket_idx=${socket##*/}
		socket_idx=${socket_idx#*:}
		socket_name=$(< "$socket/name")

		# Adjust for different domains, see linux/intel_rapl.h
		case "$socket_name" in
			package-*) _socket_idx=$socket_idx socket_name=socket ;;
			psys*) _socket_idx=$socket_idx socket_name=platform ;;
			# dram, core, uncore and any other name: keep the name, make the
			# index usable in a variable name and report against the parent.
			# Having a default also keeps a value from a previous zone from
			# being reused.
			*) _socket_idx=${socket_idx//:/_} socket_idx=${socket_idx%:*} ;;
		esac

		local -n socket_uj=socket_${_socket_idx}_uj
		socket_uj+=("$(< "$socket/energy_uj")")
		# We need at least two readings for comparison
		((${#socket_uj[@]} > 1)) || continue

		# Convert to Watts - use bc since $interval can be an actual float
		reading=$(calc "(${socket_uj[-1]} - ${socket_uj[-2]}) / 1000000 / $interval")
		if [[ $reading == "-"* ]]; then
			# Somehow this may happen, probably when the counter wraps over. Consider
			# this as a faulty reading and don't include it since it may impact overall
			# avg.
			printf '(%s) CPU %s %s reading: error(%s) (interval: %ss, test: %s)\n' \
				"$ts" \
				"$socket_name" \
				"$socket_idx" \
				"$reading" \
				"$interval" \
				"$TEST_TAG" >&2
			# Only this zone is affected - the remaining ones still have to
			# be sampled in this round.
			continue
		fi

		local -n socket_readings=_socket${_socket_idx}_readings
		socket_readings+=("$reading")
		power_readings["$socket_name-$socket_idx"]="_socket${_socket_idx}_readings[@]"

		printf '(%s) CPU %s %s reading: %s Watts (interval: %ss, test: %s)\n' \
			"$ts" \
			"$socket_name" \
			"$socket_idx" \
			"$reading" \
			"$interval" \
			"$TEST_TAG" >&2
	done
}

# Take one reading through the interface selected in $support.
# Globals: support (read) - "dcmi" or "sdr"; any other value is a no-op
# Returns: the status of the interface specific reader, 0 for a no-op
get_now_reading() {
	case "$support" in
		dcmi) get_dcmi_now_reading ;;
		sdr) get_sdr_now_reading ;;
		*) ;;
	esac
}

# Write the collected readings and their average to files, per sensor.
# Globals:   power_readings (read) - maps a label to "<array name>[@]"
#            PM_OUTPUTDIR, prefix (read)
# Outputs:   <prefix_>avg_<label>.bmc.pm.txt - the average
#            <prefix_>all_<label>.bmc.pm.txt - every reading plus a "Total: N" line
#            progress on stderr
# Returns:   1 when nothing was collected, 0 otherwise
dump_readings() {
	local sensor label samples_ref readings avg sum_expr avg_file all_file

	((${#power_readings[@]} > 0)) || return 1
	printf 'Dumping average sensors reading from %s\n' "${!power_readings[*]}" >&2

	for sensor in "${!power_readings[@]}"; do
		# The stored value is "<array name>[@]", expanded here indirectly.
		samples_ref=${power_readings["$sensor"]}
		readings=("${!samples_ref}")
		if ((${#readings[@]} == 0)); then
			printf 'No readings available for %s sensor\n' "$sensor" >&2
			continue
		fi

		# A single bc run for the whole sum instead of one per reading.
		printf -v sum_expr '%s+' "${readings[@]}"
		avg=$(calc "(${sum_expr%+}) / ${#readings[@]}")

		readings+=("Total: ${#readings[@]}")

		# The label ends up in a file name: no whitespace, no path separators.
		label=${sensor//[[:space:]]/_}
		label=${label//\//_}
		avg_file=$PM_OUTPUTDIR/${prefix:+${prefix}_}avg_${label}.bmc.pm.txt
		all_file=$PM_OUTPUTDIR/${prefix:+${prefix}_}all_${label}.bmc.pm.txt

		printf '%s\n' "$avg" > "$avg_file"
		printf '%s\n' "${readings[@]}" > "$all_file"
		printf 'Dumped avg to %s\n' "$avg_file" >&2
		printf 'Dumped all to %s\n' "$all_file" >&2
	done
}

# Print a date in UTC.
# Arguments: $1 - optional, seconds since the epoch; current time when empty or missing
utc() {
	if [[ -n ${1:-} ]]; then
		date --utc -d "@$1"
	else
		date --utc
	fi
}

# Exit handler: run rm_pm_pid, remove the SDR cache when requested and dump
# the collected readings.
# Globals: sdr_cache, remove_sdr_cache (read)
# Returns: the status of dump_readings
cleanup() {
	# Housekeeping is best-effort: with "set -e" in effect a failure here
	# must not end the handler before the readings are written out.
	rm_pm_pid || :
	if [[ -f $sdr_cache && $remove_sdr_cache == yes ]]; then
		rm -f -- "$sdr_cache" || :
	fi
	dump_readings
}

# Main sampling loop: take a reading, then wait for $interval seconds.
# Globals:   count (read) - number of rounds; 0, empty or negative means run indefinitely
#            cpu_support, interval (read)
# Outputs:   a diagnostic on stderr when count is not an integer
# Returns:   1 when count is not an integer, otherwise the status of the last round
collect_readings() {
	local _count=${count:-0} forever=0

	# A typo in the count would otherwise evaluate to 0 and silently turn
	# into an endless run.
	if [[ ! $_count =~ ^-?[0-9]+$ ]]; then
		printf 'Invalid count (%s), expected an integer\n' "$_count" >&2
		return 1
	fi

	if [[ $_count == -* ]]; then
		_count=0
	else
		# Base 10 regardless of leading zeros.
		_count=$((10#$_count))
	fi

	if ((_count == 0)); then
		forever=1
	elif ((_count == 1 && cpu_support)); then
		# We need at least two readings to get a meaningful data
		_count=2
	fi

	while ((forever || _count-- > 0)); do
		get_now_reading
		if ((cpu_support)); then
			get_cpu_socket_reading
		fi
		sleep "${interval}s"
	done
}

# Print the usage message on stdout.
help() {
	# The text is kept at column 0 so the here-document does not depend on
	# tab indentation being preserved.
	cat << HELP

Usage: $0 [-h] [-d dir] [-i sdr|dcmi] [-s SENSOR_NAME] [-t interval] [-x] [-l] [-p prefix] [-c count] [-r]

  -h - Print this message.
  -d - Directory where the results should be saved. Default is /tmp.
  -i - Type of interface to use for requesting power usage. "sdr" or "dcmi".
       If not set, available interface is used ("dcmi" has priority).
  -t - How long to wait before each get power command in seconds. In case
       this value matches one of supported averaging time periods special
       variant of the command will be used to obtain the reading - this
       variant is used only with the "dcmi" interface. Default is 1s.
  -s - In case "sdr" interface is in use, try to read data from SENSOR_NAME.
  -x - In case "sdr" interface is in use, don't remove SDR cache. This can
       speed up subsequent runs of the script.
  -l - Save output of the script to a log file (dir/${0##*/}.bmc.pm.log).
  -p - Add prefix to saved files.
  -c - Read power usage count times. 0 is the default and it means to run
       indefinitely.
  -r - Include readings from CPU sockets (RAPL-dependent)

When started, ${0##*/} will enter loop to continuously read power usage from either
DCMI interface or dedicated Watts sensors every interval. Each reading will be
logged to stderr. Upon termination, average power usage will be dumped to /tmp or
directory set by -d.

HELP
}

# Defaults, overridable from the command line.
interval=1
remove_sdr_cache=yes
log_to_file=no
prefix=""
count=0
include_cpu=0

# Maps a sensor label to "<array name>[@]" of the array holding its samples.
declare -A power_readings=()
declare -a extra_power_sensors=()

while getopts :hi:s:d:t:xlp:c:r arg; do
	case "$arg" in
		h)
			help
			exit 0
			;;
		d) PM_OUTPUTDIR=$OPTARG ;;
		s) extra_power_sensors+=("$OPTARG") ;;
		i) interface=${OPTARG,,} ;;
		t) interval=$OPTARG ;;
		x) remove_sdr_cache=no ;;
		l) log_to_file=yes ;;
		p) prefix=$OPTARG ;;
		c) count=$OPTARG ;;
		r) include_cpu=1 ;;
		:)
			# Silent error reporting is on (leading ':' in the optstring),
			# so a missing argument has to be reported here.
			printf 'Option -%s requires an argument\n' "$OPTARG" >&2
			exit 1
			;;
		*) printf 'Ignoring unknown option -%s\n' "$OPTARG" >&2 ;;
	esac
done

# The interval ends up in "sleep <interval>s" and in bc expressions.
if [[ ! $interval =~ ^[0-9]+([.][0-9]+)?$ ]]; then
	printf 'Invalid interval (%s), expected a number of seconds\n' "$interval" >&2
	exit 1
fi

# Checked only after the options were parsed so that -h works without root.
is_root

# /tmp is the default the usage message documents; a value that is already
# set (from -d, or presumably from "common") is left untouched.
: "${PM_OUTPUTDIR:=/tmp}"

declare -r sdr_cache=$PM_OUTPUTDIR/sdr.cache
declare -r log_file=${prefix:+${prefix}_}${0##*/}.bmc.pm.log

mkdir -p "$PM_OUTPUTDIR"
if [[ $log_to_file == yes ]]; then
	printf 'Redirecting to %s\n' "$PM_OUTPUTDIR/$log_file" >&2
	exec > "$PM_OUTPUTDIR/$log_file" 2>&1
fi

save_pm_pid
# Readings are dumped from the EXIT handler, whatever ends the script.
trap 'cleanup' EXIT
trap 'retag' USR1

ipmi_supported
power_support

collect_readings