xref: /spdk/scripts/perf/pm/collect-bmc-pm (revision fecffda6ecf8853b82edccde429b68252f0a62c5)
1#!/usr/bin/env bash
2#  SPDX-License-Identifier: BSD-3-Clause
3#  Copyright (C) 2022 Intel Corporation
4#  All rights reserved.
5
6set -e
7
# Print every argument as a zero-padded, 0x-prefixed hex byte, one per line.
# Arguments: numbers in any base printf understands (decimal, 0x.., 0..).
# Note: called without arguments, printf still emits a single "0x00" line.
hex() {
	printf '0x%02x\n' "$@"
}
9
is_root() {
	# Talking to the local BMC device requires root privileges.
	# Returns: 0 when running as uid 0, 1 (with a message on stderr) otherwise.
	((UID == 0)) && return 0

	printf '%s, you need to be root to run this script\n' "$USER" >&2
	return 1
}
18
is_ipmitool() {
	# Locate the ipmitool binary in PATH.
	# Outputs: absolute path of ipmitool on stdout when found.
	# Returns: 0 when found, 1 (with a message on stderr) otherwise.
	type -P ipmitool && return 0

	printf 'ipmitool not detected, cannot run commands against the BMC\n' >&2
	return 1
}
25
ipmi_load() {
	# Best-effort, silent load of the core ipmi drivers. The device itself is
	# picked up later on, so a modprobe failure is deliberately not an error.
	# Returns: always 0.
	local -a drivers=(ipmi_si ipmi_devintf ipmi_msghandler)

	if ! modprobe -qa "${drivers[@]}"; then
		return 0
	fi
}
30
ipmi_supported() {
	# Verify that the kernel detected and registered at least one BMC under
	# the ipmi platform. Look for KCS specifically as this is the type of
	# interface the script was tested against.
	#
	# Globals: man_id, prod_id, dev_id, ipmi_ver, platform, board, ipmitool (written)
	# Outputs: BMC summary (or the reason for failure) on stderr
	# Returns: 0 when a KCS BMC and ipmitool are available, non-zero otherwise

	local ipmi=/sys/class/ipmi/ipmi0
	# Interface type is only needed here - keep it out of the global scope.
	local iface_type

	# Keep these details global for easy access if needed.
	local -g man_id prod_id dev_id ipmi_ver platform board ipmitool

	ipmi_load

	if [[ ! -e $ipmi ]]; then
		printf 'BMC not detected. Please, make sure your platform is IPMI-compatible\n' >&2
		return 1
	fi

	iface_type=$(< "$ipmi/device/type")

	if [[ $iface_type != kcs ]]; then
		printf 'No supported BMC interface detected (%s) - only KCS is supported\n' "$iface_type" >&2
		return 1
	fi

	man_id=$(< "$ipmi/device/bmc/manufacturer_id")
	prod_id=$(< "$ipmi/device/bmc/product_id")
	dev_id=$(hex "$(< "$ipmi/device/bmc/device_id")")
	ipmi_ver=$(< "$ipmi/device/bmc/ipmi_version")

	# DMI details are optional, they fall back to "unknown" in the summary.
	if [[ -e /sys/class/dmi/id/board_vendor ]]; then
		platform=$(< /sys/class/dmi/id/board_vendor)
	fi

	if [[ -e /sys/class/dmi/id/board_name ]]; then
		board=$(< /sys/class/dmi/id/board_name)
	fi

	# Keep output similar to ipmi_si's
	printf '%s\n' \
		"" \
		"BMC detected, details below:" \
		"Manufacturer ID: $man_id" \
		"Product ID: $prod_id" \
		"Device ID: $dev_id" \
		"IPMI Version: $ipmi_ver" \
		"Platform: ${platform:-unknown}" \
		"Board: ${board:-unknown}" \
		"" >&2

	# Verify if we have proper tools to work with. Fail explicitly rather than
	# depending on the caller running under errexit.
	ipmitool=$(is_ipmitool) || return 1
}
84
ipmiraw() {
	# For the majority of commands we use raw payload to not depend on specific ipmitool version
	# and the way how it interprets/parses the returned data. This also allows us to inspect the
	# integrity of data more closely to make sure we don't report nonsensical values to the user.
	#
	# Arguments: raw request bytes, passed verbatim to "ipmitool raw"
	# Globals:   ipmitool (read)
	# Outputs:   response bytes on stdout, one 0x-prefixed byte per line
	# Returns:   ipmitool's status when the request fails, 0 otherwise

	local out
	local -a rsp

	# Propagate the failure - otherwise an empty response would be rendered as a
	# bogus single "0x00" byte and callers could not tell an error from real data.
	out=$("$ipmitool" raw "$@" 2> /dev/null) || return

	# Word splitting is intended here: ipmitool prints whitespace separated hex
	# bytes, possibly spread across several lines.
	# shellcheck disable=SC2206
	rsp=($out)

	# Successful request without any payload - nothing to print.
	((${#rsp[@]} > 0)) || return 0

	# Slap hex prefix to work with proper base
	rsp=("${rsp[@]/#/0x}")

	hex "${rsp[@]}"
}
98
dcmiraw() {
	# Send a raw request framed the way the script frames every DCMI command:
	# 0x2c, the command byte, 0xdc, then the command data.
	# Arguments: $1 - command byte, $2.. - command data bytes
	# Outputs:   whatever ipmiraw prints for the request
	local -a payload=(0x2c "$1" 0xdc "${@:2}")

	ipmiraw "${payload[@]}"
}
104
print_dcmi_available_time_periods() {
	# Decode the averaging time periods advertised in enhanced_power_attr and
	# list them on stderr.
	#
	# Globals: enhanced_power_attr (read) - element 4 holds the number of periods,
	#          elements 5.. hold one encoded period each: bits [7:6] select the
	#          unit, bits [5:0] the duration.
	#          available_time_periods (written) - human readable labels, "NOW" first
	#          available_time_periods_in_seconds (written) - raw period byte indexed
	#          by the period's length in seconds
	local -g available_time_periods=("NOW")
	local -g available_time_periods_in_seconds=()

	local -a unit_names=(seconds minutes hours days)
	local -a unit_seconds=(1 60 3600 86400)
	local unit_mask=0xc0 first_period=5
	local period_count=${enhanced_power_attr[4]}
	local idx raw unit_idx duration seconds label

	for ((idx = first_period; idx < first_period + period_count; idx++)); do
		raw=${enhanced_power_attr[idx]}
		duration=$((raw & ~unit_mask))
		unit_idx=$((raw >> 6))
		label=${unit_names[unit_idx]:-unknown}
		seconds=$((duration * unit_seconds[unit_idx]))

		# A zero duration does not describe a usable period.
		((duration != 0)) || continue

		available_time_periods[idx]="$duration $label"
		available_time_periods_in_seconds[seconds]=${enhanced_power_attr[idx]}
	done

	{
		printf '\nAvailable averaging time periods to request:\n'
		printf '  - %s\n' "${available_time_periods[@]}"
		printf '\n'
	} >&2
}
145
dcmi_power_support() {
	# Verify if the BMC conforms to the DCMI spec and exposes power management.
	#
	# Globals: enhanced_power_attr (written)
	# Outputs: status messages on stderr
	# Returns: 0 when DCMI power readings can be used, 1 otherwise
	local -a rsp

	# Table 6-2, Get DCMI Capabilities Command Format. The capability byte we
	# need is the 6th byte of the response, so anything shorter is unusable.
	if ! rsp=($(dcmiraw 0x1 0x1)) || ((${#rsp[@]} < 6)); then
		printf 'Cannot determine if BMC supports DCMI Power Management capability\n' >&2
		return 1
	fi

	# Table 6-3, DCMI Capabilities Parameters:
	#  - Supported DCMI Capabilities:
	#    - Byte 2 Platform capabilities: [0] Power management
	if (((rsp[5] & (1 << 0)) == 0)); then
		printf 'BMC does not provide DCMI Power Management capability\n' >&2
		return 1
	fi

	# Check if BMC provides Enhanced System Power Statistics attributes - this allows to issue
	# requests for power readings at averaging time period, e.g. from last 5 seconds, 30 minutes,
	# 1 hour and so on. With this we can provide more detailed view on power usage within a
	# specific period of time. Without it, we need to depend only on current reading that should
	# be always available (the "NOW" reading).

	local -g enhanced_power_attr=()

	# Table 6-3, DCMI Capabilities Parameters:
	#  - Enhanced System Power Statistics attributes
	if enhanced_power_attr=($(dcmiraw 0x1 0x5)); then
		print_dcmi_available_time_periods
	fi

	printf 'Using DCMI Power Management\n' >&2
}
180
_sdr_add_power_sensor() {
	# Append sensor name $1 to power_sensors unless it is already listed.
	local known

	for known in "${power_sensors[@]}"; do
		if [[ $known == "$1" ]]; then
			return 0
		fi
	done
	power_sensors+=("$1")
}

sdr_power_support() {
	# This is a fallback which only some platforms may provide (confirmed PowerEdge and CYP).
	# We are looking for a full, threshold sensor which reports overall power usage in Watts.
	# Different BMCs may have SDRs which describe such sensor(s) differently so this is not
	# 100% reliable. To make sure we pick up a proper sensor we also narrow it down to a
	# specific entity (System Board or Power Supply). Readings from the sensor should be
	# considered as "NOW" readings (without access to min, max readings).
	#
	# Globals: sdr_cache, ipmitool, extra_power_sensors (read), power_sensors (written)
	# Returns: 0 when at least one power sensor is known, 1 otherwise

	local -g power_sensors=()
	local sensor entity unit status

	# Cache SDR to speed up sensor readings
	if [[ ! -f $sdr_cache ]]; then
		printf 'Saving SDR cache at %s\n' "$sdr_cache" >&2
		"$ipmitool" sdr dump "$sdr_cache" > /dev/null
	fi

	# User-requested sensors go first. Each name is listed only once so a sensor
	# that is both requested and auto-detected is not queried twice per interval.
	for sensor in "${extra_power_sensors[@]}"; do
		_sdr_add_power_sensor "$sensor"
	done

	while IFS="," read -r sensor _ unit status _ entity _; do
		[[ $unit == Watts && $status == ok ]] || continue
		[[ $entity == "System Board" || $entity == "Power Supply" ]] || continue
		_sdr_add_power_sensor "$sensor"
	done < <("$ipmitool" -S "$sdr_cache" -vc sdr list full 2>&1)

	if ((${#power_sensors[@]} == 0)); then
		printf 'Cannot locate power sensors\n' >&2
		return 1
	fi

	printf 'Using SDR (Power sensors: %s)\n' "${power_sensors[*]}" >&2
}
215
power_support() {
	# Pick the source of system-wide power readings.
	#
	# Globals: include_cpu, interface (read); support, cpu_support (written)
	# Returns: 0 when BMC and/or CPU readings are available, 1 otherwise
	local -g support="" cpu_support=0

	if ((include_cpu == 1)) && rapl_supported; then
		cpu_support=1
	fi

	if [[ $interface == dcmi || $interface == sdr ]]; then
		# Override requested by the user. Check the result explicitly so a failed
		# probe is never recorded as the active interface, regardless of whether
		# the caller runs us under errexit.
		"${interface}_power_support" || return 1
		support=$interface
		return 0
	fi

	# No override - DCMI has priority, SDR is the fallback.
	if dcmi_power_support; then
		support=dcmi
	elif sdr_power_support; then
		support=sdr
	else
		printf 'BMC does not provide Power Management support, cannot gather system-wide power measurements\n' >&2
		if ((cpu_support)); then
			printf 'Only CPU measurements will be provided\n' >&2
			return 0
		fi
		return 1
	fi
}
240
get_dcmi_now_reading() {
	# Request a power reading over DCMI, record it and log it to stderr.
	#
	# Globals: interval, available_time_periods_in_seconds (read);
	#          power_readings, _DCMI, _DCMI_min, _DCMI_max, _DCMI_avg (written)
	# Returns: always 0 - a failed request is logged but does not stop collection
	local -a rsp get_cmd
	local reading=0 max min avg ts timeframe mode=01h
	local get_avg=0 print

	# Table 6-16, Get Power Reading Command:
	get_cmd=(0x2 0x1 0x0 0x0)

	if [[ $interval =~ ^[0-9]+$ && -n ${available_time_periods_in_seconds[interval]} ]]; then
		get_cmd=(0x2 0x2 "${available_time_periods_in_seconds[interval]}" 0x0)
		get_avg=1
		mode=02h
	fi

	# We use System Power Statistics mode to get the "NOW" reading by default. In case
	# interval matches one supported by Enhanced System Power Statistics we use that
	# mode to obtain extra min, max, avg statistics.

	# The fields used below live in the first 13 response bytes. Anything shorter
	# is a failed or truncated response and must not be recorded as 0 Watts.
	if ! rsp=($(dcmiraw "${get_cmd[@]}")) || ((${#rsp[@]} < 13)); then
		printf 'DCMI reading: error\n' >&2
		return 0
	fi

	# Note that the BMC timestamp depends on the hwclock setup which we then attempt
	# to represent in UTC.
	ts=$((rsp[12] << 24 | rsp[11] << 16 | rsp[10] << 8 | rsp[9]))
	# This is interpreted differently by different BMCs so for now we make a note of
	# it but don't present it to the user.
	timeframe=$((rsp[16] << 24 | rsp[15] << 16 | rsp[14] << 8 | rsp[13]))
	reading=$((rsp[2] << 8 | rsp[1]))

	if ((get_avg == 1)); then
		min=$((rsp[4] << 8 | rsp[3]))
		max=$((rsp[6] << 8 | rsp[5]))
		avg=$((rsp[8] << 8 | rsp[7]))
		_DCMI_min+=("$min")
		_DCMI_max+=("$max")
		_DCMI_avg+=("$avg")
		power_readings["DCMI_MIN"]="_DCMI_min[@]"
		power_readings["DCMI_MAX"]="_DCMI_max[@]"
		power_readings["DCMI_AVG"]="_DCMI_avg[@]"
	fi
	_DCMI+=("$reading")
	power_readings["DCMI"]="_DCMI[@]"

	# min/max/avg are only set in the enhanced mode - skip whatever is missing.
	for print in min max avg reading; do
		[[ -n ${!print} ]] || continue
		printf '(%s) DCMI %s (mode: %s): %u Watts (interval: %ss)\n' \
			"$(utc "$ts")" \
			"$print" \
			"$mode" \
			"${!print}" \
			"$interval" >&2
	done
}
293
get_sdr_now_reading() {
	# Read every sensor listed in power_sensors once, record the numeric value and
	# log the result to stderr.
	#
	# Globals: power_sensors, ipmitool, sdr_cache, interval (read);
	#          power_readings, _sensor<N>_readings (written)
	# Returns: 1 when no sensors are known, 0 otherwise
	local sensor reading ts unit
	local numeric='^-?[0-9]+([.][0-9]+)?$'

	if ((${#power_sensors[@]} == 0)); then
		printf 'No power sensors were provided\n' >&2
		return 1
	fi

	for sensor in "${!power_sensors[@]}"; do
		ts=$(utc)
		reading="" unit=""
		# The value comes straight from the BMC - never feed it to eval and only
		# record it when it really is a number, so the averages stay computable.
		if IFS="," read -r _ reading unit _ \
			< <("$ipmitool" -c -S "$sdr_cache" sdr get "${power_sensors[sensor]}" 2> /dev/null) \
			&& [[ $reading =~ $numeric ]]; then
			local -n sensor_readings=_sensor${sensor}_readings
			sensor_readings+=("$reading")
			power_readings["${power_sensors[sensor]}"]="_sensor${sensor}_readings[@]"
			reading+=" $unit"
		else
			reading=error
		fi
		printf '(%s) Sensor %s reading: %s (interval %ss)\n' \
			"$ts" \
			"${power_sensors[sensor]}" \
			"$reading" \
			"$interval" >&2
	done
}
318
rapl_supported() {
	# Tell whether the intel-rapl powercap control type is present in sysfs.
	# Returns: 0 when present, 1 otherwise
	local rapl_root=/sys/class/powercap/intel-rapl

	if [[ -e $rapl_root ]]; then
		return 0
	fi
	return 1
}
322
get_cpu_socket_reading() {
	# Sample the energy counter of every intel-rapl powercap zone and record the
	# average power drawn since the previous sample.
	#
	# Arguments: $1 - (optional) powercap sysfs root, default /sys/class/powercap
	# Globals:   interval (read); power_readings, socket_<idx>_uj,
	#            _socket<idx>_readings (written)
	local rapl=${1:-/sys/class/powercap}
	local socket socket_idx _socket_idx socket_name
	local ts reading delta max_range

	# power_uw is usually not available so we need to rely on energy_uj. It's also rarely
	# rw so we can't zero it out, hence we need to keep track of the initial counter. For
	# details see kernel documentation (powercap.rst).
	ts=$(utc)
	for socket in "$rapl"/intel-rapl:*; do
		[[ -e $socket ]] || continue

		# intel-rapl:0 -> 0, intel-rapl:0:1 -> 0:1
		socket_idx=${socket##*/}
		socket_idx=${socket_idx#*:}
		socket_name=$(< "$socket/name")
		# Adjust for different domains, see linux/intel_rapl.h
		case "$socket_name" in
			dram | core | uncore) _socket_idx=${socket_idx//:/_} socket_idx=${socket_idx%:*} ;;
			package-*) _socket_idx=$socket_idx socket_name=socket ;;
			psys*) _socket_idx=$socket_idx socket_name=platform ;;
			# Unknown zone - still give it its own counters instead of reusing the
			# index left behind by the previous zone.
			*) _socket_idx=${socket_idx//:/_} ;;
		esac

		local -n socket_uj=socket_${_socket_idx}_uj
		socket_uj+=("$(< "$socket/energy_uj")")
		# We need at least two readings for comparison
		((${#socket_uj[@]} > 1)) || continue

		delta=$((${socket_uj[-1]:-0} - ${socket_uj[-2]:-0}))
		# The counter wraps around at max_energy_range_uj - compensate for it so a
		# wrap does not show up as negative power.
		if ((delta < 0)) && [[ -r $socket/max_energy_range_uj ]]; then
			max_range=$(< "$socket/max_energy_range_uj")
			delta=$((delta + ${max_range:-0}))
		fi

		# Convert to Watts - use bc since $interval can be an actual float
		reading=$(bc <<< "scale=2; $delta / 1000000 / $interval")
		if [[ -z $reading ]]; then
			printf '(%s) CPU %s %s reading: error (interval: %ss)\n' \
				"$ts" \
				"$socket_name" \
				"$socket_idx" \
				"$interval" >&2
			continue
		fi

		local -n socket_readings=_socket${_socket_idx}_readings
		socket_readings+=("$reading")
		power_readings["$socket_name-$socket_idx"]="_socket${_socket_idx}_readings[@]"

		printf '(%s) CPU %s %s reading: %s Watts (interval: %ss)\n' \
			"$ts" \
			"$socket_name" \
			"$socket_idx" \
			"$reading" \
			"$interval" >&2
	done
}
361
get_now_reading() {
	# Dispatch to the reader matching the interface selected in $support.
	# Any other value (e.g. CPU-only collection) is a no-op.
	if [[ $support == dcmi ]]; then
		get_dcmi_now_reading
	elif [[ $support == sdr ]]; then
		get_sdr_now_reading
	fi
}
369
dump_readings() {
	# Write the collected readings of every sensor, and their integer average, to
	# files under $output_dir.
	#
	# Globals: power_readings, output_dir, prefix (read)
	# Outputs: progress on stderr, avg_<sensor> and all_<sensor> files on disk
	# Returns: 1 when nothing was collected, 0 otherwise
	local sensor reading readings ref avg total whole file_base

	((${#power_readings[@]} > 0)) || return 1
	printf 'Dumping average sensors reading from %s\n' "${!power_readings[*]}" >&2

	for sensor in "${!power_readings[@]}"; do
		# Each entry names the array ("name[@]") which holds the actual values.
		ref=${power_readings["$sensor"]}
		readings=("${!ref}")
		if ((${#readings[@]} == 0)); then
			printf 'No readings available for %s sensor\n' "$sensor" >&2
			continue
		fi
		total=0
		for reading in "${readings[@]}"; do
			# Only the integer part is summed up. bc prints values below 1 without
			# the leading zero (".50") so the integer part may end up empty.
			whole=${reading%.*}
			[[ $whole =~ ^-?[0-9]+$ ]] || whole=0
			# Plain assignment on purpose: ((total += x)) returns 1 whenever the
			# sum is 0, which would abort the dump under errexit.
			total=$((total + whole))
		done
		avg=$((total / ${#readings[@]}))

		readings+=("Total: ${#readings[@]}")
		file_base=$output_dir/${prefix:+${prefix}_}
		printf '%u\n' "$avg" > "${file_base}avg_${sensor}.bmc.pm.txt"
		printf '%s\n' "${readings[@]}" > "${file_base}all_${sensor}.bmc.pm.txt"
		printf 'Dumped avg to %s\n' "${file_base}avg_${sensor}.bmc.pm.txt" >&2
		printf 'Dumped all to %s\n' "${file_base}all_${sensor}.bmc.pm.txt" >&2
	done
}
395
utc() {
	# Print a date in UTC.
	# Arguments: $1 - (optional) seconds since the epoch; current time when omitted
	if [[ -n $1 ]]; then
		# Quoted so the timestamp is never subject to word splitting or globbing.
		date --utc -d "@$1"
	else
		date --utc
	fi
}
399
cleanup() {
	# Exit handler: drop the SDR cache unless the user asked to keep it, then
	# dump whatever readings were collected.
	if [[ $remove_sdr_cache == yes && -f $sdr_cache ]]; then
		rm "$sdr_cache"
	fi
	dump_readings
}
404
collect_readings() {
	# Main loop: take a reading every $interval seconds, $count times, or until
	# interrupted when count is 0 (or less).
	#
	# Globals: count, cpu_support, interval (read)
	local remaining=$count

	if ((remaining == 1 && cpu_support)); then
		# We need at least two readings to get a meaningful data
		remaining=2
	fi

	while :; do
		get_now_reading
		if ((cpu_support)); then
			get_cpu_socket_reading
		fi

		# Stop right after the last requested reading - sleeping once more would
		# only delay the exit by a full interval.
		if ((count > 0 && --remaining <= 0)); then
			break
		fi
		sleep "${interval}s"
	done
}
417
help() {
	# Print usage information on stdout. The synopsis mirrors the getopts string
	# used by the script: -x and -l are plain switches, the rest take a value.
	cat <<- HELP

		Usage: $0 [-h] [-d dir] [-i sdr|dcmi] [-s SENSOR_NAME] [-t interval] [-x] [-l] [-p prefix] [-c count] [-r]

		  -h - Print this message.
		  -d - Directory where the results should be saved. Default is /tmp.
		  -i - Type of interface to use for requesting power usage. "sdr" or "dcmi".
		       If not set, available interface is used ("dcmi" has priority).
		  -t - How long to wait before each get power command in seconds. In case
		       this value matches one of supported averaging time periods special
		       variant of the command will be used to obtain the reading - this
		       variant is used only with the "dcmi" interface. Default is 1s.
		  -s - In case "sdr" interface is in use, try to read data from SENSOR_NAME.
		  -x - In case "sdr" interface is in use, don't remove SDR cache. This can
		       speed up subsequent runs of the script.
		  -l - Save output of the script to a log file (dir/${0##*/}.bmc.pm.log).
		  -p - Add prefix to saved files.
		  -c - Read power usage count times. 0 is the default and it means to run
		       indefinitely.
		  -r - Include readings from CPU sockets (RAPL-dependent)

		When started, ${0##*/} will enter loop to continuously read power usage from either
		DCMI interface or dedicated Watts sensors every interval. Each reading will be
		logged to stderr. Upon termination, average power usage will be dumped to /tmp or
		directory set by -d.

	HELP
}
447
# Defaults, may be overridden from the command line.
output_dir=/tmp
interval=1
remove_sdr_cache=yes
log_to_file=no
prefix=""
count=0
include_cpu=0
interface=""

declare -A power_readings=()
declare -a extra_power_sensors=()

# Options are parsed before the root check so that -h works for any user.
while getopts :hi:s:d:t:xlp:c:r arg; do
	case "$arg" in
		h)
			help
			exit 0
			;;
		d) output_dir=$OPTARG ;;
		s) extra_power_sensors+=("$OPTARG") ;;
		i) interface=${OPTARG,,} ;;
		t) interval=$OPTARG ;;
		x) remove_sdr_cache=no ;;
		l) log_to_file=yes ;;
		p) prefix=$OPTARG ;;
		c) count=$OPTARG ;;
		r) include_cpu=1 ;;
		# Bad options are still tolerated, but no longer silently.
		:) printf 'Option -%s requires an argument, ignoring it\n' "$OPTARG" >&2 ;;
		*) printf 'Unknown option -%s, ignoring it\n' "$OPTARG" >&2 ;;
	esac
done

# Both values end up in arithmetic, sleep and bc - reject garbage up front
# instead of failing somewhere in the middle of the collection.
if [[ ! $count =~ ^-?[0-9]+$ ]]; then
	printf 'Invalid count (%s), expected an integer\n' "$count" >&2
	exit 1
fi

if [[ ! $interval =~ ^([0-9]+([.][0-9]*)?|[.][0-9]+)$ ]]; then
	printf 'Invalid interval (%s), expected a number of seconds\n' "$interval" >&2
	exit 1
fi

is_root

declare -r sdr_cache=$output_dir/sdr.cache
declare -r log_file=${prefix:+${prefix}_}${0##*/}.bmc.pm.log

mkdir -p "$output_dir"
if [[ $log_to_file == yes ]]; then
	printf 'Redirecting to %s\n' "$output_dir/$log_file" >&2
	exec > "$output_dir/$log_file" 2>&1
fi

# Readings are dumped from the exit handler, whichever way the script ends.
trap 'cleanup' EXIT

ipmi_supported
power_support

collect_readings
494collect_readings
495