xref: /spdk/scripts/perf/pm/collect-bmc-pm (revision 60982c759db49b4f4579f16e3b24df0725ba4b94)
1#!/usr/bin/env bash
2#  SPDX-License-Identifier: BSD-3-Clause
3#  Copyright (C) 2022 Intel Corporation
4#  All rights reserved.
5
6set -e
7
# Print each argument as a 0x-prefixed hex number padded to at least two
# digits, one per line. Arguments are interpreted by printf, so decimal and
# 0x-prefixed input are both accepted. Called without arguments the format
# is still applied once, which yields a single 0x00 line.
hex() {
	local byte_fmt='0x%02x\n'

	# shellcheck disable=SC2059 # the format is the constant defined right above
	printf "$byte_fmt" "$@"
}
9
# Evaluate the arithmetic expression given as arguments with bc, using a
# scale of 2, and print the result on stdout.
calc() {
	local expression="scale=2; $*"

	bc <<< "$expression"
}
11
is_root() {
	# Talking to local BMC device requires root privileges.
	# Returns 0 when running as root, otherwise complains on stderr and returns 1.
	((UID == 0)) && return 0

	printf '%s, you need to be root to run this script\n' "$USER" >&2
	return 1
}
20
is_ipmitool() {
	# Print the full path of the ipmitool binary found in PATH (callers capture
	# it from stdout). When it is missing, explain on stderr and return 1.
	type -P ipmitool && return 0

	printf 'ipmitool not detected, cannot run commands against the BMC\n' >&2
	return 1
}
27
ipmi_load() {
	# Silently attempt to load core ipmi drivers - we will pick up the device later on.
	# This is best effort only: a modprobe failure is ignored and 0 is always returned.
	local drivers=(ipmi_si ipmi_devintf ipmi_msghandler)

	modprobe -qa "${drivers[@]}" || :
}
32
ipmi_supported() {
	# Verify if kernel detected and registered at least one BMC under
	# the ipmi platform. Look for KCS specifically as this is the type
	# of the interface the script was tested against.
	#
	# Globals set: man_id, prod_id, dev_id, ipmi_ver, platform, board, ipmitool
	# Returns:     non-zero (with a message on stderr) when ipmi0 is missing or
	#              its interface is not KCS; otherwise the status of the
	#              ipmitool lookup.

	local ipmi=/sys/class/ipmi/ipmi0
	# Interface type is only needed here - keep it local so that we don't leak
	# a global variable into the rest of the script.
	local iface_type

	# Keep these details global for easy access if needed.
	local -g man_id prod_id dev_id ipmi_ver platform board ipmitool

	ipmi_load

	if [[ ! -e $ipmi ]]; then
		printf 'BMC not detected. Please, make sure your platform is IPMI-compatible\n' >&2
		return 1
	fi

	iface_type=$(< "$ipmi/device/type")

	if [[ $iface_type != kcs ]]; then
		printf 'No supported BMC interface detected (%s) - only KCS is supported\n' "$iface_type" >&2
		return 1
	fi

	man_id=$(< "$ipmi/device/bmc/manufacturer_id")
	prod_id=$(< "$ipmi/device/bmc/product_id")
	dev_id=$(hex "$(< "$ipmi/device/bmc/device_id")")
	ipmi_ver=$(< "$ipmi/device/bmc/ipmi_version")

	# DMI details are optional, "unknown" is printed when they are missing.
	if [[ -e /sys/class/dmi/id/board_vendor ]]; then
		platform=$(< /sys/class/dmi/id/board_vendor)
	fi

	if [[ -e /sys/class/dmi/id/board_name ]]; then
		board=$(< /sys/class/dmi/id/board_name)
	fi

	# Keep output similar to ipmi_si's
	{
		printf '\nBMC detected, details below:\n'
		printf '%s: %s\n' \
			"Manufacturer ID" "$man_id" \
			"Product ID" "$prod_id" \
			"Device ID" "$dev_id" \
			"IPMI Version" "$ipmi_ver" \
			"Platform" "${platform:-unknown}" \
			"Board" "${board:-unknown}"
		printf '\n'
	} >&2

	# Verify if we have proper tools to work with. is_ipmitool prints the path
	# of the binary, which is what the rest of the script executes.
	ipmitool=$(is_ipmitool)
}
86
ipmiraw() {
	# For the majority of commands we use raw payload to not depend on specific ipmitool version
	# and the way how it interprets/parses the returned data. This also allows us to inspect the
	# integrity of data more closely to make sure we don't report nonsensical values to the user.
	#
	# Arguments: raw request bytes, passed verbatim to "ipmitool raw".
	# Outputs:   response bytes, one 0x-prefixed value per line.
	# Returns:   ipmitool's exit status when the request fails, 1 when the
	#            response carries no data, 0 otherwise.

	local out rsp

	# Bail out on failure instead of letting the code below turn an empty
	# response into a bogus "0x00" byte - callers depend on the exit status
	# to tell a failed request from a valid one.
	out=$("$ipmitool" raw "$@" 2> /dev/null) || return

	# The payload is whitespace-separated hex bytes, possibly spanning several lines.
	# shellcheck disable=SC2206 # word splitting is exactly what we want here
	rsp=($out)
	((${#rsp[@]} > 0)) || return 1

	# Slap hex prefix to work with proper base
	rsp=("${rsp[@]/#/0x}")

	hex "${rsp[@]}"
}
100
dcmiraw() {
	# Send a raw DCMI request through ipmiraw: $1 is the command byte and the
	# remaining arguments are its data bytes. The request is wrapped as
	# "0x2c <cmd> 0xdc <data...>" (presumably the group-extension NetFn and the
	# DCMI group identifier - see the DCMI specification).
	ipmiraw 0x2c "$1" 0xdc "${@:2}"
}
106
print_dcmi_available_time_periods() {
	# Decode the averaging time periods found in enhanced_power_attr and list
	# them on stderr.
	#
	# Reads:  enhanced_power_attr - [4] holds the number of periods, the period
	#         bytes follow from [5] on. In each byte the two top bits select the
	#         unit and the remaining bits hold the duration.
	# Writes: available_time_periods - printable names, "NOW" always at [0]
	#         available_time_periods_in_seconds - raw period byte indexed by the
	#         period length in seconds
	local -g available_time_periods=("NOW")
	local -g available_time_periods_in_seconds=()

	local period_count=${enhanced_power_attr[4]}
	local first_idx=5 unit_bits=0xc0
	local idx raw_period duration unit_id seconds
	local unit_names=(seconds minutes hours days)
	local unit_seconds=(1 60 3600 86400)

	for ((idx = first_idx; idx < first_idx + period_count; idx++)); do
		raw_period=${enhanced_power_attr[idx]}
		duration=$((raw_period & ~unit_bits))
		# Periods with zero duration are not listed
		((duration != 0)) || continue

		unit_id=$((raw_period >> 6))
		seconds=$((duration * unit_seconds[unit_id]))

		available_time_periods[idx]="$duration ${unit_names[unit_id]:-unknown}"
		available_time_periods_in_seconds[seconds]=$raw_period
	done

	{
		printf '\nAvailable averaging time periods to request:\n'
		printf '  - %s\n' "${available_time_periods[@]}"
		printf '\n'
	} >&2
}
147
dcmi_power_support() {
	# Verify if the BMC conforms to the DCMI spec and offers Power Management.
	#
	# Writes:  enhanced_power_attr (global) - raw Enhanced System Power Statistics
	#          attributes, left empty when the BMC does not provide them.
	# Returns: 0 when DCMI Power Management can be used, 1 otherwise (reason is
	#          printed to stderr).
	local rsp

	# Table 6-2, Get DCMI Capabilities Command Format
	# The capability byte checked below is rsp[5], so a response shorter than 6
	# bytes cannot be interpreted and is treated the same as a failed request.
	if ! rsp=($(dcmiraw 0x1 0x1)) || ((${#rsp[@]} < 6)); then
		printf 'Cannot determine if BMC supports DCMI Power Management capability\n' >&2
		return 1
	fi

	# Table 6-3, DCMI Capabilities Parameters:
	#  - Supported DCMI Capabilities:
	#    - Byte 2 Platform capabilities: [0] Power management
	if ((!(rsp[5] & (1 << 0)))); then
		printf 'BMC does not provide DCMI Power Management capability\n' >&2
		return 1
	fi

	# Check if BMC provides Enhanced System Power Statistics attributes - this allows to issue
	# requests for power readings at averaging time period, e.g. from last 5 seconds, 30 minutes,
	# 1 hour and so on. With this we can provide more detailed view on power usage within a
	# specific period of time. Without it, we need to depend only on current reading that should
	# be always available (the "NOW" reading).

	local -g enhanced_power_attr=()

	# Table 6-3, DCMI Capabilities Parameters:
	#  - Enhanced System Power Statistics attributes
	# The number of time periods lives in [4] - ignore responses that are too
	# short to carry it.
	if enhanced_power_attr=($(dcmiraw 0x1 0x5)) && ((${#enhanced_power_attr[@]} > 4)); then
		print_dcmi_available_time_periods
	else
		enhanced_power_attr=()
	fi

	printf 'Using DCMI Power Management\n' >&2
}
182
sdr_power_support() {
	# Fallback used when DCMI is not an option; only some platforms provide it
	# (confirmed PowerEdge and CYP). We look for full, threshold sensors reporting
	# overall power usage in Watts. BMCs describe such sensors differently in their
	# SDRs, so this is not 100% reliable - to pick up a proper sensor we additionally
	# accept only sensors which belong to the System Board or Power Supply entity.
	# Readings from these sensors are "NOW" readings (no access to min, max readings).
	#
	# Writes:  power_sensors (global) - user supplied sensors first, detected ones after
	# Returns: 1 when no power sensor is known in the end

	local -g power_sensors=()
	local name units state owner

	# Cache SDR to speed up sensor readings
	if [[ ! -f $sdr_cache ]]; then
		printf 'Saving SDR cache at %s\n' "$sdr_cache" >&2
		"$ipmitool" sdr dump "$sdr_cache" > /dev/null
	fi

	((${#extra_power_sensors[@]} == 0)) || power_sensors+=("${extra_power_sensors[@]}")

	# Only the 1st, 3rd, 4th and 6th comma-separated fields are of interest
	while IFS="," read -r name _ units state _ owner _; do
		if [[ $units != Watts || $state != ok ]]; then
			continue
		fi
		case "$owner" in
			"System Board" | "Power Supply") power_sensors+=("$name") ;;
		esac
	done < <("$ipmitool" -S "$sdr_cache" -vc sdr list full 2>&1)

	if ((${#power_sensors[@]} == 0)); then
		printf 'Cannot locate power sensors\n' >&2
		return 1
	fi

	printf 'Using SDR (Power sensors: %s)\n' "${power_sensors[*]}" >&2
}
217
power_support() {
	# Decide where power readings are going to come from.
	#
	# Reads:   include_cpu, interface
	# Writes:  support (dcmi, sdr or left untouched), cpu_support (0 or 1)
	# Returns: 1 only when neither the BMC nor the CPU can provide readings
	local -g support cpu_support=0

	if ((include_cpu == 1)) && rapl_supported; then
		cpu_support=1
	fi

	case "$interface" in
		dcmi | sdr)
			# override - the user asked for this interface explicitly
			"${interface}_power_support"
			support=$interface
			return 0
			;;
	esac

	# No override, probe the interfaces - dcmi has priority
	if dcmi_power_support; then
		support=dcmi
		return 0
	fi

	if sdr_power_support; then
		support=sdr
		return 0
	fi

	printf 'BMC does not provide Power Management support, cannot gather system-wide power measurements\n' >&2
	((cpu_support)) || return 1

	printf 'Only CPU measurements will be provided\n' >&2
	return 0
}
242
get_dcmi_now_reading() {
	# Request a single power reading over DCMI, record it and log it.
	#
	# Reads:  interval, available_time_periods_in_seconds
	# Writes: _DCMI (and _DCMI_min, _DCMI_max, _DCMI_avg when an averaging period
	#         is in use) together with the matching power_readings entries
	# All messages, including the readings themselves, go to stderr.
	local rsp reading=0 max min avg ts timeframe mode=01h
	local get_cmd get_avg=0 print
	# The highest response byte parsed below is rsp[16] - anything shorter than
	# 17 bytes cannot hold a valid reading.
	local -r min_rsp_len=17

	# Table 6-16, Get Power Reading Command:
	get_cmd=(0x2 0x1 0x0 0x0)

	if [[ $interval =~ ^[0-9]+$ && -n ${available_time_periods_in_seconds[interval]} ]]; then
		get_cmd=(0x2 0x2 "${available_time_periods_in_seconds[interval]}" 0x0)
		get_avg=1
		mode=02h
	fi

	# We use System Power Statistics mode to get the "NOW" reading by default. In case
	# interval matches one supported by Enhanced System Power Statistics we use that
	# mode to obtain extra min, max, avg statistics.

	# A truncated response is handled like a failed request. Without the length
	# check the missing bytes would silently evaluate to 0 and a bogus 0 Watts
	# reading would drag the dumped average down.
	if ! rsp=($(dcmiraw "${get_cmd[@]}")) || ((${#rsp[@]} < min_rsp_len)); then
		printf 'DCMI reading: error\n'
	else
		# Note that the BMC timestamp depends on the hwclock setup which we then attempt
		# to represent in UTC.
		ts=$((rsp[12] << 24 | rsp[11] << 16 | rsp[10] << 8 | rsp[9]))
		# This is interpreted differently by different BMCs so for now we make a note of
		# it but don't present it to the user.
		timeframe=$((rsp[16] << 24 | rsp[15] << 16 | rsp[14] << 8 | rsp[13]))
		reading=$((rsp[2] << 8 | rsp[1]))
		if ((get_avg == 1)); then
			min=$((rsp[4] << 8 | rsp[3]))
			max=$((rsp[6] << 8 | rsp[5]))
			avg=$((rsp[8] << 8 | rsp[7]))
			_DCMI_min+=("$min")
			_DCMI_max+=("$max")
			_DCMI_avg+=("$avg")
			power_readings["DCMI_MIN"]="_DCMI_min[@]"
			power_readings["DCMI_MAX"]="_DCMI_max[@]"
			power_readings["DCMI_AVG"]="_DCMI_avg[@]"
		fi
		_DCMI+=("$reading")
		power_readings["DCMI"]="_DCMI[@]"

		# min, max and avg are left unset outside of the averaging mode and
		# are skipped here
		for print in min max avg reading; do
			[[ -n ${!print} ]] || continue
			printf '(%s) DCMI %s (mode: %s): %u Watts (interval: %ss)\n' \
				"$(utc "$ts")" \
				"$print" \
				"$mode" \
				"${!print}" \
				"$interval"
		done
	fi >&2
}
295
get_sdr_now_reading() {
	# Read the current value of every sensor listed in power_sensors, record it
	# and log it to stderr.
	#
	# Reads:   power_sensors, sdr_cache, ipmitool, interval
	# Writes:  _sensor<idx>_readings arrays and the matching power_readings entries
	# Returns: 1 when there are no sensors to read
	local sensor reading ts unit
	# Only plain numbers are recorded. Anything else coming back from ipmitool
	# is reported as an error, it would break the average computed at exit.
	local numeric_re='^-?[0-9]+([.][0-9]+)?$'

	if ((${#power_sensors[@]} == 0)); then
		printf 'No power sensors were provided\n' >&2
		return 1
	fi

	for sensor in "${!power_sensors[@]}"; do
		ts=$(utc)
		reading="" unit=""
		# The reading is the 2nd and its unit the 3rd comma-separated field. A
		# failed read leaves $reading empty which is caught by the check below.
		IFS="," read -r _ reading unit _ \
			< <("$ipmitool" -c -S "$sdr_cache" sdr get "${power_sensors[sensor]}" 2> /dev/null) || true

		if [[ $reading =~ $numeric_re ]]; then
			# Append through a nameref rather than eval - the value comes from an
			# external tool and must never be interpreted as shell code.
			local -n sensor_readings=_sensor${sensor}_readings
			sensor_readings+=("$reading")
			power_readings["${power_sensors[sensor]}"]="_sensor${sensor}_readings[@]"
			reading+=" $unit"
		else
			reading=error
		fi

		printf '(%s) Sensor %s reading: %s (interval %ss)\n' \
			"$ts" \
			"${power_sensors[sensor]}" \
			"$reading" \
			"$interval" >&2
	done
}
320
rapl_supported() {
	# Succeeds when the intel-rapl entry exists under the powercap class.
	local rapl_root=/sys/class/powercap/intel-rapl

	test -e "$rapl_root"
}
324
get_cpu_socket_reading() {
	# Work out the power drawn in each RAPL zone since the previous call.
	#
	# Reads:  interval
	# Writes: socket_<idx>_uj (raw energy counters), _socket<idx>_readings
	#         (computed Watts) and the matching power_readings entries
	# Every reading, or the reason it was skipped, is logged to stderr.
	local rapl=/sys/class/powercap
	local socket socket_idx _socket_idx socket_name
	local ts reading

	# power_uw is usually not available so we need to rely on energy_uj. It's also rarely
	# rw so we can't zero it out, hence we need to keep track of the initial counter. For
	# details see kernel documentation (powercap.rst).
	ts=$(utc)
	for socket in "$rapl"/intel-rapl:*; do
		[[ -e $socket ]] || continue

		socket_idx=${socket#*:} socket_name=$(< "$socket/name")
		# Adjust for different domains, see linux/intel_rapl.h
		case "$socket_name" in
			dram | core | uncore) _socket_idx=${socket_idx//:/_} socket_idx=${socket_idx%:*} ;;
			package-*) _socket_idx=$socket_idx socket_name=socket ;;
			psys*) _socket_idx=$socket_idx socket_name=platform ;;
			# Unknown zone name - still derive its own index, otherwise the one left
			# over from the previous zone would be reused and the counters mixed up.
			*) _socket_idx=${socket_idx//:/_} ;;
		esac

		local -n socket_uj=socket_${_socket_idx}_uj
		socket_uj+=("$(< "$socket/energy_uj")")
		# We need at least two readings for comparison
		((${#socket_uj[@]} > 1)) || continue

		# Convert to Watts - use bc since $interval can be an actual float
		reading=$(calc "(${socket_uj[-1]} - ${socket_uj[-2]}) / 1000000 / $interval")
		if [[ -z $reading || $reading == "-"* ]]; then
			# Somehow this may happen, probably when the counter wraps over. Consider
			# this as a faulty reading and don't include it since it may impact overall
			# avg. Only this zone is skipped, the remaining ones are still read.
			printf '(%s) CPU %s %s reading: error(%s) (interval: %ss)\n' \
				"$ts" \
				"$socket_name" \
				"$socket_idx" \
				"$reading" \
				"$interval" >&2
			continue
		fi

		# Same nameref approach as for the counters above, no eval needed
		local -n socket_readings=_socket${_socket_idx}_readings
		socket_readings+=("$reading")
		power_readings["$socket_name-$socket_idx"]="_socket${_socket_idx}_readings[@]"

		printf '(%s) CPU %s %s reading: %s Watts (interval: %ss)\n' \
			"$ts" \
			"$socket_name" \
			"$socket_idx" \
			"$reading" \
			"$interval" >&2
	done
}
375
get_now_reading() {
	# Take a reading through whichever BMC interface power_support selected.
	# Nothing happens when $support names neither dcmi nor sdr.
	if [[ $support == dcmi ]]; then
		get_dcmi_now_reading
	elif [[ $support == sdr ]]; then
		get_sdr_now_reading
	fi
}
383
dump_readings() {
	# Save the readings collected so far, two files per sensor:
	#   <output_dir>/[<prefix>_]avg_<sensor>.bmc.pm.txt - the average
	#   <output_dir>/[<prefix>_]all_<sensor>.bmc.pm.txt - every reading plus their count
	#
	# Reads:   power_readings (sensor name -> "array[@]" reference), output_dir, prefix
	# Returns: 1 when nothing has been collected
	local sensor ref readings avg sum_expr file_tag avg_file all_file

	((${#power_readings[@]} > 0)) || return 1
	printf 'Dumping average sensors reading from %s\n' "${!power_readings[*]}" >&2

	for sensor in "${!power_readings[@]}"; do
		ref=${power_readings["$sensor"]}
		readings=("${!ref}")
		if ((${#readings[@]} == 0)); then
			printf 'No readings available for %s sensor\n' "$sensor" >&2
			continue
		fi

		# Hand the whole sum to bc at once rather than forking it per reading
		printf -v sum_expr '%s + ' "${readings[@]}"
		avg=$(calc "(${sum_expr% + }) / ${#readings[@]}")

		readings+=("Total: ${#readings[@]}")

		# Sensor names end up in file names - neither whitespace nor "/" may stay
		file_tag=${sensor//[[:space:]]/_}
		file_tag=${file_tag//\//_}
		avg_file=$output_dir/${prefix:+${prefix}_}avg_${file_tag}.bmc.pm.txt
		all_file=$output_dir/${prefix:+${prefix}_}all_${file_tag}.bmc.pm.txt

		printf '%s\n' "$avg" > "$avg_file"
		printf '%s\n' "${readings[@]}" > "$all_file"
		printf 'Dumped avg to %s\n' "$avg_file" >&2
		printf 'Dumped all to %s\n' "$all_file" >&2
	done
}
410
utc() {
	# Print a timestamp in UTC: the current time, or the time described by $1
	# (seconds since the epoch) when it is given.
	if [[ -n $1 ]]; then
		date --utc -d "@$1"
	else
		date --utc
	fi
}
414
cleanup() {
	# Drop the SDR cache unless the user asked to keep it, then save whatever
	# readings were collected.
	if [[ $remove_sdr_cache == yes && -f $sdr_cache ]]; then
		rm "$sdr_cache"
	fi

	dump_readings
}
419
collect_readings() {
	# Main loop: take a reading, then sleep for $interval, $count times over.
	# A count of 0 (or less) keeps the loop running until the script is stopped.
	local remaining=$count

	# We need at least two readings to get a meaningful data
	if ((remaining == 1 && cpu_support)); then
		remaining=2
	fi

	while ((count <= 0 || remaining-- > 0)); do
		get_now_reading
		if ((cpu_support)); then
			get_cpu_socket_reading
		fi
		sleep "${interval}s"
	done
}
432
help() {
	# Print the usage message on stdout. The option list must stay in sync with
	# the getopts string: -l and -x are plain switches, they take no argument.
	cat << HELP

Usage: $0 [-h] [-d dir] [-i sdr|dcmi] [-s SENSOR_NAME] [-t interval] [-x] [-l] [-p prefix] [-c count] [-r]

  -h - Print this message.
  -d - Directory where the results should be saved. Default is /tmp.
  -i - Type of interface to use for requesting power usage. "sdr" or "dcmi".
       If not set, available interface is used ("dcmi" has priority).
  -t - How long to wait before each get power command in seconds. In case
       this value matches one of supported averaging time periods special
       variant of the command will be used to obtain the reading - this
       variant is used only with the "dcmi" interface. Default is 1s.
  -s - In case "sdr" interface is in use, try to read data from SENSOR_NAME.
  -x - In case "sdr" interface is in use, don't remove SDR cache. This can
       speed up subsequent runs of the script.
  -l - Save output of the script to a log file (dir/${0##*/}.bmc.pm.log).
  -p - Add prefix to saved files.
  -c - Read power usage count times. 0 is the default and it means to run
       indefinitely.
  -r - Include readings from CPU sockets (RAPL-dependent)

When started, ${0##*/} will enter loop to continuously read power usage from either
DCMI interface or dedicated Watts sensors every interval. Each reading will be
logged to stderr. Upon termination, average power usage will be dumped to /tmp or
directory set by -d.

HELP
}
462
# Defaults, see help() for the meaning of each option
output_dir=/tmp
interval=1
remove_sdr_cache=yes
log_to_file=no
prefix=""
count=0
include_cpu=0
interface=""

declare -A power_readings=()
declare -a extra_power_sensors=()

while getopts :hi:s:d:t:xlp:c:r arg; do
	case "$arg" in
		h)
			help
			exit 0
			;;
		d) output_dir=$OPTARG ;;
		s) extra_power_sensors+=("$OPTARG") ;;
		i) interface=${OPTARG,,} ;;
		t) interval=$OPTARG ;;
		x) remove_sdr_cache=no ;;
		l) log_to_file=yes ;;
		p) prefix=$OPTARG ;;
		c) count=$OPTARG ;;
		r) include_cpu=1 ;;
		# Problems with options were always tolerated - keep it that way but
		# at least tell the user what is being ignored.
		:) printf 'Option -%s requires an argument, ignoring\n' "$OPTARG" >&2 ;;
		*) printf 'Unknown option -%s, ignoring\n' "$OPTARG" >&2 ;;
	esac
done

# Both values end up in arithmetic contexts, sleep and bc - refuse anything
# that is not a plain number before it gets evaluated there.
if [[ ! $count =~ ^-?[0-9]+$ ]]; then
	printf 'Invalid count (%s), expected an integer\n' "$count" >&2
	exit 1
fi

if [[ ! $interval =~ ^([0-9]+([.][0-9]*)?|[.][0-9]+)$ ]]; then
	printf 'Invalid interval (%s), expected a number of seconds\n' "$interval" >&2
	exit 1
fi

if [[ -n $interface && $interface != dcmi && $interface != sdr ]]; then
	printf 'Unknown interface (%s), falling back to auto-detection\n' "$interface" >&2
fi

# Checked only after the options are parsed so that -h works without root
is_root

declare -r sdr_cache=$output_dir/sdr.cache
declare -r log_file=${prefix:+${prefix}_}${0##*/}.bmc.pm.log

mkdir -p "$output_dir"
if [[ $log_to_file == yes ]]; then
	printf 'Redirecting to %s\n' "$output_dir/$log_file" >&2
	exec > "$output_dir/$log_file" 2>&1
fi

# Readings are dumped (and the SDR cache removed) whenever the script exits
trap 'cleanup' EXIT

ipmi_supported
power_support

collect_readings
510