#!/usr/bin/env bash
#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2022 Intel Corporation
#  All rights reserved.
#
# Periodically reads power usage from the local BMC (through DCMI or through
# Watts sensors described in the SDR) and, optionally, from RAPL energy counters
# exposed under /sys/class/powercap. Every reading is logged to stderr; on exit
# the collected readings and their average are dumped to files under the output
# directory. See help() for the supported options.

set -e

# Print every argument as a zero-padded, 0x-prefixed hex byte, one per line.
hex() { printf '0x%02x\n' "$@"; }

# Return 1 (with a message on stderr) unless the script runs with UID 0.
is_root() {
	# Talking to local BMC device requires root privileges
	if ((UID)); then
		printf '%s, you need to be root to run this script\n' "$USER" >&2
		return 1
	fi

}

# Print the path of the ipmitool binary on stdout, or return 1 (with a message
# on stderr) when it cannot be found in PATH.
is_ipmitool() {
	if ! type -P ipmitool; then
		printf 'ipmitool not detected, cannot run commands against the BMC\n' >&2
		return 1
	fi
}

# Best-effort load of the ipmi kernel modules; never fails.
ipmi_load() {
	# Silently attempt to load core ipmi drivers - we will pick up the device later on.
	modprobe -qa ipmi_si ipmi_devintf ipmi_msghandler || return 0
}

# Check that the kernel registered a KCS BMC as ipmi0, print its details to
# stderr and locate ipmitool.
# Globals written: man_id prod_id dev_id ipmi_ver platform board ipmitool
# Returns: non-zero when no BMC is present, the interface is not KCS or
#          ipmitool is missing.
ipmi_supported() {
	# Verify if kernel detected and registered at least one BMC under
	# the ipmi platform. Look for KCS specifically as this is the type
	# of the interface the script was tested against.

	local ipmi=/sys/class/ipmi/ipmi0
	local type

	# Keep these details global for easy access if needed.
	local -g man_id prod_id dev_id ipmi_ver platform board ipmitool

	ipmi_load

	if [[ ! -e $ipmi ]]; then
		printf 'BMC not detected. Please, make sure your platform is IPMI-compatible\n'
		return 1
	fi >&2

	type=$(< "$ipmi/device/type")

	if [[ $type != kcs ]]; then
		printf 'No supported BMC interface detected (%s) - only KCS is supported\n' "$type"
		return 1
	fi >&2

	man_id=$(< "$ipmi/device/bmc/manufacturer_id")
	prod_id=$(< "$ipmi/device/bmc/product_id")
	dev_id=$(hex "$(< "$ipmi/device/bmc/device_id")")
	ipmi_ver=$(< "$ipmi/device/bmc/ipmi_version")

	if [[ -e /sys/class/dmi/id/board_vendor ]]; then
		platform=$(< /sys/class/dmi/id/board_vendor)
	fi

	if [[ -e /sys/class/dmi/id/board_name ]]; then
		board=$(< /sys/class/dmi/id/board_name)
	fi

	# Keep output similar to ipmi_si's
	cat <<- BMC_DEV >&2

		BMC detected, details below:
		Manufacturer ID: $man_id
		Product ID: $prod_id
		Device ID: $dev_id
		IPMI Version: $ipmi_ver
		Platform: ${platform:-unknown}
		Board: ${board:-unknown}

	BMC_DEV

	# Verify if we have proper tools to work with
	ipmitool=$(is_ipmitool)
}

# Send a raw IPMI request ("$@" is passed to `ipmitool raw`) and print the
# response bytes, one 0x-prefixed byte per line.
# Returns: ipmitool's exit status when the request fails (nothing is printed
#          in that case), otherwise the status of the final printf.
ipmiraw() {
	# For the majority of commands we use raw payload to not depend on specific ipmitool version
	# and the way how it interprets/parses the returned data. This also allows us to inspect the
	# integrity of data more closely to make sure we don't report nonsensical values to the user.

	local out rsp

	# Propagate the failure instead of falling through to hex(), which would
	# otherwise report success together with a made-up "0x00" response.
	out=$("$ipmitool" raw "$@" 2> /dev/null) || return

	# Word splitting is intended here: the output is a list of hex bytes
	# separated by blanks/newlines.
	rsp=($out)
	# Slap hex prefix to work with proper base
	rsp=("${rsp[@]/#/0x}")

	hex "${rsp[@]}"
}

# Send a DCMI request: $1 is the command, remaining arguments are the payload
# which follows the 0xdc group extension byte.
dcmiraw() {
	local cmd=$1 data=("${@:2}")

	ipmiraw 0x2c "$cmd" 0xdc "${data[@]}"
}

# Decode the averaging time periods found in the global enhanced_power_attr
# array and print them to stderr.
# Globals read:    enhanced_power_attr
# Globals written: available_time_periods (human readable, "NOW" at index 0),
#                  available_time_periods_in_seconds (indexed by the period in
#                  seconds, value is the raw byte to send to the BMC)
print_dcmi_available_time_periods() {
	local time_periods=${enhanced_power_attr[4]}
	local -g available_time_periods=()
	local -g available_time_periods_in_seconds=()

	available_time_periods[0]="NOW"

	if ((time_periods > 0)); then
		local time_idx=5
		local offset=$time_idx
		local units unit unit_idx time time_s units_mask=0xc0 to_sec

		units[0x0]=seconds
		units[0x1]=minutes
		units[0x2]=hours
		units[0x3]=days

		to_sec[0x0]=1
		to_sec[0x1]=60
		to_sec[0x2]=3600
		to_sec[0x3]=86400

		while ((offset < time_idx + time_periods)); do
			# The two top bits select the unit, the remaining ones hold the duration
			unit_idx=$((enhanced_power_attr[offset] >> 6))
			time=$((enhanced_power_attr[offset] & ~units_mask))
			unit=${units[unit_idx]:-unknown}
			time_s=$((time * to_sec[unit_idx]))
			if ((time != 0)); then
				available_time_periods[offset]="$time $unit"
				available_time_periods_in_seconds[time_s]=${enhanced_power_attr[offset]}
			fi
			offset=$((offset + 1))
		done
	fi
	cat <<- TIME_PERIODS >&2

		Available averaging time periods to request:
		$(printf ' - %s\n' "${available_time_periods[@]}")

	TIME_PERIODS
}

# Check if the BMC reports the DCMI Power Management capability and, when
# available, fetch the Enhanced System Power Statistics attributes.
# Globals written: enhanced_power_attr (plus the ones written by
#                  print_dcmi_available_time_periods)
# Returns: 1 when the capability cannot be read or is not reported.
dcmi_power_support() {
	# Verify if the BMC conforms to the DCMI spec
	local rsp

	# Table 6-2, Get DCMI Capabilities Command Format
	if ! rsp=($(dcmiraw 0x1 0x1)); then
		printf 'Cannot determine if BMC supports DCMI Power Management capability\n' >&2
		return 1
	fi

	# Table 6-3, DCMI Capabilities Parameters:
	#  - Supported DCMI Capabilities:
	#    - Byte 2 Platform capabilities: [0] Power management
	if ((!(rsp[5] & (1 << 0)))); then
		printf 'BMC does not provide DCMI Power Management capability\n' >&2
		return 1
	fi

	# Check if BMC provides Enhanced System Power Statistics attributes - this allows to issue
	# requests for power readings at averaging time period, e.g. from last 5 seconds, 30 minutes,
	# 1 hour and so on. With this we can provide more detailed view on power usage within a
	# specific period of time. Without it, we need to depend only on current reading that should
	# be always available (the "NOW" reading).

	local -g enhanced_power_attr=()

	# Table 6-3, DCMI Capabilities Parameters:
	#  - Enhanced System Power Statistics attributes
	if enhanced_power_attr=($(dcmiraw 0x1 0x5)); then
		print_dcmi_available_time_periods
	fi

	printf 'Using DCMI Power Management\n' >&2
}

# Build the list of Watts sensors to poll from the SDR (plus the ones
# requested with -s).
# Globals read:    sdr_cache ipmitool extra_power_sensors
# Globals written: power_sensors
# Returns: 1 when no power sensor could be found.
sdr_power_support() {
	# This is a fallback which only some platforms may provide (confirmed PowerEdge and CYP).
	# We are looking for a full, threshold sensor which reports overall power usage in Watts.
	# Different BMCs may have SDRs which describe such sensor(s) differently so this is not
	# 100% reliable. To make sure we pick up a proper sensor we also narrow it down to a
	# specific entity (System Board or Power Supply). Readings from the sensor should be
	# considered as "NOW" readings (without access to min, max readings).

	local -g power_sensors=()
	local sensor entity unit status

	# Cache SDR to speed up sensor readings
	if [[ ! -f $sdr_cache ]]; then
		printf 'Saving SDR cache at %s\n' "$sdr_cache" >&2
		"$ipmitool" sdr dump "$sdr_cache" > /dev/null
	fi

	if ((${#extra_power_sensors[@]} > 0)); then
		power_sensors+=("${extra_power_sensors[@]}")
	fi

	while IFS="," read -r sensor _ unit status _ entity _; do
		[[ $unit == Watts && $status == ok ]] || continue
		[[ $entity == "System Board" || $entity == "Power Supply" ]] || continue
		power_sensors+=("$sensor")
	done < <("$ipmitool" -S "$sdr_cache" -vc sdr list full 2>&1)

	if ((${#power_sensors[@]} > 0)); then
		printf 'Using SDR (Power sensors: %s)\n' "${power_sensors[*]}"
	else
		printf 'Cannot locate power sensors\n'
		return 1
	fi >&2
}

# Select the source of system-wide power readings (the interface requested
# with -i, otherwise DCMI first and SDR second) and check for RAPL when CPU
# readings were requested.
# Globals written: support cpu_support
# Returns: 1 when neither the BMC nor RAPL can provide readings.
power_support() {
	local -g support cpu_support=0

	if ((include_cpu == 1)) && rapl_supported; then
		cpu_support=1
	fi

	if [[ $interface == dcmi || $interface == sdr ]]; then
		# override
		"${interface}_power_support"
		support=$interface
	elif dcmi_power_support; then
		support=dcmi
	elif sdr_power_support; then
		support=sdr
	else
		printf 'BMC does not provide Power Management support, cannot gather system-wide power measurements\n' >&2
		if ((cpu_support)); then
			printf 'Only CPU measurements will be provided\n' >&2
			return 0
		fi
		return 1
	fi
}

# Issue a DCMI Get Power Reading request, record the result in power_readings
# and log it to stderr. When $interval matches one of the averaging periods
# advertised by the BMC, min/max/avg statistics are recorded as well.
get_dcmi_now_reading() {
	local rsp reading=0 max min avg ts timeframe mode=01h
	local get_cmd get_avg=0 print

	# Table 6-16, Get Power Reading Command:
	get_cmd=(0x2 0x1 0x0 0x0)

	if [[ $interval =~ ^[0-9]+$ && -n ${available_time_periods_in_seconds[interval]} ]]; then
		get_cmd=(0x2 0x2 "${available_time_periods_in_seconds[interval]}" 0x0)
		get_avg=1
		mode=02h
	fi

	# We use System Power Statistics mode to get the "NOW" reading by default. In case
	# interval matches one supported by Enhanced System Power Statistics we use that
	# mode to obtain extra min, max, avg statistics.

	if ! rsp=($(dcmiraw "${get_cmd[@]}")); then
		printf 'DCMI reading: error\n'
	else
		# Note that the BMC timestamp depends on the hwclock setup which we then attempt
		# to represent in UTC.
		ts=$((rsp[12] << 24 | rsp[11] << 16 | rsp[10] << 8 | rsp[9]))
		# This is interpreted differently by different BMCs so for now we make a note of
		# it but don't present it to the user.
		timeframe=$((rsp[16] << 24 | rsp[15] << 16 | rsp[14] << 8 | rsp[13]))
		reading=$((rsp[2] << 8 | rsp[1]))
		if ((get_avg == 1)); then
			min=$((rsp[4] << 8 | rsp[3]))
			max=$((rsp[6] << 8 | rsp[5]))
			avg=$((rsp[8] << 8 | rsp[7]))
			_DCMI_min+=("$min")
			_DCMI_max+=("$max")
			_DCMI_avg+=("$avg")
			power_readings["DCMI_MIN"]="_DCMI_min[@]"
			power_readings["DCMI_MAX"]="_DCMI_max[@]"
			power_readings["DCMI_AVG"]="_DCMI_avg[@]"
		fi
		_DCMI+=("$reading")
		power_readings["DCMI"]="_DCMI[@]"

		# min, max and avg are set only in the averaging mode
		for print in min max avg reading; do
			[[ -n ${!print} ]] || continue
			printf '(%s) DCMI %s (mode: %s): %u Watts (interval: %ss)\n' \
				"$(utc "$ts")" \
				"$print" \
				"$mode" \
				"${!print}" \
				"$interval"
		done
	fi >&2
}

# Read the current value of every sensor listed in power_sensors, record it in
# power_readings and log it to stderr.
# Returns: 1 when power_sensors is empty.
get_sdr_now_reading() {
	local sensor reading=0 ts unit
	local number_re='^-?[0-9]+([.][0-9]+)?$'

	if ((${#power_sensors[@]} == 0)); then
		printf 'No power sensors were provided\n' >&2
		return 1
	fi

	for sensor in "${!power_sensors[@]}"; do
		ts=$(utc)
		# The redirection of stderr has to sit inside the process substitution to
		# actually silence ipmitool. Only numeric readings are recorded so a
		# missing/odd value never ends up in the dumped statistics.
		if ! IFS="," read -r _ reading unit _ \
			< <("$ipmitool" -c -S "$sdr_cache" sdr get "${power_sensors[sensor]}" 2> /dev/null) \
			|| [[ ! $reading =~ $number_re ]]; then
			reading=error
		else
			# Readings are kept in per-sensor global arrays, power_readings points at them
			local -n sensor_readings=_sensor${sensor}_readings
			sensor_readings+=("$reading")
			power_readings["${power_sensors[sensor]}"]="_sensor${sensor}_readings[@]"
			reading+=" $unit"
		fi
		printf '(%s) Sensor %s reading: %s (interval %ss)\n' \
			"$ts" \
			"${power_sensors[sensor]}" \
			"$reading" \
			"$interval" >&2
	done
}

# Succeeds when the intel-rapl powercap control type is present.
rapl_supported() {
	[[ -e /sys/class/powercap/intel-rapl ]]
}

# Sample energy_uj of every intel-rapl zone and, once two samples of a zone are
# available, convert the difference to Watts over $interval, record it in
# power_readings and log it to stderr.
get_cpu_socket_reading() {
	local rapl=/sys/class/powercap
	local socket socket_idx _socket_idx socket_name
	local ts reading delta

	# power_uw is usually not available so we need to rely on energy_uj. It's also rarely
	# rw so we can't zero it out, hence we need to keep track of the initial counter. For
	# details see kernel documentation (powercap.rst).
	ts=$(utc)
	for socket in "$rapl"/intel-rapl:*; do
		[[ -e $socket ]] || continue

		socket_idx=${socket#*:} socket_name=$(< "$socket/name")
		# Adjust for different domains, see linux/intel_rapl.h
		case "$socket_name" in
			dram | core | uncore) _socket_idx=${socket_idx//:/_} socket_idx=${socket_idx%:*} ;;
			package-*) _socket_idx=$socket_idx socket_name=socket ;;
			psys*) _socket_idx=$socket_idx socket_name=platform ;;
			# Unknown domain - still derive a valid, unique variable suffix instead
			# of reusing the one left over from the previous zone.
			*) _socket_idx=${socket_idx//:/_} ;;
		esac

		local -n socket_uj=socket_${_socket_idx}_uj
		socket_uj+=("$(< "$socket/energy_uj")")
		# We need at least two readings for comparison
		((${#socket_uj[@]} > 1)) || continue

		delta=$((${socket_uj[-1]} - ${socket_uj[-2]}))
		# The counter wraps around at max_energy_range_uj
		if ((delta < 0)) && [[ -r $socket/max_energy_range_uj ]]; then
			delta=$((delta + $(< "$socket/max_energy_range_uj")))
		fi

		# Convert to Watts - use bc since $interval can be an actual float
		reading=$(bc <<< "scale=2; $delta / 1000000 / $interval")
		local -n socket_readings=_socket${_socket_idx}_readings
		if [[ -n $reading ]]; then
			socket_readings+=("$reading")
		fi
		power_readings["$socket_name-$socket_idx"]="_socket${_socket_idx}_readings[@]"

		printf '(%s) CPU %s %s reading: %s Watts (interval: %ss)\n' \
			"$ts" \
			"$socket_name" \
			"$socket_idx" \
			"$reading" \
			"$interval" >&2
	done
}

# Take one system-wide reading using the interface selected by power_support().
get_now_reading() {
	case "$support" in
		dcmi) get_dcmi_now_reading ;;
		sdr) get_sdr_now_reading ;;
		*) ;;
	esac
}

# For every entry of power_readings write two files under $output_dir: one
# with all the readings (followed by their count) and one with their average,
# computed on the integer part of each reading.
# Returns: 1 when nothing was collected.
dump_readings() {
	local sensor reading readings avg total whole

	((${#power_readings[@]} > 0)) || return 1
	printf 'Dumping average sensors reading from %s\n' "${!power_readings[*]}" >&2

	for sensor in "${!power_readings[@]}"; do
		readings=("${!power_readings["$sensor"]}")
		if ((${#readings[@]} == 0)); then
			printf 'No readings available for %s sensor\n' "$sensor" >&2
			continue
		fi
		total=0
		for reading in "${readings[@]}"; do
			# bc prints values below 1 without the leading zero (".53") so the
			# integer part may come out empty - count it as 0. A plain assignment
			# is used since ((total += 0)) returns non-zero and trips set -e.
			whole=${reading%.*}
			[[ $whole =~ ^-?[0-9]+$ ]] || whole=0
			total=$((total + whole))
		done
		avg=$((total / ${#readings[@]}))

		readings+=("Total: ${#readings[@]}")
		printf '%u\n' "$avg" > "$output_dir/${prefix:+${prefix}_}avg_${sensor}.bmc.pm.txt"
		printf '%s\n' "${readings[@]}" > "$output_dir/${prefix:+${prefix}_}all_${sensor}.bmc.pm.txt"
		printf 'Dumped avg to %s\n' "$output_dir/${prefix:+${prefix}_}avg_${sensor}.bmc.pm.txt" >&2
		printf 'Dumped all to %s\n' "$output_dir/${prefix:+${prefix}_}all_${sensor}.bmc.pm.txt" >&2
	done
}

# Print the current date in UTC or, when $1 is given, the date matching that
# number of seconds since the epoch.
utc() {
	local date_args=(--utc)

	if [[ -n $1 ]]; then
		date_args+=("-d@$1")
	fi
	date "${date_args[@]}"
}

# EXIT handler: remove the SDR cache (unless -x was given) and dump readings.
cleanup() {
	if [[ -f $sdr_cache && $remove_sdr_cache == yes ]]; then
		# Failing to remove the cache must not prevent us from dumping the readings
		rm -f -- "$sdr_cache" || true
	fi
	dump_readings
}

# Main loop: take a reading every $interval seconds, $count times (forever
# when count <= 0).
collect_readings() {
	local _count=$count
	if ((_count == 1 && cpu_support)); then
		# We need at least two readings to get a meaningful data
		((_count += 1))
	fi
	while ((count <= 0 ? 1 : _count--)); do
		get_now_reading
		((cpu_support)) && get_cpu_socket_reading
		sleep "${interval}s"
	done
}

# Print usage on stdout.
help() {
	cat <<- HELP

		Usage: $0 [-h] [-d dir] [-i sdr|dcmi] [-s SENSOR_NAME] [-t interval] [-l] [-x] [-p prefix] [-c count] [-r]

		  -h - Print this message.
		  -d - Directory where the results should be saved. Default is /tmp.
		  -i - Type of interface to use for requesting power usage. "sdr" or "dcmi".
		       If not set, available interface is used ("dcmi" has priority).
		  -t - How long to wait before each get power command in seconds. In case
		       this value matches one of supported averaging time periods special
		       variant of the command will be used to obtain the reading - this
		       variant is used only with the "dcmi" interface. Default is 1s.
		  -s - In case "sdr" interface is in use, try to read data from SENSOR_NAME.
		  -x - In case "sdr" interface is in use, don't remove SDR cache. This can
		       speed up subsequent runs of the script.
		  -l - Save output of the script to a log file (dir/${0##*/}.bmc.pm.log).
		  -p - Add prefix to saved files.
		  -c - Read power usage count times. 0 is the default and it means to run
		       indefinitely.
		  -r - Include readings from CPU sockets (RAPL-dependent)

		When started, ${0##*/} will enter loop to continuously read power usage from either
		DCMI interface or dedicated Watts sensors every interval. Each reading will be
		logged to stderr. Upon termination, average power usage will be dumped to /tmp or
		directory set by -d.

	HELP
}

output_dir=/tmp
interval=1
remove_sdr_cache=yes
log_to_file=no
prefix=""
count=0
include_cpu=0

# Maps a sensor name to the "name[@]" of the global array holding its readings
declare -A power_readings=()
declare -a extra_power_sensors=()

while getopts :hi:s:d:t:xlp:c:r arg; do
	case "$arg" in
		h)
			help
			exit 0
			;;
		d) output_dir=$OPTARG ;;
		s) extra_power_sensors+=("$OPTARG") ;;
		i) interface=${OPTARG,,} ;;
		t) interval=$OPTARG ;;
		x) remove_sdr_cache=no ;;
		l) log_to_file=yes ;;
		p) prefix=$OPTARG ;;
		c) count=$OPTARG ;;
		r) include_cpu=1 ;;
		# Unknown options and missing arguments are silently ignored
		*) ;;
	esac
done

# Checked only after the options got parsed so -h works for a regular user
is_root

declare -r sdr_cache=$output_dir/sdr.cache
declare -r log_file=${prefix:+${prefix}_}${0##*/}.bmc.pm.log

mkdir -p "$output_dir"
if [[ $log_to_file == yes ]]; then
	printf 'Redirecting to %s\n' "$output_dir/$log_file" >&2
	exec > "$output_dir/$log_file" 2>&1
fi

trap 'cleanup' EXIT

ipmi_supported
power_support

collect_readings