#!/usr/bin/env bash
#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2022 Intel Corporation
#  All rights reserved.

# Periodically read power usage from the local BMC - either through the DCMI
# "Get Power Reading" command or through Watts sensors described in the SDR -
# and, optionally, from the CPU RAPL powercap domains. Every reading is logged
# to stderr; on exit the per-source averages and the raw samples are written
# to the output directory. See help() for the supported options.

set -e

# Print each argument as a 0x-prefixed, at least two digit hex number, one per line.
hex() { printf '0x%02x\n' "$@"; }

# Evaluate the arguments as a bc expression with two decimal places.
calc() { bc <<< "scale=2; $*"; }

# Succeed only when running as root.
is_root() {
	# Talking to local BMC device requires root privileges
	if ((UID)); then
		printf '%s, you need to be root to run this script\n' "$USER" >&2
		return 1
	fi
}

# Print the path of ipmitool on stdout, fail with a message when it's missing.
is_ipmitool() {
	if ! type -P ipmitool; then
		printf 'ipmitool not detected, cannot run commands against the BMC\n' >&2
		return 1
	fi
}

# Best-effort load of the ipmi kernel modules, never fails.
ipmi_load() {
	# Silently attempt to load core ipmi drivers - we will pick up the device later on.
	modprobe -qa ipmi_si ipmi_devintf ipmi_msghandler || return 0
}

# Verify that a KCS BMC is registered, print its details and locate ipmitool.
# Globals written: man_id prod_id dev_id ipmi_ver platform board ipmitool
# Returns non-zero when no supported BMC or no ipmitool is available.
ipmi_supported() {
	# Verify if kernel detected and registered at least one BMC under
	# the ipmi platform. Look for KCS specifically as this is the type
	# of the interface the script was tested against.

	local ipmi=/sys/class/ipmi/ipmi0
	local bmc=$ipmi/device/bmc
	local dmi=/sys/class/dmi/id
	local iface_type

	# Keep these details global for easy access if needed.
	declare -g man_id prod_id dev_id ipmi_ver platform board ipmitool

	ipmi_load

	if [[ ! -e $ipmi ]]; then
		printf 'BMC not detected. Please, make sure your platform is IPMI-compatible\n' >&2
		return 1
	fi

	iface_type=$(< "$ipmi/device/type")

	if [[ $iface_type != kcs ]]; then
		printf 'No supported BMC interface detected (%s) - only KCS is supported\n' "$iface_type" >&2
		return 1
	fi

	man_id=$(< "$bmc/manufacturer_id")
	prod_id=$(< "$bmc/product_id")
	dev_id=$(hex "$(< "$bmc/device_id")")
	ipmi_ver=$(< "$bmc/ipmi_version")

	if [[ -e $dmi/board_vendor ]]; then
		platform=$(< "$dmi/board_vendor")
	fi

	if [[ -e $dmi/board_name ]]; then
		board=$(< "$dmi/board_name")
	fi

	# Keep output similar to ipmi_si's
	cat <<- BMC_DEV >&2

		BMC detected, details below:
		Manufacturer ID: $man_id
		Product ID: $prod_id
		Device ID: $dev_id
		IPMI Version: $ipmi_ver
		Platform: ${platform:-unknown}
		Board: ${board:-unknown}

	BMC_DEV

	# Verify if we have proper tools to work with
	ipmitool=$(is_ipmitool)
}

# Send a raw IPMI request ("$@" is passed to `ipmitool raw`) and print the
# response bytes, one 0x-prefixed byte per line.
# Returns non-zero when ipmitool fails so callers can tell a failed request
# from a valid response. An empty response prints nothing.
ipmiraw() {
	# For the majority of commands we use raw payload to not depend on specific ipmitool version
	# and the way how it interprets/parses the returned data. This also allows us to inspect the
	# integrity of data more closely to make sure we don't report nonsensical values to the user.

	local out rsp

	# Capture into a scalar first - this keeps ipmitool's exit status visible.
	out=$("$ipmitool" raw "$@" 2> /dev/null) || return 1

	# The response is a whitespace separated list of hex bytes, splitting is intended.
	# shellcheck disable=SC2206
	rsp=($out)

	# hex() without arguments would still print a bogus "0x00" byte.
	((${#rsp[@]} > 0)) || return 0

	# Slap hex prefix to work with proper base
	rsp=("${rsp[@]/#/0x}")

	hex "${rsp[@]}"
}

# Send a DCMI request: $1 is the DCMI command, the rest is its payload.
dcmiraw() {
	local cmd=$1 data=("${@:2}")

	ipmiraw 0x2c "$cmd" 0xdc "${data[@]}"
}

# Decode the averaging time periods from enhanced_power_attr and print them.
# Globals read:    enhanced_power_attr
# Globals written: available_time_periods (human readable list),
#                  available_time_periods_in_seconds (seconds -> raw period byte)
print_dcmi_available_time_periods() {
	local time_periods=${enhanced_power_attr[4]}
	declare -g available_time_periods=()
	declare -g available_time_periods_in_seconds=()

	available_time_periods[0]="NOW"

	if ((time_periods > 0)); then
		local time_idx=5
		local offset unit_idx unit period period_s units_mask=0xc0
		# Indexed by the two top bits of the period byte.
		local -a units=(seconds minutes hours days)
		local -a to_sec=(1 60 3600 86400)

		for ((offset = time_idx; offset < time_idx + time_periods; offset++)); do
			unit_idx=$((enhanced_power_attr[offset] >> 6))
			period=$((enhanced_power_attr[offset] & ~units_mask))
			unit=${units[unit_idx]:-unknown}
			period_s=$((period * to_sec[unit_idx]))
			if ((period != 0)); then
				available_time_periods[offset]="$period $unit"
				available_time_periods_in_seconds[period_s]=${enhanced_power_attr[offset]}
			fi
		done
	fi

	cat <<- TIME_PERIODS >&2

		Available averaging time periods to request:
		$(printf ' - %s\n' "${available_time_periods[@]}")

	TIME_PERIODS
}

# Check if the BMC provides the DCMI Power Management capability.
# Globals written: enhanced_power_attr (plus whatever
#                  print_dcmi_available_time_periods sets)
# Returns non-zero when the capability is missing or can't be queried.
dcmi_power_support() {
	# Verify if the BMC conforms to the DCMI spec
	local rsp

	# Table 6-2, Get DCMI Capabilities Command Format
	if ! rsp=($(dcmiraw 0x1 0x1)); then
		printf 'Cannot determine if BMC supports DCMI Power Management capability\n' >&2
		return 1
	fi

	# Table 6-3, DCMI Capabilities Parameters:
	#  - Supported DCMI Capabilities:
	#    - Byte 2 Platform capabilities: [0] Power management
	if (((rsp[5] & 0x1) == 0)); then
		printf 'BMC does not provide DCMI Power Management capability\n' >&2
		return 1
	fi

	# Check if BMC provides Enhanced System Power Statistics attributes - this allows to issue
	# requests for power readings at averaging time period, e.g. from last 5 seconds, 30 minutes,
	# 1 hour and so on. With this we can provide more detailed view on power usage within a
	# specific period of time. Without it, we need to depend only on current reading that should
	# be always available (the "NOW" reading).

	declare -g enhanced_power_attr=()

	# Table 6-3, DCMI Capabilities Parameters:
	#  - Enhanced System Power Statistics attributes
	if enhanced_power_attr=($(dcmiraw 0x1 0x5)); then
		print_dcmi_available_time_periods
	fi

	printf 'Using DCMI Power Management\n' >&2
}

# Locate power sensors (Watts) in the SDR.
# Globals read:    sdr_cache extra_power_sensors ipmitool
# Globals written: power_sensors
# Returns non-zero when no sensor was found.
sdr_power_support() {
	# This is a fallback which only some platforms may provide (confirmed PowerEdge and CYP).
	# We are looking for a full, threshold sensor which reports overall power usage in Watts.
	# Different BMCs may have SDRs which describe such sensor(s) differently so this is not
	# 100% reliable. To make sure we pick up a proper sensor we also narrow it down to a
	# specific entity (System Board or Power Supply). Readings from the sensor should be
	# considered as "NOW" readings (without access to min, max readings).

	declare -g power_sensors=()
	local sensor entity unit status

	# Cache SDR to speed up sensor readings
	if [[ ! -f $sdr_cache ]]; then
		printf 'Saving SDR cache at %s\n' "$sdr_cache" >&2
		"$ipmitool" sdr dump "$sdr_cache" > /dev/null
	fi

	# Sensors requested explicitly by the user always go first.
	if ((${#extra_power_sensors[@]} > 0)); then
		power_sensors+=("${extra_power_sensors[@]}")
	fi

	while IFS="," read -r sensor _ unit status _ entity _; do
		[[ $unit == Watts && $status == ok ]] || continue
		[[ $entity == "System Board" || $entity == "Power Supply" ]] || continue
		power_sensors+=("$sensor")
	done < <("$ipmitool" -S "$sdr_cache" -vc sdr list full 2>&1)

	if ((${#power_sensors[@]} == 0)); then
		printf 'Cannot locate power sensors\n' >&2
		return 1
	fi

	printf 'Using SDR (Power sensors: %s)\n' "${power_sensors[*]}" >&2
}

# Select the source of system-wide readings.
# Globals read:    interface include_cpu
# Globals written: support (dcmi, sdr or empty), cpu_support (0 or 1)
# Returns non-zero when there is nothing to read from at all.
power_support() {
	declare -g support cpu_support=0

	if ((include_cpu == 1)) && rapl_supported; then
		cpu_support=1
	fi

	if [[ $interface == dcmi || $interface == sdr ]]; then
		# override
		"${interface}_power_support"
		support=$interface
	elif dcmi_power_support; then
		support=dcmi
	elif sdr_power_support; then
		support=sdr
	else
		printf 'BMC does not provide Power Management support, cannot gather system-wide power measurements\n' >&2
		if ((cpu_support)); then
			printf 'Only CPU measurements will be provided\n' >&2
			return 0
		fi
		return 1
	fi
}

# Issue DCMI Get Power Reading, log the result and record it.
# Globals read:    interval available_time_periods_in_seconds
# Globals written: power_readings, _DCMI, _DCMI_min, _DCMI_max, _DCMI_avg
get_dcmi_now_reading() {
	local rsp reading=0 max min avg ts timeframe mode=01h
	local get_cmd get_avg=0 print

	# Table 6-16, Get Power Reading Command:
	get_cmd=(0x2 0x1 0x0 0x0)

	if [[ $interval =~ ^[0-9]+$ && -n ${available_time_periods_in_seconds[interval]} ]]; then
		get_cmd=(0x2 0x2 "${available_time_periods_in_seconds[interval]}" 0x0)
		get_avg=1
		mode=02h
	fi

	# We use System Power Statistics mode to get the "NOW" reading by default. In case
	# interval matches one supported by Enhanced System Power Statistics we use that
	# mode to obtain extra min, max, avg statistics.

	if ! rsp=($(dcmiraw "${get_cmd[@]}")); then
		printf 'DCMI reading: error\n'
	else
		# Note that the BMC timestamp depends on the hwclock setup which we then attempt
		# to represent in UTC.
		ts=$((rsp[12] << 24 | rsp[11] << 16 | rsp[10] << 8 | rsp[9]))
		# This is interpreted differently by different BMCs so for now we make a note of
		# it but don't present it to the user.
		timeframe=$((rsp[16] << 24 | rsp[15] << 16 | rsp[14] << 8 | rsp[13]))
		reading=$((rsp[2] << 8 | rsp[1]))
		if ((get_avg == 1)); then
			min=$((rsp[4] << 8 | rsp[3]))
			max=$((rsp[6] << 8 | rsp[5]))
			avg=$((rsp[8] << 8 | rsp[7]))
			_DCMI_min+=("$min")
			_DCMI_max+=("$max")
			_DCMI_avg+=("$avg")
			power_readings["DCMI_MIN"]="_DCMI_min[@]"
			power_readings["DCMI_MAX"]="_DCMI_max[@]"
			power_readings["DCMI_AVG"]="_DCMI_avg[@]"
		fi
		_DCMI+=("$reading")
		power_readings["DCMI"]="_DCMI[@]"

		# min, max and avg stay unset (and are skipped) in the plain "NOW" mode.
		for print in min max avg reading; do
			[[ -n ${!print} ]] || continue
			printf '(%s) DCMI %s (mode: %s): %u Watts (interval: %ss)\n' \
				"$(utc "$ts")" \
				"$print" \
				"$mode" \
				"${!print}" \
				"$interval"
		done
	fi >&2
}

# Read every sensor from power_sensors, log the result and record it.
# Globals read:    power_sensors sdr_cache ipmitool interval
# Globals written: power_readings, _sensor<N>_readings
# Returns non-zero when there are no sensors to read from.
get_sdr_now_reading() {
	local sensor reading=0 ts unit
	local numeric='^-?[0-9]+([.][0-9]+)?$'

	if ((${#power_sensors[@]} == 0)); then
		printf 'No power sensors were provided\n' >&2
		return 1
	fi

	for sensor in "${!power_sensors[@]}"; do
		ts=$(utc)
		# Anything which is not a plain number can't be averaged later on, so treat
		# it the same way as a failed read.
		if ! IFS="," read -r _ reading unit _ || [[ ! $reading =~ $numeric ]]; then
			reading=error
		else
			local -n sensor_readings=_sensor${sensor}_readings
			sensor_readings+=("$reading")
			power_readings["${power_sensors[sensor]}"]="_sensor${sensor}_readings[@]"
			reading+=" $unit"
		fi < <("$ipmitool" -c -S "$sdr_cache" sdr get "${power_sensors[sensor]}") 2> /dev/null
		printf '(%s) Sensor %s reading: %s (interval %ss)\n' \
			"$ts" \
			"${power_sensors[sensor]}" \
			"$reading" \
			"$interval" >&2
	done
}

# Succeed when the intel-rapl powercap control type is present.
rapl_supported() {
	[[ -e /sys/class/powercap/intel-rapl ]]
}

# Sample energy counters of all RAPL domains and turn the delta since the
# previous call into Watts. The first call only primes the counters.
# Globals read:    interval
# Globals written: power_readings, socket_<idx>_uj, _socket<idx>_readings
get_cpu_socket_reading() {
	local rapl=/sys/class/powercap
	local socket socket_idx _socket_idx socket_name
	local ts reading

	# power_uw is usually not available so we need to rely on energy_uj. It's also rarely
	# rw so we can't zero it out, hence we need to keep track of the initial counter. For
	# details see kernel documentation (powercap.rst).
	ts=$(utc)
	for socket in "$rapl"/intel-rapl:*; do
		[[ -e $socket ]] || continue

		socket_idx=${socket#*:} socket_name=$(< "$socket/name")
		# Adjust for different domains, see linux/intel_rapl.h
		case "$socket_name" in
			dram | core | uncore) _socket_idx=${socket_idx//:/_} socket_idx=${socket_idx%:*} ;;
			package-*) _socket_idx=$socket_idx socket_name=socket ;;
			psys*) _socket_idx=$socket_idx socket_name=platform ;;
			# Unknown domain - still give it its own counter instead of reusing
			# the index of whatever domain was handled before it.
			*) _socket_idx=${socket_idx//:/_} ;;
		esac

		local -n socket_uj=socket_${_socket_idx}_uj
		socket_uj+=("$(< "$socket/energy_uj")")
		# We need at least two readings for comparison
		((${#socket_uj[@]} > 1)) || continue

		# Convert to Watts - use bc since $interval can be an actual float
		reading=$(calc "(${socket_uj[-1]} - ${socket_uj[-2]}) / 1000000 / $interval")
		if [[ -z $reading || $reading == "-"* ]]; then
			# Somehow this may happen, probably when the counter wraps over. Consider
			# this as a faulty reading and don't include it since it may impact overall
			# avg. Keep going though - the remaining domains still need to be sampled
			# in this round, otherwise their next delta would span two intervals.
			printf '(%s) CPU %s %s reading: error(%s) (interval: %ss)\n' \
				"$ts" \
				"$socket_name" \
				"$socket_idx" \
				"$reading" \
				"$interval" >&2
			continue
		fi

		local -n socket_readings=_socket${_socket_idx}_readings
		socket_readings+=("$reading")
		power_readings["$socket_name-$socket_idx"]="_socket${_socket_idx}_readings[@]"

		printf '(%s) CPU %s %s reading: %s Watts (interval: %ss)\n' \
			"$ts" \
			"$socket_name" \
			"$socket_idx" \
			"$reading" \
			"$interval" >&2
	done
}

# Take a single system-wide reading using the selected interface (if any).
get_now_reading() {
	case "$support" in
		dcmi) get_dcmi_now_reading ;;
		sdr) get_sdr_now_reading ;;
		*) ;;
	esac
}

# Write the average and all the samples of every source to the output dir.
# Globals read: power_readings output_dir prefix
# Returns non-zero when nothing was recorded.
dump_readings() {
	local sensor reading readings ref avg total
	local avg_file all_file

	((${#power_readings[@]} > 0)) || return 1
	printf 'Dumping average sensors reading from %s\n' "${!power_readings[*]}" >&2

	for sensor in "${!power_readings[@]}"; do
		# Each entry holds "array_name[@]", expand it indirectly.
		ref=${power_readings["$sensor"]}
		readings=("${!ref}")
		if ((${#readings[@]} == 0)); then
			printf 'No readings available for %s sensor\n' "$sensor" >&2
			continue
		fi

		total=0
		for reading in "${readings[@]}"; do
			total=$(calc "$total + $reading")
		done
		avg=$(calc "$total / ${#readings[@]}")

		readings+=("Total: ${#readings[@]}")
		sensor="${sensor//[[:space:]]/_}"
		avg_file=$output_dir/${prefix:+${prefix}_}avg_${sensor}.bmc.pm.txt
		all_file=$output_dir/${prefix:+${prefix}_}all_${sensor}.bmc.pm.txt

		printf '%s\n' "$avg" > "$avg_file"
		printf '%s\n' "${readings[@]}" > "$all_file"
		printf 'Dumped avg to %s\n' "$avg_file" >&2
		printf 'Dumped all to %s\n' "$all_file" >&2
	done
}

# Print the current date in UTC or, when $1 is given, the date of that epoch.
utc() {
	if [[ -n $1 ]]; then
		date --utc -d "@$1"
	else
		date --utc
	fi
}

# EXIT handler: drop the SDR cache (unless -x was used) and dump the results.
cleanup() {
	if [[ -f $sdr_cache && $remove_sdr_cache == yes ]]; then
		rm "$sdr_cache"
	fi
	dump_readings
}

# Main loop: take a reading every $interval seconds, $count times or forever
# when count is not positive.
collect_readings() {
	local _count=$count

	if ((_count == 1 && cpu_support)); then
		# We need at least two readings to get a meaningful data
		((_count += 1))
	fi

	while ((count <= 0 ? 1 : _count--)); do
		get_now_reading
		if ((cpu_support)); then
			get_cpu_socket_reading
		fi
		sleep "${interval}s"
	done
}

# Print usage on stdout.
help() {
	cat <<- HELP

		Usage: $0 [-h] [-d dir] [-i sdr|dcmi] [-s SENSOR_NAME] [-t interval] [-x] [-l] [-p prefix] [-c count] [-r]

		  -h - Print this message.
		  -d - Directory where the results should be saved. Default is /tmp.
		  -i - Type of interface to use for requesting power usage. "sdr" or "dcmi".
		       If not set, available interface is used ("dcmi" has priority).
		  -t - How long to wait before each get power command in seconds. In case
		       this value matches one of supported averaging time periods special
		       variant of the command will be used to obtain the reading - this
		       variant is used only with the "dcmi" interface. Default is 1s.
		  -s - In case "sdr" interface is in use, try to read data from SENSOR_NAME.
		  -x - In case "sdr" interface is in use, don't remove SDR cache. This can
		       speed up subsequent runs of the script.
		  -l - Save output of the script to a log file (dir/${0##*/}.bmc.pm.log).
		  -p - Add prefix to saved files.
		  -c - Read power usage count times. 0 is the default and it means to run
		       indefinitely.
		  -r - Include readings from CPU sockets (RAPL-dependent)

		When started, ${0##*/} will enter loop to continuously read power usage from either
		DCMI interface or dedicated Watts sensors every interval. Each reading will be
		logged to stderr. Upon termination, average power usage will be dumped to /tmp or
		directory set by -d.

	HELP
}

output_dir=/tmp
interval=1
remove_sdr_cache=yes
log_to_file=no
prefix=""
count=0
include_cpu=0
interface=""

declare -A power_readings=()
declare -a extra_power_sensors=()

while getopts :hi:s:d:t:xlp:c:r arg; do
	case "$arg" in
		h)
			help
			exit 0
			;;
		d) output_dir=$OPTARG ;;
		s) extra_power_sensors+=("$OPTARG") ;;
		i) interface=${OPTARG,,} ;;
		t) interval=$OPTARG ;;
		x) remove_sdr_cache=no ;;
		l) log_to_file=yes ;;
		p) prefix=$OPTARG ;;
		c) count=$OPTARG ;;
		r) include_cpu=1 ;;
		*) ;;
	esac
done

# Checked only after parsing the options so that -h works for everyone.
is_root

declare -r sdr_cache=$output_dir/sdr.cache
declare -r log_file=${prefix:+${prefix}_}${0##*/}.bmc.pm.log

mkdir -p "$output_dir"
if [[ $log_to_file == yes ]]; then
	printf 'Redirecting to %s\n' "$output_dir/$log_file" >&2
	exec > "$output_dir/$log_file" 2>&1
fi

trap 'cleanup' EXIT

ipmi_supported
power_support

collect_readings