#!/usr/bin/env bash
#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2022 Intel Corporation
#  All rights reserved.

# Periodically reads system power usage from the local BMC (via the DCMI Power
# Management commands or, as a fallback, via Watts sensors described in the SDR)
# and, optionally, per-domain CPU power derived from the RAPL energy counters.
# Each reading is logged to stderr; on exit per-sensor averages and the full list
# of readings are dumped to files under $PM_OUTPUTDIR. See help() for the options.
#
# NOTE(review): save_pm_pid, rm_pm_pid, retag and the TEST_TAG variable are not
# defined in this file - presumably they come from the sourced "common" file.

set -e

pmdir=$(readlink -f "$(dirname "$0")")
rootdir=$(readlink -f "$pmdir/../../../")
source "$pmdir/common"

# Print each argument as a 0x-prefixed, at least two digits wide hex number, one per line.
hex() { printf '0x%02x\n' "$@"; }

# Evaluate an arithmetic expression with bc, keeping two digits after the decimal point.
calc() { bc <<< "scale=2; $*"; }

# Return 1 (with a message on stderr) unless the script is run by root.
is_root() {
	# Talking to local BMC device requires root privileges
	if ((UID)); then
		printf '%s, you need to be root to run this script\n' "$USER" >&2
		return 1
	fi
}

# Print the path to the ipmitool binary on stdout; return 1 if it's not in PATH.
is_ipmitool() {
	if ! type -P ipmitool; then
		printf 'ipmitool not detected, cannot run commands against the BMC\n' >&2
		return 1
	fi
}

# Best-effort load of the ipmi kernel modules; never fails.
ipmi_load() {
	# Silently attempt to load core ipmi drivers - we will pick up the device later on.
	modprobe -qa ipmi_si ipmi_devintf ipmi_msghandler || return 0
}

# Detect the BMC, print its details to stderr and locate ipmitool.
# Globals written: man_id prod_id dev_id ipmi_ver platform board ipmitool (and type).
# Returns non-zero if there is no BMC, the interface is not KCS or ipmitool is missing.
ipmi_supported() {
	# Verify if kernel detected and registered at least one BMC under
	# the ipmi platform. Look for KCS specifically as this is the type
	# of the interface the script was tested against.

	local ipmi=/sys/class/ipmi/ipmi0

	# Keep these details global for easy access if needed.
	local -g man_id prod_id dev_id ipmi_ver platform board ipmitool

	ipmi_load

	if [[ ! -e $ipmi ]]; then
		printf 'BMC not detected. Please, make sure your platform is IPMI-compatible\n'
		return 1
	fi >&2

	type=$(< "$ipmi/device/type")

	if [[ $type != kcs ]]; then
		printf 'No supported BMC interface detected (%s) - only KCS is supported\n' "$type"
		return 1
	fi >&2

	man_id=$(< "$ipmi/device/bmc/manufacturer_id")
	prod_id=$(< "$ipmi/device/bmc/product_id")
	dev_id=$(hex "$(< "$ipmi/device/bmc/device_id")")
	ipmi_ver=$(< "$ipmi/device/bmc/ipmi_version")

	if [[ -e /sys/class/dmi/id/board_vendor ]]; then
		platform=$(< /sys/class/dmi/id/board_vendor)
	fi

	if [[ -e /sys/class/dmi/id/board_name ]]; then
		board=$(< /sys/class/dmi/id/board_name)
	fi

	# Keep output similar to ipmi_si's
	cat <<- BMC_DEV >&2

		BMC detected, details below:
		Manufacturer ID: $man_id
		Product ID: $prod_id
		Device ID: $dev_id
		IPMI Version: $ipmi_ver
		Platform: ${platform:-unknown}
		Board: ${board:-unknown}

	BMC_DEV

	# Verify if we have proper tools to work with
	ipmitool=$(is_ipmitool)
}

# Send a raw IPMI request ("$@" is passed to "ipmitool raw") and print the response
# bytes as 0x-prefixed hex numbers, one per line.
# Returns 1 if ipmitool fails; prints nothing if the response carries no data.
ipmiraw() {
	# For the majority of commands we use raw payload to not depend on specific ipmitool version
	# and the way how it interprets/parses the returned data. This also allows us to inspect the
	# integrity of data more closely to make sure we don't report nonsensical values to the user.

	local rsp

	# The status of a bare assignment is the status of its command substitution, so this
	# propagates ipmitool's failure to callers which test "if ! rsp=($(dcmiraw ...))".
	rsp=($("$ipmitool" raw "$@" 2> /dev/null)) || return 1

	# hex() without arguments would still print a single, bogus 0x00 byte
	((${#rsp[@]} > 0)) || return 0

	# Slap hex prefix to work with proper base
	rsp=("${rsp[@]/#/0x}")

	hex "${rsp[@]}"
}

# Send a DCMI request: $1 is the DCMI command, remaining arguments are its data bytes.
# 0x2c is the group extension netfn and 0xdc the DCMI group extension identification.
dcmiraw() {
	local cmd=$1 data=("${@:2}")

	ipmiraw 0x2c "$cmd" 0xdc "${data[@]}"
}

# Decode the Enhanced System Power Statistics attributes (global enhanced_power_attr)
# and print the averaging time periods supported by the BMC to stderr.
# Globals written:
#   available_time_periods            - human readable periods ("NOW", "5 seconds", ...)
#   available_time_periods_in_seconds - raw period byte, indexed by the period in seconds
print_dcmi_available_time_periods() {
	local time_periods=${enhanced_power_attr[4]}
	local -g available_time_periods=()
	local -g available_time_periods_in_seconds=()

	available_time_periods[0]="NOW"

	if ((time_periods > 0)); then
		local time_idx=5
		local offset=$time_idx
		local units unit unit_idx time time_s units_mask=0xc0 to_sec

		units[0x0]=seconds
		units[0x1]=minutes
		units[0x2]=hours
		units[0x3]=days

		to_sec[0x0]=1
		to_sec[0x1]=60
		to_sec[0x2]=3600
		to_sec[0x3]=86400

		while ((offset < time_idx + time_periods)); do
			# Two top bits select the unit, the remaining bits hold the duration
			unit_idx=$((enhanced_power_attr[offset] >> 6))
			time=$((enhanced_power_attr[offset] & ~units_mask))
			unit=${units[unit_idx]:-unknown}
			time_s=$((time * to_sec[unit_idx]))
			if ((time != 0)); then
				available_time_periods[offset]="$time $unit"
				available_time_periods_in_seconds[time_s]=${enhanced_power_attr[offset]}
			fi
			((++offset))
		done
	fi
	cat <<- TIME_PERIODS >&2

		Available averaging time periods to request:
		$(printf ' - %s\n' "${available_time_periods[@]}")

	TIME_PERIODS
}

# Check if the BMC provides the DCMI Power Management capability and, if available,
# fetch the list of supported averaging time periods.
# Globals written: enhanced_power_attr (plus those set by print_dcmi_available_time_periods).
# Returns 1 if the capability cannot be determined or is not provided.
dcmi_power_support() {
	# Verify if the BMC conforms to the DCMI spec
	local rsp

	# Table 6-2, Get DCMI Capabilities Command Format
	if ! rsp=($(dcmiraw 0x1 0x1)); then
		printf 'Cannot determine if BMC supports DCMI Power Management capability\n' >&2
		return 1
	fi

	# Table 6-3, DCMI Capabilities Parameters:
	#  - Supported DCMI Capabilities:
	#    - Byte 2 Platform capabilities: [0] Power management
	if ((!(rsp[5] & (1 << 0)))); then
		printf 'BMC does not provide DCMI Power Management capability\n' >&2
		return 1
	fi

	# Check if BMC provides Enhanced System Power Statistics attributes - this allows to issue
	# requests for power readings at averaging time period, e.g. from last 5 seconds, 30 minutes,
	# 1 hour and so on. With this we can provide more detailed view on power usage within a
	# specific period of time. Without it, we need to depend only on current reading that should
	# be always available (the "NOW" reading).

	local -g enhanced_power_attr=()

	# Table 6-3, DCMI Capabilities Parameters:
	#  - Enhanced System Power Statistics attributes
	if enhanced_power_attr=($(dcmiraw 0x1 0x5)); then
		print_dcmi_available_time_periods
	fi

	printf 'Using DCMI Power Management\n' >&2
}

# Build the list of SDR sensors to read power usage from.
# Globals read: sdr_cache, extra_power_sensors. Globals written: power_sensors.
# Returns 1 if no suitable sensor was found.
sdr_power_support() {
	# This is a fallback which only some platforms may provide (confirmed PowerEdge and CYP).
	# We are looking for a full, threshold sensor which reports overall power usage in Watts.
	# Different BMCs may have SDRs which describe such sensor(s) differently so this is not
	# 100% reliable. To make sure we pick up a proper sensor we also narrow it down to a
	# specific entity (System Board or Power Supply). Readings from the sensor should be
	# considered as "NOW" readings (without access to min, max readings).

	local -g power_sensors=()
	local sensor entity unit status

	# Cache SDR to speed up sensor readings
	if [[ ! -f $sdr_cache ]]; then
		printf 'Saving SDR cache at %s\n' "$sdr_cache" >&2
		"$ipmitool" sdr dump "$sdr_cache" > /dev/null
	fi

	# Sensors requested by the user (-s) always go first
	if ((${#extra_power_sensors[@]} > 0)); then
		power_sensors+=("${extra_power_sensors[@]}")
	fi

	while IFS="," read -r sensor _ unit status _ entity _; do
		[[ $unit == Watts && $status == ok ]] || continue
		[[ $entity == "System Board" || $entity == "Power Supply" ]] || continue
		power_sensors+=("$sensor")
	done < <("$ipmitool" -S "$sdr_cache" -vc sdr list full 2>&1)

	if ((${#power_sensors[@]} > 0)); then
		printf 'Using SDR (Power sensors: %s)\n' "${power_sensors[*]}"
	else
		printf 'Cannot locate power sensors\n'
		return 1
	fi >&2
}

# Select the interface used for system-wide readings and check for RAPL support.
# Globals read: interface, include_cpu. Globals written: support, cpu_support.
# Returns 1 if neither the BMC nor the CPU can provide any readings.
power_support() {
	local -g support cpu_support=0

	if ((include_cpu == 1)) && rapl_supported; then
		cpu_support=1
	fi

	if [[ $interface == dcmi || $interface == sdr ]]; then
		# override
		"${interface}_power_support"
		support=$interface
	elif dcmi_power_support; then
		support=dcmi
	elif sdr_power_support; then
		support=sdr
	else
		printf 'BMC does not provide Power Management support, cannot gather system-wide power measurements\n' >&2
		if ((cpu_support)); then
			printf 'Only CPU measurements will be provided\n' >&2
			return 0
		fi
		return 1
	fi
}

# Issue the DCMI Get Power Reading command, log the result to stderr and record it.
# Globals read: interval, available_time_periods_in_seconds, TEST_TAG.
# Globals written: _DCMI, _DCMI_min, _DCMI_max, _DCMI_avg, power_readings.
get_dcmi_now_reading() {
	local rsp reading=0 max min avg ts timeframe mode=01h
	local get_cmd get_avg=0 print

	# Table 6-16, Get Power Reading Command:
	get_cmd=(0x2 0x1 0x0 0x0)

	if [[ $interval =~ ^[0-9]+$ && -n ${available_time_periods_in_seconds[interval]} ]]; then
		get_cmd=(0x2 0x2 "${available_time_periods_in_seconds[interval]}" 0x0)
		get_avg=1
		mode=02h
	fi

	# We use System Power Statistics mode to get the "NOW" reading by default. In case
	# interval matches one supported by Enhanced System Power Statistics we use that
	# mode to obtain extra min, max, avg statistics.

	if ! rsp=($(dcmiraw "${get_cmd[@]}")); then
		printf 'DCMI reading: error\n'
	else
		# Note that the BMC timestamp depends on the hwclock setup which we then attempt
		# to represent in UTC.
		ts=$((rsp[12] << 24 | rsp[11] << 16 | rsp[10] << 8 | rsp[9]))
		# This is interpreted differently by different BMCs so for now we make a note of
		# it but don't present it to the user.
		timeframe=$((rsp[16] << 24 | rsp[15] << 16 | rsp[14] << 8 | rsp[13]))
		reading=$((rsp[2] << 8 | rsp[1]))
		if ((get_avg == 1)); then
			min=$((rsp[4] << 8 | rsp[3]))
			max=$((rsp[6] << 8 | rsp[5]))
			avg=$((rsp[8] << 8 | rsp[7]))
			_DCMI_min+=("$min")
			_DCMI_max+=("$max")
			_DCMI_avg+=("$avg")
			power_readings["DCMI_MIN"]="_DCMI_min[@]"
			power_readings["DCMI_MAX"]="_DCMI_max[@]"
			power_readings["DCMI_AVG"]="_DCMI_avg[@]"
		fi
		_DCMI+=("$reading")
		power_readings["DCMI"]="_DCMI[@]"

		# min, max and avg are set only in the averaging mode
		for print in min max avg reading; do
			[[ -n ${!print} ]] || continue
			printf '(%s) DCMI %s (mode: %s): %u Watts (interval: %ss, test: %s)\n' \
				"$(utc "$ts")" \
				"$print" \
				"$mode" \
				"${!print}" \
				"$interval" \
				"$TEST_TAG" >&2
		done
	fi >&2
}

# Read every sensor listed in power_sensors, log the result to stderr and record it.
# Globals read: power_sensors, sdr_cache, ipmitool, interval, TEST_TAG.
# Globals written: _sensor<N>_readings, power_readings.
# Returns 1 if there are no sensors to read.
get_sdr_now_reading() {
	local sensor reading=0 ts unit
	local number_re='^-?[0-9]+([.][0-9]+)?$'

	if ((${#power_sensors[@]} == 0)); then
		printf 'No power sensors were provided\n' >&2
		return 1
	fi

	for sensor in "${!power_sensors[@]}"; do
		ts=$(utc)
		if ! IFS="," read -r _ reading unit _; then
			reading=error
		elif [[ ! $reading =~ $number_re ]]; then
			# Don't record anything which is not a number, it would break the
			# average calculation later on.
			reading="error(${reading:-no reading})"
		else
			# Use nameref instead of eval to not execute the sensor's output
			local -n sensor_readings=_sensor${sensor}_readings
			sensor_readings+=("$reading")
			power_readings["${power_sensors[sensor]}"]="_sensor${sensor}_readings[@]"
			reading+=" $unit"
		fi < <("$ipmitool" -c -S "$sdr_cache" sdr get "${power_sensors[sensor]}") 2> /dev/null
		printf '(%s) Sensor %s reading: %s (interval: %ss, test: %s)\n' \
			"$ts" \
			"${power_sensors[sensor]}" \
			"$reading" \
			"$interval" \
			"$TEST_TAG" >&2
	done
}

# Check if the kernel exposes the intel-rapl powercap control type.
rapl_supported() {
	[[ -e /sys/class/powercap/intel-rapl ]]
}

# Sample the energy counter of every intel-rapl domain and, starting from the second
# sample, log and record the power used since the previous one.
# Globals read: interval, TEST_TAG.
# Globals written: socket_<idx>_uj, _socket<idx>_readings, power_readings.
get_cpu_socket_reading() {
	local rapl=/sys/class/powercap
	local socket socket_idx _socket_idx socket_name
	local ts reading

	# power_uw is usually not available so we need to rely on energy_uj. It's also rarely
	# rw so we can't zero it out, hence we need to keep track of the initial counter. For
	# details see kernel documentation (powercap.rst).
	ts=$(utc)
	for socket in "$rapl"/intel-rapl:*; do
		[[ -e $socket ]] || continue

		socket_idx=${socket#*:} socket_name=$(< "$socket/name")
		# Adjust for different domains, see linux/intel_rapl.h
		case "$socket_name" in
			dram | core | uncore) _socket_idx=${socket_idx//:/_} socket_idx=${socket_idx%:*} ;;
			package-*) _socket_idx=$socket_idx socket_name=socket ;;
			psys*) _socket_idx=$socket_idx socket_name=platform ;;
			# Make sure unknown domains don't reuse the index of the previous one
			*) _socket_idx=${socket_idx//:/_} ;;
		esac

		local -n socket_uj=socket_${_socket_idx}_uj
		socket_uj+=("$(< "$socket/energy_uj")")
		# We need at least two readings for comparison
		((${#socket_uj[@]} > 1)) || continue

		# Convert to Watts - use bc since $interval can be an actual float
		reading=$(calc "(${socket_uj[-1]} - ${socket_uj[-2]}) / 1000000 / $interval")
		if [[ -z $reading || $reading == "-"* ]]; then
			# Somehow this may happen, probably when the counter wraps over. Consider
			# this as a faulty reading and don't include it since it may impact overall
			# avg. Keep going through the remaining domains though - skipping them
			# would make their next reading span two intervals.
			printf '(%s) CPU %s %s reading: error(%s) (interval: %ss, test: %s)\n' \
				"$ts" \
				"$socket_name" \
				"$socket_idx" \
				"$reading" \
				"$interval" \
				"$TEST_TAG" >&2
			continue
		fi
		local -n socket_readings=_socket${_socket_idx}_readings
		socket_readings+=("$reading")
		power_readings["$socket_name-$socket_idx"]="_socket${_socket_idx}_readings[@]"

		printf '(%s) CPU %s %s reading: %s Watts (interval: %ss, test: %s)\n' \
			"$ts" \
			"$socket_name" \
			"$socket_idx" \
			"$reading" \
			"$interval" \
			"$TEST_TAG" >&2
	done
}

# Take a system-wide reading through the interface selected by power_support().
get_now_reading() {
	case "$support" in
		dcmi) get_dcmi_now_reading ;;
		sdr) get_sdr_now_reading ;;
		*) ;;
	esac
}

# For every sensor in power_readings write the average and the list of all readings
# to $PM_OUTPUTDIR/[prefix_]{avg,all}_<sensor>.bmc.pm.txt.
# Returns 1 if nothing was collected.
dump_readings() {
	local sensor reading readings avg total
	local avg_file all_file

	((${#power_readings[@]} > 0)) || return 1
	printf 'Dumping average sensors reading from %s\n' "${!power_readings[*]}" >&2

	for sensor in "${!power_readings[@]}"; do
		# Each value is a "name[@]" reference to the array holding the actual readings
		readings=("${!power_readings["$sensor"]}")
		if ((${#readings[@]} == 0)); then
			printf 'No readings available for %s sensor\n' "$sensor" >&2
			continue
		fi
		total=0
		for reading in "${readings[@]}"; do
			total=$(calc "$total + $reading")
		done
		avg=$(calc "$total / ${#readings[@]}")

		readings+=("Total: ${#readings[@]}")
		# Sensor name becomes part of a file name, get rid of whitespaces and slashes
		sensor=${sensor//[[:space:]]/_}
		sensor=${sensor//\//_}
		avg_file=$PM_OUTPUTDIR/${prefix:+${prefix}_}avg_${sensor}.bmc.pm.txt
		all_file=$PM_OUTPUTDIR/${prefix:+${prefix}_}all_${sensor}.bmc.pm.txt
		printf '%s\n' "$avg" > "$avg_file"
		printf '%s\n' "${readings[@]}" > "$all_file"
		printf 'Dumped avg to %s\n' "$avg_file" >&2
		printf 'Dumped all to %s\n' "$all_file" >&2
	done
}

# Print the current date in UTC or, if $1 is set, the date of that epoch timestamp.
utc() {
	date --utc ${1:+-"d@$1"}
}

# EXIT handler: drop the pid file, optionally remove the SDR cache and dump the readings.
cleanup() {
	rm_pm_pid
	if [[ -f $sdr_cache && $remove_sdr_cache == yes ]]; then
		rm "$sdr_cache"
	fi
	dump_readings
}

# Main loop: take a reading every $interval seconds, $count times (forever if count <= 0).
collect_readings() {
	local _count=$count
	if ((_count == 1 && cpu_support)); then
		# We need at least two readings to get a meaningful data
		((_count += 1))
	fi
	while ((count <= 0 ? 1 : _count--)); do
		get_now_reading
		((cpu_support)) && get_cpu_socket_reading
		sleep "${interval}s"
	done
}

# Print usage to stdout.
help() {
	cat <<- HELP

		Usage: $0 [-h] [-d dir] [-i sdr|dcmi] [-s SENSOR_NAME] [-t interval] [-x] [-l] [-p prefix] [-c count] [-r]

		-h - Print this message.
		-d - Directory where the results should be saved. Default is /tmp.
		-i - Type of interface to use for requesting power usage. "sdr" or "dcmi".
		     If not set, available interface is used ("dcmi" has priority).
		-t - How long to wait before each get power command in seconds. In case
		     this value matches one of supported averaging time periods special
		     variant of the command will be used to obtain the reading - this
		     variant is used only with the "dcmi" interface. Default is 1s.
		-s - In case "sdr" interface is in use, try to read data from SENSOR_NAME.
		-x - In case "sdr" interface is in use, don't remove SDR cache. This can
		     speed up subsequent runs of the script.
		-l - Save output of the script to a log file (dir/${0##*/}.bmc.pm.log).
		-p - Add prefix to saved files.
		-c - Read power usage count times. 0 is the default and it means to run
		     indefinitely.
		-r - Include readings from CPU sockets (RAPL-dependent)

		When started, ${0##*/} will enter loop to continuously read power usage from either
		DCMI interface or dedicated Watts sensors every interval. Each reading will be
		logged to stderr. Upon termination, average power usage will be dumped to /tmp or
		directory set by -d.

	HELP
}

is_root

interval=1
remove_sdr_cache=yes
log_to_file=no
prefix=""
count=0
include_cpu=0

declare -A power_readings=()
declare -a extra_power_sensors=()

while getopts :hi:s:d:t:xlp:c:r arg; do
	case "$arg" in
		h)
			help
			exit 0
			;;
		d) PM_OUTPUTDIR=$OPTARG ;;
		s) extra_power_sensors+=("$OPTARG") ;;
		i) interface=${OPTARG,,} ;;
		t) interval=$OPTARG ;;
		x) remove_sdr_cache=no ;;
		l) log_to_file=yes ;;
		p) prefix=$OPTARG ;;
		c) count=$OPTARG ;;
		r) include_cpu=1 ;;
		# Bogus options are still ignored but at least make a note of them
		:) printf 'Option -%s requires an argument, ignoring\n' "$OPTARG" >&2 ;;
		*) printf 'Unknown option -%s, ignoring\n' "$OPTARG" >&2 ;;
	esac
done

# Both values end up in arithmetic/bc expressions so make sure they are plain numbers
if [[ ! $count =~ ^-?[0-9]+$ ]]; then
	printf 'Invalid count (%s), it must be an integer\n' "$count" >&2
	exit 1
fi

if [[ ! $interval =~ ^([0-9]+([.][0-9]*)?|[.][0-9]+)$ ]]; then
	printf 'Invalid interval (%s), it must be a number of seconds\n' "$interval" >&2
	exit 1
fi

# Documented default; this is a no-op if the directory was already set (either
# via -d or, presumably, by the sourced common file).
PM_OUTPUTDIR=${PM_OUTPUTDIR:-/tmp}

declare -r sdr_cache=$PM_OUTPUTDIR/sdr.cache
declare -r log_file=${prefix:+${prefix}_}${0##*/}.bmc.pm.log

mkdir -p "$PM_OUTPUTDIR"
if [[ $log_to_file == yes ]]; then
	printf 'Redirecting to %s\n' "$PM_OUTPUTDIR/$log_file" >&2
	exec > "$PM_OUTPUTDIR/$log_file" 2>&1
fi

save_pm_pid
trap 'cleanup' EXIT
trap 'retag' USR1

ipmi_supported
power_support

collect_readings