#  SPDX-License-Identifier: BSD-3-Clause
#  Copyright (C) 2020 Intel Corporation
#  All rights reserved.
#

# Helpers shared by the scheduler tests: cpu topology mapping via sysfs,
# cpufreq inspection/configuration, /proc/stat sampling and thin wrappers
# around the scheduler RPC plugin. This file is meant to be sourced; it
# expects $rootdir (and, for map_cpufreq(), $testdir) to be set by the caller.

shopt -s nullglob extglob

declare -r sysfs_system=/sys/devices/system
declare -r sysfs_cpu=$sysfs_system/cpu
declare -r sysfs_node=$sysfs_system/node

declare -r scheduler=$rootdir/test/event/scheduler/scheduler
declare -r plugin=scheduler_plugin

source "$rootdir/test/scheduler/cgroups.sh"

# Store each remaining argument N in the array named by $1 at index N,
# i.e. array[N]=N. This de-duplicates the list and allows O(1) membership
# checks through [[ -v array[N] ]].
#   $1    - name of the target array
#   $2... - numeric elements
fold_list_onto_array() {
	local array=$1
	local elem

	shift || return 0

	for elem; do
		# The subscript is evaluated arithmetically by bash; the value
		# expansion is deferred to eval so it is never re-parsed as code.
		eval "${array}[elem]=\$elem"
	done
}

# Join all arguments with "," and print the result.
fold_array_onto_string() {
	local cpus=("$@")

	local IFS=","
	echo "${cpus[*]}"
}

# Expand a kernel-style cpu list ("0-2,4,6-9") read from the file $1 into
# one cpu index per line, in ascending order. Prints nothing for an empty list.
parse_cpu_list() {
	local list=$1
	local elem elems start end
	local -a cpus=()

	# 0-2,4,6-9, etc.
	IFS="," read -ra elems < "$list"

	((${#elems[@]} > 0)) || return 0

	for elem in "${elems[@]}"; do
		if [[ $elem == *-* ]]; then
			start=${elem%-*} end=${elem#*-}
			while ((start <= end)); do
				cpus[start++]=$start
			done
		else
			cpus[elem]=$elem
		fi
	done
	# Only the indexes matter - they are unique and sorted.
	printf '%u\n' "${!cpus[@]}"
}

# Map all cpus of numa node $1. Populates the global maps set up by
# map_cpus() plus the per-node/per-core arrays:
#   node_<N>_cpu                    - cpus of the node
#   node_<N>_core_<C>               - online cpus sharing core_id C
#   node_<N>_core_<C>_thread_<CPU>  - thread siblings of an online cpu
map_cpus_node() {
	local node_idx=$1
	local -n _cpu_node_map=node_${node_idx}_cpu
	local cpu_idx core_idx

	for cpu_idx in $(parse_cpu_list "$sysfs_node/node$node_idx/cpulist"); do
		# The topology directory is read only for online cpus.
		if is_cpu_online "$cpu_idx"; then
			core_idx=$(< "$sysfs_cpu/cpu$cpu_idx/topology/core_id")
			local -n _cpu_core_map=node_${node_idx}_core_${core_idx}
			_cpu_core_map+=("$cpu_idx") cpu_core_map[cpu_idx]=$core_idx
			local -n _cpu_siblings=node_${node_idx}_core_${core_idx}_thread_${cpu_idx}
			_cpu_siblings=($(parse_cpu_list "$sysfs_cpu/cpu$cpu_idx/topology/thread_siblings_list"))
			# Stored as "name[@]" so it can be expanded with ${!cpu_siblings[cpu]}.
			cpu_siblings[cpu_idx]="node_${node_idx}_core_${core_idx}_thread_${cpu_idx}[@]"
		fi
		_cpu_node_map[cpu_idx]=$cpu_idx cpu_node_map[cpu_idx]=$node_idx
		cpus+=("$cpu_idx")
	done

	nodes[node_idx]=$node_idx
}

# (Re)build the global cpu topology maps (cpus, cpu_siblings, nodes,
# cpu_node_map, cpu_core_map, core_node_map) from sysfs. All previously
# created node_* variables are dropped first.
map_cpus() {
	local -g cpus=()
	local -g cpu_siblings=()
	local -g nodes=()
	local -g cpu_node_map=()
	local -g cpu_core_map=()
	local -g core_node_map=()
	local node

	unset -v "${!node_@}"

	for node in "$sysfs_node/node"+([0-9]); do
		map_cpus_node "${node##*node}"
	done
}

# Print cpus, one per line, based on the maps built by map_cpus():
#   no arguments - all mapped cpus
#   $1           - cpus of numa node $1
#   $1 $2        - cpus of core $2 on numa node $1
# Returns 1 when the selection is empty.
get_cpus() {
	local node=$1
	local core=$2
	local _cpus

	if [[ -z $node ]]; then
		_cpus=("${cpus[@]}")
	elif [[ -n $core ]]; then
		local -n _core_cpus_ref=node_${node}_core_${core}
		_cpus=("${_core_cpus_ref[@]}")
	else
		local -n _node_cpus_ref=node_${node}_cpu
		_cpus=("${_node_cpus_ref[@]}")
	fi
	((${#_cpus[@]} > 0)) || return 1
	printf '%u\n' "${_cpus[@]}"
}

# Print isolated cpus, one per line (nothing if the sysfs file is missing).
get_isolated_cpus() {
	[[ -e $sysfs_cpu/isolated ]] || return 0
	parse_cpu_list "$sysfs_cpu/isolated"
}

# Print offline cpus, one per line (nothing if the sysfs file is missing).
get_offline_cpus() {
	[[ -e $sysfs_cpu/offline ]] || return 0
	parse_cpu_list "$sysfs_cpu/offline"
}

# Print online cpus, one per line (nothing if the sysfs file is missing).
get_online_cpus() {
	[[ -e $sysfs_cpu/online ]] || return 0
	parse_cpu_list "$sysfs_cpu/online"
}

# Succeed if cpu $1 is listed in sysfs as online.
is_cpu_online() {
	local online

	fold_list_onto_array online $(get_online_cpus)
	[[ -v online[$1] ]]
}

# Succeed if cpu $1 is NOT listed in sysfs as online.
is_cpu_offline() {
	! is_cpu_online "$1"
}

# Bring cpu $1 online. No-op if it is already online; fails if the cpu
# has no "online" knob in sysfs.
online_cpu() {
	is_cpu_offline "$1" || return 0
	[[ -e $sysfs_cpu/cpu$1/online ]] && echo 1 > "$sysfs_cpu/cpu$1/online"
}

# Take cpu $1 offline. No-op if it is already offline; fails if the cpu
# has no "online" knob in sysfs.
offline_cpu() {
	is_cpu_online "$1" || return 0
	[[ -e $sysfs_cpu/cpu$1/online ]] && echo 0 > "$sysfs_cpu/cpu$1/online"
}

# Print the given cpus as a "[a,b,c]" list.
mask_cpus() {
	printf '[%s]\n' "$(fold_array_onto_string "$@")"
}

# Add all offline cpus plus the cpus given as arguments to the global
# "denied" array (denied[N]=N).
denied_list() {
	local -g denied

	fold_list_onto_array denied $(get_offline_cpus) "$@"
}

# Drop from the global "allowed" array every cpu which is present in
# "denied" or whose index is greater than 127.
filter_allowed_list() {
	local cpu

	for cpu in "${!allowed[@]}"; do
		if [[ -n ${denied[cpu]} ]] || ((cpu > 127)); then
			unset -v "allowed[cpu]"
		fi
	done
}

# Build the global "allowed" array of cpus usable by the tests.
#   $1 - number of cpus wanted (default: 4)
#   $2 - numa node to pick the cpus from (default: 0)
# Isolated cpus are always added first; on the initial call their count is
# added on top of $1. Remaining slots are filled core by core from the node,
# then denied cpus are filtered out. If that leaves fewer cpus than wanted
# and the node is not exhausted yet, the function recurses.
allowed_list() {
	local max=${1:-4}
	local node=${2:-0}
	# On recursion this picks up the caller's cpu_count through dynamic
	# scoping, so the scan continues where the previous pass stopped.
	local cpu_count=${cpu_count:--1}

	local -g allowed

	fold_list_onto_array allowed $(get_isolated_cpus)

	if ((cpu_count < 0 && ${#allowed[@]} > 0)); then
		((max += ${#allowed[@]}))
	fi

	local -n node_cpu_ref=node_${node}_cpu

	while ((${#allowed[@]} < max && ++cpu_count < ${#node_cpu_ref[@]})); do
		fold_list_onto_array allowed $(get_cpus "$node" "${cpu_core_map[node_cpu_ref[cpu_count]]}")
	done

	filter_allowed_list

	if ((${#allowed[@]} == max)); then
		return 0
	elif ((cpu_count == ${#node_cpu_ref[@]})); then
		return 0
	else
		allowed_list "$max" "$node"
	fi
}

# Print the cpus process $1 (default: current shell) is allowed to run on,
# one per line, based on Cpus_allowed_list from /proc/<pid>/status.
# Returns 1 if the status file does not exist.
get_proc_cpu_affinity() {
	xtrace_disable

	local pid=${1:-$$}
	local status val rc=0

	if [[ -e /proc/$pid/status ]]; then
		while IFS=":"$'\t' read -r status val; do
			if [[ $status == Cpus_allowed_list ]]; then
				parse_cpu_list <(echo "$val")
				break
			fi
		done < "/proc/$pid/status"
	else
		rc=1
	fi

	# Must run on every path, otherwise tracing stays off for the caller.
	xtrace_restore
	return "$rc"
}

# Collect the cpufreq setup of every cpu exposing a cpufreq directory
# into the global cpufreq_* / cpuinfo_* arrays (indexed by cpu) and set
# turbo_enabled based on the global boost / no_turbo knob.
map_cpufreq() {
	# This info is used to cross-reference current cpufreq setup with
	# what DPDK's governor actually puts in place.

	local -g cpufreq_drivers=()
	local -g cpufreq_governors=()
	local -g cpufreq_base_freqs=()
	local -g cpufreq_max_freqs=()
	local -g cpufreq_min_freqs=()
	local -g cpufreq_cur_freqs=()
	local -g cpufreq_is_turbo=()
	local -g cpufreq_available_freqs=()
	local -g cpufreq_available_governors=()
	local -g cpufreq_high_prio=()
	local -g cpufreq_non_turbo_ratio=()
	local -g cpufreq_setspeed=()
	local -g cpuinfo_max_freqs=()
	local -g cpuinfo_min_freqs=()
	# Filled only for cppc_cpufreq. These were implicit globals before, so
	# they stay global in case a caller reads them - they are just declared
	# and reset explicitly now, like the rest of the maps.
	local -g scaling_min_freqs=()
	local -g scaling_max_freqs=()
	local -g nominal_perf=()
	local -g highest_perf=()
	local -g turbo_enabled=0
	local cpu cpu_idx
	local non_turbo_ratio base_max_freq num_freqs freq

	for cpu in "$sysfs_cpu/cpu"+([0-9]); do
		cpu_idx=${cpu##*cpu}
		[[ -e $cpu/cpufreq ]] || continue
		cpufreq_drivers[cpu_idx]=$(< "$cpu/cpufreq/scaling_driver")
		cpufreq_governors[cpu_idx]=$(< "$cpu/cpufreq/scaling_governor")

		# In case HWP is on
		if [[ -e $cpu/cpufreq/base_frequency ]]; then
			cpufreq_base_freqs[cpu_idx]=$(< "$cpu/cpufreq/base_frequency")
		fi

		cpufreq_cur_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_cur_freq")
		cpufreq_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_max_freq")
		cpufreq_min_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_min_freq")

		# Per-cpu lists live in their own global arrays; the maps keep
		# "name[@]" references which can be expanded with ${!map[cpu]}.
		local -n available_governors=available_governors_cpu_${cpu_idx}
		cpufreq_available_governors[cpu_idx]="available_governors_cpu_${cpu_idx}[@]"
		available_governors=($(< "$cpu/cpufreq/scaling_available_governors"))

		local -n available_freqs=available_freqs_cpu_${cpu_idx}
		cpufreq_available_freqs[cpu_idx]="available_freqs_cpu_${cpu_idx}[@]"

		case "${cpufreq_drivers[cpu_idx]}" in
			acpi-cpufreq)
				available_freqs=($(< "$cpu/cpufreq/scaling_available_frequencies"))
				# Turbo is assumed when the first listed frequency is
				# exactly 1000 above the second one.
				if ((available_freqs[0] - 1000 == available_freqs[1])); then
					cpufreq_is_turbo[cpu_idx]=1
				else
					cpufreq_is_turbo[cpu_idx]=0
				fi
				cpufreq_setspeed[cpu_idx]=$(< "$cpu/cpufreq/scaling_setspeed")
				;;
			intel_pstate | intel_cpufreq) # active or passive
				non_turbo_ratio=$("$testdir/rdmsr.pl" "$cpu_idx" 0xce)
				cpuinfo_min_freqs[cpu_idx]=$(< "$cpu/cpufreq/cpuinfo_min_freq")
				cpuinfo_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/cpuinfo_max_freq")
				cpufreq_non_turbo_ratio[cpu_idx]=$(((non_turbo_ratio >> 8) & 0xff))
				if ((cpufreq_base_freqs[cpu_idx] / 100000 > cpufreq_non_turbo_ratio[cpu_idx])); then
					cpufreq_high_prio[cpu_idx]=1
					base_max_freq=${cpufreq_base_freqs[cpu_idx]}
				else
					cpufreq_high_prio[cpu_idx]=0
					base_max_freq=$((cpufreq_non_turbo_ratio[cpu_idx] * 100000))
				fi
				num_freqs=$(((base_max_freq - cpuinfo_min_freqs[cpu_idx]) / 100000 + 1))
				if ((base_max_freq < cpuinfo_max_freqs[cpu_idx])); then
					((num_freqs += 1))
					cpufreq_is_turbo[cpu_idx]=1
				else
					cpufreq_is_turbo[cpu_idx]=0
				fi
				# Synthesize the list in descending 100000 steps; with
				# turbo the first entry is base_max_freq + 1.
				available_freqs=()
				for ((freq = 0; freq < num_freqs; freq++)); do
					if ((freq == 0 && cpufreq_is_turbo[cpu_idx] == 1)); then
						available_freqs[freq]=$((base_max_freq + 1))
					else
						available_freqs[freq]=$((base_max_freq - (freq - cpufreq_is_turbo[cpu_idx]) * 100000))
					fi
				done
				;;
			cppc_cpufreq)
				cpufreq_setspeed[cpu_idx]=$(< "$cpu/cpufreq/scaling_setspeed")
				scaling_min_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_min_freq")
				scaling_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_max_freq")
				cpuinfo_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/cpuinfo_max_freq")
				nominal_perf[cpu_idx]=$(< "$cpu/acpi_cppc/nominal_perf")
				highest_perf[cpu_idx]=$(< "$cpu/acpi_cppc/highest_perf")

				# The unit of highest_perf and nominal_perf differs between
				# arm platforms. For highest_perf, it may be 300 or 3000000;
				# both mean 3.0GHz.
				if ((highest_perf[cpu_idx] > nominal_perf[cpu_idx] && (\
					highest_perf[cpu_idx] == cpuinfo_max_freqs[cpu_idx] || \
					highest_perf[cpu_idx] * 10000 == cpuinfo_max_freqs[cpu_idx]))); then
					cpufreq_is_turbo[cpu_idx]=1
				else
					cpufreq_is_turbo[cpu_idx]=0
				fi

				# Normalize nominal_perf to the bigger unit.
				if ((nominal_perf[cpu_idx] < 10000)); then
					nominal_perf[cpu_idx]=$((nominal_perf[cpu_idx] * 10000))
				fi

				num_freqs=$(((nominal_perf[cpu_idx] - scaling_min_freqs[cpu_idx]) / 100000 + 1 + \
					cpufreq_is_turbo[cpu_idx]))

				available_freqs=()
				for ((freq = 0; freq < num_freqs; freq++)); do
					if ((freq == 0 && cpufreq_is_turbo[cpu_idx] == 1)); then
						available_freqs[freq]=$((scaling_max_freqs[cpu_idx]))
					else
						available_freqs[freq]=$((nominal_perf[cpu_idx] - (\
							freq - cpufreq_is_turbo[cpu_idx]) * 100000))
					fi
				done
				;;
		esac
	done
	if [[ -e $sysfs_cpu/cpufreq/boost ]]; then
		turbo_enabled=$(< "$sysfs_cpu/cpufreq/boost")
	elif [[ -e $sysfs_cpu/intel_pstate/no_turbo ]]; then
		turbo_enabled=$((!$(< "$sysfs_cpu/intel_pstate/no_turbo")))
	fi
}

# Set the frequency of a cpu. Requires map_cpufreq() to be called first.
#   $1 - cpu index
#   $2 - min frequency (the speed to set for acpi-cpufreq/cppc_cpufreq)
#   $3 - max frequency (optional, used only by the intel drivers)
# Returns 1 if the cpu has no mapped driver or $2 is missing.
set_cpufreq() {
	local cpu=$1
	local min_freq=$2
	local max_freq=$3
	local cpufreq=$sysfs_cpu/cpu$cpu/cpufreq

	# Map the cpufreq info first
	[[ -n ${cpufreq_drivers[cpu]} ]] || return 1
	[[ -n $min_freq ]] || return 1

	case "${cpufreq_drivers[cpu]}" in
		acpi-cpufreq | cppc_cpufreq)
			# scaling_setspeed is written under the userspace governor.
			if [[ $(< "$cpufreq/scaling_governor") != userspace ]]; then
				echo "userspace" > "$cpufreq/scaling_governor"
			fi
			echo "$min_freq" > "$cpufreq/scaling_setspeed"
			;;
		intel_pstate | intel_cpufreq)
			if [[ -n $max_freq ]] && ((max_freq >= min_freq)); then
				echo "$max_freq" > "$cpufreq/scaling_max_freq"
			fi
			if ((min_freq <= cpufreq_max_freqs[cpu])); then
				echo "$min_freq" > "$cpufreq/scaling_min_freq"
			fi
			;;
	esac
}

# Switch cpu $1 to governor $2 unless it is already in use.
set_cpufreq_governor() {
	local cpu=$1
	local governor=$2
	local cpufreq=$sysfs_cpu/cpu$cpu/cpufreq

	if [[ $(< "$cpufreq/scaling_governor") != "$governor" ]]; then
		echo "$governor" > "$cpufreq/scaling_governor"
	fi
}

# Start the given app (all arguments) inside the /cpuset/spdk cgroup with
# --wait-for-rpc, select the dynamic scheduler and finish the framework
# init. An app still running under $spdk_pid is killed first. The pid of
# the new app is stored in spdk_pid.
exec_under_dynamic_scheduler() {
	if [[ -e /proc/$spdk_pid/status ]]; then
		killprocess "$spdk_pid"
	fi
	exec_in_cgroup "/cpuset/spdk" "$@" --wait-for-rpc &
	spdk_pid=$!
	# Give some time for the app to init itself
	waitforlisten "$spdk_pid"
	"$rootdir/scripts/rpc.py" framework_set_scheduler dynamic
	"$rootdir/scripts/rpc.py" framework_start_init
}

# Refresh the "busy" and "idle" arrays (indexed by thread id) and
# thread_map with the stats of all threads, with tracing disabled.
get_thread_stats() {
	xtrace_disable
	_get_thread_stats busy idle
	xtrace_restore
}

# Query thread_get_stats and store, per thread id, the busy counter in
# the array named by $1, the idle counter in the array named by $2 and
# the thread name in thread_map.
_get_thread_stats() {
	local list_busy=$1
	local list_idle=$2
	local thread threads stats

	stats=$(rpc_cmd thread_get_stats | jq -r '.threads[]')
	threads=($(jq -r '.id' <<< "$stats"))

	for thread in "${threads[@]}"; do
		# Only the array names and the thread id are expanded up front,
		# the rest is left for eval.
		eval "${list_busy}[$thread]=\$(jq -r \"select(.id == $thread) | .busy\" <<< \$stats)"
		eval "${list_idle}[$thread]=\$(jq -r \"select(.id == $thread) | .idle\" <<< \$stats)"
		thread_map[thread]=$(jq -r "select(.id == $thread) | .name" <<< "$stats")
	done
}

# Print counters of cpu $1 taken from /proc/stat.
#   $2 == idle - print the idle counter only
#   $2 == all  - print all counters, one per line
# Anything else prints nothing.
get_cpu_stat() {
	local cpu_idx=$1
	local stat=$2 stats astats
	# Keep the read target local so the caller's $cpu is not clobbered.
	local cpu

	while read -r cpu stats; do
		if [[ $cpu == "cpu$cpu_idx" ]]; then
			astats=($stats)
		fi
	done < /proc/stat

	case "$stat" in
		idle) echo "${astats[3]}" ;;
		all) printf '%u\n' "${astats[@]}" ;;
		*) ;;
	esac
}

# Create a scheduler thread via the RPC plugin; arguments are passed through.
create_thread() {
	rpc_cmd --plugin "$plugin" scheduler_thread_create "$@"
}

# Delete a scheduler thread via the RPC plugin; arguments are passed through.
destroy_thread() {
	rpc_cmd --plugin "$plugin" scheduler_thread_delete "$@"
}

# Set the activity of a scheduler thread via the RPC plugin; arguments
# are passed through.
active_thread() {
	rpc_cmd --plugin "$plugin" scheduler_thread_set_active "$@"
}

# Sample /proc/stat once per second and compute, per cpu, what percentage
# of each sample falls into the requested time.
#   $1    - number of samples to collect (values <= 0 are treated as 1)
#   $2    - name of the time to report, see cpu_time_map (default: idle)
#   $3... - cpus to sample
# Results are exposed through global arrays indexed by cpu:
#   cpu_times[cpu]    - space-separated list of per-sample percentages
#   avg_cpu_time[cpu] - average of the above
# Raw counters are kept in raw_samples_<cpu> for cpu_usage_clk_tck().
get_cpu_time() {
	xtrace_disable

	local interval=$1 cpu_time=${2:-idle} interval_count
	# Don't let shift fail (and leave $1 in the cpu list) when the
	# time name was omitted.
	shift $(($# < 2 ? $# : 2))
	local cpus=("$@") cpu
	local stats stat old_stats avg_load
	local total_sample sample_stats

	# Exposed for the caller
	local -g cpu_times=()
	local -g avg_cpu_time=()

	# cpu_time:
	# 0 - user (time spent in user mode)
	# 1 - nice (Time spent in user mode with low priority)
	# 2 - system (Time spent in system mode)
	# 3 - idle (Time spent in the idle task)
	# 4 - iowait (Time waiting for I/O to complete)
	# 5 - irq (Time servicing interrupts)
	# 6 - softirq (Time servicing softirqs)
	# 7 - steal (Stolen time)
	# 8 - guest (Time spent running a virtual CPU)
	# 9 - guest_nice (Time spent running a niced guest)

	local -gA cpu_time_map
	cpu_time_map["user"]=0
	cpu_time_map["nice"]=1
	cpu_time_map["system"]=2
	cpu_time_map["idle"]=3
	cpu_time_map["iowait"]=4
	cpu_time_map["irq"]=5
	cpu_time_map["softirq"]=6
	cpu_time_map["steal"]=7
	cpu_time_map["guest"]=8
	cpu_time_map["guest_nice"]=9

	# Clear up the env
	unset -v "${!stat_@}"
	unset -v "${!old_stats_@}"
	unset -v "${!avg_stat@}"
	unset -v "${!avg_load@}"
	unset -v "${!raw_samples@}"

	cpu_time=${cpu_time_map["$cpu_time"]}
	interval=$((interval <= 0 ? 1 : interval))
	# We skip first sample to have min 2 for stat comparison
	interval=$((interval + 1)) interval_count=0
	while ((interval_count++, --interval >= 0)); do
		for cpu in "${cpus[@]}"; do
			local -n old_stats=old_stats_$cpu
			local -n avg_load=avg_load_$cpu
			local -n raw_samples=raw_samples_$cpu

			sample_stats=() total_sample=0

			stats=($(get_cpu_stat "$cpu" all))
			if ((interval_count == 1)); then
				# Skip first sample
				old_stats=("${stats[@]}")
				continue
			fi
			# Delta of each counter since the previous sample.
			for stat in "${!stats[@]}"; do
				avg_load[stat]="stat_${stat}_${cpu}[@]"
				sample_stats[stat]=$((stats[stat] - old_stats[stat]))
				: $((total_sample += sample_stats[stat]))
			done
			# Share of each counter in this sample, in %.
			for stat in "${!stats[@]}"; do
				local -n avg_stat=stat_${stat}_${cpu}
				local -n raw_samples_ref=raw_samples_${stat}_${cpu}
				raw_samples[stat]="raw_samples_${stat}_${cpu}[@]"
				raw_samples_ref+=("${stats[stat]}")
				avg_stat+=($((sample_stats[stat] * 100 / (total_sample == 0 ? 1 : total_sample))))
			done
			old_stats=("${stats[@]}")
		done
		sleep 1s
	done

	# We collected % for each time. Now determine the avg % for requested time.
	local load stat_load
	for cpu in "${cpus[@]}"; do
		load=0
		local -n avg_load_cpu=avg_load_$cpu
		stat_load=("${!avg_load_cpu[cpu_time]}")
		for stat in "${stat_load[@]}"; do
			: $((load += stat))
		done
		cpu_times[cpu]=${stat_load[*]}
		avg_cpu_time[cpu]=$((load / ${#stat_load[@]}))
	done

	xtrace_restore
}

# Sample the cpus listed in the global cpus_to_collect array for $1
# seconds (default: 5) and set is_idle[cpu] to 1 or 0 for each of them.
# Returns 1 if cpus_to_collect is empty.
collect_cpu_idle() {
	((${#cpus_to_collect[@]} > 0)) || return 1

	local time=${1:-5}
	local cpu
	local samples
	local -g is_idle=()

	printf 'Collecting cpu idle stats (cpus: %s) for %u seconds...\n' \
		"${cpus_to_collect[*]}" "$time"

	get_cpu_time "$time" idle "${cpus_to_collect[@]}"

	local user_load
	for cpu in "${cpus_to_collect[@]}"; do
		samples=(${cpu_times[cpu]})
		printf '* cpu%u idle samples: %s (avg: %u%%)\n' \
			"$cpu" "${samples[*]}" "${avg_cpu_time[cpu]}"
		# Cores with polling reactors have 0% idle time,
		# while the ones in interrupt mode won't have 100% idle.
		# During the tests, polling reactors spend the major portion
		# of their cpu time in user mode. With that in mind, if the
		# general check for cpu's idleness fails, check what portion
		# of the cpu load falls into user mode. For the idle check
		# use the last sample. For the cpu load, compare user's raw
		# samples in SC_CLK_TCK context for a more detailed view.
		user_load=$(cpu_usage_clk_tck "$cpu" user)
		if ((samples[-1] >= 70)); then
			printf '* cpu%u is idle\n' "$cpu"
			is_idle[cpu]=1
		elif ((user_load <= 15)); then
			printf '* cpu%u not fully idle, but user load is low so passing\n' "$cpu"
			is_idle[cpu]=1
		else
			printf '* cpu%u is not idle\n' "$cpu"
			is_idle[cpu]=0
		fi
	done
}

# Print the usage (0-100) of cpu $1 based on the last two raw samples
# collected by get_cpu_time(), expressed against CLK_TCK ticks per second.
#   $2 - user, nice, system or all (default: all)
# The usage goes to stdout, details go to stderr. Returns 1 if no raw
# samples exist for the cpu.
cpu_usage_clk_tck() {
	local cpu=$1 time=${2:-all}
	local user nice system usage clk_delta=0

	# We should be called in get_cpu_time()'s environment.
	[[ -v raw_samples_$cpu ]] || return 1

	local -n raw_samples=raw_samples_$cpu
	user=("${!raw_samples[cpu_time_map["user"]]}")
	nice=("${!raw_samples[cpu_time_map["nice"]]}")
	system=("${!raw_samples[cpu_time_map["system"]]}")

	# Construct delta based on last two samples of a given time.
	# ";;&" keeps testing the following patterns, so "all" sums all three.
	case "$time" in
		user | all) : $((clk_delta += (user[-1] - user[-2]))) ;;&
		nice | all) : $((clk_delta += (nice[-1] - nice[-2]))) ;;&
		system | all) : $((clk_delta += (system[-1] - system[-2]))) ;;
		*) ;;
	esac
	# We assume 1s between each sample. See get_cpu_time().
	usage=$((100 * clk_delta / $(getconf CLK_TCK)))
	usage=$((usage > 100 ? 100 : usage))

	printf '%u' "$usage"
	printf '* cpu%u %s usage: %u\n' "$cpu" "$time" "$usage" >&2
	printf '* cpu%u user samples: %s\n' "$cpu" "${user[*]}" >&2
	printf '* cpu%u nice samples: %s\n' "$cpu" "${nice[*]}" >&2
	printf '* cpu%u system samples: %s\n' "$cpu" "${system[*]}" >&2
}

# Rebuild the global thread_cpus array (thread id -> cpu) for all cpus in
# the global cpus array, based on framework_get_reactors. Returns 1 if
# cpus is empty or no thread was found on any of the cpus.
update_thread_cpus_map() {
	local cpu thread
	local -g thread_cpus=()
	local reactor_framework

	((${#cpus[@]} > 0)) || return 1

	get_thread_stats

	reactor_framework=$(rpc_cmd framework_get_reactors | jq -r '.reactors[]')
	for cpu in "${cpus[@]}"; do
		for thread in $(jq -r "select(.lcore == $cpu) | .lw_threads[].id" <<< "$reactor_framework"); do
			printf '* Thread %u (%s) on cpu%u\n' "$thread" "${thread_map[thread]}" "$cpu"
			thread_cpus[thread]=$cpu
		done
	done
	((${#thread_cpus[@]} > 0))
}