# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2020 Intel Corporation
# All rights reserved.
#

# Helpers for inspecting cpu topology and cpufreq state through sysfs and for
# driving the scheduler test application. This file is meant to be sourced;
# it expects $rootdir (and, for map_cpufreq(), $testdir) to be set by the
# caller.

declare -r sysfs_system=/sys/devices/system
declare -r sysfs_cpu=$sysfs_system/cpu
declare -r sysfs_node=$sysfs_system/node

declare -r scheduler=$rootdir/test/event/scheduler/scheduler
declare plugin=scheduler_plugin

source "$rootdir/test/scheduler/cgroups.sh"

# Store every remaining argument N in the array named by $1 as array[N]=N.
# Using the value as the index makes the array double as a lookup set and
# collapses duplicates. Does nothing when called without arguments.
fold_list_onto_array() {
	(($# > 0)) || return 0

	local -n _fold_target=$1
	local elem

	shift
	for elem; do
		_fold_target[elem]=$elem
	done
}

# Print all arguments joined with "," on a single line.
fold_array_onto_string() {
	local cpus=("$@")

	local IFS=","
	echo "${cpus[*]}"
}

# Expand a cpu list such as "0-2,4,6-9" read from the first line of the file
# $1 and print one cpu index per line. Indexes come out in ascending order
# with duplicates removed since they are collected as array keys. Prints
# nothing when the list is empty.
parse_cpu_list() {
	local list=$1
	local elem elems cpus cpu start end

	# 0-2,4,6-9, etc.
	IFS="," read -ra elems < "$list"

	((${#elems[@]} > 0)) || return 0

	for elem in "${elems[@]}"; do
		if [[ $elem == *-* ]]; then
			start=${elem%-*} end=${elem#*-}
			for ((cpu = start; cpu <= end; cpu++)); do
				cpus[cpu]=$cpu
			done
		else
			cpus[elem]=$elem
		fi
	done
	printf '%u\n' "${!cpus[@]}"
}

# Record the topology of NUMA node $1 into the global maps prepared by
# map_cpus(). Besides updating cpus, nodes, cpu_node_map, cpu_core_map and
# cpu_siblings, it creates per-node helper arrays:
#   node_<N>_cpu                    - cpus of the node, indexed by cpu
#   node_<N>_core_<C>               - online cpus sharing core C
#   node_<N>_core_<C>_thread_<CPU>  - thread siblings of an online cpu
# Core and sibling details are collected for online cpus only.
map_cpus_node() {
	local node_idx=$1
	local -n _cpu_node_map=node_${node_idx}_cpu
	local cpu_idx core_idx

	for cpu_idx in $(parse_cpu_list "$sysfs_node/node$node_idx/cpulist"); do
		if is_cpu_online "$cpu_idx"; then
			core_idx=$(< "$sysfs_cpu/cpu$cpu_idx/topology/core_id")
			local -n _cpu_core_map=node_${node_idx}_core_${core_idx}
			_cpu_core_map+=("$cpu_idx") cpu_core_map[cpu_idx]=$core_idx
			local -n _cpu_siblings=node_${node_idx}_core_${core_idx}_thread_${cpu_idx}
			_cpu_siblings=($(parse_cpu_list "$sysfs_cpu/cpu$cpu_idx/topology/thread_siblings_list"))
			# Keep an indirect reference; expand it with ${!cpu_siblings[cpu]}.
			cpu_siblings[cpu_idx]="node_${node_idx}_core_${core_idx}_thread_${cpu_idx}[@]"
		fi
		_cpu_node_map[cpu_idx]=$cpu_idx cpu_node_map[cpu_idx]=$node_idx
		cpus+=("$cpu_idx")
	done

	nodes[node_idx]=$node_idx
}

# (Re)build the global cpu topology maps by walking every NUMA node exposed
# under $sysfs_node. All node_* variables from a previous run are dropped
# first.
map_cpus() {
	local -g cpus=()
	local -g cpu_siblings=()
	local -g nodes=()
	local -g cpu_node_map=()
	local -g cpu_core_map=()
	local -g core_node_map=()
	local node

	unset -v "${!node_@}"

	# A plain glob plus a numeric check is used here so this file does not
	# depend on extglob being enabled when it is parsed. The check also skips
	# the unexpanded pattern when nothing matches.
	for node in "$sysfs_node"/node[0-9]*; do
		[[ ${node##*/node} =~ ^[0-9]+$ ]] || continue
		map_cpus_node "${node##*node}"
	done
}

# Print cpus, one per line, from the maps built by map_cpus():
#   no arguments - every mapped cpu
#   $1           - cpus of NUMA node $1
#   $1 $2        - cpus of core $2 on NUMA node $1
# Returns 1 when the selection is empty.
get_cpus() {
	local node=$1
	local core=$2
	local _cpus=()

	if [[ -z $node ]]; then
		_cpus=("${cpus[@]}")
	elif [[ -n $core ]]; then
		local -n _core_cpus=node_${node}_core_${core}
		_cpus=("${_core_cpus[@]}")
	else
		local -n _node_cpus=node_${node}_cpu
		_cpus=("${_node_cpus[@]}")
	fi
	((${#_cpus[@]} > 0)) || return 1
	printf '%u\n' "${_cpus[@]}"
}

# Print the isolated cpus, one per line. Prints nothing if sysfs does not
# expose the list.
get_isolated_cpus() {
	[[ -e $sysfs_cpu/isolated ]] || return 0
	parse_cpu_list "$sysfs_cpu/isolated"
}

# Print the offline cpus, one per line. Prints nothing if sysfs does not
# expose the list.
get_offline_cpus() {
	[[ -e $sysfs_cpu/offline ]] || return 0
	parse_cpu_list "$sysfs_cpu/offline"
}

# Print the online cpus, one per line. Prints nothing if sysfs does not
# expose the list.
get_online_cpus() {
	[[ -e $sysfs_cpu/online ]] || return 0
	parse_cpu_list "$sysfs_cpu/online"
}

# Succeed if cpu $1 is currently listed as online.
is_cpu_online() {
	local online

	fold_list_onto_array online $(get_online_cpus)
	[[ -v online[$1] ]]
}

# Succeed if cpu $1 is not listed as online.
is_cpu_offline() {
	! is_cpu_online "$1"
}

# Bring cpu $1 online. No-op if it is online already; fails if the cpu has
# no "online" control file.
online_cpu() {
	is_cpu_offline "$1" || return 0
	[[ -e $sysfs_cpu/cpu$1/online ]] && echo 1 > "$sysfs_cpu/cpu$1/online"
}

# Take cpu $1 offline. No-op if it is not online; fails if the cpu has no
# "online" control file.
offline_cpu() {
	is_cpu_online "$1" || return 0
	[[ -e $sysfs_cpu/cpu$1/online ]] && echo 0 > "$sysfs_cpu/cpu$1/online"
}

# Print the given cpus in "[a,b,c]" form.
mask_cpus() {
	printf '[%s]\n' "$(fold_array_onto_string "$@")"
}

# Add all offline cpus plus the cpus given as arguments to the global
# "denied" array.
denied_list() {
	local -g denied

	fold_list_onto_array denied $(get_offline_cpus) "$@"
}

# Drop from the global "allowed" array every cpu that is present in "denied"
# or whose index is above 127.
filter_allowed_list() {
	local cpu

	for cpu in "${!allowed[@]}"; do
		if [[ -n ${denied[cpu]} ]] || ((cpu > 127)); then
			unset -v "allowed[cpu]"
		fi
	done
}

# Populate the global "allowed" array with cpus to use.
#   $1 - number of cpus wanted (default: 4)
#   $2 - NUMA node to take them from (default: 0)
# Isolated cpus are always added first. When the call starts a fresh scan
# (cpu_count unset or negative) and isolated cpus exist, they are counted on
# top of $1. The remaining cpus are picked core by core from the node's cpu
# list and the result is run through filter_allowed_list(). If filtering left
# the list short while the node still has cpus to try, the function calls
# itself again; the nested call inherits cpu_count through dynamic scoping
# and so continues where this one stopped.
allowed_list() {
	local max=${1:-4}
	local node=${2:-0}
	local cpu_count=${cpu_count:--1}

	local -g allowed

	fold_list_onto_array allowed $(get_isolated_cpus)

	if ((cpu_count < 0 && ${#allowed[@]} > 0)); then
		((max += ${#allowed[@]}))
	fi

	local -n node_cpu_ref=node_${node}_cpu

	while ((${#allowed[@]} < max && ++cpu_count < ${#node_cpu_ref[@]})); do
		fold_list_onto_array allowed $(get_cpus "$node" "${cpu_core_map[node_cpu_ref[cpu_count]]}")
	done

	filter_allowed_list

	if ((${#allowed[@]} == max)); then
		return 0
	elif ((cpu_count == ${#node_cpu_ref[@]})); then
		return 0
	else
		allowed_list "$max" "$node"
	fi
}

# Print, one per line, the cpus listed in Cpus_allowed_list of process $1
# (default: the current shell). Returns 1 if /proc/<pid>/status is missing.
# xtrace_disable/xtrace_restore are expected to be provided by the caller's
# environment; the restore is executed on every exit path so the two always
# stay paired.
get_proc_cpu_affinity() {
	xtrace_disable

	local pid=${1:-$$}
	local status val rc=0

	if [[ -e /proc/$pid/status ]]; then
		while IFS=":"$'\t' read -r status val; do
			if [[ $status == Cpus_allowed_list ]]; then
				parse_cpu_list <(echo "$val")
				break
			fi
		done < "/proc/$pid/status"
	else
		rc=1
	fi

	xtrace_restore
	return "$rc"
}

# Snapshot the cpufreq setup of every cpu into global arrays indexed by cpu:
# cpufreq_drivers, cpufreq_governors, cpufreq_base_freqs, cpufreq_cur_freqs,
# cpufreq_max_freqs, cpufreq_min_freqs, cpufreq_is_turbo, cpufreq_setspeed,
# cpufreq_high_prio, cpufreq_non_turbo_ratio, cpuinfo_min_freqs and
# cpuinfo_max_freqs. cpufreq_available_governors[cpu] and
# cpufreq_available_freqs[cpu] hold indirect references ("name[@]") to
# per-cpu arrays. turbo_enabled is set from the global boost/no_turbo knob
# when one of them exists. Cpus without a cpufreq directory are skipped.
map_cpufreq() {
	# This info is used to cross-reference current cpufreq setup with
	# what DPDK's governor actually puts in place.

	local -g cpufreq_drivers=()
	local -g cpufreq_governors=()
	local -g cpufreq_base_freqs=()
	local -g cpufreq_max_freqs=()
	local -g cpufreq_min_freqs=()
	local -g cpufreq_cur_freqs=()
	local -g cpufreq_is_turbo=()
	local -g cpufreq_available_freqs=()
	local -g cpufreq_available_governors=()
	local -g cpufreq_high_prio=()
	local -g cpufreq_non_turbo_ratio=()
	local -g cpufreq_setspeed=()
	local -g cpuinfo_max_freqs=()
	local -g cpuinfo_min_freqs=()
	local -g turbo_enabled=0
	local cpu cpu_idx
	# Scratch variables shared by the driver-specific branches below.
	local non_turbo_ratio base_max_freq num_freqs freq

	# Plain glob plus a numeric check instead of an extglob pattern, so this
	# file parses regardless of the caller's shopt settings.
	for cpu in "$sysfs_cpu"/cpu[0-9]*; do
		[[ ${cpu##*/cpu} =~ ^[0-9]+$ ]] || continue
		cpu_idx=${cpu##*cpu}
		[[ -e $cpu/cpufreq ]] || continue
		cpufreq_drivers[cpu_idx]=$(< "$cpu/cpufreq/scaling_driver")
		cpufreq_governors[cpu_idx]=$(< "$cpu/cpufreq/scaling_governor")

		# In case HWP is on
		if [[ -e $cpu/cpufreq/base_frequency ]]; then
			cpufreq_base_freqs[cpu_idx]=$(< "$cpu/cpufreq/base_frequency")
		fi

		cpufreq_cur_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_cur_freq")
		cpufreq_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_max_freq")
		cpufreq_min_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_min_freq")

		local -n available_governors=available_governors_cpu_${cpu_idx}
		cpufreq_available_governors[cpu_idx]="available_governors_cpu_${cpu_idx}[@]"
		available_governors=($(< "$cpu/cpufreq/scaling_available_governors"))

		local -n available_freqs=available_freqs_cpu_${cpu_idx}
		cpufreq_available_freqs[cpu_idx]="available_freqs_cpu_${cpu_idx}[@]"

		case "${cpufreq_drivers[cpu_idx]}" in
			acpi-cpufreq)
				available_freqs=($(< "$cpu/cpufreq/scaling_available_frequencies"))
				# A first entry exactly 1000 above the second one is
				# treated as the turbo marker.
				if ((available_freqs[0] - 1000 == available_freqs[1])); then
					cpufreq_is_turbo[cpu_idx]=1
				else
					cpufreq_is_turbo[cpu_idx]=0
				fi
				cpufreq_setspeed[cpu_idx]=$(< "$cpu/cpufreq/scaling_setspeed")
				;;
			intel_pstate | intel_cpufreq) # active or passive
				non_turbo_ratio=$("$testdir/rdmsr.pl" "$cpu_idx" 0xce)
				cpuinfo_min_freqs[cpu_idx]=$(< "$cpu/cpufreq/cpuinfo_min_freq")
				cpuinfo_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/cpuinfo_max_freq")
				cpufreq_non_turbo_ratio[cpu_idx]=$(((non_turbo_ratio >> 8) & 0xff))
				if ((cpufreq_base_freqs[cpu_idx] / 100000 > cpufreq_non_turbo_ratio[cpu_idx])); then
					cpufreq_high_prio[cpu_idx]=1
					base_max_freq=${cpufreq_base_freqs[cpu_idx]}
				else
					cpufreq_high_prio[cpu_idx]=0
					base_max_freq=$((cpufreq_non_turbo_ratio[cpu_idx] * 100000))
				fi
				num_freqs=$(((base_max_freq - cpuinfo_min_freqs[cpu_idx]) / 100000 + 1))
				if ((base_max_freq < cpuinfo_max_freqs[cpu_idx])); then
					((num_freqs += 1))
					cpufreq_is_turbo[cpu_idx]=1
				else
					cpufreq_is_turbo[cpu_idx]=0
				fi
				# Build the list in descending order using 100000 steps;
				# with turbo available the first slot is base_max_freq + 1.
				available_freqs=()
				for ((freq = 0; freq < num_freqs; freq++)); do
					if ((freq == 0 && cpufreq_is_turbo[cpu_idx] == 1)); then
						available_freqs[freq]=$((base_max_freq + 1))
					else
						available_freqs[freq]=$((base_max_freq - (freq - cpufreq_is_turbo[cpu_idx]) * 100000))
					fi
				done
				;;
			cppc_cpufreq)
				# NOTE(review): scaling_min_freqs, scaling_max_freqs,
				# nominal_perf and highest_perf are not declared anywhere
				# in this file and hence end up as globals. Left as-is in
				# case other scripts read them - confirm.
				cpufreq_setspeed[cpu_idx]=$(< "$cpu/cpufreq/scaling_setspeed")
				scaling_min_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_min_freq")
				scaling_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_max_freq")
				cpuinfo_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/cpuinfo_max_freq")
				nominal_perf[cpu_idx]=$(< "$cpu/acpi_cppc/nominal_perf")
				highest_perf[cpu_idx]=$(< "$cpu/acpi_cppc/highest_perf")

				# The unit of highest_perf and nominal_perf differs between
				# arm platforms. E.g. highest_perf may read 300 or 3000000,
				# both meaning 3.0GHz.
				if ((highest_perf[cpu_idx] > nominal_perf[cpu_idx] && (\
					highest_perf[cpu_idx] == cpuinfo_max_freqs[cpu_idx] || \
					highest_perf[cpu_idx] * 10000 == cpuinfo_max_freqs[cpu_idx]))); then
					cpufreq_is_turbo[cpu_idx]=1
				else
					cpufreq_is_turbo[cpu_idx]=0
				fi

				# Normalize the short form to the unit used by cpufreq.
				if ((nominal_perf[cpu_idx] < 10000)); then
					nominal_perf[cpu_idx]=$((nominal_perf[cpu_idx] * 10000))
				fi

				num_freqs=$(((nominal_perf[cpu_idx] - scaling_min_freqs[cpu_idx]) / 100000 + 1 + \
					cpufreq_is_turbo[cpu_idx]))

				available_freqs=()
				for ((freq = 0; freq < num_freqs; freq++)); do
					if ((freq == 0 && cpufreq_is_turbo[cpu_idx] == 1)); then
						available_freqs[freq]=$((scaling_max_freqs[cpu_idx]))
					else
						available_freqs[freq]=$((nominal_perf[cpu_idx] - (\
							freq - cpufreq_is_turbo[cpu_idx]) * 100000))
					fi
				done
				;;
		esac
	done
	if [[ -e $sysfs_cpu/cpufreq/boost ]]; then
		turbo_enabled=$(< "$sysfs_cpu/cpufreq/boost")
	elif [[ -e $sysfs_cpu/intel_pstate/no_turbo ]]; then
		turbo_enabled=$((!$(< "$sysfs_cpu/intel_pstate/no_turbo")))
	fi
}

# Set the frequency of cpu $1.
#   $2 - min frequency (required)
#   $3 - max frequency (optional, used by the intel drivers only)
# For acpi-cpufreq and cppc_cpufreq the governor is switched to "userspace"
# and $2 is written to scaling_setspeed. For intel_pstate and intel_cpufreq
# the scaling_{max,min}_freq limits are updated instead. Returns 1 if $2 is
# missing or if map_cpufreq() recorded no driver for the cpu.
set_cpufreq() {
	local cpu=$1
	local min_freq=$2
	local max_freq=$3
	local cpufreq=$sysfs_cpu/cpu$cpu/cpufreq

	# Map the cpufreq info first
	[[ -n ${cpufreq_drivers[cpu]} ]] || return 1
	[[ -n $min_freq ]] || return 1

	case "${cpufreq_drivers[cpu]}" in
		acpi-cpufreq | cppc_cpufreq)
			if [[ $(< "$cpufreq/scaling_governor") != userspace ]]; then
				echo "userspace" > "$cpufreq/scaling_governor"
			fi
			echo "$min_freq" > "$cpufreq/scaling_setspeed"
			;;
		intel_pstate | intel_cpufreq)
			if [[ -n $max_freq ]] && ((max_freq >= min_freq)); then
				echo "$max_freq" > "$cpufreq/scaling_max_freq"
			fi
			# Compared against the max recorded by map_cpufreq(), not
			# against the value possibly written just above.
			if ((min_freq <= cpufreq_max_freqs[cpu])); then
				echo "$min_freq" > "$cpufreq/scaling_min_freq"
			fi
			;;
	esac
}

# Switch cpu $1 to governor $2 unless it is the active one already.
set_cpufreq_governor() {
	local cpu=$1
	local governor=$2
	local cpufreq=$sysfs_cpu/cpu$cpu/cpufreq

	if [[ $(< "$cpufreq/scaling_governor") != "$governor" ]]; then
		echo "$governor" > "$cpufreq/scaling_governor"
	fi
}

# Start the given command line in the /cpuset/spdk cgroup with
# --wait-for-rpc appended, select the dynamic scheduler over RPC and let the
# app finish its init. A process still tracked in $spdk_pid is killed first;
# $spdk_pid is then updated to the new pid.
exec_under_dynamic_scheduler() {
	if [[ -e /proc/$spdk_pid/status ]]; then
		killprocess "$spdk_pid"
	fi
	exec_in_cgroup "/cpuset/spdk" "$@" --wait-for-rpc &
	spdk_pid=$!
	# Give some time for the app to init itself
	waitforlisten "$spdk_pid"
	"$rootdir/scripts/rpc.py" framework_set_scheduler dynamic
	"$rootdir/scripts/rpc.py" framework_start_init
}

# Refresh the "busy" and "idle" arrays (and thread_map) with the current
# thread stats while keeping the collection out of the xtrace output.
get_thread_stats() {
	xtrace_disable
	_get_thread_stats busy idle
	xtrace_restore
}

# Query thread_get_stats over RPC and, for each thread id, store its .busy
# value in the array named by $1, its .idle value in the array named by $2
# and its .name in thread_map. All three are indexed by thread id.
_get_thread_stats() {
	local -n _busy_stats=$1
	local -n _idle_stats=$2
	local thread threads stats

	stats=$(rpc_cmd thread_get_stats | jq -r '.threads[]')
	threads=($(jq -r '.id' <<< "$stats"))

	for thread in "${threads[@]}"; do
		_busy_stats[thread]=$(jq -r "select(.id == $thread) | .busy" <<< "$stats")
		_idle_stats[thread]=$(jq -r "select(.id == $thread) | .idle" <<< "$stats")
		thread_map[thread]=$(jq -r "select(.id == $thread) | .name" <<< "$stats")
	done
}

# Print counters of cpu $1 taken from /proc/stat.
#   $2 == idle - the idle counter only
#   $2 == all  - every counter, one per line
# Anything else prints nothing.
get_cpu_stat() {
	local cpu_idx=$1
	local stat=$2 stats astats
	local cpu

	while read -r cpu stats; do
		# Word splitting is intended; it turns the line into an array.
		[[ $cpu == "cpu$cpu_idx" ]] && astats=($stats)
	done < /proc/stat

	case "$stat" in
		idle) echo "${astats[3]}" ;;
		all) printf '%u\n' "${astats[@]}" ;;
		*) ;;
	esac
}

# Thin wrappers around the scheduler rpc plugin; all arguments are passed
# through to the respective RPC.
create_thread() {
	rpc_cmd --plugin "$plugin" scheduler_thread_create "$@"
}

destroy_thread() {
	rpc_cmd --plugin "$plugin" scheduler_thread_delete "$@"
}

active_thread() {
	rpc_cmd --plugin "$plugin" scheduler_thread_set_active "$@"
}

# Sample /proc/stat once per second and compute how each cpu's time was
# spent.
#   $1   - number of samples to collect (values <= 0 are treated as 1)
#   $2   - time of interest, one of the cpu_time_map keys (default: idle)
#   $3.. - cpus to sample
# Results are exposed through globals:
#   cpu_times[cpu]         - space-separated % of the requested time, one
#                            entry per sample
#   avg_cpu_time[cpu]      - average of the above
#   raw_samples_<cpu>[t]   - indirect reference to the raw counters of time t
#   avg_load_<cpu>[t]      - indirect reference to the per-sample % of time t
get_cpu_time() {
	xtrace_disable

	local interval=$1 cpu_time=${2:-idle} interval_count
	# Don't let shift fail when called with fewer than two arguments.
	shift $(($# < 2 ? $# : 2))
	local cpus=("$@") cpu
	local stats stat old_stats avg_load
	local sample_stats total_sample

	# Exposed for the caller
	local -g cpu_times=()
	local -g avg_cpu_time=()

	# cpu_time:
	# 0 - user (time spent in user mode)
	# 1 - nice (Time spent in user mode with low priority)
	# 2 - system (Time spent in system mode)
	# 3 - idle (Time spent in the idle task)
	# 4 - iowait (Time waiting for I/O to complete)
	# 5 - irq (Time servicing interrupts)
	# 6 - softirq (Time servicing softirqs)
	# 7 - steal (Stolen time)
	# 8 - guest (Time spent running a virtual CPU)
	# 9 - guest_nice (Time spent running a niced guest)

	local -gA cpu_time_map
	cpu_time_map["user"]=0
	cpu_time_map["nice"]=1
	cpu_time_map["system"]=2
	cpu_time_map["idle"]=3
	cpu_time_map["iowait"]=4
	cpu_time_map["irq"]=5
	cpu_time_map["softirq"]=6
	cpu_time_map["steal"]=7
	cpu_time_map["guest"]=8
	cpu_time_map["guest_nice"]=9

	# Clear up the env
	unset -v "${!stat_@}"
	unset -v "${!old_stats_@}"
	unset -v "${!avg_stat@}"
	unset -v "${!avg_load@}"
	unset -v "${!raw_samples@}"

	cpu_time=${cpu_time_map["$cpu_time"]}
	interval=$((interval <= 0 ? 1 : interval))
	# We skip first sample to have min 2 for stat comparison
	interval=$((interval + 1)) interval_count=0
	while ((interval_count++, --interval >= 0)); do
		for cpu in "${cpus[@]}"; do
			local -n old_stats=old_stats_$cpu
			local -n avg_load=avg_load_$cpu
			local -n raw_samples=raw_samples_$cpu

			sample_stats=() total_sample=0

			stats=($(get_cpu_stat "$cpu" all))
			if ((interval_count == 1)); then
				# Skip first sample
				old_stats=("${stats[@]}")
				continue
			fi
			# Delta of each counter since the previous sample.
			for stat in "${!stats[@]}"; do
				avg_load[stat]="stat_${stat}_${cpu}[@]"
				sample_stats[stat]=$((stats[stat] - old_stats[stat]))
				: $((total_sample += sample_stats[stat]))
			done
			# Keep the raw counter and its share (in %) of the sample.
			for stat in "${!stats[@]}"; do
				local -n avg_stat=stat_${stat}_${cpu}
				local -n raw_samples_ref=raw_samples_${stat}_${cpu}
				raw_samples[stat]="raw_samples_${stat}_${cpu}[@]"
				raw_samples_ref+=("${stats[stat]}")
				avg_stat+=($((sample_stats[stat] * 100 / (total_sample == 0 ? 1 : total_sample))))
			done
			old_stats=("${stats[@]}")
		done
		sleep 1s
	done

	# We collected % for each time. Now determine the avg % for requested time.
	local load stat_load
	for cpu in "${cpus[@]}"; do
		load=0
		local -n avg_load_cpu=avg_load_$cpu
		stat_load=("${!avg_load_cpu[cpu_time]}")
		for stat in "${stat_load[@]}"; do
			: $((load += stat))
		done
		cpu_times[cpu]=${stat_load[*]}
		avg_cpu_time[cpu]=$((load / ${#stat_load[@]}))
	done

	xtrace_restore
}

# Sample the cpus listed in the cpus_to_collect array for $1 seconds
# (default: 5) and set is_idle[cpu] to 1 or 0 for each of them. Returns 1 if
# cpus_to_collect is empty.
collect_cpu_idle() {
	((${#cpus_to_collect[@]} > 0)) || return 1

	local time=${1:-5}
	local cpu
	local samples
	local -g is_idle=()

	printf 'Collecting cpu idle stats (cpus: %s) for %u seconds...\n' \
		"${cpus_to_collect[*]}" "$time"

	get_cpu_time "$time" idle "${cpus_to_collect[@]}"

	local user_load
	for cpu in "${cpus_to_collect[@]}"; do
		samples=(${cpu_times[cpu]})
		printf '* cpu%u idle samples: %s (avg: %u%%)\n' \
			"$cpu" "${samples[*]}" "${avg_cpu_time[cpu]}"
		# Cores with polling reactors have 0% idle time,
		# while the ones in interrupt mode won't have 100% idle.
		# During the tests, polling reactors spend the major portion
		# of their cpu time in user mode. With that in mind, if the
		# general check for cpus's idleness fails, check what portion
		# of the cpu load falls into user mode. For the idle check
		# use the last sample. For the cpu load, compare user's raw
		# samples in SC_CLK_TCK context for a more detailed view.
		user_load=$(cpu_usage_clk_tck "$cpu" user)
		if ((samples[-1] >= 70)); then
			printf '* cpu%u is idle\n' "$cpu"
			is_idle[cpu]=1
		elif ((user_load <= 15)); then
			printf '* cpu%u not fully idle, but user load is low so passing\n' "$cpu"
			is_idle[cpu]=1
		else
			printf '* cpu%u is not idle\n' "$cpu"
			is_idle[cpu]=0
		fi
	done
}

# Print the usage (0-100) of cpu $1 over the last two raw samples gathered by
# get_cpu_time(), expressed against CLK_TCK.
#   $2 - user, nice, system or all (default: all)
# Only the usage value goes to stdout; details are logged to stderr.
# Returns 1 if no raw samples exist for the cpu.
cpu_usage_clk_tck() {
	local cpu=$1 time=${2:-all}
	local user nice system usage clk_delta=0

	# We should be called in get_cpu_time()'s environment.
	[[ -v raw_samples_$cpu ]] || return 1

	local -n raw_samples=raw_samples_$cpu
	user=("${!raw_samples[cpu_time_map["user"]]}")
	nice=("${!raw_samples[cpu_time_map["nice"]]}")
	system=("${!raw_samples[cpu_time_map["system"]]}")

	# Construct delta based on last two samples of a given time.
	# ;;& keeps testing the following patterns so "all" sums up everything.
	case "$time" in
		user | all) ((clk_delta += (user[-1] - user[-2]))) ;;&
		nice | all) ((clk_delta += (nice[-1] - nice[-2]))) ;;&
		system | all) ((clk_delta += (system[-1] - system[-2]))) ;;
		*) ;;
	esac
	# We assume 1s between each sample. See get_cpu_time().
	usage=$((100 * clk_delta / $(getconf CLK_TCK)))
	usage=$((usage > 100 ? 100 : usage))

	printf '%u' "$usage"
	printf '* cpu%u %s usage: %u\n' "$cpu" "$time" "$usage" >&2
	printf '* cpu%u user samples: %s\n' "$cpu" "${user[*]}" >&2
	printf '* cpu%u nice samples: %s\n' "$cpu" "${nice[*]}" >&2
	printf '* cpu%u system samples: %s\n' "$cpu" "${system[*]}" >&2
}

# Rebuild the global thread_cpus map (thread id -> cpu) from the reactors
# reported by framework_get_reactors, for every cpu in the cpus array.
# Returns 1 if cpus is empty or if no thread was found.
update_thread_cpus_map() {
	local cpu thread
	local -g thread_cpus=()
	local reactor_framework

	((${#cpus[@]} > 0)) || return 1

	get_thread_stats

	reactor_framework=$(rpc_cmd framework_get_reactors | jq -r '.reactors[]')
	for cpu in "${cpus[@]}"; do
		for thread in $(jq -r "select(.lcore == $cpu) | .lw_threads[].id" <<< "$reactor_framework"); do
			printf '* Thread %u (%s) on cpu%u\n' "$thread" "${thread_map[thread]}" "$cpu"
			thread_cpus[thread]=$cpu
		done
	done
	((${#thread_cpus[@]} > 0))
}