# Helpers shared by the scheduler tests: CPU/NUMA topology mapping, cpufreq
# inspection and a few thin RPC wrappers. This file is meant to be sourced.
#
# extglob is required by the "+([0-9])" patterns used below; nullglob makes
# those patterns expand to nothing when no entry matches.
shopt -s nullglob extglob

declare -r sysfs_system=/sys/devices/system
declare -r sysfs_cpu=$sysfs_system/cpu
declare -r sysfs_node=$sysfs_system/node

declare -r scheduler=$rootdir/test/event/scheduler/scheduler
declare -r plugin=scheduler_plugin

source "$rootdir/test/scheduler/cgroups.sh"

# Store every remaining argument in the array named by $1, using the value
# itself as the index (array[elem]=elem). Existing entries are kept.
# Arguments: $1 - name of the target array, $2.. - numeric elements
# Returns:   0 when called without any argument.
fold_list_onto_array() {
	local array=$1
	local elem

	shift || return 0

	# A nameref resolves the target through the regular dynamic scoping rules,
	# so a caller's local array is updated without resorting to eval.
	local -n _fold_target=$array

	for elem; do
		_fold_target[elem]=$elem
	done
}

# Print all arguments joined with commas.
fold_array_onto_string() {
	local cpus=("$@")

	local IFS=","
	echo "${cpus[*]}"
}

# Expand a cpu list read from a file into one cpu index per line.
# Arguments: $1 - path to a file holding a list such as "0-2,4,6-9"
# Outputs:   the expanded indexes in ascending order, one per line
# Returns:   0 (also when the list is empty, in which case nothing is printed)
parse_cpu_list() {
	local list=$1
	local elem elems cpus
	local start end

	# 0-2,4,6-9, etc.
	IFS="," read -ra elems < "$list"

	((${#elems[@]} > 0)) || return 0

	for elem in "${elems[@]}"; do
		if [[ $elem == *-* ]]; then
			start=${elem%-*} end=${elem#*-}
			while ((start <= end)); do
				cpus[start++]=$start
			done
		else
			cpus[elem]=$elem
		fi
	done
	# Only the indexes matter; using them also de-duplicates and sorts the list.
	printf '%u\n' "${!cpus[@]}"
}

# Record the cpus listed under a single NUMA node.
# Arguments: $1 - node index
# Globals written: node_<N>_cpu, node_<N>_core_<C>, cpu_core_map, cpu_node_map,
#                  cpus, nodes
map_cpus_node() {
	local node_idx=$1
	local -n _cpu_node_map=node_${node_idx}_cpu
	local cpu_idx core_idx

	for cpu_idx in $(parse_cpu_list "$sysfs_node/node$node_idx/cpulist"); do
		# The core mapping is recorded for online cpus only ...
		if is_cpu_online "$cpu_idx"; then
			core_idx=$(< "$sysfs_cpu/cpu$cpu_idx/topology/core_id")
			local -n _cpu_core_map=node_${node_idx}_core_${core_idx}
			_cpu_core_map+=("$cpu_idx") cpu_core_map[cpu_idx]=$core_idx
		fi
		# ... while the node mapping covers every cpu from the node's cpulist.
		_cpu_node_map+=("$cpu_idx") cpu_node_map[cpu_idx]=$node_idx
		cpus+=("$cpu_idx")
	done

	nodes[node_idx]=$node_idx
}

# Rebuild the global cpu/node maps from scratch by walking every node
# directory found under $sysfs_node.
map_cpus() {
	local -g cpus=()
	local -g nodes=()
	local -g cpu_node_map=()
	local -g cpu_core_map=()
	local -g core_node_map=()
	local node

	# Drop the per-node arrays left behind by a previous call.
	unset -v "${!node_@}"

	for node in "$sysfs_node/node"+([0-9]); do
		map_cpus_node "${node##*node}"
	done
}

# Print cpus recorded by map_cpus(), one per line.
# Arguments: $1 - optional node index, $2 - optional core index on that node
#   no arguments -> every entry of the cpus array
#   node only    -> entries of node_<node>_cpu
#   node + core  -> entries of node_<node>_core_<core>
# Returns:   1 when the selected list is empty.
get_cpus() {
	local node=$1
	local core=$2
	local _cpus

	if [[ -z $node ]]; then
		_cpus=("${cpus[@]}")
	elif [[ -n $core ]]; then
		local -n _core_cpus=node_${node}_core_${core}
		_cpus=("${_core_cpus[@]}")
	else
		local -n _node_cpus=node_${node}_cpu
		_cpus=("${_node_cpus[@]}")
	fi
	((${#_cpus[@]} > 0)) || return 1
	printf '%u\n' "${_cpus[@]}"
}

# Print the cpus listed in $sysfs_cpu/isolated (nothing if the file is missing).
get_isolated_cpus() {
	[[ -e $sysfs_cpu/isolated ]] || return 0
	parse_cpu_list "$sysfs_cpu/isolated"
}

# Print the cpus listed in $sysfs_cpu/offline (nothing if the file is missing).
get_offline_cpus() {
	[[ -e $sysfs_cpu/offline ]] || return 0
	parse_cpu_list "$sysfs_cpu/offline"
}

# Print the cpus listed in $sysfs_cpu/online (nothing if the file is missing).
get_online_cpus() {
	[[ -e $sysfs_cpu/online ]] || return 0
	parse_cpu_list "$sysfs_cpu/online"
}

# Succeed when cpu $1 is part of the list printed by get_online_cpus().
is_cpu_online() {
	local online

	# Word splitting of the list is intended here.
	fold_list_onto_array online $(get_online_cpus)
	[[ -v online[$1] ]]
}

# Logical negation of is_cpu_online().
is_cpu_offline() {
	! is_cpu_online "$1"
}

# Bring cpu $1 online unless it already is. Fails when the cpu has no
# "online" attribute to write to.
online_cpu() {
	is_cpu_offline "$1" || return 0
	[[ -e $sysfs_cpu/cpu$1/online ]] && echo 1 > "$sysfs_cpu/cpu$1/online"
}

# Take cpu $1 offline unless it already is. Fails when the cpu has no
# "online" attribute to write to.
offline_cpu() {
	is_cpu_online "$1" || return 0
	[[ -e $sysfs_cpu/cpu$1/online ]] && echo 0 > "$sysfs_cpu/cpu$1/online"
}

# Print a hexadecimal bitmask with one bit set per cpu index given as argument.
mask_cpus() {
	local cpu
	local mask=0

	for cpu; do
		((mask |= 1 << cpu))
	done
	printf '0x%x\n' "$mask"
}

# Fold the offline cpus plus any cpus passed as arguments into the global
# "denied" array.
denied_list() {
	local -g denied

	fold_list_onto_array denied $(get_offline_cpus) "$@"
}

# Remove from the "allowed" array every cpu which has an entry in "denied".
filter_allowed_list() {
	local cpu

	for cpu in "${!allowed[@]}"; do
		if [[ -n ${denied[cpu]} ]]; then
			unset -v "allowed[cpu]"
		fi
	done
}

# Build the global "allowed" array of cpus.
# Arguments: $1 - number of cpus to pick (default 4), $2 - node (default 0)
# Isolated cpus are always folded in first; on the initial call they raise
# the limit by their count. Cpus are then added core by core (all cpus of
# node_<node>_core_<core>) until the limit is reached or the node's cpu list
# is exhausted, and cpus present in "denied" are filtered out.
allowed_list() {
	local max=${1:-4}
	local node=${2:-0}
	# On a recursive call this picks up the caller's cpu_count, so the walk
	# over the node's cpus continues where the previous round stopped.
	local cpu_count=${cpu_count:--1}

	local -g allowed

	fold_list_onto_array allowed $(get_isolated_cpus)

	if ((cpu_count < 0 && ${#allowed[@]} > 0)); then
		((max += ${#allowed[@]}))
	fi

	local -n node_cpu_ref=node_${node}_cpu

	while ((${#allowed[@]} < max && ++cpu_count < ${#node_cpu_ref[@]})); do
		fold_list_onto_array allowed $(get_cpus "$node" "${cpu_core_map[node_cpu_ref[cpu_count]]}")
	done

	filter_allowed_list

	if ((${#allowed[@]} == max)); then
		return 0
	elif ((cpu_count == ${#node_cpu_ref[@]})); then
		return 0
	else
		# Filtering removed some cpus - try to pick up replacements.
		allowed_list "$max" "$node"
	fi
}

# Print the cpus from the Cpus_allowed_list field of /proc/<pid>/status.
# Arguments: $1 - pid (defaults to the current shell)
# Returns:   1 when /proc/<pid>/status does not exist, 0 otherwise
get_proc_cpu_affinity() {
	xtrace_disable

	local pid=${1:-$$}
	local status val
	local rc=0

	# Every path below falls through to xtrace_restore, so that the call to
	# xtrace_disable above is always balanced.
	if [[ ! -e /proc/$pid/status ]]; then
		rc=1
	else
		while IFS=":"$'\t' read -r status val; do
			if [[ $status == Cpus_allowed_list ]]; then
				parse_cpu_list <(echo "$val")
				break
			fi
		done < "/proc/$pid/status"
	fi

	xtrace_restore
	return "$rc"
}

# Snapshot the cpufreq setup of every cpu exposing a cpufreq directory into
# global arrays indexed by cpu. The available governors/frequencies are kept
# in per-cpu arrays (available_governors_cpu_<N>, available_freqs_cpu_<N>);
# cpufreq_available_governors[N] and cpufreq_available_freqs[N] hold their
# names in the "name[@]" form.
map_cpufreq() {
	# This info is used to cross-reference current cpufreq setup with
	# what DPDK's governor actually puts in place.

	local -g cpufreq_drivers=()
	local -g cpufreq_governors=()
	local -g cpufreq_base_freqs=()
	local -g cpufreq_max_freqs=()
	local -g cpufreq_min_freqs=()
	local -g cpufreq_cur_freqs=()
	local -g cpufreq_is_turbo=()
	local -g cpufreq_available_freqs=()
	local -g cpufreq_available_governors=()
	local -g cpufreq_high_prio=()
	local -g cpufreq_non_turbo_ratio=()
	local -g cpufreq_setspeed=()
	local -g cpuinfo_max_freqs=()
	local -g cpuinfo_min_freqs=()
	local -g turbo_enabled=0
	local cpu cpu_idx
	# Scratch variables used by the driver-specific branches below.
	local non_turbo_ratio base_max_freq num_freqs freq

	for cpu in "$sysfs_cpu/cpu"+([0-9]); do
		cpu_idx=${cpu##*cpu}
		[[ -e $cpu/cpufreq ]] || continue
		cpufreq_drivers[cpu_idx]=$(< "$cpu/cpufreq/scaling_driver")
		cpufreq_governors[cpu_idx]=$(< "$cpu/cpufreq/scaling_governor")

		# In case HWP is on
		if [[ -e $cpu/cpufreq/base_frequency ]]; then
			cpufreq_base_freqs[cpu_idx]=$(< "$cpu/cpufreq/base_frequency")
		fi

		cpufreq_cur_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_cur_freq")
		cpufreq_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_max_freq")
		cpufreq_min_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_min_freq")

		local -n available_governors=available_governors_cpu_${cpu_idx}
		cpufreq_available_governors[cpu_idx]="available_governors_cpu_${cpu_idx}[@]"
		available_governors=($(< "$cpu/cpufreq/scaling_available_governors"))

		local -n available_freqs=available_freqs_cpu_${cpu_idx}
		cpufreq_available_freqs[cpu_idx]="available_freqs_cpu_${cpu_idx}[@]"

		case "${cpufreq_drivers[cpu_idx]}" in
			acpi-cpufreq)
				available_freqs=($(< "$cpu/cpufreq/scaling_available_frequencies"))
				# A first entry exactly 1000 above the second one is treated
				# as the turbo frequency.
				if ((available_freqs[0] - 1000 == available_freqs[1])); then
					cpufreq_is_turbo[cpu_idx]=1
				else
					cpufreq_is_turbo[cpu_idx]=0
				fi
				cpufreq_setspeed[cpu_idx]=$(< "$cpu/cpufreq/scaling_setspeed")
				;;
			intel_pstate | intel_cpufreq) # active or passive
				non_turbo_ratio=$("$testdir/rdmsr.pl" "$cpu_idx" 0xce)
				cpuinfo_min_freqs[cpu_idx]=$(< "$cpu/cpufreq/cpuinfo_min_freq")
				cpuinfo_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/cpuinfo_max_freq")
				cpufreq_non_turbo_ratio[cpu_idx]=$(((non_turbo_ratio >> 8) & 0xff))
				if ((cpufreq_base_freqs[cpu_idx] / 100000 > cpufreq_non_turbo_ratio[cpu_idx])); then
					cpufreq_high_prio[cpu_idx]=1
					base_max_freq=${cpufreq_base_freqs[cpu_idx]}
				else
					cpufreq_high_prio[cpu_idx]=0
					base_max_freq=$((cpufreq_non_turbo_ratio[cpu_idx] * 100000))
				fi
				num_freqs=$(((base_max_freq - cpuinfo_min_freqs[cpu_idx]) / 100000 + 1))
				if ((base_max_freq < cpuinfo_max_freqs[cpu_idx])); then
					((num_freqs += 1))
					cpufreq_is_turbo[cpu_idx]=1
				else
					cpufreq_is_turbo[cpu_idx]=0
				fi
				# Synthesize the list: optional turbo entry first, then
				# 100000-wide steps going down from base_max_freq.
				available_freqs=()
				for ((freq = 0; freq < num_freqs; freq++)); do
					if ((freq == 0 && cpufreq_is_turbo[cpu_idx] == 1)); then
						available_freqs[freq]=$((base_max_freq + 1))
					else
						available_freqs[freq]=$((base_max_freq - (freq - cpufreq_is_turbo[cpu_idx]) * 100000))
					fi
				done
				;;
			cppc_cpufreq)
				# NOTE: scaling_min_freqs, scaling_max_freqs, nominal_perf and
				# highest_perf are not declared by this function, hence they
				# end up in the global scope.
				cpufreq_setspeed[cpu_idx]=$(< "$cpu/cpufreq/scaling_setspeed")
				scaling_min_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_min_freq")
				scaling_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/scaling_max_freq")
				cpuinfo_max_freqs[cpu_idx]=$(< "$cpu/cpufreq/cpuinfo_max_freq")
				nominal_perf[cpu_idx]=$(< "$cpu/acpi_cppc/nominal_perf")
				highest_perf[cpu_idx]=$(< "$cpu/acpi_cppc/highest_perf")

				# The unit of highest_perf and nominal_perf differs between arm platforms.
				# For highest_perf, it may be 300 or 3000000; both mean 3.0GHz.
				if ((highest_perf[cpu_idx] > nominal_perf[cpu_idx] && (\
					highest_perf[cpu_idx] == cpuinfo_max_freqs[cpu_idx] || \
					highest_perf[cpu_idx] * 10000 == cpuinfo_max_freqs[cpu_idx]))); then
					cpufreq_is_turbo[cpu_idx]=1
				else
					cpufreq_is_turbo[cpu_idx]=0
				fi

				if ((nominal_perf[cpu_idx] < 10000)); then
					nominal_perf[cpu_idx]=$((nominal_perf[cpu_idx] * 10000))
				fi

				num_freqs=$(((nominal_perf[cpu_idx] - scaling_min_freqs[cpu_idx]) / 100000 + 1 + \
					cpufreq_is_turbo[cpu_idx]))

				available_freqs=()
				for ((freq = 0; freq < num_freqs; freq++)); do
					if ((freq == 0 && cpufreq_is_turbo[cpu_idx] == 1)); then
						available_freqs[freq]=$((scaling_max_freqs[cpu_idx]))
					else
						available_freqs[freq]=$((nominal_perf[cpu_idx] - (\
							freq - cpufreq_is_turbo[cpu_idx]) * 100000))
					fi
				done
				;;
		esac
	done
	if [[ -e $sysfs_cpu/cpufreq/boost ]]; then
		turbo_enabled=$(< "$sysfs_cpu/cpufreq/boost")
	elif [[ -e $sysfs_cpu/intel_pstate/no_turbo ]]; then
		turbo_enabled=$((!$(< "$sysfs_cpu/intel_pstate/no_turbo")))
	fi
}

# Set the frequency of a cpu according to its cpufreq driver.
# Arguments: $1 - cpu, $2 - min frequency, $3 - max frequency (optional)
# Returns:   1 when the cpu has no entry in cpufreq_drivers or no min
#            frequency was given
set_cpufreq() {
	local cpu=$1
	local min_freq=$2
	local max_freq=$3
	local cpufreq=$sysfs_cpu/cpu$cpu/cpufreq

	# Map the cpufreq info first
	[[ -n ${cpufreq_drivers[cpu]} ]] || return 1
	[[ -n $min_freq ]] || return 1

	case "${cpufreq_drivers[cpu]}" in
		acpi-cpufreq)
			# scaling_setspeed is written below, so switch to the userspace
			# governor first. The attribute is "scaling_governor" (singular),
			# same as the one read in map_cpufreq().
			if [[ ${cpufreq_governors[cpu]} != userspace ]]; then
				echo "userspace" > "$cpufreq/scaling_governor"
			fi
			echo "$min_freq" > "$cpufreq/scaling_setspeed"
			;;
		intel_pstate | intel_cpufreq)
			if ((min_freq <= cpufreq_max_freqs[cpu])); then
				echo "$min_freq" > "$cpufreq/scaling_min_freq"
			fi
			if [[ -n $max_freq ]] && ((max_freq >= min_freq)); then
				echo "$max_freq" > "$cpufreq/scaling_max_freq"
			fi
			;;
	esac
}

# Switch cpu $1 to governor $2 unless it is already the active one.
set_cpufreq_governor() {
	local cpu=$1
	local governor=$2
	local cpufreq=$sysfs_cpu/cpu$cpu/cpufreq

	if [[ $(< "$cpufreq/scaling_governor") != "$governor" ]]; then
		echo "$governor" > "$cpufreq/scaling_governor"
	fi
}

# Start the given app (with --wait-for-rpc appended) in the /cpuset/spdk
# cgroup, select the dynamic scheduler and let the framework finish its init.
# An app still running under $spdk_pid is killed first.
exec_under_dynamic_scheduler() {
	if [[ -e /proc/$spdk_pid/status ]]; then
		killprocess "$spdk_pid"
	fi
	exec_in_cgroup "/cpuset/spdk" "$@" --wait-for-rpc &
	spdk_pid=$!
	# Give some time for the app to init itself
	waitforlisten "$spdk_pid"
	"$rootdir/scripts/rpc.py" framework_set_scheduler dynamic
	"$rootdir/scripts/rpc.py" framework_start_init
}

# Refresh the "busy" and "idle" arrays (see _get_thread_stats) with xtrace
# turned off for the duration of the call.
get_thread_stats() {
	xtrace_disable
	_get_thread_stats busy idle
	xtrace_restore
}

# Query thread_get_stats and store the results indexed by thread id.
# Arguments: $1 - name of the array receiving .busy of each thread
#            $2 - name of the array receiving .idle of each thread
# Globals written: thread_map (thread id -> thread name)
_get_thread_stats() {
	local list_busy=$1
	local list_idle=$2
	local thread threads stats

	stats=$(rpc_cmd thread_get_stats | jq -r '.threads[]')
	threads=($(jq -r '.id' <<< "$stats"))

	for thread in "${threads[@]}"; do
		eval "${list_busy}[$thread]=\$(jq -r \"select(.id == $thread) | .busy\" <<< \$stats)"
		eval "${list_idle}[$thread]=\$(jq -r \"select(.id == $thread) | .idle\" <<< \$stats)"
		thread_map[thread]=$(jq -r "select(.id == $thread) | .name" <<< "$stats")
	done
}

# Print counters from the "cpu<idx>" line of /proc/stat.
# Arguments: $1 - cpu index
#            $2 - "idle" (4th counter only) or "all" (every counter, one per
#                 line); anything else prints nothing
get_cpu_stat() {
	local cpu_idx=$1
	local stat=$2 stats astats
	# Keep the read target local so the caller's "cpu" is left untouched.
	local cpu

	while read -r cpu stats; do
		[[ $cpu == "cpu$cpu_idx" ]] && astats=($stats)
	done < /proc/stat

	case "$stat" in
		idle) echo "${astats[3]}" ;;
		all) printf '%u\n' "${astats[@]}" ;;
		*) ;;
	esac
}

# Thin wrappers around the RPCs provided by the scheduler plugin; all
# arguments are passed through.
create_thread() {
	rpc_cmd --plugin "$plugin" scheduler_thread_create "$@"
}

destroy_thread() {
	rpc_cmd --plugin "$plugin" scheduler_thread_delete "$@"
}

active_thread() {
	rpc_cmd --plugin "$plugin" scheduler_thread_set_active "$@"
}

# Sample the cpu time counters of the given cpus once per second and compute,
# for the requested counter, its share of every sampling period.
# Arguments: $1   - number of samples to take (values <= 0 are treated as 1)
#            $2   - counter name, see the list below (unknown names fall back
#                   to idle)
#            $3.. - cpus to sample
# Globals written:
#   cpu_times[cpu]    - space separated per-sample percentages
#   avg_cpu_time[cpu] - average of these percentages
#   stat_<idx>_<cpu>, avg_load_<cpu>, old_stats_<cpu> - intermediate arrays,
#                       reset at the beginning of each call
get_cpu_time() {
	xtrace_disable

	local interval=$1 cpu_time=$2 interval_count
	shift 2
	local cpus=("$@") cpu
	local stats stat old_stats avg_load
	local total_sample sample_stats

	# Exposed for the caller
	local -g cpu_times=()
	local -g avg_cpu_time=()

	# cpu_time:
	# 0 - user (time spent in user mode)
	# 1 - nice (Time spent in user mode with low priority)
	# 2 - system (Time spent in system mode)
	# 3 - idle (Time spent in the idle task)
	# 4 - iowait (Time waiting for I/O to complete)
	# 5 - irq (Time servicing interrupts)
	# 6 - softirq (Time servicing softirqs)
	# 7 - steal (Stolen time)
	# 8 - guest (Time spent running a virtual CPU)
	# 9 - guest_nice (Time spent running a niced guest)

	local -A cpu_time_map
	cpu_time_map["user"]=0
	cpu_time_map["nice"]=1
	cpu_time_map["system"]=2
	cpu_time_map["idle"]=3
	cpu_time_map["iowait"]=4
	cpu_time_map["irq"]=5
	cpu_time_map["softirq"]=6
	cpu_time_map["steal"]=7
	cpu_time_map["guest"]=8
	cpu_time_map["guest_nice"]=9

	# Clear up the env. The per-cpu arrays created below are named
	# old_stats_<cpu>, hence the "old_stats_" prefix.
	unset -v "${!stat_@}"
	unset -v "${!old_stats_@}"
	unset -v "${!avg_stat@}"
	unset -v "${!avg_load@}"

	cpu_time=${cpu_time_map["$cpu_time"]:-3}
	interval=$((interval <= 0 ? 1 : interval))
	# We skip first sample to have min 2 for stat comparison
	interval=$((interval + 1)) interval_count=0
	while ((interval_count++, --interval >= 0)); do
		for cpu in "${cpus[@]}"; do
			local -n old_stats=old_stats_$cpu
			local -n avg_load=avg_load_$cpu
			sample_stats=() total_sample=0

			stats=($(get_cpu_stat "$cpu" all))
			if ((interval_count == 1)); then
				# Skip first sample
				old_stats=("${stats[@]}")
				continue
			fi
			# Delta of each counter since the previous sample, plus their sum.
			for stat in "${!stats[@]}"; do
				avg_load[stat]="stat_${stat}_${cpu}[@]"
				sample_stats[stat]=$((stats[stat] - old_stats[stat]))
				: $((total_sample += sample_stats[stat]))
			done
			# Turn each delta into a percentage of the whole period.
			for stat in "${!stats[@]}"; do
				local -n avg_stat=stat_${stat}_${cpu}
				avg_stat+=($((sample_stats[stat] * 100 / (total_sample == 0 ? 1 : total_sample))))
			done
			old_stats=("${stats[@]}")
		done
		sleep 1s
	done

	# We collected % for each time. Now determine the avg % for requested time.
	local load stat_load
	for cpu in "${cpus[@]}"; do
		load=0
		local -n avg_load_cpu=avg_load_$cpu
		# avg_load_<cpu>[idx] holds "stat_<idx>_<cpu>[@]", so the indirect
		# expansion yields all the samples collected for that counter.
		stat_load=("${!avg_load_cpu[cpu_time]}")
		for stat in "${stat_load[@]}"; do
			: $((load += stat))
		done
		cpu_times[cpu]=${stat_load[*]}
		avg_cpu_time[cpu]=$((load / ${#stat_load[@]}))
	done

	xtrace_restore
}

# Collect idle samples for every cpu listed in cpus_to_collect and mark each
# of them as idle (is_idle[cpu]=1) or not (is_idle[cpu]=0).
# Arguments: $1 - number of samples to take (default 5)
# Returns:   1 when cpus_to_collect is empty
collect_cpu_idle() {
	((${#cpus_to_collect[@]} > 0)) || return 1

	local time=${1:-5}
	local cpu
	local samples
	local -g is_idle=()

	printf 'Collecting cpu idle stats (cpus: %s) for %u seconds...\n' \
		"${cpus_to_collect[*]}" "$time"

	get_cpu_time "$time" idle "${cpus_to_collect[@]}"

	for cpu in "${cpus_to_collect[@]}"; do
		samples=(${cpu_times[cpu]})
		printf '* cpu%u idle samples: %s (avg: %u%%)\n' \
			"$cpu" "${samples[*]}" "${avg_cpu_time[cpu]}"
		# Cores with polling reactors have 0% idle time,
		# while the ones in interrupt mode won't have 100% idle.
		# Work can potentially be scheduled to the core by the kernel;
		# to prevent that from affecting tests, set a reasonably high idle limit.
		# Consider last sample
		if ((samples[-1] >= 70)); then
			printf '* cpu%u is idle\n' "$cpu"
			is_idle[cpu]=1
		else
			printf '*cpu%u is not idle\n' "$cpu"
			is_idle[cpu]=0
		fi
	done
}

# Rebuild the global thread_cpus array (thread id -> cpu) from the output of
# framework_get_reactors, for every cpu found in the cpus array.
# Returns: 1 when cpus is empty or no thread was found on any of these cpus
update_thread_cpus_map() {
	local cpu thread
	local -g thread_cpus=()
	local reactor_framework

	((${#cpus[@]} > 0)) || return 1

	# Refreshes thread_map, which provides the thread names printed below.
	get_thread_stats

	reactor_framework=$(rpc_cmd framework_get_reactors | jq -r '.reactors[]')
	for cpu in "${cpus[@]}"; do
		for thread in $(jq -r "select(.lcore == $cpu) | .lw_threads[].id" <<< "$reactor_framework"); do
			printf '* Thread %u (%s) on cpu%u\n' "$thread" "${thread_map[thread]}" "$cpu"
			thread_cpus[thread]=$cpu
		done
	done
	((${#thread_cpus[@]} > 0))
}
