#!/usr/bin/env bash
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2018 Intel Corporation
# All rights reserved.
#
source "$rootdir/test/dd/common.sh"

function discover_bdevs() {
	local rootdir=$1
	local config_file=$2
	local wait_for_spdk_bdev=90
	local rpc_server=/var/tmp/spdk-discover-bdevs.sock

	if [ ! -e $config_file ]; then
		echo "Invalid Configuration File: $config_file"
		return 1
	fi

	# Start the bdev service to query for the list of available bdevs.
	$rootdir/test/app/bdev_svc/bdev_svc -r $rpc_server -i 0 \
		--json $config_file &> /dev/null &
	stubpid=$!
	while ! [ -e /var/run/spdk_bdev0 ]; do
		# When this counter hits zero, the arithmetic expression evaluates to 0,
		# which is a failing status, so errexit aborts the test.
		((wait_for_spdk_bdev--))
		sleep 1
	done

	# Get all of the bdevs
	$rootdir/scripts/rpc.py -s "$rpc_server" bdev_get_bdevs

	# Shut down the bdev service
	kill $stubpid
	wait $stubpid
	rm -f /var/run/spdk_bdev0
}

function get_disk_cfg() {
	grep -vP "^\s*#" "$DISKCFG"
}
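
# Example $DISKCFG contents (hypothetical BDFs): one PCIe address per line;
# lines starting with "#" are ignored by get_disk_cfg() above:
#   0000:1a:00.0
#   0000:1b:00.0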

function create_spdk_bdev_conf() {
	local disk_cfg
	local bdev_io_cache_size=$1
	local bdev_io_pool_size=$2
	local bdev_json_cfg=()
	local bdev_opts=()
	local i

	disk_cfg=($(get_disk_cfg))

	if [[ -n "$bdev_io_cache_size" ]]; then
		bdev_opts+=("\"bdev_io_cache_size\": $bdev_io_cache_size")
	fi

	if [[ -n "$bdev_io_pool_size" ]]; then
		bdev_opts+=("\"bdev_io_pool_size\": $bdev_io_pool_size")
	fi

	local IFS=","
	if [[ ${#bdev_opts[@]} -gt 0 ]]; then
		bdev_json_cfg+=("$(
			cat <<- JSON
				{
					"method": "bdev_set_options",
					"params": {
						${bdev_opts[*]}
					}
				}
			JSON
		)")
	fi

	for i in "${!disk_cfg[@]}"; do
		bdev_json_cfg+=("$(
			cat <<- JSON
				{
					"method": "bdev_nvme_attach_controller",
					"params": {
						"trtype": "PCIe",
						"name": "Nvme${i}",
						"traddr": "${disk_cfg[i]}"
					}
				}
			JSON
		)")
	done

	jq -r '.' <<- JSON > $testdir/bdev.conf
		{
			"subsystems": [
				{
					"subsystem": "bdev",
					"config": [
						${bdev_json_cfg[*]},
						{
							"method": "bdev_wait_for_examine"
						}
					]
				}
			]
		}
	JSON
}

function is_bdf_not_mounted() {
	local bdf=$1
	local blkname
	local mountpoints
	blkname=$(ls -l /sys/block/ | grep $bdf | awk '{print $9}')
	mountpoints=$(lsblk /dev/$blkname --output MOUNTPOINT -n | wc -w)
	return $mountpoints
}

function get_cores() {
	local cpu_list="$1"
	for cpu in ${cpu_list//,/ }; do
		echo $cpu
	done
}

function get_cores_numa_node() {
	local cores=$1
	for core in $cores; do
		lscpu -p=cpu,node | grep "^$core\b" | awk -F ',' '{print $2}'
	done
}

function get_numa_node() {
	local plugin=$1
	local disks=$2
	if [[ "$plugin" =~ "nvme" ]]; then
		for bdf in $disks; do
			local driver
			driver=$(grep DRIVER /sys/bus/pci/devices/$bdf/uevent | awk -F"=" '{print $2}')
			# Omit blocked devices (i.e. not bound to a driver by the setup.sh script)
			if [ "$driver" = "vfio-pci" ] || [ "$driver" = "uio_pci_generic" ]; then
				cat /sys/bus/pci/devices/$bdf/numa_node
			fi
		done
	elif [[ "$plugin" =~ "bdev" ]]; then
		local bdevs
		bdevs=$(discover_bdevs $rootdir $testdir/bdev.conf)
		for name in $disks; do
			local bdev_bdf
			bdev_bdf=$(jq -r ".[] | select(.name==\"$name\").driver_specific.nvme[].pci_address" <<< "$bdevs")
			cat /sys/bus/pci/devices/$bdev_bdf/numa_node
		done
	else
		for name in $disks; do
			cat "/sys/block/$name/device/numa_node"
		done
	fi
}

function get_disks() {
	local plugin=$1
	local disk_cfg=($(get_disk_cfg))

	if [[ "$plugin" =~ "nvme" ]]; then
		# A PCI BDF address is enough for nvme-perf and nvme-fio-plugin,
		# so just print the addresses from the configuration file
		echo "${disk_cfg[*]}"
	elif [[ "$plugin" =~ "bdev" ]]; then
		# Generate NvmeXn1 bdev names for bdev-perf and bdev-fio-plugin
		local disk_no
		disk_no=${#disk_cfg[@]}
		eval echo "Nvme{0..$((disk_no - 1))}n1"
	else
		# Find nvme block devices and only use the ones which are not mounted
		for bdf in "${disk_cfg[@]}"; do
			if is_bdf_not_mounted $bdf; then
				local blkname
				blkname=$(ls -l /sys/block/ | grep $bdf | awk '{print $9}')
				echo $blkname
			fi
		done
	fi
}

function get_disks_on_numa() {
	local devs=($1)
	local numas=($2)
	local numa_no=$3
	local disks_on_numa=0
	local i

	for ((i = 0; i < ${#devs[@]}; i++)); do
		if [ ${numas[$i]} = $numa_no ]; then
			disks_on_numa=$((disks_on_numa + 1))
		fi
	done
	echo $disks_on_numa
}
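
# Usage sketch for get_disks_on_numa() (hypothetical values): with devices
# nvme0n1 and nvme1n1 living on NUMA nodes 0 and 1 respectively, counting
# the devices on node 0 yields one match:
#   get_disks_on_numa "nvme0n1 nvme1n1" "0 1" 0   # prints "1"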

function create_fio_config() {
	local disk_no=$1
	local plugin=$2
	local disks=($3)
	local disks_numa=($4)
	local cores=($5)
	local total_disks=${#disks[@]}
	local fio_job_section=()
	local num_cores=${#cores[@]}
	local disks_per_core=$((disk_no / num_cores))
	local disks_per_core_mod=$((disk_no % num_cores))
	local cores_numa
	cores_numa=($(get_cores_numa_node "${cores[*]}"))

	# The following part of this function still leverages global variables a lot.
	# It mixes local variables passed as arguments with globals, which is messy.
	# TODO: Make the variable usage consistent; aim for using only local
	# variables to get rid of the globals as much as possible.
	desc="\"Test io_plugin=$PLUGIN Blocksize=${BLK_SIZE} Workload=$RW MIX=${MIX} qd=${IODEPTH}\""
	cp "$testdir/config.fio.tmp" "$testdir/config.fio"
	cat <<- EOF >> $testdir/config.fio
		description=$desc

		rw=$RW
		rwmixread=$MIX
		bs=$BLK_SIZE
		runtime=$RUNTIME
		ramp_time=$RAMP_TIME
		numjobs=$NUMJOBS
		log_avg_msec=$SAMPLING_INT
	EOF

	if $GTOD_REDUCE; then
		echo "gtod_reduce=1" >> $testdir/config.fio
	fi

	if [[ $PLUGIN =~ "uring" || $PLUGIN =~ "xnvme" ]]; then
		cat <<- EOF >> $testdir/config.fio
			fixedbufs=1
			hipri=1
			registerfiles=1
			sqthread_poll=1
		EOF
	fi

	if [[ "$IO_BATCH_SUBMIT" -gt 0 ]]; then
		echo "iodepth_batch_submit=$IO_BATCH_SUBMIT" >> $testdir/config.fio
	fi

	if [[ "$IO_BATCH_COMPLETE" -gt 0 ]]; then
		echo "iodepth_batch_complete=$IO_BATCH_COMPLETE" >> $testdir/config.fio
	fi

	for i in "${!cores[@]}"; do
		local m=0 # Counter of disks per NUMA node
		local n=0 # Counter of all disks in test
		core_numa=${cores_numa[$i]}

		total_disks_per_core=$disks_per_core
		# Check how many "stray" disks are still unassigned to CPU cores.
		# Assign one disk to the current CPU core and subtract it from the
		# total of unassigned disks.
		if [[ "$disks_per_core_mod" -gt "0" ]]; then
			total_disks_per_core=$((disks_per_core + 1))
			disks_per_core_mod=$((disks_per_core_mod - 1))
		fi
		# The SPDK fio plugin supports submitting/completing I/Os to multiple SSDs from a single thread.
		# Therefore, the per-thread queue depth is set to the desired IODEPTH per device
		# multiplied by the number of devices per thread.
		QD=$IODEPTH
		if [[ "$NOIOSCALING" == false ]]; then
			QD=$((IODEPTH * total_disks_per_core))
		fi

		if [[ "$FIO_FNAME_STRATEGY" == "group" ]]; then
			fio_job_section+=("")
			fio_job_section+=("[filename${i}]")
			fio_job_section+=("iodepth=$QD")
			fio_job_section+=("cpus_allowed=${cores[$i]} #CPU NUMA Node ${cores_numa[$i]}")
		fi

		while [[ "$m" -lt "$total_disks_per_core" ]]; do
			# Add a disk to the job section only if its NUMA node matches the
			# NUMA node of the currently selected CPU
			if [[ "${disks_numa[$n]}" == "$core_numa" ]]; then
				if [[ "$FIO_FNAME_STRATEGY" == "split" ]]; then
					fio_job_section+=("")
					fio_job_section+=("[filename${m}-${cores[$i]}]")
					fio_job_section+=("iodepth=$QD")
					fio_job_section+=("cpus_allowed=${cores[$i]} #CPU NUMA Node ${cores_numa[$i]}")
				fi

				if [[ "$plugin" == "spdk-plugin-nvme" ]]; then
					fio_job_section+=("filename=trtype=PCIe traddr=${disks[$n]//:/.} ns=1 #NVMe NUMA Node ${disks_numa[$n]}")
				elif [[ "$plugin" == "spdk-plugin-bdev" || "$plugin" == "spdk-plugin-bdev-xnvme" ]]; then
					fio_job_section+=("filename=${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}")
				elif [[ "$plugin" =~ "kernel" ]]; then
					fio_job_section+=("filename=/dev/${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}")
				fi
				m=$((m + 1))

				# Mark the n-th disk's NUMA node as "x" so that later loop
				# iterations treat the disk as already claimed
				disks_numa[n]="x"
			fi
			n=$((n + 1))

			# If there are no more disks whose NUMA node matches the CPU's NUMA
			# node, switch to the other NUMA node, go back to the start of the
			# loop and try again.
			if [[ $n -ge $total_disks ]]; then
				echo "WARNING! Cannot assign any more NVMes for CPU ${cores[$i]}"
				echo "NVMe assignment for this CPU will be cross-NUMA."
				if [[ "$core_numa" == "1" ]]; then
					core_numa=0
				else
					core_numa=1
				fi
				n=0
			fi
		done
	done

	printf "%s\n" "${fio_job_section[@]}" >> $testdir/config.fio
	echo "INFO: Generated fio configuration file:"
	cat $testdir/config.fio
}
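
# Worked example of the queue depth scaling done in create_fio_config()
# (hypothetical numbers): with IODEPTH=32, NOIOSCALING=false and 4 disks
# assigned to a single CPU core, the job section gets iodepth=128 (32 * 4),
# so each of the 4 devices still sees an effective queue depth of 32.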

function bc() {
	$(type -P bc) -l <<< "scale=3; $1"
}

function get_results() {
	local iops bw stdev
	local mean_lat p90_lat p99_lat p99_99_lat
	local mean_slat mean_clat
	local reads_pct
	local writes_pct

	reads_pct=$(bc "$1 / 100")
	writes_pct=$(bc "1 - $reads_pct")

	iops=$(jq -r '.jobs[] | .read.iops + .write.iops' $TMP_RESULT_FILE)
	bw=$(jq -r ".jobs[] | (.read.bw + .write.bw)" $TMP_RESULT_FILE)
	mean_lat=$(jq -r ".jobs[] | (.read.lat_ns.mean * $reads_pct + .write.lat_ns.mean * $writes_pct)/1000" $TMP_RESULT_FILE)
	p90_lat=$(jq -r ".jobs[] | ((.read.clat_ns.percentile.\"90.000000\" // 0) * $reads_pct + (.write.clat_ns.percentile.\"90.000000\" // 0) * $writes_pct)/1000" $TMP_RESULT_FILE)
	p99_lat=$(jq -r ".jobs[] | ((.read.clat_ns.percentile.\"99.000000\" // 0) * $reads_pct + (.write.clat_ns.percentile.\"99.000000\" // 0) * $writes_pct)/1000" $TMP_RESULT_FILE)
	p99_99_lat=$(jq -r ".jobs[] | ((.read.clat_ns.percentile.\"99.990000\" // 0) * $reads_pct + (.write.clat_ns.percentile.\"99.990000\" // 0) * $writes_pct)/1000" $TMP_RESULT_FILE)
	stdev=$(jq -r ".jobs[] | (.read.clat_ns.stddev * $reads_pct + .write.clat_ns.stddev * $writes_pct)/1000" $TMP_RESULT_FILE)
	mean_slat=$(jq -r ".jobs[] | (.read.slat_ns.mean * $reads_pct + .write.slat_ns.mean * $writes_pct)/1000" $TMP_RESULT_FILE)
	mean_clat=$(jq -r ".jobs[] | (.read.clat_ns.mean * $reads_pct + .write.clat_ns.mean * $writes_pct)/1000" $TMP_RESULT_FILE)

	echo "$iops $bw $mean_lat $p90_lat $p99_lat $p99_99_lat $stdev $mean_slat $mean_clat"
}

function get_bdevperf_results() {
	local iops
	local bw_MBs
	read -r iops bw_MBs <<< $(grep Total $TMP_RESULT_FILE | tr -s " " | awk -F ":| " '{print $5" "$7}')
	echo "$iops $(bc "$bw_MBs * 1024")"
}

function get_nvmeperf_results() {
	local iops
	local bw_MBs
	local mean_lat_usec
	local max_lat_usec
	local min_lat_usec

	read -r iops bw_MBs mean_lat_usec min_lat_usec max_lat_usec <<< $(tr -s " " < $TMP_RESULT_FILE | grep -oP "(?<=Total : )(.*+)")
	echo "$iops $(bc "$bw_MBs * 1024") $mean_lat_usec $min_lat_usec $max_lat_usec"
}

function run_spdk_nvme_fio() {
	local plugin=$1
	echo "** Running fio test, this can take a while, depending on the run-time and ramp-time settings."
	if [[ "$plugin" = "spdk-plugin-nvme" ]]; then
		LD_PRELOAD=$plugin_dir/spdk_nvme $FIO_BIN $testdir/config.fio --output-format=json "${@:2}" --ioengine=spdk
	elif [[ "$plugin" = "spdk-plugin-bdev" || "$plugin" = "spdk-plugin-bdev-xnvme" ]]; then
		LD_PRELOAD=$plugin_dir/spdk_bdev $FIO_BIN $testdir/config.fio --output-format=json "${@:2}" --ioengine=spdk_bdev --spdk_json_conf=$testdir/bdev.conf --spdk_mem=4096
	fi

	sleep 1
}

function run_nvme_fio() {
	echo "** Running fio test, this can take a while, depending on the run-time and ramp-time settings."
	$FIO_BIN $testdir/config.fio --output-format=json "$@"
	sleep 1
}
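
# Usage sketch for get_results() defined above (illustrative): with fio's
# JSON output for a 70% read / 30% write workload already captured in
# $TMP_RESULT_FILE, print the weighted result line (IOPS, bandwidth and
# latency columns):
#   get_results 70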

function run_bdevperf() {
	local bdevperf_rpc
	local bdevperf_pid
	local rpc_socket
	local bpf_script_cmd
	local bpf_script_pid
	local bpf_app_pid
	local main_core_param=""

	bdevperf_rpc="$rootdir/examples/bdev/bdevperf/bdevperf.py"
	rpc_socket="/var/tmp/spdk.sock"

	if [[ -n $MAIN_CORE ]]; then
		main_core_param="-p ${MAIN_CORE}"
	fi

	echo "** Running bdevperf test, this can take a while, depending on the run-time setting."
	$_examples_dir/bdevperf --json $testdir/bdev.conf -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -m "[$CPUS_ALLOWED]" -r "$rpc_socket" $main_core_param -z &
	bdevperf_pid=$!
	waitforlisten $bdevperf_pid "$rpc_socket" 500

	if [[ ${#BPFTRACES[@]} -gt 0 ]]; then
		echo "INFO: Enabling BPF Traces ${BPFTRACES[*]}"
		bpf_script_cmd=("$rootdir/scripts/bpftrace.sh")
		bpf_script_cmd+=("$bdevperf_pid")
		for trace in "${BPFTRACES[@]}"; do
			bpf_script_cmd+=("$rootdir/scripts/bpf/$trace")
		done

		BPF_OUTFILE=$TMP_BPF_FILE "${bpf_script_cmd[@]}" &
		bpf_script_pid=$!
		sleep 3
	fi

	PYTHONPATH=$PYTHONPATH:$rootdir/python $bdevperf_rpc -s "$rpc_socket" -t $((RUNTIME + 10)) perform_tests

	# The "-z" option causes bdevperf to NOT exit automatically after running
	# the test, so we need to stop it ourselves.
	kill -s SIGINT $bdevperf_pid
	wait $bdevperf_pid

	if ((bpf_script_pid)); then
		wait $bpf_script_pid
	fi
	sleep 1
}

function run_nvmeperf() {
	# Prepare the -r argument string for the nvme perf command
	local r_opt
	local disks

	# Limit the number of disks to $1 if needed
	disks=($(get_disks nvme))
	disks=("${disks[@]:0:$1}")
	r_opt=$(printf -- ' -r "trtype:PCIe traddr:%s"' "${disks[@]}")

	echo "** Running nvme perf test, this can take a while, depending on the run-time setting."

	# Run the command in a separate shell as this solves quoting issues related to the r_opt var
	$SHELL -c "$_examples_dir/perf $r_opt -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -c [$CPUS_ALLOWED]"
	sleep 1
}

function wait_for_nvme_reload() {
	local nvmes=$1

	shopt -s extglob
	for disk in $nvmes; do
		cmd="ls /sys/block/$disk/queue/*@(iostats|rq_affinity|nomerges|io_poll_delay)*"
		until $cmd 2> /dev/null; do
			echo "Waiting for full nvme driver reload..."
			sleep 0.5
		done
	done
	# "shopt -q" only queries an option's state; use "-u" to actually switch
	# extglob back off once the globbing above is done.
	shopt -u extglob
}
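
# Example invocation (hypothetical device names) after rebinding NVMe devices
# back to the kernel driver with setup.sh:
#   wait_for_nvme_reload "nvme0n1 nvme1n1"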

function verify_disk_number() {
	# Check if we have an appropriate number of disks to carry out the test
	disks=($(get_disks $PLUGIN))
	if [[ $DISKNO == "ALL" ]] || [[ $DISKNO == "all" ]]; then
		DISKNO=${#disks[@]}
	elif [[ $DISKNO -gt ${#disks[@]} ]] || [[ ! $DISKNO =~ ^[0-9]+$ ]]; then
		echo "error: Requested device count ($DISKNO) is not a valid number or is larger than the number of devices found (${#disks[@]})"
		false
	fi
}

function create_spdk_xnvme_bdev_conf() {
	local bdev_io_cache_size=$1 bdev_io_pool_size=$2
	local blocks block_idx io_mechanism=io_uring

	(($#)) && local -A method_bdev_set_options_0

	blocks=($(get_disks))

	if [[ -n $bdev_io_cache_size ]]; then
		method_bdev_set_options_0["bdev_io_cache_size"]=$bdev_io_cache_size
	fi
	if [[ -n $bdev_io_pool_size ]]; then
		method_bdev_set_options_0["bdev_io_pool_size"]=$bdev_io_pool_size
	fi

	for block_idx in "${!blocks[@]}"; do
		local -A method_bdev_xnvme_create_$block_idx
		local -n rpc_ref=method_bdev_xnvme_create_$block_idx
		rpc_ref["filename"]=/dev/${blocks[block_idx]}
		rpc_ref["io_mechanism"]=$io_mechanism
		rpc_ref["name"]=${blocks[block_idx]}
	done
	gen_conf > "$testdir/bdev.conf"
}

# LVOL support functions
function start_spdk_tgt() {
	$SPDK_BIN_DIR/spdk_tgt -g &
	spdk_tgt_pid=$!

	waitforlisten $spdk_tgt_pid
}

function stop_spdk_tgt() {
	killprocess $spdk_tgt_pid
}

function attach_bdevs() {
	local disk_cfg=($(get_disk_cfg))
	local i
	for i in "${!disk_cfg[@]}"; do
		$rpc_py bdev_nvme_attach_controller -b "Nvme${i}" -t pcie -a "${disk_cfg[i]}"
		echo "Attached NVMe Bdev Nvme${i} with BDF ${disk_cfg[i]}"
	done
}

function cleanup_lvol_cfg() {
	local -a lvol_stores
	local -a lvol_bdevs
	local lvol_store lvol_bdev

	echo "Cleanup lvols"
	lvol_stores=($($rpc_py bdev_lvol_get_lvstores | jq -r '.[].uuid'))
	for lvol_store in "${lvol_stores[@]}"; do
		lvol_bdevs=($($rpc_py bdev_lvol_get_lvols -u $lvol_store | jq -r '.[].uuid'))
		for lvol_bdev in "${lvol_bdevs[@]}"; do
			$rpc_py bdev_lvol_delete $lvol_bdev
			echo "lvol bdev $lvol_bdev removed"
		done

		$rpc_py bdev_lvol_delete_lvstore -u $lvol_store
		echo "lvol store $lvol_store removed"
	done
}

function cleanup_lvols() {
	start_spdk_tgt
	attach_bdevs
	cleanup_lvol_cfg
	stop_spdk_tgt
}

function create_lvols() {
	start_spdk_tgt
	attach_bdevs
	cleanup_lvol_cfg

	nvme_bdevs=($($rpc_py bdev_get_bdevs | jq -r '.[].name'))
	for nvme_bdev in "${nvme_bdevs[@]}"; do
		ls_guid=$($rpc_py bdev_lvol_create_lvstore $nvme_bdev lvs_0 --clear-method none)
		echo "Created LVOL Store $ls_guid on Bdev $nvme_bdev"

		free_mb=$(get_lvs_free_mb "$ls_guid")
		lb_name=$($rpc_py bdev_lvol_create -u $ls_guid lbd_0 $free_mb --clear-method none)
		LVOL_BDEVS+=("$lb_name")
		echo "Created LVOL Bdev $lb_name ($free_mb MB) on Lvol Store $ls_guid on Bdev $nvme_bdev"
	done

	stop_spdk_tgt
}
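
# Typical lvol preparation flow for a bdev-based run (sketch; assumes $rpc_py
# and the SPDK common helpers are already sourced by the calling test script):
#   create_lvols      # one lvol store + one max-size lvol bdev per NVMe
#   # ... run the workload against "${LVOL_BDEVS[@]}" ...
#   cleanup_lvols     # tear the lvol bdevs and stores back down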