#!/usr/bin/env bash

function discover_bdevs() {
	local rootdir=$1
	local config_file=$2
	local wait_for_spdk_bdev=30
	local rpc_server=/var/tmp/spdk-discover-bdevs.sock

	if [ ! -e $config_file ]; then
		echo "Invalid Configuration File: $config_file"
		return 1
	fi

	# Start the bdev service to query for the list of available
	# bdevs.
	$rootdir/test/app/bdev_svc/bdev_svc -r $rpc_server -i 0 \
		--json $config_file &> /dev/null &
	stubpid=$!
	while ! [ -e /var/run/spdk_bdev0 ]; do
		# If this counter drops to zero, the arithmetic expression below evaluates
		# to false and errexit aborts the test.
		((wait_for_spdk_bdev--))
		sleep 1
	done

	# Get all of the bdevs
	$rootdir/scripts/rpc.py -s "$rpc_server" bdev_get_bdevs

	# Shut down the bdev service
	kill $stubpid
	wait $stubpid
	rm -f /var/run/spdk_bdev0
}

function create_spdk_bdev_conf() {
	local output
	local disk_cfg
	local bdev_io_cache_size=$1
	local bdev_io_pool_size=$2
	local bdev_json_cfg=()
	local bdev_opts=()

	disk_cfg=($(grep -vP "^\s*#" "$DISKCFG"))

	if [[ -n "$bdev_io_cache_size" ]]; then
		bdev_opts+=("\"bdev_io_cache_size\": $bdev_io_cache_size")
	fi

	if [[ -n "$bdev_io_pool_size" ]]; then
		bdev_opts+=("\"bdev_io_pool_size\": $bdev_io_pool_size")
	fi

	local IFS=","
	if [[ ${#bdev_opts[@]} -gt 0 ]]; then
		bdev_json_cfg+=("$(
			cat <<- JSON
				{
					"method": "bdev_set_options",
					"params": {
						${bdev_opts[*]}
					}
				}
			JSON
		)")
	fi

	for i in "${!disk_cfg[@]}"; do
		bdev_json_cfg+=("$(
			cat <<- JSON
				{
					"method": "bdev_nvme_attach_controller",
					"params": {
						"trtype": "PCIe",
						"name": "Nvme${i}",
						"traddr": "${disk_cfg[i]}"
					}
				}
			JSON
		)")
	done

	local IFS=","
	jq -r '.' <<- JSON > $testdir/bdev.conf
		{
			"subsystems": [
				{
					"subsystem": "bdev",
					"config": [
						${bdev_json_cfg[*]},
						{
							"method": "bdev_wait_for_examine"
						}
					]
				}
			]
		}
	JSON
}

function is_bdf_not_mounted() {
	# Returns 0 (success) if the block device behind the given PCI BDF has no mountpoints.
	local bdf=$1
	local blkname
	local mountpoints
	blkname=$(ls -l /sys/block/ | grep $bdf | awk '{print $9}')
	mountpoints=$(lsblk /dev/$blkname --output MOUNTPOINT -n | wc -w)
	return $mountpoints
}

function get_cores() {
	local cpu_list="$1"
	for cpu in ${cpu_list//,/ }; do
		echo $cpu
	done
}

function get_cores_numa_node() {
	local cores=$1
	for core in $cores; do
		lscpu -p=cpu,node | grep "^$core\b" | awk -F ',' '{print $2}'
	done
}

function get_numa_node() {
	local plugin=$1
	local disks=$2
	if [[ "$plugin" =~ "nvme" ]]; then
		for bdf in $disks; do
			local driver
			driver=$(grep DRIVER /sys/bus/pci/devices/$bdf/uevent | awk -F"=" '{print $2}')
			# Omit blocked devices, i.e. those not bound to a userspace driver by the setup.sh script
			if [ "$driver" = "vfio-pci" ] || [ "$driver" = "uio_pci_generic" ]; then
				cat /sys/bus/pci/devices/$bdf/numa_node
			fi
		done
	elif [[ "$plugin" =~ "bdev" ]]; then
		local bdevs
		bdevs=$(discover_bdevs $rootdir $testdir/bdev.conf)
		for name in $disks; do
			local bdev_bdf
			bdev_bdf=$(jq -r ".[] | select(.name==\"$name\").driver_specific.nvme.pci_address" <<< $bdevs)
			cat /sys/bus/pci/devices/$bdev_bdf/numa_node
		done
	else
		for name in $disks; do
			local bdf
			# Not reading directly from /sys/block/nvme* because of a kernel bug
			# which results in NUMA 0 always getting reported.
			bdf=$(cat /sys/block/$name/device/address)
			cat /sys/bus/pci/devices/$bdf/numa_node
		done
	fi
}
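# Example (illustrative only; plugin name, devices and NUMA layout are hypothetical):
# the helpers above are typically combined to line up CPU cores and devices by NUMA
# node before the fio configuration is generated, e.g.:
#   cores=$(get_cores "0,1,24,25")
#   cores_numa=$(get_cores_numa_node "$cores")                    # e.g. "0 0 1 1"
#   disks_numa=$(get_numa_node "kernel-libaio" "nvme0n1 nvme1n1") # e.g. "0 1"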
function get_disks() {
	local plugin=$1
	local disk_cfg

	disk_cfg=($(grep -vP "^\s*#" "$DISKCFG"))
	if [[ "$plugin" =~ "nvme" ]]; then
		# PCI BDF addresses are enough for nvme-perf and nvme-fio-plugin,
		# so just print them from the configuration file
		echo "${disk_cfg[*]}"
	elif [[ "$plugin" =~ "bdev" ]]; then
		# Generate NvmeXn1 bdev names for bdev-perf and bdev-fio-plugin
		local bdevs
		local disk_no
		disk_no=${#disk_cfg[@]}
		eval echo "Nvme{0..$((disk_no - 1))}n1"
	else
		# Find nvme block devices and only use the ones which
		# are not mounted
		for bdf in "${disk_cfg[@]}"; do
			if is_bdf_not_mounted $bdf; then
				local blkname
				blkname=$(ls -l /sys/block/ | grep $bdf | awk '{print $9}')
				echo $blkname
			fi
		done
	fi
}

function get_disks_on_numa() {
	local devs=($1)
	local numas=($2)
	local numa_no=$3
	local disks_on_numa=0
	local i

	for ((i = 0; i < ${#devs[@]}; i++)); do
		if [ ${numas[$i]} = $numa_no ]; then
			disks_on_numa=$((disks_on_numa + 1))
		fi
	done
	echo $disks_on_numa
}
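# Example (hypothetical values): with four devices whose NUMA nodes were resolved
# to "0 0 1 1" by get_numa_node(), counting the devices on node 0:
#   get_disks_on_numa "nvme0n1 nvme1n1 nvme2n1 nvme3n1" "0 0 1 1" 0   # prints "2"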
function create_fio_config() {
	local disk_no=$1
	local plugin=$2
	local disks=($3)
	local disks_numa=($4)
	local cores=($5)
	local total_disks=${#disks[@]}
	local fio_job_section=()
	local num_cores=${#cores[@]}
	local disks_per_core=$((disk_no / num_cores))
	local disks_per_core_mod=$((disk_no % num_cores))
	local cores_numa
	cores_numa=($(get_cores_numa_node "${cores[*]}"))

	# The following part of this function still leverages global variables a lot.
	# It is a mix of local variables passed as arguments to the function and global variables. This is messy.
	# TODO: Modify this to be consistent with how variables are used here. Aim for using only
	# local variables to get rid of globals as much as possible.
	desc="\"Test io_plugin=$PLUGIN Blocksize=${BLK_SIZE} Workload=$RW MIX=${MIX} qd=${IODEPTH}\""
	cp "$testdir/config.fio.tmp" "$testdir/config.fio"
	cat <<- EOF >> $testdir/config.fio
		description=$desc

		rw=$RW
		rwmixread=$MIX
		bs=$BLK_SIZE
		runtime=$RUNTIME
		ramp_time=$RAMP_TIME
		numjobs=$NUMJOBS
		log_avg_msec=$SAMPLING_INT
	EOF

	if $GTOD_REDUCE; then
		echo "gtod_reduce=1" >> $testdir/config.fio
	fi

	if [[ $PLUGIN =~ "uring" ]]; then
		cat <<- EOF >> $testdir/config.fio
			fixedbufs=1
			hipri=1
			registerfiles=1
			sqthread_poll=1
		EOF
	fi

	if [[ "$IO_BATCH_SUBMIT" -gt 0 ]]; then
		echo "iodepth_batch_submit=$IO_BATCH_SUBMIT" >> $testdir/config.fio
	fi

	if [[ "$IO_BATCH_COMPLETE" -gt 0 ]]; then
		echo "iodepth_batch_complete=$IO_BATCH_COMPLETE" >> $testdir/config.fio
	fi

	for i in "${!cores[@]}"; do
		local m=0 # Counter of disks per NUMA node
		local n=0 # Counter of all disks in test
		core_numa=${cores_numa[$i]}

		total_disks_per_core=$disks_per_core
		# Check how many "stray" disks are still unassigned to CPU cores.
		# Assign one such disk to the current CPU core and subtract it from the
		# total of unassigned disks.
		if [[ "$disks_per_core_mod" -gt "0" ]]; then
			total_disks_per_core=$((disks_per_core + 1))
			disks_per_core_mod=$((disks_per_core_mod - 1))
		fi
		# The SPDK fio plugin supports submitting/completing I/Os to multiple SSDs from a single thread.
		# Therefore, the per-thread queue depth is set to the desired per-device IODEPTH multiplied by
		# the number of devices per thread.
		QD=$IODEPTH
		if [[ "$NOIOSCALING" == false ]]; then
			QD=$((IODEPTH * total_disks_per_core))
		fi

		if [[ "$FIO_FNAME_STRATEGY" == "group" ]]; then
			fio_job_section+=("")
			fio_job_section+=("[filename${i}]")
			fio_job_section+=("iodepth=$QD")
			fio_job_section+=("cpus_allowed=${cores[$i]} #CPU NUMA Node ${cores_numa[$i]}")
		fi

		while [[ "$m" -lt "$total_disks_per_core" ]]; do
			# Add a disk to the job section only if its NUMA node matches the NUMA
			# node of the currently selected CPU.
			if [[ "${disks_numa[$n]}" == "$core_numa" ]]; then
				if [[ "$FIO_FNAME_STRATEGY" == "split" ]]; then
					fio_job_section+=("")
					fio_job_section+=("[filename${m}-${cores[$i]}]")
					fio_job_section+=("iodepth=$QD")
					fio_job_section+=("cpus_allowed=${cores[$i]} #CPU NUMA Node ${cores_numa[$i]}")
				fi

				if [[ "$plugin" == "spdk-plugin-nvme" ]]; then
					fio_job_section+=("filename=trtype=PCIe traddr=${disks[$n]//:/.} ns=1 #NVMe NUMA Node ${disks_numa[$n]}")
				elif [[ "$plugin" == "spdk-plugin-bdev" ]]; then
					fio_job_section+=("filename=${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}")
				elif [[ "$plugin" =~ "kernel" ]]; then
					fio_job_section+=("filename=/dev/${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}")
				fi
				m=$((m + 1))

				# Mark the NUMA node of the n-th disk as "x" so it is treated as
				# claimed in subsequent loop iterations.
				disks_numa[$n]="x"
			fi
			n=$((n + 1))

			# If there are no more disks on the same NUMA node as the CPU, switch to
			# the other NUMA node, go back to the start of the loop and try again.
			if [[ $n -ge $total_disks ]]; then
				echo "WARNING! Cannot assign any more NVMes for CPU ${cores[$i]}"
				echo "NVMe assignment for this CPU will be cross-NUMA."
				if [[ "$core_numa" == "1" ]]; then
					core_numa=0
				else
					core_numa=1
				fi
				n=0
			fi
		done
	done

	printf "%s\n" "${fio_job_section[@]}" >> $testdir/config.fio
	echo "INFO: Generated fio configuration file:"
	cat $testdir/config.fio
}
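# Illustrative example (not generated verbatim by any particular run; values are
# hypothetical): with FIO_FNAME_STRATEGY=group, plugin "spdk-plugin-bdev", IODEPTH=128,
# IO scaling enabled and a single bdev assigned to core 0 on NUMA node 0, the job
# section appended to config.fio would look roughly like:
#   [filename0]
#   iodepth=128
#   cpus_allowed=0 #CPU NUMA Node 0
#   filename=Nvme0n1 #NVMe NUMA Node 0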
function preconditioning() {
	local dev_name=""
	local filename=""
	local nvme_list

	HUGEMEM=8192 $rootdir/scripts/setup.sh
	cp $testdir/config.fio.tmp $testdir/config.fio
	echo "[Preconditioning]" >> $testdir/config.fio

	# Generate the filename argument for fio.
	# We only want to target NVMes not bound to the kernel nvme driver.
	# If they are still bound to nvme, it means they were skipped by
	# setup.sh on purpose.
	nvme_list=$(get_disks nvme)
	for nvme in $nvme_list; do
		dev_name='trtype=PCIe traddr='${nvme//:/.}' ns=1'
		filename+=$(printf %s":" "$dev_name")
	done
	echo "** Preconditioning disks, this can take a while, depending on the size of disks."
	run_spdk_nvme_fio "spdk-plugin-nvme" --filename="$filename" --size=100% --loops=2 --bs=1M \
		--rw=write --iodepth=32 --output-format=normal
	rm -f $testdir/config.fio
}

function bc() {
	$(type -P bc) -l <<< "scale=3; $1"
}

function get_results() {
	local iops bw mean_lat stdev
	local p90_lat p99_lat p99_99_lat
	local mean_slat mean_clat
	local reads_pct
	local writes_pct

	reads_pct=$(bc "$1 / 100")
	writes_pct=$(bc "1 - $reads_pct")

	# Latency results are weighted by the read/write mix of the workload.
	iops=$(jq -r '.jobs[] | .read.iops + .write.iops' $TMP_RESULT_FILE)
	bw=$(jq -r ".jobs[] | (.read.bw + .write.bw)" $TMP_RESULT_FILE)
	mean_lat=$(jq -r ".jobs[] | (.read.lat_ns.mean * $reads_pct + .write.lat_ns.mean * $writes_pct)/1000" $TMP_RESULT_FILE)
	p90_lat=$(jq -r ".jobs[] | ((.read.clat_ns.percentile.\"90.000000\" // 0) * $reads_pct + (.write.clat_ns.percentile.\"90.000000\" // 0) * $writes_pct)/1000" $TMP_RESULT_FILE)
	p99_lat=$(jq -r ".jobs[] | ((.read.clat_ns.percentile.\"99.000000\" // 0) * $reads_pct + (.write.clat_ns.percentile.\"99.000000\" // 0) * $writes_pct)/1000" $TMP_RESULT_FILE)
	p99_99_lat=$(jq -r ".jobs[] | ((.read.clat_ns.percentile.\"99.990000\" // 0) * $reads_pct + (.write.clat_ns.percentile.\"99.990000\" // 0) * $writes_pct)/1000" $TMP_RESULT_FILE)
	stdev=$(jq -r ".jobs[] | (.read.clat_ns.stddev * $reads_pct + .write.clat_ns.stddev * $writes_pct)/1000" $TMP_RESULT_FILE)
	mean_slat=$(jq -r ".jobs[] | (.read.slat_ns.mean * $reads_pct + .write.slat_ns.mean * $writes_pct)/1000" $TMP_RESULT_FILE)
	mean_clat=$(jq -r ".jobs[] | (.read.clat_ns.mean * $reads_pct + .write.clat_ns.mean * $writes_pct)/1000" $TMP_RESULT_FILE)

	echo "$iops $bw $mean_lat $p90_lat $p99_lat $p99_99_lat $stdev $mean_slat $mean_clat"
}

function get_bdevperf_results() {
	local iops
	local bw_MBs
	read -r iops bw_MBs <<< $(grep Total $TMP_RESULT_FILE | tr -s " " | awk -F ":| " '{print $5" "$7}')
	echo "$iops $(bc "$bw_MBs * 1024")"
}

function get_nvmeperf_results() {
	local iops
	local bw_MBs
	local mean_lat_usec
	local max_lat_usec
	local min_lat_usec

	read -r iops bw_MBs mean_lat_usec min_lat_usec max_lat_usec <<< $(tr -s " " < $TMP_RESULT_FILE | grep -oP "(?<=Total : )(.*+)")
	echo "$iops $(bc "$bw_MBs * 1024") $mean_lat_usec $min_lat_usec $max_lat_usec"
}

function run_spdk_nvme_fio() {
	local plugin=$1
	echo "** Running fio test, this can take a while, depending on the run-time and ramp-time setting."
	if [[ "$plugin" = "spdk-plugin-nvme" ]]; then
		LD_PRELOAD=$plugin_dir/spdk_nvme $FIO_BIN $testdir/config.fio --output-format=json "${@:2}" --ioengine=spdk
	elif [[ "$plugin" = "spdk-plugin-bdev" ]]; then
		LD_PRELOAD=$plugin_dir/spdk_bdev $FIO_BIN $testdir/config.fio --output-format=json "${@:2}" --ioengine=spdk_bdev --spdk_json_conf=$testdir/bdev.conf --spdk_mem=4096
	fi

	sleep 1
}
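# Example (hypothetical option values): any arguments after the plugin name are passed
# straight through to fio, e.g.:
#   run_spdk_nvme_fio "spdk-plugin-nvme" --output="$TMP_RESULT_FILE"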
function run_nvme_fio() {
	echo "** Running fio test, this can take a while, depending on the run-time and ramp-time setting."
	$FIO_BIN $testdir/config.fio --output-format=json "$@"
	sleep 1
}

function run_bdevperf() {
	local bdevperf_rpc
	local bdevperf_pid
	local rpc_socket
	local bpf_script_cmd
	local bpf_script_pid
	local bpf_app_pid
	local main_core_param=""

	bdevperf_rpc="$rootdir/test/bdev/bdevperf/bdevperf.py"
	rpc_socket="/var/tmp/spdk.sock"

	if [[ -n $MAIN_CORE ]]; then
		main_core_param="-p ${MAIN_CORE}"
	fi

	echo "** Running bdevperf test, this can take a while, depending on the run-time setting."
	$bdevperf_dir/bdevperf --json $testdir/bdev.conf -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -m "[$CPUS_ALLOWED]" -r "$rpc_socket" $main_core_param -z &
	bdevperf_pid=$!
	waitforlisten $bdevperf_pid

	if [[ ${#BPFTRACES[@]} -gt 0 ]]; then
		echo "INFO: Enabling BPF Traces ${BPFTRACES[*]}"
		bpf_script_cmd=("$rootdir/scripts/bpftrace.sh")
		bpf_script_cmd+=("$bdevperf_pid")
		for trace in "${BPFTRACES[@]}"; do
			bpf_script_cmd+=("$rootdir/scripts/bpf/$trace")
		done

		BPF_OUTFILE=$TMP_BPF_FILE "${bpf_script_cmd[@]}" &
		bpf_script_pid=$!
		sleep 3
	fi

	PYTHONPATH=$PYTHONPATH:$rootdir/scripts $bdevperf_rpc -s "$rpc_socket" perform_tests

	# The "-z" option causes bdevperf to NOT exit automatically after running the test,
	# so we need to stop it ourselves.
	kill -s SIGINT $bdevperf_pid
	wait $bdevperf_pid

	if ((bpf_script_pid)); then
		wait $bpf_script_pid
	fi
	sleep 1
}

function run_nvmeperf() {
	# Prepare the -r argument string for the nvme perf command
	local r_opt
	local disks

	# Limit the number of disks to $1 if needed
	disks=($(get_disks nvme))
	disks=("${disks[@]:0:$1}")
	r_opt=$(printf -- ' -r "trtype:PCIe traddr:%s"' "${disks[@]}")

	echo "** Running nvme perf test, this can take a while, depending on the run-time setting."

	# Run the command in a separate shell as this solves quoting issues related to the r_opt var
	$SHELL -c "$nvmeperf_dir/perf $r_opt -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -c [$CPUS_ALLOWED]"
	sleep 1
}

function wait_for_nvme_reload() {
	local nvmes=$1

	shopt -s extglob
	for disk in $nvmes; do
		cmd="ls /sys/block/$disk/queue/*@(iostats|rq_affinity|nomerges|io_poll_delay)*"
		until $cmd 2> /dev/null; do
			echo "Waiting for full nvme driver reload..."
			sleep 0.5
		done
	done
	shopt -u extglob
}

function verify_disk_number() {
	# Check if we have an appropriate number of disks to carry out the test
	disks=($(get_disks $PLUGIN))
	if [[ $DISKNO == "ALL" ]] || [[ $DISKNO == "all" ]]; then
		DISKNO=${#disks[@]}
	elif [[ ! $DISKNO =~ ^[0-9]+$ ]] || [[ $DISKNO -gt ${#disks[@]} ]]; then
		echo "error: Requested number of devices ($DISKNO) is not a valid number or is larger than the number of devices found (${#disks[@]})"
		false
	fi
}
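# Typical usage (illustrative; the globals shown are assumed to be provided by the
# caller): this file is meant to be sourced by a top-level perf run script which sets
# DISKCFG, PLUGIN, DISKNO, IODEPTH, BLK_SIZE, RW, MIX, CPUS_ALLOWED and friends, e.g.:
#   verify_disk_number
#   disks=$(get_disks "$PLUGIN")
#   disks_numa=$(get_numa_node "$PLUGIN" "$disks")
#   cores=$(get_cores "$CPUS_ALLOWED")
#   create_fio_config "$DISKNO" "$PLUGIN" "$disks" "$disks_numa" "$cores"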