#!/usr/bin/env bash
# SPDX-License-Identifier: BSD-3-Clause
# Copyright (C) 2018 Intel Corporation
# All rights reserved.
#
# NVMe PMD/BDEV performance test driver.
#
# Parses the command line, prepares the selected driver (SPDK perf tools,
# SPDK fio plugins or kernel fio engines), runs the workload REPEAT_NO times
# and appends the averaged results to a CSV file under
# $testdir/results/perf_results_<params>_<date>/.
#
# Helper functions used below (get_disks, get_cores, run_bdevperf, get_results,
# create_fio_config, ...) are not defined in this file; they are expected to
# come from the files sourced right after this header.
set -e

# Dir variables and sourcing common files
testdir=$(readlink -f "$(dirname "$0")")
rootdir=$(readlink -f "$testdir/../../..")
plugin_dir=$rootdir/build/fio
source "$testdir/common.sh"
source "$rootdir/scripts/common.sh" || exit 1
source "$rootdir/test/common/autotest_common.sh"

# Global & default variables
# Maps a kernel driver name (--driver) to the fio ioengine options it needs.
declare -A KERNEL_ENGINES
KERNEL_ENGINES=(
	["kernel-libaio"]="--ioengine=libaio"
	["kernel-classic-polling"]="--ioengine=pvsync2 --hipri=100"
	["kernel-hybrid-polling"]="--ioengine=pvsync2 --hipri=100"
	["kernel-io-uring"]="--ioengine=io_uring")

RW=randrw
MIX=100
IODEPTH=256
BLK_SIZE=4096
RUNTIME=600
RAMP_TIME=30
NUMJOBS=1
REPEAT_NO=3
GTOD_REDUCE=false
SAMPLING_INT=0
LATENCY_LOG=false
IO_BATCH_SUBMIT=0
IO_BATCH_COMPLETE=0
FIO_BIN=$CONFIG_FIO_SOURCE_DIR/fio
FIO_FNAME_STRATEGY="group"
TMP_RESULT_FILE=$testdir/result.json
MAIN_CORE=""
TMP_BPF_FILE=$testdir/bpftraces.txt
PLUGIN="nvme"
DISKCFG=""
BDEV_CACHE=""
BDEV_POOL=""
DISKNO="ALL"
CPUS_ALLOWED=1
NOIOSCALING=false
PRECONDITIONING=true
CPUFREQ=""
PERFTOP=false
DPDKMEM=false
BPFTRACES=()
DATE="$(date +'%m_%d_%Y_%H%M%S')"

#######################################
# Print the help text.
# Arguments:
#   $1 - script path; its basename is shown in the "Usage:" line
#   $2 - optional message printed (followed by a blank line) before the help
# Outputs:
#   Help text on stdout. Command tracing is switched off while printing and
#   unconditionally switched on (set -x) before returning.
#######################################
function usage() {
	set +x
	[[ -n $2 ]] && (
		echo "$2"
		echo ""
	)
	echo "Run NVMe PMD/BDEV performance test. Change options for easier debug and setup configuration"
	echo "Usage: $(basename "$1") [options]"
	echo "-h, --help Print help and exit"
	echo
	echo "Workload parameters:"
	echo " --rw=STR Type of I/O pattern. Accepted values are randrw,rw. [default=$RW]"
	echo " --rwmixread=INT Percentage of a mixed workload that should be reads. [default=$MIX]"
	echo " --iodepth=INT Number of I/Os to keep in flight against the file. [default=$IODEPTH]"
	echo " --block-size=INT The block size in bytes used for I/O units. [default=$BLK_SIZE]"
	echo " --run-time=TIME[s] Tell fio to run the workload for the specified period of time. [default=$RUNTIME]"
	echo " --ramp-time=TIME[s] Fio will run the specified workload for this amount of time before"
	echo " logging any performance numbers. [default=$RAMP_TIME]. Applicable only for fio-based tests."
	echo " --numjobs=INT Create the specified number of clones of this job. [default=$NUMJOBS]"
	echo " Applicable only for fio-based tests."
	echo " --repeat-no=INT How many times to repeat workload test. [default=$REPEAT_NO]"
	echo " Test result will be an average of repeated test runs."
	echo " --gtod-reduce Enable fio gtod_reduce option. [default=$GTOD_REDUCE]"
	echo " --sampling-int=INT Value for fio log_avg_msec parameters [default=$SAMPLING_INT]"
	echo " --latency-log Write latency log file using write_lat_log fio option [default=$LATENCY_LOG]"
	echo " --io-batch-submit=INT Value for iodepth_batch_submit fio option [default=$IO_BATCH_SUBMIT]"
	echo " --io-batch-complete=INT Value for iodepth_batch_complete fio option [default=$IO_BATCH_COMPLETE]"
	echo " --fio-bin=PATH Path to fio binary. [default=$FIO_BIN]"
	echo " Applicable only for fio-based tests."
	echo " --fio-fname-strategy=STR Use 'group' to group filenames under job section with common CPU or"
	echo " use 'split' to create a separate fio job section for each filename [default=$FIO_FNAME_STRATEGY]"
	echo
	echo "Test setup parameters:"
	echo " --driver=STR Selects tool used for testing. Choices available:"
	echo " - spdk-perf-nvme (SPDK nvme perf)"
	echo " - spdk-perf-bdev (SPDK bdev perf)"
	echo " - spdk-perf-xnvme-bdev (SPDK xnvme bdev perf with io_uring io_mechanism)"
	echo " - spdk-plugin-nvme (SPDK nvme fio plugin)"
	echo " - spdk-plugin-bdev (SPDK bdev fio plugin)"
	echo " - spdk-plugin-bdev-xnvme (SPDK bdev fio plugin with xnvme bdevs)"
	echo " - kernel-classic-polling"
	echo " - kernel-hybrid-polling"
	echo " - kernel-libaio"
	echo " - kernel-io-uring"
	echo " --disk-config Configuration file containing PCI BDF addresses of NVMe disks to use in test."
	echo " It consists a single column of PCI addresses. SPDK Bdev names will be assigned"
	echo " and Kernel block device names detected."
	echo " Lines starting with # are ignored as comments."
	echo " --bdev-io-cache-size Set IO cache size for for SPDK bdev subsystem."
	echo " --bdev-io-pool-size Set IO pool size for for SPDK bdev subsystem."
	echo " --max-disk=INT,ALL Number of disks to test on, this will run multiple workloads with increasing number of disk each run."
	echo " If =ALL then test on all found disk. [default=$DISKNO]"
	echo " --cpu-allowed=INT/PATH Comma-separated list of CPU cores used to run the workload. Ranges allowed."
	echo " Can also point to a file containing list of CPUs. [default=$CPUS_ALLOWED]"
	echo " --no-preconditioning Skip preconditioning"
	echo " --no-io-scaling Do not scale iodepth for each device in SPDK fio plugin. [default=$NOIOSCALING]"
	echo " --cpu-frequency=INT Run tests with CPUs set to a desired frequency. 'intel_pstate=disable' must be set in"
	echo " GRUB options. You can use 'cpupower frequency-info' and 'cpupower frequency-set' to"
	echo " check list of available frequencies. Example: --cpu-frequency=1100000."
	echo " --main-core main (primary) core for DPDK (for bdevperf only)."
	echo
	echo "Other options:"
	echo " --perftop Run perftop measurements on the same CPU cores as specified in --cpu-allowed option."
	echo " --dpdk-mem-stats Dump DPDK memory stats during the test."
	echo " --bpf-traces=LIST Comma delimited list of .bt scripts for enabling BPF traces."
	echo " List of .bt scripts available in spdk/scripts/bpf."
	echo " Only for spdk-perf-bdev"
	set -x
}

# Option parsing. getopts only knows "-h" and "-<arg>"; a long option
# "--name=value" therefore arrives as optchar "-" with OPTARG="name=value",
# and "${OPTARG#*=}" strips everything up to the first "=" to get the value.
while getopts 'h-:' optchar; do
	case "$optchar" in
		-)
			case "$OPTARG" in
				help)
					usage "$0"
					exit 0
					;;
				rw=*) RW="${OPTARG#*=}" ;;
				rwmixread=*) MIX="${OPTARG#*=}" ;;
				iodepth=*) IODEPTH="${OPTARG#*=}" ;;
				block-size=*) BLK_SIZE="${OPTARG#*=}" ;;
				run-time=*) RUNTIME="${OPTARG#*=}" ;;
				ramp-time=*) RAMP_TIME="${OPTARG#*=}" ;;
				numjobs=*) NUMJOBS="${OPTARG#*=}" ;;
				repeat-no=*) REPEAT_NO="${OPTARG#*=}" ;;
				gtod-reduce) GTOD_REDUCE=true ;;
				sampling-int=*) SAMPLING_INT="${OPTARG#*=}" ;;
				io-batch-submit=*) IO_BATCH_SUBMIT="${OPTARG#*=}" ;;
				io-batch-complete=*) IO_BATCH_COMPLETE="${OPTARG#*=}" ;;
				fio-bin=*) FIO_BIN="${OPTARG#*=}" ;;
				fio-fname-strategy=*)
					FIO_FNAME_STRATEGY="${OPTARG#*=}"
					# The 'split' strategy also turns iodepth scaling off.
					if [[ "$FIO_FNAME_STRATEGY" == "split" ]]; then
						NOIOSCALING=true
					fi
					;;
				driver=*) PLUGIN="${OPTARG#*=}" ;;
				disk-config=*)
					DISKCFG="${OPTARG#*=}"
					if [[ ! -f "$DISKCFG" ]]; then
						echo "Disk configuration file $DISKCFG does not exist!"
						exit 1
					fi
					;;
				bdev-io-cache-size=*) BDEV_CACHE="${OPTARG#*=}" ;;
				bdev-io-pool-size=*) BDEV_POOL="${OPTARG#*=}" ;;
				max-disk=*) DISKNO="${OPTARG#*=}" ;;
				cpu-allowed=*)
					CPUS_ALLOWED="${OPTARG#*=}"
					# The value may name a file holding the CPU list.
					if [[ -f "$CPUS_ALLOWED" ]]; then
						CPUS_ALLOWED=$(cat "$CPUS_ALLOWED")
					fi
					;;
				no-preconditioning) PRECONDITIONING=false ;;
				no-io-scaling) NOIOSCALING=true ;;
				cpu-frequency=*) CPUFREQ="${OPTARG#*=}" ;;
				perftop) PERFTOP=true ;;
				dpdk-mem-stats) DPDKMEM=true ;;
				bpf-traces=*) IFS="," read -r -a BPFTRACES <<< "${OPTARG#*=}" ;;
				latency-log) LATENCY_LOG=true ;;
				main-core=*) MAIN_CORE="${OPTARG#*=}" ;;
				*)
					# The message must be usage()'s second argument; a stray
					# extra word here would be printed instead of the message.
					usage "$0" "Invalid argument '$OPTARG'"
					exit 1
					;;
			esac
			;;
		h)
			usage "$0"
			exit 0
			;;
		*)
			usage "$0" "Invalid argument '$optchar'"
			exit 1
			;;
	esac
done

result_dir=$testdir/results/perf_results_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}
result_file=$result_dir/perf_results_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}.csv
mkdir -p "$result_dir"
unset iops_disks bw mean_lat_disks_usec p90_lat_disks_usec p99_lat_disks_usec p99_99_lat_disks_usec stdev_disks_usec

# The core count is part of the CSV parameter row written just below, so it has
# to be known before that row is emitted. Both values depend only on
# CPUS_ALLOWED, which is final once option parsing is done.
CORES=$(get_cores "$CPUS_ALLOWED")
# Deliberately unquoted: split the list returned by get_cores into elements.
NO_CORES_ARRAY=($CORES)
NO_CORES=${#NO_CORES_ARRAY[@]}

# CSV layout: a parameter header + row, then the result header; the single
# result row is appended at the end of the script. Arguments are quoted so an
# empty value yields an empty field instead of shifting the columns.
echo "run-time,ramp-time,fio-plugin,QD,block-size,num-cpu-cores,workload,workload-mix" > "$result_file"
printf "%s,%s,%s,%s,%s,%s,%s,%s\n" "$RUNTIME" "$RAMP_TIME" "$PLUGIN" "$IODEPTH" "$BLK_SIZE" "$NO_CORES" "$RW" "$MIX" >> "$result_file"
echo "num_of_disks,iops,avg_lat[usec],p90[usec],p99[usec],p99.99[usec],stdev[usec],avg_slat[usec],avg_clat[usec],bw[Kib/s]" >> "$result_file"

# NOTE(review): perf_pid / dpdk_mem_pid are only assigned when --perftop /
# --dpdk-mem-stats are given; otherwise kill and wait run without a PID here.
trap 'rm -f *.state $testdir/bdev.conf; kill $perf_pid; wait $dpdk_mem_pid; print_backtrace' ERR SIGTERM SIGABRT

# "xnvme" is tested first because xnvme driver names also contain "bdev".
if [[ "$PLUGIN" =~ "xnvme" ]]; then
	create_spdk_xnvme_bdev_conf "$BDEV_CACHE" "$BDEV_POOL"
elif [[ "$PLUGIN" =~ "bdev" ]]; then
	create_spdk_bdev_conf "$BDEV_CACHE" "$BDEV_POOL"
fi

if [[ -s "$testdir/bdev.conf" ]]; then
	echo "INFO: Generated bdev.conf file:"
	cat "$testdir/bdev.conf"
fi

verify_disk_number
if [[ "$PLUGIN" =~ "xnvme" ]]; then
	DISK_NAMES=$(get_disks)
	DISKS_NUMA=$(get_numa_node "" "$DISK_NAMES")
else
	DISK_NAMES=$(get_disks $PLUGIN)
	DISKS_NUMA=$(get_numa_node $PLUGIN "$DISK_NAMES")
fi

if $PRECONDITIONING; then
	preconditioning
fi

if [[ "$PLUGIN" =~ "kernel" || "$PLUGIN" =~ "xnvme" ]]; then
	"$rootdir/scripts/setup.sh" reset
	fio_ioengine_opt="${KERNEL_ENGINES[$PLUGIN]}"

	# $DISK_NAMES is iterated unquoted on purpose: it is a whitespace-separated
	# list of block device names.
	if [[ $PLUGIN = "kernel-classic-polling" ]]; then
		for disk in $DISK_NAMES; do
			echo -1 > "/sys/block/$disk/queue/io_poll_delay"
		done
	elif [[ $PLUGIN = "kernel-hybrid-polling" ]]; then
		for disk in $DISK_NAMES; do
			echo 0 > "/sys/block/$disk/queue/io_poll_delay"
		done
	elif [[ $PLUGIN = "kernel-io-uring" || $PLUGIN =~ "xnvme" ]]; then
		# Reload the nvme module with poll queues enabled.
		modprobe -rv nvme
		modprobe nvme poll_queues=8
		wait_for_nvme_reload $DISK_NAMES

		backup_dir="/tmp/nvme_param_bak"
		mkdir -p "$backup_dir"

		# Save the current queue settings; they are written back at the end
		# of the script.
		for disk in $DISK_NAMES; do
			echo "INFO: Backing up device parameters for $disk"
			sysfs=/sys/block/$disk/queue
			mkdir -p "$backup_dir/$disk"
			cat "$sysfs/iostats" > "$backup_dir/$disk/iostats"
			cat "$sysfs/rq_affinity" > "$backup_dir/$disk/rq_affinity"
			cat "$sysfs/nomerges" > "$backup_dir/$disk/nomerges"
			cat "$sysfs/io_poll_delay" > "$backup_dir/$disk/io_poll_delay"
		done

		for disk in $DISK_NAMES; do
			echo "INFO: Setting device parameters for $disk"
			sysfs=/sys/block/$disk/queue
			echo 0 > "$sysfs/iostats"
			echo 0 > "$sysfs/rq_affinity"
			echo 2 > "$sysfs/nomerges"
			echo -1 > "$sysfs/io_poll_delay"
		done
	fi
fi

# Remember the governor in use before the test changes it.
cpu_governor="$(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor)"

if [[ -n "$CPUFREQ" ]]; then
	if [[ ! "$(cat /proc/cmdline)" =~ "intel_pstate=disable" ]]; then
		echo "ERROR: Cannot set custom CPU frequency for test. intel_pstate=disable not in boot options."
		# 'false' fails the script through set -e and the ERR trap.
		false
	else
		cpupower frequency-set -g userspace
		cpupower frequency-set -f "$CPUFREQ"
	fi
else
	cpupower frequency-set -g performance
fi
current_governor=$(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor)
echo "INFO: Using $current_governor cpu governor for test."

if $PERFTOP; then
	echo "INFO: starting perf record on cores $CPUS_ALLOWED"
	perf record -C $CPUS_ALLOWED -o "$testdir/perf.data" &
	perf_pid=$!
fi

if $DPDKMEM; then
	echo "INFO: waiting to generate DPDK memory usage"
	# Request the memory stats half-way through the run; for drivers whose
	# name does not contain "perf" the ramp time is added on top.
	wait_time=$((RUNTIME / 2))
	if [[ ! "$PLUGIN" =~ "perf" ]]; then
		wait_time=$((wait_time + RAMP_TIME))
	fi
	(
		sleep $wait_time
		echo "INFO: generating DPDK memory usage"
		"$rootdir/scripts/rpc.py" env_dpdk_get_mem_stats
	) &
	dpdk_mem_pid=$!
fi

# Accumulators: each repetition adds its values, the sums are divided by
# REPEAT_NO after the loop.
iops_disks=0
bw=0
min_lat_disks_usec=0
max_lat_disks_usec=0
mean_lat_disks_usec=0
p90_lat_disks_usec=0
p99_lat_disks_usec=0
p99_99_lat_disks_usec=0
stdev_disks_usec=0
mean_slat_disks_usec=0
mean_clat_disks_usec=0
# Run each workload $REPEAT_NO times
#
# NOTE(review): bc is called with the expression as its argument. Stock bc
# would treat that argument as a file name, so this presumably relies on a bc
# wrapper function from one of the sourced files - confirm.
# NOTE(review): ${k} in the artifact names below is never assigned in this
# file and expands to an empty string unless a sourced file sets it; it looks
# like it was meant to be the disk count - confirm before changing the names.
for ((j = 0; j < REPEAT_NO; j++)); do
	if [[ $PLUGIN == "spdk-perf-bdev" || $PLUGIN =~ "xnvme-bdev" ]]; then
		run_bdevperf > "$TMP_RESULT_FILE"
		read -r iops bandwidth <<< $(get_bdevperf_results)
		iops_disks=$(bc "$iops_disks + $iops")
		bw=$(bc "$bw + $bandwidth")
		cp "$TMP_RESULT_FILE" "$result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${k}_disks_${j}.output"
		[[ -f $TMP_BPF_FILE ]] && mv "$TMP_BPF_FILE" "$result_dir/bpftraces_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${k}_disks_${j}.txt"
	elif [[ $PLUGIN = "spdk-perf-nvme" ]]; then
		run_nvmeperf $DISKNO > "$TMP_RESULT_FILE"
		read -r iops bandwidth mean_lat min_lat max_lat <<< $(get_nvmeperf_results)

		iops_disks=$(bc "$iops_disks+$iops")
		bw=$(bc "$bw+$bandwidth")
		mean_lat_disks_usec=$(bc "$mean_lat_disks_usec + $mean_lat")
		# min/max latency sums are collected but not written to the CSV.
		min_lat_disks_usec=$(bc "$min_lat_disks_usec + $min_lat")
		max_lat_disks_usec=$(bc "$max_lat_disks_usec + $max_lat")

		cp "$TMP_RESULT_FILE" "$result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${k}_disks_${j}.output"
	else
		create_fio_config $DISKNO $PLUGIN "$DISK_NAMES" "$DISKS_NUMA" "$CORES"

		if $LATENCY_LOG; then
			write_log_opt="--write_lat_log=$result_dir/perf_lat_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}_${k}disks_${j}"
		fi

		# $fio_ioengine_opt and $write_log_opt stay unquoted: the former may
		# hold several options, the latter must vanish entirely when unset.
		if [[ "$PLUGIN" =~ "spdk-plugin" ]]; then
			run_spdk_nvme_fio $PLUGIN "--output=$TMP_RESULT_FILE" $write_log_opt
		else
			run_nvme_fio $fio_ioengine_opt "--output=$TMP_RESULT_FILE" $write_log_opt
		fi

		#Store values for every number of used disks
		#Use recalculated value for mixread param in case rw mode is not rw.
		rwmixread=$MIX
		if [[ $RW = *"read"* ]]; then
			rwmixread=100
		elif [[ $RW = *"write"* ]]; then
			rwmixread=0
		fi

		read -r iops bandwidth mean_lat_usec p90_lat_usec p99_lat_usec p99_99_lat_usec \
			stdev_usec mean_slat_usec mean_clat_usec <<< $(get_results $rwmixread)
		iops_disks=$(bc "$iops_disks + $iops")
		mean_lat_disks_usec=$(bc "$mean_lat_disks_usec + $mean_lat_usec")
		p90_lat_disks_usec=$(bc "$p90_lat_disks_usec + $p90_lat_usec")
		p99_lat_disks_usec=$(bc "$p99_lat_disks_usec + $p99_lat_usec")
		p99_99_lat_disks_usec=$(bc "$p99_99_lat_disks_usec + $p99_99_lat_usec")
		stdev_disks_usec=$(bc "$stdev_disks_usec + $stdev_usec")
		mean_slat_disks_usec=$(bc "$mean_slat_disks_usec + $mean_slat_usec")
		mean_clat_disks_usec=$(bc "$mean_clat_disks_usec + $mean_clat_usec")
		bw=$(bc "$bw + $bandwidth")

		cp "$TMP_RESULT_FILE" "$result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${k}_disks_${j}.json"
		cp "$testdir/config.fio" "$result_dir/config_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${k}_disks_${j}.fio"
		rm -f "$testdir/config.fio"
	fi
done

if $PERFTOP; then
	echo "INFO: Stopping perftop measurements."
	kill $perf_pid
	# perf was just killed, so a non-zero wait status is expected here.
	wait $perf_pid || true
	perf report -i "$testdir/perf.data" > "$result_dir/perftop_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}.txt"
	rm -f "$testdir/perf.data"
fi

if $DPDKMEM; then
	# NOTE(review): /tmp/spdk_mem_dump.txt is presumably produced by the
	# env_dpdk_get_mem_stats RPC issued above - confirm.
	mv "/tmp/spdk_mem_dump.txt" "$result_dir/spdk_mem_dump_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}.txt"
	echo "INFO: DPDK memory usage saved in $result_dir"
fi

#Write results to csv file
iops_disks=$(bc "$iops_disks / $REPEAT_NO")
bw=$(bc "$bw / $REPEAT_NO")
if [[ "$PLUGIN" =~ "plugin" || "$PLUGIN" =~ "kernel" ]] && [[ ! $PLUGIN =~ "xnvme-bdev" ]]; then
	mean_lat_disks_usec=$(bc "$mean_lat_disks_usec / $REPEAT_NO")
	p90_lat_disks_usec=$(bc "$p90_lat_disks_usec / $REPEAT_NO")
	p99_lat_disks_usec=$(bc "$p99_lat_disks_usec / $REPEAT_NO")
	p99_99_lat_disks_usec=$(bc "$p99_99_lat_disks_usec / $REPEAT_NO")
	stdev_disks_usec=$(bc "$stdev_disks_usec / $REPEAT_NO")
	mean_slat_disks_usec=$(bc "$mean_slat_disks_usec / $REPEAT_NO")
	mean_clat_disks_usec=$(bc "$mean_clat_disks_usec / $REPEAT_NO")
elif [[ "$PLUGIN" == "spdk-perf-nvme" ]]; then
	mean_lat_disks_usec=$(bc "$mean_lat_disks_usec/$REPEAT_NO")
fi

printf "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n" "${DISKNO}" "${iops_disks}" "${mean_lat_disks_usec}" "${p90_lat_disks_usec}" "${p99_lat_disks_usec}" \
	"${p99_99_lat_disks_usec}" "${stdev_disks_usec}" "${mean_slat_disks_usec}" "${mean_clat_disks_usec}" "${bw}" >> "$result_file"

# NOTE(review): the governor is restored only when --cpu-frequency was given,
# although the 'performance' governor is set above in the other case too.
if [[ -n "$CPUFREQ" ]]; then
	cpupower frequency-set -g "$cpu_governor"
fi

if [[ $PLUGIN = "kernel-io-uring" || $PLUGIN =~ "xnvme" ]]; then
	# Reload the nvme driver so that other test runs are not affected
	modprobe -rv nvme
	modprobe nvme
	wait_for_nvme_reload $DISK_NAMES

	for disk in $DISK_NAMES; do
		echo "INFO: Restoring device parameters for $disk"
		sysfs=/sys/block/$disk/queue
		cat "$backup_dir/$disk/iostats" > "$sysfs/iostats"
		cat "$backup_dir/$disk/rq_affinity" > "$sysfs/rq_affinity"
		cat "$backup_dir/$disk/nomerges" > "$sysfs/nomerges"
		cat "$backup_dir/$disk/io_poll_delay" > "$sysfs/io_poll_delay"
	done
fi
rm -f "$testdir/bdev.conf" "$testdir/config.fio"