#!/usr/bin/env bash
#
# NVMe performance test driver.
#
# Parses the long options documented in usage(), prepares the selected driver
# (SPDK perf tools, SPDK fio plugins or kernel fio engines), runs the workload
# REPEAT_NO times and appends the averaged results to a CSV file under
# $testdir/results.
#
# Most of the heavy lifting (disk discovery, fio config generation, result
# parsing, ...) is done by helpers that are not defined in this file; they are
# expected to come from the three files sourced below.
set -e

# Dir variables and sourcing common files
testdir=$(readlink -f "$(dirname "$0")")
rootdir=$(readlink -f "$testdir/../../..")
# The following three paths are not referenced in this file.
# NOTE(review): presumably consumed by the sourced helpers - confirm.
plugin_dir=$rootdir/build/fio
bdevperf_dir=$rootdir/test/bdev/bdevperf
nvmeperf_dir=$rootdir/build/examples
source "$testdir/common.sh"
source "$rootdir/scripts/common.sh" || exit 1
source "$rootdir/test/common/autotest_common.sh"

# Global & default variables
# Maps a kernel driver name (--driver=...) to the fio engine options used for it.
declare -A KERNEL_ENGINES
KERNEL_ENGINES=(
	["kernel-libaio"]="--ioengine=libaio"
	["kernel-classic-polling"]="--ioengine=pvsync2 --hipri=100"
	["kernel-hybrid-polling"]="--ioengine=pvsync2 --hipri=100"
	["kernel-io-uring"]="--ioengine=io_uring")

RW=randrw
MIX=100
IODEPTH=256
BLK_SIZE=4096
RUNTIME=600
RAMP_TIME=30
NUMJOBS=1
REPEAT_NO=3
GTOD_REDUCE=false
SAMPLING_INT=0
LATENCY_LOG=false
IO_BATCH_SUBMIT=0
IO_BATCH_COMPLETE=0
FIO_BIN=$CONFIG_FIO_SOURCE_DIR/fio
FIO_FNAME_STRATEGY="group"
TMP_RESULT_FILE=$testdir/result.json
MAIN_CORE=""
TMP_BPF_FILE=$testdir/bpftraces.txt
PLUGIN="nvme"
DISKCFG=""
BDEV_CACHE=""
BDEV_POOL=""
DISKNO="ALL"
CPUS_ALLOWED=1
NOIOSCALING=false
PRECONDITIONING=true
CPUFREQ=""
PERFTOP=false
DPDKMEM=false
BPFTRACES=()
DATE="$(date +'%m_%d_%Y_%H%M%S')"

# Print the help text.
#   $1 - script path; only its basename is shown in the "Usage:" line
#   $2 - optional message printed (followed by an empty line) before the help
# Command tracing is switched off while printing and switched on afterwards.
function usage() {
	set +x
	if [[ -n $2 ]]; then
		echo "$2"
		echo ""
	fi
	echo "Run NVMe PMD/BDEV performance test. Change options for easier debug and setup configuration"
	echo "Usage: $(basename $1) [options]"
	echo "-h, --help Print help and exit"
	echo
	echo "Workload parameters:"
	echo " --rw=STR Type of I/O pattern. Accepted values are randrw,rw. [default=$RW]"
	echo " --rwmixread=INT Percentage of a mixed workload that should be reads. [default=$MIX]"
	echo " --iodepth=INT Number of I/Os to keep in flight against the file. [default=$IODEPTH]"
	echo " --block-size=INT The block size in bytes used for I/O units. [default=$BLK_SIZE]"
	echo " --run-time=TIME[s] Tell fio to run the workload for the specified period of time. [default=$RUNTIME]"
	echo " --ramp-time=TIME[s] Fio will run the specified workload for this amount of time before"
	echo " logging any performance numbers. [default=$RAMP_TIME]. Applicable only for fio-based tests."
	echo " --numjobs=INT Create the specified number of clones of this job. [default=$NUMJOBS]"
	echo " Applicable only for fio-based tests."
	echo " --repeat-no=INT How many times to repeat workload test. [default=$REPEAT_NO]"
	echo " Test result will be an average of repeated test runs."
	echo " --gtod-reduce Enable fio gtod_reduce option. [default=$GTOD_REDUCE]"
	echo " --sampling-int=INT Value for fio log_avg_msec parameters [default=$SAMPLING_INT]"
	echo " --latency-log Write latency log file using write_lat_log fio option [default=$LATENCY_LOG]"
	echo " --io-batch-submit=INT Value for iodepth_batch_submit fio option [default=$IO_BATCH_SUBMIT]"
	echo " --io-batch-complete=INT Value for iodepth_batch_complete fio option [default=$IO_BATCH_COMPLETE]"
	echo " --fio-bin=PATH Path to fio binary. [default=$FIO_BIN]"
	echo " Applicable only for fio-based tests."
	echo " --fio-fname-strategy=STR Use 'group' to group filenames under job section with common CPU or"
	echo " use 'split' to create a separate fio job section for each filename [default=$FIO_FNAME_STRATEGY]"
	echo
	echo "Test setup parameters:"
	echo " --driver=STR Selects tool used for testing. Choices available:"
	echo " - spdk-perf-nvme (SPDK nvme perf)"
	echo " - spdk-perf-bdev (SPDK bdev perf)"
	echo " - spdk-plugin-nvme (SPDK nvme fio plugin)"
	echo " - spdk-plugin-bdev (SPDK bdev fio plugin)"
	echo " - kernel-classic-polling"
	echo " - kernel-hybrid-polling"
	echo " - kernel-libaio"
	echo " - kernel-io-uring"
	echo " --disk-config Configuration file containing PCI BDF addresses of NVMe disks to use in test."
	echo " It consists a single column of PCI addresses. SPDK Bdev names will be assigned"
	echo " and Kernel block device names detected."
	echo " Lines starting with # are ignored as comments."
	echo " --bdev-io-cache-size Set IO cache size for for SPDK bdev subsystem."
	echo " --bdev-io-pool-size Set IO pool size for for SPDK bdev subsystem."
	echo " --max-disk=INT,ALL Number of disks to test on, this will run multiple workloads with increasing number of disk each run."
	echo " If =ALL then test on all found disk. [default=$DISKNO]"
	echo " --cpu-allowed=INT/PATH Comma-separated list of CPU cores used to run the workload. Ranges allowed."
	echo " Can also point to a file containing list of CPUs. [default=$CPUS_ALLOWED]"
	echo " --no-preconditioning Skip preconditioning"
	echo " --no-io-scaling Do not scale iodepth for each device in SPDK fio plugin. [default=$NOIOSCALING]"
	echo " --cpu-frequency=INT Run tests with CPUs set to a desired frequency. 'intel_pstate=disable' must be set in"
	echo " GRUB options. You can use 'cpupower frequency-info' and 'cpupower frequency-set' to"
	echo " check list of available frequencies. Example: --cpu-frequency=1100000."
	echo " --main-core main (primary) core for DPDK (for bdevperf only)."
	echo
	echo "Other options:"
	echo " --perftop Run perftop measurements on the same CPU cores as specified in --cpu-allowed option."
	echo " --dpdk-mem-stats Dump DPDK memory stats during the test."
	echo " --bpf-traces=LIST Comma delimited list of .bt scripts for enabling BPF traces."
	echo " List of .bt scripts available in spdk/scripts/bpf."
	echo " Only for spdk-perf-bdev"
	set -x
}

# Option parsing. The optstring 'h-:' makes getopts treat "--name[=value]" as
# the short option '-' with argument "name[=value]", which is then matched by
# the inner case statement.
while getopts 'h-:' optchar; do
	case "$optchar" in
		-)
			case "$OPTARG" in
				help)
					usage $0
					exit 0
					;;
				rw=*) RW="${OPTARG#*=}" ;;
				rwmixread=*) MIX="${OPTARG#*=}" ;;
				iodepth=*) IODEPTH="${OPTARG#*=}" ;;
				block-size=*) BLK_SIZE="${OPTARG#*=}" ;;
				run-time=*) RUNTIME="${OPTARG#*=}" ;;
				ramp-time=*) RAMP_TIME="${OPTARG#*=}" ;;
				numjobs=*) NUMJOBS="${OPTARG#*=}" ;;
				repeat-no=*) REPEAT_NO="${OPTARG#*=}" ;;
				gtod-reduce) GTOD_REDUCE=true ;;
				sampling-int=*) SAMPLING_INT="${OPTARG#*=}" ;;
				io-batch-submit=*) IO_BATCH_SUBMIT="${OPTARG#*=}" ;;
				io-batch-complete=*) IO_BATCH_COMPLETE="${OPTARG#*=}" ;;
				fio-bin=*) FIO_BIN="${OPTARG#*=}" ;;
				fio-fname-strategy=*)
					FIO_FNAME_STRATEGY="${OPTARG#*=}"
					# The 'split' strategy implies --no-io-scaling.
					if [[ "$FIO_FNAME_STRATEGY" == "split" ]]; then
						NOIOSCALING=true
					fi
					;;
				driver=*) PLUGIN="${OPTARG#*=}" ;;
				disk-config=*)
					DISKCFG="${OPTARG#*=}"
					if [[ ! -f "$DISKCFG" ]]; then
						echo "Disk confiuration file $DISKCFG does not exist!"
						exit 1
					fi
					;;
				bdev-io-cache-size=*) BDEV_CACHE="${OPTARG#*=}" ;;
				bdev-io-pool-size=*) BDEV_POOL="${OPTARG#*=}" ;;
				max-disk=*) DISKNO="${OPTARG#*=}" ;;
				cpu-allowed=*)
					CPUS_ALLOWED="${OPTARG#*=}"
					# The value may name a file holding the CPU list.
					if [[ -f "$CPUS_ALLOWED" ]]; then
						CPUS_ALLOWED=$(cat "$CPUS_ALLOWED")
					fi
					;;
				no-preconditioning) PRECONDITIONING=false ;;
				no-io-scaling) NOIOSCALING=true ;;
				cpu-frequency=*) CPUFREQ="${OPTARG#*=}" ;;
				perftop) PERFTOP=true ;;
				dpdk-mem-stats) DPDKMEM=true ;;
				bpf-traces=*) IFS="," read -r -a BPFTRACES <<< "${OPTARG#*=}" ;;
				latency-log) LATENCY_LOG=true ;;
				main-core=*) MAIN_CORE="${OPTARG#*=}" ;;
				*)
					# The message must be the second argument of usage();
					# a stray "echo" word here used to be printed instead of it.
					usage $0 "Invalid argument '$OPTARG'"
					exit 1
					;;
			esac
			;;
		h)
			usage $0
			exit 0
			;;
		*)
			usage $0 "Invalid argument '$optchar'"
			exit 1
			;;
	esac
done

result_dir=$testdir/results/perf_results_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}
result_file=$result_dir/perf_results_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}.csv
mkdir -p "$result_dir"
unset iops_disks bw mean_lat_disks_usec p90_lat_disks_usec p99_lat_disks_usec p99_99_lat_disks_usec stdev_disks_usec

# On error or termination: remove generated files, stop the perf recorder,
# wait for the DPDK memory-stats helper and print a backtrace.
trap 'rm -f *.state $testdir/bdev.conf; kill $perf_pid; wait $dpdk_mem_pid; print_backtrace' ERR SIGTERM SIGABRT

if [[ "$PLUGIN" =~ "bdev" ]]; then
	create_spdk_bdev_conf "$BDEV_CACHE" "$BDEV_POOL"
	echo "INFO: Generated bdev.conf file:"
	cat "$testdir/bdev.conf"
fi
verify_disk_number
DISK_NAMES=$(get_disks $PLUGIN)
DISKS_NUMA=$(get_numa_node $PLUGIN "$DISK_NAMES")
CORES=$(get_cores "$CPUS_ALLOWED")
# Intentionally unquoted: split the core list into one array element per core.
NO_CORES_ARRAY=($CORES)
NO_CORES=${#NO_CORES_ARRAY[@]}

# Write the CSV preamble only now. NO_CORES is not known any earlier, and an
# empty unquoted value would silently shift every following column to the left.
echo "run-time,ramp-time,fio-plugin,QD,block-size,num-cpu-cores,workload,workload-mix" > "$result_file"
printf "%s,%s,%s,%s,%s,%s,%s,%s\n" "$RUNTIME" "$RAMP_TIME" "$PLUGIN" "$IODEPTH" "$BLK_SIZE" "$NO_CORES" "$RW" "$MIX" >> "$result_file"
echo "num_of_disks,iops,avg_lat[usec],p90[usec],p99[usec],p99.99[usec],stdev[usec],avg_slat[usec],avg_clat[usec],bw[Kib/s]" >> "$result_file"

if $PRECONDITIONING; then
	preconditioning
fi

# Kernel drivers: hand the devices back via setup.sh reset and tune the block
# layer through sysfs according to the selected mode.
if [[ "$PLUGIN" =~ "kernel" ]]; then
	"$rootdir/scripts/setup.sh" reset
	fio_ioengine_opt="${KERNEL_ENGINES[$PLUGIN]}"

	if [[ $PLUGIN == "kernel-classic-polling" ]]; then
		for disk in $DISK_NAMES; do
			echo -1 > /sys/block/$disk/queue/io_poll_delay
		done
	elif [[ $PLUGIN == "kernel-hybrid-polling" ]]; then
		for disk in $DISK_NAMES; do
			echo 0 > /sys/block/$disk/queue/io_poll_delay
		done
	elif [[ $PLUGIN == "kernel-io-uring" ]]; then
		# Reload the nvme module with poll queues enabled.
		modprobe -rv nvme
		modprobe nvme poll_queues=8
		wait_for_nvme_reload $DISK_NAMES

		# Saved values are written back at the end of the script.
		backup_dir="/tmp/nvme_param_bak"
		mkdir -p $backup_dir

		for disk in $DISK_NAMES; do
			echo "INFO: Backing up device parameters for $disk"
			sysfs=/sys/block/$disk/queue
			mkdir -p $backup_dir/$disk
			cat $sysfs/iostats > $backup_dir/$disk/iostats
			cat $sysfs/rq_affinity > $backup_dir/$disk/rq_affinity
			cat $sysfs/nomerges > $backup_dir/$disk/nomerges
			cat $sysfs/io_poll_delay > $backup_dir/$disk/io_poll_delay
		done

		for disk in $DISK_NAMES; do
			echo "INFO: Setting device parameters for $disk"
			sysfs=/sys/block/$disk/queue
			echo 0 > $sysfs/iostats
			echo 0 > $sysfs/rq_affinity
			echo 2 > $sysfs/nomerges
			echo -1 > $sysfs/io_poll_delay
		done
	fi
fi

# Remember the governor in use so it can be put back after a --cpu-frequency run.
cpu_governor="$(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor)"

if [[ -n "$CPUFREQ" ]]; then
	if [[ ! "$(cat /proc/cmdline)" =~ "intel_pstate=disable" ]]; then
		echo "ERROR: Cannot set custom CPU frequency for test. intel_pstate=disable not in boot options."
		# Fail on purpose: with 'set -e' this fires the ERR trap and aborts the run.
		false
	else
		cpupower frequency-set -g userspace
		cpupower frequency-set -f $CPUFREQ
	fi
else
	cpupower frequency-set -g performance
fi
current_governor=$(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor)
echo "INFO: Using $current_governor cpu governor for test."

if $PERFTOP; then
	echo "INFO: starting perf record on cores $CPUS_ALLOWED"
	perf record -C $CPUS_ALLOWED -o "$testdir/perf.data" &
	perf_pid=$!
fi

if $DPDKMEM; then
	echo "INFO: waiting to generate DPDK memory usage"
	# Request the stats half way through the run; drivers whose name does not
	# contain "perf" additionally get RAMP_TIME added to the delay.
	wait_time=$((RUNTIME / 2))
	if [[ ! "$PLUGIN" =~ "perf" ]]; then
		wait_time=$((wait_time + RAMP_TIME))
	fi
	(
		sleep $wait_time
		echo "INFO: generating DPDK memory usage"
		$rootdir/scripts/rpc.py env_dpdk_get_mem_stats
	) &
	dpdk_mem_pid=$!
fi

# Accumulators; each repetition adds its results, averages are taken afterwards.
iops_disks=0
bw=0
min_lat_disks_usec=0
max_lat_disks_usec=0
mean_lat_disks_usec=0
p90_lat_disks_usec=0
p99_lat_disks_usec=0
p99_99_lat_disks_usec=0
stdev_disks_usec=0
mean_slat_disks_usec=0
mean_clat_disks_usec=0

# Run each workload $REPEAT_NO times.
# NOTE(review): 'bc' is called with the expression as an argument, which the
# stock bc binary would treat as a file name - this assumes a bc() wrapper is
# provided by one of the sourced files; confirm.
# NOTE(review): ${k} in the file names below is never assigned in this file;
# unless a sourced helper sets it, it expands to an empty string.
for ((j = 0; j < REPEAT_NO; j++)); do
	if [[ $PLUGIN == "spdk-perf-bdev" ]]; then
		run_bdevperf > "$TMP_RESULT_FILE"
		read -r iops bandwidth <<< $(get_bdevperf_results)
		iops_disks=$(bc "$iops_disks + $iops")
		bw=$(bc "$bw + $bandwidth")
		cp "$TMP_RESULT_FILE" "$result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${k}_disks_${j}.output"
		[[ -f $TMP_BPF_FILE ]] && mv "$TMP_BPF_FILE" "$result_dir/bpftraces_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${k}_disks_${j}.txt"
	elif [[ $PLUGIN == "spdk-perf-nvme" ]]; then
		run_nvmeperf $DISKNO > "$TMP_RESULT_FILE"
		read -r iops bandwidth mean_lat min_lat max_lat <<< $(get_nvmeperf_results)

		iops_disks=$(bc "$iops_disks+$iops")
		bw=$(bc "$bw+$bandwidth")
		mean_lat_disks_usec=$(bc "$mean_lat_disks_usec + $mean_lat")
		min_lat_disks_usec=$(bc "$min_lat_disks_usec + $min_lat")
		max_lat_disks_usec=$(bc "$max_lat_disks_usec + $max_lat")

		cp "$TMP_RESULT_FILE" "$result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${k}_disks_${j}.output"
	else
		create_fio_config $DISKNO $PLUGIN "$DISK_NAMES" "$DISKS_NUMA" "$CORES"

		if $LATENCY_LOG; then
			write_log_opt="--write_lat_log=$result_dir/perf_lat_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}_${k}disks_${j}"
		fi

		# $fio_ioengine_opt and $write_log_opt are left unquoted on purpose:
		# they may hold several options, or nothing at all.
		if [[ "$PLUGIN" =~ "spdk-plugin" ]]; then
			run_spdk_nvme_fio $PLUGIN "--output=$TMP_RESULT_FILE" $write_log_opt
		else
			run_nvme_fio $fio_ioengine_opt "--output=$TMP_RESULT_FILE" $write_log_opt
		fi

		#Store values for every number of used disks
		#Use recalculated value for mixread param in case rw mode is not rw.
		rwmixread=$MIX
		if [[ $RW = *"read"* ]]; then
			rwmixread=100
		elif [[ $RW = *"write"* ]]; then
			rwmixread=0
		fi

		read -r iops bandwidth mean_lat_usec p90_lat_usec p99_lat_usec p99_99_lat_usec \
			stdev_usec mean_slat_usec mean_clat_usec <<< $(get_results $rwmixread)
		iops_disks=$(bc "$iops_disks + $iops")
		mean_lat_disks_usec=$(bc "$mean_lat_disks_usec + $mean_lat_usec")
		p90_lat_disks_usec=$(bc "$p90_lat_disks_usec + $p90_lat_usec")
		p99_lat_disks_usec=$(bc "$p99_lat_disks_usec + $p99_lat_usec")
		p99_99_lat_disks_usec=$(bc "$p99_99_lat_disks_usec + $p99_99_lat_usec")
		stdev_disks_usec=$(bc "$stdev_disks_usec + $stdev_usec")
		mean_slat_disks_usec=$(bc "$mean_slat_disks_usec + $mean_slat_usec")
		mean_clat_disks_usec=$(bc "$mean_clat_disks_usec + $mean_clat_usec")
		bw=$(bc "$bw + $bandwidth")

		cp "$TMP_RESULT_FILE" "$result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${k}_disks_${j}.json"
		cp "$testdir/config.fio" "$result_dir/config_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${k}_disks_${j}.fio"
		rm -f "$testdir/config.fio"
	fi
done

if $PERFTOP; then
	echo "INFO: Stopping perftop measurements."
	kill $perf_pid
	# perf exits non-zero when killed; do not let 'set -e' abort the script here.
	wait $perf_pid || true
	perf report -i "$testdir/perf.data" > "$result_dir/perftop_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}.txt"
	rm -f "$testdir/perf.data"
fi

if $DPDKMEM; then
	mv "/tmp/spdk_mem_dump.txt" "$result_dir/spdk_mem_dump_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}.txt"
	echo "INFO: DPDK memory usage saved in $result_dir"
fi

#Write results to csv file
# Turn the accumulated sums into per-run averages. Which latency figures are
# averaged depends on what the selected driver reported in the loop above.
iops_disks=$(bc "$iops_disks / $REPEAT_NO")
bw=$(bc "$bw / $REPEAT_NO")
if [[ "$PLUGIN" =~ "plugin" ]] || [[ "$PLUGIN" =~ "kernel" ]]; then
	mean_lat_disks_usec=$(bc "$mean_lat_disks_usec / $REPEAT_NO")
	p90_lat_disks_usec=$(bc "$p90_lat_disks_usec / $REPEAT_NO")
	p99_lat_disks_usec=$(bc "$p99_lat_disks_usec / $REPEAT_NO")
	p99_99_lat_disks_usec=$(bc "$p99_99_lat_disks_usec / $REPEAT_NO")
	stdev_disks_usec=$(bc "$stdev_disks_usec / $REPEAT_NO")
	mean_slat_disks_usec=$(bc "$mean_slat_disks_usec / $REPEAT_NO")
	mean_clat_disks_usec=$(bc "$mean_clat_disks_usec / $REPEAT_NO")
elif [[ "$PLUGIN" == "spdk-perf-nvme" ]]; then
	mean_lat_disks_usec=$(bc "$mean_lat_disks_usec/$REPEAT_NO")
fi

# Quoted so that an unexpectedly empty value cannot shift the CSV columns.
printf "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n" "${DISKNO}" "${iops_disks}" "${mean_lat_disks_usec}" "${p90_lat_disks_usec}" "${p99_lat_disks_usec}" \
	"${p99_99_lat_disks_usec}" "${stdev_disks_usec}" "${mean_slat_disks_usec}" "${mean_clat_disks_usec}" "${bw}" >> "$result_file"

# Put back the governor that was active before a custom frequency was applied.
if [[ -n "$CPUFREQ" ]]; then
	cpupower frequency-set -g $cpu_governor
fi

if [[ $PLUGIN == "kernel-io-uring" ]]; then
	# Reload the nvme driver so that other test runs are not affected
	modprobe -rv nvme
	modprobe nvme
	wait_for_nvme_reload $DISK_NAMES

	for disk in $DISK_NAMES; do
		echo "INFO: Restoring device parameters for $disk"
		sysfs=/sys/block/$disk/queue
		cat $backup_dir/$disk/iostats > $sysfs/iostats
		cat $backup_dir/$disk/rq_affinity > $sysfs/rq_affinity
		cat $backup_dir/$disk/nomerges > $sysfs/nomerges
		cat $backup_dir/$disk/io_poll_delay > $sysfs/io_poll_delay
	done
fi
rm -f "$testdir/bdev.conf" "$testdir/config.fio"