1#!/usr/bin/env bash 2 3set -e 4BASE_DIR=$(readlink -f $(dirname $0)) 5ROOT_DIR=$(readlink -f $BASE_DIR/../../..) 6rootdir=$ROOT_DIR 7PLUGIN_DIR=$ROOT_DIR/build/fio 8BDEVPERF_DIR=$ROOT_DIR/test/bdev/bdevperf 9NVMEPERF_DIR=$ROOT_DIR/build/examples 10. $ROOT_DIR/scripts/common.sh || exit 1 11. $ROOT_DIR/test/common/autotest_common.sh 12NVME_FIO_RESULTS=$BASE_DIR/result.json 13 14declare -A KERNEL_ENGINES 15KERNEL_ENGINES=( 16 ["kernel-libaio"]="--ioengine=libaio" 17 ["kernel-classic-polling"]="--ioengine=pvsync2 --hipri=100" 18 ["kernel-hybrid-polling"]="--ioengine=pvsync2 --hipri=100" 19 ["kernel-io-uring"]="--ioengine=io_uring") 20 21RW=randrw 22MIX=100 23IODEPTH=256 24BLK_SIZE=4096 25RUNTIME=600 26RAMP_TIME=30 27NUMJOBS=1 28REPEAT_NO=3 29FIO_BIN=$CONFIG_FIO_SOURCE_DIR/fio 30PLUGIN="nvme" 31DISKCFG="" 32DISKNO="ALL" 33CPUS_ALLOWED=1 34NOIOSCALING=false 35PRECONDITIONING=true 36DATE="$(date +'%m_%d_%Y_%H%M%S')" 37 38function discover_bdevs() { 39 local rootdir=$1 40 local config_file=$2 41 local cfg_type=$3 42 local wait_for_spdk_bdev=${4:-30} 43 local rpc_server=/var/tmp/spdk-discover-bdevs.sock 44 45 if [ ! -e $config_file ]; then 46 echo "Invalid Configuration File: $config_file" 47 return 1 48 fi 49 50 if [ -z $cfg_type ]; then 51 cfg_type="-c" 52 fi 53 54 # Start the bdev service to query for the list of available 55 # bdevs. 56 $rootdir/test/app/bdev_svc/bdev_svc -r $rpc_server -i 0 \ 57 $cfg_type $config_file &> /dev/null & 58 stubpid=$! 59 while ! [ -e /var/run/spdk_bdev0 ]; do 60 # If this counter drops to zero, errexit will be caught to abort the test 61 ((wait_for_spdk_bdev--)) 62 sleep 1 63 done 64 65 # Get all of the bdevs 66 $rootdir/scripts/rpc.py -s "$rpc_server" bdev_get_bdevs 67 68 # Shut down the bdev service 69 kill $stubpid 70 wait $stubpid 71 rm -f /var/run/spdk_bdev0 72} 73 74function create_spdk_bdev_conf() { 75 local output 76 local disk_cfg 77 local bdev_json_cfg 78 79 disk_cfg=($(grep -vP "^\s*#" "$DISKCFG")) 80 81 bdev_json_cfg=() 82 for i in "${!disk_cfg[@]}"; do 83 bdev_json_cfg+=("$( 84 cat <<- JSON 85 { 86 "method": "bdev_nvme_attach_controller", 87 "params": { 88 "trtype": "PCIe", 89 "name":"Nvme${i}", 90 "traddr":"${disk_cfg[i]}" 91 } 92 } 93 JSON 94 )") 95 done 96 97 local IFS="," 98 jq -r '.' <<- JSON > $BASE_DIR/bdev.conf 99 { 100 "subsystems": [ 101 { 102 "subsystem": "bdev", 103 "config": [ 104 ${bdev_json_cfg[*]} 105 ] 106 } 107 ] 108 } 109 JSON 110} 111 112function is_bdf_not_mounted() { 113 local bdf=$1 114 local blkname 115 local mountpoints 116 blkname=$(ls -l /sys/block/ | grep $bdf | awk '{print $9}') 117 mountpoints=$(lsblk /dev/$blkname --output MOUNTPOINT -n | wc -w) 118 return $mountpoints 119} 120 121function get_cores() { 122 local cpu_list="$1" 123 for cpu in ${cpu_list//,/ }; do 124 echo $cpu 125 done 126} 127 128function get_cores_numa_node() { 129 local cores=$1 130 for core in $cores; do 131 lscpu -p=cpu,node | grep "^$core\b" | awk -F ',' '{print $2}' 132 done 133} 134 135function get_numa_node() { 136 local plugin=$1 137 local disks=$2 138 if [[ "$plugin" =~ "nvme" ]]; then 139 for bdf in $disks; do 140 local driver 141 driver=$(grep DRIVER /sys/bus/pci/devices/$bdf/uevent | awk -F"=" '{print $2}') 142 # Use this check to ommit blacklisted devices ( not binded to driver with setup.sh script ) 143 if [ "$driver" = "vfio-pci" ] || [ "$driver" = "uio_pci_generic" ]; then 144 cat /sys/bus/pci/devices/$bdf/numa_node 145 fi 146 done 147 elif [[ "$plugin" =~ "bdev" ]]; then 148 local bdevs 149 bdevs=$(discover_bdevs $ROOT_DIR $BASE_DIR/bdev.conf --json) 150 for name in $disks; do 151 local bdev_bdf 152 bdev_bdf=$(jq -r ".[] | select(.name==\"$name\").driver_specific.nvme.pci_address" <<< $bdevs) 153 cat /sys/bus/pci/devices/$bdev_bdf/numa_node 154 done 155 else 156 for name in $disks; do 157 local bdf 158 # Not reading directly from /sys/block/nvme* because of a kernel bug 159 # which results in NUMA 0 always getting reported. 160 bdf=$(cat /sys/block/$name/device/address) 161 cat /sys/bus/pci/devices/$bdf/numa_node 162 done 163 fi 164} 165 166function get_disks() { 167 local plugin=$1 168 local disk_cfg 169 170 disk_cfg=($(grep -vP "^\s*#" "$DISKCFG")) 171 if [[ "$plugin" =~ "nvme" ]]; then 172 # PCI BDF address is enough for nvme-perf and nvme-fio-plugin, 173 # so just print them from configuration file 174 echo "${disk_cfg[*]}" 175 elif [[ "$plugin" =~ "bdev" ]]; then 176 # Generate NvmeXn1 bdev name configuration file for bdev-perf 177 # and bdev-fio-plugin 178 local bdevs 179 local disk_no 180 disk_no=${#disk_cfg[@]} 181 eval echo "Nvme{0..$((disk_no - 1))}n1" 182 else 183 # Find nvme block devices and only use the ones which 184 # are not mounted 185 for bdf in "${disk_cfg[@]}"; do 186 if is_bdf_not_mounted $bdf; then 187 local blkname 188 blkname=$(ls -l /sys/block/ | grep $bdf | awk '{print $9}') 189 echo $blkname 190 fi 191 done 192 fi 193} 194 195function get_disks_on_numa() { 196 local devs=($1) 197 local numas=($2) 198 local numa_no=$3 199 local disks_on_numa="" 200 local i 201 202 for ((i = 0; i < ${#devs[@]}; i++)); do 203 if [ ${numas[$i]} = $numa_no ]; then 204 disks_on_numa=$((disks_on_numa + 1)) 205 fi 206 done 207 echo $disks_on_numa 208} 209 210function create_fio_config() { 211 local disk_no=$1 212 local plugin=$2 213 local disks=($3) 214 local disks_numa=($4) 215 local cores=($5) 216 local total_disks=${#disks[@]} 217 local no_cores=${#cores[@]} 218 local filename="" 219 220 local cores_numa 221 cores_numa=($(get_cores_numa_node "$5")) 222 local disks_per_core=$((disk_no / no_cores)) 223 local disks_per_core_mod=$((disk_no % no_cores)) 224 225 # For kernel dirver, each disk will be alligned with all cpus on the same NUMA node 226 if [[ "$plugin" =~ "kernel" ]]; then 227 for ((i = 0; i < disk_no; i++)); do 228 sed -i -e "\$a[filename${i}]" $BASE_DIR/config.fio 229 filename="/dev/${disks[$i]}" 230 sed -i -e "\$afilename=$filename" $BASE_DIR/config.fio 231 cpu_used="" 232 for ((j = 0; j < no_cores; j++)); do 233 core_numa=${cores_numa[$j]} 234 if [ "${disks_numa[$i]}" = "$core_numa" ]; then 235 cpu_used+="${cores[$j]}," 236 fi 237 done 238 sed -i -e "\$acpus_allowed=$cpu_used" $BASE_DIR/config.fio 239 echo "" >> $BASE_DIR/config.fio 240 done 241 else 242 for ((i = 0; i < no_cores; i++)); do 243 core_numa=${cores_numa[$i]} 244 total_disks_per_core=$disks_per_core 245 if [ "$disks_per_core_mod" -gt "0" ]; then 246 total_disks_per_core=$((disks_per_core + 1)) 247 disks_per_core_mod=$((disks_per_core_mod - 1)) 248 fi 249 250 if [ "$total_disks_per_core" = "0" ]; then 251 break 252 fi 253 254 sed -i -e "\$a[filename${i}]" $BASE_DIR/config.fio 255 #use cpus_allowed as cpumask works only for cores 1-32 256 sed -i -e "\$acpus_allowed=${cores[$i]}" $BASE_DIR/config.fio 257 m=0 #counter of disks per cpu core numa 258 n=0 #counter of all disks 259 while [ "$m" -lt "$total_disks_per_core" ]; do 260 if [ ${disks_numa[$n]} = $core_numa ]; then 261 m=$((m + 1)) 262 if [[ "$plugin" = "spdk-plugin-nvme" ]]; then 263 filename='trtype=PCIe traddr='${disks[$n]//:/.}' ns=1' 264 elif [[ "$plugin" = "spdk-plugin-bdev" ]]; then 265 filename=${disks[$n]} 266 fi 267 sed -i -e "\$afilename=$filename" $BASE_DIR/config.fio 268 #Mark numa of n'th disk as "x" to mark it as claimed 269 disks_numa[$n]="x" 270 fi 271 n=$((n + 1)) 272 # If there is no more disks with numa node same as cpu numa node, switch to other numa node. 273 if [ $n -ge $total_disks ]; then 274 if [ "$core_numa" = "1" ]; then 275 core_numa=0 276 else 277 core_numa=1 278 fi 279 n=0 280 fi 281 done 282 echo "" >> $BASE_DIR/config.fio 283 done 284 fi 285} 286 287function preconditioning() { 288 local dev_name="" 289 local filename="" 290 local nvme_list 291 292 HUGEMEM=8192 $ROOT_DIR/scripts/setup.sh 293 cp $BASE_DIR/config.fio.tmp $BASE_DIR/config.fio 294 echo "[Preconditioning]" >> $BASE_DIR/config.fio 295 296 # Generate filename argument for FIO. 297 # We only want to target NVMes not bound to nvme driver. 298 # If they're still bound to nvme that means they were skipped by 299 # setup.sh on purpose. 300 nvme_list=$(get_disks nvme) 301 for nvme in $nvme_list; do 302 dev_name='trtype=PCIe traddr='${nvme//:/.}' ns=1' 303 filename+=$(printf %s":" "$dev_name") 304 done 305 echo "** Preconditioning disks, this can take a while, depending on the size of disks." 306 run_spdk_nvme_fio "spdk-plugin-nvme" --filename="$filename" --size=100% --loops=2 --bs=1M \ 307 --rw=write --iodepth=32 --output-format=normal 308 rm -f $BASE_DIR/config.fio 309} 310 311function get_results() { 312 local reads_pct 313 local writes_pct 314 315 reads_pct=$(bc -l <<< "scale=3; $2/100") 316 writes_pct=$(bc -l <<< "scale=3; 1-$reads_pct") 317 case "$1" in 318 iops) 319 iops=$(jq -r '.jobs[] | .read.iops + .write.iops' $NVME_FIO_RESULTS) 320 iops=${iops%.*} 321 echo $iops 322 ;; 323 mean_lat_usec) 324 mean_lat=$(jq -r ".jobs[] | (.read.lat_ns.mean * $reads_pct + .write.lat_ns.mean * $writes_pct)" $NVME_FIO_RESULTS) 325 mean_lat=${mean_lat%.*} 326 echo $((mean_lat / 1000)) 327 ;; 328 p99_lat_usec) 329 p99_lat=$(jq -r ".jobs[] | (.read.clat_ns.percentile.\"99.000000\" // 0 * $reads_pct + .write.clat_ns.percentile.\"99.000000\" // 0 * $writes_pct)" $NVME_FIO_RESULTS) 330 p99_lat=${p99_lat%.*} 331 echo $((p99_lat / 1000)) 332 ;; 333 p99_99_lat_usec) 334 p99_99_lat=$(jq -r ".jobs[] | (.read.clat_ns.percentile.\"99.990000\" // 0 * $reads_pct + .write.clat_ns.percentile.\"99.990000\" // 0 * $writes_pct)" $NVME_FIO_RESULTS) 335 p99_99_lat=${p99_99_lat%.*} 336 echo $((p99_99_lat / 1000)) 337 ;; 338 stdev_usec) 339 stdev=$(jq -r ".jobs[] | (.read.clat_ns.stddev * $reads_pct + .write.clat_ns.stddev * $writes_pct)" $NVME_FIO_RESULTS) 340 stdev=${stdev%.*} 341 echo $((stdev / 1000)) 342 ;; 343 mean_slat_usec) 344 mean_slat=$(jq -r ".jobs[] | (.read.slat_ns.mean * $reads_pct + .write.slat_ns.mean * $writes_pct)" $NVME_FIO_RESULTS) 345 mean_slat=${mean_slat%.*} 346 echo $((mean_slat / 1000)) 347 ;; 348 mean_clat_usec) 349 mean_clat=$(jq -r ".jobs[] | (.read.clat_ns.mean * $reads_pct + .write.clat_ns.mean * $writes_pct)" $NVME_FIO_RESULTS) 350 mean_clat=${mean_clat%.*} 351 echo $((mean_clat / 1000)) 352 ;; 353 bw_Kibs) 354 bw=$(jq -r ".jobs[] | (.read.bw + .write.bw)" $NVME_FIO_RESULTS) 355 bw=${bw%.*} 356 echo $((bw)) 357 ;; 358 esac 359} 360 361function get_bdevperf_results() { 362 case "$1" in 363 iops) 364 iops=$(grep Total $NVME_FIO_RESULTS | awk -F 'Total' '{print $2}' | awk '{print $2}') 365 iops=${iops%.*} 366 echo $iops 367 ;; 368 bw_Kibs) 369 bw_MBs=$(grep Total $NVME_FIO_RESULTS | awk -F 'Total' '{print $2}' | awk '{print $4}') 370 bw_MBs=${bw_MBs%.*} 371 echo $((bw_MBs * 1024)) 372 ;; 373 esac 374} 375 376function get_nvmeperf_results() { 377 local iops 378 local bw_MBs 379 local mean_lat_usec 380 local max_lat_usec 381 local min_lat_usec 382 383 read -r iops bw_MBs mean_lat_usec min_lat_usec max_lat_usec <<< $(tr -s " " < $NVME_FIO_RESULTS | grep -oP "(?<=Total : )(.*+)") 384 385 # We need to get rid of the decimal spaces due 386 # to use of arithmetic expressions instead of "bc" for calculations 387 iops=${iops%.*} 388 bw_MBs=${bw_MBs%.*} 389 mean_lat_usec=${mean_lat_usec%.*} 390 min_lat_usec=${min_lat_usec%.*} 391 max_lat_usec=${max_lat_usec%.*} 392 393 echo "$iops $(bc <<< "$bw_MBs * 1024") $mean_lat_usec $min_lat_usec $max_lat_usec" 394} 395 396function run_spdk_nvme_fio() { 397 local plugin=$1 398 echo "** Running fio test, this can take a while, depending on the run-time and ramp-time setting." 399 if [[ "$plugin" = "spdk-plugin-nvme" ]]; then 400 LD_PRELOAD=$PLUGIN_DIR/spdk_nvme $FIO_BIN $BASE_DIR/config.fio --output-format=json "${@:2}" --ioengine=spdk 401 elif [[ "$plugin" = "spdk-plugin-bdev" ]]; then 402 LD_PRELOAD=$PLUGIN_DIR/spdk_bdev $FIO_BIN $BASE_DIR/config.fio --output-format=json "${@:2}" --ioengine=spdk_bdev --spdk_json_conf=$BASE_DIR/bdev.conf --spdk_mem=4096 403 fi 404 405 sleep 1 406} 407 408function run_nvme_fio() { 409 echo "** Running fio test, this can take a while, depending on the run-time and ramp-time setting." 410 $FIO_BIN $BASE_DIR/config.fio --output-format=json "$@" 411 sleep 1 412} 413 414function run_bdevperf() { 415 echo "** Running bdevperf test, this can take a while, depending on the run-time setting." 416 $BDEVPERF_DIR/bdevperf --json $BASE_DIR/bdev.conf -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -m "[$CPUS_ALLOWED]" 417 sleep 1 418} 419 420function run_nvmeperf() { 421 # Prepare -r argument string for nvme perf command 422 local r_opt 423 local disks 424 425 # Limit the number of disks to $1 if needed 426 disks=($(get_disks nvme)) 427 disks=("${disks[@]:0:$1}") 428 r_opt=$(printf -- ' -r "trtype:PCIe traddr:%s"' "${disks[@]}") 429 430 echo "** Running nvme perf test, this can take a while, depending on the run-time setting." 431 432 # Run command in separate shell as this solves quoting issues related to r_opt var 433 $SHELL -c "$NVMEPERF_DIR/perf $r_opt -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -c [$CPUS_ALLOWED]" 434 sleep 1 435} 436 437function wait_for_nvme_reload() { 438 local nvmes=$1 439 440 shopt -s extglob 441 for disk in $nvmes; do 442 cmd="ls /sys/block/$disk/queue/*@(iostats|rq_affinity|nomerges|io_poll_delay)*" 443 until $cmd 2> /dev/null; do 444 echo "Waiting for full nvme driver reload..." 445 sleep 0.5 446 done 447 done 448 shopt -q extglob 449} 450 451function verify_disk_number() { 452 # Check if we have appropriate number of disks to carry out the test 453 disks=($(get_disks $PLUGIN)) 454 if [[ $DISKNO == "ALL" ]] || [[ $DISKNO == "all" ]]; then 455 DISKNO=${#disks[@]} 456 elif [[ $DISKNO -gt ${#disks[@]} ]] || [[ ! $DISKNO =~ ^[0-9]+$ ]]; then 457 echo "error: Required devices number ($DISKNO) is not a valid number or it's larger than the number of devices found (${#disks[@]})" 458 false 459 fi 460} 461 462function usage() { 463 set +x 464 [[ -n $2 ]] && ( 465 echo "$2" 466 echo "" 467 ) 468 echo "Run NVMe PMD/BDEV performance test. Change options for easier debug and setup configuration" 469 echo "Usage: $(basename $1) [options]" 470 echo "-h, --help Print help and exit" 471 echo 472 echo "Workload parameters:" 473 echo " --rw=STR Type of I/O pattern. Accepted values are randrw,rw. [default=$RW]" 474 echo " --rwmixread=INT Percentage of a mixed workload that should be reads. [default=$MIX]" 475 echo " --iodepth=INT Number of I/Os to keep in flight against the file. [default=$IODEPTH]" 476 echo " --block-size=INT The block size in bytes used for I/O units. [default=$BLK_SIZE]" 477 echo " --run-time=TIME[s] Tell fio to run the workload for the specified period of time. [default=$RUNTIME]" 478 echo " --ramp-time=TIME[s] Fio will run the specified workload for this amount of time before" 479 echo " logging any performance numbers. [default=$RAMP_TIME]. Applicable only for fio-based tests." 480 echo " --numjobs=INT Create the specified number of clones of this job. [default=$NUMJOBS]" 481 echo " Applicable only for fio-based tests." 482 echo " --repeat-no=INT How many times to repeat workload test. [default=$REPEAT_NO]" 483 echo " Test result will be an average of repeated test runs." 484 echo " --fio-bin=PATH Path to fio binary. [default=$FIO_BIN]" 485 echo " Applicable only for fio-based tests." 486 echo 487 echo "Test setup parameters:" 488 echo " --driver=STR Selects tool used for testing. Choices available:" 489 echo " - spdk-perf-nvme (SPDK nvme perf)" 490 echo " - spdk-perf-bdev (SPDK bdev perf)" 491 echo " - spdk-plugin-nvme (SPDK nvme fio plugin)" 492 echo " - spdk-plugin-bdev (SPDK bdev fio plugin)" 493 echo " - kernel-classic-polling" 494 echo " - kernel-hybrid-polling" 495 echo " - kernel-libaio" 496 echo " - kernel-io-uring" 497 echo " --disk-config Configuration file containing PCI BDF addresses of NVMe disks to use in test." 498 echo " It consists a single column of PCI addresses. SPDK Bdev names will be assigned" 499 echo " and Kernel block device names detected." 500 echo " Lines starting with # are ignored as comments." 501 echo " --max-disk=INT,ALL Number of disks to test on, this will run multiple workloads with increasing number of disk each run." 502 echo " If =ALL then test on all found disk. [default=$DISKNO]" 503 echo " --cpu-allowed=INT Comma-separated list of CPU cores used to run the workload. [default=$CPUS_ALLOWED]" 504 echo " --no-preconditioning Skip preconditioning" 505 echo " --no-io-scaling Do not scale iodepth for each device in SPDK fio plugin. [default=$NOIOSCALING]" 506 set -x 507} 508 509while getopts 'h-:' optchar; do 510 case "$optchar" in 511 -) 512 case "$OPTARG" in 513 help) 514 usage $0 515 exit 0 516 ;; 517 rw=*) RW="${OPTARG#*=}" ;; 518 rwmixread=*) MIX="${OPTARG#*=}" ;; 519 iodepth=*) IODEPTH="${OPTARG#*=}" ;; 520 block-size=*) BLK_SIZE="${OPTARG#*=}" ;; 521 run-time=*) RUNTIME="${OPTARG#*=}" ;; 522 ramp-time=*) RAMP_TIME="${OPTARG#*=}" ;; 523 numjobs=*) NUMJOBS="${OPTARG#*=}" ;; 524 repeat-no=*) REPEAT_NO="${OPTARG#*=}" ;; 525 fio-bin=*) FIO_BIN="${OPTARG#*=}" ;; 526 driver=*) PLUGIN="${OPTARG#*=}" ;; 527 disk-config=*) 528 DISKCFG="${OPTARG#*=}" 529 if [[ ! -f "$DISKCFG" ]]; then 530 echo "Disk confiuration file $DISKCFG does not exist!" 531 exit 1 532 fi 533 ;; 534 max-disk=*) DISKNO="${OPTARG#*=}" ;; 535 cpu-allowed=*) CPUS_ALLOWED="${OPTARG#*=}" ;; 536 no-preconditioning) PRECONDITIONING=false ;; 537 no-io-scaling) NOIOSCALING=true ;; 538 *) 539 usage $0 echo "Invalid argument '$OPTARG'" 540 exit 1 541 ;; 542 esac 543 ;; 544 h) 545 usage $0 546 exit 0 547 ;; 548 *) 549 usage $0 "Invalid argument '$optchar'" 550 exit 1 551 ;; 552 esac 553done 554