xref: /spdk/test/nvme/perf/run_perf.sh (revision b317d8f3968389b18ae0bcbd1739b1fe3a591d92)
1#!/usr/bin/env bash
2#  SPDX-License-Identifier: BSD-3-Clause
3#  Copyright (C) 2018 Intel Corporation
4#  All rights reserved.
5#
# Abort on the first unhandled command failure.
set -e

# Dir variables and sourcing common files
# Paths are quoted so that a checkout located under a directory whose name
# contains whitespace still resolves and sources correctly.
testdir=$(readlink -f "$(dirname "$0")")
rootdir=$(readlink -f "$testdir/../../..")
# Not referenced again in this file.
# NOTE(review): presumably read by the sourced helper scripts - confirm.
plugin_dir=$rootdir/build/fio
source "$testdir/common.sh"
source "$rootdir/scripts/common.sh" || exit 1
source "$rootdir/test/common/autotest_common.sh"
15
# Global & default variables

# fio engine flags for each kernel-mode driver, keyed by the --driver value.
# The classic and hybrid polling entries carry identical flags.
declare -A KERNEL_ENGINES=(
	["kernel-libaio"]="--ioengine=libaio"
	["kernel-classic-polling"]="--ioengine=pvsync2 --hipri=100"
	["kernel-hybrid-polling"]="--ioengine=pvsync2 --hipri=100"
	["kernel-io-uring"]="--ioengine=io_uring"
)

# Workload shape
RW=randrw
MIX=100
IODEPTH=256
BLK_SIZE=4096
RUNTIME=600
RAMP_TIME=30
NUMJOBS=1
REPEAT_NO=3

# fio-specific knobs
GTOD_REDUCE=false
SAMPLING_INT=0
LATENCY_LOG=false
IO_BATCH_SUBMIT=0
IO_BATCH_COMPLETE=0
FIO_BIN=$CONFIG_FIO_SOURCE_DIR/fio
FIO_FNAME_STRATEGY="group"

# Test setup: driver, disks, CPUs
PLUGIN="nvme"
DISKCFG=""
BDEV_CACHE=""
BDEV_POOL=""
DISKNO="ALL"
CPUS_ALLOWED=1
MAIN_CORE=""
NOIOSCALING=false
PRECONDITIONING=true
CPUFREQ=""

# Optional measurements taken alongside the workload
PERFTOP=false
DPDKMEM=false
BPFTRACES=()

# Scratch files and the timestamp used in result names
TMP_RESULT_FILE=$testdir/result.json
TMP_BPF_FILE=$testdir/bpftraces.txt
DATE="$(date +'%m_%d_%Y_%H%M%S')"
55
# Print the command-line help for this script on stdout.
# Arguments:
#   $1 - path of the script; only its basename appears in the "Usage:" line
#   $2 - optional message (for example an error description) that is printed,
#        followed by an empty line, before the help text
# Globals: reads the current option values to show them as defaults.
# Note: xtrace is switched off while the help is printed and is switched on
# unconditionally before returning.
function usage() {
	set +x
	if [[ -n $2 ]]; then
		echo "$2"
		echo ""
	fi
	echo "Run NVMe PMD/BDEV performance test. Change options for easier debug and setup configuration"
	echo "Usage: $(basename "$1") [options]"
	echo "-h, --help                Print help and exit"
	echo
	echo "Workload parameters:"
	echo "    --rw=STR                 Type of I/O pattern. Accepted values are randrw,rw. [default=$RW]"
	echo "    --rwmixread=INT          Percentage of a mixed workload that should be reads. [default=$MIX]"
	echo "    --iodepth=INT            Number of I/Os to keep in flight against the file. [default=$IODEPTH]"
	echo "    --block-size=INT         The  block  size  in  bytes  used for I/O units. [default=$BLK_SIZE]"
	echo "    --run-time=TIME[s]       Tell fio to run the workload for the specified period of time. [default=$RUNTIME]"
	echo "    --ramp-time=TIME[s]      Fio will run the specified workload for this amount of time before"
	echo "                             logging any performance numbers. [default=$RAMP_TIME]. Applicable only for fio-based tests."
	echo "    --numjobs=INT            Create the specified number of clones of this job. [default=$NUMJOBS]"
	echo "                             Applicable only for fio-based tests."
	echo "    --repeat-no=INT          How many times to repeat workload test. [default=$REPEAT_NO]"
	echo "                             Test result will be an average of repeated test runs."
	echo "    --gtod-reduce            Enable fio gtod_reduce option. [default=$GTOD_REDUCE]"
	echo "    --sampling-int=INT       Value for fio log_avg_msec parameters [default=$SAMPLING_INT]"
	echo "    --latency-log            Write latency log file using write_lat_log fio option [default=$LATENCY_LOG]"
	echo "    --io-batch-submit=INT    Value for iodepth_batch_submit fio option [default=$IO_BATCH_SUBMIT]"
	echo "    --io-batch-complete=INT  Value for iodepth_batch_complete fio option [default=$IO_BATCH_COMPLETE]"
	echo "    --fio-bin=PATH           Path to fio binary. [default=$FIO_BIN]"
	echo "                             Applicable only for fio-based tests."
	echo "    --fio-fname-strategy=STR Use 'group' to group filenames under job section with common CPU or"
	echo "                             use 'split' to create a separate fio job section for each filename [default=$FIO_FNAME_STRATEGY]"
	echo
	echo "Test setup parameters:"
	echo "    --driver=STR            Selects tool used for testing. Choices available:"
	echo "                               - spdk-perf-nvme (SPDK nvme perf)"
	echo "                               - spdk-perf-bdev (SPDK bdev perf)"
	echo "                               - spdk-perf-xnvme-bdev (SPDK xnvme bdev perf with io_uring io_mechanism)"
	echo "                               - spdk-plugin-nvme (SPDK nvme fio plugin)"
	echo "                               - spdk-plugin-bdev (SPDK bdev fio plugin)"
	echo "                               - spdk-plugin-bdev-xnvme (SPDK bdev fio plugin with xnvme bdevs)"
	echo "                               - kernel-classic-polling"
	echo "                               - kernel-hybrid-polling"
	echo "                               - kernel-libaio"
	echo "                               - kernel-io-uring"
	# The option parser only accepts the --name=value spelling for the options
	# below, so the help shows the "=VALUE" part explicitly.
	echo "    --disk-config=PATH      Configuration file containing PCI BDF addresses of NVMe disks to use in test."
	echo "                            It consists of a single column of PCI addresses. SPDK Bdev names will be assigned"
	echo "                            and Kernel block device names detected."
	echo "                            Lines starting with # are ignored as comments."
	echo "    --bdev-io-cache-size=INT Set IO cache size for SPDK bdev subsystem."
	echo "    --bdev-io-pool-size=INT Set IO pool size for SPDK bdev subsystem."
	echo "    --max-disk=INT,ALL      Number of disks to test on, this will run multiple workloads with increasing number of disk each run."
	echo "                            If =ALL then test on all found disk. [default=$DISKNO]"
	echo "    --cpu-allowed=INT/PATH  Comma-separated list of CPU cores used to run the workload. Ranges allowed."
	echo "                            Can also point to a file containing list of CPUs. [default=$CPUS_ALLOWED]"
	echo "    --no-preconditioning    Skip preconditioning"
	echo "    --no-io-scaling         Do not scale iodepth for each device in SPDK fio plugin. [default=$NOIOSCALING]"
	echo "    --cpu-frequency=INT     Run tests with CPUs set to a desired frequency. 'intel_pstate=disable' must be set in"
	echo "                            GRUB options. You can use 'cpupower frequency-info' and 'cpupower frequency-set' to"
	echo "                            check list of available frequencies. Example: --cpu-frequency=1100000."
	echo "    --main-core=INT         main (primary) core for DPDK (for bdevperf only)."
	echo
	echo "Other options:"
	echo "    --perftop           Run perftop measurements on the same CPU cores as specified in --cpu-allowed option."
	echo "    --dpdk-mem-stats    Dump DPDK memory stats during the test."
	echo "    --bpf-traces=LIST       Comma delimited list of .bt scripts for enabling BPF traces."
	echo "                            List of .bt scripts available in spdk/scripts/bpf."
	echo "                            Only for spdk-perf-bdev"
	set -x
}
125
# Parse the command line into the global configuration variables.
#
# Supported spellings are "-h" and long options ("--name" or "--name=value").
# Long options go through getopts' "-" pseudo-option: for "--foo=bar" getopts
# reports option "-" with OPTARG set to "foo=bar", which the inner case matches.
#
# Arguments: the script's command-line arguments.
# Exits: 0 after printing help; 1 on an unknown option or when the file given
#        to --disk-config does not exist.
function parse_perf_args() {
	local optchar OPTIND=1

	while getopts 'h-:' optchar; do
		case "$optchar" in
			-)
				case "$OPTARG" in
					help)
						usage "$0"
						exit 0
						;;
					rw=*) RW="${OPTARG#*=}" ;;
					rwmixread=*) MIX="${OPTARG#*=}" ;;
					iodepth=*) IODEPTH="${OPTARG#*=}" ;;
					block-size=*) BLK_SIZE="${OPTARG#*=}" ;;
					run-time=*) RUNTIME="${OPTARG#*=}" ;;
					ramp-time=*) RAMP_TIME="${OPTARG#*=}" ;;
					numjobs=*) NUMJOBS="${OPTARG#*=}" ;;
					repeat-no=*) REPEAT_NO="${OPTARG#*=}" ;;
					gtod-reduce) GTOD_REDUCE=true ;;
					sampling-int=*) SAMPLING_INT="${OPTARG#*=}" ;;
					io-batch-submit=*) IO_BATCH_SUBMIT="${OPTARG#*=}" ;;
					io-batch-complete=*) IO_BATCH_COMPLETE="${OPTARG#*=}" ;;
					fio-bin=*) FIO_BIN="${OPTARG#*=}" ;;
					fio-fname-strategy=*)
						FIO_FNAME_STRATEGY="${OPTARG#*=}"
						# The "split" strategy also turns iodepth scaling off.
						if [[ "$FIO_FNAME_STRATEGY" == "split" ]]; then
							NOIOSCALING=true
						fi
						;;
					driver=*) PLUGIN="${OPTARG#*=}" ;;
					disk-config=*)
						DISKCFG="${OPTARG#*=}"
						if [[ ! -f "$DISKCFG" ]]; then
							echo "Disk configuration file $DISKCFG does not exist!"
							exit 1
						fi
						;;
					bdev-io-cache-size=*) BDEV_CACHE="${OPTARG#*=}" ;;
					bdev-io-pool-size=*) BDEV_POOL="${OPTARG#*=}" ;;
					max-disk=*) DISKNO="${OPTARG#*=}" ;;
					cpu-allowed=*)
						CPUS_ALLOWED="${OPTARG#*=}"
						# The value may name a file holding the CPU list.
						if [[ -f "$CPUS_ALLOWED" ]]; then
							CPUS_ALLOWED=$(cat "$CPUS_ALLOWED")
						fi
						;;
					no-preconditioning) PRECONDITIONING=false ;;
					no-io-scaling) NOIOSCALING=true ;;
					cpu-frequency=*) CPUFREQ="${OPTARG#*=}" ;;
					perftop) PERFTOP=true ;;
					dpdk-mem-stats) DPDKMEM=true ;;
					bpf-traces=*) IFS="," read -r -a BPFTRACES <<< "${OPTARG#*=}" ;;
					latency-log) LATENCY_LOG=true ;;
					main-core=*) MAIN_CORE="${OPTARG#*=}" ;;
					*)
						# The message must be usage's second argument; a stray
						# "echo" word here used to be printed in its place.
						usage "$0" "Invalid argument '$OPTARG'"
						exit 1
						;;
				esac
				;;
			h)
				usage "$0"
				exit 0
				;;
			*)
				usage "$0" "Invalid argument '$optchar'"
				exit 1
				;;
		esac
	done
}

parse_perf_args "$@"
194
# Per-run output locations. The tag encodes the workload parameters and the
# start timestamp and is shared by the directory and the CSV file name.
run_tag=${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}
result_dir=$testdir/results/perf_results_${run_tag}
result_file=$result_dir/perf_results_${run_tag}.csv
mkdir -p "$result_dir"
unset iops_disks bw mean_lat_disks_usec p90_lat_disks_usec p99_lat_disks_usec p99_99_lat_disks_usec stdev_disks_usec

# The core count goes into the CSV below, so it has to be known here; it used
# to be expanded before any assignment, which dropped the field and shifted
# the following columns. Word splitting of $CORES into the array is intended.
# NOTE(review): assumes get_cores only prints the parsed list and has no side
# effects, so calling it here as well as later in the script is harmless.
CORES=$(get_cores "$CPUS_ALLOWED")
NO_CORES_ARRAY=($CORES)
NO_CORES=${#NO_CORES_ARRAY[@]}

# CSV layout: configuration header + values, then the header of the result
# row that is appended at the end of the run. Values are quoted so an empty
# one still yields an (empty) field instead of shifting the columns.
{
	echo "run-time,ramp-time,fio-plugin,QD,block-size,num-cpu-cores,workload,workload-mix"
	printf "%s,%s,%s,%s,%s,%s,%s,%s\n" "$RUNTIME" "$RAMP_TIME" "$PLUGIN" "$IODEPTH" "$BLK_SIZE" "$NO_CORES" "$RW" "$MIX"
	echo "num_of_disks,iops,avg_lat[usec],p90[usec],p99[usec],p99.99[usec],stdev[usec],avg_slat[usec],avg_clat[usec],bw[Kib/s]"
} > "$result_file"
202
# Best-effort cleanup on a failed command or SIGTERM/SIGABRT: drop generated
# files, stop the optional background helpers, then print a backtrace.
# perf_pid and dpdk_mem_pid are only assigned when --perftop and
# --dpdk-mem-stats are used, so each is checked first: a bare "kill" fails
# with a usage error and a bare "wait" blocks on every child.
# The handler stays inline, not in a function, so print_backtrace runs at the
# same call depth as before.
trap 'rm -f *.state $testdir/bdev.conf
	if [[ -n ${perf_pid:-} ]]; then
		kill "$perf_pid" || true
		wait "$perf_pid" || true
	fi
	if [[ -n ${dpdk_mem_pid:-} ]]; then
		wait "$dpdk_mem_pid" || true
	fi
	print_backtrace' ERR SIGTERM SIGABRT
204
# Generate the bdev configuration for drivers that need one. "xnvme" is
# checked first; any other driver whose name contains "bdev" gets the regular
# configuration, and the remaining drivers get none.
case "$PLUGIN" in
	*xnvme*) create_spdk_xnvme_bdev_conf "$BDEV_CACHE" "$BDEV_POOL" ;;
	*bdev*) create_spdk_bdev_conf "$BDEV_CACHE" "$BDEV_POOL" ;;
esac

# Show the configuration in the log when a non-empty one exists.
if [[ -s $testdir/bdev.conf ]]; then
	printf '%s\n' "INFO: Generated bdev.conf file:"
	cat $testdir/bdev.conf
fi
215
# Work out which disks and CPU cores take part in the test.
verify_disk_number
case "$PLUGIN" in
	*xnvme*)
		# xnvme drivers look the disks up without a driver hint.
		DISK_NAMES=$(get_disks)
		DISKS_NUMA=$(get_numa_node "" "$DISK_NAMES")
		;;
	*)
		DISK_NAMES=$(get_disks $PLUGIN)
		DISKS_NUMA=$(get_numa_node $PLUGIN "$DISK_NAMES")
		;;
esac

# $CORES is split on whitespace on purpose: one array element per core.
CORES=$(get_cores "$CPUS_ALLOWED")
NO_CORES_ARRAY=($CORES)
NO_CORES=${#NO_CORES_ARRAY[@]}

# PRECONDITIONING holds "true" or "false" and is executed as a command.
if $PRECONDITIONING; then preconditioning; fi
231
# Kernel-mode and xnvme drivers: reset the devices via setup.sh and tune the
# block layer for the selected polling mode.
if [[ $PLUGIN == *kernel* || $PLUGIN == *xnvme* ]]; then
	$rootdir/scripts/setup.sh reset
	# Empty for drivers that have no entry in KERNEL_ENGINES.
	fio_ioengine_opt="${KERNEL_ENGINES[$PLUGIN]}"

	case "$PLUGIN" in
		kernel-classic-polling)
			for disk in $DISK_NAMES; do
				echo -1 > /sys/block/$disk/queue/io_poll_delay
			done
			;;
		kernel-hybrid-polling)
			for disk in $DISK_NAMES; do
				echo 0 > /sys/block/$disk/queue/io_poll_delay
			done
			;;
		kernel-io-uring | *xnvme*)
			# Reload the nvme driver with poll queues enabled.
			modprobe -rv nvme
			modprobe nvme poll_queues=8
			wait_for_nvme_reload $DISK_NAMES

			backup_dir="/tmp/nvme_param_bak"
			mkdir -p $backup_dir

			# First pass: remember the current queue settings of every disk.
			for disk in $DISK_NAMES; do
				echo "INFO: Backing up device parameters for $disk"
				sysfs=/sys/block/$disk/queue
				mkdir -p $backup_dir/$disk
				for param in iostats rq_affinity nomerges io_poll_delay; do
					cat $sysfs/$param > $backup_dir/$disk/$param
				done
			done

			# Second pass: apply the test settings, written as name=value.
			for disk in $DISK_NAMES; do
				echo "INFO: Setting device parameters for $disk"
				sysfs=/sys/block/$disk/queue
				for setting in iostats=0 rq_affinity=0 nomerges=2 io_poll_delay=-1; do
					echo "${setting#*=}" > $sysfs/${setting%%=*}
				done
			done
			;;
	esac
fi
272
# Remember the governor in use before the test changes it.
governor_sysfs=/sys/devices/system/cpu/cpu0/cpufreq/scaling_governor
cpu_governor=$(< $governor_sysfs)

if [[ -z $CPUFREQ ]]; then
	# No fixed frequency requested: run with the performance governor.
	cpupower frequency-set -g performance
elif [[ $(< /proc/cmdline) == *"intel_pstate=disable"* ]]; then
	# A fixed frequency is set through the userspace governor.
	cpupower frequency-set -g userspace
	cpupower frequency-set -f $CPUFREQ
else
	echo "ERROR: Cannot set custom CPU frequency for test. intel_pstate=disable not in boot options."
	# Fail the script here (errexit / ERR trap).
	false
fi
current_governor=$(< $governor_sysfs)
echo "INFO: Using $current_governor cpu governor for test."
288
# Optional: record perf samples on the test cores for the whole run.
if $PERFTOP; then
	printf '%s\n' "INFO: starting perf record on cores $CPUS_ALLOWED"
	perf record -C $CPUS_ALLOWED -o "$testdir/perf.data" &
	perf_pid=$!
fi

# Optional: request a DPDK memory dump roughly in the middle of the run.
if $DPDKMEM; then
	printf '%s\n' "INFO: waiting to generate DPDK memory usage"
	# Drivers without "perf" in their name also wait out the ramp time.
	if [[ $PLUGIN == *perf* ]]; then
		wait_time=$((RUNTIME / 2))
	else
		wait_time=$((RUNTIME / 2 + RAMP_TIME))
	fi
	{
		sleep $wait_time
		printf '%s\n' "INFO: generating DPDK memory usage"
		$rootdir/scripts/rpc.py env_dpdk_get_mem_stats
	} &
	dpdk_mem_pid=$!
fi
308
# Accumulators: each repetition adds its results; the sums are turned into
# averages after the loop. "bc" is called with the expression as its argument.
# NOTE(review): that calling convention implies a wrapper from the sourced
# helpers, since stock bc treats arguments as file names - confirm.
iops_disks=0
bw=0
min_lat_disks_usec=0
max_lat_disks_usec=0
mean_lat_disks_usec=0
p90_lat_disks_usec=0
p99_lat_disks_usec=0
p99_99_lat_disks_usec=0
stdev_disks_usec=0
mean_slat_disks_usec=0
mean_clat_disks_usec=0
write_log_opt=""
#Run each workload $REPEAT_NO times
for ((j = 0; j < REPEAT_NO; j++)); do
	# Suffix shared by the artifacts saved for this repetition. The disk-count
	# field uses DISKNO; the previous "$k" was never assigned in this script,
	# which left that field empty in every file name.
	run_suffix=${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${DISKNO}_disks_${j}

	if [[ $PLUGIN == "spdk-perf-bdev" || $PLUGIN =~ "xnvme-bdev" ]]; then
		run_bdevperf > "$TMP_RESULT_FILE"
		read -r iops bandwidth <<< $(get_bdevperf_results)
		iops_disks=$(bc "$iops_disks + $iops")
		bw=$(bc "$bw + $bandwidth")
		cp "$TMP_RESULT_FILE" "$result_dir/perf_results_${run_suffix}.output"
		if [[ -f $TMP_BPF_FILE ]]; then
			mv "$TMP_BPF_FILE" "$result_dir/bpftraces_${run_suffix}.txt"
		fi
	elif [[ $PLUGIN == "spdk-perf-nvme" ]]; then
		run_nvmeperf $DISKNO > "$TMP_RESULT_FILE"
		read -r iops bandwidth mean_lat min_lat max_lat <<< $(get_nvmeperf_results)

		iops_disks=$(bc "$iops_disks+$iops")
		bw=$(bc "$bw+$bandwidth")
		mean_lat_disks_usec=$(bc "$mean_lat_disks_usec + $mean_lat")
		min_lat_disks_usec=$(bc "$min_lat_disks_usec + $min_lat")
		max_lat_disks_usec=$(bc "$max_lat_disks_usec + $max_lat")

		cp "$TMP_RESULT_FILE" "$result_dir/perf_results_${run_suffix}.output"
	else
		# fio-based drivers (SPDK fio plugins and kernel engines).
		create_fio_config $DISKNO $PLUGIN "$DISK_NAMES" "$DISKS_NUMA" "$CORES"

		if $LATENCY_LOG; then
			write_log_opt="--write_lat_log=$result_dir/perf_lat_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}_${DISKNO}disks_${j}"
		fi

		# $write_log_opt and $fio_ioengine_opt are left unquoted on purpose:
		# they may be empty or hold several options and must split into words.
		if [[ "$PLUGIN" =~ "spdk-plugin" ]]; then
			run_spdk_nvme_fio $PLUGIN "--output=$TMP_RESULT_FILE" $write_log_opt
		else
			run_nvme_fio $fio_ioengine_opt "--output=$TMP_RESULT_FILE" $write_log_opt
		fi

		#Store values for every number of used disks
		#Use recalculated value for mixread param in case rw mode is not rw.
		rwmixread=$MIX
		if [[ $RW = *"read"* ]]; then
			rwmixread=100
		elif [[ $RW = *"write"* ]]; then
			rwmixread=0
		fi

		read -r iops bandwidth mean_lat_usec p90_lat_usec p99_lat_usec p99_99_lat_usec \
			stdev_usec mean_slat_usec mean_clat_usec <<< $(get_results $rwmixread)
		iops_disks=$(bc "$iops_disks + $iops")
		mean_lat_disks_usec=$(bc "$mean_lat_disks_usec + $mean_lat_usec")
		p90_lat_disks_usec=$(bc "$p90_lat_disks_usec + $p90_lat_usec")
		p99_lat_disks_usec=$(bc "$p99_lat_disks_usec + $p99_lat_usec")
		p99_99_lat_disks_usec=$(bc "$p99_99_lat_disks_usec + $p99_99_lat_usec")
		stdev_disks_usec=$(bc "$stdev_disks_usec + $stdev_usec")
		mean_slat_disks_usec=$(bc "$mean_slat_disks_usec + $mean_slat_usec")
		mean_clat_disks_usec=$(bc "$mean_clat_disks_usec + $mean_clat_usec")
		bw=$(bc "$bw + $bandwidth")

		cp "$TMP_RESULT_FILE" "$result_dir/perf_results_${run_suffix}.json"
		cp "$testdir/config.fio" "$result_dir/config_${run_suffix}.fio"
		rm -f "$testdir/config.fio"
	fi
done
379
# Name suffix shared by the optional measurement reports.
report_suffix=${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}.txt

if $PERFTOP; then
	printf '%s\n' "INFO: Stopping perftop measurements."
	kill $perf_pid
	# perf was just signalled, so its exit status is deliberately ignored.
	wait $perf_pid || :
	perf report -i "$testdir/perf.data" > $result_dir/perftop_${report_suffix}
	rm -f "$testdir/perf.data"
fi

if $DPDKMEM; then
	mv "/tmp/spdk_mem_dump.txt" $result_dir/spdk_mem_dump_${report_suffix}
	printf '%s\n' "INFO: DPDK memory usage saved in $result_dir"
fi
392
#Write results to csv file
# Accumulated sums become per-repetition averages. IOPS and bandwidth are
# always averaged; which latency figures are averaged depends on the driver.
averaged_metrics=(iops_disks bw)
if [[ ($PLUGIN == *plugin* || $PLUGIN == *kernel*) && $PLUGIN != *xnvme-bdev* ]]; then
	averaged_metrics+=(
		mean_lat_disks_usec
		p90_lat_disks_usec
		p99_lat_disks_usec
		p99_99_lat_disks_usec
		stdev_disks_usec
		mean_slat_disks_usec
		mean_clat_disks_usec
	)
elif [[ $PLUGIN == "spdk-perf-nvme" ]]; then
	mean_lat_disks_usec=$(bc "$mean_lat_disks_usec/$REPEAT_NO")
fi

for metric in "${averaged_metrics[@]}"; do
	# ${!metric} reads the variable whose name is stored in $metric.
	metric_avg=$(bc "${!metric} / $REPEAT_NO")
	printf -v "$metric" '%s' "$metric_avg"
done

# One row, in the order of the header line already in the result file.
csv_fields=(${DISKNO} ${iops_disks} ${mean_lat_disks_usec} ${p90_lat_disks_usec} ${p99_lat_disks_usec}
	${p99_99_lat_disks_usec} ${stdev_disks_usec} ${mean_slat_disks_usec} ${mean_clat_disks_usec} ${bw})
printf "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n" "${csv_fields[@]}" >> $result_file
410
# Put the original CPU governor back when a fixed frequency was used.
if [[ $CPUFREQ ]]; then
	cpupower frequency-set -g $cpu_governor
fi

case "$PLUGIN" in
	kernel-io-uring | *xnvme*)
		# Reload the nvme driver so that other test runs are not affected
		modprobe -rv nvme
		modprobe nvme
		wait_for_nvme_reload $DISK_NAMES

		# Write back the queue settings saved before the test.
		for disk in $DISK_NAMES; do
			echo "INFO: Restoring device parameters for $disk"
			sysfs=/sys/block/$disk/queue
			for param in iostats rq_affinity nomerges io_poll_delay; do
				cat $backup_dir/$disk/$param > $sysfs/$param
			done
		done
		;;
esac

# Remove the generated configuration files.
rm -f $testdir/bdev.conf $testdir/config.fio
431