xref: /spdk/test/nvme/perf/run_perf.sh (revision cc6920a4763d4b9a43aa40583c8397d8f14fa100)
#!/usr/bin/env bash
# Run an NVMe performance test with one of several drivers (SPDK perf tools,
# SPDK fio plugins or kernel fio engines) and store the results as CSV.
set -e

# Dir variables and sourcing common files.
# All path expansions are quoted so the script also works when the checkout
# lives in a directory whose name contains whitespace.
testdir=$(readlink -f "$(dirname "$0")")
rootdir=$(readlink -f "$testdir/../../..")
# NOTE(review): the three *_dir variables below are not referenced in this
# script itself; presumably they are consumed by the sourced helper files.
plugin_dir=$rootdir/build/fio
bdevperf_dir=$rootdir/test/bdev/bdevperf
nvmeperf_dir=$rootdir/build/examples
source "$testdir/common.sh"
source "$rootdir/scripts/common.sh" || exit 1
source "$rootdir/test/common/autotest_common.sh"
13
# Global & default variables

# fio engine options for each kernel-based --driver choice
declare -A KERNEL_ENGINES=(
	["kernel-libaio"]="--ioengine=libaio"
	["kernel-io-uring"]="--ioengine=io_uring"
	["kernel-classic-polling"]="--ioengine=pvsync2 --hipri=100"
	["kernel-hybrid-polling"]="--ioengine=pvsync2 --hipri=100"
)

# Workload parameters (overridable from the command line)
RW=randrw
MIX=100
IODEPTH=256
BLK_SIZE=4096
RUNTIME=600
RAMP_TIME=30
NUMJOBS=1
REPEAT_NO=3
GTOD_REDUCE=false
SAMPLING_INT=0
LATENCY_LOG=false
IO_BATCH_SUBMIT=0
IO_BATCH_COMPLETE=0
FIO_BIN=$CONFIG_FIO_SOURCE_DIR/fio
FIO_FNAME_STRATEGY="group"

# Test setup parameters
PLUGIN="nvme"
DISKCFG=
BDEV_CACHE=
BDEV_POOL=
DISKNO="ALL"
CPUS_ALLOWED=1
NOIOSCALING=false
PRECONDITIONING=true
CPUFREQ=
MAIN_CORE=

# Optional measurements
PERFTOP=false
DPDKMEM=false
declare -a BPFTRACES=()

# Scratch files and the timestamp used in result names
TMP_RESULT_FILE=$testdir/result.json
TMP_BPF_FILE=$testdir/bpftraces.txt
DATE=$(date '+%m_%d_%Y_%H%M%S')
53
# Print the command line help of this script to stdout.
# Arguments:
#   $1 - path of the script; only its basename is shown in the "Usage:" line
#   $2 - optional message, printed followed by an empty line before the help
# Globals (read): the option variables whose current values are shown as
#   "[default=...]" in the help text (RW, MIX, IODEPTH, ...).
# The xtrace (set -x) state that was active on entry is restored on return.
function usage() {
	# Keep the help readable: silence xtrace while printing, but remember
	# whether it was enabled so it is only turned back on in that case.
	local xtrace_was_on=false
	if [[ $- == *x* ]]; then
		xtrace_was_on=true
	fi
	set +x
	if [[ -n $2 ]]; then
		# printf instead of echo: the message is arbitrary text
		printf '%s\n\n' "$2"
	fi
	echo "Run NVMe PMD/BDEV performance test. Change options for easier debug and setup configuration"
	echo "Usage: $(basename "$1") [options]"
	echo "-h, --help                Print help and exit"
	echo
	echo "Workload parameters:"
	echo "    --rw=STR                 Type of I/O pattern. Accepted values are randrw,rw. [default=$RW]"
	echo "    --rwmixread=INT          Percentage of a mixed workload that should be reads. [default=$MIX]"
	echo "    --iodepth=INT            Number of I/Os to keep in flight against the file. [default=$IODEPTH]"
	echo "    --block-size=INT         The  block  size  in  bytes  used for I/O units. [default=$BLK_SIZE]"
	echo "    --run-time=TIME[s]       Tell fio to run the workload for the specified period of time. [default=$RUNTIME]"
	echo "    --ramp-time=TIME[s]      Fio will run the specified workload for this amount of time before"
	echo "                             logging any performance numbers. [default=$RAMP_TIME]. Applicable only for fio-based tests."
	echo "    --numjobs=INT            Create the specified number of clones of this job. [default=$NUMJOBS]"
	echo "                             Applicable only for fio-based tests."
	echo "    --repeat-no=INT          How many times to repeat workload test. [default=$REPEAT_NO]"
	echo "                             Test result will be an average of repeated test runs."
	echo "    --gtod-reduce            Enable fio gtod_reduce option. [default=$GTOD_REDUCE]"
	echo "    --sampling-int=INT       Value for fio log_avg_msec parameters [default=$SAMPLING_INT]"
	echo "    --latency-log            Write latency log file using write_lat_log fio option [default=$LATENCY_LOG]"
	echo "    --io-batch-submit=INT    Value for iodepth_batch_submit fio option [default=$IO_BATCH_SUBMIT]"
	echo "    --io-batch-complete=INT  Value for iodepth_batch_complete fio option [default=$IO_BATCH_COMPLETE]"
	echo "    --fio-bin=PATH           Path to fio binary. [default=$FIO_BIN]"
	echo "                             Applicable only for fio-based tests."
	echo "    --fio-fname-strategy=STR Use 'group' to group filenames under job section with common CPU or"
	echo "                             use 'split' to create a separate fio job section for each filename [default=$FIO_FNAME_STRATEGY]"
	echo
	echo "Test setup parameters:"
	echo "    --driver=STR            Selects tool used for testing. Choices available:"
	echo "                               - spdk-perf-nvme (SPDK nvme perf)"
	echo "                               - spdk-perf-bdev (SPDK bdev perf)"
	echo "                               - spdk-plugin-nvme (SPDK nvme fio plugin)"
	echo "                               - spdk-plugin-bdev (SPDK bdev fio plugin)"
	echo "                               - kernel-classic-polling"
	echo "                               - kernel-hybrid-polling"
	echo "                               - kernel-libaio"
	echo "                               - kernel-io-uring"
	echo "    --disk-config           Configuration file containing PCI BDF addresses of NVMe disks to use in test."
	echo "                            It consists of a single column of PCI addresses. SPDK Bdev names will be assigned"
	echo "                            and Kernel block device names detected."
	echo "                            Lines starting with # are ignored as comments."
	echo "    --bdev-io-cache-size    Set IO cache size for SPDK bdev subsystem."
	echo "    --bdev-io-pool-size     Set IO pool size for SPDK bdev subsystem."
	echo "    --max-disk=INT,ALL      Number of disks to test on, this will run multiple workloads with increasing number of disk each run."
	echo "                            If =ALL then test on all found disk. [default=$DISKNO]"
	echo "    --cpu-allowed=INT/PATH  Comma-separated list of CPU cores used to run the workload. Ranges allowed."
	echo "                            Can also point to a file containing list of CPUs. [default=$CPUS_ALLOWED]"
	echo "    --no-preconditioning    Skip preconditioning"
	echo "    --no-io-scaling         Do not scale iodepth for each device in SPDK fio plugin. [default=$NOIOSCALING]"
	echo "    --cpu-frequency=INT     Run tests with CPUs set to a desired frequency. 'intel_pstate=disable' must be set in"
	echo "                            GRUB options. You can use 'cpupower frequency-info' and 'cpupower frequency-set' to"
	echo "                            check list of available frequencies. Example: --cpu-frequency=1100000."
	echo "    --main-core             main (primary) core for DPDK (for bdevperf only)."
	echo
	echo "Other options:"
	echo "    --perftop           Run perftop measurements on the same CPU cores as specified in --cpu-allowed option."
	echo "    --dpdk-mem-stats    Dump DPDK memory stats during the test."
	echo "    --bpf-traces=LIST       Comma delimited list of .bt scripts for enabling BPF traces."
	echo "                            List of .bt scripts available in spdk/scripts/bpf."
	echo "                            Only for spdk-perf-bdev"
	if $xtrace_was_on; then
		set -x
	fi
}
121
# Parse the command line and store the results in the global option variables.
# Arguments: the script's command line ("$@").
# Long options are handled through getopts by declaring "-" as a short option
# that takes an argument: "--name=value" then arrives as optchar "-" with
# OPTARG "name=value", and "${OPTARG#*=}" extracts the value.
# -h/--help print the help and exit 0; any unknown option prints the help
# preceded by an error message and exits 1.
function parse_perf_args() {
	OPTIND=1
	while getopts 'h-:' optchar; do
		case "$optchar" in
			-)
				case "$OPTARG" in
					help)
						usage "$0"
						exit 0
						;;
					rw=*) RW="${OPTARG#*=}" ;;
					rwmixread=*) MIX="${OPTARG#*=}" ;;
					iodepth=*) IODEPTH="${OPTARG#*=}" ;;
					block-size=*) BLK_SIZE="${OPTARG#*=}" ;;
					run-time=*) RUNTIME="${OPTARG#*=}" ;;
					ramp-time=*) RAMP_TIME="${OPTARG#*=}" ;;
					numjobs=*) NUMJOBS="${OPTARG#*=}" ;;
					repeat-no=*) REPEAT_NO="${OPTARG#*=}" ;;
					gtod-reduce) GTOD_REDUCE=true ;;
					sampling-int=*) SAMPLING_INT="${OPTARG#*=}" ;;
					io-batch-submit=*) IO_BATCH_SUBMIT="${OPTARG#*=}" ;;
					io-batch-complete=*) IO_BATCH_COMPLETE="${OPTARG#*=}" ;;
					fio-bin=*) FIO_BIN="${OPTARG#*=}" ;;
					fio-fname-strategy=*)
						FIO_FNAME_STRATEGY="${OPTARG#*=}"
						# The 'split' strategy also turns off iodepth scaling
						if [[ "$FIO_FNAME_STRATEGY" == "split" ]]; then
							NOIOSCALING=true
						fi
						;;
					driver=*) PLUGIN="${OPTARG#*=}" ;;
					disk-config=*)
						DISKCFG="${OPTARG#*=}"
						if [[ ! -f "$DISKCFG" ]]; then
							echo "Disk configuration file $DISKCFG does not exist!" >&2
							exit 1
						fi
						;;
					bdev-io-cache-size=*) BDEV_CACHE="${OPTARG#*=}" ;;
					bdev-io-pool-size=*) BDEV_POOL="${OPTARG#*=}" ;;
					max-disk=*) DISKNO="${OPTARG#*=}" ;;
					cpu-allowed=*)
						CPUS_ALLOWED="${OPTARG#*=}"
						# The value may name a file that holds the CPU list
						if [[ -f "$CPUS_ALLOWED" ]]; then
							CPUS_ALLOWED=$(< "$CPUS_ALLOWED")
						fi
						;;
					no-preconditioning) PRECONDITIONING=false ;;
					no-io-scaling) NOIOSCALING=true ;;
					cpu-frequency=*) CPUFREQ="${OPTARG#*=}" ;;
					perftop) PERFTOP=true ;;
					dpdk-mem-stats) DPDKMEM=true ;;
					bpf-traces=*) IFS="," read -r -a BPFTRACES <<< "${OPTARG#*=}" ;;
					latency-log) LATENCY_LOG=true ;;
					main-core=*) MAIN_CORE="${OPTARG#*=}" ;;
					*)
						# Pass the message as usage's second argument (a stray
						# "echo" word here would be printed instead of it).
						usage "$0" "Invalid argument '$OPTARG'"
						exit 1
						;;
				esac
				;;
			h)
				usage "$0"
				exit 0
				;;
			*)
				usage "$0" "Invalid argument '$optchar'"
				exit 1
				;;
		esac
	done
}

parse_perf_args "$@"
190
# Results of this run go into a directory (and a CSV file of the same name)
# derived from the workload parameters, the driver and the start timestamp.
result_dir=$testdir/results/perf_results_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}
result_file=$result_dir/perf_results_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}.csv
mkdir -p "$result_dir"
unset iops_disks bw mean_lat_disks_usec p90_lat_disks_usec p99_lat_disks_usec p99_99_lat_disks_usec stdev_disks_usec

# On an error, SIGTERM or SIGABRT: remove state/config files, stop the
# background helpers and print a backtrace.
trap 'rm -f *.state $testdir/bdev.conf; kill $perf_pid; wait $dpdk_mem_pid; print_backtrace' ERR SIGTERM SIGABRT

if [[ "$PLUGIN" =~ "bdev" ]]; then
	create_spdk_bdev_conf "$BDEV_CACHE" "$BDEV_POOL"
	echo "INFO: Generated bdev.conf file:"
	cat "$testdir/bdev.conf"
fi
verify_disk_number
DISK_NAMES=$(get_disks $PLUGIN)
DISKS_NUMA=$(get_numa_node $PLUGIN "$DISK_NAMES")
CORES=$(get_cores "$CPUS_ALLOWED")
# Word-split the core list on purpose to count the cores
NO_CORES_ARRAY=($CORES)
NO_CORES=${#NO_CORES_ARRAY[@]}

# Write the CSV preamble only now that NO_CORES is known: written earlier, the
# num-cpu-cores field was empty and the following columns shifted left.
# Arguments are quoted so an empty value still occupies its own column.
echo "run-time,ramp-time,fio-plugin,QD,block-size,num-cpu-cores,workload,workload-mix" > "$result_file"
printf "%s,%s,%s,%s,%s,%s,%s,%s\n" "$RUNTIME" "$RAMP_TIME" "$PLUGIN" "$IODEPTH" "$BLK_SIZE" "$NO_CORES" "$RW" "$MIX" >> "$result_file"
echo "num_of_disks,iops,avg_lat[usec],p90[usec],p99[usec],p99.99[usec],stdev[usec],avg_slat[usec],avg_clat[usec],bw[Kib/s]" >> "$result_file"
212
# Precondition the disks unless --no-preconditioning was given
if $PRECONDITIONING; then
	preconditioning
fi

# Kernel drivers: run "setup.sh reset", pick the fio engine options for the
# selected driver and tune the block layer of every disk under test.
if [[ $PLUGIN == *kernel* ]]; then
	$rootdir/scripts/setup.sh reset
	fio_ioengine_opt="${KERNEL_ENGINES[$PLUGIN]}"

	case "$PLUGIN" in
		kernel-classic-polling)
			for disk in $DISK_NAMES; do
				echo -1 > /sys/block/$disk/queue/io_poll_delay
			done
			;;
		kernel-hybrid-polling)
			for disk in $DISK_NAMES; do
				echo 0 > /sys/block/$disk/queue/io_poll_delay
			done
			;;
		kernel-io-uring)
			# Reload the nvme module with poll queues enabled
			modprobe -rv nvme
			modprobe nvme poll_queues=8
			wait_for_nvme_reload $DISK_NAMES

			backup_dir="/tmp/nvme_param_bak"
			mkdir -p $backup_dir

			# First save the current queue parameters of every disk ...
			for disk in $DISK_NAMES; do
				echo "INFO: Backing up device parameters for $disk"
				sysfs=/sys/block/$disk/queue
				mkdir -p $backup_dir/$disk
				for param in iostats rq_affinity nomerges io_poll_delay; do
					cat $sysfs/$param > $backup_dir/$disk/$param
				done
			done

			# ... then overwrite them with the values used for the test
			for disk in $DISK_NAMES; do
				echo "INFO: Setting device parameters for $disk"
				sysfs=/sys/block/$disk/queue
				echo 0 > $sysfs/iostats
				echo 0 > $sysfs/rq_affinity
				echo 2 > $sysfs/nomerges
				echo -1 > $sysfs/io_poll_delay
			done
			;;
	esac
fi
257
# Remember the governor of cpu0 as it was before the test changes it
cpu_governor=$(< /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor)

if [[ -z "$CPUFREQ" ]]; then
	# No fixed frequency requested: use the performance governor
	cpupower frequency-set -g performance
elif [[ $(< /proc/cmdline) == *"intel_pstate=disable"* ]]; then
	# Fixed frequency: switch to the userspace governor, then set it
	cpupower frequency-set -g userspace
	cpupower frequency-set -f $CPUFREQ
else
	echo "ERROR: Cannot set custom CPU frequency for test. intel_pstate=disable not in boot options."
	# Fail on purpose so that errexit and the ERR trap take over
	false
fi
current_governor=$(< /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor)
echo "INFO: Using $current_governor cpu governor for test."
273
# --perftop: record perf data on the workload cores in the background
if $PERFTOP; then
	echo "INFO: starting perf record on cores $CPUS_ALLOWED"
	perf record -C $CPUS_ALLOWED -o "$testdir/perf.data" &
	perf_pid=$!
fi

# --dpdk-mem-stats: request the DPDK memory stats over RPC once half of the
# run time has elapsed; for drivers without "perf" in their name the ramp
# time is added to that delay.
if $DPDKMEM; then
	echo "INFO: waiting to generate DPDK memory usage"
	wait_time=$((RUNTIME / 2))
	[[ $PLUGIN == *perf* ]] || wait_time=$((wait_time + RAMP_TIME))
	{
		sleep $wait_time
		echo "INFO: generating DPDK memory usage"
		$rootdir/scripts/rpc.py env_dpdk_get_mem_stats
	} &
	dpdk_mem_pid=$!
fi
293
# Accumulators: each one holds the sum of a metric over all repetitions; the
# sums are divided by REPEAT_NO after the loop.
iops_disks=0
bw=0
min_lat_disks_usec=0
max_lat_disks_usec=0
mean_lat_disks_usec=0
p90_lat_disks_usec=0
p99_lat_disks_usec=0
p99_99_lat_disks_usec=0
stdev_disks_usec=0
mean_slat_disks_usec=0
mean_clat_disks_usec=0
# Run each workload $REPEAT_NO times
for ((j = 0; j < REPEAT_NO; j++)); do
	# Name suffix shared by the artifacts of this repetition. The disk count
	# comes from DISKNO; the previously used ${k} is not set anywhere in this
	# script, which left an empty field in every file name.
	run_tag=${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${DISKNO}_disks_${j}

	if [[ $PLUGIN == "spdk-perf-bdev" ]]; then
		run_bdevperf > "$TMP_RESULT_FILE"
		read -r iops bandwidth <<< $(get_bdevperf_results)
		iops_disks=$(bc "$iops_disks + $iops")
		bw=$(bc "$bw + $bandwidth")
		cp "$TMP_RESULT_FILE" "$result_dir/perf_results_${run_tag}.output"
		# Keep the BPF trace output if one was produced
		if [[ -f $TMP_BPF_FILE ]]; then
			mv "$TMP_BPF_FILE" "$result_dir/bpftraces_${run_tag}.txt"
		fi
	elif [[ $PLUGIN == "spdk-perf-nvme" ]]; then
		run_nvmeperf $DISKNO > "$TMP_RESULT_FILE"
		read -r iops bandwidth mean_lat min_lat max_lat <<< $(get_nvmeperf_results)

		iops_disks=$(bc "$iops_disks+$iops")
		bw=$(bc "$bw+$bandwidth")
		mean_lat_disks_usec=$(bc "$mean_lat_disks_usec + $mean_lat")
		min_lat_disks_usec=$(bc "$min_lat_disks_usec + $min_lat")
		max_lat_disks_usec=$(bc "$max_lat_disks_usec + $max_lat")

		cp "$TMP_RESULT_FILE" "$result_dir/perf_results_${run_tag}.output"
	else
		# fio-based drivers (SPDK fio plugins and kernel engines)
		create_fio_config $DISKNO $PLUGIN "$DISK_NAMES" "$DISKS_NUMA" "$CORES"

		if $LATENCY_LOG; then
			write_log_opt="--write_lat_log=$result_dir/perf_lat_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}_${DISKNO}disks_${j}"
		fi

		# $write_log_opt and $fio_ioengine_opt stay unquoted on purpose: the
		# former may be empty (no argument), the latter may hold several words.
		if [[ "$PLUGIN" =~ "spdk-plugin" ]]; then
			run_spdk_nvme_fio $PLUGIN "--output=$TMP_RESULT_FILE" $write_log_opt
		else
			run_nvme_fio $fio_ioengine_opt "--output=$TMP_RESULT_FILE" $write_log_opt
		fi

		#Store values for every number of used disks
		#Use recalculated value for mixread param in case rw mode is not rw.
		rwmixread=$MIX
		if [[ $RW = *"read"* ]]; then
			rwmixread=100
		elif [[ $RW = *"write"* ]]; then
			rwmixread=0
		fi

		read -r iops bandwidth mean_lat_usec p90_lat_usec p99_lat_usec p99_99_lat_usec \
			stdev_usec mean_slat_usec mean_clat_usec <<< $(get_results $rwmixread)
		iops_disks=$(bc "$iops_disks + $iops")
		mean_lat_disks_usec=$(bc "$mean_lat_disks_usec + $mean_lat_usec")
		p90_lat_disks_usec=$(bc "$p90_lat_disks_usec + $p90_lat_usec")
		p99_lat_disks_usec=$(bc "$p99_lat_disks_usec + $p99_lat_usec")
		p99_99_lat_disks_usec=$(bc "$p99_99_lat_disks_usec + $p99_99_lat_usec")
		stdev_disks_usec=$(bc "$stdev_disks_usec + $stdev_usec")
		mean_slat_disks_usec=$(bc "$mean_slat_disks_usec + $mean_slat_usec")
		mean_clat_disks_usec=$(bc "$mean_clat_disks_usec + $mean_clat_usec")
		bw=$(bc "$bw + $bandwidth")

		cp "$TMP_RESULT_FILE" "$result_dir/perf_results_${run_tag}.json"
		cp "$testdir/config.fio" "$result_dir/config_${run_tag}.fio"
		rm -f "$testdir/config.fio"
	fi
done
364
# Suffix shared by the measurement files written below
report_tag=${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}

# Stop the background perf recording and turn its data into a text report
if $PERFTOP; then
	echo "INFO: Stopping perftop measurements."
	kill $perf_pid
	# perf was just killed, so a non-zero wait status is expected
	wait $perf_pid || :
	perf report -i "$testdir/perf.data" > $result_dir/perftop_${report_tag}.txt
	rm -f "$testdir/perf.data"
fi

# Move the DPDK memory dump into the result directory
if $DPDKMEM; then
	mv "/tmp/spdk_mem_dump.txt" $result_dir/spdk_mem_dump_${report_tag}.txt
	echo "INFO: DPDK memory usage saved in $result_dir"
fi
377
# Average the accumulated sums over the repetitions and append the CSV row.
# IOPS and bandwidth are averaged for every driver; which latency metrics
# are averaged depends on the driver.
iops_disks=$(bc "$iops_disks / $REPEAT_NO")
bw=$(bc "$bw / $REPEAT_NO")
case "$PLUGIN" in
	*plugin* | *kernel*)
		mean_lat_disks_usec=$(bc "$mean_lat_disks_usec / $REPEAT_NO")
		p90_lat_disks_usec=$(bc "$p90_lat_disks_usec / $REPEAT_NO")
		p99_lat_disks_usec=$(bc "$p99_lat_disks_usec / $REPEAT_NO")
		p99_99_lat_disks_usec=$(bc "$p99_99_lat_disks_usec / $REPEAT_NO")
		stdev_disks_usec=$(bc "$stdev_disks_usec / $REPEAT_NO")
		mean_slat_disks_usec=$(bc "$mean_slat_disks_usec / $REPEAT_NO")
		mean_clat_disks_usec=$(bc "$mean_clat_disks_usec / $REPEAT_NO")
		;;
	spdk-perf-nvme)
		mean_lat_disks_usec=$(bc "$mean_lat_disks_usec/$REPEAT_NO")
		;;
esac

# Column order matches the "num_of_disks,iops,..." header of the result file
printf "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n" \
	${DISKNO} ${iops_disks} \
	${mean_lat_disks_usec} ${p90_lat_disks_usec} ${p99_lat_disks_usec} ${p99_99_lat_disks_usec} \
	${stdev_disks_usec} ${mean_slat_disks_usec} ${mean_clat_disks_usec} \
	${bw} >> $result_file
395
# Put back the governor that was saved before the test.
# NOTE(review): this only happens when --cpu-frequency was used, although the
# governor is also switched (to "performance") when it was not - confirm that
# leaving it changed in that case is intended.
if [[ -n "$CPUFREQ" ]]; then
	cpupower frequency-set -g $cpu_governor
fi

if [[ $PLUGIN == "kernel-io-uring" ]]; then
	# Reload the nvme driver so that other test runs are not affected
	modprobe -rv nvme
	modprobe nvme
	wait_for_nvme_reload $DISK_NAMES

	# Write the backed-up queue parameters back to every disk
	for disk in $DISK_NAMES; do
		echo "INFO: Restoring device parameters for $disk"
		sysfs=/sys/block/$disk/queue
		for param in iostats rq_affinity nomerges io_poll_delay; do
			cat $backup_dir/$disk/$param > $sysfs/$param
		done
	done
fi
# Remove the generated configuration files
rm -f $testdir/bdev.conf $testdir/config.fio
416