xref: /spdk/test/nvme/perf/run_perf.sh (revision 2172c432cfdaecc5a279d64e37c6b51e794683c1)
1#!/usr/bin/env bash
2set -e
3
# Dir variables and sourcing common files.
# testdir is the directory holding this script; rootdir is three levels up.
# Every path expansion is quoted so the script keeps working when the checkout
# path contains whitespace or glob characters.
testdir=$(readlink -f "$(dirname "$0")")
rootdir=$(readlink -f "$testdir/../../..")
# The three *_dir variables are not referenced again in this file; presumably
# the helpers sourced below read them - verify before removing.
plugin_dir=$rootdir/build/fio
bdevperf_dir=$rootdir/test/bdev/bdevperf
nvmeperf_dir=$rootdir/build/examples
source "$testdir/common.sh"
source "$rootdir/scripts/common.sh" || exit 1
source "$rootdir/test/common/autotest_common.sh"
13
# Global & default variables

# fio ioengine options for every kernel driver mode, keyed by --driver value.
# Classic and hybrid polling share the same fio options here.
declare -A KERNEL_ENGINES=(
	["kernel-libaio"]="--ioengine=libaio"
	["kernel-classic-polling"]="--ioengine=pvsync2 --hipri=100"
	["kernel-hybrid-polling"]="--ioengine=pvsync2 --hipri=100"
	["kernel-io-uring"]="--ioengine=io_uring"
)

# Workload parameters (see the "Workload parameters" section of the help text).
RW="randrw"            # --rw
MIX="100"              # --rwmixread, percentage of reads
IODEPTH="256"          # --iodepth
BLK_SIZE="4096"        # --block-size, bytes
RUNTIME="600"          # --run-time, seconds
RAMP_TIME="30"         # --ramp-time, seconds
NUMJOBS="1"            # --numjobs
REPEAT_NO="3"          # --repeat-no, results are averaged over the runs
GTOD_REDUCE="false"    # --gtod-reduce
SAMPLING_INT="0"       # --sampling-int, fio log_avg_msec
IO_BATCH_SUBMIT="0"    # --io-batch-submit, fio iodepth_batch_submit
IO_BATCH_COMPLETE="0"  # --io-batch-complete, fio iodepth_batch_complete
FIO_BIN="$CONFIG_FIO_SOURCE_DIR/fio"
TMP_RESULT_FILE="$testdir/result.json"

# Test setup parameters.
PLUGIN="nvme"          # --driver
DISKCFG=""             # --disk-config
BDEV_CACHE=""          # --bdev-io-cache-size
BDEV_POOL=""           # --bdev-io-pool-size
DISKNO="ALL"           # --max-disk
CPUS_ALLOWED="1"       # --cpu-allowed
NOIOSCALING="false"    # --no-io-scaling
PRECONDITIONING="true" # cleared by --no-preconditioning
CPUFREQ=""             # --cpu-frequency

# Other options.
PERFTOP="false"        # --perftop
DPDKMEM="false"        # --dpdk-mem-stats

# Timestamp embedded in the names of all result files of this run.
DATE=$(date '+%m_%d_%Y_%H%M%S')
48
# Print the help text, optionally preceded by an error message.
#   $1 - path of the running script; only its basename is shown
#   $2 - optional message, printed together with a blank line before the help
# Everything goes to stdout. The defaults shown are the current values of the
# corresponding global variables.
function usage() {
	# Keep the help text free of xtrace noise, but remember whether tracing was
	# active so it is restored afterwards rather than switched on unconditionally.
	local xtrace_was_on=false
	if [[ $- == *x* ]]; then
		xtrace_was_on=true
	fi
	set +x

	if [[ -n $2 ]]; then
		echo "$2"
		echo ""
	fi
	echo "Run NVMe PMD/BDEV performance test. Change options for easier debug and setup configuration"
	# $1 is quoted: unquoted, a path containing a space would be split and the
	# second half treated by basename as a suffix to strip.
	echo "Usage: $(basename "$1") [options]"
	echo "-h, --help                Print help and exit"
	echo
	echo "Workload parameters:"
	echo "    --rw=STR                 Type of I/O pattern. Accepted values are randrw,rw. [default=$RW]"
	echo "    --rwmixread=INT          Percentage of a mixed workload that should be reads. [default=$MIX]"
	echo "    --iodepth=INT            Number of I/Os to keep in flight against the file. [default=$IODEPTH]"
	echo "    --block-size=INT         The  block  size  in  bytes  used for I/O units. [default=$BLK_SIZE]"
	echo "    --run-time=TIME[s]       Tell fio to run the workload for the specified period of time. [default=$RUNTIME]"
	echo "    --ramp-time=TIME[s]      Fio will run the specified workload for this amount of time before"
	echo "                             logging any performance numbers. [default=$RAMP_TIME]. Applicable only for fio-based tests."
	echo "    --numjobs=INT            Create the specified number of clones of this job. [default=$NUMJOBS]"
	echo "                             Applicable only for fio-based tests."
	echo "    --repeat-no=INT          How many times to repeat workload test. [default=$REPEAT_NO]"
	echo "                             Test result will be an average of repeated test runs."
	echo "    --gtod-reduce            Enable fio gtod_reduce option. [default=$GTOD_REDUCE]"
	echo "    --sampling-int=INT       Value for fio log_avg_msec parameters [default=$SAMPLING_INT]"
	echo "    --io-batch-submit=INT    Value for iodepth_batch_submit fio option [default=$IO_BATCH_SUBMIT]"
	echo "    --io-batch-complete=INT  Value for iodepth_batch_complete fio option [default=$IO_BATCH_COMPLETE]"
	echo "    --fio-bin=PATH           Path to fio binary. [default=$FIO_BIN]"
	echo "                             Applicable only for fio-based tests."
	echo
	echo "Test setup parameters:"
	echo "    --driver=STR            Selects tool used for testing. Choices available:"
	echo "                               - spdk-perf-nvme (SPDK nvme perf)"
	echo "                               - spdk-perf-bdev (SPDK bdev perf)"
	echo "                               - spdk-plugin-nvme (SPDK nvme fio plugin)"
	echo "                               - spdk-plugin-bdev (SPDK bdev fio plugin)"
	echo "                               - kernel-classic-polling"
	echo "                               - kernel-hybrid-polling"
	echo "                               - kernel-libaio"
	echo "                               - kernel-io-uring"
	echo "    --disk-config           Configuration file containing PCI BDF addresses of NVMe disks to use in test."
	echo "                            It consists of a single column of PCI addresses. SPDK Bdev names will be assigned"
	echo "                            and Kernel block device names detected."
	echo "                            Lines starting with # are ignored as comments."
	echo "    --bdev-io-cache-size    Set IO cache size for SPDK bdev subsystem."
	echo "    --bdev-io-pool-size     Set IO pool size for SPDK bdev subsystem."
	echo "    --max-disk=INT,ALL      Number of disks to test on, this will run multiple workloads with increasing number of disk each run."
	echo "                            If =ALL then test on all found disk. [default=$DISKNO]"
	echo "    --cpu-allowed=INT/PATH  Comma-separated list of CPU cores used to run the workload. Ranges allowed."
	echo "                            Can also point to a file containing list of CPUs. [default=$CPUS_ALLOWED]"
	echo "    --no-preconditioning    Skip preconditioning"
	echo "    --no-io-scaling         Do not scale iodepth for each device in SPDK fio plugin. [default=$NOIOSCALING]"
	echo "    --cpu-frequency=INT     Run tests with CPUs set to a desired frequency. 'intel_pstate=disable' must be set in"
	echo "                            GRUB options. You can use 'cpupower frequency-info' and 'cpupower frequency-set' to"
	echo "                            check list of available frequencies. Example: --cpu-frequency=1100000."
	echo
	echo "Other options:"
	echo "    --perftop           Run perftop measurements on the same CPU cores as specified in --cpu-allowed option."
	echo "    --dpdk-mem-stats    Dump DPDK memory stats during the test."

	if $xtrace_was_on; then
		set -x
	fi
}
109
# Parse the command line into the global settings.
# Long options rely on the getopts "-:" idiom: "--name=value" is reported as
# option "-" with OPTARG set to "name=value", which the inner case matches.
# The loop lives in a function with its own OPTIND so the parsing is
# self-contained; it is invoked exactly once, right below, with the script's
# arguments.
# Exits 0 after printing help; exits 1 on an unknown option or when the file
# given to --disk-config does not exist.
function parse_run_perf_args() {
	local optchar OPTARG
	local OPTIND=1

	while getopts 'h-:' optchar; do
		case "$optchar" in
			-)
				case "$OPTARG" in
					help)
						usage "$0"
						exit 0
						;;
					rw=*) RW="${OPTARG#*=}" ;;
					rwmixread=*) MIX="${OPTARG#*=}" ;;
					iodepth=*) IODEPTH="${OPTARG#*=}" ;;
					block-size=*) BLK_SIZE="${OPTARG#*=}" ;;
					run-time=*) RUNTIME="${OPTARG#*=}" ;;
					ramp-time=*) RAMP_TIME="${OPTARG#*=}" ;;
					numjobs=*) NUMJOBS="${OPTARG#*=}" ;;
					repeat-no=*) REPEAT_NO="${OPTARG#*=}" ;;
					gtod-reduce) GTOD_REDUCE=true ;;
					sampling-int=*) SAMPLING_INT="${OPTARG#*=}" ;;
					io-batch-submit=*) IO_BATCH_SUBMIT="${OPTARG#*=}" ;;
					io-batch-complete=*) IO_BATCH_COMPLETE="${OPTARG#*=}" ;;
					fio-bin=*) FIO_BIN="${OPTARG#*=}" ;;
					driver=*) PLUGIN="${OPTARG#*=}" ;;
					disk-config=*)
						DISKCFG="${OPTARG#*=}"
						if [[ ! -f "$DISKCFG" ]]; then
							echo "Disk configuration file $DISKCFG does not exist!"
							exit 1
						fi
						;;
					bdev-io-cache-size=*) BDEV_CACHE="${OPTARG#*=}" ;;
					bdev-io-pool-size=*) BDEV_POOL="${OPTARG#*=}" ;;
					max-disk=*) DISKNO="${OPTARG#*=}" ;;
					cpu-allowed=*)
						CPUS_ALLOWED="${OPTARG#*=}"
						# The value may name a file that holds the CPU list.
						if [[ -f "$CPUS_ALLOWED" ]]; then
							CPUS_ALLOWED=$(< "$CPUS_ALLOWED")
						fi
						;;
					no-preconditioning) PRECONDITIONING=false ;;
					no-io-scaling) NOIOSCALING=true ;;
					cpu-frequency=*) CPUFREQ="${OPTARG#*=}" ;;
					perftop) PERFTOP=true ;;
					dpdk-mem-stats) DPDKMEM=true ;;
					*)
						# The message must be the second argument of usage; a
						# stray "echo" word here used to take its place.
						usage "$0" "Invalid argument '$OPTARG'"
						exit 1
						;;
				esac
				;;
			h)
				usage "$0"
				exit 0
				;;
			*)
				usage "$0" "Invalid argument '$optchar'"
				exit 1
				;;
		esac
	done
}

parse_run_perf_args "$@"
169
# Per-run result directory and CSV summary; both names encode the workload
# parameters and the start timestamp.
result_dir=$testdir/results/perf_results_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}
result_file=$result_dir/perf_results_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}.csv
mkdir -p "$result_dir"
unset iops_disks bw mean_lat_disks_usec p90_lat_disks_usec p99_lat_disks_usec p99_99_lat_disks_usec stdev_disks_usec

# Cleanup on error or termination. perf_pid and dpdk_mem_pid are only set when
# --perftop / --dpdk-mem-stats are used, so they are checked first: "kill"
# without a PID fails, and under "set -e" that failure can end the handler
# before print_backtrace is reached. The handler is kept inline (not wrapped in
# a function) so print_backtrace is still called directly from the trap.
trap 'rm -f *.state "$testdir/bdev.conf"
	[[ -z $perf_pid ]] || kill $perf_pid || true
	[[ -z $dpdk_mem_pid ]] || wait $dpdk_mem_pid || true
	print_backtrace' ERR SIGTERM SIGABRT

if [[ "$PLUGIN" =~ "bdev" ]]; then
	create_spdk_bdev_conf "$BDEV_CACHE" "$BDEV_POOL"
	echo "INFO: Generated bdev.conf file:"
	cat "$testdir/bdev.conf"
fi
verify_disk_number
DISK_NAMES=$(get_disks "$PLUGIN")
DISKS_NUMA=$(get_numa_node "$PLUGIN" "$DISK_NAMES")
CORES=$(get_cores "$CPUS_ALLOWED")
# Deliberately unquoted: the core list is split into one array element per core.
NO_CORES_ARRAY=($CORES)
NO_CORES=${#NO_CORES_ARRAY[@]}

# The CSV preamble is written only now that NO_CORES is known. Written earlier,
# the still-empty, unquoted $NO_CORES dropped out of the argument list and
# shifted every following column one position to the left.
echo "run-time,ramp-time,fio-plugin,QD,block-size,num-cpu-cores,workload,workload-mix" > "$result_file"
printf "%s,%s,%s,%s,%s,%s,%s,%s\n" "$RUNTIME" "$RAMP_TIME" "$PLUGIN" "$IODEPTH" "$BLK_SIZE" "$NO_CORES" "$RW" "$MIX" >> "$result_file"
echo "num_of_disks,iops,avg_lat[usec],p90[usec],p99[usec],p99.99[usec],stdev[usec],avg_slat[usec],avg_clat[usec],bw[Kib/s]" >> "$result_file"
191
# Precondition the drives unless --no-preconditioning was given.
if $PRECONDITIONING; then
	preconditioning
fi

# Kernel drivers: reset the setup.sh bindings, pick the fio ioengine options
# for the selected mode and tune the block layer of every tested disk.
if [[ $PLUGIN == *kernel* ]]; then
	$rootdir/scripts/setup.sh reset
	fio_ioengine_opt="${KERNEL_ENGINES[$PLUGIN]}"

	case "$PLUGIN" in
		kernel-classic-polling)
			# io_poll_delay is set to -1 for classic polling
			for disk in $DISK_NAMES; do
				echo -1 > /sys/block/$disk/queue/io_poll_delay
			done
			;;
		kernel-hybrid-polling)
			# io_poll_delay is set to 0 for hybrid polling
			for disk in $DISK_NAMES; do
				echo 0 > /sys/block/$disk/queue/io_poll_delay
			done
			;;
		kernel-io-uring)
			# Reload the nvme module with poll queues enabled
			modprobe -rv nvme
			modprobe nvme poll_queues=8
			wait_for_nvme_reload $DISK_NAMES

			backup_dir="/tmp/nvme_param_bak"
			mkdir -p $backup_dir

			# Save the current queue attributes so they can be restored once
			# the test is done.
			for disk in $DISK_NAMES; do
				echo "INFO: Backing up device parameters for $disk"
				sysfs=/sys/block/$disk/queue
				mkdir -p $backup_dir/$disk
				for sysfs_attr in iostats rq_affinity nomerges io_poll_delay; do
					cat $sysfs/$sysfs_attr > $backup_dir/$disk/$sysfs_attr
				done
			done

			# Apply the values used for the io_uring measurements
			for disk in $DISK_NAMES; do
				echo "INFO: Setting device parameters for $disk"
				sysfs=/sys/block/$disk/queue
				echo 0 > $sysfs/iostats
				echo 0 > $sysfs/rq_affinity
				echo 2 > $sysfs/nomerges
				echo -1 > $sysfs/io_poll_delay
			done
			;;
	esac
fi
236
# Optional fixed CPU frequency. The current governor is remembered in
# cpu_governor before switching to the userspace governor.
if [[ -n "$CPUFREQ" ]]; then
	if [[ "$(< /proc/cmdline)" == *"intel_pstate=disable"* ]]; then
		cpu_governor="$(cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_governor)"
		cpupower frequency-set -g userspace
		cpupower frequency-set -f $CPUFREQ
	else
		echo "ERROR: Cannot set custom CPU frequency for test. intel_pstate=disable not in boot options."
		# Fail on purpose so that errexit and the ERR trap take over
		false
	fi
fi

# Optional perf recording on the tested cores; runs in the background for the
# duration of the workload.
if $PERFTOP; then
	echo "INFO: starting perf record on cores $CPUS_ALLOWED"
	perf record -C $CPUS_ALLOWED -o "$testdir/perf.data" &
	perf_pid=$!
fi

# Optional DPDK memory stats dump, requested over RPC from a background
# subshell halfway through the run time. For drivers whose name does not
# contain "perf" the ramp time is added to the delay.
if $DPDKMEM; then
	echo "INFO: waiting to generate DPDK memory usage"
	if [[ $PLUGIN == *perf* ]]; then
		wait_time=$((RUNTIME / 2))
	else
		wait_time=$((RUNTIME / 2 + RAMP_TIME))
	fi
	(
		sleep $wait_time
		echo "INFO: generating DPDK memory usage"
		$rootdir/scripts/rpc.py env_dpdk_get_mem_stats
	) &
	dpdk_mem_pid=$!
fi
267
# Running totals over all repetitions; they are divided by REPEAT_NO after the
# loop. min/max latency are accumulated for spdk-perf-nvme only and are not
# reported anywhere in this file.
iops_disks=0
bw=0
min_lat_disks_usec=0
max_lat_disks_usec=0
mean_lat_disks_usec=0
p90_lat_disks_usec=0
p99_lat_disks_usec=0
p99_99_lat_disks_usec=0
stdev_disks_usec=0
mean_slat_disks_usec=0
mean_clat_disks_usec=0

# The per-run artifact names carry a disk count that used to come from $k, but
# nothing in this file assigns k, which left that field empty. Fall back to the
# --max-disk value; a k set elsewhere (e.g. by a sourced helper) still wins.
disks_tag=${k:-$DISKNO}

# Run each workload $REPEAT_NO times
for ((j = 0; j < REPEAT_NO; j++)); do
	if [[ "$PLUGIN" == "spdk-perf-bdev" ]]; then
		run_bdevperf > "$TMP_RESULT_FILE"
		read -r iops bandwidth <<< $(get_bdevperf_results)
		iops_disks=$(bc "$iops_disks + $iops")
		bw=$(bc "$bw + $bandwidth")
		cp "$TMP_RESULT_FILE" "$result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${disks_tag}_disks_${j}.output"
	elif [[ "$PLUGIN" == "spdk-perf-nvme" ]]; then
		run_nvmeperf $DISKNO > "$TMP_RESULT_FILE"
		read -r iops bandwidth mean_lat min_lat max_lat <<< $(get_nvmeperf_results)

		iops_disks=$(bc "$iops_disks+$iops")
		bw=$(bc "$bw+$bandwidth")
		mean_lat_disks_usec=$(bc "$mean_lat_disks_usec + $mean_lat")
		min_lat_disks_usec=$(bc "$min_lat_disks_usec + $min_lat")
		max_lat_disks_usec=$(bc "$max_lat_disks_usec + $max_lat")

		cp "$TMP_RESULT_FILE" "$result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${disks_tag}_disks_${j}.output"
	else
		# fio-based drivers: SPDK fio plugins and kernel ioengines
		create_fio_config $DISKNO $PLUGIN "$DISK_NAMES" "$DISKS_NUMA" "$CORES"

		if [[ "$PLUGIN" =~ "spdk-plugin" ]]; then
			run_spdk_nvme_fio $PLUGIN "--output=$TMP_RESULT_FILE" \
				"--write_lat_log=$result_dir/perf_lat_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}_${disks_tag}disks_${j}"
		else
			# $fio_ioengine_opt is deliberately unquoted: it may hold several
			# fio options that must be passed as separate words.
			run_nvme_fio $fio_ioengine_opt "--output=$TMP_RESULT_FILE" \
				"--write_lat_log=$result_dir/perf_lat_${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}_${disks_tag}disks_${j}"
		fi

		# Store values for every number of used disks.
		# Use a recalculated value for the mixread param in case rw mode is
		# not rw: 100 for read-only patterns, 0 for write-only patterns.
		rwmixread=$MIX
		if [[ $RW = *"read"* ]]; then
			rwmixread=100
		elif [[ $RW = *"write"* ]]; then
			rwmixread=0
		fi

		read -r iops bandwidth mean_lat_usec p90_lat_usec p99_lat_usec p99_99_lat_usec \
			stdev_usec mean_slat_usec mean_clat_usec <<< $(get_results $rwmixread)
		iops_disks=$(bc "$iops_disks + $iops")
		mean_lat_disks_usec=$(bc "$mean_lat_disks_usec + $mean_lat_usec")
		p90_lat_disks_usec=$(bc "$p90_lat_disks_usec + $p90_lat_usec")
		p99_lat_disks_usec=$(bc "$p99_lat_disks_usec + $p99_lat_usec")
		p99_99_lat_disks_usec=$(bc "$p99_99_lat_disks_usec + $p99_99_lat_usec")
		stdev_disks_usec=$(bc "$stdev_disks_usec + $stdev_usec")
		mean_slat_disks_usec=$(bc "$mean_slat_disks_usec + $mean_slat_usec")
		mean_clat_disks_usec=$(bc "$mean_clat_disks_usec + $mean_clat_usec")
		bw=$(bc "$bw + $bandwidth")

		# Keep the raw fio output and the generated job file of this repetition
		cp "$TMP_RESULT_FILE" "$result_dir/perf_results_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${disks_tag}_disks_${j}.json"
		cp "$testdir/config.fio" "$result_dir/config_${MIX}_${PLUGIN}_${NO_CORES}cpus_${DATE}_${disks_tag}_disks_${j}.fio"
		rm -f "$testdir/config.fio"
	fi
done
335
# Name suffix shared by the artifacts stored below
artifact_tag=${BLK_SIZE}BS_${IODEPTH}QD_${RW}_${MIX}MIX_${PLUGIN}_${DATE}

# Stop the background perf recording and turn its data into a text report
if $PERFTOP; then
	echo "INFO: Stopping perftop measurements."
	kill $perf_pid
	# A non-zero status from the killed perf process is expected here
	wait $perf_pid || true
	perf report -i "$testdir/perf.data" > $result_dir/perftop_${artifact_tag}.txt
	rm -f "$testdir/perf.data"
fi

# Move the DPDK memory dump next to the other results. The dump file is
# presumably written by the env_dpdk_get_mem_stats RPC - confirm.
if $DPDKMEM; then
	mv "/tmp/spdk_mem_dump.txt" $result_dir/spdk_mem_dump_${artifact_tag}.txt
	echo "INFO: DPDK memory usage saved in $result_dir"
fi
348
# Write results to csv file.
# First turn the sums collected over REPEAT_NO runs into averages. IOPS and
# bandwidth are averaged for every driver.
for metric in iops_disks bw; do
	avg=$(bc "${!metric} / $REPEAT_NO")
	printf -v "$metric" '%s' "$avg"
done

if [[ $PLUGIN == *plugin* || $PLUGIN == *kernel* ]]; then
	# fio-based drivers report the full set of latency statistics
	for metric in mean_lat_disks_usec p90_lat_disks_usec p99_lat_disks_usec \
		p99_99_lat_disks_usec stdev_disks_usec mean_slat_disks_usec mean_clat_disks_usec; do
		avg=$(bc "${!metric} / $REPEAT_NO")
		printf -v "$metric" '%s' "$avg"
	done
elif [[ "$PLUGIN" == "spdk-perf-nvme" ]]; then
	# Only the mean latency is averaged for spdk-perf-nvme
	mean_lat_disks_usec=$(bc "$mean_lat_disks_usec/$REPEAT_NO")
fi

# One summary row, in the column order of the header written to $result_file
csv_row=(${DISKNO} ${iops_disks} ${mean_lat_disks_usec} ${p90_lat_disks_usec} ${p99_lat_disks_usec}
	${p99_99_lat_disks_usec} ${stdev_disks_usec} ${mean_slat_disks_usec} ${mean_clat_disks_usec} ${bw})
printf "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s\n" "${csv_row[@]}" >> $result_file
366
# Restore the CPU frequency governor that was active before the test
if [[ -n "$CPUFREQ" ]]; then
	cpupower frequency-set -g "$cpu_governor"
fi

# [[ ]] with a quoted operand: the previous "[ $PLUGIN = ... ]" raised a test
# syntax error when the driver value was empty or contained whitespace.
if [[ "$PLUGIN" == "kernel-io-uring" ]]; then
	# Reload the nvme driver so that other test runs are not affected
	modprobe -rv nvme
	modprobe nvme
	# Unquoted on purpose: one argument per disk name
	wait_for_nvme_reload $DISK_NAMES

	# Write back the queue attributes saved in $backup_dir before the test
	for disk in $DISK_NAMES; do
		echo "INFO: Restoring device parameters for $disk"
		sysfs=/sys/block/$disk/queue
		for sysfs_attr in iostats rq_affinity nomerges io_poll_delay; do
			cat "$backup_dir/$disk/$sysfs_attr" > "$sysfs/$sysfs_attr"
		done
	done
fi
# Remove the generated configuration files
rm -f "$testdir/bdev.conf" "$testdir/config.fio"
387