xref: /spdk/test/nvme/perf/common.sh (revision 7192849ed24874f3e9cc31e8a33a9b32c49b9506)
1#!/usr/bin/env bash
2
# Abort on the first unhandled command failure.
set -e

# BASE_DIR is the directory of the invoked script ($0); ROOT_DIR is three
# directory levels above it. All expansions are quoted so that a path
# containing whitespace does not get word-split.
BASE_DIR=$(readlink -f "$(dirname "$0")")
ROOT_DIR=$(readlink -f "$BASE_DIR/../../..")
# Lower-case alias of ROOT_DIR, set before the helper scripts are sourced
# (presumably consumed by them -- confirm against autotest_common.sh).
rootdir=$ROOT_DIR
PLUGIN_DIR=$ROOT_DIR/build/fio
BDEVPERF_DIR=$ROOT_DIR/test/bdev/bdevperf
NVMEPERF_DIR=$ROOT_DIR/build/examples
. "$ROOT_DIR/scripts/common.sh" || exit 1
. "$ROOT_DIR/test/common/autotest_common.sh"
# File the result parsers in this script (get_results & co.) read from.
NVME_FIO_RESULTS=$BASE_DIR/result.json
13
# fio engine options, keyed by the kernel-mode driver name.
declare -A KERNEL_ENGINES=(
	[kernel-libaio]="--ioengine=libaio"
	[kernel-classic-polling]="--ioengine=pvsync2 --hipri=100"
	[kernel-hybrid-polling]="--ioengine=pvsync2 --hipri=100"
	[kernel-io-uring]="--ioengine=io_uring"
)

# Workload defaults; each one can be overridden by a command line option.
RW="randrw"
MIX="100"
IODEPTH="256"
BLK_SIZE="4096"
RUNTIME="600"
RAMP_TIME="30"
NUMJOBS="1"
REPEAT_NO="3"
FIO_BIN="$CONFIG_FIO_SOURCE_DIR/fio"

# Test setup defaults.
PLUGIN=nvme
DISKCFG=
DISKNO=ALL
CPUS_ALLOWED="1"
NOIOSCALING="false"
PRECONDITIONING="true"

# Time stamp taken when this file is evaluated (month_day_year_HHMMSS).
DATE=$(date +'%m_%d_%Y_%H%M%S')
37
# Start a temporary bdev_svc application, print the output of the
# bdev_get_bdevs RPC on stdout and shut the application down again.
# Arguments:
#   $1 - SPDK root directory
#   $2 - configuration file handed to bdev_svc
#   $3 - option used to pass the configuration file (default: -c)
#   $4 - number of one-second polls to wait for /var/run/spdk_bdev0 (default: 30)
# Globals:
#   stubpid (written) - PID of the background bdev_svc process
# Returns:
#   1 if the configuration file is missing or bdev_svc does not come up,
#   otherwise the exit status of the rpc.py call.
function discover_bdevs() {
	local rootdir=$1
	local config_file=$2
	local cfg_type=${3:--c}
	local wait_for_spdk_bdev=${4:-30}
	local rpc_server=/var/tmp/spdk-discover-bdevs.sock
	local rc=0

	# Quoted on purpose: an empty path must be rejected as well.
	if [[ ! -e "$config_file" ]]; then
		echo "Invalid Configuration File: $config_file"
		return 1
	fi

	# Start the bdev service to query for the list of available
	# bdevs.
	"$rootdir/test/app/bdev_svc/bdev_svc" -r "$rpc_server" -i 0 \
		"$cfg_type" "$config_file" &> /dev/null &
	stubpid=$!

	# The timeout is handled explicitly instead of relying on errexit:
	# bash clears "set -e" inside command substitutions, which is how this
	# function is called when its output is captured.
	while [[ ! -e /var/run/spdk_bdev0 ]]; do
		if ((wait_for_spdk_bdev <= 0)) || ! kill -0 "$stubpid" 2> /dev/null; then
			echo "bdev_svc did not come up, giving up" >&2
			kill "$stubpid" 2> /dev/null || true
			wait "$stubpid" 2> /dev/null || true
			return 1
		fi
		wait_for_spdk_bdev=$((wait_for_spdk_bdev - 1))
		sleep 1
	done

	# Get all of the bdevs; remember a failure so that the service is still
	# shut down before it gets reported to the caller.
	"$rootdir/scripts/rpc.py" -s "$rpc_server" bdev_get_bdevs || rc=$?

	# Shut down the bdev service
	kill "$stubpid"
	wait "$stubpid"
	rm -f /var/run/spdk_bdev0
	return "$rc"
}
73
# Generate $BASE_DIR/bdev.conf: an SPDK JSON configuration with one
# bdev_nvme_attach_controller entry (named Nvme0, Nvme1, ...) for every
# address listed in the $DISKCFG file. Lines starting with # are skipped.
# Globals:
#   DISKCFG (read), BASE_DIR (read)
function create_spdk_bdev_conf() {
	local disk_cfg
	local bdev_json_cfg=()
	local bdev_entries
	local idx

	disk_cfg=($(grep -v '^[[:space:]]*#' "$DISKCFG"))

	for idx in "${!disk_cfg[@]}"; do
		bdev_json_cfg+=("$(printf '{"method": "bdev_nvme_attach_controller", "params": {"trtype": "PCIe", "name": "Nvme%d", "traddr": "%s"}}' \
			"$idx" "${disk_cfg[idx]}")")
	done

	# Join the entries with commas. IFS is changed only inside the subshell
	# so that it cannot affect any other expansion in this function.
	bdev_entries=$(
		IFS=","
		echo "${bdev_json_cfg[*]}"
	)

	# jq validates the document and writes it pretty-printed.
	jq -r '.' <<< "{\"subsystems\": [{\"subsystem\": \"bdev\", \"config\": [$bdev_entries]}]}" \
		> "$BASE_DIR/bdev.conf"
}
111
# Check whether the block devices that belong to a PCI address are unmounted.
# Arguments:
#   $1 - PCI address (BDF) to look for in the /sys/block symlink targets
# Returns:
#   0 when no mount point is reported for any matching block device (also
#   when no block device matches at all), otherwise the number of mount
#   points found, capped at 255 so that the status cannot wrap around to 0.
function is_bdf_not_mounted() {
	local bdf=$1
	local blk_path
	local count
	local mountpoints=0

	[[ -n "$bdf" ]] || return 0

	# Walk the /sys/block symlinks directly instead of parsing "ls -l".
	for blk_path in /sys/block/*; do
		[[ "$(readlink "$blk_path")" == *"$bdf"* ]] || continue
		count=$(lsblk "/dev/${blk_path##*/}" --output MOUNTPOINT -n | wc -w)
		mountpoints=$((mountpoints + count))
	done

	return $((mountpoints > 255 ? 255 : mountpoints))
}
120
# Print every entry of a comma-separated CPU list on its own line.
# Arguments:
#   $1 - comma-separated list of CPU cores, e.g. "0,1,2"
function get_cores() {
	local cpu_list="$1"
	local cpu

	# The unquoted expansion is intentional: commas are turned into blanks
	# and the shell splits the list into words.
	for cpu in ${cpu_list//,/ }; do
		echo "$cpu"
	done
}
127
# Print the NUMA node of each given CPU core, one per line, in input order.
# Arguments:
#   $1 - whitespace-separated list of CPU core numbers
# Returns:
#   the exit status of lscpu if it fails
function get_cores_numa_node() {
	local cores=$1
	local core
	local cpu_node_map

	# Query lscpu once and reuse its "cpu,node" table for every core.
	cpu_node_map=$(lscpu -p=cpu,node) || return

	for core in $cores; do
		# Compare as strings so only the exact CPU number matches.
		awk -F ',' -v core="$core" '($1 "") == (core "") {print $2}' <<< "$cpu_node_map"
	done
}
134
# Print the NUMA node of every given disk, one value per line.
# Arguments:
#   $1 - driver/plugin name; it selects how the entries of $2 are interpreted:
#        name contains "nvme" -> PCI addresses,
#        name contains "bdev" -> SPDK bdev names,
#        anything else        -> kernel block device names
#   $2 - whitespace-separated list of disks
# Globals:
#   ROOT_DIR, BASE_DIR (read, "bdev" case only)
function get_numa_node() {
	local plugin=$1
	local disks=$2
	local bdf
	local name
	local driver
	local bdevs
	local bdev_bdf

	if [[ "$plugin" =~ "nvme" ]]; then
		for bdf in $disks; do
			driver=$(grep DRIVER "/sys/bus/pci/devices/$bdf/uevent" | awk -F"=" '{print $2}')
			# Use this check to omit blacklisted devices (not bound to a userspace driver by the setup.sh script)
			if [[ "$driver" == "vfio-pci" || "$driver" == "uio_pci_generic" ]]; then
				cat "/sys/bus/pci/devices/$bdf/numa_node"
			fi
		done
	elif [[ "$plugin" =~ "bdev" ]]; then
		bdevs=$(discover_bdevs "$ROOT_DIR" "$BASE_DIR/bdev.conf" --json)
		for name in $disks; do
			bdev_bdf=$(jq -r ".[] | select(.name==\"$name\").driver_specific.nvme.pci_address" <<< "$bdevs")
			cat "/sys/bus/pci/devices/$bdev_bdf/numa_node"
		done
	else
		for name in $disks; do
			# Not reading directly from /sys/block/nvme* because of a kernel bug
			# which results in NUMA 0 always getting reported.
			bdf=$(cat "/sys/block/$name/device/address")
			cat "/sys/bus/pci/devices/$bdf/numa_node"
		done
	fi
}
165
# Print the disks to test, in the notation the given driver expects.
# Arguments:
#   $1 - driver/plugin name:
#        name contains "nvme" -> PCI addresses from the $DISKCFG file,
#        name contains "bdev" -> generated bdev names Nvme0n1, Nvme1n1, ...,
#        anything else        -> kernel block device names of the listed
#                                addresses for which is_bdf_not_mounted succeeds
# Globals:
#   DISKCFG (read) - file with one PCI address per line; lines starting
#                    with # are ignored
function get_disks() {
	local plugin=$1
	local disk_cfg
	local bdev_names=()
	local idx
	local bdf
	local blk_path

	disk_cfg=($(grep -v '^[[:space:]]*#' "$DISKCFG"))
	if [[ "$plugin" =~ "nvme" ]]; then
		# PCI BDF address is enough for nvme-perf and nvme-fio-plugin,
		# so just print them from configuration file
		echo "${disk_cfg[*]}"
	elif [[ "$plugin" =~ "bdev" ]]; then
		# Generate one NvmeXn1 bdev name per configured disk for bdev-perf
		# and bdev-fio-plugin. Looping over the indexes prints nothing for
		# an empty configuration (a {0..-1} brace range would not).
		for idx in "${!disk_cfg[@]}"; do
			bdev_names+=("Nvme${idx}n1")
		done
		echo "${bdev_names[*]}"
	else
		# Find nvme block devices and only use the ones which
		# are not mounted
		for bdf in "${disk_cfg[@]}"; do
			if is_bdf_not_mounted "$bdf"; then
				for blk_path in /sys/block/*; do
					if [[ "$(readlink "$blk_path")" == *"$bdf"* ]]; then
						echo "${blk_path##*/}"
					fi
				done
			fi
		done
	fi
}
194
# Count how many of the given devices are attached to a NUMA node.
# Arguments:
#   $1 - whitespace-separated list of devices
#   $2 - whitespace-separated list of NUMA nodes, one per device, same order
#   $3 - NUMA node to count devices for
# Outputs:
#   the number of matching devices (0 when there is none)
function get_disks_on_numa() {
	local devs=($1)
	local numas=($2)
	local numa_no=$3
	local disks_on_numa=0
	local i

	for ((i = 0; i < ${#devs[@]}; i++)); do
		if [[ "${numas[i]}" == "$numa_no" ]]; then
			disks_on_numa=$((disks_on_numa + 1))
		fi
	done
	echo "$disks_on_numa"
}
209
# Append the per-job sections to $BASE_DIR/config.fio.
# Arguments:
#   $1 - number of disks to use
#   $2 - driver/plugin name
#   $3 - whitespace-separated list of disks
#   $4 - whitespace-separated list of the disks' NUMA nodes (same order as $3)
#   $5 - whitespace-separated list of CPU cores
# For a plugin name containing "kernel" one section per disk is written; the
# disk may run on all given cores of its NUMA node. For any other plugin one
# section per core is written and the disks are spread over the cores,
# preferring disks on the core's own NUMA node.
# Returns:
#   1 when no cores are given or more disks are requested than are available.
function create_fio_config() {
	local disk_no=$1
	local plugin=$2
	local disks=($3)
	local disks_numa=($4)
	local cores=($5)
	local total_disks=${#disks[@]}
	local no_cores=${#cores[@]}
	local fio_cfg=$BASE_DIR/config.fio
	local filename=""
	local cores_numa core_numa cpu_used
	local disks_per_core disks_per_core_mod total_disks_per_core
	local i j m n pick

	if ((no_cores == 0)); then
		echo "error: no CPU cores given, cannot create fio job sections" >&2
		return 1
	fi

	cores_numa=($(get_cores_numa_node "$5"))
	disks_per_core=$((disk_no / no_cores))
	disks_per_core_mod=$((disk_no % no_cores))

	# For kernel driver, each disk will be aligned with all cpus on the same NUMA node
	if [[ "$plugin" =~ "kernel" ]]; then
		for ((i = 0; i < disk_no; i++)); do
			cpu_used=""
			for ((j = 0; j < no_cores; j++)); do
				if [[ "${disks_numa[i]}" == "${cores_numa[j]}" ]]; then
					cpu_used+="${cores[j]},"
				fi
			done
			# No given core shares the disk's NUMA node (e.g. the node is
			# reported as -1): allow all given cores rather than writing
			# an empty cpus_allowed option.
			if [[ -z "$cpu_used" ]]; then
				cpu_used=$(printf '%s,' "${cores[@]}")
			fi
			{
				echo "[filename${i}]"
				echo "filename=/dev/${disks[i]}"
				echo "cpus_allowed=$cpu_used"
				echo ""
			} >> "$fio_cfg"
		done
	else
		for ((i = 0; i < no_cores; i++)); do
			core_numa=${cores_numa[i]}
			# The first (disk_no % no_cores) cores take one extra disk.
			total_disks_per_core=$disks_per_core
			if ((disks_per_core_mod > 0)); then
				total_disks_per_core=$((disks_per_core + 1))
				disks_per_core_mod=$((disks_per_core_mod - 1))
			fi

			if ((total_disks_per_core == 0)); then
				break
			fi

			{
				echo "[filename${i}]"
				#use cpus_allowed as cpumask works only for cores 1-32
				echo "cpus_allowed=${cores[i]}"
			} >> "$fio_cfg"

			for ((m = 0; m < total_disks_per_core; m++)); do
				pick=-1
				# Prefer the first unclaimed disk on the core's NUMA node ...
				for ((n = 0; n < total_disks; n++)); do
					if [[ "${disks_numa[n]}" == "$core_numa" ]]; then
						pick=$n
						break
					fi
				done
				# ... and fall back to the first unclaimed disk on any node.
				if ((pick < 0)); then
					for ((n = 0; n < total_disks; n++)); do
						if [[ "${disks_numa[n]}" != "x" ]]; then
							pick=$n
							break
						fi
					done
				fi
				if ((pick < 0)); then
					echo "error: $disk_no disks requested but only $total_disks available" >&2
					return 1
				fi

				if [[ "$plugin" == "spdk-plugin-nvme" ]]; then
					filename="trtype=PCIe traddr=${disks[pick]//:/.} ns=1"
				elif [[ "$plugin" == "spdk-plugin-bdev" ]]; then
					filename=${disks[pick]}
				fi
				echo "filename=$filename" >> "$fio_cfg"
				#Mark numa of the picked disk as "x" to mark it as claimed
				disks_numa[pick]="x"
			done
			echo "" >> "$fio_cfg"
		done
	fi
}
286
# Precondition the disks by writing them sequentially with the SPDK NVMe
# fio plugin (1M blocks, --size=100%, two loops).
# Globals:
#   ROOT_DIR, BASE_DIR (read)
# The temporary $BASE_DIR/config.fio is created from config.fio.tmp and
# removed again once fio has finished.
function preconditioning() {
	local filename=""
	local nvme_list
	local nvme

	HUGEMEM=8192 "$ROOT_DIR/scripts/setup.sh"
	cp "$BASE_DIR/config.fio.tmp" "$BASE_DIR/config.fio"
	echo "[Preconditioning]" >> "$BASE_DIR/config.fio"

	# Generate filename argument for FIO: one colon-terminated
	# "trtype=PCIe traddr=<address> ns=1" entry per address printed by
	# "get_disks nvme". fio treats ":" as the entry separator, hence the
	# colons inside the PCI address are replaced with dots.
	nvme_list=$(get_disks nvme)
	for nvme in $nvme_list; do
		filename+="trtype=PCIe traddr=${nvme//:/.} ns=1:"
	done
	echo "** Preconditioning disks, this can take a while, depending on the size of disks."
	run_spdk_nvme_fio "spdk-plugin-nvme" --filename="$filename" --size=100% --loops=2 --bs=1M \
		--rw=write --iodepth=32 --output-format=normal
	rm -f "$BASE_DIR/config.fio"
}
310
# Extract one metric from the fio JSON result file.
# Arguments:
#   $1 - metric name: iops, mean_lat_usec, p99_lat_usec, p99_99_lat_usec,
#        stdev_usec, mean_slat_usec, mean_clat_usec or bw_Kibs
#   $2 - percentage of reads in the workload (0-100); latency metrics are
#        the read and write values weighted with this ratio
# Globals:
#   NVME_FIO_RESULTS (read)
# Outputs:
#   the metric truncated to an integer; latencies are converted from
#   nanoseconds to microseconds. Nothing is printed for an unknown metric.
function get_results() {
	local reads_pct
	local writes_pct
	local filter
	local value

	reads_pct=$(bc -l <<< "scale=3; $2/100")
	writes_pct=$(bc -l <<< "scale=3; 1-$reads_pct")

	case "$1" in
		iops)
			filter=".read.iops + .write.iops"
			;;
		mean_lat_usec)
			filter=".read.lat_ns.mean * $reads_pct + .write.lat_ns.mean * $writes_pct"
			;;
		p99_lat_usec)
			# "//" binds looser than "*" and "+" in jq, so every fallback
			# needs its own parentheses; otherwise the whole expression
			# collapses to the unweighted read percentile.
			filter="(.read.clat_ns.percentile.\"99.000000\" // 0) * $reads_pct + (.write.clat_ns.percentile.\"99.000000\" // 0) * $writes_pct"
			;;
		p99_99_lat_usec)
			filter="(.read.clat_ns.percentile.\"99.990000\" // 0) * $reads_pct + (.write.clat_ns.percentile.\"99.990000\" // 0) * $writes_pct"
			;;
		stdev_usec)
			filter=".read.clat_ns.stddev * $reads_pct + .write.clat_ns.stddev * $writes_pct"
			;;
		mean_slat_usec)
			filter=".read.slat_ns.mean * $reads_pct + .write.slat_ns.mean * $writes_pct"
			;;
		mean_clat_usec)
			filter=".read.clat_ns.mean * $reads_pct + .write.clat_ns.mean * $writes_pct"
			;;
		bw_Kibs)
			filter=".read.bw + .write.bw"
			;;
		*)
			return 0
			;;
	esac

	value=$(jq -r ".jobs[] | ($filter)" "$NVME_FIO_RESULTS")
	# Drop the fractional part, shell arithmetic is integer only.
	value=${value%.*}

	case "$1" in
		iops) echo $value ;;
		bw_Kibs) echo $((value)) ;;
		*) echo $((value / 1000)) ;;
	esac
}
360
# Parse the "Total" line of the bdevperf output stored in $NVME_FIO_RESULTS.
# Arguments:
#   $1 - "iops" prints the second whitespace-separated field following the
#        word "Total"; "bw_Kibs" prints the fourth one multiplied by 1024.
# Both values are truncated to integers first.
function get_bdevperf_results() {
	case "$1" in
		iops)
			iops=$(awk -F 'Total' '/Total/ {split($2, col, " "); print col[2]}' $NVME_FIO_RESULTS)
			iops=${iops%.*}
			echo $iops
			;;
		bw_Kibs)
			bw_MBs=$(awk -F 'Total' '/Total/ {split($2, col, " "); print col[4]}' $NVME_FIO_RESULTS)
			bw_MBs=${bw_MBs%.*}
			echo $((bw_MBs * 1024))
			;;
	esac
}
375
# Parse the "Total :" summary line of the nvme perf output stored in
# $NVME_FIO_RESULTS.
# Outputs:
#   one line "<iops> <bandwidth * 1024> <mean lat> <min lat> <max lat>",
#   every value truncated to an integer.
function get_nvmeperf_results() {
	local iops bw_MBs mean_lat_usec max_lat_usec min_lat_usec
	local summary
	local metric

	# Squeeze repeated blanks, then keep what follows "Total : ".
	summary=$(tr -s " " < $NVME_FIO_RESULTS | grep -oP "(?<=Total : )(.*+)") || true
	read -r iops bw_MBs mean_lat_usec min_lat_usec max_lat_usec <<< $summary

	# The fractional parts have to go because the values are used in
	# arithmetic expressions rather than being handed to "bc".
	for metric in iops bw_MBs mean_lat_usec min_lat_usec max_lat_usec; do
		printf -v "$metric" '%s' "${!metric%.*}"
	done

	echo "$iops $(bc <<< "$bw_MBs * 1024") $mean_lat_usec $min_lat_usec $max_lat_usec"
}
395
# Run fio with one of the SPDK fio plugins preloaded.
# Arguments:
#   $1   - "spdk-plugin-nvme" or "spdk-plugin-bdev"; fio is not started for
#          any other value
#   $2.. - additional options passed to fio
# Globals:
#   PLUGIN_DIR, FIO_BIN, BASE_DIR (read)
function run_spdk_nvme_fio() {
	local plugin=$1
	local preload_lib=""
	local engine_opts=()

	echo "** Running fio test, this can take a while, depending on the run-time and ramp-time setting."

	case "$plugin" in
		spdk-plugin-nvme)
			preload_lib=$PLUGIN_DIR/spdk_nvme
			engine_opts=(--ioengine=spdk)
			;;
		spdk-plugin-bdev)
			preload_lib=$PLUGIN_DIR/spdk_bdev
			engine_opts=(--ioengine=spdk_bdev --spdk_json_conf=$BASE_DIR/bdev.conf --spdk_mem=4096)
			;;
	esac

	# preload_lib stays empty for an unknown plugin name.
	if [[ -n $preload_lib ]]; then
		LD_PRELOAD=$preload_lib $FIO_BIN $BASE_DIR/config.fio --output-format=json "${@:2}" "${engine_opts[@]}"
	fi

	sleep 1
}
407
# Run plain fio (no SPDK plugin) on $BASE_DIR/config.fio with JSON output.
# Arguments:
#   $@ - additional options passed to fio
function run_nvme_fio() {
	local banner="** Running fio test, this can take a while, depending on the run-time and ramp-time setting."

	echo "$banner"
	$FIO_BIN $BASE_DIR/config.fio --output-format=json "$@"
	sleep 1
}
413
# Run bdevperf on the bdevs described by $BASE_DIR/bdev.conf.
# Globals:
#   BDEVPERF_DIR, BASE_DIR, IODEPTH, BLK_SIZE, RW, MIX, RUNTIME,
#   CPUS_ALLOWED (read)
function run_bdevperf() {
	local bdevperf_opts=(
		--json $BASE_DIR/bdev.conf
		-q $IODEPTH
		-o $BLK_SIZE
		-w $RW
		-M $MIX
		-t $RUNTIME
		-m "[$CPUS_ALLOWED]"
	)

	echo "** Running bdevperf test, this can take a while, depending on the run-time setting."
	$BDEVPERF_DIR/bdevperf "${bdevperf_opts[@]}"
	sleep 1
}
419
# Run the SPDK nvme perf tool on the first $1 disks printed by "get_disks nvme".
# Arguments:
#   $1 - number of disks to use
# Globals:
#   NVMEPERF_DIR, IODEPTH, BLK_SIZE, RW, MIX, RUNTIME, CPUS_ALLOWED (read)
function run_nvmeperf() {
	local disks
	local disk
	local r_opt=()

	# Limit the number of disks to $1 if needed
	disks=($(get_disks nvme))
	disks=("${disks[@]:0:$1}")

	# Collect one "-r <transport id>" pair per disk. Keeping the words in an
	# array preserves the blank inside each transport id without a second
	# shell, and keeps "[cores]" from being expanded as a glob pattern.
	for disk in "${disks[@]}"; do
		r_opt+=(-r "trtype:PCIe traddr:$disk")
	done

	echo "** Running nvme perf test, this can take a while, depending on the run-time setting."

	"$NVMEPERF_DIR/perf" "${r_opt[@]}" -q "$IODEPTH" -o "$BLK_SIZE" -w "$RW" -M "$MIX" \
		-t "$RUNTIME" -c "[$CPUS_ALLOWED]"
	sleep 1
}
436
# Block until the queue attributes of the given block devices show up in
# sysfs, polling every 0.5 seconds.
# Arguments:
#   $1 - whitespace-separated list of block device names (e.g. nvme0n1)
# The extglob shell option is needed for the pattern below; its previous
# state is restored before returning.
function wait_for_nvme_reload() {
	local nvmes=$1
	local disk
	local cmd
	local had_extglob=false

	if shopt -q extglob; then
		had_extglob=true
	fi
	shopt -s extglob

	for disk in $nvmes; do
		# The pattern is kept in a string and expanded at run time: an
		# extended glob written literally would be a syntax error if
		# extglob is disabled while this function is being parsed.
		cmd="ls /sys/block/$disk/queue/*@(iostats|rq_affinity|nomerges|io_poll_delay)*"
		until $cmd 2> /dev/null; do
			echo "Waiting for full nvme driver reload..."
			sleep 0.5
		done
	done

	if [[ $had_extglob == false ]]; then
		shopt -u extglob
	fi
}
450
# Check if we have appropriate number of disks to carry out the test.
# Globals:
#   PLUGIN (read)
#   DISKNO (read/written) - "ALL"/"all" is replaced by the number of disks found
#   disks (written)       - array with the output of "get_disks $PLUGIN"
# Returns:
#   1 if DISKNO is not a number or exceeds the number of disks found
function verify_disk_number() {
	disks=($(get_disks "$PLUGIN"))
	if [[ $DISKNO == "ALL" ]] || [[ $DISKNO == "all" ]]; then
		DISKNO=${#disks[@]}
	# Validate the format first; the comparison forces base 10 so that a
	# value with leading zeros (e.g. 08) is not rejected as invalid octal.
	elif [[ ! $DISKNO =~ ^[0-9]+$ ]] || ((10#$DISKNO > ${#disks[@]})); then
		echo "error: Required devices number ($DISKNO) is not a valid number or it's larger than the number of devices found (${#disks[@]})"
		return 1
	fi
}
461
# Print the help text.
# Arguments:
#   $1 - path of the script; its basename is shown in the "Usage:" line
#   $2 - optional message printed (followed by an empty line) before the help
# Command tracing is switched off while printing and switched on afterwards.
function usage() {
	set +x
	if [[ -n $2 ]]; then
		echo "$2"
		echo ""
	fi
	printf '%s\n' \
		"Run NVMe PMD/BDEV performance test. Change options for easier debug and setup configuration" \
		"Usage: $(basename $1) [options]" \
		"-h, --help                Print help and exit" \
		"" \
		"Workload parameters:" \
		"    --rw=STR              Type of I/O pattern. Accepted values are randrw,rw. [default=$RW]" \
		"    --rwmixread=INT       Percentage of a mixed workload that should be reads. [default=$MIX]" \
		"    --iodepth=INT         Number of I/Os to keep in flight against the file. [default=$IODEPTH]" \
		"    --block-size=INT      The  block  size  in  bytes  used for I/O units. [default=$BLK_SIZE]" \
		"    --run-time=TIME[s]    Tell fio to run the workload for the specified period of time. [default=$RUNTIME]" \
		"    --ramp-time=TIME[s]   Fio will run the specified workload for this amount of time before" \
		"                          logging any performance numbers. [default=$RAMP_TIME]. Applicable only for fio-based tests." \
		"    --numjobs=INT         Create the specified number of clones of this job. [default=$NUMJOBS]" \
		"                          Applicable only for fio-based tests." \
		"    --repeat-no=INT       How many times to repeat workload test. [default=$REPEAT_NO]" \
		"                          Test result will be an average of repeated test runs." \
		"    --fio-bin=PATH        Path to fio binary. [default=$FIO_BIN]" \
		"                          Applicable only for fio-based tests." \
		"" \
		"Test setup parameters:" \
		"    --driver=STR          Selects tool used for testing. Choices available:" \
		"                             - spdk-perf-nvme (SPDK nvme perf)" \
		"                             - spdk-perf-bdev (SPDK bdev perf)" \
		"                             - spdk-plugin-nvme (SPDK nvme fio plugin)" \
		"                             - spdk-plugin-bdev (SPDK bdev fio plugin)" \
		"                             - kernel-classic-polling" \
		"                             - kernel-hybrid-polling" \
		"                             - kernel-libaio" \
		"                             - kernel-io-uring" \
		"    --disk-config         Configuration file containing PCI BDF addresses of NVMe disks to use in test." \
		"                          It consists a single column of PCI addresses. SPDK Bdev names will be assigned" \
		"                          and Kernel block device names detected." \
		"                          Lines starting with # are ignored as comments." \
		"    --max-disk=INT,ALL    Number of disks to test on, this will run multiple workloads with increasing number of disk each run." \
		"                          If =ALL then test on all found disk. [default=$DISKNO]" \
		"    --cpu-allowed=INT     Comma-separated list of CPU cores used to run the workload. [default=$CPUS_ALLOWED]" \
		"    --no-preconditioning  Skip preconditioning" \
		"    --no-io-scaling       Do not scale iodepth for each device in SPDK fio plugin. [default=$NOIOSCALING]"
	set -x
}
508
# Parse the command line options and store their values in the global
# configuration variables (RW, MIX, IODEPTH, ...).
# Long options are handled through the getopts "-" pseudo option: for
# "--name=value" getopts reports the option "-" with OPTARG "name=value".
# Prints the usage text and exits for --help/-h (status 0) and for invalid
# options or a missing disk configuration file (status 1).
function _parse_perf_args() {
	local optchar

	while getopts 'h-:' optchar; do
		case "$optchar" in
			-)
				case "$OPTARG" in
					help)
						usage "$0"
						exit 0
						;;
					rw=*) RW="${OPTARG#*=}" ;;
					rwmixread=*) MIX="${OPTARG#*=}" ;;
					iodepth=*) IODEPTH="${OPTARG#*=}" ;;
					block-size=*) BLK_SIZE="${OPTARG#*=}" ;;
					run-time=*) RUNTIME="${OPTARG#*=}" ;;
					ramp-time=*) RAMP_TIME="${OPTARG#*=}" ;;
					numjobs=*) NUMJOBS="${OPTARG#*=}" ;;
					repeat-no=*) REPEAT_NO="${OPTARG#*=}" ;;
					fio-bin=*) FIO_BIN="${OPTARG#*=}" ;;
					driver=*) PLUGIN="${OPTARG#*=}" ;;
					disk-config=*)
						DISKCFG="${OPTARG#*=}"
						if [[ ! -f "$DISKCFG" ]]; then
							echo "Disk confiuration file $DISKCFG does not exist!"
							exit 1
						fi
						;;
					max-disk=*) DISKNO="${OPTARG#*=}" ;;
					cpu-allowed=*) CPUS_ALLOWED="${OPTARG#*=}" ;;
					no-preconditioning) PRECONDITIONING=false ;;
					no-io-scaling) NOIOSCALING=true ;;
					*)
						# The message is the second argument of usage.
						usage "$0" "Invalid argument '$OPTARG'"
						exit 1
						;;
				esac
				;;
			h)
				usage "$0"
				exit 0
				;;
			*)
				usage "$0" "Invalid argument '$optchar'"
				exit 1
				;;
		esac
	done
}

_parse_perf_args "$@"
554