xref: /spdk/test/nvme/perf/common.sh (revision cc6920a4763d4b9a43aa40583c8397d8f14fa100)
#!/usr/bin/env bash

function discover_bdevs() {
	local rootdir=$1
	local config_file=$2
	local wait_for_spdk_bdev=30
	local rpc_server=/var/tmp/spdk-discover-bdevs.sock

	if [ ! -e $config_file ]; then
		echo "Invalid Configuration File: $config_file"
		return 1
	fi

	# Start the bdev service to query for the list of available
	# bdevs.
	$rootdir/test/app/bdev_svc/bdev_svc -r $rpc_server -i 0 \
		--json $config_file &> /dev/null &
	stubpid=$!
	while ! [ -e /var/run/spdk_bdev0 ]; do
		# When this counter reaches zero the arithmetic expression below
		# returns a non-zero status, so errexit aborts the test.
		((wait_for_spdk_bdev--))
		sleep 1
	done

	# Get all of the bdevs
	$rootdir/scripts/rpc.py -s "$rpc_server" bdev_get_bdevs

	# Shut down the bdev service
	kill $stubpid
	wait $stubpid
	rm -f /var/run/spdk_bdev0
}
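
# Illustrative usage (paths and bdev names are examples only): the function prints
# the bdev list as JSON, which callers typically filter with jq, e.g.
#   bdevs=$(discover_bdevs "$rootdir" "$testdir/bdev.conf")
#   jq -r '.[].name' <<< "$bdevs"   # e.g. Nvme0n1, Nvme1n1, ...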

function create_spdk_bdev_conf() {
	local output
	local disk_cfg
	local bdev_io_cache_size=$1
	local bdev_io_pool_size=$2
	local bdev_json_cfg=()
	local bdev_opts=()

	disk_cfg=($(grep -vP "^\s*#" "$DISKCFG"))

	if [[ -n "$bdev_io_cache_size" ]]; then
		bdev_opts+=("\"bdev_io_cache_size\": $bdev_io_cache_size")
	fi

	if [[ -n "$bdev_io_pool_size" ]]; then
		bdev_opts+=("\"bdev_io_pool_size\": $bdev_io_pool_size")
	fi

	local IFS=","
	if [[ ${#bdev_opts[@]} -gt 0 ]]; then
		bdev_json_cfg+=("$(
			cat <<- JSON
				{
					"method": "bdev_set_options",
					"params": {
						${bdev_opts[*]}
					}
				}
			JSON
		)")
	fi

	for i in "${!disk_cfg[@]}"; do
		bdev_json_cfg+=("$(
			cat <<- JSON
				{
					"method": "bdev_nvme_attach_controller",
					"params": {
						"trtype": "PCIe",
						"name":"Nvme${i}",
						"traddr":"${disk_cfg[i]}"
					}
				}
			JSON
		)")
	done

	local IFS=","
	jq -r '.' <<- JSON > $testdir/bdev.conf
		{
			"subsystems": [
				{
					"subsystem": "bdev",
					"config": [
						${bdev_json_cfg[*]},
						{
							"method": "bdev_wait_for_examine"
						}
					]
				}
			]
		}
	JSON
}
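
# For reference, with a single disk at 0000:5e:00.0 (an illustrative address) and no
# bdev options, the generated $testdir/bdev.conf looks roughly like:
#   { "subsystems": [ { "subsystem": "bdev", "config": [
#       { "method": "bdev_nvme_attach_controller",
#         "params": { "trtype": "PCIe", "name": "Nvme0", "traddr": "0000:5e:00.0" } },
#       { "method": "bdev_wait_for_examine" } ] } ] }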

function is_bdf_not_mounted() {
	local bdf=$1
	local blkname
	local mountpoints
	blkname=$(ls -l /sys/block/ | grep $bdf | awk '{print $9}')
	mountpoints=$(lsblk /dev/$blkname --output MOUNTPOINT -n | wc -w)
	# The number of mountpoints is used as the exit status, so 0 (success)
	# means the device is not mounted anywhere.
	return $mountpoints
}
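
# Illustrative usage (the BDF is an example): a plain "if" treats an unmounted
# device as success:
#   if is_bdf_not_mounted "0000:5e:00.0"; then echo "device is not mounted"; fi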

function get_cores() {
	local cpu_list="$1"
	for cpu in ${cpu_list//,/ }; do
		echo $cpu
	done
}

function get_cores_numa_node() {
	local cores=$1
	for core in $cores; do
		lscpu -p=cpu,node | grep "^$core\b" | awk -F ',' '{print $2}'
	done
}
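
# Example: "lscpu -p=cpu,node" prints lines such as "0,0" and "28,1", so for
# cores "0 28" this would print "0" and "1" (the mapping depends on the host).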

function get_numa_node() {
	local plugin=$1
	local disks=$2
	if [[ "$plugin" =~ "nvme" ]]; then
		for bdf in $disks; do
			local driver
			driver=$(grep DRIVER /sys/bus/pci/devices/$bdf/uevent | awk -F"=" '{print $2}')
			# Use this check to omit blocked devices (i.e. ones not bound to a userspace driver by the setup.sh script)
			if [ "$driver" = "vfio-pci" ] || [ "$driver" = "uio_pci_generic" ]; then
				cat /sys/bus/pci/devices/$bdf/numa_node
			fi
		done
	elif [[ "$plugin" =~ "bdev" ]]; then
		local bdevs
		bdevs=$(discover_bdevs $rootdir $testdir/bdev.conf)
		for name in $disks; do
			local bdev_bdf
			bdev_bdf=$(jq -r ".[] | select(.name==\"$name\").driver_specific.nvme.pci_address" <<< $bdevs)
			cat /sys/bus/pci/devices/$bdev_bdf/numa_node
		done
	else
		for name in $disks; do
			local bdf
			# Not reading directly from /sys/block/nvme* because of a kernel bug
			# which results in NUMA 0 always getting reported.
			bdf=$(cat /sys/block/$name/device/address)
			cat /sys/bus/pci/devices/$bdf/numa_node
		done
	fi
}
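
# Illustrative call (addresses and output depend on the host):
#   get_numa_node "nvme" "0000:5e:00.0 0000:d8:00.0"   # prints e.g. "0" and "1"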

function get_disks() {
	local plugin=$1
	local disk_cfg

	disk_cfg=($(grep -vP "^\s*#" "$DISKCFG"))
	if [[ "$plugin" =~ "nvme" ]]; then
		# A PCI BDF address is enough for nvme-perf and nvme-fio-plugin,
		# so just print the addresses from the configuration file
		echo "${disk_cfg[*]}"
	elif [[ "$plugin" =~ "bdev" ]]; then
		# Generate NvmeXn1 bdev names for bdev-perf and bdev-fio-plugin
		local bdevs
		local disk_no
		disk_no=${#disk_cfg[@]}
		eval echo "Nvme{0..$((disk_no - 1))}n1"
	else
		# Find nvme block devices and only use the ones which
		# are not mounted
		for bdf in "${disk_cfg[@]}"; do
			if is_bdf_not_mounted $bdf; then
				local blkname
				blkname=$(ls -l /sys/block/ | grep $bdf | awk '{print $9}')
				echo $blkname
			fi
		done
	fi
}
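
# Illustrative output for a two-disk DISKCFG (values depend on the host; plugin
# names other than "nvme"/"bdev" fall through to the kernel block-device case):
#   get_disks nvme    -> 0000:5e:00.0 0000:d8:00.0
#   get_disks bdev    -> Nvme0n1 Nvme1n1
#   get_disks kernel  -> nvme0n1 nvme1n1 (unmounted devices only)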

function get_disks_on_numa() {
	local devs=($1)
	local numas=($2)
	local numa_no=$3
	local disks_on_numa=0
	local i

	for ((i = 0; i < ${#devs[@]}; i++)); do
		if [ ${numas[$i]} = $numa_no ]; then
			disks_on_numa=$((disks_on_numa + 1))
		fi
	done
	echo $disks_on_numa
}
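
# Example: for devs "nvme0n1 nvme1n1 nvme2n1" with numas "0 0 1",
#   get_disks_on_numa "nvme0n1 nvme1n1 nvme2n1" "0 0 1" 0   # prints 2
#   get_disks_on_numa "nvme0n1 nvme1n1 nvme2n1" "0 0 1" 1   # prints 1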

function create_fio_config() {
	local disk_no=$1
	local plugin=$2
	local disks=($3)
	local disks_numa=($4)
	local cores=($5)
	local total_disks=${#disks[@]}
	local fio_job_section=()
	local num_cores=${#cores[@]}
	local disks_per_core=$((disk_no / num_cores))
	local disks_per_core_mod=$((disk_no % num_cores))
	local cores_numa
	cores_numa=($(get_cores_numa_node "${cores[*]}"))

	# The following part of this function still leverages global variables a lot.
	# It's a messy mix of local variables passed as function arguments and globals.
	# TODO: Modify this to be consistent with how variables are used here. Aim for using only
	# local variables to get rid of globals as much as possible.
	desc="\"Test io_plugin=$PLUGIN Blocksize=${BLK_SIZE} Workload=$RW MIX=${MIX} qd=${IODEPTH}\""
	cp "$testdir/config.fio.tmp" "$testdir/config.fio"
	cat <<- EOF >> $testdir/config.fio
		description=$desc

		rw=$RW
		rwmixread=$MIX
		bs=$BLK_SIZE
		runtime=$RUNTIME
		ramp_time=$RAMP_TIME
		numjobs=$NUMJOBS
		log_avg_msec=$SAMPLING_INT
	EOF

	if $GTOD_REDUCE; then
		echo "gtod_reduce=1" >> $testdir/config.fio
	fi

	if [[ $PLUGIN =~ "uring" ]]; then
		cat <<- EOF >> $testdir/config.fio
			fixedbufs=1
			hipri=1
			registerfiles=1
			sqthread_poll=1
		EOF
	fi

	if [[ "$IO_BATCH_SUBMIT" -gt 0 ]]; then
		echo "iodepth_batch_submit=$IO_BATCH_SUBMIT" >> $testdir/config.fio
	fi

	if [[ "$IO_BATCH_COMPLETE" -gt 0 ]]; then
		echo "iodepth_batch_complete=$IO_BATCH_COMPLETE" >> $testdir/config.fio
	fi

	for i in "${!cores[@]}"; do
		local m=0 # Counter of disks per NUMA node
		local n=0 # Counter of all disks in test
		core_numa=${cores_numa[$i]}

		total_disks_per_core=$disks_per_core
		# Check how many "stray" disks are still unassigned to CPU cores.
		# Assign one of them to the current CPU core and subtract it from the
		# total of unassigned disks.
		if [[ "$disks_per_core_mod" -gt "0" ]]; then
			total_disks_per_core=$((disks_per_core + 1))
			disks_per_core_mod=$((disks_per_core_mod - 1))
		fi
		# SPDK fio plugin supports submitting/completing I/Os to multiple SSDs from a single thread.
		# Therefore, the per-thread queue depth is set to the desired IODEPTH per device times the number of devices per thread.
		QD=$IODEPTH
		if [[ "$NOIOSCALING" == false ]]; then
			QD=$((IODEPTH * total_disks_per_core))
		fi

		if [[ "$FIO_FNAME_STRATEGY" == "group" ]]; then
			fio_job_section+=("")
			fio_job_section+=("[filename${i}]")
			fio_job_section+=("iodepth=$QD")
			fio_job_section+=("cpus_allowed=${cores[$i]} #CPU NUMA Node ${cores_numa[$i]}")
		fi

		while [[ "$m" -lt "$total_disks_per_core" ]]; do
			# Add a disk to the job section only if its NUMA node matches the NUMA
			# node of the currently selected CPU
			if [[ "${disks_numa[$n]}" == "$core_numa" ]]; then
				if [[ "$FIO_FNAME_STRATEGY" == "split" ]]; then
					fio_job_section+=("")
					fio_job_section+=("[filename${m}-${cores[$i]}]")
					fio_job_section+=("iodepth=$QD")
					fio_job_section+=("cpus_allowed=${cores[$i]} #CPU NUMA Node ${cores_numa[$i]}")
				fi

				if [[ "$plugin" == "spdk-plugin-nvme" ]]; then
					fio_job_section+=("filename=trtype=PCIe traddr=${disks[$n]//:/.} ns=1 #NVMe NUMA Node ${disks_numa[$n]}")
				elif [[ "$plugin" == "spdk-plugin-bdev" ]]; then
					fio_job_section+=("filename=${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}")
				elif [[ "$plugin" =~ "kernel" ]]; then
					fio_job_section+=("filename=/dev/${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}")
				fi
				m=$((m + 1))

				# Mark the NUMA node of the n-th disk as "x" so it is treated as
				# already claimed in subsequent loop iterations
				disks_numa[$n]="x"
			fi
			n=$((n + 1))

			# If there are no more disks on the same NUMA node as the CPU, switch to
			# the other NUMA node, go back to the start of the loop and try again.
			if [[ $n -ge $total_disks ]]; then
				echo "WARNING! Cannot assign any more NVMes for CPU ${cores[$i]}"
				echo "NVMe assignment for this CPU will be cross-NUMA."
				if [[ "$core_numa" == "1" ]]; then
					core_numa=0
				else
					core_numa=1
				fi
				n=0
			fi
		done
	done

	printf "%s\n" "${fio_job_section[@]}" >> $testdir/config.fio
	echo "INFO: Generated fio configuration file:"
	cat $testdir/config.fio
}
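
# For reference: with FIO_FNAME_STRATEGY=group, a single core "0" and two NVMe disks
# on NUMA node 0, the appended job section would look roughly like this
# (addresses and numbers are illustrative):
#   [filename0]
#   iodepth=256
#   cpus_allowed=0 #CPU NUMA Node 0
#   filename=trtype=PCIe traddr=0000.5e.00.0 ns=1 #NVMe NUMA Node 0
#   filename=trtype=PCIe traddr=0000.d8.00.0 ns=1 #NVMe NUMA Node 0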

function preconditioning() {
	local dev_name=""
	local filename=""
	local nvme_list

	HUGEMEM=8192 $rootdir/scripts/setup.sh
	cp $testdir/config.fio.tmp $testdir/config.fio
	echo "[Preconditioning]" >> $testdir/config.fio

	# Generate the filename argument for fio.
	# We only want to target NVMes not bound to the kernel nvme driver.
	# If they're still bound to nvme, that means they were skipped by
	# setup.sh on purpose.
	nvme_list=$(get_disks nvme)
	for nvme in $nvme_list; do
		dev_name='trtype=PCIe traddr='${nvme//:/.}' ns=1'
		filename+=$(printf %s":" "$dev_name")
	done
	echo "** Preconditioning disks, this can take a while, depending on the size of the disks."
	run_spdk_nvme_fio "spdk-plugin-nvme" --filename="$filename" --size=100% --loops=2 --bs=1M \
		--rw=write --iodepth=32 --output-format=normal
	rm -f $testdir/config.fio
}
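
# The resulting --filename value joins all targets with ":" separators, e.g.
# (illustrative addresses):
#   trtype=PCIe traddr=0000.5e.00.0 ns=1:trtype=PCIe traddr=0000.d8.00.0 ns=1: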

# Wrapper around the system bc that evaluates its argument with 3 decimal
# places of precision.
function bc() {
	$(type -P bc) -l <<< "scale=3; $1"
}
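
# Example: bc "70 / 100" prints .700, which get_results() uses as the read
# fraction of a 70/30 mixed workload.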

function get_results() {
	local iops bw stdev
	local mean_lat p90_lat p99_lat p99_99_lat
	local mean_slat mean_clat
	local reads_pct
	local writes_pct

	reads_pct=$(bc "$1 / 100")
	writes_pct=$(bc "1 - $reads_pct")

	iops=$(jq -r '.jobs[] | .read.iops + .write.iops' $TMP_RESULT_FILE)
	bw=$(jq -r ".jobs[] | (.read.bw + .write.bw)" $TMP_RESULT_FILE)
	mean_lat=$(jq -r ".jobs[] | (.read.lat_ns.mean * $reads_pct + .write.lat_ns.mean * $writes_pct)/1000" $TMP_RESULT_FILE)
	# Percentiles may be missing for pure read or pure write workloads, so default
	# them to 0. The parentheses around "// 0" are needed because jq's alternative
	# operator binds more loosely than "*" and "+".
	p90_lat=$(jq -r ".jobs[] | ((.read.clat_ns.percentile.\"90.000000\" // 0) * $reads_pct + (.write.clat_ns.percentile.\"90.000000\" // 0) * $writes_pct)/1000" $TMP_RESULT_FILE)
	p99_lat=$(jq -r ".jobs[] | ((.read.clat_ns.percentile.\"99.000000\" // 0) * $reads_pct + (.write.clat_ns.percentile.\"99.000000\" // 0) * $writes_pct)/1000" $TMP_RESULT_FILE)
	p99_99_lat=$(jq -r ".jobs[] | ((.read.clat_ns.percentile.\"99.990000\" // 0) * $reads_pct + (.write.clat_ns.percentile.\"99.990000\" // 0) * $writes_pct)/1000" $TMP_RESULT_FILE)
	stdev=$(jq -r ".jobs[] | (.read.clat_ns.stddev * $reads_pct + .write.clat_ns.stddev * $writes_pct)/1000" $TMP_RESULT_FILE)
	mean_slat=$(jq -r ".jobs[] | (.read.slat_ns.mean * $reads_pct + .write.slat_ns.mean * $writes_pct)/1000" $TMP_RESULT_FILE)
	mean_clat=$(jq -r ".jobs[] | (.read.clat_ns.mean * $reads_pct + .write.clat_ns.mean * $writes_pct)/1000" $TMP_RESULT_FILE)

	echo "$iops $bw $mean_lat $p90_lat $p99_lat $p99_99_lat $stdev $mean_slat $mean_clat"
}
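
# Example of the weighting: for a 70/30 read/write mix, mean_lat is
#   (read_lat_ns_mean * 0.7 + write_lat_ns_mean * 0.3) / 1000   # in usec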

function get_bdevperf_results() {
	local iops
	local bw_MBs
	read -r iops bw_MBs <<< $(grep Total $TMP_RESULT_FILE | tr -s " " | awk -F ":| " '{print $5" "$7}')
	echo "$iops $(bc "$bw_MBs * 1024")"
}

function get_nvmeperf_results() {
	local iops
	local bw_MBs
	local mean_lat_usec
	local max_lat_usec
	local min_lat_usec

	read -r iops bw_MBs mean_lat_usec min_lat_usec max_lat_usec <<< $(tr -s " " < $TMP_RESULT_FILE | grep -oP "(?<=Total : )(.*+)")
	echo "$iops $(bc "$bw_MBs * 1024") $mean_lat_usec $min_lat_usec $max_lat_usec"
}
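
# The "Total :" summary line of the nvme perf app is expected to contain, in order:
# IOPS, bandwidth in MiB/s, and average/min/max latency in usec; the bandwidth is
# converted to KiB/s before being echoed.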

function run_spdk_nvme_fio() {
	local plugin=$1
	echo "** Running fio test, this can take a while, depending on the run-time and ramp-time setting."
	if [[ "$plugin" = "spdk-plugin-nvme" ]]; then
		LD_PRELOAD=$plugin_dir/spdk_nvme $FIO_BIN $testdir/config.fio --output-format=json "${@:2}" --ioengine=spdk
	elif [[ "$plugin" = "spdk-plugin-bdev" ]]; then
		LD_PRELOAD=$plugin_dir/spdk_bdev $FIO_BIN $testdir/config.fio --output-format=json "${@:2}" --ioengine=spdk_bdev --spdk_json_conf=$testdir/bdev.conf --spdk_mem=4096
	fi

	sleep 1
}

function run_nvme_fio() {
	echo "** Running fio test, this can take a while, depending on the run-time and ramp-time setting."
	$FIO_BIN $testdir/config.fio --output-format=json "$@"
	sleep 1
}

function run_bdevperf() {
	local bdevperf_rpc
	local bdevperf_pid
	local rpc_socket
	local bpf_script_cmd
	local bpf_script_pid
	local bpf_app_pid
	local main_core_param=""

	bdevperf_rpc="$rootdir/test/bdev/bdevperf/bdevperf.py"
	rpc_socket="/var/tmp/spdk.sock"

	if [[ -n $MAIN_CORE ]]; then
		main_core_param="-p ${MAIN_CORE}"
	fi

	echo "** Running bdevperf test, this can take a while, depending on the run-time setting."
	$bdevperf_dir/bdevperf --json $testdir/bdev.conf -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -m "[$CPUS_ALLOWED]" -r "$rpc_socket" $main_core_param -z &
	bdevperf_pid=$!
	waitforlisten $bdevperf_pid

	if [[ ${#BPFTRACES[@]} -gt 0 ]]; then
		echo "INFO: Enabling BPF Traces ${BPFTRACES[*]}"
		bpf_script_cmd=("$rootdir/scripts/bpftrace.sh")
		bpf_script_cmd+=("$bdevperf_pid")
		for trace in "${BPFTRACES[@]}"; do
			bpf_script_cmd+=("$rootdir/scripts/bpf/$trace")
		done

		BPF_OUTFILE=$TMP_BPF_FILE "${bpf_script_cmd[@]}" &
		bpf_script_pid=$!
		sleep 3
	fi

	PYTHONPATH=$PYTHONPATH:$rootdir/scripts $bdevperf_rpc -s "$rpc_socket" perform_tests

	# Using the "-z" option causes bdevperf to NOT exit automatically after running the test,
	# so we need to stop it ourselves.
	kill -s SIGINT $bdevperf_pid
	wait $bdevperf_pid

	if ((bpf_script_pid)); then
		wait $bpf_script_pid
	fi
	sleep 1
}

function run_nvmeperf() {
	# Prepare the -r argument string for the nvme perf command
	local r_opt
	local disks

	# Limit the number of disks to $1 if needed
	disks=($(get_disks nvme))
	disks=("${disks[@]:0:$1}")
	r_opt=$(printf -- ' -r "trtype:PCIe traddr:%s"' "${disks[@]}")

	echo "** Running nvme perf test, this can take a while, depending on the run-time setting."

	# Run the command in a separate shell as this solves quoting issues related to the r_opt var
	$SHELL -c "$nvmeperf_dir/perf $r_opt -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -c [$CPUS_ALLOWED]"
	sleep 1
}
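
# Illustrative r_opt value for two disks (addresses are examples):
#   -r "trtype:PCIe traddr:0000:5e:00.0" -r "trtype:PCIe traddr:0000:d8:00.0"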

function wait_for_nvme_reload() {
	local nvmes=$1

	shopt -s extglob
	for disk in $nvmes; do
		cmd="ls /sys/block/$disk/queue/*@(iostats|rq_affinity|nomerges|io_poll_delay)*"
		until $cmd 2> /dev/null; do
			echo "Waiting for full nvme driver reload..."
			sleep 0.5
		done
	done
	# Disable extglob again now that the patterns have been expanded
	shopt -u extglob
}

function verify_disk_number() {
	# Check if we have an appropriate number of disks to carry out the test
	disks=($(get_disks $PLUGIN))
	if [[ $DISKNO == "ALL" ]] || [[ $DISKNO == "all" ]]; then
		DISKNO=${#disks[@]}
	elif [[ $DISKNO -gt ${#disks[@]} ]] || [[ ! $DISKNO =~ ^[0-9]+$ ]]; then
		echo "error: Requested number of devices ($DISKNO) is not a valid number or is larger than the number of devices found (${#disks[@]})"
		false
	fi
}