xref: /spdk/test/nvme/perf/common.sh (revision b3bec07939ebe2ea2e0c43931705d32aa9e06719)
1#!/usr/bin/env bash
2#  SPDX-License-Identifier: BSD-3-Clause
3#  Copyright (C) 2018 Intel Corporation
4#  All rights reserved.
5#
6source "$rootdir/test/dd/common.sh"
7
8function discover_bdevs() {
9	local rootdir=$1
10	local config_file=$2
11	local wait_for_spdk_bdev=90
12	local rpc_server=/var/tmp/spdk-discover-bdevs.sock
13
14	if [ ! -e $config_file ]; then
15		echo "Invalid Configuration File: $config_file"
16		return 1
17	fi
18
19	# Start the bdev service to query for the list of available
20	# bdevs.
21	$rootdir/test/app/bdev_svc/bdev_svc -r $rpc_server -i 0 \
22		--json $config_file &> /dev/null &
23	stubpid=$!
24	while ! [ -e /var/run/spdk_bdev0 ]; do
25		# Once this counter drops to zero, the next decrement returns a non-zero status and errexit aborts the test
26		((wait_for_spdk_bdev--))
27		sleep 1
28	done
29
30	# Get all of the bdevs
31	$rootdir/scripts/rpc.py -s "$rpc_server" bdev_get_bdevs
32
33	# Shut down the bdev service
34	kill $stubpid
35	wait $stubpid
36	rm -f /var/run/spdk_bdev0
37}
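# Example usage (illustrative): list the names of the bdevs described by an
# already generated JSON config. Assumes bdev_svc has been built under $rootdir.
#
#   discover_bdevs "$rootdir" "$testdir/bdev.conf" | jq -r '.[].name'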
38
39function get_disk_cfg() {
40	grep -vP "^\s*#" "$DISKCFG"
41}
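# Example (illustrative): $DISKCFG is expected to point at a plain-text file
# with one PCIe BDF per line; lines starting with "#" are ignored, e.g.:
#
#   # gen4 drives
#   0000:1a:00.0
#   0000:1b:00.0
#
#   disk_cfg=($(get_disk_cfg))   # -> ("0000:1a:00.0" "0000:1b:00.0")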
42
43function create_spdk_bdev_conf() {
44	local output
45	local disk_cfg
46	local bdev_io_cache_size=$1
47	local bdev_io_pool_size=$2
48	local bdev_json_cfg=()
49	local bdev_opts=()
50	local i
51
52	disk_cfg=($(get_disk_cfg))
53
54	if [[ -n "$bdev_io_cache_size" ]]; then
55		bdev_opts+=("\"bdev_io_cache_size\": $bdev_io_cache_size")
56	fi
57
58	if [[ -n "$bdev_io_pool_size" ]]; then
59		bdev_opts+=("\"bdev_io_pool_size\": $bdev_io_pool_size")
60	fi
61
62	local IFS=","
63	if [[ ${#bdev_opts[@]} -gt 0 ]]; then
64		bdev_json_cfg+=("$(
65			cat <<- JSON
66				{
67					"method": "bdev_set_options",
68					"params": {
69						${bdev_opts[*]}
70					}
71				}
72			JSON
73		)")
74	fi
75
76	for i in "${!disk_cfg[@]}"; do
77		bdev_json_cfg+=("$(
78			cat <<- JSON
79				{
80					"method": "bdev_nvme_attach_controller",
81					"params": {
82						"trtype": "PCIe",
83						"name":"Nvme${i}",
84						"traddr":"${disk_cfg[i]}"
85					}
86				}
87			JSON
88		)")
89	done
90
91	local IFS=","
92	jq -r '.' <<- JSON > $testdir/bdev.conf
93		{
94			"subsystems": [
95				{
96					"subsystem": "bdev",
97					"config": [
98						${bdev_json_cfg[*]},
99						{
100							"method": "bdev_wait_for_examine"
101						}
102					]
103				}
104			]
105		}
106	JSON
107}
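# Example usage (illustrative): write $testdir/bdev.conf attaching every disk
# from $DISKCFG, with a custom bdev I/O cache and pool size. Both arguments are
# optional; leave them empty to keep the SPDK defaults.
#
#   create_spdk_bdev_conf 512 65536
#   jq '.subsystems[0].config' "$testdir/bdev.conf"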
108
109function is_bdf_not_mounted() {
110	local bdf=$1
111	local blkname
112	local mountpoints
113	blkname=$(ls -l /sys/block/ | grep $bdf | awk '{print $9}')
114	mountpoints=$(lsblk /dev/$blkname --output MOUNTPOINT -n | wc -w)
115	return $mountpoints
116}
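# Example usage (illustrative): returns success (0) only when the block device
# behind the given BDF has no mounted filesystems.
#
#   if is_bdf_not_mounted "0000:1a:00.0"; then
#           echo "0000:1a:00.0 is safe to use in the test"
#   fi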
117
118function get_cores() {
119	local cpu_list="$1"
120	for cpu in ${cpu_list//,/ }; do
121		echo $cpu
122	done
123}
124
125function get_cores_numa_node() {
126	local cores=$1
127	for core in $cores; do
128		lscpu -p=cpu,node | grep "^$core\b" | awk -F ',' '{print $2}'
129	done
130}
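# Example (illustrative): expand a comma-separated CPU list and look up the NUMA
# node of each core; both helpers print one value per line.
#
#   cores=$(get_cores "0,1,24,25")
#   get_cores_numa_node "$cores"   # e.g. 0 0 1 1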
131
132function get_numa_node() {
133	local plugin=$1
134	local disks=$2
135	if [[ "$plugin" =~ "nvme" ]]; then
136		for bdf in $disks; do
137			local driver
138			driver=$(grep DRIVER /sys/bus/pci/devices/$bdf/uevent | awk -F"=" '{print $2}')
139			# Use this check to omit blocked devices (i.e. not bound to a driver by the setup.sh script)
140			if [ "$driver" = "vfio-pci" ] || [ "$driver" = "uio_pci_generic" ]; then
141				cat /sys/bus/pci/devices/$bdf/numa_node
142			fi
143		done
144	elif [[ "$plugin" =~ "bdev" ]]; then
145		local bdevs
146		bdevs=$(discover_bdevs $rootdir $testdir/bdev.conf)
147		for name in $disks; do
148			local bdev_bdf
149			bdev_bdf=$(jq -r ".[] | select(.name==\"$name\").driver_specific.nvme[].pci_address" <<< "$bdevs")
150			cat /sys/bus/pci/devices/$bdev_bdf/numa_node
151		done
152	else
153		for name in $disks; do
154			cat "/sys/block/$name/device/numa_node"
155		done
156	fi
157}
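# Example usage (illustrative): print the NUMA node of every test device. For
# "nvme" plugins $2 holds PCIe BDFs, for "bdev" plugins bdev names, otherwise
# kernel block device names.
#
#   get_numa_node "$PLUGIN" "$(get_disks "$PLUGIN")"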
158
159function get_disks() {
160	local plugin=$1
161	local disk_cfg=($(get_disk_cfg))
162
163	if [[ "$plugin" =~ "nvme" ]]; then
164		# PCI BDF addresses are enough for nvme-perf and nvme-fio-plugin,
165		# so just print them from the configuration file
166		echo "${disk_cfg[*]}"
167	elif [[ "$plugin" =~ "bdev" ]]; then
168		# Generate NvmeXn1 bdev names for bdev-perf
169		# and bdev-fio-plugin
170		local bdevs
171		local disk_no
172		disk_no=${#disk_cfg[@]}
173		eval echo "Nvme{0..$((disk_no - 1))}n1"
174	else
175		# Find nvme block devices and only use the ones which
176		# are not mounted
177		for bdf in "${disk_cfg[@]}"; do
178			if is_bdf_not_mounted $bdf; then
179				local blkname
180				blkname=$(ls -l /sys/block/ | grep $bdf | awk '{print $9}')
181				echo $blkname
182			fi
183		done
184	fi
185}
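# Example (illustrative): the identifiers returned depend on the plugin type.
#
#   get_disks spdk-plugin-nvme    # PCIe BDFs straight from $DISKCFG
#   get_disks spdk-plugin-bdev    # generated bdev names: Nvme0n1 Nvme1n1 ...
#   get_disks kernel-libaio       # unmounted kernel block devices: nvme0n1 ...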
186
187function get_disks_on_numa() {
188	local devs=($1)
189	local numas=($2)
190	local numa_no=$3
191	local disks_on_numa=0
192	local i
193
194	for ((i = 0; i < ${#devs[@]}; i++)); do
195		if [ ${numas[$i]} = $numa_no ]; then
196			disks_on_numa=$((disks_on_numa + 1))
197		fi
198	done
199	echo $disks_on_numa
200}
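# Example (illustrative): count how many of the test disks live on NUMA node 0.
#
#   disks=($(get_disks "$PLUGIN"))
#   disks_numa=($(get_numa_node "$PLUGIN" "${disks[*]}"))
#   get_disks_on_numa "${disks[*]}" "${disks_numa[*]}" 0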
201
202function create_fio_config() {
203	local disk_no=$1
204	local plugin=$2
205	local disks=($3)
206	local disks_numa=($4)
207	local cores=($5)
208	local total_disks=${#disks[@]}
209	local fio_job_section=()
210	local num_cores=${#cores[@]}
211	local disks_per_core=$((disk_no / num_cores))
212	local disks_per_core_mod=$((disk_no % num_cores))
213	local cores_numa
214	cores_numa=($(get_cores_numa_node "${cores[*]}"))
215
216	# The remaining part of this function still leverages global variables a lot.
217	# It is a messy mix of local variables passed in as arguments and globals.
218	# TODO: Make variable usage consistent here - prefer local variables and get rid
219	# of globals as much as possible.
220	desc="\"Test io_plugin=$PLUGIN Blocksize=${BLK_SIZE} Workload=$RW MIX=${MIX} qd=${IODEPTH}\""
221	cp "$testdir/config.fio.tmp" "$testdir/config.fio"
222	cat <<- EOF >> $testdir/config.fio
223		description=$desc
224
225		rw=$RW
226		rwmixread=$MIX
227		bs=$BLK_SIZE
228		runtime=$RUNTIME
229		ramp_time=$RAMP_TIME
230		numjobs=$NUMJOBS
231		log_avg_msec=$SAMPLING_INT
232	EOF
233
234	if $GTOD_REDUCE; then
235		echo "gtod_reduce=1" >> $testdir/config.fio
236	fi
237
238	if [[ $PLUGIN =~ "uring" || $PLUGIN =~ "xnvme" ]]; then
239		cat <<- EOF >> $testdir/config.fio
240			fixedbufs=1
241			hipri=1
242			registerfiles=1
243			sqthread_poll=1
244		EOF
245	fi
246
247	if [[ "$IO_BATCH_SUBMIT" -gt 0 ]]; then
248		echo "iodepth_batch_submit=$IO_BATCH_SUBMIT" >> $testdir/config.fio
249	fi
250
251	if [[ "$IO_BATCH_COMPLETE" -gt 0 ]]; then
252		echo "iodepth_batch_complete=$IO_BATCH_COMPLETE" >> $testdir/config.fio
253	fi
254
255	for i in "${!cores[@]}"; do
256		local m=0 #Counter of disks per NUMA node
257		local n=0 #Counter of all disks in test
258		core_numa=${cores_numa[$i]}
259
260		total_disks_per_core=$disks_per_core
261		# If there are "stray" disks left over from the even split across CPU cores,
262		# assign one of them to the current CPU core and subtract it from the total
263		# of unassigned disks
264		if [[ "$disks_per_core_mod" -gt "0" ]]; then
265			total_disks_per_core=$((disks_per_core + 1))
266			disks_per_core_mod=$((disks_per_core_mod - 1))
267		fi
268		# The SPDK fio plugin supports submitting/completing I/Os to multiple SSDs from a single thread.
269		# Therefore, the per-thread queue depth is set to the desired per-device IODEPTH multiplied by the number of devices handled by that thread.
270		QD=$IODEPTH
271		if [[ "$NOIOSCALING" == false ]]; then
272			QD=$((IODEPTH * total_disks_per_core))
273		fi
274
275		if [[ "$FIO_FNAME_STRATEGY" == "group" ]]; then
276			fio_job_section+=("")
277			fio_job_section+=("[filename${i}]")
278			fio_job_section+=("iodepth=$QD")
279			fio_job_section+=("cpus_allowed=${cores[$i]} #CPU NUMA Node ${cores_numa[$i]}")
280		fi
281
282		while [[ "$m" -lt "$total_disks_per_core" ]]; do
283			# Try to add a disk to the job section if its NUMA node matches the NUMA
284			# node of the currently selected CPU
285			if [[ "${disks_numa[$n]}" == "$core_numa" ]]; then
286				if [[ "$FIO_FNAME_STRATEGY" == "split" ]]; then
287					fio_job_section+=("")
288					fio_job_section+=("[filename${m}-${cores[$i]}]")
289					fio_job_section+=("iodepth=$QD")
290					fio_job_section+=("cpus_allowed=${cores[$i]} #CPU NUMA Node ${cores_numa[$i]}")
291				fi
292
293				if [[ "$plugin" == "spdk-plugin-nvme" ]]; then
294					fio_job_section+=("filename=trtype=PCIe traddr=${disks[$n]//:/.} ns=1 #NVMe NUMA Node ${disks_numa[$n]}")
295				elif [[ "$plugin" == "spdk-plugin-bdev" || "$plugin" == "spdk-plugin-bdev-xnvme" ]]; then
296					fio_job_section+=("filename=${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}")
297				elif [[ "$plugin" =~ "kernel" ]]; then
298					fio_job_section+=("filename=/dev/${disks[$n]} #NVMe NUMA Node ${disks_numa[$n]}")
299				fi
300				m=$((m + 1))
301
302				# Mark the NUMA node of the n'th disk as "x" so it is treated as claimed in subsequent loop iterations
303				disks_numa[n]="x"
304			fi
305			n=$((n + 1))
306
307			# If there are no more disks on the same NUMA node as the CPU, switch to the
308			# other NUMA node, go back to the start of the list and try again.
309			if [[ $n -ge $total_disks ]]; then
310				echo "WARNING! Cannot assign any more NVMes for CPU ${cores[$i]}"
311				echo "NVMe assignment for this CPU will be cross-NUMA."
312				if [[ "$core_numa" == "1" ]]; then
313					core_numa=0
314				else
315					core_numa=1
316				fi
317				n=0
318			fi
319		done
320	done
321
322	printf "%s\n" "${fio_job_section[@]}" >> $testdir/config.fio
323	echo "INFO: Generated fio configuration file:"
324	cat $testdir/config.fio
325}
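# Example usage (illustrative): build a config for 4 disks driven by the SPDK
# bdev fio plugin. Relies on config.fio.tmp and on the globals set by the
# calling test script (PLUGIN, RW, MIX, BLK_SIZE, IODEPTH, RUNTIME, ...).
#
#   disks=($(get_disks spdk-plugin-bdev))
#   disks_numa=($(get_numa_node spdk-plugin-bdev "${disks[*]}"))
#   cores=($(get_cores "$CPUS_ALLOWED"))
#   create_fio_config 4 spdk-plugin-bdev "${disks[*]}" "${disks_numa[*]}" "${cores[*]}"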
326
327function bc() {
328	$(type -P bc) -l <<< "scale=3; $1"
329}
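# Example (illustrative): the wrapper evaluates the expression with 3 decimal
# places of precision.
#
#   bc "75 / 100"   # -> .750
#   bc "1 - .750"   # -> .250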
330
331function get_results() {
332	local iops bw stdev mean_lat
333	local p90_lat p99_lat p99_99_lat
334	local mean_slat mean_clat
335	local reads_pct
336	local writes_pct
337
338	reads_pct=$(bc "$1 / 100")
339	writes_pct=$(bc "1 - $reads_pct")
340
341	iops=$(jq -r '.jobs[] | .read.iops + .write.iops' $TMP_RESULT_FILE)
342	bw=$(jq -r ".jobs[] | (.read.bw + .write.bw)" $TMP_RESULT_FILE)
343	mean_lat=$(jq -r ".jobs[] | (.read.lat_ns.mean * $reads_pct + .write.lat_ns.mean * $writes_pct)/1000" $TMP_RESULT_FILE)
344	p90_lat=$(jq -r ".jobs[] | ((.read.clat_ns.percentile.\"90.000000\" // 0) * $reads_pct + (.write.clat_ns.percentile.\"90.000000\" // 0) * $writes_pct)/1000" $TMP_RESULT_FILE)
345	p99_lat=$(jq -r ".jobs[] | ((.read.clat_ns.percentile.\"99.000000\" // 0) * $reads_pct + (.write.clat_ns.percentile.\"99.000000\" // 0) * $writes_pct)/1000" $TMP_RESULT_FILE)
346	p99_99_lat=$(jq -r ".jobs[] | ((.read.clat_ns.percentile.\"99.990000\" // 0) * $reads_pct + (.write.clat_ns.percentile.\"99.990000\" // 0) * $writes_pct)/1000" $TMP_RESULT_FILE)
347	stdev=$(jq -r ".jobs[] | (.read.clat_ns.stddev * $reads_pct + .write.clat_ns.stddev * $writes_pct)/1000" $TMP_RESULT_FILE)
348	mean_slat=$(jq -r ".jobs[] | (.read.slat_ns.mean * $reads_pct + .write.slat_ns.mean * $writes_pct)/1000" $TMP_RESULT_FILE)
349	mean_clat=$(jq -r ".jobs[] | (.read.clat_ns.mean * $reads_pct + .write.clat_ns.mean * $writes_pct)/1000" $TMP_RESULT_FILE)
350
351	echo "$iops $bw $mean_lat $p90_lat $p99_lat $p99_99_lat $stdev $mean_slat $mean_clat"
352}
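# Example usage (illustrative): parse fio's JSON output for a 70% read mixed
# workload. Assumes $TMP_RESULT_FILE holds results produced with
# --output-format=json and group_reporting (a single entry in .jobs).
#
#   read -r iops bw mean_lat p90 p99 p99_99 stdev slat clat <<< "$(get_results 70)"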
353
354function get_bdevperf_results() {
355	local iops
356	local bw_MBs
357	read -r iops bw_MBs <<< $(grep Total $TMP_RESULT_FILE | tr -s " " | awk -F ":| " '{print $5" "$7}')
358	echo "$iops $(bc "$bw_MBs * 1024")"
359}
360
361function get_nvmeperf_results() {
362	local iops
363	local bw_MBs
364	local mean_lat_usec
365	local max_lat_usec
366	local min_lat_usec
367
368	read -r iops bw_MBs mean_lat_usec min_lat_usec max_lat_usec <<< $(tr -s " " < $TMP_RESULT_FILE | grep -oP "(?<=Total : )(.*+)")
369	echo "$iops $(bc "$bw_MBs * 1024") $mean_lat_usec $min_lat_usec $max_lat_usec"
370}
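# Example usage (illustrative): both helpers scrape the "Total" summary line from
# $TMP_RESULT_FILE; the parsed bandwidth is multiplied by 1024 before being echoed.
#
#   read -r iops bw_kibs <<< "$(get_bdevperf_results)"
#   read -r iops bw_kibs mean_lat min_lat max_lat <<< "$(get_nvmeperf_results)"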
371
372function run_spdk_nvme_fio() {
373	local plugin=$1
374	echo "** Running fio test, this can take a while, depending on the run-time and ramp-time setting."
375	if [[ "$plugin" = "spdk-plugin-nvme" ]]; then
376		LD_PRELOAD=$plugin_dir/spdk_nvme $FIO_BIN $testdir/config.fio --output-format=json "${@:2}" --ioengine=spdk
377	elif [[ "$plugin" = "spdk-plugin-bdev" || "$plugin" = "spdk-plugin-bdev-xnvme" ]]; then
378		LD_PRELOAD=$plugin_dir/spdk_bdev $FIO_BIN $testdir/config.fio --output-format=json "${@:2}" --ioengine=spdk_bdev --spdk_json_conf=$testdir/bdev.conf --spdk_mem=4096
379	fi
380
381	sleep 1
382}
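# Example usage (illustrative): any arguments after the plugin name are passed
# straight to fio, e.g. to store the JSON results for get_results():
#
#   run_spdk_nvme_fio spdk-plugin-bdev --output="$TMP_RESULT_FILE"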
383
384function run_nvme_fio() {
385	echo "** Running fio test, this can take a while, depending on the run-time and ramp-time setting."
386	$FIO_BIN $testdir/config.fio --output-format=json "$@"
387	sleep 1
388}
389
390function run_bdevperf() {
391	local bdevperf_rpc
392	local bdevperf_pid
393	local rpc_socket
394	local bpf_script_cmd
395	local bpf_script_pid
396	local bpf_app_pid
397	local main_core_param=""
398
399	bdevperf_rpc="$rootdir/examples/bdev/bdevperf/bdevperf.py"
400	rpc_socket="/var/tmp/spdk.sock"
401
402	if [[ -n $MAIN_CORE ]]; then
403		main_core_param="-p ${MAIN_CORE}"
404	fi
405
406	echo "** Running bdevperf test, this can take a while, depending on the run-time setting."
407	$_examples_dir/bdevperf --json $testdir/bdev.conf -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -m "[$CPUS_ALLOWED]" -r "$rpc_socket" $main_core_param -z &
408	bdevperf_pid=$!
409	waitforlisten $bdevperf_pid "$rpc_socket" 500
410
411	if [[ ${#BPFTRACES[@]} -gt 0 ]]; then
412		echo "INFO: Enabling BPF Traces ${BPFTRACES[*]}"
413		bpf_script_cmd=("$rootdir/scripts/bpftrace.sh")
414		bpf_script_cmd+=("$bdevperf_pid")
415		for trace in "${BPFTRACES[@]}"; do
416			bpf_script_cmd+=("$rootdir/scripts/bpf/$trace")
417		done
418
419		BPF_OUTFILE=$TMP_BPF_FILE "${bpf_script_cmd[@]}" &
420		bpf_script_pid=$!
421		sleep 3
422	fi
423
424	PYTHONPATH=$PYTHONPATH:$rootdir/python $bdevperf_rpc -s "$rpc_socket" -t $((RUNTIME + 10)) perform_tests
425
426	# Using "-z" option causes bdevperf to NOT exit automatically after running the test,
427	# so we need to stop it ourselves.
428	kill -s SIGINT $bdevperf_pid
429	wait $bdevperf_pid
430
431	if ((bpf_script_pid)); then
432		wait $bpf_script_pid
433	fi
434	sleep 1
435}
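# Example usage (illustrative): assumes $testdir/bdev.conf was generated by
# create_spdk_bdev_conf() and that IODEPTH, BLK_SIZE, RW, MIX, RUNTIME and
# CPUS_ALLOWED are set by the calling test script.
#
#   run_bdevperf > "$TMP_RESULT_FILE"
#   read -r iops bw_kibs <<< "$(get_bdevperf_results)"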
436
437function run_nvmeperf() {
438	# Prepare -r argument string for nvme perf command
439	local r_opt
440	local disks
441
442	# Limit the number of disks to $1 if needed
443	disks=($(get_disks nvme))
444	disks=("${disks[@]:0:$1}")
445	r_opt=$(printf -- ' -r "trtype:PCIe traddr:%s"' "${disks[@]}")
446
447	echo "** Running nvme perf test, this can take a while, depending on the run-time setting."
448
449	# Run the command in a separate shell; this avoids quoting issues with the r_opt variable
450	$SHELL -c "$_examples_dir/perf $r_opt -q $IODEPTH -o $BLK_SIZE -w $RW -M $MIX -t $RUNTIME -c [$CPUS_ALLOWED]"
451	sleep 1
452}
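# Example usage (illustrative): benchmark the first two NVMes from $DISKCFG and
# keep the summary line for get_nvmeperf_results().
#
#   run_nvmeperf 2 > "$TMP_RESULT_FILE"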
453
454function wait_for_nvme_reload() {
455	local nvmes=$1
456
457	shopt -s extglob
458	for disk in $nvmes; do
459		cmd="ls /sys/block/$disk/queue/*@(iostats|rq_affinity|nomerges|io_poll_delay)*"
460		until $cmd 2> /dev/null; do
461			echo "Waiting for full nvme driver reload..."
462			sleep 0.5
463		done
464	done
465	shopt -u extglob
466}
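# Example usage (illustrative): block until the kernel nvme driver has re-created
# the sysfs queue attributes for the given devices, e.g. after rebinding them
# from vfio-pci back to the kernel driver.
#
#   wait_for_nvme_reload "nvme0n1 nvme1n1"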
467
468function verify_disk_number() {
469	# Check if we have an appropriate number of disks to carry out the test
470	disks=($(get_disks $PLUGIN))
471	if [[ $DISKNO == "ALL" ]] || [[ $DISKNO == "all" ]]; then
472		DISKNO=${#disks[@]}
473	elif [[ $DISKNO -gt ${#disks[@]} ]] || [[ ! $DISKNO =~ ^[0-9]+$ ]]; then
474		echo "error: Requested number of devices ($DISKNO) is not a valid number or is larger than the number of devices found (${#disks[@]})"
475		false
476	fi
477}
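# Example usage (illustrative): assumes $PLUGIN and $DISKCFG are set by the
# calling test script. With DISKNO=ALL the check also rewrites DISKNO to the
# number of devices detected for the selected plugin.
#
#   DISKNO="ALL"
#   verify_disk_number && echo "Using $DISKNO disk(s) for the test"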
478
479function create_spdk_xnvme_bdev_conf() {
480	local bdev_io_cache_size=$1 bdev_io_pool_size=$2
481	local blocks block_idx io_mechanism=io_uring
482
483	(($#)) && local -A method_bdev_set_options_0
484
485	blocks=($(get_disks))
486
487	if [[ -n $bdev_io_cache_size ]]; then
488		method_bdev_set_options_0["bdev_io_cache_size"]=$bdev_io_cache_size
489	fi
490	if [[ -n $bdev_io_pool_size ]]; then
491		method_bdev_set_options_0["bdev_io_pool_size"]=$bdev_io_pool_size
492	fi
493
494	for block_idx in "${!blocks[@]}"; do
495		local -A method_bdev_xnvme_create_$block_idx
496		local -n rpc_ref=method_bdev_xnvme_create_$block_idx
497		rpc_ref["filename"]=/dev/${blocks[block_idx]}
498		rpc_ref["io_mechanism"]=$io_mechanism
499		rpc_ref["name"]=${blocks[block_idx]}
500	done
501	gen_conf > "$testdir/bdev.conf"
502}
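# Example usage (illustrative): generate an xnvme bdev configuration for all
# unmounted kernel NVMe block devices from $DISKCFG; gen_conf() is provided by
# the sourced test/dd/common.sh.
#
#   create_spdk_xnvme_bdev_conf "" 65536
#   jq '.' "$testdir/bdev.conf"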
503
504# LVOL support functions
505function start_spdk_tgt() {
506	$SPDK_BIN_DIR/spdk_tgt -g &
507	spdk_tgt_pid=$!
508
509	waitforlisten $spdk_tgt_pid
510}
511
512function stop_spdk_tgt() {
513	killprocess $spdk_tgt_pid
514}
515
516function attach_bdevs() {
517	local disk_cfg=($(get_disk_cfg))
518	local i
519	for i in "${!disk_cfg[@]}"; do
520		$rpc_py bdev_nvme_attach_controller -b "Nvme${i}" -t pcie -a "${disk_cfg[i]}"
521		echo "Attached NVMe bdev Nvme${i} with BDF ${disk_cfg[i]}"
522	done
523}
524
525function cleanup_lvol_cfg() {
526	local -a lvol_stores
527	local -a lvol_bdevs
528	local lvol_store lvol_bdev
529
530	echo "Cleanup lvols"
531	lvol_stores=($($rpc_py bdev_lvol_get_lvstores | jq -r '.[].uuid'))
532	for lvol_store in "${lvol_stores[@]}"; do
533		lvol_bdevs=($($rpc_py bdev_lvol_get_lvols -u $lvol_store | jq -r '.[].uuid'))
534		for lvol_bdev in "${lvol_bdevs[@]}"; do
535			$rpc_py bdev_lvol_delete $lvol_bdev
536			echo "lvol bdev $lvol_bdev removed"
537		done
538
539		$rpc_py bdev_lvol_delete_lvstore -u $lvol_store
540		echo "lvol store $lvol_store removed"
541	done
542}
543
544function cleanup_lvols() {
545	start_spdk_tgt
546	attach_bdevs
547	cleanup_lvol_cfg
548	stop_spdk_tgt
549}
550
551function create_lvols() {
552	start_spdk_tgt
553	attach_bdevs
554	cleanup_lvol_cfg
555
556	nvme_bdevs=($($rpc_py bdev_get_bdevs | jq -r '.[].name'))
557	for nvme_bdev in "${nvme_bdevs[@]}"; do
558		ls_guid=$($rpc_py bdev_lvol_create_lvstore $nvme_bdev lvs_0 --clear-method none)
559		echo "Created LVOL Store $ls_guid on Bdev $nvme_bdev"
560
561		free_mb=$(get_lvs_free_mb "$ls_guid")
562		lb_name=$($rpc_py bdev_lvol_create -u $ls_guid lbd_0 $free_mb --clear-method none)
563		LVOL_BDEVS+=("$lb_name")
564		echo "Created LVOL Bdev $lb_name ($free_mb MB) on Lvol Store $ls_guid on Bdev $nvme_bdev"
565	done
566
567	stop_spdk_tgt
568}
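# Example usage (illustrative): create one lvol bdev per NVMe drive before an
# lvol-based run and tear everything down afterwards. Assumes $rpc_py,
# $SPDK_BIN_DIR and the LVOL_BDEVS array are set up by the calling test script.
#
#   LVOL_BDEVS=()
#   create_lvols
#   # ... run the workload against "${LVOL_BDEVS[@]}" ...
#   cleanup_lvols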
569